Source code for xagg.auxfuncs

import xarray as xr
import numpy as np
import pandas as pd
import geopandas as gpd
import warnings
import os
import re
from .options import get_options


[docs]
def normalize(a,drop_na = False):
    """ Normalizes the vector `a`

    The vector `a` is divided by its sum. If all non-`np.nan` elements of `a` are 0, 
    then a vector with only `np.nan` elements is returned.  

    Parameters
    ---------------
    a : array_like
      A vector to be normalized

    drop_na : bool, optional, default = ``False``
      If drop_na = True, and there are nans in the vector a, then the normalization
      is calculated using only the non-nan locations in a, and the vector is returned
      with the nans in their original location. 
      In other words, `np.nansum(normalize(a),drop_na=True) == 1.0`

      If drop_na = False, and nans are present in a, then normalize just returns a 
      vector of np.nan the same length of a.
    
    Returns
    ---------------
    a : vector
      `a`, but normalized. 
  
    """
    
    if (drop_na) and (np.any(np.isnan(a))):
        a2 = a[~np.isnan(a)]
        if np.all(a2.sum()==0):
            # Return nans if the vector is only 0s
            # (/ 0 error)
            return a*np.nan

        else:
            a2 = a2/a2.sum()
            a[~np.isnan(a)] = a2

            return a

    elif (np.all(~np.isnan(a))) and (not np.all(a.sum()==0)):
        return a/a.sum()
    else:
        return a*np.nan



[docs]
def find_rel_area(df):
    """ 
    Find the relative area of each row in a geodataframe
    """
    df['rel_area'] = df.area/df.area.sum()
    return df




[docs]
def list_or_first(ser):
    lis = list(ser)
    # only the columns associated with the pixels should have multiple values;
    # for all other columns (those associated with the polygons), it should be
    # safe to return just the first item
    if all(x == lis[0] for x in lis) and ser.name not in ['pix_idx', 'coords', 'rel_area', 'lat', 'lon']:
        return lis[0]
    else:
        return lis


def _diagnose_nans(da,other_dims,
                   _return_somelocs = False):
    ''' Flag if partial nans in other dims

    Parameters
    ------------
    da : :py:class:`xr.DataArray`

    other_dims : :py:class:`list` of :py:meth:`str`
        Dimensions over which to test nan prevalence

    Returns
    ------------
    nanflags : :py:class:`dict`
        `nanflags['all']` : A dictionary with keys of dimension names 
                            and values True if any of the coordinates
                            in that dimension has all nans in the `da`
        `nanflags['some']` : A dictionary with keys of dimension names
                             and values True if some coordinates in 
                             that dimension have nans that other coordinates
                             do not


    '''
    # Copy, to make sure the original dataarray is untouched
    da = da.copy()

    # Flag if all values of the variable are nan for some specific coordinate
    # along a dimension (this is not per se a problem, but good to have a 
    # flag if needed) 
    allnan_flags = {dim:da.isnull().all(dim = [d for d in da.sizes if d != dim]).any().values
                    for dim in other_dims}

    # Flag if some values of the variable are nan for some specific coordinate
    # along a dimension (this _could_ be a problem, because it means that 
    # the weights will be different between different values of that coordinate
    # First, drop all-nan slices, since they're captured above and not as much 
    # of a problem
    for dim in other_dims: 
        da = da.dropna(dim,how='all')
    # Now, find for which `locs` there are inconsistent nans 
    somenan_flags = {dim:(da.isnull().any(dim = dim) != 
                          da.isnull().all(dim = dim)).any().values
                         if da.sizes[dim] != 0 else False
                     for dim in other_dims}

    # Combine into single dict
    nanflags = {'all':allnan_flags,
                'some':somenan_flags}

    if _return_somelocs:
        somelocs = {dim:(da.isnull().any(dim = dim) != 
                          da.isnull().all(dim = dim)).any(dim = [d for d in other_dims if d != dim])
                     for dim in other_dims}

        return nanflags,somelocs
    else:
        return nanflags
 

[docs]
class SomeNanWarning(UserWarning):
    ''' Warns when pixels are inconsistently nan along a dimension 
    '''
    pass


def _warn_ifsomenans(ds,var,dims,
                    _warn_trigger_partialnan=True,
                    _return_somelocs=False):
    ''' Wrapper for SomeNanWarning with standard text.
    Triggers warning if `_warn_trigger_partialnan` is True and 
    `_diagnose_nans(ds[var],other_dims)` returns True for at least
    one dim for `nanflags['some']` 

    ## TODO - is there a way to flag which poly_idxs this would 
    affect? Should be, since this is used in xa.aggregate, that has
    that info. Basically, can do a [is in] across pix_idxs to flag
    poly_idxs that should optionally be nan in this case 
    '''
    if type(ds) == xr.core.dataset.Dataset:
        ds = ds[var].copy()
    
    if _return_somelocs:
        raise NotImplementedError

        nanflags,somelocs = _diagnose_nans(ds,dims,
             _return_somelocs=True)

    else:
        nanflags = _diagnose_nans(ds,dims)


    if (_warn_trigger_partialnan and 
        np.any([flag for dim,flag in nanflags['some'].items()])):
        dims_with_somenans = [dim for dim,flag in nanflags['some'].items() if flag]
        
        warnings.warn('One or more grid cells in variable '+var+' have inconsistent nans along the dimension(s) '+
                       ', '.join(dims_with_somenans)+' (i.e., one or more grid cells are nan for some but not all coordinates of the dimension(s))'+
                       '. This means that grid cell weights will be different for different coordinates along '+
                       ', '.join(dims_with_somenans)+'. The aggregation '+
                       'calculation may therefore be incorrect, since it aggregates over different grid cells for '+
                       ' the same polgyon for different coordinates of the dimension(s)'+', '.join(dims_with_somenans)+'.',
                       SomeNanWarning)

        _warn_trigger_partialnan = False

    if _return_somelocs:
        from functools import reduce
        import operator

        # Get all locations that are at partially nan in one direction
        bad_locs = reduce(operator.or_, somelocs.values())
        bad_locs = bad_locs.where(bad_locs,drop=True)

        # Get their lat, lon locations
        bad_coords = [(lat,lon) for lat,lon in zip(bad_locs.lat.values,bad_locs.lon.values)]
        # Turn into set (for faster indexing)
        bad_coords = set(bad_coords)

        # Check each list of polygon grid cell overlap coordinates if they contain any bad coordinate
        # This will need to go into aggregate()
        #poly_badloc_mask = wm.agg.coords.apply(lambda x: bool(bad_coords.intersection(x)))
        # and then a later, something that replaces the output in those locations with nan

        return _warn_trigger_partialnan,bad_coords
    else:
        return _warn_trigger_partialnan
   



[docs]
def fix_ds(ds,var_cipher = {'latitude':{'latitude':'lat','longitude':'lon'},
                            'Latitude':{'Latitude':'lat','Longitude':'lon'},
                            'Lat':{'Lat':'lat','Lon':'lon'},
                            'latitude_1':{'latitude_1':'lat','longitude_1':'lon'},
                            'nav_lat':{'nav_lat':'lat','nav_lon':'lon'},
                            'Y':{'Y':'lat','X':'lon'},
                            'y':{'y':'lat','x':'lon'}},
          chg_bnds = True):
    """ Puts the input `ds` into a format compatible with the rest of the package
    
    1) grid variables are renamed "lat" and "lon"
    2) the lon dimension is made -180:180 to be consistent with most geographic
       data (as well as any lon_bnds variable if ``chg_bnds=True`` (by default))
    3) the dataset is sorted in ascending order in both lat and lon


    NOTE: there probably should be a safeguard in case "y" and "x" are multiindex 
    dimension names instead of lat/lon names... maybe a warning for now... (TO DO)
    
    
    Parameters
    ---------------
    ds : :class:`xarray.Dataset`
      an input :class:`xarray.Dataset`, which may or may not need adjustment to be 
      compatible with this package
    var_cipher : dict, optional
      a dict of dicts for renaming lat/lon variables to "lat"/"lon". The 
      form is ``{search_str:{lat_name:'lat',lon_name:'lon'},...}``, 
      the code looks for `search_str` in the dimensions of the `ds`; 
      based on that, it renames `lat_name` to 'lat' and `lon_name` to 'lon'.
      Common names for these variables ('latitude', 'Latitude', 'Lat',
      'latitude_1','nav_lat','Y') are included out of the box.
    chg_bnds  : bool, optional, default = ``True``
      if ``True``, the names of variables with "_bnd" in their names
      are assumed to be dimension bound variables, and are changed as well if
      the rest of their name matches 'o' (for lon) or 'a' (for lat). 
      ## DOES THIS WORK FOR "X" and "Y"?
                  
    Returns
    ---------------
    ds : :class:`xarray.Dataset`
      a dataset with lat/lon variables in the format necessary for this package to function
    
    
    """
    
    # List of variables that represent bounds
    if type(ds) == xr.core.dataset.Dataset:
        bnd_vars = [k for k in list(ds) if 'bnds' in k]
        # Ignore time bounds, since not needed for xagg
        bnd_vars = [k for k in bnd_vars if 'time' not in k]
    elif type(ds) == xr.core.dataarray.DataArray:
        bnd_vars = []
    else:
        raise TypeError('[ds] needs to be an xarray structure (Dataset or DataArray).')
    
    # Fix lat/lon variable names (sizes instead of dims to be compatible with both ds, da...)
    if 'lat' not in ds.sizes:
        test_dims = [k for k in var_cipher if k in ds.sizes]
        if len(test_dims) == 0:
            raise NameError('No valid lat/lon variables found in the dataset.')
        else:
            # should there be a elif len()>1? If there are multiple things found?
            # Could be an issue in terms of ones where x/y are the coordinates of a 
            # non-rectangular grid with variables lat, lon, which is problematic,
            # or just duplicate dimension names, which is weird. 
            ds = ds.rename(var_cipher[test_dims[0]])
        
        # Now same for the bounds variable, if they exist
        if chg_bnds:
            if len(bnd_vars)>0:
                try:
                    ds = ds.rename({(key+'_bnds'):(value+'_bnds') for (key,value) in var_cipher[test_dims[0]].items()})
                except ValueError:
                    try: 
                        warnings.warn('Assuming dim '+[k for k in bnd_vars if 'o' in k][0]+' is longitude bounds and '+
                                      ' dim '+[k for k in bnd_vars if 'a' in k][0] + ' is latitude bounds.')
                        ds = ds.rename({[k for k in bnd_vars if 'o' in k][0]:'lon_bnds',
                                        [k for k in bnd_vars if 'a' in k][0]:'lat_bnds'})
                    except:
                        warnings.warn('Could not identify which of the following bounds '+
                                      'variables corresponds to lat/lon grid: '+', '.join(bnd_vars)+
                                     '; no bound variables renamed.')
                        

    # Switch longitude to -180:180
    if ds.lon.max()>180:
        ds.coords['lon'] = (ds.coords['lon'] + 180) % 360 - 180
        
    # Do the same for a longitude bound, if present (the check for dataset is to 
    # avoid an error for .keys() for a dataarray; it seems like a good assumption 
    # that you're not running a dataarray of just lon_bnds through this). 
    if (type(ds) == xr.core.dataset.Dataset):
        # Three if statements because of what I believe to be a jupyter error
        # (where all three statements are evaluated instead of one at a time)
        if ('lon_bnds' in ds):
            if (ds.lon_bnds.max()>180):
                ds['lon_bnds'] = (ds['lon_bnds'] + 180) % 360 - 180

    # Sort by lon; this should be robust (and necessary to avoid fix_ds(fix_ds(ds)) 
    # from failing)
    # PERHAPS THERE SHOULD BE A TEST HERE - like `if ds.lon ~= ds.lon.sortby(ds.lon)`
    # Because currently this blows up memory use... 
    if not np.allclose(ds.lon.values,np.sort(ds.lon)):
        ds = ds.sortby(ds.lon)
    
    # Sort by latitude as well (to make latitude consistent)
    if not np.allclose(ds.lat.values,np.sort(ds.lat)):
        ds = ds.sortby(ds.lat)
        
    # Return fixed ds
    return ds




[docs]
def get_bnds(ds,wrap_around_thresh='dynamic',
             break_window_width=3,
             break_thresh_x=2,
             silent=None):

    """ Builds vectors of lat/lon bounds if not present in `ds`
    
    Assumes a regular rectangular grid - so each lat/lon bound
    is 0.5*(gap between pixels) over to the next pixel.
    
    
    Parameters
    ---------------
    ds : :class:`xarray.Dataset`
      an xarray dataset that may or may not 
      contain variables "lat_bnds" and 
      "lon_bnds"

    silent : bool, by default False
      if True, then suppresses standard output
    
    wrap_around_thresh : numeric or str, optional, default = ``'dynamic'``
      the minimum distance between the last grid cell 
      in longitude and the 'edges' of the coordinate 
      system for which the pixels are 'wrapped around'. 
      By default "dynamic", which sets this to twice
      the median difference between longitude values. 

      In either case, if the lowest and highest `lon`
      values have opposite sign, and are both within 
      `wrap_around_thresh` of -180 or 180, then the 
      grid is assumed to be "wrapping around", and pixels
      will line up around (or across, depending on the 
      grid) the anti-meridian.

    break_thresh_x : numeric, default = 2
      If the difference between consecutive coordinate
      values is `break_thresh_x` times the surrounding
      coordinate steps (determined by `break_window_width`), 
      then a break in the grid is assumed and the corresponding 
      grid cell is assumed to be as wide as the preceding 
      one, instead of reaching all the way across the "break". 
    
    break_window_width : numeric, default = 3
      Window used to determine how anomalous a difference
      between coordinate is when determining the location
      of breaks in the grid. 
          
    Returns
    ---------------
    ds : :class:`xarray.Dataset`
      the same dataset as inputted, unchanged if "lat/lon_bnds" 
      already existed, or with new variables "lat_bnds" and "lon_bnds"
      if not.
    """
    if silent is None:
        silent = get_options()['silent']

    #----------- Setup -----------
    if (type(wrap_around_thresh) == str) and (wrap_around_thresh != 'dynamic'):
        raise ValueError('`wrap_around_thresh` must either be numeric or the string "dynamic"; instead, it is '+str(wrap_around_thresh)+'.')
    
    if ('lat' not in ds) or ('lon' not in ds):
        raise KeyError('"lat"/"lon" not found in [ds]. Make sure the '+
                       'geographic dimensions follow this naming convention (e.g., run `xa.fix_ds(ds)` before inputting.')

    if ds.lon.max()>180:
        raise ValueError('Longitude seems to be in the 0:360 format.'+
                         ' -180:180 format required (e.g., run `xa.fix_ds(ds)` before inputting.')
        # Future versions should be able to work with 0:360 as well...
        # honestly, it *may* already work by just changing edges['lon']
        # to [0,360], but it's not tested yet. 
        
    
    if ('lat_bnds' in ds) and ('lon_bnds' in ds):
        # `xa.fix_ds()` should rename bounds to `lat/lon_bnds`
        # If bounds present, do nothing
        return ds
    else:
        if not silent:
            print('lat/lon bounds not found in dataset; they will be created.')
        # Build lat / lon bound 
        for var in ['lat','lon']:
            # Get coordinate spacing
            diffs = ds[var].diff(var)

            if wrap_around_thresh == 'dynamic':
                wat_tmp = diffs.median().values*2
            else:
                wat_tmp = wrap_around_thresh
            
            # Get edge of grid coordinates 
            # (this does double duty by checking for antimeridian coordinates
            # since `xa.fix_ds()` sorts in ascending order by lon)
            edge_coords = ds[var][[0,-1]]

            #---------- Wrapping ----------
            # Figure out if grid wraps around / is continuous
            if var == 'lon':
                wrap_flag = (# 1) are they different sign
                             (np.prod(np.sign(edge_coords)) == -1) and
                             # 2) are they both within wrap_around_thresh of 180
                             np.all(np.abs(np.abs(edge_coords) - 180) < wat_tmp))
            else:
                wrap_flag = False
            
            if wrap_flag:
                # Wrap around lons
                ec = edge_coords.copy(deep=True).values
            
                # Change edge_lons to 0:360, to get difference
                # across antimeridian
                ec[ec<0] = 360+ec[ec<0]
            
                diffs = xr.concat([xr.DataArray(data = [np.abs(np.diff(ec))[0]],
                                     coords = {var:([var],[ds[var][0].values])}),
                                   diffs],
                                  dim=var)
            else:
                # Just do second diff as the first diff
                diffs = xr.concat([xr.DataArray(data = [diffs[0].values],
                                     coords = {var:([var],[ds[var][0].values])}),
                                   diffs],
                                  dim=var)
            
            #---------- Breaks ----------
            # Now, have to identify possible breaks in the coordinate system
            # i.e., [-179,-178,-177,178,179,180] would have a break between 
            # -177 and 178, which would create a pixel width of 355 degrees
            # Identify those breaks, and replace their pixel width with the 
            # preceding pixel width
            
            # Build a rolling filter that gets the average of surrounding 
            # coordinate steps for each coordinate (but ignores the coordinate's
            # step itself)
            break_check = xr.DataArray(np.ones(break_window_width),dims=['window'])
            break_check[int(np.floor(break_window_width/(break_window_width-1)))] = 0
            break_check = break_check/break_check.sum()
            
            # Figure out if any coordinate steps are more than > break_thresh_x
            # larger than surrounding coordinate steps
            breaks = (diffs / 
                      (diffs.rolling({var:break_window_width},center=True).
                       construct('window').dot(break_check))) > break_thresh_x
            
            # Make those coordinate steps the preceding coordinate step instead
            nbreaks = breaks.sum().values
            if (not silent) and nbreaks>0:
                print('Found '+str(nbreaks)+' break(s)/jump(s) in '+var+" (i.e., grid doesn't cover whole planet); replacing grid cell width(s) with preceding grid cell width(s) at breakpoint.")
            diffs[np.where(breaks)[0]] = diffs[np.where(breaks)[0]-1].values

            # Somewhat hack-y solution to deal with the fact that if only one lon 
            # value is found east of the anti-meridian, but the grid does wrap 
            # around, the break is not currently correctly captured
            if wrap_flag and ((ds[var]>0).sum(var) == 1):
                diffs[-1] = np.abs(np.diff(ec))[0]
            
            #---------- Create bounds ----------
            # Now, create bounds using those diffs
            bnds_tmp = xr.concat([ds[var]-0.5*diffs,
                                  ds[var]+0.5*diffs],dim='bnds').transpose(var,'bnds')
            
            #---------- Clean up ----------
            # And now, fix lingering issues
            # 1. lon bounds > 180 / < -180 (which occurs when the grid crosses the antimeridian
            #    in the step above)
            # 2. lat bounds not at the right edges
            # 3. making sure the lon bounds line up right around the antimeridian, if relevant
            
            # Change lon_bounds > 180 to lon_bounds > -180
            if var == 'lon':
                bnds_tmp = bnds_tmp.where(bnds_tmp<=180,-360 + bnds_tmp.where(bnds_tmp>180))
                bnds_tmp = bnds_tmp.where(bnds_tmp>=-180,360 + bnds_tmp.where(bnds_tmp<-180))
            
            # Curtail lat bands to -90, 90
            if var == 'lat':
                bnds_tmp = bnds_tmp.where(bnds_tmp<=90,90)
                bnds_tmp = bnds_tmp.where(bnds_tmp>=-90,-90)
            
            # If wrapping, make sure the bounds line up
            if wrap_flag: 
                if ((not bnds_tmp[0,0] == bnds_tmp[-1,-1]) and  
                    (not ((bnds_tmp[0,0] == -180) and (bnds_tmp[-1,-1] == 180)))):
                    bnds_tmp[-1,-1] = bnds_tmp[0,0]
            # Add to ds
            ds[var+'_bnds'] = bnds_tmp
            del bnds_tmp

    # This is the cf_xarray standard, to have them as coordinates
    # Currently only keeping them as variables instead (which is 
    # CMIP standard I think?)
    #ds = ds.set_coords([var+'_bnds' for var in ['lat','lon']])

     # Return
    return ds    




[docs]
def subset_find(ds0,ds1,silent=None):
    """ Finds the grid of `ds1` in `ds0`, and subsets `ds0` to the grid in `ds1`
    
    Parameters
    ---------------
    ds0 : :class:`xarray.Dataset`
      an xarray Dataset to be subset based on the grid of ds1; must 
      contain grid variables "lat" or "lon" (could add a fix_ds call)
    ds1 : :class:`xarray.Dataset`, :class:`xarray.DataArray`
      either an `xarray` structrue (Dataset, DataArray) with "lat" "lon"
      variables, or a dictionary with DataArrays ['lat'] and ['lon'].
      IMPORTANT: `ds1` HAS TO BE BROADCAST - i.e. one value of lat, lon 
      each coordinate, with lat and lon vectors of equal length. This can 
      be done e.g. using ``ds1.stack(loc=('lat','lon'))``. 

    silent : bool, by default False
      if True, then suppresses standard output
        
    Returns
    ---------------
    ds0 : :class:`xarray.Dataset`
      The input `ds0`, subset to the locations in `ds1`. 
    
    """
    if silent is None:
        silent = get_options()['silent']
    
    if 'loc' not in ds0.sizes:
        ds0 = ds0.stack(loc = ('lat','lon'))
        was_stacked = True
    else:
        was_stacked = False
    #if 'loc' not in ds1.sizes:
    #    raise KeyError("`ds1` must be stacked loc=('lat','lon')")
    #    ds1 = ds1.stack(loc = ('lat','lon'))
    
    # Need a test to make sure the grid is the same. So maybe the gdf_out class 
    # has the lat/lon grid included - and then we can skip the lat/lon column
    # and just keep the pix_idxs
    if (not np.allclose(len(ds0.lat),len(ds1['lat'])) or (not np.allclose(len(ds0.lon),len(ds1['lon']))) or
         (not np.allclose(ds1['lat'],ds0.lat)) or (not np.allclose(ds1['lon'],ds0.lon))):
        if not silent:
            print('adjusting grid... (this may happen because only a subset of pixels '+
              'were used for aggregation for efficiency - i.e. [subset_bbox=True] in '+
             'xa.pixel_overlaps())') #(this also happens because ds and ds_bnds above was already subset)
        # Zip up lat,lon pairs to allow comparison
        latlons1 = [(lat,lon) for lat, lon in zip(ds1['lat'].values,ds1['lon'].values)]
        
        # Subset by indices of the other grid
        try:
            ds0 = ds0.sel({'loc':latlons1})
        
            if not silent:
                print('grid adjustment successful')
        except KeyError:
            raise ValueError('Was not able to match grids!')
        
    if was_stacked:
        # THIS MAY NOT WORK IN ALL CASES
        ds0 = ds0.unstack()
        
    return ds0