Source code for xagg.auxfuncs

import xarray as xr
import numpy as np
import pandas as pd
import geopandas as gpd
import warnings
import os
import re


[docs]
def normalize(a,drop_na = False):
    """ Normalizes the vector `a`

    The vector `a` is divided by its sum. If all non-`np.nan` elements of `a` are 0, 
    then a vector with only `np.nan` elements is returned.  

    Parameters
    ---------------
    a : array_like
      A vector to be normalized

    drop_na : bool, optional, default = ``False``
      If drop_na = True, and there are nans in the vector a, then the normalization
      is calculated using only the non-nan locations in a, and the vector is returned
      with the nans in their original location. 
      In other words, `np.nansum(normalize(a),drop_na=True) == 1.0`

      If drop_na = False, and nans are present in a, then normalize just returns a 
      vector of np.nan the same length of a.
    
    Returns
    ---------------
    a : vector
      `a`, but normalized. 
  
    """
    
    if (drop_na) and (np.any(np.isnan(a))):
        a2 = a[~np.isnan(a)]
        if np.all(a2.sum()==0):
            # Return nans if the vector is only 0s
            # (/ 0 error)
            return a*np.nan

        else:
            a2 = a2/a2.sum()
            a[~np.isnan(a)] = a2

            return a

    elif (np.all(~np.isnan(a))) and (not np.all(a.sum()==0)):
        return a/a.sum()
    else:
        return a*np.nan



[docs]
def find_rel_area(df):
    """ 
    Find the relative area of each row in a geodataframe
    """
    df['rel_area'] = df.area/df.area.sum()
    return df




[docs]
def list_or_first(ser):
    lis = list(ser)
    # only the columns associated with the pixels should have multiple values;
    # for all other columns (those associated with the polygons), it should be
    # safe to return just the first item
    if all(x == lis[0] for x in lis) and ser.name not in ['pix_idx', 'coords', 'rel_area', 'lat', 'lon']:
        return lis[0]
    else:
        return lis




[docs]
def fix_ds(ds,var_cipher = {'latitude':{'latitude':'lat','longitude':'lon'},
                            'Latitude':{'Latitude':'lat','Longitude':'lon'},
                            'Lat':{'Lat':'lat','Lon':'lon'},
                            'latitude_1':{'latitude_1':'lat','longitude_1':'lon'},
                            'nav_lat':{'nav_lat':'lat','nav_lon':'lon'},
                            'Y':{'Y':'lat','X':'lon'},
                            'y':{'y':'lat','x':'lon'}},
          chg_bnds = True):
    """ Puts the input `ds` into a format compatible with the rest of the package
    
    1) grid variables are renamed "lat" and "lon"
    2) the lon dimension is made -180:180 to be consistent with most geographic
       data (as well as any lon_bnds variable if ``chg_bnds=True`` (by default))
    3) the dataset is sorted in ascending order in both lat and lon


    NOTE: there probably should be a safeguard in case "y" and "x" are multiindex 
    dimension names instead of lat/lon names... maybe a warning for now... (TO DO)
    
    
    Parameters
    ---------------
    ds : :class:`xarray.Dataset`
      an input :class:`xarray.Dataset`, which may or may not need adjustment to be 
      compatible with this package
    var_cipher : dict, optional
      a dict of dicts for renaming lat/lon variables to "lat"/"lon". The 
      form is ``{search_str:{lat_name:'lat',lon_name:'lon'},...}``, 
      the code looks for `search_str` in the dimensions of the `ds`; 
      based on that, it renames `lat_name` to 'lat' and `lon_name` to 'lon'.
      Common names for these variables ('latitude', 'Latitude', 'Lat',
      'latitude_1','nav_lat','Y') are included out of the box.
    chg_bnds  : bool, optional, default = ``True``
      if ``True``, the names of variables with "_bnd" in their names
      are assumed to be dimension bound variables, and are changed as well if
      the rest of their name matches 'o' (for lon) or 'a' (for lat). 
      ## DOES THIS WORK FOR "X" and "Y"?
                  
    Returns
    ---------------
    ds : :class:`xarray.Dataset`
      a dataset with lat/lon variables in the format necessary for this package to function
    
    
    """
    
    # List of variables that represent bounds
    if type(ds) == xr.core.dataset.Dataset:
        bnd_vars = [k for k in list(ds) if 'bnds' in k]
        # Ignore time bounds, since not needed for xagg
        bnd_vars = [k for k in bnd_vars if 'time' not in k]
    elif type(ds) == xr.core.dataarray.DataArray:
        bnd_vars = []
    else:
        raise TypeError('[ds] needs to be an xarray structure (Dataset or DataArray).')
    
    # Fix lat/lon variable names (sizes instead of dims to be compatible with both ds, da...)
    if 'lat' not in ds.sizes:
        test_dims = [k for k in var_cipher if k in ds.sizes]
        if len(test_dims) == 0:
            raise NameError('No valid lat/lon variables found in the dataset.')
        else:
            # should there be a elif len()>1? If there are multiple things found?
            # Could be an issue in terms of ones where x/y are the coordinates of a 
            # non-rectangular grid with variables lat, lon, which is problematic,
            # or just duplicate dimension names, which is weird. 
            ds = ds.rename(var_cipher[test_dims[0]])
        
        # Now same for the bounds variable, if they exist
        if chg_bnds:
            if len(bnd_vars)>0:
                try:
                    ds = ds.rename({(key+'_bnds'):(value+'_bnds') for (key,value) in var_cipher[test_dims[0]].items()})
                except ValueError:
                    try: 
                        warnings.warn('Assuming dim '+[k for k in bnd_vars if 'o' in k][0]+' is longitude bounds and '+
                                      ' dim '+[k for k in bnd_vars if 'a' in k][0] + ' is latitude bounds.')
                        ds = ds.rename({[k for k in bnd_vars if 'o' in k][0]:'lon_bnds',
                                        [k for k in bnd_vars if 'a' in k][0]:'lat_bnds'})
                    except:
                        warnings.warn('Could not identify which of the following bounds '+
                                      'variables corresponds to lat/lon grid: '+', '.join(bnd_vars)+
                                     '; no bound variables renamed.')
                        

    # Switch longitude to -180:180
    if ds.lon.max()>180:
        ds.coords['lon'] = (ds.coords['lon'] + 180) % 360 - 180
        
    # Do the same for a longitude bound, if present (the check for dataset is to 
    # avoid an error for .keys() for a dataarray; it seems like a good assumption 
    # that you're not running a dataarray of just lon_bnds through this). 
    if (type(ds) == xr.core.dataset.Dataset):
        # Three if statements because of what I believe to be a jupyter error
        # (where all three statements are evaluated instead of one at a time)
        if ('lon_bnds' in ds):
            if (ds.lon_bnds.max()>180):
                ds['lon_bnds'] = (ds['lon_bnds'] + 180) % 360 - 180

    # Sort by lon; this should be robust (and necessary to avoid fix_ds(fix_ds(ds)) 
    # from failing)
    # PERHAPS THERE SHOULD BE A TEST HERE - like `if ds.lon ~= ds.lon.sortby(ds.lon)`
    # Because currently this blows up memory use... 
    if not np.allclose(ds.lon.values,np.sort(ds.lon)):
        ds = ds.sortby(ds.lon)
    
    # Sort by latitude as well (to make latitude consistent)
    if not np.allclose(ds.lat.values,np.sort(ds.lat)):
        ds = ds.sortby(ds.lat)
        
    # Return fixed ds
    return ds




[docs]
def get_bnds(ds,wrap_around_thresh='dynamic',
             break_window_width=3,
             break_thresh_x=2,
             silent=False):

    """ Builds vectors of lat/lon bounds if not present in `ds`
    
    Assumes a regular rectangular grid - so each lat/lon bound
    is 0.5*(gap between pixels) over to the next pixel.
    
    
    Parameters
    ---------------
    ds : :class:`xarray.Dataset`
      an xarray dataset that may or may not 
      contain variables "lat_bnds" and 
      "lon_bnds"

    silent : bool, by default False
      if True, then suppresses standard output
    
    wrap_around_thresh : numeric or str, optional, default = ``'dynamic'``
      the minimum distance between the last grid cell 
      in longitude and the 'edges' of the coordinate 
      system for which the pixels are 'wrapped around'. 
      By default "dynamic", which sets this to twice
      the median difference between longitude values. 

      In either case, if the lowest and highest `lon`
      values have opposite sign, and are both within 
      `wrap_around_thresh` of -180 or 180, then the 
      grid is assumed to be "wrapping around", and pixels
      will line up around (or across, depending on the 
      grid) the anti-meridian.

    break_thresh_x : numeric, default = 2
      If the difference between consecutive coordinate
      values is `break_thresh_x` times the surrounding
      coordinate steps (determined by `break_window_width`), 
      then a break in the grid is assumed and the corresponding 
      grid cell is assumed to be as wide as the preceding 
      one, instead of reaching all the way across the "break". 
    
    break_window_width : numeric, default = 3
      Window used to determine how anomalous a difference
      between coordinate is when determining the location
      of breaks in the grid. 
          
    Returns
    ---------------
    ds : :class:`xarray.Dataset`
      the same dataset as inputted, unchanged if "lat/lon_bnds" 
      already existed, or with new variables "lat_bnds" and "lon_bnds"
      if not.
    """
    #----------- Setup -----------
    if (type(wrap_around_thresh) == str) and (wrap_around_thresh != 'dynamic'):
        raise ValueError('`wrap_around_thresh` must either be numeric or the string "dynamic"; instead, it is '+str(wrap_around_thresh)+'.')
    
    if ds.lon.max()>180:
        raise ValueError('Longitude seems to be in the 0:360 format.'+
                         ' -180:180 format required (e.g., run `xa.fix_ds(ds)` before inputting.')
        # Future versions should be able to work with 0:360 as well...
        # honestly, it *may* already work by just changing edges['lon']
        # to [0,360], but it's not tested yet. 
        
    if ('lat' not in ds) or ('lon' not in ds):
        raise KeyError('"lat"/"lon" not found in [ds]. Make sure the '+
                       'geographic dimensions follow this naming convention (e.g., run `xa.fix_ds(ds)` before inputting.')
    
    if ('lat_bnds' in ds) and ('lon_bnds' in ds):
        # `xa.fix_ds()` should rename bounds to `lat/lon_bnds`
        # If bounds present, do nothing
        return ds
    else:
        if not silent:
            print('lat/lon bounds not found in dataset; they will be created.')
        # Build lat / lon bound 
        for var in ['lat','lon']:
            # Get coordinate spacing
            diffs = ds[var].diff(var)

            if wrap_around_thresh == 'dynamic':
                wat_tmp = diffs.median().values*2
            else:
                wat_tmp = wrap_around_thresh
            
            # Get edge of grid coordinates 
            # (this does double duty by checking for antimeridian coordinates
            # since `xa.fix_ds()` sorts in ascending order by lon)
            edge_coords = ds[var][[0,-1]]

            #---------- Wrapping ----------
            # Figure out if grid wraps around / is continuous
            if var == 'lon':
                wrap_flag = (# 1) are they different sign
                             (np.prod(np.sign(edge_coords)) == -1) and
                             # 2) are they both within wrap_around_thresh of 180
                             np.all(np.abs(np.abs(edge_coords) - 180) < wat_tmp))
            else:
                wrap_flag = False
            
            if wrap_flag:
                # Wrap around lons
                ec = edge_coords.copy(deep=True).values
            
                # Change edge_lons to 0:360, to get difference
                # across antimeridian
                ec[ec<0] = 360+ec[ec<0]
            
                diffs = xr.concat([xr.DataArray(data = [np.abs(np.diff(ec))[0]],
                                     coords = {var:([var],[ds[var][0].values])}),
                                   diffs],
                                  dim=var)
            else:
                # Just do second diff as the first diff
                diffs = xr.concat([xr.DataArray(data = [diffs[0].values],
                                     coords = {var:([var],[ds[var][0].values])}),
                                   diffs],
                                  dim=var)
            
            #---------- Breaks ----------
            # Now, have to identify possible breaks in the coordinate system
            # i.e., [-179,-178,-177,178,179,180] would have a break between 
            # -177 and 178, which would create a pixel width of 355 degrees
            # Identify those breaks, and replace their pixel width with the 
            # preceding pixel width
            
            # Build a rolling filter that gets the average of surrounding 
            # coordinate steps for each coordinate (but ignores the coordinate's
            # step itself)
            break_check = xr.DataArray(np.ones(break_window_width),dims=['window'])
            break_check[int(np.floor(break_window_width/(break_window_width-1)))] = 0
            break_check = break_check/break_check.sum()
            
            # Figure out if any coordinate steps are more than > break_thresh_x
            # larger than surrounding coordinate steps
            breaks = (diffs / 
                      (diffs.rolling({var:break_window_width},center=True).
                       construct('window').dot(break_check))) > break_thresh_x
            
            # Make those coordinate steps the preceding coordinate step instead
            nbreaks = breaks.sum().values
            if (not silent) and nbreaks>0:
                print('Found '+str(nbreaks)+' break(s)/jump(s) in '+var+" (i.e., grid doesn't cover whole planet); replacing grid cell width(s) with preceding grid cell width(s) at breakpoint.")
            diffs[np.where(breaks)[0]] = diffs[np.where(breaks)[0]-1].values

            # Somewhat hack-y solution to deal with the fact that if only one lon 
            # value is found east of the anti-meridian, but the grid does wrap 
            # around, the break is not currently correctly captured
            if wrap_flag and ((ds[var]>0).sum(var) == 1):
                diffs[-1] = np.abs(np.diff(ec))[0]
            
            #---------- Create bounds ----------
            # Now, create bounds using those diffs
            bnds_tmp = xr.concat([ds[var]-0.5*diffs,
                                  ds[var]+0.5*diffs],dim='bnds').transpose(var,'bnds')
            
            #---------- Clean up ----------
            # And now, fix lingering issues
            # 1. lon bounds > 180 / < -180 (which occurs when the grid crosses the antimeridian
            #    in the step above)
            # 2. lat bounds not at the right edges
            # 3. making sure the lon bounds line up right around the antimeridian, if relevant
            
            # Change lon_bounds > 180 to lon_bounds > -180
            if var == 'lon':
                bnds_tmp = bnds_tmp.where(bnds_tmp<=180,-360 + bnds_tmp.where(bnds_tmp>180))
                bnds_tmp = bnds_tmp.where(bnds_tmp>=-180,360 + bnds_tmp.where(bnds_tmp<-180))
            
            # Curtail lat bands to -90, 90
            if var == 'lat':
                bnds_tmp = bnds_tmp.where(bnds_tmp<=90,90)
                bnds_tmp = bnds_tmp.where(bnds_tmp>=-90,-90)
            
            # If wrapping, make sure the bounds line up
            if wrap_flag: 
                if ((not bnds_tmp[0,0] == bnds_tmp[-1,-1]) and  
                    (not ((bnds_tmp[0,0] == -180) and (bnds_tmp[-1,-1] == 180)))):
                    bnds_tmp[-1,-1] = bnds_tmp[0,0]
            # Add to ds
            ds[var+'_bnds'] = bnds_tmp
            del bnds_tmp

    # This is the cf_xarray standard, to have them as coordinates
    # Currently only keeping them as variables instead (which is 
    # CMIP standard I think?)
    #ds = ds.set_coords([var+'_bnds' for var in ['lat','lon']])

     # Return
    return ds    




[docs]
def subset_find(ds0,ds1,silent=False):
    """ Finds the grid of `ds1` in `ds0`, and subsets `ds0` to the grid in `ds1`
    
    Parameters
    ---------------
    ds0 : :class:`xarray.Dataset`
      an xarray Dataset to be subset based on the grid of ds1; must 
      contain grid variables "lat" or "lon" (could add a fix_ds call)
    ds1 : :class:`xarray.Dataset`, :class:`xarray.DataArray`
      either an `xarray` structrue (Dataset, DataArray) with "lat" "lon"
      variables, or a dictionary with DataArrays ['lat'] and ['lon'].
      IMPORTANT: `ds1` HAS TO BE BROADCAST - i.e. one value of lat, lon 
      each coordinate, with lat and lon vectors of equal length. This can 
      be done e.g. using ``ds1.stack(loc=('lat','lon'))``. 

    silent : bool, by default False
      if True, then suppresses standard output
        
    Returns
    ---------------
    ds0 : :class:`xarray.Dataset`
      The input `ds0`, subset to the locations in `ds1`. 
    
    """
    
    if 'loc' not in ds0.sizes:
        ds0 = ds0.stack(loc = ('lat','lon'))
        was_stacked = True
    else:
        was_stacked = False
    #if 'loc' not in ds1.sizes:
    #    ds1 = ds1.stack(loc = ('lat','lon'))
    
    # Need a test to make sure the grid is the same. So maybe the gdf_out class 
    # has the lat/lon grid included - and then we can skip the lat/lon column
    # and just keep the pix_idxs
    if (not np.allclose(len(ds0.lat),len(ds1['lat'])) or (not np.allclose(len(ds0.lon),len(ds1['lon']))) or
         (not np.allclose(ds1['lat'],ds0.lat)) or (not np.allclose(ds1['lon'],ds0.lon))):
        if not silent:
            print('adjusting grid... (this may happen because only a subset of pixels '+
              'were used for aggregation for efficiency - i.e. [subset_bbox=True] in '+
             'xa.pixel_overlaps())') #(this also happens because ds and ds_bnds above was already subset)
        # Zip up lat,lon pairs to allow comparison
        latlons = list(zip(ds0.lat.values,ds0.lon.values))
        latlons0 = list(zip(ds1['lat'].values,ds1['lon'].values))
        
        # Find indices of the used grid for aggregation in the input grid
        loc_idxs = [latlons.index(i) for i in latlons0]
        
        if np.allclose(len(loc_idxs),len(latlons0)):
            if not silent:
                print('grid adjustment successful')
            # Subset by those indices
            ds0 = ds0.isel(loc=loc_idxs)
        else:
            raise ValueError('Was not able to match grids!')
        
    if was_stacked:
        # THIS MAY NOT WORK IN ALL CASES
        ds0 = ds0.unstack()
        
    return ds0