Source code for xagg.core

import xarray as xr
import numpy as np
import pandas as pd
import geopandas as gpd
from shapely.geometry import Polygon
from shapely.geometry import MultiPolygon
import warnings
import re 
import os
try:
    import xesmf as xe
    _has_xesmf=True
except ImportError:
    _has_xesmf=False
try:
    from numba import jit as njit
    _has_numba = True
except ImportError:
    _has_numba = False

from . auxfuncs import (find_rel_area,normalize,fix_ds,get_bnds,subset_find,list_or_first,_warn_ifsomenans)
from . classes import (weightmap,aggregated)
from . options import get_options


[docs]
class NoOverlapError(Exception):
    """ Exception for when there's no overlap between pixels and polygons """
    pass




[docs]
def read_wm(path):
    """ Load temporary weightmap files from :py:meth:`wm.to_file()`

    Builds a weightmap out of saved weightmap component 
    files. Particularly useful if the weightmap took a 
    particularly long time to calculated (i.e., if the 
    grid is particularly high resolution). 

    Assumes the files are generated from :py:meth:`wm.to_file()`; 
    i.e., the files are all in a directory `name`: 

    - `name/name.shp` : 
        the geometry of the input polygons
    - `name/name.agg` : 
        the dataframe with the pixel overlap data
    - `name/name_lat.nc`, `name/name_lon.nc` : 
        the source grid of the raster data
    - `name/name_weights.nc` :
       the additional weights grid, if used (this file is optional; if no file with this name is found, no weights are assumed, and `wm.weights='noweights'`)

    Parameters
    ---------------
    path : :py:class:`str` 
        The directory in which the files are stored. They
        are assumed to follow the filename convention of
        sharing the name of the directory (i.e., the last
        part of this path.)

    Returns
    ---------------
    wm : :class:`xagg.weightmap`

    """
    # the last bit of the path is also the filename
    fn = re.split('/',path)[-1]
    

    ###### Load geometry
    geo = gpd.read_file(path+'/'+fn+'.shp')
    geo = geo['geometry']

    ####### Load agg 
    agg = pd.read_hdf(path+'/'+fn+'.h5', 'wm')

    ###### Load source grid
    source_grid = {k:xr.open_dataset(path+'/'+fn+'_'+k+'.nc').set_index({'loc':('lat','lon')})[k+'v'] for k in ['lat','lon']}
    # Rename, removing the v added for the multi-index issue in export
    source_grid = {k:v.rename(k) for k,v in source_grid.items()}

    ###### Load weights
    if os.path.exists(path+'/'+fn+'_weights.csv'):
        # Specifying column because it saves it as with 
        # a dummy index column that gets loaded in an 
        # unproductive way
        weights = pd.read_csv(path+'/'+fn+'_weights.csv')['weights'].astype(object)
        # ^^ Setting astype(object) to make sure integral weights
        # don't change the general type of the frame. This may
        # only affect the testing routines, but setting this here
        # to be explicit 
    else:
        weights = 'nowghts'

    ###### Combine into weightmap
    wm = weightmap(agg=agg,
                   geometry=geo,
                   source_grid=source_grid,
                   weights=weights)

    return wm




[docs]
def process_weights(ds,weights=None,target='ds',silent=None):
    """ Process weights - including regridding
    
    If ``target == 'ds'``, regrid `weights` to `ds`. If ``target == 'weights'``,
    regrid `ds` to `weights`. 

    Parameters
    ---------------
    ds : :class:`xarray.Dataset`, :class:`xarray.DataArray`
        an :class:`xarray.Dataset`/:class:`xarray.DataArray` to regrid
    
    weights : :class:`xarray.DataArray`, optional, default = ``None``
        an :class:`xarray.DataArray` containing a weight (numeric) 
        at each location
    
    target : :py:class:`str`, optional, default = ``'ds'``
        whether weights should be regridded to the `ds` grid (by
        default) or vice-versa (not yet supported, returns 
        ``NotImplementedError``)

    silent : :py:class:`bool`, default = `False` (set by :py:meth:`xa.set_options`)
        if True, then no status updates are printed to std out

    Returns
    ---------------
    ds : :class:`xarray.Dataset`, :class:`xarray.DataArrays`
        the input :class:`xarray.Dataset`/:class:`xarray.DataArray`, with a new variable 
        `weights` specifying weights for each pixel

    weights_info : :py:class:`dict` 
        a dictionary storing information about the 
        weights regridding process, with the fields:

        - ``target``: showing which of the two grids was retained
        - ``ds_grid``: a dictionary with the grid ``{"lat":ds.lat,"lon",ds.lon}``
        - ``weights_grid``: a dictionary with the grid ``{"lat":weights.lat,"lon":weights.lon}``
    """

    if silent is None:
        silent = get_options()['silent']

    
    if weights is None:
        # (for robustness against running this without an extra if statement
        # in a wrapper function)
        weights_info = 'nowghts'
    else:
        # Check types
        if type(weights) is not xr.core.dataarray.DataArray:
            raise TypeError('[weights] must be an xarray DataArray.')
        if type(ds) not in [xr.core.dataarray.DataArray,
                                xr.core.dataset.Dataset]:
            raise TypeError('[ds] must be an xarray structure (DataArray or Dataset)')
            
        # Stick weights into the same supported input format as ds
        weights = fix_ds(weights)
        
        # Set regridding info
        weights_info = {'target':target,
                        'ds_grid':{'lat':ds.lat,'lon':ds.lon},
                        'weights_grid':{'lat':weights.lat,'lon':weights.lon}}

        # Change nans to 0; often files used for weights (pop density, etc.)
        # have open water labeled as nan instead of 0 - but for the purposes of
        # calculating a weight, "0" is more accurate (conservative regridding 
        # algorithms will otherwise miss coastal pixels)
        if get_options()['nan_to_zero_regridding']:
            weights = weights.where(~np.isnan(weights),0)


        # Regrid, if necessary (do nothing if the grids match up to within
        # floating-point precision)
        if ((not ((ds.sizes['lat'] == weights.sizes['lat']) and (ds.sizes['lon'] == weights.sizes['lon']))) or 
            (not (np.allclose(ds.lat,weights.lat) and np.allclose(ds.lon,weights.lon)))):

            # Import xesmf here to allow the code to work without it (it 
            # often has dependency issues and isn't necessary for many 
            # features of xagg)
            if not _has_xesmf:
                raise ImportError('If the `weights` grid and the `ds` grid are different, '+
                                 '`xesmf` is needed for `xagg` to regrid them to match; however, '+
                                 '`xesmf` is not installed. Either install `xesmf` or '+
                                 'manually regrid them to match each other.')

            # Make sure the weights actually cover the dataset before 
            # regridding (which could otherwise interpolate / hallucinate
            # beyond its boundaries)
            bbox_ds = [ds.lat.min(),ds.lat.max(),ds.lon.min(),ds.lon.max()]
            bbox_wgts = [weights.lat.min(),weights.lat.max(),weights.lon.min(),weights.lon.max()]
            if ((bbox_wgts[0]>bbox_ds[0]) or (bbox_wgts[1]<bbox_ds[1]) or
                (bbox_wgts[2]>bbox_ds[2]) or (bbox_wgts[3]<bbox_wgts[3])):
                warnings.warn('The `weights` input (bbox latmin, latmax, lonmin, lonmax: '+
                              ', '.join([str(k.values) for k in bbox_wgts])+
                              ' spans less area than the `ds` input (bbox: '+
                              ', '.join([str(k.values) for k in bbox_ds])+'). Weights beyond '+
                              ' this bounding box may be set to 0 or incorrectly interpolated.')

            if target == 'ds':
                if not silent:
                    print('regridding weights to data grid...')
                # Create regridder to the [ds] coordinates
                rgrd = xe.Regridder(weights,ds,get_options()['rgrd_alg'])
                # Regrid [weights] to [ds] grids
                weights = rgrd(weights)

            elif target == 'weights':
                raise NotImplementedError('The '+target+' variable is not *yet* supported as a target for regridding. Please choose "ds" for now.')
                # This is because of lack of downstream capability right now... 
                if not silent:
                    print('regridding data to weights grid...')
                # Create regridder to the [weights] coordinates
                rgrd = xe.Regridder(ds,weights,get_options()['rgrd_alg'])
                # Regrid [ds] to [weights] grid
                ds = rgrd(ds)

            else:
                raise KeyError(target+' is not a supported target for regridding. Choose "weights" or "ds".')
        else:
            # Make sure the values are actually identical, not just "close", 
            # otherwise assigning may not work below 
            weights['lat'] = ds['lat'].values
            weights['lon'] = ds['lon'].values

        # Add weights to ds
        ds['weights'] = weights

        # Add warnings
        if np.isnan(ds['weights']).all():
            warnings.warn('All inputted `weights` are np.nan after regridding.')
        if (ds['weights'] == 0).all():
            warnings.warn('All inputted `weights` are 0 after regridding.')
            
    # Return
    return ds,weights_info




[docs]
def make_multipoly(pts):
    ''' Split pixel overlapping the antimeridian into MultiPolygon with each sub-Polygon in its own hemisphere

    NB: to be used in :py:meth:`create_raster_polygons()`
        
    '''
    pts = np.array(pts)
    # Get which pixels are east of the antimeridian
    neg = pts[:,0]<0

    # Get point order for split up pixel 
    pts = [ # west of antimeridian
            np.vstack([pts[~neg],np.array([[180,x[1]] for x in pts[~neg][::-1]])]),
            # east of antimeridian
           np.vstack([np.array([[-180,x[1]] for x in pts[neg][::-1]]),pts[neg]])]
    pts = [[tuple(x) for x in pt] for pt in pts]

    # Create multipolygon
    return MultiPolygon([Polygon(pt) for pt in pts])





[docs]
def create_raster_polygons(ds,
                           mask=None,subset_bbox=None,
                           weights=None,weights_target='ds',
                           wrap_around_thresh=5,
                          silent=None):
    """ Create polygons for each pixel in a raster

    Note: 
    'lat_bnds' and 'lon_bnds' can be created through the
    :func:`xagg.aux.get_bnds` function if they are not already included
    in the input raster file. 
    
    Note:
    Currently this code only supports regular 
    rectangular grids (so where every pixel side is
    a straight line in lat/lon space). Future versions
    may include support for irregular grids. 
    
    Parameters
    ---------------
    ds : :class:`xarray.Dataset`
        an xarray dataset with the variables 
        'lat_bnds' and 'lon_bnds', which are both
        lat/lon x 2 arrays giving the min and 
        max values of lat and lon for each pixel
        given by lat/lon
    subset_bbox : :class:`geopandas.GeoDataFrame`, optional, default = ``None`` 
        if a :class:`geopandas.GeoDataFrame`
        is entered, the bounding
        box around the geometries in the `gdf` 
        are used to mask the grid, to reduce
        the number of pixel polygons created
          
    Returns
    ---------------
    pix_agg: :py:class:`dict`
        a dictionary containing:

        - ``'gdf_pixels'`` 
            a :class:`geopandas.GeoDataFrame` containing a 
            'geometry' giving the pixel boundaries 
            for each 'lat' / 'lon' pair
        - ``'source_grid'`` 
            a dictionary containing the original
            lat and lon inputs under the keys
            "lat" and "lon" (just the 
            :class:`xarray.DataArray` of those variables in
            the input `ds`)
                      
    """

    if silent is None:
        silent = get_options()['silent']
    
    # Standardize inputs (including lat/lon order)
    ds = fix_ds(ds)
    ds = get_bnds(ds,silent=silent)
    #breakpoint()
    
    # Process weights
    ds,winf = process_weights(ds,weights,target=weights_target)

    # Subset by shapefile bounding box, if desired
    if subset_bbox is not None:
        if type(subset_bbox) == gpd.geodataframe.GeoDataFrame:
            # Using the biggest difference in lat/lon to make sure that the pixels are subset
            # in a way that the bounding box is fully filled out
            # bbox_thresh = np.max([ds.lat.diff('lat').max(),ds.lon.diff('lon').max()])+0.1 
            grid_dist = np.max([ds.lat.diff('lat').max(),ds.lon.diff('lon').max()]) # first get the max grid size
            bbox_thresh = grid_dist*2. # then set threshold to twice grid size, avoids huge subsets for high res grids
            ds = ds.sel(lon=slice(subset_bbox.total_bounds[0]-bbox_thresh,subset_bbox.total_bounds[2]+bbox_thresh),
                        lat=slice(subset_bbox.total_bounds[1]-bbox_thresh,subset_bbox.total_bounds[3]+bbox_thresh))
        else:
            warnings.warn('[subset_bbox] is not a geodataframe; no mask by polygon bounding box used.')
            
            
    # Mask
    if mask is not None:
        raise NotImplementedError('Masking by grid not yet supported. Stay tuned...')
        
    # Create dataset which has a lat/lon bound value for each individual pixel, 
    # broadcasted out over each lat/lon pair
    (ds_bnds,) = (xr.broadcast(ds.isel({d:0 for d in [k for k in ds.sizes if k not in ['lat','lon','bnds']]}).
                              drop_vars([v for v in ds.keys() if v not in ['lat_bnds','lon_bnds']])))
    # Stack so it's just pixels and bounds
    ds_bnds = ds_bnds.stack(loc=('lat','lon'))
        
    # In order:
    # (lon0,lat0),(lon0,lat1),(lon1,lat1),(lon1,lat1), but as a single array; to be 
    # put in the right format for Polygon in the next step
    pix_poly_coords = np.transpose(np.vstack([ds_bnds.lon_bnds.isel(bnds=0).values,ds_bnds.lat_bnds.isel(bnds=0).values,
                                                ds_bnds.lon_bnds.isel(bnds=0).values,ds_bnds.lat_bnds.isel(bnds=1).values,
                                                ds_bnds.lon_bnds.isel(bnds=1).values,ds_bnds.lat_bnds.isel(bnds=1).values,
                                                ds_bnds.lon_bnds.isel(bnds=1).values,ds_bnds.lat_bnds.isel(bnds=0).values]))

    # Reshape so each location has a 4 x 2 (vertex vs coordinate) array, 
    # and convert each of those vertices to tuples. This means every element
    # of pix_poly_coords is the input to shapely.geometry.Polygon of one pixel
    pix_poly_coords = tuple(map(tuple,np.reshape(pix_poly_coords,(np.shape(pix_poly_coords)[0],4,2))))

    # Figure out if any pixels cross the antimeridian; we'll have to deal with 
    # those separately... Identify them by seeing which pixels have longitudes
    # that are within the `wrap_around_thresh` (by default 5 degs) of both 
    # +180 and -180 degrees
    cross_antimeridian_idxs = ((np.abs(np.array(pix_poly_coords)[:,:,0] - -180) < wrap_around_thresh).any(axis=1) & 
     (np.abs(np.array(pix_poly_coords)[:,:,0] - 180) < wrap_around_thresh).any(axis=1))

    # Create empty geodataframe
    gdf_pixels = gpd.GeoDataFrame(pd.DataFrame({v:[None]*ds_bnds.sizes['loc']
                               for v in ['lat','lon','geometry']}),
                              geometry='geometry')
    if weights is not None:
        # Stack weights so they are linearly indexed like the ds (and fill
        # NAs with 0s)
        weights = ds.weights.stack(loc=('lat','lon')).fillna(0)
        # Preallocate weights column
        gdf_pixels['weights'] = [None]*ds_bnds.sizes['loc']
    
    # Now populate with a polygon for every pixel
    poly_dict = {'poly_pts': pix_poly_coords}
    df_poly = pd.DataFrame(poly_dict, columns=['poly_pts'])
    df_poly['poly'] = df_poly.poly_pts.apply(lambda pts: Polygon(pts))
    # Make MultiPolygons for pixels crossing the antimeridian
    df_poly.loc[np.where(cross_antimeridian_idxs)[0],'poly'] = df_poly.poly_pts.iloc[np.where(cross_antimeridian_idxs)[0]].apply(lambda pts: make_multipoly(pts))
    # Set geometry 
    gdf_pixels['geometry']=df_poly['poly']
    # Add lat/lons of pixels for identification 
    gdf_pixels['lat']=ds_bnds.lat.values
    gdf_pixels['lon']=ds_bnds.lon.values  
    if weights is not None:
        gdf_pixels['weights'] = weights.values

    # Add a "pixel idx" to make indexing better later
    gdf_pixels['pix_idx'] = gdf_pixels.index.values
    
    # Add crs (normal lat/lon onto WGS84)
    gdf_pixels = gdf_pixels.set_crs("EPSG:4326")
    #gdf_pixels.crs = {'init':'EPSG:4326'}
    
    # Save the source grid for further reference
    source_grid = {'lat':ds_bnds.lat,'lon':ds_bnds.lon}
    
    pix_agg = {'gdf_pixels':gdf_pixels,'source_grid':source_grid}
    
    # Return the created geodataframe
    return pix_agg




[docs]
def get_pixel_overlaps(gdf_in,pix_agg,impl=None):
    """ Get, for each polygon, the pixels that overlap and their area of overlap
    
    Finds, for each polygon in `gdf_in`, which pixels intersect it, and by how much. 
    
    Note: 
    Uses EASE-Grid 2.0 on the WGS84 datum to calculate relative areas
    (see https://nsidc.org/data/ease)
    
    Parameters
    ---------------
    gdf_in : :class:`geopandas.GeoDataFrane`
        a :class:`geopandas.GeoDataFrame` giving the polygons over which 
        the variables should be aggregated. Can be just a read
        shapefile (with the added column of "poly_idx", which 
        is just the index as a column).
    pix_agg : :py:class:`dict`
        the output of :func:`xagg.core.create_raster_polygons`; a dict containing:

        - ``'gdf_pixels'``
            a :class:`geopandas.GeoDataFrame` giving for each row 
            the columns "lat" and "lon" (with coordinates) 
            and a polygon giving the boundary of the pixel 
            given by lat/lon
        - ``'source_grid'``
            ``[da.lat,da.lon]`` of the grid used to create
            the pixel polygons

    impl : :py:class:`str` (set by :py:meth:`xa.set_options`)
        whether the output will be used for the dot-product aggregation 
        calculation (needs a slightly different format), either of: 
        - ``'for_loop'`` (default behavior)
        - ``'dot_product'`` (to set up for ``impl='dot_product'`` in ``xagg.core.aggregate``)

    Returns
    ---------------
    wm_out: :py:class:`dict`
        A dictionary containing: 
        
        - ``'agg'``:
            a dataframe containing all the fields of ``gdf_in`` (except
            geometry) and the additional columns: 

            - ``coords``:  the lat/lon coordiates of all pixels that overlap the polygon of that row
            - ``pix_idxs``: the linear indices of those pixels within the ``gdf_pixels`` grid
            - ``rel_area``: the relative area of each of the overlaps between the pixels and the polygon (summing to 1 - e.g. if the polygon is exactly the size and location of two pixels, their rel_areas would be 0.5 each)

        - ``'source_grid'``
            a dictionary with keys 'lat' and 'lon' giving the original lat/lon grid whose overlaps with the polygons was calculated
        - ``'geometry'`` 
            just the polygons from ``gdf_in``

    """

    if impl is None:
        impl = get_options()['impl']
    
    # Add an index for each polygon as a column to make indexing easier
    #if 'poly_idx' not in gdf_in.columns:
    #    gdf_in['poly_idx'] = gdf_in.index.values
    gdf_in['poly_idx'] = np.arange(0,len(gdf_in))
        
    # Match up CRSes
    pix_agg['gdf_pixels'] = pix_agg['gdf_pixels'].to_crs(gdf_in.crs)

    # Choose a common crs for both, just to minimize the chance
    # of geographic shenanigans
    # (using the EASE grid https://nsidc.org/data/ease)
    if np.all(gdf_in.total_bounds[[1,3]]>0):
        # If min/max lat are both in NH, use North grid
        #epsg_set = {'init':'EPSG:6931'} (change to below bc of depreciation of {'init':...} format in geopandas)
        epsg_set = 'EPSG:6931'
    elif np.all(gdf_in.total_bounds[[1,3]]<0):
        # If min/max lat are both in SH, use South grid
        #epsg_set = {'init':'EPSG:6932'}
        epsg_set = 'EPSG:6932'
    else:
        # Otherwise, use the global/temperate grid
        #epsg_set = {'init':'EPSG:6933'}
        epsg_set = 'EPSG:6933'
    
    # Get GeoDataFrame of the overlaps between every pixel and the polygons
    with warnings.catch_warnings():
        # Filter UserWarnings that flag when overlapping would result in 
        # lines, or point overlaps (but we only care about 3D overlaps, so
        # keep_geom_type=True is the right call, and we don't need the 
        # warning)
        warnings.filterwarnings('ignore',category=UserWarning)
        overlaps = gpd.overlay(gdf_in.to_crs(epsg_set),
                               pix_agg['gdf_pixels'].to_crs(epsg_set),
                               how='intersection')
    
    if overlaps.empty:
        raise NoOverlapError('No `ds` grid cells overlapped with any polygon in `gdf_in`. Check the input `ds` and `gdf_in`.')
    else:
        if impl=='dot_product':
            # Get relative area of each pixel
            overlaps = overlaps.groupby('poly_idx',group_keys=False)[overlaps.columns.tolist()].apply(find_rel_area,include_groups=False)
            overlaps['lat'] = overlaps['lat'].astype(float)
            overlaps['lon'] = overlaps['lon'].astype(float)

            # Now, group by poly_idx (each polygon in the shapefile)
            ov_groups = overlaps.groupby('poly_idx')

            overlap_info = ov_groups.agg(list_or_first)

            overlap_info = overlap_info.rename(columns={'pix_idx': 'pix_idxs'})
        elif impl=='for_loop':
            # Now, group by poly_idx (each polygon in the shapefile)
            ov_groups = overlaps.groupby('poly_idx')
            # Calculate relative area of each overlap (so how much of the total 
            # area of each polygon is taken up by each pixel), the pixels 
            # corresponding to those areas, and the lat/lon coordinates of 
            # those pixels
            overlap_info = ov_groups.agg(rel_area=pd.NamedAgg(column='geometry',aggfunc=lambda ds: [normalize(ds.area)]),
                                           pix_idxs=pd.NamedAgg(column='pix_idx',aggfunc=lambda ds: [idx for idx in ds]),
                                           lat=pd.NamedAgg(column='lat',aggfunc=lambda ds: [x for x in ds]),
                                           lon=pd.NamedAgg(column='lon',aggfunc=lambda ds: [x for x in ds]))

    # Zip lat, lon columns into a list of (lat,lon) coordinates
    # (separate from above because as of 12/20, named aggs with 
    # multiple columns is still an open issue in the pandas github)
    overlap_info['coords'] = overlap_info.apply(lambda row: list(zip(row['lat'],row['lon'])),axis=1)
    overlap_info = overlap_info.drop(columns=['lat','lon'])

    # Reset index to make poly_idx a column for merging with gdf_in
    overlap_info = overlap_info.reset_index()

    if impl=='dot_product':
        # Merge in pixel overlaps to the input polygon geodataframe
        overlap_columns = ['pix_idxs', 'rel_area', 'coords', 'poly_idx']
        gdf_in = pd.merge(gdf_in, overlap_info[overlap_columns],'outer', on='poly_idx')

        # make the weight grid an xarray dataset for later dot product
        idx_cols = ['lat', 'lon', 'poly_idx']
        overlap_da = overlaps.set_index(idx_cols)['rel_area'].to_xarray()
        overlap_da = overlap_da.stack(loc=['lat', 'lon'])
        overlap_da = overlap_da.fillna(0)
        wm_out = weightmap(agg=gdf_in.drop('geometry', axis=1),
                   source_grid=pix_agg['source_grid'],
                   geometry=gdf_in.geometry,
                   overlap_da = overlap_da)
    elif impl=='for_loop':
        # Merge in pixel overlaps to the input polygon geodataframe
        gdf_in = pd.merge(gdf_in,overlap_info,'outer')

        # Drop 'geometry' eventually, just for size/clarity
        wm_out = weightmap(agg=gdf_in.drop('geometry',axis=1),
                           source_grid=pix_agg['source_grid'],
                           geometry=gdf_in.geometry)
    
    if 'weights' in pix_agg['gdf_pixels'].columns:
        wm_out.weights = pix_agg['gdf_pixels'].weights

    return wm_out


#--------- NUMBA HELPER FUNCTIONS ---------
if _has_numba:
    @njit
    def numba_indexer(x,idxs):
        vals = np.full(len(idxs),np.nan)
        for i in range(len(idxs)):
            vals[i] = x[int(idxs[i])]
        return vals

    @njit
    def numba_aggregate(data,idxs,rel_area,add_weights=None):
        if add_weights is None:
            add_weights = np.ones(len(data))
        
        # Since idxs, rel_area are stored in a dataarray,
        # in the weightmap, they're padded with nans. 
        # Remove those. 
        idxs = idxs[~np.isnan(idxs)]
        rel_area = rel_area[~np.isnan(rel_area)]
    
        # Get the indices of the data needed
        data = numba_indexer(data,idxs)
        add_weights = numba_indexer(add_weights,idxs)

        # Normalize area and additional weights to a single
        # weights vector
        weights = rel_area * add_weights
    
        # Make sure to do aggregation only on non-nan pixels
        nans = np.isnan(data)
        data = data[~nans]
        weights = weights[~nans]
    
        if len(data) == 0:
            agg = np.nan
        else:
            agg = np.sum(data*weights)/np.sum(weights)
    
        return agg
#--------------------------------------------


[docs]
def aggregate(ds,wm,impl=None,silent=None):
    """ Aggregate raster variable(s) to polygon(s)
    
    Aggregates (N-D) raster variables in `ds` to the polygons
    in `gfd_out` - in other words, gives the weighted average
    of the values in [ds] based on each pixel's relative area
    overlap with the polygons. 
    
    The values will be additionally weighted if a `weight` was
    inputted into :func:`xagg.core.create_raster_polygons`
    
    The code checks whether the input lat/lon grid in `ds` is 
    equivalent to the linearly indexed grid in `wm`, or 
    if it can be cropped to that grid. 
    
    
    Parameters
    ---------------
    ds : :py:class:`xarray.Dataset`
        an :py:class:`xarray.Dataset` containing one or more
        variables with dimensions lat, lon (and possibly
        more). The dataset's geographic grid has to 
        include the lat/lon coordinates used in 
        determining the pixel overlaps in 
        :func:`xagg.core.get_pixel_overlaps` (and saved in 
        ``wm['source_grid']``)
               
    wm : :py:class:`xagg.classes.weightmap`
        the output to :func:`xagg.core.get_pixel_overlaps`; a 
        :py:class:`xagg.classes.weightmap` object containing 

        - ``['agg']``
            a dataframe, 
            with one row per polygon, and the columns `pix_idxs`
            and `rel_area`, giving the linear indices and 
            the relative area of each pixel over the polygon,
            respectively 
        - ``['source_grid']``  
            the lat/lon grid on which the aggregating parameters
            were calculated (and on which the linear indices 
            are based)

    impl : :py:class:str (def: ``'for_loop'``) (set by :py:meth:`xa.set_options`)
        which aggregation calculation method to use, either of: 

        - ``'for_loop'`` 
            default behavior, aggregation loops through
            all polygons in a for loop, requires less memory 
        - ``'dot_product'`` 
            aggregation is calculated using a dot product, 
            requires much more memory (due to broadcasting of
            variables) but may be faster in certain circumstances
        - ``'numba'``
            aggregation is calculated using a simplified function
            that uses numba's just-in-time compiler. Can be substantially
            faster 

    silent : :py:class:`bool`, default = `False` (set by :py:meth:`xa.set_options`)
        if True, then no status updates are printed to std out
               
    Returns
    ---------------
    agg_out : :class:`xagg.classes.aggregated`
        an :class:`xagg.classes.aggregated` object with the aggregated variables 
    
    """
    if impl is None:
        impl = get_options()['impl']
    if silent is None:
        silent = get_options()['silent']

    # Triggers if/once a partial nan warning is called, to avoid redoing the 
    # warning for every variable, every polygon
    _warn_trigger_partialnan = True  

    # Make sure pixel_overlaps was correctly run if using dot product
    if (impl=='dot_product') and (wm.overlap_da is None):
        raise ValueError("no 'overlap_da' was found in the `wm` input - since you're using the dot product implementation, "+
                         "make sure to run `pixel_overlaps()` with `impl='dot_product'` to avoid this error.")

    # Turn into dataset if dataarray
    if type(ds)==xr.core.dataarray.DataArray:
      if ds.name is None:
        warnings.warn('An unnamed xr.DataArray was inputted instead of a xr.Dataset; the output variable will be "var"')
        ds = ds.to_dataset(name='var')
      else:
        ds = ds.to_dataset()


    # Run ds through fix_ds (to fix lat/lon names, lon coords)
    ds = fix_ds(ds)

    # Stack 
    ds = ds.stack(loc=('lat','lon'))
    
    # Adjust grid of [ds] if necessary to match 
    ds = subset_find(ds,wm.source_grid,silent=silent)
    
    # Set weights; or replace with ones if no additional weight information
    #if wm.weights != 'nowghts':
    if type(wm.weights) == pd.core.series.Series:
        weights = np.array([float(k) for k in wm.weights])
    else:
        if wm.weights != 'nowghts':
            warnings.warn('wm.weights is: \n '+print(wm.weights)+
                            ', \n which is not a supported weight vector (in a pandas series) '+
                            'or "nowghts" as a string. Assuming no weights are included...')
        if impl=='dot_product':
            weights = np.ones((len(wm.overlap_da['loc'])))
        elif (impl=='for_loop') or (impl=='numba'):
            weights = np.ones((len(wm.source_grid['lat'])))
    
    if impl=='dot_product':
        data_dict = dict()
        for var in ds:
            # Process for every variable that has locational information, but isn't a 
            # bound variable
            if ('bnds' not in ds[var].sizes) & ('loc' in ds[var].sizes):
                if not silent:
                    print('aggregating '+var+'...')

                # select just data for which we have overlaps
                var_array = ds[var].sel(loc=wm.overlap_da['loc'])

                # Get the dimensions of the variable that aren't "loc" (location)
                other_dims = [k for k in np.atleast_1d(var_array.sizes) if k != 'loc']

                # Check first if any nans are "complete" (meaning that a pixel 
                # either has values for each step, or nans for each step - if
                # there are random nans along non-location dimensions for the 
                # same grid cell, throw a warning)
                _warn_trigger_partialnan=_warn_ifsomenans(var_array,var,other_dims,
                                                            _warn_trigger_partialnan)


                # multiply percent-overlaps by user-supplied weights 
                weights_and_overlaps = wm.overlap_da * weights

                # fill any nans in the gridded data to zero and change the corresponding
                # weights to zero so that they will not affect the aggregated value
                var_array_filled = var_array.fillna(0)
                weights_and_overlaps = weights_and_overlaps.where(var_array.notnull(), 0)

                # the weights now may add up to less than one because of nans or
                # because of the user-supplied weights. here we normalize the
                # weights so they add up to 1 and fill any nan's with 0 so they
                # won't be counted
                normed_weights = weights_and_overlaps/weights_and_overlaps.sum(dim = "loc", skipna=True)
                normed_weights = normed_weights.fillna(0)

                # finally we do the dot product to get the weighted averages
                aggregated_array = normed_weights.dot(var_array_filled, dim='loc')

                # if the original gridded values were all nan, make the final
                # aggregation nan
                if np.isnan(var_array.values).all():
                    aggregated_array = aggregated_array * np.nan

                data_dict[var] = aggregated_array

        ds_combined = xr.Dataset(data_dict)  
        print('ds_combined:')
        print(ds_combined)

        for var in ds:
            if ('bnds' not in ds[var].sizes) and ('loc' in ds[var].sizes):
                # convert to list of arrays - NOT SURE THIS IS THE RIGHT THING TO
                # DO, JUST TRYING TO MATCH ORIGINAL FORMAT
                wm.agg[var]=pd.Series([[[ds_combined[var].isel(poly_idx=i).values]] for i in range(len(ds_combined.poly_idx))])
        # Put in class format
        agg_out = aggregated(agg=wm.agg,source_grid=wm.source_grid,
                            geometry=wm.geometry,ds_in=ds_combined,weights=wm.weights)
    elif impl=='for_loop':
        for var in ds:
            # Process for every variable that has locational information, but isn't a 
            # bound variable
            if ('bnds' not in ds[var].sizes) and ('loc' in ds[var].sizes):
                if not silent:
                    print('aggregating '+var+'...')
                # Create the column for the relevant variable
                wm.agg[var] = None
                
                # Get weighted average of variable based on pixel overlap + other weights
                for poly_idx in wm.agg.poly_idx:
                    # Get average value of variable over the polygon; weighted by 
                    # how much of the pixel area is in the polygon, and by (optionally)
                    # a separate gridded weight. This if statement checks 
                    # - if the weights output for this polygon is just "np.nan", which 
                    #   indicates there's no overlap between the pixel grid and polygons
                    # - [this has been pushed down a few lines] or, if all the pixels in 
                    #   the grid have just nan values for this variable
                    # in both cases; the "aggregated variable" is just a vector of nans. 
                    if not np.isnan(wm.agg.iloc[poly_idx,:].pix_idxs).all():
                        # Get the dimensions of the variable that aren't "loc" (location)
                        other_dims = [k for k in np.atleast_1d(ds[var].sizes) if k != 'loc']
                        # Check first if any nans are "complete" (meaning that a pixel 
                        # either has values for each step, or nans for each step - if
                        # there are random nans along non-location dimensions for the 
                        # same grid cell, throw a warning)
                        _warn_trigger_partialnan=_warn_ifsomenans(ds,var,other_dims,
                                                                  _warn_trigger_partialnan)


                        if not np.isnan(ds[var].isel(loc=wm.agg.iloc[poly_idx,:].pix_idxs)).all(): 
                            # Get relative areas for the pixels overlapping with this Polygon
                            tmp_areas = np.atleast_1d(np.squeeze(wm.agg.iloc[poly_idx,:].rel_area))
                            # Replace overlapping pixel areas with nans if the corresponding pixel
                            # is only composed of nans
                            tmp_areas[np.array(np.isnan(ds[var].isel(loc=wm.agg.iloc[poly_idx,:].pix_idxs)).all(other_dims).values)] = np.nan
                            # Calculate the normalized area+weight of each pixel (taking into account
                            # nans)
                            normed_areaweights = normalize(tmp_areas*weights[wm.agg.iloc[poly_idx,:].pix_idxs],drop_na=True)

                            # Take the weighted average of all the pixel values to calculate the 
                            # aggregated value for the shapefile
                            wm.agg.loc[poly_idx,var] = [[((ds[var].isel(loc=wm.agg.iloc[poly_idx,:].pix_idxs)*
                                                             normed_areaweights).
                                                            sum('loc')).values]]

                        else:
                            wm.agg.loc[poly_idx,var] = [[(ds[var].isel(loc=0)*np.nan).values]]

                    else:
                        wm.agg.loc[poly_idx,var] = [[(ds[var].isel(loc=0)*np.nan).values]]

        # Put in class format
        agg_out = aggregated(agg=wm.agg,source_grid=wm.source_grid,
                             geometry=wm.geometry,ds_in=ds,weights=wm.weights)
    elif impl=='numba':
        if not _has_numba:
            raise ImportError("impl == 'numba' requires `numba`, which is not installed.")
        # Extract pixel idxs, area overlap for each polygon from weightmap object
        pix_idxs = [np.atleast_1d(pix) for pix in wm.agg.pix_idxs.values]
        rel_area = [np.atleast_1d(area[0]) if area is not np.nan else 
                    np.atleast_1d(np.nan) for area in wm.agg.rel_area.values]
        max_pix = max(len(lst) for lst in pix_idxs)  # Max number of pixels per polygon, to pad to

        # Preallocate arrays filled with NaNs
        n_polys = len(wm.agg)
        pix_idxs_padded = np.full((n_polys, max_pix), np.nan)
        rel_area_padded = np.full((n_polys, max_pix), np.nan)

        # Fill the arrays, padding to max number of pixels with nans
        for i, (pix_idxs, rel_area) in enumerate(zip(pix_idxs, rel_area)):
            pix_idxs_padded[i, :len(pix_idxs)] = pix_idxs
            rel_area_padded[i, :len(rel_area)] = rel_area

        # Put into dataset
        idxs = xr.Dataset({'idxs': (['poly_idx', 'idx'], pix_idxs_padded),
                          'rel_area': (['poly_idx', 'idx'], rel_area_padded)},
                          coords={'poly_idx': wm.agg.poly_idx.values,
                                    'idx': np.arange(max_pix)}
                        )
        # Make into dask array 
        idxs = idxs.chunk({'poly_idx':'auto','idx':-1})

        # Make weights data array, to put into apply_ufunc below
        weights = xr.DataArray(weights,
                            dims = ['loc'],
                            coords = {'loc':ds.loc})

        for var in ds:
            if ('bnds' not in ds[var].sizes) and ('loc' in ds[var].sizes):
                if not silent:
                    print('aggregating '+var+'...')

                # Get the dimensions of the variable that aren't "loc" (location)
                other_dims = [k for k in np.atleast_1d(ds[var].sizes) if k != 'loc']

                # Check first if any nans are "complete" (meaning that a pixel 
                # either has values for each step, or nans for each step - if
                # there are random nans along non-location dimensions for the 
                # same grid cell, throw a warning)
                _warn_trigger_partialnan=_warn_ifsomenans(ds,var,other_dims,
                                                          _warn_trigger_partialnan)

                # Aggregate
                agg = xr.apply_ufunc(numba_aggregate,
                                   ds[var],
                                   idxs.idxs,
                                   idxs.rel_area,
                                   weights,
                                   input_core_dims = [['loc'],['idx'],['idx'],['loc']],
                                   output_core_dims = [[]],
                                   vectorize=True,
                                   dask='parallelized',
                                   output_dtypes = [float]).to_dataset(name=var)

                # Trigger computation, otherwise it separately triggers computation
                # for every poly_idx below, with tremendous overhead 
                agg = agg.compute()

                # Put in `aggregated` format (TODO: should reform this to remove some of the 
                # nested lists that are here for some reason)
                wm.agg[var] = [[[agg[var].sel(poly_idx = idx).values]] for idx in wm.agg.index]

        # Put in class format
        agg_out = aggregated(agg=wm.agg,source_grid=wm.source_grid,
                             geometry=wm.geometry,ds_in=ds,weights=wm.weights)


    # Return
    if not silent:
        print('all variables aggregated to polygons!')
    return agg_out