Source code for xagg.core

import xarray as xr
import numpy as np
import pandas as pd
import geopandas as gpd
from shapely.geometry import Polygon
from shapely.geometry import MultiPolygon
import warnings
import re 
import os
try:
    import xesmf as xe
    _has_xesmf=True
except ImportError:
    _has_xesmf=False
try:
    from numba import jit as njit
    _has_numba = True
except ImportError:
    _has_numba = False

from . auxfuncs import (find_rel_area,normalize,fix_ds,get_bnds,subset_find,list_or_first,_warn_ifsomenans)
from . classes import (weightmap,aggregated)
from . options import get_options

[docs] class NoOverlapError(Exception): """ Exception for when there's no overlap between pixels and polygons """ pass
[docs] def read_wm(path): """ Load temporary weightmap files from :py:meth:`wm.to_file()` Builds a weightmap out of saved weightmap component files. Particularly useful if the weightmap took a particularly long time to calculated (i.e., if the grid is particularly high resolution). Assumes the files are generated from :py:meth:`wm.to_file()`; i.e., the files are all in a directory `name`: - `name/name.shp` : the geometry of the input polygons - `name/name.agg` : the dataframe with the pixel overlap data - `name/name_lat.nc`, `name/name_lon.nc` : the source grid of the raster data - `name/name_weights.nc` : the additional weights grid, if used (this file is optional; if no file with this name is found, no weights are assumed, and `wm.weights='noweights'`) Parameters --------------- path : :py:class:`str` The directory in which the files are stored. They are assumed to follow the filename convention of sharing the name of the directory (i.e., the last part of this path.) Returns --------------- wm : :class:`xagg.weightmap` """ # the last bit of the path is also the filename fn = re.split('/',path)[-1] ###### Load geometry geo = gpd.read_file(path+'/'+fn+'.shp') geo = geo['geometry'] ####### Load agg agg = pd.read_hdf(path+'/'+fn+'.h5', 'wm') ###### Load source grid source_grid = {k:xr.open_dataset(path+'/'+fn+'_'+k+'.nc').set_index({'loc':('lat','lon')})[k+'v'] for k in ['lat','lon']} # Rename, removing the v added for the multi-index issue in export source_grid = {k:v.rename(k) for k,v in source_grid.items()} ###### Load weights if os.path.exists(path+'/'+fn+'_weights.csv'): # Specifying column because it saves it as with # a dummy index column that gets loaded in an # unproductive way weights = pd.read_csv(path+'/'+fn+'_weights.csv')['weights'].astype(object) # ^^ Setting astype(object) to make sure integral weights # don't change the general type of the frame. This may # only affect the testing routines, but setting this here # to be explicit else: weights = 'nowghts' ###### Combine into weightmap wm = weightmap(agg=agg, geometry=geo, source_grid=source_grid, weights=weights) return wm
[docs] def process_weights(ds,weights=None,target='ds',silent=None): """ Process weights - including regridding If ``target == 'ds'``, regrid `weights` to `ds`. If ``target == 'weights'``, regrid `ds` to `weights`. Parameters --------------- ds : :class:`xarray.Dataset`, :class:`xarray.DataArray` an :class:`xarray.Dataset`/:class:`xarray.DataArray` to regrid weights : :class:`xarray.DataArray`, optional, default = ``None`` an :class:`xarray.DataArray` containing a weight (numeric) at each location target : :py:class:`str`, optional, default = ``'ds'`` whether weights should be regridded to the `ds` grid (by default) or vice-versa (not yet supported, returns ``NotImplementedError``) silent : :py:class:`bool`, default = `False` (set by :py:meth:`xa.set_options`) if True, then no status updates are printed to std out Returns --------------- ds : :class:`xarray.Dataset`, :class:`xarray.DataArrays` the input :class:`xarray.Dataset`/:class:`xarray.DataArray`, with a new variable `weights` specifying weights for each pixel weights_info : :py:class:`dict` a dictionary storing information about the weights regridding process, with the fields: - ``target``: showing which of the two grids was retained - ``ds_grid``: a dictionary with the grid ``{"lat":ds.lat,"lon",ds.lon}`` - ``weights_grid``: a dictionary with the grid ``{"lat":weights.lat,"lon":weights.lon}`` """ if silent is None: silent = get_options()['silent'] if weights is None: # (for robustness against running this without an extra if statement # in a wrapper function) weights_info = 'nowghts' else: # Check types if type(weights) is not xr.core.dataarray.DataArray: raise TypeError('[weights] must be an xarray DataArray.') if type(ds) not in [xr.core.dataarray.DataArray, xr.core.dataset.Dataset]: raise TypeError('[ds] must be an xarray structure (DataArray or Dataset)') # Stick weights into the same supported input format as ds weights = fix_ds(weights) # Set regridding info weights_info = {'target':target, 'ds_grid':{'lat':ds.lat,'lon':ds.lon}, 'weights_grid':{'lat':weights.lat,'lon':weights.lon}} # Change nans to 0; often files used for weights (pop density, etc.) # have open water labeled as nan instead of 0 - but for the purposes of # calculating a weight, "0" is more accurate (conservative regridding # algorithms will otherwise miss coastal pixels) if get_options()['nan_to_zero_regridding']: weights = weights.where(~np.isnan(weights),0) # Regrid, if necessary (do nothing if the grids match up to within # floating-point precision) if ((not ((ds.sizes['lat'] == weights.sizes['lat']) and (ds.sizes['lon'] == weights.sizes['lon']))) or (not (np.allclose(ds.lat,weights.lat) and np.allclose(ds.lon,weights.lon)))): # Import xesmf here to allow the code to work without it (it # often has dependency issues and isn't necessary for many # features of xagg) if not _has_xesmf: raise ImportError('If the `weights` grid and the `ds` grid are different, '+ '`xesmf` is needed for `xagg` to regrid them to match; however, '+ '`xesmf` is not installed. Either install `xesmf` or '+ 'manually regrid them to match each other.') # Make sure the weights actually cover the dataset before # regridding (which could otherwise interpolate / hallucinate # beyond its boundaries) bbox_ds = [ds.lat.min(),ds.lat.max(),ds.lon.min(),ds.lon.max()] bbox_wgts = [weights.lat.min(),weights.lat.max(),weights.lon.min(),weights.lon.max()] if ((bbox_wgts[0]>bbox_ds[0]) or (bbox_wgts[1]<bbox_ds[1]) or (bbox_wgts[2]>bbox_ds[2]) or (bbox_wgts[3]<bbox_wgts[3])): warnings.warn('The `weights` input (bbox latmin, latmax, lonmin, lonmax: '+ ', '.join([str(k.values) for k in bbox_wgts])+ ' spans less area than the `ds` input (bbox: '+ ', '.join([str(k.values) for k in bbox_ds])+'). Weights beyond '+ ' this bounding box may be set to 0 or incorrectly interpolated.') if target == 'ds': if not silent: print('regridding weights to data grid...') # Create regridder to the [ds] coordinates rgrd = xe.Regridder(weights,ds,get_options()['rgrd_alg']) # Regrid [weights] to [ds] grids weights = rgrd(weights) elif target == 'weights': raise NotImplementedError('The '+target+' variable is not *yet* supported as a target for regridding. Please choose "ds" for now.') # This is because of lack of downstream capability right now... if not silent: print('regridding data to weights grid...') # Create regridder to the [weights] coordinates rgrd = xe.Regridder(ds,weights,get_options()['rgrd_alg']) # Regrid [ds] to [weights] grid ds = rgrd(ds) else: raise KeyError(target+' is not a supported target for regridding. Choose "weights" or "ds".') else: # Make sure the values are actually identical, not just "close", # otherwise assigning may not work below weights['lat'] = ds['lat'].values weights['lon'] = ds['lon'].values # Add weights to ds ds['weights'] = weights # Add warnings if np.isnan(ds['weights']).all(): warnings.warn('All inputted `weights` are np.nan after regridding.') if (ds['weights'] == 0).all(): warnings.warn('All inputted `weights` are 0 after regridding.') # Return return ds,weights_info
[docs] def make_multipoly(pts): ''' Split pixel overlapping the antimeridian into MultiPolygon with each sub-Polygon in its own hemisphere NB: to be used in :py:meth:`create_raster_polygons()` ''' pts = np.array(pts) # Get which pixels are east of the antimeridian neg = pts[:,0]<0 # Get point order for split up pixel pts = [ # west of antimeridian np.vstack([pts[~neg],np.array([[180,x[1]] for x in pts[~neg][::-1]])]), # east of antimeridian np.vstack([np.array([[-180,x[1]] for x in pts[neg][::-1]]),pts[neg]])] pts = [[tuple(x) for x in pt] for pt in pts] # Create multipolygon return MultiPolygon([Polygon(pt) for pt in pts])
[docs] def create_raster_polygons(ds, mask=None,subset_bbox=None, weights=None,weights_target='ds', wrap_around_thresh=5, silent=None): """ Create polygons for each pixel in a raster Note: 'lat_bnds' and 'lon_bnds' can be created through the :func:`xagg.aux.get_bnds` function if they are not already included in the input raster file. Note: Currently this code only supports regular rectangular grids (so where every pixel side is a straight line in lat/lon space). Future versions may include support for irregular grids. Parameters --------------- ds : :class:`xarray.Dataset` an xarray dataset with the variables 'lat_bnds' and 'lon_bnds', which are both lat/lon x 2 arrays giving the min and max values of lat and lon for each pixel given by lat/lon subset_bbox : :class:`geopandas.GeoDataFrame`, optional, default = ``None`` if a :class:`geopandas.GeoDataFrame` is entered, the bounding box around the geometries in the `gdf` are used to mask the grid, to reduce the number of pixel polygons created Returns --------------- pix_agg: :py:class:`dict` a dictionary containing: - ``'gdf_pixels'`` a :class:`geopandas.GeoDataFrame` containing a 'geometry' giving the pixel boundaries for each 'lat' / 'lon' pair - ``'source_grid'`` a dictionary containing the original lat and lon inputs under the keys "lat" and "lon" (just the :class:`xarray.DataArray` of those variables in the input `ds`) """ if silent is None: silent = get_options()['silent'] # Standardize inputs (including lat/lon order) ds = fix_ds(ds) ds = get_bnds(ds,silent=silent) #breakpoint() # Process weights ds,winf = process_weights(ds,weights,target=weights_target) # Subset by shapefile bounding box, if desired if subset_bbox is not None: if type(subset_bbox) == gpd.geodataframe.GeoDataFrame: # Using the biggest difference in lat/lon to make sure that the pixels are subset # in a way that the bounding box is fully filled out # bbox_thresh = np.max([ds.lat.diff('lat').max(),ds.lon.diff('lon').max()])+0.1 grid_dist = np.max([ds.lat.diff('lat').max(),ds.lon.diff('lon').max()]) # first get the max grid size bbox_thresh = grid_dist*2. # then set threshold to twice grid size, avoids huge subsets for high res grids ds = ds.sel(lon=slice(subset_bbox.total_bounds[0]-bbox_thresh,subset_bbox.total_bounds[2]+bbox_thresh), lat=slice(subset_bbox.total_bounds[1]-bbox_thresh,subset_bbox.total_bounds[3]+bbox_thresh)) else: warnings.warn('[subset_bbox] is not a geodataframe; no mask by polygon bounding box used.') # Mask if mask is not None: raise NotImplementedError('Masking by grid not yet supported. Stay tuned...') # Create dataset which has a lat/lon bound value for each individual pixel, # broadcasted out over each lat/lon pair (ds_bnds,) = (xr.broadcast(ds.isel({d:0 for d in [k for k in ds.sizes if k not in ['lat','lon','bnds']]}). drop_vars([v for v in ds.keys() if v not in ['lat_bnds','lon_bnds']]))) # Stack so it's just pixels and bounds ds_bnds = ds_bnds.stack(loc=('lat','lon')) # In order: # (lon0,lat0),(lon0,lat1),(lon1,lat1),(lon1,lat1), but as a single array; to be # put in the right format for Polygon in the next step pix_poly_coords = np.transpose(np.vstack([ds_bnds.lon_bnds.isel(bnds=0).values,ds_bnds.lat_bnds.isel(bnds=0).values, ds_bnds.lon_bnds.isel(bnds=0).values,ds_bnds.lat_bnds.isel(bnds=1).values, ds_bnds.lon_bnds.isel(bnds=1).values,ds_bnds.lat_bnds.isel(bnds=1).values, ds_bnds.lon_bnds.isel(bnds=1).values,ds_bnds.lat_bnds.isel(bnds=0).values])) # Reshape so each location has a 4 x 2 (vertex vs coordinate) array, # and convert each of those vertices to tuples. This means every element # of pix_poly_coords is the input to shapely.geometry.Polygon of one pixel pix_poly_coords = tuple(map(tuple,np.reshape(pix_poly_coords,(np.shape(pix_poly_coords)[0],4,2)))) # Figure out if any pixels cross the antimeridian; we'll have to deal with # those separately... Identify them by seeing which pixels have longitudes # that are within the `wrap_around_thresh` (by default 5 degs) of both # +180 and -180 degrees cross_antimeridian_idxs = ((np.abs(np.array(pix_poly_coords)[:,:,0] - -180) < wrap_around_thresh).any(axis=1) & (np.abs(np.array(pix_poly_coords)[:,:,0] - 180) < wrap_around_thresh).any(axis=1)) # Create empty geodataframe gdf_pixels = gpd.GeoDataFrame(pd.DataFrame({v:[None]*ds_bnds.sizes['loc'] for v in ['lat','lon','geometry']}), geometry='geometry') if weights is not None: # Stack weights so they are linearly indexed like the ds (and fill # NAs with 0s) weights = ds.weights.stack(loc=('lat','lon')).fillna(0) # Preallocate weights column gdf_pixels['weights'] = [None]*ds_bnds.sizes['loc'] # Now populate with a polygon for every pixel poly_dict = {'poly_pts': pix_poly_coords} df_poly = pd.DataFrame(poly_dict, columns=['poly_pts']) df_poly['poly'] = df_poly.poly_pts.apply(lambda pts: Polygon(pts)) # Make MultiPolygons for pixels crossing the antimeridian df_poly.loc[np.where(cross_antimeridian_idxs)[0],'poly'] = df_poly.poly_pts.iloc[np.where(cross_antimeridian_idxs)[0]].apply(lambda pts: make_multipoly(pts)) # Set geometry gdf_pixels['geometry']=df_poly['poly'] # Add lat/lons of pixels for identification gdf_pixels['lat']=ds_bnds.lat.values gdf_pixels['lon']=ds_bnds.lon.values if weights is not None: gdf_pixels['weights'] = weights.values # Add a "pixel idx" to make indexing better later gdf_pixels['pix_idx'] = gdf_pixels.index.values # Add crs (normal lat/lon onto WGS84) gdf_pixels = gdf_pixels.set_crs("EPSG:4326") #gdf_pixels.crs = {'init':'EPSG:4326'} # Save the source grid for further reference source_grid = {'lat':ds_bnds.lat,'lon':ds_bnds.lon} pix_agg = {'gdf_pixels':gdf_pixels,'source_grid':source_grid} # Return the created geodataframe return pix_agg
[docs] def get_pixel_overlaps(gdf_in,pix_agg,impl=None): """ Get, for each polygon, the pixels that overlap and their area of overlap Finds, for each polygon in `gdf_in`, which pixels intersect it, and by how much. Note: Uses EASE-Grid 2.0 on the WGS84 datum to calculate relative areas (see https://nsidc.org/data/ease) Parameters --------------- gdf_in : :class:`geopandas.GeoDataFrane` a :class:`geopandas.GeoDataFrame` giving the polygons over which the variables should be aggregated. Can be just a read shapefile (with the added column of "poly_idx", which is just the index as a column). pix_agg : :py:class:`dict` the output of :func:`xagg.core.create_raster_polygons`; a dict containing: - ``'gdf_pixels'`` a :class:`geopandas.GeoDataFrame` giving for each row the columns "lat" and "lon" (with coordinates) and a polygon giving the boundary of the pixel given by lat/lon - ``'source_grid'`` ``[da.lat,da.lon]`` of the grid used to create the pixel polygons impl : :py:class:`str` (set by :py:meth:`xa.set_options`) whether the output will be used for the dot-product aggregation calculation (needs a slightly different format), either of: - ``'for_loop'`` (default behavior) - ``'dot_product'`` (to set up for ``impl='dot_product'`` in ``xagg.core.aggregate``) Returns --------------- wm_out: :py:class:`dict` A dictionary containing: - ``'agg'``: a dataframe containing all the fields of ``gdf_in`` (except geometry) and the additional columns: - ``coords``: the lat/lon coordiates of all pixels that overlap the polygon of that row - ``pix_idxs``: the linear indices of those pixels within the ``gdf_pixels`` grid - ``rel_area``: the relative area of each of the overlaps between the pixels and the polygon (summing to 1 - e.g. if the polygon is exactly the size and location of two pixels, their rel_areas would be 0.5 each) - ``'source_grid'`` a dictionary with keys 'lat' and 'lon' giving the original lat/lon grid whose overlaps with the polygons was calculated - ``'geometry'`` just the polygons from ``gdf_in`` """ if impl is None: impl = get_options()['impl'] # Add an index for each polygon as a column to make indexing easier #if 'poly_idx' not in gdf_in.columns: # gdf_in['poly_idx'] = gdf_in.index.values gdf_in['poly_idx'] = np.arange(0,len(gdf_in)) # Match up CRSes pix_agg['gdf_pixels'] = pix_agg['gdf_pixels'].to_crs(gdf_in.crs) # Choose a common crs for both, just to minimize the chance # of geographic shenanigans # (using the EASE grid https://nsidc.org/data/ease) if np.all(gdf_in.total_bounds[[1,3]]>0): # If min/max lat are both in NH, use North grid #epsg_set = {'init':'EPSG:6931'} (change to below bc of depreciation of {'init':...} format in geopandas) epsg_set = 'EPSG:6931' elif np.all(gdf_in.total_bounds[[1,3]]<0): # If min/max lat are both in SH, use South grid #epsg_set = {'init':'EPSG:6932'} epsg_set = 'EPSG:6932' else: # Otherwise, use the global/temperate grid #epsg_set = {'init':'EPSG:6933'} epsg_set = 'EPSG:6933' # Get GeoDataFrame of the overlaps between every pixel and the polygons with warnings.catch_warnings(): # Filter UserWarnings that flag when overlapping would result in # lines, or point overlaps (but we only care about 3D overlaps, so # keep_geom_type=True is the right call, and we don't need the # warning) warnings.filterwarnings('ignore',category=UserWarning) overlaps = gpd.overlay(gdf_in.to_crs(epsg_set), pix_agg['gdf_pixels'].to_crs(epsg_set), how='intersection') if overlaps.empty: raise NoOverlapError('No `ds` grid cells overlapped with any polygon in `gdf_in`. Check the input `ds` and `gdf_in`.') else: if impl=='dot_product': # Get relative area of each pixel overlaps = overlaps.groupby('poly_idx',group_keys=False)[overlaps.columns.tolist()].apply(find_rel_area,include_groups=False) overlaps['lat'] = overlaps['lat'].astype(float) overlaps['lon'] = overlaps['lon'].astype(float) # Now, group by poly_idx (each polygon in the shapefile) ov_groups = overlaps.groupby('poly_idx') overlap_info = ov_groups.agg(list_or_first) overlap_info = overlap_info.rename(columns={'pix_idx': 'pix_idxs'}) elif impl=='for_loop': # Now, group by poly_idx (each polygon in the shapefile) ov_groups = overlaps.groupby('poly_idx') # Calculate relative area of each overlap (so how much of the total # area of each polygon is taken up by each pixel), the pixels # corresponding to those areas, and the lat/lon coordinates of # those pixels overlap_info = ov_groups.agg(rel_area=pd.NamedAgg(column='geometry',aggfunc=lambda ds: [normalize(ds.area)]), pix_idxs=pd.NamedAgg(column='pix_idx',aggfunc=lambda ds: [idx for idx in ds]), lat=pd.NamedAgg(column='lat',aggfunc=lambda ds: [x for x in ds]), lon=pd.NamedAgg(column='lon',aggfunc=lambda ds: [x for x in ds])) # Zip lat, lon columns into a list of (lat,lon) coordinates # (separate from above because as of 12/20, named aggs with # multiple columns is still an open issue in the pandas github) overlap_info['coords'] = overlap_info.apply(lambda row: list(zip(row['lat'],row['lon'])),axis=1) overlap_info = overlap_info.drop(columns=['lat','lon']) # Reset index to make poly_idx a column for merging with gdf_in overlap_info = overlap_info.reset_index() if impl=='dot_product': # Merge in pixel overlaps to the input polygon geodataframe overlap_columns = ['pix_idxs', 'rel_area', 'coords', 'poly_idx'] gdf_in = pd.merge(gdf_in, overlap_info[overlap_columns],'outer', on='poly_idx') # make the weight grid an xarray dataset for later dot product idx_cols = ['lat', 'lon', 'poly_idx'] overlap_da = overlaps.set_index(idx_cols)['rel_area'].to_xarray() overlap_da = overlap_da.stack(loc=['lat', 'lon']) overlap_da = overlap_da.fillna(0) wm_out = weightmap(agg=gdf_in.drop('geometry', axis=1), source_grid=pix_agg['source_grid'], geometry=gdf_in.geometry, overlap_da = overlap_da) elif impl=='for_loop': # Merge in pixel overlaps to the input polygon geodataframe gdf_in = pd.merge(gdf_in,overlap_info,'outer') # Drop 'geometry' eventually, just for size/clarity wm_out = weightmap(agg=gdf_in.drop('geometry',axis=1), source_grid=pix_agg['source_grid'], geometry=gdf_in.geometry) if 'weights' in pix_agg['gdf_pixels'].columns: wm_out.weights = pix_agg['gdf_pixels'].weights return wm_out
#--------- NUMBA HELPER FUNCTIONS --------- if _has_numba: @njit def numba_indexer(x,idxs): vals = np.full(len(idxs),np.nan) for i in range(len(idxs)): vals[i] = x[int(idxs[i])] return vals @njit def numba_aggregate(data,idxs,rel_area,add_weights=None): if add_weights is None: add_weights = np.ones(len(data)) # Since idxs, rel_area are stored in a dataarray, # in the weightmap, they're padded with nans. # Remove those. idxs = idxs[~np.isnan(idxs)] rel_area = rel_area[~np.isnan(rel_area)] # Get the indices of the data needed data = numba_indexer(data,idxs) add_weights = numba_indexer(add_weights,idxs) # Normalize area and additional weights to a single # weights vector weights = rel_area * add_weights # Make sure to do aggregation only on non-nan pixels nans = np.isnan(data) data = data[~nans] weights = weights[~nans] if len(data) == 0: agg = np.nan else: agg = np.sum(data*weights)/np.sum(weights) return agg #--------------------------------------------
[docs] def aggregate(ds,wm,impl=None,silent=None): """ Aggregate raster variable(s) to polygon(s) Aggregates (N-D) raster variables in `ds` to the polygons in `gfd_out` - in other words, gives the weighted average of the values in [ds] based on each pixel's relative area overlap with the polygons. The values will be additionally weighted if a `weight` was inputted into :func:`xagg.core.create_raster_polygons` The code checks whether the input lat/lon grid in `ds` is equivalent to the linearly indexed grid in `wm`, or if it can be cropped to that grid. Parameters --------------- ds : :py:class:`xarray.Dataset` an :py:class:`xarray.Dataset` containing one or more variables with dimensions lat, lon (and possibly more). The dataset's geographic grid has to include the lat/lon coordinates used in determining the pixel overlaps in :func:`xagg.core.get_pixel_overlaps` (and saved in ``wm['source_grid']``) wm : :py:class:`xagg.classes.weightmap` the output to :func:`xagg.core.get_pixel_overlaps`; a :py:class:`xagg.classes.weightmap` object containing - ``['agg']`` a dataframe, with one row per polygon, and the columns `pix_idxs` and `rel_area`, giving the linear indices and the relative area of each pixel over the polygon, respectively - ``['source_grid']`` the lat/lon grid on which the aggregating parameters were calculated (and on which the linear indices are based) impl : :py:class:str (def: ``'for_loop'``) (set by :py:meth:`xa.set_options`) which aggregation calculation method to use, either of: - ``'for_loop'`` default behavior, aggregation loops through all polygons in a for loop, requires less memory - ``'dot_product'`` aggregation is calculated using a dot product, requires much more memory (due to broadcasting of variables) but may be faster in certain circumstances - ``'numba'`` aggregation is calculated using a simplified function that uses numba's just-in-time compiler. Can be substantially faster silent : :py:class:`bool`, default = `False` (set by :py:meth:`xa.set_options`) if True, then no status updates are printed to std out Returns --------------- agg_out : :class:`xagg.classes.aggregated` an :class:`xagg.classes.aggregated` object with the aggregated variables """ if impl is None: impl = get_options()['impl'] if silent is None: silent = get_options()['silent'] # Triggers if/once a partial nan warning is called, to avoid redoing the # warning for every variable, every polygon _warn_trigger_partialnan = True # Make sure pixel_overlaps was correctly run if using dot product if (impl=='dot_product') and (wm.overlap_da is None): raise ValueError("no 'overlap_da' was found in the `wm` input - since you're using the dot product implementation, "+ "make sure to run `pixel_overlaps()` with `impl='dot_product'` to avoid this error.") # Turn into dataset if dataarray if type(ds)==xr.core.dataarray.DataArray: if ds.name is None: warnings.warn('An unnamed xr.DataArray was inputted instead of a xr.Dataset; the output variable will be "var"') ds = ds.to_dataset(name='var') else: ds = ds.to_dataset() # Run ds through fix_ds (to fix lat/lon names, lon coords) ds = fix_ds(ds) # Stack ds = ds.stack(loc=('lat','lon')) # Adjust grid of [ds] if necessary to match ds = subset_find(ds,wm.source_grid,silent=silent) # Set weights; or replace with ones if no additional weight information #if wm.weights != 'nowghts': if type(wm.weights) == pd.core.series.Series: weights = np.array([float(k) for k in wm.weights]) else: if wm.weights != 'nowghts': warnings.warn('wm.weights is: \n '+print(wm.weights)+ ', \n which is not a supported weight vector (in a pandas series) '+ 'or "nowghts" as a string. Assuming no weights are included...') if impl=='dot_product': weights = np.ones((len(wm.overlap_da['loc']))) elif (impl=='for_loop') or (impl=='numba'): weights = np.ones((len(wm.source_grid['lat']))) if impl=='dot_product': data_dict = dict() for var in ds: # Process for every variable that has locational information, but isn't a # bound variable if ('bnds' not in ds[var].sizes) & ('loc' in ds[var].sizes): if not silent: print('aggregating '+var+'...') # select just data for which we have overlaps var_array = ds[var].sel(loc=wm.overlap_da['loc']) # Get the dimensions of the variable that aren't "loc" (location) other_dims = [k for k in np.atleast_1d(var_array.sizes) if k != 'loc'] # Check first if any nans are "complete" (meaning that a pixel # either has values for each step, or nans for each step - if # there are random nans along non-location dimensions for the # same grid cell, throw a warning) _warn_trigger_partialnan=_warn_ifsomenans(var_array,var,other_dims, _warn_trigger_partialnan) # multiply percent-overlaps by user-supplied weights weights_and_overlaps = wm.overlap_da * weights # fill any nans in the gridded data to zero and change the corresponding # weights to zero so that they will not affect the aggregated value var_array_filled = var_array.fillna(0) weights_and_overlaps = weights_and_overlaps.where(var_array.notnull(), 0) # the weights now may add up to less than one because of nans or # because of the user-supplied weights. here we normalize the # weights so they add up to 1 and fill any nan's with 0 so they # won't be counted normed_weights = weights_and_overlaps/weights_and_overlaps.sum(dim = "loc", skipna=True) normed_weights = normed_weights.fillna(0) # finally we do the dot product to get the weighted averages aggregated_array = normed_weights.dot(var_array_filled, dim='loc') # if the original gridded values were all nan, make the final # aggregation nan if np.isnan(var_array.values).all(): aggregated_array = aggregated_array * np.nan data_dict[var] = aggregated_array ds_combined = xr.Dataset(data_dict) print('ds_combined:') print(ds_combined) for var in ds: if ('bnds' not in ds[var].sizes) and ('loc' in ds[var].sizes): # convert to list of arrays - NOT SURE THIS IS THE RIGHT THING TO # DO, JUST TRYING TO MATCH ORIGINAL FORMAT wm.agg[var]=pd.Series([[[ds_combined[var].isel(poly_idx=i).values]] for i in range(len(ds_combined.poly_idx))]) # Put in class format agg_out = aggregated(agg=wm.agg,source_grid=wm.source_grid, geometry=wm.geometry,ds_in=ds_combined,weights=wm.weights) elif impl=='for_loop': for var in ds: # Process for every variable that has locational information, but isn't a # bound variable if ('bnds' not in ds[var].sizes) and ('loc' in ds[var].sizes): if not silent: print('aggregating '+var+'...') # Create the column for the relevant variable wm.agg[var] = None # Get weighted average of variable based on pixel overlap + other weights for poly_idx in wm.agg.poly_idx: # Get average value of variable over the polygon; weighted by # how much of the pixel area is in the polygon, and by (optionally) # a separate gridded weight. This if statement checks # - if the weights output for this polygon is just "np.nan", which # indicates there's no overlap between the pixel grid and polygons # - [this has been pushed down a few lines] or, if all the pixels in # the grid have just nan values for this variable # in both cases; the "aggregated variable" is just a vector of nans. if not np.isnan(wm.agg.iloc[poly_idx,:].pix_idxs).all(): # Get the dimensions of the variable that aren't "loc" (location) other_dims = [k for k in np.atleast_1d(ds[var].sizes) if k != 'loc'] # Check first if any nans are "complete" (meaning that a pixel # either has values for each step, or nans for each step - if # there are random nans along non-location dimensions for the # same grid cell, throw a warning) _warn_trigger_partialnan=_warn_ifsomenans(ds,var,other_dims, _warn_trigger_partialnan) if not np.isnan(ds[var].isel(loc=wm.agg.iloc[poly_idx,:].pix_idxs)).all(): # Get relative areas for the pixels overlapping with this Polygon tmp_areas = np.atleast_1d(np.squeeze(wm.agg.iloc[poly_idx,:].rel_area)) # Replace overlapping pixel areas with nans if the corresponding pixel # is only composed of nans tmp_areas[np.array(np.isnan(ds[var].isel(loc=wm.agg.iloc[poly_idx,:].pix_idxs)).all(other_dims).values)] = np.nan # Calculate the normalized area+weight of each pixel (taking into account # nans) normed_areaweights = normalize(tmp_areas*weights[wm.agg.iloc[poly_idx,:].pix_idxs],drop_na=True) # Take the weighted average of all the pixel values to calculate the # aggregated value for the shapefile wm.agg.loc[poly_idx,var] = [[((ds[var].isel(loc=wm.agg.iloc[poly_idx,:].pix_idxs)* normed_areaweights). sum('loc')).values]] else: wm.agg.loc[poly_idx,var] = [[(ds[var].isel(loc=0)*np.nan).values]] else: wm.agg.loc[poly_idx,var] = [[(ds[var].isel(loc=0)*np.nan).values]] # Put in class format agg_out = aggregated(agg=wm.agg,source_grid=wm.source_grid, geometry=wm.geometry,ds_in=ds,weights=wm.weights) elif impl=='numba': if not _has_numba: raise ImportError("impl == 'numba' requires `numba`, which is not installed.") # Extract pixel idxs, area overlap for each polygon from weightmap object pix_idxs = [np.atleast_1d(pix) for pix in wm.agg.pix_idxs.values] rel_area = [np.atleast_1d(area[0]) if area is not np.nan else np.atleast_1d(np.nan) for area in wm.agg.rel_area.values] max_pix = max(len(lst) for lst in pix_idxs) # Max number of pixels per polygon, to pad to # Preallocate arrays filled with NaNs n_polys = len(wm.agg) pix_idxs_padded = np.full((n_polys, max_pix), np.nan) rel_area_padded = np.full((n_polys, max_pix), np.nan) # Fill the arrays, padding to max number of pixels with nans for i, (pix_idxs, rel_area) in enumerate(zip(pix_idxs, rel_area)): pix_idxs_padded[i, :len(pix_idxs)] = pix_idxs rel_area_padded[i, :len(rel_area)] = rel_area # Put into dataset idxs = xr.Dataset({'idxs': (['poly_idx', 'idx'], pix_idxs_padded), 'rel_area': (['poly_idx', 'idx'], rel_area_padded)}, coords={'poly_idx': wm.agg.poly_idx.values, 'idx': np.arange(max_pix)} ) # Make into dask array idxs = idxs.chunk({'poly_idx':'auto','idx':-1}) # Make weights data array, to put into apply_ufunc below weights = xr.DataArray(weights, dims = ['loc'], coords = {'loc':ds.loc}) for var in ds: if ('bnds' not in ds[var].sizes) and ('loc' in ds[var].sizes): if not silent: print('aggregating '+var+'...') # Get the dimensions of the variable that aren't "loc" (location) other_dims = [k for k in np.atleast_1d(ds[var].sizes) if k != 'loc'] # Check first if any nans are "complete" (meaning that a pixel # either has values for each step, or nans for each step - if # there are random nans along non-location dimensions for the # same grid cell, throw a warning) _warn_trigger_partialnan=_warn_ifsomenans(ds,var,other_dims, _warn_trigger_partialnan) # Aggregate agg = xr.apply_ufunc(numba_aggregate, ds[var], idxs.idxs, idxs.rel_area, weights, input_core_dims = [['loc'],['idx'],['idx'],['loc']], output_core_dims = [[]], vectorize=True, dask='parallelized', output_dtypes = [float]).to_dataset(name=var) # Trigger computation, otherwise it separately triggers computation # for every poly_idx below, with tremendous overhead agg = agg.compute() # Put in `aggregated` format (TODO: should reform this to remove some of the # nested lists that are here for some reason) wm.agg[var] = [[[agg[var].sel(poly_idx = idx).values]] for idx in wm.agg.index] # Put in class format agg_out = aggregated(agg=wm.agg,source_grid=wm.source_grid, geometry=wm.geometry,ds_in=ds,weights=wm.weights) # Return if not silent: print('all variables aggregated to polygons!') return agg_out