import xarray as xr
import pandas as pd
import geopandas as gpd
import numpy as np
import os
import re
import warnings
import shutil
from . auxfuncs import (fix_ds,get_bnds,subset_find)
[docs]
def export_weightmap(wm_obj,fn,overwrite=False):
""" Save a copy of the weightmap, to avoid recalculating it
"""
###### NEED TO SETUP HD5 DEPENDENCY? THROUGH OPTIONAL DEPENDENCY MAYBE?
warnings.warn('export_weightmap() is still an experimental feature. use with care.')
if (not overwrite) and (os.path.exists(fn)):
raise FileExistsError(fn+'/ already exists. Either change the [fn] or set overwrite=True.')
try:
###### Save geometry
# This also creates a folder, into which we can add the rest
# of the files
wm_obj.geometry.to_file(fn)
####### Save agg
# Turn into a dataframe from a geodataframe; only dataframes
# can be converted to HD5 files (and there's no geographic
# information here that's needed; can all be saved in the
# geographic save below)
df_out = pd.DataFrame(wm_obj.agg)
with warnings.catch_warnings():
# Catch performance warning about pickling
warnings.filterwarnings('ignore')
df_out.to_hdf(fn+'/'+re.split('/',fn)[-1]+'.h5','wm')
###### Save source grid
for k in wm_obj.source_grid:
# Adding v to variable (i.e., 'latv', 'lonv') needed
# because of the multi-index being used here
source_grid_tmp = wm_obj.source_grid[k].reset_index('loc').to_dataset(name=k+'v')
# Replicate dataset, to avoid xarray bug stemming from
# Pandas MultiIndex testing...
source_grid_tmp = source_grid_tmp.reset_coords()
source_grid_tmp = xr.Dataset({k:([d for d in v.sizes],v.values) for k,v in source_grid_tmp.items()})
source_grid_tmp = source_grid_tmp.set_coords([d for d in source_grid_tmp if d not in ['loc', k+'v']])
# Save
source_grid_tmp.to_netcdf(fn+'/'+re.split('/',fn)[-1]+'_'+k+'.nc')
###### Save weights
# (If weights == 'nowghts', then no file to load)
if (type(wm_obj.weights) is not str) or (wm_obj.weights != 'nowghts'):
# Setting astype(object) to make sure integral weights
# don't change the general type of the frame. This may
# only affect the testing routines, but setting this here
# to be explicit
wm_obj.weights.astype(object).to_csv(fn+'/'+re.split('/',fn)[-1]+'_weights.csv')
except RuntimeError as error:
print(error)
# Remove files that have already been generated, to make
# sure no flawed files are floating around.
shutil.rmtree(fn)
raise Exception('Error while saving. The directory '+fn+' has been removed.')
[docs]
def prep_for_nc(agg_obj,loc_dim='poly_idx'):
""" Preps aggregated data for output as a netcdf
Concretely, aggregated data is placed in a new xarray dataset
with dimensions of location (the different polygons in gdf_out)
and any other dimension(s) in the original input raster data.
All fields from the input polygons are kept as variables with
dimension of location.
Parameters
---------------
agg_obj : :class:`xagg.classes.aggregated`
loc_dim : :py:class:`str`, optional
the name of the location dimension; by definition
'poly_idx'. Values of that dimension are currently
only an integer index (with further information given
by the field variables). Future versions may allow,
if loc_dim is set to the name of a field in the input
polygons, to replace the dimension with the values of
that field.
Returns:
---------------
ds: :class:`xarray.Dataset`
an :class:`xarray.Dataset` containing the aggregated variables in addition
to the original fields from the location polygons. Dimensions are
a location dimension (counting down the polygons - this is the
dimension of all the field contents) and any other non-location
dimensions contained in the variables before being aggregated
"""
# To output as netcdf, first put data back into an xarray dataset
# Create xarray dataset with the aggregation polygons (poly_idx)
# there.
ds_out = xr.Dataset(coords={'poly_idx':(['poly_idx'],agg_obj.agg.poly_idx.values)})
# Add other polygon attributes
for var in [c for c in agg_obj.agg.columns if c not in ['poly_idx','rel_area','pix_idxs','coords']]:
if var not in agg_obj.ds_in.var():
# For auxiliary variables (from the shapefile), just copy them wholesale into the dataset
ds_out[var] = xr.DataArray(data=agg_obj.agg[var],coords=[agg_obj.agg.poly_idx],dims=['poly_idx'])
else:
# For data variables (from the input grid), create empty array
ds_out[var] = xr.DataArray(data=np.zeros((len(agg_obj.agg),
*[agg_obj.ds_in[var].sizes[k] for k in agg_obj.ds_in[var].sizes.keys() if k not in ['lat','lon','loc','poly_idx']]))*np.nan,
dims=['poly_idx',*[k for k in agg_obj.ds_in[var].sizes.keys() if k not in ['lat','lon','loc','poly_idx']]],
coords=[[k for k in agg_obj.agg.poly_idx],*[agg_obj.ds_in[var][k].values for k in agg_obj.ds_in[var].sizes.keys() if k not in ['lat','lon','loc','poly_idx']]])
# Now insert aggregated values
for poly_idx in agg_obj.agg.poly_idx:
ds_out[var].loc[{'poly_idx':poly_idx}] = np.squeeze(agg_obj.agg.loc[poly_idx,var])
# Add non-geographic coordinates for the variables to be aggregated
for crd in [k for k in agg_obj.ds_in.sizes.keys() if (k not in ['lat','lon','loc','bnds'])]:
ds_out[crd] = xr.DataArray(dims=[crd],
data=agg_obj.ds_in[crd].values,
coords=[agg_obj.ds_in[crd].values])
# Remove "name" variable created in get_pixel_overlaps
if 'name' in ds_out:
if ((ds_out['name'].dims == ('poly_idx',)) and
(len(np.unique(ds_out['name'])) == 1) and
np.unique(ds_out['name'])[0] in ds_out):
ds_out = ds_out.drop_vars(['name'])
# Rename poly_idx if desired
if loc_dim != 'poly_idx':
ds_out = ds_out.rename({'poly_idx':loc_dim})
# Return ds_out
return ds_out
[docs]
def prep_for_csv(agg_obj,add_geom=False):
""" Preps aggregated data for output as a csv
Concretely, aggregated data is placed in a new pandas dataframe
and expanded **wide** - each aggregated variable is placed in new
columns; one column per coordinate in each dimension that isn't
the location (poolygon). So, for example, a lat x lon x time
variable "tas", aggregated to location x time, would be reshaped
long to columns "tas0", "tas1", "tas2",... for timestep 0, 1, etc.
For data **long**, use ``agg_obj.to_dataframe().to_csv()`` instead.
Note:
Currently no support for variables with more than one extra dimension
beyond their location dimensions. Potential options: a multi-index
column name, so [var]0-0, [var]0-1, etc...
Parameters
----------------
agg_obj : :class:`xagg.classes.aggregated`
the output from :func:`aggregate`
add_geom : bool, default `False`
if `True`, adds the geometry from the original shapefile/
geodataframe back into the prepped output; this is for
the `.to_geodataframe()` conversion option.
Returns
----------------
df : :class:`pandas.DataFrame`
a pandas dataframe containing all the fields from the original
location polygons + columns containing the values of the aggregated
variables at each location. This can then easily be exported as a
csv directly (using ``df.to_csv()``) or to shapefiles by first turning into
a geodataframe.
"""
# Test to make sure there's only one non-location dimension
var_dims = {var:[d for d in agg_obj.ds_in[var].sizes if d != 'loc'] for var in agg_obj.ds_in}
var_ndims = {var:len(dims) for var,dims in var_dims.items()}
if np.max([n for d,n in var_ndims.items()]) > 1:
raise NotImplementedError('The `agg` object has variables with more than 1 non-location dimension; '+
'agg.to_csv() and agg.to_geodataframe() return wide arrays, but the code can not yet create wide arrays spanning data with multiple `wide` dimensions. '+
'Try agg.to_dataframe() instead. (the offending variables with their non-location dimensions are '+
str({var:dims for var,dims in var_dims.items() if var_ndims[var]>1})+')')
# For output into csv, work with existing geopandas data frame
csv_out = agg_obj.agg.drop(columns=['rel_area','pix_idxs','coords','poly_idx'])
# Now expand the aggregated variable into multiple columns
for var in [c for c in agg_obj.agg.columns if ((c not in ['poly_idx','rel_area','pix_idxs','coords']) & (c in agg_obj.ds_in.var()))]:
# NOT YET IMPLEMENTED: dynamic column naming - so if it recognizes
# it as a date, then instead of doing var0, var1, var2,... it does
# varYYYYMMDD etc.
# These are the coordinates of the variable in the original raster
dimsteps = [agg_obj.ds_in[var][d].values for d in agg_obj.ds_in[var].sizes.keys() if d not in ['lat','lon','loc']]
# ALSO SHOULD check to see if the variables are multi-D - if they are
# there are two options:
# - a multi-index column title (var0-0, var0-1)
# - or an error saying csv output is not supported for this
# Reshape the variable wide and name the columns [var]0, [var]1,...
if len(dimsteps) == 0:
# (in this case, all it does is move from one list per row to
# one value per row)
#expanded_var = (pd.DataFrame(pd.DataFrame(csv_out[var].to_list())[0].to_list(),
# columns=[var]))
expanded_var = pd.DataFrame(csv_out[var].apply(np.squeeze).to_list(),
columns=[var])
else:
#expanded_var = (pd.DataFrame(pd.DataFrame(csv_out[var].to_list())[0].to_list(),
# columns=[var+str(idx) for idx in np.arange(0,len(csv_out[var][0][0]))]))
expanded_var = pd.DataFrame(csv_out[var].apply(np.squeeze).to_list(),
columns = [var+str(idx) for idx in range(len(csv_out[var].apply(np.squeeze))+1)])
# Append to existing series
csv_out = pd.concat([csv_out.drop(columns=(var)),
expanded_var],
axis=1)
del expanded_var
if add_geom:
# Return the geometry from the original geopandas.GeoDataFrame
csv_out['geometry'] = agg_obj.geometry
csv_out = csv_out.set_geometry('geometry')
# Return
return csv_out
[docs]
def output_data(agg_obj,output_format,output_fn,loc_dim='poly_idx',silent=False):
""" Wrapper for `prep_for_*` functions
Parameters
---------------
agg_obj : :class:`xagg.classes.aggregated`
object to be exported
output_format : :py:class:`str`
'netcdf', 'csv', or 'shp'
output_fn: :py:class:`str`
the output filename
loc_dim : :py:class:`str`, optional. default = ``'poly_idx'``
the name of the
dimension with location indices; used
by :func:`xagg.export.prep_for_nc` for nc,
csv output
Returns
---------------
the variable that gets saved, so depending on the `output_format`:
- "netcdf": the :class:`xarray.Dataset` on which ``.to_netcdf``
was called
- "csv": the :class:`pandas.Dataframe` on which ``.to_csv``
was called (uses the `xarray` `ds.to_dataframe()` functionality)
- "shp": the :class:`geopandas.GeoDataDrame` on which ``.to_file`` was called
"""
if output_format == 'netcdf':
ds_out = prep_for_nc(agg_obj,loc_dim=loc_dim)
# Save as netcdf
if not output_fn.endswith('.nc'):
output_fn = output_fn+'.nc'
ds_out.to_netcdf(output_fn)
if not silent:
print(output_fn+' saved!')
# Return
return ds_out
elif output_format == 'csv':
csv_out = prep_for_nc(agg_obj,loc_dim=loc_dim)
csv_out = csv_out.to_dataframe()
# Save as csv
if not output_fn.endswith('.csv'):
output_fn = output_fn+'.csv'
csv_out.to_csv(output_fn)
if not silent:
print(output_fn+' saved!')
# Return
return csv_out
elif output_format == 'shp':
# This uses the same processing as the to_csv option above, since
# shapefiles function similarly (each field can only have one value
# for each polygon, etc.)
csv_out = prep_for_csv(agg_obj)
# gdf_in.geometry should be replaced with gdf_out['geometry']
# which should be kept when gdf_out is created...
# Turn back into GeoDataFrame
shp_out = gpd.GeoDataFrame(csv_out,geometry=agg_obj.geometry)
# Export that GeoDataFrame to shapefile
if not output_fn.endswith('.shp'):
output_fn = output_fn+'.shp'
shp_out.to_file(output_fn)
if not silent:
print(output_fn+' saved!')
# Return
return shp_out
else:
raise KeyError(output_format+' is not a supported output format.')