Source code for xagg.export

import xarray as xr
import pandas as pd
import geopandas as gpd
import numpy as np
import os
import re
import warnings
import shutil

from . auxfuncs import (fix_ds,get_bnds,subset_find)



[docs]
def export_weightmap(wm_obj,fn,overwrite=False):
    """ Save a copy of the weightmap, to avoid recalculating it
    """
    ###### NEED TO SETUP HD5 DEPENDENCY? THROUGH OPTIONAL DEPENDENCY MAYBE? 
    warnings.warn('export_weightmap() is still an experimental feature. use with care.')


    if (not overwrite) and (os.path.exists(fn)):
        raise FileExistsError(fn+'/ already exists. Either change the [fn] or set overwrite=True.')

    try: 
        ###### Save geometry
        # This also creates a folder, into which we can add the rest
        # of the files
        wm_obj.geometry.to_file(fn)

        ####### Save agg 
        # Turn into a dataframe from a geodataframe; only dataframes
        # can be converted to HD5 files (and there's no geographic 
        # information here that's needed; can all be saved in the 
        # geographic save below)
        df_out = pd.DataFrame(wm_obj.agg)
        with warnings.catch_warnings():
            # Catch performance warning about pickling
            warnings.filterwarnings('ignore')
            df_out.to_hdf(fn+'/'+re.split('/',fn)[-1]+'.h5','wm')

        ###### Save source grid
        for k in wm_obj.source_grid:
            # Adding v to variable (i.e., 'latv', 'lonv') needed 
            # because of the multi-index being used here 
            source_grid_tmp = wm_obj.source_grid[k].reset_index('loc').to_dataset(name=k+'v')

            # Replicate dataset, to avoid xarray bug stemming from 
            # Pandas MultiIndex testing... 
            source_grid_tmp = source_grid_tmp.reset_coords()
            source_grid_tmp = xr.Dataset({k:([d for d in v.sizes],v.values) for k,v in source_grid_tmp.items()})
            source_grid_tmp = source_grid_tmp.set_coords([d for d in source_grid_tmp if d not in ['loc', k+'v']])

            # Save
            source_grid_tmp.to_netcdf(fn+'/'+re.split('/',fn)[-1]+'_'+k+'.nc')

        ###### Save weights
        # (If weights == 'nowghts', then no file to load)
        if (type(wm_obj.weights) is not str) or (wm_obj.weights != 'nowghts'):
            # Setting astype(object) to make sure integral weights
            # don't change the general type of the frame. This may
            # only affect the testing routines, but setting this here
            # to be explicit 
            wm_obj.weights.astype(object).to_csv(fn+'/'+re.split('/',fn)[-1]+'_weights.csv')
    except RuntimeError as error:
        print(error)
        # Remove files that have already been generated, to make 
        # sure no flawed files are floating around. 
        shutil.rmtree(fn)
        raise Exception('Error while saving. The directory '+fn+' has been removed.')




[docs]
def prep_for_nc(agg_obj,loc_dim='poly_idx'):
    """ Preps aggregated data for output as a netcdf
    
    Concretely, aggregated data is placed in a new xarray dataset 
    with dimensions of location (the different polygons in gdf_out)
    and any other dimension(s) in the original input raster data. 
    All fields from the input polygons are kept as variables with 
    dimension of location.
    
    Parameters
    ---------------
    agg_obj : :class:`xagg.classes.aggregated`

    loc_dim : :py:class:`str`, optional
        the name of the location dimension; by definition 
        'poly_idx'. Values of that dimension are currently
        only an integer index (with further information given
        by the field variables). Future versions may allow, 
        if loc_dim is set to the name of a field in the input
        polygons, to replace the dimension with the values of
        that field. 
               
    Returns:
    ---------------
    ds: :class:`xarray.Dataset`
        an :class:`xarray.Dataset` containing the aggregated variables in addition
        to the original fields from the location polygons. Dimensions are
        a location dimension (counting down the polygons - this is the 
        dimension of all the field contents) and any other non-location
        dimensions contained in the variables before being aggregated
    """
    # To output as netcdf, first put data back into an xarray dataset
    
    # Create xarray dataset with the aggregation polygons (poly_idx) 
    # there. 
    ds_out = xr.Dataset(coords={'poly_idx':(['poly_idx'],agg_obj.agg.poly_idx.values)})
    
    # Add other polygon attributes
    for var in [c for c in agg_obj.agg.columns if c not in ['poly_idx','rel_area','pix_idxs','coords']]:
        if var not in agg_obj.ds_in.var():
            # For auxiliary variables (from the shapefile), just copy them wholesale into the dataset
            ds_out[var] = xr.DataArray(data=agg_obj.agg[var],coords=[agg_obj.agg.poly_idx],dims=['poly_idx'])
        else:
            # For data variables (from the input grid), create empty array
            ds_out[var] = xr.DataArray(data=np.zeros((len(agg_obj.agg),
                                                         *[agg_obj.ds_in[var].sizes[k] for k in agg_obj.ds_in[var].sizes.keys() if k not in ['lat','lon','loc','poly_idx']]))*np.nan,
                                       dims=['poly_idx',*[k for k in agg_obj.ds_in[var].sizes.keys() if k not in ['lat','lon','loc','poly_idx']]],
                                       coords=[[k for k in agg_obj.agg.poly_idx],*[agg_obj.ds_in[var][k].values for k in agg_obj.ds_in[var].sizes.keys() if k not in ['lat','lon','loc','poly_idx']]])
            # Now insert aggregated values 
            for poly_idx in agg_obj.agg.poly_idx:
                ds_out[var].loc[{'poly_idx':poly_idx}] = np.squeeze(agg_obj.agg.loc[poly_idx,var])
    
    # Add non-geographic coordinates for the variables to be aggregated
    for crd in [k for k in agg_obj.ds_in.sizes.keys() if (k not in ['lat','lon','loc','bnds'])]:
        ds_out[crd] = xr.DataArray(dims=[crd],
                                   data=agg_obj.ds_in[crd].values,
                                   coords=[agg_obj.ds_in[crd].values])
        
    # Remove "name" variable created in get_pixel_overlaps
    if 'name' in ds_out:
        if ((ds_out['name'].dims == ('poly_idx',)) and 
            (len(np.unique(ds_out['name'])) == 1) and
            np.unique(ds_out['name'])[0] in ds_out):
            ds_out = ds_out.drop_vars(['name'])

    # Rename poly_idx if desired
    if loc_dim != 'poly_idx':
        ds_out = ds_out.rename({'poly_idx':loc_dim})
        
    # Return ds_out
    return ds_out




[docs]
def prep_for_csv(agg_obj,add_geom=False):
    """ Preps aggregated data for output as a csv
    
    Concretely, aggregated data is placed in a new pandas dataframe
    and expanded **wide** - each aggregated variable is placed in new 
    columns; one column per coordinate in each dimension that isn't
    the location (poolygon). So, for example, a lat x lon x time
    variable "tas", aggregated to location x time, would be reshaped 
    long to columns "tas0", "tas1", "tas2",... for timestep 0, 1, etc.

    For data **long**, use ``agg_obj.to_dataframe().to_csv()`` instead. 
    
    Note: 
    Currently no support for variables with more than one extra dimension
    beyond their location dimensions. Potential options: a multi-index
    column name, so [var]0-0, [var]0-1, etc...
    
    Parameters
    ----------------
    agg_obj : :class:`xagg.classes.aggregated`
        the output from :func:`aggregate`

    add_geom : bool, default `False`
        if `True`, adds the geometry from the original shapefile/
        geodataframe back into the prepped output; this is for 
        the `.to_geodataframe()` conversion option. 

               
    Returns
    ----------------
    df : :class:`pandas.DataFrame`
        a pandas dataframe containing all the fields from the original 
        location polygons + columns containing the values of the aggregated
        variables at each location. This can then easily be exported as a 
        csv directly (using ``df.to_csv()``) or to shapefiles by first turning into
        a geodataframe. 
    
    """
    # Test to make sure there's only one non-location dimension
    var_dims = {var:[d for d in agg_obj.ds_in[var].sizes if d != 'loc'] for var in agg_obj.ds_in}
    var_ndims = {var:len(dims) for var,dims in var_dims.items()}
    if np.max([n for d,n in var_ndims.items()]) > 1:
        raise NotImplementedError('The `agg` object has variables with more than 1 non-location dimension; '+
                                  'agg.to_csv() and agg.to_geodataframe() return wide arrays, but the code can not yet create wide arrays spanning data with multiple `wide` dimensions. '+
                                  'Try agg.to_dataframe() instead. (the offending variables with their non-location dimensions are '+
                                  str({var:dims for var,dims in var_dims.items() if var_ndims[var]>1})+')')

    # For output into csv, work with existing geopandas data frame
    csv_out = agg_obj.agg.drop(columns=['rel_area','pix_idxs','coords','poly_idx'])
    
    # Now expand the aggregated variable into multiple columns
    for var in [c for c in agg_obj.agg.columns if ((c not in ['poly_idx','rel_area','pix_idxs','coords']) & (c in agg_obj.ds_in.var()))]:
        # NOT YET IMPLEMENTED: dynamic column naming - so if it recognizes 
        # it as a date, then instead of doing var0, var1, var2,... it does
        # varYYYYMMDD etc.
        # These are the coordinates of the variable in the original raster
        dimsteps = [agg_obj.ds_in[var][d].values for d in agg_obj.ds_in[var].sizes.keys() if d not in ['lat','lon','loc']]
        # ALSO SHOULD check to see if the variables are multi-D - if they are
        # there are two options: 
        # - a multi-index column title (var0-0, var0-1)
        # - or an error saying csv output is not supported for this
    
        # Reshape the variable wide and name the columns [var]0, [var]1,...
        if len(dimsteps) == 0:
            # (in this case, all it does is move from one list per row to 
            # one value per row)
            #expanded_var = (pd.DataFrame(pd.DataFrame(csv_out[var].to_list())[0].to_list(),
            #                         columns=[var]))
            expanded_var = pd.DataFrame(csv_out[var].apply(np.squeeze).to_list(),
                                        columns=[var])
        else:
            #expanded_var = (pd.DataFrame(pd.DataFrame(csv_out[var].to_list())[0].to_list(),
            #                         columns=[var+str(idx) for idx in np.arange(0,len(csv_out[var][0][0]))]))
            expanded_var = pd.DataFrame(csv_out[var].apply(np.squeeze).to_list(),
                                        columns = [var+str(idx) for idx in range(len(csv_out[var].apply(np.squeeze))+1)])
        # Append to existing series
        csv_out = pd.concat([csv_out.drop(columns=(var)),
                             expanded_var],
                            axis=1)
        del expanded_var

    if add_geom:
        # Return the geometry from the original geopandas.GeoDataFrame
        csv_out['geometry'] = agg_obj.geometry
        csv_out = csv_out.set_geometry('geometry')
        
    # Return 
    return csv_out




[docs]
def output_data(agg_obj,output_format,output_fn,loc_dim='poly_idx',silent=False):
    """ Wrapper for `prep_for_*` functions
    
    
    Parameters
    ---------------
    agg_obj : :class:`xagg.classes.aggregated` 
        object to be exported   

    output_format : :py:class:`str`
        'netcdf', 'csv', or 'shp'
    output_fn: :py:class:`str`
        the output filename   
    loc_dim : :py:class:`str`, optional. default = ``'poly_idx'``
        the name of the
        dimension with location indices; used
        by :func:`xagg.export.prep_for_nc` for nc, 
        csv output
                     
    Returns
    ---------------
    the variable that gets saved, so depending on the `output_format`:
        -  "netcdf": the :class:`xarray.Dataset` on which ``.to_netcdf``
           was called
        -  "csv": the :class:`pandas.Dataframe` on which ``.to_csv`` 
           was called (uses the `xarray` `ds.to_dataframe()` functionality)
        - "shp": the :class:`geopandas.GeoDataDrame` on which ``.to_file`` was called
     
    """

    if output_format == 'netcdf':

        ds_out = prep_for_nc(agg_obj,loc_dim=loc_dim)

        # Save as netcdf
        if not output_fn.endswith('.nc'):
            output_fn = output_fn+'.nc'
        ds_out.to_netcdf(output_fn)
        if not silent:
            print(output_fn+' saved!')

        # Return
        return ds_out

    elif output_format == 'csv':

        csv_out = prep_for_nc(agg_obj,loc_dim=loc_dim)
        csv_out = csv_out.to_dataframe()

        # Save as csv
        if not output_fn.endswith('.csv'):
            output_fn = output_fn+'.csv'
        csv_out.to_csv(output_fn)
        if not silent:
            print(output_fn+' saved!')

        # Return 
        return csv_out

    elif output_format == 'shp':
        # This uses the same processing as the to_csv option above, since 
        # shapefiles function similarly (each field can only have one value
        # for each polygon, etc.)
        csv_out = prep_for_csv(agg_obj)

        # gdf_in.geometry should be replaced with gdf_out['geometry']
        # which should be kept when gdf_out is created... 
        # Turn back into GeoDataFrame
        shp_out = gpd.GeoDataFrame(csv_out,geometry=agg_obj.geometry)

        # Export that GeoDataFrame to shapefile
        if not output_fn.endswith('.shp'):
            output_fn = output_fn+'.shp'
        shp_out.to_file(output_fn)
        if not silent:
            print(output_fn+' saved!')

        # Return
        return shp_out

    else: 
        raise KeyError(output_format+' is not a supported output format.')