# Source code for silicone.database_crunchers.rms_closest

"""
Module for the database cruncher which uses the 'closest RMS' technique.
"""
import warnings

import numpy as np
import pandas as pd
import pyam

from .base import _DatabaseCruncher


class RMSClosest(_DatabaseCruncher):
    """
    Database cruncher which uses the 'closest RMS' technique.

    This cruncher derives the relationship between two or more variables by finding
    the scenario which has the most similar timeseries for the lead gases in the
    database. The follower gas timeseries is then simply copied from the closest
    scenario.

    Here, 'most similar' is defined as the smallest time-averaged root mean squared
    (L2) difference. If multiple lead values are used, they may be weighted
    differently to account for differences between the reported units. The most
    similar model/scenario combination minimises

    .. math::
        RMS error = \\sum_l w_l \\left ( \\frac{1}{n} \\sum_{t=0}^n (E_l(t) - e_l(t))^2 \\right )^{1/2}

    where :math:`l` is a lead gas, :math:`w_l` is a weighting for that lead gas,
    :math:`n` is the total number of timesteps in all lead gas timeseries,
    :math:`E_l(t)` is the lead gas emissions timeseries and :math:`e_l(t)` is a lead
    gas emissions timeseries in the infiller database.
    """

    def derive_relationship(self, variable_follower, variable_leaders, weighting=None):
        """
        Derive the relationship between the lead and the follow variables from the
        database.

        Parameters
        ----------
        variable_follower : str
            The variable for which we want to calculate timeseries (e.g.
            ``"Emissions|C5F12"``).

        variable_leaders : list[str]
            The variable we want to use in order to infer timeseries of
            ``variable_follower`` (e.g. ``["Emissions|CO2"]``). This may contain
            multiple elements.

        weighting : dict{str: float}
            When used with multiple lead variables, this weighting factor controls
            the relative importance of different variables for determining
            closeness. E.g. if wanting to compare both CO2 and CH4 emissions
            reported in mass units but weighted by the AR5 GWP100 metric, this
            would be {"Emissions|CO2": 1, "Emissions|CH4": 28}. If omitted (or
            empty), every lead variable gets a weight of 1.

        Returns
        -------
        :obj:`func`
            Function which takes a :obj:`pyam.IamDataFrame` containing
            ``variable_leaders`` timeseries and returns timeseries for
            ``variable_follower`` based on the derived relationship between the
            two. Please see the source code for the exact definition (and
            docstring) of the returned function.

        Raises
        ------
        ValueError
            There is no data for ``variable_leaders`` or ``variable_follower`` in
            the database.

        ValueError
            ``weighting`` is given but does not contain every entry of
            ``variable_leaders``.

        ValueError
            The leader and follower data share no model/scenario/time
            combinations (raised by ``_filter_for_overlap``).
        """
        self._check_iamdf_lead(variable_leaders)
        iamdf_follower = self._get_iamdf_section(variable_follower)
        data_follower_time_col = iamdf_follower.time_col
        iamdf_lead = self._db.filter(variable=variable_leaders)

        # A missing or empty weighting means "treat all leaders equally".
        if not weighting:
            weighting = {variab: 1 for variab in variable_leaders}
        if any(var not in weighting for var in variable_leaders):
            raise ValueError("Weighting does not include all lead variables.")

        # From here on ``iamdf_lead`` and ``iamdf_follower`` are the plain data
        # tables returned by ``_filter_for_overlap``, not the filtered database
        # objects.
        iamdf_lead, iamdf_follower = _filter_for_overlap(
            iamdf_lead,
            iamdf_follower,
            ["scenario", "model", data_follower_time_col],
            variable_leaders,
        )

        # Remember which unit each lead variable has in the database so the
        # filler can reject input reported in different units.
        leader_var_unit = {
            var["variable"]: var["unit"]
            for _, var in iamdf_lead[["variable", "unit"]].drop_duplicates().iterrows()
        }

        def filler(in_iamdf):
            """
            Filler function derived from :obj:`RMSClosest`.

            Parameters
            ----------
            in_iamdf : :obj:`pyam.IamDataFrame`
                Input data to fill data in

            Returns
            -------
            :obj:`pyam.IamDataFrame`
                Filled in data (without original source data)

            Raises
            ------
            ValueError
                If there are any inconsistencies between the timeseries, units or
                expectations of the program and ``in_iamdf``, compared to the
                database used to generate this ``filler`` function.
            """
            lead_var = in_iamdf.filter(variable=variable_leaders)
            var_units = lead_var.data[["variable", "unit"]].drop_duplicates()
            if any(key not in lead_var.variable for key in leader_var_unit):
                raise ValueError(
                    "Not all required variables are present in the infillee database"
                )
            if any(
                unit["unit"] != leader_var_unit[unit["variable"]]
                for _, unit in var_units.iterrows()
            ):
                raise ValueError(
                    "Units of lead variable is meant to be {}, found {}".format(
                        leader_var_unit, var_units
                    )
                )
            if data_follower_time_col != in_iamdf.time_col:
                raise ValueError(
                    "`in_iamdf` time column must be the same as the time column used "
                    "to generate this filler function (`{}`)".format(
                        data_follower_time_col
                    )
                )

            lead_var_timeseries = lead_var.timeseries()
            # Reshape the database lead data to wide form: one row per
            # combination of the non-time, non-value columns, one column per
            # time point.
            iamdf_lead_timeseries = iamdf_lead.pivot(
                index=[
                    col
                    for col in iamdf_lead.columns
                    if col not in [data_follower_time_col, "value"]
                ],
                columns=data_follower_time_col,
                values="value",
            )

            # Only time points present in both datasets can be compared.
            common_cols = [
                col
                for col in lead_var_timeseries.columns
                if col in iamdf_lead_timeseries.columns
            ]
            if not common_cols:
                raise ValueError(
                    "No time series overlap between the original and unfilled data"
                )
            lead_var_timeseries = lead_var_timeseries.loc[:, common_cols]
            # Database rows with gaps at any of the common times are dropped.
            iamdf_lead_timeseries = iamdf_lead_timeseries.loc[:, common_cols].dropna(
                axis=0
            )

            output_ts_list = []
            for _, (model, scenario) in (
                lead_var.data[["model", "scenario"]].drop_duplicates().iterrows()
            ):
                lead_var_mod_scen = lead_var_timeseries[
                    (lead_var_timeseries.index.get_level_values("model") == model)
                    & (
                        lead_var_timeseries.index.get_level_values("scenario")
                        == scenario
                    )
                ]
                # Exactly one row per lead variable is expected for each
                # model/scenario being infilled.
                if len(lead_var_mod_scen) != len(variable_leaders):
                    raise ValueError(
                        "Insufficient variables are found to infill model {}, scenario "
                        "{}. Only found {}.".format(model, scenario, lead_var_mod_scen)
                    )
                closest_model, closest_scenario = _select_closest(
                    iamdf_lead_timeseries,
                    lead_var_mod_scen,
                    weighting,
                    variable_leaders,
                )

                # Take the follower rows of the closest model/scenario.
                tmp = iamdf_follower.loc[
                    (iamdf_follower.model == closest_model)
                    & (iamdf_follower.scenario == closest_scenario)
                ].copy()

                # Update the model and scenario to match the elements of the input.
                tmp.loc[:, "model"] = model
                tmp.loc[:, "scenario"] = scenario
                # Extra columns take the value found on the first lead row of
                # the scenario being infilled.
                for col in in_iamdf.extra_cols:
                    tmp[col] = lead_var_mod_scen.index.get_level_values(col).tolist()[0]
                output_ts_list.append(pyam.IamDataFrame(tmp))
            return pyam.concat(output_ts_list)

        return filler

    def _check_iamdf_lead(self, variable_leaders):
        """
        Raise ``ValueError`` unless every entry of ``variable_leaders`` is a
        variable of the database.
        """
        if not all(v in self._db.variable for v in variable_leaders):
            error_msg = "No data for `variable_leaders` ({}) in database".format(
                variable_leaders
            )
            raise ValueError(error_msg)

    def _get_iamdf_section(self, variables):
        """
        Return the database filtered to ``variables``.

        Raises ``ValueError`` if the filtered data is empty.
        """
        # filter warning about empty data frame as we handle it ourselves
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            iamdf_section = self._db.filter(variable=variables)

        data_section = iamdf_section.data
        if data_section.empty:
            error_msg = "No data for `variable_follower` ({}) in database".format(
                variables
            )
            raise ValueError(error_msg)

        return iamdf_section
def _select_closest(to_search_df, target_df, weighting, variable_leaders):
    """
    Find model/scenario combo in ``to_search_df`` that is closest to that of the
    target.

    Here, 'closest' is in the root-mean squared sense: for every lead variable the
    RMS difference over the time columns is computed, multiplied by that
    variable's weighting, and the results are summed per model/scenario. In the
    event that multiple model/scenarios are equally close, returns first row.

    Parameters
    ----------
    to_search_df : :obj:`pd.DataFrame`
        The model/scenario combos to search for the closest case. A timeseries
        whose index has (at least) the levels ``model``, ``scenario`` and
        ``variable``.

    target_df : :obj:`pd.DataFrame`
        The data to which we want to be close. A timeseries whose index has a
        ``variable`` level.

    weighting : map{str: float}
        Maps the variable name onto the weighting for that variable.

    variable_leaders : list[str]
        The lead variables to include in the comparison.

    Returns
    -------
    tuple
        The ``(model, scenario)`` index label of the closest timeseries.
    """
    # The variable labels do not change between iterations, so look them up once.
    search_variables = to_search_df.index.get_level_values("variable")
    target_variables = target_df.index.get_level_values("variable")

    rms = pd.Series(0, index=to_search_df.index, dtype=np.float64)
    for var in variable_leaders:
        # Collapse the single target row to a Series keyed by time so that the
        # subtraction below aligns on the time columns.
        target_for_var = target_df[target_variables == var].squeeze()
        squared_diff = (
            to_search_df[search_variables == var].subtract(target_for_var, axis=1)
            ** 2
        )
        weighted_rms = squared_diff.mean(axis=1) ** 0.5 * weighting[var]
        # Rows belonging to other variables are absent from ``weighted_rms``;
        # ``fill_value=0`` leaves their running totals untouched.
        rms = rms.add(weighted_rms, fill_value=0)

    rmssums = rms.groupby(level=["model", "scenario"], sort=False).sum()
    return rmssums.idxmin()


def _filter_for_overlap(df1, df2, cols, leaders):
    """
    Returns overlapping model/scenario combinations in the two input dataframes,
    which must have the same columns.

    Parameters
    ----------
    df1 : object with a ``data`` :obj:`pd.DataFrame` attribute
        The lead data (e.g. a :obj:`pyam.IamDataFrame`). Only ``df1.data`` is
        read.

    df2 : object with a ``data`` :obj:`pd.DataFrame` attribute
        The follower data. Only ``df2.data`` is read.

    cols : list[str]
        List of columns that should be identical between the two dataframes.
        Typically "scenario", "model", and whatever the time column is.

    leaders : list[str]
        List of lead variables that must be found in all acceptable
        model/scenarios combinations.

    Returns
    -------
    (:obj:`pd.DataFrame`, :obj:`pd.DataFrame`)
        The filtered data tables of ``df1`` and ``df2``, in the order they were
        put in, restricted to the shared ``cols`` combinations.

    Raises
    ------
    ValueError
        No ``cols`` combination is present in the follower data and has exactly
        ``len(leaders)`` rows in the lead data.
    """
    lead_data = df1.data.set_index(cols)
    follow_data = df2.data.set_index(cols)

    # We only want to select model/scenario cases where we have data for all
    # leaders: count the lead rows per shared index entry and keep the entries
    # that occur once per leader.
    shared_counts = lead_data.index[
        lead_data.index.isin(follow_data.index)
    ].value_counts()
    shared_indices = shared_counts[shared_counts == len(leaders)].index.tolist()
    if not shared_indices:
        raise ValueError("No model/scenario overlap between leader and follower data")

    lead_data = lead_data.loc[shared_indices]
    follow_data = follow_data.loc[shared_indices]
    return lead_data.reset_index(), follow_data.reset_index()