"""
Module for the database cruncher which uses the 'equal quantile walk' technique.
"""
import numpy as np
from pyam import IamDataFrame
from ..stats import calc_quantiles_of_data
from ..utils import _make_weighting_series
from .base import _DatabaseCruncher
class EqualQuantileWalk(_DatabaseCruncher):
    """
    Database cruncher which uses the 'equal quantile walk' technique.

    This cruncher assumes that the amount of effort going into reducing one emission
    set is equal to that for another emission, therefore the lead and follow data
    should be at the same quantile of all pathways in the infiller database.

    It calculates the quantile of the lead infillee data in the lead infiller
    database, then outputs that quantile of the follow data in the infiller database.
    """

    def derive_relationship(
        self, variable_follower, variable_leaders, smoothing=None, weighting=None
    ):
        """
        Derive the relationship between two variables from the database.

        Parameters
        ----------
        variable_follower : str
            The variable for which we want to calculate timeseries (e.g.
            ``"Emissions|C5F12"``).

        variable_leaders : list[str]
            The variable we want to use in order to infer timeseries of
            ``variable_follower`` (e.g. ``["Emissions|CO2"]``). Must contain
            no more than one entry.

        smoothing : float or string
            By default, no smoothing is done on the distribution. If a value is
            provided, it is fed into :func:`scipy.stats.gaussian_kde` - see full
            documentation there. In short, if a float is input, we fit a Gaussian
            kernel density estimator with that width to the points. If a string is
            used, it must be either "scott" or "silverman", after those two methods
            of determining the best kernel bandwidth.

        weighting : dict[(str, str), float]
            Dictionary mapping a two-string tuple (presumably
            ``(model, scenario)``) onto the weight, relative to a weight of 1 for
            the default. This does not have to include all scenarios in the
            database, but cannot include scenarios that are not in it. ``None``
            (the default) applies no weighting.

        Returns
        -------
        :obj:`func`
            Function which takes a :obj:`pyam.IamDataFrame` containing
            ``variable_leaders`` timeseries and returns timeseries for
            ``variable_follower`` based on the derived relationship between the
            two. Please see the source code for the exact definition (and
            docstring) of the returned function.

        Raises
        ------
        ValueError
            ``variable_leaders`` contains more than one variable.

        ValueError
            There is no data for ``variable_leaders`` or ``variable_follower`` in
            the database.

        TypeError
            ``weighting`` is neither ``None`` nor a dictionary.
        """
        iamdf_follower = self._get_iamdf_follower(variable_follower, variable_leaders)
        follower_ts = iamdf_follower.timeseries()
        data_follower_time_col = iamdf_follower.time_col
        data_follower_unit = iamdf_follower["unit"].values[0]
        lead_ts = self._db.filter(variable=variable_leaders).timeseries()

        if weighting is None:
            weighting_follow = None
            weighting_lead = None
        elif isinstance(weighting, dict):
            # One weight series per crunched timeseries, built from the same dict.
            weighting_follow = _make_weighting_series(follower_ts, weighting)
            weighting_lead = _make_weighting_series(lead_ts, weighting)
        else:
            raise TypeError("``weighting`` should be a dictionary")

        # The unit of the first lead row is the reference the infillee data must
        # match.
        lead_unit = lead_ts.index.get_level_values("unit")[0]

        def filler(in_iamdf):
            """
            Filler function derived from :obj:`EqualQuantileWalk`.

            Parameters
            ----------
            in_iamdf : :obj:`pyam.IamDataFrame`
                Input data to fill data in

            Returns
            -------
            :obj:`pyam.IamDataFrame`
                Filled in data (without original source data)

            Raises
            ------
            ValueError
                The lead variable in ``in_iamdf`` has a unit which differs from
                the unit of the lead variable in the crunched database.

            ValueError
                The time column of ``in_iamdf`` differs from the time column of
                the crunched database.

            ValueError
                ``in_iamdf`` contains no data for the lead variable.

            ValueError
                Not all required timepoints are present in the database we
                crunched.
            """
            lead_in = in_iamdf.filter(variable=variable_leaders)

            if not all(unit == lead_unit for unit in lead_in.unit):
                raise ValueError(
                    "Units of lead variable is meant to be `{}`, found `{}`".format(
                        lead_unit, lead_in.unit
                    )
                )

            if data_follower_time_col != in_iamdf.time_col:
                raise ValueError(
                    "`in_iamdf` time column must be the same as the time column used "
                    "to generate this filler function (`{}`)".format(
                        data_follower_time_col
                    )
                )

            if lead_in.data.empty:
                raise ValueError(
                    "There is no data for {} so it cannot be infilled".format(
                        variable_leaders
                    )
                )

            output_ts = lead_in.timeseries()

            # NaN gaps inside a column are tolerated (they are dropped or handled
            # in ``_find_same_quantile``), but every requested timepoint must
            # exist as a column in both the crunched lead and follow timeseries.
            timepoint_missing = any(
                (time not in lead_ts.columns) or (time not in follower_ts.columns)
                for time in output_ts.columns
            )
            if timepoint_missing:
                raise ValueError(
                    "Not all required timepoints are present in the database we "
                    "crunched, we crunched \n\t{} for the lead and \n\t{} for the "
                    "follow \nbut you passed in \n\t{}".format(
                        lead_ts.columns, follower_ts.columns, output_ts.columns
                    )
                )

            # Overwrite each timepoint's lead values with the follower values
            # sitting at the same quantile of the database.
            for col in output_ts.columns:
                output_ts[col] = self._find_same_quantile(
                    follower_ts[col],
                    lead_ts[col],
                    output_ts[col],
                    smoothing,
                    weighting_lead,
                    weighting_follow,
                )

            # Relabel the rows so the result describes the follower variable.
            output_ts = output_ts.reset_index()
            output_ts["variable"] = variable_follower
            output_ts["unit"] = data_follower_unit

            return IamDataFrame(output_ts)

        return filler

    def _get_iamdf_follower(self, variable_follower, variable_leaders):
        """
        Validate the requested variables and return the follower data.

        Parameters
        ----------
        variable_follower : str
            Variable to select from the database.

        variable_leaders : list[str]
            Lead variables; at most one entry is accepted.

        Returns
        -------
        :obj:`pyam.IamDataFrame`
            The database filtered down to ``variable_follower``.

        Raises
        ------
        ValueError
            ``variable_leaders`` contains more than one variable. Further checks
            are delegated to ``_check_follower_and_leader_in_db``.
        """
        if len(variable_leaders) > 1:
            raise ValueError(
                "For `EqualQuantileWalk`, ``variable_leaders`` should only "
                "contain one variable"
            )

        self._check_follower_and_leader_in_db(variable_follower, variable_leaders)

        return self._db.filter(variable=variable_follower)

    def _find_same_quantile(
        self,
        follow_vals,
        lead_vals,
        lead_input,
        smoothing,
        weighting_lead,
        weighting_follow,
    ):
        """
        Find the follower values at the same quantiles as the lead input.

        Parameters
        ----------
        follow_vals
            Follower values from the database at one timepoint; NaNs are dropped
            before use.

        lead_vals
            Lead values from the database at the same timepoint.

        lead_input
            Lead values to be infilled, whose quantiles within ``lead_vals`` are
            calculated.

        smoothing
            Passed unchanged to ``calc_quantiles_of_data``.

        weighting_lead
            Weights passed with ``lead_vals`` (or ``None``).

        weighting_follow
            Weights passed with ``follow_vals`` (or ``None``).

        Returns
        -------
        The NaN-ignoring mean of ``follow_vals`` if every calculated quantile is
        NaN; otherwise the result of ``calc_quantiles_of_data`` called on the
        follower values with ``to_quantile=False`` (presumably the follower values
        at those quantiles).
        """
        # Dispose of nans that can cloud the calculation
        follow_vals = follow_vals[~np.isnan(follow_vals)]

        input_quantiles = calc_quantiles_of_data(
            lead_vals, lead_input, smoothing, weighting_lead
        )

        # No quantile could be determined: fall back to the follower mean.
        if all(np.isnan(input_quantiles)):
            return np.nanmean(follow_vals)

        return calc_quantiles_of_data(
            follow_vals, input_quantiles, smoothing, weighting_follow, to_quantile=False
        )