Source code for silicone.database_crunchers.latest_time_ratio

"""
Module for the database cruncher which uses the 'latest time ratio' technique.
"""
import logging
import warnings

import numpy as np
from pyam import IamDataFrame

from .base import _DatabaseCruncher

logger = logging.getLogger(__name__)


class LatestTimeRatio(_DatabaseCruncher):
    """
    Database cruncher which uses the 'latest time ratio' technique.

    This cruncher derives the relationship between two variables by simply assuming
    that the follower timeseries is equal to the lead timeseries multiplied by a
    scaling factor. The scaling factor is derived by calculating the ratio of the
    follower variable to the lead variable in the latest year in which the follower
    variable is available in the database. Since the derived relationship only
    depends on a single point in the database, no regressions or other calculations
    are performed.

    Once the relationship is derived, the 'filler' function will infill following:

    .. math::
        E_f(t) = R * E_l(t)

    where :math:`E_f(t)` is emissions of the follower variable and :math:`E_l(t)` is
    emissions of the lead variable, both in the infillee database (the data passed
    to the filler function).

    :math:`R` is the scaling factor, calculated separately for each lead timeseries
    as

    .. math::
        R = \\frac{ e_f(t_{\\text{last}}) }{ E_l(t_{\\text{last}}) }

    where :math:`t_{\\text{last}}` is the latest time at which the follower variable
    appears in the cruncher's database, and the lower case
    :math:`e_f(t_{\\text{last}})` is the (NaN-ignoring) average of all values of the
    follower variable in that database at :math:`t_{\\text{last}}`.
    """

    def derive_relationship(self, variable_follower, variable_leaders):
        """
        Derive the relationship between two variables from the database.

        Parameters
        ----------
        variable_follower : str
            The variable for which we want to calculate timeseries (e.g.
            ``"Emissions|C5F12"``).

        variable_leaders : list[str]
            The variable we want to use in order to infer timeseries of
            ``variable_follower`` (e.g. ``["Emissions|CO2"]``). Note that the 'latest
            time ratio' methodology gives the same result, independent of the value of
            ``variable_leaders`` in the database.

        Returns
        -------
        :obj:`func`
            Function which takes a :obj:`pyam.IamDataFrame` containing
            ``variable_leaders`` timeseries and returns timeseries for
            ``variable_follower`` based on the derived relationship between the two.
            Please see the source code for the exact definition (and docstring) of the
            returned function.

        Raises
        ------
        ValueError
            ``variable_leaders`` does not contain exactly one variable.

        ValueError
            There is no data for ``variable_leaders`` or ``variable_follower`` in the
            database.

        """
        iamdf_follower = self._get_iamdf_follower(variable_follower, variable_leaders)
        data_follower = iamdf_follower.data

        # The "key timepoint" is the latest time at which the follower variable has
        # data; everything below is anchored on that single point.
        data_follower_time_col = iamdf_follower.time_col
        data_follower_key_timepoint = max(data_follower[data_follower_time_col])
        key_timepoint_filter = {data_follower_time_col: [data_follower_key_timepoint]}

        # Average (ignoring NaNs) over every follower row at the key timepoint.
        data_follower_key_year_val = np.nanmean(
            iamdf_follower.filter(**key_timepoint_filter).data["value"].values
        )
        # The unit of the first follower row is used for all output rows.
        data_follower_unit = data_follower["unit"].values[0]

        if data_follower_time_col == "time":
            # The filter above keeps the original value; the converted datetime is
            # what gets passed to ``interpolate`` and shown in error messages.
            # NOTE(review): assumes "time" values are pandas Timestamps - confirm.
            data_follower_key_timepoint = data_follower_key_timepoint.to_pydatetime()

        def filler(in_iamdf, interpolate=False):
            """
            Filler function derived from :obj:`LatestTimeRatio`.

            Parameters
            ----------
            in_iamdf : :obj:`pyam.IamDataFrame`
                Input data to fill data in

            interpolate : bool
                If the key year for filling is not in ``in_iamdf``, should a value be
                interpolated?

            Returns
            -------
            :obj:`pyam.IamDataFrame`
                Filled in data (without original source data)

            Raises
            ------
            ValueError
                The time column of ``in_iamdf`` differs from the time column of the
                data used to derive this filler.

            ValueError
                The key year for filling is not in ``in_iamdf`` and ``interpolate is
                False``.
            """
            # Fail fast on incompatible time axes before doing any filtering work.
            if data_follower_time_col != in_iamdf.time_col:
                raise ValueError(
                    "`in_iamdf` time column must be the same as the time column used "
                    "to generate this filler function (`{}`)".format(
                        data_follower_time_col
                    )
                )

            lead_var = in_iamdf.filter(variable=variable_leaders)

            if (lead_var.data["value"] < 0).any():
                warn_str = "Note that the lead variable {} goes negative.".format(
                    variable_leaders
                )
                logger.warning(warn_str)
                # Also printed (in addition to logging) to preserve the existing
                # user-visible output of this function.
                print(warn_str)

            def get_values_in_key_timepoint(idf):
                # filter warning about empty data frame as we handle it ourselves
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")
                    return idf.filter(**key_timepoint_filter)

            lead_var_val_in_key_timepoint = get_values_in_key_timepoint(lead_var)

            if lead_var_val_in_key_timepoint.data.empty:
                if not interpolate:
                    error_msg = (
                        "Required downscaling timepoint ({}) is not in the data for "
                        "the lead gas ({})".format(
                            data_follower_key_timepoint, variable_leaders[0]
                        )
                    )
                    raise ValueError(error_msg)
                # Interpolate a value at the key timepoint so the ratio can be
                # computed, then drop that synthetic point again so the output only
                # contains the timepoints originally present in the lead data.
                lead_var = lead_var.interpolate(
                    data_follower_key_timepoint, inplace=False
                )
                lead_var_val_in_key_timepoint = get_values_in_key_timepoint(lead_var)
                lead_var.filter(**key_timepoint_filter, keep=False, inplace=True)

            lead_var_val_in_key_timepoint = lead_var_val_in_key_timepoint.timeseries()
            if lead_var_val_in_key_timepoint.shape[1] != 1:  # pragma: no cover
                raise AssertionError(
                    "How did filtering for a single timepoint result in more than "
                    "one column?"
                )

            # One lead value per timeseries row at the key timepoint.
            lead_var_val_in_key_timepoint = lead_var_val_in_key_timepoint.iloc[:, 0]

            # One scaling factor per lead timeseries; transposing puts the
            # timeseries on the columns so the multiplication aligns with the
            # scaling factors by timeseries rather than by time.
            scaling = data_follower_key_year_val / lead_var_val_in_key_timepoint
            output_ts = (lead_var.timeseries().T * scaling).T.reset_index()

            output_ts["variable"] = variable_follower
            output_ts["unit"] = data_follower_unit
            return IamDataFrame(output_ts)

        return filler

    def _get_iamdf_follower(self, variable_follower, variable_leaders):
        """
        Validate the inputs and return the follower data from the database.

        Parameters
        ----------
        variable_follower : str
            The variable whose data should be returned.

        variable_leaders : list[str]
            The lead variable; must contain exactly one entry.

        Returns
        -------
        :obj:`pyam.IamDataFrame`
            ``self._db`` filtered to ``variable_follower``.

        Raises
        ------
        ValueError
            ``variable_leaders`` does not contain exactly one variable.
        """
        # An empty list is rejected too: the filler later indexes
        # ``variable_leaders[0]`` and needs exactly one lead variable.
        if len(variable_leaders) != 1:
            raise ValueError(
                "For `LatestTimeRatio`, ``variable_leaders`` should only "
                "contain one variable"
            )

        # Defined outside this class (presumably on the base class); expected to
        # raise if either variable is missing from the database.
        self._check_follower_and_leader_in_db(variable_follower, variable_leaders)

        return self._db.filter(variable=variable_follower)