Source code for silicone.database_crunchers.time_dep_ratio

"""
Module for the database cruncher which uses the 'time-dependent ratio' technique.
"""
import logging
import warnings

import numpy as np
import pandas as pd
from pyam import IamDataFrame

from .base import _DatabaseCruncher

# Module-level logger named after this module; the filler function uses it to
# report lead data that goes negative.
logger = logging.getLogger(__name__)


class TimeDepRatio(_DatabaseCruncher):
    """
    Database cruncher which uses the 'time-dependent ratio' technique.

    This cruncher derives the relationship between two variables by simply assuming
    that the follower timeseries is equal to the lead timeseries multiplied by a
    time-dependent scaling factor. The scaling factor is the ratio of the
    follower variable to the lead variable. If the database contains many such pairs,
    the scaling factor is the ratio between the means of the values. By default, the
    calculation will include only values where the lead variable takes the same sign
    (+ or -) in the infilling database as in the case infilled. This prevents getting
    negative values of emissions that cannot be negative. To allow cases where we
    have no data of the correct sign, set `same_sign = False` in `derive_relationship`.

    Once the relationship is derived, the 'filler' function will infill following:

    .. math::
        E_f(t) = R(t) * E_l(t)

    where :math:`E_f(t)` is emissions of the follower variable and :math:`E_l(t)` is
    emissions of the lead variable.

    :math:`R(t)` is the scaling factor, calculated as the ratio of the means of
    the follower and the leader in the infiller database, denoted with
    lower case e. (By default, we include only cases where `sign(e_l(t))` is the
    same in both databases.) The cruncher will log a warning if the lead data is
    ever negative, which can create complications for the use of this cruncher.

    .. math::
        R(t) = \\frac{mean( e_f(t) )}{mean( e_l(t) )}

    """

    def derive_relationship(
        self,
        variable_follower,
        variable_leaders,
        same_sign=True,
        only_consistent_cases=True,
    ):
        """
        Derive the relationship between two variables from the database.

        Parameters
        ----------
        variable_follower : str
            The variable for which we want to calculate timeseries (e.g.
            ``"Emissions|C5F12"``).

        variable_leaders : list[str]
            The variable we want to use in order to infer timeseries of
            ``variable_follower`` (e.g. ``["Emissions|CO2"]``).

        same_sign : bool
            Do we want to only use data where the leader has the same sign in the
            infiller and infillee data? If so, we have a potential error from
            not having data of the correct sign, but have more confidence in the
            sign of the follower data. Lead values that are exactly zero are
            grouped with the negative values.

        only_consistent_cases : bool
            Do we want to only use model/scenario combinations where both lead and
            follow have data at all times? This will reduce the risk of inconsistencies
            or unevenness in the results, at the cost of some extra filtering time
            that is unnecessary if you know the data is consistent. Scenario/model
            pairs where data is only returned at certain times will be removed, as
            will any scenarios not returning both lead and follow data.

        Returns
        -------
        :obj:`func`
            Function which takes a :obj:`pyam.IamDataFrame` containing
            ``variable_leaders`` timeseries and returns timeseries for
            ``variable_follower`` based on the derived relationship between the two.
            Please see the source code for the exact definition (and docstring) of the
            returned function.

        Raises
        ------
        ValueError
            ``variable_leaders`` contains more than one variable.

        ValueError
            There is no data for ``variable_leaders`` or ``variable_follower`` in the
            database.

        ValueError
            The follower or the leader data does not have exactly one unit.

        ValueError
            The follower and leader timeseries have different sizes.
        """
        if only_consistent_cases:
            # Rows with a missing value at any time are discarded first.
            consistent_cases = (
                self._db.filter(variable=variable_leaders + [variable_follower])
                .timeseries()
                .dropna()
            )
            # Keep only rows whose model/scenario/region combination still occurs
            # more than once, which drops cases where only one of the lead and
            # follower survived the filter above.
            consistent_cases = consistent_cases.loc[
                consistent_cases.index.to_frame().duplicated(
                    ["model", "scenario", "region"], keep=False
                )
            ]
            self._filtered_db = IamDataFrame(consistent_cases)
        else:
            self._filtered_db = self._db
        iamdf_follower, data_follower = self._get_iamdf_followers(
            variable_follower, variable_leaders
        )

        data_follower_unit = np.unique(iamdf_follower.data["unit"].values)
        if data_follower_unit.size == 1:
            data_follower_unit = data_follower_unit[0]
        else:
            raise ValueError("There are multiple/no units in follower data")
        data_follower_time_col = iamdf_follower.time_col
        iamdf_leader = self._filtered_db.filter(variable=variable_leaders[0])
        data_leader = iamdf_leader.timeseries()
        if iamdf_leader["unit"].nunique() != 1:
            raise ValueError("There are multiple/no units for the leader data.")
        if data_follower.size != data_leader.size:
            error_msg = "The follower and leader data have different sizes"
            raise ValueError(error_msg)

        # Calculate the ratios to use: one row per time, with separate columns for
        # the ratio applied to positive and to non-positive lead values.
        all_times = np.unique(iamdf_leader.data[iamdf_leader.time_col])
        scaling = pd.DataFrame(index=all_times, columns=["pos", "neg"], dtype=float)
        if same_sign:
            # An empty sign group gives a NaN ratio (mean of an empty slice); numpy
            # reports this with RuntimeWarnings, which are expected here. The NaN
            # is detected later, when the filler actually needs that ratio.
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", category=RuntimeWarning)
                for year in all_times:
                    # The leader's mask is applied positionally to the follower,
                    # so both tables must list their cases in the same order.
                    pos_inds = data_leader[year].values > 0
                    # ``.loc`` writes into ``scaling`` itself; chained indexing
                    # (``scaling["pos"][year] = ...``) is not guaranteed to.
                    scaling.loc[year, "pos"] = np.nanmean(
                        data_follower[year].iloc[pos_inds].values
                    ) / np.nanmean(data_leader[year].iloc[pos_inds].values)
                    scaling.loc[year, "neg"] = np.nanmean(
                        data_follower[year].iloc[~pos_inds].values
                    ) / np.nanmean(data_leader[year].iloc[~pos_inds].values)
        else:
            # A single ratio per time is used whatever the sign of the lead.
            for year in all_times:
                scaling.loc[year, "pos"] = np.mean(
                    data_follower[year].values
                ) / np.mean(data_leader[year].values)
            scaling["neg"] = scaling["pos"]

        # Times for which follower data exists; built once rather than per lookup.
        times_available = set(iamdf_follower[data_follower_time_col])

        def filler(in_iamdf):
            """
            Filler function derived from :obj:`TimeDepRatio`.

            Parameters
            ----------
            in_iamdf : :obj:`pyam.IamDataFrame`
                Input data to fill data in

            Returns
            -------
            :obj:`pyam.IamDataFrame`
                Filled-in data (without original source data)

            Raises
            ------
            AssertionError
                The lead variable in ``in_iamdf`` does not have exactly one unit.

            ValueError
                The time column of ``in_iamdf`` differs from the one used to derive
                this filler.

            ValueError
                ``in_iamdf`` contains times that are absent from the infiller
                database.

            ValueError
                A lead value needs a ratio that could not be derived because the
                infiller database holds no lead data of that sign at that time
                (zero counts as non-positive).
            """
            lead_var = in_iamdf.filter(variable=variable_leaders)
            # Raised explicitly (instead of ``assert``) so that the check is not
            # stripped when Python runs with ``-O``.
            if lead_var["unit"].nunique() != 1:
                raise AssertionError(
                    "There are multiple units for the lead variable."
                )
            if data_follower_time_col != in_iamdf.time_col:
                raise ValueError(
                    "`in_iamdf` time column must be the same as the time column used "
                    "to generate this filler function (`{}`)".format(
                        data_follower_time_col
                    )
                )
            if (lead_var.data["value"] < 0).any():
                warn_str = (
                    "Note that the lead variable {} goes negative. The time dependent "
                    "ratio cruncher can produce unexpected results in this case.".format(
                        variable_leaders
                    )
                )
                logger.warning(warn_str)
                # The message is also echoed to stdout, as it always has been.
                print(warn_str)
            times_needed = set(in_iamdf.data[in_iamdf.time_col])
            if not times_needed.issubset(times_available):
                error_msg = (
                    "Not all required timepoints are in the data for "
                    "the lead gas ({})".format(variable_leaders[0])
                )
                raise ValueError(error_msg)
            output_ts = lead_var.timeseries()

            for year in times_needed:
                lead_values = output_ts[year].values
                # Pick the ratio with the same rule used when deriving it: strictly
                # positive lead values use "pos", everything else uses "neg".
                ratios = np.where(
                    lead_values > 0,
                    scaling.loc[year, "pos"],
                    scaling.loc[year, "neg"],
                )
                # Only lead values that are present need a defined ratio; missing
                # lead values simply remain missing in the output.
                lead_present = ~pd.isnull(lead_values)
                if pd.isnull(ratios[lead_present]).any():
                    raise ValueError(
                        "Attempt to infill {} data using the time_dep_ratio cruncher "
                        "where the infillee data has a sign not seen in the infiller "
                        "database for year "
                        "{}.".format(variable_leaders, year)
                    )
                output_ts[year] = lead_values * ratios
            output_ts.reset_index(inplace=True)
            output_ts["variable"] = variable_follower
            output_ts["unit"] = data_follower_unit

            return IamDataFrame(output_ts)

        return filler

    def _get_iamdf_followers(self, variable_follower, variable_leaders):
        """
        Return the follower data from the filtered database.

        ``self._filtered_db`` must already be set, which ``derive_relationship``
        does before calling this method.

        Parameters
        ----------
        variable_follower : str
            The variable whose data is extracted.

        variable_leaders : list[str]
            The lead variables; at most one is accepted.

        Returns
        -------
        tuple
            The follower data as an :obj:`pyam.IamDataFrame` and the same data in
            timeseries (wide) form.

        Raises
        ------
        ValueError
            ``variable_leaders`` contains more than one variable.

        ValueError
            The filtered database holds no data for ``variable_follower``.
        """
        if len(variable_leaders) > 1:
            raise ValueError(
                "For `TimeDepRatio`, ``variable_leaders`` should only "
                "contain one variable"
            )

        # Helper defined outside this class; presumably raises if either variable
        # is missing from the database -- confirm against the base class.
        self._check_follower_and_leader_in_db(variable_follower, variable_leaders)

        iamdf_follower = self._filtered_db.filter(variable=variable_follower)
        if iamdf_follower.empty:
            raise ValueError(
                "No data is complete enough to use in the time-dependent ratio cruncher"
            )
        data_follower = iamdf_follower.timeseries()

        return iamdf_follower, data_follower