import logging
import warnings
import numpy as np
import pandas as pd
import pyam
import tqdm
from silicone.database_crunchers import ConstantRatio, QuantileRollingWindows
"""
Infills all required data for MAGICC and FAIR emulators with minimal configuration
"""
def infill_all_required_variables(
    to_fill,
    database,
    variable_leaders,
    required_variables_list=None,
    cruncher=QuantileRollingWindows,
    output_timesteps=None,
    infilled_data_prefix=None,
    to_fill_old_prefix=None,
    check_data_returned=False,
    **kwargs,
):
    """
    Infill all required data given a minimal amount of input.

    Parameters
    ----------
    to_fill : :obj:`pyam.IamDataFrame`
        The dataframe which is to be infilled.
    database : :obj:`pyam.IamDataFrame`
        The dataframe containing all information to be used in the infilling
        process.
    variable_leaders : list[str]
        The name of the variable(s) found in ``to_fill`` which should be used to
        determine the values of the other variables. For most infillers
        (including the default) this list must contain only one entry.
        E.g. ``["Emissions|CO2"]``.
    required_variables_list : list[str]
        The list of variables to infill. Each will be done separately. The
        default behaviour (None option) will result in this being filled with
        the complete list of required emissions.
    cruncher : :class:
        The class of cruncher to use to compute the infilled values. Defaults to
        QuantileRollingWindows, which uses the median value of a rolling window.
        See the cruncher documentation for more details.
    output_timesteps : list[int or datetime]
        List of times at which to return infilled values. Will interpolate
        values in between known data, but will not extend beyond the range of
        data provided.
    infilled_data_prefix : str
        A string that should be prefixed on all the variable names of the
        results returned. Used to distinguish returned values from those input.
    to_fill_old_prefix : str
        Any string already found at the beginning of the variable names of the
        input ``to_fill`` dataframe. This will be removed before comparing the
        variable names with ``database``.
    check_data_returned : bool
        If True, we perform checks that all desired data has been returned.
        Potential reasons for failing this include requesting results at times
        outside our input time range, as well as code bugs.
    **kwargs :
        An optional dictionary of keyword : arguments to be used with the
        cruncher.

    Returns
    -------
    :obj:`pyam.IamDataFrame`
        The infilled dataframe (including input data) at requested times. All
        variables now begin with ``infilled_data_prefix`` instead of
        ``to_fill_old_prefix``.

    Raises
    ------
    ValueError
        If ``output_timesteps`` is left as None for a datetime-based dataframe,
        if ``to_fill_old_prefix`` does not prefix all input variables, or if
        ``infilled_data_prefix`` already prefixes some input variables.
    """
    # Use default arguments for unfilled options:
    if output_timesteps is None:
        # The default decadal-year grid only makes sense for year-based data.
        if to_fill.time_col == "time":
            raise ValueError(
                "No default behaviour for output_timesteps when dataframe has time "
                "column instead of years"
            )
        output_timesteps = [2015] + list(range(2020, 2101, 10))
    if required_variables_list is None:
        # Complete list of emissions species required by the MAGICC/FaIR emulators.
        required_variables_list = [
            "Emissions|BC",
            "Emissions|PFC|CF4",
            "Emissions|PFC|C2F6",
            "Emissions|PFC|C6F14",
            "Emissions|CH4",
            "Emissions|CO2|AFOLU",
            "Emissions|CO2|Energy and Industrial Processes",
            "Emissions|CO",
            "Emissions|HFC|HFC134a",
            "Emissions|HFC|HFC143a",
            "Emissions|HFC|HFC227ea",
            "Emissions|HFC|HFC23",
            "Emissions|HFC|HFC32",
            "Emissions|HFC|HFC43-10",
            "Emissions|HFC|HFC245ca",
            "Emissions|HFC|HFC125",
            "Emissions|N2O",
            "Emissions|NH3",
            "Emissions|NOx",
            "Emissions|OC",
            "Emissions|SF6",
            "Emissions|Sulfur",
            "Emissions|VOC",
        ]
    # Check that the input is valid
    if to_fill_old_prefix:
        # Every input variable must carry the old prefix before we strip it.
        if any(
            to_fill.data["variable"].map(lambda x: x[: len(to_fill_old_prefix)])
            != to_fill_old_prefix
        ):
            raise ValueError("Not all of the data begins with the expected prefix")
        to_fill.rename(
            {
                "variable": {
                    var: var.replace(to_fill_old_prefix + "|", "")
                    for var in to_fill.variable
                }
            },
            inplace=True,
        )
    if infilled_data_prefix:
        # Guard against double-infilling: no input variable may already carry
        # the output prefix.
        if any(
            to_fill.data["variable"].map(lambda x: x[: len(infilled_data_prefix)])
            == infilled_data_prefix
        ):
            raise ValueError(
                "This data already contains values with the expected final "
                "prefix. This suggests that some of it has already been infilled."
            )
    assert len(to_fill.region) == 1, "There are {} regions in the data.".format(
        len(to_fill.region)
    )
    assert len(database.region) == 1
    assert (
        to_fill.data["region"].iloc[0] == database.data["region"].iloc[0]
    ), "The cruncher data and the infilled data have different regions."
    # Perform any interpolations required here
    to_fill_orig = to_fill.copy()
    timecol = database.time_col
    assert timecol == to_fill.time_col
    # ensure we have all required timesteps
    if isinstance(output_timesteps, np.ndarray):
        output_timesteps = list(output_timesteps)
    if timecol == "year":
        output_timesteps = [int(v) for v in output_timesteps]
    database = database.interpolate(output_timesteps, inplace=False)
    to_fill = to_fill.interpolate(output_timesteps, inplace=False)
    # Nans in additional columns break pyam, so we overwrite them
    database.data[database.extra_cols] = database.data[database.extra_cols].fillna(0)
    to_fill.data[to_fill.extra_cols] = to_fill.data[to_fill.extra_cols].fillna(0)
    # Filter for desired times
    if timecol == "year":
        database = database.filter(year=output_timesteps)
        to_fill = to_fill.filter(year=output_timesteps)
    else:
        database = database.filter(time=output_timesteps)
        to_fill = to_fill.filter(time=output_timesteps)
    # Infill unavailable data
    assert not database.data.isnull().any().any()
    assert not to_fill.data.isnull().any().any()
    unavailable_variables = [
        variab for variab in required_variables_list if variab not in database.variable
    ]
    if unavailable_variables:
        warnings.warn(
            UserWarning(
                "No data for {}, it will be infilled with 0s".format(
                    unavailable_variables
                )
            )
        )
        # Infill the required variables with 0s.
        kwarg_dict = {"ratio": 0, "units": "Mt CO2-equiv/yr"}
        to_fill = _perform_crunch_and_check(
            unavailable_variables,
            variable_leaders,
            to_fill,
            database,
            ConstantRatio,
            output_timesteps,
            to_fill_orig,
            check_data_returned=False,
            **kwarg_dict,
        )
    available_variables = [
        variab
        for variab in required_variables_list
        if variab not in unavailable_variables
    ]
    if available_variables:
        to_fill = _perform_crunch_and_check(
            available_variables,
            variable_leaders,
            to_fill,
            database,
            cruncher,
            output_timesteps,
            to_fill_orig,
            check_data_returned=check_data_returned,
            **kwargs,
        )
    if infilled_data_prefix:
        to_fill.rename(
            {
                "variable": {
                    var: infilled_data_prefix + "|" + var for var in to_fill.variable
                }
            },
            inplace=True,
        )
    return to_fill
def _perform_crunch_and_check(
    required_variables,
    leaders,
    to_fill,
    df,
    type_of_cruncher,
    output_timesteps,
    to_fill_orig,
    check_data_returned=False,
    **kwargs,
):
    """
    Take a list of variables to infill and infill them according to the options
    presented.

    Parameters
    ----------
    required_variables : list[str]
        The variable names to infill.
    leaders : list[str]
        The leaders to guide the infilling.
    to_fill : IamDataFrame
        The data frame to infill.
    df : IamDataFrame
        The data frame to base the infilling on.
    type_of_cruncher : :obj: silicone cruncher
        The silicone package cruncher class to use for the infilling.
    output_timesteps : list[int or datetime]
        When there should be data returned. Time-based interpolation will occur
        if this is more frequent than the data allows, data will be filtered
        out if there is additional time information.
    to_fill_orig : IamDataFrame
        The original, unfiltered and unaltered data input. We use this for
        performing checks.
    check_data_returned : bool
        If True, assert that all requested timesteps are present, there are no
        NaNs in the result, and the original input data was not overwritten.
    kwargs : Dict
        Any key word arguments to include in the cruncher calculation.

    Returns
    -------
    :obj:IamDataFrame
        The infilled dataframe.
    """
    cruncher = type_of_cruncher(df)
    filled = [to_fill]
    for req_var in tqdm.tqdm(required_variables, desc="Filling required variables"):
        try:
            infilled = _infill_variable(cruncher, req_var, leaders, to_fill, **kwargs)
        except ValueError:
            # Name the offending variable, then re-raise the original exception
            # unchanged so the full traceback and message are preserved.
            warnings.warn(f"Error encountered when infilling {req_var}")
            raise
        if infilled:
            filled.append(infilled)
    filled = pyam.concat(filled)
    # Optionally check we have added all the required data
    if not check_data_returned:
        return filled
    assert not filled.empty
    check_ts = filled.timeseries()
    assert not check_ts.isnull().any().any()
    missing_time_error = "We do not have data for all required timesteps"
    if filled.time_col == "year":
        assert all(y in check_ts.columns for y in output_timesteps), missing_time_error
    else:
        assert all(
            pd.to_datetime(t) in check_ts.columns for t in output_timesteps
        ), missing_time_error
    # Check no data was overwritten by accident
    orig_ts = to_fill_orig.timeseries()
    common_times = check_ts.columns.intersection(orig_ts.columns)
    if not common_times.empty:
        # Align on the original index so we compare only rows present in the
        # input, at the timesteps both frames share.
        check_ts, orig_ts = check_ts.align(orig_ts, join="right")
        pd.testing.assert_frame_equal(
            check_ts[common_times],
            orig_ts[common_times],
            obj="Consistency with original model data checks",
        )
    return filled
def _infill_variable(cruncher_i, req_variable, leader_i, to_fill_i, **kwargs):
    """
    Perform the actual crunching for scenarios that lack the required variable.

    Parameters
    ----------
    cruncher_i : :obj: silicone cruncher
        The initiated silicone cruncher to use for the infilling.
    req_variable : str
        The follower variable to infill.
    leader_i : list[str]
        The leader variable to guide the infilling.
    to_fill_i : IamDataFrame
        The dataframe to infill. Note: its meta gains an ``already_filled``
        column as a side effect of the scenario bookkeeping below.
    kwargs : Dict
        Any key word arguments to include in the cruncher calculation.

    Returns
    -------
    :obj:IamDataFrame
        The infilled component of the dataframe (or None if no infilling done).
    """
    filler = cruncher_i.derive_relationship(req_variable, leader_i, **kwargs)
    # only fill for scenarios who don't have that variable
    # quieten logging about empty data frames as it doesn't matter here
    pyam_logger = logging.getLogger("pyam.core")
    pyam_logger.setLevel(logging.CRITICAL)
    # try/finally so the logger is restored on every exit path (the original
    # code left it at CRITICAL whenever infilling actually occurred).
    try:
        mod_scens_already_full = to_fill_i.meta.copy()
        mod_scens_already_full["already_filled"] = False
        mod_scens_already_full.loc[
            to_fill_i.filter(variable=req_variable).meta.index, "already_filled"
        ] = True
        to_fill_i.set_meta(mod_scens_already_full["already_filled"])
        to_fill_var = to_fill_i.filter(already_filled=False)
        if not to_fill_var.data.empty:
            return filler(to_fill_var)
        return None
    finally:
        pyam_logger.setLevel(logging.WARNING)