import copy
import numpy as np
import pandas as pd
import logging

from dataiku.core.doctor_constants import END_OF_WEEK_DAY, UNIT_ALIGNMENT, CATEGORY_HANDLING, DUMMIFY
from dataiku.core.doctor_constants import NUMBER_OF_TIME_UNITS
from dataiku.core.doctor_constants import ROLLING_WINDOWS_RESOURCES
from dataiku.core.doctor_constants import TARGET_VARIABLE
from dataiku.core.doctor_constants import TIMESERIES_IDENTIFIER_COLUMNS
from dataiku.core.doctor_constants import TIME_STEP_PARAMS
from dataiku.core.doctor_constants import TIME_VARIABLE
from dataiku.core.doctor_constants import TIME_UNIT
from dataiku.doctor.diagnostics.timeseries import check_zero_target_ratio
from dataiku.doctor.utils import get_filtered_features
from dataiku.doctor.timeseries.preparation.resampling import Resampler
from dataiku.doctor.timeseries.preparation.resampling.utils import get_time_unit_end_of_week, get_monthly_day_alignment
from dataiku.doctor import step_constants
from dataiku.doctor.timeseries.utils import FULL_TIMESERIES_DF_IDENTIFIER
from dataiku.doctor.timeseries.utils import timeseries_iterator
from dataiku.doctor.timeseries.utils import add_timeseries_identifiers_columns
from dataiku.doctor.preprocessing_collector import PredictionPreprocessingDataCollector
from dataiku.doctor.preprocessing_handler import PredictionPreprocessingHandler
from dataiku.doctor.preprocessing_handler import read_resource
from dataiku.doctor.preprocessing_handler import write_resource
from dataiku.doctor.preprocessing.dataframe_preprocessing import OutputRawColumns
from dataiku.doctor.preprocessing.dataframe_preprocessing import RemapValueToOutput
from dataiku.doctor.preprocessing.dataframe_preprocessing import FetchRollingWindows


logger = logging.getLogger(__name__)


def get_external_features(preprocessing_params, supports_shifts=False):
    """Return the features of `preprocessing_params` filtered on the external feature roles.

    :param preprocessing_params: dict of preprocessing params, forwarded to `get_filtered_features`
    :param supports_shifts: when truthy, the "INPUT_PAST_ONLY" role is included on top of "INPUT"
    :return: the result of `get_filtered_features` for the selected roles
    """
    roles = ["INPUT"]
    if supports_shifts:
        roles.append("INPUT_PAST_ONLY")
    return get_filtered_features(preprocessing_params, include_roles=roles)

def has_external_features_or_windows(preprocessing_params, supports_shifts_and_windows=False):
    """Tell whether there is at least one external feature or (when supported) one rolling window.

    :param preprocessing_params: dict of preprocessing params
    :param supports_shifts_and_windows: whether shifts and windows are supported; windows are only
        looked at when this is truthy
    """
    external_features = get_external_features(preprocessing_params, supports_shifts_and_windows)
    if len(external_features) > 0:
        return True
    return supports_shifts_and_windows and len(get_windows_list(preprocessing_params)) > 0

def resample_timeseries(
    df, schema, resampling_params, core_params, numerical_columns, categorical_columns,
    compute_zero_target_ratio_diagnostic=False
):
    """Resample `df` with a `Resampler` configured from `resampling_params` and `core_params`.

    After resampling, the numerical columns whose schema type is an integer type are rounded.

    :param df: dataframe to resample
    :param schema: dict with a "columns" list of dicts having "name" and "type" keys
    :param resampling_params: dict of resampling settings (interpolation, extrapolation, imputation,
        start/end date modes, duplicate timestamps handling)
    :param core_params: dict holding the time step params, the time variable, the target variable
        and the time series identifier columns
    :param numerical_columns: names of the numerical columns to resample (may be None or empty)
    :param categorical_columns: names of the categorical columns to resample (may be None or empty)
    :param compute_zero_target_ratio_diagnostic: when True, `check_zero_target_ratio` is run on the
        resampled dataframe
    :return: the resampled dataframe
    """
    time_step_params = core_params[TIME_STEP_PARAMS]
    number_of_time_units = time_step_params[NUMBER_OF_TIME_UNITS]
    time_unit = time_step_params[TIME_UNIT]
    time_unit_end_of_week = get_time_unit_end_of_week(time_step_params[END_OF_WEEK_DAY])
    unit_alignment = time_step_params.get(UNIT_ALIGNMENT)
    monthly_alignment = get_monthly_day_alignment(core_params)
    timeseries_identifier_columns = core_params[TIMESERIES_IDENTIFIER_COLUMNS]
    resampler = Resampler(
        interpolation_method=resampling_params["numericalInterpolateMethod"],
        extrapolation_method=resampling_params["numericalExtrapolateMethod"],
        interpolation_constant_value=resampling_params.get("numericalInterpolateConstantValue"),
        extrapolation_constant_value=resampling_params.get("numericalExtrapolateConstantValue"),
        category_imputation_method=resampling_params["categoricalImputeMethod"],
        category_constant_value=resampling_params.get("categoricalConstantValue"),
        start_date_mode=resampling_params.get("startDateMode"),
        custom_start_date=resampling_params.get("customStartDate"),
        end_date_mode=resampling_params.get("endDateMode"),
        custom_end_date=resampling_params.get("customEndDate"),
        time_step=number_of_time_units,
        time_unit=time_unit,
        time_unit_end_of_week=time_unit_end_of_week,
        duplicate_timestamps_handling_method=resampling_params["duplicateTimestampsHandlingMethod"],
        unit_alignment=unit_alignment,
        monthly_alignment=monthly_alignment,
    )

    # Only the number of columns is logged, not their names
    log_resampling_params = resampling_params.copy()
    log_resampling_params["numericalColumns"] = len(numerical_columns) if numerical_columns else 0
    log_resampling_params["categoricalColumns"] = len(categorical_columns) if categorical_columns else 0
    log_resampling_params["identifiersColumns"] = len(timeseries_identifier_columns) if timeseries_identifier_columns else 0
    log_resampling_params["numberOfTimeunits"] = number_of_time_units
    log_resampling_params["timeUnit"] = time_unit
    log_resampling_params["endOfWeekDay"] = time_unit_end_of_week
    log_resampling_params["unitAlignment"] = unit_alignment
    log_resampling_params["monthlyAlignment"] = monthly_alignment
    logger.info("Resampling with params: {}".format(log_resampling_params))

    resampled_df = resampler.transform(
        df,
        datetime_column=core_params[TIME_VARIABLE],
        timeseries_identifier_columns=timeseries_identifier_columns,
        numerical_columns=numerical_columns,
        categorical_columns=categorical_columns
    )

    # `numerical_columns` is treated as optional above (logging), so it must not break the lookup below
    numerical_column_names = numerical_columns or []
    columns_to_round = [
        column["name"]
        for column in schema["columns"]
        if column["type"] in ["tinyint", "smallint", "int", "bigint"]
        and column["name"] in numerical_column_names
    ]
    if columns_to_round:
        # int columns must be resampled into int values (note that they can also contain NaN values)
        resampled_df[columns_to_round] = resampled_df[columns_to_round].round()

    if compute_zero_target_ratio_diagnostic:
        check_zero_target_ratio(resampled_df, timeseries_identifier_columns, core_params[TARGET_VARIABLE])

    return resampled_df

def should_compute_auto_shifts(preprocessing_params, modeling_params):
    """Tell whether automatic shifts must be computed.

    :param preprocessing_params: dict of preprocessing params
    :param modeling_params: dict of modeling params; auto shifts are never computed unless its
        "isShiftWindowsCompatible" entry is truthy
    :return: True if at least one column has auto-shift enabled, False otherwise
    :raises ValueError: if some columns have auto-shift enabled but no 'auto_shifts_params' are set
    """
    if not modeling_params.get("isShiftWindowsCompatible", False):
        return False
    auto_shift_columns = get_auto_shift_columns(preprocessing_params)
    has_auto_shift_features = len(auto_shift_columns["past_only"]) > 0 or len(auto_shift_columns["known_in_advance"]) > 0
    if has_auto_shift_features and not preprocessing_params.get("feature_generation", {}).get("auto_shifts_params", False):
        raise ValueError("Invalid configuration, some feature generations have auto-shift enabled, but 'auto_shifts_params' were not provided. Under the 'Feature generation' tab, disable auto shifts or set auto shifts params, then try again.")
    return has_auto_shift_features

def get_auto_shifts_past_only_range(preprocessing_params):
    """Return the {"min", "max"} horizon shift range configured for past-only auto shifts.

    Both bounds default to -1 when they are not set in the 'auto_shifts_params'.
    """
    feature_generation = preprocessing_params.get("feature_generation", {})
    params = feature_generation.get("auto_shifts_params", {})
    lower_bound = params.get("min_horizon_shift_past_only", -1)
    upper_bound = params.get("max_horizon_shift_past_only", -1)
    return {"min": lower_bound, "max": upper_bound}

def get_auto_shifts_known_in_advance_range(preprocessing_params):
    """Return the {"min", "max"} horizon shift range configured for known-in-advance auto shifts.

    Both bounds default to 0 when they are not set in the 'auto_shifts_params'.
    """
    feature_generation = preprocessing_params.get("feature_generation", {})
    params = feature_generation.get("auto_shifts_params", {})
    lower_bound = params.get("min_horizon_shift_known_in_advance", 0)
    upper_bound = params.get("max_horizon_shift_known_in_advance", 0)
    return {"min": lower_bound, "max": upper_bound}

def is_shift_compatible(role):
    """Tell whether a feature with this role can be shifted (TARGET, INPUT or INPUT_PAST_ONLY)."""
    shiftable_roles = ("TARGET", "INPUT", "INPUT_PAST_ONLY")
    return role in shiftable_roles

def is_window_num_operation_compatible(feature):
    """Tell whether numerical window operations apply to this feature.

    The feature must be of type NUMERIC and, if it has a "numerical_handling" entry, it must be REGULAR.
    """
    if feature["type"] != "NUMERIC":
        return False
    if "numerical_handling" not in feature:
        return True
    return feature["numerical_handling"] == "REGULAR"

def is_window_cat_operation_compatible(feature):
    """Tell whether categorical window operations apply to this feature.

    The feature must be of type CATEGORY and, if it has a "category_handling" entry, it must be DUMMIFY.
    """
    if feature["type"] != "CATEGORY":
        return False
    if "category_handling" not in feature:
        return True
    return feature["category_handling"] == "DUMMIFY"

def is_window_compatible(feature, operation):
    """Tell whether the window `operation` can be computed on `feature`.

    The feature role must be shift compatible, then:
    - MEAN, MEDIAN, STD, MIN and MAX require a feature compatible with numerical window operations
    - FREQUENCY requires a feature compatible with categorical window operations
    """
    if not is_shift_compatible(feature["role"]):
        return False
    if operation in ["MEAN", "MEDIAN", "STD", "MIN", "MAX"]:
        if is_window_num_operation_compatible(feature):
            return True
    return operation == "FREQUENCY" and is_window_cat_operation_compatible(feature)

def is_past_only(role):
    """Tell whether a feature with this role is past-only (TARGET or INPUT_PAST_ONLY)."""
    past_only_roles = ("TARGET", "INPUT_PAST_ONLY")
    return role in past_only_roles


class ShiftMap(object):
    """Read-only view over the shifts configured per column and the generated features mappings."""

    def __init__(self, input_shift_map, generated_features_mappings):
        """
        :param input_shift_map: dict with:
            - keys: str columns from external variables or target variable or rolling window aggregates
            - values: dict with:
                - key "past_covariate" / value True/False
                - key "from_forecast" / value list of integers
                - key "from_horizon" / value list of integers
        :param generated_features_mappings: dict with:
            - keys: time series id
            - values: GeneratedFeaturesMapping
        """
        self.input_shift_map = input_shift_map
        self.generated_features_mappings = generated_features_mappings

    def get_forecast_shifts(self, column):
        """Return the "from_forecast" shifts of `column`."""
        column_shifts = self.input_shift_map[column]
        return column_shifts["from_forecast"]

    def get_horizon_shifts(self, column):
        """Return the "from_horizon" shifts of `column`."""
        column_shifts = self.input_shift_map[column]
        return column_shifts["from_horizon"]

    def get_shifted_columns(self):
        """Return the list of columns that have an entry in the shift map."""
        return list(self.input_shift_map.keys())

    def get_preprocessed_columns(self, column, timeseries_identifier):
        """Return the features generated from `column` for the given time series.

        An empty list is returned when there are no mappings or none for this time series.
        """
        mappings = self.generated_features_mappings
        if mappings is not None and timeseries_identifier in mappings:
            return mappings[timeseries_identifier].get_features_from_origin_column(column)
        return []

    def is_past_only(self, column):
        """Return the "past_covariate" flag of `column`."""
        column_shifts = self.input_shift_map[column]
        return column_shifts["past_covariate"]

    def is_empty(self):
        """Return True when the shift map is None or when no column has any shift."""
        if self.input_shift_map is None:
            return True
        for shifts in self.input_shift_map.values():
            if len(shifts["from_forecast"]) > 0 or len(shifts["from_horizon"]) > 0:
                return False
        return True


def get_auto_shift_columns(preprocessing_params):
    """Split the columns whose horizon shifts are in AUTO mode by kind of role.

    :param preprocessing_params: dict of preprocessing params
    :return: dict with keys "past_only" (includes the role TARGET) and "known_in_advance",
        each holding a list of column names
    """
    past_only_columns = []
    known_in_advance_columns = []
    configured_shifts = preprocessing_params.get("feature_generation", {}).get("shifts", {})
    for column, shift_params in list(configured_shifts.items()):
        role = preprocessing_params["per_feature"][column]["role"]
        if not is_shift_compatible(role) or shift_params.get("from_horizon_mode", 'FIXED') != 'AUTO':
            continue
        destination = past_only_columns if is_past_only(role) else known_in_advance_columns
        destination.append(column)
    return {
        "past_only": past_only_columns,
        "known_in_advance": known_in_advance_columns,
    }


def get_shift_map(preprocessing_params, generated_features_mappings):
    """Build the ShiftMap from the configured shifts and rolling windows.

    The configured shifts are deep-copied, so `preprocessing_params` is left untouched.

    :param preprocessing_params: dict
    :param generated_features_mappings: dict
    :return: ShiftMap
    """
    shifts_by_column = copy.deepcopy(preprocessing_params["feature_generation"]["shifts"])
    for column in list(shifts_by_column.keys()):
        role = preprocessing_params["per_feature"][column]["role"]
        if not is_shift_compatible(role):
            # Shifts configured on columns that cannot be shifted are dropped
            shifts_by_column.pop(column)
            continue
        column_shifts = shifts_by_column[column]
        column_shifts["past_covariate"] = is_past_only(role)
        if column_shifts.get("from_horizon_mode", 'FIXED') == 'AUTO':
            # Replace provided values, with automatically selected values
            column_shifts["from_horizon"] = column_shifts["from_horizon_auto"]

    # Each enabled window operation is registered as a shifted column of its own
    for window in get_windows_list(preprocessing_params):
        operations_map = window["operations_map"]
        shift = window["shift"]
        shifts_key = "from_forecast" if window["is_from_forecast"] else "from_horizon"
        for feature, operations in operations_map.items():
            role = preprocessing_params["per_feature"][feature]["role"]
            for operation in operations:
                preprocessed_feature_name = "rolling_window:{}:{}:{}".format(window["length"], operation, feature)
                window_shifts = shifts_by_column.setdefault(preprocessed_feature_name, {
                    "past_covariate": is_past_only(role),
                    "from_forecast": [],
                    "from_horizon": [],
                })
                window_shifts[shifts_key].append(shift)
    return ShiftMap(shifts_by_column, generated_features_mappings)


def get_windows_list(preprocessing_params):
    """Return a deep copy of the configured windows with simplified operations maps.

    In the returned windows, each feature of the "operations_map" is mapped to the list of the
    operation names that are both enabled and compatible with the feature.
    """
    windows = copy.deepcopy(preprocessing_params["feature_generation"]["windows"])
    for window in windows:
        per_feature_operations = window["operations_map"]
        for feature in list(per_feature_operations):
            kept_operations = []
            for op in per_feature_operations[feature]:
                if op["enabled"] and is_window_compatible(preprocessing_params["per_feature"][feature], op["operation"]):
                    kept_operations.append(op["operation"])
            per_feature_operations[feature] = kept_operations
    return windows


class TimeseriesPreprocessing:
    """Preprocess the external features of time series, with one pipeline per time series
    or a single pipeline for the full dataframe.

    Handlers and pipelines are stored by time series identifier. Collector data and resources
    can be written to / read from `data_folder_context`.
    """

    def __init__(self, data_folder_context, core_params, preprocessing_params, modeling_params, listener, windows_resources_folder_context=None):
        """
        :param data_folder_context: folder context used to read and write collector data, resources and the report
        :param core_params: dict of core params
        :param preprocessing_params: dict of preprocessing params
        :param modeling_params: dict of modeling params
        :param listener: progress listener, used through `push_step` and `save_status`
        :param windows_resources_folder_context: folder context of the rolling windows resources,
            defaults to `data_folder_context` when None
        """
        self.data_folder_context = data_folder_context
        if windows_resources_folder_context is not None:
            self.windows_resources_folder_context = windows_resources_folder_context
        else:
            self.windows_resources_folder_context = data_folder_context
        self.core_params = core_params
        self.preprocessing_params = preprocessing_params
        self.modeling_params = modeling_params
        self.listener = listener

        # All the dicts below are keyed by time series identifier, except `resources`
        # which is keyed by resource name
        self.preproc_handler_by_timeseries = {}
        self.pipeline_by_timeseries = {}
        self.resources = {}
        self.collector_data = {}
        self.external_features = {}

    def _has_features_to_preprocess(self):
        """Tell whether there are external features or windows, i.e. whether preprocessing is needed at all."""
        return has_external_features_or_windows(
            self.preprocessing_params, self.modeling_params.get("isShiftWindowsCompatible", False)
        )

    @staticmethod
    def _concat_timeseries_frames(frames):
        """Concatenate per time series dataframes with a fresh index (empty dataframe if there are none)."""
        frames = list(frames)
        if not frames:
            return pd.DataFrame()
        # A single concat avoids re-copying the accumulated rows once per time series
        return pd.concat(frames, ignore_index=True)

    def create_timeseries_preprocessing_handlers(self, df, on_full_df, use_saved_resources=False):
        """Create the preprocessing handler and pipeline of each time series (or of the full dataframe).

        Nothing is created when there are no external features or windows.

        :param df: input dataframe
        :param on_full_df: True to build a single handler for the full dataframe,
            False to build one handler per time series
        :param use_saved_resources: True to reuse the collector data and resources previously loaded
            with `load_resources` instead of collecting them from `df`
        """
        if not self._has_features_to_preprocess():
            return

        with self.listener.push_step(step_constants.ProcessingStep.STEP_COLLECTING):
            if on_full_df:
                self._create_single_timeseries_preprocessing_handler(
                    FULL_TIMESERIES_DF_IDENTIFIER,
                    df,
                    timeseries_identifiers_to_output=True,
                    use_saved_resources=use_saved_resources,
                )
            else:
                for timeseries_identifier, df_of_timeseries_identifier in timeseries_iterator(
                    df, self.core_params[TIMESERIES_IDENTIFIER_COLUMNS]
                ):
                    self._create_single_timeseries_preprocessing_handler(
                        timeseries_identifier,
                        df_of_timeseries_identifier,
                        timeseries_identifiers_to_output=False,
                        use_saved_resources=use_saved_resources,
                    )

    def fit_and_process(self, df, step_name, on_full_df, save_data=False):
        """Create the handlers, then fit the pipelines on `df` and return the transformed dataframe.

        When there are no external features or windows, `df` is returned as is.

        :param df: input dataframe
        :param step_name: name of the step pushed to the listener while processing
        :param on_full_df: True to process the full dataframe at once, False to process each time series separately
        :param save_data: True to write the collector data, resources and preprocessing report
        :return: the transformed dataframe
        """
        if not self._has_features_to_preprocess():
            if save_data:
                self._save_data()
                self._report()
            return df

        self.create_timeseries_preprocessing_handlers(df, on_full_df, use_saved_resources=False)

        with self.listener.push_step(step_name):
            if on_full_df:
                transformed_df = self._fit_and_process_single_timeseries(FULL_TIMESERIES_DF_IDENTIFIER, df)
            else:
                transformed_df = self._concat_timeseries_frames([
                    self._fit_and_process_single_timeseries(timeseries_identifier, df_of_timeseries_identifier)
                    for timeseries_identifier, df_of_timeseries_identifier in timeseries_iterator(
                        df, self.core_params[TIMESERIES_IDENTIFIER_COLUMNS]
                    )
                ])

        if save_data:
            self._save_data()
            self._report()

        self.listener.save_status()
        return transformed_df

    def process(self, df, step_name, on_full_df):
        """Run the already created pipelines on `df` and return the transformed dataframe.

        When there are no external features or windows, `df` is returned as is.

        :param df: input dataframe
        :param step_name: name of the step pushed to the listener while processing
        :param on_full_df: True to process the full dataframe at once, False to process each time series separately
        :return: the transformed dataframe
        """
        if not self._has_features_to_preprocess():
            return df

        timeseries_identifier_columns = self.core_params[TIMESERIES_IDENTIFIER_COLUMNS]

        with self.listener.push_step(step_name):
            if on_full_df:
                transformed_df = self._process_single_timeseries(FULL_TIMESERIES_DF_IDENTIFIER, df)
            else:
                transformed_df = self._concat_timeseries_frames([
                    self._process_single_timeseries(timeseries_identifier, df_of_timeseries_identifier)
                    for timeseries_identifier, df_of_timeseries_identifier in timeseries_iterator(
                        df, timeseries_identifier_columns
                    )
                ])

        self.listener.save_status()
        return transformed_df

    def load_resources(self):
        """Load resource files listed in resource_types.json
        and store them and their file type by name in self.resources.
        Load collector data in self.collector_data.
        """
        resource_types = read_resource(self.data_folder_context, "resource_types", "json")
        for resource_name, resource_type in resource_types.items():
            resource = read_resource(self.data_folder_context, resource_name, resource_type)
            # Falsy resources are not registered
            if resource:
                self.resources[resource_name] = (resource, resource_type)

        self.collector_data = read_resource(self.data_folder_context, "collector_data", "json")

    def _create_single_timeseries_preprocessing_handler(
        self,
        timeseries_identifier,
        df_of_timeseries_identifier,
        timeseries_identifiers_to_output=False,
        use_saved_resources=False,
    ):
        """Create and register the handler and the pipeline of one time series.

        :param timeseries_identifier: identifier of the time series, used as key to register the handler and pipeline
        :param df_of_timeseries_identifier: dataframe of this time series, used to collect data when
            `use_saved_resources` is False
        :param timeseries_identifiers_to_output: forwarded to `build_preprocessing_pipeline` as
            `with_timeseries_identifiers`
        :param use_saved_resources: True to use the previously loaded collector data and resources
        """
        if use_saved_resources:
            collector_data = self.collector_data[timeseries_identifier]
        else:
            collector_data = PredictionPreprocessingDataCollector(
                df_of_timeseries_identifier, self.preprocessing_params
            ).build()

        # We separate:
        # - `windows_resources_folder_context` which always refers to the `preprocessing_folder_context`, from
        # - `data_folder_context` which can either refer to:
        #   - `preprocessing_folder_context` when writing (e.g. preprocessing on the full df) or reading (e.g. preprocessing before scoring) preprocessing steps resources
        #   - `model_folder_context` when preprocessing steps resources should only be computed and neither written nor read (e.g. preprocessing before HP search)
        preproc_handler = SingleTimeseriesPreprocessingHandler(
            self.core_params, self.preprocessing_params, self.modeling_params, self.data_folder_context, self.windows_resources_folder_context, collector_data, timeseries_identifier
        )

        if use_saved_resources:
            preproc_handler.set_resources(self.resources, timeseries_identifier)

        pipeline = preproc_handler.build_preprocessing_pipeline(
            with_timeseries_identifiers=timeseries_identifiers_to_output
        )

        self.preproc_handler_by_timeseries[timeseries_identifier] = preproc_handler
        self.pipeline_by_timeseries[timeseries_identifier] = pipeline

    def _fit_and_process_single_timeseries(self, timeseries_identifier, df_of_timeseries_identifier):
        """Fit and run the pipeline of one time series, record its output columns and return the result as a dataframe."""
        transformed_multiframe = self.pipeline_by_timeseries[timeseries_identifier].fit_and_process(
            df_of_timeseries_identifier
        )

        # "TRAIN" because of the last preprocessing step in PredictionPreprocessingHandler: EmitCurrentMFAsResult("TRAIN")
        self.external_features[timeseries_identifier] = transformed_multiframe["TRAIN"].columns()

        return self._multiframe_to_df(transformed_multiframe, timeseries_identifier)

    def _process_single_timeseries(self, timeseries_identifier, df_of_timeseries_identifier):
        """Run the pipeline of one time series and return the result as a dataframe.

        The output columns are recorded on first use and checked against the recorded ones afterwards.
        """
        transformed_multiframe = self.pipeline_by_timeseries[timeseries_identifier].process(
            df_of_timeseries_identifier
        )

        # "TRAIN" because of the last preprocessing step in PredictionPreprocessingHandler: EmitCurrentMFAsResult("TRAIN")
        if timeseries_identifier in self.external_features:
            assert (
                self.external_features[timeseries_identifier] == transformed_multiframe["TRAIN"].columns()
            ), "External features columns mismatch"
        else:
            self.external_features[timeseries_identifier] = transformed_multiframe["TRAIN"].columns()

        return self._multiframe_to_df(transformed_multiframe, timeseries_identifier)

    def _multiframe_to_df(self, multiframe, timeseries_identifier):
        """Convert a transformed multiframe into a dataframe holding the preprocessed features,
        the target, the time and the time series identifiers.
        """
        processed_df = multiframe["TRAIN"].as_dataframe()

        # TODO @timeseries make "target" and "time" constants (needs to edit multiple steps and preprocessing handler)
        processed_df[self.core_params[TARGET_VARIABLE]] = multiframe["target"].reset_index(drop=True)
        processed_df[self.core_params[TIME_VARIABLE]] = multiframe["time"].reset_index(drop=True)

        if timeseries_identifier == FULL_TIMESERIES_DF_IDENTIFIER:
            if self.core_params[TIMESERIES_IDENTIFIER_COLUMNS]:
                # In this case we have used OutputRawColumns to move time series identifiers away in input dataframe, directly to output
                processed_df[self.core_params[TIMESERIES_IDENTIFIER_COLUMNS]] = multiframe[
                    TIMESERIES_IDENTIFIER_COLUMNS
                ].reset_index(drop=True)
        else:
            add_timeseries_identifiers_columns(processed_df, timeseries_identifier)

        return processed_df

    def _report(self):
        """Write the fit report of each pipeline, by time series identifier, as the "preprocessing_report" resource."""
        report_by_timeseries = {}
        for timeseries_identifier, pipeline in self.pipeline_by_timeseries.items():
            report = {}
            if hasattr(self, "core_params"):
                pipeline.report_fit(report, self.core_params)
            else:
                pipeline.report_fit(report, {})
            report_by_timeseries[timeseries_identifier] = report
        write_resource(self.data_folder_context, "preprocessing_report", "json", report_by_timeseries)

    def _save_data(self):
        """Write the collector data and the resources of all handlers, along with the resource types.

        Each resource is written once under its name, as a dict keyed by time series identifier.
        """
        collector_data_by_timeseries = {}
        resources = {}
        resource_types = {}
        for timeseries_identifier, preproc_handler in self.preproc_handler_by_timeseries.items():
            collector_data_by_timeseries[timeseries_identifier] = preproc_handler.get_collector_data()
            for resource_name, resource, resource_type in preproc_handler.list_resources():
                # The type of a resource is the one of its first occurrence
                if resource_name not in resource_types:
                    resource_types[resource_name] = resource_type
                if resource_name not in resources:
                    resources[resource_name] = {}
                resources[resource_name][timeseries_identifier] = resource

        write_resource(self.data_folder_context, "collector_data", "json", collector_data_by_timeseries)

        for resource_name, resource_by_timeseries in resources.items():
            write_resource(self.data_folder_context, resource_name, resource_types[resource_name], resource_by_timeseries)

        write_resource(self.data_folder_context, "resource_types", "json", resource_types)


class SingleTimeseriesPreprocessingHandler(PredictionPreprocessingHandler):
    """Prediction preprocessing handler specialised for a single time series (or the full dataframe)."""

    def __init__(self, core_params, preprocessing_params, modeling_params, data_folder_context, windows_resources_folder_context, collector_data=None, timeseries_identifier=None):
        """
        :param core_params: dict of core params, forwarded to the parent handler
        :param preprocessing_params: dict of preprocessing params, forwarded to the parent handler
        :param modeling_params: dict of modeling params
        :param data_folder_context: folder context forwarded to the parent handler
        :param windows_resources_folder_context: folder context the rolling windows resources are read from
        :param collector_data: collector data of this time series
        :param timeseries_identifier: identifier of this time series
        """
        super(SingleTimeseriesPreprocessingHandler, self).__init__(core_params, preprocessing_params, data_folder_context)
        self.modeling_params = modeling_params
        self.collector_data = collector_data
        self.timeseries_identifier = timeseries_identifier
        self.windows_resources_folder_context = windows_resources_folder_context

    @property
    def time_variable(self):
        """Name of the time column, read from the core params (None when absent)."""
        return self.core_params.get(TIME_VARIABLE)

    @property
    def target_map(self, with_target=False):
        """Always None for this handler."""
        return None

    def get_windows_resources_map(self):
        """
        Return the rolling windows resources of this time series, or None when the resources file does not exist.

        Note: the windows resources are computed before the preprocessing, only reading them is required.
        """
        folder_context = self.windows_resources_folder_context
        if not folder_context.isfile(ROLLING_WINDOWS_RESOURCES):
            return None
        resources_by_timeseries = folder_context.read_json(ROLLING_WINDOWS_RESOURCES)
        return resources_by_timeseries[self.timeseries_identifier]

    def get_collector_data(self):
        """Return the collector data this handler was created with."""
        return self.collector_data

    def preprocessing_steps(self, with_timeseries_identifiers=False):
        """Yield the time series specific steps, then the steps of the parent handler (with target).

        :param with_timeseries_identifiers: when truthy and identifier columns are configured,
            a step moving them to the output is added
        """
        # Move time column away
        yield RemapValueToOutput(self.time_variable, "time", None)

        # Move time series identifiers away
        if with_timeseries_identifiers:
            identifier_columns = self.core_params[TIMESERIES_IDENTIFIER_COLUMNS]
            if identifier_columns:
                yield OutputRawColumns(self.core_params[TIMESERIES_IDENTIFIER_COLUMNS], TIMESERIES_IDENTIFIER_COLUMNS)

        supports_windows = self.modeling_params.get("isShiftWindowsCompatible", False)
        if supports_windows:
            yield FetchRollingWindows(get_windows_list(self.preprocessing_params), self.get_windows_resources_map())

        parent_steps = super(SingleTimeseriesPreprocessingHandler, self).preprocessing_steps(with_target=True)
        for parent_step in parent_steps:
            yield parent_step
        # TODO @timeseries check if RealignTarget can be used ? if it's the case, needs to add a RealignTime step and RealignTimeseriesIdentifiers step

    def set_resources(self, resources, timeseries_identifier):
        """Set the resources of this single time series handler from the complete resources
        (resources of all time series) that were loaded previously.

        A resource with no entry for this time series is set to an empty dict.

        :param resources: dict of resource name -> (resource by time series identifier, resource type)
        :param timeseries_identifier: identifier of the time series to pick the resources of
        """
        for resource_name, (resource_by_timeseries, resource_type) in resources.items():
            if timeseries_identifier in resource_by_timeseries:
                own_resource = resource_by_timeseries[timeseries_identifier]
            else:
                own_resource = {}
            self.set_resource(resource_name, own_resource, resource_type)


def add_rolling_windows_for_training(full_df, core_params, windows_list, preprocessing_params, folder_context):
    """
    Add the rolling windows columns in training mode (`add_rolling_windows` with `is_training` set to True).

    :param folder_context: Required to write the rolling windows resources (e.g. the mapping column -> values for categorical-like features)
    """
    is_training = True
    return add_rolling_windows(full_df, core_params, windows_list, preprocessing_params, folder_context, is_training)

def add_rolling_windows_for_scoring(full_df, core_params, windows_list, preprocessing_params, folder_context):
    """
    Add the rolling windows columns in scoring mode (`add_rolling_windows` with `is_training` set to False).

    :param folder_context: Required to read the rolling windows resources (e.g. the mapping column -> values for categorical-like features)
    """
    is_training = False
    return add_rolling_windows(full_df, core_params, windows_list, preprocessing_params, folder_context, is_training)


def add_rolling_windows(full_df, core_params, windows_list, preprocessing_params, folder_context, is_training):
    """Add the rolling windows columns to the dataframe of each time series.

    Generated columns are named:
    - "rolling_window:{length}:{operation}:{column}" for numerical features
    - "rolling_window:{length}:FREQUENCY:{column}:{category}" for dummified categorical features,
      plus one "__DKU_OTHER__" column holding the remaining frequency

    At training time, the kept categories and (when rescaling is enabled) the average and scale of
    each numerical window are written to ROLLING_WINDOWS_RESOURCES in `folder_context`. At scoring
    time they are read back from it.

    :param full_df: dataframe holding all the time series
    :param core_params: dict of core params, the time series identifier columns are read from it
    :param windows_list: list of windows, each with a "length" and an "operations_map"
        (column name -> list of operation names)
    :param preprocessing_params: dict of preprocessing params
    :param folder_context: folder context used to write (training) or read (scoring) the windows resources
    :param is_training: True when training, False when scoring
    :return: `full_df` itself when there are no windows, else the concatenation of the dataframes
        of all time series with their windows columns
    :raises ValueError: if a column of an operations map is missing from `full_df`
    """
    if len(windows_list) == 0:
        return full_df
    to_concat = []
    rescale_numericals = preprocessing_params.get("feature_generation", {}).get("windows_rescale_numericals", False)
    max_categories = preprocessing_params.get("feature_generation", {}).get("windows_max_categories", 10)
    windows_res_map = {}
    if not is_training and folder_context.isfile(ROLLING_WINDOWS_RESOURCES):
        windows_res_map = folder_context.read_json(ROLLING_WINDOWS_RESOURCES)
    for timeseries_identifier, df_of_timeseries_identifier in timeseries_iterator(
            full_df, core_params[TIMESERIES_IDENTIFIER_COLUMNS]
    ):
        if is_training:
            windows_res_map[timeseries_identifier] = {}
        for window in windows_list:
            length = window["length"]
            operations_map = window["operations_map"]
            for column_name in operations_map.keys():
                if column_name not in full_df.columns:
                    raise ValueError("The input does not include the required column '{}'".format(column_name))
                if len(operations_map[column_name]) == 0:
                    continue
                single_col_series = df_of_timeseries_identifier[column_name].copy()
                feature_params = preprocessing_params["per_feature"][column_name]
                feature_type = feature_params["type"]
                # A missing category handling is treated as DUMMIFY, like `is_window_cat_operation_compatible`
                # does when the windows operations are filtered
                if feature_type == "CATEGORY" and feature_params.get(CATEGORY_HANDLING, DUMMIFY) == DUMMIFY:
                    if is_training:
                        # Keep the `max_categories` most frequent categories of this time series
                        all_categories, counts = np.unique(single_col_series.values, return_counts=True)
                        categories = [category for category, _ in sorted(zip(all_categories, counts), key=lambda x: x[1], reverse=True)[:max_categories]]
                        windows_res_map[timeseries_identifier][column_name] = {}
                        windows_res_map[timeseries_identifier][column_name]["categories"] = categories
                    else:
                        # scoring (recipe or API)
                        categories = windows_res_map[timeseries_identifier][column_name]["categories"]
                    # Starts at 1 and gets the frequency of each kept category subtracted
                    other_cat_freq = np.ones_like(single_col_series.values, dtype=float)
                    for category in categories:
                        # operation = "FREQUENCY"
                        result = (single_col_series == category).rolling(length).mean()
                        other_cat_freq -= result
                        window_name = "rolling_window:{}:{}:{}:{}".format(length, "FREQUENCY", column_name, category)
                        df_of_timeseries_identifier[window_name] = result
                    other_cat_window_name = "rolling_window:{}:{}:{}:{}".format(length, "FREQUENCY", column_name, "__DKU_OTHER__")
                    df_of_timeseries_identifier[other_cat_window_name] = other_cat_freq
                elif feature_type == "NUMERIC":
                    w = single_col_series.rolling(length)
                    for operation in operations_map[column_name]:
                        window_name = "rolling_window:{}:{}:{}".format(length, operation, column_name)
                        # A window column that already exists is not recomputed
                        if window_name not in df_of_timeseries_identifier.columns:
                            if operation == "MEAN":
                                result = w.mean()
                            elif operation == "MEDIAN":
                                result = w.median()
                            elif operation == "STD":
                                result = w.std()
                            elif operation == "MIN":
                                result = w.min()
                            elif operation == "MAX":
                                result = w.max()
                            else:
                                # Other operations are ignored for numerical features
                                continue
                            if rescale_numericals:
                                if is_training:
                                    windows_res_map[timeseries_identifier][window_name] = {}
                                    average = result.mean()
                                    scale = result.std()
                                    # Numerical noise occurs in the windows computation so a `result` Series that should be
                                    # identically of the same value can end up having a small non-zero variance.
                                    # Considering this, we aggressively consider small `scale` values (relative to `average`) as 0.
                                    if (average != 0 and abs(scale / average) < 1e-8) or scale == 0.:
                                        # `scale` is 0: we cannot perform rescaling, so we pick 1 (no rescaling)
                                        scale = 1.
                                    windows_res_map[timeseries_identifier][window_name]["average"] = float(average)
                                    windows_res_map[timeseries_identifier][window_name]["scale"] = float(scale)
                                else:
                                    # Without saved statistics, fall back to no rescaling (average 0, scale 1)
                                    average = windows_res_map.get(timeseries_identifier, {}).get(window_name, {}).get("average", 0.)
                                    scale = windows_res_map.get(timeseries_identifier, {}).get(window_name, {}).get("scale", 1.)
                                result = (result - average) / scale
                            df_of_timeseries_identifier[window_name] = result
                else:
                    logger.warning("Unsupported input feature type: " + feature_type)
        to_concat.append(df_of_timeseries_identifier)
    if is_training:
        folder_context.write_json(ROLLING_WINDOWS_RESOURCES, windows_res_map)
    return pd.concat(to_concat)


class SingleRow(object):
    """
    Accumulator for the features of a single row.

    `feature_names` and `feature_values` are parallel lists: the value stored at a given position
    in `feature_values` belongs to the feature named at the same position in `feature_names`.
    """

    def __init__(self):
        self.feature_names = []
        self.feature_values = []

    def to_dataframe(self):
        """
        :return: pd.DataFrame built from the accumulated values, with `feature_names` as columns
                 (a single row when the accumulated values are scalars)
        """
        # Stack the values as a column vector first, then transpose it so that each value lands in its own column
        column_vector = np.array([[value] for value in self.feature_values])
        row_matrix = column_vector.T
        return pd.DataFrame(row_matrix, columns=self.feature_names)


def get_rolling_windows_features_from_cat_windows_column_prefix(external_features_columns, column):
    """
    Select the preprocessed columns whose name starts with `column` followed by a colon.

    :param external_features_columns: iterable of str, names of the preprocessed columns
    :param column: str, name of the input column used as prefix
    :return: list of the matching names, in their original order
    """
    matching_columns = []
    for candidate in external_features_columns:
        if not candidate.startswith(column + ":"):
            continue
        matching_columns.append(candidate)
    return matching_columns

class MultiHorizonShiftExpanderMixin:
    """
    Mixin turning the target and the preprocessed external features into shifted features, one forecasting
    horizon at a time, both for training (full matrix) and for prediction (single row).

    The host class is expected to set:
      - `shift_map`: object exposing `get_shifted_columns`, `get_forecast_shifts`, `get_horizon_shifts`,
        `is_past_only` and `get_preprocessed_columns`
      - `target_variable`: name of the target column
      - `supports_nan`: whether rows containing missing values can be kept in the training matrix

    Generated features are named `forecast_shift:<shift>:<column>` and `horizon_shift:<shift>:<column>`.
    """
    shift_map = None
    target_variable = None
    supports_nan = False

    def _update_series_list(self, column, preprocessed_column, preprocessed_series, current_horizon, series_list):
        """
        Append to `series_list` the shifted versions of `preprocessed_series` requested by the shift map for `column`.

        :param column: str, name of the input column, used to look up the shifts in the shift map
        :param preprocessed_column: str, name of the preprocessed column, used to name the generated series
        :param preprocessed_series: pd.Series holding the values of `preprocessed_column`
        :param current_horizon: int, forecasting horizon the features are generated for
        :param series_list: list of pd.Series, modified in place
        """
        # Names already present, collected once per call rather than once per candidate shift
        existing_names = set(series.name for series in series_list)

        def append_shifted(series_name, forecast_shift):
            if series_name in existing_names:
                return
            out_series = preprocessed_series.shift(-forecast_shift)
            out_series.name = series_name
            series_list.append(out_series)
            existing_names.add(series_name)

        forecast_shifts = self.shift_map.get_forecast_shifts(column)
        for forecast_shift in forecast_shifts:
            append_shifted("forecast_shift:{}:{}".format(forecast_shift, preprocessed_column), forecast_shift)

        for horizon_shift in self.shift_map.get_horizon_shifts(column):
            forecast_shift = current_horizon + horizon_shift
            if forecast_shift in forecast_shifts:
                # Prevent duplicate data
                continue
            if self.shift_map.is_past_only(column) and forecast_shift > 0:
                logger.info("Ignoring shift from horizon {} looking into the future at horizon {} for past-only feature {}".format(horizon_shift, current_horizon, column))
            else:
                append_shifted("horizon_shift:{}:{}".format(horizon_shift, preprocessed_column), forecast_shift)

    def expand_shifts_for_training(self, df_external_features_values, target_values, current_horizon, timeseries_identifier):
        """
        Note that underlying time value is sorted in increasing order
        :param df_external_features_values: pd.DataFrame
        :param target_values: pd.Series
        :param current_horizon: int in the [1, ..., prediction_length] interval, prediction_length being the full prediction horizon
        :param timeseries_identifier: str
        :return: X_df, y: pd.DataFrame, 1d np.array
        :raises ValueError: if `current_horizon` is not strictly positive, or if no shifted feature could be generated
        """
        if current_horizon <= 0:
            raise ValueError("Expecting all forecasting horizons > 0, but value was: {}".format(current_horizon))
        series_list = []
        for column in self.shift_map.get_shifted_columns():
            if column == self.target_variable:
                self._update_series_list(column, column, target_values, current_horizon, series_list)
            elif df_external_features_values is not None and column in df_external_features_values.columns:
                # `column` (the input column) matches a column in the preprocessed dataframe `external_features_values` (e.g. numerical column with standard handling)
                self._update_series_list(column, column, df_external_features_values[column], current_horizon, series_list)
            else:
                # `column` (the input column) does not match the output columns in the preprocessed dataframe `external_features_values`,
                # because it has been preprocessed (e.g. dummies of categorical variables), so we need to find all the output columns
                # that are computed from `column` in the preprocessing, and apply `column` shift_map settings to them
                preprocessed_columns = self.shift_map.get_preprocessed_columns(column, timeseries_identifier)
                if len(preprocessed_columns) == 0:
                    # rolling windows for categorical variables
                    preprocessed_columns = get_rolling_windows_features_from_cat_windows_column_prefix(df_external_features_values.columns, column)
                for preprocessed_column in preprocessed_columns:
                    preprocessed_series = df_external_features_values[preprocessed_column]
                    self._update_series_list(column, preprocessed_column, preprocessed_series, current_horizon, series_list)

        y = target_values.iloc[current_horizon:].values  # [y_horizon, y_horizon+1, ..., y_t+horizon] with t+horizon the last available date with value for target
        if len(series_list) == 0:
            raise ValueError("Feature generation didn't manage to find any viable shift/aggregation window.")
        X_df = pd.concat(series_list, axis=1).iloc[:-current_horizon]  # [X_0, X_1, ..., X_t]

        if not self.supports_nan:
            # Rows of `X_df` and entries of `y` are matched by position, so incomplete rows are filtered with a
            # positional mask: this stays correct whatever the index labels of the inputs are
            complete_rows = X_df.notna().all(axis=1).values
            X_df = X_df[complete_rows].reset_index(drop=True)
            y = y[complete_rows]
        return X_df, y

    def _update_row(self, column, preprocessed_column, current_horizon, past_series, future_series, row):
        """
        Append to `row` the shifted values of a preprocessed column requested by the shift map for `column`.

        :param column: str, name of the input column, used to look up the shifts in the shift map
        :param preprocessed_column: str, name of the preprocessed column, used to name the generated features
        :param current_horizon: int, forecasting horizon the features are generated for
        :param past_series: pd.Series of past values, read backwards from its last element
        :param future_series: pd.Series of future values (may be None), read forwards from its first element
        :param row: object with `feature_names` and `feature_values` lists, modified in place
        """
        forecast_shifts = self.shift_map.get_forecast_shifts(column)
        for forecast_shift in forecast_shifts:
            feature_name = "forecast_shift:{}:{}".format(forecast_shift, preprocessed_column)
            if feature_name not in row.feature_names:
                row.feature_names.append(feature_name)
                row.feature_values.append(past_series.values[forecast_shift - 1])
        for horizon_shift in self.shift_map.get_horizon_shifts(column):
            forecast_shift = current_horizon + horizon_shift
            if forecast_shift in forecast_shifts:
                # Prevent duplicate data
                continue
            if forecast_shift > 0 and self.shift_map.is_past_only(column):
                logger.info("Ignoring shift from horizon {} looking into the future at horizon {} for past-only feature {}".format(horizon_shift, current_horizon, column))
                continue
            feature_name = "horizon_shift:{}:{}".format(horizon_shift, preprocessed_column)
            if feature_name in row.feature_names:
                continue
            row.feature_names.append(feature_name)
            if forecast_shift <= 0:
                # current_horizon + horizon_shift is in the past
                row.feature_values.append(past_series.values[forecast_shift - 1])
            else:
                # current_horizon + horizon_shift is in the future
                row.feature_values.append(future_series.values[forecast_shift - 1])

    def expand_shifts_for_prediction(self, past_target_series, past_df, future_df, current_horizon, timeseries_identifier):
        """
        :param past_target_series: pd.Series
        :param past_df: pd.DataFrame
        :param future_df: pd.DataFrame
        :param current_horizon: int in the [1, ..., prediction_length] interval, prediction_length being the full prediction horizon
        :param timeseries_identifier: str
        :return: pd.DataFrame with 1 row and columns consistent with first output of expand_shifts_for_training
        :raises ValueError: if `current_horizon` is not strictly positive
        """
        if current_horizon <= 0:
            raise ValueError("Expecting all forecast horizons > 0, but value was: {}".format(current_horizon))
        row = SingleRow()
        for column in self.shift_map.get_shifted_columns():
            if column == self.target_variable:
                # No future values are passed for the target
                self._update_row(column, column, current_horizon, past_target_series, None, row)
                continue
            if column in past_df.columns:
                # `column` is directly available in the preprocessed dataframe
                preprocessed_columns = [column]
            else:
                # `column` has been turned into other columns by the preprocessing
                preprocessed_columns = self.shift_map.get_preprocessed_columns(column, timeseries_identifier)
                if len(preprocessed_columns) == 0:
                    # rolling windows for categorical variables
                    preprocessed_columns = get_rolling_windows_features_from_cat_windows_column_prefix(past_df.columns, column)
            for preprocessed_column in preprocessed_columns:
                past_series = past_df[preprocessed_column]
                future_series = future_df[preprocessed_column] if future_df is not None else None
                self._update_row(column, preprocessed_column, current_horizon, past_series, future_series, row)

        return row.to_dataframe()
