
# How to use: df should be the dataframe restricted to the categorical columns to impact-code,
# and target should be the pd.Series of target values.
# Use fit, transform, etc. (when kfold is enabled, fit raises and fit_transform must be used).
# Three types: binary (classification), multiple (classification), regression.
# For now m is a parameter <===== but what should we put here? Probably some function of the total shape.
# In other words: the smoothing weight count / (count + m) equals 0.5 when a category's count equals m,
# so which count do we want to map to a weight of 0.5?

import pandas as pd
import numpy as np

# Category label substituted for missing values before encoding.
NULL = '_NA_'
# Index label of the mapping row used for categories that were not seen at fit time.
default = '_default_'


class ImpactCoding(object):
    """Impact (target) encoder for categorical columns.

    Each category is replaced by a smoothed mean of the target:
    ``w * category_mean + (1 - w) * global_mean`` with
    ``w = count / (count + m)``.  Missing values are treated as their own
    category (``NULL``) and categories unseen at fit time receive the
    ``default`` row of the mapping (the global target mean).

    For classification (``'binary'`` / ``'multiple'``) one encoding column,
    labelled ``"target_value:<v>"``, is produced per distinct target value
    except the last one returned by ``unique()``.  For ``'regression'`` a
    single column labelled ``"all_target_values"`` is produced.  Labels are
    repeated when several features are encoded; columns come out in feature
    order.

    With ``kfold=True`` only :meth:`fit_transform` is available: training
    rows are encoded out-of-fold, and the stored mapping (used by
    :meth:`transform`) is the average of the per-fold mappings.

    Attributes set by fitting:
        feature_list: the columns of the training dataframe.
        impact: dict mapping each feature to its encoding DataFrame
            (index = category, columns = encoding columns).
    """

    def __init__(self,
                 impact_type='binary',
                 m=10,
                 rescaling=False,
                 scaler=None,
                 filepath=None,
                 kfold=True,
                 kfold_k=5,
                 kfold_seed=1337):
        self.type = impact_type
        self.m = m
        self.rescaling = rescaling
        self.scaler = scaler
        self.filepath = filepath
        self.kfold = kfold
        self.kfold_k = kfold_k
        self.kfold_seed = kfold_seed

    def _check_fitted(self):
        """Raise AttributeError if no mapping has been learnt yet."""
        if not hasattr(self, "impact"):
            raise AttributeError("Impact coding has not been trained yet.")

    def _check_columns(self, X):
        """Raise AttributeError if X lacks any column seen at fit time."""
        missing = set(self.feature_list).difference(X.columns)
        if missing:
            raise AttributeError("New Dataframe columns do not correspond to the ones on which preprocessing was fitted.")

    def _impact_missing(self, df):
        """Replace, in place, missing values of the fitted columns by NULL."""
        # Nulls are simply taken as another category
        df[self.feature_list] = df[self.feature_list].fillna(NULL)

    def _compute_impact(self, series, target_series):
        """Return the (optionally rescaled) mapping for one feature.

        The result is a single-column DataFrame indexed by category, with one
        extra ``default`` row holding the global target mean.
        """
        category_counts = series.value_counts()
        target_mean = target_series.mean()
        # Grouping the target directly avoids building a helper frame whose
        # column names could collide with the feature name.
        category_means = target_series.groupby(series).mean()

        # Smoothing weight: tends to 1 for frequent categories, 0 for rare ones.
        lambda_weights = category_counts.astype("float") / (category_counts + self.m)
        impact_coded_values = lambda_weights * category_means + (1 - lambda_weights) * target_mean

        # add default value
        mapping = pd.DataFrame(pd.concat([
            impact_coded_values,
            pd.Series([target_mean], index=[default]),
        ], axis=0))

        # return mapping (rescaled if needed)
        return self._rescale(series, mapping)

    def _simple_impact(self, series, target_series, t_value):
        """Compute the mapping on all rows and label its column.

        ``t_value`` is the target value being encoded (classification) or
        None (regression).
        """
        mapping = self._compute_impact(series, target_series)
        if t_value is not None:
            # Classification
            mapping.columns = ["target_value:" + str(t_value)]
        else:
            mapping.columns = ["all_target_values"]
        return mapping

    def _kfold_impact(self, series, target_series, t_value):
        """Out-of-fold impact coding for one feature.

        Returns ``(mean_mapping, impact_coded_df)``: the per-fold mappings
        averaged row-wise (used later by ``transform``) and the out-of-fold
        encoding of the training rows, sorted by index.

        Raises ValueError if the impact type is not a known one.
        """
        # [sc-162009] Importing sklearn at the top level breaks the
        # "export as dataset" button in Jupyter notebooks.
        from sklearn.model_selection import KFold
        from sklearn.model_selection import StratifiedKFold

        # Only use stratified kfold for classification tasks
        if self.type in {'binary', 'multiple'}:
            cv = StratifiedKFold(n_splits=self.kfold_k, shuffle=True, random_state=self.kfold_seed)
        elif self.type == 'regression':
            cv = KFold(n_splits=self.kfold_k, shuffle=True, random_state=self.kfold_seed)
        else:
            raise ValueError("Prediction type not matching allowed types")

        if t_value is not None:
            column = "target_value:" + str(t_value)
        else:
            column = "all_target_values"

        global_mean = target_series.mean()
        impact_coded = []
        mapping_computations = []

        for split, (infold, oofold) in enumerate(cv.split(series, target_series)):
            # Fit using infold data
            mapping = self._compute_impact(series.iloc[infold], target_series.iloc[infold])

            # Use mapping to transform oof data; values unseen in the fold
            # receive the fold's default row.
            impact_coded_oof = self._transform_series(series.iloc[oofold], mapping)
            impact_coded_oof.columns = [column]
            impact_coded.append(impact_coded_oof)

            mapping.columns = [column + "_fold_" + str(split)]
            mapping_computations.append(mapping)

        mapping_computation_df = pd.concat(mapping_computations, axis=1, sort=True)

        # Use the global mean to replace all values which did not appear in a given fold
        # NOTE(review): when rescaling is enabled the per-fold mappings are
        # rescaled but global_mean is not - confirm this is intended.
        mapping_computation_df = mapping_computation_df.fillna(global_mean)

        # Mean mapping will be used to transform validation / test data
        mean_mapping = pd.DataFrame(mapping_computation_df.mean(axis=1))
        mean_mapping.columns = [column]

        impact_coded_df = pd.concat(impact_coded, axis=0).sort_index()
        return mean_mapping, impact_coded_df

    def _impact_classification(self, df):
        """Fit the mappings for a classification target.

        Without kfold the (null-filled) input frame is returned and encoding
        is left to ``transform``; with kfold the out-of-fold encoded training
        frame is returned.
        """
        target_values = df['impact_target'].unique()
        # create one column per value but one
        target_values = target_values[:-1]
        # One-vs-rest indicators; cast so the means are computed on floats
        # whatever dtype get_dummies produces.
        target_df = pd.get_dummies(df['impact_target']).astype(float)

        if not self.kfold:
            for feature in self.feature_list:
                self.impact[feature] = pd.concat(
                    [self._simple_impact(df[feature], target_df[val], val) for val in target_values],
                    axis=1)
            return df  # For simple impact coding, transform is done separately

        # Always fit_transform for kfold
        impact_coded_df_list = []
        for feature in self.feature_list:
            mapping_list = []
            for val in target_values:
                mapping, impact_coded_df = self._kfold_impact(df[feature], target_df[val], val)
                mapping_list.append(mapping)
                impact_coded_df_list.append(impact_coded_df)
            self.impact[feature] = pd.concat(mapping_list, axis=1)
        return pd.concat(impact_coded_df_list, axis=1)

    def _impact_regression(self, df):
        """Fit the mappings for a regression target.

        Same return convention as ``_impact_classification``.
        """
        df['impact_target'] = df['impact_target'].astype('float')

        if not self.kfold:
            for feature in self.feature_list:
                self.impact[feature] = self._simple_impact(df[feature], df['impact_target'], None)
            return df  # For simple impact coding, transform is done separately

        # Always fit_transform for kfold
        impact_coded_df_list = []
        for feature in self.feature_list:
            self.impact[feature], impact_coded_df = self._kfold_impact(df[feature], df['impact_target'], None)
            impact_coded_df_list.append(impact_coded_df)
        return pd.concat(impact_coded_df_list, axis=1)

    def _rescale(self, series, encoding_map):
        """Rescale a mapping according to ``self.scaler``.

        The shift and scale are computed on the encoded values of ``series``
        (so each category weighs by its frequency): mean / standard deviation
        for ``"AVGSTD"``, min / range for ``"MINMAX"``.  A zero or undefined
        scale maps everything to 0.

        Raises ValueError for an unknown scaler name.
        """
        if not self.rescaling or self.scaler == "NONE":
            return encoding_map
        if self.scaler not in {"AVGSTD", "MINMAX"}:
            raise ValueError("Unknown rescaling method %s" % self.scaler)

        # Work on a plain Series so that shift and scale are scalars.
        encoded = self._transform_series(series, encoding_map).iloc[:, 0]
        if self.scaler == "AVGSTD":
            shift, scale = encoded.mean(), encoded.std()
        else:
            shift = encoded.min()
            scale = encoded.max() - shift
        if np.isnan(scale) or scale == 0:
            inv_scale = 0
        else:
            inv_scale = 1. / scale
        return (encoding_map - shift) * inv_scale

    def _fit_transform(self, X, target):
        """Learn the mappings from X / target.

        Returns the out-of-fold encoded frame when kfold is enabled, otherwise
        a null-filled copy of X with an extra ``'impact_target'`` column.
        Neither X nor target is modified.

        Raises AttributeError if the impact type is unknown.
        """
        # Validate before touching any fitted state.
        if self.type not in {'binary', 'multiple', 'regression'}:
            raise AttributeError("The impact_type value is unknown. Please use 'binary', 'multiple' or 'regression'. ")

        # recreate dataframe
        self.feature_list = X.columns
        df = X.copy()
        df['impact_target'] = target
        self.impact = {}

        # calculate the impact coding
        self._impact_missing(df)
        if self.type == 'regression':
            return self._impact_regression(df)
        return self._impact_classification(df)

    def fit(self, X, target):
        """Learn the mappings (non-kfold mode only) and return self.

        Raises NotImplementedError when kfold is enabled.
        """
        if self.kfold:
            raise NotImplementedError("For kfold impact coding use fit_transform")
        self._fit_transform(X, target)
        return self

    def transform(self, X):
        """Encode every column of X with the fitted mappings.

        Returns a new DataFrame carrying X's index and, for each column of X
        in order, the encoding column(s) of that feature.  X is not modified.

        Raises AttributeError if the encoder is not fitted or if X lacks a
        fitted column.
        """
        self._check_fitted()
        self._check_columns(X)
        df = X.copy()
        # fill na with null value
        self._impact_missing(df)

        encoded_frames = [
            self._transform_series(df[feature], self.impact[feature])
            for feature in df.columns
        ]
        if not encoded_frames:
            return df
        # Concatenating (rather than merging) keeps the mapping's own column
        # labels even when several features share them.
        return pd.concat(encoded_frames, axis=1)

    def _transform_series(self, series, mapping):
        """Look up each value of ``series`` in ``mapping``.

        ``mapping`` is a DataFrame (or Series) indexed by category.  Values
        absent from its index are looked up under ``default``.  Returns a
        DataFrame with the mapping's columns and the index of ``series``;
        ``series`` itself is left untouched.
        """
        if isinstance(mapping, pd.Series):
            mapping = mapping.to_frame()
        known = series.isin(mapping.index)
        # Object dtype so the default label can be stored whatever the input dtype.
        keys = series.astype(object).where(known, default)
        encoded = mapping.reindex(index=keys.to_numpy())
        encoded.index = series.index
        return encoded

    def fit_transform(self, X, target):
        """Fit on X / target and return the encoded training frame.

        With kfold the training rows are encoded out-of-fold; otherwise this
        is ``fit`` followed by ``transform``.
        """
        if not self.kfold:
            self.fit(X, target)
            return self.transform(X)
        return self._fit_transform(X, target)
