import logging
import os.path as osp
import sys
import gzip

from dataiku.base import dku_pickle
from dataiku.core import dkujson
from dataiku.core import get_resources_dir
from dataiku.doctor.deep_learning.preprocessing import DummyFileReader
from dataiku.doctor.prediction.feature_selection import FeatureSelectionStep
from dataiku.doctor.preprocessing import PCA
from dataiku.doctor.preprocessing.dataframe_preprocessing import AddReferenceInOutput
from dataiku.doctor.preprocessing.dataframe_preprocessing import AllInteractionFeaturesGenerator
from dataiku.doctor.preprocessing.dataframe_preprocessing import BinarizeSeries
from dataiku.doctor.preprocessing.dataframe_preprocessing import BlockStdRescalingProcessor
from dataiku.doctor.preprocessing.dataframe_preprocessing import CategoricalCategoricalInteraction
from dataiku.doctor.preprocessing.dataframe_preprocessing import CategoricalFeatureHashingProcessor
from dataiku.doctor.preprocessing.dataframe_preprocessing import CopyMultipleColumnsFromInput
from dataiku.doctor.preprocessing.dataframe_preprocessing import CustomPreprocessingStep
from dataiku.doctor.preprocessing.dataframe_preprocessing import DatetimeCyclicalEncodingStep
from dataiku.doctor.preprocessing.dataframe_preprocessing import DropRowsWhereNoTarget
from dataiku.doctor.preprocessing.dataframe_preprocessing import DropRowsWhereNoTargetOrNoPrediction
from dataiku.doctor.preprocessing.dataframe_preprocessing import DropRowsWhereNoTargetOrNoTreatment
from dataiku.doctor.preprocessing.dataframe_preprocessing import DropRowsWhereNoTargetOrNoTreatmentOrNoPrediction
from dataiku.doctor.preprocessing.dataframe_preprocessing import DropRowsWhereNoTargetOrNoWeight
from dataiku.doctor.preprocessing.dataframe_preprocessing import DropRowsWhereNoTargetOrNoWeightOrNoPrediction
from dataiku.doctor.preprocessing.dataframe_preprocessing import DumpPipelineState
from dataiku.doctor.preprocessing.dataframe_preprocessing import EmitCurrentMFAsResult
from dataiku.doctor.preprocessing.dataframe_preprocessing import ExtractMLAssertionMasks
from dataiku.doctor.preprocessing.dataframe_preprocessing import ExtractMLAssertionMasksNbInitialRows
from dataiku.doctor.preprocessing.dataframe_preprocessing import FastSparseDummifyProcessor
from dataiku.doctor.preprocessing.dataframe_preprocessing import FileFunctionPreprocessor
from dataiku.doctor.preprocessing.dataframe_preprocessing import FlagMissingValue2
from dataiku.doctor.preprocessing.dataframe_preprocessing import FlushDFBuilder
from dataiku.doctor.preprocessing.dataframe_preprocessing import FrequencyEncodingStep
from dataiku.doctor.preprocessing.dataframe_preprocessing import MultipleImputeMissingFromInput
from dataiku.doctor.preprocessing.dataframe_preprocessing import NumericalCategoricalInteraction
from dataiku.doctor.preprocessing.dataframe_preprocessing import NumericalDerivativesGenerator
from dataiku.doctor.preprocessing.dataframe_preprocessing import NumericalNumericalInteraction
from dataiku.doctor.preprocessing.dataframe_preprocessing import OrdinalEncodingStep
from dataiku.doctor.preprocessing.dataframe_preprocessing import OutlierDetection
from dataiku.doctor.preprocessing.dataframe_preprocessing import OutputRawColumns
from dataiku.doctor.preprocessing.dataframe_preprocessing import PCAStep
from dataiku.doctor.preprocessing.dataframe_preprocessing import PairwiseLinearCombinationsGenerator
from dataiku.doctor.preprocessing.dataframe_preprocessing import PreprocessingPipeline
from dataiku.doctor.preprocessing.dataframe_preprocessing import QuantileBinSeries
from dataiku.doctor.preprocessing.dataframe_preprocessing import RealignPrediction
from dataiku.doctor.preprocessing.dataframe_preprocessing import RealignTarget
from dataiku.doctor.preprocessing.dataframe_preprocessing import RealignTreatment
from dataiku.doctor.preprocessing.dataframe_preprocessing import RealignWeight
from dataiku.doctor.preprocessing.dataframe_preprocessing import RemapTreatmentToOutput
from dataiku.doctor.preprocessing.dataframe_preprocessing import RemapValueToOutput
from dataiku.doctor.preprocessing.dataframe_preprocessing import RescalingProcessor2
from dataiku.doctor.preprocessing.dataframe_preprocessing import SingleColumnDropNARows
from dataiku.doctor.preprocessing.dataframe_preprocessing import TargetEncodingStep
from dataiku.doctor.preprocessing.dataframe_preprocessing import TextCountVectorizerProcessor
from dataiku.doctor.preprocessing.dataframe_preprocessing import TextHashingVectorizerProcessor
from dataiku.doctor.preprocessing.dataframe_preprocessing import TextHashingVectorizerWithSVDProcessor
from dataiku.doctor.preprocessing.dataframe_preprocessing import TextTFIDFVectorizerProcessor
from dataiku.doctor.preprocessing.dataframe_preprocessing import UnfoldVectorProcessor
from dataiku.doctor.preprocessing.multimodal_preprocessings.image_embedding_extraction import ImageEmbeddingExtractor
from dataiku.doctor.preprocessing.multimodal_preprocessings.sentence_embedding_extraction import CodeEnvResourceSentenceEmbeddingExtractor
from dataiku.doctor.preprocessing.multimodal_preprocessings.sentence_embedding_extraction import LLMApiSentenceEmbeddingExtractor
from dataiku.doctor.utils import doctor_constants
from dataiku.doctor.utils.gpu_execution import get_gpu_config_from_core_params

logger = logging.getLogger(__name__)


def load_relfilepath(basepath, relative_filepath):
    """Load ``relative_filepath`` (resolved against ``basepath``) with dkujson.

    Returns None if the file does not exist.
    """
    full_path = osp.join(basepath, relative_filepath)
    if not osp.exists(full_path):
        return None
    return dkujson.load_from_filepath(full_path)


def extract_input_columns(preprocessing_params, with_target=False, with_profiling=True, with_sample_weight=False):
    """List the columns of ``preprocessing_params["per_feature"]`` whose role is selected.

    Columns with the "INPUT" role are always selected. The flags additionally
    select the "TARGET", "PROFILING" and "WEIGHT" roles. Columns are returned
    in the iteration order of the "per_feature" mapping.
    """
    optional_roles = (
        ("WEIGHT", with_sample_weight),
        ("PROFILING", with_profiling),
        ("TARGET", with_target),
    )
    accepted_roles = {"INPUT"} | {role for role, enabled in optional_roles if enabled}

    selected = []
    for name, params in preprocessing_params["per_feature"].items():
        if params["role"] in accepted_roles:
            selected.append(name)
    return selected


def _to_ascii(resource_name):
    """Return an ASCII-only version of ``resource_name``.

    Characters outside of ASCII are replaced by their XML character
    reference (e.g. ``&#233;``).
    """
    ascii_bytes = resource_name.encode("ascii", "xmlcharrefreplace")
    return ascii_bytes.decode("ascii")


def write_resource(folder_context, resource_name, resource_type, obj):
    """Store ``obj`` in ``folder_context`` under the name ``resource_name``.

    - "pkl": pickled with dku_pickle into a gzipped "<ascii name>.pkl.gz" file
    - "json": written with ``folder_context.write_json`` to "<name>.json"

    Raises ValueError for any other ``resource_type``.
    """
    if resource_type not in ("pkl", "json"):
        raise ValueError("Unknown type: {} for saving {}".format(resource_type, resource_name))

    if resource_type == "json":
        folder_context.write_json(u"{}.{}".format(resource_name, resource_type), obj)
        return

    # Pickle case.
    #  Due to a bug in gzip.py in python 2.7.6 and older versions, unicode chars are not allowed in gzip file names.
    #  bzip2 read/write is too slow for our needs, and lzma is only available natively on py 3.3+,
    #  so gzip is the best cross-version choice.
    gz_file_name = u"{}.{}.gz".format(_to_ascii(resource_name), resource_type)
    with folder_context.get_file_path_to_write(gz_file_name) as gz_path:
        with gzip.open(gz_path, "wb") as gz_file:
            dku_pickle.dump(obj, gz_file)


def read_resource(folder_context, resource_name, resource_type):
    """Read the resource ``resource_name`` of type "pkl" or "json" from ``folder_context``.

    Returns None when ``folder_context`` is None or when no matching file is found.
    For "pkl", the gzipped "<ascii name>.pkl.gz" file is tried first, then the
    plain "<name>.pkl" file. Raises ValueError for any other ``resource_type``.
    """
    if folder_context is None:
        return None

    plain_name = u"{}.{}".format(resource_name, resource_type)
    ascii_name = u"{}.{}".format(_to_ascii(resource_name), resource_type)

    if resource_type == "json":
        if not folder_context.isfile(plain_name):
            return None
        return folder_context.read_json(plain_name)

    if resource_type != "pkl":
        raise ValueError("Unknown type: {} for reading {}".format(resource_type, resource_name))

    gz_name = ascii_name + ".gz"
    if folder_context.isfile(gz_name):
        with folder_context.get_file_path_to_read(gz_name) as gz_path:
            with gzip.open(gz_path, "rb") as gz_file:
                return dku_pickle.load(gz_file)
    # Maintain compatibility for non-gzipped pkl files
    elif folder_context.isfile(plain_name):
        with folder_context.get_file_path_to_read(plain_name) as plain_path:
            with open(plain_path, 'rb') as plain_file:
                return dku_pickle.load(plain_file)
    return None

###
# This file contains the preprocessing handlers, i.e. the classes responsible
# for building and saving the preprocessing pipelines.
#
# core_params must be of ResolvedPredictionCoreParams type
###


class PreprocessingHandler(object):
    """Manager class for the preprocessing"""

    def __init__(self, core_params, preprocessing_params, data_folder_context, assertions=None, active_gpu_config=None, nan_support=None):
        """data_path is the path of the preprocessing set in the modelid"""
        self.core_params = core_params
        self.preprocessing_params = preprocessing_params
        self._data_folder_context = data_folder_context
        self.assertions = assertions
        self.active_gpu_config = active_gpu_config
        if nan_support is not None:
            self.nan_support = nan_support
        else:
            from dataiku.doctor.prediction.common import DefaultPredictionAlgorithmNaNSupport
            self.nan_support = DefaultPredictionAlgorithmNaNSupport()
        self.__resources = {}
        self.__resource_types = {}

    def exist_resource(self, resource_name, resource_type):
        if resource_name in self.__resources:
            return True
        resource = read_resource(self._data_folder_context, resource_name, resource_type)
        if resource is None:
            return False
        # Do not waste having read the resource, putting it in the dict
        self.__resources[resource_name] = resource
        self.__resource_types[resource_name] = resource_type
        return True

    def get_resource(self, resource_name, resource_type):
        """
        Resources are just dictionaries either:
          - pickled in a .pkl named after their resource name
          - dumped to a .json named after their resource name
        """
        if resource_name in self.__resources:
            return self.__resources[resource_name]
        self.__resource_types[resource_name] = resource_type
        resource = read_resource(self._data_folder_context, resource_name, resource_type)
        if resource is None:
            resource = {}
        self.__resources[resource_name] = resource
        return self.__resources[resource_name]

    def _save_resource(self, resource_name):
        assert resource_name in self.__resources
        resource = self.__resources[resource_name]
        resource_type = self.__resource_types[resource_name]
        if len(resource):
            # we only save non-empty resources
            write_resource(self._data_folder_context, resource_name, resource_type, resource)

    def input_columns(self, with_target=True, with_profiling=True):
        """List the input features of this handler's preprocessing params.

        Target and profiling columns are included according to the two flags
        (both enabled by default).

        Giving this list to get_dataframe can help limit RAM usage.
        """
        return extract_input_columns(
            self.preprocessing_params,
            with_target=with_target,
            with_profiling=with_profiling,
        )

    ###
    # Shorter accessors to some specific resources
    ###

    def get_pca_resource(self,):
        return self.get_resource('pca', 'pkl')

    @property
    def prediction_type(self,):
        """Prediction type stored in the core params (the key must be present)."""
        key = doctor_constants.PREDICTION_TYPE
        return self.core_params[key]

    def save_data(self,):
        """Write the collector data and every cached resource to the data folder.

        Nothing is written (a warning is logged) when the handler has no data folder.
        """
        folder = self._data_folder_context
        if folder is None:
            logger.warning("Not saving preprocessing data as no folder was defined")
            return

        write_resource(folder, "collector_data", "json", self.collector_data)
        for name in self.__resources:
            self._save_resource(name)

    def preprocessing_steps(self, verbose=True, **kwargs):
        raise NotImplementedError()

    def build_preprocessing_pipeline(self, *args, **kwargs):
        """Build a PreprocessingPipeline from ``preprocessing_steps(*args, **kwargs)``.

        The pipeline resources are initialized from this handler before it is returned.
        """
        steps = list(self.preprocessing_steps(*args, **kwargs))
        built_pipeline = PreprocessingPipeline(steps=steps,
                                               unrecorded_value=self.nan_support.unrecorded_value)
        built_pipeline.init_resources(self)
        return built_pipeline

    def list_resources(self):
        for resource_name, resource in self.__resources.items():
            resource_type = self.__resource_types[resource_name]
            yield resource_name, resource, resource_type

    def set_resource(self, resource_name, resource, resource_type):
        if resource_name not in self.__resources:
            self.__resources[resource_name] = resource
            self.__resource_types[resource_name] = resource_type

    @property
    def target_variable(self,):
        """Target variable read from the core params, or None when absent."""
        key = doctor_constants.TARGET_VARIABLE
        return self.core_params.get(key, None)

    @property
    def treatment_variable(self):
        """Treatment variable read from the core params, or None when absent."""
        key = doctor_constants.TREATMENT_VARIABLE
        return self.core_params.get(key, None)

    @property
    def prediction_variable(self,):
        """Prediction variable read from the core params, or None when absent."""
        key = doctor_constants.PREDICTION_VARIABLE
        return self.core_params.get(key, None)

    @property
    def probas_variables(self,):
        """Value stored under the PROBA_COLUMNS key of the core params, or None when absent."""
        key = doctor_constants.PROBA_COLUMNS
        return self.core_params.get(key, None)

    def _feature_interaction_steps(self, roles_filter):
        """Yield the steps that generate the manually configured feature interactions.

        Interactions are processed in three groups, in this order:
          1. numerical x numerical, all emitted in the "interaction" block, which is then flushed
          2. numerical x categorical, each in its own "interaction:<cat>:<num>" block
          3. non-numerical x non-numerical, each in its own "interaction:<col1>:<col2>" block

        Note: ``roles_filter`` is not used by this method.
        """
        logger.info("generating interactions")
        interactions = self.preprocessing_params["feature_generation"]["manual_interactions"]["interactions"]

        def get_type(column_name):
            return self.preprocessing_params["per_feature"][column_name]["type"]

        def is_numeric(column_name):
            return get_type(column_name) == doctor_constants.NUMERIC

        # The three groups are lazy: an interaction is only classified when its group is iterated
        num_num = (pair for pair in interactions
                   if is_numeric(pair["column_1"]) and is_numeric(pair["column_2"]))
        cat_cat = (pair for pair in interactions
                   if (not is_numeric(pair["column_1"])) and not (is_numeric(pair["column_2"])))
        num_cat = (pair for pair in interactions
                   if get_type(pair["column_1"]) != get_type(pair["column_2"])
                   and (is_numeric(pair["column_1"]) or is_numeric(pair["column_2"])))

        num_block = "interaction"

        for pair in num_num:
            logger.info("generating : %s x %s" % (pair["column_1"], pair["column_2"]))
            yield NumericalNumericalInteraction(num_block, pair["column_1"], pair["column_2"],
                                                rescale=pair["rescale"])
        yield FlushDFBuilder(num_block)

        for pair in num_cat:
            logger.info("generating : %s x %s" % (pair["column_1"], pair["column_2"]))
            # Whatever the declaration order, the step receives the categorical column first
            if is_numeric(pair["column_1"]):
                num, cat = pair["column_1"], pair["column_2"]
            else:
                num, cat = pair["column_2"], pair["column_1"]
            mixed_block = "interaction:%s:%s" % (cat, num)
            yield NumericalCategoricalInteraction(mixed_block, cat, num, pair["max_features"])
            yield FlushDFBuilder(mixed_block)

        for pair in cat_cat:
            logger.info("generating : %s x %s" % (pair["column_1"], pair["column_2"]))
            cat_block = "interaction:%s:%s" % (pair["column_1"], pair["column_2"])
            yield CategoricalCategoricalInteraction(cat_block, pair["column_1"], pair["column_2"],
                                                    pair["max_features"])
            yield FlushDFBuilder(cat_block)

    def _std_numerical_steps(self, roles_filter):
        """Yield the preprocessing steps for the numerical features.

        Only columns listed in ``self.collector_data["feature_order"]`` whose role is in
        ``roles_filter`` and whose type is NUMERIC are handled.

        Steps are yielded in this order:
          1. SingleColumnDropNARows for every column that drops rows on missing values
          2. while going through the columns a second time: datetime cyclical encoding steps,
             flag-presence steps and custom steps working on non-imputed columns
          3. the two MultipleImputeMissingFromInput steps (kept / not kept imputed blocks)
          4. rescalers, binarizers, quantizers, custom steps working on imputed columns
          5. derivatives generation, flushes of the "num_flagonly" and "datetime_cyclical" blocks
          6. pairwise linear / polynomial combinations, when enabled and at least 2 candidates exist

        :param roles_filter: container of the column roles to handle
        :raises Exception: on an unknown numerical handling method, or on a missing-value
                           handling that is not supported by the selected method
        """

        def selected_num_features():
            # Numerical columns with an accepted role, in the order of the collected "feature_order"
            for column_name in self.collector_data["feature_order"]:
                column_params = self.preprocessing_params["per_feature"][column_name]
                if column_params["role"] in roles_filter and column_params["type"] == doctor_constants.NUMERIC:
                    yield column_name

        # Numericals. Main handlings:
        #  * "Regular": Imputation + optional derivatives + optional rescaling
        #  * Datetime cyclical: drop rows + sine/cosine computation + optional "Regular-like" handling
        #  * Flag presence
        #  * Binarization: Imputation + binarization
        #  * Quantile binning: Imputation + binning
        #  * Custom: custom preprocessing code, on the imputed or on the original column
        #
        # Nb: if we create derivative features and have rescaling then
        # the pipeline needs to fit

        # After imputing values on a column, we may not want to keep this original column but only the ones generated by
        # its subsequent preprocessing. Depending on the value of 'keep_regular', we put the column in the block of
        # imputed columns that are kept or not.
        numerical_imputer_with_original_map = {}
        numerical_imputed_with_original_block = doctor_constants.NUM_IMPUTED_KEPT
        numerical_imputer_without_original_map = {}
        numerical_imputed_without_original_block = doctor_constants.NUM_IMPUTED_NOT_KEPT

        derivatives_to_generate = []

        rescalers = []
        derivative_block = "NUM_DERIVATIVE"
        column_collectors = self.collector_data["per_feature"]

        # First pass for drop rows. After that, no row can get dropped
        for column_name in selected_num_features():
            column_params = self.preprocessing_params["per_feature"][column_name]
            method = column_params.get(doctor_constants.NUMERICAL_HANDLING, None)
            missing_handling_method = column_params.get(doctor_constants.MISSING_HANDLING, None)

            # Rows are dropped on explicit DROP_ROW, or on KEEP_NAN_OR_DROP when NaN is not supported
            if method != "FLAG_PRESENCE" and (missing_handling_method == doctor_constants.DROP_ROW or
                                              ((missing_handling_method == doctor_constants.KEEP_NAN_OR_DROP) and not self.nan_support.supports_nan)):
                yield SingleColumnDropNARows(column_name)

        combination_candidates = []
        # NOTE(review): keep_missing is never read afterwards in this method
        binarizers, quantizers, keep_missing, custom_imputed_steps = [], [], [], []

        # Second pass: per-column handling. Some steps are yielded right away, the others are collected
        # in the lists/maps above and yielded after the loop.
        for column_name in selected_num_features():
            column_params = self.preprocessing_params["per_feature"][column_name]
            method = column_params.get(doctor_constants.NUMERICAL_HANDLING, None)
            missing_handling_method = column_params.get(doctor_constants.MISSING_HANDLING, None)

            # Resolve the effective missing-value handling: the KEEP_NAN_OR_* methods fall back to
            # impute / drop when NaN is not supported
            is_impute = (missing_handling_method == doctor_constants.IMPUTE or
                         ((missing_handling_method == doctor_constants.KEEP_NAN_OR_IMPUTE) and not self.nan_support.supports_nan))
            is_droprow = (missing_handling_method == doctor_constants.DROP_ROW or
                          ((missing_handling_method == doctor_constants.KEEP_NAN_OR_DROP) and not self.nan_support.supports_nan))
            is_keep_nan = (missing_handling_method in [doctor_constants.KEEP_NAN_OR_IMPUTE, doctor_constants.KEEP_NAN_OR_DROP]) and self.nan_support.supports_nan

            column_collector = column_collectors[column_name]

            if method == "REGULAR":

                if is_impute:
                    numerical_imputer_with_original_map[column_name] = column_collector["missing_impute_with_value"]
                    combination_candidates.append(column_name)

                elif is_droprow:
                    # Dropping the rows is actually done by the SingleColumnDropNARows step yielded in the first pass
                    numerical_imputer_with_original_map[column_name] = None
                    combination_candidates.append(column_name)

                elif is_keep_nan:
                    # Columns keeping their NaN are not added to the combination candidates
                    numerical_imputer_with_original_map[column_name] = None

                if column_params.get(doctor_constants.GENERATE_DERIVATIVE, False):
                    derivatives_to_generate.append(column_name)

                if column_params[doctor_constants.RESCALING] != "NONE":
                    rescalers.append(get_rescaler(numerical_imputed_with_original_block, column_name, column_params, column_collector))

            elif method == "DATETIME_CYCLICAL":
                if not is_droprow:
                    raise Exception(
                        "'Drop rows' is the only supported missing handling method for cyclical datetime encoding"
                    )
                selected_periods = column_params.get("datetime_cyclical_periods", [])
                if selected_periods:
                    yield DatetimeCyclicalEncodingStep(column_name, selected_periods, "datetime_cyclical")
                else:
                    logger.warning("No period specified to compute cyclical encoding of datetime data")
                if column_params.get("keep_regular", False):
                    # The original column is also kept, with the same optional rescaling as a regular column
                    numerical_imputer_with_original_map[column_name] = None
                    if column_params[doctor_constants.RESCALING] != "NONE":
                        rescalers.append(get_rescaler(numerical_imputed_with_original_block, column_name, column_params, column_collector))

            elif method == "FLAG_PRESENCE":
                yield FlagMissingValue2(column_name, "num_flagonly")
            elif method == "BINARIZE":
                # Threshold: collected median, collected average, or user-defined constant
                if column_params["binarize_threshold_mode"] == "MEDIAN":
                    thresh = column_collector["stats"]["median"]
                elif column_params["binarize_threshold_mode"] == "MEAN":
                    thresh = column_collector["stats"]["average"]
                else:
                    thresh = column_params["binarize_constant_threshold"]
                if is_impute:
                    if column_params.get("keep_regular", False):
                        numerical_imputer_with_original_map[column_name] = column_collector["missing_impute_with_value"]
                        block = numerical_imputed_with_original_block
                        if column_params[doctor_constants.RESCALING] != "NONE":
                            rescalers.append(get_rescaler(block, column_name, column_params, column_collector))
                    else:
                        numerical_imputer_without_original_map[column_name] = column_collector["missing_impute_with_value"]
                        block = numerical_imputed_without_original_block
                    binarizers.append(BinarizeSeries(block, column_name, "num_binarized", thresh))
                else:
                    raise Exception("'Impute' is the only supported missing handling method for binarization")

            elif method == "QUANTILE_BIN":
                if is_impute:
                    if column_params.get("keep_regular", False):
                        numerical_imputer_with_original_map[column_name] = column_collector["missing_impute_with_value"]
                        block = numerical_imputed_with_original_block
                        if column_params[doctor_constants.RESCALING] != "NONE":
                            rescalers.append(get_rescaler(block, column_name, column_params, column_collector))
                    else:
                        numerical_imputer_without_original_map[column_name] = column_collector["missing_impute_with_value"]
                        block = numerical_imputed_without_original_block
                    # The number of bins may arrive as a float (reason unknown), hence the explicit int() cast
                    quantizers.append(QuantileBinSeries(block, column_name, "num_quantized",
                                                        int(column_params["quantile_bin_nb_bins"])))

                else:
                    raise Exception("'Impute' is the only supported missing handling method for quantile binning")
            elif method == "CUSTOM":
                if is_impute:
                    # The custom step reads the imputed column: it is yielded after the imputation steps
                    numerical_imputer_without_original_map[column_name] = column_collector["missing_impute_with_value"]
                    custom_imputed_steps.append(CustomPreprocessingStep(numerical_imputed_without_original_block,
                                                                        column_name,
                                                                        column_params["customHandlingCode"],
                                                                        column_params["customProcessorWantsMatrix"]))
                elif is_droprow or is_keep_nan:
                    # Dropping the rows is actually done by the SingleColumnDropNARows step yielded in the first pass
                    yield CustomPreprocessingStep(None, column_name, column_params["customHandlingCode"],
                                                  column_params["customProcessorWantsMatrix"])

            else:
                raise Exception("Unknown numerical method %s for column %s" % (method, column_name))

        # We yield two distinct MultipleImputeMissingFromInput steps here so we can change the keep_output_block param
        # between the block of imputed columns that will be kept and the one that will not
        yield MultipleImputeMissingFromInput(numerical_imputer_with_original_map, numerical_imputed_with_original_block,
                                             keep_output_block=True, as_categorical=False)
        yield MultipleImputeMissingFromInput(numerical_imputer_without_original_map,
                                             numerical_imputed_without_original_block, keep_output_block=False,
                                             as_categorical=False)

        for proc in rescalers:
            yield proc

        # NOTE(review): the loop variable `bin` below shadows the builtin of the same name
        if len(binarizers) > 0:
            for bin in binarizers:
                yield bin
            yield FlushDFBuilder("num_binarized")

        if len(quantizers) > 0:
            for bin in quantizers:
                yield bin
            yield FlushDFBuilder("num_quantized")

        for step in custom_imputed_steps:
            yield step

        if len(derivatives_to_generate) > 0:
            yield NumericalDerivativesGenerator(numerical_imputed_with_original_block, derivative_block, derivatives_to_generate)
            yield BlockStdRescalingProcessor(derivative_block)

        # These two blocks are flushed unconditionally
        yield FlushDFBuilder("num_flagonly")

        yield FlushDFBuilder("datetime_cyclical")

        # Combinations are only generated from "REGULAR" columns that were imputed or had their rows dropped
        if self.preprocessing_params["feature_generation"]["pairwise_linear"]["behavior"] == "ENABLED_MANUAL" and len(combination_candidates) >= 2:
            yield PairwiseLinearCombinationsGenerator(numerical_imputed_with_original_block, "pw_linear", combination_candidates)
            yield BlockStdRescalingProcessor("pw_linear")
        if self.preprocessing_params["feature_generation"]["polynomial_combinations"]["behavior"] == "ENABLED_MANUAL" and len(combination_candidates) >= 2:
            yield AllInteractionFeaturesGenerator(numerical_imputed_with_original_block, "polynomial_interaction", combination_candidates)
            yield BlockStdRescalingProcessor("polynomial_interaction")

    def _std_categorical_steps(self, role_filter):
        # Category handling. One of:
        #  - Flag
        #  - Optional Impute + (Dummify or Impact)

        categorical_imputed_block = "CAT_IMPUTED"
        categorical_imputer_map = {}
        post_impute = []
        column_collectors = self.collector_data["per_feature"]
        flaggers = []

        for column_name in self.collector_data["feature_order"]:
            column_params = self.preprocessing_params["per_feature"][column_name]
            role = column_params["role"]
            column_type = column_params["type"]
            if role not in role_filter or column_type != doctor_constants.CATEGORY:
                continue

            column_collector = column_collectors[column_name]

            method = column_params[doctor_constants.CATEGORY_HANDLING]
            missing_handling_method = column_params.get(doctor_constants.MISSING_HANDLING, None)
            is_impute = missing_handling_method == doctor_constants.IMPUTE
            is_treat_as_regular = missing_handling_method == doctor_constants.NONE
            is_droprow = missing_handling_method == doctor_constants.DROP_ROW

            # The below code is slightly duplicated and could be made more compact
            # but I think it's more readable this way as it matches the behavior of the UI.

            if method == doctor_constants.DUMMIFY:
                should_drop = column_params.get("dummy_drop", "NONE") == "DROP"
                vals = column_collector[doctor_constants.CATEGORY_POSSIBLE_VALUES]
                if is_impute:
                    impute_val = column_collector["missing_impute_with_value"]
                    if sys.version_info < (3,0) and isinstance(impute_val, unicode):
                        impute_val = impute_val.encode("utf8")
                    categorical_imputer_map[column_name] = impute_val
                    post_impute.append(FastSparseDummifyProcessor(categorical_imputed_block, column_name, vals, should_drop))
                elif is_treat_as_regular:
                    # TODO: Better to impute and just add a value to the possibles ones !!
                    yield FastSparseDummifyProcessor(None, column_name, vals, should_drop)
                elif is_droprow:
                    yield SingleColumnDropNARows(column_name)
                    yield FastSparseDummifyProcessor(None, column_name, vals, should_drop)

            elif method == doctor_constants.IMPACT:
                impact_coder_params = {
                    "impact_method": column_params.get("impact_method", "M_ESTIMATOR"),
                    "impact_m": column_params.get("impact_m", 10),
                    "impact_kfold": column_params.get("impact_kfold", True),
                    "impact_kfold_k": column_params.get("impact_kfold_k", 5),
                    "impact_kfold_seed": column_params.get("impact_kfold_seed", 1337),
                    "categorical_rescaling": column_params.get("categorical_rescaling", doctor_constants.AVGSTD)
                }
                encoding_name = "glmm" if column_params.get("impact_method", "M_ESTIMATOR") == "GLMM" else "impact"
                out_block = encoding_name + ":" + column_name

                if is_impute:
                    categorical_imputer_map[column_name] = column_collector["missing_impute_with_value"]
                    post_impute.append(
                        TargetEncodingStep(categorical_imputed_block, column_name, impact_coder_params,
                                           self.target_variable, out_block))
                elif is_treat_as_regular:
                    categorical_imputer_map[column_name] = "_NA_"
                    post_impute.append(
                        TargetEncodingStep(categorical_imputed_block, column_name, impact_coder_params,
                                           self.target_variable, out_block))
                elif is_droprow:
                    yield SingleColumnDropNARows(column_name)
                    yield TargetEncodingStep(None, column_name, impact_coder_params, self.target_variable, out_block)

            elif method == "FLAG_PRESENCE":
                flaggers.append(FlagMissingValue2(column_name, "cat_flagpresence"))

            elif method == "HASHING":
                nb_bins_hashing = column_params.get("nb_bins_hashing", 1048576)
                hash_whole_categories = column_params.get("hash_whole_categories", True)
                if is_impute:
                    categorical_imputer_map[column_name] = column_collector["missing_impute_with_value"]
                    post_impute.append(
                        CategoricalFeatureHashingProcessor(categorical_imputed_block, column_name, hash_whole_categories, nb_bins_hashing))
                elif is_treat_as_regular:
                    categorical_imputer_map[column_name] = "_NA_"
                    post_impute.append(
                        CategoricalFeatureHashingProcessor(categorical_imputed_block, column_name, hash_whole_categories, nb_bins_hashing))
                elif is_droprow:
                    yield SingleColumnDropNARows(column_name)
                    yield CategoricalFeatureHashingProcessor(None, column_name, hash_whole_categories, nb_bins_hashing)

            elif method == doctor_constants.ORDINAL:
                out_block = "ordinal:" + column_name
                ordinal_params = {
                    "order": column_params.get("ordinal_order", "COUNT"),
                    "ascending": column_params.get("ordinal_ascending", False),
                    "default_mode": column_params.get("ordinal_default_mode", "HIGHEST"),
                    "default_value": column_params.get("ordinal_default_value")
                }
                if is_impute:
                    categorical_imputer_map[column_name] = column_collector["missing_impute_with_value"]
                    post_impute.append(
                        OrdinalEncodingStep(categorical_imputed_block, column_name, out_block, ordinal_params))
                elif is_treat_as_regular:
                    categorical_imputer_map[column_name] = "_NA_"
                    post_impute.append(
                        OrdinalEncodingStep(categorical_imputed_block, column_name, out_block, ordinal_params))
                elif is_droprow:
                    yield SingleColumnDropNARows(column_name)
                    yield OrdinalEncodingStep(None, column_name, out_block, ordinal_params)

            elif method == doctor_constants.FREQUENCY:
                out_block = "frequency:" + column_name
                frequency_params = {
                    "normalized": column_params.get("frequency_normalized", True),
                    "default_mode": column_params.get("frequency_default_mode", "EXPLICIT"),
                    "default_value": column_params.get("frequency_default_value", 0.),
                    "categorical_rescaling": column_params.get("categorical_rescaling", doctor_constants.AVGSTD)
                }
                if is_impute:
                    categorical_imputer_map[column_name] = column_collector["missing_impute_with_value"]
                    post_impute.append(
                        FrequencyEncodingStep(categorical_imputed_block, column_name, out_block, frequency_params))
                elif is_treat_as_regular:
                    categorical_imputer_map[column_name] = "_NA_"
                    post_impute.append(
                        FrequencyEncodingStep(categorical_imputed_block, column_name, out_block, frequency_params))
                elif is_droprow:
                    yield SingleColumnDropNARows(column_name)
                    yield FrequencyEncodingStep(None, column_name, out_block, frequency_params)

            elif method == "CUSTOM":
                if is_impute:
                    categorical_imputer_map[column_name] = column_collector["missing_impute_with_value"]
                    post_impute.append(
                        CustomPreprocessingStep(categorical_imputed_block, column_name, column_params["customHandlingCode"],
                                                column_params["customProcessorWantsMatrix"]))
                elif is_treat_as_regular:
                    categorical_imputer_map[column_name] = "_NA_"
                    post_impute.append(
                        CustomPreprocessingStep(categorical_imputed_block, column_name, column_params["customHandlingCode"],
                                                column_params["customProcessorWantsMatrix"]))
                elif is_droprow:
                    yield SingleColumnDropNARows(column_name)
                    yield CustomPreprocessingStep(None, column_name, column_params["customHandlingCode"],
                                                  column_params["customProcessorWantsMatrix"])
            else:
                raise ValueError("Category handling method %s is unknown" % method)

        yield MultipleImputeMissingFromInput(categorical_imputer_map, categorical_imputed_block,
                                             keep_output_block=False, as_categorical=True)
        #yield DumpMFDetails("After IMPUTE CAT")
        for step in post_impute:
            yield step

        # We send the flaggers now because there must not be droppers between the flaggers and
        # the flagged block flush
        for flagger in flaggers:
            yield flagger

        yield FlushDFBuilder("cat_flagpresence")

    def _std_text_steps(self, roles_filter):
        """Yield the preprocessing steps of every TEXT feature whose role is in ``roles_filter``.

        Features are visited in ``collector_data["feature_order"]``. Vectorizer and
        sentence-embedding steps are yielded as soon as their feature is visited.
        "CUSTOM" features are first registered for imputation and their step is only
        yielded after the shared imputation step.

        :param roles_filter: collection of roles to keep (e.g. ``{"INPUT"}``)
        :raises ValueError: if a feature uses an unknown ``text_handling`` method
        """
        # Text features have only one way to handle missing values: treating them as a regular value
        # (aka replacing null values with empty strings)
        imputed_block = "TEXT_IMPUTED"
        imputer_map = {}
        deferred_steps = []
        random_state = int(self.preprocessing_params.get('preprocessingFitSampleSeed', 1337))

        for column_name in self.collector_data["feature_order"]:
            column_params = self.preprocessing_params["per_feature"][column_name]
            role = column_params["role"]
            column_type = column_params["type"]
            if not (role in roles_filter and column_type == doctor_constants.TEXT):
                continue

            method = column_params["text_handling"]

            if method == "TOKENIZE_HASHING":
                yield TextHashingVectorizerProcessor(column_name, int(column_params.get("hashSize", 200000)))

            elif method == "TOKENIZE_HASHING_SVD":
                hash_size = int(column_params.get("hashSize", 200000))
                svd_limit = int(column_params.get("hashSVDSVDLimit", 50000))
                n_components = int(column_params.get("hashSVDSVDComponents", 100))
                yield TextHashingVectorizerWithSVDProcessor(column_name, n_components, hash_size, svd_limit,
                                                            random_state)

            elif method in ("TOKENIZE_COUNTS", "TOKENIZE_TFIDF"):
                # Both vectorizers take exactly the same arguments; only the processor class and the
                # name of the file holding previously saved stop words differ.
                if method == "TOKENIZE_COUNTS":
                    vectorizer_class = TextCountVectorizerProcessor
                    vectorizer_data_filename = "word_counts.json"
                else:
                    vectorizer_class = TextTFIDFVectorizerProcessor
                    vectorizer_data_filename = "tfidf.json"
                stopwords = self._load_stop_words(column_params, vectorizer_data_filename)
                yield vectorizer_class(
                    column_name,
                    column_params["minRowsRatio"],
                    column_params["maxRowsRatio"],
                    int(column_params["maxWords"]),
                    int(column_params["ngramMinSize"]),
                    int(column_params["ngramMaxSize"]),
                    stopwords,
                    # Deliberately kept as `and ... or None`: a falsy (e.g. empty) custom code maps to None
                    column_params["useCustomVectorizer"] and column_params["customVectorizerCode"] or None)

            elif method == "SENTENCE_EMBEDDING":
                if column_params.get("isStructuredRef"):
                    yield LLMApiSentenceEmbeddingExtractor(
                        column_name,
                        column_params["sentenceEmbeddingModel"],
                        column_params.get("embeddingSize")  # not always defined for custom HF models
                    )
                else:
                    # Legacy way: the GPU config set on the handler wins, otherwise derive it from core params
                    gpu_config = self.active_gpu_config or get_gpu_config_from_core_params(self.core_params)
                    yield CodeEnvResourceSentenceEmbeddingExtractor(
                        column_name,
                        column_params["sentenceEmbeddingModel"],
                        column_params["maxSequenceLength"],
                        int(column_params["sentenceEmbeddingBatchSize"]),
                        gpu_config=gpu_config
                    )

            elif method == "CUSTOM":
                is_keras_custom_feature = column_params.get("isSpecialFeature", False)
                imputer_map[column_name] = ""
                deferred_steps.append(
                    CustomPreprocessingStep(imputed_block,
                                            column_name,
                                            column_params["customHandlingCode"],
                                            column_params["customProcessorWantsMatrix"],
                                            accepts_tensor=is_keras_custom_feature,
                                            fit_and_process_only_fits=is_keras_custom_feature))

            else:
                raise ValueError("Not implemented text method %s" % method)

        # The imputation step is always yielded, even when no feature registered for it
        yield MultipleImputeMissingFromInput(imputer_map, imputed_block, keep_output_block=False,
                                             as_categorical=True)
        for deferred_step in deferred_steps:
            yield deferred_step

    def _load_stop_words(self, column_params, vectorizer_data_filename):
        """Return the stop words to use for a text feature, or None when there are none.

        Resolution order, driven by ``column_params["stopWordsMode"]`` (default "NONE"):

        - "NONE": no stop words, returns None
        - "CUSTOM": ``column_params["customStopWords"]`` split on single spaces
        - any other mode: the "stop_words" entry of ``vectorizer_data_filename`` when that
          file exists in the data folder context, otherwise the lines of
          ``nlp/stopwords_<mode lowercased>.txt`` in the resources dir

        :param column_params: preprocessing params of the feature
        :param vectorizer_data_filename: name of the file holding previously saved vectorizer data
        """
        mode = column_params.get("stopWordsMode", "NONE")
        if mode == "NONE":
            return None
        if mode == "CUSTOM":
            return column_params["customStopWords"].split(" ")

        folder_context = self._data_folder_context
        if folder_context.isfile(vectorizer_data_filename):
            # Words saved from a previous training: we are in a saved model
            logger.info("Reading stop words previously saved for {} in: {}".format(mode,
                                                                                   vectorizer_data_filename))
            return folder_context.read_json(vectorizer_data_filename)["stop_words"]

        # The logic to fetch stop words does not rely on a folder context because the resource dir of
        # a DSS install does not fit well with the folder context approach. So we keep the following
        # legacy logic.
        resource_path = osp.join(get_resources_dir(), "nlp", "stopwords_{}.txt".format(mode.lower()))
        with open(resource_path) as stopwords_file:
            logger.info("Reading stop words from resource dir for {} in: {}".format(mode,
                                                                                    resource_path))
            # No "stop_words" file found, we load the reference one: in a training
            return stopwords_file.read().splitlines()

    def _std_vector_steps(self, roles_filter):
        """Yield the preprocessing steps of every VECTOR feature whose role is in ``roles_filter``.

        Row-dropping steps are yielded while iterating over the features. The shared
        imputation step is yielded next, and the unfold steps come last.

        :param roles_filter: collection of roles to keep (e.g. ``{"INPUT"}``)
        """
        imputed_block = "VECTOR_IMPUTED"
        imputer_map = {}
        unfold_steps = []

        for column_name in self.collector_data["feature_order"]:
            column_params = self.preprocessing_params["per_feature"][column_name]
            role = column_params["role"]
            column_type = column_params["type"]
            if not (role in roles_filter and column_type == doctor_constants.VECTOR):
                continue

            column_collectors = self.collector_data["per_feature"][column_name]
            vec_length = column_collectors["vector_length"]
            method = column_params["vector_handling"]
            missing_handling_method = column_params.get(doctor_constants.MISSING_HANDLING, doctor_constants.DROP_ROW)

            if missing_handling_method == doctor_constants.DROP_ROW:
                yield SingleColumnDropNARows(column_name)

            # Only the UNFOLD handling produces a step here
            if method != doctor_constants.UNFOLD:
                continue

            # Unfold reads from the imputed block only when the feature is imputed
            in_block = None
            if missing_handling_method == doctor_constants.IMPUTE:
                impute_val = column_collectors["missing_impute_with_value"]
                if sys.version_info < (3,0) and isinstance(impute_val, unicode):
                    # Python 2 only: hand a utf8-encoded byte string to the imputer
                    impute_val = impute_val.encode("utf8")
                imputer_map[column_name] = impute_val
                in_block = imputed_block
            unfold_steps.append(UnfoldVectorProcessor(column_name, vec_length, in_block=in_block))

        # First compute imputes
        yield MultipleImputeMissingFromInput(imputer_map, imputed_block, keep_output_block=False,
                                             as_categorical=True)

        # Then, treat each Vector step
        for unfold_step in unfold_steps:
            yield unfold_step

    def _std_image_steps(self, roles_filter):
        """Yield the preprocessing steps of every IMAGE feature whose role is in ``roles_filter``.

        Supported ``image_handling`` methods are "CUSTOM" (user code applied to files read
        from the feature's managed folder) and "EMBEDDING_EXTRACTION" (pretrained model
        applied to files read from the managed folder set in the core params).

        :param roles_filter: collection of roles to keep (e.g. ``{"INPUT"}``)
        :raises ValueError: if a feature uses an unknown ``image_handling`` method
        :raises AssertionError: if embedding extraction is requested without a managed folder
        """
        for column_name in self.collector_data["feature_order"]:
            column_params = self.preprocessing_params["per_feature"][column_name]
            role = column_params["role"]
            column_type = column_params["type"]
            if not (role in roles_filter and column_type == doctor_constants.IMAGE):
                continue

            method = column_params["image_handling"]
            missing_handling_method = column_params.get(doctor_constants.MISSING_HANDLING, doctor_constants.DROP_ROW)

            if missing_handling_method == doctor_constants.DROP_ROW:
                yield SingleColumnDropNARows(column_name)

            if method == "CUSTOM":
                folder_id = column_params["managed_folder_id"]
                img_reader = DummyFileReader(folder_id)
                logger.info("Reading from managedFolder {} for feature {}".format(column_params["managed_folder_id"], column_name))
                yield FileFunctionPreprocessor(column_name, column_params["customHandlingCode"], img_reader,
                                               func_name="preprocess_image",
                                               fit_and_process_only_fits=column_params.get("isSpecialFeature", False))

            elif method == "EMBEDDING_EXTRACTION":
                assert self.core_params.get("managedFolderSmartId", "") != "", \
                    "a ManagedFolder should be selected for the Image Embedding extraction preprocessing to work"
                logger.info("Reading from managedFolder {} for feature {}".format(self.core_params["managedFolderSmartId"], column_name))

                # Invalid paths can only be imputed when missing values are imputed too
                impute_missing_values = missing_handling_method == doctor_constants.IMPUTE
                impute_invalid_paths = impute_missing_values and column_params.get(doctor_constants.IMPUTE_INVALID_PATHS, False)
                img_reader = DummyFileReader(self.core_params["managedFolderSmartId"])
                yield ImageEmbeddingExtractor(
                    column_name,
                    file_reader=img_reader,
                    model_structured_ref=column_params["pretrainedModelsParams"]["structureRefId"],
                    impute_missing_values=impute_missing_values,
                    impute_invalid_paths=impute_invalid_paths)

            else:
                raise ValueError("Not implemented image handling method %s" % method)

    def report(self, pipeline):
        """Collect the fit report of ``pipeline`` and write it to "preprocessing_report.json".

        The handler's ``core_params`` are passed to ``pipeline.report_fit`` when the
        attribute exists; an empty dict is passed otherwise.

        :param pipeline: object exposing ``report_fit(report, core_params)``, which fills ``report`` in place
        """
        fit_report = {}
        core_params = self.core_params if hasattr(self, "core_params") else {}
        pipeline.report_fit(fit_report, core_params)
        self._data_folder_context.write_json("preprocessing_report.json", fit_report)


def get_rescaler(in_block, column_name, column_params, column_collector):
    """Build a rescaler for the original column.

    The two values handed to ``RescalingProcessor2`` come from the collected column
    statistics: (min, max - min) for MINMAX rescaling, (average, std) for anything else.

    :param in_block: block the column is read from
    :param column_name: name of the column to rescale
    :param column_params: preprocessing params of the column (reads ``"rescaling"``)
    :param column_collector: collected data of the column (reads ``"stats"``)
    """
    # TODO Do we really want to use the collector for this?
    if column_params["rescaling"] == doctor_constants.MINMAX:
        lowest = column_collector["stats"]["min"]
        highest = column_collector["stats"]["max"]
        shift, scale = lowest, highest - lowest
    else:
        # Assumes rescaling is AVGSTD
        shift = column_collector["stats"]["average"]
        scale = column_collector["stats"]["std"]
    return RescalingProcessor2(in_block, column_name, shift, scale)


class ClusteringPreprocessingHandler(PreprocessingHandler):
    """
        Build the preprocessing pipeline for clustering projects

        Clustering preprocessing is especially difficult because, for
        several reasons, we need to keep track of the multiframe at
        different states of its processing:

        - profiling
            Columns that are not actually INPUT should still
            be preprocessed (e.g. dummified) in order to compute
            statistics on their different values.
            Such columns have a role called "PROFILING".

            Emitted first, before the regular INPUT handling, as

            * PROFILING

        - feature importance
            Feature importance is done by making a classification on
            the variables.
            In order to keep its result human readable, this analysis
            must be done on pre-PCA values.

            * TRAIN_PREPCA

        - train
            The model used for clustering works on preprocessed
            INPUT columns, on which we may or may not remove
            outliers, and may or may not apply a PCA.

            * TRAIN

        - outliers
            The outlier labels are used to make sure we can
            re-annotate the initial datasets (for feature importance
            and profiling)

            * OUTLIERS

        """

    def preprocessing_steps(self, **kwargs):
        """Yield, in order, the steps of the clustering preprocessing pipeline.

        ``kwargs`` is accepted but not used.
        """
        per_feature_collectors = self.collector_data["per_feature"]

        # First, handle profiling.
        #  - Numericals are kept as-is
        #  - Text is dropped
        #  - Categorical is both kept as-is (for cluster profiles) and dummified (for scatterplot)
        numeric_columns = []
        category_columns = []

        for column_name in self.collector_data["feature_order"]:
            column_params = self.preprocessing_params["per_feature"][column_name]
            if column_params["role"] not in ("PROFILING", "INPUT"):
                continue

            column_collector = per_feature_collectors[column_name]
            column_type = column_params["type"]

            if column_type == "CATEGORY":
                category_columns.append(column_name)
                if column_params["category_handling"] == "DUMMIFY":
                    possible_values = column_collector[doctor_constants.CATEGORY_POSSIBLE_VALUES]
                    drop_one_dummy = column_params.get("dummy_drop", "NONE") == "DROP"
                    yield FastSparseDummifyProcessor(None, column_name, possible_values, drop_one_dummy)
            elif column_type == doctor_constants.NUMERIC:
                numeric_columns.append(column_name)
            # Any other type (e.g. text) is just dropped

        yield CopyMultipleColumnsFromInput(numeric_columns, "NUM_COPIED")
        yield CopyMultipleColumnsFromInput(category_columns, "CAT_COPIED")
        yield EmitCurrentMFAsResult("PROFILING")

        yield DumpPipelineState("After create profiling")

        # Then handle the "regular" stuff - here, similar to prediction

        roles_filter = {"INPUT"}

        # Type coercion for all
        # TODO ??

        # Numericals, categories, text, vector, then feature interactions
        step_generators = (
            self._std_numerical_steps,
            self._std_categorical_steps,
            self._std_text_steps,
            self._std_vector_steps,
            self._feature_interaction_steps,
        )
        for generate_steps in step_generators:
            for step in generate_steps(roles_filter):
                yield step

        yield DumpPipelineState("After std handling")

        # Outliers detection
        kept_variance = self.preprocessing_params['reduce'].get('kept_variance')
        seed = int(self.preprocessing_params.get('preprocessingFitSampleSeed', 1337))
        if kept_variance == 0.0:
            # A kept variance of exactly 0 is replaced with 0.9
            kept_variance = 0.9

        outliers_params = self.preprocessing_params["outliers"]
        if outliers_params["method"] != "NONE":
            yield OutlierDetection(
                pca_kept_variance=kept_variance,
                min_n=outliers_params['min_n'],
                min_cum_ratio=outliers_params['min_cum_ratio'],
                outlier_name='OUTLIERS',
                random_state=seed)

        yield DumpPipelineState("After outliers")

        yield EmitCurrentMFAsResult("TRAIN_PREPCA")

        if not self.preprocessing_params["reduce"]["enabled"]:
            # No reduction: TRAIN is made a reference to the pre-PCA output
            yield AddReferenceInOutput("TRAIN_PREPCA", "TRAIN")
        else:
            pca_res = self.get_pca_resource()
            # Only create a new PCA when the resource does not already hold one
            if 'END_PCA' not in pca_res:
                pca_res['END_PCA'] = PCA(kept_variance=kept_variance, normalize=True)
            yield PCAStep(pca=pca_res['END_PCA'], input_name='TRAIN_PREPCA', output_name='TRAIN')

        yield DumpPipelineState("After PCA")


class PredictionPreprocessingHandler(PreprocessingHandler):
    """Build the preprocessing pipeline for prediction tasks.

    Subclasses provide ``target_map``. Use ``build`` to get the subclass matching the
    prediction type of the task.
    """

    @property
    def target_map(self, with_target=False):
        """Mapping applied to target (and prediction) values; must be provided by subclasses.

        ``with_target`` can never be supplied through property access; it is only kept so
        that the signature stays unchanged.
        """
        raise NotImplementedError()

    @property
    def weight_map(self):
        """Mapping applied to sample weight values: None, i.e. no mapping is passed."""
        return None

    @property
    def sample_weight_variable(self):
        """Name of the sample weight column from the core params, or None if not set."""
        return self.core_params.get("weight", {}).get("sampleWeightVariable", None)

    @property
    def has_sample_weight_variable(self):
        """Whether the weight method is "SAMPLE_WEIGHT" and a sample weight column is set.

        Missing "weight", "weightMethod" or "sampleWeightVariable" entries yield False
        instead of raising KeyError, consistently with ``sample_weight_variable``.
        """
        weight_params = self.core_params.get("weight", {})
        return (weight_params.get("weightMethod") == "SAMPLE_WEIGHT") and \
               (weight_params.get("sampleWeightVariable") is not None)

    @staticmethod
    def build(core_params, preprocessing_params, data_folder_context, assertions=None, active_gpu_config=None, nan_support=None):
        """Build the proper type of prediction preprocessing handler depending on the prediction type.

        The handler class is looked up in ``PREDICTION_PREPROCESSING_HANDLER_TYPE_MAP`` with the
        prediction type found in ``core_params``; all arguments are forwarded to its constructor.

        :raises KeyError: if the prediction type is absent from ``core_params`` or from the map
        """
        prediction_type = core_params[doctor_constants.PREDICTION_TYPE]
        # TODO @timeseries handle time series:
        # Time series preprocessing relies on the TimeseriesPreprocessing class which does not inherit
        # from PreprocessingHandler (multiple time series datasets require multiple PreprocessingHandlers
        # within one TimeseriesPreprocessing).
        # We should add specific time series logic, to use TimeseriesPreprocessing instead of
        # PredictionPreprocessingHandler.build, whenever it is required.
        return PREDICTION_PREPROCESSING_HANDLER_TYPE_MAP[prediction_type](core_params, preprocessing_params,
                                                                          data_folder_context,
                                                                          assertions=assertions,
                                                                          active_gpu_config=active_gpu_config,
                                                                          nan_support=nan_support)

    def preprocessing_steps(self, with_target=False, verbose=True, allow_empty_mf=False, with_prediction=False, with_treatment=False):
        """Yield, in order, the steps of the prediction preprocessing pipeline.

        with_target, with_treatment and with_prediction boolean parameters cannot be inferred based on other
        variables, unlike the presence of sample weights (by checking the sample_weight_variable attribute)

        :param with_target: move the target (and, if set, the sample weight) to the output and drop rows without it
        :param verbose: not used by this method
        :param allow_empty_mf: forwarded to the row-dropping steps
        :param with_prediction: move the prediction (and, if any, the probability columns) to the output
        :param with_treatment: move the treatment to the output
        """
        # Move target away
        if with_target:
            yield RemapValueToOutput(self.target_variable, "target", self.target_map)

        if with_treatment:
            control_value = self.core_params["control_value"]
            treatment_values = self.core_params.get("treatment_values", [])
            yield RemapTreatmentToOutput(self.treatment_variable,
                                         control_value,
                                         self.preprocessing_params["drop_missing_treatment_values"],
                                         self.core_params.get("enable_multi_treatment", False),
                                         treatment_values)

        # Move prediction away
        if with_prediction:
            yield RemapValueToOutput(self.prediction_variable, "prediction", self.target_map)
            if self.probas_variables:
                yield OutputRawColumns(self.probas_variables, doctor_constants.PROBA_COLUMNS)

        # Set weight apart
        # NB: only for training (not scoring / evaluate) recipes, so only when with_target is true
        if with_target and self.sample_weight_variable is not None:
            yield RemapValueToOutput(self.sample_weight_variable, "weight", self.weight_map)

        if self.assertions:
            yield ExtractMLAssertionMasksNbInitialRows(self.assertions)

        roles_filter = {"INPUT"}

        # Numericals
        for step in self._std_numerical_steps(roles_filter):
            yield step

        # Categories
        for step in self._std_categorical_steps(roles_filter):
            yield step

        # Text
        for step in self._std_text_steps(roles_filter):
            yield step

        # Vector
        for step in self._std_vector_steps(roles_filter):
            yield step

        # Image
        for step in self._std_image_steps(roles_filter):
            yield step

        # Feature interactions
        for step in self._feature_interaction_steps(roles_filter):
            yield step

        logger.info(str(self.preprocessing_params))

        # Realign the weight output now that feature handling is done
        # (presumably with the rows kept by the steps above - see RealignWeight)
        # NB: only for training (not scoring / evaluate) recipes, so only when with_target is true
        if with_target and self.sample_weight_variable is not None:
            yield RealignWeight()

        if with_target:
            yield RealignTarget()
            if self.sample_weight_variable is not None:
                # NB: the treatment is not realigned in the weighted branches
                if with_prediction:
                    yield RealignPrediction(has_probas=self.probas_variables)
                    yield DropRowsWhereNoTargetOrNoWeightOrNoPrediction(allow_empty_mf=allow_empty_mf,
                                                                        has_probas=self.probas_variables)
                else:
                    yield DropRowsWhereNoTargetOrNoWeight(allow_empty_mf=allow_empty_mf)
            else:
                if with_prediction:
                    yield RealignPrediction(has_probas=self.probas_variables)
                    if self.treatment_variable is not None:
                        yield RealignTreatment()
                        yield DropRowsWhereNoTargetOrNoTreatmentOrNoPrediction(allow_empty_mf=allow_empty_mf,
                                                                               has_probas=self.probas_variables)
                    else:
                        yield DropRowsWhereNoTargetOrNoPrediction(allow_empty_mf=allow_empty_mf,
                                                                  has_probas=self.probas_variables)
                else:
                    if self.treatment_variable is not None:
                        yield RealignTreatment()
                        yield DropRowsWhereNoTargetOrNoTreatment(allow_empty_mf=allow_empty_mf)
                    else:
                        yield DropRowsWhereNoTarget(allow_empty_mf=allow_empty_mf)

        # Features selection (output)
        yield DumpPipelineState("Before feature selection")

        # Feature generation experiments
        #yield RandomColumnsGenerator(10)
        #nfcg_settings = {
        #    "behavior" : "ENABLED_MANUAL",
        #    "all_features" : True,
        #    "k" : 5,
        #    "transformation_mode": "DUMMIFY_CLUSTERID" #"REPLACE_BY_DISTANCE"#DUMMIFY_CLUSTERID"#"REPLACE_BY_DISTANCE"
        #}
        #yield NumericalFeaturesClusteringGenerator(self.preprocessing_params, nfcg_settings)
        #cctg_settings = {
        #    "behavior" : "ENABLED_MANUAL",
        #    "all_features" : True
        #}
        #yield CategoricalsCountTransformerGenerator(self.preprocessing_params, cctg_settings)
        #yield DumpMFDetails("After feature generation")

        if "feature_selection_params" in self.preprocessing_params \
                and self.preprocessing_params["feature_selection_params"]["method"] != "NONE":
            logger.info("Performing feature reduction")
            random_state = int(self.preprocessing_params.get('preprocessingFitSampleSeed', 1337))
            yield FeatureSelectionStep(self.preprocessing_params["feature_selection_params"],
                                       self.core_params["prediction_type"],
                                       random_state=random_state)
        else:
            logger.info("No feature selection to perform")

        # running assertion as last step before emitting to have the last "version" of input_df, i.e. all rows
        # that needed to be dropped have been dropped
        if self.assertions:
            yield ExtractMLAssertionMasks(self.assertions)

        yield EmitCurrentMFAsResult("TRAIN")

        yield DumpPipelineState("At end")


class BinaryClassificationPreprocessingHandler(PredictionPreprocessingHandler):
    """Prediction preprocessing handler for targets with exactly two classes."""

    @property
    def target_map(self, with_target=False):
        """Map each target source value to its mapped value, as listed in ``target_remapping``.

        :raises ValueError: if the mapping does not hold exactly two distinct source values
        """
        mapping = {
            remapping["sourceValue"]: remapping["mappedValue"]
            for remapping in self.preprocessing_params["target_remapping"]
        }
        nb_classes = len(mapping)
        if nb_classes != 2:
            raise ValueError("This is not a binary classification, found %s classes" % nb_classes)
        return mapping


class MulticlassPreprocessingHandler(PredictionPreprocessingHandler):
    """Prediction preprocessing handler for targets with more than two classes."""

    @property
    def target_map(self, with_target=False):
        """Map each target source value to its mapped value, as listed in ``target_remapping``.

        :raises ValueError: if the mapping holds two distinct source values or fewer
        """
        mapping = {
            remapping["sourceValue"]: remapping["mappedValue"]
            for remapping in self.preprocessing_params["target_remapping"]
        }
        nb_classes = len(mapping)
        if nb_classes <= 2:
            raise ValueError("This is not multiclass, found %s classes" % nb_classes)
        return mapping


class RegressionPreprocessingHandler(PredictionPreprocessingHandler):
    """Prediction preprocessing handler for regression targets."""

    @property
    def target_map(self, with_target=False):
        """Regression targets are not remapped: there is no target map."""
        return None


# Prediction type -> handler class, used by PredictionPreprocessingHandler.build.
# The causal prediction types reuse the handlers of their non-causal counterparts.
PREDICTION_PREPROCESSING_HANDLER_TYPE_MAP = {
    doctor_constants.BINARY_CLASSIFICATION: BinaryClassificationPreprocessingHandler,
    doctor_constants.MULTICLASS: MulticlassPreprocessingHandler,
    doctor_constants.REGRESSION: RegressionPreprocessingHandler,
    doctor_constants.CAUSAL_BINARY_CLASSIFICATION: BinaryClassificationPreprocessingHandler,
    doctor_constants.CAUSAL_REGRESSION: RegressionPreprocessingHandler,
}
