# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
#%config Completer.use_jedi = False

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# -*- coding: utf-8 -*-
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import chi2
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from commons.dku_utils.core import get_current_project_and_variables
from utils import information_value

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Read recipe inputs: load the "applications_train" dataset into a DataFrame.
applications_joined = dataiku.Dataset("applications_train")
applications_joined_df = applications_joined.get_dataframe()
# Project handle and its variables dict: the thresholds are read from
# variables['standard'] further down, and the filter counts are written back
# through project.set_variables() at the end of the recipe.
project, variables = get_current_project_and_variables()

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Candidate features: every column except 'credit_event' (used as the target
# below) and 'id'.
features = applications_joined_df.drop(columns=['credit_event', 'id'])

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Discretize numeric features that have more than 6 distinct (non-null) values.
# pd.cut with an integer bin count produces 6 equal-width intervals, so the
# column is replaced in place by a categorical of interval bins.
for feature in features.columns:
    column = applications_joined_df[feature]
    # Only integer ('i'), float ('f') and complex ('c') dtypes are binned.
    if column.dtype.kind not in ('i', 'f', 'c'):
        continue
    # Columns with few distinct values are kept as they are.
    if column.nunique() <= 6:
        continue
    applications_joined_df[feature] = pd.cut(column, 6)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Target vector: the 'credit_event' column as a flat 1-D array.
Y = np.array(applications_joined_df['credit_event']).ravel()

# Feature matrix: all remaining columns (same column order as `features`).
X = np.array(applications_joined_df.drop(columns=['credit_event', 'id']))

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Score every feature against the target with two statistics:
#   - information_value: computed by the project helper `information_value`
#   - chi2_pvalue: p-value of scikit-learn's chi2 test
# One row per feature is collected in a list and the DataFrame is built once at
# the end: DataFrame.append was removed in pandas 2.0 and re-copied the whole
# frame on every iteration.
oe = OrdinalEncoder()
score_rows = []

for i, feature in enumerate(features):
    # Cast to str so the encoder sees each column as plain string categories,
    # whatever the column originally held (numbers, interval bins, ...).
    X_feature = X[:, i].astype(str).reshape(-1, 1)
    X_enc = oe.fit_transform(X_feature)
    iv = information_value(X=X_enc, Y=Y)
    # chi2 returns (statistics, p-values), one entry per input column; a single
    # column is passed here, so chi[1][0] is this feature's p-value.
    # NOTE(review): chi2 is applied to the ordinal codes, whose values depend
    # on the encoder's category ordering -- confirm this is the intended test.
    chi = chi2(X=X_enc, y=Y)
    score_rows.append({'feature': feature, 'information_value': iv,
                       'chi2_pvalue': chi[1][0]})

# Explicit columns keep the schema stable even when there are no features.
scores = pd.DataFrame(score_rows,
                      columns=['feature', 'information_value', 'chi2_pvalue'])

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Store the configured thresholds next to each score so the output dataset is
# self-describing.
standard_vars = variables['standard']
scores['information_value_threshold'] = standard_vars['information_value_threshold']
scores['chi2_pvalue_threshold'] = standard_vars['chi2_pvalue_threshold']

# Row-wise comparisons against the thresholds. All four are strict, so a score
# exactly equal to its threshold is counted in none of the groups below.
iv_below = scores['information_value'] < scores['information_value_threshold']
iv_above = scores['information_value'] > scores['information_value_threshold']
pvalue_below = scores['chi2_pvalue'] < scores['chi2_pvalue_threshold']
pvalue_above = scores['chi2_pvalue'] > scores['chi2_pvalue_threshold']

# Number of features falling in each combination, stored as project variables.
standard_vars['chi2_information_value_filtered'] = int((iv_below & pvalue_above).sum())
standard_vars['information_value_only_filtered'] = int((iv_below & pvalue_below).sum())
standard_vars['chi2_only_filtered'] = int((iv_above & pvalue_above).sum())

project.set_variables(variables)

# Write recipe outputs: one row per feature with its scores and thresholds.
applications_entropy = dataiku.Dataset("applications_entropy")
applications_entropy.write_with_schema(scores)