# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# -*- coding: utf-8 -*-
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu
import scorecardpy as sc
from monotonic_binning.monotonic_woe_binning import Binning
from sklearn.linear_model import LinearRegression
from commons.dku_utils.core import get_current_project_and_variables
from commons.dku_utils.datasets.dataset_commons import get_dataset_schema

# Single regression instance, refit for each numeric feature in the binning loop.
lr = LinearRegression()

# Read recipe inputs
applications_joined = dataiku.Dataset("applications_correlation_filtered")
applications_joined_df = applications_joined.get_dataframe()

project, variables = get_current_project_and_variables()

# All tuning parameters are read from the 'standard' section of the project variables.
standard_variables = variables['standard']
categorization_threshold = standard_variables['categorization_threshold']
share_bin = standard_variables['share_bin']
share_minority_bin = standard_variables['share_minority_bin']
p_value = standard_variables['p_value']
iv_threshold = standard_variables['iv_threshold']

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Candidate features: every input column except the 'credit_event' target and the 'id' column.
non_feature_columns = ['credit_event', 'id']
features = [column for column in applications_joined_df.columns if column not in non_feature_columns]
ivs = pd.DataFrame()

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
all_bins = dict()
nb_observations = len(applications_joined_df)


def _fit_monotonic_binning(frame, sign):
    """Fit a `Binning` of the single feature in `frame` against 'credit_event'.

    The size, minority-size and p-value thresholds are derived from the project
    variables and the number of observations; `sign` is forwarded unchanged to
    `Binning`. Any exception raised by the construction or the fit propagates.
    """
    binning = Binning('credit_event', n_threshold=nb_observations * share_bin,
                      y_threshold=nb_observations * share_minority_bin,
                      p_threshold=p_value, sign=sign)
    binning.fit(frame)
    return binning


for variable in features:
    # A column with a single distinct value (NaN counts as a value here) is skipped.
    if len(applications_joined_df[variable].unique()) <= 1:
        continue

    is_numeric = applications_joined_df[variable].dtype.kind in 'if'
    if is_numeric and len(applications_joined_df[variable].value_counts()) > categorization_threshold:
        # --- Numeric feature with enough distinct values: monotonic binning ---
        # Estimate the direction of the relationship with the target by regressing
        # the per-quantile event rate on the per-quantile feature mean.
        check_sign_data = applications_joined_df[['credit_event', variable]].dropna()
        len_break = 1
        factor = 10
        # Refine the quantile grid until at least two distinct (4 significant digit) edges exist.
        # NOTE(review): `factor` has no upper bound; if every value rounds to the same
        # 4-significant-digit number this loop never ends - consider a cap.
        while len_break < 2:
            breaks = np.unique([float('%.4g' % x) for x in check_sign_data[variable].quantile([x/factor for x in range(factor)]).unique()])
            factor *= 10
            len_break = len(breaks)
        check_sign_data['quantile'] = pd.cut(check_sign_data[variable], bins=list(breaks), right=False)
        quantized_data = check_sign_data.groupby('quantile')[['credit_event', variable]].mean().reset_index()
        # Grouping on a categorical can emit rows for intervals without any observation;
        # their means are NaN and would make the regression fail, so drop them.
        quantized_data = quantized_data.dropna(subset=['credit_event', variable])
        lr.fit(np.array(quantized_data[variable]).reshape(-1, 1), np.array(quantized_data['credit_event']))
        coef = lr.coef_[0]

        # Round the feature to 2 significant digits before binning (integers stay integers).
        applications_copy = applications_joined_df[['credit_event', variable]].copy()
        if applications_joined_df[variable].dtype.kind == 'i':
            applications_copy[variable] = [int(float('%.2g' % x)) for x in applications_copy[variable]]
        else:
            applications_copy[variable] = [float('%.2g' % x) for x in applications_copy[variable]]

        # First attempt uses sign = (coef < 0); the fallback uses sign = (coef > 0).
        primary_sign = coef < 0
        fallback_sign = coef > 0
        try:
            bin_object = _fit_monotonic_binning(applications_copy[['credit_event', variable]], primary_sign)
            fitted_sign = primary_sign
        except ValueError:
            bin_object = _fit_monotonic_binning(applications_copy[['credit_event', variable]], fallback_sign)
            fitted_sign = fallback_sign
        # A single resulting bin triggers a retry with the fallback sign. The retry is
        # skipped when the current fit already used that sign (it would be identical).
        if len(bin_object.woe_summary) == 1 and fitted_sign != fallback_sign:
            bin_object = _fit_monotonic_binning(applications_copy[['credit_event', variable]], fallback_sign)
        bins = {variable: bin_object}
    else:
        # --- Everything else is binned as a categorical feature by scorecardpy ---
        # The input column itself is converted to strings (this is kept for later cells).
        applications_joined_df[variable] = applications_joined_df[variable].astype(str)
        bins = sc.woebin(applications_joined_df[[variable, 'credit_event']], y="credit_event",
                         check_cate_num=False, count_distr_limit=share_bin)
    all_bins.update(bins)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# One row per binned feature with its total information value: read from the
# `total_iv` attribute of a `Binning` object, or from the first row of the
# 'total_iv' column of a scorecardpy bin table. Sorted by decreasing IV.
total_ivs = pd.DataFrame({
    'variable': list(all_bins),
    'iv': [
        binning.total_iv if isinstance(binning, Binning) else binning['total_iv'].iloc[0]
        for binning in all_bins.values()
    ],
})
total_ivs = total_ivs.sort_values('iv', ascending=False)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
def compute_woe_numeric(data, variable, binning):
    """Build the WOE table of a numeric feature and replace its values by their WOE.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the `variable` column and a 'credit_event' column.
        The `variable` column of this frame is overwritten.
    variable : str
        Name of the feature column to transform.
    binning : object
        Fitted binning exposing `woe_summary` (a DataFrame), `sign` and `bins`.
        NOTE(review): `woe_summary` must already carry the `variable`, 'means',
        'goods', 'bads', 'labels' and 'WOE_<variable>' columns that are read
        below - confirm against the binning library in use.

    Returns
    -------
    tuple
        `(woe_output, data)`: the per-bin summary (columns 'bin', 'woe', 'good',
        'bad', 'label', 'include_na' and 'variable'; column order differs between
        the two branches) and `data` with `variable` replaced by WOE values.
    """
    woe_summary = binning.woe_summary
    # When the binning was fitted with sign == False, flip the row order and
    # rebuild the shifted-edge and label columns for the flipped order.
    if binning.sign == False:
        woe_summary = woe_summary.iloc[::-1]
        woe_summary[variable + '_shift'] = woe_summary[variable].shift(1)
        # The first row has no predecessor; its shifted edge is set to the string '-inf'.
        # NOTE(review): chained assignment - whether this writes through depends on
        # the pandas version / copy semantics; verify the resulting labels.
        woe_summary[variable + '_shift'].iloc[0] = '-inf'
        woe_summary['labels'] = woe_summary[variable + '_shift'].astype(str) + '-' + woe_summary[variable].astype(str)
    # Missing values: they are merged into the bin whose event rate is closest to theirs.
    has_na = sum(data[variable].isna()) > 0
    if has_na:
        means = list(woe_summary['means'])
        # Mean of 'credit_event' over the rows where the feature is missing.
        mean = data[data[variable].isna()][['credit_event']].mean()[0]
        # Position (not index label) of the bin whose 'means' is closest to that value.
        index_mean = min(range(len(means)), key=lambda i: abs(means[i]-mean))
        # Alias, not a copy: the edits below also modify `woe_summary`.
        woe_bins = woe_summary
        woe_bins['include_na'] = [True if i == index_mean else False for i in range(len(woe_bins))]
        # Count the missing-value rows per target value (1 and 0).
        counts = data[data[variable].isna()].groupby(['credit_event']).size().reset_index()
        counts.columns = ['credit_event', 'count']
        ones = counts[counts['credit_event']==1]
        one = 0
        if len(ones) > 0:
            one = ones['count'].iloc[0]
        zeros = counts[counts['credit_event']==0]
        zero = 0
        if len(zeros) > 0:
            zero = zeros['count'].iloc[0]
        # Add the missing-value rows to the chosen bin: target 1 -> 'bads', target 0 -> 'goods'.
        # NOTE(review): `.at` is label-based while `index_mean` is a position; after the
        # `iloc[::-1]` reversal above the two may differ (assuming the summary had a
        # default 0..n-1 index), so the counts may land in another row than the one
        # flagged by 'include_na' - confirm.
        woe_bins.at[index_mean, 'bads'] += one
        woe_bins.at[index_mean, 'goods'] += zero
        # Recompute the good/bad distributions and WOE = log(dist_good / dist_bad).
        total_goods = np.sum(woe_bins["goods"])
        total_bads = np.sum(woe_bins["bads"])
        woe_bins["dist_good"] = woe_bins["goods"] / total_goods
        woe_bins["dist_bad"] = woe_bins["bads"] / total_bads
        woe_bins["WOE_" + variable] = np.log(woe_bins["dist_good"] / woe_bins["dist_bad"])
        woe_output = woe_bins[[variable, 'include_na', "WOE_" + variable, 'goods', 'bads', 'labels']]
        woe_output.columns = ['bin', 'include_na', 'woe', 'good', 'bad', 'label']
        woe_output['variable'] = variable
    else:
        # No missing value: reuse the WOE values already present in the summary.
        woe_output = woe_summary[[variable, "WOE_" + variable, 'goods', 'bads', 'labels']]
        woe_output.columns = ['bin', 'woe', 'good', 'bad', 'label']
        woe_output['include_na'] = False
        woe_output['variable'] = variable
    # Replace the feature values by the WOE of the bin they fall into.
    woes = list(woe_output['woe'])
    if binning.sign:
        # Drop the second edge of `binning.bins`.
        final_bins = [b for i, b in enumerate(binning.bins) if i != 1]
    else:
        # Drop the second-to-last edge and reverse the WOE list.
        # NOTE(review): `woe_output` was already built from the reversed summary in
        # this case - check that the reversed list matches the bin order produced by pd.cut.
        final_bins = [b for i, b in enumerate(binning.bins) if i != len(binning.bins) - 2]
        woes.reverse()
    # Integer labels 0..len(final_bins)-2 identify the interval of each value.
    data[variable] = pd.cut(data[variable], final_bins, labels=range(len(final_bins)-1))
    if has_na:
        # Missing values fall in no interval; assign them the bin selected above.
        data[variable] = data[variable].fillna(index_mean)
    data[variable] = [woes[x] for x in data[variable]]
    return woe_output, data

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
def compute_woe_categorical(data, variable, binning):
    """Expand a categorical bin table to one row per category and look up each row's WOE.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the `variable` column; that column is converted to
        strings in place before the lookup.
    variable : str
        Name of the categorical feature column.
    binning : pandas.DataFrame
        Bin table with 'bin', 'woe', 'good' and 'bad' columns, where a 'bin'
        value lists its categories separated by '%,%'.

    Returns
    -------
    tuple
        `(result, data_output)`: the per-category table (columns 'category',
        'bin', 'woe', 'good', 'bad', 'variable') and a frame with the
        'category' and 'woe' matched to every row of `data` (left join, so
        unmatched rows hold NaN).
    """
    # One record per (category, bin) pair.
    records = []
    for bin_row in binning.itertuples():
        for category in bin_row.bin.split('%,%'):
            records.append((category, bin_row.bin, bin_row.woe, bin_row.good, bin_row.bad))
    result = pd.DataFrame(records)
    result.columns = ['category', 'bin', 'woe', 'good', 'bad']
    result['variable'] = variable

    # Categories in the bin table are strings, so compare on the string form.
    data[variable] = data[variable].astype(str)
    data_woe = data.merge(result, how='left', left_on=variable, right_on='category')
    return result, data_woe[['category', 'woe']]

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Accumulators for the per-bin WOE tables, and the frame whose feature columns
# are progressively replaced by WOE values.
woe_bins_numeric = pd.DataFrame()
woe_bins_categorical = pd.DataFrame()
applications_binned = applications_joined_df.copy()

for variable, binning in all_bins.items():
    # Both helpers assign into the frame they receive, so hand them an explicit copy.
    feature_frame = applications_binned[[variable, 'credit_event']].copy()
    if isinstance(binning, Binning):
        # Numeric feature (monotonic binning object).
        print(binning.sign)
        woe_bins, transformed_woe = compute_woe_numeric(feature_frame, variable, binning)
        applications_binned[variable] = transformed_woe[variable]
        woe_bins['sign'] = binning.sign
        if len(woe_bins_numeric) == 0:
            woe_bins_numeric = woe_bins
        else:
            # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
            woe_bins_numeric = pd.concat([woe_bins_numeric, woe_bins])
    else:
        # Categorical feature (scorecardpy bin table).
        woe_bins, transformed_woe = compute_woe_categorical(feature_frame, variable, binning)
        applications_binned[variable] = transformed_woe['woe']
        woe_bins_categorical = pd.concat([woe_bins_categorical, woe_bins])

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Write recipe outputs

# Information values, together with the configured IV threshold.
feature_ivs = dataiku.Dataset("feature_ivs")
total_ivs["iv_threshold"] = iv_threshold
feature_ivs.write_with_schema(total_ivs)

# Numeric WOE bins; the 'total' column is only added when the table has rows.
woe_bins_numeric_dataset = dataiku.Dataset("woe_bins_numeric")
if len(woe_bins_numeric) > 0:
    woe_bins_numeric['total'] = woe_bins_numeric['good'] + woe_bins_numeric['bad']
woe_bins_numeric_dataset.write_with_schema(woe_bins_numeric)

# Categorical WOE bins, same treatment.
woe_bins_categorical_dataset = dataiku.Dataset("woe_bins_categorical")
if len(woe_bins_categorical) > 0:
    woe_bins_categorical['total'] = woe_bins_categorical['good'] + woe_bins_categorical['bad']
woe_bins_categorical_dataset.write_with_schema(woe_bins_categorical)

# Explicit output schema: 'id' as string, 'credit_event' as bigint and every
# remaining column as double.
output_schema = [{'name': 'id', 'type': 'string'}, {'name': 'credit_event', 'type': 'bigint'}]
output_schema += [
    {'name': col, 'type': 'double'}
    for col in applications_binned.columns
    if col not in ['id', 'credit_event']
]

applications_binned_dataset = dataiku.Dataset("applications_binned")
applications_binned_dataset.write_schema(output_schema)
applications_binned_dataset.write_dataframe(applications_binned, infer_schema=False, dropAndCreate=True)