# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# -*- coding: utf-8 -*-
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu
import random

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Read recipe inputs
# Load the "model_comparison_prepared" input dataset as a pandas DataFrame.
# The cells below read its 'ClaimAmount', 'PremiumPrediction',
# 'PremiumPredictionCompound' and 'Exposure' columns.
model_comparison_prepared = dataiku.Dataset("model_comparison_prepared")
model_comparison_prepared_df = model_comparison_prepared.get_dataframe()

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
def lorenz_curve(y_true, y_pred, exposure):
    """Return the points of a Lorenz curve for a score-based ranking.

    Samples are ordered by increasing ``y_pred``. In that order the running
    sum of ``y_true * exposure`` is accumulated and divided by its final
    value, so the curve ends at 1.

    Parameters
    ----------
    y_true : array-like
        Observed value per sample. It is multiplied element-wise by
        ``exposure`` before being accumulated.
    y_pred : array-like
        Score per sample; only its ascending sort order is used.
    exposure : array-like
        Per-sample weight applied to ``y_true``.

    Returns
    -------
    cumulated_samples : numpy.ndarray
        Evenly spaced values from 0 to 1, one per sample.
    cumulated_claim_amount : numpy.ndarray
        Normalised running sum of ``y_true * exposure`` in ranked order.

    Raises
    ------
    IndexError
        If the inputs are empty (there is no final total to normalise by).

    Notes
    -----
    If the total of ``y_true * exposure`` is zero, the normalisation divides
    by zero and the returned amounts are not finite.
    """
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
    exposure = np.asarray(exposure)

    # order samples by increasing predicted risk:
    ranking = np.argsort(y_pred)
    ranked_exposure = exposure[ranking]
    ranked_pure_premium = y_true[ranking]
    cumulated_claim_amount = np.cumsum(ranked_pure_premium * ranked_exposure)
    # Divide out-of-place: an in-place `/=` cannot store the float quotient in
    # an integer array, so integer-valued inputs used to raise a casting error.
    cumulated_claim_amount = cumulated_claim_amount / cumulated_claim_amount[-1]
    cumulated_samples = np.linspace(0, 1, len(cumulated_claim_amount))
    return cumulated_samples, cumulated_claim_amount

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Build one Lorenz curve per ranking strategy. Every curve accumulates the same
# observed values with the same exposure; only the score used to order the rows
# differs from one call to the next.
# NOTE(review): lorenz_curve multiplies its first argument by the exposure, so
# these curves accumulate ClaimAmount * Exposure -- confirm this is the
# intended weighting.
_observed_claims = model_comparison_prepared_df['ClaimAmount']
_exposure_weights = model_comparison_prepared_df['Exposure']

# Rows ranked by the 'PremiumPrediction' column.
tweedie_cum_samples, tweedie_cum_claim_amount = lorenz_curve(
    _observed_claims,
    model_comparison_prepared_df['PremiumPrediction'],
    _exposure_weights)

# Rows ranked by the 'PremiumPredictionCompound' column.
product_cum_samples, product_cum_claim_amount = lorenz_curve(
    _observed_claims,
    model_comparison_prepared_df['PremiumPredictionCompound'],
    _exposure_weights)

# Oracle: rows ranked by the observed claim amounts themselves.
oracle_cum_samples, oracle_cum_claim_amount = lorenz_curve(
    _observed_claims,
    _observed_claims,
    _exposure_weights)

# Random baseline: a shuffled permutation of the row positions is used as the
# score. No seed is set here, so this curve can differ between runs.
ordered_array = list(range(len(model_comparison_prepared_df)))
random.shuffle(ordered_array)
random_cum_samples, random_cum_claim_amount = lorenz_curve(
    _observed_claims,
    ordered_array,
    _exposure_weights)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Gather the four curves into one table. Each curve was computed over the same
# rows, so their x-axes have the same length and evenly spaced values; the one
# returned with the Tweedie curve is stored as the shared 'cum_samples' column.
_curve_columns = {
    'cum_samples': tweedie_cum_samples,
    'tweedie_cum_claim_amount': tweedie_cum_claim_amount,
    'product_cum_claim_amount': product_cum_claim_amount,
    'oracle_cum_claim_amount': oracle_cum_claim_amount,
    'random_cum_claim_amount': random_cum_claim_amount,
}
lorenz_curves_df = pd.DataFrame(_curve_columns)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Write recipe outputs
# Write the combined Lorenz-curve table to the "lorenz_curves" output dataset.
# write_with_schema also sets the dataset schema from the DataFrame's columns
# (per the Dataiku Dataset API).
lorenz_curves = dataiku.Dataset("lorenz_curves")
lorenz_curves.write_with_schema(lorenz_curves_df)