# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Libraries
# ==============================================================================
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu
from scipy.stats import pearsonr
from scipy.stats import spearmanr

# Load the recipe input dataset into a DataFrame
historical_ts_data_drivers = dataiku.Dataset("historical_ts_data_drivers")
df = historical_ts_data_drivers.get_dataframe()

# Keep only the rows whose actual_value is strictly positive
df = df.loc[df["actual_value"].gt(0)]

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Calculate Pearson's and Spearman correlation between drivers and actual_value
# ==============================================================================
# Produces corr_df with one row per distinct driver (in order of first
# appearance in df) and the columns driver / pearsons_correlation /
# spearman_correlation.
drivers = df['driver'].unique()

# Rows are collected in a list and the frame is built once at the end.
# Filling a pre-allocated frame through chained indexing
# (corr_df["col"][row] = value) is not a reliable way to assign in pandas and
# leaves the frame untouched when Copy-on-Write is enabled.
corr_records = []
for driver in drivers:
    df_driver = df[df['driver'] == driver]
    if len(df_driver) < 2:
        # pearsonr raises ValueError for fewer than two observations; record
        # the correlations as undefined instead of aborting the whole recipe.
        pearson_corr = spearman_corr = np.nan
    else:
        pearson_corr, _ = pearsonr(df_driver['driver_value'], df_driver['actual_value'])
        spearman_corr, _ = spearmanr(df_driver['driver_value'], df_driver['actual_value'])
    print(driver, 'Pearsons correlation: %.3f' % pearson_corr)
    print(driver, 'Spearman correlation: %.3f' % spearman_corr)
    corr_records.append((driver, pearson_corr, spearman_corr))

# dtype=object matches the column dtypes of the frame this code used to
# pre-allocate, so the schema written from corr_df should be unchanged.
# NOTE(review): switch to numeric dtypes if downstream consumers allow it.
corr_df = pd.DataFrame(
    corr_records,
    columns=["driver", "pearsons_correlation", "spearman_correlation"],
    dtype=object,
)

corr_df

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Add a constant "threshold" column: every driver row gets the same value.
DRIVER_CORRELATION_THRESHOLD = 0.8
corr_df["threshold"] = DRIVER_CORRELATION_THRESHOLD
corr_df

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Calculate Pearson's and Spearman correlation between drivers and actual_value
# for every (driver, category) pair
# ==============================================================================
# Produces corr_df_sub with one row per driver x category combination and the
# columns driver / category / pearsons_correlation / spearman_correlation.
drivers = df['driver'].unique()
subcategories = df['category'].unique()

# Rows are collected in a list and the frame is built once at the end.
# Filling a pre-allocated frame through chained indexing
# (corr_df_sub["col"][row] = value) is not a reliable way to assign in pandas
# and leaves the frame untouched when Copy-on-Write is enabled.
sub_records = []
for driver in drivers:
    # Filter on the driver once rather than re-scanning df for every category.
    df_one_driver = df[df['driver'] == driver]
    for category in subcategories:
        df_driver = df_one_driver[df_one_driver['category'] == category]
        if len(df_driver) < 2:
            # The full driver x category grid is walked, so a pair may have no
            # (or a single) row. pearsonr raises ValueError in that case; keep
            # the row but mark both correlations as undefined.
            pearson_corr = spearman_corr = np.nan
        else:
            pearson_corr, _ = pearsonr(df_driver['driver_value'], df_driver['actual_value'])
            spearman_corr, _ = spearmanr(df_driver['driver_value'], df_driver['actual_value'])
        sub_records.append((driver, category, pearson_corr, spearman_corr))

# dtype=object matches the column dtypes of the frame this code used to
# pre-allocate, so the schema written from corr_df_sub should be unchanged.
# NOTE(review): switch to numeric dtypes if downstream consumers allow it.
corr_df_sub = pd.DataFrame(
    sub_records,
    columns=["driver", "category", "pearsons_correlation", "spearman_correlation"],
    dtype=object,
)

corr_df_sub

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Add a constant "threshold" column: every driver/category row gets the same value.
SUBCATEGORY_CORRELATION_THRESHOLD = 0.8
corr_df_sub["threshold"] = SUBCATEGORY_CORRELATION_THRESHOLD
corr_df_sub

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Write recipe outputs
# ==============================================================================
# Per-driver correlations (corr_df, including its "threshold" column) go to the
# "drivers_correlation" dataset.
drivers_correlation = dataiku.Dataset("drivers_correlation")
drivers_correlation.write_with_schema(corr_df)

# Per driver/category correlations (corr_df_sub, including its "threshold"
# column) go to the "drivers_correlation_per_subcategory" dataset.
drivers_correlation_per_subcategory = dataiku.Dataset("drivers_correlation_per_subcategory")
drivers_correlation_per_subcategory.write_with_schema(corr_df_sub)