# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# -*- coding: utf-8 -*-
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu
import ast

# Read recipe inputs
# Dataset new_condition_scored_tract renamed to new_condit by georgia.kouyialis@dataiku.com on 2023-04-18 18:14:57
# Dataset new_condition_scored_tract renamed to new_measure_scored_tract by georgia.kouyialis@dataiku.com on 2023-04-19 18:18:11
new_condition_scored_tract = dataiku.Dataset("new_measure_scored_tract")
df = new_condition_scored_tract.get_dataframe()

# Standardize the FIPS code to a fixed width of 11 characters.
# The column is cast to str and left-padded with zeros (leading zeros are
# presumably dropped when the code is loaded as a number -- confirm upstream).
# zfill pads up to the full width however many characters are missing, and
# leaves values that are already 11 characters or longer untouched.
df['FIPS'] = df['FIPS'].astype(str).str.zfill(11)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Split the scored data into two frames.
# 1. Model lookup: the distinct (Health_Measure, model version) pairs, with the
#    version column renamed to Model_version.
model_columns = ['Health_Measure', 'smmd_modelVersion']
df_model = df[model_columns].drop_duplicates()
df_model = df_model.rename(columns={'smmd_modelVersion': 'Model_version'})

# 2. Measure data: everything except the prediction and the model metadata
#    columns, de-duplicated once those columns are gone.
model_info_columns = [
    'prediction',
    'smmd_savedModelId',
    'smmd_modelVersion',
    'smmd_fullModelId',
    'smmd_predictionTime',
]
df_final = df.drop(columns=model_info_columns).drop_duplicates()

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Convert to long format: expand each row's `explanations` mapping into one
# output row per (factor, value) entry.
# Columns copied unchanged onto every expanded row, in output order.
passthrough_columns = [
    'FIPS',
    'Percent_Measure_Value',
    'Health_Measure',
    'Measure_Value_Percentile',
    'Population_tract',
    'Social Vulnerability Index',
    'County_name',
    'State_name',
]

long_data = []
for _, record in df_final.iterrows():
    shared_values = [record[column] for column in passthrough_columns]
    # `explanations` is stored as the text of a dict literal; parse it back
    # into a dict of factor name -> numeric value.
    factor_values = ast.literal_eval(record['explanations'])
    # Strip the "Percent " prefix from the factor name and round the value
    # to 4 decimals before appending both to the shared columns.
    long_data.extend(
        shared_values + [factor.replace("Percent ", ""), round(amount, 4)]
        for factor, amount in factor_values.items()
    )

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Build the long-format DataFrame: the pass-through columns followed by the
# factor name and its rounded value, matching the order of each long_data row.
long_columns = [
    'FIPS',
    'Percent_Measure_Value',
    'Health_Measure',
    'Measure_Value_Percentile',
    'Population_tract',
    'Social Vulnerability Index',
    'County_name',
    'State_name',
    'SVI_factors',
    'SHAP_value',
]
df_long = pd.DataFrame(long_data, columns=long_columns)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Write recipe outputs
# Dataset new_condition_tract_format renamed to new_measure_tract_format by georgia.kouyialis@dataiku.com on 2023-04-19 18:18:59
# Output dataset handles.
new_condition_tract_format = dataiku.Dataset("new_measure_tract_format")
model_version_tract = dataiku.Dataset("model_version_tract")
new_measure_scored = dataiku.Dataset('new_measure_scored_tract_prepared')

# Write each frame to its dataset, in order: the de-duplicated measure data,
# the model name/version lookup, then the long-format explanations.
outputs = (
    (new_condition_tract_format, df_final),
    (model_version_tract, df_model),
    (new_measure_scored, df_long),
)
for output_dataset, frame in outputs:
    output_dataset.write_with_schema(frame)