# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# -*- coding: utf-8 -*-
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu
import itertools
# Read recipe inputs
new_measure_tract_format_aggr = dataiku.Dataset("new_measure_tract_format_aggr_joined")
df = new_measure_tract_format_aggr.get_dataframe()

# Rename State_County_code to FIPS
df = df.rename(columns={'State_County_code': 'FIPS'})

# Keep only the rows that have a case count. The explicit copy makes the
# column assignments below write to an independent frame rather than to a
# slice of the frame that was read.
df = df[df['Number_measure_cases_sum'].notna()].copy()

# Round the percentage value of each health measure to the nearest whole number
df['Percent_Measure_Value'] = df['Percent_Measure_Value'].round()

# Percentile rank (0-1, two decimals) of each row's rounded percentage within
# its own health measure. groupby().rank() returns a Series aligned on the
# frame's index, so every value lands on the row it was computed from no
# matter how the rows are ordered. Rows whose Health_Measure is missing belong
# to no group and get NaN.
df['Measure_Value_Percentile'] = (
    df.groupby('Health_Measure')['Percent_Measure_Value']
    .rank(pct=True)
    .round(2)
)

# The case count was only needed for the filter above
df = df.drop('Number_measure_cases_sum', axis=1)

# Reshape the long per-measure rows into one row per FIPS with one column per
# health measure, once for the percent values and once for the percentiles.
# The labels are decorated to follow the format of the other health
# conditions; the FIPS key column gets decorated as well and is dropped again
# after the merges below.
# NOTE: DataFrame.pivot raises if a (FIPS, Health_Measure) pair occurs twice.
def _wide_by_measure(frame, value_column):
    """Return *value_column* pivoted to one column per Health_Measure, with FIPS as a column."""
    wide = frame.pivot(index='FIPS', columns="Health_Measure", values=value_column)
    return wide.reset_index()


percent = _wide_by_measure(df, "Percent_Measure_Value")
percent = percent.add_prefix('Percent ').add_suffix(' Disease_county')

percentile = _wide_by_measure(df, "Measure_Value_Percentile")
percentile = percentile.add_suffix(' Disease_county Percentile')

# Core data: every column except the per-measure ones, de-duplicated
measure_columns = ['Health_Measure', 'Percent_Measure_Value', 'Measure_Value_Percentile']
CleanTable = df.drop(columns=measure_columns).drop_duplicates()

# Decorated names of the FIPS key in each pivoted table
percent_key = "Percent FIPS Disease_county"
percentile_key = "FIPS Disease_county Percentile"

# Put the percent and percentile columns side by side, keeping a single key column
MetricsTable = percent.merge(
    percentile,
    how="inner",
    left_on=[percent_key],
    right_on=[percentile_key],
)
MetricsTable = MetricsTable.drop(columns=[percentile_key])

# Attach the new health-measure columns to the core data
FinalTable = CleanTable.merge(
    MetricsTable,
    how="inner",
    left_on=["FIPS"],
    right_on=[percent_key],
)
FinalTable = FinalTable.drop(columns=[percent_key])

# Fill NaN values with 0
FinalTable = FinalTable.replace(np.nan, 0)

# Standardize FIPS, State and County codes to fixed-width, zero-padded strings
# (5, 2 and 3 characters respectively). zfill adds as many leading zeros as
# are needed to reach the width and leaves values that are already wide
# enough untouched.
# NOTE(review): if a code column holds floats, astype(str) yields text such as
# '1001.0' — confirm these columns are integer-typed at this point.
for code_column, width in (('FIPS', 5), ('State_code', 2), ('County_code', 3)):
    FinalTable[code_column] = FinalTable[code_column].astype(str).str.zfill(width)

# Write recipe outputs
new_measure_tract_aggregate_county = dataiku.Dataset("new_measure_tract_aggregate_county")
new_measure_tract_aggregate_county.write_with_schema(FinalTable)