# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu
import itertools
import math
from fractions import Fraction as frac
from numpy import log as ln

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Recipe inputs: open the three input dataset handles, then load each one
# into a DataFrame.
Reports_faers_deduplicate = dataiku.Dataset("Reports_faers_deduplicate_filtered")
population_F = dataiku.Dataset("population_F")
population_M = dataiku.Dataset("population_M")

Reports_faers_deduplicate_df = Reports_faers_deduplicate.get_dataframe()
population_F_df = population_F.get_dataframe()
population_M_df = population_M.get_dataframe()

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Parametrized column names used throughout this notebook.

# Keys of an (adverse event, drug) pair.
events_column = "adverse_event"
drugs_column = "drug"

# Report-level attributes carried into the per-population outputs.
manufacturer_column = "manufacturer"
primaryid_column = "primaryid"
age_column = "age_group"
drug_interactions_column = "drug_interactions"
Interaction_detected_column = "Interaction_detected"
indication_column = "indication"
indication_unknown_column = "indication_unknown"
reporter_column = "reporter"
reporter_country_column = "reporter_country"
outcome_column = "outcome"
seriousness_column = "seriousness"
date_column = "event_date_year"
total_medications_column = "total_medications"

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Get the project global variables that drive the event / drug filtering.
project = dataiku.api_client().get_project(dataiku.default_project_key())
variables = project.get_variables()
standard_variables = variables['standard']

# Event mode: "all", "most_frequent", "least_frequent" or "select_individual".
events_mode = standard_variables['events_mode_filter']
# Percentage threshold, to be specified for the most/least frequent modes.
events_frequency_selection = standard_variables['events_frequency_filter']
manufacturer_filter = standard_variables['manufacturer_filter']
adverse_event_filter = standard_variables['adverse_event_filter']
# Drug mode: "select_individual_manufacturer", "select_individual_drug" or "all".
drugs_mode_filter = standard_variables["drugs_mode_filter"]
drug_filter = standard_variables["drug_filter"]

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
def dataset_event_filters(df, event_md, events_freq, manufacturer_list):
    """Filter the reports used to generate the (drug x adverse event) pairs.

    The event filter is applied first, then the drug / manufacturer filter.

    Parameters
    ----------
    df : pandas.DataFrame
        Reports holding ``events_column`` and, depending on the active drug
        mode, ``manufacturer_column`` or ``drugs_column``.
    event_md : str
        ``'most_frequent'`` keeps events whose share of rows (in percent) is
        at least ``events_freq``; ``'least_frequent'`` keeps those whose share
        is at most ``events_freq``; ``'select_individual'`` keeps the events
        listed in the module-level ``adverse_event_filter``; any other value
        keeps every event.
    events_freq : str or float
        Percentage threshold; only read for the two frequency modes.
    manufacturer_list : list
        Manufacturers to keep when the module-level ``drugs_mode_filter`` is
        ``'select_individual_manufacturer'``.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that pass both filters.
    """
    if event_md in ('most_frequent', 'least_frequent'):
        threshold = float(events_freq)
        # Share of rows per event, in percent.
        event_share = df[events_column].value_counts(normalize=True) * 100
        if event_md == 'most_frequent':
            kept_events = event_share[event_share >= threshold].index.tolist()
        else:
            kept_events = event_share[event_share <= threshold].index.tolist()
        final_events_df = df[df[events_column].isin(kept_events)]
    elif event_md == 'select_individual':
        final_events_df = df[df[events_column].isin(adverse_event_filter)]
    else:
        final_events_df = df

    if drugs_mode_filter == 'select_individual_manufacturer':
        # Use the argument: it used to be ignored in favour of the global
        # manufacturer_filter, so callers could not pass their own list.
        return final_events_df[final_events_df[manufacturer_column].isin(manufacturer_list)]
    if drugs_mode_filter == "select_individual_drug":
        return final_events_df[final_events_df[drugs_column].isin(drug_filter)]
    return final_events_df

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Apply the configured event / drug filters to the de-duplicated reports.
final_data_filtered = dataset_event_filters(
    Reports_faers_deduplicate_df,
    events_mode,
    events_frequency_selection,
    manufacturer_filter,
)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Generate the list of pairs (adverse event X drug) from the distinct values
# left after filtering; the adverse event is the first item of each pair.
list_pt = final_data_filtered[events_column].value_counts().index.tolist()
list_drug = final_data_filtered[drugs_column].value_counts().index.tolist()
list_drug_event = [(pt, drug) for pt in list_pt for drug in list_drug]

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
def statistical_metrics(dataset, list_drug_pt):
    """Compute the measures of disproportionality of one (adverse event, drug) pair.

    Builds the 2x2 contingency table of the pair over ``dataset`` and derives
    the Proportional Reporting Ratio (PRR), the Reporting Odds Ratio (ROR),
    their 95% confidence intervals and the value reported as the Multi-item
    Gamma Poisson Shrinker (MGPS) mean, EBGM (see wiki for reference).

    Parameters
    ----------
    dataset : pandas.DataFrame
        Reports holding the ``drugs_column`` and ``events_column`` columns.
    list_drug_pt : sequence
        ``(adverse event, drug)``: the event comes first, the drug second.

    Returns
    -------
    tuple
        ``(prr, ror, prr_lower_lim, prr_upper_lim, ror_lower_lim,
        ror_upper_lim, EBGM, result_drug_event, result_nodrug_noevent,
        result_nodrug_event, result_drug_noevent, drug_freq_perc,
        event_freq_perc)``.  The seven ratio / interval values are all 0 when
        any cell of the contingency table is empty.  The two frequencies are
        fractions of the counted rows (not multiplied by 100) and are 0 when
        no row is counted.
    """
    pt_name = list_drug_pt[0]
    drug_name = list_drug_pt[1]

    # Compare each column once and combine the masks, instead of re-scanning
    # the frame for every cell of the table.
    is_drug = dataset[drugs_column] == drug_name
    is_event = dataset[events_column] == pt_name

    # int() keeps the counts as plain Python ints so the arithmetic below does
    # not silently turn into numpy inf / nan.
    # drug of interest and adverse event of interest
    result_drug_event = int((is_drug & is_event).sum())
    # all the other drugs and all the other events
    result_nodrug_noevent = int((~is_drug & ~is_event).sum())
    # all the other drugs and the event of interest
    result_nodrug_event = int((~is_drug & is_event).sum())
    # drug of interest and all the other events
    result_drug_noevent = int((is_drug & ~is_event).sum())

    total_reports = (result_drug_event + result_nodrug_noevent
                     + result_nodrug_event + result_drug_noevent)
    if total_reports:
        drug_freq_perc = (result_drug_event + result_drug_noevent) / total_reports
        event_freq_perc = (result_drug_event + result_nodrug_event) / total_reports
    else:
        # Nothing to count (e.g. an empty sub-population): this used to raise
        # an uncaught ZeroDivisionError.
        drug_freq_perc = 0
        event_freq_perc = 0

    if 0 in (result_drug_event, result_drug_noevent, result_nodrug_event, result_nodrug_noevent):
        # An empty cell makes the ratios or their standard error undefined;
        # every metric is reported as 0 in that case.
        prr = ror = EBGM = 0
        prr_lower_lim = prr_upper_lim = 0
        ror_lower_lim = ror_upper_lim = 0
    else:
        # Ratio of the observed pair count to the count expected from the
        # drug and event totals.
        EBGM = (result_drug_event * total_reports) / ((result_drug_event + result_drug_noevent) * (result_drug_event + result_nodrug_event))

        # Standard error shared by both log-ratios; Fractions keep the sum exact.
        se = math.sqrt(frac(1, result_drug_event) + frac(1, result_nodrug_event) + frac(1, result_drug_noevent) + frac(1, result_nodrug_noevent))

        # Reporting Odds Ratio (ROR) and its 95% confidence interval.
        ror = (result_drug_event / result_drug_noevent) / (result_nodrug_event / result_nodrug_noevent)
        ln_ror = ln(ror)
        ror_lower_lim = math.exp(ln_ror - (1.96 * se))
        ror_upper_lim = math.exp(ln_ror + (1.96 * se))

        # Proportional Reporting Ratio (PRR) and its 95% confidence interval.
        # NOTE(review): the usual PRR is [a/(a+b)] / [c/(c+d)] with standard
        # error sqrt(1/a - 1/(a+b) + 1/c - 1/(c+d)), where a = drug & event,
        # b = drug & other events, c = other drugs & event, d = the rest.
        # The expression below is [a/(a+c)] / [b/(b+d)] and reuses the ROR
        # standard error.  Kept unchanged so the published figures do not
        # move - confirm against the wiki.
        prr = (result_drug_event / (result_drug_event + result_nodrug_event)) / (result_drug_noevent / (result_drug_noevent + result_nodrug_noevent))
        ln_prr = ln(prr)
        prr_lower_lim = math.exp(ln_prr - (1.96 * se))
        prr_upper_lim = math.exp(ln_prr + (1.96 * se))

    return (prr, ror, prr_lower_lim, prr_upper_lim, ror_lower_lim, ror_upper_lim, EBGM, result_drug_event, result_nodrug_noevent, result_nodrug_event, result_drug_noevent, drug_freq_perc, event_freq_perc)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Split the (adverse event, drug) pairs into two parallel lists.
drug_list = [drug for _event, drug in list_drug_event]
adverse_event_list = [event for event, _drug in list_drug_event]

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Call statistical_metrics to generate the statistics for all the combinations
# of (drug X adverse event) in list_drug_event, then repeat the process for
# each sub-population (F, M).
#
# WARNING: "ValueError: not enough values to unpack (expected 13, got 0)" means
# list_drug_event is empty - check that the manufacturer and event frequency
# filters leave at least one pair.

# Whole population.
(
    PropReportingRatio_all, PropOddRatio_all,
    PropReportingRatio_lower_all, PropReportingRatio_upper_all,
    PropOddRation_lower_all, PropOddRation_upper_all,
    Bayes_Mean_all,
    drug_event_freq_all, drugother_eventother_freq_all,
    drugother_event_freq_all, drug_eventother_freq_all,
    drug_freq_all, event_freq_all,
) = (
    list(metric_column)
    for metric_column in zip(
        *(statistical_metrics(Reports_faers_deduplicate_df, pair) for pair in list_drug_event)
    )
)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Bare expression: shows the pair counts when run as a notebook cell.
drug_event_freq_all

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Female sub-population.
(
    PropReportingRatio_F, PropOddRatio_F,
    PropReportingRatio_lower_F, PropReportingRatio_upper_F,
    PropOddRation_lower_F, PropOddRation_upper_F,
    Bayes_Mean_F,
    drug_event_freq_F, drugother_eventother_freq_F,
    drugother_event_freq_F, drug_eventother_freq_F,
    drug_freq_F, event_freq_F,
) = (
    list(metric_column)
    for metric_column in zip(
        *(statistical_metrics(population_F_df, pair) for pair in list_drug_event)
    )
)

# Male sub-population.
(
    PropReportingRatio_M, PropOddRatio_M,
    PropReportingRatio_lower_M, PropReportingRatio_upper_M,
    PropOddRation_lower_M, PropOddRation_upper_M,
    Bayes_Mean_M,
    drug_event_freq_M, drugother_eventother_freq_M,
    drugother_event_freq_M, drug_eventother_freq_M,
    drug_freq_M, event_freq_M,
) = (
    list(metric_column)
    for metric_column in zip(
        *(statistical_metrics(population_M_df, pair) for pair in list_drug_event)
    )
)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Columns of the combined table: one row per (drug, adverse event) pair, with
# the ratios for the whole population and for each sub-population side by side.
# Only the lower confidence bounds are exported here.  The occurrence counts
# and frequencies are those of the whole population.
data_all = {
    drugs_column: drug_list,
    events_column: adverse_event_list,
    'PRR_all': PropReportingRatio_all,
    'PRR_F': PropReportingRatio_F,
    'PRR_M': PropReportingRatio_M,
    'ROR_all': PropOddRatio_all,
    'ROR_F': PropOddRatio_F,
    'ROR_M': PropOddRatio_M,
    'PRR_CI_L_all': PropReportingRatio_lower_all,
    'PRR_CI_L_F': PropReportingRatio_lower_F,
    'PRR_CI_L_M': PropReportingRatio_lower_M,
    'ROR_CI_L_all': PropOddRation_lower_all,
    'ROR_CI_L_F': PropOddRation_lower_F,
    'ROR_CI_L_M': PropOddRation_lower_M,
    'EBGM_all': Bayes_Mean_all,
    # Fixed: EBGM_F / EBGM_M used to be fed the "all" / "F" lists.
    'EBGM_F': Bayes_Mean_F,
    'EBGM_M': Bayes_Mean_M,
    # Fixed: this used to be the male count, so the zero-count filter applied
    # to "drug_event_occur" afterwards looked at the male sub-population
    # instead of the whole population.
    "drug_event_occur": drug_event_freq_all,
    "drugother_eventother_occur": drugother_eventother_freq_all,
    "drugother_event_occur": drugother_event_freq_all,
    "drug_eventother_occur": drug_eventother_freq_all,
    "drug_percentage_freq": drug_freq_all,
    "event_percentage_freq": event_freq_all,
}

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Build the combined table and discard the pairs whose "drug_event_occur"
# count is 0.
df_final_all = pd.DataFrame(data_all)
zero_occurrence_rows_all = df_final_all[df_final_all['drug_event_occur'] == 0].index
df_final_all.drop(zero_occurrence_rows_all, inplace=True)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Inner-join with the (event, drug) columns of the reports, then collapse the
# repeated rows the join produces.
df_final_merge_all = pd.merge(
    df_final_all,
    Reports_faers_deduplicate_df[[events_column, drugs_column]],
    how='inner',
    on=[events_column, drugs_column],
).drop_duplicates()

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Report-level view: every report row joined with the statistics of its pair.
# NOTE(review): `merged` is not referenced again in the visible part of this
# file - confirm whether it is still needed.
merged = pd.merge(
    Reports_faers_deduplicate_df,
    df_final_all,
    how='inner',
    on=[events_column, drugs_column],
)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Female sub-population: statistics of every (drug, adverse event) pair.
data_F = {
    drugs_column: drug_list,
    events_column: adverse_event_list,
    'PRR': PropReportingRatio_F,
    'ROR': PropOddRatio_F,
    'PRR_CI_L': PropReportingRatio_lower_F,
    'PRR_CI_U': PropReportingRatio_upper_F,
    'ROR_CI_L': PropOddRation_lower_F,
    'ROR_CI_U': PropOddRation_upper_F,
    'EBGM': Bayes_Mean_F,
    "drug_event_occur": drug_event_freq_F,
    "drugother_eventother_occur": drugother_eventother_freq_F,
    "drugother_event_occur": drugother_event_freq_F,
    "drug_eventother_occur": drug_eventother_freq_F,
    "drug_percentage_freq": drug_freq_F,
    "event_percentage_freq": event_freq_F,
}

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Discard the pairs whose "drug_event_occur" count is 0.
df_final_F = pd.DataFrame(data_F)
zero_occurrence_rows_F = df_final_F[df_final_F['drug_event_occur'] == 0].index
df_final_F.drop(zero_occurrence_rows_F, inplace=True)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Attach the report-level columns of the female reports to each remaining
# pair, then remove fully identical rows.
report_columns_F = [
    primaryid_column,
    total_medications_column,
    date_column,
    events_column,
    drugs_column,
    manufacturer_column,
    age_column,
    outcome_column,
    seriousness_column,
    reporter_country_column,
    reporter_column,
    indication_column,
    drug_interactions_column,
    Interaction_detected_column,
    indication_unknown_column,
]
df_final_merge_F = pd.merge(
    df_final_F,
    population_F_df[report_columns_F],
    how='inner',
    on=[events_column, drugs_column],
).drop_duplicates()

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Male sub-population: statistics of every (drug, adverse event) pair.
data_M = {
    drugs_column: drug_list,
    events_column: adverse_event_list,
    'PRR': PropReportingRatio_M,
    'ROR': PropOddRatio_M,
    'PRR_CI_L': PropReportingRatio_lower_M,
    'PRR_CI_U': PropReportingRatio_upper_M,
    'ROR_CI_L': PropOddRation_lower_M,
    'ROR_CI_U': PropOddRation_upper_M,
    'EBGM': Bayes_Mean_M,
    "drug_event_occur": drug_event_freq_M,
    "drugother_eventother_occur": drugother_eventother_freq_M,
    "drugother_event_occur": drugother_event_freq_M,
    "drug_eventother_occur": drug_eventother_freq_M,
    "drug_percentage_freq": drug_freq_M,
    "event_percentage_freq": event_freq_M,
}

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Discard the pairs whose "drug_event_occur" count is 0.
df_final_M = pd.DataFrame(data_M)
zero_occurrence_rows_M = df_final_M[df_final_M['drug_event_occur'] == 0].index
df_final_M.drop(zero_occurrence_rows_M, inplace=True)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Attach the report-level columns of the male reports to each remaining pair,
# then remove fully identical rows.
report_columns_M = [
    primaryid_column,
    total_medications_column,
    date_column,
    events_column,
    drugs_column,
    manufacturer_column,
    age_column,
    outcome_column,
    seriousness_column,
    reporter_country_column,
    reporter_column,
    indication_column,
    drug_interactions_column,
    Interaction_detected_column,
    indication_unknown_column,
]
df_final_merge_M = pd.merge(
    df_final_M,
    population_M_df[report_columns_M],
    how='inner',
    on=[events_column, drugs_column],
).drop_duplicates()

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Recipe outputs: open the three output dataset handles, then write each
# result frame to its dataset.
Disproportionality_stats = dataiku.Dataset("disproportionality_stats")
Disproportionality_pairs_F = dataiku.Dataset("disproportionality_pairs_F")
Disproportionality_pairs_M = dataiku.Dataset("disproportionality_pairs_M")

Disproportionality_stats.write_with_schema(df_final_merge_all)
Disproportionality_pairs_F.write_with_schema(df_final_merge_F)
Disproportionality_pairs_M.write_with_schema(df_final_merge_M)