# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# -*- coding: utf-8 -*-
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu
import re
import io

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# set folder specifications
# Identifier of the Dataiku folder that holds the input files.
folder_id = "LbuXL8Es"
# Handle on that folder; convert_files() further down also uses it to open
# download streams.
input_files = dataiku.Folder(folder_id)
# Paths returned by the folder listing; each one is matched against the
# file-name patterns in the next cell.
paths = input_files.list_paths_in_partition()

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Sort every folder path into a bucket according to the file-name prefix it
# starts with ("/DRUG<digits>", "/REAC<digits>", ...).
drug_checked = []
reaction_checked = []
indication_checked = []
therapy_checked = []
demographics_checked = []
outcome_checked = []
sources_checked = []
other_checked = []

# Ordered (pattern, bucket) pairs; the first pattern matching at the start of
# the path wins. THER and RPSR files are not routed to therapy_checked /
# sources_checked, so they end up in other_checked like any unknown file.
_prefix_buckets = (
    (r"/DRUG\d+", drug_checked),
    (r"/REAC\d+", reaction_checked),
    (r"/INDI\d+", indication_checked),
    (r"/DEMO\d+", demographics_checked),
    (r"/OUTC\d+", outcome_checked),
)

for path in paths:
    for pattern, bucket in _prefix_buckets:
        if re.match(pattern, path):
            bucket.append(path)
            break
    else:
        # No known prefix matched.
        other_checked.append(path)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Function that downloads each listed .txt file from the folder, parses it into a dataframe and concatenates the results
def convert_files(file_list):
    """Download "$"-delimited text files from the input folder and stack them.

    Every line of a file, including its header line, is first parsed as data
    (``header=None``); the first row is then promoted to column names and
    removed from the rows. Because the header text is parsed as data, pandas
    infers the column types with that text included. Lines pandas cannot
    parse are skipped (``on_bad_lines='skip'``).

    Args:
        file_list: Paths, inside the module-level ``input_files`` folder
            handle, of the files to load.

    Returns:
        A single DataFrame with the rows of all files and a fresh, unique
        0..n-1 index.

    Raises:
        ValueError: If ``file_list`` yields no files.
    """
    frames = []
    for path in file_list:
        # The with-statement closes the stream; the bytes are buffered so
        # pandas can parse them after the stream is gone.
        with input_files.get_download_stream(path) as stream:
            buffer = io.BytesIO(stream.read())
        raw = pd.read_csv(buffer, sep="$", header=None, on_bad_lines='skip', low_memory=False)
        # Promote the first row to column names and drop it from the data.
        frames.append(raw.rename(columns=raw.iloc[0]).drop(raw.index[0]))

    if not frames:
        raise ValueError("convert_files: no files to convert (file_list is empty)")

    # ignore_index=True: every per-file frame is labelled 1..n, so keeping the
    # labels would produce duplicates and break label-based lookups such as
    # df.loc[df.groupby(...).col.idxmin()].
    return pd.concat(frames, ignore_index=True)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Load and stack the files of every category that is currently processed, in
# the order drug, reaction, indication, demographics, outcome.
# Therapy and sources files are not converted.
(
    medication_df,
    reaction_df,
    indication_df,
    demographics_df,
    outcome_df,
) = map(
    convert_files,
    (
        drug_checked,
        reaction_checked,
        indication_checked,
        demographics_checked,
        outcome_checked,
    ),
)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Replace the role_cod abbreviations (the drug's reported role in the event)
# with readable labels.
medication_dic = dict(
    PS="Primary Suspect Drug",
    SS="Secondary Suspect Drug",
    C="Concomitant",
    I="Interacting",
)
medication_df['role_cod'] = medication_df.role_cod.map(medication_dic)


# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Map the outcome codes to labels and rank every label by seriousness
# (1 = most serious); the rank is what the de-duplication step minimises.
outcome_dic = {
    "DE": "Death",
    "LT": "Life-Threatening",
    "HO": "Hospitalization",
    "DS": "Disability",
    "CA": "Congenital Anomaly",
    "RI": "Intervention",
    "OT": "Other Serious",
}
# One distinct rank per label. "Congenital Anomaly" used to share rank 4 with
# "Disability" while rank 5 was unused, which made ties between the two depend
# on row order.
# NOTE(review): confirm that Disability should outrank Congenital Anomaly.
seriousness_dic = {
    "Death": 1,
    "Life-Threatening": 2,
    "Hospitalization": 3,
    "Disability": 4,
    "Congenital Anomaly": 5,
    "Intervention": 6,
    "Other Serious": 7,
}

outcome_df["outc_cod"] = outcome_df["outc_cod"].map(outcome_dic)
outcome_df["seriousness"] = outcome_df["outc_cod"].map(seriousness_dic)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Remove duplicates on the primaryid key, keeping only the record with the
# most serious outcome (lowest seriousness rank) per primaryid.
# idxmin() returns index labels, so the index must be unique first: the frames
# stacked by convert_files can repeat the same labels once per input file, and
# .loc would then return every row sharing a label instead of one row.
# NOTE(review): a primaryid whose seriousness values are all NaN (unmapped
# outc_cod) has no minimum label to look up - confirm such rows cannot occur.
outcome_df = outcome_df.reset_index(drop=True)
outcome_df = outcome_df.loc[outcome_df.groupby('primaryid').seriousness.idxmin()]

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Translate the occupation codes in occp_cod into readable labels.
occupation_dic = dict(
    MD="Physician",
    PH="Pharmacist",
    HP="Health Professional",
    LW="Lawyer",
    CN="Consumer",
    OT="Other Health Professional",
)
demographics_df["occp_cod"] = demographics_df.occp_cod.map(occupation_dic)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Prepare the age columns for the conversion to years: age_cod is replaced by
# the number of such units in one year (the divisor applied to age in the next
# step) and age is made numeric.
demographics_dic = {
    "DEC": 0.1,   # decades: dividing by 0.1 multiplies the age by 10
    "YR": 1,
    "MON": 12,
    "WK": 52,     # was 53; a year has about 52 weeks
    "DY": 365,
    "HR": 8760,   # 365 * 24
}
# The mapped column is already numeric (unknown units become NaN), so it
# needs no further conversion.
demographics_df['age_cod'] = demographics_df['age_cod'].map(demographics_dic)
# Convert the whole column in one vectorized call instead of element by
# element; like before, a value that cannot be parsed raises.
demographics_df['age'] = pd.to_numeric(demographics_df['age'])

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Age in years, rounded to a whole number: the reported age divided by the
# per-year factor stored in age_cod.
demographics_df['age_years'] = (demographics_df['age'] / demographics_df['age_cod']).round()
#--------------------------------------------------------------------------------
# Map the code of the type of report submitted to a readable label
report_dic = dict(
    EXP="Expedited",
    PER="Periodic",
    DIR="Direct",
)
demographics_df['rept_cod'] = demographics_df.rept_cod.map(report_dic)
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Recipe outputs
# Handles on the output datasets. Therapy and Sources outputs are not written.
medication_faers_input = dataiku.Dataset("Medication_faers_input")
demographics_faers_input = dataiku.Dataset("Demographics_faers_input")
indication_faers_input = dataiku.Dataset("Indication_faers_input")
outcome_faers_input = dataiku.Dataset("Outcome_faers_input")
reaction_faers_input = dataiku.Dataset("Reaction_faers_input")

# Write each dataframe, together with its schema, to its dataset in the order
# medication, demographics, indication, outcome, reaction.
for output_dataset, frame in (
    (medication_faers_input, medication_df),
    (demographics_faers_input, demographics_df),
    (indication_faers_input, indication_df),
    (outcome_faers_input, outcome_df),
    (reaction_faers_input, reaction_df),
):
    output_dataset.write_with_schema(frame)