# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
import dataiku
from dataiku import pandasutils as pdu
import pandas as pd
import ast
import requests
import json

# custom functions that can be found within Libraries tab > G+L
from census_api_functions import get_project_variables, get_query_text, state_name_list

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Run-level settings.
# Survey year substituted into every Census API URL built in this recipe.
year = 2022
# API key, obtained through the project-library helper.
census_api_key = get_project_variables('standard','api_key')

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# State-level query URL; it is handed to state_name_list() below.
url_path = (
    f"https://api.census.gov/data/{year}/acs/acs5/profile"
    f"?get=NAME&for=state:*&key={census_api_key}"
)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# State identifiers; each one is later substituted into the `in=state:` clause
# of the per-state county queries.
state_nums_list = state_name_list(url_path)

def get_county_code_dataset(census_code):
    """Download one ACS 5-year table group for all counties, state by state.

    One Census API request is issued per entry of the module-level
    ``state_nums_list`` (using the module-level ``year`` and
    ``census_api_key``), and the per-state results are stacked.

    Args:
        census_code: Table group identifier (e.g. ``'B09001'``) inserted into
            the ``group(...)`` clause of the query.

    Returns:
        pandas.DataFrame with one row per record returned by the API and every
        response column except the first one (the leading ``NAME`` requested
        via ``get=NAME``). An empty DataFrame is returned when no state could
        be processed.
    """
    state_frames = []
    for state in state_nums_list:
        try:
            query_url = f"https://api.census.gov/data/{year}/acs/acs5?get=NAME,group({census_code})&for=county:*&in=state:{state}&key={census_api_key}"
            rows = json.loads(get_query_text(query_url))
            # The first row of the response holds the column names.
            state_df = pd.DataFrame(rows[1:], columns=rows[0])
            # Drop the leading NAME column.
            state_frames.append(state_df.iloc[:, 1:])
        except Exception as e:
            # Best-effort: a state that fails is reported and skipped so the
            # remaining states are still collected.
            print(f"Error processing state {state}: {e}")
            continue

    # pd.concat raises on an empty list, so keep the "nothing fetched" result
    # an empty DataFrame.
    if not state_frames:
        return pd.DataFrame()

    # Concatenate once instead of once per state: repeated concat re-copies
    # the accumulated rows on every iteration.
    return pd.concat(state_frames, ignore_index=True)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# For each census table group, call the API to gather its county-level dataset from the U.S. Census Bureau

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Table group B09001: keep the identifier columns plus B09001_001E, which is
# exposed as E_AGE17. This is the only frame that carries 'state' and 'county'.
all_tracts_df_B09001 = (
    get_county_code_dataset('B09001')[['GEO_ID', 'state', 'county', 'B09001_001E']]
    .rename(columns={'B09001_001E': 'E_AGE17'})
)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Table group B16005 -> two derived columns:
#   E_LIMENG  = row-wise sum of the component estimate columns listed below
#   EP_LIMENG = E_LIMENG as a percentage of B16005_001E
_b16005_total_col = 'B16005_001E'
_b16005_component_cols = [
    'B16005_007E', 'B16005_008E', 'B16005_012E', 'B16005_013E',
    'B16005_017E', 'B16005_018E', 'B16005_022E', 'B16005_023E',
    'B16005_029E', 'B16005_030E', 'B16005_034E', 'B16005_035E',
    'B16005_039E', 'B16005_040E', 'B16005_044E', 'B16005_045E',
]
_b16005_numeric_cols = [_b16005_total_col] + _b16005_component_cols

all_tracts_df_B16005 = get_county_code_dataset('B16005')
# .copy() makes the subset an independent frame, so the column assignments
# below do not act on a slice of the downloaded data.
all_tracts_df_B16005 = all_tracts_df_B16005[['GEO_ID'] + _b16005_numeric_cols].copy()

# Assign by column label: this replaces the columns, so they really end up
# with float dtype. A positional `.iloc[:, 1:] = ...` assignment may write the
# converted values into the existing object-dtype columns instead.
all_tracts_df_B16005[_b16005_numeric_cols] = (
    all_tracts_df_B16005[_b16005_numeric_cols].astype(float)
)

all_tracts_df_B16005['E_LIMENG'] = all_tracts_df_B16005[_b16005_component_cols].sum(axis=1)

# Division of float Series does not raise ZeroDivisionError (it yields
# inf/NaN), so the zero-denominator fallback is applied per row: only rows
# whose total is 0 get EP_LIMENG = 0.
_b16005_total = all_tracts_df_B16005[_b16005_total_col]
all_tracts_df_B16005['EP_LIMENG'] = (
    all_tracts_df_B16005['E_LIMENG'] / _b16005_total * 100
).mask(_b16005_total == 0, 0.0)

# Only GEO_ID and the two derived columns are kept.
all_tracts_df_B16005 = all_tracts_df_B16005.drop(columns=_b16005_numeric_cols)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Table group B11012 -> E_SNGPNT = B11012_010E + B11012_015E.
all_tracts_df_B11012 = get_county_code_dataset('B11012')
# .copy() makes the subset an independent frame, so the column assignments
# below do not act on a slice of the downloaded data.
all_tracts_df_B11012 = all_tracts_df_B11012[['GEO_ID', 'B11012_010E', 'B11012_015E']].copy()

# Assign by column label so the columns are replaced and really become float;
# a positional `.iloc[:, 1:] = ...` assignment may leave them as object dtype.
all_tracts_df_B11012[['B11012_010E', 'B11012_015E']] = (
    all_tracts_df_B11012[['B11012_010E', 'B11012_015E']].astype(float)
)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
all_tracts_df_B11012['E_SNGPNT'] = all_tracts_df_B11012['B11012_010E'] + all_tracts_df_B11012['B11012_015E']

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Only GEO_ID and E_SNGPNT are kept.
all_tracts_df_B11012 = all_tracts_df_B11012.drop(columns=['B11012_010E', 'B11012_015E'])

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Table group B26001: keep GEO_ID and B26001_001E, exposed as E_GROUPQ.
all_tracts_df_B26001 = (
    get_county_code_dataset('B26001')[['GEO_ID', 'B26001_001E']]
    .rename(columns={'B26001_001E': 'E_GROUPQ'})
)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Table group B06009: keep GEO_ID and B06009_002E, exposed as E_NOHSDP.
_b06009_raw = get_county_code_dataset('B06009')
_b06009_kept = _b06009_raw[['GEO_ID', 'B06009_002E']]
all_tracts_df_B06009 = _b06009_kept.rename(columns={'B06009_002E': 'E_NOHSDP'})

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Index every per-table frame by GEO_ID so they can be joined on that key.
dfs = [all_tracts_df_B09001, all_tracts_df_B16005, all_tracts_df_B11012, all_tracts_df_B26001, all_tracts_df_B06009]
dfs = [df.set_index('GEO_ID') for df in dfs]

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# DataFrame.join defaults to how='left', so the rows follow the first frame
# (B09001). reset_index() turns the GEO_ID index back into a regular column so
# the key is part of the columns passed to the writer instead of living only
# in the index.
final_data = dfs[0].join(dfs[1:]).rename_axis('GEO_ID').reset_index()

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Write recipe outputs
B_svi_county = dataiku.Dataset("B_svi_county")
B_svi_county.write_with_schema(final_data)