# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
import dataiku
import requests
import pandas as pd
import io
from sodapy import Socrata
import os

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Socrata data settings: API host, dataset id, app token and row filter
socrata_domain = 'chronicdata.cdc.gov'
socrata_dataset_identifier = 'cwsq-ngmh'
# Prefer a token supplied through the SOCRATA_APP_TOKEN environment variable so
# the credential can be rotated without editing this recipe. The literal is kept
# only as a backward-compatible fallback when the variable is unset or empty.
# TODO: drop the hard-coded fallback once the variable is configured everywhere.
socrata_token = os.environ.get('SOCRATA_APP_TOKEN') or 'XocbdZdcPHOG9nETvDYBwI3Ks'
# Filter expression passed as the `where` clause of every page request
year = "year = '2022'"

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Shared Socrata API client used by the fetch helper below.
# sodapy's default request timeout is 10 seconds, which is easy to exceed when
# requesting pages of tens of thousands of rows, so allow more time per request.
client = Socrata(socrata_domain, socrata_token, timeout=120)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Fetch data with pagination
def fetch_socrata_data(dataset_identifier, where_clause, limit=50000, socrata_client=None):
    """Download every row matching ``where_clause`` from a Socrata dataset.

    Rows are requested page by page (``limit`` rows per request, advancing
    ``offset`` each time) until the API returns an empty or short page.

    Args:
        dataset_identifier: Socrata dataset id passed to ``client.get``.
        where_clause: Filter expression passed as the ``where`` argument.
        limit: Maximum number of rows requested per page.
        socrata_client: Optional client object exposing ``get``; when omitted
            the module-level ``client`` is used.

    Returns:
        A ``pandas.DataFrame`` with one row per fetched record (empty when
        nothing matched).
    """
    api = client if socrata_client is None else socrata_client
    all_results = []
    offset = 0

    while True:
        # A fixed sort order keeps page boundaries stable between requests;
        # without it the same row may appear on two pages or be skipped.
        results = api.get(
            dataset_identifier,
            where=where_clause,
            order=':id',
            limit=limit,
            offset=offset,
        )
        if not results:
            break
        all_results.extend(results)
        offset += limit
        print(f"Fetched {len(all_results)} records so far...")
        # A page shorter than `limit` is the last one; skip the extra
        # round-trip that would only return an empty list.
        if len(results) < limit:
            break

    return pd.DataFrame(all_results)

# Download the rows selected by the `year` filter through the paginated helper,
# then report how many records came back
df = fetch_socrata_data(
    dataset_identifier=socrata_dataset_identifier,
    where_clause=year,
)
print(f"Total records fetched: {len(df)}")

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Select columns and create the Health Reason (disease name) category.
# .copy() makes cdc_df an independent frame, so the column assignments below do
# not raise pandas' SettingWithCopyWarning or depend on view-vs-copy semantics.
cdc_df = df[['data_value', 'locationid', 'statedesc', 'countyname', 'totalpopulation']].copy()
cdc_df.columns = ['Percent Disease Value', 'FIPS', 'State_name', 'County_name', 'Population']
# Label each row as "<categoryid>_<short_question_text>"; rows align on df's index
cdc_df['Health Reason'] = df['categoryid'] + '_' + df['short_question_text']
# Convert the value column to float for the output dataset
cdc_df['Percent Disease Value'] = cdc_df['Percent Disease Value'].astype(float)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Rename metadata: location, area and description information.
# .copy() gives cdc_metadata its own data so the derived code columns added in
# the following cells are not written onto a slice of df.
cdc_metadata = df[['locationid', 'statedesc', 'countyname', 'geolocation', 'category', 'measureid', 'measure', 'short_question_text']].copy()
cdc_metadata.columns = ['FIPS', 'State_name', 'County_name', 'Geolocation', 'Category', 'MeasureID', 'Descriptions', 'Short Text']

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# State code: first two characters of the FIPS value
cdc_metadata['State_code'] = cdc_metadata['FIPS'].astype(str).str[:2]

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# County code: characters 3-5 of the FIPS value. Cast to str first, matching the
# State_code derivation, so the slice works even if FIPS is not a string column.
cdc_metadata['County_code'] = cdc_metadata['FIPS'].astype(str).str.slice(2, 5)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Write recipe outputs: open both output dataset handles, then persist each
# DataFrame together with its schema
# Dataset cdc_dis renamed to cdc_disease by georgia.kouyialis@dataiku.com on 2024-11-20 00:45:28
cdc_data = dataiku.Dataset("cdc_disease")
# Dataset cdc_meta renamed to cdc_metadata by georgia.kouyialis@dataiku.com on 2024-11-20 00:45:18
cdc_data_meta = dataiku.Dataset("cdc_metadata")

cdc_data.write_with_schema(cdc_df)
cdc_data_meta.write_with_schema(cdc_metadata)
