# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# -*- coding: utf-8 -*-
import dataiku
import pandas as pd, numpy as np
import ast
from dataiku import pandasutils as pdu

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Recipe input
facilities1 = dataiku.Dataset("clinicaltrialgov_dataset")

# Recipe output
# Dataset facilities2 renamed to facilities_exploded by liheng.fu@dataiku.com on 2023-11-21 16:20:18
facilities2 = dataiku.Dataset("facilities_exploded")

# Output schema: both columns are declared with the 'string' type
schema = [
    {'name': column_name, 'type': 'string'}
    for column_name in ('NCTId', 'Location')
]
facilities2.write_schema(schema)


# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
def _explode_locations(chunk):
    """Return one row per (NCTId, location) pair for a chunk of input rows.

    ``chunk`` is a DataFrame containing at least the ``NCTId`` and
    ``Location`` columns, where ``Location`` holds the string representation
    of a Python list (parsed with ``ast.literal_eval``).

    Rows with a missing ``NCTId`` or ``Location`` are dropped, as are rows
    whose parsed ``Location`` list is empty.

    Raises ``ValueError`` or ``SyntaxError`` (from ``ast.literal_eval``) when
    a ``Location`` value is not a valid Python literal.
    """
    locations = chunk.loc[:, ['NCTId', 'Location']].dropna()

    # Convert string representation of list of dictionaries to actual list of dictionaries
    locations = locations.assign(
        Location=locations['Location'].apply(ast.literal_eval))

    # Explode the 'Location' column into rows
    exploded = locations.explode('Location', ignore_index=True)

    # explode() emits a single NaN row for an empty list; such rows carry no
    # location, so drop them just like the missing values removed above.
    return exploded.dropna(subset=['Location']).reset_index(drop=True)


with facilities2.get_writer() as writer:
    # Stream the input in chunks so the whole dataset never sits in memory.
    for partial_dataframe in facilities1.iter_dataframes(chunksize=3000):
        writer.write_dataframe(_explode_locations(partial_dataframe))

