# -*- coding: utf-8 -*-
import dataiku
import pandas as pd
import json
import regex as re

# Build the "ground_truth_KIE" dataset from the SROIE annotation files stored
# in a Dataiku managed folder: each matching file is parsed as one JSON object,
# tagged with an ID taken from its file name, and becomes one output row.
folder = dataiku.Folder("iJFvJqJA")

# Columns the output always exposes, in this order, even if no file is found.
required_columns = ["company", "date", "address", "total", "ID"]


def _strip_non_numeric(value):
    """Drop every character except digits and dots from a string total.

    Non-string values (numbers, None, NaN) are returned unchanged so that
    ``pd.to_numeric`` can handle them directly.
    """
    if isinstance(value, str):
        return re.sub(r"[^\d.]", "", value)
    return value


# One dict per annotation file. The DataFrame is built once after the loop:
# growing it with pd.concat on every iteration re-copies all rows loaded so
# far and triggers a FutureWarning on recent pandas when a frame is empty.
records = []

# Iterate through each file in the folder
for file in folder.list_paths_in_partition():
    # Only keep paths that start with '/SROIE' and end with 'txt'
    if not re.match(r"^/SROIE.*txt$", file):
        continue

    # The ID is the part of the path starting at 'X' followed by a digit or
    # parenthesis, cut at the first dot; paths without such a part are skipped
    id_match = re.search("X[0-9()].*", file)
    if not id_match:
        continue
    file_id = id_match.group().split(".")[0]

    # Read the whole file and parse it as a UTF-8 encoded JSON object
    with folder.get_download_stream(file) as f:
        data = json.loads(f.read().decode("utf-8"))

    # Add the extracted ID to the parsed record
    data["ID"] = file_id
    records.append(data)

if records:
    df = pd.DataFrame(records)
    # Required columns first, then any extra keys found in the JSON files
    extra_columns = [col for col in df.columns if col not in required_columns]
    df = df.reindex(columns=required_columns + extra_columns)
else:
    df = pd.DataFrame(columns=required_columns)


# Clean and format the 'total' column: remove non-numeric characters from
# string values, then convert to numbers. Anything that still cannot be parsed
# raises, so malformed totals are not written silently.
df["total"] = pd.to_numeric(df["total"].map(_strip_non_numeric), errors="raise")


# Write the cleaned data to a Dataiku dataset
dataiku.Dataset("ground_truth_KIE").write_with_schema(df)