# -*- coding: utf-8 -*-
import dataiku
import pandas as pd
import json

# Managed folder that the receipt files are read from
folder = dataiku.Folder("iJFvJqJA")

# Look up the LLM whose id is stored in the "LLM_ID" custom variable
LLM_ID = dataiku.get_custom_variables()["LLM_ID"]
_default_project = dataiku.api_client().get_default_project()
llm = _default_project.get_llm(LLM_ID)

# JSON-schema description of every field the LLM has to return for a receipt
_receipt_properties = {
    "company": {
        "type": "string",
        "description": "Company name on the receipt",
    },
    "address": {
        "type": "string",
        "description": "Address on the receipt",
    },
    "total": {
        "type": "number",
        "description": "Total amount on the receipt",
    },
    "date": {
        "type": "string",
        "pattern": "([0-2][0-9]|3[0-1])/(0[1-9]|1[0-2])/20[0-2][0-9]",
        "description": "Date on the receipt",
    },
}

# Function declaration the model is asked to call with the extracted values
_extract_data_function = {
    "name": "extract_data",
    "description": "Extract structured information from receipts",
    "parameters": {
        "type": "object",
        "properties": _receipt_properties,
        "required": ["company", "address", "date", "total"],
    },
}

# Tool list handed to the completion settings (a single function tool)
tools = [{"type": "function", "function": _extract_data_function}]

# Initialize lists to store extracted data.
# Each processed receipt appends exactly one entry to EVERY list, so the
# lists always have equal length and can be turned into DataFrame columns.
filenames = []
companies = []
addresses = []
dates = []
totals = []


def _parse_tool_arguments(resp):
    """Return the first tool call's arguments as a dict, or None if unusable.

    None is returned when the completion did not succeed, when the response
    carries no tool call, or when the arguments are not a JSON object.
    """
    if not resp.success:
        return None
    try:
        raw_arguments = resp._raw["toolCalls"][0]["function"]["arguments"]
        parsed = json.loads(raw_arguments)
    except (KeyError, IndexError, TypeError, ValueError):
        # Missing/empty "toolCalls", unexpected structure, or invalid JSON
        # (json.JSONDecodeError is a ValueError subclass).
        return None
    return parsed if isinstance(parsed, dict) else None


# Iterate over the sample files
for path in folder.list_paths_in_partition():
    segments = path.split("/")
    # The second "/"-separated segment is treated as the top-level folder
    # name; anything outside "SROIE_test_images" is ignored.
    if len(segments) < 2 or segments[1] != "SROIE_test_images":
        continue

    # File name without its extension, whatever the extension length
    # (".jpg", ".jpeg", ...).
    file_id = segments[-1].rsplit(".", 1)[0]

    # Execute the LLM request for information extraction
    completion = llm.new_completion()
    completion.settings["maxOutputTokens"] = 1000
    completion.settings["temperature"] = 0
    completion.settings["tools"] = tools
    # Force the model to answer through the extract_data tool
    completion.settings["toolChoice"] = {
        "type": "tool_name",
        "name": "extract_data",
    }

    mp_message = completion.new_multipart_message()
    mp_message.with_text("Extract key information: company, date, address, and total amount.")
    with folder.get_download_stream(path) as stream:
        mp_message.with_inline_image(stream.read())
    mp_message.add()

    # Get the response and extract the structured data
    resp = completion.execute()
    extracted_data = _parse_tool_arguments(resp)
    if extracted_data is None:
        # Keep a row for this receipt (with empty values) so the failure is
        # visible in the output instead of breaking column alignment.
        print("WARNING: no structured data extracted for {}".format(path))
        extracted_data = {}

    filenames.append(file_id)
    companies.append(extracted_data.get("company"))
    addresses.append(extracted_data.get("address"))
    dates.append(extracted_data.get("date"))
    totals.append(extracted_data.get("total"))


# Assemble the per-receipt lists into named columns (order defines the schema)
result_columns = {
    "ID": filenames,
    "company": companies,
    "address": addresses,
    "date": dates,
    "total": totals,
}
df = pd.DataFrame(result_columns)

# Parse the day-first date strings; values that cannot be parsed become NaT
parsed_dates = pd.to_datetime(df["date"], errors="coerce", dayfirst=True)
df = df.assign(date=parsed_dates)

# Persist the result to the output dataset, letting the DataFrame set its schema
output_dataset = dataiku.Dataset("results_KIE_GPT")
output_dataset.write_with_schema(df)