# -*- coding: utf-8 -*-
import dataiku
import os
import json
import tempfile
import pandas as pd

# Load the three input datasets as dataframes, each re-indexed by its "index"
# column. The datasets are read in the order listed in the tuple.
images_df, texts_df, tables_df = (
    dataiku.Dataset(dataset_name).get_dataframe().set_index("index")
    for dataset_name in ("figures_annotated", "texts", "tables")
)

# Accumulators filled while scanning the input dataframes:
#   docs     - two parallel lists: a document index and the text to embed
#   metadata - document index -> dict of descriptive fields
docs = {"index": [], "content": []}
metadata = {}

# Process text documents.
# A row is kept only when its "text" cell is a string longer than 10
# characters; rows that fail this test get neither a metadata entry nor an
# entry in `docs`.
for i in texts_df.index:
    text = texts_df.at[i, "text"]

    # Missing cells typically surface as NaN (a float) or None. An explicit
    # type check skips both, whereas calling len() on them would raise.
    if not isinstance(text, str) or len(text) <= 10:
        continue

    # Store metadata for the text document
    metadata[i] = {
        "filename": texts_df.at[i, "filename"],
        "page": int(texts_df.at[i, "page"]),
        "points": texts_df.at[i, "points"],
        "type": "text",
    }

    # Register the text itself as content to embed, under the same index
    docs["index"].append(i)
    docs["content"].append(text)

# Process table documents.
# Every table row gets a metadata entry. Its "content" and "caption" cells are
# each added to `docs` when they are strings longer than 10 characters, so one
# table index may appear twice in `docs`.
for i in tables_df.index:
    caption = tables_df.at[i, "caption"]

    # Store metadata for the table document
    metadata[i] = {
        "filename": tables_df.at[i, "filename"],
        "points": tables_df.at[i, "points"],
        "page": int(tables_df.at[i, "page"]),
        # Keep at most 1000 characters of the caption; a missing caption
        # (NaN or None) is stored as an empty string instead of raising.
        "caption": caption[:1000] if isinstance(caption, str) else "",
        "type": "table",
    }

    for text_type in ("content", "caption"):
        text = tables_df.at[i, text_type]

        # The type check skips missing cells (NaN / None), on which len()
        # would raise.
        if isinstance(text, str) and len(text) > 10:
            docs["index"].append(i)
            docs["content"].append(text)

# Process image documents.
# Every image row gets a metadata entry whose "type" comes from the
# "image_type" column. Its "content" and "caption" cells are each added to
# `docs` when they are strings longer than 10 characters, so one image index
# may appear twice in `docs`.
for i in images_df.index:
    caption = images_df.at[i, "caption"]

    # Store metadata for the image document
    metadata[i] = {
        "filename": images_df.at[i, "filename"],
        "points": images_df.at[i, "points"],
        "page": int(images_df.at[i, "page"]),
        "type": images_df.at[i, "image_type"],
        # Keep at most 1000 characters of the caption; a missing caption
        # (NaN or None) is stored as an empty string instead of raising.
        "caption": caption[:1000] if isinstance(caption, str) else "",
    }

    for text_type in ("content", "caption"):
        text = images_df.at[i, text_type]

        # The type check skips missing cells (NaN / None), on which len()
        # would raise.
        if isinstance(text, str) and len(text) > 10:
            docs["index"].append(i)
            docs["content"].append(text)

# Persist the metadata: one row per document index, with the metadata dict
# serialized as a JSON string in the "metadata" column.
metadata_dataset = dataiku.Dataset("metadata")
metadata_columns = {
    "index": list(metadata),
    "metadata": [json.dumps(entry) for entry in metadata.values()],
}
metadata_dataset.write_with_schema(pd.DataFrame.from_dict(metadata_columns))

# Persist the collected texts: the parallel "index" and "content" lists in
# `docs` become the two columns of the output dataset.
to_embed_dataset = dataiku.Dataset("to_embed")
to_embed_frame = pd.DataFrame.from_dict(docs)
to_embed_dataset.write_with_schema(to_embed_frame)
