# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Import necessary modules
import dataiku
import io
import logging
import pytesseract  # For OCR (Optical Character Recognition)
from PIL import Image
from project_utils import load_image, save_image

LANGUAGE = "eng"  # Language code passed to Tesseract when re-running OCR

# Read the "images" dataset, then key its rows by the "index" column;
# those index labels are what the processing loop hands to load_image/save_image
df = dataiku.Dataset("images").get_dataframe()
df = df.set_index("index")

# Folder the images are loaded from and saved back to
folder = dataiku.Folder("vOjkXoGz")

# Index labels of the images that must be dropped from the dataframe
to_remove = list()

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Normalize every image referenced by the dataframe:
#   1. images that are too small are scheduled for removal,
#   2. images whose text is detected as rotated by 270° are rotated and re-OCRed,
#   3. images exceeding the size allowed for GPT-4V are downscaled,
# and any image modified by step 2 or 3 is saved back to the folder.
MIN_SHORT_SIDE = 50  # Images whose short side is below this (in pixels) are discarded
MAX_SHORT_SIDE = 768  # Largest short side (in pixels) allowed for GPT-4V
MAX_LONG_SIDE = 2000  # Largest long side (in pixels) allowed for GPT-4V

for f in df.index:
    image = load_image(folder, f)  # Load the image using a custom utility function
    width, height = image.size
    # The 270° rotation below only swaps width and height, so these two values
    # remain valid for the resize check even after the image has been rotated.
    short_side, long_side = min(height, width), max(height, width)

    # Discard images that are too small
    if short_side < MIN_SHORT_SIDE:
        to_remove.append(f)
        continue

    modified = False
    try:
        # Detect the text orientation of the image
        results = pytesseract.image_to_osd(image, output_type=pytesseract.Output.DICT)
        logging.info("Text orientation for %s: %s", f, results["orientation"])
        # Only a 90° rotation to the left is taken into account to avoid the case
        # in which a rotation is incorrectly detected.
        if results["orientation"] == 270:
            image = image.rotate(results["orientation"], Image.NEAREST, expand=1)
            modified = True
            # The text extracted before the rotation is stale: redo the OCR on the
            # rotated image and update the content in the dataframe
            df.at[f, "content"] = pytesseract.image_to_string(image, lang=LANGUAGE)
            logging.info("Image rotated: %s", f)
    except pytesseract.TesseractError as error:
        # Orientation detection / OCR is best effort: a failure must not stop the
        # run, but it is logged so that it no longer goes unnoticed.
        logging.warning("Tesseract failed for %s, orientation/OCR step skipped: %s", f, error)

    # Resize the image if its dimensions exceed the allowed size for GPT-4V
    if short_side > MAX_SHORT_SIDE or long_side > MAX_LONG_SIDE:
        ratio = min(MAX_SHORT_SIDE / short_side, MAX_LONG_SIDE / long_side)
        # int() truncates, so a very elongated image could end up with a 0-pixel
        # side, which resize() rejects; keep every side at 1 pixel or more.
        new_size = tuple(max(1, int(side * ratio)) for side in image.size)
        image = image.resize(new_size)
        modified = True

    if modified:
        save_image(folder, f, image)

# Drop the images that are too small from the dataframe and reset the index
df = df.drop(to_remove).reset_index()

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Split the dataframe by category. The 'category' column is constant within each
# subset, so it is removed before writing.
is_table = df["category"] == "Table"
is_figure = df["category"] == "Image"

tables_df = df.loc[is_table].drop(columns="category")
figures_df = df.loc[is_figure].drop(columns="category")

# Write each subset to its own Dataiku dataset ("tables" and "figures")
dataiku.Dataset("tables").write_with_schema(tables_df)
dataiku.Dataset("figures").write_with_schema(figures_df)