/*
 * Decompiled with CFR 0.152.
 */
package com.dataiku.dip.docextraction;

import com.dataiku.common.stereotype.PartOfPublicAPI;
import com.dataiku.dip.connections.AbstractSQLConnection;
import com.dataiku.dip.docextraction.Content;
import com.dataiku.dip.docextraction.StructuredContentDTO;
import com.dataiku.dip.docextraction.common.InputRefs;
import com.dataiku.dip.docextraction.common.chunks.TextFirstExtractionChunk;
import com.dataiku.dip.docextraction.common.chunks.VlmExtractionChunk;
import com.dataiku.dip.recipes.nlp.rag_embedding.RAGEmbeddingRecipeCreator;
import com.dataiku.dip.utils.ExceptionUtils;
import com.dataiku.dss.shadelib.com.google.common.base.Strings;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
import javax.annotation.Nullable;

public abstract class Extractor {
    /**
     * Placeholder token embedded in prompt templates. The helper methods of this class replace it
     * with a decimal character count via {@code String.replace}.
     */
    public static final String PROMPT_LIMIT_PLACEHOLDER = "__DKU_NUMBER_OF_CHARS__";
    /**
     * Template of the default VLM image-annotation prompt. The {@code %s} slot is filled at class
     * initialization with the literal {@code __DKU_NUMBER_OF_CHARS__} (same value as
     * {@link #PROMPT_LIMIT_PLACEHOLDER}), so the resulting string still contains the placeholder.
     */
    public static final String VLM_ANNOTATION_PROMPT_TEMPLATE = String.format(" You are analysing a document image to extract structured information for downstream processing and retrieval (RAG). Your goal is to extract all meaningful, reusable content in a markdown format up to %s characters.\n Instructions:\n 1. Classify the image type: diagram, chart, table, handwriting, printed text, or mixed.\n 2. Extract and describe based on type:\n - Diagram or Chart: Provide a detailed description of the structure, relationships, labels, axes, values, legends, and overall meaning. Convert all meaningful content into text. Summarize insights if relevant.\n - Handwritten or Printed Text: Transcribe all visible text accurately. Preserve formatting and structure. Use markdown headings, bullets, or code blocks as appropriate.\n - Tables: Reconstruct the table as markdown. Ensure all rows and columns are preserved. Retain headings and numerical precision.\n - Mixed Content: Separate and extract each part individually (e.g., transcribe text, describe chart, and format table).\n 3. Preserve structure and context: Use markdown to reflect hierarchy, formatting, and clarity.\n 4. Avoid assumptions or hallucinations. If any part is unclear, describe it as such.\n 5. Ensure each section is a reusable, standalone chunk.\n If the image is a logo, signature, or QR code, don't describe it and return an empty string.\n", "__DKU_NUMBER_OF_CHARS__");
    /**
     * Template of the VLM summary prompt. As above, the {@code %s} slot is filled with the literal
     * {@code __DKU_NUMBER_OF_CHARS__}, leaving the placeholder in the final string.
     */
    public static final String VLM_SUMMARY_EXTRACTION_PROMPT_TEMPLATE = String.format("Generate a concise summary, up to %s characters, derived from the screenshot(s) of document page(s) provided.\nBegin with a brief overview and highlight crucial words, facts, or concepts to enhance both semantic and keyword searchability.\nOmit any references to the original source.\n", "__DKU_NUMBER_OF_CHARS__");
    /** Fixed VLM prompt for full-text extraction; it contains no character-limit placeholder. */
    public static final String VLM_FULL_EXTRACTION_PROMPT = "Extract the information from the screenshot(s) of document page(s) provided at the end, maintaining the original text without alteration. Follow these guidelines:\n* Use Markdown to format the text, including headers such as Title, Subtitle, and Main Sections, as well as any tables present in the document.\n* Describe any images and charts within the document screenshot(s), as the visual content cannot be directly extracted.\n* Exclude any hyperlinks, as they cannot be extracted from an image.\n* Ensure the text content remains unchanged.\n* Extract all text, including any footers or reference lists.\n";

    /**
     * Builds an extraction prompt from a template, given a limit expressed in tokens.
     *
     * <p>The token limit is passed to
     * {@code RAGEmbeddingRecipeCreator.adaptDefaultChunkSizeCharacters} to obtain a character
     * count, which is then substituted into the template.
     *
     * @param extractionPromptTemplate template that may contain {@link #PROMPT_LIMIT_PLACEHOLDER}
     * @param maxTokensLimit token limit; may be {@code null}
     * @return the result of {@link #getExtractionPromptFromCharsLimit(String, int)} for the
     *     derived character count
     */
    public static String getExtractionPromptFromTokensLimit(String extractionPromptTemplate, @Nullable Integer maxTokensLimit) {
        int charsLimit = RAGEmbeddingRecipeCreator.adaptDefaultChunkSizeCharacters(maxTokensLimit);
        return getExtractionPromptFromCharsLimit(extractionPromptTemplate, charsLimit);
    }

    /**
     * Substitutes a character limit into a prompt template.
     *
     * @param extractionPromptTemplate template that may contain {@link #PROMPT_LIMIT_PLACEHOLDER}
     * @param charsLimit number written in place of every occurrence of the placeholder
     * @return the template with the placeholder replaced; a {@code null} or empty template is
     *     returned unchanged
     */
    public static String getExtractionPromptFromCharsLimit(String extractionPromptTemplate, int charsLimit) {
        boolean hasTemplate = extractionPromptTemplate != null && !extractionPromptTemplate.isEmpty();
        return hasTemplate
                ? extractionPromptTemplate.replace(PROMPT_LIMIT_PLACEHOLDER, Integer.toString(charsLimit))
                : extractionPromptTemplate;
    }

    /**
     * Returns the summary extraction prompt for a limit expressed in tokens.
     *
     * @param maxTokensLimit token limit; may be {@code null}
     * @return {@link #VLM_SUMMARY_EXTRACTION_PROMPT_TEMPLATE} processed by
     *     {@link #getExtractionPromptFromTokensLimit(String, Integer)}
     */
    public static String getSummaryExtractionPrompt(@Nullable Integer maxTokensLimit) {
        final String summaryTemplate = VLM_SUMMARY_EXTRACTION_PROMPT_TEMPLATE;
        return getExtractionPromptFromTokensLimit(summaryTemplate, maxTokensLimit);
    }

    /**
     * Returns the default VLM annotation prompt for a limit expressed in tokens.
     *
     * <p>The token limit is passed to
     * {@code RAGEmbeddingRecipeCreator.adaptDefaultChunkSizeCharacters} to obtain the character
     * count used in the prompt.
     *
     * @param maxTokensLimit token limit; may be {@code null}
     * @return the result of {@link #getVlmDefaultAnnotationPromptFromCharsLimit(int)} for the
     *     derived character count
     */
    public static String getVlmDefaultAnnotationPromptFromTokensLimit(@Nullable Integer maxTokensLimit) {
        int charsLimit = RAGEmbeddingRecipeCreator.adaptDefaultChunkSizeCharacters(maxTokensLimit);
        return getVlmDefaultAnnotationPromptFromCharsLimit(charsLimit);
    }

    /**
     * Returns the default VLM annotation prompt with the given character limit written into it.
     *
     * @param charsLimit number written in place of every {@link #PROMPT_LIMIT_PLACEHOLDER}
     *     occurrence in {@link #VLM_ANNOTATION_PROMPT_TEMPLATE}
     * @return the annotation prompt text
     */
    public static String getVlmDefaultAnnotationPromptFromCharsLimit(int charsLimit) {
        String limitText = Integer.toString(charsLimit);
        return VLM_ANNOTATION_PROMPT_TEMPLATE.replace(PROMPT_LIMIT_PLACEHOLDER, limitText);
    }

    public static abstract class Settings {
        public List<AbstractSQLConnection.CustomDatabaseProperty> dkuProperties = new ArrayList<AbstractSQLConnection.CustomDatabaseProperty>();

        @PartOfPublicAPI
        public static class VLM
        extends Settings {
            public int windowSize;
            public int windowOverlap;
            public String llmId;
            public String llmPrompt;
            public boolean aggregateResults = false;

            public String toString() {
                return "windowSize=" + this.windowSize + ", windowOverlap=" + this.windowOverlap + ", llmId=" + this.llmId + ", llmPrompt='" + this.llmPrompt + "'";
            }
        }

        @PartOfPublicAPI
        public static class Structured
        extends TextFirst {
            public int maxSectionDepth = 6;
            public String outputManagedFolderId;
            public VLMAnnotationSettings vlmAnnotationSettings;
            public boolean imageValidation = true;

            @Override
            public String toString() {
                StringBuilder sb = new StringBuilder();
                sb.append("maxSectionDepth=").append(this.maxSectionDepth).append(", outputManagedFolderId=").append(this.outputManagedFolderId).append(", imageValidation=").append(this.imageValidation);
                if (this.vlmAnnotationSettings != null) {
                    sb.append(", vlmId=").append(this.vlmAnnotationSettings.llmId).append(", llmPrompt=").append(this.vlmAnnotationSettings.llmPrompt);
                }
                sb.append(", ").append(super.toString());
                return sb.toString();
            }
        }

        @PartOfPublicAPI
        public static class TextFirst
        extends Settings {
            public ImageHandlingMode imageHandlingMode = ImageHandlingMode.IGNORE;
            public OCRSettings ocrSettings;

            public String toString() {
                StringBuilder sb = new StringBuilder();
                sb.append("imageHandlingMode=").append(this.imageHandlingMode.name());
                if (this.ocrSettings != null) {
                    sb.append(", ocrEngine=").append(this.ocrSettings.ocrEngine.name()).append(", ocrLanguages=").append(this.ocrSettings.ocrLanguages);
                }
                if (this.dkuProperties != null && !this.dkuProperties.isEmpty()) {
                    sb.append(", recipeProperties={");
                    sb.append(this.dkuProperties.stream().map(AbstractSQLConnection.CustomDatabaseProperty::toString).collect(Collectors.joining(", ")));
                    sb.append("}");
                }
                return sb.toString();
            }
        }
    }

    @PartOfPublicAPI
    public static enum ImageHandlingMode {
        OCR,
        VLM_ANNOTATE,
        IGNORE;

    }

    /**
     * LLM identifier and prompt used for VLM image annotation. The prompt defaults to the
     * annotation prompt built with a {@code null} token limit.
     */
    @PartOfPublicAPI
    public static class VLMAnnotationSettings {
        public String llmId;
        public String llmPrompt = getVlmDefaultAnnotationPromptFromTokensLimit(null);
    }

    /**
     * OCR configuration: which engine to use and for which languages. {@code ocrEngine} has no
     * default value; {@code ocrLanguages} defaults to {@code "en"}.
     */
    @PartOfPublicAPI
    public static class OCRSettings {
        public OCREngine ocrEngine;
        public String ocrLanguages = "en";

        /** Selectable OCR engines. */
        public enum OCREngine {
            EASYOCR,
            TESSERACT,
            AUTO
        }
    }

    /** Groups the input holder types referenced by the request classes of this file. */
    public static class Inputs {

        /** Input of a VLM request: a reference to images. */
        @PartOfPublicAPI
        public static class VLM {
            public InputRefs.ImagesRef imagesRef;
        }

        /** Input of a structured or raw-text request: a reference to a document. */
        @PartOfPublicAPI
        public static class TextFirst {
            public InputRefs.DocumentRef document;
        }
    }

    /**
     * Base class of extraction results: either a success ({@code ok == true}) carrying a payload
     * defined by the subclass, or a failure carrying an error message.
     */
    public static abstract class ResponseOrError {
        /** {@code true} for a success, {@code false} for a failure. */
        public boolean ok;
        /** Error description; set by the {@code fromError} factories, {@code null} otherwise. */
        public String errorMessage;

        /** DTO form of a {@link Tree} result, as produced by {@link Tree#toDTO(boolean)}. */
        @PartOfPublicAPI
        public static class TextFirstResponseDTO
        extends ResponseOrError {
            public StructuredContentDTO content;
        }

        /** Result of a VLM extraction: a list of chunks. */
        public static class VLM
        extends ResponseOrError {
            public List<? extends VlmExtractionChunk> chunks = new ArrayList<VlmExtractionChunk>();

            /**
             * Creates a successful result.
             *
             * @param chunks extracted chunks, stored as-is
             * @return a result with {@code ok == true}
             */
            public static VLM fromSuccess(List<? extends VlmExtractionChunk> chunks) {
                VLM resp = new VLM();
                resp.ok = true;
                resp.chunks = chunks;
                return resp;
            }

            /**
             * Creates a failed result.
             *
             * @param e the failure; its message is obtained from
             *     {@code ExceptionUtils.getMessageWithCauses}
             * @return a result with {@code ok == false} and the default empty chunk list
             */
            public static VLM fromError(Throwable e) {
                VLM resp = new VLM();
                resp.ok = false;
                resp.errorMessage = ExceptionUtils.getMessageWithCauses(e);
                return resp;
            }
        }

        /** Result of an extraction whose payload is a {@link Content} object. */
        public static class Tree
        extends ResponseOrError {
            public Content content;

            /**
             * Creates a successful result.
             *
             * @param response extracted content, stored as-is
             * @return a result with {@code ok == true}
             */
            public static Tree fromSuccess(Content response) {
                Tree resp = new Tree();
                resp.ok = true;
                resp.content = response;
                return resp;
            }

            /**
             * Creates a failed result.
             *
             * @param e the failure; its message is obtained from
             *     {@code ExceptionUtils.getMessageWithCauses}
             * @return a result with {@code ok == false} and no content
             */
            public static Tree fromError(Throwable e) {
                Tree resp = new Tree();
                resp.ok = false;
                resp.errorMessage = ExceptionUtils.getMessageWithCauses(e);
                return resp;
            }

            /**
             * Converts this result to its DTO form.
             *
             * @param forExtractContentRecipe forwarded unchanged to {@code Content.toDTO}
             * @return a DTO with the same {@code ok} and {@code errorMessage}; its content is
             *     converted only for a success that actually holds content
             */
            public TextFirstResponseDTO toDTO(boolean forExtractContentRecipe) {
                TextFirstResponseDTO res = new TextFirstResponseDTO();
                res.ok = this.ok;
                res.errorMessage = this.errorMessage;
                // A success without content (e.g. fromSuccess(null)) yields a DTO without content
                // instead of a NullPointerException.
                if (this.ok && this.content != null) {
                    res.content = this.content.toDTO(forExtractContentRecipe);
                }
                return res;
            }
        }

        /** Result of a text-first extraction delivered as a flat list of chunks. */
        public static class Flat
        extends ResponseOrError {
            public List<TextFirstExtractionChunk> chunks = new ArrayList<TextFirstExtractionChunk>();

            /**
             * Creates a successful result.
             *
             * @param chunks extracted chunks, stored as-is
             * @return a result with {@code ok == true}
             */
            public static Flat fromSuccess(List<TextFirstExtractionChunk> chunks) {
                Flat resp = new Flat();
                resp.ok = true;
                resp.chunks = chunks;
                return resp;
            }

            /**
             * Creates a failed result.
             *
             * @param e the failure; its message is obtained from
             *     {@code ExceptionUtils.getMessageWithCauses}
             * @return a result with {@code ok == false} and the default empty chunk list
             */
            public static Flat fromError(Throwable e) {
                Flat resp = new Flat();
                resp.ok = false;
                resp.errorMessage = ExceptionUtils.getMessageWithCauses(e);
                return resp;
            }
        }
    }

    /**
     * Holder for the request types. It cannot be instantiated: its only constructor is private
     * and throws.
     */
    public static class Request {
        private Request() {
            throw new IllegalStateException("Utility class");
        }

        /** Request for a VLM extraction: image inputs plus VLM settings. */
        @PartOfPublicAPI
        public static class VLM {
            public Inputs.VLM inputs = new Inputs.VLM();
            public Settings.VLM settings = new Settings.VLM();

            /** Creates a request with empty inputs and default settings. */
            public VLM() {
            }

            /**
             * Creates a request for the given images.
             *
             * @param imagesRef reference stored in {@code inputs.imagesRef}
             * @param settings settings object that replaces the default one
             */
            public VLM(InputRefs.ImagesRef imagesRef, Settings.VLM settings) {
                this.settings = settings;
                this.inputs.imagesRef = imagesRef;
            }
        }

        /** Request for a structured extraction: a document plus structured settings. */
        @PartOfPublicAPI
        public static class Structured {
            public Inputs.TextFirst inputs = new Inputs.TextFirst();
            public Settings.Structured settings = new Settings.Structured();

            /** Creates a request with empty inputs and default settings. */
            public Structured() {
            }

            /**
             * Creates a request for the given document.
             *
             * @param document reference stored in {@code inputs.document}
             * @param settings settings object that replaces the default one
             */
            public Structured(InputRefs.DocumentRef document, Settings.Structured settings) {
                this.settings = settings;
                this.inputs.document = document;
            }
        }

        /** Request for a raw-text extraction: a document plus text-first settings. */
        @PartOfPublicAPI
        public static class RawText {
            public Inputs.TextFirst inputs = new Inputs.TextFirst();
            public Settings.TextFirst settings = new Settings.TextFirst();

            /** Creates a request with empty inputs and default settings. */
            public RawText() {
            }

            /**
             * Creates a request for the given document.
             *
             * @param document reference stored in {@code inputs.document}
             * @param settings settings object that replaces the default one
             */
            public RawText(InputRefs.DocumentRef document, Settings.TextFirst settings) {
                this.settings = settings;
                this.inputs.document = document;
            }
        }
    }
}

