/*
 * Decompiled with CFR 0.152.
 */
package com.dataiku.dip.recipes.nlp.common.doc_extraction;

import com.dataiku.dip.connections.AbstractSQLConnection;
import com.dataiku.dip.dataflow.exec.filter.FilterDesc;
import com.dataiku.dip.docextraction.StructuredExtractor;
import com.dataiku.dip.docextraction.VLMExtractor;
import com.dataiku.dip.llm.EnrichedLLMStructuredRef;
import com.dataiku.dip.recipes.nlp.rag_embedding.RAGEmbeddingRecipeCreator;
import com.dataiku.dip.utils.DKULogger;
import java.util.List;
import javax.annotation.Nullable;

public class DocExtractionRule {
    /**
     * Filter attached to this rule. The no-arg constructor leaves it unset; the
     * convenience constructor fills it with one file-extension condition per
     * supported extension.
     */
    public FilterDesc filter;

    /** Action selected for this rule; {@link ActionToPerform#VLM} unless overridden. */
    public ActionToPerform actionToPerform = ActionToPerform.VLM;

    /** VLM extractor UI settings (presumably consulted when the action is VLM; confirm against callers). */
    public UiVLMExtractorSettings vlmSettings;

    /** Structured extractor UI settings (presumably consulted when the action is STRUCTURED; confirm against callers). */
    public UIStructuredExtractorSettings structuredSettings;

    /** Chunking settings; stays {@code null} unless set, e.g. by the convenience constructor. */
    @Nullable
    public SplittingSettings splittingSettings;

    /** Content type selected for the multimodal column; no default is assigned here. */
    public MultimodalContentType storeInMultimodalColumn;

    /** Flag controlling re-extraction of unmodified documents; {@code false} by default. */
    public boolean reExtractUnmodifiedDocuments;

    /** Logger shared by all rules. */
    public static final DKULogger logger = DKULogger.getLogger("dku.recipes.nlp.embed_documents.embedDocumentsRule");

    public DocExtractionRule() {
    }

    public DocExtractionRule(ActionToPerform extractor, String[] supportedFileExtensions, @Nullable EnrichedLLMStructuredRef embeddingModelRef) {
        this.actionToPerform = extractor;
        this.filter = new FilterDesc();
        this.filter.enabled = true;
        this.filter.uiData = new FilterDesc.FilterUiData();
        this.filter.uiData.mode = "||";
        for (String file_ext : supportedFileExtensions) {
            FilterDesc.FilterUiCondition extension_condition = new FilterDesc.FilterUiCondition();
            extension_condition.operator = FilterDesc.FilterUiOperator.EQUALS_CASE_INSENSITIVE_STRING.getRepr();
            extension_condition.input = "file extension";
            extension_condition.string = file_ext;
            extension_condition.col = "file name";
            this.filter.uiData.conditions.add(extension_condition);
        }
        this.splittingSettings = new SplittingSettings(embeddingModelRef);
    }

    /**
     * Action a {@code DocExtractionRule} performs on the documents it applies to.
     * Constant order is kept stable because ordinals and names may be persisted.
     */
    public enum ActionToPerform {
        /** No extraction is requested. */
        DONOTEXTRACT,
        /** VLM-based extraction. */
        VLM,
        /** Structured extraction. */
        STRUCTURED
    }

    /** Character-based chunking parameters for a rule. */
    public static class SplittingSettings {
        /** Chunk size, in characters. */
        public int chunkSizeCharacters;
        /** Overlap between chunks, in characters. */
        public int chunkOverlapCharacters;

        /**
         * Computes both values through {@link RAGEmbeddingRecipeCreator}: the chunk
         * size from the model's token limit ({@code null} when no model reference is
         * given), then the overlap from that chunk size.
         *
         * @param ref embedding model reference, or {@code null}
         */
        public SplittingSettings(@Nullable EnrichedLLMStructuredRef ref) {
            int chunkSize = RAGEmbeddingRecipeCreator.adaptDefaultChunkSizeCharacters(ref == null ? null : ref.maxTokensLimit);
            this.chunkSizeCharacters = chunkSize;
            this.chunkOverlapCharacters = RAGEmbeddingRecipeCreator.adaptDefaultOverlapCharacters(chunkSize);
        }
    }

    /**
     * UI-level settings of the structured extractor, convertible into
     * {@link StructuredExtractor.StructuredExtractorSettings}.
     *
     * <p>Inside this class, {@code SplitUnit} refers to the nested enum declared
     * below (DOCUMENT / SECTION), not to the enclosing class's enum of the same name.
     */
    public static class UIStructuredExtractorSettings {
        /** Splitting granularity; sections by default. */
        public SplitUnit splitUnit = SplitUnit.SECTION;
        /** Section depth forwarded to the extractor unless the split unit is DOCUMENT. */
        public Integer maxSectionDepth = 6;
        /** How images are handled; ignored by default. */
        public StructuredExtractor.ImageHandlingMode imageHandlingMode = StructuredExtractor.ImageHandlingMode.IGNORE;
        /** OCR engine, only forwarded in OCR image-handling mode. */
        public StructuredExtractor.OCRSettings.OCREngine ocrEngine = StructuredExtractor.OCRSettings.OCREngine.AUTO;
        /** OCR languages, only forwarded in OCR image-handling mode. */
        public String ocrLanguages = "en";
        /** LLM id, only forwarded in VLM_ANNOTATE image-handling mode. */
        public String llmId = "";
        /** When true, {@link #customVLMPrompt} replaces the default annotation prompt. */
        public boolean useCustomVLMPrompt = false;
        /** Custom annotation prompt, pre-filled with the extractor's default prompt. */
        public String customVLMPrompt = StructuredExtractor.getVlmDefaultAnnotationPromptFromTokensLimit(null);
        /** Forwarded as the extractor's {@code imageValidation} flag. */
        public boolean enableImageClassificationFiltering = true;

        /**
         * Builds the extractor settings matching these UI settings.
         *
         * @param defaultVLMAnnotationPrompt prompt used in VLM_ANNOTATE mode when no
         *        custom prompt is enabled
         * @param dkuProperties properties copied as-is into the result
         * @return a freshly created settings object
         */
        public StructuredExtractor.StructuredExtractorSettings toStructuredExtractorSettings(String defaultVLMAnnotationPrompt, List<AbstractSQLConnection.CustomDatabaseProperty> dkuProperties) {
            StructuredExtractor.StructuredExtractorSettings result = new StructuredExtractor.StructuredExtractorSettings();
            result.dkuProperties = dkuProperties;
            result.imageHandlingMode = this.imageHandlingMode;
            result.imageValidation = this.enableImageClassificationFiltering;

            // Only the sub-settings of the selected image handling mode are populated.
            if (this.imageHandlingMode == StructuredExtractor.ImageHandlingMode.VLM_ANNOTATE) {
                result.vlmAnnotationSettings = new StructuredExtractor.VLMAnnotationSettings();
                result.vlmAnnotationSettings.llmId = this.llmId;
                if (this.useCustomVLMPrompt) {
                    result.vlmAnnotationSettings.llmPrompt = this.customVLMPrompt;
                } else {
                    result.vlmAnnotationSettings.llmPrompt = defaultVLMAnnotationPrompt;
                }
            } else if (this.imageHandlingMode == StructuredExtractor.ImageHandlingMode.OCR) {
                result.ocrSettings = new StructuredExtractor.OCRSettings();
                result.ocrSettings.ocrEngine = this.ocrEngine;
                result.ocrSettings.ocrLanguages = this.ocrLanguages;
            }

            // Splitting per document forces a section depth of 0.
            // NOTE(review): the conditional mixes an int literal with the boxed
            // maxSectionDepth, so a null maxSectionDepth is unboxed here and throws
            // NullPointerException. Confirm null is never expected.
            boolean wholeDocument = this.splitUnit.equals(SplitUnit.DOCUMENT);
            result.maxSectionDepth = wholeDocument ? 0 : this.maxSectionDepth;
            return result;
        }

        /** Splitting granularity of the structured extractor. */
        public enum SplitUnit {
            DOCUMENT,
            SECTION
        }
    }

    /**
     * UI-level settings of the VLM extractor, convertible into
     * {@link VLMExtractor.VLMExtractorSettings}.
     */
    public static class UiVLMExtractorSettings {
        /** Splitting granularity; one page at a time by default. */
        public SplitUnit splitUnit = SplitUnit.PAGE;
        /** Pages per window, used only with {@link SplitUnit#CUSTOM}. */
        public int customNbPages = 1;
        /** Pages shared by consecutive windows, used only with {@link SplitUnit#CUSTOM}. */
        public int customPagesOverlap = 0;
        /** Id of the LLM forwarded to the extractor. */
        public String llmId;
        /** When true, {@link #customPrompt} replaces the prompt passed to the conversion. */
        public boolean useCustomPrompt = false;
        /** Custom extraction prompt, pre-filled with the extractor's summary prompt. */
        public String customPrompt = VLMExtractor.getSummaryExtractionPrompt(null);

        /** Creates settings holding only the field defaults. */
        public UiVLMExtractorSettings() {
        }

        /**
         * Creates settings for the given LLM, pre-filling the custom prompt.
         *
         * @param llmId id of the LLM to use
         * @param defaultVLMExtractionPrompt initial value of {@link #customPrompt}
         */
        public UiVLMExtractorSettings(String llmId, String defaultVLMExtractionPrompt) {
            this.llmId = llmId;
            this.customPrompt = defaultVLMExtractionPrompt;
        }

        /**
         * Builds the extractor settings matching these UI settings.
         *
         * <p>In CUSTOM mode the values are sanitized so the extractor never receives
         * an inconsistent window: the window size is at least 1 and the overlap is
         * kept within {@code [0, windowSize - 1]}. Values that are already valid are
         * forwarded unchanged.
         *
         * @param vlmExtractionPrompt prompt used when no custom prompt is enabled
         * @return a freshly created settings object
         * @throws NullPointerException if {@link #splitUnit} is {@code null}
         */
        public VLMExtractor.VLMExtractorSettings toVLMExtractorSettings(String vlmExtractionPrompt) {
            VLMExtractor.VLMExtractorSettings settings = new VLMExtractor.VLMExtractorSettings();
            settings.llmId = this.llmId;
            switch (this.splitUnit) {
                case CUSTOM: {
                    // A window must span at least one page, and the overlap must be
                    // non-negative and strictly smaller than the window.
                    int windowSize = Math.max(1, this.customNbPages);
                    settings.windowSize = windowSize;
                    settings.windowOverlap = Math.max(0, Math.min(this.customPagesOverlap, windowSize - 1));
                    break;
                }
                case PAGE:
                    settings.windowSize = 1;
                    settings.windowOverlap = 0;
                    break;
                case DOCUMENT:
                    // Single-page windows whose results are flagged for aggregation.
                    settings.windowSize = 1;
                    settings.windowOverlap = 0;
                    settings.aggregateResults = true;
                    break;
                default:
                    // Unknown unit: keep the extractor's own defaults.
                    break;
            }
            settings.llmPrompt = this.useCustomPrompt ? this.customPrompt : vlmExtractionPrompt;
            return settings;
        }
    }

    /**
     * Splitting granularity used by the VLM extractor settings. The structured
     * extractor settings declare their own nested enum with the same simple name.
     * Constant order is kept stable because ordinals and names may be persisted.
     */
    public enum SplitUnit {
        /** One page per window. */
        PAGE,
        /** Window size and overlap come from the user-defined page counts. */
        CUSTOM,
        /** Page-by-page windows with result aggregation enabled. */
        DOCUMENT
    }

    /**
     * Kind of content a rule selects for the multimodal column. What each constant
     * means is decided by the code consuming it, which is not part of this class.
     * Constant order is kept stable because ordinals and names may be persisted.
     */
    public enum MultimodalContentType {
        PROMPT_OUTPUT,
        CHUNKED_PROMPT_OUTPUT,
        FULL_CONTENT,
        IMAGES
    }
}

