/*
 * Decompiled with CFR 0.152.
 */
package com.dataiku.dip.shaker.processors.expr;

import com.dataiku.dip.datalayer.Column;
import com.dataiku.dip.datalayer.Processor;
import com.dataiku.dip.datalayer.Row;
import com.dataiku.dip.datalayer.SingleInputSingleOutputRowProcessor;
import com.dataiku.dip.datalineage.DatasetPairLineage;
import com.dataiku.dip.datalineage.RecipeLineage;
import com.dataiku.dip.shaker.model.ProcessorScriptStep;
import com.dataiku.dip.shaker.processors.Category;
import com.dataiku.dip.shaker.processors.ProcessorMeta;
import com.dataiku.dip.shaker.processors.ProcessorTag;
import com.dataiku.dip.shaker.processors.expr.RecursiveCharacterTextSplitter;
import com.dataiku.dip.shaker.server.ProcessorDesc;
import com.dataiku.dip.utils.JSON;
import com.dataiku.dip.utils.Pair;
import com.google.common.collect.Sets;
import java.util.List;
import java.util.Objects;
import java.util.Set;
import org.apache.commons.lang.StringUtils;

public class SplitIntoChunks
extends SingleInputSingleOutputRowProcessor {
    public static final ProcessorMeta<SplitIntoChunks, Parameter> META = new ProcessorMeta<SplitIntoChunks, Parameter>(){

        @Override
        public String getName() {
            return "SplitIntoChunks";
        }

        @Override
        public String getDocPage() {
            return "split-into-chunks";
        }

        @Override
        public Category getCategory() {
            return Category.TRANSFORMATION;
        }

        @Override
        public Set<ProcessorTag> getTags() {
            return Sets.newHashSet((Object[])new ProcessorTag[]{ProcessorTag.NLP});
        }

        @Override
        public Class<Parameter> stepParamClass() {
            return Parameter.class;
        }

        @Override
        public String getHelp(String language) {
            return this.translate(language, "SHAKER.PROCESSOR.SplitIntoChunks.HELP", "This processor splits text into chunks (one row per chunk) using a recursive splitter approach.\n\n# Example use case\nYou want to perform embedding and semantic search on a corpus of text documents for Retrieval-Augmented Generation (RAG). Splitting the text into smaller chunks ensures that each piece of text can be embedded into a vector store efficiently.\n\n# Output\nFor each chunk, a new row is generated. The row contains a copy of all other columns in the original row and a new chunk column.\n# Options\n* Maximum Chunk Size: Define the maximum number of characters each chunk can contain. This helps to keep chunks within manageable sizes for embedding.\n* Chunk Overlap: Specify the number of characters that should overlap between consecutive chunks. This is useful for ensuring context continuity across chunks.\n* Separators: Specify the separators to consider for splitting the text. By default, the separators are (in the following order) double new lines, new line, space and between any character. The order of separators matters as the processor will apply them sequentially. You can add or remove separators as required.\n\n* Regular expressions can also be used to specify separators, providing additional flexibility for complex text structures.\n\n* Keep Separators: you can choose whether the separators should be kept in the output chunks or removed.");
        }

        @Override
        public ProcessorDesc describe(String language) {
            ProcessorDesc ps2 = ProcessorDesc.withCustomForm(this.getName(), this.translate(language, "SHAKER.PROCESSOR.SplitIntoChunks.DESCRIPTION", 1.actionVerb("Split") + " column into chunks"));
            ps2.withMNEColParam("inCol", "");
            ps2.withMNEColParam("outCol", "");
            ps2.withColParam("chunkIdCol", "");
            RecursiveCharacterTextSplitter.withParams(ps2);
            return ps2;
        }

        @Override
        public SplitIntoChunks build(Parameter params) {
            return new SplitIntoChunks(params);
        }

        @Override
        public RecipeLineage getUpdatedRecipeLineage(ProcessorScriptStep pss, RecipeLineage previousRecipeLineage) {
            if (!(pss.params instanceof Parameter)) {
                throw new IllegalArgumentException("Unsupported param type: " + pss.params.getClass().getSimpleName());
            }
            Parameter splitParam = (Parameter)pss.params;
            if (StringUtils.isBlank((String)splitParam.inCol)) {
                throw new IllegalArgumentException("Split into chunks: Input column cannot be blank.");
            }
            if (StringUtils.isBlank((String)splitParam.outCol)) {
                throw new IllegalArgumentException("Split into chunks: Output column cannot be blank.");
            }
            RecipeLineage updatedRecipeLineage = new RecipeLineage();
            previousRecipeLineage.getDatasetPairLineages().forEach((datasetPair, previousDatasetPairLineage) -> {
                DatasetPairLineage updatedDatasetPairLineage = new DatasetPairLineage((DatasetPairLineage)previousDatasetPairLineage);
                updatedDatasetPairLineage.removeRelationsOnColumn(splitParam.outCol);
                updatedDatasetPairLineage.addFactorizedColumnRelations(splitParam.inCol, splitParam.outCol);
                if (StringUtils.isNotBlank((String)splitParam.chunkIdCol)) {
                    updatedDatasetPairLineage.removeRelationsOnColumn(splitParam.chunkIdCol);
                    updatedDatasetPairLineage.addFactorizedColumnRelations(splitParam.inCol, splitParam.chunkIdCol);
                }
                updatedRecipeLineage.setDatasetPairLineage((Pair<String, String>)datasetPair, updatedDatasetPairLineage);
            });
            return updatedRecipeLineage;
        }

        @Override
        protected Object selfReport(Parameter param) {
            return JSON.deepCopyExcept((Object)param, (String[])new String[]{"inCol", "outCol"});
        }
    };
    Parameter params;
    Column inCD;
    Column outCD;
    boolean withChunkIdCol = false;
    Column chunkIdCD;
    RecursiveCharacterTextSplitter splitter;

    public SplitIntoChunks(Parameter params) {
        this.params = params;
    }

    public void init() {
        this.inCD = this.getCf().column(this.params.inCol, Processor.ProcessorRole.INPUT_COLUMN);
        this.outCD = this.getCf().columnAfter(this.params.inCol, this.params.outCol, Processor.ProcessorRole.OUTPUT_COLUMN);
        if (StringUtils.isNotBlank((String)this.params.chunkIdCol)) {
            this.chunkIdCD = this.getCf().columnAfter(this.params.outCol, this.params.chunkIdCol, Processor.ProcessorRole.OUTPUT_COLUMN);
            this.withChunkIdCol = true;
        }
        this.splitter = new RecursiveCharacterTextSplitter(this.params);
    }

    public void processRow(Row row) throws Exception {
        String v = row.get(this.inCD);
        if (StringUtils.isEmpty((String)v)) {
            return;
        }
        int chunkId = 0;
        List<String> chunks = this.splitter.splitText(v);
        for (String chunk : chunks) {
            Row newRow = this.getRf().row();
            for (Column c2 : this.getCf().columns()) {
                String colVal;
                if (Objects.equals(c2.getName(), this.outCD.getName()) || StringUtils.isBlank((String)(colVal = row.get(c2)))) continue;
                newRow.put(c2, colVal);
            }
            newRow.put(this.outCD, chunk);
            if (this.withChunkIdCol) {
                newRow.put(this.chunkIdCD, chunkId);
            }
            this.getProcessorOutput().emitRow(newRow);
            ++chunkId;
        }
    }

    public void postProcess() throws Exception {
        this.getProcessorOutput().lastRowEmitted();
    }

    public static class Parameter
    extends RecursiveCharacterTextSplitter.Parameter {
        private static final long serialVersionUID = -1L;
        public String inCol = "";
        public String outCol = "";
        public String chunkIdCol = "";
    }
}

