/*
 * Decompiled with CFR 0.152.
 */
package com.dataiku.dip.shaker.processors.expr;

import com.dataiku.dip.datalayer.Column;
import com.dataiku.dip.datalayer.Processor;
import com.dataiku.dip.datalayer.Row;
import com.dataiku.dip.datalayer.SingleInputSingleOutputRowProcessor;
import com.dataiku.dip.datalineage.DatasetPairLineage;
import com.dataiku.dip.datalineage.RecipeLineage;
import com.dataiku.dip.exceptions.IllegalConfigurationException;
import com.dataiku.dip.shaker.model.ProcessorScriptStep;
import com.dataiku.dip.shaker.model.StepParams;
import com.dataiku.dip.shaker.processors.Category;
import com.dataiku.dip.shaker.processors.ProcessorMeta;
import com.dataiku.dip.shaker.processors.ProcessorTag;
import com.dataiku.dip.shaker.processors.expr.TextSimplifier;
import com.dataiku.dip.shaker.processors.expr.TokenizedText;
import com.dataiku.dip.shaker.server.ProcessorDesc;
import com.dataiku.dip.utils.JSON;
import com.dataiku.dip.utils.Pair;
import com.google.common.collect.Sets;
import java.util.Objects;
import java.util.Set;
import org.apache.commons.lang.StringUtils;

public class Tokenizer
extends SingleInputSingleOutputRowProcessor {
    public static final ProcessorMeta<Tokenizer, Parameter> META = new ProcessorMeta<Tokenizer, Parameter>(){

        @Override
        public String getName() {
            return "Tokenizer";
        }

        @Override
        public String getDocPage() {
            return "tokenizer";
        }

        @Override
        public Category getCategory() {
            return Category.TRANSFORMATION;
        }

        @Override
        public Set<ProcessorTag> getTags() {
            return Sets.newHashSet((Object[])new ProcessorTag[]{ProcessorTag.NLP});
        }

        @Override
        public Class<Parameter> stepParamClass() {
            return Parameter.class;
        }

        @Override
        public String getHelp(String language) {
            return this.translate(language, "SHAKER.PROCESSOR.Tokenizer.HELP", "This processor tokenizes (splits in words) a text column.\n\n# Example use case\n\nYou want to perform statistics on the words used in a product catalog or query log.\nTokenization allows you to handle words separately.\n\n# Output\n\nThe tokenizer offers several output modes:\n\n* Convert to array: An array (JSON-formatted) containing the words is generated, either in the input column or in another column.\nThis mode is most useful if you intend to perform some custom processing and need to retain the structure of the original text.\n* One token per row: in this mode, for each token, a new row is generated. The row contains a copy of all other columns in the original row.\nThis mode is most useful if you intend to group by word afterwards.\n* One token per column: in this mode, a new column is generated for each token. For example, if a column contains 4 words, and you use 'out_' as prefix, columns 'out_0', 'out_1', 'out_2' and 'out_3' will be generated.\n\n# Simplification\n\nVery often, you'll want to simplify the text to remove some variance in your text corpus.\nThis processor offers several possible simplifications on the text to tokenize.\n\n" + TextSimplifier.getHelp());
        }

        @Override
        public ProcessorDesc describe(String language) {
            ProcessorDesc pd = new ProcessorDesc(this.getName(), this.translate(language, "SHAKER.PROCESSOR.Tokenizer.DESCRIPTION", 1.actionVerb("Tokenize") + " text"), false).withMNEColParam("inCol", this.translate(language, "SHAKER.PROCESSOR.Tokenizer.DESCRIPTION.IN_COL", "Column")).withMandSParam("outCol", this.translate(language, "SHAKER.PROCESSOR.Tokenizer.DESCRIPTION.OUT_COL", "Output column"));
            TextSimplifier.withParams(language, pd, true);
            return pd;
        }

        @Override
        public Tokenizer build(Parameter params) throws Exception {
            return new Tokenizer(params);
        }

        @Override
        protected Object selfReport(Parameter param) {
            return JSON.deepCopyExcept((Object)((Object)param), (String[])new String[]{"inCol", "outCol"});
        }

        @Override
        public RecipeLineage getUpdatedRecipeLineage(ProcessorScriptStep pss, RecipeLineage previousRecipeLineage) {
            if (!(pss.params instanceof Parameter)) {
                throw new IllegalArgumentException("Unsupported param type: " + pss.params.getClass().getSimpleName());
            }
            Parameter tokenizerParams = (Parameter)pss.params;
            if (StringUtils.isBlank((String)tokenizerParams.inCol)) {
                throw new IllegalConfigurationException("Missing input column information for lineage on the tokenizer processor.");
            }
            RecipeLineage updatedRecipeLineage = new RecipeLineage();
            previousRecipeLineage.getDatasetPairLineages().forEach((datasetPair, previousDatasetPairLineage) -> {
                DatasetPairLineage updatedDatasetPairLineage = new DatasetPairLineage((DatasetPairLineage)previousDatasetPairLineage);
                block0 : switch (tokenizerParams.operation) {
                    case SPLIT: {
                        updatedRecipeLineage.setUncertain(true);
                        for (int i = 0; i < previousDatasetPairLineage.getOutputDatasetSchema().getColumns().size(); ++i) {
                            String columnName;
                            String string = columnName = StringUtils.isBlank((String)tokenizerParams.prefix) ? tokenizerParams.inCol + "_" + i : tokenizerParams.prefix + i;
                            if (!previousDatasetPairLineage.getOutputDatasetSchema().hasColumn(columnName)) break block0;
                            updatedDatasetPairLineage.addFactorizedColumnRelations(tokenizerParams.inCol, columnName);
                        }
                        break;
                    }
                    case TO_JSON: 
                    case FOLD: {
                        if (!StringUtils.isNotBlank((String)tokenizerParams.outCol)) break;
                        if (!Objects.equals(tokenizerParams.outCol, tokenizerParams.inCol)) {
                            updatedDatasetPairLineage.removeRelationsOnColumn(tokenizerParams.outCol);
                        }
                        updatedDatasetPairLineage.addFactorizedColumnRelations(tokenizerParams.inCol, tokenizerParams.outCol);
                    }
                }
                updatedRecipeLineage.setDatasetPairLineage((Pair<String, String>)datasetPair, updatedDatasetPairLineage);
            });
            return updatedRecipeLineage;
        }
    };
    Parameter params;
    Column inCD;
    Column outCD;
    Column nextColCD;
    String nextColName;
    TextSimplifier textSimplifier;

    public Tokenizer(Parameter params) throws Exception {
        this.params = params;
        this.textSimplifier = new TextSimplifier(params);
    }

    public void init() {
        this.inCD = this.getCf().column(this.params.inCol, Processor.ProcessorRole.INPUT_COLUMN);
        if (this.params.operation == TextSimplifier.OperationType.TO_JSON || this.params.operation == TextSimplifier.OperationType.FOLD) {
            this.outCD = this.params.outCol != null && !this.params.outCol.equals("") ? this.getCf().columnAfter(this.params.inCol, this.params.outCol, Processor.ProcessorRole.OUTPUT_COLUMN) : this.getCf().column(this.params.inCol, Processor.ProcessorRole.OUTPUT_COLUMN);
        } else if (this.params.operation == TextSimplifier.OperationType.SPLIT) {
            this.nextColCD = this.getCf().getColumnAfter(this.params.inCol);
            this.nextColName = this.nextColCD == null ? null : this.nextColCD.getName();
        }
    }

    public void processRow(Row row) throws Exception {
        String v = row.get(this.inCD);
        TokenizedText tokenized = new TokenizedText(v);
        this.textSimplifier.simplify(tokenized);
        if (this.params.operation == TextSimplifier.OperationType.TO_JSON) {
            row.put(this.outCD, tokenized.toJSONArray().toString());
            this.getProcessorOutput().emitRow(row);
        } else if (this.params.operation == TextSimplifier.OperationType.FOLD) {
            if (tokenized.size() == 0) {
                row.put(this.outCD, "");
                this.getProcessorOutput().emitRow(row);
            }
            for (int i = 0; i < tokenized.size(); ++i) {
                Row newRow = this.getRf().row();
                for (Column c2 : this.getCf().columns()) {
                    String colVal = row.get(c2);
                    if (StringUtils.isBlank((String)colVal)) continue;
                    newRow.put(c2, colVal);
                }
                newRow.put(this.outCD, tokenized.get(i).toString());
                this.getProcessorOutput().emitRow(newRow);
            }
        } else if (this.params.operation == TextSimplifier.OperationType.SPLIT) {
            for (int i = 0; i < tokenized.size(); ++i) {
                Object prefix = this.params.prefix == null ? this.params.inCol + "_" : this.params.prefix;
                row.put(this.getCf().columnBefore(this.nextColName, (String)prefix + Integer.toString(i), Processor.ProcessorRole.OUTPUT_COLUMN), tokenized.get(i).toString());
            }
            this.getProcessorOutput().emitRow(row);
        }
    }

    public void postProcess() throws Exception {
        this.getProcessorOutput().lastRowEmitted();
    }

    public static class Parameter
    extends TextSimplifier.Parameter
    implements StepParams {
        private static final long serialVersionUID = -1L;
        public String inCol;
        public String outCol;

        @Override
        public void validate() throws IllegalArgumentException {
        }
    }
}

