/*
 * Decompiled with CFR 0.152.
 */
package com.dataiku.dip.shaker.processors.expr;

import com.dataiku.dip.connections.AbstractSQLConnection;
import com.dataiku.dip.datalayer.Column;
import com.dataiku.dip.datalayer.Processor;
import com.dataiku.dip.datalayer.Row;
import com.dataiku.dip.datalayer.SingleRowProcessor;
import com.dataiku.dip.datalineage.DatasetPairLineage;
import com.dataiku.dip.datalineage.RecipeLineage;
import com.dataiku.dip.datasets.Type;
import com.dataiku.dip.exceptions.IllegalConfigurationException;
import com.dataiku.dip.shaker.ProcessorWithRecordedReport;
import com.dataiku.dip.shaker.model.ProcessorScriptStep;
import com.dataiku.dip.shaker.model.StepParams;
import com.dataiku.dip.shaker.processors.Category;
import com.dataiku.dip.shaker.processors.PrepareSnowflakeUDFUtils;
import com.dataiku.dip.shaker.processors.ProcessorCapabilities;
import com.dataiku.dip.shaker.processors.ProcessorMeta;
import com.dataiku.dip.shaker.processors.ProcessorTag;
import com.dataiku.dip.shaker.processors.expr.TextSimplifier;
import com.dataiku.dip.shaker.processors.expr.TokenizedText;
import com.dataiku.dip.shaker.server.ProcessorDesc;
import com.dataiku.dip.shaker.sql.ProcessorSQLTranslator;
import com.dataiku.dip.shaker.sql.SQLQueryWithSchema;
import com.dataiku.dip.shaker.sql.SnowflakeUDFProcessorTranslator;
import com.dataiku.dip.sql.SQLDialect;
import com.dataiku.dip.sql.queries.ExpressionBuilder;
import com.dataiku.dip.util.SecretKeyGenerator;
import com.dataiku.dip.utils.JSON;
import com.dataiku.dip.utils.Pair;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import java.io.IOException;
import java.util.List;
import java.util.Set;
import org.apache.commons.lang.StringUtils;

public class TextSimplifierProcessor {
    public static final ProcessorMeta<StreamImpl, Parameter> META = new ProcessorMeta<StreamImpl, Parameter>(){

        @Override
        public String getName() {
            return "TextSimplifierProcessor";
        }

        @Override
        public String getDocPage() {
            return "simplify-text";
        }

        @Override
        public Category getCategory() {
            return Category.TRANSFORMATION;
        }

        @Override
        public Set<ProcessorTag> getTags() {
            return Sets.newHashSet((Object[])new ProcessorTag[]{ProcessorTag.NLP});
        }

        @Override
        public Class<Parameter> stepParamClass() {
            return Parameter.class;
        }

        @Override
        public String getHelp(String language) {
            return this.translate(language, "SHAKER.PROCESSOR.TextSimplifierProcessor.HELP", "Perform various simplifications on a text column.\n\n# Options\n\n* **Normalize text**: Transform to lowercase, remove punctuation and accents and perform Unicode NFD normalization (*Caf\u00e9* -> *cafe*).\n\n* **Stem words:** Transform each word into its \"stem\", i.e. its grammatical root. For example, *grammatical* is transformed to *grammat*. This transformation is language-specific.\n\n* **Clear stop words**: Remove so-called \"stop words\" (*the*, *I*, *a*, *of*, ...). This transformation is language-specific.\n\n* **Sort words alphabetically:** Sorts all words of the text. For example, *the small dog* is transformed to *dog small the*, allowing strings containing the same words in different order to be matched.\n\n<u>*Note:*</u>  \n   Other processors with text operation \u2014 tokenization, n-gram extraction, fuzzy join \u2014 benefit from built-in text simplification options. You do not need to perform text simplification separately prior to using them.");
        }

        @Override
        public ProcessorDesc describe(String language) {
            ProcessorDesc pd = new ProcessorDesc(this.getName(), this.translate(language, "SHAKER.PROCESSOR.TextSimplifierProcessor.DESCRIPTION", 1.actionVerb("Simplify") + " text"), false).withMNEColParam("inCol", this.translate(language, "SHAKER.PROCESSOR.TextSimplifierProcessor.DESCRIPTION.IN_COL", "Column")).withParam("outCol", "string", false, true, this.translate(language, "SHAKER.PROCESSORS.DESCRIPTION.OUTPUT_COLUMN_EMPTY_FOR_INPLACE", "Output column (empty for in-place)"));
            TextSimplifier.withParams(language, pd, false);
            return pd;
        }

        @Override
        public ProcessorMeta.ProcessorCapabilitiesSummary getCapabilities(StepParams sp, ProcessorWithRecordedReport.ProcessorRecordedReport report, SQLDialect dialect) {
            return this.getCapabilities(sp, report, dialect, null);
        }

        @Override
        public ProcessorMeta.ProcessorCapabilitiesSummary getCapabilities(StepParams params, ProcessorWithRecordedReport.ProcessorRecordedReport report, SQLDialect dialect, AbstractSQLConnection conn) {
            ProcessorMeta.ProcessorCapabilitiesSummary ret = new ProcessorMeta.ProcessorCapabilitiesSummary();
            if (PrepareSnowflakeUDFUtils.canUseSnowflakeUDF(conn)) {
                ret.withCan(ProcessorCapabilities.SQL_TRANSLATABLE);
            }
            return ret;
        }

        @Override
        public Object selfReport(Parameter parameter) {
            return JSON.deepCopyExcept((Object)((Object)parameter), (String[])new String[]{"inCol", "outCol"});
        }

        @Override
        public StreamImpl build(Parameter params) throws Exception {
            return new StreamImpl(params);
        }

        @Override
        public ProcessorSQLTranslator getSQLTranslator(StepParams parameter, ProcessorWithRecordedReport.ProcessorRecordedReport report) {
            return new SnowflakeUDFSQLTranslator((Parameter)parameter);
        }

        @Override
        public RecipeLineage getUpdatedRecipeLineage(ProcessorScriptStep pss, RecipeLineage previousRecipeLineage) {
            if (!(pss.params instanceof Parameter)) {
                throw new IllegalArgumentException("Unsupported param type: " + pss.params.getClass().getSimpleName());
            }
            Parameter textSimplifierProcessorParams = (Parameter)pss.params;
            if (StringUtils.isBlank((String)textSimplifierProcessorParams.inCol)) {
                throw new IllegalConfigurationException("Missing input column information for lineage on the text simplifier processor.");
            }
            RecipeLineage updatedRecipeLineage = new RecipeLineage();
            previousRecipeLineage.getDatasetPairLineages().forEach((datasetPair, previousDatasetPairLineage) -> {
                DatasetPairLineage updatedDatasetPairLineage = new DatasetPairLineage((DatasetPairLineage)previousDatasetPairLineage);
                if (StringUtils.isNotBlank((String)textSimplifierProcessorParams.outCol)) {
                    updatedDatasetPairLineage.addFactorizedColumnRelations(textSimplifierProcessorParams.inCol, textSimplifierProcessorParams.outCol);
                }
                updatedRecipeLineage.setDatasetPairLineage((Pair<String, String>)datasetPair, updatedDatasetPairLineage);
            });
            return updatedRecipeLineage;
        }
    };

    /**
     * Row-by-row implementation: reads the input column, runs the configured
     * {@link TextSimplifier} on its tokenized value and writes the tokens, joined
     * with a single space, to the output column.
     */
    static class StreamImpl
    extends SingleRowProcessor
    implements Processor {
        private final String inCol;
        private final String outCol;
        private final TextSimplifier simplifier;
        /** Resolved input column; assigned in {@link #init()}. */
        private Column inputColumn;
        /** Resolved output column; assigned in {@link #init()}. */
        private Column outputColumn;

        /**
         * @param params step parameters providing the column names and the simplifier options
         */
        public StreamImpl(Parameter params) throws Exception {
            this.simplifier = new TextSimplifier(params);
            this.inCol = params.inCol;
            this.outCol = params.outCol;
        }

        /**
         * Resolves the column handles. With a non-blank output name the output column is
         * requested after the input column; otherwise the input column itself is
         * requested again in the output role.
         */
        public void init() throws Exception {
            this.inputColumn = this.getColumnFactory().column(this.inCol, Processor.ProcessorRole.INPUT_COLUMN);
            if (StringUtils.isNotBlank(this.outCol)) {
                this.outputColumn = this.getColumnFactory().columnAfter(this.inCol, this.outCol, Processor.ProcessorRole.OUTPUT_COLUMN);
            } else {
                this.outputColumn = this.getColumnFactory().column(this.inCol, Processor.ProcessorRole.OUTPUT_COLUMN);
            }
        }

        /** Simplifies the input cell of {@code row} and stores the result in the output cell. */
        public void processRow(Row row) throws Exception {
            String rawValue = row.get(this.inputColumn);
            TokenizedText text = new TokenizedText(rawValue);
            this.simplifier.simplify(text);
            String simplified = text.toString(" ");
            row.put(this.outputColumn, simplified);
        }

        /** Nothing to do after the last row. */
        public void postProcess() throws Exception {
        }
    }

    /**
     * SQL translator that delegates the simplification to a Java UDF. It declares the
     * jar resources to provide, the function definition, and rewrites the query so the
     * target column is computed by a call to that function.
     */
    private static class SnowflakeUDFSQLTranslator
    implements SnowflakeUDFProcessorTranslator {
        /** Function name: fixed prefix plus a suffix generated once per translator instance. */
        private final String functionName = "TextSimplifier_" + SecretKeyGenerator.generate();
        private final Parameter parameter;

        private SnowflakeUDFSQLTranslator(Parameter parameter) {
            this.parameter = parameter;
        }

        /**
         * Lists the resources for the UDF. Starting from the standard resource list,
         * stop-word clearing or stemming adds the Guava, DKU core, shadelib and log4j
         * resources; stemming additionally adds commons-lang.
         *
         * @return the resource list
         * @throws IOException declared by the interface; propagated from the resource helpers
         */
        @Override
        public List<SnowflakeUDFProcessorTranslator.SnowflakeUDFResource> getUDFResources() throws IOException {
            List<SnowflakeUDFProcessorTranslator.SnowflakeUDFResource> resources = SnowflakeUDFProcessorTranslator.createStandardResourceList();
            if (this.parameter.clearStopWords || this.parameter.stem) {
                SnowflakeUDFProcessorTranslator.addStandardResources(resources, SnowflakeUDFProcessorTranslator.StandardResource.GUAVA_JAR, SnowflakeUDFProcessorTranslator.StandardResource.DKU_CORE_JAR, SnowflakeUDFProcessorTranslator.StandardResource.SHADELIB, SnowflakeUDFProcessorTranslator.StandardResource.LOG4J_JAR);
            }
            if (this.parameter.stem) {
                SnowflakeUDFProcessorTranslator.addStandardResources(resources, SnowflakeUDFProcessorTranslator.StandardResource.COMMONS_LANG_JAR);
            }
            return resources;
        }

        /**
         * Defines the single UDF used by this translator. Its imports follow the same
         * conditions as {@link #getUDFResources()}.
         *
         * @return a new mutable list holding the one function definition
         */
        @Override
        public List<SnowflakeUDFProcessorTranslator.SnowflakeFunctionDef> getUDFs() {
            SnowflakeUDFProcessorTranslator.SnowflakeFunctionDef def = new SnowflakeUDFProcessorTranslator.SnowflakeFunctionDef(this.functionName, "com.dataiku.dip.shaker.processors.transform.TextSimplifierUDF.process", "input STRING, normalize BOOLEAN, stem BOOLEAN, clearStopWords BOOLEAN, sortAlphabetically BOOLEAN, language STRING", "STRING, BOOLEAN, BOOLEAN, BOOLEAN, BOOLEAN, STRING", "STRING");
            if (this.parameter.clearStopWords || this.parameter.stem) {
                def.importStandardResources(SnowflakeUDFProcessorTranslator.StandardResource.GUAVA_JAR, SnowflakeUDFProcessorTranslator.StandardResource.DKU_CORE_JAR, SnowflakeUDFProcessorTranslator.StandardResource.SHADELIB, SnowflakeUDFProcessorTranslator.StandardResource.LOG4J_JAR);
            }
            if (this.parameter.stem) {
                def.importStandardResources(SnowflakeUDFProcessorTranslator.StandardResource.COMMONS_LANG_JAR);
            }
            // Pass the element directly so the list is typed as SnowflakeFunctionDef
            // (an Object[] argument would infer ArrayList<Object>, which does not compile here).
            return Lists.newArrayList(def);
        }

        /**
         * Adds (or replaces) the output column with a call to the UDF on the input column.
         *
         * @param chain the query built so far
         * @return the query to continue with; a subquery wrapping {@code chain} when the
         *         input column was created or modified by the current query
         */
        @Override
        public SQLQueryWithSchema translate(SQLQueryWithSchema chain) {
            ExpressionBuilder.ExpressionBuilderFactory ebf = new ExpressionBuilder.ExpressionBuilderFactory();
            SQLDialect d = chain.getDialect();
            // NOTE(review): presumably needed so the call can reference the column's
            // computed value rather than its pre-query value - confirm.
            if (chain.isCreatedOrModifiedByCurrentQuery(this.parameter.inCol)) {
                chain = chain.makeSubquery();
            }
            // Booleans are rendered as the literals true/false by %b.
            String call = String.format("%s(%s, %b, %b, %b, %b, %s)", this.functionName, d.quoteIdentifier(this.parameter.inCol), this.parameter.normalize, this.parameter.stem, this.parameter.clearStopWords, this.parameter.sortAlphabetically, d.quoteString(this.parameter.language));
            ExpressionBuilder eb = ebf.expr(call);
            chain.addAfterOrReplaceColumn(chain.getCurrentColumn(this.parameter.inCol), eb, Type.STRING, this.parameter.outCol, false);
            return chain;
        }
    }

    /**
     * Step parameters: the inherited {@link TextSimplifier.Parameter} options plus
     * the input and output column names.
     */
    public static class Parameter extends TextSimplifier.Parameter implements StepParams {
        private static final long serialVersionUID = -1L;

        /** Name of the column to read. */
        public String inCol;

        /** Name of the column to write; blank means the input column is overwritten. */
        public String outCol;

        /** Creates parameters with the inherited {@code normalize} option switched on. */
        public Parameter() {
            this.normalize = true;
        }

        /** Performs no checks. */
        @Override
        public void validate() throws IllegalArgumentException {
        }
    }
}

