/*
 * Decompiled with CFR 0.152.
 */
package com.dataiku.dip.shaker.processors.transform;

import com.dataiku.dip.connections.AbstractSQLConnection;
import com.dataiku.dip.coremodel.SchemaColumn;
import com.dataiku.dip.datalineage.DatasetPairLineage;
import com.dataiku.dip.datalineage.RecipeLineage;
import com.dataiku.dip.datasets.Type;
import com.dataiku.dip.shaker.ProcessorWithRecordedReport;
import com.dataiku.dip.shaker.model.ProcessorScriptStep;
import com.dataiku.dip.shaker.model.StepParams;
import com.dataiku.dip.shaker.processors.Category;
import com.dataiku.dip.shaker.processors.PrepareSnowflakeUDFUtils;
import com.dataiku.dip.shaker.processors.ProcessorCapabilities;
import com.dataiku.dip.shaker.processors.ProcessorMeta;
import com.dataiku.dip.shaker.processors.ProcessorTag;
import com.dataiku.dip.shaker.processors.StaticColumnsCreatorAdapter;
import com.dataiku.dip.shaker.processors.StaticColumnsCreatorProcessor;
import com.dataiku.dip.shaker.processors.transform.RegexpExtractorAlgorithm;
import com.dataiku.dip.shaker.server.ProcessorDesc;
import com.dataiku.dip.shaker.sql.ProcessorSQLTranslator;
import com.dataiku.dip.shaker.sql.SQLQueryWithSchema;
import com.dataiku.dip.shaker.sql.SnowflakeUDFProcessorTranslator;
import com.dataiku.dip.sql.SQLDialect;
import com.dataiku.dip.sql.queries.ExpressionBuilder;
import com.dataiku.dip.util.SecretKeyGenerator;
import com.dataiku.dip.utils.JSON;
import com.dataiku.dip.utils.Pair;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import java.io.File;
import java.io.IOException;
import java.lang.invoke.CallSite;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import org.apache.commons.lang.StringUtils;

public class RegexpExtractor {
    public static final ProcessorMeta<StaticColumnsCreatorAdapter<StreamImpl>, Parameter> META = new ProcessorMeta<StaticColumnsCreatorAdapter<StreamImpl>, Parameter>(){

        /** Returns the name of this processor, {@code "RegexpExtractor"}. */
        @Override
        public String getName() {
            return "RegexpExtractor";
        }

        /**
         * Returns the documentation page key for this processor, {@code "pattern-extract"}.
         * NOTE(review): presumably a slug in the reference documentation - confirm against the caller.
         */
        @Override
        public String getDocPage() {
            return "pattern-extract";
        }

        /** Returns the category this processor is listed under: {@link Category#TRANSFORMATION}. */
        @Override
        public Category getCategory() {
            return Category.TRANSFORMATION;
        }

        /**
         * Returns the tags attached to this processor: {@code STRING} and {@code SPLIT}.
         *
         * @return a freshly created set holding the two tags
         */
        @Override
        public Set<ProcessorTag> getTags() {
            // Pass the tags as plain varargs so the element type is inferred as ProcessorTag.
            // Routing them through an Object[] cast makes the result a Set<Object>, which does
            // not type-check against the declared return type.
            return Sets.newHashSet(ProcessorTag.STRING, ProcessorTag.SPLIT);
        }

        /** Returns the class used to hold this processor's step parameters, {@link Parameter}. */
        @Override
        public Class<Parameter> stepParamClass() {
            return Parameter.class;
        }

        /**
         * Returns the Markdown help text of this processor for the given language.
         *
         * <p>The text is looked up through {@code translate} with the key
         * {@code SHAKER.PROCESSOR.RegexpExtractor.HELP}; the English text below is passed as the
         * third argument (presumably the fallback when no translation exists - confirm).
         *
         * @param language language identifier forwarded to {@code translate}
         * @return the help text
         */
        @Override
        public String getHelp(String language) {
            // The literal is split one source line per "\n" for readability; the pieces are
            // compile-time constants, so the resulting string is a single constant.
            // The named-group syntax was documented as "((?<groupname>pattern)" with an unbalanced
            // extra parenthesis; it now reads "(?<groupname>pattern)".
            final String defaultHelp =
                    "Extract chunks from a column using a regular expression. Note that regular expressions are not anchored: `([0-9]*)` will capture `232` in `val-232`.\n"
                    + "# Options\n"
                    + "**Regular expression**\n"
                    + "Once the input column is filled, use **Find with Smart Pattern** to help generate a regular expression.\n"
                    + "**Capture groups**\n"
                    + "Use named or unnamed capture groups to extract distinct chunks into several output columns. Unnamed capture groups use the `(pattern)` syntax and place matches into numbered columns. Named capture groups use the `(?<groupname>pattern)` syntax and place matches into named columns using the group name. \n"
                    + "Example, unnamed group: \n"
                    + "* Cell value: `id-37-X234`\n"
                    + "* Pattern: `id-([0-9]*)-([0-9A-Z]*)`\n"
                    + "* Output column prefix: `extracted_`\n"
                    + "* Result: `extracted_1=37 extracted_2=X234`\n"
                    + "\n"
                    + "Example, named group:\n"
                    + "* Cell value: `id-37-X234`\n"
                    + "* Pattern: `id-(?<numidentifier>[0-9]*)-(?<identifier2>[0-9A-Z]*)`\n"
                    + "* Output column prefix: `extracted_`\n"
                    + "* Result: `extracted_numidentifier=37 extracted_identifier2=X234`\n"
                    + "\n"
                    + "**Found column**\n"
                    + "Enable this option to create a column name *found* containing a boolean to indicate whether or not the pattern matched.\n"
                    + "**Extract all occurrences**\n"
                    + "Enable this option to extract multiple matches of a group into one array. \n"
                    + "# Related resources\n"
                    + "See [How-To: Extract Patterns With the Smart Pattern Builder](https://knowledge.dataiku.com/latest/courses/advanced-data-prep/prepare-recipe/smart-pattern-builder.html) for a detailed example of working with the Smart Pattern Builder.";
            return this.translate(language, "SHAKER.PROCESSOR.RegexpExtractor.HELP", defaultHelp);
        }

        @Override
        public ProcessorDesc describe(String language) {
            return new ProcessorDesc(this.getName(), this.translate(language, "SHAKER.PROCESSOR.RegexpExtractor.DESCRIPTION", 1.actionVerb("Extract") + " with regular expression"), false).withHiddenDescription("regexp extract pattern regex").withParam("column", "column", true, false, this.translate(language, "SHAKER.PROCESSOR.RegexpExtractor.DESCRIPTION.COLUMN", "Input column")).withParam("prefix", "generic", true, true, this.translate(language, "SHAKER.PROCESSOR.RegexpExtractor.DESCRIPTION.PREFIX", "Prefix for output column(s)")).withParam("pattern", "generic", true, false, this.translate(language, "SHAKER.PROCESSOR.RegexpExtractor.DESCRIPTION.PATTERN", "Regular expression"), "(pattern)").withBool("found_col", this.translate(language, "SHAKER.PROCESSOR.RegexpExtractor.DESCRIPTION.FOUND_COL", "Create a special 'found' column")).withBool("extractAllOccurrences", this.translate(language, "SHAKER.PROCESSOR.RegexpExtractor.DESCRIPTION.EXTRACT_ALL_OCCURRENCES", "Extract all occurrences"));
        }

        /**
         * Produces the self-report object for a step: a deep copy of the parameters made by
         * {@code JSON.deepCopyExcept}, with the {@code column}, {@code prefix} and
         * {@code pattern} fields listed as exclusions.
         *
         * @param p the step parameters to report on
         * @return the copy returned by {@code JSON.deepCopyExcept}
         */
        @Override
        public Object selfReport(Parameter p) {
            final String[] excludedFields = new String[]{"column", "prefix", "pattern"};
            final Object source = p;
            return JSON.deepCopyExcept(source, excludedFields);
        }

        /**
         * Connection-less variant: delegates to the four-argument overload with a {@code null}
         * connection.
         */
        @Override
        public ProcessorMeta.ProcessorCapabilitiesSummary getCapabilities(StepParams sp, ProcessorWithRecordedReport.ProcessorRecordedReport report, SQLDialect dialect) {
            return this.getCapabilities(sp, report, dialect, null);
        }

        /**
         * Computes the capabilities of this processor for the given step.
         *
         * <p>SQL translation is reported as available when either the connection can use a
         * Snowflake UDF or {@code checkSQLCapability} finds no obstacle; otherwise it is reported
         * as "could" together with the obstacle message. The remaining capabilities are always
         * reported as available.
         *
         * @param sp      step parameters; cast to {@link Parameter}
         * @param report  recorded report (not read here)
         * @param dialect SQL dialect checked for regex support
         * @param conn    connection handed to the Snowflake UDF check; the three-argument
         *                overload passes {@code null}
         * @return the capability summary
         */
        @Override
        public ProcessorMeta.ProcessorCapabilitiesSummary getCapabilities(StepParams sp, ProcessorWithRecordedReport.ProcessorRecordedReport report, SQLDialect dialect, AbstractSQLConnection conn) {
            final Parameter regexParams = (Parameter)sp;
            final ProcessorMeta.ProcessorCapabilitiesSummary summary = new ProcessorMeta.ProcessorCapabilitiesSummary();
            // The native-SQL check always runs first, even when the UDF path ends up being used.
            final String nativeSqlObstacle = RegexpExtractor.checkSQLCapability(regexParams, dialect);
            final boolean sqlTranslatable = PrepareSnowflakeUDFUtils.canUseSnowflakeUDF(conn) || nativeSqlObstacle == null;
            if (sqlTranslatable) {
                summary.withCan(ProcessorCapabilities.SQL_TRANSLATABLE);
            } else {
                summary.withCould(ProcessorCapabilities.SQL_TRANSLATABLE, nativeSqlObstacle);
            }
            summary.withCan(ProcessorCapabilities.NATIVE_SPARK_IMPL);
            summary.withCan(ProcessorCapabilities.STATIC_COLUMNS_CREATOR);
            summary.withCan(ProcessorCapabilities.KNOWN_INPUTS);
            summary.withCan(ProcessorCapabilities.KNOWN_OUTPUTS);
            summary.withCan(ProcessorCapabilities.TYPE_FIXED_COLUMNS_CREATOR);
            return summary;
        }

        /**
         * Creates the processor for the given parameters: a {@link StreamImpl} wrapped in a
         * {@link StaticColumnsCreatorAdapter}.
         *
         * @param parameter the step parameters
         * @return the adapter around a new {@link StreamImpl}
         */
        @Override
        public StaticColumnsCreatorAdapter<StreamImpl> build(Parameter parameter) {
            final StreamImpl columnsCreator = new StreamImpl(parameter);
            return new StaticColumnsCreatorAdapter<StreamImpl>(columnsCreator);
        }

        /**
         * Creates the bare static-columns creator (a {@link StreamImpl}, without adapter).
         *
         * @param params step parameters; cast to {@link Parameter}
         * @return a new {@link StreamImpl}
         */
        @Override
        public StaticColumnsCreatorProcessor buildStaticColumnsCreator(StepParams params) {
            final Parameter regexParams = (Parameter)params;
            return new StreamImpl(regexParams);
        }

        /**
         * Picks the SQL translator for a step: the plain-SQL {@link SQLTranslator} when
         * {@code checkSQLCapability} reports no obstacle for the connection's dialect, otherwise
         * the {@link SnowflakeUDFSQLTranslator}.
         *
         * @param parameter step parameters; cast to {@link Parameter}
         * @param report    recorded report (not read here)
         * @param conn      connection whose dialect is checked; dereferenced unconditionally
         * @return the translator to use
         */
        @Override
        public ProcessorSQLTranslator getSQLTranslator(StepParams parameter, ProcessorWithRecordedReport.ProcessorRecordedReport report, AbstractSQLConnection conn) {
            final Parameter regexParams = (Parameter)parameter;
            final String nativeSqlObstacle = RegexpExtractor.checkSQLCapability(regexParams, conn.getDialect());
            if (nativeSqlObstacle != null) {
                return new SnowflakeUDFSQLTranslator(regexParams);
            }
            return new SQLTranslator(regexParams);
        }

        /**
         * Returns a new recipe lineage in which, for every dataset pair of the previous lineage,
         * each column produced by this step is related to the step's input column.
         *
         * <p>The produced columns are {@code prefix + groupName} for each named group,
         * {@code prefix + i} for each unnamed group (1-based) and, when {@code found_col} is
         * set, {@code prefix + "found"}.
         *
         * @param pss                   the script step; its params must be a {@link Parameter}
         * @param previousRecipeLineage lineage before this step; it is copied, not modified here
         * @return the updated lineage
         * @throws IllegalArgumentException if the step params are {@code null} or not a {@link Parameter}
         */
        @Override
        public RecipeLineage getUpdatedRecipeLineage(ProcessorScriptStep pss, RecipeLineage previousRecipeLineage) {
            if (!(pss.params instanceof Parameter)) {
                // instanceof is also false for null; report that instead of failing with an NPE
                // while building the message.
                String actualType = pss.params == null ? "null" : pss.params.getClass().getSimpleName();
                throw new IllegalArgumentException("Unsupported param type: " + actualType);
            }
            Parameter regexpExtractorParam = (Parameter)pss.params;
            // Column names are plain strings. The decompiled code stored them in an
            // ArrayList<CallSite> through (CallSite)(Object) casts, which cannot work.
            List<String> outputColumns = new ArrayList<String>();
            RegexpExtractorAlgorithm algo = new RegexpExtractorAlgorithm(regexpExtractorParam.found_col, regexpExtractorParam.extractAllOccurrences, regexpExtractorParam.pattern);
            for (String groupName : algo.getNamedGroupNames()) {
                outputColumns.add(regexpExtractorParam.prefix + groupName);
            }
            for (int i = 1; i <= algo.getUnnamedGroupCount(); ++i) {
                outputColumns.add(regexpExtractorParam.prefix + i);
            }
            if (regexpExtractorParam.found_col) {
                outputColumns.add(regexpExtractorParam.prefix + "found");
            }
            RecipeLineage updatedRecipeLineage = new RecipeLineage();
            previousRecipeLineage.getDatasetPairLineages().forEach((datasetPair, previousDatasetPairLineage) -> {
                DatasetPairLineage updatedDatasetPairLineage = new DatasetPairLineage((DatasetPairLineage)previousDatasetPairLineage);
                for (String column : outputColumns) {
                    updatedDatasetPairLineage.addFactorizedColumnRelations(regexpExtractorParam.column, column);
                }
                updatedRecipeLineage.setDatasetPairLineage((Pair<String, String>)datasetPair, updatedDatasetPairLineage);
            });
            return updatedRecipeLineage;
        }
    };

    /**
     * Checks whether the step can be translated to plain SQL (without a UDF) for a dialect.
     *
     * <p>The checks run in a fixed order and the first failing one wins: regex support in the
     * dialect, the "extract all occurrences" and "found column" options, named groups, more
     * than nine capture groups, then non-capturing groups and look-around expressions when
     * the dialect does not support them.
     *
     * @param params  the step parameters
     * @param dialect the target SQL dialect
     * @return {@code null} when nothing prevents plain-SQL translation, otherwise a
     *         human-readable reason
     */
    private static String checkSQLCapability(Parameter params, SQLDialect dialect) {
        if (SQLDialect.RegexSupport.NONE == dialect.regexSupport()) {
            return "Cannot use SQL engine: regular expressions not available in this DB";
        }
        if (params.extractAllOccurrences) {
            return "Cannot use SQL engine: can only extract one occurrence";
        }
        if (params.found_col) {
            return "Cannot use SQL engine: cannot output match flag";
        }

        final String regex = params.pattern;
        if (RegexpExtractorAlgorithm.hasNamedGroups(regex)) {
            return "Cannot use SQL engine: cannot handle named regex groups";
        }
        // groupCount() depends only on the pattern, not on the text given to matcher().
        final int captureGroups = Pattern.compile(regex).matcher("x").groupCount();
        if (captureGroups > 9) {
            return "Cannot use SQL engine: too many capture groups (backreferences are limited to 1-9)";
        }
        if (!dialect.supportsNonCapturingGroups()) {
            if (RegexpExtractorAlgorithm.hasNonCapturingGroups(regex)) {
                return "Cannot use SQL engine: non-capturing groups are not available in this DB";
            }
        }
        if (!dialect.supportsLookAroundExpressions()) {
            if (RegexpExtractorAlgorithm.hasLookAroundExpressions(regex)) {
                return "Cannot use SQL engine: look around expressions are not available in this DB";
            }
        }
        return null;
    }

    /**
     * Step parameters of the regular-expression extractor.
     */
    public static class Parameter
    implements StepParams {
        private static final long serialVersionUID = -1L;
        /** Name of the input column the pattern is applied to. */
        public String column;
        /** Regular expression whose capture groups are extracted. */
        public String pattern;
        /** Text prepended to every output column name; never {@code null} after {@link #validate()}. */
        public String prefix = "";
        /** When set, an extra {@code prefix + "found"} column is produced. */
        public boolean found_col;
        /** When set, the extractor is asked for all occurrences rather than a single one. */
        boolean extractAllOccurrences;

        /**
         * Normalizes and checks the parameters: a {@code null} prefix becomes the empty string,
         * and both the column name and the pattern must be non-blank.
         *
         * @throws IllegalArgumentException if the column name or the pattern is blank
         */
        public void validate() throws IllegalArgumentException {
            // The decompiler had chained this assignment into an unused local; only the field
            // update matters.
            if (this.prefix == null) {
                this.prefix = "";
            }
            if (StringUtils.isBlank(this.column)) {
                throw new IllegalArgumentException("Empty column name");
            }
            if (StringUtils.isBlank(this.pattern)) {
                throw new IllegalArgumentException("Empty pattern");
            }
        }
    }

    /**
     * Row-by-row implementation of the extractor. The output schema is fixed at construction
     * time from the pattern's groups; each row is delegated to {@link RegexpExtractorAlgorithm}.
     */
    static class StreamImpl
    implements StaticColumnsCreatorProcessor {
        private static final long serialVersionUID = 1L;
        private final String inputColumn;
        private final List<SchemaColumn> producedOutputs = new ArrayList<SchemaColumn>();
        private final RegexpExtractorAlgorithm algo;

        /**
         * Builds the algorithm and the list of produced columns: one per named group, then one
         * per unnamed group (numbered from 1), then the optional boolean {@code found} column.
         * Group columns are of type ARRAY (of STRING) when all occurrences are extracted,
         * STRING otherwise.
         *
         * @param parameter the step parameters
         */
        StreamImpl(Parameter parameter) {
            this.inputColumn = parameter.column;
            this.algo = new RegexpExtractorAlgorithm(parameter.found_col, parameter.extractAllOccurrences, parameter.pattern);
            Type groupType = parameter.extractAllOccurrences ? Type.ARRAY : Type.STRING;
            for (String groupName : this.algo.getNamedGroupNames()) {
                this.producedOutputs.add(StreamImpl.groupColumn(parameter.prefix + groupName, groupType));
            }
            for (int i = 1; i <= this.algo.getUnnamedGroupCount(); ++i) {
                this.producedOutputs.add(StreamImpl.groupColumn(parameter.prefix + i, groupType));
            }
            if (parameter.found_col) {
                this.producedOutputs.add(new SchemaColumn(parameter.prefix + "found", Type.BOOLEAN));
            }
        }

        /** Creates the schema column of one capture group; array columns get STRING content. */
        private static SchemaColumn groupColumn(String name, Type groupType) {
            SchemaColumn sc = new SchemaColumn(name, groupType);
            if (sc.getType() == Type.ARRAY) {
                sc.arrayContent = new SchemaColumn("content", Type.STRING);
            }
            return sc;
        }

        /** Returns a new single-element list holding the input column name. */
        @Override
        public List<String> getRequiredInputs() {
            // Plain varargs keeps the element type String; an Object[] cast would make the
            // result a list of Object, which does not match the return type.
            return Lists.newArrayList(this.inputColumn);
        }

        /** Returns the columns computed in the constructor (the internal list, not a copy). */
        @Override
        public List<SchemaColumn> getProducedOutputs() {
            return this.producedOutputs;
        }

        /**
         * Processes one row: the first input value is converted with {@code toString()}
         * ({@code null} stays {@code null}) and handed to the algorithm.
         *
         * @param input the row input; only {@code values.get(0)} is read
         * @return an output whose values are the algorithm's result
         */
        @Override
        public StaticColumnsCreatorProcessor.Output produceRow(StaticColumnsCreatorProcessor.Input input) {
            StaticColumnsCreatorProcessor.Output res = new StaticColumnsCreatorProcessor.Output();
            Object raw = input.values.get(0);
            String v = raw != null ? raw.toString() : null;
            res.values = this.algo.process(v);
            return res;
        }

        /** No resources are needed; intentionally empty. */
        @Override
        public void init(Map<String, File> resources) {
        }
    }

    private static class SnowflakeUDFSQLTranslator
    implements SnowflakeUDFProcessorTranslator {
        private final String functionName;
        private final Parameter parameter;
        private final List<String> columnsNames = new ArrayList<String>();
        private final List<Type> columnsTypes = new ArrayList<Type>();

        private SnowflakeUDFSQLTranslator(Parameter parameter) {
            this.functionName = "REGEXP_EXTRACTOR_" + SecretKeyGenerator.generate();
            this.parameter = parameter;
        }

        @Override
        public List<SnowflakeUDFProcessorTranslator.SnowflakeUDFResource> getUDFResources() throws IOException {
            List<SnowflakeUDFProcessorTranslator.SnowflakeUDFResource> resources = SnowflakeUDFProcessorTranslator.createStandardResourceList();
            SnowflakeUDFProcessorTranslator.addStandardResources(resources, SnowflakeUDFProcessorTranslator.StandardResource.DKU_CORE_JAR, SnowflakeUDFProcessorTranslator.StandardResource.ORGJSON_JAR);
            return resources;
        }

        @Override
        public List<SnowflakeUDFProcessorTranslator.SnowflakeFunctionDef> getUDFs() {
            SnowflakeUDFProcessorTranslator.SnowflakeFunctionDef def = new SnowflakeUDFProcessorTranslator.SnowflakeFunctionDef(this.functionName, "com.dataiku.dip.shaker.processors.transform.RegexpExtractorUDF.process", "input STRING, found_col BOOLEAN, extractAllOccurrences BOOLEAN, pattern STRING", "STRING, BOOLEAN, BOOLEAN, STRING", "ARRAY");
            def.importStandardResources(SnowflakeUDFProcessorTranslator.StandardResource.DKU_CORE_JAR, SnowflakeUDFProcessorTranslator.StandardResource.ORGJSON_JAR);
            return Lists.newArrayList((Object[])new SnowflakeUDFProcessorTranslator.SnowflakeFunctionDef[]{def});
        }

        private void prepareOutputs() {
            Type groupType = Type.STRING;
            RegexpExtractorAlgorithm algo = new RegexpExtractorAlgorithm(this.parameter.found_col, this.parameter.extractAllOccurrences, this.parameter.pattern);
            for (String groupName : algo.getNamedGroupNames()) {
                String newColumnName = this.parameter.prefix + groupName;
                this.columnsNames.add(newColumnName);
                this.columnsTypes.add(groupType);
            }
            for (int i = 1; i <= algo.getUnnamedGroupCount(); ++i) {
                String newColumnName = this.parameter.prefix + i;
                this.columnsNames.add(newColumnName);
                this.columnsTypes.add(groupType);
            }
            if (this.parameter.found_col) {
                this.columnsNames.add(this.parameter.prefix + "found");
                this.columnsTypes.add(Type.BOOLEAN);
            }
        }

        @Override
        public SQLQueryWithSchema translate(SQLQueryWithSchema chain) {
            this.prepareOutputs();
            String tmpColumnName = "REGEXP_EXTRACTOR_" + SecretKeyGenerator.generate((int)16);
            ExpressionBuilder.ExpressionBuilderFactory ebf = new ExpressionBuilder.ExpressionBuilderFactory();
            SQLDialect d = chain.getDialect();
            if (chain.isAnyCreatedOrModifiedByCurrentQuery(this.columnsNames)) {
                chain = chain.makeSubquery();
            }
            String functionCall = String.format("%s(%s, %s, %s, %s)", this.functionName, d.quoteIdentifier(this.parameter.column), this.parameter.found_col, this.parameter.extractAllOccurrences, d.quoteString(this.parameter.pattern));
            chain.select(ebf.expr(functionCall), tmpColumnName);
            SQLQueryWithSchema outer = chain.makeSubquery();
            SchemaColumn inputSchemaColumn = outer.getCurrentColumn(this.parameter.column);
            for (int i = 0; i < this.columnsNames.size(); ++i) {
                ExpressionBuilder expr = ebf.expr(d.quoteIdentifier(tmpColumnName) + "[" + i + "]");
                outer.addAfterOrReplaceColumn(inputSchemaColumn, expr, this.columnsTypes.get(i), this.columnsNames.get(i), false);
            }
            outer.deleteColumn(tmpColumnName);
            outer.deleteSelect(tmpColumnName);
            return outer;
        }
    }

    /**
     * Plain-SQL translation, built only from regexp-replace, substring and length
     * expressions. One STRING column {@code prefix + i} is produced per capture group.
     */
    private static class SQLTranslator
    implements ProcessorSQLTranslator {
        private final Parameter parameter;

        private SQLTranslator(Parameter parameter) {
            this.parameter = parameter;
        }

        /**
         * Adds one column per capture group of the pattern, each placed after the previous
         * one, starting after the input column.
         *
         * @param input the query built so far
         * @return the query carrying the new columns (a subquery of {@code input} when one of
         *         the affected columns was created or modified by the current query)
         */
        @Override
        public SQLQueryWithSchema translate(SQLQueryWithSchema input) {
            // Typed list built through plain varargs; the decompiled raw ArrayList with an
            // Object[] cast lost the element type.
            List<String> affectedColumns = Lists.newArrayList(this.parameter.column);
            // groupCount() depends only on the pattern, not on the text given to matcher().
            int groupCount = Pattern.compile(this.parameter.pattern).matcher("x").groupCount();
            for (int i = 1; i <= groupCount; ++i) {
                affectedColumns.add(this.parameter.prefix + i);
            }
            boolean needsSubquery = input.isAnyCreatedOrModifiedByCurrentQuery(affectedColumns);
            if (needsSubquery) {
                input = input.makeSubquery();
            }
            ExpressionBuilder.ExpressionBuilderFactory ebf = new ExpressionBuilder.ExpressionBuilderFactory();
            String prevColumn = this.parameter.column;
            for (int i = 1; i <= groupCount; ++i) {
                String newColumnName = this.parameter.prefix + i;
                ExpressionBuilder col = input.col(this.parameter.column);
                // Steps 1-2: put a marker where the pattern matches, then cut from the marker
                // to the end. What remains appears to be the text before the first match.
                ExpressionBuilder captured = col.regexpReplace(this.parameter.pattern, "__dku_match__");
                captured = captured.regexpReplace("__dku_match__.*$", "");
                // Step 3: take the input from just after that leading text (length + 1).
                captured = col.substr(captured.length().coalesce(ebf.cst(0)).plus(ebf.cst(1)));
                // Step 4: keep only the leading match (the pattern is wrapped as group 1).
                captured = captured.regexpReplace("^(" + this.parameter.pattern + ").*$", input.getDialect().captureGroup(1));
                // Step 5: replace that match by the content of capture group i.
                captured = captured.regexpReplace(this.parameter.pattern, input.getDialect().captureGroup(i));
                // Emit NULL unless removing the matches makes the value shorter.
                ExpressionBuilder expr = ebf.caseWhen(col.regexpReplace(this.parameter.pattern, "").length().coalesce(ebf.cst(0)).lt(col.length()), captured, ebf.nullValue(Type.STRING, 10));
                input.addAfterOrReplaceColumn(input.getCurrentColumn(prevColumn), expr, Type.STRING, newColumnName, true);
                prevColumn = newColumnName;
            }
            return input;
        }
    }
}

