/*
 * Decompiled with CFR 0.152.
 */
package com.dataiku.dip.dataflow.exec.fuzzyjoin.builtinengine;

import com.dataiku.dip.dataflow.exec.fuzzyjoin.FuzzyJoinRecipePayloadParams;
import com.dataiku.dip.datasets.Type;
import com.dataiku.dip.shaker.processors.expr.TextSimplifier;
import com.dataiku.dip.shaker.processors.expr.TokenizedText;
import com.dataiku.dip.shaker.text.StringNormalizer;
import com.dataiku.dip.shaker.types.GeoPoint;
import java.util.Locale;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.log4j.Logger;

/**
 * Normalises join-key values before they are compared by the fuzzy join engine.
 *
 * <p>The steps applied are driven by the flags of the
 * {@link FuzzyJoinRecipePayloadParams.NormaliseDesc} given at construction time. Geo-point values
 * are rewritten to WKT, string values go through the optional text steps described on
 * {@link #apply(String, Type)}, and values of any other type are returned unchanged.
 */
public class TextNormaliser {
    /** Lower-cased salutation words that {@link #isSalutation(String)} recognises. */
    private static final Set<String> SALUTATIONS = Stream.of("Miss", "Mistress", "Madam", "Lady", "Ms", "Mister", "Sir", "M", "Mr", "Lord", "Doctor", "Dr", "Mx")
            .map(word -> word.toLowerCase(Locale.ROOT))
            .collect(Collectors.toSet());
    /**
     * Simplifier used for stop-word removal, stemming and alphabetical sorting. It is {@code null}
     * when none of those options is enabled, or when its construction failed.
     */
    private final TextSimplifier simplifier;
    private final FuzzyJoinRecipePayloadParams.NormaliseDesc normaliseDesc;
    private static final Logger logger = Logger.getLogger("dku.recipe.fuzzyjoin.textnormaliser");

    /**
     * Tells whether a single word is one of the known salutations, ignoring case.
     *
     * @param word the word to test; may be {@code null}
     * @return {@code true} if {@code word} is non-null and matches a known salutation
     */
    public static boolean isSalutation(String word) {
        // Locale.ROOT keeps the match independent of the JVM default locale
        // (e.g. "MISS" must not lower-case to a dotless-i form under a Turkish locale).
        return word != null && SALUTATIONS.contains(word.toLowerCase(Locale.ROOT));
    }

    /**
     * Creates a normaliser for the given settings.
     *
     * <p>A {@link TextSimplifier} is only built when stop-word clearing, stemming or alphabetical
     * sorting is requested. If building it fails, the error is logged and the normaliser keeps
     * working without the simplification step.
     *
     * @param normaliseDesc the normalisation settings to apply; must not be {@code null}
     */
    public TextNormaliser(FuzzyJoinRecipePayloadParams.NormaliseDesc normaliseDesc) {
        this.normaliseDesc = normaliseDesc;
        TextSimplifier built = null;
        if (this.requireSimplifierProcessing()) {
            try {
                built = new TextSimplifier(this.makeTextSimplifierParameter());
            }
            catch (Exception e) {
                // Best effort: keep the normaliser usable, but log the full stack trace.
                logger.error("Couldn't initialize text simplifier: " + e, e);
            }
        }
        this.simplifier = built;
    }

    /**
     * Normalises a value, treating it as a {@link Type#STRING}.
     *
     * @param joinValue the raw join value; may be {@code null}
     * @return the normalised value, or {@code null} if {@code joinValue} is {@code null}
     */
    public String apply(String joinValue) {
        return this.apply(joinValue, Type.STRING);
    }

    /**
     * Normalises a value according to its storage type.
     *
     * <p>For {@link Type#GEOPOINT} the value is converted to its WKT form. For {@link Type#STRING}
     * the enabled steps run in this order: unicode normalisation, lower-casing, salutation
     * removal, then tokenisation (with simplification when a simplifier is available) and
     * re-joining of the tokens with single spaces. Other types are returned as given.
     *
     * @param joinValue the raw join value; may be {@code null}
     * @param type the storage type of the value
     * @return the normalised value, or {@code null} if {@code joinValue} is {@code null}
     */
    public String apply(String joinValue, Type type) {
        if (joinValue == null) {
            return null;
        }
        if (type == Type.GEOPOINT) {
            return GeoPoint.convert(joinValue).toWKT();
        }
        if (type != Type.STRING) {
            return joinValue;
        }

        String value = joinValue;
        if (this.normaliseDesc.unicodeCasting) {
            value = StringNormalizer.normalize(value);
        }
        if (this.normaliseDesc.caseInsensitive) {
            // Locale.ROOT so that the produced keys do not depend on the JVM default locale.
            value = value.toLowerCase(Locale.ROOT);
        }
        if (this.normaliseDesc.clearSalutations) {
            value = this.removeSalutations(value);
        }
        if (this.normaliseDesc.normaliseText || this.requireSimplifierProcessing()) {
            TokenizedText tokenized = new TokenizedText(value);
            // The simplifier is null when it is not required or when it failed to initialise;
            // in the latter case the value is still tokenised and re-joined.
            if (this.simplifier != null) {
                this.simplifier.simplify(tokenized);
            }
            value = tokenized.toString(" ");
        }
        return value;
    }

    /**
     * @return {@code true} if any option that needs a {@link TextSimplifier} is enabled
     */
    private boolean requireSimplifierProcessing() {
        return this.normaliseDesc.clearStopWords || this.normaliseDesc.sortAlphabetically || this.normaliseDesc.transformToStem;
    }

    /**
     * Translates the normalisation settings into {@link TextSimplifier} parameters.
     *
     * @return a new parameter object mirroring the relevant settings
     */
    private TextSimplifier.Parameter makeTextSimplifierParameter() {
        TextSimplifier.Parameter params = new TextSimplifier.Parameter();
        params.sortAlphabetically = this.normaliseDesc.sortAlphabetically;
        params.clearStopWords = this.normaliseDesc.clearStopWords;
        params.stem = this.normaliseDesc.transformToStem;
        params.language = this.normaliseDesc.language;
        // The simplifier's own normalize option is always off; apply() runs its own optional
        // StringNormalizer pass before the simplifier is invoked.
        params.normalize = false;
        return params;
    }

    /**
     * Drops every space-separated word that is a salutation and re-joins the remaining words
     * with single spaces.
     *
     * <p>NOTE(review): words are matched exactly, so forms with punctuation such as "Mr." are
     * kept - confirm whether that is intended.
     *
     * @param joinValue the value to clean; must not be {@code null}
     * @return the value without salutation words
     */
    private String removeSalutations(String joinValue) {
        return Stream.of(joinValue.split(" "))
                .filter(word -> !TextNormaliser.isSalutation(word))
                .collect(Collectors.joining(" "));
    }
}

