/*
 * Decompiled with CFR 0.152.
 */
package com.dataiku.dip.shaker.processors.transform;

import com.dataiku.dip.shaker.processors.expr.TokenizedText;
import com.dataiku.dip.utils.DKUtils;
import com.dataiku.dss.shadelib.org.tartarus.snowball.SnowballStemmer;
import java.io.Serializable;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import org.apache.commons.lang.StringUtils;

/**
 * Applies a configurable sequence of simplification steps to a {@link TokenizedText}.
 *
 * <p>The steps are selected by the flags of a {@link Parameter} and are always invoked in this
 * order: {@code normalize()}, {@code alphaSort()}, {@code clearStopWords(...)}, {@code stem(...)}.
 * The stop-word set and the stemmer are resolved once, at construction time, and reused for
 * every call to {@link #simplify(TokenizedText)}.
 *
 * <p>NOTE(review): a single stemmer instance is stored and handed to every
 * {@code simplify} call; confirm the stemmer's thread-safety before sharing one instance of
 * this class across threads.
 */
public class TextSimplifierAlgorithm {
    /** Classpath prefix of the stop-word resources; the language value and ".txt" are appended. */
    private static final String STOP_WORDS_RESOURCE_PREFIX = "com/dataiku/dip/shaker/processors/expr/stopwords_";
    /** Package prefix of the stemmer classes; the capitalized language and "Stemmer" are appended. */
    private static final String STEMMER_PACKAGE_PREFIX = "com.dataiku.dss.shadelib.org.tartarus.snowball.ext.";

    private final boolean sortAlphabetically;
    private final boolean clearStopWords;
    private final boolean normalize;
    private final boolean stem;
    /** Stemmer for the configured language; {@code null} when stemming is disabled. */
    private final SnowballStemmer stemmer;
    /** Lower-cased stop words; {@code null} when stop-word clearing is disabled. */
    private final Set<String> stopWords;

    /**
     * Builds the algorithm from its settings, eagerly loading the resources the enabled steps need.
     *
     * @param params flags selecting the steps to apply, plus the language used to locate the
     *               stop-word resource and the stemmer class
     * @throws IllegalArgumentException if stemming is requested but no stemmer class exists for
     *                                  the language
     * @throws Exception if the stop-word resource cannot be read or the stemmer cannot be
     *                   instantiated
     */
    public TextSimplifierAlgorithm(Parameter params) throws Exception {
        this.normalize = params.normalize;
        this.sortAlphabetically = params.sortAlphabetically;
        this.clearStopWords = params.clearStopWords;
        this.stem = params.stem;
        // Stop words are resolved with the raw language value (version suffix included),
        // whereas the stemmer lookup uses the suffix-free name from getLanguage().
        this.stopWords = this.clearStopWords ? loadStopWords(params.language) : null;
        this.stemmer = this.stem ? createStemmer(params) : null;
    }

    /**
     * Reads the stop-word resource for {@code language}: one word per line, lower-cased.
     */
    private static Set<String> loadStopWords(String language) throws Exception {
        String content = DKUtils.getResourceFileContentUTF8(STOP_WORDS_RESOURCE_PREFIX + language + ".txt");
        // Accept both LF and CRLF line endings so that no entry keeps a trailing '\r'.
        String[] lines = content.split("\\r?\\n");
        Set<String> words = new HashSet<String>(lines.length);
        for (String line : lines) {
            // Locale.ROOT keeps the result independent of the JVM default locale.
            String word = line.trim().toLowerCase(Locale.ROOT);
            if (!word.isEmpty()) {
                words.add(word);
            }
        }
        return words;
    }

    /**
     * Instantiates the stemmer class whose name is derived from {@link Parameter#getLanguage()}.
     */
    private static SnowballStemmer createStemmer(Parameter params) throws Exception {
        String className = STEMMER_PACKAGE_PREFIX + StringUtils.capitalize(params.getLanguage()) + "Stemmer";
        Class<?> stemClass;
        try {
            stemClass = Class.forName(className);
        } catch (ClassNotFoundException e) {
            // Keep the original exception as the cause so the missing class name stays visible.
            throw new IllegalArgumentException("Stemming is not supported in " + StringUtils.capitalize(params.language) + " , please deactivate 'Stem words' or change the 'Language' parameter", e);
        }
        // Class.newInstance() is deprecated; go through the no-arg constructor explicitly.
        return (SnowballStemmer) stemClass.getDeclaredConstructor().newInstance();
    }

    /**
     * Applies the enabled steps to {@code t} by calling the corresponding
     * {@link TokenizedText} methods in a fixed order.
     *
     * @param t the tokenized text the steps are invoked on
     */
    public void simplify(TokenizedText t) {
        if (this.normalize) {
            t.normalize();
        }
        if (this.sortAlphabetically) {
            t.alphaSort();
        }
        if (this.clearStopWords && this.stopWords != null) {
            t.clearStopWords(this.stopWords);
        }
        if (this.stem && this.stemmer != null) {
            t.stem(this.stemmer);
        }
    }

    /**
     * Settings for {@link TextSimplifierAlgorithm}: one flag per step, plus the language.
     */
    public static class Parameter
    implements Serializable {
        private static final long serialVersionUID = -1L;
        /** When set, {@code alphaSort()} is called on the text. */
        public boolean sortAlphabetically;
        /** When set, the stop-word resource for {@link #language} is loaded and applied. */
        public boolean clearStopWords;
        /** When set, {@code normalize()} is called on the text. */
        public boolean normalize;
        /** When set, a stemmer for {@link #getLanguage()} is instantiated and applied. */
        public boolean stem;
        /** Language identifier, optionally ending in an underscore followed by digits. */
        public String language = "english_2021";

        /**
         * Returns {@link #language} with any trailing {@code _<digits>} suffix removed,
         * e.g. {@code "english_2021"} becomes {@code "english"}.
         *
         * @return the language name without its numeric suffix
         */
        public String getLanguage() {
            return this.language.replaceAll("_\\d+$", "");
        }
    }
}

