/*
 * Decompiled with CFR 0.152.
 */
package com.dataiku.dip.labeling.text;

import com.dataiku.dip.labeling.BaseLabelingAnswer;
import com.dataiku.dip.labeling.Label;
import com.dataiku.dip.labeling.entityextraction.NamedEntity;
import com.dataiku.dip.labeling.text.Token;
import com.dataiku.dip.labeling.text.TokenizationParam;
import com.dataiku.dip.labeling.text.Tokenizer;
import com.dataiku.dss.shadelib.com.google.common.base.Strings;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

public class SimpleTokenizer
implements Tokenizer {
    private final TokenizationParam.TokenizationSplitMechanism splitMechanism;

    public SimpleTokenizer(TokenizationParam.TokenizationSplitMechanism splitMechanism) {
        this.splitMechanism = splitMechanism;
    }

    @Override
    public List<Token> tokenize(String text) {
        if (Strings.isNullOrEmpty((String)text)) {
            return Collections.emptyList();
        }
        ArrayList<Token> tokens = new ArrayList<Token>();
        Matcher matcher = this.getPattern().matcher(text);
        while (matcher.find()) {
            tokens.add(new Token(matcher.start(), matcher.end(), matcher.group(0)));
        }
        for (int i = 0; i < tokens.size(); ++i) {
            boolean isLastToken;
            Token currToken = (Token)tokens.get(i);
            boolean bl = isLastToken = i == tokens.size() - 1;
            if (isLastToken) {
                currToken.delimiter = "";
                continue;
            }
            Token nextToken = (Token)tokens.get(i + 1);
            currToken.delimiter = text.substring(currToken.endIndex, nextToken.beginningIndex);
        }
        return tokens;
    }

    @Override
    public List<Token> tokenizeWithInferredMechanism(String text, List<? extends BaseLabelingAnswer> answers) {
        List<Token> tokens = this.tokenize(text);
        if (this.splitMechanism == TokenizationParam.TokenizationSplitMechanism.WHITESPACE) {
            Set tokenBeginningIndices = tokens.stream().map(t -> t.beginningIndex).collect(Collectors.toSet());
            Set tokenEndIndices = tokens.stream().map(t -> t.endIndex).collect(Collectors.toSet());
            for (BaseLabelingAnswer baseLabelingAnswer : answers) {
                for (NamedEntity annotation : ((Label.NamedEntityExtractionLabel)baseLabelingAnswer.label).annotations) {
                    if (tokenBeginningIndices.contains(annotation.beginningIndex) && tokenEndIndices.contains(annotation.endIndex)) continue;
                    SimpleTokenizer tokenizer = new SimpleTokenizer(TokenizationParam.TokenizationSplitMechanism.CHARACTER);
                    return tokenizer.tokenize(text);
                }
            }
        }
        return tokens;
    }

    private Pattern getPattern() {
        switch (this.splitMechanism) {
            case CHARACTER: {
                return Pattern.compile(".");
            }
            case WHITESPACE: {
                return Pattern.compile("\\p{LD}+|[^\\p{LD}\\s]");
            }
        }
        throw new IllegalArgumentException("Unknown engine: " + String.valueOf((Object)this.splitMechanism));
    }
}

