/*
 * Decompiled with CFR 0.152.
 */
package com.dataiku.dip.shaker.processors.join;

import com.dataiku.dip.futures.FutureProgressState;
import com.dataiku.dip.shaker.processors.expr.WordType;
import com.dataiku.dip.shaker.processors.join.FuzzySearchEngine;
import com.dataiku.dip.shaker.processors.join.MemoryEquiJoiner;
import com.dataiku.dip.utils.DKULogger;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * Fuzzy search engine that indexes the populated strings by the blocks produced by
 * {@code WordType.blockTokenize} and, at search time, only computes Levenshtein
 * distances against the indexed strings sharing at least one block with the searched value.
 *
 * <p>Usage, as shown by the methods below: call {@link #populate(String)} for every
 * candidate value, then {@link #endPopulating()} once to build the block index, then
 * {@link #search(String)} for each value to look up.
 *
 * <p>Running totals are kept in {@link AtomicInteger} counters and reported through the
 * {@code dku.shaker.fuzzy_join} logger.
 */
public class BlockSearchEngine
extends FuzzySearchEngine {
    /** Block size passed to {@code WordType.blockTokenize}; stays at -1 until {@link #endPopulating()} runs. */
    private int blockSize = -1;
    /** Simplified form of each populated value, mapped to the original value. */
    private final Map<String, String> originalMapping = new HashMap<String, String>();
    /** Block text mapped to the simplified values that contain that block. */
    private final Map<String, Set<String>> blocks = new HashMap<String, Set<String>>();
    private final AtomicInteger totalProcessedRows = new AtomicInteger(0);
    private final AtomicInteger totalProcessedInputChars = new AtomicInteger(0);
    private final AtomicInteger totalProcessedInputBlocks = new AtomicInteger(0);
    private final AtomicInteger totalProcessedCandidateWords = new AtomicInteger(0);
    private final AtomicInteger totalComputedDistances = new AtomicInteger(0);
    private final AtomicInteger totalAvoidedDistancesOnLength = new AtomicInteger(0);
    private final AtomicInteger totalProcessedInThreshold = new AtomicInteger(0);
    private final AtomicInteger totalMatches = new AtomicInteger(0);
    private static final DKULogger logger = DKULogger.getLogger("dku.shaker.fuzzy_join");

    public BlockSearchEngine(MemoryEquiJoiner.Parameter params) {
        super(params);
    }

    /**
     * Registers a candidate value, keyed by its simplified form.
     *
     * <p>If several values simplify to the same key, the last one populated replaces the
     * previous ones.
     *
     * @param s the original value to make searchable
     */
    @Override
    public void populate(String s) {
        String simplified = this.simplify(s);
        this.originalMapping.put(simplified, s);
    }

    /**
     * Chooses the block size from the mean length of a sample of the simplified keys, then
     * builds the block index over every populated key.
     *
     * <p>When nothing has been populated the mean length is taken as 0 instead of dividing
     * by zero, and the resulting index is empty.
     */
    @Override
    public void endPopulating() {
        final int estimationSampleSize = 500;
        int sum = 0;
        int count = 0;
        for (String key : this.originalMapping.keySet()) {
            sum += key.length();
            // The limit is tested after counting, so up to estimationSampleSize + 1 keys are measured.
            if (++count > estimationSampleSize) {
                break;
            }
        }
        // Guard against an empty mapping: sum / count would throw ArithmeticException.
        int meanLength = count == 0 ? 0 : sum / count;
        this.blockSize = BlockSearchEngine.chooseBlockSize(meanLength);
        logger.infoV("Finalizing BlockSearchEngine mappingSize=%d meanLength=%d blockSize=%d", new Object[]{this.originalMapping.size(), meanLength, this.blockSize});

        int processedEntries = 0;
        for (String simplified : this.originalMapping.keySet()) {
            ArrayList<WordType> tokens = new ArrayList<WordType>();
            WordType.blockTokenize(simplified, tokens, this.blockSize);
            for (WordType token : tokens) {
                String blockName = token.toString();
                Set<String> members = this.blocks.get(blockName);
                if (members == null) {
                    members = new HashSet<String>();
                    this.blocks.put(blockName, members);
                }
                members.add(simplified);
            }
            // Progress line on the 1st, 1001st, 2001st... entry, reporting the count after increment.
            if (processedEntries++ % 1000 == 0) {
                logger.infoV("Processed %d mapping entries - nbBlocks=%d", new Object[]{processedEntries, this.blocks.size()});
            }
        }
        logger.infoV("BlockEngine built nbBlocks=%d", new Object[]{this.blocks.size()});
    }

    /** Maps the sampled mean key length to a block size: 3, 4 or 5 for short keys, else ceil(sqrt(mean)). */
    private static int chooseBlockSize(int meanLength) {
        if (meanLength <= 10) {
            return 3;
        }
        if (meanLength <= 16) {
            return 4;
        }
        if (meanLength <= 25) {
            return 5;
        }
        return (int) Math.ceil(Math.sqrt(meanLength));
    }

    /**
     * Looks up the populated value closest to {@code search}.
     *
     * <p>The searched value is simplified and tokenized into blocks; every indexed key sharing
     * a block is a candidate. Candidates whose length differs from the simplified search by
     * more than {@code threshold} are skipped without computing a distance. Among the rest,
     * the first candidate encountered with the smallest Levenshtein distance not exceeding
     * {@code threshold} wins, and an exact match (distance 0) stops the scan.
     *
     * <p>Statistics are accumulated for every call, including exact matches. Every 100 rows
     * {@code FutureProgressState.checkInterrupt()} is invoked and every 50 rows a report is logged.
     *
     * @param search the value to look up
     * @return the original populated value of the best candidate, or {@code null} when no
     *         candidate is within the threshold
     * @throws InterruptedException declared by the signature; presumably raised by the
     *         periodic interrupt check (confirm against {@code FutureProgressState})
     */
    @Override
    public String search(String search) throws InterruptedException {
        int processedRows = this.totalProcessedRows.incrementAndGet();
        String preprocessed = this.simplify(search);
        int searchLength = preprocessed.length();
        int processedInputChars = this.totalProcessedInputChars.addAndGet(searchLength);

        ArrayList<WordType> tokens = new ArrayList<WordType>();
        WordType.blockTokenize(preprocessed, tokens, this.blockSize);
        int processedInputBlocks = this.totalProcessedInputBlocks.addAndGet(tokens.size());

        // De-duplicate block names so each block's candidates are visited once per search.
        HashSet<String> blocksToSearch = new HashSet<String>();
        for (WordType token : tokens) {
            blocksToSearch.add(token.toString());
        }

        int minD = this.threshold + 1;
        String res = null;
        int processedCandidateWords = 0;
        int avoidedDistancesOnLength = 0;
        int computedDistances = 0;
        int processedInThreshold = 0;

        scan:
        for (String blockName : blocksToSearch) {
            Set<String> block = this.blocks.get(blockName);
            if (block == null) {
                continue;
            }
            processedCandidateWords += block.size();
            for (String candidate : block) {
                // The edit distance is at least the length difference, so such candidates cannot qualify.
                int lengthDelta = candidate.length() - searchLength;
                if (lengthDelta > this.threshold || lengthDelta < -this.threshold) {
                    ++avoidedDistancesOnLength;
                    continue;
                }
                ++computedDistances;
                int d = WordType.levenshtein(candidate, preprocessed);
                if (d < minD) {
                    minD = d;
                    res = this.originalMapping.get(candidate);
                    ++processedInThreshold;
                }
                if (minD == 0) {
                    // Exact match: stop scanning but still fall through to the bookkeeping below,
                    // so counters, the interrupt check and the periodic report are not skipped.
                    break scan;
                }
            }
        }

        int totalProcessedCandidateWordsLoc = this.totalProcessedCandidateWords.addAndGet(processedCandidateWords);
        int totalAvoidedDistancesOnLengthLoc = this.totalAvoidedDistancesOnLength.addAndGet(avoidedDistancesOnLength);
        int totalComputedDistancesLoc = this.totalComputedDistances.addAndGet(computedDistances);
        int totalProcessedInThresholdLoc = this.totalProcessedInThreshold.addAndGet(processedInThreshold);
        int totalMatchesLoc = res != null ? this.totalMatches.incrementAndGet() : this.totalMatches.get();
        if (processedRows % 100 == 0) {
            FutureProgressState.checkInterrupt();
        }
        if (processedRows % 50 == 0) {
            logger.infoV("Fuzzy report rows=%d inputChars=%d inputBlocks=%d candidateWords=%d computedDistances=%d avoidedDistances=%d inThreshold=%d match=%d", new Object[]{processedRows, processedInputChars, processedInputBlocks, totalProcessedCandidateWordsLoc, totalComputedDistancesLoc, totalAvoidedDistancesOnLengthLoc, totalProcessedInThresholdLoc, totalMatchesLoc});
        }
        return res;
    }
}

