/*
 * Decompiled with CFR 0.152.
 */
package com.dataiku.dip.shaker.text;

import com.dataiku.dip.shaker.processors.expr.WordType;
import com.dataiku.dip.shaker.text.Clusterer;
import com.dataiku.dip.shaker.text.StringNormalizer;
import com.dataiku.dip.utils.DKULogger;
import com.google.common.util.concurrent.ThreadFactoryBuilder;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;

public class SetBasedClusterer
extends Clusterer {
    Map<Set<WordType>, Set<String>> originalStrings = new HashMap<Set<WordType>, Set<String>>();
    Map<String, Set<Set<WordType>>> blocks = new HashMap<String, Set<Set<WordType>>>();
    Set<Set<WordType>> identicalBags = new HashSet<Set<WordType>>();
    boolean multithreaded;
    private static DKULogger logger = DKULogger.getLogger((String)"dip.shaker.analysis.clusterer");

    public float jaccard(Set<WordType> a, Set<WordType> b) {
        float commonElements = 0.0f;
        for (WordType t : a) {
            if (!b.contains(t)) continue;
            commonElements += 1.0f;
        }
        return commonElements / ((float)(a.size() + b.size()) - commonElements);
    }

    public SetBasedClusterer(boolean multithreaded) {
        this.multithreaded = multithreaded;
    }

    @Override
    public void populate(String s) {
        HashSet bag = new HashSet();
        WordType.tokenize((String)StringNormalizer.normalize((String)s).toUpperCase().toLowerCase(), bag);
        Set<String> correspondingStrings = this.originalStrings.get(bag);
        if (correspondingStrings == null) {
            correspondingStrings = new HashSet<String>();
            correspondingStrings.add(s);
            this.originalStrings.put(bag, correspondingStrings);
        } else {
            correspondingStrings.add(s);
            this.identicalBags.add(bag);
        }
        for (WordType wt : bag) {
            String ss = wt.toString();
            Set<Set<WordType>> l = this.blocks.get(ss);
            if (l == null) {
                l = new HashSet<Set<WordType>>();
                this.blocks.put(ss, l);
            }
            l.add(bag);
        }
    }

    @Override
    public void endPopulate() {
        Iterator<Set<WordType>> i = this.identicalBags.iterator();
        while (i.hasNext()) {
            Set<String> from = this.originalStrings.get(i.next());
            if (from.size() != 1) continue;
            i.remove();
        }
    }

    @Override
    public List<Set<String>> getClusters(float radius, int timeOut) {
        ArrayList<BlockEvaluator> evaluators;
        ArrayList<Map> clusterMaps;
        ExecutorService executor;
        if (radius < 0.0f) {
            logger.error((Object)"Error: radius < 0");
            return null;
        }
        HashSet clusters = new HashSet();
        for (Set<WordType> bag : this.identicalBags) {
            HashSet finalCluster = new HashSet();
            finalCluster.addAll(this.originalStrings.get(bag));
            clusters.add(finalCluster);
        }
        if (this.multithreaded) {
            int cores = Runtime.getRuntime().availableProcessors();
            int threadSizeLimit = (1000 - clusters.size()) / cores;
            executor = Executors.newFixedThreadPool(cores, new ThreadFactoryBuilder().setNameFormat("SetBasedCluster-%d").build());
            int size = this.blocks.size();
            int range = size / cores + 1;
            clusterMaps = new ArrayList(cores);
            evaluators = new ArrayList(cores);
            for (int i = 0; i < cores; ++i) {
                int rangeStart = range * i;
                int rangeEnd = range * (i + 1);
                if (rangeEnd > size) {
                    rangeEnd = size;
                }
                evaluators.add(new BlockEvaluator(new ArrayList<Set<Set<WordType>>>(this.blocks.values()), radius, rangeStart, rangeEnd, threadSizeLimit));
            }
        } else {
            executor = Executors.newSingleThreadExecutor(new ThreadFactoryBuilder().setNameFormat("SetBasedCluster-%d").build());
            clusterMaps = new ArrayList<Map>(1);
            evaluators = new ArrayList<BlockEvaluator>(1);
            evaluators.add(new BlockEvaluator(new ArrayList<Set<Set<WordType>>>(this.blocks.values()), radius, 0, this.blocks.size(), 1000 - clusters.size()));
        }
        try {
            List futures = executor.invokeAll(evaluators, timeOut, TimeUnit.SECONDS);
            int i = 0;
            for (Future future : futures) {
                if (future.isCancelled()) {
                    clusterMaps.add(((BlockEvaluator)evaluators.get(i)).getResults());
                    this.timedOut = true;
                } else {
                    try {
                        clusterMaps.add((Map)future.get());
                    }
                    catch (Exception e) {
                        this.timedOut = true;
                    }
                }
                ++i;
            }
        }
        catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            logger.error((Object)"clusterer has been interrupted", (Throwable)e);
        }
        executor.shutdown();
        for (Map cluster_map : clusterMaps) {
            for (Set cluster : cluster_map.values()) {
                HashSet finalCluster = new HashSet();
                for (Set bag : cluster) {
                    finalCluster.addAll(this.originalStrings.get(bag));
                }
                clusters.add(finalCluster);
            }
        }
        ArrayList<Set<String>> sortedClusters = new ArrayList<Set<String>>(clusters);
        Collections.sort(sortedClusters, new Clusterer.SizeComparator());
        return sortedClusters;
    }

    public class BlockEvaluator
    implements Callable<Map<Set<WordType>, Set<Set<WordType>>>> {
        int start;
        int stop;
        int maxSize;
        float radius;
        List<Set<Set<WordType>>> blocks;
        Map<Set<WordType>, Set<Set<WordType>>> clusterMap = new HashMap<Set<WordType>, Set<Set<WordType>>>();

        public BlockEvaluator(List<Set<Set<WordType>>> blocks, float radius, int start, int stop, int maxSize) {
            this.blocks = blocks;
            this.start = start;
            this.stop = stop;
            this.radius = radius;
            this.maxSize = maxSize;
        }

        public Map<Set<WordType>, Set<Set<WordType>>> getResults() {
            return this.clusterMap;
        }

        @Override
        public Map<Set<WordType>, Set<Set<WordType>>> call() {
            Thread.currentThread().setName("dss-clusterer");
            for (int i = this.start; i < this.stop && this.clusterMap.size() <= this.maxSize; ++i) {
                Set<Set<WordType>> set = this.blocks.get(i);
                if (set.size() < 2) continue;
                Iterator<Set<WordType>> iter = set.iterator();
                while (iter.hasNext() && !SetBasedClusterer.this.timedOut) {
                    Set<WordType> a = iter.next();
                    for (Set<WordType> b : set) {
                        Set<Object> l;
                        float d;
                        if (a == b || this.clusterMap.containsKey(a) && this.clusterMap.get(a).contains(b) || this.clusterMap.containsKey(b) && this.clusterMap.get(b).contains(a) || !((d = SetBasedClusterer.this.jaccard(a, b)) >= this.radius)) continue;
                        if (!this.clusterMap.containsKey(a)) {
                            l = new HashSet<Set<WordType>>();
                            l.add(a);
                            this.clusterMap.put(a, l);
                        } else {
                            l = this.clusterMap.get(a);
                        }
                        l.add(b);
                    }
                }
            }
            if (SetBasedClusterer.this.timedOut) {
                logger.debug((Object)("Ended and timed out : cluster size " + this.clusterMap.size()));
            } else {
                logger.debug((Object)("Ended naturally : cluster size " + this.clusterMap.size()));
            }
            return this.clusterMap;
        }
    }
}

