/*
 * Decompiled with CFR 0.152.
 */
package com.dataiku.dip.input.formats;

import com.dataiku.dip.ApplicationConfigurator;
import com.dataiku.dip.DKUApp;
import com.dataiku.dip.coremodel.Dataset;
import com.dataiku.dip.coremodel.FormatParams;
import com.dataiku.dip.datasets.DatasetInspector;
import com.dataiku.dip.datasets.fs.FSDatasetUtils;
import com.dataiku.dip.exceptions.CodedException;
import com.dataiku.dip.exceptions.DKUSecurityException;
import com.dataiku.dip.formats.FormatFactory;
import com.dataiku.dip.formats.FormatMeta;
import com.dataiku.dip.formats.avro.AvroFormatDetector;
import com.dataiku.dip.formats.excel.ExcelFormatDetector;
import com.dataiku.dip.formats.excel.ExcelFormatExtractor;
import com.dataiku.dip.formats.geo.GeoJSONFormatExtractor;
import com.dataiku.dip.formats.geo.ShapefileFormatExtractor;
import com.dataiku.dip.formats.yxdb.AlteryxDBFormatDetector;
import com.dataiku.dip.fs.FSPath;
import com.dataiku.dip.input.formats.BinaryHeader;
import com.dataiku.dip.input.formats.FixedWidthFormatDetector;
import com.dataiku.dip.input.formats.FixedWidthFormatExtractor;
import com.dataiku.dip.input.formats.JSONFormatExtractor;
import com.dataiku.dip.input.formats.LineFormatExtractor;
import com.dataiku.dip.input.formats.LineOrientedInputSample;
import com.dataiku.dip.input.formats.MiscFormatsDetector;
import com.dataiku.dip.input.formats.MySQLDumpFormatExtractor;
import com.dataiku.dip.input.formats.RegexpBasedFormats;
import com.dataiku.dip.input.formats.SASFormatDetector;
import com.dataiku.dip.input.formats.SASFormatExtractor;
import com.dataiku.dip.input.formats.XmlFormatDetector;
import com.dataiku.dip.input.formats.csv.CSVFormatConfig;
import com.dataiku.dip.input.formats.csv.CSVFormatExtractor;
import com.dataiku.dip.input.formats.csv.CSVInputFormatDetector;
import com.dataiku.dip.input.formats.hive.orcfile.ORCFileFormatDetector;
import com.dataiku.dip.input.formats.hive.rcfile.RCFileFormatDetector;
import com.dataiku.dip.input.formats.hive.sequencefile.SequenceFileFormatDetector;
import com.dataiku.dip.input.formats.parquet.ParquetFormatDetector;
import com.dataiku.dip.input.formats.parquet.ParquetFormatMeta;
import com.dataiku.dip.input.stream.EnrichedInputStream;
import com.dataiku.dip.io.StupidBoundedBufferedReader;
import com.dataiku.dip.utils.DKULogger;
import com.dataiku.dss.shadelib.org.apache.commons.io.ByteOrderMark;
import com.dataiku.dss.shadelib.org.apache.commons.io.IOUtils;
import com.dataiku.dss.shadelib.org.apache.commons.io.input.BOMInputStream;
import com.google.common.collect.Lists;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import org.apache.commons.lang.StringUtils;
import org.mozilla.universalchardet.Constants;
import org.mozilla.universalchardet.UniversalDetector;

public class InputFormatsDetector {
    /** Line-length limit, in characters, for lines read while sampling a file. */
    public static final int MAX_REASONABLE_LINE_LENGTH = 300000;
    /** Fallback number of bytes fetched from the head of a file to detect its charset. */
    private static final int DEFAULT_BYTES_TO_FETCH_TO_DETECT_CHARSET = 128000;
    /** Fallback number of bytes fetched from the head of a file to validate a candidate charset. */
    private static final int DEFAULT_BYTES_TO_FETCH_TO_VALIDATE_CHARSET = 1000000;
    /** Shared logger of the detector; never reassigned, hence final. */
    private static final DKULogger logger = DKULogger.getLogger("dku.input.detect");

    /**
     * Detects the candidate formats of a split by inspecting its first path only.
     *
     * @param dataset dataset the split belongs to
     * @param split   split whose first path is inspected
     * @return the detected candidates, in the order the detectors produced them
     * @throws Exception if the underlying detection fails
     */
    public static List<FormatWithMetadata> detectFormat(Dataset dataset, FSDatasetUtils.PathsBasedSplit split) throws Exception {
        List<FormatWithMetadata> detected = new ArrayList<>();
        FSPath head = split.getPaths().get(0);
        String headName = head.path();
        detectOnFile(dataset, split, head, headName, detected);
        return detected;
    }

    /**
     * Runs detection for a single, named format on the first path of the split.
     *
     * @param dataset dataset the split belongs to
     * @param split   split whose first path is inspected
     * @param name    identifier of the format to detect
     * @return the result of the per-file detection for that format
     * @throws Exception if the underlying detection fails
     */
    public static FormatWithMetadata detectOneFormat(Dataset dataset, FSDatasetUtils.PathsBasedSplit split, String name) throws Exception {
        FSPath head = split.getPaths().get(0);
        String headName = head.path();
        return detectOneOnFile(dataset, split, head, name, headName);
    }

    /**
     * Recomputes the detection metadata of an already detected format, using the first path of the split.
     *
     * @param autodetectedFormatId identifier of the previously detected format
     * @param dataset              dataset the split belongs to
     * @param split                split whose first path is inspected
     * @return the metadata produced by the per-file recomputation
     * @throws Exception if the underlying recomputation fails
     */
    public static Map<String, String> recomputeMetadata(String autodetectedFormatId, Dataset dataset, FSDatasetUtils.PathsBasedSplit split) throws Exception {
        FSPath head = split.getPaths().get(0);
        String headName = head.path();
        return recomputeMetadataOnFile(autodetectedFormatId, dataset, split, head, headName);
    }

    /**
     * Reads up to 50 lines from the stream, decoding them strictly with the given charset.
     *
     * <p>The decoder is configured to report malformed and unmappable input, so bytes that are not
     * valid in {@code charset} surface as an exception instead of being replaced. Reading stops early
     * at end of stream or right after the bounded reader reports a huge line.
     *
     * @param is      stream to sample; it is closed when sampling ends
     * @param charset name of the charset to decode with
     * @return the sample, carrying the charset used, the lines read and the huge-line flag
     * @throws IOException if reading or decoding fails
     */
    public static LineOrientedInputSample getLinesSampleWithCharset(InputStream is, String charset) throws IOException {
        final int maxSampledLines = 50;

        LineOrientedInputSample result = new LineOrientedInputSample();
        result.usedCharset = charset;

        CharsetDecoder strictDecoder = Charset.forName(charset).newDecoder();
        strictDecoder.onMalformedInput(CodingErrorAction.REPORT);
        strictDecoder.onUnmappableCharacter(CodingErrorAction.REPORT);

        // UTF-8 and UTF-16 inputs go through a BOMInputStream wrapper; other charsets are read raw.
        InputStream source = is;
        if (com.dataiku.dip.utils.StringUtils.isUtf8(charset)) {
            source = new BOMInputStream(is);
        } else if (com.dataiku.dip.utils.StringUtils.isUtf16(charset)) {
            source = new BOMInputStream(is, false, new ByteOrderMark[]{ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_16LE});
        }

        InputStreamReader decoded = new InputStreamReader(source, strictDecoder);
        try (StupidBoundedBufferedReader bounded = new StupidBoundedBufferedReader(MAX_REASONABLE_LINE_LENGTH, new BufferedReader(decoded))) {
            int linesRead = 0;
            while (linesRead < maxSampledLines) {
                String rawLine = bounded.readLine();
                result.hadHugeLines |= bounded.hadHugeLine();
                if (rawLine == null) {
                    break;
                }
                // The raw line goes into the concatenated text; the per-line list gets it without CR/LF.
                result.allLines.append(rawLine);
                result.allLines.append('\n');
                String withoutLf = StringUtils.replace(rawLine, "\n", "");
                result.lines.add(StringUtils.replace(withoutLf, "\r", ""));
                if (bounded.hadHugeLine()) {
                    break;
                }
                linesRead++;
            }
            if (result.hadHugeLines) {
                logger.info("Huge lines detected");
            }
        }
        return result;
    }

    /**
     * Builds a line-oriented sample of a file, trying several charsets until one decodes cleanly.
     *
     * <p>The candidates are {@code utf-8} and {@code iso-8859-15}; when a charset is detected from the
     * file content it is tried first. A candidate that raises a {@link CharacterCodingException} is
     * skipped. When the sample was read as {@code windows-1252} but contains NUL characters, a read
     * with {@code utf-16} is attempted and preferred if it succeeds.
     *
     * @param split split giving access to the file
     * @param path  file to sample
     * @return the sample read with the first charset that worked
     * @throws IOException if no candidate charset can decode the sample
     * @throws Exception   if accessing the file fails
     */
    public static LineOrientedInputSample getLineBasedSample(FSDatasetUtils.PathsBasedSplit split, FSPath path) throws Exception {
        logger.info("Getting dataset sample");
        // Typed list: the raw ArrayList used before does not type-check in the String for-each below.
        List<String> charsetsToTest = Lists.newArrayList("utf-8", "iso-8859-15");
        String detectedCharset = InputFormatsDetector.detectCharset(split, path);
        if (detectedCharset != null) {
            logger.info("Detected charset " + detectedCharset);
            // Move the detected charset to the front without duplicating it.
            charsetsToTest.remove(detectedCharset);
            charsetsToTest.add(0, detectedCharset);
        } else {
            logger.info("No valid charset detected");
        }
        for (String charset : charsetsToTest) {
            logger.info("Trying to get sample with charset " + charset);
            try {
                LineOrientedInputSample sample = InputFormatsDetector.tryReadWithCharset(split, path, charset);
                if ("windows-1252".equals(charset) && InputFormatsDetector.hasNullCharacters(sample)) {
                    logger.info("Sample can be read with windows-1252 but looks like utf-16. Trying utf-16.");
                    try {
                        LineOrientedInputSample utf16Sample = InputFormatsDetector.tryReadWithCharset(split, path, "utf-16");
                        logger.info("Detected charset utf-16, which seems better than windows-1252");
                        return utf16Sample;
                    }
                    catch (CharacterCodingException e) {
                        logger.info("Cannot read sample using utf-16. Keeping windows-1252", e);
                    }
                }
                return sample;
            }
            catch (CharacterCodingException e) {
                logger.info("Charset is bad, going to next", e);
            }
        }
        throw new IOException("Failed to read input using detectable charsets, please manually configure");
    }

    /**
     * Tells whether the sampled lines contain at least two NUL characters in total.
     *
     * @param sample sample whose lines are scanned
     * @return {@code true} as soon as a second NUL character is met, {@code false} otherwise
     */
    private static boolean hasNullCharacters(LineOrientedInputSample sample) {
        int nullsSeen = 0;
        for (String line : sample.lines) {
            int at = line.indexOf('\u0000');
            while (at >= 0) {
                nullsSeen++;
                if (nullsSeen >= 2) {
                    return true;
                }
                at = line.indexOf('\u0000', at + 1);
            }
        }
        return false;
    }

    /**
     * Appends an element to a list unless the element is {@code null}.
     *
     * @param list  list to append to
     * @param maybe element to append, ignored when {@code null}
     * @param <T>   element type
     */
    private static <T> void addIfNotNull(List<T> list, T maybe) {
        if (maybe == null) {
            return;
        }
        list.add(maybe);
    }

    /**
     * Samples the decompressed head of a file using one specific charset.
     *
     * <p>The number of bytes fetched comes from the
     * {@code dku.datasets.detection.bytesToFetchForCharsetValidation} parameter, with a fallback of 1000000.
     *
     * @param split   split giving access to the file
     * @param path    file to sample
     * @param charset charset to decode with
     * @return the line sample read with that charset
     * @throws IOException if reading or decoding fails
     */
    private static LineOrientedInputSample tryReadWithCharset(FSDatasetUtils.PathsBasedSplit split, FSPath path, String charset) throws InterruptedException, CodedException, DKUSecurityException, IOException {
        int headBytes = ApplicationConfigurator.getParams().getIntParam("dku.datasets.detection.bytesToFetchForCharsetValidation", Integer.valueOf(DEFAULT_BYTES_TO_FETCH_TO_VALIDATE_CHARSET));
        try (InputStream head = split.getStreamForPath(path).decompressedHeadStream((long)headBytes)) {
            return InputFormatsDetector.getLinesSampleWithCharset(head, charset);
        }
    }

    /*
     * Enabled aggressive block sorting
     * Enabled unnecessary exception pruning
     * Enabled aggressive exception aggregation
     */
    /**
     * Guesses the charset of a file from the decompressed head of its content.
     *
     * <p>The number of bytes fetched comes from the {@code dku.datasets.detection.bytesToFetchForCharset}
     * parameter, with a fallback of 128000. A read error in the middle of the stream is logged and the
     * guess is made from the data already seen; a failure to open or close the stream is logged and
     * yields {@code null}.
     *
     * @param split split giving access to the file
     * @param path  file to inspect
     * @return the lower-cased name of the detected charset, or {@code null} when nothing usable was detected
     */
    private static String detectCharset(FSDatasetUtils.PathsBasedSplit split, FSPath path) throws InterruptedException, CodedException, DKUSecurityException {
        int headBytes = ApplicationConfigurator.getParams().getIntParam("dku.datasets.detection.bytesToFetchForCharset", Integer.valueOf(DEFAULT_BYTES_TO_FETCH_TO_DETECT_CHARSET));
        try (InputStream head = split.getStreamForPath(path).decompressedHeadStream((long)headBytes)) {
            UniversalDetector sniffer = new UniversalDetector();
            byte[] chunk = new byte[4096];
            try {
                // Read first, then ask the detector whether it is done: same ordering as a
                // "read && !isDone" loop condition.
                for (int n = head.read(chunk); n > 0 && !sniffer.isDone(); n = head.read(chunk)) {
                    sniffer.handleData(chunk, 0, n);
                }
            }
            catch (IOException e) {
                logger.warn("Error while reading stream while detecting charset. Using data already read to figure out the best charset.", e);
            }
            sniffer.dataEnd();

            String found = sniffer.getDetectedCharset();
            // NOTE(review): these three detector results are discarded, presumably because they
            // cannot be used for decoding afterwards - confirm.
            boolean unusable = found == null
                    || Constants.CHARSET_HZ_GB_2312.equals(found)
                    || Constants.CHARSET_X_ISO_10646_UCS_4_3412.equals(found)
                    || Constants.CHARSET_X_ISO_10646_UCS_4_2143.equals(found);
            if (unusable) {
                return null;
            }
            return found.toLowerCase(Locale.ROOT);
        }
        catch (IOException e) {
            logger.warn("Unable to read data to detecting charset.", e);
            return null;
        }
    }

    /**
     * Detects the candidate formats of one file and appends them to {@code candidates}.
     *
     * <p>The file name selects a dedicated detector when its extension is recognized; any other file
     * goes through content-based detection.
     *
     * @param dataset    dataset the file belongs to
     * @param split      split giving access to the file
     * @param path       file to inspect
     * @param filename   name used to guess the format family
     * @param candidates list receiving the detected formats
     * @throws Exception if a detector fails
     */
    private static void detectOnFile(Dataset dataset, FSDatasetUtils.PathsBasedSplit split, FSPath path, String filename, List<FormatWithMetadata> candidates) throws Exception {
        FileFormat hinted = InputFormatsDetector.guessFormatFromFilename(filename);
        switch (hinted) {
            case EXCEL:
                candidates.addAll(new ExcelFormatDetector().detect(dataset, split, path, filename));
                return;
            case SQL_DUMP:
                byte[] head = InputFormatsDetector.getByteSample(split, path);
                InputFormatsDetector.addIfNotNull(candidates, MiscFormatsDetector.detectMySQLDump(head, filename));
                return;
            case SAS:
                candidates.addAll(new SASFormatDetector().detect(dataset, split, path, filename));
                return;
            case ALTERYX_DB:
                candidates.addAll(new AlteryxDBFormatDetector().detect(dataset, split, path, filename));
                return;
            case SHAPE:
                candidates.add(new FormatWithMetadata(ShapefileFormatExtractor.META.getType(), new ShapefileFormatExtractor.Config(), 400));
                return;
            case GEO:
                candidates.add(new FormatWithMetadata(GeoJSONFormatExtractor.META.getType(), new GeoJSONFormatExtractor.Config(), 400));
                return;
            case XML:
                candidates.addAll(new XmlFormatDetector().detect(dataset, split, path, filename));
                return;
            default:
                InputFormatsDetector.detectFromContent(dataset, split, path, filename, candidates);
                return;
        }
    }

    /**
     * Content-based detection used when the file name does not point to a dedicated detector.
     *
     * <p>Text detectors run on a line sample, binary detectors on the first bytes of the file. When
     * the list is still empty afterwards, a one-record-per-line format using the sample charset is
     * added as fallback.
     */
    private static void detectFromContent(Dataset dataset, FSDatasetUtils.PathsBasedSplit split, FSPath path, String filename, List<FormatWithMetadata> candidates) throws Exception {
        // Text-oriented detectors, all fed with the same line sample.
        LineOrientedInputSample sample = InputFormatsDetector.getLineBasedSample(split, path);
        InputFormatsDetector.addIfNotNull(candidates, MiscFormatsDetector.detectJSON(sample, filename));
        candidates.addAll(new CSVInputFormatDetector().detect(sample, filename));
        candidates.addAll(new FixedWidthFormatDetector().detect(sample, filename));
        candidates.addAll(new RegexpBasedFormats().detect(sample));
        candidates.addAll(new ParquetFormatDetector().detect(dataset, split));

        // Binary detectors, gated on what the dataset supports.
        BinaryHeader header = InputFormatsDetector.getBinaryHeader(split, path);
        if (DatasetInspector.supportsORC(dataset)) {
            candidates.addAll(new ORCFileFormatDetector().detect(header));
        }
        if (DatasetInspector.supportsParquet(dataset)) {
            candidates.addAll(new ParquetFormatDetector().detect(header));
        }
        if (DatasetInspector.canHDFS(dataset)) {
            candidates.addAll(new RCFileFormatDetector().detect(header));
            candidates.addAll(new SequenceFileFormatDetector().detect(header));
        }
        candidates.addAll(new AvroFormatDetector().detect(header));

        if (!candidates.isEmpty()) {
            return;
        }

        // Nothing matched: fall back to the line format.
        FormatWithMetadata oneLineFormat = new FormatWithMetadata(LineFormatExtractor.META.getType());
        LineFormatExtractor.Config lineConfig = new LineFormatExtractor.Config();
        lineConfig.charset = sample.usedCharset;
        oneLineFormat.params = lineConfig;
        InputFormatsDetector.setDefaultFormatParamsOnDetect(oneLineFormat.params);
        candidates.add(oneLineFormat);
    }

    /**
     * Reads the first bytes (at most 100) of the decompressed file into a {@link BinaryHeader}.
     *
     * <p>The size of the head stream opened on the file comes from the
     * {@code dku.datasets.detection.bytesToFetchForBinaryHeader} parameter, with a fallback of 10000.
     * A single {@code read} call may deliver fewer bytes than are available, so reading is repeated
     * until the buffer is full or the stream ends; otherwise the header could be truncated.
     *
     * @param split split giving access to the file
     * @param path  file to read
     * @return the header; its bytes are left unset when the stream is empty
     * @throws IOException if reading fails
     */
    private static BinaryHeader getBinaryHeader(FSDatasetUtils.PathsBasedSplit split, FSPath path) throws InterruptedException, CodedException, DKUSecurityException, IOException {
        final int targetSize = 100;
        int bytesToFetchForBinaryHeader = ApplicationConfigurator.getParams().getIntParam("dku.datasets.detection.bytesToFetchForBinaryHeader", Integer.valueOf(10000));
        EnrichedInputStream eis = split.getStreamForPath(path);
        try (InputStream is = eis.decompressedHeadStream((long)bytesToFetchForBinaryHeader)) {
            BinaryHeader header = new BinaryHeader();
            byte[] buffer = new byte[targetSize];
            int readBytes = is.read(buffer);
            if (readBytes != -1) {
                // Top up after a short read; stop at end of stream or if the stream makes no progress.
                while (readBytes < targetSize) {
                    int more = is.read(buffer, readBytes, targetSize - readBytes);
                    if (more <= 0) {
                        break;
                    }
                    readBytes += more;
                }
                header.bytes = Arrays.copyOfRange(buffer, 0, readBytes);
            }
            return header;
        }
    }

    /**
     * Runs the detector of one given format on a file.
     *
     * <p>When the format has a dedicated detector and it yields a result, the first result is
     * returned. In every other case except the MySQL dump one, a descriptor with a fresh default
     * parameter object for the format is returned.
     *
     * @param dataset  dataset the file belongs to
     * @param split    split giving access to the file
     * @param path     file to inspect
     * @param format   identifier of the format to detect
     * @param filename name of the file, passed to the detectors
     * @return the detected format, or default parameters for it
     * @throws Exception if a detector or the parameter instantiation fails
     */
    private static FormatWithMetadata detectOneOnFile(Dataset dataset, FSDatasetUtils.PathsBasedSplit split, FSPath path, String format, String filename) throws Exception {
        FormatMeta<?, ?> meta = FormatFactory.getMeta(format);
        if (meta == CSVFormatExtractor.META) {
            LineOrientedInputSample lines = InputFormatsDetector.getLineBasedSample(split, path);
            List<FormatWithMetadata> found = new CSVInputFormatDetector().detect(lines, filename);
            if (!found.isEmpty()) {
                return found.get(0);
            }
        } else if (meta == ExcelFormatExtractor.META) {
            List<FormatWithMetadata> found = new ExcelFormatDetector().detect(dataset, split, path, filename);
            if (!found.isEmpty()) {
                return found.get(0);
            }
        } else if (meta == ParquetFormatMeta.META) {
            List<FormatWithMetadata> found = new ParquetFormatDetector().detect(dataset, split);
            if (!found.isEmpty()) {
                return found.get(0);
            }
        } else if (meta == SASFormatExtractor.META) {
            List<FormatWithMetadata> found = new SASFormatDetector().detect(dataset, split, path, filename);
            if (!found.isEmpty()) {
                return found.get(0);
            }
        } else if (meta == FixedWidthFormatExtractor.META) {
            LineOrientedInputSample lines = InputFormatsDetector.getLineBasedSample(split, path);
            List<FormatWithMetadata> found = new FixedWidthFormatDetector().detect(lines, filename);
            if (!found.isEmpty()) {
                return found.get(0);
            }
        } else if (meta == MySQLDumpFormatExtractor.META) {
            // NOTE(review): unlike the other branches this returns the detector result as is, even if
            // it is null (other code guards this detector with addIfNotNull) - confirm this is intended.
            byte[] head = InputFormatsDetector.getByteSample(split, path);
            return MiscFormatsDetector.detectMySQLDump(head, filename);
        } else if (meta == JSONFormatExtractor.META) {
            LineOrientedInputSample lines = InputFormatsDetector.getLineBasedSample(split, path);
            FormatWithMetadata json = MiscFormatsDetector.detectJSON(lines, filename);
            if (json != null) {
                return json;
            }
        }

        // No dedicated detector, or it found nothing: hand back default parameters.
        logger.info("detect-one-format: returning default params for " + meta.getType());
        FormatWithMetadata mwd = new FormatWithMetadata(meta.getType());
        mwd.params = meta.paramsClass().newInstance();
        InputFormatsDetector.setDefaultFormatParamsOnDetect(mwd.params);
        return mwd;
    }

    /**
     * Fills in size limits that detection left unset on CSV and line format parameters.
     *
     * <p>A CSV configuration without a row limit gets the {@code dku.input.formats.csv.maxRowChars}
     * parameter (fallback 100000000); a line configuration without a line limit gets
     * {@code dku.input.formats.line.maxLineChars} (fallback 500000000). Other parameter types are
     * left untouched.
     *
     * @param params parameters to complete in place
     */
    private static void setDefaultFormatParamsOnDetect(FormatParams params) {
        if (params instanceof CSVFormatConfig) {
            CSVFormatConfig csv = (CSVFormatConfig)params;
            if (csv.getMaxRowChars() == null) {
                csv.setMaxRowChars(DKUApp.getParams().getIntParam("dku.input.formats.csv.maxRowChars", Integer.valueOf(100000000)));
            }
        }
        if (params instanceof LineFormatExtractor.Config) {
            LineFormatExtractor.Config line = (LineFormatExtractor.Config)params;
            if (line.getMaxLineChars() == null) {
                line.setMaxLineChars(DKUApp.getParams().getIntParam("dku.input.formats.line.maxLineChars", Integer.valueOf(500000000)));
            }
        }
    }

    /*
     * WARNING - Removed try catching itself - possible behaviour change.
     */
    /**
     * Reads a decompressed head stream of the file, opened with a limit of 10000, fully into memory.
     *
     * <p>The stream is closed quietly in all cases, so a failure while closing never reaches the caller.
     *
     * @param split split giving access to the file
     * @param path  file to read
     * @return the bytes read from the head stream
     * @throws Exception if opening or reading the stream fails
     */
    private static byte[] getByteSample(FSDatasetUtils.PathsBasedSplit split, FSPath path) throws Exception {
        InputStream head = null;
        try {
            EnrichedInputStream enriched = split.getStreamForPath(path);
            head = enriched.decompressedHeadStream(10000L);
            return IOUtils.toByteArray(head);
        }
        finally {
            logger.info("Closing sample stream");
            IOUtils.closeQuietly(head);
            logger.info("Closed sample stream");
        }
    }

    /**
     * Recomputes the detection metadata of a file for an already detected format.
     *
     * <p>Excel, XML, SAS and Alteryx files are delegated to their dedicated detector. Any other file
     * is sampled as text and handed to the CSV detector, then to the fixed-width detector when the
     * CSV one returns {@code null}.
     *
     * @param autoDetectedFormatId identifier of the previously detected format
     * @param dataset              dataset the file belongs to
     * @param split                split giving access to the file
     * @param path                 file to inspect
     * @param filename             name used to guess the format family
     * @return the recomputed metadata, as returned by the selected detector
     * @throws Exception if a detector fails
     */
    private static Map<String, String> recomputeMetadataOnFile(String autoDetectedFormatId, Dataset dataset, FSDatasetUtils.PathsBasedSplit split, FSPath path, String filename) throws Exception {
        FileFormat hinted = InputFormatsDetector.guessFormatFromFilename(filename);
        switch (hinted) {
            case EXCEL:
                return new ExcelFormatDetector().recomputeMetadata(autoDetectedFormatId, dataset, split, path, filename);
            case XML:
                return new XmlFormatDetector().recomputeMetadata(autoDetectedFormatId, dataset, split, path, filename);
            case SAS:
                return new SASFormatDetector().recomputeMetadata(autoDetectedFormatId, dataset, split, path, filename);
            case ALTERYX_DB:
                return new AlteryxDBFormatDetector().recomputeMetadata(autoDetectedFormatId, dataset, split, path, filename);
            default:
                break;
        }

        LineOrientedInputSample lines = InputFormatsDetector.getLineBasedSample(split, path);
        Map<String, String> csvMetadata = new CSVInputFormatDetector().recomputeMetadata(autoDetectedFormatId, lines, filename);
        return csvMetadata != null
                ? csvMetadata
                : new FixedWidthFormatDetector().recomputeMetadata(autoDetectedFormatId, lines, filename);
    }

    /**
     * Maps a file name to a format family by looking at its suffix, ignoring case.
     *
     * <p>Families are checked in a fixed order and the first one with a matching suffix wins.
     *
     * @param filename name of the file
     * @return the matching family, or {@link FileFormat#OTHER} when no suffix matches
     */
    private static FileFormat guessFormatFromFilename(String filename) {
        String lowered = filename.toLowerCase(Locale.ROOT);

        // Parallel tables: suffixesByFamily[i] holds the suffixes that select families[i].
        FileFormat[] families = {
            FileFormat.EXCEL,
            FileFormat.SQL_DUMP,
            FileFormat.SAS,
            FileFormat.ALTERYX_DB,
            FileFormat.SHAPE,
            FileFormat.GEO,
            FileFormat.XML
        };
        String[][] suffixesByFamily = {
            {".xlsx", ".xls", ".xltx", ".xlsm", ".xlt", ".xltm", ".xlsb"},
            {".sql", ".dump", ".dmp"},
            {".sas7bdat"},
            {".yxdb"},
            {".shp", "shp.zip"},
            {".geojson"},
            {".xml", ".xml.zip", ".xml.gz"}
        };

        for (int i = 0; i < families.length; i++) {
            if (com.dataiku.dip.utils.StringUtils.endsWithAny(lowered, suffixesByFamily[i])) {
                return families[i];
            }
        }
        return FileFormat.OTHER;
    }

    /**
     * A detected format: its type identifier, its parameters, free-form metadata and a detection score.
     */
    public static class FormatWithMetadata {
        /** Identifier of the format type. */
        public String type;
        /** Parameters of the format; {@code null} unless set. */
        public FormatParams params;
        /** Detection metadata; starts as an empty, mutable map. */
        public Map<String, String> metadata = new HashMap<>();
        /** Score attached to this detection result. */
        public double detectionScore;

        /** Creates an empty descriptor. */
        public FormatWithMetadata() {
        }

        /** Creates a descriptor for a type, without parameters and with a score of zero. */
        public FormatWithMetadata(String type) {
            this(type, null, 0);
        }

        /** Creates a descriptor for a type with a score, without parameters. */
        public FormatWithMetadata(String type, int detectionScore) {
            this(type, null, detectionScore);
        }

        /** Creates a descriptor for a type with its parameters and score. */
        public FormatWithMetadata(String type, FormatParams params, int detectionScore) {
            this.type = type;
            this.params = params;
            this.detectionScore = detectionScore;
        }
    }

    private static enum FileFormat {
        EXCEL,
        SQL_DUMP,
        SAS,
        ALTERYX_DB,
        XML,
        GEO,
        SHAPE,
        OTHER;

    }
}

