/*
 * Decompiled with CFR 0.152.
 */
package com.dataiku.dip.shaker.mrimpl;

import com.dataiku.dip.DatasetDependency;
import com.dataiku.dip.ProcessorWithResourceFiles;
import com.dataiku.dip.cluster.ClusterSelector;
import com.dataiku.dip.cluster.HadoopSettings;
import com.dataiku.dip.code.CodeEnvResolutionService;
import com.dataiku.dip.contribs.PythonContrib;
import com.dataiku.dip.contribs.PythonContribService;
import com.dataiku.dip.coremodel.Dataset;
import com.dataiku.dip.coremodel.Schema;
import com.dataiku.dip.coremodel.SimpleKeyValue;
import com.dataiku.dip.dao.DatasetsDAO;
import com.dataiku.dip.dataflow.JobActivity;
import com.dataiku.dip.dataflow.JobAuthCtxService;
import com.dataiku.dip.dataflow.RecipeRunnableSubgraph;
import com.dataiku.dip.dataflow.exec.AbortableRecipeRunner;
import com.dataiku.dip.dataflow.exec.ActivityAbortedException;
import com.dataiku.dip.dataflow.exec.FlowRunnable;
import com.dataiku.dip.dataflow.exec.RecipeRunnerWithPayload;
import com.dataiku.dip.dataflow.graph.FlowDataset;
import com.dataiku.dip.dataflow.graph.FlowRecipe;
import com.dataiku.dip.dataflow.graph.utils.GraphUtils;
import com.dataiku.dip.dataflow.utils.FlowJobUtils;
import com.dataiku.dip.datalayer.ColumnFactory;
import com.dataiku.dip.datalayer.ProcessorOutput;
import com.dataiku.dip.datalayer.RowFactory;
import com.dataiku.dip.datalayer.SRPAdapter;
import com.dataiku.dip.datalayer.streamimpl.StreamColumnFactory;
import com.dataiku.dip.datalayer.streamimpl.StreamRowFactory;
import com.dataiku.dip.datasets.DatasetInspector;
import com.dataiku.dip.datasets.StreamableDatasetSelection;
import com.dataiku.dip.datasets.UniversalSingleThreadPusher;
import com.dataiku.dip.datasets.fs.HDFSDatasetHandler;
import com.dataiku.dip.export.ZipUnzipDir;
import com.dataiku.dip.fs.FSPath;
import com.dataiku.dip.input.DatasetHandlerFactory;
import com.dataiku.dip.meanings.MeaningsDAO;
import com.dataiku.dip.partitioning.FilePartitioner;
import com.dataiku.dip.partitioning.Partition;
import com.dataiku.dip.recipes.RecipeRegistry;
import com.dataiku.dip.recipes.RecipeRunner;
import com.dataiku.dip.recipes.shaker.ShakerRecipeParams;
import com.dataiku.dip.server.datasets.DatasetAccessService;
import com.dataiku.dip.server.recipes.ShakerRecipeService;
import com.dataiku.dip.shaker.ShakerUtils;
import com.dataiku.dip.shaker.model.ScriptStep;
import com.dataiku.dip.shaker.model.SerializedShakerScript;
import com.dataiku.dip.shaker.mrimpl.DiagnosticsHandler;
import com.dataiku.dip.shaker.mrimpl.ShakerMRJobRunner;
import com.dataiku.dip.shaker.mrimpl.formats.UniversalFileInputFormat;
import com.dataiku.dip.shaker.mrimpl.formats.UniversalFileOutputFormat;
import com.dataiku.dip.shaker.mrimpl.models.SerializedPythonContribs;
import com.dataiku.dip.shaker.mrimpl.models.SerializedThrowableWithContext;
import com.dataiku.dip.shaker.processors.BaseProcessorsFactory;
import com.dataiku.dip.shaker.resources.MapReduceYarnPythonBinResourcesGatherer;
import com.dataiku.dip.shaker.sampleio.SampleWriter;
import com.dataiku.dip.shaker.streamimpl.StreamPipelineFactory;
import com.dataiku.dip.util.AutoDelete;
import com.dataiku.dip.util.DatasetLocUtils;
import com.dataiku.dip.utils.ErrorContext;
import com.dataiku.dip.utils.JSON;
import com.dataiku.dip.variables.VariablesService;
import com.dataiku.dip.warnings.WarningsContext;
import com.google.common.base.Function;
import com.google.common.base.Preconditions;
import com.google.common.collect.FluentIterable;
import com.google.common.collect.Lists;
import java.io.File;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.Logger;
import org.springframework.beans.factory.annotation.Autowired;

public class ShakerMRRecipeRunner
implements FlowRunnable,
RecipeRunner,
AbortableRecipeRunner,
RecipeRunnerWithPayload {
    /**
     * Same text as the "dku.shaker.dependency." prefix under which init() registers each serialized
     * dependency dataset. NOTE(review): init() builds that key from a string literal, not from this constant.
     */
    public static final String DATASET_DEPENDENCY = "dku.shaker.dependency";
    /** Resource key under which init() registers the JSON file holding the serialized contributed Python processors. */
    public static final String CONTRIBUTED_PROCESSORS = "dku.shaker.contrib";
    /** Resource key under which init() registers the JSON file holding the meanings list. */
    public static final String USER_DEFINED_MEANINGS = "dku.shaker.udm";
    /** Not referenced in this class; presumably consumed by the YARN Python binaries resource gatherer - TODO confirm. */
    public static final String YARN_PYTHON_BINS = "dku.shaker.yarn.python.bins";
    // Injected collaborators (field injection).
    @Autowired
    private DatasetsDAO datasetsDAO;
    @Autowired
    private DatasetAccessService datasetAccessService;
    @Autowired
    private MeaningsDAO meaningsDAO;
    @Autowired
    private JobAuthCtxService authCtxService;
    // NOTE(review): "Sevice" is a typo for "Service"; the field is read in init().
    @Autowired
    private VariablesService variablesSevice;
    @Autowired
    private CodeEnvResolutionService codeEnvResolutionService;
    // Built as the last step of init(); read by run() and notifyBeforeAborting(). Null until init() completes.
    private volatile ShakerMRJobRunner runner;
    // Set to true once by notifyBeforeAborting(); checked at the start of run().
    private volatile boolean abortNotified;
    // Derived from the activity in the constructor.
    private final FlowRecipe recipe;
    private final JobActivity activity;
    private final RecipeRunnableSubgraph subgraph;
    // Output location details captured by init() and reused by run() to read back job diagnostics.
    private Path outputPath;
    private String outputRootPathURI;
    private List<SimpleKeyValue> outputFSExtraConf;
    // Parsed from the JSON payload in setPayload(); expanded with project variables in init().
    private SerializedShakerScript shakerScript;
    // Assigned in the static initializer at the bottom of the class.
    private static Logger logger;

    /**
     * Creates a runner bound to the given job activity.
     *
     * The activity's subgraph is cast to {@link RecipeRunnableSubgraph}, the recipe is taken
     * from that subgraph, and the activity status is initialized.
     *
     * @param activity the job activity this runner executes
     */
    public ShakerMRRecipeRunner(JobActivity activity) {
        RecipeRunnableSubgraph runnableSubgraph = (RecipeRunnableSubgraph)activity.getSubgraph();
        this.activity = activity;
        this.subgraph = runnableSubgraph;
        this.recipe = runnableSubgraph.getRecipe();
        activity.initStatus();
    }

    /**
     * Receives the recipe payload and parses it as the serialized shaker script.
     *
     * The raw payload is logged before any validation.
     *
     * @param payload JSON form of a {@link SerializedShakerScript}
     * @throws Error if {@code payload} is null
     */
    @Override
    public void setPayload(String payload) {
        logger.info((Object)("SET PAYLOAD " + payload));
        if (payload != null) {
            this.shakerScript = (SerializedShakerScript)JSON.parse((String)payload, SerializedShakerScript.class);
            return;
        }
        throw new Error("shaker script data not found for " + this.recipe.getName());
    }

    /**
     * Prepares everything the MapReduce job needs and builds the {@link ShakerMRJobRunner}.
     *
     * Steps, in order:
     * <ol>
     *   <li>expand the shaker script with the project variables;</li>
     *   <li>collect Hadoop configuration keys from the cluster settings and the recipe engine params;</li>
     *   <li>validate the main input dataset (format usable by {@link UniversalFileInputFormat}, stored on HDFS)
     *       and enumerate its input paths;</li>
     *   <li>validate the output dataset the same way, compute its output path and clear the target partition;</li>
     *   <li>gather resource files required by the processors, by additional dataset dependencies
     *       (serialized to temp files), by the Python binaries gatherer, plus the contributed Python
     *       processors and the meanings list;</li>
     *   <li>zip directory resources and build the key-to-filename resource mapping;</li>
     *   <li>instantiate the job runner.</li>
     * </ol>
     *
     * @throws RuntimeException if the input or output dataset format has no Hadoop format adapter, or if two
     *         resources end up with the same file name
     * @throws Exception on any failure of the underlying services; also whatever
     *         {@code ErrorContext.iaef} produces when a dataset is not stored on HDFS
     */
    @Override
    public void init() throws Exception {
        ShakerRecipeParams params = RecipeRegistry.getParamsAs(this.activity, ShakerRecipeParams.class);
        SerializedShakerScript expandedScript = this.shakerScript.expandedDeepCopy(this.variablesSevice.getForProject(this.recipe.getProjectKey()));
        ShakerUtils.warnDeprecatedPythonVersion(this.activity.warnContext, this.recipe.getProjectKey(), expandedScript.steps);
        HashMap<Object, File> allResourceFiles = new HashMap<Object, File>();
        // Element types of these two lists are dictated by the registerDependencies(...) helpers below.
        ArrayList resourceJars = Lists.newArrayList();
        ArrayList configKeys = Lists.newArrayList();
        HadoopSettings hadoopSettings = new ClusterSelector().selectForProject(this.authCtxService.getAuthCtx(), this.recipe.getProjectKey()).getHadoopSettings();
        configKeys.addAll(hadoopSettings.extraConf.getAsSimpleKeyValueList());
        configKeys.addAll(params.engineParams.hadoopConfigKeys);

        // ---- Main input dataset ----
        FlowDataset mainFlowDataset = this.subgraph.getSingleSourceDatasetForRole("main");
        Dataset mainInputDataset = mainFlowDataset.getMandatory(this.datasetsDAO);
        if (UniversalFileInputFormat.getFormatAdapter(mainInputDataset.getFormatParams()) == null) {
            throw new RuntimeException("The format of the input dataset " + mainInputDataset.getFullName() + " cannot be used on Hadoop (" + mainInputDataset.getFormatType() + "). Please uncheck \"Run on Hadoop\".");
        }
        UniversalFileInputFormat.registerDependencies(this.authCtxService.getAuthCtx(), mainInputDataset, configKeys, resourceJars);
        if (!DatasetInspector.canHDFS(mainInputDataset)) {
            throw ErrorContext.iaef((String)"The input dataset %s is not stored on HDFS. Please uncheck \"Run on Hadoop\".", (Object)mainInputDataset.getFullName(), (Object[])new Object[0]);
        }
        ArrayList<Path> mainInputPaths = new ArrayList<Path>();
        try (HDFSDatasetHandler handler = (HDFSDatasetHandler)DatasetHandlerFactory.build(this.authCtxService.getAuthCtx(), mainInputDataset)) {
            logger.info("Input HDFS root path for dataset " + mainFlowDataset.getFullName());
            if (mainInputDataset.getPartitioningSchema().isPartitioned()) {
                Preconditions.checkArgument(this.subgraph.getSourcePartitions(mainFlowDataset) != null, (Object)"Source partition is null");
                for (Partition partition : this.subgraph.getSourcePartitions(mainFlowDataset)) {
                    for (FSPath partitionFile : handler.enumeratePartition(partition, handler.getEnumerationSettings())) {
                        mainInputPaths.add(new Path(handler.getFullyQualifiedRootPath() + partitionFile.path()));
                    }
                }
            } else {
                for (FSPath datasetFile : handler.enumerateFilesystem()) {
                    mainInputPaths.add(new Path(handler.getFullyQualifiedRootPath() + datasetFile.path()));
                }
            }
        }
        for (Path inputPath : mainInputPaths) {
            logger.info("HDFS input path for " + mainFlowDataset.getFullName() + ": " + String.valueOf(inputPath));
        }

        // ---- Output dataset ----
        FlowDataset outputFDS = GraphUtils.getSingleTarget(this.subgraph.getRecipe());
        Dataset outputDataset = outputFDS.getMandatory(this.datasetsDAO);
        if (UniversalFileOutputFormat.getFormatAdapter(outputDataset.getFormatParams()) == null) {
            // Fixed: the message used to say "input dataset" although the output dataset is being validated.
            throw new RuntimeException("The format of the output dataset " + outputDataset.getFullName() + " cannot be used on Hadoop (" + outputDataset.getFormatType() + "). Please uncheck \"Run on Hadoop\".");
        }
        UniversalFileOutputFormat.registerDependencies(this.authCtxService.getAuthCtx(), outputDataset, configKeys, resourceJars);
        if (!DatasetInspector.canHDFS(outputDataset)) {
            throw ErrorContext.iaef((String)"The output dataset %s is not stored on HDFS. Please uncheck \"Run on Hadoop\".", (Object)outputDataset.getFullName(), (Object[])new Object[0]);
        }
        try (HDFSDatasetHandler handler = (HDFSDatasetHandler)DatasetHandlerFactory.build(this.authCtxService.getAuthCtx(), outputDataset)) {
            String partitionOut = FilePartitioner.computePartitionRelPathAsFolder(this.subgraph.getTargetPartition(outputFDS), outputDataset.getPartitioningSchema());
            // These three fields are read back by run() to locate the job diagnostics.
            this.outputRootPathURI = handler.getConnectionRootSchemeAndAuthority();
            this.outputFSExtraConf = Lists.newArrayList(handler.getFSExtraConf());
            this.outputPath = new Path(handler.getFullyQualifiedRootPath() + partitionOut);
            logger.info("HDFS output path for " + outputDataset.getFullName() + ": " + String.valueOf(this.outputPath));
            // The target partition is emptied before the job writes into it.
            handler.clearPartitions(Arrays.asList(this.subgraph.getTargetPartition(outputFDS)));
        }

        // ---- Resource files required by the processors themselves ----
        StreamPipelineFactory.StreamPipeline streamPipeline = StreamPipelineFactory.build(expandedScript.steps, BaseProcessorsFactory.PipelineContext.fakePipelineContext(), false);
        for (Object processor : streamPipeline.allProcessors) {
            if (processor instanceof ProcessorWithResourceFiles) {
                allResourceFiles.putAll(((ProcessorWithResourceFiles)processor).gatherRequirements());
            }
            // An SRPAdapter wraps another processor, which may itself need resource files.
            if (processor instanceof SRPAdapter) {
                Object wrapped = ((SRPAdapter)processor).getProcessor();
                if (wrapped instanceof ProcessorWithResourceFiles) {
                    allResourceFiles.putAll(((ProcessorWithResourceFiles)wrapped).gatherRequirements());
                }
            }
        }

        // ---- Additional datasets referenced by steps: serialize each one to a temp file ----
        List<ShakerRecipeService.StepDependency> additionalDeps = ShakerRecipeService.determineAdditionalDependencies(expandedScript.steps);
        List<DatasetDependency> mergedDeps = ShakerRecipeService.mergeDatasetDependencies((List<DatasetDependency>)FluentIterable.from(additionalDeps).transform((Function)new Function<ShakerRecipeService.StepDependency, DatasetDependency>(){

            public DatasetDependency apply(ShakerRecipeService.StepDependency stepDependency) {
                return stepDependency.dependency;
            }
        }).toList());
        for (DatasetDependency dep : mergedDeps) {
            Dataset dataset = this.datasetAccessService.getMandatory(DatasetLocUtils.resolveSmart(this.recipe.getProjectKey(), dep.datasetSM));
            logger.info("Gathering additional dataset " + dataset.getFullName());
            if (dep.columnNames != null) {
                logger.info("Keep only relevant columns : " + StringUtils.join(dep.columnNames, (String)", "));
            } else {
                logger.info("Keep only relevant columns : they are all required");
            }
            Schema schema = dataset.getSchema();
            if (dep.columnNames != null) {
                schema = schema.filter(dep.columnNames);
            }
            // NOTE(review): the AutoDelete temp files are not closed here, presumably because the job
            // still needs them after init() returns - confirm where they get cleaned up.
            AutoDelete tempDatafile = FlowJobUtils.getJobTempFile("shaker-mr", "stepdep_dataset_" + dep.datasetSM, "dss1");
            try (SampleWriter sw = new SampleWriter((File)tempDatafile, true)) {
                StreamColumnFactory cf = new StreamColumnFactory();
                StreamRowFactory rf = new StreamRowFactory();
                ProcessorOutput sampleOutput = sw.writeFromProcessor(schema, (ColumnFactory)cf);
                UniversalSingleThreadPusher.push(this.authCtxService.getAuthCtx(), dataset, StreamableDatasetSelection.full(), sampleOutput, (ColumnFactory)cf, (RowFactory)rf);
            }
            logger.info("Serialized it to file " + String.valueOf(tempDatafile) + ", size = " + tempDatafile.length());
            allResourceFiles.put("dku.shaker.dependency." + dep.datasetSM, (File)tempDatafile);
        }

        // ---- Python binaries, contributed Python processors and meanings ----
        allResourceFiles.putAll(new MapReduceYarnPythonBinResourcesGatherer(this.codeEnvResolutionService).dumpAndGather(expandedScript.steps, this.recipe.getProjectKey(), this.authCtxService.getAuthCtx(), logger));
        ArrayList<PythonContrib> contributedProcessors = new ArrayList<PythonContrib>(PythonContribService.getInstance().getContribs());
        logger.info("Serialized " + contributedProcessors.size() + " Python processor(s)");
        AutoDelete contribsFile = FlowJobUtils.getJobTempFile("shaker-mr", "contributed_python_processors", "json");
        JSON.prettyToFile((Object)new SerializedPythonContribs(contributedProcessors), (File)contribsFile);
        allResourceFiles.put(CONTRIBUTED_PROCESSORS, (File)contribsFile);
        AutoDelete meaningsDataFile = FlowJobUtils.getJobTempFile("shaker-mr", "meanings", "json");
        MeaningsDAO.MeaningsList ml = new MeaningsDAO.MeaningsList();
        ml.meanings = this.meaningsDAO.listUnsafe();
        JSON.prettyToFile((Object)ml, (File)meaningsDataFile);
        allResourceFiles.put(USER_DEFINED_MEANINGS, (File)meaningsDataFile);

        // ---- Build the list of files to ship and the resource-key -> file-name mapping ----
        HashMap<String, String> resourceMapping = new HashMap<String, String>();
        ArrayList<File> resourceFiles = new ArrayList<File>();
        for (Map.Entry<Object, File> entry : allResourceFiles.entrySet()) {
            String resourceKey = (String)entry.getKey();
            File resourceFile = entry.getValue();
            logger.info("Job required file " + resourceKey + " -> " + String.valueOf(resourceFile));
            File shippedFile = resourceFile;
            if (resourceFile.isDirectory()) {
                // A directory is zipped into a single file and the zip is shipped in its place.
                logger.info("Adding a directory -> zip it");
                AutoDelete zipFileNoExt = FlowJobUtils.getJobTempFile("shaker-mr", "shadoop-zip-" + resourceKey, "zip");
                shippedFile = new File(zipFileNoExt.getAbsolutePath() + ".zip");
                ZipUnzipDir.zipDirectory(resourceFile, shippedFile);
            }
            // Resources are looked up by bare file name, so two resources may not share a name.
            String filename = shippedFile.getName();
            if (resourceMapping.containsValue(filename)) {
                throw new RuntimeException("Each resource must be named differently (" + filename + ")");
            }
            resourceMapping.put(resourceKey, filename);
            resourceFiles.add(shippedFile);
        }

        // Goes through the class logger instead of raw System.out so it lands in the job log.
        logger.info("*************** START CONFIG ************");
        logger.info(JSON.log((Object)configKeys));
        this.runner = new ShakerMRJobRunner(this.authCtxService.getAuthCtx(), mainInputDataset, outputDataset, expandedScript, mainInputPaths, this.outputPath, resourceFiles, resourceJars, resourceMapping, configKeys, this.recipe.getProjectKey());
    }

    /**
     * Runs the MapReduce job built by {@link #init()}, then reports the diagnostics the tasks
     * left under the output path, whether the job succeeded or failed.
     *
     * When the job fails, its exception is always the one rethrown: a failure while reading the
     * diagnostics is logged and attached as a suppressed exception rather than replacing the root cause.
     *
     * @throws ActivityAbortedException if an abort was notified before the job started
     * @throws Exception whatever the job runner throws, or a diagnostics failure after a successful job
     */
    @Override
    public void run() throws Exception {
        if (this.abortNotified) {
            throw new ActivityAbortedException();
        }
        try {
            this.runner.run();
        }
        catch (Throwable jobFailure) {
            try {
                this.reportTaskDiagnostics();
            }
            catch (Exception diagnosticsFailure) {
                // Keep the job failure as the primary error; the diagnostics problem is secondary.
                logger.warn("Could not collect MR diagnostics after job failure", diagnosticsFailure);
                jobFailure.addSuppressed(diagnosticsFailure);
            }
            throw jobFailure;
        }
        this.reportTaskDiagnostics();
    }

    /**
     * Reads the warnings and task errors stored under the output location, merges the warnings into
     * the activity, logs every task error, and uses the first error as the activity status message.
     */
    private void reportTaskDiagnostics() throws Exception {
        WarningsContext mergedWarnings = DiagnosticsHandler.extractMergedWarnings(this.outputRootPathURI, this.outputFSExtraConf, this.outputPath);
        List<SerializedThrowableWithContext> taskErrors = DiagnosticsHandler.extractErrors(this.outputRootPathURI, this.outputFSExtraConf, this.outputPath);
        this.activity.warnContext.merge(mergedWarnings.getOutput());
        for (SerializedThrowableWithContext taskError : taskErrors) {
            logger.error("Error occurred in task attempt : " + taskError.attemptId + ":\n" + taskError.serializedThrowable.stack + "\n");
        }
        if (!taskErrors.isEmpty()) {
            this.activity.setStatusMessage("Failed : " + taskErrors.get(0).serializedThrowable.message);
        }
    }

    /**
     * Records that an abort was requested and forwards it to the MR job runner if one exists.
     *
     * Only the first call has any effect: the flag is tested and set while holding this object's
     * monitor, and the runner reference is captured in the same critical section. The runner itself
     * is notified outside the lock.
     */
    @Override
    public void notifyBeforeAborting() {
        final ShakerMRJobRunner activeRunner;
        synchronized (this) {
            if (this.abortNotified) {
                return;
            }
            this.abortNotified = true;
            activeRunner = this.runner;
        }
        if (activeRunner == null) {
            logger.info((Object)"MR job not initialized, nothing to abort");
            return;
        }
        logger.info((Object)"MR job initialized, notify abort");
        activeRunner.notifyAbort();
    }

    // Class initialization: triggers ScriptStep.loadClass() first, then creates the logger
    // shared by all instances under the "dku.shaker.mr" name.
    static {
        ScriptStep.loadClass();
        logger = Logger.getLogger("dku.shaker.mr");
    }
}

