/*
 * Decompiled with CFR 0.152.
 */
package com.dataiku.dip.recipes.nlp.extract_content;

import com.dataiku.dip.coremodel.Dataset;
import com.dataiku.dip.coremodel.Schema;
import com.dataiku.dip.coremodel.SchemaColumn;
import com.dataiku.dip.coremodel.SerializedRecipe;
import com.dataiku.dip.dataflow.JobActivity;
import com.dataiku.dip.dataflow.RecipeRunnableSubgraph;
import com.dataiku.dip.dataflow.exec.stream.ToDatasetStreamer;
import com.dataiku.dip.dataflow.graph.FlowDataset;
import com.dataiku.dip.dataflow.streaming.DatasetWriter;
import com.dataiku.dip.datalayer.Column;
import com.dataiku.dip.datalayer.ColumnFactory;
import com.dataiku.dip.datalayer.ProcessorOutput;
import com.dataiku.dip.datalayer.Row;
import com.dataiku.dip.datalayer.streamimpl.StreamColumnFactory;
import com.dataiku.dip.datalayer.streamimpl.StreamRowFactory;
import com.dataiku.dip.datasets.Type;
import com.dataiku.dip.docextraction.common.InputRefs;
import com.dataiku.dip.output.Output;
import com.dataiku.dip.partitioning.Partition;
import com.dataiku.dip.recipes.nlp.common.doc_extraction.ExtractedData;
import com.dataiku.dip.recipes.nlp.common.doc_extraction.ExtractedDataWriter;
import com.dataiku.dip.recipes.nlp.common.doc_extraction.chunks.SingleExtractedChunk;
import com.dataiku.dip.security.AuthCtx;
import com.google.gson.Gson;
import java.nio.file.Paths;
import java.util.Set;

/**
 * {@link ExtractedDataWriter} that emits every extracted chunk as one row of an
 * output dataset.
 *
 * <p>Rows are pushed to a {@link ProcessorOutput} obtained from a
 * {@link ToDatasetStreamer} created in the constructor. The columns written are
 * the ones listed by {@link #getMainOutputSchema()}.
 *
 * <p>{@link #writeAppend} is {@code synchronized}; {@link #close()} is not, so
 * callers are expected to invoke it once all writes are done
 * (NOTE(review): confirm against the callers of the parent class).
 */
public class ExtractedDataDatasetWriter extends ExtractedDataWriter {
    // Output column names (package-private so sibling classes can refer to them).
    static final String SOURCE_FILEPATH_COLUMN = "source_file";
    static final String EXTRACTION_UNIT_INDEX_COLUMN = "index_in_file";
    static final String PAGE_RANGE_COLUMN = "page_range";
    static final String SECTION_OUTLINE_COLUMN = "section_outline";
    static final String EXTRACTED_CONTENT_COLUMN = "extracted_content";
    static final String JSON_CONTENT_COLUMN = "structured_content";
    static final String EXTRACTION_ENGINE_COLUMN = "extraction_engine";
    static final String EXTRACTION_UNIT_ID_COLUMN = "dku_content_id";

    private final StreamColumnFactory cf = new StreamColumnFactory();
    private final StreamRowFactory rf = new StreamRowFactory();
    // Assigned exactly once in the constructor; final for safe publication.
    private final ProcessorOutput outputProcessor;
    // Used to serialize the section outline of a chunk to JSON.
    private final Gson gson = new Gson();

    /**
     * Opens a streamer on the output dataset.
     *
     * <p>The target is the partition that {@code subgraph} reports for its single
     * target dataset of role {@code "main"}. The write mode declared on
     * {@code datasetRO} is passed through
     * {@link DatasetWriter#getWriteModeAfterDatasetCleanup} and the returned mode
     * is the one handed to the streamer.
     *
     * @param datasetRO     recipe output descriptor providing the requested write mode
     * @param outputDataset dataset the rows are written to
     * @param authCtx       authentication context forwarded to the dataset layer
     * @param subgraph      subgraph used to resolve the target dataset and partition
     * @param activity      job activity whose {@code warnContext} is given to the streamer
     * @throws Exception if any of the underlying calls fails
     */
    public ExtractedDataDatasetWriter(SerializedRecipe.RecipeOutput datasetRO, Dataset outputDataset, AuthCtx authCtx, RecipeRunnableSubgraph subgraph, JobActivity activity) throws Exception {
        Output.WriteMode requestedMode = datasetRO.getWriteMode();
        FlowDataset flowDataset = subgraph.getSingleTargetDatasetForRole("main");
        Partition partition = subgraph.getTargetPartition(flowDataset);
        Output.WriteMode effectiveMode = DatasetWriter.getWriteModeAfterDatasetCleanup(outputDataset, partition, requestedMode, authCtx);
        ToDatasetStreamer dsStreamer = ToDatasetStreamer.newWithAutoBucketing(authCtx, outputDataset, partition, (ColumnFactory)this.cf, activity.warnContext, effectiveMode);
        this.outputProcessor = dsStreamer.getAsOutput();
    }

    /**
     * Emits one row per chunk of {@code extractedData}, in list order.
     *
     * @param document      not used by this implementation; the source path is read
     *                      from each chunk's metadata instead
     * @param extractedData chunks to write and the engine that produced them
     * @throws RuntimeException wrapping any exception raised while emitting a row
     */
    @Override
    protected synchronized void writeAppend(InputRefs.ManagedFolderDocumentRefWithMetadata document, ExtractedData extractedData) {
        for (int rowIndex = 0; rowIndex < extractedData.chunks.size(); ++rowIndex) {
            Row row = this.buildRow(this.rf, this.cf, extractedData.chunks.get(rowIndex), extractedData.extractorEngine, rowIndex);
            try {
                this.outputProcessor.emitRow(row);
            } catch (Exception e) {
                throw new RuntimeException("Failed to write output dataset row: ", e);
            }
        }
    }

    /**
     * Signals to the output processor that the last row has been emitted.
     *
     * @throws Exception if the output processor fails
     */
    public void close() throws Exception {
        this.outputProcessor.lastRowEmitted();
    }

    /**
     * Builds the schema of the main output dataset.
     *
     * <p>All columns are strings except the index column, which is an int. The
     * numeric third constructor argument used for some columns is presumably a
     * maximum length (NOTE(review): confirm against {@link SchemaColumn}).
     *
     * @return a new schema holding the eight output columns, in write order
     */
    public static Schema getMainOutputSchema() {
        Schema outputSchema = new Schema();
        outputSchema.addColumn(new SchemaColumn(SOURCE_FILEPATH_COLUMN, Type.STRING));
        outputSchema.addColumn(new SchemaColumn(EXTRACTION_UNIT_INDEX_COLUMN, Type.INT));
        outputSchema.addColumn(new SchemaColumn(PAGE_RANGE_COLUMN, Type.STRING, 25));
        outputSchema.addColumn(new SchemaColumn(SECTION_OUTLINE_COLUMN, Type.STRING));
        outputSchema.addColumn(new SchemaColumn(EXTRACTED_CONTENT_COLUMN, Type.STRING));
        outputSchema.addColumn(new SchemaColumn(JSON_CONTENT_COLUMN, Type.STRING));
        outputSchema.addColumn(new SchemaColumn(EXTRACTION_ENGINE_COLUMN, Type.STRING, 30));
        outputSchema.addColumn(new SchemaColumn(EXTRACTION_UNIT_ID_COLUMN, Type.STRING, 45));
        return outputSchema;
    }

    /**
     * Converts a single extracted chunk into an output row.
     *
     * @param rf              factory used to create the row
     * @param cf              factory used to resolve columns by name
     * @param chunk           chunk to convert
     * @param extractorEngine engine whose display name is recorded on the row
     * @param rowIndex        zero-based position of the chunk; stored one-based
     * @return the populated row; the page range and section outline cells are
     *         only set when the chunk metadata provides them
     */
    public Row buildRow(StreamRowFactory rf, StreamColumnFactory cf, SingleExtractedChunk chunk, ExtractedData.ExtractorEngine extractorEngine, int rowIndex) {
        Row row = rf.row();

        // Round-trip through Path so the stored value uses the platform's path syntax.
        String documentPath = Paths.get(chunk.extractedMetadata.sourceFile.path).toString();
        row.put((Column)cf.column(SOURCE_FILEPATH_COLUMN), documentPath);
        // The index column is one-based.
        row.put((Column)cf.column(EXTRACTION_UNIT_INDEX_COLUMN), rowIndex + 1);

        if (chunk.extractedMetadata.pageRange != null) {
            row.put((Column)cf.column(PAGE_RANGE_COLUMN), chunk.extractedMetadata.pageRange.toString());
        }
        if (chunk.extractedMetadata.sectionOutline != null) {
            // The outline is stored as its JSON serialization.
            row.put((Column)cf.column(SECTION_OUTLINE_COLUMN), this.gson.toJson(chunk.extractedMetadata.sectionOutline));
        }

        row.put((Column)cf.column(EXTRACTED_CONTENT_COLUMN), chunk.embedValue);
        row.put((Column)cf.column(JSON_CONTENT_COLUMN), chunk.json);
        row.put((Column)cf.column(EXTRACTION_ENGINE_COLUMN), extractorEngine.displayName);
        row.put((Column)cf.column(EXTRACTION_UNIT_ID_COLUMN), chunk.uuid);
        return row;
    }

    /**
     * Intentionally does nothing in this writer.
     *
     * @param chunksIdsToDelete ignored
     */
    @Override
    public void dumpOutdatedChunksUuids(Set<String> chunksIdsToDelete) {
        // No-op for the dataset writer.
    }
}

