# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# -*- coding: utf-8 -*-
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu
import io

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Input location: path of the source file and id of the Dataiku folder holding it
file = "/Products.txt"
folder_id = "Dh3OkheU"
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Function that downloads a tab-separated .txt file from a Dataiku folder and converts it to a dataframe
def convert_files(folder_id: str, file: str) -> pd.DataFrame:
    """Load a tab-separated text file from a Dataiku folder into a dataframe.

    Args:
        folder_id: Identifier passed to ``dataiku.Folder``.
        file: Path of the file inside the folder (e.g. ``'/Products.txt'``).

    Returns:
        A dataframe whose column names come from the first line of the file
        and whose columns are all read as strings. Lines that pandas
        considers bad (too many fields) are left out of the result.
    """
    folder = dataiku.Folder(folder_id)
    # Read the whole file into memory so pandas is handed an in-memory binary buffer
    with folder.get_download_stream(file) as stream:
        raw_bytes = stream.read()

    # 'warn' skips bad lines exactly like 'skip' did, but reports each one so
    # dropped rows show up in the log instead of disappearing silently.
    return pd.read_csv(
        io.BytesIO(raw_bytes),
        sep="\t",
        on_bad_lines="warn",
        header=0,
        dtype=str,
    )
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Use function convert_files to load the .txt file into a dataframe
products_df = convert_files(folder_id, file)
# Keep only the first row (in file order) for each ApplNo.
# drop_duplicates replaces groupby(['ApplNo']).nth(0), whose result depends on the
# pandas version: before 2.0 it moves ApplNo into the index (so the column is no
# longer among the dataframe's columns) and sorts by key; from 2.0 on it keeps the
# column and the original row order. This form gives the 2.0+ result everywhere.
# Rows with a missing ApplNo are dropped first because groupby excludes NaN keys
# by default.
products_df = (
    products_df.dropna(subset=["ApplNo"])
    .drop_duplicates(subset=["ApplNo"], keep="first")
)
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Write recipe outputs: store products_df in the "FDA_products_input" dataset.
# NOTE(review): write_with_schema presumably also sets the dataset schema from the
# dataframe's columns — confirm against the Dataiku API documentation.
FDA_products_input = dataiku.Dataset("FDA_products_input")
FDA_products_input.write_with_schema(products_df)