# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# -*- coding: utf-8 -*-
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu
import csv, gzip

# Recipe input: the "Pathways" folder that holds the gene-set files.
raw = dataiku.Folder("Pathways")
files = raw.list_paths_in_partition()

# Use the first listed path whose name contains 'wiki'; indexing [0] raises
# IndexError when nothing matches.
wiki = [entry for entry in files if 'wiki' in entry][0]
gmt = raw.file_path(wiki)

def read_gmt(path):
    """Lazily parse a tab-separated GMT gene-set file.

    Each non-empty row is read as ``name <TAB> description <TAB> gene ...``.
    Paths ending in ``.gz`` are opened through ``gzip`` in text mode; anything
    else is opened as a plain text file. Text is decoded with the platform
    default encoding.

    Args:
        path: Filesystem path of the GMT file (optionally gzip-compressed).

    Yields:
        ``(name, description, genes)`` tuples, where ``genes`` is the set of
        all fields after the second column (empty when the row has none).

    Raises:
        IndexError: If a non-empty row has fewer than two fields.
    """
    opener = gzip.open if path.endswith('.gz') else open

    # The context manager guarantees the handle is released even when the
    # consumer stops iterating early or a malformed row raises; a trailing
    # close() call is only reached if the generator runs to exhaustion.
    # newline='' lets the csv module do its own line-ending handling.
    with opener(path, 'rt', newline='') as handle:
        for row in csv.reader(handle, delimiter='\t'):
            if not row:
                # Blank lines carry no gene set.
                continue
            yield row[0], row[1], set(row[2:])

# Parse the 'wiki' GMT file: one DataFrame row per gene set.
gen = read_gmt(gmt)
wiki_df = pd.DataFrame(data=gen, columns=['name', 'description', 'genes'])

# Same treatment for the first listed path containing 'Commons'; indexing [0]
# raises IndexError when nothing matches.
comms = [entry for entry in files if 'Commons' in entry][0]
pc = raw.file_path(comms)
pgen = read_gmt(pc)
pf = pd.DataFrame(data=pgen, columns=['url', 'data', 'genes'])

# Write recipe outputs
# wiki_df (columns: name, description, genes) goes to the "wiki_pathways" dataset.
# NOTE(review): the 'genes' column holds Python set objects -- confirm the
# output dataset stores them in the intended form.
wiki_pathways = dataiku.Dataset("wiki_pathways")
wiki_pathways.write_with_schema(wiki_df)

# pf (columns: url, data, genes) goes to the "all_pathways" dataset.
allpaths = dataiku.Dataset("all_pathways")
allpaths.write_with_schema(pf)