# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# -*- coding: utf-8 -*-
import re
import io
import pickle
import dataiku

from project_utils import load

# Project custom variables name the id column and the text column of the dataset.
custom_variables = dataiku.get_custom_variables()
id_label = custom_variables["id_label"]
text_label = custom_variables["text_label"]

# Folder that "ids.npy" is read from below.
embeddings = dataiku.Folder("P4SttKJS")

# Dataset rows, re-indexed by the id column so a row can be looked up by its id.
dataset = dataiku.Dataset("data")
df = dataset.get_dataframe().set_index(id_label)

# Folder that receives the serialized index at the end of the script.
exact_search_index = dataiku.Folder("evJsZfu6")

# Ids loaded through the project helper; each one is used as a label into
# ``df`` further down.
# NOTE(review): presumably a NumPy array given the ".npy" name -- confirm
# against project_utils.load.
corpus_ids = load(embeddings, "ids.npy")


def get_words(s):
    """Return the set of distinct, lower-cased words found in ``s``.

    A word is any match of the regex below: a run of regex "word" characters
    (letters, digits, underscore) delimited by word boundaries. Punctuation
    and whitespace therefore act as separators, and duplicates collapse
    because the result is a set.
    """
    return {match.group().lower() for match in re.finditer(r"\b\w+\b", s)}


# Inverted index: lower-cased word -> list of positions (into ``corpus_ids``)
# of the documents whose text contains that word. Positions are appended in
# increasing order, and a document contributes each word at most once because
# get_words returns a set.
text_by_id = df[text_label]  # pick the text column once instead of a full row per id
index = {}
for position, doc_id in enumerate(corpus_ids):
    for word in get_words(text_by_id.loc[doc_id]):
        # setdefault keeps ``index`` a plain dict, so the object pickled
        # afterwards has the same type as before.
        index.setdefault(word, []).append(position)

# Serialize the index in memory with the highest pickle protocol available,
# then store the bytes as "index.pickle" in the target folder.
payload = pickle.dumps(index, protocol=pickle.HIGHEST_PROTOCOL)
exact_search_index.upload_data("index.pickle", payload)
