# -*- coding: utf-8 -*-
import dataiku
import numpy as np
import io
from sentence_transformers import SentenceTransformer

# Number of texts handed to the encoder per call in the encoding loop.
BATCH_SIZE = 16

# Sentence-embedding model, loaded by its published model name.
model = SentenceTransformer(
    'sentence-transformers/paraphrase-mpnet-base-v2'
)

# Rows to embed; the encoding step reads the `text` column of this dataframe.
df = dataiku.Dataset("train").get_dataframe()

# Dataiku folder handle that receives the serialized embedding matrix.
folder = dataiku.Folder("GuB6FLVF")

# Encode a throwaway empty string purely to discover the embedding width,
# then allocate one float32 row per dataframe row. `np.empty` leaves the
# contents uninitialized; every row is overwritten by the encoding loop.
probe_vector = model.encode("")
dim_embeddings = int(probe_vector.shape[0])
row_count = len(df)
emb = np.empty(shape=(row_count, dim_embeddings), dtype=np.float32)

# Encode the `text` column in consecutive slices of at most BATCH_SIZE rows,
# writing each batch of vectors into the matching rows of `emb`.
n_rows = len(df)
for start in range(0, n_rows, BATCH_SIZE):
    # The final batch may be shorter than BATCH_SIZE.
    stop = min(start + BATCH_SIZE, n_rows)
    batch_texts = list(df.text.iloc[start:stop])
    emb[start:stop, :] = model.encode(batch_texts)

# Serialize the embedding matrix in NumPy's .npy format into an in-memory
# buffer and upload the resulting bytes to the folder as "embeddings.npy".
# The context manager closes the buffer even if saving or uploading raises,
# which the previous manual `buf.close()` did not guarantee.
with io.BytesIO() as buf:
    np.save(buf, emb)
    folder.upload_data("embeddings.npy", buf.getvalue())