# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# -*- coding: utf-8 -*-
import dataiku
import io
import pandas as pd
from bs4 import BeautifulSoup
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter,
    HTMLHeaderTextSplitter,
)

# Identifier of the Dataiku folder that the chunking loop below reads from.
# NOTE(review): presumably holds the scraped HTML pages (they are decoded as
# UTF-8 and parsed as HTML further down) — confirm against the flow.
SOURCE_FOLDER_ID = "o87Qdwlh"
folder = dataiku.Folder(SOURCE_FOLDER_ID)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Heading levels used as split points: h1 through h6. Each entry is a
# (tag, metadata key) pair; the key is what the chunking loop later looks up
# in every document's metadata, and it is deliberately the tag name itself.
heading_tags = [f"h{level}" for level in range(1, 7)]
headers = list(zip(heading_tags, heading_tags))

# First pass: cut each page at its headings, keeping the text under a heading
# together rather than emitting one document per HTML element.
html_splitter = HTMLHeaderTextSplitter(
    headers_to_split_on=headers,
    return_each_element=False,
)

# Second pass: cap the size of each heading section, with overlap between
# consecutive chunks so context is not lost at the cut points.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Column-oriented accumulator: both lists grow in lockstep (one entry per
# chunk) so the dict can be turned into a two-column DataFrame afterwards.
d = {"chunk": [], "url": []}

for path in folder.list_paths_in_partition():
    # Rebuild the page URL from the stored path: drop the first character and
    # turn every "=" into "/".
    # NOTE(review): assumes paths start with "/" and that "=" stood in for "/"
    # when the pages were saved — confirm against whatever wrote the folder.
    url = "https://" + path[1:].replace("=", "/")

    # Read the whole payload once; the stream is not needed after decoding.
    with folder.get_download_stream(path) as stream:
        s = stream.read().decode("utf-8")

    # Split at headings first, then cap the size of each resulting section.
    result = text_splitter.split_documents(html_splitter.split_text(s))

    # Plugin pages get their <title> prepended to every chunk header. The title
    # is the same for all chunks of a page, so it is resolved once per page.
    title = ""
    if "/plugins/" in url:
        title_tag = BeautifulSoup(s, "html.parser").find("title")
        # `find` returns None when there is no <title>, and `.string` is None
        # when the tag contains nested markup; in both cases fall back to no
        # title instead of aborting the whole run on one malformed page.
        if title_tag is not None and title_tag.string is not None:
            title = title_tag.string.replace(" | Dataiku", "")

    for doc in result:
        # Breadcrumb of the headings this chunk sits under, outermost first,
        # with "#" characters removed from the heading text.
        header = " > ".join(
            doc.metadata[key].replace("#", "").strip()
            for _tag, key in headers
            if doc.metadata.get(key) is not None
        )
        # Joining only the non-empty parts avoids a dangling " > " when either
        # the title or the breadcrumb is missing.
        header = " > ".join(part for part in (title, header) if part)

        d["chunk"].append((header + "\n\n" + doc.page_content).strip())
        d["url"].append(url)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Build the output table from the accumulated columns and give every chunk a
# sequential integer id (0..n-1, in accumulation order) before writing it to
# the "chunks" dataset.
df = pd.DataFrame(d).assign(chunk_id=lambda frame: range(len(frame)))
chunks_dataset = dataiku.Dataset("chunks")
chunks_dataset.write_with_schema(df)
