from io import StringIO
from typing import List, Optional

try:
    from langchain_classic.docstore.document import Document
except ModuleNotFoundError:
    from langchain.docstore.document import Document
try:
    from langchain_core.document_loaders.base import BaseLoader
except ModuleNotFoundError:
    from langchain.document_loaders.base import BaseLoader

from markdown import Markdown


def unmark_element(element, stream=None): # type: ignore
    """Serialize an ElementTree-style element to plain text.

    Writes, in document order, the element's own text, the text of all of
    its descendants, and each visited element's tail (including the tail
    of ``element`` itself). Tags and attributes are dropped.

    Args:
        element: Node exposing ``text``, ``tail`` and iteration over its
            children (e.g. ``xml.etree.ElementTree.Element``).
        stream: Optional text buffer to append to. It must provide
            ``write`` and ``getvalue``; a new ``StringIO`` is created
            when omitted.

    Returns:
        The full contents of the buffer after ``element`` was written.
    """
    if stream is None:
        stream = StringIO()

    def _write(node):
        # Depth-first walk that only writes; the buffer is turned into a
        # string once at the end rather than after every nested element.
        if node.text:
            stream.write(node.text)
        for child in node:
            _write(child)
        if node.tail:
            stream.write(node.tail)

    _write(element)
    return stream.getvalue()


# Patch Python-Markdown with a "plain" output format: unmark_element is
# registered as the serializer for that format, so a converter built with
# output_format="plain" emits the document's text content without markup.
Markdown.output_formats["plain"] = unmark_element # type: ignore
# Shared module-level converter used by unmark().
__md = Markdown(output_format="plain") # type: ignore
# NOTE(review): presumably disabled because plain-text output has no wrapping
# top-level tag for convert() to strip -- confirm against Python-Markdown.
__md.stripTopLevelTags = False


def unmark(text: str) -> str:
    """Convert Markdown source to plain text with the markup removed.

    Args:
        text: Markdown-formatted source.

    Returns:
        The text content of the rendered document.
    """
    # The converter is a shared module-level instance. Python-Markdown keeps
    # per-document state on it (e.g. link reference definitions), so it must
    # be reset before each conversion to stop one document's state from
    # leaking into the next. reset() returns the instance itself.
    return __md.reset().convert(text)

class DkuWikiLoader(BaseLoader):
    """Load the wiki articles of a Dataiku project as LangChain documents.

    Each article becomes one ``Document`` whose ``page_content`` is the
    article body with Markdown markup removed and whose ``source``
    metadata names the article.
    """

    def __init__(self, project_key: Optional[str] = None):
        """Create a loader.

        Args:
            project_key: Key of the project whose wiki is read. When
                omitted (or empty), the client's default project is used.
        """
        self.project_key = project_key

    def load(self) -> List[Document]:
        """Fetch every wiki article of the project and convert it.

        Returns:
            One ``Document`` per wiki article, in the order returned by
            the wiki's article listing.
        """
        # Imported lazily so the module can be imported outside of a
        # Dataiku runtime.
        import dataiku

        client = dataiku.api_client()
        # Honour the configured project key; fall back to the default
        # project only when none was given.
        if self.project_key:
            project = client.get_project(self.project_key)
        else:
            project = client.get_default_project()

        wiki = project.get_wiki()

        documents = []
        for article in wiki.list_articles():
            article_data = article.get_data()
            # Guard in case an article has no body: convert an empty string
            # rather than passing None to the Markdown converter.
            body = article_data.get_body() or ""
            documents.append(
                Document(
                    page_content=unmark(body),
                    metadata={"source": "Wiki article %s" % article_data.get_name()},
                )
            )

        return documents