# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# -*- coding: utf-8 -*-
import logging
import requests
from urllib.parse import urljoin, urlsplit
import base64
import re

import dataiku

from bs4 import BeautifulSoup
from markdownify import markdownify as md

# Folder that receives the generated Markdown files (one per scraped page).
folder = dataiku.Folder("vElSoRUz")

# Crawl seeds: the `url` column of the "list_plugins" dataset, followed by the
# three documentation roots.
to_explore = [
    *dataiku.Dataset("list_plugins").get_dataframe().url,
    "https://doc.dataiku.com/dss/latest/",
    "https://knowledge.dataiku.com/latest/",
    "https://developer.dataiku.com/latest/",
]

# Every URL that has already been queued, so no page is visited twice.
urls = set(to_explore)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# In-scope pages: ".html" documents below the "latest" tree of the doc,
# knowledge and developer sites. Every dot of the host name is escaped so that
# look-alike hosts are rejected. Group 1 ends at the last ".html", which drops
# any trailing "#fragment".
pattern_href = re.compile(r"(https://(doc|knowledge|developer)\.dataiku\.com/(dss/)?latest/.*\.html)")
def get_url(href):
    """
    Return the URL if it's in the scope of the web scraping

    :param href: absolute URL of a link found on a page
    :return: the URL truncated after its last ".html" when it is in scope,
        None otherwise (including everything under the developer API reference)
    """
    # The API reference of the developer site is deliberately left out.
    if "https://developer.dataiku.com/latest/api-reference/" in href:
        return None
    m = pattern_href.match(href)
    return None if m is None else m.group(1)

def remove_links(soup, url):
    """
    Strip every hyperlink from the soup, keeping only heading permalinks

    A permalink is an anchor whose whole text is the marker character ("#" on
    developer.dataiku.com, "¶" elsewhere). Its text is normalised to "¶" and a
    relative target is made absolute against `url`. Any other anchor is
    replaced by its own content. The soup is modified in place.
    """
    marker = "#" if "developer.dataiku.com" in url else "¶"
    for anchor in soup.find_all('a', href=True):
        if anchor.text == marker:
            anchor.string.replace_with("¶")
            target = anchor['href']
            if not target.startswith('http'):
                anchor['href'] = urljoin(url, target)
            continue
        # Regular link: keep the content, drop the <a> tag itself
        anchor.unwrap()

def remove_all_links(soup):
    """
    Replace every anchor carrying an href by its own content, in place
    """
    anchors = soup.find_all('a', href=True)
    for anchor in anchors:
        anchor.unwrap()
                
def html2markdown(html):
    """
    Turn an HTML string into Markdown (images stripped, ATX "#" headings)

    Returns an empty string when the converter raises a TypeError.
    """
    try:
        converted = md(html, strip=["img"], heading_style="ATX")
    except TypeError:
        return ""
    else:
        return converted

# Two line breaks separated by nothing but whitespace
pattern_sequential_white_spaces = re.compile(r"\n\s*\n")
def strip(s):
    """
    Trim the string and collapse each run of blank lines into one line break
    """
    trimmed = s.strip()
    return re.sub(pattern_sequential_white_spaces, "\n", trimmed)

def soup2markdown(soup, url):
    """
    Convert a BeautifulSoup soup in Markdown format

    The children of `soup` are visited recursively:

    * headings, paragraphs, lists, tables, code blocks and <dt> elements are
      converted with `html2markdown`;
    * <dl> elements are either flattened (class "class" or "method") or turned
      into "* term: definition" bullets;
    * any other tag is only traversed.

    :param soup: BeautifulSoup tag whose children are converted
    :param url: URL of the page, used as permalink target for headings that
        have none
    :return: list of Markdown strings, each passed through `strip`
    """
    headings = ["h1", "h2", "h3", "h4", "h5", "h6"]
    result = []
    for element in soup:
        if element.name is None:
            # Nodes without a tag name (e.g. bare text) are ignored at this level
            continue
        if element.name in headings + ["p", "ul", "table", "pre", "dt"]:
            markdown = html2markdown(str(element))
            if element.name in headings and "[¶](" not in markdown:
                # Headings without a permalink point to the page itself
                markdown = markdown.strip() + f"[¶]({url})"
            if "\n" in markdown:
                if element.name in ["ul", "table", "pre"]:
                    # Line structure matters here: one entry per line
                    result += markdown.split("\n")
                elif element.name == "dt":
                    result.append("".join(markdown.split("\n")))
                else:
                    result.append(" ".join(markdown.split("\n")))
            else:
                result.append(markdown)
        elif element.name in ["dl"]:
            classes = element.get("class") or []
            if "class" in classes or "method" in classes:
                # Flatten the definition list and drop links and permalink markers
                remove_all_links(element)
                result += [x.replace("¶", "") for x in soup2markdown(element, url)]
            else:
                for line in element.select("dt,dd"):
                    text = html2markdown(str(line))
                    if line.name == "dt":
                        result.append(f"* {text}")
                    elif line.name == "dd":
                        if result:
                            result[-1] += f": {text}"
                        else:
                            # A <dd> with nothing before it used to raise an
                            # IndexError here, which the crawl loop mistook for
                            # a missing root element and skipped the whole page.
                            result.append(f"* {text}")
        else:
            if "https://www.dataiku.com/product/plugins" in url:
                # Breadcrumbs of plugin pages are navigation, not content
                if element.name == "div" and "breadcrumbs" in (element.get("class") or []):
                    continue
            result += soup2markdown(element, url)
    result = [strip(x) for x in result if not x.isspace()]
    return result

def post_process(content):
    """
    Adapt the Markdown content so that the text splitter won't break tables and code sections

    * Lines starting with ``` toggle the "code" state and are removed; every
      line inside a code section is prefixed with "§ ".
    * The first row of a table (a line starting with "|") fixes the expected
      number of "|" characters. While a row has fewer than that, the following
      lines are treated as its continuation and get no trailing line break.
    * Every other non-empty line ends with exactly one added "\\n" unless it
      already ends with one.

    :param content: list of Markdown lines; it is modified in place
    :return: new list without empty lines, whitespace-only lines and lines
        that contain nothing but a "*" bullet
    """
    ongoing_code = False
    ongoing_table = False
    columns = 0   # number of "|" in the first row of the current table
    missing = 0   # number of "|" still expected to complete the current row
    for i, line in enumerate(content):
        if line.startswith("```"):
            ongoing_code = not ongoing_code
            ongoing_table, missing = False, 0
            line = ""
        elif ongoing_code:
            line = f"§ {line}"
        else:
            is_row = line.startswith("|")
            n_columns = line.count("|")
            if is_row and not ongoing_table:
                ongoing_table = True
                columns = n_columns
            if missing > 0:
                # Continuation of an incomplete row
                missing = max(missing - n_columns, 0)
                # endswith() instead of [-1]: an empty line must not raise
                if line.endswith("\n"):
                    line = line[:-1]
            elif is_row:
                missing = max(columns - n_columns, 0)
            else:
                ongoing_table, missing = False, 0
        if missing == 0 and line and not line.endswith("\n"):
            line += "\n"
        content[i] = line

    return [x for x in content if len(x) > 0 and not x.isspace() and not re.match(r"^\s*\*\s*$", x)]

def correct_headings(content):
    """
    Handle the case of inconsistent headings (for example <h3> after <h1>)

    A heading may be at most one level deeper than the previous heading;
    deeper ones are raised to that level. Lines between ``` fences are left
    untouched, so that code lines starting with "#" (such as comments) are
    neither rewritten nor counted as headings.

    :param content: list of Markdown lines; it is modified in place
    :return: the same list
    """
    current_depth = 0
    in_code = False
    for i, line in enumerate(content):
        if line.startswith("```"):
            in_code = not in_code
            continue
        if in_code or not line.startswith("#"):
            continue
        marker, *rest = line.split(" ")
        depth = marker.count("#")
        if depth > current_depth + 1:
            depth = current_depth + 1
            content[i] = " ".join([depth * "#", *rest])
        current_depth = depth
    return content

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Seconds to wait for a server before giving up on a page. Without a timeout a
# single unresponsive server would block the crawl forever.
REQUEST_TIMEOUT = 30

# Breadth-first crawl: each pass fetches the pages found during the previous
# pass, queues the new in-scope links, and stores each page as Markdown.
i = 0  # number of pages written so far
while len(to_explore) > 0:
    to_explore2 = []
    for url in to_explore:
        try:
            response = requests.get(url, timeout=REQUEST_TIMEOUT)
            # Error pages (404, 500, ...) must not be stored as documentation
            response.raise_for_status()
        except requests.RequestException as e:
            # One unreachable page must not abort the whole crawl
            logging.warning("Skipping %s: %s", url, e)
            continue
        soup = BeautifulSoup(response.content, 'html.parser')

        # Queue every in-scope link that has not been seen yet
        for link in soup.find_all("a", href=True):
            href = get_url(urljoin(url, link["href"]))
            if href is not None and href not in urls:
                urls.add(href)
                to_explore2.append(href)

        # A page whose path is exactly "/product/plugins/" is only crawled for links
        if urlsplit(url).path == "/product/plugins/":
            continue
        remove_links(soup, url)
        # CSS selector of the element holding the content, per site
        if "developer.dataiku.com" in url:
            root = "div.article-container"
        elif "https://www.dataiku.com/product/plugins/" in url:
            root = "main.main"
        else:
            root = "div.document"
        try:
            content = [x for x in soup2markdown(soup.select(root)[0], url) if len(x) > 0]
        except IndexError:
            # No content root on this page: nothing to store
            continue
        if "https://www.dataiku.com/product/plugins/" in url:
            content = correct_headings(content)
        content = "\n".join(post_process(content))
        # The file name is the Base64-encoded URL.
        # NOTE(review): the standard Base64 alphabet contains "/", which may be
        # read as a path separator by the folder. Left unchanged because readers
        # of these files may decode the name back to the URL - confirm before
        # switching to base64.urlsafe_b64encode.
        filename = str(base64.b64encode(url.encode()), 'utf-8')
        with folder.get_writer(f"{filename}.md") as w:
            w.write(bytes(content, 'utf-8'))
        i += 1
        if i % 100 == 0:
            logging.info("%s pages scrapped", i)
    to_explore = to_explore2