# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# -*- coding: utf-8 -*-
import dataiku
import pandas as pd
import re
import time
from urllib.parse import urljoin
from bs4 import BeautifulSoup
from selenium import webdriver
# Chrome session used to load the pages. No driver path is passed: on older
# Selenium releases "chromedriver" is already the default executable path,
# and on Selenium 4.10+ the first positional parameter is `options`, so
# passing the path string positionally would fail there.
wd = webdriver.Chrome()

# Seed page of the crawl.
url = "https://www.dataiku.com/product/plugins/"
# URLs queued to be loaded.
to_explore = [url]
# In-scope URLs collected so far.
urls = set()

# Group 1 captures an in-scope URL up to (not including) any "#fragment";
# the optional group 2 captures the fragment. The dots of the host name are
# escaped so that they match only a literal "." and not any character.
pattern_href = re.compile(
    r"(https://www\.dataiku\.com/product/plugins/[^#]+)(#.*)?"
)


def get_url(href):
    """
    Return the URL if it's in the scope of the web scraping.

    Args:
        href: URL string to check.

    Returns:
        The URL with any "#..." fragment removed when it starts with
        "https://www.dataiku.com/product/plugins/" followed by at least one
        character other than "#"; otherwise None. The bare plugins index URL
        (nothing after the prefix) therefore yields None.
    """
    m = pattern_href.match(href)
    return m.group(1) if m is not None else None

# Crawl in rounds: each round loads every queued URL, records the in-scope
# links found on it, and queues for the next round only the new links that
# contain a query string ("?").
try:
    while to_explore:
        to_explore2 = []
        # A dedicated loop name keeps the module-level seed `url` intact.
        for page_url in to_explore:
            wd.get(page_url)
            # Fixed pause before reading the page source (presumably to let
            # client-side rendering finish).
            time.sleep(5)
            soup = BeautifulSoup(wd.page_source, 'html.parser')
            # href=True selects only anchors that carry an href attribute.
            for link in soup.find_all("a", href=True):
                try:
                    href = get_url(urljoin(page_url, link["href"]))
                except ValueError:
                    # URL parsing can reject a malformed href (e.g. unmatched
                    # brackets in the host); skip that link, keep crawling.
                    continue
                if href is None or href in urls:
                    continue
                urls.add(href)
                if "?" in href:
                    to_explore2.append(href)
        to_explore = to_explore2
finally:
    # Always shut the browser down so no Chrome/chromedriver process is left
    # running, even when the crawl fails part-way.
    wd.quit()

# Drop every collected URL that contains "?", sort the remainder, and write
# it as a single-column ("url") table to the "list_plugins" dataset.
urls = sorted(found for found in urls if "?" not in found)
df = pd.DataFrame({"url": urls})
dataiku.Dataset("list_plugins").write_with_schema(df)