from other_api_utils.google_docs.typing_commons import Credentials
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
import time
import numpy as np
import requests
from urllib.parse import urljoin


def get_google_presentation(google_credentials: Credentials, presentation_id: str)->dict:
    """Fetch a google presentation through the Google Slides API (v1).

    A progress message is printed before the request and after its success.

    :param google_credentials: Credentials: The credentials object handed to the API client.
    :param presentation_id: str: The ID of the presentation to fetch.

    :returns: presentation: dict: The response of the 'presentations.get' call, i.e. the dictionary
        associated with the google presentation.
    """
    print(f"Retrieving the google presentation '{presentation_id}' ...")
    # Build the client, prepare the 'get' request, then execute it.
    presentations_resource = build('slides', 'v1', credentials=google_credentials).presentations()
    get_request = presentations_resource.get(presentationId=presentation_id)
    presentation = get_request.execute()
    print(f"Google presentation '{presentation_id}' successfully retrieved!")
    return presentation


def extract_slide_notes_text(slide: dict):
    """Collect the notes text attached to a google presentation slide.

    :param slide: dict: The dictionary associated with a google presentation slide.

    :returns: slide_notes_text: str: The concatenation, in order, of the content of every text run
        found in the shapes of the slide notes page. An empty string is returned when the slide
        has no notes page or when the notes page has no page elements.
    """
    notes_page = slide.get("slideProperties", {}).get("notesPage", {})
    note_elements = notes_page.get("pageElements", [])
    if not note_elements:
        return ""
    # Elements without a shape/text and text elements without a text run contribute nothing.
    return "".join(
        text_element.get("textRun", {}).get("content", "")
        for note_element in note_elements
        for text_element in note_element.get("shape", {}).get("text", {}).get("textElements", [])
    )


def extract_slide_text(presentation_id: str, slide: dict, reject_notes: bool = True) -> list:
    """Extracts the text from a google presentation slide.

    The slide dictionary is walked iteratively (nested dictionaries and lists are explored with an
    explicit stack) and every value stored under a 'textRun' key produces one text chunk.
    The 'slide' argument is never modified.

    :param presentation_id: str: The ID of the presentation containing the slide. It is only used to build
        the URL of links pointing to another slide of the same presentation.
    :param slide: dict: The dictionary associated with a google presentation slide.
    :param reject_notes: bool: Specifies whether the slide notes ('slideProperties' -> 'notesPage') must be
        excluded from the extraction process.

    :returns: slide_text: list: The list of the slide text chunks. Each chunk is a dictionary with the keys:
        - 'content': the text of the text run ('' when the text run has no content),
        - 'link_reference': the URL of the link carried by the text run style, a URL to the targeted slide
          when the link holds a 'pageObjectId', or '' otherwise.
    """
    URLS_START = "https://docs.google.com/presentation/d/"
    slide_text = []

    slide_properties = slide.get("slideProperties", {})
    if reject_notes and "notesPage" in slide_properties:
        # Walk shallow copies without the notes page instead of deleting the key from the caller's
        # dictionary: the notes must remain available to whoever reads the slide afterwards.
        # Replacing an existing key keeps its position, so the traversal order is unchanged.
        slide = {
            **slide,
            "slideProperties": {key: value for key, value in slide_properties.items() if key != "notesPage"},
        }

    pending_blocks = [slide]  # Stack of the dictionaries / lists that remain to be explored
    while pending_blocks:
        current = pending_blocks.pop()
        if isinstance(current, dict):
            for key, value in current.items():
                if key == "textRun":
                    link_reference = ""
                    # A text run without 'style' or 'content' is tolerated rather than raising a KeyError.
                    link_data = value.get("style", {}).get("link", {})
                    if 'url' in link_data:
                        link_reference = link_data["url"]
                    elif 'pageObjectId' in link_data:
                        # Internal link: point to the targeted slide of this presentation.
                        link_reference = urljoin(URLS_START, f"{presentation_id}/edit#slide=id.{link_data['pageObjectId']}")
                    slide_text.append({"content": value.get("content", ""), "link_reference": link_reference})
                if isinstance(value, (dict, list)):
                    pending_blocks.append(value)
        elif isinstance(current, list):
            pending_blocks.extend(item for item in current if isinstance(item, (dict, list)))
    # The stack explores sibling elements from last to first: reversing restores their original order.
    slide_text.reverse()
    return slide_text


class GooglePresentationCrawler:
    """Crawls a google presentation to extract its content."""

    URLS_START = "https://docs.google.com/presentation/d/"
    ALLOWED_CONTENT_TO_EXTRACT = ['slide_thumbnails', 'slide_notes', 'slide_text']
    ALLOWED_NOTE_TAGS_ROLES = ['remove_slides', 'keep_slides']

    def __init__(self, google_credentials: Credentials, presentation_id: str, presentation_name: str, content_to_extract: list = None,
                 note_tags_to_filter_slides: list = None, note_tags_role: str = "remove_slides"):
        """
        :param google_credentials: Credentials: A google credentials object.
        :param presentation_id: str: The ID of the presentation to crawl.
        :param presentation_name: str: The Name of the presentation to crawl.
        :param content_to_extract: list: The type of content to extract from the presentation.
            Defaults to ['slide_text', 'slide_thumbnails', 'slide_notes'] when not provided.
            The available options are:
            - 'slide_text': Choose this option to retrieve the slides text
            - 'slide_thumbnails': Choose this option to retrieve the slide thumbnails as well as the slide image bytes.
            - 'slide_notes': Choose this option to retrieve the slide notes.
        :param note_tags_to_filter_slides: list: You can leverage slide notes to filter some of the slides from the data collection. This parameter allows to
            set a list of tags allowing to apply that filter. Defaults to an empty list (no filtering) when not provided.
            NOTE: 'slide_notes' must be part of the 'content_to_extract' if this list is not empty.
        :param note_tags_role: str:  This parameter defines how the slides must be filtered depending on the tags set in 'note_tags_to_filter_slides'.
            It can take the values:
                - 'remove_slides': Choose this parameter if you want to remove all the slides that have one of the tags set in 'note_tags_to_filter_slides'.
                - 'keep_slides': Choose this parameter if you want to only keep the slides that have one of the tags set in 'note_tags_to_filter_slides'.

        :raises ValueError: If 'content_to_extract' is invalid, if tags are set without extracting the slide notes,
            or if tags are set with an unknown 'note_tags_role'.
        """
        # 'None' sentinels avoid sharing one mutable default list between every instance.
        if content_to_extract is None:
            content_to_extract = ['slide_text', 'slide_thumbnails', 'slide_notes']
        if note_tags_to_filter_slides is None:
            note_tags_to_filter_slides = []

        self.google_credentials = google_credentials
        self.presentation_id = presentation_id
        self.presentation_name = presentation_name
        self.content_to_extract = content_to_extract
        self.note_tags_to_filter_slides = note_tags_to_filter_slides
        self.note_tags_role = note_tags_role
        # Validate the settings before any call to the Google API, so that a bad configuration fails fast.
        self.check_content_to_extract()
        self.check_slide_filtering_settings()

        self.slides_service = build('slides', 'v1', credentials=google_credentials)
        self.presentation = get_google_presentation(google_credentials, presentation_id)
        self.presentation_slide_ids = []
        self.presentation_slides = {}
        # 'or []' covers a presentation dictionary that has no 'slides' entry.
        for slide_index, slide in enumerate(self.presentation.get("slides") or []):
            slide_id = slide["objectId"]
            self.presentation_slide_ids.append(slide_id)
            # 'index' is the 1-based position of the slide in the presentation.
            self.presentation_slides[slide_id] = {"index": slide_index + 1, "content": slide}
        self.crawled_slides = set()
        self.last_crawled_slides = []
        self.n_remaining_slides_to_crawl_history = []
        self.remaining_slides_to_crawl = set(self.presentation_slide_ids)
        self.last_handled_slide = None
        self.crawling_failures = []
        self.slides_data = {}

    def check_content_to_extract(self):
        """Checks that the parameters of 'self.content_to_extract' are valid.

        :raises ValueError: If 'self.content_to_extract' is not a non-empty list or if it contains a value
            that is not part of 'ALLOWED_CONTENT_TO_EXTRACT'.
        """
        if not isinstance(self.content_to_extract, list) or (not self.content_to_extract):
            raise ValueError(f"Invalid settings for the parameter 'content_to_extract': it must be a list with values from {self.ALLOWED_CONTENT_TO_EXTRACT}")

        invalid_content_to_extract = set(self.content_to_extract).difference(set(self.ALLOWED_CONTENT_TO_EXTRACT))
        if invalid_content_to_extract:
            raise ValueError(f"Invalid settings for the parameter 'content_to_extract': it must be a list with values from {self.ALLOWED_CONTENT_TO_EXTRACT}. You chose the values {list(invalid_content_to_extract)}: please edit it")

    def check_slide_filtering_settings(self):
        """Checks that the slide filtering settings are consistent.

        Nothing is checked when 'note_tags_to_filter_slides' is empty, as no filtering is applied in that case.

        :raises ValueError: If tags are set while 'slide_notes' is not part of 'content_to_extract', or
            if tags are set while 'note_tags_role' is not one of 'ALLOWED_NOTE_TAGS_ROLES'.
        """
        if not self.note_tags_to_filter_slides:
            return
        if 'slide_notes' not in self.content_to_extract:
            log_message = "If you want to use note tags to filter slides, you must allow the extraction of slide notes. "\
            "Please add the value 'slide_notes' to the parameter 'content_to_extract'"
            raise ValueError(log_message)
        if self.note_tags_role not in self.ALLOWED_NOTE_TAGS_ROLES:
            # An unknown role would otherwise silently disable the filtering.
            raise ValueError(f"Invalid settings for the parameter 'note_tags_role': it must be one of {self.ALLOWED_NOTE_TAGS_ROLES}. You chose the value '{self.note_tags_role}': please edit it")

    @staticmethod
    def _is_rate_limit_error(google_http_error) -> bool:
        """Tells whether a google 'HttpError' carries the HTTP status 429 (Too many requests)."""
        # Rely on the response status rather than on the error text, which also embeds
        # the request URL and could contain the characters '429' by coincidence.
        response = getattr(google_http_error, "resp", None)
        return getattr(response, "status", None) == 429

    def _collect_slide_data(self, slide_id: str):
        """Extracts the requested content of one slide and stores it in 'self.slides_data[slide_id]'."""
        slide_data = self.slides_data[slide_id]
        slide_content = self.presentation_slides[slide_id]["content"]
        slide_index = self.presentation_slides[slide_id]["index"]

        if "slide_thumbnails" in self.content_to_extract:
            slide_thumbnail = self.slides_service.presentations().pages().getThumbnail(
                presentationId=self.presentation_id,
                pageObjectId=slide_id,
                thumbnailProperties_thumbnailSize='LARGE'
                ).execute()
            slide_data["thumbnail"] = slide_thumbnail
            # NOTE(review): the download has no timeout and its HTTP status is not checked, so an
            # error response body would be stored as image bytes - confirm whether this is acceptable.
            request_response = requests.get(slide_thumbnail["contentUrl"], headers={'Authorization': 'Bearer ' + self.google_credentials.token})
            slide_data["slide_image_bytes"] = request_response.content

        # The notes are read before the slide text on purpose: keep this order.
        if "slide_notes" in self.content_to_extract:
            slide_data["note"] = extract_slide_notes_text(slide_content)

        if "slide_text" in self.content_to_extract:
            slide_data["text"] = extract_slide_text(self.presentation_id, slide_content)

        slide_data["index"] = slide_index
        slide_data["presentation_id"] = self.presentation_id
        slide_data["presentation_name"] = self.presentation_name
        slide_data["page_id"] = f"{self.presentation_name}-page_{slide_index}-pres_id_{self.presentation_id}-slide_id_{slide_id}"
        slide_data["page_url"] = urljoin(self.URLS_START, f"{self.presentation_id}/edit#slide=id.{slide_id}")

    def crawl_slides(self, max_stagnation_steps: int=15, rate_limit_sleep_time: int=15, verbose: bool=True):
        """Crawls the slides to extract their content.

        The slides that remain to be crawled are processed in the order of the presentation. A slide whose
        crawling raises a google 'HttpError' stays in 'self.remaining_slides_to_crawl' and is tried again at
        the next iteration; the loop ends when no slide remains or when the crawling stagnates
        (see 'get_continue_crawling_status'). When 'note_tags_to_filter_slides' is set, the crawled data is
        finally filtered with 'remove_slides_with_tags'.

        :param max_stagnation_steps: int: The maximum number of loop stagnation allowed when crawling the slides.
        :param rate_limit_sleep_time: int: The sleeping time (in seconds) to wait for when the API rate limit is encountered.
        :param verbose: bool: Allows to log the crawling evolution.
        """
        iteration_index = 1
        continue_crawling = True
        crawling_start_time = time.time()
        while continue_crawling:
            # Sorting gives a deterministic crawling order (a set has no guaranteed iteration order).
            slides_to_crawl = sorted(
                self.remaining_slides_to_crawl,
                key=lambda remaining_slide_id: self.presentation_slides[remaining_slide_id]["index"],
            )
            for slide_id in slides_to_crawl:
                self.last_handled_slide = slide_id
                self.slides_data.setdefault(slide_id, {})
                if verbose:
                    log_message = f"Iteration n°{iteration_index} | "\
                    f"Crawling the slide n°{len(self.crawled_slides)+1} ({slide_id})"
                    print(log_message)
                try:
                    self._collect_slide_data(slide_id)
                    self.crawled_slides.add(slide_id)

                except HttpError as google_http_error:
                    error_trace = str(google_http_error)
                    if self._is_rate_limit_error(google_http_error):
                        print(f"\tException '429' (Too many requests) met: sleeping for {rate_limit_sleep_time} seconds...")
                        time.sleep(rate_limit_sleep_time)
                    else:
                        print(f"The following exception has been met when crawling the slide_id '{slide_id}':\n\n {error_trace}")
                        self.crawling_failures.append({"slide_id": slide_id, "error_trace": error_trace})
                self.last_crawled_slides.append(slide_id)
            self.remaining_slides_to_crawl = self.remaining_slides_to_crawl.difference(self.crawled_slides)
            self.n_remaining_slides_to_crawl_history.append(len(self.remaining_slides_to_crawl))
            continue_crawling = self.get_continue_crawling_status(max_stagnation_steps)
            iteration_index += 1

        crawling_time = time.time() - crawling_start_time
        print(f"Crawling finished ! '{len(self.crawled_slides)}' pages were successfully crawled in {crawling_time} seconds!")
        if self.note_tags_to_filter_slides:
            self.remove_slides_with_tags()

    def get_continue_crawling_status(self, max_stagnation_steps: int=15)->bool:
        """Checks if the crawling should continue or not.

        The crawling stops when no slide remains to be crawled, or when the number of remaining slides
        has been the same over the last 'max_stagnation_steps' iterations.

        :param max_stagnation_steps: int: The maximum number of loop stagnation allowed when crawling the slides.

        :returns: continue_crawling: bool: The status specifying whether the crawling should continue or not.
        """
        if not self.remaining_slides_to_crawl:
            # Everything is crawled: return now so that no misleading 'failures' message is printed.
            print("All slides have been successfully crawled!\n Stopping the crawling ...")
            return False

        print(f"{len(self.remaining_slides_to_crawl)} slides remain to be crawled after this iteration!\n Continuing the crawling...\n\n")
        last_n_remaining_slides_to_crawl_history = self.n_remaining_slides_to_crawl_history[-max_stagnation_steps:]
        crawling_stagnates = (
            len(last_n_remaining_slides_to_crawl_history) >= max_stagnation_steps
            and len(set(last_n_remaining_slides_to_crawl_history)) == 1
        )
        if crawling_stagnates:
            print(f"Stopping the crawling after {max_stagnation_steps} successive failures...")
            return False
        return True

    def remove_slides_with_tags(self):
        """Removes from 'self.slides_data' the slides rejected by the note tags filter.

        A slide note matches when it contains at least one of the tags of 'self.note_tags_to_filter_slides'.
        With the role 'remove_slides' the matching slides are removed; with the role 'keep_slides' the
        non-matching slides are removed. A slide without any stored note is handled as a slide with an empty note.
        """
        print("Identifying the slides to remove from the crawled slides data ...")
        slides_to_remove = []
        for slide_position, (slide_id, slide_data) in enumerate(self.slides_data.items()):
            # A slide whose crawling never succeeded has no 'note' entry: do not fail on it.
            slide_note = slide_data.get("note", "")
            matching_tags = [tag for tag in self.note_tags_to_filter_slides if tag in slide_note]
            note_contains_tag = bool(matching_tags)

            remove_slide = False
            deletion_explanation = ""
            if self.note_tags_role == "remove_slides":
                if note_contains_tag:
                    remove_slide = True
                    deletion_explanation = f"it contained the tags '{matching_tags}'."

            elif self.note_tags_role == "keep_slides":
                if not note_contains_tag:
                    remove_slide = True
                    deletion_explanation = f"it did not contain any of the tags set in '{self.note_tags_to_filter_slides}'."

            if remove_slide:
                # Report the position of the slide in the presentation when it is known, and
                # fall back to its position in the crawled data otherwise.
                slide_number = slide_data.get("index", slide_position + 1)
                print(f"\nBased on 'note_tags_to_filter_slides' ({self.note_tags_to_filter_slides}) and 'note_tags_role' ({self.note_tags_role}), "\
                      f"the data from the slide n°{slide_number} ({slide_id}) will be removed as {deletion_explanation}.\n")
                slides_to_remove.append(slide_id)

        print("Removing the slides ...")
        for slide_id in slides_to_remove:
            del self.slides_data[slide_id]
            print(f"The slide '{slide_id}' has been successfully removed !")