import json
from typing import Any, Dict, List, Optional, TypedDict

from answers.llm_assist.llm_tooling.tools import (
    get_all_dataset_descriptions,
    get_all_suggested_joins,
    get_dataset_description,
    get_user_query_examples,
)
from answers.solutions.chains.db.generic_db_agent import GenericDBAgent


class SqlError(TypedDict):
    """Typed record with three string fields.

    Used as the element type of ``SelectSQLAgent.previous_sql_errors``.
    """

    # presumably the model response associated with the failed attempt — TODO confirm
    response: str
    # presumably the query that was attempted — TODO confirm
    query: str
    # presumably the error message reported for that query — TODO confirm
    error: str


class SelectSQLAgent(GenericDBAgent):
    # Simple agent that builds a search query for a SQL table based on the last user input.
    def __init__(self) -> None:
        """Load dataset metadata and build the prompts for the table-selection step.

        Fetches all dataset descriptions and suggested joins, initialises the
        selection state to empty strings, and prepares ``_graph_system_prompt``
        and ``_graph_user_prompt``.
        """
        # NOTE(review): GenericDBAgent.__init__ is not invoked here — confirm
        # that skipping the base-class initialiser is intentional.

        # List of SqlError records; starts empty and is not filled in this method.
        self.previous_sql_errors: List[SqlError] = []
        # Fetched once at construction and embedded in the user prompt below.
        self.all_dataset_descriptions = get_all_dataset_descriptions()
        self.all_suggested_joins = get_all_suggested_joins()
        # Selection state, overwritten by update_graph().
        self._chosen_tables_and_columns = ""
        self.chosen_joins = ""
        self._graph_justification = ""

        # System prompt for the table-selection step. It asks for a JSON object
        # with the keys "tables_and_columns", "suggested_joins" and
        # "justification". The example objects are rendered with json.dumps so
        # that Python None appears as JSON null and all quotes are double quotes.
        # NOTE(review): the first guideline bullet ends mid-sentence
        # ("...in order to correctly answer the.") — confirm the intended wording.
        self._graph_system_prompt = f"""# Role and Guidelines
        Your role is to analyze a question along with the provided SQL table descriptions and schema to determine if these SQL tables can be used to answer the question. If they can, you should decide which tables and columns are required.
        - Additional information is needed when the question requires actual data from the SQL tables in order to correctly answer the.
        - If the question can be answered from tables/columns descriptions alone, no additional information is required.
        - If additional information is required, identify which tables and columns are needed to perform SQL queries on.
        - If NO additional information is required, provide a justification explaining why it is not needed. In this case, all keys other than "justification" should be null.
        - Setting `tables_and_columns` and `suggested_joins` keys to null in the response is crucial to do when no additional information is needed!
        - You should respond in a valid JSON object with the following main keys:
        
        # Main JSON keys
        - tables_and_columns: A list of objects containing the keys "table_name" and "columns" only when additional information is needed. It should be null if no additional information is needed. This helps ensure the system only retrieves or performs additional SQL queries when necessary.
        - suggested_joins: A list of strings suggesting joins if SQL queries are needed. It should be null if no joins are needed.
        - justification: A string explaining why additional information is required or why not. If additional information is needed, explain why the chosen dataset(s) and column(s) are relevant.
        - JSON object should have double quotes for keys and values. It is important to follow the JSON format strictly.

        # "justification" format
        - If additional information is needed, include an explanation of why the specific dataset(s) and column(s) were chosen.
        - If no additional information is needed, return a JSON object with all keys null except for "justification", explaining why no additional information from the SQL tables is necessary.
        # Example Response when no additional information is needed
        For the question "Hi", the expected response would be:
        {json.dumps({
            "tables_and_columns": None,
            "suggested_joins": None,
            "justification": "no additional information is needed for the SQL table. Only a polite greeting."
        })}

        # Examples using the correct JSON format when additional information is required
        These examples use data from the following sources
        # SQL Table Descriptions and Schemas
        Table Name: movies_table
        SQL Database type: PostgreSQL
        Datasource description: Table from IMDB (internet movie database) containing information about movies throughout history.

        ## Columns description
        - Name: 'title' | Type: 'string' | Description: 'Full title of the movie (Example: The Great Train Robbery, City Lights, Metropolis).s'
        - Name: 'rating' | Type: 'float' | Description: 'Float rating from 0.0 - 10.0 of the movie'
        - Name: 'vote_count' | Type: 'int' | Description: 'Number of votes that the movie has received'
        - Name: 'start_year' | Type: 'int' | Description: 'The year that filming of the movie began'
        - Name: 'minutes' | Type: 'int' | Description: 'Length of the movie in minutes.'
        - Name: 'genres' | Type: 'string' | Description: 'The genre of the movie. All possible values are 'Musical', 'Horror','Film-Noir', 'Comedy', 'Western', 'Biography','Talk-Show','History','Adventure','Crime','Fantasy','Thriller','Short','War','Documentary','Sci-Fi','Drama','Music','Reality-TV','Family','Animation','Game-Show','Action','Mystery','Sport','News','Romance''
        - Name: 'imdb_id' | Type: 'string' | Description: 'ID of the movie in IMDB (Example: tt0004972, tt0039631, tt0015324)'
        - Name: 'id' | Type: 'int' | Description: 'Numeric primary key id (1 - 9875)'
        - Name: 'director_id' | Type: 'int' | Description: 'Foreign key with id of the movie's director (1 - 9875)'
        -- End of Description --

        Table Name: directors_table
        SQL Database type: PostgreSQL
        Datasource description: Table from IMDB (internet movie database) containing information about directors throughout history.

        ## Columns description
        - Name: 'name' | Type: 'string' | Description: 'Full name of the movie director (Example: Rupert Julian, Tod Browning)'
        - Name: 'birth_year' | Type: 'int' | Description: 'Birth year of the movie director (Example: 1901, 1985)'
        - Name: 'death_year' | Type: 'int' | Description: 'The year the movie director died (Example: 2001, 1980)'
        - Name: 'imdb_director_id' | Type: 'string' | Description: 'ID of the director in IMDB (Example: nm0001452, nm0291035)'
        - Name: 'id' | Type: 'int' | Description: 'Numeric primary key id (1 - 9875)'
        -- End of Description --

        # Suggested Joins
        movies_table.director_id = directors_table.id

        ## Example 1
        user query: "How many directors are there in the data?"
        your JSON object:
        {json.dumps({
            "tables_and_columns": [
                {
                    "table_name": "directors_table",
                    "columns": ["id"],
                }
            ],
            "suggested_joins": None,
            "justification": "We need to count the number of directors in the directors table only. This only requires counting the 'id' column."
        })}
        

        ## Example 2
        user query: "What's the name of the director with the highest average rating?"
        your JSON object:
        {json.dumps({
            "tables_and_columns": [
                {
                    "table_name": "movies_table",
                    "columns": ["rating", "director_id"]
                },
                {
                    "table_name": "directors_table",
                    "columns": ["name", "id"]
                }
            ],
            "suggested_joins": ["movies_table.director_id = directors_table.id"],
            "justification": "We need to find the average rating by director. This requires the 'rating' column from the 'movies_table' and the 'name' column from the 'directors_table' table."
        })}
        
        ## Example 3
        user query: "What's the moon - earth distance ?"
        your JSON object:
        {json.dumps({
            "tables_and_columns": None,
            "suggested_joins": None,
            "justification": "The query is unrelated to SQL table analysis and is out of scope."
        })}

        
        ## Example 4
        user query: "What's the best to use, mongodb or postgresql ?"
        your JSON object:
        {json.dumps({
            "tables_and_columns": None,
            "suggested_joins": None,
            "justification": "The query is unrelated to SQL table analysis and is out of scope."
        })}
        

        # Important rules to keep in mind:
        - Expected output is ALWAYS a valid JSON object.
        - First analyze if the user query is related or not to SQL table analysis.
            - If it's related : 
                - reply the required JSON object
            - else if it's not related:
                - return this EXACT JSON object:
                {json.dumps({
                    "tables_and_columns": None,
                    "suggested_joins": None,
                    "justification": "The query is unrelated to SQL table analysis and is out of scope."
                })}

        """
        # User prompt for the same step; embeds the dataset descriptions and
        # suggested joins fetched at the top of this method.
        self._graph_user_prompt = f"""
        Step 1: Evaluate the following user query and SQL table descriptions and schemas to decide which are required to provide additional information.
        Step 2: Generate the response as a JSON object following the format using the main JSON keys only.
        Step 3: Before replying ensure you're writing a valid json object
        # SQL Table Description and Schema
        {self.all_dataset_descriptions}
        # Suggested Joins
        {self.all_suggested_joins}
        """

    def update_graph(self, selected_graph: Optional[Dict[str, Any]], justification: str):
        """Record the table/column selection produced by the graph step.

        Resets the previously chosen tables, columns and joins, stores
        ``justification``, and, when ``selected_graph`` names tables, rebuilds
        the query prompt through ``build_query_prompt``.

        Args:
            selected_graph: Parsed graph-step response, or ``None``. The keys
                read are ``"tables_and_columns"`` (a list of objects with
                ``"table_name"`` and ``"columns"``) and ``"suggested_joins"``
                (a list of strings). Either may be ``None`` or absent.
            justification: Explanation stored on ``_graph_justification``.

        Returns:
            This agent, so calls can be chained.
        """
        self._chosen_tables_and_columns = ""
        self.chosen_joins = ""
        # The justification is recorded on every path, including early exits.
        self._graph_justification = justification

        if selected_graph is None:
            return self
        tables_and_columns = selected_graph.get("tables_and_columns")
        if tables_and_columns is None:
            return self

        # Keep the description of EVERY selected table. Assigning inside the
        # loop (as before) silently dropped all but the last table, which broke
        # multi-table selections that rely on joins. With a single table the
        # result is unchanged.
        # assumes get_dataset_description returns a str — TODO confirm
        self._chosen_tables_and_columns = "\n".join(
            get_dataset_description(tc.get("table_name", ""), tc.get("columns", []))
            for tc in tables_and_columns
        )

        # .get() tolerates a response that omits the key entirely; a missing
        # key is treated the same as an explicit null.
        suggested_joins = selected_graph.get("suggested_joins")
        if suggested_joins is None:
            self.chosen_joins = "None"
        else:
            self.chosen_joins = "".join(f"{join} \n" for join in suggested_joins)

        self.build_query_prompt()
        return self

    def build_query_prompt(self):
        COL_FORMAT = """
        {'type': 'COLUMN', 'name': '<NAME_OF_COLUMN>'}
        if referencing an aliased table then the optional 'table' key is used.
        {'type': 'COLUMN', 'name': '<NAME_OF_COLUMN>', 'table': {"type": "TABLE","name": '<ALIAS_OF_TABLE>'}}
        ### value
        {'type': 'CONSTANT', 'value': '<VALUE>'}
        ### list
        {'type': 'LIST', 'items': [{'type': 'CONSTANT', 'value': '<VALUE1>'}, {'type': 'CONSTANT', 'value': '<VALUE2>'}]}
        """
        BY_FORMAT = """{"expr": {"type": "COLUMN", "name": <NAME_OF_COLUMN>}"""
        OPERATOR_EXPRESSION_FORMAT = json.dumps({'with': None,
                                                                    'selectList': [{'expr': {'type': 'COLUMN', 'name': 'customer'}},
                                                                    {'expr': {'type': 'OPERATOR',
                                                                        'op': 'AVG',
                                                                        'args': [{'type': 'COLUMN', 'name': 'cost'}]},
                                                                    'alias': 'average_user_cost'}],
                                                                    'from': {'type': 'TABLE', 'name': 'customer_data'},
                                                                    'join': [],
                                                                    'where': [],
                                                                    'groupBy': [{'expr': {'type': 'COLUMN', 'name': 'customer'}}],
                                                                    'having': [{'type': 'OPERATOR',
                                                                    'op': 'GT',
                                                                    'args': [{'type': 'OPERATOR',
                                                                        'op': 'AVG',
                                                                        'args': [{'type': 'COLUMN', 'name': 'cost'}]},
                                                                        {'type': 'CONSTANT', 'value': 200}]}],
                                                                    'orderBy': [],
                                                                    'limit': None,
                                                                    'justification': 'A query is required in order to identify customers whose average cost exceeds a threshold of 200.'})

        DATE_OPERATOR_FORMAT = f"""Date operators are operator expressions but it is important to pay attention to the argument order.
        - DATETRUNC(COLUMN, CONSTANT)
        For example DATETRUNC('WEEK',"timestamp") is represented as
        {json.dumps({'expr': {'type': 'OPERATOR', 'op': 'DATETRUNC', 'args': [{'type': 'COLUMN', 'name': 'timestamp'}, {'type': 'CONSTANT', 'value': 'week'}]}})}
        - DATEPART(COLUMN, CONSTANT)
        EXTRACT(YEAR FROM ("timestamp")) is represented as
        {json.dumps({'expr': {'type': 'OPERATOR', 'op': 'DATEPART', 'args': [{'type': 'COLUMN', 'name': 'timestamp'}, {'type': 'CONSTANT', 'value': 'YEAR'}]}})}
        - DATEDIFF(COLUMN, COLUMN, COLUMN)
        DATEDIFF(YEAR, "timestamp1", "timestamp2") is represented as
        {json.dumps({'expr': {'type': 'OPERATOR', 'op': 'DATEDIFF', 'args': [{'type': 'COLUMN', 'name': 'timestamp1'}, {'type': 'COLUMN', 'name': 'timestamp2'}, {'type': 'CONSTANT', 'value': 'year'}]}})}
        For example, if you are asked "How is the usage of the web application distributed on a daily basis?"
        a query like this:
        SELECT date_trunc('DAY',"datetime") AS "day", 
        COUNT(*) AS "total_calls", 
        SUM("token_count") AS "total_tokens",
        AVG("call_length_seconds") AS "average_call_length"  
        FROM "webapp_usage_logs"  
        GROUP BY "day"  
        ORDER BY "day" ASC
        is represented like this:
        {json.dumps({'with': None,
                            'selectList': [{'expr': {'type': 'OPERATOR',
                                'op': 'DATETRUNC',
                                'args': [{'type': 'COLUMN', 'name': 'datetime'},
                                {'type': 'CONSTANT', 'value': 'day'}]},
                            'alias': 'day'},
                            {'expr': {'type': 'OPERATOR',
                                'op': 'COUNT',
                                'args': [{'type': 'CONSTANT', 'value': '*'}]},
                            'alias': 'total_calls'},
                            {'expr': {'type': 'OPERATOR',
                                'op': 'SUM',
                                'args': [{'type': 'COLUMN', 'name': 'token_count'}]},
                            'alias': 'total_tokens'},
                            {'expr': {'type': 'OPERATOR',
                                'op': 'AVG',
                                'args': [{'type': 'COLUMN', 'name': 'call_length_seconds'}]},
                            'alias': 'average_call_length'}],
                            'from': {'type': 'TABLE', 'name': 'webapp_usage_logs'},
                            'join': [],
                            'where': [],
                            'groupBy': [{'expr': {'type': 'COLUMN', 'name': 'day'}}],
                            'having': [],
                            'orderBy': [{'expr': {'type': 'COLUMN', 'name': 'day'}, 'orderType': 'ASC'}],
                            'limit': None,
                            'justification': 'A query is required in order to analyze and summarize web application usage data on a daily basis.'})}
        """
        DISTINCT_OPERATOR_FORMAT = """{'type': 'OPERATOR', 'op': 'DISTINCT', 'args': [{'type': 'COLUMN', 'name': <NAME_OF_COLUMN>}]}"""
        IN_OPERATOR_FORMAT = """{'type': 'OPERATOR', 'op': 'IN', 'args': [{'type': 'COLUMN', 'name': 'user_id'}, {'type': 'LIST', 'items': [{'type': 'CONSTANT', 'value': 'user1'}, {'type': 'CONSTANT', 'value': 'user3'}]}]}"""
        CASE_OPERATOR_FORMAT = json.dumps({'expr': {'type': 'CASE',
                                                                         'items': [{'when': {'type': 'OPERATOR',
                                                                                             'op': 'GT',
                                                                                             'args': [{'type': 'COLUMN', 'name': 'call_length_seconds'},
                                                                                                      {'type': 'CONSTANT', 'value': 20}]},
                                                                                    'then': {'type': 'CONSTANT', 'value': 'large'}},
                                                                                   {'when': {'type': 'OPERATOR',
                                                                                             'op': 'GT',
                                                                                             'args': [{'type': 'COLUMN', 'name': 'call_length_seconds'},
                                                                                                      {'type': 'CONSTANT', 'value': 10}]},
                                                                                    'then': {'type': 'CONSTANT', 'value': 'medium'}}],
                                                                         'elseValue': {'type': 'CONSTANT', 'value': 'small'}},
                                                                'alias': 'call_bins'})

        WHERE_OPERATOR_FORMAT = f"""WHERE "id" = 689 AND "price" < 100
        "where": {json.dumps([{'type': 'OPERATOR',
                                        'op': 'AND',
                                        'args': [{'type': 'OPERATOR',
                                            'op': 'EQ',
                                            'args': [{'type': 'COLUMN', 'name': 'id'},
                                            {'type': 'CONSTANT', 'value': 689}]},
                                        {'type': 'OPERATOR',
                                            'op': 'LT',
                                            'args': [{'type': 'COLUMN', 'name': 'price'},
                                            {'type': 'CONSTANT', 'value': 100}]}]}])}
        HAVING "id" = 112 AND "price" > 200
        "having" : {json.dumps([{'type': 'OPERATOR',
                                            'op': 'AND',
                                            'args': [{'type': 'OPERATOR',
                                                'op': 'EQ',
                                                'args': [{'type': 'COLUMN', 'name': 'id'},
                                                {'type': 'CONSTANT', 'value': 112}]},
                                            {'type': 'OPERATOR',
                                                'op': 'GT',
                                                'args': [{'type': 'COLUMN', 'name': 'price'},
                                                {'type': 'CONSTANT', 'value': 200}]}]}])}
        """

        ALIAS_FORMAT = json.dumps({'expr': {'type': 'OPERATOR',
                                                                 'op': 'AVG',
                                                                 'args': [{'type': 'COLUMN', 'name': 'speed_in_seconds'}]},
                                                        'alias': 'average_speed_by_user'})

        IN_LINE_FORMAT = json.dumps(
            {"expr": {"type": "INLINE_EXPRESSION", "expr": 'NOW() - INTERVAL "5 MINUTES"'}})
        WINDOW_FORMAT = f"""window is an optional key for creating window expressions.
        RANK()
        DENSE_RANK()
        ROW_NUMBER()
        CUME_DIST()

        SUM(COLUMN)
        AVG(COLUMN)
        MAX(COLUMN)
        MIN(COLUMN)
        COUNT(COLUMN)

        The CONSTANT in these functions should be passed as a sting. For example {json.dumps([{'type': 'CONSTANT', 'value': '4'}])}
        NTILE(CONSTANT)
        LAG(CONSTANT)
        LEAD(CONSTANT)

        CONSTANT is either true or false to ignore null or not. For example {json.dumps([{'type': 'CONSTANT', 'value': True}])}
        FIRST_VALUE(COLUMN, CONSTANT)
        LAST_VALUE(COLUMN, CONSTANT)

        - partitionExpressions (list, optional): Can be null or a list of expressions of columns to partition by.
        Example: partitionExpressions: {json.dumps([{"type": "COLUMN", "name": "user_id"}])}
        - orderExpressions (list, optional): Can be null or a list of expressions of the columns to order by.
        Example: orderExpressions: {json.dumps([{"type": "COLUMN", "name": "transaction_date"}])}
        
        - orderTypes (list, optional): Can be null or a list of order types: either "ASC" or "DESC".
        Example: orderTypes: ["ASC"]
        - frameMode (string, optional): Can be null or a string "RANGE" or "ROWS".
        Example: frameMode: "ROWS"
        - frameStart (string, optional): Can be null or a string representing the window frame start.
        Example: frameStart: "UNBOUNDED"
        - frameStartDirection (string, optional): Can be null or a string "PRECEDING" or "FOLLOWING".
        Example: frameStartDirection: "PRECEDING"
        - frameEnd (string, optional): Can be null or a string representing the window frame end.
        Example: frameEnd: "CURRENT ROW"
        - frameEndDirection (string, optional): Can be null or a string "PRECEDING" or "FOLLOWING".
        Example: frameEndDirection: "FOLLOWING"
        - dateDiffUnit (optional): Can be null or a date difference unit: "YEAR", "MONTH", "WEEK", "DAY", "HOUR", "MINUTE", "SECOND". Be aware that time intervals are not implemented for all databases.
        Example: dateDiffUnit: "DAY"

        Examples of windows
        Example 1
        You are asked "give a rolling average of transactions by users"
        a query like this:
        SELECT 
        "user_id",
        "transaction_date",
        AVG("cost") OVER (PARTITION BY "user_id" ORDER BY "transaction_date" 
                        ASC NULLS FIRST ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS "rolling_cost"
        FROM 'sales';
        is represented like this:
        {json.dumps({'with': [],
                            'selectList': [{'expr': {'type': 'COLUMN', 'name': 'user_id'}},
                            {'expr': {'type': 'COLUMN', 'name': 'transaction_date'}},
                            {'expr': {'type': 'OPERATOR',
                                'op': 'AVG',
                                'args': [{'type': 'COLUMN', 'name': 'cost'}],
                                'window': {'partitionExpressions': [{'type': 'COLUMN', 'name': 'user_id'}],
                                    'orderExpressions': [{'type': 'COLUMN', 'name': 'transaction_date'}],
                                    'orderTypes': ['ASC'],
                                    'frameMode': 'ROWS',
                                    'frameStart': 'UNBOUNDED',
                                    'frameStartDirection': 'PRECEDING',
                                    'frameEnd': 'CURRENT ROW',
                                    'frameEndDirection': 'FOLLOWING'}},
                            'alias': 'rolling_cost'}],
                            'from': {'type': 'TABLE', 'name': 'sales'},
                            'join': [],
                            'where': [],
                            'groupBy': [],
                            'having': [],
                            'orderBy': [],
                            'limit': None,
                            'justification': 'To calculate the rolling sum of cost by users ordered by transaction date.'})}

        Example 2
        You are asked  "give me a cumulative count of emails sent to customers in Germany over time."
        a query like this:
        SELECT
        "email_date",
        COUNT("customer_email") OVER (ORDER BY "email_date" ASC NULLS FIRST) AS "cumulative_emails"
        FROM
        "customer_table"
        WHERE
        "customer_country" = 'Germany'
        ORDER BY
        "email_date" ASC NULLS FIRST;
        is represented like this:
        {json.dumps({'with': [],
                        'selectList': [{'expr': {'type': 'COLUMN', 'name': 'email_date'}},
                        {'expr': {'type': 'OPERATOR',
                            'op': 'COUNT',
                            'args': [{'type': 'COLUMN', 'name': 'customer_email'}],
                            'window': {'orderExpressions': [{'type': 'COLUMN', 'name': 'email_date'}],
                            'orderTypes': ['ASC']}},
                        'alias': 'cumulative_emails'}],
                        'where': [{'type': 'OPERATOR',
                        'op': 'EQ',
                        'args': [{'type': 'COLUMN', 'name': 'customer_country'},
                            {'type': 'CONSTANT', 'value': 'Germany'}]}],
                        'from': {'type': 'TABLE', 'name': 'customer_table'},
                        'join': [],
                        'groupBy': [],
                        'having': [],
                        'orderBy': [{'expr': {'type': 'COLUMN', 'name': 'email_date'},
                        'orderType': 'ASC'}],
                        'limit': None,
                        'justification': 'To get the cumulative count of emails sent to customers in Germany, ordered by the last email date.'})}
        Example 3
        You are asked  "produce quartiles of the molecular weight"
        a query like this:
        SELECT PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "weight") AS "Q1_weight", PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY "weight") AS "Q2_weight", PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "weight") AS "Q3_weight", 
        is represented like this:
        {json.dumps({'with': [],
                    'selectList': [{'expr': {'type': 'INLINE_EXPRESSION',
                        'expr': 'PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "MolWt")'},
                    'alias': 'Q1_MolWt'},
                    {'expr': {'type': 'INLINE_EXPRESSION',
                        'expr': 'PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY "MolWt")'},
                    'alias': 'Q2_MolWt'},
                    {'expr': {'type': 'INLINE_EXPRESSION',
                        'expr': 'PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "MolWt")'},
                    'alias': 'Q3_MolWt'}],
                    'from': {'type': 'TABLE', 'name': 'molecular_data'},
                    'join': [],
                    'where': [],
                    'groupBy': [],
                    'having': [],
                    'orderBy': [],
                    'limit': None,
                    'justification': 'To calculate the 25th, 50th, and 75th percentiles for MolWt and MolLogP.'})}
        """
        FROM_JOIN_FORMAT = f"""For example
        SELECT s.sale_date, c.customer_name
        FROM sales s
        LEFT JOIN customers c ON s.customer_id = c.customer_id; 
        is represented like this:
        {json.dumps({'with': [],
                    'selectList': [{'expr': {'type': 'COLUMN',
                        'name': 'sale_date',
                        'table': {'type': 'TABLE', 'name': 's'}}},
                    {'expr': {'type': 'COLUMN',
                        'name': 'customer_name',
                        'table': {'type': 'TABLE', 'name': 'c'}}}],
                    'from': {'type': 'TABLE', 'name': 'sales', 'alias': 's'},
                    'join': [{'type': 'LEFT',
                    'tableLike': {'type': 'TABLE', 'name': 'customers', 'alias': 'c'},
                    'on': [{'type': 'OPERATOR',
                        'op': 'EQ',
                        'args': [{'type': 'COLUMN',
                        'name': 'customer_id',
                        'table': {'type': 'TABLE', 'name': 's'}},
                        {'type': 'COLUMN',
                        'name': 'customer_id',
                        'table': {'type': 'TABLE', 'name': 'c'}}]}]}],
                    'where': [],
                    'groupBy': [],
                    'having': [],
                    'orderBy': [],
                    'limit': None,
                    'justification': None})}
        """
        CTE_FORMAT = json.dumps({'with': [{'alias': 'select_with_limit',
                                        'selectList': [{'expr': {'type': 'COLUMN', 'name': '*'}}],
                                        'from': {'type': 'TABLE', 'name': 'NEW_FEATURE_SQL_RET_cols'},
                                        'limit': 10}],
                                'selectList': [{'expr': {'type': 'COLUMN', 'name': '*'}}],
                                'from': {'type': 'TABLE', 'name': 'select_with_limit'},
                                'join': [],
                                'where': [],
                                'groupBy': [],
                                'having': [],
                                'orderBy': [],
                                'limit': None,
                                'justification': None})

        self._query_system_prompt = f"""
        # Role and Guidelines
        You are a SQL SELECT DSL (domain specific language) query builder. Your role is to analyze the user query and SQL table descriptions and schemas then decide what additional information from the SQL table is necessary.
        When additional information is required you should craft a SQL query in as a JSON object in the following format.
        # Main JSON keys
        1. "with": ARRAY
        2. "selectList": ARRAY
        3. "from": OBJECT
        4. "join": ARRAY
        5. "where": ARRAY
        6. "groupBy": ARRAY
        4. "having": ARRAY
        5. "orderBy": ARRAY
        6. "limit": INT
        7. "justification": STRING

        You should return a JSON object only.
                
        # JSON Query Format
        ### column
        {COL_FORMAT}

        ## expressions
        ### "groupBy" and "orderBy" both use "expr" as key and a column as a value.
        for example
        "groupBy" : [{BY_FORMAT}]
        "orderBy" : [{BY_FORMAT}]
        "orderBy" has the optional key "orderType" which can be "ASC", "DESC"

        ## operator expressions
        The operator expressions take 3 required keys: 'type', 'op' and 'args'. Plus the optional 'window':
        - 'type' STRING: Has the value 'OPERATOR'.
        - 'op': STRING Is the type of operator to use in the query.
        - 'args': ARRAY: Is a list containing the expressions of the operator.
        - 'window': OBJECT (optional): A window expression type.
        The available operators are as follows:
        - String operators:
            - UPPER
            - LOWER
            - CONCAT
            - CONTAINS
            - STARTS_WITH
            - LENGTH
            - TRIM
            - CONCAT
            - CONTAINS
            - STARTS_WITH
            - COALESCE
            - REPLACE
        - Numerical operators:
            - AVG
            - MEDIAN
            - MIN
            - MAX
            - SUM
            - PLUS
            - MINUS
            - TIMES
            - DIV
            - MOD
            - POW
            - STDDEV_SAMP
            - STDDEV_POP
            - ABS
            - FLOOR
            - CEIL
            - ROUND
            - SQRT
            - EXP
            - LN
            - LOG
        - Boolean operators:
            - EQ (equals)
            - NE (not equal)
            - GT (greater than)
            - LT (less than)
            - GE (greater than or equal)
            - LE (less than or equal)
            - LIKE (SQL LIKE operator)
            - AND (logical AND)
            - OR (logical OR)
            - NOT (logical NOT)
            - ISNULL (is null)
            - ISNOTNULL (is not null)

        For example if you are asked "which customers have an average cost above 200?"
        a query like this:
        SELECT "customer", AVG("cost") AS average_user_cost
        FROM "customer_data"
        GROUP BY "customer"
        HAVING AVG("cost") > 200;
        would be represented as 
        {OPERATOR_EXPRESSION_FORMAT}

        ### Date operators:
        {DATE_OPERATOR_FORMAT}
        ### Other operators: 
            - COUNT
            - DISTINCT
            - FIRST_VALUE
            - LAST_VALUE
        For example DISTINCT can be made in the following way:
        {DISTINCT_OPERATOR_FORMAT}
        - IN operator
        IN expressions take a list of values.
        For example, the WHERE condition
        WHERE "user_id" IN ('user1','user3')
        should be defined as
        {IN_OPERATOR_FORMAT}
        
        ## CASE
        Case expressions takes 2 keys "items" and "elseValue".
        - "items": an array of WhenItem
        - "elseValue": a value expression
        WhenItem take 2 keys "when" and "then"
        - "when": an operator expression which evaluates to a boolean
        - "then": a value expression
        For example, if you are asked to "group the data into 3 groups based on call length. Everything above 20 is long, above 10 medium and the rest are short."
        This SQL snippet
        SELECT CASE 
            WHEN "call_length_seconds" > 20.0 THEN 'large'
            WHEN "call_length_seconds" > 10.0 THEN 'medium'
            ELSE 'small'
        END AS "call_bins"
        would be expressed as:
        {CASE_OPERATOR_FORMAT}

        ### "where" and "having"
        "where" and "having" are arrays of conditional expression.
        For example
        {WHERE_OPERATOR_FORMAT}

        ### alias
        "alias" is optional key that can be added to an "expr"
        When aggregates it is important create aliases so that is clear what column you are creating.
        For example, here we add "average_speed_by_user" instead of "avg":
        {ALIAS_FORMAT}
        
        ### In line expressions
        In line expressions such as INTERVAL, CURRENT_DATE, CURRENT_TIME, CURRENT_TIMESTAMP, NOW() can be made using "INLINE_EXPRESSION" which takes 2 keys 
        - "type": "INLINE_EXPRESSION"
        - "expr": string: the inline expression
        For example:
        NOW() - INTERVAL '5 MINUTES'
        {IN_LINE_FORMAT}
        This expression works because no columns are referenced in the expression. If you want to refer to columns you should do so using the standard column format.

        ### window
        {WINDOW_FORMAT}

        ## "from" and "join"
        The "from" key takes an OBJECT with 3 keys:
        - "type": "TABLE".
        - "name": sting of table name.
        - "alias": string of alias for the table (if an alias is used).
        It is essential that whenever an alias is created for a table in the "from" or "join" clauses, this alias must be used in all subsequent references to that table within the query, especially within the selectList, groupBy, orderBy, and where clauses.


        The "join" key takes an ARRAY of join objects. These objects take these keys:
        - "type": string: one of 'INNER', 'LEFT', 'RIGHT', 'FULL', 'CROSS', 'NATURAL_INNER', 'NATURAL_LEFT', 'NATURAL_RIGHT', 'NATURAL_FULL'
        - "tableLike": object with the structure as "from"
        - "on": array of join conditions using operator expressions
        - "operatorBetweenConditions": string either 'AND' or 'OR' default is 'AND'

        {FROM_JOIN_FORMAT}

        ## "with" CTE(common table expressions)
        The "with" key can take the same main keys as the regular JSON format. It's also important to set the following key:
        - "alias": sting: alias to reference the query
        For example, the query 
        WITH
        "select_with_limit" AS  (
            SELECT *
            FROM "NEW_FEATURE_SQL_RET_cols"
            LIMIT 10
            )
        SELECT *
        FROM "select_with_limit"
        Is represented as
        {CTE_FORMAT}

        ### limit
        int or null

        # "justification" format
        Justification should contain explanation of why a query was chosen.
        If no query is needed a JSON should be returned with all keys set to default except "justification" which should explain why no query is necessary.

        ## Important rules
        - Make string searches case insensitive by using UPPER.
        - Whenever an alias is created for a table in the "from" or "join" clauses, this alias must be used in all subsequent references to that table within the query, especially within the selectList, groupBy, orderBy, and where clauses.
        - Only use operators that appear in this list above.
        - Only use column names that appear in the SQL Table Description.
        - Provide a JSON object only. Do not add any other explanations, decorations, or additional information beyond the JSON objects.
        """

        self._query_user_prompt = f"""
        Evaluate the following user query and SQL Table Descriptions and Schemas to determine if a SQL queries should be generated.
        Generate the response as a JSON object following the JSON query format using the main JSON keys only.
        Provide a JSON object only. Do not add any other explanations, decorations, or additional information beyond the JSON object.
        # SQL Table Descriptions and Schemas
        {self._chosen_tables_and_columns}

        {get_user_query_examples()}
        Suggested Joins:
        {self.chosen_joins}
        Query hint:
        {self._graph_justification}
        """

        self._error_system_prompt = f"""
        You are an expert in fixing SQL query errors. You help the query builder to fix its errors.
        - Remember to use the table and column names as they appear in 'SQL Table Descriptions and Schemas' only.
        Look at the chat history, task and response and fix it. It is essential that your response is in the same JSON format as the task.
        Provide the JSON object only. Do not add any other explanations, decorations, or additional information beyond the JSON object.
        Here are the instructions that the query builder received
        # Previous instructions:
        {self.query_system_prompt}
        """

        self._error_user_prompt = """
        Correct the following JSON search query response.
        You must follow the JSON Query Format and use the same keys:
        - Generate a search query only if there is a need for more information and there were no previous errors.
        - The error query might contain table names that are incorrect. You should only use the table names that appear in 'SQL Table Descriptions and Schemas'.
        - If no query is needed, all other keys should be null and 'justification' should explain why no query is necessary.
        - Provide a JSON object only. Do not add any other explanations, decorations, or additional information beyond the a JSON object.
        - JSON object should have double quotes for keys and values. It is important to follow the JSON format strictly.
        {attempts_history}
        """


    @property
    def graph_system_prompt(self):
        return self._graph_system_prompt

    @property
    def graph_user_prompt(self):
        return self._graph_user_prompt

    @property
    def query_system_prompt(self):
        return self._query_system_prompt

    @property
    def query_user_prompt(self):
        return self._query_user_prompt

    @property
    def chosen_tables_and_columns(self):
        return self._chosen_tables_and_columns

    @property
    def graph_justification(self):
        return self._graph_justification

    @property
    def prompt(self):
        return self._prompt

    @property
    def error_system_prompt(self):
        return self._error_system_prompt

    @property
    def error_user_prompt(self):
        return self._error_user_prompt

    @property
    def previous_sql_errors(self):
        return self._previous_sql_errors

    @previous_sql_errors.setter
    def previous_sql_errors(self, value: List[SqlError]):
        self._previous_sql_errors = value
