import pandas as pd
import numpy as np
import logging
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score
from typing import Dict, List, Optional, Tuple
import dataiku

from .utils import (
    # Common
    identify_column_types,
    # Clustering
    prepare_mixed_data,
    extract_numeric_centers,
    compute_categorical_distributions,
    compute_clustering_feature_importance,
    find_optimal_clusters,
    interpret_quality_metrics,
    generate_clustering_insights_summary,
    # Outliers
    detect_numeric_outliers,
    detect_categorical_outliers,
    compute_outlier_summary,
    extract_outlier_records,
    identify_outlier_patterns,
    generate_outlier_recommendations,
    generate_outlier_insights_summary,
    # Root Cause Analysis
    compute_target_summary,
    compute_rca_feature_importance,
    compute_correlations,
    compute_categorical_impacts,
    compute_feature_interactions,
    compute_conditional_analysis,
    identify_root_causes,
    generate_rca_insights_summary,
    # Time Series Forecasting
    aggregate_time_series,
    detect_date_frequency,
    generate_future_dates,
    analyze_trend,
    analyze_seasonality,
    forecast_moving_average,
    forecast_linear_trend,
    forecast_exponential_smoothing,
    forecast_seasonal_decompose,
    forecast_prophet,
    determine_best_forecast_method,
    generate_forecast_insights,
    generate_time_series_insights_summary
)


#============== Clustering ==============

def analytic_clustering(
    input_dataset_name: str,
    output_data_with_cluster_dataset_name: str,
    output_insight_dataset_name: str,
    feature_columns: List[str],
    n_clusters: Optional[int] = None,
    max_clusters: int = 10,
    random_state: int = 42,
    standardize: bool = True,
    max_categories_for_onehot: int = 10
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Performs KMeans clustering on a Dataiku dataset and saves results to output datasets.
    Numeric and categorical feature columns are both accepted; they are encoded by
    ``prepare_mixed_data`` before clustering.
    Saves the data with cluster labels to an output dataset and insights to a metrics dataset.

    Parameters
    ----------
    input_dataset_name : str
        Name of the input Dataiku dataset containing the data to cluster
    output_data_with_cluster_dataset_name : str
        Name of the output Dataiku dataset where data with cluster labels will be saved.
        The output will contain all original columns plus a 'cluster' column.
    output_insight_dataset_name : str
        Name of the output Dataiku dataset where clustering insights will be saved.
        The dataset contains two columns: 'feature_name' and 'feature_value', with one row
        per insight (n_clusters, cluster_sizes, cluster_statistics, quality_metrics,
        feature_importance, insights_summary, elbow_plot_data).
    feature_columns : List[str]
        List of columns to use for clustering
    n_clusters : Optional[int], default=None
        Desired number of clusters (at least 2). If None, the number of clusters is
        chosen by ``find_optimal_clusters``.
    max_clusters : int, default=10
        Maximum number of clusters to test, both for the elbow data and when
        n_clusters is None
    random_state : int, default=42
        Random seed for reproducibility
    standardize : bool, default=True
        Forwarded to ``prepare_mixed_data`` and ``extract_numeric_centers``
    max_categories_for_onehot : int, default=10
        Forwarded to ``prepare_mixed_data`` as the one-hot encoding category limit

    Returns
    -------
    Tuple[pd.DataFrame, pd.DataFrame]
        A tuple containing:
        - df_result : DataFrame with original data plus 'cluster' column
        - df_insights : DataFrame with insights (columns: 'feature_name', 'feature_value')
            containing one row for each of:
            - n_clusters : number of clusters used
            - cluster_sizes : size of each cluster
            - cluster_statistics : size, percentage and (when numeric columns exist)
              mean / std / median per cluster
            - quality_metrics : silhouette_score, davies_bouldin_score, inertia,
              interpretation
            - feature_importance : output of ``compute_clustering_feature_importance``
            - insights_summary : textual summary of main insights
            - elbow_plot_data : inertia and silhouette score for each tested k

    Raises
    ------
    ValueError
        If a feature column is missing from the dataset, or if n_clusters is
        provided and is smaller than 2.
    """

    # Load dataset
    df = dataiku.Dataset(input_dataset_name).get_dataframe(infer_with_pandas=False)

    # Column validation
    missing_cols = [col for col in feature_columns if col not in df.columns]
    if missing_cols:
        raise ValueError(f"Missing columns in DataFrame: {missing_cols}")

    # Fail fast: silhouette_score rejects a single-cluster labelling with a
    # ValueError anyway, but only after the whole elbow sweep has been computed.
    if n_clusters is not None and n_clusters < 2:
        raise ValueError(f"n_clusters must be at least 2, got {n_clusters}")

    # Data preparation
    X = df[feature_columns].copy()

    # Separate numeric and categorical columns
    numeric_cols, categorical_cols, column_types = identify_column_types(X)

    # Encode and prepare data
    X_encoded, encoders_info = prepare_mixed_data(
        X, numeric_cols, categorical_cols,
        max_categories_for_onehot, standardize
    )

    # Handle missing values in encoded data (KMeans cannot fit on NaN)
    encoded_frame = pd.DataFrame(X_encoded)
    if encoded_frame.isnull().any().any():
        X_encoded = encoded_frame.fillna(0).values

    # Calculate elbow plot data (always computed, even when n_clusters is given)
    elbow_data = []
    for k in range(2, min(max_clusters + 1, len(X_encoded))):
        kmeans_test = KMeans(n_clusters=k, random_state=random_state, n_init=10)
        cluster_labels_test = kmeans_test.fit_predict(X_encoded)

        # The silhouette score is best-effort here: a failure for one k must not
        # abort the sweep, but it is logged instead of being silently dropped.
        try:
            silhouette_test = float(silhouette_score(X_encoded, cluster_labels_test))
        except Exception as exc:
            logging.warning("Silhouette score unavailable for k=%s: %s", k, exc)
            silhouette_test = None

        elbow_data.append({
            "k": k,
            "inertia": float(kmeans_test.inertia_),
            "silhouette_score": silhouette_test
        })

    # Determine optimal number of clusters if not specified
    if n_clusters is None:
        n_clusters = find_optimal_clusters(X_encoded, max_clusters, random_state)

    # Clustering with selected number of clusters
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state, n_init=10)
    cluster_labels = kmeans.fit_predict(X_encoded)

    # Add labels to original DataFrame
    df_data_with_clusters = df.copy()
    df_data_with_clusters['cluster'] = cluster_labels

    # Calculate quality metrics
    silhouette = silhouette_score(X_encoded, cluster_labels)
    davies_bouldin = davies_bouldin_score(X_encoded, cluster_labels)
    inertia = kmeans.inertia_

    # Statistics per cluster
    cluster_sizes = pd.Series(cluster_labels).value_counts().sort_index().to_dict()

    # Cluster centers for numeric columns only. They are computed but the
    # corresponding insights entry is currently disabled below.
    cluster_centers_dict = {}
    if numeric_cols:
        numeric_centers = extract_numeric_centers(
            kmeans.cluster_centers_, encoders_info,
            numeric_cols, standardize
        )
        cluster_centers_df = pd.DataFrame(
            numeric_centers,
            columns=numeric_cols,
            index=[f"Cluster_{i}" for i in range(n_clusters)]
        )
        cluster_centers_dict = cluster_centers_df.to_dict('index')

    # Descriptive statistics per cluster (numeric statistics only when available)
    cluster_stats = {}
    for cluster_id in range(n_clusters):
        size = cluster_sizes[cluster_id]
        stats = {
            "size": int(size),
            "percentage": round(size / len(df) * 100, 2)
        }
        if numeric_cols:
            cluster_data = df.loc[cluster_labels == cluster_id, numeric_cols]
            stats["numeric_means"] = cluster_data.mean().to_dict()
            stats["numeric_stds"] = cluster_data.std().to_dict()
            stats["numeric_medians"] = cluster_data.median().to_dict()
        cluster_stats[f"Cluster_{cluster_id}"] = stats

    # Categorical distributions per cluster
    categorical_distributions = compute_categorical_distributions(
        df, categorical_cols, cluster_labels, n_clusters
    )

    # Feature importance
    feature_importance = compute_clustering_feature_importance(
        df, numeric_cols, categorical_cols, cluster_labels,
        n_clusters, encoders_info
    )

    # Generate textual summary of insights
    insights_summary = generate_clustering_insights_summary(
        n_clusters, cluster_sizes, cluster_stats, categorical_distributions,
        silhouette, davies_bouldin, feature_importance, column_types
    )

    # Build result
    insights = {
        "n_clusters": n_clusters,
        "cluster_sizes": cluster_sizes,
        #"cluster_centers": cluster_centers_dict,
        "cluster_statistics": cluster_stats,
        #"categorical_distributions": categorical_distributions,
        "quality_metrics": {
            "silhouette_score": round(silhouette, 4),
            "davies_bouldin_score": round(davies_bouldin, 4),
            "inertia": round(inertia, 2),
            "interpretation": interpret_quality_metrics(silhouette, davies_bouldin)
        },
        "feature_importance": feature_importance,
        "insights_summary": insights_summary,
        "elbow_plot_data": elbow_data
    }
    df_insights = pd.DataFrame(insights.items(), columns=['feature_name', 'feature_value'])

    dataiku.Dataset(output_data_with_cluster_dataset_name).write_with_schema(df_data_with_clusters)
    dataiku.Dataset(output_insight_dataset_name).write_with_schema(df_insights)

    return df_data_with_clusters, df_insights


#============== Outliers Detection ==============

def _analytic_detect_outliers_df(
    df: pd.DataFrame,
    feature_columns: List[str],
    contamination: float = 0.05
) -> Dict:
    """
    Run per-column outlier detection on a DataFrame and assemble the findings.

    Numeric columns are handed to ``detect_numeric_outliers`` with the
    'isolation_forest' method; categorical columns are handed to
    ``detect_categorical_outliers``. Columns of any other type are skipped.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame containing the data to analyze
    feature_columns : List[str]
        Columns to inspect for outliers
    contamination : float, default=0.05
        Expected proportion of outliers, forwarded to the numeric detector

    Returns
    -------
    Dict
        Dictionary with the keys:
        - outlier_summary : overall summary of the detected outliers
        - outliers_by_column : per-column results, without the raw index lists
        - outlier_records : the outlier records themselves
        - outlier_patterns : patterns found across the outliers
        - recommendations : recommendations derived from the findings
        - insights_summary : textual summary of the findings
        - column_types : detected type of each feature column ("unknown" if absent)
        - all_outlier_indices : set of every flagged index (for flagging output rows)

    Raises
    ------
    ValueError
        If any of ``feature_columns`` is not a column of ``df``.
    """
    absent = [name for name in feature_columns if name not in df.columns]
    if absent:
        raise ValueError(f"Missing columns in DataFrame: {absent}")

    # Isolation Forest is the only method used for numeric columns
    detection_methods = ['isolation_forest']

    # Work on a copy restricted to the requested features
    features_df = df[feature_columns].copy()
    numeric_cols, categorical_cols, column_types = identify_column_types(features_df)

    # Column-by-column detection, accumulating every flagged index
    per_column = {}
    flagged_indices = set()
    for name in feature_columns:
        if name in numeric_cols:
            found = detect_numeric_outliers(
                features_df, name, detection_methods, contamination
            )
        elif name in categorical_cols:
            found = detect_categorical_outliers(features_df, name)
        else:
            continue

        if not found:
            continue
        per_column[name] = found
        flagged_indices.update(found.get("outlier_indices", []))

    summary = compute_outlier_summary(features_df, per_column, flagged_indices)
    records = extract_outlier_records(df, list(flagged_indices), feature_columns)
    patterns = identify_outlier_patterns(features_df, per_column, flagged_indices)

    # Slim down the per-column results: the raw index lists are dropped to keep
    # the payload small; boxplot_stats is carried over only when present.
    slim_per_column = {}
    for name, details in per_column.items():
        slim = {
            "outlier_count": details.get("outlier_count", 0),
            "outlier_percentage": details.get("outlier_percentage", 0),
            "method_used": details.get("method_used", "unknown"),
            "top_outlier_values": details.get("top_outlier_values", [])
        }
        if "boxplot_stats" in details:
            slim["boxplot_stats"] = details["boxplot_stats"]
        slim_per_column[name] = slim

    recommendations = generate_outlier_recommendations(
        summary, slim_per_column, patterns
    )
    insights_summary = generate_outlier_insights_summary(
        summary, slim_per_column, patterns
    )

    return {
        "outlier_summary": summary,
        "outliers_by_column": slim_per_column,
        "outlier_records": records,
        "outlier_patterns": patterns,
        "recommendations": recommendations,
        "insights_summary": insights_summary,
        "column_types": {name: column_types.get(name, "unknown") for name in feature_columns},
        "all_outlier_indices": flagged_indices
    }





def analytic_outlier_detection(
    input_dataset_name: str,
    output_data_with_outliers_dataset_name: str,
    output_insight_dataset_name: str,
    feature_columns: List[str],
    contamination: float = 0.05
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Flag outliers in a Dataiku dataset and persist both the flagged rows and the insights.

    The detection itself is delegated to ``_analytic_detect_outliers_df``.

    Parameters
    ----------
    input_dataset_name : str
        Name of the input Dataiku dataset containing the data to analyze
    output_data_with_outliers_dataset_name : str
        Name of the output Dataiku dataset receiving the input rows plus an
        'is_outlier' boolean column.
    output_insight_dataset_name : str
        Name of the output Dataiku dataset receiving the insights, as two
        columns: 'feature_name' and 'feature_value'.
    feature_columns : List[str]
        Columns to inspect for outliers
    contamination : float, default=0.05
        Expected proportion of outliers, forwarded to the detection function

    Returns
    -------
    Tuple[pd.DataFrame, pd.DataFrame]
        - df_result : input data plus the 'is_outlier' column
        - df_insights : insights table (columns: 'feature_name', 'feature_value')
    """
    source_df = dataiku.Dataset(input_dataset_name).get_dataframe(infer_with_pandas=True)

    detection = _analytic_detect_outliers_df(
        df=source_df,
        feature_columns=feature_columns,
        contamination=contamination
    )

    # A row is an outlier when its index label was flagged for any column
    flagged_df = source_df.copy()
    flagged_labels = detection.get("all_outlier_indices", set())
    flagged_df['is_outlier'] = flagged_df.index.isin(flagged_labels)

    # One insights row per exported key, with a fallback when the key is absent
    exported = (
        ("outlier_summary", {}),
        ("outliers_by_column", {}),
        ("outlier_patterns", {}),
        ("recommendations", []),
        ("insights_summary", ""),
    )
    insight_rows = [(key, detection.get(key, fallback)) for key, fallback in exported]
    insights_df = pd.DataFrame(insight_rows, columns=['feature_name', 'feature_value'])

    # Persist both outputs
    dataiku.Dataset(output_data_with_outliers_dataset_name).write_with_schema(flagged_df)
    dataiku.Dataset(output_insight_dataset_name).write_with_schema(insights_df)

    return flagged_df, insights_df


#============== Root Cause Analysis ==============
def analytic_root_cause_analysis(
    input_dataset_name: str,
    output_insight_dataset_name: str,
    target_column: str,
    feature_columns: List[str],
    max_interactions: int = 5,
    min_correlation_threshold: float = 0.1
) -> pd.DataFrame:
    """
    Run root cause analysis on a Dataiku dataset and persist the insights.

    The analysis itself is delegated to ``_analytic_root_cause_analysis_df``;
    this function loads the data, reshapes the result and writes it out.

    Parameters
    ----------
    input_dataset_name : str
        Name of the input Dataiku dataset containing the data to analyze
    output_insight_dataset_name : str
        Name of the output Dataiku dataset receiving the insights, as two
        columns: 'feature_name' and 'feature_value'.
    target_column : str
        Name of the target variable to analyze
    feature_columns : List[str]
        Feature columns to consider as potential root causes
    max_interactions : int, default=5
        Forwarded to the analysis function as the interaction limit
    min_correlation_threshold : float, default=0.1
        Forwarded to the analysis function as the correlation threshold

    Returns
    -------
    pd.DataFrame
        Insights table (columns: 'feature_name', 'feature_value')
    """
    source_df = dataiku.Dataset(input_dataset_name).get_dataframe(infer_with_pandas=True)

    analysis = _analytic_root_cause_analysis_df(
        df=source_df,
        target_column=target_column,
        feature_columns=feature_columns,
        max_interactions=max_interactions,
        min_correlation_threshold=min_correlation_threshold
    )

    # One insights row per exported key, with a fallback when the key is absent
    exported = (
        ("target_summary", {}),
        ("feature_importance", {}),
        ("correlations", {}),
        ("categorical_impacts", {}),
        ("interactions", []),
        ("conditional_analysis", {}),
        ("root_causes", []),
        ("insights_summary", ""),
    )
    insight_rows = [(key, analysis.get(key, fallback)) for key, fallback in exported]
    insights_df = pd.DataFrame(insight_rows, columns=['feature_name', 'feature_value'])

    # Persist the insights
    dataiku.Dataset(output_insight_dataset_name).write_with_schema(insights_df)

    return insights_df



#============== Time Series Forecasting Light (Prophet only) ==============

def analytic_time_series_forecasting(
    input_dataset_name: str,
    output_data_with_forecast_dataset_name: str,
    output_insight_dataset_name: str,
    date_column: str,
    value_column: str,
    split_column: Optional[str] = None,
    aggregation_freq: str = 'D',
    aggregation_method: str = 'mean',
    forecast_horizon: int = 10
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Simplified time series forecasting using only Prophet.
    Performs forecasting on aggregated data and saves results to output datasets.
    Supports optional splitting by a column for group-wise forecasting.

    Parameters
    ----------
    input_dataset_name : str
        Name of the input Dataiku dataset containing the time series data
    output_data_with_forecast_dataset_name : str
        Name of the output Dataiku dataset where aggregated data with forecast values will be saved.
        The output will contain aggregated data plus: forecast, forecast_lower, forecast_upper, is_forecast columns.
    output_insight_dataset_name : str
        Name of the output Dataiku dataset where forecasting insights will be saved.
        The dataset contains two columns: 'feature_name' and 'feature_value'
        (plus 'split_column' and 'split_value' when split_column is provided).
        Insights include: time_series_summary, trend_analysis, seasonality_analysis,
        forecast_accuracy, insights_summary.
    date_column : str
        Name of the column containing dates/timestamps
    value_column : str
        Name of the column containing the values to forecast
    split_column : Optional[str], default=None
        Optional column name to split forecasting by groups. If provided, forecasting will be performed
        separately for each unique value in this column.
    aggregation_freq : str, default='D'
        Frequency for data aggregation before forecasting. Examples: 'D' (daily), 'W' (weekly),
        'M' (monthly), 'Q' (quarterly), 'Y' (yearly).
        Uses pandas frequency strings: https://pandas.pydata.org/docs/user_guide/timeseries.html#offset-aliases
    aggregation_method : str, default='mean'
        Method for aggregation. Options: 'mean', 'sum', 'median', 'min', 'max', 'count'.
    forecast_horizon : int, default=10
        Number of periods ahead to forecast

    Returns
    -------
    Tuple[pd.DataFrame, pd.DataFrame]
        A tuple containing:
        - df_result : DataFrame with aggregated data plus forecast columns (forecast, forecast_lower, forecast_upper, is_forecast)
        - df_insights : DataFrame with insights (columns: 'feature_name', 'feature_value')
            containing: time_series_summary, trend_analysis, seasonality_analysis,
            forecast_accuracy, insights_summary

    Raises
    ------
    ValueError
        If a requested column is missing, the date column cannot be converted
        to datetime, or fewer than 3 complete observations remain. Errors raised
        by the per-group forecasting helper propagate unchanged.
    """
    # Load dataset
    df = dataiku.Dataset(input_dataset_name).get_dataframe(infer_with_pandas=True)

    # Validation
    if date_column not in df.columns:
        raise ValueError(f"Date column '{date_column}' not found in DataFrame")

    if value_column not in df.columns:
        raise ValueError(f"Value column '{value_column}' not found in DataFrame")

    # Prepare data
    columns_needed = [date_column, value_column]
    if split_column:
        if split_column not in df.columns:
            raise ValueError(f"Split column '{split_column}' not found in DataFrame")
        columns_needed.append(split_column)

    df_clean = df[columns_needed].copy()

    # Convert date column to datetime. Only real errors are translated (a bare
    # except would also trap KeyboardInterrupt/SystemExit), and the original
    # cause is chained so the parsing failure stays visible in the traceback.
    try:
        df_clean[date_column] = pd.to_datetime(df_clean[date_column])
    except Exception as err:
        raise ValueError(f"Unable to convert '{date_column}' to datetime") from err

    # Remove rows with missing values
    df_clean = df_clean.dropna()

    if len(df_clean) < 3:
        raise ValueError("Insufficient data for time series forecasting (minimum 3 observations required)")

    # Sort by date (and split_column if present)
    sort_cols = [split_column, date_column] if split_column else [date_column]
    df_clean = df_clean.sort_values(sort_cols).reset_index(drop=True)

    # Process each split group separately if split_column is provided
    if split_column:
        all_results = []
        all_insights = []

        for group_value in df_clean[split_column].unique():
            df_group = df_clean[df_clean[split_column] == group_value].copy()
            df_group_result, df_group_insights = _process_single_forecast(
                df_group, date_column, value_column, split_column, group_value,
                aggregation_freq, aggregation_method, forecast_horizon
            )
            all_results.append(df_group_result)
            all_insights.append(df_group_insights)

        # Combine results
        df_result = pd.concat(all_results, ignore_index=True)
        df_insights = pd.concat(all_insights, ignore_index=True)
    else:
        # Process without splitting
        df_result, df_insights = _process_single_forecast(
            df_clean, date_column, value_column, None, None,
            aggregation_freq, aggregation_method, forecast_horizon
        )

    # Save to output datasets
    dataiku.Dataset(output_data_with_forecast_dataset_name).write_with_schema(df_result)
    dataiku.Dataset(output_insight_dataset_name).write_with_schema(df_insights)

    return df_result, df_insights

## Keep
def _process_single_forecast(
    df: pd.DataFrame,
    date_column: str,
    value_column: str,
    split_column: Optional[str],
    split_value: Optional[str],
    aggregation_freq: str,
    aggregation_method: str,
    forecast_horizon: int
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Helper function to process forecasting for a single group (or entire dataset if no split).

    Parameters
    ----------
    df : pd.DataFrame
        Data for one group, containing date_column, value_column and, when
        splitting, split_column
    date_column : str
        Name of the column containing dates/timestamps
    value_column : str
        Name of the column containing the values to forecast
    split_column : Optional[str]
        Name of the split column, or None when forecasting the whole dataset
    split_value : Optional[str]
        Value of split_column for this group, written on the forecast rows and
        on the insights rows
    aggregation_freq : str
        pandas frequency string used to resample the series
    aggregation_method : str
        One of 'mean', 'sum', 'median', 'min', 'max', 'count' (validated here
        only on the split path; otherwise left to ``aggregate_time_series``)
    forecast_horizon : int
        Number of periods ahead to forecast

    Returns
    -------
    Tuple[pd.DataFrame, pd.DataFrame]
        - df_result : aggregated data followed by ``forecast_horizon`` forecast rows,
          with columns is_forecast, forecast, forecast_lower, forecast_upper
        - df_insights : one row per insight (columns 'feature_name', 'feature_value',
          plus 'split_column' and 'split_value' when a split is active)

    Raises
    ------
    ValueError
        On an unknown aggregation method (split path), when fewer than 3
        usable observations remain after aggregation, or when Prophet
        returns no forecast.
    """
    # Apply aggregation (preserving split_column if present)
    if split_column:
        supported_methods = ('mean', 'sum', 'median', 'min', 'max', 'count')
        if aggregation_method not in supported_methods:
            raise ValueError(f"Unknown aggregation method: {aggregation_method}")

        df_indexed = df.copy()
        df_indexed[date_column] = pd.to_datetime(df_indexed[date_column])
        df_indexed = df_indexed.set_index(date_column)

        # Resample each group on its own so the split value can be re-attached
        agg_dfs = []
        for group_name, group_df in df_indexed.groupby(split_column):
            resampler = group_df[[value_column]].resample(aggregation_freq)
            group_agg = getattr(resampler, aggregation_method)()
            group_agg[split_column] = group_name
            agg_dfs.append(group_agg)

        df_agg = pd.concat(agg_dfs).reset_index().dropna()
    else:
        # No split column, use standard aggregation
        df_agg = aggregate_time_series(
            df,
            date_column,
            [value_column],
            aggregation_freq,
            aggregation_method
        )

    if len(df_agg) < 3:
        raise ValueError(f"Insufficient data after aggregation with frequency '{aggregation_freq}' (minimum 3 observations required)")

    # Extract dates and values
    dates = df_agg[date_column].values
    ts_raw = pd.to_numeric(df_agg[value_column], errors='coerce')

    # Remove any NaN values
    valid_mask = ~ts_raw.isna()
    if not valid_mask.any():
        raise ValueError(f"No valid numeric values in '{value_column}' after aggregation")

    ts = np.asarray(ts_raw[valid_mask].values, dtype=np.float64)
    dates_filtered = dates[valid_mask.values]

    if len(ts) < 3:
        raise ValueError(f"Insufficient valid data after filtering (minimum 3 observations required)")

    # Frequency detection and future-date generation both need real datetimes
    has_datetime_axis = (
        pd.api.types.is_datetime64_any_dtype(pd.Series(dates_filtered))
        and len(dates_filtered) > 1
    )

    # Detect date frequency
    date_frequency_info = {}
    if has_datetime_axis:
        date_frequency_info = detect_date_frequency(dates_filtered)

    # Time series summary
    if ts[-1] > ts[0]:
        trend_direction = "increasing"
    elif ts[-1] < ts[0]:
        trend_direction = "decreasing"
    else:
        trend_direction = "stable"

    time_series_summary = {
        "start_date": str(dates_filtered[0]),
        "end_date": str(dates_filtered[-1]),
        "n_observations": len(ts),
        "mean": float(np.mean(ts)),
        "median": float(np.median(ts)),
        "std": float(np.std(ts)),
        "min": float(np.min(ts)),
        "max": float(np.max(ts)),
        "trend_direction": trend_direction,
        "trend_magnitude": float((float(ts[-1]) - float(ts[0])) / float(len(ts))) if len(ts) > 1 else 0.0
    }

    # Trend analysis
    trend_analysis = analyze_trend(ts, dates_filtered)

    # Seasonality analysis
    seasonality_analysis = analyze_seasonality(ts, dates_filtered, None)

    # Hold out the tail for validation: at most the last 20% of the series,
    # and never more than forecast_horizon points.
    train_size = max(int(len(ts) * 0.8), len(ts) - forecast_horizon)
    if train_size < len(ts):
        ts_train = ts[:train_size]
        ts_test = ts[train_size:]
        dates_train = dates_filtered[:train_size]
        dates_test = dates_filtered[train_size:]
    else:
        ts_train = ts
        ts_test = None
        dates_train = dates_filtered
        dates_test = None

    # Generate forecast using Prophet
    prophet_forecast, prophet_accuracy = forecast_prophet(
        ts_train, dates_train, ts_test, dates_test, forecast_horizon
    )

    if not prophet_forecast:
        raise ValueError("Prophet forecasting failed. Please check your data and try again.")

    # Get forecast values and intervals
    forecast_values = prophet_forecast.get("forecast_values", [])
    forecast_lower = prophet_forecast.get("forecast_lower", [])
    forecast_upper = prophet_forecast.get("forecast_upper", [])

    # Generate future dates
    if has_datetime_axis:
        future_dates = generate_future_dates(dates_filtered[-1], forecast_horizon, date_frequency_info)
    else:
        future_dates = [i + 1 for i in range(forecast_horizon)]

    # Create result DataFrame with aggregated data
    df_result = df_agg.copy()
    df_result['is_forecast'] = False
    df_result['forecast'] = None
    df_result['forecast_lower'] = None
    df_result['forecast_upper'] = None

    # Add forecast rows, using the first aggregated row as a template so that
    # every column of df_result is present on the forecast rows as well.
    forecast_rows = []
    if forecast_horizon > 0:
        template_row = df_result.iloc[0]
        for i in range(forecast_horizon):
            row = template_row.copy()
            row[date_column] = future_dates[i] if i < len(future_dates) else None
            row[value_column] = None  # No actual value for forecast
            if split_column and split_value is not None:
                row[split_column] = split_value  # Preserve split value
            row['is_forecast'] = True
            row['forecast'] = forecast_values[i] if i < len(forecast_values) else None
            row['forecast_lower'] = forecast_lower[i] if i < len(forecast_lower) else None
            row['forecast_upper'] = forecast_upper[i] if i < len(forecast_upper) else None
            forecast_rows.append(row)

    # Append forecast rows
    if forecast_rows:
        forecast_df = pd.DataFrame(forecast_rows)
        df_result = pd.concat([df_result, forecast_df], ignore_index=True)

    # Generate decomposition analysis using Prophet
    decomposition_analysis = _generate_prophet_decomposition(ts_train, dates_train)

    # Generate model quality metrics
    model_quality = _generate_model_quality_metrics(
        ts_train, dates_train, ts_test, dates_test, prophet_forecast, prophet_accuracy
    )

    # Generate textual summary
    insights_summary = _generate_light_insights_summary(
        time_series_summary, trend_analysis, seasonality_analysis,
        prophet_accuracy, decomposition_analysis, model_quality
    )

    # Build insights DataFrame. decomposition_analysis and model_quality only
    # feed the textual summary; they are not exported as rows of their own.
    feature_names = [
        'time_series_summary',
        'trend_analysis',
        'seasonality_analysis',
        'forecast_accuracy',
        'insights_summary'
    ]
    feature_values = [
        time_series_summary,
        trend_analysis,
        seasonality_analysis,
        prophet_accuracy or {},
        insights_summary
    ]
    insights_data = {
        'feature_name': feature_names,
        'feature_value': feature_values
    }

    # Add split information if applicable. The column length is derived from
    # the row list: a hard-coded length that drifts from the number of rows
    # makes the DataFrame constructor fail on mismatched array lengths.
    if split_column and split_value is not None:
        n_rows = len(feature_names)
        insights_data['split_column'] = [split_column] * n_rows
        insights_data['split_value'] = [split_value] * n_rows

    df_insights = pd.DataFrame(insights_data)

    return df_result, df_insights


def _generate_prophet_decomposition(ts_train: np.ndarray, dates_train: np.ndarray) -> Dict:
    """
    Decompose a training series by fitting an additive Prophet model on it.

    Parameters
    ----------
    ts_train : np.ndarray
        Observed training values; cast to float64 before fitting.
    dates_train : np.ndarray
        Dates matching ``ts_train``; parsed with ``pd.to_datetime``.

    Returns
    -------
    Dict
        Lists of floats under the keys ``trend``, ``yearly_seasonal``,
        ``weekly_seasonal``, ``residual`` (observed minus fitted) and
        ``fitted``. A seasonal column missing from the prediction frame is
        reported as zeros. If any step fails (including the Prophet import),
        the exception is logged and the same keys are returned as empty
        lists together with an ``error`` message.
    """
    component_names = ("trend", "yearly_seasonal", "weekly_seasonal", "residual", "fitted")
    try:
        # Imported inside the try so an import failure is handled below
        from prophet import Prophet

        # Prophet expects a frame with 'ds' (dates) and 'y' (values)
        history = pd.DataFrame({
            'ds': pd.to_datetime(dates_train),
            'y': ts_train.astype(np.float64)
        })

        prophet_model = Prophet(
            yearly_seasonality='auto',
            weekly_seasonality='auto',
            daily_seasonality=False,
            seasonality_mode='additive'
        )
        prophet_model.fit(history)

        # Predicting on the training frame yields the per-component columns
        in_sample = prophet_model.predict(history)

        trend_part = in_sample['trend'].values.tolist()
        yearly_part = in_sample.get('yearly', pd.Series([0] * len(history))).values.tolist()
        weekly_part = in_sample.get('weekly', pd.Series([0] * len(history))).values.tolist()

        # Residual = observed value minus in-sample fit
        yhat = in_sample['yhat'].values
        residual_part = (ts_train - yhat).tolist()

        raw_parts = (trend_part, yearly_part, weekly_part, residual_part, yhat)
        return {
            name: [float(v) for v in values]
            for name, values in zip(component_names, raw_parts)
        }
    except Exception as e:
        logging.exception("Error in time series decomposition")
        failure = {"error": str(e)}
        failure.update((name, []) for name in component_names)
        return failure


def _generate_model_quality_metrics(
    ts_train: np.ndarray,
    dates_train: np.ndarray,
    ts_test: Optional[np.ndarray],
    dates_test: Optional[np.ndarray],
    prophet_forecast: Dict,
    prophet_accuracy: Optional[Dict]
) -> Dict:
    """
    Collect actual-vs-predicted series and residuals from a Prophet fit.

    A fresh additive Prophet model is fitted on the training data, then used
    to predict the training dates and, when provided, the test dates.

    Parameters
    ----------
    ts_train, dates_train : np.ndarray
        Training values and their dates.
    ts_test, dates_test : Optional[np.ndarray]
        Hold-out values and dates; test entries are only produced when both
        are given and ``ts_test`` is non-empty.
    prophet_forecast : Dict
        Accepted for interface compatibility; not read by this function.
    prophet_accuracy : Optional[Dict]
        When truthy, its ``mae``/``rmse``/``mape`` entries are copied into the
        result as ``test_mae``/``test_rmse``/``test_mape``.

    Returns
    -------
    Dict
        Train series (actual, predicted, residuals, dates) plus train MAE and
        RMSE, and the optional test entries described above. If any step
        fails (including the Prophet import), the exception is logged and a
        dict with an ``error`` message and empty train lists is returned.
    """
    try:
        # Imported inside the try so an import failure is handled below
        from prophet import Prophet

        train_frame = pd.DataFrame({
            'ds': pd.to_datetime(dates_train),
            'y': ts_train.astype(np.float64)
        })

        fitted_model = Prophet(
            yearly_seasonality='auto',
            weekly_seasonality='auto',
            daily_seasonality=False,
            seasonality_mode='additive'
        )
        fitted_model.fit(train_frame)

        # In-sample fit and its residuals
        yhat_train = fitted_model.predict(train_frame)['yhat'].values
        resid_train = (ts_train - yhat_train).tolist()

        # Hold-out predictions are optional
        yhat_test = None
        resid_test = None
        has_holdout = ts_test is not None and dates_test is not None and len(ts_test) > 0
        if has_holdout:
            holdout_frame = pd.DataFrame({
                'ds': pd.to_datetime(dates_test),
                'y': ts_test.astype(np.float64)
            })
            yhat_test = fitted_model.predict(holdout_frame)['yhat'].values.tolist()
            resid_test = (ts_test - yhat_test).tolist()

        metrics = {
            "train_actual": ts_train.tolist(),
            "train_predicted": [float(v) for v in yhat_train],
            "train_residuals": [float(v) for v in resid_train],
            "train_dates": [str(d) for d in dates_train],
            "train_mae": float(np.mean(np.abs(resid_train))),
            "train_rmse": float(np.sqrt(np.mean(np.array(resid_train) ** 2))),
        }

        if yhat_test is not None:
            metrics["test_actual"] = ts_test.tolist()
            metrics["test_predicted"] = yhat_test
            metrics["test_residuals"] = [float(v) for v in resid_test]
            metrics["test_dates"] = [str(d) for d in dates_test]

        # Carry over the externally computed accuracy figures, if any
        if prophet_accuracy:
            accuracy_keys = (("test_mae", "mae"), ("test_rmse", "rmse"), ("test_mape", "mape"))
            for target_key, source_key in accuracy_keys:
                metrics[target_key] = prophet_accuracy.get(source_key)

        return metrics
    except Exception as e:
        logging.exception("Error in prophet forecast quality metrics")
        fallback = {"error": str(e)}
        for empty_key in ("train_actual", "train_predicted", "train_residuals", "train_dates"):
            fallback[empty_key] = []
        return fallback


def _generate_light_insights_summary(
    time_series_summary: Dict,
    trend_analysis: Dict,
    seasonality_analysis: Dict,
    forecast_accuracy: Optional[Dict],
    decomposition_analysis: Dict,
    model_quality: Dict
) -> str:
    """
    Generate a textual summary of forecasting insights for the light version.

    Parameters
    ----------
    time_series_summary : Dict
        Must contain ``start_date``, ``end_date``, ``n_observations``,
        ``mean`` (numeric) and ``trend_direction``.
    trend_analysis : Dict
        ``trend_type`` is reported when present and truthy.
    seasonality_analysis : Dict
        ``has_seasonality`` selects the seasonality sentence; ``period`` is
        reported when available, otherwise ``unknown``.
    forecast_accuracy : Optional[Dict]
        When truthy, ``mae``/``rmse``/``mape`` are reported (``N/A`` for
        missing keys).
    decomposition_analysis : Dict
        Accepted for interface compatibility; not read by this function.
    model_quality : Dict
        ``train_mae`` and ``train_rmse`` are reported when both are present.

    Returns
    -------
    str
        The summary sentences joined by single spaces.

    Raises
    ------
    KeyError
        If a required ``time_series_summary`` key is missing.
    """
    summary_parts = []

    # Time series overview
    summary_parts.append(f"Time series spans from {time_series_summary['start_date']} to {time_series_summary['end_date']} "
                         f"with {time_series_summary['n_observations']} observations.")
    summary_parts.append(f"Mean value: {time_series_summary['mean']:.2f}, "
                         f"showing a {time_series_summary['trend_direction']} trend.")

    # Trend analysis
    if trend_analysis.get("trend_type"):
        summary_parts.append(f"Trend analysis indicates a {trend_analysis['trend_type']} pattern.")

    # Seasonality
    if seasonality_analysis.get("has_seasonality"):
        summary_parts.append(f"Seasonality detected with period {seasonality_analysis.get('period', 'unknown')}.")
    else:
        summary_parts.append("No significant seasonality detected.")

    # Model quality: compare against None rather than relying on truthiness,
    # so a perfect fit (MAE == 0.0) is still reported, and a missing RMSE
    # skips the sentence instead of raising KeyError.
    train_mae = model_quality.get("train_mae")
    train_rmse = model_quality.get("train_rmse")
    if train_mae is not None and train_rmse is not None:
        summary_parts.append(f"Model training MAE: {train_mae:.2f}, "
                             f"RMSE: {train_rmse:.2f}.")

    if forecast_accuracy:
        summary_parts.append(f"Forecast accuracy - MAE: {forecast_accuracy.get('mae', 'N/A')}, "
                             f"RMSE: {forecast_accuracy.get('rmse', 'N/A')}, "
                             f"MAPE: {forecast_accuracy.get('mape', 'N/A')}%.")

    return " ".join(summary_parts)


#============== Root Cause Analysis (DataFrame-based) ==============

def _analytic_root_cause_analysis_df(
    df: pd.DataFrame,
    target_column: str,
    feature_columns: List[str],
    max_interactions: int = 5,
    min_correlation_threshold: float = 0.1
) -> Dict:
    """
    Run a root cause analysis of a target variable against candidate features.

    The target and feature columns are extracted from ``df``, rows with a
    missing target are dropped, and the remaining data is passed through the
    RCA helper functions. The target column is removed from
    ``feature_columns`` if it appears there.

    Parameters
    ----------
    df : pd.DataFrame
        Data to analyze.
    target_column : str
        Name of the target variable.
    feature_columns : List[str]
        Candidate root-cause columns.
    max_interactions : int, default=5
        Forwarded to ``compute_feature_interactions``.
    min_correlation_threshold : float, default=0.1
        Forwarded to ``identify_root_causes``.

    Returns
    -------
    Dict
        Keys and the helper producing each value:
        - target_summary : ``compute_target_summary``
        - feature_importance : ``compute_rca_feature_importance``
        - correlations : ``compute_correlations`` (empty dict when there are
          no numeric features)
        - categorical_impacts : ``compute_categorical_impacts`` (empty dict
          when there are no categorical features)
        - interactions : ``compute_feature_interactions``
        - conditional_analysis : ``compute_conditional_analysis``
        - root_causes : ``identify_root_causes``
        - insights_summary : ``generate_rca_insights_summary``
        - column_types : detected type per feature, ``"unknown"`` if absent

    Raises
    ------
    ValueError
        If the target or any feature column is missing from ``df``, or if no
        rows remain after dropping missing target values.
    """

    # --- Input validation ---
    available_columns = df.columns
    if target_column not in available_columns:
        raise ValueError(f"Target column '{target_column}' not found in DataFrame")

    absent_features = [name for name in feature_columns if name not in available_columns]
    if absent_features:
        raise ValueError(f"Missing feature columns in DataFrame: {absent_features}")

    # The target must not be analyzed as its own cause
    if target_column in feature_columns:
        feature_columns = list(filter(lambda name: name != target_column, feature_columns))

    # --- Working copy restricted to relevant columns, target must be present ---
    working = df[[target_column] + feature_columns].copy().dropna(subset=[target_column])
    if working.shape[0] == 0:
        raise ValueError("No valid data after removing missing target values")

    # --- Column typing ---
    numeric_cols, categorical_cols, column_types = identify_column_types(working[feature_columns])

    # --- Individual analyses ---
    target_summary = compute_target_summary(working, target_column)

    feature_importance = compute_rca_feature_importance(
        working, target_column, numeric_cols, categorical_cols
    )

    # Correlations only make sense when numeric features exist
    correlations = (
        compute_correlations(working, target_column, numeric_cols)
        if numeric_cols else {}
    )

    # Likewise for categorical impacts
    categorical_impacts = (
        compute_categorical_impacts(working, target_column, categorical_cols)
        if categorical_cols else {}
    )

    interactions = compute_feature_interactions(
        working, target_column, feature_columns,
        numeric_cols, categorical_cols, max_interactions
    )

    conditional_analysis = compute_conditional_analysis(
        working, target_column, feature_columns,
        numeric_cols, categorical_cols
    )

    # --- Synthesis ---
    root_causes = identify_root_causes(
        feature_importance, correlations, categorical_impacts,
        interactions, min_correlation_threshold
    )

    insights_summary = generate_rca_insights_summary(
        target_summary, feature_importance, correlations,
        categorical_impacts, root_causes
    )

    detected_types = {name: column_types.get(name, "unknown") for name in feature_columns}

    return {
        "target_summary": target_summary,
        "feature_importance": feature_importance,
        "correlations": correlations,
        "categorical_impacts": categorical_impacts,
        "interactions": interactions,
        "conditional_analysis": conditional_analysis,
        "root_causes": root_causes,
        "insights_summary": insights_summary,
        "column_types": detected_types
    }
