import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

from faker import Faker

class UniversalDataGenerator:
    """
    A generic engine to generate synthetic data based on a user-defined schema.
    Supports:
    - Statistical Distributions (Normal, Uniform, Integer)
    - Categorical Sampling (Weighted)
    - Date Ranges
    - Correlated Columns (weighted combinations of existing columns, usable as targets)
    - Faker-backed Columns

    Every ``generate_*`` method adds one column to the internal DataFrame and
    returns ``self`` so calls can be chained.
    """

    def __init__(self, n_rows=1000, seed=42, locale="en_US"):
        """
        Create an empty generator.

        :param n_rows: Number of rows every generated column will contain
        :param seed: Seed applied to the Faker instance and to the global
            ``numpy.random`` and ``random`` generators
        :param locale: Locale passed to ``Faker``
        """
        self.n_rows = n_rows
        self.seed = seed
        self.df = pd.DataFrame()
        self.faker = Faker(locale)
        self.faker.seed_instance(seed)
        # NOTE: these seed the process-wide generators, not per-instance ones,
        # so constructing a generator resets randomness for the whole process.
        np.random.seed(seed)
        random.seed(seed)

    def generate_independent(self, name, type, **kwargs):
        """
        Generates a standalone column based on a distribution.

        :param name: Name of the column to create (or overwrite)
        :param type: 'normal', 'uniform', 'integer', 'category', 'date'
        :param kwargs: Distribution settings, depending on ``type``:
            - 'normal': ``mean`` (default 0), ``std`` (default 1)
            - 'uniform': ``min`` (default 0), ``max`` (default 1)
            - 'integer': ``min`` (default 0), ``max`` (default 100);
              ``max`` is exclusive
            - 'category': ``choices`` (default ['A', 'B']), ``weights``
              (optional list of probabilities summing to 1)
            - 'date': ``start`` (default '2023-01-01'), ``end`` (default
              '2024-01-01') as 'YYYY-MM-DD' strings; ``end`` is exclusive
              unless it equals ``start``
        :return: self, to allow chaining
        :raises ValueError: If ``type`` is not one of the supported values, or
            if a 'date' range has ``end`` before ``start``
        """
        if type == 'normal':
            mean = kwargs.get('mean', 0)
            std = kwargs.get('std', 1)
            self.df[name] = np.random.normal(mean, std, self.n_rows)

        elif type == 'uniform':
            low = kwargs.get('min', 0)
            high = kwargs.get('max', 1)
            self.df[name] = np.random.uniform(low, high, self.n_rows)

        elif type == 'integer':
            low = kwargs.get('min', 0)
            high = kwargs.get('max', 100)
            # randint samples from the half-open interval [low, high).
            self.df[name] = np.random.randint(low, high, self.n_rows)

        elif type == 'category':
            choices = kwargs.get('choices', ['A', 'B'])
            weights = kwargs.get('weights', None)  # List of probs summing to 1
            self.df[name] = np.random.choice(choices, self.n_rows, p=weights)

        elif type == 'date':
            start_str = kwargs.get('start', '2023-01-01')
            end_str = kwargs.get('end', '2024-01-01')
            start = datetime.strptime(start_str, "%Y-%m-%d")
            end = datetime.strptime(end_str, "%Y-%m-%d")
            days_range = (end - start).days

            if days_range < 0:
                raise ValueError(
                    f"Date range end '{end_str}' is before start '{start_str}'"
                )

            if days_range == 0:
                # randint(0, 0) would raise; a one-day range has exactly one
                # possible value, so every row gets the start date.
                random_days = np.zeros(self.n_rows, dtype=int)
            else:
                # Add random days to start date
                random_days = np.random.randint(0, days_range, self.n_rows)
            self.df[name] = [start + timedelta(days=int(d)) for d in random_days]

        else:
            # Previously an unknown type was ignored and no column was created,
            # which only surfaced later as a missing-column error.
            raise ValueError(
                f"Unsupported column type '{type}'. Expected one of: "
                "'normal', 'uniform', 'integer', 'category', 'date'"
            )

        return self

    def generate_correlated(self, name, dependencies, noise_std=0.0, output_type='numeric',
                            threshold=0.5, rescale=False, min_value=0.0, max_value=100.0):
        """Generate a column correlated with existing columns using provided weights.

        Each dependency column is converted to numbers, centered on its mean and
        divided by its standard deviation (when that is positive), multiplied by
        its weight, and summed. Optional Gaussian noise is then added.

        :param name: Name of the column to create (or overwrite)
        :param dependencies: Non-empty list of dicts, each with the source column
            under 'column' (or 'source_column') and an optional 'correlation'
            weight (default 1.0)
        :param noise_std: Standard deviation of Gaussian noise added to the
            combined score; no noise is added when it is not positive
        :param output_type: 'binary' for classification (0/1); any other value
            produces numeric output
        :param threshold: For binary output, the quantile (0 to 1) of the combined
            score used as the cut-off: rows at or above that quantile become 1,
            others become 0 (so roughly ``1 - threshold`` of the rows are 1)
        :param rescale: For numeric output, whether to rescale to a specific range
        :param min_value: Minimum value when rescaling numeric output
        :param max_value: Maximum value when rescaling numeric output
        :return: self, to allow chaining
        :raises ValueError: If ``dependencies`` is empty, a dependency column does
            not exist, or ``threshold`` is outside [0, 1] for binary output
        """
        if not dependencies:
            raise ValueError("Dependencies list cannot be empty when creating correlated columns")

        combined = np.zeros(self.n_rows, dtype=float)

        for dep in dependencies:
            column = dep.get("column") or dep.get("source_column")
            if column not in self.df.columns:
                raise ValueError(f"Column '{column}' does not exist and cannot be used as a dependency")

            weight = float(dep.get("correlation", 1.0))
            numeric_series = self._ensure_numeric(self.df[column])
            centered = numeric_series - numeric_series.mean()
            std = numeric_series.std()
            # A zero (or NaN) standard deviation would make the division
            # meaningless, so such columns are only centered.
            normalized = centered / std if std and std > 0 else centered

            combined += weight * normalized.values

        if noise_std > 0:
            combined += np.random.normal(0, noise_std, self.n_rows)

        # Apply output type transformation
        if output_type == 'binary':
            if not 0 <= threshold <= 1:
                raise ValueError(
                    f"threshold must be between 0 and 1 for binary output, got {threshold}"
                )
            # Use percentile-based threshold for binary classification
            threshold_value = np.percentile(combined, threshold * 100)
            self.df[name] = (combined >= threshold_value).astype(int)
        else:
            # Numeric output - optionally rescale to specified range
            if rescale:
                # Min-max normalization to [min_value, max_value]; skipped when
                # all values are identical because the range would be zero.
                current_min = combined.min()
                current_max = combined.max()
                if current_max > current_min:
                    combined = (combined - current_min) / (current_max - current_min)
                    combined = combined * (max_value - min_value) + min_value
            self.df[name] = combined

        return self

    def get_dataframe(self):
        """Return the internal DataFrame holding all generated columns (not a copy)."""
        return self.df

    def generate_faker(self, name, provider, unique=False, **kwargs):
        """Generate a column using any Faker provider.

        :param name: Name of the column to create (or overwrite)
        :param provider: Name of the Faker method to call once per row
        :param unique: If True, values are drawn through ``faker.unique``
        :param kwargs: Keyword arguments forwarded to the provider; for the
            'profile' provider only ``fields`` is forwarded
        :return: self, to allow chaining
        :raises ValueError: If the Faker instance has no attribute named ``provider``
        """
        fake_source = self.faker.unique if unique else self.faker

        if provider == "profile":
            fields = kwargs.get("fields")
            data = [fake_source.profile(fields=fields) for _ in range(self.n_rows)]
        else:
            if not hasattr(fake_source, provider):
                raise ValueError(f"Faker provider '{provider}' is not available")
            faker_callable = getattr(fake_source, provider)
            data = [faker_callable(**kwargs) for _ in range(self.n_rows)]

        self.df[name] = data
        return self

    @staticmethod
    def _ensure_numeric(series):
        """Return ``series`` as a float Series.

        Numeric series are cast to float. Any other dtype is replaced by the
        integer codes of its sorted unique values (via ``pd.factorize``), so
        the resulting numbers reflect sort order only.
        """
        if pd.api.types.is_numeric_dtype(series):
            return series.astype(float)
        codes, _ = pd.factorize(series, sort=True)
        return pd.Series(codes, index=series.index, dtype=float)