Dashboard Features: - 8 navigation sections: Overview, Outcomes, Poor CX, FCR, Churn, Agent, Call Explorer, Export - Beyond Brand Identity styling (colors #6D84E3, Outfit font) - RCA Sankey diagram (Driver → Outcome → Churn Risk flow) - Correlation heatmaps (driver co-occurrence, driver-outcome) - Outcome Deep Dive (root causes, correlation, duration analysis) - Export functionality (Excel, HTML, JSON) Blueprint Compliance: - FCR: 4 categories (Primera Llamada/Rellamada × Sin/Con Riesgo de Fuga) - Churn: Binary view (Sin Riesgo de Fuga / En Riesgo de Fuga) - Agent: Talento Para Replicar / Oportunidades de Mejora - Fixed FCR rate calculation (only FIRST_CALL counts as success) Technical: - Streamlit + Plotly for interactive visualizations - Light theme configuration (.streamlit/config.toml) - Fixed Plotly colorbar titlefont deprecation Documentation: - Updated PROJECT_CONTEXT.md, TODO.md, CHANGELOG.md - Added 4 new technical decisions (TD-014 to TD-017) - Created TROUBLESHOOTING.md with 10 common issues Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
350 lines
11 KiB
Python
350 lines
11 KiB
Python
"""
|
|
CXInsights - Aggregation Models
|
|
|
|
Data models for aggregated RCA analysis.
|
|
Transforms individual call analyses into actionable insights.
|
|
"""
|
|
|
|
from dataclasses import dataclass, field
|
|
from datetime import datetime
|
|
from enum import Enum
|
|
from typing import Literal
|
|
|
|
|
|
class ImpactLevel(str, Enum):
    """Impact level assigned to RCA drivers.

    Members are ordered from most to least severe; the comments give the
    call-rate thresholds used when classifying (see AggregationConfig).
    """

    # >10% of calls, high severity
    CRITICAL = "critical"
    # >5% of calls or high severity
    HIGH = "high"
    # >2% of calls
    MEDIUM = "medium"
    # <2% of calls
    LOW = "low"
|
|
|
|
|
|
class TrendDirection(str, Enum):
    """Direction of a metric's trend in time-series analysis.

    UNKNOWN covers the case where not enough data exists to call a trend.
    """

    INCREASING = "increasing"
    STABLE = "stable"
    DECREASING = "decreasing"
    UNKNOWN = "unknown"
|
|
|
|
|
|
# Closed set of driver category identifiers (taxonomy v2.0).
DriverCategory = Literal[
    "lost_sales",
    "poor_cx",
    "fcr_failure",
    "churn_risk",
    "agent_positive",
    "agent_improvement",
]
|
|
|
|
|
|
@dataclass
class DriverFrequency:
    """Frequency statistics for a single driver within one batch."""

    driver_code: str
    category: DriverCategory

    # Raw counts
    total_occurrences: int
    calls_affected: int
    total_calls_in_batch: int

    # Normalized rates, each a fraction of total calls in the batch
    occurrence_rate: float  # occurrences / total_calls
    call_rate: float  # calls_affected / total_calls

    # Confidence statistics over the underlying detections
    avg_confidence: float
    min_confidence: float
    max_confidence: float

    # Driver codes frequently observed alongside this one
    commonly_co_occurs_with: list[str] = field(default_factory=list)

    def __post_init__(self):
        """Reject rate values that fall outside [0, 1]."""
        # occurrence_rate is checked first, matching the declaration order.
        for attr in ("occurrence_rate", "call_rate"):
            value = getattr(self, attr)
            if not 0 <= value <= 1:
                raise ValueError(f"{attr} must be 0-1, got {value}")
|
|
|
|
|
|
@dataclass
class DriverSeverity:
    """Severity scoring for a single driver."""

    driver_code: str
    category: DriverCategory

    # Base severity taken from the taxonomy config, on a 0-1 scale
    base_severity: float

    # Computed factors that feed the final score
    frequency_factor: float  # higher frequency => higher impact
    confidence_factor: float  # higher confidence => more reliable
    co_occurrence_factor: float  # often seen with other issues => systemic

    # Final combined score on a 0-100 scale
    severity_score: float

    # Bucketed classification derived from the score
    impact_level: ImpactLevel

    def __post_init__(self):
        """Reject severity scores that fall outside [0, 100]."""
        score_ok = 0 <= self.severity_score <= 100
        if not score_ok:
            raise ValueError(f"severity_score must be 0-100, got {self.severity_score}")
|
|
|
|
|
|
@dataclass
class ConditionalProbability:
    """Conditional probability relationship between two drivers."""

    driver_a: str  # given this driver...
    driver_b: str  # ...probability of this driver
    category_a: DriverCategory
    category_b: DriverCategory

    # P(B|A): probability of B given A
    probability: float
    # Number of observed co-occurrences backing the estimate
    support: int

    # Lift = P(B|A) / P(B): how much A increases the likelihood of B
    lift: float

    def __post_init__(self):
        """Reject probabilities that fall outside [0, 1]."""
        if not 0 <= self.probability <= 1:
            raise ValueError(f"probability must be 0-1, got {self.probability}")
|
|
|
|
|
|
@dataclass
class RCANode:
    """A single driver node in the RCA tree."""

    driver_code: str
    category: DriverCategory

    # Aggregated statistics for this driver
    frequency: DriverFrequency
    severity: DriverSeverity

    # Tree hierarchy (children may themselves carry children)
    parent_code: str | None = None
    children: list["RCANode"] = field(default_factory=list)

    # Actionability
    recommended_actions: list[str] = field(default_factory=list)
    priority_rank: int = 0

    # Evidence summary snippets
    sample_evidence: list[str] = field(default_factory=list)

    def to_dict(self) -> dict:
        """Serialize this node and its subtree into a plain dict.

        Evidence is truncated to three samples to keep payloads small;
        children are serialized recursively.
        """
        frequency_part = {
            "total_occurrences": self.frequency.total_occurrences,
            "calls_affected": self.frequency.calls_affected,
            "occurrence_rate": self.frequency.occurrence_rate,
            "call_rate": self.frequency.call_rate,
        }
        severity_part = {
            "severity_score": self.severity.severity_score,
            "impact_level": self.severity.impact_level.value,
        }
        return {
            "driver_code": self.driver_code,
            "category": self.category,
            "frequency": frequency_part,
            "severity": severity_part,
            "priority_rank": self.priority_rank,
            "children": [child.to_dict() for child in self.children],
            "sample_evidence": self.sample_evidence[:3],
        }
|
|
|
|
|
|
@dataclass
class RCATree:
    """Complete RCA tree for a batch.

    Holds the per-category root driver nodes plus batch-level summary
    statistics and cross-category conditional-probability patterns.
    """

    batch_id: str
    # NOTE(review): datetime.utcnow is deprecated since Python 3.12; it is
    # kept here because switching to datetime.now(timezone.utc) would change
    # naive timestamps into aware ones — confirm callers before migrating.
    created_at: datetime = field(default_factory=datetime.utcnow)

    # Root nodes (top-level drivers) per category
    lost_sales_root: list[RCANode] = field(default_factory=list)
    poor_cx_root: list[RCANode] = field(default_factory=list)
    fcr_failure_root: list[RCANode] = field(default_factory=list)  # v2.0
    churn_risk_root: list[RCANode] = field(default_factory=list)  # v2.0

    # Summary stats
    total_calls: int = 0
    calls_with_lost_sales: int = 0
    calls_with_poor_cx: int = 0
    calls_with_both: int = 0

    # FCR stats (v2.0)
    calls_first_call: int = 0
    calls_repeat_call: int = 0
    repeat_call_rate: float = 0.0

    # Churn stats (v2.0)
    calls_at_risk: int = 0
    churn_risk_rate: float = 0.0

    # Agent stats (v2.0)
    agents_good_performer: int = 0
    agents_needs_improvement: int = 0
    agents_mixed: int = 0

    # Top drivers by impact
    top_lost_sales_drivers: list[str] = field(default_factory=list)
    top_poor_cx_drivers: list[str] = field(default_factory=list)
    top_fcr_failure_drivers: list[str] = field(default_factory=list)  # v2.0
    top_churn_risk_drivers: list[str] = field(default_factory=list)  # v2.0

    # Cross-category patterns
    conditional_probabilities: list[ConditionalProbability] = field(default_factory=list)

    def get_driver_by_code(self, code: str) -> RCANode | None:
        """Find a driver node by code anywhere in the tree.

        Fix: the previous implementation only inspected root nodes and
        their direct children, so drivers nested two or more levels deep
        (RCANode.children is recursive) could never be found. This version
        walks the whole tree in pre-order, which visits the previously
        reachable nodes in the same order as before.
        """

        def _search(nodes: list[RCANode]) -> RCANode | None:
            # Pre-order depth-first search: node first, then its subtree.
            for node in nodes:
                if node.driver_code == code:
                    return node
                found = _search(node.children)
                if found is not None:
                    return found
            return None

        return _search(
            self.lost_sales_root + self.poor_cx_root +
            self.fcr_failure_root + self.churn_risk_root
        )

    def to_dict(self) -> dict:
        """Convert to a plain dict for serialization.

        Rates derived here are guarded against division by zero; top-driver
        lists are capped at five entries per category.
        """
        return {
            "batch_id": self.batch_id,
            "created_at": self.created_at.isoformat(),
            "summary": {
                "total_calls": self.total_calls,
                "calls_with_lost_sales": self.calls_with_lost_sales,
                "calls_with_poor_cx": self.calls_with_poor_cx,
                "calls_with_both": self.calls_with_both,
                "lost_sales_rate": self.calls_with_lost_sales / self.total_calls if self.total_calls > 0 else 0,
                "poor_cx_rate": self.calls_with_poor_cx / self.total_calls if self.total_calls > 0 else 0,
                # v2.0 stats
                "calls_first_call": self.calls_first_call,
                "calls_repeat_call": self.calls_repeat_call,
                "repeat_call_rate": self.repeat_call_rate,
                "calls_at_risk": self.calls_at_risk,
                "churn_risk_rate": self.churn_risk_rate,
                "agents_good_performer": self.agents_good_performer,
                "agents_needs_improvement": self.agents_needs_improvement,
            },
            "top_drivers": {
                "lost_sales": self.top_lost_sales_drivers[:5],
                "poor_cx": self.top_poor_cx_drivers[:5],
                "fcr_failure": self.top_fcr_failure_drivers[:5],
                "churn_risk": self.top_churn_risk_drivers[:5],
            },
            "lost_sales_tree": [n.to_dict() for n in self.lost_sales_root],
            "poor_cx_tree": [n.to_dict() for n in self.poor_cx_root],
            "fcr_failure_tree": [n.to_dict() for n in self.fcr_failure_root],
            "churn_risk_tree": [n.to_dict() for n in self.churn_risk_root],
        }
|
|
|
|
|
|
@dataclass
class BatchAggregation:
    """Complete aggregation results for a batch."""

    batch_id: str
    # Naive UTC timestamp, consistent with the rest of this module.
    created_at: datetime = field(default_factory=datetime.utcnow)

    # Input stats
    total_calls_processed: int = 0
    successful_analyses: int = 0
    failed_analyses: int = 0

    # Driver frequencies, one list per category
    lost_sales_frequencies: list[DriverFrequency] = field(default_factory=list)
    poor_cx_frequencies: list[DriverFrequency] = field(default_factory=list)
    fcr_failure_frequencies: list[DriverFrequency] = field(default_factory=list)  # v2.0
    churn_risk_frequencies: list[DriverFrequency] = field(default_factory=list)  # v2.0
    agent_positive_frequencies: list[DriverFrequency] = field(default_factory=list)  # v2.0
    agent_improvement_frequencies: list[DriverFrequency] = field(default_factory=list)  # v2.0

    # Severity scores, one list per non-agent category
    lost_sales_severities: list[DriverSeverity] = field(default_factory=list)
    poor_cx_severities: list[DriverSeverity] = field(default_factory=list)
    fcr_failure_severities: list[DriverSeverity] = field(default_factory=list)  # v2.0
    churn_risk_severities: list[DriverSeverity] = field(default_factory=list)  # v2.0

    # RCA Tree
    rca_tree: RCATree | None = None

    # Emergent patterns (OTHER_EMERGENT analysis)
    emergent_patterns: list[dict] = field(default_factory=list)

    # v2.0 aggregate stats
    fcr_stats: dict = field(default_factory=dict)
    churn_stats: dict = field(default_factory=dict)
    agent_stats: dict = field(default_factory=dict)

    def get_top_drivers(
        self,
        category: DriverCategory,
        n: int = 5,
        by: Literal["frequency", "severity"] = "severity",
    ) -> list[str]:
        """Return the codes of the top *n* drivers in *category*.

        Drivers are ranked descending by occurrence rate when
        ``by="frequency"`` or by severity score when ``by="severity"``.
        NOTE(review): the agent_* categories have frequency lists only, so
        ``by="severity"`` yields [] for them — presumably intentional;
        confirm with callers.
        """
        if by == "frequency":
            pool = {
                "lost_sales": self.lost_sales_frequencies,
                "poor_cx": self.poor_cx_frequencies,
                "fcr_failure": self.fcr_failure_frequencies,
                "churn_risk": self.churn_risk_frequencies,
                "agent_positive": self.agent_positive_frequencies,
                "agent_improvement": self.agent_improvement_frequencies,
            }.get(category, [])
            rank_key = lambda entry: entry.occurrence_rate
        else:
            pool = {
                "lost_sales": self.lost_sales_severities,
                "poor_cx": self.poor_cx_severities,
                "fcr_failure": self.fcr_failure_severities,
                "churn_risk": self.churn_risk_severities,
            }.get(category, [])
            rank_key = lambda entry: entry.severity_score

        ranked = sorted(pool, key=rank_key, reverse=True)
        return [entry.driver_code for entry in ranked[:n]]
|
|
|
|
|
|
@dataclass
class AggregationConfig:
    """Tunable parameters for the aggregation pipeline."""

    # Weights combined into the severity score (defaults sum to 1.0)
    frequency_weight: float = 0.4
    confidence_weight: float = 0.3
    co_occurrence_weight: float = 0.3

    # Call-rate thresholds that bucket drivers into impact levels
    critical_threshold: float = 0.10  # >10% of calls
    high_threshold: float = 0.05  # >5% of calls
    medium_threshold: float = 0.02  # >2% of calls

    # Minimum co-occurrence count for a conditional probability to be kept
    min_support: int = 5

    # How many drivers to surface in reports
    top_n_drivers: int = 10

    # Whether OTHER_EMERGENT pattern analysis is included
    include_emergent: bool = True
|