"""
CXInsights - Call Analysis Schema v1.0

Data contracts for the call analysis pipeline.
All outputs MUST include: schema_version, prompt_version, model_id

This schema defines:
- OBSERVED: Facts extracted from STT (deterministic)
- INFERRED: Conclusions from LLM (requires evidence)
"""

from datetime import datetime, timezone
from enum import Enum
from typing import Literal

from pydantic import BaseModel, Field, field_validator

# ============================================
# SCHEMA VERSION
# ============================================

SCHEMA_VERSION = "1.0.0"


def _utc_now() -> datetime:
    """Return the current UTC time as a naive ``datetime``.

    Replacement for ``datetime.utcnow()``, which is deprecated since
    Python 3.12. The tzinfo is stripped on purpose so the value (and its
    serialized form) stays identical to what ``utcnow()`` produced.

    NOTE(review): moving to timezone-aware timestamps would change the
    serialized output; that should be done together with a schema
    version bump.
    """
    return datetime.now(timezone.utc).replace(tzinfo=None)


# ============================================
# ENUMS
# ============================================


class DataSource(str, Enum):
    """Source of data - critical for audit trail"""

    OBSERVED = "observed"  # From STT, deterministic
    INFERRED = "inferred"  # From LLM, requires evidence


class ProcessingStatus(str, Enum):
    """Processing status for each call"""

    SUCCESS = "success"
    PARTIAL = "partial"
    FAILED = "failed"


class FailureReason(str, Enum):
    """Reasons for processing failure"""

    LOW_AUDIO_QUALITY = "LOW_AUDIO_QUALITY"
    TRANSCRIPTION_FAILED = "TRANSCRIPTION_FAILED"
    LLM_PARSE_ERROR = "LLM_PARSE_ERROR"
    NO_EVIDENCE_FOUND = "NO_EVIDENCE_FOUND"
    SCHEMA_VALIDATION_ERROR = "SCHEMA_VALIDATION_ERROR"
    TIMEOUT = "TIMEOUT"
    RATE_LIMITED = "RATE_LIMITED"
    UNKNOWN = "UNKNOWN"


class EventType(str, Enum):
    """Observable events (detected without LLM)"""

    HOLD_START = "HOLD_START"
    HOLD_END = "HOLD_END"
    TRANSFER = "TRANSFER"
    ESCALATION = "ESCALATION"
    SILENCE = "SILENCE"
    INTERRUPTION = "INTERRUPTION"


class CallOutcome(str, Enum):
    """Final outcome of the call"""

    SALE_COMPLETED = "SALE_COMPLETED"
    SALE_LOST = "SALE_LOST"
    CANCELLATION_SAVED = "CANCELLATION_SAVED"
    CANCELLATION_COMPLETED = "CANCELLATION_COMPLETED"
    INQUIRY_RESOLVED = "INQUIRY_RESOLVED"
    INQUIRY_UNRESOLVED = "INQUIRY_UNRESOLVED"
    COMPLAINT_RESOLVED = "COMPLAINT_RESOLVED"
    COMPLAINT_UNRESOLVED = "COMPLAINT_UNRESOLVED"
    TRANSFER_OUT = "TRANSFER_OUT"
    CALLBACK_SCHEDULED = "CALLBACK_SCHEDULED"
    UNKNOWN = "UNKNOWN"


# ============================================
# TRACEABILITY (Required on all outputs)
# ============================================


class Traceability(BaseModel):
    """Traceability metadata - REQUIRED on all analysis outputs"""

    # ``model_id`` starts with pydantic's protected ``model_`` prefix, which
    # triggers a UserWarning at class creation on pydantic v2 releases that
    # protect that namespace. Clearing the protected namespaces silences the
    # warning without renaming the (contractual) field.
    model_config = {"protected_namespaces": ()}

    schema_version: str = Field(
        default=SCHEMA_VERSION,
        description="Version of this schema",
    )
    prompt_version: str = Field(
        description="Version of the prompt used for inference",
    )
    model_id: str = Field(
        description="Model identifier (e.g., gpt-4o-mini-2024-07-18)",
    )
    created_at: datetime = Field(
        default_factory=_utc_now,
        description="Timestamp of analysis",
    )


# ============================================
# TRANSCRIPT MODELS (OBSERVED)
# ============================================


class SpeakerTurn(BaseModel):
    """Single speaker turn in transcript"""

    speaker: str = Field(description="Speaker identifier (A, B, agent, customer)")
    text: str = Field(description="Transcribed text")
    start_time: float = Field(description="Start time in seconds")
    end_time: float = Field(description="End time in seconds")
    # Optional; when present it is constrained to the closed range [0, 1].
    confidence: float | None = Field(
        default=None,
        ge=0.0,
        le=1.0,
        description="STT confidence score",
    )


class TranscriptMetadata(BaseModel):
    """Metadata about the transcript"""

    audio_duration_sec: float = Field(description="Total audio duration in seconds")
    language: str = Field(default="es", description="Detected language")
    provider: str = Field(description="STT provider (assemblyai, whisper, etc.)")
    job_id: str | None = Field(default=None, description="Provider job ID")
    created_at: datetime = Field(
        default_factory=_utc_now,
        description="Timestamp of transcription",
    )


class Transcript(BaseModel):
    """Complete transcript with speaker diarization - OBSERVED data"""

    call_id: str = Field(description="Unique call identifier")
    turns: list[SpeakerTurn] = Field(description="List of speaker turns")
    metadata: TranscriptMetadata = Field(description="Transcript metadata")
    full_text: str | None = Field(
        default=None,
        description="Full concatenated text (optional)",
    )


# ============================================
# EVENT MODELS (OBSERVED)
# ============================================


class Event(BaseModel):
    """Observable event detected without LLM - OBSERVED data"""

    event_type: EventType = Field(description="Type of event")
    start_time: float = Field(description="Event start time in seconds")
    end_time: float | None = Field(
        default=None,
        description="Event end time in seconds (if applicable)",
    )
    duration_sec: float | None = Field(
        default=None,
        description="Event duration in seconds",
    )
    metadata: dict | None = Field(
        default=None,
        description="Additional event-specific data",
    )
    # Literal type pins the value: only "observed" validates.
    source: Literal["observed"] = Field(
        default="observed",
        description="Events are always observed, not inferred",
    )


# ============================================
# TURN METRICS (OBSERVED)
# ============================================


class TurnMetrics(BaseModel):
    """Metrics computed from transcript - OBSERVED data"""

    total_turns: int = Field(description="Total number of turns")
    agent_turns: int = Field(description="Number of agent turns")
    customer_turns: int = Field(description="Number of customer turns")
    # Each ratio is individually constrained to [0, 1]; the schema does not
    # check that the three ratios sum to 1.
    agent_talk_ratio: float = Field(
        ge=0.0,
        le=1.0,
        description="Ratio of agent talk time",
    )
    customer_talk_ratio: float = Field(
        ge=0.0,
        le=1.0,
        description="Ratio of customer talk time",
    )
    silence_ratio: float = Field(
        ge=0.0,
        le=1.0,
        description="Ratio of silence time",
    )
    interruption_count: int = Field(
        default=0,
        description="Number of detected interruptions",
    )
    avg_turn_duration_sec: float = Field(description="Average turn duration")
    # Literal type pins the value: only "observed" validates.
    source: Literal["observed"] = Field(
        default="observed",
        description="Metrics are always observed, not inferred",
    )


# ============================================
# OBSERVED FEATURES (Aggregated)
# ============================================


class ObservedFeatures(BaseModel):
    """All observed features for a call - deterministic, no LLM"""

    call_id: str = Field(description="Unique call identifier")
    events: list[Event] = Field(
        default_factory=list,
        description="Detected events",
    )
    turn_metrics: TurnMetrics = Field(description="Turn-based metrics")
    hold_count: int = Field(default=0, description="Number of hold events")
    total_hold_duration_sec: float = Field(
        default=0.0,
        description="Total hold duration",
    )
    transfer_count: int = Field(default=0, description="Number of transfers")
    silence_count: int = Field(
        default=0,
        description="Number of significant silences",
    )
    created_at: datetime = Field(default_factory=_utc_now)


# ============================================
# EVIDENCE MODELS (For INFERRED data)
# ============================================


class EvidenceSpan(BaseModel):
    """Evidence from transcript supporting an inference"""

    text: str = Field(
        max_length=500,
        description="Quoted text from transcript",
    )
    start_time: float = Field(description="Start time in seconds")
    end_time: float = Field(description="End time in seconds")
    speaker: str | None = Field(
        default=None,
        description="Speaker of this evidence",
    )

    @field_validator("text")
    @classmethod
    def text_not_empty(cls, v: str) -> str:
        """Reject blank evidence text and return it whitespace-stripped.

        Raises:
            ValueError: if ``v`` is empty or contains only whitespace.
        """
        stripped = v.strip()
        if not stripped:
            raise ValueError("Evidence text cannot be empty")
        return stripped


# ============================================
# RCA LABELS (INFERRED)
# ============================================


class RCALabel(BaseModel):
    """Root Cause Analysis label - INFERRED data (requires evidence)"""

    driver_code: str = Field(
        description="Driver code from taxonomy (e.g., PRICE_TOO_HIGH)",
    )
    confidence: float = Field(
        ge=0.0,
        le=1.0,
        description="Confidence score (0-1)",
    )
    evidence_spans: list[EvidenceSpan] = Field(
        min_length=1,
        description="Supporting evidence (minimum 1 required)",
    )
    reasoning: str | None = Field(
        default=None,
        max_length=500,
        description="Brief reasoning for this classification",
    )
    proposed_label: str | None = Field(
        default=None,
        description="For OTHER_EMERGENT: proposed new label",
    )
    # Literal type pins the value: only "inferred" validates.
    source: Literal["inferred"] = Field(
        default="inferred",
        description="RCA labels are always inferred",
    )

    @field_validator("evidence_spans")
    @classmethod
    def at_least_one_evidence(cls, v: list[EvidenceSpan]) -> list[EvidenceSpan]:
        """Ensure the label carries at least one evidence span.

        Defensive check kept alongside the ``min_length=1`` constraint
        declared on the field.

        Raises:
            ValueError: if ``v`` is empty.
        """
        if not v:
            raise ValueError("At least one evidence span is required")
        return v


# ============================================
# CALL ANALYSIS (Complete Output)
# ============================================


class CallAnalysis(BaseModel):
    """
    Complete analysis output for a single call.

    Combines:
    - OBSERVED: Features, events, metrics (from STT)
    - INFERRED: RCA labels, outcome (from LLM)

    MUST include traceability for audit.
    """

    # === Identifiers ===
    call_id: str = Field(description="Unique call identifier")
    batch_id: str = Field(description="Batch identifier")

    # === Processing Status ===
    status: ProcessingStatus = Field(description="Processing status")
    # The schema does not enforce that this is set when status != success.
    failure_reason: FailureReason | None = Field(
        default=None,
        description="Reason for failure (if status != success)",
    )

    # === OBSERVED Data ===
    observed: ObservedFeatures = Field(description="Observed features (deterministic)")

    # === INFERRED Data ===
    outcome: CallOutcome = Field(description="Call outcome (inferred)")
    lost_sales_drivers: list[RCALabel] = Field(
        default_factory=list,
        description="Lost sales RCA labels",
    )
    poor_cx_drivers: list[RCALabel] = Field(
        default_factory=list,
        description="Poor CX RCA labels",
    )

    # === Traceability (REQUIRED) ===
    traceability: Traceability = Field(description="Version and audit metadata")

    # === Timestamps ===
    created_at: datetime = Field(default_factory=_utc_now)


# ============================================
# COMPRESSED TRANSCRIPT (For LLM Input)
# ============================================


class CompressedTranscript(BaseModel):
    """Compressed transcript for LLM inference - reduces token usage"""

    call_id: str = Field(description="Unique call identifier")
    customer_intent: str = Field(description="Summarized customer intent")
    agent_offers: list[str] = Field(
        default_factory=list,
        description="Key offers made by agent",
    )
    objections: list[str] = Field(
        default_factory=list,
        description="Customer objections",
    )
    resolution_statements: list[str] = Field(
        default_factory=list,
        description="Resolution statements",
    )
    # Free-form dicts; their keys are not constrained by this schema.
    key_exchanges: list[dict] = Field(
        default_factory=list,
        description="Key exchanges with timestamps",
    )
    original_token_count: int = Field(description="Tokens in original transcript")
    compressed_token_count: int = Field(description="Tokens after compression")
    # Constrained to [0, 1]; not cross-checked against the two token counts.
    compression_ratio: float = Field(
        ge=0.0,
        le=1.0,
        description="Compression ratio achieved",
    )


# ============================================
# BATCH MANIFEST
# ============================================


class BatchManifest(BaseModel):
    """Manifest for a processing batch"""

    batch_id: str = Field(description="Unique batch identifier")
    total_calls: int = Field(description="Total calls in batch")
    processed_calls: int = Field(default=0, description="Calls processed")
    success_count: int = Field(default=0, description="Successful processing")
    partial_count: int = Field(default=0, description="Partial processing")
    failed_count: int = Field(default=0, description="Failed processing")
    # Plain string; allowed values are not restricted by this schema.
    status: str = Field(default="pending", description="Batch status")
    started_at: datetime | None = Field(default=None)
    completed_at: datetime | None = Field(default=None)
    traceability: Traceability = Field(description="Version metadata")