Dashboard Features: - 8 navigation sections: Overview, Outcomes, Poor CX, FCR, Churn, Agent, Call Explorer, Export - Beyond Brand Identity styling (colors #6D84E3, Outfit font) - RCA Sankey diagram (Driver → Outcome → Churn Risk flow) - Correlation heatmaps (driver co-occurrence, driver-outcome) - Outcome Deep Dive (root causes, correlation, duration analysis) - Export functionality (Excel, HTML, JSON) Blueprint Compliance: - FCR: 4 categories (Primera Llamada/Rellamada × Sin/Con Riesgo de Fuga) - Churn: Binary view (Sin Riesgo de Fuga / En Riesgo de Fuga) - Agent: Talento Para Replicar / Oportunidades de Mejora - Fixed FCR rate calculation (only FIRST_CALL counts as success) Technical: - Streamlit + Plotly for interactive visualizations - Light theme configuration (.streamlit/config.toml) - Fixed Plotly colorbar titlefont deprecation Documentation: - Updated PROJECT_CONTEXT.md, TODO.md, CHANGELOG.md - Added 4 new technical decisions (TD-014 to TD-017) - Created TROUBLESHOOTING.md with 10 common issues Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
417 lines
13 KiB
Python
417 lines
13 KiB
Python
"""
|
|
CXInsights - Call Analysis Schema v1.0
|
|
|
|
Data contracts for the call analysis pipeline.
|
|
All outputs MUST include: schema_version, prompt_version, model_id
|
|
|
|
This schema defines:
|
|
- OBSERVED: Facts extracted from STT (deterministic)
|
|
- INFERRED: Conclusions from LLM (requires evidence)
|
|
"""
|
|
|
|
from datetime import datetime, timezone
from enum import Enum
from typing import Literal

from pydantic import BaseModel, Field, field_validator
|
|
# ============================================
# SCHEMA VERSION
# ============================================

# Version of the data contracts in this module. Stamped onto every
# analysis output via Traceability.schema_version for audit purposes.
SCHEMA_VERSION = "1.0.0"
|
|
|
|
|
|
# ============================================
|
|
# ENUMS
|
|
# ============================================
|
|
|
|
|
|
class DataSource(str, Enum):
    """Source of data - critical for audit trail"""

    # Derived deterministically from the STT transcript, no LLM involved.
    OBSERVED = "observed"  # From STT, deterministic
    # Produced by an LLM; must be backed by evidence spans (see RCALabel).
    INFERRED = "inferred"  # From LLM, requires evidence
|
|
|
|
|
|
class ProcessingStatus(str, Enum):
    """Processing status for each call"""

    SUCCESS = "success"  # Call processed end-to-end
    PARTIAL = "partial"  # presumably some stages completed, others failed — confirm with pipeline
    FAILED = "failed"    # Processing failed; see FailureReason
|
|
|
|
|
|
class FailureReason(str, Enum):
    """Reasons for processing failure.

    Attached to CallAnalysis.failure_reason when status != success.
    """

    LOW_AUDIO_QUALITY = "LOW_AUDIO_QUALITY"
    TRANSCRIPTION_FAILED = "TRANSCRIPTION_FAILED"
    LLM_PARSE_ERROR = "LLM_PARSE_ERROR"
    NO_EVIDENCE_FOUND = "NO_EVIDENCE_FOUND"
    SCHEMA_VALIDATION_ERROR = "SCHEMA_VALIDATION_ERROR"
    TIMEOUT = "TIMEOUT"
    RATE_LIMITED = "RATE_LIMITED"
    UNKNOWN = "UNKNOWN"  # Catch-all when no specific reason applies
|
|
|
|
|
|
class EventType(str, Enum):
    """Observable events (detected without LLM).

    Used by Event.event_type; these are OBSERVED facts, never inferred.
    """

    HOLD_START = "HOLD_START"
    HOLD_END = "HOLD_END"
    TRANSFER = "TRANSFER"
    ESCALATION = "ESCALATION"
    SILENCE = "SILENCE"
    INTERRUPTION = "INTERRUPTION"
|
|
|
|
|
|
class CallOutcome(str, Enum):
    """Final outcome of the call.

    INFERRED by the LLM (see CallAnalysis.outcome); values pair each
    intent (sale, cancellation, inquiry, complaint) with its resolution.
    """

    SALE_COMPLETED = "SALE_COMPLETED"
    SALE_LOST = "SALE_LOST"
    CANCELLATION_SAVED = "CANCELLATION_SAVED"
    CANCELLATION_COMPLETED = "CANCELLATION_COMPLETED"
    INQUIRY_RESOLVED = "INQUIRY_RESOLVED"
    INQUIRY_UNRESOLVED = "INQUIRY_UNRESOLVED"
    COMPLAINT_RESOLVED = "COMPLAINT_RESOLVED"
    COMPLAINT_UNRESOLVED = "COMPLAINT_UNRESOLVED"
    TRANSFER_OUT = "TRANSFER_OUT"
    CALLBACK_SCHEDULED = "CALLBACK_SCHEDULED"
    UNKNOWN = "UNKNOWN"  # Outcome could not be determined
|
|
|
|
|
|
# ============================================
|
|
# TRACEABILITY (Required on all outputs)
|
|
# ============================================
|
|
|
|
|
|
class Traceability(BaseModel):
    """Traceability metadata - REQUIRED on all analysis outputs.

    Records which schema, prompt, and model produced an output so any
    result can be audited and reproduced later.
    """

    schema_version: str = Field(
        default=SCHEMA_VERSION,
        description="Version of this schema",
    )
    prompt_version: str = Field(
        description="Version of the prompt used for inference",
    )
    model_id: str = Field(
        description="Model identifier (e.g., gpt-4o-mini-2024-07-18)",
    )
    created_at: datetime = Field(
        # datetime.utcnow() is deprecated since Python 3.12 and returns a
        # naive datetime; use an explicit timezone-aware UTC timestamp.
        # NOTE(review): timestamps are now tz-aware — confirm no downstream
        # code compares them against naive datetimes.
        default_factory=lambda: datetime.now(timezone.utc),
        description="Timestamp of analysis",
    )
|
|
|
|
|
|
# ============================================
|
|
# TRANSCRIPT MODELS (OBSERVED)
|
|
# ============================================
|
|
|
|
|
|
class SpeakerTurn(BaseModel):
    """Single speaker turn in transcript.

    OBSERVED data: one contiguous utterance from the STT diarization,
    with start/end offsets in seconds.
    """

    speaker: str = Field(description="Speaker identifier (A, B, agent, customer)")
    text: str = Field(description="Transcribed text")
    start_time: float = Field(description="Start time in seconds")
    end_time: float = Field(description="End time in seconds")
    # Optional because not every STT provider reports per-turn confidence.
    confidence: float | None = Field(
        default=None,
        ge=0.0,
        le=1.0,
        description="STT confidence score",
    )
|
|
|
|
|
|
class TranscriptMetadata(BaseModel):
    """Metadata about the transcript (provider, duration, language)."""

    audio_duration_sec: float = Field(description="Total audio duration in seconds")
    language: str = Field(default="es", description="Detected language")
    provider: str = Field(description="STT provider (assemblyai, whisper, etc.)")
    job_id: str | None = Field(default=None, description="Provider job ID")
    created_at: datetime = Field(
        # datetime.utcnow() is deprecated since Python 3.12 and returns a
        # naive datetime; use an explicit timezone-aware UTC timestamp.
        default_factory=lambda: datetime.now(timezone.utc),
        description="Timestamp of transcription",
    )
|
|
|
|
|
|
class Transcript(BaseModel):
    """Complete transcript with speaker diarization - OBSERVED data"""

    call_id: str = Field(description="Unique call identifier")
    turns: list[SpeakerTurn] = Field(description="List of speaker turns")
    metadata: TranscriptMetadata = Field(description="Transcript metadata")
    # Optional denormalized convenience field; the turns are authoritative.
    full_text: str | None = Field(
        default=None,
        description="Full concatenated text (optional)",
    )
|
|
|
|
|
|
# ============================================
|
|
# EVENT MODELS (OBSERVED)
|
|
# ============================================
|
|
|
|
|
|
class Event(BaseModel):
    """Observable event detected without LLM - OBSERVED data.

    Point events (e.g. TRANSFER) may leave end_time/duration_sec unset;
    interval events (e.g. holds) carry both.
    """

    event_type: EventType = Field(description="Type of event")
    start_time: float = Field(description="Event start time in seconds")
    end_time: float | None = Field(
        default=None,
        description="Event end time in seconds (if applicable)",
    )
    duration_sec: float | None = Field(
        default=None,
        description="Event duration in seconds",
    )
    metadata: dict | None = Field(
        default=None,
        description="Additional event-specific data",
    )
    # Literal pins the value so an Event can never claim to be inferred.
    source: Literal["observed"] = Field(
        default="observed",
        description="Events are always observed, not inferred",
    )
|
|
|
|
|
|
# ============================================
|
|
# TURN METRICS (OBSERVED)
|
|
# ============================================
|
|
|
|
|
|
class TurnMetrics(BaseModel):
    """Metrics computed from transcript - OBSERVED data.

    All ratios are constrained to [0, 1]. NOTE(review): whether the three
    ratios are expected to sum to 1.0 is not enforced here — confirm with
    the metric computation code.
    """

    total_turns: int = Field(description="Total number of turns")
    agent_turns: int = Field(description="Number of agent turns")
    customer_turns: int = Field(description="Number of customer turns")
    agent_talk_ratio: float = Field(
        ge=0.0,
        le=1.0,
        description="Ratio of agent talk time",
    )
    customer_talk_ratio: float = Field(
        ge=0.0,
        le=1.0,
        description="Ratio of customer talk time",
    )
    silence_ratio: float = Field(
        ge=0.0,
        le=1.0,
        description="Ratio of silence time",
    )
    interruption_count: int = Field(
        default=0,
        description="Number of detected interruptions",
    )
    avg_turn_duration_sec: float = Field(description="Average turn duration")
    # Literal pins the value so metrics can never claim to be inferred.
    source: Literal["observed"] = Field(
        default="observed",
        description="Metrics are always observed, not inferred",
    )
|
|
|
|
|
|
# ============================================
|
|
# OBSERVED FEATURES (Aggregated)
|
|
# ============================================
|
|
|
|
|
|
class ObservedFeatures(BaseModel):
    """All observed features for a call - deterministic, no LLM.

    Aggregates the detected events and turn metrics, plus convenience
    counters (holds, transfers, silences) derived from the event list.
    """

    call_id: str = Field(description="Unique call identifier")
    events: list[Event] = Field(
        default_factory=list,
        description="Detected events",
    )
    turn_metrics: TurnMetrics = Field(description="Turn-based metrics")
    hold_count: int = Field(default=0, description="Number of hold events")
    total_hold_duration_sec: float = Field(
        default=0.0,
        description="Total hold duration",
    )
    transfer_count: int = Field(default=0, description="Number of transfers")
    silence_count: int = Field(
        default=0,
        description="Number of significant silences",
    )
    # datetime.utcnow() is deprecated since Python 3.12 and returns a naive
    # datetime; use an explicit timezone-aware UTC timestamp.
    created_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
|
|
|
|
|
|
# ============================================
|
|
# EVIDENCE MODELS (For INFERRED data)
|
|
# ============================================
|
|
|
|
|
|
class EvidenceSpan(BaseModel):
    """Evidence from transcript supporting an inference.

    Every INFERRED label must carry at least one of these, quoting the
    transcript text with its time range.
    """

    text: str = Field(
        max_length=500,
        description="Quoted text from transcript",
    )
    start_time: float = Field(description="Start time in seconds")
    end_time: float = Field(description="End time in seconds")
    speaker: str | None = Field(
        default=None,
        description="Speaker of this evidence",
    )

    @field_validator("text")
    @classmethod
    def text_not_empty(cls, v: str) -> str:
        # Reject whitespace-only quotes and normalize by stripping,
        # so stored evidence is never blank.
        if not v.strip():
            raise ValueError("Evidence text cannot be empty")
        return v.strip()
|
|
|
|
|
|
# ============================================
|
|
# RCA LABELS (INFERRED)
|
|
# ============================================
|
|
|
|
|
|
class RCALabel(BaseModel):
    """Root Cause Analysis label - INFERRED data (requires evidence).

    The no-evidence-without-label invariant is enforced declaratively via
    ``min_length=1`` on ``evidence_spans``; Pydantic v2 raises a
    ValidationError for an empty list before any custom validator would
    run, so no extra validator is needed.
    """

    driver_code: str = Field(
        description="Driver code from taxonomy (e.g., PRICE_TOO_HIGH)",
    )
    confidence: float = Field(
        ge=0.0,
        le=1.0,
        description="Confidence score (0-1)",
    )
    evidence_spans: list[EvidenceSpan] = Field(
        min_length=1,
        description="Supporting evidence (minimum 1 required)",
    )
    reasoning: str | None = Field(
        default=None,
        max_length=500,
        description="Brief reasoning for this classification",
    )
    proposed_label: str | None = Field(
        default=None,
        description="For OTHER_EMERGENT: proposed new label",
    )
    # Literal pins the value so an RCA label can never claim to be observed.
    source: Literal["inferred"] = Field(
        default="inferred",
        description="RCA labels are always inferred",
    )
|
|
|
|
|
|
# ============================================
|
|
# CALL ANALYSIS (Complete Output)
|
|
# ============================================
|
|
|
|
|
|
class CallAnalysis(BaseModel):
    """
    Complete analysis output for a single call.

    Combines:
    - OBSERVED: Features, events, metrics (from STT)
    - INFERRED: RCA labels, outcome (from LLM)

    MUST include traceability for audit.
    """

    # === Identifiers ===
    call_id: str = Field(description="Unique call identifier")
    batch_id: str = Field(description="Batch identifier")

    # === Processing Status ===
    status: ProcessingStatus = Field(description="Processing status")
    failure_reason: FailureReason | None = Field(
        default=None,
        description="Reason for failure (if status != success)",
    )

    # === OBSERVED Data ===
    observed: ObservedFeatures = Field(description="Observed features (deterministic)")

    # === INFERRED Data ===
    outcome: CallOutcome = Field(description="Call outcome (inferred)")
    lost_sales_drivers: list[RCALabel] = Field(
        default_factory=list,
        description="Lost sales RCA labels",
    )
    poor_cx_drivers: list[RCALabel] = Field(
        default_factory=list,
        description="Poor CX RCA labels",
    )

    # === Traceability (REQUIRED) ===
    traceability: Traceability = Field(description="Version and audit metadata")

    # === Timestamps ===
    # datetime.utcnow() is deprecated since Python 3.12 and returns a naive
    # datetime; use an explicit timezone-aware UTC timestamp.
    created_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
|
|
|
|
|
|
# ============================================
|
|
# COMPRESSED TRANSCRIPT (For LLM Input)
|
|
# ============================================
|
|
|
|
|
|
class CompressedTranscript(BaseModel):
    """Compressed transcript for LLM inference - reduces token usage.

    A distilled view of a Transcript (intent, offers, objections,
    resolutions, key exchanges) with before/after token accounting.
    """

    call_id: str = Field(description="Unique call identifier")
    customer_intent: str = Field(description="Summarized customer intent")
    agent_offers: list[str] = Field(
        default_factory=list,
        description="Key offers made by agent",
    )
    objections: list[str] = Field(
        default_factory=list,
        description="Customer objections",
    )
    resolution_statements: list[str] = Field(
        default_factory=list,
        description="Resolution statements",
    )
    # Untyped dicts; the exact exchange structure is defined by the
    # compression step — presumably text + timestamps, verify at the caller.
    key_exchanges: list[dict] = Field(
        default_factory=list,
        description="Key exchanges with timestamps",
    )
    original_token_count: int = Field(description="Tokens in original transcript")
    compressed_token_count: int = Field(description="Tokens after compression")
    compression_ratio: float = Field(
        ge=0.0,
        le=1.0,
        description="Compression ratio achieved",
    )
|
|
|
|
|
|
# ============================================
|
|
# BATCH MANIFEST
|
|
# ============================================
|
|
|
|
|
|
class BatchManifest(BaseModel):
    """Manifest for a processing batch.

    Tracks progress counters and timestamps for a batch of calls.
    """

    batch_id: str = Field(description="Unique batch identifier")
    total_calls: int = Field(description="Total calls in batch")
    processed_calls: int = Field(default=0, description="Calls processed")
    success_count: int = Field(default=0, description="Successful processing")
    partial_count: int = Field(default=0, description="Partial processing")
    failed_count: int = Field(default=0, description="Failed processing")
    # NOTE(review): free-form str, unlike per-call ProcessingStatus —
    # confirm the allowed batch statuses and consider an enum.
    status: str = Field(default="pending", description="Batch status")
    started_at: datetime | None = Field(default=None)
    completed_at: datetime | None = Field(default=None)
    traceability: Traceability = Field(description="Version metadata")