feat: Add Streamlit dashboard with Blueprint compliance (v2.1.0)
Dashboard Features: - 8 navigation sections: Overview, Outcomes, Poor CX, FCR, Churn, Agent, Call Explorer, Export - Beyond Brand Identity styling (colors #6D84E3, Outfit font) - RCA Sankey diagram (Driver → Outcome → Churn Risk flow) - Correlation heatmaps (driver co-occurrence, driver-outcome) - Outcome Deep Dive (root causes, correlation, duration analysis) - Export functionality (Excel, HTML, JSON) Blueprint Compliance: - FCR: 4 categories (Primera Llamada/Rellamada × Sin/Con Riesgo de Fuga) - Churn: Binary view (Sin Riesgo de Fuga / En Riesgo de Fuga) - Agent: Talento Para Replicar / Oportunidades de Mejora - Fixed FCR rate calculation (only FIRST_CALL counts as success) Technical: - Streamlit + Plotly for interactive visualizations - Light theme configuration (.streamlit/config.toml) - Fixed Plotly colorbar titlefont deprecation Documentation: - Updated PROJECT_CONTEXT.md, TODO.md, CHANGELOG.md - Added 4 new technical decisions (TD-014 to TD-017) - Created TROUBLESHOOTING.md with 10 common issues Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
416
config/schemas/call_analysis_v1.py
Normal file
416
config/schemas/call_analysis_v1.py
Normal file
@@ -0,0 +1,416 @@
|
||||
"""
|
||||
CXInsights - Call Analysis Schema v1.0
|
||||
|
||||
Data contracts for the call analysis pipeline.
|
||||
All outputs MUST include: schema_version, prompt_version, model_id
|
||||
|
||||
This schema defines:
|
||||
- OBSERVED: Facts extracted from STT (deterministic)
|
||||
- INFERRED: Conclusions from LLM (requires evidence)
|
||||
"""
|
||||
|
||||
from datetime import datetime, timezone
from enum import Enum
from typing import Literal

from pydantic import BaseModel, Field, field_validator
|
||||
|
||||
|
||||
# ============================================
# SCHEMA VERSION
# ============================================

# Semantic version of the data contracts in this module. Stamped onto every
# analysis output via Traceability.schema_version so downstream consumers can
# detect contract changes. Bump on any backward-incompatible field change.
SCHEMA_VERSION = "1.0.0"
|
||||
|
||||
|
||||
# ============================================
|
||||
# ENUMS
|
||||
# ============================================
|
||||
|
||||
|
||||
class DataSource(str, Enum):
    """Source of data - critical for audit trail"""

    # Deterministic facts extracted straight from the STT transcript.
    OBSERVED = "observed"
    # Conclusions produced by the LLM; must be backed by evidence spans.
    INFERRED = "inferred"
|
||||
|
||||
|
||||
class ProcessingStatus(str, Enum):
    """Processing status for each call"""

    SUCCESS = "success"  # fully analyzed
    PARTIAL = "partial"  # some stages completed, others failed
    FAILED = "failed"    # no usable output produced
|
||||
|
||||
|
||||
class FailureReason(str, Enum):
    """Reasons for processing failure"""

    # Input / transcription problems
    LOW_AUDIO_QUALITY = "LOW_AUDIO_QUALITY"
    TRANSCRIPTION_FAILED = "TRANSCRIPTION_FAILED"
    # LLM / validation problems
    LLM_PARSE_ERROR = "LLM_PARSE_ERROR"
    NO_EVIDENCE_FOUND = "NO_EVIDENCE_FOUND"
    SCHEMA_VALIDATION_ERROR = "SCHEMA_VALIDATION_ERROR"
    # Infrastructure problems
    TIMEOUT = "TIMEOUT"
    RATE_LIMITED = "RATE_LIMITED"
    # Catch-all
    UNKNOWN = "UNKNOWN"
|
||||
|
||||
|
||||
class EventType(str, Enum):
    """Observable events (detected without LLM)"""

    HOLD_START = "HOLD_START"      # customer placed on hold
    HOLD_END = "HOLD_END"          # hold released
    TRANSFER = "TRANSFER"          # call moved to another queue/agent
    ESCALATION = "ESCALATION"      # raised to a supervisor
    SILENCE = "SILENCE"            # significant dead air
    INTERRUPTION = "INTERRUPTION"  # overlapping speech
|
||||
|
||||
|
||||
class CallOutcome(str, Enum):
    """Final outcome of the call"""

    # Sales
    SALE_COMPLETED = "SALE_COMPLETED"
    SALE_LOST = "SALE_LOST"
    # Retention
    CANCELLATION_SAVED = "CANCELLATION_SAVED"
    CANCELLATION_COMPLETED = "CANCELLATION_COMPLETED"
    # Inquiries
    INQUIRY_RESOLVED = "INQUIRY_RESOLVED"
    INQUIRY_UNRESOLVED = "INQUIRY_UNRESOLVED"
    # Complaints
    COMPLAINT_RESOLVED = "COMPLAINT_RESOLVED"
    COMPLAINT_UNRESOLVED = "COMPLAINT_UNRESOLVED"
    # Handoffs / follow-ups
    TRANSFER_OUT = "TRANSFER_OUT"
    CALLBACK_SCHEDULED = "CALLBACK_SCHEDULED"
    # Catch-all
    UNKNOWN = "UNKNOWN"
|
||||
|
||||
|
||||
# ============================================
|
||||
# TRACEABILITY (Required on all outputs)
|
||||
# ============================================
|
||||
|
||||
|
||||
class Traceability(BaseModel):
    """Traceability metadata - REQUIRED on all analysis outputs.

    Records exactly which schema, prompt, and model produced a result so
    any output can be audited and reproduced.
    """

    # Pinned at import time from this module's SCHEMA_VERSION constant.
    schema_version: str = Field(
        default=SCHEMA_VERSION,
        description="Version of this schema",
    )
    prompt_version: str = Field(
        description="Version of the prompt used for inference",
    )
    model_id: str = Field(
        description="Model identifier (e.g., gpt-4o-mini-2024-07-18)",
    )
    # FIX: datetime.utcnow() is deprecated (Python 3.12+) and returns a
    # *naive* datetime mislabeled as UTC; use an aware UTC timestamp.
    created_at: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc),
        description="Timestamp of analysis",
    )
|
||||
|
||||
|
||||
# ============================================
|
||||
# TRANSCRIPT MODELS (OBSERVED)
|
||||
# ============================================
|
||||
|
||||
|
||||
class SpeakerTurn(BaseModel):
    """Single speaker turn in transcript.

    OBSERVED data: one contiguous utterance by one speaker, with
    timing taken from the STT provider.
    """

    speaker: str = Field(description="Speaker identifier (A, B, agent, customer)")
    text: str = Field(description="Transcribed text")
    start_time: float = Field(description="Start time in seconds")
    end_time: float = Field(description="End time in seconds")
    # Optional per-turn STT confidence, constrained to [0, 1].
    confidence: float | None = Field(
        default=None, ge=0.0, le=1.0, description="STT confidence score"
    )
|
||||
|
||||
|
||||
class TranscriptMetadata(BaseModel):
    """Metadata about the transcript.

    Captures provenance of the STT run (provider, job, language) plus
    basic audio facts needed downstream.
    """

    audio_duration_sec: float = Field(description="Total audio duration in seconds")
    language: str = Field(default="es", description="Detected language")
    provider: str = Field(description="STT provider (assemblyai, whisper, etc.)")
    job_id: str | None = Field(default=None, description="Provider job ID")
    # FIX: datetime.utcnow() is deprecated (Python 3.12+) and returns a
    # naive datetime; record an aware UTC timestamp instead.
    created_at: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc),
        description="Timestamp of transcription",
    )
|
||||
|
||||
|
||||
class Transcript(BaseModel):
    """Complete transcript with speaker diarization - OBSERVED data"""

    call_id: str = Field(description="Unique call identifier")
    # Ordered speaker turns as returned by diarization.
    turns: list[SpeakerTurn] = Field(description="List of speaker turns")
    metadata: TranscriptMetadata = Field(description="Transcript metadata")
    # Convenience field; may be omitted and rebuilt from `turns`.
    full_text: str | None = Field(
        default=None, description="Full concatenated text (optional)"
    )
|
||||
|
||||
|
||||
# ============================================
|
||||
# EVENT MODELS (OBSERVED)
|
||||
# ============================================
|
||||
|
||||
|
||||
class Event(BaseModel):
    """Observable event detected without LLM - OBSERVED data"""

    event_type: EventType = Field(description="Type of event")
    start_time: float = Field(description="Event start time in seconds")
    # Point events (e.g. TRANSFER) have no end time or duration.
    end_time: float | None = Field(
        default=None, description="Event end time in seconds (if applicable)"
    )
    duration_sec: float | None = Field(
        default=None, description="Event duration in seconds"
    )
    metadata: dict | None = Field(
        default=None, description="Additional event-specific data"
    )
    # Literal pins the audit-trail tag: events can never be marked inferred.
    source: Literal["observed"] = Field(
        default="observed",
        description="Events are always observed, not inferred",
    )
|
||||
|
||||
|
||||
# ============================================
|
||||
# TURN METRICS (OBSERVED)
|
||||
# ============================================
|
||||
|
||||
|
||||
class TurnMetrics(BaseModel):
    """Metrics computed from transcript - OBSERVED data"""

    total_turns: int = Field(description="Total number of turns")
    agent_turns: int = Field(description="Number of agent turns")
    customer_turns: int = Field(description="Number of customer turns")
    # Talk/silence ratios are each constrained to [0, 1].
    agent_talk_ratio: float = Field(
        ge=0.0, le=1.0, description="Ratio of agent talk time"
    )
    customer_talk_ratio: float = Field(
        ge=0.0, le=1.0, description="Ratio of customer talk time"
    )
    silence_ratio: float = Field(
        ge=0.0, le=1.0, description="Ratio of silence time"
    )
    interruption_count: int = Field(
        default=0, description="Number of detected interruptions"
    )
    avg_turn_duration_sec: float = Field(description="Average turn duration")
    # Literal pins the audit-trail tag: metrics can never be marked inferred.
    source: Literal["observed"] = Field(
        default="observed",
        description="Metrics are always observed, not inferred",
    )
|
||||
|
||||
|
||||
# ============================================
|
||||
# OBSERVED FEATURES (Aggregated)
|
||||
# ============================================
|
||||
|
||||
|
||||
class ObservedFeatures(BaseModel):
    """All observed features for a call - deterministic, no LLM.

    Aggregates detected events and turn metrics into the per-call
    OBSERVED bundle consumed by CallAnalysis.
    """

    call_id: str = Field(description="Unique call identifier")
    events: list[Event] = Field(
        default_factory=list,
        description="Detected events",
    )
    turn_metrics: TurnMetrics = Field(description="Turn-based metrics")
    # Aggregate counters derived from `events`.
    hold_count: int = Field(default=0, description="Number of hold events")
    total_hold_duration_sec: float = Field(
        default=0.0,
        description="Total hold duration",
    )
    transfer_count: int = Field(default=0, description="Number of transfers")
    silence_count: int = Field(
        default=0,
        description="Number of significant silences",
    )
    # FIX: datetime.utcnow() is deprecated (Python 3.12+) and returns a
    # naive datetime; record an aware UTC timestamp instead.
    created_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
|
||||
|
||||
|
||||
# ============================================
|
||||
# EVIDENCE MODELS (For INFERRED data)
|
||||
# ============================================
|
||||
|
||||
|
||||
class EvidenceSpan(BaseModel):
    """Evidence from transcript supporting an inference.

    Every INFERRED conclusion must point at one or more of these spans;
    the quoted text is validated to be non-blank and is normalized by
    stripping surrounding whitespace.
    """

    text: str = Field(
        max_length=500,
        description="Quoted text from transcript",
    )
    start_time: float = Field(description="Start time in seconds")
    end_time: float = Field(description="End time in seconds")
    speaker: str | None = Field(
        default=None,
        description="Speaker of this evidence",
    )

    @field_validator("text")
    @classmethod
    def text_not_empty(cls, v: str) -> str:
        # Reject whitespace-only quotes and normalize the kept text.
        stripped = v.strip()
        if stripped:
            return stripped
        raise ValueError("Evidence text cannot be empty")
|
||||
|
||||
|
||||
# ============================================
|
||||
# RCA LABELS (INFERRED)
|
||||
# ============================================
|
||||
|
||||
|
||||
class RCALabel(BaseModel):
    """Root Cause Analysis label - INFERRED data (requires evidence).

    A single driver classification with its confidence and the transcript
    spans that justify it. The evidence requirement is enforced by the
    `min_length=1` constraint on `evidence_spans`.
    """

    driver_code: str = Field(
        description="Driver code from taxonomy (e.g., PRICE_TOO_HIGH)",
    )
    confidence: float = Field(
        ge=0.0,
        le=1.0,
        description="Confidence score (0-1)",
    )
    # min_length=1 rejects empty lists during core validation, so no
    # extra field_validator is needed. (The previous at_least_one_evidence
    # validator was dead code: field validators run only after the
    # min_length constraint has already passed.)
    evidence_spans: list[EvidenceSpan] = Field(
        min_length=1,
        description="Supporting evidence (minimum 1 required)",
    )
    reasoning: str | None = Field(
        default=None,
        max_length=500,
        description="Brief reasoning for this classification",
    )
    proposed_label: str | None = Field(
        default=None,
        description="For OTHER_EMERGENT: proposed new label",
    )
    # Literal pins the audit-trail tag: RCA labels can never be marked observed.
    source: Literal["inferred"] = Field(
        default="inferred",
        description="RCA labels are always inferred",
    )
|
||||
|
||||
|
||||
# ============================================
|
||||
# CALL ANALYSIS (Complete Output)
|
||||
# ============================================
|
||||
|
||||
|
||||
class CallAnalysis(BaseModel):
    """
    Complete analysis output for a single call.

    Combines:
    - OBSERVED: Features, events, metrics (from STT)
    - INFERRED: RCA labels, outcome (from LLM)

    MUST include traceability for audit.
    """

    # === Identifiers ===
    call_id: str = Field(description="Unique call identifier")
    batch_id: str = Field(description="Batch identifier")

    # === Processing Status ===
    status: ProcessingStatus = Field(description="Processing status")
    failure_reason: FailureReason | None = Field(
        default=None,
        description="Reason for failure (if status != success)",
    )

    # === OBSERVED Data ===
    observed: ObservedFeatures = Field(description="Observed features (deterministic)")

    # === INFERRED Data ===
    outcome: CallOutcome = Field(description="Call outcome (inferred)")
    lost_sales_drivers: list[RCALabel] = Field(
        default_factory=list,
        description="Lost sales RCA labels",
    )
    poor_cx_drivers: list[RCALabel] = Field(
        default_factory=list,
        description="Poor CX RCA labels",
    )

    # === Traceability (REQUIRED) ===
    traceability: Traceability = Field(description="Version and audit metadata")

    # === Timestamps ===
    # FIX: datetime.utcnow() is deprecated (Python 3.12+) and returns a
    # naive datetime; record an aware UTC timestamp instead.
    created_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
|
||||
|
||||
|
||||
# ============================================
|
||||
# COMPRESSED TRANSCRIPT (For LLM Input)
|
||||
# ============================================
|
||||
|
||||
|
||||
class CompressedTranscript(BaseModel):
    """Compressed transcript for LLM inference - reduces token usage.

    A distilled view of the call (intent, offers, objections, resolutions)
    plus bookkeeping on how much compression was achieved.
    """

    call_id: str = Field(description="Unique call identifier")
    customer_intent: str = Field(description="Summarized customer intent")
    agent_offers: list[str] = Field(
        default_factory=list, description="Key offers made by agent"
    )
    objections: list[str] = Field(
        default_factory=list, description="Customer objections"
    )
    resolution_statements: list[str] = Field(
        default_factory=list, description="Resolution statements"
    )
    key_exchanges: list[dict] = Field(
        default_factory=list, description="Key exchanges with timestamps"
    )
    # Token accounting for compression effectiveness.
    original_token_count: int = Field(description="Tokens in original transcript")
    compressed_token_count: int = Field(description="Tokens after compression")
    compression_ratio: float = Field(
        ge=0.0, le=1.0, description="Compression ratio achieved"
    )
|
||||
|
||||
|
||||
# ============================================
|
||||
# BATCH MANIFEST
|
||||
# ============================================
|
||||
|
||||
|
||||
class BatchManifest(BaseModel):
    """Manifest for a processing batch.

    Tracks per-batch progress counters and lifecycle timestamps; counters
    all start at zero and are updated as calls are processed.
    """

    batch_id: str = Field(description="Unique batch identifier")
    total_calls: int = Field(description="Total calls in batch")
    # Progress counters (processed = success + partial + failed).
    processed_calls: int = Field(default=0, description="Calls processed")
    success_count: int = Field(default=0, description="Successful processing")
    partial_count: int = Field(default=0, description="Partial processing")
    failed_count: int = Field(default=0, description="Failed processing")
    status: str = Field(default="pending", description="Batch status")
    # Lifecycle timestamps; None until the batch actually starts/finishes.
    started_at: datetime | None = Field(default=None)
    completed_at: datetime | None = Field(default=None)
    traceability: Traceability = Field(description="Version metadata")
|
||||
Reference in New Issue
Block a user