feat: Add Streamlit dashboard with Blueprint compliance (v2.1.0)

Dashboard Features:
- 8 navigation sections: Overview, Outcomes, Poor CX, FCR, Churn, Agent, Call Explorer, Export
- Beyond Brand Identity styling (colors #6D84E3, Outfit font)
- RCA Sankey diagram (Driver → Outcome → Churn Risk flow)
- Correlation heatmaps (driver co-occurrence, driver-outcome)
- Outcome Deep Dive (root causes, correlation, duration analysis)
- Export functionality (Excel, HTML, JSON)

Blueprint Compliance:
- FCR: 4 categories (Primera Llamada/Rellamada × Sin/Con Riesgo de Fuga)
- Churn: Binary view (Sin Riesgo de Fuga / En Riesgo de Fuga)
- Agent: Talento Para Replicar / Oportunidades de Mejora
- Fixed FCR rate calculation (only FIRST_CALL counts as success)

Technical:
- Streamlit + Plotly for interactive visualizations
- Light theme configuration (.streamlit/config.toml)
- Fixed Plotly colorbar titlefont deprecation

Documentation:
- Updated PROJECT_CONTEXT.md, TODO.md, CHANGELOG.md
- Added 4 new technical decisions (TD-014 to TD-017)
- Created TROUBLESHOOTING.md with 10 common issues

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
sujucu70
2026-01-19 16:27:30 +01:00
commit 75e7b9da3d
110 changed files with 28247 additions and 0 deletions

View File

@@ -0,0 +1,47 @@
"""
CXInsights - Schema Definitions
Export all schema models from the current version.
"""
from config.schemas.call_analysis_v1 import (
SCHEMA_VERSION,
BatchManifest,
CallAnalysis,
CallOutcome,
CompressedTranscript,
DataSource,
Event,
EventType,
EvidenceSpan,
FailureReason,
ObservedFeatures,
ProcessingStatus,
RCALabel,
SpeakerTurn,
Traceability,
Transcript,
TranscriptMetadata,
TurnMetrics,
)
# Public API of the schemas package: names re-exported for explicit imports
# and `from config.schemas import *`. Keep in sync with the import list above.
__all__ = [
    "SCHEMA_VERSION",
    "DataSource",
    "ProcessingStatus",
    "FailureReason",
    "EventType",
    "CallOutcome",
    "Traceability",
    "SpeakerTurn",
    "TranscriptMetadata",
    "Transcript",
    "Event",
    "TurnMetrics",
    "ObservedFeatures",
    "EvidenceSpan",
    "RCALabel",
    "CallAnalysis",
    "CompressedTranscript",
    "BatchManifest",
]

View File

@@ -0,0 +1,416 @@
"""
CXInsights - Call Analysis Schema v1.0
Data contracts for the call analysis pipeline.
All outputs MUST include: schema_version, prompt_version, model_id
This schema defines:
- OBSERVED: Facts extracted from STT (deterministic)
- INFERRED: Conclusions from LLM (requires evidence)
"""
from datetime import datetime, timezone
from enum import Enum
from typing import Literal

from pydantic import BaseModel, Field, field_validator
# ============================================
# SCHEMA VERSION
# ============================================
SCHEMA_VERSION = "1.0.0"
# ============================================
# ENUMS
# ============================================
class DataSource(str, Enum):
    """Source of data - critical for audit trail.

    Mirrors the OBSERVED/INFERRED split described in the module docstring:
    observed values come deterministically from STT, inferred values come
    from an LLM and must carry supporting evidence.
    """
    OBSERVED = "observed" # From STT, deterministic
    INFERRED = "inferred" # From LLM, requires evidence
class ProcessingStatus(str, Enum):
    """Processing status for each call.

    When the status is not SUCCESS, ``CallAnalysis.failure_reason`` is
    expected to hold a ``FailureReason`` explaining why.
    """
    SUCCESS = "success"
    PARTIAL = "partial"
    FAILED = "failed"
class FailureReason(str, Enum):
    """Reasons for processing failure.

    Stored on ``CallAnalysis.failure_reason`` when a call did not process
    successfully. Values equal their names so they serialize to stable,
    grep-able strings.
    """
    LOW_AUDIO_QUALITY = "LOW_AUDIO_QUALITY"
    TRANSCRIPTION_FAILED = "TRANSCRIPTION_FAILED"
    LLM_PARSE_ERROR = "LLM_PARSE_ERROR"
    NO_EVIDENCE_FOUND = "NO_EVIDENCE_FOUND"
    SCHEMA_VALIDATION_ERROR = "SCHEMA_VALIDATION_ERROR"
    TIMEOUT = "TIMEOUT"
    RATE_LIMITED = "RATE_LIMITED"
    UNKNOWN = "UNKNOWN"  # fallback when no more specific reason applies
class EventType(str, Enum):
    """Observable events (detected without LLM).

    These are OBSERVED data: ``Event.source`` is pinned to "observed",
    so every event type here must be detectable deterministically.
    """
    HOLD_START = "HOLD_START"
    HOLD_END = "HOLD_END"
    TRANSFER = "TRANSFER"
    ESCALATION = "ESCALATION"
    SILENCE = "SILENCE"
    INTERRUPTION = "INTERRUPTION"
class CallOutcome(str, Enum):
    """Final outcome of the call.

    INFERRED data: assigned on ``CallAnalysis.outcome`` by the LLM.
    UNKNOWN is the fallback when no outcome can be determined.
    """
    SALE_COMPLETED = "SALE_COMPLETED"
    SALE_LOST = "SALE_LOST"
    CANCELLATION_SAVED = "CANCELLATION_SAVED"
    CANCELLATION_COMPLETED = "CANCELLATION_COMPLETED"
    INQUIRY_RESOLVED = "INQUIRY_RESOLVED"
    INQUIRY_UNRESOLVED = "INQUIRY_UNRESOLVED"
    COMPLAINT_RESOLVED = "COMPLAINT_RESOLVED"
    COMPLAINT_UNRESOLVED = "COMPLAINT_UNRESOLVED"
    TRANSFER_OUT = "TRANSFER_OUT"
    CALLBACK_SCHEDULED = "CALLBACK_SCHEDULED"
    UNKNOWN = "UNKNOWN"
# ============================================
# TRACEABILITY (Required on all outputs)
# ============================================
class Traceability(BaseModel):
    """Traceability metadata - REQUIRED on all analysis outputs.

    Captures schema, prompt, and model versions so every inference can be
    audited and reproduced later.
    """
    schema_version: str = Field(
        default=SCHEMA_VERSION,
        description="Version of this schema",
    )
    prompt_version: str = Field(
        description="Version of the prompt used for inference",
    )
    model_id: str = Field(
        description="Model identifier (e.g., gpt-4o-mini-2024-07-18)",
    )
    # datetime.utcnow is deprecated since Python 3.12 and returns a naive
    # datetime; use an explicit timezone-aware UTC timestamp instead.
    created_at: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc),
        description="Timestamp of analysis",
    )
# ============================================
# TRANSCRIPT MODELS (OBSERVED)
# ============================================
class SpeakerTurn(BaseModel):
    """Single speaker turn in transcript.

    OBSERVED data: produced directly by the STT provider, including the
    turn's time span and an optional per-turn confidence score.
    """
    speaker: str = Field(description="Speaker identifier (A, B, agent, customer)")
    text: str = Field(description="Transcribed text")
    # Times are offsets into the audio, in seconds.
    start_time: float = Field(description="Start time in seconds")
    end_time: float = Field(description="End time in seconds")
    # Bounded to [0, 1]; None when the provider supplies no score.
    confidence: float | None = Field(
        default=None,
        ge=0.0,
        le=1.0,
        description="STT confidence score",
    )
class TranscriptMetadata(BaseModel):
    """Metadata about the transcript.

    Records provenance (STT provider, job id) and basic audio facts so a
    transcript can be traced back to its source transcription job.
    """
    audio_duration_sec: float = Field(description="Total audio duration in seconds")
    language: str = Field(default="es", description="Detected language")
    provider: str = Field(description="STT provider (assemblyai, whisper, etc.)")
    job_id: str | None = Field(default=None, description="Provider job ID")
    # datetime.utcnow is deprecated since Python 3.12 and returns a naive
    # datetime; use an explicit timezone-aware UTC timestamp instead.
    created_at: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc),
        description="Timestamp of transcription",
    )
class Transcript(BaseModel):
    """Complete transcript with speaker diarization - OBSERVED data.

    Aggregates the per-speaker turns with provider metadata; ``full_text``
    is an optional concatenated form of the transcript.
    """
    call_id: str = Field(description="Unique call identifier")
    turns: list[SpeakerTurn] = Field(description="List of speaker turns")
    metadata: TranscriptMetadata = Field(description="Transcript metadata")
    full_text: str | None = Field(
        default=None,
        description="Full concatenated text (optional)",
    )
# ============================================
# EVENT MODELS (OBSERVED)
# ============================================
class Event(BaseModel):
    """Observable event detected without LLM - OBSERVED data.

    ``source`` is pinned to the literal "observed" so serialized events can
    never masquerade as inferred data.
    """
    event_type: EventType = Field(description="Type of event")
    start_time: float = Field(description="Event start time in seconds")
    # end_time/duration_sec stay None for events with no span
    # (presumably point-in-time events such as HOLD_START -- confirm).
    end_time: float | None = Field(
        default=None,
        description="Event end time in seconds (if applicable)",
    )
    duration_sec: float | None = Field(
        default=None,
        description="Event duration in seconds",
    )
    metadata: dict | None = Field(
        default=None,
        description="Additional event-specific data",
    )
    source: Literal["observed"] = Field(
        default="observed",
        description="Events are always observed, not inferred",
    )
# ============================================
# TURN METRICS (OBSERVED)
# ============================================
class TurnMetrics(BaseModel):
    """Metrics computed from transcript - OBSERVED data.

    All *_ratio fields are validated to the [0, 1] range; ``source`` is
    pinned to the literal "observed" for the audit trail.
    """
    total_turns: int = Field(description="Total number of turns")
    agent_turns: int = Field(description="Number of agent turns")
    customer_turns: int = Field(description="Number of customer turns")
    agent_talk_ratio: float = Field(
        ge=0.0,
        le=1.0,
        description="Ratio of agent talk time",
    )
    customer_talk_ratio: float = Field(
        ge=0.0,
        le=1.0,
        description="Ratio of customer talk time",
    )
    silence_ratio: float = Field(
        ge=0.0,
        le=1.0,
        description="Ratio of silence time",
    )
    interruption_count: int = Field(
        default=0,
        description="Number of detected interruptions",
    )
    avg_turn_duration_sec: float = Field(description="Average turn duration")
    source: Literal["observed"] = Field(
        default="observed",
        description="Metrics are always observed, not inferred",
    )
# ============================================
# OBSERVED FEATURES (Aggregated)
# ============================================
class ObservedFeatures(BaseModel):
    """All observed features for a call - deterministic, no LLM.

    Aggregates detected events and turn metrics plus convenience counters
    (holds, transfers, silences) for downstream analysis.
    """
    call_id: str = Field(description="Unique call identifier")
    events: list[Event] = Field(
        default_factory=list,
        description="Detected events",
    )
    turn_metrics: TurnMetrics = Field(description="Turn-based metrics")
    hold_count: int = Field(default=0, description="Number of hold events")
    total_hold_duration_sec: float = Field(
        default=0.0,
        description="Total hold duration",
    )
    transfer_count: int = Field(default=0, description="Number of transfers")
    silence_count: int = Field(
        default=0,
        description="Number of significant silences",
    )
    # datetime.utcnow is deprecated since Python 3.12 and returns a naive
    # datetime; use an explicit timezone-aware UTC timestamp instead.
    created_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
# ============================================
# EVIDENCE MODELS (For INFERRED data)
# ============================================
class EvidenceSpan(BaseModel):
    """Evidence from transcript supporting an inference.

    INFERRED labels must quote the transcript; the validator rejects
    blank quotes and strips surrounding whitespace from the rest.
    """
    text: str = Field(
        max_length=500,
        description="Quoted text from transcript",
    )
    start_time: float = Field(description="Start time in seconds")
    end_time: float = Field(description="End time in seconds")
    speaker: str | None = Field(
        default=None,
        description="Speaker of this evidence",
    )

    @field_validator("text")
    @classmethod
    def text_not_empty(cls, v: str) -> str:
        """Reject empty/whitespace-only evidence; return the stripped text."""
        # Strip once and reuse (the original called v.strip() twice).
        stripped = v.strip()
        if not stripped:
            raise ValueError("Evidence text cannot be empty")
        return stripped
# ============================================
# RCA LABELS (INFERRED)
# ============================================
class RCALabel(BaseModel):
    """Root Cause Analysis label - INFERRED data (requires evidence).

    ``source`` is pinned to the literal "inferred"; at least one
    ``EvidenceSpan`` is mandatory, enforced both by ``min_length=1`` and by
    the ``at_least_one_evidence`` validator below.
    """
    driver_code: str = Field(
        description="Driver code from taxonomy (e.g., PRICE_TOO_HIGH)",
    )
    confidence: float = Field(
        ge=0.0,
        le=1.0,
        description="Confidence score (0-1)",
    )
    evidence_spans: list[EvidenceSpan] = Field(
        min_length=1,
        description="Supporting evidence (minimum 1 required)",
    )
    reasoning: str | None = Field(
        default=None,
        max_length=500,
        description="Brief reasoning for this classification",
    )
    proposed_label: str | None = Field(
        default=None,
        description="For OTHER_EMERGENT: proposed new label",
    )
    source: Literal["inferred"] = Field(
        default="inferred",
        description="RCA labels are always inferred",
    )

    # NOTE(review): this validator duplicates the min_length=1 constraint on
    # evidence_spans; kept as belt-and-braces, but it could be removed.
    @field_validator("evidence_spans")
    @classmethod
    def at_least_one_evidence(cls, v: list[EvidenceSpan]) -> list[EvidenceSpan]:
        if len(v) < 1:
            raise ValueError("At least one evidence span is required")
        return v
# ============================================
# CALL ANALYSIS (Complete Output)
# ============================================
class CallAnalysis(BaseModel):
    """
    Complete analysis output for a single call.
    Combines:
    - OBSERVED: Features, events, metrics (from STT)
    - INFERRED: RCA labels, outcome (from LLM)
    MUST include traceability for audit.
    """
    # === Identifiers ===
    call_id: str = Field(description="Unique call identifier")
    batch_id: str = Field(description="Batch identifier")
    # === Processing Status ===
    status: ProcessingStatus = Field(description="Processing status")
    failure_reason: FailureReason | None = Field(
        default=None,
        description="Reason for failure (if status != success)",
    )
    # === OBSERVED Data ===
    observed: ObservedFeatures = Field(description="Observed features (deterministic)")
    # === INFERRED Data ===
    outcome: CallOutcome = Field(description="Call outcome (inferred)")
    lost_sales_drivers: list[RCALabel] = Field(
        default_factory=list,
        description="Lost sales RCA labels",
    )
    poor_cx_drivers: list[RCALabel] = Field(
        default_factory=list,
        description="Poor CX RCA labels",
    )
    # === Traceability (REQUIRED) ===
    traceability: Traceability = Field(description="Version and audit metadata")
    # === Timestamps ===
    # datetime.utcnow is deprecated since Python 3.12 and returns a naive
    # datetime; use an explicit timezone-aware UTC timestamp instead.
    created_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
# ============================================
# COMPRESSED TRANSCRIPT (For LLM Input)
# ============================================
class CompressedTranscript(BaseModel):
    """Compressed transcript for LLM inference - reduces token usage.

    Distills a full transcript into the fields relevant for RCA, and records
    before/after token counts so the compression can be monitored.
    """
    call_id: str = Field(description="Unique call identifier")
    customer_intent: str = Field(description="Summarized customer intent")
    agent_offers: list[str] = Field(
        default_factory=list,
        description="Key offers made by agent",
    )
    objections: list[str] = Field(
        default_factory=list,
        description="Customer objections",
    )
    resolution_statements: list[str] = Field(
        default_factory=list,
        description="Resolution statements",
    )
    key_exchanges: list[dict] = Field(
        default_factory=list,
        description="Key exchanges with timestamps",
    )
    original_token_count: int = Field(description="Tokens in original transcript")
    compressed_token_count: int = Field(description="Tokens after compression")
    # Validated to [0, 1]; presumably compressed/original -- confirm at caller.
    compression_ratio: float = Field(
        ge=0.0,
        le=1.0,
        description="Compression ratio achieved",
    )
# ============================================
# BATCH MANIFEST
# ============================================
class BatchManifest(BaseModel):
    """Manifest for a processing batch.

    Tracks per-batch progress counters alongside the version metadata
    required for audit.
    """
    batch_id: str = Field(description="Unique batch identifier")
    total_calls: int = Field(description="Total calls in batch")
    processed_calls: int = Field(default=0, description="Calls processed")
    success_count: int = Field(default=0, description="Successful processing")
    partial_count: int = Field(default=0, description="Partial processing")
    failed_count: int = Field(default=0, description="Failed processing")
    # NOTE(review): free-form string (default "pending"), unlike the per-call
    # ProcessingStatus enum -- consider an enum for batch status too.
    status: str = Field(default="pending", description="Batch status")
    started_at: datetime | None = Field(default=None)
    completed_at: datetime | None = Field(default=None)
    traceability: Traceability = Field(description="Version metadata")