{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# 04 - Aggregation & RCA Trees Validation\n", "\n", "**Checkpoint 7 validation notebook**\n", "\n", "This notebook validates the aggregation module:\n", "1. Frequency statistics calculation\n", "2. Conditional probability analysis\n", "3. Severity scoring with explicit rules\n", "4. RCA tree building and prioritization" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import sys\n", "sys.path.insert(0, '..')\n", "\n", "import json\n", "from datetime import datetime\n", "\n", "# Project imports\n", "from src.aggregation import (\n", " AggregationConfig,\n", " BatchAggregation,\n", " RCATree,\n", " RCATreeBuilder,\n", " StatisticsCalculator,\n", " SeverityCalculator,\n", " ImpactLevel,\n", " aggregate_batch,\n", " build_rca_tree,\n", " calculate_batch_statistics,\n", ")\n", "from src.models.call_analysis import (\n", " CallAnalysis,\n", " CallOutcome,\n", " EvidenceSpan,\n", " ObservedFeatures,\n", " ProcessingStatus,\n", " RCALabel,\n", " Traceability,\n", ")\n", "\n", "print(\"Imports successful!\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 1. Create Simulated Call Analyses\n", "\n", "We'll simulate 100 call analyses with realistic driver distributions." 
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import random\n", "\n", "def create_sample_analyses(n: int = 100) -> list[CallAnalysis]:\n", " \"\"\"Create n sample call analyses with realistic distributions.\"\"\"\n", " random.seed(42) # Reproducible\n", " \n", " base_observed = ObservedFeatures(audio_duration_sec=60.0, events=[])\n", " base_trace = Traceability(\n", " schema_version=\"1.0.0\",\n", " prompt_version=\"v1.0\",\n", " model_id=\"gpt-4o-mini\",\n", " )\n", " \n", " # Driver probabilities (realistic distribution)\n", " lost_sales_probs = {\n", " \"PRICE_TOO_HIGH\": 0.25,\n", " \"COMPETITOR_PREFERENCE\": 0.12,\n", " \"TIMING_NOT_RIGHT\": 0.10,\n", " \"NO_NEED\": 0.08,\n", " \"OBJECTION_NOT_HANDLED\": 0.15,\n", " \"NO_SAVE_OFFER\": 0.10,\n", " \"POOR_PITCH\": 0.05,\n", " }\n", " \n", " poor_cx_probs = {\n", " \"LONG_HOLD\": 0.20,\n", " \"MULTI_TRANSFER\": 0.08,\n", " \"LOW_EMPATHY\": 0.10,\n", " \"ISSUE_NOT_RESOLVED\": 0.12,\n", " \"INTERRUPTIONS\": 0.05,\n", " \"CALLBACK_REQUIRED\": 0.08,\n", " }\n", " \n", " analyses = []\n", " \n", " for i in range(n):\n", " call_id = f\"CALL{i+1:04d}\"\n", " \n", " # Determine if this is a lost sale (40% of calls)\n", " is_lost_sale = random.random() < 0.40\n", " \n", " # Determine if poor CX (30% of calls)\n", " has_poor_cx = random.random() < 0.30\n", " \n", " # Generate lost sales drivers\n", " lost_sales = []\n", " if is_lost_sale:\n", " for code, prob in lost_sales_probs.items():\n", " if random.random() < prob:\n", " lost_sales.append(RCALabel(\n", " driver_code=code,\n", " confidence=random.uniform(0.6, 0.95),\n", " evidence_spans=[EvidenceSpan(\n", " text=f\"Evidence for {code}\",\n", " start_time=random.uniform(0, 50),\n", " end_time=random.uniform(50, 60),\n", " )],\n", " ))\n", " \n", " # Generate poor CX drivers\n", " poor_cx = []\n", " if has_poor_cx:\n", " for code, prob in poor_cx_probs.items():\n", " if random.random() < prob:\n", " 
poor_cx.append(RCALabel(\n", " driver_code=code,\n", " confidence=random.uniform(0.6, 0.95),\n", " evidence_spans=[EvidenceSpan(\n", " text=f\"Evidence for {code}\",\n", " start_time=random.uniform(0, 50),\n", " end_time=random.uniform(50, 60),\n", " )],\n", " ))\n", " \n", " # Determine outcome\n", " if is_lost_sale:\n", " outcome = CallOutcome.SALE_LOST\n", " elif random.random() < 0.5:\n", " outcome = CallOutcome.SALE_COMPLETED\n", " else:\n", " outcome = CallOutcome.INQUIRY_RESOLVED\n", " \n", " analyses.append(CallAnalysis(\n", " call_id=call_id,\n", " batch_id=\"validation_batch\",\n", " status=ProcessingStatus.SUCCESS,\n", " observed=base_observed,\n", " outcome=outcome,\n", " lost_sales_drivers=lost_sales,\n", " poor_cx_drivers=poor_cx,\n", " traceability=base_trace,\n", " ))\n", " \n", " return analyses\n", "\n", "# Create 100 sample analyses\n", "analyses = create_sample_analyses(100)\n", "print(f\"Created {len(analyses)} sample analyses\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 2. 
Calculate Frequency Statistics" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "calculator = StatisticsCalculator()\n", "lost_sales_freqs, poor_cx_freqs = calculator.calculate_frequencies(analyses)\n", "\n", "print(\"=== LOST SALES DRIVER FREQUENCIES ===\")\n", "print(f\"{'Driver':<25} {'Occurrences':>12} {'Call Rate':>10} {'Avg Conf':>10}\")\n", "print(\"-\" * 60)\n", "\n", "for freq in lost_sales_freqs:\n", " print(f\"{freq.driver_code:<25} {freq.total_occurrences:>12} {freq.call_rate:>9.1%} {freq.avg_confidence:>10.2f}\")\n", "\n", "print(f\"\\nTotal lost sales drivers: {len(lost_sales_freqs)}\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(\"=== POOR CX DRIVER FREQUENCIES ===\")\n", "print(f\"{'Driver':<25} {'Occurrences':>12} {'Call Rate':>10} {'Avg Conf':>10}\")\n", "print(\"-\" * 60)\n", "\n", "for freq in poor_cx_freqs:\n", " print(f\"{freq.driver_code:<25} {freq.total_occurrences:>12} {freq.call_rate:>9.1%} {freq.avg_confidence:>10.2f}\")\n", "\n", "print(f\"\\nTotal poor CX drivers: {len(poor_cx_freqs)}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 3. Outcome Rate Analysis" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "outcome_rates = calculator.calculate_outcome_rates(analyses)\n", "\n", "print(\"=== OUTCOME RATES ===\")\n", "print(f\"Total calls analyzed: {outcome_rates['total_calls']}\")\n", "print(f\"\\nCalls with lost sales drivers: {outcome_rates['lost_sales_count']} ({outcome_rates['lost_sales_rate']:.1%})\")\n", "print(f\"Calls with poor CX drivers: {outcome_rates['poor_cx_count']} ({outcome_rates['poor_cx_rate']:.1%})\")\n", "print(f\"Calls with BOTH: {outcome_rates['both_count']} ({outcome_rates['both_rate']:.1%})\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 4. 
Severity Scoring" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "severity_calc = SeverityCalculator()\n", "lost_sales_sevs, poor_cx_sevs = severity_calc.calculate_all_severities(\n", " lost_sales_freqs, poor_cx_freqs\n", ")\n", "\n", "print(\"=== LOST SALES SEVERITY SCORES ===\")\n", "print(f\"{'Rank':<5} {'Driver':<25} {'Score':>8} {'Impact':>12}\")\n", "print(\"-\" * 55)\n", "\n", "for rank, sev in enumerate(lost_sales_sevs, 1):\n", " print(f\"{rank:<5} {sev.driver_code:<25} {sev.severity_score:>7.1f} {sev.impact_level.value:>12}\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(\"=== POOR CX SEVERITY SCORES ===\")\n", "print(f\"{'Rank':<5} {'Driver':<25} {'Score':>8} {'Impact':>12}\")\n", "print(\"-\" * 55)\n", "\n", "for rank, sev in enumerate(poor_cx_sevs, 1):\n", " print(f\"{rank:<5} {sev.driver_code:<25} {sev.severity_score:>7.1f} {sev.impact_level.value:>12}\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Show severity formula breakdown for top driver\n", "if lost_sales_sevs:\n", " top = lost_sales_sevs[0]\n", " print(f\"=== SEVERITY BREAKDOWN: {top.driver_code} ===\")\n", " print(f\"Base severity (from taxonomy): {top.base_severity:.2f}\")\n", " print(f\"Frequency factor: {top.frequency_factor:.2f}\")\n", " print(f\"Confidence factor: {top.confidence_factor:.2f}\")\n", " print(f\"Co-occurrence factor: {top.co_occurrence_factor:.2f}\")\n", " print(f\"\\nFinal severity score: {top.severity_score:.1f}\")\n", " print(f\"Impact level: {top.impact_level.value}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 5. 
Conditional Probabilities" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "config = AggregationConfig(min_support=3)\n", "calc = StatisticsCalculator(config=config)\n", "cond_probs = calc.calculate_conditional_probabilities(analyses)\n", "\n", "print(\"=== TOP CONDITIONAL PROBABILITIES (by Lift) ===\")\n", "print(f\"{'Driver A':<25} → {'Driver B':<25} {'P(B|A)':>8} {'Lift':>6} {'Support':>8}\")\n", "print(\"-\" * 80)\n", "\n", "for cp in cond_probs[:10]:\n", " print(f\"{cp.driver_a:<25} → {cp.driver_b:<25} {cp.probability:>7.1%} {cp.lift:>6.2f} {cp.support:>8}\")\n", "\n", "print(f\"\\nInterpretation: Lift > 1 means drivers co-occur more than expected by chance.\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 6. Build RCA Tree" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "builder = RCATreeBuilder()\n", "tree = builder.build(\"validation_batch\", analyses)\n", "\n", "print(\"=== RCA TREE SUMMARY ===\")\n", "print(f\"Batch ID: {tree.batch_id}\")\n", "print(f\"Total calls: {tree.total_calls}\")\n", "print(f\"Calls with lost sales: {tree.calls_with_lost_sales} ({tree.calls_with_lost_sales/tree.total_calls:.1%})\")\n", "print(f\"Calls with poor CX: {tree.calls_with_poor_cx} ({tree.calls_with_poor_cx/tree.total_calls:.1%})\")\n", "print(f\"Calls with both: {tree.calls_with_both} ({tree.calls_with_both/tree.total_calls:.1%})\")\n", "\n", "print(f\"\\nTop lost sales drivers: {tree.top_lost_sales_drivers}\")\n", "print(f\"Top poor CX drivers: {tree.top_poor_cx_drivers}\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(\"=== LOST SALES RCA TREE ===\")\n", "print(f\"{'Rank':<5} {'Driver':<25} {'Impact':>10} {'Call Rate':>10} {'Score':>8}\")\n", "print(\"-\" * 65)\n", "\n", "for node in tree.lost_sales_root:\n", " print(f\"{node.priority_rank:<5} {node.driver_code:<25} 
{node.severity.impact_level.value:>10} {node.frequency.call_rate:>9.1%} {node.severity.severity_score:>8.1f}\")\n", " if node.sample_evidence:\n", " print(f\" └── Evidence: \\\"{node.sample_evidence[0][:50]}...\\\"\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(\"=== POOR CX RCA TREE ===\")\n", "print(f\"{'Rank':<5} {'Driver':<25} {'Impact':>10} {'Call Rate':>10} {'Score':>8}\")\n", "print(\"-\" * 65)\n", "\n", "for node in tree.poor_cx_root:\n", " print(f\"{node.priority_rank:<5} {node.driver_code:<25} {node.severity.impact_level.value:>10} {node.frequency.call_rate:>9.1%} {node.severity.severity_score:>8.1f}\")\n", " if node.recommended_actions:\n", " print(f\" └── Action: {node.recommended_actions[0]}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 7. Full Batch Aggregation" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "aggregation = aggregate_batch(\"validation_batch\", analyses)\n", "\n", "print(\"=== BATCH AGGREGATION SUMMARY ===\")\n", "print(f\"Batch ID: {aggregation.batch_id}\")\n", "print(f\"Total processed: {aggregation.total_calls_processed}\")\n", "print(f\"Successful: {aggregation.successful_analyses}\")\n", "print(f\"Failed: {aggregation.failed_analyses}\")\n", "print(f\"\\nLost sales drivers found: {len(aggregation.lost_sales_frequencies)}\")\n", "print(f\"Poor CX drivers found: {len(aggregation.poor_cx_frequencies)}\")\n", "print(f\"Emergent patterns: {len(aggregation.emergent_patterns)}\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Get top drivers by severity\n", "top_lost_sales = aggregation.get_top_drivers(\"lost_sales\", n=5, by=\"severity\")\n", "top_poor_cx = aggregation.get_top_drivers(\"poor_cx\", n=5, by=\"severity\")\n", "\n", "print(\"=== TOP 5 DRIVERS BY SEVERITY ===\")\n", "print(f\"\\nLost Sales: {top_lost_sales}\")\n", "print(f\"Poor CX: 
{top_poor_cx}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 8. JSON Export" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Export tree to JSON\n", "tree_json = tree.to_dict()\n", "\n", "print(\"=== RCA TREE JSON STRUCTURE ===\")\n", "print(json.dumps(tree_json, indent=2, default=str)[:2000])\n", "print(\"\\n... [truncated]\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 9. Validation Checks" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(\"=== VALIDATION CHECKS ===\")\n", "\n", "# Check 1: occurrence totals must reconcile with the raw analyses.\n", "# Assert (not just print) so a mismatch actually fails the notebook run.\n", "total_ls_occurrences = sum(f.total_occurrences for f in lost_sales_freqs)\n", "total_pcx_occurrences = sum(f.total_occurrences for f in poor_cx_freqs)\n", "\n", "# Count from analyses\n", "actual_ls = sum(len(a.lost_sales_drivers) for a in analyses)\n", "actual_pcx = sum(len(a.poor_cx_drivers) for a in analyses)\n", "\n", "assert total_ls_occurrences == actual_ls, f\"Lost sales occurrence mismatch: {total_ls_occurrences} != {actual_ls}\"\n", "assert total_pcx_occurrences == actual_pcx, f\"Poor CX occurrence mismatch: {total_pcx_occurrences} != {actual_pcx}\"\n", "print(f\"✓ Lost sales occurrences match: {total_ls_occurrences} == {actual_ls}\")\n", "print(f\"✓ Poor CX occurrences match: {total_pcx_occurrences} == {actual_pcx}\")\n", "\n", "# Check 2: Severity scores in range (asserted, so False cannot pass silently)\n", "all_sevs = lost_sales_sevs + poor_cx_sevs\n", "all_in_range = all(0 <= s.severity_score <= 100 for s in all_sevs)\n", "assert all_in_range, \"severity score outside 0-100 range\"\n", "print(f\"✓ All severity scores in 0-100 range: {all_in_range}\")\n", "\n", "# Check 3: Rates in range (asserted, so False cannot pass silently)\n", "all_freqs = lost_sales_freqs + poor_cx_freqs\n", "rates_valid = all(0 <= f.call_rate <= 1 for f in all_freqs)\n", "assert rates_valid, \"call rate outside 0-1 range\"\n", "print(f\"✓ All call rates in 0-1 range: {rates_valid}\")\n", "\n", "# Check 4: Prioritization is consistent in BOTH trees (poor CX was previously unchecked)\n", "for branch in (tree.lost_sales_root, tree.poor_cx_root):\n", "    for i in range(len(branch) - 1):\n", "        assert branch[i].severity.severity_score >= branch[i + 1].severity.severity_score\n", "print(\"✓ Drivers correctly prioritized by severity\")\n", "\n", "print(\"\\n✓ All validation checks passed!\")" ] }, { "cell_type": "markdown", "metadata": {},
"source": [ "## 10. Summary\n", "\n", "### Aggregation Module Validated:\n", "\n", "1. **Frequency Statistics** ✓\n", " - Occurrence counts and rates\n", " - Confidence statistics (avg, min, max)\n", " - Co-occurrence tracking\n", "\n", "2. **Conditional Probabilities** ✓\n", " - P(B|A) calculation\n", " - Lift metric for pattern significance\n", " - Support threshold filtering\n", "\n", "3. **Severity Scoring** ✓\n", " - Base severity from taxonomy\n", " - Weighted formula: base + frequency + confidence + co-occurrence\n", " - Impact level classification (CRITICAL, HIGH, MEDIUM, LOW)\n", "\n", "4. **RCA Tree Building** ✓\n", " - Hierarchical structure by driver category\n", " - Priority ranking by severity\n", " - Sample evidence collection\n", " - Recommended actions per category\n", "\n", "5. **Batch Aggregation** ✓\n", " - Complete statistics bundle\n", " - JSON export for downstream use\n", " - Top drivers by frequency or severity" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(\"=\"*50)\n", "print(\"CHECKPOINT 7 - AGGREGATION VALIDATION COMPLETE\")\n", "print(\"=\"*50)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "name": "python", "version": "3.11.0" } }, "nbformat": 4, "nbformat_minor": 4 }