{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 01 - Transcription Validation\n",
    "\n",
    "**Objective:** Validate STT quality before proceeding to inference.\n",
    "\n",
    "## Metrics to Evaluate\n",
    "- Latency per call\n",
    "- Cost per minute\n",
    "- Diarization quality (% turns with speaker)\n",
    "- Language detection accuracy\n",
    "- Overall confidence scores\n",
    "\n",
    "## STOP/GO Criteria\n",
    "- [ ] Quality acceptable (>90% usable transcriptions)\n",
    "- [ ] Cost known (verify against estimates)\n",
    "- [ ] STT provider decision confirmed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Setup\n",
    "import asyncio\n",
    "import os\n",
    "import sys\n",
    "import time\n",
    "from pathlib import Path\n",
    "\n",
    "import pandas as pd\n",
    "\n",
    "# Add project root to path\n",
    "project_root = Path.cwd().parent\n",
    "sys.path.insert(0, str(project_root))\n",
    "\n",
    "# Load environment\n",
    "from dotenv import load_dotenv\n",
    "load_dotenv(project_root / '.env')\n",
    "\n",
    "print(f\"Project root: {project_root}\")\n",
    "print(f\"API key configured: {'ASSEMBLYAI_API_KEY' in os.environ}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Imports\n",
    "from src.transcription import (\n",
    "    AssemblyAITranscriber,\n",
    "    BatchTranscriptionProcessor,\n",
    "    TranscriptionConfig,\n",
    "    get_audio_metadata_sync,\n",
    "    validate_audio_file,\n",
    "    estimate_transcription_cost,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Discover Test Audio Files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Configure test audio directory\n",
    "# Replace with your actual test audio path\n",
    "TEST_AUDIO_DIR = project_root / \"data\" / \"raw\" / \"audio\" / \"test_batch\"\n",
    "\n",
    "# Or use fixtures for testing\n",
    "# TEST_AUDIO_DIR = project_root / \"tests\" / \"fixtures\" / \"sample_audio\"\n",
    "\n",
    "print(f\"Looking for audio in: {TEST_AUDIO_DIR}\")\n",
    "print(f\"Directory exists: {TEST_AUDIO_DIR.exists()}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Discover audio files\n",
    "audio_files = []\n",
    "if TEST_AUDIO_DIR.exists():\n",
    "    for ext in ['.mp3', '.wav', '.m4a']:\n",
    "        audio_files.extend(TEST_AUDIO_DIR.glob(f'*{ext}'))\n",
    "\n",
    "audio_files = sorted(audio_files)[:10]  # Limit to 10 for validation\n",
    "print(f\"Found {len(audio_files)} audio files\")\n",
    "for f in audio_files:\n",
    "    print(f\"  - {f.name}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Pre-validation & Cost Estimation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Validate and get metadata\n",
    "validation_results = []\n",
    "total_duration_sec = 0\n",
    "\n",
    "for audio_path in audio_files:\n",
    "    is_valid, error = validate_audio_file(audio_path)\n",
    "    \n",
    "    if is_valid:\n",
    "        try:\n",
    "            metadata = get_audio_metadata_sync(audio_path)\n",
    "            total_duration_sec += metadata.duration_sec\n",
    "            validation_results.append({\n",
    "                'file': audio_path.name,\n",
    "                'valid': True,\n",
    "                'duration_min': metadata.duration_minutes,\n",
    "                'size_mb': metadata.file_size_mb,\n",
    "            })\n",
    "        except Exception as e:\n",
    "            validation_results.append({\n",
    "                'file': audio_path.name,\n",
    "                'valid': False,\n",
    "                'error': str(e),\n",
    "            })\n",
    "    else:\n",
    "        validation_results.append({\n",
    "            'file': audio_path.name,\n",
    "            'valid': False,\n",
    "            'error': error,\n",
    "        })\n",
    "\n",
    "# Display results\n",
    "df_validation = pd.DataFrame(validation_results)\n",
    "display(df_validation)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cost estimation\n",
    "total_minutes = total_duration_sec / 60\n",
    "cost_estimate = estimate_transcription_cost(total_minutes)\n",
    "\n",
    "# Guard against an empty batch so the average doesn't divide by zero\n",
    "avg_minutes = total_minutes / len(audio_files) if audio_files else 0.0\n",
    "\n",
    "print(\"=\" * 50)\n",
    "print(\"COST ESTIMATION\")\n",
    "print(\"=\" * 50)\n",
    "print(f\"Total files: {len(audio_files)}\")\n",
    "print(f\"Total duration: {cost_estimate['total_minutes']:.1f} minutes ({cost_estimate['total_hours']:.2f} hours)\")\n",
    "print(f\"Average duration: {avg_minutes:.1f} minutes per file\")\n",
    "print(f\"\")\n",
    "print(f\"Estimated cost (USD): ${cost_estimate['estimated_cost_usd']:.2f}\")\n",
    "print(f\"Estimated cost (EUR): \u20ac{cost_estimate['estimated_cost_eur']:.2f}\")\n",
    "print(\"=\" * 50)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Transcription Test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Initialize transcriber\n",
    "transcriber = AssemblyAITranscriber()\n",
    "config = TranscriptionConfig(\n",
    "    language_code='es',\n",
    "    speaker_labels=True,\n",
    "    punctuate=True,\n",
    ")\n",
    "\n",
    "print(f\"Provider: {transcriber.provider_name}\")\n",
    "print(f\"Config: {config}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Transcribe single file (for quick test)\n",
    "result = None  # defined unconditionally so downstream cells can guard on it\n",
    "if audio_files:\n",
    "    test_file = audio_files[0]\n",
    "    print(f\"Testing with: {test_file.name}\")\n",
    "    \n",
    "    start_time = time.time()\n",
    "    \n",
    "    result = await transcriber.transcribe(test_file, config)\n",
    "    \n",
    "    elapsed = time.time() - start_time\n",
    "    \n",
    "    print(f\"\\nStatus: {result.status}\")\n",
    "    print(f\"Success: {result.is_success}\")\n",
    "    print(f\"Processing time: {elapsed:.1f}s\")\n",
    "    \n",
    "    if result.is_success and result.transcript:\n",
    "        t = result.transcript\n",
    "        print(f\"\\nTranscript details:\")\n",
    "        print(f\"  - Job ID: {t.metadata.job_id}\")\n",
    "        print(f\"  - Duration: {t.metadata.audio_duration_sec:.1f}s\")\n",
    "        print(f\"  - Language: {t.metadata.language}\")\n",
    "        print(f\"  - Speakers: {t.metadata.speaker_count}\")\n",
    "        print(f\"  - Turns: {t.total_turns}\")\n",
    "        print(f\"  - Words: {t.total_words}\")\n",
    "        print(f\"  - Confidence: {t.metadata.overall_confidence}\")\n",
    "    else:\n",
    "        print(f\"\\nError: {result.error}\")\n",
    "        print(f\"Message: {result.error_message}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# View sample turns (guarded: `result` is None when no audio files were found)\n",
    "if result is not None and result.is_success and result.transcript:\n",
    "    print(\"\\n=== Sample Turns ===\")\n",
    "    for i, turn in enumerate(result.transcript.turns[:5]):\n",
    "        print(f\"\\n[{turn.speaker}] ({turn.start_time:.1f}s - {turn.end_time:.1f}s)\")\n",
    "        print(f\"  {turn.text}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Batch Transcription (5-10 files)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Batch transcription\n",
    "valid_files = [f for f in audio_files if any(\n",
    "    r['file'] == f.name and r.get('valid', False) \n",
    "    for r in validation_results\n",
    ")]\n",
    "\n",
    "print(f\"Processing {len(valid_files)} valid files...\")\n",
    "\n",
    "start_time = time.time()\n",
    "batch_results = await transcriber.transcribe_batch(\n",
    "    valid_files,\n",
    "    config=config,\n",
    "    max_concurrent=5,\n",
    ")\n",
    "total_elapsed = time.time() - start_time\n",
    "\n",
    "print(f\"\\nTotal time: {total_elapsed:.1f}s\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. Quality Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Analyze results\n",
    "quality_data = []\n",
    "\n",
    "for result in batch_results:\n",
    "    row = {\n",
    "        'call_id': result.call_id,\n",
    "        'success': result.is_success,\n",
    "        'error': result.error.value if result.error else None,\n",
    "    }\n",
    "    \n",
    "    if result.is_success and result.transcript:\n",
    "        t = result.transcript\n",
    "        m = t.metadata\n",
    "        \n",
    "        # Count turns with speaker labels\n",
    "        turns_with_speaker = sum(\n",
    "            1 for turn in t.turns \n",
    "            if turn.speaker and turn.speaker != 'unknown'\n",
    "        )\n",
    "        \n",
    "        row.update({\n",
    "            'duration_sec': m.audio_duration_sec,\n",
    "            'processing_sec': m.processing_time_sec,\n",
    "            'language': m.language,\n",
    "            'confidence': m.overall_confidence,\n",
    "            'speaker_count': m.speaker_count,\n",
    "            'total_turns': t.total_turns,\n",
    "            'turns_with_speaker': turns_with_speaker,\n",
    "            'diarization_rate': turns_with_speaker / t.total_turns if t.total_turns > 0 else 0,\n",
    "            'total_words': t.total_words,\n",
    "        })\n",
    "    \n",
    "    quality_data.append(row)\n",
    "\n",
    "df_quality = pd.DataFrame(quality_data)\n",
    "display(df_quality)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Summary statistics\n",
    "print(\"=\" * 50)\n",
    "print(\"QUALITY SUMMARY\")\n",
    "print(\"=\" * 50)\n",
    "\n",
    "success_count = int(df_quality['success'].sum()) if 'success' in df_quality.columns else 0\n",
    "total_count = len(df_quality)\n",
    "success_rate = success_count / total_count * 100 if total_count else 0.0\n",
    "\n",
    "print(f\"Success rate: {success_rate:.1f}% ({success_count}/{total_count})\")\n",
    "\n",
    "if 'confidence' in df_quality.columns:\n",
    "    avg_confidence = df_quality['confidence'].mean()\n",
    "    print(f\"Average confidence: {avg_confidence:.2f}\")\n",
    "\n",
    "if 'diarization_rate' in df_quality.columns:\n",
    "    avg_diarization = df_quality['diarization_rate'].mean()\n",
    "    print(f\"Average diarization rate: {avg_diarization:.1%}\")\n",
    "\n",
    "# success_count guard avoids a ZeroDivisionError when every call failed\n",
    "if 'language' in df_quality.columns and success_count:\n",
    "    spanish_count = (df_quality['language'] == 'es').sum()\n",
    "    print(f\"Spanish detected: {spanish_count}/{success_count} ({spanish_count/success_count*100:.1f}%)\")\n",
    "\n",
    "print(\"=\" * 50)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cost analysis (guarded: per-call metrics need at least one success)\n",
    "if 'duration_sec' in df_quality.columns and success_count:\n",
    "    total_duration_min = df_quality['duration_sec'].sum() / 60\n",
    "    total_processing_sec = df_quality['processing_sec'].sum()\n",
    "    \n",
    "    actual_cost = estimate_transcription_cost(total_duration_min)\n",
    "    \n",
    "    print(\"\\n=== COST ANALYSIS ===\")\n",
    "    print(f\"Total audio: {total_duration_min:.1f} minutes\")\n",
    "    print(f\"Total processing: {total_processing_sec:.1f} seconds\")\n",
    "    print(f\"Actual cost: ${actual_cost['estimated_cost_usd']:.2f}\")\n",
    "    print(f\"Cost per call: ${actual_cost['estimated_cost_usd'] / success_count:.3f}\")\n",
    "    print(f\"Avg latency: {total_processing_sec / success_count:.1f}s per call\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6. STOP/GO Decision\n",
    "\n",
    "### Criteria Checklist\n",
    "\n",
    "| Criteria | Target | Actual | Status |\n",
    "|----------|--------|--------|--------|\n",
    "| Success rate | >90% | ___ | [ ] |\n",
    "| Avg confidence | >0.8 | ___ | [ ] |\n",
    "| Diarization rate | >80% | ___ | [ ] |\n",
    "| Spanish detection | >95% | ___ | [ ] |\n",
    "| Cost per call | <$0.05 | ___ | [ ] |\n",
    "\n",
    "### Decision\n",
    "\n",
    "- [ ] **GO**: Quality acceptable, proceed to Checkpoint 3\n",
    "- [ ] **STOP**: Issues found, investigate before proceeding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save results for reference\n",
    "output_dir = project_root / 'data' / 'outputs' / 'validation'\n",
    "output_dir.mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "df_quality.to_csv(output_dir / 'transcription_quality.csv', index=False)\n",
    "print(f\"Results saved to: {output_dir / 'transcription_quality.csv'}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}