"""
CXInsights - Feature Extraction Tests

Tests for deterministic feature extraction: event detection, turn metrics,
aggregated call features, and Spanish hold/transfer phrase patterns.
"""

import pytest

from src.features.event_detector import EventDetector, EventDetectorConfig, detect_events
from src.features.extractor import FeatureExtractor, extract_features
from src.features.turn_metrics import TurnMetricsCalculator, calculate_turn_metrics
from src.models.call_analysis import EventType
from src.transcription.models import SpeakerTurn, Transcript, TranscriptMetadata


def _events_of_type(events, event_type):
    """Return the events whose ``event_type`` equals *event_type*, in order."""
    return [event for event in events if event.event_type == event_type]


def _single_agent_turn_transcript(text):
    """Build a one-turn transcript in which the agent says *text*.

    Used by the phrase-pattern tests so that each phrase is evaluated in
    isolation from any other turn.
    """
    return Transcript(
        call_id="TEST",
        turns=[
            SpeakerTurn(
                speaker="agent",
                text=text,
                start_time=0.0,
                end_time=3.0,
            ),
        ],
        metadata=TranscriptMetadata(
            audio_duration_sec=10.0,
            audio_file="test.mp3",
            provider="test",
        ),
    )


@pytest.fixture
def sample_transcript():
    """Create a six-turn sample transcript (4 agent turns, 2 customer turns).

    The conversation contains a hold announcement, a long gap between 12.0s
    and 45.0s, a return-from-hold phrase, and a transfer announcement.
    """
    return Transcript(
        call_id="TEST001",
        turns=[
            SpeakerTurn(
                speaker="agent",
                text="Buenos días, ¿en qué puedo ayudarle?",
                start_time=0.0,
                end_time=3.0,
            ),
            SpeakerTurn(
                speaker="customer",
                text="Hola, quiero cancelar mi servicio.",
                start_time=3.5,
                end_time=6.5,
            ),
            SpeakerTurn(
                speaker="agent",
                text="Entiendo. Un momento, por favor, le pongo en espera mientras consulto.",
                start_time=7.0,
                end_time=12.0,
            ),
            # Silence gap (hold)
            SpeakerTurn(
                speaker="agent",
                text="Gracias por la espera. Le cuento que tenemos una oferta especial.",
                start_time=45.0,
                end_time=52.0,
            ),
            SpeakerTurn(
                speaker="customer",
                text="No me interesa, es demasiado caro.",
                start_time=52.5,
                end_time=56.0,
            ),
            SpeakerTurn(
                speaker="agent",
                text="Le voy a transferir con el departamento de retenciones.",
                start_time=56.5,
                end_time=61.0,
            ),
        ],
        metadata=TranscriptMetadata(
            audio_duration_sec=120.0,
            audio_file="TEST001.mp3",
            provider="test",
            speaker_count=2,
        ),
    )


@pytest.fixture
def transcript_with_interruptions():
    """Create a three-turn transcript in which each later turn overlaps the previous one."""
    return Transcript(
        call_id="TEST002",
        turns=[
            SpeakerTurn(
                speaker="agent",
                text="Le explico cómo funciona el proceso...",
                start_time=0.0,
                end_time=5.0,
            ),
            SpeakerTurn(
                speaker="customer",
                text="Pero es que yo ya lo sé...",
                start_time=4.5,  # Starts before agent ends
                end_time=7.0,
            ),
            SpeakerTurn(
                speaker="agent",
                text="Perdone, le decía que...",
                start_time=6.8,  # Starts before customer ends
                end_time=10.0,
            ),
        ],
        metadata=TranscriptMetadata(
            audio_duration_sec=60.0,
            audio_file="TEST002.mp3",
            provider="test",
        ),
    )


@pytest.fixture
def transcript_with_silences():
    """Create a three-turn transcript separated by a 10s gap and an 8s gap."""
    return Transcript(
        call_id="TEST003",
        turns=[
            SpeakerTurn(
                speaker="agent",
                text="Voy a comprobar su cuenta.",
                start_time=0.0,
                end_time=3.0,
            ),
            # 10 second gap
            SpeakerTurn(
                speaker="agent",
                text="Ya tengo la información.",
                start_time=13.0,
                end_time=16.0,
            ),
            # 8 second gap
            SpeakerTurn(
                speaker="customer",
                text="¿Y qué dice?",
                start_time=24.0,
                end_time=26.0,
            ),
        ],
        metadata=TranscriptMetadata(
            audio_duration_sec=30.0,
            audio_file="TEST003.mp3",
            provider="test",
        ),
    )


class TestEventDetector:
    """Tests for EventDetector."""

    def test_detect_hold_start(self, sample_transcript):
        """Test detection of hold start patterns."""
        events = detect_events(sample_transcript)

        hold_starts = _events_of_type(events, EventType.HOLD_START)
        # Should detect "Un momento, por favor, le pongo en espera"
        assert len(hold_starts) >= 1

    def test_detect_hold_end(self, sample_transcript):
        """Test detection of hold end patterns."""
        events = detect_events(sample_transcript)

        hold_ends = _events_of_type(events, EventType.HOLD_END)
        # Should detect "Gracias por la espera"
        assert len(hold_ends) >= 1

    def test_detect_transfer(self, sample_transcript):
        """Test detection of transfer patterns."""
        events = detect_events(sample_transcript)

        transfers = _events_of_type(events, EventType.TRANSFER)
        # Should detect "Le voy a transferir"
        assert len(transfers) >= 1

    def test_detect_silence(self, transcript_with_silences):
        """Test detection of significant silences."""
        config = EventDetectorConfig(silence_threshold_sec=5.0)
        detector = EventDetector(config)
        events = detector.detect_all(transcript_with_silences)

        silences = _events_of_type(events, EventType.SILENCE)
        assert len(silences) == 2  # Two gaps > 5 seconds
        # Durations are floats derived from timestamps; compare with a
        # tolerance rather than exact equality.
        assert silences[0].duration_sec == pytest.approx(10.0)
        assert silences[1].duration_sec == pytest.approx(8.0)

    def test_detect_interruptions(self, transcript_with_interruptions):
        """Test detection of interruptions."""
        events = detect_events(transcript_with_interruptions)

        interruptions = _events_of_type(events, EventType.INTERRUPTION)
        assert len(interruptions) == 2  # Two overlapping segments

    def test_events_sorted_by_time(self, sample_transcript):
        """Test that events are sorted by start time."""
        events = detect_events(sample_transcript)

        start_times = [event.start_time for event in events]
        assert start_times == sorted(start_times)

    def test_event_has_observed_source(self, sample_transcript):
        """Test that all events have source='observed'."""
        events = detect_events(sample_transcript)

        for event in events:
            assert event.source == "observed"


class TestTurnMetrics:
    """Tests for TurnMetricsCalculator."""

    def test_turn_counts(self, sample_transcript):
        """Test turn counting."""
        metrics = calculate_turn_metrics(sample_transcript)

        assert metrics.total_turns == 6
        assert metrics.agent_turns == 4
        assert metrics.customer_turns == 2

    def test_talk_ratios(self, sample_transcript):
        """Test talk ratio calculations."""
        metrics = calculate_turn_metrics(sample_transcript)

        # Ratios should be between 0 and 1
        assert 0 <= metrics.agent_talk_ratio <= 1
        assert 0 <= metrics.customer_talk_ratio <= 1
        assert 0 <= metrics.silence_ratio <= 1

        # The three ratios are expected to add up to roughly 1; only the
        # upper bound is checked, with some slack for rounding.
        total = (
            metrics.agent_talk_ratio
            + metrics.customer_talk_ratio
            + metrics.silence_ratio
        )
        assert total <= 1.1

    def test_interruption_count(self, transcript_with_interruptions):
        """Test interruption counting in metrics."""
        metrics = calculate_turn_metrics(transcript_with_interruptions)

        assert metrics.interruption_count == 2

    def test_avg_turn_duration(self, sample_transcript):
        """Test average turn duration calculation."""
        metrics = calculate_turn_metrics(sample_transcript)

        assert metrics.avg_turn_duration_sec > 0

    def test_metrics_has_observed_source(self, sample_transcript):
        """Test that metrics have source='observed'."""
        metrics = calculate_turn_metrics(sample_transcript)

        assert metrics.source == "observed"

    def test_empty_transcript(self):
        """Test handling of empty transcript."""
        empty = Transcript(
            call_id="EMPTY",
            turns=[],
            metadata=TranscriptMetadata(
                audio_duration_sec=0.0,
                audio_file="empty.mp3",
                provider="test",
            ),
        )

        metrics = calculate_turn_metrics(empty)

        assert metrics.total_turns == 0
        assert metrics.agent_turns == 0
        assert metrics.customer_turns == 0


class TestFeatureExtractor:
    """Tests for FeatureExtractor."""

    def test_extract_features(self, sample_transcript):
        """Test complete feature extraction."""
        features = extract_features(sample_transcript)

        assert features.call_id == "TEST001"
        assert features.audio_duration_sec == 120.0
        assert features.language == "es"

    def test_features_have_events(self, sample_transcript):
        """Test that features include detected events."""
        features = extract_features(sample_transcript)

        assert len(features.events) > 0

    def test_features_have_metrics(self, sample_transcript):
        """Test that features include turn metrics."""
        features = extract_features(sample_transcript)

        assert features.turn_metrics is not None
        assert features.turn_metrics.total_turns == 6

    def test_hold_aggregation(self, sample_transcript):
        """Test hold count aggregation."""
        features = extract_features(sample_transcript)

        # Should have at least one hold
        assert features.hold_count >= 1

    def test_transfer_aggregation(self, sample_transcript):
        """Test transfer count aggregation."""
        features = extract_features(sample_transcript)

        assert features.transfer_count >= 1

    def test_silence_aggregation(self, transcript_with_silences):
        """Test silence count aggregation."""
        features = extract_features(transcript_with_silences)

        assert features.silence_count == 2

    def test_interruption_aggregation(self, transcript_with_interruptions):
        """Test interruption count aggregation."""
        features = extract_features(transcript_with_interruptions)

        assert features.interruption_count == 2

    def test_deterministic_output(self, sample_transcript):
        """Test that extraction is deterministic."""
        features1 = extract_features(sample_transcript)
        features2 = extract_features(sample_transcript)

        # Same input should produce same output
        assert features1.hold_count == features2.hold_count
        assert features1.transfer_count == features2.transfer_count
        assert features1.silence_count == features2.silence_count
        assert len(features1.events) == len(features2.events)


class TestSpanishPatterns:
    """Tests for Spanish language pattern detection."""

    # Each phrase is its own parametrized case so that one failing phrase
    # does not hide the result of the phrases listed after it.
    @pytest.mark.parametrize(
        ("text", "should_match"),
        [
            ("Un momento, por favor", True),
            ("Le voy a poner en espera", True),
            ("Espere un segundo", True),
            ("No cuelgue", True),
            ("Déjeme verificar", True),
            ("Buenos días", False),
            ("Gracias por llamar", False),
        ],
    )
    def test_hold_patterns_spanish(self, text, should_match):
        """Test that a Spanish phrase is (or is not) detected as a hold start."""
        events = detect_events(_single_agent_turn_transcript(text))
        hold_starts = _events_of_type(events, EventType.HOLD_START)

        if should_match:
            assert hold_starts, f"Should match: {text}"
        else:
            assert not hold_starts, f"Should not match: {text}"

    @pytest.mark.parametrize(
        ("text", "should_match"),
        [
            ("Le voy a transferir con el departamento de ventas", True),
            ("Le paso con mi compañero", True),
            ("Le comunico con facturación", True),
            ("Va a ser transferido", True),
            ("Gracias por su paciencia", False),
        ],
    )
    def test_transfer_patterns_spanish(self, text, should_match):
        """Test that a Spanish phrase is (or is not) detected as a transfer."""
        events = detect_events(_single_agent_turn_transcript(text))
        transfers = _events_of_type(events, EventType.TRANSFER)

        if should_match:
            assert transfers, f"Should match: {text}"
        else:
            assert not transfers, f"Should not match: {text}"