Initial commit: frontend + backend integration
This commit is contained in:
302
frontend/process_genesys_data.py
Normal file
302
frontend/process_genesys_data.py
Normal file
@@ -0,0 +1,302 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Genesys Data Processing Script
|
||||
Step 1: Data Cleaning
|
||||
Step 2: Skill Grouping (Fuzzy Matching)
|
||||
Step 3: Validation Report
|
||||
Step 4: Export Clean Data & Mappings
|
||||
"""
|
||||
|
||||
import sys
import io

# Force stdout to UTF-8 so accented / box-drawing characters print correctly
# even when the console encoding is not UTF-8 (e.g. Windows cp1252).
# reconfigure() (Python 3.7+) is safer than rebinding sys.stdout to a new
# TextIOWrapper: the discarded wrapper could close the shared underlying
# buffer when it is garbage-collected. The hasattr guard keeps this working
# when stdout has been replaced by a capture object (tests, some IDEs).
if hasattr(sys.stdout, 'reconfigure'):
    sys.stdout.reconfigure(encoding='utf-8')
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from difflib import SequenceMatcher
|
||||
import unicodedata
|
||||
import re
|
||||
from datetime import datetime
|
||||
import openpyxl
|
||||
from openpyxl.styles import Font, PatternFill, Alignment
|
||||
|
||||
def normalize_text(text):
    """Return a canonical form of *text*.

    The value is stringified, trimmed, lowercased, internal whitespace is
    collapsed to single spaces, and accented characters are reduced to
    their plain-ASCII base letters. Missing values (NaN/None) map to "".
    """
    if pd.isna(text):
        return ""

    cleaned = re.sub(r'\s+', ' ', str(text).strip()).lower()
    # NFKD decomposition splits accented letters into base letter plus
    # combining mark; the ASCII round-trip then drops the non-ASCII marks,
    # so e.g. 'ó' becomes 'o'.
    decomposed = unicodedata.normalize('NFKD', cleaned)
    return decomposed.encode('ascii', 'ignore').decode('utf-8')
|
||||
|
||||
def correct_common_typos(text):
    """Replace known typos/variants of domain terms with a canonical form.

    Matching is done on whole words only (regex \\b anchors). The previous
    plain ``str.replace`` implementation corrupted words that contain a
    shorter key as a prefix: with 'cobro' -> 'cobros' in the table, the
    already-canonical 'cobros' became 'cobross'.

    Args:
        text: Normalized (typically lowercase) text, or a falsy value.

    Returns:
        The text with each known variant replaced; falsy input unchanged.
    """
    if not text:
        return text

    # variant -> canonical form. Identity entries document the canonical
    # spelling; accented keys cover inputs not run through normalize_text.
    replacements = {
        'telefonico': 'telefonico',
        'telefónico': 'telefonico',
        'teléfonico': 'telefonico',
        'cobros': 'cobros',
        'cobro': 'cobros',
        'facturacion': 'facturacion',
        'facturación': 'facturacion',
        'información': 'informacion',
        'informacion': 'informacion',
        'consulta': 'consulta',
        'consultas': 'consulta',
        'soporte': 'soporte',
        'soportes': 'soporte',
        'contrato': 'contrato',
        'contratos': 'contrato',
        'averia': 'averia',
        'averias': 'averia',
        'automatizacion': 'automatizacion',
        'automatización': 'automatizacion',
        'reclamo': 'reclamo',
        'reclamos': 'reclamo',
        'gestion': 'gestion',
        'gestión': 'gestion',
    }

    for variant, canonical in replacements.items():
        if variant == canonical:
            continue  # identity mapping: nothing to rewrite
        # Whole-word substitution so a key that is a prefix of another word
        # (e.g. 'cobro' inside 'cobros') cannot mangle the longer word.
        text = re.sub(r'\b' + re.escape(variant) + r'\b', canonical, text)

    return text
|
||||
|
||||
def similarity_ratio(a, b):
    """Return the difflib similarity of two strings, in the range [0, 1]."""
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()
|
||||
|
||||
def group_similar_skills(skills, threshold=0.85):
    """Cluster near-duplicate skill names using fuzzy string matching.

    Walks the sorted unique skills greedily: each not-yet-assigned skill
    seeds a group that absorbs every other unassigned skill whose difflib
    similarity reaches *threshold*. The shortest member (ties broken
    alphabetically) becomes the group's canonical name.

    Args:
        skills: Iterable of skill names (duplicates allowed).
        threshold: Minimum similarity ratio, in [0, 1], to join a group.

    Returns:
        Tuple ``(skill_mapping, grouped_skills)`` where ``skill_mapping``
        maps every original skill to its canonical name and
        ``grouped_skills`` maps each canonical name to the sorted member list.
    """
    unique_skills = sorted(set(skills))
    skill_mapping = {}
    grouped_skills = {}
    assigned = set()

    for idx, seed in enumerate(unique_skills):
        if seed in assigned:
            continue

        members = [seed]
        assigned.add(seed)

        # Pull in every other unassigned skill similar enough to the seed.
        for other_idx, candidate in enumerate(unique_skills):
            if other_idx == idx or candidate in assigned:
                continue
            if SequenceMatcher(None, seed, candidate).ratio() >= threshold:
                members.append(candidate)
                assigned.add(candidate)

        # Canonical name: shortest member, alphabetical order breaking ties.
        canonical = min(members, key=lambda s: (len(s), s))
        grouped_skills[canonical] = sorted(members)
        skill_mapping.update((member, canonical) for member in members)

    return skill_mapping, grouped_skills
|
||||
|
||||
def main():
    """Run the 4-step Genesys data-cleaning pipeline.

    Steps:
      1. Load ``data.xlsx``, normalize text columns, drop duplicate rows.
      2. Consolidate near-duplicate skill names via fuzzy matching.
      3. Build and print a validation report.
      4. Export cleaned data, the skill mapping, and the report.

    Errors are reported on stdout rather than raised; returns None.

    Fixes over the previous version: ``skills_grouped`` is now defined even
    when no skill column is found (was a NameError in Step 3), and all
    percentage computations are guarded against zero denominators.
    """
    print("=" * 80)
    print("GENESYS DATA PROCESSING - 4 STEPS")
    print("=" * 80)

    # ===== STEP 1: DATA CLEANING =====
    print("\n[STEP 1] DATA CLEANING...")
    print("-" * 80)

    # Read Excel file; bail out gracefully when it is missing or unreadable.
    try:
        df = pd.read_excel('data.xlsx')
        print(f"[OK] Loaded data.xlsx: {len(df)} records")
    except Exception as e:
        print(f"[ERROR] Error reading file: {e}")
        return

    print(f"   Columns: {list(df.columns)}")
    initial_records = len(df)

    # Normalize every text (object-dtype) column, then fix known typos.
    text_columns = df.select_dtypes(include=['object']).columns
    for col in text_columns:
        df[col] = df[col].apply(normalize_text)
        df[col] = df[col].apply(correct_common_typos)

    print(f"[OK] Normalized all text columns: {len(text_columns)} columns")

    # Remove exact-duplicate rows; duplicates often surface only after
    # normalization has unified spelling variants.
    duplicates_before = len(df)
    df = df.drop_duplicates()
    duplicates_removed = duplicates_before - len(df)
    print(f"[OK] Removed duplicates: {duplicates_removed} duplicate rows removed")

    cleaned_records = len(df)

    # ===== STEP 2: SKILL GROUPING =====
    print("\n[STEP 2] SKILL GROUPING (Fuzzy Matching)...")
    print("-" * 80)

    # Prefer well-known column names; otherwise fall back to the first text
    # column with enough repetition to look categorical (< 50% unique).
    skill_column = None
    for col in ['queue_skill', 'skill', 'skills', 'queue', 'category', 'type']:
        if col in df.columns:
            skill_column = col
            break

    if not skill_column:
        for col in text_columns:
            if df[col].nunique() < len(df) * 0.5:
                skill_column = col
                break

    skills_grouped = 0  # remains 0 when no skill column is identified
    if skill_column:
        unique_skills_before = df[skill_column].nunique()
        print(f"[OK] Identified skill column: '{skill_column}'")
        print(f"   Unique skills BEFORE grouping: {unique_skills_before}")

        # Group similar skills (threshold intentionally looser than the
        # function's 0.85 default to merge more spelling variants).
        skill_mapping, grouped_skills = group_similar_skills(
            df[skill_column].unique().tolist(),
            threshold=0.80
        )

        # Replace each skill by its canonical group representative.
        df[skill_column] = df[skill_column].map(skill_mapping)

        unique_skills_after = df[skill_column].nunique()
        skills_grouped = unique_skills_before - unique_skills_after

        print(f"[OK] Unique skills AFTER grouping: {unique_skills_after}")
        print(f"   Skills grouped: {skills_grouped}")
        if unique_skills_before:  # guard: empty data set has 0 skills
            print(f"   Reduction: {(skills_grouped/unique_skills_before)*100:.1f}%")
    else:
        print("[WARN] Warning: Could not identify skill column")
        skill_mapping = {}
        grouped_skills = {}
        unique_skills_before = 0
        unique_skills_after = 0

    # ===== STEP 3: VALIDATION REPORT =====
    print("\n[STEP 3] GENERATING VALIDATION REPORT...")
    print("-" * 80)

    # Percentages guarded: denominators are 0 for an empty input file or
    # when no skill column was found.
    reduction_pct = (duplicates_removed / initial_records * 100) if initial_records else 0.0
    consolidation_pct = (skills_grouped / unique_skills_before * 100) if unique_skills_before else 0.0

    report_lines = []
    report_lines.append("=" * 80)
    report_lines.append("GENESYS DATA CLEANING REPORT")
    report_lines.append("=" * 80)
    report_lines.append(f"\nGenerated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")

    report_lines.append("DATA QUALITY METRICS")
    report_lines.append("-" * 80)
    report_lines.append(f"Records before cleaning: {initial_records}")
    report_lines.append(f"Records after cleaning: {cleaned_records}")
    report_lines.append(f"Duplicate records removed: {duplicates_removed}")
    report_lines.append(f"Record reduction: {reduction_pct:.2f}%")

    report_lines.append("\nSKILL CONSOLIDATION")
    report_lines.append("-" * 80)
    report_lines.append(f"Unique skills before: {unique_skills_before}")
    report_lines.append(f"Unique skills after: {unique_skills_after}")
    report_lines.append(f"Skills grouped: {skills_grouped}")
    report_lines.append(f"Consolidation rate: {consolidation_pct:.2f}%")

    report_lines.append("\nCLEANING OPERATIONS")
    report_lines.append("-" * 80)
    report_lines.append(f"[OK] Text normalization: {len(text_columns)} columns normalized")
    report_lines.append("[OK] Typo correction: Applied to all text fields")
    report_lines.append(f"[OK] Duplicate removal: {duplicates_removed} rows removed")
    report_lines.append(f"[OK] Skill grouping: {len(skill_mapping)} original skills consolidated")

    if skill_column:
        report_lines.append("\nSKILL MAPPING (Top 20)")
        report_lines.append("-" * 80)

        # Per canonical skill, collect the variants that were renamed to it
        # (first 20 mappings only, to keep the report short).
        mapping_examples = {}
        for orig, canonical in sorted(skill_mapping.items())[:20]:
            if orig != canonical:
                mapping_examples.setdefault(canonical, []).append(orig)

        for canonical, originals in sorted(mapping_examples.items()):
            if len(originals) > 1:
                report_lines.append(f"\n'{canonical}' (consolidated from {len(originals)} variants)")
                for orig in sorted(originals)[:5]:
                    report_lines.append(f"   → {orig}")
                if len(originals) > 5:
                    report_lines.append(f"   ... and {len(originals)-5} more")

    report_lines.append("\nFILE OUTPUT SUMMARY")
    report_lines.append("-" * 80)
    report_lines.append(f"[OK] datos-limpios.xlsx: {cleaned_records} cleaned records")
    report_lines.append("[OK] skills-mapping.xlsx: Skill consolidation mapping")
    report_lines.append("[OK] informe-limpieza.txt: This report")

    report_lines.append("\nEND OF REPORT")
    report_lines.append("=" * 80)

    report_text = "\n".join(report_lines)
    print(report_text)

    # ===== STEP 4: EXPORT =====
    print("\n[STEP 4] EXPORTING DATA & REPORTS...")
    print("-" * 80)

    # Export cleaned data (each export is best-effort and independent).
    try:
        df.to_excel('datos-limpios.xlsx', index=False)
        print("[OK] Exported: datos-limpios.xlsx")
    except Exception as e:
        print(f"[ERROR] Error exporting cleaned data: {e}")

    # Export skill mapping
    try:
        if skill_mapping:
            mapping_df = pd.DataFrame([
                {'Original Skill': orig, 'Canonical Skill': canonical, 'Group Size': len(grouped_skills.get(canonical, []))}
                for orig, canonical in sorted(skill_mapping.items())
            ])
            mapping_df.to_excel('skills-mapping.xlsx', index=False)
            print("[OK] Exported: skills-mapping.xlsx")
        else:
            print("[WARN] No skill mapping to export")
    except Exception as e:
        print(f"[ERROR] Error exporting skill mapping: {e}")

    # Export report
    try:
        with open('informe-limpieza.txt', 'w', encoding='utf-8') as f:
            f.write(report_text)
        print("[OK] Exported: informe-limpieza.txt")
    except Exception as e:
        print(f"[ERROR] Error exporting report: {e}")

    print("\n" + "=" * 80)
    print("PROCESSING COMPLETE!")
    print("=" * 80)
    print("\nSummary:")
    print(f"   • Records: {initial_records} → {cleaned_records} (-{duplicates_removed})")
    print(f"   • Skills: {unique_skills_before} → {unique_skills_after} (-{skills_grouped})")
    print("   • All files saved to current directory")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user