#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ Genesys Data Processing Script Step 1: Data Cleaning Step 2: Skill Grouping (Fuzzy Matching) Step 3: Validation Report Step 4: Export Clean Data & Mappings """ import sys import io sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') import pandas as pd import numpy as np from difflib import SequenceMatcher import unicodedata import re from datetime import datetime import openpyxl from openpyxl.styles import Font, PatternFill, Alignment def normalize_text(text): """Normalize text: lowercase, remove extra spaces, normalize accents""" if pd.isna(text): return "" text = str(text).strip() # Remove extra spaces text = re.sub(r'\s+', ' ', text) # Lowercase text = text.lower() # Normalize unicode (remove accents) text = unicodedata.normalize('NFKD', text) text = text.encode('ascii', 'ignore').decode('utf-8') return text def correct_common_typos(text): """Fix common typos and variations""" if not text: return text replacements = { 'telefonico': 'telefonico', 'telefónico': 'telefonico', 'teléfonico': 'telefonico', 'cobros': 'cobros', 'cobro': 'cobros', 'facturacion': 'facturacion', 'facturación': 'facturacion', 'información': 'informacion', 'informacion': 'informacion', 'consulta': 'consulta', 'consultas': 'consulta', 'soporte': 'soporte', 'soportes': 'soporte', 'contrato': 'contrato', 'contratos': 'contrato', 'averia': 'averia', 'averias': 'averia', 'automatizacion': 'automatizacion', 'automatización': 'automatizacion', 'reclamo': 'reclamo', 'reclamos': 'reclamo', 'gestion': 'gestion', 'gestión': 'gestion', } for typo, correction in replacements.items(): if typo in text: text = text.replace(typo, correction) return text def similarity_ratio(a, b): """Calculate similarity between two strings (0-1)""" return SequenceMatcher(None, a, b).ratio() def group_similar_skills(skills, threshold=0.85): """Group similar skills using fuzzy matching""" unique_skills = sorted(list(set(skills))) skill_mapping = {} grouped_skills = {} used = set() for i, skill1 in enumerate(unique_skills): if skill1 in used: continue group = [skill1] used.add(skill1) # Find similar skills for j, skill2 in enumerate(unique_skills): if i != j and skill2 not in used: ratio = similarity_ratio(skill1, skill2) if ratio >= threshold: group.append(skill2) used.add(skill2) # Use the first (alphabetically shortest) as canonical canonical = min(group, key=lambda x: (len(x), x)) grouped_skills[canonical] = sorted(group) for skill in group: skill_mapping[skill] = canonical return skill_mapping, grouped_skills def main(): print("="*80) print("GENESYS DATA PROCESSING - 4 STEPS") print("="*80) # ===== STEP 1: DATA CLEANING ===== print("\n[STEP 1] DATA CLEANING...") print("-" * 80) # Read Excel file try: df = pd.read_excel('data.xlsx') print(f"[OK] Loaded data.xlsx: {len(df)} records") except Exception as e: print(f"[ERROR] Error reading file: {e}") return print(f" Columns: {list(df.columns)}") initial_records = len(df) # Store original data for comparison df_original = df.copy() # Normalize text columns text_columns = df.select_dtypes(include=['object']).columns for col in text_columns: if col in df.columns: df[col] = df[col].apply(normalize_text) df[col] = df[col].apply(correct_common_typos) print(f"[OK] Normalized all text columns: {len(text_columns)} columns") # Remove duplicates duplicates_before = len(df) df = df.drop_duplicates() duplicates_removed = duplicates_before - len(df) print(f"[OK] Removed duplicates: {duplicates_removed} duplicate rows removed") cleaned_records = len(df) # ===== STEP 2: SKILL GROUPING ===== print("\n[STEP 2] SKILL GROUPING (Fuzzy Matching)...") print("-" * 80) # Identify skill column (likely 'queue_skill', 'skill', 'skills', etc.) skill_column = None for col in ['queue_skill', 'skill', 'skills', 'queue', 'category', 'type']: if col in df.columns: skill_column = col break if not skill_column: # Find the column with most string values and use that for col in text_columns: if df[col].nunique() < len(df) * 0.5: skill_column = col break if skill_column: unique_skills_before = df[skill_column].nunique() print(f"[OK] Identified skill column: '{skill_column}'") print(f" Unique skills BEFORE grouping: {unique_skills_before}") # Group similar skills skill_mapping, grouped_skills = group_similar_skills( df[skill_column].unique().tolist(), threshold=0.80 ) # Apply mapping df[skill_column] = df[skill_column].map(skill_mapping) unique_skills_after = df[skill_column].nunique() skills_grouped = unique_skills_before - unique_skills_after print(f"[OK] Unique skills AFTER grouping: {unique_skills_after}") print(f" Skills grouped: {skills_grouped}") print(f" Reduction: {(skills_grouped/unique_skills_before)*100:.1f}%") else: print("[WARN] Warning: Could not identify skill column") skill_mapping = {} grouped_skills = {} unique_skills_before = 0 unique_skills_after = 0 # ===== STEP 3: VALIDATION REPORT ===== print("\n[STEP 3] GENERATING VALIDATION REPORT...") print("-" * 80) report_lines = [] report_lines.append("="*80) report_lines.append("GENESYS DATA CLEANING REPORT") report_lines.append("="*80) report_lines.append(f"\nGenerated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n") report_lines.append("DATA QUALITY METRICS") report_lines.append("-" * 80) report_lines.append(f"Records before cleaning: {initial_records}") report_lines.append(f"Records after cleaning: {cleaned_records}") report_lines.append(f"Duplicate records removed: {duplicates_removed}") report_lines.append(f"Record reduction: {(duplicates_removed/initial_records)*100:.2f}%") report_lines.append(f"\nSKILL CONSOLIDATION") report_lines.append("-" * 80) report_lines.append(f"Unique skills before: {unique_skills_before}") report_lines.append(f"Unique skills after: {unique_skills_after}") report_lines.append(f"Skills grouped: {skills_grouped}") report_lines.append(f"Consolidation rate: {(skills_grouped/unique_skills_before)*100:.2f}%") report_lines.append(f"\nCLEANING OPERATIONS") report_lines.append("-" * 80) report_lines.append(f"[OK] Text normalization: {len(text_columns)} columns normalized") report_lines.append(f"[OK] Typo correction: Applied to all text fields") report_lines.append(f"[OK] Duplicate removal: {duplicates_removed} rows removed") report_lines.append(f"[OK] Skill grouping: {len(skill_mapping)} original skills consolidated") if skill_column: report_lines.append(f"\nSKILL MAPPING (Top 20)") report_lines.append("-" * 80) # Show some examples of mappings mapping_examples = {} for orig, canonical in sorted(skill_mapping.items())[:20]: if orig != canonical: if canonical not in mapping_examples: mapping_examples[canonical] = [] mapping_examples[canonical].append(orig) for canonical, originals in sorted(mapping_examples.items()): if len(originals) > 1: report_lines.append(f"\n'{canonical}' (consolidated from {len(originals)} variants)") for orig in sorted(originals)[:5]: report_lines.append(f" → {orig}") if len(originals) > 5: report_lines.append(f" ... and {len(originals)-5} more") report_lines.append(f"\nFILE OUTPUT SUMMARY") report_lines.append("-" * 80) report_lines.append(f"[OK] datos-limpios.xlsx: {cleaned_records} cleaned records") report_lines.append(f"[OK] skills-mapping.xlsx: Skill consolidation mapping") report_lines.append(f"[OK] informe-limpieza.txt: This report") report_lines.append(f"\nEND OF REPORT") report_lines.append("="*80) report_text = "\n".join(report_lines) print(report_text) # ===== STEP 4: EXPORT ===== print("\n[STEP 4] EXPORTING DATA & REPORTS...") print("-" * 80) # Export cleaned data try: df.to_excel('datos-limpios.xlsx', index=False) print("[OK] Exported: datos-limpios.xlsx") except Exception as e: print(f"[ERROR] Error exporting cleaned data: {e}") # Export skill mapping try: if skill_mapping: mapping_df = pd.DataFrame([ {'Original Skill': orig, 'Canonical Skill': canonical, 'Group Size': len(grouped_skills.get(canonical, []))} for orig, canonical in sorted(skill_mapping.items()) ]) mapping_df.to_excel('skills-mapping.xlsx', index=False) print("[OK] Exported: skills-mapping.xlsx") else: print("[WARN] No skill mapping to export") except Exception as e: print(f"[ERROR] Error exporting skill mapping: {e}") # Export report try: with open('informe-limpieza.txt', 'w', encoding='utf-8') as f: f.write(report_text) print("[OK] Exported: informe-limpieza.txt") except Exception as e: print(f"[ERROR] Error exporting report: {e}") print("\n" + "="*80) print("PROCESSING COMPLETE!") print("="*80) print(f"\nSummary:") print(f" • Records: {initial_records} → {cleaned_records} (-{duplicates_removed})") print(f" • Skills: {unique_skills_before} → {unique_skills_after} (-{skills_grouped})") print(f" • All files saved to current directory") if __name__ == "__main__": main()