Initial commit: frontend + backend integration

This commit is contained in:
Ignacio
2025-12-29 18:12:32 +01:00
commit 2cd6d6b95c
146 changed files with 31503 additions and 0 deletions

View File

@@ -0,0 +1,302 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Genesys Data Processing Script
Step 1: Data Cleaning
Step 2: Skill Grouping (Fuzzy Matching)
Step 3: Validation Report
Step 4: Export Clean Data & Mappings
"""
import sys
import io
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
import pandas as pd
import numpy as np
from difflib import SequenceMatcher
import unicodedata
import re
from datetime import datetime
import openpyxl
from openpyxl.styles import Font, PatternFill, Alignment
def normalize_text(text):
"""Normalize text: lowercase, remove extra spaces, normalize accents"""
if pd.isna(text):
return ""
text = str(text).strip()
# Remove extra spaces
text = re.sub(r'\s+', ' ', text)
# Lowercase
text = text.lower()
# Normalize unicode (remove accents)
text = unicodedata.normalize('NFKD', text)
text = text.encode('ascii', 'ignore').decode('utf-8')
return text
def correct_common_typos(text):
"""Fix common typos and variations"""
if not text:
return text
replacements = {
'telefonico': 'telefonico',
'telefónico': 'telefonico',
'teléfonico': 'telefonico',
'cobros': 'cobros',
'cobro': 'cobros',
'facturacion': 'facturacion',
'facturación': 'facturacion',
'información': 'informacion',
'informacion': 'informacion',
'consulta': 'consulta',
'consultas': 'consulta',
'soporte': 'soporte',
'soportes': 'soporte',
'contrato': 'contrato',
'contratos': 'contrato',
'averia': 'averia',
'averias': 'averia',
'automatizacion': 'automatizacion',
'automatización': 'automatizacion',
'reclamo': 'reclamo',
'reclamos': 'reclamo',
'gestion': 'gestion',
'gestión': 'gestion',
}
for typo, correction in replacements.items():
if typo in text:
text = text.replace(typo, correction)
return text
def similarity_ratio(a, b):
"""Calculate similarity between two strings (0-1)"""
return SequenceMatcher(None, a, b).ratio()
def group_similar_skills(skills, threshold=0.85):
"""Group similar skills using fuzzy matching"""
unique_skills = sorted(list(set(skills)))
skill_mapping = {}
grouped_skills = {}
used = set()
for i, skill1 in enumerate(unique_skills):
if skill1 in used:
continue
group = [skill1]
used.add(skill1)
# Find similar skills
for j, skill2 in enumerate(unique_skills):
if i != j and skill2 not in used:
ratio = similarity_ratio(skill1, skill2)
if ratio >= threshold:
group.append(skill2)
used.add(skill2)
# Use the first (alphabetically shortest) as canonical
canonical = min(group, key=lambda x: (len(x), x))
grouped_skills[canonical] = sorted(group)
for skill in group:
skill_mapping[skill] = canonical
return skill_mapping, grouped_skills
def main():
print("="*80)
print("GENESYS DATA PROCESSING - 4 STEPS")
print("="*80)
# ===== STEP 1: DATA CLEANING =====
print("\n[STEP 1] DATA CLEANING...")
print("-" * 80)
# Read Excel file
try:
df = pd.read_excel('data.xlsx')
print(f"[OK] Loaded data.xlsx: {len(df)} records")
except Exception as e:
print(f"[ERROR] Error reading file: {e}")
return
print(f" Columns: {list(df.columns)}")
initial_records = len(df)
# Store original data for comparison
df_original = df.copy()
# Normalize text columns
text_columns = df.select_dtypes(include=['object']).columns
for col in text_columns:
if col in df.columns:
df[col] = df[col].apply(normalize_text)
df[col] = df[col].apply(correct_common_typos)
print(f"[OK] Normalized all text columns: {len(text_columns)} columns")
# Remove duplicates
duplicates_before = len(df)
df = df.drop_duplicates()
duplicates_removed = duplicates_before - len(df)
print(f"[OK] Removed duplicates: {duplicates_removed} duplicate rows removed")
cleaned_records = len(df)
# ===== STEP 2: SKILL GROUPING =====
print("\n[STEP 2] SKILL GROUPING (Fuzzy Matching)...")
print("-" * 80)
# Identify skill column (likely 'queue_skill', 'skill', 'skills', etc.)
skill_column = None
for col in ['queue_skill', 'skill', 'skills', 'queue', 'category', 'type']:
if col in df.columns:
skill_column = col
break
if not skill_column:
# Find the column with most string values and use that
for col in text_columns:
if df[col].nunique() < len(df) * 0.5:
skill_column = col
break
if skill_column:
unique_skills_before = df[skill_column].nunique()
print(f"[OK] Identified skill column: '{skill_column}'")
print(f" Unique skills BEFORE grouping: {unique_skills_before}")
# Group similar skills
skill_mapping, grouped_skills = group_similar_skills(
df[skill_column].unique().tolist(),
threshold=0.80
)
# Apply mapping
df[skill_column] = df[skill_column].map(skill_mapping)
unique_skills_after = df[skill_column].nunique()
skills_grouped = unique_skills_before - unique_skills_after
print(f"[OK] Unique skills AFTER grouping: {unique_skills_after}")
print(f" Skills grouped: {skills_grouped}")
print(f" Reduction: {(skills_grouped/unique_skills_before)*100:.1f}%")
else:
print("[WARN] Warning: Could not identify skill column")
skill_mapping = {}
grouped_skills = {}
unique_skills_before = 0
unique_skills_after = 0
# ===== STEP 3: VALIDATION REPORT =====
print("\n[STEP 3] GENERATING VALIDATION REPORT...")
print("-" * 80)
report_lines = []
report_lines.append("="*80)
report_lines.append("GENESYS DATA CLEANING REPORT")
report_lines.append("="*80)
report_lines.append(f"\nGenerated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
report_lines.append("DATA QUALITY METRICS")
report_lines.append("-" * 80)
report_lines.append(f"Records before cleaning: {initial_records}")
report_lines.append(f"Records after cleaning: {cleaned_records}")
report_lines.append(f"Duplicate records removed: {duplicates_removed}")
report_lines.append(f"Record reduction: {(duplicates_removed/initial_records)*100:.2f}%")
report_lines.append(f"\nSKILL CONSOLIDATION")
report_lines.append("-" * 80)
report_lines.append(f"Unique skills before: {unique_skills_before}")
report_lines.append(f"Unique skills after: {unique_skills_after}")
report_lines.append(f"Skills grouped: {skills_grouped}")
report_lines.append(f"Consolidation rate: {(skills_grouped/unique_skills_before)*100:.2f}%")
report_lines.append(f"\nCLEANING OPERATIONS")
report_lines.append("-" * 80)
report_lines.append(f"[OK] Text normalization: {len(text_columns)} columns normalized")
report_lines.append(f"[OK] Typo correction: Applied to all text fields")
report_lines.append(f"[OK] Duplicate removal: {duplicates_removed} rows removed")
report_lines.append(f"[OK] Skill grouping: {len(skill_mapping)} original skills consolidated")
if skill_column:
report_lines.append(f"\nSKILL MAPPING (Top 20)")
report_lines.append("-" * 80)
# Show some examples of mappings
mapping_examples = {}
for orig, canonical in sorted(skill_mapping.items())[:20]:
if orig != canonical:
if canonical not in mapping_examples:
mapping_examples[canonical] = []
mapping_examples[canonical].append(orig)
for canonical, originals in sorted(mapping_examples.items()):
if len(originals) > 1:
report_lines.append(f"\n'{canonical}' (consolidated from {len(originals)} variants)")
for orig in sorted(originals)[:5]:
report_lines.append(f"{orig}")
if len(originals) > 5:
report_lines.append(f" ... and {len(originals)-5} more")
report_lines.append(f"\nFILE OUTPUT SUMMARY")
report_lines.append("-" * 80)
report_lines.append(f"[OK] datos-limpios.xlsx: {cleaned_records} cleaned records")
report_lines.append(f"[OK] skills-mapping.xlsx: Skill consolidation mapping")
report_lines.append(f"[OK] informe-limpieza.txt: This report")
report_lines.append(f"\nEND OF REPORT")
report_lines.append("="*80)
report_text = "\n".join(report_lines)
print(report_text)
# ===== STEP 4: EXPORT =====
print("\n[STEP 4] EXPORTING DATA & REPORTS...")
print("-" * 80)
# Export cleaned data
try:
df.to_excel('datos-limpios.xlsx', index=False)
print("[OK] Exported: datos-limpios.xlsx")
except Exception as e:
print(f"[ERROR] Error exporting cleaned data: {e}")
# Export skill mapping
try:
if skill_mapping:
mapping_df = pd.DataFrame([
{'Original Skill': orig, 'Canonical Skill': canonical, 'Group Size': len(grouped_skills.get(canonical, []))}
for orig, canonical in sorted(skill_mapping.items())
])
mapping_df.to_excel('skills-mapping.xlsx', index=False)
print("[OK] Exported: skills-mapping.xlsx")
else:
print("[WARN] No skill mapping to export")
except Exception as e:
print(f"[ERROR] Error exporting skill mapping: {e}")
# Export report
try:
with open('informe-limpieza.txt', 'w', encoding='utf-8') as f:
f.write(report_text)
print("[OK] Exported: informe-limpieza.txt")
except Exception as e:
print(f"[ERROR] Error exporting report: {e}")
print("\n" + "="*80)
print("PROCESSING COMPLETE!")
print("="*80)
print(f"\nSummary:")
print(f" • Records: {initial_records}{cleaned_records} (-{duplicates_removed})")
print(f" • Skills: {unique_skills_before}{unique_skills_after} (-{skills_grouped})")
print(f" • All files saved to current directory")
if __name__ == "__main__":
main()