Initial commit: frontend + backend integration
This commit is contained in:
302
frontend/process_genesys_data.py
Normal file
302
frontend/process_genesys_data.py
Normal file
@@ -0,0 +1,302 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Genesys Data Processing Script
|
||||
Step 1: Data Cleaning
|
||||
Step 2: Skill Grouping (Fuzzy Matching)
|
||||
Step 3: Validation Report
|
||||
Step 4: Export Clean Data & Mappings
|
||||
"""
|
||||
|
||||
import sys
import io

# Force stdout to UTF-8 so accented / box-drawing characters print correctly
# even when the console encoding is not UTF-8 (e.g. Windows cp1252).
# reconfigure() (Python 3.7+) is safer than rebinding sys.stdout to a new
# TextIOWrapper: the discarded wrapper could close the shared underlying
# buffer when it is garbage-collected. The hasattr guard keeps this working
# when stdout has been replaced by a capture object (tests, some IDEs).
if hasattr(sys.stdout, 'reconfigure'):
    sys.stdout.reconfigure(encoding='utf-8')
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from difflib import SequenceMatcher
|
||||
import unicodedata
|
||||
import re
|
||||
from datetime import datetime
|
||||
import openpyxl
|
||||
from openpyxl.styles import Font, PatternFill, Alignment
|
||||
|
||||
def normalize_text(text):
    """Return a canonical form of *text*.

    The value is stringified, trimmed, lowercased, internal whitespace is
    collapsed to single spaces, and accented characters are reduced to
    their plain-ASCII base letters. Missing values (NaN/None) map to "".
    """
    if pd.isna(text):
        return ""

    cleaned = re.sub(r'\s+', ' ', str(text).strip()).lower()
    # NFKD decomposition splits accented letters into base letter plus
    # combining mark; the ASCII round-trip then drops the non-ASCII marks,
    # so e.g. 'ó' becomes 'o'.
    decomposed = unicodedata.normalize('NFKD', cleaned)
    return decomposed.encode('ascii', 'ignore').decode('utf-8')
|
||||
|
||||
def correct_common_typos(text):
    """Replace known typos/variants of domain terms with a canonical form.

    Matching is done on whole words only (regex \\b anchors). The previous
    plain ``str.replace`` implementation corrupted words that contain a
    shorter key as a prefix: with 'cobro' -> 'cobros' in the table, the
    already-canonical 'cobros' became 'cobross'.

    Args:
        text: Normalized (typically lowercase) text, or a falsy value.

    Returns:
        The text with each known variant replaced; falsy input unchanged.
    """
    if not text:
        return text

    # variant -> canonical form. Identity entries document the canonical
    # spelling; accented keys cover inputs not run through normalize_text.
    replacements = {
        'telefonico': 'telefonico',
        'telefónico': 'telefonico',
        'teléfonico': 'telefonico',
        'cobros': 'cobros',
        'cobro': 'cobros',
        'facturacion': 'facturacion',
        'facturación': 'facturacion',
        'información': 'informacion',
        'informacion': 'informacion',
        'consulta': 'consulta',
        'consultas': 'consulta',
        'soporte': 'soporte',
        'soportes': 'soporte',
        'contrato': 'contrato',
        'contratos': 'contrato',
        'averia': 'averia',
        'averias': 'averia',
        'automatizacion': 'automatizacion',
        'automatización': 'automatizacion',
        'reclamo': 'reclamo',
        'reclamos': 'reclamo',
        'gestion': 'gestion',
        'gestión': 'gestion',
    }

    for variant, canonical in replacements.items():
        if variant == canonical:
            continue  # identity mapping: nothing to rewrite
        # Whole-word substitution so a key that is a prefix of another word
        # (e.g. 'cobro' inside 'cobros') cannot mangle the longer word.
        text = re.sub(r'\b' + re.escape(variant) + r'\b', canonical, text)

    return text
|
||||
|
||||
def similarity_ratio(a, b):
    """Return the difflib similarity of two strings, in the range [0, 1]."""
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()
|
||||
|
||||
def group_similar_skills(skills, threshold=0.85):
    """Cluster near-duplicate skill names using fuzzy string matching.

    Walks the sorted unique skills greedily: each not-yet-assigned skill
    seeds a group that absorbs every other unassigned skill whose difflib
    similarity reaches *threshold*. The shortest member (ties broken
    alphabetically) becomes the group's canonical name.

    Args:
        skills: Iterable of skill names (duplicates allowed).
        threshold: Minimum similarity ratio, in [0, 1], to join a group.

    Returns:
        Tuple ``(skill_mapping, grouped_skills)`` where ``skill_mapping``
        maps every original skill to its canonical name and
        ``grouped_skills`` maps each canonical name to the sorted member list.
    """
    unique_skills = sorted(set(skills))
    skill_mapping = {}
    grouped_skills = {}
    assigned = set()

    for idx, seed in enumerate(unique_skills):
        if seed in assigned:
            continue

        members = [seed]
        assigned.add(seed)

        # Pull in every other unassigned skill similar enough to the seed.
        for other_idx, candidate in enumerate(unique_skills):
            if other_idx == idx or candidate in assigned:
                continue
            if SequenceMatcher(None, seed, candidate).ratio() >= threshold:
                members.append(candidate)
                assigned.add(candidate)

        # Canonical name: shortest member, alphabetical order breaking ties.
        canonical = min(members, key=lambda s: (len(s), s))
        grouped_skills[canonical] = sorted(members)
        skill_mapping.update((member, canonical) for member in members)

    return skill_mapping, grouped_skills
|
||||
|
||||
def main():
    """Run the 4-step Genesys data-cleaning pipeline.

    Steps:
      1. Load ``data.xlsx``, normalize text columns, drop duplicate rows.
      2. Consolidate near-duplicate skill names via fuzzy matching.
      3. Build and print a validation report.
      4. Export cleaned data, the skill mapping, and the report.

    Errors are reported on stdout rather than raised; returns None.

    Fixes over the previous version: ``skills_grouped`` is now defined even
    when no skill column is found (was a NameError in Step 3), and all
    percentage computations are guarded against zero denominators.
    """
    print("=" * 80)
    print("GENESYS DATA PROCESSING - 4 STEPS")
    print("=" * 80)

    # ===== STEP 1: DATA CLEANING =====
    print("\n[STEP 1] DATA CLEANING...")
    print("-" * 80)

    # Read Excel file; bail out gracefully when it is missing or unreadable.
    try:
        df = pd.read_excel('data.xlsx')
        print(f"[OK] Loaded data.xlsx: {len(df)} records")
    except Exception as e:
        print(f"[ERROR] Error reading file: {e}")
        return

    print(f"   Columns: {list(df.columns)}")
    initial_records = len(df)

    # Normalize every text (object-dtype) column, then fix known typos.
    text_columns = df.select_dtypes(include=['object']).columns
    for col in text_columns:
        df[col] = df[col].apply(normalize_text)
        df[col] = df[col].apply(correct_common_typos)

    print(f"[OK] Normalized all text columns: {len(text_columns)} columns")

    # Remove exact-duplicate rows; duplicates often surface only after
    # normalization has unified spelling variants.
    duplicates_before = len(df)
    df = df.drop_duplicates()
    duplicates_removed = duplicates_before - len(df)
    print(f"[OK] Removed duplicates: {duplicates_removed} duplicate rows removed")

    cleaned_records = len(df)

    # ===== STEP 2: SKILL GROUPING =====
    print("\n[STEP 2] SKILL GROUPING (Fuzzy Matching)...")
    print("-" * 80)

    # Prefer well-known column names; otherwise fall back to the first text
    # column with enough repetition to look categorical (< 50% unique).
    skill_column = None
    for col in ['queue_skill', 'skill', 'skills', 'queue', 'category', 'type']:
        if col in df.columns:
            skill_column = col
            break

    if not skill_column:
        for col in text_columns:
            if df[col].nunique() < len(df) * 0.5:
                skill_column = col
                break

    skills_grouped = 0  # remains 0 when no skill column is identified
    if skill_column:
        unique_skills_before = df[skill_column].nunique()
        print(f"[OK] Identified skill column: '{skill_column}'")
        print(f"   Unique skills BEFORE grouping: {unique_skills_before}")

        # Group similar skills (threshold intentionally looser than the
        # function's 0.85 default to merge more spelling variants).
        skill_mapping, grouped_skills = group_similar_skills(
            df[skill_column].unique().tolist(),
            threshold=0.80
        )

        # Replace each skill by its canonical group representative.
        df[skill_column] = df[skill_column].map(skill_mapping)

        unique_skills_after = df[skill_column].nunique()
        skills_grouped = unique_skills_before - unique_skills_after

        print(f"[OK] Unique skills AFTER grouping: {unique_skills_after}")
        print(f"   Skills grouped: {skills_grouped}")
        if unique_skills_before:  # guard: empty data set has 0 skills
            print(f"   Reduction: {(skills_grouped/unique_skills_before)*100:.1f}%")
    else:
        print("[WARN] Warning: Could not identify skill column")
        skill_mapping = {}
        grouped_skills = {}
        unique_skills_before = 0
        unique_skills_after = 0

    # ===== STEP 3: VALIDATION REPORT =====
    print("\n[STEP 3] GENERATING VALIDATION REPORT...")
    print("-" * 80)

    # Percentages guarded: denominators are 0 for an empty input file or
    # when no skill column was found.
    reduction_pct = (duplicates_removed / initial_records * 100) if initial_records else 0.0
    consolidation_pct = (skills_grouped / unique_skills_before * 100) if unique_skills_before else 0.0

    report_lines = []
    report_lines.append("=" * 80)
    report_lines.append("GENESYS DATA CLEANING REPORT")
    report_lines.append("=" * 80)
    report_lines.append(f"\nGenerated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")

    report_lines.append("DATA QUALITY METRICS")
    report_lines.append("-" * 80)
    report_lines.append(f"Records before cleaning: {initial_records}")
    report_lines.append(f"Records after cleaning: {cleaned_records}")
    report_lines.append(f"Duplicate records removed: {duplicates_removed}")
    report_lines.append(f"Record reduction: {reduction_pct:.2f}%")

    report_lines.append("\nSKILL CONSOLIDATION")
    report_lines.append("-" * 80)
    report_lines.append(f"Unique skills before: {unique_skills_before}")
    report_lines.append(f"Unique skills after: {unique_skills_after}")
    report_lines.append(f"Skills grouped: {skills_grouped}")
    report_lines.append(f"Consolidation rate: {consolidation_pct:.2f}%")

    report_lines.append("\nCLEANING OPERATIONS")
    report_lines.append("-" * 80)
    report_lines.append(f"[OK] Text normalization: {len(text_columns)} columns normalized")
    report_lines.append("[OK] Typo correction: Applied to all text fields")
    report_lines.append(f"[OK] Duplicate removal: {duplicates_removed} rows removed")
    report_lines.append(f"[OK] Skill grouping: {len(skill_mapping)} original skills consolidated")

    if skill_column:
        report_lines.append("\nSKILL MAPPING (Top 20)")
        report_lines.append("-" * 80)

        # Per canonical skill, collect the variants that were renamed to it
        # (first 20 mappings only, to keep the report short).
        mapping_examples = {}
        for orig, canonical in sorted(skill_mapping.items())[:20]:
            if orig != canonical:
                mapping_examples.setdefault(canonical, []).append(orig)

        for canonical, originals in sorted(mapping_examples.items()):
            if len(originals) > 1:
                report_lines.append(f"\n'{canonical}' (consolidated from {len(originals)} variants)")
                for orig in sorted(originals)[:5]:
                    report_lines.append(f"   → {orig}")
                if len(originals) > 5:
                    report_lines.append(f"   ... and {len(originals)-5} more")

    report_lines.append("\nFILE OUTPUT SUMMARY")
    report_lines.append("-" * 80)
    report_lines.append(f"[OK] datos-limpios.xlsx: {cleaned_records} cleaned records")
    report_lines.append("[OK] skills-mapping.xlsx: Skill consolidation mapping")
    report_lines.append("[OK] informe-limpieza.txt: This report")

    report_lines.append("\nEND OF REPORT")
    report_lines.append("=" * 80)

    report_text = "\n".join(report_lines)
    print(report_text)

    # ===== STEP 4: EXPORT =====
    print("\n[STEP 4] EXPORTING DATA & REPORTS...")
    print("-" * 80)

    # Export cleaned data (each export is best-effort and independent).
    try:
        df.to_excel('datos-limpios.xlsx', index=False)
        print("[OK] Exported: datos-limpios.xlsx")
    except Exception as e:
        print(f"[ERROR] Error exporting cleaned data: {e}")

    # Export skill mapping
    try:
        if skill_mapping:
            mapping_df = pd.DataFrame([
                {'Original Skill': orig, 'Canonical Skill': canonical, 'Group Size': len(grouped_skills.get(canonical, []))}
                for orig, canonical in sorted(skill_mapping.items())
            ])
            mapping_df.to_excel('skills-mapping.xlsx', index=False)
            print("[OK] Exported: skills-mapping.xlsx")
        else:
            print("[WARN] No skill mapping to export")
    except Exception as e:
        print(f"[ERROR] Error exporting skill mapping: {e}")

    # Export report
    try:
        with open('informe-limpieza.txt', 'w', encoding='utf-8') as f:
            f.write(report_text)
        print("[OK] Exported: informe-limpieza.txt")
    except Exception as e:
        print(f"[ERROR] Error exporting report: {e}")

    print("\n" + "=" * 80)
    print("PROCESSING COMPLETE!")
    print("=" * 80)
    print("\nSummary:")
    print(f"   • Records: {initial_records} → {cleaned_records} (-{duplicates_removed})")
    print(f"   • Skills: {unique_skills_before} → {unique_skills_after} (-{skills_grouped})")
    print("   • All files saved to current directory")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user