Initial commit - ACME demo version

This commit is contained in:
sujucu70
2026-02-04 11:08:21 +01:00
commit 1bb0765766
180 changed files with 52249 additions and 0 deletions

View File

@@ -0,0 +1,55 @@
"""
beyond_metrics package
======================
Capa pública del sistema BeyondMetrics.
Expone:
- Dimensiones (Volumetría, Eficiencia, ...)
- Pipeline principal
- Conectores de IO (local, S3, ...)
"""
from .dimensions import (
VolumetriaMetrics,
OperationalPerformanceMetrics,
SatisfactionExperienceMetrics,
EconomyCostMetrics,
)
from .pipeline import (
BeyondMetricsPipeline,
build_pipeline,
load_dimensions_config, # opcional, pero útil
)
from .io import (
DataSource,
ResultsSink,
LocalDataSource,
LocalResultsSink,
S3DataSource,
S3ResultsSink,
# si has añadido GoogleDrive, puedes exponerlo aquí también:
# GoogleDriveDataSource,
# GoogleDriveResultsSink,
)
__all__ = [
# Dimensiones
"VolumetriaMetrics",
"OperationalPerformanceMetrics",
"SatisfactionExperienceMetrics",
"EconomyCostMetrics",
# Pipeline
"BeyondMetricsPipeline",
"build_pipeline",
"load_dimensions_config",
# IO
"DataSource",
"ResultsSink",
"LocalDataSource",
"LocalResultsSink",
"S3DataSource",
"S3ResultsSink",
# "GoogleDriveDataSource",
# "GoogleDriveResultsSink",
]

View File

@@ -0,0 +1,310 @@
from __future__ import annotations
import json
import os
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, Optional, Sequence
from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas
from reportlab.lib.utils import ImageReader
from openai import OpenAI
# Default system prompt: frames the model as a contact-center consultant that
# turns BeyondMetrics JSON results into a business-oriented report.
# The text is in Spanish on purpose — the generated report targets
# Spanish-speaking clients.
DEFAULT_SYSTEM_PROMPT = (
    "Eres un consultor experto en contact centers. "
    "Vas a recibir resultados analíticos de un sistema de métricas "
    "(BeyondMetrics) en formato JSON. Tu tarea es generar un informe claro, "
    "accionable y orientado a negocio, destacando los principales hallazgos, "
    "riesgos y oportunidades de mejora."
)
@dataclass
class ReportAgentConfig:
    """
    Basic configuration for the report agent.

    openai_api_key:
        May be passed explicitly or read from the OPENAI_API_KEY
        environment variable.
    model:
        ChatGPT model to use, e.g. 'gpt-4.1-mini' or similar.
    system_prompt:
        System prompt used to control the style of the report.
    """
    openai_api_key: Optional[str] = None
    model: str = "gpt-4.1-mini"
    system_prompt: str = DEFAULT_SYSTEM_PROMPT
class BeyondMetricsReportAgent:
    """
    Minimal agent that:
    1) Reads the results JSON of a BeyondMetrics run.
    2) Builds a prompt from those results.
    3) Calls ChatGPT to generate the report text.
    4) Writes the report to a PDF on disk, EMBEDDING the PNG images
       produced by the pipeline as annexes.
    MVP: focused on text + embedded figures.
    """

    def __init__(self, config: Optional[ReportAgentConfig] = None) -> None:
        """Store the configuration and build the OpenAI client.

        Raises:
            RuntimeError: if no API key is available either in the config
                or in the OPENAI_API_KEY environment variable.
        """
        self.config = config or ReportAgentConfig()
        api_key = self.config.openai_api_key or os.getenv("OPENAI_API_KEY")
        if not api_key:
            raise RuntimeError(
                "Falta la API key de OpenAI. "
                "Pásala en ReportAgentConfig(openai_api_key=...) o "
                "define la variable de entorno OPENAI_API_KEY."
            )
        # Client for the new-style OpenAI API.
        self._client = OpenAI(api_key=api_key)

    # ------------------------------------------------------------------
    # Main public API
    # ------------------------------------------------------------------
    def generate_pdf_report(
        self,
        run_base: str,
        output_pdf_path: Optional[str] = None,
        extra_user_prompt: str = "",
    ) -> str:
        """
        Generate a PDF report from a results folder.

        Parameters:
        - run_base:
            Base folder of the run. Must contain at least 'results.json'
            and, optionally, PNG images produced by the pipeline.
        - output_pdf_path:
            Full path of the output PDF. If None,
            'beyondmetrics_report.pdf' is created inside run_base.
        - extra_user_prompt:
            Extra text to fine-tune the request to the agent
            (e.g. "emphasize efficiency and SLA", etc.)

        Returns:
        - The path of the generated PDF.

        Raises:
        - FileNotFoundError: if run_base does not contain 'results.json'.
        """
        run_dir = Path(run_base)
        results_json = run_dir / "results.json"
        if not results_json.exists():
            raise FileNotFoundError(
                f"No se ha encontrado {results_json}. "
                "Asegúrate de ejecutar primero el pipeline."
            )
        # 1) Read the results JSON
        with results_json.open("r", encoding="utf-8") as f:
            results_data: Dict[str, Any] = json.load(f)
        # 2) Collect the generated images (sorted for a stable annex order)
        image_files = sorted(run_dir.glob("*.png"))
        # 3) Build the user prompt
        user_prompt = self._build_user_prompt(
            results=results_data,
            image_files=[p.name for p in image_files],
            extra_user_prompt=extra_user_prompt,
        )
        # 4) Call ChatGPT to obtain the report text
        report_text = self._call_chatgpt(user_prompt)
        # 5) Create the PDF with text + embedded images
        if output_pdf_path is None:
            output_pdf_path = str(run_dir / "beyondmetrics_report.pdf")
        self._write_pdf(output_pdf_path, report_text, image_files)
        return output_pdf_path

    # ------------------------------------------------------------------
    # Prompt construction
    # ------------------------------------------------------------------
    def _build_user_prompt(
        self,
        results: Dict[str, Any],
        image_files: Sequence[str],
        extra_user_prompt: str = "",
    ) -> str:
        """
        Build the user message sent to the model.
        For an MVP we serialize the whole results JSON; it can be
        summarized later if the JSON grows too large.
        """
        results_str = json.dumps(results, indent=2, ensure_ascii=False)
        images_section = (
            "Imágenes generadas en la ejecución:\n"
            + "\n".join(f"- {name}" for name in image_files)
            if image_files
            else "No se han generado imágenes en esta ejecución."
        )
        extra = (
            f"\n\nInstrucciones adicionales del usuario:\n{extra_user_prompt}"
            if extra_user_prompt
            else ""
        )
        prompt = (
            "A continuación te proporciono los resultados de una ejecución de BeyondMetrics "
            "en formato JSON. Debes elaborar un INFORME EJECUTIVO para un cliente de "
            "contact center. El informe debe incluir:\n"
            "- Resumen ejecutivo en lenguaje de negocio.\n"
            "- Principales hallazgos por dimensión.\n"
            "- Riesgos o problemas detectados.\n"
            "- Recomendaciones accionables.\n\n"
            "Resultados (JSON):\n"
            f"{results_str}\n\n"
            f"{images_section}"
            f"{extra}"
        )
        return prompt

    # ------------------------------------------------------------------
    # ChatGPT call (new API)
    # ------------------------------------------------------------------
    def _call_chatgpt(self, user_prompt: str) -> str:
        """
        Call the ChatGPT model and return the content of the reply.
        Implemented with the new OpenAI API.

        Raises:
            RuntimeError: if the model reply carries no text content.
        """
        resp = self._client.chat.completions.create(
            model=self.config.model,
            messages=[
                {"role": "system", "content": self.config.system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            temperature=0.3,
        )
        content = resp.choices[0].message.content
        if not isinstance(content, str):
            raise RuntimeError("La respuesta del modelo no contiene texto.")
        return content

    # ------------------------------------------------------------------
    # PDF writing (text + images)
    # ------------------------------------------------------------------
    def _write_pdf(
        self,
        output_path: str,
        text: str,
        image_paths: Sequence[Path],
    ) -> None:
        """
        Create an A4 PDF with:
        1) The report text (initial pages).
        2) An annex section where the PNG images produced by the pipeline
           are embedded, scaled to fit the page.
        """
        output_path = str(output_path)
        c = canvas.Canvas(output_path, pagesize=A4)
        width, height = A4
        margin_x = 50
        margin_y = 50
        max_width = width - 2 * margin_x
        line_height = 14
        c.setFont("Helvetica", 11)

        # --- Write the main text ---
        def _wrap_line(line: str, max_chars: int = 100) -> list[str]:
            # Preserve blank lines so paragraph spacing survives in the PDF.
            if not line.strip():
                return [""]
            parts: list[str] = []
            current: list[str] = []
            count = 0
            for word in line.split():
                # Only break when the current chunk is non-empty; otherwise a
                # single word longer than max_chars would emit a spurious
                # empty line before it.
                if current and count + len(word) + 1 > max_chars:
                    parts.append(" ".join(current))
                    current = [word]
                    count = len(word) + 1
                else:
                    current.append(word)
                    count += len(word) + 1
            if current:
                parts.append(" ".join(current))
            return parts

        y = height - margin_y
        for raw_line in text.splitlines():
            for line in _wrap_line(raw_line):
                if y < margin_y:
                    c.showPage()
                    c.setFont("Helvetica", 11)
                    y = height - margin_y
                c.drawString(margin_x, y, line)
                y -= line_height

        # --- Append images as figures ---
        if image_paths:
            # New page for the figures annex
            c.showPage()
            c.setFont("Helvetica-Bold", 14)
            c.drawString(margin_x, height - margin_y, "Anexo: Figuras")
            c.setFont("Helvetica", 11)
            current_y = height - margin_y - 2 * line_height
            for img_path in image_paths:
                # If the image does not fit on this page, start a new one
                available_height = current_y - margin_y
                if available_height < 100:  # minimum usable space
                    c.showPage()
                    c.setFont("Helvetica-Bold", 14)
                    c.drawString(margin_x, height - margin_y, "Anexo: Figuras (cont.)")
                    c.setFont("Helvetica", 11)
                    current_y = height - margin_y - 2 * line_height
                    available_height = current_y - margin_y
                # Figure caption
                title = f"Figura: {img_path.name}"
                c.drawString(margin_x, current_y, title)
                current_y -= line_height
                # Load the image and scale it to fit
                try:
                    img = ImageReader(str(img_path))
                    iw, ih = img.getSize()
                    # Scale to fit both the available width and height
                    max_img_height = available_height - 2 * line_height
                    scale = min(max_width / iw, max_img_height / ih)
                    if scale <= 0:
                        scale = 1.0  # fallback
                    draw_w = iw * scale
                    draw_h = ih * scale
                    x = margin_x
                    y_img = current_y - draw_h
                    c.drawImage(
                        img,
                        x,
                        y_img,
                        width=draw_w,
                        height=draw_h,
                        preserveAspectRatio=True,
                        mask="auto",
                    )
                    current_y = y_img - 2 * line_height
                except Exception as e:
                    # If loading fails, note it in the PDF instead of aborting
                    err_msg = f"No se pudo cargar la imagen {img_path.name}: {e}"
                    c.drawString(margin_x, current_y, err_msg)
                    current_y -= 2 * line_height
        c.save()

View File

@@ -0,0 +1,27 @@
{
"dimensions": {
"volumetry": {
"class": "beyond_metrics.VolumetriaMetrics",
"enabled": true,
"metrics": [
"volume_by_channel",
"volume_by_skill"
]
},
"operational_performance": {
"class": "beyond_metrics.dimensions.OperationalPerformance.OperationalPerformanceMetrics",
"enabled": false,
"metrics": []
},
"customer_satisfaction": {
"class": "beyond_metrics.dimensions.SatisfactionExperience.SatisfactionExperienceMetrics",
"enabled": false,
"metrics": []
},
"economy_costs": {
"class": "beyond_metrics.dimensions.EconomyCost.EconomyCostMetrics",
"enabled": false,
"metrics": []
}
}
}

View File

@@ -0,0 +1,58 @@
{
"dimensions": {
"volumetry": {
"class": "beyond_metrics.VolumetriaMetrics",
"enabled": true,
"metrics": [
"volume_by_channel",
"volume_by_skill",
"channel_distribution_pct",
"skill_distribution_pct",
"heatmap_24x7",
"monthly_seasonality_cv",
"peak_offpeak_ratio",
"concentration_top20_skills_pct"
]
},
"operational_performance": {
"class": "beyond_metrics.dimensions.OperationalPerformance.OperationalPerformanceMetrics",
"enabled": true,
"metrics": [
"aht_distribution",
"talk_hold_acw_p50_by_skill",
"metrics_by_skill",
"fcr_rate",
"escalation_rate",
"abandonment_rate",
"high_hold_time_rate",
"recurrence_rate_7d",
"repeat_channel_rate",
"occupancy_rate",
"performance_score"
]
},
"customer_satisfaction": {
"class": "beyond_metrics.dimensions.SatisfactionExperience.SatisfactionExperienceMetrics",
"enabled": true,
"metrics": [
"csat_global",
"csat_avg_by_skill_channel",
"nps_avg_by_skill_channel",
"ces_avg_by_skill_channel",
"csat_aht_correlation",
"csat_aht_skill_summary"
]
},
"economy_costs": {
"class": "beyond_metrics.dimensions.EconomyCost.EconomyCostMetrics",
"enabled": true,
"metrics": [
"cpi_by_skill_channel",
"annual_cost_by_skill_channel",
"cost_breakdown",
"inefficiency_cost_by_skill_channel",
"potential_savings"
]
}
}
}

View File

@@ -0,0 +1,494 @@
from __future__ import annotations
from dataclasses import dataclass
from typing import Dict, List, Optional, Any
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.axes import Axes
# Mandatory transactional columns for EconomyCostMetrics; checked in
# _validate_columns before any KPI is computed. Time columns are in seconds.
REQUIRED_COLUMNS_ECON: List[str] = [
    "interaction_id",
    "datetime_start",
    "queue_skill",
    "channel",
    "duration_talk",
    "hold_time",
    "wrap_up_time",
]
@dataclass
class EconomyConfig:
    """
    Manual parameters for the Economy & Costs dimension.

    - labor_cost_per_hour: fully loaded cost per agent hour.
    - overhead_rate: variable overhead share (e.g. 0.1 = 10% on top of labor).
    - tech_costs_annual: annual technology cost (licenses, infra, ...).
    - automation_cpi: cost per automated interaction (e.g. 0.15 EUR).
    - automation_volume_share: share of the volume that is automatable (0-1).
    - automation_success_rate: automation success rate (0-1).
    - customer_segments: optional mapping skill -> segment ("high"/"medium"/"low")
      for future per-segment ROI insights.
    """
    labor_cost_per_hour: float
    overhead_rate: float = 0.0
    tech_costs_annual: float = 0.0
    automation_cpi: Optional[float] = None
    automation_volume_share: float = 0.0
    automation_success_rate: float = 0.0
    customer_segments: Optional[Dict[str, str]] = None
@dataclass
class EconomyCostMetrics:
    """
    DIMENSION 4: ECONOMY AND COSTS

    Purpose:
    - Quantify the current COST (CPI, annual cost).
    - Estimate the impact of overhead and technology.
    - Produce a first estimate of "inefficiency cost" and potential savings.
    Requires:
    - Transactional dataset columns (see REQUIRED_COLUMNS_ECON).
    Optional inputs via EconomyConfig:
    - labor_cost_per_hour (mandatory for any computation in €).
    - overhead_rate, tech_costs_annual, automation_*.
    - customer_segments (for per-segment ROI insights).
    """
    df: pd.DataFrame
    config: Optional[EconomyConfig] = None

    def __post_init__(self) -> None:
        # Validate the schema first, then derive helper columns in place.
        self._validate_columns()
        self._prepare_data()

    # ------------------------------------------------------------------ #
    # Internal helpers
    # ------------------------------------------------------------------ #
    def _validate_columns(self) -> None:
        """Raise ValueError listing any mandatory column that is absent."""
        missing = [c for c in REQUIRED_COLUMNS_ECON if c not in self.df.columns]
        if missing:
            raise ValueError(
                f"Faltan columnas obligatorias para EconomyCostMetrics: {missing}"
            )

    def _prepare_data(self) -> None:
        """Coerce dtypes, normalize keys and derive handle_time plus the
        cost-validity mask; replaces self.df with the prepared frame."""
        df = self.df.copy()
        df["datetime_start"] = pd.to_datetime(df["datetime_start"], errors="coerce")
        for col in ["duration_talk", "hold_time", "wrap_up_time"]:
            df[col] = pd.to_numeric(df[col], errors="coerce")
        df["queue_skill"] = df["queue_skill"].astype(str).str.strip()
        df["channel"] = df["channel"].astype(str).str.strip()
        # Handle time = talk + hold + wrap (seconds)
        df["handle_time"] = (
            df["duration_talk"].fillna(0)
            + df["hold_time"].fillna(0)
            + df["wrap_up_time"].fillna(0)
        )
        # Filter by record_status for AHT/CPI computations:
        # only VALID records count (NOISE, ZOMBIE, ABANDON are excluded).
        if "record_status" in df.columns:
            df["record_status"] = df["record_status"].astype(str).str.strip().str.upper()
            df["_is_valid_for_cost"] = df["record_status"] == "VALID"
        else:
            # Legacy data without record_status: include everything.
            df["_is_valid_for_cost"] = True
        self.df = df

    @property
    def is_empty(self) -> bool:
        # True when no rows survived preparation.
        return self.df.empty

    def _has_cost_config(self) -> bool:
        # Any €-denominated KPI needs at least labor_cost_per_hour.
        return self.config is not None and self.config.labor_cost_per_hour is not None

    def _volume_by_skill_channel(self) -> pd.Series:
        """Unique-interaction volume per (queue_skill, channel)."""
        return (
            self.df.groupby(["queue_skill", "channel"])["interaction_id"]
            .nunique()
            .rename("volume")
        )

    def _cpi_with_volume(self) -> pd.DataFrame:
        """
        CPI table indexed by (queue_skill, channel) joined with volume.
        Empty DataFrame when there is no cost configuration or no data.
        Shared by annual cost, cost breakdown and the CPI plot (the same
        join used to be duplicated in all three).
        """
        cpi_table = self.cpi_by_skill_channel()
        if cpi_table.empty:
            return pd.DataFrame()
        cpi_indexed = cpi_table.set_index(["queue_skill", "channel"])
        return cpi_indexed.join(
            self._volume_by_skill_channel(), how="left"
        ).fillna({"volume": 0})

    # ------------------------------------------------------------------ #
    # KPI 1: CPI per channel/skill
    # ------------------------------------------------------------------ #
    def cpi_by_skill_channel(self) -> pd.DataFrame:
        """
        CPI (Cost Per Interaction) per skill/channel.
        CPI = (labor_cost_per_interaction + variable_overhead) / EFFECTIVE_PRODUCTIVITY
        - labor_cost_per_interaction = labor_cost_per_hour * AHT_hours
        - variable_overhead = overhead_rate * labor_cost_per_interaction
        - EFFECTIVE_PRODUCTIVITY = 0.70 (accounts for non-productive time)
        Abandoned records are excluded for consistency with the frontend
        path (fresh CSV). Without cost config -> empty DataFrame.
        queue_skill and channel are returned as columns (not only as the
        index) so the frontend can look rows up by skill name.
        """
        if not self._has_cost_config():
            return pd.DataFrame()
        cfg = self.config
        assert cfg is not None  # for the type checker
        df = self.df
        if df.empty:
            return pd.DataFrame()
        # Exclude abandonments from the cost calculation (frontend
        # consistency). `!= True` deliberately keeps NaN rows.
        if "is_abandoned" in df.columns:
            df_cost = df[df["is_abandoned"] != True]
        else:
            df_cost = df
        # Only VALID records enter the AHT (excludes NOISE/ZOMBIE/ABANDON).
        if "_is_valid_for_cost" in df_cost.columns:
            df_cost = df_cost[df_cost["_is_valid_for_cost"] == True]
        if df_cost.empty:
            return pd.DataFrame()
        # AHT per skill/channel (seconds) — VALID records only
        aht_sec = df_cost.groupby(["queue_skill", "channel"])["handle_time"].mean()
        if aht_sec.empty:
            return pd.DataFrame()
        aht_hours = aht_sec / 3600.0
        # Productivity factor (70% effectiveness): covers non-productive
        # agent time (breaks, training, etc.).
        EFFECTIVE_PRODUCTIVITY = 0.70
        labor_cost = cfg.labor_cost_per_hour * aht_hours
        overhead = labor_cost * cfg.overhead_rate
        cpi = (labor_cost + overhead) / EFFECTIVE_PRODUCTIVITY
        out = pd.DataFrame(
            {
                "aht_seconds": aht_sec.round(2),
                "labor_cost": labor_cost.round(4),
                "overhead_cost": overhead.round(4),
                "cpi_total": cpi.round(4),
            }
        )
        # Reset index so queue_skill/channel become columns for frontend lookup.
        return out.sort_index().reset_index()

    # ------------------------------------------------------------------ #
    # KPI 2: annual cost per skill/channel
    # ------------------------------------------------------------------ #
    def annual_cost_by_skill_channel(self) -> pd.DataFrame:
        """
        Annual cost per skill/channel:
        annual_cost = CPI * volume (interaction count in the sample).
        Note: for simplicity the dataset is assumed to cover a full year.
        A scaling factor could be added to EconomyConfig if shorter
        periods must be annualized.
        Returns a DataFrame indexed by (queue_skill, channel).
        """
        joined = self._cpi_with_volume()
        if joined.empty:
            return pd.DataFrame()
        joined["annual_cost"] = (joined["cpi_total"] * joined["volume"]).round(2)
        return joined

    # ------------------------------------------------------------------ #
    # KPI 3: cost breakdown (labor / tech / overhead)
    # ------------------------------------------------------------------ #
    def cost_breakdown(self) -> Dict[str, float]:
        """
        Cost breakdown in % and € per year: labor, overhead, tech.
        labor_total = sum of per-interaction labor cost weighted by volume
        overhead_total = labor_total * overhead_rate
        tech_total = tech_costs_annual (if provided)
        Returns percentages over the total; {} when the cost config or the
        data is missing, or the total is not positive.
        """
        if not self._has_cost_config():
            return {}
        cfg = self.config
        assert cfg is not None
        joined = self._cpi_with_volume()
        if joined.empty:
            return {}
        # Annual labor and overhead costs
        annual_labor = (joined["labor_cost"] * joined["volume"]).sum()
        annual_overhead = (joined["overhead_cost"] * joined["volume"]).sum()
        annual_tech = cfg.tech_costs_annual
        total = annual_labor + annual_overhead + annual_tech
        if total <= 0:
            return {}
        return {
            "labor_pct": round(annual_labor / total * 100, 2),
            "overhead_pct": round(annual_overhead / total * 100, 2),
            "tech_pct": round(annual_tech / total * 100, 2),
            "labor_annual": round(annual_labor, 2),
            "overhead_annual": round(annual_overhead, 2),
            "tech_annual": round(annual_tech, 2),
            "total_annual": round(total, 2),
        }

    # ------------------------------------------------------------------ #
    # KPI 4: inefficiency cost (€ from variability/escalation)
    # ------------------------------------------------------------------ #
    def inefficiency_cost_by_skill_channel(self) -> pd.DataFrame:
        """
        Very simplified inefficiency-cost estimate.
        For each skill/channel:
        - AHT_p50, AHT_p90 (seconds).
        - delta = max(0, AHT_p90 - AHT_p50).
        - ~40% of interactions are assumed to sit above the median.
        - ineff_seconds = delta * volume * 0.4
        - ineff_cost = labor cost per second * ineff_seconds
        NOTE: an order-of-magnitude model, not an exact figure.
        """
        if not self._has_cost_config():
            return pd.DataFrame()
        cfg = self.config
        assert cfg is not None
        df = self.df
        # Only VALID records enter the AHT (excludes NOISE/ZOMBIE/ABANDON).
        if "_is_valid_for_cost" in df.columns:
            df = df[df["_is_valid_for_cost"] == True]
        grouped = df.groupby(["queue_skill", "channel"])
        stats = grouped["handle_time"].agg(
            aht_p50=lambda s: float(np.percentile(s.dropna(), 50)),
            aht_p90=lambda s: float(np.percentile(s.dropna(), 90)),
            volume="count",
        )
        if stats.empty:
            return pd.DataFrame()
        # The CPI table provides labor cost per interaction; re-index by
        # (queue_skill, channel) for the join.
        cpi_table_raw = self.cpi_by_skill_channel()
        if cpi_table_raw.empty:
            return pd.DataFrame()
        cpi_table = cpi_table_raw.set_index(["queue_skill", "channel"])
        merged = stats.join(cpi_table[["labor_cost"]], how="left")
        merged = merged.fillna(0.0)
        delta = (merged["aht_p90"] - merged["aht_p50"]).clip(lower=0.0)
        affected_fraction = 0.4  # rough approximation
        ineff_seconds = delta * merged["volume"] * affected_fraction
        # labor_cost is the per-interaction cost at mean AHT; approximate
        # the cost per second as labor_cost / mean AHT.
        merged["aht_mean"] = grouped["handle_time"].mean()
        cost_per_second = merged["labor_cost"] / merged["aht_mean"].replace(0, np.nan)
        cost_per_second = cost_per_second.fillna(0.0)
        merged["ineff_seconds"] = ineff_seconds.round(2)
        merged["ineff_cost"] = (ineff_seconds * cost_per_second).round(2)
        # Reset index so queue_skill/channel become columns for frontend lookup.
        return merged[
            ["aht_p50", "aht_p90", "volume", "ineff_seconds", "ineff_cost"]
        ].reset_index()

    # ------------------------------------------------------------------ #
    # KPI 5: potential annual savings from automation
    # ------------------------------------------------------------------ #
    def potential_savings(self) -> Dict[str, Any]:
        """
        Potential annual savings based on:
        savings = (human_CPI - automated_CPI) * automatable_volume * success_rate
        Where:
        - human_CPI = volume-weighted mean of cpi_total.
        - automated_CPI = config.automation_cpi
        - automatable_volume = total_volume * automation_volume_share
        - success_rate = automation_success_rate
        Returns {} when any required config parameter is missing.
        """
        if not self._has_cost_config():
            return {}
        cfg = self.config
        assert cfg is not None
        if cfg.automation_cpi is None or cfg.automation_volume_share <= 0 or cfg.automation_success_rate <= 0:
            return {}
        cpi_table = self.annual_cost_by_skill_channel()
        if cpi_table.empty:
            return {}
        total_volume = cpi_table["volume"].sum()
        if total_volume <= 0:
            return {}
        # Volume-weighted human CPI
        weighted_cpi = (
            (cpi_table["cpi_total"] * cpi_table["volume"]).sum() / total_volume
        )
        volume_automatizable = total_volume * cfg.automation_volume_share
        effective_volume = volume_automatizable * cfg.automation_success_rate
        delta_cpi = max(0.0, weighted_cpi - cfg.automation_cpi)
        annual_savings = delta_cpi * effective_volume
        return {
            "cpi_humano": round(weighted_cpi, 4),
            "cpi_automatizado": round(cfg.automation_cpi, 4),
            "volume_total": float(total_volume),
            "volume_automatizable": float(volume_automatizable),
            "effective_volume": float(effective_volume),
            "annual_savings": round(annual_savings, 2),
        }

    # ------------------------------------------------------------------ #
    # PLOTS
    # ------------------------------------------------------------------ #
    def plot_cost_waterfall(self) -> Axes:
        """
        Bar chart of the annual cost breakdown (labor + overhead + tech).
        Falls back to a placeholder axis without cost configuration.
        (A previous version computed a waterfall layout that was never
        drawn; that dead code has been removed.)
        """
        breakdown = self.cost_breakdown()
        if not breakdown:
            fig, ax = plt.subplots()
            ax.text(0.5, 0.5, "Sin configuración de costes", ha="center", va="center")
            ax.set_axis_off()
            return ax
        labels = ["Labor", "Overhead", "Tech"]
        values = [
            breakdown["labor_annual"],
            breakdown["overhead_annual"],
            breakdown["tech_annual"],
        ]
        fig, ax = plt.subplots(figsize=(8, 4))
        x = np.arange(len(labels))
        ax.bar(x, values)
        ax.set_xticks(x)
        ax.set_xticklabels(labels)
        ax.set_ylabel("€ anuales")
        ax.set_title("Desglose anual de costes")
        for idx, v in enumerate(values):
            ax.text(idx, v, f"{v:,.0f}", ha="center", va="bottom")
        ax.grid(axis="y", alpha=0.3)
        return ax

    def plot_cpi_by_channel(self) -> Axes:
        """
        Bar chart of the volume-weighted mean CPI per channel.
        Falls back to a placeholder axis without cost configuration.
        """
        joined = self._cpi_with_volume()
        if joined.empty:
            fig, ax = plt.subplots()
            ax.text(0.5, 0.5, "Sin configuración de costes", ha="center", va="center")
            ax.set_axis_off()
            return ax
        # Volume-weighted mean CPI per channel
        per_channel = (
            joined.reset_index()
            .groupby("channel")
            .apply(lambda g: (g["cpi_total"] * g["volume"]).sum() / max(g["volume"].sum(), 1))
            .rename("cpi_mean")
            .round(4)
        )
        fig, ax = plt.subplots(figsize=(6, 4))
        per_channel.plot(kind="bar", ax=ax)
        ax.set_xlabel("Canal")
        ax.set_ylabel("CPI medio (€)")
        ax.set_title("Coste por interacción (CPI) por canal")
        ax.grid(axis="y", alpha=0.3)
        return ax

View File

@@ -0,0 +1,716 @@
from __future__ import annotations
from dataclasses import dataclass
from typing import Any, Dict, List
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.axes import Axes
import math
# Mandatory transactional columns for OperationalPerformanceMetrics; checked
# in _validate_columns before any metric is computed. Time columns are in
# seconds, transfer_flag is a bool/int escalation marker.
REQUIRED_COLUMNS_OP: List[str] = [
    "interaction_id",
    "datetime_start",
    "queue_skill",
    "channel",
    "duration_talk",
    "hold_time",
    "wrap_up_time",
    "agent_id",
    "transfer_flag",
]
@dataclass
class OperationalPerformanceMetrics:
"""
Dimensión: RENDIMIENTO OPERACIONAL Y DE SERVICIO
Propósito: medir el balance entre rapidez (eficiencia) y calidad de resolución,
más la variabilidad del servicio.
Requiere como mínimo:
- interaction_id
- datetime_start
- queue_skill
- channel
- duration_talk (segundos)
- hold_time (segundos)
- wrap_up_time (segundos)
- agent_id
- transfer_flag (bool/int)
Columnas opcionales:
- is_resolved (bool/int) -> para FCR
- abandoned_flag (bool/int) -> para tasa de abandono
- customer_id / caller_id -> para reincidencia y repetición de canal
- logged_time (segundos) -> para occupancy_rate
"""
df: pd.DataFrame
# Benchmarks / parámetros de normalización (puedes ajustarlos)
AHT_GOOD: float = 300.0 # 5 min
AHT_BAD: float = 900.0 # 15 min
VAR_RATIO_GOOD: float = 1.2 # P90/P50 ~1.2 muy estable
VAR_RATIO_BAD: float = 3.0 # P90/P50 >=3 muy inestable
    def __post_init__(self) -> None:
        # Validate the required schema up front, then derive helper columns
        # (handle_time, validity mask, normalized keys) in place.
        self._validate_columns()
        self._prepare_data()
# ------------------------------------------------------------------ #
# Helpers internos
# ------------------------------------------------------------------ #
def _validate_columns(self) -> None:
missing = [c for c in REQUIRED_COLUMNS_OP if c not in self.df.columns]
if missing:
raise ValueError(
f"Faltan columnas obligatorias para OperationalPerformanceMetrics: {missing}"
)
    def _prepare_data(self) -> None:
        """Coerce dtypes, derive handle_time and the CV-validity mask, and
        normalize key columns; replaces self.df with the prepared frame."""
        df = self.df.copy()
        # Types
        df["datetime_start"] = pd.to_datetime(df["datetime_start"], errors="coerce")
        for col in ["duration_talk", "hold_time", "wrap_up_time"]:
            df[col] = pd.to_numeric(df[col], errors="coerce")
        # Handle Time = talk + hold + wrap (seconds)
        df["handle_time"] = (
            df["duration_talk"].fillna(0)
            + df["hold_time"].fillna(0)
            + df["wrap_up_time"].fillna(0)
        )
        # v3.0: filter NOISE and ZOMBIE for variability computations.
        # record_status: 'VALID', 'NOISE', 'ZOMBIE', 'ABANDON'
        # Only 'VALID' rows feed AHT/CV (noise, zombie and abandon excluded).
        if "record_status" in df.columns:
            df["record_status"] = df["record_status"].astype(str).str.strip().str.upper()
            # Mask of valid rows: strictly "VALID".
            # Explicitly excludes NOISE, ZOMBIE, ABANDON and any other value.
            df["_is_valid_for_cv"] = df["record_status"] == "VALID"
            # Log the record_status breakdown for debugging
            status_counts = df["record_status"].value_counts()
            valid_count = int(df["_is_valid_for_cv"].sum())
            print(f"[OperationalPerformance] Record status breakdown:")
            print(f" Total rows: {len(df)}")
            for status, count in status_counts.items():
                print(f" - {status}: {count}")
            print(f" VALID rows for AHT calculation: {valid_count}")
        else:
            # Legacy data without record_status: include everything.
            df["_is_valid_for_cv"] = True
            print(f"[OperationalPerformance] No record_status column - using all {len(df)} rows")
        # Basic normalization of the grouping keys
        df["queue_skill"] = df["queue_skill"].astype(str).str.strip()
        df["channel"] = df["channel"].astype(str).str.strip()
        df["agent_id"] = df["agent_id"].astype(str).str.strip()
        # Optional flags coerced to bool when present
        # NOTE(review): astype(int) will fail on string values like "True";
        # assumes these columns are numeric/bool when present — confirm upstream.
        for flag_col in ["is_resolved", "abandoned_flag", "transfer_flag"]:
            if flag_col in df.columns:
                df[flag_col] = df[flag_col].astype(int).astype(bool)
        # customer_id: use customer_id when present, otherwise caller_id
        if "customer_id" in df.columns:
            df["customer_id"] = df["customer_id"].astype(str)
        elif "caller_id" in df.columns:
            df["customer_id"] = df["caller_id"].astype(str)
        else:
            df["customer_id"] = None
        # Optional logged_time.
        # Normalized so it is always a float series, NaN when absent.
        df["logged_time"] = pd.to_numeric(df.get("logged_time", np.nan), errors="coerce")
        self.df = df
@property
def is_empty(self) -> bool:
return self.df.empty
# ------------------------------------------------------------------ #
# AHT y variabilidad
# ------------------------------------------------------------------ #
def aht_distribution(self) -> Dict[str, float]:
"""
Devuelve P10, P50, P90 del AHT y el ratio P90/P50 como medida de variabilidad.
v3.0: Filtra NOISE y ZOMBIE para el cálculo de variabilidad.
Solo usa registros con record_status='valid' o sin status (legacy).
"""
# Filtrar solo registros válidos para cálculo de variabilidad
df_valid = self.df[self.df["_is_valid_for_cv"] == True]
ht = df_valid["handle_time"].dropna().astype(float)
if ht.empty:
return {}
p10 = float(np.percentile(ht, 10))
p50 = float(np.percentile(ht, 50))
p90 = float(np.percentile(ht, 90))
ratio = float(p90 / p50) if p50 > 0 else float("nan")
return {
"p10": round(p10, 2),
"p50": round(p50, 2),
"p90": round(p90, 2),
"p90_p50_ratio": round(ratio, 3),
}
def talk_hold_acw_p50_by_skill(self) -> pd.DataFrame:
"""
P50 de talk_time, hold_time y wrap_up_time por skill.
Incluye queue_skill como columna (no solo índice) para que
el frontend pueda hacer lookup por nombre de skill.
"""
df = self.df
def perc(s: pd.Series, q: float) -> float:
s = s.dropna().astype(float)
if s.empty:
return float("nan")
return float(np.percentile(s, q))
grouped = df.groupby("queue_skill")
result = pd.DataFrame(
{
"talk_p50": grouped["duration_talk"].apply(lambda s: perc(s, 50)),
"hold_p50": grouped["hold_time"].apply(lambda s: perc(s, 50)),
"acw_p50": grouped["wrap_up_time"].apply(lambda s: perc(s, 50)),
}
)
# Reset index to include queue_skill as column for frontend lookup
return result.round(2).sort_index().reset_index()
# ------------------------------------------------------------------ #
# FCR, escalación, abandono, reincidencia, repetición canal
# ------------------------------------------------------------------ #
    def fcr_rate(self) -> float:
        """
        FCR (First Contact Resolution) as a percentage clamped to [0, 100].

        Priority 1: use the `fcr_real_flag` column from the CSV when present.
        Priority 2: fall back to 100 - escalation_rate().

        Returns NaN when the frame is empty or neither source is available.
        """
        df = self.df
        total = len(df)
        if total == 0:
            return float("nan")
        # Priority 1: use fcr_real_flag when the column exists.
        if "fcr_real_flag" in df.columns:
            col = df["fcr_real_flag"]
            # Normalize to boolean.
            # NOTE(review): the empty string "" is in the truthy list, so a
            # blank cell counts as FCR=True -- confirm this is intentional.
            if col.dtype == "O":
                fcr_mask = (
                    col.astype(str)
                    .str.strip()
                    .str.lower()
                    .isin(["true", "t", "1", "yes", "y", "si", ""])
                )
            else:
                fcr_mask = pd.to_numeric(col, errors="coerce").fillna(0) > 0
            fcr_count = int(fcr_mask.sum())
            fcr = (fcr_count / total) * 100.0
            return float(max(0.0, min(100.0, round(fcr, 2))))
        # Priority 2: fall back to 100 - escalation_rate.
        try:
            esc = self.escalation_rate()
        except Exception:
            esc = float("nan")
        if esc is not None and not math.isnan(esc):
            fcr = 100.0 - esc
            return float(max(0.0, min(100.0, round(fcr, 2))))
        return float("nan")
def escalation_rate(self) -> float:
"""
% de interacciones que requieren escalación (transfer_flag == True).
"""
df = self.df
total = len(df)
if total == 0:
return float("nan")
escalated = df["transfer_flag"].sum()
return float(round(escalated / total * 100, 2))
    def abandonment_rate(self) -> float:
        """
        % of abandoned interactions, in [0, 100].

        Looks for the abandonment flag column in priority order:
        is_abandoned, abandoned_flag, abandoned.
        Returns NaN when the frame is empty or no such column exists.
        """
        df = self.df
        total = len(df)
        if total == 0:
            return float("nan")
        # Find the abandonment column in priority order.
        abandon_col = None
        for col_name in ["is_abandoned", "abandoned_flag", "abandoned"]:
            if col_name in df.columns:
                abandon_col = col_name
                break
        if abandon_col is None:
            return float("nan")
        col = df[abandon_col]
        # Normalize to boolean.
        # NOTE(review): the empty string "" is in the truthy list, so a blank
        # cell counts as abandoned -- confirm this is intentional.
        if col.dtype == "O":
            abandon_mask = (
                col.astype(str)
                .str.strip()
                .str.lower()
                .isin(["true", "t", "1", "yes", "y", "si", ""])
            )
        else:
            abandon_mask = pd.to_numeric(col, errors="coerce").fillna(0) > 0
        abandoned = int(abandon_mask.sum())
        return float(round(abandoned / total * 100, 2))
def high_hold_time_rate(self, threshold_seconds: float = 60.0) -> float:
"""
% de interacciones con hold_time > threshold (por defecto 60s).
Proxy de complejidad: si el agente tuvo que poner en espera al cliente
más de 60 segundos, probablemente tuvo que consultar/investigar.
"""
df = self.df
total = len(df)
if total == 0:
return float("nan")
hold_times = df["hold_time"].fillna(0)
high_hold_count = (hold_times > threshold_seconds).sum()
return float(round(high_hold_count / total * 100, 2))
    def recurrence_rate_7d(self) -> float:
        """
        % of customers who contact again in < 7 days for the SAME skill.

        Based on customer_id (or caller_id when customer_id is missing)
        plus queue_skill:
          - For each customer + skill combination, sort by datetime_start.
          - If two consecutive contacts (same customer, same skill) are
            less than 7 days apart, the customer counts as "recurrent".
          - Rate = nº of recurrent customers / nº of total customers.

        NOTE: only counts as recurrence when the customer calls about the
        SAME skill. A customer calling "Ventas" and later "Soporte" is NOT
        recurrent. Returns NaN when no customer identifier is available.
        """
        df = self.df.dropna(subset=["datetime_start"]).copy()
        # Normalize the customer identifier.
        if "customer_id" not in df.columns:
            if "caller_id" in df.columns:
                df["customer_id"] = df["caller_id"]
            else:
                # No customer identifier -> the metric cannot be computed.
                return float("nan")
        df = df.dropna(subset=["customer_id"])
        if df.empty:
            return float("nan")
        # Sort by customer + skill + date.
        df = df.sort_values(["customer_id", "queue_skill", "datetime_start"])
        # Time gap between consecutive contacts per customer AND skill;
        # this guarantees we only count re-contacts for the same skill.
        df["delta"] = df.groupby(["customer_id", "queue_skill"])["datetime_start"].diff()
        # Flag contacts occurring less than 7 days after the previous one (same skill).
        recurrence_mask = df["delta"] < pd.Timedelta(days=7)
        # Number of customers with at least one recurrent contact (any skill).
        recurrent_customers = df.loc[recurrence_mask, "customer_id"].nunique()
        total_customers = df["customer_id"].nunique()
        if total_customers == 0:
            return float("nan")
        rate = recurrent_customers / total_customers * 100.0
        return float(round(rate, 2))
def repeat_channel_rate(self) -> float:
"""
% de reincidencias (<7 días) en las que el cliente usa el MISMO canal.
Si no hay customer_id/caller_id o solo un contacto por cliente, devuelve NaN.
"""
df = self.df.dropna(subset=["datetime_start"]).copy()
if df["customer_id"].isna().all():
return float("nan")
df = df.sort_values(["customer_id", "datetime_start"])
df["next_customer"] = df["customer_id"].shift(-1)
df["next_datetime"] = df["datetime_start"].shift(-1)
df["next_channel"] = df["channel"].shift(-1)
same_customer = df["customer_id"] == df["next_customer"]
within_7d = (df["next_datetime"] - df["datetime_start"]) < pd.Timedelta(days=7)
recurrent_mask = same_customer & within_7d
if not recurrent_mask.any():
return float("nan")
same_channel = df["channel"] == df["next_channel"]
same_channel_recurrent = (recurrent_mask & same_channel).sum()
total_recurrent = recurrent_mask.sum()
return float(round(same_channel_recurrent / total_recurrent * 100, 2))
# ------------------------------------------------------------------ #
# Occupancy
# ------------------------------------------------------------------ #
def occupancy_rate(self) -> float:
"""
Tasa de ocupación:
occupancy = sum(handle_time) / sum(logged_time) * 100.
Requiere columna 'logged_time'. Si no existe o es todo 0, devuelve NaN.
"""
df = self.df
if "logged_time" not in df.columns:
return float("nan")
logged = df["logged_time"].fillna(0)
handle = df["handle_time"].fillna(0)
total_logged = logged.sum()
if total_logged == 0:
return float("nan")
occ = handle.sum() / total_logged
return float(round(occ * 100, 2))
# ------------------------------------------------------------------ #
# Score de rendimiento 0-10
# ------------------------------------------------------------------ #
    def performance_score(self) -> Dict[str, float]:
        """
        Compute a 0-10 performance score combining:
          - AHT (lower is better)
          - FCR (higher is better)
          - Variability (P90/P50 ratio, lower is better)
          - Other factors (occupancy / escalation)

        Formula:
            score = 0.4 * (10 - AHT_norm) +
                    0.3 * FCR_norm +
                    0.2 * (10 - Var_norm) +
                    0.1 * Otros_score

        where *_norm values are on a 0-10 scale. The calibration bounds
        (AHT_GOOD/AHT_BAD, VAR_RATIO_GOOD/VAR_RATIO_BAD) are class-level
        constants defined elsewhere in this class -- confirm their values
        when tuning.

        Returns the final score plus each component, or {"score": nan}
        when there is no AHT data at all.
        """
        dist = self.aht_distribution()
        if not dist:
            return {"score": float("nan")}
        p50 = dist["p50"]
        ratio = dist["p90_p50_ratio"]
        # AHT normalized: 0 (best) to 10 (worst).
        aht_norm = self._scale_to_0_10(p50, self.AHT_GOOD, self.AHT_BAD)
        # FCR normalized: 0-10 straight from the percentage (0-100).
        # NOTE(review): a missing FCR collapses to 0 (worst) -- confirm intent.
        fcr_pct = self.fcr_rate()
        fcr_norm = fcr_pct / 10.0 if not np.isnan(fcr_pct) else 0.0
        # Variability normalized: 0 (good ratio) to 10 (bad ratio).
        var_norm = self._scale_to_0_10(ratio, self.VAR_RATIO_GOOD, self.VAR_RATIO_BAD)
        # Other factors: combine occupancy (ideal ~80%) and escalation (ideal low).
        occ = self.occupancy_rate()
        esc = self.escalation_rate()
        other_score = self._compute_other_factors_score(occ, esc)
        score = (
            0.4 * (10.0 - aht_norm)
            + 0.3 * fcr_norm
            + 0.2 * (10.0 - var_norm)
            + 0.1 * other_score
        )
        # Clamp to 0-10.
        score = max(0.0, min(10.0, score))
        return {
            "score": round(score, 2),
            "aht_norm": round(aht_norm, 2),
            "fcr_norm": round(fcr_norm, 2),
            "var_norm": round(var_norm, 2),
            "other_score": round(other_score, 2),
        }
def _scale_to_0_10(self, value: float, good: float, bad: float) -> float:
"""
Escala linealmente un valor:
- good -> 0
- bad -> 10
Con saturación fuera de rango.
"""
if np.isnan(value):
return 5.0 # neutro
if good == bad:
return 5.0
if good < bad:
# Menor es mejor
if value <= good:
return 0.0
if value >= bad:
return 10.0
return 10.0 * (value - good) / (bad - good)
else:
# Mayor es mejor
if value >= good:
return 0.0
if value <= bad:
return 10.0
return 10.0 * (good - value) / (good - bad)
def _compute_other_factors_score(self, occ_pct: float, esc_pct: float) -> float:
"""
Otros factores (0-10) basados en:
- ocupación ideal alrededor de 80%
- tasa de escalación ideal baja (<10%)
"""
# Ocupación: 0 penalización si está entre 75-85, se penaliza fuera
if np.isnan(occ_pct):
occ_penalty = 5.0
else:
deviation = abs(occ_pct - 80.0)
occ_penalty = min(10.0, deviation / 5.0 * 2.0) # cada 5 puntos se suman 2, máx 10
occ_score = max(0.0, 10.0 - occ_penalty)
# Escalación: 0-10 donde 0% -> 10 puntos, >=40% -> 0
if np.isnan(esc_pct):
esc_score = 5.0
else:
if esc_pct <= 0:
esc_score = 10.0
elif esc_pct >= 40:
esc_score = 0.0
else:
esc_score = 10.0 * (1.0 - esc_pct / 40.0)
# Media simple de ambos
return (occ_score + esc_score) / 2.0
# ------------------------------------------------------------------ #
# Plots
# ------------------------------------------------------------------ #
def plot_aht_boxplot_by_skill(self) -> Axes:
"""
Boxplot del AHT por skill (P10-P50-P90 visual).
"""
df = self.df.copy()
if df.empty or "handle_time" not in df.columns:
fig, ax = plt.subplots()
ax.text(0.5, 0.5, "Sin datos de AHT", ha="center", va="center")
ax.set_axis_off()
return ax
df = df.dropna(subset=["handle_time"])
if df.empty:
fig, ax = plt.subplots()
ax.text(0.5, 0.5, "AHT no disponible", ha="center", va="center")
ax.set_axis_off()
return ax
fig, ax = plt.subplots(figsize=(8, 4))
df.boxplot(column="handle_time", by="queue_skill", ax=ax, showfliers=False)
ax.set_xlabel("Skill / Cola")
ax.set_ylabel("AHT (segundos)")
ax.set_title("Distribución de AHT por skill")
plt.suptitle("")
plt.xticks(rotation=45, ha="right")
ax.grid(axis="y", alpha=0.3)
return ax
def plot_resolution_funnel_by_skill(self) -> Axes:
"""
Funnel / barras apiladas de Talk + Hold + ACW por skill (P50).
Permite ver el equilibrio de tiempos por skill.
"""
p50 = self.talk_hold_acw_p50_by_skill()
if p50.empty:
fig, ax = plt.subplots()
ax.text(0.5, 0.5, "Sin datos para funnel", ha="center", va="center")
ax.set_axis_off()
return ax
fig, ax = plt.subplots(figsize=(10, 4))
skills = p50.index
talk = p50["talk_p50"]
hold = p50["hold_p50"]
acw = p50["acw_p50"]
x = np.arange(len(skills))
ax.bar(x, talk, label="Talk P50")
ax.bar(x, hold, bottom=talk, label="Hold P50")
ax.bar(x, acw, bottom=talk + hold, label="ACW P50")
ax.set_xticks(x)
ax.set_xticklabels(skills, rotation=45, ha="right")
ax.set_ylabel("Segundos")
ax.set_title("Funnel de resolución (P50) por skill")
ax.legend()
ax.grid(axis="y", alpha=0.3)
return ax
# ------------------------------------------------------------------ #
# Métricas por skill (para consistencia frontend cached/fresh)
# ------------------------------------------------------------------ #
    def metrics_by_skill(self) -> List[Dict[str, Any]]:
        """
        Compute operational metrics per skill:
          - transfer_rate: % of interactions with transfer_flag == True
          - abandonment_rate: % of abandoned interactions
          - fcr_tecnico: 100 - transfer_rate (no transfer)
          - fcr_real: % with no transfer AND no 7-day re-contact (when data exists)
          - volume: number of interactions

        Returns a list of dicts, one per skill, so the frontend has access
        to the real per-skill metrics (not estimates).
        """
        df = self.df
        if df.empty:
            return []
        results = []
        # Detect the abandonment column (priority order).
        abandon_col = None
        for col_name in ["is_abandoned", "abandoned_flag", "abandoned"]:
            if col_name in df.columns:
                abandon_col = col_name
                break
        # Detect the repeat_call_7d column used for the "real" FCR.
        repeat_col = None
        for col_name in ["repeat_call_7d", "repeat_7d", "is_repeat_7d"]:
            if col_name in df.columns:
                repeat_col = col_name
                break
        for skill, group in df.groupby("queue_skill"):
            total = len(group)
            if total == 0:
                continue
            # Transfer rate.
            if "transfer_flag" in group.columns:
                transfer_count = group["transfer_flag"].sum()
                transfer_rate = float(round(transfer_count / total * 100, 2))
            else:
                transfer_rate = 0.0
            # Technical FCR = 100 - transfer_rate.
            fcr_tecnico = float(round(100.0 - transfer_rate, 2))
            # Abandonment rate.
            # NOTE(review): "" is in the truthy list below, so a blank cell
            # counts as abandoned -- confirm this is intentional.
            abandonment_rate = 0.0
            if abandon_col:
                col = group[abandon_col]
                if col.dtype == "O":
                    abandon_mask = (
                        col.astype(str)
                        .str.strip()
                        .str.lower()
                        .isin(["true", "t", "1", "yes", "y", "si", ""])
                    )
                else:
                    abandon_mask = pd.to_numeric(col, errors="coerce").fillna(0) > 0
                abandoned = int(abandon_mask.sum())
                abandonment_rate = float(round(abandoned / total * 100, 2))
            # Real FCR (no transfer AND no 7-day re-contact).
            fcr_real = fcr_tecnico  # default to fcr_tecnico if no repeat data
            if repeat_col and "transfer_flag" in group.columns:
                repeat_data = group[repeat_col]
                if repeat_data.dtype == "O":
                    repeat_mask = (
                        repeat_data.astype(str)
                        .str.strip()
                        .str.lower()
                        .isin(["true", "t", "1", "yes", "y", "si", ""])
                    )
                else:
                    repeat_mask = pd.to_numeric(repeat_data, errors="coerce").fillna(0) > 0
                # Real FCR: no transfer AND no repeat.
                fcr_real_mask = (~group["transfer_flag"]) & (~repeat_mask)
                fcr_real_count = fcr_real_mask.sum()
                fcr_real = float(round(fcr_real_count / total * 100, 2))
            # AHT mean (average handle_time over valid records only).
            # Filter to 'valid' records (excludes noise/zombie) for consistency.
            if "_is_valid_for_cv" in group.columns:
                valid_records = group[group["_is_valid_for_cv"]]
            else:
                valid_records = group
            if len(valid_records) > 0 and "handle_time" in valid_records.columns:
                aht_mean = float(round(valid_records["handle_time"].mean(), 2))
            else:
                aht_mean = 0.0
            # AHT total (average handle_time over ALL records).
            # Includes NOISE, ZOMBIE, ABANDON -- informational/comparison only.
            if len(group) > 0 and "handle_time" in group.columns:
                aht_total = float(round(group["handle_time"].mean(), 2))
            else:
                aht_total = 0.0
            # Hold time mean (average hold_time over valid records).
            # Consistent with the fresh path, which uses MEAN, not P50.
            if len(valid_records) > 0 and "hold_time" in valid_records.columns:
                hold_time_mean = float(round(valid_records["hold_time"].mean(), 2))
            else:
                hold_time_mean = 0.0
            results.append({
                "skill": str(skill),
                "volume": int(total),
                "transfer_rate": transfer_rate,
                "abandonment_rate": abandonment_rate,
                "fcr_tecnico": fcr_tecnico,
                "fcr_real": fcr_real,
                "aht_mean": aht_mean,
                "aht_total": aht_total,
                "hold_time_mean": hold_time_mean,
            })
        return results

View File

@@ -0,0 +1,318 @@
from __future__ import annotations
from dataclasses import dataclass
from typing import Dict, List, Any
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.axes import Axes
# Only columns of the "core" dataset are mandatory.
REQUIRED_COLUMNS_SAT: List[str] = [
    "interaction_id",
    "datetime_start",
    "queue_skill",
    "channel",
    "duration_talk",
    "hold_time",
    "wrap_up_time",
]
@dataclass
class SatisfactionExperienceMetrics:
    """
    Dimension 3: SATISFACTION and EXPERIENCE.

    All satisfaction columns (csat/nps/ces/aht) are OPTIONAL. When they
    are missing, the metrics that rely on them return empty/NaN but never
    break the pipeline.
    """
    df: pd.DataFrame
    def __post_init__(self) -> None:
        self._validate_columns()
        self._prepare_data()
    # ------------------------------------------------------------------ #
    # Helpers
    # ------------------------------------------------------------------ #
    def _validate_columns(self) -> None:
        # Fail fast when a mandatory core column is missing.
        missing = [c for c in REQUIRED_COLUMNS_SAT if c not in self.df.columns]
        if missing:
            raise ValueError(
                f"Faltan columnas obligatorias para SatisfactionExperienceMetrics: {missing}"
            )
    def _prepare_data(self) -> None:
        # Coerce dtypes once so every metric can assume clean columns.
        df = self.df.copy()
        df["datetime_start"] = pd.to_datetime(df["datetime_start"], errors="coerce")
        # Base duration columns always exist (validated above).
        for col in ["duration_talk", "hold_time", "wrap_up_time"]:
            df[col] = pd.to_numeric(df[col], errors="coerce")
        # Handle time = talk + hold + wrap-up, with NaNs treated as 0.
        df["handle_time"] = (
            df["duration_talk"].fillna(0)
            + df["hold_time"].fillna(0)
            + df["wrap_up_time"].fillna(0)
        )
        # csat_score is optional (becomes an all-NaN column when absent).
        df["csat_score"] = pd.to_numeric(df.get("csat_score", np.nan), errors="coerce")
        # aht is optional: use the explicit column if present, else handle_time.
        if "aht" in df.columns:
            df["aht"] = pd.to_numeric(df["aht"], errors="coerce")
        else:
            df["aht"] = df["handle_time"]
        # NPS / CES are optional as well.
        df["nps_score"] = pd.to_numeric(df.get("nps_score", np.nan), errors="coerce")
        df["ces_score"] = pd.to_numeric(df.get("ces_score", np.nan), errors="coerce")
        df["queue_skill"] = df["queue_skill"].astype(str).str.strip()
        df["channel"] = df["channel"].astype(str).str.strip()
        self.df = df
    @property
    def is_empty(self) -> bool:
        # True when the prepared frame holds no data.
        return self.df.empty
    # ------------------------------------------------------------------ #
    # KPIs
    # ------------------------------------------------------------------ #
    def csat_avg_by_skill_channel(self) -> pd.DataFrame:
        """
        Average CSAT per skill/channel.
        Returns an empty DataFrame when there is no csat_score data.
        """
        df = self.df
        if "csat_score" not in df.columns or df["csat_score"].notna().sum() == 0:
            return pd.DataFrame()
        df = df.dropna(subset=["csat_score"])
        if df.empty:
            return pd.DataFrame()
        pivot = (
            df.pivot_table(
                index="queue_skill",
                columns="channel",
                values="csat_score",
                aggfunc="mean",
            )
            .sort_index()
            .round(2)
        )
        return pivot
    def nps_avg_by_skill_channel(self) -> pd.DataFrame:
        """
        Average NPS per skill/channel, when nps_score exists.
        """
        df = self.df
        if "nps_score" not in df.columns or df["nps_score"].notna().sum() == 0:
            return pd.DataFrame()
        df = df.dropna(subset=["nps_score"])
        if df.empty:
            return pd.DataFrame()
        pivot = (
            df.pivot_table(
                index="queue_skill",
                columns="channel",
                values="nps_score",
                aggfunc="mean",
            )
            .sort_index()
            .round(2)
        )
        return pivot
    def ces_avg_by_skill_channel(self) -> pd.DataFrame:
        """
        Average CES per skill/channel, when ces_score exists.
        """
        df = self.df
        if "ces_score" not in df.columns or df["ces_score"].notna().sum() == 0:
            return pd.DataFrame()
        df = df.dropna(subset=["ces_score"])
        if df.empty:
            return pd.DataFrame()
        pivot = (
            df.pivot_table(
                index="queue_skill",
                columns="channel",
                values="ces_score",
                aggfunc="mean",
            )
            .sort_index()
            .round(2)
        )
        return pivot
    def csat_global(self) -> float:
        """
        Global average CSAT (all interactions).

        Uses the optional `csat_score` column:
          - Returns NaN when the column does not exist.
          - Returns NaN when every value is NaN / empty.
        """
        df = self.df
        if "csat_score" not in df.columns:
            return float("nan")
        series = pd.to_numeric(df["csat_score"], errors="coerce").dropna()
        if series.empty:
            return float("nan")
        mean = series.mean()
        return float(round(mean, 2))
    def csat_aht_correlation(self) -> Dict[str, Any]:
        """
        Pearson correlation between CSAT and AHT.
        When csat or aht is missing, or there is no variance, returns NaN
        together with an appropriate interpretation code.
        """
        df = self.df
        if "csat_score" not in df.columns or df["csat_score"].notna().sum() == 0:
            return {"r": float("nan"), "n": 0.0, "interpretation_code": "sin_datos"}
        if "aht" not in df.columns or df["aht"].notna().sum() == 0:
            return {"r": float("nan"), "n": 0.0, "interpretation_code": "sin_datos"}
        df = df.dropna(subset=["csat_score", "aht"]).copy()
        n = len(df)
        if n < 2:
            return {"r": float("nan"), "n": float(n), "interpretation_code": "insuficiente"}
        x = df["aht"].astype(float)
        y = df["csat_score"].astype(float)
        # A zero standard deviation would make the correlation undefined.
        if x.std(ddof=1) == 0 or y.std(ddof=1) == 0:
            return {"r": float("nan"), "n": float(n), "interpretation_code": "sin_varianza"}
        r = float(np.corrcoef(x, y)[0, 1])
        # |r| > 0.3 is treated as a meaningful direction, otherwise neutral.
        if r < -0.3:
            interpretation = "negativo"
        elif r > 0.3:
            interpretation = "positivo"
        else:
            interpretation = "neutral"
        return {"r": round(r, 3), "n": float(n), "interpretation_code": interpretation}
    def csat_aht_skill_summary(self) -> pd.DataFrame:
        """
        Per-skill summary with a "sweet spot" classification.
        Returns an empty DataFrame when csat or aht data is missing.
        """
        df = self.df
        if df["csat_score"].notna().sum() == 0 or df["aht"].notna().sum() == 0:
            return pd.DataFrame(columns=["csat_avg", "aht_avg", "classification"])
        df = df.dropna(subset=["csat_score", "aht"]).copy()
        if df.empty:
            return pd.DataFrame(columns=["csat_avg", "aht_avg", "classification"])
        grouped = df.groupby("queue_skill").agg(
            csat_avg=("csat_score", "mean"),
            aht_avg=("aht", "mean"),
        )
        # Global P40/P60 thresholds used to classify each skill.
        aht_all = df["aht"].astype(float)
        csat_all = df["csat_score"].astype(float)
        aht_p40 = float(np.percentile(aht_all, 40))
        aht_p60 = float(np.percentile(aht_all, 60))
        csat_p40 = float(np.percentile(csat_all, 40))
        csat_p60 = float(np.percentile(csat_all, 60))
        def classify(row) -> str:
            # Short AHT + high CSAT -> automation candidate;
            # long AHT + decent CSAT -> needs a human.
            # NOTE(review): confirm these threshold combinations are the
            # intended business rule.
            csat = row["csat_avg"]
            aht = row["aht_avg"]
            if aht <= aht_p40 and csat >= csat_p60:
                return "ideal_automatizar"
            if aht >= aht_p60 and csat >= csat_p40:
                return "requiere_humano"
            return "neutral"
        grouped["classification"] = grouped.apply(classify, axis=1)
        return grouped.round({"csat_avg": 2, "aht_avg": 2})
    # ------------------------------------------------------------------ #
    # Plots
    # ------------------------------------------------------------------ #
    def plot_csat_vs_aht_scatter(self) -> Axes:
        """
        Scatter plot of CSAT vs AHT per skill.
        Returns an Axes with a text message when there is not enough data.
        """
        df = self.df
        if df["csat_score"].notna().sum() == 0 or df["aht"].notna().sum() == 0:
            fig, ax = plt.subplots()
            ax.text(0.5, 0.5, "Sin datos de CSAT/AHT", ha="center", va="center")
            ax.set_axis_off()
            return ax
        df = df.dropna(subset=["csat_score", "aht"]).copy()
        if df.empty:
            fig, ax = plt.subplots()
            ax.text(0.5, 0.5, "Sin datos de CSAT/AHT", ha="center", va="center")
            ax.set_axis_off()
            return ax
        fig, ax = plt.subplots(figsize=(8, 5))
        for skill, sub in df.groupby("queue_skill"):
            ax.scatter(sub["aht"], sub["csat_score"], label=skill, alpha=0.7)
        ax.set_xlabel("AHT (segundos)")
        ax.set_ylabel("CSAT")
        ax.set_title("CSAT vs AHT por skill")
        ax.grid(alpha=0.3)
        ax.legend(title="Skill", bbox_to_anchor=(1.05, 1), loc="upper left")
        plt.tight_layout()
        return ax
    def plot_csat_distribution(self) -> Axes:
        """
        Histogram of CSAT values.
        Returns an Axes with a text message when csat_score is missing.
        """
        df = self.df
        if "csat_score" not in df.columns or df["csat_score"].notna().sum() == 0:
            fig, ax = plt.subplots()
            ax.text(0.5, 0.5, "Sin datos de CSAT", ha="center", va="center")
            ax.set_axis_off()
            return ax
        df = df.dropna(subset=["csat_score"]).copy()
        if df.empty:
            fig, ax = plt.subplots()
            ax.text(0.5, 0.5, "Sin datos de CSAT", ha="center", va="center")
            ax.set_axis_off()
            return ax
        fig, ax = plt.subplots(figsize=(6, 4))
        ax.hist(df["csat_score"], bins=10, alpha=0.7)
        ax.set_xlabel("CSAT")
        ax.set_ylabel("Frecuencia")
        ax.set_title("Distribución de CSAT")
        ax.grid(axis="y", alpha=0.3)
        return ax

View File

@@ -0,0 +1,268 @@
from __future__ import annotations
from dataclasses import dataclass
from typing import List
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.axes import Axes
REQUIRED_COLUMNS_VOLUMETRIA: List[str] = [
    "interaction_id",
    "datetime_start",
    "queue_skill",
    "channel",
]


@dataclass
class VolumetriaMetrics:
    """
    Volumetry metrics on the current data schema.

    Minimum required columns:
        - interaction_id
        - datetime_start
        - queue_skill
        - channel

    Additional columns may exist but are not needed for these metrics.
    """

    df: pd.DataFrame

    def __post_init__(self) -> None:
        self._validate_columns()
        self._prepare_data()

    # ------------------------------------------------------------------ #
    # Internal helpers
    # ------------------------------------------------------------------ #
    def _validate_columns(self) -> None:
        """Fail fast when a mandatory column is missing."""
        absent = [col for col in REQUIRED_COLUMNS_VOLUMETRIA if col not in self.df.columns]
        if absent:
            raise ValueError(
                f"Faltan columnas obligatorias para VolumetriaMetrics: {absent}"
            )

    def _prepare_data(self) -> None:
        """Coerce dtypes once so every metric can assume clean columns."""
        prepared = self.df.copy()
        # Datetimes: invalid values become NaT instead of raising.
        prepared["datetime_start"] = pd.to_datetime(prepared["datetime_start"], errors="coerce")
        # Normalize the categorical string columns.
        for col in ("queue_skill", "channel"):
            prepared[col] = prepared[col].astype(str).str.strip()
        self.df = prepared

    # ------------------------------------------------------------------ #
    # Useful properties
    # ------------------------------------------------------------------ #
    @property
    def is_empty(self) -> bool:
        """True when the prepared frame holds no data."""
        return bool(self.df.empty)

    # ------------------------------------------------------------------ #
    # Numeric / tabular metrics
    # ------------------------------------------------------------------ #
    def volume_by_channel(self) -> pd.Series:
        """Number of unique interactions per channel, descending."""
        counts = self.df.groupby("channel")["interaction_id"].nunique()
        return counts.sort_values(ascending=False)

    def volume_by_skill(self) -> pd.Series:
        """Number of unique interactions per skill / queue, descending."""
        counts = self.df.groupby("queue_skill")["interaction_id"].nunique()
        return counts.sort_values(ascending=False)

    def channel_distribution_pct(self) -> pd.Series:
        """Percentage share of volume per channel, rounded to 2 decimals."""
        counts = self.volume_by_channel()
        total = counts.sum()
        if total == 0:
            return counts * 0.0
        return (counts / total * 100).round(2)

    def skill_distribution_pct(self) -> pd.Series:
        """Percentage share of volume per skill, rounded to 2 decimals."""
        counts = self.volume_by_skill()
        total = counts.sum()
        if total == 0:
            return counts * 0.0
        return (counts / total * 100).round(2)

    def heatmap_24x7(self) -> pd.DataFrame:
        """
        [weekday x hour] matrix with the number of interactions.
        dayofweek: 0=Monday ... 6=Sunday.
        """
        dated = self.df.dropna(subset=["datetime_start"]).copy()
        if dated.empty:
            # Empty result, but with the expected 7x24 shape.
            return pd.DataFrame(0, index=range(7), columns=range(24))
        dated["dow"] = dated["datetime_start"].dt.dayofweek
        dated["hour"] = dated["datetime_start"].dt.hour
        pivot = dated.pivot_table(
            index="dow",
            columns="hour",
            values="interaction_id",
            aggfunc="nunique",
            fill_value=0,
        )
        # Guarantee the full 7x24 grid even when some slots saw no traffic.
        return (
            pivot.reindex(index=range(7), fill_value=0)
            .reindex(columns=range(24), fill_value=0)
        )

    def monthly_seasonality_cv(self) -> float:
        """
        Coefficient of variation of the monthly volume, in %:
        CV = std / mean * 100. NaN with fewer than two months of data.
        """
        dated = self.df.dropna(subset=["datetime_start"]).copy()
        if dated.empty:
            return float("nan")
        dated["year_month"] = dated["datetime_start"].dt.to_period("M")
        per_month = dated.groupby("year_month")["interaction_id"].nunique().astype(float)
        if len(per_month) < 2:
            return float("nan")
        avg = per_month.mean()
        if avg == 0:
            return float("nan")
        return float(round(per_month.std(ddof=1) / avg * 100, 2))

    def peak_offpeak_ratio(self) -> float:
        """
        Peak-vs-off-peak volume ratio.
        Peak hours are 10:00-19:59; everything else is off-peak.
        """
        dated = self.df.dropna(subset=["datetime_start"]).copy()
        if dated.empty:
            return float("nan")
        dated["hour"] = dated["datetime_start"].dt.hour
        in_peak = dated["hour"].between(10, 19)
        peak_vol = dated.loc[in_peak, "interaction_id"].nunique()
        off_vol = dated.loc[~in_peak, "interaction_id"].nunique()
        if off_vol == 0:
            return float("inf") if peak_vol > 0 else float("nan")
        return float(round(peak_vol / off_vol, 3))

    def concentration_top20_skills_pct(self) -> float:
        """
        % of total volume concentrated in the top 20% of skills
        (ranked by number of unique interactions).
        """
        ranked = (
            self.df.groupby("queue_skill")["interaction_id"]
            .nunique()
            .sort_values(ascending=False)
        )
        if ranked.empty:
            return float("nan")
        # At least one skill is always considered "top".
        top_n = max(1, int(np.ceil(0.2 * len(ranked))))
        total = ranked.sum()
        if total == 0:
            return float("nan")
        return float(round(ranked.head(top_n).sum() / total * 100, 2))

    # ------------------------------------------------------------------ #
    # Plots
    # ------------------------------------------------------------------ #
    def plot_heatmap_24x7(self) -> Axes:
        """
        Heatmap of volume by weekday (0-6) and hour (0-23).
        Returns the Axes so the pipeline can save the figure.
        """
        grid = self.heatmap_24x7()
        fig, ax = plt.subplots(figsize=(10, 4))
        image = ax.imshow(grid.values, aspect="auto", origin="lower")
        ax.set_xticks(range(24))
        ax.set_xticklabels([str(h) for h in range(24)])
        ax.set_yticks(range(7))
        ax.set_yticklabels(["L", "M", "X", "J", "V", "S", "D"])
        ax.set_xlabel("Hora del día")
        ax.set_ylabel("Día de la semana")
        ax.set_title("Volumen por día de la semana y hora")
        plt.colorbar(image, ax=ax, label="Nº interacciones")
        return ax

    def plot_channel_distribution(self) -> Axes:
        """Bar chart of volume per channel."""
        counts = self.volume_by_channel()
        fig, ax = plt.subplots(figsize=(6, 4))
        counts.plot(kind="bar", ax=ax)
        ax.set_xlabel("Canal")
        ax.set_ylabel("Nº interacciones")
        ax.set_title("Volumen por canal")
        ax.grid(axis="y", alpha=0.3)
        return ax

    def plot_skill_pareto(self) -> Axes:
        """Simple volume Pareto per skill (volume bars only)."""
        counts = self.volume_by_skill()
        fig, ax = plt.subplots(figsize=(10, 4))
        counts.plot(kind="bar", ax=ax)
        ax.set_xlabel("Skill / Cola")
        ax.set_ylabel("Nº interacciones")
        ax.set_title("Pareto de volumen por skill")
        ax.grid(axis="y", alpha=0.3)
        plt.xticks(rotation=45, ha="right")
        return ax

View File

@@ -0,0 +1,13 @@
# Public surface of beyond_metrics.dimensions: re-export the four dimensions.
from .Volumetria import VolumetriaMetrics
from .OperationalPerformance import OperationalPerformanceMetrics
from .SatisfactionExperience import SatisfactionExperienceMetrics
from .EconomyCost import EconomyCostMetrics, EconomyConfig

__all__ = [
    # Dimensions
    "VolumetriaMetrics",
    "OperationalPerformanceMetrics",
    "SatisfactionExperienceMetrics",
    "EconomyCostMetrics",
    "EconomyConfig",
]

View File

@@ -0,0 +1,22 @@
# Public surface of beyond_metrics.io: abstract interfaces plus the
# local-filesystem, S3 and Google Drive implementations.
from .base import DataSource, ResultsSink
from .local import LocalDataSource, LocalResultsSink
from .s3 import S3DataSource, S3ResultsSink
from .google_drive import (
    GoogleDriveDataSource,
    GoogleDriveConfig,
    GoogleDriveResultsSink,
    GoogleDriveSinkConfig,
)

__all__ = [
    "DataSource",
    "ResultsSink",
    "LocalDataSource",
    "LocalResultsSink",
    "S3DataSource",
    "S3ResultsSink",
    "GoogleDriveDataSource",
    "GoogleDriveConfig",
    "GoogleDriveResultsSink",
    "GoogleDriveSinkConfig",
]

View File

@@ -0,0 +1,36 @@
from __future__ import annotations
from abc import ABC, abstractmethod
from typing import Any, Dict
import pandas as pd
from matplotlib.figure import Figure
class DataSource(ABC):
    """Read-side interface for tabular (CSV) inputs."""
    @abstractmethod
    def read_csv(self, path: str) -> pd.DataFrame:
        """
        Read a CSV and return it as a DataFrame.

        The meaning of 'path' is implementation-specific:
          - LocalDataSource: a filesystem path
          - S3DataSource: 's3://bucket/key'
        """
        raise NotImplementedError
class ResultsSink(ABC):
    """Write-side interface for analysis results (JSON documents and images)."""
    @abstractmethod
    def write_json(self, path: str, data: Dict[str, Any]) -> None:
        """Persist `data` as a JSON document at 'path'."""
        raise NotImplementedError
    @abstractmethod
    def write_figure(self, path: str, fig: Figure) -> None:
        """Persist a matplotlib figure at 'path'."""
        raise NotImplementedError

View File

@@ -0,0 +1,160 @@
# beyond_metrics/io/google_drive.py
from __future__ import annotations
import io
import json
import re
from dataclasses import dataclass
from pathlib import Path
from typing import Optional, Dict, Any
import pandas as pd
from google.oauth2 import service_account
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload, MediaIoBaseUpload
from .base import DataSource, ResultsSink
GDRIVE_SCOPES = ["https://www.googleapis.com/auth/drive.readonly",
"https://www.googleapis.com/auth/drive.file"]
def _extract_file_id(file_id_or_url: str) -> str:
    """
    Accept either a raw Google Drive file ID (e.g. '1AbC...') or a shared
    Drive URL, and always return the file ID.

    Raises:
        ValueError: when the argument looks like a URL but no ID is found.
    """
    # Anything without an http(s) scheme is treated as a bare ID.
    if "http://" not in file_id_or_url and "https://" not in file_id_or_url:
        return file_id_or_url.strip()
    # Known URL shapes:
    #   https://drive.google.com/file/d/<ID>/view
    #   https://drive.google.com/open?id=<ID>
    for pattern in (r"/d/([a-zA-Z0-9_-]{10,})", r"id=([a-zA-Z0-9_-]{10,})"):
        match = re.search(pattern, file_id_or_url)
        if match:
            return match.group(1)
    raise ValueError(f"No se pudo extraer un file_id de la URL de Google Drive: {file_id_or_url}")
# -------- DataSource --------
@dataclass
class GoogleDriveConfig:
    """Settings for reading from Google Drive with a service account."""
    credentials_path: str  # path to the service-account JSON key file
    impersonate_user: Optional[str] = None  # optional user to impersonate via with_subject()
class GoogleDriveDataSource(DataSource):
    """
    DataSource that reads CSV files from Google Drive.

    `path` may be either a raw Drive file ID or a shared Drive URL
    (see _extract_file_id).
    """
    def __init__(self, config: GoogleDriveConfig) -> None:
        self._config = config
        # The read-only scope is sufficient for downloads.
        self._service = self._build_service(readonly=True)
    def _build_service(self, readonly: bool = True):
        """Build the Drive v3 API client from the service-account credentials."""
        scopes = ["https://www.googleapis.com/auth/drive.readonly"] if readonly else GDRIVE_SCOPES
        creds = service_account.Credentials.from_service_account_file(
            self._config.credentials_path,
            scopes=scopes,
        )
        if self._config.impersonate_user:
            # Act on behalf of the configured user (with_subject delegation).
            creds = creds.with_subject(self._config.impersonate_user)
        service = build("drive", "v3", credentials=creds)
        return service
    def read_csv(self, path: str) -> pd.DataFrame:
        """Download the file identified by `path` (ID or URL) and parse it as CSV."""
        file_id = _extract_file_id(path)
        request = self._service.files().get_media(fileId=file_id)
        fh = io.BytesIO()
        downloader = MediaIoBaseDownload(fh, request)
        done = False
        # Drive downloads are chunked; loop until the final chunk arrives.
        while not done:
            _, done = downloader.next_chunk()
        fh.seek(0)
        df = pd.read_csv(fh)
        return df
# -------- ResultsSink --------
@dataclass
class GoogleDriveSinkConfig:
    """Connection settings for the Google Drive results sink."""
    credentials_path: str  # path to the service-account JSON key file
    base_folder_id: str  # ID of the Drive folder where outputs are created
    impersonate_user: Optional[str] = None  # optional subject for domain-wide delegation
class GoogleDriveResultsSink(ResultsSink):
    """
    ResultsSink that uploads JSON documents and images to a Google Drive folder.

    Note: for simplicity only the basename of `path` is used. Passing
    'data/output/123/results.json' stores the file as 'results.json'
    inside `base_folder_id`.
    """

    def __init__(self, config: GoogleDriveSinkConfig) -> None:
        self._config = config
        self._service = self._build_service()

    def _build_service(self):
        """Create an authenticated Drive v3 client with read/write scopes."""
        creds = service_account.Credentials.from_service_account_file(
            self._config.credentials_path,
            scopes=GDRIVE_SCOPES,
        )
        subject = self._config.impersonate_user
        if subject:
            creds = creds.with_subject(subject)
        return build("drive", "v3", credentials=creds)

    def _upload_bytes(self, data: bytes, mime_type: str, target_path: str) -> str:
        """Upload an in-memory payload to Drive and return the new file id."""
        metadata = {
            "name": Path(target_path).name,
            "parents": [self._config.base_folder_id],
        }
        media = MediaIoBaseUpload(io.BytesIO(data), mimetype=mime_type, resumable=False)
        response = (
            self._service.files()
            .create(body=metadata, media_body=media, fields="id")
            .execute()
        )
        return response["id"]

    def write_json(self, path: str, data: Dict[str, Any]) -> None:
        """Serialize `data` as pretty-printed UTF-8 JSON and upload it."""
        payload = json.dumps(data, ensure_ascii=False, indent=2).encode("utf-8")
        self._upload_bytes(payload, "application/json", path)

    def write_figure(self, path: str, fig) -> None:
        """Render a matplotlib figure to PNG and upload it."""
        from matplotlib.figure import Figure
        if not isinstance(fig, Figure):
            raise TypeError("write_figure espera un matplotlib.figure.Figure")
        png = io.BytesIO()
        fig.savefig(png, format="png", bbox_inches="tight")
        self._upload_bytes(png.getvalue(), "image/png", path)

View File

@@ -0,0 +1,57 @@
from __future__ import annotations
import json
import os
from typing import Any, Dict
import pandas as pd
from matplotlib.figure import Figure
from .base import DataSource, ResultsSink
class LocalDataSource(DataSource):
    """
    DataSource that reads CSV files from the local filesystem.

    Relative paths are resolved against `base_dir`; absolute paths are
    used as-is.
    """

    def __init__(self, base_dir: str = ".") -> None:
        self.base_dir = base_dir

    def _full_path(self, path: str) -> str:
        # Absolute paths bypass base_dir entirely.
        return path if os.path.isabs(path) else os.path.join(self.base_dir, path)

    def read_csv(self, path: str) -> pd.DataFrame:
        """Load the CSV at `path` (resolved against base_dir) as a DataFrame."""
        return pd.read_csv(self._full_path(path))
class LocalResultsSink(ResultsSink):
    """
    ResultsSink that writes JSON documents and images to the local filesystem.

    Relative paths are resolved against `base_dir`; parent directories are
    created on demand.
    """

    def __init__(self, base_dir: str = ".") -> None:
        self.base_dir = base_dir

    def _full_path(self, path: str) -> str:
        """Resolve `path` against base_dir and ensure its parent directory exists."""
        if os.path.isabs(path):
            full = path
        else:
            full = os.path.join(self.base_dir, path)
        parent = os.path.dirname(full)
        # BUGFIX: os.makedirs("") raises FileNotFoundError. A bare filename
        # (e.g. base_dir="" and path="results.json") has an empty dirname,
        # so only create directories when there is a parent component.
        if parent:
            os.makedirs(parent, exist_ok=True)
        return full

    def write_json(self, path: str, data: Dict[str, Any]) -> None:
        """Write `data` as pretty-printed UTF-8 JSON."""
        full = self._full_path(path)
        with open(full, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=2)

    def write_figure(self, path: str, fig: Figure) -> None:
        """Save a matplotlib figure; the format is inferred from the extension."""
        full = self._full_path(path)
        fig.savefig(full, bbox_inches="tight")

View File

@@ -0,0 +1,62 @@
from __future__ import annotations
import io
import json
from typing import Any, Dict, Tuple
import boto3
import pandas as pd
from matplotlib.figure import Figure
from .base import DataSource, ResultsSink
def _split_s3_path(path: str) -> Tuple[str, str]:
"""
Convierte 's3://bucket/key' en (bucket, key).
"""
if not path.startswith("s3://"):
raise ValueError(f"Ruta S3 inválida: {path}")
without_scheme = path[len("s3://") :]
parts = without_scheme.split("/", 1)
if len(parts) != 2:
raise ValueError(f"Ruta S3 inválida: {path}")
return parts[0], parts[1]
class S3DataSource(DataSource):
    """DataSource that fetches CSV objects from S3 via boto3."""

    def __init__(self, boto3_client: Any | None = None) -> None:
        # The client is injectable for testing; defaults to a standard one.
        self.s3 = boto3_client or boto3.client("s3")

    def read_csv(self, path: str) -> pd.DataFrame:
        """Read 's3://bucket/key' and parse the object body as CSV."""
        bucket, key = _split_s3_path(path)
        response = self.s3.get_object(Bucket=bucket, Key=key)
        return pd.read_csv(io.BytesIO(response["Body"].read()))
class S3ResultsSink(ResultsSink):
    """ResultsSink that stores JSON documents and PNG figures in S3."""

    def __init__(self, boto3_client: Any | None = None) -> None:
        # Injectable client for testing; defaults to a standard boto3 client.
        self.s3 = boto3_client or boto3.client("s3")

    def write_json(self, path: str, data: Dict[str, Any]) -> None:
        """Serialize `data` as UTF-8 JSON and upload it to 's3://bucket/key'."""
        bucket, key = _split_s3_path(path)
        payload = json.dumps(data, ensure_ascii=False, indent=2).encode("utf-8")
        self.s3.put_object(Bucket=bucket, Key=key, Body=payload)

    def write_figure(self, path: str, fig: Figure) -> None:
        """Render `fig` to PNG and upload it with the image content type."""
        bucket, key = _split_s3_path(path)
        png = io.BytesIO()
        fig.savefig(png, format="png", bbox_inches="tight")
        self.s3.put_object(
            Bucket=bucket,
            Key=key,
            Body=png.getvalue(),
            ContentType="image/png",
        )

View File

@@ -0,0 +1,291 @@
from __future__ import annotations
from dataclasses import dataclass
from datetime import datetime
from importlib import import_module
from typing import Any, Dict, List, Mapping, Optional, cast, Callable
import logging
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.axes import Axes
from matplotlib.figure import Figure
from .io import (
DataSource,
ResultsSink,
)
LOGGER = logging.getLogger(__name__)
def setup_basic_logging(level: str = "INFO") -> None:
    """
    Configure root logging with a sensible default format, for use from scripts.

    Unknown level names silently fall back to INFO.
    """
    resolved_level = getattr(logging, level.upper(), logging.INFO)
    logging.basicConfig(
        level=resolved_level,
        format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
    )
def _import_class(path: str) -> type:
    """
    Dynamically import a class from a dotted path, e.g.
    "beyond_metrics.dimensions.VolumetriaMetrics".
    """
    LOGGER.debug("Importando clase %s", path)
    # Everything before the last dot is the module; the rest is the class name.
    module_name, class_name = path.rsplit(".", 1)
    return getattr(import_module(module_name), class_name)
def _serialize_for_json(obj: Any) -> Any:
"""
Convierte objetos típicos de numpy/pandas en tipos JSON-friendly.
"""
if obj is None or isinstance(obj, (str, int, float, bool)):
return obj
if isinstance(obj, (np.integer, np.floating)):
return float(obj)
if isinstance(obj, pd.DataFrame):
return obj.to_dict(orient="records")
if isinstance(obj, pd.Series):
return obj.to_list()
if isinstance(obj, (list, tuple)):
return [_serialize_for_json(x) for x in obj]
if isinstance(obj, dict):
return {str(k): _serialize_for_json(v) for k, v in obj.items()}
return str(obj)
PostRunCallback = Callable[[Dict[str, Any], str, ResultsSink], None]
@dataclass
class BeyondMetricsPipeline:
    """
    Main BeyondMetrics pipeline.

    - Reads a CSV from a DataSource (local, S3, Google Drive, ...).
    - Runs the dimensions declared in `dimensions_config`.
    - Serializes numeric/tabular results to JSON.
    - Saves a PNG for every metric method whose name starts with 'plot_'.
    """

    datasource: DataSource  # source of the input CSV
    sink: ResultsSink  # destination for results.json and figures
    dimensions_config: Mapping[str, Any]  # the 'dimensions' block of the config
    dimension_params: Optional[Mapping[str, Mapping[str, Any]]] = None  # extra constructor kwargs per dimension
    post_run: Optional[List[PostRunCallback]] = None  # hooks executed after results are persisted

    def run(
        self,
        input_path: str,
        run_dir: str,
        *,
        write_results_json: bool = True,
    ) -> Dict[str, Any]:
        """
        Execute all enabled dimensions over the input CSV.

        Args:
            input_path: path/URL of the input CSV (interpreted by the DataSource).
            run_dir: base output path for this run (results.json and PNGs go under it).
            write_results_json: if True, persist '<run_dir>/results.json'.

        Returns:
            Dict of {dimension name: {metric name: serialized result}}.

        Raises:
            ValueError: if the dimensions configuration is malformed.
        """
        LOGGER.info("Inicio de ejecución de BeyondMetricsPipeline")
        LOGGER.info("Leyendo CSV de entrada: %s", input_path)
        # 1) Read input data
        df = self.datasource.read_csv(input_path)
        LOGGER.info("CSV leído con %d filas y %d columnas", df.shape[0], df.shape[1])
        # 2) Normalize the base output path for this run
        run_base = run_dir.rstrip("/")
        LOGGER.info("Ruta base de esta ejecución: %s", run_base)
        # 3) Run configured dimensions
        dimensions_cfg = self.dimensions_config
        if not isinstance(dimensions_cfg, dict):
            raise ValueError("El bloque 'dimensions' debe ser un dict.")
        all_results: Dict[str, Any] = {}
        for dim_name, dim_cfg in dimensions_cfg.items():
            if not isinstance(dim_cfg, dict):
                raise ValueError(f"Config inválida para dimensión '{dim_name}' (debe ser dict).")
            if not dim_cfg.get("enabled", True):
                LOGGER.info("Dimensión '%s' desactivada; se omite.", dim_name)
                continue
            class_path = dim_cfg.get("class")
            if not class_path:
                raise ValueError(f"Falta 'class' en la dimensión '{dim_name}'.")
            metrics: List[str] = dim_cfg.get("metrics", [])
            if not metrics:
                LOGGER.info("Dimensión '%s' sin métricas configuradas; se omite.", dim_name)
                continue
            cls = _import_class(class_path)
            extra_kwargs = {}
            if self.dimension_params is not None:
                extra_kwargs = self.dimension_params.get(dim_name, {}) or {}
            # Dimension classes take the DataFrame in their constructor.
            instance = cls(df, **extra_kwargs)
            dim_results: Dict[str, Any] = {}
            for metric_name in metrics:
                LOGGER.info(" - Ejecutando métrica '%s.%s'", dim_name, metric_name)
                result = self._execute_metric(instance, metric_name, run_base, dim_name)
                dim_results[metric_name] = result
            all_results[dim_name] = dim_results
        # 4) Optionally persist the aggregated results as JSON
        if write_results_json:
            results_json_path = f"{run_base}/results.json"
            LOGGER.info("Guardando resultados en JSON: %s", results_json_path)
            self.sink.write_json(results_json_path, all_results)
        # 5) Run post-run callbacks (scorers, AI agents, ...).
        #    Failures are logged but never abort the pipeline.
        if self.post_run:
            LOGGER.info("Ejecutando %d callbacks post-run...", len(self.post_run))
            for cb in self.post_run:
                try:
                    LOGGER.info("Ejecutando post-run callback: %s", cb)
                    cb(all_results, run_base, self.sink)
                except Exception:
                    LOGGER.exception("Error ejecutando post-run callback %s", cb)
        LOGGER.info("Ejecución completada correctamente.")
        return all_results

    def _execute_metric(
        self,
        instance: Any,
        metric_name: str,
        run_base: str,
        dim_name: str,
    ) -> Any:
        """
        Run one metric method on a dimension instance.

        - Names starting with 'plot_' must return a matplotlib Axes; the
          figure is saved as a PNG through the sink and
          {"type": "image", "path": ...} is returned.
        - Any other return value is serialized to JSON-friendly types.
        - Categorical Series of the 'volumetry' dimension (per skill/channel)
          are returned as {"labels": [...], "values": [...]} so the frontend
          keeps track of which value belongs to which label.

        Raises:
            ValueError: if the metric does not exist on the instance.
            TypeError: if a 'plot_' metric does not return an Axes.
        """
        method = getattr(instance, metric_name, None)
        if method is None or not callable(method):
            raise ValueError(
                f"La métrica '{metric_name}' no existe en {type(instance).__name__}"
            )
        # Plot metrics: render and persist the figure.
        if metric_name.startswith("plot_"):
            ax = method()
            if not isinstance(ax, Axes):
                raise TypeError(
                    f"La métrica '{metric_name}' de '{type(instance).__name__}' "
                    f"debería devolver un matplotlib.axes.Axes"
                )
            fig = ax.get_figure()
            if fig is None:
                raise RuntimeError(
                    "Axes.get_figure() devolvió None, lo cual no debería pasar."
                )
            fig = cast(Figure, fig)
            filename = f"{dim_name}_{metric_name}.png"
            # BUGFIX: the path previously interpolated a stray placeholder
            # literal instead of the computed filename, so every plot of a
            # run was written to the same bogus path.
            img_path = f"{run_base}/{filename}"
            LOGGER.debug("Guardando figura en %s", img_path)
            self.sink.write_figure(img_path, fig)
            plt.close(fig)
            return {
                "type": "image",
                "path": img_path,
            }
        # Numeric/tabular metrics
        value = method()
        # Special case: categorical volumetry Series (per skill / channel).
        # Return {"labels": [...], "values": [...]} to preserve the index
        # labels in the JSON output.
        if (
            dim_name == "volumetry"
            and isinstance(value, pd.Series)
            and metric_name
            in {
                "volume_by_channel",
                "volume_by_skill",
                "channel_distribution_pct",
                "skill_distribution_pct",
            }
        ):
            labels = [str(idx) for idx in value.index.tolist()]
            # Ensure all values are JSON-friendly numbers.
            values = [float(v) for v in value.astype(float).tolist()]
            return {
                "labels": labels,
                "values": values,
            }
        return _serialize_for_json(value)
def load_dimensions_config(path: str) -> Dict[str, Any]:
    """
    Load a JSON configuration file and return its 'dimensions' block.

    Raises:
        ValueError: if the file does not contain a 'dimensions' key.
    """
    import json
    from pathlib import Path

    raw_config = json.loads(Path(path).read_text(encoding="utf-8"))
    dimensions = raw_config.get("dimensions")
    if dimensions is None:
        raise ValueError("El fichero de configuración debe contener un bloque 'dimensions'.")
    return dimensions
def build_pipeline(
    dimensions_config_path: str,
    datasource: DataSource,
    sink: ResultsSink,
    dimension_params: Optional[Mapping[str, Mapping[str, Any]]] = None,
    post_run: Optional[List[PostRunCallback]] = None,
) -> BeyondMetricsPipeline:
    """
    Assemble a BeyondMetricsPipeline from its parts.

    Args:
        dimensions_config_path: path to the JSON file with dimensions/metrics.
        datasource: an already-constructed DataSource (local/S3/Drive).
        sink: an already-constructed ResultsSink (local/S3/Drive).
        dimension_params: optional per-dimension constructor kwargs.
        post_run: optional callbacks executed at the end of each run
            (useful for scorers, AI agents, etc.).
    """
    return BeyondMetricsPipeline(
        datasource=datasource,
        sink=sink,
        dimensions_config=load_dimensions_config(dimensions_config_path),
        dimension_params=dimension_params,
        post_run=post_run,
    )