BeyondCXAnalytics-Demo/backend/beyond_metrics/dimensions/Volumetria.py

from __future__ import annotations

from dataclasses import dataclass
from typing import List

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.axes import Axes


REQUIRED_COLUMNS_VOLUMETRIA: List[str] = [
    "interaction_id",
    "datetime_start",
    "queue_skill",
    "channel",
]


@dataclass
class VolumetriaMetrics:
    """
    Métricas de volumetría basadas en el nuevo esquema de datos.

    Columnas mínimas requeridas:
    - interaction_id
    - datetime_start
    - queue_skill
    - channel

    Otras columnas pueden existir pero no son necesarias para estas métricas.
    """

    df: pd.DataFrame

    def __post_init__(self) -> None:
        self._validate_columns()
        self._prepare_data()

    # ------------------------------------------------------------------ #
    # Helpers internos
    # ------------------------------------------------------------------ #
    def _validate_columns(self) -> None:
        missing = [c for c in REQUIRED_COLUMNS_VOLUMETRIA if c not in self.df.columns]
        if missing:
            raise ValueError(
                f"Faltan columnas obligatorias para VolumetriaMetrics: {missing}"
            )

    def _prepare_data(self) -> None:
        df = self.df.copy()

        # Asegurar tipo datetime
        df["datetime_start"] = pd.to_datetime(df["datetime_start"], errors="coerce")

        # Normalizar strings
        df["queue_skill"] = df["queue_skill"].astype(str).str.strip()
        df["channel"] = df["channel"].astype(str).str.strip()

        # Guardamos el df preparado
        self.df = df

    # ------------------------------------------------------------------ #
    # Propiedades útiles
    # ------------------------------------------------------------------ #
    @property
    def is_empty(self) -> bool:
        return self.df.empty

    # ------------------------------------------------------------------ #
    # Métricas numéricas / tabulares
    # ------------------------------------------------------------------ #
    def volume_by_channel(self) -> pd.Series:
        """
        Nº de interacciones por canal.
        """
        return self.df.groupby("channel")["interaction_id"].nunique().sort_values(
            ascending=False
        )

    def volume_by_skill(self) -> pd.Series:
        """
        Nº de interacciones por skill / cola.
        """
        return self.df.groupby("queue_skill")["interaction_id"].nunique().sort_values(
            ascending=False
        )

    def channel_distribution_pct(self) -> pd.Series:
        """
        Distribución porcentual del volumen por canal.
        """
        counts = self.volume_by_channel()
        total = counts.sum()
        if total == 0:
            return counts * 0.0
        return (counts / total * 100).round(2)

    def skill_distribution_pct(self) -> pd.Series:
        """
        Distribución porcentual del volumen por skill.
        """
        counts = self.volume_by_skill()
        total = counts.sum()
        if total == 0:
            return counts * 0.0
        return (counts / total * 100).round(2)

    def heatmap_24x7(self) -> pd.DataFrame:
        """
        Matriz [día_semana x hora] con nº de interacciones.
        dayofweek: 0=Lunes ... 6=Domingo
        """
        df = self.df.dropna(subset=["datetime_start"]).copy()
        if df.empty:
            # Devolvemos un df vacío pero con índice/columnas esperadas
            idx = range(7)
            cols = range(24)
            return pd.DataFrame(0, index=idx, columns=cols)

        df["dow"] = df["datetime_start"].dt.dayofweek
        df["hour"] = df["datetime_start"].dt.hour

        pivot = (
            df.pivot_table(
                index="dow",
                columns="hour",
                values="interaction_id",
                aggfunc="nunique",
                fill_value=0,
            )
            .reindex(index=range(7), fill_value=0)
            .reindex(columns=range(24), fill_value=0)
        )

        return pivot

    def monthly_seasonality_cv(self) -> float:
        """
        Coeficiente de variación del volumen mensual.
        CV = std / mean (en %).
        """
        df = self.df.dropna(subset=["datetime_start"]).copy()
        if df.empty:
            return float("nan")

        df["year_month"] = df["datetime_start"].dt.to_period("M")
        monthly_counts = (
            df.groupby("year_month")["interaction_id"].nunique().astype(float)
        )

        if len(monthly_counts) < 2:
            return float("nan")

        mean = monthly_counts.mean()
        std = monthly_counts.std(ddof=1)
        if mean == 0:
            return float("nan")

        return float(round(std / mean * 100, 2))

    def peak_offpeak_ratio(self) -> float:
        """
        Ratio de volumen entre horas pico y valle.

        Definimos pico como horas 10:00–19:59, resto valle.
        """
        df = self.df.dropna(subset=["datetime_start"]).copy()
        if df.empty:
            return float("nan")

        df["hour"] = df["datetime_start"].dt.hour

        peak_hours = list(range(10, 20))
        is_peak = df["hour"].isin(peak_hours)

        peak_vol = df.loc[is_peak, "interaction_id"].nunique()
        off_vol = df.loc[~is_peak, "interaction_id"].nunique()

        if off_vol == 0:
            return float("inf") if peak_vol > 0 else float("nan")

        return float(round(peak_vol / off_vol, 3))

    def concentration_top20_skills_pct(self) -> float:
        """
        % del volumen concentrado en el top 20% de skills (por nº de interacciones).
        """
        counts = (
            self.df.groupby("queue_skill")["interaction_id"].nunique().sort_values(
                ascending=False
            )
        )

        n_skills = len(counts)
        if n_skills == 0:
            return float("nan")

        top_n = max(1, int(np.ceil(0.2 * n_skills)))
        top_vol = counts.head(top_n).sum()
        total = counts.sum()

        if total == 0:
            return float("nan")

        return float(round(top_vol / total * 100, 2))

    # ------------------------------------------------------------------ #
    # Plots
    # ------------------------------------------------------------------ #
    def plot_heatmap_24x7(self) -> Axes:
        """
        Heatmap de volumen por día de la semana (0-6) y hora (0-23).
        Devuelve Axes para que el pipeline pueda guardar la figura.
        """
        data = self.heatmap_24x7()

        fig, ax = plt.subplots(figsize=(10, 4))
        im = ax.imshow(data.values, aspect="auto", origin="lower")

        ax.set_xticks(range(24))
        ax.set_xticklabels([str(h) for h in range(24)])

        ax.set_yticks(range(7))
        ax.set_yticklabels(["L", "M", "X", "J", "V", "S", "D"])


        ax.set_xlabel("Hora del día")
        ax.set_ylabel("Día de la semana")
        ax.set_title("Volumen por día de la semana y hora")

        plt.colorbar(im, ax=ax, label="Nº interacciones")

        return ax

    def plot_channel_distribution(self) -> Axes:
        """
        Distribución de volumen por canal.
        """
        series = self.volume_by_channel()

        fig, ax = plt.subplots(figsize=(6, 4))
        series.plot(kind="bar", ax=ax)

        ax.set_xlabel("Canal")
        ax.set_ylabel("Nº interacciones")
        ax.set_title("Volumen por canal")
        ax.grid(axis="y", alpha=0.3)

        return ax

    def plot_skill_pareto(self) -> Axes:
        """
        Pareto simple de volumen por skill (solo barras de volumen).
        """
        series = self.volume_by_skill()

        fig, ax = plt.subplots(figsize=(10, 4))
        series.plot(kind="bar", ax=ax)

        ax.set_xlabel("Skill / Cola")
        ax.set_ylabel("Nº interacciones")
        ax.set_title("Pareto de volumen por skill")
        ax.grid(axis="y", alpha=0.3)

        plt.xticks(rotation=45, ha="right")

        return ax