Initial commit: frontend + backend integration

This commit is contained in:
Ignacio
2025-12-29 18:12:32 +01:00
commit 2cd6d6b95c
146 changed files with 31503 additions and 0 deletions

View File

@@ -0,0 +1,22 @@
from .base import DataSource, ResultsSink
from .local import LocalDataSource, LocalResultsSink
from .s3 import S3DataSource, S3ResultsSink
from .google_drive import (
GoogleDriveDataSource,
GoogleDriveConfig,
GoogleDriveResultsSink,
GoogleDriveSinkConfig,
)
# Public API of the io package: the abstract DataSource / ResultsSink
# interfaces plus their local-disk, S3, and Google Drive implementations.
__all__ = [
    "DataSource",
    "ResultsSink",
    "LocalDataSource",
    "LocalResultsSink",
    "S3DataSource",
    "S3ResultsSink",
    "GoogleDriveDataSource",
    "GoogleDriveConfig",
    "GoogleDriveResultsSink",
    "GoogleDriveSinkConfig",
]

View File

@@ -0,0 +1,36 @@
from __future__ import annotations
from abc import ABC, abstractmethod
from typing import Any, Dict
import pandas as pd
from matplotlib.figure import Figure
class DataSource(ABC):
    """Read-side interface: anything that can load a CSV into a DataFrame."""

    @abstractmethod
    def read_csv(self, path: str) -> pd.DataFrame:
        """Load the CSV addressed by *path* and return it as a DataFrame.

        The interpretation of *path* is implementation-specific:
        - LocalDataSource: a filesystem path
        - S3DataSource: an 's3://bucket/key' URI
        """
        raise NotImplementedError
class ResultsSink(ABC):
    """Write-side interface: persists result payloads (JSON and figures)."""

    @abstractmethod
    def write_json(self, path: str, data: Dict[str, Any]) -> None:
        """Serialize *data* as JSON at *path*."""
        raise NotImplementedError

    @abstractmethod
    def write_figure(self, path: str, fig: Figure) -> None:
        """Persist the matplotlib figure *fig* at *path*."""
        raise NotImplementedError

View File

@@ -0,0 +1,160 @@
# beyond_metrics/io/google_drive.py
from __future__ import annotations
import io
import json
import re
from dataclasses import dataclass
from pathlib import Path
from typing import Optional, Dict, Any
import pandas as pd
from google.oauth2 import service_account
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload, MediaIoBaseUpload
from .base import DataSource, ResultsSink
GDRIVE_SCOPES = ["https://www.googleapis.com/auth/drive.readonly",
"https://www.googleapis.com/auth/drive.file"]
def _extract_file_id(file_id_or_url: str) -> str:
"""
Acepta:
- un ID directo de Google Drive (ej: '1AbC...')
- una URL de Google Drive compartida
y devuelve siempre el file_id.
"""
if "http://" not in file_id_or_url and "https://" not in file_id_or_url:
return file_id_or_url.strip()
patterns = [
r"/d/([a-zA-Z0-9_-]{10,})", # https://drive.google.com/file/d/<ID>/view
r"id=([a-zA-Z0-9_-]{10,})", # https://drive.google.com/open?id=<ID>
]
for pattern in patterns:
m = re.search(pattern, file_id_or_url)
if m:
return m.group(1)
raise ValueError(f"No se pudo extraer un file_id de la URL de Google Drive: {file_id_or_url}")
# -------- DataSource --------
@dataclass
class GoogleDriveConfig:
    """Settings for reading from Google Drive with a service account."""

    credentials_path: str  # path to the service-account JSON key file
    impersonate_user: Optional[str] = None  # user to impersonate (domain-wide delegation), if any
class GoogleDriveDataSource(DataSource):
    """DataSource implementation that downloads CSV files from Google Drive."""

    def __init__(self, config: GoogleDriveConfig) -> None:
        self._config = config
        # Reading never needs write access, so request the read-only scope.
        self._service = self._build_service(readonly=True)

    def _build_service(self, readonly: bool = True):
        """Build an authenticated Drive v3 client from the service-account key."""
        if readonly:
            scopes = ["https://www.googleapis.com/auth/drive.readonly"]
        else:
            scopes = GDRIVE_SCOPES
        creds = service_account.Credentials.from_service_account_file(
            self._config.credentials_path,
            scopes=scopes,
        )
        if self._config.impersonate_user:
            # Domain-wide delegation: act on behalf of this user.
            creds = creds.with_subject(self._config.impersonate_user)
        return build("drive", "v3", credentials=creds)

    def read_csv(self, path: str) -> pd.DataFrame:
        """Download the file (bare id or share URL) and parse it as CSV."""
        request = self._service.files().get_media(fileId=_extract_file_id(path))
        buffer = io.BytesIO()
        downloader = MediaIoBaseDownload(buffer, request)
        finished = False
        while not finished:
            _, finished = downloader.next_chunk()
        buffer.seek(0)
        return pd.read_csv(buffer)
# -------- ResultsSink --------
@dataclass
class GoogleDriveSinkConfig:
    """Settings for uploading results into a Google Drive folder."""

    credentials_path: str  # path to the service-account JSON key file
    base_folder_id: str  # id of the Drive folder that receives the uploads
    impersonate_user: Optional[str] = None  # user to impersonate (domain-wide delegation), if any
class GoogleDriveResultsSink(ResultsSink):
    """ResultsSink that uploads JSON files and images into one Drive folder.

    Only the basename of *path* is kept: 'data/output/123/results.json'
    is stored as 'results.json' inside base_folder_id.
    """

    def __init__(self, config: GoogleDriveSinkConfig) -> None:
        self._config = config
        self._service = self._build_service()

    def _build_service(self):
        """Build a Drive v3 client with read/write scopes."""
        creds = service_account.Credentials.from_service_account_file(
            self._config.credentials_path,
            scopes=GDRIVE_SCOPES,
        )
        if self._config.impersonate_user:
            # Domain-wide delegation: act on behalf of this user.
            creds = creds.with_subject(self._config.impersonate_user)
        return build("drive", "v3", credentials=creds)

    def _upload_bytes(self, data: bytes, mime_type: str, target_path: str) -> str:
        """Upload an in-memory payload to Drive and return the new file id."""
        media = MediaIoBaseUpload(io.BytesIO(data), mimetype=mime_type, resumable=False)
        metadata = {
            "name": Path(target_path).name,  # basename only (see class docstring)
            "parents": [self._config.base_folder_id],
        }
        response = (
            self._service.files()
            .create(body=metadata, media_body=media, fields="id")
            .execute()
        )
        return response["id"]

    def write_json(self, path: str, data: Dict[str, Any]) -> None:
        """Serialize *data* to UTF-8 JSON and upload it."""
        encoded = json.dumps(data, ensure_ascii=False, indent=2).encode("utf-8")
        self._upload_bytes(encoded, "application/json", path)

    def write_figure(self, path: str, fig) -> None:
        """Render *fig* to an in-memory PNG and upload it."""
        from matplotlib.figure import Figure
        if not isinstance(fig, Figure):
            raise TypeError("write_figure espera un matplotlib.figure.Figure")
        buffer = io.BytesIO()
        fig.savefig(buffer, format="png", bbox_inches="tight")
        self._upload_bytes(buffer.getvalue(), "image/png", path)

View File

@@ -0,0 +1,57 @@
from __future__ import annotations
import json
import os
from typing import Any, Dict
import pandas as pd
from matplotlib.figure import Figure
from .base import DataSource, ResultsSink
class LocalDataSource(DataSource):
    """DataSource that reads CSV files from the local filesystem.

    Relative paths are resolved against *base_dir*; absolute paths are
    used unchanged.
    """

    def __init__(self, base_dir: str = ".") -> None:
        self.base_dir = base_dir

    def _full_path(self, path: str) -> str:
        """Resolve *path* against base_dir unless it is already absolute."""
        return path if os.path.isabs(path) else os.path.join(self.base_dir, path)

    def read_csv(self, path: str) -> pd.DataFrame:
        return pd.read_csv(self._full_path(path))
class LocalResultsSink(ResultsSink):
    """ResultsSink that writes JSON and figure files to the local filesystem.

    Relative paths are resolved against *base_dir*; parent directories of
    the target file are created on demand.
    """

    def __init__(self, base_dir: str = ".") -> None:
        self.base_dir = base_dir

    def _full_path(self, path: str) -> str:
        """Resolve *path* against base_dir and ensure its parent dir exists."""
        if os.path.isabs(path):
            full = path
        else:
            full = os.path.join(self.base_dir, path)
        parent = os.path.dirname(full)
        # Guard against an empty dirname (bare filename with base_dir=""):
        # os.makedirs("") raises FileNotFoundError.
        if parent:
            os.makedirs(parent, exist_ok=True)
        return full

    def write_json(self, path: str, data: Dict[str, Any]) -> None:
        """Write *data* as pretty-printed UTF-8 JSON at *path*."""
        full = self._full_path(path)
        with open(full, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=2)

    def write_figure(self, path: str, fig: Figure) -> None:
        """Save the matplotlib figure; format is inferred from the extension."""
        full = self._full_path(path)
        fig.savefig(full, bbox_inches="tight")

View File

@@ -0,0 +1,62 @@
from __future__ import annotations
import io
import json
from typing import Any, Dict, Tuple
import boto3
import pandas as pd
from matplotlib.figure import Figure
from .base import DataSource, ResultsSink
def _split_s3_path(path: str) -> Tuple[str, str]:
"""
Convierte 's3://bucket/key' en (bucket, key).
"""
if not path.startswith("s3://"):
raise ValueError(f"Ruta S3 inválida: {path}")
without_scheme = path[len("s3://") :]
parts = without_scheme.split("/", 1)
if len(parts) != 2:
raise ValueError(f"Ruta S3 inválida: {path}")
return parts[0], parts[1]
class S3DataSource(DataSource):
    """DataSource that reads CSV objects from S3 via boto3."""

    def __init__(self, boto3_client: Any | None = None) -> None:
        # Accepting an injected client keeps this usable without AWS creds in tests.
        self.s3 = boto3_client or boto3.client("s3")

    def read_csv(self, path: str) -> pd.DataFrame:
        """Fetch the object at 's3://bucket/key' and parse its body as CSV."""
        bucket, key = _split_s3_path(path)
        response = self.s3.get_object(Bucket=bucket, Key=key)
        return pd.read_csv(io.BytesIO(response["Body"].read()))
class S3ResultsSink(ResultsSink):
    """ResultsSink that uploads JSON documents and PNG figures to S3."""

    def __init__(self, boto3_client: Any | None = None) -> None:
        # Accepting an injected client keeps this usable without AWS creds in tests.
        self.s3 = boto3_client or boto3.client("s3")

    def write_json(self, path: str, data: Dict[str, Any]) -> None:
        """Serialize *data* as UTF-8 JSON and put it at 's3://bucket/key'."""
        bucket, key = _split_s3_path(path)
        payload = json.dumps(data, ensure_ascii=False, indent=2).encode("utf-8")
        self.s3.put_object(Bucket=bucket, Key=key, Body=payload)

    def write_figure(self, path: str, fig: Figure) -> None:
        """Render *fig* as an in-memory PNG and put it at 's3://bucket/key'."""
        bucket, key = _split_s3_path(path)
        buffer = io.BytesIO()
        fig.savefig(buffer, format="png", bbox_inches="tight")
        self.s3.put_object(
            Bucket=bucket,
            Key=key,
            Body=buffer.getvalue(),
            ContentType="image/png",
        )