Initial commit - ACME demo version
This commit is contained in:
22
backend/beyond_metrics/io/__init__.py
Normal file
22
backend/beyond_metrics/io/__init__.py
Normal file
@@ -0,0 +1,22 @@
|
||||
from .base import DataSource, ResultsSink
|
||||
from .local import LocalDataSource, LocalResultsSink
|
||||
from .s3 import S3DataSource, S3ResultsSink
|
||||
from .google_drive import (
|
||||
GoogleDriveDataSource,
|
||||
GoogleDriveConfig,
|
||||
GoogleDriveResultsSink,
|
||||
GoogleDriveSinkConfig,
|
||||
)
|
||||
|
||||
# Public API of the io package, grouped by storage backend.
__all__ = [
    # Abstract interfaces
    "DataSource",
    "ResultsSink",
    # Local file system
    "LocalDataSource",
    "LocalResultsSink",
    # Amazon S3
    "S3DataSource",
    "S3ResultsSink",
    # Google Drive
    "GoogleDriveDataSource",
    "GoogleDriveConfig",
    "GoogleDriveResultsSink",
    "GoogleDriveSinkConfig",
]
|
||||
36
backend/beyond_metrics/io/base.py
Normal file
36
backend/beyond_metrics/io/base.py
Normal file
@@ -0,0 +1,36 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Any, Dict
|
||||
|
||||
import pandas as pd
|
||||
from matplotlib.figure import Figure
|
||||
|
||||
|
||||
class DataSource(ABC):
    """Read-side interface: load CSV data as a DataFrame."""

    @abstractmethod
    def read_csv(self, path: str) -> pd.DataFrame:
        """
        Read a CSV and return its contents as a DataFrame.

        How ``path`` is interpreted is up to the implementation:
        - LocalDataSource: a path on the local file system
        - S3DataSource: an 's3://bucket/key' location
        """
        raise NotImplementedError
|
||||
|
||||
|
||||
class ResultsSink(ABC):
    """Write-side interface: persist results as JSON documents and images."""

    @abstractmethod
    def write_json(self, path: str, data: Dict[str, Any]) -> None:
        """Serialize ``data`` (a dict) as JSON and store it at ``path``."""
        raise NotImplementedError

    @abstractmethod
    def write_figure(self, path: str, fig: Figure) -> None:
        """Store the matplotlib figure ``fig`` at ``path``."""
        raise NotImplementedError
|
||||
160
backend/beyond_metrics/io/google_drive.py
Normal file
160
backend/beyond_metrics/io/google_drive.py
Normal file
@@ -0,0 +1,160 @@
|
||||
# beyond_metrics/io/google_drive.py
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
import json
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Optional, Dict, Any
|
||||
|
||||
import pandas as pd
|
||||
from google.oauth2 import service_account
|
||||
from googleapiclient.discovery import build
|
||||
from googleapiclient.http import MediaIoBaseDownload, MediaIoBaseUpload
|
||||
|
||||
from .base import DataSource, ResultsSink
|
||||
|
||||
|
||||
# OAuth scopes requested for Drive clients that are not restricted to read-only.
GDRIVE_SCOPES = [
    "https://www.googleapis.com/auth/drive.readonly",
    "https://www.googleapis.com/auth/drive.file",
]
|
||||
|
||||
|
||||
def _extract_file_id(file_id_or_url: str) -> str:
|
||||
"""
|
||||
Acepta:
|
||||
- un ID directo de Google Drive (ej: '1AbC...')
|
||||
- una URL de Google Drive compartida
|
||||
|
||||
y devuelve siempre el file_id.
|
||||
"""
|
||||
if "http://" not in file_id_or_url and "https://" not in file_id_or_url:
|
||||
return file_id_or_url.strip()
|
||||
|
||||
patterns = [
|
||||
r"/d/([a-zA-Z0-9_-]{10,})", # https://drive.google.com/file/d/<ID>/view
|
||||
r"id=([a-zA-Z0-9_-]{10,})", # https://drive.google.com/open?id=<ID>
|
||||
]
|
||||
|
||||
for pattern in patterns:
|
||||
m = re.search(pattern, file_id_or_url)
|
||||
if m:
|
||||
return m.group(1)
|
||||
|
||||
raise ValueError(f"No se pudo extraer un file_id de la URL de Google Drive: {file_id_or_url}")
|
||||
|
||||
|
||||
# -------- DataSource --------
|
||||
|
||||
@dataclass
class GoogleDriveConfig:
    """Settings used to open a Google Drive connection for reading."""

    # Path to the service-account JSON key file.
    credentials_path: str
    # Optional user to act as; when set, credentials are derived with with_subject().
    impersonate_user: Optional[str] = None
|
||||
|
||||
|
||||
class GoogleDriveDataSource(DataSource):
    """
    DataSource that reads CSV files stored in Google Drive.
    """

    def __init__(self, config: GoogleDriveConfig) -> None:
        self._config = config
        # Reading only needs the read-only scope.
        self._service = self._build_service(readonly=True)

    def _build_service(self, readonly: bool = True):
        """
        Build a Drive v3 client from the configured service-account key.

        With ``readonly`` set, only the drive.readonly scope is requested;
        otherwise the module-level GDRIVE_SCOPES list is used.
        """
        if readonly:
            scopes = ["https://www.googleapis.com/auth/drive.readonly"]
        else:
            scopes = GDRIVE_SCOPES

        credentials = service_account.Credentials.from_service_account_file(
            self._config.credentials_path,
            scopes=scopes,
        )

        subject = self._config.impersonate_user
        if subject:
            credentials = credentials.with_subject(subject)

        return build("drive", "v3", credentials=credentials)

    def read_csv(self, path: str) -> pd.DataFrame:
        """
        Download a Drive file and parse it as CSV.

        ``path`` may be a Drive file ID or a Drive URL; it is resolved to a
        file ID by ``_extract_file_id``.
        """
        file_id = _extract_file_id(path)

        request = self._service.files().get_media(fileId=file_id)
        buffer = io.BytesIO()
        downloader = MediaIoBaseDownload(buffer, request)

        # next_chunk() yields (status, done); keep pulling until done is set.
        finished = False
        while not finished:
            _, finished = downloader.next_chunk()

        # Rewind so pandas parses from the first byte.
        buffer.seek(0)
        return pd.read_csv(buffer)
|
||||
|
||||
|
||||
# -------- ResultsSink --------
|
||||
|
||||
@dataclass
class GoogleDriveSinkConfig:
    """Settings used to open a Google Drive connection for writing results."""

    # Path to the service-account JSON key file.
    credentials_path: str
    # ID of the Drive folder that uploaded files are placed in.
    base_folder_id: str
    # Optional user to act as; when set, credentials are derived with with_subject().
    impersonate_user: Optional[str] = None
|
||||
|
||||
|
||||
class GoogleDriveResultsSink(ResultsSink):
    """
    ResultsSink that uploads JSON documents and images to a Google Drive folder.

    Note: for simplicity only the file name (the basename of ``path``) is used.
    For example, 'data/output/123/results.json' is stored in Drive as
    'results.json' inside base_folder_id.
    """

    def __init__(self, config: GoogleDriveSinkConfig) -> None:
        self._config = config
        self._service = self._build_service()

    def _build_service(self):
        """Build a Drive v3 client from the service-account key using GDRIVE_SCOPES."""
        credentials = service_account.Credentials.from_service_account_file(
            self._config.credentials_path,
            scopes=GDRIVE_SCOPES,
        )

        subject = self._config.impersonate_user
        if subject:
            credentials = credentials.with_subject(subject)

        return build("drive", "v3", credentials=credentials)

    def _upload_bytes(self, data: bytes, mime_type: str, target_path: str) -> str:
        """
        Upload an in-memory payload to Drive and return the created file's ID.
        """
        # Only the last path component becomes the Drive file name.
        metadata = {
            "name": Path(target_path).name,
            "parents": [self._config.base_folder_id],
        }
        media = MediaIoBaseUpload(io.BytesIO(data), mimetype=mime_type, resumable=False)

        # NOTE(review): create() is issued unconditionally, so writing the same
        # name twice presumably yields two Drive files rather than an
        # overwrite -- confirm whether that is intended.
        request = self._service.files().create(
            body=metadata,
            media_body=media,
            fields="id",
        )
        response = request.execute()
        return response["id"]

    def write_json(self, path: str, data: Dict[str, Any]) -> None:
        """Serialize ``data`` as indented UTF-8 JSON and upload it."""
        text = json.dumps(data, ensure_ascii=False, indent=2)
        self._upload_bytes(text.encode("utf-8"), "application/json", path)

    def write_figure(self, path: str, fig) -> None:
        """
        Render ``fig`` to PNG in memory and upload it.

        Raises:
            TypeError: if ``fig`` is not a matplotlib Figure.
        """
        from matplotlib.figure import Figure

        if not isinstance(fig, Figure):
            raise TypeError("write_figure espera un matplotlib.figure.Figure")

        buffer = io.BytesIO()
        fig.savefig(buffer, format="png", bbox_inches="tight")
        self._upload_bytes(buffer.getvalue(), "image/png", path)
|
||||
57
backend/beyond_metrics/io/local.py
Normal file
57
backend/beyond_metrics/io/local.py
Normal file
@@ -0,0 +1,57 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
from typing import Any, Dict
|
||||
|
||||
import pandas as pd
|
||||
from matplotlib.figure import Figure
|
||||
|
||||
from .base import DataSource, ResultsSink
|
||||
|
||||
|
||||
class LocalDataSource(DataSource):
    """
    DataSource that reads CSV files from the local file system.

    - base_dir: folder that relative paths are resolved against.
    """

    def __init__(self, base_dir: str = ".") -> None:
        self.base_dir = base_dir

    def _full_path(self, path: str) -> str:
        """Return ``path`` unchanged if absolute, else joined onto base_dir."""
        return path if os.path.isabs(path) else os.path.join(self.base_dir, path)

    def read_csv(self, path: str) -> pd.DataFrame:
        """Resolve ``path`` and load it with pandas."""
        return pd.read_csv(self._full_path(path))
|
||||
|
||||
|
||||
class LocalResultsSink(ResultsSink):
    """
    ResultsSink that writes JSON documents and images to the local file system.

    - base_dir: folder that relative paths are resolved against.
    """

    def __init__(self, base_dir: str = ".") -> None:
        self.base_dir = base_dir

    def _full_path(self, path: str) -> str:
        """
        Resolve ``path`` and make sure its parent directory exists.

        Absolute paths are used as given; relative ones are joined onto
        base_dir. Returns the resolved path.
        """
        full = path if os.path.isabs(path) else os.path.join(self.base_dir, path)

        # dirname() is "" for a bare file name (e.g. base_dir="" and
        # path="out.json"); os.makedirs("") raises FileNotFoundError, so only
        # create a directory when there is one to create.
        parent = os.path.dirname(full)
        if parent:
            os.makedirs(parent, exist_ok=True)
        return full

    def write_json(self, path: str, data: Dict[str, Any]) -> None:
        """Write ``data`` as indented UTF-8 JSON to the resolved path."""
        full = self._full_path(path)
        with open(full, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=2)

    def write_figure(self, path: str, fig: Figure) -> None:
        """Save ``fig`` to the resolved path with a tight bounding box."""
        full = self._full_path(path)
        fig.savefig(full, bbox_inches="tight")
|
||||
62
backend/beyond_metrics/io/s3.py
Normal file
62
backend/beyond_metrics/io/s3.py
Normal file
@@ -0,0 +1,62 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
import json
|
||||
from typing import Any, Dict, Tuple
|
||||
|
||||
import boto3
|
||||
import pandas as pd
|
||||
from matplotlib.figure import Figure
|
||||
|
||||
from .base import DataSource, ResultsSink
|
||||
|
||||
|
||||
def _split_s3_path(path: str) -> Tuple[str, str]:
|
||||
"""
|
||||
Convierte 's3://bucket/key' en (bucket, key).
|
||||
"""
|
||||
if not path.startswith("s3://"):
|
||||
raise ValueError(f"Ruta S3 inválida: {path}")
|
||||
|
||||
without_scheme = path[len("s3://") :]
|
||||
parts = without_scheme.split("/", 1)
|
||||
if len(parts) != 2:
|
||||
raise ValueError(f"Ruta S3 inválida: {path}")
|
||||
return parts[0], parts[1]
|
||||
|
||||
|
||||
class S3DataSource(DataSource):
    """
    DataSource that reads CSV files from S3 through boto3.
    """

    def __init__(self, boto3_client: Any | None = None) -> None:
        # Use the supplied client if there is one, otherwise create a default.
        self.s3 = boto3_client if boto3_client else boto3.client("s3")

    def read_csv(self, path: str) -> pd.DataFrame:
        """Fetch the object at 's3://bucket/key' and parse it as CSV."""
        bucket, key = _split_s3_path(path)
        response = self.s3.get_object(Bucket=bucket, Key=key)
        raw = response["Body"].read()
        return pd.read_csv(io.BytesIO(raw))
|
||||
|
||||
|
||||
class S3ResultsSink(ResultsSink):
    """
    ResultsSink that writes JSON documents and images to S3.
    """

    def __init__(self, boto3_client: Any | None = None) -> None:
        # Use the supplied client if there is one, otherwise create a default.
        self.s3 = boto3_client or boto3.client("s3")

    def write_json(self, path: str, data: Dict[str, Any]) -> None:
        """Serialize ``data`` as indented UTF-8 JSON and upload it to 's3://bucket/key'."""
        bucket, key = _split_s3_path(path)
        body = json.dumps(data, ensure_ascii=False, indent=2).encode("utf-8")
        # Declare the content type, as write_figure does, so the object is
        # not stored with a generic binary type.
        self.s3.put_object(
            Bucket=bucket,
            Key=key,
            Body=body,
            ContentType="application/json",
        )

    def write_figure(self, path: str, fig: Figure) -> None:
        """Render ``fig`` to PNG in memory and upload it to 's3://bucket/key'."""
        bucket, key = _split_s3_path(path)
        buf = io.BytesIO()
        fig.savefig(buf, format="png", bbox_inches="tight")
        self.s3.put_object(
            Bucket=bucket,
            Key=key,
            Body=buf.getvalue(),
            ContentType="image/png",
        )
|
||||
Reference in New Issue
Block a user