Initial clean commit

jlimolina 2026-01-13 13:39:51 +01:00
commit 6784d81c2c
141 changed files with 25219 additions and 0 deletions


@@ -0,0 +1,267 @@
import contextlib
import os
import time
import logging
from typing import List
import numpy as np
import psycopg2
import psycopg2.extras
from psycopg2.extras import execute_values
import torch
from sentence_transformers import SentenceTransformer
# ================================================================
# Logging
# ================================================================
logging.basicConfig(
level=logging.INFO,
format='[EMB] %(asctime)s %(levelname)s: %(message)s'
)
log = logging.getLogger("embeddings_worker")
# ================================================================
# Configuration
# ================================================================
DB = dict(
host=os.environ.get("DB_HOST", "localhost"),
port=int(os.environ.get("DB_PORT", 5432)),
dbname=os.environ.get("DB_NAME", "rss"),
user=os.environ.get("DB_USER", "rss"),
password=os.environ.get("DB_PASS", "x"),
)
EMB_MODEL = os.environ.get(
"EMB_MODEL",
"sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
)
EMB_BATCH = int(os.environ.get("EMB_BATCH", "128"))
SLEEP_IDLE = float(os.environ.get("EMB_SLEEP_IDLE", "5.0"))
# e.g. "es,en,fr"
EMB_LANGS = [
s.strip()
for s in os.environ.get("EMB_LANGS", "es").split(",")
if s.strip()
]
DEVICE_ENV = os.environ.get("DEVICE", "auto").lower()
EMB_LIMIT = int(os.environ.get("EMB_LIMIT", "1000"))
# ================================================================
# Connection
# ================================================================
def get_conn():
return psycopg2.connect(**DB)
# ================================================================
# Schema (created if missing)
# ================================================================
def ensure_schema(conn):
"""
Ensures the embeddings table exists. Idempotent.
"""
with conn.cursor() as cur:
cur.execute(
"""
CREATE TABLE IF NOT EXISTS traduccion_embeddings (
id SERIAL PRIMARY KEY,
traduccion_id INT NOT NULL REFERENCES traducciones(id) ON DELETE CASCADE,
model TEXT NOT NULL,
dim INT NOT NULL,
embedding DOUBLE PRECISION[] NOT NULL,
created_at TIMESTAMP DEFAULT NOW(),
UNIQUE (traduccion_id, model)
);
"""
)
cur.execute("""
CREATE INDEX IF NOT EXISTS idx_tr_emb_model
ON traduccion_embeddings(model);
""")
cur.execute("""
CREATE INDEX IF NOT EXISTS idx_tr_emb_traduccion_id
ON traduccion_embeddings(traduccion_id);
""")
conn.commit()
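# Note: the UNIQUE (traduccion_id, model) constraint above is what lets
# upsert_embeddings() below use ON CONFLICT (traduccion_id, model), so each
# translation holds at most one embedding per model.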
# ================================================================
# Fetch pending work
# ================================================================
def fetch_batch_pending(conn) -> List[psycopg2.extras.DictRow]:
"""
Fetches translations with status 'done' that do not yet have an
embedding for this model.
"""
with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
cur.execute(
"""
SELECT
t.id AS traduccion_id,
t.lang_to AS lang_to,
COALESCE(NULLIF(t.titulo_trad,''), '') AS titulo_trad,
COALESCE(NULLIF(t.resumen_trad,''), '') AS resumen_trad,
n.id AS noticia_id
FROM traducciones t
JOIN noticias n ON n.id = t.noticia_id
LEFT JOIN traduccion_embeddings e
ON e.traduccion_id = t.id AND e.model = %s
WHERE t.status = 'done'
AND t.lang_to = ANY(%s)
AND e.traduccion_id IS NULL
ORDER BY t.id
LIMIT %s;
""",
(EMB_MODEL, EMB_LANGS, EMB_LIMIT),
)
return cur.fetchall()
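# Note: the LEFT JOIN ... IS NULL above is an anti-join: it returns only
# 'done' translations in EMB_LANGS that have no row yet in
# traduccion_embeddings for EMB_MODEL, capped at EMB_LIMIT per cycle.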
# ================================================================
# Text preparation
# ================================================================
def texts_from_rows(rows: List[psycopg2.extras.DictRow]) -> List[str]:
"""
Returns the combined text for each row: title and summary joined
when both are present, otherwise whichever one is non-empty.
"""
texts = []
for r in rows:
title = (r["titulo_trad"] or "").strip()
body = (r["resumen_trad"] or "").strip()
if title and body:
texts.append(f"{title}\n{body}")
else:
texts.append(title or body or "")
return texts
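# Illustrative example (values are hypothetical): a row with
# titulo_trad="Titular" and resumen_trad="Resumen breve" becomes
# "Titular\nResumen breve"; if only one field is filled, that field is used alone.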
# ================================================================
# Upsert
# ================================================================
def upsert_embeddings(conn, rows, embs: np.ndarray, model_name: str):
"""
Inserts or updates embeddings in the database.
"""
if embs.size == 0 or not rows:
return
dim = int(embs.shape[1])
data = [
(
int(r["traduccion_id"]),
model_name,
dim,
embs[i].astype(float).tolist(),
)
for i, r in enumerate(rows)
]
with conn.cursor() as cur:
execute_values(
cur,
"""
INSERT INTO traduccion_embeddings
(traduccion_id, model, dim, embedding)
VALUES %s
ON CONFLICT (traduccion_id, model)
DO UPDATE SET
embedding = EXCLUDED.embedding,
dim = EXCLUDED.dim,
created_at = NOW();
""",
data,
)
conn.commit()
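# Note: ON CONFLICT ... DO UPDATE keyed on (traduccion_id, model) keeps the
# insert idempotent: an existing embedding for the same translation/model
# pair is overwritten rather than duplicated.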
# ================================================================
# Load model
# ================================================================
def resolve_device() -> str:
"""
Determines which device to use.
"""
if DEVICE_ENV in ("cpu", "cuda"):
if DEVICE_ENV == "cuda" and not torch.cuda.is_available():
return "cpu"
return DEVICE_ENV
# auto
return "cuda" if torch.cuda.is_available() else "cpu"
def load_model() -> SentenceTransformer:
"""
Loads the model, falling back to CPU if CUDA fails.
"""
device = resolve_device()
log.info(f"Cargando modelo {EMB_MODEL} en device={device}")
try:
return SentenceTransformer(EMB_MODEL, device=device)
except Exception as e:
log.error(f"Fallo cargando modelo en {device}: {e}")
if device == "cuda":
log.warning("→ Reintentando en CPU…")
return SentenceTransformer(EMB_MODEL, device="cpu")
raise
# ================================================================
# Main Worker
# ================================================================
def main():
log.info(
f"Iniciando embeddings_worker | model={EMB_MODEL} | batch={EMB_BATCH} | lang={','.join(EMB_LANGS)} | limit={EMB_LIMIT}"
)
model = load_model()
while True:
try:
# psycopg2's "with conn" only ends the transaction, not the connection,
# so wrap it in contextlib.closing() to avoid leaking one connection per cycle
with contextlib.closing(get_conn()) as conn:
ensure_schema(conn)
rows = fetch_batch_pending(conn)
if not rows:
time.sleep(SLEEP_IDLE)
continue
texts = texts_from_rows(rows)
# Encode
embs = model.encode(
texts,
batch_size=EMB_BATCH,
convert_to_numpy=True,
show_progress_bar=False,
normalize_embeddings=True,
)
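# normalize_embeddings=True yields unit-length vectors, so cosine
# similarity between stored embeddings reduces to a plain dot product.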
# Upsert
upsert_embeddings(conn, rows, embs, EMB_MODEL)
log.info(f"Embeddings generados: {len(rows)}")
except Exception as e:
log.exception(f"Error en embeddings_worker: {e}")
time.sleep(SLEEP_IDLE)
if __name__ == "__main__":
main()
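# ================================================================
# Usage sketch (assumption: the file path is not shown in this commit view,
# "embeddings_worker.py" is a guess based on the logger name). The worker is
# configured entirely through the environment variables read above, e.g.:
#   DB_HOST=localhost DB_PORT=5432 DB_NAME=rss DB_USER=rss DB_PASS=x \
#   EMB_MODEL=sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2 \
#   EMB_LANGS=es,en EMB_BATCH=128 EMB_LIMIT=1000 DEVICE=auto \
#   python embeddings_worker.py
# ================================================================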