Initial clean commit

jlimolina 2026-01-13 13:39:51 +01:00
commit 6784d81c2c
141 changed files with 25219 additions and 0 deletions


@@ -0,0 +1,267 @@
import contextlib
import os
import time
import logging
from typing import List
import numpy as np
import psycopg2
import psycopg2.extras
from psycopg2.extras import execute_values
import torch
from sentence_transformers import SentenceTransformer
# ================================================================
# Logging
# ================================================================
logging.basicConfig(
level=logging.INFO,
format='[EMB] %(asctime)s %(levelname)s: %(message)s'
)
log = logging.getLogger("embeddings_worker")
# ================================================================
# Configuration
# ================================================================
DB = dict(
host=os.environ.get("DB_HOST", "localhost"),
port=int(os.environ.get("DB_PORT", 5432)),
dbname=os.environ.get("DB_NAME", "rss"),
user=os.environ.get("DB_USER", "rss"),
password=os.environ.get("DB_PASS", "x"),
)
EMB_MODEL = os.environ.get(
"EMB_MODEL",
"sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
)
EMB_BATCH = int(os.environ.get("EMB_BATCH", "128"))
SLEEP_IDLE = float(os.environ.get("EMB_SLEEP_IDLE", "5.0"))
# e.g. "es,en,fr"
EMB_LANGS = [
s.strip()
for s in os.environ.get("EMB_LANGS", "es").split(",")
if s.strip()
]
DEVICE_ENV = os.environ.get("DEVICE", "auto").lower()
EMB_LIMIT = int(os.environ.get("EMB_LIMIT", "1000"))
# ================================================================
# Connection
# ================================================================
def get_conn():
return psycopg2.connect(**DB)
# ================================================================
# Schema (created if missing)
# ================================================================
def ensure_schema(conn):
"""
Ensures the embeddings table exists. Idempotent.
"""
with conn.cursor() as cur:
cur.execute(
"""
CREATE TABLE IF NOT EXISTS traduccion_embeddings (
id SERIAL PRIMARY KEY,
traduccion_id INT NOT NULL REFERENCES traducciones(id) ON DELETE CASCADE,
model TEXT NOT NULL,
dim INT NOT NULL,
embedding DOUBLE PRECISION[] NOT NULL,
created_at TIMESTAMP DEFAULT NOW(),
UNIQUE (traduccion_id, model)
);
"""
)
cur.execute("""
CREATE INDEX IF NOT EXISTS idx_tr_emb_model
ON traduccion_embeddings(model);
""")
cur.execute("""
CREATE INDEX IF NOT EXISTS idx_tr_emb_traduccion_id
ON traduccion_embeddings(traduccion_id);
""")
conn.commit()
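# Note: the UNIQUE (traduccion_id, model) constraint above is what lets
# upsert_embeddings() below use ON CONFLICT (traduccion_id, model), so each
# translation holds at most one embedding per model.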
# ================================================================
# Fetch pending work
# ================================================================
def fetch_batch_pending(conn) -> List[psycopg2.extras.DictRow]:
"""
Fetches translations with status 'done' that do not yet have an
embedding for this model.
"""
with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
cur.execute(
"""
SELECT
t.id AS traduccion_id,
t.lang_to AS lang_to,
COALESCE(NULLIF(t.titulo_trad,''), '') AS titulo_trad,
COALESCE(NULLIF(t.resumen_trad,''), '') AS resumen_trad,
n.id AS noticia_id
FROM traducciones t
JOIN noticias n ON n.id = t.noticia_id
LEFT JOIN traduccion_embeddings e
ON e.traduccion_id = t.id AND e.model = %s
WHERE t.status = 'done'
AND t.lang_to = ANY(%s)
AND e.traduccion_id IS NULL
ORDER BY t.id
LIMIT %s;
""",
(EMB_MODEL, EMB_LANGS, EMB_LIMIT),
)
return cur.fetchall()
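# Note: the LEFT JOIN ... IS NULL above is an anti-join: it returns only
# 'done' translations in EMB_LANGS that have no row yet in
# traduccion_embeddings for EMB_MODEL, capped at EMB_LIMIT per cycle.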
# ================================================================
# Text preparation
# ================================================================
def texts_from_rows(rows: List[psycopg2.extras.DictRow]) -> List[str]:
"""
Returns the combined text for each row: title and summary joined
when both are present, otherwise whichever one is non-empty.
"""
texts = []
for r in rows:
title = (r["titulo_trad"] or "").strip()
body = (r["resumen_trad"] or "").strip()
if title and body:
texts.append(f"{title}\n{body}")
else:
texts.append(title or body or "")
return texts
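# Illustrative example (values are hypothetical): a row with
# titulo_trad="Titular" and resumen_trad="Resumen breve" becomes
# "Titular\nResumen breve"; if only one field is filled, that field is used alone.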
# ================================================================
# Upsert
# ================================================================
def upsert_embeddings(conn, rows, embs: np.ndarray, model_name: str):
"""
Inserts or updates embeddings in the database.
"""
if embs.size == 0 or not rows:
return
dim = int(embs.shape[1])
data = [
(
int(r["traduccion_id"]),
model_name,
dim,
embs[i].astype(float).tolist(),
)
for i, r in enumerate(rows)
]
with conn.cursor() as cur:
execute_values(
cur,
"""
INSERT INTO traduccion_embeddings
(traduccion_id, model, dim, embedding)
VALUES %s
ON CONFLICT (traduccion_id, model)
DO UPDATE SET
embedding = EXCLUDED.embedding,
dim = EXCLUDED.dim,
created_at = NOW();
""",
data,
)
conn.commit()
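# Note: ON CONFLICT ... DO UPDATE keyed on (traduccion_id, model) keeps the
# insert idempotent: an existing embedding for the same translation/model
# pair is overwritten rather than duplicated.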
# ================================================================
# Load model
# ================================================================
def resolve_device() -> str:
"""
Determines which device to use.
"""
if DEVICE_ENV in ("cpu", "cuda"):
if DEVICE_ENV == "cuda" and not torch.cuda.is_available():
return "cpu"
return DEVICE_ENV
# auto
return "cuda" if torch.cuda.is_available() else "cpu"
def load_model() -> SentenceTransformer:
"""
Loads the model, falling back to CPU if CUDA fails.
"""
device = resolve_device()
log.info(f"Cargando modelo {EMB_MODEL} en device={device}")
try:
return SentenceTransformer(EMB_MODEL, device=device)
except Exception as e:
log.error(f"Fallo cargando modelo en {device}: {e}")
if device == "cuda":
log.warning("→ Reintentando en CPU…")
return SentenceTransformer(EMB_MODEL, device="cpu")
raise
# ================================================================
# Main Worker
# ================================================================
def main():
log.info(
f"Iniciando embeddings_worker | model={EMB_MODEL} | batch={EMB_BATCH} | lang={','.join(EMB_LANGS)} | limit={EMB_LIMIT}"
)
model = load_model()
while True:
try:
# psycopg2's "with conn" only ends the transaction, not the connection,
# so wrap it in contextlib.closing() to avoid leaking one connection per cycle
with contextlib.closing(get_conn()) as conn:
ensure_schema(conn)
rows = fetch_batch_pending(conn)
if not rows:
time.sleep(SLEEP_IDLE)
continue
texts = texts_from_rows(rows)
# Encode
embs = model.encode(
texts,
batch_size=EMB_BATCH,
convert_to_numpy=True,
show_progress_bar=False,
normalize_embeddings=True,
)
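# normalize_embeddings=True yields unit-length vectors, so cosine
# similarity between stored embeddings reduces to a plain dot product.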
# Upsert
upsert_embeddings(conn, rows, embs, EMB_MODEL)
log.info(f"Embeddings generados: {len(rows)}")
except Exception as e:
log.exception(f"Error en embeddings_worker: {e}")
time.sleep(SLEEP_IDLE)
if __name__ == "__main__":
main()
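# ================================================================
# Usage sketch (assumption: the file path is not shown in this commit view,
# "embeddings_worker.py" is a guess based on the logger name). The worker is
# configured entirely through the environment variables read above, e.g.:
#   DB_HOST=localhost DB_PORT=5432 DB_NAME=rss DB_USER=rss DB_PASS=x \
#   EMB_MODEL=sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2 \
#   EMB_LANGS=es,en EMB_BATCH=128 EMB_LIMIT=1000 DEVICE=auto \
#   python embeddings_worker.py
# ================================================================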