Arreglo de UI y búsquedas

2025-11-21 04:42:02 +01:00 · 2025-11-21 04:42:02 +01:00 · fc06566928
commit fc06566928
parent cb8f69fb93
15 changed files with 1115 additions and 435 deletions
--- a/related_worker.py
+++ b/related_worker.py
@@ -1,9 +1,9 @@
 import os
 import time
-import math
 import logging
 from typing import List, Tuple

+import numpy as np
 import psycopg2
 import psycopg2.extras

@@ -22,7 +22,6 @@ DB = dict(

 TOPK         = int(os.environ.get("RELATED_TOPK", 10))
 BATCH_IDS    = int(os.environ.get("RELATED_BATCH_IDS", 200))
-BATCH_SIM    = int(os.environ.get("RELATED_BATCH_SIM", 2000))
 SLEEP_IDLE   = float(os.environ.get("RELATED_SLEEP", 10))
 MIN_SCORE    = float(os.environ.get("RELATED_MIN_SCORE", 0.0))
 WINDOW_HOURS = int(os.environ.get("RELATED_WINDOW_H", 0))
@@ -32,44 +31,64 @@ def get_conn():
    return psycopg2.connect(**DB)


+# ---------------------------------------------------------
+# Cargar embeddings SOLO de traducciones en español (lang_to='es')
+# ---------------------------------------------------------
 def _fetch_all_embeddings(cur):
+    base_sql = """
+        SELECT e.traduccion_id, e.vec
+        FROM embeddings e
+        JOIN traducciones t ON t.id = e.traduccion_id
+        JOIN noticias n ON n.id = t.noticia_id
+        WHERE t.lang_to = 'es'
+    """
+
+    params = []
    if WINDOW_HOURS > 0:
-        cur.execute(
-            """
-            SELECT e.traduccion_id, e.vec
-            FROM embeddings e
-            JOIN traducciones t ON t.id = e.traduccion_id
-            JOIN noticias n ON n.id = t.noticia_id
-            WHERE n.fecha >= NOW() - INTERVAL %s
-            """,
-            (f"{WINDOW_HOURS} hours",),
-        )
-    else:
-        cur.execute("SELECT traduccion_id, vec FROM embeddings")
+        base_sql += " AND n.fecha >= NOW() - INTERVAL %s"
+        params.append(f"{WINDOW_HOURS} hours")
+
+    cur.execute(base_sql, params)

    rows = cur.fetchall()
    if not rows:
-        return [], [], []
+        return [], None

    ids = []
    vecs = []
-    norms = []
-    for tr_id, v in rows:
+
+    for tid, v in rows:
        if v is None:
-            v = []
-        nrm = math.sqrt(sum(((x or 0.0) * (x or 0.0)) for x in v)) or 1e-8
-        ids.append(tr_id)
+            continue
+        ids.append(tid)
        vecs.append(v)
-        norms.append(nrm)
-    return ids, vecs, norms
+
+    if not ids:
+        return [], None
+
+    # Convertimos a matriz numpy
+    mat = np.array(vecs, dtype=np.float32)
+
+    # Normalizamos (evita división por 0)
+    norms = np.linalg.norm(mat, axis=1, keepdims=True)
+    norms[norms == 0] = 1e-8
+    mat = mat / norms
+
+    return ids, mat


+# ---------------------------------------------------------
+# Obtiene IDs pendientes
+# ---------------------------------------------------------
 def _fetch_pending_ids(cur, limit) -> List[int]:
    cur.execute(
        """
        SELECT e.traduccion_id
        FROM embeddings e
-        LEFT JOIN related_noticias r ON r.traduccion_id = e.traduccion_id
+        JOIN traducciones t ON t.id = e.traduccion_id
+        LEFT JOIN related_noticias r
+               ON r.traduccion_id = e.traduccion_id
+        WHERE t.lang_to = 'es'
        GROUP BY e.traduccion_id
        HAVING COUNT(r.related_traduccion_id) = 0
        ORDER BY e.traduccion_id DESC
@@ -80,42 +99,44 @@ def _fetch_pending_ids(cur, limit) -> List[int]:
    return [r[0] for r in cur.fetchall()]


-def _cosine_with_norms(a, b, na, nb):
-    num = 0.0
-    for x, y in zip(a, b):
-        xv = x or 0.0
-        yv = y or 0.0
-        num += xv * yv
-    denom = na * nb
-    if denom <= 0.0:
-        return 0.0
-    return num / denom
-
-
-def _topk_for_one(
+# ---------------------------------------------------------
+# TOP-K usando NumPy (súper rápido)
+# ---------------------------------------------------------
+def _topk_numpy(
    idx: int,
    ids_all: List[int],
-    vecs_all: List[List[float]],
-    norms_all: List[float],
-    pool_indices: List[int],
-    K: int,
+    mat: np.ndarray,
+    K: int
 ) -> List[Tuple[int, float]]:
-    me_vec = vecs_all[idx]
-    me_norm = norms_all[idx]

-    out: List[Tuple[int, float]] = []
-    for j in pool_indices:
-        if j == idx:
-            continue
-        s = _cosine_with_norms(me_vec, vecs_all[j], me_norm, norms_all[j])
-        out.append((ids_all[j], s))
+    # vector de la noticia central
+    q = mat[idx]  # (dim,)

-    out.sort(key=lambda t: t[1], reverse=True)
-    if MIN_SCORE > 0.0:
-        out = [p for p in out if p[1] >= MIN_SCORE]
-    return out[:K]
+    # similitudes coseno: dot product (matriz · vector)
+    sims = np.dot(mat, q)
+
+    # eliminar self-match
+    sims[idx] = -999.0
+
+    # filtramos por score mínimo
+    if MIN_SCORE > 0:
+        mask = sims >= MIN_SCORE
+        sims = np.where(mask, sims, -999.0)
+
+    # obtenemos los índices top-k (mucho más rápido que ordenar todo)
+    if K >= len(sims):
+        top_idx = np.argsort(-sims)
+    else:
+        part = np.argpartition(-sims, K)[:K]
+        top_idx = part[np.argsort(-sims[part])]
+
+    out = [(ids_all[j], float(sims[j])) for j in top_idx[:K]]
+    return out


+# ---------------------------------------------------------
+# Inserta en la tabla related_noticias
+# ---------------------------------------------------------
 def _insert_related(cur, tr_id: int, pairs: List[Tuple[int, float]]):
    if not pairs:
        return
@@ -127,48 +148,47 @@ def _insert_related(cur, tr_id: int, pairs: List[Tuple[int, float]]):
        ON CONFLICT (traduccion_id, related_traduccion_id)
        DO UPDATE SET score = EXCLUDED.score
        """,
-        [(tr_id, rid, float(score)) for (rid, score) in pairs],
+        [(tr_id, rid, score) for (rid, score) in pairs],
    )


+# ---------------------------------------------------------
+# Procesar IDs objetivo
+# ---------------------------------------------------------
 def build_for_ids(conn, target_ids: List[int]) -> int:
    with conn.cursor() as cur:
-        ids_all, vecs_all, norms_all = _fetch_all_embeddings(cur)
-    if not ids_all:
+        ids_all, mat = _fetch_all_embeddings(cur)
+
+    if not ids_all or mat is None:
        return 0

+    # Mapa ID → index
    pos = {tid: i for i, tid in enumerate(ids_all)}
-    n = len(ids_all)
    processed = 0

    with conn.cursor() as cur:
        for tr_id in target_ids:
            if tr_id not in pos:
                continue
-            i = pos[tr_id]

-            top: List[Tuple[int, float]] = []
-            for start in range(0, n, BATCH_SIM):
-                block = list(range(start, min(start + BATCH_SIM, n)))
-                candidates = _topk_for_one(i, ids_all, vecs_all, norms_all, block, TOPK)
-                top += candidates
-                top.sort(key=lambda t: t[1], reverse=True)
-                if len(top) > TOPK:
-                    top = top[:TOPK]
-
-            _insert_related(cur, tr_id, top)
+            idx = pos[tr_id]
+            pairs = _topk_numpy(idx, ids_all, mat, TOPK)
+            _insert_related(cur, tr_id, pairs)
            processed += 1

        conn.commit()
+
    return processed


+# ---------------------------------------------------------
+# MAIN
+# ---------------------------------------------------------
 def main():
    logging.info(
-        "Iniciando related_worker (TOPK=%s, BATCH_IDS=%s, BATCH_SIM=%s, MIN_SCORE=%.3f, WINDOW_H=%s)",
+        "Iniciando related_worker (TOPK=%s, BATCH_IDS=%s, MIN_SCORE=%.3f, WINDOW_H=%s)",
        TOPK,
        BATCH_IDS,
-        BATCH_SIM,
        MIN_SCORE,
        WINDOW_HOURS,
    )