optimizaciones

2025-11-24 02:37:05 +01:00 · 2025-11-24 02:37:05 +01:00 · 86ee083b90
commit 86ee083b90
parent 937da3f90b
5 changed files with 26 additions and 100 deletions
--- a/related_worker.py
+++ b/related_worker.py
@@ -20,10 +20,10 @@ DB = dict(
    password=os.environ.get("DB_PASS", "x"),
 )

-TOPK         = int(os.environ.get("RELATED_TOPK", 10))
-BATCH_IDS    = int(os.environ.get("RELATED_BATCH_IDS", 200))
-SLEEP_IDLE   = float(os.environ.get("RELATED_SLEEP", 10))
-MIN_SCORE    = float(os.environ.get("RELATED_MIN_SCORE", 0.0))
+TOPK = int(os.environ.get("RELATED_TOPK", 10))
+BATCH_IDS = int(os.environ.get("RELATED_BATCH_IDS", 200))
+SLEEP_IDLE = float(os.environ.get("RELATED_SLEEP", 10))
+MIN_SCORE = float(os.environ.get("RELATED_MIN_SCORE", 0.0))
 WINDOW_HOURS = int(os.environ.get("RELATED_WINDOW_H", 0))


@@ -31,9 +31,6 @@ def get_conn():
    return psycopg2.connect(**DB)


-# ---------------------------------------------------------
-# Cargar embeddings SOLO de traducciones en español (lang_to='es')
-# ---------------------------------------------------------
 def _fetch_all_embeddings(cur):
    base_sql = """
        SELECT e.traduccion_id, e.vec
@@ -49,8 +46,8 @@ def _fetch_all_embeddings(cur):
        params.append(f"{WINDOW_HOURS} hours")

    cur.execute(base_sql, params)
-
    rows = cur.fetchall()
+
    if not rows:
        return [], None

@@ -66,10 +63,7 @@ def _fetch_all_embeddings(cur):
    if not ids:
        return [], None

-    # Convertimos a matriz numpy
    mat = np.array(vecs, dtype=np.float32)
-
-    # Normalizamos (evita división por 0)
    norms = np.linalg.norm(mat, axis=1, keepdims=True)
    norms[norms == 0] = 1e-8
    mat = mat / norms
@@ -77,9 +71,6 @@ def _fetch_all_embeddings(cur):
    return ids, mat


-# ---------------------------------------------------------
-# Obtiene IDs pendientes
-# ---------------------------------------------------------
 def _fetch_pending_ids(cur, limit) -> List[int]:
    cur.execute(
        """
@@ -99,44 +90,24 @@ def _fetch_pending_ids(cur, limit) -> List[int]:
    return [r[0] for r in cur.fetchall()]


-# ---------------------------------------------------------
-# TOP-K usando NumPy (súper rápido)
-# ---------------------------------------------------------
-def _topk_numpy(
-    idx: int,
-    ids_all: List[int],
-    mat: np.ndarray,
-    K: int
-) -> List[Tuple[int, float]]:
-
-    # vector de la noticia central
-    q = mat[idx]  # (dim,)
-
-    # similitudes coseno: dot product (matriz · vector)
+def _topk_numpy(idx: int, ids_all: List[int], mat: np.ndarray, K: int) -> List[Tuple[int, float]]:
+    q = mat[idx]
    sims = np.dot(mat, q)
-
-    # eliminar self-match
    sims[idx] = -999.0

-    # filtramos por score mínimo
    if MIN_SCORE > 0:
        mask = sims >= MIN_SCORE
        sims = np.where(mask, sims, -999.0)

-    # obtenemos los índices top-k (mucho más rápido que ordenar todo)
    if K >= len(sims):
        top_idx = np.argsort(-sims)
    else:
        part = np.argpartition(-sims, K)[:K]
        top_idx = part[np.argsort(-sims[part])]

-    out = [(ids_all[j], float(sims[j])) for j in top_idx[:K]]
-    return out
+    return [(ids_all[j], float(sims[j])) for j in top_idx[:K]]


-# ---------------------------------------------------------
-# Inserta en la tabla related_noticias
-# ---------------------------------------------------------
 def _insert_related(cur, tr_id: int, pairs: List[Tuple[int, float]]):
    if not pairs:
        return
@@ -152,9 +123,6 @@ def _insert_related(cur, tr_id: int, pairs: List[Tuple[int, float]]):
    )


-# ---------------------------------------------------------
-# Procesar IDs objetivo
-# ---------------------------------------------------------
 def build_for_ids(conn, target_ids: List[int]) -> int:
    with conn.cursor() as cur:
        ids_all, mat = _fetch_all_embeddings(cur)
@@ -162,7 +130,6 @@ def build_for_ids(conn, target_ids: List[int]) -> int:
    if not ids_all or mat is None:
        return 0

-    # Mapa ID → index
    pos = {tid: i for i, tid in enumerate(ids_all)}
    processed = 0

@@ -181,9 +148,6 @@ def build_for_ids(conn, target_ids: List[int]) -> int:
    return processed


-# ---------------------------------------------------------
-# MAIN
-# ---------------------------------------------------------
 def main():
    logging.info(
        "Iniciando related_worker (TOPK=%s, BATCH_IDS=%s, MIN_SCORE=%.3f, WINDOW_H=%s)",
@@ -192,6 +156,7 @@ def main():
        MIN_SCORE,
        WINDOW_HOURS,
    )
+
    while True:
        try:
            with get_conn() as conn, conn.cursor() as cur: