arreglo de ui y busquedas

This commit is contained in:
jlimolina 2025-11-21 04:42:02 +01:00
parent cb8f69fb93
commit fc06566928
15 changed files with 1115 additions and 435 deletions

View file

@ -1,9 +1,9 @@
import os
import time
import math
import logging
from typing import List, Tuple
import numpy as np
import psycopg2
import psycopg2.extras
@ -22,7 +22,6 @@ DB = dict(
# Maximum number of related translations kept per item (env RELATED_TOPK);
# passed as K to the top-k similarity search.
TOPK = int(os.environ.get("RELATED_TOPK", 10))
# Env RELATED_BATCH_IDS; presumably the number of pending translation IDs
# handled per worker iteration — the consumer is not visible here, confirm.
BATCH_IDS = int(os.environ.get("RELATED_BATCH_IDS", 200))
# Env RELATED_BATCH_SIM; block size (in candidate rows) for chunked similarity
# scans. NOTE(review): a fully vectorised top-k search does not need it —
# confirm whether it is still referenced.
BATCH_SIM = int(os.environ.get("RELATED_BATCH_SIM", 2000))
# Env RELATED_SLEEP; presumably seconds to sleep when there is no pending
# work — the consumer is not visible here, confirm.
SLEEP_IDLE = float(os.environ.get("RELATED_SLEEP", 10))
# Env RELATED_MIN_SCORE; minimum cosine similarity a candidate must reach.
# The filter is only applied when this value is > 0.
MIN_SCORE = float(os.environ.get("RELATED_MIN_SCORE", 0.0))
# Env RELATED_WINDOW_H; when > 0, only embeddings whose noticia `fecha` falls
# within the last N hours are loaded. 0 disables the time window.
WINDOW_HOURS = int(os.environ.get("RELATED_WINDOW_H", 0))
@ -32,44 +31,64 @@ def get_conn():
return psycopg2.connect(**DB)
# ---------------------------------------------------------
# Cargar embeddings SOLO de traducciones en español (lang_to='es')
# ---------------------------------------------------------
def _fetch_all_embeddings(cur):
    """Load embeddings of Spanish translations as a row-normalised matrix.

    Only translations with ``lang_to = 'es'`` are considered. When
    ``WINDOW_HOURS`` is positive, the result is further restricted to news
    items whose ``fecha`` lies within that many hours before ``NOW()``.

    Args:
        cur: An open DB-API cursor (psycopg2) used to run the query.

    Returns:
        A pair ``(ids, mat)``:

        * ``ids`` -- list of ``traduccion_id`` values, one per matrix row.
        * ``mat`` -- ``float32`` array of shape ``(len(ids), dim)`` whose rows
          have unit L2 norm, so a dot product between rows is their cosine
          similarity.

        ``([], None)`` is returned when no usable embedding exists.
    """
    # Local import: only this function needs it.
    from collections import Counter

    sql = """
        SELECT e.traduccion_id, e.vec
        FROM embeddings e
        JOIN traducciones t ON t.id = e.traduccion_id
        JOIN noticias n ON n.id = t.noticia_id
        WHERE t.lang_to = 'es'
    """
    params = []
    if WINDOW_HOURS > 0:
        # psycopg2 interpolates the parameter client-side as a quoted string,
        # so this becomes INTERVAL '<n> hours'.
        sql += " AND n.fecha >= NOW() - INTERVAL %s"
        params.append(f"{WINDOW_HOURS} hours")

    cur.execute(sql, params)
    rows = cur.fetchall()

    # Drop NULL and empty vectors: they carry no direction to compare.
    usable = [(tid, v) for tid, v in rows if v is not None and len(v) > 0]
    if not usable:
        return [], None

    # np.array() raises on ragged input, which would crash the worker on
    # every iteration. Keep only vectors of the dominant dimensionality.
    dim = Counter(len(v) for _, v in usable).most_common(1)[0][0]
    ids = []
    vecs = []
    for tid, v in usable:
        if len(v) == dim:
            ids.append(tid)
            vecs.append(v)

    skipped = len(rows) - len(ids)
    if skipped:
        logging.debug(
            "Skipping %s embeddings (NULL, empty or dimension != %s)",
            skipped,
            dim,
        )

    mat = np.array(vecs, dtype=np.float32)

    # NULL components become NaN in a float array; a single NaN would poison
    # the norm and every similarity involving that row. Treat them (and any
    # infinities) as 0.
    np.nan_to_num(mat, copy=False, nan=0.0, posinf=0.0, neginf=0.0)

    # Normalise rows to unit length; the epsilon avoids division by zero for
    # all-zero vectors.
    norms = np.linalg.norm(mat, axis=1, keepdims=True)
    norms[norms == 0] = 1e-8
    mat /= norms

    return ids, mat
# ---------------------------------------------------------
# Obtiene IDs pendientes
# ---------------------------------------------------------
def _fetch_pending_ids(cur, limit) -> List[int]:
cur.execute(
"""
SELECT e.traduccion_id
FROM embeddings e
LEFT JOIN related_noticias r ON r.traduccion_id = e.traduccion_id
JOIN traducciones t ON t.id = e.traduccion_id
LEFT JOIN related_noticias r
ON r.traduccion_id = e.traduccion_id
WHERE t.lang_to = 'es'
GROUP BY e.traduccion_id
HAVING COUNT(r.related_traduccion_id) = 0
ORDER BY e.traduccion_id DESC
@ -80,42 +99,44 @@ def _fetch_pending_ids(cur, limit) -> List[int]:
return [r[0] for r in cur.fetchall()]
def _cosine_with_norms(a, b, na, nb):
num = 0.0
for x, y in zip(a, b):
xv = x or 0.0
yv = y or 0.0
num += xv * yv
denom = na * nb
if denom <= 0.0:
return 0.0
return num / denom
# ---------------------------------------------------------
# Top-K neighbours via NumPy (vectorised cosine similarity)
# ---------------------------------------------------------
def _topk_numpy(
    idx: int,
    ids_all: List[int],
    mat: np.ndarray,
    K: int,
) -> List[Tuple[int, float]]:
    """Return the K most similar items to row *idx*, best first.

    Args:
        idx: Row index of the query item inside *mat*.
        ids_all: Identifier of each row of *mat* (``ids_all[i]`` belongs to
            ``mat[i]``).
        mat: 2-D array of embeddings. Rows are expected to be L2-normalised
            so that a dot product equals the cosine similarity.
        K: Maximum number of neighbours to return.

    Returns:
        Up to *K* ``(id, score)`` pairs sorted by descending score. The query
        row itself is never included, and when ``MIN_SCORE > 0`` candidates
        scoring below it are dropped. Fewer than *K* pairs (possibly none)
        are returned when there are not enough valid candidates.
    """
    if K <= 0:
        return []

    # Cosine similarity of every row against the query row.
    sims = np.dot(mat, mat[idx])

    # Track validity with a mask rather than a sentinel score, so excluded
    # rows (the query itself, candidates below the threshold) can never be
    # returned when fewer than K real candidates exist.
    valid = np.ones(sims.shape[0], dtype=bool)
    valid[idx] = False
    if MIN_SCORE > 0:
        valid &= sims >= MIN_SCORE

    cand = np.flatnonzero(valid)
    if cand.size == 0:
        return []
    cand_scores = sims[cand]

    # argpartition selects the K best without sorting all candidates; only
    # the survivors are fully sorted afterwards.
    if K < cand.size:
        keep = np.argpartition(-cand_scores, K - 1)[:K]
        cand = cand[keep]
        cand_scores = cand_scores[keep]

    order = np.argsort(-cand_scores)
    return [
        (ids_all[j], float(score))
        for j, score in zip(cand[order], cand_scores[order])
    ]
# ---------------------------------------------------------
# Inserta en la tabla related_noticias
# ---------------------------------------------------------
def _insert_related(cur, tr_id: int, pairs: List[Tuple[int, float]]):
if not pairs:
return
@ -127,48 +148,47 @@ def _insert_related(cur, tr_id: int, pairs: List[Tuple[int, float]]):
ON CONFLICT (traduccion_id, related_traduccion_id)
DO UPDATE SET score = EXCLUDED.score
""",
[(tr_id, rid, float(score)) for (rid, score) in pairs],
[(tr_id, rid, score) for (rid, score) in pairs],
)
# ---------------------------------------------------------
# Procesar IDs objetivo
# ---------------------------------------------------------
def build_for_ids(conn, target_ids: List[int]) -> int:
    """Compute and store related items for each translation in *target_ids*.

    All candidate embeddings are loaded once, then for every target id that
    has an embedding the top ``TOPK`` neighbours are computed and written via
    ``_insert_related``. The transaction is committed once at the end.

    Args:
        conn: Open database connection (provides ``cursor()`` and
            ``commit()``).
        target_ids: Translation ids to process.

    Returns:
        Number of target ids that had an embedding and were processed.
        ``0`` (without committing) when no embeddings are available.
    """
    with conn.cursor() as cur:
        ids_all, mat = _fetch_all_embeddings(cur)

    if not ids_all or mat is None:
        return 0

    # Map translation id -> row index in the embedding matrix.
    pos = {tid: i for i, tid in enumerate(ids_all)}

    processed = 0
    with conn.cursor() as cur:
        for tr_id in target_ids:
            idx = pos.get(tr_id)
            if idx is None:
                # No embedding loaded for this id (e.g. filtered out).
                continue
            pairs = _topk_numpy(idx, ids_all, mat, TOPK)
            _insert_related(cur, tr_id, pairs)
            processed += 1

    conn.commit()
    return processed
# ---------------------------------------------------------
# MAIN
# ---------------------------------------------------------
def main():
logging.info(
"Iniciando related_worker (TOPK=%s, BATCH_IDS=%s, BATCH_SIM=%s, MIN_SCORE=%.3f, WINDOW_H=%s)",
"Iniciando related_worker (TOPK=%s, BATCH_IDS=%s, MIN_SCORE=%.3f, WINDOW_H=%s)",
TOPK,
BATCH_IDS,
BATCH_SIM,
MIN_SCORE,
WINDOW_HOURS,
)