195 lines
5.1 KiB
Python
195 lines
5.1 KiB
Python
import os
|
|
import time
|
|
import math
|
|
import logging
|
|
from typing import List, Tuple
|
|
|
|
import psycopg2
|
|
import psycopg2.extras
|
|
|
|
# Root logger: INFO level, every record prefixed with the "[related]" tag.
logging.basicConfig(
    format='[related] %(asctime)s %(levelname)s: %(message)s',
    level=logging.INFO,
)

# Keyword arguments handed to psycopg2.connect(); each one can be overridden
# through the environment variable named on its line.
DB = {
    "host": os.getenv("DB_HOST", "localhost"),
    "port": int(os.getenv("DB_PORT", 5432)),
    "dbname": os.getenv("DB_NAME", "rss"),
    "user": os.getenv("DB_USER", "rss"),
    "password": os.getenv("DB_PASS", "x"),
}

# Number of related items kept per translation.
TOPK = int(os.getenv("RELATED_TOPK", 10))
# Maximum number of pending translation ids fetched per worker cycle.
BATCH_IDS = int(os.getenv("RELATED_BATCH_IDS", 200))
# Number of candidate embeddings scored per block while ranking.
BATCH_SIM = int(os.getenv("RELATED_BATCH_SIM", 2000))
# Seconds to sleep when there is nothing to do or after an error.
SLEEP_IDLE = float(os.getenv("RELATED_SLEEP", 10))
# Minimum similarity score to keep; the filter is only applied when > 0.
MIN_SCORE = float(os.getenv("RELATED_MIN_SCORE", 0.0))
# When > 0, only embeddings of news dated within this many hours are loaded;
# 0 loads every embedding.
WINDOW_HOURS = int(os.getenv("RELATED_WINDOW_H", 0))
|
|
|
|
|
|
def get_conn():
    """Open and return a new psycopg2 connection built from the ``DB`` settings."""
    connection = psycopg2.connect(**DB)
    return connection
|
|
|
|
|
|
def _fetch_all_embeddings(cur):
    """Load embedding vectors and precompute their Euclidean norms.

    When ``WINDOW_HOURS`` is positive, only embeddings whose joined
    ``noticias.fecha`` falls within the last ``WINDOW_HOURS`` hours are
    selected; otherwise every row of ``embeddings`` is read.

    Args:
        cur: An open database cursor used to run the query.

    Returns:
        A tuple ``(ids, vecs, norms)`` of three parallel lists: the
        ``traduccion_id`` values, their vectors (a ``None`` vector becomes
        ``[]``), and the vector norms. A zero norm is replaced by ``1e-8``.
    """
    if WINDOW_HOURS > 0:
        windowed_sql = """
            SELECT e.traduccion_id, e.vec
            FROM embeddings e
            JOIN traducciones t ON t.id = e.traduccion_id
            JOIN noticias n ON n.id = t.noticia_id
            WHERE n.fecha >= NOW() - INTERVAL %s
            """
        cur.execute(windowed_sql, (f"{WINDOW_HOURS} hours",))
    else:
        cur.execute("SELECT traduccion_id, vec FROM embeddings")

    ids, vecs, norms = [], [], []
    # An empty result simply leaves the three lists empty.
    for tr_id, raw_vec in (cur.fetchall() or ()):
        vec = [] if raw_vec is None else raw_vec
        # Falsy components (e.g. None) contribute 0.0 to the norm.
        squared = sum(((c or 0.0) * (c or 0.0)) for c in vec)
        ids.append(tr_id)
        vecs.append(vec)
        # Fall back to a tiny positive norm so later divisions never hit zero.
        norms.append(math.sqrt(squared) or 1e-8)
    return ids, vecs, norms
|
|
|
|
|
|
def _fetch_pending_ids(cur, limit) -> List[int]:
|
|
cur.execute(
|
|
"""
|
|
SELECT e.traduccion_id
|
|
FROM embeddings e
|
|
LEFT JOIN related_noticias r ON r.traduccion_id = e.traduccion_id
|
|
GROUP BY e.traduccion_id
|
|
HAVING COUNT(r.related_traduccion_id) = 0
|
|
ORDER BY e.traduccion_id DESC
|
|
LIMIT %s;
|
|
""",
|
|
(limit,),
|
|
)
|
|
return [r[0] for r in cur.fetchall()]
|
|
|
|
|
|
def _cosine_with_norms(a, b, na, nb):
|
|
num = 0.0
|
|
for x, y in zip(a, b):
|
|
xv = x or 0.0
|
|
yv = y or 0.0
|
|
num += xv * yv
|
|
denom = na * nb
|
|
if denom <= 0.0:
|
|
return 0.0
|
|
return num / denom
|
|
|
|
|
|
def _topk_for_one(
    idx: int,
    ids_all: List[int],
    vecs_all: List[List[float]],
    norms_all: List[float],
    pool_indices: List[int],
    K: int,
) -> List[Tuple[int, float]]:
    """Rank the candidates in ``pool_indices`` by similarity to item ``idx``.

    Args:
        idx: Position of the query item in the ``*_all`` lists.
        ids_all: Ids, parallel to ``vecs_all`` and ``norms_all``.
        vecs_all: Embedding vectors.
        norms_all: Precomputed norms of ``vecs_all``.
        pool_indices: Positions of the candidates to score; ``idx`` itself
            is skipped if present.
        K: Maximum number of results to return.

    Returns:
        Up to ``K`` ``(id, score)`` pairs in descending score order. When
        ``MIN_SCORE`` is positive, pairs scoring below it are dropped.
    """
    query_vec = vecs_all[idx]
    query_norm = norms_all[idx]

    scored: List[Tuple[int, float]] = [
        (
            ids_all[j],
            _cosine_with_norms(query_vec, vecs_all[j], query_norm, norms_all[j]),
        )
        for j in pool_indices
        if j != idx
    ]

    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)
    if MIN_SCORE > 0.0:
        ranked = [pair for pair in ranked if pair[1] >= MIN_SCORE]
    return ranked[:K]
|
|
|
|
|
|
def _insert_related(cur, tr_id: int, pairs: List[Tuple[int, float]]):
|
|
if not pairs:
|
|
return
|
|
psycopg2.extras.execute_values(
|
|
cur,
|
|
"""
|
|
INSERT INTO related_noticias (traduccion_id, related_traduccion_id, score)
|
|
VALUES %s
|
|
ON CONFLICT (traduccion_id, related_traduccion_id)
|
|
DO UPDATE SET score = EXCLUDED.score
|
|
""",
|
|
[(tr_id, rid, float(score)) for (rid, score) in pairs],
|
|
)
|
|
|
|
|
|
def build_for_ids(conn, target_ids: List[int]) -> int:
    """Compute and store the top related items for each id in ``target_ids``.

    All embeddings are loaded once, then each target is scored against them
    in blocks of ``BATCH_SIM`` candidates, keeping the best ``TOPK`` overall.

    Args:
        conn: An open database connection; it is committed before returning
            when any embeddings were loaded.
        target_ids: ``traduccion_id`` values to process. Ids without a
            loaded embedding are skipped.

    Returns:
        The number of target ids that were processed (0 when no embeddings
        could be loaded).
    """
    with conn.cursor() as read_cur:
        ids_all, vecs_all, norms_all = _fetch_all_embeddings(read_cur)
        if not ids_all:
            return 0

    index_of = {tid: position for position, tid in enumerate(ids_all)}
    total = len(ids_all)
    processed = 0

    with conn.cursor() as write_cur:
        for tr_id in target_ids:
            me = index_of.get(tr_id)
            if me is None:
                continue

            best: List[Tuple[int, float]] = []
            for start in range(0, total, BATCH_SIM):
                stop = min(start + BATCH_SIM, total)
                block_best = _topk_for_one(
                    me, ids_all, vecs_all, norms_all, list(range(start, stop)), TOPK
                )
                # Merge this block's winners into the running top list.
                best = sorted(
                    best + block_best, key=lambda pair: pair[1], reverse=True
                )[:TOPK]

            _insert_related(write_cur, tr_id, best)
            processed += 1

    conn.commit()
    return processed
|
|
|
|
|
|
def main():
    """Run the worker loop forever.

    Each cycle fetches up to ``BATCH_IDS`` pending translation ids and builds
    their related items. The loop sleeps ``SLEEP_IDLE`` seconds when there is
    nothing pending, when nothing could be processed, or after an error
    (which is logged and does not stop the worker).
    """
    logging.info(
        "Iniciando related_worker (TOPK=%s, BATCH_IDS=%s, BATCH_SIM=%s, MIN_SCORE=%.3f, WINDOW_H=%s)",
        TOPK,
        BATCH_IDS,
        BATCH_SIM,
        MIN_SCORE,
        WINDOW_HOURS,
    )
    while True:
        try:
            conn = get_conn()
            try:
                # `with conn` only ends the transaction (commit/rollback);
                # it does NOT close the connection, hence the `finally` below.
                with conn, conn.cursor() as cur:
                    todo = _fetch_pending_ids(cur, BATCH_IDS)

                done = 0
                if todo:
                    with conn:
                        done = build_for_ids(conn, todo)
                    logging.info("Relacionadas generadas/actualizadas para %d traducciones.", done)
            finally:
                # Release the connection every cycle so the long-running
                # loop does not accumulate open connections.
                conn.close()

            if done == 0:
                # Nothing pending, or none of the pending ids could be
                # processed: back off instead of re-querying immediately.
                time.sleep(SLEEP_IDLE)

        except Exception:
            logging.exception("Error en related_worker")
            time.sleep(SLEEP_IDLE)


if __name__ == "__main__":
    main()
|
|
|