optimizaciones
This commit is contained in:
parent
937da3f90b
commit
86ee083b90
5 changed files with 26 additions and 100 deletions
|
|
@ -20,10 +20,10 @@ DB = dict(
|
|||
password=os.environ.get("DB_PASS", "x"),
|
||||
)
|
||||
|
||||
TOPK = int(os.environ.get("RELATED_TOPK", 10))
|
||||
BATCH_IDS = int(os.environ.get("RELATED_BATCH_IDS", 200))
|
||||
SLEEP_IDLE = float(os.environ.get("RELATED_SLEEP", 10))
|
||||
MIN_SCORE = float(os.environ.get("RELATED_MIN_SCORE", 0.0))
|
||||
TOPK = int(os.environ.get("RELATED_TOPK", 10))
|
||||
BATCH_IDS = int(os.environ.get("RELATED_BATCH_IDS", 200))
|
||||
SLEEP_IDLE = float(os.environ.get("RELATED_SLEEP", 10))
|
||||
MIN_SCORE = float(os.environ.get("RELATED_MIN_SCORE", 0.0))
|
||||
WINDOW_HOURS = int(os.environ.get("RELATED_WINDOW_H", 0))
|
||||
|
||||
|
||||
|
|
@ -31,9 +31,6 @@ def get_conn():
|
|||
return psycopg2.connect(**DB)
|
||||
|
||||
|
||||
# ---------------------------------------------------------
|
||||
# Cargar embeddings SOLO de traducciones en español (lang_to='es')
|
||||
# ---------------------------------------------------------
|
||||
def _fetch_all_embeddings(cur):
|
||||
base_sql = """
|
||||
SELECT e.traduccion_id, e.vec
|
||||
|
|
@ -49,8 +46,8 @@ def _fetch_all_embeddings(cur):
|
|||
params.append(f"{WINDOW_HOURS} hours")
|
||||
|
||||
cur.execute(base_sql, params)
|
||||
|
||||
rows = cur.fetchall()
|
||||
|
||||
if not rows:
|
||||
return [], None
|
||||
|
||||
|
|
@ -66,10 +63,7 @@ def _fetch_all_embeddings(cur):
|
|||
if not ids:
|
||||
return [], None
|
||||
|
||||
# Convertimos a matriz numpy
|
||||
mat = np.array(vecs, dtype=np.float32)
|
||||
|
||||
# Normalizamos (evita división por 0)
|
||||
norms = np.linalg.norm(mat, axis=1, keepdims=True)
|
||||
norms[norms == 0] = 1e-8
|
||||
mat = mat / norms
|
||||
|
|
@ -77,9 +71,6 @@ def _fetch_all_embeddings(cur):
|
|||
return ids, mat
|
||||
|
||||
|
||||
# ---------------------------------------------------------
|
||||
# Obtiene IDs pendientes
|
||||
# ---------------------------------------------------------
|
||||
def _fetch_pending_ids(cur, limit) -> List[int]:
|
||||
cur.execute(
|
||||
"""
|
||||
|
|
@ -99,44 +90,24 @@ def _fetch_pending_ids(cur, limit) -> List[int]:
|
|||
return [r[0] for r in cur.fetchall()]
|
||||
|
||||
|
||||
# ---------------------------------------------------------
|
||||
# TOP-K usando NumPy (súper rápido)
|
||||
# ---------------------------------------------------------
|
||||
def _topk_numpy(idx: int, ids_all: List[int], mat: np.ndarray, K: int) -> List[Tuple[int, float]]:
    """Return up to K best-scoring neighbours of row ``idx`` in ``mat``.

    Scores are dot products between ``mat[idx]`` and every row of ``mat``;
    the embedding loader in this file L2-normalises the rows, which makes
    the score a cosine similarity.

    Args:
        idx: Row index of the query item inside ``mat``.
        ids_all: Identifier for each row of ``mat`` (same order, same length).
        mat: 2-D array with one embedding per row.
        K: Maximum number of neighbours to return.

    Returns:
        ``(id, score)`` pairs sorted by descending score. The query row is
        never included, and when the module-level ``MIN_SCORE`` is positive,
        rows scoring below it are dropped. Fewer than K pairs are returned
        when fewer valid candidates exist; an empty list when ``K <= 0``.
    """
    if K <= 0:
        return []

    query = mat[idx]
    sims = np.dot(mat, query)

    # Build an explicit candidate mask instead of overwriting scores with a
    # sentinel: sentinel rows used to leak into the result (with a bogus
    # -999.0 score) whenever fewer than K valid candidates were available.
    valid = np.ones(sims.shape[0], dtype=bool)
    valid[idx] = False  # never report the item as related to itself
    if MIN_SCORE > 0:
        valid &= sims >= MIN_SCORE

    candidates = np.flatnonzero(valid)
    if candidates.size == 0:
        return []

    scores = sims[candidates]
    if K < candidates.size:
        # Partial selection first: avoids fully sorting every candidate.
        keep = np.argpartition(-scores, K - 1)[:K]
        candidates = candidates[keep]
        scores = scores[keep]

    order = np.argsort(-scores)
    return [(ids_all[int(candidates[j])], float(scores[j])) for j in order]
|
||||
|
||||
|
||||
# ---------------------------------------------------------
|
||||
# Inserta en la tabla related_noticias
|
||||
# ---------------------------------------------------------
|
||||
def _insert_related(cur, tr_id: int, pairs: List[Tuple[int, float]]):
|
||||
if not pairs:
|
||||
return
|
||||
|
|
@ -152,9 +123,6 @@ def _insert_related(cur, tr_id: int, pairs: List[Tuple[int, float]]):
|
|||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------
|
||||
# Procesar IDs objetivo
|
||||
# ---------------------------------------------------------
|
||||
def build_for_ids(conn, target_ids: List[int]) -> int:
|
||||
with conn.cursor() as cur:
|
||||
ids_all, mat = _fetch_all_embeddings(cur)
|
||||
|
|
@ -162,7 +130,6 @@ def build_for_ids(conn, target_ids: List[int]) -> int:
|
|||
if not ids_all or mat is None:
|
||||
return 0
|
||||
|
||||
# Mapa ID → index
|
||||
pos = {tid: i for i, tid in enumerate(ids_all)}
|
||||
processed = 0
|
||||
|
||||
|
|
@ -181,9 +148,6 @@ def build_for_ids(conn, target_ids: List[int]) -> int:
|
|||
return processed
|
||||
|
||||
|
||||
# ---------------------------------------------------------
|
||||
# MAIN
|
||||
# ---------------------------------------------------------
|
||||
def main():
|
||||
logging.info(
|
||||
"Iniciando related_worker (TOPK=%s, BATCH_IDS=%s, MIN_SCORE=%.3f, WINDOW_H=%s)",
|
||||
|
|
@ -192,6 +156,7 @@ def main():
|
|||
MIN_SCORE,
|
||||
WINDOW_HOURS,
|
||||
)
|
||||
|
||||
while True:
|
||||
try:
|
||||
with get_conn() as conn, conn.cursor() as cur:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue