quito comentarios

This commit is contained in:
jlimolina 2025-11-24 01:40:46 +01:00
parent 68a5528f2f
commit 937da3f90b
8 changed files with 48 additions and 496 deletions

View file

@ -31,30 +31,41 @@ EVENT_BATCH_IDS = int(os.environ.get("EVENT_BATCH_IDS", "200"))
# Worker tuning knobs; each one can be overridden through the environment
# variable of the same name.

# Logged at startup as "SLEEP=...s"; presumably the number of seconds the
# worker sleeps when there is nothing to process -- confirm in the main loop.
EVENT_SLEEP_IDLE = float(os.environ.get("EVENT_SLEEP_IDLE", "5.0"))
# Distance threshold logged as DIST_THRESHOLD; presumably the maximum distance
# to a centroid for joining an existing event -- confirm in assign_to_event.
EVENT_DIST_THRESHOLD = float(os.environ.get("EVENT_DIST_THRESHOLD", "0.25"))
# Embedding model name; queries filter traduccion_embeddings rows by this
# value in their `model` column.
EMB_MODEL = os.environ.get(
"EMB_MODEL",
"sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
)
def get_conn():
    """Open and return a new psycopg2 connection.

    The connection parameters come from the module-level ``DB`` mapping,
    which is unpacked as keyword arguments to ``psycopg2.connect``.
    """
    connection = psycopg2.connect(**DB)
    return connection
def ensure_schema(conn):
"""
Asegura que la tabla de eventos y las columnas necesarias existen.
Aquí se asume el esquema original de eventos con centroid JSONB.
"""
with conn.cursor() as cur:
cur.execute(
"""
CREATE TABLE IF NOT EXISTS eventos (
id SERIAL PRIMARY KEY,
creado_en TIMESTAMP NOT NULL DEFAULT NOW(),
creado_en TIMESTAMP NOT NULL DEFAULT NOW(),
actualizado_en TIMESTAMP NOT NULL DEFAULT NOW(),
centroid JSONB NOT NULL,
centroid JSONB NOT NULL,
total_traducciones INTEGER NOT NULL DEFAULT 1
);
"""
)
cur.execute(
"""
ALTER TABLE traducciones
ADD COLUMN IF NOT EXISTS evento_id INTEGER REFERENCES eventos(id);
"""
)
cur.execute(
"""
CREATE INDEX IF NOT EXISTS idx_traducciones_evento
@ -67,6 +78,7 @@ def ensure_schema(conn):
ON traducciones(evento_id, noticia_id);
"""
)
cur.execute(
"""
CREATE OR REPLACE FUNCTION actualizar_evento_modificado()
@ -91,43 +103,55 @@ def ensure_schema(conn):
def fetch_pending_traducciones(conn) -> List[int]:
    """
    Return the ids of translations that are ready to be assigned to an event.

    A translation qualifies when its status is 'done', it has no event yet
    (``evento_id IS NULL``), its target language is one of ``EVENT_LANGS``,
    and it already has a row in ``traduccion_embeddings`` for ``EMB_MODEL``.

    At most ``EVENT_BATCH_IDS`` ids are returned, highest id first.
    """
    with conn.cursor() as cur:
        # Exactly one join on the embeddings table (alias `e`) and exactly one
        # parameter tuple whose order matches the three placeholders:
        # model, languages, limit.
        cur.execute(
            """
            SELECT t.id
            FROM traducciones t
            JOIN traduccion_embeddings e
              ON e.traduccion_id = t.id
             AND e.model = %s
            WHERE t.status = 'done'
              AND t.evento_id IS NULL
              AND t.lang_to = ANY(%s)
            ORDER BY t.id DESC
            LIMIT %s;
            """,
            (EMB_MODEL, EVENT_LANGS, EVENT_BATCH_IDS),
        )
        rows = cur.fetchall()
    return [r[0] for r in rows]
def fetch_embeddings_for(conn, tr_ids: List[int]) -> Dict[int, np.ndarray]:
"""
Devuelve un diccionario {traduccion_id: vector_numpy}
leyendo de traduccion_embeddings.embedding para el EMB_MODEL.
"""
if not tr_ids:
return {}
with conn.cursor() as cur:
cur.execute(
"""
SELECT traduccion_id, vec
FROM embeddings
WHERE traduccion_id = ANY(%s);
SELECT traduccion_id, embedding
FROM traduccion_embeddings
WHERE traduccion_id = ANY(%s)
AND model = %s;
""",
(tr_ids,),
(tr_ids, EMB_MODEL),
)
rows = cur.fetchall()
out: Dict[int, np.ndarray] = {}
for tr_id, vec in rows:
if not vec:
for tr_id, emb in rows:
if not emb:
continue
arr = np.array([float(x or 0.0) for x in vec], dtype="float32")
arr = np.array([float(x or 0.0) for x in emb], dtype="float32")
if arr.size == 0:
continue
out[int(tr_id)] = arr
@ -135,6 +159,9 @@ def fetch_embeddings_for(conn, tr_ids: List[int]) -> Dict[int, np.ndarray]:
def fetch_centroids(conn) -> List[Dict[str, Any]]:
"""
Carga todos los centroides actuales desde eventos.
"""
with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
cur.execute(
"""
@ -180,6 +207,10 @@ def assign_to_event(
vec: np.ndarray,
centroids: List[Dict[str, Any]],
) -> None:
"""
Asigna una traducción a un evento existente (si distancia <= umbral)
o crea un evento nuevo con este vector como centroide.
"""
from psycopg2.extras import Json
if vec is None or vec.size == 0:
@ -260,11 +291,12 @@ def assign_to_event(
def main():
log.info(
"Iniciando cluster_worker eventos "
"(EVENT_LANGS=%s, BATCH_IDS=%s, DIST_THRESHOLD=%.3f, SLEEP=%.1fs)",
"(EVENT_LANGS=%s, BATCH_IDS=%s, DIST_THRESHOLD=%.3f, SLEEP=%.1fs, EMB_MODEL=%s)",
",".join(EVENT_LANGS),
EVENT_BATCH_IDS,
EVENT_DIST_THRESHOLD,
EVENT_SLEEP_IDLE,
EMB_MODEL,
)
while True: