import os
import time
import logging
from typing import List, Dict, Any, Optional, Tuple

import numpy as np
import psycopg2
import psycopg2.extras
from psycopg2.extras import Json, execute_values

# -------------------------------------------------------------
# LOGGING
# -------------------------------------------------------------
logging.basicConfig(
    level=logging.INFO,
    format='[cluster_worker] %(asctime)s %(levelname)s: %(message)s',
)
log = logging.getLogger(__name__)

# -------------------------------------------------------------
# CONFIG
# -------------------------------------------------------------
DB = dict(
    host=os.environ.get("DB_HOST", "localhost"),
    port=int(os.environ.get("DB_PORT", 5432)),
    dbname=os.environ.get("DB_NAME", "rss"),
    user=os.environ.get("DB_USER", "rss"),
    password=os.environ.get("DB_PASS", "x"),
)

EVENT_LANGS = [
    s.strip().lower()
    for s in os.environ.get("EVENT_LANGS", "es").split(",")
    if s.strip()
]
EVENT_BATCH_IDS = int(os.environ.get("EVENT_BATCH_IDS", "200"))
EVENT_SLEEP_IDLE = float(os.environ.get("EVENT_SLEEP_IDLE", "5.0"))
EVENT_DIST_THRESHOLD = float(os.environ.get("EVENT_DIST_THRESHOLD", "0.25"))
EMB_MODEL = os.environ.get(
    "EMB_MODEL",
    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
)

# -------------------------------------------------------------
# DB CONNECTION
# -------------------------------------------------------------
def get_conn():
    return psycopg2.connect(**DB)

# -------------------------------------------------------------
# SCHEMA CHECK
# -------------------------------------------------------------
def ensure_schema(conn):
    """Create indexes if missing. Idempotent, but note that CREATE INDEX
    without CONCURRENTLY blocks writes on the table while it builds."""
    with conn.cursor() as cur:
        cur.execute("""
            CREATE INDEX IF NOT EXISTS idx_traducciones_evento
            ON traducciones(evento_id);
        """)
        # NOTE: the name says "fecha" for historical reasons; the index
        # actually covers (evento_id, noticia_id).
        cur.execute("""
            CREATE INDEX IF NOT EXISTS idx_traducciones_evento_fecha
            ON traducciones(evento_id, noticia_id);
        """)
    conn.commit()

# -------------------------------------------------------------
# FETCH PENDING
# -------------------------------------------------------------
def fetch_pending_traducciones(conn) -> List[int]:
    """Completed translations that have an embedding but no assigned event."""
    with conn.cursor() as cur:
        cur.execute(
            """
            SELECT t.id
            FROM traducciones t
            JOIN traduccion_embeddings e
              ON e.traduccion_id = t.id AND e.model = %s
            WHERE t.status = 'done'
              AND t.evento_id IS NULL
              AND t.lang_to = ANY(%s)
            ORDER BY t.id DESC
            LIMIT %s;
            """,
            (EMB_MODEL, EVENT_LANGS, EVENT_BATCH_IDS),
        )
        rows = cur.fetchall()
    return [r[0] for r in rows]

# -------------------------------------------------------------
# FETCH EMBEDDINGS
# -------------------------------------------------------------
def fetch_embeddings_for(conn, tr_ids: List[int]) -> Dict[int, np.ndarray]:
    """Fetch embeddings as validated, L2-normalized float32 vectors."""
    if not tr_ids:
        return {}
    with conn.cursor() as cur:
        cur.execute(
            """
            SELECT traduccion_id, embedding
            FROM traduccion_embeddings
            WHERE traduccion_id = ANY(%s) AND model = %s;
            """,
            (tr_ids, EMB_MODEL),
        )
        rows = cur.fetchall()
    out = {}
    for tr_id, emb in rows:
        if not emb:
            continue
        try:
            arr = np.asarray(emb, dtype=np.float32)
            if arr.ndim != 1 or arr.size == 0:
                continue
            if np.isnan(arr).any():
                continue
            norm = np.linalg.norm(arr)
            if norm > 0:
                arr = arr / norm
            out[int(tr_id)] = arr
        except Exception:
            continue
    return out
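
# Illustrative sketch (not called by the worker; the function name is ours):
# once vectors are L2-normalized as above, cosine similarity is a plain dot
# product, so cosine distance is 1 - dot(a, b). This is what lets
# CentroidIndex below search with a single matrix product.
def _cosine_distance_sketch() -> None:
    a = np.asarray([3.0, 4.0], dtype=np.float32)
    b = np.asarray([4.0, 3.0], dtype=np.float32)
    a = a / np.linalg.norm(a)  # [0.6, 0.8]
    b = b / np.linalg.norm(b)  # [0.8, 0.6]
    # cos(theta) = 0.6*0.8 + 0.8*0.6 = 0.96 -> distance = 0.04
    assert abs((1.0 - float(a @ b)) - 0.04) < 1e-5
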

# -------------------------------------------------------------
# FETCH CENTROIDS (optimized with matrix)
# -------------------------------------------------------------
class CentroidIndex:
    """Vectorized index for fast nearest-centroid search."""

    def __init__(self):
        self.centroids: List[Dict[str, Any]] = []
        self._matrix: Optional[np.ndarray] = None
        self._ids: List[int] = []  # evento ids, parallel to the matrix rows

    def load_from_db(self, conn):
        """Load centroids from the database."""
        with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
            cur.execute("""
                SELECT id, centroid, total_traducciones
                FROM eventos
                ORDER BY id;
            """)
            rows = cur.fetchall()
        self.centroids = []
        vectors = []
        for r in rows:
            raw = r["centroid"]
            if not isinstance(raw, list):
                continue
            try:
                arr = np.asarray(raw, dtype=np.float32)
                if arr.ndim != 1 or arr.size == 0:
                    continue
                if np.isnan(arr).any():
                    continue
                norm = np.linalg.norm(arr)
                if norm > 0:
                    arr = arr / norm
                self.centroids.append({
                    "id": int(r["id"]),
                    "vec": arr,
                    "n": int(r["total_traducciones"] or 1),
                })
                vectors.append(arr)
            except Exception:
                continue
        # Build the matrix used for vectorized search
        if vectors:
            self._matrix = np.vstack(vectors)
            self._ids = [c["id"] for c in self.centroids]
        else:
            self._matrix = None
            self._ids = []

    def find_nearest(self, vec: np.ndarray) -> Tuple[Optional[int], float]:
        """Return (row index into self.centroids, cosine distance) of the
        nearest centroid, or (None, 1.0) if the index is empty."""
        if self._matrix is None or len(self.centroids) == 0:
            return None, 1.0
        # Vectorized cosine similarity: one matrix-vector product, valid
        # because both the centroids and the query are L2-normalized
        similarities = self._matrix @ vec
        best_idx = int(np.argmax(similarities))
        best_sim = float(similarities[best_idx])
        best_dist = 1.0 - max(-1.0, min(1.0, best_sim))
        return best_idx, best_dist

    def add_centroid(self, evento_id: int, vec: np.ndarray):
        """Append a new centroid to the index."""
        self.centroids.append({"id": evento_id, "vec": vec.copy(), "n": 1})
        if self._matrix is None:
            self._matrix = vec.reshape(1, -1)
        else:
            self._matrix = np.vstack([self._matrix, vec])
        self._ids.append(evento_id)

    def update_centroid(self, idx: int, new_vec: np.ndarray, new_n: int):
        """Update an existing centroid in place."""
        self.centroids[idx]["vec"] = new_vec
        self.centroids[idx]["n"] = new_n
        if self._matrix is not None:
            self._matrix[idx] = new_vec
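
# Illustrative usage (hypothetical 2-D vectors; real embeddings have a much
# higher dimension): build a tiny index by hand and show how the
# EVENT_DIST_THRESHOLD gate decides between joining an existing evento and
# opening a new one.
def _threshold_sketch() -> None:
    idx = CentroidIndex()
    idx.add_centroid(evento_id=1, vec=np.asarray([1.0, 0.0], dtype=np.float32))
    near = np.asarray([0.9, 0.1], dtype=np.float32)
    near = near / np.linalg.norm(near)
    best_idx, dist = idx.find_nearest(near)
    # dist ~ 0.006, well under the default threshold of 0.25 -> would join
    assert best_idx == 0 and dist <= EVENT_DIST_THRESHOLD
    far = np.asarray([0.0, 1.0], dtype=np.float32)
    _, dist_far = idx.find_nearest(far)
    # orthogonal vector: dist = 1.0 -> would open a new evento
    assert dist_far > EVENT_DIST_THRESHOLD
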

# -------------------------------------------------------------
# BATCH FETCH TRADUCCION INFO
# -------------------------------------------------------------
def fetch_traducciones_info_batch(conn, tr_ids: List[int]) -> Dict[int, Dict[str, Any]]:
    """Fetch info for many traducciones in a single query."""
    if not tr_ids:
        return {}
    with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
        cur.execute(
            """
            SELECT t.id AS traduccion_id,
                   t.noticia_id,
                   n.fecha,
                   COALESCE(NULLIF(t.titulo_trad, ''), n.titulo) AS titulo_evento
            FROM traducciones t
            JOIN noticias n ON n.id = t.noticia_id
            WHERE t.id = ANY(%s);
            """,
            (tr_ids,),
        )
        rows = cur.fetchall()
    result = {}
    for row in rows:
        tr_id = int(row["traduccion_id"])
        result[tr_id] = {
            "traduccion_id": tr_id,
            "noticia_id": row["noticia_id"],
            "fecha": row["fecha"],
            "titulo_evento": row["titulo_evento"] or "",
        }
    return result

# -------------------------------------------------------------
# BATCH PROCESSING
# -------------------------------------------------------------
def process_batch_optimized(
    conn,
    pending_ids: List[int],
    emb_by_tr: Dict[int, np.ndarray],
    centroid_index: CentroidIndex,
) -> int:
    """Process a whole batch with batched queries and a vectorized index."""
    # 1. Fetch all traduccion info in one query
    infos = fetch_traducciones_info_batch(conn, pending_ids)

    # Classify every pending item before writing anything.
    # NOTE: centroids created later in this same batch are not visible to
    # find_nearest here, so two near-duplicate items arriving in one batch
    # each open their own evento.
    assign_existing = []  # (tr_id, evento_id, idx, vec, info)
    assign_new = []       # (tr_id, vec, info) - gets its evento_id after insert
    processed = 0

    for tr_id in pending_ids:
        vec = emb_by_tr.get(tr_id)
        if vec is None:
            continue
        info = infos.get(tr_id)
        if not info:
            continue
        processed += 1
        if len(centroid_index.centroids) == 0:
            # First event ever
            assign_new.append((tr_id, vec, info))
        else:
            best_idx, best_dist = centroid_index.find_nearest(vec)
            if best_idx is not None and best_dist <= EVENT_DIST_THRESHOLD:
                assign_existing.append(
                    (tr_id, centroid_index.centroids[best_idx]["id"], best_idx, vec, info)
                )
            else:
                assign_new.append((tr_id, vec, info))

    with conn.cursor() as cur:
        # 2. Insert new eventos (RETURNING id forces one execute per row)
        new_evento_ids = {}
        for tr_id, vec, info in assign_new:
            cur.execute(
                """
                INSERT INTO eventos
                    (centroid, total_traducciones, fecha_inicio,
                     fecha_fin, n_noticias, titulo)
                VALUES (%s, 1, %s, %s, 1, %s)
                RETURNING id;
                """,
                (
                    Json(vec.tolist()),
                    info["fecha"],
                    info["fecha"],
                    info["titulo_evento"],
                ),
            )
            new_id = cur.fetchone()[0]
            new_evento_ids[tr_id] = new_id
            centroid_index.add_centroid(new_id, vec)

        # 3. Update existing eventos and centroids. The update
        #    (old * n + vec) / (n + 1) is a running mean; renormalizing
        #    afterwards keeps only its direction (a spherical-mean
        #    approximation), which is all cosine search needs.
        for tr_id, evento_id, idx, vec, info in assign_existing:
            c = centroid_index.centroids[idx]
            n_old = c["n"]
            n_new = n_old + 1
            new_vec = (c["vec"] * n_old + vec) / float(n_new)
            norm = np.linalg.norm(new_vec)
            if norm > 0:
                new_vec = new_vec / norm
            centroid_index.update_centroid(idx, new_vec, n_new)
            cur.execute(
                """
                UPDATE eventos
                SET centroid = %s,
                    total_traducciones = total_traducciones + 1,
                    fecha_inicio = LEAST(fecha_inicio, %s),
                    fecha_fin = GREATEST(fecha_fin, %s),
                    n_noticias = n_noticias + 1
                WHERE id = %s;
                """,
                (Json(new_vec.tolist()), info["fecha"], info["fecha"], evento_id),
            )

        # 4. Batch update traducciones.evento_id in one round trip
        trad_updates = []
        for tr_id, evento_id, _, _, _ in assign_existing:
            trad_updates.append((evento_id, tr_id))
        for tr_id, _, _ in assign_new:
            trad_updates.append((new_evento_ids[tr_id], tr_id))
        if trad_updates:
            execute_values(
                cur,
                """
                UPDATE traducciones AS t
                SET evento_id = v.evento_id
                FROM (VALUES %s) AS v(evento_id, id)
                WHERE t.id = v.id;
                """,
                trad_updates,
            )

        # 5. Batch insert eventos_noticias
        en_inserts = []
        for tr_id, evento_id, _, _, info in assign_existing:
            if info.get("noticia_id"):
                en_inserts.append((evento_id, info["noticia_id"], info["traduccion_id"]))
        for tr_id, _, info in assign_new:
            if info.get("noticia_id"):
                en_inserts.append((new_evento_ids[tr_id], info["noticia_id"], info["traduccion_id"]))
        if en_inserts:
            execute_values(
                cur,
                """
                INSERT INTO eventos_noticias (evento_id, noticia_id, traduccion_id)
                VALUES %s
                ON CONFLICT DO NOTHING;
                """,
                en_inserts,
            )

    return processed
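
# Illustrative sketch (hypothetical data, not called by the worker): without
# the renormalization in step 3, the update (c * n + v) / (n + 1) is exactly
# the running arithmetic mean of all vectors seen so far. The worker
# renormalizes after each step because only the centroid's direction matters
# for cosine search.
def _running_mean_sketch() -> None:
    rng = np.random.default_rng(0)
    vecs = rng.normal(size=(5, 4)).astype(np.float32)
    c, n = vecs[0].copy(), 1
    for v in vecs[1:]:
        c = (c * n + v) / float(n + 1)  # incremental mean, no renormalization
        n += 1
    assert np.allclose(c, vecs.mean(axis=0), atol=1e-5)
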

# -------------------------------------------------------------
# MAIN LOOP
# -------------------------------------------------------------
def main():
    log.info(
        "Starting cluster_worker (optimized) langs=%s batch=%d threshold=%.3f emb=%s",
        ",".join(EVENT_LANGS), EVENT_BATCH_IDS, EVENT_DIST_THRESHOLD, EMB_MODEL,
    )
    while True:
        try:
            conn = get_conn()
            try:
                ensure_schema(conn)
                pending_ids = fetch_pending_traducciones(conn)
                if not pending_ids:
                    time.sleep(EVENT_SLEEP_IDLE)
                    continue
                emb_by_tr = fetch_embeddings_for(conn, pending_ids)
                if not emb_by_tr:
                    time.sleep(EVENT_SLEEP_IDLE)
                    continue

                # Load centroids into the vectorized index
                centroid_index = CentroidIndex()
                centroid_index.load_from_db(conn)

                # Process the batch with the optimized path
                t0 = time.time()
                processed = process_batch_optimized(
                    conn, pending_ids, emb_by_tr, centroid_index
                )
                dt = time.time() - t0
                conn.commit()
                log.info(
                    "Cluster OK: %d processed in %.2fs (%.1f/s)",
                    processed, dt, processed / dt if dt > 0 else 0,
                )
            finally:
                # psycopg2's `with conn:` only manages the transaction, not
                # the connection itself, so close it explicitly each cycle.
                conn.close()
        except Exception:
            log.exception("Error in cluster_worker")
            time.sleep(EVENT_SLEEP_IDLE)

if __name__ == "__main__":
    main()
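
# Illustrative deployment sketch (hypothetical values and filename): the
# worker is configured entirely through environment variables, e.g.
#
#   DB_HOST=db.internal EVENT_LANGS=es,en EVENT_BATCH_IDS=500 \
#   EVENT_DIST_THRESHOLD=0.20 python cluster_worker.py
#
# A lower EVENT_DIST_THRESHOLD is stricter (more, smaller eventos); a higher
# one merges more aggressively. The default of 0.25 cosine distance
# corresponds to a cosine similarity of 0.75.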