Initial clean commit

jlimolina 2026-01-13 13:39:51 +01:00
commit 6784d81c2c
141 changed files with 25219 additions and 0 deletions

1
workers/__init__.py Normal file

@@ -0,0 +1 @@
# Workers package

447
workers/cluster_worker.py Normal file

@@ -0,0 +1,447 @@
import os
import time
import logging
from typing import List, Dict, Any, Optional, Tuple
import numpy as np
import psycopg2
import psycopg2.extras
from psycopg2.extras import Json, execute_values
# -------------------------------------------------------------
# LOGGING
# -------------------------------------------------------------
logging.basicConfig(
level=logging.INFO,
format='[cluster_worker] %(asctime)s %(levelname)s: %(message)s'
)
log = logging.getLogger(__name__)
# -------------------------------------------------------------
# CONFIG
# -------------------------------------------------------------
DB = dict(
host=os.environ.get("DB_HOST", "localhost"),
port=int(os.environ.get("DB_PORT", 5432)),
dbname=os.environ.get("DB_NAME", "rss"),
user=os.environ.get("DB_USER", "rss"),
password=os.environ.get("DB_PASS", "x"),
)
EVENT_LANGS = [
s.strip().lower()
for s in os.environ.get("EVENT_LANGS", "es").split(",")
if s.strip()
]
EVENT_BATCH_IDS = int(os.environ.get("EVENT_BATCH_IDS", "200"))
EVENT_SLEEP_IDLE = float(os.environ.get("EVENT_SLEEP_IDLE", "5.0"))
EVENT_DIST_THRESHOLD = float(os.environ.get("EVENT_DIST_THRESHOLD", "0.25"))
EMB_MODEL = os.environ.get(
"EMB_MODEL",
"sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
)
# -------------------------------------------------------------
# DB CONNECTION
# -------------------------------------------------------------
def get_conn():
return psycopg2.connect(**DB)
# -------------------------------------------------------------
# SCHEMA CHECK
# -------------------------------------------------------------
def ensure_schema(conn):
"""Crea índices si no existen (seguro en producción)."""
with conn.cursor() as cur:
cur.execute("""
CREATE INDEX IF NOT EXISTS idx_traducciones_evento
ON traducciones(evento_id);
""")
cur.execute("""
CREATE INDEX IF NOT EXISTS idx_traducciones_evento_fecha
ON traducciones(evento_id, noticia_id);
""")
conn.commit()
# -------------------------------------------------------------
# FETCH PENDING
# -------------------------------------------------------------
def fetch_pending_traducciones(conn) -> List[int]:
"""Traducciones completadas sin evento asignado pero con embedding."""
with conn.cursor() as cur:
cur.execute(
"""
SELECT t.id
FROM traducciones t
JOIN traduccion_embeddings e
ON e.traduccion_id = t.id
AND e.model = %s
WHERE t.status = 'done'
AND t.evento_id IS NULL
AND t.lang_to = ANY(%s)
ORDER BY t.id DESC
LIMIT %s;
""",
(EMB_MODEL, EVENT_LANGS, EVENT_BATCH_IDS),
)
rows = cur.fetchall()
return [r[0] for r in rows]
# -------------------------------------------------------------
# FETCH EMBEDDINGS
# -------------------------------------------------------------
def fetch_embeddings_for(conn, tr_ids: List[int]) -> Dict[int, np.ndarray]:
"""Obtiene embeddings como vectores float32, validados y normales."""
if not tr_ids:
return {}
with conn.cursor() as cur:
cur.execute(
"""
SELECT traduccion_id, embedding
FROM traduccion_embeddings
WHERE traduccion_id = ANY(%s)
AND model = %s;
""",
(tr_ids, EMB_MODEL),
)
rows = cur.fetchall()
out = {}
for tr_id, emb in rows:
if not emb:
continue
try:
arr = np.asarray(emb, dtype=np.float32)
if arr.ndim != 1 or arr.size == 0:
continue
if np.isnan(arr).any():
continue
norm = np.linalg.norm(arr)
if norm > 0:
arr = arr / norm
out[int(tr_id)] = arr
except Exception:
continue
return out
# -------------------------------------------------------------
# FETCH CENTROIDS (optimized with matrix)
# -------------------------------------------------------------
class CentroidIndex:
"""Índice vectorizado para búsqueda rápida de centroides."""
def __init__(self):
self.centroids: List[Dict[str, Any]] = []
self._matrix: Optional[np.ndarray] = None
self._ids: List[int] = []
def load_from_db(self, conn):
"""Carga centroides de la BD."""
with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
cur.execute("""
SELECT id, centroid, total_traducciones
FROM eventos
ORDER BY id;
""")
rows = cur.fetchall()
self.centroids = []
vectors = []
for r in rows:
raw = r["centroid"]
if not isinstance(raw, list):
continue
try:
arr = np.asarray(raw, dtype=np.float32)
if arr.ndim != 1 or arr.size == 0:
continue
if np.isnan(arr).any():
continue
norm = np.linalg.norm(arr)
if norm > 0:
arr = arr / norm
self.centroids.append({
"id": int(r["id"]),
"vec": arr,
"n": int(r["total_traducciones"] or 1),
})
vectors.append(arr)
except Exception:
continue
# Build matrix for vectorized search
if vectors:
self._matrix = np.vstack(vectors)
self._ids = [c["id"] for c in self.centroids]
else:
self._matrix = None
self._ids = []
def find_nearest(self, vec: np.ndarray) -> Tuple[Optional[int], float]:
"""Encuentra el centroide más cercano usando operaciones vectorizadas."""
if self._matrix is None or len(self.centroids) == 0:
return None, 1.0
# Vectorized cosine similarity: dot product with normalized vectors
similarities = self._matrix @ vec
best_idx = int(np.argmax(similarities))
best_sim = float(similarities[best_idx])
best_dist = 1.0 - max(-1.0, min(1.0, best_sim))
return best_idx, best_dist
def add_centroid(self, evento_id: int, vec: np.ndarray):
"""Añade un nuevo centroide al índice."""
self.centroids.append({"id": evento_id, "vec": vec.copy(), "n": 1})
if self._matrix is None:
self._matrix = vec.reshape(1, -1)
else:
self._matrix = np.vstack([self._matrix, vec])
self._ids.append(evento_id)
def update_centroid(self, idx: int, new_vec: np.ndarray, new_n: int):
"""Actualiza un centroide existente."""
self.centroids[idx]["vec"] = new_vec
self.centroids[idx]["n"] = new_n
if self._matrix is not None:
self._matrix[idx] = new_vec
# -------------------------------------------------------------
# BATCH FETCH TRADUCCION INFO
# -------------------------------------------------------------
def fetch_traducciones_info_batch(conn, tr_ids: List[int]) -> Dict[int, Dict[str, Any]]:
"""Obtiene info de múltiples traducciones en una sola consulta."""
if not tr_ids:
return {}
with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
cur.execute(
"""
SELECT
t.id AS traduccion_id,
t.noticia_id,
n.fecha,
COALESCE(NULLIF(t.titulo_trad,''), n.titulo) AS titulo_evento
FROM traducciones t
JOIN noticias n ON n.id = t.noticia_id
WHERE t.id = ANY(%s);
""",
(tr_ids,),
)
rows = cur.fetchall()
result = {}
for row in rows:
tr_id = int(row["traduccion_id"])
result[tr_id] = {
"traduccion_id": tr_id,
"noticia_id": row["noticia_id"],
"fecha": row["fecha"],
"titulo_evento": row["titulo_evento"] or "",
}
return result
# -------------------------------------------------------------
# BATCH PROCESSING
# -------------------------------------------------------------
def process_batch_optimized(
conn,
pending_ids: List[int],
emb_by_tr: Dict[int, np.ndarray],
centroid_index: CentroidIndex,
) -> int:
"""Procesa un batch completo con operaciones optimizadas."""
# 1. Fetch all traduccion info in one query
infos = fetch_traducciones_info_batch(conn, pending_ids)
# Prepare batch operations
new_eventos = [] # (vec, info) for new eventos
assign_existing = [] # (tr_id, evento_id, idx, vec, info)
assign_new = [] # (tr_id, vec, info) - will get evento_id after insert
processed = 0
for tr_id in pending_ids:
vec = emb_by_tr.get(tr_id)
if vec is None:
continue
info = infos.get(tr_id)
if not info:
continue
processed += 1
if len(centroid_index.centroids) == 0:
# First event ever
assign_new.append((tr_id, vec, info))
else:
best_idx, best_dist = centroid_index.find_nearest(vec)
if best_idx is not None and best_dist <= EVENT_DIST_THRESHOLD:
assign_existing.append((tr_id, centroid_index.centroids[best_idx]["id"], best_idx, vec, info))
else:
assign_new.append((tr_id, vec, info))
with conn.cursor() as cur:
# 2. Insert new eventos (one INSERT ... RETURNING per new event)
new_evento_ids = {}
for tr_id, vec, info in assign_new:
cur.execute(
"""
INSERT INTO eventos (centroid, total_traducciones,
fecha_inicio, fecha_fin, n_noticias, titulo)
VALUES (%s, 1, %s, %s, 1, %s)
RETURNING id;
""",
(
Json(vec.tolist()),
info["fecha"],
info["fecha"],
info["titulo_evento"],
),
)
new_id = cur.fetchone()[0]
new_evento_ids[tr_id] = new_id
centroid_index.add_centroid(new_id, vec)
# 3. Update existing eventos and centroids
for tr_id, evento_id, idx, vec, info in assign_existing:
c = centroid_index.centroids[idx]
n_old = c["n"]
n_new = n_old + 1
new_vec = (c["vec"] * n_old + vec) / float(n_new)
norm = np.linalg.norm(new_vec)
if norm > 0:
new_vec = new_vec / norm
centroid_index.update_centroid(idx, new_vec, n_new)
cur.execute(
"""
UPDATE eventos
SET centroid = %s,
total_traducciones = total_traducciones + 1,
fecha_inicio = LEAST(fecha_inicio, %s),
fecha_fin = GREATEST(fecha_fin, %s),
n_noticias = n_noticias + 1
WHERE id = %s;
""",
(Json(new_vec.tolist()), info["fecha"], info["fecha"], evento_id),
)
# 4. Batch update traducciones.evento_id
trad_updates = []
for tr_id, evento_id, _, _, _ in assign_existing:
trad_updates.append((evento_id, tr_id))
for tr_id, _, _ in assign_new:
trad_updates.append((new_evento_ids[tr_id], tr_id))
if trad_updates:
execute_values(
cur,
"""
UPDATE traducciones AS t
SET evento_id = v.evento_id
FROM (VALUES %s) AS v(evento_id, id)
WHERE t.id = v.id;
""",
trad_updates,
)
# 5. Batch insert eventos_noticias
en_inserts = []
for tr_id, evento_id, _, _, info in assign_existing:
if info.get("noticia_id"):
en_inserts.append((evento_id, info["noticia_id"], info["traduccion_id"]))
for tr_id, _, info in assign_new:
if info.get("noticia_id"):
en_inserts.append((new_evento_ids[tr_id], info["noticia_id"], info["traduccion_id"]))
if en_inserts:
execute_values(
cur,
"""
INSERT INTO eventos_noticias (evento_id, noticia_id, traduccion_id)
VALUES %s
ON CONFLICT DO NOTHING;
""",
en_inserts,
)
return processed
# -------------------------------------------------------------
# MAIN LOOP
# -------------------------------------------------------------
def main():
log.info(
"Iniciando cluster_worker (optimized) langs=%s batch=%d threshold=%.3f emb=%s",
",".join(EVENT_LANGS),
EVENT_BATCH_IDS,
EVENT_DIST_THRESHOLD,
EMB_MODEL,
)
while True:
try:
with get_conn() as conn:
ensure_schema(conn)
pending_ids = fetch_pending_traducciones(conn)
if not pending_ids:
time.sleep(EVENT_SLEEP_IDLE)
continue
emb_by_tr = fetch_embeddings_for(conn, pending_ids)
if not emb_by_tr:
time.sleep(EVENT_SLEEP_IDLE)
continue
# Load centroids with vectorized index
centroid_index = CentroidIndex()
centroid_index.load_from_db(conn)
# Process batch with optimizations
t0 = time.time()
processed = process_batch_optimized(conn, pending_ids, emb_by_tr, centroid_index)
dt = time.time() - t0
conn.commit()
log.info("Cluster OK: %d procesadas en %.2fs (%.1f/s)",
processed, dt, processed / dt if dt > 0 else 0)
except Exception:
log.exception("Error en cluster_worker")
time.sleep(EVENT_SLEEP_IDLE)
if __name__ == "__main__":
main()
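
For reference, a minimal standalone sketch (not part of this commit) of the assignment rule the worker implements: a normalized embedding joins the nearest centroid when the cosine distance is within the threshold, otherwise it opens a new event, and centroids are kept as renormalized running means. All names below are illustrative.

import numpy as np

def assign(vec, centroids, counts, threshold=0.25):
    # Assign a vector to the nearest centroid, or open a new one; return its index.
    vec = vec / np.linalg.norm(vec)
    if centroids:
        sims = np.vstack(centroids) @ vec              # cosine similarity (all unit vectors)
        best = int(np.argmax(sims))
        if 1.0 - sims[best] <= threshold:
            n = counts[best]
            merged = (centroids[best] * n + vec) / (n + 1)   # running mean
            centroids[best] = merged / np.linalg.norm(merged)
            counts[best] = n + 1
            return best
    centroids.append(vec)
    counts.append(1)
    return len(centroids) - 1

cents, ns = [], []
for v in (np.array([1.0, 0.0]), np.array([0.9, 0.1]), np.array([0.0, 1.0])):
    print(assign(v, cents, ns))   # -> 0, 0, 1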


@@ -0,0 +1,267 @@
import os
import time
import logging
from typing import List
import numpy as np
import psycopg2
import psycopg2.extras
from psycopg2.extras import execute_values
import torch
from sentence_transformers import SentenceTransformer
# ================================================================
# Logging
# ================================================================
logging.basicConfig(
level=logging.INFO,
format='[EMB] %(asctime)s %(levelname)s: %(message)s'
)
log = logging.getLogger("embeddings_worker")
# ================================================================
# Configuration
# ================================================================
DB = dict(
host=os.environ.get("DB_HOST", "localhost"),
port=int(os.environ.get("DB_PORT", 5432)),
dbname=os.environ.get("DB_NAME", "rss"),
user=os.environ.get("DB_USER", "rss"),
password=os.environ.get("DB_PASS", "x"),
)
EMB_MODEL = os.environ.get(
"EMB_MODEL",
"sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
)
EMB_BATCH = int(os.environ.get("EMB_BATCH", "128"))
SLEEP_IDLE = float(os.environ.get("EMB_SLEEP_IDLE", "5.0"))
# e.g. "es,en,fr"
EMB_LANGS = [
s.strip()
for s in os.environ.get("EMB_LANGS", "es").split(",")
if s.strip()
]
DEVICE_ENV = os.environ.get("DEVICE", "auto").lower()
EMB_LIMIT = int(os.environ.get("EMB_LIMIT", "1000"))
# ================================================================
# Connection
# ================================================================
def get_conn():
return psycopg2.connect(**DB)
# ================================================================
# Schema (ensure it exists)
# ================================================================
def ensure_schema(conn):
"""
Ensure the embeddings table exists. Idempotent.
"""
with conn.cursor() as cur:
cur.execute(
"""
CREATE TABLE IF NOT EXISTS traduccion_embeddings (
id SERIAL PRIMARY KEY,
traduccion_id INT NOT NULL REFERENCES traducciones(id) ON DELETE CASCADE,
model TEXT NOT NULL,
dim INT NOT NULL,
embedding DOUBLE PRECISION[] NOT NULL,
created_at TIMESTAMP DEFAULT NOW(),
UNIQUE (traduccion_id, model)
);
"""
)
cur.execute("""
CREATE INDEX IF NOT EXISTS idx_tr_emb_model
ON traduccion_embeddings(model);
""")
cur.execute("""
CREATE INDEX IF NOT EXISTS idx_tr_emb_traduccion_id
ON traduccion_embeddings(traduccion_id);
""")
conn.commit()
# ================================================================
# Fetch pending work
# ================================================================
def fetch_batch_pending(conn) -> List[psycopg2.extras.DictRow]:
"""
Fetch translations with status 'done' that do not yet have an
embedding for this model.
"""
with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
cur.execute(
"""
SELECT
t.id AS traduccion_id,
t.lang_to AS lang_to,
COALESCE(NULLIF(t.titulo_trad,''), '') AS titulo_trad,
COALESCE(NULLIF(t.resumen_trad,''), '') AS resumen_trad,
n.id AS noticia_id
FROM traducciones t
JOIN noticias n ON n.id = t.noticia_id
LEFT JOIN traduccion_embeddings e
ON e.traduccion_id = t.id AND e.model = %s
WHERE t.status = 'done'
AND t.lang_to = ANY(%s)
AND e.traduccion_id IS NULL
ORDER BY t.id
LIMIT %s;
""",
(EMB_MODEL, EMB_LANGS, EMB_LIMIT),
)
return cur.fetchall()
# ================================================================
# Text preparation
# ================================================================
def texts_from_rows(rows: List[psycopg2.extras.DictRow]) -> List[str]:
"""
Return combined texts for embedding.
Avoids passing empty text to the model.
"""
texts = []
for r in rows:
title = (r["titulo_trad"] or "").strip()
body = (r["resumen_trad"] or "").strip()
if title and body:
texts.append(f"{title}\n{body}")
else:
texts.append(title or body or "")
return texts
# ================================================================
# Upsert
# ================================================================
def upsert_embeddings(conn, rows, embs: np.ndarray, model_name: str):
"""
Insert or update embeddings in the database.
"""
if embs.size == 0 or not rows:
return
dim = int(embs.shape[1])
data = [
(
int(r["traduccion_id"]),
model_name,
dim,
embs[i].astype(float).tolist(),
)
for i, r in enumerate(rows)
]
with conn.cursor() as cur:
execute_values(
cur,
"""
INSERT INTO traduccion_embeddings
(traduccion_id, model, dim, embedding)
VALUES %s
ON CONFLICT (traduccion_id, model)
DO UPDATE SET
embedding = EXCLUDED.embedding,
dim = EXCLUDED.dim,
created_at = NOW();
""",
data,
)
conn.commit()
# ================================================================
# Load model
# ================================================================
def resolve_device() -> str:
"""
Determine which device to use.
"""
if DEVICE_ENV in ("cpu", "cuda"):
if DEVICE_ENV == "cuda" and not torch.cuda.is_available():
return "cpu"
return DEVICE_ENV
# auto
return "cuda" if torch.cuda.is_available() else "cpu"
def load_model() -> SentenceTransformer:
"""
Load the model, falling back to CPU if CUDA fails.
"""
device = resolve_device()
log.info(f"Cargando modelo {EMB_MODEL} en device={device}")
try:
return SentenceTransformer(EMB_MODEL, device=device)
except Exception as e:
log.error(f"Fallo cargando modelo en {device}: {e}")
if device == "cuda":
log.warning("→ Reintentando en CPU…")
return SentenceTransformer(EMB_MODEL, device="cpu")
raise
# ================================================================
# Main Worker
# ================================================================
def main():
log.info(
f"Iniciando embeddings_worker | model={EMB_MODEL} | batch={EMB_BATCH} | lang={','.join(EMB_LANGS)} | limit={EMB_LIMIT}"
)
model = load_model()
while True:
try:
with get_conn() as conn:
ensure_schema(conn)
rows = fetch_batch_pending(conn)
if not rows:
time.sleep(SLEEP_IDLE)
continue
texts = texts_from_rows(rows)
# Encode
embs = model.encode(
texts,
batch_size=EMB_BATCH,
convert_to_numpy=True,
show_progress_bar=False,
normalize_embeddings=True,
)
# Upsert
upsert_embeddings(conn, rows, embs, EMB_MODEL)
log.info(f"Embeddings generados: {len(rows)}")
except Exception as e:
log.exception(f"Error en embeddings_worker: {e}")
time.sleep(SLEEP_IDLE)
if __name__ == "__main__":
main()
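
As a usage note (illustrative, not part of the commit): because the worker encodes with normalize_embeddings=True, downstream consumers can treat the dot product of two stored vectors as their cosine similarity. A minimal sketch, assuming the default model can be downloaded:

from sentence_transformers import SentenceTransformer

model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
vecs = model.encode(
    ["Título\nCuerpo de la noticia", "Otra noticia distinta"],
    convert_to_numpy=True,
    normalize_embeddings=True,
)
print(vecs.shape)                 # (2, 384) for this model
print(float(vecs[0] @ vecs[1]))   # cosine similarity, since both rows are unit-length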

414
workers/ner_worker.py Normal file

@@ -0,0 +1,414 @@
import os
import time
import logging
import re
import string
from typing import List, Tuple
from collections import Counter
import psycopg2
import psycopg2.extras
import spacy
from bs4 import BeautifulSoup
# ==========================================================
# Logging
# ==========================================================
logging.basicConfig(
level=logging.INFO,
format='[NER] %(asctime)s %(levelname)s: %(message)s'
)
log = logging.getLogger("ner_worker")
# ==========================================================
# Config DB
# ==========================================================
DB = dict(
host=os.environ.get("DB_HOST", "localhost"),
port=int(os.environ.get("DB_PORT", 5432)),
dbname=os.environ.get("DB_NAME", "rss"),
user=os.environ.get("DB_USER", "rss"),
password=os.environ.get("DB_PASS", "x"),
)
NER_LANG = os.environ.get("NER_LANG", "es").strip().lower()
BATCH = int(os.environ.get("NER_BATCH", 64))
# ==========================================================
# Map spaCy entity labels → our SQL model
# ==========================================================
ENT_LABELS = {
"PERSON": "persona",
"PER": "persona",
"ORG": "organizacion",
"GPE": "lugar",
"LOC": "lugar",
"MISC": "tema",
}
# ==========================================================
# Advanced cleanup
# ==========================================================
_ws_re = re.compile(r"\s+")
HTML_TRASH_PATTERNS = [
r"<[^>]+>",
r"&[a-z]+;",
r"&#\d+;?",
r'width="\d+"',
r'height="\d+"',
]
GENERIC_BAD_TAGS = {
"república", "estado", "centro", "gobierno", "el gobierno",
"gobiernos", "report", "sp", "unión", "union", "dólares",
"dolar", "dólar", "the post", "post", "artículo", "el artículo",
"la ciudad", "mundo", "país", "pais", "países", "paises",
"la noche", "la publicación", "este miércoles", "el miércoles",
"hoy", "ayer", "mañana", "servicio", "servicios", "el presidente",
"presidente", "el ministro", "ministro", "la guerra", "guerra",
"seguridad", "wp-content", "internal_photos", "/internal_photos",
"https", "http", "src"
}
STOPWORDS = set()
ARTICLES = {
"el", "la", "los", "las", "un", "una", "uno", "al", "del"
}
# Limits
TOPIC_MIN_CHARS = 4
TOPIC_MAX_WORDS = 6
TOPIC_MAX_PER_DOC = 15
# ==========================================================
# Helpers
# ==========================================================
def get_conn():
return psycopg2.connect(**DB)
def _looks_like_attr_or_path(text_lower: str) -> bool:
"""Filtra basura tipo rutas, html, atributos, URLs, etc."""
if text_lower.startswith("/"):
return True
if "http://" in text_lower or "https://" in text_lower:
return True
if any(ext in text_lower for ext in (".jpg", ".jpeg", ".png", ".gif", ".webp")):
return True
if re.search(r"\b(src|alt|style|class)\s*=", text_lower):
return True
if "data-" in text_lower:
return True
if re.search(r"&#\d+;?", text_lower):
return True
if "=" in text_lower and " " not in text_lower.strip():
return True
# hash-like tokens such as "ajsdh7287sdhjshd8" (only when there are no spaces)
if " " not in text_lower and re.fullmatch(r"[a-z0-9_]{15,}", text_lower.replace("-", "")):
return True
# long hyphenated strings without spaces
if "-" in text_lower and " " not in text_lower:
return True
return False
# ==========================================================
# Entity cleanup
# ==========================================================
def clean_tag_text(text: str) -> str | None:
if not text:
return None
text = BeautifulSoup(text, "html.parser").get_text()
for pat in HTML_TRASH_PATTERNS:
text = re.sub(pat, "", text)
text = _ws_re.sub(" ", text).strip()
text = text.strip(string.punctuation + " ")
if len(text) < 3:
log.debug(f"Clean reject (too short): {text}")
return None
if re.search(r"[<>/\\]", text):
log.debug(f"Clean reject (bad chars): {text}")
return None
lower = text.lower()
if lower.startswith("href="):
log.debug(f"Clean reject (href): {text}")
return None
if _looks_like_attr_or_path(lower):
log.debug(f"Clean reject (attr/path): {text}")
return None
if lower in GENERIC_BAD_TAGS:
log.debug(f"Clean reject (generic bad): {text}")
return None
replacements = {
"ee.uu.": "Estados Unidos",
"los estados unidos": "Estados Unidos",
"eeuu": "Estados Unidos",
"eu": "Unión Europea",
"ue": "Unión Europea",
"kosova": "Kosovo",
# Specific User Requests
"trump": "Donald Trump",
"mr. trump": "Donald Trump",
"mr trump": "Donald Trump",
"doland trump": "Donald Trump",
"el presidente trump": "Donald Trump",
"president trump": "Donald Trump",
"ex-president trump": "Donald Trump",
"expresidente trump": "Donald Trump",
"putin": "Vladimir Putin",
"vladimir putin": "Vladimir Putin",
"v. putin": "Vladimir Putin",
"presidente putin": "Vladimir Putin",
# New requests
"sanchez": "Pedro Sánchez",
"pedro sanchez": "Pedro Sánchez",
"p. sanchez": "Pedro Sánchez",
"mr. sanchez": "Pedro Sánchez",
"sánchez": "Pedro Sánchez", # explicit match just in case
"pedro sánchez": "Pedro Sánchez",
"maduro": "Nicolás Maduro",
"nicolas maduro": "Nicolás Maduro",
"mr. maduro": "Nicolás Maduro",
"lula": "Lula da Silva",
"lula da silva": "Lula da Silva",
"luiz inácio lula da silva": "Lula da Silva",
}
if lower in replacements:
return replacements[lower]
# Blacklist (explicit removals requested)
blacklist = {
"getty images", "netflix", "fiscalia", "segun", "estoy", # People blacklist
"and more", "app", "estamos", "ultra", # Orgs blacklist
"hacienda", "fiscalía"
}
if lower in blacklist:
log.debug(f"Clean reject (blacklist): {text}")
return None
return text
# ==========================================================
# Topic cleanup (noun chunks)
# ==========================================================
def clean_topic_text(text: str) -> str | None:
if not text:
return None
text = BeautifulSoup(text, "html.parser").get_text()
for pat in HTML_TRASH_PATTERNS:
text = re.sub(pat, "", text)
text = _ws_re.sub(" ", text).strip()
text = text.strip(string.punctuation + " ")
if len(text) < TOPIC_MIN_CHARS:
return None
lower = text.lower()
if _looks_like_attr_or_path(lower):
return None
tokens = [
t.strip(string.punctuation)
for t in lower.split()
if t.strip(string.punctuation)
]
if not tokens:
return None
# drop leading articles
if tokens[0] in ARTICLES:
tokens = tokens[1:]
if not tokens:
return None
norm = " ".join(tokens).strip()
if len(norm) < TOPIC_MIN_CHARS:
return None
if norm in GENERIC_BAD_TAGS:
return None
if len(tokens) > TOPIC_MAX_WORDS:
return None
if all(t in STOPWORDS for t in tokens):
return None
if re.fullmatch(r"[0-9\s\.,\-:/]+", norm):
return None
return norm
# ==========================================================
# NER + topic extraction
# ==========================================================
def extract_entities_and_topics(nlp, text: str) -> Tuple[List[Tuple[str, str]], List[Tuple[str, str]]]:
ents = []
topics = []
if not text:
return ents, topics
doc = nlp(text)
# log.debug(f"Processing text ({len(text)} chars): {text[:30]}...")
# log.debug(f"Entities found: {len(doc.ents)}")
# --- ENTITIES ---
for ent in doc.ents:
tipo = ENT_LABELS.get(ent.label_)
if not tipo:
continue
cleaned = clean_tag_text(ent.text)
if not cleaned:
# log.debug(f"Rejected entity: {ent.text} ({ent.label_})")
continue
if tipo == "persona":
lower_cleaned = cleaned.lower()
# Aggressive normalization rules for VIPs
# Use token checks or substring checks carefully
if "trump" in lower_cleaned.split():
# Token 'trump' exists? e.g. "donald trump", "trump", "mr. trump"
# Exclude family members
family = ["ivanka", "melania", "eric", "tiffany", "barron", "lara", "mary", "fred"]
if not any(f in lower_cleaned for f in family):
cleaned = "Donald Trump"
elif "sanchez" in lower_cleaned or "sánchez" in lower_cleaned:
# Other people named Sánchez exist, but in this context Pedro Sánchez is intended.
if "pedro" in lower_cleaned or "presidente" in lower_cleaned or lower_cleaned in ["sanchez", "sánchez"]:
cleaned = "Pedro Sánchez"
elif "maduro" in lower_cleaned:
cleaned = "Nicolás Maduro"
elif "lula" in lower_cleaned:
cleaned = "Lula da Silva"
elif "putin" in lower_cleaned:
cleaned = "Vladimir Putin"
# log.debug(f"Accepted entity: {cleaned} ({tipo})")
ents.append((cleaned, tipo))
# --- TOPICS ---
topic_counter = Counter()
for chunk in doc.noun_chunks:
cleaned = clean_topic_text(chunk.text)
if cleaned:
topic_counter[cleaned] += 1
ent_values = {v for (v, _) in ents}
for val, _count in topic_counter.most_common(TOPIC_MAX_PER_DOC):
if val in ent_values:
continue
topics.append((val, "tema"))
return list(set(ents)), list(set(topics))
# ==========================================================
# Main worker
# ==========================================================
def main():
global STOPWORDS
# Load spaCy
log.info("Cargando modelo spaCy es_core_news_md...")
nlp = spacy.load("es_core_news_md", disable=["lemmatizer", "textcat"])
STOPWORDS = set(nlp.Defaults.stop_words)
log.info("Modelo spaCy cargado correctamente.")
while True:
try:
with get_conn() as conn, conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
cur.execute(
"""
SELECT t.id, t.titulo_trad, t.resumen_trad
FROM traducciones t
WHERE t.status = 'done'
AND t.lang_to = %s
AND NOT EXISTS (
SELECT 1 FROM tags_noticia tn WHERE tn.traduccion_id = t.id
)
ORDER BY t.id DESC
LIMIT %s;
""",
(NER_LANG, BATCH),
)
rows = cur.fetchall()
if not rows:
time.sleep(5)
continue
log.info(f"Procesando {len(rows)} traducciones para NER/temas...")
inserted_links = 0
for r in rows:
text = f"{r['titulo_trad'] or ''}\n{r['resumen_trad'] or ''}".strip()
if not text:
continue
ents, topics = extract_entities_and_topics(nlp, text)
tags = ents + topics
if not tags:
continue
for valor, tipo in tags:
try:
cur.execute(
"""
INSERT INTO tags (valor, tipo)
VALUES (%s, %s)
ON CONFLICT (valor, tipo)
DO UPDATE SET valor = EXCLUDED.valor
RETURNING id;
""",
(valor, tipo),
)
tag_id = cur.fetchone()[0]
cur.execute(
"""
INSERT INTO tags_noticia (traduccion_id, tag_id)
VALUES (%s, %s)
ON CONFLICT DO NOTHING;
""",
(r["id"], tag_id),
)
if cur.rowcount > 0:
inserted_links += 1
except Exception:
log.exception("Error insertando tag/relación")
conn.commit()
log.info(f"Lote NER OK. Nuevas relaciones tag_noticia: {inserted_links}")
except Exception:
log.exception("Error general en NER loop")
time.sleep(5)
if __name__ == "__main__":
main()
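
The VIP normalization above can be illustrated in isolation (a sketch with made-up inputs; it mirrors, but does not replace, the rules in extract_entities_and_topics):

FAMILY = {"ivanka", "melania", "eric", "tiffany", "barron", "lara", "mary", "fred"}

def normalize_person(name: str) -> str:
    # Collapse common variants to a canonical form, leaving family members untouched.
    low = name.lower()
    if "trump" in low.split() and not any(f in low for f in FAMILY):
        return "Donald Trump"
    if "putin" in low:
        return "Vladimir Putin"
    if "maduro" in low:
        return "Nicolás Maduro"
    return name

for n in ("Mr. Trump", "Ivanka Trump", "presidente Putin", "Nicolas Maduro"):
    print(n, "->", normalize_person(n))
# Mr. Trump -> Donald Trump, Ivanka Trump -> Ivanka Trump,
# presidente Putin -> Vladimir Putin, Nicolas Maduro -> Nicolás Maduro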

334
workers/qdrant_worker.py Normal file

@@ -0,0 +1,334 @@
"""
Qdrant worker
Vectorizes translated news items and uploads them to Qdrant for semantic search.
"""
import os
import sys
import time
import uuid
from datetime import datetime
from typing import List, Dict, Any
# Add the project root to the path
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from db import get_read_conn, get_write_conn
try:
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct, Filter, FieldCondition, MatchValue
except ImportError:
print("❌ Error: qdrant-client no instalado. Ejecuta: pip install qdrant-client")
sys.exit(1)
try:
from sentence_transformers import SentenceTransformer
except ImportError:
print("❌ Error: sentence-transformers no instalado")
sys.exit(1)
# Configuration
QDRANT_HOST = os.environ.get("QDRANT_HOST", "localhost")
QDRANT_PORT = int(os.environ.get("QDRANT_PORT", "6333"))
QDRANT_COLLECTION = os.environ.get("QDRANT_COLLECTION_NAME", "news_vectors")
EMB_MODEL = os.environ.get("EMB_MODEL", "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
EMB_DEVICE = os.environ.get("EMB_DEVICE", "cuda")
BATCH_SIZE = int(os.environ.get("QDRANT_BATCH_SIZE", "100"))
SLEEP_IDLE = int(os.environ.get("QDRANT_SLEEP_IDLE", "30"))
# Global Qdrant client
qdrant_client = None
embedding_model = None
def init_qdrant_client():
"""
Initialize the Qdrant client and create the collection if it does not exist.
"""
global qdrant_client
print(f"🔌 Conectando a Qdrant en {QDRANT_HOST}:{QDRANT_PORT}...")
qdrant_client = QdrantClient(host=QDRANT_HOST, port=QDRANT_PORT)
# Check whether the collection exists
collections = qdrant_client.get_collections().collections
collection_names = [c.name for c in collections]
if QDRANT_COLLECTION not in collection_names:
print(f"📦 Creando colección '{QDRANT_COLLECTION}'...")
# Embedding model dimension
# paraphrase-multilingual-MiniLM-L12-v2 = 384 dimensions
vector_size = 384
qdrant_client.create_collection(
collection_name=QDRANT_COLLECTION,
vectors_config=VectorParams(
size=vector_size,
distance=Distance.COSINE
)
)
print(f"✅ Colección '{QDRANT_COLLECTION}' creada (dimensión: {vector_size})")
else:
print(f"✅ Colección '{QDRANT_COLLECTION}' ya existe")
# Fetch collection info
collection_info = qdrant_client.get_collection(QDRANT_COLLECTION)
print(f"📊 Puntos en colección: {collection_info.points_count}")
def init_embedding_model():
"""
Initialize the embedding model.
"""
global embedding_model
print(f"🤖 Cargando modelo de embeddings: {EMB_MODEL}")
print(f"🖥️ Dispositivo: {EMB_DEVICE}")
embedding_model = SentenceTransformer(EMB_MODEL, device=EMB_DEVICE)
print(f"✅ Modelo cargado correctamente")
def get_pending_news(limit: int = BATCH_SIZE) -> List[Dict[str, Any]]:
"""
Fetch translated news items pending vectorization.
Args:
limit: Maximum number of items to fetch
Returns:
List of news items
"""
with get_read_conn() as conn:
with conn.cursor() as cur:
cur.execute("""
SELECT
t.id as traduccion_id,
t.noticia_id,
t.lang_to as lang,
t.titulo_trad as titulo,
t.resumen_trad as resumen,
n.url,
n.fecha,
n.fuente_nombre,
n.categoria_id,
n.pais_id
FROM traducciones t
INNER JOIN noticias n ON t.noticia_id = n.id
WHERE t.vectorized = FALSE
AND t.status = 'done'
ORDER BY t.created_at ASC
LIMIT %s
""", (limit,))
columns = [desc[0] for desc in cur.description]
results = []
for row in cur.fetchall():
results.append(dict(zip(columns, row)))
return results
def generate_embeddings(texts: List[str]) -> List[List[float]]:
"""
Generate embeddings for a list of texts.
Args:
texts: List of texts
Returns:
List of embedding vectors
"""
embeddings = embedding_model.encode(
texts,
batch_size=32,
show_progress_bar=False,
convert_to_numpy=True
)
return embeddings.tolist()
def upload_to_qdrant(news_batch: List[Dict[str, Any]]):
"""
Upload a batch of news items to Qdrant.
Args:
news_batch: List of news items
"""
if not news_batch:
return
# Prepare texts for embeddings (title + summary)
texts = [
f"{news['titulo']} {news['resumen']}"
for news in news_batch
]
print(f" 🧮 Generando embeddings para {len(texts)} noticias...")
embeddings = generate_embeddings(texts)
# Build Qdrant points
points = []
for news, embedding in zip(news_batch, embeddings):
point_id = str(uuid.uuid4())
# Build the payload (metadata)
payload = {
"news_id": news['noticia_id'],
"traduccion_id": news['traduccion_id'],
"titulo": news['titulo'],
"resumen": news['resumen'],
"url": news['url'],
"fecha": news['fecha'].isoformat() if news['fecha'] else None,
"fuente_nombre": news['fuente_nombre'],
"categoria_id": news['categoria_id'],
"pais_id": news['pais_id'],
"lang": news['lang']
}
point = PointStruct(
id=point_id,
vector=embedding,
payload=payload
)
points.append(point)
# Keep the point_id to update the DB
news['qdrant_point_id'] = point_id
# Upload to Qdrant
print(f" ⬆️ Subiendo {len(points)} puntos a Qdrant...")
qdrant_client.upsert(
collection_name=QDRANT_COLLECTION,
points=points
)
# Update the database
print(f" 💾 Actualizando estado en PostgreSQL...")
with get_write_conn() as conn:
with conn.cursor() as cur:
for news in news_batch:
cur.execute("""
UPDATE traducciones
SET
vectorized = TRUE,
vectorization_date = NOW(),
qdrant_point_id = %s
WHERE id = %s
""", (news['qdrant_point_id'], news['traduccion_id']))
conn.commit()
print(f" ✅ Lote subido correctamente")
def process_batch():
"""
Process a batch of translated news items.
Returns:
Number of items processed
"""
news_batch = get_pending_news()
if not news_batch:
return 0
print(f"\n📋 Procesando {len(news_batch)} noticias traducidas...")
try:
upload_to_qdrant(news_batch)
return len(news_batch)
except Exception as e:
print(f"❌ Error procesando lote: {e}")
return 0
def get_stats():
"""
Fetch system statistics.
"""
with get_read_conn() as conn:
with conn.cursor() as cur:
cur.execute("""
SELECT
COUNT(*) as total,
COUNT(*) FILTER (WHERE vectorized = TRUE) as vectorizadas,
COUNT(*) FILTER (WHERE vectorized = FALSE AND status = 'done') as pendientes
FROM traducciones
WHERE lang_to = 'es'
""")
row = cur.fetchone()
return {
'total': row[0],
'vectorizadas': row[1],
'pendientes': row[2]
}
def main():
"""
Main worker loop.
"""
print("=" * 80)
print("🚀 Qdrant Vectorization Worker (Direct Translation)")
print("=" * 80)
print(f"Qdrant: {QDRANT_HOST}:{QDRANT_PORT}")
print(f"Colección: {QDRANT_COLLECTION}")
print(f"Modelo: {EMB_MODEL}")
print(f"Dispositivo: {EMB_DEVICE}")
print(f"Tamaño de lote: {BATCH_SIZE}")
print("=" * 80)
# Initialize Qdrant
try:
init_qdrant_client()
except Exception as e:
print(f"❌ Error inicializando Qdrant: {e}")
print("⚠️ Asegúrate de que Qdrant esté corriendo")
return
# Initialize the embedding model
try:
init_embedding_model()
except Exception as e:
print(f"❌ Error cargando modelo de embeddings: {e}")
return
print("\n🔄 Iniciando loop de procesamiento...\n")
total_processed = 0
while True:
try:
processed = process_batch()
total_processed += processed
if processed > 0:
print(f"\n✅ Lote completado: {processed} noticias vectorizadas")
print(f"📊 Total procesado en esta sesión: {total_processed}")
# Show statistics
stats = get_stats()
print(f"📈 Estadísticas globales:")
print(f" Total traducciones: {stats['total']}")
print(f" Vectorizadas: {stats['vectorizadas']}")
print(f" Pendientes: {stats['pendientes']}")
else:
print(f"💤 No hay noticias pendientes. Esperando {SLEEP_IDLE}s...")
time.sleep(SLEEP_IDLE)
except KeyboardInterrupt:
print("\n\n⏹️ Worker detenido por el usuario")
break
except Exception as e:
print(f"\n❌ Error en loop principal: {e}")
print(f"⏳ Esperando {SLEEP_IDLE}s antes de reintentar...")
time.sleep(SLEEP_IDLE)
if __name__ == "__main__":
main()
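
Once the worker has filled the collection, a reader can query it for semantically similar news. A hedged sketch (not part of this commit; host, port and collection name are the defaults assumed above, and the search call may differ slightly across qdrant-client versions):

from qdrant_client import QdrantClient
from sentence_transformers import SentenceTransformer

client = QdrantClient(host="localhost", port=6333)
model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

# Encode a free-text query with the same model used for indexing.
query_vec = model.encode("elecciones en Estados Unidos").tolist()
hits = client.search(collection_name="news_vectors", query_vector=query_vec, limit=5)
for hit in hits:
    print(round(hit.score, 3), hit.payload.get("titulo"))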

202
workers/related_worker.py Normal file

@@ -0,0 +1,202 @@
import os
import time
import logging
from typing import List, Tuple
import numpy as np
import psycopg2
import psycopg2.extras
logging.basicConfig(
level=logging.INFO,
format='[related] %(asctime)s %(levelname)s: %(message)s'
)
DB = dict(
host=os.environ.get("DB_HOST", "localhost"),
port=int(os.environ.get("DB_PORT", 5432)),
dbname=os.environ.get("DB_NAME", "rss"),
user=os.environ.get("DB_USER", "rss"),
password=os.environ.get("DB_PASS", "x"),
)
EMB_MODEL = os.environ.get(
"EMB_MODEL",
"sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
)
TOPK = int(os.environ.get("RELATED_TOPK", 10))
BATCH_IDS = int(os.environ.get("RELATED_BATCH_IDS", 200))
SLEEP_IDLE = float(os.environ.get("RELATED_SLEEP", 10))
MIN_SCORE = float(os.environ.get("RELATED_MIN_SCORE", 0.0))
WINDOW_HOURS = int(os.environ.get("RELATED_WINDOW_H", 0))
def get_conn():
return psycopg2.connect(**DB)
def fetch_all_embeddings(cur) -> Tuple[List[int], np.ndarray]:
sql = """
SELECT e.traduccion_id, e.embedding, n.fecha
FROM traduccion_embeddings e
JOIN traducciones t ON t.id = e.traduccion_id
JOIN noticias n ON n.id = t.noticia_id
WHERE e.model = %s
AND t.status = 'done'
AND t.lang_to = 'es'
"""
params = [EMB_MODEL]
if WINDOW_HOURS > 0:
sql += " AND n.fecha >= NOW() - INTERVAL %s"
params.append(f"{WINDOW_HOURS} hours")
cur.execute(sql, params)
rows = cur.fetchall()
if not rows:
return [], None
ids = []
vecs = []
for tr_id, emb, _ in rows:
if not emb:
continue
arr = np.asarray(emb, dtype=np.float32)
if arr.ndim != 1 or arr.size == 0:
continue
ids.append(tr_id)
vecs.append(arr)
if not ids:
return [], None
mat = np.vstack(vecs)
norms = np.linalg.norm(mat, axis=1, keepdims=True)
norms[norms == 0] = 1e-8
mat = mat / norms
return ids, mat
def fetch_pending_ids(cur, limit) -> List[int]:
cur.execute(
"""
SELECT t.id
FROM traducciones t
JOIN traduccion_embeddings e
ON e.traduccion_id = t.id AND e.model = %s
LEFT JOIN related_noticias r
ON r.traduccion_id = t.id
WHERE t.lang_to = 'es'
AND t.status = 'done'
GROUP BY t.id
HAVING COUNT(r.related_traduccion_id) = 0
ORDER BY t.id DESC
LIMIT %s;
""",
(EMB_MODEL, limit),
)
return [r[0] for r in cur.fetchall()]
def topk(idx: int, ids_all: List[int], mat: np.ndarray, K: int) -> List[Tuple[int, float]]:
q = mat[idx]
sims = np.dot(mat, q)
sims[idx] = -999.0
if MIN_SCORE > 0:
mask = sims >= MIN_SCORE
sims = np.where(mask, sims, -999.0)
if K >= len(sims):
top_idx = np.argsort(-sims)
else:
part = np.argpartition(-sims, K)[:K]
top_idx = part[np.argsort(-sims[part])]
return [(ids_all[j], float(sims[j])) for j in top_idx[:K]]
def insert_related(cur, tr_id: int, pairs):
clean = []
for rid, score in pairs:
if rid == tr_id:
continue
s = float(score)
if s <= 0:
continue
clean.append((tr_id, rid, s))
if not clean:
return
psycopg2.extras.execute_values(
cur,
"""
INSERT INTO related_noticias (traduccion_id, related_traduccion_id, score)
VALUES %s
ON CONFLICT (traduccion_id, related_traduccion_id)
DO UPDATE SET score = EXCLUDED.score;
""",
clean,
)
def build_for_ids(conn, target_ids: List[int]) -> int:
with conn.cursor() as cur:
ids_all, mat = fetch_all_embeddings(cur)
if not ids_all or mat is None:
return 0
pos = {tid: i for i, tid in enumerate(ids_all)}
processed = 0
with conn.cursor() as cur:
for tr_id in target_ids:
if tr_id not in pos:
continue
idx = pos[tr_id]
pairs = topk(idx, ids_all, mat, TOPK)
insert_related(cur, tr_id, pairs)
processed += 1
conn.commit()
return processed
def main():
logging.info(
"Iniciando related_worker (EMB=%s TOPK=%s BATCH=%s MIN=%.3f WINDOW_H=%s)",
EMB_MODEL,
TOPK,
BATCH_IDS,
MIN_SCORE,
WINDOW_HOURS,
)
while True:
try:
with get_conn() as conn, conn.cursor() as cur:
todo = fetch_pending_ids(cur, BATCH_IDS)
if not todo:
time.sleep(SLEEP_IDLE)
continue
with get_conn() as conn:
done = build_for_ids(conn, todo)
logging.info("Relacionadas generadas/actualizadas para %d traducciones.", done)
except Exception:
logging.exception("Error en related_worker")
time.sleep(SLEEP_IDLE)
if __name__ == "__main__":
main()
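
A small self-contained sketch of the top-K selection used in topk(): argpartition finds the K most similar rows in linear time, and only those K are then sorted (toy data, illustrative only):

import numpy as np

mat = np.array([[1.0, 0.0], [0.8, 0.6], [0.0, 1.0], [0.6, 0.8]])
mat /= np.linalg.norm(mat, axis=1, keepdims=True)   # row-normalize
q = mat[0]
sims = mat @ q
sims[0] = -999.0                       # exclude the query itself
K = 2
part = np.argpartition(-sims, K)[:K]   # unordered indices of the K largest sims
top = part[np.argsort(-sims[part])]    # order those K by similarity
print(top, sims[top])                  # -> [1 3] [0.8 0.6]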

244
workers/topics_worker.py Normal file

@@ -0,0 +1,244 @@
import os
import time
import logging
import json
import psycopg2
from psycopg2.extras import execute_values
# Logging
logging.basicConfig(
level=logging.INFO,
format='[topics_worker] %(asctime)s %(levelname)s: %(message)s'
)
log = logging.getLogger(__name__)
# Config
DB_CONFIG = {
"host": os.environ.get("DB_HOST", "localhost"),
"port": int(os.environ.get("DB_PORT", 5432)),
"dbname": os.environ.get("DB_NAME", "rss"),
"user": os.environ.get("DB_USER", "rss"),
"password": os.environ.get("DB_PASS", "x"),
}
SLEEP_IDLE = 10
BATCH_SIZE = 500
def get_conn():
return psycopg2.connect(**DB_CONFIG)
def load_topics(conn):
"""
Load topics and their keywords.
Returns list of dicts: [{'id': 1, 'weight': 5, 'keywords': ['foo', 'bar']}]
"""
with conn.cursor() as cur:
cur.execute("SELECT id, weight, keywords FROM topics")
rows = cur.fetchall()
topics = []
for r in rows:
tid, weight, kw_str = r
if not kw_str:
continue
# Keywords are comma separated based on insert script
kws = [k.strip().lower() for k in kw_str.split(",") if k.strip()]
topics.append({
"id": tid,
"weight": weight,
"keywords": kws
})
return topics
def load_countries(conn):
"""
Load countries.
Returns list: [{'id': 1, 'name': 'España', 'keywords': ['españa', 'madrid']}]
"""
with conn.cursor() as cur:
cur.execute("SELECT id, nombre FROM paises")
rows = cur.fetchall()
countries = []
# Hardcoded aliases for simplicity. A separate table would be better.
ALIASES = {
"Estados Unidos": ["eeuu", "ee.uu.", "usa", "estadounidense", "washington"],
"Rusia": ["ruso", "rusa", "moscú", "kremlin"],
"China": ["chino", "china", "pekin", "beijing"],
"Ucrania": ["ucraniano", "kiev", "kyiv"],
"Israel": ["israelí", "tel aviv", "jerusalén"],
"España": ["español", "madrid"],
"Reino Unido": ["uk", "londres", "británico"],
"Francia": ["francés", "parís"],
"Alemania": ["alemán", "berlín"],
"Palestina": ["palestino", "gaza", "cisjordania"],
"Irán": ["iraní", "teherán"],
}
for r in rows:
cid, name = r
kws = [name.lower()]
if name in ALIASES:
kws.extend(ALIASES[name])
countries.append({"id": cid, "name": name, "keywords": kws})
return countries
def process_batch(conn, topics, countries):
"""
Fetch batch of processed=False news.
Match against topics AND countries.
Insert into news_topics.
Mark processed.
"""
with conn.cursor() as cur:
# Fetch news
cur.execute("""
SELECT id, titulo, resumen
FROM noticias
WHERE topics_processed = FALSE
ORDER BY fecha DESC
LIMIT %s
""", (BATCH_SIZE,))
news_items = cur.fetchall()
if not news_items:
return 0
inserts = [] # (noticia_id, topic_id, score)
processed_ids = []
# Batch updates for pais_id
country_updates = [] # (pais_id, noticia_id)
for item in news_items:
nid, titulo, resumen = item
text = (titulo or "") + " " + (resumen or "")
text_lower = text.lower()
# 1. Match Topics
for topic in topics:
matched_count = 0
for kw in topic["keywords"]:
if kw in text_lower:
matched_count += 1
if matched_count > 0:
score = topic["weight"] * matched_count
inserts.append((nid, topic["id"], score))
# 2. Match Country (Find best match)
best_country = None
# Simple heuristic: pick the country with the most keyword matches.
max_matches = 0
for c in countries:
matches = 0
for kw in c["keywords"]:
# Simple substring matching; could be improved with regex word boundaries.
if kw in text_lower:
matches += 1
if matches > max_matches:
max_matches = matches
best_country = c["id"]
if best_country:
country_updates.append((best_country, nid))
processed_ids.append(nid)
with conn.cursor() as cur:
# Insert relations
if inserts:
execute_values(cur, """
INSERT INTO news_topics (noticia_id, topic_id, score)
VALUES %s
ON CONFLICT (noticia_id, topic_id) DO UPDATE SET score = EXCLUDED.score
""", inserts)
# Update Countries
if country_updates:
execute_values(cur, """
UPDATE noticias AS n
SET pais_id = v.pais_id
FROM (VALUES %s) AS v(pais_id, noticia_id)
WHERE n.id = v.noticia_id
""", country_updates)
# Mark processed
cur.execute("""
UPDATE noticias
SET topics_processed = TRUE
WHERE id = ANY(%s)
""", (processed_ids,))
conn.commit()
return len(news_items)
def initialize_schema(conn):
"""
Ensure required tables and columns exist.
"""
log.info("Checking/Initializing schema...")
with conn.cursor() as cur:
cur.execute("""
CREATE TABLE IF NOT EXISTS topics (
id SERIAL PRIMARY KEY,
slug VARCHAR(50) UNIQUE NOT NULL,
name VARCHAR(100) NOT NULL,
weight INTEGER DEFAULT 1,
keywords TEXT,
group_name VARCHAR(50)
);
CREATE TABLE IF NOT EXISTS news_topics (
noticia_id VARCHAR(32) REFERENCES noticias(id) ON DELETE CASCADE,
topic_id INTEGER REFERENCES topics(id) ON DELETE CASCADE,
score INTEGER DEFAULT 0,
created_at TIMESTAMP DEFAULT NOW(),
PRIMARY KEY (noticia_id, topic_id)
);
ALTER TABLE noticias ADD COLUMN IF NOT EXISTS topics_processed BOOLEAN DEFAULT FALSE;
""")
conn.commit()
log.info("Schema OK.")
def main():
log.info("Starting topics_worker...")
# Run migrations once at startup
try:
with get_conn() as conn:
initialize_schema(conn)
except Exception as e:
log.error(f"Error during schema initialization: {e}")
# We might want to exit here if the schema is crucial
# sys.exit(1)
while True:
try:
with get_conn() as conn:
topics = load_topics(conn)
if not topics:
log.warning("No topics found in DB. Sleeping.")
time.sleep(SLEEP_IDLE)
continue
# Load countries
countries = load_countries(conn)
count = process_batch(conn, topics, countries)
if count < BATCH_SIZE:
time.sleep(SLEEP_IDLE)
else:
log.info(f"Processed {count} items.")
except Exception as e:
log.exception("Error in topics_worker")
time.sleep(SLEEP_IDLE)
if __name__ == "__main__":
main()
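
The matching rule in process_batch can be shown on made-up data (illustrative only): a topic's score is its weight times the number of keyword hits, and the country with the most alias hits wins, if any:

topics = [
    {"id": 1, "weight": 5, "keywords": ["elecciones", "voto"]},
    {"id": 2, "weight": 2, "keywords": ["economía", "inflación"]},
]
countries = [
    {"id": 10, "name": "España", "keywords": ["españa", "madrid"]},
    {"id": 11, "name": "Francia", "keywords": ["francia", "parís"]},
]
text = "Elecciones en Madrid: el voto y la inflación marcan la campaña".lower()

scores = {t["id"]: t["weight"] * sum(kw in text for kw in t["keywords"]) for t in topics}
hits = {c["id"]: sum(kw in text for kw in c["keywords"]) for c in countries}
best_country = max(hits, key=hits.get) if max(hits.values()) > 0 else None
print(scores, best_country)   # {1: 10, 2: 2} 10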


@@ -0,0 +1,599 @@
import os
import time
import logging
import re
from typing import List, Optional
import psycopg2
import psycopg2.extras
from psycopg2.extras import execute_values
import ctranslate2
from transformers import AutoTokenizer
from langdetect import detect, DetectorFactory
DetectorFactory.seed = 0
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s: %(message)s")
LOG = logging.getLogger("translator")
# =========================
# DB CONFIG
# =========================
DB_CONFIG = {
"host": os.environ.get("DB_HOST", "localhost"),
"port": int(os.environ.get("DB_PORT", 5432)),
"dbname": os.environ.get("DB_NAME", "rss"),
"user": os.environ.get("DB_USER", "rss"),
"password": os.environ.get("DB_PASS", "x"),
}
# =========================
# ENV HELPERS
# =========================
def _env_list(name: str, default="es"):
raw = os.environ.get(name)
if raw:
return [s.strip() for s in raw.split(",") if s.strip()]
return [default]
def _env_int(name: str, default: int = 8):
v = os.environ.get(name)
try:
return int(v)
except Exception:
return default
def _env_float(name: str, default: float = 5.0):
v = os.environ.get(name)
try:
return float(v)
except Exception:
return default
def _env_str(name: str, default=None):
v = os.environ.get(name)
return v if v else default
# =========================
# CONFIG
# =========================
TARGET_LANGS = _env_list("TARGET_LANGS")  # defaults to ["es"]
BATCH_SIZE = _env_int("TRANSLATOR_BATCH", 8)
ENQUEUE_MAX = _env_int("ENQUEUE", 200)
SLEEP_IDLE = _env_float("TRANSLATOR_SLEEP_IDLE", 5.0)
# CTranslate2 Configuration
CT2_MODEL_PATH = _env_str("CT2_MODEL_PATH", "./models/nllb-ct2")
CT2_DEVICE = _env_str("CT2_DEVICE", "auto") # auto, cpu, cuda
CT2_COMPUTE_TYPE = _env_str("CT2_COMPUTE_TYPE", "auto") # auto, int8, float16, int8_float16
MAX_SRC_TOKENS = _env_int("MAX_SRC_TOKENS", 512)
MAX_NEW_TOKENS_TITLE = _env_int("MAX_NEW_TOKENS_TITLE", 96)
MAX_NEW_TOKENS_BODY = _env_int("MAX_NEW_TOKENS_BODY", 512)
NUM_BEAMS_TITLE = _env_int("NUM_BEAMS_TITLE", 2)
NUM_BEAMS_BODY = _env_int("NUM_BEAMS_BODY", 1)
# HuggingFace model name (used for tokenizer)
UNIVERSAL_MODEL = _env_str("UNIVERSAL_MODEL", "facebook/nllb-200-distilled-600M")
IDENTITY_PAISES_ES = _env_int("IDENTITY_PAISES_ES", 58)
BODY_CHARS_CHUNK = _env_int("BODY_CHARS_CHUNK", 900)
# =========================
# LANG MAP
# =========================
NLLB_LANG = {
"es": "spa_Latn", "en": "eng_Latn", "fr": "fra_Latn", "de": "deu_Latn",
"it": "ita_Latn", "pt": "por_Latn", "nl": "nld_Latn", "sv": "swe_Latn",
"da": "dan_Latn", "fi": "fin_Latn", "no": "nob_Latn",
"pl": "pol_Latn", "cs": "ces_Latn", "sk": "slk_Latn",
"sl": "slv_Latn", "hu": "hun_Latn", "ro": "ron_Latn",
"el": "ell_Grek", "ru": "rus_Cyrl", "uk": "ukr_Cyrl",
"tr": "tur_Latn", "ar": "arb_Arab", "fa": "pes_Arab",
"he": "heb_Hebr", "zh": "zho_Hans", "ja": "jpn_Jpan",
"ko": "kor_Hang", "vi": "vie_Latn",
}
def map_to_nllb(code: Optional[str]):
if not code:
return None
c = code.strip().lower()
return NLLB_LANG.get(c, f"{c}_Latn")
def normalize_lang(code: Optional[str], default=None):
return (code or default).strip().lower() if code else default
def _norm(s: str) -> str:
return re.sub(r"\W+", "", (s or "").lower()).strip()
# =========================
# DB
# =========================
def get_conn():
return psycopg2.connect(**DB_CONFIG)
def ensure_indexes(conn):
with conn.cursor() as cur:
cur.execute("CREATE INDEX IF NOT EXISTS traducciones_lang_to_status_idx ON traducciones (lang_to, status);")
cur.execute("CREATE INDEX IF NOT EXISTS traducciones_status_idx ON traducciones (status);")
conn.commit()
# Moved to translation_ops.py
def fetch_pending_batch(conn, lang_to: str, batch: int):
"""Fetch pending translations with row locking to support multiple workers."""
if batch <= 0:
return []
# Use FOR UPDATE SKIP LOCKED to allow multiple workers
# Each worker will get different rows without conflicts
with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
cur.execute(
"""
SELECT t.id AS tr_id, t.noticia_id, t.lang_from, t.lang_to,
n.titulo, n.resumen
FROM traducciones t
JOIN noticias n ON n.id=t.noticia_id
WHERE t.lang_to=%s AND t.status='pending'
ORDER BY t.id
LIMIT %s
FOR UPDATE OF t SKIP LOCKED;
""",
(lang_to, batch),
)
rows = cur.fetchall()
# Update status within the same transaction while rows are locked
if rows:
ids = [r["tr_id"] for r in rows]
cur.execute("UPDATE traducciones SET status='processing' WHERE id = ANY(%s)", (ids,))
conn.commit()
return rows
# =========================
# LANGUAGE DETECTION
# =========================
def detect_lang(text1: str, text2: str):
txt = (text1 or "").strip() or (text2 or "").strip()
if not txt:
return None
try:
return detect(txt)
except Exception:
return None
# =========================
# MODEL LOADING (CTranslate2)
# =========================
_TOKENIZER = None
_TRANSLATOR = None
_DEVICE = None
def _resolve_device():
if CT2_DEVICE == "cpu":
return "cpu"
if CT2_DEVICE == "cuda":
return "cuda"
# auto
return "cuda" if ctranslate2.get_cuda_device_count() > 0 else "cpu"
def _ensure_ct2_model():
"""Convert HuggingFace model to CTranslate2 format if not exists."""
import os
import subprocess
model_dir = CT2_MODEL_PATH
# Check if model already exists
if os.path.isdir(model_dir) and os.path.exists(os.path.join(model_dir, "model.bin")):
LOG.info("CTranslate2 model already exists at %s", model_dir)
return True
LOG.info("CTranslate2 model not found, converting from %s...", UNIVERSAL_MODEL)
LOG.info("This may take 5-10 minutes on first run...")
# Create directory if needed
os.makedirs(os.path.dirname(model_dir) or ".", exist_ok=True)
# Convert the model
quantization = CT2_COMPUTE_TYPE if CT2_COMPUTE_TYPE != "auto" else "int8_float16"
cmd = [
"ct2-transformers-converter",
"--model", UNIVERSAL_MODEL,
"--output_dir", model_dir,
"--quantization", quantization,
"--force"
]
try:
LOG.info("Running: %s", " ".join(cmd))
result = subprocess.run(cmd, capture_output=True, text=True, timeout=1800)
if result.returncode != 0:
LOG.error("Model conversion failed: %s", result.stderr)
return False
LOG.info("Model conversion completed successfully")
return True
except subprocess.TimeoutExpired:
LOG.error("Model conversion timed out after 30 minutes")
return False
except Exception as e:
LOG.error("Model conversion error: %s", e)
return False
def get_universal_components():
global _TOKENIZER, _TRANSLATOR, _DEVICE
if _TRANSLATOR:
return _TOKENIZER, _TRANSLATOR
# Ensure CT2 model exists (convert if needed)
if not _ensure_ct2_model():
raise RuntimeError(f"Failed to load/convert CTranslate2 model at {CT2_MODEL_PATH}")
device = _resolve_device()
LOG.info("Loading CTranslate2 model from %s on %s", CT2_MODEL_PATH, device)
_TRANSLATOR = ctranslate2.Translator(
CT2_MODEL_PATH,
device=device,
compute_type=CT2_COMPUTE_TYPE,
)
_TOKENIZER = AutoTokenizer.from_pretrained(UNIVERSAL_MODEL)
_DEVICE = device
LOG.info("CTranslate2 model loaded successfully")
return _TOKENIZER, _TRANSLATOR
# =========================
# TRANSLATION (CTranslate2)
# =========================
def _safe_src_len(tokenizer):
max_len = getattr(tokenizer, "model_max_length", 1024) or 1024
if max_len > 100000:
max_len = 1024
return min(MAX_SRC_TOKENS, max_len - 16)
def _translate_texts(src, tgt, texts, beams, max_new_tokens):
"""Translate texts using CTranslate2."""
if not texts:
return []
clean = [(t or "").strip() for t in texts]
if all(not t for t in clean):
return ["" for _ in clean]
tok, translator = get_universal_components()
src_code = map_to_nllb(src)
tgt_code = map_to_nllb(tgt)
# Set source language on tokenizer
try:
tok.src_lang = src_code
except Exception:
pass
safe_len = _safe_src_len(tok)
max_new = max(16, min(int(max_new_tokens), MAX_NEW_TOKENS_BODY))
# Tokenize: convert text to tokens
sources = []
for t in clean:
if t:
ids = tok.encode(t, truncation=True, max_length=safe_len)
tokens = tok.convert_ids_to_tokens(ids)
sources.append(tokens)
else:
sources.append([])
# Target language prefix for NLLB
target_prefix = [[tgt_code]] * len(sources)
# Translate with CTranslate2
start = time.time()
results = translator.translate_batch(
sources,
target_prefix=target_prefix,
beam_size=beams,
max_decoding_length=max_new,
)
dt = time.time() - start
# Decode results
translated = []
total_tokens = 0
for result, src_tokens in zip(results, sources):
if result.hypotheses:
# Skip the first token (language prefix)
tokens = result.hypotheses[0][1:]
total_tokens += len(tokens) + len(src_tokens)
text = tok.decode(tok.convert_tokens_to_ids(tokens))
translated.append(text.strip())
else:
translated.append("")
if total_tokens > 0:
LOG.info(" → tokens=%d tiempo=%.2fs velocidad=%d tok/s",
total_tokens, dt, int(total_tokens / dt) if dt > 0 else 0)
return translated
def _split_body_into_chunks(text: str) -> List[str]:
text = (text or "").strip()
if len(text) <= BODY_CHARS_CHUNK:
return [text] if text else []
parts = re.split(r'(\n\n+|(?<=[\.\!\?؛؟。])\s+)', text)
chunks = []
current = ""
for part in parts:
if not part:
continue
if len(current) + len(part) <= BODY_CHARS_CHUNK:
current += part
else:
if current.strip():
chunks.append(current.strip())
current = part
if current.strip():
chunks.append(current.strip())
if not chunks:
return [text]
return chunks
def translate_body_long(src: str, tgt: str, body: str) -> str:
body = (body or "").strip()
if not body:
return ""
chunks = _split_body_into_chunks(body)
if len(chunks) == 1:
translated = _translate_texts(src, tgt, [body], NUM_BEAMS_BODY, MAX_NEW_TOKENS_BODY)[0]
return translated.strip()
translated_chunks = []
for ch in chunks:
tr = _translate_texts(src, tgt, [ch], NUM_BEAMS_BODY, MAX_NEW_TOKENS_BODY)[0]
translated_chunks.append(tr.strip())
return "\n\n".join(c for c in translated_chunks if c)
# =========================
# BATCH PROCESS
# =========================
def process_batch(conn, rows):
todo = []
done = []
errors = []
for r in rows:
lang_to = normalize_lang(r["lang_to"], "es") or "es"
lang_from = (
normalize_lang(r["lang_from"])
or detect_lang(r["titulo"], r["resumen"])
or "en"
)
titulo = (r["titulo"] or "").strip()
resumen = (r["resumen"] or "").strip()
if map_to_nllb(lang_from) == map_to_nllb(lang_to):
done.append((titulo, resumen, lang_from, r["tr_id"]))
else:
todo.append({
"tr_id": r["tr_id"],
"lang_from": lang_from,
"lang_to": lang_to,
"titulo": titulo,
"resumen": resumen,
})
from collections import defaultdict
groups = defaultdict(list)
for item in todo:
key = (item["lang_from"], item["lang_to"])
groups[key].append(item)
for (lang_from, lang_to), items in groups.items():
LOG.info("Translating %s -> %s (%d items)", lang_from, lang_to, len(items))
titles = [i["titulo"] for i in items]
try:
tt = _translate_texts(
lang_from,
lang_to,
titles,
NUM_BEAMS_TITLE,
MAX_NEW_TOKENS_TITLE,
)
bodies_translated: List[str] = []
for i in items:
bodies_translated.append(
translate_body_long(lang_from, lang_to, i["resumen"])
)
for i, ttr, btr in zip(items, tt, bodies_translated):
ttr = (ttr or "").strip()
btr = (btr or "").strip()
if not ttr or _norm(ttr) == _norm(i["titulo"]):
ttr = i["titulo"]
if not btr or _norm(btr) == _norm(i["resumen"]):
btr = i["resumen"]
# CLEANING: Remove <unk> tokens
if ttr:
ttr = ttr.replace("<unk>", "").replace("  ", " ").strip()
if btr:
btr = btr.replace("<unk>", "").replace("  ", " ").strip()
done.append((ttr, btr, lang_from, i["tr_id"]))
except Exception as e:
err = str(e)[:800]
LOG.exception("Error translating %s -> %s: %s", lang_from, lang_to, err)
for i in items:
errors.append((err, i["tr_id"]))
with conn.cursor() as cur:
if done:
execute_values(
cur,
"""
UPDATE traducciones AS t
SET titulo_trad=v.titulo_trad,
resumen_trad=v.resumen_trad,
lang_from=COALESCE(t.lang_from, v.lang_from),
status='done',
error=NULL
FROM (VALUES %s) AS v(titulo_trad,resumen_trad,lang_from,id)
WHERE t.id=v.id;
""",
done,
)
            # Persist per-item stats in translation_stats.
            # Tuples in `done` carry (titulo, resumen, lang_from, tr_id) but not lang_to,
            # so recover each item's lang_to from the original rows (normalized, as above).
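            # A sketch of the translation_stats shape this INSERT assumes (illustrative;
            # the real DDL lives in the migrations. Only lang_from and lang_to come from
            # the code below, the remaining columns are an assumption):
            #
            #   CREATE TABLE IF NOT EXISTS translation_stats (
            #       id         BIGSERIAL PRIMARY KEY,
            #       lang_from  TEXT,
            #       lang_to    TEXT,
            #       created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
            #   );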
            tr_map = {r["tr_id"]: normalize_lang(r["lang_to"], "es") or "es" for r in rows}
stats_data = []
for item in done:
# item is (titulo, resumen, lang_from, tr_id)
lang_from = item[2]
lang_to = tr_map.get(item[3], "es")
stats_data.append((lang_from, lang_to))
execute_values(
cur,
"INSERT INTO translation_stats (lang_from, lang_to) VALUES %s",
stats_data
)
# --------------------------
if errors:
execute_values(
cur,
"""
UPDATE traducciones AS t
SET status='error', error=v.error
FROM (VALUES %s) AS v(error,id)
WHERE t.id=v.id;
""",
errors,
)
conn.commit()
def process_entity_summaries(conn):
"""Translate pending entity summaries from Wikipedia."""
from cache import cache_del
LOG.info("DEBUG: Checking for pending entity summaries...")
with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
cur.execute("""
SELECT id, entity_name, summary, summary_en
FROM entity_images
WHERE status_es = 'pending'
LIMIT 20
FOR UPDATE SKIP LOCKED;
""")
rows = cur.fetchall()
if not rows:
return False
LOG.info("DEBUG: Found %d pending entity summaries to process", len(rows))
for r in rows:
entity_id = r["id"]
name = r["entity_name"]
text = r["summary_en"] or r["summary"]
if not text:
cur.execute("UPDATE entity_images SET status_es = 'none' WHERE id = %s", (entity_id,))
continue
try:
# English -> Spanish
translated = translate_body_long('en', 'es', text)
if translated:
cur.execute("""
UPDATE entity_images
SET summary_es = %s, status_es = 'done'
WHERE id = %s
""", (translated, entity_id))
# Invalidate cache
cache_del(f"wiki:data:{name.lower()}")
LOG.info(" → Translated entity summary: %s", name)
else:
cur.execute("UPDATE entity_images SET status_es = 'error' WHERE id = %s", (entity_id,))
except Exception as e:
LOG.error("Error translating entity summary [%s]: %s", name, e)
cur.execute("UPDATE entity_images SET status_es = 'error' WHERE id = %s", (entity_id,))
conn.commit()
return True
# =========================
# MAIN LOOP
# =========================
def main():
LOG.info("Translator worker iniciado (CTranslate2)")
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
get_universal_components()
while True:
any_work = False
with get_conn() as conn:
ensure_indexes(conn)
            # 1. Entity summaries (Wikipedia): translating these was moved out of this
            #    worker so it stays focused on news. process_entity_summaries() above is
            #    kept only in case that step ever needs to be re-enabled here.
# 2. Process news translations
for tgt in TARGET_LANGS:
while True:
rows = fetch_pending_batch(conn, tgt, BATCH_SIZE)
if not rows:
break
any_work = True
LOG.info("[%s] %d elementos", tgt, len(rows))
process_batch(conn, rows)
if not any_work:
time.sleep(SLEEP_IDLE)
if __name__ == "__main__":
main()


@ -0,0 +1,471 @@
"""
URL Feed Discovery Worker
This worker automatically discovers RSS feeds from the URLs stored in the fuentes_url
table and creates entries in the feeds table (or in feeds_pending for manual review).
Runs every 15 minutes.
"""
import os
import sys
import time
import logging
from datetime import datetime
from typing import List, Dict
# Add parent directory to path to import modules
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from db import get_conn
from utils.feed_discovery import discover_feeds, get_feed_metadata
from utils.feed_analysis import (
analyze_feed,
get_country_id_by_name,
get_category_id_by_name
)
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
# Configuration
CHECK_INTERVAL = int(os.getenv('URL_DISCOVERY_INTERVAL_MIN', '15')) * 60 # Default: 15 minutes
BATCH_SIZE = int(os.getenv('URL_DISCOVERY_BATCH_SIZE', '10')) # Process URLs in batches
MAX_FEEDS_PER_URL = int(os.getenv('MAX_FEEDS_PER_URL', '5')) # Max feeds to create per URL
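# Illustrative environment overrides (the defaults above apply when these are unset):
#   URL_DISCOVERY_INTERVAL_MIN=15   # minutes between discovery batches
#   URL_DISCOVERY_BATCH_SIZE=10     # fuentes_url rows processed per batch
#   MAX_FEEDS_PER_URL=5             # cap on feeds created from a single source URL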
def get_pending_urls(limit: int = BATCH_SIZE) -> List[Dict]:
"""
Get URLs that need to be processed.
Priority: never checked > failed checks > oldest successful checks
"""
with get_conn() as conn:
with conn.cursor() as cur:
cur.execute("""
SELECT id, nombre, url, categoria_id, pais_id, idioma, last_status
FROM fuentes_url
WHERE active = TRUE
ORDER BY
CASE
WHEN last_check IS NULL THEN 1 -- Never checked (highest priority)
WHEN last_status = 'error' THEN 2 -- Failed checks
WHEN last_status = 'no_feeds' THEN 3 -- No feeds found
ELSE 4 -- Successful checks (lowest priority)
END,
last_check ASC NULLS FIRST
LIMIT %s
""", (limit,))
columns = [desc[0] for desc in cur.description]
return [dict(zip(columns, row)) for row in cur.fetchall()]
def update_url_status(url_id: int, status: str, message: str = None, http_code: int = None):
"""Update the status of a URL source"""
with get_conn() as conn:
with conn.cursor() as cur:
cur.execute("""
UPDATE fuentes_url
SET last_check = NOW(),
last_status = %s,
status_message = %s,
last_http_code = %s
WHERE id = %s
""", (status, message, http_code, url_id))
conn.commit()
def create_pending_feed(
fuente_url_id: int,
feed_url: str,
metadata: Dict,
analysis: Dict,
categoria_id: int = None,
pais_id: int = None,
idioma: str = None
) -> bool:
"""
Create a pending feed entry for manual review
"""
try:
with get_conn() as conn:
# Get detected country ID
detected_country_id = None
if analysis.get('detected_country'):
detected_country_id = get_country_id_by_name(conn, analysis['detected_country'])
# Get suggested category ID
suggested_categoria_id = None
if analysis.get('suggested_category'):
suggested_categoria_id = get_category_id_by_name(conn, analysis['suggested_category'])
with conn.cursor() as cur:
cur.execute("""
INSERT INTO feeds_pending (
fuente_url_id, feed_url, feed_title, feed_description,
feed_language, feed_type, entry_count,
detected_country_id, suggested_categoria_id,
categoria_id, pais_id, idioma, notes
)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
ON CONFLICT (feed_url) DO UPDATE
SET feed_title = EXCLUDED.feed_title,
feed_description = EXCLUDED.feed_description,
discovered_at = NOW()
RETURNING id
""", (
fuente_url_id,
feed_url,
metadata.get('title', 'Feed sin título'),
                    (metadata.get('description') or '')[:500],
analysis.get('language'),
'rss', # Default type
metadata.get('entry_count', 0),
detected_country_id,
suggested_categoria_id,
categoria_id,
pais_id,
idioma,
analysis.get('analysis_notes', '')
))
result = cur.fetchone()
conn.commit()
if result:
logger.info(f"Created pending feed for review: {metadata.get('title')} ({feed_url})")
return True
else:
logger.debug(f"Pending feed updated: {feed_url}")
return False
except Exception as e:
logger.error(f"Error creating pending feed {feed_url}: {e}")
return False
def create_feed_from_metadata(
feed_url: str,
fuente_url_id: int = None,
categoria_id: int = None,
pais_id: int = None,
idioma: str = None,
auto_approve: bool = False,
context_title: str = None
) -> Dict:
"""
Create a feed entry from discovered feed URL with intelligent analysis.
Returns:
{
'created': True/False,
'pending': True/False,
'status': 'created'/'pending'/'existing'/'error',
'message': 'Description'
}
"""
result = {
'created': False,
'pending': False,
'status': 'error',
'message': ''
}
try:
# Get feed metadata
metadata = get_feed_metadata(feed_url, timeout=10)
if not metadata:
result['message'] = 'No se pudo obtener metadata del feed'
logger.warning(f"{result['message']}: {feed_url}")
return result
# Add URL to metadata for analysis
metadata['url'] = feed_url
# Use context title if provided, otherwise use metadata title
# This helps when feed XML title is generic (e.g. "RSS Feed") but site link had meaningful text
feed_title = context_title if context_title else metadata.get('title', 'Feed sin título')
# Update metadata for consistency in pending feeds AND analysis
metadata['title'] = feed_title
# Perform intelligent analysis
analysis = analyze_feed(metadata)
# Determine if we need manual review
needs_review = False
# If parent URL has no category or country, we need review
if not categoria_id or not pais_id:
needs_review = True
logger.info(f"Feed needs review (missing categoria_id={categoria_id} or pais_id={pais_id})")
# If auto_approve is disabled, we need review
if not auto_approve:
needs_review = True
# Enhance metadata with analysis
if not idioma and analysis.get('language'):
idioma = analysis['language']
# If needs review, create pending feed
if needs_review:
created_pending = create_pending_feed(
fuente_url_id=fuente_url_id,
feed_url=feed_url,
metadata=metadata,
analysis=analysis,
categoria_id=categoria_id,
pais_id=pais_id,
idioma=idioma
)
result['pending'] = created_pending
result['status'] = 'pending'
result['message'] = f"Feed creado y pendiente de revisión (país: {analysis.get('detected_country', 'N/A')}, categoría sugerida: {analysis.get('suggested_category', 'N/A')})"
return result
# Otherwise, create feed directly
nombre = feed_title
descripcion = metadata.get('description', '')
with get_conn() as conn:
with conn.cursor() as cur:
cur.execute("""
INSERT INTO feeds (nombre, descripcion, url, categoria_id, pais_id, idioma, fuente_url_id, activo)
VALUES (%s, %s, %s, %s, %s, %s, %s, TRUE)
ON CONFLICT (url) DO NOTHING
RETURNING id
""", (
nombre,
descripcion[:500] if descripcion else None,
feed_url,
categoria_id,
pais_id,
idioma,
fuente_url_id
))
feed_result = cur.fetchone()
conn.commit()
if feed_result:
logger.info(f"Created new feed: {nombre} ({feed_url})")
result['created'] = True
result['status'] = 'created'
result['message'] = f"Feed creado exitosamente"
else:
logger.debug(f"Feed already exists: {feed_url}")
result['status'] = 'existing'
result['message'] = 'El feed ya existe'
except Exception as e:
logger.error(f"Error creating feed from {feed_url}: {e}")
result['message'] = str(e)
result['status'] = 'error'
return result
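# A minimal usage sketch (hypothetical values; in this worker the call is normally
# driven by process_url_source() below rather than made directly):
#
#   outcome = create_feed_from_metadata(
#       feed_url="https://example.com/rss.xml",
#       fuente_url_id=42,
#       categoria_id=None,       # missing category forces manual review
#       pais_id=None,
#       auto_approve=False,
#   )
#   # outcome['status'] is one of 'created', 'pending', 'existing' or 'error'.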
def process_url_source(url_data: Dict) -> Dict:
"""
Process a single URL source to discover and create feeds.
Returns statistics about the operation.
"""
url_id = url_data['id']
source_url = url_data['url']
nombre = url_data['nombre']
categoria_id = url_data['categoria_id']
pais_id = url_data['pais_id']
idioma = url_data['idioma']
logger.info(f"Processing URL source: {nombre} ({source_url})")
logger.info(f" Parent metadata: categoria_id={categoria_id}, pais_id={pais_id}, idioma={idioma}")
stats = {
'url_id': url_id,
'url': source_url,
'discovered': 0,
'created': 0,
'pending': 0,
'existing': 0,
'errors': 0,
'status': 'unknown'
}
try:
# Discover feeds from URL
discovered = discover_feeds(source_url, timeout=15)
stats['discovered'] = len(discovered)
if not discovered:
logger.warning(f"No feeds discovered from: {source_url}")
update_url_status(url_id, 'no_feeds', 'No se encontraron feeds RSS', 200)
stats['status'] = 'no_feeds'
return stats
# Filter only valid feeds
valid_feeds = [f for f in discovered if f.get('valid', False)]
if not valid_feeds:
logger.warning(f"No valid feeds found for: {source_url}")
update_url_status(url_id, 'no_valid_feeds', f'Se encontraron {len(discovered)} feeds pero ninguno válido')
stats['status'] = 'no_valid_feeds'
return stats
# Limit number of feeds per URL
feeds_to_create = valid_feeds[:MAX_FEEDS_PER_URL]
logger.info(f"Found {len(valid_feeds)} valid feeds, processing up to {len(feeds_to_create)}")
# Determine if auto-approve (parent has category AND country)
auto_approve = bool(categoria_id and pais_id)
if not auto_approve:
logger.info("→ Feeds will require manual review (parent lacks category or country)")
else:
logger.info("→ Feeds will be auto-approved (parent has complete metadata)")
# Create feeds
for feed_info in feeds_to_create:
feed_url = feed_info['url']
try:
result = create_feed_from_metadata(
feed_url=feed_url,
fuente_url_id=url_id,
categoria_id=categoria_id,
pais_id=pais_id,
idioma=idioma,
auto_approve=auto_approve,
context_title=feed_info.get('context_label')
)
if result['status'] == 'created':
stats['created'] += 1
elif result['status'] == 'pending':
stats['pending'] += 1
elif result['status'] == 'existing':
stats['existing'] += 1
else:
stats['errors'] += 1
except Exception as e:
logger.error(f"Error creating feed {feed_url}: {e}")
stats['errors'] += 1
# Update URL status
if stats['created'] > 0 or stats['pending'] > 0:
parts = []
if stats['created'] > 0:
parts.append(f"{stats['created']} creados")
if stats['pending'] > 0:
parts.append(f"{stats['pending']} pendientes de revisión")
if stats['existing'] > 0:
parts.append(f"{stats['existing']} ya existían")
message = ", ".join(parts)
update_url_status(url_id, 'success', message, 200)
stats['status'] = 'success'
elif stats['existing'] > 0:
message = f"Todos los {stats['existing']} feeds ya existían"
update_url_status(url_id, 'existing', message, 200)
stats['status'] = 'existing'
else:
message = f"No se pudieron procesar feeds ({stats['errors']} errores)"
update_url_status(url_id, 'error', message)
stats['status'] = 'error'
logger.info(f"Processed {source_url}: {stats['created']} created, {stats['pending']} pending, {stats['existing']} existing, {stats['errors']} errors")
except Exception as e:
logger.error(f"Error processing URL {source_url}: {e}")
update_url_status(url_id, 'error', str(e)[:200])
stats['status'] = 'error'
stats['errors'] += 1
return stats
def process_batch():
"""Process a batch of URL sources"""
logger.info("=" * 80)
logger.info(f"Starting URL discovery batch - {datetime.now().isoformat()}")
# Get pending URLs
urls = get_pending_urls(limit=BATCH_SIZE)
if not urls:
logger.info("No pending URLs to process")
return
logger.info(f"Processing {len(urls)} URL sources")
# Process statistics
total_stats = {
'processed': 0,
'discovered': 0,
'created': 0,
'pending': 0,
'existing': 0,
'errors': 0
}
# Process each URL
for url_data in urls:
stats = process_url_source(url_data)
total_stats['processed'] += 1
total_stats['discovered'] += stats['discovered']
total_stats['created'] += stats['created']
total_stats['pending'] += stats['pending']
total_stats['existing'] += stats['existing']
total_stats['errors'] += stats['errors']
# Small delay between URLs to avoid hammering servers
time.sleep(2)
# Log summary
logger.info("-" * 80)
logger.info(f"Batch complete:")
logger.info(f" - Processed: {total_stats['processed']} URLs")
logger.info(f" - Discovered: {total_stats['discovered']} feeds")
logger.info(f" - Created: {total_stats['created']} new feeds")
logger.info(f" - Pending review: {total_stats['pending']} feeds")
logger.info(f" - Already existing: {total_stats['existing']} feeds")
logger.info(f" - Errors: {total_stats['errors']}")
logger.info("=" * 80)
def main():
"""Main worker loop"""
logger.info("URL Feed Discovery Worker started")
logger.info(f"Check interval: {CHECK_INTERVAL} seconds ({CHECK_INTERVAL // 60} minutes)")
logger.info(f"Batch size: {BATCH_SIZE}")
logger.info(f"Max feeds per URL: {MAX_FEEDS_PER_URL}")
# Run immediately on start
try:
process_batch()
except Exception as e:
logger.error(f"Error in initial batch: {e}", exc_info=True)
# Main loop
while True:
try:
logger.info(f"Waiting {CHECK_INTERVAL} seconds until next batch...")
time.sleep(CHECK_INTERVAL)
process_batch()
except KeyboardInterrupt:
logger.info("Worker stopped by user")
break
except Exception as e:
logger.error(f"Error in main loop: {e}", exc_info=True)
# Wait a bit before retrying to avoid rapid failure loops
time.sleep(60)
if __name__ == "__main__":
main()

125
workers/url_worker.py Normal file

@ -0,0 +1,125 @@
import logging
import hashlib
from datetime import datetime
from newspaper import Article, ArticleException, Config
import requests
from db import get_write_conn, get_read_conn
# Configuration
logger = logging.getLogger("url_worker")
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
def get_active_urls():
"""Get all active URL sources."""
with get_read_conn() as conn:
with conn.cursor() as cur:
cur.execute("""
SELECT id, nombre, url, categoria_id, pais_id, idioma
FROM fuentes_url
WHERE active = true
""")
return cur.fetchall()
def update_source_status(source_id, status, message, http_code=0):
"""Update the status of a URL source."""
with get_write_conn() as conn:
with conn.cursor() as cur:
cur.execute("""
UPDATE fuentes_url
SET last_check = NOW(),
last_status = %s,
status_message = %s,
last_http_code = %s
WHERE id = %s
""", (status, message, http_code, source_id))
conn.commit()
def save_article(source, article):
"""Save the extracted article to the database."""
source_id, source_name, source_url, cat_id, pais_id, lang = source
# Use the article url if possible, otherwise source_url
final_url = article.url or source_url
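    # Deterministic id: the md5 of the final URL, so the same article always maps to
    # the same primary key and re-processing it is a no-op (see ON CONFLICT below).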
noticia_id = hashlib.md5(final_url.encode("utf-8")).hexdigest()
with get_write_conn() as conn:
with conn.cursor() as cur:
# Check if exists
cur.execute("SELECT id FROM noticias WHERE id = %s", (noticia_id,))
if cur.fetchone():
return False # Already exists
# Prepare data
title = article.title or "Sin título"
summary = article.summary or article.text[:500]
image_url = article.top_image
pub_date = article.publish_date or datetime.utcnow()
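            # The ON CONFLICT clause is kept even though the id was checked above:
            # it guards against another worker inserting the same id between the
            # existence check and this INSERT.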
cur.execute("""
INSERT INTO noticias (
id, titulo, resumen, url, fecha, imagen_url,
fuente_nombre, categoria_id, pais_id
) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
ON CONFLICT (id) DO NOTHING
""", (
noticia_id, title, summary, final_url, pub_date, image_url,
source_name, cat_id, pais_id
))
conn.commit()
return True
def process_url(source):
"""Process a single URL source."""
source_id, name, url, _, _, _ = source
logger.info(f"Processing URL: {url} ({name})")
try:
# Browser-like headers
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
config = Config()
config.browser_user_agent = user_agent
config.request_timeout = 30
article = Article(url, config=config, language='es')
article.download()
if not article.html:
update_source_status(source_id, "ERROR", "No content downloaded (Empty HTML)", 0)
return
article.parse()
        try:
            # nlp() only builds optional extras (summary, keywords); ignore failures
            # (e.g. missing NLTK data) and fall back to the raw parsed text.
            article.nlp()
        except Exception:
            pass
if not article.title:
update_source_status(source_id, "ERROR_PARSE", "Could not extract title (Page might be not an article)", 200)
return
saved = save_article(source, article)
status_msg = "News created successfully" if saved else "News already exists"
update_source_status(source_id, "OK", status_msg, 200)
logger.info(f"Success {url}: {status_msg}")
except ArticleException as ae:
logger.error(f"Newspaper Error {url}: {ae}")
update_source_status(source_id, "ERROR_DOWNLOAD", str(ae)[:200], 0)
except requests.exceptions.RequestException as re:
logger.error(f"Network Error {url}: {re}")
update_source_status(source_id, "ERROR_NETWORK", str(re)[:200], 0)
except Exception as e:
logger.error(f"Unexpected Error {url}: {e}")
update_source_status(source_id, "ERROR_UNKNOWN", str(e)[:200], 500)
def main():
logger.info("Starting URL Worker")
urls = get_active_urls()
logger.info(f"Found {len(urls)} active URLs")
for source in urls:
process_url(source)
if __name__ == "__main__":
main()


@ -0,0 +1,31 @@
import time
import logging
import sys
from workers.url_worker import main as run_once
# Configure logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
stream=sys.stdout
)
logger = logging.getLogger("url_worker_daemon")
INTERVAL = 300 # 5 minutes
def main():
logger.info("Starting URL Worker Daemon")
logger.info(f"Check interval: {INTERVAL} seconds")
while True:
try:
logger.info("Running job cycle...")
run_once()
logger.info("Cycle completed.")
except Exception as e:
logger.exception(f"Error in job cycle: {e}")
time.sleep(INTERVAL)
if __name__ == "__main__":
main()