Mejoras: NER, embeddings, dashboard, docker-compose y limpieza

2025-11-17 19:37:05 +01:00 · 2025-11-17 19:37:05 +01:00 · d508dc2058
commit d508dc2058
parent 6c5aff9936
19 changed files with 2218 additions and 1185 deletions
--- a/44
+++ b/44
@ -1,49 +1,59 @@
+# Dockerfile
+# -----------
+
 # Imagen base Python
 FROM python:3.11-slim

-# Por defecto construimos para CUDA 12.1 (cu121)
-# Si alguna vez quisieras CPU, pásale: --build-arg TORCH_CUDA=cpu
+# Construcción para CUDA 12.1 por defecto (usa --build-arg TORCH_CUDA=cpu para CPU)
 ARG TORCH_CUDA=cu121

 WORKDIR /app

-# Paquetes nativos necesarios
+# Paquetes del sistema necesarios
+# - libpq-dev y gcc: para compilar dependencias que hablen con PostgreSQL (psycopg2)
+# - git: algunos modelos/librerías pueden tirar de git
 RUN apt-get update && apt-get install -y --no-install-recommends \
    libpq-dev \
    gcc \
    git \
-    && rm -rf /var/lib/apt/lists/*
+ && rm -rf /var/lib/apt/lists/*

-# Requerimientos
-COPY requirements.txt .
+# Ajustes de pip / runtime
+ENV PYTHONUNBUFFERED=1 \
+    PIP_DISABLE_PIP_VERSION_CHECK=1 \
+    TOKENIZERS_PARALLELISM=false \
+    HF_HUB_DISABLE_SYMLINKS_WARNING=1

-# Actualiza pip y herramientas base
+# Dependencias Python
+COPY requirements.txt ./
 RUN python -m pip install --no-cache-dir --upgrade pip setuptools wheel

-# Instala PyTorch con el runtime CUDA 12.1 (o CPU si TORCH_CUDA=cpu)
+# Instala PyTorch:
+#  - Con CUDA 12.1 si TORCH_CUDA=cu121 (requiere runtime nvidia al ejecutar)
+#  - Con ruedas CPU si TORCH_CUDA=cpu
 RUN if [ "$TORCH_CUDA" = "cu121" ]; then \
      pip install --no-cache-dir --index-url https://download.pytorch.org/whl/cu121 \
-        torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1; \
+        torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1 ; \
    else \
      pip install --no-cache-dir \
-        torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1; \
+        torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1 ; \
    fi

 # Instala el resto de dependencias de tu app
 RUN pip install --no-cache-dir -r requirements.txt

-# spaCy + modelo español (para el NER que quieres después)
-RUN pip install --no-cache-dir "spacy>=3.7,<4.0" \
- && python -m spacy download es_core_news_md
+# Download the Spanish spaCy model that ner_worker.py loads at startup
+# (spacy.load("es_core_news_md")). Kept best-effort so a failed download does not
+# break the build, but the failure is reported instead of being silently swallowed.
+RUN python -m spacy download es_core_news_md \
+ || echo "WARNING: could not download es_core_news_md; ner_worker.py will fail at startup until the model is installed" >&2

-# Copia el código
+# Copia el código de la app
 COPY . .

-# (Opcional) descarga recursos NLTK si tu app los usa; si no, déjalo como no-op
+# Descarga de recursos NLTK que usa newspaper3k (no crítico en build)
 RUN python download_models.py || true

-# Puerto que usará gunicorn en el servicio web
+# Puerto que usa gunicorn en el servicio web
 EXPOSE 8000

-# El CMD lo define docker-compose
+# El CMD/entrypoint se define en docker-compose.yml (web, scheduler, workers)

--- a/actualizar_repo.sh
+++ b/actualizar_repo.sh
@ -1,35 +0,0 @@
-#!/bin/bash
-
-# --- Script para actualizar el repositorio de Git de forma robusta ---
-
-echo "🚀 Iniciando actualización del repositorio..."
-
-# 1. Sincronizar con el repositorio remoto para evitar conflictos
-echo "----------------------------------------"
-echo "🔄 Sincronizando con el repositorio remoto (git pull)..."
-git pull || { echo "❌ Error al hacer git pull. Soluciona los conflictos y vuelve a intentarlo."; exit 1; }
-echo "----------------------------------------"
-
-
-# 2. Preparar todos los archivos modificados y nuevos
-echo "➕ Añadiendo todos los archivos al área de preparación (git add .)"
-git add .
-git add -u # Asegura que los archivos eliminados también se registren
-
-# 3. Crear el mensaje del commit solo si hay cambios
-COMMIT_MSG="Actualización del $(date +'%Y-%m-%d a las %H:%M:%S')"
-echo "💬 Creando commit con el mensaje: '$COMMIT_MSG'"
-
-# Solo hacemos commit si hay algo que añadir para evitar commits vacíos
-if ! git diff-index --quiet HEAD --; then
-    git commit -m "$COMMIT_MSG"
-else
-    echo "ℹ️ No hay cambios que subir. El repositorio ya está actualizado."
-    exit 0
-fi
-
-# 4. Subir los cambios a GitHub
-echo "⬆️ Subiendo cambios al repositorio remoto (git push)..."
-git push || { echo "❌ Error al hacer git push. Revisa la conexión o los permisos."; exit 1; }
-
-echo "✅ ¡Actualización completada!"
--- a/app.py
+++ b/app.py
--- a/categorias.csv
+++ b/categorias.csv
@ -0,0 +1,16 @@
+id,nombre
+1,Ciencia
+2,Cultura
+3,Deportes
+4,Economía
+5,Educación
+6,Entretenimiento
+7,Internacional
+8,Medio Ambiente
+9,Moda
+10,Opinión
+11,Política
+12,Salud
+13,Sociedad
+14,Tecnología
+15,Viajes
--- a/categorias.sql
+++ b/categorias.sql
@ -1,18 +0,0 @@
-INSERT INTO categorias (nombre) VALUES
-('Ciencia'),
-('Cultura'),
-('Deportes'),
-('Economía'),
-('Educación'),
-('Entretenimiento'),
-('Internacional'),
-('Medio Ambiente'),
-('Moda'),
-('Opinión'),
-('Política'),
-('Salud'),
-('Sociedad'),
-('Tecnología'),
-('Viajes')
-ON CONFLICT DO NOTHING;
-
--- a/continentes.csv
+++ b/continentes.csv
@ -0,0 +1,7 @@
+id,nombre
+1,África
+2,América
+3,Asia
+4,Europa
+5,Oceanía
+6,Antártida
--- a/continentes.sql
+++ b/continentes.sql
@ -1,9 +0,0 @@
-INSERT INTO continentes (id, nombre) VALUES
-(1, 'África'),
-(2, 'América'),
-(3, 'Asia'),
-(4, 'Europa'),
-(5, 'Oceanía'),
-(6, 'Antártida')
-ON CONFLICT (id) DO NOTHING;
-
--- a/docker-compose.yml
+++ b/docker-compose.yml
@ -1,20 +1,27 @@
 services:
  db:
-    image: postgres:15
+    image: postgres:18
    container_name: rss_db
    environment:
-      - POSTGRES_DB=${DB_NAME}
-      - POSTGRES_USER=${DB_USER}
-      - POSTGRES_PASSWORD=${DB_PASS}
+      POSTGRES_DB: ${DB_NAME}
+      POSTGRES_USER: ${DB_USER}
+      POSTGRES_PASSWORD: ${DB_PASS}
+      POSTGRES_INITDB_ARGS: "--encoding=UTF8 --locale=C.UTF-8"
+      LANG: C.UTF-8
+      LC_ALL: C.UTF-8
+      TZ: Europe/Madrid
+      PGDATA: /var/lib/postgresql/data/18/main
+    command: ["postgres", "-c", "max_connections=400"]
    volumes:
-      - postgres_data:/var/lib/postgresql/data
-      - ./init-db:/docker-entrypoint-initdb.d
+      - /datos/rss/postgres/18:/var/lib/postgresql/data
+      - ./init-db:/docker-entrypoint-initdb.d:ro
    restart: always
    healthcheck:
-      test: ["CMD-SHELL", "pg_isready -U ${DB_USER} -d ${DB_NAME}"]
+      test: ["CMD-SHELL", "pg_isready -h 127.0.0.1 -p 5432 -U $$POSTGRES_USER -d $$POSTGRES_DB || exit 1"]
      interval: 5s
      timeout: 5s
-      retries: 5
+      retries: 30
+      start_period: 20s

  web:
    build:
@ -22,7 +29,7 @@ services:
      args:
        TORCH_CUDA: cu121
    container_name: rss_web
-    command: gunicorn --bind 0.0.0.0:8000 --workers 3 app:app
+    command: bash -lc "gunicorn --bind 0.0.0.0:8000 --workers 3 --timeout 120 app:app"
    ports:
      - "8001:8000"
    environment:
@ -46,7 +53,7 @@ services:
      args:
        TORCH_CUDA: cu121
    container_name: rss_scheduler
-    command: python scheduler.py
+    command: bash -lc "python scheduler.py"
    environment:
      - DB_HOST=db
      - DB_PORT=5432
@ -54,6 +61,7 @@ services:
      - DB_USER=${DB_USER}
      - DB_PASS=${DB_PASS}
      - SECRET_KEY=${SECRET_KEY}
+      - RSS_MAX_WORKERS=8
    depends_on:
      db:
        condition: service_healthy
@ -65,7 +73,7 @@ services:
      args:
        TORCH_CUDA: cu121
    container_name: rss_translator
-    command: python translation_worker.py
+    command: bash -lc "python translation_worker.py"
    environment:
      - DB_HOST=db
      - DB_PORT=5432
@ -101,7 +109,7 @@ services:
      - NVIDIA_VISIBLE_DEVICES=all
      - NVIDIA_DRIVER_CAPABILITIES=compute,utility
    volumes:
-      - hf_cache:/root/.cache/huggingface
+      - /datos/rss/hf_cache:/root/.cache/huggingface
    depends_on:
      db:
        condition: service_healthy
@ -114,7 +122,7 @@ services:
      args:
        TORCH_CUDA: cu121
    container_name: rss_ner
-    command: python ner_worker.py
+    command: bash -lc "python ner_worker.py"
    environment:
      - DB_HOST=db
      - DB_PORT=5432
@ -128,7 +136,61 @@ services:
        condition: service_healthy
    restart: always

-volumes:
-  postgres_data:
-  hf_cache:
+  embeddings:
+    build:
+      context: .
+      args:
+        TORCH_CUDA: cu121
+    container_name: rss_embeddings
+    command: bash -lc "python embeddings_worker.py"
+    environment:
+      - DB_HOST=db
+      - DB_PORT=5432
+      - DB_NAME=${DB_NAME}
+      - DB_USER=${DB_USER}
+      - DB_PASS=${DB_PASS}
+
+      - EMB_MODEL=sentence-transformers/all-MiniLM-L6-v2
+      - EMB_BATCH=64
+      - EMB_SLEEP=5
+
+      - PYTHONUNBUFFERED=1
+      - HF_HOME=/root/.cache/huggingface
+      - TOKENIZERS_PARALLELISM=false
+    volumes:
+      - /datos/rss/hf_cache:/root/.cache/huggingface
+    depends_on:
+      db:
+        condition: service_healthy
+    restart: always
+    # gpus: all
+
+  related:
+    build:
+      context: .
+      args:
+        TORCH_CUDA: cu121
+    container_name: rss_related
+    command: bash -lc "python related_worker.py"
+    environment:
+      - DB_HOST=db
+      - DB_PORT=5432
+      - DB_NAME=${DB_NAME}
+      - DB_USER=${DB_USER}
+      - DB_PASS=${DB_PASS}
+
+      - RELATED_TOPK=10
+      - RELATED_BATCH_IDS=200
+      - RELATED_BATCH_SIM=2000
+      - RELATED_SLEEP=10
+      - RELATED_MIN_SCORE=0.0
+      - RELATED_WINDOW_H=0
+    depends_on:
+      db:
+        condition: service_healthy
+    restart: always
+
+networks:
+  default:
+    name: rss_default

--- a/embeddings_worker.py
+++ b/embeddings_worker.py
@ -0,0 +1,161 @@
+# embeddings_worker.py
+# Worker de embeddings para TRADUCCIONES:
+# - Lee traducciones con status='done' y sin embedding para un modelo concreto
+# - Calcula embedding (Sentence-Transformers) sobre título_trad + resumen_trad
+# - Guarda en traduccion_embeddings (traduccion_id, model, dim, embedding)
+
+import os
+import time
+import logging
+from typing import List, Tuple
+import numpy as np
+import psycopg2
+import psycopg2.extras
+from sentence_transformers import SentenceTransformer
+
+logging.basicConfig(level=logging.INFO, format='[%(asctime)s] %(levelname)s: %(message)s')
+log = logging.getLogger(__name__)
+
+# ---------- Configuración DB ----------
+DB = dict(
+    host=os.environ.get("DB_HOST", "localhost"),
+    port=int(os.environ.get("DB_PORT", 5432)),
+    dbname=os.environ.get("DB_NAME", "rss"),
+    user=os.environ.get("DB_USER", "rss"),
+    password=os.environ.get("DB_PASS", "x"),
+)
+
+# ---------- Worker parameters ----------
+# Default model: small and fast MiniLM
+EMB_MODEL   = os.environ.get("EMB_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
+EMB_BATCH   = int(os.environ.get("EMB_BATCH", "128"))
+# Seconds to sleep when idle or after an error. EMB_SLEEP_IDLE takes precedence;
+# EMB_SLEEP is accepted as a fallback because docker-compose.yml sets that name.
+# Empty values fall through to the next candidate instead of breaking float().
+SLEEP_IDLE  = float(os.environ.get("EMB_SLEEP_IDLE") or os.environ.get("EMB_SLEEP") or "5.0")
+# Target-language filter (comma-separated). Defaults to 'es' only
+EMB_LANGS   = [s.strip() for s in os.environ.get("EMB_LANGS", "es").split(",") if s.strip()]
+DEVICE      = os.environ.get("DEVICE", "auto").lower()  # 'auto' | 'cpu' | 'cuda'
+
+# Per-iteration row limit (avoids pulling the whole table at once)
+EMB_LIMIT   = int(os.environ.get("EMB_LIMIT", "1000"))
+
+# ---------- Utilidades ----------
+def get_conn():
+    """Open and return a new psycopg2 connection built from the DB settings dict."""
+    return psycopg2.connect(**DB)
+
+def ensure_schema(conn):
+    """Create the translation-embeddings table and its indexes if missing, then commit.
+
+    The DDL mirrors init-db/08-embeddings.sql, including the index names, so that
+    running both never leaves two differently named indexes on the same column.
+    """
+    with conn.cursor() as cur:
+        cur.execute("""
+        CREATE TABLE IF NOT EXISTS traduccion_embeddings (
+          id SERIAL PRIMARY KEY,
+          traduccion_id INT NOT NULL REFERENCES traducciones(id) ON DELETE CASCADE,
+          model TEXT NOT NULL,
+          dim INT NOT NULL,
+          embedding DOUBLE PRECISION[] NOT NULL,
+          created_at TIMESTAMP DEFAULT NOW(),
+          UNIQUE (traduccion_id, model)
+        );
+        """)
+        cur.execute("CREATE INDEX IF NOT EXISTS idx_tr_emb_model ON traduccion_embeddings(model);")
+        # Same name as in init-db/08-embeddings.sql (was idx_tr_emb_trid, which
+        # produced a second, redundant index on traduccion_id).
+        cur.execute("CREATE INDEX IF NOT EXISTS idx_tr_emb_traduccion_id ON traduccion_embeddings(traduccion_id);")
+    conn.commit()
+
+def fetch_batch_pending(conn) -> List[psycopg2.extras.DictRow]:
+    """
+    Return up to EMB_LIMIT translations with status 'done' in one of the
+    EMB_LANGS target languages that have no embedding yet for EMB_MODEL,
+    ordered by translation id.
+
+    Each row exposes: traduccion_id, lang_to, titulo_trad, resumen_trad
+    (both text fields are '' when NULL) and noticia_id.
+    """
+    with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
+        # ANY(%s) filters on several target languages at once (psycopg2 adapts
+        # the Python list to an array). Plain string on purpose: all values go
+        # through bound parameters, so no f-string interpolation is needed.
+        cur.execute("""
+            SELECT t.id          AS traduccion_id,
+                   t.lang_to     AS lang_to,
+                   COALESCE(NULLIF(t.titulo_trad, ''), '')  AS titulo_trad,
+                   COALESCE(NULLIF(t.resumen_trad,''), '')  AS resumen_trad,
+                   n.id          AS noticia_id
+            FROM traducciones t
+            JOIN noticias n ON n.id = t.noticia_id
+            LEFT JOIN traduccion_embeddings e
+              ON e.traduccion_id = t.id AND e.model = %s
+            WHERE t.status = 'done'
+              AND t.lang_to = ANY(%s)
+              AND e.traduccion_id IS NULL
+            ORDER BY t.id
+            LIMIT %s
+        """, (EMB_MODEL, EMB_LANGS, EMB_LIMIT))
+        return cur.fetchall()
+
+def texts_from_rows(rows: List[psycopg2.extras.DictRow]) -> List[str]:
+    """
+    Build the text to embed for every translation row.
+
+    The stripped title and summary are joined with a newline; when only one of
+    them is non-empty it is used alone, and when both are empty the result is ''.
+    """
+    def _compose(row) -> str:
+        # Strip both fields, then keep only the non-empty ones, title first.
+        pieces = [(row[key] or "").strip() for key in ("titulo_trad", "resumen_trad")]
+        return "\n".join(piece for piece in pieces if piece)
+
+    return [_compose(row) for row in rows]
+
+def upsert_embeddings(conn, rows, embs: np.ndarray, model_name: str):
+    """
+    Insert one embedding per translation row, overwriting any embedding already
+    stored for the same (traduccion_id, model) pair, and commit.
+
+    Does nothing when there are no rows or the embedding array is empty.
+    rows and embs are paired positionally.
+    """
+    if embs.size == 0 or not rows:
+        return
+
+    vector_dim = int(embs.shape[1])
+    upsert_sql = """
+              INSERT INTO traduccion_embeddings (traduccion_id, model, dim, embedding)
+              VALUES (%s, %s, %s, %s)
+              ON CONFLICT (traduccion_id, model) DO UPDATE
+              SET embedding = EXCLUDED.embedding,
+                  dim = EXCLUDED.dim,
+                  created_at = NOW()
+            """
+    with conn.cursor() as cur:
+        for row, vector in zip(rows, embs):
+            # Plain Python floats so psycopg2 can adapt the vector as an array.
+            values = [float(component) for component in vector]
+            cur.execute(upsert_sql, (int(row["traduccion_id"]), model_name, vector_dim, values))
+    conn.commit()
+
+# ---------- Main loop ----------
+def main():
+    """
+    Run the embeddings worker forever.
+
+    Loads the Sentence-Transformers model once, then repeatedly fetches a batch
+    of pending translations, encodes them and upserts the embeddings. Sleeps
+    SLEEP_IDLE seconds when there is nothing to do or after an error.
+    """
+    log.info("Arrancando embeddings_worker para TRADUCCIONES")
+    log.info("Modelo: %s | Batch: %s | Idiomas: %s | Device: %s",
+             EMB_MODEL, EMB_BATCH, ",".join(EMB_LANGS), DEVICE)
+
+    # DEVICE='auto' -> pass device=None and let Sentence-Transformers pick the device
+    model = SentenceTransformer(EMB_MODEL, device=None if DEVICE == "auto" else DEVICE)
+
+    # The DDL only needs to succeed once per process; retried until it does.
+    schema_ready = False
+
+    while True:
+        conn = None
+        processed = 0
+        try:
+            conn = get_conn()
+            if not schema_ready:
+                ensure_schema(conn)
+                schema_ready = True
+
+            rows = fetch_batch_pending(conn)
+            if rows:
+                texts = texts_from_rows(rows)
+                # Unit-length embeddings make later similarity computations simpler
+                embs = model.encode(
+                    texts,
+                    batch_size=EMB_BATCH,
+                    convert_to_numpy=True,
+                    show_progress_bar=False,
+                    normalize_embeddings=True
+                )
+
+                upsert_embeddings(conn, rows, embs, EMB_MODEL)
+                processed = len(rows)
+                log.info("Embeddings upserted: %d", processed)
+
+        except Exception as e:
+            log.exception("Error en embeddings_worker: %s", e)
+        finally:
+            # psycopg2's "with connection" only ends the transaction; it does not
+            # close the connection, so close it explicitly on every iteration.
+            if conn is not None:
+                conn.close()
+
+        # Sleep only after the connection is released, so an idle worker does not
+        # hold a server connection (or an open transaction) while waiting.
+        if not processed:
+            time.sleep(SLEEP_IDLE)
+
+if __name__ == "__main__":
+    main()
+
--- a/init-db/08-embeddings.sql
+++ b/init-db/08-embeddings.sql
@ -0,0 +1,81 @@
+-- init-db/08-embeddings.sql
+-- ============================================================
+-- Esquema para embeddings y relaciones semánticas entre noticias
+-- Compatible con embeddings_worker.py (usa traduccion_embeddings)
+-- y mantiene una vista "embeddings" para compatibilidad previa.
+-- ============================================================
+
+-- Tabla principal de embeddings por traducción (con modelo)
+CREATE TABLE IF NOT EXISTS traduccion_embeddings (
+  id             SERIAL PRIMARY KEY,
+  traduccion_id  INT NOT NULL REFERENCES traducciones(id) ON DELETE CASCADE,
+  model          TEXT NOT NULL,
+  dim            INT NOT NULL,
+  embedding      DOUBLE PRECISION[] NOT NULL,
+  created_at     TIMESTAMP DEFAULT NOW(),
+  UNIQUE (traduccion_id, model)
+);
+
+-- Índices recomendados
+CREATE INDEX IF NOT EXISTS idx_tr_emb_traduccion_id ON traduccion_embeddings(traduccion_id);
+CREATE INDEX IF NOT EXISTS idx_tr_emb_model          ON traduccion_embeddings(model);
+
+-- -----------------------------------------------------------------
+-- Vista de compatibilidad "embeddings"
+-- (emula tu antigua tabla con columnas: traduccion_id, dim, vec)
+-- Ajusta el valor del WHERE model = '...' si usas otro modelo.
+-- -----------------------------------------------------------------
+DO $$
+BEGIN
+  -- If a real TABLE named embeddings already exists, rename it to embeddings_legacy
+  -- so the compatibility view can take its name.
+  -- table_type = 'BASE TABLE' matters: information_schema.tables also lists views,
+  -- so without it a re-run would rename the compatibility view itself.
+  IF EXISTS (
+    SELECT 1 FROM information_schema.tables
+    WHERE table_schema = 'public' AND table_name = 'embeddings'
+      AND table_type = 'BASE TABLE'
+  ) THEN
+    EXECUTE 'ALTER TABLE embeddings RENAME TO embeddings_legacy';
+  END IF;
+EXCEPTION WHEN others THEN
+  -- Do not block the migration because of this, but leave a trace of the failure
+  RAISE NOTICE 'Could not rename embeddings to embeddings_legacy: %', SQLERRM;
+END$$;
+
+-- Crea/actualiza la vista
+CREATE OR REPLACE VIEW embeddings AS
+SELECT
+  te.traduccion_id,
+  te.dim,
+  te.embedding AS vec
+FROM traduccion_embeddings te
+WHERE te.model = 'sentence-transformers/all-MiniLM-L6-v2';
+
+-- Nota:
+-- Si quieres que la vista siempre coja el embedding más reciente de CUALQUIER modelo:
+--    REEMPLAZA el WHERE anterior por:
+--    WHERE te.id IN (
+--      SELECT DISTINCT ON (traduccion_id) id
+--      FROM traduccion_embeddings
+--      ORDER BY traduccion_id, created_at DESC
+--    );
+
+-- -----------------------------------------------------------------
+-- Relaciones semánticas entre traducciones (opcional)
+-- Esta tabla no la usa el worker directamente, pero permite cachear
+-- "noticias relacionadas" precalculadas por otro proceso/batch.
+-- -----------------------------------------------------------------
+CREATE TABLE IF NOT EXISTS related_noticias (
+  traduccion_id          INT NOT NULL REFERENCES traducciones(id) ON DELETE CASCADE,
+  related_traduccion_id  INT NOT NULL REFERENCES traducciones(id) ON DELETE CASCADE,
+  score                  DOUBLE PRECISION NOT NULL,
+  created_at             TIMESTAMP DEFAULT NOW(),
+  PRIMARY KEY (traduccion_id, related_traduccion_id),
+  CHECK (traduccion_id <> related_traduccion_id)
+);
+
+-- Índices para acelerar consultas en ambos sentidos
+CREATE INDEX IF NOT EXISTS idx_related_by_tr        ON related_noticias (traduccion_id);
+CREATE INDEX IF NOT EXISTS idx_related_by_relatedtr ON related_noticias (related_traduccion_id);
+
+-- Sugerencias:
+-- - Si pretendes recalcular periódicamente, podrías limpiar por ventana temporal:
+--   DELETE FROM related_noticias WHERE created_at < NOW() - INTERVAL '7 days';
+
--- a/migrations/001_utils_normalize_url.sql
+++ b/migrations/001_utils_normalize_url.sql
@ -0,0 +1,62 @@
+-- Función para “normalizar” URLs: minúsculas en esquema/host, quitar www,
+-- quitar fragmentos (#...), limpiar trackers comunes (utm_*, gclid, fbclid, ref, etc.),
+-- colapsar dobles “/”, y quitar la “/” final salvo si es la raíz.
+
+-- normalize_url(in_url): returns a canonical form of a URL, or NULL for NULL/empty input.
+-- Declared IMMUTABLE because migrations/002_unique_index_url_norm.sql uses it in
+-- STORED generated columns, whose expressions may only call immutable functions.
+-- The result depends only on the argument (pure string manipulation).
+CREATE OR REPLACE FUNCTION normalize_url(in_url text)
+RETURNS text
+LANGUAGE plpgsql
+IMMUTABLE
+AS $$
+DECLARE
+  u text := trim(in_url);
+  scheme_host text;
+  path_q text;
+BEGIN
+  IF u IS NULL OR u = '' THEN
+    RETURN NULL;
+  END IF;
+
+  -- drop the fragment (#...)
+  u := regexp_replace(u, '#.*$', '', 'i');
+
+  -- split scheme+host from path+query
+  -- e.g. https://example.com:443/foo?bar -> scheme_host=https://example.com:443 ; path_q=/foo?bar
+  -- The scheme is matched case-insensitively: lowercasing only happens further down,
+  -- so 'HTTP://Example.com' must be recognised here instead of getting a second scheme.
+  scheme_host := substring(u FROM '^[A-Za-z][A-Za-z0-9+.-]*://[^/]*');
+  IF scheme_host IS NULL THEN
+    -- no scheme: assume http
+    u := 'http://' || u;
+    scheme_host := substring(u FROM '^[A-Za-z][A-Za-z0-9+.-]*://[^/]*');
+  END IF;
+  path_q := substring(u FROM '^[A-Za-z][A-Za-z0-9+.-]*://[^/]*(/.*)$');
+  IF path_q IS NULL THEN
+    path_q := '/';
+  END IF;
+
+  -- normalise scheme and host (lowercase, strip leading www.)
+  scheme_host := lower(scheme_host);
+  scheme_host := regexp_replace(scheme_host, '^(https?://)www\.', '\1', 'i');
+
+  -- strip the default port (:80 for http, :443 for https)
+  scheme_host := regexp_replace(scheme_host, '^http://([^/:]+):80$', 'http://\1', 'i');
+  scheme_host := regexp_replace(scheme_host, '^https://([^/:]+):443$', 'https://\1', 'i');
+
+  -- remove tracking parameters from the query
+  -- (utm_*, gclid, fbclid, mc_cid, mc_eid, ref, ref_src, yclid, igshid)
+  path_q := regexp_replace(path_q, '([?&])(utm_[^=&]+|gclid|fbclid|mc_cid|mc_eid|ref|ref_src|yclid|igshid)=[^&#]*', '\1', 'gi');
+  -- tidy leftover separators: "?&" -> "?", "&&" -> "&", trailing "&" and trailing "?"
+  path_q := regexp_replace(path_q, '\?&+', '?', 'g');
+  path_q := regexp_replace(path_q, '&{2,}', '&', 'g');
+  -- a removed last parameter leaves a dangling "&" (e.g. /p?a=1&utm_source=x -> /p?a=1&)
+  path_q := regexp_replace(path_q, '&$', '', 'g');
+  path_q := regexp_replace(path_q, '\?$', '', 'g');
+
+  -- collapse repeated slashes (scheme_host is handled separately, so "://" is untouched)
+  path_q := regexp_replace(path_q, '/{2,}', '/', 'g');
+
+  -- drop trailing "/" unless the path is just the root
+  IF path_q <> '/' THEN
+    path_q := regexp_replace(path_q, '/+$', '', 'g');
+  END IF;
+
+  RETURN scheme_host || path_q;
+END;
+$$;
+
--- a/migrations/002_unique_index_url_norm.sql
+++ b/migrations/002_unique_index_url_norm.sql
@ -0,0 +1,38 @@
+-- Añadir columna generada url_norm y crear índice único sobre ella.
+-- OJO: si ya existen duplicados, este índice fallará.
+-- Primero crea la columna si no existe:
+
+DO $$
+BEGIN
+  IF NOT EXISTS (
+    SELECT 1 FROM information_schema.columns
+    WHERE table_name='feeds' AND column_name='url_norm'
+  ) THEN
+    ALTER TABLE feeds
+      ADD COLUMN url_norm text GENERATED ALWAYS AS (normalize_url(url)) STORED;
+  END IF;
+END $$;
+
+-- Índice único (concurrently para no bloquear). Requiere estar fuera de transacción.
+-- Si tu herramienta corre todo en una transacción, ejecuta estas dos líneas aparte.
+-- Quita duplicados antes si da error.
+CREATE UNIQUE INDEX CONCURRENTLY IF NOT EXISTS feeds_url_norm_uniq ON feeds (url_norm)
+WHERE url_norm IS NOT NULL;
+
+-- (Opcional) repetir lo mismo para fuentes_url y noticias si quieres esa garantía también:
+
+DO $$
+BEGIN
+  IF NOT EXISTS (
+    SELECT 1 FROM information_schema.columns
+    WHERE table_name='fuentes_url' AND column_name='url_norm'
+  ) THEN
+    ALTER TABLE fuentes_url
+      ADD COLUMN url_norm text GENERATED ALWAYS AS (normalize_url(url)) STORED;
+  END IF;
+END $$;
+
+CREATE UNIQUE INDEX CONCURRENTLY IF NOT EXISTS fuentes_url_norm_uniq ON fuentes_url (url_norm)
+WHERE url_norm IS NOT NULL;
+
+
--- a/ner_worker.py
+++ b/ner_worker.py
@ -5,6 +5,7 @@ import re
 import psycopg2
 import psycopg2.extras
 import spacy
+from bs4 import BeautifulSoup

 logging.basicConfig(level=logging.INFO, format='[%(asctime)s] %(levelname)s: %(message)s')

@ -16,13 +17,9 @@ DB = dict(
    password=os.environ.get("DB_PASS", "x"),
 )

-# Idioma de las traducciones que vamos a etiquetar
 NER_LANG = os.environ.get("NER_LANG", "es").strip().lower()
-
-# Tamaño de lote de traducciones a procesar por iteración
 BATCH = int(os.environ.get("NER_BATCH", 64))

-# Mapeo de etiquetas de spaCy -> tipos de nuestro esquema
 ENT_LABELS = {
    "PERSON": "persona",
    "ORG": "organizacion",
@ -30,28 +27,65 @@ ENT_LABELS = {
    "LOC": "lugar",
 }

-# Normaliza el valor del tag (quita espacios extra, colapsa espacios internos)
 _ws_re = re.compile(r"\s+")
-def _clean_value(s: str) -> str:
-    if not s:
-        return ""
-    s = s.strip()
-    s = _ws_re.sub(" ", s)
-    return s
+HTML_TRASH_PATTERNS = [
+    r"<[^>]+>",
+    r"&[a-z]+;",
+    r'width="\d+"',
+    r'height="\d+"',
+]
+GENERIC_BAD_TAGS = {
+    "república",
+    "estado",
+    "centro",
+    "gobierno",
+    "report",
+    "sp",
+    "unión",
+}
+
+
+def clean_tag_text(text):
+    """
+    Clean an entity string produced by NER and decide whether it is a usable tag.
+
+    Strips HTML (BeautifulSoup plus HTML_TRASH_PATTERNS), collapses whitespace,
+    maps known aliases to a canonical name and rejects noise.
+
+    Returns the cleaned (possibly canonicalised) text, or None when the value is
+    empty, shorter than 3 characters, contains markup/path characters, looks like
+    a link, or is listed in GENERIC_BAD_TAGS.
+    """
+    if not text:
+        return None
+    text = BeautifulSoup(text, "html.parser").get_text()
+    for pat in HTML_TRASH_PATTERNS:
+        text = re.sub(pat, "", text)
+    text = _ws_re.sub(" ", text).strip()
+    lower = text.lower()
+
+    # Aliases are resolved before any filtering: "eu" and "ue" are only two
+    # characters long and would otherwise be dropped by the length check below,
+    # which made those two entries unreachable.
+    replacements = {
+        "ee.uu.": "Estados Unidos",
+        "los estados unidos": "Estados Unidos",
+        "eu": "Unión Europea",
+        "ue": "Unión Europea",
+        "kosova": "Kosovo",
+    }
+    if lower in replacements:
+        return replacements[lower]
+
+    if len(text) < 3:
+        return None
+    # Leftover markup or path-like fragments are not valid tags
+    if re.search(r"[<>/\\]", text):
+        return None
+    if lower.startswith("href="):
+        return None
+    if lower.startswith("http"):
+        return None
+    if lower in GENERIC_BAD_TAGS:
+        return None
+    return text
+

 def get_conn():
    return psycopg2.connect(**DB)

+
 def main():
-    # Nota: asumimos español porque el contenedor instala es_core_news_md en el Dockerfile.
-    # Si quisieras soportar más idiomas, instala el modelo correspondiente y haz un mapping.
    nlp = spacy.load("es_core_news_md", disable=["parser", "lemmatizer", "textcat"])
    logging.info("spaCy cargado: es_core_news_md")

    while True:
        try:
            with get_conn() as conn, conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
-                # Tomamos traducciones 'done' hacia NER_LANG que aún no tengan ninguna relación en tags_noticia
                cur.execute(
                    """
                    WITH pend AS (
@ -78,7 +112,7 @@ def main():
                logging.info(f"Procesando {len(rows)} traducciones para NER...")

                new_links = 0
-                new_tags = 0
+
                for r in rows:
                    text = f"{r['titulo_trad'] or ''}\n{r['resumen_trad'] or ''}".strip()
                    if not text:
@ -91,17 +125,14 @@ def main():
                        tipo = ENT_LABELS.get(ent.label_)
                        if not tipo:
                            continue
-                        val = _clean_value(ent.text)
-                        # filtros simples
-                        if len(val) < 2:
+                        val = clean_tag_text(ent.text)
+                        if not val:
                            continue
                        ents.append((val, tipo))

                    if not ents:
                        continue

-                    # Insertamos (o actualizamos si ya existe) el tag y luego la relación
-                    # IMPORTANTE: requiere UNIQUE(valor, tipo) en 'tags' y UNIQUE(traduccion_id, tag_id) en 'tags_noticia'
                    for valor, tipo in set(ents):
                        try:
                            cur.execute(
@ -115,7 +146,6 @@ def main():
                                (valor, tipo),
                            )
                            tag_id = cur.fetchone()[0]
-                            # Intenta crear la relación; si existe (por UNIQUE), se ignora
                            cur.execute(
                                """
                                INSERT INTO tags_noticia (traduccion_id, tag_id)
@ -126,11 +156,7 @@ def main():
                            )
                            if cur.rowcount > 0:
                                new_links += 1
-                            # Heurística: si el tag se ha creado (no hay forma directa aquí),
-                            # lo aproximamos contando que el RETURNING vino de un insert o un update.
-                            # Para no complicar: cuenta enlaces nuevos, y deja 'new_tags' como métrica opcional.
                        except Exception:
-                            # No abortar el lote por un único fallo en un valor raro.
                            logging.exception("Fallo insertando tag/relación")

                conn.commit()
@ -139,6 +165,7 @@ def main():
            logging.exception(f"Error en NER loop: {e}")
            time.sleep(5)

+
 if __name__ == "__main__":
    main()

--- a/paises.csv
+++ b/paises.csv
@ -0,0 +1,196 @@
+id,nombre,continente_id
+1,Afganistán,3
+2,Albania,4
+3,Alemania,4
+4,Andorra,4
+5,Angola,1
+6,Antigua y Barbuda,2
+7,Arabia Saudita,3
+8,Argelia,1
+9,Argentina,2
+10,Armenia,3
+11,Australia,5
+12,Austria,4
+13,Azerbaiyán,3
+14,Bahamas,2
+15,Bangladés,3
+16,Barbados,2
+17,Baréin,3
+19,Belice,2
+20,Benín,1
+21,Bielorrusia,4
+22,Birmania,3
+23,Bolivia,2
+24,Bosnia y Herzegovina,4
+25,Botsuana,1
+26,Brasil,2
+27,Brunéi,3
+28,Bulgaria,4
+29,Burkina Faso,1
+30,Burundi,1
+31,Bután,3
+18,Bélgica,4
+32,Cabo Verde,1
+33,Camboya,3
+34,Camerún,1
+35,Canadá,2
+36,Catar,3
+37,Chad,1
+38,Chile,2
+39,China,3
+40,Chipre,3
+41,Colombia,2
+42,Comoras,1
+43,Corea del Norte,3
+44,Corea del Sur,3
+46,Costa Rica,2
+45,Costa de Marfil,1
+47,Croacia,4
+48,Cuba,2
+49,Dinamarca,4
+50,Dominica,2
+51,Ecuador,2
+52,Egipto,1
+53,El Salvador,2
+54,Emiratos Árabes Unidos,3
+55,Eritrea,1
+56,Eslovaquia,4
+57,Eslovenia,4
+58,España,4
+59,Estados Unidos,2
+60,Estonia,4
+61,Esuatini,1
+62,Etiopía,1
+63,Filipinas,3
+64,Finlandia,4
+65,Fiyi,5
+66,Francia,4
+67,Gabón,1
+68,Gambia,1
+69,Georgia,3
+70,Ghana,1
+71,Granada,2
+72,Grecia,4
+73,Guatemala,2
+74,Guinea,1
+76,Guinea Ecuatorial,1
+75,Guinea-Bisáu,1
+77,Guyana,2
+78,Haití,2
+79,Honduras,2
+80,Hungría,4
+81,India,3
+82,Indonesia,3
+83,Irak,3
+85,Irlanda,4
+84,Irán,3
+86,Islandia,4
+87,Islas Marshall,5
+88,Islas Salomón,5
+89,Israel,3
+90,Italia,4
+91,Jamaica,2
+92,Japón,3
+93,Jordania,3
+94,Kazajistán,3
+95,Kenia,1
+96,Kirguistán,3
+97,Kiribati,5
+98,Kuwait,3
+99,Laos,3
+100,Lesoto,1
+101,Letonia,4
+103,Liberia,1
+104,Libia,1
+105,Liechtenstein,4
+106,Lituania,4
+107,Luxemburgo,4
+102,Líbano,3
+108,Macedonia del Norte,4
+109,Madagascar,1
+110,Malasia,3
+111,Malaui,1
+112,Maldivas,3
+114,Malta,4
+113,Malí,1
+115,Marruecos,1
+116,Mauricio,1
+117,Mauritania,1
+119,Micronesia,5
+120,Moldavia,4
+122,Mongolia,3
+123,Montenegro,4
+124,Mozambique,1
+118,México,2
+121,Mónaco,4
+125,Namibia,1
+126,Nauru,5
+127,Nepal,3
+128,Nicaragua,2
+130,Nigeria,1
+131,Noruega,4
+132,Nueva Zelanda,5
+129,Níger,1
+133,Omán,3
+135,Pakistán,3
+136,Palaos,5
+137,Palestina,3
+138,Panamá,2
+139,Papúa Nueva Guinea,5
+140,Paraguay,2
+134,Países Bajos,4
+141,Perú,2
+142,Polonia,4
+143,Portugal,4
+144,Reino Unido,4
+145,República Centroafricana,1
+146,República Checa,4
+148,República Democrática del Congo,1
+149,República Dominicana,2
+147,República del Congo,1
+150,Ruanda,1
+151,Rumanía,4
+152,Rusia,3
+153,Samoa,5
+154,San Cristóbal y Nieves,2
+155,San Marino,4
+156,San Vicente y las Granadinas,2
+157,Santa Lucía,2
+158,Santo Tomé y Príncipe,1
+159,Senegal,1
+160,Serbia,4
+161,Seychelles,1
+162,Sierra Leona,1
+163,Singapur,3
+164,Siria,3
+165,Somalia,1
+166,Sri Lanka,3
+167,Sudáfrica,1
+168,Sudán,1
+169,Sudán del Sur,1
+170,Suecia,4
+171,Suiza,4
+172,Surinam,2
+173,Tailandia,3
+174,Tanzania,1
+175,Tayikistán,3
+176,Timor Oriental,3
+177,Togo,1
+178,Tonga,5
+179,Trinidad y Tobago,2
+181,Turkmenistán,3
+182,Turquía,3
+183,Tuvalu,5
+180,Túnez,1
+184,Ucrania,4
+185,Uganda,1
+186,Uruguay,2
+187,Uzbekistán,3
+188,Vanuatu,5
+189,Vaticano,4
+190,Venezuela,2
+191,Vietnam,3
+192,Yemen,3
+193,Yibuti,1
+194,Zambia,1
+195,Zimbabue,1
--- a/paises.sql
+++ b/paises.sql
@ -1,198 +0,0 @@
-INSERT INTO paises (nombre, continente_id) VALUES
-('Afganistán', 3),
-('Albania', 4),
-('Alemania', 4),
-('Andorra', 4),
-('Angola', 1),
-('Antigua y Barbuda', 2),
-('Arabia Saudita', 3),
-('Argelia', 1),
-('Argentina', 2),
-('Armenia', 3),
-('Australia', 5),
-('Austria', 4),
-('Azerbaiyán', 3),
-('Bahamas', 2),
-('Bangladés', 3),
-('Barbados', 2),
-('Baréin', 3),
-('Bélgica', 4),
-('Belice', 2),
-('Benín', 1),
-('Bielorrusia', 4),
-('Birmania', 3),
-('Bolivia', 2),
-('Bosnia y Herzegovina', 4),
-('Botsuana', 1),
-('Brasil', 2),
-('Brunéi', 3),
-('Bulgaria', 4),
-('Burkina Faso', 1),
-('Burundi', 1),
-('Bután', 3),
-('Cabo Verde', 1),
-('Camboya', 3),
-('Camerún', 1),
-('Canadá', 2),
-('Catar', 3),
-('Chad', 1),
-('Chile', 2),
-('China', 3),
-('Chipre', 3),
-('Colombia', 2),
-('Comoras', 1),
-('Corea del Norte', 3),
-('Corea del Sur', 3),
-('Costa de Marfil', 1),
-('Costa Rica', 2),
-('Croacia', 4),
-('Cuba', 2),
-('Dinamarca', 4),
-('Dominica', 2),
-('Ecuador', 2),
-('Egipto', 1),
-('El Salvador', 2),
-('Emiratos Árabes Unidos', 3),
-('Eritrea', 1),
-('Eslovaquia', 4),
-('Eslovenia', 4),
-('España', 4),
-('Estados Unidos', 2),
-('Estonia', 4),
-('Esuatini', 1),
-('Etiopía', 1),
-('Filipinas', 3),
-('Finlandia', 4),
-('Fiyi', 5),
-('Francia', 4),
-('Gabón', 1),
-('Gambia', 1),
-('Georgia', 3),
-('Ghana', 1),
-('Granada', 2),
-('Grecia', 4),
-('Guatemala', 2),
-('Guinea', 1),
-('Guinea-Bisáu', 1),
-('Guinea Ecuatorial', 1),
-('Guyana', 2),
-('Haití', 2),
-('Honduras', 2),
-('Hungría', 4),
-('India', 3),
-('Indonesia', 3),
-('Irak', 3),
-('Irán', 3),
-('Irlanda', 4),
-('Islandia', 4),
-('Islas Marshall', 5),
-('Islas Salomón', 5),
-('Israel', 3),
-('Italia', 4),
-('Jamaica', 2),
-('Japón', 3),
-('Jordania', 3),
-('Kazajistán', 3),
-('Kenia', 1),
-('Kirguistán', 3),
-('Kiribati', 5),
-('Kuwait', 3),
-('Laos', 3),
-('Lesoto', 1),
-('Letonia', 4),
-('Líbano', 3),
-('Liberia', 1),
-('Libia', 1),
-('Liechtenstein', 4),
-('Lituania', 4),
-('Luxemburgo', 4),
-('Macedonia del Norte', 4),
-('Madagascar', 1),
-('Malasia', 3),
-('Malaui', 1),
-('Maldivas', 3),
-('Malí', 1),
-('Malta', 4),
-('Marruecos', 1),
-('Mauricio', 1),
-('Mauritania', 1),
-('México', 2),
-('Micronesia', 5),
-('Moldavia', 4),
-('Mónaco', 4),
-('Mongolia', 3),
-('Montenegro', 4),
-('Mozambique', 1),
-('Namibia', 1),
-('Nauru', 5),
-('Nepal', 3),
-('Nicaragua', 2),
-('Níger', 1),
-('Nigeria', 1),
-('Noruega', 4),
-('Nueva Zelanda', 5),
-('Omán', 3),
-('Países Bajos', 4),
-('Pakistán', 3),
-('Palaos', 5),
-('Palestina', 3),
-('Panamá', 2),
-('Papúa Nueva Guinea', 5),
-('Paraguay', 2),
-('Perú', 2),
-('Polonia', 4),
-('Portugal', 4),
-('Reino Unido', 4),
-('República Centroafricana', 1),
-('República Checa', 4),
-('República del Congo', 1),
-('República Democrática del Congo', 1),
-('República Dominicana', 2),
-('Ruanda', 1),
-('Rumanía', 4),
-('Rusia', 3),
-('Samoa', 5),
-('San Cristóbal y Nieves', 2),
-('San Marino', 4),
-('San Vicente y las Granadinas', 2),
-('Santa Lucía', 2),
-('Santo Tomé y Príncipe', 1),
-('Senegal', 1),
-('Serbia', 4),
-('Seychelles', 1),
-('Sierra Leona', 1),
-('Singapur', 3),
-('Siria', 3),
-('Somalia', 1),
-('Sri Lanka', 3),
-('Sudáfrica', 1),
-('Sudán', 1),
-('Sudán del Sur', 1),
-('Suecia', 4),
-('Suiza', 4),
-('Surinam', 2),
-('Tailandia', 3),
-('Tanzania', 1),
-('Tayikistán', 3),
-('Timor Oriental', 3),
-('Togo', 1),
-('Tonga', 5),
-('Trinidad y Tobago', 2),
-('Túnez', 1),
-('Turkmenistán', 3),
-('Turquía', 3),
-('Tuvalu', 5),
-('Ucrania', 4),
-('Uganda', 1),
-('Uruguay', 2),
-('Uzbekistán', 3),
-('Vanuatu', 5),
-('Vaticano', 4),
-('Venezuela', 2),
-('Vietnam', 3),
-('Yemen', 3),
-('Yibuti', 1),
-('Zambia', 1),
-('Zimbabue', 1)
-ON CONFLICT DO NOTHING;
-
--- a/related_worker.py
+++ b/related_worker.py
@ -0,0 +1,206 @@
+# related_worker.py
+import os
+import time
+import math
+import logging
+from typing import List, Tuple
+
+import psycopg2
+import psycopg2.extras
+
+# Logging setup: INFO level, every record prefixed with "[related]".
+logging.basicConfig(
+    level=logging.INFO,
+    format='[related] %(asctime)s %(levelname)s: %(message)s'
+)
+
+# PostgreSQL connection parameters, read from DB_* environment variables
+# with the fallbacks shown below; passed as keyword arguments to psycopg2.connect.
+DB = dict(
+    host=os.environ.get("DB_HOST", "localhost"),
+    port=int(os.environ.get("DB_PORT", 5432)),
+    dbname=os.environ.get("DB_NAME", "rss"),
+    user=os.environ.get("DB_USER", "rss"),
+    password=os.environ.get("DB_PASS", "x"),
+)
+
+# Tunables (each one can be overridden through the environment variable it reads)
+TOPK           = int(os.environ.get("RELATED_TOPK", 10))          # neighbors kept per translation
+BATCH_IDS      = int(os.environ.get("RELATED_BATCH_IDS", 200))    # how many target translations per pass
+BATCH_SIM      = int(os.environ.get("RELATED_BATCH_SIM", 2000))   # block size when comparing against the rest
+SLEEP_IDLE     = float(os.environ.get("RELATED_SLEEP", 10))       # pause (passed to time.sleep) when there is no work
+MIN_SCORE      = float(os.environ.get("RELATED_MIN_SCORE", 0.0))  # drop relations whose cosine is below this value
+WINDOW_HOURS   = int(os.environ.get("RELATED_WINDOW_H", 0))       # 0 = no time filter; >0 = only the last X hours
+
+def get_conn():
+    """Open and return a new psycopg2 connection built from the module-level DB settings."""
+    connection = psycopg2.connect(**DB)
+    return connection
+
+def _fetch_all_embeddings(cur):
+    """
+    Load the stored embeddings through the given cursor.
+
+    When WINDOW_HOURS > 0 only embeddings whose news item has
+    fecha >= NOW() - WINDOW_HOURS hours are selected; otherwise the whole
+    embeddings table is read.
+
+    Returns three parallel lists:
+      ids:   the traduccion_id of each row
+      vecs:  the embedding of each row (a NULL vec becomes an empty list);
+             presumably a list of floats from a DOUBLE PRECISION[] column, to be confirmed
+      norms: the L2 norm of each vector, with 1e-8 substituted for a zero norm
+    All three are empty when the query returns no rows.
+    """
+    if WINDOW_HOURS <= 0:
+        cur.execute("SELECT traduccion_id, vec FROM embeddings")
+    else:
+        cur.execute("""
+            SELECT e.traduccion_id, e.vec
+            FROM embeddings e
+            JOIN traducciones t ON t.id = e.traduccion_id
+            JOIN noticias n ON n.id = t.noticia_id
+            WHERE n.fecha >= NOW() - INTERVAL %s
+        """, (f"{WINDOW_HOURS} hours",))
+
+    ids, vecs, norms = [], [], []
+    for tr_id, raw in cur.fetchall():
+        vec = [] if raw is None else raw
+        # None components count as 0.0; a zero norm falls back to 1e-8
+        # so later divisions never hit zero.
+        squared = sum(((c or 0.0) * (c or 0.0)) for c in vec)
+        ids.append(tr_id)
+        vecs.append(vec)
+        norms.append(math.sqrt(squared) or 1e-8)
+    return ids, vecs, norms
+
+def _fetch_pending_ids(cur, limit) -> List[int]:
+    """
+    Return up to `limit` traduccion_id values that have an embedding but no
+    row in related_noticias yet, highest id first.
+
+    To regenerate relations periodically, the HAVING condition could be
+    changed to consider age or a 'stale' flag.
+    """
+    pending_sql = """
+        SELECT e.traduccion_id
+        FROM embeddings e
+        LEFT JOIN related_noticias r ON r.traduccion_id = e.traduccion_id
+        GROUP BY e.traduccion_id
+        HAVING COUNT(r.related_traduccion_id) = 0
+        ORDER BY e.traduccion_id DESC
+        LIMIT %s;
+    """
+    cur.execute(pending_sql, (limit,))
+    pending: List[int] = []
+    for row in cur.fetchall():
+        pending.append(row[0])
+    return pending
+
+def _cosine_with_norms(a, b, na, nb):
+    """
+    Cosine similarity of vectors a and b given their precomputed norms na and nb.
+
+    None components are treated as 0.0. When the vectors differ in length only
+    the overlapping prefix contributes (zip stops at the shorter one).
+    Returns 0.0 when na * nb is not positive.
+    """
+    dot = 0.0
+    for pair in zip(a, b):
+        left = pair[0] or 0.0
+        right = pair[1] or 0.0
+        dot += left * right
+
+    scale = na * nb
+    if scale <= 0.0:
+        return 0.0
+    return dot / scale
+
+def _topk_for_one(idx: int,
+                  ids_all: List[int],
+                  vecs_all: List[List[float]],
+                  norms_all: List[float],
+                  pool_indices: List[int],
+                  K: int) -> List[Tuple[int, float]]:
+    """
+    Rank the candidates in pool_indices against the vector at position idx.
+
+    Returns at most K (related_id, score) tuples ordered by descending cosine
+    score. The position idx itself is never a candidate, and when
+    MIN_SCORE > 0 candidates scoring below it are dropped.
+    """
+    me_vec, me_norm = vecs_all[idx], norms_all[idx]
+
+    scored: List[Tuple[int, float]] = [
+        (ids_all[j], _cosine_with_norms(me_vec, vecs_all[j], me_norm, norms_all[j]))
+        for j in pool_indices
+        if j != idx
+    ]
+
+    # Best score first; the sort is stable, so ties keep pool order.
+    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)
+    if MIN_SCORE > 0.0:
+        ranked = [pair for pair in ranked if pair[1] >= MIN_SCORE]
+    return ranked[:K]
+
+def _insert_related(cur, tr_id: int, pairs: List[Tuple[int, float]]):
+    """
+    Upsert the (related_id, score) pairs of translation tr_id into related_noticias.
+
+    Does nothing when pairs is empty. An existing
+    (traduccion_id, related_traduccion_id) row gets its score overwritten.
+    """
+    if not pairs:
+        return
+
+    upsert_sql = """
+        INSERT INTO related_noticias (traduccion_id, related_traduccion_id, score)
+        VALUES %s
+        ON CONFLICT (traduccion_id, related_traduccion_id)
+        DO UPDATE SET score = EXCLUDED.score
+        """
+    rows = [(tr_id, rid, float(score)) for (rid, score) in pairs]
+    psycopg2.extras.execute_values(cur, upsert_sql, rows)
+
+def build_for_ids(conn, target_ids: List[int]) -> int:
+    """
+    Compute and store the related translations for every id in target_ids.
+
+    Steps:
+      - load the embeddings once via _fetch_all_embeddings (which applies the
+        optional time window),
+      - for each target, scan the candidates in blocks of BATCH_SIM positions and
+        keep a running best-TOPK list by cosine score,
+      - upsert that list into related_noticias.
+
+    Targets without a loaded embedding are skipped. A single commit is issued
+    after all targets. Returns the number of targets processed (0 when no
+    embeddings were loaded).
+    """
+    with conn.cursor() as cur:
+        ids_all, vecs_all, norms_all = _fetch_all_embeddings(cur)
+    if not ids_all:
+        return 0
+
+    def _score(pair):
+        return pair[1]
+
+    # traduccion_id -> position inside the parallel lists
+    index_of = {tid: k for k, tid in enumerate(ids_all)}
+    total = len(ids_all)
+    handled = 0
+
+    with conn.cursor() as cur:
+        for tr_id in target_ids:
+            me = index_of.get(tr_id)
+            if me is None:
+                continue
+
+            best: List[Tuple[int, float]] = []
+            for lo in range(0, total, BATCH_SIM):
+                hi = min(lo + BATCH_SIM, total)
+                found = _topk_for_one(me, ids_all, vecs_all, norms_all, list(range(lo, hi)), TOPK)
+                # merge this block's winners into the global top-K
+                best = sorted(best + found, key=_score, reverse=True)[:TOPK]
+
+            _insert_related(cur, tr_id, best)
+            handled += 1
+
+        conn.commit()
+    return handled
+
+def main():
+    """
+    Run the worker loop forever.
+
+    Each iteration opens one database connection, fetches up to BATCH_IDS
+    pending translation ids, builds their related rows with build_for_ids and
+    then closes the connection. The loop sleeps SLEEP_IDLE seconds when there
+    was nothing pending, when none of the pending ids could be processed, or
+    after an error (which is logged, never propagated).
+    """
+    logging.info(
+        "Iniciando related_worker (TOPK=%s, BATCH_IDS=%s, BATCH_SIM=%s, MIN_SCORE=%.3f, WINDOW_H=%s)",
+        TOPK, BATCH_IDS, BATCH_SIM, MIN_SCORE, WINDOW_HOURS
+    )
+    while True:
+        idle = True
+        conn = None
+        try:
+            conn = get_conn()
+            # In psycopg2, `with conn` only commits or rolls back the transaction;
+            # it does NOT close the connection, hence the explicit close() below.
+            with conn:
+                with conn.cursor() as cur:
+                    todo = _fetch_pending_ids(cur, BATCH_IDS)
+
+                if todo:
+                    done = build_for_ids(conn, todo)
+                    logging.info("Relacionadas generadas/actualizadas para %d traducciones.", done)
+                    # Pending ids that could not be processed would be fetched
+                    # again right away, so back off instead of spinning on the DB.
+                    idle = done == 0
+        except Exception:
+            logging.exception("Error en related_worker")
+            idle = True
+        finally:
+            if conn is not None:
+                try:
+                    conn.close()
+                except Exception:
+                    logging.exception("Error en related_worker")
+
+        if idle:
+            time.sleep(SLEEP_IDLE)
+
+if __name__ == "__main__":
+    main()
+
--- a/requirements.txt
+++ b/requirements.txt
@ -16,6 +16,8 @@ sentencepiece==0.2.0
 sacremoses==0.1.1
 accelerate==0.33.0
 spacy>=3.7,<4.0
-# Nota: PyTorch (torch) NO se fija aquí.
-# Se instala en el Dockerfile con la wheel adecuada de CUDA (cu121) para tu GPU.
-
+pgvector==0.2.5
+sentence-transformers==3.0.1
+numpy>=1.26
+scikit-learn>=1.4
+python-dotenv>=1.0
--- a/templates/noticia.html
+++ b/templates/noticia.html
@ -0,0 +1,111 @@
+{% extends "base.html" %}
+{% block title %}
+  {% set d = dato if dato is defined else (r if r is defined else None) %}
+  {% if d %}
+    {{ d.titulo_trad or d.titulo_orig or d.titulo_traducido or d.titulo_original or 'Detalle de Noticia' }}
+  {% else %}
+    Detalle de Noticia
+  {% endif %}
+{% endblock %}
+
+{% block content %}
+{% set d = dato if dato is defined else (r if r is defined else None) %}
+
+{% if not d %}
+  <div class="card">
+    <div class="card-body">
+      <p>No se encontró la noticia solicitada.</p>
+    </div>
+  </div>
+{% else %}
+<div class="card">
+  <div class="feed-header">
+    <h2 style="margin:0;">
+      {{ d.titulo_trad or d.titulo_orig or d.titulo_traducido or d.titulo_original }}
+      {% if d.lang_to %}<span class="badge" title="Traducción">{{ d.lang_to|upper }}</span>{% endif %}
+    </h2>
+    {% if d.fuente_url or d.url %}
+      <div>
+        <a class="btn btn-small" href="{{ d.fuente_url or d.url }}" target="_blank" rel="noopener">Ver fuente</a>
+      </div>
+    {% endif %}
+  </div>
+
+  <div class="feed-body">
+    <div class="noticia-meta" style="margin-bottom:12px;">
+      {% set fecha_ = d.fecha %}
+      {% if fecha_ %}
+        <i class="far fa-calendar-alt"></i>
+        {% if fecha_ is string %}{{ fecha_ }}{% else %}{{ fecha_.strftime('%d-%m-%Y %H:%M') }}{% endif %}
+      {% endif %}
+      {% if d.fuente_nombre %} | <i class="fas fa-newspaper"></i> {{ d.fuente_nombre }}{% endif %}
+      {% if d.categoria %} | <i class="fas fa-tag"></i> {{ d.categoria }}{% endif %}
+      {% if d.pais %} | <i class="fas fa-globe-americas"></i> {{ d.pais }}{% endif %}
+    </div>
+
+    {% if d.resumen_trad or d.cuerpo_traducido %}
+      <h3>Resumen (traducido)</h3>
+      <div>{{ (d.resumen_trad or d.cuerpo_traducido)|safe_html }}</div>
+      <hr>
+    {% endif %}
+
+    {# Show the section only when one of the fields rendered below has content; a title alone would leave it empty. #}
+    {% if d.resumen_orig or d.cuerpo_original or d.resumen %}
+      <h3>Resumen (original)</h3>
+      <div>{{ (d.resumen_orig or d.cuerpo_original or d.resumen)|safe_html }}</div>
+    {% endif %}
+
+    {% if tags is defined and tags and tags|length %}
+      <div style="margin-top:16px;">
+        {% for t in tags %}
+          {# t puede ser DictRow (t['valor']) o tupla (t.0) #}
+          {% set valor = t.valor if t.valor is defined else (t[0] if t[0] is defined else '') %}
+          {% set tipo  = t.tipo  if t.tipo  is defined else (t[1] if t[1] is defined else '') %}
+          <span class="badge" title="{{ (tipo or '')|capitalize }}">{{ valor }}</span>
+        {% endfor %}
+      </div>
+    {% endif %}
+  </div>
+</div>
+
+{% set rels = relacionadas if relacionadas is defined else None %}
+{% if rels and rels|length %}
+<div class="card" style="margin-top:18px;">
+  <div class="card-header">
+    <h3 style="margin:0;">Noticias relacionadas</h3>
+  </div>
+  <div class="feed-body">
+    <ul class="noticias-list">
+      {% for r in rels %}
+      <li class="noticia-item">
+        {% if r.imagen_url %}
+        <div class="noticia-imagen">
+          <a href="{{ r.url }}" target="_blank" rel="noopener">
+            <img src="{{ r.imagen_url }}" alt="Imagen relacionada" loading="lazy">
+          </a>
+        </div>
+        {% endif %}
+        <div class="noticia-texto">
+          <h3 class="m0">
+            <a href="{{ r.url }}" target="_blank" rel="noopener">{{ r.titulo }}</a>
+          </h3>
+          <div class="noticia-meta">
+            {% if r.fecha %}
+              <i class="far fa-calendar-alt"></i>
+              {% if r.fecha is string %}{{ r.fecha }}{% else %}{{ r.fecha.strftime('%d-%m-%Y %H:%M') }}{% endif %}
+            {% endif %}
+            {% if r.fuente_nombre %} | <i class="fas fa-newspaper"></i> {{ r.fuente_nombre }}{% endif %}
+            {# A score that is present but None would make the "%.3f" format raise, so require a real value. #}
+            {% if r.score is defined and r.score is not none %} | <span title="Similitud coseno">score: {{ "%.3f"|format(r.score) }}</span>{% endif %}
+          </div>
+          {% if r.resumen %}
+          <div class="clamp">{{ r.resumen }}</div>
+          {% endif %}
+        </div>
+      </li>
+      {% endfor %}
+    </ul>
+  </div>
+</div>
+{% endif %}
+{% endif %}
+{% endblock %}
+
--- a/templates/noticias.html
+++ b/templates/noticias.html
@ -88,10 +88,16 @@ document.addEventListener('DOMContentLoaded', function() {
    const form = document.getElementById('filter-form');
    const continenteSelect = document.getElementById('continente_id');
    const paisSelect = document.getElementById('pais_id');
+    const categoriaSelect = document.getElementById('categoria_id');
+    const fechaInput = document.getElementById('fecha');
+    const qInput = document.getElementById('q');
+
    const pageInput = document.getElementById('page');
    const origInput = document.getElementById('orig');
    const langInput = document.getElementById('lang');

+    function setPage1() { pageInput.value = 1; }
+
    function filtrarPaises() {
        const continenteId = continenteSelect.value;
        for (let i = 1; i < paisSelect.options.length; i++) {
@ -105,22 +111,14 @@ document.addEventListener('DOMContentLoaded', function() {
        }
    }

-    async function cargarNoticias(keepPage) {
-        if (!keepPage) pageInput.value = 1;
-
-        const formData = new FormData(form);
-        const params = new URLSearchParams(formData);
-        const newUrl = `${form.action}?${params.toString()}`;
-
+    async function cargarNoticiasFromURL(url) {
        const container = document.getElementById('noticias-container');
        container.style.opacity = '0.5';
        container.innerHTML = '<div style="text-align:center; padding: 40px;"><i class="fas fa-spinner fa-spin fa-2x"></i></div>';
-
        try {
-            const response = await fetch(newUrl, { headers: { 'X-Requested-With': 'XMLHttpRequest' } });
+            const response = await fetch(url, { headers: { 'X-Requested-With': 'XMLHttpRequest' } });
            const html = await response.text();
            container.innerHTML = html;
-            window.history.pushState({path: newUrl}, '', newUrl);
        } catch (error) {
            console.error('Error al filtrar noticias:', error);
            container.innerHTML = '<p style="color:var(--error-color); text-align:center;">Error al cargar las noticias.</p>';
@ -129,11 +127,25 @@ document.addEventListener('DOMContentLoaded', function() {
        }
    }

+    // Reloads the news list from the current state of the filter form.
+    // keepPage: when falsy, the page field is reset to 1 before the query is built.
+    async function cargarNoticias(keepPage) {
+        if (!keepPage) setPage1();
+
+        // Serialize the form fields into the query string of the form's action URL.
+        const formData = new FormData(form);
+        const params = new URLSearchParams(formData);
+        const newUrl = `${form.action}?${params.toString()}`;
+
+        await cargarNoticiasFromURL(newUrl);
+        // Update the browser history with the URL that was just requested
+        window.history.pushState({ path: newUrl }, '', newUrl);
+    }
+
+    // Submit manual
    form.addEventListener('submit', function(e) {
        e.preventDefault();
        cargarNoticias(false);
    });

+    // Toggle traducción/original
    const toggleOrig = document.getElementById('toggle-orig');
    const toggleTr = document.getElementById('toggle-tr');

@ -153,12 +165,38 @@ document.addEventListener('DOMContentLoaded', function() {
        });
    }

+    // Cambios en selects/fecha -> recarga automática
    continenteSelect.addEventListener('change', function() {
        filtrarPaises();
        cargarNoticias(false);
    });
+    paisSelect.addEventListener('change', function() {
+        cargarNoticias(false);
+    });
+    categoriaSelect.addEventListener('change', function() {
+        cargarNoticias(false);
+    });
+    fechaInput.addEventListener('change', function() {
+        cargarNoticias(false);
+    });

+    // Debounce búsqueda
+    let qTimer = null;
+    qInput.addEventListener('input', function() {
+        if (qTimer) clearTimeout(qTimer);
+        qTimer = setTimeout(() => {
+            cargarNoticias(false);
+        }, 450);
+    });
+
+    // Cargar países al inicio
    filtrarPaises();
+
+    // Soporte de navegación del historial
+    window.addEventListener('popstate', function(e) {
+        const url = (e.state && e.state.path) ? e.state.path : window.location.href;
+        cargarNoticiasFromURL(url);
+    });
 });
 </script>
 {% endblock %}