ranking y tags

2025-10-17 04:10:09 +02:00 · 2025-10-17 04:10:09 +02:00 · d29152a0f6
commit d29152a0f6
parent 0bfeb610a9
9 changed files with 344 additions and 48 deletions
--- a/38
+++ b/38
@ -1,12 +1,10 @@
-# Usa una imagen base de Python ligera y moderna
+# Imagen base Python
 FROM python:3.11-slim
-# Permite elegir CPU o CUDA en build:
+# Por defecto construimos para CUDA 12.1 (cu121)
-#  - TORCH_CUDA=cpu   -> instalar torch CPU
+# Si alguna vez quisieras CPU, pásale: --build-arg TORCH_CUDA=cpu
-#  - TORCH_CUDA=cu121 -> instalar torch con CUDA 12.1
+ARG TORCH_CUDA=cu121
 ARG TORCH_CUDA=cpu
 # Establece el directorio de trabajo dentro del contenedor
 WORKDIR /app
 # Paquetes nativos necesarios
@ -16,30 +14,36 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
    git \
    && rm -rf /var/lib/apt/lists/*
-# Copia requirements primero para aprovechar caché
+# Requerimientos
 COPY requirements.txt .
-# Instala dependencias Python "comunes"
+# Actualiza pip y herramientas base
-RUN pip install --no-cache-dir -r requirements.txt
+RUN python -m pip install --no-cache-dir --upgrade pip setuptools wheel
-# Instala PyTorch según ARG (CPU o CUDA 12.1)
+# Instala PyTorch con el runtime CUDA 12.1 (o CPU si TORCH_CUDA=cpu)
 # (Versión de ejemplo; puedes alinear con tu stack)
 RUN if [ "$TORCH_CUDA" = "cu121" ]; then \
      pip install --no-cache-dir --index-url https://download.pytorch.org/whl/cu121 \
        torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1; \
    else \
-      pip install --no-cache-dir torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1; \
+      pip install --no-cache-dir \
        torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1; \
    fi
-# Copia el resto del código
+# Instala el resto de dependencias de tu app
 RUN pip install --no-cache-dir -r requirements.txt
 # spaCy + modelo español (para el NER que quieres después)
 RUN pip install --no-cache-dir "spacy>=3.7,<4.0" \
 && python -m spacy download es_core_news_md
 # Copia el código
 COPY . .
-# Descarga recursos NLTK si tu app los necesita
+# (Opcional) descarga recursos NLTK si tu app los usa; si no, déjalo como no-op
 # (si no los usas, comenta esta línea)
 RUN python download_models.py || true
-# Expone el puerto de Gunicorn (servicio web)
+# Puerto que usará gunicorn en el servicio web
 EXPOSE 8000
-# El CMD lo define docker-compose para cada servicio
+# El CMD lo define docker-compose
--- a/app.py
+++ b/app.py
@ -236,6 +236,7 @@ def home():
    total_results = 0
    total_pages = 0
    tags_por_trad = {}
    try:
        with get_conn() as conn:
@ -265,6 +266,19 @@ def home():
                cursor.execute(sql_page, tuple(params_page))
                noticias = cursor.fetchall()
                # Cargar tags por traducción (si aplica)
                tr_ids = [row['traduccion_id'] for row in noticias if row.get('traduccion_id')]
                if tr_ids:
                    cursor.execute("""
                        SELECT tn.traduccion_id, tg.valor, tg.tipo
                        FROM tags_noticia tn
                        JOIN tags tg ON tg.id = tn.tag_id
                        WHERE tn.traduccion_id = ANY(%s)
                        ORDER BY tg.tipo, tg.valor
                    """, (tr_ids,))
                    for trid, valor, tipo in cursor.fetchall():
                        tags_por_trad.setdefault(trid, []).append((valor, tipo))
    except psycopg2.Error as db_err:
        app.logger.error(f"[DB ERROR] Al leer noticias: {db_err}", exc_info=True)
        flash("Error de base de datos al cargar las noticias.", "error")
@ -274,7 +288,8 @@ def home():
        cat_id=int(cat_id) if cat_id else None, cont_id=int(cont_id) if cont_id else None,
        pais_id=int(pais_id) if pais_id else None, fecha_filtro=fecha_filtro, q=q,
        page=page, per_page=per_page, total_pages=total_pages, total_results=total_results,
-        lang=lang, use_tr=use_tr
+        lang=lang, use_tr=use_tr,
        tags_por_trad=tags_por_trad
    )
    if request.headers.get('X-Requested-With') == 'XMLHttpRequest':
@ -326,19 +341,31 @@ def noticia(tr_id):
@app.route("/dashboard")
 def dashboard():
    stats = {'feeds_totales': 0, 'noticias_totales': 0, 'feeds_caidos': 0}
    top_tags = []
    try:
        with get_conn() as conn:
-            with conn.cursor() as cursor:
+            # Usamos DictCursor aquí para poder usar t.valor / t.tipo / t.apariciones en Jinja
            with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cursor:
                cursor.execute("SELECT COUNT(*) FROM feeds")
                stats['feeds_totales'] = cursor.fetchone()[0]
                cursor.execute("SELECT COUNT(*) FROM noticias")
                stats['noticias_totales'] = cursor.fetchone()[0]
                cursor.execute("SELECT COUNT(*) FROM feeds WHERE activo = FALSE")
                stats['feeds_caidos'] = cursor.fetchone()[0]
                cursor.execute("""
                  SELECT valor, tipo, apariciones
                  FROM v_tag_counts_24h
                  ORDER BY apariciones DESC, valor
                  LIMIT 20
                """)
                top_tags = cursor.fetchall()
    except psycopg2.Error as db_err:
        app.logger.error(f"[DB ERROR] Al calcular estadísticas: {db_err}")
        flash("Error al conectar con la base de datos.", "error")
-    return render_template("dashboard.html", stats=stats)
+    return render_template("dashboard.html", stats=stats, top_tags=top_tags)
@app.route("/feeds/manage")
 def manage_feeds():
--- a/docker-compose.yml
+++ b/docker-compose.yml
@ -20,7 +20,6 @@ services:
    build:
      context: .
      args:
        # Reutiliza Dockerfile con torch-cu121; la web no usa GPU.
        TORCH_CUDA: cu121
    container_name: rss_web
    command: gunicorn --bind 0.0.0.0:8000 --workers 3 app:app
@ -33,8 +32,6 @@ services:
      - DB_USER=${DB_USER}
      - DB_PASS=${DB_PASS}
      - SECRET_KEY=${SECRET_KEY}
      # UI opcional
      # - NEWS_PER_PAGE=20
      - WEB_TRANSLATED_DEFAULT=1
      - DEFAULT_LANG=es
      - TRANSLATION_PREFERRED_LANGS=es
@ -66,51 +63,41 @@ services:
    build:
      context: .
      args:
-        TORCH_CUDA: cu121  # PyTorch con CUDA 12.1 en la imagen
+        TORCH_CUDA: cu121
    container_name: rss_translator
    command: python translation_worker.py
    environment:
      # --- DB ---
      - DB_HOST=db
      - DB_PORT=5432
      - DB_NAME=${DB_NAME}
      - DB_USER=${DB_USER}
      - DB_PASS=${DB_PASS}
      # --- Worker (ajustes estables VRAM) ---
      - TARGET_LANGS=es
-      - TRANSLATOR_BATCH=8           # cuántas filas toma por ciclo
+      - TRANSLATOR_BATCH=8
      - ENQUEUE=200
      - TRANSLATOR_SLEEP_IDLE=5
-      # Tokens (seguro para NLLB-1.3B; evita >1024)
+      - MAX_SRC_TOKENS=680
-      - MAX_SRC_TOKENS=680           # margen bajo el límite real del modelo
+      - MAX_NEW_TOKENS=400
      - MAX_NEW_TOKENS=400           # permite salidas más largas en cuerpos
      # Beams: mejor en títulos, eficiente en cuerpo
      - NUM_BEAMS_TITLE=2
      - NUM_BEAMS_BODY=1
      # Modelo NLLB 1.3B
      - UNIVERSAL_MODEL=facebook/nllb-200-1.3B
      # Chunking por frases (mejor coherencia en artículos largos)
      - CHUNK_BY_SENTENCES=True
-      - CHUNK_MAX_TOKENS=700         # <= MAX_SRC_TOKENS (con margen)
+      - CHUNK_MAX_TOKENS=700
-      - CHUNK_OVERLAP_SENTS=1        # solape de 1 frase para evitar cortes bruscos
+      - CHUNK_OVERLAP_SENTS=1
-      - CLEAN_ARTICLE=1              # limpia “The post…”, “Læs også…”, etc.
+      - CLEAN_ARTICLE=1
      # Dispositivo (usa GPU si hay; cae a CPU si hay OOM)
      - DEVICE=cuda
      # Rendimiento / estabilidad
      - PYTHONUNBUFFERED=1
      - HF_HOME=/root/.cache/huggingface
      - TOKENIZERS_PARALLELISM=false
      # Evita el assert del allocator de PyTorch
      - PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:64,garbage_collection_threshold:0.9
      # GPU (requiere NVIDIA Container Toolkit en el host)
      - NVIDIA_VISIBLE_DEVICES=all
      - NVIDIA_DRIVER_CAPABILITIES=compute,utility
    volumes:
@ -119,14 +106,27 @@ services:
      db:
        condition: service_healthy
    restart: always
    # Habilita GPU (Compose v2 + nvidia-container-toolkit)
    gpus: all
-    # Alternativa con 'deploy':
+
-    # deploy:
+  ner:
-    #   resources:
+    build:
-    #     reservations:
+      context: .
-    #       devices:
+      args:
-    #         - capabilities: [gpu]
+        TORCH_CUDA: cu121
    container_name: rss_ner
    command: python ner_worker.py
    environment:
      - DB_HOST=db
      - DB_PORT=5432
      - DB_NAME=${DB_NAME}
      - DB_USER=${DB_USER}
      - DB_PASS=${DB_PASS}
      - NER_LANG=es
      - NER_BATCH=64
    depends_on:
      db:
        condition: service_healthy
    restart: always
 volumes:
  postgres_data:
--- a/init-db/06-tags.sql
+++ b/init-db/06-tags.sql
@ -0,0 +1,24 @@
 -- init-db/06-tags.sql  (simple model compatible with ner_worker.py)
 -- Tags table: one row per distinct (valor, tipo) pair.
 CREATE TABLE IF NOT EXISTS tags (
  id    SERIAL PRIMARY KEY,
  valor TEXT NOT NULL,
  tipo  TEXT NOT NULL,         -- 'persona','organizacion','lugar', ...
  UNIQUE (valor, tipo)
 );
 -- Link table tag <-> translation. Rows are removed automatically when the
 -- referenced translation or tag is deleted (ON DELETE CASCADE).
 CREATE TABLE IF NOT EXISTS tags_noticia (
  id             SERIAL PRIMARY KEY,
  traduccion_id  INT NOT NULL REFERENCES traducciones(id) ON DELETE CASCADE,
  tag_id         INT NOT NULL REFERENCES tags(id) ON DELETE CASCADE,
  UNIQUE (traduccion_id, tag_id)
 );
 -- Helper indexes (idempotent thanks to IF NOT EXISTS)
 CREATE INDEX IF NOT EXISTS idx_tags_valor ON tags(valor);
 CREATE INDEX IF NOT EXISTS idx_tags_tipo  ON tags(tipo);
 CREATE INDEX IF NOT EXISTS idx_tags_noticia_trid ON tags_noticia(traduccion_id);
 CREATE INDEX IF NOT EXISTS idx_tags_noticia_tag  ON tags_noticia(tag_id);
--- a/init-db/07-tags-views.sql
+++ b/init-db/07-tags-views.sql
@ -0,0 +1,42 @@
 -- init-db/07-tags-views.sql
 -- "Top tags (24h)" view for the schema:
 --   tags(id, valor, tipo)
 --   tags_noticia(id, traduccion_id, tag_id)
 --   traducciones(id, noticia_id, lang_to, status, ...)
 --   noticias(id, fecha, ...)
 -- Each output row is one tag with `apariciones` = number of tags_noticia
 -- links whose translation is finished (status 'done'), targets Spanish
 -- (lang_to 'es') and whose news item is dated within the last 24 hours.
 CREATE OR REPLACE VIEW public.v_tag_counts_24h AS
 SELECT
  tg.id,
  tg.valor,
  tg.tipo,
  COUNT(*) AS apariciones
 FROM public.tags tg
 JOIN public.tags_noticia tn ON tn.tag_id = tg.id
 JOIN public.traducciones t   ON t.id = tn.traduccion_id
 JOIN public.noticias n       ON n.id = t.noticia_id
 WHERE t.status = 'done'
  AND t.lang_to = 'es'
  AND n.fecha >= now() - INTERVAL '24 hours'
 GROUP BY tg.id, tg.valor, tg.tipo
 ORDER BY apariciones DESC, tg.valor;
 -- Recommended indexes to speed up the view (idempotent).
 CREATE INDEX IF NOT EXISTS idx_noticias_fecha
  ON public.noticias (fecha);
 CREATE INDEX IF NOT EXISTS idx_traducciones_noticia_lang_status
  ON public.traducciones (noticia_id, lang_to, status);
 -- Uses the same index name as init-db/06-tags.sql so that, when that script
 -- has already run, this statement is a no-op instead of building a second,
 -- identical index on tags_noticia(traduccion_id) under a different name.
 CREATE INDEX IF NOT EXISTS idx_tags_noticia_trid
  ON public.tags_noticia (traduccion_id);
 CREATE INDEX IF NOT EXISTS idx_tags_noticia_tag
  ON public.tags_noticia (tag_id);
 -- (Optional if they already exist, but they help ad hoc searches)
 CREATE INDEX IF NOT EXISTS idx_tags_valor
  ON public.tags (valor);
 CREATE INDEX IF NOT EXISTS idx_tags_tipo
  ON public.tags (tipo);
--- a/ner_worker.py
+++ b/ner_worker.py
@ -0,0 +1,144 @@
 import os
 import time
 import logging
 import re
 import psycopg2
 import psycopg2.extras
 import spacy
 logging.basicConfig(level=logging.INFO, format='[%(asctime)s] %(levelname)s: %(message)s')
 # Connection settings, read from the environment with local-development defaults.
 DB = dict(
    host=os.environ.get("DB_HOST", "localhost"),
    port=int(os.environ.get("DB_PORT", 5432)),
    dbname=os.environ.get("DB_NAME", "rss"),
    user=os.environ.get("DB_USER", "rss"),
    password=os.environ.get("DB_PASS", "x"),
 )
 # Target language of the translations we are going to tag
 NER_LANG = os.environ.get("NER_LANG", "es").strip().lower()
 # Number of translations processed per iteration
 BATCH = int(os.environ.get("NER_BATCH", 64))
 # Mapping from spaCy entity labels to the tag types used in our schema.
 # spaCy's Spanish news pipelines label people as "PER" (their label set is
 # LOC / MISC / ORG / PER), so "PER" must be mapped or no person is ever
 # tagged. "PERSON" and "GPE" are kept for pipelines that use the
 # OntoNotes-style label names.
 ENT_LABELS = {
    "PER": "persona",
    "PERSON": "persona",
    "ORG": "organizacion",
    "GPE": "lugar",
    "LOC": "lugar",
 }
 # Matches any run of whitespace; used to normalise tag values.
 _ws_re = re.compile(r"\s+")
 def _clean_value(s: str) -> str:
    """Return *s* trimmed, with every internal whitespace run collapsed to one space.

    Falsy input (``None`` or an empty string) yields an empty string.
    """
    return _ws_re.sub(" ", s.strip()) if s else ""
 def get_conn():
    """Open and return a new psycopg2 connection built from the module-level DB settings."""
    connection = psycopg2.connect(**DB)
    return connection
 def _extract_entities(doc):
    """Return the set of (valor, tipo) pairs for the entities of a spaCy doc.

    Entities whose label is not in ENT_LABELS, or whose cleaned text is
    shorter than two characters, are dropped.
    """
    found = set()
    for ent in doc.ents:
        tipo = ENT_LABELS.get(ent.label_)
        if not tipo:
            continue
        val = _clean_value(ent.text)
        # simple filter: single-character values are noise
        if len(val) < 2:
            continue
        found.add((val, tipo))
    return found
 def _store_tags(cur, traduccion_id, ents):
    """Upsert every (valor, tipo) tag and link it to *traduccion_id*.

    Requires UNIQUE(valor, tipo) on 'tags' and UNIQUE(traduccion_id, tag_id)
    on 'tags_noticia'. Each tag runs inside its own savepoint: a failed
    statement would otherwise leave the whole transaction aborted, making
    every later statement of the batch fail as well.

    Returns the number of rows newly inserted into tags_noticia.
    """
    new_links = 0
    for valor, tipo in ents:
        cur.execute("SAVEPOINT ner_tag")
        try:
            # The no-op DO UPDATE makes RETURNING yield the id both for a
            # fresh insert and for an already existing tag.
            cur.execute(
                """
                INSERT INTO tags (valor, tipo)
                VALUES (%s, %s)
                ON CONFLICT (valor, tipo)
                DO UPDATE SET valor = EXCLUDED.valor
                RETURNING id
                """,
                (valor, tipo),
            )
            tag_id = cur.fetchone()[0]
            # Create the link; an existing one (UNIQUE) is silently ignored
            cur.execute(
                """
                INSERT INTO tags_noticia (traduccion_id, tag_id)
                VALUES (%s, %s)
                ON CONFLICT DO NOTHING
                """,
                (traduccion_id, tag_id),
            )
        except Exception:
            # Do not abort the batch because of a single odd value: log it and
            # undo only this tag's statements.
            logging.exception("Fallo insertando tag/relación")
            cur.execute("ROLLBACK TO SAVEPOINT ner_tag")
        else:
            if cur.rowcount > 0:
                new_links += 1
        cur.execute("RELEASE SAVEPOINT ner_tag")
    return new_links
 def main():
    """Run the NER worker loop forever.

    Each iteration opens a connection, picks up to BATCH translations with
    status 'done' and lang_to = NER_LANG that have no row in tags_noticia yet,
    extracts entities from title + summary with spaCy and stores them as tags.

    Translations that yield no usable entity never get a tags_noticia row, so
    their ids are remembered in memory and excluded from later queries;
    otherwise the same rows would be selected again on every iteration and
    older translations would never be reached. That memory is lost on restart
    and grows with the number of entity-less translations.

    The loop sleeps 5 seconds when there is nothing to do, when a batch made
    no progress, or after an unexpected error.
    """
    # Note: Spanish is assumed because the Dockerfile installs es_core_news_md.
    # To support more languages, install the matching model and add a mapping.
    nlp = spacy.load("es_core_news_md", disable=["parser", "lemmatizer", "textcat"])
    logging.info("spaCy cargado: es_core_news_md")
    no_entity_ids = set()
    while True:
        try:
            conn = get_conn()
            try:
                with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
                    # 'done' translations into NER_LANG that still have no
                    # relation in tags_noticia and are not known to be empty
                    cur.execute(
                        """
                        WITH pend AS (
                          SELECT t.id, t.titulo_trad, t.resumen_trad
                          FROM traducciones t
                          LEFT JOIN tags_noticia tn ON tn.traduccion_id = t.id
                          WHERE t.status = 'done'
                            AND t.lang_to = %s
                            AND NOT (t.id = ANY(%s))
                          GROUP BY t.id, t.titulo_trad, t.resumen_trad
                          HAVING COUNT(tn.tag_id) = 0
                          ORDER BY t.id DESC
                          LIMIT %s
                        )
                        SELECT * FROM pend;
                        """,
                        (NER_LANG, list(no_entity_ids), BATCH),
                    )
                    rows = cur.fetchall()
                    new_links = 0
                    newly_skipped = 0
                    if rows:
                        logging.info(f"Procesando {len(rows)} traducciones para NER...")
                    for r in rows:
                        text = f"{r['titulo_trad'] or ''}\n{r['resumen_trad'] or ''}".strip()
                        ents = _extract_entities(nlp(text)) if text else set()
                        if not ents:
                            no_entity_ids.add(r["id"])
                            newly_skipped += 1
                            continue
                        new_links += _store_tags(cur, r["id"], ents)
                conn.commit()
            finally:
                # psycopg2's connection context manager only ends the
                # transaction; the connection must be closed explicitly.
                conn.close()
            if not rows:
                time.sleep(5)
                continue
            logging.info(f"NER lote OK. Nuevos enlaces: {new_links}.")
            if new_links == 0 and newly_skipped == 0:
                # Every insert of the batch failed: back off instead of
                # retrying the same rows in a tight loop.
                time.sleep(5)
        except Exception as e:
            logging.exception(f"Error en NER loop: {e}")
            time.sleep(5)
 # Start the worker loop only when this file is run as a script, not on import.
 if __name__ == "__main__":
    main()
--- a/requirements.txt
+++ b/requirements.txt
@ -15,6 +15,7 @@ transformers==4.43.3
 sentencepiece==0.2.0
 sacremoses==0.1.1
 accelerate==0.33.0
 spacy>=3.7,<4.0
 # Nota: PyTorch (torch) NO se fija aquí.
 # Se instala en el Dockerfile con la wheel adecuada de CUDA (cu121) para tu GPU.
--- a/templates/_noticias_list.html
+++ b/templates/_noticias_list.html
@ -53,6 +53,16 @@
                <button class="ver-mas-btn" type="button">Ver más</button>
              {% endif %}
            </div>
            {# === Chips de tags para la TRADUCCIÓN (si existen) === #}
            {% set chips = (tags_por_trad.get(n.traduccion_id) if (n.traduccion_id and tags_por_trad) else None) %}
            {% if chips %}
              <div class="noticia-tags" style="margin-top:8px;" aria-label="Etiquetas">
                {% for valor, tipo in chips %}
                  <span class="badge" title="{{ (tipo or '')|capitalize }}">{{ valor }}</span>
                {% endfor %}
              </div>
            {% endif %}
          </div>
          <div class="tab-panel {% if not (use_tr and n.tiene_traduccion) %}active{% endif %}" data-panel="orig">
--- a/templates/dashboard.html
+++ b/templates/dashboard.html
@ -40,7 +40,7 @@
            <div class="card-body">
                <p>Exporta tu lista de fuentes URL o restaura/importa desde un archivo CSV.</p>
                <a href="{{ url_for('backup_urls') }}" class="btn"><i class="fas fa-download"></i> Exportar URLs</a>
-                <a href="{{ url_for('restore_urls') }}" class="btn btn-info"><i class="fas fa-upload"></i> Importar URLs</a>
+                <a href="{{ url_for('restore_urls') }}" class="btn btn-info"><i class="fas fa-upload"></i> Importar Fuentes URL</a>
            </div>
        </div>
    </div>
@ -55,4 +55,48 @@
        <a href="{{ url_for('backup_completo') }}" class="btn btn-secondary"><i class="fas fa-archive"></i> Backup Completo (.zip)</a>
    </div>
 </div>
 {% if top_tags and top_tags|length > 0 %}
 <div class="card">
    <div class="card-header">
        <h3>Top tags (últimas 24h)</h3>
    </div>
    <div class="card-body" style="padding:0;">
        <table style="width:100%; border-collapse: collapse;">
            <thead>
                <tr style="background-color: rgba(0,0,0,0.05);">
                    <th style="padding: 12px 15px; text-align: left;">Tag</th>
                    <th style="padding: 12px 15px; text-align: left;">Tipo</th>
                    <th style="padding: 12px 15px; text-align: right;">Apariciones</th>
                </tr>
            </thead>
            <tbody>
                {% for t in top_tags %}
                <tr>
                    <td style="padding: 12px 15px; border-top: 1px solid var(--border-color);">
                        {{ t.valor }}
                    </td>
                    <td style="padding: 12px 15px; border-top: 1px solid var(--border-color); text-transform: capitalize;">
                        {{ t.tipo }}
                    </td>
                    <td style="padding: 12px 15px; border-top: 1px solid var(--border-color); text-align: right;">
                        {{ t.apariciones }}
                    </td>
                </tr>
                {% endfor %}
            </tbody>
        </table>
    </div>
 </div>
 {% else %}
 <div class="card">
    <div class="card-header">
        <h3>Top tags (últimas 24h)</h3>
    </div>
    <div class="card-body">
        <p style="color: var(--text-color-light); margin: 0;">No hay tags para mostrar todavía.</p>
    </div>
 </div>
 {% endif %}
 {% endblock %}