ranking y tags

2025-10-17 04:10:09 +02:00 · 2025-10-17 04:10:09 +02:00 · d29152a0f6
commit d29152a0f6
parent 0bfeb610a9
9 changed files with 344 additions and 48 deletions
--- a/38
+++ b/38
@ -1,12 +1,10 @@
-# Usa una imagen base de Python ligera y moderna
+# Imagen base Python
 FROM python:3.11-slim

-# Permite elegir CPU o CUDA en build:
-#  - TORCH_CUDA=cpu   -> instalar torch CPU
-#  - TORCH_CUDA=cu121 -> instalar torch con CUDA 12.1
-ARG TORCH_CUDA=cpu
+# Por defecto construimos para CUDA 12.1 (cu121)
+# Si alguna vez quisieras CPU, pásale: --build-arg TORCH_CUDA=cpu
+ARG TORCH_CUDA=cu121

-# Establece el directorio de trabajo dentro del contenedor
 WORKDIR /app

 # Paquetes nativos necesarios
@ -16,30 +14,36 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
    git \
    && rm -rf /var/lib/apt/lists/*

-# Copia requirements primero para aprovechar caché
+# Requerimientos
 COPY requirements.txt .

-# Instala dependencias Python "comunes"
-RUN pip install --no-cache-dir -r requirements.txt
+# Actualiza pip y herramientas base
+RUN python -m pip install --no-cache-dir --upgrade pip setuptools wheel

-# Instala PyTorch según ARG (CPU o CUDA 12.1)
-# (Versión de ejemplo; puedes alinear con tu stack)
+# Instala PyTorch con el runtime CUDA 12.1 (o CPU si TORCH_CUDA=cpu)
 RUN if [ "$TORCH_CUDA" = "cu121" ]; then \
      pip install --no-cache-dir --index-url https://download.pytorch.org/whl/cu121 \
        torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1; \
    else \
-      pip install --no-cache-dir torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1; \
+      pip install --no-cache-dir \
+        torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1; \
    fi

-# Copia el resto del código
+# Instala el resto de dependencias de tu app
+RUN pip install --no-cache-dir -r requirements.txt
+
+# spaCy + modelo español (para el NER que quieres después)
+RUN pip install --no-cache-dir "spacy>=3.7,<4.0" \
+ && python -m spacy download es_core_news_md
+
+# Copia el código
 COPY . .

-# Descarga recursos NLTK si tu app los necesita
-# (si no los usas, comenta esta línea)
+# (Opcional) descarga recursos NLTK si tu app los usa; si no, déjalo como no-op
 RUN python download_models.py || true

-# Expone el puerto de Gunicorn (servicio web)
+# Puerto que usará gunicorn en el servicio web
 EXPOSE 8000

-# El CMD lo define docker-compose para cada servicio
+# El CMD lo define docker-compose

--- a/app.py
+++ b/app.py
@ -236,6 +236,7 @@ def home():

    total_results = 0
    total_pages = 0
+    tags_por_trad = {}

    try:
        with get_conn() as conn:
@ -265,6 +266,19 @@ def home():
                cursor.execute(sql_page, tuple(params_page))
                noticias = cursor.fetchall()

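+                # Load the tags attached to the translations shown on this page (if any).
+                # NOTE(review): the rows in `noticias` are read as mappings (row['traduccion_id'],
+                # row.get), while the tag rows below are tuple-unpacked. That works with DictCursor,
+                # but a RealDictCursor would unpack to the column names instead. Confirm cursor type.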
+                tr_ids = [row['traduccion_id'] for row in noticias if row.get('traduccion_id')]
+                if tr_ids:
+                    cursor.execute("""
+                        SELECT tn.traduccion_id, tg.valor, tg.tipo
+                        FROM tags_noticia tn
+                        JOIN tags tg ON tg.id = tn.tag_id
+                        WHERE tn.traduccion_id = ANY(%s)
+                        ORDER BY tg.tipo, tg.valor
+                    """, (tr_ids,))
+                    for trid, valor, tipo in cursor.fetchall():
+                        tags_por_trad.setdefault(trid, []).append((valor, tipo))
+
    except psycopg2.Error as db_err:
        app.logger.error(f"[DB ERROR] Al leer noticias: {db_err}", exc_info=True)
        flash("Error de base de datos al cargar las noticias.", "error")
@ -274,7 +288,8 @@ def home():
        cat_id=int(cat_id) if cat_id else None, cont_id=int(cont_id) if cont_id else None,
        pais_id=int(pais_id) if pais_id else None, fecha_filtro=fecha_filtro, q=q,
        page=page, per_page=per_page, total_pages=total_pages, total_results=total_results,
-        lang=lang, use_tr=use_tr
+        lang=lang, use_tr=use_tr,
+        tags_por_trad=tags_por_trad
    )

    if request.headers.get('X-Requested-With') == 'XMLHttpRequest':
@ -326,19 +341,31 @@ def noticia(tr_id):
@app.route("/dashboard")
 def dashboard():
+    # Renders dashboard.html with three counters (feeds, news items, inactive feeds) and the
+    # 20 most frequent tags read from the v_tag_counts_24h view. If the database raises,
+    # the error is logged and flashed and the page is rendered with the defaults set below.
    stats = {'feeds_totales': 0, 'noticias_totales': 0, 'feeds_caidos': 0}
+    top_tags = []
+
    try:
        with get_conn() as conn:
-            with conn.cursor() as cursor:
+            # DictCursor is used here so the template can read t.valor / t.tipo / t.apariciones;
+            # its rows still accept positional access, so fetchone()[0] below keeps working.
+            with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cursor:
                cursor.execute("SELECT COUNT(*) FROM feeds")
                stats['feeds_totales'] = cursor.fetchone()[0]
                cursor.execute("SELECT COUNT(*) FROM noticias")
                stats['noticias_totales'] = cursor.fetchone()[0]
                cursor.execute("SELECT COUNT(*) FROM feeds WHERE activo = FALSE")
                stats['feeds_caidos'] = cursor.fetchone()[0]
+
+                cursor.execute("""
+                  SELECT valor, tipo, apariciones
+                  FROM v_tag_counts_24h
+                  ORDER BY apariciones DESC, valor
+                  LIMIT 20
+                """)
+                top_tags = cursor.fetchall()
+
    except psycopg2.Error as db_err:
        app.logger.error(f"[DB ERROR] Al calcular estadísticas: {db_err}")
        flash("Error al conectar con la base de datos.", "error")
-    return render_template("dashboard.html", stats=stats)
+    return render_template("dashboard.html", stats=stats, top_tags=top_tags)

@app.route("/feeds/manage")
 def manage_feeds():
--- a/docker-compose.yml
+++ b/docker-compose.yml
@ -20,7 +20,6 @@ services:
    build:
      context: .
      args:
-        # Reutiliza Dockerfile con torch-cu121; la web no usa GPU.
        TORCH_CUDA: cu121
    container_name: rss_web
    command: gunicorn --bind 0.0.0.0:8000 --workers 3 app:app
@ -33,8 +32,6 @@ services:
      - DB_USER=${DB_USER}
      - DB_PASS=${DB_PASS}
      - SECRET_KEY=${SECRET_KEY}
-      # UI opcional
-      # - NEWS_PER_PAGE=20
      - WEB_TRANSLATED_DEFAULT=1
      - DEFAULT_LANG=es
      - TRANSLATION_PREFERRED_LANGS=es
@ -66,51 +63,41 @@ services:
    build:
      context: .
      args:
-        TORCH_CUDA: cu121  # PyTorch con CUDA 12.1 en la imagen
+        TORCH_CUDA: cu121
    container_name: rss_translator
    command: python translation_worker.py
    environment:
-      # --- DB ---
      - DB_HOST=db
      - DB_PORT=5432
      - DB_NAME=${DB_NAME}
      - DB_USER=${DB_USER}
      - DB_PASS=${DB_PASS}

-      # --- Worker (ajustes estables VRAM) ---
      - TARGET_LANGS=es
-      - TRANSLATOR_BATCH=8           # cuántas filas toma por ciclo
+      - TRANSLATOR_BATCH=8
      - ENQUEUE=200
      - TRANSLATOR_SLEEP_IDLE=5

-      # Tokens (seguro para NLLB-1.3B; evita >1024)
-      - MAX_SRC_TOKENS=680           # margen bajo el límite real del modelo
-      - MAX_NEW_TOKENS=400           # permite salidas más largas en cuerpos
+      - MAX_SRC_TOKENS=680
+      - MAX_NEW_TOKENS=400

-      # Beams: mejor en títulos, eficiente en cuerpo
      - NUM_BEAMS_TITLE=2
      - NUM_BEAMS_BODY=1

-      # Modelo NLLB 1.3B
      - UNIVERSAL_MODEL=facebook/nllb-200-1.3B

-      # Chunking por frases (mejor coherencia en artículos largos)
      - CHUNK_BY_SENTENCES=True
-      - CHUNK_MAX_TOKENS=700         # <= MAX_SRC_TOKENS (con margen)
-      - CHUNK_OVERLAP_SENTS=1        # solape de 1 frase para evitar cortes bruscos
-      - CLEAN_ARTICLE=1              # limpia “The post…”, “Læs også…”, etc.
+      - CHUNK_MAX_TOKENS=700
+      - CHUNK_OVERLAP_SENTS=1
+      - CLEAN_ARTICLE=1

-      # Dispositivo (usa GPU si hay; cae a CPU si hay OOM)
      - DEVICE=cuda

-      # Rendimiento / estabilidad
      - PYTHONUNBUFFERED=1
      - HF_HOME=/root/.cache/huggingface
      - TOKENIZERS_PARALLELISM=false
-      # Evita el assert del allocator de PyTorch
      - PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:64,garbage_collection_threshold:0.9

-      # GPU (requiere NVIDIA Container Toolkit en el host)
      - NVIDIA_VISIBLE_DEVICES=all
      - NVIDIA_DRIVER_CAPABILITIES=compute,utility
    volumes:
@ -119,14 +106,27 @@ services:
      db:
        condition: service_healthy
    restart: always
-    # Habilita GPU (Compose v2 + nvidia-container-toolkit)
    gpus: all
-    # Alternativa con 'deploy':
-    # deploy:
-    #   resources:
-    #     reservations:
-    #       devices:
-    #         - capabilities: [gpu]
+
+  # NER worker: runs ner_worker.py, which tags finished translations with named entities.
+  ner:
+    build:
+      context: .
+      args:
+        TORCH_CUDA: cu121
+    container_name: rss_ner
+    command: python ner_worker.py
+    environment:
+      # Database connection settings read by ner_worker.py.
+      - DB_HOST=db
+      - DB_PORT=5432
+      - DB_NAME=${DB_NAME}
+      - DB_USER=${DB_USER}
+      - DB_PASS=${DB_PASS}
+      # NER_LANG: lang_to of the translations to tag; NER_BATCH: translations fetched per iteration.
+      - NER_LANG=es
+      - NER_BATCH=64
+    # Start only once the db service reports healthy.
+    depends_on:
+      db:
+        condition: service_healthy
+    restart: always

 volumes:
  postgres_data:
--- a/init-db/06-tags.sql
+++ b/init-db/06-tags.sql
@ -0,0 +1,24 @@
+-- init-db/06-tags.sql  (simple model compatible with ner_worker.py)
+
+-- Tags table: one row per distinct (valor, tipo) pair.
+-- ner_worker.py relies on UNIQUE (valor, tipo) for its ON CONFLICT upsert.
+CREATE TABLE IF NOT EXISTS tags (
+  id    SERIAL PRIMARY KEY,
+  valor TEXT NOT NULL,
+  tipo  TEXT NOT NULL,         -- 'persona','organizacion','lugar', ...
+  UNIQUE (valor, tipo)
+);
+
+-- Link table tag <-> translation; a given tag is linked at most once per translation.
+-- Rows are removed automatically when either the translation or the tag is deleted.
+CREATE TABLE IF NOT EXISTS tags_noticia (
+  id             SERIAL PRIMARY KEY,
+  traduccion_id  INT NOT NULL REFERENCES traducciones(id) ON DELETE CASCADE,
+  tag_id         INT NOT NULL REFERENCES tags(id) ON DELETE CASCADE,
+  UNIQUE (traduccion_id, tag_id)
+);
+
+-- Supporting indexes (idempotent)
+CREATE INDEX IF NOT EXISTS idx_tags_valor ON tags(valor);
+CREATE INDEX IF NOT EXISTS idx_tags_tipo  ON tags(tipo);
+CREATE INDEX IF NOT EXISTS idx_tags_noticia_trid ON tags_noticia(traduccion_id);
+CREATE INDEX IF NOT EXISTS idx_tags_noticia_tag  ON tags_noticia(tag_id);
+
--- a/init-db/07-tags-views.sql
+++ b/init-db/07-tags-views.sql
@ -0,0 +1,42 @@
+-- init-db/07-tags-views.sql
+-- "Top tags (last 24h)" view for the schema:
+--   tags(id, valor, tipo)
+--   tags_noticia(id, traduccion_id, tag_id)
+--   traducciones(id, noticia_id, lang_to, status, ...)
+--   noticias(id, fecha, ...)
+
+-- One row per tag with the number of finished ('done') Spanish translations,
+-- belonging to news items dated within the last 24 hours, that carry the tag.
+CREATE OR REPLACE VIEW public.v_tag_counts_24h AS
+SELECT
+  tg.id,
+  tg.valor,
+  tg.tipo,
+  COUNT(*) AS apariciones
+FROM public.tags tg
+JOIN public.tags_noticia tn ON tn.tag_id = tg.id
+JOIN public.traducciones t   ON t.id = tn.traduccion_id
+JOIN public.noticias n       ON n.id = t.noticia_id
+WHERE t.status = 'done'
+  AND t.lang_to = 'es'
+  AND n.fecha >= now() - INTERVAL '24 hours'
+GROUP BY tg.id, tg.valor, tg.tipo
+ORDER BY apariciones DESC, tg.valor;
+
+-- Recommended indexes to speed up the view (idempotent)
+CREATE INDEX IF NOT EXISTS idx_noticias_fecha
+  ON public.noticias (fecha);
+
+CREATE INDEX IF NOT EXISTS idx_traducciones_noticia_lang_status
+  ON public.traducciones (noticia_id, lang_to, status);
+
+-- Deliberately the same index name as in 06-tags.sql: IF NOT EXISTS matches on
+-- the index NAME, so a different name here would build a second, identical
+-- index on tags_noticia(traduccion_id) and slow down every insert for nothing.
+CREATE INDEX IF NOT EXISTS idx_tags_noticia_trid
+  ON public.tags_noticia (traduccion_id);
+
+CREATE INDEX IF NOT EXISTS idx_tags_noticia_tag
+  ON public.tags_noticia (tag_id);
+
+-- (Optional if they already exist, but they help ad hoc searches)
+CREATE INDEX IF NOT EXISTS idx_tags_valor
+  ON public.tags (valor);
+CREATE INDEX IF NOT EXISTS idx_tags_tipo
+  ON public.tags (tipo);
+
--- a/ner_worker.py
+++ b/ner_worker.py
@ -0,0 +1,144 @@
+import os
+import time
+import logging
+import re
+import psycopg2
+import psycopg2.extras
+import spacy
+
+# Root logger: INFO level, lines formatted as "[time] LEVEL: message".
+logging.basicConfig(
+    format='[%(asctime)s] %(levelname)s: %(message)s',
+    level=logging.INFO,
+)
+
+# Keyword arguments for psycopg2.connect(), read from the environment with
+# fallbacks for when a variable is not set.
+DB = {
+    "host": os.getenv("DB_HOST", "localhost"),
+    "port": int(os.getenv("DB_PORT", 5432)),
+    "dbname": os.getenv("DB_NAME", "rss"),
+    "user": os.getenv("DB_USER", "rss"),
+    "password": os.getenv("DB_PASS", "x"),
+}
+
+# lang_to of the translations that get tagged (trimmed and lower-cased).
+NER_LANG = os.getenv("NER_LANG", "es").strip().lower()
+
+# Number of translations fetched and processed per loop iteration.
+BATCH = int(os.getenv("NER_BATCH", 64))
+
+# spaCy entity label -> tag type used in our schema.
+# The Spanish pipelines (es_core_news_*) label people as "PER" and places as
+# "LOC". "PERSON" and "GPE" are labels used by other spaCy models and are kept
+# so the mapping keeps working if the model is swapped.
+ENT_LABELS = {
+    "PER": "persona",
+    "PERSON": "persona",
+    "ORG": "organizacion",
+    "GPE": "lugar",
+    "LOC": "lugar",
+}
+
+# Matches any run of whitespace so it can be collapsed into a single space.
+_ws_re = re.compile(r"\s+")
+def _clean_value(s: str) -> str:
+    """Return *s* trimmed, with every internal whitespace run collapsed to one space.
+
+    Falsy input (``None`` or ``""``) yields an empty string.
+    """
+    trimmed = s.strip() if s else ""
+    return _ws_re.sub(" ", trimmed)
+
+def get_conn():
+    """Open and return a new psycopg2 connection built from the ``DB`` settings."""
+    connection = psycopg2.connect(**DB)
+    return connection
+
+def _extract_entities(nlp, text):
+    """Return the set of ``(valor, tipo)`` pairs found in *text*.
+
+    Only entities whose spaCy label is a key of ``ENT_LABELS`` are kept. Values
+    are normalised with ``_clean_value`` and dropped when shorter than two
+    characters. Returning a set removes repeated mentions.
+    """
+    found = set()
+    for ent in nlp(text).ents:
+        tipo = ENT_LABELS.get(ent.label_)
+        if not tipo:
+            continue
+        val = _clean_value(ent.text)
+        # Simple filter: ignore empty or one-character values.
+        if len(val) < 2:
+            continue
+        found.add((val, tipo))
+    return found
+
+def _store_tags(cur, traduccion_id, ents):
+    """Upsert every tag in *ents*, link it to *traduccion_id*, return the new link count.
+
+    Requires UNIQUE(valor, tipo) on ``tags`` and UNIQUE(traduccion_id, tag_id)
+    on ``tags_noticia``.
+
+    Each tag is written inside its own SAVEPOINT. Without it, the first failing
+    statement leaves the PostgreSQL transaction aborted, every later statement
+    in the batch fails too, and the final commit turns into a rollback.
+    """
+    new_links = 0
+    # Sorted so concurrent workers touch rows in the same order.
+    for valor, tipo in sorted(ents):
+        cur.execute("SAVEPOINT ner_tag")
+        try:
+            # The no-op DO UPDATE makes RETURNING yield the id for existing rows too.
+            cur.execute(
+                """
+                                INSERT INTO tags (valor, tipo)
+                                VALUES (%s, %s)
+                                ON CONFLICT (valor, tipo)
+                                DO UPDATE SET valor = EXCLUDED.valor
+                                RETURNING id
+                                """,
+                (valor, tipo),
+            )
+            tag_id = cur.fetchone()[0]
+            # Try to create the link; if it already exists (UNIQUE) it is ignored.
+            cur.execute(
+                """
+                                INSERT INTO tags_noticia (traduccion_id, tag_id)
+                                VALUES (%s, %s)
+                                ON CONFLICT DO NOTHING
+                                """,
+                (traduccion_id, tag_id),
+            )
+            created = cur.rowcount > 0
+        except Exception:
+            # Do not abort the batch because of one odd value: log it and undo
+            # only this tag so the transaction stays usable.
+            logging.exception("Fallo insertando tag/relación")
+            cur.execute("ROLLBACK TO SAVEPOINT ner_tag")
+        else:
+            cur.execute("RELEASE SAVEPOINT ner_tag")
+            if created:
+                new_links += 1
+    return new_links
+
+def _process_batch(nlp, skipped_ids):
+    """Tag one batch of pending translations; return how many rows were fetched.
+
+    Pending means: status 'done', lang_to == NER_LANG, no row in tags_noticia,
+    and not listed in *skipped_ids*. Translations that end the batch without
+    any link (empty text, no mapped entity, or every insert failed) are added
+    to *skipped_ids*. Otherwise the same newest rows would be selected again on
+    every iteration and older translations would never be reached.
+    *skipped_ids* lives in memory only, so such rows are retried once after a
+    restart, and the id list sent with each query grows with it.
+    """
+    conn = get_conn()
+    try:
+        # `with conn` commits on success and rolls back on error; it does NOT
+        # close the connection, hence the explicit close() in `finally`.
+        with conn, conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
+            cur.execute(
+                """
+                SELECT t.id, t.titulo_trad, t.resumen_trad
+                FROM traducciones t
+                WHERE t.status = 'done'
+                  AND t.lang_to = %s
+                  AND t.id <> ALL(%s::bigint[])
+                  AND NOT EXISTS (
+                    SELECT 1 FROM tags_noticia tn WHERE tn.traduccion_id = t.id
+                  )
+                ORDER BY t.id DESC
+                LIMIT %s
+                """,
+                (NER_LANG, list(skipped_ids), BATCH),
+            )
+            rows = cur.fetchall()
+            if not rows:
+                return 0
+
+            logging.info(f"Procesando {len(rows)} traducciones para NER...")
+
+            new_links = 0
+            for r in rows:
+                text = f"{r['titulo_trad'] or ''}\n{r['resumen_trad'] or ''}".strip()
+                ents = _extract_entities(nlp, text) if text else set()
+                created = _store_tags(cur, r["id"], ents) if ents else 0
+                if created == 0:
+                    skipped_ids.add(r["id"])
+                new_links += created
+
+        logging.info(f"NER lote OK. Nuevos enlaces: {new_links}.")
+        return len(rows)
+    finally:
+        conn.close()
+
+def main():
+    """Run the NER worker loop forever.
+
+    Loads the Spanish spaCy model once, then repeatedly tags batches of pending
+    translations. Sleeps five seconds when there is nothing to do or after an
+    error, with no connection or transaction held open during the sleep.
+    """
+    # Note: Spanish is assumed because the Dockerfile installs es_core_news_md.
+    # To support more languages, install the matching models and map NER_LANG to them.
+    nlp = spacy.load("es_core_news_md", disable=["parser", "lemmatizer", "textcat"])
+    logging.info("spaCy cargado: es_core_news_md")
+
+    # Ids of translations already processed in this run that produced no tag link.
+    skipped_ids = set()
+
+    while True:
+        try:
+            processed = _process_batch(nlp, skipped_ids)
+        except Exception as e:
+            logging.exception(f"Error en NER loop: {e}")
+            processed = 0
+        if not processed:
+            time.sleep(5)
+
+if __name__ == "__main__":
+    main()
+
--- a/requirements.txt
+++ b/requirements.txt
@ -15,6 +15,7 @@ transformers==4.43.3
 sentencepiece==0.2.0
 sacremoses==0.1.1
 accelerate==0.33.0
+spacy>=3.7,<4.0
 # Nota: PyTorch (torch) NO se fija aquí.
 # Se instala en el Dockerfile con la wheel adecuada de CUDA (cu121) para tu GPU.

--- a/templates/_noticias_list.html
+++ b/templates/_noticias_list.html
@ -53,6 +53,16 @@
                <button class="ver-mas-btn" type="button">Ver más</button>
              {% endif %}
            </div>
+
+            {# === Chips de tags para la TRADUCCIÓN (si existen) === #}
+            {% set chips = (tags_por_trad.get(n.traduccion_id) if (n.traduccion_id and tags_por_trad) else None) %}
+            {% if chips %}
+              <div class="noticia-tags" style="margin-top:8px;" aria-label="Etiquetas">
+                {% for valor, tipo in chips %}
+                  <span class="badge" title="{{ (tipo or '')|capitalize }}">{{ valor }}</span>
+                {% endfor %}
+              </div>
+            {% endif %}
          </div>

          <div class="tab-panel {% if not (use_tr and n.tiene_traduccion) %}active{% endif %}" data-panel="orig">
--- a/templates/dashboard.html
+++ b/templates/dashboard.html
@ -40,7 +40,7 @@
            <div class="card-body">
                <p>Exporta tu lista de fuentes URL o restaura/importa desde un archivo CSV.</p>
                <a href="{{ url_for('backup_urls') }}" class="btn"><i class="fas fa-download"></i> Exportar URLs</a>
-                <a href="{{ url_for('restore_urls') }}" class="btn btn-info"><i class="fas fa-upload"></i> Importar URLs</a>
+                <a href="{{ url_for('restore_urls') }}" class="btn btn-info"><i class="fas fa-upload"></i> Importar Fuentes URL</a>
            </div>
        </div>
    </div>
@ -55,4 +55,48 @@
        <a href="{{ url_for('backup_completo') }}" class="btn btn-secondary"><i class="fas fa-archive"></i> Backup Completo (.zip)</a>
    </div>
 </div>
+
+{% if top_tags and top_tags|length > 0 %}
+<div class="card">
+    <div class="card-header">
+        <h3>Top tags (últimas 24h)</h3>
+    </div>
+    <div class="card-body" style="padding:0;">
+        <table style="width:100%; border-collapse: collapse;">
+            <thead>
+                <tr style="background-color: rgba(0,0,0,0.05);">
+                    <th style="padding: 12px 15px; text-align: left;">Tag</th>
+                    <th style="padding: 12px 15px; text-align: left;">Tipo</th>
+                    <th style="padding: 12px 15px; text-align: right;">Apariciones</th>
+                </tr>
+            </thead>
+            <tbody>
+                {% for t in top_tags %}
+                <tr>
+                    <td style="padding: 12px 15px; border-top: 1px solid var(--border-color);">
+                        {{ t.valor }}
+                    </td>
+                    <td style="padding: 12px 15px; border-top: 1px solid var(--border-color); text-transform: capitalize;">
+                        {{ t.tipo }}
+                    </td>
+                    <td style="padding: 12px 15px; border-top: 1px solid var(--border-color); text-align: right;">
+                        {{ t.apariciones }}
+                    </td>
+                </tr>
+                {% endfor %}
+            </tbody>
+        </table>
+    </div>
+</div>
+{% else %}
+<div class="card">
+    <div class="card-header">
+        <h3>Top tags (últimas 24h)</h3>
+    </div>
+    <div class="card-body">
+        <p style="color: var(--text-color-light); margin: 0;">No hay tags para mostrar todavía.</p>
+    </div>
+</div>
+{% endif %}
 {% endblock %}
+