From a9c1e16bddff82b93dc9b2b40c54ca4422529c4f Mon Sep 17 00:00:00 2001 From: jlimolina Date: Sun, 12 Oct 2025 17:51:14 +0200 Subject: [PATCH] cambios en la web --- app.py | 101 +++++++++--------- docker-compose.yml | 32 +++--- requirements.txt | 16 +-- static/style.css | 39 +++---- templates/_noticias_list.html | 36 +++---- translation_worker.py | 190 +++++++++++++++++++++++++++++++--- 6 files changed, 283 insertions(+), 131 deletions(-) diff --git a/app.py b/app.py index c7f22bb..f5397c5 100644 --- a/app.py +++ b/app.py @@ -11,7 +11,7 @@ from contextlib import contextmanager from concurrent.futures import ThreadPoolExecutor, as_completed from tqdm import tqdm -from flask import Flask, render_template, request, redirect, url_for, Response, flash, make_response +from flask import Flask, render_template, request, redirect, url_for, Response, flash, make_response, abort import psycopg2 import psycopg2.extras import psycopg2.pool @@ -36,11 +36,7 @@ DB_CONFIG = { MAX_WORKERS = int(os.environ.get("RSS_MAX_WORKERS", 20)) SINGLE_FEED_TIMEOUT = int(os.environ.get("RSS_FEED_TIMEOUT", 30)) MAX_FALLOS = int(os.environ.get("RSS_MAX_FAILURES", 5)) - -# Tamaño de página configurable (límite en 10–100 por seguridad) NEWS_PER_PAGE = int(os.environ.get("NEWS_PER_PAGE", 20)) - -# Idioma/traducción por defecto DEFAULT_TRANSLATION_LANG = os.environ.get("DEFAULT_TRANSLATION_LANG", "es").strip().lower() DEFAULT_LANG = os.environ.get("DEFAULT_LANG", DEFAULT_TRANSLATION_LANG).strip().lower() WEB_TRANSLATED_DEFAULT = os.environ.get("WEB_TRANSLATED_DEFAULT", "1").strip().lower() in ("1", "true", "yes") @@ -81,8 +77,8 @@ def safe_html(text): return "" return bleach.clean( text, - tags={'a', 'b', 'strong', 'i', 'em', 'p', 'br'}, - attributes={'a': ['href', 'title']}, + tags={'a', 'b', 'strong', 'i', 'em', 'p', 'br', 'ul', 'ol', 'li', 'blockquote', 'h3', 'h4'}, + attributes={'a': ['href', 'title', 'rel', 'target']}, strip=True ) @@ -94,31 +90,18 @@ def _get_form_dependencies(cursor): return categorias, paises def _get_lang_and_flags(): - """ - Determina el idioma preferido y si se debe usar traducción por defecto. - Permite forzar original con ?orig=1 y cambiar idioma con ?lang=xx (se guarda en cookie). - """ qlang = request.args.get("lang", "").strip().lower() cookie_lang = (request.cookies.get("lang") or "").strip().lower() lang = qlang or cookie_lang or DEFAULT_LANG or "es" - force_orig = request.args.get("orig") == "1" use_translation = (not force_orig) and WEB_TRANSLATED_DEFAULT return lang, use_translation, bool(qlang) def _build_news_query(args, *, count=False, limit=None, offset=None, lang="es", use_translation=True): - """ - Construye la consulta SQL y los parámetros basados en los argumentos de la petición. - Si count=True => SELECT COUNT(*) - Si count=False => SELECT columnas con ORDER + LIMIT/OFFSET. - Integra traducciones vía LEFT JOIN LATERAL cuando use_translation=True (status='done', lang_to=lang). - """ - # Para controlar orden de parámetros según apariciones de %s: select_rank_params = [] from_params = [] where_params = [] tail_params = [] - conditions = [] q = args.get("q", "").strip() @@ -127,7 +110,6 @@ def _build_news_query(args, *, count=False, limit=None, offset=None, lang="es", pais_id = args.get("pais_id") fecha_filtro = args.get("fecha") - # FROM base sql_from = """ FROM noticias n LEFT JOIN categorias c ON n.categoria_id = c.id @@ -135,11 +117,10 @@ def _build_news_query(args, *, count=False, limit=None, offset=None, lang="es", LEFT JOIN continentes co ON p.continente_id = co.id """ - # LEFT JOIN LATERAL traducción (solo en SELECT de página; el conteo no la necesita) if (not count) and use_translation: sql_from += """ LEFT JOIN LATERAL ( - SELECT titulo_trad, resumen_trad + SELECT id AS traduccion_id, titulo_trad, resumen_trad FROM traducciones WHERE traducciones.noticia_id = n.id AND traducciones.lang_to = %s @@ -150,9 +131,7 @@ def _build_news_query(args, *, count=False, limit=None, offset=None, lang="es", """ from_params.append(lang) - # WHERE dinámico if q: - # Buscar por relevancia en el tsvector de la noticia original conditions.append("n.tsv @@ plainto_tsquery('spanish', %s)") where_params.append(q) @@ -178,24 +157,26 @@ def _build_news_query(args, *, count=False, limit=None, offset=None, lang="es", where_clause = " WHERE " + " AND ".join(conditions) if conditions else "" if count: - # Conteo total (sin necesidad de traducciones) sql_count = "SELECT COUNT(*) " + sql_from + where_clause - sql_params = from_params + where_params # from_params estará vacío en count + sql_params = from_params + where_params return sql_count, sql_params - # Selección de columnas para página if use_translation: select_cols = """ - SELECT n.fecha, - COALESCE(t.titulo_trad, n.titulo) AS titulo, - COALESCE(t.resumen_trad, n.resumen) AS resumen, + SELECT + COALESCE(t.traduccion_id, NULL) AS traduccion_id, + n.fecha, + COALESCE(t.titulo_trad, n.titulo) AS titulo, + COALESCE(t.resumen_trad, n.resumen) AS resumen, n.url, n.imagen_url, n.fuente_nombre, c.nombre AS categoria, p.nombre AS pais, co.nombre AS continente, (t.titulo_trad IS NOT NULL OR t.resumen_trad IS NOT NULL) AS usa_trad """ else: select_cols = """ - SELECT n.fecha, n.titulo, n.resumen, + SELECT + NULL::int AS traduccion_id, + n.fecha, n.titulo, n.resumen, n.url, n.imagen_url, n.fuente_nombre, c.nombre AS categoria, p.nombre AS pais, co.nombre AS continente, FALSE AS usa_trad @@ -204,7 +185,6 @@ def _build_news_query(args, *, count=False, limit=None, offset=None, lang="es", order_clause = " ORDER BY n.fecha DESC NULLS LAST" if q: - # Ranking por relevancia (primer placeholder) select_cols = select_cols.replace( "SELECT", "SELECT ts_rank(n.tsv, plainto_tsquery('spanish', %s)) AS rank," @@ -212,7 +192,6 @@ def _build_news_query(args, *, count=False, limit=None, offset=None, lang="es", select_rank_params.append(q) order_clause = " ORDER BY rank DESC, n.fecha DESC NULLS LAST" - # Paginación if limit is not None: order_clause += " LIMIT %s" tail_params.append(limit) @@ -228,20 +207,16 @@ def _build_news_query(args, *, count=False, limit=None, offset=None, lang="es", def home(): noticias, categorias, continentes, paises = [], [], [], [] - # Estado de filtros (para mantenerlos en la UI) q = request.args.get("q", "").strip() cat_id = request.args.get("categoria_id") cont_id = request.args.get("continente_id") pais_id = request.args.get("pais_id") fecha_filtro = request.args.get("fecha") - # Preferencias idioma/uso de traducción lang, use_tr, set_cookie = _get_lang_and_flags() - # Paginación page = request.args.get("page", default=1, type=int) per_page = request.args.get("per_page", default=NEWS_PER_PAGE, type=int) - # límites de seguridad if per_page is None or per_page <= 0: per_page = NEWS_PER_PAGE per_page = 100 if per_page > 100 else (10 if per_page < 10 else per_page) @@ -255,7 +230,6 @@ def home(): try: with get_conn() as conn: with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cursor: - # Dependencias de UI cursor.execute("SELECT id, nombre FROM categorias ORDER BY nombre") categorias = cursor.fetchall() cursor.execute("SELECT id, nombre FROM continentes ORDER BY nombre") @@ -263,7 +237,6 @@ def home(): cursor.execute("SELECT id, nombre, continente_id FROM paises ORDER BY nombre") paises = cursor.fetchall() - # 1) Conteo total (no requiere join de traducciones) sql_count, params_count = _build_news_query( request.args, count=True, lang=lang, use_translation=use_tr ) @@ -271,7 +244,6 @@ def home(): total_results = cursor.fetchone()[0] or 0 total_pages = math.ceil(total_results / per_page) if total_results else 0 - # 2) Página actual (con COALESCE a traducción si procede) sql_page, params_page = _build_news_query( request.args, count=False, @@ -295,20 +267,52 @@ def home(): lang=lang, use_tr=use_tr ) - # Respuesta parcial para AJAX if request.headers.get('X-Requested-With') == 'XMLHttpRequest': resp = make_response(render_template('_noticias_list.html', **ctx)) if set_cookie: resp.set_cookie("lang", lang, max_age=60*60*24*365) return resp - # Render completo html = render_template("noticias.html", **ctx) resp = make_response(html) if set_cookie: resp.set_cookie("lang", lang, max_age=60*60*24*365) return resp +@app.get("/noticia/") +def noticia(tr_id): + with get_conn() as conn, conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: + cur.execute( + """ + SELECT + t.id, + n.id AS noticia_id, + n.fecha, + n.titulo AS titulo_original, + n.resumen AS cuerpo_original, + t.titulo_trad AS titulo_traducido, + t.resumen_trad AS cuerpo_traducido, + n.url AS fuente_url, + n.fuente_nombre, + p.nombre AS pais, + co.nombre AS continente, + c.nombre AS categoria, + t.lang_to, + t.status + FROM traducciones t + JOIN noticias n ON n.id = t.noticia_id + LEFT JOIN paises p ON n.pais_id = p.id + LEFT JOIN continentes co ON p.continente_id = co.id + LEFT JOIN categorias c ON n.categoria_id = c.id + WHERE t.id = %s + """, + (tr_id,) + ) + row = cur.fetchone() + if not row: + abort(404) + return render_template("noticia.html", r=row) + @app.route("/dashboard") def dashboard(): stats = {'feeds_totales': 0, 'noticias_totales': 0, 'feeds_caidos': 0} @@ -540,8 +544,6 @@ def fetch_and_store_all(): feeds_fallidos = [] feeds_exitosos = [] feeds_para_actualizar_headers = [] - - # --- Parte 1: Procesando Feeds RSS --- logging.info("=> Parte 1: Procesando Feeds RSS...") feeds_to_process = [] try: @@ -578,7 +580,6 @@ def fetch_and_store_all(): noticias_desde_rss_count = len(todas_las_noticias) logging.info(f"=> Parte 1 Finalizada. Noticias desde RSS: {noticias_desde_rss_count}. Éxitos: {len(feeds_exitosos)}. Fallos: {len(feeds_fallidos)}.") - # --- Parte 2: Procesando Fuentes URL --- logging.info("=> Parte 2: Procesando Fuentes URL...") urls_to_process = [] try: @@ -590,7 +591,6 @@ def fetch_and_store_all(): except Exception as e: logging.error(f"Error de BD al obtener fuentes URL: {e}") - # Paraleliza la captura desde newspaper3k if urls_to_process: with ThreadPoolExecutor(max_workers=10) as executor: future_to_url = { @@ -612,7 +612,6 @@ def fetch_and_store_all(): noticias_desde_urls_count = len(todas_las_noticias) - noticias_desde_rss_count logging.info(f"=> Parte 2 Finalizada. Noticias encontradas desde URLs: {noticias_desde_urls_count}.") - # --- Parte 3: Actualizando la base de datos --- logging.info("=> Parte 3: Actualizando la base de datos...") if not any([todas_las_noticias, feeds_fallidos, feeds_exitosos, feeds_para_actualizar_headers]): logging.info("No se encontraron nuevas noticias ni cambios en los feeds. Nada que actualizar.") @@ -655,8 +654,6 @@ def fetch_and_store_all(): logging.info("--- CICLO DE CAPTURA GLOBAL FINALIZADO ---") -# --- Funciones de Backup y Restore (sin cambios) --- - @app.route("/backup_feeds") def backup_feeds(): try: @@ -755,7 +752,6 @@ def backup_completo(): with zipfile.ZipFile(memory_buffer, 'w', zipfile.ZIP_DEFLATED) as zipf: with get_conn() as conn: with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cursor: - # Backup Feeds cursor.execute("SELECT * FROM feeds ORDER BY id") feeds_data = cursor.fetchall() if feeds_data: @@ -765,7 +761,6 @@ def backup_completo(): writer_feeds.writerows([dict(f) for f in feeds_data]) zipf.writestr("feeds.csv", output_feeds.getvalue()) - # Backup Fuentes URL cursor.execute("SELECT * FROM fuentes_url ORDER BY id") fuentes_data = cursor.fetchall() if fuentes_data: @@ -775,7 +770,6 @@ def backup_completo(): writer_fuentes.writerows([dict(f) for f in fuentes_data]) zipf.writestr("fuentes_url.csv", output_fuentes.getvalue()) - # Backup Noticias cursor.execute("SELECT * FROM noticias ORDER BY fecha DESC") noticias_data = cursor.fetchall() if noticias_data: @@ -892,7 +886,6 @@ def restore_urls(): return render_template("restore_urls.html") - if __name__ == "__main__": if not db_pool: app.logger.error("La aplicación no puede arrancar sin una conexión a la base de datos.") diff --git a/docker-compose.yml b/docker-compose.yml index d671866..f3bb0a8 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -20,7 +20,7 @@ services: build: context: . args: - # La imagen llevará torch-cu121 por reutilizar Dockerfile; web no usa GPU. + # Reutiliza Dockerfile con torch-cu121; la web no usa GPU. TORCH_CUDA: cu121 container_name: rss_web command: gunicorn --bind 0.0.0.0:8000 --workers 3 app:app @@ -33,9 +33,8 @@ services: - DB_USER=${DB_USER} - DB_PASS=${DB_PASS} - SECRET_KEY=${SECRET_KEY} - # Opcionales UI + # UI opcional # - NEWS_PER_PAGE=20 - # Mostrar traducciones por defecto en la web - WEB_TRANSLATED_DEFAULT=1 - DEFAULT_LANG=es - TRANSLATION_PREFERRED_LANGS=es @@ -78,31 +77,38 @@ services: - DB_USER=${DB_USER} - DB_PASS=${DB_PASS} - # --- Worker --- + # --- Worker (ajustes estables VRAM) --- - TARGET_LANGS=es - - TRANSLATOR_BATCH=4 # estable con 1.3B en 12 GB; ajusta si cambia la VRAM disponible + - TRANSLATOR_BATCH=8 # cuántas filas toma por ciclo - ENQUEUE=200 - TRANSLATOR_SLEEP_IDLE=5 - # Tokens (equilibrio calidad/VRAM ~<7GB) - - MAX_SRC_TOKENS=512 - - MAX_NEW_TOKENS=256 + # Tokens (seguro para NLLB-1.3B; evita >1024) + - MAX_SRC_TOKENS=680 # margen bajo el límite real del modelo + - MAX_NEW_TOKENS=400 # permite salidas más largas en cuerpos - # Beams: mejor título, cuerpo eficiente - - NUM_BEAMS_TITLE=3 - - NUM_BEAMS_BODY=2 + # Beams: mejor en títulos, eficiente en cuerpo + - NUM_BEAMS_TITLE=2 + - NUM_BEAMS_BODY=1 # Modelo NLLB 1.3B - UNIVERSAL_MODEL=facebook/nllb-200-1.3B - # Dispositivo (forzar GPU si está disponible; el worker cae a CPU si hay OOM) + # Chunking por frases (mejor coherencia en artículos largos) + - CHUNK_BY_SENTENCES=True + - CHUNK_MAX_TOKENS=700 # <= MAX_SRC_TOKENS (con margen) + - CHUNK_OVERLAP_SENTS=1 # solape de 1 frase para evitar cortes bruscos + - CLEAN_ARTICLE=1 # limpia “The post…”, “Læs også…”, etc. + + # Dispositivo (usa GPU si hay; cae a CPU si hay OOM) - DEVICE=cuda # Rendimiento / estabilidad - PYTHONUNBUFFERED=1 - HF_HOME=/root/.cache/huggingface - TOKENIZERS_PARALLELISM=false - - PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True,max_split_size_mb:64,garbage_collection_threshold:0.9 + # Evita el assert del allocator de PyTorch + - PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:64,garbage_collection_threshold:0.9 # GPU (requiere NVIDIA Container Toolkit en el host) - NVIDIA_VISIBLE_DEVICES=all diff --git a/requirements.txt b/requirements.txt index 6e8d558..33f37f8 100755 --- a/requirements.txt +++ b/requirements.txt @@ -4,15 +4,17 @@ APScheduler==3.10.4 psycopg2-binary==2.9.10 bleach==6.1.0 gunicorn==22.0.0 -waitress -tqdm -beautifulsoup4 -requests -newspaper3k -lxml-html-clean +waitress==2.1.2 +tqdm>=4.66 +beautifulsoup4>=4.12 +requests>=2.31 +newspaper3k==0.2.8 +lxml[html_clean]>=4.9.3 langdetect==1.0.9 transformers==4.43.3 sentencepiece==0.2.0 sacremoses==0.1.1 -torch==2.3.1 # CPU. Para GPU ver nota más abajo. accelerate==0.33.0 +# Nota: PyTorch (torch) NO se fija aquí. +# Se instala en el Dockerfile con la wheel adecuada de CUDA (cu121) para tu GPU. + diff --git a/static/style.css b/static/style.css index c36ba68..bfc869a 100644 --- a/static/style.css +++ b/static/style.css @@ -1,4 +1,3 @@ -/* --- Variables Globales de Diseño --- */ :root { --primary-color: #6a11cb; --secondary-color: #2575fc; @@ -14,7 +13,6 @@ --transition-speed: 0.3s; } -/* --- Estilos Base --- */ * { box-sizing: border-box; } body { font-family: 'Poppins', 'Segoe UI', Tahoma, sans-serif; @@ -26,7 +24,6 @@ body { font-weight: 400; } -/* --- Contenedor Principal con Efecto Vidrio --- */ .container { max-width: 900px; margin: 30px auto; @@ -39,19 +36,16 @@ body { -webkit-backdrop-filter: blur(12px); } -/* --- Encabezados y Títulos --- */ header { text-align: center; margin-bottom: 40px; border-bottom: 1px solid var(--border-color); padding-bottom: 30px; } h1 { font-size: 2.8rem; font-weight: 700; margin: 0 0 5px 0; background: var(--gradiente-principal); -webkit-background-clip: text; -webkit-text-fill-color: transparent; display: inline-block; } h2 { font-size: 1.8rem; font-weight: 600; color: var(--primary-color); margin-bottom: 20px; } .subtitle { color: var(--text-color-light); font-size: 1.1rem; margin-top: 5px; } -/* --- Formularios y Controles --- */ .form-section, .card { margin-bottom: 30px; background: rgba(255, 255, 255, 0.6); padding: 25px; border-radius: var(--border-radius-md); border: 1px solid var(--border-color); } label { display: block; margin-bottom: 8px; font-weight: 600; color: var(--text-color); font-size: 0.9rem; } select, input[type="text"], input[type="url"], input[type="file"], textarea { width: 100%; padding: 12px 15px; border: 1px solid var(--border-color); background-color: #f8f9fa; border-radius: var(--border-radius-sm); font-size: 1rem; font-family: 'Poppins', sans-serif; transition: all var(--transition-speed) ease; } select:focus, input:focus, textarea:focus { outline: none; border-color: var(--primary-color); box-shadow: 0 0 0 3px var(--shadow-color); background-color: white; } -/* --- Botones y Enlaces --- */ .btn, button { padding: 12px 25px; background: var(--gradiente-principal); color: white !important; border: none; border-radius: var(--border-radius-sm); font-size: 1rem; font-weight: 600; cursor: pointer; transition: all var(--transition-speed) ease; box-shadow: 0 4px 15px rgba(0, 0, 0, 0.1); text-decoration: none; display: inline-block; text-align: center; } .btn:hover, button:hover { transform: translateY(-3px); box-shadow: 0 6px 20px rgba(0, 0, 0, 0.2); text-decoration: none; } .btn-secondary { background: #34495e; } .btn-secondary:hover { background: #2c3e50; } @@ -62,7 +56,6 @@ a { color: var(--secondary-color); text-decoration: none; font-weight: 500; } a: .top-link { display: inline-block; margin-bottom: 25px; font-weight: 500; color: var(--primary-color); } .top-link:hover { text-decoration: underline; } -/* --- Estilos para la lista de noticias --- */ .noticias-list { list-style: none; padding: 0; margin: 0; } .noticia-item { display: flex; gap: 20px; padding: 20px 10px; border-bottom: 1px solid var(--border-color); transition: background-color 0.2s ease; } .noticia-item:last-child { border-bottom: none; } @@ -73,14 +66,12 @@ a { color: var(--secondary-color); text-decoration: none; font-weight: 500; } a: .noticia-texto h3 a:hover { color: var(--primary-color); } .noticia-meta { font-size: 0.8rem; color: var(--text-color-light); margin-bottom: 8px; } -/* --- Alertas y Mensajes Flash --- */ .flash-messages { list-style: none; padding: 0; margin-bottom: 20px; } .flash-messages li { padding: 15px 20px; border-radius: var(--border-radius-sm); border-left: 5px solid; } .flash-messages .error { background-color: #fff0f3; color: #d90429; border-color: var(--error-color); } .flash-messages .success { background-color: #e6fcf5; color: #00b894; border-color: #00b894; } .flash-messages .warning { background-color: #fffbeb; color: #f39c12; border-color: #f39c12; } -/* --- Estilos para Dashboard y Paginación --- */ .dashboard-grid { display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 20px; margin-bottom: 40px; } .stat-card { background: rgba(255, 255, 255, 0.8); padding: 20px; border-radius: var(--border-radius-md); text-align: center; border: 1px solid var(--border-color); transition: all 0.3s ease; } .stat-card:hover { transform: translateY(-5px); box-shadow: 0 4px 15px rgba(0,0,0,0.08); } @@ -98,7 +89,6 @@ a { color: var(--secondary-color); text-decoration: none; font-weight: 500; } a: .feed-body dt { font-weight: 600; color: var(--text-color-light); } .feed-body dd { margin: 0; word-break: break-all; } -/* --- Estilos para la Navegación Principal --- */ .main-nav { display: flex; justify-content: center; @@ -127,7 +117,6 @@ a { color: var(--secondary-color); text-decoration: none; font-weight: 500; } a: margin-left: 20px; } -/* --- Responsividad --- */ @media (max-width: 768px) { .container { padding: 20px; margin: 15px; } h1 { font-size: 2rem; } @@ -137,10 +126,7 @@ a { color: var(--secondary-color); text-decoration: none; font-weight: 500; } a: .nav-actions { margin-left: 0; margin-top: 10px; } } -/* --- Estilos para el botón Ver Más --- */ -.resumen-container { - position: relative; -} +.resumen-container { position: relative; } .ver-mas-btn { background: none; border: none; @@ -149,23 +135,28 @@ a { color: var(--secondary-color); text-decoration: none; font-weight: 500; } a: cursor: pointer; padding: 5px 0; margin-top: 5px; -} -.ver-mas-btn:hover { text-decoration: underline; } -/* --- Estilos para la fila principal del formulario de filtros --- */ .filter-main-row { display: flex; align-items: flex-end; gap: 15px; margin-bottom: 20px; } -.filter-search-box { - flex-grow: 1; +.filter-search-box { flex-grow: 1; } +.filter-actions { display: flex; gap: 10px; white-space: nowrap; } + +.clamp { + display: -webkit-box; + -webkit-line-clamp: 6; + -webkit-box-orient: vertical; + overflow: hidden; + word-break: break-word; } -.filter-actions { - display: flex; - gap: 10px; - white-space: nowrap; +.clamp.expanded { + -webkit-line-clamp: unset; + max-height: none; + overflow: visible; } + diff --git a/templates/_noticias_list.html b/templates/_noticias_list.html index e7893b4..1f986c5 100644 --- a/templates/_noticias_list.html +++ b/templates/_noticias_list.html @@ -13,7 +13,11 @@

{{ noticia.titulo }} {% if use_tr %} - Traducido + {% if noticia.usa_tr %} + Traducido + {% else %} + Original + {% endif %} {% endif %}

@@ -43,12 +47,7 @@
{% set resumen_txt = noticia.resumen | safe_html %} -
- {{ resumen_txt | truncate(280, True) }} -
- +
{{ resumen_txt }}
{% if noticia.resumen and noticia.resumen|length > 280 %} {% endif %} @@ -62,7 +61,6 @@ {% endfor %} -{# Resumen y paginación #} {% if total_results and total_results > 0 %}
{% set start_i = (page - 1) * per_page + 1 %} @@ -75,12 +73,10 @@ {% endif %} -{# Toggle "Ver más / Ver menos" con delegación; se liga una sola vez #} diff --git a/translation_worker.py b/translation_worker.py index ff50a31..ab9a583 100644 --- a/translation_worker.py +++ b/translation_worker.py @@ -1,7 +1,9 @@ +# translation_worker.py import os import time import logging import contextlib +import re from typing import List, Optional import psycopg2 @@ -62,6 +64,12 @@ def _env_str(name: str, *fallbacks: str, default: Optional[str] = None) -> Optio return val return default +def _env_bool(name: str, default: bool = False) -> bool: + val = os.environ.get(name) + if val is None: + return default + return str(val).strip().lower() in ("1", "true", "yes", "y", "on") + TARGET_LANGS = _env_list("TARGET_LANGS", "TRANSLATE_TO", default="es") BATCH_SIZE = _env_int("BATCH", "TRANSLATOR_BATCH", "TRANSLATE_BATCH", default=8) ENQUEUE_MAX = _env_int("ENQUEUE", "TRANSLATOR_ENQUEUE", "TRANSLATE_ENQUEUE", default=200) @@ -69,8 +77,8 @@ SLEEP_IDLE = _env_float("SLEEP_IDLE", "TRANSLATOR_SLEEP_IDLE", "TRANSLATE_SLEE DEVICE_CFG = (_env_str("DEVICE", default="auto") or "auto").lower() # 'cpu' | 'cuda' | 'auto' # Límites de tokens (ajusta si ves OOM) -MAX_SRC_TOKENS = _env_int("MAX_SRC_TOKENS", default=384) -MAX_NEW_TOKENS = _env_int("MAX_NEW_TOKENS", default=192) +MAX_SRC_TOKENS = _env_int("MAX_SRC_TOKENS", default=512) +MAX_NEW_TOKENS = _env_int("MAX_NEW_TOKENS", default=256) # ---- Beams: por defecto 2 para títulos y 1 para cuerpo; respeta NUM_BEAMS si sólo se define ese ---- def _beams_from_env(): @@ -91,6 +99,50 @@ NUM_BEAMS_TITLE, NUM_BEAMS_BODY = _beams_from_env() # Modelo por defecto: NLLB 600M (cámbialo por facebook/nllb-200-1.3B si quieres el 1.3B) UNIVERSAL_MODEL = _env_str("UNIVERSAL_MODEL", default="facebook/nllb-200-distilled-600M") +# ---------- Chunking por frases (para artículos largos) ---------- +# Activo por defecto para evitar secuencias > límite del modelo +CHUNK_BY_SENTENCES = _env_bool("CHUNK_BY_SENTENCES", default=True) +CHUNK_MAX_TOKENS = _env_int("CHUNK_MAX_TOKENS", default=900) # <= modelo - margen +CHUNK_OVERLAP_SENTS = _env_int("CHUNK_OVERLAP_SENTS", default=0) # 0 o 1 + +# Abreviaturas comunes y marcador temporal +_ABBR = ("Sr", "Sra", "Dr", "Dra", "Ing", "Lic", "pág", "etc") +_ABBR_MARK = "§" # no debería aparecer en texto normal + +def _protect_abbrev(text: str) -> str: + # Iniciales de una letra: "E.", "A." + t = re.sub(r"\b([A-ZÁÉÍÓÚÑÄÖÜ])\.", r"\1" + _ABBR_MARK, text) + # Abreviaturas de la lista (case-insensitive) + pat = r"\b(?:" + "|".join(map(re.escape, _ABBR)) + r")\." + t = re.sub(pat, lambda m: m.group(0)[:-1] + _ABBR_MARK, t, flags=re.IGNORECASE) + return t + +def _restore_abbrev(text: str) -> str: + return text.replace(_ABBR_MARK, ".") + +# Regex de corte SIN look-behind variable: +# - Corta tras [.!?…] si hay espacios y luego comienza otra frase (letra mayúscula, comillas, paréntesis, dígito) +# - O cuando hay doble salto de línea +_SENT_SPLIT_RE = re.compile( + r'(?<=[\.!\?…])\s+(?=["“\(\[A-ZÁÉÍÓÚÑÄÖÜ0-9])|(?:\n{2,})' +) + +def split_into_sentences(text: str) -> List[str]: + text = (text or "").strip() + if not text: + return [] + protected = _protect_abbrev(text) + parts = [p.strip() for p in _SENT_SPLIT_RE.split(protected) if p and p.strip()] + parts = [_restore_abbrev(p) for p in parts] + # Une piezas muy cortas con la anterior para más coherencia + merged: List[str] = [] + for p in parts: + if merged and len(p) < 40: + merged[-1] = merged[-1] + " " + p + else: + merged.append(p) + return merged + # ---------- Mapeo idiomas a códigos NLLB ---------- NLLB_LANG = { # básicos @@ -171,8 +223,8 @@ def fetch_pending_batch(conn, lang_to: str, batch_size: int): rows = cur.fetchall() if rows: ids = [r["tr_id"] for r in rows] - with conn.cursor() as cur: - cur.execute("UPDATE traducciones SET status='processing' WHERE id = ANY(%s)", (ids,)) + with conn.cursor() as cur2: + cur2.execute("UPDATE traducciones SET status='processing' WHERE id = ANY(%s)", (ids,)) conn.commit() return rows @@ -277,8 +329,14 @@ def get_universal_components(): _load_model_on(torch.device("cpu")) return _TOKENIZER, _MODEL, _DEVICE -# ---------- Utilidades ---------- +# ---------- Utilidades de tokenización / chunking ---------- +def _safe_src_len(tokenizer) -> int: + model_max = getattr(tokenizer, "model_max_length", 1024) or 1024 + # margen para tokens especiales/ruido + return min(MAX_SRC_TOKENS, int(model_max) - 16) + def _token_chunks(tokenizer, text: str, max_tokens: int) -> List[str]: + """Troceo simple por tokens (fallback)""" if not text: return [] ids = tokenizer.encode(text, add_special_tokens=False) @@ -293,8 +351,8 @@ def _token_chunks(tokenizer, text: str, max_tokens: int) -> List[str]: return chunks def _norm(s: str) -> str: - import re - return re.sub(r"\W+", "", (s or "").lower()).strip() + import re as _re + return _re.sub(r"\W+", "", (s or "").lower()).strip() def _forced_bos_id(tokenizer: AutoTokenizer, model: AutoModelForSeq2SeqLM, tgt_code: str) -> int: """ @@ -344,8 +402,13 @@ def _forced_bos_id(tokenizer: AutoTokenizer, model: AutoModelForSeq2SeqLM, tgt_c LOG.warning("No pude resolver lang code id para '%s'. Uso fallback (eos/bos).", tgt_code) return getattr(tokenizer, "eos_token_id", None) or getattr(tokenizer, "bos_token_id", None) or 0 +# ---------- Traducción base ---------- @torch.inference_mode() def translate_text(src_lang: str, tgt_lang: str, text: str, num_beams: int = 1, _tries: int = 0) -> str: + """ + Traduce un texto (usando troceo por tokens si excede MAX_SRC_TOKENS). + Se usa para títulos y como núcleo para chunks de artículos. + """ if not text or not text.strip(): return "" @@ -361,13 +424,14 @@ def translate_text(src_lang: str, tgt_lang: str, text: str, num_beams: int = 1, forced_bos = _forced_bos_id(tok, mdl, tgt_code) - parts = _token_chunks(tok, text, MAX_SRC_TOKENS) + safe_len = _safe_src_len(tok) + parts = _token_chunks(tok, text, safe_len) outs: List[str] = [] try: autocast_ctx = torch.amp.autocast("cuda", dtype=torch.float16) if device.type == "cuda" else contextlib.nullcontext() for p in parts: - enc = tok(p, return_tensors="pt", truncation=True, max_length=MAX_SRC_TOKENS) + enc = tok(p, return_tensors="pt", truncation=True, max_length=safe_len) enc = {k: v.to(device) for k, v in enc.items()} gen_kwargs = dict( @@ -377,7 +441,6 @@ def translate_text(src_lang: str, tgt_lang: str, text: str, num_beams: int = 1, do_sample=False, use_cache=False, # ↓ memoria ) - # Evita el warning cuando num_beams = 1 if int(num_beams) > 1: gen_kwargs["early_stopping"] = True @@ -411,6 +474,102 @@ def translate_text(src_lang: str, tgt_lang: str, text: str, num_beams: int = 1, return translate_text(src_lang, tgt_lang, text, num_beams=num_beams, _tries=_tries + 1) raise +# ---------- Chunking por frases para artículos ---------- +def _sent_token_len(tokenizer, sent: str) -> int: + return len(tokenizer(sent, add_special_tokens=False).input_ids) + +def _pack_sentences_to_token_chunks( + tokenizer, sentences: List[str], max_tokens: int, overlap_sents: int = 0 +) -> List[List[str]]: + chunks: List[List[str]] = [] + cur: List[str] = [] + cur_tokens = 0 + for s in sentences: + slen = _sent_token_len(tokenizer, s) + if slen > max_tokens: + # Si una sola frase excede el límite, córtala por tokens como último recurso + ids = tokenizer(s, add_special_tokens=False).input_ids + step = max_tokens + for i in range(0, len(ids), step): + sub = tokenizer.decode(ids[i:i+step], skip_special_tokens=True) + if cur: + chunks.append(cur) + cur = [] + cur_tokens = 0 + chunks.append([sub]) + continue + + if cur_tokens + slen <= max_tokens: + cur.append(s); cur_tokens += slen + else: + if cur: + chunks.append(cur) + if overlap_sents > 0 and len(cur) > 0: + overlap = cur[-overlap_sents:] + cur = overlap + [s] + cur_tokens = sum(_sent_token_len(tokenizer, x) for x in cur) + else: + cur = [s]; cur_tokens = slen + if cur: + chunks.append(cur) + return chunks + +def _smart_concatenate(parts: List[str], tail_window: int = 120) -> str: + """Une partes evitando duplicados obvios en el borde (heurística ligera).""" + if not parts: + return "" + out = parts[0] + for nxt in parts[1:]: + tail = out[-tail_window:] + cut = 0 + for k in range(min(len(tail), len(nxt)), 20, -1): + if nxt.startswith(tail[-k:]): + cut = k + break + out += ("" if cut == 0 else nxt[cut:]) if nxt else "" + return out + +def translate_article_full( + src_lang: str, + tgt_lang: str, + text: str, + num_beams: int, +) -> str: + """ + Traduce un artículo completo: + - Divide por frases (sin look-behind variable) + - Empaqueta en chunks <= límite de tokens + - Traduce chunk a chunk (usa translate_text internamente) + - Une con heurística para evitar duplicados en bordes + """ + if not text or not text.strip(): + return "" + + if not CHUNK_BY_SENTENCES: + # Ruta rápida: una sola pasada con truncamiento interno + return translate_text(src_lang, tgt_lang, text, num_beams=num_beams) + + tok, _, _ = get_universal_components() + safe_len = _safe_src_len(tok) + max_chunk_tokens = min(CHUNK_MAX_TOKENS, safe_len) + + sents = split_into_sentences(text) + if not sents: + return "" + + chunks_sents = _pack_sentences_to_token_chunks( + tok, sents, max_tokens=max_chunk_tokens, overlap_sents=CHUNK_OVERLAP_SENTS + ) + + translated_parts: List[str] = [] + for group in chunks_sents: + chunk_text = " ".join(group) + translated = translate_text(src_lang, tgt_lang, chunk_text, num_beams=num_beams) + translated_parts.append(translated) + + return _smart_concatenate([p for p in translated_parts if p]) + +# ---------- Procesamiento por lotes ---------- def process_batch(conn, rows): for r in rows: tr_id = r["tr_id"] @@ -426,9 +585,10 @@ def process_batch(conn, rows): continue try: - # Beams distintos: mejor calidad en títulos con coste de VRAM controlado + # Títulos: cortos, traducción directa (beams más altos si quieres) title_tr = translate_text(lang_from, lang_to, title, num_beams=NUM_BEAMS_TITLE) if title else "" - body_tr = translate_text(lang_from, lang_to, body, num_beams=NUM_BEAMS_BODY) if body else "" + # Cuerpo/resumen: artículo completo con chunking por frases + body_tr = translate_article_full(lang_from, lang_to, body, num_beams=NUM_BEAMS_BODY) if body else "" # Si la "traducción" es igual al original, déjala vacía if _norm(title_tr) == _norm(title): @@ -443,8 +603,10 @@ def process_batch(conn, rows): def main(): LOG.info( - "Arrancando worker de traducción (NLLB). TARGET_LANGS=%s, BATCH=%s, ENQUEUE=%s, DEVICE=%s, BEAMS(title/body)=%s/%s", - TARGET_LANGS, BATCH_SIZE, ENQUEUE_MAX, DEVICE_CFG, NUM_BEAMS_TITLE, NUM_BEAMS_BODY + "Arrancando worker de traducción (NLLB). TARGET_LANGS=%s, BATCH=%s, ENQUEUE=%s, DEVICE=%s, " + "BEAMS(title/body)=%s/%s, CHUNK_BY_SENTENCES=%s, CHUNK_MAX_TOKENS=%s, OVERLAP_SENTS=%s", + TARGET_LANGS, BATCH_SIZE, ENQUEUE_MAX, DEVICE_CFG, NUM_BEAMS_TITLE, NUM_BEAMS_BODY, + CHUNK_BY_SENTENCES, CHUNK_MAX_TOKENS, CHUNK_OVERLAP_SENTS ) # Pre-carga el modelo una vez para reservar memoria de forma limpia get_universal_components()