cambios en la web

This commit is contained in:
jlimolina 2025-10-12 17:51:14 +02:00
parent 046a5ff369
commit a9c1e16bdd
6 changed files with 283 additions and 131 deletions

101
app.py
View file

@ -11,7 +11,7 @@ from contextlib import contextmanager
from concurrent.futures import ThreadPoolExecutor, as_completed from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm from tqdm import tqdm
from flask import Flask, render_template, request, redirect, url_for, Response, flash, make_response from flask import Flask, render_template, request, redirect, url_for, Response, flash, make_response, abort
import psycopg2 import psycopg2
import psycopg2.extras import psycopg2.extras
import psycopg2.pool import psycopg2.pool
@ -36,11 +36,7 @@ DB_CONFIG = {
MAX_WORKERS = int(os.environ.get("RSS_MAX_WORKERS", 20)) MAX_WORKERS = int(os.environ.get("RSS_MAX_WORKERS", 20))
SINGLE_FEED_TIMEOUT = int(os.environ.get("RSS_FEED_TIMEOUT", 30)) SINGLE_FEED_TIMEOUT = int(os.environ.get("RSS_FEED_TIMEOUT", 30))
MAX_FALLOS = int(os.environ.get("RSS_MAX_FAILURES", 5)) MAX_FALLOS = int(os.environ.get("RSS_MAX_FAILURES", 5))
# Tamaño de página configurable (límite en 10100 por seguridad)
NEWS_PER_PAGE = int(os.environ.get("NEWS_PER_PAGE", 20)) NEWS_PER_PAGE = int(os.environ.get("NEWS_PER_PAGE", 20))
# Idioma/traducción por defecto
DEFAULT_TRANSLATION_LANG = os.environ.get("DEFAULT_TRANSLATION_LANG", "es").strip().lower() DEFAULT_TRANSLATION_LANG = os.environ.get("DEFAULT_TRANSLATION_LANG", "es").strip().lower()
DEFAULT_LANG = os.environ.get("DEFAULT_LANG", DEFAULT_TRANSLATION_LANG).strip().lower() DEFAULT_LANG = os.environ.get("DEFAULT_LANG", DEFAULT_TRANSLATION_LANG).strip().lower()
WEB_TRANSLATED_DEFAULT = os.environ.get("WEB_TRANSLATED_DEFAULT", "1").strip().lower() in ("1", "true", "yes") WEB_TRANSLATED_DEFAULT = os.environ.get("WEB_TRANSLATED_DEFAULT", "1").strip().lower() in ("1", "true", "yes")
@ -81,8 +77,8 @@ def safe_html(text):
return "" return ""
return bleach.clean( return bleach.clean(
text, text,
tags={'a', 'b', 'strong', 'i', 'em', 'p', 'br'}, tags={'a', 'b', 'strong', 'i', 'em', 'p', 'br', 'ul', 'ol', 'li', 'blockquote', 'h3', 'h4'},
attributes={'a': ['href', 'title']}, attributes={'a': ['href', 'title', 'rel', 'target']},
strip=True strip=True
) )
@ -94,31 +90,18 @@ def _get_form_dependencies(cursor):
return categorias, paises return categorias, paises
def _get_lang_and_flags(): def _get_lang_and_flags():
"""
Determina el idioma preferido y si se debe usar traducción por defecto.
Permite forzar original con ?orig=1 y cambiar idioma con ?lang=xx (se guarda en cookie).
"""
qlang = request.args.get("lang", "").strip().lower() qlang = request.args.get("lang", "").strip().lower()
cookie_lang = (request.cookies.get("lang") or "").strip().lower() cookie_lang = (request.cookies.get("lang") or "").strip().lower()
lang = qlang or cookie_lang or DEFAULT_LANG or "es" lang = qlang or cookie_lang or DEFAULT_LANG or "es"
force_orig = request.args.get("orig") == "1" force_orig = request.args.get("orig") == "1"
use_translation = (not force_orig) and WEB_TRANSLATED_DEFAULT use_translation = (not force_orig) and WEB_TRANSLATED_DEFAULT
return lang, use_translation, bool(qlang) return lang, use_translation, bool(qlang)
def _build_news_query(args, *, count=False, limit=None, offset=None, lang="es", use_translation=True): def _build_news_query(args, *, count=False, limit=None, offset=None, lang="es", use_translation=True):
"""
Construye la consulta SQL y los parámetros basados en los argumentos de la petición.
Si count=True => SELECT COUNT(*)
Si count=False => SELECT columnas con ORDER + LIMIT/OFFSET.
Integra traducciones vía LEFT JOIN LATERAL cuando use_translation=True (status='done', lang_to=lang).
"""
# Para controlar orden de parámetros según apariciones de %s:
select_rank_params = [] select_rank_params = []
from_params = [] from_params = []
where_params = [] where_params = []
tail_params = [] tail_params = []
conditions = [] conditions = []
q = args.get("q", "").strip() q = args.get("q", "").strip()
@ -127,7 +110,6 @@ def _build_news_query(args, *, count=False, limit=None, offset=None, lang="es",
pais_id = args.get("pais_id") pais_id = args.get("pais_id")
fecha_filtro = args.get("fecha") fecha_filtro = args.get("fecha")
# FROM base
sql_from = """ sql_from = """
FROM noticias n FROM noticias n
LEFT JOIN categorias c ON n.categoria_id = c.id LEFT JOIN categorias c ON n.categoria_id = c.id
@ -135,11 +117,10 @@ def _build_news_query(args, *, count=False, limit=None, offset=None, lang="es",
LEFT JOIN continentes co ON p.continente_id = co.id LEFT JOIN continentes co ON p.continente_id = co.id
""" """
# LEFT JOIN LATERAL traducción (solo en SELECT de página; el conteo no la necesita)
if (not count) and use_translation: if (not count) and use_translation:
sql_from += """ sql_from += """
LEFT JOIN LATERAL ( LEFT JOIN LATERAL (
SELECT titulo_trad, resumen_trad SELECT id AS traduccion_id, titulo_trad, resumen_trad
FROM traducciones FROM traducciones
WHERE traducciones.noticia_id = n.id WHERE traducciones.noticia_id = n.id
AND traducciones.lang_to = %s AND traducciones.lang_to = %s
@ -150,9 +131,7 @@ def _build_news_query(args, *, count=False, limit=None, offset=None, lang="es",
""" """
from_params.append(lang) from_params.append(lang)
# WHERE dinámico
if q: if q:
# Buscar por relevancia en el tsvector de la noticia original
conditions.append("n.tsv @@ plainto_tsquery('spanish', %s)") conditions.append("n.tsv @@ plainto_tsquery('spanish', %s)")
where_params.append(q) where_params.append(q)
@ -178,24 +157,26 @@ def _build_news_query(args, *, count=False, limit=None, offset=None, lang="es",
where_clause = " WHERE " + " AND ".join(conditions) if conditions else "" where_clause = " WHERE " + " AND ".join(conditions) if conditions else ""
if count: if count:
# Conteo total (sin necesidad de traducciones)
sql_count = "SELECT COUNT(*) " + sql_from + where_clause sql_count = "SELECT COUNT(*) " + sql_from + where_clause
sql_params = from_params + where_params # from_params estará vacío en count sql_params = from_params + where_params
return sql_count, sql_params return sql_count, sql_params
# Selección de columnas para página
if use_translation: if use_translation:
select_cols = """ select_cols = """
SELECT n.fecha, SELECT
COALESCE(t.titulo_trad, n.titulo) AS titulo, COALESCE(t.traduccion_id, NULL) AS traduccion_id,
COALESCE(t.resumen_trad, n.resumen) AS resumen, n.fecha,
COALESCE(t.titulo_trad, n.titulo) AS titulo,
COALESCE(t.resumen_trad, n.resumen) AS resumen,
n.url, n.imagen_url, n.fuente_nombre, n.url, n.imagen_url, n.fuente_nombre,
c.nombre AS categoria, p.nombre AS pais, co.nombre AS continente, c.nombre AS categoria, p.nombre AS pais, co.nombre AS continente,
(t.titulo_trad IS NOT NULL OR t.resumen_trad IS NOT NULL) AS usa_trad (t.titulo_trad IS NOT NULL OR t.resumen_trad IS NOT NULL) AS usa_trad
""" """
else: else:
select_cols = """ select_cols = """
SELECT n.fecha, n.titulo, n.resumen, SELECT
NULL::int AS traduccion_id,
n.fecha, n.titulo, n.resumen,
n.url, n.imagen_url, n.fuente_nombre, n.url, n.imagen_url, n.fuente_nombre,
c.nombre AS categoria, p.nombre AS pais, co.nombre AS continente, c.nombre AS categoria, p.nombre AS pais, co.nombre AS continente,
FALSE AS usa_trad FALSE AS usa_trad
@ -204,7 +185,6 @@ def _build_news_query(args, *, count=False, limit=None, offset=None, lang="es",
order_clause = " ORDER BY n.fecha DESC NULLS LAST" order_clause = " ORDER BY n.fecha DESC NULLS LAST"
if q: if q:
# Ranking por relevancia (primer placeholder)
select_cols = select_cols.replace( select_cols = select_cols.replace(
"SELECT", "SELECT",
"SELECT ts_rank(n.tsv, plainto_tsquery('spanish', %s)) AS rank," "SELECT ts_rank(n.tsv, plainto_tsquery('spanish', %s)) AS rank,"
@ -212,7 +192,6 @@ def _build_news_query(args, *, count=False, limit=None, offset=None, lang="es",
select_rank_params.append(q) select_rank_params.append(q)
order_clause = " ORDER BY rank DESC, n.fecha DESC NULLS LAST" order_clause = " ORDER BY rank DESC, n.fecha DESC NULLS LAST"
# Paginación
if limit is not None: if limit is not None:
order_clause += " LIMIT %s" order_clause += " LIMIT %s"
tail_params.append(limit) tail_params.append(limit)
@ -228,20 +207,16 @@ def _build_news_query(args, *, count=False, limit=None, offset=None, lang="es",
def home(): def home():
noticias, categorias, continentes, paises = [], [], [], [] noticias, categorias, continentes, paises = [], [], [], []
# Estado de filtros (para mantenerlos en la UI)
q = request.args.get("q", "").strip() q = request.args.get("q", "").strip()
cat_id = request.args.get("categoria_id") cat_id = request.args.get("categoria_id")
cont_id = request.args.get("continente_id") cont_id = request.args.get("continente_id")
pais_id = request.args.get("pais_id") pais_id = request.args.get("pais_id")
fecha_filtro = request.args.get("fecha") fecha_filtro = request.args.get("fecha")
# Preferencias idioma/uso de traducción
lang, use_tr, set_cookie = _get_lang_and_flags() lang, use_tr, set_cookie = _get_lang_and_flags()
# Paginación
page = request.args.get("page", default=1, type=int) page = request.args.get("page", default=1, type=int)
per_page = request.args.get("per_page", default=NEWS_PER_PAGE, type=int) per_page = request.args.get("per_page", default=NEWS_PER_PAGE, type=int)
# límites de seguridad
if per_page is None or per_page <= 0: if per_page is None or per_page <= 0:
per_page = NEWS_PER_PAGE per_page = NEWS_PER_PAGE
per_page = 100 if per_page > 100 else (10 if per_page < 10 else per_page) per_page = 100 if per_page > 100 else (10 if per_page < 10 else per_page)
@ -255,7 +230,6 @@ def home():
try: try:
with get_conn() as conn: with get_conn() as conn:
with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cursor: with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cursor:
# Dependencias de UI
cursor.execute("SELECT id, nombre FROM categorias ORDER BY nombre") cursor.execute("SELECT id, nombre FROM categorias ORDER BY nombre")
categorias = cursor.fetchall() categorias = cursor.fetchall()
cursor.execute("SELECT id, nombre FROM continentes ORDER BY nombre") cursor.execute("SELECT id, nombre FROM continentes ORDER BY nombre")
@ -263,7 +237,6 @@ def home():
cursor.execute("SELECT id, nombre, continente_id FROM paises ORDER BY nombre") cursor.execute("SELECT id, nombre, continente_id FROM paises ORDER BY nombre")
paises = cursor.fetchall() paises = cursor.fetchall()
# 1) Conteo total (no requiere join de traducciones)
sql_count, params_count = _build_news_query( sql_count, params_count = _build_news_query(
request.args, count=True, lang=lang, use_translation=use_tr request.args, count=True, lang=lang, use_translation=use_tr
) )
@ -271,7 +244,6 @@ def home():
total_results = cursor.fetchone()[0] or 0 total_results = cursor.fetchone()[0] or 0
total_pages = math.ceil(total_results / per_page) if total_results else 0 total_pages = math.ceil(total_results / per_page) if total_results else 0
# 2) Página actual (con COALESCE a traducción si procede)
sql_page, params_page = _build_news_query( sql_page, params_page = _build_news_query(
request.args, request.args,
count=False, count=False,
@ -295,20 +267,52 @@ def home():
lang=lang, use_tr=use_tr lang=lang, use_tr=use_tr
) )
# Respuesta parcial para AJAX
if request.headers.get('X-Requested-With') == 'XMLHttpRequest': if request.headers.get('X-Requested-With') == 'XMLHttpRequest':
resp = make_response(render_template('_noticias_list.html', **ctx)) resp = make_response(render_template('_noticias_list.html', **ctx))
if set_cookie: if set_cookie:
resp.set_cookie("lang", lang, max_age=60*60*24*365) resp.set_cookie("lang", lang, max_age=60*60*24*365)
return resp return resp
# Render completo
html = render_template("noticias.html", **ctx) html = render_template("noticias.html", **ctx)
resp = make_response(html) resp = make_response(html)
if set_cookie: if set_cookie:
resp.set_cookie("lang", lang, max_age=60*60*24*365) resp.set_cookie("lang", lang, max_age=60*60*24*365)
return resp return resp
@app.get("/noticia/<int:tr_id>")
def noticia(tr_id):
    """Detail page for one translated news item.

    Looks up a single row in ``traducciones`` by its primary key and joins the
    parent ``noticias`` row plus its category / country / continent names, so
    the template can show both the original and the translated title/body.

    Args:
        tr_id: Primary key of the ``traducciones`` row (NOT the noticia id).

    Returns:
        The rendered ``noticia.html`` template with the joined row bound as
        ``r``; aborts with HTTP 404 when no translation with that id exists.

    NOTE(review): rows are returned regardless of ``t.status`` — presumably
    the template handles non-'done' translations; confirm against noticia.html.
    """
    # get_conn() is the app's pooled-connection context manager; DictCursor
    # lets the template access columns by name (r['titulo_traducido'], ...).
    with get_conn() as conn, conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
        # Parameterized query (%s + tuple) — tr_id is user-supplied via the
        # URL, so it must never be interpolated into the SQL string.
        cur.execute(
            """
            SELECT
                t.id,
                n.id AS noticia_id,
                n.fecha,
                n.titulo AS titulo_original,
                n.resumen AS cuerpo_original,
                t.titulo_trad AS titulo_traducido,
                t.resumen_trad AS cuerpo_traducido,
                n.url AS fuente_url,
                n.fuente_nombre,
                p.nombre AS pais,
                co.nombre AS continente,
                c.nombre AS categoria,
                t.lang_to,
                t.status
            FROM traducciones t
            JOIN noticias n ON n.id = t.noticia_id
            LEFT JOIN paises p ON n.pais_id = p.id
            LEFT JOIN continentes co ON p.continente_id = co.id
            LEFT JOIN categorias c ON n.categoria_id = c.id
            WHERE t.id = %s
            """,
            (tr_id,)
        )
        row = cur.fetchone()
        if not row:
            # Unknown translation id -> standard Flask 404 response.
            abort(404)
        return render_template("noticia.html", r=row)
@app.route("/dashboard") @app.route("/dashboard")
def dashboard(): def dashboard():
stats = {'feeds_totales': 0, 'noticias_totales': 0, 'feeds_caidos': 0} stats = {'feeds_totales': 0, 'noticias_totales': 0, 'feeds_caidos': 0}
@ -540,8 +544,6 @@ def fetch_and_store_all():
feeds_fallidos = [] feeds_fallidos = []
feeds_exitosos = [] feeds_exitosos = []
feeds_para_actualizar_headers = [] feeds_para_actualizar_headers = []
# --- Parte 1: Procesando Feeds RSS ---
logging.info("=> Parte 1: Procesando Feeds RSS...") logging.info("=> Parte 1: Procesando Feeds RSS...")
feeds_to_process = [] feeds_to_process = []
try: try:
@ -578,7 +580,6 @@ def fetch_and_store_all():
noticias_desde_rss_count = len(todas_las_noticias) noticias_desde_rss_count = len(todas_las_noticias)
logging.info(f"=> Parte 1 Finalizada. Noticias desde RSS: {noticias_desde_rss_count}. Éxitos: {len(feeds_exitosos)}. Fallos: {len(feeds_fallidos)}.") logging.info(f"=> Parte 1 Finalizada. Noticias desde RSS: {noticias_desde_rss_count}. Éxitos: {len(feeds_exitosos)}. Fallos: {len(feeds_fallidos)}.")
# --- Parte 2: Procesando Fuentes URL ---
logging.info("=> Parte 2: Procesando Fuentes URL...") logging.info("=> Parte 2: Procesando Fuentes URL...")
urls_to_process = [] urls_to_process = []
try: try:
@ -590,7 +591,6 @@ def fetch_and_store_all():
except Exception as e: except Exception as e:
logging.error(f"Error de BD al obtener fuentes URL: {e}") logging.error(f"Error de BD al obtener fuentes URL: {e}")
# Paraleliza la captura desde newspaper3k
if urls_to_process: if urls_to_process:
with ThreadPoolExecutor(max_workers=10) as executor: with ThreadPoolExecutor(max_workers=10) as executor:
future_to_url = { future_to_url = {
@ -612,7 +612,6 @@ def fetch_and_store_all():
noticias_desde_urls_count = len(todas_las_noticias) - noticias_desde_rss_count noticias_desde_urls_count = len(todas_las_noticias) - noticias_desde_rss_count
logging.info(f"=> Parte 2 Finalizada. Noticias encontradas desde URLs: {noticias_desde_urls_count}.") logging.info(f"=> Parte 2 Finalizada. Noticias encontradas desde URLs: {noticias_desde_urls_count}.")
# --- Parte 3: Actualizando la base de datos ---
logging.info("=> Parte 3: Actualizando la base de datos...") logging.info("=> Parte 3: Actualizando la base de datos...")
if not any([todas_las_noticias, feeds_fallidos, feeds_exitosos, feeds_para_actualizar_headers]): if not any([todas_las_noticias, feeds_fallidos, feeds_exitosos, feeds_para_actualizar_headers]):
logging.info("No se encontraron nuevas noticias ni cambios en los feeds. Nada que actualizar.") logging.info("No se encontraron nuevas noticias ni cambios en los feeds. Nada que actualizar.")
@ -655,8 +654,6 @@ def fetch_and_store_all():
logging.info("--- CICLO DE CAPTURA GLOBAL FINALIZADO ---") logging.info("--- CICLO DE CAPTURA GLOBAL FINALIZADO ---")
# --- Funciones de Backup y Restore (sin cambios) ---
@app.route("/backup_feeds") @app.route("/backup_feeds")
def backup_feeds(): def backup_feeds():
try: try:
@ -755,7 +752,6 @@ def backup_completo():
with zipfile.ZipFile(memory_buffer, 'w', zipfile.ZIP_DEFLATED) as zipf: with zipfile.ZipFile(memory_buffer, 'w', zipfile.ZIP_DEFLATED) as zipf:
with get_conn() as conn: with get_conn() as conn:
with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cursor: with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cursor:
# Backup Feeds
cursor.execute("SELECT * FROM feeds ORDER BY id") cursor.execute("SELECT * FROM feeds ORDER BY id")
feeds_data = cursor.fetchall() feeds_data = cursor.fetchall()
if feeds_data: if feeds_data:
@ -765,7 +761,6 @@ def backup_completo():
writer_feeds.writerows([dict(f) for f in feeds_data]) writer_feeds.writerows([dict(f) for f in feeds_data])
zipf.writestr("feeds.csv", output_feeds.getvalue()) zipf.writestr("feeds.csv", output_feeds.getvalue())
# Backup Fuentes URL
cursor.execute("SELECT * FROM fuentes_url ORDER BY id") cursor.execute("SELECT * FROM fuentes_url ORDER BY id")
fuentes_data = cursor.fetchall() fuentes_data = cursor.fetchall()
if fuentes_data: if fuentes_data:
@ -775,7 +770,6 @@ def backup_completo():
writer_fuentes.writerows([dict(f) for f in fuentes_data]) writer_fuentes.writerows([dict(f) for f in fuentes_data])
zipf.writestr("fuentes_url.csv", output_fuentes.getvalue()) zipf.writestr("fuentes_url.csv", output_fuentes.getvalue())
# Backup Noticias
cursor.execute("SELECT * FROM noticias ORDER BY fecha DESC") cursor.execute("SELECT * FROM noticias ORDER BY fecha DESC")
noticias_data = cursor.fetchall() noticias_data = cursor.fetchall()
if noticias_data: if noticias_data:
@ -892,7 +886,6 @@ def restore_urls():
return render_template("restore_urls.html") return render_template("restore_urls.html")
if __name__ == "__main__": if __name__ == "__main__":
if not db_pool: if not db_pool:
app.logger.error("La aplicación no puede arrancar sin una conexión a la base de datos.") app.logger.error("La aplicación no puede arrancar sin una conexión a la base de datos.")

View file

@ -20,7 +20,7 @@ services:
build: build:
context: . context: .
args: args:
# La imagen llevará torch-cu121 por reutilizar Dockerfile; web no usa GPU. # Reutiliza Dockerfile con torch-cu121; la web no usa GPU.
TORCH_CUDA: cu121 TORCH_CUDA: cu121
container_name: rss_web container_name: rss_web
command: gunicorn --bind 0.0.0.0:8000 --workers 3 app:app command: gunicorn --bind 0.0.0.0:8000 --workers 3 app:app
@ -33,9 +33,8 @@ services:
- DB_USER=${DB_USER} - DB_USER=${DB_USER}
- DB_PASS=${DB_PASS} - DB_PASS=${DB_PASS}
- SECRET_KEY=${SECRET_KEY} - SECRET_KEY=${SECRET_KEY}
# Opcionales UI # UI opcional
# - NEWS_PER_PAGE=20 # - NEWS_PER_PAGE=20
# Mostrar traducciones por defecto en la web
- WEB_TRANSLATED_DEFAULT=1 - WEB_TRANSLATED_DEFAULT=1
- DEFAULT_LANG=es - DEFAULT_LANG=es
- TRANSLATION_PREFERRED_LANGS=es - TRANSLATION_PREFERRED_LANGS=es
@ -78,31 +77,38 @@ services:
- DB_USER=${DB_USER} - DB_USER=${DB_USER}
- DB_PASS=${DB_PASS} - DB_PASS=${DB_PASS}
# --- Worker --- # --- Worker (ajustes estables VRAM) ---
- TARGET_LANGS=es - TARGET_LANGS=es
- TRANSLATOR_BATCH=4 # estable con 1.3B en 12 GB; ajusta si cambia la VRAM disponible - TRANSLATOR_BATCH=8 # cuántas filas toma por ciclo
- ENQUEUE=200 - ENQUEUE=200
- TRANSLATOR_SLEEP_IDLE=5 - TRANSLATOR_SLEEP_IDLE=5
# Tokens (equilibrio calidad/VRAM ~<7GB) # Tokens (seguro para NLLB-1.3B; evita >1024)
- MAX_SRC_TOKENS=512 - MAX_SRC_TOKENS=680 # margen bajo el límite real del modelo
- MAX_NEW_TOKENS=256 - MAX_NEW_TOKENS=400 # permite salidas más largas en cuerpos
# Beams: mejor título, cuerpo eficiente # Beams: mejor en títulos, eficiente en cuerpo
- NUM_BEAMS_TITLE=3 - NUM_BEAMS_TITLE=2
- NUM_BEAMS_BODY=2 - NUM_BEAMS_BODY=1
# Modelo NLLB 1.3B # Modelo NLLB 1.3B
- UNIVERSAL_MODEL=facebook/nllb-200-1.3B - UNIVERSAL_MODEL=facebook/nllb-200-1.3B
# Dispositivo (forzar GPU si está disponible; el worker cae a CPU si hay OOM) # Chunking por frases (mejor coherencia en artículos largos)
- CHUNK_BY_SENTENCES=True
- CHUNK_MAX_TOKENS=700 # <= MAX_SRC_TOKENS (con margen)
- CHUNK_OVERLAP_SENTS=1 # solape de 1 frase para evitar cortes bruscos
- CLEAN_ARTICLE=1 # limpia “The post…”, “Læs også…”, etc.
# Dispositivo (usa GPU si hay; cae a CPU si hay OOM)
- DEVICE=cuda - DEVICE=cuda
# Rendimiento / estabilidad # Rendimiento / estabilidad
- PYTHONUNBUFFERED=1 - PYTHONUNBUFFERED=1
- HF_HOME=/root/.cache/huggingface - HF_HOME=/root/.cache/huggingface
- TOKENIZERS_PARALLELISM=false - TOKENIZERS_PARALLELISM=false
- PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True,max_split_size_mb:64,garbage_collection_threshold:0.9 # Evita el assert del allocator de PyTorch
- PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:64,garbage_collection_threshold:0.9
# GPU (requiere NVIDIA Container Toolkit en el host) # GPU (requiere NVIDIA Container Toolkit en el host)
- NVIDIA_VISIBLE_DEVICES=all - NVIDIA_VISIBLE_DEVICES=all

View file

@ -4,15 +4,17 @@ APScheduler==3.10.4
psycopg2-binary==2.9.10 psycopg2-binary==2.9.10
bleach==6.1.0 bleach==6.1.0
gunicorn==22.0.0 gunicorn==22.0.0
waitress waitress==2.1.2
tqdm tqdm>=4.66
beautifulsoup4 beautifulsoup4>=4.12
requests requests>=2.31
newspaper3k newspaper3k==0.2.8
lxml-html-clean lxml[html_clean]>=4.9.3
langdetect==1.0.9 langdetect==1.0.9
transformers==4.43.3 transformers==4.43.3
sentencepiece==0.2.0 sentencepiece==0.2.0
sacremoses==0.1.1 sacremoses==0.1.1
torch==2.3.1 # CPU. Para GPU ver nota más abajo.
accelerate==0.33.0 accelerate==0.33.0
# Nota: PyTorch (torch) NO se fija aquí.
# Se instala en el Dockerfile con la wheel adecuada de CUDA (cu121) para tu GPU.

View file

@ -1,4 +1,3 @@
/* --- Variables Globales de Diseño --- */
:root { :root {
--primary-color: #6a11cb; --primary-color: #6a11cb;
--secondary-color: #2575fc; --secondary-color: #2575fc;
@ -14,7 +13,6 @@
--transition-speed: 0.3s; --transition-speed: 0.3s;
} }
/* --- Estilos Base --- */
* { box-sizing: border-box; } * { box-sizing: border-box; }
body { body {
font-family: 'Poppins', 'Segoe UI', Tahoma, sans-serif; font-family: 'Poppins', 'Segoe UI', Tahoma, sans-serif;
@ -26,7 +24,6 @@ body {
font-weight: 400; font-weight: 400;
} }
/* --- Contenedor Principal con Efecto Vidrio --- */
.container { .container {
max-width: 900px; max-width: 900px;
margin: 30px auto; margin: 30px auto;
@ -39,19 +36,16 @@ body {
-webkit-backdrop-filter: blur(12px); -webkit-backdrop-filter: blur(12px);
} }
/* --- Encabezados y Títulos --- */
header { text-align: center; margin-bottom: 40px; border-bottom: 1px solid var(--border-color); padding-bottom: 30px; } header { text-align: center; margin-bottom: 40px; border-bottom: 1px solid var(--border-color); padding-bottom: 30px; }
h1 { font-size: 2.8rem; font-weight: 700; margin: 0 0 5px 0; background: var(--gradiente-principal); -webkit-background-clip: text; -webkit-text-fill-color: transparent; display: inline-block; } h1 { font-size: 2.8rem; font-weight: 700; margin: 0 0 5px 0; background: var(--gradiente-principal); -webkit-background-clip: text; -webkit-text-fill-color: transparent; display: inline-block; }
h2 { font-size: 1.8rem; font-weight: 600; color: var(--primary-color); margin-bottom: 20px; } h2 { font-size: 1.8rem; font-weight: 600; color: var(--primary-color); margin-bottom: 20px; }
.subtitle { color: var(--text-color-light); font-size: 1.1rem; margin-top: 5px; } .subtitle { color: var(--text-color-light); font-size: 1.1rem; margin-top: 5px; }
/* --- Formularios y Controles --- */
.form-section, .card { margin-bottom: 30px; background: rgba(255, 255, 255, 0.6); padding: 25px; border-radius: var(--border-radius-md); border: 1px solid var(--border-color); } .form-section, .card { margin-bottom: 30px; background: rgba(255, 255, 255, 0.6); padding: 25px; border-radius: var(--border-radius-md); border: 1px solid var(--border-color); }
label { display: block; margin-bottom: 8px; font-weight: 600; color: var(--text-color); font-size: 0.9rem; } label { display: block; margin-bottom: 8px; font-weight: 600; color: var(--text-color); font-size: 0.9rem; }
select, input[type="text"], input[type="url"], input[type="file"], textarea { width: 100%; padding: 12px 15px; border: 1px solid var(--border-color); background-color: #f8f9fa; border-radius: var(--border-radius-sm); font-size: 1rem; font-family: 'Poppins', sans-serif; transition: all var(--transition-speed) ease; } select, input[type="text"], input[type="url"], input[type="file"], textarea { width: 100%; padding: 12px 15px; border: 1px solid var(--border-color); background-color: #f8f9fa; border-radius: var(--border-radius-sm); font-size: 1rem; font-family: 'Poppins', sans-serif; transition: all var(--transition-speed) ease; }
select:focus, input:focus, textarea:focus { outline: none; border-color: var(--primary-color); box-shadow: 0 0 0 3px var(--shadow-color); background-color: white; } select:focus, input:focus, textarea:focus { outline: none; border-color: var(--primary-color); box-shadow: 0 0 0 3px var(--shadow-color); background-color: white; }
/* --- Botones y Enlaces --- */
.btn, button { padding: 12px 25px; background: var(--gradiente-principal); color: white !important; border: none; border-radius: var(--border-radius-sm); font-size: 1rem; font-weight: 600; cursor: pointer; transition: all var(--transition-speed) ease; box-shadow: 0 4px 15px rgba(0, 0, 0, 0.1); text-decoration: none; display: inline-block; text-align: center; } .btn, button { padding: 12px 25px; background: var(--gradiente-principal); color: white !important; border: none; border-radius: var(--border-radius-sm); font-size: 1rem; font-weight: 600; cursor: pointer; transition: all var(--transition-speed) ease; box-shadow: 0 4px 15px rgba(0, 0, 0, 0.1); text-decoration: none; display: inline-block; text-align: center; }
.btn:hover, button:hover { transform: translateY(-3px); box-shadow: 0 6px 20px rgba(0, 0, 0, 0.2); text-decoration: none; } .btn:hover, button:hover { transform: translateY(-3px); box-shadow: 0 6px 20px rgba(0, 0, 0, 0.2); text-decoration: none; }
.btn-secondary { background: #34495e; } .btn-secondary:hover { background: #2c3e50; } .btn-secondary { background: #34495e; } .btn-secondary:hover { background: #2c3e50; }
@ -62,7 +56,6 @@ a { color: var(--secondary-color); text-decoration: none; font-weight: 500; } a:
.top-link { display: inline-block; margin-bottom: 25px; font-weight: 500; color: var(--primary-color); } .top-link { display: inline-block; margin-bottom: 25px; font-weight: 500; color: var(--primary-color); }
.top-link:hover { text-decoration: underline; } .top-link:hover { text-decoration: underline; }
/* --- Estilos para la lista de noticias --- */
.noticias-list { list-style: none; padding: 0; margin: 0; } .noticias-list { list-style: none; padding: 0; margin: 0; }
.noticia-item { display: flex; gap: 20px; padding: 20px 10px; border-bottom: 1px solid var(--border-color); transition: background-color 0.2s ease; } .noticia-item { display: flex; gap: 20px; padding: 20px 10px; border-bottom: 1px solid var(--border-color); transition: background-color 0.2s ease; }
.noticia-item:last-child { border-bottom: none; } .noticia-item:last-child { border-bottom: none; }
@ -73,14 +66,12 @@ a { color: var(--secondary-color); text-decoration: none; font-weight: 500; } a:
.noticia-texto h3 a:hover { color: var(--primary-color); } .noticia-texto h3 a:hover { color: var(--primary-color); }
.noticia-meta { font-size: 0.8rem; color: var(--text-color-light); margin-bottom: 8px; } .noticia-meta { font-size: 0.8rem; color: var(--text-color-light); margin-bottom: 8px; }
/* --- Alertas y Mensajes Flash --- */
.flash-messages { list-style: none; padding: 0; margin-bottom: 20px; } .flash-messages { list-style: none; padding: 0; margin-bottom: 20px; }
.flash-messages li { padding: 15px 20px; border-radius: var(--border-radius-sm); border-left: 5px solid; } .flash-messages li { padding: 15px 20px; border-radius: var(--border-radius-sm); border-left: 5px solid; }
.flash-messages .error { background-color: #fff0f3; color: #d90429; border-color: var(--error-color); } .flash-messages .error { background-color: #fff0f3; color: #d90429; border-color: var(--error-color); }
.flash-messages .success { background-color: #e6fcf5; color: #00b894; border-color: #00b894; } .flash-messages .success { background-color: #e6fcf5; color: #00b894; border-color: #00b894; }
.flash-messages .warning { background-color: #fffbeb; color: #f39c12; border-color: #f39c12; } .flash-messages .warning { background-color: #fffbeb; color: #f39c12; border-color: #f39c12; }
/* --- Estilos para Dashboard y Paginación --- */
.dashboard-grid { display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 20px; margin-bottom: 40px; } .dashboard-grid { display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 20px; margin-bottom: 40px; }
.stat-card { background: rgba(255, 255, 255, 0.8); padding: 20px; border-radius: var(--border-radius-md); text-align: center; border: 1px solid var(--border-color); transition: all 0.3s ease; } .stat-card { background: rgba(255, 255, 255, 0.8); padding: 20px; border-radius: var(--border-radius-md); text-align: center; border: 1px solid var(--border-color); transition: all 0.3s ease; }
.stat-card:hover { transform: translateY(-5px); box-shadow: 0 4px 15px rgba(0,0,0,0.08); } .stat-card:hover { transform: translateY(-5px); box-shadow: 0 4px 15px rgba(0,0,0,0.08); }
@ -98,7 +89,6 @@ a { color: var(--secondary-color); text-decoration: none; font-weight: 500; } a:
.feed-body dt { font-weight: 600; color: var(--text-color-light); } .feed-body dt { font-weight: 600; color: var(--text-color-light); }
.feed-body dd { margin: 0; word-break: break-all; } .feed-body dd { margin: 0; word-break: break-all; }
/* --- Estilos para la Navegación Principal --- */
.main-nav { .main-nav {
display: flex; display: flex;
justify-content: center; justify-content: center;
@ -127,7 +117,6 @@ a { color: var(--secondary-color); text-decoration: none; font-weight: 500; } a:
margin-left: 20px; margin-left: 20px;
} }
/* --- Responsividad --- */
@media (max-width: 768px) { @media (max-width: 768px) {
.container { padding: 20px; margin: 15px; } .container { padding: 20px; margin: 15px; }
h1 { font-size: 2rem; } h1 { font-size: 2rem; }
@ -137,10 +126,7 @@ a { color: var(--secondary-color); text-decoration: none; font-weight: 500; } a:
.nav-actions { margin-left: 0; margin-top: 10px; } .nav-actions { margin-left: 0; margin-top: 10px; }
} }
/* --- Estilos para el botón Ver Más --- */ .resumen-container { position: relative; }
.resumen-container {
position: relative;
}
.ver-mas-btn { .ver-mas-btn {
background: none; background: none;
border: none; border: none;
@ -149,23 +135,28 @@ a { color: var(--secondary-color); text-decoration: none; font-weight: 500; } a:
cursor: pointer; cursor: pointer;
padding: 5px 0; padding: 5px 0;
margin-top: 5px; margin-top: 5px;
}
.ver-mas-btn:hover {
text-decoration: underline; text-decoration: underline;
} }
/* --- Estilos para la fila principal del formulario de filtros --- */
.filter-main-row { .filter-main-row {
display: flex; display: flex;
align-items: flex-end; align-items: flex-end;
gap: 15px; gap: 15px;
margin-bottom: 20px; margin-bottom: 20px;
} }
.filter-search-box { .filter-search-box { flex-grow: 1; }
flex-grow: 1; .filter-actions { display: flex; gap: 10px; white-space: nowrap; }
.clamp {
display: -webkit-box;
-webkit-line-clamp: 6;
-webkit-box-orient: vertical;
overflow: hidden;
word-break: break-word;
} }
.filter-actions { .clamp.expanded {
display: flex; -webkit-line-clamp: unset;
gap: 10px; max-height: none;
white-space: nowrap; overflow: visible;
} }

View file

@ -13,7 +13,11 @@
<h3 style="margin:0 0 6px 0;"> <h3 style="margin:0 0 6px 0;">
<a href="{{ noticia.url }}" target="_blank" rel="noopener noreferrer">{{ noticia.titulo }}</a> <a href="{{ noticia.url }}" target="_blank" rel="noopener noreferrer">{{ noticia.titulo }}</a>
{% if use_tr %} {% if use_tr %}
<span class="badge" title="Mostrando traducciones por defecto" style="margin-left:8px;">Traducido</span> {% if noticia.usa_tr %}
<span class="badge" title="Mostrando traducción">Traducido</span>
{% else %}
<span class="badge" title="Mostrando original">Original</span>
{% endif %}
{% endif %} {% endif %}
</h3> </h3>
@ -43,12 +47,7 @@
<div class="resumen-container"> <div class="resumen-container">
{% set resumen_txt = noticia.resumen | safe_html %} {% set resumen_txt = noticia.resumen | safe_html %}
<div class="resumen-corto"> <div class="resumen clamp">{{ resumen_txt }}</div>
{{ resumen_txt | truncate(280, True) }}
</div>
<div class="resumen-completo" style="display:none;">
{{ resumen_txt }}
</div>
{% if noticia.resumen and noticia.resumen|length > 280 %} {% if noticia.resumen and noticia.resumen|length > 280 %}
<button class="ver-mas-btn" type="button">Ver más</button> <button class="ver-mas-btn" type="button">Ver más</button>
{% endif %} {% endif %}
@ -62,7 +61,6 @@
{% endfor %} {% endfor %}
</ul> </ul>
{# Resumen y paginación #}
{% if total_results and total_results > 0 %} {% if total_results and total_results > 0 %}
<div style="text-align:center; margin-top:10px; color: var(--text-color-light);"> <div style="text-align:center; margin-top:10px; color: var(--text-color-light);">
{% set start_i = (page - 1) * per_page + 1 %} {% set start_i = (page - 1) * per_page + 1 %}
@ -75,12 +73,10 @@
<nav class="pagination" aria-label="Paginación de noticias" style="margin-top:15px;"> <nav class="pagination" aria-label="Paginación de noticias" style="margin-top:15px;">
{% set current = page %} {% set current = page %}
{# Anterior #}
{% if current > 1 %} {% if current > 1 %}
<a href="#" class="page-link" data-page="{{ current - 1 }}">&laquo; Anterior</a> <a href="#" class="page-link" data-page="{{ current - 1 }}">&laquo; Anterior</a>
{% endif %} {% endif %}
{# Ventana de páginas (máx 5 alrededor) #}
{% set start = 1 if current - 2 < 1 else current - 2 %} {% set start = 1 if current - 2 < 1 else current - 2 %}
{% set end = total_pages if current + 2 > total_pages else current + 2 %} {% set end = total_pages if current + 2 > total_pages else current + 2 %}
@ -102,20 +98,19 @@
<a href="#" class="page-link" data-page="{{ total_pages }}">{{ total_pages }}</a> <a href="#" class="page-link" data-page="{{ total_pages }}">{{ total_pages }}</a>
{% endif %} {% endif %}
{# Siguiente #}
{% if current < total_pages %} {% if current < total_pages %}
<a href="#" class="page-link" data-page="{{ current + 1 }}">Siguiente &raquo;</a> <a href="#" class="page-link" data-page="{{ current + 1 }}">Siguiente &raquo;</a>
{% endif %} {% endif %}
</nav> </nav>
{% endif %} {% endif %}
{# Toggle "Ver más / Ver menos" con delegación; se liga una sola vez #}
<script> <script>
(function () { (function () {
if (window.__noticiasToggleBound) return; if (window.__noticiasToggleBound) return;
window.__noticiasToggleBound = true; window.__noticiasToggleBound = true;
const container = document.getElementById('noticias-container') || document; const container = document.getElementById('noticias-container') || document;
container.addEventListener('click', function (e) { container.addEventListener('click', function (e) {
const btn = e.target.closest('.ver-mas-btn'); const btn = e.target.closest('.ver-mas-btn');
if (!btn) return; if (!btn) return;
@ -123,14 +118,17 @@
const wrap = btn.closest('.resumen-container'); const wrap = btn.closest('.resumen-container');
if (!wrap) return; if (!wrap) return;
const corto = wrap.querySelector('.resumen-corto'); const resumen = wrap.querySelector('.resumen.clamp');
const completo = wrap.querySelector('.resumen-completo'); if (!resumen) return;
if (!corto || !completo) return;
const expanded = completo.style.display === 'block'; const isExpanded = resumen.classList.contains('expanded');
completo.style.display = expanded ? 'none' : 'block'; if (isExpanded) {
corto.style.display = expanded ? 'block' : 'none'; resumen.classList.remove('expanded');
btn.textContent = expanded ? 'Ver más' : 'Ver menos'; btn.textContent = 'Ver más';
} else {
resumen.classList.add('expanded');
btn.textContent = 'Ver menos';
}
}); });
})(); })();
</script> </script>

View file

@ -1,7 +1,9 @@
# translation_worker.py
import os import os
import time import time
import logging import logging
import contextlib import contextlib
import re
from typing import List, Optional from typing import List, Optional
import psycopg2 import psycopg2
@ -62,6 +64,12 @@ def _env_str(name: str, *fallbacks: str, default: Optional[str] = None) -> Optio
return val return val
return default return default
def _env_bool(name: str, default: bool = False) -> bool:
val = os.environ.get(name)
if val is None:
return default
return str(val).strip().lower() in ("1", "true", "yes", "y", "on")
TARGET_LANGS = _env_list("TARGET_LANGS", "TRANSLATE_TO", default="es") TARGET_LANGS = _env_list("TARGET_LANGS", "TRANSLATE_TO", default="es")
BATCH_SIZE = _env_int("BATCH", "TRANSLATOR_BATCH", "TRANSLATE_BATCH", default=8) BATCH_SIZE = _env_int("BATCH", "TRANSLATOR_BATCH", "TRANSLATE_BATCH", default=8)
ENQUEUE_MAX = _env_int("ENQUEUE", "TRANSLATOR_ENQUEUE", "TRANSLATE_ENQUEUE", default=200) ENQUEUE_MAX = _env_int("ENQUEUE", "TRANSLATOR_ENQUEUE", "TRANSLATE_ENQUEUE", default=200)
@ -69,8 +77,8 @@ SLEEP_IDLE = _env_float("SLEEP_IDLE", "TRANSLATOR_SLEEP_IDLE", "TRANSLATE_SLEE
DEVICE_CFG = (_env_str("DEVICE", default="auto") or "auto").lower() # 'cpu' | 'cuda' | 'auto' DEVICE_CFG = (_env_str("DEVICE", default="auto") or "auto").lower() # 'cpu' | 'cuda' | 'auto'
# Límites de tokens (ajusta si ves OOM) # Límites de tokens (ajusta si ves OOM)
MAX_SRC_TOKENS = _env_int("MAX_SRC_TOKENS", default=384) MAX_SRC_TOKENS = _env_int("MAX_SRC_TOKENS", default=512)
MAX_NEW_TOKENS = _env_int("MAX_NEW_TOKENS", default=192) MAX_NEW_TOKENS = _env_int("MAX_NEW_TOKENS", default=256)
# ---- Beams: por defecto 2 para títulos y 1 para cuerpo; respeta NUM_BEAMS si sólo se define ese ---- # ---- Beams: por defecto 2 para títulos y 1 para cuerpo; respeta NUM_BEAMS si sólo se define ese ----
def _beams_from_env(): def _beams_from_env():
@ -91,6 +99,50 @@ NUM_BEAMS_TITLE, NUM_BEAMS_BODY = _beams_from_env()
# Modelo por defecto: NLLB 600M (cámbialo por facebook/nllb-200-1.3B si quieres el 1.3B) # Modelo por defecto: NLLB 600M (cámbialo por facebook/nllb-200-1.3B si quieres el 1.3B)
UNIVERSAL_MODEL = _env_str("UNIVERSAL_MODEL", default="facebook/nllb-200-distilled-600M") UNIVERSAL_MODEL = _env_str("UNIVERSAL_MODEL", default="facebook/nllb-200-distilled-600M")
# ---------- Chunking por frases (para artículos largos) ----------
# Activo por defecto para evitar secuencias > límite del modelo
CHUNK_BY_SENTENCES = _env_bool("CHUNK_BY_SENTENCES", default=True)
CHUNK_MAX_TOKENS = _env_int("CHUNK_MAX_TOKENS", default=900) # <= modelo - margen
CHUNK_OVERLAP_SENTS = _env_int("CHUNK_OVERLAP_SENTS", default=0) # 0 o 1
# Abreviaturas comunes y marcador temporal
_ABBR = ("Sr", "Sra", "Dr", "Dra", "Ing", "Lic", "pág", "etc")
_ABBR_MARK = "§" # no debería aparecer en texto normal
def _protect_abbrev(text: str) -> str:
# Iniciales de una letra: "E.", "A."
t = re.sub(r"\b([A-ZÁÉÍÓÚÑÄÖÜ])\.", r"\1" + _ABBR_MARK, text)
# Abreviaturas de la lista (case-insensitive)
pat = r"\b(?:" + "|".join(map(re.escape, _ABBR)) + r")\."
t = re.sub(pat, lambda m: m.group(0)[:-1] + _ABBR_MARK, t, flags=re.IGNORECASE)
return t
def _restore_abbrev(text: str) -> str:
return text.replace(_ABBR_MARK, ".")
# Regex de corte SIN look-behind variable:
# - Corta tras [.!?…] si hay espacios y luego comienza otra frase (letra mayúscula, comillas, paréntesis, dígito)
# - O cuando hay doble salto de línea
_SENT_SPLIT_RE = re.compile(
r'(?<=[\.!\?…])\s+(?=["\(\[A-ZÁÉÍÓÚÑÄÖÜ0-9])|(?:\n{2,})'
)
def split_into_sentences(text: str) -> List[str]:
text = (text or "").strip()
if not text:
return []
protected = _protect_abbrev(text)
parts = [p.strip() for p in _SENT_SPLIT_RE.split(protected) if p and p.strip()]
parts = [_restore_abbrev(p) for p in parts]
# Une piezas muy cortas con la anterior para más coherencia
merged: List[str] = []
for p in parts:
if merged and len(p) < 40:
merged[-1] = merged[-1] + " " + p
else:
merged.append(p)
return merged
# ---------- Mapeo idiomas a códigos NLLB ---------- # ---------- Mapeo idiomas a códigos NLLB ----------
NLLB_LANG = { NLLB_LANG = {
# básicos # básicos
@ -171,8 +223,8 @@ def fetch_pending_batch(conn, lang_to: str, batch_size: int):
rows = cur.fetchall() rows = cur.fetchall()
if rows: if rows:
ids = [r["tr_id"] for r in rows] ids = [r["tr_id"] for r in rows]
with conn.cursor() as cur: with conn.cursor() as cur2:
cur.execute("UPDATE traducciones SET status='processing' WHERE id = ANY(%s)", (ids,)) cur2.execute("UPDATE traducciones SET status='processing' WHERE id = ANY(%s)", (ids,))
conn.commit() conn.commit()
return rows return rows
@ -277,8 +329,14 @@ def get_universal_components():
_load_model_on(torch.device("cpu")) _load_model_on(torch.device("cpu"))
return _TOKENIZER, _MODEL, _DEVICE return _TOKENIZER, _MODEL, _DEVICE
# ---------- Utilidades ---------- # ---------- Utilidades de tokenización / chunking ----------
def _safe_src_len(tokenizer) -> int:
    """Largest source length we feed the model in one pass.

    Caps MAX_SRC_TOKENS by the tokenizer's own model_max_length minus a
    16-token margin reserved for special tokens / noise.
    """
    limit = getattr(tokenizer, "model_max_length", 1024) or 1024
    return min(MAX_SRC_TOKENS, int(limit) - 16)
def _token_chunks(tokenizer, text: str, max_tokens: int) -> List[str]: def _token_chunks(tokenizer, text: str, max_tokens: int) -> List[str]:
"""Troceo simple por tokens (fallback)"""
if not text: if not text:
return [] return []
ids = tokenizer.encode(text, add_special_tokens=False) ids = tokenizer.encode(text, add_special_tokens=False)
@ -293,8 +351,8 @@ def _token_chunks(tokenizer, text: str, max_tokens: int) -> List[str]:
return chunks return chunks
def _norm(s: str) -> str: def _norm(s: str) -> str:
import re import re as _re
return re.sub(r"\W+", "", (s or "").lower()).strip() return _re.sub(r"\W+", "", (s or "").lower()).strip()
def _forced_bos_id(tokenizer: AutoTokenizer, model: AutoModelForSeq2SeqLM, tgt_code: str) -> int: def _forced_bos_id(tokenizer: AutoTokenizer, model: AutoModelForSeq2SeqLM, tgt_code: str) -> int:
""" """
@ -344,8 +402,13 @@ def _forced_bos_id(tokenizer: AutoTokenizer, model: AutoModelForSeq2SeqLM, tgt_c
LOG.warning("No pude resolver lang code id para '%s'. Uso fallback (eos/bos).", tgt_code) LOG.warning("No pude resolver lang code id para '%s'. Uso fallback (eos/bos).", tgt_code)
return getattr(tokenizer, "eos_token_id", None) or getattr(tokenizer, "bos_token_id", None) or 0 return getattr(tokenizer, "eos_token_id", None) or getattr(tokenizer, "bos_token_id", None) or 0
# ---------- Traducción base ----------
@torch.inference_mode() @torch.inference_mode()
def translate_text(src_lang: str, tgt_lang: str, text: str, num_beams: int = 1, _tries: int = 0) -> str: def translate_text(src_lang: str, tgt_lang: str, text: str, num_beams: int = 1, _tries: int = 0) -> str:
"""
Traduce un texto (usando troceo por tokens si excede MAX_SRC_TOKENS).
Se usa para títulos y como núcleo para chunks de artículos.
"""
if not text or not text.strip(): if not text or not text.strip():
return "" return ""
@ -361,13 +424,14 @@ def translate_text(src_lang: str, tgt_lang: str, text: str, num_beams: int = 1,
forced_bos = _forced_bos_id(tok, mdl, tgt_code) forced_bos = _forced_bos_id(tok, mdl, tgt_code)
parts = _token_chunks(tok, text, MAX_SRC_TOKENS) safe_len = _safe_src_len(tok)
parts = _token_chunks(tok, text, safe_len)
outs: List[str] = [] outs: List[str] = []
try: try:
autocast_ctx = torch.amp.autocast("cuda", dtype=torch.float16) if device.type == "cuda" else contextlib.nullcontext() autocast_ctx = torch.amp.autocast("cuda", dtype=torch.float16) if device.type == "cuda" else contextlib.nullcontext()
for p in parts: for p in parts:
enc = tok(p, return_tensors="pt", truncation=True, max_length=MAX_SRC_TOKENS) enc = tok(p, return_tensors="pt", truncation=True, max_length=safe_len)
enc = {k: v.to(device) for k, v in enc.items()} enc = {k: v.to(device) for k, v in enc.items()}
gen_kwargs = dict( gen_kwargs = dict(
@ -377,7 +441,6 @@ def translate_text(src_lang: str, tgt_lang: str, text: str, num_beams: int = 1,
do_sample=False, do_sample=False,
use_cache=False, # ↓ memoria use_cache=False, # ↓ memoria
) )
# Evita el warning cuando num_beams = 1
if int(num_beams) > 1: if int(num_beams) > 1:
gen_kwargs["early_stopping"] = True gen_kwargs["early_stopping"] = True
@ -411,6 +474,102 @@ def translate_text(src_lang: str, tgt_lang: str, text: str, num_beams: int = 1,
return translate_text(src_lang, tgt_lang, text, num_beams=num_beams, _tries=_tries + 1) return translate_text(src_lang, tgt_lang, text, num_beams=num_beams, _tries=_tries + 1)
raise raise
# ---------- Chunking por frases para artículos ----------
def _sent_token_len(tokenizer, sent: str) -> int:
return len(tokenizer(sent, add_special_tokens=False).input_ids)
def _pack_sentences_to_token_chunks(
tokenizer, sentences: List[str], max_tokens: int, overlap_sents: int = 0
) -> List[List[str]]:
chunks: List[List[str]] = []
cur: List[str] = []
cur_tokens = 0
for s in sentences:
slen = _sent_token_len(tokenizer, s)
if slen > max_tokens:
# Si una sola frase excede el límite, córtala por tokens como último recurso
ids = tokenizer(s, add_special_tokens=False).input_ids
step = max_tokens
for i in range(0, len(ids), step):
sub = tokenizer.decode(ids[i:i+step], skip_special_tokens=True)
if cur:
chunks.append(cur)
cur = []
cur_tokens = 0
chunks.append([sub])
continue
if cur_tokens + slen <= max_tokens:
cur.append(s); cur_tokens += slen
else:
if cur:
chunks.append(cur)
if overlap_sents > 0 and len(cur) > 0:
overlap = cur[-overlap_sents:]
cur = overlap + [s]
cur_tokens = sum(_sent_token_len(tokenizer, x) for x in cur)
else:
cur = [s]; cur_tokens = slen
if cur:
chunks.append(cur)
return chunks
def _smart_concatenate(parts: List[str], tail_window: int = 120) -> str:
"""Une partes evitando duplicados obvios en el borde (heurística ligera)."""
if not parts:
return ""
out = parts[0]
for nxt in parts[1:]:
tail = out[-tail_window:]
cut = 0
for k in range(min(len(tail), len(nxt)), 20, -1):
if nxt.startswith(tail[-k:]):
cut = k
break
out += ("" if cut == 0 else nxt[cut:]) if nxt else ""
return out
def translate_article_full(
    src_lang: str,
    tgt_lang: str,
    text: str,
    num_beams: int,
) -> str:
    """Translate a complete article.

    Sentences are grouped into token-bounded chunks (no variable look-behind
    in the splitter), each chunk is translated via translate_text, and the
    translated pieces are joined with a heuristic that avoids duplicated text
    at the seams. When sentence chunking is disabled, falls back to a single
    pass relying on translate_text's internal truncation.
    """
    if not text or not text.strip():
        return ""
    if not CHUNK_BY_SENTENCES:
        # Fast path: one pass over the whole text.
        return translate_text(src_lang, tgt_lang, text, num_beams=num_beams)

    tokenizer, _, _ = get_universal_components()
    budget = min(CHUNK_MAX_TOKENS, _safe_src_len(tokenizer))

    sentences = split_into_sentences(text)
    if not sentences:
        return ""

    groups = _pack_sentences_to_token_chunks(
        tokenizer, sentences, max_tokens=budget, overlap_sents=CHUNK_OVERLAP_SENTS
    )
    pieces = [
        translate_text(src_lang, tgt_lang, " ".join(group), num_beams=num_beams)
        for group in groups
    ]
    return _smart_concatenate([p for p in pieces if p])
# ---------- Procesamiento por lotes ----------
def process_batch(conn, rows): def process_batch(conn, rows):
for r in rows: for r in rows:
tr_id = r["tr_id"] tr_id = r["tr_id"]
@ -426,9 +585,10 @@ def process_batch(conn, rows):
continue continue
try: try:
# Beams distintos: mejor calidad en títulos con coste de VRAM controlado # Títulos: cortos, traducción directa (beams más altos si quieres)
title_tr = translate_text(lang_from, lang_to, title, num_beams=NUM_BEAMS_TITLE) if title else "" title_tr = translate_text(lang_from, lang_to, title, num_beams=NUM_BEAMS_TITLE) if title else ""
body_tr = translate_text(lang_from, lang_to, body, num_beams=NUM_BEAMS_BODY) if body else "" # Cuerpo/resumen: artículo completo con chunking por frases
body_tr = translate_article_full(lang_from, lang_to, body, num_beams=NUM_BEAMS_BODY) if body else ""
# Si la "traducción" es igual al original, déjala vacía # Si la "traducción" es igual al original, déjala vacía
if _norm(title_tr) == _norm(title): if _norm(title_tr) == _norm(title):
@ -443,8 +603,10 @@ def process_batch(conn, rows):
def main(): def main():
LOG.info( LOG.info(
"Arrancando worker de traducción (NLLB). TARGET_LANGS=%s, BATCH=%s, ENQUEUE=%s, DEVICE=%s, BEAMS(title/body)=%s/%s", "Arrancando worker de traducción (NLLB). TARGET_LANGS=%s, BATCH=%s, ENQUEUE=%s, DEVICE=%s, "
TARGET_LANGS, BATCH_SIZE, ENQUEUE_MAX, DEVICE_CFG, NUM_BEAMS_TITLE, NUM_BEAMS_BODY "BEAMS(title/body)=%s/%s, CHUNK_BY_SENTENCES=%s, CHUNK_MAX_TOKENS=%s, OVERLAP_SENTS=%s",
TARGET_LANGS, BATCH_SIZE, ENQUEUE_MAX, DEVICE_CFG, NUM_BEAMS_TITLE, NUM_BEAMS_BODY,
CHUNK_BY_SENTENCES, CHUNK_MAX_TOKENS, CHUNK_OVERLAP_SENTS
) )
# Pre-carga el modelo una vez para reservar memoria de forma limpia # Pre-carga el modelo una vez para reservar memoria de forma limpia
get_universal_components() get_universal_components()