UI and search fixes

jlimolina 2025-11-21 04:42:02 +01:00
parent cb8f69fb93
commit fc06566928
15 changed files with 1115 additions and 435 deletions

app.py
View file

@@ -3,6 +3,7 @@ import csv
 import io
 import time
 import socket
+import zipfile
 
 from datetime import datetime, date
 from concurrent.futures import ThreadPoolExecutor, as_completed
@@ -337,21 +338,48 @@ def home():
         params.append(int(continente_id))
     if q:
-        where.append("n.tsv @@ plainto_tsquery('spanish', %s)")
-        params.append(q)
+        search_like = f"%{q}%"
+        if use_tr:
+            where.append(
+                """
+                (
+                    n.tsv @@ websearch_to_tsquery('spanish', %s)
+                    OR t.titulo_trad ILIKE %s
+                    OR t.resumen_trad ILIKE %s
+                    OR n.titulo ILIKE %s
+                    OR n.resumen ILIKE %s
+                )
+                """
+            )
+            params.extend([q, search_like, search_like, search_like, search_like])
+        else:
+            where.append(
+                """
+                (
+                    n.tsv @@ websearch_to_tsquery('spanish', %s)
+                    OR n.titulo ILIKE %s
+                    OR n.resumen ILIKE %s
+                )
+                """
+            )
+            params.extend([q, search_like, search_like])
     where_sql = " AND ".join(where)
     with conn.cursor(cursor_factory=extras.DictCursor) as cur:
         cur.execute(
             f"""
-            SELECT COUNT(*)
+            SELECT COUNT(DISTINCT n.id)
             FROM noticias n
             LEFT JOIN categorias c ON c.id = n.categoria_id
             LEFT JOIN paises p ON p.id = n.pais_id
+            LEFT JOIN traducciones t
+              ON t.noticia_id = n.id
+             AND t.lang_to = %s
+             AND t.status = 'done'
             WHERE {where_sql}
             """,
-            params,
+            [lang] + params,
         )
         total_results = cur.fetchone()[0] if cur.rowcount else 0
         total_pages = (total_results // per_page) + (1 if total_results % per_page else 0)
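For context on the `plainto_tsquery` → `websearch_to_tsquery` switch above: the latter accepts free-form, user-style queries (quoted phrases, `OR`, a leading `-` for negation) instead of simply AND-ing every word. A minimal sketch of the difference, assuming any PostgreSQL 11+ psycopg2 connection `conn`; the sample query string is hypothetical:

import psycopg2

with conn.cursor() as cur:
    # plainto_tsquery ANDs every lexeme; operators in the input are not interpreted
    cur.execute("SELECT plainto_tsquery('spanish', %s)", ("elecciones OR guerra",))
    print(cur.fetchone()[0])  # roughly: 'eleccion' & 'or' & 'guerr'
    # websearch_to_tsquery understands OR, "quoted phrases" and -negation
    cur.execute("SELECT websearch_to_tsquery('spanish', %s)", ("elecciones OR guerra",))
    print(cur.fetchone()[0])  # roughly: 'eleccion' | 'guerr'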
@@ -925,7 +953,7 @@ def add_url_source():
             """
             INSERT INTO fuentes_url (nombre, url, categoria_id, pais_id, idioma)
             VALUES (%s, %s, %s, %s, %s)
-            ON CONFLICT (url_norm) DO UPDATE
+            ON CONFLICT (url) DO UPDATE
             SET nombre = EXCLUDED.nombre,
                 categoria_id = EXCLUDED.categoria_id,
                 pais_id = EXCLUDED.pais_id,
@@ -1172,8 +1200,6 @@ def scrape_url():
 @app.route("/backup_completo")
 def backup_completo():
-    import zipfile
-
     mem_file = io.BytesIO()
     with zipfile.ZipFile(mem_file, mode="w", compression=zipfile.ZIP_DEFLATED) as zf:
         with get_conn() as conn, conn.cursor(cursor_factory=extras.DictCursor) as cur:
@@ -1207,6 +1233,64 @@ def backup_completo():
     )
 
+
+@app.route("/restore_completo", methods=["GET", "POST"])
+def restore_completo():
+    if request.method == "GET":
+        return render_template("restore_completo.html")
+
+    file = request.files.get("backup_file")
+    if not file or file.filename == "":
+        flash("No se ha seleccionado ningún archivo.", "error")
+        return redirect(url_for("restore_completo"))
+
+    filename = file.filename.lower()
+    if not filename.endswith(".zip"):
+        flash("El archivo debe ser un .zip.", "error")
+        return redirect(url_for("restore_completo"))
+
+    raw = file.read()
+    try:
+        zf = zipfile.ZipFile(io.BytesIO(raw))
+    except zipfile.BadZipFile:
+        flash("El archivo no es un .zip válido.", "error")
+        return redirect(url_for("restore_completo"))
+
+    restored_counts = {}
+    conn = get_conn()
+    try:
+        with conn:
+            with conn.cursor() as cur:
+                if "feeds.csv" in zf.namelist():
+                    cur.execute("TRUNCATE TABLE feeds RESTART IDENTITY;")
+                    with zf.open("feeds.csv") as f:
+                        text_f = io.TextIOWrapper(f, encoding="utf-8")
+                        cur.copy_expert("COPY feeds FROM STDIN CSV HEADER", text_f)
+                    restored_counts["feeds"] = cur.rowcount if cur.rowcount is not None else 0
+                if "fuentes_url.csv" in zf.namelist():
+                    cur.execute("TRUNCATE TABLE fuentes_url RESTART IDENTITY;")
+                    with zf.open("fuentes_url.csv") as f2:
+                        text_f2 = io.TextIOWrapper(f2, encoding="utf-8")
+                        cur.copy_expert("COPY fuentes_url FROM STDIN CSV HEADER", text_f2)
+                    restored_counts["fuentes_url"] = cur.rowcount if cur.rowcount is not None else 0
+    except Exception as e:
+        conn.rollback()
+        conn.close()
+        flash(f"Error al restaurar el backup: {e}", "error")
+        return redirect(url_for("restore_completo"))
+
+    conn.close()
+    if restored_counts:
+        partes = [f"{tabla}: {n} filas" for tabla, n in restored_counts.items()]
+        flash("Restauración completada: " + ", ".join(partes), "success")
+    else:
+        flash("Backup procesado pero no se encontraron ficheros reconocidos (feeds.csv, fuentes_url.csv).", "warning")
+    return redirect(url_for("dashboard"))
+
+
 if __name__ == "__main__":
     app.run(host="0.0.0.0", port=8001, debug=True)
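The restore route above relies on the backup having been written with `COPY ... TO STDOUT CSV HEADER`, which is what makes the `COPY ... FROM STDIN CSV HEADER` round-trip work. A minimal sketch of the matching export side, assuming the same `get_conn()` helper; `export_tables_zip` is a hypothetical name, not a function in the repo:

import io
import zipfile

def export_tables_zip(conn, tables=("feeds", "fuentes_url")) -> bytes:
    """Dump each table as CSV-with-header into an in-memory zip."""
    mem = io.BytesIO()
    with zipfile.ZipFile(mem, mode="w", compression=zipfile.ZIP_DEFLATED) as zf:
        with conn.cursor() as cur:
            for table in tables:
                buf = io.StringIO()
                # copy_expert streams the table out in the same CSV dialect
                # that the restore route feeds back in
                cur.copy_expert(f"COPY {table} TO STDOUT CSV HEADER", buf)
                zf.writestr(f"{table}.csv", buf.getvalue())
    return mem.getvalue()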

View file

@@ -141,7 +141,7 @@ services:
       - DB_NAME=${DB_NAME}
       - DB_USER=${DB_USER}
       - DB_PASS=${DB_PASS}
-      - EMB_MODEL=sentence-transformers/all-MiniLM-L6-v2
+      - EMB_MODEL=sentence-transformers/paraphrase-multilingual-mpnet-base-v2
       - EMB_BATCH=256
       - EMB_SLEEP_IDLE=5
       - EMB_LANGS=es

embeddings_worker.py
View file

@@ -1,16 +1,12 @@
-# embeddings_worker.py
-# Embeddings worker for TRANSLATIONS:
-#  - Reads translations with status='done' that lack an embedding for a given model
-#  - Computes the embedding (Sentence-Transformers) over titulo_trad + resumen_trad
-#  - Stores it in traduccion_embeddings (traduccion_id, model, dim, embedding)
 import os
 import time
 import logging
 from typing import List
 
 import numpy as np
 import psycopg2
 import psycopg2.extras
+from psycopg2.extras import execute_values
 from sentence_transformers import SentenceTransformer
 import torch
@@ -30,7 +26,7 @@ DB = dict(
 # Default model: multilingual, works well across many languages
 EMB_MODEL = os.environ.get(
     "EMB_MODEL",
-    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
+    "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
 )
 EMB_BATCH = int(os.environ.get("EMB_BATCH", "128"))
 SLEEP_IDLE = float(os.environ.get("EMB_SLEEP_IDLE", "5.0"))
@@ -65,8 +61,13 @@ def ensure_schema(conn):
             );
             """
         )
-        cur.execute("CREATE INDEX IF NOT EXISTS idx_tr_emb_model ON traduccion_embeddings(model);")
-        cur.execute("CREATE INDEX IF NOT EXISTS idx_tr_emb_trid ON traduccion_embeddings(traduccion_id);")
+        # Aligned with init-db/08-embeddings.sql
+        cur.execute(
+            "CREATE INDEX IF NOT EXISTS idx_tr_emb_model ON traduccion_embeddings(model);"
+        )
+        cur.execute(
+            "CREATE INDEX IF NOT EXISTS idx_tr_emb_traduccion_id ON traduccion_embeddings(traduccion_id);"
+        )
     conn.commit()
@@ -104,7 +105,7 @@ def texts_from_rows(rows: List[psycopg2.extras.DictRow]) -> List[str]:
     Composes the text to vectorize for each translation:
     'titulo_trad' + '\n' + 'resumen_trad'. If either is missing, uses what is available.
     """
-    texts = []
+    texts: List[str] = []
     for r in rows:
         title = (r["titulo_trad"] or "").strip()
         body = (r["resumen_trad"] or "").strip()
@@ -117,23 +118,36 @@ def texts_from_rows(rows: List[psycopg2.extras.DictRow]) -> List[str]:
 def upsert_embeddings(conn, rows, embs: np.ndarray, model_name: str):
     """
-    Inserts/updates embeddings per translation.
+    Inserts/updates embeddings per translation in bulk (batch insert).
     """
     if embs.size == 0 or not rows:
         return
     dim = int(embs.shape[1])
+    # Prepare the data for execute_values
+    data = [
+        (
+            int(r["traduccion_id"]),
+            model_name,
+            dim,
+            [float(x) for x in e],
+        )
+        for r, e in zip(rows, embs)
+    ]
     with conn.cursor() as cur:
-        for r, e in zip(rows, embs):
-            cur.execute(
-                """
-                INSERT INTO traduccion_embeddings (traduccion_id, model, dim, embedding)
-                VALUES (%s, %s, %s, %s)
-                ON CONFLICT (traduccion_id, model) DO UPDATE
-                SET embedding = EXCLUDED.embedding,
-                    dim = EXCLUDED.dim,
-                    created_at = NOW()
-                """,
-                (int(r["traduccion_id"]), model_name, dim, list(map(float, e))),
-            )
+        execute_values(
+            cur,
+            """
+            INSERT INTO traduccion_embeddings (traduccion_id, model, dim, embedding)
+            VALUES %s
+            ON CONFLICT (traduccion_id, model) DO UPDATE
+            SET embedding = EXCLUDED.embedding,
+                dim = EXCLUDED.dim,
+                created_at = NOW()
+            """,
+            data,
+        )
     conn.commit()
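`execute_values` expands the single `VALUES %s` placeholder into one multi-row INSERT, which is why the per-row loop above collapses into one round-trip. A self-contained sketch of the pattern with an invented `demo` table (the DSN is hypothetical too):

import psycopg2
from psycopg2.extras import execute_values

conn = psycopg2.connect("dbname=test")  # hypothetical DSN
rows = [(1, "a"), (2, "b"), (3, "c")]
with conn.cursor() as cur:
    # Expanded server-side to: VALUES (1,'a'),(2,'b'),(3,'c')
    execute_values(
        cur,
        "INSERT INTO demo (id, label) VALUES %s "
        "ON CONFLICT (id) DO UPDATE SET label = EXCLUDED.label",
        rows,
        page_size=100,  # rows per generated statement
    )
conn.commit()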

View file

@@ -29,7 +29,7 @@ SELECT
     te.dim,
     te.embedding AS vec
 FROM traduccion_embeddings te
-WHERE te.model = 'sentence-transformers/all-MiniLM-L6-v2';
+WHERE te.model = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2';
 
 CREATE TABLE IF NOT EXISTS related_noticias (
     traduccion_id INT NOT NULL REFERENCES traducciones(id) ON DELETE CASCADE,

View file

@ -1,62 +0,0 @@
-- Función para “normalizar” URLs: minúsculas en esquema/host, quitar www,
-- quitar fragmentos (#...), limpiar trackers comunes (utm_*, gclid, fbclid, ref, etc.),
-- colapsar dobles “/”, y quitar la “/” final salvo si es la raíz.
CREATE OR REPLACE FUNCTION normalize_url(in_url text)
RETURNS text
LANGUAGE plpgsql
AS $$
DECLARE
u text := trim(in_url);
scheme_host text;
path_q text;
BEGIN
IF u IS NULL OR u = '' THEN
RETURN NULL;
END IF;
-- quitar espacios y fragmentos
u := regexp_replace(u, '#.*$', '', 'i');
-- separar esquema+host de path+query
-- ej: https://example.com:443/foo?bar -> scheme_host=https://example.com:443 ; path_q=/foo?bar
scheme_host := substring(u FROM '^[a-z]+://[^/]*');
IF scheme_host IS NULL THEN
-- si no hay esquema, asumimos http
u := 'http://' || u;
scheme_host := substring(u FROM '^[a-z]+://[^/]*');
END IF;
path_q := substring(u FROM '^[a-z]+://[^/]*(/.*)$');
IF path_q IS NULL THEN
path_q := '/';
END IF;
-- normalizar esquema y host (minúsculas, quitar www.)
scheme_host := lower(scheme_host);
scheme_host := regexp_replace(scheme_host, '^(https?://)www\.', '\1', 'i');
-- quitar puerto por defecto (:80 en http, :443 en https)
scheme_host := regexp_replace(scheme_host, '^http://([^/:]+):80$', 'http://\1', 'i');
scheme_host := regexp_replace(scheme_host, '^https://([^/:]+):443$', 'https://\1', 'i');
-- limpiar parámetros de tracking en la query
-- elimina ?utm_... &utm_... gclid fbclid mc_cid mc_eid ref ref_src etc.
path_q := regexp_replace(path_q, '([?&])(utm_[^=&]+|gclid|fbclid|mc_cid|mc_eid|ref|ref_src|yclid|igshid)=[^&#]*', '\1', 'gi');
-- limpiar conectores sobrantes ?, &, &&, ?&, etc.
path_q := regexp_replace(path_q, '\?&+', '?', 'g');
path_q := regexp_replace(path_q, '&{2,}', '&', 'g');
path_q := regexp_replace(path_q, '\?$', '', 'g');
path_q := regexp_replace(path_q, '\?$','', 'g');
-- colapsar dobles barras en path (no tocar “://”)
path_q := regexp_replace(path_q, '/{2,}', '/', 'g');
-- quitar “/” final si no es la raíz
IF path_q <> '/' THEN
path_q := regexp_replace(path_q, '/+$', '', 'g');
END IF;
RETURN scheme_host || path_q;
END;
$$;
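Since this commit deletes the function (the UPSERT in app.py now conflicts on the raw `url` instead of `url_norm`), a hedged smoke test documents what it used to do; this assumes a live psycopg2 connection `conn` against a database where `normalize_url` still exists, and the sample URL is invented:

with conn.cursor() as cur:
    cur.execute(
        "SELECT normalize_url(%s)",
        ("https://www.Example.com//a/b/?utm_source=x&id=7#frag",),
    )
    # host lowercased, www and utm_* stripped, '//' collapsed, fragment dropped
    print(cur.fetchone()[0])  # expected: https://example.com/a/b?id=7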

View file

@ -1,38 +0,0 @@
-- Añadir columna generada url_norm y crear índice único sobre ella.
-- OJO: si ya existen duplicados, este índice fallará.
-- Primero crea la columna si no existe:
DO $$
BEGIN
IF NOT EXISTS (
SELECT 1 FROM information_schema.columns
WHERE table_name='feeds' AND column_name='url_norm'
) THEN
ALTER TABLE feeds
ADD COLUMN url_norm text GENERATED ALWAYS AS (normalize_url(url)) STORED;
END IF;
END $$;
-- Índice único (concurrently para no bloquear). Requiere estar fuera de transacción.
-- Si tu herramienta corre todo en una transacción, ejecuta estas dos líneas aparte.
-- Quita duplicados antes si da error.
CREATE UNIQUE INDEX CONCURRENTLY IF NOT EXISTS feeds_url_norm_uniq ON feeds (url_norm)
WHERE url_norm IS NOT NULL;
-- (Opcional) repetir lo mismo para fuentes_url y noticias si quieres esa garantía también:
DO $$
BEGIN
IF NOT EXISTS (
SELECT 1 FROM information_schema.columns
WHERE table_name='fuentes_url' AND column_name='url_norm'
) THEN
ALTER TABLE fuentes_url
ADD COLUMN url_norm text GENERATED ALWAYS AS (normalize_url(url)) STORED;
END IF;
END $$;
CREATE UNIQUE INDEX CONCURRENTLY IF NOT EXISTS fuentes_url_norm_uniq ON fuentes_url (url_norm)
WHERE url_norm IS NOT NULL;

View file

@@ -2,6 +2,10 @@ import os
 import time
 import logging
 import re
+import string
+from typing import List, Tuple
+from collections import Counter
+
 import psycopg2
 import psycopg2.extras
 import spacy
@@ -28,30 +32,120 @@ ENT_LABELS = {
 }
 
 _ws_re = re.compile(r"\s+")
 
 HTML_TRASH_PATTERNS = [
     r"<[^>]+>",
     r"&[a-z]+;",
+    r"&#\d+;?",  # numeric entities like &#8230;
     r'width="\d+"',
     r'height="\d+"',
 ]
 
+# Words/phrases that are too generic or clearly noise
 GENERIC_BAD_TAGS = {
     "república",
     "estado",
     "centro",
     "gobierno",
+    "el gobierno",
+    "gobiernos",
     "report",
     "sp",
     "unión",
+    "union",
+    "dólares",
+    "dolar",
+    "dólar",
+    "the post",
+    "post",
+    "artículo",
+    "el artículo",
+    "la ciudad",
+    "mundo",
+    "país",
+    "pais",
+    "países",
+    "paises",
+    "la noche",
+    "la publicación",
+    "este miércoles",
+    "el miércoles",
+    "hoy",
+    "ayer",
+    "mañana",
+    "servicio",
+    "servicios",
+    "el presidente",
+    "presidente",
+    "el ministro",
+    "ministro",
+    "la guerra",
+    "guerra",
+    "seguridad",
+    "wp-content",
+    "internal_photos",
+    "/internal_photos",
+    "https",
+    "http",
+    "src",
 }
 
-def clean_tag_text(text):
+STOPWORDS = set()
+
+ARTICLES = {
+    "el",
+    "la",
+    "los",
+    "las",
+    "un",
+    "una",
+    "uno",
+    "al",
+    "del",
+}
+
+TOPIC_MIN_CHARS = 4
+TOPIC_MAX_WORDS = 6
+TOPIC_MAX_PER_DOC = 15
+
+
+def _looks_like_attr_or_path(text_lower: str) -> bool:
+    """Detects things like paths, HTML attributes, odd ids, etc."""
+    if text_lower.startswith("/"):
+        return True
+    if "http://" in text_lower or "https://" in text_lower:
+        return True
+    if ".com" in text_lower or ".net" in text_lower or ".org" in text_lower:
+        return True
+    if any(ext in text_lower for ext in (".jpg", ".jpeg", ".png", ".gif", ".webp")):
+        return True
+    if re.search(r"\b(src|alt|style|class)\s*=", text_lower):
+        return True
+    if "data-" in text_lower:
+        return True
+    if re.search(r"&#\d+;?", text_lower):
+        return True
+    # attribute=value style strings
+    if "=" in text_lower and " " not in text_lower.strip():
+        return True
+    # long runs without spaces (ids, hashes…)
+    if re.fullmatch(r"[a-z0-9_]{15,}", text_lower.replace("-", "").replace(" ", "")):
+        return True
+    # a single hyphenated word is usually a path/slug: wp-content, internal-photos…
+    if "-" in text_lower and " " not in text_lower:
+        return True
+    return False
+
+
+def clean_tag_text(text: str) -> str | None:
+    """Cleanup for entities (PERSON/ORG/GPE/LOC)."""
     if not text:
         return None
     text = BeautifulSoup(text, "html.parser").get_text()
     for pat in HTML_TRASH_PATTERNS:
         text = re.sub(pat, "", text)
     text = _ws_re.sub(" ", text).strip()
+    text = text.strip(string.punctuation + " ")
     if len(text) < 3:
         return None
     if re.search(r"[<>/\\]", text):
@@ -59,13 +153,15 @@ def clean_tag_text(text):
     lower = text.lower()
     if lower.startswith("href="):
         return None
-    if lower.startswith("http"):
+    if _looks_like_attr_or_path(lower):
         return None
     if lower in GENERIC_BAD_TAGS:
         return None
+
     replacements = {
         "ee.uu.": "Estados Unidos",
         "los estados unidos": "Estados Unidos",
+        "eeuu": "Estados Unidos",
         "eu": "Unión Europea",
         "ue": "Unión Europea",
         "kosova": "Kosovo",
@@ -75,13 +171,112 @@ def clean_tag_text(text):
     return text
 
 
+def clean_topic_text(text: str) -> str | None:
+    """Cleanup for potential 'topics' (noun_chunks)."""
+    if not text:
+        return None
+    text = BeautifulSoup(text, "html.parser").get_text()
+    for pat in HTML_TRASH_PATTERNS:
+        text = re.sub(pat, "", text)
+    text = _ws_re.sub(" ", text).strip()
+    text = text.strip(string.punctuation + " ")
+    if len(text) < TOPIC_MIN_CHARS:
+        return None
+
+    lower = text.lower()
+    if _looks_like_attr_or_path(lower):
+        return None
+
+    # tokenize in lowercase and strip punctuation
+    tokens = [
+        t.strip(string.punctuation)
+        for t in lower.split()
+        if t.strip(string.punctuation)
+    ]
+    if not tokens:
+        return None
+
+    # drop a leading article if present
+    if tokens and tokens[0] in ARTICLES:
+        tokens = tokens[1:]
+    if not tokens:
+        return None
+
+    # rebuild the normalized text without the article
+    norm = " ".join(tokens).strip()
+    if len(norm) < TOPIC_MIN_CHARS:
+        return None
+    if norm in GENERIC_BAD_TAGS:
+        return None
+
+    # hard cap on word count
+    if len(tokens) > TOPIC_MAX_WORDS:
+        return None
+
+    # all stopwords => discard
+    if all(t in STOPWORDS for t in tokens):
+        return None
+
+    # digits/dates only
+    if re.fullmatch(r"[0-9\s\.,\-:/]+", norm):
+        return None
+
+    return norm
+
+
 def get_conn():
     return psycopg2.connect(**DB)
 
 
+def extract_entities_and_topics(nlp, text: str) -> Tuple[List[Tuple[str, str]], List[Tuple[str, str]]]:
+    ents: List[Tuple[str, str]] = []
+    topics: List[Tuple[str, str]] = []
+    if not text:
+        return ents, topics
+
+    doc = nlp(text)
+
+    # "Classic" entities
+    for ent in doc.ents:
+        tipo = ENT_LABELS.get(ent.label_)
+        if not tipo:
+            continue
+        val = clean_tag_text(ent.text)
+        if not val:
+            continue
+        ents.append((val, tipo))
+
+    # "Topic" candidates from noun_chunks
+    topic_counter: Counter[str] = Counter()
+    for chunk in doc.noun_chunks:
+        val = clean_topic_text(chunk.text)
+        if not val:
+            continue
+        topic_counter[val] += 1
+
+    ent_values = {v for (v, _) in ents}
+    for val, _count in topic_counter.most_common(TOPIC_MAX_PER_DOC):
+        if val in ent_values:
+            continue
+        topics.append((val, "tema"))
+
+    # drop duplicates
+    ents = list(set(ents))
+    topics = list(set(topics))
+    return ents, topics
+
+
 def main():
-    nlp = spacy.load("es_core_news_md", disable=["parser", "lemmatizer", "textcat"])
-    logging.info("spaCy cargado: es_core_news_md")
+    global STOPWORDS
+
+    nlp = spacy.load("es_core_news_md", disable=["lemmatizer", "textcat"])
+    STOPWORDS = set(nlp.Defaults.stop_words)
+    logging.info("spaCy cargado: es_core_news_md (NER + parser)")
 
     while True:
         try:
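A quick hedged illustration of the new topic filter (hypothetical inputs, evaluated against the helpers above; note STOPWORDS is empty until main() loads the spaCy model, which only affects the all-stopwords check):

print(clean_topic_text("Los <b>precios de la energía</b>"))
# -> "precios de la energía"  (HTML stripped, leading article dropped)
print(clean_topic_text("el gobierno"))
# -> None  (article dropped, then "gobierno" hits GENERIC_BAD_TAGS)
print(clean_topic_text("wp-content/uploads/foto.jpg"))
# -> None  (_looks_like_attr_or_path: image extension)
print(clean_topic_text("12/11/2025"))
# -> None  (digits/dates only)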
@@ -109,7 +304,7 @@ def main():
                     time.sleep(5)
                     continue
 
-                logging.info(f"Procesando {len(rows)} traducciones para NER...")
+                logging.info(f"Procesando {len(rows)} traducciones para NER/temas...")
 
                 new_links = 0
@@ -118,22 +313,12 @@ def main():
                     if not text:
                         continue
 
-                    doc = nlp(text)
-                    ents = []
-
-                    for ent in doc.ents:
-                        tipo = ENT_LABELS.get(ent.label_)
-                        if not tipo:
-                            continue
-                        val = clean_tag_text(ent.text)
-                        if not val:
-                            continue
-                        ents.append((val, tipo))
-
-                    if not ents:
+                    ents, topics = extract_entities_and_topics(nlp, text)
+                    all_tags = ents + topics
+                    if not all_tags:
                         continue
 
-                    for valor, tipo in set(ents):
+                    for valor, tipo in all_tags:
                         try:
                             cur.execute(
                                 """
@@ -160,9 +345,9 @@ def main():
                             logging.exception("Fallo insertando tag/relación")
 
                 conn.commit()
-                logging.info(f"NER lote OK. Nuevos enlaces: {new_links}.")
+                logging.info(f"NER/temas lote OK. Nuevos enlaces: {new_links}.")
 
-        except Exception as e:
-            logging.exception(f"Error en NER loop: {e}")
+        except Exception:
+            logging.exception("Error en NER loop")
             time.sleep(5)

View file

@@ -1,9 +1,9 @@
 import os
 import time
-import math
 import logging
 from typing import List, Tuple
 
+import numpy as np
 import psycopg2
 import psycopg2.extras
@@ -22,7 +22,6 @@ DB = dict(
 TOPK = int(os.environ.get("RELATED_TOPK", 10))
 BATCH_IDS = int(os.environ.get("RELATED_BATCH_IDS", 200))
-BATCH_SIM = int(os.environ.get("RELATED_BATCH_SIM", 2000))
 SLEEP_IDLE = float(os.environ.get("RELATED_SLEEP", 10))
 MIN_SCORE = float(os.environ.get("RELATED_MIN_SCORE", 0.0))
 WINDOW_HOURS = int(os.environ.get("RELATED_WINDOW_H", 0))
@@ -32,44 +31,64 @@ def get_conn():
     return psycopg2.connect(**DB)
 
 
+# ---------------------------------------------------------
+# Load embeddings ONLY for Spanish translations (lang_to='es')
+# ---------------------------------------------------------
 def _fetch_all_embeddings(cur):
-    if WINDOW_HOURS > 0:
-        cur.execute(
-            """
-            SELECT e.traduccion_id, e.vec
-            FROM embeddings e
-            JOIN traducciones t ON t.id = e.traduccion_id
-            JOIN noticias n ON n.id = t.noticia_id
-            WHERE n.fecha >= NOW() - INTERVAL %s
-            """,
-            (f"{WINDOW_HOURS} hours",),
-        )
-    else:
-        cur.execute("SELECT traduccion_id, vec FROM embeddings")
+    base_sql = """
+        SELECT e.traduccion_id, e.vec
+        FROM embeddings e
+        JOIN traducciones t ON t.id = e.traduccion_id
+        JOIN noticias n ON n.id = t.noticia_id
+        WHERE t.lang_to = 'es'
+    """
+    params = []
+    if WINDOW_HOURS > 0:
+        base_sql += " AND n.fecha >= NOW() - INTERVAL %s"
+        params.append(f"{WINDOW_HOURS} hours")
+    cur.execute(base_sql, params)
     rows = cur.fetchall()
     if not rows:
-        return [], [], []
+        return [], None
     ids = []
     vecs = []
-    norms = []
-    for tr_id, v in rows:
+    for tid, v in rows:
         if v is None:
-            v = []
-        nrm = math.sqrt(sum(((x or 0.0) * (x or 0.0)) for x in v)) or 1e-8
-        ids.append(tr_id)
+            continue
+        ids.append(tid)
         vecs.append(v)
-        norms.append(nrm)
-    return ids, vecs, norms
+    if not ids:
+        return [], None
+    # Convert to a numpy matrix
+    mat = np.array(vecs, dtype=np.float32)
+    # Normalize rows (guarding against division by zero)
+    norms = np.linalg.norm(mat, axis=1, keepdims=True)
+    norms[norms == 0] = 1e-8
+    mat = mat / norms
+    return ids, mat
 
 
+# ---------------------------------------------------------
+# Fetch pending IDs
+# ---------------------------------------------------------
 def _fetch_pending_ids(cur, limit) -> List[int]:
     cur.execute(
         """
         SELECT e.traduccion_id
         FROM embeddings e
-        LEFT JOIN related_noticias r ON r.traduccion_id = e.traduccion_id
+        JOIN traducciones t ON t.id = e.traduccion_id
+        LEFT JOIN related_noticias r
+               ON r.traduccion_id = e.traduccion_id
+        WHERE t.lang_to = 'es'
         GROUP BY e.traduccion_id
         HAVING COUNT(r.related_traduccion_id) = 0
         ORDER BY e.traduccion_id DESC
@@ -80,42 +99,44 @@ def _fetch_pending_ids(cur, limit) -> List[int]:
     return [r[0] for r in cur.fetchall()]
 
 
-def _cosine_with_norms(a, b, na, nb):
-    num = 0.0
-    for x, y in zip(a, b):
-        xv = x or 0.0
-        yv = y or 0.0
-        num += xv * yv
-    denom = na * nb
-    if denom <= 0.0:
-        return 0.0
-    return num / denom
-
-
-def _topk_for_one(
+# ---------------------------------------------------------
+# TOP-K using NumPy (very fast)
+# ---------------------------------------------------------
+def _topk_numpy(
     idx: int,
     ids_all: List[int],
-    vecs_all: List[List[float]],
-    norms_all: List[float],
-    pool_indices: List[int],
-    K: int,
+    mat: np.ndarray,
+    K: int
 ) -> List[Tuple[int, float]]:
-    me_vec = vecs_all[idx]
-    me_norm = norms_all[idx]
-    out: List[Tuple[int, float]] = []
-    for j in pool_indices:
-        if j == idx:
-            continue
-        s = _cosine_with_norms(me_vec, vecs_all[j], me_norm, norms_all[j])
-        out.append((ids_all[j], s))
-    out.sort(key=lambda t: t[1], reverse=True)
-    if MIN_SCORE > 0.0:
-        out = [p for p in out if p[1] >= MIN_SCORE]
-    return out[:K]
+    # vector of the focal article
+    q = mat[idx]  # (dim,)
+
+    # cosine similarities: dot product (matrix · vector)
+    sims = np.dot(mat, q)
+
+    # remove the self-match
+    sims[idx] = -999.0
+
+    # filter by minimum score
+    if MIN_SCORE > 0:
+        mask = sims >= MIN_SCORE
+        sims = np.where(mask, sims, -999.0)
+
+    # get the top-k indices (much faster than sorting everything)
+    if K >= len(sims):
+        top_idx = np.argsort(-sims)
+    else:
+        part = np.argpartition(-sims, K)[:K]
+        top_idx = part[np.argsort(-sims[part])]
+
+    out = [(ids_all[j], float(sims[j])) for j in top_idx[:K]]
+    return out
 
 
+# ---------------------------------------------------------
+# Insert into the related_noticias table
+# ---------------------------------------------------------
 def _insert_related(cur, tr_id: int, pairs: List[Tuple[int, float]]):
     if not pairs:
         return
@@ -127,48 +148,47 @@ def _insert_related(cur, tr_id: int, pairs: List[Tuple[int, float]]):
             ON CONFLICT (traduccion_id, related_traduccion_id)
             DO UPDATE SET score = EXCLUDED.score
             """,
-        [(tr_id, rid, float(score)) for (rid, score) in pairs],
+        [(tr_id, rid, score) for (rid, score) in pairs],
     )
 
 
+# ---------------------------------------------------------
+# Process target IDs
+# ---------------------------------------------------------
 def build_for_ids(conn, target_ids: List[int]) -> int:
     with conn.cursor() as cur:
-        ids_all, vecs_all, norms_all = _fetch_all_embeddings(cur)
-    if not ids_all:
+        ids_all, mat = _fetch_all_embeddings(cur)
+
+    if not ids_all or mat is None:
         return 0
+
+    # Map ID → index
     pos = {tid: i for i, tid in enumerate(ids_all)}
-    n = len(ids_all)
+
     processed = 0
     with conn.cursor() as cur:
         for tr_id in target_ids:
             if tr_id not in pos:
                 continue
-            i = pos[tr_id]
-            top: List[Tuple[int, float]] = []
-            for start in range(0, n, BATCH_SIM):
-                block = list(range(start, min(start + BATCH_SIM, n)))
-                candidates = _topk_for_one(i, ids_all, vecs_all, norms_all, block, TOPK)
-                top += candidates
-            top.sort(key=lambda t: t[1], reverse=True)
-            if len(top) > TOPK:
-                top = top[:TOPK]
-            _insert_related(cur, tr_id, top)
+
+            idx = pos[tr_id]
+            pairs = _topk_numpy(idx, ids_all, mat, TOPK)
+            _insert_related(cur, tr_id, pairs)
             processed += 1
         conn.commit()
     return processed
 
 
+# ---------------------------------------------------------
+# MAIN
+# ---------------------------------------------------------
 def main():
     logging.info(
-        "Iniciando related_worker (TOPK=%s, BATCH_IDS=%s, BATCH_SIM=%s, MIN_SCORE=%.3f, WINDOW_H=%s)",
+        "Iniciando related_worker (TOPK=%s, BATCH_IDS=%s, MIN_SCORE=%.3f, WINDOW_H=%s)",
         TOPK,
         BATCH_IDS,
-        BATCH_SIM,
         MIN_SCORE,
         WINDOW_HOURS,
     )
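The np.argpartition call in _topk_numpy does an O(n) partial selection, so only the K winners get fully sorted. A tiny self-contained check of the same pattern, with invented similarity values:

import numpy as np

sims = np.array([0.1, 0.9, 0.4, 0.7, 0.2], dtype=np.float32)
K = 2
part = np.argpartition(-sims, K)[:K]      # unordered indices of the 2 largest values
top_idx = part[np.argsort(-sims[part])]   # sort just those 2 by descending score
print(top_idx, sims[top_idx])             # [1 3] [0.9 0.7]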

View file

@@ -36,7 +36,7 @@ if __name__ == '__main__':
         # --- FIX 2: 'fetch_and_store' was changed to 'fetch_and_store_all' ---
         fetch_and_store_all,
         "interval",
-        minutes=3,
+        minutes=10,
         id="rss_job",
         next_run_time=datetime.utcnow() + timedelta(seconds=10)
     )

View file

@@ -14,6 +14,7 @@
 }
 
 * { box-sizing: border-box; }
+
 body {
     font-family: 'Poppins', 'Segoe UI', Tahoma, sans-serif;
     margin: 0;
@@ -24,6 +25,12 @@ body {
     font-weight: 400;
 }
 
+img {
+    max-width: 100%;
+    height: auto;
+    display: block;
+}
+
 .container {
     max-width: 900px;
     margin: 30px auto;
@@ -36,58 +43,324 @@ body {
     -webkit-backdrop-filter: blur(12px);
 }
 
-header { text-align: center; margin-bottom: 40px; border-bottom: 1px solid var(--border-color); padding-bottom: 30px; }
-h1 { font-size: 2.8rem; font-weight: 700; margin: 0 0 5px 0; background: var(--gradiente-principal); -webkit-background-clip: text; -webkit-text-fill-color: transparent; display: inline-block; }
-h2 { font-size: 1.8rem; font-weight: 600; color: var(--primary-color); margin-bottom: 20px; }
-.subtitle { color: var(--text-color-light); font-size: 1.1rem; margin-top: 5px; }
-
-.form-section, .card { margin-bottom: 30px; background: rgba(255, 255, 255, 0.6); padding: 25px; border-radius: var(--border-radius-md); border: 1px solid var(--border-color); }
-label { display: block; margin-bottom: 8px; font-weight: 600; color: var(--text-color); font-size: 0.9rem; }
-select, input[type="text"], input[type="url"], input[type="file"], textarea { width: 100%; padding: 12px 15px; border: 1px solid var(--border-color); background-color: #f8f9fa; border-radius: var(--border-radius-sm); font-size: 1rem; font-family: 'Poppins', sans-serif; transition: all var(--transition-speed) ease; }
-select:focus, input:focus, textarea:focus { outline: none; border-color: var(--primary-color); box-shadow: 0 0 0 3px var(--shadow-color); background-color: white; }
-
-.btn, button { padding: 12px 25px; background: var(--gradiente-principal); color: white !important; border: none; border-radius: var(--border-radius-sm); font-size: 1rem; font-weight: 600; cursor: pointer; transition: all var(--transition-speed) ease; box-shadow: 0 4px 15px rgba(0, 0, 0, 0.1); text-decoration: none; display: inline-block; text-align: center; }
-.btn:hover, button:hover { transform: translateY(-3px); box-shadow: 0 6px 20px rgba(0, 0, 0, 0.2); text-decoration: none; }
-.btn-secondary { background: #34495e; } .btn-secondary:hover { background: #2c3e50; }
-.btn-info { background: #17a2b8; } .btn-info:hover { background: #138496; }
-.btn-danger { background: #dc3545; } .btn-danger:hover { background: #c82333; }
-.btn-small { padding: 6px 14px; font-size: 0.9rem; }
-a { color: var(--secondary-color); text-decoration: none; font-weight: 500; } a:hover { text-decoration: underline; }
-.top-link { display: inline-block; margin-bottom: 25px; font-weight: 500; color: var(--primary-color); }
-.top-link:hover { text-decoration: underline; }
-
-.noticias-list { list-style: none; padding: 0; margin: 0; }
-.noticia-item { display: flex; gap: 20px; padding: 20px 10px; border-bottom: 1px solid var(--border-color); transition: background-color 0.2s ease; }
-.noticia-item:last-child { border-bottom: none; }
-.noticia-item:hover { background-color: rgba(255,255,255,0.4); }
-.noticia-imagen img { width: 150px; height: 100px; border-radius: var(--border-radius-sm); object-fit: cover; }
-.noticia-texto h3 { margin: 0 0 5px 0; }
-.noticia-texto h3 a { color: var(--text-color); font-weight: 600; }
-.noticia-texto h3 a:hover { color: var(--primary-color); }
-.noticia-meta { font-size: 0.8rem; color: var(--text-color-light); margin-bottom: 8px; }
-
-.flash-messages { list-style: none; padding: 0; margin-bottom: 20px; }
-.flash-messages li { padding: 15px 20px; border-radius: var(--border-radius-sm); border-left: 5px solid; }
-.flash-messages .error { background-color: #fff0f3; color: #d90429; border-color: var(--error-color); }
-.flash-messages .success { background-color: #e6fcf5; color: #00b894; border-color: #00b894; }
-.flash-messages .warning { background-color: #fffbeb; color: #f39c12; border-color: #f39c12; }
-
-.dashboard-grid { display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 20px; margin-bottom: 40px; }
-.stat-card { background: rgba(255, 255, 255, 0.8); padding: 20px; border-radius: var(--border-radius-md); text-align: center; border: 1px solid var(--border-color); transition: all 0.3s ease; }
-.stat-card:hover { transform: translateY(-5px); box-shadow: 0 4px 15px rgba(0,0,0,0.08); }
-.stat-card .stat-number { font-size: 2.5rem; font-weight: 600; background: var(--gradiente-principal); -webkit-background-clip: text; -webkit-text-fill-color: transparent; line-height: 1.2; }
-.stat-card .stat-label { font-size: 0.9rem; color: var(--text-color-light); font-weight: 500; margin-top: 5px; }
-.pagination { display: flex; justify-content: center; align-items: center; gap: 5px; margin: 30px 0; flex-wrap: wrap; }
-.page-link { display: inline-block; padding: 8px 14px; background: rgba(255, 255, 255, 0.6); border: 1px solid var(--border-color); border-radius: var(--border-radius-sm); color: var(--primary-color); text-decoration: none; transition: all 0.2s ease; }
-.page-link:hover { background: white; box-shadow: 0 2px 5px rgba(0,0,0,0.1); }
-.page-link.active { background: var(--gradiente-principal); color: white; border-color: transparent; cursor: default; }
+header {
+    text-align: center;
+    margin-bottom: 40px;
+    border-bottom: 1px solid var(--border-color);
+    padding-bottom: 30px;
+}
+
+h1 {
+    font-size: 2.8rem;
+    font-weight: 700;
+    margin: 0 0 5px 0;
+    background: var(--gradiente-principal);
+    -webkit-background-clip: text;
+    -webkit-text-fill-color: transparent;
+    display: inline-block;
+}
+
+h2 {
+    font-size: 1.8rem;
+    font-weight: 600;
+    color: var(--primary-color);
+    margin-bottom: 20px;
+}
+
+.subtitle {
+    color: var(--text-color-light);
+    font-size: 1.1rem;
+    margin-top: 5px;
+}
+
+.form-section,
+.card {
+    margin-bottom: 30px;
+    background: rgba(255, 255, 255, 0.6);
+    padding: 25px;
+    border-radius: var(--border-radius-md);
+    border: 1px solid var(--border-color);
+}
+
+label {
+    display: block;
+    margin-bottom: 8px;
+    font-weight: 600;
+    color: var(--text-color);
+    font-size: 0.9rem;
+}
+
+select,
+input[type="text"],
+input[type="url"],
+input[type="file"],
+textarea {
+    width: 100%;
+    padding: 12px 15px;
+    border: 1px solid var(--border-color);
+    background-color: #f8f9fa;
+    border-radius: var(--border-radius-sm);
+    font-size: 1rem;
+    font-family: 'Poppins', sans-serif;
+    transition: all var(--transition-speed) ease;
+}
+
+select:focus,
+input:focus,
+textarea:focus {
+    outline: none;
+    border-color: var(--primary-color);
+    box-shadow: 0 0 0 3px var(--shadow-color);
+    background-color: white;
+}
+
+.btn,
+button {
+    padding: 12px 25px;
+    background: var(--gradiente-principal);
+    color: white !important;
+    border: none;
+    border-radius: var(--border-radius-sm);
+    font-size: 1rem;
+    font-weight: 600;
+    cursor: pointer;
+    transition: all var(--transition-speed) ease;
+    box-shadow: 0 4px 15px rgba(0, 0, 0, 0.1);
+    text-decoration: none;
+    display: inline-block;
+    text-align: center;
+}
+
+.btn:hover,
+button:hover {
+    transform: translateY(-3px);
+    box-shadow: 0 6px 20px rgba(0, 0, 0, 0.2);
+    text-decoration: none;
+}
+
+.btn-secondary { background: #34495e; }
+.btn-secondary:hover { background: #2c3e50; }
+.btn-info { background: #17a2b8; }
+.btn-info:hover { background: #138496; }
+.btn-danger { background: #dc3545; }
+.btn-danger:hover { background: #c82333; }
+
+.btn-small {
+    padding: 6px 14px;
+    font-size: 0.9rem;
+}
+
+a {
+    color: var(--secondary-color);
+    text-decoration: none;
+    font-weight: 500;
+}
+
+a:hover {
+    text-decoration: underline;
+}
+
+.top-link {
+    display: inline-block;
+    margin-bottom: 25px;
+    font-weight: 500;
+    color: var(--primary-color);
+}
+
+.top-link:hover {
+    text-decoration: underline;
+}
+
+.noticias-list {
+    list-style: none;
+    padding: 0;
+    margin: 0;
+}
+
+.noticia-item {
+    display: flex;
+    gap: 20px;
+    padding: 20px 10px;
+    border-bottom: 1px solid var(--border-color);
+    transition: background-color 0.2s ease;
+    align-items: flex-start;
+}
+
+.noticia-item:last-child {
+    border-bottom: none;
+}
+
+.noticia-item:hover {
+    background-color: rgba(255, 255, 255, 0.4);
+}
+
+.noticia-imagen {
+    flex: 0 0 180px;
+    max-width: 180px;
+}
+
+.noticia-imagen img {
+    width: 100%;
+    height: 120px;
+    border-radius: var(--border-radius-sm);
+    object-fit: cover;
+}
+
+.noticia-texto h3 {
+    margin: 0 0 5px 0;
+}
+
+.noticia-texto h3 a {
+    color: var(--text-color);
+    font-weight: 600;
+}
+
+.noticia-texto h3 a:hover {
+    color: var(--primary-color);
+}
+
+.noticia-meta {
+    font-size: 0.8rem;
+    color: var(--text-color-light);
+    margin-bottom: 8px;
+}
+
+.flash-messages {
+    list-style: none;
+    padding: 0;
+    margin-bottom: 20px;
+}
+
+.flash-messages li {
+    padding: 15px 20px;
+    border-radius: var(--border-radius-sm);
+    border-left: 5px solid;
+}
+
+.flash-messages .error {
+    background-color: #fff0f3;
+    color: #d90429;
+    border-color: var(--error-color);
+}
+
+.flash-messages .success {
+    background-color: #e6fcf5;
+    color: #00b894;
+    border-color: #00b894;
+}
+
+.flash-messages .warning {
+    background-color: #fffbeb;
+    color: #f39c12;
+    border-color: #f39c12;
+}
+
+.dashboard-grid {
+    display: grid;
+    grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
+    gap: 20px;
+    margin-bottom: 40px;
+}
+
+.stat-card {
+    background: rgba(255, 255, 255, 0.8);
+    padding: 20px;
+    border-radius: var(--border-radius-md);
+    text-align: center;
+    border: 1px solid var(--border-color);
+    transition: all 0.3s ease;
+}
+
+.stat-card:hover {
+    transform: translateY(-5px);
+    box-shadow: 0 4px 15px rgba(0,0,0,0.08);
+}
+
+.stat-card .stat-number {
+    font-size: 2.5rem;
+    font-weight: 600;
+    background: var(--gradiente-principal);
+    -webkit-background-clip: text;
+    -webkit-text-fill-color: transparent;
+    line-height: 1.2;
+}
+
+.stat-card .stat-label {
+    font-size: 0.9rem;
+    color: var(--text-color-light);
+    font-weight: 500;
+    margin-top: 5px;
+}
+
+.pagination {
+    display: flex;
+    justify-content: center;
+    align-items: center;
+    gap: 5px;
+    margin: 30px 0;
+    flex-wrap: wrap;
+}
+
+.page-link {
+    display: inline-block;
+    padding: 8px 14px;
+    background: rgba(255, 255, 255, 0.6);
+    border: 1px solid var(--border-color);
+    border-radius: var(--border-radius-sm);
+    color: var(--primary-color);
+    text-decoration: none;
+    transition: all 0.2s ease;
+}
+
+.page-link:hover {
+    background: white;
+    box-shadow: 0 2px 5px rgba(0,0,0,0.1);
+}
+
+.page-link.active {
+    background: var(--gradiente-principal);
+    color: white;
+    border-color: transparent;
+    cursor: default;
+}
 
 .feed-detail-card { padding: 0; }
-.feed-header { display: flex; justify-content: space-between; align-items: center; flex-wrap: wrap; gap: 10px; background: rgba(233, 236, 239, 0.5); padding: 15px 25px; border-bottom: 1px solid var(--border-color); }
-.feed-header h2 { margin: 0; font-size: 1.4rem; }
+
+.feed-header {
+    display: flex;
+    justify-content: space-between;
+    align-items: center;
+    flex-wrap: wrap;
+    gap: 10px;
+    background: rgba(233, 236, 239, 0.5);
+    padding: 15px 25px;
+    border-bottom: 1px solid var(--border-color);
+}
+
+.feed-header h2 {
+    margin: 0;
+    font-size: 1.4rem;
+}
+
 .feed-body { padding: 25px; }
-.feed-body dl { display: grid; grid-template-columns: 120px 1fr; gap: 10px 20px; }
-.feed-body dt { font-weight: 600; color: var(--text-color-light); }
-.feed-body dd { margin: 0; word-break: break-all; }
+
+.feed-body dl {
+    display: grid;
+    grid-template-columns: 120px 1fr;
+    gap: 10px 20px;
+}
+
+.feed-body dt {
+    font-weight: 600;
+    color: var(--text-color-light);
+}
+
+.feed-body dd {
+    margin: 0;
+    word-break: break-all;
+}
 
 .main-nav {
     display: flex;
@@ -98,6 +371,7 @@ a { color: var(--secondary-color); text-decoration: none; font-weight: 500; } a:
     flex-wrap: wrap;
     padding-top: 15px;
 }
+
 .nav-link {
     font-weight: 500;
     color: var(--text-color);
@@ -106,11 +380,13 @@ a { color: var(--secondary-color); text-decoration: none; font-weight: 500; } a:
     border-radius: var(--border-radius-sm);
     transition: all var(--transition-speed);
 }
+
 .nav-link:hover {
     background-color: rgba(255,255,255,0.6);
     text-decoration: none;
     color: var(--primary-color);
 }
+
 .nav-actions {
     display: flex;
     gap: 10px;
@@ -118,15 +394,78 @@ a { color: var(--secondary-color); text-decoration: none; font-weight: 500; } a:
 }
 
 @media (max-width: 768px) {
-    .container { padding: 20px; margin: 15px; }
+    .container {
+        padding: 20px;
+        margin: 15px;
+    }
+
     h1 { font-size: 2rem; }
-    .noticia-item { flex-direction: column; }
-    .feed-body dl { grid-template-columns: 100px 1fr; }
-    .main-nav { flex-direction: column; gap: 10px; }
-    .nav-actions { margin-left: 0; margin-top: 10px; }
+
+    .noticia-item {
+        flex-direction: column;
+    }
+
+    .noticia-imagen {
+        flex: 0 0 auto;
+        max-width: 100%;
+    }
+
+    .noticia-imagen img {
+        width: 100%;
+        height: auto;
+    }
+
+    .feed-body dl {
+        grid-template-columns: 100px 1fr;
+    }
+
+    .main-nav {
+        flex-direction: column;
+        gap: 10px;
+    }
+
+    .nav-actions {
+        margin-left: 0;
+        margin-top: 10px;
+    }
 }
 
-.resumen-container { position: relative; }
+.resumen-container {
+    position: relative;
+}
+
+/* Neutralize odd styles that arrive inside the summaries' HTML */
+.resumen-container .btn,
+.resumen-container button,
+.resumen-container input[type="button"],
+.resumen-container input[type="submit"],
+.resumen-container .wp-block-button__link {
+    padding: 0;
+    margin: 0;
+    background: none;
+    border: none;
+    border-radius: 0;
+    box-shadow: none;
+    font: inherit;
+    display: inline;
+    color: var(--secondary-color);
+    text-decoration: underline;
+    cursor: pointer;
+}
+
+.resumen-container .btn:hover,
+.resumen-container button:hover,
+.resumen-container .wp-block-button__link:hover {
+    transform: none;
+    box-shadow: none;
+    text-decoration: underline;
+}
+
+.resumen-container a {
+    color: var(--secondary-color);
+    text-decoration: underline;
+}
 
 .ver-mas-btn {
     background: none;
     border: none;
@@ -144,8 +483,14 @@ a { color: var(--secondary-color); text-decoration: none; font-weight: 500; } a:
     gap: 15px;
     margin-bottom: 20px;
 }
+
 .filter-search-box { flex-grow: 1; }
-.filter-actions { display: flex; gap: 10px; white-space: nowrap; }
+
+.filter-actions {
+    display: flex;
+    gap: 10px;
+    white-space: nowrap;
+}
 
 .clamp {
     display: -webkit-box;
@@ -154,14 +499,21 @@ a { color: var(--secondary-color); text-decoration: none; font-weight: 500; } a:
     overflow: hidden;
     word-break: break-word;
 }
+
 .clamp.expanded {
     -webkit-line-clamp: unset;
     max-height: none;
     overflow: visible;
 }
 
+/* Per-article tabs */
 .tabs { width: 100%; }
-.tabs-header { display: flex; gap: 8px; margin-bottom: 8px; }
+
+.tabs-header {
+    display: flex;
+    gap: 8px;
+    margin-bottom: 8px;
+}
+
 .tab-btn {
     background: rgba(255,255,255,0.7);
     border: 1px solid var(--border-color);
@@ -170,16 +522,20 @@ a { color: var(--secondary-color); text-decoration: none; font-weight: 500; } a:
     font-weight: 600;
     cursor: pointer;
 }
+
 .tab-btn.active {
     background: var(--gradiente-principal);
     color: #fff !important;
     border-color: transparent;
 }
+
 .tab-btn[disabled] {
     opacity: .45;
     cursor: not-allowed;
 }
+
 .tab-panel { display: none; }
 .tab-panel.active { display: block; }
 
 .badge {
@@ -192,6 +548,13 @@ a { color: var(--secondary-color); text-decoration: none; font-weight: 500; } a:
     color: var(--secondary-color);
     vertical-align: middle;
 }
-.badge-secondary { background: #f1f3f5; color: #555; }
+
+.badge-secondary {
+    background: #f1f3f5;
+    color: #555;
+}
+
 .mini-link { margin-left: 8px; font-size: .8rem; }
 .m0 { margin: 0 0 6px 0; }

View file

@@ -9,7 +9,12 @@
 {% if n.imagen_url %}
 <div class="noticia-imagen">
     <a href="{{ detalle_url }}">
-        <img src="{{ n.imagen_url }}" alt="Imagen para {{ n.titulo }}" loading="lazy">
+        <img
+            src="{{ n.imagen_url }}"
+            alt="Imagen para {{ n.titulo }}"
+            loading="lazy"
+            onerror="this.closest('.noticia-imagen').style.display='none';"
+        >
     </a>
 </div>
 {% endif %}

View file

@@ -51,8 +51,17 @@
         <h3>Operaciones del Sistema</h3>
     </div>
     <div class="card-body">
-        <p>Genera una copia de seguridad completa de todas tus fuentes y noticias en un archivo .zip.</p>
-        <a href="{{ url_for('backup_completo') }}" class="btn btn-secondary"><i class="fas fa-archive"></i> Backup Completo (.zip)</a>
+        <p>Genera o restaura una copia de seguridad completa de todas tus fuentes y noticias.</p>
+        <div style="display:flex; gap:10px; flex-wrap:wrap;">
+            <a href="{{ url_for('backup_completo') }}" class="btn btn-secondary">
+                <i class="fas fa-archive"></i> Backup Completo (.zip)
+            </a>
+            <a href="{{ url_for('restore_completo') }}" class="btn btn-info">
+                <i class="fas fa-upload"></i> Restaurar Backup (.zip)
+            </a>
+        </div>
     </div>
 </div>

View file

@@ -50,19 +50,28 @@
 {% if d.imagen_url %}
 <div style="margin-bottom:16px; text-align:center;">
-    <img src="{{ d.imagen_url }}" alt="Imagen de la noticia" style="max-width:100%; height:auto;" loading="lazy">
+    <img
+        src="{{ d.imagen_url }}"
+        alt="Imagen de la noticia"
+        loading="lazy"
+        onerror="this.style.display='none';"
+    >
 </div>
 {% endif %}
 
 {% if d.resumen_trad %}
 <h3>Resumen (traducido)</h3>
-<div>{{ d.resumen_trad|safe_html }}</div>
+<div class="resumen-container">
+    <div class="resumen-completo" style="display:block;">{{ d.resumen_trad|safe_html }}</div>
+</div>
 <hr>
 {% endif %}
 
 {% if d.resumen_orig %}
 <h3>Resumen (original)</h3>
-<div>{{ d.resumen_orig|safe_html }}</div>
+<div class="resumen-container">
+    <div class="resumen-completo" style="display:block;">{{ d.resumen_orig|safe_html }}</div>
+</div>
 {% endif %}
 
 {% if tags and tags|length %}
@@ -87,7 +96,12 @@
 {% if r.imagen_url %}
 <div class="noticia-imagen">
     <a href="{{ r.url }}" target="_blank" rel="noopener">
-        <img src="{{ r.imagen_url }}" alt="Imagen relacionada" loading="lazy">
+        <img
+            src="{{ r.imagen_url }}"
+            alt="Imagen relacionada"
+            loading="lazy"
+            onerror="this.closest('.noticia-imagen').style.display='none';"
+        >
     </a>
 </div>
 {% endif %}

View file

@@ -0,0 +1,49 @@
+{% extends "base.html" %}
+
+{% block title %}Restaurar Backup Completo{% endblock %}
+
+{% block content %}
+<div class="card">
+    <div class="card-header">
+        <h3>Restaurar Backup Completo</h3>
+    </div>
+    <div class="card-body">
+        <p>
+            Sube un archivo <strong>.zip</strong> generado desde
+            <em>"Backup Completo (.zip)"</em> en el dashboard.
+        </p>
+
+        <div style="background: #fff3cd; border: 1px solid #ffeeba; padding: 10px 12px; border-radius: 8px; margin: 15px 0;">
+            <strong>⚠ Atención:</strong>
+            <ul style="margin: 8px 0 0 18px; padding: 0;">
+                <li>Se <strong>vaciarán</strong> las tablas <code>feeds</code> y <code>fuentes_url</code>.</li>
+                <li>Los datos de esos CSV se volverán a cargar desde el backup.</li>
+                <li>No se tocan noticias, traducciones ni tags.</li>
+            </ul>
+        </div>
+
+        <form action="{{ url_for('restore_completo') }}" method="post" enctype="multipart/form-data">
+            <div class="form-group" style="margin-bottom: 15px;">
+                <label for="backup_file"><strong>Archivo .zip de backup completo</strong></label>
+                <input
+                    type="file"
+                    id="backup_file"
+                    name="backup_file"
+                    accept=".zip"
+                    required
+                    style="display:block; margin-top:8px;"
+                >
+            </div>
+
+            <div style="margin-top: 20px; display:flex; gap:10px;">
+                <a href="{{ url_for('dashboard') }}" class="btn btn-secondary">Cancelar</a>
+                <button type="submit" class="btn btn-danger">
+                    <i class="fas fa-exclamation-triangle"></i>
+                    Restaurar desde backup
+                </button>
+            </div>
+        </form>
+    </div>
+</div>
+{% endblock %}

translation_worker.py
View file

@@ -1,4 +1,3 @@
-# translation_worker.py
 import os
 import time
 import logging
@@ -8,17 +7,17 @@ from typing import List, Optional
 import psycopg2
 import psycopg2.extras
+from psycopg2.extras import execute_values
 import torch
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 from langdetect import detect, DetectorFactory
 
-DetectorFactory.seed = 0  # reproducible results
+DetectorFactory.seed = 0
 
 logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s: %(message)s")
 LOG = logging.getLogger(__name__)
 
-# ---------- DB config ----------
 DB_CONFIG = {
     "host": os.environ.get("DB_HOST", "localhost"),
     "port": int(os.environ.get("DB_PORT", 5432)),
@@ -27,7 +26,7 @@ DB_CONFIG = {
     "password": os.environ.get("DB_PASS", "x"),
 }
 
-# ---------- ENV helpers (with backward compatibility) ----------
+
 def _env_list(name: str, *fallbacks: str, default: str = "es") -> List[str]:
     raw = None
     for key in (name, *fallbacks):
@@ -37,6 +36,7 @@ def _env_list(name: str, *fallbacks: str, default: str = "es") -> List[str]:
     raw = raw if raw is not None else default
     return [s.strip() for s in raw.split(",") if s and s.strip()]
 
+
 def _env_int(name: str, *fallbacks: str, default: int = 8) -> int:
     for key in (name, *fallbacks):
         val = os.environ.get(key)
@@ -47,6 +47,7 @@ def _env_int(name: str, *fallbacks: str, default: int = 8) -> int:
             pass
     return default
 
+
 def _env_float(name: str, *fallbacks: str, default: float = 5.0) -> float:
     for key in (name, *fallbacks):
         val = os.environ.get(key)
@@ -57,6 +58,7 @@ def _env_float(name: str, *fallbacks: str, default: float = 5.0) -> float:
             pass
     return default
 
+
 def _env_str(name: str, *fallbacks: str, default: Optional[str] = None) -> Optional[str]:
     for key in (name, *fallbacks):
         val = os.environ.get(key)
@@ -64,23 +66,24 @@ def _env_str(name: str, *fallbacks: str, default: Optional[str] = None) -> Optio
             return val
     return default
 
+
 def _env_bool(name: str, default: bool = False) -> bool:
     val = os.environ.get(name)
     if val is None:
         return default
     return str(val).strip().lower() in ("1", "true", "yes", "y", "on")
 
+
 TARGET_LANGS = _env_list("TARGET_LANGS", "TRANSLATE_TO", default="es")
 BATCH_SIZE = _env_int("BATCH", "TRANSLATOR_BATCH", "TRANSLATE_BATCH", default=8)
 ENQUEUE_MAX = _env_int("ENQUEUE", "TRANSLATOR_ENQUEUE", "TRANSLATE_ENQUEUE", default=200)
 SLEEP_IDLE = _env_float("SLEEP_IDLE", "TRANSLATOR_SLEEP_IDLE", "TRANSLATE_SLEEP_IDLE", default=5.0)
-DEVICE_CFG = (_env_str("DEVICE", default="auto") or "auto").lower()  # 'cpu' | 'cuda' | 'auto'
+DEVICE_CFG = (_env_str("DEVICE", default="auto") or "auto").lower()
 
-# Token limits (tune if you hit OOM)
 MAX_SRC_TOKENS = _env_int("MAX_SRC_TOKENS", default=512)
 MAX_NEW_TOKENS = _env_int("MAX_NEW_TOKENS", default=256)
 
-# ---- Beams: default 2 for titles and 1 for bodies; honors NUM_BEAMS if only that one is set ----
 def _beams_from_env():
     nb_global = os.environ.get("NUM_BEAMS")
     has_title = os.environ.get("NUM_BEAMS_TITLE") is not None
@ -91,41 +94,77 @@ def _beams_from_env():
return v, v return v, v
except ValueError: except ValueError:
pass pass
# por defecto: 2 (título), 1 (cuerpo)
return _env_int("NUM_BEAMS_TITLE", default=2), _env_int("NUM_BEAMS_BODY", default=1) return _env_int("NUM_BEAMS_TITLE", default=2), _env_int("NUM_BEAMS_BODY", default=1)
NUM_BEAMS_TITLE, NUM_BEAMS_BODY = _beams_from_env() NUM_BEAMS_TITLE, NUM_BEAMS_BODY = _beams_from_env()
# Modelo por defecto: NLLB 600M (cámbialo por facebook/nllb-200-1.3B si quieres el 1.3B)
UNIVERSAL_MODEL = _env_str("UNIVERSAL_MODEL", default="facebook/nllb-200-distilled-600M") UNIVERSAL_MODEL = _env_str("UNIVERSAL_MODEL", default="facebook/nllb-200-distilled-600M")
# ---------- Chunking por frases (para artículos largos) ----------
# Activo por defecto para evitar secuencias > límite del modelo
CHUNK_BY_SENTENCES = _env_bool("CHUNK_BY_SENTENCES", default=True) CHUNK_BY_SENTENCES = _env_bool("CHUNK_BY_SENTENCES", default=True)
CHUNK_MAX_TOKENS = _env_int("CHUNK_MAX_TOKENS", default=900) # <= modelo - margen CHUNK_MAX_TOKENS = _env_int("CHUNK_MAX_TOKENS", default=900)
CHUNK_OVERLAP_SENTS = _env_int("CHUNK_OVERLAP_SENTS", default=0) # 0 o 1 CHUNK_OVERLAP_SENTS = _env_int("CHUNK_OVERLAP_SENTS", default=0)
# Abreviaturas comunes y marcador temporal
_ABBR = ("Sr", "Sra", "Dr", "Dra", "Ing", "Lic", "pág", "etc") _ABBR = ("Sr", "Sra", "Dr", "Dra", "Ing", "Lic", "pág", "etc")
_ABBR_MARK = "§" # no debería aparecer en texto normal _ABBR_MARK = "§"
_SENT_SPLIT_RE = re.compile(
r'(?<=[\.!\?…])\s+(?=["\(\[A-ZÁÉÍÓÚÑÄÖÜ0-9])|(?:\n{2,})'
)
NLLB_LANG = {
"es": "spa_Latn",
"en": "eng_Latn",
"fr": "fra_Latn",
"de": "deu_Latn",
"it": "ita_Latn",
"pt": "por_Latn",
"nl": "nld_Latn",
"sv": "swe_Latn",
"da": "dan_Latn",
"fi": "fin_Latn",
"no": "nob_Latn",
"nb": "nob_Latn",
"nn": "nno_Latn",
"pl": "pol_Latn",
"cs": "ces_Latn",
"sk": "slk_Latn",
"sl": "slv_Latn",
"hu": "hun_Latn",
"ro": "ron_Latn",
"bg": "bul_Cyrl",
"el": "ell_Grek",
"ru": "rus_Cyrl",
"uk": "ukr_Cyrl",
"hr": "hrv_Latn",
"sr": "srp_Cyrl",
"bs": "bos_Latn",
"tr": "tur_Latn",
"ar": "arb_Arab",
"fa": "pes_Arab",
"he": "heb_Hebr",
"zh": "zho_Hans",
"ja": "jpn_Jpan",
"ko": "kor_Hang",
"vi": "vie_Latn",
"th": "tha_Thai",
"id": "ind_Latn",
"ms": "zsm_Latn",
"pt-br": "por_Latn",
"pt-pt": "por_Latn",
}
def _protect_abbrev(text: str) -> str: def _protect_abbrev(text: str) -> str:
# Iniciales de una letra: "E.", "A."
t = re.sub(r"\b([A-ZÁÉÍÓÚÑÄÖÜ])\.", r"\1" + _ABBR_MARK, text) t = re.sub(r"\b([A-ZÁÉÍÓÚÑÄÖÜ])\.", r"\1" + _ABBR_MARK, text)
# Abreviaturas de la lista (case-insensitive)
pat = r"\b(?:" + "|".join(map(re.escape, _ABBR)) + r")\." pat = r"\b(?:" + "|".join(map(re.escape, _ABBR)) + r")\."
t = re.sub(pat, lambda m: m.group(0)[:-1] + _ABBR_MARK, t, flags=re.IGNORECASE) t = re.sub(pat, lambda m: m.group(0)[:-1] + _ABBR_MARK, t, flags=re.IGNORECASE)
return t return t
def _restore_abbrev(text: str) -> str: def _restore_abbrev(text: str) -> str:
return text.replace(_ABBR_MARK, ".") return text.replace(_ABBR_MARK, ".")

def split_into_sentences(text: str) -> List[str]:
    text = (text or "").strip()
@@ -134,7 +173,6 @@ def split_into_sentences(text: str) -> List[str]:
    protected = _protect_abbrev(text)
    parts = [p.strip() for p in _SENT_SPLIT_RE.split(protected) if p and p.strip()]
    parts = [_restore_abbrev(p) for p in parts]
    # Merge very short pieces into the previous one for more coherence
    merged: List[str] = []
    for p in parts:
        if merged and len(p) < 40:
@@ -143,26 +181,6 @@ def split_into_sentences(text: str) -> List[str]:
        merged.append(p)
    return merged

def map_to_nllb(code: Optional[str]) -> Optional[str]:
    if not code:
@@ -172,29 +190,35 @@ def map_to_nllb(code: Optional[str]) -> Optional[str]:
        return NLLB_LANG[code]
    return f"{code}_Latn"

def normalize_lang(code: Optional[str], default: Optional[str] = None) -> Optional[str]:
    if not code:
        return default
    code = code.strip().lower()
    return code if code else default

# ---------- DB ----------
def get_conn():
    return psycopg2.connect(**DB_CONFIG)

def ensure_indexes(conn):
    with conn.cursor() as cur:
        cur.execute(
            """
            CREATE INDEX IF NOT EXISTS traducciones_lang_to_status_idx
                ON traducciones (lang_to, status);
            CREATE INDEX IF NOT EXISTS traducciones_status_idx
                ON traducciones (status);
            """
        )
    conn.commit()

def ensure_pending(conn, lang_to: str, enqueue_limit: int):
    with conn.cursor() as cur:
        cur.execute(
            """
            INSERT INTO traducciones (noticia_id, lang_from, lang_to, status)
            SELECT sub.id, NULL, %s, 'pending'
            FROM (
@@ -206,12 +230,16 @@ def ensure_pending(conn, lang_to: str, enqueue_limit: int):
                ORDER BY n.fecha DESC NULLS LAST, n.id
                LIMIT %s
            ) AS sub;
            """,
            (lang_to, lang_to, enqueue_limit),
        )
    conn.commit()

def fetch_pending_batch(conn, lang_to: str, batch_size: int):
    with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
        cur.execute(
            """
            SELECT t.id AS tr_id, t.noticia_id, t.lang_from, t.lang_to,
                   n.titulo, n.resumen
            FROM traducciones t
@@ -219,7 +247,9 @@ def fetch_pending_batch(conn, lang_to: str, batch_size: int):
            WHERE t.lang_to = %s AND t.status = 'pending'
            ORDER BY t.id
            LIMIT %s;
            """,
            (lang_to, batch_size),
        )
        rows = cur.fetchall()
        if rows:
            ids = [r["tr_id"] for r in rows]
@@ -228,21 +258,6 @@ def fetch_pending_batch(conn, lang_to: str, batch_size: int):
        conn.commit()
    return rows
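# Row-by-row update helpers; process_batch below batches the same updates
# with execute_values instead of committing one row at a time.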

def mark_done(conn, tr_id: int, title_tr: str, body_tr: str, lang_from: Optional[str]):
    with conn.cursor() as cur:
        cur.execute("""
            UPDATE traducciones
            SET titulo_trad=%s, resumen_trad=%s,
                lang_from = COALESCE(lang_from, %s),
                status='done', error=NULL
            WHERE id=%s;
        """, (title_tr, body_tr, lang_from, tr_id))
    conn.commit()

def mark_error(conn, tr_id: int, msg: str):
    with conn.cursor() as cur:
        cur.execute("UPDATE traducciones SET status='error', error=%s WHERE id=%s;", (msg[:1500], tr_id))
    conn.commit()

def detect_lang(text1: str, text2: str) -> Optional[str]:
    txt = (text1 or "").strip() or (text2 or "").strip()
@@ -253,13 +268,14 @@ def detect_lang(text1: str, text2: str) -> Optional[str]:
    except Exception:
        return None

# ---------- Single shared model and CUDA handling (NLLB) ----------
_TOKENIZER: Optional[AutoTokenizer] = None
_MODEL: Optional[AutoModelForSeq2SeqLM] = None
_DEVICE: Optional[torch.device] = None
_CUDA_FAILS: int = 0
_CUDA_DISABLED: bool = False

def _resolve_device() -> torch.device:
    global _CUDA_DISABLED
    if _CUDA_DISABLED:
@@ -268,13 +284,14 @@ def _resolve_device() -> torch.device:
        return torch.device("cpu")
    if DEVICE_CFG == "cuda":
        return torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # auto
    return torch.device("cuda" if torch.cuda.is_available() else "cpu")

def _is_cuda_mem_error(exc: Exception) -> bool:
    s = str(exc)
    return ("CUDA out of memory" in s) or ("CUDACachingAllocator" in s) or ("expandable_segment" in s)

def _free_cuda():
    if torch.cuda.is_available():
        try:
@@ -283,8 +300,8 @@ def _free_cuda():
        except Exception:
            pass

def _load_model_on(device: torch.device):
    """Load (or reload) the model/tokenizer on the given device."""
    global _TOKENIZER, _MODEL, _DEVICE
    dtype = torch.float16 if device.type == "cuda" else torch.float32
@@ -293,9 +310,9 @@ def _load_model_on(device: torch.device):
    mdl = AutoModelForSeq2SeqLM.from_pretrained(
        UNIVERSAL_MODEL,
        torch_dtype=dtype,
        low_cpu_mem_usage=True,
    )
    # use_cache=False lowers VRAM peaks during generation
    try:
        mdl.config.use_cache = False
    except Exception:
@@ -306,8 +323,8 @@ def _load_model_on(device: torch.device):
    _TOKENIZER, _MODEL, _DEVICE = tok, mdl, device

def get_universal_components():
    """Return (tokenizer, model, device). Loads on GPU when it is available and stable."""
    global _TOKENIZER, _MODEL, _DEVICE, _CUDA_FAILS, _CUDA_DISABLED
    if _MODEL is not None and _DEVICE is not None:
@@ -329,14 +346,13 @@ def get_universal_components():
        _load_model_on(torch.device("cpu"))
    return _TOKENIZER, _MODEL, _DEVICE
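# Lazy singleton: the first call loads NLLB once; after repeated CUDA memory
# failures the worker pins itself to CPU (_CUDA_DISABLED) instead of crash-looping.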

# ---------- Tokenization / chunking utilities ----------
def _safe_src_len(tokenizer) -> int:
    model_max = getattr(tokenizer, "model_max_length", 1024) or 1024
    # headroom for special tokens/noise
    return min(MAX_SRC_TOKENS, int(model_max) - 16)

def _token_chunks(tokenizer, text: str, max_tokens: int) -> List[str]:
    """Simple token-based slicing (fallback)."""
    if not text:
        return []
    ids = tokenizer.encode(text, add_special_tokens=False)
@@ -344,22 +360,20 @@ def _token_chunks(tokenizer, text: str, max_tokens: int) -> List[str]:
        return [text]
    chunks = []
    for i in range(0, len(ids), max_tokens):
        sub = ids[i : i + max_tokens]
        piece = tokenizer.decode(sub, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        if piece.strip():
            chunks.append(piece.strip())
    return chunks

def _norm(s: str) -> str:
    import re as _re
    return _re.sub(r"\W+", "", (s or "").lower()).strip()

def _forced_bos_id(tokenizer: AutoTokenizer, model: AutoModelForSeq2SeqLM, tgt_code: str) -> int:
    """
    Robustly resolve the target-language token id for NLLB,
    even when the tokenizer lacks `lang_code_to_id`.
    """
    # 1) tokenizer.lang_code_to_id (if present)
    try:
        mapping = getattr(tokenizer, "lang_code_to_id", None)
        if isinstance(mapping, dict):
@@ -369,7 +383,6 @@ def _forced_bos_id(tokenizer: AutoTokenizer, model: AutoModelForSeq2SeqLM, tgt_c
    except Exception:
        pass

    # 2) model.config.lang_code_to_id (if present)
    try:
        mapping = getattr(getattr(model, "config", None), "lang_code_to_id", None)
        if isinstance(mapping, dict):
@@ -379,7 +392,6 @@ def _forced_bos_id(tokenizer: AutoTokenizer, model: AutoModelForSeq2SeqLM, tgt_c
    except Exception:
        pass

    # 3) convert_tokens_to_ids (some builds register the code as a special token)
    try:
        tid = tokenizer.convert_tokens_to_ids(tgt_code)
        if isinstance(tid, int) and tid not in (-1, getattr(tokenizer, "unk_token_id", -1)):
@@ -387,7 +399,6 @@ def _forced_bos_id(tokenizer: AutoTokenizer, model: AutoModelForSeq2SeqLM, tgt_c
    except Exception:
        pass

    # 4) additional_special_tokens/_ids (look the code up verbatim)
    try:
        ats = getattr(tokenizer, "additional_special_tokens", None)
        ats_ids = getattr(tokenizer, "additional_special_tokens_ids", None)
@@ -398,17 +409,12 @@ def _forced_bos_id(tokenizer: AutoTokenizer, model: AutoModelForSeq2SeqLM, tgt_c
    except Exception:
        pass

    # 5) last resort: use eos/bos so generate() doesn't break
    LOG.warning("No pude resolver lang code id para '%s'. Uso fallback (eos/bos).", tgt_code)
    return getattr(tokenizer, "eos_token_id", None) or getattr(tokenizer, "bos_token_id", None) or 0
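# Illustrative call (tok/mdl/enc are placeholders for the loaded components and an
# encoded batch): the resolved id is passed to generate() as forced_bos_token_id
# so NLLB starts decoding in the target language.
# >>> bos = _forced_bos_id(tok, mdl, "spa_Latn")
# >>> out = mdl.generate(**enc, forced_bos_token_id=bos, max_new_tokens=MAX_NEW_TOKENS)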

# ---------- Core translation ----------
@torch.inference_mode()
def translate_text(src_lang: str, tgt_lang: str, text: str, num_beams: int = 1, _tries: int = 0) -> str:
    """
    Translate one text (token-chunked when it exceeds MAX_SRC_TOKENS).
    Used for titles and as the core routine for article chunks.
    """
    if not text or not text.strip():
        return ""
@@ -416,7 +422,6 @@ def translate_text(src_lang: str, tgt_lang: str, text: str, num_beams: int = 1,
    src_code = map_to_nllb(src_lang) or "eng_Latn"
    tgt_code = map_to_nllb(tgt_lang) or "spa_Latn"

    # Set the source language (when the property exists)
    try:
        tok.src_lang = src_code
    except Exception:
@@ -439,7 +444,7 @@ def translate_text(src_lang: str, tgt_lang: str, text: str, num_beams: int = 1,
        max_new_tokens=MAX_NEW_TOKENS,
        num_beams=max(1, int(num_beams)),
        do_sample=False,
        use_cache=False,  # lowers memory use
    )
    if int(num_beams) > 1:
        gen_kwargs["early_stopping"] = True
@@ -459,7 +464,6 @@ def translate_text(src_lang: str, tgt_lang: str, text: str, num_beams: int = 1,
    except Exception as e:
        if device.type == "cuda" and _is_cuda_mem_error(e) and _tries < 2:
            LOG.warning("CUDA OOM/allocator: intento de recuperación %d. Detalle: %s", _tries + 1, e)
            # disable CUDA and retry on CPU
            global _MODEL, _DEVICE, _CUDA_DISABLED
            _CUDA_DISABLED = True
            try:
@@ -474,10 +478,11 @@ def translate_text(src_lang: str, tgt_lang: str, text: str, num_beams: int = 1,
            return translate_text(src_lang, tgt_lang, text, num_beams=num_beams, _tries=_tries + 1)
        raise
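# Usage sketch (hypothetical string): titles go straight through translate_text;
# a CUDA OOM mid-call transparently reloads the model on CPU and retries.
# >>> translate_text("en", "es", "Breaking news from Brussels", num_beams=NUM_BEAMS_TITLE)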

# ---------- Sentence-based chunking for articles ----------
def _sent_token_len(tokenizer, sent: str) -> int:
    return len(tokenizer(sent, add_special_tokens=False).input_ids)

def _pack_sentences_to_token_chunks(
    tokenizer, sentences: List[str], max_tokens: int, overlap_sents: int = 0
) -> List[List[str]]:
@@ -487,11 +492,10 @@ def _pack_sentences_to_token_chunks(
    for s in sentences:
        slen = _sent_token_len(tokenizer, s)
        if slen > max_tokens:
            # If a single sentence exceeds the limit, slice it by tokens as a last resort
            ids = tokenizer(s, add_special_tokens=False).input_ids
            step = max_tokens
            for i in range(0, len(ids), step):
                sub = tokenizer.decode(ids[i : i + step], skip_special_tokens=True)
                if cur:
                    chunks.append(cur)
                    cur = []
@@ -500,7 +504,8 @@ def _pack_sentences_to_token_chunks(
            continue

        if cur_tokens + slen <= max_tokens:
            cur.append(s)
            cur_tokens += slen
        else:
            if cur:
                chunks.append(cur)
@@ -509,13 +514,14 @@ def _pack_sentences_to_token_chunks(
                cur = overlap + [s]
                cur_tokens = sum(_sent_token_len(tokenizer, x) for x in cur)
            else:
                cur = [s]
                cur_tokens = slen

    if cur:
        chunks.append(cur)
    return chunks
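# Sketch of the packing behavior (illustrative token counts): with max_tokens=10,
# overlap_sents=0, and sentence lengths [6, 5, 4], the result is [[s1], [s2, s3]] —
# s2 does not fit next to s1 (6+5 > 10), while s3 fits next to s2 (5+4 <= 10).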

def _smart_concatenate(parts: List[str], tail_window: int = 120) -> str:
    """Join parts while avoiding obvious duplicates at the seams (light heuristic)."""
    if not parts:
        return ""
    out = parts[0]
@@ -529,24 +535,17 @@ def _smart_concatenate(parts: List[str], tail_window: int = 120) -> str:
        out += ("" if cut == 0 else nxt[cut:]) if nxt else ""
    return out

def translate_article_full(
    src_lang: str,
    tgt_lang: str,
    text: str,
    num_beams: int,
) -> str:
    """
    Translate a complete article:
    - Split into sentences (no variable-length look-behind)
    - Pack them into chunks <= the token limit
    - Translate chunk by chunk (uses translate_text internally)
    - Join with a heuristic that avoids duplicates at the seams
    """
    if not text or not text.strip():
        return ""

    if not CHUNK_BY_SENTENCES:
        # Fast path: one pass with internal truncation
        return translate_text(src_lang, tgt_lang, text, num_beams=num_beams)

    tok, _, _ = get_universal_components()
@@ -569,8 +568,11 @@ def translate_article_full(
    return _smart_concatenate([p for p in translated_parts if p])
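# Usage sketch (long_article_text is a placeholder): the whole pipeline in one call.
# >>> body_es = translate_article_full("en", "es", long_article_text, num_beams=NUM_BEAMS_BODY)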

# ---------- Batch processing ----------
def process_batch(conn, rows):
    done_rows = []
    error_rows = []
    for r in rows:
        tr_id = r["tr_id"]
        lang_to = normalize_lang(r["lang_to"], "es") or "es"
@@ -579,36 +581,70 @@ def process_batch(conn, rows):
title = (r["titulo"] or "").strip() title = (r["titulo"] or "").strip()
body = (r["resumen"] or "").strip() body = (r["resumen"] or "").strip()
# Si ya está en el mismo idioma, copia tal cual
if (map_to_nllb(lang_from) or "eng_Latn") == (map_to_nllb(lang_to) or "spa_Latn"): if (map_to_nllb(lang_from) or "eng_Latn") == (map_to_nllb(lang_to) or "spa_Latn"):
mark_done(conn, tr_id, title, body, lang_from) done_rows.append((title, body, lang_from, tr_id))
continue continue
try: try:
# Títulos: cortos, traducción directa (beams más altos si quieres)
title_tr = translate_text(lang_from, lang_to, title, num_beams=NUM_BEAMS_TITLE) if title else "" title_tr = translate_text(lang_from, lang_to, title, num_beams=NUM_BEAMS_TITLE) if title else ""
# Cuerpo/resumen: artículo completo con chunking por frases
body_tr = translate_article_full(lang_from, lang_to, body, num_beams=NUM_BEAMS_BODY) if body else "" body_tr = translate_article_full(lang_from, lang_to, body, num_beams=NUM_BEAMS_BODY) if body else ""
# Si la "traducción" es igual al original, déjala vacía
if _norm(title_tr) == _norm(title): if _norm(title_tr) == _norm(title):
title_tr = "" title_tr = ""
if _norm(body_tr) == _norm(body): if _norm(body_tr) == _norm(body):
body_tr = "" body_tr = ""
mark_done(conn, tr_id, title_tr, body_tr, lang_from) done_rows.append((title_tr, body_tr, lang_from, tr_id))
except Exception as e: except Exception as e:
LOG.exception("Error traduciendo fila") LOG.exception("Error traduciendo fila")
mark_error(conn, tr_id, str(e)) error_rows.append((str(e)[:1500], tr_id))
    with conn.cursor() as cur:
        if done_rows:
            execute_values(
                cur,
                """
                UPDATE traducciones AS t
                SET titulo_trad = v.titulo_trad,
                    resumen_trad = v.resumen_trad,
                    lang_from = COALESCE(t.lang_from, v.lang_from),
                    status = 'done',
                    error = NULL
                FROM (VALUES %s) AS v(titulo_trad, resumen_trad, lang_from, id)
                WHERE t.id = v.id;
                """,
                done_rows,
            )
        if error_rows:
            execute_values(
                cur,
                """
                UPDATE traducciones AS t
                SET status = 'error',
                    error = v.error
                FROM (VALUES %s) AS v(error, id)
                WHERE t.id = v.id;
                """,
                error_rows,
            )
    conn.commit()
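# The two execute_values() calls (psycopg2.extras) flush the whole batch in one
# round trip per outcome; each VALUES row is matched back to traducciones by id.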

def main():
    LOG.info(
        "Arrancando worker de traducción (NLLB). TARGET_LANGS=%s, BATCH=%s, ENQUEUE=%s, DEVICE=%s, "
        "BEAMS(title/body)=%s/%s, CHUNK_BY_SENTENCES=%s, CHUNK_MAX_TOKENS=%s, OVERLAP_SENTS=%s",
        TARGET_LANGS,
        BATCH_SIZE,
        ENQUEUE_MAX,
        DEVICE_CFG,
        NUM_BEAMS_TITLE,
        NUM_BEAMS_BODY,
        CHUNK_BY_SENTENCES,
        CHUNK_MAX_TOKENS,
        CHUNK_OVERLAP_SENTS,
    )

    # Preload the model once so memory gets reserved cleanly
    get_universal_components()

    while True:
@@ -628,6 +664,7 @@ def main():
        if not any_work:
            time.sleep(SLEEP_IDLE)

if __name__ == "__main__":
    os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
    main()