From fc065669286f464b27058472b64d6fbc29935f90 Mon Sep 17 00:00:00 2001 From: jlimolina Date: Fri, 21 Nov 2025 04:42:02 +0100 Subject: [PATCH] arreglo de ui y busquedas --- app.py | 98 ++++- docker-compose.yml | 2 +- embeddings_worker.py | 66 +-- init-db/08-embeddings.sql | 2 +- migrations/001_utils_normalize_url.sql | 62 --- migrations/002_unique_index_url_norm.sql | 38 -- ner_worker.py | 229 +++++++++- related_worker.py | 158 ++++--- scheduler.py | 2 +- static/style.css | 511 +++++++++++++++++++---- templates/_noticias_list.html | 7 +- templates/dashboard.html | 15 +- templates/noticia.html | 22 +- templates/restore_completo.html | 49 +++ translation_worker.py | 289 +++++++------ 15 files changed, 1115 insertions(+), 435 deletions(-) delete mode 100644 migrations/001_utils_normalize_url.sql delete mode 100644 migrations/002_unique_index_url_norm.sql create mode 100644 templates/restore_completo.html diff --git a/app.py b/app.py index e4412ab..6cdb04f 100644 --- a/app.py +++ b/app.py @@ -3,6 +3,7 @@ import csv import io import time import socket +import zipfile from datetime import datetime, date from concurrent.futures import ThreadPoolExecutor, as_completed @@ -337,21 +338,48 @@ def home(): params.append(int(continente_id)) if q: - where.append("n.tsv @@ plainto_tsquery('spanish', %s)") - params.append(q) + search_like = f"%{q}%" + if use_tr: + where.append( + """ + ( + n.tsv @@ websearch_to_tsquery('spanish', %s) + OR t.titulo_trad ILIKE %s + OR t.resumen_trad ILIKE %s + OR n.titulo ILIKE %s + OR n.resumen ILIKE %s + ) + """ + ) + params.extend([q, search_like, search_like, search_like, search_like]) + else: + where.append( + """ + ( + n.tsv @@ websearch_to_tsquery('spanish', %s) + OR n.titulo ILIKE %s + OR n.resumen ILIKE %s + ) + """ + ) + params.extend([q, search_like, search_like]) where_sql = " AND ".join(where) with conn.cursor(cursor_factory=extras.DictCursor) as cur: cur.execute( f""" - SELECT COUNT(*) + SELECT COUNT(DISTINCT n.id) FROM noticias n LEFT JOIN categorias c ON c.id = n.categoria_id LEFT JOIN paises p ON p.id = n.pais_id + LEFT JOIN traducciones t + ON t.noticia_id = n.id + AND t.lang_to = %s + AND t.status = 'done' WHERE {where_sql} """, - params, + [lang] + params, ) total_results = cur.fetchone()[0] if cur.rowcount else 0 total_pages = (total_results // per_page) + (1 if total_results % per_page else 0) @@ -925,7 +953,7 @@ def add_url_source(): """ INSERT INTO fuentes_url (nombre, url, categoria_id, pais_id, idioma) VALUES (%s, %s, %s, %s, %s) - ON CONFLICT (url_norm) DO UPDATE + ON CONFLICT (url) DO UPDATE SET nombre = EXCLUDED.nombre, categoria_id = EXCLUDED.categoria_id, pais_id = EXCLUDED.pais_id, @@ -1172,8 +1200,6 @@ def scrape_url(): @app.route("/backup_completo") def backup_completo(): - import zipfile - mem_file = io.BytesIO() with zipfile.ZipFile(mem_file, mode="w", compression=zipfile.ZIP_DEFLATED) as zf: with get_conn() as conn, conn.cursor(cursor_factory=extras.DictCursor) as cur: @@ -1207,6 +1233,64 @@ def backup_completo(): ) +@app.route("/restore_completo", methods=["GET", "POST"]) +def restore_completo(): + if request.method == "GET": + return render_template("restore_completo.html") + + file = request.files.get("backup_file") + if not file or file.filename == "": + flash("No se ha seleccionado ningún archivo.", "error") + return redirect(url_for("restore_completo")) + + filename = file.filename.lower() + if not filename.endswith(".zip"): + flash("El archivo debe ser un .zip.", "error") + return redirect(url_for("restore_completo")) + + raw = 
file.read() + try: + zf = zipfile.ZipFile(io.BytesIO(raw)) + except zipfile.BadZipFile: + flash("El archivo no es un .zip válido.", "error") + return redirect(url_for("restore_completo")) + + restored_counts = {} + + conn = get_conn() + try: + with conn: + with conn.cursor() as cur: + if "feeds.csv" in zf.namelist(): + cur.execute("TRUNCATE TABLE feeds RESTART IDENTITY;") + with zf.open("feeds.csv") as f: + text_f = io.TextIOWrapper(f, encoding="utf-8") + cur.copy_expert("COPY feeds FROM STDIN CSV HEADER", text_f) + restored_counts["feeds"] = cur.rowcount if cur.rowcount is not None else 0 + + if "fuentes_url.csv" in zf.namelist(): + cur.execute("TRUNCATE TABLE fuentes_url RESTART IDENTITY;") + with zf.open("fuentes_url.csv") as f2: + text_f2 = io.TextIOWrapper(f2, encoding="utf-8") + cur.copy_expert("COPY fuentes_url FROM STDIN CSV HEADER", text_f2) + restored_counts["fuentes_url"] = cur.rowcount if cur.rowcount is not None else 0 + except Exception as e: + conn.rollback() + conn.close() + flash(f"Error al restaurar el backup: {e}", "error") + return redirect(url_for("restore_completo")) + + conn.close() + + if restored_counts: + partes = [f"{tabla}: {n} filas" for tabla, n in restored_counts.items()] + flash("Restauración completada: " + ", ".join(partes), "success") + else: + flash("Backup procesado pero no se encontraron ficheros reconocidos (feeds.csv, fuentes_url.csv).", "warning") + + return redirect(url_for("dashboard")) + + if __name__ == "__main__": app.run(host="0.0.0.0", port=8001, debug=True) diff --git a/docker-compose.yml b/docker-compose.yml index 7e77869..d81aec3 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -141,7 +141,7 @@ services: - DB_NAME=${DB_NAME} - DB_USER=${DB_USER} - DB_PASS=${DB_PASS} - - EMB_MODEL=sentence-transformers/all-MiniLM-L6-v2 + - EMB_MODEL=sentence-transformers/paraphrase-multilingual-mpnet-base-v2 - EMB_BATCH=256 - EMB_SLEEP_IDLE=5 - EMB_LANGS=es diff --git a/embeddings_worker.py b/embeddings_worker.py index 4643f53..405241f 100644 --- a/embeddings_worker.py +++ b/embeddings_worker.py @@ -1,16 +1,12 @@ -# embeddings_worker.py -# Worker de embeddings para TRADUCCIONES: -# - Lee traducciones con status='done' y sin embedding para un modelo concreto -# - Calcula embedding (Sentence-Transformers) sobre titulo_trad + resumen_trad -# - Guarda en traduccion_embeddings (traduccion_id, model, dim, embedding) - import os import time import logging from typing import List + import numpy as np import psycopg2 import psycopg2.extras +from psycopg2.extras import execute_values from sentence_transformers import SentenceTransformer import torch @@ -28,20 +24,20 @@ DB = dict( # ---------- Parámetros de worker ---------- # Modelo por defecto: multilingüe, bueno para muchas lenguas -EMB_MODEL = os.environ.get( +EMB_MODEL = os.environ.get( "EMB_MODEL", - "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", + "sentence-transformers/paraphrase-multilingual-mpnet-base-v2", ) -EMB_BATCH = int(os.environ.get("EMB_BATCH", "128")) +EMB_BATCH = int(os.environ.get("EMB_BATCH", "128")) SLEEP_IDLE = float(os.environ.get("EMB_SLEEP_IDLE", "5.0")) # Filtrado por idiomas destino (coma-separado). 
Por defecto sólo 'es' -EMB_LANGS = [s.strip() for s in os.environ.get("EMB_LANGS", "es").split(",") if s.strip()] +EMB_LANGS = [s.strip() for s in os.environ.get("EMB_LANGS", "es").split(",") if s.strip()] # DEVICE_ENV: 'auto' | 'cpu' | 'cuda' DEVICE_ENV = os.environ.get("DEVICE", "auto").lower() # Límite por iteración (para no tragar toda la tabla de golpe) -EMB_LIMIT = int(os.environ.get("EMB_LIMIT", "1000")) +EMB_LIMIT = int(os.environ.get("EMB_LIMIT", "1000")) # ---------- Utilidades ---------- @@ -65,8 +61,13 @@ def ensure_schema(conn): ); """ ) - cur.execute("CREATE INDEX IF NOT EXISTS idx_tr_emb_model ON traduccion_embeddings(model);") - cur.execute("CREATE INDEX IF NOT EXISTS idx_tr_emb_trid ON traduccion_embeddings(traduccion_id);") + # Alineado con init-db/08-embeddings.sql + cur.execute( + "CREATE INDEX IF NOT EXISTS idx_tr_emb_model ON traduccion_embeddings(model);" + ) + cur.execute( + "CREATE INDEX IF NOT EXISTS idx_tr_emb_traduccion_id ON traduccion_embeddings(traduccion_id);" + ) conn.commit() @@ -104,7 +105,7 @@ def texts_from_rows(rows: List[psycopg2.extras.DictRow]) -> List[str]: Compone el texto a vectorizar por cada traducción: 'titulo_trad' + '\n' + 'resumen_trad'. Si alguno falta, usa lo disponible. """ - texts = [] + texts: List[str] = [] for r in rows: title = (r["titulo_trad"] or "").strip() body = (r["resumen_trad"] or "").strip() @@ -117,24 +118,37 @@ def texts_from_rows(rows: List[psycopg2.extras.DictRow]) -> List[str]: def upsert_embeddings(conn, rows, embs: np.ndarray, model_name: str): """ - Inserta/actualiza embeddings por traducción. + Inserta/actualiza embeddings por traducción en lote (batch insert). """ if embs.size == 0 or not rows: return + dim = int(embs.shape[1]) + + # Preparamos los datos para execute_values + data = [ + ( + int(r["traduccion_id"]), + model_name, + dim, + [float(x) for x in e], + ) + for r, e in zip(rows, embs) + ] + with conn.cursor() as cur: - for r, e in zip(rows, embs): - cur.execute( - """ - INSERT INTO traduccion_embeddings (traduccion_id, model, dim, embedding) - VALUES (%s, %s, %s, %s) - ON CONFLICT (traduccion_id, model) DO UPDATE - SET embedding = EXCLUDED.embedding, - dim = EXCLUDED.dim, - created_at = NOW() + execute_values( + cur, + """ + INSERT INTO traduccion_embeddings (traduccion_id, model, dim, embedding) + VALUES %s + ON CONFLICT (traduccion_id, model) DO UPDATE + SET embedding = EXCLUDED.embedding, + dim = EXCLUDED.dim, + created_at = NOW() """, - (int(r["traduccion_id"]), model_name, dim, list(map(float, e))), - ) + data, + ) conn.commit() diff --git a/init-db/08-embeddings.sql b/init-db/08-embeddings.sql index 6966670..5b9a821 100644 --- a/init-db/08-embeddings.sql +++ b/init-db/08-embeddings.sql @@ -29,7 +29,7 @@ SELECT te.dim, te.embedding AS vec FROM traduccion_embeddings te -WHERE te.model = 'sentence-transformers/all-MiniLM-L6-v2'; +WHERE te.model = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'; CREATE TABLE IF NOT EXISTS related_noticias ( traduccion_id INT NOT NULL REFERENCES traducciones(id) ON DELETE CASCADE, diff --git a/migrations/001_utils_normalize_url.sql b/migrations/001_utils_normalize_url.sql deleted file mode 100644 index 7ba2b1a..0000000 --- a/migrations/001_utils_normalize_url.sql +++ /dev/null @@ -1,62 +0,0 @@ --- Función para “normalizar” URLs: minúsculas en esquema/host, quitar www, --- quitar fragmentos (#...), limpiar trackers comunes (utm_*, gclid, fbclid, ref, etc.), --- colapsar dobles “/”, y quitar la “/” final salvo si es la raíz. 
- -CREATE OR REPLACE FUNCTION normalize_url(in_url text) -RETURNS text -LANGUAGE plpgsql -AS $$ -DECLARE - u text := trim(in_url); - scheme_host text; - path_q text; -BEGIN - IF u IS NULL OR u = '' THEN - RETURN NULL; - END IF; - - -- quitar espacios y fragmentos - u := regexp_replace(u, '#.*$', '', 'i'); - - -- separar esquema+host de path+query - -- ej: https://example.com:443/foo?bar -> scheme_host=https://example.com:443 ; path_q=/foo?bar - scheme_host := substring(u FROM '^[a-z]+://[^/]*'); - IF scheme_host IS NULL THEN - -- si no hay esquema, asumimos http - u := 'http://' || u; - scheme_host := substring(u FROM '^[a-z]+://[^/]*'); - END IF; - path_q := substring(u FROM '^[a-z]+://[^/]*(/.*)$'); - IF path_q IS NULL THEN - path_q := '/'; - END IF; - - -- normalizar esquema y host (minúsculas, quitar www.) - scheme_host := lower(scheme_host); - scheme_host := regexp_replace(scheme_host, '^(https?://)www\.', '\1', 'i'); - - -- quitar puerto por defecto (:80 en http, :443 en https) - scheme_host := regexp_replace(scheme_host, '^http://([^/:]+):80$', 'http://\1', 'i'); - scheme_host := regexp_replace(scheme_host, '^https://([^/:]+):443$', 'https://\1', 'i'); - - -- limpiar parámetros de tracking en la query - -- elimina ?utm_... &utm_... gclid fbclid mc_cid mc_eid ref ref_src etc. - path_q := regexp_replace(path_q, '([?&])(utm_[^=&]+|gclid|fbclid|mc_cid|mc_eid|ref|ref_src|yclid|igshid)=[^&#]*', '\1', 'gi'); - -- limpiar conectores sobrantes ?, &, &&, ?&, etc. - path_q := regexp_replace(path_q, '\?&+', '?', 'g'); - path_q := regexp_replace(path_q, '&{2,}', '&', 'g'); - path_q := regexp_replace(path_q, '\?$', '', 'g'); - path_q := regexp_replace(path_q, '\?$','', 'g'); - - -- colapsar dobles barras en path (no tocar “://”) - path_q := regexp_replace(path_q, '/{2,}', '/', 'g'); - - -- quitar “/” final si no es la raíz - IF path_q <> '/' THEN - path_q := regexp_replace(path_q, '/+$', '', 'g'); - END IF; - - RETURN scheme_host || path_q; -END; -$$; - diff --git a/migrations/002_unique_index_url_norm.sql b/migrations/002_unique_index_url_norm.sql deleted file mode 100644 index 0736c09..0000000 --- a/migrations/002_unique_index_url_norm.sql +++ /dev/null @@ -1,38 +0,0 @@ --- Añadir columna generada url_norm y crear índice único sobre ella. --- OJO: si ya existen duplicados, este índice fallará. --- Primero crea la columna si no existe: - -DO $$ -BEGIN - IF NOT EXISTS ( - SELECT 1 FROM information_schema.columns - WHERE table_name='feeds' AND column_name='url_norm' - ) THEN - ALTER TABLE feeds - ADD COLUMN url_norm text GENERATED ALWAYS AS (normalize_url(url)) STORED; - END IF; -END $$; - --- Índice único (concurrently para no bloquear). Requiere estar fuera de transacción. --- Si tu herramienta corre todo en una transacción, ejecuta estas dos líneas aparte. --- Quita duplicados antes si da error. 
-CREATE UNIQUE INDEX CONCURRENTLY IF NOT EXISTS feeds_url_norm_uniq ON feeds (url_norm) -WHERE url_norm IS NOT NULL; - --- (Opcional) repetir lo mismo para fuentes_url y noticias si quieres esa garantía también: - -DO $$ -BEGIN - IF NOT EXISTS ( - SELECT 1 FROM information_schema.columns - WHERE table_name='fuentes_url' AND column_name='url_norm' - ) THEN - ALTER TABLE fuentes_url - ADD COLUMN url_norm text GENERATED ALWAYS AS (normalize_url(url)) STORED; - END IF; -END $$; - -CREATE UNIQUE INDEX CONCURRENTLY IF NOT EXISTS fuentes_url_norm_uniq ON fuentes_url (url_norm) -WHERE url_norm IS NOT NULL; - - diff --git a/ner_worker.py b/ner_worker.py index f393e59..a6f6f9f 100644 --- a/ner_worker.py +++ b/ner_worker.py @@ -2,6 +2,10 @@ import os import time import logging import re +import string +from typing import List, Tuple +from collections import Counter + import psycopg2 import psycopg2.extras import spacy @@ -28,30 +32,120 @@ ENT_LABELS = { } _ws_re = re.compile(r"\s+") + HTML_TRASH_PATTERNS = [ r"<[^>]+>", r"&[a-z]+;", + r"&#\d+;?", # entidades numéricas tipo … r'width="\d+"', r'height="\d+"', ] + +# Palabras/sintagmas demasiado genéricos o claramente ruido GENERIC_BAD_TAGS = { "república", "estado", "centro", "gobierno", + "el gobierno", + "gobiernos", "report", "sp", "unión", + "union", + "dólares", + "dolar", + "dólar", + "the post", + "post", + "artículo", + "el artículo", + "la ciudad", + "mundo", + "país", + "pais", + "países", + "paises", + "la noche", + "la publicación", + "este miércoles", + "el miércoles", + "hoy", + "ayer", + "mañana", + "servicio", + "servicios", + "el presidente", + "presidente", + "el ministro", + "ministro", + "la guerra", + "guerra", + "seguridad", + "wp-content", + "internal_photos", + "/internal_photos", + "https", + "http", + "src", } +STOPWORDS = set() -def clean_tag_text(text): +ARTICLES = { + "el", + "la", + "los", + "las", + "un", + "una", + "uno", + "al", + "del", +} + +TOPIC_MIN_CHARS = 4 +TOPIC_MAX_WORDS = 6 +TOPIC_MAX_PER_DOC = 15 + + +def _looks_like_attr_or_path(text_lower: str) -> bool: + """Detecta cosas tipo rutas, atributos HTML, ids raros, etc.""" + if text_lower.startswith("/"): + return True + if "http://" in text_lower or "https://" in text_lower: + return True + if ".com" in text_lower or ".net" in text_lower or ".org" in text_lower: + return True + if any(ext in text_lower for ext in (".jpg", ".jpeg", ".png", ".gif", ".webp")): + return True + if re.search(r"\b(src|alt|style|class)\s*=", text_lower): + return True + if "data-" in text_lower: + return True + if re.search(r"&#\d+;?", text_lower): + return True + # cosas tipo atributo=valor + if "=" in text_lower and " " not in text_lower.strip(): + return True + # cadenas largas sin espacios (ids, hashes…) + if re.fullmatch(r"[a-z0-9_]{15,}", text_lower.replace("-", "").replace(" ", "")): + return True + # palabra única con guión suele ser ruta/slug: wp-content, internal-photos… + if "-" in text_lower and " " not in text_lower: + return True + return False + + +def clean_tag_text(text: str) -> str | None: + """Limpieza para entidades (PERSON/ORG/GPE/LOC).""" if not text: return None text = BeautifulSoup(text, "html.parser").get_text() for pat in HTML_TRASH_PATTERNS: text = re.sub(pat, "", text) text = _ws_re.sub(" ", text).strip() + text = text.strip(string.punctuation + " ") if len(text) < 3: return None if re.search(r"[<>/\\]", text): @@ -59,13 +153,15 @@ def clean_tag_text(text): lower = text.lower() if lower.startswith("href="): return None - if lower.startswith("http"): + if 
_looks_like_attr_or_path(lower): return None if lower in GENERIC_BAD_TAGS: return None + replacements = { "ee.uu.": "Estados Unidos", "los estados unidos": "Estados Unidos", + "eeuu": "Estados Unidos", "eu": "Unión Europea", "ue": "Unión Europea", "kosova": "Kosovo", @@ -75,13 +171,112 @@ def clean_tag_text(text): return text +def clean_topic_text(text: str) -> str | None: + """Limpieza para posibles 'temas' (noun_chunks).""" + if not text: + return None + text = BeautifulSoup(text, "html.parser").get_text() + for pat in HTML_TRASH_PATTERNS: + text = re.sub(pat, "", text) + text = _ws_re.sub(" ", text).strip() + text = text.strip(string.punctuation + " ") + if len(text) < TOPIC_MIN_CHARS: + return None + + lower = text.lower() + + if _looks_like_attr_or_path(lower): + return None + + # tokenizamos en minúsculas y quitamos puntuación + tokens = [ + t.strip(string.punctuation) + for t in lower.split() + if t.strip(string.punctuation) + ] + if not tokens: + return None + + # quitamos artículo inicial si lo hay + if tokens and tokens[0] in ARTICLES: + tokens = tokens[1:] + if not tokens: + return None + + # reconstruimos texto normalizado sin artículo + norm = " ".join(tokens).strip() + if len(norm) < TOPIC_MIN_CHARS: + return None + + if norm in GENERIC_BAD_TAGS: + return None + + # límite máximo de palabras + if len(tokens) > TOPIC_MAX_WORDS: + return None + + # todos stopwords => fuera + if all(t in STOPWORDS for t in tokens): + return None + + # sólo números/fechas + if re.fullmatch(r"[0-9\s\.,\-:/]+", norm): + return None + + return norm + + def get_conn(): return psycopg2.connect(**DB) +def extract_entities_and_topics(nlp, text: str) -> Tuple[List[Tuple[str, str]], List[Tuple[str, str]]]: + ents: List[Tuple[str, str]] = [] + topics: List[Tuple[str, str]] = [] + + if not text: + return ents, topics + + doc = nlp(text) + + # Entidades "clásicas" + for ent in doc.ents: + tipo = ENT_LABELS.get(ent.label_) + if not tipo: + continue + val = clean_tag_text(ent.text) + if not val: + continue + ents.append((val, tipo)) + + # Candidatos a "tema" a partir de noun_chunks + topic_counter: Counter[str] = Counter() + + for chunk in doc.noun_chunks: + val = clean_topic_text(chunk.text) + if not val: + continue + topic_counter[val] += 1 + + ent_values = {v for (v, _) in ents} + + for val, _count in topic_counter.most_common(TOPIC_MAX_PER_DOC): + if val in ent_values: + continue + topics.append((val, "tema")) + + # quitamos duplicados + ents = list(set(ents)) + topics = list(set(topics)) + return ents, topics + + def main(): - nlp = spacy.load("es_core_news_md", disable=["parser", "lemmatizer", "textcat"]) - logging.info("spaCy cargado: es_core_news_md") + global STOPWORDS + + nlp = spacy.load("es_core_news_md", disable=["lemmatizer", "textcat"]) + STOPWORDS = set(nlp.Defaults.stop_words) + logging.info("spaCy cargado: es_core_news_md (NER + parser)") while True: try: @@ -109,7 +304,7 @@ def main(): time.sleep(5) continue - logging.info(f"Procesando {len(rows)} traducciones para NER...") + logging.info(f"Procesando {len(rows)} traducciones para NER/temas...") new_links = 0 @@ -118,22 +313,12 @@ def main(): if not text: continue - doc = nlp(text) - ents = [] - - for ent in doc.ents: - tipo = ENT_LABELS.get(ent.label_) - if not tipo: - continue - val = clean_tag_text(ent.text) - if not val: - continue - ents.append((val, tipo)) - - if not ents: + ents, topics = extract_entities_and_topics(nlp, text) + all_tags = ents + topics + if not all_tags: continue - for valor, tipo in set(ents): + for valor, tipo in 
all_tags: try: cur.execute( """ @@ -160,9 +345,9 @@ def main(): logging.exception("Fallo insertando tag/relación") conn.commit() - logging.info(f"NER lote OK. Nuevos enlaces: {new_links}.") - except Exception as e: - logging.exception(f"Error en NER loop: {e}") + logging.info(f"NER/temas lote OK. Nuevos enlaces: {new_links}.") + except Exception: + logging.exception("Error en NER loop") time.sleep(5) diff --git a/related_worker.py b/related_worker.py index 6f8f17d..a9b8b08 100644 --- a/related_worker.py +++ b/related_worker.py @@ -1,9 +1,9 @@ import os import time -import math import logging from typing import List, Tuple +import numpy as np import psycopg2 import psycopg2.extras @@ -22,7 +22,6 @@ DB = dict( TOPK = int(os.environ.get("RELATED_TOPK", 10)) BATCH_IDS = int(os.environ.get("RELATED_BATCH_IDS", 200)) -BATCH_SIM = int(os.environ.get("RELATED_BATCH_SIM", 2000)) SLEEP_IDLE = float(os.environ.get("RELATED_SLEEP", 10)) MIN_SCORE = float(os.environ.get("RELATED_MIN_SCORE", 0.0)) WINDOW_HOURS = int(os.environ.get("RELATED_WINDOW_H", 0)) @@ -32,44 +31,64 @@ def get_conn(): return psycopg2.connect(**DB) +# --------------------------------------------------------- +# Cargar embeddings SOLO de traducciones en español (lang_to='es') +# --------------------------------------------------------- def _fetch_all_embeddings(cur): + base_sql = """ + SELECT e.traduccion_id, e.vec + FROM embeddings e + JOIN traducciones t ON t.id = e.traduccion_id + JOIN noticias n ON n.id = t.noticia_id + WHERE t.lang_to = 'es' + """ + + params = [] if WINDOW_HOURS > 0: - cur.execute( - """ - SELECT e.traduccion_id, e.vec - FROM embeddings e - JOIN traducciones t ON t.id = e.traduccion_id - JOIN noticias n ON n.id = t.noticia_id - WHERE n.fecha >= NOW() - INTERVAL %s - """, - (f"{WINDOW_HOURS} hours",), - ) - else: - cur.execute("SELECT traduccion_id, vec FROM embeddings") + base_sql += " AND n.fecha >= NOW() - INTERVAL %s" + params.append(f"{WINDOW_HOURS} hours") + + cur.execute(base_sql, params) rows = cur.fetchall() if not rows: - return [], [], [] + return [], None ids = [] vecs = [] - norms = [] - for tr_id, v in rows: + + for tid, v in rows: if v is None: - v = [] - nrm = math.sqrt(sum(((x or 0.0) * (x or 0.0)) for x in v)) or 1e-8 - ids.append(tr_id) + continue + ids.append(tid) vecs.append(v) - norms.append(nrm) - return ids, vecs, norms + + if not ids: + return [], None + + # Convertimos a matriz numpy + mat = np.array(vecs, dtype=np.float32) + + # Normalizamos (evita división por 0) + norms = np.linalg.norm(mat, axis=1, keepdims=True) + norms[norms == 0] = 1e-8 + mat = mat / norms + + return ids, mat +# --------------------------------------------------------- +# Obtiene IDs pendientes +# --------------------------------------------------------- def _fetch_pending_ids(cur, limit) -> List[int]: cur.execute( """ SELECT e.traduccion_id FROM embeddings e - LEFT JOIN related_noticias r ON r.traduccion_id = e.traduccion_id + JOIN traducciones t ON t.id = e.traduccion_id + LEFT JOIN related_noticias r + ON r.traduccion_id = e.traduccion_id + WHERE t.lang_to = 'es' GROUP BY e.traduccion_id HAVING COUNT(r.related_traduccion_id) = 0 ORDER BY e.traduccion_id DESC @@ -80,42 +99,44 @@ def _fetch_pending_ids(cur, limit) -> List[int]: return [r[0] for r in cur.fetchall()] -def _cosine_with_norms(a, b, na, nb): - num = 0.0 - for x, y in zip(a, b): - xv = x or 0.0 - yv = y or 0.0 - num += xv * yv - denom = na * nb - if denom <= 0.0: - return 0.0 - return num / denom - - -def _topk_for_one( +# 
---------------------------------------------------------
+# TOP-K usando NumPy (súper rápido)
+# ---------------------------------------------------------
+def _topk_numpy(
     idx: int,
     ids_all: List[int],
-    vecs_all: List[List[float]],
-    norms_all: List[float],
-    pool_indices: List[int],
-    K: int,
+    mat: np.ndarray,
+    K: int
 ) -> List[Tuple[int, float]]:
-    me_vec = vecs_all[idx]
-    me_norm = norms_all[idx]
-    out: List[Tuple[int, float]] = []
-    for j in pool_indices:
-        if j == idx:
-            continue
-        s = _cosine_with_norms(me_vec, vecs_all[j], me_norm, norms_all[j])
-        out.append((ids_all[j], s))
+    # vector de la noticia central
+    q = mat[idx]  # (dim,)

-    out.sort(key=lambda t: t[1], reverse=True)
-    if MIN_SCORE > 0.0:
-        out = [p for p in out if p[1] >= MIN_SCORE]
-    return out[:K]
+    # similitudes coseno: dot product (matriz · vector)
+    sims = np.dot(mat, q)
+
+    # eliminar self-match
+    sims[idx] = -999.0
+
+    # filtramos por score mínimo
+    if MIN_SCORE > 0:
+        mask = sims >= MIN_SCORE
+        sims = np.where(mask, sims, -999.0)
+
+    # obtenemos los índices top-k (mucho más rápido que ordenar todo)
+    if K >= len(sims):
+        top_idx = np.argsort(-sims)
+    else:
+        part = np.argpartition(-sims, K)[:K]
+        top_idx = part[np.argsort(-sims[part])]
+
+    # descartamos el sentinela -999 (self-match o score filtrado):
+    # con menos de K candidatos válidos, sin este filtro se insertarían
+    # pares espurios con score -999 en related_noticias
+    out = [
+        (ids_all[j], float(sims[j]))
+        for j in top_idx[:K]
+        if sims[j] > -999.0
+    ]
+    return out


+# ---------------------------------------------------------
+# Inserta en la tabla related_noticias
+# ---------------------------------------------------------
 def _insert_related(cur, tr_id: int, pairs: List[Tuple[int, float]]):
     if not pairs:
         return
@@ -127,48 +148,47 @@
             ON CONFLICT (traduccion_id, related_traduccion_id) DO UPDATE
             SET score = EXCLUDED.score
         """,
-        [(tr_id, rid, float(score)) for (rid, score) in pairs],
+        [(tr_id, rid, score) for (rid, score) in pairs],
     )


+# ---------------------------------------------------------
+# Procesar IDs objetivo
+# ---------------------------------------------------------
 def build_for_ids(conn, target_ids: List[int]) -> int:
     with conn.cursor() as cur:
-        ids_all, vecs_all, norms_all = _fetch_all_embeddings(cur)
-        if not ids_all:
+        ids_all, mat = _fetch_all_embeddings(cur)
+
+        if not ids_all or mat is None:
             return 0

+        # Mapa ID → index
         pos = {tid: i for i, tid in enumerate(ids_all)}
-        n = len(ids_all)

     processed = 0
     with conn.cursor() as cur:
         for tr_id in target_ids:
             if tr_id not in pos:
                 continue
-            i = pos[tr_id]
-            top: List[Tuple[int, float]] = []
-            for start in range(0, n, BATCH_SIM):
-                block = list(range(start, min(start + BATCH_SIM, n)))
-                candidates = _topk_for_one(i, ids_all, vecs_all, norms_all, block, TOPK)
-                top += candidates
-            top.sort(key=lambda t: t[1], reverse=True)
-            if len(top) > TOPK:
-                top = top[:TOPK]
-
-            _insert_related(cur, tr_id, top)
+            idx = pos[tr_id]
+            pairs = _topk_numpy(idx, ids_all, mat, TOPK)
+            _insert_related(cur, tr_id, pairs)
             processed += 1

     conn.commit()
+
     return processed


+# ---------------------------------------------------------
+# MAIN
+# ---------------------------------------------------------
 def main():
     logging.info(
-        "Iniciando related_worker (TOPK=%s, BATCH_IDS=%s, BATCH_SIM=%s, MIN_SCORE=%.3f, WINDOW_H=%s)",
+        "Iniciando related_worker (TOPK=%s, BATCH_IDS=%s, MIN_SCORE=%.3f, WINDOW_H=%s)",
         TOPK,
         BATCH_IDS,
-        BATCH_SIM,
         MIN_SCORE,
         WINDOW_HOURS,
     )
diff --git a/scheduler.py b/scheduler.py
index bcbb1c1..b6b517f 100644
--- a/scheduler.py
+++ b/scheduler.py
@@ -36,7 +36,7 @@ if __name__ == '__main__':
    # --- CORRECCIÓN 2: Se cambió 'fetch_and_store' por
'fetch_and_store_all' --- fetch_and_store_all, "interval", - minutes=3, + minutes=10, id="rss_job", next_run_time=datetime.utcnow() + timedelta(seconds=10) ) diff --git a/static/style.css b/static/style.css index 2957eb4..1386657 100644 --- a/static/style.css +++ b/static/style.css @@ -14,6 +14,7 @@ } * { box-sizing: border-box; } + body { font-family: 'Poppins', 'Segoe UI', Tahoma, sans-serif; margin: 0; @@ -24,6 +25,12 @@ body { font-weight: 400; } +img { + max-width: 100%; + height: auto; + display: block; +} + .container { max-width: 900px; margin: 30px auto; @@ -36,58 +43,324 @@ body { -webkit-backdrop-filter: blur(12px); } -header { text-align: center; margin-bottom: 40px; border-bottom: 1px solid var(--border-color); padding-bottom: 30px; } -h1 { font-size: 2.8rem; font-weight: 700; margin: 0 0 5px 0; background: var(--gradiente-principal); -webkit-background-clip: text; -webkit-text-fill-color: transparent; display: inline-block; } -h2 { font-size: 1.8rem; font-weight: 600; color: var(--primary-color); margin-bottom: 20px; } -.subtitle { color: var(--text-color-light); font-size: 1.1rem; margin-top: 5px; } +header { + text-align: center; + margin-bottom: 40px; + border-bottom: 1px solid var(--border-color); + padding-bottom: 30px; +} -.form-section, .card { margin-bottom: 30px; background: rgba(255, 255, 255, 0.6); padding: 25px; border-radius: var(--border-radius-md); border: 1px solid var(--border-color); } -label { display: block; margin-bottom: 8px; font-weight: 600; color: var(--text-color); font-size: 0.9rem; } -select, input[type="text"], input[type="url"], input[type="file"], textarea { width: 100%; padding: 12px 15px; border: 1px solid var(--border-color); background-color: #f8f9fa; border-radius: var(--border-radius-sm); font-size: 1rem; font-family: 'Poppins', sans-serif; transition: all var(--transition-speed) ease; } -select:focus, input:focus, textarea:focus { outline: none; border-color: var(--primary-color); box-shadow: 0 0 0 3px var(--shadow-color); background-color: white; } +h1 { + font-size: 2.8rem; + font-weight: 700; + margin: 0 0 5px 0; + background: var(--gradiente-principal); + -webkit-background-clip: text; + -webkit-text-fill-color: transparent; + display: inline-block; +} -.btn, button { padding: 12px 25px; background: var(--gradiente-principal); color: white !important; border: none; border-radius: var(--border-radius-sm); font-size: 1rem; font-weight: 600; cursor: pointer; transition: all var(--transition-speed) ease; box-shadow: 0 4px 15px rgba(0, 0, 0, 0.1); text-decoration: none; display: inline-block; text-align: center; } -.btn:hover, button:hover { transform: translateY(-3px); box-shadow: 0 6px 20px rgba(0, 0, 0, 0.2); text-decoration: none; } -.btn-secondary { background: #34495e; } .btn-secondary:hover { background: #2c3e50; } -.btn-info { background: #17a2b8; } .btn-info:hover { background: #138496; } -.btn-danger { background: #dc3545; } .btn-danger:hover { background: #c82333; } -.btn-small { padding: 6px 14px; font-size: 0.9rem; } -a { color: var(--secondary-color); text-decoration: none; font-weight: 500; } a:hover { text-decoration: underline; } -.top-link { display: inline-block; margin-bottom: 25px; font-weight: 500; color: var(--primary-color); } -.top-link:hover { text-decoration: underline; } +h2 { + font-size: 1.8rem; + font-weight: 600; + color: var(--primary-color); + margin-bottom: 20px; +} -.noticias-list { list-style: none; padding: 0; margin: 0; } -.noticia-item { display: flex; gap: 20px; padding: 20px 10px; border-bottom: 
1px solid var(--border-color); transition: background-color 0.2s ease; } -.noticia-item:last-child { border-bottom: none; } -.noticia-item:hover { background-color: rgba(255,255,255,0.4); } -.noticia-imagen img { width: 150px; height: 100px; border-radius: var(--border-radius-sm); object-fit: cover; } -.noticia-texto h3 { margin: 0 0 5px 0; } -.noticia-texto h3 a { color: var(--text-color); font-weight: 600; } -.noticia-texto h3 a:hover { color: var(--primary-color); } -.noticia-meta { font-size: 0.8rem; color: var(--text-color-light); margin-bottom: 8px; } +.subtitle { + color: var(--text-color-light); + font-size: 1.1rem; + margin-top: 5px; +} -.flash-messages { list-style: none; padding: 0; margin-bottom: 20px; } -.flash-messages li { padding: 15px 20px; border-radius: var(--border-radius-sm); border-left: 5px solid; } -.flash-messages .error { background-color: #fff0f3; color: #d90429; border-color: var(--error-color); } -.flash-messages .success { background-color: #e6fcf5; color: #00b894; border-color: #00b894; } -.flash-messages .warning { background-color: #fffbeb; color: #f39c12; border-color: #f39c12; } +.form-section, +.card { + margin-bottom: 30px; + background: rgba(255, 255, 255, 0.6); + padding: 25px; + border-radius: var(--border-radius-md); + border: 1px solid var(--border-color); +} + +label { + display: block; + margin-bottom: 8px; + font-weight: 600; + color: var(--text-color); + font-size: 0.9rem; +} + +select, +input[type="text"], +input[type="url"], +input[type="file"], +textarea { + width: 100%; + padding: 12px 15px; + border: 1px solid var(--border-color); + background-color: #f8f9fa; + border-radius: var(--border-radius-sm); + font-size: 1rem; + font-family: 'Poppins', sans-serif; + transition: all var(--transition-speed) ease; +} + +select:focus, +input:focus, +textarea:focus { + outline: none; + border-color: var(--primary-color); + box-shadow: 0 0 0 3px var(--shadow-color); + background-color: white; +} + +.btn, +button { + padding: 12px 25px; + background: var(--gradiente-principal); + color: white !important; + border: none; + border-radius: var(--border-radius-sm); + font-size: 1rem; + font-weight: 600; + cursor: pointer; + transition: all var(--transition-speed) ease; + box-shadow: 0 4px 15px rgba(0, 0, 0, 0.1); + text-decoration: none; + display: inline-block; + text-align: center; +} + +.btn:hover, +button:hover { + transform: translateY(-3px); + box-shadow: 0 6px 20px rgba(0, 0, 0, 0.2); + text-decoration: none; +} + +.btn-secondary { background: #34495e; } +.btn-secondary:hover { background: #2c3e50; } + +.btn-info { background: #17a2b8; } +.btn-info:hover { background: #138496; } + +.btn-danger { background: #dc3545; } +.btn-danger:hover { background: #c82333; } + +.btn-small { + padding: 6px 14px; + font-size: 0.9rem; +} + +a { + color: var(--secondary-color); + text-decoration: none; + font-weight: 500; +} + +a:hover { + text-decoration: underline; +} + +.top-link { + display: inline-block; + margin-bottom: 25px; + font-weight: 500; + color: var(--primary-color); +} + +.top-link:hover { + text-decoration: underline; +} + +.noticias-list { + list-style: none; + padding: 0; + margin: 0; +} + +.noticia-item { + display: flex; + gap: 20px; + padding: 20px 10px; + border-bottom: 1px solid var(--border-color); + transition: background-color 0.2s ease; + align-items: flex-start; +} + +.noticia-item:last-child { + border-bottom: none; +} + +.noticia-item:hover { + background-color: rgba(255, 255, 255, 0.4); +} + +.noticia-imagen { + flex: 0 0 180px; + 
max-width: 180px; +} + +.noticia-imagen img { + width: 100%; + height: 120px; + border-radius: var(--border-radius-sm); + object-fit: cover; +} + +.noticia-texto h3 { + margin: 0 0 5px 0; +} + +.noticia-texto h3 a { + color: var(--text-color); + font-weight: 600; +} + +.noticia-texto h3 a:hover { + color: var(--primary-color); +} + +.noticia-meta { + font-size: 0.8rem; + color: var(--text-color-light); + margin-bottom: 8px; +} + +.flash-messages { + list-style: none; + padding: 0; + margin-bottom: 20px; +} + +.flash-messages li { + padding: 15px 20px; + border-radius: var(--border-radius-sm); + border-left: 5px solid; +} + +.flash-messages .error { + background-color: #fff0f3; + color: #d90429; + border-color: var(--error-color); +} + +.flash-messages .success { + background-color: #e6fcf5; + color: #00b894; + border-color: #00b894; +} + +.flash-messages .warning { + background-color: #fffbeb; + color: #f39c12; + border-color: #f39c12; +} + +.dashboard-grid { + display: grid; + grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); + gap: 20px; + margin-bottom: 40px; +} + +.stat-card { + background: rgba(255, 255, 255, 0.8); + padding: 20px; + border-radius: var(--border-radius-md); + text-align: center; + border: 1px solid var(--border-color); + transition: all 0.3s ease; +} + +.stat-card:hover { + transform: translateY(-5px); + box-shadow: 0 4px 15px rgba(0,0,0,0.08); +} + +.stat-card .stat-number { + font-size: 2.5rem; + font-weight: 600; + background: var(--gradiente-principal); + -webkit-background-clip: text; + -webkit-text-fill-color: transparent; + line-height: 1.2; +} + +.stat-card .stat-label { + font-size: 0.9rem; + color: var(--text-color-light); + font-weight: 500; + margin-top: 5px; +} + +.pagination { + display: flex; + justify-content: center; + align-items: center; + gap: 5px; + margin: 30px 0; + flex-wrap: wrap; +} + +.page-link { + display: inline-block; + padding: 8px 14px; + background: rgba(255, 255, 255, 0.6); + border: 1px solid var(--border-color); + border-radius: var(--border-radius-sm); + color: var(--primary-color); + text-decoration: none; + transition: all 0.2s ease; +} + +.page-link:hover { + background: white; + box-shadow: 0 2px 5px rgba(0,0,0,0.1); +} + +.page-link.active { + background: var(--gradiente-principal); + color: white; + border-color: transparent; + cursor: default; +} -.dashboard-grid { display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 20px; margin-bottom: 40px; } -.stat-card { background: rgba(255, 255, 255, 0.8); padding: 20px; border-radius: var(--border-radius-md); text-align: center; border: 1px solid var(--border-color); transition: all 0.3s ease; } -.stat-card:hover { transform: translateY(-5px); box-shadow: 0 4px 15px rgba(0,0,0,0.08); } -.stat-card .stat-number { font-size: 2.5rem; font-weight: 600; background: var(--gradiente-principal); -webkit-background-clip: text; -webkit-text-fill-color: transparent; line-height: 1.2; } -.stat-card .stat-label { font-size: 0.9rem; color: var(--text-color-light); font-weight: 500; margin-top: 5px; } -.pagination { display: flex; justify-content: center; align-items: center; gap: 5px; margin: 30px 0; flex-wrap: wrap; } -.page-link { display: inline-block; padding: 8px 14px; background: rgba(255, 255, 255, 0.6); border: 1px solid var(--border-color); border-radius: var(--border-radius-sm); color: var(--primary-color); text-decoration: none; transition: all 0.2s ease; } -.page-link:hover { background: white; box-shadow: 0 2px 5px rgba(0,0,0,0.1); } 
-.page-link.active { background: var(--gradiente-principal); color: white; border-color: transparent; cursor: default; } .feed-detail-card { padding: 0; } -.feed-header { display: flex; justify-content: space-between; align-items: center; flex-wrap: wrap; gap: 10px; background: rgba(233, 236, 239, 0.5); padding: 15px 25px; border-bottom: 1px solid var(--border-color); } -.feed-header h2 { margin: 0; font-size: 1.4rem; } + +.feed-header { + display: flex; + justify-content: space-between; + align-items: center; + flex-wrap: wrap; + gap: 10px; + background: rgba(233, 236, 239, 0.5); + padding: 15px 25px; + border-bottom: 1px solid var(--border-color); +} + +.feed-header h2 { + margin: 0; + font-size: 1.4rem; +} + .feed-body { padding: 25px; } -.feed-body dl { display: grid; grid-template-columns: 120px 1fr; gap: 10px 20px; } -.feed-body dt { font-weight: 600; color: var(--text-color-light); } -.feed-body dd { margin: 0; word-break: break-all; } + +.feed-body dl { + display: grid; + grid-template-columns: 120px 1fr; + gap: 10px 20px; +} + +.feed-body dt { + font-weight: 600; + color: var(--text-color-light); +} + +.feed-body dd { + margin: 0; + word-break: break-all; +} .main-nav { display: flex; @@ -98,6 +371,7 @@ a { color: var(--secondary-color); text-decoration: none; font-weight: 500; } a: flex-wrap: wrap; padding-top: 15px; } + .nav-link { font-weight: 500; color: var(--text-color); @@ -106,11 +380,13 @@ a { color: var(--secondary-color); text-decoration: none; font-weight: 500; } a: border-radius: var(--border-radius-sm); transition: all var(--transition-speed); } + .nav-link:hover { background-color: rgba(255,255,255,0.6); text-decoration: none; color: var(--primary-color); } + .nav-actions { display: flex; gap: 10px; @@ -118,15 +394,78 @@ a { color: var(--secondary-color); text-decoration: none; font-weight: 500; } a: } @media (max-width: 768px) { - .container { padding: 20px; margin: 15px; } + .container { + padding: 20px; + margin: 15px; + } + h1 { font-size: 2rem; } - .noticia-item { flex-direction: column; } - .feed-body dl { grid-template-columns: 100px 1fr; } - .main-nav { flex-direction: column; gap: 10px; } - .nav-actions { margin-left: 0; margin-top: 10px; } + + .noticia-item { + flex-direction: column; + } + + .noticia-imagen { + flex: 0 0 auto; + max-width: 100%; + } + + .noticia-imagen img { + width: 100%; + height: auto; + } + + .feed-body dl { + grid-template-columns: 100px 1fr; + } + + .main-nav { + flex-direction: column; + gap: 10px; + } + + .nav-actions { + margin-left: 0; + margin-top: 10px; + } +} + +.resumen-container { + position: relative; +} + +/* Neutralizar estilos raros que vienen dentro del HTML de los resúmenes */ +.resumen-container .btn, +.resumen-container button, +.resumen-container input[type="button"], +.resumen-container input[type="submit"], +.resumen-container .wp-block-button__link { + padding: 0; + margin: 0; + background: none; + border: none; + border-radius: 0; + box-shadow: none; + font: inherit; + display: inline; + color: var(--secondary-color); + text-decoration: underline; + cursor: pointer; +} + +.resumen-container .btn:hover, +.resumen-container button:hover, +.resumen-container .wp-block-button__link:hover { + transform: none; + box-shadow: none; + text-decoration: underline; +} + +.resumen-container a { + color: var(--secondary-color); + text-decoration: underline; } -.resumen-container { position: relative; } .ver-mas-btn { background: none; border: none; @@ -144,8 +483,14 @@ a { color: var(--secondary-color); text-decoration: 
none; font-weight: 500; } a: gap: 15px; margin-bottom: 20px; } + .filter-search-box { flex-grow: 1; } -.filter-actions { display: flex; gap: 10px; white-space: nowrap; } + +.filter-actions { + display: flex; + gap: 10px; + white-space: nowrap; +} .clamp { display: -webkit-box; @@ -154,44 +499,62 @@ a { color: var(--secondary-color); text-decoration: none; font-weight: 500; } a: overflow: hidden; word-break: break-word; } + .clamp.expanded { -webkit-line-clamp: unset; max-height: none; overflow: visible; } -/* Pestañas por noticia */ + .tabs { width: 100%; } -.tabs-header { display: flex; gap: 8px; margin-bottom: 8px; } + +.tabs-header { + display: flex; + gap: 8px; + margin-bottom: 8px; +} + .tab-btn { - background: rgba(255,255,255,0.7); - border: 1px solid var(--border-color); - border-radius: 999px; - padding: 6px 12px; - font-weight: 600; - cursor: pointer; + background: rgba(255,255,255,0.7); + border: 1px solid var(--border-color); + border-radius: 999px; + padding: 6px 12px; + font-weight: 600; + cursor: pointer; } + .tab-btn.active { - background: var(--gradiente-principal); - color: #fff !important; - border-color: transparent; + background: var(--gradiente-principal); + color: #fff !important; + border-color: transparent; } + .tab-btn[disabled] { - opacity: .45; - cursor: not-allowed; + opacity: .45; + cursor: not-allowed; } + .tab-panel { display: none; } + .tab-panel.active { display: block; } .badge { - display: inline-block; - margin-left: 8px; - font-size: .75rem; - padding: 2px 8px; - border-radius: 999px; - background: #e8f0ff; - color: var(--secondary-color); - vertical-align: middle; + display: inline-block; + margin-left: 8px; + font-size: .75rem; + padding: 2px 8px; + border-radius: 999px; + background: #e8f0ff; + color: var(--secondary-color); + vertical-align: middle; } -.badge-secondary { background: #f1f3f5; color: #555; } + +.badge-secondary { + background: #f1f3f5; + color: #555; +} + .mini-link { margin-left: 8px; font-size: .8rem; } + .m0 { margin: 0 0 6px 0; } + diff --git a/templates/_noticias_list.html b/templates/_noticias_list.html index 6a90885..16ae391 100644 --- a/templates/_noticias_list.html +++ b/templates/_noticias_list.html @@ -9,7 +9,12 @@ {% if n.imagen_url %}
       <div class="noticia-imagen">
-        <img src="{{ n.imagen_url }}" alt="Imagen para {{ n.titulo }}">
+        <img
+          src="{{ n.imagen_url }}"
+          alt="Imagen para {{ n.titulo }}"
+          loading="lazy"
+          onerror="this.style.display='none';"
+        >
       </div>
     {% endif %}
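Review note, not part of the patch: the companion change in `app.py` above swaps `plainto_tsquery` for `websearch_to_tsquery` and adds `ILIKE` fallbacks over the original and translated title/summary columns, so partial words and untranslated articles still match. A minimal sketch of that hybrid pattern, assuming the table and column names used in this repo; the `search_news` helper and DSN are illustrative:

```python
import psycopg2

def search_news(conn, q: str, limit: int = 20):
    """Full-text match first, substring fallback second.

    websearch_to_tsquery parses user-style syntax ("foo -bar", quoted
    phrases); the ILIKE arms catch prefixes and substrings that Spanish
    stemming would otherwise miss.
    """
    like = f"%{q}%"
    sql = """
        SELECT n.id, n.titulo
        FROM noticias n
        WHERE n.tsv @@ websearch_to_tsquery('spanish', %s)
           OR n.titulo ILIKE %s
           OR n.resumen ILIKE %s
        ORDER BY n.fecha DESC NULLS LAST
        LIMIT %s
    """
    with conn.cursor() as cur:
        cur.execute(sql, (q, like, like, limit))
        return cur.fetchall()

with psycopg2.connect(dbname="noticias") as conn:  # illustrative DSN
    print(search_news(conn, "elecciones"))
```

The trade-off the patch accepts: the `ILIKE` arms cannot use the GIN index behind `tsv`, so a query that misses the full-text branch degrades to a scan; `pg_trgm` indexes on `titulo` and `resumen` would be the usual follow-up if that ever hurts.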
diff --git a/templates/dashboard.html b/templates/dashboard.html
index e63ada5..689463d 100644
--- a/templates/dashboard.html
+++ b/templates/dashboard.html
@@ -47,12 +47,21 @@
   <div class="card">
     <h2>Operaciones del Sistema</h2>

-    <p>
-      Genera una copia de seguridad completa de todas tus fuentes y noticias
-      en un archivo .zip.
-    </p>
-
-    <a href="{{ url_for('backup_completo') }}" class="btn btn-info">
-      Backup Completo (.zip)
-    </a>
+    <p>
+      Genera o restaura una copia de seguridad completa de todas tus fuentes
+      y noticias.
+    </p>
+
+    <div class="nav-actions">
+      <a href="{{ url_for('backup_completo') }}" class="btn btn-info">
+        Backup Completo (.zip)
+      </a>
+
+      <a href="{{ url_for('restore_completo') }}" class="btn btn-secondary">
+        Restaurar Backup (.zip)
+      </a>
+    </div>
   </div>
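Review note, not part of the patch: the new "Restaurar Backup (.zip)" button targets the `/restore_completo` route added in `app.py`, which truncates `feeds` and `fuentes_url` and reloads them from the CSVs inside the uploaded zip. A condensed sketch of that round trip, assuming the zip layout written by `backup_completo` (`feeds.csv` and `fuentes_url.csv`, produced with `COPY ... TO STDOUT CSV HEADER`); the DSN, file name, and helper are illustrative:

```python
import io
import zipfile
import psycopg2

def restore_table(cur, zf: zipfile.ZipFile, member: str, table: str) -> None:
    """Reload one table from a CSV member of the backup zip."""
    if member not in zf.namelist():
        return
    # Table names are fixed constants here, never user input.
    cur.execute(f"TRUNCATE TABLE {table} RESTART IDENTITY;")
    with zf.open(member) as raw:
        cur.copy_expert(
            f"COPY {table} FROM STDIN CSV HEADER",
            io.TextIOWrapper(raw, encoding="utf-8"),
        )

with psycopg2.connect(dbname="noticias") as conn:  # illustrative DSN
    with conn.cursor() as cur, zipfile.ZipFile("backup.zip") as zf:
        restore_table(cur, zf, "feeds.csv", "feeds")
        restore_table(cur, zf, "fuentes_url.csv", "fuentes_url")
# Leaving the connection block commits both restores as one transaction.
```

Running both TRUNCATEs and COPYs inside a single transaction means a malformed CSV rolls the whole restore back instead of leaving half-empty tables, the same guarantee the Flask route gets from its `with conn:` block.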
diff --git a/templates/noticia.html b/templates/noticia.html
index c3e12ac..675a9f7 100644
--- a/templates/noticia.html
+++ b/templates/noticia.html
@@ -50,19 +50,28 @@
     {% if d.imagen_url %}
-      <img src="{{ d.imagen_url }}" alt="Imagen de la noticia">
+      <img
+        src="{{ d.imagen_url }}"
+        alt="Imagen de la noticia"
+        loading="lazy"
+        onerror="this.style.display='none';"
+      >
     {% endif %}

     {% if d.resumen_trad %}
       <h3>Resumen (traducido)</h3>
-      <div>{{ d.resumen_trad|safe_html }}</div>
+      <div class="resumen-container">
+        {{ d.resumen_trad|safe_html }}
+      </div>
     {% endif %}

     {% if d.resumen_orig %}
       <h3>Resumen (original)</h3>
-      <div>{{ d.resumen_orig|safe_html }}</div>
+      <div class="resumen-container">
+        {{ d.resumen_orig|safe_html }}
+      </div>
     {% endif %}

     {% if tags and tags|length %}
@@ -87,7 +96,12 @@
       {% if r.imagen_url %}
-        <img src="{{ r.imagen_url }}" alt="">
+        <img
+          src="{{ r.imagen_url }}"
+          alt=""
+          loading="lazy"
+          onerror="this.style.display='none';"
+        >
       {% endif %}
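Review note, not part of the patch: the "noticias relacionadas" block this template renders is precomputed by `related_worker.py`, which in this commit drops the per-pair Python cosine loop for one NumPy matrix-vector product over L2-normalized embeddings plus an `argpartition` top-K. A self-contained sketch of that ranking step on synthetic data; the `topk_related` helper is hypothetical:

```python
import numpy as np

def topk_related(mat: np.ndarray, idx: int, k: int) -> list[tuple[int, float]]:
    """Rows of `mat` are L2-normalized, so one dot product yields cosines."""
    sims = mat @ mat[idx]                      # cosine score against every row
    sims[idx] = -np.inf                        # never report the item itself
    k = min(k, len(sims) - 1)
    part = np.argpartition(-sims, k - 1)[:k]   # unordered top-k in O(n)
    top = part[np.argsort(-sims[part])]        # sort only those k entries
    return [(int(j), float(sims[j])) for j in top if np.isfinite(sims[j])]

rng = np.random.default_rng(0)
mat = rng.normal(size=(1000, 64)).astype(np.float32)
mat /= np.linalg.norm(mat, axis=1, keepdims=True)  # normalize once, as the worker does
print(topk_related(mat, idx=0, k=5))
```

Using `-inf` as the self-match sentinel and filtering non-finite scores on the way out covers the edge case where fewer than K valid neighbours exist; this mirrors the `> -999.0` guard in `_topk_numpy` above.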
diff --git a/templates/restore_completo.html b/templates/restore_completo.html
new file mode 100644
index 0000000..d91f8c7
--- /dev/null
+++ b/templates/restore_completo.html
@@ -0,0 +1,49 @@
+{% extends "base.html" %}
+
+{% block title %}Restaurar Backup Completo{% endblock %}
+
+{% block content %}
+<div class="card">
+  <h2>Restaurar Backup Completo</h2>
+
+  <p>
+    Sube un archivo .zip generado desde
+    "Backup Completo (.zip)" en el dashboard.
+  </p>
+
+  <p>
+    ⚠ Atención:<br>
+    • Se vaciarán las tablas feeds y fuentes_url.<br>
+    • Los datos de esos CSV se volverán a cargar desde el backup.<br>
+    • No se tocan noticias, traducciones ni tags.
+  </p>
+
+  <form method="post" enctype="multipart/form-data">
+    <label for="backup_file">Archivo de backup (.zip)</label>
+    <input type="file" id="backup_file" name="backup_file" accept=".zip" required>
+
+    <div class="filter-actions">
+      <button type="submit" class="btn btn-danger">Restaurar</button>
+      <a href="{{ url_for('dashboard') }}" class="btn btn-secondary">Cancelar</a>
+    </div>
+  </form>
+</div>
+{% endblock %} + diff --git a/translation_worker.py b/translation_worker.py index ab9a583..fb16ab7 100644 --- a/translation_worker.py +++ b/translation_worker.py @@ -1,4 +1,3 @@ -# translation_worker.py import os import time import logging @@ -8,17 +7,17 @@ from typing import List, Optional import psycopg2 import psycopg2.extras +from psycopg2.extras import execute_values import torch from transformers import AutoTokenizer, AutoModelForSeq2SeqLM from langdetect import detect, DetectorFactory -DetectorFactory.seed = 0 # resultados reproducibles +DetectorFactory.seed = 0 logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s: %(message)s") LOG = logging.getLogger(__name__) -# ---------- Config DB ---------- DB_CONFIG = { "host": os.environ.get("DB_HOST", "localhost"), "port": int(os.environ.get("DB_PORT", 5432)), @@ -27,7 +26,7 @@ DB_CONFIG = { "password": os.environ.get("DB_PASS", "x"), } -# ---------- Helpers ENV (con retrocompatibilidad) ---------- + def _env_list(name: str, *fallbacks: str, default: str = "es") -> List[str]: raw = None for key in (name, *fallbacks): @@ -37,6 +36,7 @@ def _env_list(name: str, *fallbacks: str, default: str = "es") -> List[str]: raw = raw if raw is not None else default return [s.strip() for s in raw.split(",") if s and s.strip()] + def _env_int(name: str, *fallbacks: str, default: int = 8) -> int: for key in (name, *fallbacks): val = os.environ.get(key) @@ -47,6 +47,7 @@ def _env_int(name: str, *fallbacks: str, default: int = 8) -> int: pass return default + def _env_float(name: str, *fallbacks: str, default: float = 5.0) -> float: for key in (name, *fallbacks): val = os.environ.get(key) @@ -57,6 +58,7 @@ def _env_float(name: str, *fallbacks: str, default: float = 5.0) -> float: pass return default + def _env_str(name: str, *fallbacks: str, default: Optional[str] = None) -> Optional[str]: for key in (name, *fallbacks): val = os.environ.get(key) @@ -64,68 +66,105 @@ def _env_str(name: str, *fallbacks: str, default: Optional[str] = None) -> Optio return val return default + def _env_bool(name: str, default: bool = False) -> bool: val = os.environ.get(name) if val is None: return default return str(val).strip().lower() in ("1", "true", "yes", "y", "on") -TARGET_LANGS = _env_list("TARGET_LANGS", "TRANSLATE_TO", default="es") -BATCH_SIZE = _env_int("BATCH", "TRANSLATOR_BATCH", "TRANSLATE_BATCH", default=8) -ENQUEUE_MAX = _env_int("ENQUEUE", "TRANSLATOR_ENQUEUE", "TRANSLATE_ENQUEUE", default=200) -SLEEP_IDLE = _env_float("SLEEP_IDLE", "TRANSLATOR_SLEEP_IDLE", "TRANSLATE_SLEEP_IDLE", default=5.0) -DEVICE_CFG = (_env_str("DEVICE", default="auto") or "auto").lower() # 'cpu' | 'cuda' | 'auto' -# Límites de tokens (ajusta si ves OOM) +TARGET_LANGS = _env_list("TARGET_LANGS", "TRANSLATE_TO", default="es") +BATCH_SIZE = _env_int("BATCH", "TRANSLATOR_BATCH", "TRANSLATE_BATCH", default=8) +ENQUEUE_MAX = _env_int("ENQUEUE", "TRANSLATOR_ENQUEUE", "TRANSLATE_ENQUEUE", default=200) +SLEEP_IDLE = _env_float("SLEEP_IDLE", "TRANSLATOR_SLEEP_IDLE", "TRANSLATE_SLEEP_IDLE", default=5.0) +DEVICE_CFG = (_env_str("DEVICE", default="auto") or "auto").lower() + MAX_SRC_TOKENS = _env_int("MAX_SRC_TOKENS", default=512) MAX_NEW_TOKENS = _env_int("MAX_NEW_TOKENS", default=256) -# ---- Beams: por defecto 2 para títulos y 1 para cuerpo; respeta NUM_BEAMS si sólo se define ese ---- + def _beams_from_env(): nb_global = os.environ.get("NUM_BEAMS") has_title = os.environ.get("NUM_BEAMS_TITLE") is not None - has_body = os.environ.get("NUM_BEAMS_BODY") is not None + has_body = 
os.environ.get("NUM_BEAMS_BODY") is not None if nb_global and not has_title and not has_body: try: v = max(1, int(nb_global)) return v, v except ValueError: pass - # por defecto: 2 (título), 1 (cuerpo) return _env_int("NUM_BEAMS_TITLE", default=2), _env_int("NUM_BEAMS_BODY", default=1) + NUM_BEAMS_TITLE, NUM_BEAMS_BODY = _beams_from_env() -# Modelo por defecto: NLLB 600M (cámbialo por facebook/nllb-200-1.3B si quieres el 1.3B) UNIVERSAL_MODEL = _env_str("UNIVERSAL_MODEL", default="facebook/nllb-200-distilled-600M") -# ---------- Chunking por frases (para artículos largos) ---------- -# Activo por defecto para evitar secuencias > límite del modelo -CHUNK_BY_SENTENCES = _env_bool("CHUNK_BY_SENTENCES", default=True) -CHUNK_MAX_TOKENS = _env_int("CHUNK_MAX_TOKENS", default=900) # <= modelo - margen -CHUNK_OVERLAP_SENTS = _env_int("CHUNK_OVERLAP_SENTS", default=0) # 0 o 1 +CHUNK_BY_SENTENCES = _env_bool("CHUNK_BY_SENTENCES", default=True) +CHUNK_MAX_TOKENS = _env_int("CHUNK_MAX_TOKENS", default=900) +CHUNK_OVERLAP_SENTS = _env_int("CHUNK_OVERLAP_SENTS", default=0) -# Abreviaturas comunes y marcador temporal _ABBR = ("Sr", "Sra", "Dr", "Dra", "Ing", "Lic", "pág", "etc") -_ABBR_MARK = "§" # no debería aparecer en texto normal +_ABBR_MARK = "§" + +_SENT_SPLIT_RE = re.compile( + r'(?<=[\.!\?…])\s+(?=["“\(\[A-ZÁÉÍÓÚÑÄÖÜ0-9])|(?:\n{2,})' +) + +NLLB_LANG = { + "es": "spa_Latn", + "en": "eng_Latn", + "fr": "fra_Latn", + "de": "deu_Latn", + "it": "ita_Latn", + "pt": "por_Latn", + "nl": "nld_Latn", + "sv": "swe_Latn", + "da": "dan_Latn", + "fi": "fin_Latn", + "no": "nob_Latn", + "nb": "nob_Latn", + "nn": "nno_Latn", + "pl": "pol_Latn", + "cs": "ces_Latn", + "sk": "slk_Latn", + "sl": "slv_Latn", + "hu": "hun_Latn", + "ro": "ron_Latn", + "bg": "bul_Cyrl", + "el": "ell_Grek", + "ru": "rus_Cyrl", + "uk": "ukr_Cyrl", + "hr": "hrv_Latn", + "sr": "srp_Cyrl", + "bs": "bos_Latn", + "tr": "tur_Latn", + "ar": "arb_Arab", + "fa": "pes_Arab", + "he": "heb_Hebr", + "zh": "zho_Hans", + "ja": "jpn_Jpan", + "ko": "kor_Hang", + "vi": "vie_Latn", + "th": "tha_Thai", + "id": "ind_Latn", + "ms": "zsm_Latn", + "pt-br": "por_Latn", + "pt-pt": "por_Latn", +} + def _protect_abbrev(text: str) -> str: - # Iniciales de una letra: "E.", "A." t = re.sub(r"\b([A-ZÁÉÍÓÚÑÄÖÜ])\.", r"\1" + _ABBR_MARK, text) - # Abreviaturas de la lista (case-insensitive) pat = r"\b(?:" + "|".join(map(re.escape, _ABBR)) + r")\." 
t = re.sub(pat, lambda m: m.group(0)[:-1] + _ABBR_MARK, t, flags=re.IGNORECASE) return t + def _restore_abbrev(text: str) -> str: return text.replace(_ABBR_MARK, ".") -# Regex de corte SIN look-behind variable: -# - Corta tras [.!?…] si hay espacios y luego comienza otra frase (letra mayúscula, comillas, paréntesis, dígito) -# - O cuando hay doble salto de línea -_SENT_SPLIT_RE = re.compile( - r'(?<=[\.!\?…])\s+(?=["“\(\[A-ZÁÉÍÓÚÑÄÖÜ0-9])|(?:\n{2,})' -) def split_into_sentences(text: str) -> List[str]: text = (text or "").strip() @@ -134,7 +173,6 @@ def split_into_sentences(text: str) -> List[str]: protected = _protect_abbrev(text) parts = [p.strip() for p in _SENT_SPLIT_RE.split(protected) if p and p.strip()] parts = [_restore_abbrev(p) for p in parts] - # Une piezas muy cortas con la anterior para más coherencia merged: List[str] = [] for p in parts: if merged and len(p) < 40: @@ -143,26 +181,6 @@ def split_into_sentences(text: str) -> List[str]: merged.append(p) return merged -# ---------- Mapeo idiomas a códigos NLLB ---------- -NLLB_LANG = { - # básicos - "es": "spa_Latn", "en": "eng_Latn", "fr": "fra_Latn", "de": "deu_Latn", "it": "ita_Latn", "pt": "por_Latn", - # nórdicos - "nl": "nld_Latn", "sv": "swe_Latn", "da": "dan_Latn", "fi": "fin_Latn", - # noruego - "no": "nob_Latn", "nb": "nob_Latn", "nn": "nno_Latn", - # CEE - "pl": "pol_Latn", "cs": "ces_Latn", "sk": "slk_Latn", "sl": "slv_Latn", - "hu": "hun_Latn", "ro": "ron_Latn", "bg": "bul_Cyrl", "el": "ell_Grek", - "ru": "rus_Cyrl", "uk": "ukr_Cyrl", "hr": "hrv_Latn", "sr": "srp_Cyrl", "bs": "bos_Latn", - # ME/Asia - "tr": "tur_Latn", "ar": "arb_Arab", "fa": "pes_Arab", "he": "heb_Hebr", - "zh": "zho_Hans", "ja": "jpn_Jpan", "ko": "kor_Hang", - # SEA - "vi": "vie_Latn", "th": "tha_Thai", "id": "ind_Latn", "ms": "zsm_Latn", - # variantes - "pt-br": "por_Latn", "pt-pt": "por_Latn", -} def map_to_nllb(code: Optional[str]) -> Optional[str]: if not code: @@ -172,29 +190,35 @@ def map_to_nllb(code: Optional[str]) -> Optional[str]: return NLLB_LANG[code] return f"{code}_Latn" + def normalize_lang(code: Optional[str], default: Optional[str] = None) -> Optional[str]: if not code: return default code = code.strip().lower() return code if code else default -# ---------- DB ---------- + def get_conn(): return psycopg2.connect(**DB_CONFIG) + def ensure_indexes(conn): with conn.cursor() as cur: - cur.execute(""" + cur.execute( + """ CREATE INDEX IF NOT EXISTS traducciones_lang_to_status_idx ON traducciones (lang_to, status); CREATE INDEX IF NOT EXISTS traducciones_status_idx ON traducciones (status); - """) + """ + ) conn.commit() + def ensure_pending(conn, lang_to: str, enqueue_limit: int): with conn.cursor() as cur: - cur.execute(""" + cur.execute( + """ INSERT INTO traducciones (noticia_id, lang_from, lang_to, status) SELECT sub.id, NULL, %s, 'pending' FROM ( @@ -206,12 +230,16 @@ def ensure_pending(conn, lang_to: str, enqueue_limit: int): ORDER BY n.fecha DESC NULLS LAST, n.id LIMIT %s ) AS sub; - """, (lang_to, lang_to, enqueue_limit)) + """, + (lang_to, lang_to, enqueue_limit), + ) conn.commit() + def fetch_pending_batch(conn, lang_to: str, batch_size: int): with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: - cur.execute(""" + cur.execute( + """ SELECT t.id AS tr_id, t.noticia_id, t.lang_from, t.lang_to, n.titulo, n.resumen FROM traducciones t @@ -219,7 +247,9 @@ def fetch_pending_batch(conn, lang_to: str, batch_size: int): WHERE t.lang_to = %s AND t.status = 'pending' ORDER BY t.id LIMIT %s; - """, (lang_to, batch_size)) 
+ """, + (lang_to, batch_size), + ) rows = cur.fetchall() if rows: ids = [r["tr_id"] for r in rows] @@ -228,21 +258,6 @@ def fetch_pending_batch(conn, lang_to: str, batch_size: int): conn.commit() return rows -def mark_done(conn, tr_id: int, title_tr: str, body_tr: str, lang_from: Optional[str]): - with conn.cursor() as cur: - cur.execute(""" - UPDATE traducciones - SET titulo_trad=%s, resumen_trad=%s, - lang_from = COALESCE(lang_from, %s), - status='done', error=NULL - WHERE id=%s; - """, (title_tr, body_tr, lang_from, tr_id)) - conn.commit() - -def mark_error(conn, tr_id: int, msg: str): - with conn.cursor() as cur: - cur.execute("UPDATE traducciones SET status='error', error=%s WHERE id=%s;", (msg[:1500], tr_id)) - conn.commit() def detect_lang(text1: str, text2: str) -> Optional[str]: txt = (text1 or "").strip() or (text2 or "").strip() @@ -253,13 +268,14 @@ def detect_lang(text1: str, text2: str) -> Optional[str]: except Exception: return None -# ---------- Modelo único y manejo de CUDA (NLLB) ---------- + _TOKENIZER: Optional[AutoTokenizer] = None _MODEL: Optional[AutoModelForSeq2SeqLM] = None _DEVICE: Optional[torch.device] = None _CUDA_FAILS: int = 0 _CUDA_DISABLED: bool = False + def _resolve_device() -> torch.device: global _CUDA_DISABLED if _CUDA_DISABLED: @@ -268,13 +284,14 @@ def _resolve_device() -> torch.device: return torch.device("cpu") if DEVICE_CFG == "cuda": return torch.device("cuda" if torch.cuda.is_available() else "cpu") - # auto return torch.device("cuda" if torch.cuda.is_available() else "cpu") + def _is_cuda_mem_error(exc: Exception) -> bool: s = str(exc) return ("CUDA out of memory" in s) or ("CUDACachingAllocator" in s) or ("expandable_segment" in s) + def _free_cuda(): if torch.cuda.is_available(): try: @@ -283,8 +300,8 @@ def _free_cuda(): except Exception: pass + def _load_model_on(device: torch.device): - """Carga (o recarga) el modelo/tokenizer en el dispositivo indicado.""" global _TOKENIZER, _MODEL, _DEVICE dtype = torch.float16 if device.type == "cuda" else torch.float32 @@ -293,9 +310,9 @@ def _load_model_on(device: torch.device): mdl = AutoModelForSeq2SeqLM.from_pretrained( UNIVERSAL_MODEL, torch_dtype=dtype, - low_cpu_mem_usage=True + low_cpu_mem_usage=True, ) - # use_cache=False reduce picos de VRAM en generación + try: mdl.config.use_cache = False except Exception: @@ -306,8 +323,8 @@ def _load_model_on(device: torch.device): _TOKENIZER, _MODEL, _DEVICE = tok, mdl, device + def get_universal_components(): - """Devuelve (tokenizer, model, device). 
@@ -306,8 +323,8 @@ def _load_model_on(device: torch.device):
 
     _TOKENIZER, _MODEL, _DEVICE = tok, mdl, device
 
+
 def get_universal_components():
-    """Returns (tokenizer, model, device). Loads on GPU when available and stable."""
     global _TOKENIZER, _MODEL, _DEVICE, _CUDA_FAILS, _CUDA_DISABLED
 
     if _MODEL is not None and _DEVICE is not None:
@@ -329,14 +346,13 @@ def get_universal_components():
         _load_model_on(torch.device("cpu"))
     return _TOKENIZER, _MODEL, _DEVICE
 
-# ---------- Tokenization / chunking helpers ----------
+
 def _safe_src_len(tokenizer) -> int:
     model_max = getattr(tokenizer, "model_max_length", 1024) or 1024
-    # headroom for special tokens/noise
     return min(MAX_SRC_TOKENS, int(model_max) - 16)
 
+
 def _token_chunks(tokenizer, text: str, max_tokens: int) -> List[str]:
-    """Simple token-based chunking (fallback)."""
     if not text:
         return []
     ids = tokenizer.encode(text, add_special_tokens=False)
@@ -344,22 +360,20 @@ def _token_chunks(tokenizer, text: str, max_tokens: int) -> List[str]:
         return [text]
     chunks = []
     for i in range(0, len(ids), max_tokens):
-        sub = ids[i:i+max_tokens]
+        sub = ids[i : i + max_tokens]
         piece = tokenizer.decode(sub, skip_special_tokens=True, clean_up_tokenization_spaces=True)
         if piece.strip():
             chunks.append(piece.strip())
     return chunks
 
+
 def _norm(s: str) -> str:
     import re as _re
+
     return _re.sub(r"\W+", "", (s or "").lower()).strip()
 
+
 def _forced_bos_id(tokenizer: AutoTokenizer, model: AutoModelForSeq2SeqLM, tgt_code: str) -> int:
-    """
-    Robustly resolves the target-language token id for NLLB,
-    working even when the tokenizer lacks `lang_code_to_id`.
-    """
-    # 1) tokenizer.lang_code_to_id (if present)
     try:
         mapping = getattr(tokenizer, "lang_code_to_id", None)
         if isinstance(mapping, dict):
@@ -369,7 +383,6 @@ def _forced_bos_id(tokenizer: AutoTokenizer, model: AutoModelForSeq2SeqLM, tgt_c
     except Exception:
         pass
 
-    # 2) model.config.lang_code_to_id (if present)
     try:
         mapping = getattr(getattr(model, "config", None), "lang_code_to_id", None)
         if isinstance(mapping, dict):
@@ -379,7 +392,6 @@ def _forced_bos_id(tokenizer: AutoTokenizer, model: AutoModelForSeq2SeqLM, tgt_c
     except Exception:
         pass
 
-    # 3) convert_tokens_to_ids (some builds register the code as a special token)
    try:
         tid = tokenizer.convert_tokens_to_ids(tgt_code)
         if isinstance(tid, int) and tid not in (-1, getattr(tokenizer, "unk_token_id", -1)):
@@ -387,7 +399,6 @@ def _forced_bos_id(tokenizer: AutoTokenizer, model: AutoModelForSeq2SeqLM, tgt_c
     except Exception:
         pass
 
-    # 4) additional_special_tokens/_ids (look up the code verbatim)
     try:
         ats = getattr(tokenizer, "additional_special_tokens", None)
         ats_ids = getattr(tokenizer, "additional_special_tokens_ids", None)
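For orientation before the final fallback below: NLLB is a many-to-many model, so the target language is selected by forcing its language token as the first generated token, which is exactly what the id resolved by _forced_bos_id is for. A hedged one-shot sketch (translate_one is hypothetical; translate_text below is the real path, with chunking and OOM recovery):

    import torch

    @torch.inference_mode()
    def translate_one(tok, mdl, device, text: str, src_code: str, tgt_code: str) -> str:
        tok.src_lang = src_code  # e.g. "eng_Latn"
        enc = tok(text, return_tensors="pt", truncation=True).to(device)
        out = mdl.generate(
            **enc,
            forced_bos_token_id=_forced_bos_id(tok, mdl, tgt_code),  # e.g. "spa_Latn"
            max_new_tokens=256,
            num_beams=1,
            do_sample=False,
        )
        return tok.batch_decode(out, skip_special_tokens=True)[0]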
- """ if not text or not text.strip(): return "" @@ -416,7 +422,6 @@ def translate_text(src_lang: str, tgt_lang: str, text: str, num_beams: int = 1, src_code = map_to_nllb(src_lang) or "eng_Latn" tgt_code = map_to_nllb(tgt_lang) or "spa_Latn" - # Configura idioma origen (si la prop existe) try: tok.src_lang = src_code except Exception: @@ -439,7 +444,7 @@ def translate_text(src_lang: str, tgt_lang: str, text: str, num_beams: int = 1, max_new_tokens=MAX_NEW_TOKENS, num_beams=max(1, int(num_beams)), do_sample=False, - use_cache=False, # ↓ memoria + use_cache=False, ) if int(num_beams) > 1: gen_kwargs["early_stopping"] = True @@ -459,7 +464,6 @@ def translate_text(src_lang: str, tgt_lang: str, text: str, num_beams: int = 1, except Exception as e: if device.type == "cuda" and _is_cuda_mem_error(e) and _tries < 2: LOG.warning("CUDA OOM/allocator: intento de recuperación %d. Detalle: %s", _tries + 1, e) - # desactiva CUDA y relanza en CPU global _MODEL, _DEVICE, _CUDA_DISABLED _CUDA_DISABLED = True try: @@ -474,10 +478,11 @@ def translate_text(src_lang: str, tgt_lang: str, text: str, num_beams: int = 1, return translate_text(src_lang, tgt_lang, text, num_beams=num_beams, _tries=_tries + 1) raise -# ---------- Chunking por frases para artículos ---------- + def _sent_token_len(tokenizer, sent: str) -> int: return len(tokenizer(sent, add_special_tokens=False).input_ids) + def _pack_sentences_to_token_chunks( tokenizer, sentences: List[str], max_tokens: int, overlap_sents: int = 0 ) -> List[List[str]]: @@ -487,11 +492,10 @@ def _pack_sentences_to_token_chunks( for s in sentences: slen = _sent_token_len(tokenizer, s) if slen > max_tokens: - # Si una sola frase excede el límite, córtala por tokens como último recurso ids = tokenizer(s, add_special_tokens=False).input_ids step = max_tokens for i in range(0, len(ids), step): - sub = tokenizer.decode(ids[i:i+step], skip_special_tokens=True) + sub = tokenizer.decode(ids[i : i + step], skip_special_tokens=True) if cur: chunks.append(cur) cur = [] @@ -500,7 +504,8 @@ def _pack_sentences_to_token_chunks( continue if cur_tokens + slen <= max_tokens: - cur.append(s); cur_tokens += slen + cur.append(s) + cur_tokens += slen else: if cur: chunks.append(cur) @@ -509,13 +514,14 @@ def _pack_sentences_to_token_chunks( cur = overlap + [s] cur_tokens = sum(_sent_token_len(tokenizer, x) for x in cur) else: - cur = [s]; cur_tokens = slen + cur = [s] + cur_tokens = slen if cur: chunks.append(cur) return chunks + def _smart_concatenate(parts: List[str], tail_window: int = 120) -> str: - """Une partes evitando duplicados obvios en el borde (heurística ligera).""" if not parts: return "" out = parts[0] @@ -529,24 +535,17 @@ def _smart_concatenate(parts: List[str], tail_window: int = 120) -> str: out += ("" if cut == 0 else nxt[cut:]) if nxt else "" return out + def translate_article_full( src_lang: str, tgt_lang: str, text: str, num_beams: int, ) -> str: - """ - Traduce un artículo completo: - - Divide por frases (sin look-behind variable) - - Empaqueta en chunks <= límite de tokens - - Traduce chunk a chunk (usa translate_text internamente) - - Une con heurística para evitar duplicados en bordes - """ if not text or not text.strip(): return "" if not CHUNK_BY_SENTENCES: - # Ruta rápida: una sola pasada con truncamiento interno return translate_text(src_lang, tgt_lang, text, num_beams=num_beams) tok, _, _ = get_universal_components() @@ -569,46 +568,83 @@ def translate_article_full( return _smart_concatenate([p for p in translated_parts if p]) -# ---------- Procesamiento 
@@ -569,46 +568,83 @@ def translate_article_full(
     return _smart_concatenate([p for p in translated_parts if p])
 
-# ---------- Batch processing ----------
+
 def process_batch(conn, rows):
+    done_rows = []
+    error_rows = []
+
     for r in rows:
         tr_id = r["tr_id"]
         lang_to = normalize_lang(r["lang_to"], "es") or "es"
         lang_from = normalize_lang(r["lang_from"]) or detect_lang(r["titulo"] or "", r["resumen"] or "") or "en"
 
         title = (r["titulo"] or "").strip()
-        body  = (r["resumen"] or "").strip()
+        body = (r["resumen"] or "").strip()
 
-        # Already in the target language: copy as-is
         if (map_to_nllb(lang_from) or "eng_Latn") == (map_to_nllb(lang_to) or "spa_Latn"):
-            mark_done(conn, tr_id, title, body, lang_from)
+            done_rows.append((title, body, lang_from, tr_id))
             continue
 
         try:
-            # Titles: short, direct translation (raise beams if desired)
             title_tr = translate_text(lang_from, lang_to, title, num_beams=NUM_BEAMS_TITLE) if title else ""
-            # Body/summary: full article with sentence chunking
-            body_tr  = translate_article_full(lang_from, lang_to, body, num_beams=NUM_BEAMS_BODY) if body else ""
+            body_tr = translate_article_full(lang_from, lang_to, body, num_beams=NUM_BEAMS_BODY) if body else ""
 
-            # If the "translation" equals the original, leave it empty
             if _norm(title_tr) == _norm(title):
                 title_tr = ""
             if _norm(body_tr) == _norm(body):
                 body_tr = ""
 
-            mark_done(conn, tr_id, title_tr, body_tr, lang_from)
+            done_rows.append((title_tr, body_tr, lang_from, tr_id))
         except Exception as e:
             LOG.exception("Error translating row")
-            mark_error(conn, tr_id, str(e))
+            error_rows.append((str(e)[:1500], tr_id))
+
+    with conn.cursor() as cur:
+        if done_rows:
+            execute_values(
+                cur,
+                """
+                UPDATE traducciones AS t
+                SET titulo_trad = v.titulo_trad,
+                    resumen_trad = v.resumen_trad,
+                    lang_from = COALESCE(t.lang_from, v.lang_from),
+                    status = 'done',
+                    error = NULL
+                FROM (VALUES %s) AS v(titulo_trad, resumen_trad, lang_from, id)
+                WHERE t.id = v.id;
+                """,
+                done_rows,
+            )
+
+        if error_rows:
+            execute_values(
+                cur,
+                """
+                UPDATE traducciones AS t
+                SET status = 'error',
+                    error = v.error
+                FROM (VALUES %s) AS v(error, id)
+                WHERE t.id = v.id;
+                """,
+                error_rows,
+            )
+    conn.commit()
 
+
 def main():
     LOG.info(
         "Starting translation worker (NLLB). TARGET_LANGS=%s, BATCH=%s, ENQUEUE=%s, DEVICE=%s, "
         "BEAMS(title/body)=%s/%s, CHUNK_BY_SENTENCES=%s, CHUNK_MAX_TOKENS=%s, OVERLAP_SENTS=%s",
-        TARGET_LANGS, BATCH_SIZE, ENQUEUE_MAX, DEVICE_CFG, NUM_BEAMS_TITLE, NUM_BEAMS_BODY,
-        CHUNK_BY_SENTENCES, CHUNK_MAX_TOKENS, CHUNK_OVERLAP_SENTS
+        TARGET_LANGS,
+        BATCH_SIZE,
+        ENQUEUE_MAX,
+        DEVICE_CFG,
+        NUM_BEAMS_TITLE,
+        NUM_BEAMS_BODY,
+        CHUNK_BY_SENTENCES,
+        CHUNK_MAX_TOKENS,
+        CHUNK_OVERLAP_SENTS,
    )
 
-    # Preload the model once to reserve memory cleanly
     get_universal_components()
 
     while True:
@@ -628,6 +664,7 @@ def main():
         if not any_work:
             time.sleep(SLEEP_IDLE)
 
+
 if __name__ == "__main__":
     os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
     main()
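A closing note on the batched writes that replace mark_done/mark_error: psycopg2's execute_values expands the single VALUES %s placeholder with every tuple in the batch, so each poll cycle issues one UPDATE per status instead of one round trip per row, and one commit instead of N. The same pattern in isolation, with throwaway data and a hypothetical DSN:

    import psycopg2
    from psycopg2.extras import execute_values

    done_rows = [("Title ES", "Body ES", "en", 101), ("", "", "de", 102)]

    with psycopg2.connect("dbname=noticias") as conn:  # hypothetical DSN
        with conn.cursor() as cur:
            execute_values(
                cur,
                """
                UPDATE traducciones AS t
                SET titulo_trad = v.titulo_trad,
                    resumen_trad = v.resumen_trad,
                    status = 'done',
                    error = NULL
                FROM (VALUES %s) AS v(titulo_trad, resumen_trad, lang_from, id)
                WHERE t.id = v.id;
                """,
                done_rows,
            )

One caveat worth knowing: if a column could be NULL in every tuple of a batch, Postgres may need an explicit cast inside the VALUES clause (for example v.lang_from::text) to infer its type.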