UI and search fixes

jlimolina 2025-11-21 04:42:02 +01:00
parent cb8f69fb93
commit fc06566928
15 changed files with 1115 additions and 435 deletions

app.py

@ -3,6 +3,7 @@ import csv
import io
import time
import socket
import zipfile
from datetime import datetime, date
from concurrent.futures import ThreadPoolExecutor, as_completed
@ -337,21 +338,48 @@ def home():
params.append(int(continente_id))
if q:
where.append("n.tsv @@ plainto_tsquery('spanish', %s)")
params.append(q)
search_like = f"%{q}%"
if use_tr:
where.append(
"""
(
n.tsv @@ websearch_to_tsquery('spanish', %s)
OR t.titulo_trad ILIKE %s
OR t.resumen_trad ILIKE %s
OR n.titulo ILIKE %s
OR n.resumen ILIKE %s
)
"""
)
params.extend([q, search_like, search_like, search_like, search_like])
else:
where.append(
"""
(
n.tsv @@ websearch_to_tsquery('spanish', %s)
OR n.titulo ILIKE %s
OR n.resumen ILIKE %s
)
"""
)
params.extend([q, search_like, search_like])
where_sql = " AND ".join(where)
with conn.cursor(cursor_factory=extras.DictCursor) as cur:
cur.execute(
f"""
SELECT COUNT(*)
SELECT COUNT(DISTINCT n.id)
FROM noticias n
LEFT JOIN categorias c ON c.id = n.categoria_id
LEFT JOIN paises p ON p.id = n.pais_id
LEFT JOIN traducciones t
ON t.noticia_id = n.id
AND t.lang_to = %s
AND t.status = 'done'
WHERE {where_sql}
""",
params,
[lang] + params,
)
total_results = cur.fetchone()[0] if cur.rowcount else 0
total_pages = (total_results // per_page) + (1 if total_results % per_page else 0)
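For reference, a minimal sketch of the search-clause pattern this hunk introduces: full-text search via websearch_to_tsquery('spanish', ...) combined with ILIKE fallbacks over the original and, when available, translated fields. The count switches to COUNT(DISTINCT n.id) because the LEFT JOIN on traducciones could otherwise multiply rows. The helper name and standalone shape below are illustrative, not part of app.py.

def build_search_clause(q: str, use_translations: bool) -> tuple[str, list]:
    """Combine Spanish full-text search with ILIKE fallbacks; returns (sql_fragment, params)."""
    like = f"%{q}%"
    if use_translations:
        sql = (
            "(n.tsv @@ websearch_to_tsquery('spanish', %s)"
            " OR t.titulo_trad ILIKE %s OR t.resumen_trad ILIKE %s"
            " OR n.titulo ILIKE %s OR n.resumen ILIKE %s)"
        )
        return sql, [q, like, like, like, like]
    sql = (
        "(n.tsv @@ websearch_to_tsquery('spanish', %s)"
        " OR n.titulo ILIKE %s OR n.resumen ILIKE %s)"
    )
    return sql, [q, like, like]

# usage mirrors the route: where.append(sql_fragment); params.extend(fragment_params)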
@ -925,7 +953,7 @@ def add_url_source():
"""
INSERT INTO fuentes_url (nombre, url, categoria_id, pais_id, idioma)
VALUES (%s, %s, %s, %s, %s)
ON CONFLICT (url_norm) DO UPDATE
ON CONFLICT (url) DO UPDATE
SET nombre = EXCLUDED.nombre,
categoria_id = EXCLUDED.categoria_id,
pais_id = EXCLUDED.pais_id,
@ -1172,8 +1200,6 @@ def scrape_url():
@app.route("/backup_completo")
def backup_completo():
import zipfile
mem_file = io.BytesIO()
with zipfile.ZipFile(mem_file, mode="w", compression=zipfile.ZIP_DEFLATED) as zf:
with get_conn() as conn, conn.cursor(cursor_factory=extras.DictCursor) as cur:
@ -1207,6 +1233,64 @@ def backup_completo():
)
@app.route("/restore_completo", methods=["GET", "POST"])
def restore_completo():
if request.method == "GET":
return render_template("restore_completo.html")
file = request.files.get("backup_file")
if not file or file.filename == "":
flash("No se ha seleccionado ningún archivo.", "error")
return redirect(url_for("restore_completo"))
filename = file.filename.lower()
if not filename.endswith(".zip"):
flash("El archivo debe ser un .zip.", "error")
return redirect(url_for("restore_completo"))
raw = file.read()
try:
zf = zipfile.ZipFile(io.BytesIO(raw))
except zipfile.BadZipFile:
flash("El archivo no es un .zip válido.", "error")
return redirect(url_for("restore_completo"))
restored_counts = {}
conn = get_conn()
try:
with conn:
with conn.cursor() as cur:
if "feeds.csv" in zf.namelist():
cur.execute("TRUNCATE TABLE feeds RESTART IDENTITY;")
with zf.open("feeds.csv") as f:
text_f = io.TextIOWrapper(f, encoding="utf-8")
cur.copy_expert("COPY feeds FROM STDIN CSV HEADER", text_f)
restored_counts["feeds"] = cur.rowcount if cur.rowcount is not None else 0
if "fuentes_url.csv" in zf.namelist():
cur.execute("TRUNCATE TABLE fuentes_url RESTART IDENTITY;")
with zf.open("fuentes_url.csv") as f2:
text_f2 = io.TextIOWrapper(f2, encoding="utf-8")
cur.copy_expert("COPY fuentes_url FROM STDIN CSV HEADER", text_f2)
restored_counts["fuentes_url"] = cur.rowcount if cur.rowcount is not None else 0
except Exception as e:
conn.rollback()
conn.close()
flash(f"Error al restaurar el backup: {e}", "error")
return redirect(url_for("restore_completo"))
conn.close()
if restored_counts:
partes = [f"{tabla}: {n} filas" for tabla, n in restored_counts.items()]
flash("Restauración completada: " + ", ".join(partes), "success")
else:
flash("Backup procesado pero no se encontraron ficheros reconocidos (feeds.csv, fuentes_url.csv).", "warning")
return redirect(url_for("dashboard"))
if __name__ == "__main__":
app.run(host="0.0.0.0", port=8001, debug=True)
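The restore route above reduces to the same two steps per CSV member: TRUNCATE the target table, then stream the member into COPY ... FROM STDIN. A self-contained sketch of that core step (the helper name and connection handling are illustrative):

import io
import zipfile

def restore_table_from_zip(conn, zf: zipfile.ZipFile, member: str, table: str) -> None:
    """Truncate `table` and reload it from one CSV member of the backup zip."""
    # `table` must come from a fixed whitelist (as in the route above), never from user input
    with conn.cursor() as cur:
        cur.execute(f"TRUNCATE TABLE {table} RESTART IDENTITY;")
        with zf.open(member) as raw:
            cur.copy_expert(
                f"COPY {table} FROM STDIN CSV HEADER",
                io.TextIOWrapper(raw, encoding="utf-8"),
            )

# usage sketch:
#   with zipfile.ZipFile("backup_completo.zip") as zf, get_conn() as conn:
#       restore_table_from_zip(conn, zf, "feeds.csv", "feeds")
#       conn.commit()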


@ -141,7 +141,7 @@ services:
- DB_NAME=${DB_NAME}
- DB_USER=${DB_USER}
- DB_PASS=${DB_PASS}
- EMB_MODEL=sentence-transformers/all-MiniLM-L6-v2
- EMB_MODEL=sentence-transformers/paraphrase-multilingual-mpnet-base-v2
- EMB_BATCH=256
- EMB_SLEEP_IDLE=5
- EMB_LANGS=es
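Note that switching EMB_MODEL from all-MiniLM-L6-v2 (384-dimensional) to paraphrase-multilingual-mpnet-base-v2 (768-dimensional) changes the vector size, so embeddings computed with the old model are not comparable with the new ones; the view and workers below filter by model name for that reason. A quick dimension check (sketch; requires downloading the model):

from sentence_transformers import SentenceTransformer

model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-mpnet-base-v2")
vecs = model.encode(["texto de prueba"])
print(vecs.shape)  # (1, 768); all-MiniLM-L6-v2 would give (1, 384)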


@ -1,16 +1,12 @@
# embeddings_worker.py
# Worker de embeddings para TRADUCCIONES:
# - Lee traducciones con status='done' y sin embedding para un modelo concreto
# - Calcula embedding (Sentence-Transformers) sobre titulo_trad + resumen_trad
# - Guarda en traduccion_embeddings (traduccion_id, model, dim, embedding)
import os
import time
import logging
from typing import List
import numpy as np
import psycopg2
import psycopg2.extras
from psycopg2.extras import execute_values
from sentence_transformers import SentenceTransformer
import torch
@ -30,7 +26,7 @@ DB = dict(
# Modelo por defecto: multilingüe, bueno para muchas lenguas
EMB_MODEL = os.environ.get(
"EMB_MODEL",
"sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
"sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
)
EMB_BATCH = int(os.environ.get("EMB_BATCH", "128"))
SLEEP_IDLE = float(os.environ.get("EMB_SLEEP_IDLE", "5.0"))
@ -65,8 +61,13 @@ def ensure_schema(conn):
);
"""
)
cur.execute("CREATE INDEX IF NOT EXISTS idx_tr_emb_model ON traduccion_embeddings(model);")
cur.execute("CREATE INDEX IF NOT EXISTS idx_tr_emb_trid ON traduccion_embeddings(traduccion_id);")
# Alineado con init-db/08-embeddings.sql
cur.execute(
"CREATE INDEX IF NOT EXISTS idx_tr_emb_model ON traduccion_embeddings(model);"
)
cur.execute(
"CREATE INDEX IF NOT EXISTS idx_tr_emb_traduccion_id ON traduccion_embeddings(traduccion_id);"
)
conn.commit()
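To confirm the worker-side index names stay aligned with init-db/08-embeddings.sql, a quick check against pg_indexes (sketch; reuses the DB settings defined above):

import psycopg2

with psycopg2.connect(**DB) as conn, conn.cursor() as cur:
    cur.execute(
        """
        SELECT indexname
        FROM pg_indexes
        WHERE tablename = 'traduccion_embeddings'
        ORDER BY indexname;
        """
    )
    print([r[0] for r in cur.fetchall()])
    # idx_tr_emb_model and idx_tr_emb_traduccion_id should both appear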
@ -104,7 +105,7 @@ def texts_from_rows(rows: List[psycopg2.extras.DictRow]) -> List[str]:
Compone el texto a vectorizar por cada traducción:
'titulo_trad' + '\n' + 'resumen_trad'. Si alguno falta, usa lo disponible.
"""
texts = []
texts: List[str] = []
for r in rows:
title = (r["titulo_trad"] or "").strip()
body = (r["resumen_trad"] or "").strip()
@ -117,23 +118,36 @@ def texts_from_rows(rows: List[psycopg2.extras.DictRow]) -> List[str]:
def upsert_embeddings(conn, rows, embs: np.ndarray, model_name: str):
"""
Inserta/actualiza embeddings por traducción.
Inserta/actualiza embeddings por traducción en lote (batch insert).
"""
if embs.size == 0 or not rows:
return
dim = int(embs.shape[1])
# Preparamos los datos para execute_values
data = [
(
int(r["traduccion_id"]),
model_name,
dim,
[float(x) for x in e],
)
for r, e in zip(rows, embs)
]
with conn.cursor() as cur:
for r, e in zip(rows, embs):
cur.execute(
execute_values(
cur,
"""
INSERT INTO traduccion_embeddings (traduccion_id, model, dim, embedding)
VALUES (%s, %s, %s, %s)
VALUES %s
ON CONFLICT (traduccion_id, model) DO UPDATE
SET embedding = EXCLUDED.embedding,
dim = EXCLUDED.dim,
created_at = NOW()
""",
(int(r["traduccion_id"]), model_name, dim, list(map(float, e))),
data,
)
conn.commit()
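One detail worth keeping in the batched upsert above: psycopg2 does not know how to adapt numpy.float32 values (they are not a subclass of Python float), so each embedding component is cast to a plain float before the rows are handed to execute_values. A tiny illustration (values made up):

import numpy as np

emb = np.random.rand(4).astype(np.float32)
row = (1, "some-model", emb.shape[0], [float(x) for x in emb])
# an equivalent form: emb.astype(float).tolist()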


@ -29,7 +29,7 @@ SELECT
te.dim,
te.embedding AS vec
FROM traduccion_embeddings te
WHERE te.model = 'sentence-transformers/all-MiniLM-L6-v2';
WHERE te.model = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2';
CREATE TABLE IF NOT EXISTS related_noticias (
traduccion_id INT NOT NULL REFERENCES traducciones(id) ON DELETE CASCADE,


@ -1,62 +0,0 @@
-- Función para “normalizar” URLs: minúsculas en esquema/host, quitar www,
-- quitar fragmentos (#...), limpiar trackers comunes (utm_*, gclid, fbclid, ref, etc.),
-- colapsar dobles “/”, y quitar la “/” final salvo si es la raíz.
CREATE OR REPLACE FUNCTION normalize_url(in_url text)
RETURNS text
LANGUAGE plpgsql
AS $$
DECLARE
u text := trim(in_url);
scheme_host text;
path_q text;
BEGIN
IF u IS NULL OR u = '' THEN
RETURN NULL;
END IF;
-- quitar espacios y fragmentos
u := regexp_replace(u, '#.*$', '', 'i');
-- separar esquema+host de path+query
-- ej: https://example.com:443/foo?bar -> scheme_host=https://example.com:443 ; path_q=/foo?bar
scheme_host := substring(u FROM '^[a-z]+://[^/]*');
IF scheme_host IS NULL THEN
-- si no hay esquema, asumimos http
u := 'http://' || u;
scheme_host := substring(u FROM '^[a-z]+://[^/]*');
END IF;
path_q := substring(u FROM '^[a-z]+://[^/]*(/.*)$');
IF path_q IS NULL THEN
path_q := '/';
END IF;
-- normalizar esquema y host (minúsculas, quitar www.)
scheme_host := lower(scheme_host);
scheme_host := regexp_replace(scheme_host, '^(https?://)www\.', '\1', 'i');
-- quitar puerto por defecto (:80 en http, :443 en https)
scheme_host := regexp_replace(scheme_host, '^http://([^/:]+):80$', 'http://\1', 'i');
scheme_host := regexp_replace(scheme_host, '^https://([^/:]+):443$', 'https://\1', 'i');
-- limpiar parámetros de tracking en la query
-- elimina ?utm_... &utm_... gclid fbclid mc_cid mc_eid ref ref_src etc.
path_q := regexp_replace(path_q, '([?&])(utm_[^=&]+|gclid|fbclid|mc_cid|mc_eid|ref|ref_src|yclid|igshid)=[^&#]*', '\1', 'gi');
-- limpiar conectores sobrantes ?, &, &&, ?&, etc.
path_q := regexp_replace(path_q, '\?&+', '?', 'g');
path_q := regexp_replace(path_q, '&{2,}', '&', 'g');
path_q := regexp_replace(path_q, '\?$', '', 'g');
path_q := regexp_replace(path_q, '\?$','', 'g');
-- colapsar dobles barras en path (no tocar “://”)
path_q := regexp_replace(path_q, '/{2,}', '/', 'g');
-- quitar “/” final si no es la raíz
IF path_q <> '/' THEN
path_q := regexp_replace(path_q, '/+$', '', 'g');
END IF;
RETURN scheme_host || path_q;
END;
$$;
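The dropped normalize_url() logic maps roughly onto Python's urllib.parse as below (a sketch mirroring the SQL rules above, kept here only for reference; it is not part of this commit):

import re
from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit

TRACKING_KEYS = {"gclid", "fbclid", "mc_cid", "mc_eid", "ref", "ref_src", "yclid", "igshid"}

def normalize_url(url: str) -> str | None:
    url = (url or "").strip()
    if not url:
        return None
    if "://" not in url:
        url = "http://" + url                      # assume http when no scheme is given
    parts = urlsplit(url)
    host = (parts.hostname or "").removeprefix("www.")
    if parts.port and (parts.scheme, parts.port) not in (("http", 80), ("https", 443)):
        host = f"{host}:{parts.port}"              # keep only non-default ports
    query = urlencode(
        [(k, v) for k, v in parse_qsl(parts.query, keep_blank_values=True)
         if k not in TRACKING_KEYS and not k.startswith("utm_")]
    )
    path = re.sub(r"/{2,}", "/", parts.path) or "/"
    if path != "/":
        path = path.rstrip("/")
    # fragment is dropped by passing "" as the last component
    return urlunsplit((parts.scheme.lower(), host, path, query, ""))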


@ -1,38 +0,0 @@
-- Añadir columna generada url_norm y crear índice único sobre ella.
-- OJO: si ya existen duplicados, este índice fallará.
-- Primero crea la columna si no existe:
DO $$
BEGIN
IF NOT EXISTS (
SELECT 1 FROM information_schema.columns
WHERE table_name='feeds' AND column_name='url_norm'
) THEN
ALTER TABLE feeds
ADD COLUMN url_norm text GENERATED ALWAYS AS (normalize_url(url)) STORED;
END IF;
END $$;
-- Índice único (concurrently para no bloquear). Requiere estar fuera de transacción.
-- Si tu herramienta corre todo en una transacción, ejecuta estas dos líneas aparte.
-- Quita duplicados antes si da error.
CREATE UNIQUE INDEX CONCURRENTLY IF NOT EXISTS feeds_url_norm_uniq ON feeds (url_norm)
WHERE url_norm IS NOT NULL;
-- (Opcional) repetir lo mismo para fuentes_url y noticias si quieres esa garantía también:
DO $$
BEGIN
IF NOT EXISTS (
SELECT 1 FROM information_schema.columns
WHERE table_name='fuentes_url' AND column_name='url_norm'
) THEN
ALTER TABLE fuentes_url
ADD COLUMN url_norm text GENERATED ALWAYS AS (normalize_url(url)) STORED;
END IF;
END $$;
CREATE UNIQUE INDEX CONCURRENTLY IF NOT EXISTS fuentes_url_norm_uniq ON fuentes_url (url_norm)
WHERE url_norm IS NOT NULL;


@ -2,6 +2,10 @@ import os
import time
import logging
import re
import string
from typing import List, Tuple
from collections import Counter
import psycopg2
import psycopg2.extras
import spacy
@ -28,30 +32,120 @@ ENT_LABELS = {
}
_ws_re = re.compile(r"\s+")
HTML_TRASH_PATTERNS = [
r"<[^>]+>",
r"&[a-z]+;",
r"&#\d+;?", # entidades numéricas tipo &#8230;
r'width="\d+"',
r'height="\d+"',
]
# Palabras/sintagmas demasiado genéricos o claramente ruido
GENERIC_BAD_TAGS = {
"república",
"estado",
"centro",
"gobierno",
"el gobierno",
"gobiernos",
"report",
"sp",
"unión",
"union",
"dólares",
"dolar",
"dólar",
"the post",
"post",
"artículo",
"el artículo",
"la ciudad",
"mundo",
"país",
"pais",
"países",
"paises",
"la noche",
"la publicación",
"este miércoles",
"el miércoles",
"hoy",
"ayer",
"mañana",
"servicio",
"servicios",
"el presidente",
"presidente",
"el ministro",
"ministro",
"la guerra",
"guerra",
"seguridad",
"wp-content",
"internal_photos",
"/internal_photos",
"https",
"http",
"src",
}
STOPWORDS = set()
def clean_tag_text(text):
ARTICLES = {
"el",
"la",
"los",
"las",
"un",
"una",
"uno",
"al",
"del",
}
TOPIC_MIN_CHARS = 4
TOPIC_MAX_WORDS = 6
TOPIC_MAX_PER_DOC = 15
def _looks_like_attr_or_path(text_lower: str) -> bool:
"""Detecta cosas tipo rutas, atributos HTML, ids raros, etc."""
if text_lower.startswith("/"):
return True
if "http://" in text_lower or "https://" in text_lower:
return True
if ".com" in text_lower or ".net" in text_lower or ".org" in text_lower:
return True
if any(ext in text_lower for ext in (".jpg", ".jpeg", ".png", ".gif", ".webp")):
return True
if re.search(r"\b(src|alt|style|class)\s*=", text_lower):
return True
if "data-" in text_lower:
return True
if re.search(r"&#\d+;?", text_lower):
return True
# cosas tipo atributo=valor
if "=" in text_lower and " " not in text_lower.strip():
return True
# cadenas largas sin espacios (ids, hashes…)
if re.fullmatch(r"[a-z0-9_]{15,}", text_lower.replace("-", "").replace(" ", "")):
return True
# palabra única con guión suele ser ruta/slug: wp-content, internal-photos…
if "-" in text_lower and " " not in text_lower:
return True
return False
def clean_tag_text(text: str) -> str | None:
"""Limpieza para entidades (PERSON/ORG/GPE/LOC)."""
if not text:
return None
text = BeautifulSoup(text, "html.parser").get_text()
for pat in HTML_TRASH_PATTERNS:
text = re.sub(pat, "", text)
text = _ws_re.sub(" ", text).strip()
text = text.strip(string.punctuation + " ")
if len(text) < 3:
return None
if re.search(r"[<>/\\]", text):
@ -59,13 +153,15 @@ def clean_tag_text(text):
lower = text.lower()
if lower.startswith("href="):
return None
if lower.startswith("http"):
if _looks_like_attr_or_path(lower):
return None
if lower in GENERIC_BAD_TAGS:
return None
replacements = {
"ee.uu.": "Estados Unidos",
"los estados unidos": "Estados Unidos",
"eeuu": "Estados Unidos",
"eu": "Unión Europea",
"ue": "Unión Europea",
"kosova": "Kosovo",
@ -75,13 +171,112 @@ def clean_tag_text(text):
return text
def clean_topic_text(text: str) -> str | None:
"""Limpieza para posibles 'temas' (noun_chunks)."""
if not text:
return None
text = BeautifulSoup(text, "html.parser").get_text()
for pat in HTML_TRASH_PATTERNS:
text = re.sub(pat, "", text)
text = _ws_re.sub(" ", text).strip()
text = text.strip(string.punctuation + " ")
if len(text) < TOPIC_MIN_CHARS:
return None
lower = text.lower()
if _looks_like_attr_or_path(lower):
return None
# tokenizamos en minúsculas y quitamos puntuación
tokens = [
t.strip(string.punctuation)
for t in lower.split()
if t.strip(string.punctuation)
]
if not tokens:
return None
# quitamos artículo inicial si lo hay
if tokens and tokens[0] in ARTICLES:
tokens = tokens[1:]
if not tokens:
return None
# reconstruimos texto normalizado sin artículo
norm = " ".join(tokens).strip()
if len(norm) < TOPIC_MIN_CHARS:
return None
if norm in GENERIC_BAD_TAGS:
return None
# límite máximo de palabras
if len(tokens) > TOPIC_MAX_WORDS:
return None
# todos stopwords => fuera
if all(t in STOPWORDS for t in tokens):
return None
# sólo números/fechas
if re.fullmatch(r"[0-9\s\.,\-:/]+", norm):
return None
return norm
def get_conn():
return psycopg2.connect(**DB)
def extract_entities_and_topics(nlp, text: str) -> Tuple[List[Tuple[str, str]], List[Tuple[str, str]]]:
ents: List[Tuple[str, str]] = []
topics: List[Tuple[str, str]] = []
if not text:
return ents, topics
doc = nlp(text)
# Entidades "clásicas"
for ent in doc.ents:
tipo = ENT_LABELS.get(ent.label_)
if not tipo:
continue
val = clean_tag_text(ent.text)
if not val:
continue
ents.append((val, tipo))
# Candidatos a "tema" a partir de noun_chunks
topic_counter: Counter[str] = Counter()
for chunk in doc.noun_chunks:
val = clean_topic_text(chunk.text)
if not val:
continue
topic_counter[val] += 1
ent_values = {v for (v, _) in ents}
for val, _count in topic_counter.most_common(TOPIC_MAX_PER_DOC):
if val in ent_values:
continue
topics.append((val, "tema"))
# quitamos duplicados
ents = list(set(ents))
topics = list(set(topics))
return ents, topics
def main():
nlp = spacy.load("es_core_news_md", disable=["parser", "lemmatizer", "textcat"])
logging.info("spaCy cargado: es_core_news_md")
global STOPWORDS
nlp = spacy.load("es_core_news_md", disable=["lemmatizer", "textcat"])
STOPWORDS = set(nlp.Defaults.stop_words)
logging.info("spaCy cargado: es_core_news_md (NER + parser)")
while True:
try:
@ -109,7 +304,7 @@ def main():
time.sleep(5)
continue
logging.info(f"Procesando {len(rows)} traducciones para NER...")
logging.info(f"Procesando {len(rows)} traducciones para NER/temas...")
new_links = 0
@ -118,22 +313,12 @@ def main():
if not text:
continue
doc = nlp(text)
ents = []
for ent in doc.ents:
tipo = ENT_LABELS.get(ent.label_)
if not tipo:
continue
val = clean_tag_text(ent.text)
if not val:
continue
ents.append((val, tipo))
if not ents:
ents, topics = extract_entities_and_topics(nlp, text)
all_tags = ents + topics
if not all_tags:
continue
for valor, tipo in set(ents):
for valor, tipo in all_tags:
try:
cur.execute(
"""
@ -160,9 +345,9 @@ def main():
logging.exception("Fallo insertando tag/relación")
conn.commit()
logging.info(f"NER lote OK. Nuevos enlaces: {new_links}.")
except Exception as e:
logging.exception(f"Error en NER loop: {e}")
logging.info(f"NER/temas lote OK. Nuevos enlaces: {new_links}.")
except Exception:
logging.exception("Error en NER loop")
time.sleep(5)
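A quick spot-check of the topic-cleaning rules introduced above (the worker module name is an assumption; expected values follow from the rules: leading articles are stripped, generic and path-like candidates are rejected):

from ner_worker import clean_topic_text  # module name assumed

print(clean_topic_text("La política energética europea"))   # -> "política energética europea"
print(clean_topic_text("el gobierno"))                       # -> None ("gobierno" is a generic tag)
print(clean_topic_text("/internal_photos/2024/img.jpg"))     # -> None (looks like a path)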


@ -1,9 +1,9 @@
import os
import time
import math
import logging
from typing import List, Tuple
import numpy as np
import psycopg2
import psycopg2.extras
@ -22,7 +22,6 @@ DB = dict(
TOPK = int(os.environ.get("RELATED_TOPK", 10))
BATCH_IDS = int(os.environ.get("RELATED_BATCH_IDS", 200))
BATCH_SIM = int(os.environ.get("RELATED_BATCH_SIM", 2000))
SLEEP_IDLE = float(os.environ.get("RELATED_SLEEP", 10))
MIN_SCORE = float(os.environ.get("RELATED_MIN_SCORE", 0.0))
WINDOW_HOURS = int(os.environ.get("RELATED_WINDOW_H", 0))
@ -32,44 +31,64 @@ def get_conn():
return psycopg2.connect(**DB)
# ---------------------------------------------------------
# Cargar embeddings SOLO de traducciones en español (lang_to='es')
# ---------------------------------------------------------
def _fetch_all_embeddings(cur):
if WINDOW_HOURS > 0:
cur.execute(
"""
base_sql = """
SELECT e.traduccion_id, e.vec
FROM embeddings e
JOIN traducciones t ON t.id = e.traduccion_id
JOIN noticias n ON n.id = t.noticia_id
WHERE n.fecha >= NOW() - INTERVAL %s
""",
(f"{WINDOW_HOURS} hours",),
)
else:
cur.execute("SELECT traduccion_id, vec FROM embeddings")
WHERE t.lang_to = 'es'
"""
params = []
if WINDOW_HOURS > 0:
base_sql += " AND n.fecha >= NOW() - INTERVAL %s"
params.append(f"{WINDOW_HOURS} hours")
cur.execute(base_sql, params)
rows = cur.fetchall()
if not rows:
return [], [], []
return [], None
ids = []
vecs = []
norms = []
for tr_id, v in rows:
for tid, v in rows:
if v is None:
v = []
nrm = math.sqrt(sum(((x or 0.0) * (x or 0.0)) for x in v)) or 1e-8
ids.append(tr_id)
continue
ids.append(tid)
vecs.append(v)
norms.append(nrm)
return ids, vecs, norms
if not ids:
return [], None
# Convertimos a matriz numpy
mat = np.array(vecs, dtype=np.float32)
# Normalizamos (evita división por 0)
norms = np.linalg.norm(mat, axis=1, keepdims=True)
norms[norms == 0] = 1e-8
mat = mat / norms
return ids, mat
# ---------------------------------------------------------
# Obtiene IDs pendientes
# ---------------------------------------------------------
def _fetch_pending_ids(cur, limit) -> List[int]:
cur.execute(
"""
SELECT e.traduccion_id
FROM embeddings e
LEFT JOIN related_noticias r ON r.traduccion_id = e.traduccion_id
JOIN traducciones t ON t.id = e.traduccion_id
LEFT JOIN related_noticias r
ON r.traduccion_id = e.traduccion_id
WHERE t.lang_to = 'es'
GROUP BY e.traduccion_id
HAVING COUNT(r.related_traduccion_id) = 0
ORDER BY e.traduccion_id DESC
@ -80,42 +99,44 @@ def _fetch_pending_ids(cur, limit) -> List[int]:
return [r[0] for r in cur.fetchall()]
def _cosine_with_norms(a, b, na, nb):
num = 0.0
for x, y in zip(a, b):
xv = x or 0.0
yv = y or 0.0
num += xv * yv
denom = na * nb
if denom <= 0.0:
return 0.0
return num / denom
def _topk_for_one(
# ---------------------------------------------------------
# TOP-K usando NumPy (súper rápido)
# ---------------------------------------------------------
def _topk_numpy(
idx: int,
ids_all: List[int],
vecs_all: List[List[float]],
norms_all: List[float],
pool_indices: List[int],
K: int,
mat: np.ndarray,
K: int
) -> List[Tuple[int, float]]:
me_vec = vecs_all[idx]
me_norm = norms_all[idx]
out: List[Tuple[int, float]] = []
for j in pool_indices:
if j == idx:
continue
s = _cosine_with_norms(me_vec, vecs_all[j], me_norm, norms_all[j])
out.append((ids_all[j], s))
# vector de la noticia central
q = mat[idx] # (dim,)
out.sort(key=lambda t: t[1], reverse=True)
if MIN_SCORE > 0.0:
out = [p for p in out if p[1] >= MIN_SCORE]
return out[:K]
# similitudes coseno: dot product (matriz · vector)
sims = np.dot(mat, q)
# eliminar self-match
sims[idx] = -999.0
# filtramos por score mínimo
if MIN_SCORE > 0:
mask = sims >= MIN_SCORE
sims = np.where(mask, sims, -999.0)
# obtenemos los índices top-k (mucho más rápido que ordenar todo)
if K >= len(sims):
top_idx = np.argsort(-sims)
else:
part = np.argpartition(-sims, K)[:K]
top_idx = part[np.argsort(-sims[part])]
out = [(ids_all[j], float(sims[j])) for j in top_idx[:K]]
return out
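The core of _topk_numpy as a self-contained toy example (ids, vectors and K are made up): rows are L2-normalized once, so cosine similarity is just a matrix-vector product, and argpartition picks the top-K without sorting the whole array.

import numpy as np

ids = [101, 102, 103, 104]
mat = np.array([[1.0, 0.0], [0.9, 0.1], [0.0, 1.0], [0.7, 0.7]], dtype=np.float32)
mat /= np.linalg.norm(mat, axis=1, keepdims=True)   # rows are now unit-length

idx, K = 0, 2
sims = mat @ mat[idx]          # cosine similarity == dot product of unit vectors
sims[idx] = -999.0             # drop the self-match
part = np.argpartition(-sims, K)[:K]       # unordered top-K, O(n)
top = part[np.argsort(-sims[part])]        # order just those K
print([(ids[j], float(sims[j])) for j in top])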
# ---------------------------------------------------------
# Inserta en la tabla related_noticias
# ---------------------------------------------------------
def _insert_related(cur, tr_id: int, pairs: List[Tuple[int, float]]):
if not pairs:
return
@ -127,48 +148,47 @@ def _insert_related(cur, tr_id: int, pairs: List[Tuple[int, float]]):
ON CONFLICT (traduccion_id, related_traduccion_id)
DO UPDATE SET score = EXCLUDED.score
""",
[(tr_id, rid, float(score)) for (rid, score) in pairs],
[(tr_id, rid, score) for (rid, score) in pairs],
)
# ---------------------------------------------------------
# Procesar IDs objetivo
# ---------------------------------------------------------
def build_for_ids(conn, target_ids: List[int]) -> int:
with conn.cursor() as cur:
ids_all, vecs_all, norms_all = _fetch_all_embeddings(cur)
if not ids_all:
ids_all, mat = _fetch_all_embeddings(cur)
if not ids_all or mat is None:
return 0
# Mapa ID → index
pos = {tid: i for i, tid in enumerate(ids_all)}
n = len(ids_all)
processed = 0
with conn.cursor() as cur:
for tr_id in target_ids:
if tr_id not in pos:
continue
i = pos[tr_id]
top: List[Tuple[int, float]] = []
for start in range(0, n, BATCH_SIM):
block = list(range(start, min(start + BATCH_SIM, n)))
candidates = _topk_for_one(i, ids_all, vecs_all, norms_all, block, TOPK)
top += candidates
top.sort(key=lambda t: t[1], reverse=True)
if len(top) > TOPK:
top = top[:TOPK]
_insert_related(cur, tr_id, top)
idx = pos[tr_id]
pairs = _topk_numpy(idx, ids_all, mat, TOPK)
_insert_related(cur, tr_id, pairs)
processed += 1
conn.commit()
return processed
# ---------------------------------------------------------
# MAIN
# ---------------------------------------------------------
def main():
logging.info(
"Iniciando related_worker (TOPK=%s, BATCH_IDS=%s, BATCH_SIM=%s, MIN_SCORE=%.3f, WINDOW_H=%s)",
"Iniciando related_worker (TOPK=%s, BATCH_IDS=%s, MIN_SCORE=%.3f, WINDOW_H=%s)",
TOPK,
BATCH_IDS,
BATCH_SIM,
MIN_SCORE,
WINDOW_HOURS,
)


@ -36,7 +36,7 @@ if __name__ == '__main__':
# --- CORRECCIÓN 2: Se cambió 'fetch_and_store' por 'fetch_and_store_all' ---
fetch_and_store_all,
"interval",
minutes=3,
minutes=10,
id="rss_job",
next_run_time=datetime.utcnow() + timedelta(seconds=10)
)


@ -14,6 +14,7 @@
}
* { box-sizing: border-box; }
body {
font-family: 'Poppins', 'Segoe UI', Tahoma, sans-serif;
margin: 0;
@ -24,6 +25,12 @@ body {
font-weight: 400;
}
img {
max-width: 100%;
height: auto;
display: block;
}
.container {
max-width: 900px;
margin: 30px auto;
@ -36,58 +43,324 @@ body {
-webkit-backdrop-filter: blur(12px);
}
header { text-align: center; margin-bottom: 40px; border-bottom: 1px solid var(--border-color); padding-bottom: 30px; }
h1 { font-size: 2.8rem; font-weight: 700; margin: 0 0 5px 0; background: var(--gradiente-principal); -webkit-background-clip: text; -webkit-text-fill-color: transparent; display: inline-block; }
h2 { font-size: 1.8rem; font-weight: 600; color: var(--primary-color); margin-bottom: 20px; }
.subtitle { color: var(--text-color-light); font-size: 1.1rem; margin-top: 5px; }
header {
text-align: center;
margin-bottom: 40px;
border-bottom: 1px solid var(--border-color);
padding-bottom: 30px;
}
.form-section, .card { margin-bottom: 30px; background: rgba(255, 255, 255, 0.6); padding: 25px; border-radius: var(--border-radius-md); border: 1px solid var(--border-color); }
label { display: block; margin-bottom: 8px; font-weight: 600; color: var(--text-color); font-size: 0.9rem; }
select, input[type="text"], input[type="url"], input[type="file"], textarea { width: 100%; padding: 12px 15px; border: 1px solid var(--border-color); background-color: #f8f9fa; border-radius: var(--border-radius-sm); font-size: 1rem; font-family: 'Poppins', sans-serif; transition: all var(--transition-speed) ease; }
select:focus, input:focus, textarea:focus { outline: none; border-color: var(--primary-color); box-shadow: 0 0 0 3px var(--shadow-color); background-color: white; }
h1 {
font-size: 2.8rem;
font-weight: 700;
margin: 0 0 5px 0;
background: var(--gradiente-principal);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
display: inline-block;
}
.btn, button { padding: 12px 25px; background: var(--gradiente-principal); color: white !important; border: none; border-radius: var(--border-radius-sm); font-size: 1rem; font-weight: 600; cursor: pointer; transition: all var(--transition-speed) ease; box-shadow: 0 4px 15px rgba(0, 0, 0, 0.1); text-decoration: none; display: inline-block; text-align: center; }
.btn:hover, button:hover { transform: translateY(-3px); box-shadow: 0 6px 20px rgba(0, 0, 0, 0.2); text-decoration: none; }
.btn-secondary { background: #34495e; } .btn-secondary:hover { background: #2c3e50; }
.btn-info { background: #17a2b8; } .btn-info:hover { background: #138496; }
.btn-danger { background: #dc3545; } .btn-danger:hover { background: #c82333; }
.btn-small { padding: 6px 14px; font-size: 0.9rem; }
a { color: var(--secondary-color); text-decoration: none; font-weight: 500; } a:hover { text-decoration: underline; }
.top-link { display: inline-block; margin-bottom: 25px; font-weight: 500; color: var(--primary-color); }
.top-link:hover { text-decoration: underline; }
h2 {
font-size: 1.8rem;
font-weight: 600;
color: var(--primary-color);
margin-bottom: 20px;
}
.noticias-list { list-style: none; padding: 0; margin: 0; }
.noticia-item { display: flex; gap: 20px; padding: 20px 10px; border-bottom: 1px solid var(--border-color); transition: background-color 0.2s ease; }
.noticia-item:last-child { border-bottom: none; }
.noticia-item:hover { background-color: rgba(255,255,255,0.4); }
.noticia-imagen img { width: 150px; height: 100px; border-radius: var(--border-radius-sm); object-fit: cover; }
.noticia-texto h3 { margin: 0 0 5px 0; }
.noticia-texto h3 a { color: var(--text-color); font-weight: 600; }
.noticia-texto h3 a:hover { color: var(--primary-color); }
.noticia-meta { font-size: 0.8rem; color: var(--text-color-light); margin-bottom: 8px; }
.subtitle {
color: var(--text-color-light);
font-size: 1.1rem;
margin-top: 5px;
}
.flash-messages { list-style: none; padding: 0; margin-bottom: 20px; }
.flash-messages li { padding: 15px 20px; border-radius: var(--border-radius-sm); border-left: 5px solid; }
.flash-messages .error { background-color: #fff0f3; color: #d90429; border-color: var(--error-color); }
.flash-messages .success { background-color: #e6fcf5; color: #00b894; border-color: #00b894; }
.flash-messages .warning { background-color: #fffbeb; color: #f39c12; border-color: #f39c12; }
.form-section,
.card {
margin-bottom: 30px;
background: rgba(255, 255, 255, 0.6);
padding: 25px;
border-radius: var(--border-radius-md);
border: 1px solid var(--border-color);
}
label {
display: block;
margin-bottom: 8px;
font-weight: 600;
color: var(--text-color);
font-size: 0.9rem;
}
select,
input[type="text"],
input[type="url"],
input[type="file"],
textarea {
width: 100%;
padding: 12px 15px;
border: 1px solid var(--border-color);
background-color: #f8f9fa;
border-radius: var(--border-radius-sm);
font-size: 1rem;
font-family: 'Poppins', sans-serif;
transition: all var(--transition-speed) ease;
}
select:focus,
input:focus,
textarea:focus {
outline: none;
border-color: var(--primary-color);
box-shadow: 0 0 0 3px var(--shadow-color);
background-color: white;
}
.btn,
button {
padding: 12px 25px;
background: var(--gradiente-principal);
color: white !important;
border: none;
border-radius: var(--border-radius-sm);
font-size: 1rem;
font-weight: 600;
cursor: pointer;
transition: all var(--transition-speed) ease;
box-shadow: 0 4px 15px rgba(0, 0, 0, 0.1);
text-decoration: none;
display: inline-block;
text-align: center;
}
.btn:hover,
button:hover {
transform: translateY(-3px);
box-shadow: 0 6px 20px rgba(0, 0, 0, 0.2);
text-decoration: none;
}
.btn-secondary { background: #34495e; }
.btn-secondary:hover { background: #2c3e50; }
.btn-info { background: #17a2b8; }
.btn-info:hover { background: #138496; }
.btn-danger { background: #dc3545; }
.btn-danger:hover { background: #c82333; }
.btn-small {
padding: 6px 14px;
font-size: 0.9rem;
}
a {
color: var(--secondary-color);
text-decoration: none;
font-weight: 500;
}
a:hover {
text-decoration: underline;
}
.top-link {
display: inline-block;
margin-bottom: 25px;
font-weight: 500;
color: var(--primary-color);
}
.top-link:hover {
text-decoration: underline;
}
.noticias-list {
list-style: none;
padding: 0;
margin: 0;
}
.noticia-item {
display: flex;
gap: 20px;
padding: 20px 10px;
border-bottom: 1px solid var(--border-color);
transition: background-color 0.2s ease;
align-items: flex-start;
}
.noticia-item:last-child {
border-bottom: none;
}
.noticia-item:hover {
background-color: rgba(255, 255, 255, 0.4);
}
.noticia-imagen {
flex: 0 0 180px;
max-width: 180px;
}
.noticia-imagen img {
width: 100%;
height: 120px;
border-radius: var(--border-radius-sm);
object-fit: cover;
}
.noticia-texto h3 {
margin: 0 0 5px 0;
}
.noticia-texto h3 a {
color: var(--text-color);
font-weight: 600;
}
.noticia-texto h3 a:hover {
color: var(--primary-color);
}
.noticia-meta {
font-size: 0.8rem;
color: var(--text-color-light);
margin-bottom: 8px;
}
.flash-messages {
list-style: none;
padding: 0;
margin-bottom: 20px;
}
.flash-messages li {
padding: 15px 20px;
border-radius: var(--border-radius-sm);
border-left: 5px solid;
}
.flash-messages .error {
background-color: #fff0f3;
color: #d90429;
border-color: var(--error-color);
}
.flash-messages .success {
background-color: #e6fcf5;
color: #00b894;
border-color: #00b894;
}
.flash-messages .warning {
background-color: #fffbeb;
color: #f39c12;
border-color: #f39c12;
}
.dashboard-grid {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
gap: 20px;
margin-bottom: 40px;
}
.stat-card {
background: rgba(255, 255, 255, 0.8);
padding: 20px;
border-radius: var(--border-radius-md);
text-align: center;
border: 1px solid var(--border-color);
transition: all 0.3s ease;
}
.stat-card:hover {
transform: translateY(-5px);
box-shadow: 0 4px 15px rgba(0,0,0,0.08);
}
.stat-card .stat-number {
font-size: 2.5rem;
font-weight: 600;
background: var(--gradiente-principal);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
line-height: 1.2;
}
.stat-card .stat-label {
font-size: 0.9rem;
color: var(--text-color-light);
font-weight: 500;
margin-top: 5px;
}
.pagination {
display: flex;
justify-content: center;
align-items: center;
gap: 5px;
margin: 30px 0;
flex-wrap: wrap;
}
.page-link {
display: inline-block;
padding: 8px 14px;
background: rgba(255, 255, 255, 0.6);
border: 1px solid var(--border-color);
border-radius: var(--border-radius-sm);
color: var(--primary-color);
text-decoration: none;
transition: all 0.2s ease;
}
.page-link:hover {
background: white;
box-shadow: 0 2px 5px rgba(0,0,0,0.1);
}
.page-link.active {
background: var(--gradiente-principal);
color: white;
border-color: transparent;
cursor: default;
}
.dashboard-grid { display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 20px; margin-bottom: 40px; }
.stat-card { background: rgba(255, 255, 255, 0.8); padding: 20px; border-radius: var(--border-radius-md); text-align: center; border: 1px solid var(--border-color); transition: all 0.3s ease; }
.stat-card:hover { transform: translateY(-5px); box-shadow: 0 4px 15px rgba(0,0,0,0.08); }
.stat-card .stat-number { font-size: 2.5rem; font-weight: 600; background: var(--gradiente-principal); -webkit-background-clip: text; -webkit-text-fill-color: transparent; line-height: 1.2; }
.stat-card .stat-label { font-size: 0.9rem; color: var(--text-color-light); font-weight: 500; margin-top: 5px; }
.pagination { display: flex; justify-content: center; align-items: center; gap: 5px; margin: 30px 0; flex-wrap: wrap; }
.page-link { display: inline-block; padding: 8px 14px; background: rgba(255, 255, 255, 0.6); border: 1px solid var(--border-color); border-radius: var(--border-radius-sm); color: var(--primary-color); text-decoration: none; transition: all 0.2s ease; }
.page-link:hover { background: white; box-shadow: 0 2px 5px rgba(0,0,0,0.1); }
.page-link.active { background: var(--gradiente-principal); color: white; border-color: transparent; cursor: default; }
.feed-detail-card { padding: 0; }
.feed-header { display: flex; justify-content: space-between; align-items: center; flex-wrap: wrap; gap: 10px; background: rgba(233, 236, 239, 0.5); padding: 15px 25px; border-bottom: 1px solid var(--border-color); }
.feed-header h2 { margin: 0; font-size: 1.4rem; }
.feed-header {
display: flex;
justify-content: space-between;
align-items: center;
flex-wrap: wrap;
gap: 10px;
background: rgba(233, 236, 239, 0.5);
padding: 15px 25px;
border-bottom: 1px solid var(--border-color);
}
.feed-header h2 {
margin: 0;
font-size: 1.4rem;
}
.feed-body { padding: 25px; }
.feed-body dl { display: grid; grid-template-columns: 120px 1fr; gap: 10px 20px; }
.feed-body dt { font-weight: 600; color: var(--text-color-light); }
.feed-body dd { margin: 0; word-break: break-all; }
.feed-body dl {
display: grid;
grid-template-columns: 120px 1fr;
gap: 10px 20px;
}
.feed-body dt {
font-weight: 600;
color: var(--text-color-light);
}
.feed-body dd {
margin: 0;
word-break: break-all;
}
.main-nav {
display: flex;
@ -98,6 +371,7 @@ a { color: var(--secondary-color); text-decoration: none; font-weight: 500; } a:
flex-wrap: wrap;
padding-top: 15px;
}
.nav-link {
font-weight: 500;
color: var(--text-color);
@ -106,11 +380,13 @@ a { color: var(--secondary-color); text-decoration: none; font-weight: 500; } a:
border-radius: var(--border-radius-sm);
transition: all var(--transition-speed);
}
.nav-link:hover {
background-color: rgba(255,255,255,0.6);
text-decoration: none;
color: var(--primary-color);
}
.nav-actions {
display: flex;
gap: 10px;
@ -118,15 +394,78 @@ a { color: var(--secondary-color); text-decoration: none; font-weight: 500; } a:
}
@media (max-width: 768px) {
.container { padding: 20px; margin: 15px; }
.container {
padding: 20px;
margin: 15px;
}
h1 { font-size: 2rem; }
.noticia-item { flex-direction: column; }
.feed-body dl { grid-template-columns: 100px 1fr; }
.main-nav { flex-direction: column; gap: 10px; }
.nav-actions { margin-left: 0; margin-top: 10px; }
.noticia-item {
flex-direction: column;
}
.noticia-imagen {
flex: 0 0 auto;
max-width: 100%;
}
.noticia-imagen img {
width: 100%;
height: auto;
}
.feed-body dl {
grid-template-columns: 100px 1fr;
}
.main-nav {
flex-direction: column;
gap: 10px;
}
.nav-actions {
margin-left: 0;
margin-top: 10px;
}
}
.resumen-container {
position: relative;
}
/* Neutralizar estilos raros que vienen dentro del HTML de los resúmenes */
.resumen-container .btn,
.resumen-container button,
.resumen-container input[type="button"],
.resumen-container input[type="submit"],
.resumen-container .wp-block-button__link {
padding: 0;
margin: 0;
background: none;
border: none;
border-radius: 0;
box-shadow: none;
font: inherit;
display: inline;
color: var(--secondary-color);
text-decoration: underline;
cursor: pointer;
}
.resumen-container .btn:hover,
.resumen-container button:hover,
.resumen-container .wp-block-button__link:hover {
transform: none;
box-shadow: none;
text-decoration: underline;
}
.resumen-container a {
color: var(--secondary-color);
text-decoration: underline;
}
.resumen-container { position: relative; }
.ver-mas-btn {
background: none;
border: none;
@ -144,8 +483,14 @@ a { color: var(--secondary-color); text-decoration: none; font-weight: 500; } a:
gap: 15px;
margin-bottom: 20px;
}
.filter-search-box { flex-grow: 1; }
.filter-actions { display: flex; gap: 10px; white-space: nowrap; }
.filter-actions {
display: flex;
gap: 10px;
white-space: nowrap;
}
.clamp {
display: -webkit-box;
@ -154,14 +499,21 @@ a { color: var(--secondary-color); text-decoration: none; font-weight: 500; } a:
overflow: hidden;
word-break: break-word;
}
.clamp.expanded {
-webkit-line-clamp: unset;
max-height: none;
overflow: visible;
}
/* Pestañas por noticia */
.tabs { width: 100%; }
.tabs-header { display: flex; gap: 8px; margin-bottom: 8px; }
.tabs-header {
display: flex;
gap: 8px;
margin-bottom: 8px;
}
.tab-btn {
background: rgba(255,255,255,0.7);
border: 1px solid var(--border-color);
@ -170,16 +522,20 @@ a { color: var(--secondary-color); text-decoration: none; font-weight: 500; } a:
font-weight: 600;
cursor: pointer;
}
.tab-btn.active {
background: var(--gradiente-principal);
color: #fff !important;
border-color: transparent;
}
.tab-btn[disabled] {
opacity: .45;
cursor: not-allowed;
}
.tab-panel { display: none; }
.tab-panel.active { display: block; }
.badge {
@ -192,6 +548,13 @@ a { color: var(--secondary-color); text-decoration: none; font-weight: 500; } a:
color: var(--secondary-color);
vertical-align: middle;
}
.badge-secondary { background: #f1f3f5; color: #555; }
.badge-secondary {
background: #f1f3f5;
color: #555;
}
.mini-link { margin-left: 8px; font-size: .8rem; }
.m0 { margin: 0 0 6px 0; }


@ -9,7 +9,12 @@
{% if n.imagen_url %}
<div class="noticia-imagen">
<a href="{{ detalle_url }}">
<img src="{{ n.imagen_url }}" alt="Imagen para {{ n.titulo }}" loading="lazy">
<img
src="{{ n.imagen_url }}"
alt="Imagen para {{ n.titulo }}"
loading="lazy"
onerror="this.closest('.noticia-imagen').style.display='none';"
>
</a>
</div>
{% endif %}


@ -51,8 +51,17 @@
<h3>Operaciones del Sistema</h3>
</div>
<div class="card-body">
<p>Genera una copia de seguridad completa de todas tus fuentes y noticias en un archivo .zip.</p>
<a href="{{ url_for('backup_completo') }}" class="btn btn-secondary"><i class="fas fa-archive"></i> Backup Completo (.zip)</a>
<p>Genera o restaura una copia de seguridad completa de todas tus fuentes y noticias.</p>
<div style="display:flex; gap:10px; flex-wrap:wrap;">
<a href="{{ url_for('backup_completo') }}" class="btn btn-secondary">
<i class="fas fa-archive"></i> Backup Completo (.zip)
</a>
<a href="{{ url_for('restore_completo') }}" class="btn btn-info">
<i class="fas fa-upload"></i> Restaurar Backup (.zip)
</a>
</div>
</div>
</div>


@ -50,19 +50,28 @@
{% if d.imagen_url %}
<div style="margin-bottom:16px; text-align:center;">
<img src="{{ d.imagen_url }}" alt="Imagen de la noticia" style="max-width:100%; height:auto;" loading="lazy">
<img
src="{{ d.imagen_url }}"
alt="Imagen de la noticia"
loading="lazy"
onerror="this.style.display='none';"
>
</div>
{% endif %}
{% if d.resumen_trad %}
<h3>Resumen (traducido)</h3>
<div>{{ d.resumen_trad|safe_html }}</div>
<div class="resumen-container">
<div class="resumen-completo" style="display:block;">{{ d.resumen_trad|safe_html }}</div>
</div>
<hr>
{% endif %}
{% if d.resumen_orig %}
<h3>Resumen (original)</h3>
<div>{{ d.resumen_orig|safe_html }}</div>
<div class="resumen-container">
<div class="resumen-completo" style="display:block;">{{ d.resumen_orig|safe_html }}</div>
</div>
{% endif %}
{% if tags and tags|length %}
@ -87,7 +96,12 @@
{% if r.imagen_url %}
<div class="noticia-imagen">
<a href="{{ r.url }}" target="_blank" rel="noopener">
<img src="{{ r.imagen_url }}" alt="Imagen relacionada" loading="lazy">
<img
src="{{ r.imagen_url }}"
alt="Imagen relacionada"
loading="lazy"
onerror="this.closest('.noticia-imagen').style.display='none';"
>
</a>
</div>
{% endif %}


@ -0,0 +1,49 @@
{% extends "base.html" %}
{% block title %}Restaurar Backup Completo{% endblock %}
{% block content %}
<div class="card">
<div class="card-header">
<h3>Restaurar Backup Completo</h3>
</div>
<div class="card-body">
<p>
Sube un archivo <strong>.zip</strong> generado desde
<em>"Backup Completo (.zip)"</em> en el dashboard.
</p>
<div style="background: #fff3cd; border: 1px solid #ffeeba; padding: 10px 12px; border-radius: 8px; margin: 15px 0;">
<strong>⚠ Atención:</strong>
<ul style="margin: 8px 0 0 18px; padding: 0;">
<li>Se <strong>vaciarán</strong> las tablas <code>feeds</code> y <code>fuentes_url</code>.</li>
<li>Los datos de esos CSV se volverán a cargar desde el backup.</li>
<li>No se tocan noticias, traducciones ni tags.</li>
</ul>
</div>
<form action="{{ url_for('restore_completo') }}" method="post" enctype="multipart/form-data">
<div class="form-group" style="margin-bottom: 15px;">
<label for="backup_file"><strong>Archivo .zip de backup completo</strong></label>
<input
type="file"
id="backup_file"
name="backup_file"
accept=".zip"
required
style="display:block; margin-top:8px;"
>
</div>
<div style="margin-top: 20px; display:flex; gap:10px;">
<a href="{{ url_for('dashboard') }}" class="btn btn-secondary">Cancelar</a>
<button type="submit" class="btn btn-danger">
<i class="fas fa-exclamation-triangle"></i>
Restaurar desde backup
</button>
</div>
</form>
</div>
</div>
{% endblock %}


@ -1,4 +1,3 @@
# translation_worker.py
import os
import time
import logging
@ -8,17 +7,17 @@ from typing import List, Optional
import psycopg2
import psycopg2.extras
from psycopg2.extras import execute_values
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from langdetect import detect, DetectorFactory
DetectorFactory.seed = 0 # resultados reproducibles
DetectorFactory.seed = 0
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s: %(message)s")
LOG = logging.getLogger(__name__)
# ---------- Config DB ----------
DB_CONFIG = {
"host": os.environ.get("DB_HOST", "localhost"),
"port": int(os.environ.get("DB_PORT", 5432)),
@ -27,7 +26,7 @@ DB_CONFIG = {
"password": os.environ.get("DB_PASS", "x"),
}
# ---------- Helpers ENV (con retrocompatibilidad) ----------
def _env_list(name: str, *fallbacks: str, default: str = "es") -> List[str]:
raw = None
for key in (name, *fallbacks):
@ -37,6 +36,7 @@ def _env_list(name: str, *fallbacks: str, default: str = "es") -> List[str]:
raw = raw if raw is not None else default
return [s.strip() for s in raw.split(",") if s and s.strip()]
def _env_int(name: str, *fallbacks: str, default: int = 8) -> int:
for key in (name, *fallbacks):
val = os.environ.get(key)
@ -47,6 +47,7 @@ def _env_int(name: str, *fallbacks: str, default: int = 8) -> int:
pass
return default
def _env_float(name: str, *fallbacks: str, default: float = 5.0) -> float:
for key in (name, *fallbacks):
val = os.environ.get(key)
@ -57,6 +58,7 @@ def _env_float(name: str, *fallbacks: str, default: float = 5.0) -> float:
pass
return default
def _env_str(name: str, *fallbacks: str, default: Optional[str] = None) -> Optional[str]:
for key in (name, *fallbacks):
val = os.environ.get(key)
@ -64,23 +66,24 @@ def _env_str(name: str, *fallbacks: str, default: Optional[str] = None) -> Optio
return val
return default
def _env_bool(name: str, default: bool = False) -> bool:
val = os.environ.get(name)
if val is None:
return default
return str(val).strip().lower() in ("1", "true", "yes", "y", "on")
TARGET_LANGS = _env_list("TARGET_LANGS", "TRANSLATE_TO", default="es")
BATCH_SIZE = _env_int("BATCH", "TRANSLATOR_BATCH", "TRANSLATE_BATCH", default=8)
ENQUEUE_MAX = _env_int("ENQUEUE", "TRANSLATOR_ENQUEUE", "TRANSLATE_ENQUEUE", default=200)
SLEEP_IDLE = _env_float("SLEEP_IDLE", "TRANSLATOR_SLEEP_IDLE", "TRANSLATE_SLEEP_IDLE", default=5.0)
DEVICE_CFG = (_env_str("DEVICE", default="auto") or "auto").lower() # 'cpu' | 'cuda' | 'auto'
DEVICE_CFG = (_env_str("DEVICE", default="auto") or "auto").lower()
# Límites de tokens (ajusta si ves OOM)
MAX_SRC_TOKENS = _env_int("MAX_SRC_TOKENS", default=512)
MAX_NEW_TOKENS = _env_int("MAX_NEW_TOKENS", default=256)
# ---- Beams: por defecto 2 para títulos y 1 para cuerpo; respeta NUM_BEAMS si sólo se define ese ----
def _beams_from_env():
nb_global = os.environ.get("NUM_BEAMS")
has_title = os.environ.get("NUM_BEAMS_TITLE") is not None
@ -91,41 +94,77 @@ def _beams_from_env():
return v, v
except ValueError:
pass
# por defecto: 2 (título), 1 (cuerpo)
return _env_int("NUM_BEAMS_TITLE", default=2), _env_int("NUM_BEAMS_BODY", default=1)
NUM_BEAMS_TITLE, NUM_BEAMS_BODY = _beams_from_env()
# Modelo por defecto: NLLB 600M (cámbialo por facebook/nllb-200-1.3B si quieres el 1.3B)
UNIVERSAL_MODEL = _env_str("UNIVERSAL_MODEL", default="facebook/nllb-200-distilled-600M")
# ---------- Chunking por frases (para artículos largos) ----------
# Activo por defecto para evitar secuencias > límite del modelo
CHUNK_BY_SENTENCES = _env_bool("CHUNK_BY_SENTENCES", default=True)
CHUNK_MAX_TOKENS = _env_int("CHUNK_MAX_TOKENS", default=900) # <= modelo - margen
CHUNK_OVERLAP_SENTS = _env_int("CHUNK_OVERLAP_SENTS", default=0) # 0 o 1
CHUNK_MAX_TOKENS = _env_int("CHUNK_MAX_TOKENS", default=900)
CHUNK_OVERLAP_SENTS = _env_int("CHUNK_OVERLAP_SENTS", default=0)
# Abreviaturas comunes y marcador temporal
_ABBR = ("Sr", "Sra", "Dr", "Dra", "Ing", "Lic", "pág", "etc")
_ABBR_MARK = "§" # no debería aparecer en texto normal
_ABBR_MARK = "§"
_SENT_SPLIT_RE = re.compile(
r'(?<=[\.!\?…])\s+(?=["\(\[A-ZÁÉÍÓÚÑÄÖÜ0-9])|(?:\n{2,})'
)
NLLB_LANG = {
"es": "spa_Latn",
"en": "eng_Latn",
"fr": "fra_Latn",
"de": "deu_Latn",
"it": "ita_Latn",
"pt": "por_Latn",
"nl": "nld_Latn",
"sv": "swe_Latn",
"da": "dan_Latn",
"fi": "fin_Latn",
"no": "nob_Latn",
"nb": "nob_Latn",
"nn": "nno_Latn",
"pl": "pol_Latn",
"cs": "ces_Latn",
"sk": "slk_Latn",
"sl": "slv_Latn",
"hu": "hun_Latn",
"ro": "ron_Latn",
"bg": "bul_Cyrl",
"el": "ell_Grek",
"ru": "rus_Cyrl",
"uk": "ukr_Cyrl",
"hr": "hrv_Latn",
"sr": "srp_Cyrl",
"bs": "bos_Latn",
"tr": "tur_Latn",
"ar": "arb_Arab",
"fa": "pes_Arab",
"he": "heb_Hebr",
"zh": "zho_Hans",
"ja": "jpn_Jpan",
"ko": "kor_Hang",
"vi": "vie_Latn",
"th": "tha_Thai",
"id": "ind_Latn",
"ms": "zsm_Latn",
"pt-br": "por_Latn",
"pt-pt": "por_Latn",
}
def _protect_abbrev(text: str) -> str:
# Iniciales de una letra: "E.", "A."
t = re.sub(r"\b([A-ZÁÉÍÓÚÑÄÖÜ])\.", r"\1" + _ABBR_MARK, text)
# Abreviaturas de la lista (case-insensitive)
pat = r"\b(?:" + "|".join(map(re.escape, _ABBR)) + r")\."
t = re.sub(pat, lambda m: m.group(0)[:-1] + _ABBR_MARK, t, flags=re.IGNORECASE)
return t
def _restore_abbrev(text: str) -> str:
return text.replace(_ABBR_MARK, ".")
# Regex de corte SIN look-behind variable:
# - Corta tras [.!?…] si hay espacios y luego comienza otra frase (letra mayúscula, comillas, paréntesis, dígito)
# - O cuando hay doble salto de línea
_SENT_SPLIT_RE = re.compile(
r'(?<=[\.!\?…])\s+(?=["\(\[A-ZÁÉÍÓÚÑÄÖÜ0-9])|(?:\n{2,})'
)
def split_into_sentences(text: str) -> List[str]:
text = (text or "").strip()
@ -134,7 +173,6 @@ def split_into_sentences(text: str) -> List[str]:
protected = _protect_abbrev(text)
parts = [p.strip() for p in _SENT_SPLIT_RE.split(protected) if p and p.strip()]
parts = [_restore_abbrev(p) for p in parts]
# Une piezas muy cortas con la anterior para más coherencia
merged: List[str] = []
for p in parts:
if merged and len(p) < 40:
@ -143,26 +181,6 @@ def split_into_sentences(text: str) -> List[str]:
merged.append(p)
return merged
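A small sanity check of the splitter (module name taken from the header comment removed above; the text is illustrative): protected abbreviations such as "Sr." and single-letter initials must not end a sentence.

from translation_worker import split_into_sentences  # assumed module path

texto = (
    "El Sr. García llegó a Madrid. Habló con J. Pérez sobre el acuerdo. "
    "Nadie espera cambios inmediatos."
)
for frase in split_into_sentences(texto):
    print("-", frase)
# "Sr." and the initial "J." are restored intact and never split the text mid-name;
# very short trailing fragments get merged into the previous sentence.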
# ---------- Mapeo idiomas a códigos NLLB ----------
NLLB_LANG = {
# básicos
"es": "spa_Latn", "en": "eng_Latn", "fr": "fra_Latn", "de": "deu_Latn", "it": "ita_Latn", "pt": "por_Latn",
# nórdicos
"nl": "nld_Latn", "sv": "swe_Latn", "da": "dan_Latn", "fi": "fin_Latn",
# noruego
"no": "nob_Latn", "nb": "nob_Latn", "nn": "nno_Latn",
# CEE
"pl": "pol_Latn", "cs": "ces_Latn", "sk": "slk_Latn", "sl": "slv_Latn",
"hu": "hun_Latn", "ro": "ron_Latn", "bg": "bul_Cyrl", "el": "ell_Grek",
"ru": "rus_Cyrl", "uk": "ukr_Cyrl", "hr": "hrv_Latn", "sr": "srp_Cyrl", "bs": "bos_Latn",
# ME/Asia
"tr": "tur_Latn", "ar": "arb_Arab", "fa": "pes_Arab", "he": "heb_Hebr",
"zh": "zho_Hans", "ja": "jpn_Jpan", "ko": "kor_Hang",
# SEA
"vi": "vie_Latn", "th": "tha_Thai", "id": "ind_Latn", "ms": "zsm_Latn",
# variantes
"pt-br": "por_Latn", "pt-pt": "por_Latn",
}
def map_to_nllb(code: Optional[str]) -> Optional[str]:
if not code:
@ -172,29 +190,35 @@ def map_to_nllb(code: Optional[str]) -> Optional[str]:
return NLLB_LANG[code]
return f"{code}_Latn"
def normalize_lang(code: Optional[str], default: Optional[str] = None) -> Optional[str]:
if not code:
return default
code = code.strip().lower()
return code if code else default
# ---------- DB ----------
def get_conn():
return psycopg2.connect(**DB_CONFIG)
def ensure_indexes(conn):
with conn.cursor() as cur:
cur.execute("""
cur.execute(
"""
CREATE INDEX IF NOT EXISTS traducciones_lang_to_status_idx
ON traducciones (lang_to, status);
CREATE INDEX IF NOT EXISTS traducciones_status_idx
ON traducciones (status);
""")
"""
)
conn.commit()
def ensure_pending(conn, lang_to: str, enqueue_limit: int):
with conn.cursor() as cur:
cur.execute("""
cur.execute(
"""
INSERT INTO traducciones (noticia_id, lang_from, lang_to, status)
SELECT sub.id, NULL, %s, 'pending'
FROM (
@ -206,12 +230,16 @@ def ensure_pending(conn, lang_to: str, enqueue_limit: int):
ORDER BY n.fecha DESC NULLS LAST, n.id
LIMIT %s
) AS sub;
""", (lang_to, lang_to, enqueue_limit))
""",
(lang_to, lang_to, enqueue_limit),
)
conn.commit()
def fetch_pending_batch(conn, lang_to: str, batch_size: int):
with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
cur.execute("""
cur.execute(
"""
SELECT t.id AS tr_id, t.noticia_id, t.lang_from, t.lang_to,
n.titulo, n.resumen
FROM traducciones t
@ -219,7 +247,9 @@ def fetch_pending_batch(conn, lang_to: str, batch_size: int):
WHERE t.lang_to = %s AND t.status = 'pending'
ORDER BY t.id
LIMIT %s;
""", (lang_to, batch_size))
""",
(lang_to, batch_size),
)
rows = cur.fetchall()
if rows:
ids = [r["tr_id"] for r in rows]
@ -228,21 +258,6 @@ def fetch_pending_batch(conn, lang_to: str, batch_size: int):
conn.commit()
return rows
def mark_done(conn, tr_id: int, title_tr: str, body_tr: str, lang_from: Optional[str]):
with conn.cursor() as cur:
cur.execute("""
UPDATE traducciones
SET titulo_trad=%s, resumen_trad=%s,
lang_from = COALESCE(lang_from, %s),
status='done', error=NULL
WHERE id=%s;
""", (title_tr, body_tr, lang_from, tr_id))
conn.commit()
def mark_error(conn, tr_id: int, msg: str):
with conn.cursor() as cur:
cur.execute("UPDATE traducciones SET status='error', error=%s WHERE id=%s;", (msg[:1500], tr_id))
conn.commit()
def detect_lang(text1: str, text2: str) -> Optional[str]:
txt = (text1 or "").strip() or (text2 or "").strip()
@ -253,13 +268,14 @@ def detect_lang(text1: str, text2: str) -> Optional[str]:
except Exception:
return None
# ---------- Modelo único y manejo de CUDA (NLLB) ----------
_TOKENIZER: Optional[AutoTokenizer] = None
_MODEL: Optional[AutoModelForSeq2SeqLM] = None
_DEVICE: Optional[torch.device] = None
_CUDA_FAILS: int = 0
_CUDA_DISABLED: bool = False
def _resolve_device() -> torch.device:
global _CUDA_DISABLED
if _CUDA_DISABLED:
@ -268,13 +284,14 @@ def _resolve_device() -> torch.device:
return torch.device("cpu")
if DEVICE_CFG == "cuda":
return torch.device("cuda" if torch.cuda.is_available() else "cpu")
# auto
return torch.device("cuda" if torch.cuda.is_available() else "cpu")
def _is_cuda_mem_error(exc: Exception) -> bool:
s = str(exc)
return ("CUDA out of memory" in s) or ("CUDACachingAllocator" in s) or ("expandable_segment" in s)
def _free_cuda():
if torch.cuda.is_available():
try:
@ -283,8 +300,8 @@ def _free_cuda():
except Exception:
pass
def _load_model_on(device: torch.device):
"""Carga (o recarga) el modelo/tokenizer en el dispositivo indicado."""
global _TOKENIZER, _MODEL, _DEVICE
dtype = torch.float16 if device.type == "cuda" else torch.float32
@ -293,9 +310,9 @@ def _load_model_on(device: torch.device):
mdl = AutoModelForSeq2SeqLM.from_pretrained(
UNIVERSAL_MODEL,
torch_dtype=dtype,
low_cpu_mem_usage=True
low_cpu_mem_usage=True,
)
# use_cache=False reduce picos de VRAM en generación
try:
mdl.config.use_cache = False
except Exception:
@ -306,8 +323,8 @@ def _load_model_on(device: torch.device):
_TOKENIZER, _MODEL, _DEVICE = tok, mdl, device
def get_universal_components():
"""Devuelve (tokenizer, model, device). Carga en GPU si está disponible y estable."""
global _TOKENIZER, _MODEL, _DEVICE, _CUDA_FAILS, _CUDA_DISABLED
if _MODEL is not None and _DEVICE is not None:
@ -329,14 +346,13 @@ def get_universal_components():
_load_model_on(torch.device("cpu"))
return _TOKENIZER, _MODEL, _DEVICE
# ---------- Utilidades de tokenización / chunking ----------
def _safe_src_len(tokenizer) -> int:
model_max = getattr(tokenizer, "model_max_length", 1024) or 1024
# margen para tokens especiales/ruido
return min(MAX_SRC_TOKENS, int(model_max) - 16)
def _token_chunks(tokenizer, text: str, max_tokens: int) -> List[str]:
"""Troceo simple por tokens (fallback)"""
if not text:
return []
ids = tokenizer.encode(text, add_special_tokens=False)
@ -344,22 +360,20 @@ def _token_chunks(tokenizer, text: str, max_tokens: int) -> List[str]:
return [text]
chunks = []
for i in range(0, len(ids), max_tokens):
sub = ids[i:i+max_tokens]
sub = ids[i : i + max_tokens]
piece = tokenizer.decode(sub, skip_special_tokens=True, clean_up_tokenization_spaces=True)
if piece.strip():
chunks.append(piece.strip())
return chunks
def _norm(s: str) -> str:
import re as _re
return _re.sub(r"\W+", "", (s or "").lower()).strip()
def _forced_bos_id(tokenizer: AutoTokenizer, model: AutoModelForSeq2SeqLM, tgt_code: str) -> int:
"""
Resuelve el id del token de idioma destino para NLLB de forma robusta,
funcionando aunque falte `lang_code_to_id` en el tokenizer.
"""
# 1) tokenizer.lang_code_to_id (si existe)
try:
mapping = getattr(tokenizer, "lang_code_to_id", None)
if isinstance(mapping, dict):
@ -369,7 +383,6 @@ def _forced_bos_id(tokenizer: AutoTokenizer, model: AutoModelForSeq2SeqLM, tgt_c
except Exception:
pass
# 2) model.config.lang_code_to_id (si existe)
try:
mapping = getattr(getattr(model, "config", None), "lang_code_to_id", None)
if isinstance(mapping, dict):
@ -379,7 +392,6 @@ def _forced_bos_id(tokenizer: AutoTokenizer, model: AutoModelForSeq2SeqLM, tgt_c
except Exception:
pass
# 3) convert_tokens_to_ids (algunos builds registran el código como token especial)
try:
tid = tokenizer.convert_tokens_to_ids(tgt_code)
if isinstance(tid, int) and tid not in (-1, getattr(tokenizer, "unk_token_id", -1)):
@ -387,7 +399,6 @@ def _forced_bos_id(tokenizer: AutoTokenizer, model: AutoModelForSeq2SeqLM, tgt_c
except Exception:
pass
# 4) additional_special_tokens/_ids (buscar el código tal cual)
try:
ats = getattr(tokenizer, "additional_special_tokens", None)
ats_ids = getattr(tokenizer, "additional_special_tokens_ids", None)
@ -398,17 +409,12 @@ def _forced_bos_id(tokenizer: AutoTokenizer, model: AutoModelForSeq2SeqLM, tgt_c
except Exception:
pass
# 5) último recurso: usa eos/bos para no romper generate()
LOG.warning("No pude resolver lang code id para '%s'. Uso fallback (eos/bos).", tgt_code)
return getattr(tokenizer, "eos_token_id", None) or getattr(tokenizer, "bos_token_id", None) or 0
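For context, a minimal end-to-end NLLB call using the same target-language resolution idea as _forced_bos_id (sketch; requires downloading the model, and the sample text and languages are illustrative):

import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
mdl = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")

tok.src_lang = "eng_Latn"                      # NLLB-style source language code
batch = tok("The government announced new measures.", return_tensors="pt")
with torch.inference_mode():
    out = mdl.generate(
        **batch,
        forced_bos_token_id=tok.convert_tokens_to_ids("spa_Latn"),  # target language
        max_new_tokens=64,
        num_beams=2,
    )
print(tok.batch_decode(out, skip_special_tokens=True)[0])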
# ---------- Traducción base ----------
@torch.inference_mode()
def translate_text(src_lang: str, tgt_lang: str, text: str, num_beams: int = 1, _tries: int = 0) -> str:
"""
Traduce un texto (usando troceo por tokens si excede MAX_SRC_TOKENS).
Se usa para títulos y como núcleo para chunks de artículos.
"""
if not text or not text.strip():
return ""
@ -416,7 +422,6 @@ def translate_text(src_lang: str, tgt_lang: str, text: str, num_beams: int = 1,
src_code = map_to_nllb(src_lang) or "eng_Latn"
tgt_code = map_to_nllb(tgt_lang) or "spa_Latn"
# Configura idioma origen (si la prop existe)
try:
tok.src_lang = src_code
except Exception:
@ -439,7 +444,7 @@ def translate_text(src_lang: str, tgt_lang: str, text: str, num_beams: int = 1,
max_new_tokens=MAX_NEW_TOKENS,
num_beams=max(1, int(num_beams)),
do_sample=False,
use_cache=False, # ↓ memoria
use_cache=False,
)
if int(num_beams) > 1:
gen_kwargs["early_stopping"] = True
@ -459,7 +464,6 @@ def translate_text(src_lang: str, tgt_lang: str, text: str, num_beams: int = 1,
except Exception as e:
if device.type == "cuda" and _is_cuda_mem_error(e) and _tries < 2:
LOG.warning("CUDA OOM/allocator: intento de recuperación %d. Detalle: %s", _tries + 1, e)
# desactiva CUDA y relanza en CPU
global _MODEL, _DEVICE, _CUDA_DISABLED
_CUDA_DISABLED = True
try:
@ -474,10 +478,11 @@ def translate_text(src_lang: str, tgt_lang: str, text: str, num_beams: int = 1,
return translate_text(src_lang, tgt_lang, text, num_beams=num_beams, _tries=_tries + 1)
raise
# ---------- Chunking por frases para artículos ----------
def _sent_token_len(tokenizer, sent: str) -> int:
return len(tokenizer(sent, add_special_tokens=False).input_ids)
def _pack_sentences_to_token_chunks(
tokenizer, sentences: List[str], max_tokens: int, overlap_sents: int = 0
) -> List[List[str]]:
@ -487,11 +492,10 @@ def _pack_sentences_to_token_chunks(
for s in sentences:
slen = _sent_token_len(tokenizer, s)
if slen > max_tokens:
# Si una sola frase excede el límite, córtala por tokens como último recurso
ids = tokenizer(s, add_special_tokens=False).input_ids
step = max_tokens
for i in range(0, len(ids), step):
sub = tokenizer.decode(ids[i:i+step], skip_special_tokens=True)
sub = tokenizer.decode(ids[i : i + step], skip_special_tokens=True)
if cur:
chunks.append(cur)
cur = []
@ -500,7 +504,8 @@ def _pack_sentences_to_token_chunks(
continue
if cur_tokens + slen <= max_tokens:
cur.append(s); cur_tokens += slen
cur.append(s)
cur_tokens += slen
else:
if cur:
chunks.append(cur)
@ -509,13 +514,14 @@ def _pack_sentences_to_token_chunks(
cur = overlap + [s]
cur_tokens = sum(_sent_token_len(tokenizer, x) for x in cur)
else:
cur = [s]; cur_tokens = slen
cur = [s]
cur_tokens = slen
if cur:
chunks.append(cur)
return chunks
def _smart_concatenate(parts: List[str], tail_window: int = 120) -> str:
"""Une partes evitando duplicados obvios en el borde (heurística ligera)."""
if not parts:
return ""
out = parts[0]
@ -529,24 +535,17 @@ def _smart_concatenate(parts: List[str], tail_window: int = 120) -> str:
out += ("" if cut == 0 else nxt[cut:]) if nxt else ""
return out
def translate_article_full(
src_lang: str,
tgt_lang: str,
text: str,
num_beams: int,
) -> str:
"""
Traduce un artículo completo:
- Divide por frases (sin look-behind variable)
- Empaqueta en chunks <= límite de tokens
- Traduce chunk a chunk (usa translate_text internamente)
- Une con heurística para evitar duplicados en bordes
"""
if not text or not text.strip():
return ""
if not CHUNK_BY_SENTENCES:
# Ruta rápida: una sola pasada con truncamiento interno
return translate_text(src_lang, tgt_lang, text, num_beams=num_beams)
tok, _, _ = get_universal_components()
@ -569,8 +568,11 @@ def translate_article_full(
return _smart_concatenate([p for p in translated_parts if p])
# ---------- Procesamiento por lotes ----------
def process_batch(conn, rows):
done_rows = []
error_rows = []
for r in rows:
tr_id = r["tr_id"]
lang_to = normalize_lang(r["lang_to"], "es") or "es"
@ -579,36 +581,70 @@ def process_batch(conn, rows):
title = (r["titulo"] or "").strip()
body = (r["resumen"] or "").strip()
# Si ya está en el mismo idioma, copia tal cual
if (map_to_nllb(lang_from) or "eng_Latn") == (map_to_nllb(lang_to) or "spa_Latn"):
mark_done(conn, tr_id, title, body, lang_from)
done_rows.append((title, body, lang_from, tr_id))
continue
try:
# Títulos: cortos, traducción directa (beams más altos si quieres)
title_tr = translate_text(lang_from, lang_to, title, num_beams=NUM_BEAMS_TITLE) if title else ""
# Cuerpo/resumen: artículo completo con chunking por frases
body_tr = translate_article_full(lang_from, lang_to, body, num_beams=NUM_BEAMS_BODY) if body else ""
# Si la "traducción" es igual al original, déjala vacía
if _norm(title_tr) == _norm(title):
title_tr = ""
if _norm(body_tr) == _norm(body):
body_tr = ""
mark_done(conn, tr_id, title_tr, body_tr, lang_from)
done_rows.append((title_tr, body_tr, lang_from, tr_id))
except Exception as e:
LOG.exception("Error traduciendo fila")
mark_error(conn, tr_id, str(e))
error_rows.append((str(e)[:1500], tr_id))
with conn.cursor() as cur:
if done_rows:
execute_values(
cur,
"""
UPDATE traducciones AS t
SET titulo_trad = v.titulo_trad,
resumen_trad = v.resumen_trad,
lang_from = COALESCE(t.lang_from, v.lang_from),
status = 'done',
error = NULL
FROM (VALUES %s) AS v(titulo_trad, resumen_trad, lang_from, id)
WHERE t.id = v.id;
""",
done_rows,
)
if error_rows:
execute_values(
cur,
"""
UPDATE traducciones AS t
SET status = 'error',
error = v.error
FROM (VALUES %s) AS v(error, id)
WHERE t.id = v.id;
""",
error_rows,
)
conn.commit()
def main():
LOG.info(
"Arrancando worker de traducción (NLLB). TARGET_LANGS=%s, BATCH=%s, ENQUEUE=%s, DEVICE=%s, "
"BEAMS(title/body)=%s/%s, CHUNK_BY_SENTENCES=%s, CHUNK_MAX_TOKENS=%s, OVERLAP_SENTS=%s",
TARGET_LANGS, BATCH_SIZE, ENQUEUE_MAX, DEVICE_CFG, NUM_BEAMS_TITLE, NUM_BEAMS_BODY,
CHUNK_BY_SENTENCES, CHUNK_MAX_TOKENS, CHUNK_OVERLAP_SENTS
TARGET_LANGS,
BATCH_SIZE,
ENQUEUE_MAX,
DEVICE_CFG,
NUM_BEAMS_TITLE,
NUM_BEAMS_BODY,
CHUNK_BY_SENTENCES,
CHUNK_MAX_TOKENS,
CHUNK_OVERLAP_SENTS,
)
# Pre-carga el modelo una vez para reservar memoria de forma limpia
get_universal_components()
while True:
@ -628,6 +664,7 @@ def main():
if not any_work:
time.sleep(SLEEP_IDLE)
if __name__ == "__main__":
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
main()