arreglo de UI y búsquedas

2025-11-21 04:42:02 +01:00 · 2025-11-21 04:42:02 +01:00 · fc06566928
commit fc06566928
parent cb8f69fb93
15 changed files with 1115 additions and 435 deletions
--- a/translation_worker.py
+++ b/translation_worker.py
@ -1,4 +1,3 @@
-# translation_worker.py
 import os
 import time
 import logging
@ -8,17 +7,17 @@ from typing import List, Optional

 import psycopg2
 import psycopg2.extras
+from psycopg2.extras import execute_values

 import torch
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 from langdetect import detect, DetectorFactory

-DetectorFactory.seed = 0  # resultados reproducibles
+DetectorFactory.seed = 0

 logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s: %(message)s")
 LOG = logging.getLogger(__name__)

-# ---------- Config DB ----------
 DB_CONFIG = {
    "host": os.environ.get("DB_HOST", "localhost"),
    "port": int(os.environ.get("DB_PORT", 5432)),
@ -27,7 +26,7 @@ DB_CONFIG = {
    "password": os.environ.get("DB_PASS", "x"),
 }

-# ---------- Helpers ENV (con retrocompatibilidad) ----------
+
 def _env_list(name: str, *fallbacks: str, default: str = "es") -> List[str]:
    raw = None
    for key in (name, *fallbacks):
@ -37,6 +36,7 @@ def _env_list(name: str, *fallbacks: str, default: str = "es") -> List[str]:
    raw = raw if raw is not None else default
    return [s.strip() for s in raw.split(",") if s and s.strip()]

+
 def _env_int(name: str, *fallbacks: str, default: int = 8) -> int:
    for key in (name, *fallbacks):
        val = os.environ.get(key)
@ -47,6 +47,7 @@ def _env_int(name: str, *fallbacks: str, default: int = 8) -> int:
                pass
    return default

+
 def _env_float(name: str, *fallbacks: str, default: float = 5.0) -> float:
    for key in (name, *fallbacks):
        val = os.environ.get(key)
@ -57,6 +58,7 @@ def _env_float(name: str, *fallbacks: str, default: float = 5.0) -> float:
                pass
    return default

+
 def _env_str(name: str, *fallbacks: str, default: Optional[str] = None) -> Optional[str]:
    for key in (name, *fallbacks):
        val = os.environ.get(key)
@ -64,68 +66,105 @@ def _env_str(name: str, *fallbacks: str, default: Optional[str] = None) -> Optio
            return val
    return default

+
 def _env_bool(name: str, default: bool = False) -> bool:
    val = os.environ.get(name)
    if val is None:
        return default
    return str(val).strip().lower() in ("1", "true", "yes", "y", "on")

-TARGET_LANGS = _env_list("TARGET_LANGS", "TRANSLATE_TO", default="es")
-BATCH_SIZE   = _env_int("BATCH", "TRANSLATOR_BATCH", "TRANSLATE_BATCH", default=8)
-ENQUEUE_MAX  = _env_int("ENQUEUE", "TRANSLATOR_ENQUEUE", "TRANSLATE_ENQUEUE", default=200)
-SLEEP_IDLE   = _env_float("SLEEP_IDLE", "TRANSLATOR_SLEEP_IDLE", "TRANSLATE_SLEEP_IDLE", default=5.0)
-DEVICE_CFG   = (_env_str("DEVICE", default="auto") or "auto").lower()  # 'cpu' | 'cuda' | 'auto'

-# Límites de tokens (ajusta si ves OOM)
+TARGET_LANGS = _env_list("TARGET_LANGS", "TRANSLATE_TO", default="es")
+BATCH_SIZE = _env_int("BATCH", "TRANSLATOR_BATCH", "TRANSLATE_BATCH", default=8)
+ENQUEUE_MAX = _env_int("ENQUEUE", "TRANSLATOR_ENQUEUE", "TRANSLATE_ENQUEUE", default=200)
+SLEEP_IDLE = _env_float("SLEEP_IDLE", "TRANSLATOR_SLEEP_IDLE", "TRANSLATE_SLEEP_IDLE", default=5.0)
+DEVICE_CFG = (_env_str("DEVICE", default="auto") or "auto").lower()
+
 MAX_SRC_TOKENS = _env_int("MAX_SRC_TOKENS", default=512)
 MAX_NEW_TOKENS = _env_int("MAX_NEW_TOKENS", default=256)

-# ---- Beams: por defecto 2 para títulos y 1 para cuerpo; respeta NUM_BEAMS si sólo se define ese ----
+
 def _beams_from_env():
    nb_global = os.environ.get("NUM_BEAMS")
    has_title = os.environ.get("NUM_BEAMS_TITLE") is not None
-    has_body  = os.environ.get("NUM_BEAMS_BODY")  is not None
+    has_body = os.environ.get("NUM_BEAMS_BODY") is not None
    if nb_global and not has_title and not has_body:
        try:
            v = max(1, int(nb_global))
            return v, v
        except ValueError:
            pass
-    # por defecto: 2 (título), 1 (cuerpo)
    return _env_int("NUM_BEAMS_TITLE", default=2), _env_int("NUM_BEAMS_BODY", default=1)

+
 NUM_BEAMS_TITLE, NUM_BEAMS_BODY = _beams_from_env()

-# Modelo por defecto: NLLB 600M (cámbialo por facebook/nllb-200-1.3B si quieres el 1.3B)
 UNIVERSAL_MODEL = _env_str("UNIVERSAL_MODEL", default="facebook/nllb-200-distilled-600M")

-# ---------- Chunking por frases (para artículos largos) ----------
-# Activo por defecto para evitar secuencias > límite del modelo
-CHUNK_BY_SENTENCES   = _env_bool("CHUNK_BY_SENTENCES", default=True)
-CHUNK_MAX_TOKENS     = _env_int("CHUNK_MAX_TOKENS", default=900)  # <= modelo - margen
-CHUNK_OVERLAP_SENTS  = _env_int("CHUNK_OVERLAP_SENTS", default=0) # 0 o 1
+CHUNK_BY_SENTENCES = _env_bool("CHUNK_BY_SENTENCES", default=True)
+CHUNK_MAX_TOKENS = _env_int("CHUNK_MAX_TOKENS", default=900)
+CHUNK_OVERLAP_SENTS = _env_int("CHUNK_OVERLAP_SENTS", default=0)

-# Abreviaturas comunes y marcador temporal
 _ABBR = ("Sr", "Sra", "Dr", "Dra", "Ing", "Lic", "pág", "etc")
-_ABBR_MARK = "§"  # no debería aparecer en texto normal
+_ABBR_MARK = "§"
+
+_SENT_SPLIT_RE = re.compile(
+    r'(?<=[\.!\?…])\s+(?=["“\(\[A-ZÁÉÍÓÚÑÄÖÜ0-9])|(?:\n{2,})'
+)
+
+NLLB_LANG = {
+    "es": "spa_Latn",
+    "en": "eng_Latn",
+    "fr": "fra_Latn",
+    "de": "deu_Latn",
+    "it": "ita_Latn",
+    "pt": "por_Latn",
+    "nl": "nld_Latn",
+    "sv": "swe_Latn",
+    "da": "dan_Latn",
+    "fi": "fin_Latn",
+    "no": "nob_Latn",
+    "nb": "nob_Latn",
+    "nn": "nno_Latn",
+    "pl": "pol_Latn",
+    "cs": "ces_Latn",
+    "sk": "slk_Latn",
+    "sl": "slv_Latn",
+    "hu": "hun_Latn",
+    "ro": "ron_Latn",
+    "bg": "bul_Cyrl",
+    "el": "ell_Grek",
+    "ru": "rus_Cyrl",
+    "uk": "ukr_Cyrl",
+    "hr": "hrv_Latn",
+    "sr": "srp_Cyrl",
+    "bs": "bos_Latn",
+    "tr": "tur_Latn",
+    "ar": "arb_Arab",
+    "fa": "pes_Arab",
+    "he": "heb_Hebr",
+    "zh": "zho_Hans",
+    "ja": "jpn_Jpan",
+    "ko": "kor_Hang",
+    "vi": "vie_Latn",
+    "th": "tha_Thai",
+    "id": "ind_Latn",
+    "ms": "zsm_Latn",
+    "pt-br": "por_Latn",
+    "pt-pt": "por_Latn",
+}
+

 def _protect_abbrev(text: str) -> str:
-    # Iniciales de una letra: "E.", "A."
    t = re.sub(r"\b([A-ZÁÉÍÓÚÑÄÖÜ])\.", r"\1" + _ABBR_MARK, text)
-    # Abreviaturas de la lista (case-insensitive)
    pat = r"\b(?:" + "|".join(map(re.escape, _ABBR)) + r")\."
    t = re.sub(pat, lambda m: m.group(0)[:-1] + _ABBR_MARK, t, flags=re.IGNORECASE)
    return t

+
 def _restore_abbrev(text: str) -> str:
    return text.replace(_ABBR_MARK, ".")

-# Regex de corte SIN look-behind variable:
-# - Corta tras [.!?…] si hay espacios y luego comienza otra frase (letra mayúscula, comillas, paréntesis, dígito)
-# - O cuando hay doble salto de línea
-_SENT_SPLIT_RE = re.compile(
-    r'(?<=[\.!\?…])\s+(?=["“\(\[A-ZÁÉÍÓÚÑÄÖÜ0-9])|(?:\n{2,})'
-)

 def split_into_sentences(text: str) -> List[str]:
    text = (text or "").strip()
@ -134,7 +173,6 @@ def split_into_sentences(text: str) -> List[str]:
    protected = _protect_abbrev(text)
    parts = [p.strip() for p in _SENT_SPLIT_RE.split(protected) if p and p.strip()]
    parts = [_restore_abbrev(p) for p in parts]
-    # Une piezas muy cortas con la anterior para más coherencia
    merged: List[str] = []
    for p in parts:
        if merged and len(p) < 40:
@ -143,26 +181,6 @@ def split_into_sentences(text: str) -> List[str]:
            merged.append(p)
    return merged

-# ---------- Mapeo idiomas a códigos NLLB ----------
-NLLB_LANG = {
-    # básicos
-    "es": "spa_Latn", "en": "eng_Latn", "fr": "fra_Latn", "de": "deu_Latn", "it": "ita_Latn", "pt": "por_Latn",
-    # nórdicos
-    "nl": "nld_Latn", "sv": "swe_Latn", "da": "dan_Latn", "fi": "fin_Latn",
-    # noruego
-    "no": "nob_Latn", "nb": "nob_Latn", "nn": "nno_Latn",
-    # CEE
-    "pl": "pol_Latn", "cs": "ces_Latn", "sk": "slk_Latn", "sl": "slv_Latn",
-    "hu": "hun_Latn", "ro": "ron_Latn", "bg": "bul_Cyrl", "el": "ell_Grek",
-    "ru": "rus_Cyrl", "uk": "ukr_Cyrl", "hr": "hrv_Latn", "sr": "srp_Cyrl", "bs": "bos_Latn",
-    # ME/Asia
-    "tr": "tur_Latn", "ar": "arb_Arab", "fa": "pes_Arab", "he": "heb_Hebr",
-    "zh": "zho_Hans", "ja": "jpn_Jpan", "ko": "kor_Hang",
-    # SEA
-    "vi": "vie_Latn", "th": "tha_Thai", "id": "ind_Latn", "ms": "zsm_Latn",
-    # variantes
-    "pt-br": "por_Latn", "pt-pt": "por_Latn",
-}

 def map_to_nllb(code: Optional[str]) -> Optional[str]:
    if not code:
@ -172,29 +190,35 @@ def map_to_nllb(code: Optional[str]) -> Optional[str]:
        return NLLB_LANG[code]
    return f"{code}_Latn"

+
 def normalize_lang(code: Optional[str], default: Optional[str] = None) -> Optional[str]:
    if not code:
        return default
    code = code.strip().lower()
    return code if code else default

-# ---------- DB ----------
+
 def get_conn():
    return psycopg2.connect(**DB_CONFIG)

+
 def ensure_indexes(conn):
    with conn.cursor() as cur:
-        cur.execute("""
+        cur.execute(
+            """
        CREATE INDEX IF NOT EXISTS traducciones_lang_to_status_idx
            ON traducciones (lang_to, status);
        CREATE INDEX IF NOT EXISTS traducciones_status_idx
            ON traducciones (status);
-        """)
+        """
+        )
    conn.commit()

+
 def ensure_pending(conn, lang_to: str, enqueue_limit: int):
    with conn.cursor() as cur:
-        cur.execute("""
+        cur.execute(
+            """
            INSERT INTO traducciones (noticia_id, lang_from, lang_to, status)
            SELECT sub.id, NULL, %s, 'pending'
            FROM (
@ -206,12 +230,16 @@ def ensure_pending(conn, lang_to: str, enqueue_limit: int):
                ORDER BY n.fecha DESC NULLS LAST, n.id
                LIMIT %s
            ) AS sub;
-        """, (lang_to, lang_to, enqueue_limit))
+        """,
+            (lang_to, lang_to, enqueue_limit),
+        )
    conn.commit()

+
 def fetch_pending_batch(conn, lang_to: str, batch_size: int):
    with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
-        cur.execute("""
+        cur.execute(
+            """
            SELECT t.id AS tr_id, t.noticia_id, t.lang_from, t.lang_to,
                   n.titulo, n.resumen
            FROM traducciones t
@ -219,7 +247,9 @@ def fetch_pending_batch(conn, lang_to: str, batch_size: int):
            WHERE t.lang_to = %s AND t.status = 'pending'
            ORDER BY t.id
            LIMIT %s;
-        """, (lang_to, batch_size))
+        """,
+            (lang_to, batch_size),
+        )
        rows = cur.fetchall()
    if rows:
        ids = [r["tr_id"] for r in rows]
@ -228,21 +258,6 @@ def fetch_pending_batch(conn, lang_to: str, batch_size: int):
        conn.commit()
    return rows

-def mark_done(conn, tr_id: int, title_tr: str, body_tr: str, lang_from: Optional[str]):
-    with conn.cursor() as cur:
-        cur.execute("""
-            UPDATE traducciones
-               SET titulo_trad=%s, resumen_trad=%s,
-                   lang_from = COALESCE(lang_from, %s),
-                   status='done', error=NULL
-             WHERE id=%s;
-        """, (title_tr, body_tr, lang_from, tr_id))
-    conn.commit()
-
-def mark_error(conn, tr_id: int, msg: str):
-    with conn.cursor() as cur:
-        cur.execute("UPDATE traducciones SET status='error', error=%s WHERE id=%s;", (msg[:1500], tr_id))
-    conn.commit()

 def detect_lang(text1: str, text2: str) -> Optional[str]:
    txt = (text1 or "").strip() or (text2 or "").strip()
@ -253,13 +268,14 @@ def detect_lang(text1: str, text2: str) -> Optional[str]:
    except Exception:
        return None

-# ---------- Modelo único y manejo de CUDA (NLLB) ----------
+
 _TOKENIZER: Optional[AutoTokenizer] = None
 _MODEL: Optional[AutoModelForSeq2SeqLM] = None
 _DEVICE: Optional[torch.device] = None
 _CUDA_FAILS: int = 0
 _CUDA_DISABLED: bool = False

+
 def _resolve_device() -> torch.device:
    global _CUDA_DISABLED
    if _CUDA_DISABLED:
@ -268,13 +284,14 @@ def _resolve_device() -> torch.device:
        return torch.device("cpu")
    if DEVICE_CFG == "cuda":
        return torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    # auto
    return torch.device("cuda" if torch.cuda.is_available() else "cpu")

+
 def _is_cuda_mem_error(exc: Exception) -> bool:
    s = str(exc)
    return ("CUDA out of memory" in s) or ("CUDACachingAllocator" in s) or ("expandable_segment" in s)

+
 def _free_cuda():
    if torch.cuda.is_available():
        try:
@ -283,8 +300,8 @@ def _free_cuda():
        except Exception:
            pass

+
 def _load_model_on(device: torch.device):
-    """Carga (o recarga) el modelo/tokenizer en el dispositivo indicado."""
    global _TOKENIZER, _MODEL, _DEVICE
    dtype = torch.float16 if device.type == "cuda" else torch.float32

@ -293,9 +310,9 @@ def _load_model_on(device: torch.device):
    mdl = AutoModelForSeq2SeqLM.from_pretrained(
        UNIVERSAL_MODEL,
        torch_dtype=dtype,
-        low_cpu_mem_usage=True
+        low_cpu_mem_usage=True,
    )
-    # use_cache=False reduce picos de VRAM en generación
+
    try:
        mdl.config.use_cache = False
    except Exception:
@ -306,8 +323,8 @@ def _load_model_on(device: torch.device):

    _TOKENIZER, _MODEL, _DEVICE = tok, mdl, device

+
 def get_universal_components():
-    """Devuelve (tokenizer, model, device). Carga en GPU si está disponible y estable."""
    global _TOKENIZER, _MODEL, _DEVICE, _CUDA_FAILS, _CUDA_DISABLED

    if _MODEL is not None and _DEVICE is not None:
@ -329,14 +346,13 @@ def get_universal_components():
        _load_model_on(torch.device("cpu"))
        return _TOKENIZER, _MODEL, _DEVICE

-# ---------- Utilidades de tokenización / chunking ----------
+
 def _safe_src_len(tokenizer) -> int:
    model_max = getattr(tokenizer, "model_max_length", 1024) or 1024
-    # margen para tokens especiales/ruido
    return min(MAX_SRC_TOKENS, int(model_max) - 16)

+
 def _token_chunks(tokenizer, text: str, max_tokens: int) -> List[str]:
-    """Troceo simple por tokens (fallback)"""
    if not text:
        return []
    ids = tokenizer.encode(text, add_special_tokens=False)
@ -344,22 +360,20 @@ def _token_chunks(tokenizer, text: str, max_tokens: int) -> List[str]:
        return [text]
    chunks = []
    for i in range(0, len(ids), max_tokens):
-        sub = ids[i:i+max_tokens]
+        sub = ids[i : i + max_tokens]
        piece = tokenizer.decode(sub, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        if piece.strip():
            chunks.append(piece.strip())
    return chunks

+
 def _norm(s: str) -> str:
    import re as _re
+
    return _re.sub(r"\W+", "", (s or "").lower()).strip()

+
 def _forced_bos_id(tokenizer: AutoTokenizer, model: AutoModelForSeq2SeqLM, tgt_code: str) -> int:
-    """
-    Resuelve el id del token de idioma destino para NLLB de forma robusta,
-    funcionando aunque falte `lang_code_to_id` en el tokenizer.
-    """
-    # 1) tokenizer.lang_code_to_id (si existe)
    try:
        mapping = getattr(tokenizer, "lang_code_to_id", None)
        if isinstance(mapping, dict):
@ -369,7 +383,6 @@ def _forced_bos_id(tokenizer: AutoTokenizer, model: AutoModelForSeq2SeqLM, tgt_c
    except Exception:
        pass

-    # 2) model.config.lang_code_to_id (si existe)
    try:
        mapping = getattr(getattr(model, "config", None), "lang_code_to_id", None)
        if isinstance(mapping, dict):
@ -379,7 +392,6 @@ def _forced_bos_id(tokenizer: AutoTokenizer, model: AutoModelForSeq2SeqLM, tgt_c
    except Exception:
        pass

-    # 3) convert_tokens_to_ids (algunos builds registran el código como token especial)
    try:
        tid = tokenizer.convert_tokens_to_ids(tgt_code)
        if isinstance(tid, int) and tid not in (-1, getattr(tokenizer, "unk_token_id", -1)):
@ -387,7 +399,6 @@ def _forced_bos_id(tokenizer: AutoTokenizer, model: AutoModelForSeq2SeqLM, tgt_c
    except Exception:
        pass

-    # 4) additional_special_tokens/_ids (buscar el código tal cual)
    try:
        ats = getattr(tokenizer, "additional_special_tokens", None)
        ats_ids = getattr(tokenizer, "additional_special_tokens_ids", None)
@ -398,17 +409,12 @@ def _forced_bos_id(tokenizer: AutoTokenizer, model: AutoModelForSeq2SeqLM, tgt_c
    except Exception:
        pass

-    # 5) último recurso: usa eos/bos para no romper generate()
    LOG.warning("No pude resolver lang code id para '%s'. Uso fallback (eos/bos).", tgt_code)
    return getattr(tokenizer, "eos_token_id", None) or getattr(tokenizer, "bos_token_id", None) or 0

-# ---------- Traducción base ----------
+
@torch.inference_mode()
 def translate_text(src_lang: str, tgt_lang: str, text: str, num_beams: int = 1, _tries: int = 0) -> str:
-    """
-    Traduce un texto (usando troceo por tokens si excede MAX_SRC_TOKENS).
-    Se usa para títulos y como núcleo para chunks de artículos.
-    """
    if not text or not text.strip():
        return ""

@ -416,7 +422,6 @@ def translate_text(src_lang: str, tgt_lang: str, text: str, num_beams: int = 1,
    src_code = map_to_nllb(src_lang) or "eng_Latn"
    tgt_code = map_to_nllb(tgt_lang) or "spa_Latn"

-    # Configura idioma origen (si la prop existe)
    try:
        tok.src_lang = src_code
    except Exception:
@ -439,7 +444,7 @@ def translate_text(src_lang: str, tgt_lang: str, text: str, num_beams: int = 1,
                max_new_tokens=MAX_NEW_TOKENS,
                num_beams=max(1, int(num_beams)),
                do_sample=False,
-                use_cache=False,  # ↓ memoria
+                use_cache=False,
            )
            if int(num_beams) > 1:
                gen_kwargs["early_stopping"] = True
@ -459,7 +464,6 @@ def translate_text(src_lang: str, tgt_lang: str, text: str, num_beams: int = 1,
    except Exception as e:
        if device.type == "cuda" and _is_cuda_mem_error(e) and _tries < 2:
            LOG.warning("CUDA OOM/allocator: intento de recuperación %d. Detalle: %s", _tries + 1, e)
-            # desactiva CUDA y relanza en CPU
            global _MODEL, _DEVICE, _CUDA_DISABLED
            _CUDA_DISABLED = True
            try:
@ -474,10 +478,11 @@ def translate_text(src_lang: str, tgt_lang: str, text: str, num_beams: int = 1,
            return translate_text(src_lang, tgt_lang, text, num_beams=num_beams, _tries=_tries + 1)
        raise

-# ---------- Chunking por frases para artículos ----------
+
 def _sent_token_len(tokenizer, sent: str) -> int:
    return len(tokenizer(sent, add_special_tokens=False).input_ids)

+
 def _pack_sentences_to_token_chunks(
    tokenizer, sentences: List[str], max_tokens: int, overlap_sents: int = 0
 ) -> List[List[str]]:
@ -487,11 +492,10 @@ def _pack_sentences_to_token_chunks(
    for s in sentences:
        slen = _sent_token_len(tokenizer, s)
        if slen > max_tokens:
-            # Si una sola frase excede el límite, córtala por tokens como último recurso
            ids = tokenizer(s, add_special_tokens=False).input_ids
            step = max_tokens
            for i in range(0, len(ids), step):
-                sub = tokenizer.decode(ids[i:i+step], skip_special_tokens=True)
+                sub = tokenizer.decode(ids[i : i + step], skip_special_tokens=True)
                if cur:
                    chunks.append(cur)
                    cur = []
@ -500,7 +504,8 @@ def _pack_sentences_to_token_chunks(
            continue

        if cur_tokens + slen <= max_tokens:
-            cur.append(s); cur_tokens += slen
+            cur.append(s)
+            cur_tokens += slen
        else:
            if cur:
                chunks.append(cur)
@ -509,13 +514,14 @@ def _pack_sentences_to_token_chunks(
                cur = overlap + [s]
                cur_tokens = sum(_sent_token_len(tokenizer, x) for x in cur)
            else:
-                cur = [s]; cur_tokens = slen
+                cur = [s]
+                cur_tokens = slen
    if cur:
        chunks.append(cur)
    return chunks

+
 def _smart_concatenate(parts: List[str], tail_window: int = 120) -> str:
-    """Une partes evitando duplicados obvios en el borde (heurística ligera)."""
    if not parts:
        return ""
    out = parts[0]
@ -529,24 +535,17 @@ def _smart_concatenate(parts: List[str], tail_window: int = 120) -> str:
        out += ("" if cut == 0 else nxt[cut:]) if nxt else ""
    return out

+
 def translate_article_full(
    src_lang: str,
    tgt_lang: str,
    text: str,
    num_beams: int,
 ) -> str:
-    """
-    Traduce un artículo completo:
-    - Divide por frases (sin look-behind variable)
-    - Empaqueta en chunks <= límite de tokens
-    - Traduce chunk a chunk (usa translate_text internamente)
-    - Une con heurística para evitar duplicados en bordes
-    """
    if not text or not text.strip():
        return ""

    if not CHUNK_BY_SENTENCES:
-        # Ruta rápida: una sola pasada con truncamiento interno
        return translate_text(src_lang, tgt_lang, text, num_beams=num_beams)

    tok, _, _ = get_universal_components()
@ -569,46 +568,83 @@ def translate_article_full(

    return _smart_concatenate([p for p in translated_parts if p])

-# ---------- Procesamiento por lotes ----------
+
 def process_batch(conn, rows):
+    done_rows = []
+    error_rows = []
+
    for r in rows:
        tr_id = r["tr_id"]
        lang_to = normalize_lang(r["lang_to"], "es") or "es"
        lang_from = normalize_lang(r["lang_from"]) or detect_lang(r["titulo"] or "", r["resumen"] or "") or "en"

        title = (r["titulo"] or "").strip()
-        body  = (r["resumen"] or "").strip()
+        body = (r["resumen"] or "").strip()

-        # Si ya está en el mismo idioma, copia tal cual
        if (map_to_nllb(lang_from) or "eng_Latn") == (map_to_nllb(lang_to) or "spa_Latn"):
-            mark_done(conn, tr_id, title, body, lang_from)
+            done_rows.append((title, body, lang_from, tr_id))
            continue

        try:
-            # Títulos: cortos, traducción directa (beams más altos si quieres)
            title_tr = translate_text(lang_from, lang_to, title, num_beams=NUM_BEAMS_TITLE) if title else ""
-            # Cuerpo/resumen: artículo completo con chunking por frases
-            body_tr  = translate_article_full(lang_from, lang_to, body, num_beams=NUM_BEAMS_BODY) if body else ""
+            body_tr = translate_article_full(lang_from, lang_to, body, num_beams=NUM_BEAMS_BODY) if body else ""

-            # Si la "traducción" es igual al original, déjala vacía
            if _norm(title_tr) == _norm(title):
                title_tr = ""
            if _norm(body_tr) == _norm(body):
                body_tr = ""

-            mark_done(conn, tr_id, title_tr, body_tr, lang_from)
+            done_rows.append((title_tr, body_tr, lang_from, tr_id))
        except Exception as e:
            LOG.exception("Error traduciendo fila")
-            mark_error(conn, tr_id, str(e))
+            error_rows.append((str(e)[:1500], tr_id))
+
+    with conn.cursor() as cur:
+        if done_rows:
+            execute_values(
+                cur,
+                """
+                UPDATE traducciones AS t
+                SET titulo_trad = v.titulo_trad,
+                    resumen_trad = v.resumen_trad,
+                    lang_from = COALESCE(t.lang_from, v.lang_from),
+                    status = 'done',
+                    error = NULL
+                FROM (VALUES %s) AS v(titulo_trad, resumen_trad, lang_from, id)
+                WHERE t.id = v.id;
+                """,
+                done_rows,
+            )
+
+        if error_rows:
+            execute_values(
+                cur,
+                """
+                UPDATE traducciones AS t
+                SET status = 'error',
+                    error = v.error
+                FROM (VALUES %s) AS v(error, id)
+                WHERE t.id = v.id;
+                """,
+                error_rows,
+            )
+    conn.commit()
+

 def main():
    LOG.info(
        "Arrancando worker de traducción (NLLB). TARGET_LANGS=%s, BATCH=%s, ENQUEUE=%s, DEVICE=%s, "
        "BEAMS(title/body)=%s/%s, CHUNK_BY_SENTENCES=%s, CHUNK_MAX_TOKENS=%s, OVERLAP_SENTS=%s",
-        TARGET_LANGS, BATCH_SIZE, ENQUEUE_MAX, DEVICE_CFG, NUM_BEAMS_TITLE, NUM_BEAMS_BODY,
-        CHUNK_BY_SENTENCES, CHUNK_MAX_TOKENS, CHUNK_OVERLAP_SENTS
+        TARGET_LANGS,
+        BATCH_SIZE,
+        ENQUEUE_MAX,
+        DEVICE_CFG,
+        NUM_BEAMS_TITLE,
+        NUM_BEAMS_BODY,
+        CHUNK_BY_SENTENCES,
+        CHUNK_MAX_TOKENS,
+        CHUNK_OVERLAP_SENTS,
    )
-    # Pre-carga el modelo una vez para reservar memoria de forma limpia
    get_universal_components()

    while True:
@ -628,6 +664,7 @@ def main():
        if not any_work:
            time.sleep(SLEEP_IDLE)

+
 if __name__ == "__main__":
    os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
    main()