go integration and wikipedia

This commit is contained in:
jlimolina 2026-03-28 18:30:07 +01:00
parent 47a252e339
commit ee90335b92
7828 changed files with 1307913 additions and 20807 deletions

View file

@ -7,19 +7,15 @@ from typing import List, Optional
import psycopg2
import psycopg2.extras
from psycopg2.extras import execute_values
import ctranslate2
from transformers import AutoTokenizer
from langdetect import detect, DetectorFactory
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
# NOTE(review): from here down this file appears to be a mangled diff view —
# old and new versions of the module are interleaved and raw hunk headers
# ("@ -28,9 ...") survive in the text. Recover the real file from version
# control before editing; lines are preserved verbatim below.
DetectorFactory.seed = 0
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s: %(message)s")
LOG = logging.getLogger("translator")
# =========================
# DB CONFIG
# =========================
# PostgreSQL connection settings, each overridable via environment variables.
DB_CONFIG = {
"host": os.environ.get("DB_HOST", "localhost"),
"port": int(os.environ.get("DB_PORT", 5432)),
@ -28,9 +24,6 @@ DB_CONFIG = {
"password": os.environ.get("DB_PASS", "x"),
}
# =========================
# ENV HELPERS
# =========================
# NOTE(review): _env_list is truncated by the hunk header below; _env_int and
# _env_float are referenced later but are not visible in this view.
def _env_list(name: str, default="es"):
raw = os.environ.get(name)
if raw:
@ -55,37 +48,20 @@ def _env_str(name: str, default=None):
v = os.environ.get(name)
return v if v else default
# =========================
# CONFIG
# =========================
# NOTE(review): duplicate TARGET_LANGS assignments are the two diff sides.
TARGET_LANGS = _env_list("TARGET_LANGS") # defaults to ["es"]
TARGET_LANGS = _env_list("TARGET_LANGS")
BATCH_SIZE = _env_int("TRANSLATOR_BATCH", 8)
ENQUEUE_MAX = _env_int("ENQUEUE", 200)
SLEEP_IDLE = _env_float("TRANSLATOR_SLEEP_IDLE", 5.0)
# CTranslate2 Configuration
CT2_MODEL_PATH = _env_str("CT2_MODEL_PATH", "./models/nllb-ct2")
CT2_DEVICE = _env_str("CT2_DEVICE", "auto") # auto, cpu, cuda
CT2_COMPUTE_TYPE = _env_str("CT2_COMPUTE_TYPE", "auto") # auto, int8, float16, int8_float16
MAX_SRC_TOKENS = _env_int("MAX_SRC_TOKENS", 512)
MAX_NEW_TOKENS_TITLE = _env_int("MAX_NEW_TOKENS_TITLE", 96)
MAX_NEW_TOKENS_BODY = _env_int("MAX_NEW_TOKENS_BODY", 512)
NUM_BEAMS_TITLE = _env_int("NUM_BEAMS_TITLE", 2)
NUM_BEAMS_BODY = _env_int("NUM_BEAMS_BODY", 2)
# HuggingFace model name (used for tokenizer)
UNIVERSAL_MODEL = _env_str("UNIVERSAL_MODEL", "facebook/nllb-200-distilled-600M")
IDENTITY_PAISES_ES = _env_int("IDENTITY_PAISES_ES", 58)
BODY_CHARS_CHUNK = _env_int("BODY_CHARS_CHUNK", 900)
# =========================
# LANG MAP
# =========================
# NOTE(review): two dict headers (NLLB_LANG and LANG_CODE_MAP) share one
# closing brace — one of them is residue from the other diff side.
NLLB_LANG = {
"es": "spa_Latn", "en": "eng_Latn", "fr": "fra_Latn", "de": "deu_Latn",
LANG_CODE_MAP = {
"en": "eng_Latn", "es": "spa_Latn", "fr": "fra_Latn", "de": "deu_Latn",
"it": "ita_Latn", "pt": "por_Latn", "nl": "nld_Latn", "sv": "swe_Latn",
"da": "dan_Latn", "fi": "fin_Latn", "no": "nob_Latn",
"pl": "pol_Latn", "cs": "ces_Latn", "sk": "slk_Latn",
@ -96,286 +72,74 @@ NLLB_LANG = {
"ko": "kor_Hang", "vi": "vie_Latn",
}
def map_to_nllb(code: Optional[str]):
    """Map an ISO-639-1 language code to its NLLB tag.

    Returns None for empty input; unknown codes fall back to the
    ``<code>_Latn`` convention.
    """
    if not code:
        return None
    normalized = code.strip().lower()
    return NLLB_LANG.get(normalized, f"{normalized}_Latn")
# Lazily-initialized singletons for the transformers-pipeline backend;
# populated on first use (see get_translator_components in the region below).
_tokenizer = None
_translator = None
_device = None
def normalize_lang(code: Optional[str], default=None):
    """Lower-case and strip a language code; return *default* unchanged when
    the input is empty or None."""
    if not code:
        return default
    return code.strip().lower()
def _norm(s: str) -> str:
return re.sub(r"\W+", "", (s or "").lower()).strip()
# NOTE(review): the two definitions below are interleaved line-by-line (diff
# residue): _is_repetitive_output (a translation-quality heuristic) and
# get_translator_components (the transformers pipeline loader). Neither
# parses as written; lines are preserved verbatim.
def _is_repetitive_output(text: str, threshold: float = 0.25) -> bool:
"""Detect if translation output is repetitive/low quality.
def get_translator_components():
global _tokenizer, _translator, _device
Args:
text: The translated text to check
threshold: Minimum unique word ratio (default 0.25 = 25% unique words)
if _translator:
return _tokenizer, _translator
Returns:
True if text appears to be repetitive/low quality
"""
if not text or len(text) < 50:
return False
device = 0 if torch.cuda.is_available() else -1
LOG.info(f"Loading model {UNIVERSAL_MODEL} on {'cuda' if device == 0 else 'cpu'}")
# Check for obvious repetitive patterns
repetitive_patterns = [
r'(\b\w+\b)( \1){3,}', # Same word repeated 4+ times
r'(\b\w+ \w+\b)( \1){2,}', # Same 2-word phrase repeated 3+ times
r'de la la ',
r'la línea de la línea',
r'de Internet de Internet',
]
_tokenizer = AutoTokenizer.from_pretrained(UNIVERSAL_MODEL, src_lang="eng_Latn")
model = AutoModelForSeq2SeqLM.from_pretrained(UNIVERSAL_MODEL)
for pattern in repetitive_patterns:
if re.search(pattern, text, re.IGNORECASE):
LOG.warning(f"Detected repetitive pattern: {pattern}")
return True
if device == 0:
model = model.to("cuda")
# Check word diversity
words = text.lower().split()
if len(words) < 10:
return False
unique_ratio = len(set(words)) / len(words)
if unique_ratio < threshold:
LOG.warning(f"Low word diversity: {unique_ratio:.2%} (threshold: {threshold:.2%})")
return True
return False
# =========================
# DB
# =========================
def get_conn():
    """Open a fresh PostgreSQL connection using the module-level DB_CONFIG."""
    connection = psycopg2.connect(**DB_CONFIG)
    return connection
def ensure_indexes(conn):
"""Create the (lang_to, status) and status indexes used by the pending-batch
queries; IF NOT EXISTS makes this idempotent on every worker start."""
with conn.cursor() as cur:
cur.execute("CREATE INDEX IF NOT EXISTS traducciones_lang_to_status_idx ON traducciones (lang_to, status);")
cur.execute("CREATE INDEX IF NOT EXISTS traducciones_status_idx ON traducciones (status);")
conn.commit()
# NOTE(review): the two stray "pass" placeholders below are diff residue left
# behind when neighboring functions were moved to translation_ops.py.
pass # Moved to translation_ops.py
pass # Moved to translation_ops.py
def fetch_pending_batch(conn, lang_to: str, batch: int):
"""Fetch pending translations with row locking to support multiple workers.

Returns up to *batch* DictRows joined with their noticias; rows are flipped
to status='processing' inside the same transaction so concurrent workers
cannot pick them up twice. Returns [] for a non-positive batch size.
"""
if batch <= 0:
return []
# Use FOR UPDATE SKIP LOCKED to allow multiple workers
# Each worker will get different rows without conflicts
with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
cur.execute(
"""
SELECT t.id AS tr_id, t.noticia_id, t.lang_from, t.lang_to,
n.titulo, n.resumen
FROM traducciones t
JOIN noticias n ON n.id=t.noticia_id
WHERE t.lang_to=%s AND t.status='pending'
ORDER BY t.id
LIMIT %s
FOR UPDATE OF t SKIP LOCKED;
""",
(lang_to, batch),
)
rows = cur.fetchall()
# Update status within the same transaction while rows are locked
if rows:
ids = [r["tr_id"] for r in rows]
cur.execute("UPDATE traducciones SET status='processing' WHERE id = ANY(%s)", (ids,))
# Commit releases the row locks; the claimed rows are now 'processing'.
conn.commit()
return rows
# =========================
# LANGUAGE DETECTION
# =========================
def detect_lang(text1: str, text2: str):
    """Best-effort language detection over the first non-empty of two texts.

    Returns an ISO-639-1 code from langdetect, or None when both inputs are
    blank or detection raises.
    """
    sample = (text1 or "").strip()
    if not sample:
        sample = (text2 or "").strip()
    if not sample:
        return None
    try:
        return detect(sample)
    except Exception:
        return None
# =========================
# MODEL LOADING (CTranslate2)
# =========================
# Lazily-initialized singletons for the CTranslate2 backend; populated on
# first call to get_universal_components.
_TOKENIZER = None
_TRANSLATOR = None
_DEVICE = None
def _resolve_device():
    """Resolve the CTranslate2 execution device from CT2_DEVICE.

    'cpu' and 'cuda' are honored as-is; anything else ('auto') probes for
    CUDA devices and falls back to CPU.
    """
    if CT2_DEVICE in ("cpu", "cuda"):
        return CT2_DEVICE
    has_gpu = ctranslate2.get_cuda_device_count() > 0
    return "cuda" if has_gpu else "cpu"
def _ensure_ct2_model():
    """Convert the HuggingFace model to CTranslate2 format if not already on
    disk.

    Returns True when a usable model exists at CT2_MODEL_PATH (either found
    or freshly converted), False on any conversion failure or timeout.
    """
    import os
    import subprocess

    model_dir = CT2_MODEL_PATH
    # Fast path: a previous run already produced model.bin.
    if os.path.isdir(model_dir) and os.path.exists(os.path.join(model_dir, "model.bin")):
        LOG.info("CTranslate2 model already exists at %s", model_dir)
        return True

    LOG.info("CTranslate2 model not found, converting from %s...", UNIVERSAL_MODEL)
    LOG.info("This may take 5-10 minutes on first run...")
    os.makedirs(os.path.dirname(model_dir) or ".", exist_ok=True)

    # 'auto' compute type maps to the converter's int8_float16 quantization.
    quantization = "int8_float16" if CT2_COMPUTE_TYPE == "auto" else CT2_COMPUTE_TYPE
    cmd = [
        "ct2-transformers-converter",
        "--model", UNIVERSAL_MODEL,
        "--output_dir", model_dir,
        "--quantization", quantization,
        "--force",
    ]
    try:
        LOG.info("Running: %s", " ".join(cmd))
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=1800)
    except subprocess.TimeoutExpired:
        LOG.error("Model conversion timed out after 30 minutes")
        return False
    except Exception as e:
        LOG.error("Model conversion error: %s", e)
        return False
    if result.returncode != 0:
        LOG.error("Model conversion failed: %s", result.stderr)
        return False
    LOG.info("Model conversion completed successfully")
    return True
# NOTE(review): diff residue — the CTranslate2 loader (get_universal_components)
# is interleaved with fragments of a transformers-pipeline loader
# (_translator = pipeline(...)); the region does not parse as written.
def get_universal_components():
global _TOKENIZER, _TRANSLATOR, _DEVICE
if _TRANSLATOR:
return _TOKENIZER, _TRANSLATOR
# Ensure CT2 model exists (convert if needed)
if not _ensure_ct2_model():
raise RuntimeError(f"Failed to load/convert CTranslate2 model at {CT2_MODEL_PATH}")
device = _resolve_device()
LOG.info("Loading CTranslate2 model from %s on %s", CT2_MODEL_PATH, device)
_TRANSLATOR = ctranslate2.Translator(
CT2_MODEL_PATH,
_translator = pipeline(
"translation",
model=model,
tokenizer=_tokenizer,
device=device,
compute_type=CT2_COMPUTE_TYPE,
max_length=MAX_SRC_TOKENS,
)
_TOKENIZER = AutoTokenizer.from_pretrained(UNIVERSAL_MODEL)
_DEVICE = device
LOG.info("CTranslate2 model loaded successfully")
return _TOKENIZER, _TRANSLATOR
_device = "cuda" if device == 0 else "cpu"
LOG.info(f"Model loaded on {_device}")
return _tokenizer, _translator
# =========================
# TRANSLATION (CTranslate2)
# =========================
def _safe_src_len(tokenizer):
    """Return a safe source-token budget: MAX_SRC_TOKENS capped 16 tokens
    below the tokenizer's own limit (absurdly large reported limits — a
    common HF sentinel — are clamped to 1024)."""
    reported = getattr(tokenizer, "model_max_length", 1024) or 1024
    if reported > 100000:
        reported = 1024
    return min(MAX_SRC_TOKENS, reported - 16)
# NOTE(review): diff residue — _translate_texts (CTranslate2 batch path) is
# interleaved with translate_texts (pipeline path), the two chunk-splitting
# helpers are merged, and a raw hunk header survives mid-function. Nothing in
# this region parses as written; lines preserved verbatim.
def _translate_texts(src, tgt, texts, beams, max_new_tokens):
"""Translate texts using CTranslate2."""
def translate_texts(src: str, tgt: str, texts: List[str]) -> List[str]:
if not texts:
return []
clean = [(t or "").strip() for t in texts]
if all(not t for t in clean):
return ["" for _ in clean]
tok, translator = get_translator_components()
src_code = LANG_CODE_MAP.get(src, f"{src}_Latn")
tgt_code = LANG_CODE_MAP.get(tgt, "spa_Latn")
results = []
for text in clean:
if not text:
results.append("")
continue
try:
result = translator(text, src_lang=src_code, tgt_lang=tgt_code)
results.append(result[0]["translation_text"])
except Exception as e:
LOG.warning(f"Translation error: {e}")
results.append(text)
return results
tok, translator = get_universal_components()
src_code = map_to_nllb(src)
tgt_code = map_to_nllb(tgt)
# Set source language on tokenizer
try:
tok.src_lang = src_code
except Exception:
pass
safe_len = _safe_src_len(tok)
max_new = max(16, min(int(max_new_tokens), MAX_NEW_TOKENS_BODY))
# Tokenize: convert text to tokens
sources = []
for t in clean:
if t:
ids = tok.encode(t, truncation=True, max_length=safe_len)
tokens = tok.convert_ids_to_tokens(ids)
sources.append(tokens)
else:
sources.append([])
# Target language prefix for NLLB
target_prefix = [[tgt_code]] * len(sources)
# Translate with CTranslate2
start = time.time()
results = translator.translate_batch(
sources,
target_prefix=target_prefix,
beam_size=beams,
max_decoding_length=max_new,
repetition_penalty=2.5, # Increased from 1.2 to prevent loops
no_repeat_ngram_size=3, # Prevent 3-gram repetition
)
dt = time.time() - start
# Decode results
translated = []
total_tokens = 0
for result, src_tokens in zip(results, sources):
if result.hypotheses:
# Skip the first token (language prefix)
tokens = result.hypotheses[0][1:]
total_tokens += len(tokens) + len(src_tokens)
text = tok.decode(tok.convert_tokens_to_ids(tokens))
translated.append(text.strip())
else:
translated.append("")
if total_tokens > 0:
LOG.info(" → tokens=%d tiempo=%.2fs velocidad=%d tok/s",
total_tokens, dt, int(total_tokens / dt) if dt > 0 else 0)
return translated
def _split_body_into_chunks(text: str) -> List[str]:
def split_body_into_chunks(text: str) -> List[str]:
text = (text or "").strip()
if len(text) <= BODY_CHARS_CHUNK:
return [text] if text else []
parts = re.split(r'(\n\n+|(?<=[\.\!\?؛؟。])\s+)', text)
chunks = []
current = ""
for part in parts:
if not part:
continue
@ -387,260 +151,145 @@ def _split_body_into_chunks(text: str) -> List[str]:
current = part
if current.strip():
chunks.append(current.strip())
if not chunks:
return [text]
return chunks
return chunks if chunks else [text]
def translate_body_long(src: str, tgt: str, body: str) -> str:
body = (body or "").strip()
if not body:
return ""
chunks = _split_body_into_chunks(body)
chunks = split_body_into_chunks(body)
if len(chunks) == 1:
translated = _translate_texts(src, tgt, [body], NUM_BEAMS_BODY, MAX_NEW_TOKENS_BODY)[0]
return translated.strip()
return translate_texts(src, tgt, [body])[0].strip()
translated_chunks = []
for ch in chunks:
tr = _translate_texts(src, tgt, [ch], NUM_BEAMS_BODY, MAX_NEW_TOKENS_BODY)[0]
translated_chunks.append(tr.strip())
return "\n\n".join(c for c in translated_chunks if c)
tr = translate_texts(src, tgt, [ch])[0]
translated_chunks.append(tr)
return " ".join(translated_chunks)
def normalize_lang(lang: Optional[str], default: str = "es") -> Optional[str]:
    """Normalize to a two-letter lower-case language code.

    Empty or None input yields *default* (unchanged).
    """
    if not lang:
        return default
    code = lang.strip().lower()[:2]
    return code if code else default
def detect_lang(text: str) -> str:
    """Detect the language of *text*, defaulting to English for empty, very
    short (< 10 chars), or undetectable input."""
    if text and len(text) >= 10:
        try:
            return detect(text)
        except Exception:
            pass
    return "en"
# =========================
# BATCH PROCESS
# =========================
# NOTE(review): diff residue — two versions of process_batch (CTranslate2 vs
# pipeline) are interleaved here, and further down process_entity_summaries is
# interleaved with the tail of the pipeline-based process_batch. The region
# does not parse as written; lines preserved verbatim.
def process_batch(conn, rows):
todo = []
done = []
errors = []
for r in rows:
lang_to = normalize_lang(r["lang_to"], "es") or "es"
lang_from = (
normalize_lang(r["lang_from"])
or detect_lang(r["titulo"], r["resumen"])
or "en"
)
titulo = (r["titulo"] or "").strip()
resumen = (r["resumen"] or "").strip()
if map_to_nllb(lang_from) == map_to_nllb(lang_to):
done.append((titulo, resumen, lang_from, r["tr_id"]))
else:
todo.append({
"tr_id": r["tr_id"],
"lang_from": lang_from,
"lang_to": lang_to,
"titulo": titulo,
"resumen": resumen,
})
# NOTE(review): the lines below re-derive the same fields via r.get(...) —
# they are the other diff side of this loop body.
lang_to = normalize_lang(r.get("lang_to"), "es") or "es"
lang_from = normalize_lang(r.get("lang_from")) or detect_lang(r.get("titulo") or "")
titulo = (r.get("titulo") or "").strip()
resumen = (r.get("resumen") or "").strip()
if lang_from == lang_to:
continue
todo.append({
"tr_id": r.get("tr_id"),
"lang_from": lang_from,
"lang_to": lang_to,
"titulo": titulo,
"resumen": resumen,
})
if not todo:
return
from collections import defaultdict
groups = defaultdict(list)
for item in todo:
key = (item["lang_from"], item["lang_to"])
groups[key].append(item)
for (lang_from, lang_to), items in groups.items():
LOG.info("Translating %s -> %s (%d items)", lang_from, lang_to, len(items))
LOG.info(f"Translating {lang_from} -> {lang_to} ({len(items)} items)")
titles = [i["titulo"] for i in items]
try:
tt = _translate_texts(
lang_from,
lang_to,
titles,
NUM_BEAMS_TITLE,
MAX_NEW_TOKENS_TITLE,
)
bodies_translated: List[str] = []
for i in items:
bodies_translated.append(
translate_body_long(lang_from, lang_to, i["resumen"])
)
for i, ttr, btr in zip(items, tt, bodies_translated):
ttr = (ttr or "").strip()
btr = (btr or "").strip()
if not ttr or _norm(ttr) == _norm(i["titulo"]):
ttr = i["titulo"]
if not btr or _norm(btr) == _norm(i["resumen"]):
btr = i["resumen"]
# CLEANING: Remove <unk> tokens
if ttr:
ttr = ttr.replace("<unk>", "").replace(" ", " ").strip()
if btr:
btr = btr.replace("<unk>", "").replace(" ", " ").strip()
# VALIDATION: Check for repetitive output
if _is_repetitive_output(ttr) or _is_repetitive_output(btr):
LOG.warning(f"Rejecting repetitive translation for tr_id={i['tr_id']}")
errors.append(("Repetitive output detected", i["tr_id"]))
continue
done.append((ttr, btr, lang_from, i["tr_id"]))
except Exception as e:
err = str(e)[:800]
LOG.exception("Error translating %s -> %s: %s", lang_from, lang_to, err)
for i in items:
errors.append((err, i["tr_id"]))
with conn.cursor() as cur:
if done:
execute_values(
cur,
"""
UPDATE traducciones AS t
SET titulo_trad=v.titulo_trad,
resumen_trad=v.resumen_trad,
lang_from=COALESCE(t.lang_from, v.lang_from),
status='done',
error=NULL
FROM (VALUES %s) AS v(titulo_trad,resumen_trad,lang_from,id)
WHERE t.id=v.id;
""",
done,
)
# --- NEW: Persist stats ---
# Insert a record for each translated item into translation_stats
# We need the language 'lang_to'. In this batch, lang_to is uniform for the group usually,
# but let's extract it from the 'done' items structure if we had it, or pass it down.
# In process_batch, we iterate groups.
# 'done' list here is flattened from multiple groups?
# process_batch logic:
# 1. 'done' checks map_to_nllb identity (already done?) -> these have lang_to from row?
# 2. 'groups' loop -> translates -> appends to 'done' with lang_from.
#
# Wait, 'done' list doesn't have lang_to in the tuple: (titulo, resumen, lang_from, tr_id).
# We need to change the 'done' collection to include lang_to OR we insert based on tr_id.
# Let's verify process_batch logic.
# rows has all info.
# define a mapping tr_id -> lang_to
tr_map = {r["tr_id"]: r["lang_to"] for r in rows}
stats_data = []
for item in done:
# item is (titulo, resumen, lang_from, tr_id)
lang_from = item[2]
lang_to = tr_map.get(item[3], "es")
stats_data.append((lang_from, lang_to))
execute_values(
cur,
"INSERT INTO translation_stats (lang_from, lang_to) VALUES %s",
stats_data
)
# --------------------------
if errors:
execute_values(
cur,
"""
UPDATE traducciones AS t
SET status='error', error=v.error
FROM (VALUES %s) AS v(error,id)
WHERE t.id=v.id;
""",
errors,
)
conn.commit()
def process_entity_summaries(conn):
"""Translate pending entity summaries from Wikipedia."""
from cache import cache_del
LOG.info("DEBUG: Checking for pending entity summaries...")
with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
cur.execute("""
SELECT id, entity_name, summary, summary_en
FROM entity_images
WHERE status_es = 'pending'
LIMIT 20
FOR UPDATE SKIP LOCKED;
""")
rows = cur.fetchall()
# NOTE(review): from here, process_entity_summaries is interleaved with the
# tail of the pipeline-based process_batch (items / lang_from / translated_
# titles belong to the other function).
translated_titles = translate_texts(lang_from, lang_to, titles)
if not rows:
return False
LOG.info("DEBUG: Found %d pending entity summaries to process", len(rows))
translated_bodies = []
for i in items:
body = (i["resumen"] or "").strip()
if body:
tr = translate_body_long(lang_from, lang_to, body)
translated_bodies.append(tr)
else:
translated_bodies.append("")
for r in rows:
entity_id = r["id"]
name = r["entity_name"]
text = r["summary_en"] or r["summary"]
cursor = conn.cursor()
for item, tt, tb in zip(items, translated_titles, translated_bodies):
tt = (tt or "").strip()
tb = (tb or "").strip()
if not tt:
tt = item["titulo"]
if not tb:
tb = item["resumen"]
if not text:
cur.execute("UPDATE entity_images SET status_es = 'none' WHERE id = %s", (entity_id,))
continue
try:
# English -> Spanish
translated = translate_body_long('en', 'es', text)
if translated:
cur.execute("""
UPDATE entity_images
SET summary_es = %s, status_es = 'done'
WHERE id = %s
""", (translated, entity_id))
# Invalidate cache
cache_del(f"wiki:data:{name.lower()}")
LOG.info(" → Translated entity summary: %s", name)
else:
cur.execute("UPDATE entity_images SET status_es = 'error' WHERE id = %s", (entity_id,))
cursor.execute("""
UPDATE traducciones
SET titulo_trad = %s, resumen_trad = %s, lang_to = %s
WHERE id = %s
""", (tt, tb, lang_to, item["tr_id"]))
except Exception as e:
LOG.error("Error translating entity summary [%s]: %s", name, e)
cur.execute("UPDATE entity_images SET status_es = 'error' WHERE id = %s", (entity_id,))
LOG.error(f"Update error: {e}")
conn.commit()
return True
cursor.close()
LOG.info(f"Translated {len(items)} items")
def fetch_pending_translations(conn):
"""For each configured target language, fetch up to BATCH_SIZE rows whose
title or summary translation is still missing (newest first) and hand them
to process_batch.

NOTE(review): unlike fetch_pending_batch, this query takes no row locks and
does not mark rows 'processing' — presumably intended for a single-worker
deployment; confirm before running multiple workers.
"""
cursor = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
for lang in TARGET_LANGS:
cursor.execute("""
SELECT t.id as tr_id, t.lang_from, t.lang_to,
n.titulo, n.resumen, n.id as noticia_id
FROM traducciones t
JOIN noticias n ON n.id = t.noticia_id
WHERE t.lang_to = %s
AND (t.titulo_trad IS NULL OR t.resumen_trad IS NULL)
ORDER BY n.fecha DESC
LIMIT %s
""", (lang, BATCH_SIZE))
rows = cursor.fetchall()
if rows:
LOG.info(f"Found {len(rows)} pending translations for {lang}")
process_batch(conn, rows)
cursor.close()
def connect_db():
    """Create a new database connection from the module-level DB_CONFIG."""
    db_connection = psycopg2.connect(**DB_CONFIG)
    return db_connection
# =========================
# MAIN LOOP
# =========================
# NOTE(review): diff residue — two versions of the worker main loop are
# interleaved here: the CTranslate2 loop (get_universal_components +
# fetch_pending_batch) and the pipeline loop (connect_db +
# fetch_pending_translations). The region does not parse as written.
def main():
LOG.info("Translator worker iniciado (CTranslate2)")
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
get_universal_components()
LOG.info("Translation worker started (transformers)")
get_translator_components()
while True:
any_work = False
with get_conn() as conn:
ensure_indexes(conn)
# 1. Process entity summaries (Wikipedia) -> REMOVED per user request
# Logic moved out to keep translator focused on news ONLY.
# try:
# if process_entity_summaries(conn):
# any_work = True
# except Exception as e:
# LOG.error("Error in process_entity_summaries: %s", e)
# 2. Process news translations
for tgt in TARGET_LANGS:
while True:
rows = fetch_pending_batch(conn, tgt, BATCH_SIZE)
if not rows:
break
any_work = True
LOG.info("[%s] %d elementos", tgt, len(rows))
process_batch(conn, rows)
if not any_work:
time.sleep(SLEEP_IDLE)
try:
conn = connect_db()
fetch_pending_translations(conn)
conn.close()
except Exception as e:
LOG.error(f"Error: {e}")
time.sleep(30)
if __name__ == "__main__":
main()