go integration and wikipedia
This commit is contained in:
parent
47a252e339
commit
ee90335b92
7828 changed files with 1307913 additions and 20807 deletions
405
workers/ctranslator_worker.py
Normal file
405
workers/ctranslator_worker.py
Normal file
|
|
@ -0,0 +1,405 @@
|
|||
import html
import logging
import os
import re
import time
from collections import defaultdict
from typing import List, Optional

import psycopg2
import psycopg2.extras
from langdetect import detect, DetectorFactory

import ctranslate2
from transformers import AutoTokenizer
|
||||
|
||||
DetectorFactory.seed = 0
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s: %(message)s")
|
||||
LOG = logging.getLogger("translator_ct2")
|
||||
|
||||
TRANSLATOR_ID = os.environ.get("TRANSLATOR_ID", "")
|
||||
TRANSLATOR_TOTAL = int(os.environ.get("TRANSLATOR_TOTAL", "1"))
|
||||
|
||||
def clean_text(text: str) -> str:
    """Strip HTML tags and entities from model output and normalize whitespace.

    Args:
        text: Raw (possibly HTML-laden) text; None/empty yields "".

    Returns:
        Plain text with tags removed, HTML entities decoded, and runs of
        whitespace collapsed to single spaces.
    """
    if not text:
        return ""
    # Remove markup tags first so entity decoding cannot create new ones.
    text = re.sub(r'<[^>]+>', '', text)
    # Drop the tokenizer's unknown-token marker.
    text = text.replace('<unk>', '')
    # BUG FIX: the original chained no-op .replace() calls ('&' -> '&', etc.)
    # because the entity names were lost; html.unescape decodes all entities
    # (&amp;, &lt;, &gt;, &quot;, &nbsp;, ...) correctly.
    text = html.unescape(text)
    # \s matches NBSP (U+00A0) too, so decoded &nbsp; collapses as well.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()
|
||||
|
||||
DB_CONFIG = {
|
||||
"host": os.environ.get("DB_HOST", "localhost"),
|
||||
"port": int(os.environ.get("DB_PORT", 5432)),
|
||||
"dbname": os.environ.get("DB_NAME", "rss"),
|
||||
"user": os.environ.get("DB_USER", "rss"),
|
||||
"password": os.environ.get("DB_PASS", "x"),
|
||||
}
|
||||
|
||||
def _env_list(name: str, default="es"):
|
||||
raw = os.environ.get(name)
|
||||
if raw:
|
||||
return [s.strip() for s in raw.split(",") if s.strip()]
|
||||
return [default]
|
||||
|
||||
def _env_int(name: str, default: int = 8):
|
||||
v = os.environ.get(name)
|
||||
try:
|
||||
return int(v)
|
||||
except Exception:
|
||||
return default
|
||||
|
||||
def _env_str(name: str, default=None):
|
||||
v = os.environ.get(name)
|
||||
return v if v else default
|
||||
|
||||
TARGET_LANGS = _env_list("TARGET_LANGS")
|
||||
BATCH_SIZE = _env_int("TRANSLATOR_BATCH", 8)
|
||||
MAX_SRC_TOKENS = _env_int("MAX_SRC_TOKENS", 512)
|
||||
MAX_NEW_TOKENS = _env_int("MAX_NEW_TOKENS", 512)
|
||||
|
||||
CT2_MODEL_PATH = _env_str("CT2_MODEL_PATH", "/app/models/nllb-ct2")
|
||||
CT2_DEVICE = _env_str("CT2_DEVICE", "cpu")
|
||||
CT2_COMPUTE_TYPE = _env_str("CT2_COMPUTE_TYPE", "int8")
|
||||
UNIVERSAL_MODEL = _env_str("UNIVERSAL_MODEL", "facebook/nllb-200-distilled-600M")
|
||||
BODY_CHARS_CHUNK = _env_int("BODY_CHARS_CHUNK", 900)
|
||||
|
||||
LANG_CODE_MAP = {
|
||||
"en": "eng_Latn", "es": "spa_Latn", "fr": "fra_Latn", "de": "deu_Latn",
|
||||
"it": "ita_Latn", "pt": "por_Latn", "nl": "nld_Latn", "sv": "swe_Latn",
|
||||
"da": "dan_Latn", "fi": "fin_Latn", "no": "nob_Latn",
|
||||
"pl": "pol_Latn", "cs": "ces_Latn", "sk": "slk_Latn",
|
||||
"sl": "slv_Latn", "hu": "hun_Latn", "ro": "ron_Latn",
|
||||
"el": "ell_Grek", "ru": "rus_Cyrl", "uk": "ukr_Cyrl",
|
||||
"tr": "tur_Latn", "ar": "arb_Arab", "fa": "pes_Arab",
|
||||
"he": "heb_Hebr", "zh": "zho_Hans", "ja": "jpn_Jpan",
|
||||
"ko": "kor_Hang", "vi": "vie_Latn",
|
||||
}
|
||||
|
||||
_tokenizer = None
|
||||
_translator = None
|
||||
|
||||
def ensure_model():
    """Lazily load the CTranslate2 translator and HF tokenizer (idempotent).

    If no converted model exists at CT2_MODEL_PATH, the Hugging Face model is
    converted first via convert_model().
    """
    global _tokenizer, _translator

    if _translator:
        # Already initialized on a previous call.
        return

    model_path = CT2_MODEL_PATH
    if not os.path.exists(os.path.join(model_path, "model.bin")):
        LOG.info(f"CTranslate2 model not found at {model_path}, converting from {UNIVERSAL_MODEL}...")
        convert_model()

    LOG.info(f"Loading CTranslate2 model from {model_path} on {CT2_DEVICE}")
    _translator = ctranslate2.Translator(
        model_path, device=CT2_DEVICE, compute_type=CT2_COMPUTE_TYPE
    )
    # Tokenizer comes from the original HF model; it is compatible with the
    # converted CT2 weights.
    _tokenizer = AutoTokenizer.from_pretrained(UNIVERSAL_MODEL)
    LOG.info("CTranslate2 model loaded successfully")
|
||||
|
||||
def convert_model():
    """Convert the Hugging Face model to CTranslate2 format on disk.

    Runs the `ct2-transformers-converter` CLI into CT2_MODEL_PATH with the
    configured quantization (int8 when CT2_COMPUTE_TYPE is "auto").

    Raises:
        RuntimeError: if the converter exits non-zero (stderr is included).
        subprocess.TimeoutExpired: if conversion exceeds 30 minutes.
    """
    import subprocess

    model_path = CT2_MODEL_PATH
    os.makedirs(model_path, exist_ok=True)

    # "auto" is not a valid quantization for the converter; default to int8.
    quantization = CT2_COMPUTE_TYPE if CT2_COMPUTE_TYPE != "auto" else "int8"

    cmd = [
        "ct2-transformers-converter",
        "--model", UNIVERSAL_MODEL,
        "--output_dir", model_path,
        "--quantization", quantization,
        "--force",
    ]

    LOG.info(f"Running: {' '.join(cmd)}")
    result = subprocess.run(cmd, capture_output=True, text=True, timeout=1800)

    if result.returncode != 0:
        LOG.error(f"Model conversion failed: {result.stderr}")
        # Include (truncated) stderr so the cause survives in the traceback,
        # not only in the logs.
        raise RuntimeError(f"Failed to convert model: {(result.stderr or '').strip()[:500]}")

    LOG.info("Model conversion completed")
|
||||
|
||||
def translate_texts(src: str, tgt: str, texts: List[str]) -> List[str]:
    """Translate a batch of texts from `src` to `tgt` with the CT2 NLLB model.

    Args:
        src: 2-letter source language code (mapped to an NLLB code).
        tgt: 2-letter target language code.
        texts: texts to translate; None/empty entries yield "" in the output.

    Returns:
        One translated string per input, "" where translation failed.
    """
    if not texts:
        return []

    ensure_model()

    clean = [(t or "").strip() for t in texts]
    if all(not t for t in clean):
        # Nothing to translate at all; keep positional correspondence.
        return ["" for _ in clean]

    # Unknown source codes assume Latin script; unknown targets fall back to Spanish.
    src_code = LANG_CODE_MAP.get(src, f"{src}_Latn")
    tgt_code = LANG_CODE_MAP.get(tgt, "spa_Latn")

    try:
        # NLLB tokenizers need the source language set before encoding.
        _tokenizer.src_lang = src_code
    except Exception:
        pass

    # Tokenize each text into token strings (CT2 consumes tokens, not ids).
    sources = []
    for t in clean:
        if t:
            ids = _tokenizer.encode(t, truncation=True, max_length=MAX_SRC_TOKENS)
            tokens = _tokenizer.convert_ids_to_tokens(ids)
            sources.append(tokens)
        else:
            sources.append([])

    # Force decoding to start with the target-language token.
    target_prefix = [[tgt_code]] * len(sources)

    results = _translator.translate_batch(
        sources,
        target_prefix=target_prefix,
        beam_size=2,
        max_decoding_length=MAX_NEW_TOKENS,
        repetition_penalty=2.0,
        no_repeat_ngram_size=3,
    )

    # Decode the best hypothesis of each result; the hypothesis shape differs
    # across ctranslate2 versions, hence the defensive isinstance checks.
    translated = []
    for result in results:
        try:
            if result.hypotheses and len(result.hypotheses) > 0:
                hyp = result.hypotheses[0]
                if isinstance(hyp, list) and len(hyp) > 0:
                    first_hyp = hyp[0]
                    if isinstance(first_hyp, dict) and "token_ids" in first_hyp:
                        tokens = first_hyp["token_ids"]
                        text = _tokenizer.decode(tokens)
                        translated.append(text.strip())
                    elif isinstance(first_hyp, str):
                        # NOTE(review): hyp[0] appears to be the forced
                        # target-language prefix token, hence the skip —
                        # confirm against the ctranslate2 output format.
                        token_strings = hyp[1:] if len(hyp) > 1 else []
                        if token_strings:
                            text = _tokenizer.convert_tokens_to_string(token_strings)
                            translated.append(text.strip())
                        else:
                            translated.append("")
                    else:
                        translated.append("")
                else:
                    translated.append("")
            else:
                translated.append("")
        except Exception as e:
            LOG.error(f"Error processing result: {e}")
            translated.append("")

    return translated
|
||||
|
||||
def split_body_into_chunks(text: str) -> List[str]:
    """Split text into chunks of at most ~BODY_CHARS_CHUNK characters.

    Splits preferentially at paragraph breaks and sentence-ending punctuation
    (Latin, Arabic and CJK). Short inputs come back as a single chunk; empty
    input yields [].
    """
    text = (text or "").strip()
    if len(text) <= BODY_CHARS_CHUNK:
        return [text] if text else []

    # Capturing split keeps the separators so chunk lengths stay accurate.
    pieces = re.split(r'(\n\n+|(?<=[\.\!\?؛؟。])\s+)', text)
    chunks: List[str] = []
    buffer = ""

    for piece in pieces:
        if not piece:
            continue
        if len(buffer) + len(piece) > BODY_CHARS_CHUNK:
            if buffer.strip():
                chunks.append(buffer.strip())
            buffer = piece
        else:
            buffer += piece

    if buffer.strip():
        chunks.append(buffer.strip())

    return chunks if chunks else [text]
|
||||
|
||||
def translate_body_long(src: str, tgt: str, body: str) -> str:
    """Translate a long body by chunking it and joining the translated pieces."""
    body = (body or "").strip()
    if not body:
        return ""

    chunks = split_body_into_chunks(body)
    if len(chunks) == 1:
        # Single chunk: translate the whole body in one call.
        return translate_texts(src, tgt, [body])[0]

    # Translate chunk by chunk to stay within the model's token budget.
    return " ".join(translate_texts(src, tgt, [chunk])[0] for chunk in chunks)
|
||||
|
||||
def normalize_lang(lang: Optional[str], default: str = "es") -> Optional[str]:
    """Normalize a language tag to its 2-letter lowercase prefix.

    Returns `default` when the input is None, empty, or all whitespace.
    """
    if not lang:
        return default
    code = lang.strip().lower()[:2]
    return code or default
|
||||
|
||||
def detect_lang(text: str) -> str:
    """Best-effort language detection; falls back to 'en' for short or undetectable text."""
    fallback = "en"
    if not text or len(text) < 10:
        # langdetect is unreliable on very short inputs.
        return fallback
    try:
        result = detect(text)
    except Exception:
        return fallback
    return result
|
||||
|
||||
def process_batch(conn, rows):
    """Translate a batch of pending `traducciones` rows and persist results.

    Rows whose source language already equals the target are completed by
    copying the original text. The rest are locked (locked_at) in one fast
    commit so concurrent workers skip them, grouped by (lang_from, lang_to),
    translated, and committed one row at a time. On a group-level failure the
    affected rows are marked status='error' to avoid an infinite retry loop.

    Args:
        conn: open psycopg2 connection (commits/rollbacks are done here).
        rows: dict-like rows with tr_id, lang_from, lang_to, titulo, resumen.
    """
    todo = []

    for r in rows:
        lang_to = normalize_lang(r.get("lang_to"), "es") or "es"
        lang_from = normalize_lang(r.get("lang_from")) or detect_lang(r.get("titulo") or "")

        titulo = (r.get("titulo") or "").strip()
        resumen = (r.get("resumen") or "").strip()

        if lang_from == lang_to:
            # Mark as done and copy original text if languages match
            cursor = conn.cursor()
            cursor.execute("""
                UPDATE traducciones
                SET titulo_trad = %s, resumen_trad = %s, status = 'done'
                WHERE id = %s
            """, (titulo, resumen, r.get("tr_id")))
            conn.commit()
            cursor.close()
            continue

        todo.append({
            "tr_id": r.get("tr_id"),
            "lang_from": lang_from,
            "lang_to": lang_to,
            "titulo": titulo,
            "resumen": resumen,
        })

    if not todo:
        return

    # 1. FAST LOCKING: commit locked_at immediately to inform other workers.
    # psycopg2 adapts a Python list to a Postgres array, so no string-built SQL.
    tr_ids = [item["tr_id"] for item in todo]
    cursor = conn.cursor()
    cursor.execute(
        "UPDATE traducciones SET locked_at = NOW() WHERE id = ANY(%s)",
        (tr_ids,),
    )
    conn.commit()
    cursor.close()

    # Group by language pair so each pair is translated in one model batch.
    groups = defaultdict(list)
    for item in todo:
        groups[(item["lang_from"], item["lang_to"])].append(item)

    for (lang_from, lang_to), items in groups.items():
        LOG.info(f"Translating {lang_from} -> {lang_to} ({len(items)} items)")

        try:
            titles = [i["titulo"] for i in items]
            translated_titles = translate_texts(lang_from, lang_to, titles)

            for item, tt in zip(items, translated_titles):
                body = (item["resumen"] or "").strip()
                tb = ""
                if body:
                    try:
                        tb = translate_body_long(lang_from, lang_to, body)
                    except Exception as e:
                        LOG.error(f"Body translation error for ID {item['tr_id']}: {e}")
                        tb = item["resumen"]

                tt = clean_text((tt or "").strip())
                tb = clean_text((tb or "").strip())

                # Fall back to the untranslated text rather than storing blanks.
                if not tt:
                    tt = item["titulo"]
                if not tb:
                    tb = item["resumen"]

                # 2. INDIVIDUAL COMMIT: save each item as it's done so other
                # workers (and readers) see progress immediately.
                try:
                    cursor = conn.cursor()
                    cursor.execute("""
                        UPDATE traducciones
                        SET titulo_trad = %s, resumen_trad = %s, status = 'done', locked_at = NULL
                        WHERE id = %s
                    """, (tt, tb, item["tr_id"]))
                    conn.commit()
                    cursor.close()
                except Exception as e:
                    LOG.error(f"Update error for ID {item['tr_id']}: {e}")
                    conn.rollback()

            LOG.info(f"Finished group {lang_from} -> {lang_to}")

        except Exception as e:
            LOG.error(f"Batch group error {lang_from} -> {lang_to}: {e}")
            # Mark these as error to avoid infinite loop if it's a model crash.
            # BUG FIX: the original built this statement inside a plain (non-f)
            # string, so the literal "{','.join(...)}" text reached Postgres
            # and the query always failed; use array adaptation instead.
            try:
                cursor = conn.cursor()
                cursor.execute(
                    "UPDATE traducciones SET status = 'error', locked_at = NULL WHERE id = ANY(%s)",
                    ([i["tr_id"] for i in items],),
                )
                conn.commit()
                cursor.close()
            except Exception:
                conn.rollback()
|
||||
|
||||
def fetch_pending_translations(conn):
    """Fetch and process untranslated rows for each configured target language.

    Uses FOR UPDATE SKIP LOCKED plus a 10-minute locked_at timeout so several
    worker instances can poll the same table without processing the same rows.

    Args:
        conn: open psycopg2 connection, shared with process_batch().
    """
    cursor = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)

    for lang in TARGET_LANGS:
        # NOTE: removed an unused `worker_id` local from the original.
        cursor.execute("""
            SELECT t.id as tr_id, t.lang_from, t.lang_to,
                   n.titulo, n.resumen, n.id as noticia_id
            FROM traducciones t
            JOIN noticias n ON n.id = t.noticia_id
            WHERE t.lang_to = %s
              AND (t.titulo_trad IS NULL OR t.resumen_trad IS NULL)
              AND (t.locked_at IS NULL OR t.locked_at < NOW() - INTERVAL '10 minutes')
            ORDER BY n.fecha DESC
            LIMIT %s
            FOR UPDATE SKIP LOCKED
        """, (lang, BATCH_SIZE))

        rows = cursor.fetchall()
        if rows:
            LOG.info(f"Found {len(rows)} pending translations for {lang}")
            process_batch(conn, rows)

    cursor.close()
|
||||
|
||||
def connect_db():
    """Open a new PostgreSQL connection using the module-level DB_CONFIG."""
    connection = psycopg2.connect(**DB_CONFIG)
    return connection
|
||||
|
||||
def main():
    """Worker entry point: poll for pending translations forever.

    Connects per iteration, processes pending rows, and sleeps 30s between
    polls. Errors are logged and the loop continues.
    """
    LOG.info(f"CTranslate2 translator worker started (device={CT2_DEVICE}, instances={TRANSLATOR_TOTAL})")
    ensure_model()

    while True:
        conn = None
        try:
            conn = connect_db()
            fetch_pending_translations(conn)
        except Exception as e:
            LOG.error(f"Error: {e}")
        finally:
            # BUG FIX: the original skipped conn.close() when processing
            # raised, leaking a connection per failed iteration.
            if conn is not None:
                try:
                    conn.close()
                except Exception:
                    pass

        time.sleep(30)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
109
workers/langdetect_worker.py
Normal file
109
workers/langdetect_worker.py
Normal file
|
|
@ -0,0 +1,109 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Language Detection Worker
|
||||
Detects and updates the language of news items in the database.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import logging
|
||||
from collections import Counter
|
||||
|
||||
import psycopg2
|
||||
from psycopg2.extras import RealDictCursor
|
||||
from langdetect import detect, LangDetectException
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format='[%(asctime)s] %(levelname)s - %(message)s',
|
||||
datefmt='%Y-%m-%d %H:%M:%S'
|
||||
)
|
||||
LOG = logging.getLogger(__name__)
|
||||
|
||||
DB_CONFIG = {
|
||||
'host': os.getenv('DB_HOST', 'db'),
|
||||
'port': int(os.getenv('DB_PORT', 5432)),
|
||||
'database': os.getenv('DB_NAME', 'rss'),
|
||||
'user': os.getenv('DB_USER', 'rss'),
|
||||
'password': os.getenv('DB_PASS', 'rss')
|
||||
}
|
||||
|
||||
BATCH_SIZE = int(os.getenv('LANG_DETECT_BATCH', '1000'))
|
||||
SLEEP_INTERVAL = int(os.getenv('LANG_DETECT_SLEEP', '60'))
|
||||
|
||||
def get_db_connection():
    """Create a fresh PostgreSQL connection from DB_CONFIG."""
    connection = psycopg2.connect(**DB_CONFIG)
    return connection
|
||||
|
||||
def detect_language(text):
    """Return the detected ISO language code, or None for short/undetectable text."""
    if not text or len(text.strip()) < 10:
        # Too little signal for langdetect to be reliable.
        return None
    try:
        detected = detect(text)
    except LangDetectException:
        detected = None
    return detected
|
||||
|
||||
def process_batch(conn):
    """Detect and store the language for news rows whose `lang` is missing.

    Fetches up to BATCH_SIZE rows with NULL/empty lang (newest first), runs
    language detection on title+summary, and writes the result back.

    Returns:
        Number of rows whose language was successfully detected and updated.
    """
    cursor = conn.cursor(cursor_factory=RealDictCursor)

    # ONLY pick items where lang is NULL or empty
    cursor.execute("""
        SELECT id, titulo, resumen
        FROM noticias
        WHERE lang IS NULL OR TRIM(lang) = ''
        ORDER BY fecha DESC
        LIMIT %s
    """, (BATCH_SIZE,))

    rows = cursor.fetchall()
    if not rows:
        return 0

    lang_stats = Counter()
    updated = 0

    for row in rows:
        title = (row['titulo'] or "").strip()
        summary = (row['resumen'] or "").strip()
        sample = f"{title} {summary}".strip()

        lang = detect_language(sample)
        if not lang:
            # Undetectable rows keep a NULL lang and will be retried later.
            continue

        cursor.execute("""
            UPDATE noticias SET lang = %s WHERE id = %s
        """, (lang, row['id']))
        lang_stats[lang] += 1
        updated += 1

    conn.commit()
    cursor.close()

    if updated > 0:
        LOG.info(f"Updated {updated} news languages: {dict(lang_stats)}")

    return updated
|
||||
|
||||
def main():
    """Run the language-detection loop forever.

    Sleeps SLEEP_INTERVAL when nothing is pending, 1s between productive
    batches, and 10s after an error. The connection is always closed, even
    when a batch fails.
    """
    LOG.info("Language detection worker started")

    while True:
        conn = None
        processed = None  # None signals "iteration errored"
        try:
            conn = get_db_connection()
            processed = process_batch(conn)
        except Exception as e:
            LOG.error(f"Error: {e}")
        finally:
            # BUG FIX: the original skipped conn.close() on exception,
            # leaking a connection per failed iteration.
            if conn is not None:
                try:
                    conn.close()
                except Exception:
                    pass

        if processed is None:
            time.sleep(10)
        elif processed == 0:
            LOG.info("No more news to process, sleeping...")
            time.sleep(SLEEP_INTERVAL)
        else:
            time.sleep(1)
|
||||
|
|
@ -3,7 +3,8 @@ import time
|
|||
import logging
|
||||
import re
|
||||
import string
|
||||
from typing import List, Tuple
|
||||
import json
|
||||
from typing import List, Tuple, Set, Dict
|
||||
from collections import Counter
|
||||
|
||||
import psycopg2
|
||||
|
|
@ -46,6 +47,49 @@ ENT_LABELS = {
|
|||
"MISC": "tema",
|
||||
}
|
||||
|
||||
# ==========================================================
|
||||
# Configuración global de entidades (Synonyms / Blacklist)
|
||||
# ==========================================================
|
||||
ENTITY_CONFIG = {"blacklist": [], "synonyms": {}}
|
||||
REVERSE_SYNONYMS = {}
|
||||
|
||||
def load_entity_config():
    """Load entity_config.json (blacklist + synonyms) into the module globals.

    Also rebuilds REVERSE_SYNONYMS so both every alias and every canonical
    name map (lowercased) to the canonical form. Missing file is a no-op;
    parse errors are logged and leave the previous config in place.
    """
    global ENTITY_CONFIG, REVERSE_SYNONYMS
    path = "entity_config.json"
    if not os.path.exists(path):
        return
    try:
        with open(path, "r", encoding="utf-8") as f:
            ENTITY_CONFIG = json.load(f)

        # Inverse map for fast synonym lookups.
        REVERSE_SYNONYMS = {}
        for canonical, aliases in ENTITY_CONFIG.get("synonyms", {}).items():
            for alias in aliases:
                REVERSE_SYNONYMS[alias.lower()] = canonical
            REVERSE_SYNONYMS[canonical.lower()] = canonical

        log.info(f"Loaded entity_config.json: {len(ENTITY_CONFIG.get('blacklist', []))} blacklisted, {len(ENTITY_CONFIG.get('synonyms', {}))} synonym groups")
    except Exception as e:
        log.error(f"Error loading entity_config.json: {e}")
|
||||
|
||||
def get_canonical_name(text: str) -> str:
    """Map a surface form to its canonical entity name (identity when unknown)."""
    if not text:
        return text
    return REVERSE_SYNONYMS.get(text.lower(), text)
|
||||
|
||||
def is_blacklisted(text: str) -> bool:
    """True when text is empty, configured as blacklisted, or purely numeric/punctuation.

    Args:
        text: candidate entity/topic string.
    """
    if not text:
        return True
    lower = text.lower()
    # Case-insensitive blacklist check; any() short-circuits instead of
    # materializing the whole lowered list on every call (as the original did).
    if any(lower == item.lower() for item in ENTITY_CONFIG.get("blacklist", [])):
        return True
    # Check if it's just a number (digits/punctuation/whitespace only)
    if re.fullmatch(r"[0-9\s\.,\-:/]+", lower):
        return True
    return False
|
||||
|
||||
# ==========================================================
|
||||
# Limpieza avanzada
|
||||
# ==========================================================
|
||||
|
|
@ -125,7 +169,11 @@ def clean_tag_text(text: str) -> str | None:
|
|||
if not text:
|
||||
return None
|
||||
|
||||
text = BeautifulSoup(text, "html.parser").get_text()
|
||||
try:
|
||||
text = BeautifulSoup(text, "html.parser").get_text()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
for pat in HTML_TRASH_PATTERNS:
|
||||
text = re.sub(pat, "", text)
|
||||
|
||||
|
|
@ -133,71 +181,25 @@ def clean_tag_text(text: str) -> str | None:
|
|||
text = text.strip(string.punctuation + " ")
|
||||
|
||||
if len(text) < 3:
|
||||
log.debug(f"Clean reject (too short): {text}")
|
||||
return None
|
||||
if re.search(r"[<>/\\]", text):
|
||||
log.debug(f"Clean reject (bad chars): {text}")
|
||||
return None
|
||||
|
||||
if is_blacklisted(text):
|
||||
return None
|
||||
|
||||
lower = text.lower()
|
||||
if lower.startswith("href="):
|
||||
log.debug(f"Clean reject (href): {text}")
|
||||
return None
|
||||
if _looks_like_attr_or_path(lower):
|
||||
log.debug(f"Clean reject (attr/path): {text}")
|
||||
return None
|
||||
if lower in GENERIC_BAD_TAGS:
|
||||
log.debug(f"Clean reject (generic bad): {text}")
|
||||
return None
|
||||
|
||||
replacements = {
|
||||
"ee.uu.": "Estados Unidos",
|
||||
"los estados unidos": "Estados Unidos",
|
||||
"eeuu": "Estados Unidos",
|
||||
"eu": "Unión Europea",
|
||||
"ue": "Unión Europea",
|
||||
"kosova": "Kosovo",
|
||||
# Specific User Requests
|
||||
"trump": "Donald Trump",
|
||||
"mr. trump": "Donald Trump",
|
||||
"mr trump": "Donald Trump",
|
||||
"doland trump": "Donald Trump",
|
||||
"el presidente trump": "Donald Trump",
|
||||
"president trump": "Donald Trump",
|
||||
"ex-president trump": "Donald Trump",
|
||||
"expresidente trump": "Donald Trump",
|
||||
"putin": "Vladimir Putin",
|
||||
"vladimir putin": "Vladimir Putin",
|
||||
"v. putin": "Vladimir Putin",
|
||||
"presidente putin": "Vladimir Putin",
|
||||
# New requests
|
||||
"sanchez": "Pedro Sánchez",
|
||||
"pedro sanchez": "Pedro Sánchez",
|
||||
"p. sanchez": "Pedro Sánchez",
|
||||
"mr. sanchez": "Pedro Sánchez",
|
||||
"sánchez": "Pedro Sánchez", # explicit match just in case
|
||||
"pedro sánchez": "Pedro Sánchez",
|
||||
"maduro": "Nicolás Maduro",
|
||||
"nicolas maduro": "Nicolás Maduro",
|
||||
"mr. maduro": "Nicolás Maduro",
|
||||
"lula": "Lula da Silva",
|
||||
"lula da silva": "Lula da Silva",
|
||||
"luiz inácio lula da silva": "Lula da Silva",
|
||||
}
|
||||
if lower in replacements:
|
||||
return replacements[lower]
|
||||
|
||||
# Blacklist (explicit removals requested)
|
||||
blacklist = {
|
||||
"getty images", "netflix", "fiscalia", "segun", "estoy", # People blacklist
|
||||
"and more", "app", "estamos", "ultra", # Orgs blacklist
|
||||
"hacienda", "fiscalía"
|
||||
}
|
||||
if lower in blacklist:
|
||||
log.debug(f"Clean reject (blacklist): {text}")
|
||||
return None
|
||||
|
||||
return text
|
||||
# Normalización vía entity_config
|
||||
canonical = get_canonical_name(text)
|
||||
|
||||
return canonical
|
||||
|
||||
|
||||
# ==========================================================
|
||||
|
|
@ -207,7 +209,11 @@ def clean_topic_text(text: str) -> str | None:
|
|||
if not text:
|
||||
return None
|
||||
|
||||
text = BeautifulSoup(text, "html.parser").get_text()
|
||||
try:
|
||||
text = BeautifulSoup(text, "html.parser").get_text()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
for pat in HTML_TRASH_PATTERNS:
|
||||
text = re.sub(pat, "", text)
|
||||
|
||||
|
|
@ -217,6 +223,9 @@ def clean_topic_text(text: str) -> str | None:
|
|||
if len(text) < TOPIC_MIN_CHARS:
|
||||
return None
|
||||
|
||||
if is_blacklisted(text):
|
||||
return None
|
||||
|
||||
lower = text.lower()
|
||||
if _looks_like_attr_or_path(lower):
|
||||
return None
|
||||
|
|
@ -245,8 +254,6 @@ def clean_topic_text(text: str) -> str | None:
|
|||
return None
|
||||
if all(t in STOPWORDS for t in tokens):
|
||||
return None
|
||||
if re.fullmatch(r"[0-9\s\.,\-:/]+", norm):
|
||||
return None
|
||||
|
||||
return norm
|
||||
|
||||
|
|
@ -262,8 +269,6 @@ def extract_entities_and_topics(nlp, text: str) -> Tuple[List[Tuple[str, str]],
|
|||
return ents, topics
|
||||
|
||||
doc = nlp(text)
|
||||
# log.debug(f"Processing text ({len(text)} chars): {text[:30]}...")
|
||||
# log.debug(f"Entities found: {len(doc.ents)}")
|
||||
|
||||
# --- ENTIDADES ---
|
||||
for ent in doc.ents:
|
||||
|
|
@ -273,35 +278,8 @@ def extract_entities_and_topics(nlp, text: str) -> Tuple[List[Tuple[str, str]],
|
|||
|
||||
cleaned = clean_tag_text(ent.text)
|
||||
if not cleaned:
|
||||
# log.debug(f"Rejected entity: {ent.text} ({ent.label_})")
|
||||
continue
|
||||
|
||||
if tipo == "persona":
|
||||
lower_cleaned = cleaned.lower()
|
||||
# Aggressive normalization rules for VIPs
|
||||
# Use token checks or substring checks carefully
|
||||
if "trump" in lower_cleaned.split():
|
||||
# Token 'trump' exists? e.g. "donald trump", "trump", "mr. trump"
|
||||
# Exclude family members
|
||||
family = ["ivanka", "melania", "eric", "tiffany", "barron", "lara", "mary", "fred"]
|
||||
if not any(f in lower_cleaned for f in family):
|
||||
cleaned = "Donald Trump"
|
||||
|
||||
elif "sanchez" in lower_cleaned or "sánchez" in lower_cleaned:
|
||||
# Be careful of other Sanchez? But user context implies Pedro.
|
||||
if "pedro" in lower_cleaned or "presidente" in lower_cleaned or lower_cleaned in ["sanchez", "sánchez"]:
|
||||
cleaned = "Pedro Sánchez"
|
||||
|
||||
elif "maduro" in lower_cleaned:
|
||||
cleaned = "Nicolás Maduro"
|
||||
|
||||
elif "lula" in lower_cleaned:
|
||||
cleaned = "Lula da Silva"
|
||||
|
||||
elif "putin" in lower_cleaned:
|
||||
cleaned = "Vladimir Putin"
|
||||
|
||||
# log.debug(f"Accepted entity: {cleaned} ({tipo})")
|
||||
ents.append((cleaned, tipo))
|
||||
|
||||
# --- TOPICS ---
|
||||
|
|
@ -311,10 +289,10 @@ def extract_entities_and_topics(nlp, text: str) -> Tuple[List[Tuple[str, str]],
|
|||
if cleaned:
|
||||
topic_counter[cleaned] += 1
|
||||
|
||||
ent_values = {v for (v, _) in ents}
|
||||
ent_values = {v.lower() for (v, _) in ents}
|
||||
|
||||
for val, _count in topic_counter.most_common(TOPIC_MAX_PER_DOC):
|
||||
if val in ent_values:
|
||||
if val.lower() in ent_values:
|
||||
continue
|
||||
topics.append((val, "tema"))
|
||||
|
||||
|
|
@ -328,85 +306,98 @@ def main():
|
|||
global STOPWORDS
|
||||
|
||||
# Cargar spaCy
|
||||
log.info("Cargando modelo spaCy es_core_news_md...")
|
||||
nlp = spacy.load("es_core_news_md", disable=["lemmatizer", "textcat"])
|
||||
log.info("Cargando modelo spaCy es_core_news_lg...")
|
||||
nlp = spacy.load("es_core_news_lg", disable=["lemmatizer", "textcat"])
|
||||
STOPWORDS = set(nlp.Defaults.stop_words)
|
||||
log.info("Modelo spaCy cargado correctamente.")
|
||||
|
||||
# Cargar configuración de entidades
|
||||
load_entity_config()
|
||||
|
||||
while True:
|
||||
try:
|
||||
with get_conn() as conn, conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
|
||||
cur.execute(
|
||||
"""
|
||||
SELECT t.id, t.titulo_trad, t.resumen_trad
|
||||
FROM traducciones t
|
||||
WHERE t.status = 'done'
|
||||
AND t.lang_to = %s
|
||||
AND NOT EXISTS (
|
||||
SELECT 1 FROM tags_noticia tn WHERE tn.traduccion_id = t.id
|
||||
)
|
||||
ORDER BY t.id DESC
|
||||
LIMIT %s;
|
||||
""",
|
||||
(NER_LANG, BATCH),
|
||||
)
|
||||
with get_conn() as conn:
|
||||
with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
|
||||
cur.execute(
|
||||
"""
|
||||
SELECT t.id, t.titulo_trad, t.resumen_trad, t.noticia_id
|
||||
FROM traducciones t
|
||||
WHERE t.status = 'done'
|
||||
AND t.lang_to = %s
|
||||
AND NOT EXISTS (
|
||||
SELECT 1 FROM tags_noticia tn WHERE tn.traduccion_id = t.id
|
||||
)
|
||||
ORDER BY t.id DESC
|
||||
LIMIT %s;
|
||||
""",
|
||||
(NER_LANG, BATCH),
|
||||
)
|
||||
|
||||
rows = cur.fetchall()
|
||||
|
||||
rows = cur.fetchall()
|
||||
|
||||
if not rows:
|
||||
time.sleep(5)
|
||||
continue
|
||||
|
||||
log.info(f"Procesando {len(rows)} traducciones para NER/temas...")
|
||||
|
||||
inserted_links = 0
|
||||
|
||||
for r in rows:
|
||||
text = f"{r['titulo_trad'] or ''}\n{r['resumen_trad'] or ''}".strip()
|
||||
if not text:
|
||||
if not rows:
|
||||
time.sleep(10)
|
||||
continue
|
||||
|
||||
ents, topics = extract_entities_and_topics(nlp, text)
|
||||
tags = ents + topics
|
||||
if not tags:
|
||||
continue
|
||||
log.info(f"Procesando {len(rows)} traducciones para NER/temas...")
|
||||
|
||||
for valor, tipo in tags:
|
||||
try:
|
||||
cur.execute(
|
||||
"""
|
||||
INSERT INTO tags (valor, tipo)
|
||||
VALUES (%s, %s)
|
||||
ON CONFLICT (valor, tipo)
|
||||
DO UPDATE SET valor = EXCLUDED.valor
|
||||
RETURNING id;
|
||||
""",
|
||||
(valor, tipo),
|
||||
)
|
||||
tag_id = cur.fetchone()[0]
|
||||
inserted_links = 0
|
||||
|
||||
cur.execute(
|
||||
"""
|
||||
INSERT INTO tags_noticia (traduccion_id, tag_id)
|
||||
VALUES (%s, %s)
|
||||
ON CONFLICT DO NOTHING;
|
||||
""",
|
||||
(r["id"], tag_id),
|
||||
)
|
||||
for r in rows:
|
||||
noticia_id = r["noticia_id"]
|
||||
traduccion_id = r["id"]
|
||||
|
||||
text = f"{r['titulo_trad'] or ''}\n{r['resumen_trad'] or ''}".strip()
|
||||
if not text:
|
||||
# Para evitar re-procesar, insertamos un tag especial '_none_'
|
||||
tags = [("_none_", "sistema")]
|
||||
else:
|
||||
ents, topics = extract_entities_and_topics(nlp, text)
|
||||
tags = ents + topics
|
||||
if not tags:
|
||||
tags = [("_none_", "sistema")]
|
||||
|
||||
if cur.rowcount > 0:
|
||||
inserted_links += 1
|
||||
for valor, tipo in tags:
|
||||
try:
|
||||
# Usar commit parcial por noticia para evitar abortar todo el batch
|
||||
cur.execute(
|
||||
"""
|
||||
INSERT INTO tags (valor, tipo)
|
||||
VALUES (%s, %s)
|
||||
ON CONFLICT (valor, tipo)
|
||||
DO UPDATE SET valor = EXCLUDED.valor
|
||||
RETURNING id;
|
||||
""",
|
||||
(valor, tipo),
|
||||
)
|
||||
tag_id = cur.fetchone()[0]
|
||||
|
||||
except Exception:
|
||||
log.exception("Error insertando tag/relación")
|
||||
cur.execute(
|
||||
"""
|
||||
INSERT INTO tags_noticia (traduccion_id, noticia_id, tag_id)
|
||||
VALUES (%s, %s, %s)
|
||||
ON CONFLICT DO NOTHING;
|
||||
""",
|
||||
(traduccion_id, noticia_id, tag_id),
|
||||
)
|
||||
|
||||
conn.commit()
|
||||
log.info(f"Lote NER OK. Nuevas relaciones tag_noticia: {inserted_links}")
|
||||
if cur.rowcount > 0:
|
||||
inserted_links += 1
|
||||
except Exception as e:
|
||||
log.error(f"Error insertando tag '{valor}': {e}")
|
||||
conn.rollback()
|
||||
# Volvemos a empezar el loop de tags para esta noticia no es buena idea,
|
||||
# pero el rollback abortó la transacción del cursor.
|
||||
# En psycopg2, tras rollback hay que seguir o cerrar.
|
||||
pass
|
||||
|
||||
conn.commit()
|
||||
|
||||
except Exception:
|
||||
log.exception("Error general en NER loop")
|
||||
time.sleep(5)
|
||||
log.info(f"Lote NER OK. Nuevas relaciones tag_noticia: {inserted_links}")
|
||||
|
||||
except Exception as e:
|
||||
log.exception(f"Error general en NER loop: {e}")
|
||||
time.sleep(10)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
|||
|
|
@ -1,334 +0,0 @@
|
|||
"""
|
||||
Worker de Qdrant
|
||||
Vectoriza noticias traducidas y las sube a Qdrant para búsquedas semánticas.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import uuid
|
||||
from datetime import datetime
|
||||
from typing import List, Dict, Any
|
||||
|
||||
# Añadir el directorio raíz al path
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
from db import get_read_conn, get_write_conn
|
||||
|
||||
try:
|
||||
from qdrant_client import QdrantClient
|
||||
from qdrant_client.models import Distance, VectorParams, PointStruct, Filter, FieldCondition, MatchValue
|
||||
except ImportError:
|
||||
print("❌ Error: qdrant-client no instalado. Ejecuta: pip install qdrant-client")
|
||||
sys.exit(1)
|
||||
|
||||
try:
|
||||
from sentence_transformers import SentenceTransformer
|
||||
except ImportError:
|
||||
print("❌ Error: sentence-transformers no instalado")
|
||||
sys.exit(1)
|
||||
|
||||
# Configuración
|
||||
QDRANT_HOST = os.environ.get("QDRANT_HOST", "localhost")
|
||||
QDRANT_PORT = int(os.environ.get("QDRANT_PORT", "6333"))
|
||||
QDRANT_COLLECTION = os.environ.get("QDRANT_COLLECTION_NAME", "news_vectors")
|
||||
|
||||
EMB_MODEL = os.environ.get("EMB_MODEL", "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
|
||||
EMB_DEVICE = os.environ.get("EMB_DEVICE", "cuda")
|
||||
BATCH_SIZE = int(os.environ.get("QDRANT_BATCH_SIZE", "100"))
|
||||
SLEEP_IDLE = int(os.environ.get("QDRANT_SLEEP_IDLE", "30"))
|
||||
|
||||
# Cliente Qdrant global
|
||||
qdrant_client = None
|
||||
embedding_model = None
|
||||
|
||||
|
||||
def init_qdrant_client():
    """Connect to Qdrant and ensure the target collection exists.

    Creates the collection (cosine distance, 384-dim vectors) when missing
    and prints the current point count.
    """
    global qdrant_client

    print(f"🔌 Conectando a Qdrant en {QDRANT_HOST}:{QDRANT_PORT}...")
    qdrant_client = QdrantClient(host=QDRANT_HOST, port=QDRANT_PORT)

    existing = {c.name for c in qdrant_client.get_collections().collections}

    if QDRANT_COLLECTION in existing:
        print(f"✅ Colección '{QDRANT_COLLECTION}' ya existe")
    else:
        print(f"📦 Creando colección '{QDRANT_COLLECTION}'...")
        # paraphrase-multilingual-MiniLM-L12-v2 produces 384-dim embeddings.
        vector_size = 384
        qdrant_client.create_collection(
            collection_name=QDRANT_COLLECTION,
            vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE),
        )
        print(f"✅ Colección '{QDRANT_COLLECTION}' creada (dimensión: {vector_size})")

    collection_info = qdrant_client.get_collection(QDRANT_COLLECTION)
    print(f"📊 Puntos en colección: {collection_info.points_count}")
|
||||
|
||||
|
||||
def init_embedding_model():
    """Load the sentence-transformers model into the module-level global."""
    global embedding_model

    print(f"🤖 Cargando modelo de embeddings: {EMB_MODEL}")
    print(f"🖥️ Dispositivo: {EMB_DEVICE}")
    embedding_model = SentenceTransformer(EMB_MODEL, device=EMB_DEVICE)
    print(f"✅ Modelo cargado correctamente")
def get_pending_news(limit: int = BATCH_SIZE) -> List[Dict[str, Any]]:
    """Return translated news rows still pending vectorization.

    Args:
        limit: Maximum number of rows to fetch.

    Returns:
        One dict per pending translation, joined with its source article.
    """
    sql = """
                SELECT
                    t.id as traduccion_id,
                    t.noticia_id,
                    TRIM(t.lang_to) as lang,
                    t.titulo_trad as titulo,
                    t.resumen_trad as resumen,
                    n.url,
                    n.fecha,
                    n.fuente_nombre,
                    n.categoria_id,
                    n.pais_id
                FROM traducciones t
                INNER JOIN noticias n ON t.noticia_id = n.id
                WHERE t.vectorized = FALSE
                AND t.status = 'done'
                ORDER BY t.created_at ASC
                LIMIT %s
            """
    with get_read_conn() as conn:
        with conn.cursor() as cur:
            cur.execute(sql, (limit,))
            cols = [desc[0] for desc in cur.description]
            return [dict(zip(cols, row)) for row in cur.fetchall()]
def generate_embeddings(texts: List[str]) -> List[List[float]]:
    """Encode a list of texts with the shared embedding model.

    Args:
        texts: Texts to embed.

    Returns:
        One embedding vector (list of floats) per input text.
    """
    vectors = embedding_model.encode(
        texts,
        batch_size=32,
        show_progress_bar=False,
        convert_to_numpy=True,
    )
    return vectors.tolist()
def upload_to_qdrant(news_batch: List[Dict[str, Any]]):
    """Embed a batch of translated news and upsert them into Qdrant.

    Also marks each translation row as vectorized in PostgreSQL and stores
    the generated Qdrant point id on the row.

    Args:
        news_batch: Rows as returned by ``get_pending_news``; each row is
            mutated in place with a ``qdrant_point_id`` key.
    """
    if not news_batch:
        return

    # Embed title + summary together. Guard against NULL columns: the old
    # f-string rendered None as the literal text "None", polluting vectors.
    texts = [
        f"{news['titulo'] or ''} {news['resumen'] or ''}".strip()
        for news in news_batch
    ]

    print(f" 🧮 Generando embeddings para {len(texts)} noticias...")
    embeddings = generate_embeddings(texts)

    # Build one point per news item, with metadata as payload.
    points = []
    for news, embedding in zip(news_batch, embeddings):
        point_id = str(uuid.uuid4())
        payload = {
            "news_id": news['noticia_id'],
            "traduccion_id": news['traduccion_id'],
            "titulo": news['titulo'],
            "resumen": news['resumen'],
            "url": news['url'],
            "fecha": news['fecha'].isoformat() if news['fecha'] else None,
            "fuente_nombre": news['fuente_nombre'],
            "categoria_id": news['categoria_id'],
            "pais_id": news['pais_id'],
            "lang": news['lang'],
        }
        points.append(PointStruct(id=point_id, vector=embedding, payload=payload))
        # Remember the point id so the DB row can reference it.
        news['qdrant_point_id'] = point_id

    print(f" ⬆️ Subiendo {len(points)} puntos a Qdrant...")
    qdrant_client.upsert(
        collection_name=QDRANT_COLLECTION,
        points=points
    )

    print(f" 💾 Actualizando estado en PostgreSQL...")
    with get_write_conn() as conn:
        with conn.cursor() as cur:
            # Single executemany instead of one execute() call per row.
            cur.executemany("""
                UPDATE traducciones
                SET
                    vectorized = TRUE,
                    vectorization_date = NOW(),
                    qdrant_point_id = %s
                WHERE id = %s
            """, [(n['qdrant_point_id'], n['traduccion_id']) for n in news_batch])
        conn.commit()

    print(f" ✅ Lote subido correctamente")
def process_batch():
    """Vectorize one batch of pending translated news.

    Returns:
        Number of news items processed (0 when the queue is empty or the
        upload failed).
    """
    pending = get_pending_news()
    if not pending:
        return 0

    print(f"\n📋 Procesando {len(pending)} noticias traducidas...")
    try:
        upload_to_qdrant(pending)
    except Exception as e:
        print(f"❌ Error procesando lote: {e}")
        return 0
    return len(pending)
def get_stats():
    """Return global translation/vectorization counters for lang 'es'."""
    sql = """
                SELECT
                    COUNT(*) as total,
                    COUNT(*) FILTER (WHERE vectorized = TRUE) as vectorizadas,
                    COUNT(*) FILTER (WHERE vectorized = FALSE AND status = 'done') as pendientes
                FROM traducciones
                WHERE lang_to = 'es'
            """
    with get_read_conn() as conn:
        with conn.cursor() as cur:
            cur.execute(sql)
            total, vectorizadas, pendientes = cur.fetchone()
    return {
        'total': total,
        'vectorizadas': vectorizadas,
        'pendientes': pendientes,
    }
def main():
    """Worker entry point: init Qdrant and the model, then poll forever."""
    banner = "=" * 80
    print(banner)
    print("🚀 Qdrant Vectorization Worker (Direct Translation)")
    print(banner)
    print(f"Qdrant: {QDRANT_HOST}:{QDRANT_PORT}")
    print(f"Colección: {QDRANT_COLLECTION}")
    print(f"Modelo: {EMB_MODEL}")
    print(f"Dispositivo: {EMB_DEVICE}")
    print(f"Tamaño de lote: {BATCH_SIZE}")
    print(banner)

    try:
        init_qdrant_client()
    except Exception as e:
        print(f"❌ Error inicializando Qdrant: {e}")
        print("⚠️ Asegúrate de que Qdrant esté corriendo")
        return

    try:
        init_embedding_model()
    except Exception as e:
        print(f"❌ Error cargando modelo de embeddings: {e}")
        return

    print("\n🔄 Iniciando loop de procesamiento...\n")

    total_processed = 0
    while True:
        try:
            processed = process_batch()
            total_processed += processed

            if processed > 0:
                print(f"\n✅ Lote completado: {processed} noticias vectorizadas")
                print(f"📊 Total procesado en esta sesión: {total_processed}")
                # Periodic global counters for operator visibility.
                stats = get_stats()
                print(f"📈 Estadísticas globales:")
                print(f" Total traducciones: {stats['total']}")
                print(f" Vectorizadas: {stats['vectorizadas']}")
                print(f" Pendientes: {stats['pendientes']}")
            else:
                print(f"💤 No hay noticias pendientes. Esperando {SLEEP_IDLE}s...")
                time.sleep(SLEEP_IDLE)
        except KeyboardInterrupt:
            print("\n\n⏹️ Worker detenido por el usuario")
            break
        except Exception as e:
            print(f"\n❌ Error en loop principal: {e}")
            print(f"⏳ Esperando {SLEEP_IDLE}s antes de reintentar...")
            time.sleep(SLEEP_IDLE)


if __name__ == "__main__":
    main()
|
@ -1,202 +0,0 @@
|
|||
import os
|
||||
import time
|
||||
import logging
|
||||
from typing import List, Tuple
|
||||
|
||||
import numpy as np
|
||||
import psycopg2
|
||||
import psycopg2.extras
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format='[related] %(asctime)s %(levelname)s: %(message)s'
|
||||
)
|
||||
|
||||
DB = dict(
|
||||
host=os.environ.get("DB_HOST", "localhost"),
|
||||
port=int(os.environ.get("DB_PORT", 5432)),
|
||||
dbname=os.environ.get("DB_NAME", "rss"),
|
||||
user=os.environ.get("DB_USER", "rss"),
|
||||
password=os.environ.get("DB_PASS", "x"),
|
||||
)
|
||||
|
||||
EMB_MODEL = os.environ.get(
|
||||
"EMB_MODEL",
|
||||
"sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
|
||||
)
|
||||
|
||||
TOPK = int(os.environ.get("RELATED_TOPK", 10))
|
||||
BATCH_IDS = int(os.environ.get("RELATED_BATCH_IDS", 200))
|
||||
SLEEP_IDLE = float(os.environ.get("RELATED_SLEEP", 10))
|
||||
MIN_SCORE = float(os.environ.get("RELATED_MIN_SCORE", 0.0))
|
||||
WINDOW_HOURS = int(os.environ.get("RELATED_WINDOW_H", 0))
|
||||
|
||||
|
||||
def get_conn():
    """Open a new PostgreSQL connection using the module-level DB config."""
    return psycopg2.connect(**DB)
def fetch_all_embeddings(cur) -> Tuple[List[int], np.ndarray]:
    """Load all usable 'es' embeddings for EMB_MODEL and L2-normalize them.

    Args:
        cur: An open database cursor.

    Returns:
        (ids, matrix) where each matrix row is a unit-norm vector aligned
        with ids; ([], None) when nothing usable is found.
    """
    sql = """
        SELECT e.traduccion_id, e.embedding, n.fecha
        FROM traduccion_embeddings e
        JOIN traducciones t ON t.id = e.traduccion_id
        JOIN noticias n ON n.id = t.noticia_id
        WHERE e.model = %s
          AND t.status = 'done'
          AND t.lang_to = 'es'
    """
    params = [EMB_MODEL]
    if WINDOW_HOURS > 0:
        # Optional recency window keeps the similarity matrix small.
        sql += " AND n.fecha >= NOW() - INTERVAL %s"
        params.append(f"{WINDOW_HOURS} hours")

    cur.execute(sql, params)
    rows = cur.fetchall()
    if not rows:
        return [], None

    ids, vecs = [], []
    for tr_id, emb, _fecha in rows:
        if not emb:
            continue
        vec = np.asarray(emb, dtype=np.float32)
        # Skip malformed embeddings (wrong rank or empty).
        if vec.ndim != 1 or vec.size == 0:
            continue
        ids.append(tr_id)
        vecs.append(vec)

    if not ids:
        return [], None

    mat = np.vstack(vecs)
    # Normalize rows so plain dot products are cosine similarities.
    norms = np.linalg.norm(mat, axis=1, keepdims=True)
    norms[norms == 0] = 1e-8
    return ids, mat / norms
def fetch_pending_ids(cur, limit) -> List[int]:
    """Return ids of 'done' es-translations that have an embedding for
    EMB_MODEL but no related_noticias rows yet."""
    cur.execute(
        """
        SELECT t.id
        FROM traducciones t
        JOIN traduccion_embeddings e
          ON e.traduccion_id = t.id AND e.model = %s
        LEFT JOIN related_noticias r
          ON r.traduccion_id = t.id
        WHERE t.lang_to = 'es'
          AND t.status = 'done'
        GROUP BY t.id
        HAVING COUNT(r.related_traduccion_id) = 0
        ORDER BY t.id DESC
        LIMIT %s;
        """,
        (EMB_MODEL, limit),
    )
    return [row[0] for row in cur.fetchall()]
def topk(idx: int, ids_all: List[int], mat: np.ndarray, K: int) -> List[Tuple[int, float]]:
    """Return the K most similar rows to row ``idx`` as (id, score) pairs.

    ``mat`` must have unit-norm rows so dot products are cosine
    similarities. Self-similarity and scores below MIN_SCORE are masked
    with -999 sentinels.
    """
    sims = mat @ mat[idx]
    sims[idx] = -999.0  # never report the query item itself

    if MIN_SCORE > 0:
        sims = np.where(sims >= MIN_SCORE, sims, -999.0)

    if K >= len(sims):
        order = np.argsort(-sims)
    else:
        # argpartition is O(n); only the K candidates get fully sorted.
        candidates = np.argpartition(-sims, K)[:K]
        order = candidates[np.argsort(-sims[candidates])]

    return [(ids_all[j], float(sims[j])) for j in order[:K]]
def insert_related(cur, tr_id: int, pairs):
    """Upsert related-news rows for ``tr_id``.

    Self links and non-positive scores are dropped before the insert.
    """
    rows = [
        (tr_id, rid, float(score))
        for rid, score in pairs
        if rid != tr_id and float(score) > 0
    ]
    if not rows:
        return

    psycopg2.extras.execute_values(
        cur,
        """
        INSERT INTO related_noticias (traduccion_id, related_traduccion_id, score)
        VALUES %s
        ON CONFLICT (traduccion_id, related_traduccion_id)
        DO UPDATE SET score = EXCLUDED.score;
        """,
        rows,
    )
def build_for_ids(conn, target_ids: List[int]) -> int:
    """Compute and store top-K related news for each id in ``target_ids``.

    Returns:
        Number of translations for which related rows were (re)computed.
    """
    with conn.cursor() as cur:
        ids_all, mat = fetch_all_embeddings(cur)

    if not ids_all or mat is None:
        return 0

    # Map translation id -> row index in the similarity matrix.
    index_of = {tid: i for i, tid in enumerate(ids_all)}
    processed = 0

    with conn.cursor() as cur:
        for tr_id in target_ids:
            row = index_of.get(tr_id)
            if row is None:
                continue
            insert_related(cur, tr_id, topk(row, ids_all, mat, TOPK))
            processed += 1

    conn.commit()
    return processed
def main():
    """Poll for translations lacking related links and fill them in.

    Fix over the previous version: one connection per iteration, closed
    explicitly in ``finally``. psycopg2's ``with conn:`` block only
    commits or rolls back the transaction — it does NOT close the
    connection — so the old code leaked a connection on every loop pass
    and opened a redundant second one for the build step.
    """
    logging.info(
        "Iniciando related_worker (EMB=%s TOPK=%s BATCH=%s MIN=%.3f WINDOW_H=%s)",
        EMB_MODEL,
        TOPK,
        BATCH_IDS,
        MIN_SCORE,
        WINDOW_HOURS,
    )

    while True:
        try:
            conn = get_conn()
            try:
                with conn.cursor() as cur:
                    todo = fetch_pending_ids(cur, BATCH_IDS)

                if not todo:
                    time.sleep(SLEEP_IDLE)
                    continue

                done = build_for_ids(conn, todo)
                logging.info("Relacionadas generadas/actualizadas para %d traducciones.", done)
            finally:
                conn.close()
        except Exception:
            logging.exception("Error en related_worker")
            time.sleep(SLEEP_IDLE)


if __name__ == "__main__":
    main()
||||
207
workers/simple_translator.py
Normal file
207
workers/simple_translator.py
Normal file
|
|
@ -0,0 +1,207 @@
|
|||
#!/usr/bin/env python3
|
||||
import os
|
||||
import time
|
||||
import logging
|
||||
import re
|
||||
from typing import List, Optional
|
||||
|
||||
import psycopg2
|
||||
import psycopg2.extras
|
||||
from psycopg2.extras import execute_values
|
||||
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
|
||||
import torch
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s: %(message)s")
|
||||
LOG = logging.getLogger("translator_simple")
|
||||
|
||||
DB_CONFIG = {
|
||||
"host": os.environ.get("DB_HOST", "localhost"),
|
||||
"port": int(os.environ.get("DB_PORT", 5432)),
|
||||
"dbname": os.environ.get("DB_NAME", "rss"),
|
||||
"user": os.environ.get("DB_USER", "rss"),
|
||||
"password": os.environ.get("DB_PASS", "x"),
|
||||
}
|
||||
|
||||
TARGET_LANGS = os.environ.get("TARGET_LANGS", "es").split(",")
|
||||
BATCH_SIZE = int(os.environ.get("TRANSLATOR_BATCH", 32))
|
||||
MAX_SRC_TOKENS = 512
|
||||
|
||||
TRANSLATORS = {}
|
||||
|
||||
LANG_MAP = {
|
||||
"en": "en-ES",
|
||||
"es": "es-ES",
|
||||
"fr": "fr-ES",
|
||||
"de": "de-ES",
|
||||
"pt": "pt-ES",
|
||||
"it": "it-ES",
|
||||
"ru": "ru-ES",
|
||||
"ar": "ar-ES",
|
||||
"fa": "fa-ES",
|
||||
"ps": "ps-ES",
|
||||
"zh": "zh-ES",
|
||||
"ja": "ja-ES",
|
||||
"ko": "ko-ES",
|
||||
}
|
||||
|
||||
def get_translator(source_lang: str, target_lang: str = "es"):
    """Return (and cache) a Helsinki-NLP opus-mt translation pipeline.

    Returns:
        A transformers translation pipeline, or None when the model for
        the language pair cannot be loaded.
    """
    cache_key = f"{source_lang}_{target_lang}"
    if cache_key in TRANSLATORS:
        return TRANSLATORS[cache_key]

    model_name = f"Helsinki-NLP/opus-mt-{source_lang}-{target_lang}"
    if source_lang == target_lang:
        # NOTE(review): same-language pairs fall back to an "-es" model;
        # callers short-circuit this case before reaching here.
        model_name = f"Helsinki-NLP/opus-mt-{source_lang}-es"

    LOG.info(f"Loading translator: {model_name}")
    try:
        device = 0 if torch.cuda.is_available() else -1
        translator = pipeline("translation", model=model_name, device=device)
    except Exception as e:
        LOG.error(f"Failed to load translator {model_name}: {e}")
        return None

    TRANSLATORS[cache_key] = translator
    LOG.info(f"Translator loaded: {cache_key}")
    return translator
def normalize_lang(lang: Optional[str], default: str = "es") -> Optional[str]:
    """Normalize a language tag to its lowercase two-letter code.

    Falls back to ``default`` when the input is None, empty, or blank.
    """
    if not lang:
        return default
    code = lang.strip().lower()[:2]
    return code if code else default
def translate_text(source_lang: str, target_lang: str, texts: List[str]) -> List[str]:
    """Translate each text from ``source_lang`` to ``target_lang``.

    Texts are returned unchanged when the languages match, the model
    cannot be loaded, or an individual translation fails.
    """
    if not texts:
        return []
    if source_lang == target_lang:
        return texts

    translator = get_translator(source_lang, target_lang)
    if not translator:
        return texts

    out = []
    for original in texts:
        if not original or not original.strip():
            out.append(original)
            continue
        try:
            # NOTE(review): MAX_SRC_TOKENS truncates by *characters* here,
            # not tokens — confirm whether that is intended.
            result = translator(original[:MAX_SRC_TOKENS], max_length=MAX_SRC_TOKENS)
            out.append(result[0]['translation_text'])
        except Exception as e:
            LOG.warning(f"Translation error: {e}")
            out.append(original)

    return out
def connect_db():
    """Open a new PostgreSQL connection from DB_CONFIG."""
    return psycopg2.connect(**DB_CONFIG)
def process_batch(conn, rows):
    """Translate a batch of pending rows and persist the results.

    Rows whose source language already equals the target are skipped.
    Items are grouped by (lang_from, lang_to) so each model is loaded
    once per group.

    Args:
        conn: Open database connection; committed once per group.
        rows: Dicts with tr_id, lang_from, lang_to, titulo, resumen.
    """
    from collections import defaultdict

    todo = []
    for r in rows:
        lang_to = normalize_lang(r.get("lang_to"), "es") or "es"
        lang_from = normalize_lang(r.get("lang_from")) or "en"

        if lang_from == lang_to:
            continue

        todo.append({
            "tr_id": r.get("tr_id"),
            "lang_from": lang_from,
            "lang_to": lang_to,
            "titulo": (r.get("titulo") or "").strip(),
            "resumen": (r.get("resumen") or "").strip(),
        })

    if not todo:
        return

    groups = defaultdict(list)
    for item in todo:
        groups[(item["lang_from"], item["lang_to"])].append(item)

    for (lang_from, lang_to), items in groups.items():
        LOG.info(f"Translating {lang_from} -> {lang_to} ({len(items)} items)")

        titles = [i["titulo"] for i in items]
        translated_titles = translate_text(lang_from, lang_to, titles)

        translated_bodies = []
        for i in items:
            body = (i["resumen"] or "").strip()
            if body:
                tr = translate_text(lang_from, lang_to, [body])
                translated_bodies.append(tr[0] if tr else body)
            else:
                translated_bodies.append("")

        # 'with' guarantees the cursor is closed even if an UPDATE raises;
        # the old code leaked the cursor on error paths, and bound an
        # unused enumerate index.
        with conn.cursor() as cursor:
            for item, tt, tb in zip(items, translated_titles, translated_bodies):
                # Fall back to the source text when translation came back empty.
                tt = (tt or "").strip() or item["titulo"]
                tb = (tb or "").strip() or item["resumen"]
                try:
                    cursor.execute("""
                        UPDATE traducciones
                        SET titulo_trad = %s, resumen_trad = %s, lang_to = %s
                        WHERE id = %s
                    """, (tt, tb, lang_to, item["tr_id"]))
                except Exception as e:
                    LOG.error(f"Update error: {e}")

            conn.commit()
        LOG.info(f"Translated {len(items)} items")
def fetch_pending_translations(conn):
    """Find translation rows missing output text and process them per
    target language."""
    cursor = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)

    for lang in TARGET_LANGS:
        cursor.execute("""
            SELECT t.id as tr_id, t.lang_from, t.lang_to,
                   n.titulo, n.resumen, n.id as noticia_id
            FROM traducciones t
            JOIN noticias n ON n.id = t.noticia_id
            WHERE t.lang_to = %s
              AND (t.titulo_trad IS NULL OR t.resumen_trad IS NULL)
            ORDER BY n.fecha DESC
            LIMIT %s
        """, (lang, BATCH_SIZE))

        pending = cursor.fetchall()
        if pending:
            LOG.info(f"Found {len(pending)} pending translations for {lang}")
            process_batch(conn, pending)

    cursor.close()
def main():
    """Poll the database every 30 seconds and translate pending rows.

    Fix: the connection is now closed in a ``finally`` block — previously
    an exception inside ``fetch_pending_translations`` skipped
    ``conn.close()`` and leaked the connection.
    """
    LOG.info("Simple translator worker started")

    while True:
        try:
            conn = connect_db()
            try:
                fetch_pending_translations(conn)
            finally:
                conn.close()
        except Exception as e:
            LOG.error(f"Error: {e}")

        time.sleep(30)

if __name__ == "__main__":
    main()
||||
151
workers/simple_translator_worker.py
Normal file
151
workers/simple_translator_worker.py
Normal file
|
|
@ -0,0 +1,151 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Simple Translation Worker using deep-translator
|
||||
Uses free translation APIs (Google, LibreTranslate, etc.)
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import logging
|
||||
from datetime import datetime
|
||||
|
||||
import psycopg2
|
||||
from psycopg2.extras import RealDictCursor
|
||||
from deep_translator import GoogleTranslator, MyMemoryTranslator, single_detection
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format='[%(asctime)s] %(levelname)s - %(message)s',
|
||||
datefmt='%Y-%m-%d %H:%M:%S'
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
DB_CONFIG = {
|
||||
'host': os.getenv('DB_HOST', 'db'),
|
||||
'port': int(os.getenv('DB_PORT', 5432)),
|
||||
'database': os.getenv('DB_NAME', 'rss'),
|
||||
'user': os.getenv('DB_USER', 'rss'),
|
||||
'password': os.getenv('DB_PASS', 'rss')
|
||||
}
|
||||
|
||||
TARGET_LANG = os.getenv('TARGET_LANGS', 'es').split(',')[0].strip()
|
||||
BATCH_SIZE = int(os.getenv('TRANSLATOR_BATCH', '32'))
|
||||
SLEEP_INTERVAL = int(os.getenv('TRANSLATOR_SLEEP', '60'))
|
||||
|
||||
def get_db_connection():
    """Open a new PostgreSQL connection from DB_CONFIG."""
    return psycopg2.connect(**DB_CONFIG)
def get_pending_translations(conn):
    """Return up to BATCH_SIZE news rows lacking a TARGET_LANG translation."""
    query = """
            SELECT n.id, n.feed_id, n.lang, n.titulo, n.resumen
            FROM noticias n
            WHERE NOT EXISTS (
                SELECT 1 FROM traducciones t
                WHERE t.noticia_id = n.id AND t.lang_to = %s
            )
            AND n.lang IS NOT NULL
            AND n.lang != %s
            ORDER BY n.created_at DESC
            LIMIT %s
        """
    with conn.cursor(cursor_factory=RealDictCursor) as cur:
        cur.execute(query, (TARGET_LANG, TARGET_LANG, BATCH_SIZE))
        return cur.fetchall()
def detect_language(text):
    """Detect language using MyMemory (free API); default to 'en' on any
    failure or when the text is too short to classify."""
    try:
        if text and len(text.strip()) > 10:
            return single_detection(text, api_key=None)
    except Exception as e:
        logger.debug(f"Language detection failed: {e}")
    return 'en'
def translate_text(text, source_lang, target_lang):
    """Translate text via Google Translate, falling back to MyMemory.

    Returns:
        The translated text, "" for blank input, or the original text
        when both services fail.
    """
    if not text or not text.strip():
        return ""

    try:
        result = GoogleTranslator(source=source_lang, target=target_lang).translate(text)
        return result if result else text
    except Exception as e:
        logger.warning(f"Google translation failed: {e}")

    # Fallback to MyMemory
    try:
        result = MyMemoryTranslator(source=source_lang, target=target_lang).translate(text)
        return result if result else text
    except Exception as e2:
        logger.error(f"MyMemory translation also failed: {e2}")
        return text
def save_translation(conn, noticia_id, lang_from, titulo, resumen):
    """Translate a news item's title/summary and upsert the traducciones row."""
    titulo_trad = translate_text(titulo, lang_from, TARGET_LANG)
    resumen_trad = translate_text(resumen, lang_from, TARGET_LANG) if resumen else ""

    with conn.cursor() as cur:
        cur.execute("""
            INSERT INTO traducciones (noticia_id, lang_from, lang_to, titulo_trad, resumen_trad, status, created_at)
            VALUES (%s, %s, %s, %s, %s, 'done', NOW())
            ON CONFLICT (noticia_id, lang_to) DO UPDATE SET
                titulo_trad = EXCLUDED.titulo_trad,
                resumen_trad = EXCLUDED.resumen_trad,
                status = 'done'
        """, (noticia_id, lang_from, TARGET_LANG, titulo_trad, resumen_trad))
    conn.commit()
def _translate_item(conn, item):
    """Translate and persist a single pending news row.

    Detects the source language when the row has none; skips rows already
    in the target language.
    """
    lang = item['lang']

    # Auto-detect language if needed
    if not lang or lang == '':
        lang = detect_language(item['titulo'] or '')
        logger.info(f"Detected language: {lang} for news {item['id']}")

    # Skip if already target language
    if lang == TARGET_LANG:
        logger.debug(f"Skipping news {item['id']} - already in target language")
        return

    save_translation(
        conn,
        item['id'],
        lang,
        item['titulo'],
        item['resumen']
    )
    logger.info(f"Translated news {item['id']}: {item['titulo'][:50]}...")


def process_translations():
    """Worker loop: fetch pending news, translate each, store results."""
    logger.info("Starting translation worker...")

    while True:
        conn = get_db_connection()
        try:
            pending = get_pending_translations(conn)

            if not pending:
                logger.info(f"No pending translations. Sleeping {SLEEP_INTERVAL}s...")
                time.sleep(SLEEP_INTERVAL)
                continue

            logger.info(f"Found {len(pending)} pending translations")

            for item in pending:
                try:
                    _translate_item(conn, item)
                except Exception as e:
                    logger.error(f"Error translating news {item['id']}: {e}")
                    continue

        except Exception as e:
            logger.error(f"Database error: {e}")
            time.sleep(5)
        finally:
            conn.close()

if __name__ == '__main__':
    logger.info(f"Translation worker started. Target: {TARGET_LANG}")
    process_translations()
||||
|
|
@ -1,244 +0,0 @@
|
|||
import os
|
||||
import time
|
||||
import logging
|
||||
import json
|
||||
import psycopg2
|
||||
from psycopg2.extras import execute_values
|
||||
|
||||
# Logging
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format='[topics_worker] %(asctime)s %(levelname)s: %(message)s'
|
||||
)
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
# Config
|
||||
DB_CONFIG = {
|
||||
"host": os.environ.get("DB_HOST", "localhost"),
|
||||
"port": int(os.environ.get("DB_PORT", 5432)),
|
||||
"dbname": os.environ.get("DB_NAME", "rss"),
|
||||
"user": os.environ.get("DB_USER", "rss"),
|
||||
"password": os.environ.get("DB_PASS", "x"),
|
||||
}
|
||||
|
||||
SLEEP_IDLE = 10
|
||||
BATCH_SIZE = 500
|
||||
|
||||
def get_conn():
    """Open a new PostgreSQL connection from DB_CONFIG."""
    return psycopg2.connect(**DB_CONFIG)
def load_topics(conn):
    """Load topics and their keywords.

    Returns:
        List of dicts: [{'id': 1, 'weight': 5, 'keywords': ['foo', 'bar']}].
        Topics with an empty keyword string are skipped.
    """
    with conn.cursor() as cur:
        cur.execute("SELECT id, weight, keywords FROM topics")
        rows = cur.fetchall()

    topics = []
    for tid, weight, kw_str in rows:
        if not kw_str:
            continue
        # Keywords are comma separated based on insert script
        keywords = [k.strip().lower() for k in kw_str.split(",") if k.strip()]
        topics.append({
            "id": tid,
            "weight": weight,
            "keywords": keywords,
        })
    return topics
def load_countries(conn):
    """Load countries with lowercase keywords used for text matching.

    Returns:
        List of dicts: [{'id': 1, 'name': 'España', 'keywords': ['españa', 'madrid']}].
    """
    # Hardcoded aliases for simplicity. A separate table would be better.
    ALIASES = {
        "Estados Unidos": ["eeuu", "ee.uu.", "usa", "estadounidense", "washington"],
        "Rusia": ["ruso", "rusa", "moscú", "kremlin"],
        "China": ["chino", "china", "pekin", "beijing"],
        "Ucrania": ["ucraniano", "kiev", "kyiv"],
        "Israel": ["israelí", "tel aviv", "jerusalén"],
        "España": ["español", "madrid"],
        "Reino Unido": ["uk", "londres", "británico"],
        "Francia": ["francés", "parís"],
        "Alemania": ["alemán", "berlín"],
        "Palestina": ["palestino", "gaza", "cisjordania"],
        "Irán": ["iraní", "teherán"],
    }

    with conn.cursor() as cur:
        cur.execute("SELECT id, nombre FROM paises")
        rows = cur.fetchall()

    countries = []
    for cid, name in rows:
        keywords = [name.lower()] + ALIASES.get(name, [])
        countries.append({"id": cid, "name": name, "keywords": keywords})
    return countries
def process_batch(conn, topics, countries):
    """Tag one batch of unprocessed news with topics and a country.

    Fetches up to BATCH_SIZE rows with topics_processed = FALSE, matches
    their text against topic and country keywords, writes news_topics
    rows plus pais_id updates, then marks the rows processed.

    Returns:
        Number of news rows examined in this batch.
    """
    with conn.cursor() as cur:
        # Fetch news
        cur.execute("""
            SELECT id, titulo, resumen
            FROM noticias
            WHERE topics_processed = FALSE
            ORDER BY fecha DESC
            LIMIT %s
        """, (BATCH_SIZE,))
        news_items = cur.fetchall()

    if not news_items:
        return 0

    inserts = []          # (noticia_id, topic_id, score)
    country_updates = []  # (pais_id, noticia_id)
    processed_ids = []

    for nid, titulo, resumen in news_items:
        text_lower = ((titulo or "") + " " + (resumen or "")).lower()

        # 1. Topics: score = weight * number of matching keywords.
        for topic in topics:
            hits = sum(1 for kw in topic["keywords"] if kw in text_lower)
            if hits > 0:
                inserts.append((nid, topic["id"], topic["weight"] * hits))

        # 2. Country: pick the one with the most keyword hits.
        # simple word matching. can be improved with regex word boundaries
        best_country = None
        max_matches = 0
        for country in countries:
            matches = sum(1 for kw in country["keywords"] if kw in text_lower)
            if matches > max_matches:
                max_matches = matches
                best_country = country["id"]

        if best_country:
            country_updates.append((best_country, nid))

        processed_ids.append(nid)

    with conn.cursor() as cur:
        # Insert relations
        if inserts:
            execute_values(cur, """
                INSERT INTO news_topics (noticia_id, topic_id, score)
                VALUES %s
                ON CONFLICT (noticia_id, topic_id) DO UPDATE SET score = EXCLUDED.score
            """, inserts)

        # Update Countries
        if country_updates:
            execute_values(cur, """
                UPDATE noticias AS n
                SET pais_id = v.pais_id
                FROM (VALUES %s) AS v(pais_id, noticia_id)
                WHERE n.id = v.noticia_id
            """, country_updates)

        # Mark processed
        cur.execute("""
            UPDATE noticias
            SET topics_processed = TRUE
            WHERE id = ANY(%s)
        """, (processed_ids,))

    conn.commit()
    return len(news_items)
def initialize_schema(conn):
    """Ensure the topics tables and the topics_processed column exist."""
    log.info("Checking/Initializing schema...")
    ddl = """
            CREATE TABLE IF NOT EXISTS topics (
                id SERIAL PRIMARY KEY,
                slug VARCHAR(50) UNIQUE NOT NULL,
                name VARCHAR(100) NOT NULL,
                weight INTEGER DEFAULT 1,
                keywords TEXT,
                group_name VARCHAR(50)
            );
            CREATE TABLE IF NOT EXISTS news_topics (
                noticia_id VARCHAR(32) REFERENCES noticias(id) ON DELETE CASCADE,
                topic_id INTEGER REFERENCES topics(id) ON DELETE CASCADE,
                score INTEGER DEFAULT 0,
                created_at TIMESTAMP DEFAULT NOW(),
                PRIMARY KEY (noticia_id, topic_id)
            );
            ALTER TABLE noticias ADD COLUMN IF NOT EXISTS topics_processed BOOLEAN DEFAULT FALSE;
        """
    with conn.cursor() as cur:
        cur.execute(ddl)
    conn.commit()
    log.info("Schema OK.")
def main():
    """Run schema init once, then match topics/countries in a polling loop."""
    log.info("Starting topics_worker...")

    # Run migrations once at startup
    try:
        with get_conn() as conn:
            initialize_schema(conn)
    except Exception as e:
        log.error(f"Error during schema initialization: {e}")
        # We might want to exit here if the schema is crucial
        # sys.exit(1)

    while True:
        try:
            with get_conn() as conn:
                topics = load_topics(conn)
                if not topics:
                    log.warning("No topics found in DB. Sleeping.")
                    time.sleep(SLEEP_IDLE)
                    continue

                # Load countries
                countries = load_countries(conn)
                count = process_batch(conn, topics, countries)

                # Idle briefly when the queue looks drained.
                if count < BATCH_SIZE:
                    time.sleep(SLEEP_IDLE)
                else:
                    log.info(f"Processed {count} items.")
        except Exception as e:
            log.exception("Error in topics_worker")
            time.sleep(SLEEP_IDLE)


if __name__ == "__main__":
    main()
||||
105
workers/translation_scheduler.py
Normal file
105
workers/translation_scheduler.py
Normal file
|
|
@ -0,0 +1,105 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Translation Scheduler Worker
|
||||
Creates translation jobs for news that need to be translated.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import logging
|
||||
from datetime import datetime
|
||||
|
||||
import psycopg2
|
||||
from psycopg2.extras import RealDictCursor
|
||||
from langdetect import detect, LangDetectException
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format='[%(asctime)s] %(levelname)s - %(message)s',
|
||||
datefmt='%Y-%m-%d %H:%M:%S'
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
DB_CONFIG = {
|
||||
'host': os.getenv('DB_HOST', 'db'),
|
||||
'port': int(os.getenv('DB_PORT', 5432)),
|
||||
'database': os.getenv('DB_NAME', 'rss'),
|
||||
'user': os.getenv('DB_USER', 'rss'),
|
||||
'password': os.getenv('DB_PASS', 'rss')
|
||||
}
|
||||
|
||||
TARGET_LANGS = os.getenv('TARGET_LANGS', 'es').split(',')
|
||||
BATCH_SIZE = int(os.getenv('SCHEDULER_BATCH', '2000'))
|
||||
SLEEP_INTERVAL = int(os.getenv('SCHEDULER_SLEEP', '30'))
|
||||
|
||||
# Common source languages to try
|
||||
SOURCE_LANGS = ['en', 'fr', 'pt', 'de', 'it', 'ru', 'zh', 'ja', 'ar', 'nl', 'pl', 'sv']
|
||||
|
||||
def get_db_connection():
|
||||
return psycopg2.connect(**DB_CONFIG)
|
||||
|
||||
def create_translation_jobs(conn):
|
||||
"""Create translation jobs for news without translations.
|
||||
Relies on langdetect_worker to have set the 'lang' column.
|
||||
"""
|
||||
created = 0
|
||||
|
||||
with conn.cursor(cursor_factory=RealDictCursor) as cur:
|
||||
for lang in TARGET_LANGS:
|
||||
lang = lang.strip()
|
||||
if not lang:
|
||||
continue
|
||||
|
||||
# Insert translation jobs for news that have a detected language
|
||||
# but don't have a translation record for the target language.
|
||||
cur.execute("""
|
||||
INSERT INTO traducciones (noticia_id, lang_from, lang_to, status, created_at)
|
||||
SELECT n.id, n.lang, %s, 'pending', NOW()
|
||||
FROM noticias n
|
||||
WHERE n.lang IS NOT NULL
|
||||
AND TRIM(n.lang) != ''
|
||||
AND n.lang != %s
|
||||
AND NOT EXISTS (
|
||||
SELECT 1 FROM traducciones t
|
||||
WHERE t.noticia_id = n.id AND t.lang_to = %s
|
||||
)
|
||||
ORDER BY n.fecha DESC
|
||||
LIMIT %s
|
||||
ON CONFLICT (noticia_id, lang_to) DO NOTHING
|
||||
RETURNING noticia_id
|
||||
""", (lang, lang, lang, BATCH_SIZE))
|
||||
|
||||
rows = cur.fetchall()
|
||||
if rows:
|
||||
created += len(rows)
|
||||
logger.info(f"Created {len(rows)} translation jobs for {lang}")
|
||||
|
||||
conn.commit()
|
||||
|
||||
return created
|
||||
|
||||
def process_translations():
|
||||
logger.info("Starting translation scheduler loop...")
|
||||
|
||||
while True:
|
||||
try:
|
||||
conn = get_db_connection()
|
||||
created = create_translation_jobs(conn)
|
||||
conn.close()
|
||||
|
||||
if created == 0:
|
||||
logger.info(f"No new news to schedule. Sleeping {SLEEP_INTERVAL}s...")
|
||||
time.sleep(SLEEP_INTERVAL)
|
||||
else:
|
||||
logger.info(f"Total jobs created in this cycle: {created}")
|
||||
# Short sleep to avoid hammer but keep momentum
|
||||
time.sleep(5)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Scheduler error: {e}")
|
||||
time.sleep(10)
|
||||
|
||||
if __name__ == '__main__':
|
||||
logger.info("Translation scheduler started")
|
||||
process_translations()
|
||||
|
|
@ -7,19 +7,15 @@ from typing import List, Optional
|
|||
import psycopg2
|
||||
import psycopg2.extras
|
||||
from psycopg2.extras import execute_values
|
||||
|
||||
import ctranslate2
|
||||
from transformers import AutoTokenizer
|
||||
from langdetect import detect, DetectorFactory
|
||||
import torch
|
||||
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
|
||||
|
||||
DetectorFactory.seed = 0
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s: %(message)s")
|
||||
LOG = logging.getLogger("translator")
|
||||
|
||||
# =========================
|
||||
# DB CONFIG
|
||||
# =========================
|
||||
DB_CONFIG = {
|
||||
"host": os.environ.get("DB_HOST", "localhost"),
|
||||
"port": int(os.environ.get("DB_PORT", 5432)),
|
||||
|
|
@ -28,9 +24,6 @@ DB_CONFIG = {
|
|||
"password": os.environ.get("DB_PASS", "x"),
|
||||
}
|
||||
|
||||
# =========================
|
||||
# ENV HELPERS
|
||||
# =========================
|
||||
def _env_list(name: str, default="es"):
|
||||
raw = os.environ.get(name)
|
||||
if raw:
|
||||
|
|
@ -55,37 +48,20 @@ def _env_str(name: str, default=None):
|
|||
v = os.environ.get(name)
|
||||
return v if v else default
|
||||
|
||||
# =========================
|
||||
# CONFIG
|
||||
# =========================
|
||||
TARGET_LANGS = _env_list("TARGET_LANGS") # por defecto ["es"]
|
||||
TARGET_LANGS = _env_list("TARGET_LANGS")
|
||||
BATCH_SIZE = _env_int("TRANSLATOR_BATCH", 8)
|
||||
ENQUEUE_MAX = _env_int("ENQUEUE", 200)
|
||||
SLEEP_IDLE = _env_float("TRANSLATOR_SLEEP_IDLE", 5.0)
|
||||
|
||||
# CTranslate2 Configuration
|
||||
CT2_MODEL_PATH = _env_str("CT2_MODEL_PATH", "./models/nllb-ct2")
|
||||
CT2_DEVICE = _env_str("CT2_DEVICE", "auto") # auto, cpu, cuda
|
||||
CT2_COMPUTE_TYPE = _env_str("CT2_COMPUTE_TYPE", "auto") # auto, int8, float16, int8_float16
|
||||
|
||||
MAX_SRC_TOKENS = _env_int("MAX_SRC_TOKENS", 512)
|
||||
MAX_NEW_TOKENS_TITLE = _env_int("MAX_NEW_TOKENS_TITLE", 96)
|
||||
MAX_NEW_TOKENS_BODY = _env_int("MAX_NEW_TOKENS_BODY", 512)
|
||||
|
||||
NUM_BEAMS_TITLE = _env_int("NUM_BEAMS_TITLE", 2)
|
||||
NUM_BEAMS_BODY = _env_int("NUM_BEAMS_BODY", 2)
|
||||
|
||||
# HuggingFace model name (used for tokenizer)
|
||||
UNIVERSAL_MODEL = _env_str("UNIVERSAL_MODEL", "facebook/nllb-200-distilled-600M")
|
||||
IDENTITY_PAISES_ES = _env_int("IDENTITY_PAISES_ES", 58)
|
||||
|
||||
BODY_CHARS_CHUNK = _env_int("BODY_CHARS_CHUNK", 900)
|
||||
|
||||
# =========================
|
||||
# LANG MAP
|
||||
# =========================
|
||||
NLLB_LANG = {
|
||||
"es": "spa_Latn", "en": "eng_Latn", "fr": "fra_Latn", "de": "deu_Latn",
|
||||
LANG_CODE_MAP = {
|
||||
"en": "eng_Latn", "es": "spa_Latn", "fr": "fra_Latn", "de": "deu_Latn",
|
||||
"it": "ita_Latn", "pt": "por_Latn", "nl": "nld_Latn", "sv": "swe_Latn",
|
||||
"da": "dan_Latn", "fi": "fin_Latn", "no": "nob_Latn",
|
||||
"pl": "pol_Latn", "cs": "ces_Latn", "sk": "slk_Latn",
|
||||
|
|
@ -96,286 +72,74 @@ NLLB_LANG = {
|
|||
"ko": "kor_Hang", "vi": "vie_Latn",
|
||||
}
|
||||
|
||||
def map_to_nllb(code: Optional[str]):
|
||||
if not code:
|
||||
return None
|
||||
c = code.strip().lower()
|
||||
return NLLB_LANG.get(c, f"{c}_Latn")
|
||||
_tokenizer = None
|
||||
_translator = None
|
||||
_device = None
|
||||
|
||||
def normalize_lang(code: Optional[str], default=None):
|
||||
return (code or default).strip().lower() if code else default
|
||||
|
||||
def _norm(s: str) -> str:
|
||||
return re.sub(r"\W+", "", (s or "").lower()).strip()
|
||||
|
||||
def _is_repetitive_output(text: str, threshold: float = 0.25) -> bool:
|
||||
"""Detect if translation output is repetitive/low quality.
|
||||
def get_translator_components():
|
||||
global _tokenizer, _translator, _device
|
||||
|
||||
Args:
|
||||
text: The translated text to check
|
||||
threshold: Minimum unique word ratio (default 0.25 = 25% unique words)
|
||||
if _translator:
|
||||
return _tokenizer, _translator
|
||||
|
||||
Returns:
|
||||
True if text appears to be repetitive/low quality
|
||||
"""
|
||||
if not text or len(text) < 50:
|
||||
return False
|
||||
device = 0 if torch.cuda.is_available() else -1
|
||||
LOG.info(f"Loading model {UNIVERSAL_MODEL} on {'cuda' if device == 0 else 'cpu'}")
|
||||
|
||||
# Check for obvious repetitive patterns
|
||||
repetitive_patterns = [
|
||||
r'(\b\w+\b)( \1){3,}', # Same word repeated 4+ times
|
||||
r'(\b\w+ \w+\b)( \1){2,}', # Same 2-word phrase repeated 3+ times
|
||||
r'de la la ',
|
||||
r'la línea de la línea',
|
||||
r'de Internet de Internet',
|
||||
]
|
||||
_tokenizer = AutoTokenizer.from_pretrained(UNIVERSAL_MODEL, src_lang="eng_Latn")
|
||||
model = AutoModelForSeq2SeqLM.from_pretrained(UNIVERSAL_MODEL)
|
||||
|
||||
for pattern in repetitive_patterns:
|
||||
if re.search(pattern, text, re.IGNORECASE):
|
||||
LOG.warning(f"Detected repetitive pattern: {pattern}")
|
||||
return True
|
||||
if device == 0:
|
||||
model = model.to("cuda")
|
||||
|
||||
# Check word diversity
|
||||
words = text.lower().split()
|
||||
if len(words) < 10:
|
||||
return False
|
||||
|
||||
unique_ratio = len(set(words)) / len(words)
|
||||
if unique_ratio < threshold:
|
||||
LOG.warning(f"Low word diversity: {unique_ratio:.2%} (threshold: {threshold:.2%})")
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
# =========================
|
||||
# DB
|
||||
# =========================
|
||||
def get_conn():
|
||||
return psycopg2.connect(**DB_CONFIG)
|
||||
|
||||
def ensure_indexes(conn):
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("CREATE INDEX IF NOT EXISTS traducciones_lang_to_status_idx ON traducciones (lang_to, status);")
|
||||
cur.execute("CREATE INDEX IF NOT EXISTS traducciones_status_idx ON traducciones (status);")
|
||||
conn.commit()
|
||||
|
||||
pass # Moved to translation_ops.py
|
||||
|
||||
pass # Moved to translation_ops.py
|
||||
|
||||
def fetch_pending_batch(conn, lang_to: str, batch: int):
|
||||
"""Fetch pending translations with row locking to support multiple workers."""
|
||||
if batch <= 0:
|
||||
return []
|
||||
|
||||
# Use FOR UPDATE SKIP LOCKED to allow multiple workers
|
||||
# Each worker will get different rows without conflicts
|
||||
with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
|
||||
cur.execute(
|
||||
"""
|
||||
SELECT t.id AS tr_id, t.noticia_id, t.lang_from, t.lang_to,
|
||||
n.titulo, n.resumen
|
||||
FROM traducciones t
|
||||
JOIN noticias n ON n.id=t.noticia_id
|
||||
WHERE t.lang_to=%s AND t.status='pending'
|
||||
ORDER BY t.id
|
||||
LIMIT %s
|
||||
FOR UPDATE OF t SKIP LOCKED;
|
||||
""",
|
||||
(lang_to, batch),
|
||||
)
|
||||
rows = cur.fetchall()
|
||||
|
||||
# Update status within the same transaction while rows are locked
|
||||
if rows:
|
||||
ids = [r["tr_id"] for r in rows]
|
||||
cur.execute("UPDATE traducciones SET status='processing' WHERE id = ANY(%s)", (ids,))
|
||||
|
||||
conn.commit()
|
||||
return rows
|
||||
|
||||
# =========================
|
||||
# LANGUAGE DETECTION
|
||||
# =========================
|
||||
def detect_lang(text1: str, text2: str):
|
||||
txt = (text1 or "").strip() or (text2 or "").strip()
|
||||
if not txt:
|
||||
return None
|
||||
try:
|
||||
return detect(txt)
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
# =========================
|
||||
# MODEL LOADING (CTranslate2)
|
||||
# =========================
|
||||
_TOKENIZER = None
|
||||
_TRANSLATOR = None
|
||||
_DEVICE = None
|
||||
|
||||
def _resolve_device():
|
||||
if CT2_DEVICE == "cpu":
|
||||
return "cpu"
|
||||
if CT2_DEVICE == "cuda":
|
||||
return "cuda"
|
||||
# auto
|
||||
return "cuda" if ctranslate2.get_cuda_device_count() > 0 else "cpu"
|
||||
|
||||
def _ensure_ct2_model():
|
||||
"""Convert HuggingFace model to CTranslate2 format if not exists."""
|
||||
import os
|
||||
import subprocess
|
||||
|
||||
model_dir = CT2_MODEL_PATH
|
||||
|
||||
# Check if model already exists
|
||||
if os.path.isdir(model_dir) and os.path.exists(os.path.join(model_dir, "model.bin")):
|
||||
LOG.info("CTranslate2 model already exists at %s", model_dir)
|
||||
return True
|
||||
|
||||
LOG.info("CTranslate2 model not found, converting from %s...", UNIVERSAL_MODEL)
|
||||
LOG.info("This may take 5-10 minutes on first run...")
|
||||
|
||||
# Create directory if needed
|
||||
os.makedirs(os.path.dirname(model_dir) or ".", exist_ok=True)
|
||||
|
||||
# Convert the model
|
||||
quantization = CT2_COMPUTE_TYPE if CT2_COMPUTE_TYPE != "auto" else "int8_float16"
|
||||
|
||||
cmd = [
|
||||
"ct2-transformers-converter",
|
||||
"--model", UNIVERSAL_MODEL,
|
||||
"--output_dir", model_dir,
|
||||
"--quantization", quantization,
|
||||
"--force"
|
||||
]
|
||||
|
||||
try:
|
||||
LOG.info("Running: %s", " ".join(cmd))
|
||||
result = subprocess.run(cmd, capture_output=True, text=True, timeout=1800)
|
||||
|
||||
if result.returncode != 0:
|
||||
LOG.error("Model conversion failed: %s", result.stderr)
|
||||
return False
|
||||
|
||||
LOG.info("Model conversion completed successfully")
|
||||
return True
|
||||
|
||||
except subprocess.TimeoutExpired:
|
||||
LOG.error("Model conversion timed out after 30 minutes")
|
||||
return False
|
||||
except Exception as e:
|
||||
LOG.error("Model conversion error: %s", e)
|
||||
return False
|
||||
|
||||
def get_universal_components():
|
||||
global _TOKENIZER, _TRANSLATOR, _DEVICE
|
||||
if _TRANSLATOR:
|
||||
return _TOKENIZER, _TRANSLATOR
|
||||
|
||||
# Ensure CT2 model exists (convert if needed)
|
||||
if not _ensure_ct2_model():
|
||||
raise RuntimeError(f"Failed to load/convert CTranslate2 model at {CT2_MODEL_PATH}")
|
||||
|
||||
device = _resolve_device()
|
||||
|
||||
LOG.info("Loading CTranslate2 model from %s on %s", CT2_MODEL_PATH, device)
|
||||
|
||||
_TRANSLATOR = ctranslate2.Translator(
|
||||
CT2_MODEL_PATH,
|
||||
_translator = pipeline(
|
||||
"translation",
|
||||
model=model,
|
||||
tokenizer=_tokenizer,
|
||||
device=device,
|
||||
compute_type=CT2_COMPUTE_TYPE,
|
||||
max_length=MAX_SRC_TOKENS,
|
||||
)
|
||||
_TOKENIZER = AutoTokenizer.from_pretrained(UNIVERSAL_MODEL)
|
||||
_DEVICE = device
|
||||
|
||||
LOG.info("CTranslate2 model loaded successfully")
|
||||
return _TOKENIZER, _TRANSLATOR
|
||||
_device = "cuda" if device == 0 else "cpu"
|
||||
LOG.info(f"Model loaded on {_device}")
|
||||
|
||||
return _tokenizer, _translator
|
||||
|
||||
# =========================
|
||||
# TRANSLATION (CTranslate2)
|
||||
# =========================
|
||||
def _safe_src_len(tokenizer):
|
||||
max_len = getattr(tokenizer, "model_max_length", 1024) or 1024
|
||||
if max_len > 100000:
|
||||
max_len = 1024
|
||||
return min(MAX_SRC_TOKENS, max_len - 16)
|
||||
|
||||
def _translate_texts(src, tgt, texts, beams, max_new_tokens):
|
||||
"""Translate texts using CTranslate2."""
|
||||
def translate_texts(src: str, tgt: str, texts: List[str]) -> List[str]:
|
||||
if not texts:
|
||||
return []
|
||||
|
||||
clean = [(t or "").strip() for t in texts]
|
||||
if all(not t for t in clean):
|
||||
return ["" for _ in clean]
|
||||
|
||||
tok, translator = get_translator_components()
|
||||
|
||||
src_code = LANG_CODE_MAP.get(src, f"{src}_Latn")
|
||||
tgt_code = LANG_CODE_MAP.get(tgt, "spa_Latn")
|
||||
|
||||
results = []
|
||||
for text in clean:
|
||||
if not text:
|
||||
results.append("")
|
||||
continue
|
||||
try:
|
||||
result = translator(text, src_lang=src_code, tgt_lang=tgt_code)
|
||||
results.append(result[0]["translation_text"])
|
||||
except Exception as e:
|
||||
LOG.warning(f"Translation error: {e}")
|
||||
results.append(text)
|
||||
|
||||
return results
|
||||
|
||||
tok, translator = get_universal_components()
|
||||
src_code = map_to_nllb(src)
|
||||
tgt_code = map_to_nllb(tgt)
|
||||
|
||||
# Set source language on tokenizer
|
||||
try:
|
||||
tok.src_lang = src_code
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
safe_len = _safe_src_len(tok)
|
||||
max_new = max(16, min(int(max_new_tokens), MAX_NEW_TOKENS_BODY))
|
||||
|
||||
# Tokenize: convert text to tokens
|
||||
sources = []
|
||||
for t in clean:
|
||||
if t:
|
||||
ids = tok.encode(t, truncation=True, max_length=safe_len)
|
||||
tokens = tok.convert_ids_to_tokens(ids)
|
||||
sources.append(tokens)
|
||||
else:
|
||||
sources.append([])
|
||||
|
||||
# Target language prefix for NLLB
|
||||
target_prefix = [[tgt_code]] * len(sources)
|
||||
|
||||
# Translate with CTranslate2
|
||||
start = time.time()
|
||||
results = translator.translate_batch(
|
||||
sources,
|
||||
target_prefix=target_prefix,
|
||||
beam_size=beams,
|
||||
max_decoding_length=max_new,
|
||||
repetition_penalty=2.5, # Increased from 1.2 to prevent loops
|
||||
no_repeat_ngram_size=3, # Prevent 3-gram repetition
|
||||
)
|
||||
dt = time.time() - start
|
||||
|
||||
# Decode results
|
||||
translated = []
|
||||
total_tokens = 0
|
||||
for result, src_tokens in zip(results, sources):
|
||||
if result.hypotheses:
|
||||
# Skip the first token (language prefix)
|
||||
tokens = result.hypotheses[0][1:]
|
||||
total_tokens += len(tokens) + len(src_tokens)
|
||||
text = tok.decode(tok.convert_tokens_to_ids(tokens))
|
||||
translated.append(text.strip())
|
||||
else:
|
||||
translated.append("")
|
||||
|
||||
if total_tokens > 0:
|
||||
LOG.info(" → tokens=%d tiempo=%.2fs velocidad=%d tok/s",
|
||||
total_tokens, dt, int(total_tokens / dt) if dt > 0 else 0)
|
||||
|
||||
return translated
|
||||
|
||||
def _split_body_into_chunks(text: str) -> List[str]:
|
||||
def split_body_into_chunks(text: str) -> List[str]:
|
||||
text = (text or "").strip()
|
||||
if len(text) <= BODY_CHARS_CHUNK:
|
||||
return [text] if text else []
|
||||
|
||||
|
||||
parts = re.split(r'(\n\n+|(?<=[\.\!\?؛؟。])\s+)', text)
|
||||
chunks = []
|
||||
current = ""
|
||||
|
||||
|
||||
for part in parts:
|
||||
if not part:
|
||||
continue
|
||||
|
|
@ -387,260 +151,145 @@ def _split_body_into_chunks(text: str) -> List[str]:
|
|||
current = part
|
||||
if current.strip():
|
||||
chunks.append(current.strip())
|
||||
|
||||
if not chunks:
|
||||
return [text]
|
||||
return chunks
|
||||
|
||||
return chunks if chunks else [text]
|
||||
|
||||
def translate_body_long(src: str, tgt: str, body: str) -> str:
|
||||
body = (body or "").strip()
|
||||
if not body:
|
||||
return ""
|
||||
|
||||
chunks = _split_body_into_chunks(body)
|
||||
|
||||
chunks = split_body_into_chunks(body)
|
||||
if len(chunks) == 1:
|
||||
translated = _translate_texts(src, tgt, [body], NUM_BEAMS_BODY, MAX_NEW_TOKENS_BODY)[0]
|
||||
return translated.strip()
|
||||
|
||||
return translate_texts(src, tgt, [body])[0].strip()
|
||||
|
||||
translated_chunks = []
|
||||
for ch in chunks:
|
||||
tr = _translate_texts(src, tgt, [ch], NUM_BEAMS_BODY, MAX_NEW_TOKENS_BODY)[0]
|
||||
translated_chunks.append(tr.strip())
|
||||
return "\n\n".join(c for c in translated_chunks if c)
|
||||
tr = translate_texts(src, tgt, [ch])[0]
|
||||
translated_chunks.append(tr)
|
||||
|
||||
return " ".join(translated_chunks)
|
||||
|
||||
def normalize_lang(lang: Optional[str], default: str = "es") -> Optional[str]:
|
||||
if not lang:
|
||||
return default
|
||||
lang = lang.strip().lower()[:2]
|
||||
return lang if lang else default
|
||||
|
||||
def detect_lang(text: str) -> str:
|
||||
if not text or len(text) < 10:
|
||||
return "en"
|
||||
try:
|
||||
return detect(text)
|
||||
except Exception:
|
||||
return "en"
|
||||
|
||||
# =========================
|
||||
# BATCH PROCESS
|
||||
# =========================
|
||||
def process_batch(conn, rows):
|
||||
todo = []
|
||||
done = []
|
||||
errors = []
|
||||
|
||||
|
||||
for r in rows:
|
||||
lang_to = normalize_lang(r["lang_to"], "es") or "es"
|
||||
lang_from = (
|
||||
normalize_lang(r["lang_from"])
|
||||
or detect_lang(r["titulo"], r["resumen"])
|
||||
or "en"
|
||||
)
|
||||
|
||||
titulo = (r["titulo"] or "").strip()
|
||||
resumen = (r["resumen"] or "").strip()
|
||||
|
||||
if map_to_nllb(lang_from) == map_to_nllb(lang_to):
|
||||
done.append((titulo, resumen, lang_from, r["tr_id"]))
|
||||
else:
|
||||
todo.append({
|
||||
"tr_id": r["tr_id"],
|
||||
"lang_from": lang_from,
|
||||
"lang_to": lang_to,
|
||||
"titulo": titulo,
|
||||
"resumen": resumen,
|
||||
})
|
||||
|
||||
lang_to = normalize_lang(r.get("lang_to"), "es") or "es"
|
||||
lang_from = normalize_lang(r.get("lang_from")) or detect_lang(r.get("titulo") or "")
|
||||
|
||||
titulo = (r.get("titulo") or "").strip()
|
||||
resumen = (r.get("resumen") or "").strip()
|
||||
|
||||
if lang_from == lang_to:
|
||||
continue
|
||||
|
||||
todo.append({
|
||||
"tr_id": r.get("tr_id"),
|
||||
"lang_from": lang_from,
|
||||
"lang_to": lang_to,
|
||||
"titulo": titulo,
|
||||
"resumen": resumen,
|
||||
})
|
||||
|
||||
if not todo:
|
||||
return
|
||||
|
||||
from collections import defaultdict
|
||||
groups = defaultdict(list)
|
||||
for item in todo:
|
||||
key = (item["lang_from"], item["lang_to"])
|
||||
groups[key].append(item)
|
||||
|
||||
|
||||
for (lang_from, lang_to), items in groups.items():
|
||||
LOG.info("Translating %s -> %s (%d items)", lang_from, lang_to, len(items))
|
||||
|
||||
LOG.info(f"Translating {lang_from} -> {lang_to} ({len(items)} items)")
|
||||
|
||||
titles = [i["titulo"] for i in items]
|
||||
|
||||
try:
|
||||
tt = _translate_texts(
|
||||
lang_from,
|
||||
lang_to,
|
||||
titles,
|
||||
NUM_BEAMS_TITLE,
|
||||
MAX_NEW_TOKENS_TITLE,
|
||||
)
|
||||
|
||||
bodies_translated: List[str] = []
|
||||
for i in items:
|
||||
bodies_translated.append(
|
||||
translate_body_long(lang_from, lang_to, i["resumen"])
|
||||
)
|
||||
|
||||
for i, ttr, btr in zip(items, tt, bodies_translated):
|
||||
ttr = (ttr or "").strip()
|
||||
btr = (btr or "").strip()
|
||||
|
||||
if not ttr or _norm(ttr) == _norm(i["titulo"]):
|
||||
ttr = i["titulo"]
|
||||
if not btr or _norm(btr) == _norm(i["resumen"]):
|
||||
btr = i["resumen"]
|
||||
|
||||
# CLEANING: Remove <unk> tokens
|
||||
if ttr:
|
||||
ttr = ttr.replace("<unk>", "").replace(" ", " ").strip()
|
||||
if btr:
|
||||
btr = btr.replace("<unk>", "").replace(" ", " ").strip()
|
||||
|
||||
# VALIDATION: Check for repetitive output
|
||||
if _is_repetitive_output(ttr) or _is_repetitive_output(btr):
|
||||
LOG.warning(f"Rejecting repetitive translation for tr_id={i['tr_id']}")
|
||||
errors.append(("Repetitive output detected", i["tr_id"]))
|
||||
continue
|
||||
|
||||
done.append((ttr, btr, lang_from, i["tr_id"]))
|
||||
|
||||
except Exception as e:
|
||||
err = str(e)[:800]
|
||||
LOG.exception("Error translating %s -> %s: %s", lang_from, lang_to, err)
|
||||
for i in items:
|
||||
errors.append((err, i["tr_id"]))
|
||||
|
||||
with conn.cursor() as cur:
|
||||
if done:
|
||||
execute_values(
|
||||
cur,
|
||||
"""
|
||||
UPDATE traducciones AS t
|
||||
SET titulo_trad=v.titulo_trad,
|
||||
resumen_trad=v.resumen_trad,
|
||||
lang_from=COALESCE(t.lang_from, v.lang_from),
|
||||
status='done',
|
||||
error=NULL
|
||||
FROM (VALUES %s) AS v(titulo_trad,resumen_trad,lang_from,id)
|
||||
WHERE t.id=v.id;
|
||||
""",
|
||||
done,
|
||||
)
|
||||
|
||||
# --- NEW: Persist stats ---
|
||||
# Insert a record for each translated item into translation_stats
|
||||
# We need the language 'lang_to'. In this batch, lang_to is uniform for the group usually,
|
||||
# but let's extract it from the 'done' items structure if we had it, or pass it down.
|
||||
# In process_batch, we iterate groups.
|
||||
# 'done' list here is flattened from multiple groups?
|
||||
# process_batch logic:
|
||||
# 1. 'done' checks map_to_nllb identity (already done?) -> these have lang_to from row?
|
||||
# 2. 'groups' loop -> translates -> appends to 'done' with lang_from.
|
||||
#
|
||||
# Wait, 'done' list doesn't have lang_to in the tuple: (titulo, resumen, lang_from, tr_id).
|
||||
# We need to change the 'done' collection to include lang_to OR we insert based on tr_id.
|
||||
|
||||
# Let's verify process_batch logic.
|
||||
# rows has all info.
|
||||
# define a mapping tr_id -> lang_to
|
||||
tr_map = {r["tr_id"]: r["lang_to"] for r in rows}
|
||||
|
||||
stats_data = []
|
||||
for item in done:
|
||||
# item is (titulo, resumen, lang_from, tr_id)
|
||||
lang_from = item[2]
|
||||
lang_to = tr_map.get(item[3], "es")
|
||||
stats_data.append((lang_from, lang_to))
|
||||
|
||||
execute_values(
|
||||
cur,
|
||||
"INSERT INTO translation_stats (lang_from, lang_to) VALUES %s",
|
||||
stats_data
|
||||
)
|
||||
# --------------------------
|
||||
|
||||
if errors:
|
||||
execute_values(
|
||||
cur,
|
||||
"""
|
||||
UPDATE traducciones AS t
|
||||
SET status='error', error=v.error
|
||||
FROM (VALUES %s) AS v(error,id)
|
||||
WHERE t.id=v.id;
|
||||
""",
|
||||
errors,
|
||||
)
|
||||
|
||||
conn.commit()
|
||||
|
||||
def process_entity_summaries(conn):
|
||||
"""Translate pending entity summaries from Wikipedia."""
|
||||
from cache import cache_del
|
||||
|
||||
LOG.info("DEBUG: Checking for pending entity summaries...")
|
||||
|
||||
with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
|
||||
cur.execute("""
|
||||
SELECT id, entity_name, summary, summary_en
|
||||
FROM entity_images
|
||||
WHERE status_es = 'pending'
|
||||
LIMIT 20
|
||||
FOR UPDATE SKIP LOCKED;
|
||||
""")
|
||||
rows = cur.fetchall()
|
||||
translated_titles = translate_texts(lang_from, lang_to, titles)
|
||||
|
||||
if not rows:
|
||||
return False
|
||||
|
||||
LOG.info("DEBUG: Found %d pending entity summaries to process", len(rows))
|
||||
translated_bodies = []
|
||||
for i in items:
|
||||
body = (i["resumen"] or "").strip()
|
||||
if body:
|
||||
tr = translate_body_long(lang_from, lang_to, body)
|
||||
translated_bodies.append(tr)
|
||||
else:
|
||||
translated_bodies.append("")
|
||||
|
||||
for r in rows:
|
||||
entity_id = r["id"]
|
||||
name = r["entity_name"]
|
||||
text = r["summary_en"] or r["summary"]
|
||||
cursor = conn.cursor()
|
||||
for item, tt, tb in zip(items, translated_titles, translated_bodies):
|
||||
tt = (tt or "").strip()
|
||||
tb = (tb or "").strip()
|
||||
|
||||
if not tt:
|
||||
tt = item["titulo"]
|
||||
if not tb:
|
||||
tb = item["resumen"]
|
||||
|
||||
if not text:
|
||||
cur.execute("UPDATE entity_images SET status_es = 'none' WHERE id = %s", (entity_id,))
|
||||
continue
|
||||
|
||||
try:
|
||||
# English -> Spanish
|
||||
translated = translate_body_long('en', 'es', text)
|
||||
if translated:
|
||||
cur.execute("""
|
||||
UPDATE entity_images
|
||||
SET summary_es = %s, status_es = 'done'
|
||||
WHERE id = %s
|
||||
""", (translated, entity_id))
|
||||
# Invalidate cache
|
||||
cache_del(f"wiki:data:{name.lower()}")
|
||||
LOG.info(" → Translated entity summary: %s", name)
|
||||
else:
|
||||
cur.execute("UPDATE entity_images SET status_es = 'error' WHERE id = %s", (entity_id,))
|
||||
cursor.execute("""
|
||||
UPDATE traducciones
|
||||
SET titulo_trad = %s, resumen_trad = %s, lang_to = %s
|
||||
WHERE id = %s
|
||||
""", (tt, tb, lang_to, item["tr_id"]))
|
||||
except Exception as e:
|
||||
LOG.error("Error translating entity summary [%s]: %s", name, e)
|
||||
cur.execute("UPDATE entity_images SET status_es = 'error' WHERE id = %s", (entity_id,))
|
||||
LOG.error(f"Update error: {e}")
|
||||
|
||||
conn.commit()
|
||||
return True
|
||||
cursor.close()
|
||||
LOG.info(f"Translated {len(items)} items")
|
||||
|
||||
def fetch_pending_translations(conn):
|
||||
cursor = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
|
||||
|
||||
for lang in TARGET_LANGS:
|
||||
cursor.execute("""
|
||||
SELECT t.id as tr_id, t.lang_from, t.lang_to,
|
||||
n.titulo, n.resumen, n.id as noticia_id
|
||||
FROM traducciones t
|
||||
JOIN noticias n ON n.id = t.noticia_id
|
||||
WHERE t.lang_to = %s
|
||||
AND (t.titulo_trad IS NULL OR t.resumen_trad IS NULL)
|
||||
ORDER BY n.fecha DESC
|
||||
LIMIT %s
|
||||
""", (lang, BATCH_SIZE))
|
||||
|
||||
rows = cursor.fetchall()
|
||||
if rows:
|
||||
LOG.info(f"Found {len(rows)} pending translations for {lang}")
|
||||
process_batch(conn, rows)
|
||||
|
||||
cursor.close()
|
||||
|
||||
def connect_db():
|
||||
return psycopg2.connect(**DB_CONFIG)
|
||||
|
||||
# =========================
|
||||
# MAIN LOOP
|
||||
# =========================
|
||||
def main():
|
||||
LOG.info("Translator worker iniciado (CTranslate2)")
|
||||
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
|
||||
get_universal_components()
|
||||
|
||||
LOG.info("Translation worker started (transformers)")
|
||||
get_translator_components()
|
||||
|
||||
while True:
|
||||
any_work = False
|
||||
with get_conn() as conn:
|
||||
ensure_indexes(conn)
|
||||
|
||||
# 1. Process entity summaries (Wikipedia) -> REMOVED per user request
|
||||
# Logic moved out to keep translator focused on news ONLY.
|
||||
# try:
|
||||
# if process_entity_summaries(conn):
|
||||
# any_work = True
|
||||
# except Exception as e:
|
||||
# LOG.error("Error in process_entity_summaries: %s", e)
|
||||
|
||||
# 2. Process news translations
|
||||
for tgt in TARGET_LANGS:
|
||||
while True:
|
||||
rows = fetch_pending_batch(conn, tgt, BATCH_SIZE)
|
||||
if not rows:
|
||||
break
|
||||
any_work = True
|
||||
LOG.info("[%s] %d elementos", tgt, len(rows))
|
||||
process_batch(conn, rows)
|
||||
|
||||
if not any_work:
|
||||
time.sleep(SLEEP_IDLE)
|
||||
try:
|
||||
conn = connect_db()
|
||||
fetch_pending_translations(conn)
|
||||
conn.close()
|
||||
except Exception as e:
|
||||
LOG.error(f"Error: {e}")
|
||||
|
||||
time.sleep(30)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
|
|
|
|||
|
|
@ -1,471 +0,0 @@
|
|||
"""
|
||||
URL Feed Discovery Worker
|
||||
This worker automatically discovers RSS feeds from URLs stored in fuentes_url table
|
||||
and creates entries in the feeds table (or feeds_pending for review).
|
||||
Runs every 15 minutes.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import logging
|
||||
from datetime import datetime
|
||||
from typing import List, Dict
|
||||
|
||||
# Add parent directory to path to import modules
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
from db import get_conn
|
||||
from utils.feed_discovery import discover_feeds, get_feed_metadata
|
||||
from utils.feed_analysis import (
|
||||
analyze_feed,
|
||||
get_country_id_by_name,
|
||||
get_category_id_by_name
|
||||
)
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Configuration
|
||||
CHECK_INTERVAL = int(os.getenv('URL_DISCOVERY_INTERVAL_MIN', '15')) * 60 # Default: 15 minutes
|
||||
BATCH_SIZE = int(os.getenv('URL_DISCOVERY_BATCH_SIZE', '10')) # Process URLs in batches
|
||||
MAX_FEEDS_PER_URL = int(os.getenv('MAX_FEEDS_PER_URL', '5')) # Max feeds to create per URL
|
||||
|
||||
|
||||
def get_pending_urls(limit: int = BATCH_SIZE) -> List[Dict]:
|
||||
"""
|
||||
Get URLs that need to be processed.
|
||||
Priority: never checked > failed checks > oldest successful checks
|
||||
"""
|
||||
with get_conn() as conn:
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("""
|
||||
SELECT id, nombre, url, categoria_id, pais_id, idioma, last_status
|
||||
FROM fuentes_url
|
||||
WHERE active = TRUE
|
||||
ORDER BY
|
||||
CASE
|
||||
WHEN last_check IS NULL THEN 1 -- Never checked (highest priority)
|
||||
WHEN last_status = 'error' THEN 2 -- Failed checks
|
||||
WHEN last_status = 'no_feeds' THEN 3 -- No feeds found
|
||||
ELSE 4 -- Successful checks (lowest priority)
|
||||
END,
|
||||
last_check ASC NULLS FIRST
|
||||
LIMIT %s
|
||||
""", (limit,))
|
||||
|
||||
columns = [desc[0] for desc in cur.description]
|
||||
return [dict(zip(columns, row)) for row in cur.fetchall()]
|
||||
|
||||
|
||||
def update_url_status(url_id: int, status: str, message: str = None, http_code: int = None):
|
||||
"""Update the status of a URL source"""
|
||||
with get_conn() as conn:
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("""
|
||||
UPDATE fuentes_url
|
||||
SET last_check = NOW(),
|
||||
last_status = %s,
|
||||
status_message = %s,
|
||||
last_http_code = %s
|
||||
WHERE id = %s
|
||||
""", (status, message, http_code, url_id))
|
||||
conn.commit()
|
||||
|
||||
|
||||
def create_pending_feed(
    fuente_url_id: int,
    feed_url: str,
    metadata: Dict,
    analysis: Dict,
    categoria_id: int = None,
    pais_id: int = None,
    idioma: str = None
) -> bool:
    """
    Create (or refresh) a pending feed entry in ``feeds_pending`` for manual review.

    Country and category hints from *analysis* are resolved to IDs via the
    lookup helpers. On URL conflict the existing row's title/description and
    ``discovered_at`` are refreshed instead of inserting.

    Returns True when a row id was returned, False on error.
    """
    try:
        with get_conn() as conn:
            # Get detected country ID
            detected_country_id = None
            if analysis.get('detected_country'):
                detected_country_id = get_country_id_by_name(conn, analysis['detected_country'])

            # Get suggested category ID
            suggested_categoria_id = None
            if analysis.get('suggested_category'):
                suggested_categoria_id = get_category_id_by_name(conn, analysis['suggested_category'])

            # FIX: dict.get(key, default) returns None when the key exists with a
            # None value, and slicing None raises TypeError. Coalesce explicitly
            # before truncating/passing to the DB.
            description = (metadata.get('description') or '')[:500]
            notes = analysis.get('analysis_notes') or ''

            with conn.cursor() as cur:
                cur.execute("""
                    INSERT INTO feeds_pending (
                        fuente_url_id, feed_url, feed_title, feed_description,
                        feed_language, feed_type, entry_count,
                        detected_country_id, suggested_categoria_id,
                        categoria_id, pais_id, idioma, notes
                    )
                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                    ON CONFLICT (feed_url) DO UPDATE
                    SET feed_title = EXCLUDED.feed_title,
                        feed_description = EXCLUDED.feed_description,
                        discovered_at = NOW()
                    RETURNING id
                """, (
                    fuente_url_id,
                    feed_url,
                    metadata.get('title', 'Feed sin título'),
                    description,
                    analysis.get('language'),
                    'rss',  # Default type
                    metadata.get('entry_count', 0),
                    detected_country_id,
                    suggested_categoria_id,
                    categoria_id,
                    pais_id,
                    idioma,
                    notes
                ))

                result = cur.fetchone()
                conn.commit()

                # NOTE(review): with ON CONFLICT ... DO UPDATE + RETURNING, a row id
                # is returned for both insert and update, so the else branch below
                # is effectively unreachable — kept for behavioral compatibility.
                if result:
                    logger.info(f"Created pending feed for review: {metadata.get('title')} ({feed_url})")
                    return True
                else:
                    logger.debug(f"Pending feed updated: {feed_url}")
                    return False

    except Exception as e:
        logger.error(f"Error creating pending feed {feed_url}: {e}")
        return False
|
||||
|
||||
|
||||
def create_feed_from_metadata(
    feed_url: str,
    fuente_url_id: int = None,
    categoria_id: int = None,
    pais_id: int = None,
    idioma: str = None,
    auto_approve: bool = False,
    context_title: str = None
) -> Dict:
    """
    Create a feed entry from discovered feed URL with intelligent analysis.

    Fetches metadata for *feed_url*, runs analyze_feed() over it, and either
    inserts the feed directly into ``feeds`` (when auto_approve is True and
    both categoria_id and pais_id are present) or routes it to the
    ``feeds_pending`` review queue via create_pending_feed().

    Returns:
        {
            'created': True/False,
            'pending': True/False,
            'status': 'created'/'pending'/'existing'/'error',
            'message': 'Description'
        }
    """
    # Default result: pessimistic 'error' until a branch overwrites it.
    result = {
        'created': False,
        'pending': False,
        'status': 'error',
        'message': ''
    }

    try:
        # Get feed metadata (network fetch with a 10s timeout)
        metadata = get_feed_metadata(feed_url, timeout=10)

        if not metadata:
            result['message'] = 'No se pudo obtener metadata del feed'
            logger.warning(f"{result['message']}: {feed_url}")
            return result

        # Add URL to metadata for analysis
        metadata['url'] = feed_url

        # Use context title if provided, otherwise use metadata title
        # This helps when feed XML title is generic (e.g. "RSS Feed") but site link had meaningful text
        feed_title = context_title if context_title else metadata.get('title', 'Feed sin título')
        # Update metadata for consistency in pending feeds AND analysis
        metadata['title'] = feed_title

        # Perform intelligent analysis (language/country/category hints)
        analysis = analyze_feed(metadata)

        # Determine if we need manual review
        needs_review = False

        # If parent URL has no category or country, we need review
        if not categoria_id or not pais_id:
            needs_review = True
            logger.info(f"Feed needs review (missing categoria_id={categoria_id} or pais_id={pais_id})")

        # If auto_approve is disabled, we need review
        if not auto_approve:
            needs_review = True

        # Enhance metadata with analysis: fall back to the detected language
        # only when the caller did not supply one.
        if not idioma and analysis.get('language'):
            idioma = analysis['language']

        # If needs review, create pending feed and return early — the direct
        # insert below never runs in that case.
        if needs_review:
            created_pending = create_pending_feed(
                fuente_url_id=fuente_url_id,
                feed_url=feed_url,
                metadata=metadata,
                analysis=analysis,
                categoria_id=categoria_id,
                pais_id=pais_id,
                idioma=idioma
            )

            result['pending'] = created_pending
            result['status'] = 'pending'
            result['message'] = f"Feed creado y pendiente de revisión (país: {analysis.get('detected_country', 'N/A')}, categoría sugerida: {analysis.get('suggested_category', 'N/A')})"
            return result

        # Otherwise, create feed directly
        nombre = feed_title
        descripcion = metadata.get('description', '')

        with get_conn() as conn:
            with conn.cursor() as cur:
                # ON CONFLICT DO NOTHING + RETURNING: fetchone() is None when the
                # URL already existed, which drives the 'existing' branch below.
                cur.execute("""
                    INSERT INTO feeds (nombre, descripcion, url, categoria_id, pais_id, idioma, fuente_url_id, activo)
                    VALUES (%s, %s, %s, %s, %s, %s, %s, TRUE)
                    ON CONFLICT (url) DO NOTHING
                    RETURNING id
                """, (
                    nombre,
                    descripcion[:500] if descripcion else None,
                    feed_url,
                    categoria_id,
                    pais_id,
                    idioma,
                    fuente_url_id
                ))

                feed_result = cur.fetchone()
                conn.commit()

                if feed_result:
                    logger.info(f"Created new feed: {nombre} ({feed_url})")
                    result['created'] = True
                    result['status'] = 'created'
                    result['message'] = f"Feed creado exitosamente"
                else:
                    logger.debug(f"Feed already exists: {feed_url}")
                    result['status'] = 'existing'
                    result['message'] = 'El feed ya existe'

    except Exception as e:
        logger.error(f"Error creating feed from {feed_url}: {e}")
        result['message'] = str(e)
        result['status'] = 'error'

    return result
|
||||
|
||||
|
||||
def process_url_source(url_data: Dict) -> Dict:
    """
    Process a single URL source to discover and create feeds.

    Discovers candidate feeds from the source URL, creates each one via
    create_feed_from_metadata() (auto-approving only when the parent row has
    both category and country), and records the outcome on the parent
    ``fuentes_url`` row.

    Returns statistics about the operation:
    {'url_id', 'url', 'discovered', 'created', 'pending', 'existing',
     'errors', 'status'}.
    """
    url_id = url_data['id']
    source_url = url_data['url']
    nombre = url_data['nombre']
    categoria_id = url_data['categoria_id']
    pais_id = url_data['pais_id']
    idioma = url_data['idioma']

    logger.info(f"Processing URL source: {nombre} ({source_url})")
    logger.info(f"  Parent metadata: categoria_id={categoria_id}, pais_id={pais_id}, idioma={idioma}")

    # Counters for this source; 'status' is filled in at the end of each path.
    stats = {
        'url_id': url_id,
        'url': source_url,
        'discovered': 0,
        'created': 0,
        'pending': 0,
        'existing': 0,
        'errors': 0,
        'status': 'unknown'
    }

    try:
        # Discover feeds from URL (network fetch, 15s timeout)
        discovered = discover_feeds(source_url, timeout=15)
        stats['discovered'] = len(discovered)

        if not discovered:
            logger.warning(f"No feeds discovered from: {source_url}")
            update_url_status(url_id, 'no_feeds', 'No se encontraron feeds RSS', 200)
            stats['status'] = 'no_feeds'
            return stats

        # Filter only valid feeds (discover_feeds marks each with 'valid')
        valid_feeds = [f for f in discovered if f.get('valid', False)]

        if not valid_feeds:
            logger.warning(f"No valid feeds found for: {source_url}")
            update_url_status(url_id, 'no_valid_feeds', f'Se encontraron {len(discovered)} feeds pero ninguno válido')
            stats['status'] = 'no_valid_feeds'
            return stats

        # Limit number of feeds per URL
        feeds_to_create = valid_feeds[:MAX_FEEDS_PER_URL]

        logger.info(f"Found {len(valid_feeds)} valid feeds, processing up to {len(feeds_to_create)}")

        # Determine if auto-approve (parent has category AND country)
        auto_approve = bool(categoria_id and pais_id)

        if not auto_approve:
            logger.info("→ Feeds will require manual review (parent lacks category or country)")
        else:
            logger.info("→ Feeds will be auto-approved (parent has complete metadata)")

        # Create feeds — each one independently; a failure on one feed does
        # not abort the rest of the batch for this source.
        for feed_info in feeds_to_create:
            feed_url = feed_info['url']

            try:
                result = create_feed_from_metadata(
                    feed_url=feed_url,
                    fuente_url_id=url_id,
                    categoria_id=categoria_id,
                    pais_id=pais_id,
                    idioma=idioma,
                    auto_approve=auto_approve,
                    context_title=feed_info.get('context_label')
                )

                if result['status'] == 'created':
                    stats['created'] += 1
                elif result['status'] == 'pending':
                    stats['pending'] += 1
                elif result['status'] == 'existing':
                    stats['existing'] += 1
                else:
                    stats['errors'] += 1

            except Exception as e:
                logger.error(f"Error creating feed {feed_url}: {e}")
                stats['errors'] += 1

        # Update URL status: success if anything new was created or queued,
        # 'existing' if everything was already known, 'error' otherwise.
        if stats['created'] > 0 or stats['pending'] > 0:
            parts = []
            if stats['created'] > 0:
                parts.append(f"{stats['created']} creados")
            if stats['pending'] > 0:
                parts.append(f"{stats['pending']} pendientes de revisión")
            if stats['existing'] > 0:
                parts.append(f"{stats['existing']} ya existían")

            message = ", ".join(parts)
            update_url_status(url_id, 'success', message, 200)
            stats['status'] = 'success'
        elif stats['existing'] > 0:
            message = f"Todos los {stats['existing']} feeds ya existían"
            update_url_status(url_id, 'existing', message, 200)
            stats['status'] = 'existing'
        else:
            message = f"No se pudieron procesar feeds ({stats['errors']} errores)"
            update_url_status(url_id, 'error', message)
            stats['status'] = 'error'

        logger.info(f"Processed {source_url}: {stats['created']} created, {stats['pending']} pending, {stats['existing']} existing, {stats['errors']} errors")

    except Exception as e:
        logger.error(f"Error processing URL {source_url}: {e}")
        update_url_status(url_id, 'error', str(e)[:200])
        stats['status'] = 'error'
        stats['errors'] += 1

    return stats
|
||||
|
||||
|
||||
def process_batch():
    """Run one discovery cycle: fetch a batch of pending URL sources,
    process each, and log aggregate statistics."""
    logger.info("=" * 80)
    logger.info(f"Starting URL discovery batch - {datetime.now().isoformat()}")

    pending = get_pending_urls(limit=BATCH_SIZE)
    if not pending:
        logger.info("No pending URLs to process")
        return

    logger.info(f"Processing {len(pending)} URL sources")

    # Aggregate counters across all sources in this batch.
    counter_keys = ('discovered', 'created', 'pending', 'existing', 'errors')
    totals = dict.fromkeys(('processed',) + counter_keys, 0)

    for url_data in pending:
        source_stats = process_url_source(url_data)
        totals['processed'] += 1
        for key in counter_keys:
            totals[key] += source_stats[key]
        # Small delay between URLs to avoid hammering servers
        time.sleep(2)

    # Summary banner
    logger.info("-" * 80)
    logger.info(f"Batch complete:")
    logger.info(f"  - Processed: {totals['processed']} URLs")
    logger.info(f"  - Discovered: {totals['discovered']} feeds")
    logger.info(f"  - Created: {totals['created']} new feeds")
    logger.info(f"  - Pending review: {totals['pending']} feeds")
    logger.info(f"  - Already existing: {totals['existing']} feeds")
    logger.info(f"  - Errors: {totals['errors']}")
    logger.info("=" * 80)
|
||||
|
||||
|
||||
def main():
    """Worker entry point: run one batch immediately, then loop forever on a
    fixed interval. Ctrl-C stops cleanly; other errors back off 60s."""
    logger.info("URL Feed Discovery Worker started")
    logger.info(f"Check interval: {CHECK_INTERVAL} seconds ({CHECK_INTERVAL // 60} minutes)")
    logger.info(f"Batch size: {BATCH_SIZE}")
    logger.info(f"Max feeds per URL: {MAX_FEEDS_PER_URL}")

    # Run immediately on start so a fresh deploy doesn't wait a full interval.
    try:
        process_batch()
    except Exception as exc:
        logger.error(f"Error in initial batch: {exc}", exc_info=True)

    # Main loop
    while True:
        try:
            logger.info(f"Waiting {CHECK_INTERVAL} seconds until next batch...")
            time.sleep(CHECK_INTERVAL)
            process_batch()
        except KeyboardInterrupt:
            logger.info("Worker stopped by user")
            break
        except Exception as exc:
            logger.error(f"Error in main loop: {exc}", exc_info=True)
            # Wait a bit before retrying to avoid rapid failure loops
            time.sleep(60)


if __name__ == "__main__":
    main()
|
||||
|
|
@ -1,125 +0,0 @@
|
|||
import logging
|
||||
import hashlib
|
||||
from datetime import datetime
|
||||
from newspaper import Article, ArticleException, Config
|
||||
import requests
|
||||
from db import get_write_conn, get_read_conn
|
||||
|
||||
# Configuration
|
||||
logger = logging.getLogger("url_worker")
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
||||
|
||||
def get_active_urls():
    """Return all active URL sources as raw cursor tuples:
    (id, nombre, url, categoria_id, pais_id, idioma)."""
    query = """
                SELECT id, nombre, url, categoria_id, pais_id, idioma
                FROM fuentes_url
                WHERE active = true
            """
    with get_read_conn() as conn, conn.cursor() as cur:
        cur.execute(query)
        rows = cur.fetchall()
    return rows
|
||||
|
||||
def update_source_status(source_id, status, message, http_code=0):
    """Record the result of processing a URL source: timestamp, status code,
    human-readable message, and the HTTP status observed (0 = no response)."""
    sql = """
                UPDATE fuentes_url
                SET last_check = NOW(),
                    last_status = %s,
                    status_message = %s,
                    last_http_code = %s
                WHERE id = %s
            """
    with get_write_conn() as conn, conn.cursor() as cur:
        cur.execute(sql, (status, message, http_code, source_id))
        conn.commit()
|
||||
|
||||
def save_article(source, article):
    """Save the extracted article to the database.

    The row id is the MD5 of the final URL, which deduplicates articles.
    Returns True only when a new row was actually inserted, False when the
    article already existed (pre-check or concurrent insert).
    """
    source_id, source_name, source_url, cat_id, pais_id, lang = source

    # Use the article url if possible, otherwise source_url
    final_url = article.url or source_url
    noticia_id = hashlib.md5(final_url.encode("utf-8")).hexdigest()

    with get_write_conn() as conn:
        with conn.cursor() as cur:
            # Check if exists (cheap fast-path before the insert)
            cur.execute("SELECT id FROM noticias WHERE id = %s", (noticia_id,))
            if cur.fetchone():
                return False  # Already exists

            # Prepare data
            title = article.title or "Sin título"
            # FIX: newspaper can leave article.text as None when parsing
            # partially fails; slicing None raises TypeError.
            summary = article.summary or (article.text or "")[:500]
            image_url = article.top_image
            pub_date = article.publish_date or datetime.utcnow()

            cur.execute("""
                INSERT INTO noticias (
                    id, titulo, resumen, url, fecha, imagen_url,
                    fuente_nombre, categoria_id, pais_id
                ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
                ON CONFLICT (id) DO NOTHING
            """, (
                noticia_id, title, summary, final_url, pub_date, image_url,
                source_name, cat_id, pais_id
            ))
            # FIX: previously this returned True unconditionally, even when a
            # concurrent insert made ON CONFLICT skip the row. rowcount is 0
            # in that case.
            inserted = cur.rowcount > 0
            conn.commit()
            return inserted
|
||||
|
||||
def process_url(source):
    """Download, parse, and persist a single URL source as a news article.

    *source* is a (id, nombre, url, categoria_id, pais_id, idioma) tuple.
    Every outcome (success or any failure mode) is written back to the
    source row via update_source_status().
    """
    source_id, name, url, _, _, _ = source

    logger.info(f"Processing URL: {url} ({name})")

    try:
        # Browser-like headers so sites that block default UAs still respond
        user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        config = Config()
        config.browser_user_agent = user_agent
        config.request_timeout = 30

        article = Article(url, config=config, language='es')
        article.download()

        if not article.html:
            update_source_status(source_id, "ERROR", "No content downloaded (Empty HTML)", 0)
            return

        article.parse()
        try:
            # NLP (summary/keywords) is best-effort only.
            article.nlp()
        except Exception:
            # FIX: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit.
            pass

        if not article.title:
            update_source_status(source_id, "ERROR_PARSE", "Could not extract title (Page might be not an article)", 200)
            return

        saved = save_article(source, article)

        status_msg = "News created successfully" if saved else "News already exists"
        update_source_status(source_id, "OK", status_msg, 200)
        logger.info(f"Success {url}: {status_msg}")

    except ArticleException as article_err:
        logger.error(f"Newspaper Error {url}: {article_err}")
        update_source_status(source_id, "ERROR_DOWNLOAD", str(article_err)[:200], 0)
    except requests.exceptions.RequestException as net_err:
        # FIX: alias was `re`, shadowing the `re` module name.
        logger.error(f"Network Error {url}: {net_err}")
        update_source_status(source_id, "ERROR_NETWORK", str(net_err)[:200], 0)
    except Exception as e:
        logger.error(f"Unexpected Error {url}: {e}")
        update_source_status(source_id, "ERROR_UNKNOWN", str(e)[:200], 500)
|
||||
|
||||
def main():
    """Single-pass entry point: process every active URL source once."""
    logger.info("Starting URL Worker")
    active_sources = get_active_urls()
    logger.info(f"Found {len(active_sources)} active URLs")
    for source in active_sources:
        process_url(source)


if __name__ == "__main__":
    main()
|
||||
|
|
@ -1,31 +0,0 @@
|
|||
import time
|
||||
import logging
|
||||
import sys
|
||||
from workers.url_worker import main as run_once
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
||||
stream=sys.stdout
|
||||
)
|
||||
logger = logging.getLogger("url_worker_daemon")
|
||||
|
||||
INTERVAL = 300 # 5 minutes
|
||||
|
||||
def main():
    """Daemon entry point: run url_worker cycles forever, sleeping INTERVAL
    seconds between runs. Exceptions are logged and never kill the loop."""
    logger.info("Starting URL Worker Daemon")
    logger.info(f"Check interval: {INTERVAL} seconds")

    def one_cycle():
        # A single job cycle, wrapped so the error boundary stays tiny.
        logger.info("Running job cycle...")
        run_once()
        logger.info("Cycle completed.")

    while True:
        try:
            one_cycle()
        except Exception as exc:
            logger.exception(f"Error in job cycle: {exc}")

        time.sleep(INTERVAL)


if __name__ == "__main__":
    main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue