go integration and wikipedia

This commit is contained in:
jlimolina 2026-03-28 18:30:07 +01:00
parent 47a252e339
commit ee90335b92
7828 changed files with 1307913 additions and 20807 deletions

View file

@ -7,19 +7,15 @@ from typing import List, Optional
import psycopg2
import psycopg2.extras
from psycopg2.extras import execute_values
import ctranslate2
from transformers import AutoTokenizer
from langdetect import detect, DetectorFactory
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
# NOTE(review): from here down this file appears to be a mangled diff view —
# old and new versions of the module are interleaved and raw hunk headers
# ("@ -28,9 ...") survive in the text. Recover the real file from version
# control before editing; lines are preserved verbatim below.
DetectorFactory.seed = 0
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s: %(message)s")
LOG = logging.getLogger("translator")
# =========================
# DB CONFIG
# =========================
# PostgreSQL connection settings, each overridable via environment variables.
DB_CONFIG = {
"host": os.environ.get("DB_HOST", "localhost"),
"port": int(os.environ.get("DB_PORT", 5432)),
@ -28,9 +24,6 @@ DB_CONFIG = {
"password": os.environ.get("DB_PASS", "x"),
}
# =========================
# ENV HELPERS
# =========================
# NOTE(review): _env_list is truncated by the hunk header below; _env_int and
# _env_float are referenced later but are not visible in this view.
def _env_list(name: str, default="es"):
raw = os.environ.get(name)
if raw:
@ -55,37 +48,20 @@ def _env_str(name: str, default=None):
v = os.environ.get(name)
return v if v else default
# =========================
# CONFIG
# =========================
# NOTE(review): duplicate TARGET_LANGS assignments are the two diff sides.
TARGET_LANGS = _env_list("TARGET_LANGS") # defaults to ["es"]
TARGET_LANGS = _env_list("TARGET_LANGS")
BATCH_SIZE = _env_int("TRANSLATOR_BATCH", 8)
ENQUEUE_MAX = _env_int("ENQUEUE", 200)
SLEEP_IDLE = _env_float("TRANSLATOR_SLEEP_IDLE", 5.0)
# CTranslate2 Configuration
CT2_MODEL_PATH = _env_str("CT2_MODEL_PATH", "./models/nllb-ct2")
CT2_DEVICE = _env_str("CT2_DEVICE", "auto") # auto, cpu, cuda
CT2_COMPUTE_TYPE = _env_str("CT2_COMPUTE_TYPE", "auto") # auto, int8, float16, int8_float16
MAX_SRC_TOKENS = _env_int("MAX_SRC_TOKENS", 512)
MAX_NEW_TOKENS_TITLE = _env_int("MAX_NEW_TOKENS_TITLE", 96)
MAX_NEW_TOKENS_BODY = _env_int("MAX_NEW_TOKENS_BODY", 512)
NUM_BEAMS_TITLE = _env_int("NUM_BEAMS_TITLE", 2)
NUM_BEAMS_BODY = _env_int("NUM_BEAMS_BODY", 2)
# HuggingFace model name (used for tokenizer)
UNIVERSAL_MODEL = _env_str("UNIVERSAL_MODEL", "facebook/nllb-200-distilled-600M")
IDENTITY_PAISES_ES = _env_int("IDENTITY_PAISES_ES", 58)
BODY_CHARS_CHUNK = _env_int("BODY_CHARS_CHUNK", 900)
# =========================
# LANG MAP
# =========================
# NOTE(review): two dict headers (NLLB_LANG and LANG_CODE_MAP) share one
# closing brace — one of them is residue from the other diff side.
NLLB_LANG = {
"es": "spa_Latn", "en": "eng_Latn", "fr": "fra_Latn", "de": "deu_Latn",
LANG_CODE_MAP = {
"en": "eng_Latn", "es": "spa_Latn", "fr": "fra_Latn", "de": "deu_Latn",
"it": "ita_Latn", "pt": "por_Latn", "nl": "nld_Latn", "sv": "swe_Latn",
"da": "dan_Latn", "fi": "fin_Latn", "no": "nob_Latn",
"pl": "pol_Latn", "cs": "ces_Latn", "sk": "slk_Latn",
@ -96,286 +72,74 @@ NLLB_LANG = {
"ko": "kor_Hang", "vi": "vie_Latn",
}
def map_to_nllb(code: Optional[str]):
    """Map an ISO-639-1 language code to its NLLB tag.

    Returns None for empty input; unknown codes fall back to the
    ``<code>_Latn`` convention.
    """
    if not code:
        return None
    normalized = code.strip().lower()
    return NLLB_LANG.get(normalized, f"{normalized}_Latn")
# Lazily-initialized singletons for the transformers-pipeline backend;
# populated on first use (see get_translator_components in the region below).
_tokenizer = None
_translator = None
_device = None
def normalize_lang(code: Optional[str], default=None):
    """Lower-case and strip a language code; return *default* unchanged when
    the input is empty or None."""
    if not code:
        return default
    return code.strip().lower()
def _norm(s: str) -> str:
return re.sub(r"\W+", "", (s or "").lower()).strip()
# NOTE(review): the two definitions below are interleaved line-by-line (diff
# residue): _is_repetitive_output (a translation-quality heuristic) and
# get_translator_components (the transformers pipeline loader). Neither
# parses as written; lines are preserved verbatim.
def _is_repetitive_output(text: str, threshold: float = 0.25) -> bool:
"""Detect if translation output is repetitive/low quality.
def get_translator_components():
global _tokenizer, _translator, _device
Args:
text: The translated text to check
threshold: Minimum unique word ratio (default 0.25 = 25% unique words)
if _translator:
return _tokenizer, _translator
Returns:
True if text appears to be repetitive/low quality
"""
if not text or len(text) < 50:
return False
device = 0 if torch.cuda.is_available() else -1
LOG.info(f"Loading model {UNIVERSAL_MODEL} on {'cuda' if device == 0 else 'cpu'}")
# Check for obvious repetitive patterns
repetitive_patterns = [
r'(\b\w+\b)( \1){3,}', # Same word repeated 4+ times
r'(\b\w+ \w+\b)( \1){2,}', # Same 2-word phrase repeated 3+ times
r'de la la ',
r'la línea de la línea',
r'de Internet de Internet',
]
_tokenizer = AutoTokenizer.from_pretrained(UNIVERSAL_MODEL, src_lang="eng_Latn")
model = AutoModelForSeq2SeqLM.from_pretrained(UNIVERSAL_MODEL)
for pattern in repetitive_patterns:
if re.search(pattern, text, re.IGNORECASE):
LOG.warning(f"Detected repetitive pattern: {pattern}")
return True
if device == 0:
model = model.to("cuda")
# Check word diversity
words = text.lower().split()
if len(words) < 10:
return False
unique_ratio = len(set(words)) / len(words)
if unique_ratio < threshold:
LOG.warning(f"Low word diversity: {unique_ratio:.2%} (threshold: {threshold:.2%})")
return True
return False
# =========================
# DB
# =========================
def get_conn():
    """Open a fresh PostgreSQL connection using the module-level DB_CONFIG."""
    connection = psycopg2.connect(**DB_CONFIG)
    return connection
def ensure_indexes(conn):
"""Create the (lang_to, status) and status indexes used by the pending-batch
queries; IF NOT EXISTS makes this idempotent on every worker start."""
with conn.cursor() as cur:
cur.execute("CREATE INDEX IF NOT EXISTS traducciones_lang_to_status_idx ON traducciones (lang_to, status);")
cur.execute("CREATE INDEX IF NOT EXISTS traducciones_status_idx ON traducciones (status);")
conn.commit()
# NOTE(review): the two stray "pass" placeholders below are diff residue left
# behind when neighboring functions were moved to translation_ops.py.
pass # Moved to translation_ops.py
pass # Moved to translation_ops.py
def fetch_pending_batch(conn, lang_to: str, batch: int):
"""Fetch pending translations with row locking to support multiple workers.

Returns up to *batch* DictRows joined with their noticias; rows are flipped
to status='processing' inside the same transaction so concurrent workers
cannot pick them up twice. Returns [] for a non-positive batch size.
"""
if batch <= 0:
return []
# Use FOR UPDATE SKIP LOCKED to allow multiple workers
# Each worker will get different rows without conflicts
with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
cur.execute(
"""
SELECT t.id AS tr_id, t.noticia_id, t.lang_from, t.lang_to,
n.titulo, n.resumen
FROM traducciones t
JOIN noticias n ON n.id=t.noticia_id
WHERE t.lang_to=%s AND t.status='pending'
ORDER BY t.id
LIMIT %s
FOR UPDATE OF t SKIP LOCKED;
""",
(lang_to, batch),
)
rows = cur.fetchall()
# Update status within the same transaction while rows are locked
if rows:
ids = [r["tr_id"] for r in rows]
cur.execute("UPDATE traducciones SET status='processing' WHERE id = ANY(%s)", (ids,))
# Commit releases the row locks; the claimed rows are now 'processing'.
conn.commit()
return rows
# =========================
# LANGUAGE DETECTION
# =========================
def detect_lang(text1: str, text2: str):
    """Best-effort language detection over the first non-empty of two texts.

    Returns an ISO-639-1 code from langdetect, or None when both inputs are
    blank or detection raises.
    """
    sample = (text1 or "").strip()
    if not sample:
        sample = (text2 or "").strip()
    if not sample:
        return None
    try:
        return detect(sample)
    except Exception:
        return None
# =========================
# MODEL LOADING (CTranslate2)
# =========================
# Lazily-initialized singletons for the CTranslate2 backend; populated on
# first call to get_universal_components.
_TOKENIZER = None
_TRANSLATOR = None
_DEVICE = None
def _resolve_device():
    """Resolve the CTranslate2 execution device from CT2_DEVICE.

    'cpu' and 'cuda' are honored as-is; anything else ('auto') probes for
    CUDA devices and falls back to CPU.
    """
    if CT2_DEVICE in ("cpu", "cuda"):
        return CT2_DEVICE
    has_gpu = ctranslate2.get_cuda_device_count() > 0
    return "cuda" if has_gpu else "cpu"
def _ensure_ct2_model():
    """Convert the HuggingFace model to CTranslate2 format if not already on
    disk.

    Returns True when a usable model exists at CT2_MODEL_PATH (either found
    or freshly converted), False on any conversion failure or timeout.
    """
    import os
    import subprocess

    model_dir = CT2_MODEL_PATH
    # Fast path: a previous run already produced model.bin.
    if os.path.isdir(model_dir) and os.path.exists(os.path.join(model_dir, "model.bin")):
        LOG.info("CTranslate2 model already exists at %s", model_dir)
        return True

    LOG.info("CTranslate2 model not found, converting from %s...", UNIVERSAL_MODEL)
    LOG.info("This may take 5-10 minutes on first run...")
    os.makedirs(os.path.dirname(model_dir) or ".", exist_ok=True)

    # 'auto' compute type maps to the converter's int8_float16 quantization.
    quantization = "int8_float16" if CT2_COMPUTE_TYPE == "auto" else CT2_COMPUTE_TYPE
    cmd = [
        "ct2-transformers-converter",
        "--model", UNIVERSAL_MODEL,
        "--output_dir", model_dir,
        "--quantization", quantization,
        "--force",
    ]
    try:
        LOG.info("Running: %s", " ".join(cmd))
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=1800)
    except subprocess.TimeoutExpired:
        LOG.error("Model conversion timed out after 30 minutes")
        return False
    except Exception as e:
        LOG.error("Model conversion error: %s", e)
        return False
    if result.returncode != 0:
        LOG.error("Model conversion failed: %s", result.stderr)
        return False
    LOG.info("Model conversion completed successfully")
    return True
# NOTE(review): diff residue — the CTranslate2 loader (get_universal_components)
# is interleaved with fragments of a transformers-pipeline loader
# (_translator = pipeline(...)); the region does not parse as written.
def get_universal_components():
global _TOKENIZER, _TRANSLATOR, _DEVICE
if _TRANSLATOR:
return _TOKENIZER, _TRANSLATOR
# Ensure CT2 model exists (convert if needed)
if not _ensure_ct2_model():
raise RuntimeError(f"Failed to load/convert CTranslate2 model at {CT2_MODEL_PATH}")
device = _resolve_device()
LOG.info("Loading CTranslate2 model from %s on %s", CT2_MODEL_PATH, device)
_TRANSLATOR = ctranslate2.Translator(
CT2_MODEL_PATH,
_translator = pipeline(
"translation",
model=model,
tokenizer=_tokenizer,
device=device,
compute_type=CT2_COMPUTE_TYPE,
max_length=MAX_SRC_TOKENS,
)
_TOKENIZER = AutoTokenizer.from_pretrained(UNIVERSAL_MODEL)
_DEVICE = device
LOG.info("CTranslate2 model loaded successfully")
return _TOKENIZER, _TRANSLATOR
_device = "cuda" if device == 0 else "cpu"
LOG.info(f"Model loaded on {_device}")
return _tokenizer, _translator
# =========================
# TRANSLATION (CTranslate2)
# =========================
def _safe_src_len(tokenizer):
    """Return a safe source-token budget: MAX_SRC_TOKENS capped 16 tokens
    below the tokenizer's own limit (absurdly large reported limits — a
    common HF sentinel — are clamped to 1024)."""
    reported = getattr(tokenizer, "model_max_length", 1024) or 1024
    if reported > 100000:
        reported = 1024
    return min(MAX_SRC_TOKENS, reported - 16)
# NOTE(review): diff residue — _translate_texts (CTranslate2 batch path) is
# interleaved with translate_texts (pipeline path), the two chunk-splitting
# helpers are merged, and a raw hunk header survives mid-function. Nothing in
# this region parses as written; lines preserved verbatim.
def _translate_texts(src, tgt, texts, beams, max_new_tokens):
"""Translate texts using CTranslate2."""
def translate_texts(src: str, tgt: str, texts: List[str]) -> List[str]:
if not texts:
return []
clean = [(t or "").strip() for t in texts]
if all(not t for t in clean):
return ["" for _ in clean]
tok, translator = get_translator_components()
src_code = LANG_CODE_MAP.get(src, f"{src}_Latn")
tgt_code = LANG_CODE_MAP.get(tgt, "spa_Latn")
results = []
for text in clean:
if not text:
results.append("")
continue
try:
result = translator(text, src_lang=src_code, tgt_lang=tgt_code)
results.append(result[0]["translation_text"])
except Exception as e:
LOG.warning(f"Translation error: {e}")
results.append(text)
return results
tok, translator = get_universal_components()
src_code = map_to_nllb(src)
tgt_code = map_to_nllb(tgt)
# Set source language on tokenizer
try:
tok.src_lang = src_code
except Exception:
pass
safe_len = _safe_src_len(tok)
max_new = max(16, min(int(max_new_tokens), MAX_NEW_TOKENS_BODY))
# Tokenize: convert text to tokens
sources = []
for t in clean:
if t:
ids = tok.encode(t, truncation=True, max_length=safe_len)
tokens = tok.convert_ids_to_tokens(ids)
sources.append(tokens)
else:
sources.append([])
# Target language prefix for NLLB
target_prefix = [[tgt_code]] * len(sources)
# Translate with CTranslate2
start = time.time()
results = translator.translate_batch(
sources,
target_prefix=target_prefix,
beam_size=beams,
max_decoding_length=max_new,
repetition_penalty=2.5, # Increased from 1.2 to prevent loops
no_repeat_ngram_size=3, # Prevent 3-gram repetition
)
dt = time.time() - start
# Decode results
translated = []
total_tokens = 0
for result, src_tokens in zip(results, sources):
if result.hypotheses:
# Skip the first token (language prefix)
tokens = result.hypotheses[0][1:]
total_tokens += len(tokens) + len(src_tokens)
text = tok.decode(tok.convert_tokens_to_ids(tokens))
translated.append(text.strip())
else:
translated.append("")
if total_tokens > 0:
LOG.info(" → tokens=%d tiempo=%.2fs velocidad=%d tok/s",
total_tokens, dt, int(total_tokens / dt) if dt > 0 else 0)
return translated
def _split_body_into_chunks(text: str) -> List[str]:
def split_body_into_chunks(text: str) -> List[str]:
text = (text or "").strip()
if len(text) <= BODY_CHARS_CHUNK:
return [text] if text else []
parts = re.split(r'(\n\n+|(?<=[\.\!\?؛؟。])\s+)', text)
chunks = []
current = ""
for part in parts:
if not part:
continue
@ -387,260 +151,145 @@ def _split_body_into_chunks(text: str) -> List[str]:
current = part
if current.strip():
chunks.append(current.strip())
if not chunks:
return [text]
return chunks
return chunks if chunks else [text]
def translate_body_long(src: str, tgt: str, body: str) -> str:
body = (body or "").strip()
if not body:
return ""
chunks = _split_body_into_chunks(body)
chunks = split_body_into_chunks(body)
if len(chunks) == 1:
translated = _translate_texts(src, tgt, [body], NUM_BEAMS_BODY, MAX_NEW_TOKENS_BODY)[0]
return translated.strip()
return translate_texts(src, tgt, [body])[0].strip()
translated_chunks = []
for ch in chunks:
tr = _translate_texts(src, tgt, [ch], NUM_BEAMS_BODY, MAX_NEW_TOKENS_BODY)[0]
translated_chunks.append(tr.strip())
return "\n\n".join(c for c in translated_chunks if c)
tr = translate_texts(src, tgt, [ch])[0]
translated_chunks.append(tr)
return " ".join(translated_chunks)
def normalize_lang(lang: Optional[str], default: str = "es") -> Optional[str]:
    """Normalize to a two-letter lower-case language code.

    Empty or None input yields *default* (unchanged).
    """
    if not lang:
        return default
    code = lang.strip().lower()[:2]
    return code if code else default
def detect_lang(text: str) -> str:
    """Detect the language of *text*, defaulting to English for empty, very
    short (< 10 chars), or undetectable input."""
    if text and len(text) >= 10:
        try:
            return detect(text)
        except Exception:
            pass
    return "en"
# =========================
# BATCH PROCESS
# =========================
# NOTE(review): diff residue — two versions of process_batch (CTranslate2 vs
# pipeline) are interleaved here, and further down process_entity_summaries is
# interleaved with the tail of the pipeline-based process_batch. The region
# does not parse as written; lines preserved verbatim.
def process_batch(conn, rows):
todo = []
done = []
errors = []
for r in rows:
lang_to = normalize_lang(r["lang_to"], "es") or "es"
lang_from = (
normalize_lang(r["lang_from"])
or detect_lang(r["titulo"], r["resumen"])
or "en"
)
titulo = (r["titulo"] or "").strip()
resumen = (r["resumen"] or "").strip()
if map_to_nllb(lang_from) == map_to_nllb(lang_to):
done.append((titulo, resumen, lang_from, r["tr_id"]))
else:
todo.append({
"tr_id": r["tr_id"],
"lang_from": lang_from,
"lang_to": lang_to,
"titulo": titulo,
"resumen": resumen,
})
# NOTE(review): the lines below re-derive the same fields via r.get(...) —
# they are the other diff side of this loop body.
lang_to = normalize_lang(r.get("lang_to"), "es") or "es"
lang_from = normalize_lang(r.get("lang_from")) or detect_lang(r.get("titulo") or "")
titulo = (r.get("titulo") or "").strip()
resumen = (r.get("resumen") or "").strip()
if lang_from == lang_to:
continue
todo.append({
"tr_id": r.get("tr_id"),
"lang_from": lang_from,
"lang_to": lang_to,
"titulo": titulo,
"resumen": resumen,
})
if not todo:
return
from collections import defaultdict
groups = defaultdict(list)
for item in todo:
key = (item["lang_from"], item["lang_to"])
groups[key].append(item)
for (lang_from, lang_to), items in groups.items():
LOG.info("Translating %s -> %s (%d items)", lang_from, lang_to, len(items))
LOG.info(f"Translating {lang_from} -> {lang_to} ({len(items)} items)")
titles = [i["titulo"] for i in items]
try:
tt = _translate_texts(
lang_from,
lang_to,
titles,
NUM_BEAMS_TITLE,
MAX_NEW_TOKENS_TITLE,
)
bodies_translated: List[str] = []
for i in items:
bodies_translated.append(
translate_body_long(lang_from, lang_to, i["resumen"])
)
for i, ttr, btr in zip(items, tt, bodies_translated):
ttr = (ttr or "").strip()
btr = (btr or "").strip()
if not ttr or _norm(ttr) == _norm(i["titulo"]):
ttr = i["titulo"]
if not btr or _norm(btr) == _norm(i["resumen"]):
btr = i["resumen"]
# CLEANING: Remove <unk> tokens
if ttr:
ttr = ttr.replace("<unk>", "").replace(" ", " ").strip()
if btr:
btr = btr.replace("<unk>", "").replace(" ", " ").strip()
# VALIDATION: Check for repetitive output
if _is_repetitive_output(ttr) or _is_repetitive_output(btr):
LOG.warning(f"Rejecting repetitive translation for tr_id={i['tr_id']}")
errors.append(("Repetitive output detected", i["tr_id"]))
continue
done.append((ttr, btr, lang_from, i["tr_id"]))
except Exception as e:
err = str(e)[:800]
LOG.exception("Error translating %s -> %s: %s", lang_from, lang_to, err)
for i in items:
errors.append((err, i["tr_id"]))
with conn.cursor() as cur:
if done:
execute_values(
cur,
"""
UPDATE traducciones AS t
SET titulo_trad=v.titulo_trad,
resumen_trad=v.resumen_trad,
lang_from=COALESCE(t.lang_from, v.lang_from),
status='done',
error=NULL
FROM (VALUES %s) AS v(titulo_trad,resumen_trad,lang_from,id)
WHERE t.id=v.id;
""",
done,
)
# --- NEW: Persist stats ---
# Insert a record for each translated item into translation_stats
# We need the language 'lang_to'. In this batch, lang_to is uniform for the group usually,
# but let's extract it from the 'done' items structure if we had it, or pass it down.
# In process_batch, we iterate groups.
# 'done' list here is flattened from multiple groups?
# process_batch logic:
# 1. 'done' checks map_to_nllb identity (already done?) -> these have lang_to from row?
# 2. 'groups' loop -> translates -> appends to 'done' with lang_from.
#
# Wait, 'done' list doesn't have lang_to in the tuple: (titulo, resumen, lang_from, tr_id).
# We need to change the 'done' collection to include lang_to OR we insert based on tr_id.
# Let's verify process_batch logic.
# rows has all info.
# define a mapping tr_id -> lang_to
tr_map = {r["tr_id"]: r["lang_to"] for r in rows}
stats_data = []
for item in done:
# item is (titulo, resumen, lang_from, tr_id)
lang_from = item[2]
lang_to = tr_map.get(item[3], "es")
stats_data.append((lang_from, lang_to))
execute_values(
cur,
"INSERT INTO translation_stats (lang_from, lang_to) VALUES %s",
stats_data
)
# --------------------------
if errors:
execute_values(
cur,
"""
UPDATE traducciones AS t
SET status='error', error=v.error
FROM (VALUES %s) AS v(error,id)
WHERE t.id=v.id;
""",
errors,
)
conn.commit()
def process_entity_summaries(conn):
"""Translate pending entity summaries from Wikipedia."""
from cache import cache_del
LOG.info("DEBUG: Checking for pending entity summaries...")
with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
cur.execute("""
SELECT id, entity_name, summary, summary_en
FROM entity_images
WHERE status_es = 'pending'
LIMIT 20
FOR UPDATE SKIP LOCKED;
""")
rows = cur.fetchall()
# NOTE(review): from here, process_entity_summaries is interleaved with the
# tail of the pipeline-based process_batch (items / lang_from / translated_
# titles belong to the other function).
translated_titles = translate_texts(lang_from, lang_to, titles)
if not rows:
return False
LOG.info("DEBUG: Found %d pending entity summaries to process", len(rows))
translated_bodies = []
for i in items:
body = (i["resumen"] or "").strip()
if body:
tr = translate_body_long(lang_from, lang_to, body)
translated_bodies.append(tr)
else:
translated_bodies.append("")
for r in rows:
entity_id = r["id"]
name = r["entity_name"]
text = r["summary_en"] or r["summary"]
cursor = conn.cursor()
for item, tt, tb in zip(items, translated_titles, translated_bodies):
tt = (tt or "").strip()
tb = (tb or "").strip()
if not tt:
tt = item["titulo"]
if not tb:
tb = item["resumen"]
if not text:
cur.execute("UPDATE entity_images SET status_es = 'none' WHERE id = %s", (entity_id,))
continue
try:
# English -> Spanish
translated = translate_body_long('en', 'es', text)
if translated:
cur.execute("""
UPDATE entity_images
SET summary_es = %s, status_es = 'done'
WHERE id = %s
""", (translated, entity_id))
# Invalidate cache
cache_del(f"wiki:data:{name.lower()}")
LOG.info(" → Translated entity summary: %s", name)
else:
cur.execute("UPDATE entity_images SET status_es = 'error' WHERE id = %s", (entity_id,))
cursor.execute("""
UPDATE traducciones
SET titulo_trad = %s, resumen_trad = %s, lang_to = %s
WHERE id = %s
""", (tt, tb, lang_to, item["tr_id"]))
except Exception as e:
LOG.error("Error translating entity summary [%s]: %s", name, e)
cur.execute("UPDATE entity_images SET status_es = 'error' WHERE id = %s", (entity_id,))
LOG.error(f"Update error: {e}")
conn.commit()
return True
cursor.close()
LOG.info(f"Translated {len(items)} items")
def fetch_pending_translations(conn):
"""For each configured target language, fetch up to BATCH_SIZE rows whose
title or summary translation is still missing (newest first) and hand them
to process_batch.

NOTE(review): unlike fetch_pending_batch, this query takes no row locks and
does not mark rows 'processing' — presumably intended for a single-worker
deployment; confirm before running multiple workers.
"""
cursor = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
for lang in TARGET_LANGS:
cursor.execute("""
SELECT t.id as tr_id, t.lang_from, t.lang_to,
n.titulo, n.resumen, n.id as noticia_id
FROM traducciones t
JOIN noticias n ON n.id = t.noticia_id
WHERE t.lang_to = %s
AND (t.titulo_trad IS NULL OR t.resumen_trad IS NULL)
ORDER BY n.fecha DESC
LIMIT %s
""", (lang, BATCH_SIZE))
rows = cursor.fetchall()
if rows:
LOG.info(f"Found {len(rows)} pending translations for {lang}")
process_batch(conn, rows)
cursor.close()
def connect_db():
    """Create a new database connection from the module-level DB_CONFIG."""
    db_connection = psycopg2.connect(**DB_CONFIG)
    return db_connection
# =========================
# MAIN LOOP
# =========================
# NOTE(review): diff residue — two versions of the worker main loop are
# interleaved here: the CTranslate2 loop (get_universal_components +
# fetch_pending_batch) and the pipeline loop (connect_db +
# fetch_pending_translations). The region does not parse as written.
def main():
LOG.info("Translator worker iniciado (CTranslate2)")
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
get_universal_components()
LOG.info("Translation worker started (transformers)")
get_translator_components()
while True:
any_work = False
with get_conn() as conn:
ensure_indexes(conn)
# 1. Process entity summaries (Wikipedia) -> REMOVED per user request
# Logic moved out to keep translator focused on news ONLY.
# try:
# if process_entity_summaries(conn):
# any_work = True
# except Exception as e:
# LOG.error("Error in process_entity_summaries: %s", e)
# 2. Process news translations
for tgt in TARGET_LANGS:
while True:
rows = fetch_pending_batch(conn, tgt, BATCH_SIZE)
if not rows:
break
any_work = True
LOG.info("[%s] %d elementos", tgt, len(rows))
process_batch(conn, rows)
if not any_work:
time.sleep(SLEEP_IDLE)
try:
conn = connect_db()
fetch_pending_translations(conn)
conn.close()
except Exception as e:
LOG.error(f"Error: {e}")
time.sleep(30)
if __name__ == "__main__":
main()