coconews/workers/simple_translator_worker.py
SITO d9ea78b8a7 fix: revision completa de rutas Docker, logica SQL y configuracion
Backend Go:
- backend/cmd/server/main.go: ruta wiki_images configurable via WIKI_IMAGES_PATH
- backend/cmd/wiki_worker/main.go: default /opt/rss2 en lugar de /app, leer env
- workers/ctranslator_worker.py: default CT2_MODEL_PATH /opt/rss2 en lugar de /app
- workers/llm_categorizer_worker.py: default LLM_MODEL_PATH /opt/rss2
- workers/{langdetect,simple_translator,translation_scheduler}.py: DB_HOST default 'localhost' en lugar de 'db' (hostname Docker)

SQL / esquema:
- poc/seed.sql: corregir logica de auto-traducciones ES (id LIKE md5() era incorrecto)
- init-db/06-tags.sql: eliminar columna wiki_checked duplicada

Documentacion y configuracion:
- docs/DEPLOY_DEBIAN.md: usar ct2-transformers-converter (lo que usa el worker real)
- deploy/debian/env.example: agregar WIKI_IMAGES_PATH
- deploy/debian/systemd/rss2-cluster.service: agregar HF_HOME faltante
- deploy/debian/install.sh: comparacion numerica correcta de version Go
- scripts/generate_secure_credentials.sh: ruta CT2_MODEL_PATH corregida
- frontend/nginx.conf: advertencia de que es configuracion Docker legacy
- docs/QUICKSTART_LLM.md: nota de deprecacion Docker
- README.md: renombrar backend-go a backend en diagrama

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-31 08:57:01 +02:00

151 lines
5.3 KiB
Python

#!/usr/bin/env python3
"""
Simple Translation Worker using deep-translator
Uses free translation APIs (Google, LibreTranslate, etc.)
"""
import os
import sys
import time
import logging
from datetime import datetime
import psycopg2
from psycopg2.extras import RealDictCursor
from deep_translator import GoogleTranslator, MyMemoryTranslator, single_detection
# Timestamped log lines at INFO level for everything this worker emits.
logging.basicConfig(
    format='[%(asctime)s] %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# PostgreSQL connection settings; every value can be overridden through the
# environment, otherwise the defaults below are used.
DB_CONFIG = {
    'host': os.environ.get('DB_HOST', 'localhost'),
    'port': int(os.environ.get('DB_PORT', 5432)),
    'database': os.environ.get('DB_NAME', 'rss'),
    'user': os.environ.get('DB_USER', 'rss'),
    'password': os.environ.get('DB_PASS', 'rss'),
}

# Only the first entry of the comma-separated TARGET_LANGS list is used.
TARGET_LANG = os.environ.get('TARGET_LANGS', 'es').partition(',')[0].strip()
# Maximum number of news rows fetched per polling cycle.
BATCH_SIZE = int(os.environ.get('TRANSLATOR_BATCH', '32'))
# Seconds to wait when there is nothing to translate.
SLEEP_INTERVAL = int(os.environ.get('TRANSLATOR_SLEEP', '60'))
def get_db_connection():
    """Open and return a new psycopg2 connection built from DB_CONFIG."""
    connection = psycopg2.connect(**DB_CONFIG)
    return connection
def get_pending_translations(conn):
    """Fetch the next batch of news items that still need translating.

    A row qualifies when it has a non-NULL language different from
    TARGET_LANG and no ``traducciones`` row exists for it with
    ``lang_to = TARGET_LANG``.  Rows are ordered by ``created_at``
    descending and capped at BATCH_SIZE.

    Args:
        conn: Open psycopg2 connection.

    Returns:
        The rows produced by a ``RealDictCursor``, each exposing ``id``,
        ``feed_id``, ``lang``, ``titulo`` and ``resumen``.
    """
    query = """
SELECT n.id, n.feed_id, n.lang, n.titulo, n.resumen
FROM noticias n
WHERE NOT EXISTS (
SELECT 1 FROM traducciones t
WHERE t.noticia_id = n.id AND t.lang_to = %s
)
AND n.lang IS NOT NULL
AND n.lang != %s
ORDER BY n.created_at DESC
LIMIT %s
"""
    params = (TARGET_LANG, TARGET_LANG, BATCH_SIZE)
    with conn.cursor(cursor_factory=RealDictCursor) as cursor:
        cursor.execute(query, params)
        rows = cursor.fetchall()
        return rows
def detect_language(text):
    """Detect the language of *text*, falling back to English.

    deep-translator's ``single_detection`` queries the detectlanguage.com
    API and raises when called without an API key, so the key is read from
    the optional ``DETECT_LANGUAGE_API_KEY`` environment variable.

    ``'en'`` is returned when the text is missing or has 10 characters or
    fewer after stripping, when no API key is configured, when the request
    fails, or when the service returns an empty result.

    Args:
        text: Text to analyse; may be ``None`` or empty.

    Returns:
        The language code reported by the detection service, or ``'en'``.
    """
    fallback = 'en'
    if not text or len(text.strip()) <= 10:
        return fallback

    api_key = os.getenv('DETECT_LANGUAGE_API_KEY')
    if not api_key:
        # Without a key the library raises on every call, so skip the attempt.
        return fallback

    try:
        detected = single_detection(text, api_key=api_key)
    except Exception as e:
        # Detection is best-effort: never let it break the translation loop.
        logger.debug(f"Language detection failed: {e}")
        return fallback
    return detected or fallback
def translate_text(text, source_lang, target_lang):
    """Translate *text* with Google Translator, retrying with MyMemory.

    Args:
        text: Text to translate; may be ``None`` or blank.
        source_lang: Language code of *text*.
        target_lang: Language code to translate into.

    Returns:
        An empty string for missing or whitespace-only input.  Otherwise
        the translated text; if the backend that answered produced an empty
        result, or both backends raised, the original *text* is returned
        unchanged.
    """
    if not (text and text.strip()):
        return ""

    try:
        google = GoogleTranslator(source=source_lang, target=target_lang)
        result = google.translate(text)
        return result or text
    except Exception as e:
        logger.warning(f"Google translation failed: {e}")

    # Google raised: give MyMemory a single attempt before giving up.
    try:
        mymemory = MyMemoryTranslator(source=source_lang, target=target_lang)
        result = mymemory.translate(text)
        return result or text
    except Exception as e2:
        logger.error(f"MyMemory translation also failed: {e2}")
        return text
def save_translation(conn, noticia_id, lang_from, titulo, resumen):
    """Translate a news item into TARGET_LANG and store the result.

    The title is translated first, then the summary (an empty summary is
    stored as an empty string without calling the translator).  The row is
    upserted into ``traducciones`` with status ``'done'`` and the
    transaction is committed.

    Args:
        conn: Open psycopg2 connection.
        noticia_id: Identifier of the news row being translated.
        lang_from: Source language code of the item.
        titulo: Original title.
        resumen: Original summary; may be empty or ``None``.
    """
    translated_title = translate_text(titulo, lang_from, TARGET_LANG)
    if resumen:
        translated_summary = translate_text(resumen, lang_from, TARGET_LANG)
    else:
        translated_summary = ""

    statement = """
INSERT INTO traducciones (noticia_id, lang_from, lang_to, titulo_trad, resumen_trad, status, created_at)
VALUES (%s, %s, %s, %s, %s, 'done', NOW())
ON CONFLICT (noticia_id, lang_to) DO UPDATE SET
titulo_trad = EXCLUDED.titulo_trad,
resumen_trad = EXCLUDED.resumen_trad,
status = 'done'
"""
    values = (noticia_id, lang_from, TARGET_LANG, translated_title, translated_summary)
    with conn.cursor() as cursor:
        cursor.execute(statement, values)
    conn.commit()
def _translate_batch(conn, pending):
    """Translate and store each item of *pending*; return how many were stored."""
    stored = 0
    for item in pending:
        try:
            lang = item['lang']
            # Auto-detect language if needed
            if not lang:
                lang = detect_language(item['titulo'] or '')
                logger.info(f"Detected language: {lang} for news {item['id']}")
            # Skip if already target language
            if lang == TARGET_LANG:
                logger.debug(f"Skipping news {item['id']} - already in target language")
                continue
            save_translation(
                conn,
                item['id'],
                lang,
                item['titulo'],
                item['resumen'],
            )
            stored += 1
            # The title may be NULL; guard the slice so a saved item is not
            # reported as an error afterwards.
            logger.info(f"Translated news {item['id']}: {(item['titulo'] or '')[:50]}...")
        except Exception as e:
            logger.error(f"Error translating news {item['id']}: {e}")
            # A failed statement leaves the transaction aborted and every
            # later statement on this connection would fail too, so roll
            # back before moving on.  If the rollback itself raises, the
            # caller treats it as a database error and reconnects.
            conn.rollback()
    return stored


def process_translations():
    """Run the translation polling loop forever.

    Each cycle opens a fresh database connection, fetches one batch of
    pending news items, translates and stores them one at a time, and
    closes the connection.  The loop then pauses:

    * SLEEP_INTERVAL seconds when there was nothing pending, or when the
      batch produced no stored translation (every item skipped or failed),
      so the worker does not spin on items it cannot make progress on;
    * 5 seconds after a database-level error, including a failed connect.

    Does not return under normal operation.
    """
    logger.info("Starting translation worker...")
    while True:
        conn = None
        pause = 0
        try:
            # Connect inside the try so an unreachable database is logged
            # and retried instead of terminating the worker.
            conn = get_db_connection()
            pending = get_pending_translations(conn)
            if not pending:
                logger.info(f"No pending translations. Sleeping {SLEEP_INTERVAL}s...")
                pause = SLEEP_INTERVAL
            else:
                logger.info(f"Found {len(pending)} pending translations")
                if _translate_batch(conn, pending) == 0:
                    logger.info(f"No translation stored in this batch. Sleeping {SLEEP_INTERVAL}s...")
                    pause = SLEEP_INTERVAL
        except Exception as e:
            logger.error(f"Database error: {e}")
            pause = 5
        finally:
            if conn is not None:
                conn.close()
        # Sleep only after the connection has been released.
        if pause:
            time.sleep(pause)
# Script entry point: log the configured target language, then hand control
# to the polling loop.
if __name__ == '__main__':
    logger.info(f"Translation worker started. Target: {TARGET_LANG}")
    process_translations()