Backend Go y workers Python:
- backend/cmd/server/main.go: ruta wiki_images configurable via WIKI_IMAGES_PATH
- backend/cmd/wiki_worker/main.go: default /opt/rss2 en lugar de /app, leer env
- workers/ctranslator_worker.py: default CT2_MODEL_PATH /opt/rss2 en lugar de /app
- workers/llm_categorizer_worker.py: default LLM_MODEL_PATH /opt/rss2
- workers/{langdetect,simple_translator,translation_scheduler}.py: DB_HOST default 'localhost' en lugar de 'db' (hostname Docker)
SQL / esquema:
- poc/seed.sql: corregir lógica de auto-traducciones ES (id LIKE md5() era incorrecto)
- init-db/06-tags.sql: eliminar columna wiki_checked duplicada
Documentación y configuración:
- docs/DEPLOY_DEBIAN.md: usar ct2-transformers-converter (lo que usa el worker real)
- deploy/debian/env.example: agregar WIKI_IMAGES_PATH
- deploy/debian/systemd/rss2-cluster.service: agregar HF_HOME faltante
- deploy/debian/install.sh: comparación numérica correcta de la versión de Go
- scripts/generate_secure_credentials.sh: ruta CT2_MODEL_PATH corregida
- frontend/nginx.conf: advertencia de que es configuración Docker legacy
- docs/QUICKSTART_LLM.md: nota de deprecación de Docker
- README.md: renombrar backend-go a backend en diagrama
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
109 lines
2.7 KiB
Python
109 lines
2.7 KiB
Python
#!/usr/bin/env python3
|
|
"""
Language Detection Worker

Detects and updates the language of news items in the database.
"""
|
|
|
|
import os
|
|
import sys
|
|
import time
|
|
import logging
|
|
from collections import Counter
|
|
|
|
import psycopg2
|
|
from psycopg2.extras import RealDictCursor
|
|
from langdetect import detect, LangDetectException
|
|
|
|
# Root logger: INFO and above, every line prefixed with a
# "[YYYY-MM-DD HH:MM:SS] LEVEL - " header.
logging.basicConfig(
    format='[%(asctime)s] %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    level=logging.INFO,
)
LOG = logging.getLogger(__name__)

# PostgreSQL connection parameters; each one can be overridden through the
# environment variable named on its line.
DB_CONFIG = dict(
    host=os.environ.get('DB_HOST', 'localhost'),
    port=int(os.environ.get('DB_PORT', 5432)),
    database=os.environ.get('DB_NAME', 'rss'),
    user=os.environ.get('DB_USER', 'rss'),
    password=os.environ.get('DB_PASS', 'rss'),
)

# Upper bound on the number of rows selected per batch (used as the SQL LIMIT).
BATCH_SIZE = int(os.environ.get('LANG_DETECT_BATCH', '1000'))
# Seconds the main loop sleeps after a batch that updated nothing.
SLEEP_INTERVAL = int(os.environ.get('LANG_DETECT_SLEEP', '60'))
|
|
|
|
def get_db_connection():
    """Open a new PostgreSQL connection using the settings in ``DB_CONFIG``.

    Returns:
        The connection object produced by ``psycopg2.connect``.
    """
    connection = psycopg2.connect(**DB_CONFIG)
    return connection
|
|
|
|
def detect_language(text):
    """Guess the language of ``text`` using langdetect.

    Args:
        text: Text to classify; may be ``None`` or empty.

    Returns:
        Whatever ``detect`` returns for the text, or ``None`` when the text is
        falsy, is shorter than 10 characters once surrounding whitespace is
        stripped, or when ``detect`` raises ``LangDetectException``.
    """
    too_short = not text or len(text.strip()) < 10
    if too_short:
        return None

    try:
        code = detect(text)
    except LangDetectException:
        code = None
    return code
|
|
|
|
def process_batch(conn):
    """Detect and store the language for one batch of unlabeled news rows.

    Selects up to ``BATCH_SIZE`` rows from ``noticias`` whose ``lang`` is NULL
    or blank (newest ``fecha`` first), runs ``detect_language`` on the title
    plus summary, writes every detected code back, and commits once at the
    end of the batch.

    Args:
        conn: Open psycopg2 connection. It is committed here but not closed.

    Returns:
        Number of rows whose ``lang`` was updated; 0 when no row was selected
        or no language could be detected.
    """
    lang_stats = Counter()

    # The context manager closes the cursor on every exit path, including the
    # early return below and any exception raised in the middle of the batch.
    with conn.cursor(cursor_factory=RealDictCursor) as cursor:
        # ONLY pick items where lang is NULL or empty
        cursor.execute("""
            SELECT id, titulo, resumen
            FROM noticias
            WHERE lang IS NULL OR TRIM(lang) = ''
            ORDER BY fecha DESC
            LIMIT %s
        """, (BATCH_SIZE,))

        rows = cursor.fetchall()
        if not rows:
            return 0

        for row in rows:
            titulo = (row['titulo'] or "").strip()
            resumen = (row['resumen'] or "").strip()
            combined = f"{titulo} {resumen}".strip()

            lang = detect_language(combined)
            if not lang:
                # NOTE(review): undetectable rows are left untouched, so they
                # still match the SELECT above and are fetched again by the
                # next batch.
                continue

            cursor.execute("""
                UPDATE noticias SET lang = %s WHERE id = %s
            """, (lang, row['id']))
            lang_stats[lang] += 1

        conn.commit()

    # One counter increment per UPDATE, so the total is the number of updates.
    updated = sum(lang_stats.values())
    if updated > 0:
        LOG.info("Updated %s news languages: %s", updated, dict(lang_stats))

    return updated
|
|
|
|
def main():
    """Run the language detection loop forever.

    Each iteration opens a fresh database connection, processes one batch and
    closes the connection again. The loop sleeps ``SLEEP_INTERVAL`` seconds
    when the batch updated nothing, 1 second when it updated something, and
    10 seconds after any error.
    """
    LOG.info("Language detection worker started")

    while True:
        try:
            conn = get_db_connection()
            try:
                processed = process_batch(conn)
            finally:
                # Close even when the batch fails; otherwise every failed
                # iteration would leave a connection open.
                conn.close()

            if processed == 0:
                LOG.info("No more news to process, sleeping...")
                time.sleep(SLEEP_INTERVAL)
            else:
                time.sleep(1)

        except Exception as e:
            # Top-level boundary: record the traceback and keep the worker
            # alive, retrying after a short pause.
            LOG.exception("Error: %s", e)
            time.sleep(10)
|
|
|
|
# Start the worker loop only when this file is executed as a script.
if __name__ == "__main__":
    main()
|