go integration and wikipedia
This commit is contained in:
parent
47a252e339
commit
ee90335b92
7828 changed files with 1307913 additions and 20807 deletions
405
workers/ctranslator_worker.py
Normal file
405
workers/ctranslator_worker.py
Normal file
|
|
@ -0,0 +1,405 @@
|
|||
import html
import logging
import os
import re
import time
from collections import defaultdict
from typing import List, Optional

import psycopg2
import psycopg2.extras
from langdetect import detect, DetectorFactory

import ctranslate2
from transformers import AutoTokenizer
|
||||
|
||||
DetectorFactory.seed = 0
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s: %(message)s")
|
||||
LOG = logging.getLogger("translator_ct2")
|
||||
|
||||
TRANSLATOR_ID = os.environ.get("TRANSLATOR_ID", "")
|
||||
TRANSLATOR_TOTAL = int(os.environ.get("TRANSLATOR_TOTAL", "1"))
|
||||
|
||||
def clean_text(text: str) -> str:
    """Strip HTML tags and entities from model output and normalize whitespace.

    Args:
        text: Raw (possibly HTML-laden) text; None/empty yields "".

    Returns:
        Plain text with tags removed, HTML entities decoded, and runs of
        whitespace collapsed to single spaces.
    """
    if not text:
        return ""
    # Remove markup tags first so entity decoding cannot create new ones.
    text = re.sub(r'<[^>]+>', '', text)
    # Drop the tokenizer's unknown-token marker.
    text = text.replace('<unk>', '')
    # BUG FIX: the original chained no-op .replace() calls ('&' -> '&', etc.)
    # because the entity names were lost; html.unescape decodes all entities
    # (&amp;, &lt;, &gt;, &quot;, &nbsp;, ...) correctly.
    text = html.unescape(text)
    # \s matches NBSP (U+00A0) too, so decoded &nbsp; collapses as well.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()
|
||||
|
||||
DB_CONFIG = {
|
||||
"host": os.environ.get("DB_HOST", "localhost"),
|
||||
"port": int(os.environ.get("DB_PORT", 5432)),
|
||||
"dbname": os.environ.get("DB_NAME", "rss"),
|
||||
"user": os.environ.get("DB_USER", "rss"),
|
||||
"password": os.environ.get("DB_PASS", "x"),
|
||||
}
|
||||
|
||||
def _env_list(name: str, default="es"):
|
||||
raw = os.environ.get(name)
|
||||
if raw:
|
||||
return [s.strip() for s in raw.split(",") if s.strip()]
|
||||
return [default]
|
||||
|
||||
def _env_int(name: str, default: int = 8):
|
||||
v = os.environ.get(name)
|
||||
try:
|
||||
return int(v)
|
||||
except Exception:
|
||||
return default
|
||||
|
||||
def _env_str(name: str, default=None):
|
||||
v = os.environ.get(name)
|
||||
return v if v else default
|
||||
|
||||
TARGET_LANGS = _env_list("TARGET_LANGS")
|
||||
BATCH_SIZE = _env_int("TRANSLATOR_BATCH", 8)
|
||||
MAX_SRC_TOKENS = _env_int("MAX_SRC_TOKENS", 512)
|
||||
MAX_NEW_TOKENS = _env_int("MAX_NEW_TOKENS", 512)
|
||||
|
||||
CT2_MODEL_PATH = _env_str("CT2_MODEL_PATH", "/app/models/nllb-ct2")
|
||||
CT2_DEVICE = _env_str("CT2_DEVICE", "cpu")
|
||||
CT2_COMPUTE_TYPE = _env_str("CT2_COMPUTE_TYPE", "int8")
|
||||
UNIVERSAL_MODEL = _env_str("UNIVERSAL_MODEL", "facebook/nllb-200-distilled-600M")
|
||||
BODY_CHARS_CHUNK = _env_int("BODY_CHARS_CHUNK", 900)
|
||||
|
||||
LANG_CODE_MAP = {
|
||||
"en": "eng_Latn", "es": "spa_Latn", "fr": "fra_Latn", "de": "deu_Latn",
|
||||
"it": "ita_Latn", "pt": "por_Latn", "nl": "nld_Latn", "sv": "swe_Latn",
|
||||
"da": "dan_Latn", "fi": "fin_Latn", "no": "nob_Latn",
|
||||
"pl": "pol_Latn", "cs": "ces_Latn", "sk": "slk_Latn",
|
||||
"sl": "slv_Latn", "hu": "hun_Latn", "ro": "ron_Latn",
|
||||
"el": "ell_Grek", "ru": "rus_Cyrl", "uk": "ukr_Cyrl",
|
||||
"tr": "tur_Latn", "ar": "arb_Arab", "fa": "pes_Arab",
|
||||
"he": "heb_Hebr", "zh": "zho_Hans", "ja": "jpn_Jpan",
|
||||
"ko": "kor_Hang", "vi": "vie_Latn",
|
||||
}
|
||||
|
||||
_tokenizer = None
|
||||
_translator = None
|
||||
|
||||
def ensure_model():
    """Lazily load the CTranslate2 translator and HF tokenizer (idempotent).

    If no converted model exists at CT2_MODEL_PATH, the Hugging Face model is
    converted first via convert_model().
    """
    global _tokenizer, _translator

    if _translator:
        # Already initialized on a previous call.
        return

    model_path = CT2_MODEL_PATH
    if not os.path.exists(os.path.join(model_path, "model.bin")):
        LOG.info(f"CTranslate2 model not found at {model_path}, converting from {UNIVERSAL_MODEL}...")
        convert_model()

    LOG.info(f"Loading CTranslate2 model from {model_path} on {CT2_DEVICE}")
    _translator = ctranslate2.Translator(
        model_path, device=CT2_DEVICE, compute_type=CT2_COMPUTE_TYPE
    )
    # Tokenizer comes from the original HF model; it is compatible with the
    # converted CT2 weights.
    _tokenizer = AutoTokenizer.from_pretrained(UNIVERSAL_MODEL)
    LOG.info("CTranslate2 model loaded successfully")
|
||||
|
||||
def convert_model():
    """Convert the Hugging Face model to CTranslate2 format on disk.

    Runs the `ct2-transformers-converter` CLI into CT2_MODEL_PATH with the
    configured quantization (int8 when CT2_COMPUTE_TYPE is "auto").

    Raises:
        RuntimeError: if the converter exits non-zero (stderr is included).
        subprocess.TimeoutExpired: if conversion exceeds 30 minutes.
    """
    import subprocess

    model_path = CT2_MODEL_PATH
    os.makedirs(model_path, exist_ok=True)

    # "auto" is not a valid quantization for the converter; default to int8.
    quantization = CT2_COMPUTE_TYPE if CT2_COMPUTE_TYPE != "auto" else "int8"

    cmd = [
        "ct2-transformers-converter",
        "--model", UNIVERSAL_MODEL,
        "--output_dir", model_path,
        "--quantization", quantization,
        "--force",
    ]

    LOG.info(f"Running: {' '.join(cmd)}")
    result = subprocess.run(cmd, capture_output=True, text=True, timeout=1800)

    if result.returncode != 0:
        LOG.error(f"Model conversion failed: {result.stderr}")
        # Include (truncated) stderr so the cause survives in the traceback,
        # not only in the logs.
        raise RuntimeError(f"Failed to convert model: {(result.stderr or '').strip()[:500]}")

    LOG.info("Model conversion completed")
|
||||
|
||||
def translate_texts(src: str, tgt: str, texts: List[str]) -> List[str]:
    """Translate a batch of texts from `src` to `tgt` with the CT2 NLLB model.

    Args:
        src: 2-letter source language code (mapped to an NLLB code).
        tgt: 2-letter target language code.
        texts: texts to translate; None/empty entries yield "" in the output.

    Returns:
        One translated string per input, "" where translation failed.
    """
    if not texts:
        return []

    ensure_model()

    clean = [(t or "").strip() for t in texts]
    if all(not t for t in clean):
        # Nothing to translate at all; keep positional correspondence.
        return ["" for _ in clean]

    # Unknown source codes assume Latin script; unknown targets fall back to Spanish.
    src_code = LANG_CODE_MAP.get(src, f"{src}_Latn")
    tgt_code = LANG_CODE_MAP.get(tgt, "spa_Latn")

    try:
        # NLLB tokenizers need the source language set before encoding.
        _tokenizer.src_lang = src_code
    except Exception:
        pass

    # Tokenize each text into token strings (CT2 consumes tokens, not ids).
    sources = []
    for t in clean:
        if t:
            ids = _tokenizer.encode(t, truncation=True, max_length=MAX_SRC_TOKENS)
            tokens = _tokenizer.convert_ids_to_tokens(ids)
            sources.append(tokens)
        else:
            sources.append([])

    # Force decoding to start with the target-language token.
    target_prefix = [[tgt_code]] * len(sources)

    results = _translator.translate_batch(
        sources,
        target_prefix=target_prefix,
        beam_size=2,
        max_decoding_length=MAX_NEW_TOKENS,
        repetition_penalty=2.0,
        no_repeat_ngram_size=3,
    )

    # Decode the best hypothesis of each result; the hypothesis shape differs
    # across ctranslate2 versions, hence the defensive isinstance checks.
    translated = []
    for result in results:
        try:
            if result.hypotheses and len(result.hypotheses) > 0:
                hyp = result.hypotheses[0]
                if isinstance(hyp, list) and len(hyp) > 0:
                    first_hyp = hyp[0]
                    if isinstance(first_hyp, dict) and "token_ids" in first_hyp:
                        tokens = first_hyp["token_ids"]
                        text = _tokenizer.decode(tokens)
                        translated.append(text.strip())
                    elif isinstance(first_hyp, str):
                        # NOTE(review): hyp[0] appears to be the forced
                        # target-language prefix token, hence the skip —
                        # confirm against the ctranslate2 output format.
                        token_strings = hyp[1:] if len(hyp) > 1 else []
                        if token_strings:
                            text = _tokenizer.convert_tokens_to_string(token_strings)
                            translated.append(text.strip())
                        else:
                            translated.append("")
                    else:
                        translated.append("")
                else:
                    translated.append("")
            else:
                translated.append("")
        except Exception as e:
            LOG.error(f"Error processing result: {e}")
            translated.append("")

    return translated
|
||||
|
||||
def split_body_into_chunks(text: str) -> List[str]:
    """Split text into chunks of at most ~BODY_CHARS_CHUNK characters.

    Splits preferentially at paragraph breaks and sentence-ending punctuation
    (Latin, Arabic and CJK). Short inputs come back as a single chunk; empty
    input yields [].
    """
    text = (text or "").strip()
    if len(text) <= BODY_CHARS_CHUNK:
        return [text] if text else []

    # Capturing split keeps the separators so chunk lengths stay accurate.
    pieces = re.split(r'(\n\n+|(?<=[\.\!\?؛؟。])\s+)', text)
    chunks: List[str] = []
    buffer = ""

    for piece in pieces:
        if not piece:
            continue
        if len(buffer) + len(piece) > BODY_CHARS_CHUNK:
            if buffer.strip():
                chunks.append(buffer.strip())
            buffer = piece
        else:
            buffer += piece

    if buffer.strip():
        chunks.append(buffer.strip())

    return chunks if chunks else [text]
|
||||
|
||||
def translate_body_long(src: str, tgt: str, body: str) -> str:
    """Translate a long body by chunking it and joining the translated pieces."""
    body = (body or "").strip()
    if not body:
        return ""

    chunks = split_body_into_chunks(body)
    if len(chunks) == 1:
        # Single chunk: translate the whole body in one call.
        return translate_texts(src, tgt, [body])[0]

    # Translate chunk by chunk to stay within the model's token budget.
    return " ".join(translate_texts(src, tgt, [chunk])[0] for chunk in chunks)
|
||||
|
||||
def normalize_lang(lang: Optional[str], default: str = "es") -> Optional[str]:
    """Normalize a language tag to its 2-letter lowercase prefix.

    Returns `default` when the input is None, empty, or all whitespace.
    """
    if not lang:
        return default
    code = lang.strip().lower()[:2]
    return code or default
|
||||
|
||||
def detect_lang(text: str) -> str:
    """Best-effort language detection; falls back to 'en' for short or undetectable text."""
    fallback = "en"
    if not text or len(text) < 10:
        # langdetect is unreliable on very short inputs.
        return fallback
    try:
        result = detect(text)
    except Exception:
        return fallback
    return result
|
||||
|
||||
def process_batch(conn, rows):
    """Translate a batch of pending `traducciones` rows and persist results.

    Rows whose source language already equals the target are completed by
    copying the original text. The rest are locked (locked_at) in one fast
    commit so concurrent workers skip them, grouped by (lang_from, lang_to),
    translated, and committed one row at a time. On a group-level failure the
    affected rows are marked status='error' to avoid an infinite retry loop.

    Args:
        conn: open psycopg2 connection (commits/rollbacks are done here).
        rows: dict-like rows with tr_id, lang_from, lang_to, titulo, resumen.
    """
    todo = []

    for r in rows:
        lang_to = normalize_lang(r.get("lang_to"), "es") or "es"
        lang_from = normalize_lang(r.get("lang_from")) or detect_lang(r.get("titulo") or "")

        titulo = (r.get("titulo") or "").strip()
        resumen = (r.get("resumen") or "").strip()

        if lang_from == lang_to:
            # Mark as done and copy original text if languages match
            cursor = conn.cursor()
            cursor.execute("""
                UPDATE traducciones
                SET titulo_trad = %s, resumen_trad = %s, status = 'done'
                WHERE id = %s
            """, (titulo, resumen, r.get("tr_id")))
            conn.commit()
            cursor.close()
            continue

        todo.append({
            "tr_id": r.get("tr_id"),
            "lang_from": lang_from,
            "lang_to": lang_to,
            "titulo": titulo,
            "resumen": resumen,
        })

    if not todo:
        return

    # 1. FAST LOCKING: commit locked_at immediately to inform other workers.
    # psycopg2 adapts a Python list to a Postgres array, so no string-built SQL.
    tr_ids = [item["tr_id"] for item in todo]
    cursor = conn.cursor()
    cursor.execute(
        "UPDATE traducciones SET locked_at = NOW() WHERE id = ANY(%s)",
        (tr_ids,),
    )
    conn.commit()
    cursor.close()

    # Group by language pair so each pair is translated in one model batch.
    groups = defaultdict(list)
    for item in todo:
        groups[(item["lang_from"], item["lang_to"])].append(item)

    for (lang_from, lang_to), items in groups.items():
        LOG.info(f"Translating {lang_from} -> {lang_to} ({len(items)} items)")

        try:
            titles = [i["titulo"] for i in items]
            translated_titles = translate_texts(lang_from, lang_to, titles)

            for item, tt in zip(items, translated_titles):
                body = (item["resumen"] or "").strip()
                tb = ""
                if body:
                    try:
                        tb = translate_body_long(lang_from, lang_to, body)
                    except Exception as e:
                        LOG.error(f"Body translation error for ID {item['tr_id']}: {e}")
                        tb = item["resumen"]

                tt = clean_text((tt or "").strip())
                tb = clean_text((tb or "").strip())

                # Fall back to the untranslated text rather than storing blanks.
                if not tt:
                    tt = item["titulo"]
                if not tb:
                    tb = item["resumen"]

                # 2. INDIVIDUAL COMMIT: save each item as it's done so other
                # workers (and readers) see progress immediately.
                try:
                    cursor = conn.cursor()
                    cursor.execute("""
                        UPDATE traducciones
                        SET titulo_trad = %s, resumen_trad = %s, status = 'done', locked_at = NULL
                        WHERE id = %s
                    """, (tt, tb, item["tr_id"]))
                    conn.commit()
                    cursor.close()
                except Exception as e:
                    LOG.error(f"Update error for ID {item['tr_id']}: {e}")
                    conn.rollback()

            LOG.info(f"Finished group {lang_from} -> {lang_to}")

        except Exception as e:
            LOG.error(f"Batch group error {lang_from} -> {lang_to}: {e}")
            # Mark these as error to avoid infinite loop if it's a model crash.
            # BUG FIX: the original built this statement inside a plain (non-f)
            # string, so the literal "{','.join(...)}" text reached Postgres
            # and the query always failed; use array adaptation instead.
            try:
                cursor = conn.cursor()
                cursor.execute(
                    "UPDATE traducciones SET status = 'error', locked_at = NULL WHERE id = ANY(%s)",
                    ([i["tr_id"] for i in items],),
                )
                conn.commit()
                cursor.close()
            except Exception:
                conn.rollback()
|
||||
|
||||
def fetch_pending_translations(conn):
    """Fetch and process untranslated rows for each configured target language.

    Uses FOR UPDATE SKIP LOCKED plus a 10-minute locked_at timeout so several
    worker instances can poll the same table without processing the same rows.

    Args:
        conn: open psycopg2 connection, shared with process_batch().
    """
    cursor = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)

    for lang in TARGET_LANGS:
        # NOTE: removed an unused `worker_id` local from the original.
        cursor.execute("""
            SELECT t.id as tr_id, t.lang_from, t.lang_to,
                   n.titulo, n.resumen, n.id as noticia_id
            FROM traducciones t
            JOIN noticias n ON n.id = t.noticia_id
            WHERE t.lang_to = %s
              AND (t.titulo_trad IS NULL OR t.resumen_trad IS NULL)
              AND (t.locked_at IS NULL OR t.locked_at < NOW() - INTERVAL '10 minutes')
            ORDER BY n.fecha DESC
            LIMIT %s
            FOR UPDATE SKIP LOCKED
        """, (lang, BATCH_SIZE))

        rows = cursor.fetchall()
        if rows:
            LOG.info(f"Found {len(rows)} pending translations for {lang}")
            process_batch(conn, rows)

    cursor.close()
|
||||
|
||||
def connect_db():
    """Open a new PostgreSQL connection using the module-level DB_CONFIG."""
    connection = psycopg2.connect(**DB_CONFIG)
    return connection
|
||||
|
||||
def main():
    """Worker entry point: poll for pending translations forever.

    Connects per iteration, processes pending rows, and sleeps 30s between
    polls. Errors are logged and the loop continues.
    """
    LOG.info(f"CTranslate2 translator worker started (device={CT2_DEVICE}, instances={TRANSLATOR_TOTAL})")
    ensure_model()

    while True:
        conn = None
        try:
            conn = connect_db()
            fetch_pending_translations(conn)
        except Exception as e:
            LOG.error(f"Error: {e}")
        finally:
            # BUG FIX: the original skipped conn.close() when processing
            # raised, leaking a connection per failed iteration.
            if conn is not None:
                try:
                    conn.close()
                except Exception:
                    pass

        time.sleep(30)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
109
workers/langdetect_worker.py
Normal file
109
workers/langdetect_worker.py
Normal file
|
|
@ -0,0 +1,109 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Language Detection Worker
|
||||
Detects and updates the language of news items in the database.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import logging
|
||||
from collections import Counter
|
||||
|
||||
import psycopg2
|
||||
from psycopg2.extras import RealDictCursor
|
||||
from langdetect import detect, LangDetectException
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format='[%(asctime)s] %(levelname)s - %(message)s',
|
||||
datefmt='%Y-%m-%d %H:%M:%S'
|
||||
)
|
||||
LOG = logging.getLogger(__name__)
|
||||
|
||||
DB_CONFIG = {
|
||||
'host': os.getenv('DB_HOST', 'db'),
|
||||
'port': int(os.getenv('DB_PORT', 5432)),
|
||||
'database': os.getenv('DB_NAME', 'rss'),
|
||||
'user': os.getenv('DB_USER', 'rss'),
|
||||
'password': os.getenv('DB_PASS', 'rss')
|
||||
}
|
||||
|
||||
BATCH_SIZE = int(os.getenv('LANG_DETECT_BATCH', '1000'))
|
||||
SLEEP_INTERVAL = int(os.getenv('LANG_DETECT_SLEEP', '60'))
|
||||
|
||||
def get_db_connection():
    """Create a fresh PostgreSQL connection from DB_CONFIG."""
    connection = psycopg2.connect(**DB_CONFIG)
    return connection
|
||||
|
||||
def detect_language(text):
    """Return the detected ISO language code, or None for short/undetectable text."""
    if not text or len(text.strip()) < 10:
        # Too little signal for langdetect to be reliable.
        return None
    try:
        detected = detect(text)
    except LangDetectException:
        detected = None
    return detected
|
||||
|
||||
def process_batch(conn):
    """Detect and store the language for news rows whose `lang` is missing.

    Fetches up to BATCH_SIZE rows with NULL/empty lang (newest first), runs
    language detection on title+summary, and writes the result back.

    Returns:
        Number of rows whose language was successfully detected and updated.
    """
    cursor = conn.cursor(cursor_factory=RealDictCursor)

    # ONLY pick items where lang is NULL or empty
    cursor.execute("""
        SELECT id, titulo, resumen
        FROM noticias
        WHERE lang IS NULL OR TRIM(lang) = ''
        ORDER BY fecha DESC
        LIMIT %s
    """, (BATCH_SIZE,))

    rows = cursor.fetchall()
    if not rows:
        return 0

    lang_stats = Counter()
    updated = 0

    for row in rows:
        title = (row['titulo'] or "").strip()
        summary = (row['resumen'] or "").strip()
        sample = f"{title} {summary}".strip()

        lang = detect_language(sample)
        if not lang:
            # Undetectable rows keep a NULL lang and will be retried later.
            continue

        cursor.execute("""
            UPDATE noticias SET lang = %s WHERE id = %s
        """, (lang, row['id']))
        lang_stats[lang] += 1
        updated += 1

    conn.commit()
    cursor.close()

    if updated > 0:
        LOG.info(f"Updated {updated} news languages: {dict(lang_stats)}")

    return updated
|
||||
|
||||
def main():
    """Run the language-detection loop forever.

    Sleeps SLEEP_INTERVAL when nothing is pending, 1s between productive
    batches, and 10s after an error. The connection is always closed, even
    when a batch fails.
    """
    LOG.info("Language detection worker started")

    while True:
        conn = None
        processed = None  # None signals "iteration errored"
        try:
            conn = get_db_connection()
            processed = process_batch(conn)
        except Exception as e:
            LOG.error(f"Error: {e}")
        finally:
            # BUG FIX: the original skipped conn.close() on exception,
            # leaking a connection per failed iteration.
            if conn is not None:
                try:
                    conn.close()
                except Exception:
                    pass

        if processed is None:
            time.sleep(10)
        elif processed == 0:
            LOG.info("No more news to process, sleeping...")
            time.sleep(SLEEP_INTERVAL)
        else:
            time.sleep(1)
|
||||
|
|
@ -3,7 +3,8 @@ import time
|
|||
import logging
|
||||
import re
|
||||
import string
|
||||
from typing import List, Tuple
|
||||
import json
|
||||
from typing import List, Tuple, Set, Dict
|
||||
from collections import Counter
|
||||
|
||||
import psycopg2
|
||||
|
|
@ -46,6 +47,49 @@ ENT_LABELS = {
|
|||
"MISC": "tema",
|
||||
}
|
||||
|
||||
# ==========================================================
|
||||
# Configuración global de entidades (Synonyms / Blacklist)
|
||||
# ==========================================================
|
||||
ENTITY_CONFIG = {"blacklist": [], "synonyms": {}}
|
||||
REVERSE_SYNONYMS = {}
|
||||
|
||||
def load_entity_config():
    """Load entity_config.json (blacklist + synonyms) into the module globals.

    Also rebuilds REVERSE_SYNONYMS so both every alias and every canonical
    name map (lowercased) to the canonical form. Missing file is a no-op;
    parse errors are logged and leave the previous config in place.
    """
    global ENTITY_CONFIG, REVERSE_SYNONYMS
    path = "entity_config.json"
    if not os.path.exists(path):
        return
    try:
        with open(path, "r", encoding="utf-8") as f:
            ENTITY_CONFIG = json.load(f)

        # Inverse map for fast synonym lookups.
        REVERSE_SYNONYMS = {}
        for canonical, aliases in ENTITY_CONFIG.get("synonyms", {}).items():
            for alias in aliases:
                REVERSE_SYNONYMS[alias.lower()] = canonical
            REVERSE_SYNONYMS[canonical.lower()] = canonical

        log.info(f"Loaded entity_config.json: {len(ENTITY_CONFIG.get('blacklist', []))} blacklisted, {len(ENTITY_CONFIG.get('synonyms', {}))} synonym groups")
    except Exception as e:
        log.error(f"Error loading entity_config.json: {e}")
|
||||
|
||||
def get_canonical_name(text: str) -> str:
    """Map a surface form to its canonical entity name (identity when unknown)."""
    if not text:
        return text
    return REVERSE_SYNONYMS.get(text.lower(), text)
|
||||
|
||||
def is_blacklisted(text: str) -> bool:
    """True when text is empty, configured as blacklisted, or purely numeric/punctuation.

    Args:
        text: candidate entity/topic string.
    """
    if not text:
        return True
    lower = text.lower()
    # Case-insensitive blacklist check; any() short-circuits instead of
    # materializing the whole lowered list on every call (as the original did).
    if any(lower == item.lower() for item in ENTITY_CONFIG.get("blacklist", [])):
        return True
    # Check if it's just a number (digits/punctuation/whitespace only)
    if re.fullmatch(r"[0-9\s\.,\-:/]+", lower):
        return True
    return False
|
||||
|
||||
# ==========================================================
|
||||
# Limpieza avanzada
|
||||
# ==========================================================
|
||||
|
|
@ -125,7 +169,11 @@ def clean_tag_text(text: str) -> str | None:
|
|||
if not text:
|
||||
return None
|
||||
|
||||
text = BeautifulSoup(text, "html.parser").get_text()
|
||||
try:
|
||||
text = BeautifulSoup(text, "html.parser").get_text()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
for pat in HTML_TRASH_PATTERNS:
|
||||
text = re.sub(pat, "", text)
|
||||
|
||||
|
|
@ -133,71 +181,25 @@ def clean_tag_text(text: str) -> str | None:
|
|||
text = text.strip(string.punctuation + " ")
|
||||
|
||||
if len(text) < 3:
|
||||
log.debug(f"Clean reject (too short): {text}")
|
||||
return None
|
||||
if re.search(r"[<>/\\]", text):
|
||||
log.debug(f"Clean reject (bad chars): {text}")
|
||||
return None
|
||||
|
||||
if is_blacklisted(text):
|
||||
return None
|
||||
|
||||
lower = text.lower()
|
||||
if lower.startswith("href="):
|
||||
log.debug(f"Clean reject (href): {text}")
|
||||
return None
|
||||
if _looks_like_attr_or_path(lower):
|
||||
log.debug(f"Clean reject (attr/path): {text}")
|
||||
return None
|
||||
if lower in GENERIC_BAD_TAGS:
|
||||
log.debug(f"Clean reject (generic bad): {text}")
|
||||
return None
|
||||
|
||||
replacements = {
|
||||
"ee.uu.": "Estados Unidos",
|
||||
"los estados unidos": "Estados Unidos",
|
||||
"eeuu": "Estados Unidos",
|
||||
"eu": "Unión Europea",
|
||||
"ue": "Unión Europea",
|
||||
"kosova": "Kosovo",
|
||||
# Specific User Requests
|
||||
"trump": "Donald Trump",
|
||||
"mr. trump": "Donald Trump",
|
||||
"mr trump": "Donald Trump",
|
||||
"doland trump": "Donald Trump",
|
||||
"el presidente trump": "Donald Trump",
|
||||
"president trump": "Donald Trump",
|
||||
"ex-president trump": "Donald Trump",
|
||||
"expresidente trump": "Donald Trump",
|
||||
"putin": "Vladimir Putin",
|
||||
"vladimir putin": "Vladimir Putin",
|
||||
"v. putin": "Vladimir Putin",
|
||||
"presidente putin": "Vladimir Putin",
|
||||
# New requests
|
||||
"sanchez": "Pedro Sánchez",
|
||||
"pedro sanchez": "Pedro Sánchez",
|
||||
"p. sanchez": "Pedro Sánchez",
|
||||
"mr. sanchez": "Pedro Sánchez",
|
||||
"sánchez": "Pedro Sánchez", # explicit match just in case
|
||||
"pedro sánchez": "Pedro Sánchez",
|
||||
"maduro": "Nicolás Maduro",
|
||||
"nicolas maduro": "Nicolás Maduro",
|
||||
"mr. maduro": "Nicolás Maduro",
|
||||
"lula": "Lula da Silva",
|
||||
"lula da silva": "Lula da Silva",
|
||||
"luiz inácio lula da silva": "Lula da Silva",
|
||||
}
|
||||
if lower in replacements:
|
||||
return replacements[lower]
|
||||
|
||||
# Blacklist (explicit removals requested)
|
||||
blacklist = {
|
||||
"getty images", "netflix", "fiscalia", "segun", "estoy", # People blacklist
|
||||
"and more", "app", "estamos", "ultra", # Orgs blacklist
|
||||
"hacienda", "fiscalía"
|
||||
}
|
||||
if lower in blacklist:
|
||||
log.debug(f"Clean reject (blacklist): {text}")
|
||||
return None
|
||||
|
||||
return text
|
||||
# Normalización vía entity_config
|
||||
canonical = get_canonical_name(text)
|
||||
|
||||
return canonical
|
||||
|
||||
|
||||
# ==========================================================
|
||||
|
|
@ -207,7 +209,11 @@ def clean_topic_text(text: str) -> str | None:
|
|||
if not text:
|
||||
return None
|
||||
|
||||
text = BeautifulSoup(text, "html.parser").get_text()
|
||||
try:
|
||||
text = BeautifulSoup(text, "html.parser").get_text()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
for pat in HTML_TRASH_PATTERNS:
|
||||
text = re.sub(pat, "", text)
|
||||
|
||||
|
|
@ -217,6 +223,9 @@ def clean_topic_text(text: str) -> str | None:
|
|||
if len(text) < TOPIC_MIN_CHARS:
|
||||
return None
|
||||
|
||||
if is_blacklisted(text):
|
||||
return None
|
||||
|
||||
lower = text.lower()
|
||||
if _looks_like_attr_or_path(lower):
|
||||
return None
|
||||
|
|
@ -245,8 +254,6 @@ def clean_topic_text(text: str) -> str | None:
|
|||
return None
|
||||
if all(t in STOPWORDS for t in tokens):
|
||||
return None
|
||||
if re.fullmatch(r"[0-9\s\.,\-:/]+", norm):
|
||||
return None
|
||||
|
||||
return norm
|
||||
|
||||
|
|
@ -262,8 +269,6 @@ def extract_entities_and_topics(nlp, text: str) -> Tuple[List[Tuple[str, str]],
|
|||
return ents, topics
|
||||
|
||||
doc = nlp(text)
|
||||
# log.debug(f"Processing text ({len(text)} chars): {text[:30]}...")
|
||||
# log.debug(f"Entities found: {len(doc.ents)}")
|
||||
|
||||
# --- ENTIDADES ---
|
||||
for ent in doc.ents:
|
||||
|
|
@ -273,35 +278,8 @@ def extract_entities_and_topics(nlp, text: str) -> Tuple[List[Tuple[str, str]],
|
|||
|
||||
cleaned = clean_tag_text(ent.text)
|
||||
if not cleaned:
|
||||
# log.debug(f"Rejected entity: {ent.text} ({ent.label_})")
|
||||
continue
|
||||
|
||||
if tipo == "persona":
|
||||
lower_cleaned = cleaned.lower()
|
||||
# Aggressive normalization rules for VIPs
|
||||
# Use token checks or substring checks carefully
|
||||
if "trump" in lower_cleaned.split():
|
||||
# Token 'trump' exists? e.g. "donald trump", "trump", "mr. trump"
|
||||
# Exclude family members
|
||||
family = ["ivanka", "melania", "eric", "tiffany", "barron", "lara", "mary", "fred"]
|
||||
if not any(f in lower_cleaned for f in family):
|
||||
cleaned = "Donald Trump"
|
||||
|
||||
elif "sanchez" in lower_cleaned or "sánchez" in lower_cleaned:
|
||||
# Be careful of other Sanchez? But user context implies Pedro.
|
||||
if "pedro" in lower_cleaned or "presidente" in lower_cleaned or lower_cleaned in ["sanchez", "sánchez"]:
|
||||
cleaned = "Pedro Sánchez"
|
||||
|
||||
elif "maduro" in lower_cleaned:
|
||||
cleaned = "Nicolás Maduro"
|
||||
|
||||
elif "lula" in lower_cleaned:
|
||||
cleaned = "Lula da Silva"
|
||||
|
||||
elif "putin" in lower_cleaned:
|
||||
cleaned = "Vladimir Putin"
|
||||
|
||||
# log.debug(f"Accepted entity: {cleaned} ({tipo})")
|
||||
ents.append((cleaned, tipo))
|
||||
|
||||
# --- TOPICS ---
|
||||
|
|
@ -311,10 +289,10 @@ def extract_entities_and_topics(nlp, text: str) -> Tuple[List[Tuple[str, str]],
|
|||
if cleaned:
|
||||
topic_counter[cleaned] += 1
|
||||
|
||||
ent_values = {v for (v, _) in ents}
|
||||
ent_values = {v.lower() for (v, _) in ents}
|
||||
|
||||
for val, _count in topic_counter.most_common(TOPIC_MAX_PER_DOC):
|
||||
if val in ent_values:
|
||||
if val.lower() in ent_values:
|
||||
continue
|
||||
topics.append((val, "tema"))
|
||||
|
||||
|
|
@ -328,85 +306,98 @@ def main():
|
|||
global STOPWORDS
|
||||
|
||||
# Cargar spaCy
|
||||
log.info("Cargando modelo spaCy es_core_news_md...")
|
||||
nlp = spacy.load("es_core_news_md", disable=["lemmatizer", "textcat"])
|
||||
log.info("Cargando modelo spaCy es_core_news_lg...")
|
||||
nlp = spacy.load("es_core_news_lg", disable=["lemmatizer", "textcat"])
|
||||
STOPWORDS = set(nlp.Defaults.stop_words)
|
||||
log.info("Modelo spaCy cargado correctamente.")
|
||||
|
||||
# Cargar configuración de entidades
|
||||
load_entity_config()
|
||||
|
||||
while True:
|
||||
try:
|
||||
with get_conn() as conn, conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
|
||||
cur.execute(
|
||||
"""
|
||||
SELECT t.id, t.titulo_trad, t.resumen_trad
|
||||
FROM traducciones t
|
||||
WHERE t.status = 'done'
|
||||
AND t.lang_to = %s
|
||||
AND NOT EXISTS (
|
||||
SELECT 1 FROM tags_noticia tn WHERE tn.traduccion_id = t.id
|
||||
)
|
||||
ORDER BY t.id DESC
|
||||
LIMIT %s;
|
||||
""",
|
||||
(NER_LANG, BATCH),
|
||||
)
|
||||
with get_conn() as conn:
|
||||
with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
|
||||
cur.execute(
|
||||
"""
|
||||
SELECT t.id, t.titulo_trad, t.resumen_trad, t.noticia_id
|
||||
FROM traducciones t
|
||||
WHERE t.status = 'done'
|
||||
AND t.lang_to = %s
|
||||
AND NOT EXISTS (
|
||||
SELECT 1 FROM tags_noticia tn WHERE tn.traduccion_id = t.id
|
||||
)
|
||||
ORDER BY t.id DESC
|
||||
LIMIT %s;
|
||||
""",
|
||||
(NER_LANG, BATCH),
|
||||
)
|
||||
|
||||
rows = cur.fetchall()
|
||||
|
||||
rows = cur.fetchall()
|
||||
|
||||
if not rows:
|
||||
time.sleep(5)
|
||||
continue
|
||||
|
||||
log.info(f"Procesando {len(rows)} traducciones para NER/temas...")
|
||||
|
||||
inserted_links = 0
|
||||
|
||||
for r in rows:
|
||||
text = f"{r['titulo_trad'] or ''}\n{r['resumen_trad'] or ''}".strip()
|
||||
if not text:
|
||||
if not rows:
|
||||
time.sleep(10)
|
||||
continue
|
||||
|
||||
ents, topics = extract_entities_and_topics(nlp, text)
|
||||
tags = ents + topics
|
||||
if not tags:
|
||||
continue
|
||||
log.info(f"Procesando {len(rows)} traducciones para NER/temas...")
|
||||
|
||||
for valor, tipo in tags:
|
||||
try:
|
||||
cur.execute(
|
||||
"""
|
||||
INSERT INTO tags (valor, tipo)
|
||||
VALUES (%s, %s)
|
||||
ON CONFLICT (valor, tipo)
|
||||
DO UPDATE SET valor = EXCLUDED.valor
|
||||
RETURNING id;
|
||||
""",
|
||||
(valor, tipo),
|
||||
)
|
||||
tag_id = cur.fetchone()[0]
|
||||
inserted_links = 0
|
||||
|
||||
cur.execute(
|
||||
"""
|
||||
INSERT INTO tags_noticia (traduccion_id, tag_id)
|
||||
VALUES (%s, %s)
|
||||
ON CONFLICT DO NOTHING;
|
||||
""",
|
||||
(r["id"], tag_id),
|
||||
)
|
||||
for r in rows:
|
||||
noticia_id = r["noticia_id"]
|
||||
traduccion_id = r["id"]
|
||||
|
||||
text = f"{r['titulo_trad'] or ''}\n{r['resumen_trad'] or ''}".strip()
|
||||
if not text:
|
||||
# Para evitar re-procesar, insertamos un tag especial '_none_'
|
||||
tags = [("_none_", "sistema")]
|
||||
else:
|
||||
ents, topics = extract_entities_and_topics(nlp, text)
|
||||
tags = ents + topics
|
||||
if not tags:
|
||||
tags = [("_none_", "sistema")]
|
||||
|
||||
if cur.rowcount > 0:
|
||||
inserted_links += 1
|
||||
for valor, tipo in tags:
|
||||
try:
|
||||
# Usar commit parcial por noticia para evitar abortar todo el batch
|
||||
cur.execute(
|
||||
"""
|
||||
INSERT INTO tags (valor, tipo)
|
||||
VALUES (%s, %s)
|
||||
ON CONFLICT (valor, tipo)
|
||||
DO UPDATE SET valor = EXCLUDED.valor
|
||||
RETURNING id;
|
||||
""",
|
||||
(valor, tipo),
|
||||
)
|
||||
tag_id = cur.fetchone()[0]
|
||||
|
||||
except Exception:
|
||||
log.exception("Error insertando tag/relación")
|
||||
cur.execute(
|
||||
"""
|
||||
INSERT INTO tags_noticia (traduccion_id, noticia_id, tag_id)
|
||||
VALUES (%s, %s, %s)
|
||||
ON CONFLICT DO NOTHING;
|
||||
""",
|
||||
(traduccion_id, noticia_id, tag_id),
|
||||
)
|
||||
|
||||
conn.commit()
|
||||
log.info(f"Lote NER OK. Nuevas relaciones tag_noticia: {inserted_links}")
|
||||
if cur.rowcount > 0:
|
||||
inserted_links += 1
|
||||
except Exception as e:
|
||||
log.error(f"Error insertando tag '{valor}': {e}")
|
||||
conn.rollback()
|
||||
# Volvemos a empezar el loop de tags para esta noticia no es buena idea,
|
||||
# pero el rollback abortó la transacción del cursor.
|
||||
# En psycopg2, tras rollback hay que seguir o cerrar.
|
||||
pass
|
||||
|
||||
conn.commit()
|
||||
|
||||
except Exception:
|
||||
log.exception("Error general en NER loop")
|
||||
time.sleep(5)
|
||||
log.info(f"Lote NER OK. Nuevas relaciones tag_noticia: {inserted_links}")
|
||||
|
||||
except Exception as e:
|
||||
log.exception(f"Error general en NER loop: {e}")
|
||||
time.sleep(10)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
|||
|
|
@ -1,334 +0,0 @@
|
|||
"""
|
||||
Worker de Qdrant
|
||||
Vectoriza noticias traducidas y las sube a Qdrant para búsquedas semánticas.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import uuid
|
||||
from datetime import datetime
|
||||
from typing import List, Dict, Any
|
||||
|
||||
# Añadir el directorio raíz al path
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
from db import get_read_conn, get_write_conn
|
||||
|
||||
try:
|
||||
from qdrant_client import QdrantClient
|
||||
from qdrant_client.models import Distance, VectorParams, PointStruct, Filter, FieldCondition, MatchValue
|
||||
except ImportError:
|
||||
print("❌ Error: qdrant-client no instalado. Ejecuta: pip install qdrant-client")
|
||||
sys.exit(1)
|
||||
|
||||
try:
|
||||
from sentence_transformers import SentenceTransformer
|
||||
except ImportError:
|
||||
print("❌ Error: sentence-transformers no instalado")
|
||||
sys.exit(1)
|
||||
|
||||
# Configuración
|
||||
QDRANT_HOST = os.environ.get("QDRANT_HOST", "localhost")
|
||||
QDRANT_PORT = int(os.environ.get("QDRANT_PORT", "6333"))
|
||||
QDRANT_COLLECTION = os.environ.get("QDRANT_COLLECTION_NAME", "news_vectors")
|
||||
|
||||
EMB_MODEL = os.environ.get("EMB_MODEL", "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
|
||||
EMB_DEVICE = os.environ.get("EMB_DEVICE", "cuda")
|
||||
BATCH_SIZE = int(os.environ.get("QDRANT_BATCH_SIZE", "100"))
|
||||
SLEEP_IDLE = int(os.environ.get("QDRANT_SLEEP_IDLE", "30"))
|
||||
|
||||
# Cliente Qdrant global
|
||||
qdrant_client = None
|
||||
embedding_model = None
|
||||
|
||||
|
||||
def init_qdrant_client():
    """Connect to Qdrant and ensure the target collection exists.

    Creates the collection (cosine distance, 384-dim vectors) when missing
    and prints the current point count.
    """
    global qdrant_client

    print(f"🔌 Conectando a Qdrant en {QDRANT_HOST}:{QDRANT_PORT}...")
    qdrant_client = QdrantClient(host=QDRANT_HOST, port=QDRANT_PORT)

    existing = {c.name for c in qdrant_client.get_collections().collections}

    if QDRANT_COLLECTION in existing:
        print(f"✅ Colección '{QDRANT_COLLECTION}' ya existe")
    else:
        print(f"📦 Creando colección '{QDRANT_COLLECTION}'...")
        # paraphrase-multilingual-MiniLM-L12-v2 produces 384-dim embeddings.
        vector_size = 384
        qdrant_client.create_collection(
            collection_name=QDRANT_COLLECTION,
            vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE),
        )
        print(f"✅ Colección '{QDRANT_COLLECTION}' creada (dimensión: {vector_size})")

    collection_info = qdrant_client.get_collection(QDRANT_COLLECTION)
    print(f"📊 Puntos en colección: {collection_info.points_count}")
|
||||
|
||||
|
||||
def init_embedding_model():
    """Load the sentence-transformers model into the module-level global."""
    global embedding_model

    print(f"🤖 Cargando modelo de embeddings: {EMB_MODEL}")
    print(f"🖥️ Dispositivo: {EMB_DEVICE}")
    embedding_model = SentenceTransformer(EMB_MODEL, device=EMB_DEVICE)
    print(f"✅ Modelo cargado correctamente")
def get_pending_news(limit: int = BATCH_SIZE) -> List[Dict[str, Any]]:
    """Return translated news rows still pending vectorization.

    Args:
        limit: Maximum number of rows to fetch.

    Returns:
        One dict per pending translation, joined with its source article.
    """
    sql = """
                SELECT
                    t.id as traduccion_id,
                    t.noticia_id,
                    TRIM(t.lang_to) as lang,
                    t.titulo_trad as titulo,
                    t.resumen_trad as resumen,
                    n.url,
                    n.fecha,
                    n.fuente_nombre,
                    n.categoria_id,
                    n.pais_id
                FROM traducciones t
                INNER JOIN noticias n ON t.noticia_id = n.id
                WHERE t.vectorized = FALSE
                AND t.status = 'done'
                ORDER BY t.created_at ASC
                LIMIT %s
            """
    with get_read_conn() as conn:
        with conn.cursor() as cur:
            cur.execute(sql, (limit,))
            cols = [desc[0] for desc in cur.description]
            return [dict(zip(cols, row)) for row in cur.fetchall()]
def generate_embeddings(texts: List[str]) -> List[List[float]]:
    """Encode a list of texts with the shared embedding model.

    Args:
        texts: Texts to embed.

    Returns:
        One embedding vector (list of floats) per input text.
    """
    vectors = embedding_model.encode(
        texts,
        batch_size=32,
        show_progress_bar=False,
        convert_to_numpy=True,
    )
    return vectors.tolist()
def upload_to_qdrant(news_batch: List[Dict[str, Any]]):
    """Embed a batch of translated news and upsert them into Qdrant.

    Also marks each translation row as vectorized in PostgreSQL and stores
    the generated Qdrant point id on the row.

    Args:
        news_batch: Rows as returned by ``get_pending_news``; each row is
            mutated in place with a ``qdrant_point_id`` key.
    """
    if not news_batch:
        return

    # Embed title + summary together. Guard against NULL columns: the old
    # f-string rendered None as the literal text "None", polluting vectors.
    texts = [
        f"{news['titulo'] or ''} {news['resumen'] or ''}".strip()
        for news in news_batch
    ]

    print(f" 🧮 Generando embeddings para {len(texts)} noticias...")
    embeddings = generate_embeddings(texts)

    # Build one point per news item, with metadata as payload.
    points = []
    for news, embedding in zip(news_batch, embeddings):
        point_id = str(uuid.uuid4())
        payload = {
            "news_id": news['noticia_id'],
            "traduccion_id": news['traduccion_id'],
            "titulo": news['titulo'],
            "resumen": news['resumen'],
            "url": news['url'],
            "fecha": news['fecha'].isoformat() if news['fecha'] else None,
            "fuente_nombre": news['fuente_nombre'],
            "categoria_id": news['categoria_id'],
            "pais_id": news['pais_id'],
            "lang": news['lang'],
        }
        points.append(PointStruct(id=point_id, vector=embedding, payload=payload))
        # Remember the point id so the DB row can reference it.
        news['qdrant_point_id'] = point_id

    print(f" ⬆️ Subiendo {len(points)} puntos a Qdrant...")
    qdrant_client.upsert(
        collection_name=QDRANT_COLLECTION,
        points=points
    )

    print(f" 💾 Actualizando estado en PostgreSQL...")
    with get_write_conn() as conn:
        with conn.cursor() as cur:
            # Single executemany instead of one execute() call per row.
            cur.executemany("""
                UPDATE traducciones
                SET
                    vectorized = TRUE,
                    vectorization_date = NOW(),
                    qdrant_point_id = %s
                WHERE id = %s
            """, [(n['qdrant_point_id'], n['traduccion_id']) for n in news_batch])
        conn.commit()

    print(f" ✅ Lote subido correctamente")
def process_batch():
    """Vectorize one batch of pending translated news.

    Returns:
        Number of news items processed (0 when the queue is empty or the
        upload failed).
    """
    pending = get_pending_news()
    if not pending:
        return 0

    print(f"\n📋 Procesando {len(pending)} noticias traducidas...")
    try:
        upload_to_qdrant(pending)
    except Exception as e:
        print(f"❌ Error procesando lote: {e}")
        return 0
    return len(pending)
def get_stats():
    """Return global translation/vectorization counters for lang 'es'."""
    sql = """
                SELECT
                    COUNT(*) as total,
                    COUNT(*) FILTER (WHERE vectorized = TRUE) as vectorizadas,
                    COUNT(*) FILTER (WHERE vectorized = FALSE AND status = 'done') as pendientes
                FROM traducciones
                WHERE lang_to = 'es'
            """
    with get_read_conn() as conn:
        with conn.cursor() as cur:
            cur.execute(sql)
            total, vectorizadas, pendientes = cur.fetchone()
    return {
        'total': total,
        'vectorizadas': vectorizadas,
        'pendientes': pendientes,
    }
def main():
    """Worker entry point: init Qdrant and the model, then poll forever."""
    banner = "=" * 80
    print(banner)
    print("🚀 Qdrant Vectorization Worker (Direct Translation)")
    print(banner)
    print(f"Qdrant: {QDRANT_HOST}:{QDRANT_PORT}")
    print(f"Colección: {QDRANT_COLLECTION}")
    print(f"Modelo: {EMB_MODEL}")
    print(f"Dispositivo: {EMB_DEVICE}")
    print(f"Tamaño de lote: {BATCH_SIZE}")
    print(banner)

    try:
        init_qdrant_client()
    except Exception as e:
        print(f"❌ Error inicializando Qdrant: {e}")
        print("⚠️ Asegúrate de que Qdrant esté corriendo")
        return

    try:
        init_embedding_model()
    except Exception as e:
        print(f"❌ Error cargando modelo de embeddings: {e}")
        return

    print("\n🔄 Iniciando loop de procesamiento...\n")

    total_processed = 0
    while True:
        try:
            processed = process_batch()
            total_processed += processed

            if processed > 0:
                print(f"\n✅ Lote completado: {processed} noticias vectorizadas")
                print(f"📊 Total procesado en esta sesión: {total_processed}")
                # Periodic global counters for operator visibility.
                stats = get_stats()
                print(f"📈 Estadísticas globales:")
                print(f" Total traducciones: {stats['total']}")
                print(f" Vectorizadas: {stats['vectorizadas']}")
                print(f" Pendientes: {stats['pendientes']}")
            else:
                print(f"💤 No hay noticias pendientes. Esperando {SLEEP_IDLE}s...")
                time.sleep(SLEEP_IDLE)
        except KeyboardInterrupt:
            print("\n\n⏹️ Worker detenido por el usuario")
            break
        except Exception as e:
            print(f"\n❌ Error en loop principal: {e}")
            print(f"⏳ Esperando {SLEEP_IDLE}s antes de reintentar...")
            time.sleep(SLEEP_IDLE)


if __name__ == "__main__":
    main()
|
@ -1,202 +0,0 @@
|
|||
import os
|
||||
import time
|
||||
import logging
|
||||
from typing import List, Tuple
|
||||
|
||||
import numpy as np
|
||||
import psycopg2
|
||||
import psycopg2.extras
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format='[related] %(asctime)s %(levelname)s: %(message)s'
|
||||
)
|
||||
|
||||
DB = dict(
|
||||
host=os.environ.get("DB_HOST", "localhost"),
|
||||
port=int(os.environ.get("DB_PORT", 5432)),
|
||||
dbname=os.environ.get("DB_NAME", "rss"),
|
||||
user=os.environ.get("DB_USER", "rss"),
|
||||
password=os.environ.get("DB_PASS", "x"),
|
||||
)
|
||||
|
||||
EMB_MODEL = os.environ.get(
|
||||
"EMB_MODEL",
|
||||
"sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
|
||||
)
|
||||
|
||||
TOPK = int(os.environ.get("RELATED_TOPK", 10))
|
||||
BATCH_IDS = int(os.environ.get("RELATED_BATCH_IDS", 200))
|
||||
SLEEP_IDLE = float(os.environ.get("RELATED_SLEEP", 10))
|
||||
MIN_SCORE = float(os.environ.get("RELATED_MIN_SCORE", 0.0))
|
||||
WINDOW_HOURS = int(os.environ.get("RELATED_WINDOW_H", 0))
|
||||
|
||||
|
||||
def get_conn():
    """Open a new PostgreSQL connection using the module-level DB config."""
    return psycopg2.connect(**DB)
def fetch_all_embeddings(cur) -> Tuple[List[int], np.ndarray]:
    """Load all usable 'es' embeddings for EMB_MODEL and L2-normalize them.

    Args:
        cur: An open database cursor.

    Returns:
        (ids, matrix) where each matrix row is a unit-norm vector aligned
        with ids; ([], None) when nothing usable is found.
    """
    sql = """
        SELECT e.traduccion_id, e.embedding, n.fecha
        FROM traduccion_embeddings e
        JOIN traducciones t ON t.id = e.traduccion_id
        JOIN noticias n ON n.id = t.noticia_id
        WHERE e.model = %s
          AND t.status = 'done'
          AND t.lang_to = 'es'
    """
    params = [EMB_MODEL]
    if WINDOW_HOURS > 0:
        # Optional recency window keeps the similarity matrix small.
        sql += " AND n.fecha >= NOW() - INTERVAL %s"
        params.append(f"{WINDOW_HOURS} hours")

    cur.execute(sql, params)
    rows = cur.fetchall()
    if not rows:
        return [], None

    ids, vecs = [], []
    for tr_id, emb, _fecha in rows:
        if not emb:
            continue
        vec = np.asarray(emb, dtype=np.float32)
        # Skip malformed embeddings (wrong rank or empty).
        if vec.ndim != 1 or vec.size == 0:
            continue
        ids.append(tr_id)
        vecs.append(vec)

    if not ids:
        return [], None

    mat = np.vstack(vecs)
    # Normalize rows so plain dot products are cosine similarities.
    norms = np.linalg.norm(mat, axis=1, keepdims=True)
    norms[norms == 0] = 1e-8
    return ids, mat / norms
def fetch_pending_ids(cur, limit) -> List[int]:
    """Return ids of 'done' es-translations that have an embedding for
    EMB_MODEL but no related_noticias rows yet."""
    cur.execute(
        """
        SELECT t.id
        FROM traducciones t
        JOIN traduccion_embeddings e
          ON e.traduccion_id = t.id AND e.model = %s
        LEFT JOIN related_noticias r
          ON r.traduccion_id = t.id
        WHERE t.lang_to = 'es'
          AND t.status = 'done'
        GROUP BY t.id
        HAVING COUNT(r.related_traduccion_id) = 0
        ORDER BY t.id DESC
        LIMIT %s;
        """,
        (EMB_MODEL, limit),
    )
    return [row[0] for row in cur.fetchall()]
def topk(idx: int, ids_all: List[int], mat: np.ndarray, K: int) -> List[Tuple[int, float]]:
    """Return the K most similar rows to row ``idx`` as (id, score) pairs.

    ``mat`` must have unit-norm rows so dot products are cosine
    similarities. Self-similarity and scores below MIN_SCORE are masked
    with -999 sentinels.
    """
    sims = mat @ mat[idx]
    sims[idx] = -999.0  # never report the query item itself

    if MIN_SCORE > 0:
        sims = np.where(sims >= MIN_SCORE, sims, -999.0)

    if K >= len(sims):
        order = np.argsort(-sims)
    else:
        # argpartition is O(n); only the K candidates get fully sorted.
        candidates = np.argpartition(-sims, K)[:K]
        order = candidates[np.argsort(-sims[candidates])]

    return [(ids_all[j], float(sims[j])) for j in order[:K]]
def insert_related(cur, tr_id: int, pairs):
    """Upsert related-news rows for ``tr_id``.

    Self links and non-positive scores are dropped before the insert.
    """
    rows = [
        (tr_id, rid, float(score))
        for rid, score in pairs
        if rid != tr_id and float(score) > 0
    ]
    if not rows:
        return

    psycopg2.extras.execute_values(
        cur,
        """
        INSERT INTO related_noticias (traduccion_id, related_traduccion_id, score)
        VALUES %s
        ON CONFLICT (traduccion_id, related_traduccion_id)
        DO UPDATE SET score = EXCLUDED.score;
        """,
        rows,
    )
def build_for_ids(conn, target_ids: List[int]) -> int:
    """Compute and store top-K related news for each id in ``target_ids``.

    Returns:
        Number of translations for which related rows were (re)computed.
    """
    with conn.cursor() as cur:
        ids_all, mat = fetch_all_embeddings(cur)

    if not ids_all or mat is None:
        return 0

    # Map translation id -> row index in the similarity matrix.
    index_of = {tid: i for i, tid in enumerate(ids_all)}
    processed = 0

    with conn.cursor() as cur:
        for tr_id in target_ids:
            row = index_of.get(tr_id)
            if row is None:
                continue
            insert_related(cur, tr_id, topk(row, ids_all, mat, TOPK))
            processed += 1

    conn.commit()
    return processed
def main():
    """Poll for translations lacking related links and fill them in.

    Fix over the previous version: one connection per iteration, closed
    explicitly in ``finally``. psycopg2's ``with conn:`` block only
    commits or rolls back the transaction — it does NOT close the
    connection — so the old code leaked a connection on every loop pass
    and opened a redundant second one for the build step.
    """
    logging.info(
        "Iniciando related_worker (EMB=%s TOPK=%s BATCH=%s MIN=%.3f WINDOW_H=%s)",
        EMB_MODEL,
        TOPK,
        BATCH_IDS,
        MIN_SCORE,
        WINDOW_HOURS,
    )

    while True:
        try:
            conn = get_conn()
            try:
                with conn.cursor() as cur:
                    todo = fetch_pending_ids(cur, BATCH_IDS)

                if not todo:
                    time.sleep(SLEEP_IDLE)
                    continue

                done = build_for_ids(conn, todo)
                logging.info("Relacionadas generadas/actualizadas para %d traducciones.", done)
            finally:
                conn.close()
        except Exception:
            logging.exception("Error en related_worker")
            time.sleep(SLEEP_IDLE)


if __name__ == "__main__":
    main()
||||
207
workers/simple_translator.py
Normal file
207
workers/simple_translator.py
Normal file
|
|
@ -0,0 +1,207 @@
|
|||
#!/usr/bin/env python3
|
||||
import os
|
||||
import time
|
||||
import logging
|
||||
import re
|
||||
from typing import List, Optional
|
||||
|
||||
import psycopg2
|
||||
import psycopg2.extras
|
||||
from psycopg2.extras import execute_values
|
||||
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
|
||||
import torch
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s: %(message)s")
|
||||
LOG = logging.getLogger("translator_simple")
|
||||
|
||||
DB_CONFIG = {
|
||||
"host": os.environ.get("DB_HOST", "localhost"),
|
||||
"port": int(os.environ.get("DB_PORT", 5432)),
|
||||
"dbname": os.environ.get("DB_NAME", "rss"),
|
||||
"user": os.environ.get("DB_USER", "rss"),
|
||||
"password": os.environ.get("DB_PASS", "x"),
|
||||
}
|
||||
|
||||
TARGET_LANGS = os.environ.get("TARGET_LANGS", "es").split(",")
|
||||
BATCH_SIZE = int(os.environ.get("TRANSLATOR_BATCH", 32))
|
||||
MAX_SRC_TOKENS = 512
|
||||
|
||||
TRANSLATORS = {}
|
||||
|
||||
LANG_MAP = {
|
||||
"en": "en-ES",
|
||||
"es": "es-ES",
|
||||
"fr": "fr-ES",
|
||||
"de": "de-ES",
|
||||
"pt": "pt-ES",
|
||||
"it": "it-ES",
|
||||
"ru": "ru-ES",
|
||||
"ar": "ar-ES",
|
||||
"fa": "fa-ES",
|
||||
"ps": "ps-ES",
|
||||
"zh": "zh-ES",
|
||||
"ja": "ja-ES",
|
||||
"ko": "ko-ES",
|
||||
}
|
||||
|
||||
def get_translator(source_lang: str, target_lang: str = "es"):
    """Return (and cache) a Helsinki-NLP opus-mt translation pipeline.

    Returns:
        A transformers translation pipeline, or None when the model for
        the language pair cannot be loaded.
    """
    cache_key = f"{source_lang}_{target_lang}"
    if cache_key in TRANSLATORS:
        return TRANSLATORS[cache_key]

    model_name = f"Helsinki-NLP/opus-mt-{source_lang}-{target_lang}"
    if source_lang == target_lang:
        # NOTE(review): same-language pairs fall back to an "-es" model;
        # callers short-circuit this case before reaching here.
        model_name = f"Helsinki-NLP/opus-mt-{source_lang}-es"

    LOG.info(f"Loading translator: {model_name}")
    try:
        device = 0 if torch.cuda.is_available() else -1
        translator = pipeline("translation", model=model_name, device=device)
    except Exception as e:
        LOG.error(f"Failed to load translator {model_name}: {e}")
        return None

    TRANSLATORS[cache_key] = translator
    LOG.info(f"Translator loaded: {cache_key}")
    return translator
def normalize_lang(lang: Optional[str], default: str = "es") -> Optional[str]:
    """Normalize a language tag to its lowercase two-letter code.

    Falls back to ``default`` when the input is None, empty, or blank.
    """
    if not lang:
        return default
    code = lang.strip().lower()[:2]
    return code if code else default
def translate_text(source_lang: str, target_lang: str, texts: List[str]) -> List[str]:
    """Translate each text from ``source_lang`` to ``target_lang``.

    Texts are returned unchanged when the languages match, the model
    cannot be loaded, or an individual translation fails.
    """
    if not texts:
        return []
    if source_lang == target_lang:
        return texts

    translator = get_translator(source_lang, target_lang)
    if not translator:
        return texts

    out = []
    for original in texts:
        if not original or not original.strip():
            out.append(original)
            continue
        try:
            # NOTE(review): MAX_SRC_TOKENS truncates by *characters* here,
            # not tokens — confirm whether that is intended.
            result = translator(original[:MAX_SRC_TOKENS], max_length=MAX_SRC_TOKENS)
            out.append(result[0]['translation_text'])
        except Exception as e:
            LOG.warning(f"Translation error: {e}")
            out.append(original)

    return out
def connect_db():
    """Open a new PostgreSQL connection from DB_CONFIG."""
    return psycopg2.connect(**DB_CONFIG)
def process_batch(conn, rows):
    """Translate a batch of pending rows and persist the results.

    Rows whose source language already equals the target are skipped.
    Items are grouped by (lang_from, lang_to) so each model is loaded
    once per group.

    Args:
        conn: Open database connection; committed once per group.
        rows: Dicts with tr_id, lang_from, lang_to, titulo, resumen.
    """
    from collections import defaultdict

    todo = []
    for r in rows:
        lang_to = normalize_lang(r.get("lang_to"), "es") or "es"
        lang_from = normalize_lang(r.get("lang_from")) or "en"

        if lang_from == lang_to:
            continue

        todo.append({
            "tr_id": r.get("tr_id"),
            "lang_from": lang_from,
            "lang_to": lang_to,
            "titulo": (r.get("titulo") or "").strip(),
            "resumen": (r.get("resumen") or "").strip(),
        })

    if not todo:
        return

    groups = defaultdict(list)
    for item in todo:
        groups[(item["lang_from"], item["lang_to"])].append(item)

    for (lang_from, lang_to), items in groups.items():
        LOG.info(f"Translating {lang_from} -> {lang_to} ({len(items)} items)")

        titles = [i["titulo"] for i in items]
        translated_titles = translate_text(lang_from, lang_to, titles)

        translated_bodies = []
        for i in items:
            body = (i["resumen"] or "").strip()
            if body:
                tr = translate_text(lang_from, lang_to, [body])
                translated_bodies.append(tr[0] if tr else body)
            else:
                translated_bodies.append("")

        # 'with' guarantees the cursor is closed even if an UPDATE raises;
        # the old code leaked the cursor on error paths, and bound an
        # unused enumerate index.
        with conn.cursor() as cursor:
            for item, tt, tb in zip(items, translated_titles, translated_bodies):
                # Fall back to the source text when translation came back empty.
                tt = (tt or "").strip() or item["titulo"]
                tb = (tb or "").strip() or item["resumen"]
                try:
                    cursor.execute("""
                        UPDATE traducciones
                        SET titulo_trad = %s, resumen_trad = %s, lang_to = %s
                        WHERE id = %s
                    """, (tt, tb, lang_to, item["tr_id"]))
                except Exception as e:
                    LOG.error(f"Update error: {e}")

            conn.commit()
        LOG.info(f"Translated {len(items)} items")
def fetch_pending_translations(conn):
    """Find translation rows missing output text and process them per
    target language."""
    cursor = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)

    for lang in TARGET_LANGS:
        cursor.execute("""
            SELECT t.id as tr_id, t.lang_from, t.lang_to,
                   n.titulo, n.resumen, n.id as noticia_id
            FROM traducciones t
            JOIN noticias n ON n.id = t.noticia_id
            WHERE t.lang_to = %s
              AND (t.titulo_trad IS NULL OR t.resumen_trad IS NULL)
            ORDER BY n.fecha DESC
            LIMIT %s
        """, (lang, BATCH_SIZE))

        pending = cursor.fetchall()
        if pending:
            LOG.info(f"Found {len(pending)} pending translations for {lang}")
            process_batch(conn, pending)

    cursor.close()
def main():
    """Poll the database every 30 seconds and translate pending rows.

    Fix: the connection is now closed in a ``finally`` block — previously
    an exception inside ``fetch_pending_translations`` skipped
    ``conn.close()`` and leaked the connection.
    """
    LOG.info("Simple translator worker started")

    while True:
        try:
            conn = connect_db()
            try:
                fetch_pending_translations(conn)
            finally:
                conn.close()
        except Exception as e:
            LOG.error(f"Error: {e}")

        time.sleep(30)

if __name__ == "__main__":
    main()
||||
151
workers/simple_translator_worker.py
Normal file
151
workers/simple_translator_worker.py
Normal file
|
|
@ -0,0 +1,151 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Simple Translation Worker using deep-translator
|
||||
Uses free translation APIs (Google, LibreTranslate, etc.)
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import logging
|
||||
from datetime import datetime
|
||||
|
||||
import psycopg2
|
||||
from psycopg2.extras import RealDictCursor
|
||||
from deep_translator import GoogleTranslator, MyMemoryTranslator, single_detection
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format='[%(asctime)s] %(levelname)s - %(message)s',
|
||||
datefmt='%Y-%m-%d %H:%M:%S'
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
DB_CONFIG = {
|
||||
'host': os.getenv('DB_HOST', 'db'),
|
||||
'port': int(os.getenv('DB_PORT', 5432)),
|
||||
'database': os.getenv('DB_NAME', 'rss'),
|
||||
'user': os.getenv('DB_USER', 'rss'),
|
||||
'password': os.getenv('DB_PASS', 'rss')
|
||||
}
|
||||
|
||||
TARGET_LANG = os.getenv('TARGET_LANGS', 'es').split(',')[0].strip()
|
||||
BATCH_SIZE = int(os.getenv('TRANSLATOR_BATCH', '32'))
|
||||
SLEEP_INTERVAL = int(os.getenv('TRANSLATOR_SLEEP', '60'))
|
||||
|
||||
def get_db_connection():
    """Open a new PostgreSQL connection from DB_CONFIG."""
    return psycopg2.connect(**DB_CONFIG)
def get_pending_translations(conn):
    """Return up to BATCH_SIZE news rows lacking a TARGET_LANG translation."""
    query = """
            SELECT n.id, n.feed_id, n.lang, n.titulo, n.resumen
            FROM noticias n
            WHERE NOT EXISTS (
                SELECT 1 FROM traducciones t
                WHERE t.noticia_id = n.id AND t.lang_to = %s
            )
            AND n.lang IS NOT NULL
            AND n.lang != %s
            ORDER BY n.created_at DESC
            LIMIT %s
        """
    with conn.cursor(cursor_factory=RealDictCursor) as cur:
        cur.execute(query, (TARGET_LANG, TARGET_LANG, BATCH_SIZE))
        return cur.fetchall()
def detect_language(text):
    """Detect language using MyMemory (free API); default to 'en' on any
    failure or when the text is too short to classify."""
    try:
        if text and len(text.strip()) > 10:
            return single_detection(text, api_key=None)
    except Exception as e:
        logger.debug(f"Language detection failed: {e}")
    return 'en'
def translate_text(text, source_lang, target_lang):
    """Translate text via Google Translate, falling back to MyMemory.

    Returns:
        The translated text, "" for blank input, or the original text
        when both services fail.
    """
    if not text or not text.strip():
        return ""

    try:
        result = GoogleTranslator(source=source_lang, target=target_lang).translate(text)
        return result if result else text
    except Exception as e:
        logger.warning(f"Google translation failed: {e}")

    # Fallback to MyMemory
    try:
        result = MyMemoryTranslator(source=source_lang, target=target_lang).translate(text)
        return result if result else text
    except Exception as e2:
        logger.error(f"MyMemory translation also failed: {e2}")
        return text
def save_translation(conn, noticia_id, lang_from, titulo, resumen):
    """Translate a news item's title/summary and upsert the traducciones row."""
    titulo_trad = translate_text(titulo, lang_from, TARGET_LANG)
    resumen_trad = translate_text(resumen, lang_from, TARGET_LANG) if resumen else ""

    with conn.cursor() as cur:
        cur.execute("""
            INSERT INTO traducciones (noticia_id, lang_from, lang_to, titulo_trad, resumen_trad, status, created_at)
            VALUES (%s, %s, %s, %s, %s, 'done', NOW())
            ON CONFLICT (noticia_id, lang_to) DO UPDATE SET
                titulo_trad = EXCLUDED.titulo_trad,
                resumen_trad = EXCLUDED.resumen_trad,
                status = 'done'
        """, (noticia_id, lang_from, TARGET_LANG, titulo_trad, resumen_trad))
    conn.commit()
def _translate_item(conn, item):
    """Translate and persist a single pending news row.

    Detects the source language when the row has none; skips rows already
    in the target language.
    """
    lang = item['lang']

    # Auto-detect language if needed
    if not lang or lang == '':
        lang = detect_language(item['titulo'] or '')
        logger.info(f"Detected language: {lang} for news {item['id']}")

    # Skip if already target language
    if lang == TARGET_LANG:
        logger.debug(f"Skipping news {item['id']} - already in target language")
        return

    save_translation(
        conn,
        item['id'],
        lang,
        item['titulo'],
        item['resumen']
    )
    logger.info(f"Translated news {item['id']}: {item['titulo'][:50]}...")


def process_translations():
    """Worker loop: fetch pending news, translate each, store results."""
    logger.info("Starting translation worker...")

    while True:
        conn = get_db_connection()
        try:
            pending = get_pending_translations(conn)

            if not pending:
                logger.info(f"No pending translations. Sleeping {SLEEP_INTERVAL}s...")
                time.sleep(SLEEP_INTERVAL)
                continue

            logger.info(f"Found {len(pending)} pending translations")

            for item in pending:
                try:
                    _translate_item(conn, item)
                except Exception as e:
                    logger.error(f"Error translating news {item['id']}: {e}")
                    continue

        except Exception as e:
            logger.error(f"Database error: {e}")
            time.sleep(5)
        finally:
            conn.close()

if __name__ == '__main__':
    logger.info(f"Translation worker started. Target: {TARGET_LANG}")
    process_translations()
||||
|
|
@ -1,244 +0,0 @@
|
|||
import os
|
||||
import time
|
||||
import logging
|
||||
import json
|
||||
import psycopg2
|
||||
from psycopg2.extras import execute_values
|
||||
|
||||
# Logging
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format='[topics_worker] %(asctime)s %(levelname)s: %(message)s'
|
||||
)
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
# Config
|
||||
DB_CONFIG = {
|
||||
"host": os.environ.get("DB_HOST", "localhost"),
|
||||
"port": int(os.environ.get("DB_PORT", 5432)),
|
||||
"dbname": os.environ.get("DB_NAME", "rss"),
|
||||
"user": os.environ.get("DB_USER", "rss"),
|
||||
"password": os.environ.get("DB_PASS", "x"),
|
||||
}
|
||||
|
||||
SLEEP_IDLE = 10
|
||||
BATCH_SIZE = 500
|
||||
|
||||
def get_conn():
    """Open a new PostgreSQL connection from DB_CONFIG."""
    return psycopg2.connect(**DB_CONFIG)
def load_topics(conn):
    """Load topics and their keywords.

    Returns:
        List of dicts: [{'id': 1, 'weight': 5, 'keywords': ['foo', 'bar']}].
        Topics with an empty keyword string are skipped.
    """
    with conn.cursor() as cur:
        cur.execute("SELECT id, weight, keywords FROM topics")
        rows = cur.fetchall()

    topics = []
    for tid, weight, kw_str in rows:
        if not kw_str:
            continue
        # Keywords are comma separated based on insert script
        keywords = [k.strip().lower() for k in kw_str.split(",") if k.strip()]
        topics.append({
            "id": tid,
            "weight": weight,
            "keywords": keywords,
        })
    return topics
def load_countries(conn):
    """Load countries with lowercase keywords used for text matching.

    Returns:
        List of dicts: [{'id': 1, 'name': 'España', 'keywords': ['españa', 'madrid']}].
    """
    # Hardcoded aliases for simplicity. A separate table would be better.
    ALIASES = {
        "Estados Unidos": ["eeuu", "ee.uu.", "usa", "estadounidense", "washington"],
        "Rusia": ["ruso", "rusa", "moscú", "kremlin"],
        "China": ["chino", "china", "pekin", "beijing"],
        "Ucrania": ["ucraniano", "kiev", "kyiv"],
        "Israel": ["israelí", "tel aviv", "jerusalén"],
        "España": ["español", "madrid"],
        "Reino Unido": ["uk", "londres", "británico"],
        "Francia": ["francés", "parís"],
        "Alemania": ["alemán", "berlín"],
        "Palestina": ["palestino", "gaza", "cisjordania"],
        "Irán": ["iraní", "teherán"],
    }

    with conn.cursor() as cur:
        cur.execute("SELECT id, nombre FROM paises")
        rows = cur.fetchall()

    countries = []
    for cid, name in rows:
        keywords = [name.lower()] + ALIASES.get(name, [])
        countries.append({"id": cid, "name": name, "keywords": keywords})
    return countries
def process_batch(conn, topics, countries):
    """Tag one batch of unprocessed news with topics and a country.

    Fetches up to BATCH_SIZE rows with topics_processed = FALSE, matches
    their text against topic and country keywords, writes news_topics
    rows plus pais_id updates, then marks the rows processed.

    Returns:
        Number of news rows examined in this batch.
    """
    with conn.cursor() as cur:
        # Fetch news
        cur.execute("""
            SELECT id, titulo, resumen
            FROM noticias
            WHERE topics_processed = FALSE
            ORDER BY fecha DESC
            LIMIT %s
        """, (BATCH_SIZE,))
        news_items = cur.fetchall()

    if not news_items:
        return 0

    inserts = []          # (noticia_id, topic_id, score)
    country_updates = []  # (pais_id, noticia_id)
    processed_ids = []

    for nid, titulo, resumen in news_items:
        text_lower = ((titulo or "") + " " + (resumen or "")).lower()

        # 1. Topics: score = weight * number of matching keywords.
        for topic in topics:
            hits = sum(1 for kw in topic["keywords"] if kw in text_lower)
            if hits > 0:
                inserts.append((nid, topic["id"], topic["weight"] * hits))

        # 2. Country: pick the one with the most keyword hits.
        # simple word matching. can be improved with regex word boundaries
        best_country = None
        max_matches = 0
        for country in countries:
            matches = sum(1 for kw in country["keywords"] if kw in text_lower)
            if matches > max_matches:
                max_matches = matches
                best_country = country["id"]

        if best_country:
            country_updates.append((best_country, nid))

        processed_ids.append(nid)

    with conn.cursor() as cur:
        # Insert relations
        if inserts:
            execute_values(cur, """
                INSERT INTO news_topics (noticia_id, topic_id, score)
                VALUES %s
                ON CONFLICT (noticia_id, topic_id) DO UPDATE SET score = EXCLUDED.score
            """, inserts)

        # Update Countries
        if country_updates:
            execute_values(cur, """
                UPDATE noticias AS n
                SET pais_id = v.pais_id
                FROM (VALUES %s) AS v(pais_id, noticia_id)
                WHERE n.id = v.noticia_id
            """, country_updates)

        # Mark processed
        cur.execute("""
            UPDATE noticias
            SET topics_processed = TRUE
            WHERE id = ANY(%s)
        """, (processed_ids,))

    conn.commit()
    return len(news_items)
def initialize_schema(conn):
    """Ensure the topics tables and the topics_processed column exist."""
    log.info("Checking/Initializing schema...")
    ddl = """
            CREATE TABLE IF NOT EXISTS topics (
                id SERIAL PRIMARY KEY,
                slug VARCHAR(50) UNIQUE NOT NULL,
                name VARCHAR(100) NOT NULL,
                weight INTEGER DEFAULT 1,
                keywords TEXT,
                group_name VARCHAR(50)
            );
            CREATE TABLE IF NOT EXISTS news_topics (
                noticia_id VARCHAR(32) REFERENCES noticias(id) ON DELETE CASCADE,
                topic_id INTEGER REFERENCES topics(id) ON DELETE CASCADE,
                score INTEGER DEFAULT 0,
                created_at TIMESTAMP DEFAULT NOW(),
                PRIMARY KEY (noticia_id, topic_id)
            );
            ALTER TABLE noticias ADD COLUMN IF NOT EXISTS topics_processed BOOLEAN DEFAULT FALSE;
        """
    with conn.cursor() as cur:
        cur.execute(ddl)
    conn.commit()
    log.info("Schema OK.")
def main():
    """Run schema init once, then match topics/countries in a polling loop."""
    log.info("Starting topics_worker...")

    # Run migrations once at startup
    try:
        with get_conn() as conn:
            initialize_schema(conn)
    except Exception as e:
        log.error(f"Error during schema initialization: {e}")
        # We might want to exit here if the schema is crucial
        # sys.exit(1)

    while True:
        try:
            with get_conn() as conn:
                topics = load_topics(conn)
                if not topics:
                    log.warning("No topics found in DB. Sleeping.")
                    time.sleep(SLEEP_IDLE)
                    continue

                # Load countries
                countries = load_countries(conn)
                count = process_batch(conn, topics, countries)

                # Idle briefly when the queue looks drained.
                if count < BATCH_SIZE:
                    time.sleep(SLEEP_IDLE)
                else:
                    log.info(f"Processed {count} items.")
        except Exception as e:
            log.exception("Error in topics_worker")
            time.sleep(SLEEP_IDLE)


if __name__ == "__main__":
    main()
||||
105
workers/translation_scheduler.py
Normal file
105
workers/translation_scheduler.py
Normal file
|
|
@ -0,0 +1,105 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Translation Scheduler Worker
|
||||
Creates translation jobs for news that need to be translated.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import logging
|
||||
from datetime import datetime
|
||||
|
||||
import psycopg2
|
||||
from psycopg2.extras import RealDictCursor
|
||||
from langdetect import detect, LangDetectException
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format='[%(asctime)s] %(levelname)s - %(message)s',
|
||||
datefmt='%Y-%m-%d %H:%M:%S'
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
DB_CONFIG = {
|
||||
'host': os.getenv('DB_HOST', 'db'),
|
||||
'port': int(os.getenv('DB_PORT', 5432)),
|
||||
'database': os.getenv('DB_NAME', 'rss'),
|
||||
'user': os.getenv('DB_USER', 'rss'),
|
||||
'password': os.getenv('DB_PASS', 'rss')
|
||||
}
|
||||
|
||||
TARGET_LANGS = os.getenv('TARGET_LANGS', 'es').split(',')
|
||||
BATCH_SIZE = int(os.getenv('SCHEDULER_BATCH', '2000'))
|
||||
SLEEP_INTERVAL = int(os.getenv('SCHEDULER_SLEEP', '30'))
|
||||
|
||||
# Common source languages to try
|
||||
SOURCE_LANGS = ['en', 'fr', 'pt', 'de', 'it', 'ru', 'zh', 'ja', 'ar', 'nl', 'pl', 'sv']
|
||||
|
||||
def get_db_connection():
|
||||
return psycopg2.connect(**DB_CONFIG)
|
||||
|
||||
def create_translation_jobs(conn):
|
||||
"""Create translation jobs for news without translations.
|
||||
Relies on langdetect_worker to have set the 'lang' column.
|
||||
"""
|
||||
created = 0
|
||||
|
||||
with conn.cursor(cursor_factory=RealDictCursor) as cur:
|
||||
for lang in TARGET_LANGS:
|
||||
lang = lang.strip()
|
||||
if not lang:
|
||||
continue
|
||||
|
||||
# Insert translation jobs for news that have a detected language
|
||||
# but don't have a translation record for the target language.
|
||||
cur.execute("""
|
||||
INSERT INTO traducciones (noticia_id, lang_from, lang_to, status, created_at)
|
||||
SELECT n.id, n.lang, %s, 'pending', NOW()
|
||||
FROM noticias n
|
||||
WHERE n.lang IS NOT NULL
|
||||
AND TRIM(n.lang) != ''
|
||||
AND n.lang != %s
|
||||
AND NOT EXISTS (
|
||||
SELECT 1 FROM traducciones t
|
||||
WHERE t.noticia_id = n.id AND t.lang_to = %s
|
||||
)
|
||||
ORDER BY n.fecha DESC
|
||||
LIMIT %s
|
||||
ON CONFLICT (noticia_id, lang_to) DO NOTHING
|
||||
RETURNING noticia_id
|
||||
""", (lang, lang, lang, BATCH_SIZE))
|
||||
|
||||
rows = cur.fetchall()
|
||||
if rows:
|
||||
created += len(rows)
|
||||
logger.info(f"Created {len(rows)} translation jobs for {lang}")
|
||||
|
||||
conn.commit()
|
||||
|
||||
return created
|
||||
|
||||
def process_translations():
|
||||
logger.info("Starting translation scheduler loop...")
|
||||
|
||||
while True:
|
||||
try:
|
||||
conn = get_db_connection()
|
||||
created = create_translation_jobs(conn)
|
||||
conn.close()
|
||||
|
||||
if created == 0:
|
||||
logger.info(f"No new news to schedule. Sleeping {SLEEP_INTERVAL}s...")
|
||||
time.sleep(SLEEP_INTERVAL)
|
||||
else:
|
||||
logger.info(f"Total jobs created in this cycle: {created}")
|
||||
# Short sleep to avoid hammer but keep momentum
|
||||
time.sleep(5)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Scheduler error: {e}")
|
||||
time.sleep(10)
|
||||
|
||||
if __name__ == '__main__':
|
||||
logger.info("Translation scheduler started")
|
||||
process_translations()
|
||||
|
|
@ -7,19 +7,15 @@ from typing import List, Optional
|
|||
import psycopg2
|
||||
import psycopg2.extras
|
||||
from psycopg2.extras import execute_values
|
||||
|
||||
import ctranslate2
|
||||
from transformers import AutoTokenizer
|
||||
from langdetect import detect, DetectorFactory
|
||||
import torch
|
||||
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
|
||||
|
||||
DetectorFactory.seed = 0
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s: %(message)s")
|
||||
LOG = logging.getLogger("translator")
|
||||
|
||||
# =========================
|
||||
# DB CONFIG
|
||||
# =========================
|
||||
DB_CONFIG = {
|
||||
"host": os.environ.get("DB_HOST", "localhost"),
|
||||
"port": int(os.environ.get("DB_PORT", 5432)),
|
||||
|
|
@ -28,9 +24,6 @@ DB_CONFIG = {
|
|||
"password": os.environ.get("DB_PASS", "x"),
|
||||
}
|
||||
|
||||
# =========================
|
||||
# ENV HELPERS
|
||||
# =========================
|
||||
def _env_list(name: str, default="es"):
|
||||
raw = os.environ.get(name)
|
||||
if raw:
|
||||
|
|
@ -55,37 +48,20 @@ def _env_str(name: str, default=None):
|
|||
v = os.environ.get(name)
|
||||
return v if v else default
|
||||
|
||||
# =========================
|
||||
# CONFIG
|
||||
# =========================
|
||||
TARGET_LANGS = _env_list("TARGET_LANGS") # por defecto ["es"]
|
||||
TARGET_LANGS = _env_list("TARGET_LANGS")
|
||||
BATCH_SIZE = _env_int("TRANSLATOR_BATCH", 8)
|
||||
ENQUEUE_MAX = _env_int("ENQUEUE", 200)
|
||||
SLEEP_IDLE = _env_float("TRANSLATOR_SLEEP_IDLE", 5.0)
|
||||
|
||||
# CTranslate2 Configuration
|
||||
CT2_MODEL_PATH = _env_str("CT2_MODEL_PATH", "./models/nllb-ct2")
|
||||
CT2_DEVICE = _env_str("CT2_DEVICE", "auto") # auto, cpu, cuda
|
||||
CT2_COMPUTE_TYPE = _env_str("CT2_COMPUTE_TYPE", "auto") # auto, int8, float16, int8_float16
|
||||
|
||||
MAX_SRC_TOKENS = _env_int("MAX_SRC_TOKENS", 512)
|
||||
MAX_NEW_TOKENS_TITLE = _env_int("MAX_NEW_TOKENS_TITLE", 96)
|
||||
MAX_NEW_TOKENS_BODY = _env_int("MAX_NEW_TOKENS_BODY", 512)
|
||||
|
||||
NUM_BEAMS_TITLE = _env_int("NUM_BEAMS_TITLE", 2)
|
||||
NUM_BEAMS_BODY = _env_int("NUM_BEAMS_BODY", 2)
|
||||
|
||||
# HuggingFace model name (used for tokenizer)
|
||||
UNIVERSAL_MODEL = _env_str("UNIVERSAL_MODEL", "facebook/nllb-200-distilled-600M")
|
||||
IDENTITY_PAISES_ES = _env_int("IDENTITY_PAISES_ES", 58)
|
||||
|
||||
BODY_CHARS_CHUNK = _env_int("BODY_CHARS_CHUNK", 900)
|
||||
|
||||
# =========================
|
||||
# LANG MAP
|
||||
# =========================
|
||||
NLLB_LANG = {
|
||||
"es": "spa_Latn", "en": "eng_Latn", "fr": "fra_Latn", "de": "deu_Latn",
|
||||
LANG_CODE_MAP = {
|
||||
"en": "eng_Latn", "es": "spa_Latn", "fr": "fra_Latn", "de": "deu_Latn",
|
||||
"it": "ita_Latn", "pt": "por_Latn", "nl": "nld_Latn", "sv": "swe_Latn",
|
||||
"da": "dan_Latn", "fi": "fin_Latn", "no": "nob_Latn",
|
||||
"pl": "pol_Latn", "cs": "ces_Latn", "sk": "slk_Latn",
|
||||
|
|
@ -96,286 +72,74 @@ NLLB_LANG = {
|
|||
"ko": "kor_Hang", "vi": "vie_Latn",
|
||||
}
|
||||
|
||||
def map_to_nllb(code: Optional[str]):
|
||||
if not code:
|
||||
return None
|
||||
c = code.strip().lower()
|
||||
return NLLB_LANG.get(c, f"{c}_Latn")
|
||||
_tokenizer = None
|
||||
_translator = None
|
||||
_device = None
|
||||
|
||||
def normalize_lang(code: Optional[str], default=None):
|
||||
return (code or default).strip().lower() if code else default
|
||||
|
||||
def _norm(s: str) -> str:
|
||||
return re.sub(r"\W+", "", (s or "").lower()).strip()
|
||||
|
||||
def _is_repetitive_output(text: str, threshold: float = 0.25) -> bool:
|
||||
"""Detect if translation output is repetitive/low quality.
|
||||
def get_translator_components():
|
||||
global _tokenizer, _translator, _device
|
||||
|
||||
Args:
|
||||
text: The translated text to check
|
||||
threshold: Minimum unique word ratio (default 0.25 = 25% unique words)
|
||||
if _translator:
|
||||
return _tokenizer, _translator
|
||||
|
||||
Returns:
|
||||
True if text appears to be repetitive/low quality
|
||||
"""
|
||||
if not text or len(text) < 50:
|
||||
return False
|
||||
device = 0 if torch.cuda.is_available() else -1
|
||||
LOG.info(f"Loading model {UNIVERSAL_MODEL} on {'cuda' if device == 0 else 'cpu'}")
|
||||
|
||||
# Check for obvious repetitive patterns
|
||||
repetitive_patterns = [
|
||||
r'(\b\w+\b)( \1){3,}', # Same word repeated 4+ times
|
||||
r'(\b\w+ \w+\b)( \1){2,}', # Same 2-word phrase repeated 3+ times
|
||||
r'de la la ',
|
||||
r'la línea de la línea',
|
||||
r'de Internet de Internet',
|
||||
]
|
||||
_tokenizer = AutoTokenizer.from_pretrained(UNIVERSAL_MODEL, src_lang="eng_Latn")
|
||||
model = AutoModelForSeq2SeqLM.from_pretrained(UNIVERSAL_MODEL)
|
||||
|
||||
for pattern in repetitive_patterns:
|
||||
if re.search(pattern, text, re.IGNORECASE):
|
||||
LOG.warning(f"Detected repetitive pattern: {pattern}")
|
||||
return True
|
||||
if device == 0:
|
||||
model = model.to("cuda")
|
||||
|
||||
# Check word diversity
|
||||
words = text.lower().split()
|
||||
if len(words) < 10:
|
||||
return False
|
||||
|
||||
unique_ratio = len(set(words)) / len(words)
|
||||
if unique_ratio < threshold:
|
||||
LOG.warning(f"Low word diversity: {unique_ratio:.2%} (threshold: {threshold:.2%})")
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
# =========================
|
||||
# DB
|
||||
# =========================
|
||||
def get_conn():
|
||||
return psycopg2.connect(**DB_CONFIG)
|
||||
|
||||
def ensure_indexes(conn):
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("CREATE INDEX IF NOT EXISTS traducciones_lang_to_status_idx ON traducciones (lang_to, status);")
|
||||
cur.execute("CREATE INDEX IF NOT EXISTS traducciones_status_idx ON traducciones (status);")
|
||||
conn.commit()
|
||||
|
||||
pass # Moved to translation_ops.py
|
||||
|
||||
pass # Moved to translation_ops.py
|
||||
|
||||
def fetch_pending_batch(conn, lang_to: str, batch: int):
|
||||
"""Fetch pending translations with row locking to support multiple workers."""
|
||||
if batch <= 0:
|
||||
return []
|
||||
|
||||
# Use FOR UPDATE SKIP LOCKED to allow multiple workers
|
||||
# Each worker will get different rows without conflicts
|
||||
with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
|
||||
cur.execute(
|
||||
"""
|
||||
SELECT t.id AS tr_id, t.noticia_id, t.lang_from, t.lang_to,
|
||||
n.titulo, n.resumen
|
||||
FROM traducciones t
|
||||
JOIN noticias n ON n.id=t.noticia_id
|
||||
WHERE t.lang_to=%s AND t.status='pending'
|
||||
ORDER BY t.id
|
||||
LIMIT %s
|
||||
FOR UPDATE OF t SKIP LOCKED;
|
||||
""",
|
||||
(lang_to, batch),
|
||||
)
|
||||
rows = cur.fetchall()
|
||||
|
||||
# Update status within the same transaction while rows are locked
|
||||
if rows:
|
||||
ids = [r["tr_id"] for r in rows]
|
||||
cur.execute("UPDATE traducciones SET status='processing' WHERE id = ANY(%s)", (ids,))
|
||||
|
||||
conn.commit()
|
||||
return rows
|
||||
|
||||
# =========================
|
||||
# LANGUAGE DETECTION
|
||||
# =========================
|
||||
def detect_lang(text1: str, text2: str):
|
||||
txt = (text1 or "").strip() or (text2 or "").strip()
|
||||
if not txt:
|
||||
return None
|
||||
try:
|
||||
return detect(txt)
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
# =========================
|
||||
# MODEL LOADING (CTranslate2)
|
||||
# =========================
|
||||
_TOKENIZER = None
|
||||
_TRANSLATOR = None
|
||||
_DEVICE = None
|
||||
|
||||
def _resolve_device():
|
||||
if CT2_DEVICE == "cpu":
|
||||
return "cpu"
|
||||
if CT2_DEVICE == "cuda":
|
||||
return "cuda"
|
||||
# auto
|
||||
return "cuda" if ctranslate2.get_cuda_device_count() > 0 else "cpu"
|
||||
|
||||
def _ensure_ct2_model():
|
||||
"""Convert HuggingFace model to CTranslate2 format if not exists."""
|
||||
import os
|
||||
import subprocess
|
||||
|
||||
model_dir = CT2_MODEL_PATH
|
||||
|
||||
# Check if model already exists
|
||||
if os.path.isdir(model_dir) and os.path.exists(os.path.join(model_dir, "model.bin")):
|
||||
LOG.info("CTranslate2 model already exists at %s", model_dir)
|
||||
return True
|
||||
|
||||
LOG.info("CTranslate2 model not found, converting from %s...", UNIVERSAL_MODEL)
|
||||
LOG.info("This may take 5-10 minutes on first run...")
|
||||
|
||||
# Create directory if needed
|
||||
os.makedirs(os.path.dirname(model_dir) or ".", exist_ok=True)
|
||||
|
||||
# Convert the model
|
||||
quantization = CT2_COMPUTE_TYPE if CT2_COMPUTE_TYPE != "auto" else "int8_float16"
|
||||
|
||||
cmd = [
|
||||
"ct2-transformers-converter",
|
||||
"--model", UNIVERSAL_MODEL,
|
||||
"--output_dir", model_dir,
|
||||
"--quantization", quantization,
|
||||
"--force"
|
||||
]
|
||||
|
||||
try:
|
||||
LOG.info("Running: %s", " ".join(cmd))
|
||||
result = subprocess.run(cmd, capture_output=True, text=True, timeout=1800)
|
||||
|
||||
if result.returncode != 0:
|
||||
LOG.error("Model conversion failed: %s", result.stderr)
|
||||
return False
|
||||
|
||||
LOG.info("Model conversion completed successfully")
|
||||
return True
|
||||
|
||||
except subprocess.TimeoutExpired:
|
||||
LOG.error("Model conversion timed out after 30 minutes")
|
||||
return False
|
||||
except Exception as e:
|
||||
LOG.error("Model conversion error: %s", e)
|
||||
return False
|
||||
|
||||
def get_universal_components():
|
||||
global _TOKENIZER, _TRANSLATOR, _DEVICE
|
||||
if _TRANSLATOR:
|
||||
return _TOKENIZER, _TRANSLATOR
|
||||
|
||||
# Ensure CT2 model exists (convert if needed)
|
||||
if not _ensure_ct2_model():
|
||||
raise RuntimeError(f"Failed to load/convert CTranslate2 model at {CT2_MODEL_PATH}")
|
||||
|
||||
device = _resolve_device()
|
||||
|
||||
LOG.info("Loading CTranslate2 model from %s on %s", CT2_MODEL_PATH, device)
|
||||
|
||||
_TRANSLATOR = ctranslate2.Translator(
|
||||
CT2_MODEL_PATH,
|
||||
_translator = pipeline(
|
||||
"translation",
|
||||
model=model,
|
||||
tokenizer=_tokenizer,
|
||||
device=device,
|
||||
compute_type=CT2_COMPUTE_TYPE,
|
||||
max_length=MAX_SRC_TOKENS,
|
||||
)
|
||||
_TOKENIZER = AutoTokenizer.from_pretrained(UNIVERSAL_MODEL)
|
||||
_DEVICE = device
|
||||
|
||||
LOG.info("CTranslate2 model loaded successfully")
|
||||
return _TOKENIZER, _TRANSLATOR
|
||||
_device = "cuda" if device == 0 else "cpu"
|
||||
LOG.info(f"Model loaded on {_device}")
|
||||
|
||||
return _tokenizer, _translator
|
||||
|
||||
# =========================
|
||||
# TRANSLATION (CTranslate2)
|
||||
# =========================
|
||||
def _safe_src_len(tokenizer):
|
||||
max_len = getattr(tokenizer, "model_max_length", 1024) or 1024
|
||||
if max_len > 100000:
|
||||
max_len = 1024
|
||||
return min(MAX_SRC_TOKENS, max_len - 16)
|
||||
|
||||
def _translate_texts(src, tgt, texts, beams, max_new_tokens):
|
||||
"""Translate texts using CTranslate2."""
|
||||
def translate_texts(src: str, tgt: str, texts: List[str]) -> List[str]:
|
||||
if not texts:
|
||||
return []
|
||||
|
||||
clean = [(t or "").strip() for t in texts]
|
||||
if all(not t for t in clean):
|
||||
return ["" for _ in clean]
|
||||
|
||||
tok, translator = get_translator_components()
|
||||
|
||||
src_code = LANG_CODE_MAP.get(src, f"{src}_Latn")
|
||||
tgt_code = LANG_CODE_MAP.get(tgt, "spa_Latn")
|
||||
|
||||
results = []
|
||||
for text in clean:
|
||||
if not text:
|
||||
results.append("")
|
||||
continue
|
||||
try:
|
||||
result = translator(text, src_lang=src_code, tgt_lang=tgt_code)
|
||||
results.append(result[0]["translation_text"])
|
||||
except Exception as e:
|
||||
LOG.warning(f"Translation error: {e}")
|
||||
results.append(text)
|
||||
|
||||
return results
|
||||
|
||||
tok, translator = get_universal_components()
|
||||
src_code = map_to_nllb(src)
|
||||
tgt_code = map_to_nllb(tgt)
|
||||
|
||||
# Set source language on tokenizer
|
||||
try:
|
||||
tok.src_lang = src_code
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
safe_len = _safe_src_len(tok)
|
||||
max_new = max(16, min(int(max_new_tokens), MAX_NEW_TOKENS_BODY))
|
||||
|
||||
# Tokenize: convert text to tokens
|
||||
sources = []
|
||||
for t in clean:
|
||||
if t:
|
||||
ids = tok.encode(t, truncation=True, max_length=safe_len)
|
||||
tokens = tok.convert_ids_to_tokens(ids)
|
||||
sources.append(tokens)
|
||||
else:
|
||||
sources.append([])
|
||||
|
||||
# Target language prefix for NLLB
|
||||
target_prefix = [[tgt_code]] * len(sources)
|
||||
|
||||
# Translate with CTranslate2
|
||||
start = time.time()
|
||||
results = translator.translate_batch(
|
||||
sources,
|
||||
target_prefix=target_prefix,
|
||||
beam_size=beams,
|
||||
max_decoding_length=max_new,
|
||||
repetition_penalty=2.5, # Increased from 1.2 to prevent loops
|
||||
no_repeat_ngram_size=3, # Prevent 3-gram repetition
|
||||
)
|
||||
dt = time.time() - start
|
||||
|
||||
# Decode results
|
||||
translated = []
|
||||
total_tokens = 0
|
||||
for result, src_tokens in zip(results, sources):
|
||||
if result.hypotheses:
|
||||
# Skip the first token (language prefix)
|
||||
tokens = result.hypotheses[0][1:]
|
||||
total_tokens += len(tokens) + len(src_tokens)
|
||||
text = tok.decode(tok.convert_tokens_to_ids(tokens))
|
||||
translated.append(text.strip())
|
||||
else:
|
||||
translated.append("")
|
||||
|
||||
if total_tokens > 0:
|
||||
LOG.info(" → tokens=%d tiempo=%.2fs velocidad=%d tok/s",
|
||||
total_tokens, dt, int(total_tokens / dt) if dt > 0 else 0)
|
||||
|
||||
return translated
|
||||
|
||||
def _split_body_into_chunks(text: str) -> List[str]:
|
||||
def split_body_into_chunks(text: str) -> List[str]:
|
||||
text = (text or "").strip()
|
||||
if len(text) <= BODY_CHARS_CHUNK:
|
||||
return [text] if text else []
|
||||
|
||||
|
||||
parts = re.split(r'(\n\n+|(?<=[\.\!\?؛؟。])\s+)', text)
|
||||
chunks = []
|
||||
current = ""
|
||||
|
||||
|
||||
for part in parts:
|
||||
if not part:
|
||||
continue
|
||||
|
|
@ -387,260 +151,145 @@ def _split_body_into_chunks(text: str) -> List[str]:
|
|||
current = part
|
||||
if current.strip():
|
||||
chunks.append(current.strip())
|
||||
|
||||
if not chunks:
|
||||
return [text]
|
||||
return chunks
|
||||
|
||||
return chunks if chunks else [text]
|
||||
|
||||
def translate_body_long(src: str, tgt: str, body: str) -> str:
|
||||
body = (body or "").strip()
|
||||
if not body:
|
||||
return ""
|
||||
|
||||
chunks = _split_body_into_chunks(body)
|
||||
|
||||
chunks = split_body_into_chunks(body)
|
||||
if len(chunks) == 1:
|
||||
translated = _translate_texts(src, tgt, [body], NUM_BEAMS_BODY, MAX_NEW_TOKENS_BODY)[0]
|
||||
return translated.strip()
|
||||
|
||||
return translate_texts(src, tgt, [body])[0].strip()
|
||||
|
||||
translated_chunks = []
|
||||
for ch in chunks:
|
||||
tr = _translate_texts(src, tgt, [ch], NUM_BEAMS_BODY, MAX_NEW_TOKENS_BODY)[0]
|
||||
translated_chunks.append(tr.strip())
|
||||
return "\n\n".join(c for c in translated_chunks if c)
|
||||
tr = translate_texts(src, tgt, [ch])[0]
|
||||
translated_chunks.append(tr)
|
||||
|
||||
return " ".join(translated_chunks)
|
||||
|
||||
def normalize_lang(lang: Optional[str], default: str = "es") -> Optional[str]:
|
||||
if not lang:
|
||||
return default
|
||||
lang = lang.strip().lower()[:2]
|
||||
return lang if lang else default
|
||||
|
||||
def detect_lang(text: str) -> str:
|
||||
if not text or len(text) < 10:
|
||||
return "en"
|
||||
try:
|
||||
return detect(text)
|
||||
except Exception:
|
||||
return "en"
|
||||
|
||||
# =========================
|
||||
# BATCH PROCESS
|
||||
# =========================
|
||||
def process_batch(conn, rows):
|
||||
todo = []
|
||||
done = []
|
||||
errors = []
|
||||
|
||||
|
||||
for r in rows:
|
||||
lang_to = normalize_lang(r["lang_to"], "es") or "es"
|
||||
lang_from = (
|
||||
normalize_lang(r["lang_from"])
|
||||
or detect_lang(r["titulo"], r["resumen"])
|
||||
or "en"
|
||||
)
|
||||
|
||||
titulo = (r["titulo"] or "").strip()
|
||||
resumen = (r["resumen"] or "").strip()
|
||||
|
||||
if map_to_nllb(lang_from) == map_to_nllb(lang_to):
|
||||
done.append((titulo, resumen, lang_from, r["tr_id"]))
|
||||
else:
|
||||
todo.append({
|
||||
"tr_id": r["tr_id"],
|
||||
"lang_from": lang_from,
|
||||
"lang_to": lang_to,
|
||||
"titulo": titulo,
|
||||
"resumen": resumen,
|
||||
})
|
||||
|
||||
lang_to = normalize_lang(r.get("lang_to"), "es") or "es"
|
||||
lang_from = normalize_lang(r.get("lang_from")) or detect_lang(r.get("titulo") or "")
|
||||
|
||||
titulo = (r.get("titulo") or "").strip()
|
||||
resumen = (r.get("resumen") or "").strip()
|
||||
|
||||
if lang_from == lang_to:
|
||||
continue
|
||||
|
||||
todo.append({
|
||||
"tr_id": r.get("tr_id"),
|
||||
"lang_from": lang_from,
|
||||
"lang_to": lang_to,
|
||||
"titulo": titulo,
|
||||
"resumen": resumen,
|
||||
})
|
||||
|
||||
if not todo:
|
||||
return
|
||||
|
||||
from collections import defaultdict
|
||||
groups = defaultdict(list)
|
||||
for item in todo:
|
||||
key = (item["lang_from"], item["lang_to"])
|
||||
groups[key].append(item)
|
||||
|
||||
|
||||
for (lang_from, lang_to), items in groups.items():
|
||||
LOG.info("Translating %s -> %s (%d items)", lang_from, lang_to, len(items))
|
||||
|
||||
LOG.info(f"Translating {lang_from} -> {lang_to} ({len(items)} items)")
|
||||
|
||||
titles = [i["titulo"] for i in items]
|
||||
|
||||
try:
|
||||
tt = _translate_texts(
|
||||
lang_from,
|
||||
lang_to,
|
||||
titles,
|
||||
NUM_BEAMS_TITLE,
|
||||
MAX_NEW_TOKENS_TITLE,
|
||||
)
|
||||
|
||||
bodies_translated: List[str] = []
|
||||
for i in items:
|
||||
bodies_translated.append(
|
||||
translate_body_long(lang_from, lang_to, i["resumen"])
|
||||
)
|
||||
|
||||
for i, ttr, btr in zip(items, tt, bodies_translated):
|
||||
ttr = (ttr or "").strip()
|
||||
btr = (btr or "").strip()
|
||||
|
||||
if not ttr or _norm(ttr) == _norm(i["titulo"]):
|
||||
ttr = i["titulo"]
|
||||
if not btr or _norm(btr) == _norm(i["resumen"]):
|
||||
btr = i["resumen"]
|
||||
|
||||
# CLEANING: Remove <unk> tokens
|
||||
if ttr:
|
||||
ttr = ttr.replace("<unk>", "").replace(" ", " ").strip()
|
||||
if btr:
|
||||
btr = btr.replace("<unk>", "").replace(" ", " ").strip()
|
||||
|
||||
# VALIDATION: Check for repetitive output
|
||||
if _is_repetitive_output(ttr) or _is_repetitive_output(btr):
|
||||
LOG.warning(f"Rejecting repetitive translation for tr_id={i['tr_id']}")
|
||||
errors.append(("Repetitive output detected", i["tr_id"]))
|
||||
continue
|
||||
|
||||
done.append((ttr, btr, lang_from, i["tr_id"]))
|
||||
|
||||
except Exception as e:
|
||||
err = str(e)[:800]
|
||||
LOG.exception("Error translating %s -> %s: %s", lang_from, lang_to, err)
|
||||
for i in items:
|
||||
errors.append((err, i["tr_id"]))
|
||||
|
||||
with conn.cursor() as cur:
|
||||
if done:
|
||||
execute_values(
|
||||
cur,
|
||||
"""
|
||||
UPDATE traducciones AS t
|
||||
SET titulo_trad=v.titulo_trad,
|
||||
resumen_trad=v.resumen_trad,
|
||||
lang_from=COALESCE(t.lang_from, v.lang_from),
|
||||
status='done',
|
||||
error=NULL
|
||||
FROM (VALUES %s) AS v(titulo_trad,resumen_trad,lang_from,id)
|
||||
WHERE t.id=v.id;
|
||||
""",
|
||||
done,
|
||||
)
|
||||
|
||||
# --- NEW: Persist stats ---
|
||||
# Insert a record for each translated item into translation_stats
|
||||
# We need the language 'lang_to'. In this batch, lang_to is uniform for the group usually,
|
||||
# but let's extract it from the 'done' items structure if we had it, or pass it down.
|
||||
# In process_batch, we iterate groups.
|
||||
# 'done' list here is flattened from multiple groups?
|
||||
# process_batch logic:
|
||||
# 1. 'done' checks map_to_nllb identity (already done?) -> these have lang_to from row?
|
||||
# 2. 'groups' loop -> translates -> appends to 'done' with lang_from.
|
||||
#
|
||||
# Wait, 'done' list doesn't have lang_to in the tuple: (titulo, resumen, lang_from, tr_id).
|
||||
# We need to change the 'done' collection to include lang_to OR we insert based on tr_id.
|
||||
|
||||
# Let's verify process_batch logic.
|
||||
# rows has all info.
|
||||
# define a mapping tr_id -> lang_to
|
||||
tr_map = {r["tr_id"]: r["lang_to"] for r in rows}
|
||||
|
||||
stats_data = []
|
||||
for item in done:
|
||||
# item is (titulo, resumen, lang_from, tr_id)
|
||||
lang_from = item[2]
|
||||
lang_to = tr_map.get(item[3], "es")
|
||||
stats_data.append((lang_from, lang_to))
|
||||
|
||||
execute_values(
|
||||
cur,
|
||||
"INSERT INTO translation_stats (lang_from, lang_to) VALUES %s",
|
||||
stats_data
|
||||
)
|
||||
# --------------------------
|
||||
|
||||
if errors:
|
||||
execute_values(
|
||||
cur,
|
||||
"""
|
||||
UPDATE traducciones AS t
|
||||
SET status='error', error=v.error
|
||||
FROM (VALUES %s) AS v(error,id)
|
||||
WHERE t.id=v.id;
|
||||
""",
|
||||
errors,
|
||||
)
|
||||
|
||||
conn.commit()
|
||||
|
||||
def process_entity_summaries(conn):
|
||||
"""Translate pending entity summaries from Wikipedia."""
|
||||
from cache import cache_del
|
||||
|
||||
LOG.info("DEBUG: Checking for pending entity summaries...")
|
||||
|
||||
with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
|
||||
cur.execute("""
|
||||
SELECT id, entity_name, summary, summary_en
|
||||
FROM entity_images
|
||||
WHERE status_es = 'pending'
|
||||
LIMIT 20
|
||||
FOR UPDATE SKIP LOCKED;
|
||||
""")
|
||||
rows = cur.fetchall()
|
||||
translated_titles = translate_texts(lang_from, lang_to, titles)
|
||||
|
||||
if not rows:
|
||||
return False
|
||||
|
||||
LOG.info("DEBUG: Found %d pending entity summaries to process", len(rows))
|
||||
translated_bodies = []
|
||||
for i in items:
|
||||
body = (i["resumen"] or "").strip()
|
||||
if body:
|
||||
tr = translate_body_long(lang_from, lang_to, body)
|
||||
translated_bodies.append(tr)
|
||||
else:
|
||||
translated_bodies.append("")
|
||||
|
||||
for r in rows:
|
||||
entity_id = r["id"]
|
||||
name = r["entity_name"]
|
||||
text = r["summary_en"] or r["summary"]
|
||||
cursor = conn.cursor()
|
||||
for item, tt, tb in zip(items, translated_titles, translated_bodies):
|
||||
tt = (tt or "").strip()
|
||||
tb = (tb or "").strip()
|
||||
|
||||
if not tt:
|
||||
tt = item["titulo"]
|
||||
if not tb:
|
||||
tb = item["resumen"]
|
||||
|
||||
if not text:
|
||||
cur.execute("UPDATE entity_images SET status_es = 'none' WHERE id = %s", (entity_id,))
|
||||
continue
|
||||
|
||||
try:
|
||||
# English -> Spanish
|
||||
translated = translate_body_long('en', 'es', text)
|
||||
if translated:
|
||||
cur.execute("""
|
||||
UPDATE entity_images
|
||||
SET summary_es = %s, status_es = 'done'
|
||||
WHERE id = %s
|
||||
""", (translated, entity_id))
|
||||
# Invalidate cache
|
||||
cache_del(f"wiki:data:{name.lower()}")
|
||||
LOG.info(" → Translated entity summary: %s", name)
|
||||
else:
|
||||
cur.execute("UPDATE entity_images SET status_es = 'error' WHERE id = %s", (entity_id,))
|
||||
cursor.execute("""
|
||||
UPDATE traducciones
|
||||
SET titulo_trad = %s, resumen_trad = %s, lang_to = %s
|
||||
WHERE id = %s
|
||||
""", (tt, tb, lang_to, item["tr_id"]))
|
||||
except Exception as e:
|
||||
LOG.error("Error translating entity summary [%s]: %s", name, e)
|
||||
cur.execute("UPDATE entity_images SET status_es = 'error' WHERE id = %s", (entity_id,))
|
||||
LOG.error(f"Update error: {e}")
|
||||
|
||||
conn.commit()
|
||||
return True
|
||||
cursor.close()
|
||||
LOG.info(f"Translated {len(items)} items")
|
||||
|
||||
def fetch_pending_translations(conn):
|
||||
cursor = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
|
||||
|
||||
for lang in TARGET_LANGS:
|
||||
cursor.execute("""
|
||||
SELECT t.id as tr_id, t.lang_from, t.lang_to,
|
||||
n.titulo, n.resumen, n.id as noticia_id
|
||||
FROM traducciones t
|
||||
JOIN noticias n ON n.id = t.noticia_id
|
||||
WHERE t.lang_to = %s
|
||||
AND (t.titulo_trad IS NULL OR t.resumen_trad IS NULL)
|
||||
ORDER BY n.fecha DESC
|
||||
LIMIT %s
|
||||
""", (lang, BATCH_SIZE))
|
||||
|
||||
rows = cursor.fetchall()
|
||||
if rows:
|
||||
LOG.info(f"Found {len(rows)} pending translations for {lang}")
|
||||
process_batch(conn, rows)
|
||||
|
||||
cursor.close()
|
||||
|
||||
def connect_db():
|
||||
return psycopg2.connect(**DB_CONFIG)
|
||||
|
||||
# =========================
|
||||
# MAIN LOOP
|
||||
# =========================
|
||||
def main():
|
||||
LOG.info("Translator worker iniciado (CTranslate2)")
|
||||
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
|
||||
get_universal_components()
|
||||
|
||||
LOG.info("Translation worker started (transformers)")
|
||||
get_translator_components()
|
||||
|
||||
while True:
|
||||
any_work = False
|
||||
with get_conn() as conn:
|
||||
ensure_indexes(conn)
|
||||
|
||||
# 1. Process entity summaries (Wikipedia) -> REMOVED per user request
|
||||
# Logic moved out to keep translator focused on news ONLY.
|
||||
# try:
|
||||
# if process_entity_summaries(conn):
|
||||
# any_work = True
|
||||
# except Exception as e:
|
||||
# LOG.error("Error in process_entity_summaries: %s", e)
|
||||
|
||||
# 2. Process news translations
|
||||
for tgt in TARGET_LANGS:
|
||||
while True:
|
||||
rows = fetch_pending_batch(conn, tgt, BATCH_SIZE)
|
||||
if not rows:
|
||||
break
|
||||
any_work = True
|
||||
LOG.info("[%s] %d elementos", tgt, len(rows))
|
||||
process_batch(conn, rows)
|
||||
|
||||
if not any_work:
|
||||
time.sleep(SLEEP_IDLE)
|
||||
try:
|
||||
conn = connect_db()
|
||||
fetch_pending_translations(conn)
|
||||
conn.close()
|
||||
except Exception as e:
|
||||
LOG.error(f"Error: {e}")
|
||||
|
||||
time.sleep(30)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
|
|
|
|||
|
|
@ -1,471 +0,0 @@
|
|||
"""
|
||||
URL Feed Discovery Worker
|
||||
This worker automatically discovers RSS feeds from URLs stored in fuentes_url table
|
||||
and creates entries in the feeds table (or feeds_pending for review).
|
||||
Runs every 15 minutes.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import logging
|
||||
from datetime import datetime
|
||||
from typing import List, Dict
|
||||
|
||||
# Add parent directory to path to import modules
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
from db import get_conn
|
||||
from utils.feed_discovery import discover_feeds, get_feed_metadata
|
||||
from utils.feed_analysis import (
|
||||
analyze_feed,
|
||||
get_country_id_by_name,
|
||||
get_category_id_by_name
|
||||
)
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Configuration
|
||||
CHECK_INTERVAL = int(os.getenv('URL_DISCOVERY_INTERVAL_MIN', '15')) * 60 # Default: 15 minutes
|
||||
BATCH_SIZE = int(os.getenv('URL_DISCOVERY_BATCH_SIZE', '10')) # Process URLs in batches
|
||||
MAX_FEEDS_PER_URL = int(os.getenv('MAX_FEEDS_PER_URL', '5')) # Max feeds to create per URL
|
||||
|
||||
|
||||
def get_pending_urls(limit: int = BATCH_SIZE) -> List[Dict]:
|
||||
"""
|
||||
Get URLs that need to be processed.
|
||||
Priority: never checked > failed checks > oldest successful checks
|
||||
"""
|
||||
with get_conn() as conn:
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("""
|
||||
SELECT id, nombre, url, categoria_id, pais_id, idioma, last_status
|
||||
FROM fuentes_url
|
||||
WHERE active = TRUE
|
||||
ORDER BY
|
||||
CASE
|
||||
WHEN last_check IS NULL THEN 1 -- Never checked (highest priority)
|
||||
WHEN last_status = 'error' THEN 2 -- Failed checks
|
||||
WHEN last_status = 'no_feeds' THEN 3 -- No feeds found
|
||||
ELSE 4 -- Successful checks (lowest priority)
|
||||
END,
|
||||
last_check ASC NULLS FIRST
|
||||
LIMIT %s
|
||||
""", (limit,))
|
||||
|
||||
columns = [desc[0] for desc in cur.description]
|
||||
return [dict(zip(columns, row)) for row in cur.fetchall()]
|
||||
|
||||
|
||||
def update_url_status(url_id: int, status: str, message: str = None, http_code: int = None):
|
||||
"""Update the status of a URL source"""
|
||||
with get_conn() as conn:
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("""
|
||||
UPDATE fuentes_url
|
||||
SET last_check = NOW(),
|
||||
last_status = %s,
|
||||
status_message = %s,
|
||||
last_http_code = %s
|
||||
WHERE id = %s
|
||||
""", (status, message, http_code, url_id))
|
||||
conn.commit()
|
||||
|
||||
|
||||
def create_pending_feed(
    fuente_url_id: int,
    feed_url: str,
    metadata: Dict,
    analysis: Dict,
    categoria_id: int = None,
    pais_id: int = None,
    idioma: str = None
) -> bool:
    """
    Create (or refresh) a pending feed entry in ``feeds_pending`` for manual review.

    Country and category hints from *analysis* are resolved to IDs via the
    lookup helpers. On URL conflict the existing row's title/description and
    ``discovered_at`` are refreshed instead of inserting.

    Returns True when a row id was returned, False on error.
    """
    try:
        with get_conn() as conn:
            # Get detected country ID
            detected_country_id = None
            if analysis.get('detected_country'):
                detected_country_id = get_country_id_by_name(conn, analysis['detected_country'])

            # Get suggested category ID
            suggested_categoria_id = None
            if analysis.get('suggested_category'):
                suggested_categoria_id = get_category_id_by_name(conn, analysis['suggested_category'])

            # FIX: dict.get(key, default) returns None when the key exists with a
            # None value, and slicing None raises TypeError. Coalesce explicitly
            # before truncating/passing to the DB.
            description = (metadata.get('description') or '')[:500]
            notes = analysis.get('analysis_notes') or ''

            with conn.cursor() as cur:
                cur.execute("""
                    INSERT INTO feeds_pending (
                        fuente_url_id, feed_url, feed_title, feed_description,
                        feed_language, feed_type, entry_count,
                        detected_country_id, suggested_categoria_id,
                        categoria_id, pais_id, idioma, notes
                    )
                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                    ON CONFLICT (feed_url) DO UPDATE
                    SET feed_title = EXCLUDED.feed_title,
                        feed_description = EXCLUDED.feed_description,
                        discovered_at = NOW()
                    RETURNING id
                """, (
                    fuente_url_id,
                    feed_url,
                    metadata.get('title', 'Feed sin título'),
                    description,
                    analysis.get('language'),
                    'rss',  # Default type
                    metadata.get('entry_count', 0),
                    detected_country_id,
                    suggested_categoria_id,
                    categoria_id,
                    pais_id,
                    idioma,
                    notes
                ))

                result = cur.fetchone()
                conn.commit()

                # NOTE(review): with ON CONFLICT ... DO UPDATE + RETURNING, a row id
                # is returned for both insert and update, so the else branch below
                # is effectively unreachable — kept for behavioral compatibility.
                if result:
                    logger.info(f"Created pending feed for review: {metadata.get('title')} ({feed_url})")
                    return True
                else:
                    logger.debug(f"Pending feed updated: {feed_url}")
                    return False

    except Exception as e:
        logger.error(f"Error creating pending feed {feed_url}: {e}")
        return False
|
||||
|
||||
|
||||
def create_feed_from_metadata(
    feed_url: str,
    fuente_url_id: int = None,
    categoria_id: int = None,
    pais_id: int = None,
    idioma: str = None,
    auto_approve: bool = False,
    context_title: str = None
) -> Dict:
    """
    Create a feed entry from discovered feed URL with intelligent analysis.

    Fetches metadata for *feed_url*, runs analyze_feed() over it, and either
    inserts the feed directly into ``feeds`` (when auto_approve is True and
    both categoria_id and pais_id are present) or routes it to the
    ``feeds_pending`` review queue via create_pending_feed().

    Returns:
        {
            'created': True/False,
            'pending': True/False,
            'status': 'created'/'pending'/'existing'/'error',
            'message': 'Description'
        }
    """
    # Default result: pessimistic 'error' until a branch overwrites it.
    result = {
        'created': False,
        'pending': False,
        'status': 'error',
        'message': ''
    }

    try:
        # Get feed metadata (network fetch with a 10s timeout)
        metadata = get_feed_metadata(feed_url, timeout=10)

        if not metadata:
            result['message'] = 'No se pudo obtener metadata del feed'
            logger.warning(f"{result['message']}: {feed_url}")
            return result

        # Add URL to metadata for analysis
        metadata['url'] = feed_url

        # Use context title if provided, otherwise use metadata title
        # This helps when feed XML title is generic (e.g. "RSS Feed") but site link had meaningful text
        feed_title = context_title if context_title else metadata.get('title', 'Feed sin título')
        # Update metadata for consistency in pending feeds AND analysis
        metadata['title'] = feed_title

        # Perform intelligent analysis (language/country/category hints)
        analysis = analyze_feed(metadata)

        # Determine if we need manual review
        needs_review = False

        # If parent URL has no category or country, we need review
        if not categoria_id or not pais_id:
            needs_review = True
            logger.info(f"Feed needs review (missing categoria_id={categoria_id} or pais_id={pais_id})")

        # If auto_approve is disabled, we need review
        if not auto_approve:
            needs_review = True

        # Enhance metadata with analysis: fall back to the detected language
        # only when the caller did not supply one.
        if not idioma and analysis.get('language'):
            idioma = analysis['language']

        # If needs review, create pending feed and return early — the direct
        # insert below never runs in that case.
        if needs_review:
            created_pending = create_pending_feed(
                fuente_url_id=fuente_url_id,
                feed_url=feed_url,
                metadata=metadata,
                analysis=analysis,
                categoria_id=categoria_id,
                pais_id=pais_id,
                idioma=idioma
            )

            result['pending'] = created_pending
            result['status'] = 'pending'
            result['message'] = f"Feed creado y pendiente de revisión (país: {analysis.get('detected_country', 'N/A')}, categoría sugerida: {analysis.get('suggested_category', 'N/A')})"
            return result

        # Otherwise, create feed directly
        nombre = feed_title
        descripcion = metadata.get('description', '')

        with get_conn() as conn:
            with conn.cursor() as cur:
                # ON CONFLICT DO NOTHING + RETURNING: fetchone() is None when the
                # URL already existed, which drives the 'existing' branch below.
                cur.execute("""
                    INSERT INTO feeds (nombre, descripcion, url, categoria_id, pais_id, idioma, fuente_url_id, activo)
                    VALUES (%s, %s, %s, %s, %s, %s, %s, TRUE)
                    ON CONFLICT (url) DO NOTHING
                    RETURNING id
                """, (
                    nombre,
                    descripcion[:500] if descripcion else None,
                    feed_url,
                    categoria_id,
                    pais_id,
                    idioma,
                    fuente_url_id
                ))

                feed_result = cur.fetchone()
                conn.commit()

                if feed_result:
                    logger.info(f"Created new feed: {nombre} ({feed_url})")
                    result['created'] = True
                    result['status'] = 'created'
                    result['message'] = f"Feed creado exitosamente"
                else:
                    logger.debug(f"Feed already exists: {feed_url}")
                    result['status'] = 'existing'
                    result['message'] = 'El feed ya existe'

    except Exception as e:
        logger.error(f"Error creating feed from {feed_url}: {e}")
        result['message'] = str(e)
        result['status'] = 'error'

    return result
|
||||
|
||||
|
||||
def process_url_source(url_data: Dict) -> Dict:
    """
    Process a single URL source to discover and create feeds.

    Discovers candidate feeds from the source URL, creates each one via
    create_feed_from_metadata() (auto-approving only when the parent row has
    both category and country), and records the outcome on the parent
    ``fuentes_url`` row.

    Returns statistics about the operation:
    {'url_id', 'url', 'discovered', 'created', 'pending', 'existing',
     'errors', 'status'}.
    """
    url_id = url_data['id']
    source_url = url_data['url']
    nombre = url_data['nombre']
    categoria_id = url_data['categoria_id']
    pais_id = url_data['pais_id']
    idioma = url_data['idioma']

    logger.info(f"Processing URL source: {nombre} ({source_url})")
    logger.info(f"  Parent metadata: categoria_id={categoria_id}, pais_id={pais_id}, idioma={idioma}")

    # Counters for this source; 'status' is filled in at the end of each path.
    stats = {
        'url_id': url_id,
        'url': source_url,
        'discovered': 0,
        'created': 0,
        'pending': 0,
        'existing': 0,
        'errors': 0,
        'status': 'unknown'
    }

    try:
        # Discover feeds from URL (network fetch, 15s timeout)
        discovered = discover_feeds(source_url, timeout=15)
        stats['discovered'] = len(discovered)

        if not discovered:
            logger.warning(f"No feeds discovered from: {source_url}")
            update_url_status(url_id, 'no_feeds', 'No se encontraron feeds RSS', 200)
            stats['status'] = 'no_feeds'
            return stats

        # Filter only valid feeds (discover_feeds marks each with 'valid')
        valid_feeds = [f for f in discovered if f.get('valid', False)]

        if not valid_feeds:
            logger.warning(f"No valid feeds found for: {source_url}")
            update_url_status(url_id, 'no_valid_feeds', f'Se encontraron {len(discovered)} feeds pero ninguno válido')
            stats['status'] = 'no_valid_feeds'
            return stats

        # Limit number of feeds per URL
        feeds_to_create = valid_feeds[:MAX_FEEDS_PER_URL]

        logger.info(f"Found {len(valid_feeds)} valid feeds, processing up to {len(feeds_to_create)}")

        # Determine if auto-approve (parent has category AND country)
        auto_approve = bool(categoria_id and pais_id)

        if not auto_approve:
            logger.info("→ Feeds will require manual review (parent lacks category or country)")
        else:
            logger.info("→ Feeds will be auto-approved (parent has complete metadata)")

        # Create feeds — each one independently; a failure on one feed does
        # not abort the rest of the batch for this source.
        for feed_info in feeds_to_create:
            feed_url = feed_info['url']

            try:
                result = create_feed_from_metadata(
                    feed_url=feed_url,
                    fuente_url_id=url_id,
                    categoria_id=categoria_id,
                    pais_id=pais_id,
                    idioma=idioma,
                    auto_approve=auto_approve,
                    context_title=feed_info.get('context_label')
                )

                if result['status'] == 'created':
                    stats['created'] += 1
                elif result['status'] == 'pending':
                    stats['pending'] += 1
                elif result['status'] == 'existing':
                    stats['existing'] += 1
                else:
                    stats['errors'] += 1

            except Exception as e:
                logger.error(f"Error creating feed {feed_url}: {e}")
                stats['errors'] += 1

        # Update URL status: success if anything new was created or queued,
        # 'existing' if everything was already known, 'error' otherwise.
        if stats['created'] > 0 or stats['pending'] > 0:
            parts = []
            if stats['created'] > 0:
                parts.append(f"{stats['created']} creados")
            if stats['pending'] > 0:
                parts.append(f"{stats['pending']} pendientes de revisión")
            if stats['existing'] > 0:
                parts.append(f"{stats['existing']} ya existían")

            message = ", ".join(parts)
            update_url_status(url_id, 'success', message, 200)
            stats['status'] = 'success'
        elif stats['existing'] > 0:
            message = f"Todos los {stats['existing']} feeds ya existían"
            update_url_status(url_id, 'existing', message, 200)
            stats['status'] = 'existing'
        else:
            message = f"No se pudieron procesar feeds ({stats['errors']} errores)"
            update_url_status(url_id, 'error', message)
            stats['status'] = 'error'

        logger.info(f"Processed {source_url}: {stats['created']} created, {stats['pending']} pending, {stats['existing']} existing, {stats['errors']} errors")

    except Exception as e:
        logger.error(f"Error processing URL {source_url}: {e}")
        update_url_status(url_id, 'error', str(e)[:200])
        stats['status'] = 'error'
        stats['errors'] += 1

    return stats
|
||||
|
||||
|
||||
def process_batch():
    """Run one discovery cycle: fetch a batch of pending URL sources,
    process each, and log aggregate statistics."""
    logger.info("=" * 80)
    logger.info(f"Starting URL discovery batch - {datetime.now().isoformat()}")

    pending = get_pending_urls(limit=BATCH_SIZE)
    if not pending:
        logger.info("No pending URLs to process")
        return

    logger.info(f"Processing {len(pending)} URL sources")

    # Aggregate counters across all sources in this batch.
    counter_keys = ('discovered', 'created', 'pending', 'existing', 'errors')
    totals = dict.fromkeys(('processed',) + counter_keys, 0)

    for url_data in pending:
        source_stats = process_url_source(url_data)
        totals['processed'] += 1
        for key in counter_keys:
            totals[key] += source_stats[key]
        # Small delay between URLs to avoid hammering servers
        time.sleep(2)

    # Summary banner
    logger.info("-" * 80)
    logger.info(f"Batch complete:")
    logger.info(f"  - Processed: {totals['processed']} URLs")
    logger.info(f"  - Discovered: {totals['discovered']} feeds")
    logger.info(f"  - Created: {totals['created']} new feeds")
    logger.info(f"  - Pending review: {totals['pending']} feeds")
    logger.info(f"  - Already existing: {totals['existing']} feeds")
    logger.info(f"  - Errors: {totals['errors']}")
    logger.info("=" * 80)
|
||||
|
||||
|
||||
def main():
    """Worker entry point: run one batch immediately, then loop forever on a
    fixed interval. Ctrl-C stops cleanly; other errors back off 60s."""
    logger.info("URL Feed Discovery Worker started")
    logger.info(f"Check interval: {CHECK_INTERVAL} seconds ({CHECK_INTERVAL // 60} minutes)")
    logger.info(f"Batch size: {BATCH_SIZE}")
    logger.info(f"Max feeds per URL: {MAX_FEEDS_PER_URL}")

    # Run immediately on start so a fresh deploy doesn't wait a full interval.
    try:
        process_batch()
    except Exception as exc:
        logger.error(f"Error in initial batch: {exc}", exc_info=True)

    # Main loop
    while True:
        try:
            logger.info(f"Waiting {CHECK_INTERVAL} seconds until next batch...")
            time.sleep(CHECK_INTERVAL)
            process_batch()
        except KeyboardInterrupt:
            logger.info("Worker stopped by user")
            break
        except Exception as exc:
            logger.error(f"Error in main loop: {exc}", exc_info=True)
            # Wait a bit before retrying to avoid rapid failure loops
            time.sleep(60)


if __name__ == "__main__":
    main()
|
||||
|
|
@ -1,125 +0,0 @@
|
|||
import logging
|
||||
import hashlib
|
||||
from datetime import datetime
|
||||
from newspaper import Article, ArticleException, Config
|
||||
import requests
|
||||
from db import get_write_conn, get_read_conn
|
||||
|
||||
# Configuration
|
||||
logger = logging.getLogger("url_worker")
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
||||
|
||||
def get_active_urls():
    """Return all active URL sources as raw cursor tuples:
    (id, nombre, url, categoria_id, pais_id, idioma)."""
    query = """
                SELECT id, nombre, url, categoria_id, pais_id, idioma
                FROM fuentes_url
                WHERE active = true
            """
    with get_read_conn() as conn, conn.cursor() as cur:
        cur.execute(query)
        rows = cur.fetchall()
    return rows
|
||||
|
||||
def update_source_status(source_id, status, message, http_code=0):
    """Record the result of processing a URL source: timestamp, status code,
    human-readable message, and the HTTP status observed (0 = no response)."""
    sql = """
                UPDATE fuentes_url
                SET last_check = NOW(),
                    last_status = %s,
                    status_message = %s,
                    last_http_code = %s
                WHERE id = %s
            """
    with get_write_conn() as conn, conn.cursor() as cur:
        cur.execute(sql, (status, message, http_code, source_id))
        conn.commit()
|
||||
|
||||
def save_article(source, article):
    """Save the extracted article to the database.

    The row id is the MD5 of the final URL, which deduplicates articles.
    Returns True only when a new row was actually inserted, False when the
    article already existed (pre-check or concurrent insert).
    """
    source_id, source_name, source_url, cat_id, pais_id, lang = source

    # Use the article url if possible, otherwise source_url
    final_url = article.url or source_url
    noticia_id = hashlib.md5(final_url.encode("utf-8")).hexdigest()

    with get_write_conn() as conn:
        with conn.cursor() as cur:
            # Check if exists (cheap fast-path before the insert)
            cur.execute("SELECT id FROM noticias WHERE id = %s", (noticia_id,))
            if cur.fetchone():
                return False  # Already exists

            # Prepare data
            title = article.title or "Sin título"
            # FIX: newspaper can leave article.text as None when parsing
            # partially fails; slicing None raises TypeError.
            summary = article.summary or (article.text or "")[:500]
            image_url = article.top_image
            pub_date = article.publish_date or datetime.utcnow()

            cur.execute("""
                INSERT INTO noticias (
                    id, titulo, resumen, url, fecha, imagen_url,
                    fuente_nombre, categoria_id, pais_id
                ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
                ON CONFLICT (id) DO NOTHING
            """, (
                noticia_id, title, summary, final_url, pub_date, image_url,
                source_name, cat_id, pais_id
            ))
            # FIX: previously this returned True unconditionally, even when a
            # concurrent insert made ON CONFLICT skip the row. rowcount is 0
            # in that case.
            inserted = cur.rowcount > 0
            conn.commit()
            return inserted
|
||||
|
||||
def process_url(source):
    """Download, parse, and persist a single URL source as a news article.

    *source* is a (id, nombre, url, categoria_id, pais_id, idioma) tuple.
    Every outcome (success or any failure mode) is written back to the
    source row via update_source_status().
    """
    source_id, name, url, _, _, _ = source

    logger.info(f"Processing URL: {url} ({name})")

    try:
        # Browser-like headers so sites that block default UAs still respond
        user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        config = Config()
        config.browser_user_agent = user_agent
        config.request_timeout = 30

        article = Article(url, config=config, language='es')
        article.download()

        if not article.html:
            update_source_status(source_id, "ERROR", "No content downloaded (Empty HTML)", 0)
            return

        article.parse()
        try:
            # NLP (summary/keywords) is best-effort only.
            article.nlp()
        except Exception:
            # FIX: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit.
            pass

        if not article.title:
            update_source_status(source_id, "ERROR_PARSE", "Could not extract title (Page might be not an article)", 200)
            return

        saved = save_article(source, article)

        status_msg = "News created successfully" if saved else "News already exists"
        update_source_status(source_id, "OK", status_msg, 200)
        logger.info(f"Success {url}: {status_msg}")

    except ArticleException as article_err:
        logger.error(f"Newspaper Error {url}: {article_err}")
        update_source_status(source_id, "ERROR_DOWNLOAD", str(article_err)[:200], 0)
    except requests.exceptions.RequestException as net_err:
        # FIX: alias was `re`, shadowing the `re` module name.
        logger.error(f"Network Error {url}: {net_err}")
        update_source_status(source_id, "ERROR_NETWORK", str(net_err)[:200], 0)
    except Exception as e:
        logger.error(f"Unexpected Error {url}: {e}")
        update_source_status(source_id, "ERROR_UNKNOWN", str(e)[:200], 500)
|
||||
|
||||
def main():
    """Single-pass entry point: process every active URL source once."""
    logger.info("Starting URL Worker")
    active_sources = get_active_urls()
    logger.info(f"Found {len(active_sources)} active URLs")
    for source in active_sources:
        process_url(source)


if __name__ == "__main__":
    main()
|
||||
|
|
@ -1,31 +0,0 @@
|
|||
import time
|
||||
import logging
|
||||
import sys
|
||||
from workers.url_worker import main as run_once
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
||||
stream=sys.stdout
|
||||
)
|
||||
logger = logging.getLogger("url_worker_daemon")
|
||||
|
||||
INTERVAL = 300 # 5 minutes
|
||||
|
||||
def main():
    """Daemon entry point: run url_worker cycles forever, sleeping INTERVAL
    seconds between runs. Exceptions are logged and never kill the loop."""
    logger.info("Starting URL Worker Daemon")
    logger.info(f"Check interval: {INTERVAL} seconds")

    def one_cycle():
        # A single job cycle, wrapped so the error boundary stays tiny.
        logger.info("Running job cycle...")
        run_once()
        logger.info("Cycle completed.")

    while True:
        try:
            one_cycle()
        except Exception as exc:
            logger.exception(f"Error in job cycle: {exc}")

        time.sleep(INTERVAL)


if __name__ == "__main__":
    main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue