go integration and wikipedia

jlimolina 2026-03-28 18:30:07 +01:00
parent 47a252e339
commit ee90335b92
7828 changed files with 1307913 additions and 20807 deletions


@@ -0,0 +1,405 @@
import os
import time
import logging
import re
from typing import List, Optional
import psycopg2
import psycopg2.extras
from langdetect import detect, DetectorFactory
import ctranslate2
from transformers import AutoTokenizer
DetectorFactory.seed = 0
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s: %(message)s")
LOG = logging.getLogger("translator_ct2")
TRANSLATOR_ID = os.environ.get("TRANSLATOR_ID", "")
TRANSLATOR_TOTAL = int(os.environ.get("TRANSLATOR_TOTAL", "1"))
def clean_text(text: str) -> str:
if not text:
return ""
text = re.sub(r'<[^>]+>', '', text)
text = text.replace('<unk>', '')
text = text.replace('&nbsp;', ' ')
text = text.replace('&amp;', '&')
text = text.replace('&lt;', '<')
text = text.replace('&gt;', '>')
text = text.replace('&quot;', '"')
text = re.sub(r'\s+', ' ', text)
return text.strip()
DB_CONFIG = {
"host": os.environ.get("DB_HOST", "localhost"),
"port": int(os.environ.get("DB_PORT", 5432)),
"dbname": os.environ.get("DB_NAME", "rss"),
"user": os.environ.get("DB_USER", "rss"),
"password": os.environ.get("DB_PASS", "x"),
}
def _env_list(name: str, default="es"):
raw = os.environ.get(name)
if raw:
return [s.strip() for s in raw.split(",") if s.strip()]
return [default]
def _env_int(name: str, default: int = 8):
v = os.environ.get(name)
try:
return int(v)
except Exception:
return default
def _env_str(name: str, default=None):
v = os.environ.get(name)
return v if v else default
TARGET_LANGS = _env_list("TARGET_LANGS")
BATCH_SIZE = _env_int("TRANSLATOR_BATCH", 8)
MAX_SRC_TOKENS = _env_int("MAX_SRC_TOKENS", 512)
MAX_NEW_TOKENS = _env_int("MAX_NEW_TOKENS", 512)
CT2_MODEL_PATH = _env_str("CT2_MODEL_PATH", "/app/models/nllb-ct2")
CT2_DEVICE = _env_str("CT2_DEVICE", "cpu")
CT2_COMPUTE_TYPE = _env_str("CT2_COMPUTE_TYPE", "int8")
UNIVERSAL_MODEL = _env_str("UNIVERSAL_MODEL", "facebook/nllb-200-distilled-600M")
BODY_CHARS_CHUNK = _env_int("BODY_CHARS_CHUNK", 900)
LANG_CODE_MAP = {
"en": "eng_Latn", "es": "spa_Latn", "fr": "fra_Latn", "de": "deu_Latn",
"it": "ita_Latn", "pt": "por_Latn", "nl": "nld_Latn", "sv": "swe_Latn",
"da": "dan_Latn", "fi": "fin_Latn", "no": "nob_Latn",
"pl": "pol_Latn", "cs": "ces_Latn", "sk": "slk_Latn",
"sl": "slv_Latn", "hu": "hun_Latn", "ro": "ron_Latn",
"el": "ell_Grek", "ru": "rus_Cyrl", "uk": "ukr_Cyrl",
"tr": "tur_Latn", "ar": "arb_Arab", "fa": "pes_Arab",
"he": "heb_Hebr", "zh": "zho_Hans", "ja": "jpn_Jpan",
"ko": "kor_Hang", "vi": "vie_Latn",
}
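# NLLB-200 uses FLORES-200 language codes such as "spa_Latn" and "eng_Latn".
# translate_texts() below falls back to f"{src}_Latn" for ISO codes missing from this map,
# which is only correct for Latin-script languages; other scripts must be added here explicitly.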
_tokenizer = None
_translator = None
def ensure_model():
global _tokenizer, _translator
if _translator:
return
model_path = CT2_MODEL_PATH
model_bin = os.path.join(model_path, "model.bin")
if not os.path.exists(model_bin):
LOG.info(f"CTranslate2 model not found at {model_path}, converting from {UNIVERSAL_MODEL}...")
convert_model()
LOG.info(f"Loading CTranslate2 model from {model_path} on {CT2_DEVICE}")
_translator = ctranslate2.Translator(
model_path,
device=CT2_DEVICE,
compute_type=CT2_COMPUTE_TYPE,
)
_tokenizer = AutoTokenizer.from_pretrained(UNIVERSAL_MODEL)
LOG.info("CTranslate2 model loaded successfully")
def convert_model():
import subprocess
model_path = CT2_MODEL_PATH
os.makedirs(model_path, exist_ok=True)
quantization = CT2_COMPUTE_TYPE if CT2_COMPUTE_TYPE != "auto" else "int8"
cmd = [
"ct2-transformers-converter",
"--model", UNIVERSAL_MODEL,
"--output_dir", model_path,
"--quantization", quantization,
"--force"
]
LOG.info(f"Running: {' '.join(cmd)}")
result = subprocess.run(cmd, capture_output=True, text=True, timeout=1800)
if result.returncode != 0:
LOG.error(f"Model conversion failed: {result.stderr}")
raise RuntimeError("Failed to convert model")
LOG.info("Model conversion completed")
def translate_texts(src: str, tgt: str, texts: List[str]) -> List[str]:
if not texts:
return []
ensure_model()
clean = [(t or "").strip() for t in texts]
if all(not t for t in clean):
return ["" for _ in clean]
src_code = LANG_CODE_MAP.get(src, f"{src}_Latn")
tgt_code = LANG_CODE_MAP.get(tgt, "spa_Latn")
try:
_tokenizer.src_lang = src_code
except Exception:
pass
sources = []
for t in clean:
if t:
ids = _tokenizer.encode(t, truncation=True, max_length=MAX_SRC_TOKENS)
tokens = _tokenizer.convert_ids_to_tokens(ids)
sources.append(tokens)
else:
sources.append([])
target_prefix = [[tgt_code]] * len(sources)
results = _translator.translate_batch(
sources,
target_prefix=target_prefix,
beam_size=2,
max_decoding_length=MAX_NEW_TOKENS,
repetition_penalty=2.0,
no_repeat_ngram_size=3,
)
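# target_prefix forces the first decoded token to be the target language code, which is
# how NLLB selects the output language under CTranslate2; the token-string branch below
# drops that leading code token (hyp[1:]) before detokenizing.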
translated = []
for result in results:
try:
if result.hypotheses and len(result.hypotheses) > 0:
hyp = result.hypotheses[0]
if isinstance(hyp, list) and len(hyp) > 0:
first_hyp = hyp[0]
if isinstance(first_hyp, dict) and "token_ids" in first_hyp:
tokens = first_hyp["token_ids"]
text = _tokenizer.decode(tokens)
translated.append(text.strip())
elif isinstance(first_hyp, str):
token_strings = hyp[1:] if len(hyp) > 1 else []
if token_strings:
text = _tokenizer.convert_tokens_to_string(token_strings)
translated.append(text.strip())
else:
translated.append("")
else:
translated.append("")
else:
translated.append("")
else:
translated.append("")
except Exception as e:
LOG.error(f"Error processing result: {e}")
translated.append("")
return translated
def split_body_into_chunks(text: str) -> List[str]:
text = (text or "").strip()
if len(text) <= BODY_CHARS_CHUNK:
return [text] if text else []
parts = re.split(r'(\n\n+|(?<=[\.\!\?؛؟。])\s+)', text)
chunks = []
current = ""
for part in parts:
if not part:
continue
if len(current) + len(part) <= BODY_CHARS_CHUNK:
current += part
else:
if current.strip():
chunks.append(current.strip())
current = part
if current.strip():
chunks.append(current.strip())
return chunks if chunks else [text]
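# Example: with the default BODY_CHARS_CHUNK=900, a ~2,500-character article is split at
# paragraph/sentence boundaries into roughly three chunks, each translated separately by
# translate_body_long() and re-joined with spaces.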
def translate_body_long(src: str, tgt: str, body: str) -> str:
body = (body or "").strip()
if not body:
return ""
chunks = split_body_into_chunks(body)
if len(chunks) == 1:
return translate_texts(src, tgt, [body])[0]
translated_chunks = []
for ch in chunks:
tr = translate_texts(src, tgt, [ch])[0]
translated_chunks.append(tr)
return " ".join(translated_chunks)
def normalize_lang(lang: Optional[str], default: str = "es") -> Optional[str]:
if not lang:
return default
lang = lang.strip().lower()[:2]
return lang if lang else default
def detect_lang(text: str) -> str:
if not text or len(text) < 10:
return "en"
try:
return detect(text)
except Exception:
return "en"
def process_batch(conn, rows):
todo = []
for r in rows:
lang_to = normalize_lang(r.get("lang_to"), "es") or "es"
lang_from = normalize_lang(r.get("lang_from")) or detect_lang(r.get("titulo") or "")
titulo = (r.get("titulo") or "").strip()
resumen = (r.get("resumen") or "").strip()
if lang_from == lang_to:
# Mark as done and copy original text if languages match
cursor = conn.cursor()
cursor.execute("""
UPDATE traducciones
SET titulo_trad = %s, resumen_trad = %s, status = 'done'
WHERE id = %s
""", (titulo, resumen, r.get("tr_id")))
conn.commit()
cursor.close()
continue
todo.append({
"tr_id": r.get("tr_id"),
"lang_from": lang_from,
"lang_to": lang_to,
"titulo": titulo,
"resumen": resumen,
})
if not todo:
return
# 1. FAST LOCKING: Commit locked_at immediately to inform other workers
cursor = conn.cursor()
tr_ids = [item["tr_id"] for item in todo]
cursor.execute(f"""
UPDATE traducciones
SET locked_at = NOW()
WHERE id = ANY(ARRAY[{','.join(['%s'] * len(tr_ids))}])
""", tr_ids)
conn.commit()
cursor.close()
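# Committing locked_at immediately makes the claim visible to other workers, whose fetch
# query skips rows locked within the last 10 minutes; if a worker dies mid-batch, the lock
# simply expires and the rows become eligible again.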
from collections import defaultdict
groups = defaultdict(list)
for item in todo:
key = (item["lang_from"], item["lang_to"])
groups[key].append(item)
for (lang_from, lang_to), items in groups.items():
LOG.info(f"Translating {lang_from} -> {lang_to} ({len(items)} items)")
try:
titles = [i["titulo"] for i in items]
translated_titles = translate_texts(lang_from, lang_to, titles)
for item, tt in zip(items, translated_titles):
body = (item["resumen"] or "").strip()
tb = ""
if body:
try:
tb = translate_body_long(lang_from, lang_to, body)
except Exception as e:
LOG.error(f"Body translation error for ID {item['tr_id']}: {e}")
tb = item["resumen"]
tt = clean_text((tt or "").strip())
tb = clean_text((tb or "").strip())
if not tt:
tt = item["titulo"]
if not tb:
tb = item["resumen"]
# 2. INDIVIDUAL COMMIT: Save each item as it's done
try:
cursor = conn.cursor()
cursor.execute("""
UPDATE traducciones
SET titulo_trad = %s, resumen_trad = %s, status = 'done', locked_at = NULL
WHERE id = %s
""", (tt, tb, item["tr_id"]))
conn.commit()
cursor.close()
except Exception as e:
LOG.error(f"Update error for ID {item['tr_id']}: {e}")
conn.rollback()
LOG.info(f"Finished group {lang_from} -> {lang_to}")
except Exception as e:
LOG.error(f"Batch group error {lang_from} -> {lang_to}: {e}")
# Mark these as error to avoid infinite loop if it's a model crash
try:
cursor = conn.cursor()
cursor.execute("""
UPDATE traducciones SET status = 'error', locked_at = NULL
WHERE id = ANY(ARRAY[{','.join(['%s'] * len(items))}])
""", [i["tr_id"] for i in items])
conn.commit()
cursor.close()
except Exception:
conn.rollback()
def fetch_pending_translations(conn):
cursor = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
worker_id = os.environ.get("HOSTNAME", f"worker-{os.getpid()}")
for lang in TARGET_LANGS:
cursor.execute("""
SELECT t.id as tr_id, t.lang_from, t.lang_to,
n.titulo, n.resumen, n.id as noticia_id
FROM traducciones t
JOIN noticias n ON n.id = t.noticia_id
WHERE t.lang_to = %s
AND (t.titulo_trad IS NULL OR t.resumen_trad IS NULL)
AND (t.locked_at IS NULL OR t.locked_at < NOW() - INTERVAL '10 minutes')
ORDER BY n.fecha DESC
LIMIT %s
FOR UPDATE SKIP LOCKED
""", (lang, BATCH_SIZE))
rows = cursor.fetchall()
if rows:
LOG.info(f"Found {len(rows)} pending translations for {lang}")
process_batch(conn, rows)
cursor.close()
def connect_db():
return psycopg2.connect(**DB_CONFIG)
def main():
LOG.info(f"CTranslate2 translator worker started (device={CT2_DEVICE}, instances={TRANSLATOR_TOTAL})")
ensure_model()
while True:
try:
conn = connect_db()
fetch_pending_translations(conn)
conn.close()
except Exception as e:
LOG.error(f"Error: {e}")
time.sleep(30)
if __name__ == "__main__":
main()


@@ -0,0 +1,109 @@
#!/usr/bin/env python3
"""
Language Detection Worker
Detects and updates the language of news items in the database.
"""
import os
import sys
import time
import logging
from collections import Counter
import psycopg2
from psycopg2.extras import RealDictCursor
from langdetect import detect, LangDetectException
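# Note: unlike translator_ct2, this worker does not set DetectorFactory.seed, so langdetect
# may return slightly different results across runs for short or ambiguous texts.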
logging.basicConfig(
level=logging.INFO,
format='[%(asctime)s] %(levelname)s - %(message)s',
datefmt='%Y-%m-%d %H:%M:%S'
)
LOG = logging.getLogger(__name__)
DB_CONFIG = {
'host': os.getenv('DB_HOST', 'db'),
'port': int(os.getenv('DB_PORT', 5432)),
'database': os.getenv('DB_NAME', 'rss'),
'user': os.getenv('DB_USER', 'rss'),
'password': os.getenv('DB_PASS', 'rss')
}
BATCH_SIZE = int(os.getenv('LANG_DETECT_BATCH', '1000'))
SLEEP_INTERVAL = int(os.getenv('LANG_DETECT_SLEEP', '60'))
def get_db_connection():
return psycopg2.connect(**DB_CONFIG)
def detect_language(text):
if not text or len(text.strip()) < 10:
return None
try:
return detect(text)
except LangDetectException:
return None
def process_batch(conn):
cursor = conn.cursor(cursor_factory=RealDictCursor)
# ONLY pick items where lang is NULL or empty
cursor.execute("""
SELECT id, titulo, resumen
FROM noticias
WHERE lang IS NULL OR TRIM(lang) = ''
ORDER BY fecha DESC
LIMIT %s
""", (BATCH_SIZE,))
rows = cursor.fetchall()
if not rows:
return 0
updated = 0
lang_stats = Counter()
for row in rows:
news_id = row['id']
titulo = (row['titulo'] or "").strip()
resumen = (row['resumen'] or "").strip()
combined = f"{titulo} {resumen}".strip()
lang = detect_language(combined)
if lang:
cursor.execute("""
UPDATE noticias SET lang = %s WHERE id = %s
""", (lang, news_id))
lang_stats[lang] += 1
updated += 1
conn.commit()
cursor.close()
if updated > 0:
LOG.info(f"Updated {updated} news languages: {dict(lang_stats)}")
return updated
def main():
LOG.info("Language detection worker started")
while True:
try:
conn = get_db_connection()
processed = process_batch(conn)
conn.close()
if processed == 0:
LOG.info("No more news to process, sleeping...")
time.sleep(SLEEP_INTERVAL)
else:
time.sleep(1)
except Exception as e:
LOG.error(f"Error: {e}")
time.sleep(10)
if __name__ == "__main__":
main()


@@ -3,7 +3,8 @@ import time
import logging
import re
import string
from typing import List, Tuple
import json
from typing import List, Tuple, Set, Dict
from collections import Counter
import psycopg2
@@ -46,6 +47,49 @@ ENT_LABELS = {
"MISC": "tema",
}
# ==========================================================
# Global entity configuration (synonyms / blacklist)
# ==========================================================
ENTITY_CONFIG = {"blacklist": [], "synonyms": {}}
REVERSE_SYNONYMS = {}
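# Assumed shape of entity_config.json (illustrative example; the values mirror the
# hardcoded replacements/blacklist removed further down in this diff):
# {
#   "blacklist": ["getty images", "app"],
#   "synonyms": {
#     "Donald Trump": ["trump", "mr. trump", "president trump"],
#     "Pedro Sánchez": ["sanchez", "pedro sanchez"]
#   }
# }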
def load_entity_config():
global ENTITY_CONFIG, REVERSE_SYNONYMS
path = "entity_config.json"
if os.path.exists(path):
try:
with open(path, "r", encoding="utf-8") as f:
ENTITY_CONFIG = json.load(f)
# Build a reverse map for fast synonym lookup
REVERSE_SYNONYMS = {}
for canonical, aliases in ENTITY_CONFIG.get("synonyms", {}).items():
for alias in aliases:
REVERSE_SYNONYMS[alias.lower()] = canonical
REVERSE_SYNONYMS[canonical.lower()] = canonical
log.info(f"Loaded entity_config.json: {len(ENTITY_CONFIG.get('blacklist', []))} blacklisted, {len(ENTITY_CONFIG.get('synonyms', {}))} synonym groups")
except Exception as e:
log.error(f"Error loading entity_config.json: {e}")
def get_canonical_name(text: str) -> str:
if not text:
return text
lower = text.lower()
return REVERSE_SYNONYMS.get(lower, text)
def is_blacklisted(text: str) -> bool:
if not text:
return True
lower = text.lower()
# Check full match
if lower in [item.lower() for item in ENTITY_CONFIG.get("blacklist", [])]:
return True
# Check if it's just a number
if re.fullmatch(r"[0-9\s\.,\-:/]+", lower):
return True
return False
# ==========================================================
# Advanced cleaning
# ==========================================================
@@ -125,7 +169,11 @@ def clean_tag_text(text: str) -> str | None:
if not text:
return None
text = BeautifulSoup(text, "html.parser").get_text()
try:
text = BeautifulSoup(text, "html.parser").get_text()
except Exception:
pass
for pat in HTML_TRASH_PATTERNS:
text = re.sub(pat, "", text)
@@ -133,71 +181,25 @@ def clean_tag_text(text: str) -> str | None:
text = text.strip(string.punctuation + " ")
if len(text) < 3:
log.debug(f"Clean reject (too short): {text}")
return None
if re.search(r"[<>/\\]", text):
log.debug(f"Clean reject (bad chars): {text}")
return None
if is_blacklisted(text):
return None
lower = text.lower()
if lower.startswith("href="):
log.debug(f"Clean reject (href): {text}")
return None
if _looks_like_attr_or_path(lower):
log.debug(f"Clean reject (attr/path): {text}")
return None
if lower in GENERIC_BAD_TAGS:
log.debug(f"Clean reject (generic bad): {text}")
return None
replacements = {
"ee.uu.": "Estados Unidos",
"los estados unidos": "Estados Unidos",
"eeuu": "Estados Unidos",
"eu": "Unión Europea",
"ue": "Unión Europea",
"kosova": "Kosovo",
# Specific User Requests
"trump": "Donald Trump",
"mr. trump": "Donald Trump",
"mr trump": "Donald Trump",
"doland trump": "Donald Trump",
"el presidente trump": "Donald Trump",
"president trump": "Donald Trump",
"ex-president trump": "Donald Trump",
"expresidente trump": "Donald Trump",
"putin": "Vladimir Putin",
"vladimir putin": "Vladimir Putin",
"v. putin": "Vladimir Putin",
"presidente putin": "Vladimir Putin",
# New requests
"sanchez": "Pedro Sánchez",
"pedro sanchez": "Pedro Sánchez",
"p. sanchez": "Pedro Sánchez",
"mr. sanchez": "Pedro Sánchez",
"sánchez": "Pedro Sánchez", # explicit match just in case
"pedro sánchez": "Pedro Sánchez",
"maduro": "Nicolás Maduro",
"nicolas maduro": "Nicolás Maduro",
"mr. maduro": "Nicolás Maduro",
"lula": "Lula da Silva",
"lula da silva": "Lula da Silva",
"luiz inácio lula da silva": "Lula da Silva",
}
if lower in replacements:
return replacements[lower]
# Blacklist (explicit removals requested)
blacklist = {
"getty images", "netflix", "fiscalia", "segun", "estoy", # People blacklist
"and more", "app", "estamos", "ultra", # Orgs blacklist
"hacienda", "fiscalía"
}
if lower in blacklist:
log.debug(f"Clean reject (blacklist): {text}")
return None
return text
# Normalization via entity_config
canonical = get_canonical_name(text)
return canonical
# ==========================================================
@@ -207,7 +209,11 @@ def clean_topic_text(text: str) -> str | None:
if not text:
return None
text = BeautifulSoup(text, "html.parser").get_text()
try:
text = BeautifulSoup(text, "html.parser").get_text()
except Exception:
pass
for pat in HTML_TRASH_PATTERNS:
text = re.sub(pat, "", text)
@@ -217,6 +223,9 @@ def clean_topic_text(text: str) -> str | None:
if len(text) < TOPIC_MIN_CHARS:
return None
if is_blacklisted(text):
return None
lower = text.lower()
if _looks_like_attr_or_path(lower):
return None
@@ -245,8 +254,6 @@ def clean_topic_text(text: str) -> str | None:
return None
if all(t in STOPWORDS for t in tokens):
return None
if re.fullmatch(r"[0-9\s\.,\-:/]+", norm):
return None
return norm
@@ -262,8 +269,6 @@ def extract_entities_and_topics(nlp, text: str) -> Tuple[List[Tuple[str, str]],
return ents, topics
doc = nlp(text)
# log.debug(f"Processing text ({len(text)} chars): {text[:30]}...")
# log.debug(f"Entities found: {len(doc.ents)}")
# --- ENTITIES ---
for ent in doc.ents:
@@ -273,35 +278,8 @@ def extract_entities_and_topics(nlp, text: str) -> Tuple[List[Tuple[str, str]],
cleaned = clean_tag_text(ent.text)
if not cleaned:
# log.debug(f"Rejected entity: {ent.text} ({ent.label_})")
continue
if tipo == "persona":
lower_cleaned = cleaned.lower()
# Aggressive normalization rules for VIPs
# Use token checks or substring checks carefully
if "trump" in lower_cleaned.split():
# Token 'trump' exists? e.g. "donald trump", "trump", "mr. trump"
# Exclude family members
family = ["ivanka", "melania", "eric", "tiffany", "barron", "lara", "mary", "fred"]
if not any(f in lower_cleaned for f in family):
cleaned = "Donald Trump"
elif "sanchez" in lower_cleaned or "sánchez" in lower_cleaned:
# Be careful of other Sanchez? But user context implies Pedro.
if "pedro" in lower_cleaned or "presidente" in lower_cleaned or lower_cleaned in ["sanchez", "sánchez"]:
cleaned = "Pedro Sánchez"
elif "maduro" in lower_cleaned:
cleaned = "Nicolás Maduro"
elif "lula" in lower_cleaned:
cleaned = "Lula da Silva"
elif "putin" in lower_cleaned:
cleaned = "Vladimir Putin"
# log.debug(f"Accepted entity: {cleaned} ({tipo})")
ents.append((cleaned, tipo))
# --- TOPICS ---
@@ -311,10 +289,10 @@ def extract_entities_and_topics(nlp, text: str) -> Tuple[List[Tuple[str, str]],
if cleaned:
topic_counter[cleaned] += 1
ent_values = {v for (v, _) in ents}
ent_values = {v.lower() for (v, _) in ents}
for val, _count in topic_counter.most_common(TOPIC_MAX_PER_DOC):
if val in ent_values:
if val.lower() in ent_values:
continue
topics.append((val, "tema"))
@@ -328,85 +306,98 @@ def main():
global STOPWORDS
# Load spaCy
log.info("Cargando modelo spaCy es_core_news_md...")
nlp = spacy.load("es_core_news_md", disable=["lemmatizer", "textcat"])
log.info("Cargando modelo spaCy es_core_news_lg...")
nlp = spacy.load("es_core_news_lg", disable=["lemmatizer", "textcat"])
STOPWORDS = set(nlp.Defaults.stop_words)
log.info("Modelo spaCy cargado correctamente.")
# Load entity configuration
load_entity_config()
while True:
try:
with get_conn() as conn, conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
cur.execute(
"""
SELECT t.id, t.titulo_trad, t.resumen_trad
FROM traducciones t
WHERE t.status = 'done'
AND t.lang_to = %s
AND NOT EXISTS (
SELECT 1 FROM tags_noticia tn WHERE tn.traduccion_id = t.id
)
ORDER BY t.id DESC
LIMIT %s;
""",
(NER_LANG, BATCH),
)
with get_conn() as conn:
with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
cur.execute(
"""
SELECT t.id, t.titulo_trad, t.resumen_trad, t.noticia_id
FROM traducciones t
WHERE t.status = 'done'
AND t.lang_to = %s
AND NOT EXISTS (
SELECT 1 FROM tags_noticia tn WHERE tn.traduccion_id = t.id
)
ORDER BY t.id DESC
LIMIT %s;
""",
(NER_LANG, BATCH),
)
rows = cur.fetchall()
rows = cur.fetchall()
if not rows:
time.sleep(5)
continue
log.info(f"Procesando {len(rows)} traducciones para NER/temas...")
inserted_links = 0
for r in rows:
text = f"{r['titulo_trad'] or ''}\n{r['resumen_trad'] or ''}".strip()
if not text:
if not rows:
time.sleep(10)
continue
ents, topics = extract_entities_and_topics(nlp, text)
tags = ents + topics
if not tags:
continue
log.info(f"Procesando {len(rows)} traducciones para NER/temas...")
for valor, tipo in tags:
try:
cur.execute(
"""
INSERT INTO tags (valor, tipo)
VALUES (%s, %s)
ON CONFLICT (valor, tipo)
DO UPDATE SET valor = EXCLUDED.valor
RETURNING id;
""",
(valor, tipo),
)
tag_id = cur.fetchone()[0]
inserted_links = 0
cur.execute(
"""
INSERT INTO tags_noticia (traduccion_id, tag_id)
VALUES (%s, %s)
ON CONFLICT DO NOTHING;
""",
(r["id"], tag_id),
)
for r in rows:
noticia_id = r["noticia_id"]
traduccion_id = r["id"]
text = f"{r['titulo_trad'] or ''}\n{r['resumen_trad'] or ''}".strip()
if not text:
# To avoid re-processing, insert a special '_none_' tag
tags = [("_none_", "sistema")]
else:
ents, topics = extract_entities_and_topics(nlp, text)
tags = ents + topics
if not tags:
tags = [("_none_", "sistema")]
if cur.rowcount > 0:
inserted_links += 1
for valor, tipo in tags:
try:
# Use a partial commit per news item so a failure does not abort the whole batch
cur.execute(
"""
INSERT INTO tags (valor, tipo)
VALUES (%s, %s)
ON CONFLICT (valor, tipo)
DO UPDATE SET valor = EXCLUDED.valor
RETURNING id;
""",
(valor, tipo),
)
tag_id = cur.fetchone()[0]
except Exception:
log.exception("Error insertando tag/relación")
cur.execute(
"""
INSERT INTO tags_noticia (traduccion_id, noticia_id, tag_id)
VALUES (%s, %s, %s)
ON CONFLICT DO NOTHING;
""",
(traduccion_id, noticia_id, tag_id),
)
conn.commit()
log.info(f"Lote NER OK. Nuevas relaciones tag_noticia: {inserted_links}")
if cur.rowcount > 0:
inserted_links += 1
except Exception as e:
log.error(f"Error insertando tag '{valor}': {e}")
conn.rollback()
# Restarting the tag loop for this news item would not be ideal, but the rollback
# has aborted the cursor's transaction; in psycopg2, after a rollback we must
# either continue in a new transaction or close the connection.
pass
conn.commit()
except Exception:
log.exception("Error general en NER loop")
time.sleep(5)
log.info(f"Lote NER OK. Nuevas relaciones tag_noticia: {inserted_links}")
except Exception as e:
log.exception(f"Error general en NER loop: {e}")
time.sleep(10)
if __name__ == "__main__":


@@ -1,334 +0,0 @@
"""
Worker de Qdrant
Vectoriza noticias traducidas y las sube a Qdrant para búsquedas semánticas.
"""
import os
import sys
import time
import uuid
from datetime import datetime
from typing import List, Dict, Any
# Añadir el directorio raíz al path
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from db import get_read_conn, get_write_conn
try:
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct, Filter, FieldCondition, MatchValue
except ImportError:
print("❌ Error: qdrant-client no instalado. Ejecuta: pip install qdrant-client")
sys.exit(1)
try:
from sentence_transformers import SentenceTransformer
except ImportError:
print("❌ Error: sentence-transformers no instalado")
sys.exit(1)
# Configuración
QDRANT_HOST = os.environ.get("QDRANT_HOST", "localhost")
QDRANT_PORT = int(os.environ.get("QDRANT_PORT", "6333"))
QDRANT_COLLECTION = os.environ.get("QDRANT_COLLECTION_NAME", "news_vectors")
EMB_MODEL = os.environ.get("EMB_MODEL", "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
EMB_DEVICE = os.environ.get("EMB_DEVICE", "cuda")
BATCH_SIZE = int(os.environ.get("QDRANT_BATCH_SIZE", "100"))
SLEEP_IDLE = int(os.environ.get("QDRANT_SLEEP_IDLE", "30"))
# Cliente Qdrant global
qdrant_client = None
embedding_model = None
def init_qdrant_client():
"""
Inicializa el cliente de Qdrant y crea la colección si no existe.
"""
global qdrant_client
print(f"🔌 Conectando a Qdrant en {QDRANT_HOST}:{QDRANT_PORT}...")
qdrant_client = QdrantClient(host=QDRANT_HOST, port=QDRANT_PORT)
# Verificar si la colección existe
collections = qdrant_client.get_collections().collections
collection_names = [c.name for c in collections]
if QDRANT_COLLECTION not in collection_names:
print(f"📦 Creando colección '{QDRANT_COLLECTION}'...")
# Obtener dimensión del modelo de embeddings
# paraphrase-multilingual-MiniLM-L12-v2 = 384 dimensiones
vector_size = 384
qdrant_client.create_collection(
collection_name=QDRANT_COLLECTION,
vectors_config=VectorParams(
size=vector_size,
distance=Distance.COSINE
)
)
print(f"✅ Colección '{QDRANT_COLLECTION}' creada (dimensión: {vector_size})")
else:
print(f"✅ Colección '{QDRANT_COLLECTION}' ya existe")
# Obtener info de la colección
collection_info = qdrant_client.get_collection(QDRANT_COLLECTION)
print(f"📊 Puntos en colección: {collection_info.points_count}")
def init_embedding_model():
"""
Inicializa el modelo de embeddings.
"""
global embedding_model
print(f"🤖 Cargando modelo de embeddings: {EMB_MODEL}")
print(f"🖥️ Dispositivo: {EMB_DEVICE}")
embedding_model = SentenceTransformer(EMB_MODEL, device=EMB_DEVICE)
print(f"✅ Modelo cargado correctamente")
def get_pending_news(limit: int = BATCH_SIZE) -> List[Dict[str, Any]]:
"""
Obtiene noticias traducidas pendientes de vectorizar.
Args:
limit: Número máximo de noticias a obtener
Returns:
Lista de noticias
"""
with get_read_conn() as conn:
with conn.cursor() as cur:
cur.execute("""
SELECT
t.id as traduccion_id,
t.noticia_id,
TRIM(t.lang_to) as lang,
t.titulo_trad as titulo,
t.resumen_trad as resumen,
n.url,
n.fecha,
n.fuente_nombre,
n.categoria_id,
n.pais_id
FROM traducciones t
INNER JOIN noticias n ON t.noticia_id = n.id
WHERE t.vectorized = FALSE
AND t.status = 'done'
ORDER BY t.created_at ASC
LIMIT %s
""", (limit,))
columns = [desc[0] for desc in cur.description]
results = []
for row in cur.fetchall():
results.append(dict(zip(columns, row)))
return results
def generate_embeddings(texts: List[str]) -> List[List[float]]:
"""
Genera embeddings para una lista de textos.
Args:
texts: Lista de textos
Returns:
Lista de vectores de embeddings
"""
embeddings = embedding_model.encode(
texts,
batch_size=32,
show_progress_bar=False,
convert_to_numpy=True
)
return embeddings.tolist()
def upload_to_qdrant(news_batch: List[Dict[str, Any]]):
"""
Sube un lote de noticias a Qdrant.
Args:
news_batch: Lista de noticias
"""
if not news_batch:
return
# Preparar textos para embeddings (título + resumen)
texts = [
f"{news['titulo']} {news['resumen']}"
for news in news_batch
]
print(f" 🧮 Generando embeddings para {len(texts)} noticias...")
embeddings = generate_embeddings(texts)
# Preparar puntos para Qdrant
points = []
for news, embedding in zip(news_batch, embeddings):
point_id = str(uuid.uuid4())
# Preparar payload (metadata)
payload = {
"news_id": news['noticia_id'],
"traduccion_id": news['traduccion_id'],
"titulo": news['titulo'],
"resumen": news['resumen'],
"url": news['url'],
"fecha": news['fecha'].isoformat() if news['fecha'] else None,
"fuente_nombre": news['fuente_nombre'],
"categoria_id": news['categoria_id'],
"pais_id": news['pais_id'],
"lang": news['lang']
}
point = PointStruct(
id=point_id,
vector=embedding,
payload=payload
)
points.append(point)
# Guardar point_id para actualizar DB
news['qdrant_point_id'] = point_id
# Subir a Qdrant
print(f" ⬆️ Subiendo {len(points)} puntos a Qdrant...")
qdrant_client.upsert(
collection_name=QDRANT_COLLECTION,
points=points
)
# Actualizar base de datos
print(f" 💾 Actualizando estado en PostgreSQL...")
with get_write_conn() as conn:
with conn.cursor() as cur:
for news in news_batch:
cur.execute("""
UPDATE traducciones
SET
vectorized = TRUE,
vectorization_date = NOW(),
qdrant_point_id = %s
WHERE id = %s
""", (news['qdrant_point_id'], news['traduccion_id']))
conn.commit()
print(f" ✅ Lote subido correctamente")
def process_batch():
"""
Procesa un lote de noticias traducidas.
Returns:
Número de noticias procesadas
"""
news_batch = get_pending_news()
if not news_batch:
return 0
print(f"\n📋 Procesando {len(news_batch)} noticias traducidas...")
try:
upload_to_qdrant(news_batch)
return len(news_batch)
except Exception as e:
print(f"❌ Error procesando lote: {e}")
return 0
def get_stats():
"""
Obtiene estadísticas del sistema.
"""
with get_read_conn() as conn:
with conn.cursor() as cur:
cur.execute("""
SELECT
COUNT(*) as total,
COUNT(*) FILTER (WHERE vectorized = TRUE) as vectorizadas,
COUNT(*) FILTER (WHERE vectorized = FALSE AND status = 'done') as pendientes
FROM traducciones
WHERE lang_to = 'es'
""")
row = cur.fetchone()
return {
'total': row[0],
'vectorizadas': row[1],
'pendientes': row[2]
}
def main():
"""
Loop principal del worker.
"""
print("=" * 80)
print("🚀 Qdrant Vectorization Worker (Direct Translation)")
print("=" * 80)
print(f"Qdrant: {QDRANT_HOST}:{QDRANT_PORT}")
print(f"Colección: {QDRANT_COLLECTION}")
print(f"Modelo: {EMB_MODEL}")
print(f"Dispositivo: {EMB_DEVICE}")
print(f"Tamaño de lote: {BATCH_SIZE}")
print("=" * 80)
# Inicializar Qdrant
try:
init_qdrant_client()
except Exception as e:
print(f"❌ Error inicializando Qdrant: {e}")
print("⚠️ Asegúrate de que Qdrant esté corriendo")
return
# Inicializar modelo de embeddings
try:
init_embedding_model()
except Exception as e:
print(f"❌ Error cargando modelo de embeddings: {e}")
return
print("\n🔄 Iniciando loop de procesamiento...\n")
total_processed = 0
while True:
try:
processed = process_batch()
total_processed += processed
if processed > 0:
print(f"\n✅ Lote completado: {processed} noticias vectorizadas")
print(f"📊 Total procesado en esta sesión: {total_processed}")
# Mostrar estadísticas
stats = get_stats()
print(f"📈 Estadísticas globales:")
print(f" Total traducciones: {stats['total']}")
print(f" Vectorizadas: {stats['vectorizadas']}")
print(f" Pendientes: {stats['pendientes']}")
else:
print(f"💤 No hay noticias pendientes. Esperando {SLEEP_IDLE}s...")
time.sleep(SLEEP_IDLE)
except KeyboardInterrupt:
print("\n\n⏹️ Worker detenido por el usuario")
break
except Exception as e:
print(f"\n❌ Error en loop principal: {e}")
print(f"⏳ Esperando {SLEEP_IDLE}s antes de reintentar...")
time.sleep(SLEEP_IDLE)
if __name__ == "__main__":
main()


@@ -1,202 +0,0 @@
import os
import time
import logging
from typing import List, Tuple
import numpy as np
import psycopg2
import psycopg2.extras
logging.basicConfig(
level=logging.INFO,
format='[related] %(asctime)s %(levelname)s: %(message)s'
)
DB = dict(
host=os.environ.get("DB_HOST", "localhost"),
port=int(os.environ.get("DB_PORT", 5432)),
dbname=os.environ.get("DB_NAME", "rss"),
user=os.environ.get("DB_USER", "rss"),
password=os.environ.get("DB_PASS", "x"),
)
EMB_MODEL = os.environ.get(
"EMB_MODEL",
"sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
)
TOPK = int(os.environ.get("RELATED_TOPK", 10))
BATCH_IDS = int(os.environ.get("RELATED_BATCH_IDS", 200))
SLEEP_IDLE = float(os.environ.get("RELATED_SLEEP", 10))
MIN_SCORE = float(os.environ.get("RELATED_MIN_SCORE", 0.0))
WINDOW_HOURS = int(os.environ.get("RELATED_WINDOW_H", 0))
def get_conn():
return psycopg2.connect(**DB)
def fetch_all_embeddings(cur) -> Tuple[List[int], np.ndarray]:
sql = """
SELECT e.traduccion_id, e.embedding, n.fecha
FROM traduccion_embeddings e
JOIN traducciones t ON t.id = e.traduccion_id
JOIN noticias n ON n.id = t.noticia_id
WHERE e.model = %s
AND t.status = 'done'
AND t.lang_to = 'es'
"""
params = [EMB_MODEL]
if WINDOW_HOURS > 0:
sql += " AND n.fecha >= NOW() - INTERVAL %s"
params.append(f"{WINDOW_HOURS} hours")
cur.execute(sql, params)
rows = cur.fetchall()
if not rows:
return [], None
ids = []
vecs = []
for tr_id, emb, _ in rows:
if not emb:
continue
arr = np.asarray(emb, dtype=np.float32)
if arr.ndim != 1 or arr.size == 0:
continue
ids.append(tr_id)
vecs.append(arr)
if not ids:
return [], None
mat = np.vstack(vecs)
norms = np.linalg.norm(mat, axis=1, keepdims=True)
norms[norms == 0] = 1e-8
mat = mat / norms
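# Rows are L2-normalized here, so the plain dot products taken in topk() below are
# cosine similarities.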
return ids, mat
def fetch_pending_ids(cur, limit) -> List[int]:
cur.execute(
"""
SELECT t.id
FROM traducciones t
JOIN traduccion_embeddings e
ON e.traduccion_id = t.id AND e.model = %s
LEFT JOIN related_noticias r
ON r.traduccion_id = t.id
WHERE t.lang_to = 'es'
AND t.status = 'done'
GROUP BY t.id
HAVING COUNT(r.related_traduccion_id) = 0
ORDER BY t.id DESC
LIMIT %s;
""",
(EMB_MODEL, limit),
)
return [r[0] for r in cur.fetchall()]
def topk(idx: int, ids_all: List[int], mat: np.ndarray, K: int) -> List[Tuple[int, float]]:
q = mat[idx]
sims = np.dot(mat, q)
sims[idx] = -999.0
if MIN_SCORE > 0:
mask = sims >= MIN_SCORE
sims = np.where(mask, sims, -999.0)
if K >= len(sims):
top_idx = np.argsort(-sims)
else:
part = np.argpartition(-sims, K)[:K]
top_idx = part[np.argsort(-sims[part])]
return [(ids_all[j], float(sims[j])) for j in top_idx[:K]]
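# np.argpartition selects the K largest similarities in O(n) without sorting the whole
# array; only those K candidates are then sorted for the final ranking.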
def insert_related(cur, tr_id: int, pairs):
clean = []
for rid, score in pairs:
if rid == tr_id:
continue
s = float(score)
if s <= 0:
continue
clean.append((tr_id, rid, s))
if not clean:
return
psycopg2.extras.execute_values(
cur,
"""
INSERT INTO related_noticias (traduccion_id, related_traduccion_id, score)
VALUES %s
ON CONFLICT (traduccion_id, related_traduccion_id)
DO UPDATE SET score = EXCLUDED.score;
""",
clean,
)
def build_for_ids(conn, target_ids: List[int]) -> int:
with conn.cursor() as cur:
ids_all, mat = fetch_all_embeddings(cur)
if not ids_all or mat is None:
return 0
pos = {tid: i for i, tid in enumerate(ids_all)}
processed = 0
with conn.cursor() as cur:
for tr_id in target_ids:
if tr_id not in pos:
continue
idx = pos[tr_id]
pairs = topk(idx, ids_all, mat, TOPK)
insert_related(cur, tr_id, pairs)
processed += 1
conn.commit()
return processed
def main():
logging.info(
"Iniciando related_worker (EMB=%s TOPK=%s BATCH=%s MIN=%.3f WINDOW_H=%s)",
EMB_MODEL,
TOPK,
BATCH_IDS,
MIN_SCORE,
WINDOW_HOURS,
)
while True:
try:
with get_conn() as conn, conn.cursor() as cur:
todo = fetch_pending_ids(cur, BATCH_IDS)
if not todo:
time.sleep(SLEEP_IDLE)
continue
with get_conn() as conn:
done = build_for_ids(conn, todo)
logging.info("Relacionadas generadas/actualizadas para %d traducciones.", done)
except Exception:
logging.exception("Error en related_worker")
time.sleep(SLEEP_IDLE)
if __name__ == "__main__":
main()


@@ -0,0 +1,207 @@
#!/usr/bin/env python3
import os
import time
import logging
import re
from typing import List, Optional
import psycopg2
import psycopg2.extras
from psycopg2.extras import execute_values
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
import torch
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s: %(message)s")
LOG = logging.getLogger("translator_simple")
DB_CONFIG = {
"host": os.environ.get("DB_HOST", "localhost"),
"port": int(os.environ.get("DB_PORT", 5432)),
"dbname": os.environ.get("DB_NAME", "rss"),
"user": os.environ.get("DB_USER", "rss"),
"password": os.environ.get("DB_PASS", "x"),
}
TARGET_LANGS = os.environ.get("TARGET_LANGS", "es").split(",")
BATCH_SIZE = int(os.environ.get("TRANSLATOR_BATCH", 32))
MAX_SRC_TOKENS = 512
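# NOTE: MAX_SRC_TOKENS is applied twice in translate_text(): as a character slice
# (text[:MAX_SRC_TOKENS]) and as the pipeline's max_length in tokens, so long inputs are
# truncated by characters before they are ever tokenized.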
TRANSLATORS = {}
LANG_MAP = {
"en": "en-ES",
"es": "es-ES",
"fr": "fr-ES",
"de": "de-ES",
"pt": "pt-ES",
"it": "it-ES",
"ru": "ru-ES",
"ar": "ar-ES",
"fa": "fa-ES",
"ps": "ps-ES",
"zh": "zh-ES",
"ja": "ja-ES",
"ko": "ko-ES",
}
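# NOTE: LANG_MAP is not referenced below; get_translator() builds the model id directly as
# f"Helsinki-NLP/opus-mt-{source}-{target}". Not every such pair exists on the Hugging Face
# Hub, in which case loading fails and translate_text() passes the text through unchanged.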
def get_translator(source_lang: str, target_lang: str = "es"):
key = f"{source_lang}_{target_lang}"
if key in TRANSLATORS:
return TRANSLATORS[key]
model_name = f"Helsinki-NLP/opus-mt-{source_lang}-{target_lang}"
if source_lang == target_lang:
model_name = f"Helsinki-NLP/opus-mt-{source_lang}-es"
LOG.info(f"Loading translator: {model_name}")
try:
device = 0 if torch.cuda.is_available() else -1
translator = pipeline("translation", model=model_name, device=device)
TRANSLATORS[key] = translator
LOG.info(f"Translator loaded: {key}")
return translator
except Exception as e:
LOG.error(f"Failed to load translator {model_name}: {e}")
return None
def normalize_lang(lang: Optional[str], default: str = "es") -> Optional[str]:
if not lang:
return default
lang = lang.strip().lower()[:2]
return lang if lang else default
def translate_text(source_lang: str, target_lang: str, texts: List[str]) -> List[str]:
if not texts:
return []
if source_lang == target_lang:
return texts
translator = get_translator(source_lang, target_lang)
if not translator:
return texts
results = []
for text in texts:
if not text or not text.strip():
results.append(text)
continue
try:
result = translator(text[:MAX_SRC_TOKENS], max_length=MAX_SRC_TOKENS)
translated = result[0]['translation_text']
results.append(translated)
except Exception as e:
LOG.warning(f"Translation error: {e}")
results.append(text)
return results
def connect_db():
return psycopg2.connect(**DB_CONFIG)
def process_batch(conn, rows):
todo = []
for r in rows:
lang_to = normalize_lang(r.get("lang_to"), "es") or "es"
lang_from = normalize_lang(r.get("lang_from")) or "en"
titulo = (r.get("titulo") or "").strip()
resumen = (r.get("resumen") or "").strip()
if lang_from == lang_to:
continue
todo.append({
"tr_id": r.get("tr_id"),
"lang_from": lang_from,
"lang_to": lang_to,
"titulo": titulo,
"resumen": resumen,
})
if not todo:
return
from collections import defaultdict
groups = defaultdict(list)
for item in todo:
key = (item["lang_from"], item["lang_to"])
groups[key].append(item)
for (lang_from, lang_to), items in groups.items():
LOG.info(f"Translating {lang_from} -> {lang_to} ({len(items)} items)")
titles = [i["titulo"] for i in items]
translated_titles = translate_text(lang_from, lang_to, titles)
translated_bodies = []
for i in items:
body = (i["resumen"] or "").strip()
if body:
tr = translate_text(lang_from, lang_to, [body])
translated_bodies.append(tr[0] if tr else body)
else:
translated_bodies.append("")
cursor = conn.cursor()
for i, (item, tt, tb) in enumerate(zip(items, translated_titles, translated_bodies)):
tt = (tt or "").strip()
tb = (tb or "").strip()
if not tt:
tt = item["titulo"]
if not tb:
tb = item["resumen"]
try:
cursor.execute("""
UPDATE traducciones
SET titulo_trad = %s, resumen_trad = %s, lang_to = %s
WHERE id = %s
""", (tt, tb, lang_to, item["tr_id"]))
except Exception as e:
LOG.error(f"Update error: {e}")
conn.commit()
cursor.close()
LOG.info(f"Translated {len(items)} items")
def fetch_pending_translations(conn):
cursor = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
for lang in TARGET_LANGS:
cursor.execute("""
SELECT t.id as tr_id, t.lang_from, t.lang_to,
n.titulo, n.resumen, n.id as noticia_id
FROM traducciones t
JOIN noticias n ON n.id = t.noticia_id
WHERE t.lang_to = %s
AND (t.titulo_trad IS NULL OR t.resumen_trad IS NULL)
ORDER BY n.fecha DESC
LIMIT %s
""", (lang, BATCH_SIZE))
rows = cursor.fetchall()
if rows:
LOG.info(f"Found {len(rows)} pending translations for {lang}")
process_batch(conn, rows)
cursor.close()
def main():
LOG.info("Simple translator worker started")
while True:
try:
conn = connect_db()
fetch_pending_translations(conn)
conn.close()
except Exception as e:
LOG.error(f"Error: {e}")
time.sleep(30)
if __name__ == "__main__":
main()


@@ -0,0 +1,151 @@
#!/usr/bin/env python3
"""
Simple Translation Worker using deep-translator
Uses free translation APIs (Google, LibreTranslate, etc.)
"""
import os
import sys
import time
import logging
from datetime import datetime
import psycopg2
from psycopg2.extras import RealDictCursor
from deep_translator import GoogleTranslator, MyMemoryTranslator, single_detection
logging.basicConfig(
level=logging.INFO,
format='[%(asctime)s] %(levelname)s - %(message)s',
datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)
DB_CONFIG = {
'host': os.getenv('DB_HOST', 'db'),
'port': int(os.getenv('DB_PORT', 5432)),
'database': os.getenv('DB_NAME', 'rss'),
'user': os.getenv('DB_USER', 'rss'),
'password': os.getenv('DB_PASS', 'rss')
}
TARGET_LANG = os.getenv('TARGET_LANGS', 'es').split(',')[0].strip()
BATCH_SIZE = int(os.getenv('TRANSLATOR_BATCH', '32'))
SLEEP_INTERVAL = int(os.getenv('TRANSLATOR_SLEEP', '60'))
def get_db_connection():
return psycopg2.connect(**DB_CONFIG)
def get_pending_translations(conn):
with conn.cursor(cursor_factory=RealDictCursor) as cur:
cur.execute("""
SELECT n.id, n.feed_id, n.lang, n.titulo, n.resumen
FROM noticias n
WHERE NOT EXISTS (
SELECT 1 FROM traducciones t
WHERE t.noticia_id = n.id AND t.lang_to = %s
)
AND n.lang IS NOT NULL
AND n.lang != %s
ORDER BY n.created_at DESC
LIMIT %s
""", (TARGET_LANG, TARGET_LANG, BATCH_SIZE))
return cur.fetchall()
def detect_language(text):
"""Detect language using MyMemory (free API)"""
try:
if text and len(text.strip()) > 10:
lang = single_detection(text, api_key=None)
return lang
except Exception as e:
logger.debug(f"Language detection failed: {e}")
return 'en'
def translate_text(text, source_lang, target_lang):
"""Translate text using Google Translator (via deep-translator)"""
if not text or not text.strip():
return ""
try:
translator = GoogleTranslator(source=source_lang, target=target_lang)
translated = translator.translate(text)
return translated if translated else text
except Exception as e:
logger.warning(f"Google translation failed: {e}")
# Fallback to MyMemory
try:
translator = MyMemoryTranslator(source=source_lang, target=target_lang)
translated = translator.translate(text)
return translated if translated else text
except Exception as e2:
logger.error(f"MyMemory translation also failed: {e2}")
return text
def save_translation(conn, noticia_id, lang_from, titulo, resumen):
titulo_trad = translate_text(titulo, lang_from, TARGET_LANG)
resumen_trad = translate_text(resumen, lang_from, TARGET_LANG) if resumen else ""
with conn.cursor() as cur:
cur.execute("""
INSERT INTO traducciones (noticia_id, lang_from, lang_to, titulo_trad, resumen_trad, status, created_at)
VALUES (%s, %s, %s, %s, %s, 'done', NOW())
ON CONFLICT (noticia_id, lang_to) DO UPDATE SET
titulo_trad = EXCLUDED.titulo_trad,
resumen_trad = EXCLUDED.resumen_trad,
status = 'done'
""", (noticia_id, lang_from, TARGET_LANG, titulo_trad, resumen_trad))
conn.commit()
def process_translations():
logger.info("Starting translation worker...")
while True:
conn = get_db_connection()
try:
pending = get_pending_translations(conn)
if not pending:
logger.info(f"No pending translations. Sleeping {SLEEP_INTERVAL}s...")
time.sleep(SLEEP_INTERVAL)
continue
logger.info(f"Found {len(pending)} pending translations")
for item in pending:
try:
lang = item['lang']
# Auto-detect language if needed
if not lang or lang == '':
lang = detect_language(item['titulo'] or '')
logger.info(f"Detected language: {lang} for news {item['id']}")
# Skip if already target language
if lang == TARGET_LANG:
logger.debug(f"Skipping news {item['id']} - already in target language")
continue
save_translation(
conn,
item['id'],
lang,
item['titulo'],
item['resumen']
)
logger.info(f"Translated news {item['id']}: {item['titulo'][:50]}...")
except Exception as e:
logger.error(f"Error translating news {item['id']}: {e}")
continue
except Exception as e:
logger.error(f"Database error: {e}")
time.sleep(5)
finally:
conn.close()
if __name__ == '__main__':
logger.info(f"Translation worker started. Target: {TARGET_LANG}")
process_translations()


@@ -1,244 +0,0 @@
import os
import time
import logging
import json
import psycopg2
from psycopg2.extras import execute_values
# Logging
logging.basicConfig(
level=logging.INFO,
format='[topics_worker] %(asctime)s %(levelname)s: %(message)s'
)
log = logging.getLogger(__name__)
# Config
DB_CONFIG = {
"host": os.environ.get("DB_HOST", "localhost"),
"port": int(os.environ.get("DB_PORT", 5432)),
"dbname": os.environ.get("DB_NAME", "rss"),
"user": os.environ.get("DB_USER", "rss"),
"password": os.environ.get("DB_PASS", "x"),
}
SLEEP_IDLE = 10
BATCH_SIZE = 500
def get_conn():
return psycopg2.connect(**DB_CONFIG)
def load_topics(conn):
"""
Load topics and their keywords.
Returns list of dicts: [{'id': 1, 'weight': 5, 'keywords': ['foo', 'bar']}]
"""
with conn.cursor() as cur:
cur.execute("SELECT id, weight, keywords FROM topics")
rows = cur.fetchall()
topics = []
for r in rows:
tid, weight, kw_str = r
if not kw_str:
continue
# Keywords are comma separated based on insert script
kws = [k.strip().lower() for k in kw_str.split(",") if k.strip()]
topics.append({
"id": tid,
"weight": weight,
"keywords": kws
})
return topics
def load_countries(conn):
"""
Load countries.
Returns list: [{'id': 1, 'name': 'España', 'keywords': ['españa', 'madrid']}]
"""
with conn.cursor() as cur:
cur.execute("SELECT id, nombre FROM paises")
rows = cur.fetchall()
countries = []
# Hardcoded aliases for simplicity. A separate table would be better.
ALIASES = {
"Estados Unidos": ["eeuu", "ee.uu.", "usa", "estadounidense", "washington"],
"Rusia": ["ruso", "rusa", "moscú", "kremlin"],
"China": ["chino", "china", "pekin", "beijing"],
"Ucrania": ["ucraniano", "kiev", "kyiv"],
"Israel": ["israelí", "tel aviv", "jerusalén"],
"España": ["español", "madrid"],
"Reino Unido": ["uk", "londres", "británico"],
"Francia": ["francés", "parís"],
"Alemania": ["alemán", "berlín"],
"Palestina": ["palestino", "gaza", "cisjordania"],
"Irán": ["iraní", "teherán"],
}
for r in rows:
cid, name = r
kws = [name.lower()]
if name in ALIASES:
kws.extend(ALIASES[name])
countries.append({"id": cid, "name": name, "keywords": kws})
return countries
def process_batch(conn, topics, countries):
"""
Fetch batch of processed=False news.
Match against topics AND countries.
Insert into news_topics.
Mark processed.
"""
with conn.cursor() as cur:
# Fetch news
cur.execute("""
SELECT id, titulo, resumen
FROM noticias
WHERE topics_processed = FALSE
ORDER BY fecha DESC
LIMIT %s
""", (BATCH_SIZE,))
news_items = cur.fetchall()
if not news_items:
return 0
inserts = [] # (noticia_id, topic_id, score)
processed_ids = []
# Batch updates for pais_id
country_updates = [] # (pais_id, noticia_id)
for item in news_items:
nid, titulo, resumen = item
text = (titulo or "") + " " + (resumen or "")
text_lower = text.lower()
# 1. Match Topics
for topic in topics:
matched_count = 0
for kw in topic["keywords"]:
if kw in text_lower:
matched_count += 1
if matched_count > 0:
score = topic["weight"] * matched_count
inserts.append((nid, topic["id"], score))
# 2. Match Country (Find best match)
best_country = None
# Simple heuristic: First found? Or count matches?
# Let's count matches.
max_matches = 0
for c in countries:
matches = 0
for kw in c["keywords"]:
# simple word matching. can be improved with regex word boundaries
if kw in text_lower:
matches += 1
if matches > max_matches:
max_matches = matches
best_country = c["id"]
if best_country:
country_updates.append((best_country, nid))
processed_ids.append(nid)
with conn.cursor() as cur:
# Insert relations
if inserts:
execute_values(cur, """
INSERT INTO news_topics (noticia_id, topic_id, score)
VALUES %s
ON CONFLICT (noticia_id, topic_id) DO UPDATE SET score = EXCLUDED.score
""", inserts)
# Update Countries
if country_updates:
execute_values(cur, """
UPDATE noticias AS n
SET pais_id = v.pais_id
FROM (VALUES %s) AS v(pais_id, noticia_id)
WHERE n.id = v.noticia_id
""", country_updates)
# Mark processed
cur.execute("""
UPDATE noticias
SET topics_processed = TRUE
WHERE id = ANY(%s)
""", (processed_ids,))
conn.commit()
return len(news_items)
def initialize_schema(conn):
"""
Ensure required tables and columns exist.
"""
log.info("Checking/Initializing schema...")
with conn.cursor() as cur:
cur.execute("""
CREATE TABLE IF NOT EXISTS topics (
id SERIAL PRIMARY KEY,
slug VARCHAR(50) UNIQUE NOT NULL,
name VARCHAR(100) NOT NULL,
weight INTEGER DEFAULT 1,
keywords TEXT,
group_name VARCHAR(50)
);
CREATE TABLE IF NOT EXISTS news_topics (
noticia_id VARCHAR(32) REFERENCES noticias(id) ON DELETE CASCADE,
topic_id INTEGER REFERENCES topics(id) ON DELETE CASCADE,
score INTEGER DEFAULT 0,
created_at TIMESTAMP DEFAULT NOW(),
PRIMARY KEY (noticia_id, topic_id)
);
ALTER TABLE noticias ADD COLUMN IF NOT EXISTS topics_processed BOOLEAN DEFAULT FALSE;
""")
conn.commit()
log.info("Schema OK.")
def main():
log.info("Starting topics_worker...")
# Run migrations once at startup
try:
with get_conn() as conn:
initialize_schema(conn)
except Exception as e:
log.error(f"Error during schema initialization: {e}")
# We might want to exit here if the schema is crucial
# sys.exit(1)
while True:
try:
with get_conn() as conn:
topics = load_topics(conn)
if not topics:
log.warning("No topics found in DB. Sleeping.")
time.sleep(SLEEP_IDLE)
continue
# Load countries
countries = load_countries(conn)
count = process_batch(conn, topics, countries)
if count < BATCH_SIZE:
time.sleep(SLEEP_IDLE)
else:
log.info(f"Processed {count} items.")
except Exception as e:
log.exception("Error in topics_worker")
time.sleep(SLEEP_IDLE)
if __name__ == "__main__":
main()


@@ -0,0 +1,105 @@
#!/usr/bin/env python3
"""
Translation Scheduler Worker
Creates translation jobs for news that need to be translated.
"""
import os
import sys
import time
import logging
from datetime import datetime
import psycopg2
from psycopg2.extras import RealDictCursor
from langdetect import detect, LangDetectException
logging.basicConfig(
level=logging.INFO,
format='[%(asctime)s] %(levelname)s - %(message)s',
datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)
DB_CONFIG = {
'host': os.getenv('DB_HOST', 'db'),
'port': int(os.getenv('DB_PORT', 5432)),
'database': os.getenv('DB_NAME', 'rss'),
'user': os.getenv('DB_USER', 'rss'),
'password': os.getenv('DB_PASS', 'rss')
}
TARGET_LANGS = os.getenv('TARGET_LANGS', 'es').split(',')
BATCH_SIZE = int(os.getenv('SCHEDULER_BATCH', '2000'))
SLEEP_INTERVAL = int(os.getenv('SCHEDULER_SLEEP', '30'))
# Common source languages to try
SOURCE_LANGS = ['en', 'fr', 'pt', 'de', 'it', 'ru', 'zh', 'ja', 'ar', 'nl', 'pl', 'sv']
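# NOTE: SOURCE_LANGS is currently unused; scheduling relies entirely on the noticias.lang
# value populated by the language-detection worker.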
def get_db_connection():
return psycopg2.connect(**DB_CONFIG)
def create_translation_jobs(conn):
"""Create translation jobs for news without translations.
Relies on langdetect_worker to have set the 'lang' column.
"""
created = 0
with conn.cursor(cursor_factory=RealDictCursor) as cur:
for lang in TARGET_LANGS:
lang = lang.strip()
if not lang:
continue
# Insert translation jobs for news that have a detected language
# but don't have a translation record for the target language.
cur.execute("""
INSERT INTO traducciones (noticia_id, lang_from, lang_to, status, created_at)
SELECT n.id, n.lang, %s, 'pending', NOW()
FROM noticias n
WHERE n.lang IS NOT NULL
AND TRIM(n.lang) != ''
AND n.lang != %s
AND NOT EXISTS (
SELECT 1 FROM traducciones t
WHERE t.noticia_id = n.id AND t.lang_to = %s
)
ORDER BY n.fecha DESC
LIMIT %s
ON CONFLICT (noticia_id, lang_to) DO NOTHING
RETURNING noticia_id
""", (lang, lang, lang, BATCH_SIZE))
rows = cur.fetchall()
if rows:
created += len(rows)
logger.info(f"Created {len(rows)} translation jobs for {lang}")
conn.commit()
return created
def process_translations():
logger.info("Starting translation scheduler loop...")
while True:
try:
conn = get_db_connection()
created = create_translation_jobs(conn)
conn.close()
if created == 0:
logger.info(f"No new news to schedule. Sleeping {SLEEP_INTERVAL}s...")
time.sleep(SLEEP_INTERVAL)
else:
logger.info(f"Total jobs created in this cycle: {created}")
# Short sleep to avoid hammering the DB while keeping momentum
time.sleep(5)
except Exception as e:
logger.error(f"Scheduler error: {e}")
time.sleep(10)
if __name__ == '__main__':
logger.info("Translation scheduler started")
process_translations()


@@ -7,19 +7,15 @@ from typing import List, Optional
import psycopg2
import psycopg2.extras
from psycopg2.extras import execute_values
import ctranslate2
from transformers import AutoTokenizer
from langdetect import detect, DetectorFactory
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
DetectorFactory.seed = 0
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s: %(message)s")
LOG = logging.getLogger("translator")
# =========================
# DB CONFIG
# =========================
DB_CONFIG = {
"host": os.environ.get("DB_HOST", "localhost"),
"port": int(os.environ.get("DB_PORT", 5432)),
@@ -28,9 +24,6 @@ DB_CONFIG = {
"password": os.environ.get("DB_PASS", "x"),
}
# =========================
# ENV HELPERS
# =========================
def _env_list(name: str, default="es"):
raw = os.environ.get(name)
if raw:
@@ -55,37 +48,20 @@ def _env_str(name: str, default=None):
v = os.environ.get(name)
return v if v else default
# =========================
# CONFIG
# =========================
TARGET_LANGS = _env_list("TARGET_LANGS") # por defecto ["es"]
TARGET_LANGS = _env_list("TARGET_LANGS")
BATCH_SIZE = _env_int("TRANSLATOR_BATCH", 8)
ENQUEUE_MAX = _env_int("ENQUEUE", 200)
SLEEP_IDLE = _env_float("TRANSLATOR_SLEEP_IDLE", 5.0)
# CTranslate2 Configuration
CT2_MODEL_PATH = _env_str("CT2_MODEL_PATH", "./models/nllb-ct2")
CT2_DEVICE = _env_str("CT2_DEVICE", "auto") # auto, cpu, cuda
CT2_COMPUTE_TYPE = _env_str("CT2_COMPUTE_TYPE", "auto") # auto, int8, float16, int8_float16
MAX_SRC_TOKENS = _env_int("MAX_SRC_TOKENS", 512)
MAX_NEW_TOKENS_TITLE = _env_int("MAX_NEW_TOKENS_TITLE", 96)
MAX_NEW_TOKENS_BODY = _env_int("MAX_NEW_TOKENS_BODY", 512)
NUM_BEAMS_TITLE = _env_int("NUM_BEAMS_TITLE", 2)
NUM_BEAMS_BODY = _env_int("NUM_BEAMS_BODY", 2)
# HuggingFace model name (used for tokenizer)
UNIVERSAL_MODEL = _env_str("UNIVERSAL_MODEL", "facebook/nllb-200-distilled-600M")
IDENTITY_PAISES_ES = _env_int("IDENTITY_PAISES_ES", 58)
BODY_CHARS_CHUNK = _env_int("BODY_CHARS_CHUNK", 900)
# =========================
# LANG MAP
# =========================
NLLB_LANG = {
"es": "spa_Latn", "en": "eng_Latn", "fr": "fra_Latn", "de": "deu_Latn",
LANG_CODE_MAP = {
"en": "eng_Latn", "es": "spa_Latn", "fr": "fra_Latn", "de": "deu_Latn",
"it": "ita_Latn", "pt": "por_Latn", "nl": "nld_Latn", "sv": "swe_Latn",
"da": "dan_Latn", "fi": "fin_Latn", "no": "nob_Latn",
"pl": "pol_Latn", "cs": "ces_Latn", "sk": "slk_Latn",
@@ -96,286 +72,74 @@ NLLB_LANG = {
"ko": "kor_Hang", "vi": "vie_Latn",
}
def map_to_nllb(code: Optional[str]):
if not code:
return None
c = code.strip().lower()
return NLLB_LANG.get(c, f"{c}_Latn")
_tokenizer = None
_translator = None
_device = None
def normalize_lang(code: Optional[str], default=None):
return (code or default).strip().lower() if code else default
def _norm(s: str) -> str:
return re.sub(r"\W+", "", (s or "").lower()).strip()
def _is_repetitive_output(text: str, threshold: float = 0.25) -> bool:
"""Detect if translation output is repetitive/low quality.
def get_translator_components():
global _tokenizer, _translator, _device
Args:
text: The translated text to check
threshold: Minimum unique word ratio (default 0.25 = 25% unique words)
if _translator:
return _tokenizer, _translator
Returns:
True if text appears to be repetitive/low quality
"""
if not text or len(text) < 50:
return False
device = 0 if torch.cuda.is_available() else -1
LOG.info(f"Loading model {UNIVERSAL_MODEL} on {'cuda' if device == 0 else 'cpu'}")
# Check for obvious repetitive patterns
repetitive_patterns = [
r'(\b\w+\b)( \1){3,}', # Same word repeated 4+ times
r'(\b\w+ \w+\b)( \1){2,}', # Same 2-word phrase repeated 3+ times
r'de la la ',
r'la línea de la línea',
r'de Internet de Internet',
]
_tokenizer = AutoTokenizer.from_pretrained(UNIVERSAL_MODEL, src_lang="eng_Latn")
model = AutoModelForSeq2SeqLM.from_pretrained(UNIVERSAL_MODEL)
for pattern in repetitive_patterns:
if re.search(pattern, text, re.IGNORECASE):
LOG.warning(f"Detected repetitive pattern: {pattern}")
return True
if device == 0:
model = model.to("cuda")
# Check word diversity
words = text.lower().split()
if len(words) < 10:
return False
unique_ratio = len(set(words)) / len(words)
if unique_ratio < threshold:
LOG.warning(f"Low word diversity: {unique_ratio:.2%} (threshold: {threshold:.2%})")
return True
return False
# =========================
# DB
# =========================
def get_conn():
return psycopg2.connect(**DB_CONFIG)
def ensure_indexes(conn):
with conn.cursor() as cur:
cur.execute("CREATE INDEX IF NOT EXISTS traducciones_lang_to_status_idx ON traducciones (lang_to, status);")
cur.execute("CREATE INDEX IF NOT EXISTS traducciones_status_idx ON traducciones (status);")
conn.commit()
pass # Moved to translation_ops.py
pass # Moved to translation_ops.py
def fetch_pending_batch(conn, lang_to: str, batch: int):
"""Fetch pending translations with row locking to support multiple workers."""
if batch <= 0:
return []
# Use FOR UPDATE SKIP LOCKED to allow multiple workers
# Each worker will get different rows without conflicts
with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
cur.execute(
"""
SELECT t.id AS tr_id, t.noticia_id, t.lang_from, t.lang_to,
n.titulo, n.resumen
FROM traducciones t
JOIN noticias n ON n.id=t.noticia_id
WHERE t.lang_to=%s AND t.status='pending'
ORDER BY t.id
LIMIT %s
FOR UPDATE OF t SKIP LOCKED;
""",
(lang_to, batch),
)
rows = cur.fetchall()
# Update status within the same transaction while rows are locked
if rows:
ids = [r["tr_id"] for r in rows]
cur.execute("UPDATE traducciones SET status='processing' WHERE id = ANY(%s)", (ids,))
conn.commit()
return rows
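# Note (added for clarity): FOR UPDATE OF t SKIP LOCKED means that when several
# translator workers poll the same table, each SELECT skips rows that another
# worker has locked in its still-open transaction, so two workers never claim
# the same traducciones.id. The flip to status='processing' happens while those
# row locks are still held, then the transaction commits.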
# =========================
# LANGUAGE DETECTION
# =========================
def detect_lang(text1: str, text2: str):
txt = (text1 or "").strip() or (text2 or "").strip()
if not txt:
return None
try:
return detect(txt)
except Exception:
return None
# =========================
# MODEL LOADING (CTranslate2)
# =========================
_TOKENIZER = None
_TRANSLATOR = None
_DEVICE = None
def _resolve_device():
if CT2_DEVICE == "cpu":
return "cpu"
if CT2_DEVICE == "cuda":
return "cuda"
# auto
return "cuda" if ctranslate2.get_cuda_device_count() > 0 else "cpu"
def _ensure_ct2_model():
"""Convert HuggingFace model to CTranslate2 format if not exists."""
import os
import subprocess
model_dir = CT2_MODEL_PATH
# Check if model already exists
if os.path.isdir(model_dir) and os.path.exists(os.path.join(model_dir, "model.bin")):
LOG.info("CTranslate2 model already exists at %s", model_dir)
return True
LOG.info("CTranslate2 model not found, converting from %s...", UNIVERSAL_MODEL)
LOG.info("This may take 5-10 minutes on first run...")
# Create directory if needed
os.makedirs(os.path.dirname(model_dir) or ".", exist_ok=True)
# Convert the model
quantization = CT2_COMPUTE_TYPE if CT2_COMPUTE_TYPE != "auto" else "int8_float16"
cmd = [
"ct2-transformers-converter",
"--model", UNIVERSAL_MODEL,
"--output_dir", model_dir,
"--quantization", quantization,
"--force"
]
try:
LOG.info("Running: %s", " ".join(cmd))
result = subprocess.run(cmd, capture_output=True, text=True, timeout=1800)
if result.returncode != 0:
LOG.error("Model conversion failed: %s", result.stderr)
return False
LOG.info("Model conversion completed successfully")
return True
except subprocess.TimeoutExpired:
LOG.error("Model conversion timed out after 30 minutes")
return False
except Exception as e:
LOG.error("Model conversion error: %s", e)
return False
def get_universal_components():
    global _TOKENIZER, _TRANSLATOR, _DEVICE
    if _TRANSLATOR:
        return _TOKENIZER, _TRANSLATOR
    # Ensure CT2 model exists (convert if needed)
    if not _ensure_ct2_model():
        raise RuntimeError(f"Failed to load/convert CTranslate2 model at {CT2_MODEL_PATH}")
    device = _resolve_device()
    LOG.info("Loading CTranslate2 model from %s on %s", CT2_MODEL_PATH, device)
    _TRANSLATOR = ctranslate2.Translator(
        CT2_MODEL_PATH,
        device=device,
        compute_type=CT2_COMPUTE_TYPE,
    )
    _TOKENIZER = AutoTokenizer.from_pretrained(UNIVERSAL_MODEL)
    _DEVICE = device
    LOG.info("CTranslate2 model loaded successfully")
    return _TOKENIZER, _TRANSLATOR


def get_translator_components():
    # Transformers-pipeline loader used by the newer code path.
    global _tokenizer, _translator, _device
    if _translator:
        return _tokenizer, _translator
    device = 0 if torch.cuda.is_available() else -1
    LOG.info(f"Loading model {UNIVERSAL_MODEL} on {'cuda' if device == 0 else 'cpu'}")
    _tokenizer = AutoTokenizer.from_pretrained(UNIVERSAL_MODEL, src_lang="eng_Latn")
    model = AutoModelForSeq2SeqLM.from_pretrained(UNIVERSAL_MODEL)
    if device == 0:
        model = model.to("cuda")
    _translator = pipeline(
        "translation",
        model=model,
        tokenizer=_tokenizer,
        device=device,
        max_length=MAX_SRC_TOKENS,
    )
    _device = "cuda" if device == 0 else "cpu"
    LOG.info(f"Model loaded on {_device}")
    return _tokenizer, _translator
# =========================
# TRANSLATION (CTranslate2)
# =========================
def _safe_src_len(tokenizer):
max_len = getattr(tokenizer, "model_max_length", 1024) or 1024
if max_len > 100000:
max_len = 1024
return min(MAX_SRC_TOKENS, max_len - 16)
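# Worked example (illustrative): with the NLLB tokenizer reporting
# model_max_length=1024 and the default MAX_SRC_TOKENS=512, this returns
# min(512, 1024 - 16) = 512; a tokenizer reporting an absurdly large max length
# is first clamped to 1024, so the source budget stays at 512 tokens.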
def _translate_texts(src, tgt, texts, beams, max_new_tokens):
    """Translate texts using CTranslate2."""
    if not texts:
        return []
    clean = [(t or "").strip() for t in texts]
    if all(not t for t in clean):
        return ["" for _ in clean]
    tok, translator = get_universal_components()
    src_code = map_to_nllb(src)
    tgt_code = map_to_nllb(tgt)
    # Set source language on tokenizer
    try:
        tok.src_lang = src_code
    except Exception:
        pass
    safe_len = _safe_src_len(tok)
    max_new = max(16, min(int(max_new_tokens), MAX_NEW_TOKENS_BODY))
    # Tokenize: convert text to tokens
    sources = []
    for t in clean:
        if t:
            ids = tok.encode(t, truncation=True, max_length=safe_len)
            tokens = tok.convert_ids_to_tokens(ids)
            sources.append(tokens)
        else:
            sources.append([])
    # Target language prefix for NLLB
    target_prefix = [[tgt_code]] * len(sources)
    # Translate with CTranslate2
    start = time.time()
    results = translator.translate_batch(
        sources,
        target_prefix=target_prefix,
        beam_size=beams,
        max_decoding_length=max_new,
        repetition_penalty=2.5,   # Increased from 1.2 to prevent loops
        no_repeat_ngram_size=3,   # Prevent 3-gram repetition
    )
    dt = time.time() - start
    # Decode results
    translated = []
    total_tokens = 0
    for result, src_tokens in zip(results, sources):
        if result.hypotheses:
            # Skip the first token (language prefix)
            tokens = result.hypotheses[0][1:]
            total_tokens += len(tokens) + len(src_tokens)
            text = tok.decode(tok.convert_tokens_to_ids(tokens))
            translated.append(text.strip())
        else:
            translated.append("")
    if total_tokens > 0:
        LOG.info(" → tokens=%d time=%.2fs speed=%d tok/s",
                 total_tokens, dt, int(total_tokens / dt) if dt > 0 else 0)
    return translated


def translate_texts(src: str, tgt: str, texts: List[str]) -> List[str]:
    # Newer code path: translate via the transformers pipeline.
    if not texts:
        return []
    clean = [(t or "").strip() for t in texts]
    if all(not t for t in clean):
        return ["" for _ in clean]
    tok, translator = get_translator_components()
    src_code = LANG_CODE_MAP.get(src, f"{src}_Latn")
    tgt_code = LANG_CODE_MAP.get(tgt, "spa_Latn")
    results = []
    for text in clean:
        if not text:
            results.append("")
            continue
        try:
            result = translator(text, src_lang=src_code, tgt_lang=tgt_code)
            results.append(result[0]["translation_text"])
        except Exception as e:
            LOG.warning(f"Translation error: {e}")
            results.append(text)
    return results
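# Usage sketch (illustrative, output will vary by model): the CTranslate2 path
# takes the beam and token budget explicitly,
#   _translate_texts("en", "es", ["Markets rally after rate decision"],
#                    NUM_BEAMS_TITLE, MAX_NEW_TOKENS_TITLE)
# while the pipeline path needs only the language pair,
#   translate_texts("en", "es", ["Markets rally after rate decision"])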
def split_body_into_chunks(text: str) -> List[str]:
    text = (text or "").strip()
    if len(text) <= BODY_CHARS_CHUNK:
        return [text] if text else []
    parts = re.split(r'(\n\n+|(?<=[\.\!\?؛؟。])\s+)', text)
    chunks = []
    current = ""
    for part in parts:
        if not part:
            continue
@@ -387,260 +151,145 @@ def _split_body_into_chunks(text: str) -> List[str]:
        current = part
    if current.strip():
        chunks.append(current.strip())
    return chunks if chunks else [text]
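# Behaviour sketch (illustrative): a long article body is split on paragraph
# breaks and sentence-ending punctuation; the branch elided in the hunk above
# accumulates those pieces into chunks of roughly BODY_CHARS_CHUNK characters
# (900 by default). Bodies at or under the limit come back as a one-element list.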
def translate_body_long(src: str, tgt: str, body: str) -> str:
    body = (body or "").strip()
    if not body:
        return ""
    chunks = split_body_into_chunks(body)
    if len(chunks) == 1:
        return translate_texts(src, tgt, [body])[0].strip()
    translated_chunks = []
    for ch in chunks:
        tr = translate_texts(src, tgt, [ch])[0]
        translated_chunks.append(tr.strip())
    return "\n\n".join(c for c in translated_chunks if c)
def normalize_lang(lang: Optional[str], default: str = "es") -> Optional[str]:
if not lang:
return default
lang = lang.strip().lower()[:2]
return lang if lang else default
def detect_lang(text: str) -> str:
if not text or len(text) < 10:
return "en"
try:
return detect(text)
except Exception:
return "en"
# =========================
# BATCH PROCESS
# =========================
def process_batch(conn, rows):
todo = []
done = []
errors = []
for r in rows:
lang_to = normalize_lang(r["lang_to"], "es") or "es"
lang_from = (
normalize_lang(r["lang_from"])
or detect_lang(r["titulo"], r["resumen"])
or "en"
)
titulo = (r["titulo"] or "").strip()
resumen = (r["resumen"] or "").strip()
if map_to_nllb(lang_from) == map_to_nllb(lang_to):
done.append((titulo, resumen, lang_from, r["tr_id"]))
else:
todo.append({
"tr_id": r["tr_id"],
"lang_from": lang_from,
"lang_to": lang_to,
"titulo": titulo,
"resumen": resumen,
})
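# Newer (r.get-based) variant of the same row normalization, kept from the
# transformers-pipeline version of process_batch: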
lang_to = normalize_lang(r.get("lang_to"), "es") or "es"
lang_from = normalize_lang(r.get("lang_from")) or detect_lang(r.get("titulo") or "")
titulo = (r.get("titulo") or "").strip()
resumen = (r.get("resumen") or "").strip()
if lang_from == lang_to:
continue
todo.append({
"tr_id": r.get("tr_id"),
"lang_from": lang_from,
"lang_to": lang_to,
"titulo": titulo,
"resumen": resumen,
})
if not todo:
return
from collections import defaultdict
groups = defaultdict(list)
for item in todo:
key = (item["lang_from"], item["lang_to"])
groups[key].append(item)
for (lang_from, lang_to), items in groups.items():
LOG.info("Translating %s -> %s (%d items)", lang_from, lang_to, len(items))
LOG.info(f"Translating {lang_from} -> {lang_to} ({len(items)} items)")
titles = [i["titulo"] for i in items]
try:
tt = _translate_texts(
lang_from,
lang_to,
titles,
NUM_BEAMS_TITLE,
MAX_NEW_TOKENS_TITLE,
)
bodies_translated: List[str] = []
for i in items:
bodies_translated.append(
translate_body_long(lang_from, lang_to, i["resumen"])
)
for i, ttr, btr in zip(items, tt, bodies_translated):
ttr = (ttr or "").strip()
btr = (btr or "").strip()
if not ttr or _norm(ttr) == _norm(i["titulo"]):
ttr = i["titulo"]
if not btr or _norm(btr) == _norm(i["resumen"]):
btr = i["resumen"]
# CLEANING: Remove <unk> tokens
if ttr:
    ttr = ttr.replace("<unk>", "").replace("  ", " ").strip()
if btr:
    btr = btr.replace("<unk>", "").replace("  ", " ").strip()
# VALIDATION: Check for repetitive output
if _is_repetitive_output(ttr) or _is_repetitive_output(btr):
LOG.warning(f"Rejecting repetitive translation for tr_id={i['tr_id']}")
errors.append(("Repetitive output detected", i["tr_id"]))
continue
done.append((ttr, btr, lang_from, i["tr_id"]))
except Exception as e:
err = str(e)[:800]
LOG.exception("Error translating %s -> %s: %s", lang_from, lang_to, err)
for i in items:
errors.append((err, i["tr_id"]))
with conn.cursor() as cur:
if done:
execute_values(
cur,
"""
UPDATE traducciones AS t
SET titulo_trad=v.titulo_trad,
resumen_trad=v.resumen_trad,
lang_from=COALESCE(t.lang_from, v.lang_from),
status='done',
error=NULL
FROM (VALUES %s) AS v(titulo_trad,resumen_trad,lang_from,id)
WHERE t.id=v.id;
""",
done,
)
# --- Persist stats ---
# Insert one row per translated item into translation_stats.
# The tuples collected in 'done' are (titulo, resumen, lang_from, tr_id) and do
# not carry lang_to, so recover it from the original rows by tr_id.
tr_map = {r["tr_id"]: r["lang_to"] for r in rows}
stats_data = []
for item in done:
# item is (titulo, resumen, lang_from, tr_id)
lang_from = item[2]
lang_to = tr_map.get(item[3], "es")
stats_data.append((lang_from, lang_to))
execute_values(
cur,
"INSERT INTO translation_stats (lang_from, lang_to) VALUES %s",
stats_data
)
# --------------------------
if errors:
execute_values(
cur,
"""
UPDATE traducciones AS t
SET status='error', error=v.error
FROM (VALUES %s) AS v(error,id)
WHERE t.id=v.id;
""",
errors,
)
conn.commit()
def process_entity_summaries(conn):
    """Translate pending entity summaries from Wikipedia."""
    from cache import cache_del
    LOG.info("DEBUG: Checking for pending entity summaries...")
    with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
        cur.execute("""
            SELECT id, entity_name, summary, summary_en
            FROM entity_images
            WHERE status_es = 'pending'
            LIMIT 20
            FOR UPDATE SKIP LOCKED;
        """)
        rows = cur.fetchall()
        if not rows:
            return False
        LOG.info("DEBUG: Found %d pending entity summaries to process", len(rows))
        for r in rows:
            entity_id = r["id"]
            name = r["entity_name"]
            text = r["summary_en"] or r["summary"]
            if not text:
                cur.execute("UPDATE entity_images SET status_es = 'none' WHERE id = %s", (entity_id,))
                continue
            try:
                # English -> Spanish
                translated = translate_body_long('en', 'es', text)
                if translated:
                    cur.execute("""
                        UPDATE entity_images
                        SET summary_es = %s, status_es = 'done'
                        WHERE id = %s
                    """, (translated, entity_id))
                    # Invalidate cache
                    cache_del(f"wiki:data:{name.lower()}")
                    LOG.info(" → Translated entity summary: %s", name)
                else:
                    cur.execute("UPDATE entity_images SET status_es = 'error' WHERE id = %s", (entity_id,))
            except Exception as e:
                LOG.error("Error translating entity summary [%s]: %s", name, e)
                cur.execute("UPDATE entity_images SET status_es = 'error' WHERE id = %s", (entity_id,))
    conn.commit()
    return True

# Continuation of the newer (transformers-pipeline) process_batch: per-group
# translation and row updates, which belong inside its (lang_from, lang_to) loop.
translated_titles = translate_texts(lang_from, lang_to, titles)
translated_bodies = []
for i in items:
    body = (i["resumen"] or "").strip()
    if body:
        tr = translate_body_long(lang_from, lang_to, body)
        translated_bodies.append(tr)
    else:
        translated_bodies.append("")
cursor = conn.cursor()
for item, tt, tb in zip(items, translated_titles, translated_bodies):
    tt = (tt or "").strip()
    tb = (tb or "").strip()
    if not tt:
        tt = item["titulo"]
    if not tb:
        tb = item["resumen"]
    try:
        cursor.execute("""
            UPDATE traducciones
            SET titulo_trad = %s, resumen_trad = %s, lang_to = %s
            WHERE id = %s
        """, (tt, tb, lang_to, item["tr_id"]))
    except Exception as e:
        LOG.error(f"Update error: {e}")
cursor.close()
LOG.info(f"Translated {len(items)} items")
def fetch_pending_translations(conn):
cursor = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
for lang in TARGET_LANGS:
cursor.execute("""
SELECT t.id as tr_id, t.lang_from, t.lang_to,
n.titulo, n.resumen, n.id as noticia_id
FROM traducciones t
JOIN noticias n ON n.id = t.noticia_id
WHERE t.lang_to = %s
AND (t.titulo_trad IS NULL OR t.resumen_trad IS NULL)
ORDER BY n.fecha DESC
LIMIT %s
""", (lang, BATCH_SIZE))
rows = cursor.fetchall()
if rows:
LOG.info(f"Found {len(rows)} pending translations for {lang}")
process_batch(conn, rows)
cursor.close()
def connect_db():
return psycopg2.connect(**DB_CONFIG)
# =========================
# MAIN LOOP
# =========================
def main():
LOG.info("Translator worker iniciado (CTranslate2)")
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
get_universal_components()
LOG.info("Translation worker started (transformers)")
get_translator_components()
while True:
any_work = False
with get_conn() as conn:
ensure_indexes(conn)
# 1. Process entity summaries (Wikipedia) -> REMOVED per user request
# Logic moved out to keep translator focused on news ONLY.
# try:
# if process_entity_summaries(conn):
# any_work = True
# except Exception as e:
# LOG.error("Error in process_entity_summaries: %s", e)
# 2. Process news translations
for tgt in TARGET_LANGS:
while True:
rows = fetch_pending_batch(conn, tgt, BATCH_SIZE)
if not rows:
break
any_work = True
LOG.info("[%s] %d elementos", tgt, len(rows))
process_batch(conn, rows)
if not any_work:
time.sleep(SLEEP_IDLE)
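# Newer polling path (transformers pipeline): each cycle opens a connection and
# processes pending rows via fetch_pending_translations.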
try:
conn = connect_db()
fetch_pending_translations(conn)
conn.close()
except Exception as e:
LOG.error(f"Error: {e}")
time.sleep(30)
if __name__ == "__main__":
main()

View file

@@ -1,471 +0,0 @@
"""
URL Feed Discovery Worker
This worker automatically discovers RSS feeds from URLs stored in fuentes_url table
and creates entries in the feeds table (or feeds_pending for review).
Runs every 15 minutes.
"""
import os
import sys
import time
import logging
from datetime import datetime
from typing import List, Dict
# Add parent directory to path to import modules
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from db import get_conn
from utils.feed_discovery import discover_feeds, get_feed_metadata
from utils.feed_analysis import (
analyze_feed,
get_country_id_by_name,
get_category_id_by_name
)
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
# Configuration
CHECK_INTERVAL = int(os.getenv('URL_DISCOVERY_INTERVAL_MIN', '15')) * 60 # Default: 15 minutes
BATCH_SIZE = int(os.getenv('URL_DISCOVERY_BATCH_SIZE', '10')) # Process URLs in batches
MAX_FEEDS_PER_URL = int(os.getenv('MAX_FEEDS_PER_URL', '5')) # Max feeds to create per URL
def get_pending_urls(limit: int = BATCH_SIZE) -> List[Dict]:
"""
Get URLs that need to be processed.
Priority: never checked > failed checks > oldest successful checks
"""
with get_conn() as conn:
with conn.cursor() as cur:
cur.execute("""
SELECT id, nombre, url, categoria_id, pais_id, idioma, last_status
FROM fuentes_url
WHERE active = TRUE
ORDER BY
CASE
WHEN last_check IS NULL THEN 1 -- Never checked (highest priority)
WHEN last_status = 'error' THEN 2 -- Failed checks
WHEN last_status = 'no_feeds' THEN 3 -- No feeds found
ELSE 4 -- Successful checks (lowest priority)
END,
last_check ASC NULLS FIRST
LIMIT %s
""", (limit,))
columns = [desc[0] for desc in cur.description]
return [dict(zip(columns, row)) for row in cur.fetchall()]
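# Illustrative return shape (values are hypothetical):
#   [{'id': 3, 'nombre': 'Example Site', 'url': 'https://example.com',
#     'categoria_id': None, 'pais_id': 12, 'idioma': 'es', 'last_status': None}]
# Never-checked rows sort first, then previous failures, then stale successes.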
def update_url_status(url_id: int, status: str, message: str = None, http_code: int = None):
"""Update the status of a URL source"""
with get_conn() as conn:
with conn.cursor() as cur:
cur.execute("""
UPDATE fuentes_url
SET last_check = NOW(),
last_status = %s,
status_message = %s,
last_http_code = %s
WHERE id = %s
""", (status, message, http_code, url_id))
conn.commit()
def create_pending_feed(
fuente_url_id: int,
feed_url: str,
metadata: Dict,
analysis: Dict,
categoria_id: int = None,
pais_id: int = None,
idioma: str = None
) -> bool:
"""
Create a pending feed entry for manual review
"""
try:
with get_conn() as conn:
# Get detected country ID
detected_country_id = None
if analysis.get('detected_country'):
detected_country_id = get_country_id_by_name(conn, analysis['detected_country'])
# Get suggested category ID
suggested_categoria_id = None
if analysis.get('suggested_category'):
suggested_categoria_id = get_category_id_by_name(conn, analysis['suggested_category'])
with conn.cursor() as cur:
cur.execute("""
INSERT INTO feeds_pending (
fuente_url_id, feed_url, feed_title, feed_description,
feed_language, feed_type, entry_count,
detected_country_id, suggested_categoria_id,
categoria_id, pais_id, idioma, notes
)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
ON CONFLICT (feed_url) DO UPDATE
SET feed_title = EXCLUDED.feed_title,
feed_description = EXCLUDED.feed_description,
discovered_at = NOW()
RETURNING id
""", (
fuente_url_id,
feed_url,
metadata.get('title', 'Feed sin título'),
metadata.get('description', '')[:500],
analysis.get('language'),
'rss', # Default type
metadata.get('entry_count', 0),
detected_country_id,
suggested_categoria_id,
categoria_id,
pais_id,
idioma,
analysis.get('analysis_notes', '')
))
result = cur.fetchone()
conn.commit()
if result:
logger.info(f"Created pending feed for review: {metadata.get('title')} ({feed_url})")
return True
else:
logger.debug(f"Pending feed updated: {feed_url}")
return False
except Exception as e:
logger.error(f"Error creating pending feed {feed_url}: {e}")
return False
def create_feed_from_metadata(
feed_url: str,
fuente_url_id: int = None,
categoria_id: int = None,
pais_id: int = None,
idioma: str = None,
auto_approve: bool = False,
context_title: str = None
) -> Dict:
"""
Create a feed entry from discovered feed URL with intelligent analysis.
Returns:
{
'created': True/False,
'pending': True/False,
'status': 'created'/'pending'/'existing'/'error',
'message': 'Description'
}
"""
result = {
'created': False,
'pending': False,
'status': 'error',
'message': ''
}
try:
# Get feed metadata
metadata = get_feed_metadata(feed_url, timeout=10)
if not metadata:
result['message'] = 'No se pudo obtener metadata del feed'
logger.warning(f"{result['message']}: {feed_url}")
return result
# Add URL to metadata for analysis
metadata['url'] = feed_url
# Use context title if provided, otherwise use metadata title
# This helps when feed XML title is generic (e.g. "RSS Feed") but site link had meaningful text
feed_title = context_title if context_title else metadata.get('title', 'Feed sin título')
# Update metadata for consistency in pending feeds AND analysis
metadata['title'] = feed_title
# Perform intelligent analysis
analysis = analyze_feed(metadata)
# Determine if we need manual review
needs_review = False
# If parent URL has no category or country, we need review
if not categoria_id or not pais_id:
needs_review = True
logger.info(f"Feed needs review (missing categoria_id={categoria_id} or pais_id={pais_id})")
# If auto_approve is disabled, we need review
if not auto_approve:
needs_review = True
# Enhance metadata with analysis
if not idioma and analysis.get('language'):
idioma = analysis['language']
# If needs review, create pending feed
if needs_review:
created_pending = create_pending_feed(
fuente_url_id=fuente_url_id,
feed_url=feed_url,
metadata=metadata,
analysis=analysis,
categoria_id=categoria_id,
pais_id=pais_id,
idioma=idioma
)
result['pending'] = created_pending
result['status'] = 'pending'
result['message'] = f"Feed creado y pendiente de revisión (país: {analysis.get('detected_country', 'N/A')}, categoría sugerida: {analysis.get('suggested_category', 'N/A')})"
return result
# Otherwise, create feed directly
nombre = feed_title
descripcion = metadata.get('description', '')
with get_conn() as conn:
with conn.cursor() as cur:
cur.execute("""
INSERT INTO feeds (nombre, descripcion, url, categoria_id, pais_id, idioma, fuente_url_id, activo)
VALUES (%s, %s, %s, %s, %s, %s, %s, TRUE)
ON CONFLICT (url) DO NOTHING
RETURNING id
""", (
nombre,
descripcion[:500] if descripcion else None,
feed_url,
categoria_id,
pais_id,
idioma,
fuente_url_id
))
feed_result = cur.fetchone()
conn.commit()
if feed_result:
logger.info(f"Created new feed: {nombre} ({feed_url})")
result['created'] = True
result['status'] = 'created'
result['message'] = f"Feed creado exitosamente"
else:
logger.debug(f"Feed already exists: {feed_url}")
result['status'] = 'existing'
result['message'] = 'El feed ya existe'
except Exception as e:
logger.error(f"Error creating feed from {feed_url}: {e}")
result['message'] = str(e)
result['status'] = 'error'
return result
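# Usage sketch (illustrative, hypothetical values): a feed discovered under a
# source that already carries category and country can be created directly,
#   outcome = create_feed_from_metadata(
#       feed_url="https://example.com/rss.xml",
#       fuente_url_id=3, categoria_id=2, pais_id=12, idioma="es",
#       auto_approve=True)
#   # outcome -> {'created': True, 'pending': False, 'status': 'created', ...}
# Without auto_approve, or without category/country, the feed lands in
# feeds_pending with status 'pending' instead.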
def process_url_source(url_data: Dict) -> Dict:
"""
Process a single URL source to discover and create feeds.
Returns statistics about the operation.
"""
url_id = url_data['id']
source_url = url_data['url']
nombre = url_data['nombre']
categoria_id = url_data['categoria_id']
pais_id = url_data['pais_id']
idioma = url_data['idioma']
logger.info(f"Processing URL source: {nombre} ({source_url})")
logger.info(f" Parent metadata: categoria_id={categoria_id}, pais_id={pais_id}, idioma={idioma}")
stats = {
'url_id': url_id,
'url': source_url,
'discovered': 0,
'created': 0,
'pending': 0,
'existing': 0,
'errors': 0,
'status': 'unknown'
}
try:
# Discover feeds from URL
discovered = discover_feeds(source_url, timeout=15)
stats['discovered'] = len(discovered)
if not discovered:
logger.warning(f"No feeds discovered from: {source_url}")
update_url_status(url_id, 'no_feeds', 'No se encontraron feeds RSS', 200)
stats['status'] = 'no_feeds'
return stats
# Filter only valid feeds
valid_feeds = [f for f in discovered if f.get('valid', False)]
if not valid_feeds:
logger.warning(f"No valid feeds found for: {source_url}")
update_url_status(url_id, 'no_valid_feeds', f'Se encontraron {len(discovered)} feeds pero ninguno válido')
stats['status'] = 'no_valid_feeds'
return stats
# Limit number of feeds per URL
feeds_to_create = valid_feeds[:MAX_FEEDS_PER_URL]
logger.info(f"Found {len(valid_feeds)} valid feeds, processing up to {len(feeds_to_create)}")
# Determine if auto-approve (parent has category AND country)
auto_approve = bool(categoria_id and pais_id)
if not auto_approve:
logger.info("→ Feeds will require manual review (parent lacks category or country)")
else:
logger.info("→ Feeds will be auto-approved (parent has complete metadata)")
# Create feeds
for feed_info in feeds_to_create:
feed_url = feed_info['url']
try:
result = create_feed_from_metadata(
feed_url=feed_url,
fuente_url_id=url_id,
categoria_id=categoria_id,
pais_id=pais_id,
idioma=idioma,
auto_approve=auto_approve,
context_title=feed_info.get('context_label')
)
if result['status'] == 'created':
stats['created'] += 1
elif result['status'] == 'pending':
stats['pending'] += 1
elif result['status'] == 'existing':
stats['existing'] += 1
else:
stats['errors'] += 1
except Exception as e:
logger.error(f"Error creating feed {feed_url}: {e}")
stats['errors'] += 1
# Update URL status
if stats['created'] > 0 or stats['pending'] > 0:
parts = []
if stats['created'] > 0:
parts.append(f"{stats['created']} creados")
if stats['pending'] > 0:
parts.append(f"{stats['pending']} pendientes de revisión")
if stats['existing'] > 0:
parts.append(f"{stats['existing']} ya existían")
message = ", ".join(parts)
update_url_status(url_id, 'success', message, 200)
stats['status'] = 'success'
elif stats['existing'] > 0:
message = f"Todos los {stats['existing']} feeds ya existían"
update_url_status(url_id, 'existing', message, 200)
stats['status'] = 'existing'
else:
message = f"No se pudieron procesar feeds ({stats['errors']} errores)"
update_url_status(url_id, 'error', message)
stats['status'] = 'error'
logger.info(f"Processed {source_url}: {stats['created']} created, {stats['pending']} pending, {stats['existing']} existing, {stats['errors']} errors")
except Exception as e:
logger.error(f"Error processing URL {source_url}: {e}")
update_url_status(url_id, 'error', str(e)[:200])
stats['status'] = 'error'
stats['errors'] += 1
return stats
def process_batch():
"""Process a batch of URL sources"""
logger.info("=" * 80)
logger.info(f"Starting URL discovery batch - {datetime.now().isoformat()}")
# Get pending URLs
urls = get_pending_urls(limit=BATCH_SIZE)
if not urls:
logger.info("No pending URLs to process")
return
logger.info(f"Processing {len(urls)} URL sources")
# Process statistics
total_stats = {
'processed': 0,
'discovered': 0,
'created': 0,
'pending': 0,
'existing': 0,
'errors': 0
}
# Process each URL
for url_data in urls:
stats = process_url_source(url_data)
total_stats['processed'] += 1
total_stats['discovered'] += stats['discovered']
total_stats['created'] += stats['created']
total_stats['pending'] += stats['pending']
total_stats['existing'] += stats['existing']
total_stats['errors'] += stats['errors']
# Small delay between URLs to avoid hammering servers
time.sleep(2)
# Log summary
logger.info("-" * 80)
logger.info(f"Batch complete:")
logger.info(f" - Processed: {total_stats['processed']} URLs")
logger.info(f" - Discovered: {total_stats['discovered']} feeds")
logger.info(f" - Created: {total_stats['created']} new feeds")
logger.info(f" - Pending review: {total_stats['pending']} feeds")
logger.info(f" - Already existing: {total_stats['existing']} feeds")
logger.info(f" - Errors: {total_stats['errors']}")
logger.info("=" * 80)
def main():
"""Main worker loop"""
logger.info("URL Feed Discovery Worker started")
logger.info(f"Check interval: {CHECK_INTERVAL} seconds ({CHECK_INTERVAL // 60} minutes)")
logger.info(f"Batch size: {BATCH_SIZE}")
logger.info(f"Max feeds per URL: {MAX_FEEDS_PER_URL}")
# Run immediately on start
try:
process_batch()
except Exception as e:
logger.error(f"Error in initial batch: {e}", exc_info=True)
# Main loop
while True:
try:
logger.info(f"Waiting {CHECK_INTERVAL} seconds until next batch...")
time.sleep(CHECK_INTERVAL)
process_batch()
except KeyboardInterrupt:
logger.info("Worker stopped by user")
break
except Exception as e:
logger.error(f"Error in main loop: {e}", exc_info=True)
# Wait a bit before retrying to avoid rapid failure loops
time.sleep(60)
if __name__ == "__main__":
main()

View file

@@ -1,125 +0,0 @@
import logging
import hashlib
from datetime import datetime
from newspaper import Article, ArticleException, Config
import requests
from db import get_write_conn, get_read_conn
# Configuration
logger = logging.getLogger("url_worker")
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
def get_active_urls():
"""Get all active URL sources."""
with get_read_conn() as conn:
with conn.cursor() as cur:
cur.execute("""
SELECT id, nombre, url, categoria_id, pais_id, idioma
FROM fuentes_url
WHERE active = true
""")
return cur.fetchall()
def update_source_status(source_id, status, message, http_code=0):
"""Update the status of a URL source."""
with get_write_conn() as conn:
with conn.cursor() as cur:
cur.execute("""
UPDATE fuentes_url
SET last_check = NOW(),
last_status = %s,
status_message = %s,
last_http_code = %s
WHERE id = %s
""", (status, message, http_code, source_id))
conn.commit()
def save_article(source, article):
"""Save the extracted article to the database."""
source_id, source_name, source_url, cat_id, pais_id, lang = source
# Use the article url if possible, otherwise source_url
final_url = article.url or source_url
noticia_id = hashlib.md5(final_url.encode("utf-8")).hexdigest()
with get_write_conn() as conn:
with conn.cursor() as cur:
# Check if exists
cur.execute("SELECT id FROM noticias WHERE id = %s", (noticia_id,))
if cur.fetchone():
return False # Already exists
# Prepare data
title = article.title or "Sin título"
summary = article.summary or article.text[:500]
image_url = article.top_image
pub_date = article.publish_date or datetime.utcnow()
cur.execute("""
INSERT INTO noticias (
id, titulo, resumen, url, fecha, imagen_url,
fuente_nombre, categoria_id, pais_id
) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
ON CONFLICT (id) DO NOTHING
""", (
noticia_id, title, summary, final_url, pub_date, image_url,
source_name, cat_id, pais_id
))
conn.commit()
return True
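# Note (added for clarity): the primary key is the MD5 hex digest of the article
# URL, so re-processing the same URL is a no-op both via the explicit SELECT
# above and the ON CONFLICT (id) DO NOTHING clause.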
def process_url(source):
"""Process a single URL source."""
source_id, name, url, _, _, _ = source
logger.info(f"Processing URL: {url} ({name})")
try:
# Browser-like headers
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
config = Config()
config.browser_user_agent = user_agent
config.request_timeout = 30
article = Article(url, config=config, language='es')
article.download()
if not article.html:
update_source_status(source_id, "ERROR", "No content downloaded (Empty HTML)", 0)
return
article.parse()
try:
article.nlp()
except Exception:
pass
if not article.title:
update_source_status(source_id, "ERROR_PARSE", "Could not extract title (Page might be not an article)", 200)
return
saved = save_article(source, article)
status_msg = "News created successfully" if saved else "News already exists"
update_source_status(source_id, "OK", status_msg, 200)
logger.info(f"Success {url}: {status_msg}")
except ArticleException as ae:
logger.error(f"Newspaper Error {url}: {ae}")
update_source_status(source_id, "ERROR_DOWNLOAD", str(ae)[:200], 0)
except requests.exceptions.RequestException as re:
logger.error(f"Network Error {url}: {re}")
update_source_status(source_id, "ERROR_NETWORK", str(re)[:200], 0)
except Exception as e:
logger.error(f"Unexpected Error {url}: {e}")
update_source_status(source_id, "ERROR_UNKNOWN", str(e)[:200], 500)
def main():
logger.info("Starting URL Worker")
urls = get_active_urls()
logger.info(f"Found {len(urls)} active URLs")
for source in urls:
process_url(source)
if __name__ == "__main__":
main()

View file

@@ -1,31 +0,0 @@
import time
import logging
import sys
from workers.url_worker import main as run_once
# Configure logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
stream=sys.stdout
)
logger = logging.getLogger("url_worker_daemon")
INTERVAL = 300 # 5 minutes
def main():
logger.info("Starting URL Worker Daemon")
logger.info(f"Check interval: {INTERVAL} seconds")
while True:
try:
logger.info("Running job cycle...")
run_once()
logger.info("Cycle completed.")
except Exception as e:
logger.exception(f"Error in job cycle: {e}")
time.sleep(INTERVAL)
if __name__ == "__main__":
main()