Initial clean commit

jlimolina 2026-01-13 13:39:51 +01:00
commit 6784d81c2c
141 changed files with 25219 additions and 0 deletions

1
workers/__init__.py Normal file

@@ -0,0 +1 @@
# Workers package

447
workers/cluster_worker.py Normal file

@@ -0,0 +1,447 @@
import os
import time
import logging
from typing import List, Dict, Any, Optional, Tuple
import numpy as np
import psycopg2
import psycopg2.extras
from psycopg2.extras import Json, execute_values
# -------------------------------------------------------------
# LOGGING
# -------------------------------------------------------------
logging.basicConfig(
level=logging.INFO,
format='[cluster_worker] %(asctime)s %(levelname)s: %(message)s'
)
log = logging.getLogger(__name__)
# -------------------------------------------------------------
# CONFIG
# -------------------------------------------------------------
DB = dict(
host=os.environ.get("DB_HOST", "localhost"),
port=int(os.environ.get("DB_PORT", 5432)),
dbname=os.environ.get("DB_NAME", "rss"),
user=os.environ.get("DB_USER", "rss"),
password=os.environ.get("DB_PASS", "x"),
)
EVENT_LANGS = [
s.strip().lower()
for s in os.environ.get("EVENT_LANGS", "es").split(",")
if s.strip()
]
EVENT_BATCH_IDS = int(os.environ.get("EVENT_BATCH_IDS", "200"))
EVENT_SLEEP_IDLE = float(os.environ.get("EVENT_SLEEP_IDLE", "5.0"))
EVENT_DIST_THRESHOLD = float(os.environ.get("EVENT_DIST_THRESHOLD", "0.25"))
EMB_MODEL = os.environ.get(
"EMB_MODEL",
"sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
)
# -------------------------------------------------------------
# DB CONNECTION
# -------------------------------------------------------------
def get_conn():
return psycopg2.connect(**DB)
# -------------------------------------------------------------
# SCHEMA CHECK
# -------------------------------------------------------------
def ensure_schema(conn):
"""Crea índices si no existen (seguro en producción)."""
with conn.cursor() as cur:
cur.execute("""
CREATE INDEX IF NOT EXISTS idx_traducciones_evento
ON traducciones(evento_id);
""")
cur.execute("""
CREATE INDEX IF NOT EXISTS idx_traducciones_evento_fecha
ON traducciones(evento_id, noticia_id);
""")
conn.commit()
# -------------------------------------------------------------
# FETCH PENDING
# -------------------------------------------------------------
def fetch_pending_traducciones(conn) -> List[int]:
"""Traducciones completadas sin evento asignado pero con embedding."""
with conn.cursor() as cur:
cur.execute(
"""
SELECT t.id
FROM traducciones t
JOIN traduccion_embeddings e
ON e.traduccion_id = t.id
AND e.model = %s
WHERE t.status = 'done'
AND t.evento_id IS NULL
AND t.lang_to = ANY(%s)
ORDER BY t.id DESC
LIMIT %s;
""",
(EMB_MODEL, EVENT_LANGS, EVENT_BATCH_IDS),
)
rows = cur.fetchall()
return [r[0] for r in rows]
# -------------------------------------------------------------
# FETCH EMBEDDINGS
# -------------------------------------------------------------
def fetch_embeddings_for(conn, tr_ids: List[int]) -> Dict[int, np.ndarray]:
"""Obtiene embeddings como vectores float32, validados y normales."""
if not tr_ids:
return {}
with conn.cursor() as cur:
cur.execute(
"""
SELECT traduccion_id, embedding
FROM traduccion_embeddings
WHERE traduccion_id = ANY(%s)
AND model = %s;
""",
(tr_ids, EMB_MODEL),
)
rows = cur.fetchall()
out = {}
for tr_id, emb in rows:
if not emb:
continue
try:
arr = np.asarray(emb, dtype=np.float32)
if arr.ndim != 1 or arr.size == 0:
continue
if np.isnan(arr).any():
continue
norm = np.linalg.norm(arr)
if norm > 0:
arr = arr / norm
out[int(tr_id)] = arr
except Exception:
continue
return out
# -------------------------------------------------------------
# FETCH CENTROIDS (optimized with matrix)
# -------------------------------------------------------------
class CentroidIndex:
"""Índice vectorizado para búsqueda rápida de centroides."""
def __init__(self):
self.centroids: List[Dict[str, Any]] = []
self._matrix: Optional[np.ndarray] = None
self._ids: List[int] = []
def load_from_db(self, conn):
"""Carga centroides de la BD."""
with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
cur.execute("""
SELECT id, centroid, total_traducciones
FROM eventos
ORDER BY id;
""")
rows = cur.fetchall()
self.centroids = []
vectors = []
for r in rows:
raw = r["centroid"]
if not isinstance(raw, list):
continue
try:
arr = np.asarray(raw, dtype=np.float32)
if arr.ndim != 1 or arr.size == 0:
continue
if np.isnan(arr).any():
continue
norm = np.linalg.norm(arr)
if norm > 0:
arr = arr / norm
self.centroids.append({
"id": int(r["id"]),
"vec": arr,
"n": int(r["total_traducciones"] or 1),
})
vectors.append(arr)
except Exception:
continue
# Build matrix for vectorized search
if vectors:
self._matrix = np.vstack(vectors)
self._ids = [c["id"] for c in self.centroids]
else:
self._matrix = None
self._ids = []
def find_nearest(self, vec: np.ndarray) -> Tuple[Optional[int], float]:
"""Encuentra el centroide más cercano usando operaciones vectorizadas."""
if self._matrix is None or len(self.centroids) == 0:
return None, 1.0
# Vectorized cosine similarity: dot product with normalized vectors
similarities = self._matrix @ vec
best_idx = int(np.argmax(similarities))
best_sim = float(similarities[best_idx])
best_dist = 1.0 - max(-1.0, min(1.0, best_sim))
return best_idx, best_dist
def add_centroid(self, evento_id: int, vec: np.ndarray):
"""Añade un nuevo centroide al índice."""
self.centroids.append({"id": evento_id, "vec": vec.copy(), "n": 1})
if self._matrix is None:
self._matrix = vec.reshape(1, -1)
else:
self._matrix = np.vstack([self._matrix, vec])
self._ids.append(evento_id)
def update_centroid(self, idx: int, new_vec: np.ndarray, new_n: int):
"""Actualiza un centroide existente."""
self.centroids[idx]["vec"] = new_vec
self.centroids[idx]["n"] = new_n
if self._matrix is not None:
self._matrix[idx] = new_vec
# -------------------------------------------------------------
# BATCH FETCH TRADUCCION INFO
# -------------------------------------------------------------
def fetch_traducciones_info_batch(conn, tr_ids: List[int]) -> Dict[int, Dict[str, Any]]:
"""Obtiene info de múltiples traducciones en una sola consulta."""
if not tr_ids:
return {}
with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
cur.execute(
"""
SELECT
t.id AS traduccion_id,
t.noticia_id,
n.fecha,
COALESCE(NULLIF(t.titulo_trad,''), n.titulo) AS titulo_evento
FROM traducciones t
JOIN noticias n ON n.id = t.noticia_id
WHERE t.id = ANY(%s);
""",
(tr_ids,),
)
rows = cur.fetchall()
result = {}
for row in rows:
tr_id = int(row["traduccion_id"])
result[tr_id] = {
"traduccion_id": tr_id,
"noticia_id": row["noticia_id"],
"fecha": row["fecha"],
"titulo_evento": row["titulo_evento"] or "",
}
return result
# -------------------------------------------------------------
# BATCH PROCESSING
# -------------------------------------------------------------
def process_batch_optimized(
conn,
pending_ids: List[int],
emb_by_tr: Dict[int, np.ndarray],
centroid_index: CentroidIndex,
) -> int:
"""Procesa un batch completo con operaciones optimizadas."""
# 1. Fetch all traduccion info in one query
infos = fetch_traducciones_info_batch(conn, pending_ids)
# Prepare batch operations
new_eventos = [] # (vec, info) for new eventos
assign_existing = [] # (tr_id, evento_id, idx, vec, info)
assign_new = [] # (tr_id, vec, info) - will get evento_id after insert
processed = 0
for tr_id in pending_ids:
vec = emb_by_tr.get(tr_id)
if vec is None:
continue
info = infos.get(tr_id)
if not info:
continue
processed += 1
if len(centroid_index.centroids) == 0:
# First event ever
assign_new.append((tr_id, vec, info))
else:
best_idx, best_dist = centroid_index.find_nearest(vec)
if best_idx is not None and best_dist <= EVENT_DIST_THRESHOLD:
assign_existing.append((tr_id, centroid_index.centroids[best_idx]["id"], best_idx, vec, info))
else:
assign_new.append((tr_id, vec, info))
with conn.cursor() as cur:
# 2. Insert new eventos (one INSERT ... RETURNING per new event)
new_evento_ids = {}
for tr_id, vec, info in assign_new:
cur.execute(
"""
INSERT INTO eventos (centroid, total_traducciones,
fecha_inicio, fecha_fin, n_noticias, titulo)
VALUES (%s, 1, %s, %s, 1, %s)
RETURNING id;
""",
(
Json(vec.tolist()),
info["fecha"],
info["fecha"],
info["titulo_evento"],
),
)
new_id = cur.fetchone()[0]
new_evento_ids[tr_id] = new_id
centroid_index.add_centroid(new_id, vec)
# 3. Update existing eventos and centroids
for tr_id, evento_id, idx, vec, info in assign_existing:
c = centroid_index.centroids[idx]
n_old = c["n"]
n_new = n_old + 1
new_vec = (c["vec"] * n_old + vec) / float(n_new)
norm = np.linalg.norm(new_vec)
if norm > 0:
new_vec = new_vec / norm
centroid_index.update_centroid(idx, new_vec, n_new)
cur.execute(
"""
UPDATE eventos
SET centroid = %s,
total_traducciones = total_traducciones + 1,
fecha_inicio = LEAST(fecha_inicio, %s),
fecha_fin = GREATEST(fecha_fin, %s),
n_noticias = n_noticias + 1
WHERE id = %s;
""",
(Json(new_vec.tolist()), info["fecha"], info["fecha"], evento_id),
)
# 4. Batch update traducciones.evento_id
trad_updates = []
for tr_id, evento_id, _, _, _ in assign_existing:
trad_updates.append((evento_id, tr_id))
for tr_id, _, _ in assign_new:
trad_updates.append((new_evento_ids[tr_id], tr_id))
if trad_updates:
execute_values(
cur,
"""
UPDATE traducciones AS t
SET evento_id = v.evento_id
FROM (VALUES %s) AS v(evento_id, id)
WHERE t.id = v.id;
""",
trad_updates,
)
# 5. Batch insert eventos_noticias
en_inserts = []
for tr_id, evento_id, _, _, info in assign_existing:
if info.get("noticia_id"):
en_inserts.append((evento_id, info["noticia_id"], info["traduccion_id"]))
for tr_id, _, info in assign_new:
if info.get("noticia_id"):
en_inserts.append((new_evento_ids[tr_id], info["noticia_id"], info["traduccion_id"]))
if en_inserts:
execute_values(
cur,
"""
INSERT INTO eventos_noticias (evento_id, noticia_id, traduccion_id)
VALUES %s
ON CONFLICT DO NOTHING;
""",
en_inserts,
)
return processed
# -------------------------------------------------------------
# MAIN LOOP
# -------------------------------------------------------------
def main():
log.info(
"Iniciando cluster_worker (optimized) langs=%s batch=%d threshold=%.3f emb=%s",
",".join(EVENT_LANGS),
EVENT_BATCH_IDS,
EVENT_DIST_THRESHOLD,
EMB_MODEL,
)
while True:
try:
with get_conn() as conn:
ensure_schema(conn)
pending_ids = fetch_pending_traducciones(conn)
if not pending_ids:
time.sleep(EVENT_SLEEP_IDLE)
continue
emb_by_tr = fetch_embeddings_for(conn, pending_ids)
if not emb_by_tr:
time.sleep(EVENT_SLEEP_IDLE)
continue
# Load centroids with vectorized index
centroid_index = CentroidIndex()
centroid_index.load_from_db(conn)
# Process batch with optimizations
t0 = time.time()
processed = process_batch_optimized(conn, pending_ids, emb_by_tr, centroid_index)
dt = time.time() - t0
conn.commit()
log.info("Cluster OK: %d procesadas en %.2fs (%.1f/s)",
processed, dt, processed / dt if dt > 0 else 0)
except Exception:
log.exception("Error en cluster_worker")
time.sleep(EVENT_SLEEP_IDLE)
if __name__ == "__main__":
main()
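
For reference, a minimal standalone sketch (not part of this commit) of the assignment rule the worker implements: a normalized embedding joins the nearest centroid when the cosine distance is within the threshold, otherwise it opens a new event, and centroids are kept as renormalized running means. All names below are illustrative.

import numpy as np

def assign(vec, centroids, counts, threshold=0.25):
    # Assign a vector to the nearest centroid, or open a new one; return its index.
    vec = vec / np.linalg.norm(vec)
    if centroids:
        sims = np.vstack(centroids) @ vec              # cosine similarity (all unit vectors)
        best = int(np.argmax(sims))
        if 1.0 - sims[best] <= threshold:
            n = counts[best]
            merged = (centroids[best] * n + vec) / (n + 1)   # running mean
            centroids[best] = merged / np.linalg.norm(merged)
            counts[best] = n + 1
            return best
    centroids.append(vec)
    counts.append(1)
    return len(centroids) - 1

cents, ns = [], []
for v in (np.array([1.0, 0.0]), np.array([0.9, 0.1]), np.array([0.0, 1.0])):
    print(assign(v, cents, ns))   # -> 0, 0, 1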


@@ -0,0 +1,267 @@
import os
import time
import logging
from typing import List
import numpy as np
import psycopg2
import psycopg2.extras
from psycopg2.extras import execute_values
import torch
from sentence_transformers import SentenceTransformer
# ================================================================
# Logging
# ================================================================
logging.basicConfig(
level=logging.INFO,
format='[EMB] %(asctime)s %(levelname)s: %(message)s'
)
log = logging.getLogger("embeddings_worker")
# ================================================================
# Configuration
# ================================================================
DB = dict(
host=os.environ.get("DB_HOST", "localhost"),
port=int(os.environ.get("DB_PORT", 5432)),
dbname=os.environ.get("DB_NAME", "rss"),
user=os.environ.get("DB_USER", "rss"),
password=os.environ.get("DB_PASS", "x"),
)
EMB_MODEL = os.environ.get(
"EMB_MODEL",
"sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
)
EMB_BATCH = int(os.environ.get("EMB_BATCH", "128"))
SLEEP_IDLE = float(os.environ.get("EMB_SLEEP_IDLE", "5.0"))
# e.g. "es,en,fr"
EMB_LANGS = [
s.strip()
for s in os.environ.get("EMB_LANGS", "es").split(",")
if s.strip()
]
DEVICE_ENV = os.environ.get("DEVICE", "auto").lower()
EMB_LIMIT = int(os.environ.get("EMB_LIMIT", "1000"))
# ================================================================
# Connection
# ================================================================
def get_conn():
return psycopg2.connect(**DB)
# ================================================================
# Schema (ensure it exists)
# ================================================================
def ensure_schema(conn):
"""
Ensure the embeddings table exists. Idempotent.
"""
with conn.cursor() as cur:
cur.execute(
"""
CREATE TABLE IF NOT EXISTS traduccion_embeddings (
id SERIAL PRIMARY KEY,
traduccion_id INT NOT NULL REFERENCES traducciones(id) ON DELETE CASCADE,
model TEXT NOT NULL,
dim INT NOT NULL,
embedding DOUBLE PRECISION[] NOT NULL,
created_at TIMESTAMP DEFAULT NOW(),
UNIQUE (traduccion_id, model)
);
"""
)
cur.execute("""
CREATE INDEX IF NOT EXISTS idx_tr_emb_model
ON traduccion_embeddings(model);
""")
cur.execute("""
CREATE INDEX IF NOT EXISTS idx_tr_emb_traduccion_id
ON traduccion_embeddings(traduccion_id);
""")
conn.commit()
# ================================================================
# Fetch pending work
# ================================================================
def fetch_batch_pending(conn) -> List[psycopg2.extras.DictRow]:
"""
Fetch translations with status 'done' that do not yet have an
embedding for this model.
"""
with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
cur.execute(
"""
SELECT
t.id AS traduccion_id,
t.lang_to AS lang_to,
COALESCE(NULLIF(t.titulo_trad,''), '') AS titulo_trad,
COALESCE(NULLIF(t.resumen_trad,''), '') AS resumen_trad,
n.id AS noticia_id
FROM traducciones t
JOIN noticias n ON n.id = t.noticia_id
LEFT JOIN traduccion_embeddings e
ON e.traduccion_id = t.id AND e.model = %s
WHERE t.status = 'done'
AND t.lang_to = ANY(%s)
AND e.traduccion_id IS NULL
ORDER BY t.id
LIMIT %s;
""",
(EMB_MODEL, EMB_LANGS, EMB_LIMIT),
)
return cur.fetchall()
# ================================================================
# Text preparation
# ================================================================
def texts_from_rows(rows: List[psycopg2.extras.DictRow]) -> List[str]:
"""
Return combined texts for embedding.
Avoids passing empty text to the model.
"""
texts = []
for r in rows:
title = (r["titulo_trad"] or "").strip()
body = (r["resumen_trad"] or "").strip()
if title and body:
texts.append(f"{title}\n{body}")
else:
texts.append(title or body or "")
return texts
# ================================================================
# Upsert
# ================================================================
def upsert_embeddings(conn, rows, embs: np.ndarray, model_name: str):
"""
Insert or update embeddings in the database.
"""
if embs.size == 0 or not rows:
return
dim = int(embs.shape[1])
data = [
(
int(r["traduccion_id"]),
model_name,
dim,
embs[i].astype(float).tolist(),
)
for i, r in enumerate(rows)
]
with conn.cursor() as cur:
execute_values(
cur,
"""
INSERT INTO traduccion_embeddings
(traduccion_id, model, dim, embedding)
VALUES %s
ON CONFLICT (traduccion_id, model)
DO UPDATE SET
embedding = EXCLUDED.embedding,
dim = EXCLUDED.dim,
created_at = NOW();
""",
data,
)
conn.commit()
# ================================================================
# Load model
# ================================================================
def resolve_device() -> str:
"""
Determine which device to use.
"""
if DEVICE_ENV in ("cpu", "cuda"):
if DEVICE_ENV == "cuda" and not torch.cuda.is_available():
return "cpu"
return DEVICE_ENV
# auto
return "cuda" if torch.cuda.is_available() else "cpu"
def load_model() -> SentenceTransformer:
"""
Load the model, falling back to CPU if CUDA fails.
"""
device = resolve_device()
log.info(f"Cargando modelo {EMB_MODEL} en device={device}")
try:
return SentenceTransformer(EMB_MODEL, device=device)
except Exception as e:
log.error(f"Fallo cargando modelo en {device}: {e}")
if device == "cuda":
log.warning("→ Reintentando en CPU…")
return SentenceTransformer(EMB_MODEL, device="cpu")
raise
# ================================================================
# Main Worker
# ================================================================
def main():
log.info(
f"Iniciando embeddings_worker | model={EMB_MODEL} | batch={EMB_BATCH} | lang={','.join(EMB_LANGS)} | limit={EMB_LIMIT}"
)
model = load_model()
while True:
try:
with get_conn() as conn:
ensure_schema(conn)
rows = fetch_batch_pending(conn)
if not rows:
time.sleep(SLEEP_IDLE)
continue
texts = texts_from_rows(rows)
# Encode
embs = model.encode(
texts,
batch_size=EMB_BATCH,
convert_to_numpy=True,
show_progress_bar=False,
normalize_embeddings=True,
)
# Upsert
upsert_embeddings(conn, rows, embs, EMB_MODEL)
log.info(f"Embeddings generados: {len(rows)}")
except Exception as e:
log.exception(f"Error en embeddings_worker: {e}")
time.sleep(SLEEP_IDLE)
if __name__ == "__main__":
main()
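
As a usage note (illustrative, not part of the commit): because the worker encodes with normalize_embeddings=True, downstream consumers can treat the dot product of two stored vectors as their cosine similarity. A minimal sketch, assuming the default model can be downloaded:

from sentence_transformers import SentenceTransformer

model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
vecs = model.encode(
    ["Título\nCuerpo de la noticia", "Otra noticia distinta"],
    convert_to_numpy=True,
    normalize_embeddings=True,
)
print(vecs.shape)                 # (2, 384) for this model
print(float(vecs[0] @ vecs[1]))   # cosine similarity, since both rows are unit-length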

414
workers/ner_worker.py Normal file

@@ -0,0 +1,414 @@
import os
import time
import logging
import re
import string
from typing import List, Tuple
from collections import Counter
import psycopg2
import psycopg2.extras
import spacy
from bs4 import BeautifulSoup
# ==========================================================
# Logging
# ==========================================================
logging.basicConfig(
level=logging.INFO,
format='[NER] %(asctime)s %(levelname)s: %(message)s'
)
log = logging.getLogger("ner_worker")
# ==========================================================
# Config DB
# ==========================================================
DB = dict(
host=os.environ.get("DB_HOST", "localhost"),
port=int(os.environ.get("DB_PORT", 5432)),
dbname=os.environ.get("DB_NAME", "rss"),
user=os.environ.get("DB_USER", "rss"),
password=os.environ.get("DB_PASS", "x"),
)
NER_LANG = os.environ.get("NER_LANG", "es").strip().lower()
BATCH = int(os.environ.get("NER_BATCH", 64))
# ==========================================================
# Map spaCy entity labels → our SQL model
# ==========================================================
ENT_LABELS = {
"PERSON": "persona",
"PER": "persona",
"ORG": "organizacion",
"GPE": "lugar",
"LOC": "lugar",
"MISC": "tema",
}
# ==========================================================
# Advanced cleanup
# ==========================================================
_ws_re = re.compile(r"\s+")
HTML_TRASH_PATTERNS = [
r"<[^>]+>",
r"&[a-z]+;",
r"&#\d+;?",
r'width="\d+"',
r'height="\d+"',
]
GENERIC_BAD_TAGS = {
"república", "estado", "centro", "gobierno", "el gobierno",
"gobiernos", "report", "sp", "unión", "union", "dólares",
"dolar", "dólar", "the post", "post", "artículo", "el artículo",
"la ciudad", "mundo", "país", "pais", "países", "paises",
"la noche", "la publicación", "este miércoles", "el miércoles",
"hoy", "ayer", "mañana", "servicio", "servicios", "el presidente",
"presidente", "el ministro", "ministro", "la guerra", "guerra",
"seguridad", "wp-content", "internal_photos", "/internal_photos",
"https", "http", "src"
}
STOPWORDS = set()
ARTICLES = {
"el", "la", "los", "las", "un", "una", "uno", "al", "del"
}
# Limits
TOPIC_MIN_CHARS = 4
TOPIC_MAX_WORDS = 6
TOPIC_MAX_PER_DOC = 15
# ==========================================================
# Helpers
# ==========================================================
def get_conn():
return psycopg2.connect(**DB)
def _looks_like_attr_or_path(text_lower: str) -> bool:
"""Filtra basura tipo rutas, html, atributos, URLs, etc."""
if text_lower.startswith("/"):
return True
if "http://" in text_lower or "https://" in text_lower:
return True
if any(ext in text_lower for ext in (".jpg", ".jpeg", ".png", ".gif", ".webp")):
return True
if re.search(r"\b(src|alt|style|class)\s*=", text_lower):
return True
if "data-" in text_lower:
return True
if re.search(r"&#\d+;?", text_lower):
return True
if "=" in text_lower and " " not in text_lower.strip():
return True
# hash-like tokens such as "ajsdh7287sdhjshd8" (only when there are no spaces)
if " " not in text_lower and re.fullmatch(r"[a-z0-9_]{15,}", text_lower.replace("-", "")):
return True
# long hyphenated strings without spaces
if "-" in text_lower and " " not in text_lower:
return True
return False
# ==========================================================
# Entity cleanup
# ==========================================================
def clean_tag_text(text: str) -> str | None:
if not text:
return None
text = BeautifulSoup(text, "html.parser").get_text()
for pat in HTML_TRASH_PATTERNS:
text = re.sub(pat, "", text)
text = _ws_re.sub(" ", text).strip()
text = text.strip(string.punctuation + " ")
if len(text) < 3:
log.debug(f"Clean reject (too short): {text}")
return None
if re.search(r"[<>/\\]", text):
log.debug(f"Clean reject (bad chars): {text}")
return None
lower = text.lower()
if lower.startswith("href="):
log.debug(f"Clean reject (href): {text}")
return None
if _looks_like_attr_or_path(lower):
log.debug(f"Clean reject (attr/path): {text}")
return None
if lower in GENERIC_BAD_TAGS:
log.debug(f"Clean reject (generic bad): {text}")
return None
replacements = {
"ee.uu.": "Estados Unidos",
"los estados unidos": "Estados Unidos",
"eeuu": "Estados Unidos",
"eu": "Unión Europea",
"ue": "Unión Europea",
"kosova": "Kosovo",
# Specific User Requests
"trump": "Donald Trump",
"mr. trump": "Donald Trump",
"mr trump": "Donald Trump",
"doland trump": "Donald Trump",
"el presidente trump": "Donald Trump",
"president trump": "Donald Trump",
"ex-president trump": "Donald Trump",
"expresidente trump": "Donald Trump",
"putin": "Vladimir Putin",
"vladimir putin": "Vladimir Putin",
"v. putin": "Vladimir Putin",
"presidente putin": "Vladimir Putin",
# New requests
"sanchez": "Pedro Sánchez",
"pedro sanchez": "Pedro Sánchez",
"p. sanchez": "Pedro Sánchez",
"mr. sanchez": "Pedro Sánchez",
"sánchez": "Pedro Sánchez", # explicit match just in case
"pedro sánchez": "Pedro Sánchez",
"maduro": "Nicolás Maduro",
"nicolas maduro": "Nicolás Maduro",
"mr. maduro": "Nicolás Maduro",
"lula": "Lula da Silva",
"lula da silva": "Lula da Silva",
"luiz inácio lula da silva": "Lula da Silva",
}
if lower in replacements:
return replacements[lower]
# Blacklist (explicit removals requested)
blacklist = {
"getty images", "netflix", "fiscalia", "segun", "estoy", # People blacklist
"and more", "app", "estamos", "ultra", # Orgs blacklist
"hacienda", "fiscalía"
}
if lower in blacklist:
log.debug(f"Clean reject (blacklist): {text}")
return None
return text
# ==========================================================
# Topic cleanup (noun chunks)
# ==========================================================
def clean_topic_text(text: str) -> str | None:
if not text:
return None
text = BeautifulSoup(text, "html.parser").get_text()
for pat in HTML_TRASH_PATTERNS:
text = re.sub(pat, "", text)
text = _ws_re.sub(" ", text).strip()
text = text.strip(string.punctuation + " ")
if len(text) < TOPIC_MIN_CHARS:
return None
lower = text.lower()
if _looks_like_attr_or_path(lower):
return None
tokens = [
t.strip(string.punctuation)
for t in lower.split()
if t.strip(string.punctuation)
]
if not tokens:
return None
# drop leading articles
if tokens[0] in ARTICLES:
tokens = tokens[1:]
if not tokens:
return None
norm = " ".join(tokens).strip()
if len(norm) < TOPIC_MIN_CHARS:
return None
if norm in GENERIC_BAD_TAGS:
return None
if len(tokens) > TOPIC_MAX_WORDS:
return None
if all(t in STOPWORDS for t in tokens):
return None
if re.fullmatch(r"[0-9\s\.,\-:/]+", norm):
return None
return norm
# ==========================================================
# NER + topic extraction
# ==========================================================
def extract_entities_and_topics(nlp, text: str) -> Tuple[List[Tuple[str, str]], List[Tuple[str, str]]]:
ents = []
topics = []
if not text:
return ents, topics
doc = nlp(text)
# log.debug(f"Processing text ({len(text)} chars): {text[:30]}...")
# log.debug(f"Entities found: {len(doc.ents)}")
# --- ENTITIES ---
for ent in doc.ents:
tipo = ENT_LABELS.get(ent.label_)
if not tipo:
continue
cleaned = clean_tag_text(ent.text)
if not cleaned:
# log.debug(f"Rejected entity: {ent.text} ({ent.label_})")
continue
if tipo == "persona":
lower_cleaned = cleaned.lower()
# Aggressive normalization rules for VIPs
# Use token checks or substring checks carefully
if "trump" in lower_cleaned.split():
# Token 'trump' exists? e.g. "donald trump", "trump", "mr. trump"
# Exclude family members
family = ["ivanka", "melania", "eric", "tiffany", "barron", "lara", "mary", "fred"]
if not any(f in lower_cleaned for f in family):
cleaned = "Donald Trump"
elif "sanchez" in lower_cleaned or "sánchez" in lower_cleaned:
# Other people named Sánchez exist, but in this context Pedro Sánchez is intended.
if "pedro" in lower_cleaned or "presidente" in lower_cleaned or lower_cleaned in ["sanchez", "sánchez"]:
cleaned = "Pedro Sánchez"
elif "maduro" in lower_cleaned:
cleaned = "Nicolás Maduro"
elif "lula" in lower_cleaned:
cleaned = "Lula da Silva"
elif "putin" in lower_cleaned:
cleaned = "Vladimir Putin"
# log.debug(f"Accepted entity: {cleaned} ({tipo})")
ents.append((cleaned, tipo))
# --- TOPICS ---
topic_counter = Counter()
for chunk in doc.noun_chunks:
cleaned = clean_topic_text(chunk.text)
if cleaned:
topic_counter[cleaned] += 1
ent_values = {v for (v, _) in ents}
for val, _count in topic_counter.most_common(TOPIC_MAX_PER_DOC):
if val in ent_values:
continue
topics.append((val, "tema"))
return list(set(ents)), list(set(topics))
# ==========================================================
# Main worker
# ==========================================================
def main():
global STOPWORDS
# Load spaCy
log.info("Cargando modelo spaCy es_core_news_md...")
nlp = spacy.load("es_core_news_md", disable=["lemmatizer", "textcat"])
STOPWORDS = set(nlp.Defaults.stop_words)
log.info("Modelo spaCy cargado correctamente.")
while True:
try:
with get_conn() as conn, conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
cur.execute(
"""
SELECT t.id, t.titulo_trad, t.resumen_trad
FROM traducciones t
WHERE t.status = 'done'
AND t.lang_to = %s
AND NOT EXISTS (
SELECT 1 FROM tags_noticia tn WHERE tn.traduccion_id = t.id
)
ORDER BY t.id DESC
LIMIT %s;
""",
(NER_LANG, BATCH),
)
rows = cur.fetchall()
if not rows:
time.sleep(5)
continue
log.info(f"Procesando {len(rows)} traducciones para NER/temas...")
inserted_links = 0
for r in rows:
text = f"{r['titulo_trad'] or ''}\n{r['resumen_trad'] or ''}".strip()
if not text:
continue
ents, topics = extract_entities_and_topics(nlp, text)
tags = ents + topics
if not tags:
continue
for valor, tipo in tags:
try:
cur.execute(
"""
INSERT INTO tags (valor, tipo)
VALUES (%s, %s)
ON CONFLICT (valor, tipo)
DO UPDATE SET valor = EXCLUDED.valor
RETURNING id;
""",
(valor, tipo),
)
tag_id = cur.fetchone()[0]
cur.execute(
"""
INSERT INTO tags_noticia (traduccion_id, tag_id)
VALUES (%s, %s)
ON CONFLICT DO NOTHING;
""",
(r["id"], tag_id),
)
if cur.rowcount > 0:
inserted_links += 1
except Exception:
log.exception("Error insertando tag/relación")
conn.commit()
log.info(f"Lote NER OK. Nuevas relaciones tag_noticia: {inserted_links}")
except Exception:
log.exception("Error general en NER loop")
time.sleep(5)
if __name__ == "__main__":
main()
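
The VIP normalization above can be illustrated in isolation (a sketch with made-up inputs; it mirrors, but does not replace, the rules in extract_entities_and_topics):

FAMILY = {"ivanka", "melania", "eric", "tiffany", "barron", "lara", "mary", "fred"}

def normalize_person(name: str) -> str:
    # Collapse common variants to a canonical form, leaving family members untouched.
    low = name.lower()
    if "trump" in low.split() and not any(f in low for f in FAMILY):
        return "Donald Trump"
    if "putin" in low:
        return "Vladimir Putin"
    if "maduro" in low:
        return "Nicolás Maduro"
    return name

for n in ("Mr. Trump", "Ivanka Trump", "presidente Putin", "Nicolas Maduro"):
    print(n, "->", normalize_person(n))
# Mr. Trump -> Donald Trump, Ivanka Trump -> Ivanka Trump,
# presidente Putin -> Vladimir Putin, Nicolas Maduro -> Nicolás Maduro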

334
workers/qdrant_worker.py Normal file

@@ -0,0 +1,334 @@
"""
Qdrant worker
Vectorizes translated news items and uploads them to Qdrant for semantic search.
"""
import os
import sys
import time
import uuid
from datetime import datetime
from typing import List, Dict, Any
# Add the project root to the path
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from db import get_read_conn, get_write_conn
try:
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct, Filter, FieldCondition, MatchValue
except ImportError:
print("❌ Error: qdrant-client no instalado. Ejecuta: pip install qdrant-client")
sys.exit(1)
try:
from sentence_transformers import SentenceTransformer
except ImportError:
print("❌ Error: sentence-transformers no instalado")
sys.exit(1)
# Configuration
QDRANT_HOST = os.environ.get("QDRANT_HOST", "localhost")
QDRANT_PORT = int(os.environ.get("QDRANT_PORT", "6333"))
QDRANT_COLLECTION = os.environ.get("QDRANT_COLLECTION_NAME", "news_vectors")
EMB_MODEL = os.environ.get("EMB_MODEL", "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
EMB_DEVICE = os.environ.get("EMB_DEVICE", "cuda")
BATCH_SIZE = int(os.environ.get("QDRANT_BATCH_SIZE", "100"))
SLEEP_IDLE = int(os.environ.get("QDRANT_SLEEP_IDLE", "30"))
# Global Qdrant client
qdrant_client = None
embedding_model = None
def init_qdrant_client():
"""
Initialize the Qdrant client and create the collection if it does not exist.
"""
global qdrant_client
print(f"🔌 Conectando a Qdrant en {QDRANT_HOST}:{QDRANT_PORT}...")
qdrant_client = QdrantClient(host=QDRANT_HOST, port=QDRANT_PORT)
# Check whether the collection exists
collections = qdrant_client.get_collections().collections
collection_names = [c.name for c in collections]
if QDRANT_COLLECTION not in collection_names:
print(f"📦 Creando colección '{QDRANT_COLLECTION}'...")
# Embedding model dimension
# paraphrase-multilingual-MiniLM-L12-v2 = 384 dimensions
vector_size = 384
qdrant_client.create_collection(
collection_name=QDRANT_COLLECTION,
vectors_config=VectorParams(
size=vector_size,
distance=Distance.COSINE
)
)
print(f"✅ Colección '{QDRANT_COLLECTION}' creada (dimensión: {vector_size})")
else:
print(f"✅ Colección '{QDRANT_COLLECTION}' ya existe")
# Fetch collection info
collection_info = qdrant_client.get_collection(QDRANT_COLLECTION)
print(f"📊 Puntos en colección: {collection_info.points_count}")
def init_embedding_model():
"""
Initialize the embedding model.
"""
global embedding_model
print(f"🤖 Cargando modelo de embeddings: {EMB_MODEL}")
print(f"🖥️ Dispositivo: {EMB_DEVICE}")
embedding_model = SentenceTransformer(EMB_MODEL, device=EMB_DEVICE)
print(f"✅ Modelo cargado correctamente")
def get_pending_news(limit: int = BATCH_SIZE) -> List[Dict[str, Any]]:
"""
Fetch translated news items pending vectorization.
Args:
limit: Maximum number of items to fetch
Returns:
List of news items
"""
with get_read_conn() as conn:
with conn.cursor() as cur:
cur.execute("""
SELECT
t.id as traduccion_id,
t.noticia_id,
t.lang_to as lang,
t.titulo_trad as titulo,
t.resumen_trad as resumen,
n.url,
n.fecha,
n.fuente_nombre,
n.categoria_id,
n.pais_id
FROM traducciones t
INNER JOIN noticias n ON t.noticia_id = n.id
WHERE t.vectorized = FALSE
AND t.status = 'done'
ORDER BY t.created_at ASC
LIMIT %s
""", (limit,))
columns = [desc[0] for desc in cur.description]
results = []
for row in cur.fetchall():
results.append(dict(zip(columns, row)))
return results
def generate_embeddings(texts: List[str]) -> List[List[float]]:
"""
Generate embeddings for a list of texts.
Args:
texts: List of texts
Returns:
List of embedding vectors
"""
embeddings = embedding_model.encode(
texts,
batch_size=32,
show_progress_bar=False,
convert_to_numpy=True
)
return embeddings.tolist()
def upload_to_qdrant(news_batch: List[Dict[str, Any]]):
"""
Upload a batch of news items to Qdrant.
Args:
news_batch: List of news items
"""
if not news_batch:
return
# Prepare texts for embeddings (title + summary)
texts = [
f"{news['titulo']} {news['resumen']}"
for news in news_batch
]
print(f" 🧮 Generando embeddings para {len(texts)} noticias...")
embeddings = generate_embeddings(texts)
# Build Qdrant points
points = []
for news, embedding in zip(news_batch, embeddings):
point_id = str(uuid.uuid4())
# Build the payload (metadata)
payload = {
"news_id": news['noticia_id'],
"traduccion_id": news['traduccion_id'],
"titulo": news['titulo'],
"resumen": news['resumen'],
"url": news['url'],
"fecha": news['fecha'].isoformat() if news['fecha'] else None,
"fuente_nombre": news['fuente_nombre'],
"categoria_id": news['categoria_id'],
"pais_id": news['pais_id'],
"lang": news['lang']
}
point = PointStruct(
id=point_id,
vector=embedding,
payload=payload
)
points.append(point)
# Keep the point_id to update the DB
news['qdrant_point_id'] = point_id
# Upload to Qdrant
print(f" ⬆️ Subiendo {len(points)} puntos a Qdrant...")
qdrant_client.upsert(
collection_name=QDRANT_COLLECTION,
points=points
)
# Update the database
print(f" 💾 Actualizando estado en PostgreSQL...")
with get_write_conn() as conn:
with conn.cursor() as cur:
for news in news_batch:
cur.execute("""
UPDATE traducciones
SET
vectorized = TRUE,
vectorization_date = NOW(),
qdrant_point_id = %s
WHERE id = %s
""", (news['qdrant_point_id'], news['traduccion_id']))
conn.commit()
print(f" ✅ Lote subido correctamente")
def process_batch():
"""
Process a batch of translated news items.
Returns:
Number of items processed
"""
news_batch = get_pending_news()
if not news_batch:
return 0
print(f"\n📋 Procesando {len(news_batch)} noticias traducidas...")
try:
upload_to_qdrant(news_batch)
return len(news_batch)
except Exception as e:
print(f"❌ Error procesando lote: {e}")
return 0
def get_stats():
"""
Fetch system statistics.
"""
with get_read_conn() as conn:
with conn.cursor() as cur:
cur.execute("""
SELECT
COUNT(*) as total,
COUNT(*) FILTER (WHERE vectorized = TRUE) as vectorizadas,
COUNT(*) FILTER (WHERE vectorized = FALSE AND status = 'done') as pendientes
FROM traducciones
WHERE lang_to = 'es'
""")
row = cur.fetchone()
return {
'total': row[0],
'vectorizadas': row[1],
'pendientes': row[2]
}
def main():
"""
Main worker loop.
"""
print("=" * 80)
print("🚀 Qdrant Vectorization Worker (Direct Translation)")
print("=" * 80)
print(f"Qdrant: {QDRANT_HOST}:{QDRANT_PORT}")
print(f"Colección: {QDRANT_COLLECTION}")
print(f"Modelo: {EMB_MODEL}")
print(f"Dispositivo: {EMB_DEVICE}")
print(f"Tamaño de lote: {BATCH_SIZE}")
print("=" * 80)
# Initialize Qdrant
try:
init_qdrant_client()
except Exception as e:
print(f"❌ Error inicializando Qdrant: {e}")
print("⚠️ Asegúrate de que Qdrant esté corriendo")
return
# Initialize the embedding model
try:
init_embedding_model()
except Exception as e:
print(f"❌ Error cargando modelo de embeddings: {e}")
return
print("\n🔄 Iniciando loop de procesamiento...\n")
total_processed = 0
while True:
try:
processed = process_batch()
total_processed += processed
if processed > 0:
print(f"\n✅ Lote completado: {processed} noticias vectorizadas")
print(f"📊 Total procesado en esta sesión: {total_processed}")
# Show statistics
stats = get_stats()
print(f"📈 Estadísticas globales:")
print(f" Total traducciones: {stats['total']}")
print(f" Vectorizadas: {stats['vectorizadas']}")
print(f" Pendientes: {stats['pendientes']}")
else:
print(f"💤 No hay noticias pendientes. Esperando {SLEEP_IDLE}s...")
time.sleep(SLEEP_IDLE)
except KeyboardInterrupt:
print("\n\n⏹️ Worker detenido por el usuario")
break
except Exception as e:
print(f"\n❌ Error en loop principal: {e}")
print(f"⏳ Esperando {SLEEP_IDLE}s antes de reintentar...")
time.sleep(SLEEP_IDLE)
if __name__ == "__main__":
main()
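
Once the worker has filled the collection, a reader can query it for semantically similar news. A hedged sketch (not part of this commit; host, port and collection name are the defaults assumed above, and the search call may differ slightly across qdrant-client versions):

from qdrant_client import QdrantClient
from sentence_transformers import SentenceTransformer

client = QdrantClient(host="localhost", port=6333)
model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

# Encode a free-text query with the same model used for indexing.
query_vec = model.encode("elecciones en Estados Unidos").tolist()
hits = client.search(collection_name="news_vectors", query_vector=query_vec, limit=5)
for hit in hits:
    print(round(hit.score, 3), hit.payload.get("titulo"))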

202
workers/related_worker.py Normal file

@@ -0,0 +1,202 @@
import os
import time
import logging
from typing import List, Tuple
import numpy as np
import psycopg2
import psycopg2.extras
logging.basicConfig(
level=logging.INFO,
format='[related] %(asctime)s %(levelname)s: %(message)s'
)
DB = dict(
host=os.environ.get("DB_HOST", "localhost"),
port=int(os.environ.get("DB_PORT", 5432)),
dbname=os.environ.get("DB_NAME", "rss"),
user=os.environ.get("DB_USER", "rss"),
password=os.environ.get("DB_PASS", "x"),
)
EMB_MODEL = os.environ.get(
"EMB_MODEL",
"sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
)
TOPK = int(os.environ.get("RELATED_TOPK", 10))
BATCH_IDS = int(os.environ.get("RELATED_BATCH_IDS", 200))
SLEEP_IDLE = float(os.environ.get("RELATED_SLEEP", 10))
MIN_SCORE = float(os.environ.get("RELATED_MIN_SCORE", 0.0))
WINDOW_HOURS = int(os.environ.get("RELATED_WINDOW_H", 0))
def get_conn():
return psycopg2.connect(**DB)
def fetch_all_embeddings(cur) -> Tuple[List[int], np.ndarray]:
sql = """
SELECT e.traduccion_id, e.embedding, n.fecha
FROM traduccion_embeddings e
JOIN traducciones t ON t.id = e.traduccion_id
JOIN noticias n ON n.id = t.noticia_id
WHERE e.model = %s
AND t.status = 'done'
AND t.lang_to = 'es'
"""
params = [EMB_MODEL]
if WINDOW_HOURS > 0:
sql += " AND n.fecha >= NOW() - INTERVAL %s"
params.append(f"{WINDOW_HOURS} hours")
cur.execute(sql, params)
rows = cur.fetchall()
if not rows:
return [], None
ids = []
vecs = []
for tr_id, emb, _ in rows:
if not emb:
continue
arr = np.asarray(emb, dtype=np.float32)
if arr.ndim != 1 or arr.size == 0:
continue
ids.append(tr_id)
vecs.append(arr)
if not ids:
return [], None
mat = np.vstack(vecs)
norms = np.linalg.norm(mat, axis=1, keepdims=True)
norms[norms == 0] = 1e-8
mat = mat / norms
return ids, mat
def fetch_pending_ids(cur, limit) -> List[int]:
cur.execute(
"""
SELECT t.id
FROM traducciones t
JOIN traduccion_embeddings e
ON e.traduccion_id = t.id AND e.model = %s
LEFT JOIN related_noticias r
ON r.traduccion_id = t.id
WHERE t.lang_to = 'es'
AND t.status = 'done'
GROUP BY t.id
HAVING COUNT(r.related_traduccion_id) = 0
ORDER BY t.id DESC
LIMIT %s;
""",
(EMB_MODEL, limit),
)
return [r[0] for r in cur.fetchall()]
def topk(idx: int, ids_all: List[int], mat: np.ndarray, K: int) -> List[Tuple[int, float]]:
q = mat[idx]
sims = np.dot(mat, q)
sims[idx] = -999.0
if MIN_SCORE > 0:
mask = sims >= MIN_SCORE
sims = np.where(mask, sims, -999.0)
if K >= len(sims):
top_idx = np.argsort(-sims)
else:
part = np.argpartition(-sims, K)[:K]
top_idx = part[np.argsort(-sims[part])]
return [(ids_all[j], float(sims[j])) for j in top_idx[:K]]
def insert_related(cur, tr_id: int, pairs):
clean = []
for rid, score in pairs:
if rid == tr_id:
continue
s = float(score)
if s <= 0:
continue
clean.append((tr_id, rid, s))
if not clean:
return
psycopg2.extras.execute_values(
cur,
"""
INSERT INTO related_noticias (traduccion_id, related_traduccion_id, score)
VALUES %s
ON CONFLICT (traduccion_id, related_traduccion_id)
DO UPDATE SET score = EXCLUDED.score;
""",
clean,
)
def build_for_ids(conn, target_ids: List[int]) -> int:
with conn.cursor() as cur:
ids_all, mat = fetch_all_embeddings(cur)
if not ids_all or mat is None:
return 0
pos = {tid: i for i, tid in enumerate(ids_all)}
processed = 0
with conn.cursor() as cur:
for tr_id in target_ids:
if tr_id not in pos:
continue
idx = pos[tr_id]
pairs = topk(idx, ids_all, mat, TOPK)
insert_related(cur, tr_id, pairs)
processed += 1
conn.commit()
return processed
def main():
logging.info(
"Iniciando related_worker (EMB=%s TOPK=%s BATCH=%s MIN=%.3f WINDOW_H=%s)",
EMB_MODEL,
TOPK,
BATCH_IDS,
MIN_SCORE,
WINDOW_HOURS,
)
while True:
try:
with get_conn() as conn, conn.cursor() as cur:
todo = fetch_pending_ids(cur, BATCH_IDS)
if not todo:
time.sleep(SLEEP_IDLE)
continue
with get_conn() as conn:
done = build_for_ids(conn, todo)
logging.info("Relacionadas generadas/actualizadas para %d traducciones.", done)
except Exception:
logging.exception("Error en related_worker")
time.sleep(SLEEP_IDLE)
if __name__ == "__main__":
main()
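
A small self-contained sketch of the top-K selection used in topk(): argpartition finds the K most similar rows in linear time, and only those K are then sorted (toy data, illustrative only):

import numpy as np

mat = np.array([[1.0, 0.0], [0.8, 0.6], [0.0, 1.0], [0.6, 0.8]])
mat /= np.linalg.norm(mat, axis=1, keepdims=True)   # row-normalize
q = mat[0]
sims = mat @ q
sims[0] = -999.0                       # exclude the query itself
K = 2
part = np.argpartition(-sims, K)[:K]   # unordered indices of the K largest sims
top = part[np.argsort(-sims[part])]    # order those K by similarity
print(top, sims[top])                  # -> [1 3] [0.8 0.6]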

244
workers/topics_worker.py Normal file

@@ -0,0 +1,244 @@
import os
import time
import logging
import json
import psycopg2
from psycopg2.extras import execute_values
# Logging
logging.basicConfig(
level=logging.INFO,
format='[topics_worker] %(asctime)s %(levelname)s: %(message)s'
)
log = logging.getLogger(__name__)
# Config
DB_CONFIG = {
"host": os.environ.get("DB_HOST", "localhost"),
"port": int(os.environ.get("DB_PORT", 5432)),
"dbname": os.environ.get("DB_NAME", "rss"),
"user": os.environ.get("DB_USER", "rss"),
"password": os.environ.get("DB_PASS", "x"),
}
SLEEP_IDLE = 10
BATCH_SIZE = 500
def get_conn():
return psycopg2.connect(**DB_CONFIG)
def load_topics(conn):
"""
Load topics and their keywords.
Returns list of dicts: [{'id': 1, 'weight': 5, 'keywords': ['foo', 'bar']}]
"""
with conn.cursor() as cur:
cur.execute("SELECT id, weight, keywords FROM topics")
rows = cur.fetchall()
topics = []
for r in rows:
tid, weight, kw_str = r
if not kw_str:
continue
# Keywords are comma separated based on insert script
kws = [k.strip().lower() for k in kw_str.split(",") if k.strip()]
topics.append({
"id": tid,
"weight": weight,
"keywords": kws
})
return topics
def load_countries(conn):
"""
Load countries.
Returns list: [{'id': 1, 'name': 'España', 'keywords': ['españa', 'madrid']}]
"""
with conn.cursor() as cur:
cur.execute("SELECT id, nombre FROM paises")
rows = cur.fetchall()
countries = []
# Hardcoded aliases for simplicity. A separate table would be better.
ALIASES = {
"Estados Unidos": ["eeuu", "ee.uu.", "usa", "estadounidense", "washington"],
"Rusia": ["ruso", "rusa", "moscú", "kremlin"],
"China": ["chino", "china", "pekin", "beijing"],
"Ucrania": ["ucraniano", "kiev", "kyiv"],
"Israel": ["israelí", "tel aviv", "jerusalén"],
"España": ["español", "madrid"],
"Reino Unido": ["uk", "londres", "británico"],
"Francia": ["francés", "parís"],
"Alemania": ["alemán", "berlín"],
"Palestina": ["palestino", "gaza", "cisjordania"],
"Irán": ["iraní", "teherán"],
}
for r in rows:
cid, name = r
kws = [name.lower()]
if name in ALIASES:
kws.extend(ALIASES[name])
countries.append({"id": cid, "name": name, "keywords": kws})
return countries
def process_batch(conn, topics, countries):
"""
Fetch batch of processed=False news.
Match against topics AND countries.
Insert into news_topics.
Mark processed.
"""
with conn.cursor() as cur:
# Fetch news
cur.execute("""
SELECT id, titulo, resumen
FROM noticias
WHERE topics_processed = FALSE
ORDER BY fecha DESC
LIMIT %s
""", (BATCH_SIZE,))
news_items = cur.fetchall()
if not news_items:
return 0
inserts = [] # (noticia_id, topic_id, score)
processed_ids = []
# Batch updates for pais_id
country_updates = [] # (pais_id, noticia_id)
for item in news_items:
nid, titulo, resumen = item
text = (titulo or "") + " " + (resumen or "")
text_lower = text.lower()
# 1. Match Topics
for topic in topics:
matched_count = 0
for kw in topic["keywords"]:
if kw in text_lower:
matched_count += 1
if matched_count > 0:
score = topic["weight"] * matched_count
inserts.append((nid, topic["id"], score))
# 2. Match Country (Find best match)
best_country = None
# Simple heuristic: pick the country with the most keyword matches.
max_matches = 0
for c in countries:
matches = 0
for kw in c["keywords"]:
# Simple substring matching; could be improved with regex word boundaries.
if kw in text_lower:
matches += 1
if matches > max_matches:
max_matches = matches
best_country = c["id"]
if best_country:
country_updates.append((best_country, nid))
processed_ids.append(nid)
with conn.cursor() as cur:
# Insert relations
if inserts:
execute_values(cur, """
INSERT INTO news_topics (noticia_id, topic_id, score)
VALUES %s
ON CONFLICT (noticia_id, topic_id) DO UPDATE SET score = EXCLUDED.score
""", inserts)
# Update Countries
if country_updates:
execute_values(cur, """
UPDATE noticias AS n
SET pais_id = v.pais_id
FROM (VALUES %s) AS v(pais_id, noticia_id)
WHERE n.id = v.noticia_id
""", country_updates)
# Mark processed
cur.execute("""
UPDATE noticias
SET topics_processed = TRUE
WHERE id = ANY(%s)
""", (processed_ids,))
conn.commit()
return len(news_items)
def initialize_schema(conn):
"""
Ensure required tables and columns exist.
"""
log.info("Checking/Initializing schema...")
with conn.cursor() as cur:
cur.execute("""
CREATE TABLE IF NOT EXISTS topics (
id SERIAL PRIMARY KEY,
slug VARCHAR(50) UNIQUE NOT NULL,
name VARCHAR(100) NOT NULL,
weight INTEGER DEFAULT 1,
keywords TEXT,
group_name VARCHAR(50)
);
CREATE TABLE IF NOT EXISTS news_topics (
noticia_id VARCHAR(32) REFERENCES noticias(id) ON DELETE CASCADE,
topic_id INTEGER REFERENCES topics(id) ON DELETE CASCADE,
score INTEGER DEFAULT 0,
created_at TIMESTAMP DEFAULT NOW(),
PRIMARY KEY (noticia_id, topic_id)
);
ALTER TABLE noticias ADD COLUMN IF NOT EXISTS topics_processed BOOLEAN DEFAULT FALSE;
""")
conn.commit()
log.info("Schema OK.")
def main():
log.info("Starting topics_worker...")
# Run migrations once at startup
try:
with get_conn() as conn:
initialize_schema(conn)
except Exception as e:
log.error(f"Error during schema initialization: {e}")
# We might want to exit here if the schema is crucial
# sys.exit(1)
while True:
try:
with get_conn() as conn:
topics = load_topics(conn)
if not topics:
log.warning("No topics found in DB. Sleeping.")
time.sleep(SLEEP_IDLE)
continue
# Load countries
countries = load_countries(conn)
count = process_batch(conn, topics, countries)
if count < BATCH_SIZE:
time.sleep(SLEEP_IDLE)
else:
log.info(f"Processed {count} items.")
except Exception as e:
log.exception("Error in topics_worker")
time.sleep(SLEEP_IDLE)
if __name__ == "__main__":
main()
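
The matching rule in process_batch can be shown on made-up data (illustrative only): a topic's score is its weight times the number of keyword hits, and the country with the most alias hits wins, if any:

topics = [
    {"id": 1, "weight": 5, "keywords": ["elecciones", "voto"]},
    {"id": 2, "weight": 2, "keywords": ["economía", "inflación"]},
]
countries = [
    {"id": 10, "name": "España", "keywords": ["españa", "madrid"]},
    {"id": 11, "name": "Francia", "keywords": ["francia", "parís"]},
]
text = "Elecciones en Madrid: el voto y la inflación marcan la campaña".lower()

scores = {t["id"]: t["weight"] * sum(kw in text for kw in t["keywords"]) for t in topics}
hits = {c["id"]: sum(kw in text for kw in c["keywords"]) for c in countries}
best_country = max(hits, key=hits.get) if max(hits.values()) > 0 else None
print(scores, best_country)   # {1: 10, 2: 2} 10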


@@ -0,0 +1,599 @@
import os
import time
import logging
import re
from typing import List, Optional
import psycopg2
import psycopg2.extras
from psycopg2.extras import execute_values
import ctranslate2
from transformers import AutoTokenizer
from langdetect import detect, DetectorFactory
DetectorFactory.seed = 0
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s: %(message)s")
LOG = logging.getLogger("translator")
# =========================
# DB CONFIG
# =========================
DB_CONFIG = {
"host": os.environ.get("DB_HOST", "localhost"),
"port": int(os.environ.get("DB_PORT", 5432)),
"dbname": os.environ.get("DB_NAME", "rss"),
"user": os.environ.get("DB_USER", "rss"),
"password": os.environ.get("DB_PASS", "x"),
}
# =========================
# ENV HELPERS
# =========================
def _env_list(name: str, default="es"):
raw = os.environ.get(name)
if raw:
return [s.strip() for s in raw.split(",") if s.strip()]
return [default]
def _env_int(name: str, default: int = 8):
v = os.environ.get(name)
try:
return int(v)
except Exception:
return default
def _env_float(name: str, default: float = 5.0):
v = os.environ.get(name)
try:
return float(v)
except Exception:
return default
def _env_str(name: str, default=None):
v = os.environ.get(name)
return v if v else default
# =========================
# CONFIG
# =========================
TARGET_LANGS = _env_list("TARGET_LANGS")  # defaults to ["es"]
BATCH_SIZE = _env_int("TRANSLATOR_BATCH", 8)
ENQUEUE_MAX = _env_int("ENQUEUE", 200)
SLEEP_IDLE = _env_float("TRANSLATOR_SLEEP_IDLE", 5.0)
# CTranslate2 Configuration
CT2_MODEL_PATH = _env_str("CT2_MODEL_PATH", "./models/nllb-ct2")
CT2_DEVICE = _env_str("CT2_DEVICE", "auto") # auto, cpu, cuda
CT2_COMPUTE_TYPE = _env_str("CT2_COMPUTE_TYPE", "auto") # auto, int8, float16, int8_float16
MAX_SRC_TOKENS = _env_int("MAX_SRC_TOKENS", 512)
MAX_NEW_TOKENS_TITLE = _env_int("MAX_NEW_TOKENS_TITLE", 96)
MAX_NEW_TOKENS_BODY = _env_int("MAX_NEW_TOKENS_BODY", 512)
NUM_BEAMS_TITLE = _env_int("NUM_BEAMS_TITLE", 2)
NUM_BEAMS_BODY = _env_int("NUM_BEAMS_BODY", 1)
# HuggingFace model name (used for tokenizer)
UNIVERSAL_MODEL = _env_str("UNIVERSAL_MODEL", "facebook/nllb-200-distilled-600M")
IDENTITY_PAISES_ES = _env_int("IDENTITY_PAISES_ES", 58)
BODY_CHARS_CHUNK = _env_int("BODY_CHARS_CHUNK", 900)
# =========================
# LANG MAP
# =========================
NLLB_LANG = {
"es": "spa_Latn", "en": "eng_Latn", "fr": "fra_Latn", "de": "deu_Latn",
"it": "ita_Latn", "pt": "por_Latn", "nl": "nld_Latn", "sv": "swe_Latn",
"da": "dan_Latn", "fi": "fin_Latn", "no": "nob_Latn",
"pl": "pol_Latn", "cs": "ces_Latn", "sk": "slk_Latn",
"sl": "slv_Latn", "hu": "hun_Latn", "ro": "ron_Latn",
"el": "ell_Grek", "ru": "rus_Cyrl", "uk": "ukr_Cyrl",
"tr": "tur_Latn", "ar": "arb_Arab", "fa": "pes_Arab",
"he": "heb_Hebr", "zh": "zho_Hans", "ja": "jpn_Jpan",
"ko": "kor_Hang", "vi": "vie_Latn",
}
def map_to_nllb(code: Optional[str]):
if not code:
return None
c = code.strip().lower()
return NLLB_LANG.get(c, f"{c}_Latn")
def normalize_lang(code: Optional[str], default=None):
return (code or default).strip().lower() if code else default
def _norm(s: str) -> str:
return re.sub(r"\W+", "", (s or "").lower()).strip()
# =========================
# DB
# =========================
def get_conn():
return psycopg2.connect(**DB_CONFIG)
def ensure_indexes(conn):
with conn.cursor() as cur:
cur.execute("CREATE INDEX IF NOT EXISTS traducciones_lang_to_status_idx ON traducciones (lang_to, status);")
cur.execute("CREATE INDEX IF NOT EXISTS traducciones_status_idx ON traducciones (status);")
conn.commit()
# Moved to translation_ops.py
def fetch_pending_batch(conn, lang_to: str, batch: int):
"""Fetch pending translations with row locking to support multiple workers."""
if batch <= 0:
return []
# Use FOR UPDATE SKIP LOCKED to allow multiple workers
# Each worker will get different rows without conflicts
with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
cur.execute(
"""
SELECT t.id AS tr_id, t.noticia_id, t.lang_from, t.lang_to,
n.titulo, n.resumen
FROM traducciones t
JOIN noticias n ON n.id=t.noticia_id
WHERE t.lang_to=%s AND t.status='pending'
ORDER BY t.id
LIMIT %s
FOR UPDATE OF t SKIP LOCKED;
""",
(lang_to, batch),
)
rows = cur.fetchall()
# Update status within the same transaction while rows are locked
if rows:
ids = [r["tr_id"] for r in rows]
cur.execute("UPDATE traducciones SET status='processing' WHERE id = ANY(%s)", (ids,))
conn.commit()
return rows
# =========================
# LANGUAGE DETECTION
# =========================
def detect_lang(text1: str, text2: str):
txt = (text1 or "").strip() or (text2 or "").strip()
if not txt:
return None
try:
return detect(txt)
except Exception:
return None
# =========================
# MODEL LOADING (CTranslate2)
# =========================
_TOKENIZER = None
_TRANSLATOR = None
_DEVICE = None
def _resolve_device():
if CT2_DEVICE == "cpu":
return "cpu"
if CT2_DEVICE == "cuda":
return "cuda"
# auto
return "cuda" if ctranslate2.get_cuda_device_count() > 0 else "cpu"
def _ensure_ct2_model():
"""Convert HuggingFace model to CTranslate2 format if not exists."""
import os
import subprocess
model_dir = CT2_MODEL_PATH
# Check if model already exists
if os.path.isdir(model_dir) and os.path.exists(os.path.join(model_dir, "model.bin")):
LOG.info("CTranslate2 model already exists at %s", model_dir)
return True
LOG.info("CTranslate2 model not found, converting from %s...", UNIVERSAL_MODEL)
LOG.info("This may take 5-10 minutes on first run...")
# Create directory if needed
os.makedirs(os.path.dirname(model_dir) or ".", exist_ok=True)
# Convert the model
quantization = CT2_COMPUTE_TYPE if CT2_COMPUTE_TYPE != "auto" else "int8_float16"
cmd = [
"ct2-transformers-converter",
"--model", UNIVERSAL_MODEL,
"--output_dir", model_dir,
"--quantization", quantization,
"--force"
]
try:
LOG.info("Running: %s", " ".join(cmd))
result = subprocess.run(cmd, capture_output=True, text=True, timeout=1800)
if result.returncode != 0:
LOG.error("Model conversion failed: %s", result.stderr)
return False
LOG.info("Model conversion completed successfully")
return True
except subprocess.TimeoutExpired:
LOG.error("Model conversion timed out after 30 minutes")
return False
except Exception as e:
LOG.error("Model conversion error: %s", e)
return False
def get_universal_components():
global _TOKENIZER, _TRANSLATOR, _DEVICE
if _TRANSLATOR:
return _TOKENIZER, _TRANSLATOR
# Ensure CT2 model exists (convert if needed)
if not _ensure_ct2_model():
raise RuntimeError(f"Failed to load/convert CTranslate2 model at {CT2_MODEL_PATH}")
device = _resolve_device()
LOG.info("Loading CTranslate2 model from %s on %s", CT2_MODEL_PATH, device)
_TRANSLATOR = ctranslate2.Translator(
CT2_MODEL_PATH,
device=device,
compute_type=CT2_COMPUTE_TYPE,
)
_TOKENIZER = AutoTokenizer.from_pretrained(UNIVERSAL_MODEL)
_DEVICE = device
LOG.info("CTranslate2 model loaded successfully")
return _TOKENIZER, _TRANSLATOR
# =========================
# TRANSLATION (CTranslate2)
# =========================
def _safe_src_len(tokenizer):
max_len = getattr(tokenizer, "model_max_length", 1024) or 1024
if max_len > 100000:
max_len = 1024
return min(MAX_SRC_TOKENS, max_len - 16)
def _translate_texts(src, tgt, texts, beams, max_new_tokens):
"""Translate texts using CTranslate2."""
if not texts:
return []
clean = [(t or "").strip() for t in texts]
if all(not t for t in clean):
return ["" for _ in clean]
tok, translator = get_universal_components()
src_code = map_to_nllb(src)
tgt_code = map_to_nllb(tgt)
# Set source language on tokenizer
try:
tok.src_lang = src_code
except Exception:
pass
safe_len = _safe_src_len(tok)
max_new = max(16, min(int(max_new_tokens), MAX_NEW_TOKENS_BODY))
# Tokenize: convert text to tokens
sources = []
for t in clean:
if t:
ids = tok.encode(t, truncation=True, max_length=safe_len)
tokens = tok.convert_ids_to_tokens(ids)
sources.append(tokens)
else:
sources.append([])
# Target language prefix for NLLB
target_prefix = [[tgt_code]] * len(sources)
# Translate with CTranslate2
start = time.time()
results = translator.translate_batch(
sources,
target_prefix=target_prefix,
beam_size=beams,
max_decoding_length=max_new,
)
dt = time.time() - start
# Decode results
translated = []
total_tokens = 0
for result, src_tokens in zip(results, sources):
if result.hypotheses:
# Skip the first token (language prefix)
tokens = result.hypotheses[0][1:]
total_tokens += len(tokens) + len(src_tokens)
text = tok.decode(tok.convert_tokens_to_ids(tokens))
translated.append(text.strip())
else:
translated.append("")
if total_tokens > 0:
LOG.info(" → tokens=%d tiempo=%.2fs velocidad=%d tok/s",
total_tokens, dt, int(total_tokens / dt) if dt > 0 else 0)
return translated
def _split_body_into_chunks(text: str) -> List[str]:
text = (text or "").strip()
if len(text) <= BODY_CHARS_CHUNK:
return [text] if text else []
parts = re.split(r'(\n\n+|(?<=[\.\!\?؛؟。])\s+)', text)
chunks = []
current = ""
for part in parts:
if not part:
continue
if len(current) + len(part) <= BODY_CHARS_CHUNK:
current += part
else:
if current.strip():
chunks.append(current.strip())
current = part
if current.strip():
chunks.append(current.strip())
if not chunks:
return [text]
return chunks
def translate_body_long(src: str, tgt: str, body: str) -> str:
body = (body or "").strip()
if not body:
return ""
chunks = _split_body_into_chunks(body)
if len(chunks) == 1:
translated = _translate_texts(src, tgt, [body], NUM_BEAMS_BODY, MAX_NEW_TOKENS_BODY)[0]
return translated.strip()
translated_chunks = []
for ch in chunks:
tr = _translate_texts(src, tgt, [ch], NUM_BEAMS_BODY, MAX_NEW_TOKENS_BODY)[0]
translated_chunks.append(tr.strip())
return "\n\n".join(c for c in translated_chunks if c)
# =========================
# BATCH PROCESS
# =========================
def process_batch(conn, rows):
todo = []
done = []
errors = []
for r in rows:
lang_to = normalize_lang(r["lang_to"], "es") or "es"
lang_from = (
normalize_lang(r["lang_from"])
or detect_lang(r["titulo"], r["resumen"])
or "en"
)
titulo = (r["titulo"] or "").strip()
resumen = (r["resumen"] or "").strip()
if map_to_nllb(lang_from) == map_to_nllb(lang_to):
done.append((titulo, resumen, lang_from, r["tr_id"]))
else:
todo.append({
"tr_id": r["tr_id"],
"lang_from": lang_from,
"lang_to": lang_to,
"titulo": titulo,
"resumen": resumen,
})
from collections import defaultdict
groups = defaultdict(list)
for item in todo:
key = (item["lang_from"], item["lang_to"])
groups[key].append(item)
for (lang_from, lang_to), items in groups.items():
LOG.info("Translating %s -> %s (%d items)", lang_from, lang_to, len(items))
titles = [i["titulo"] for i in items]
try:
tt = _translate_texts(
lang_from,
lang_to,
titles,
NUM_BEAMS_TITLE,
MAX_NEW_TOKENS_TITLE,
)
bodies_translated: List[str] = []
for i in items:
bodies_translated.append(
translate_body_long(lang_from, lang_to, i["resumen"])
)
for i, ttr, btr in zip(items, tt, bodies_translated):
ttr = (ttr or "").strip()
btr = (btr or "").strip()
if not ttr or _norm(ttr) == _norm(i["titulo"]):
ttr = i["titulo"]
if not btr or _norm(btr) == _norm(i["resumen"]):
btr = i["resumen"]
# CLEANING: Remove <unk> tokens
if ttr:
ttr = ttr.replace("<unk>", "").replace("  ", " ").strip()
if btr:
btr = btr.replace("<unk>", "").replace("  ", " ").strip()
done.append((ttr, btr, lang_from, i["tr_id"]))
except Exception as e:
err = str(e)[:800]
LOG.exception("Error translating %s -> %s: %s", lang_from, lang_to, err)
for i in items:
errors.append((err, i["tr_id"]))
with conn.cursor() as cur:
if done:
execute_values(
cur,
"""
UPDATE traducciones AS t
SET titulo_trad=v.titulo_trad,
resumen_trad=v.resumen_trad,
lang_from=COALESCE(t.lang_from, v.lang_from),
status='done',
error=NULL
FROM (VALUES %s) AS v(titulo_trad,resumen_trad,lang_from,id)
WHERE t.id=v.id;
""",
done,
)
            # Persist per-item stats in translation_stats.
            # Tuples in `done` carry (titulo, resumen, lang_from, tr_id) but not lang_to,
            # so recover each item's lang_to from the original rows (normalized, as above).
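            # A sketch of the translation_stats shape this INSERT assumes (illustrative;
            # the real DDL lives in the migrations. Only lang_from and lang_to come from
            # the code below, the remaining columns are an assumption):
            #
            #   CREATE TABLE IF NOT EXISTS translation_stats (
            #       id         BIGSERIAL PRIMARY KEY,
            #       lang_from  TEXT,
            #       lang_to    TEXT,
            #       created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
            #   );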
            tr_map = {r["tr_id"]: normalize_lang(r["lang_to"], "es") or "es" for r in rows}
stats_data = []
for item in done:
# item is (titulo, resumen, lang_from, tr_id)
lang_from = item[2]
lang_to = tr_map.get(item[3], "es")
stats_data.append((lang_from, lang_to))
execute_values(
cur,
"INSERT INTO translation_stats (lang_from, lang_to) VALUES %s",
stats_data
)
# --------------------------
if errors:
execute_values(
cur,
"""
UPDATE traducciones AS t
SET status='error', error=v.error
FROM (VALUES %s) AS v(error,id)
WHERE t.id=v.id;
""",
errors,
)
conn.commit()
def process_entity_summaries(conn):
"""Translate pending entity summaries from Wikipedia."""
from cache import cache_del
LOG.info("DEBUG: Checking for pending entity summaries...")
with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
cur.execute("""
SELECT id, entity_name, summary, summary_en
FROM entity_images
WHERE status_es = 'pending'
LIMIT 20
FOR UPDATE SKIP LOCKED;
""")
rows = cur.fetchall()
if not rows:
return False
LOG.info("DEBUG: Found %d pending entity summaries to process", len(rows))
for r in rows:
entity_id = r["id"]
name = r["entity_name"]
text = r["summary_en"] or r["summary"]
if not text:
cur.execute("UPDATE entity_images SET status_es = 'none' WHERE id = %s", (entity_id,))
continue
try:
# English -> Spanish
translated = translate_body_long('en', 'es', text)
if translated:
cur.execute("""
UPDATE entity_images
SET summary_es = %s, status_es = 'done'
WHERE id = %s
""", (translated, entity_id))
# Invalidate cache
cache_del(f"wiki:data:{name.lower()}")
LOG.info(" → Translated entity summary: %s", name)
else:
cur.execute("UPDATE entity_images SET status_es = 'error' WHERE id = %s", (entity_id,))
except Exception as e:
LOG.error("Error translating entity summary [%s]: %s", name, e)
cur.execute("UPDATE entity_images SET status_es = 'error' WHERE id = %s", (entity_id,))
conn.commit()
return True
# =========================
# MAIN LOOP
# =========================
def main():
LOG.info("Translator worker iniciado (CTranslate2)")
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
get_universal_components()
while True:
any_work = False
with get_conn() as conn:
ensure_indexes(conn)
            # 1. Entity summaries (Wikipedia): translating these was moved out of this
            #    worker so it stays focused on news. process_entity_summaries() above is
            #    kept only in case that step ever needs to be re-enabled here.
# 2. Process news translations
for tgt in TARGET_LANGS:
while True:
rows = fetch_pending_batch(conn, tgt, BATCH_SIZE)
if not rows:
break
any_work = True
LOG.info("[%s] %d elementos", tgt, len(rows))
process_batch(conn, rows)
if not any_work:
time.sleep(SLEEP_IDLE)
if __name__ == "__main__":
main()


@ -0,0 +1,471 @@
"""
URL Feed Discovery Worker
This worker automatically discovers RSS feeds from the URLs stored in the fuentes_url
table and creates entries in the feeds table (or in feeds_pending for manual review).
Runs every 15 minutes.
"""
import os
import sys
import time
import logging
from datetime import datetime
from typing import List, Dict
# Add parent directory to path to import modules
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from db import get_conn
from utils.feed_discovery import discover_feeds, get_feed_metadata
from utils.feed_analysis import (
analyze_feed,
get_country_id_by_name,
get_category_id_by_name
)
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
# Configuration
CHECK_INTERVAL = int(os.getenv('URL_DISCOVERY_INTERVAL_MIN', '15')) * 60 # Default: 15 minutes
BATCH_SIZE = int(os.getenv('URL_DISCOVERY_BATCH_SIZE', '10')) # Process URLs in batches
MAX_FEEDS_PER_URL = int(os.getenv('MAX_FEEDS_PER_URL', '5')) # Max feeds to create per URL
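# Illustrative environment overrides (the defaults above apply when these are unset):
#   URL_DISCOVERY_INTERVAL_MIN=15   # minutes between discovery batches
#   URL_DISCOVERY_BATCH_SIZE=10     # fuentes_url rows processed per batch
#   MAX_FEEDS_PER_URL=5             # cap on feeds created from a single source URL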
def get_pending_urls(limit: int = BATCH_SIZE) -> List[Dict]:
"""
Get URLs that need to be processed.
Priority: never checked > failed checks > oldest successful checks
"""
with get_conn() as conn:
with conn.cursor() as cur:
cur.execute("""
SELECT id, nombre, url, categoria_id, pais_id, idioma, last_status
FROM fuentes_url
WHERE active = TRUE
ORDER BY
CASE
WHEN last_check IS NULL THEN 1 -- Never checked (highest priority)
WHEN last_status = 'error' THEN 2 -- Failed checks
WHEN last_status = 'no_feeds' THEN 3 -- No feeds found
ELSE 4 -- Successful checks (lowest priority)
END,
last_check ASC NULLS FIRST
LIMIT %s
""", (limit,))
columns = [desc[0] for desc in cur.description]
return [dict(zip(columns, row)) for row in cur.fetchall()]
def update_url_status(url_id: int, status: str, message: str = None, http_code: int = None):
"""Update the status of a URL source"""
with get_conn() as conn:
with conn.cursor() as cur:
cur.execute("""
UPDATE fuentes_url
SET last_check = NOW(),
last_status = %s,
status_message = %s,
last_http_code = %s
WHERE id = %s
""", (status, message, http_code, url_id))
conn.commit()
def create_pending_feed(
fuente_url_id: int,
feed_url: str,
metadata: Dict,
analysis: Dict,
categoria_id: int = None,
pais_id: int = None,
idioma: str = None
) -> bool:
"""
Create a pending feed entry for manual review
"""
try:
with get_conn() as conn:
# Get detected country ID
detected_country_id = None
if analysis.get('detected_country'):
detected_country_id = get_country_id_by_name(conn, analysis['detected_country'])
# Get suggested category ID
suggested_categoria_id = None
if analysis.get('suggested_category'):
suggested_categoria_id = get_category_id_by_name(conn, analysis['suggested_category'])
with conn.cursor() as cur:
cur.execute("""
INSERT INTO feeds_pending (
fuente_url_id, feed_url, feed_title, feed_description,
feed_language, feed_type, entry_count,
detected_country_id, suggested_categoria_id,
categoria_id, pais_id, idioma, notes
)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
ON CONFLICT (feed_url) DO UPDATE
SET feed_title = EXCLUDED.feed_title,
feed_description = EXCLUDED.feed_description,
discovered_at = NOW()
RETURNING id
""", (
fuente_url_id,
feed_url,
metadata.get('title', 'Feed sin título'),
                    (metadata.get('description') or '')[:500],
analysis.get('language'),
'rss', # Default type
metadata.get('entry_count', 0),
detected_country_id,
suggested_categoria_id,
categoria_id,
pais_id,
idioma,
analysis.get('analysis_notes', '')
))
result = cur.fetchone()
conn.commit()
if result:
logger.info(f"Created pending feed for review: {metadata.get('title')} ({feed_url})")
return True
else:
logger.debug(f"Pending feed updated: {feed_url}")
return False
except Exception as e:
logger.error(f"Error creating pending feed {feed_url}: {e}")
return False
def create_feed_from_metadata(
feed_url: str,
fuente_url_id: int = None,
categoria_id: int = None,
pais_id: int = None,
idioma: str = None,
auto_approve: bool = False,
context_title: str = None
) -> Dict:
"""
Create a feed entry from discovered feed URL with intelligent analysis.
Returns:
{
'created': True/False,
'pending': True/False,
'status': 'created'/'pending'/'existing'/'error',
'message': 'Description'
}
"""
result = {
'created': False,
'pending': False,
'status': 'error',
'message': ''
}
try:
# Get feed metadata
metadata = get_feed_metadata(feed_url, timeout=10)
if not metadata:
result['message'] = 'No se pudo obtener metadata del feed'
logger.warning(f"{result['message']}: {feed_url}")
return result
# Add URL to metadata for analysis
metadata['url'] = feed_url
# Use context title if provided, otherwise use metadata title
# This helps when feed XML title is generic (e.g. "RSS Feed") but site link had meaningful text
feed_title = context_title if context_title else metadata.get('title', 'Feed sin título')
# Update metadata for consistency in pending feeds AND analysis
metadata['title'] = feed_title
# Perform intelligent analysis
analysis = analyze_feed(metadata)
# Determine if we need manual review
needs_review = False
# If parent URL has no category or country, we need review
if not categoria_id or not pais_id:
needs_review = True
logger.info(f"Feed needs review (missing categoria_id={categoria_id} or pais_id={pais_id})")
# If auto_approve is disabled, we need review
if not auto_approve:
needs_review = True
# Enhance metadata with analysis
if not idioma and analysis.get('language'):
idioma = analysis['language']
# If needs review, create pending feed
if needs_review:
created_pending = create_pending_feed(
fuente_url_id=fuente_url_id,
feed_url=feed_url,
metadata=metadata,
analysis=analysis,
categoria_id=categoria_id,
pais_id=pais_id,
idioma=idioma
)
result['pending'] = created_pending
result['status'] = 'pending'
result['message'] = f"Feed creado y pendiente de revisión (país: {analysis.get('detected_country', 'N/A')}, categoría sugerida: {analysis.get('suggested_category', 'N/A')})"
return result
# Otherwise, create feed directly
nombre = feed_title
descripcion = metadata.get('description', '')
with get_conn() as conn:
with conn.cursor() as cur:
cur.execute("""
INSERT INTO feeds (nombre, descripcion, url, categoria_id, pais_id, idioma, fuente_url_id, activo)
VALUES (%s, %s, %s, %s, %s, %s, %s, TRUE)
ON CONFLICT (url) DO NOTHING
RETURNING id
""", (
nombre,
descripcion[:500] if descripcion else None,
feed_url,
categoria_id,
pais_id,
idioma,
fuente_url_id
))
feed_result = cur.fetchone()
conn.commit()
if feed_result:
logger.info(f"Created new feed: {nombre} ({feed_url})")
result['created'] = True
result['status'] = 'created'
result['message'] = f"Feed creado exitosamente"
else:
logger.debug(f"Feed already exists: {feed_url}")
result['status'] = 'existing'
result['message'] = 'El feed ya existe'
except Exception as e:
logger.error(f"Error creating feed from {feed_url}: {e}")
result['message'] = str(e)
result['status'] = 'error'
return result
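# A minimal usage sketch (hypothetical values; in this worker the call is normally
# driven by process_url_source() below rather than made directly):
#
#   outcome = create_feed_from_metadata(
#       feed_url="https://example.com/rss.xml",
#       fuente_url_id=42,
#       categoria_id=None,       # missing category forces manual review
#       pais_id=None,
#       auto_approve=False,
#   )
#   # outcome['status'] is one of 'created', 'pending', 'existing' or 'error'.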
def process_url_source(url_data: Dict) -> Dict:
"""
Process a single URL source to discover and create feeds.
Returns statistics about the operation.
"""
url_id = url_data['id']
source_url = url_data['url']
nombre = url_data['nombre']
categoria_id = url_data['categoria_id']
pais_id = url_data['pais_id']
idioma = url_data['idioma']
logger.info(f"Processing URL source: {nombre} ({source_url})")
logger.info(f" Parent metadata: categoria_id={categoria_id}, pais_id={pais_id}, idioma={idioma}")
stats = {
'url_id': url_id,
'url': source_url,
'discovered': 0,
'created': 0,
'pending': 0,
'existing': 0,
'errors': 0,
'status': 'unknown'
}
try:
# Discover feeds from URL
discovered = discover_feeds(source_url, timeout=15)
stats['discovered'] = len(discovered)
if not discovered:
logger.warning(f"No feeds discovered from: {source_url}")
update_url_status(url_id, 'no_feeds', 'No se encontraron feeds RSS', 200)
stats['status'] = 'no_feeds'
return stats
# Filter only valid feeds
valid_feeds = [f for f in discovered if f.get('valid', False)]
if not valid_feeds:
logger.warning(f"No valid feeds found for: {source_url}")
update_url_status(url_id, 'no_valid_feeds', f'Se encontraron {len(discovered)} feeds pero ninguno válido')
stats['status'] = 'no_valid_feeds'
return stats
# Limit number of feeds per URL
feeds_to_create = valid_feeds[:MAX_FEEDS_PER_URL]
logger.info(f"Found {len(valid_feeds)} valid feeds, processing up to {len(feeds_to_create)}")
# Determine if auto-approve (parent has category AND country)
auto_approve = bool(categoria_id and pais_id)
if not auto_approve:
logger.info("→ Feeds will require manual review (parent lacks category or country)")
else:
logger.info("→ Feeds will be auto-approved (parent has complete metadata)")
# Create feeds
for feed_info in feeds_to_create:
feed_url = feed_info['url']
try:
result = create_feed_from_metadata(
feed_url=feed_url,
fuente_url_id=url_id,
categoria_id=categoria_id,
pais_id=pais_id,
idioma=idioma,
auto_approve=auto_approve,
context_title=feed_info.get('context_label')
)
if result['status'] == 'created':
stats['created'] += 1
elif result['status'] == 'pending':
stats['pending'] += 1
elif result['status'] == 'existing':
stats['existing'] += 1
else:
stats['errors'] += 1
except Exception as e:
logger.error(f"Error creating feed {feed_url}: {e}")
stats['errors'] += 1
# Update URL status
if stats['created'] > 0 or stats['pending'] > 0:
parts = []
if stats['created'] > 0:
parts.append(f"{stats['created']} creados")
if stats['pending'] > 0:
parts.append(f"{stats['pending']} pendientes de revisión")
if stats['existing'] > 0:
parts.append(f"{stats['existing']} ya existían")
message = ", ".join(parts)
update_url_status(url_id, 'success', message, 200)
stats['status'] = 'success'
elif stats['existing'] > 0:
message = f"Todos los {stats['existing']} feeds ya existían"
update_url_status(url_id, 'existing', message, 200)
stats['status'] = 'existing'
else:
message = f"No se pudieron procesar feeds ({stats['errors']} errores)"
update_url_status(url_id, 'error', message)
stats['status'] = 'error'
logger.info(f"Processed {source_url}: {stats['created']} created, {stats['pending']} pending, {stats['existing']} existing, {stats['errors']} errors")
except Exception as e:
logger.error(f"Error processing URL {source_url}: {e}")
update_url_status(url_id, 'error', str(e)[:200])
stats['status'] = 'error'
stats['errors'] += 1
return stats
def process_batch():
"""Process a batch of URL sources"""
logger.info("=" * 80)
logger.info(f"Starting URL discovery batch - {datetime.now().isoformat()}")
# Get pending URLs
urls = get_pending_urls(limit=BATCH_SIZE)
if not urls:
logger.info("No pending URLs to process")
return
logger.info(f"Processing {len(urls)} URL sources")
# Process statistics
total_stats = {
'processed': 0,
'discovered': 0,
'created': 0,
'pending': 0,
'existing': 0,
'errors': 0
}
# Process each URL
for url_data in urls:
stats = process_url_source(url_data)
total_stats['processed'] += 1
total_stats['discovered'] += stats['discovered']
total_stats['created'] += stats['created']
total_stats['pending'] += stats['pending']
total_stats['existing'] += stats['existing']
total_stats['errors'] += stats['errors']
# Small delay between URLs to avoid hammering servers
time.sleep(2)
# Log summary
logger.info("-" * 80)
logger.info(f"Batch complete:")
logger.info(f" - Processed: {total_stats['processed']} URLs")
logger.info(f" - Discovered: {total_stats['discovered']} feeds")
logger.info(f" - Created: {total_stats['created']} new feeds")
logger.info(f" - Pending review: {total_stats['pending']} feeds")
logger.info(f" - Already existing: {total_stats['existing']} feeds")
logger.info(f" - Errors: {total_stats['errors']}")
logger.info("=" * 80)
def main():
"""Main worker loop"""
logger.info("URL Feed Discovery Worker started")
logger.info(f"Check interval: {CHECK_INTERVAL} seconds ({CHECK_INTERVAL // 60} minutes)")
logger.info(f"Batch size: {BATCH_SIZE}")
logger.info(f"Max feeds per URL: {MAX_FEEDS_PER_URL}")
# Run immediately on start
try:
process_batch()
except Exception as e:
logger.error(f"Error in initial batch: {e}", exc_info=True)
# Main loop
while True:
try:
logger.info(f"Waiting {CHECK_INTERVAL} seconds until next batch...")
time.sleep(CHECK_INTERVAL)
process_batch()
except KeyboardInterrupt:
logger.info("Worker stopped by user")
break
except Exception as e:
logger.error(f"Error in main loop: {e}", exc_info=True)
# Wait a bit before retrying to avoid rapid failure loops
time.sleep(60)
if __name__ == "__main__":
main()

125
workers/url_worker.py Normal file

@ -0,0 +1,125 @@
import logging
import hashlib
from datetime import datetime
from newspaper import Article, ArticleException, Config
import requests
from db import get_write_conn, get_read_conn
# Configuration
logger = logging.getLogger("url_worker")
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
def get_active_urls():
"""Get all active URL sources."""
with get_read_conn() as conn:
with conn.cursor() as cur:
cur.execute("""
SELECT id, nombre, url, categoria_id, pais_id, idioma
FROM fuentes_url
WHERE active = true
""")
return cur.fetchall()
def update_source_status(source_id, status, message, http_code=0):
"""Update the status of a URL source."""
with get_write_conn() as conn:
with conn.cursor() as cur:
cur.execute("""
UPDATE fuentes_url
SET last_check = NOW(),
last_status = %s,
status_message = %s,
last_http_code = %s
WHERE id = %s
""", (status, message, http_code, source_id))
conn.commit()
def save_article(source, article):
"""Save the extracted article to the database."""
source_id, source_name, source_url, cat_id, pais_id, lang = source
# Use the article url if possible, otherwise source_url
final_url = article.url or source_url
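    # Deterministic id: the md5 of the final URL, so the same article always maps to
    # the same primary key and re-processing it is a no-op (see ON CONFLICT below).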
noticia_id = hashlib.md5(final_url.encode("utf-8")).hexdigest()
with get_write_conn() as conn:
with conn.cursor() as cur:
# Check if exists
cur.execute("SELECT id FROM noticias WHERE id = %s", (noticia_id,))
if cur.fetchone():
return False # Already exists
# Prepare data
title = article.title or "Sin título"
summary = article.summary or article.text[:500]
image_url = article.top_image
pub_date = article.publish_date or datetime.utcnow()
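            # The ON CONFLICT clause is kept even though the id was checked above:
            # it guards against another worker inserting the same id between the
            # existence check and this INSERT.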
cur.execute("""
INSERT INTO noticias (
id, titulo, resumen, url, fecha, imagen_url,
fuente_nombre, categoria_id, pais_id
) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
ON CONFLICT (id) DO NOTHING
""", (
noticia_id, title, summary, final_url, pub_date, image_url,
source_name, cat_id, pais_id
))
conn.commit()
return True
def process_url(source):
"""Process a single URL source."""
source_id, name, url, _, _, _ = source
logger.info(f"Processing URL: {url} ({name})")
try:
# Browser-like headers
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
config = Config()
config.browser_user_agent = user_agent
config.request_timeout = 30
article = Article(url, config=config, language='es')
article.download()
if not article.html:
update_source_status(source_id, "ERROR", "No content downloaded (Empty HTML)", 0)
return
article.parse()
        try:
            # nlp() only builds optional extras (summary, keywords); ignore failures
            # (e.g. missing NLTK data) and fall back to the raw parsed text.
            article.nlp()
        except Exception:
            pass
if not article.title:
update_source_status(source_id, "ERROR_PARSE", "Could not extract title (Page might be not an article)", 200)
return
saved = save_article(source, article)
status_msg = "News created successfully" if saved else "News already exists"
update_source_status(source_id, "OK", status_msg, 200)
logger.info(f"Success {url}: {status_msg}")
except ArticleException as ae:
logger.error(f"Newspaper Error {url}: {ae}")
update_source_status(source_id, "ERROR_DOWNLOAD", str(ae)[:200], 0)
except requests.exceptions.RequestException as re:
logger.error(f"Network Error {url}: {re}")
update_source_status(source_id, "ERROR_NETWORK", str(re)[:200], 0)
except Exception as e:
logger.error(f"Unexpected Error {url}: {e}")
update_source_status(source_id, "ERROR_UNKNOWN", str(e)[:200], 500)
def main():
logger.info("Starting URL Worker")
urls = get_active_urls()
logger.info(f"Found {len(urls)} active URLs")
for source in urls:
process_url(source)
if __name__ == "__main__":
main()


@ -0,0 +1,31 @@
import time
import logging
import sys
from workers.url_worker import main as run_once
# Configure logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
stream=sys.stdout
)
logger = logging.getLogger("url_worker_daemon")
INTERVAL = 300 # 5 minutes
def main():
logger.info("Starting URL Worker Daemon")
logger.info(f"Check interval: {INTERVAL} seconds")
while True:
try:
logger.info("Running job cycle...")
run_once()
logger.info("Cycle completed.")
except Exception as e:
logger.exception(f"Error in job cycle: {e}")
time.sleep(INTERVAL)
if __name__ == "__main__":
main()