# rss/ner_worker.py
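"""NER worker.

Polls PostgreSQL for finished translations that have no tags yet, runs
spaCy NER and noun-chunk extraction over title + summary, and links the
resulting entity/topic tags back to each translation.
"""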
import os
import time
import logging
import re
import string
from typing import List, Tuple
from collections import Counter
import psycopg2
import psycopg2.extras
import spacy
from bs4 import BeautifulSoup
logging.basicConfig(level=logging.INFO, format='[%(asctime)s] %(levelname)s: %(message)s')
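
# Database connection settings, taken from the environment with
# development defaults.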
DB = dict(
    host=os.environ.get("DB_HOST", "localhost"),
    port=int(os.environ.get("DB_PORT", 5432)),
    dbname=os.environ.get("DB_NAME", "rss"),
    user=os.environ.get("DB_USER", "rss"),
    password=os.environ.get("DB_PASS", "x"),
)
NER_LANG = os.environ.get("NER_LANG", "es").strip().lower()
BATCH = int(os.environ.get("NER_BATCH", 64))
# spaCy label -> local tag type. Spanish models (es_core_news_*) emit
# PER/LOC/ORG/MISC, so "PER" is the label that actually fires here;
# PERSON/GPE are kept for compatibility with English (OntoNotes) models.
ENT_LABELS = {
    "PER": "persona",
    "PERSON": "persona",
    "ORG": "organizacion",
    "GPE": "lugar",
    "LOC": "lugar",
}
_ws_re = re.compile(r"\s+")
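
# Regexes for leftover markup that survives BeautifulSoup.get_text():
# unparsed tag fragments, HTML entities, stray width/height attributes.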
HTML_TRASH_PATTERNS = [
    r"<[^>]+>",
    r"&[a-z]+;",
    r"&#\d+;?",
    r'width="\d+"',
    r'height="\d+"',
]
GENERIC_BAD_TAGS = {
    "república",
    "estado",
    "centro",
    "gobierno",
    "el gobierno",
    "gobiernos",
    "report",
    "sp",
    "unión",
    "union",
    "dólares",
    "dolar",
    "dólar",
    "the post",
    "post",
    "artículo",
    "el artículo",
    "la ciudad",
    "mundo",
    "país",
    "pais",
    "países",
    "paises",
    "la noche",
    "la publicación",
    "este miércoles",
    "el miércoles",
    "hoy",
    "ayer",
    "mañana",
    "servicio",
    "servicios",
    "el presidente",
    "presidente",
    "el ministro",
    "ministro",
    "la guerra",
    "guerra",
    "seguridad",
    "wp-content",
    "internal_photos",
    "/internal_photos",
    "https",
    "http",
    "src",
}
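
# Populated in main() from the loaded spaCy model's stop words.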
STOPWORDS = set()
ARTICLES = {
    "el",
    "la",
    "los",
    "las",
    "un",
    "una",
    "uno",
    "al",
    "del",
}
TOPIC_MIN_CHARS = 4
TOPIC_MAX_WORDS = 6
TOPIC_MAX_PER_DOC = 15
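
# Heuristics for spotting scraped-HTML debris (attribute fragments,
# URLs, file paths) that spaCy sometimes picks up as "entities" when
# feeds embed raw markup in their summaries.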
def _looks_like_attr_or_path(text_lower: str) -> bool:
    if text_lower.startswith("/"):
        return True
    if "http://" in text_lower or "https://" in text_lower:
        return True
    if ".com" in text_lower or ".net" in text_lower or ".org" in text_lower:
        return True
    if any(ext in text_lower for ext in (".jpg", ".jpeg", ".png", ".gif", ".webp")):
        return True
    if re.search(r"\b(src|alt|style|class)\s*=", text_lower):
        return True
    if "data-" in text_lower:
        return True
    if re.search(r"&#\d+;?", text_lower):
        return True
    if "=" in text_lower and " " not in text_lower.strip():
        return True
    if re.fullmatch(r"[a-z0-9_]{15,}", text_lower.replace("-", "").replace(" ", "")):
        return True
    if "-" in text_lower and " " not in text_lower:
        return True
    return False
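
# Normalize an entity string: strip markup and leftover escapes, drop
# URL/attribute junk and overly generic words, and canonicalize a few
# common aliases (EE.UU. -> Estados Unidos, UE -> Unión Europea).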
def clean_tag_text(text: str) -> str | None:
    if not text:
        return None
    text = BeautifulSoup(text, "html.parser").get_text()
    for pat in HTML_TRASH_PATTERNS:
        text = re.sub(pat, "", text)
    text = _ws_re.sub(" ", text).strip()
    text = text.strip(string.punctuation + " ")
    if len(text) < 3:
        return None
    if re.search(r"[<>/\\]", text):
        return None
    lower = text.lower()
    if lower.startswith("href="):
        return None
    if _looks_like_attr_or_path(lower):
        return None
    if lower in GENERIC_BAD_TAGS:
        return None
    replacements = {
        "ee.uu.": "Estados Unidos",
        "los estados unidos": "Estados Unidos",
        "eeuu": "Estados Unidos",
        "eu": "Unión Europea",
        "ue": "Unión Europea",
        "kosova": "Kosovo",
    }
    if lower in replacements:
        text = replacements[lower]
    return text
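
# Normalize a candidate topic (noun chunk): same HTML cleanup as tags,
# plus leading-article removal and stopword/length/number filters.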
def clean_topic_text(text: str) -> str | None:
    if not text:
        return None
    text = BeautifulSoup(text, "html.parser").get_text()
    for pat in HTML_TRASH_PATTERNS:
        text = re.sub(pat, "", text)
    text = _ws_re.sub(" ", text).strip()
    text = text.strip(string.punctuation + " ")
    if len(text) < TOPIC_MIN_CHARS:
        return None
    lower = text.lower()
    if _looks_like_attr_or_path(lower):
        return None
    tokens = [
        t.strip(string.punctuation)
        for t in lower.split()
        if t.strip(string.punctuation)
    ]
    if not tokens:
        return None
    if tokens[0] in ARTICLES:
        tokens = tokens[1:]
    if not tokens:
        return None
    norm = " ".join(tokens).strip()
    if len(norm) < TOPIC_MIN_CHARS:
        return None
    if norm in GENERIC_BAD_TAGS:
        return None
    if len(tokens) > TOPIC_MAX_WORDS:
        return None
    if all(t in STOPWORDS for t in tokens):
        return None
    if re.fullmatch(r"[0-9\s\.,\-:/]+", norm):
        return None
    return norm
def get_conn():
    return psycopg2.connect(**DB)
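
# Run the pipeline once over a document and split the results into named
# entities (persona/organizacion/lugar) and frequent noun-chunk topics
# ("tema"), capped at TOPIC_MAX_PER_DOC per document.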
def extract_entities_and_topics(nlp, text: str) -> Tuple[List[Tuple[str, str]], List[Tuple[str, str]]]:
    ents: List[Tuple[str, str]] = []
    topics: List[Tuple[str, str]] = []
    if not text:
        return ents, topics
    doc = nlp(text)
    for ent in doc.ents:
        tipo = ENT_LABELS.get(ent.label_)
        if not tipo:
            continue
        val = clean_tag_text(ent.text)
        if not val:
            continue
        ents.append((val, tipo))
    topic_counter: Counter[str] = Counter()
    for chunk in doc.noun_chunks:
        val = clean_topic_text(chunk.text)
        if not val:
            continue
        topic_counter[val] += 1
    ent_values = {v for (v, _) in ents}
    for val, _count in topic_counter.most_common(TOPIC_MAX_PER_DOC):
        if val in ent_values:
            continue
        topics.append((val, "tema"))
    # Dedupe entities while keeping first-seen order; topics are already
    # unique (Counter keys) and ordered by frequency.
    ents = list(dict.fromkeys(ents))
    return ents, topics
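
# The queries in main() assume roughly the following schema (a sketch
# inferred from the statements below; the real DDL may differ):
#
#   CREATE TABLE tags (
#       id     serial PRIMARY KEY,
#       valor  text NOT NULL,
#       tipo   text NOT NULL,
#       UNIQUE (valor, tipo)            -- required by ON CONFLICT (valor, tipo)
#   );
#   CREATE TABLE tags_noticia (
#       traduccion_id  integer REFERENCES traducciones(id),
#       tag_id         integer REFERENCES tags(id),
#       UNIQUE (traduccion_id, tag_id)  -- lets ON CONFLICT DO NOTHING dedupe
#   );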
def main():
    global STOPWORDS
    # The parser stays enabled because doc.noun_chunks requires it.
    nlp = spacy.load("es_core_news_md", disable=["lemmatizer", "textcat"])
    STOPWORDS = set(nlp.Defaults.stop_words)
    logging.info("spaCy loaded: es_core_news_md (NER + parser)")
    while True:
        try:
            with get_conn() as conn, conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
                # Pending work: finished translations in the target
                # language that have no tag links yet.
                cur.execute(
                    """
                    WITH pend AS (
                        SELECT t.id, t.titulo_trad, t.resumen_trad
                        FROM traducciones t
                        LEFT JOIN tags_noticia tn ON tn.traduccion_id = t.id
                        WHERE t.status = 'done'
                          AND t.lang_to = %s
                        GROUP BY t.id, t.titulo_trad, t.resumen_trad
                        HAVING COUNT(tn.tag_id) = 0
                        ORDER BY t.id DESC
                        LIMIT %s
                    )
                    SELECT * FROM pend;
                    """,
                    (NER_LANG, BATCH),
                )
                rows = cur.fetchall()
                if not rows:
                    time.sleep(5)
                    continue
                logging.info(f"Processing {len(rows)} translations for NER/topics...")
                new_links = 0
                for r in rows:
                    text = f"{r['titulo_trad'] or ''}\n{r['resumen_trad'] or ''}".strip()
                    if not text:
                        continue
                    ents, topics = extract_entities_and_topics(nlp, text)
                    all_tags = ents + topics
                    if not all_tags:
                        continue
                    for valor, tipo in all_tags:
                        # Savepoint so one bad tag doesn't abort the whole
                        # batch: a failed statement would otherwise leave the
                        # transaction in an error state and poison every
                        # later execute until rollback.
                        cur.execute("SAVEPOINT tag_ins")
                        try:
                            # Upsert the tag and always get its id back.
                            cur.execute(
                                """
                                INSERT INTO tags (valor, tipo)
                                VALUES (%s, %s)
                                ON CONFLICT (valor, tipo)
                                DO UPDATE SET valor = EXCLUDED.valor
                                RETURNING id
                                """,
                                (valor, tipo),
                            )
                            tag_id = cur.fetchone()[0]
                            cur.execute(
                                """
                                INSERT INTO tags_noticia (traduccion_id, tag_id)
                                VALUES (%s, %s)
                                ON CONFLICT DO NOTHING
                                """,
                                (r["id"], tag_id),
                            )
                            if cur.rowcount > 0:
                                new_links += 1
                        except Exception:
                            logging.exception("Failed to insert tag/link")
                            cur.execute("ROLLBACK TO SAVEPOINT tag_ins")
                conn.commit()
                logging.info(f"NER/topics batch OK. New links: {new_links}.")
        except Exception:
            logging.exception("Error in NER loop")
            time.sleep(5)


if __name__ == "__main__":
    main()