Initial clean commit
commit 6784d81c2c
141 changed files with 25219 additions and 0 deletions
workers/ner_worker.py | 414 lines (new file)
@@ -0,0 +1,414 @@
import os
import time
import logging
import re
import string
from contextlib import closing
from typing import List, Tuple
from collections import Counter

import psycopg2
import psycopg2.extras
import spacy
from bs4 import BeautifulSoup

# ==========================================================
# Logging
# ==========================================================
logging.basicConfig(
    level=logging.INFO,
    format='[NER] %(asctime)s %(levelname)s: %(message)s'
)
log = logging.getLogger("ner_worker")

# ==========================================================
# DB config
# ==========================================================
DB = dict(
    host=os.environ.get("DB_HOST", "localhost"),
    port=int(os.environ.get("DB_PORT", 5432)),
    dbname=os.environ.get("DB_NAME", "rss"),
    user=os.environ.get("DB_USER", "rss"),
    password=os.environ.get("DB_PASS", "x"),
)

NER_LANG = os.environ.get("NER_LANG", "es").strip().lower()
BATCH = int(os.environ.get("NER_BATCH", 64))
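
# Example invocation (illustrative; host, password and batch size below are
# placeholders, not values from this repo):
#   DB_HOST=db.internal DB_PASS=secret NER_LANG=es NER_BATCH=32 python workers/ner_worker.py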

# ==========================================================
# Mapping spaCy entity labels → our SQL model
# ==========================================================
ENT_LABELS = {
    "PERSON": "persona",
    "PER": "persona",
    "ORG": "organizacion",
    "GPE": "lugar",
    "LOC": "lugar",
    "MISC": "tema",
}
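
# Note: the Spanish es_core_news_* models emit PER/ORG/LOC/MISC, while English
# models emit PERSON/GPE/etc.; mapping both spellings keeps the worker usable
# with either. Any label missing from ENT_LABELS is skipped during extraction.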

# ==========================================================
# Advanced cleanup
# ==========================================================
_ws_re = re.compile(r"\s+")

HTML_TRASH_PATTERNS = [
    r"<[^>]+>",
    r"&[a-z]+;",
    r"&#\d+;?",
    r'width="\d+"',
    r'height="\d+"',
]

GENERIC_BAD_TAGS = {
    "república", "estado", "centro", "gobierno", "el gobierno",
    "gobiernos", "report", "sp", "unión", "union", "dólares",
    "dolar", "dólar", "the post", "post", "artículo", "el artículo",
    "la ciudad", "mundo", "país", "pais", "países", "paises",
    "la noche", "la publicación", "este miércoles", "el miércoles",
    "hoy", "ayer", "mañana", "servicio", "servicios", "el presidente",
    "presidente", "el ministro", "ministro", "la guerra", "guerra",
    "seguridad", "wp-content", "internal_photos", "/internal_photos",
    "https", "http", "src"
}

# Populated in main() from the spaCy model's stop word list.
STOPWORDS = set()

ARTICLES = {
    "el", "la", "los", "las", "un", "una", "uno", "al", "del"
}

# Limits
TOPIC_MIN_CHARS = 4
TOPIC_MAX_WORDS = 6
TOPIC_MAX_PER_DOC = 15

# ==========================================================
# Helpers
# ==========================================================
def get_conn():
    return psycopg2.connect(**DB)

def _looks_like_attr_or_path(text_lower: str) -> bool:
    """Filters junk such as paths, HTML, attributes, URLs, etc."""
    if text_lower.startswith("/"):
        return True
    if "http://" in text_lower or "https://" in text_lower:
        return True
    if any(ext in text_lower for ext in (".jpg", ".jpeg", ".png", ".gif", ".webp")):
        return True
    if re.search(r"\b(src|alt|style|class)\s*=", text_lower):
        return True
    if "data-" in text_lower:
        return True
    if re.search(r"&#\d+;?", text_lower):
        return True
    if "=" in text_lower and " " not in text_lower.strip():
        return True

    # hash-like tokens such as "ajsdh7287sdhjshd8" (only when there are no spaces)
    if " " not in text_lower and re.fullmatch(r"[a-z0-9_]{15,}", text_lower.replace("-", "")):
        return True

    # long space-free words with hyphens
    if "-" in text_lower and " " not in text_lower:
        return True

    return False
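
# Illustrative spot checks (not from the source; behavior follows the rules above):
#   _looks_like_attr_or_path("/internal_photos/x.png")  -> True   (leading "/" and image extension)
#   _looks_like_attr_or_path("class=headline")          -> True   (attribute assignment)
#   _looks_like_attr_or_path("estados unidos")          -> False  (plain multi-word text)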

# ==========================================================
# Entity cleanup
# ==========================================================
def clean_tag_text(text: str) -> str | None:
    if not text:
        return None

    text = BeautifulSoup(text, "html.parser").get_text()
    for pat in HTML_TRASH_PATTERNS:
        text = re.sub(pat, "", text)

    text = _ws_re.sub(" ", text).strip()
    text = text.strip(string.punctuation + " ")

    if len(text) < 3:
        log.debug(f"Clean reject (too short): {text}")
        return None
    if re.search(r"[<>/\\]", text):
        log.debug(f"Clean reject (bad chars): {text}")
        return None

    lower = text.lower()
    if lower.startswith("href="):
        log.debug(f"Clean reject (href): {text}")
        return None
    if _looks_like_attr_or_path(lower):
        log.debug(f"Clean reject (attr/path): {text}")
        return None
    if lower in GENERIC_BAD_TAGS:
        log.debug(f"Clean reject (generic bad): {text}")
        return None

    replacements = {
        # Trailing punctuation was stripped above, so also match the stripped form.
        "ee.uu": "Estados Unidos",
        "ee.uu.": "Estados Unidos",
        "los estados unidos": "Estados Unidos",
        "eeuu": "Estados Unidos",
        "eu": "Unión Europea",
        "ue": "Unión Europea",
        "kosova": "Kosovo",
        # Specific user requests
        "trump": "Donald Trump",
        "mr. trump": "Donald Trump",
        "mr trump": "Donald Trump",
        "doland trump": "Donald Trump",  # common misspelling
        "el presidente trump": "Donald Trump",
        "president trump": "Donald Trump",
        "ex-president trump": "Donald Trump",
        "expresidente trump": "Donald Trump",
        "putin": "Vladimir Putin",
        "vladimir putin": "Vladimir Putin",
        "v. putin": "Vladimir Putin",
        "presidente putin": "Vladimir Putin",
        # New requests
        "sanchez": "Pedro Sánchez",
        "pedro sanchez": "Pedro Sánchez",
        "p. sanchez": "Pedro Sánchez",
        "mr. sanchez": "Pedro Sánchez",
        "sánchez": "Pedro Sánchez",  # explicit match just in case
        "pedro sánchez": "Pedro Sánchez",
        "maduro": "Nicolás Maduro",
        "nicolas maduro": "Nicolás Maduro",
        "mr. maduro": "Nicolás Maduro",
        "lula": "Lula da Silva",
        "lula da silva": "Lula da Silva",
        "luiz inácio lula da silva": "Lula da Silva",
    }
    if lower in replacements:
        return replacements[lower]

    # Blacklist (explicit removals requested)
    blacklist = {
        "getty images", "netflix", "fiscalia", "segun", "estoy",  # people blacklist
        "and more", "app", "estamos", "ultra",  # orgs blacklist
        "hacienda", "fiscalía"
    }
    if lower in blacklist:
        log.debug(f"Clean reject (blacklist): {text}")
        return None

    return text
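
# Illustrative spot checks (assuming the rules above; not from the source):
#   clean_tag_text("<b>Trump</b>")  -> "Donald Trump"   (HTML stripped, VIP normalized)
#   clean_tag_text("EE.UU.")        -> "Estados Unidos" (trailing dot stripped, then mapped)
#   clean_tag_text("hoy")           -> None             (generic bad tag)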

# ==========================================================
# Topic cleanup (noun chunks)
# ==========================================================
def clean_topic_text(text: str) -> str | None:
    if not text:
        return None

    text = BeautifulSoup(text, "html.parser").get_text()
    for pat in HTML_TRASH_PATTERNS:
        text = re.sub(pat, "", text)

    text = _ws_re.sub(" ", text).strip()
    text = text.strip(string.punctuation + " ")

    if len(text) < TOPIC_MIN_CHARS:
        return None

    lower = text.lower()
    if _looks_like_attr_or_path(lower):
        return None

    tokens = [
        t.strip(string.punctuation)
        for t in lower.split()
        if t.strip(string.punctuation)
    ]
    if not tokens:
        return None

    # strip a leading article
    if tokens[0] in ARTICLES:
        tokens = tokens[1:]
    if not tokens:
        return None

    norm = " ".join(tokens).strip()

    if len(norm) < TOPIC_MIN_CHARS:
        return None
    if norm in GENERIC_BAD_TAGS:
        return None
    if len(tokens) > TOPIC_MAX_WORDS:
        return None
    if all(t in STOPWORDS for t in tokens):
        return None
    if re.fullmatch(r"[0-9\s\.,\-:/]+", norm):
        return None

    return norm
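
# Illustrative spot checks (not from the source):
#   clean_topic_text("la guerra en Ucrania") -> "guerra en ucrania" (article dropped, lowercased)
#   clean_topic_text("el 25%")               -> None (too short after the article and "%" are stripped)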

# ==========================================================
# NER + topic extraction
# ==========================================================
def extract_entities_and_topics(nlp, text: str) -> Tuple[List[Tuple[str, str]], List[Tuple[str, str]]]:
    ents = []
    topics = []

    if not text:
        return ents, topics

    doc = nlp(text)
    # log.debug(f"Processing text ({len(text)} chars): {text[:30]}...")
    # log.debug(f"Entities found: {len(doc.ents)}")

    # --- ENTITIES ---
    for ent in doc.ents:
        tipo = ENT_LABELS.get(ent.label_)
        if not tipo:
            continue

        cleaned = clean_tag_text(ent.text)
        if not cleaned:
            # log.debug(f"Rejected entity: {ent.text} ({ent.label_})")
            continue

        if tipo == "persona":
            lower_cleaned = cleaned.lower()
            # Aggressive normalization rules for VIPs.
            # Use token checks or substring checks carefully.
            if "trump" in lower_cleaned.split():
                # The token "trump" is present, e.g. "donald trump", "trump", "mr. trump".
                # Exclude family members.
                family = ["ivanka", "melania", "eric", "tiffany", "barron", "lara", "mary", "fred"]
                if not any(f in lower_cleaned for f in family):
                    cleaned = "Donald Trump"

            elif "sanchez" in lower_cleaned or "sánchez" in lower_cleaned:
                # Other people named Sánchez exist, but the feed context implies Pedro.
                if "pedro" in lower_cleaned or "presidente" in lower_cleaned or lower_cleaned in ["sanchez", "sánchez"]:
                    cleaned = "Pedro Sánchez"

            elif "maduro" in lower_cleaned:
                cleaned = "Nicolás Maduro"

            elif "lula" in lower_cleaned:
                cleaned = "Lula da Silva"

            elif "putin" in lower_cleaned:
                cleaned = "Vladimir Putin"

        # log.debug(f"Accepted entity: {cleaned} ({tipo})")
        ents.append((cleaned, tipo))

    # --- TOPICS ---
    topic_counter = Counter()
    for chunk in doc.noun_chunks:
        cleaned = clean_topic_text(chunk.text)
        if cleaned:
            topic_counter[cleaned] += 1

    ent_values = {v for (v, _) in ents}

    for val, _count in topic_counter.most_common(TOPIC_MAX_PER_DOC):
        if val in ent_values:
            continue
        topics.append((val, "tema"))

    # Deduplicate; note that set() does not preserve frequency order.
    return list(set(ents)), list(set(topics))
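
# Rough sketch of the expected shape (illustrative; actual spans depend on the model):
#   ents, topics = extract_entities_and_topics(nlp, "Putin se reunió con Maduro en Caracas")
#   ents   ~ [("Vladimir Putin", "persona"), ("Nicolás Maduro", "persona"), ("Caracas", "lugar")]
#   topics ~ deduplicated noun chunks tagged "tema" that are not already entities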

# ==========================================================
# Main worker
# ==========================================================
def main():
    global STOPWORDS

    # Load spaCy
    log.info("Loading spaCy model es_core_news_md...")
    nlp = spacy.load("es_core_news_md", disable=["lemmatizer", "textcat"])
    STOPWORDS = set(nlp.Defaults.stop_words)
    log.info("spaCy model loaded.")

    while True:
        try:
            # closing() releases the connection when the batch is done; the
            # nested `conn` block commits on success and rolls back on error.
            with closing(get_conn()) as conn, conn, conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
                cur.execute(
                    """
                    SELECT t.id, t.titulo_trad, t.resumen_trad
                    FROM traducciones t
                    WHERE t.status = 'done'
                      AND t.lang_to = %s
                      AND NOT EXISTS (
                          SELECT 1 FROM tags_noticia tn WHERE tn.traduccion_id = t.id
                      )
                    ORDER BY t.id DESC
                    LIMIT %s;
                    """,
                    (NER_LANG, BATCH),
                )

                rows = cur.fetchall()

                if not rows:
                    time.sleep(5)
                    continue

                log.info(f"Processing {len(rows)} translations for NER/topics...")

                inserted_links = 0

                for r in rows:
                    text = f"{r['titulo_trad'] or ''}\n{r['resumen_trad'] or ''}".strip()
                    if not text:
                        continue

                    ents, topics = extract_entities_and_topics(nlp, text)
                    tags = ents + topics
                    if not tags:
                        continue

                    for valor, tipo in tags:
                        try:
                            # Savepoint so one failed tag does not poison the
                            # transaction for the rest of the batch.
                            cur.execute("SAVEPOINT tag_sp;")
                            cur.execute(
                                """
                                INSERT INTO tags (valor, tipo)
                                VALUES (%s, %s)
                                ON CONFLICT (valor, tipo)
                                DO UPDATE SET valor = EXCLUDED.valor
                                RETURNING id;
                                """,
                                (valor, tipo),
                            )
                            tag_id = cur.fetchone()[0]

                            cur.execute(
                                """
                                INSERT INTO tags_noticia (traduccion_id, tag_id)
                                VALUES (%s, %s)
                                ON CONFLICT DO NOTHING;
                                """,
                                (r["id"], tag_id),
                            )

                            if cur.rowcount > 0:
                                inserted_links += 1

                            cur.execute("RELEASE SAVEPOINT tag_sp;")

                        except Exception:
                            cur.execute("ROLLBACK TO SAVEPOINT tag_sp;")
                            log.exception("Error inserting tag/link")

                conn.commit()
                log.info(f"NER batch OK. New tags_noticia links: {inserted_links}")

        except Exception:
            log.exception("Unhandled error in NER loop")
            time.sleep(5)
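
# Schema assumptions (inferred from the queries above, not stated elsewhere in
# this diff): `tags(id, valor, tipo)` needs a UNIQUE (valor, tipo) constraint
# for the ON CONFLICT upsert, and `tags_noticia(traduccion_id, tag_id)` needs a
# unique pair constraint for ON CONFLICT DO NOTHING to deduplicate links.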

if __name__ == "__main__":
    main()