import os
import time
import logging
import re
import string
from typing import List, Tuple
from collections import Counter

import psycopg2
import psycopg2.extras
import spacy
from bs4 import BeautifulSoup

# ==========================================================
# Logging
# ==========================================================
logging.basicConfig(
    level=logging.INFO,
    format='[NER] %(asctime)s %(levelname)s: %(message)s'
)
log = logging.getLogger("ner_worker")

# ==========================================================
# DB config
# ==========================================================
DB = dict(
    host=os.environ.get("DB_HOST", "localhost"),
    port=int(os.environ.get("DB_PORT", 5432)),
    dbname=os.environ.get("DB_NAME", "rss"),
    user=os.environ.get("DB_USER", "rss"),
    password=os.environ.get("DB_PASS", "x"),
)

NER_LANG = os.environ.get("NER_LANG", "es").strip().lower()
BATCH = int(os.environ.get("NER_BATCH", 64))
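
# ==========================================================
# Assumed DB schema (sketch)
# ==========================================================
# The DDL is not part of this file; the sketch below is inferred from the
# queries used further down and is only an assumption, not the actual schema.
# The worker reads from `traducciones` (id, titulo_trad, resumen_trad, status,
# lang_to) and writes to `tags` / `tags_noticia`. The ON CONFLICT (valor, tipo)
# clause requires a unique constraint on that pair, and ON CONFLICT DO NOTHING
# on `tags_noticia` implies some unique constraint such as (traduccion_id, tag_id):
#
#   CREATE TABLE tags (
#       id    SERIAL PRIMARY KEY,
#       valor TEXT NOT NULL,
#       tipo  TEXT NOT NULL,
#       UNIQUE (valor, tipo)
#   );
#
#   CREATE TABLE tags_noticia (
#       traduccion_id INTEGER NOT NULL,   -- assumed FK to traducciones(id)
#       tag_id        INTEGER NOT NULL REFERENCES tags (id),
#       UNIQUE (traduccion_id, tag_id)
#   );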

# ==========================================================
# Mapping of spaCy entity labels → our SQL model
# ==========================================================
ENT_LABELS = {
    "PERSON": "persona",
    "PER": "persona",
    "ORG": "organizacion",
    "GPE": "lugar",
    "LOC": "lugar",
    "MISC": "tema",
}

# ==========================================================
# Advanced cleanup
# ==========================================================
_ws_re = re.compile(r"\s+")

HTML_TRASH_PATTERNS = [
    r"<[^>]+>",
    r"&[a-z]+;",
    r"&#\d+;?",
    r'width="\d+"',
    r'height="\d+"',
]

GENERIC_BAD_TAGS = {
    "república", "estado", "centro", "gobierno", "el gobierno", "gobiernos",
    "report", "sp", "unión", "union", "dólares", "dolar", "dólar",
    "the post", "post", "artículo", "el artículo", "la ciudad",
    "mundo", "país", "pais", "países", "paises",
    "la noche", "la publicación", "este miércoles", "el miércoles",
    "hoy", "ayer", "mañana", "servicio", "servicios",
    "el presidente", "presidente", "el ministro", "ministro",
    "la guerra", "guerra", "seguridad",
    "wp-content", "internal_photos", "/internal_photos", "https", "http", "src"
}

STOPWORDS = set()

ARTICLES = {
    "el", "la", "los", "las", "un", "una", "uno", "al", "del"
}

# Limits
TOPIC_MIN_CHARS = 4
TOPIC_MAX_WORDS = 6
TOPIC_MAX_PER_DOC = 15


# ==========================================================
# Helpers
# ==========================================================
def get_conn():
    return psycopg2.connect(**DB)


def _looks_like_attr_or_path(text_lower: str) -> bool:
    """Filters out junk such as paths, HTML attributes, URLs, etc."""
    if text_lower.startswith("/"):
        return True
    if "http://" in text_lower or "https://" in text_lower:
        return True
    if any(ext in text_lower for ext in (".jpg", ".jpeg", ".png", ".gif", ".webp")):
        return True
    if re.search(r"\b(src|alt|style|class)\s*=", text_lower):
        return True
    if "data-" in text_lower:
        return True
    if re.search(r"&#\d+;?", text_lower):
        return True
    if "=" in text_lower and " " not in text_lower.strip():
        return True
    # hash-like tokens such as "ajsdh7287sdhjshd8" (only if there are no spaces)
    if " " not in text_lower and re.fullmatch(r"[a-z0-9_]{15,}", text_lower.replace("-", "")):
        return True
    # long hyphenated words without spaces
    if "-" in text_lower and " " not in text_lower:
        return True
    return False


# ==========================================================
# Entity cleanup
# ==========================================================
def clean_tag_text(text: str) -> str | None:
    """Strips markup, rejects junk and generic terms, and normalizes known aliases."""
    if not text:
        return None
    text = BeautifulSoup(text, "html.parser").get_text()
    for pat in HTML_TRASH_PATTERNS:
        text = re.sub(pat, "", text)
    text = _ws_re.sub(" ", text).strip()
    text = text.strip(string.punctuation + " ")

    if len(text) < 3:
        log.debug(f"Clean reject (too short): {text}")
        return None
    if re.search(r"[<>/\\]", text):
        log.debug(f"Clean reject (bad chars): {text}")
        return None

    lower = text.lower()

    if lower.startswith("href="):
        log.debug(f"Clean reject (href): {text}")
        return None
    if _looks_like_attr_or_path(lower):
        log.debug(f"Clean reject (attr/path): {text}")
        return None
    if lower in GENERIC_BAD_TAGS:
        log.debug(f"Clean reject (generic bad): {text}")
        return None

    replacements = {
        "ee.uu.": "Estados Unidos",
        "los estados unidos": "Estados Unidos",
        "eeuu": "Estados Unidos",
        "eu": "Unión Europea",
        "ue": "Unión Europea",
        "kosova": "Kosovo",
        # Specific User Requests
        "trump": "Donald Trump",
        "mr. trump": "Donald Trump",
        "mr trump": "Donald Trump",
        "doland trump": "Donald Trump",
        "el presidente trump": "Donald Trump",
        "president trump": "Donald Trump",
        "ex-president trump": "Donald Trump",
        "expresidente trump": "Donald Trump",
        "putin": "Vladimir Putin",
        "vladimir putin": "Vladimir Putin",
        "v. putin": "Vladimir Putin",
        "presidente putin": "Vladimir Putin",
        # New requests
        "sanchez": "Pedro Sánchez",
        "pedro sanchez": "Pedro Sánchez",
        "p. sanchez": "Pedro Sánchez",
        "mr. sanchez": "Pedro Sánchez",
        "sánchez": "Pedro Sánchez",  # explicit match just in case
        "pedro sánchez": "Pedro Sánchez",
        "maduro": "Nicolás Maduro",
        "nicolas maduro": "Nicolás Maduro",
        "mr. maduro": "Nicolás Maduro",
        "lula": "Lula da Silva",
        "lula da silva": "Lula da Silva",
        "luiz inácio lula da silva": "Lula da Silva",
    }
    if lower in replacements:
        return replacements[lower]

    # Blacklist (explicit removals requested)
    blacklist = {
        "getty images", "netflix", "fiscalia", "segun", "estoy",
        # People blacklist
        "and more", "app", "estamos", "ultra",
        # Orgs blacklist
        "hacienda", "fiscalía"
    }
    if lower in blacklist:
        log.debug(f"Clean reject (blacklist): {text}")
        return None

    return text


# ==========================================================
# Topic cleanup (noun chunks)
# ==========================================================
def clean_topic_text(text: str) -> str | None:
    """Normalizes noun-chunk text into a short topic string, or returns None for junk."""
    if not text:
        return None
    text = BeautifulSoup(text, "html.parser").get_text()
    for pat in HTML_TRASH_PATTERNS:
        text = re.sub(pat, "", text)
    text = _ws_re.sub(" ", text).strip()
    text = text.strip(string.punctuation + " ")

    if len(text) < TOPIC_MIN_CHARS:
        return None

    lower = text.lower()
    if _looks_like_attr_or_path(lower):
        return None

    tokens = [
        t.strip(string.punctuation)
        for t in lower.split()
        if t.strip(string.punctuation)
    ]
    if not tokens:
        return None

    # strip leading articles
    if tokens[0] in ARTICLES:
        tokens = tokens[1:]
    if not tokens:
        return None

    norm = " ".join(tokens).strip()
    if len(norm) < TOPIC_MIN_CHARS:
        return None
    if norm in GENERIC_BAD_TAGS:
        return None
    if len(tokens) > TOPIC_MAX_WORDS:
        return None
    if all(t in STOPWORDS for t in tokens):
        return None
    if re.fullmatch(r"[0-9\s\.,\-:/]+", norm):
        return None

    return norm


# ==========================================================
# NER + topic extraction
# ==========================================================
def extract_entities_and_topics(nlp, text: str) -> Tuple[List[Tuple[str, str]], List[Tuple[str, str]]]:
    """Returns (entities, topics) as deduplicated lists of (value, type) tuples."""
    ents = []
    topics = []
    if not text:
        return ents, topics

    doc = nlp(text)
    # log.debug(f"Processing text ({len(text)} chars): {text[:30]}...")
    # log.debug(f"Entities found: {len(doc.ents)}")

    # --- ENTITIES ---
    for ent in doc.ents:
        tipo = ENT_LABELS.get(ent.label_)
        if not tipo:
            continue
        cleaned = clean_tag_text(ent.text)
        if not cleaned:
            # log.debug(f"Rejected entity: {ent.text} ({ent.label_})")
            continue

        if tipo == "persona":
            lower_cleaned = cleaned.lower()
            # Aggressive normalization rules for VIPs
            # Use token checks or substring checks carefully
            if "trump" in lower_cleaned.split():
                # Token 'trump' exists? e.g. "donald trump", "trump", "mr. trump"
                # Exclude family members
                family = ["ivanka", "melania", "eric", "tiffany", "barron", "lara", "mary", "fred"]
                if not any(f in lower_cleaned for f in family):
                    cleaned = "Donald Trump"
            elif "sanchez" in lower_cleaned or "sánchez" in lower_cleaned:
                # Be careful of other Sanchez? But user context implies Pedro.
                if "pedro" in lower_cleaned or "presidente" in lower_cleaned or lower_cleaned in ["sanchez", "sánchez"]:
                    cleaned = "Pedro Sánchez"
            elif "maduro" in lower_cleaned:
                cleaned = "Nicolás Maduro"
            elif "lula" in lower_cleaned:
                cleaned = "Lula da Silva"
            elif "putin" in lower_cleaned:
                cleaned = "Vladimir Putin"

        # log.debug(f"Accepted entity: {cleaned} ({tipo})")
        ents.append((cleaned, tipo))

    # --- TOPICS ---
    topic_counter = Counter()
    for chunk in doc.noun_chunks:
        cleaned = clean_topic_text(chunk.text)
        if cleaned:
            topic_counter[cleaned] += 1

    ent_values = {v for (v, _) in ents}
    for val, _count in topic_counter.most_common(TOPIC_MAX_PER_DOC):
        if val in ent_values:
            continue
        topics.append((val, "tema"))

    return list(set(ents)), list(set(topics))
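

# ==========================================================
# Manual smoke test (sketch)
# ==========================================================
# A minimal way to exercise the extraction pipeline without the database,
# assuming es_core_news_md is installed. `_smoke_test` and its sample sentence
# are illustrative additions, not part of the worker; the actual entities and
# topics returned depend on the spaCy model.
def _smoke_test() -> None:
    global STOPWORDS
    nlp = spacy.load("es_core_news_md", disable=["lemmatizer", "textcat"])
    STOPWORDS = set(nlp.Defaults.stop_words)  # mirror what main() does
    sample = "Donald Trump se reunió con Pedro Sánchez para hablar de la guerra comercial."
    ents, topics = extract_entities_and_topics(nlp, sample)
    print("entities:", ents)
    print("topics:", topics)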


# ==========================================================
# Main worker
# ==========================================================
def main():
    """Polls for translated items without tags, extracts entities/topics, and stores them."""
    global STOPWORDS

    # Load spaCy
    log.info("Loading spaCy model es_core_news_md...")
    nlp = spacy.load("es_core_news_md", disable=["lemmatizer", "textcat"])
    STOPWORDS = set(nlp.Defaults.stop_words)
    log.info("spaCy model loaded successfully.")

    while True:
        try:
            with get_conn() as conn, conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
                cur.execute(
                    """
                    SELECT t.id, t.titulo_trad, t.resumen_trad
                    FROM traducciones t
                    WHERE t.status = 'done'
                      AND t.lang_to = %s
                      AND NOT EXISTS (
                          SELECT 1 FROM tags_noticia tn
                          WHERE tn.traduccion_id = t.id
                      )
                    ORDER BY t.id DESC
                    LIMIT %s;
                    """,
                    (NER_LANG, BATCH),
                )
                rows = cur.fetchall()

                if not rows:
                    time.sleep(5)
                    continue

                log.info(f"Processing {len(rows)} translations for NER/topics...")

                inserted_links = 0

                for r in rows:
                    text = f"{r['titulo_trad'] or ''}\n{r['resumen_trad'] or ''}".strip()
                    if not text:
                        continue

                    ents, topics = extract_entities_and_topics(nlp, text)
                    tags = ents + topics
                    if not tags:
                        continue

                    for valor, tipo in tags:
                        try:
                            # Savepoint so one failed insert does not abort the whole batch transaction
                            cur.execute("SAVEPOINT tag_insert;")
                            cur.execute(
                                """
                                INSERT INTO tags (valor, tipo)
                                VALUES (%s, %s)
                                ON CONFLICT (valor, tipo) DO UPDATE SET valor = EXCLUDED.valor
                                RETURNING id;
                                """,
                                (valor, tipo),
                            )
                            tag_id = cur.fetchone()[0]

                            cur.execute(
                                """
                                INSERT INTO tags_noticia (traduccion_id, tag_id)
                                VALUES (%s, %s)
                                ON CONFLICT DO NOTHING;
                                """,
                                (r["id"], tag_id),
                            )
                            if cur.rowcount > 0:
                                inserted_links += 1
                            cur.execute("RELEASE SAVEPOINT tag_insert;")
                        except Exception:
                            log.exception("Error inserting tag/relation")
                            cur.execute("ROLLBACK TO SAVEPOINT tag_insert;")

                conn.commit()
                log.info(f"NER batch OK. New tag_noticia links: {inserted_links}")

        except Exception:
            log.exception("Unhandled error in NER loop")
            time.sleep(5)


if __name__ == "__main__":
    main()