import os
import time
import logging
import re
import string
from typing import List, Tuple
from collections import Counter

import psycopg2
import psycopg2.extras
import spacy
from bs4 import BeautifulSoup

logging.basicConfig(level=logging.INFO, format='[%(asctime)s] %(levelname)s: %(message)s')

DB = dict(
    host=os.environ.get("DB_HOST", "localhost"),
    port=int(os.environ.get("DB_PORT", 5432)),
    dbname=os.environ.get("DB_NAME", "rss"),
    user=os.environ.get("DB_USER", "rss"),
    password=os.environ.get("DB_PASS", "x"),
)

NER_LANG = os.environ.get("NER_LANG", "es").strip().lower()
BATCH = int(os.environ.get("NER_BATCH", 64))

# Mapping from spaCy entity labels to tag types.
# Note: the es_core_news_* models emit PER/ORG/LOC/MISC, so PER is required here;
# PERSON and GPE are kept for English-style label schemes.
ENT_LABELS = {
    "PER": "persona",
    "PERSON": "persona",
    "ORG": "organizacion",
    "GPE": "lugar",
    "LOC": "lugar",
}

_ws_re = re.compile(r"\s+")

HTML_TRASH_PATTERNS = [
    r"<[^>]+>",
    r"&[a-z]+;",
    r"&#\d+;?",  # numeric entities like …
    r'width="\d+"',
    r'height="\d+"',
]

# Words/phrases that are too generic, or clearly noise
GENERIC_BAD_TAGS = {
    "república", "estado", "centro", "gobierno", "el gobierno", "gobiernos",
    "report", "sp", "unión", "union", "dólares", "dolar", "dólar",
    "the post", "post", "artículo", "el artículo", "la ciudad", "mundo",
    "país", "pais", "países", "paises", "la noche", "la publicación",
    "este miércoles", "el miércoles", "hoy", "ayer", "mañana",
    "servicio", "servicios", "el presidente", "presidente",
    "el ministro", "ministro", "la guerra", "guerra", "seguridad",
    "wp-content", "internal_photos", "/internal_photos", "https", "http", "src",
}

# Filled in main() from the spaCy model's defaults
STOPWORDS = set()

ARTICLES = {
    "el", "la", "los", "las", "un", "una", "uno", "al", "del",
}

TOPIC_MIN_CHARS = 4
TOPIC_MAX_WORDS = 6
TOPIC_MAX_PER_DOC = 15


def _looks_like_attr_or_path(text_lower: str) -> bool:
    """Detects things that look like paths, HTML attributes, odd ids, etc."""
    if text_lower.startswith("/"):
        return True
    if "http://" in text_lower or "https://" in text_lower:
        return True
    if ".com" in text_lower or ".net" in text_lower or ".org" in text_lower:
        return True
    if any(ext in text_lower for ext in (".jpg", ".jpeg", ".png", ".gif", ".webp")):
        return True
    if re.search(r"\b(src|alt|style|class)\s*=", text_lower):
        return True
    if "data-" in text_lower:
        return True
    if re.search(r"&#\d+;?", text_lower):
        return True
    # things like attribute=value
    if "=" in text_lower and " " not in text_lower.strip():
        return True
    # long strings with no spaces (ids, hashes…)
    if re.fullmatch(r"[a-z0-9_]{15,}", text_lower.replace("-", "").replace(" ", "")):
        return True
    # a single hyphenated word is usually a path/slug: wp-content, internal-photos…
    if "-" in text_lower and " " not in text_lower:
        return True
    return False


def clean_tag_text(text: str) -> str | None:
    """Cleanup for entities (PERSON/ORG/GPE/LOC)."""
    if not text:
        return None
    text = BeautifulSoup(text, "html.parser").get_text()
    for pat in HTML_TRASH_PATTERNS:
        text = re.sub(pat, "", text)
    text = _ws_re.sub(" ", text).strip()
    text = text.strip(string.punctuation + " ")
    if len(text) < 3:
        return None
    if re.search(r"[<>/\\]", text):
        return None
    lower = text.lower()
    if lower.startswith("href="):
        return None
    if _looks_like_attr_or_path(lower):
        return None
    if lower in GENERIC_BAD_TAGS:
        return None
    replacements = {
        "ee.uu.": "Estados Unidos",
        "los estados unidos": "Estados Unidos",
        "eeuu": "Estados Unidos",
        "eu": "Unión Europea",
        "ue": "Unión Europea",
        "kosova": "Kosovo",
    }
    if lower in replacements:
        text = replacements[lower]
    return text


def clean_topic_text(text: str) -> str | None:
    """Cleanup for candidate 'topics' (noun_chunks)."""
    if not text:
        return None
    text = BeautifulSoup(text, "html.parser").get_text()
    for pat in HTML_TRASH_PATTERNS:
        text = re.sub(pat, "", text)
    text = _ws_re.sub(" ", text).strip()
    text = text.strip(string.punctuation + " ")
    if len(text) < TOPIC_MIN_CHARS:
        return None
    lower = text.lower()
    if _looks_like_attr_or_path(lower):
        return None
    # tokenize in lowercase and strip punctuation
    tokens = [
        t.strip(string.punctuation)
        for t in lower.split()
        if t.strip(string.punctuation)
    ]
    if not tokens:
        return None
    # drop a leading article if there is one
    if tokens and tokens[0] in ARTICLES:
        tokens = tokens[1:]
    if not tokens:
        return None
    # rebuild the normalized text without the article
    norm = " ".join(tokens).strip()
    if len(norm) < TOPIC_MIN_CHARS:
        return None
    if norm in GENERIC_BAD_TAGS:
        return None
    # maximum word count
    if len(tokens) > TOPIC_MAX_WORDS:
        return None
    # all stopwords => discard
    if all(t in STOPWORDS for t in tokens):
        return None
    # numbers/dates only
    if re.fullmatch(r"[0-9\s\.,\-:/]+", norm):
        return None
    return norm


def get_conn():
    return psycopg2.connect(**DB)


def extract_entities_and_topics(nlp, text: str) -> Tuple[List[Tuple[str, str]], List[Tuple[str, str]]]:
    ents: List[Tuple[str, str]] = []
    topics: List[Tuple[str, str]] = []
    if not text:
        return ents, topics
    doc = nlp(text)

    # "Classic" named entities
    for ent in doc.ents:
        tipo = ENT_LABELS.get(ent.label_)
        if not tipo:
            continue
        val = clean_tag_text(ent.text)
        if not val:
            continue
        ents.append((val, tipo))

    # Topic candidates from noun_chunks
    topic_counter: Counter[str] = Counter()
    for chunk in doc.noun_chunks:
        val = clean_topic_text(chunk.text)
        if not val:
            continue
        topic_counter[val] += 1

    ent_values = {v for (v, _) in ents}
    for val, _count in topic_counter.most_common(TOPIC_MAX_PER_DOC):
        if val in ent_values:
            continue
        topics.append((val, "tema"))

    # de-duplicate
    ents = list(set(ents))
    topics = list(set(topics))
    return ents, topics


def main():
    global STOPWORDS
    nlp = spacy.load("es_core_news_md", disable=["lemmatizer", "textcat"])
    STOPWORDS = set(nlp.Defaults.stop_words)
    logging.info("spaCy loaded: es_core_news_md (NER + parser)")

    while True:
        try:
            with get_conn() as conn, conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
                # Translations that are done but have no tags yet
                cur.execute(
                    """
                    WITH pend AS (
                        SELECT t.id, t.titulo_trad, t.resumen_trad
                        FROM traducciones t
                        LEFT JOIN tags_noticia tn ON tn.traduccion_id = t.id
                        WHERE t.status = 'done'
                          AND t.lang_to = %s
                        GROUP BY t.id, t.titulo_trad, t.resumen_trad
                        HAVING COUNT(tn.tag_id) = 0
                        ORDER BY t.id DESC
                        LIMIT %s
                    )
                    SELECT * FROM pend;
                    """,
                    (NER_LANG, BATCH),
                )
                rows = cur.fetchall()
                if not rows:
                    time.sleep(5)
                    continue

                logging.info(f"Processing {len(rows)} translations for NER/topics...")
                new_links = 0
                for r in rows:
                    text = f"{r['titulo_trad'] or ''}\n{r['resumen_trad'] or ''}".strip()
                    if not text:
                        continue
                    ents, topics = extract_entities_and_topics(nlp, text)
                    all_tags = ents + topics
                    if not all_tags:
                        continue
                    for valor, tipo in all_tags:
                        try:
                            cur.execute(
                                """
                                INSERT INTO tags (valor, tipo)
                                VALUES (%s, %s)
                                ON CONFLICT (valor, tipo)
                                DO UPDATE SET valor = EXCLUDED.valor
                                RETURNING id
                                """,
                                (valor, tipo),
                            )
                            tag_id = cur.fetchone()[0]
                            cur.execute(
                                """
                                INSERT INTO tags_noticia (traduccion_id, tag_id)
                                VALUES (%s, %s)
                                ON CONFLICT DO NOTHING
                                """,
                                (r["id"], tag_id),
                            )
                            if cur.rowcount > 0:
                                new_links += 1
                        except Exception:
                            logging.exception("Failed to insert tag/link")
                conn.commit()
                logging.info(f"NER/topics batch OK. New links: {new_links}.")
        except Exception:
            logging.exception("Error in NER loop")
            time.sleep(5)


if __name__ == "__main__":
    main()
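
# ---------------------------------------------------------------------------
# Assumed schema sketch (reference only, not executed by this script).
# The SQL above implies a unique constraint on tags(valor, tipo) for the
# upsert, a unique key on tags_noticia(traduccion_id, tag_id) for
# ON CONFLICT DO NOTHING, and a traducciones table with id, titulo_trad,
# resumen_trad, status and lang_to columns. The column types and foreign
# keys below are assumptions for illustration; only the column names and
# constraints are implied by the queries in this file.
#
#   CREATE TABLE IF NOT EXISTS tags (
#       id    BIGSERIAL PRIMARY KEY,
#       valor TEXT NOT NULL,
#       tipo  TEXT NOT NULL,
#       UNIQUE (valor, tipo)
#   );
#
#   CREATE TABLE IF NOT EXISTS tags_noticia (
#       traduccion_id BIGINT NOT NULL REFERENCES traducciones (id),
#       tag_id        BIGINT NOT NULL REFERENCES tags (id),
#       PRIMARY KEY (traduccion_id, tag_id)
#   );
# ---------------------------------------------------------------------------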