"""NER/topic tagging worker.

Polls PostgreSQL for translated items that have no tags yet, runs spaCy NER
plus noun-chunk extraction on the translated title/summary, and stores the
results in the tags / tags_noticia tables.
"""

import os
import time
import logging
import re
import string
from typing import List, Tuple
from collections import Counter

import psycopg2
import psycopg2.extras
import spacy
from bs4 import BeautifulSoup

logging.basicConfig(level=logging.INFO, format='[%(asctime)s] %(levelname)s: %(message)s')

DB = dict(
    host=os.environ.get("DB_HOST", "localhost"),
    port=int(os.environ.get("DB_PORT", 5432)),
    dbname=os.environ.get("DB_NAME", "rss"),
    user=os.environ.get("DB_USER", "rss"),
    password=os.environ.get("DB_PASS", "x"),
)

NER_LANG = os.environ.get("NER_LANG", "es").strip().lower()
BATCH = int(os.environ.get("NER_BATCH", 64))

# Map spaCy entity labels to the tag types stored in the database.
# es_core_news_md emits PER/ORG/LOC/MISC, so "PER" is the label that actually
# fires for people with the Spanish model; "PERSON"/"GPE" are kept in case an
# English-style model is swapped in.
ENT_LABELS = {
    "PER": "persona",
    "PERSON": "persona",
    "ORG": "organizacion",
    "GPE": "lugar",
    "LOC": "lugar",
}

_ws_re = re.compile(r"\s+")

# Patterns for leftover HTML that survives BeautifulSoup's get_text().
HTML_TRASH_PATTERNS = [
    r"<[^>]+>",
    r"&[a-z]+;",
    r"&#\d+;?",
    r'width="\d+"',
    r'height="\d+"',
]

# Overly generic or junk strings that should never become tags.
GENERIC_BAD_TAGS = {
    "república", "estado", "centro", "gobierno", "el gobierno", "gobiernos",
    "report", "sp", "unión", "union", "dólares", "dolar", "dólar",
    "the post", "post", "artículo", "el artículo", "la ciudad", "mundo",
    "país", "pais", "países", "paises", "la noche", "la publicación",
    "este miércoles", "el miércoles", "hoy", "ayer", "mañana",
    "servicio", "servicios", "el presidente", "presidente",
    "el ministro", "ministro", "la guerra", "guerra", "seguridad",
    "wp-content", "internal_photos", "/internal_photos",
    "https", "http", "src",
}

# Filled in main() from the spaCy model's stop word list.
STOPWORDS = set()

# Leading articles stripped from topic candidates.
ARTICLES = {
    "el", "la", "los", "las", "un", "una", "uno", "al", "del",
}

TOPIC_MIN_CHARS = 4
TOPIC_MAX_WORDS = 6
TOPIC_MAX_PER_DOC = 15


def _looks_like_attr_or_path(text_lower: str) -> bool:
    """Heuristics to reject URLs, file paths and HTML attribute fragments."""
    if text_lower.startswith("/"):
        return True
    if "http://" in text_lower or "https://" in text_lower:
        return True
    if ".com" in text_lower or ".net" in text_lower or ".org" in text_lower:
        return True
    if any(ext in text_lower for ext in (".jpg", ".jpeg", ".png", ".gif", ".webp")):
        return True
    if re.search(r"\b(src|alt|style|class)\s*=", text_lower):
        return True
    if "data-" in text_lower:
        return True
    if re.search(r"&#\d+;?", text_lower):
        return True
    if "=" in text_lower and " " not in text_lower.strip():
        return True
    # Long single tokens of letters/digits/underscores are almost always slugs.
    if re.fullmatch(r"[a-z0-9_]{15,}", text_lower.replace("-", "").replace(" ", "")):
        return True
    # Single hyphenated tokens ("wp-content", "data-src", ...) are slugs too.
    if "-" in text_lower and " " not in text_lower:
        return True
    return False


def clean_tag_text(text: str) -> str | None:
    """Normalize an entity string; return None if it is junk."""
    if not text:
        return None
    text = BeautifulSoup(text, "html.parser").get_text()
    for pat in HTML_TRASH_PATTERNS:
        text = re.sub(pat, "", text)
    text = _ws_re.sub(" ", text).strip()
    text = text.strip(string.punctuation + " ")
    if len(text) < 3:
        return None
    if re.search(r"[<>/\\]", text):
        return None
    lower = text.lower()
    if lower.startswith("href="):
        return None
    if _looks_like_attr_or_path(lower):
        return None
    if lower in GENERIC_BAD_TAGS:
        return None
    # Canonicalize common variants. Trailing punctuation has already been
    # stripped above, so "EE.UU." reaches this point as "ee.uu".
    replacements = {
        "ee.uu.": "Estados Unidos",
        "ee.uu": "Estados Unidos",
        "los estados unidos": "Estados Unidos",
        "eeuu": "Estados Unidos",
        "eu": "Unión Europea",
        "ue": "Unión Europea",
        "kosova": "Kosovo",
    }
    if lower in replacements:
        text = replacements[lower]
    return text


def clean_topic_text(text: str) -> str | None:
    """Normalize a noun-chunk topic candidate; return None if it is junk."""
    if not text:
        return None
    text = BeautifulSoup(text, "html.parser").get_text()
    for pat in HTML_TRASH_PATTERNS:
        text = re.sub(pat, "", text)
    text = _ws_re.sub(" ", text).strip()
    text = text.strip(string.punctuation + " ")
    if len(text) < TOPIC_MIN_CHARS:
        return None
    lower = text.lower()
    if _looks_like_attr_or_path(lower):
        return None
    tokens = [
        t.strip(string.punctuation)
        for t in lower.split()
        if t.strip(string.punctuation)
    ]
    if not tokens:
        return None
    # Drop a leading article ("la guerra" -> "guerra").
    if tokens[0] in ARTICLES:
        tokens = tokens[1:]
    if not tokens:
        return None
    norm = " ".join(tokens).strip()
    if len(norm) < TOPIC_MIN_CHARS:
        return None
    if norm in GENERIC_BAD_TAGS:
        return None
    if len(tokens) > TOPIC_MAX_WORDS:
        return None
    if all(t in STOPWORDS for t in tokens):
        return None
    # Pure numbers/dates/punctuation are not topics.
    if re.fullmatch(r"[0-9\s\.,\-:/]+", norm):
        return None
    return norm


def get_conn():
    return psycopg2.connect(**DB)


def extract_entities_and_topics(nlp, text: str) -> Tuple[List[Tuple[str, str]], List[Tuple[str, str]]]:
    """Return ([(valor, tipo), ...] entities, [(valor, "tema"), ...] topics)."""
    ents: List[Tuple[str, str]] = []
    topics: List[Tuple[str, str]] = []
    if not text:
        return ents, topics
    doc = nlp(text)

    for ent in doc.ents:
        tipo = ENT_LABELS.get(ent.label_)
        if not tipo:
            continue
        val = clean_tag_text(ent.text)
        if not val:
            continue
        ents.append((val, tipo))

    topic_counter: Counter[str] = Counter()
    for chunk in doc.noun_chunks:
        val = clean_topic_text(chunk.text)
        if not val:
            continue
        topic_counter[val] += 1

    # Keep the most frequent noun chunks that are not already entities.
    ent_values = {v for (v, _) in ents}
    for val, _count in topic_counter.most_common(TOPIC_MAX_PER_DOC):
        if val in ent_values:
            continue
        topics.append((val, "tema"))

    ents = list(set(ents))
    topics = list(set(topics))
    return ents, topics


def main():
    global STOPWORDS
    # The parser stays enabled because doc.noun_chunks needs it.
    nlp = spacy.load("es_core_news_md", disable=["lemmatizer", "textcat"])
    STOPWORDS = set(nlp.Defaults.stop_words)
    logging.info("spaCy loaded: es_core_news_md (NER + parser)")

    while True:
        conn = None
        try:
            conn = get_conn()
            with conn, conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
                # Translations finished for the target language that have no
                # tags linked yet.
                cur.execute(
                    """
                    WITH pend AS (
                        SELECT t.id, t.titulo_trad, t.resumen_trad
                        FROM traducciones t
                        LEFT JOIN tags_noticia tn ON tn.traduccion_id = t.id
                        WHERE t.status = 'done'
                          AND t.lang_to = %s
                        GROUP BY t.id, t.titulo_trad, t.resumen_trad
                        HAVING COUNT(tn.tag_id) = 0
                        ORDER BY t.id DESC
                        LIMIT %s
                    )
                    SELECT * FROM pend;
                    """,
                    (NER_LANG, BATCH),
                )
                rows = cur.fetchall()
                if not rows:
                    time.sleep(5)
                    continue

                logging.info(f"Processing {len(rows)} translations for NER/topics...")
                new_links = 0
                for r in rows:
                    text = f"{r['titulo_trad'] or ''}\n{r['resumen_trad'] or ''}".strip()
                    if not text:
                        continue
                    ents, topics = extract_entities_and_topics(nlp, text)
                    all_tags = ents + topics
                    if not all_tags:
                        continue
                    for valor, tipo in all_tags:
                        # A savepoint keeps one failed insert from aborting the
                        # transaction for the rest of the batch.
                        cur.execute("SAVEPOINT tag_sp")
                        try:
                            cur.execute(
                                """
                                INSERT INTO tags (valor, tipo)
                                VALUES (%s, %s)
                                ON CONFLICT (valor, tipo) DO UPDATE SET valor = EXCLUDED.valor
                                RETURNING id
                                """,
                                (valor, tipo),
                            )
                            tag_id = cur.fetchone()[0]
                            cur.execute(
                                """
                                INSERT INTO tags_noticia (traduccion_id, tag_id)
                                VALUES (%s, %s)
                                ON CONFLICT DO NOTHING
                                """,
                                (r["id"], tag_id),
                            )
                            if cur.rowcount > 0:
                                new_links += 1
                            cur.execute("RELEASE SAVEPOINT tag_sp")
                        except Exception:
                            cur.execute("ROLLBACK TO SAVEPOINT tag_sp")
                            logging.exception("Failed to insert tag/link")
                conn.commit()
                logging.info(f"NER/topics batch OK. New links: {new_links}.")
        except Exception:
            logging.exception("Error in NER loop")
            time.sleep(5)
        finally:
            # "with conn" only manages the transaction; close the connection
            # explicitly so each loop iteration does not leak one.
            if conn is not None:
                conn.close()


if __name__ == "__main__":
    main()
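
# --- Assumed schema (sketch, not taken from this file) -----------------------
# The queries above only work against tables shaped roughly like the DDL below.
# Column types and names beyond those referenced in the SQL are guesses for
# illustration. The one hard requirement is a unique constraint on
# tags(valor, tipo), which "ON CONFLICT (valor, tipo)" depends on, plus some
# uniqueness on tags_noticia(traduccion_id, tag_id) so that
# "ON CONFLICT DO NOTHING" actually deduplicates links.
#
#   CREATE TABLE tags (
#       id     SERIAL PRIMARY KEY,
#       valor  TEXT NOT NULL,
#       tipo   TEXT NOT NULL,
#       UNIQUE (valor, tipo)
#   );
#
#   CREATE TABLE tags_noticia (
#       traduccion_id  INTEGER NOT NULL REFERENCES traducciones (id),
#       tag_id         INTEGER NOT NULL REFERENCES tags (id),
#       PRIMARY KEY (traduccion_id, tag_id)
#   );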