"""NER tagging worker.

Polls the `traducciones` table for finished translations that have no tags
yet, runs spaCy named-entity recognition over title + summary, normalizes
the entity text, and links the resulting tags via `tags` / `tags_noticia`.
Runs forever; sleeps 5 s when idle or after an error.
"""
import logging
import os
import re
import time
from contextlib import closing

import psycopg2
import psycopg2.extras
import spacy
from bs4 import BeautifulSoup

logging.basicConfig(level=logging.INFO, format='[%(asctime)s] %(levelname)s: %(message)s')

# PostgreSQL connection parameters, overridable via environment.
DB = dict(
    host=os.environ.get("DB_HOST", "localhost"),
    port=int(os.environ.get("DB_PORT", 5432)),
    dbname=os.environ.get("DB_NAME", "rss"),
    user=os.environ.get("DB_USER", "rss"),
    password=os.environ.get("DB_PASS", "x"),
)

# Target translation language to process (matches traducciones.lang_to).
NER_LANG = os.environ.get("NER_LANG", "es").strip().lower()
# Max translations fetched per polling cycle.
BATCH = int(os.environ.get("NER_BATCH", 64))

# spaCy entity label -> tag type stored in the `tags` table.
# Labels not listed here are discarded.
ENT_LABELS = {
    "PERSON": "persona",
    "ORG": "organizacion",
    "GPE": "lugar",
    "LOC": "lugar",
}

_ws_re = re.compile(r"\s+")

# Leftover HTML fragments that sometimes survive BeautifulSoup extraction.
HTML_TRASH_PATTERNS = [
    r"<[^>]+>",
    r"&[a-z]+;",
    r'width="\d+"',
    r'height="\d+"',
]
# Pre-compiled once at import time; clean_tag_text runs per entity in a hot loop.
_HTML_TRASH_RES = [re.compile(p) for p in HTML_TRASH_PATTERNS]

# Characters that indicate the "entity" is really markup residue.
_BAD_CHARS_RE = re.compile(r"[<>/\\]")

# Entity strings too generic to be useful as tags (lowercased).
GENERIC_BAD_TAGS = {
    "república",
    "estado",
    "centro",
    "gobierno",
    "report",
    "sp",
    "unión",
}

# Canonical spellings for common entity variants (keyed by lowercased text).
# Hoisted to module level so the dict is not rebuilt on every call.
# NOTE(review): "eu" -> "Unión Europea" — in Spanish text "EU" sometimes
# abbreviates Estados Unidos; mapping kept as-is, confirm intent.
_CANONICAL_REPLACEMENTS = {
    "ee.uu.": "Estados Unidos",
    "los estados unidos": "Estados Unidos",
    "eu": "Unión Europea",
    "ue": "Unión Europea",
    "kosova": "Kosovo",
}


def clean_tag_text(text):
    """Normalize a raw entity string into a usable tag value.

    Strips HTML, collapses whitespace, rejects fragments that are too
    short, contain markup characters, look like URLs/attributes, or are
    on the generic blocklist, and canonicalizes known variants.

    Returns the cleaned string, or None when the text should be discarded.
    """
    if not text:
        return None
    text = BeautifulSoup(text, "html.parser").get_text()
    for pat in _HTML_TRASH_RES:
        text = pat.sub("", text)
    text = _ws_re.sub(" ", text).strip()
    if len(text) < 3:
        return None
    if _BAD_CHARS_RE.search(text):
        return None
    lower = text.lower()
    if lower.startswith("href="):
        return None
    if lower.startswith("http"):
        return None
    if lower in GENERIC_BAD_TAGS:
        return None
    return _CANONICAL_REPLACEMENTS.get(lower, text)


def get_conn():
    """Open a new PostgreSQL connection using the DB settings."""
    return psycopg2.connect(**DB)


# Translations that are done, in the target language, and have no tags yet.
_PENDING_SQL = """
    WITH pend AS (
        SELECT t.id, t.titulo_trad, t.resumen_trad
        FROM traducciones t
        LEFT JOIN tags_noticia tn ON tn.traduccion_id = t.id
        WHERE t.status = 'done' AND t.lang_to = %s
        GROUP BY t.id, t.titulo_trad, t.resumen_trad
        HAVING COUNT(tn.tag_id) = 0
        ORDER BY t.id DESC
        LIMIT %s
    )
    SELECT * FROM pend;
"""


def _fetch_pending(cur):
    """Fetch the next batch of untagged translations."""
    cur.execute(_PENDING_SQL, (NER_LANG, BATCH))
    return cur.fetchall()


def _extract_entities(nlp, text):
    """Run NER over *text* and return a deduplicated set of (valor, tipo)."""
    ents = set()
    for ent in nlp(text).ents:
        tipo = ENT_LABELS.get(ent.label_)
        if not tipo:
            continue
        val = clean_tag_text(ent.text)
        if val:
            ents.add((val, tipo))
    return ents


def _link_tags(cur, traduccion_id, ents):
    """Upsert each tag and link it to the translation.

    Each tag is wrapped in a SAVEPOINT so a single failed statement does
    not abort the surrounding transaction for the rest of the batch
    (psycopg2 leaves the transaction in an aborted state after an error
    until a rollback). Returns the number of newly created links.
    """
    new_links = 0
    for valor, tipo in ents:
        try:
            cur.execute("SAVEPOINT tag_ins")
            cur.execute(
                """
                INSERT INTO tags (valor, tipo)
                VALUES (%s, %s)
                ON CONFLICT (valor, tipo) DO UPDATE SET valor = EXCLUDED.valor
                RETURNING id
                """,
                (valor, tipo),
            )
            tag_id = cur.fetchone()[0]
            cur.execute(
                """
                INSERT INTO tags_noticia (traduccion_id, tag_id)
                VALUES (%s, %s)
                ON CONFLICT DO NOTHING
                """,
                (traduccion_id, tag_id),
            )
            if cur.rowcount > 0:
                new_links += 1
            cur.execute("RELEASE SAVEPOINT tag_ins")
        except Exception:
            # Undo only this tag's statements; keep the batch alive.
            cur.execute("ROLLBACK TO SAVEPOINT tag_ins")
            logging.exception("Fallo insertando tag/relación")
    return new_links


def main():
    """Main worker loop: poll, tag, persist, repeat forever."""
    nlp = spacy.load("es_core_news_md", disable=["parser", "lemmatizer", "textcat"])
    logging.info("spaCy cargado: es_core_news_md")
    while True:
        try:
            # closing() is required: psycopg2's connection context manager
            # only commits/rolls back — it does NOT close the connection,
            # which previously leaked one connection per loop iteration.
            with closing(get_conn()) as conn:
                with conn, conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
                    rows = _fetch_pending(cur)
                    if not rows:
                        time.sleep(5)
                        continue
                    logging.info("Procesando %d traducciones para NER...", len(rows))
                    new_links = 0
                    for r in rows:
                        text = f"{r['titulo_trad'] or ''}\n{r['resumen_trad'] or ''}".strip()
                        if not text:
                            continue
                        ents = _extract_entities(nlp, text)
                        if not ents:
                            continue
                        new_links += _link_tags(cur, r["id"], ents)
                    conn.commit()
                    logging.info("NER lote OK. Nuevos enlaces: %d.", new_links)
        except Exception as e:
            logging.exception("Error en NER loop: %s", e)
            time.sleep(5)


if __name__ == "__main__":
    main()