# rss/ner_worker.py

import os
import time
import logging
import re
import psycopg2
import psycopg2.extras
import spacy
from bs4 import BeautifulSoup

# Root logger: INFO and above, each record prefixed with a timestamp and level.
logging.basicConfig(
    format='[%(asctime)s] %(levelname)s: %(message)s',
    level=logging.INFO,
)

# Keyword arguments handed to psycopg2.connect(); each one can be overridden
# through the corresponding DB_* environment variable.
DB = {
    "host": os.getenv("DB_HOST", "localhost"),
    "port": int(os.getenv("DB_PORT", 5432)),
    "dbname": os.getenv("DB_NAME", "rss"),
    "user": os.getenv("DB_USER", "rss"),
    "password": os.getenv("DB_PASS", "x"),
}

# Target language of the translations to tag (compared against lang_to).
NER_LANG = os.getenv("NER_LANG", "es").strip().lower()
# Maximum number of translations fetched per polling round.
BATCH = int(os.getenv("NER_BATCH", 64))

# Maps spaCy entity labels to the value stored in the `tipo` column of `tags`.
# Entities whose label is not listed here are ignored.
#
# spaCy's Spanish news pipelines label people as "PER" (their NER scheme is
# LOC / MISC / ORG / PER), while the OntoNotes-style pipelines use "PERSON"
# and "GPE". Both spellings are accepted so person entities are not dropped
# when the Spanish model is loaded.
ENT_LABELS = {
    "PER": "persona",
    "PERSON": "persona",
    "ORG": "organizacion",
    "GPE": "lugar",
    "LOC": "lugar",
}

# Matches any run of whitespace; used to collapse it into a single space.
_ws_re = re.compile(r"\s+")

# Regular expressions whose matches are deleted from candidate tag text.
HTML_TRASH_PATTERNS = [
    r"<[^>]+>",       # leftover markup tags
    r"&[a-z]+;",      # lower-case named character entities
    r'width="\d+"',   # stray width attributes
    r'height="\d+"',  # stray height attributes
]

# Lower-cased tag texts considered too generic to keep.
GENERIC_BAD_TAGS = {
    "centro", "estado", "gobierno", "report", "república", "sp", "unión",
}


# Lower-cased aliases mapped to the canonical tag text they should become.
_TAG_ALIASES = {
    "ee.uu.": "Estados Unidos",
    "los estados unidos": "Estados Unidos",
    "eu": "Unión Europea",
    "ue": "Unión Europea",
    "kosova": "Kosovo",
}


def clean_tag_text(text):
    """Normalize raw entity text into a tag value, or reject it.

    The text is stripped of HTML (via BeautifulSoup plus the
    HTML_TRASH_PATTERNS regexes) and its whitespace is collapsed. Known
    aliases are then replaced by their canonical name; anything else is
    rejected when it is shorter than three characters, still contains
    markup/path characters, looks like a link, or is listed in
    GENERIC_BAD_TAGS.

    Args:
        text: Raw entity text; may be None or empty.

    Returns:
        The cleaned tag text, or None when the text should not become a tag.
    """
    if not text:
        return None

    text = BeautifulSoup(text, "html.parser").get_text()
    for pat in HTML_TRASH_PATTERNS:
        text = re.sub(pat, "", text)
    text = _ws_re.sub(" ", text).strip()
    lower = text.lower()

    # Aliases are resolved before the length filter: "EU" and "UE" are only
    # two characters long, so filtering first made those entries unreachable.
    canonical = _TAG_ALIASES.get(lower)
    if canonical is not None:
        return canonical

    if len(text) < 3:
        return None
    # Leftover angle brackets or slashes mean markup or a path leaked through.
    if re.search(r"[<>/\\]", text):
        return None
    if lower.startswith(("href=", "http")):
        return None
    if lower in GENERIC_BAD_TAGS:
        return None
    return text


def get_conn():
    """Open a new PostgreSQL connection configured from the module-level DB dict."""
    connection = psycopg2.connect(**DB)
    return connection


# Seconds to wait when there is nothing to do or after a failed round.
_IDLE_SLEEP_SECONDS = 5

# Finished translations in the target language that have no tag links yet.
_PENDING_SQL = """
    WITH pend AS (
      SELECT t.id, t.titulo_trad, t.resumen_trad
      FROM traducciones t
      LEFT JOIN tags_noticia tn ON tn.traduccion_id = t.id
      WHERE t.status = 'done'
        AND t.lang_to = %s
      GROUP BY t.id, t.titulo_trad, t.resumen_trad
      HAVING COUNT(tn.tag_id) = 0
      ORDER BY t.id DESC
      LIMIT %s
    )
    SELECT * FROM pend;
"""

# The no-op DO UPDATE makes RETURNING yield the id even for an existing tag.
_UPSERT_TAG_SQL = """
    INSERT INTO tags (valor, tipo)
    VALUES (%s, %s)
    ON CONFLICT (valor, tipo)
    DO UPDATE SET valor = EXCLUDED.valor
    RETURNING id
"""

_LINK_TAG_SQL = """
    INSERT INTO tags_noticia (traduccion_id, tag_id)
    VALUES (%s, %s)
    ON CONFLICT DO NOTHING
"""


def _extract_tags(nlp, row):
    """Run NER over a row's title and summary; return the set of (valor, tipo) tags."""
    text = f"{row['titulo_trad'] or ''}\n{row['resumen_trad'] or ''}".strip()
    if not text:
        return set()

    tags = set()
    for ent in nlp(text).ents:
        tipo = ENT_LABELS.get(ent.label_)
        if not tipo:
            continue
        valor = clean_tag_text(ent.text)
        if valor:
            tags.add((valor, tipo))
    return tags


def _link_tag(cur, traduccion_id, valor, tipo):
    """Upsert one tag and link it to a translation; return True if the link is new."""
    cur.execute(_UPSERT_TAG_SQL, (valor, tipo))
    tag_id = cur.fetchone()[0]
    cur.execute(_LINK_TAG_SQL, (traduccion_id, tag_id))
    return cur.rowcount > 0


def _process_batch(nlp):
    """Tag one batch of pending translations; return (rows_fetched, new_links)."""
    conn = get_conn()
    try:
        # `with conn` commits on success and rolls back on error, but it does
        # not close the connection; the `finally` below does that.
        with conn, conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
            cur.execute(_PENDING_SQL, (NER_LANG, BATCH))
            rows = cur.fetchall()
            if not rows:
                return 0, 0

            logging.info("Procesando %d traducciones para NER...", len(rows))

            new_links = 0
            for row in rows:
                for valor, tipo in _extract_tags(nlp, row):
                    # A failed statement aborts the whole PostgreSQL
                    # transaction. The savepoint confines the damage to this
                    # tag so the rest of the batch can still be committed.
                    cur.execute("SAVEPOINT ner_tag")
                    try:
                        if _link_tag(cur, row["id"], valor, tipo):
                            new_links += 1
                    except Exception:
                        logging.exception("Fallo insertando tag/relación")
                        cur.execute("ROLLBACK TO SAVEPOINT ner_tag")
                    cur.execute("RELEASE SAVEPOINT ner_tag")
            return len(rows), new_links
    finally:
        conn.close()


def main():
    """Run the NER worker loop forever.

    Loads the Spanish spaCy model once, then repeatedly fetches translations
    without tags, extracts entities from them and stores the resulting tags
    and tag/translation links. Any error in a round is logged and the loop
    retries after a short pause.

    NOTE(review): the spaCy model is fixed to es_core_news_md even though the
    translations are selected by NER_LANG; confirm this is intended.
    NOTE(review): a translation that yields no tags gets no link row, so the
    pending query selects it again on every round and, because of the
    ORDER BY/LIMIT, such rows can hide older ones. Fixing that needs a
    "processed" marker in the schema; here the loop only backs off.
    """
    nlp = spacy.load("es_core_news_md", disable=["parser", "lemmatizer", "textcat"])
    logging.info("spaCy cargado: es_core_news_md")

    while True:
        try:
            rows_fetched, new_links = _process_batch(nlp)
        except Exception as e:
            logging.exception("Error en NER loop: %s", e)
            time.sleep(_IDLE_SLEEP_SECONDS)
            continue

        if rows_fetched:
            logging.info("NER lote OK. Nuevos enlaces: %d.", new_links)
        if not new_links:
            # Either nothing is pending or the batch produced no tags; in both
            # cases the next query would return the same result, so wait
            # instead of spinning.
            time.sleep(_IDLE_SLEEP_SECONDS)


# Start the worker loop only when this file is executed as a script.
if __name__ == "__main__":
    main()