# rss/ner_worker.py
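"""NER worker.

Polls PostgreSQL for finished translations that have no tags yet, runs
spaCy NER and noun-chunk extraction over title + summary, and links the
resulting entity/topic tags back to each translation.
"""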
import os
import time
import logging
import re
import string
from typing import List, Tuple
from collections import Counter
import psycopg2
import psycopg2.extras
import spacy
from bs4 import BeautifulSoup
logging.basicConfig(level=logging.INFO, format='[%(asctime)s] %(levelname)s: %(message)s')
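
# Database connection settings, taken from the environment with
# development defaults.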
DB = dict(
    host=os.environ.get("DB_HOST", "localhost"),
    port=int(os.environ.get("DB_PORT", 5432)),
    dbname=os.environ.get("DB_NAME", "rss"),
    user=os.environ.get("DB_USER", "rss"),
    password=os.environ.get("DB_PASS", "x"),
)
NER_LANG = os.environ.get("NER_LANG", "es").strip().lower()
BATCH = int(os.environ.get("NER_BATCH", 64))
# spaCy label -> local tag type. Spanish models (es_core_news_*) emit
# PER/LOC/ORG/MISC, so "PER" is the label that actually fires here;
# PERSON/GPE are kept for compatibility with English (OntoNotes) models.
ENT_LABELS = {
    "PER": "persona",
    "PERSON": "persona",
    "ORG": "organizacion",
    "GPE": "lugar",
    "LOC": "lugar",
}
_ws_re = re.compile(r"\s+")
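
# Regexes for leftover markup that survives BeautifulSoup.get_text():
# unparsed tag fragments, HTML entities, stray width/height attributes.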
HTML_TRASH_PATTERNS = [
    r"<[^>]+>",
    r"&[a-z]+;",
    r"&#\d+;?",
    r'width="\d+"',
    r'height="\d+"',
]
GENERIC_BAD_TAGS = {
    "república",
    "estado",
    "centro",
    "gobierno",
    "el gobierno",
    "gobiernos",
    "report",
    "sp",
    "unión",
    "union",
    "dólares",
    "dolar",
    "dólar",
    "the post",
    "post",
    "artículo",
    "el artículo",
    "la ciudad",
    "mundo",
    "país",
    "pais",
    "países",
    "paises",
    "la noche",
    "la publicación",
    "este miércoles",
    "el miércoles",
    "hoy",
    "ayer",
    "mañana",
    "servicio",
    "servicios",
    "el presidente",
    "presidente",
    "el ministro",
    "ministro",
    "la guerra",
    "guerra",
    "seguridad",
    "wp-content",
    "internal_photos",
    "/internal_photos",
    "https",
    "http",
    "src",
}
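
# Populated in main() from the loaded spaCy model's stop words.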
STOPWORDS = set()
ARTICLES = {
    "el",
    "la",
    "los",
    "las",
    "un",
    "una",
    "uno",
    "al",
    "del",
}
TOPIC_MIN_CHARS = 4
TOPIC_MAX_WORDS = 6
TOPIC_MAX_PER_DOC = 15
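
# Heuristics for spotting scraped-HTML debris (attribute fragments,
# URLs, file paths) that spaCy sometimes picks up as "entities" when
# feeds embed raw markup in their summaries.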
def _looks_like_attr_or_path(text_lower: str) -> bool:
    if text_lower.startswith("/"):
        return True
    if "http://" in text_lower or "https://" in text_lower:
        return True
    if ".com" in text_lower or ".net" in text_lower or ".org" in text_lower:
        return True
    if any(ext in text_lower for ext in (".jpg", ".jpeg", ".png", ".gif", ".webp")):
        return True
    if re.search(r"\b(src|alt|style|class)\s*=", text_lower):
        return True
    if "data-" in text_lower:
        return True
    if re.search(r"&#\d+;?", text_lower):
        return True
    if "=" in text_lower and " " not in text_lower.strip():
        return True
    if re.fullmatch(r"[a-z0-9_]{15,}", text_lower.replace("-", "").replace(" ", "")):
        return True
    if "-" in text_lower and " " not in text_lower:
        return True
    return False
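
# Normalize an entity string: strip markup and leftover escapes, drop
# URL/attribute junk and overly generic words, and canonicalize a few
# common aliases (EE.UU. -> Estados Unidos, UE -> Unión Europea).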
def clean_tag_text(text: str) -> str | None:
    if not text:
        return None
    text = BeautifulSoup(text, "html.parser").get_text()
    for pat in HTML_TRASH_PATTERNS:
        text = re.sub(pat, "", text)
    text = _ws_re.sub(" ", text).strip()
    text = text.strip(string.punctuation + " ")
    if len(text) < 3:
        return None
    if re.search(r"[<>/\\]", text):
        return None
    lower = text.lower()
    if lower.startswith("href="):
        return None
    if _looks_like_attr_or_path(lower):
        return None
    if lower in GENERIC_BAD_TAGS:
        return None
    replacements = {
        "ee.uu.": "Estados Unidos",
        "los estados unidos": "Estados Unidos",
        "eeuu": "Estados Unidos",
        "eu": "Unión Europea",
        "ue": "Unión Europea",
        "kosova": "Kosovo",
    }
    if lower in replacements:
        text = replacements[lower]
    return text
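
# Normalize a candidate topic (noun chunk): same HTML cleanup as tags,
# plus leading-article removal and stopword/length/number filters.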
def clean_topic_text(text: str) -> str | None:
    if not text:
        return None
    text = BeautifulSoup(text, "html.parser").get_text()
    for pat in HTML_TRASH_PATTERNS:
        text = re.sub(pat, "", text)
    text = _ws_re.sub(" ", text).strip()
    text = text.strip(string.punctuation + " ")
    if len(text) < TOPIC_MIN_CHARS:
        return None
    lower = text.lower()
    if _looks_like_attr_or_path(lower):
        return None
    tokens = [
        t.strip(string.punctuation)
        for t in lower.split()
        if t.strip(string.punctuation)
    ]
    if not tokens:
        return None
    if tokens[0] in ARTICLES:
        tokens = tokens[1:]
    if not tokens:
        return None
    norm = " ".join(tokens).strip()
    if len(norm) < TOPIC_MIN_CHARS:
        return None
    if norm in GENERIC_BAD_TAGS:
        return None
    if len(tokens) > TOPIC_MAX_WORDS:
        return None
    if all(t in STOPWORDS for t in tokens):
        return None
    if re.fullmatch(r"[0-9\s\.,\-:/]+", norm):
        return None
    return norm
def get_conn():
    return psycopg2.connect(**DB)
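
# Run the pipeline once over a document and split the results into named
# entities (persona/organizacion/lugar) and frequent noun-chunk topics
# ("tema"), capped at TOPIC_MAX_PER_DOC per document.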
def extract_entities_and_topics(nlp, text: str) -> Tuple[List[Tuple[str, str]], List[Tuple[str, str]]]:
    ents: List[Tuple[str, str]] = []
    topics: List[Tuple[str, str]] = []
    if not text:
        return ents, topics
    doc = nlp(text)
    for ent in doc.ents:
        tipo = ENT_LABELS.get(ent.label_)
        if not tipo:
            continue
        val = clean_tag_text(ent.text)
        if not val:
            continue
        ents.append((val, tipo))
    topic_counter: Counter[str] = Counter()
    for chunk in doc.noun_chunks:
        val = clean_topic_text(chunk.text)
        if not val:
            continue
        topic_counter[val] += 1
    ent_values = {v for (v, _) in ents}
    for val, _count in topic_counter.most_common(TOPIC_MAX_PER_DOC):
        if val in ent_values:
            continue
        topics.append((val, "tema"))
    # Dedupe entities while keeping first-seen order; topics are already
    # unique (Counter keys) and ordered by frequency.
    ents = list(dict.fromkeys(ents))
    return ents, topics
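
# The queries in main() assume roughly the following schema (a sketch
# inferred from the statements below; the real DDL may differ):
#
#   CREATE TABLE tags (
#       id     serial PRIMARY KEY,
#       valor  text NOT NULL,
#       tipo   text NOT NULL,
#       UNIQUE (valor, tipo)            -- required by ON CONFLICT (valor, tipo)
#   );
#   CREATE TABLE tags_noticia (
#       traduccion_id  integer REFERENCES traducciones(id),
#       tag_id         integer REFERENCES tags(id),
#       UNIQUE (traduccion_id, tag_id)  -- lets ON CONFLICT DO NOTHING dedupe
#   );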
def main():
    global STOPWORDS
    # The parser stays enabled because doc.noun_chunks requires it.
    nlp = spacy.load("es_core_news_md", disable=["lemmatizer", "textcat"])
    STOPWORDS = set(nlp.Defaults.stop_words)
    logging.info("spaCy loaded: es_core_news_md (NER + parser)")
    while True:
        try:
            with get_conn() as conn, conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
                # Pending work: finished translations in the target
                # language that have no tag links yet.
                cur.execute(
                    """
                    WITH pend AS (
                        SELECT t.id, t.titulo_trad, t.resumen_trad
                        FROM traducciones t
                        LEFT JOIN tags_noticia tn ON tn.traduccion_id = t.id
                        WHERE t.status = 'done'
                          AND t.lang_to = %s
                        GROUP BY t.id, t.titulo_trad, t.resumen_trad
                        HAVING COUNT(tn.tag_id) = 0
                        ORDER BY t.id DESC
                        LIMIT %s
                    )
                    SELECT * FROM pend;
                    """,
                    (NER_LANG, BATCH),
                )
                rows = cur.fetchall()
                if not rows:
                    time.sleep(5)
                    continue
                logging.info(f"Processing {len(rows)} translations for NER/topics...")
                new_links = 0
                for r in rows:
                    text = f"{r['titulo_trad'] or ''}\n{r['resumen_trad'] or ''}".strip()
                    if not text:
                        continue
                    ents, topics = extract_entities_and_topics(nlp, text)
                    all_tags = ents + topics
                    if not all_tags:
                        continue
                    for valor, tipo in all_tags:
                        # Savepoint so one bad tag doesn't abort the whole
                        # batch: a failed statement would otherwise leave the
                        # transaction in an error state and poison every
                        # later execute until rollback.
                        cur.execute("SAVEPOINT tag_ins")
                        try:
                            # Upsert the tag and always get its id back.
                            cur.execute(
                                """
                                INSERT INTO tags (valor, tipo)
                                VALUES (%s, %s)
                                ON CONFLICT (valor, tipo)
                                DO UPDATE SET valor = EXCLUDED.valor
                                RETURNING id
                                """,
                                (valor, tipo),
                            )
                            tag_id = cur.fetchone()[0]
                            cur.execute(
                                """
                                INSERT INTO tags_noticia (traduccion_id, tag_id)
                                VALUES (%s, %s)
                                ON CONFLICT DO NOTHING
                                """,
                                (r["id"], tag_id),
                            )
                            if cur.rowcount > 0:
                                new_links += 1
                        except Exception:
                            logging.exception("Failed to insert tag/link")
                            cur.execute("ROLLBACK TO SAVEPOINT tag_ins")
                conn.commit()
                logging.info(f"NER/topics batch OK. New links: {new_links}.")
        except Exception:
            logging.exception("Error in NER loop")
            time.sleep(5)


if __name__ == "__main__":
    main()