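"""NER/topic tagging worker for translated RSS news items.

Polls PostgreSQL for translated rows (`traducciones`) that have no tags
yet, runs spaCy NER plus noun-chunk extraction over the translated title
and summary, and upserts the results into `tags` / `tags_noticia`.
"""
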
import os
import time
import logging
import re
import string
from typing import List, Tuple
from collections import Counter

import psycopg2
import psycopg2.extras
import spacy
from bs4 import BeautifulSoup

logging.basicConfig(level=logging.INFO, format='[%(asctime)s] %(levelname)s: %(message)s')

DB = dict(
    host=os.environ.get("DB_HOST", "localhost"),
    port=int(os.environ.get("DB_PORT", 5432)),
    dbname=os.environ.get("DB_NAME", "rss"),
    user=os.environ.get("DB_USER", "rss"),
    password=os.environ.get("DB_PASS", "x"),
)

NER_LANG = os.environ.get("NER_LANG", "es").strip().lower()
BATCH = int(os.environ.get("NER_BATCH", 64))

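# Hypothetical invocation showing the environment knobs above (the script
# name is illustrative; anything unset falls back to the defaults):
#   DB_HOST=db.internal DB_PASS=secret NER_LANG=es NER_BATCH=32 python ner_worker.py
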
# spaCy entity labels we keep, mapped to the tag type stored in `tags.tipo`.
# Note: the Spanish news models (es_core_news_*) emit PER/ORG/LOC/MISC, so
# PER is required for the model loaded in main(); PERSON and GPE only appear
# in English-style label sets and are kept for completeness.
ENT_LABELS = {
    "PER": "persona",
    "PERSON": "persona",
    "ORG": "organizacion",
    "GPE": "lugar",
    "LOC": "lugar",
}

_ws_re = re.compile(r"\s+")

HTML_TRASH_PATTERNS = [
    r"<[^>]+>",
    r"&[a-z]+;",
    r"&#\d+;?",  # numeric entities like …
    r'width="\d+"',
    r'height="\d+"',
]

# Words/phrases that are too generic, or clearly noise
GENERIC_BAD_TAGS = {
    "república",
    "estado",
    "centro",
    "gobierno",
    "el gobierno",
    "gobiernos",
    "report",
    "sp",
    "unión",
    "union",
    "dólares",
    "dolar",
    "dólar",
    "the post",
    "post",
    "artículo",
    "el artículo",
    "la ciudad",
    "mundo",
    "país",
    "pais",
    "países",
    "paises",
    "la noche",
    "la publicación",
    "este miércoles",
    "el miércoles",
    "hoy",
    "ayer",
    "mañana",
    "servicio",
    "servicios",
    "el presidente",
    "presidente",
    "el ministro",
    "ministro",
    "la guerra",
    "guerra",
    "seguridad",
    "wp-content",
    "internal_photos",
    "/internal_photos",
    "https",
    "http",
    "src",
}

# Populated in main() from the loaded spaCy model's stop-word list.
STOPWORDS = set()

ARTICLES = {
    "el",
    "la",
    "los",
    "las",
    "un",
    "una",
    "uno",
    "al",
    "del",
}

TOPIC_MIN_CHARS = 4
TOPIC_MAX_WORDS = 6
TOPIC_MAX_PER_DOC = 15


def _looks_like_attr_or_path(text_lower: str) -> bool:
    """Detect path-like strings, HTML attributes, odd ids, and similar noise."""
    if text_lower.startswith("/"):
        return True
    if "http://" in text_lower or "https://" in text_lower:
        return True
    if ".com" in text_lower or ".net" in text_lower or ".org" in text_lower:
        return True
    if any(ext in text_lower for ext in (".jpg", ".jpeg", ".png", ".gif", ".webp")):
        return True
    if re.search(r"\b(src|alt|style|class)\s*=", text_lower):
        return True
    if "data-" in text_lower:
        return True
    if re.search(r"&#\d+;?", text_lower):
        return True
    # attribute=value style strings
    if "=" in text_lower and " " not in text_lower.strip():
        return True
    # long strings with no spaces (ids, hashes, ...)
    if re.fullmatch(r"[a-z0-9_]{15,}", text_lower.replace("-", "").replace(" ", "")):
        return True
    # a single hyphenated word is usually a path/slug: wp-content, internal-photos, ...
    if "-" in text_lower and " " not in text_lower:
        return True
    return False

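# A few inputs checked by hand against the rules above:
#   _looks_like_attr_or_path("/internal_photos/a.jpg") -> True   (leading slash)
#   _looks_like_attr_or_path("wp-content")             -> True   (hyphenated slug)
#   _looks_like_attr_or_path("casa blanca")            -> False
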
def clean_tag_text(text: str) -> str | None:
    """Cleanup for entity mentions (PER/PERSON/ORG/GPE/LOC)."""
    if not text:
        return None
    text = BeautifulSoup(text, "html.parser").get_text()
    for pat in HTML_TRASH_PATTERNS:
        text = re.sub(pat, "", text)
    text = _ws_re.sub(" ", text).strip()
    text = text.strip(string.punctuation + " ")
    if len(text) < 3:
        return None
    if re.search(r"[<>/\\]", text):
        return None
    lower = text.lower()
    if lower.startswith("href="):
        return None
    if _looks_like_attr_or_path(lower):
        return None
    if lower in GENERIC_BAD_TAGS:
        return None

    replacements = {
        "ee.uu.": "Estados Unidos",
        "ee.uu": "Estados Unidos",  # punctuation stripping above removes the trailing dot
        "los estados unidos": "Estados Unidos",
        "eeuu": "Estados Unidos",
        "eu": "Unión Europea",
        "ue": "Unión Europea",
        "kosova": "Kosovo",
    }
    if lower in replacements:
        text = replacements[lower]
    return text

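# Spot-checks, assuming the stripped-form alias added above (the punctuation
# strip turns "EE.UU." into "EE.UU" before the lookup):
#   clean_tag_text("<b>EE.UU.</b>") -> "Estados Unidos"
#   clean_tag_text("el gobierno")   -> None   (generic)
#   clean_tag_text("wp-content")    -> None   (slug-like)
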
def clean_topic_text(text: str) -> str | None:
    """Cleanup for candidate 'topics' (noun_chunks)."""
    if not text:
        return None
    text = BeautifulSoup(text, "html.parser").get_text()
    for pat in HTML_TRASH_PATTERNS:
        text = re.sub(pat, "", text)
    text = _ws_re.sub(" ", text).strip()
    text = text.strip(string.punctuation + " ")
    if len(text) < TOPIC_MIN_CHARS:
        return None

    lower = text.lower()

    if _looks_like_attr_or_path(lower):
        return None

    # tokenize in lowercase and strip punctuation
    tokens = [
        t.strip(string.punctuation)
        for t in lower.split()
        if t.strip(string.punctuation)
    ]
    if not tokens:
        return None

    # drop a leading article if there is one
    if tokens and tokens[0] in ARTICLES:
        tokens = tokens[1:]
    if not tokens:
        return None

    # rebuild the normalized text without the article
    norm = " ".join(tokens).strip()
    if len(norm) < TOPIC_MIN_CHARS:
        return None

    if norm in GENERIC_BAD_TAGS:
        return None

    # word-count ceiling
    if len(tokens) > TOPIC_MAX_WORDS:
        return None

    # all stopwords => discard
    if all(t in STOPWORDS for t in tokens):
        return None

    # numbers/dates only
    if re.fullmatch(r"[0-9\s\.,\-:/]+", norm):
        return None

    return norm

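# Spot-checks against the rules above:
#   clean_topic_text("Las elecciones presidenciales") -> "elecciones presidenciales"
#   clean_topic_text("12.03.2024")                    -> None   (numbers/dates only)
#   clean_topic_text("el")                            -> None   (too short)
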
def get_conn():
    return psycopg2.connect(**DB)

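# Sketch of the tables this worker touches, inferred from the queries below
# (the real DDL lives elsewhere; the exact constraints are assumptions):
#   tags(id SERIAL PRIMARY KEY, valor TEXT, tipo TEXT, UNIQUE (valor, tipo))
#   tags_noticia(traduccion_id INT REFERENCES traducciones(id),
#                tag_id INT REFERENCES tags(id),
#                UNIQUE (traduccion_id, tag_id))
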
def extract_entities_and_topics(nlp, text: str) -> Tuple[List[Tuple[str, str]], List[Tuple[str, str]]]:
    ents: List[Tuple[str, str]] = []
    topics: List[Tuple[str, str]] = []

    if not text:
        return ents, topics

    doc = nlp(text)

    # "Classic" named entities
    for ent in doc.ents:
        tipo = ENT_LABELS.get(ent.label_)
        if not tipo:
            continue
        val = clean_tag_text(ent.text)
        if not val:
            continue
        ents.append((val, tipo))

    # Topic candidates from noun_chunks
    topic_counter: Counter[str] = Counter()

    for chunk in doc.noun_chunks:
        val = clean_topic_text(chunk.text)
        if not val:
            continue
        topic_counter[val] += 1

    ent_values = {v for (v, _) in ents}

    for val, _count in topic_counter.most_common(TOPIC_MAX_PER_DOC):
        if val in ent_values:
            continue
        topics.append((val, "tema"))

    # drop duplicates
    ents = list(set(ents))
    topics = list(set(topics))
    return ents, topics

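# Illustrative call (entity/topic output depends on the loaded model, so the
# exact values here are not guaranteed):
#   nlp = spacy.load("es_core_news_md")
#   ents, topics = extract_entities_and_topics(nlp, "Pedro Sánchez visitó Kosovo")
#   # ents   -> e.g. [("Pedro Sánchez", "persona"), ("Kosovo", "lugar")]
#   # topics -> (chunk, "tema") pairs, at most TOPIC_MAX_PER_DOC of them
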
def main():
    global STOPWORDS

    nlp = spacy.load("es_core_news_md", disable=["lemmatizer", "textcat"])
    STOPWORDS = set(nlp.Defaults.stop_words)
    logging.info("spaCy loaded: es_core_news_md (NER + parser)")

    while True:
        try:
            with get_conn() as conn, conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
                # Translations that are done but have no tags yet.
                cur.execute(
                    """
                    WITH pend AS (
                        SELECT t.id, t.titulo_trad, t.resumen_trad
                        FROM traducciones t
                        LEFT JOIN tags_noticia tn ON tn.traduccion_id = t.id
                        WHERE t.status = 'done'
                          AND t.lang_to = %s
                        GROUP BY t.id, t.titulo_trad, t.resumen_trad
                        HAVING COUNT(tn.tag_id) = 0
                        ORDER BY t.id DESC
                        LIMIT %s
                    )
                    SELECT * FROM pend;
                    """,
                    (NER_LANG, BATCH),
                )
                rows = cur.fetchall()

                if not rows:
                    time.sleep(5)
                    continue

                logging.info(f"Processing {len(rows)} translations for NER/topics...")

                new_links = 0

                for r in rows:
                    text = f"{r['titulo_trad'] or ''}\n{r['resumen_trad'] or ''}".strip()
                    if not text:
                        continue

                    ents, topics = extract_entities_and_topics(nlp, text)
                    all_tags = ents + topics
                    if not all_tags:
                        # NOTE: rows that yield no tags stay untagged and will
                        # be picked up again on the next pass.
                        continue

                    for valor, tipo in all_tags:
                        try:
                            # Savepoint: without it, one failed INSERT aborts the
                            # whole transaction and every later statement in this
                            # batch fails until a rollback.
                            cur.execute("SAVEPOINT tag_sp")
                            cur.execute(
                                """
                                INSERT INTO tags (valor, tipo)
                                VALUES (%s, %s)
                                ON CONFLICT (valor, tipo)
                                DO UPDATE SET valor = EXCLUDED.valor
                                RETURNING id
                                """,
                                (valor, tipo),
                            )
                            tag_id = cur.fetchone()[0]
                            cur.execute(
                                """
                                INSERT INTO tags_noticia (traduccion_id, tag_id)
                                VALUES (%s, %s)
                                ON CONFLICT DO NOTHING
                                """,
                                (r["id"], tag_id),
                            )
                            if cur.rowcount > 0:
                                new_links += 1
                            cur.execute("RELEASE SAVEPOINT tag_sp")
                        except Exception:
                            cur.execute("ROLLBACK TO SAVEPOINT tag_sp")
                            logging.exception("Failed to insert tag/link")

                conn.commit()
                logging.info(f"NER/topics batch OK. New links: {new_links}.")
        except Exception:
            logging.exception("Error in NER loop")
            time.sleep(5)

if __name__ == "__main__":
    main()