Mejoras: NER, embeddings, dashboard, docker-compose y limpieza

2025-11-17 19:37:05 +01:00 · 2025-11-17 19:37:05 +01:00 · d508dc2058
commit d508dc2058
parent 6c5aff9936
19 changed files with 2218 additions and 1185 deletions
--- a/ner_worker.py
+++ b/ner_worker.py
@ -5,6 +5,7 @@ import re
 import psycopg2
 import psycopg2.extras
 import spacy
+from bs4 import BeautifulSoup

 logging.basicConfig(level=logging.INFO, format='[%(asctime)s] %(levelname)s: %(message)s')

@ -16,13 +17,9 @@ DB = dict(
    password=os.environ.get("DB_PASS", "x"),
 )

-# Idioma de las traducciones que vamos a etiquetar
 NER_LANG = os.environ.get("NER_LANG", "es").strip().lower()
-
-# Tamaño de lote de traducciones a procesar por iteración
 BATCH = int(os.environ.get("NER_BATCH", 64))

-# Mapeo de etiquetas de spaCy -> tipos de nuestro esquema
 ENT_LABELS = {
    "PERSON": "persona",
    "ORG": "organizacion",
@ -30,28 +27,65 @@ ENT_LABELS = {
    "LOC": "lugar",
 }

-# Normaliza el valor del tag (quita espacios extra, colapsa espacios internos)
 _ws_re = re.compile(r"\s+")
-def _clean_value(s: str) -> str:
-    if not s:
-        return ""
-    s = s.strip()
-    s = _ws_re.sub(" ", s)
-    return s
+HTML_TRASH_PATTERNS = [
+    r"<[^>]+>",
+    r"&[a-z]+;",
+    r'width="\d+"',
+    r'height="\d+"',
+]
+GENERIC_BAD_TAGS = {
+    "república",
+    "estado",
+    "centro",
+    "gobierno",
+    "report",
+    "sp",
+    "unión",
+}
+
+
+def clean_tag_text(text):
+    """Normalize a raw entity string into a tag value, or reject it.
+
+    HTML markup and leftover entity/attribute fragments are stripped and
+    runs of whitespace are collapsed. Known aliases (e.g. "EE.UU.", "UE")
+    are mapped to a canonical name. Returns None when the input is empty,
+    the cleaned text is shorter than 3 characters, still contains markup or
+    path characters, looks like a link, or is listed in GENERIC_BAD_TAGS.
+    """
+    if not text:
+        return None
+    text = BeautifulSoup(text, "html.parser").get_text()
+    for pat in HTML_TRASH_PATTERNS:
+        text = re.sub(pat, "", text)
+    text = _ws_re.sub(" ", text).strip()
+    lower = text.lower()
+    replacements = {
+        "ee.uu.": "Estados Unidos",
+        "los estados unidos": "Estados Unidos",
+        "eu": "Unión Europea",
+        "ue": "Unión Europea",
+        "kosova": "Kosovo",
+    }
+    # Resolve aliases before the length filter: "eu" and "ue" are only two
+    # characters long and would otherwise be discarded before being mapped.
+    if lower in replacements:
+        return replacements[lower]
+    if len(text) < 3:
+        return None
+    # Any remaining angle bracket or slash means markup/URL residue survived.
+    if re.search(r"[<>/\\]", text):
+        return None
+    if lower.startswith(("href=", "http")):
+        return None
+    if lower in GENERIC_BAD_TAGS:
+        return None
+    return text
+

 def get_conn():
    return psycopg2.connect(**DB)

+
 def main():
-    # Nota: asumimos español porque el contenedor instala es_core_news_md en el Dockerfile.
-    # Si quisieras soportar más idiomas, instala el modelo correspondiente y haz un mapping.
    nlp = spacy.load("es_core_news_md", disable=["parser", "lemmatizer", "textcat"])
    logging.info("spaCy cargado: es_core_news_md")

    while True:
        try:
            with get_conn() as conn, conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
-                # Tomamos traducciones 'done' hacia NER_LANG que aún no tengan ninguna relación en tags_noticia
                cur.execute(
                    """
                    WITH pend AS (
@ -78,7 +112,7 @@ def main():
                logging.info(f"Procesando {len(rows)} traducciones para NER...")

                new_links = 0
-                new_tags = 0
+
                for r in rows:
                    text = f"{r['titulo_trad'] or ''}\n{r['resumen_trad'] or ''}".strip()
                    if not text:
@ -91,17 +125,14 @@ def main():
                        tipo = ENT_LABELS.get(ent.label_)
                        if not tipo:
                            continue
-                        val = _clean_value(ent.text)
-                        # filtros simples
-                        if len(val) < 2:
+                        val = clean_tag_text(ent.text)
+                        if not val:
                            continue
                        ents.append((val, tipo))

                    if not ents:
                        continue

-                    # Insertamos (o actualizamos si ya existe) el tag y luego la relación
-                    # IMPORTANTE: requiere UNIQUE(valor, tipo) en 'tags' y UNIQUE(traduccion_id, tag_id) en 'tags_noticia'
                    for valor, tipo in set(ents):
                        try:
                            cur.execute(
@ -115,7 +146,6 @@ def main():
                                (valor, tipo),
                            )
                            tag_id = cur.fetchone()[0]
-                            # Intenta crear la relación; si existe (por UNIQUE), se ignora
                            cur.execute(
                                """
                                INSERT INTO tags_noticia (traduccion_id, tag_id)
@ -126,11 +156,7 @@ def main():
                            )
                            if cur.rowcount > 0:
                                new_links += 1
-                            # Heurística: si el tag se ha creado (no hay forma directa aquí),
-                            # lo aproximamos contando que el RETURNING vino de un insert o un update.
-                            # Para no complicar: cuenta enlaces nuevos, y deja 'new_tags' como métrica opcional.
                        except Exception:
-                            # No abortar el lote por un único fallo en un valor raro.
                            logging.exception("Fallo insertando tag/relación")

                conn.commit()
@ -139,6 +165,7 @@ def main():
            logging.exception(f"Error en NER loop: {e}")
            time.sleep(5)

+
 if __name__ == "__main__":
    main()