rss/ner_worker.py

171 lines
5 KiB
Python

import os
import time
import logging
import re
import psycopg2
import psycopg2.extras
import spacy
from bs4 import BeautifulSoup
# Timestamped, level-tagged log lines for the whole worker process.
logging.basicConfig(level=logging.INFO, format='[%(asctime)s] %(levelname)s: %(message)s')

# PostgreSQL connection parameters; each one can be overridden through the
# environment. The dict is splatted into ``psycopg2.connect``.
DB = dict(
    host=os.environ.get("DB_HOST", "localhost"),
    port=int(os.environ.get("DB_PORT", 5432)),
    dbname=os.environ.get("DB_NAME", "rss"),
    user=os.environ.get("DB_USER", "rss"),
    password=os.environ.get("DB_PASS", "x"),
)

# Language code used to select which translations get tagged (normalized to
# stripped lowercase).
NER_LANG = os.environ.get("NER_LANG", "es").strip().lower()

# Maximum number of translations handled per batch.
BATCH = int(os.environ.get("NER_BATCH", 64))

# spaCy entity label -> tag type stored in the database. Entities whose label
# is not listed here are ignored.
# "PER" is required: the Spanish ``es_core_news_*`` pipelines use the
# LOC/MISC/ORG/PER label scheme, so people are never reported as "PERSON".
# "PERSON" and "GPE" are kept for pipelines that use the OntoNotes scheme.
ENT_LABELS = {
    "PERSON": "persona",
    "PER": "persona",
    "ORG": "organizacion",
    "GPE": "lugar",
    "LOC": "lugar",
}

# Matches any run of whitespace; used to collapse it into a single space.
_ws_re = re.compile(r"\s+")

# Regex fragments stripped from entity text: leftover markup, named HTML
# entities and width/height attribute remnants.
HTML_TRASH_PATTERNS = [
    r"<[^>]+>",
    r"&[a-z]+;",
    r'width="\d+"',
    r'height="\d+"',
]

# Lowercase values that are too generic to be useful as tags.
GENERIC_BAD_TAGS = {
    "república",
    "estado",
    "centro",
    "gobierno",
    "report",
    "sp",
    "unión",
}
# Lowercased entity text -> canonical tag value.
_TAG_ALIASES = {
    "ee.uu.": "Estados Unidos",
    "los estados unidos": "Estados Unidos",
    "eu": "Unión Europea",
    "ue": "Unión Europea",
    "kosova": "Kosovo",
}


def clean_tag_text(text):
    """Normalize raw entity text into a tag value, or return None to reject it.

    The text is stripped of HTML (BeautifulSoup plus ``HTML_TRASH_PATTERNS``)
    and its whitespace is collapsed. Known aliases are then mapped to their
    canonical form. Anything else is rejected when it is shorter than three
    characters, still contains ``<``, ``>``, ``/`` or a backslash, starts with
    ``href=`` or ``http``, or is listed in ``GENERIC_BAD_TAGS``.

    Args:
        text: Entity text as produced by the NER model; may be empty or None.

    Returns:
        The cleaned (possibly canonicalized) tag string, or None if the text
        should not become a tag.
    """
    if not text:
        return None

    text = BeautifulSoup(text, "html.parser").get_text()
    for pat in HTML_TRASH_PATTERNS:
        text = re.sub(pat, "", text)
    text = _ws_re.sub(" ", text).strip()
    lower = text.lower()

    # Resolve aliases before the length filter: "eu" and "ue" are only two
    # characters long, so looking them up after the filter could never match.
    alias = _TAG_ALIASES.get(lower)
    if alias is not None:
        return alias

    if len(text) < 3:
        return None
    # Leftover markup or path-like fragments are not real entity names.
    if re.search(r"[<>/\\]", text):
        return None
    if lower.startswith(("href=", "http")):
        return None
    if lower in GENERIC_BAD_TAGS:
        return None
    return text
def get_conn():
    """Open a new psycopg2 connection using the module-level ``DB`` settings."""
    connection = psycopg2.connect(**DB)
    return connection
def _fetch_pending(cur, skip_ids):
    """Return up to ``BATCH`` pending translations, newest id first.

    A translation is pending when its status is 'done', its ``lang_to`` equals
    ``NER_LANG`` and it has no row in ``tags_noticia``. Ids in ``skip_ids``
    are excluded.
    """
    cur.execute(
        """
        WITH pend AS (
            SELECT t.id, t.titulo_trad, t.resumen_trad
            FROM traducciones t
            LEFT JOIN tags_noticia tn ON tn.traduccion_id = t.id
            WHERE t.status = 'done'
              AND t.lang_to = %s
              AND t.id <> ALL(%s)
            GROUP BY t.id, t.titulo_trad, t.resumen_trad
            HAVING COUNT(tn.tag_id) = 0
            ORDER BY t.id DESC
            LIMIT %s
        )
        SELECT * FROM pend;
        """,
        # psycopg2 adapts a Python list to a PostgreSQL array.
        (NER_LANG, list(skip_ids), BATCH),
    )
    return cur.fetchall()


def _extract_entities(nlp, row):
    """Return the distinct ``(valor, tipo)`` pairs found in one translation row."""
    text = f"{row['titulo_trad'] or ''}\n{row['resumen_trad'] or ''}".strip()
    if not text:
        return set()
    found = set()
    for ent in nlp(text).ents:
        tipo = ENT_LABELS.get(ent.label_)
        if not tipo:
            continue
        valor = clean_tag_text(ent.text)
        if valor:
            found.add((valor, tipo))
    return found


def _link_tag(cur, traduccion_id, valor, tipo):
    """Upsert one tag and link it to a translation; return True if a link was created.

    The two statements run inside a savepoint. Without it, a failing statement
    would leave the PostgreSQL transaction aborted and every later statement
    in the batch would fail too. A database error is logged and only this
    tag's work is rolled back.
    """
    cur.execute("SAVEPOINT ner_tag")
    try:
        # The no-op DO UPDATE makes RETURNING yield the id for an already
        # existing tag as well as for a newly inserted one.
        cur.execute(
            """
            INSERT INTO tags (valor, tipo)
            VALUES (%s, %s)
            ON CONFLICT (valor, tipo)
            DO UPDATE SET valor = EXCLUDED.valor
            RETURNING id
            """,
            (valor, tipo),
        )
        tag_id = cur.fetchone()[0]
        cur.execute(
            """
            INSERT INTO tags_noticia (traduccion_id, tag_id)
            VALUES (%s, %s)
            ON CONFLICT DO NOTHING
            """,
            (traduccion_id, tag_id),
        )
        created = cur.rowcount > 0
    except psycopg2.Error:
        logging.exception("Fallo insertando tag/relación")
        cur.execute("ROLLBACK TO SAVEPOINT ner_tag")
        return False
    cur.execute("RELEASE SAVEPOINT ner_tag")
    return created


def _process_batch(nlp, skip_ids):
    """Tag one batch of pending translations.

    Ids of translations that yield no usable entity are added to ``skip_ids``.

    Returns:
        The number of new translation/tag links, or None when nothing was
        pending.
    """
    conn = get_conn()
    try:
        # ``with conn`` commits on success and rolls back on error.
        with conn, conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
            rows = _fetch_pending(cur, skip_ids)
            if not rows:
                return None
            logging.info(f"Procesando {len(rows)} traducciones para NER...")
            new_links = 0
            for r in rows:
                ents = _extract_entities(nlp, r)
                if not ents:
                    skip_ids.add(r["id"])
                    continue
                for valor, tipo in ents:
                    if _link_tag(cur, r["id"], valor, tipo):
                        new_links += 1
        logging.info(f"NER lote OK. Nuevos enlaces: {new_links}.")
        return new_links
    finally:
        # psycopg2's connection context manager only ends the transaction;
        # the connection must be closed explicitly or one leaks per iteration.
        conn.close()


def main():
    """Run the NER worker loop forever.

    Loads the spaCy pipeline once, then repeatedly tags batches of pending
    translations. Any error in a batch is logged and the loop resumes after a
    5 second pause.
    """
    nlp = spacy.load("es_core_news_md", disable=["parser", "lemmatizer", "textcat"])
    logging.info("spaCy cargado: es_core_news_md")

    # Translations that produced no usable entity never receive a tag link, so
    # the "no links yet" query would return them on every pass. Remembering
    # them for the lifetime of the process keeps them from filling the batch
    # window and from turning the loop into a busy spin.
    skip_ids = set()

    while True:
        try:
            new_links = _process_batch(nlp, skip_ids)
        except Exception as e:
            logging.exception(f"Error en NER loop: {e}")
            time.sleep(5)
            continue
        # Nothing pending, or a batch that produced no link at all: back off
        # instead of hammering the database.
        if not new_links:
            time.sleep(5)
if __name__ == "__main__":
    # Start the worker loop only when executed as a script, not on import.
    main()