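"""NER/topic tagging worker for translated RSS news items.

Polls PostgreSQL for translated rows (`traducciones`) that have no tags
yet, runs spaCy NER plus noun-chunk extraction over the translated title
and summary, and upserts the results into `tags` / `tags_noticia`.
"""
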
import os
import time
import logging
import re
import string
from typing import List, Tuple
from collections import Counter

import psycopg2
import psycopg2.extras
import spacy
from bs4 import BeautifulSoup

logging.basicConfig(level=logging.INFO, format='[%(asctime)s] %(levelname)s: %(message)s')

DB = dict(
    host=os.environ.get("DB_HOST", "localhost"),
    port=int(os.environ.get("DB_PORT", 5432)),
    dbname=os.environ.get("DB_NAME", "rss"),
    user=os.environ.get("DB_USER", "rss"),
    password=os.environ.get("DB_PASS", "x"),
)

NER_LANG = os.environ.get("NER_LANG", "es").strip().lower()
BATCH = int(os.environ.get("NER_BATCH", 64))

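# Hypothetical invocation showing the environment knobs above (the script
# name is illustrative; anything unset falls back to the defaults):
#   DB_HOST=db.internal DB_PASS=secret NER_LANG=es NER_BATCH=32 python ner_worker.py
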
# spaCy entity labels we keep, mapped to the tag type stored in `tags.tipo`.
# Note: the Spanish news models (es_core_news_*) emit PER/ORG/LOC/MISC, so
# PER is required for the model loaded in main(); PERSON and GPE only appear
# in English-style label sets and are kept for completeness.
ENT_LABELS = {
    "PER": "persona",
    "PERSON": "persona",
    "ORG": "organizacion",
    "GPE": "lugar",
    "LOC": "lugar",
}

_ws_re = re.compile(r"\s+")

HTML_TRASH_PATTERNS = [
    r"<[^>]+>",
    r"&[a-z]+;",
    r"&#\d+;?",  # numeric entities like …
    r'width="\d+"',
    r'height="\d+"',
]

# Words/phrases that are too generic, or clearly noise
GENERIC_BAD_TAGS = {
    "república",
    "estado",
    "centro",
    "gobierno",
    "el gobierno",
    "gobiernos",
    "report",
    "sp",
    "unión",
    "union",
    "dólares",
    "dolar",
    "dólar",
    "the post",
    "post",
    "artículo",
    "el artículo",
    "la ciudad",
    "mundo",
    "país",
    "pais",
    "países",
    "paises",
    "la noche",
    "la publicación",
    "este miércoles",
    "el miércoles",
    "hoy",
    "ayer",
    "mañana",
    "servicio",
    "servicios",
    "el presidente",
    "presidente",
    "el ministro",
    "ministro",
    "la guerra",
    "guerra",
    "seguridad",
    "wp-content",
    "internal_photos",
    "/internal_photos",
    "https",
    "http",
    "src",
}

# Populated in main() from the loaded spaCy model's stop-word list.
STOPWORDS = set()

ARTICLES = {
    "el",
    "la",
    "los",
    "las",
    "un",
    "una",
    "uno",
    "al",
    "del",
}

TOPIC_MIN_CHARS = 4
TOPIC_MAX_WORDS = 6
TOPIC_MAX_PER_DOC = 15


def _looks_like_attr_or_path(text_lower: str) -> bool:
    """Detect path-like strings, HTML attributes, odd ids, and similar noise."""
    if text_lower.startswith("/"):
        return True
    if "http://" in text_lower or "https://" in text_lower:
        return True
    if ".com" in text_lower or ".net" in text_lower or ".org" in text_lower:
        return True
    if any(ext in text_lower for ext in (".jpg", ".jpeg", ".png", ".gif", ".webp")):
        return True
    if re.search(r"\b(src|alt|style|class)\s*=", text_lower):
        return True
    if "data-" in text_lower:
        return True
    if re.search(r"&#\d+;?", text_lower):
        return True
    # attribute=value style strings
    if "=" in text_lower and " " not in text_lower.strip():
        return True
    # long strings with no spaces (ids, hashes, ...)
    if re.fullmatch(r"[a-z0-9_]{15,}", text_lower.replace("-", "").replace(" ", "")):
        return True
    # a single hyphenated word is usually a path/slug: wp-content, internal-photos, ...
    if "-" in text_lower and " " not in text_lower:
        return True
    return False

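# A few inputs checked by hand against the rules above:
#   _looks_like_attr_or_path("/internal_photos/a.jpg") -> True   (leading slash)
#   _looks_like_attr_or_path("wp-content")             -> True   (hyphenated slug)
#   _looks_like_attr_or_path("casa blanca")            -> False
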
def clean_tag_text(text: str) -> str | None:
    """Cleanup for entity mentions (PER/PERSON/ORG/GPE/LOC)."""
    if not text:
        return None
    text = BeautifulSoup(text, "html.parser").get_text()
    for pat in HTML_TRASH_PATTERNS:
        text = re.sub(pat, "", text)
    text = _ws_re.sub(" ", text).strip()
    text = text.strip(string.punctuation + " ")
    if len(text) < 3:
        return None
    if re.search(r"[<>/\\]", text):
        return None
    lower = text.lower()
    if lower.startswith("href="):
        return None
    if _looks_like_attr_or_path(lower):
        return None
    if lower in GENERIC_BAD_TAGS:
        return None

    replacements = {
        "ee.uu.": "Estados Unidos",
        "ee.uu": "Estados Unidos",  # punctuation stripping above removes the trailing dot
        "los estados unidos": "Estados Unidos",
        "eeuu": "Estados Unidos",
        "eu": "Unión Europea",
        "ue": "Unión Europea",
        "kosova": "Kosovo",
    }
    if lower in replacements:
        text = replacements[lower]
    return text

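# Spot-checks, assuming the stripped-form alias added above (the punctuation
# strip turns "EE.UU." into "EE.UU" before the lookup):
#   clean_tag_text("<b>EE.UU.</b>") -> "Estados Unidos"
#   clean_tag_text("el gobierno")   -> None   (generic)
#   clean_tag_text("wp-content")    -> None   (slug-like)
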
def clean_topic_text(text: str) -> str | None:
    """Cleanup for candidate 'topics' (noun_chunks)."""
    if not text:
        return None
    text = BeautifulSoup(text, "html.parser").get_text()
    for pat in HTML_TRASH_PATTERNS:
        text = re.sub(pat, "", text)
    text = _ws_re.sub(" ", text).strip()
    text = text.strip(string.punctuation + " ")
    if len(text) < TOPIC_MIN_CHARS:
        return None

    lower = text.lower()

    if _looks_like_attr_or_path(lower):
        return None

    # tokenize in lowercase and strip punctuation
    tokens = [
        t.strip(string.punctuation)
        for t in lower.split()
        if t.strip(string.punctuation)
    ]
    if not tokens:
        return None

    # drop a leading article if there is one
    if tokens and tokens[0] in ARTICLES:
        tokens = tokens[1:]
    if not tokens:
        return None

    # rebuild the normalized text without the article
    norm = " ".join(tokens).strip()
    if len(norm) < TOPIC_MIN_CHARS:
        return None

    if norm in GENERIC_BAD_TAGS:
        return None

    # word-count ceiling
    if len(tokens) > TOPIC_MAX_WORDS:
        return None

    # all stopwords => discard
    if all(t in STOPWORDS for t in tokens):
        return None

    # numbers/dates only
    if re.fullmatch(r"[0-9\s\.,\-:/]+", norm):
        return None

    return norm

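# Spot-checks against the rules above:
#   clean_topic_text("Las elecciones presidenciales") -> "elecciones presidenciales"
#   clean_topic_text("12.03.2024")                    -> None   (numbers/dates only)
#   clean_topic_text("el")                            -> None   (too short)
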
def get_conn():
    return psycopg2.connect(**DB)

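# Sketch of the tables this worker touches, inferred from the queries below
# (the real DDL lives elsewhere; the exact constraints are assumptions):
#   tags(id SERIAL PRIMARY KEY, valor TEXT, tipo TEXT, UNIQUE (valor, tipo))
#   tags_noticia(traduccion_id INT REFERENCES traducciones(id),
#                tag_id INT REFERENCES tags(id),
#                UNIQUE (traduccion_id, tag_id))
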
def extract_entities_and_topics(nlp, text: str) -> Tuple[List[Tuple[str, str]], List[Tuple[str, str]]]:
    ents: List[Tuple[str, str]] = []
    topics: List[Tuple[str, str]] = []

    if not text:
        return ents, topics

    doc = nlp(text)

    # "Classic" named entities
    for ent in doc.ents:
        tipo = ENT_LABELS.get(ent.label_)
        if not tipo:
            continue
        val = clean_tag_text(ent.text)
        if not val:
            continue
        ents.append((val, tipo))

    # Topic candidates from noun_chunks
    topic_counter: Counter[str] = Counter()

    for chunk in doc.noun_chunks:
        val = clean_topic_text(chunk.text)
        if not val:
            continue
        topic_counter[val] += 1

    ent_values = {v for (v, _) in ents}

    for val, _count in topic_counter.most_common(TOPIC_MAX_PER_DOC):
        if val in ent_values:
            continue
        topics.append((val, "tema"))

    # drop duplicates
    ents = list(set(ents))
    topics = list(set(topics))
    return ents, topics

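# Illustrative call (entity/topic output depends on the loaded model, so the
# exact values here are not guaranteed):
#   nlp = spacy.load("es_core_news_md")
#   ents, topics = extract_entities_and_topics(nlp, "Pedro Sánchez visitó Kosovo")
#   # ents   -> e.g. [("Pedro Sánchez", "persona"), ("Kosovo", "lugar")]
#   # topics -> (chunk, "tema") pairs, at most TOPIC_MAX_PER_DOC of them
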
def main():
    global STOPWORDS

    nlp = spacy.load("es_core_news_md", disable=["lemmatizer", "textcat"])
    STOPWORDS = set(nlp.Defaults.stop_words)
    logging.info("spaCy loaded: es_core_news_md (NER + parser)")

    while True:
        try:
            with get_conn() as conn, conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
                # Translations that are done but have no tags yet.
                cur.execute(
                    """
                    WITH pend AS (
                        SELECT t.id, t.titulo_trad, t.resumen_trad
                        FROM traducciones t
                        LEFT JOIN tags_noticia tn ON tn.traduccion_id = t.id
                        WHERE t.status = 'done'
                          AND t.lang_to = %s
                        GROUP BY t.id, t.titulo_trad, t.resumen_trad
                        HAVING COUNT(tn.tag_id) = 0
                        ORDER BY t.id DESC
                        LIMIT %s
                    )
                    SELECT * FROM pend;
                    """,
                    (NER_LANG, BATCH),
                )
                rows = cur.fetchall()

                if not rows:
                    time.sleep(5)
                    continue

                logging.info(f"Processing {len(rows)} translations for NER/topics...")

                new_links = 0

                for r in rows:
                    text = f"{r['titulo_trad'] or ''}\n{r['resumen_trad'] or ''}".strip()
                    if not text:
                        continue

                    ents, topics = extract_entities_and_topics(nlp, text)
                    all_tags = ents + topics
                    if not all_tags:
                        # NOTE: rows that yield no tags stay untagged and will
                        # be picked up again on the next pass.
                        continue

                    for valor, tipo in all_tags:
                        try:
                            # Savepoint: without it, one failed INSERT aborts the
                            # whole transaction and every later statement in this
                            # batch fails until a rollback.
                            cur.execute("SAVEPOINT tag_sp")
                            cur.execute(
                                """
                                INSERT INTO tags (valor, tipo)
                                VALUES (%s, %s)
                                ON CONFLICT (valor, tipo)
                                DO UPDATE SET valor = EXCLUDED.valor
                                RETURNING id
                                """,
                                (valor, tipo),
                            )
                            tag_id = cur.fetchone()[0]
                            cur.execute(
                                """
                                INSERT INTO tags_noticia (traduccion_id, tag_id)
                                VALUES (%s, %s)
                                ON CONFLICT DO NOTHING
                                """,
                                (r["id"], tag_id),
                            )
                            if cur.rowcount > 0:
                                new_links += 1
                            cur.execute("RELEASE SAVEPOINT tag_sp")
                        except Exception:
                            cur.execute("ROLLBACK TO SAVEPOINT tag_sp")
                            logging.exception("Failed to insert tag/link")

                conn.commit()
                logging.info(f"NER/topics batch OK. New links: {new_links}.")
        except Exception:
            logging.exception("Error in NER loop")
            time.sleep(5)

if __name__ == "__main__":
    main()