optimizations

jlimolina 2025-11-24 02:37:05 +01:00
parent 937da3f90b
commit 86ee083b90
5 changed files with 26 additions and 100 deletions

@@ -36,12 +36,11 @@ _ws_re = re.compile(r"\s+")
HTML_TRASH_PATTERNS = [
r"<[^>]+>",
r"&[a-z]+;",
r"&#\d+;?", # entidades numéricas tipo &#8230;
r"&#\d+;?",
r'width="\d+"',
r'height="\d+"',
]
# Words/phrases that are too generic or clearly noise
GENERIC_BAD_TAGS = {
"república",
"estado",
@@ -110,7 +109,6 @@ TOPIC_MAX_PER_DOC = 15
def _looks_like_attr_or_path(text_lower: str) -> bool:
"""Detecta cosas tipo rutas, atributos HTML, ids raros, etc."""
if text_lower.startswith("/"):
return True
if "http://" in text_lower or "https://" in text_lower:
@@ -125,20 +123,16 @@ def _looks_like_attr_or_path(text_lower: str) -> bool:
return True
if re.search(r"&#\d+;?", text_lower):
return True
# things like attribute=value
if "=" in text_lower and " " not in text_lower.strip():
return True
# long strings with no spaces (ids, hashes…)
if re.fullmatch(r"[a-z0-9_]{15,}", text_lower.replace("-", "").replace(" ", "")):
return True
# a single hyphenated word is usually a path/slug: wp-content, internal-photos…
if "-" in text_lower and " " not in text_lower:
return True
return False
def clean_tag_text(text: str) -> str | None:
"""Limpieza para entidades (PERSON/ORG/GPE/LOC)."""
if not text:
return None
text = BeautifulSoup(text, "html.parser").get_text()
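For reference only (not part of this commit): the heuristics in _looks_like_attr_or_path above imply behaviour along these lines. The snippet assumes it runs inside the same module, and the expectations only cover the checks visible in this hunk:

samples = [
    "/wp-content/uploads",    # leading slash -> path
    "https://example.org/x",  # URL
    "class=header",           # attribute=value with no spaces
    "internal-photos",        # single hyphenated word -> slug
    "cambio climático",       # ordinary phrase, should not be flagged
]
for s in samples:
    print(s, "->", _looks_like_attr_or_path(s))
# Expected: True for everything except "cambio climático".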
@@ -172,7 +166,6 @@ def clean_tag_text(text: str) -> str | None:
def clean_topic_text(text: str) -> str | None:
"""Limpieza para posibles 'temas' (noun_chunks)."""
if not text:
return None
text = BeautifulSoup(text, "html.parser").get_text()
@@ -184,11 +177,9 @@ def clean_topic_text(text: str) -> str | None:
return None
lower = text.lower()
if _looks_like_attr_or_path(lower):
return None
# tokenize in lowercase and strip punctuation
tokens = [
t.strip(string.punctuation)
for t in lower.split()
@@ -197,13 +188,11 @@ def clean_topic_text(text: str) -> str | None:
if not tokens:
return None
# drop the leading article if there is one
if tokens and tokens[0] in ARTICLES:
tokens = tokens[1:]
if not tokens:
return None
# rebuild the normalized text without the article
norm = " ".join(tokens).strip()
if len(norm) < TOPIC_MIN_CHARS:
return None
@@ -211,15 +200,12 @@ def clean_topic_text(text: str) -> str | None:
if norm in GENERIC_BAD_TAGS:
return None
# maximum word count
if len(tokens) > TOPIC_MAX_WORDS:
return None
# all stopwords => discard
if all(t in STOPWORDS for t in tokens):
return None
# only numbers/dates
if re.fullmatch(r"[0-9\s\.,\-:/]+", norm):
return None
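Again outside the diff: a quick way to exercise clean_topic_text against the rules above. The import path is a placeholder, and the expected outcomes assume ARTICLES/STOPWORDS hold the usual Spanish entries and GENERIC_BAD_TAGS contains "estado" as shown in the first hunk:

from tagging import clean_topic_text  # hypothetical module name, adjust to the real one

for raw in ["la crisis energética", "de la o un", "12/03/2024", "wp-content/uploads", "estado"]:
    print(repr(raw), "->", clean_topic_text(raw))
# Expected: only the first sample survives, returned as "crisis energética" with the
# leading article stripped; the rest are rejected as stopwords-only, numeric,
# path/slug-like or too generic.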
@@ -239,7 +225,6 @@ def extract_entities_and_topics(nlp, text: str) -> Tuple[List[Tuple[str, str]],
doc = nlp(text)
# "Classic" entities
for ent in doc.ents:
tipo = ENT_LABELS.get(ent.label_)
if not tipo:
@@ -249,7 +234,6 @@ def extract_entities_and_topics(nlp, text: str) -> Tuple[List[Tuple[str, str]],
continue
ents.append((val, tipo))
# "Topic" candidates from noun_chunks
topic_counter: Counter[str] = Counter()
for chunk in doc.noun_chunks:
@@ -265,7 +249,6 @@ def extract_entities_and_topics(nlp, text: str) -> Tuple[List[Tuple[str, str]],
continue
topics.append((val, "tema"))
# drop duplicates
ents = list(set(ents))
topics = list(set(topics))
return ents, topics
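A minimal way to drive extract_entities_and_topics end to end, assuming it is imported from (or run inside) the module this diff touches. The spaCy model name is an assumption (any Spanish pipeline with NER and a parser for noun_chunks should work), and the concrete type strings depend on ENT_LABELS:

import spacy

nlp = spacy.load("es_core_news_md")  # assumed model; noun_chunks requires a parser
text = "El presidente visitó Sevilla y se reunió con la Comisión Europea en 2024."
ents, topics = extract_entities_and_topics(nlp, text)
print(ents)    # deduplicated (value, type) pairs, types mapped through ENT_LABELS
print(topics)  # noun-chunk candidates tagged "tema", already filtered by clean_topic_text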