optimizations

jlimolina 2025-11-24 02:37:05 +01:00
parent 937da3f90b
commit 86ee083b90
5 changed files with 26 additions and 100 deletions

@@ -36,12 +36,11 @@ _ws_re = re.compile(r"\s+")
HTML_TRASH_PATTERNS = [
r"<[^>]+>",
r"&[a-z]+;",
r"&#\d+;?", # entidades numéricas tipo &#8230;
r"&#\d+;?",
r'width="\d+"',
r'height="\d+"',
]
# Words/phrases that are too generic or clearly noise
GENERIC_BAD_TAGS = {
"república",
"estado",
@@ -110,7 +109,6 @@ TOPIC_MAX_PER_DOC = 15
def _looks_like_attr_or_path(text_lower: str) -> bool:
"""Detecta cosas tipo rutas, atributos HTML, ids raros, etc."""
if text_lower.startswith("/"):
return True
if "http://" in text_lower or "https://" in text_lower:
@@ -125,20 +123,16 @@ def _looks_like_attr_or_path(text_lower: str) -> bool:
return True
if re.search(r"&#\d+;?", text_lower):
return True
# things like attribute=value
if "=" in text_lower and " " not in text_lower.strip():
return True
# long strings with no spaces (ids, hashes…)
if re.fullmatch(r"[a-z0-9_]{15,}", text_lower.replace("-", "").replace(" ", "")):
return True
# a single hyphenated word is usually a path/slug: wp-content, internal-photos…
if "-" in text_lower and " " not in text_lower:
return True
return False
def clean_tag_text(text: str) -> str | None:
"""Limpieza para entidades (PERSON/ORG/GPE/LOC)."""
if not text:
return None
text = BeautifulSoup(text, "html.parser").get_text()
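For reference only (not part of this commit): the heuristics in _looks_like_attr_or_path above imply behaviour along these lines. The snippet assumes it runs inside the same module, and the expectations only cover the checks visible in this hunk:

samples = [
    "/wp-content/uploads",    # leading slash -> path
    "https://example.org/x",  # URL
    "class=header",           # attribute=value with no spaces
    "internal-photos",        # single hyphenated word -> slug
    "cambio climático",       # ordinary phrase, should not be flagged
]
for s in samples:
    print(s, "->", _looks_like_attr_or_path(s))
# Expected: True for everything except "cambio climático".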
@@ -172,7 +166,6 @@ def clean_tag_text(text: str) -> str | None:
def clean_topic_text(text: str) -> str | None:
"""Limpieza para posibles 'temas' (noun_chunks)."""
if not text:
return None
text = BeautifulSoup(text, "html.parser").get_text()
@@ -184,11 +177,9 @@ def clean_topic_text(text: str) -> str | None:
return None
lower = text.lower()
if _looks_like_attr_or_path(lower):
return None
# tokenize in lowercase and strip punctuation
tokens = [
t.strip(string.punctuation)
for t in lower.split()
@@ -197,13 +188,11 @@ def clean_topic_text(text: str) -> str | None:
if not tokens:
return None
# drop the leading article if there is one
if tokens and tokens[0] in ARTICLES:
tokens = tokens[1:]
if not tokens:
return None
# rebuild the normalized text without the article
norm = " ".join(tokens).strip()
if len(norm) < TOPIC_MIN_CHARS:
return None
@@ -211,15 +200,12 @@ def clean_topic_text(text: str) -> str | None:
if norm in GENERIC_BAD_TAGS:
return None
# maximum word count
if len(tokens) > TOPIC_MAX_WORDS:
return None
# all stopwords => discard
if all(t in STOPWORDS for t in tokens):
return None
# only numbers/dates
if re.fullmatch(r"[0-9\s\.,\-:/]+", norm):
return None
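Again outside the diff: a quick way to exercise clean_topic_text against the rules above. The import path is a placeholder, and the expected outcomes assume ARTICLES/STOPWORDS hold the usual Spanish entries and GENERIC_BAD_TAGS contains "estado" as shown in the first hunk:

from tagging import clean_topic_text  # hypothetical module name, adjust to the real one

for raw in ["la crisis energética", "de la o un", "12/03/2024", "wp-content/uploads", "estado"]:
    print(repr(raw), "->", clean_topic_text(raw))
# Expected: only the first sample survives, returned as "crisis energética" with the
# leading article stripped; the rest are rejected as stopwords-only, numeric,
# path/slug-like or too generic.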
@@ -239,7 +225,6 @@ def extract_entities_and_topics(nlp, text: str) -> Tuple[List[Tuple[str, str]],
doc = nlp(text)
# "Classic" entities
for ent in doc.ents:
tipo = ENT_LABELS.get(ent.label_)
if not tipo:
@@ -249,7 +234,6 @@ def extract_entities_and_topics(nlp, text: str) -> Tuple[List[Tuple[str, str]],
continue
ents.append((val, tipo))
# "Topic" candidates from noun_chunks
topic_counter: Counter[str] = Counter()
for chunk in doc.noun_chunks:
@@ -265,7 +249,6 @@ def extract_entities_and_topics(nlp, text: str) -> Tuple[List[Tuple[str, str]],
continue
topics.append((val, "tema"))
# drop duplicates
ents = list(set(ents))
topics = list(set(topics))
return ents, topics
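A minimal way to drive extract_entities_and_topics end to end, assuming it is imported from (or run inside) the module this diff touches. The spaCy model name is an assumption (any Spanish pipeline with NER and a parser for noun_chunks should work), and the concrete type strings depend on ENT_LABELS:

import spacy

nlp = spacy.load("es_core_news_md")  # assumed model; noun_chunks requires a parser
text = "El presidente visitó Sevilla y se reunió con la Comisión Europea en 2024."
ents, topics = extract_entities_and_topics(nlp, text)
print(ents)    # deduplicated (value, type) pairs, types mapped through ENT_LABELS
print(topics)  # noun-chunk candidates tagged "tema", already filtered by clean_topic_text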