From 95adc07f37dc9d095cef2b778cc215ffc3645df0 Mon Sep 17 00:00:00 2001 From: jlimolina Date: Sun, 25 Jan 2026 07:33:57 +0100 Subject: [PATCH] aumento de velocidad y cambios en el tema de noticias relacionadas --- inspect_qdrant.py | 21 +++++ models/noticias.py | 155 +++++++++++++++++-------------- routers/noticia.py | 8 +- routers/search.py | 3 +- routers/stats.py | 1 + templates/noticia_classic.html | 165 ++++++++++++++++++++++++++++----- utils/qdrant_search.py | 15 ++- workers/qdrant_worker.py | 2 +- workers/translation_worker.py | 2 +- 9 files changed, 275 insertions(+), 97 deletions(-) create mode 100644 inspect_qdrant.py diff --git a/inspect_qdrant.py b/inspect_qdrant.py new file mode 100644 index 0000000..a8ff899 --- /dev/null +++ b/inspect_qdrant.py @@ -0,0 +1,21 @@ + +import os +import sys +sys.path.append(os.getcwd()) +from utils.qdrant_search import get_qdrant_client + +client = get_qdrant_client() +collection_name = "news_vectors" + +# Scroll some points to see payload +response = client.scroll( + collection_name=collection_name, + limit=5, + with_payload=True, + with_vectors=False +) + +for point in response[0]: + print(f"ID: {point.id}") + print(f"Payload: {point.payload}") + print("-" * 20) diff --git a/models/noticias.py b/models/noticias.py index 8b2b0ad..fc01e07 100644 --- a/models/noticias.py +++ b/models/noticias.py @@ -1,7 +1,7 @@ -from psycopg2 import extras -from typing import List, Dict, Optional, Tuple, Any import os -# from sentence_transformers import SentenceTransformer (Moved to functions to avoid heavy start-up) +from typing import List, Dict, Optional, Tuple, Any +from psycopg2 import extras +from utils.qdrant_search import semantic_search def _extraer_tags_por_traduccion(cur, traduccion_ids: List[int]) -> Dict[int, List[tuple]]: @@ -105,8 +105,24 @@ def buscar_noticias( cur.execute("SELECT reltuples::bigint FROM pg_class WHERE relname = 'noticias'") row = cur.fetchone() total_results = row[0] if row else 0 + elif q and not (categoria_id or pais_id or continente_id or fecha): + # Conteo optimizado para búsqueda simple (UNION de hits en noticias y traducciones) + cur.execute( + """ + SELECT COUNT(DISTINCT id) FROM ( + SELECT id FROM noticias + WHERE search_vector_es @@ websearch_to_tsquery('spanish', %s) + UNION ALL + SELECT noticia_id as id FROM traducciones + WHERE search_vector_es @@ websearch_to_tsquery('spanish', %s) + AND lang_to = %s AND status = 'done' + ) as all_hits + """, + (q, q, lang), + ) + total_results = cur.fetchone()[0] else: - # Conteo exacto si hay filtros (necesario para paginación filtrada) + # Conteo exacto si hay filtros combinados cur.execute( f""" SELECT COUNT(n.id) @@ -175,16 +191,7 @@ def buscar_noticias( return noticias, total_results, total_pages, tags_por_tr -# Cache del modelo para no cargarlo en cada petición -_model_cache = {} - -def _get_emb_model(): - from sentence_transformers import SentenceTransformer - model_name = os.environ.get("EMB_MODEL", "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2") - if model_name not in _model_cache: - device = "cuda" if torch.cuda.is_available() else "cpu" - _model_cache[model_name] = SentenceTransformer(model_name, device=device) - return _model_cache[model_name], model_name +# Embedding model loading moved to utils.qdrant_search def buscar_noticias_semantica( conn, @@ -194,77 +201,89 @@ def buscar_noticias_semantica( categoria_id: Optional[str] = None, continente_id: Optional[str] = None, pais_id: Optional[str] = None, - fecha: Optional[str] = None, + fecha: Optional[Any] = None, lang: str = "es", ) -> Tuple[List[Dict], int, int, Dict]: """ - Búsqueda semántica usando embeddings y similitud coseno (vía producto punto si están normalizados). + Búsqueda semántica optimizada usando Qdrant. + Cae de vuelta a búsqueda tradicional si falla. """ if not q.strip(): return buscar_noticias(conn, page, per_page, "", categoria_id, continente_id, pais_id, fecha, lang) - offset = (page - 1) * per_page - model, model_name = _get_emb_model() - - # Generar embedding de la consulta - q_emb = model.encode([q], normalize_embeddings=True)[0].tolist() - - where = ["t.status = 'done'", "t.lang_to = %s"] - params = [lang] - - if fecha: - where.append("n.fecha::date = %s") - params.append(fecha) + # Preparar filtros para Qdrant + q_filters = {"lang": lang} if categoria_id: - where.append("n.categoria_id = %s") - params.append(int(categoria_id)) + q_filters["categoria_id"] = int(categoria_id) if pais_id: - where.append("n.pais_id = %s") - params.append(int(pais_id)) - elif continente_id: - where.append("p.continente_id = %s") - params.append(int(continente_id)) + q_filters["pais_id"] = int(pais_id) + # Nota: No filtramos por fecha o continente en Qdrant por ahora para simplicidad, + # ya que requeriría lógica más compleja de filtrado en Qdrant (rango o joins manuales). + + # Realizar búsqueda en Qdrant + # Obtenemos más resultados de los necesarios para permitir re-filtrado o mejor ranking + # Pero no demasiados para mantener la velocidad + limit_qdrant = min(page * per_page * 2, 500) + + try: + results_q = semantic_search( + query=q, + limit=limit_qdrant, + score_threshold=0.35, + filters=q_filters + ) + except Exception as e: + print(f"⚠️ Error en búsqueda Qdrant, usando fallback: {e}") + return buscar_noticias(conn, page, per_page, q, categoria_id, continente_id, pais_id, fecha, lang) - where_sql = " AND ".join(where) + if not results_q: + # Fallback a búsqueda tradicional si no hay resultados semánticos + return buscar_noticias(conn, page, per_page, q, categoria_id, continente_id, pais_id, fecha, lang) + # El total real en Qdrant para esta búsqueda es difícil de saber sin una query de conteo separada, + # estimamos o usamos el tamaño de la lista retornada (limitada por nuestro umbral). + total_results = len(results_q) + total_pages = (total_results // per_page) + (1 if total_results % per_page else 0) + + # Paginación sobre los resultados de Qdrant + offset = (page - 1) * per_page + paged_results_q = results_q[offset : offset + per_page] + + if not paged_results_q: + return [], total_results, total_pages, {} + + # Enriquecer resultados con datos frescos de PostgreSQL + news_ids = [r['news_id'] for r in paged_results_q] + with conn.cursor(cursor_factory=extras.DictCursor) as cur: - # Consulta de búsqueda vectorial (usamos un array_agg o similar para el producto punto si no hay pgvector) - # Nota: Aquí asumo que usamos producto punto entre arrays de double precision - query_sql = f""" - WITH similarity AS ( - SELECT - te.traduccion_id, - ( - SELECT SUM(a*b) - FROM unnest(te.embedding, %s::double precision[]) AS t(a,b) - ) AS score - FROM traduccion_embeddings te - WHERE te.model = %s - ) - SELECT + cur.execute( + """ + SELECT n.id, n.titulo, n.resumen, n.url, n.fecha, n.imagen_url, n.fuente_nombre, c.nombre AS categoria, p.nombre AS pais, t.id AS traduccion_id, t.titulo_trad AS titulo_traducido, t.resumen_trad AS resumen_traducido, - TRUE AS tiene_traduccion, s.score - FROM similarity s - JOIN traducciones t ON t.id = s.traduccion_id - JOIN noticias n ON n.id = t.noticia_id + TRUE AS tiene_traduccion + FROM noticias n LEFT JOIN categorias c ON c.id = n.categoria_id LEFT JOIN paises p ON p.id = n.pais_id - WHERE {where_sql} - ORDER BY n.fecha DESC NULLS LAST, s.score DESC - LIMIT %s OFFSET %s - """ + LEFT JOIN traducciones t ON t.noticia_id = n.id AND t.lang_to = %s AND t.status = 'done' + WHERE n.id = ANY(%s) + """, + (lang, news_ids), + ) + db_rows = {row['id']: row for row in cur.fetchall()} - # Para el conteo total en semántica podemos simplificar o usar el mismo WHERE - cur.execute(f"SELECT COUNT(*) FROM traducciones t JOIN noticias n ON n.id = t.noticia_id LEFT JOIN paises p ON p.id = n.pais_id WHERE {where_sql}", params) - total_results = cur.fetchone()[0] - total_pages = (total_results // per_page) + (1 if total_results % per_page else 0) - - cur.execute(query_sql, [q_emb, model_name] + params + [per_page, offset]) - noticias = cur.fetchall() - - tr_ids = [n["traduccion_id"] for n in noticias] + # Mantener el orden de relevancia de Qdrant + noticias_enriquecidas = [] + for r_q in paged_results_q: + nid = r_q['news_id'] + if nid in db_rows: + row = dict(db_rows[nid]) + row['score'] = r_q['score'] # Añadir score de relevancia + noticias_enriquecidas.append(row) + + # Tags + tr_ids = [n["traduccion_id"] for n in noticias_enriquecidas if n.get("traduccion_id")] tags_por_tr = _extraer_tags_por_traduccion(cur, tr_ids) - return noticias, total_results, total_pages, tags_por_tr + return noticias_enriquecidas, total_results, total_pages, tags_por_tr diff --git a/routers/noticia.py b/routers/noticia.py index 9831f23..7702e8f 100644 --- a/routers/noticia.py +++ b/routers/noticia.py @@ -93,17 +93,21 @@ def noticia(): cur.execute( """ SELECT + n2.id, n2.url, - n2.titulo, + n2.titulo AS titulo_original, n2.fecha, n2.imagen_url, n2.fuente_nombre, rn.score, t2.titulo_trad, - t2.id AS related_tr_id + t2.id AS traduccion_id, + c.nombre AS categoria, + TRUE AS tiene_traduccion FROM related_noticias rn JOIN traducciones t2 ON t2.id = rn.related_traduccion_id JOIN noticias n2 ON n2.id = t2.noticia_id + LEFT JOIN categorias c ON c.id = n2.categoria_id WHERE rn.traduccion_id = %s ORDER BY rn.score DESC LIMIT 8; diff --git a/routers/search.py b/routers/search.py index 6ad95c2..8b1d8fc 100644 --- a/routers/search.py +++ b/routers/search.py @@ -42,7 +42,8 @@ def search(): semantic_results = semantic_search( query=q, limit=max_qdrant_results, - score_threshold=0.3 # Umbral más bajo para capturar más resultados + score_threshold=0.3, # Umbral más bajo para capturar más resultados + filters={"lang": lang} ) if semantic_results: diff --git a/routers/stats.py b/routers/stats.py index d3119e0..7b1df58 100644 --- a/routers/stats.py +++ b/routers/stats.py @@ -103,6 +103,7 @@ def aggregate_normalized_entities(rows, entity_type='persona'): @stats_bp.route("/") +@cached(ttl_seconds=600, prefix="stats_index") def index(): """Stats dashboard page.""" diff --git a/templates/noticia_classic.html b/templates/noticia_classic.html index 7a9a8bd..be3f211 100644 --- a/templates/noticia_classic.html +++ b/templates/noticia_classic.html @@ -127,6 +127,41 @@ {% endif %} + + + {% if related_news %} + + {% endif %} @@ -150,29 +185,6 @@ - - {% if related_news %} -
-

Artículos Relacionados

- -
- {% endif %} - {% if categorias %}
@@ -339,6 +351,113 @@ margin-top: 5px; font-size: 0.9rem; } + +.related-section { + margin-top: 50px; + padding-top: 30px; + border-top: 1px solid var(--border-color); +} + +.section-title { + font-size: 1.5rem; + color: var(--text-color); + margin-bottom: 25px; + position: relative; + padding-bottom: 10px; +} + +.section-title::after { + content: ''; + position: absolute; + bottom: 0; + left: 0; + width: 60px; + height: 3px; + background: var(--accent-color); +} + +.related-grid { + display: grid; + grid-template-columns: repeat(auto-fill, minmax(280px, 1fr)); + gap: 20px; + margin-bottom: 40px; +} + +.related-card { + background: var(--paper-color); + border: 1px solid var(--border-color); + border-radius: 8px; + overflow: hidden; + transition: transform 0.2s, box-shadow 0.2s; + display: flex; + flex-direction: column; +} + +.related-card:hover { + transform: translateY(-5px); + box-shadow: 0 10px 20px rgba(0,0,0,0.1); +} + +.related-card-image { + height: 160px; + overflow: hidden; +} + +.related-card-image img { + width: 100%; + height: 100%; + object-fit: cover; + transition: transform 0.3s; +} + +.related-card:hover .related-card-image img { + transform: scale(1.05); +} + +.related-card-body { + padding: 15px; + flex-grow: 1; + display: flex; + flex-direction: column; +} + +.related-badge { + font-size: 0.7rem; + text-transform: uppercase; + color: var(--accent-color); + font-weight: bold; + margin-bottom: 8px; +} + +.related-card h4 { + font-size: 1rem; + line-height: 1.4; + margin-bottom: 15px; + display: -webkit-box; + -webkit-line-clamp: 3; + -webkit-box-orient: vertical; + overflow: hidden; + flex-grow: 1; +} + +.related-card h4 a { + color: var(--text-color); + text-decoration: none; +} + +.related-card h4 a:hover { + color: var(--accent-color); +} + +.related-footer { + display: flex; + justify-content: space-between; + font-size: 0.75rem; + color: var(--muted-color); + margin-top: auto; + padding-top: 10px; + border-top: 1px solid var(--bg-color); +} `; document.head.appendChild(style); diff --git a/utils/qdrant_search.py b/utils/qdrant_search.py index b7c20b9..2ac6154 100644 --- a/utils/qdrant_search.py +++ b/utils/qdrant_search.py @@ -13,6 +13,7 @@ QDRANT_HOST = os.environ.get("QDRANT_HOST", "localhost") QDRANT_PORT = int(os.environ.get("QDRANT_PORT", "6333")) QDRANT_COLLECTION = os.environ.get("QDRANT_COLLECTION_NAME", "news_vectors") EMB_MODEL = os.environ.get("EMB_MODEL", "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2") +EMB_DEVICE = os.environ.get("EMB_DEVICE", "cpu") # Default to CPU, but check env # Singleton para clientes globales _qdrant_client: Optional[QdrantClient] = None @@ -47,7 +48,15 @@ def get_embedding_model() -> Any: global _embedding_model if _embedding_model is None: from sentence_transformers import SentenceTransformer - _embedding_model = SentenceTransformer(EMB_MODEL, device='cpu') + import torch + + device = EMB_DEVICE + if device == "cuda" and not torch.cuda.is_available(): + print("⚠️ CUDA solicitado pero no disponible, usando CPU") + device = "cpu" + + print(f"🤖 Cargando modelo de embeddings: {EMB_MODEL} en {device}") + _embedding_model = SentenceTransformer(EMB_MODEL, device=device) return _embedding_model @@ -90,6 +99,10 @@ def semantic_search( conditions = [] for key, value in filters.items(): if value is not None: + if key == "lang" and isinstance(value, str) and len(value) < 5: + # Character(5) in Postgres pads with spaces + value = value.ljust(5) + conditions.append( FieldCondition(key=key, match=MatchValue(value=value)) ) diff --git a/workers/qdrant_worker.py b/workers/qdrant_worker.py index cde3c65..e515168 100644 --- a/workers/qdrant_worker.py +++ b/workers/qdrant_worker.py @@ -109,7 +109,7 @@ def get_pending_news(limit: int = BATCH_SIZE) -> List[Dict[str, Any]]: SELECT t.id as traduccion_id, t.noticia_id, - t.lang_to as lang, + TRIM(t.lang_to) as lang, t.titulo_trad as titulo, t.resumen_trad as resumen, n.url, diff --git a/workers/translation_worker.py b/workers/translation_worker.py index 9fe5c8c..ad468ab 100644 --- a/workers/translation_worker.py +++ b/workers/translation_worker.py @@ -304,7 +304,7 @@ def _translate_texts(src, tgt, texts, beams, max_new_tokens): target_prefix=target_prefix, beam_size=beams, max_decoding_length=max_new, - repetition_penalty=1.1, + repetition_penalty=1.2, no_repeat_ngram_size=4, ) dt = time.time() - start