diff --git a/Dockerfile b/Dockerfile index df9d76a..1747b9f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,49 +1,59 @@ +# Dockerfile +# ----------- + # Imagen base Python FROM python:3.11-slim -# Por defecto construimos para CUDA 12.1 (cu121) -# Si alguna vez quisieras CPU, pásale: --build-arg TORCH_CUDA=cpu +# Construcción para CUDA 12.1 por defecto (usa --build-arg TORCH_CUDA=cpu para CPU) ARG TORCH_CUDA=cu121 WORKDIR /app -# Paquetes nativos necesarios +# Paquetes del sistema necesarios +# - libpq-dev y gcc: para compilar dependencias que hablen con PostgreSQL (psycopg2) +# - git: algunos modelos/liberías pueden tirar de git RUN apt-get update && apt-get install -y --no-install-recommends \ libpq-dev \ gcc \ git \ - && rm -rf /var/lib/apt/lists/* + && rm -rf /var/lib/apt/lists/* -# Requerimientos -COPY requirements.txt . +# Ajustes de pip / runtime +ENV PYTHONUNBUFFERED=1 \ + PIP_DISABLE_PIP_VERSION_CHECK=1 \ + TOKENIZERS_PARALLELISM=false \ + HF_HUB_DISABLE_SYMLINKS_WARNING=1 -# Actualiza pip y herramientas base +# Dependencias Python +COPY requirements.txt ./ RUN python -m pip install --no-cache-dir --upgrade pip setuptools wheel -# Instala PyTorch con el runtime CUDA 12.1 (o CPU si TORCH_CUDA=cpu) +# Instala PyTorch: +# - Con CUDA 12.1 si TORCH_CUDA=cu121 (requiere runtime nvidia al ejecutar) +# - Con ruedas CPU si TORCH_CUDA=cpu RUN if [ "$TORCH_CUDA" = "cu121" ]; then \ pip install --no-cache-dir --index-url https://download.pytorch.org/whl/cu121 \ - torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1; \ + torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1 ; \ else \ pip install --no-cache-dir \ - torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1; \ + torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1 ; \ fi # Instala el resto de dependencias de tu app RUN pip install --no-cache-dir -r requirements.txt -# spaCy + modelo español (para el NER que quieres después) -RUN pip install --no-cache-dir "spacy>=3.7,<4.0" \ - && python -m spacy download es_core_news_md +# Descarga el modelo de spaCy (español) para NER +# Si el entorno de build no tiene red, no rompas la build: intenta en runtime. +RUN python -m spacy download es_core_news_md || true -# Copia el código +# Copia el código de la app COPY . . -# (Opcional) descarga recursos NLTK si tu app los usa; si no, déjalo como no-op +# Descarga de recursos NLTK que usa newspaper3k (no crítico en build) RUN python download_models.py || true -# Puerto que usará gunicorn en el servicio web +# Puerto que usa gunicorn en el servicio web EXPOSE 8000 -# El CMD lo define docker-compose +# El CMD/entrypoint se define en docker-compose.yml (web, scheduler, workers) diff --git a/actualizar_repo.sh b/actualizar_repo.sh deleted file mode 100755 index b916498..0000000 --- a/actualizar_repo.sh +++ /dev/null @@ -1,35 +0,0 @@ -#!/bin/bash - -# --- Script para actualizar el repositorio de Git de forma robusta --- - -echo "🚀 Iniciando actualización del repositorio..." - -# 1. Sincronizar con el repositorio remoto para evitar conflictos -echo "----------------------------------------" -echo "🔄 Sincronizando con el repositorio remoto (git pull)..." -git pull || { echo "❌ Error al hacer git pull. Soluciona los conflictos y vuelve a intentarlo."; exit 1; } -echo "----------------------------------------" - - -# 2. Preparar todos los archivos modificados y nuevos -echo "➕ Añadiendo todos los archivos al área de preparación (git add .)" -git add . -git add -u # Asegura que los archivos eliminados también se registren - -# 3. 
Crear el mensaje del commit solo si hay cambios -COMMIT_MSG="Actualización del $(date +'%Y-%m-%d a las %H:%M:%S')" -echo "💬 Creando commit con el mensaje: '$COMMIT_MSG'" - -# Solo hacemos commit si hay algo que añadir para evitar commits vacíos -if ! git diff-index --quiet HEAD --; then - git commit -m "$COMMIT_MSG" -else - echo "ℹ️ No hay cambios que subir. El repositorio ya está actualizado." - exit 0 -fi - -# 4. Subir los cambios a GitHub -echo "⬆️ Subiendo cambios al repositorio remoto (git push)..." -git push || { echo "❌ Error al hacer git push. Revisa la conexión o los permisos."; exit 1; } - -echo "✅ ¡Actualización completada!" diff --git a/app.py b/app.py index 8052725..b740494 100644 --- a/app.py +++ b/app.py @@ -1,931 +1,1207 @@ import os -import sys import csv -import math -from io import StringIO, BytesIO -from datetime import datetime -import logging -import atexit -import zipfile -from contextlib import contextmanager +import io +import time +import socket +from datetime import datetime, date from concurrent.futures import ThreadPoolExecutor, as_completed -from tqdm import tqdm -from flask import Flask, render_template, request, redirect, url_for, Response, flash, make_response, abort +from dotenv import load_dotenv +from flask import ( + Flask, render_template, request, redirect, url_for, + flash, send_file +) +from markupsafe import Markup import psycopg2 -import psycopg2.extras -import psycopg2.pool -import bleach +import psycopg2.extras as extras -from feed_processor import process_single_feed -from url_processor import process_newspaper_url - -logging.basicConfig(stream=sys.stdout, level=logging.INFO, format='[%(asctime)s] %(levelname)s in %(module)s: %(message)s') - -app = Flask(__name__) -app.config['SECRET_KEY'] = os.environ.get('SECRET_KEY', os.urandom(24)) +load_dotenv() DB_CONFIG = { - "host": os.environ.get("DB_HOST", "localhost"), - "port": int(os.environ.get("DB_PORT", 5432)), - "dbname": os.environ.get("DB_NAME", "rss"), - "user": os.environ.get("DB_USER", "rss"), - "password": os.environ.get("DB_PASS", "x") + "dbname": os.getenv("DB_NAME", "rss"), + "user": os.getenv("DB_USER", "rss"), + "password": os.getenv("DB_PASS", ""), + "host": os.getenv("DB_HOST", "localhost"), + "port": int(os.getenv("DB_PORT", "5432")), } -MAX_WORKERS = int(os.environ.get("RSS_MAX_WORKERS", 20)) -SINGLE_FEED_TIMEOUT = int(os.environ.get("RSS_FEED_TIMEOUT", 30)) -MAX_FALLOS = int(os.environ.get("RSS_MAX_FAILURES", 5)) -NEWS_PER_PAGE = int(os.environ.get("NEWS_PER_PAGE", 20)) -DEFAULT_TRANSLATION_LANG = os.environ.get("DEFAULT_TRANSLATION_LANG", "es").strip().lower() -DEFAULT_LANG = os.environ.get("DEFAULT_LANG", DEFAULT_TRANSLATION_LANG).strip().lower() -WEB_TRANSLATED_DEFAULT = os.environ.get("WEB_TRANSLATED_DEFAULT", "1").strip().lower() in ("1", "true", "yes") +DEFAULT_LANG = os.getenv("DEFAULT_LANG", "es") +DEFAULT_TRANSLATION_LANG = os.getenv("DEFAULT_TRANSLATION_LANG", "es") +WEB_TRANSLATED_DEFAULT = os.getenv("WEB_TRANSLATED_DEFAULT", "1") == "1" +NEWS_PER_PAGE_DEFAULT = int(os.getenv("NEWS_PER_PAGE", "20")) + +SECRET_KEY = os.getenv("SECRET_KEY", "cambia_esta_clave_insegura") + +RSS_MAX_WORKERS = int(os.getenv("RSS_MAX_WORKERS", "10")) +RSS_FEED_TIMEOUT = int(os.getenv("RSS_FEED_TIMEOUT", "30")) +RSS_MAX_FAILURES = int(os.getenv("RSS_MAX_FAILURES", "5")) + +app = Flask(__name__) +app.config["SECRET_KEY"] = SECRET_KEY -db_pool = None -try: - db_pool = psycopg2.pool.SimpleConnectionPool(minconn=1, maxconn=10, **DB_CONFIG) - app.logger.info("Pool de conexiones a la base de datos 
creado exitosamente.") -except psycopg2.OperationalError as e: - logging.error(f"FATAL: No se pudo conectar a la base de datos para crear el pool: {e}") -@contextmanager def get_conn(): - if not db_pool: - raise ConnectionError("El pool de la base de datos no está disponible.") - conn = None - try: - conn = db_pool.getconn() - yield conn - conn.commit() - except Exception as e: - if conn: - conn.rollback() - raise e - finally: - if conn: - db_pool.putconn(conn) + return psycopg2.connect(**DB_CONFIG) -@atexit.register -def shutdown_hooks(): - if db_pool: - db_pool.closeall() - app.logger.info("Pool de conexiones de la base de datos cerrado.") -@app.template_filter('safe_html') -def safe_html(text): - if not text: +def safe_html(texto): + if not texto: return "" - return bleach.clean( - text, - tags={'a', 'b', 'strong', 'i', 'em', 'p', 'br', 'ul', 'ol', 'li', 'blockquote', 'h3', 'h4'}, - attributes={'a': ['href', 'title', 'rel', 'target']}, - strip=True - ) + return Markup(texto) -def _get_form_dependencies(cursor): - cursor.execute("SELECT id, nombre FROM categorias ORDER BY nombre") - categorias = cursor.fetchall() - cursor.execute("SELECT id, nombre, continente_id FROM paises ORDER BY nombre") - paises = cursor.fetchall() - return categorias, paises -def _get_lang_and_flags(): - qlang = request.args.get("lang", "").strip().lower() - cookie_lang = (request.cookies.get("lang") or "").strip().lower() - lang = qlang or cookie_lang or DEFAULT_LANG or "es" - force_orig = request.args.get("orig") == "1" - use_translation = (not force_orig) and WEB_TRANSLATED_DEFAULT - return lang, use_translation, bool(qlang) +app.jinja_env.filters["safe_html"] = safe_html -def _build_news_query(args, *, count=False, limit=None, offset=None, lang="es", use_translation=True): - select_rank_params = [] - from_params = [] - where_params = [] - tail_params = [] - conditions = [] - q = args.get("q", "").strip() - cat_id = args.get("categoria_id") - cont_id = args.get("continente_id") - pais_id = args.get("pais_id") - fecha_filtro = args.get("fecha") +def get_categorias(conn): + with conn.cursor(cursor_factory=extras.DictCursor) as cur: + cur.execute("SELECT id, nombre FROM categorias ORDER BY nombre;") + return cur.fetchall() - sql_from = """ - FROM noticias n - LEFT JOIN categorias c ON n.categoria_id = c.id - LEFT JOIN paises p ON n.pais_id = p.id - LEFT JOIN continentes co ON p.continente_id = co.id - """ - if (not count) and use_translation: - sql_from += """ - LEFT JOIN LATERAL ( - SELECT id AS traduccion_id, titulo_trad, resumen_trad - FROM traducciones - WHERE traducciones.noticia_id = n.id - AND traducciones.lang_to = %s - AND traducciones.status = 'done' - ORDER BY id DESC - LIMIT 1 - ) t ON TRUE - """ - from_params.append(lang) +def get_continentes(conn): + with conn.cursor(cursor_factory=extras.DictCursor) as cur: + cur.execute("SELECT id, nombre FROM continentes ORDER BY nombre;") + return cur.fetchall() - if q: - conditions.append("n.tsv @@ plainto_tsquery('spanish', %s)") - where_params.append(q) - if cat_id: - conditions.append("n.categoria_id = %s") - where_params.append(cat_id) - - if pais_id: - conditions.append("n.pais_id = %s") - where_params.append(pais_id) - elif cont_id: - conditions.append("p.continente_id = %s") - where_params.append(cont_id) - - if fecha_filtro: - try: - fecha_obj = datetime.strptime(fecha_filtro, '%Y-%m-%d') - conditions.append("n.fecha::date = %s") - where_params.append(fecha_obj.date()) - except ValueError: - flash("Formato de fecha no válido. 
Use AAAA-MM-DD.", "error") - - where_clause = " WHERE " + " AND ".join(conditions) if conditions else "" - - if count: - sql_count = "SELECT COUNT(*) " + sql_from + where_clause - sql_params = from_params + where_params - return sql_count, sql_params - - if use_translation: - select_cols = """ - SELECT - COALESCE(t.traduccion_id, NULL) AS traduccion_id, - n.fecha, - n.titulo AS titulo_original, - n.resumen AS resumen_original, - t.titulo_trad AS titulo_traducido, - t.resumen_trad AS resumen_traducido, - COALESCE(t.titulo_trad, n.titulo) AS titulo, - COALESCE(t.resumen_trad, n.resumen) AS resumen, - n.url, n.imagen_url, n.fuente_nombre, - c.nombre AS categoria, p.nombre AS pais, co.nombre AS continente, - (t.titulo_trad IS NOT NULL OR t.resumen_trad IS NOT NULL) AS tiene_traduccion - """ - else: - select_cols = """ - SELECT - NULL::int AS traduccion_id, - n.fecha, - n.titulo AS titulo_original, - n.resumen AS resumen_original, - NULL::text AS titulo_traducido, - NULL::text AS resumen_traducido, - n.titulo AS titulo, - n.resumen AS resumen, - n.url, n.imagen_url, n.fuente_nombre, - c.nombre AS categoria, p.nombre AS pais, co.nombre AS continente, - FALSE AS tiene_traduccion - """ - - order_clause = " ORDER BY n.fecha DESC NULLS LAST" - - if q: - select_cols = select_cols.replace( - "SELECT", - "SELECT ts_rank(n.tsv, plainto_tsquery('spanish', %s)) AS rank," - ) - select_rank_params.append(q) - order_clause = " ORDER BY rank DESC, n.fecha DESC NULLS LAST" - - if limit is not None: - order_clause += " LIMIT %s" - tail_params.append(limit) - if offset is not None: - order_clause += " OFFSET %s" - tail_params.append(offset) - - sql_page = select_cols + sql_from + where_clause + order_clause - sql_params = select_rank_params + from_params + where_params + tail_params - return sql_page, sql_params - -@app.route("/") -def home(): - noticias, categorias, continentes, paises = [], [], [], [] - - q = request.args.get("q", "").strip() - cat_id = request.args.get("categoria_id") - cont_id = request.args.get("continente_id") - pais_id = request.args.get("pais_id") - fecha_filtro = request.args.get("fecha") - - lang, use_tr, set_cookie = _get_lang_and_flags() - - page = request.args.get("page", default=1, type=int) - per_page = request.args.get("per_page", default=NEWS_PER_PAGE, type=int) - if per_page is None or per_page <= 0: - per_page = NEWS_PER_PAGE - per_page = 100 if per_page > 100 else (10 if per_page < 10 else per_page) - if page is None or page <= 0: - page = 1 - offset = (page - 1) * per_page - - total_results = 0 - total_pages = 0 - tags_por_trad = {} - - try: - with get_conn() as conn: - with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cursor: - cursor.execute("SELECT id, nombre FROM categorias ORDER BY nombre") - categorias = cursor.fetchall() - cursor.execute("SELECT id, nombre FROM continentes ORDER BY nombre") - continentes = cursor.fetchall() - cursor.execute("SELECT id, nombre, continente_id FROM paises ORDER BY nombre") - paises = cursor.fetchall() - - sql_count, params_count = _build_news_query( - request.args, count=True, lang=lang, use_translation=use_tr - ) - cursor.execute(sql_count, tuple(params_count)) - total_results = cursor.fetchone()[0] or 0 - total_pages = math.ceil(total_results / per_page) if total_results else 0 - - sql_page, params_page = _build_news_query( - request.args, - count=False, - limit=per_page, - offset=offset, - lang=lang, - use_translation=use_tr - ) - cursor.execute(sql_page, tuple(params_page)) - noticias = cursor.fetchall() - - # Cargar 
tags por traducción (si aplica) - tr_ids = [row['traduccion_id'] for row in noticias if row.get('traduccion_id')] - if tr_ids: - cursor.execute(""" - SELECT tn.traduccion_id, tg.valor, tg.tipo - FROM tags_noticia tn - JOIN tags tg ON tg.id = tn.tag_id - WHERE tn.traduccion_id = ANY(%s) - ORDER BY tg.tipo, tg.valor - """, (tr_ids,)) - for trid, valor, tipo in cursor.fetchall(): - tags_por_trad.setdefault(trid, []).append((valor, tipo)) - - except psycopg2.Error as db_err: - app.logger.error(f"[DB ERROR] Al leer noticias: {db_err}", exc_info=True) - flash("Error de base de datos al cargar las noticias.", "error") - - ctx = dict( - noticias=noticias, categorias=categorias, continentes=continentes, paises=paises, - cat_id=int(cat_id) if cat_id else None, cont_id=int(cont_id) if cont_id else None, - pais_id=int(pais_id) if pais_id else None, fecha_filtro=fecha_filtro, q=q, - page=page, per_page=per_page, total_pages=total_pages, total_results=total_results, - lang=lang, use_tr=use_tr, - tags_por_trad=tags_por_trad - ) - - if request.headers.get('X-Requested-With') == 'XMLHttpRequest': - resp = make_response(render_template('_noticias_list.html', **ctx)) - if set_cookie: - resp.set_cookie("lang", lang, max_age=60*60*24*365) - return resp - - html = render_template("noticias.html", **ctx) - resp = make_response(html) - if set_cookie: - resp.set_cookie("lang", lang, max_age=60*60*24*365) - return resp - -@app.get("/noticia/") -def noticia(tr_id): - with get_conn() as conn, conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: +def get_paises(conn): + with conn.cursor(cursor_factory=extras.DictCursor) as cur: cur.execute( """ - SELECT - t.id, - n.id AS noticia_id, - n.fecha, - n.titulo AS titulo_original, - n.resumen AS cuerpo_original, - t.titulo_trad AS titulo_traducido, - t.resumen_trad AS cuerpo_traducido, - n.url AS fuente_url, - n.fuente_nombre, - p.nombre AS pais, - co.nombre AS continente, - c.nombre AS categoria, - t.lang_to, - t.status - FROM traducciones t - JOIN noticias n ON n.id = t.noticia_id - LEFT JOIN paises p ON n.pais_id = p.id - LEFT JOIN continentes co ON p.continente_id = co.id - LEFT JOIN categorias c ON n.categoria_id = c.id - WHERE t.id = %s - """, - (tr_id,) + SELECT p.id, p.nombre, p.continente_id + FROM paises p + ORDER BY p.nombre; + """ ) - row = cur.fetchone() - if not row: - abort(404) - return render_template("noticia.html", r=row) + return cur.fetchall() + + +def normalize_url_py(u: str | None) -> str | None: + if not u: + return None + u = u.strip() + if not u: + return None + if "://" not in u: + u = "http://" + u + u = u.split("#", 1)[0] + + try: + from urllib.parse import urlsplit, urlunsplit, parse_qsl, urlencode + except ImportError: + return u + + sp = urlsplit(u) + scheme = sp.scheme.lower() + netloc = sp.netloc.lower() + + if netloc.startswith("www."): + netloc = netloc[4:] + + if scheme == "http" and netloc.endswith(":80"): + netloc = netloc[:-3] + if scheme == "https" and netloc.endswith(":443"): + netloc = netloc[:-4] + + qs_pairs = [] + for k, v in parse_qsl(sp.query, keep_blank_values=True): + kl = k.lower() + if kl.startswith("utm_"): + continue + if kl in ("gclid", "fbclid", "mc_cid", "mc_eid", "ref", "ref_src", "yclid", "igshid"): + continue + qs_pairs.append((k, v)) + new_query = urlencode(qs_pairs, doseq=True) + + path = sp.path or "/" + while "//" in path: + path = path.replace("//", "/") + if path != "/": + path = path.rstrip("/") + + return urlunsplit((scheme, netloc, path, new_query, "")) + + +def _parse_entry_date(entry) -> 
datetime | None: + dt = None + try: + if getattr(entry, "published_parsed", None): + import time as _time + dt = datetime.fromtimestamp(_time.mktime(entry.published_parsed)) + elif getattr(entry, "updated_parsed", None): + import time as _time + dt = datetime.fromtimestamp(_time.mktime(entry.updated_parsed)) + except Exception: + dt = None + return dt + + +def _process_feed(feed_row): + import feedparser + + feed_id = feed_row["id"] + feed_url = feed_row["url"] + feed_nombre = feed_row["nombre"] + categoria_id = feed_row["categoria_id"] + pais_id = feed_row["pais_id"] + + app.logger.info(f"[ingesta] Procesando feed {feed_id} '{feed_nombre}' ({feed_url})") + + try: + old_timeout = socket.getdefaulttimeout() + socket.setdefaulttimeout(RSS_FEED_TIMEOUT) + try: + parsed = feedparser.parse(feed_url) + finally: + socket.setdefaulttimeout(old_timeout) + + if parsed.bozo and parsed.bozo_exception: + app.logger.warning(f"[ingesta] Feed {feed_id} bozo={parsed.bozo}: {parsed.bozo_exception}") + + entries = parsed.entries or [] + nuevos = 0 + + with get_conn() as conn: + conn.autocommit = True + with conn.cursor() as cur: + for entry in entries: + link = getattr(entry, "link", None) or getattr(entry, "id", None) + if not link: + continue + + url_norm = normalize_url_py(link) + if not url_norm: + continue + + titulo = getattr(entry, "title", None) or url_norm + resumen = getattr(entry, "summary", None) or getattr(entry, "description", None) + if resumen: + resumen = resumen[:4000] + + fecha = _parse_entry_date(entry) or datetime.utcnow() + + imagen_url = None + try: + if hasattr(entry, "media_content") and entry.media_content: + imagen_url = entry.media_content[0].get("url") + except Exception: + imagen_url = None + + if not imagen_url: + try: + if hasattr(entry, "links"): + for l in entry.links: + if l.get("rel") == "enclosure" and l.get("type", "").startswith("image/"): + imagen_url = l.get("href") + break + except Exception: + imagen_url = None + + try: + cur.execute( + """ + INSERT INTO noticias + (id, titulo, resumen, url, fecha, imagen_url, + fuente_nombre, categoria_id, pais_id) + VALUES (md5(%s), %s, %s, %s, %s, %s, %s, %s, %s) + ON CONFLICT (url) DO NOTHING; + """, + ( + url_norm, + titulo, + resumen, + url_norm, + fecha, + imagen_url, + feed_nombre, + categoria_id, + pais_id, + ), + ) + if cur.rowcount > 0: + nuevos += 1 + except psycopg2.Error as e: + app.logger.warning(f"[ingesta] Error insertando noticia de {feed_url}: {e}") + + # Si ha ido bien, reseteamos fallos + with get_conn() as conn, conn.cursor() as cur: + cur.execute( + "UPDATE feeds SET fallos = 0 WHERE id = %s;", + (feed_id,), + ) + + app.logger.info(f"[ingesta] Feed {feed_id} OK. 
Nuevas noticias: {nuevos}") + + except Exception as e: + app.logger.exception(f"[ingesta] Error procesando feed {feed_id} ({feed_url}): {e}") + try: + # Incrementamos fallos y marcamos inactivo si supera RSS_MAX_FAILURES + with get_conn() as conn, conn.cursor() as cur: + cur.execute( + """ + UPDATE feeds + SET fallos = COALESCE(fallos, 0) + 1, + activo = CASE + WHEN COALESCE(fallos, 0) + 1 >= %s THEN FALSE + ELSE activo + END + WHERE id = %s; + """, + (RSS_MAX_FAILURES, feed_id), + ) + except Exception as e2: + app.logger.warning(f"[ingesta] No se pudo actualizar fallos de feed {feed_id}: {e2}") + + +def fetch_and_store_all(): + app.logger.info("[ingesta] fetch_and_store_all() iniciado") + + with get_conn() as conn, conn.cursor(cursor_factory=extras.DictCursor) as cur: + cur.execute( + """ + SELECT id, nombre, url, categoria_id, pais_id, fallos, activo + FROM feeds + WHERE activo = TRUE + AND (fallos IS NULL OR fallos < %s) + ORDER BY id; + """, + (RSS_MAX_FAILURES,), + ) + feeds = cur.fetchall() + + if not feeds: + app.logger.info("[ingesta] No hay feeds activos para procesar.") + return + + app.logger.info(f"[ingesta] Procesando {len(feeds)} feeds (max workers = {RSS_MAX_WORKERS})") + + max_workers = max(1, RSS_MAX_WORKERS) + with ThreadPoolExecutor(max_workers=max_workers) as executor: + futures = {executor.submit(_process_feed, f): f for f in feeds} + for future in as_completed(futures): + feed_row = futures[future] + try: + future.result() + except Exception as e: + app.logger.exception(f"[ingesta] Excepción no controlada en feed {feed_row['id']}: {e}") + + app.logger.info("[ingesta] fetch_and_store_all() terminado") + + +@app.route("/") +@app.route("/home") +def home(): + page = max(int(request.args.get("page", 1) or 1), 1) + per_page = int(request.args.get("per_page", NEWS_PER_PAGE_DEFAULT) or NEWS_PER_PAGE_DEFAULT) + per_page = min(max(per_page, 10), 100) + + q = (request.args.get("q") or "").strip() + categoria_id = request.args.get("categoria_id") or None + continente_id = request.args.get("continente_id") or None + pais_id = request.args.get("pais_id") or None + fecha_str = request.args.get("fecha") or "" + lang = (request.args.get("lang") or DEFAULT_TRANSLATION_LANG or DEFAULT_LANG).lower()[:5] + orig_flag = request.args.get("orig") + use_tr = not bool(orig_flag) + + fecha_filtro = None + if fecha_str: + try: + fecha_filtro = datetime.strptime(fecha_str, "%Y-%m-%d").date() + except ValueError: + fecha_filtro = None + + offset = (page - 1) * per_page + + with get_conn() as conn: + conn.autocommit = True + categorias = get_categorias(conn) + continentes = get_continentes(conn) + paises = get_paises(conn) + + params = [] + where = ["1=1"] + + if fecha_filtro: + where.append("n.fecha::date = %s") + params.append(fecha_filtro) + + if categoria_id: + where.append("n.categoria_id = %s") + params.append(int(categoria_id)) + + if pais_id: + where.append("n.pais_id = %s") + params.append(int(pais_id)) + elif continente_id: + where.append("p.continente_id = %s") + params.append(int(continente_id)) + + if q: + where.append("n.tsv @@ plainto_tsquery('spanish', %s)") + params.append(q) + + where_sql = " AND ".join(where) + + with conn.cursor(cursor_factory=extras.DictCursor) as cur: + cur.execute( + f""" + SELECT COUNT(*) + FROM noticias n + LEFT JOIN categorias c ON c.id = n.categoria_id + LEFT JOIN paises p ON p.id = n.pais_id + WHERE {where_sql} + """, + params, + ) + total_results = cur.fetchone()[0] if cur.rowcount else 0 + total_pages = (total_results // per_page) + (1 if total_results 
% per_page else 0) + + cur.execute( + f""" + SELECT + n.id, + n.titulo, + n.resumen, + n.url, + n.fecha, + n.imagen_url, + n.fuente_nombre, + c.nombre AS categoria, + p.nombre AS pais, + t.id AS traduccion_id, + t.titulo_trad AS titulo_traducido, + t.resumen_trad AS resumen_traducido, + CASE WHEN t.id IS NOT NULL THEN TRUE ELSE FALSE END AS tiene_traduccion, + n.titulo AS titulo_original, + n.resumen AS resumen_original + FROM noticias n + LEFT JOIN categorias c ON c.id = n.categoria_id + LEFT JOIN paises p ON p.id = n.pais_id + LEFT JOIN traducciones t + ON t.noticia_id = n.id + AND t.lang_to = %s + AND t.status = 'done' + WHERE {where_sql} + ORDER BY n.fecha DESC NULLS LAST, n.id DESC + LIMIT %s OFFSET %s + """, + [lang] + params + [per_page, offset], + ) + noticias = cur.fetchall() + + tags_por_tr = {} + tr_ids = [n["traduccion_id"] for n in noticias if n["traduccion_id"]] + if tr_ids: + cur.execute( + """ + SELECT tn.traduccion_id, tg.valor, tg.tipo + FROM tags_noticia tn + JOIN tags tg ON tg.id = tn.tag_id + WHERE tn.traduccion_id = ANY(%s); + """, + (tr_ids,), + ) + for tr_id, valor, tipo in cur.fetchall(): + tags_por_tr.setdefault(tr_id, []).append((valor, tipo)) + + context = dict( + noticias=noticias, + total_results=total_results, + total_pages=total_pages, + page=page, + per_page=per_page, + categorias=categorias, + continentes=continentes, + paises=paises, + q=q, + cat_id=int(categoria_id) if categoria_id else None, + cont_id=int(continente_id) if continente_id else None, + pais_id=int(pais_id) if pais_id else None, + fecha_filtro=fecha_str, + use_tr=use_tr, + lang=lang, + tags_por_tr=tags_por_tr, + ) + + if request.headers.get("X-Requested-With") == "XMLHttpRequest": + return render_template("_noticias_list.html", **context) + + return render_template("noticias.html", **context) + + +@app.route("/noticia") +def noticia(): + tr_id = request.args.get("tr_id") + noticia_id = request.args.get("id") + + if not tr_id and not noticia_id: + flash("No se ha indicado ninguna noticia.", "warning") + return redirect(url_for("home")) + + with get_conn() as conn: + conn.autocommit = True + with conn.cursor(cursor_factory=extras.DictCursor) as cur: + dato = None + + if tr_id: + cur.execute( + """ + SELECT + t.id AS traduccion_id, + t.lang_from, + t.lang_to, + t.titulo_trad, + t.resumen_trad, + n.id As noticia_id, + n.titulo AS titulo_orig, + n.resumen AS resumen_orig, + n.url, + n.fecha, + n.imagen_url, + n.fuente_nombre, + c.nombre AS categoria, + p.nombre AS pais + FROM traducciones t + JOIN noticias n ON n.id = t.noticia_id + LEFT JOIN categorias c ON c.id = n.categoria_id + LEFT JOIN paises p ON p.id = n.pais_id + WHERE t.id = %s + """, + (int(tr_id),), + ) + dato = cur.fetchone() + else: + cur.execute( + """ + SELECT + NULL AS traduccion_id, + NULL AS lang_from, + NULL AS lang_to, + NULL AS titulo_trad, + NULL AS resumen_trad, + n.id AS noticia_id, + n.titulo AS titulo_orig, + n.resumen AS resumen_orig, + n.url, + n.fecha, + n.imagen_url, + n.fuente_nombre, + c.nombre AS categoria, + p.nombre AS pais + FROM noticias n + LEFT JOIN categorias c ON c.id = n.categoria_id + LEFT JOIN paises p ON p.id = n.pais_id + WHERE n.id = %s + """, + (noticia_id,), + ) + dato = cur.fetchone() + + tags = [] + relacionadas = [] + + if dato and dato["traduccion_id"]: + cur.execute( + """ + SELECT tg.valor, tg.tipo + FROM tags_noticia tn + JOIN tags tg ON tg.id = tn.tag_id + WHERE tn.traduccion_id = %s + ORDER BY tg.tipo, tg.valor; + """, + (dato["traduccion_id"],), + ) + tags = cur.fetchall() + + 
cur.execute( + """ + SELECT + n2.url, + n2.titulo, + n2.fecha, + n2.imagen_url, + n2.fuente_nombre, + rn.score + FROM related_noticias rn + JOIN traducciones t2 ON t2.id = rn.related_traduccion_id + JOIN noticias n2 ON n2.id = t2.noticia_id + WHERE rn.traduccion_id = %s + ORDER BY rn.score DESC + LIMIT 8; + """, + (dato["traduccion_id"],), + ) + relacionadas = cur.fetchall() + + return render_template( + "noticia.html", + dato=dato, + tags=tags, + relacionadas=relacionadas, + ) + @app.route("/dashboard") def dashboard(): - stats = {'feeds_totales': 0, 'noticias_totales': 0, 'feeds_caidos': 0} - top_tags = [] + with get_conn() as conn: + conn.autocommit = True + with conn.cursor(cursor_factory=extras.DictCursor) as cur: + cur.execute("SELECT COUNT(*) FROM feeds;") + feeds_totales = cur.fetchone()[0] - try: - with get_conn() as conn: - # Usamos DictCursor aquí para poder usar t.valor / t.tipo / t.apariciones en Jinja - with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cursor: - cursor.execute("SELECT COUNT(*) FROM feeds") - stats['feeds_totales'] = cursor.fetchone()[0] - cursor.execute("SELECT COUNT(*) FROM noticias") - stats['noticias_totales'] = cursor.fetchone()[0] - cursor.execute("SELECT COUNT(*) FROM feeds WHERE activo = FALSE") - stats['feeds_caidos'] = cursor.fetchone()[0] + cur.execute("SELECT COUNT(*) FROM noticias;") + noticias_totales = cur.fetchone()[0] - cursor.execute(""" - SELECT valor, tipo, apariciones - FROM v_tag_counts_24h - ORDER BY apariciones DESC, valor - LIMIT 20 - """) - top_tags = cursor.fetchall() + cur.execute("SELECT COUNT(*) FROM feeds WHERE activo = FALSE;") + feeds_caidos = cur.fetchone()[0] + + stats = { + "feeds_totales": feeds_totales, + "noticias_totales": noticias_totales, + "feeds_caidos": feeds_caidos, + } + + top_tags = [] + try: + cur.execute( + "SELECT id, valor, tipo, apariciones FROM v_tag_counts_24h ORDER BY apariciones DESC LIMIT 100;" + ) + top_tags = cur.fetchall() + except psycopg2.Error: + top_tags = [] - except psycopg2.Error as db_err: - app.logger.error(f"[DB ERROR] Al calcular estadísticas: {db_err}") - flash("Error al conectar con la base de datos.", "error") return render_template("dashboard.html", stats=stats, top_tags=top_tags) -@app.route("/feeds/manage") + +@app.route("/feeds") def manage_feeds(): - page = request.args.get('page', 1, type=int) - per_page = 20 + page = max(int(request.args.get("page", 1) or 1), 1) + per_page = 50 offset = (page - 1) * per_page - feeds_list, total_feeds = [], 0 - try: - with get_conn() as conn: - with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cursor: - cursor.execute("SELECT COUNT(*) FROM feeds") - total_feeds = cursor.fetchone()[0] - cursor.execute(""" - SELECT f.id, f.nombre, f.url, c.nombre as categoria, p.nombre as pais, f.idioma, f.activo, f.fallos - FROM feeds f - LEFT JOIN categorias c ON f.categoria_id = c.id - LEFT JOIN paises p ON f.pais_id = p.id - ORDER BY f.nombre LIMIT %s OFFSET %s - """, (per_page, offset)) - feeds_list = cursor.fetchall() - except psycopg2.Error as db_err: - app.logger.error(f"[DB ERROR] Al obtener lista de feeds: {db_err}") - flash("Error al obtener la lista de feeds.", "error") - total_pages = math.ceil(total_feeds / per_page) if total_feeds > 0 else 0 - return render_template("feeds_list.html", feeds=feeds_list, page=page, total_pages=total_pages, total_feeds=total_feeds) -@app.route("/feeds/add", methods=['GET', 'POST']) + with get_conn() as conn, conn.cursor(cursor_factory=extras.DictCursor) as cur: + cur.execute("SELECT COUNT(*) 
FROM feeds;") + total_feeds = cur.fetchone()[0] if cur.rowcount else 0 + total_pages = (total_feeds // per_page) + (1 if total_feeds % per_page else 0) + + cur.execute( + """ + SELECT + f.id, + f.nombre, + f.descripcion, + f.url, + f.activo, + f.fallos, + c.nombre AS categoria, + p.nombre AS pais + FROM feeds f + LEFT JOIN categorias c ON c.id = f.categoria_id + LEFT JOIN paises p ON p.id = f.pais_id + ORDER BY f.nombre + LIMIT %s OFFSET %s; + """, + (per_page, offset), + ) + feeds = cur.fetchall() + + cur.execute("SELECT id, nombre FROM categorias ORDER BY nombre;") + categorias = cur.fetchall() + cur.execute("SELECT id, nombre FROM paises ORDER BY nombre;") + paises = cur.fetchall() + + return render_template( + "feeds_list.html", + feeds=feeds, + total_feeds=total_feeds, + total_pages=total_pages, + page=page, + categorias=categorias, + paises=paises, + ) + + +@app.route("/feeds/add", methods=["GET", "POST"]) def add_feed(): - if request.method == 'POST': - nombre = request.form.get("nombre") - try: - with get_conn() as conn: - with conn.cursor() as cursor: - categoria_id = int(request.form.get("categoria_id")) if request.form.get("categoria_id") else None - pais_id = int(request.form.get("pais_id")) if request.form.get("pais_id") else None - cursor.execute( - "INSERT INTO feeds (nombre, descripcion, url, categoria_id, pais_id, idioma) VALUES (%s, %s, %s, %s, %s, %s)", - (nombre, request.form.get("descripcion"), request.form.get("url"), categoria_id, pais_id, (request.form.get("idioma", "").strip() or None)) - ) - flash(f"Feed '{nombre}' añadido correctamente.", "success") - except psycopg2.Error as db_err: - app.logger.error(f"[DB ERROR] Al agregar feed: {db_err}", exc_info=True) - flash(f"Error al añadir el feed: {db_err}", "error") - return redirect(url_for("manage_feeds")) + with get_conn() as conn: + conn.autocommit = True + categorias = get_categorias(conn) + paises = get_paises(conn) + + if request.method == "POST": + nombre = request.form.get("nombre") + descripcion = request.form.get("descripcion") or None + url = request.form.get("url") + categoria_id = request.form.get("categoria_id") or None + pais_id = request.form.get("pais_id") or None + idioma = (request.form.get("idioma") or "").strip().lower()[:2] or None + + try: + with conn.cursor() as cur: + cur.execute( + """ + INSERT INTO feeds (nombre, descripcion, url, categoria_id, pais_id, idioma) + VALUES (%s, %s, %s, %s, %s, %s); + """, + ( + nombre, + descripcion, + url, + int(categoria_id) if categoria_id else None, + int(pais_id) if pais_id else None, + idioma, + ), + ) + flash(f"Feed '{nombre}' añadido correctamente.", "success") + return redirect(url_for("manage_feeds")) + except psycopg2.Error as e: + flash(f"Error al añadir feed: {e}", "error") - categorias, paises = [], [] - try: - with get_conn() as conn: - with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cursor: - categorias, paises = _get_form_dependencies(cursor) - except psycopg2.Error as db_err: - app.logger.error(f"[DB ERROR] Al cargar formulario: {db_err}") - flash("No se pudieron cargar las categorías o países.", "error") return render_template("add_feed.html", categorias=categorias, paises=paises) -@app.route("/feeds/edit/", methods=["GET", "POST"]) -def edit_feed(feed_id): - if request.method == "POST": - try: - with get_conn() as conn: - with conn.cursor() as cursor: - categoria_id = int(request.form.get("categoria_id")) if request.form.get("categoria_id") else None - pais_id = int(request.form.get("pais_id")) if request.form.get("pais_id") 
else None - idioma = request.form.get("idioma", "").strip() or None - activo = "activo" in request.form - cursor.execute( - "UPDATE feeds SET nombre=%s, descripcion=%s, url=%s, categoria_id=%s, pais_id=%s, idioma=%s, activo=%s WHERE id=%s", - (request.form.get("nombre"), request.form.get("descripcion"), request.form.get("url"), categoria_id, pais_id, idioma, activo, feed_id) - ) - flash("Feed actualizado correctamente.", "success") - except psycopg2.Error as db_err: - app.logger.error(f"[DB ERROR] Al actualizar feed: {db_err}", exc_info=True) - flash(f"Error al actualizar el feed: {db_err}", "error") - return redirect(url_for("manage_feeds")) - feed, categorias, paises = None, [], [] - try: - with get_conn() as conn: - with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cursor: - cursor.execute("SELECT * FROM feeds WHERE id = %s", (feed_id,)) - feed = cursor.fetchone() - if not feed: - flash("No se encontró el feed solicitado.", "error") +@app.route("/feeds//edit", methods=["GET", "POST"]) +def edit_feed(feed_id): + with get_conn() as conn: + conn.autocommit = True + with conn.cursor(cursor_factory=extras.DictCursor) as cur: + cur.execute("SELECT * FROM feeds WHERE id = %s;", (feed_id,)) + feed = cur.fetchone() + if not feed: + flash("Feed no encontrado.", "error") + return redirect(url_for("manage_feeds")) + + categorias = get_categorias(conn) + paises = get_paises(conn) + + if request.method == "POST": + nombre = request.form.get("nombre") + descripcion = request.form.get("descripcion") or None + url = request.form.get("url") + categoria_id = request.form.get("categoria_id") or None + pais_id = request.form.get("pais_id") or None + idioma = (request.form.get("idioma") or "").strip().lower()[:2] or None + activo = bool(request.form.get("activo")) + + try: + cur.execute( + """ + UPDATE feeds + SET nombre = %s, + descripcion = %s, + url = %s, + categoria_id = %s, + pais_id = %s, + idioma = %s, + activo = %s + WHERE id = %s; + """, + ( + nombre, + descripcion, + url, + int(categoria_id) if categoria_id else None, + int(pais_id) if pais_id else None, + idioma, + activo, + feed_id, + ), + ) + flash("Feed actualizado correctamente.", "success") return redirect(url_for("manage_feeds")) - categorias, paises = _get_form_dependencies(cursor) - except psycopg2.Error as db_err: - flash("Error al cargar el feed para editar.", "error") - app.logger.error(f"Error al cargar feed {feed_id} para editar: {db_err}") - return redirect(url_for("manage_feeds")) + except psycopg2.Error as e: + flash(f"Error al actualizar feed: {e}", "error") + return render_template("edit_feed.html", feed=feed, categorias=categorias, paises=paises) -@app.route("/feeds/delete/") + +@app.route("/feeds//delete") def delete_feed(feed_id): - try: - with get_conn() as conn: - with conn.cursor() as cursor: - cursor.execute("DELETE FROM feeds WHERE id=%s", (feed_id,)) - flash("Feed eliminado correctamente.", "success") - except psycopg2.Error as db_err: - app.logger.error(f"[DB ERROR] Al eliminar feed: {db_err}", exc_info=True) - flash(f"Error al eliminar el feed: {db_err}", "error") + with get_conn() as conn, conn.cursor() as cur: + try: + cur.execute("DELETE FROM feeds WHERE id = %s;", (feed_id,)) + flash("Feed eliminado.", "success") + except psycopg2.Error as e: + flash(f"No se pudo eliminar el feed: {e}", "error") return redirect(url_for("manage_feeds")) -@app.route("/feeds/reactivar/") + +@app.route("/feeds//reactivar") def reactivar_feed(feed_id): - try: - with get_conn() as conn: - with conn.cursor() as cursor: - 
cursor.execute("UPDATE feeds SET activo = TRUE, fallos = 0 WHERE id = %s", (feed_id,)) - flash("Feed reactivado.", "success") - except psycopg2.Error as db_err: - flash(f"Error al reactivar feed: {db_err}", "error") + with get_conn() as conn, conn.cursor() as cur: + try: + cur.execute( + "UPDATE feeds SET activo = TRUE, fallos = 0 WHERE id = %s;", + (feed_id,), + ) + flash("Feed reactivado.", "success") + except psycopg2.Error as e: + flash(f"No se pudo reactivar el feed: {e}", "error") return redirect(url_for("manage_feeds")) -@app.route("/urls/manage") -def manage_urls(): - fuentes = [] - try: - with get_conn() as conn: - with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cursor: - cursor.execute(""" - SELECT f.id, f.nombre, f.url, c.nombre as categoria, p.nombre as pais, f.idioma - FROM fuentes_url f - LEFT JOIN categorias c ON f.categoria_id = c.id - LEFT JOIN paises p ON f.pais_id = p.id - ORDER BY f.nombre - """) - fuentes = cursor.fetchall() - except psycopg2.Error as db_err: - app.logger.error(f"[DB ERROR] Al obtener lista de fuentes URL: {db_err}") - flash("Error al obtener la lista de fuentes URL.", "error") - return render_template("urls_list.html", fuentes=fuentes) -@app.route("/urls/add", methods=['GET', 'POST']) -def add_url_source(): - if request.method == 'POST': - nombre = request.form.get("nombre") - try: - with get_conn() as conn: - with conn.cursor() as cursor: - categoria_id = int(request.form.get("categoria_id")) if request.form.get("categoria_id") else None - pais_id = int(request.form.get("pais_id")) if request.form.get("pais_id") else None - idioma = request.form.get("idioma", "es").strip().lower() - cursor.execute( - "INSERT INTO fuentes_url (nombre, url, categoria_id, pais_id, idioma) VALUES (%s, %s, %s, %s, %s)", - (nombre, request.form.get("url"), categoria_id, pais_id, idioma) - ) - flash(f"Fuente URL '{nombre}' añadida correctamente.", "success") - except psycopg2.Error as db_err: - app.logger.error(f"[DB ERROR] Al agregar fuente URL: {db_err}", exc_info=True) - flash(f"Error al añadir la fuente URL: {db_err}", "error") - return redirect(url_for("manage_urls")) +@app.route("/add", methods=["POST"]) +def legacy_add_feed(): + return add_feed() - categorias, paises = [], [] - try: - with get_conn() as conn: - with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cursor: - categorias, paises = _get_form_dependencies(cursor) - except psycopg2.Error as db_err: - app.logger.error(f"[DB ERROR] Al cargar formulario: {db_err}") - flash("No se pudieron cargar las categorías o países.", "error") - return render_template("add_url_source.html", categorias=categorias, paises=paises) - -@app.route("/urls/edit/", methods=["GET", "POST"]) -def edit_url_source(url_id): - if request.method == "POST": - try: - with get_conn() as conn: - with conn.cursor() as cursor: - categoria_id = int(request.form.get("categoria_id")) if request.form.get("categoria_id") else None - pais_id = int(request.form.get("pais_id")) if request.form.get("pais_id") else None - idioma = request.form.get("idioma", "es").strip().lower() - cursor.execute( - "UPDATE fuentes_url SET nombre=%s, url=%s, categoria_id=%s, pais_id=%s, idioma=%s WHERE id=%s", - (request.form.get("nombre"), request.form.get("url"), categoria_id, pais_id, idioma, url_id) - ) - flash("Fuente URL actualizada correctamente.", "success") - except psycopg2.Error as db_err: - app.logger.error(f"[DB ERROR] Al actualizar fuente URL: {db_err}", exc_info=True) - flash(f"Error al actualizar la fuente URL: {db_err}", "error") 
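El contador de fallos de los feeds forma un pequeño ciclo de vida: _process_feed() incrementa fallos y desactiva el feed al alcanzar RSS_MAX_FAILURES, y reactivar_feed() lo vuelve a poner a cero. Un boceto mínimo de esa misma lógica fuera de las rutas Flask, por ejemplo para pruebas o scripts de mantenimiento (asume la tabla feeds con las columnas activo y fallos del esquema anterior; los nombres registrar_fallo y reactivar son hipotéticos y no forman parte del diff):

# Boceto ilustrativo: mismo SQL que usan _process_feed() y reactivar_feed().
import os
import psycopg2

DB_CONFIG = {
    "dbname": os.getenv("DB_NAME", "rss"),
    "user": os.getenv("DB_USER", "rss"),
    "password": os.getenv("DB_PASS", ""),
    "host": os.getenv("DB_HOST", "localhost"),
    "port": int(os.getenv("DB_PORT", "5432")),
}
RSS_MAX_FAILURES = int(os.getenv("RSS_MAX_FAILURES", "5"))


def registrar_fallo(feed_id: int) -> None:
    """Incrementa fallos y desactiva el feed si alcanza el umbral."""
    with psycopg2.connect(**DB_CONFIG) as conn, conn.cursor() as cur:
        cur.execute(
            """
            UPDATE feeds
            SET fallos = COALESCE(fallos, 0) + 1,
                activo = CASE
                    WHEN COALESCE(fallos, 0) + 1 >= %s THEN FALSE
                    ELSE activo
                END
            WHERE id = %s;
            """,
            (RSS_MAX_FAILURES, feed_id),
        )


def reactivar(feed_id: int) -> None:
    """Equivalente a la ruta /feeds/<id>/reactivar: resetea el contador."""
    with psycopg2.connect(**DB_CONFIG) as conn, conn.cursor() as cur:
        cur.execute(
            "UPDATE feeds SET activo = TRUE, fallos = 0 WHERE id = %s;",
            (feed_id,),
        )

Con este par de funciones se puede simular en una prueba que un feed falla N veces seguidas y comprobar que queda con activo = FALSE hasta que alguien lo reactiva desde la interfaz.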
- return redirect(url_for("manage_urls")) - - fuente, categorias, paises = None, [], [] - try: - with get_conn() as conn: - with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cursor: - cursor.execute("SELECT * FROM fuentes_url WHERE id = %s", (url_id,)) - fuente = cursor.fetchone() - if not fuente: - flash("No se encontró la fuente URL solicitada.", "error") - return redirect(url_for("manage_urls")) - categorias, paises = _get_form_dependencies(cursor) - except psycopg2.Error as db_err: - flash("Error al cargar la fuente URL para editar.", "error") - app.logger.error(f"Error al cargar fuente URL {url_id} para editar: {db_err}") - return redirect(url_for("manage_urls")) - return render_template("edit_url_source.html", fuente=fuente, categorias=categorias, paises=paises) - -@app.route("/urls/delete/") -def delete_url_source(url_id): - try: - with get_conn() as conn: - with conn.cursor() as cursor: - cursor.execute("DELETE FROM fuentes_url WHERE id=%s", (url_id,)) - flash("Fuente URL eliminada correctamente.", "success") - except psycopg2.Error as db_err: - flash(f"Error al eliminar la fuente URL: {db_err}", "error") - return redirect(url_for("manage_urls")) - -def fetch_and_store_all(): - logging.info("--- INICIANDO CICLO DE CAPTURA GLOBAL (RSS y URL) ---") - todas_las_noticias = [] - feeds_fallidos = [] - feeds_exitosos = [] - feeds_para_actualizar_headers = [] - logging.info("=> Parte 1: Procesando Feeds RSS...") - feeds_to_process = [] - try: - with get_conn() as conn: - with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cursor: - cursor.execute("SELECT id, nombre, url, categoria_id, pais_id, last_etag, last_modified FROM feeds WHERE activo = TRUE") - feeds_to_process = cursor.fetchall() - logging.info(f"Encontrados {len(feeds_to_process)} feeds RSS activos para procesar.") - except psycopg2.Error as db_err: - logging.error(f"Error de BD al obtener feeds RSS: {db_err}") - return - - if feeds_to_process: - with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor: - future_to_feed = {executor.submit(process_single_feed, dict(feed)): feed for feed in feeds_to_process} - for future in tqdm(as_completed(future_to_feed), total=len(feeds_to_process), desc="Procesando Fuentes RSS"): - original_feed_data = future_to_feed[future] - feed_id = original_feed_data['id'] - try: - _, noticias_encontradas, new_etag, new_modified, success = future.result(timeout=SINGLE_FEED_TIMEOUT) - if success: - feeds_exitosos.append(feed_id) - if noticias_encontradas: - todas_las_noticias.extend(noticias_encontradas) - if (new_etag and new_etag != original_feed_data.get('last_etag')) or \ - (new_modified and new_modified != original_feed_data.get('last_modified')): - feeds_para_actualizar_headers.append({'id': feed_id, 'etag': new_etag, 'modified': new_modified}) - else: - feeds_fallidos.append(feed_id) - except Exception as exc: - logging.error(f"Excepción en feed {original_feed_data['url']} (ID: {feed_id}): {exc}") - feeds_fallidos.append(feed_id) - - noticias_desde_rss_count = len(todas_las_noticias) - logging.info(f"=> Parte 1 Finalizada. Noticias desde RSS: {noticias_desde_rss_count}. Éxitos: {len(feeds_exitosos)}. 
Fallos: {len(feeds_fallidos)}.") - - logging.info("=> Parte 2: Procesando Fuentes URL...") - urls_to_process = [] - try: - with get_conn() as conn: - with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cursor: - cursor.execute("SELECT * FROM fuentes_url") - urls_to_process = cursor.fetchall() - logging.info(f"Encontradas {len(urls_to_process)} fuentes URL para scrapear.") - except Exception as e: - logging.error(f"Error de BD al obtener fuentes URL: {e}") - - if urls_to_process: - with ThreadPoolExecutor(max_workers=10) as executor: - future_to_url = { - executor.submit( - process_newspaper_url, - source['nombre'], source['url'], source['categoria_id'], - source['pais_id'], source['idioma'] - ): source for source in urls_to_process - } - for future in tqdm(as_completed(future_to_url), total=len(urls_to_process), desc="Procesando Fuentes URL"): - source = future_to_url[future] - try: - noticias_encontradas, _ = future.result() - if noticias_encontradas: - todas_las_noticias.extend(noticias_encontradas) - except Exception as exc: - logging.error(f"Fallo al procesar la fuente URL {source['nombre']}: {exc}") - - noticias_desde_urls_count = len(todas_las_noticias) - noticias_desde_rss_count - logging.info(f"=> Parte 2 Finalizada. Noticias encontradas desde URLs: {noticias_desde_urls_count}.") - - logging.info("=> Parte 3: Actualizando la base de datos...") - if not any([todas_las_noticias, feeds_fallidos, feeds_exitosos, feeds_para_actualizar_headers]): - logging.info("No se encontraron nuevas noticias ni cambios en los feeds. Nada que actualizar.") - logging.info("--- CICLO DE CAPTURA GLOBAL FINALIZADO ---") - return - - try: - with get_conn() as conn: - with conn.cursor() as cursor: - if feeds_fallidos: - cursor.execute("UPDATE feeds SET fallos = fallos + 1 WHERE id IN %s", (tuple(feeds_fallidos),)) - cursor.execute("UPDATE feeds SET activo = FALSE WHERE fallos >= %s AND id IN %s", (MAX_FALLOS, tuple(feeds_fallidos))) - logging.info(f"Incrementado contador de fallos para {len(feeds_fallidos)} feeds.") - - if feeds_exitosos: - cursor.execute("UPDATE feeds SET fallos = 0 WHERE id IN %s", (tuple(feeds_exitosos),)) - logging.info(f"Reseteado contador de fallos para {len(feeds_exitosos)} feeds.") - - if feeds_para_actualizar_headers: - psycopg2.extras.execute_values( - cursor, - "UPDATE feeds SET last_etag = data.etag, last_modified = data.modified FROM (VALUES %s) AS data(id, etag, modified) WHERE feeds.id = data.id", - [(f['id'], f['etag'], f['modified']) for f in feeds_para_actualizar_headers] - ) - logging.info(f"Actualizados headers para {len(feeds_para_actualizar_headers)} feeds.") - - if todas_las_noticias: - logging.info(f"Intentando insertar/ignorar {len(todas_las_noticias)} noticias en total.") - insert_query = """ - INSERT INTO noticias (id, titulo, resumen, url, fecha, imagen_url, fuente_nombre, categoria_id, pais_id) - VALUES %s - ON CONFLICT (url) DO NOTHING; - """ - psycopg2.extras.execute_values(cursor, insert_query, todas_las_noticias, page_size=200) - logging.info(f"Inserción de noticias finalizada. {cursor.rowcount} filas podrían haber sido afectadas.") - - logging.info("=> Parte 3 Finalizada. 
Base de datos actualizada correctamente.") - except Exception as e: - logging.error(f"Error de BD en la actualización masiva final: {e}", exc_info=True) - - logging.info("--- CICLO DE CAPTURA GLOBAL FINALIZADO ---") @app.route("/backup_feeds") def backup_feeds(): - try: - with get_conn() as conn: - with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cursor: - cursor.execute(""" - SELECT f.id, f.nombre, f.descripcion, f.url, f.categoria_id, c.nombre AS categoria, - f.pais_id, p.nombre AS pais, f.idioma, f.activo, f.fallos - FROM feeds f - LEFT JOIN categorias c ON f.categoria_id = c.id - LEFT JOIN paises p ON f.pais_id = p.id - ORDER BY f.id - """) - feeds_ = cursor.fetchall() - if not feeds_: - flash("No hay feeds para exportar.", "warning") - return redirect(url_for("dashboard")) - - fieldnames = list(feeds_[0].keys()) - output = StringIO() - writer = csv.DictWriter(output, fieldnames=fieldnames) - writer.writeheader() - writer.writerows([dict(feed) for feed in feeds_]) - return Response(output.getvalue(), mimetype="text/csv", headers={"Content-Disposition": "attachment;filename=feeds_backup.csv"}) - except Exception as e: - app.logger.error(f"[ERROR] Al hacer backup de feeds: {e}", exc_info=True) - flash(f"Error interno al generar el backup: {e}", "error") - return redirect(url_for("dashboard")) - -@app.route("/backup_urls") -def backup_urls(): - try: - with get_conn() as conn: - with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cursor: - cursor.execute(""" - SELECT f.id, f.nombre, f.url, f.categoria_id, c.nombre AS categoria, f.pais_id, p.nombre AS pais, f.idioma - FROM fuentes_url f - LEFT JOIN categorias c ON f.categoria_id = c.id - LEFT JOIN paises p ON f.pais_id = p.id - ORDER BY f.id - """) - fuentes = cursor.fetchall() - if not fuentes: - flash("No hay fuentes URL para exportar.", "warning") - return redirect(url_for("dashboard")) - - fieldnames = list(fuentes[0].keys()) - output = StringIO() - writer = csv.DictWriter(output, fieldnames=fieldnames) - writer.writeheader() - writer.writerows([dict(fuente) for fuente in fuentes]) - return Response( - output.getvalue(), - mimetype="text/csv", - headers={"Content-Disposition": "attachment;filename=fuentes_url_backup.csv"} + with get_conn() as conn, conn.cursor(cursor_factory=extras.DictCursor) as cur: + cur.execute( + """ + SELECT + f.id, + f.nombre, + f.descripcion, + f.url, + f.categoria_id, + c.nombre AS categoria, + f.pais_id, + p.nombre AS pais, + f.idioma, + f.activo, + f.fallos + FROM feeds f + LEFT JOIN categorias c ON c.id = f.categoria_id + LEFT JOIN paises p ON p.id = f.pais_id + ORDER BY f.id; + """ ) - except Exception as e: - app.logger.error(f"[ERROR] Al hacer backup de fuentes URL: {e}", exc_info=True) - flash(f"Error interno al generar el backup de fuentes URL: {e}", "error") - return redirect(url_for("dashboard")) + rows = cur.fetchall() -@app.route("/backup_noticias") -def backup_noticias(): - try: - with get_conn() as conn: - with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cursor: - cursor.execute(""" - SELECT n.id, n.titulo, n.resumen, n.url, n.fecha, n.imagen_url, n.fuente_nombre, - c.nombre AS categoria, p.nombre AS pais, co.nombre AS continente - FROM noticias n - LEFT JOIN categorias c ON n.categoria_id = c.id - LEFT JOIN paises p ON n.pais_id = p.id - LEFT JOIN continentes co ON p.continente_id = co.id - ORDER BY n.fecha DESC - """) - noticias = cursor.fetchall() - if not noticias: - flash("No hay noticias para exportar.", "warning") - return 
redirect(url_for("dashboard")) + output = io.StringIO() + writer = csv.writer(output) + writer.writerow(["id", "nombre", "descripcion", "url", "categoria_id", "categoria", + "pais_id", "pais", "idioma", "activo", "fallos"]) + for r in rows: + writer.writerow([ + r["id"], + r["nombre"], + r["descripcion"] or "", + r["url"], + r["categoria_id"] or "", + r["categoria"] or "", + r["pais_id"] or "", + r["pais"] or "", + r["idioma"] or "", + r["activo"], + r["fallos"], + ]) - fieldnames_noticias = list(noticias[0].keys()) - output = StringIO() - writer = csv.DictWriter(output, fieldnames=fieldnames_noticias) - writer.writeheader() - writer.writerows([dict(noticia) for noticia in noticias]) - return Response(output.getvalue(), mimetype="text/csv", headers={"Content-Disposition": "attachment;filename=noticias_backup.csv"}) - except Exception as e: - app.logger.error(f"[ERROR] Al hacer backup de noticias: {e}", exc_info=True) - flash(f"Error interno al generar el backup: {e}", "error") - return redirect(url_for("dashboard")) + output.seek(0) + return send_file( + io.BytesIO(output.getvalue().encode("utf-8")), + mimetype="text/csv", + as_attachment=True, + download_name="feeds_backup.csv", + ) -@app.route("/backup_completo") -def backup_completo(): - try: - memory_buffer = BytesIO() - with zipfile.ZipFile(memory_buffer, 'w', zipfile.ZIP_DEFLATED) as zipf: - with get_conn() as conn: - with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cursor: - cursor.execute("SELECT * FROM feeds ORDER BY id") - feeds_data = cursor.fetchall() - if feeds_data: - output_feeds = StringIO() - writer_feeds = csv.DictWriter(output_feeds, fieldnames=list(feeds_data[0].keys())) - writer_feeds.writeheader() - writer_feeds.writerows([dict(f) for f in feeds_data]) - zipf.writestr("feeds.csv", output_feeds.getvalue()) - - cursor.execute("SELECT * FROM fuentes_url ORDER BY id") - fuentes_data = cursor.fetchall() - if fuentes_data: - output_fuentes = StringIO() - writer_fuentes = csv.DictWriter(output_fuentes, fieldnames=list(fuentes_data[0].keys())) - writer_fuentes.writeheader() - writer_fuentes.writerows([dict(f) for f in fuentes_data]) - zipf.writestr("fuentes_url.csv", output_fuentes.getvalue()) - - cursor.execute("SELECT * FROM noticias ORDER BY fecha DESC") - noticias_data = cursor.fetchall() - if noticias_data: - output_noticias = StringIO() - writer_noticias = csv.DictWriter(output_noticias, fieldnames=list(noticias_data[0].keys())) - writer_noticias.writeheader() - writer_noticias.writerows([dict(n) for n in noticias_data]) - zipf.writestr("noticias.csv", output_noticias.getvalue()) - - memory_buffer.seek(0) - return Response(memory_buffer, mimetype="application/zip", headers={"Content-Disposition": "attachment;filename=rss_backup_completo.zip"}) - except Exception as e: - app.logger.error(f"[ERROR] Al hacer backup completo: {e}", exc_info=True) - flash(f"Error interno al generar el backup: {e}", "error") - return redirect(url_for("dashboard")) @app.route("/restore_feeds", methods=["GET", "POST"]) def restore_feeds(): - if request.method == "POST": - file = request.files.get("file") - if not file or not file.filename.endswith(".csv"): - flash("Archivo no válido. 
Sube un .csv.", "error") - return redirect(url_for("restore_feeds")) + if request.method == "GET": + return render_template("restore_feeds.html") + file = request.files.get("file") + if not file: + flash("No se ha subido ningún archivo.", "error") + return redirect(url_for("restore_feeds")) + + try: + content = file.stream.read().decode("utf-8", errors="ignore") + reader = csv.DictReader(io.StringIO(content)) + except Exception as e: + flash(f"Error leyendo el CSV: {e}", "error") + return redirect(url_for("restore_feeds")) + + def parse_int_field(row, key): + """ + Intenta convertir row[key] a int. + - Si está vacío -> None + - Si no es convertible (p.ej. 'categoria_id') -> None y log de aviso + """ + val = row.get(key) + if val is None or str(val).strip() == "": + return None try: - file_stream = StringIO(file.read().decode("utf-8", errors='ignore')) - reader = csv.DictReader(file_stream) - rows = list(reader) - n_ok, n_err = 0, 0 - with get_conn() as conn: - for row in rows: - with conn.cursor() as cursor: - try: - cursor.execute("SAVEPOINT restore_feed_row") - activo = str(row.get("activo", "")).strip().lower() in ["1", "true", "t", "yes", "on"] - cat_id = int(row["categoria_id"]) if row.get("categoria_id") and row["categoria_id"].strip() else None - pais_id = int(row["pais_id"]) if row.get("pais_id") and row["pais_id"].strip() else None - cursor.execute( - """ - INSERT INTO feeds (id, nombre, descripcion, url, categoria_id, pais_id, idioma, activo, fallos) - VALUES (%(id)s, %(nombre)s, %(descripcion)s, %(url)s, %(categoria_id)s, %(pais_id)s, %(idioma)s, %(activo)s, %(fallos)s) - ON CONFLICT (id) DO UPDATE SET - nombre=EXCLUDED.nombre, descripcion=EXCLUDED.descripcion, url=EXCLUDED.url, categoria_id=EXCLUDED.categoria_id, - pais_id=EXCLUDED.pais_id, idioma=EXCLUDED.idioma, activo=EXCLUDED.activo, fallos=EXCLUDED.fallos; - """, - { - "id": int(row["id"]), "nombre": row.get("nombre"), "descripcion": row.get("descripcion") or "", "url": row.get("url"), - "categoria_id": cat_id, "pais_id": pais_id, "idioma": row.get("idioma") or None, "activo": activo, - "fallos": int(row.get("fallos", 0) or 0) - } - ) - n_ok += 1 - cursor.execute("RELEASE SAVEPOINT restore_feed_row") - except Exception as e: - cursor.execute("ROLLBACK TO SAVEPOINT restore_feed_row") - n_err += 1 - app.logger.error(f"Error procesando fila (se omite): {row} - Error: {e}") - flash(f"Restauración completada. Feeds procesados: {n_ok}. Errores: {n_err}.", "success" if n_err == 0 else "warning") - except Exception as e: - app.logger.error(f"Error al restaurar feeds desde CSV: {e}", exc_info=True) - flash(f"Ocurrió un error general al procesar el archivo: {e}", "error") - return redirect(url_for("dashboard")) + return int(val) + except (ValueError, TypeError): + app.logger.warning( + f"[restore_feeds] Valor no numérico '{val}' en columna {key}, se usará NULL." 
+ ) + return None + + conn = get_conn() + try: + with conn.cursor() as cur: + for row in reader: + try: + categoria_id = parse_int_field(row, "categoria_id") + pais_id = parse_int_field(row, "pais_id") + + cur.execute( + """ + INSERT INTO feeds (nombre, descripcion, url, categoria_id, pais_id, idioma, activo, fallos) + VALUES (%s,%s,%s,%s,%s,%s,%s,%s) + ON CONFLICT (url) DO UPDATE + SET nombre = EXCLUDED.nombre, + descripcion = EXCLUDED.descripcion, + categoria_id = EXCLUDED.categoria_id, + pais_id = EXCLUDED.pais_id, + idioma = EXCLUDED.idioma, + activo = EXCLUDED.activo, + fallos = EXCLUDED.fallos; + """, + ( + row["nombre"], + row.get("descripcion") or None, + row["url"], + categoria_id, + pais_id, + (row.get("idioma") or "").strip().lower()[:2] or None, + row.get("activo") in ("1", "True", "true", "t", "on"), + int(row.get("fallos") or 0), + ), + ) + conn.commit() + except psycopg2.Error as e: + print("Error restaurando feed:", e) + conn.rollback() + finally: + conn.close() + + flash("Restauración de feeds completada (con posibles errores en algunos registros).", "success") + return redirect(url_for("dashboard")) + + +@app.route("/urls") +def manage_urls(): + with get_conn() as conn, conn.cursor(cursor_factory=extras.DictCursor) as cur: + cur.execute( + """ + SELECT + fu.id, + fu.nombre, + fu.url, + c.nombre AS categoria, + p.nombre AS pais, + fu.idioma + FROM fuentes_url fu + LEFT JOIN categorias c ON c.id = fu.categoria_id + LEFT JOIN paises p ON p.id = fu.pais_id + ORDER BY fu.nombre; + """ + ) + fuentes = cur.fetchall() + + return render_template("urls_list.html", fuentes=fuentes) + + +@app.route("/urls/add_source", methods=["GET", "POST"]) +def add_url_source(): + with get_conn() as conn: + conn.autocommit = True + categorias = get_categorias(conn) + paises = get_paises(conn) + + if request.method == "POST": + nombre = request.form.get("nombre") + url = request.form.get("url") + categoria_id = request.form.get("categoria_id") or None + pais_id = request.form.get("pais_id") or None + idioma = (request.form.get("idioma", "es") or "es").strip().lower()[:2] + + try: + with conn.cursor() as cur: + cur.execute( + """ + INSERT INTO fuentes_url (nombre, url, categoria_id, pais_id, idioma) + VALUES (%s, %s, %s, %s, %s) + ON CONFLICT (url_norm) DO UPDATE + SET nombre = EXCLUDED.nombre, + categoria_id = EXCLUDED.categoria_id, + pais_id = EXCLUDED.pais_id, + idioma = EXCLUDED.idioma; + """, + ( + nombre, + url, + int(categoria_id) if categoria_id else None, + int(pais_id) if pais_id else None, + idioma, + ), + ) + flash(f"Fuente URL '{nombre}' añadida/actualizada correctamente.", "success") + return redirect(url_for("manage_urls")) + except psycopg2.Error as e: + flash(f"Error al guardar fuente URL: {e}", "error") + + return render_template("add_url_source.html", categorias=categorias, paises=paises) + + +@app.route("/urls//edit", methods=["GET", "POST"]) +def edit_url_source(url_id): + with get_conn() as conn: + conn.autocommit = True + with conn.cursor(cursor_factory=extras.DictCursor) as cur: + cur.execute("SELECT * FROM fuentes_url WHERE id = %s;", (url_id,)) + fuente = cur.fetchone() + if not fuente: + flash("Fuente URL no encontrada.", "error") + return redirect(url_for("manage_urls")) + + categorias = get_categorias(conn) + paises = get_paises(conn) + + if request.method == "POST": + nombre = request.form.get("nombre") + url = request.form.get("url") + categoria_id = request.form.get("categoria_id") or None + pais_id = request.form.get("pais_id") or None + idioma = 
(request.form.get("idioma") or "").strip().lower()[:2] or "es" + + try: + cur.execute( + """ + UPDATE fuentes_url + SET nombre = %s, + url = %s, + categoria_id = %s, + pais_id = %s, + idioma = %s + WHERE id = %s; + """, + ( + nombre, + url, + int(categoria_id) if categoria_id else None, + int(pais_id) if pais_id else None, + idioma, + url_id, + ), + ) + flash("Fuente URL actualizada.", "success") + return redirect(url_for("manage_urls")) + except psycopg2.Error as e: + flash(f"Error al actualizar fuente: {e}", "error") + + return render_template("edit_url_source.html", fuente=fuente, categorias=categorias, paises=paises) + + +@app.route("/urls//delete") +def delete_url_source(url_id): + with get_conn() as conn, conn.cursor() as cur: + try: + cur.execute("DELETE FROM fuentes_url WHERE id = %s;", (url_id,)) + flash("Fuente URL eliminada.", "success") + except psycopg2.Error as e: + flash(f"No se pudo eliminar la fuente URL: {e}", "error") + return redirect(url_for("manage_urls")) + + +@app.route("/backup_urls") +def backup_urls(): + with get_conn() as conn, conn.cursor(cursor_factory=extras.DictCursor) as cur: + cur.execute( + """ + SELECT id, nombre, url, categoria_id, pais_id, idioma + FROM fuentes_url + ORDER BY id; + """ + ) + rows = cur.fetchall() + + output = io.StringIO() + writer = csv.writer(output) + writer.writerow(["id", "nombre", "url", "categoria_id", "pais_id", "idioma"]) + for r in rows: + writer.writerow([ + r["id"], + r["nombre"], + r["url"], + r["categoria_id"] or "", + r["pais_id"] or "", + r["idioma"] or "", + ]) + + output.seek(0) + return send_file( + io.BytesIO(output.getvalue().encode("utf-8")), + mimetype="text/csv", + as_attachment=True, + download_name="fuentes_url_backup.csv", + ) - return render_template("restore_feeds.html") @app.route("/restore_urls", methods=["GET", "POST"]) def restore_urls(): - if request.method == "POST": - file = request.files.get("file") - if not file or not file.filename.endswith(".csv"): - flash("Archivo no válido. Sube un .csv.", "error") - return redirect(url_for("restore_urls")) + if request.method == "GET": + return render_template("restore_urls.html") - try: - file_stream = StringIO(file.read().decode("utf-8", errors='ignore')) - reader = csv.DictReader(file_stream) - rows = list(reader) - n_ok, n_err = 0, 0 - with get_conn() as conn: - for row in rows: - with conn.cursor() as cursor: - try: - cursor.execute("SAVEPOINT restore_url_row") - cat_id = int(row["categoria_id"]) if row.get("categoria_id") and row["categoria_id"].strip() else None - pais_id = int(row["pais_id"]) if row.get("pais_id") and row["pais_id"].strip() else None - cursor.execute( - """ - INSERT INTO fuentes_url (id, nombre, url, categoria_id, pais_id, idioma) - VALUES (%(id)s, %(nombre)s, %(url)s, %(categoria_id)s, %(pais_id)s, %(idioma)s) - ON CONFLICT (id) DO UPDATE SET - nombre=EXCLUDED.nombre, url=EXCLUDED.url, categoria_id=EXCLUDED.categoria_id, - pais_id=EXCLUDED.pais_id, idioma=EXCLUDED.idioma; - """, - { - "id": int(row["id"]), - "nombre": row.get("nombre"), - "url": row.get("url"), - "categoria_id": cat_id, - "pais_id": pais_id, - "idioma": row.get("idioma") or None - } - ) - n_ok += 1 - cursor.execute("RELEASE SAVEPOINT restore_url_row") - except Exception as e: - cursor.execute("ROLLBACK TO SAVEPOINT restore_url_row") - n_err += 1 - app.logger.error(f"Error procesando fila de fuente URL (se omite): {row} - Error: {e}") - flash(f"Restauración de Fuentes URL completada. Procesadas: {n_ok}. 
Errores: {n_err}.", "success" if n_err == 0 else "warning") - except Exception as e: - app.logger.error(f"Error al restaurar fuentes URL desde CSV: {e}", exc_info=True) - flash(f"Ocurrió un error general al procesar el archivo: {e}", "error") + file = request.files.get("file") + if not file: + flash("No se ha subido ningún archivo.", "error") + return redirect(url_for("restore_urls")) + + try: + content = file.stream.read().decode("utf-8", errors="ignore") + reader = csv.DictReader(io.StringIO(content)) + except Exception as e: + flash(f"Error leyendo el CSV: {e}", "error") + return redirect(url_for("restore_urls")) + + conn = get_conn() + try: + with conn.cursor() as cur: + for row in reader: + try: + cur.execute( + """ + INSERT INTO fuentes_url (id, nombre, url, categoria_id, pais_id, idioma) + VALUES (%s,%s,%s,%s,%s,%s) + ON CONFLICT (id) DO UPDATE + SET nombre = EXCLUDED.nombre, + url = EXCLUDED.url, + categoria_id = EXCLUDED.categoria_id, + pais_id = EXCLUDED.pais_id, + idioma = EXCLUDED.idioma; + """, + ( + int(row["id"]), + row["nombre"], + row["url"], + int(row["categoria_id"]) if row.get("categoria_id") else None, + int(row["pais_id"]) if row.get("pais_id") else None, + (row.get("idioma") or "es").strip().lower()[:2], + ), + ) + conn.commit() + except psycopg2.Error as e: + print("Error restaurando fuente_url:", e) + conn.rollback() + finally: + conn.close() + + flash("Importación de fuentes URL completada (con posibles errores en algunas filas).", "success") + return redirect(url_for("dashboard")) + + +@app.route("/urls/add", methods=["GET", "POST"]) +def add_url(): + with get_conn() as conn: + conn.autocommit = True + categorias = get_categorias(conn) + paises = get_paises(conn) + + if request.method == "POST": + url = request.form.get("url") + categoria_id = request.form.get("categoria_id") or None + pais_id = request.form.get("pais_id") or None + + if not url: + flash("Debes indicar una URL.", "error") + return redirect(url_for("add_url")) + + try: + from newspaper import Article + except ImportError: + flash("La librería newspaper3k no está instalada en este entorno.", "error") + return redirect(url_for("add_url")) + + try: + art = Article(url) + art.download() + art.parse() + titulo = art.title or url + resumen = (art.summary or "")[:2000] if hasattr(art, "summary") else None + imagen_url = art.top_image or None + + with conn.cursor() as cur: + cur.execute( + """ + INSERT INTO noticias (id, titulo, resumen, url, fecha, imagen_url, + fuente_nombre, categoria_id, pais_id) + VALUES (md5(%s), %s, %s, %s, NOW(), %s, %s, %s, %s) + ON CONFLICT (url) DO NOTHING; + """, + ( + url, + titulo, + resumen, + url, + imagen_url, + None, + int(categoria_id) if categoria_id else None, + int(pais_id) if pais_id else None, + ), + ) + flash("Noticia añadida desde URL.", "success") + return redirect(url_for("home")) + except Exception as e: + flash(f"Error al scrapear la URL: {e}", "error") + return redirect(url_for("add_url")) + + return render_template("add_url.html", categorias=categorias, paises=paises) + + +@app.route("/urls/scrape", methods=["GET", "POST"]) +def scrape_url(): + with get_conn() as conn, conn.cursor(cursor_factory=extras.DictCursor) as cur: + cur.execute("SELECT id, nombre FROM fuentes_url ORDER BY nombre;") + fuentes = cur.fetchall() + + if request.method == "POST": + source_id = request.form.get("source_id") + if not source_id: + flash("Debes seleccionar una fuente.", "error") + return redirect(url_for("scrape_url")) + + flash("Scrapeo desde fuente aún no implementado 
(stub).", "warning") return redirect(url_for("dashboard")) - return render_template("restore_urls.html") + return render_template("scrape_url.html", fuentes=fuentes) + + +@app.route("/backup_completo") +def backup_completo(): + import zipfile + + mem_file = io.BytesIO() + with zipfile.ZipFile(mem_file, mode="w", compression=zipfile.ZIP_DEFLATED) as zf: + with get_conn() as conn, conn.cursor(cursor_factory=extras.DictCursor) as cur: + cur.execute("SELECT * FROM feeds ORDER BY id;") + rows = cur.fetchall() + buf = io.StringIO() + writer = csv.writer(buf) + if rows: + writer.writerow(rows[0].keys()) + for r in rows: + writer.writerow(list(r.values())) + zf.writestr("feeds.csv", buf.getvalue()) + + with get_conn() as conn, conn.cursor(cursor_factory=extras.DictCursor) as cur: + cur.execute("SELECT * FROM fuentes_url ORDER BY id;") + rows = cur.fetchall() + buf2 = io.StringIO() + writer2 = csv.writer(buf2) + if rows: + writer2.writerow(rows[0].keys()) + for r in rows: + writer2.writerow(list(r.values())) + zf.writestr("fuentes_url.csv", buf2.getvalue()) + + mem_file.seek(0) + return send_file( + mem_file, + mimetype="application/zip", + as_attachment=True, + download_name="backup_completo_rss.zip", + ) + if __name__ == "__main__": - if not db_pool: - app.logger.error("La aplicación no puede arrancar sin una conexión a la base de datos.") - sys.exit(1) app.run(host="0.0.0.0", port=8001, debug=True) diff --git a/categorias.csv b/categorias.csv new file mode 100644 index 0000000..a31ef80 --- /dev/null +++ b/categorias.csv @@ -0,0 +1,16 @@ +id,nombre +1,Ciencia +2,Cultura +3,Deportes +4,Economía +5,Educación +6,Entretenimiento +7,Internacional +8,Medio Ambiente +9,Moda +10,Opinión +11,Política +12,Salud +13,Sociedad +14,Tecnología +15,Viajes diff --git a/categorias.sql b/categorias.sql deleted file mode 100755 index 1873ea7..0000000 --- a/categorias.sql +++ /dev/null @@ -1,18 +0,0 @@ -INSERT INTO categorias (nombre) VALUES -('Ciencia'), -('Cultura'), -('Deportes'), -('Economía'), -('Educación'), -('Entretenimiento'), -('Internacional'), -('Medio Ambiente'), -('Moda'), -('Opinión'), -('Política'), -('Salud'), -('Sociedad'), -('Tecnología'), -('Viajes') -ON CONFLICT DO NOTHING; - diff --git a/continentes.csv b/continentes.csv new file mode 100644 index 0000000..444d792 --- /dev/null +++ b/continentes.csv @@ -0,0 +1,7 @@ +id,nombre +1,África +2,América +3,Asia +4,Europa +5,Oceanía +6,Antártida diff --git a/continentes.sql b/continentes.sql deleted file mode 100755 index 0efec4f..0000000 --- a/continentes.sql +++ /dev/null @@ -1,9 +0,0 @@ -INSERT INTO continentes (id, nombre) VALUES -(1, 'África'), -(2, 'América'), -(3, 'Asia'), -(4, 'Europa'), -(5, 'Oceanía'), -(6, 'Antártida') -ON CONFLICT (id) DO NOTHING; - diff --git a/docker-compose.yml b/docker-compose.yml index 82150fd..1fb5e5e 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,20 +1,27 @@ services: db: - image: postgres:15 + image: postgres:18 container_name: rss_db environment: - - POSTGRES_DB=${DB_NAME} - - POSTGRES_USER=${DB_USER} - - POSTGRES_PASSWORD=${DB_PASS} + POSTGRES_DB: ${DB_NAME} + POSTGRES_USER: ${DB_USER} + POSTGRES_PASSWORD: ${DB_PASS} + POSTGRES_INITDB_ARGS: "--encoding=UTF8 --locale=C.UTF-8" + LANG: C.UTF-8 + LC_ALL: C.UTF-8 + TZ: Europe/Madrid + PGDATA: /var/lib/postgresql/data/18/main + command: ["postgres", "-c", "max_connections=400"] volumes: - - postgres_data:/var/lib/postgresql/data - - ./init-db:/docker-entrypoint-initdb.d + - /datos/rss/postgres/18:/var/lib/postgresql/data + - 
./init-db:/docker-entrypoint-initdb.d:ro restart: always healthcheck: - test: ["CMD-SHELL", "pg_isready -U ${DB_USER} -d ${DB_NAME}"] + test: ["CMD-SHELL", "pg_isready -h 127.0.0.1 -p 5432 -U $$POSTGRES_USER -d $$POSTGRES_DB || exit 1"] interval: 5s timeout: 5s - retries: 5 + retries: 30 + start_period: 20s web: build: @@ -22,7 +29,7 @@ services: args: TORCH_CUDA: cu121 container_name: rss_web - command: gunicorn --bind 0.0.0.0:8000 --workers 3 app:app + command: bash -lc "gunicorn --bind 0.0.0.0:8000 --workers 3 --timeout 120 app:app" ports: - "8001:8000" environment: @@ -46,7 +53,7 @@ services: args: TORCH_CUDA: cu121 container_name: rss_scheduler - command: python scheduler.py + command: bash -lc "python scheduler.py" environment: - DB_HOST=db - DB_PORT=5432 @@ -54,6 +61,7 @@ services: - DB_USER=${DB_USER} - DB_PASS=${DB_PASS} - SECRET_KEY=${SECRET_KEY} + - RSS_MAX_WORKERS=8 depends_on: db: condition: service_healthy @@ -65,7 +73,7 @@ services: args: TORCH_CUDA: cu121 container_name: rss_translator - command: python translation_worker.py + command: bash -lc "python translation_worker.py" environment: - DB_HOST=db - DB_PORT=5432 @@ -101,7 +109,7 @@ services: - NVIDIA_VISIBLE_DEVICES=all - NVIDIA_DRIVER_CAPABILITIES=compute,utility volumes: - - hf_cache:/root/.cache/huggingface + - /datos/rss/hf_cache:/root/.cache/huggingface depends_on: db: condition: service_healthy @@ -114,7 +122,7 @@ services: args: TORCH_CUDA: cu121 container_name: rss_ner - command: python ner_worker.py + command: bash -lc "python ner_worker.py" environment: - DB_HOST=db - DB_PORT=5432 @@ -128,7 +136,61 @@ services: condition: service_healthy restart: always -volumes: - postgres_data: - hf_cache: + embeddings: + build: + context: . + args: + TORCH_CUDA: cu121 + container_name: rss_embeddings + command: bash -lc "python embeddings_worker.py" + environment: + - DB_HOST=db + - DB_PORT=5432 + - DB_NAME=${DB_NAME} + - DB_USER=${DB_USER} + - DB_PASS=${DB_PASS} + + - EMB_MODEL=sentence-transformers/all-MiniLM-L6-v2 + - EMB_BATCH=64 + - EMB_SLEEP=5 + + - PYTHONUNBUFFERED=1 + - HF_HOME=/root/.cache/huggingface + - TOKENIZERS_PARALLELISM=false + volumes: + - /datos/rss/hf_cache:/root/.cache/huggingface + depends_on: + db: + condition: service_healthy + restart: always + # gpus: all + + related: + build: + context: . 
+ args: + TORCH_CUDA: cu121 + container_name: rss_related + command: bash -lc "python related_worker.py" + environment: + - DB_HOST=db + - DB_PORT=5432 + - DB_NAME=${DB_NAME} + - DB_USER=${DB_USER} + - DB_PASS=${DB_PASS} + + - RELATED_TOPK=10 + - RELATED_BATCH_IDS=200 + - RELATED_BATCH_SIM=2000 + - RELATED_SLEEP=10 + - RELATED_MIN_SCORE=0.0 + - RELATED_WINDOW_H=0 + depends_on: + db: + condition: service_healthy + restart: always + +networks: + default: + name: rss_default diff --git a/embeddings_worker.py b/embeddings_worker.py new file mode 100644 index 0000000..d4f337e --- /dev/null +++ b/embeddings_worker.py @@ -0,0 +1,161 @@ +# embeddings_worker.py +# Worker de embeddings para TRADUCCIONES: +# - Lee traducciones con status='done' y sin embedding para un modelo concreto +# - Calcula embedding (Sentence-Transformers) sobre título_trad + resumen_trad +# - Guarda en traduccion_embeddings (traduccion_id, model, dim, embedding) + +import os +import time +import logging +from typing import List, Tuple +import numpy as np +import psycopg2 +import psycopg2.extras +from sentence_transformers import SentenceTransformer + +logging.basicConfig(level=logging.INFO, format='[%(asctime)s] %(levelname)s: %(message)s') +log = logging.getLogger(__name__) + +# ---------- Configuración DB ---------- +DB = dict( + host=os.environ.get("DB_HOST", "localhost"), + port=int(os.environ.get("DB_PORT", 5432)), + dbname=os.environ.get("DB_NAME", "rss"), + user=os.environ.get("DB_USER", "rss"), + password=os.environ.get("DB_PASS", "x"), +) + +# ---------- Parámetros de worker ---------- +# Modelo por defecto: MiniLM pequeño y rápido +EMB_MODEL = os.environ.get("EMB_MODEL", "sentence-transformers/all-MiniLM-L6-v2") +EMB_BATCH = int(os.environ.get("EMB_BATCH", "128")) +SLEEP_IDLE = float(os.environ.get("EMB_SLEEP_IDLE", "5.0")) +# Filtrado por idiomas destino (coma-separado). Por defecto sólo 'es' +EMB_LANGS = [s.strip() for s in os.environ.get("EMB_LANGS", "es").split(",") if s.strip()] +DEVICE = os.environ.get("DEVICE", "auto").lower() # 'auto' | 'cpu' | 'cuda' + +# Límite por iteración (para no tragar toda la tabla de golpe) +EMB_LIMIT = int(os.environ.get("EMB_LIMIT", "1000")) + +# ---------- Utilidades ---------- +def get_conn(): + return psycopg2.connect(**DB) + +def ensure_schema(conn): + """Crea la tabla de embeddings para traducciones si no existe.""" + with conn.cursor() as cur: + cur.execute(""" + CREATE TABLE IF NOT EXISTS traduccion_embeddings ( + id SERIAL PRIMARY KEY, + traduccion_id INT NOT NULL REFERENCES traducciones(id) ON DELETE CASCADE, + model TEXT NOT NULL, + dim INT NOT NULL, + embedding DOUBLE PRECISION[] NOT NULL, + created_at TIMESTAMP DEFAULT NOW(), + UNIQUE (traduccion_id, model) + ); + """) + cur.execute("CREATE INDEX IF NOT EXISTS idx_tr_emb_model ON traduccion_embeddings(model);") + cur.execute("CREATE INDEX IF NOT EXISTS idx_tr_emb_trid ON traduccion_embeddings(traduccion_id);") + conn.commit() + +def fetch_batch_pending(conn) -> List[psycopg2.extras.DictRow]: + """ + Devuelve un lote de traducciones 'done' del/los idioma(s) objetivo + que no tienen embedding aún para el EMB_MODEL indicado. 
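+    Cada fila devuelta incluye las columnas: traduccion_id, lang_to, titulo_trad,
+    resumen_trad y noticia_id (ver el SELECT de abajo).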
+ """ + with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: + # Usamos ANY(%s) para filtrar por múltiples idiomas destino + cur.execute(f""" + SELECT t.id AS traduccion_id, + t.lang_to AS lang_to, + COALESCE(NULLIF(t.titulo_trad, ''), '') AS titulo_trad, + COALESCE(NULLIF(t.resumen_trad,''), '') AS resumen_trad, + n.id AS noticia_id + FROM traducciones t + JOIN noticias n ON n.id = t.noticia_id + LEFT JOIN traduccion_embeddings e + ON e.traduccion_id = t.id AND e.model = %s + WHERE t.status = 'done' + AND t.lang_to = ANY(%s) + AND e.traduccion_id IS NULL + ORDER BY t.id + LIMIT %s + """, (EMB_MODEL, EMB_LANGS, EMB_LIMIT)) + rows = cur.fetchall() + return rows + +def texts_from_rows(rows: List[psycopg2.extras.DictRow]) -> List[str]: + """ + Compone el texto a vectorizar por cada traducción: + 'titulo_trad' + '\n' + 'resumen_trad'. Si alguno falta, usa lo disponible. + """ + texts = [] + for r in rows: + title = (r["titulo_trad"] or "").strip() + body = (r["resumen_trad"] or "").strip() + if title and body: + texts.append(f"{title}\n{body}") + else: + texts.append(title or body or "") + return texts + +def upsert_embeddings(conn, rows, embs: np.ndarray, model_name: str): + """ + Inserta/actualiza embeddings por traducción. + """ + if embs.size == 0 or not rows: + return + dim = int(embs.shape[1]) + with conn.cursor() as cur: + for r, e in zip(rows, embs): + cur.execute(""" + INSERT INTO traduccion_embeddings (traduccion_id, model, dim, embedding) + VALUES (%s, %s, %s, %s) + ON CONFLICT (traduccion_id, model) DO UPDATE + SET embedding = EXCLUDED.embedding, + dim = EXCLUDED.dim, + created_at = NOW() + """, (int(r["traduccion_id"]), model_name, dim, list(map(float, e)))) + conn.commit() + +# ---------- Main loop ---------- +def main(): + log.info("Arrancando embeddings_worker para TRADUCCIONES") + log.info("Modelo: %s | Batch: %s | Idiomas: %s | Device: %s", + EMB_MODEL, EMB_BATCH, ",".join(EMB_LANGS), DEVICE) + + # Carga modelo + # DEVICE='auto' -> deja que S-B decida (usa CUDA si está disponible) + model = SentenceTransformer(EMB_MODEL, device=None if DEVICE == "auto" else DEVICE) + + while True: + try: + with get_conn() as conn: + ensure_schema(conn) + + rows = fetch_batch_pending(conn) + if not rows: + time.sleep(SLEEP_IDLE) + continue + + texts = texts_from_rows(rows) + # Normalizamos embeddings (unit-length) para facilitar similitudes posteriores + embs = model.encode( + texts, + batch_size=EMB_BATCH, + convert_to_numpy=True, + show_progress_bar=False, + normalize_embeddings=True + ) + + upsert_embeddings(conn, rows, embs, EMB_MODEL) + log.info("Embeddings upserted: %d", len(rows)) + + except Exception as e: + log.exception("Error en embeddings_worker: %s", e) + time.sleep(SLEEP_IDLE) + +if __name__ == "__main__": + main() + diff --git a/init-db/08-embeddings.sql b/init-db/08-embeddings.sql new file mode 100644 index 0000000..2e80632 --- /dev/null +++ b/init-db/08-embeddings.sql @@ -0,0 +1,81 @@ +-- init-db/08-embeddings.sql +-- ============================================================ +-- Esquema para embeddings y relaciones semánticas entre noticias +-- Compatible con embeddings_worker.py (usa traduccion_embeddings) +-- y mantiene una vista "embeddings" para compatibilidad previa. 
+-- ============================================================ + +-- Tabla principal de embeddings por traducción (con modelo) +CREATE TABLE IF NOT EXISTS traduccion_embeddings ( + id SERIAL PRIMARY KEY, + traduccion_id INT NOT NULL REFERENCES traducciones(id) ON DELETE CASCADE, + model TEXT NOT NULL, + dim INT NOT NULL, + embedding DOUBLE PRECISION[] NOT NULL, + created_at TIMESTAMP DEFAULT NOW(), + UNIQUE (traduccion_id, model) +); + +-- Índices recomendados +CREATE INDEX IF NOT EXISTS idx_tr_emb_traduccion_id ON traduccion_embeddings(traduccion_id); +CREATE INDEX IF NOT EXISTS idx_tr_emb_model ON traduccion_embeddings(model); + +-- ----------------------------------------------------------------- +-- Vista de compatibilidad "embeddings" +-- (emula tu antigua tabla con columnas: traduccion_id, dim, vec) +-- Ajusta el valor del WHERE model = '...' si usas otro modelo. +-- ----------------------------------------------------------------- +DO $$ +BEGIN + -- Si ya existe una tabla llamada embeddings, la renombramos a embeddings_legacy para evitar conflicto + IF EXISTS ( + SELECT 1 FROM information_schema.tables + WHERE table_schema = 'public' AND table_name = 'embeddings' + ) THEN + EXECUTE 'ALTER TABLE embeddings RENAME TO embeddings_legacy'; + END IF; +EXCEPTION WHEN others THEN + -- No bloqueamos la migración por esto + NULL; +END$$; + +-- Crea/actualiza la vista +CREATE OR REPLACE VIEW embeddings AS +SELECT + te.traduccion_id, + te.dim, + te.embedding AS vec +FROM traduccion_embeddings te +WHERE te.model = 'sentence-transformers/all-MiniLM-L6-v2'; + +-- Nota: +-- Si quieres que la vista siempre coja el embedding más reciente de CUALQUIER modelo: +-- REEMPLAZA el WHERE anterior por: +-- WHERE te.id IN ( +-- SELECT DISTINCT ON (traduccion_id) id +-- FROM traduccion_embeddings +-- ORDER BY traduccion_id, created_at DESC +-- ); + +-- ----------------------------------------------------------------- +-- Relaciones semánticas entre traducciones (opcional) +-- Esta tabla no la usa el worker directamente, pero permite cachear +-- "noticias relacionadas" precalculadas por otro proceso/batch. +-- ----------------------------------------------------------------- +CREATE TABLE IF NOT EXISTS related_noticias ( + traduccion_id INT NOT NULL REFERENCES traducciones(id) ON DELETE CASCADE, + related_traduccion_id INT NOT NULL REFERENCES traducciones(id) ON DELETE CASCADE, + score DOUBLE PRECISION NOT NULL, + created_at TIMESTAMP DEFAULT NOW(), + PRIMARY KEY (traduccion_id, related_traduccion_id), + CHECK (traduccion_id <> related_traduccion_id) +); + +-- Índices para acelerar consultas en ambos sentidos +CREATE INDEX IF NOT EXISTS idx_related_by_tr ON related_noticias (traduccion_id); +CREATE INDEX IF NOT EXISTS idx_related_by_relatedtr ON related_noticias (related_traduccion_id); + +-- Sugerencias: +-- - Si pretendes recalcular periódicamente, podrías limpiar por ventana temporal: +-- DELETE FROM related_noticias WHERE created_at < NOW() - INTERVAL '7 days'; + diff --git a/migrations/001_utils_normalize_url.sql b/migrations/001_utils_normalize_url.sql new file mode 100644 index 0000000..7ba2b1a --- /dev/null +++ b/migrations/001_utils_normalize_url.sql @@ -0,0 +1,62 @@ +-- Función para “normalizar” URLs: minúsculas en esquema/host, quitar www, +-- quitar fragmentos (#...), limpiar trackers comunes (utm_*, gclid, fbclid, ref, etc.), +-- colapsar dobles “/”, y quitar la “/” final salvo si es la raíz. 
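+-- Ejemplos orientativos del resultado esperado (ilustrativos, no son tests ejecutados):
+--   normalize_url('https://www.Ejemplo.com/a//b/?utm_source=nl&fbclid=123#top')
+--     -> 'https://ejemplo.com/a/b'
+--   normalize_url('http://example.com:80/noticias/')
+--     -> 'http://example.com/noticias'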
+ +CREATE OR REPLACE FUNCTION normalize_url(in_url text) +RETURNS text +LANGUAGE plpgsql +AS $$ +DECLARE + u text := trim(in_url); + scheme_host text; + path_q text; +BEGIN + IF u IS NULL OR u = '' THEN + RETURN NULL; + END IF; + + -- quitar espacios y fragmentos + u := regexp_replace(u, '#.*$', '', 'i'); + + -- separar esquema+host de path+query + -- ej: https://example.com:443/foo?bar -> scheme_host=https://example.com:443 ; path_q=/foo?bar + scheme_host := substring(u FROM '^[a-z]+://[^/]*'); + IF scheme_host IS NULL THEN + -- si no hay esquema, asumimos http + u := 'http://' || u; + scheme_host := substring(u FROM '^[a-z]+://[^/]*'); + END IF; + path_q := substring(u FROM '^[a-z]+://[^/]*(/.*)$'); + IF path_q IS NULL THEN + path_q := '/'; + END IF; + + -- normalizar esquema y host (minúsculas, quitar www.) + scheme_host := lower(scheme_host); + scheme_host := regexp_replace(scheme_host, '^(https?://)www\.', '\1', 'i'); + + -- quitar puerto por defecto (:80 en http, :443 en https) + scheme_host := regexp_replace(scheme_host, '^http://([^/:]+):80$', 'http://\1', 'i'); + scheme_host := regexp_replace(scheme_host, '^https://([^/:]+):443$', 'https://\1', 'i'); + + -- limpiar parámetros de tracking en la query + -- elimina ?utm_... &utm_... gclid fbclid mc_cid mc_eid ref ref_src etc. + path_q := regexp_replace(path_q, '([?&])(utm_[^=&]+|gclid|fbclid|mc_cid|mc_eid|ref|ref_src|yclid|igshid)=[^&#]*', '\1', 'gi'); + -- limpiar conectores sobrantes ?, &, &&, ?&, etc. + path_q := regexp_replace(path_q, '\?&+', '?', 'g'); + path_q := regexp_replace(path_q, '&{2,}', '&', 'g'); + path_q := regexp_replace(path_q, '\?$', '', 'g'); + path_q := regexp_replace(path_q, '\?$','', 'g'); + + -- colapsar dobles barras en path (no tocar “://”) + path_q := regexp_replace(path_q, '/{2,}', '/', 'g'); + + -- quitar “/” final si no es la raíz + IF path_q <> '/' THEN + path_q := regexp_replace(path_q, '/+$', '', 'g'); + END IF; + + RETURN scheme_host || path_q; +END; +$$; + diff --git a/migrations/002_unique_index_url_norm.sql b/migrations/002_unique_index_url_norm.sql new file mode 100644 index 0000000..0736c09 --- /dev/null +++ b/migrations/002_unique_index_url_norm.sql @@ -0,0 +1,38 @@ +-- Añadir columna generada url_norm y crear índice único sobre ella. +-- OJO: si ya existen duplicados, este índice fallará. +-- Primero crea la columna si no existe: + +DO $$ +BEGIN + IF NOT EXISTS ( + SELECT 1 FROM information_schema.columns + WHERE table_name='feeds' AND column_name='url_norm' + ) THEN + ALTER TABLE feeds + ADD COLUMN url_norm text GENERATED ALWAYS AS (normalize_url(url)) STORED; + END IF; +END $$; + +-- Índice único (concurrently para no bloquear). Requiere estar fuera de transacción. +-- Si tu herramienta corre todo en una transacción, ejecuta estas dos líneas aparte. +-- Quita duplicados antes si da error. 
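+-- Consulta de ejemplo (orientativa) para localizar duplicados antes de crear el índice único;
+-- se deja comentada para no interferir con la migración, ejecútala a mano si hace falta:
+--   SELECT url_norm, COUNT(*) AS repetidos, array_agg(id ORDER BY id) AS ids
+--   FROM feeds
+--   WHERE url_norm IS NOT NULL
+--   GROUP BY url_norm
+--   HAVING COUNT(*) > 1;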
+CREATE UNIQUE INDEX CONCURRENTLY IF NOT EXISTS feeds_url_norm_uniq ON feeds (url_norm) +WHERE url_norm IS NOT NULL; + +-- (Opcional) repetir lo mismo para fuentes_url y noticias si quieres esa garantía también: + +DO $$ +BEGIN + IF NOT EXISTS ( + SELECT 1 FROM information_schema.columns + WHERE table_name='fuentes_url' AND column_name='url_norm' + ) THEN + ALTER TABLE fuentes_url + ADD COLUMN url_norm text GENERATED ALWAYS AS (normalize_url(url)) STORED; + END IF; +END $$; + +CREATE UNIQUE INDEX CONCURRENTLY IF NOT EXISTS fuentes_url_norm_uniq ON fuentes_url (url_norm) +WHERE url_norm IS NOT NULL; + + diff --git a/ner_worker.py b/ner_worker.py index 5785201..f393e59 100644 --- a/ner_worker.py +++ b/ner_worker.py @@ -5,6 +5,7 @@ import re import psycopg2 import psycopg2.extras import spacy +from bs4 import BeautifulSoup logging.basicConfig(level=logging.INFO, format='[%(asctime)s] %(levelname)s: %(message)s') @@ -16,13 +17,9 @@ DB = dict( password=os.environ.get("DB_PASS", "x"), ) -# Idioma de las traducciones que vamos a etiquetar NER_LANG = os.environ.get("NER_LANG", "es").strip().lower() - -# Tamaño de lote de traducciones a procesar por iteración BATCH = int(os.environ.get("NER_BATCH", 64)) -# Mapeo de etiquetas de spaCy -> tipos de nuestro esquema ENT_LABELS = { "PERSON": "persona", "ORG": "organizacion", @@ -30,28 +27,65 @@ ENT_LABELS = { "LOC": "lugar", } -# Normaliza el valor del tag (quita espacios extra, colapsa espacios internos) _ws_re = re.compile(r"\s+") -def _clean_value(s: str) -> str: - if not s: - return "" - s = s.strip() - s = _ws_re.sub(" ", s) - return s +HTML_TRASH_PATTERNS = [ + r"<[^>]+>", + r"&[a-z]+;", + r'width="\d+"', + r'height="\d+"', +] +GENERIC_BAD_TAGS = { + "república", + "estado", + "centro", + "gobierno", + "report", + "sp", + "unión", +} + + +def clean_tag_text(text): + if not text: + return None + text = BeautifulSoup(text, "html.parser").get_text() + for pat in HTML_TRASH_PATTERNS: + text = re.sub(pat, "", text) + text = _ws_re.sub(" ", text).strip() + if len(text) < 3: + return None + if re.search(r"[<>/\\]", text): + return None + lower = text.lower() + if lower.startswith("href="): + return None + if lower.startswith("http"): + return None + if lower in GENERIC_BAD_TAGS: + return None + replacements = { + "ee.uu.": "Estados Unidos", + "los estados unidos": "Estados Unidos", + "eu": "Unión Europea", + "ue": "Unión Europea", + "kosova": "Kosovo", + } + if lower in replacements: + text = replacements[lower] + return text + def get_conn(): return psycopg2.connect(**DB) + def main(): - # Nota: asumimos español porque el contenedor instala es_core_news_md en el Dockerfile. - # Si quisieras soportar más idiomas, instala el modelo correspondiente y haz un mapping. 
nlp = spacy.load("es_core_news_md", disable=["parser", "lemmatizer", "textcat"]) logging.info("spaCy cargado: es_core_news_md") while True: try: with get_conn() as conn, conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: - # Tomamos traducciones 'done' hacia NER_LANG que aún no tengan ninguna relación en tags_noticia cur.execute( """ WITH pend AS ( @@ -78,7 +112,7 @@ def main(): logging.info(f"Procesando {len(rows)} traducciones para NER...") new_links = 0 - new_tags = 0 + for r in rows: text = f"{r['titulo_trad'] or ''}\n{r['resumen_trad'] or ''}".strip() if not text: @@ -91,17 +125,14 @@ def main(): tipo = ENT_LABELS.get(ent.label_) if not tipo: continue - val = _clean_value(ent.text) - # filtros simples - if len(val) < 2: + val = clean_tag_text(ent.text) + if not val: continue ents.append((val, tipo)) if not ents: continue - # Insertamos (o actualizamos si ya existe) el tag y luego la relación - # IMPORTANTE: requiere UNIQUE(valor, tipo) en 'tags' y UNIQUE(traduccion_id, tag_id) en 'tags_noticia' for valor, tipo in set(ents): try: cur.execute( @@ -115,7 +146,6 @@ def main(): (valor, tipo), ) tag_id = cur.fetchone()[0] - # Intenta crear la relación; si existe (por UNIQUE), se ignora cur.execute( """ INSERT INTO tags_noticia (traduccion_id, tag_id) @@ -126,11 +156,7 @@ def main(): ) if cur.rowcount > 0: new_links += 1 - # Heurística: si el tag se ha creado (no hay forma directa aquí), - # lo aproximamos contando que el RETURNING vino de un insert o un update. - # Para no complicar: cuenta enlaces nuevos, y deja 'new_tags' como métrica opcional. except Exception: - # No abortar el lote por un único fallo en un valor raro. logging.exception("Fallo insertando tag/relación") conn.commit() @@ -139,6 +165,7 @@ def main(): logging.exception(f"Error en NER loop: {e}") time.sleep(5) + if __name__ == "__main__": main() diff --git a/paises.csv b/paises.csv new file mode 100644 index 0000000..ea90890 --- /dev/null +++ b/paises.csv @@ -0,0 +1,196 @@ +id,nombre,continente_id +1,Afganistán,3 +2,Albania,4 +3,Alemania,4 +4,Andorra,4 +5,Angola,1 +6,Antigua y Barbuda,2 +7,Arabia Saudita,3 +8,Argelia,1 +9,Argentina,2 +10,Armenia,3 +11,Australia,5 +12,Austria,4 +13,Azerbaiyán,3 +14,Bahamas,2 +15,Bangladés,3 +16,Barbados,2 +17,Baréin,3 +19,Belice,2 +20,Benín,1 +21,Bielorrusia,4 +22,Birmania,3 +23,Bolivia,2 +24,Bosnia y Herzegovina,4 +25,Botsuana,1 +26,Brasil,2 +27,Brunéi,3 +28,Bulgaria,4 +29,Burkina Faso,1 +30,Burundi,1 +31,Bután,3 +18,Bélgica,4 +32,Cabo Verde,1 +33,Camboya,3 +34,Camerún,1 +35,Canadá,2 +36,Catar,3 +37,Chad,1 +38,Chile,2 +39,China,3 +40,Chipre,3 +41,Colombia,2 +42,Comoras,1 +43,Corea del Norte,3 +44,Corea del Sur,3 +46,Costa Rica,2 +45,Costa de Marfil,1 +47,Croacia,4 +48,Cuba,2 +49,Dinamarca,4 +50,Dominica,2 +51,Ecuador,2 +52,Egipto,1 +53,El Salvador,2 +54,Emiratos Árabes Unidos,3 +55,Eritrea,1 +56,Eslovaquia,4 +57,Eslovenia,4 +58,España,4 +59,Estados Unidos,2 +60,Estonia,4 +61,Esuatini,1 +62,Etiopía,1 +63,Filipinas,3 +64,Finlandia,4 +65,Fiyi,5 +66,Francia,4 +67,Gabón,1 +68,Gambia,1 +69,Georgia,3 +70,Ghana,1 +71,Granada,2 +72,Grecia,4 +73,Guatemala,2 +74,Guinea,1 +76,Guinea Ecuatorial,1 +75,Guinea-Bisáu,1 +77,Guyana,2 +78,Haití,2 +79,Honduras,2 +80,Hungría,4 +81,India,3 +82,Indonesia,3 +83,Irak,3 +85,Irlanda,4 +84,Irán,3 +86,Islandia,4 +87,Islas Marshall,5 +88,Islas Salomón,5 +89,Israel,3 +90,Italia,4 +91,Jamaica,2 +92,Japón,3 +93,Jordania,3 +94,Kazajistán,3 +95,Kenia,1 +96,Kirguistán,3 +97,Kiribati,5 +98,Kuwait,3 +99,Laos,3 +100,Lesoto,1 +101,Letonia,4 +103,Liberia,1 +104,Libia,1 
+105,Liechtenstein,4 +106,Lituania,4 +107,Luxemburgo,4 +102,Líbano,3 +108,Macedonia del Norte,4 +109,Madagascar,1 +110,Malasia,3 +111,Malaui,1 +112,Maldivas,3 +114,Malta,4 +113,Malí,1 +115,Marruecos,1 +116,Mauricio,1 +117,Mauritania,1 +119,Micronesia,5 +120,Moldavia,4 +122,Mongolia,3 +123,Montenegro,4 +124,Mozambique,1 +118,México,2 +121,Mónaco,4 +125,Namibia,1 +126,Nauru,5 +127,Nepal,3 +128,Nicaragua,2 +130,Nigeria,1 +131,Noruega,4 +132,Nueva Zelanda,5 +129,Níger,1 +133,Omán,3 +135,Pakistán,3 +136,Palaos,5 +137,Palestina,3 +138,Panamá,2 +139,Papúa Nueva Guinea,5 +140,Paraguay,2 +134,Países Bajos,4 +141,Perú,2 +142,Polonia,4 +143,Portugal,4 +144,Reino Unido,4 +145,República Centroafricana,1 +146,República Checa,4 +148,República Democrática del Congo,1 +149,República Dominicana,2 +147,República del Congo,1 +150,Ruanda,1 +151,Rumanía,4 +152,Rusia,3 +153,Samoa,5 +154,San Cristóbal y Nieves,2 +155,San Marino,4 +156,San Vicente y las Granadinas,2 +157,Santa Lucía,2 +158,Santo Tomé y Príncipe,1 +159,Senegal,1 +160,Serbia,4 +161,Seychelles,1 +162,Sierra Leona,1 +163,Singapur,3 +164,Siria,3 +165,Somalia,1 +166,Sri Lanka,3 +167,Sudáfrica,1 +168,Sudán,1 +169,Sudán del Sur,1 +170,Suecia,4 +171,Suiza,4 +172,Surinam,2 +173,Tailandia,3 +174,Tanzania,1 +175,Tayikistán,3 +176,Timor Oriental,3 +177,Togo,1 +178,Tonga,5 +179,Trinidad y Tobago,2 +181,Turkmenistán,3 +182,Turquía,3 +183,Tuvalu,5 +180,Túnez,1 +184,Ucrania,4 +185,Uganda,1 +186,Uruguay,2 +187,Uzbekistán,3 +188,Vanuatu,5 +189,Vaticano,4 +190,Venezuela,2 +191,Vietnam,3 +192,Yemen,3 +193,Yibuti,1 +194,Zambia,1 +195,Zimbabue,1 diff --git a/paises.sql b/paises.sql deleted file mode 100755 index 4d8eeda..0000000 --- a/paises.sql +++ /dev/null @@ -1,198 +0,0 @@ -INSERT INTO paises (nombre, continente_id) VALUES -('Afganistán', 3), -('Albania', 4), -('Alemania', 4), -('Andorra', 4), -('Angola', 1), -('Antigua y Barbuda', 2), -('Arabia Saudita', 3), -('Argelia', 1), -('Argentina', 2), -('Armenia', 3), -('Australia', 5), -('Austria', 4), -('Azerbaiyán', 3), -('Bahamas', 2), -('Bangladés', 3), -('Barbados', 2), -('Baréin', 3), -('Bélgica', 4), -('Belice', 2), -('Benín', 1), -('Bielorrusia', 4), -('Birmania', 3), -('Bolivia', 2), -('Bosnia y Herzegovina', 4), -('Botsuana', 1), -('Brasil', 2), -('Brunéi', 3), -('Bulgaria', 4), -('Burkina Faso', 1), -('Burundi', 1), -('Bután', 3), -('Cabo Verde', 1), -('Camboya', 3), -('Camerún', 1), -('Canadá', 2), -('Catar', 3), -('Chad', 1), -('Chile', 2), -('China', 3), -('Chipre', 3), -('Colombia', 2), -('Comoras', 1), -('Corea del Norte', 3), -('Corea del Sur', 3), -('Costa de Marfil', 1), -('Costa Rica', 2), -('Croacia', 4), -('Cuba', 2), -('Dinamarca', 4), -('Dominica', 2), -('Ecuador', 2), -('Egipto', 1), -('El Salvador', 2), -('Emiratos Árabes Unidos', 3), -('Eritrea', 1), -('Eslovaquia', 4), -('Eslovenia', 4), -('España', 4), -('Estados Unidos', 2), -('Estonia', 4), -('Esuatini', 1), -('Etiopía', 1), -('Filipinas', 3), -('Finlandia', 4), -('Fiyi', 5), -('Francia', 4), -('Gabón', 1), -('Gambia', 1), -('Georgia', 3), -('Ghana', 1), -('Granada', 2), -('Grecia', 4), -('Guatemala', 2), -('Guinea', 1), -('Guinea-Bisáu', 1), -('Guinea Ecuatorial', 1), -('Guyana', 2), -('Haití', 2), -('Honduras', 2), -('Hungría', 4), -('India', 3), -('Indonesia', 3), -('Irak', 3), -('Irán', 3), -('Irlanda', 4), -('Islandia', 4), -('Islas Marshall', 5), -('Islas Salomón', 5), -('Israel', 3), -('Italia', 4), -('Jamaica', 2), -('Japón', 3), -('Jordania', 3), -('Kazajistán', 3), -('Kenia', 1), -('Kirguistán', 3), -('Kiribati', 5), -('Kuwait', 
3), -('Laos', 3), -('Lesoto', 1), -('Letonia', 4), -('Líbano', 3), -('Liberia', 1), -('Libia', 1), -('Liechtenstein', 4), -('Lituania', 4), -('Luxemburgo', 4), -('Macedonia del Norte', 4), -('Madagascar', 1), -('Malasia', 3), -('Malaui', 1), -('Maldivas', 3), -('Malí', 1), -('Malta', 4), -('Marruecos', 1), -('Mauricio', 1), -('Mauritania', 1), -('México', 2), -('Micronesia', 5), -('Moldavia', 4), -('Mónaco', 4), -('Mongolia', 3), -('Montenegro', 4), -('Mozambique', 1), -('Namibia', 1), -('Nauru', 5), -('Nepal', 3), -('Nicaragua', 2), -('Níger', 1), -('Nigeria', 1), -('Noruega', 4), -('Nueva Zelanda', 5), -('Omán', 3), -('Países Bajos', 4), -('Pakistán', 3), -('Palaos', 5), -('Palestina', 3), -('Panamá', 2), -('Papúa Nueva Guinea', 5), -('Paraguay', 2), -('Perú', 2), -('Polonia', 4), -('Portugal', 4), -('Reino Unido', 4), -('República Centroafricana', 1), -('República Checa', 4), -('República del Congo', 1), -('República Democrática del Congo', 1), -('República Dominicana', 2), -('Ruanda', 1), -('Rumanía', 4), -('Rusia', 3), -('Samoa', 5), -('San Cristóbal y Nieves', 2), -('San Marino', 4), -('San Vicente y las Granadinas', 2), -('Santa Lucía', 2), -('Santo Tomé y Príncipe', 1), -('Senegal', 1), -('Serbia', 4), -('Seychelles', 1), -('Sierra Leona', 1), -('Singapur', 3), -('Siria', 3), -('Somalia', 1), -('Sri Lanka', 3), -('Sudáfrica', 1), -('Sudán', 1), -('Sudán del Sur', 1), -('Suecia', 4), -('Suiza', 4), -('Surinam', 2), -('Tailandia', 3), -('Tanzania', 1), -('Tayikistán', 3), -('Timor Oriental', 3), -('Togo', 1), -('Tonga', 5), -('Trinidad y Tobago', 2), -('Túnez', 1), -('Turkmenistán', 3), -('Turquía', 3), -('Tuvalu', 5), -('Ucrania', 4), -('Uganda', 1), -('Uruguay', 2), -('Uzbekistán', 3), -('Vanuatu', 5), -('Vaticano', 4), -('Venezuela', 2), -('Vietnam', 3), -('Yemen', 3), -('Yibuti', 1), -('Zambia', 1), -('Zimbabue', 1) -ON CONFLICT DO NOTHING; - diff --git a/related_worker.py b/related_worker.py new file mode 100644 index 0000000..e745c6a --- /dev/null +++ b/related_worker.py @@ -0,0 +1,206 @@ +# related_worker.py +import os +import time +import math +import logging +from typing import List, Tuple + +import psycopg2 +import psycopg2.extras + +logging.basicConfig( + level=logging.INFO, + format='[related] %(asctime)s %(levelname)s: %(message)s' +) + +DB = dict( + host=os.environ.get("DB_HOST", "localhost"), + port=int(os.environ.get("DB_PORT", 5432)), + dbname=os.environ.get("DB_NAME", "rss"), + user=os.environ.get("DB_USER", "rss"), + password=os.environ.get("DB_PASS", "x"), +) + +# Config +TOPK = int(os.environ.get("RELATED_TOPK", 10)) # vecinos por traducción +BATCH_IDS = int(os.environ.get("RELATED_BATCH_IDS", 200)) # cuántas traducciones objetivo por pasada +BATCH_SIM = int(os.environ.get("RELATED_BATCH_SIM", 2000)) # tamaño de bloque al comparar contra el resto +SLEEP_IDLE = float(os.environ.get("RELATED_SLEEP", 10)) # pausa cuando no hay trabajo +MIN_SCORE = float(os.environ.get("RELATED_MIN_SCORE", 0.0)) # descarta relaciones por debajo de este coseno +WINDOW_HOURS = int(os.environ.get("RELATED_WINDOW_H", 0)) # 0 = sin filtro temporal; >0 = últimas X horas + +def get_conn(): + return psycopg2.connect(**DB) + +def _fetch_all_embeddings(cur): + """ + Devuelve: + ids: List[int] con traduccion_id + vecs: List[List[float]] con el embedding (puede venir como list de DOUBLE PRECISION[]) + norms: List[float] con la norma L2 de cada vector (precalculada para acelerar el coseno) + Si WINDOW_HOURS > 0, limitamos a noticias recientes. 
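+    Nota: consulta la relación "embeddings" que, en este esquema, es la vista creada en
+    init-db/08-embeddings.sql sobre traduccion_embeddings (columna vec = DOUBLE PRECISION[]).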
+ """ + if WINDOW_HOURS > 0: + cur.execute(""" + SELECT e.traduccion_id, e.vec + FROM embeddings e + JOIN traducciones t ON t.id = e.traduccion_id + JOIN noticias n ON n.id = t.noticia_id + WHERE n.fecha >= NOW() - INTERVAL %s + """, (f"{WINDOW_HOURS} hours",)) + else: + cur.execute("SELECT traduccion_id, vec FROM embeddings") + + rows = cur.fetchall() + if not rows: + return [], [], [] + + ids = [] + vecs = [] + norms = [] + for tr_id, v in rows: + # v llega como lista de floats (DOUBLE PRECISION[]); protegemos None + if v is None: + v = [] + # calcular norma + nrm = math.sqrt(sum(((x or 0.0) * (x or 0.0)) for x in v)) or 1e-8 + ids.append(tr_id) + vecs.append(v) + norms.append(nrm) + return ids, vecs, norms + +def _fetch_pending_ids(cur, limit) -> List[int]: + """ + Traducciones con embedding pero sin relaciones generadas aún. + Si quieres regenerar periódicamente, puedes cambiar la condición + para tener en cuenta antigüedad o un flag de 'stale'. + """ + cur.execute(""" + SELECT e.traduccion_id + FROM embeddings e + LEFT JOIN related_noticias r ON r.traduccion_id = e.traduccion_id + GROUP BY e.traduccion_id + HAVING COUNT(r.related_traduccion_id) = 0 + ORDER BY e.traduccion_id DESC + LIMIT %s; + """, (limit,)) + return [r[0] for r in cur.fetchall()] + +def _cosine_with_norms(a, b, na, nb): + # producto punto + num = 0.0 + # zip se corta por el más corto; si longitudes difieren, usamos la intersección + for x, y in zip(a, b): + xv = x or 0.0 + yv = y or 0.0 + num += xv * yv + denom = na * nb + if denom <= 0.0: + return 0.0 + return num / denom + +def _topk_for_one(idx: int, + ids_all: List[int], + vecs_all: List[List[float]], + norms_all: List[float], + pool_indices: List[int], + K: int) -> List[Tuple[int, float]]: + """ + Devuelve los K mejores (related_id, score) para ids_all[idx] restringido al conjunto pool_indices. + """ + me_vec = vecs_all[idx] + me_norm = norms_all[idx] + + out: List[Tuple[int, float]] = [] + for j in pool_indices: + if j == idx: + continue + s = _cosine_with_norms(me_vec, vecs_all[j], me_norm, norms_all[j]) + out.append((ids_all[j], s)) + + # top-K ordenado por score desc + out.sort(key=lambda t: t[1], reverse=True) + if MIN_SCORE > 0.0: + out = [p for p in out if p[1] >= MIN_SCORE] + return out[:K] + +def _insert_related(cur, tr_id: int, pairs: List[Tuple[int, float]]): + if not pairs: + return + psycopg2.extras.execute_values( + cur, + """ + INSERT INTO related_noticias (traduccion_id, related_traduccion_id, score) + VALUES %s + ON CONFLICT (traduccion_id, related_traduccion_id) + DO UPDATE SET score = EXCLUDED.score + """, + [(tr_id, rid, float(score)) for (rid, score) in pairs] + ) + +def build_for_ids(conn, target_ids: List[int]) -> int: + """ + Para las traducciones de target_ids: + - carga TODOS los embeddings (opcionalmente filtrados por ventana temporal), + - para cada target calcula sus TOPK vecinos por coseno, por bloques, + - upsert en related_noticias. 
+ """ + with conn.cursor() as cur: + ids_all, vecs_all, norms_all = _fetch_all_embeddings(cur) + if not ids_all: + return 0 + + # mapa traduccion_id -> índice en arrays + pos = {tid: i for i, tid in enumerate(ids_all)} + n = len(ids_all) + processed = 0 + + with conn.cursor() as cur: + for tr_id in target_ids: + if tr_id not in pos: + continue + i = pos[tr_id] + + # barrido por bloques para no disparar memoria + top: List[Tuple[int, float]] = [] + for start in range(0, n, BATCH_SIM): + block = list(range(start, min(start + BATCH_SIM, n))) + candidates = _topk_for_one(i, ids_all, vecs_all, norms_all, block, TOPK) + + # merge de top-K global + top += candidates + top.sort(key=lambda t: t[1], reverse=True) + if len(top) > TOPK: + top = top[:TOPK] + + _insert_related(cur, tr_id, top) + processed += 1 + + conn.commit() + return processed + +def main(): + logging.info( + "Iniciando related_worker (TOPK=%s, BATCH_IDS=%s, BATCH_SIM=%s, MIN_SCORE=%.3f, WINDOW_H=%s)", + TOPK, BATCH_IDS, BATCH_SIM, MIN_SCORE, WINDOW_HOURS + ) + while True: + try: + with get_conn() as conn, conn.cursor() as cur: + todo = _fetch_pending_ids(cur, BATCH_IDS) + + if not todo: + time.sleep(SLEEP_IDLE) + continue + + with get_conn() as conn: + done = build_for_ids(conn, todo) + logging.info("Relacionadas generadas/actualizadas para %d traducciones.", done) + + except Exception: + logging.exception("Error en related_worker") + time.sleep(SLEEP_IDLE) + +if __name__ == "__main__": + main() + diff --git a/requirements.txt b/requirements.txt index c5529f7..87f9cf7 100755 --- a/requirements.txt +++ b/requirements.txt @@ -16,6 +16,8 @@ sentencepiece==0.2.0 sacremoses==0.1.1 accelerate==0.33.0 spacy>=3.7,<4.0 -# Nota: PyTorch (torch) NO se fija aquí. -# Se instala en el Dockerfile con la wheel adecuada de CUDA (cu121) para tu GPU. - +pgvector==0.2.5 +sentence-transformers==3.0.1 +numpy>=1.26 +scikit-learn>=1.4 +python-dotenv>=1.0 diff --git a/templates/noticia.html b/templates/noticia.html new file mode 100644 index 0000000..4d6b605 --- /dev/null +++ b/templates/noticia.html @@ -0,0 +1,111 @@ +{% extends "base.html" %} +{% block title %} + {% set d = dato if dato is defined else (r if r is defined else None) %} + {% if d %} + {{ d.titulo_trad or d.titulo_orig or d.titulo_traducido or d.titulo_original or 'Detalle de Noticia' }} + {% else %} + Detalle de Noticia + {% endif %} +{% endblock %} + +{% block content %} +{% set d = dato if dato is defined else (r if r is defined else None) %} + +{% if not d %} +
+
+

No se encontró la noticia solicitada.

+
+
+{% else %} +
+
+

+ {{ d.titulo_trad or d.titulo_orig or d.titulo_traducido or d.titulo_original }} + {% if d.lang_to %}{{ d.lang_to|upper }}{% endif %} +

+ {% if d.fuente_url or d.url %} + + {% endif %} +
+ +
+
+ {% set fecha_ = d.fecha %} + {% if fecha_ %} + + {% if fecha_ is string %}{{ fecha_ }}{% else %}{{ fecha_.strftime('%d-%m-%Y %H:%M') }}{% endif %} + {% endif %} + {% if d.fuente_nombre %} | {{ d.fuente_nombre }}{% endif %} + {% if d.categoria %} | {{ d.categoria }}{% endif %} + {% if d.pais %} | {{ d.pais }}{% endif %} +
+ + {% if d.resumen_trad or d.cuerpo_traducido %} +

Resumen (traducido)

+
{{ (d.resumen_trad or d.cuerpo_traducido)|safe_html }}
+
+ {% endif %} + + {% if d.resumen_orig or d.cuerpo_original or d.resumen or d.titulo_original %} +

Resumen (original)

+
{{ (d.resumen_orig or d.cuerpo_original or d.resumen)|safe_html }}
+ {% endif %} + + {% if tags is defined and tags and tags|length %} +
+ {% for t in tags %} + {# t puede ser DictRow (t['valor']) o tupla (t.0) #} + {% set valor = t.valor if t.valor is defined else (t[0] if t[0] is defined else '') %} + {% set tipo = t.tipo if t.tipo is defined else (t[1] if t[1] is defined else '') %} + {{ valor }} + {% endfor %} +
+ {% endif %} +
+
+ +{% set rels = relacionadas if relacionadas is defined else None %} +{% if rels and rels|length %} +
+
+

Noticias relacionadas

+
+
+
    + {% for r in rels %} +
  • + {% if r.imagen_url %} +
    + + Imagen relacionada + +
    + {% endif %} +
    +

    + {{ r.titulo }} +

    +
    + {% if r.fecha %} + + {% if r.fecha is string %}{{ r.fecha }}{% else %}{{ r.fecha.strftime('%d-%m-%Y %H:%M') }}{% endif %} + {% endif %} + {% if r.fuente_nombre %} | {{ r.fuente_nombre }}{% endif %} + {% if r.score is defined %} | score: {{ "%.3f"|format(r.score) }}{% endif %} +
    + {% if r.resumen %} +
    {{ r.resumen }}
    + {% endif %} +
    +
  • + {% endfor %} +
+
+
+{% endif %} +{% endif %} +{% endblock %} + diff --git a/templates/noticias.html b/templates/noticias.html index be2c4bd..3e483ee 100644 --- a/templates/noticias.html +++ b/templates/noticias.html @@ -88,10 +88,16 @@ document.addEventListener('DOMContentLoaded', function() { const form = document.getElementById('filter-form'); const continenteSelect = document.getElementById('continente_id'); const paisSelect = document.getElementById('pais_id'); + const categoriaSelect = document.getElementById('categoria_id'); + const fechaInput = document.getElementById('fecha'); + const qInput = document.getElementById('q'); + const pageInput = document.getElementById('page'); const origInput = document.getElementById('orig'); const langInput = document.getElementById('lang'); + function setPage1() { pageInput.value = 1; } + function filtrarPaises() { const continenteId = continenteSelect.value; for (let i = 1; i < paisSelect.options.length; i++) { @@ -105,22 +111,14 @@ document.addEventListener('DOMContentLoaded', function() { } } - async function cargarNoticias(keepPage) { - if (!keepPage) pageInput.value = 1; - - const formData = new FormData(form); - const params = new URLSearchParams(formData); - const newUrl = `${form.action}?${params.toString()}`; - + async function cargarNoticiasFromURL(url) { const container = document.getElementById('noticias-container'); container.style.opacity = '0.5'; container.innerHTML = '
'; - try { - const response = await fetch(newUrl, { headers: { 'X-Requested-With': 'XMLHttpRequest' } }); + const response = await fetch(url, { headers: { 'X-Requested-With': 'XMLHttpRequest' } }); const html = await response.text(); container.innerHTML = html; - window.history.pushState({path: newUrl}, '', newUrl); } catch (error) { console.error('Error al filtrar noticias:', error); container.innerHTML = '

Error al cargar las noticias.

'; @@ -129,11 +127,25 @@ document.addEventListener('DOMContentLoaded', function() { } } + async function cargarNoticias(keepPage) { + if (!keepPage) setPage1(); + + const formData = new FormData(form); + const params = new URLSearchParams(formData); + const newUrl = `${form.action}?${params.toString()}`; + + await cargarNoticiasFromURL(newUrl); + // Actualizar historial + window.history.pushState({ path: newUrl }, '', newUrl); + } + + // Submit manual form.addEventListener('submit', function(e) { e.preventDefault(); cargarNoticias(false); }); + // Toggle traducción/original const toggleOrig = document.getElementById('toggle-orig'); const toggleTr = document.getElementById('toggle-tr'); @@ -153,12 +165,38 @@ document.addEventListener('DOMContentLoaded', function() { }); } + // Cambios en selects/fecha -> recarga automática continenteSelect.addEventListener('change', function() { filtrarPaises(); cargarNoticias(false); }); + paisSelect.addEventListener('change', function() { + cargarNoticias(false); + }); + categoriaSelect.addEventListener('change', function() { + cargarNoticias(false); + }); + fechaInput.addEventListener('change', function() { + cargarNoticias(false); + }); + // Debounce búsqueda + let qTimer = null; + qInput.addEventListener('input', function() { + if (qTimer) clearTimeout(qTimer); + qTimer = setTimeout(() => { + cargarNoticias(false); + }, 450); + }); + + // Cargar países al inicio filtrarPaises(); + + // Soporte de navegación del historial + window.addEventListener('popstate', function(e) { + const url = (e.state && e.state.path) ? e.state.path : window.location.href; + cargarNoticiasFromURL(url); + }); }); {% endblock %}