Fix cursor closed error, improve feed parsing, and update worker counts

jlimolina 2025-06-13 14:02:32 +02:00
parent ce19d301e6
commit 824ff0539d
523 changed files with 190411 additions and 355 deletions

app.py (488 changes)

@@ -1,8 +1,8 @@
# -*- coding: utf-8 -*-
# app.py - Versión final solo para la web
import os
import sys
import hashlib
import re
import hashlib # Keep if used elsewhere (e.g., if generating IDs in other parts of the app), otherwise remove
import csv
import math
from io import StringIO, BytesIO
@@ -10,20 +10,27 @@ from datetime import datetime, timedelta
import logging
import atexit
import zipfile
from contextlib import contextmanager
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
from flask import Flask, render_template, request, redirect, url_for, Response, flash
from apscheduler.schedulers.background import BackgroundScheduler
import psycopg2
import psycopg2.extras
import feedparser
import psycopg2.pool
import bleach
# Import the processing function from the new module
from feed_processor import process_single_feed
# --- Configuración de Logging ---
logging.basicConfig(stream=sys.stdout, level=logging.INFO, format='[%(asctime)s] %(levelname)s in %(module)s: %(message)s')
# --- Inicialización de la App Flask ---
app = Flask(__name__)
app.config['SECRET_KEY'] = os.environ.get('SECRET_KEY', os.urandom(24))
# Configuración de la base de datos
# --- Configuración de la Base de Datos y Constantes ---
DB_CONFIG = {
"host": os.environ.get("DB_HOST", "localhost"),
"port": int(os.environ.get("DB_PORT", 5432)),
@@ -31,38 +38,58 @@ DB_CONFIG = {
"user": os.environ.get("DB_USER", "rss"),
"password": os.environ.get("DB_PASS", "x")
}
MAX_FALLOS = 5
# Define worker constants here or in a separate config
MAX_WORKERS = int(os.environ.get("RSS_MAX_WORKERS", 20)) # Changed default to 20 concurrent workers
SINGLE_FEED_TIMEOUT = int(os.environ.get("RSS_FEED_TIMEOUT", 30)) # Example: 30 seconds per feed process
MAX_FALLOS = int(os.environ.get("RSS_MAX_FAILURES", 5)) # Number of failures before deactivating a feed
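# The three limits above are read from the environment, so they can be tuned per
# deployment without touching the code. A hypothetical override (the values and the
# server command are illustrative, not part of this commit):
#   RSS_MAX_WORKERS=50 RSS_FEED_TIMEOUT=20 RSS_MAX_FAILURES=3 gunicorn app:app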
# --- Pool de Conexiones a la Base de Datos ---
db_pool = None
try:
# Aumentamos el pool para dar cabida a los workers del servidor web
db_pool = psycopg2.pool.SimpleConnectionPool(minconn=1, maxconn=10, **DB_CONFIG)
app.logger.info("Pool de conexiones a la base de datos creado exitosamente.")
except psycopg2.OperationalError as e:
logging.error(f"FATAL: No se pudo conectar a la base de datos para crear el pool: {e}")
# Consider sys.exit(1) here if DB connection is absolutely critical for app startup
@contextmanager
def get_conn():
return psycopg2.connect(**DB_CONFIG)
if not db_pool: raise ConnectionError("El pool de la base de datos no está disponible.")
conn = None
try:
conn = db_pool.getconn()
yield conn
conn.commit()
except Exception as e:
if conn: conn.rollback()
raise e
finally:
if conn: db_pool.putconn(conn)
# --- Hook de Cierre ---
@atexit.register
def shutdown_hooks():
if db_pool:
db_pool.closeall()
app.logger.info("Pool de conexiones de la base de datos cerrado.")
# --- Filtros y Rutas ---
@app.template_filter('safe_html')
def safe_html(text):
if not text: return ""
allowed_tags = {'a', 'b', 'strong', 'i', 'em', 'p', 'br', 'img'}
allowed_attrs = {'a': ['href', 'title'], 'img': ['src', 'alt']}
return bleach.clean(text, tags=allowed_tags, attributes=allowed_attrs, strip=True)
# En app.py, reemplaza tu función home() con esta:
# En app.py
return bleach.clean(text, tags={'a', 'b', 'strong', 'i', 'em', 'p', 'br'}, attributes={'a': ['href', 'title']}, strip=True)
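# Rough illustration of the safe_html filter (a sketch of typical bleach behaviour, not
# output captured from this code): allowed markup is kept, disallowed tags are stripped,
# and attributes outside the whitelist (onclick, style, ...) are dropped.
#   safe_html('<p onclick="x()">Hola <b>mundo</b><script>alert(1)</script></p>')
#   # keeps the <p> and <b> markup, drops onclick and the <script> element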
@app.route("/")
def home():
# ... (el código inicial para obtener los parámetros no cambia) ...
cat_id = request.args.get("categoria_id")
cont_id = request.args.get("continente_id")
pais_id = request.args.get("pais_id")
fecha_filtro = request.args.get("fecha")
q = request.args.get("q", "").strip() # <--- NUEVO: Obtenemos el término de búsqueda
# ... (el resto del código hasta la construcción de la consulta) ...
# (se omite por brevedad, es idéntico)
cat_id, cont_id, pais_id, fecha_filtro = request.args.get("categoria_id"), request.args.get("continente_id"), request.args.get("pais_id"), request.args.get("fecha")
q = request.args.get("q", "").strip()
noticias, categorias, continentes, paises = [], [], [], []
try:
with get_conn() as conn:
with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cursor:
# ... (las consultas para obtener categorías, continentes y países no cambian) ...
cursor.execute("SELECT id, nombre FROM categorias ORDER BY nombre")
categorias = cursor.fetchall()
cursor.execute("SELECT id, nombre FROM continentes ORDER BY nombre")
@@ -70,23 +97,16 @@ def home():
cursor.execute("SELECT id, nombre, continente_id FROM paises ORDER BY nombre")
paises = cursor.fetchall()
sql_params, conditions = [], []
# El SELECT ahora puede incluir el ranking de la búsqueda
sql_base = "SELECT n.fecha, n.titulo, n.resumen, n.url, n.imagen_url, c.nombre AS categoria, p.nombre AS pais, co.nombre AS continente FROM noticias n LEFT JOIN categorias c ON n.categoria_id = c.id LEFT JOIN paises p ON n.pais_id = p.id LEFT JOIN continentes co ON p.continente_id = co.id"
# === INICIO DE LA MODIFICACIÓN DE LA CONSULTA ===
if q:
# Convierte el término de búsqueda para que busque palabras completas
# Ejemplo: "guerra ucrania" -> "guerra & ucrania"
search_query = " & ".join(q.split())
conditions.append("n.tsv @@ to_tsquery('spanish', %s)")
sql_params.append(search_query)
if cat_id: conditions.append("n.categoria_id = %s"); sql_params.append(cat_id)
if pais_id: conditions.append("n.pais_id = %s"); sql_params.append(pais_id)
elif cont_id: conditions.append("p.continente_id = %s"); sql_params.append(cont_id)
if fecha_filtro:
try:
fecha_obj = datetime.strptime(fecha_filtro, '%Y-%m-%d')
@@ -94,42 +114,30 @@ def home():
sql_params.append(fecha_obj.date())
except ValueError:
flash("Formato de fecha no válido. Use AAAA-MM-DD.", "error")
fecha_filtro = None
if conditions:
sql_base += " WHERE " + " AND ".join(conditions)
if conditions: sql_base += " WHERE " + " AND ".join(conditions)
# Si hay búsqueda, ordena por relevancia. Si no, por fecha.
order_clause = " ORDER BY n.fecha DESC NULLS LAST"
if q:
search_query_ts = " & ".join(q.split())
sql_base += " ORDER BY ts_rank(n.tsv, to_tsquery('spanish', %s)) DESC, n.fecha DESC"
order_clause = " ORDER BY ts_rank(n.tsv, to_tsquery('spanish', %s)) DESC, n.fecha DESC"
sql_params.append(search_query_ts)
else:
sql_base += " ORDER BY n.fecha DESC NULLS LAST"
sql_final = sql_base + " LIMIT 50"
# === FIN DE LA MODIFICACIÓN DE LA CONSULTA ===
sql_final = sql_base + order_clause + " LIMIT 50"
cursor.execute(sql_final, tuple(sql_params))
noticias = cursor.fetchall()
except psycopg2.Error as db_err:
app.logger.error(f"[DB ERROR] Al leer noticias: {db_err}", exc_info=True)
flash("Error de base de datos al cargar las noticias.", "error")
noticias, categorias, continentes, paises = [], [], [], [] # Resetea en caso de error
if request.headers.get('X-Requested-With') == 'XMLHttpRequest':
return render_template('_noticias_list.html', noticias=noticias)
return render_template("noticias.html",
noticias=noticias,
categorias=categorias,
continentes=continentes,
paises=paises,
cat_id=int(cat_id) if cat_id else None,
cont_id=int(cont_id) if cont_id else None,
pais_id=int(pais_id) if pais_id else None,
fecha_filtro=fecha_filtro,
q=q) # <--- NUEVO: Pasamos el término de búsqueda a la plantilla
noticias=noticias, categorias=categorias, continentes=continentes, paises=paises,
cat_id=int(cat_id) if cat_id else None, cont_id=int(cont_id) if cont_id else None,
pais_id=int(pais_id) if pais_id else None, fecha_filtro=fecha_filtro, q=q)
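# Illustration of how the search branch above assembles the statement (an example, not
# code from this commit): for /?q=guerra+ucrania&categoria_id=3 the terms are split on
# whitespace and joined with '&', so the query sent to PostgreSQL is roughly
#   ... WHERE n.tsv @@ to_tsquery('spanish', 'guerra & ucrania') AND n.categoria_id = 3
#   ORDER BY ts_rank(n.tsv, to_tsquery('spanish', 'guerra & ucrania')) DESC, n.fecha DESC LIMIT 50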
@app.route("/feeds")
def dashboard():
@@ -137,24 +145,23 @@ def dashboard():
try:
with get_conn() as conn:
with conn.cursor() as cursor:
cursor.execute("SELECT COUNT(*) FROM feeds;")
cursor.execute("SELECT COUNT(*) FROM feeds")
stats['feeds_totales'] = cursor.fetchone()[0]
cursor.execute("SELECT COUNT(*) FROM noticias;")
cursor.execute("SELECT COUNT(*) FROM noticias")
stats['noticias_totales'] = cursor.fetchone()[0]
cursor.execute("SELECT COUNT(*) FROM feeds WHERE activo = FALSE;")
cursor.execute("SELECT COUNT(*) FROM feeds WHERE activo = FALSE")
stats['feeds_caidos'] = cursor.fetchone()[0]
except psycopg2.Error as db_err:
app.logger.error(f"[DB ERROR] Al calcular estadísticas del dashboard: {db_err}")
flash("Error al conectar con la base de datos para mostrar el resumen.", "error")
app.logger.error(f"[DB ERROR] Al calcular estadísticas: {db_err}")
flash("Error al conectar con la base de datos.", "error")
return render_template("dashboard.html", stats=stats)
@app.route("/feeds/manage")
def manage_feeds():
page = request.args.get('page', 1, type=int)
per_page = 10
per_page = 20
offset = (page - 1) * per_page
feeds_list = []
total_feeds = 0
feeds_list, total_feeds = [], 0
try:
with get_conn() as conn:
with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cursor:
@@ -165,7 +172,7 @@ def manage_feeds():
except psycopg2.Error as db_err:
app.logger.error(f"[DB ERROR] Al obtener lista de feeds: {db_err}")
flash("Error al obtener la lista de feeds.", "error")
total_pages = math.ceil(total_feeds / per_page)
total_pages = math.ceil(total_feeds / per_page) if total_feeds > 0 else 0
return render_template("feeds_list.html", feeds=feeds_list, page=page, total_pages=total_pages, total_feeds=total_feeds)
def _get_form_dependencies(cursor):
@@ -182,9 +189,11 @@ def add_feed():
try:
with get_conn() as conn:
with conn.cursor() as cursor:
categoria_id = int(request.form.get("categoria_id")) if request.form.get("categoria_id") else None
pais_id = int(request.form.get("pais_id")) if request.form.get("pais_id") else None
cursor.execute(
"INSERT INTO feeds (nombre, descripcion, url, categoria_id, pais_id, idioma) VALUES (%s, %s, %s, %s, %s, %s)",
(nombre, request.form.get("descripcion"), request.form.get("url"), request.form.get("categoria_id"), request.form.get("pais_id"), (request.form.get("idioma", "").strip() or None))
(nombre, request.form.get("descripcion"), request.form.get("url"), categoria_id, pais_id, (request.form.get("idioma", "").strip() or None))
)
flash(f"Feed '{nombre}' añadido correctamente.", "success")
except psycopg2.Error as db_err:
@@ -198,39 +207,43 @@ def add_feed():
with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cursor:
categorias, paises = _get_form_dependencies(cursor)
except psycopg2.Error as db_err:
app.logger.error(f"[DB ERROR] Al cargar formulario para añadir feed: {db_err}")
app.logger.error(f"[DB ERROR] Al cargar formulario: {db_err}")
flash("No se pudieron cargar las categorías o países.", "error")
return render_template("add_feed.html", categorias=categorias, paises=paises)
@app.route("/edit/<int:feed_id>", methods=["GET", "POST"])
def edit_feed(feed_id):
feed, categorias, paises = None, [], []
try:
with get_conn() as conn:
with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cursor:
if request.method == "POST":
if request.method == "POST":
try:
with get_conn() as conn:
with conn.cursor() as cursor:
categoria_id = int(request.form.get("categoria_id")) if request.form.get("categoria_id") else None
pais_id = int(request.form.get("pais_id")) if request.form.get("pais_id") else None
idioma = request.form.get("idioma", "").strip() or None
activo = "activo" in request.form
cursor.execute(
"""UPDATE feeds SET nombre=%s, descripcion=%s, url=%s, categoria_id=%s, pais_id=%s, idioma=%s, activo=%s WHERE id=%s""",
(request.form.get("nombre"), request.form.get("descripcion"), request.form.get("url"), request.form.get("categoria_id"), request.form.get("pais_id"), idioma, activo, feed_id)
"UPDATE feeds SET nombre=%s, descripcion=%s, url=%s, categoria_id=%s, pais_id=%s, idioma=%s, activo=%s WHERE id=%s",
(request.form.get("nombre"), request.form.get("descripcion"), request.form.get("url"), categoria_id, pais_id, idioma, activo, feed_id)
)
flash("Feed actualizado correctamente.", "success")
return redirect(url_for("manage_feeds"))
flash("Feed actualizado correctamente.", "success")
except psycopg2.Error as db_err:
app.logger.error(f"[DB ERROR] Al actualizar feed: {db_err}", exc_info=True)
flash(f"Error al actualizar el feed: {db_err}", "error")
return redirect(url_for("manage_feeds"))
try:
with get_conn() as conn:
with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cursor:
cursor.execute("SELECT * FROM feeds WHERE id = %s", (feed_id,))
feed = cursor.fetchone()
if not feed:
flash("No se encontró el feed solicitado.", "error")
return redirect(url_for("manage_feeds"))
categorias, paises = _get_form_dependencies(cursor)
except psycopg2.Error as db_err:
app.logger.error(f"[DB ERROR] Al editar feed: {db_err}", exc_info=True)
flash(f"Error al editar el feed: {db_err}", "error")
app.logger.error(f"[DB ERROR] Al cargar feed para editar: {db_err}", exc_info=True)
flash("Error al cargar el feed para editar.", "error")
return redirect(url_for("manage_feeds"))
if not feed:
flash("No se encontró el feed solicitado.", "error")
return redirect(url_for("manage_feeds"))
return render_template("edit_feed.html", feed=feed, categorias=categorias, paises=paises)
@app.route("/delete/<int:feed_id>")
@@ -251,10 +264,9 @@ def reactivar_feed(feed_id):
with get_conn() as conn:
with conn.cursor() as cursor:
cursor.execute("UPDATE feeds SET activo = TRUE, fallos = 0 WHERE id = %s", (feed_id,))
flash("Feed reactivado y contador de fallos reseteado.", "success")
flash("Feed reactivado.", "success")
except psycopg2.Error as db_err:
app.logger.error(f"[DB ERROR] Al reactivar feed: {db_err}", exc_info=True)
flash(f"Error al reactivar el feed: {db_err}", "error")
flash(f"Error al reactivar feed: {db_err}", "error")
return redirect(url_for("manage_feeds"))
@app.route("/backup_feeds")
@@ -262,30 +274,18 @@ def backup_feeds():
try:
with get_conn() as conn:
with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cursor:
cursor.execute("""
SELECT f.id, f.nombre, f.descripcion, f.url, f.categoria_id, c.nombre AS categoria,
f.pais_id, p.nombre AS pais, f.idioma, f.activo, f.fallos
FROM feeds f
LEFT JOIN categorias c ON f.categoria_id = c.id
LEFT JOIN paises p ON f.pais_id = p.id
ORDER BY f.id
""")
cursor.execute("SELECT f.id, f.nombre, f.descripcion, f.url, f.categoria_id, c.nombre AS categoria, f.pais_id, p.nombre AS pais, f.idioma, f.activo, f.fallos FROM feeds f LEFT JOIN categorias c ON f.categoria_id = c.id LEFT JOIN paises p ON f.pais_id = p.id ORDER BY f.id")
feeds_ = cursor.fetchall()
if not feeds_:
flash("No hay feeds para exportar.", "warning")
return redirect(url_for("dashboard"))
si = StringIO()
writer = csv.DictWriter(si, fieldnames=[desc[0] for desc in cursor.description])
writer.writeheader()
writer.writerows([dict(row) for row in feeds_])
output = si.getvalue()
si.close()
return Response(output, mimetype="text/csv", headers={"Content-Disposition": "attachment;filename=feeds_backup.csv"})
if not feeds_:
flash("No hay feeds para exportar.", "warning")
return redirect(url_for("dashboard"))
output = StringIO()
writer = csv.DictWriter(output, fieldnames=feeds_[0].keys())
writer.writeheader()
writer.writerows(feeds_)
return Response(output.getvalue(), mimetype="text/csv", headers={"Content-Disposition": "attachment;filename=feeds_backup.csv"})
except Exception as e:
app.logger.error(f"[ERROR] Al hacer backup de feeds: {e}", exc_info=True)
flash("Error al generar el backup.", "error")
return redirect(url_for("dashboard"))
@app.route("/backup_noticias")
@@ -293,93 +293,47 @@ def backup_noticias():
try:
with get_conn() as conn:
with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cursor:
cursor.execute("""
SELECT n.id, n.titulo, n.resumen, n.url, n.fecha, n.imagen_url,
c.nombre AS categoria, p.nombre AS pais, co.nombre AS continente
FROM noticias n
LEFT JOIN categorias c ON n.categoria_id = c.id
LEFT JOIN paises p ON n.pais_id = p.id
LEFT JOIN continentes co ON p.continente_id = co.id
ORDER BY n.fecha DESC
""")
cursor.execute("SELECT n.id, n.titulo, n.resumen, n.url, n.fecha, n.imagen_url, c.nombre AS categoria, p.nombre AS pais, co.nombre AS continente FROM noticias n LEFT JOIN categorias c ON n.categoria_id = c.id LEFT JOIN paises p ON n.pais_id = p.id LEFT JOIN continentes co ON p.continente_id = co.id ORDER BY n.fecha DESC")
noticias = cursor.fetchall()
if not noticias:
flash("No hay noticias para exportar.", "warning")
return redirect(url_for("dashboard"))
si = StringIO()
writer = csv.DictWriter(si, fieldnames=[desc[0] for desc in cursor.description])
writer.writeheader()
writer.writerows([dict(row) for row in noticias])
output = si.getvalue()
si.close()
return Response(
output,
mimetype="text/csv",
headers={"Content-Disposition": "attachment;filename=noticias_backup.csv"}
)
if not noticias:
flash("No hay noticias para exportar.", "warning")
return redirect(url_for("dashboard"))
output = StringIO()
writer = csv.DictWriter(output, fieldnames=noticias[0].keys())
writer.writeheader()
writer.writerows(noticias)
return Response(output.getvalue(), mimetype="text/csv", headers={"Content-Disposition": "attachment;filename=noticias_backup.csv"})
except Exception as e:
app.logger.error(f"[ERROR] Al hacer backup de noticias: {e}", exc_info=True)
flash("Error al generar el backup de noticias.", "error")
return redirect(url_for("dashboard"))
@app.route("/backup_completo")
def backup_completo():
try:
memory_buffer = BytesIO()
with zipfile.ZipFile(memory_buffer, 'w', zipfile.ZIP_DEFLATED) as zipf:
with get_conn() as conn:
with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cursor:
# Feeds
cursor.execute("""
SELECT f.id, f.nombre, f.descripcion, f.url, f.categoria_id, c.nombre AS categoria,
f.pais_id, p.nombre AS pais, f.idioma, f.activo, f.fallos
FROM feeds f
LEFT JOIN categorias c ON f.categoria_id = c.id
LEFT JOIN paises p ON f.pais_id = p.id
ORDER BY f.id
""")
cursor.execute("SELECT f.id, f.nombre, f.descripcion, f.url, f.categoria_id, c.nombre AS categoria, f.pais_id, p.nombre AS pais, f.idioma, f.activo, f.fallos FROM feeds f LEFT JOIN categorias c ON f.categoria_id = c.id LEFT JOIN paises p ON f.pais_id = p.id ORDER BY f.id")
feeds_data = cursor.fetchall()
if feeds_data:
feeds_si = StringIO()
writer = csv.DictWriter(feeds_si, fieldnames=[desc[0] for desc in cursor.description])
output = StringIO()
writer = csv.DictWriter(output, fieldnames=feeds_data[0].keys())
writer.writeheader()
writer.writerows([dict(row) for row in feeds_data])
zipf.writestr("feeds.csv", feeds_si.getvalue())
feeds_si.close()
# Noticias
cursor.execute("""
SELECT n.id, n.titulo, n.resumen, n.url, n.fecha, n.imagen_url,
c.nombre AS categoria, p.nombre AS pais, co.nombre AS continente
FROM noticias n
LEFT JOIN categorias c ON n.categoria_id = c.id
LEFT JOIN paises p ON n.pais_id = p.id
LEFT JOIN continentes co ON p.continente_id = co.id
ORDER BY n.fecha DESC
""")
writer.writerows(feeds_data)
zipf.writestr("feeds.csv", output.getvalue())
cursor.execute("SELECT n.id, n.titulo, n.resumen, n.url, n.fecha, n.imagen_url, c.nombre AS categoria, p.nombre AS pais, co.nombre AS continente FROM noticias n LEFT JOIN categorias c ON n.categoria_id = c.id LEFT JOIN paises p ON n.pais_id = p.id LEFT JOIN continentes co ON p.continente_id = co.id ORDER BY n.fecha DESC")
noticias_data = cursor.fetchall()
if noticias_data:
noticias_si = StringIO()
writer = csv.DictWriter(noticias_si, fieldnames=[desc[0] for desc in cursor.description])
output = StringIO()
writer = csv.DictWriter(output, fieldnames=noticias_data[0].keys())
writer.writeheader()
writer.writerows([dict(row) for row in noticias_data])
zipf.writestr("noticias.csv", noticias_si.getvalue())
noticias_si.close()
writer.writerows(noticias_data)
zipf.writestr("noticias.csv", output.getvalue())
memory_buffer.seek(0)
return Response(
memory_buffer,
mimetype="application/zip",
headers={"Content-Disposition": "attachment;filename=rss_backup_completo.zip"}
)
return Response(memory_buffer, mimetype="application/zip", headers={"Content-Disposition": "attachment;filename=rss_backup_completo.zip"})
except Exception as e:
app.logger.error(f"[ERROR] Al hacer backup completo: {e}", exc_info=True)
flash("Error al generar el backup completo.", "error")
return redirect(url_for("dashboard"))
@app.route("/restore_feeds", methods=["GET", "POST"])
@@ -387,124 +341,146 @@ def restore_feeds():
if request.method == "POST":
file = request.files.get("file")
if not file or not file.filename.endswith(".csv"):
flash("Archivo no válido. Por favor, sube un archivo .csv.", "error")
flash("Archivo no válido. Sube un .csv.", "error")
return redirect(url_for("restore_feeds"))
try:
file_stream = StringIO(file.read().decode("utf-8"))
file_stream = StringIO(file.read().decode("utf-8", errors='ignore'))
reader = csv.DictReader(file_stream)
rows = list(reader)
n_ok, n_err = 0, 0
with get_conn() as conn:
with conn.cursor() as cursor:
for row in rows:
for row in rows:
with conn.cursor() as cursor:
try:
activo_val = str(row.get("activo", "")).strip().lower()
activo = activo_val in ["1", "true", "t", "yes", "on"]
# --- ESTA ES LA PARTE CORREGIDA ---
cursor.execute("SAVEPOINT restore_feed_row")
activo = str(row.get("activo", "")).strip().lower() in ["1", "true", "t", "yes", "on"]
cat_id = int(row["categoria_id"]) if row.get("categoria_id") and row["categoria_id"].strip() else None
pais_id = int(row["pais_id"]) if row.get("pais_id") and row["pais_id"].strip() else None
cursor.execute(
"""
INSERT INTO feeds (id, nombre, descripcion, url, categoria_id, pais_id, idioma, activo, fallos)
VALUES (%(id)s, %(nombre)s, %(descripcion)s, %(url)s, %(categoria_id)s, %(pais_id)s, %(idioma)s, %(activo)s, %(fallos)s)
ON CONFLICT (id) DO UPDATE SET
nombre = EXCLUDED.nombre, descripcion = EXCLUDED.descripcion, url = EXCLUDED.url,
categoria_id = EXCLUDED.categoria_id, pais_id = EXCLUDED.pais_id, idioma = EXCLUDED.idioma,
activo = EXCLUDED.activo, fallos = EXCLUDED.fallos;
nombre=EXCLUDED.nombre, descripcion=EXCLUDED.descripcion, url=EXCLUDED.url, categoria_id=EXCLUDED.categoria_id,
pais_id=EXCLUDED.pais_id, idioma=EXCLUDED.idioma, activo=EXCLUDED.activo, fallos=EXCLUDED.fallos;
""",
{
"id": int(row.get("id")), "nombre": row.get("nombre"), "descripcion": row.get("descripcion") or "",
"url": row.get("url"), "categoria_id": int(row["categoria_id"]) if row.get("categoria_id") else None,
"pais_id": int(row["pais_id"]) if row.get("pais_id") else None,
"idioma": row.get("idioma") or None, # La clave aquí es 'idioma'
"activo": activo, "fallos": int(row.get("fallos", 0)),
}
{"id": int(row["id"]), "nombre": row.get("nombre"), "descripcion": row.get("descripcion") or "", "url": row.get("url"),
"categoria_id": cat_id, "pais_id": pais_id, "idioma": row.get("idioma") or None, "activo": activo,
"fallos": int(row.get("fallos", 0) or 0)}
)
n_ok += 1
cursor.execute("RELEASE SAVEPOINT restore_feed_row")
except Exception as e:
cursor.execute("ROLLBACK TO SAVEPOINT restore_feed_row")
n_err += 1
app.logger.error(f"Error procesando fila del CSV: {row} - Error: {e}")
app.logger.error(f"Error procesando fila (se omite): {row} - Error: {e}")
flash(f"Restauración completada. Feeds procesados: {n_ok}. Errores: {n_err}.", "success" if n_err == 0 else "warning")
except Exception as e:
app.logger.error(f"Error al restaurar feeds desde CSV: {e}", exc_info=True)
flash(f"Ocurrió un error general al procesar el archivo: {e}", "error")
return redirect(url_for("dashboard"))
return render_template("restore_feeds.html")
def sumar_fallo_feed(cursor, feed_id):
cursor.execute("UPDATE feeds SET fallos = fallos + 1 WHERE id = %s RETURNING fallos", (feed_id,))
fallos = cursor.fetchone()[0]
if fallos >= MAX_FALLOS:
cursor.execute("UPDATE feeds SET activo = FALSE WHERE id = %s", (feed_id,))
return fallos
def resetear_fallos_feed(cursor, feed_id):
cursor.execute("UPDATE feeds SET fallos = 0 WHERE id = %s", (feed_id,))
# --- fetch_and_store function (modified slightly) ---
def fetch_and_store():
with app.app_context():
app.logger.info("Iniciando ciclo de actualización de feeds...")
logging.info("--- INICIANDO CICLO DE CAPTURA ---")
feeds_to_process = []
try:
with get_conn() as conn:
with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cursor:
cursor.execute("SELECT id, url, categoria_id, pais_id FROM feeds WHERE activo = TRUE")
logging.info("Paso 1: Obteniendo lista de feeds...")
cursor.execute("SELECT id, url, categoria_id, pais_id, last_etag, last_modified FROM feeds WHERE activo = TRUE")
feeds_to_process = cursor.fetchall()
if not feeds_to_process:
app.logger.info("No hay feeds activos para procesar.")
return
for feed in feeds_to_process:
try:
app.logger.info(f"Procesando feed: {feed['url']}")
parsed = feedparser.parse(feed['url'])
if getattr(parsed, "bozo", False):
app.logger.warning(f"[BOZO] Feed mal formado: {feed['url']} - Excepción: {parsed.bozo_exception}")
sumar_fallo_feed(cursor, feed['id'])
continue
resetear_fallos_feed(cursor, feed['id'])
for entry in parsed.entries:
try:
link = entry.get("link")
if not link: continue
noticia_id = hashlib.md5(link.encode()).hexdigest()
titulo = entry.get("title", "")
resumen = entry.get("summary", "")
imagen_url = ""
if "media_content" in entry and entry.media_content:
imagen_url = entry.media_content[0].get("url", "")
elif "<img" in resumen:
img_search = re.search(r'src="([^"]+)"', resumen)
if img_search: imagen_url = img_search.group(1)
fecha_publicacion = None
if "published_parsed" in entry and entry.published_parsed: fecha_publicacion = datetime(*entry.published_parsed[:6])
elif "updated_parsed" in entry and entry.updated_parsed: fecha_publicacion = datetime(*entry.updated_parsed[:6])
cursor.execute(
"INSERT INTO noticias (id, titulo, resumen, url, fecha, imagen_url, categoria_id, pais_id) VALUES (%s, %s, %s, %s, %s, %s, %s, %s) ON CONFLICT (id) DO NOTHING",
(noticia_id, titulo, resumen, link, fecha_publicacion, imagen_url, feed['categoria_id'], feed['pais_id'])
)
except Exception as entry_err:
app.logger.error(f"Error en entrada de feed {feed['url']}: {entry_err}")
except Exception as e:
app.logger.error(f"[PARSE ERROR] En feed {feed['url']}: {e}")
sumar_fallo_feed(cursor, feed['id'])
app.logger.info("Ciclo de feeds completado.")
logging.info(f"Paso 2: {len(feeds_to_process)} feeds para procesar.")
except psycopg2.Error as db_err:
app.logger.error(f"[DB ERROR] Fallo en ciclo de actualización: {db_err}")
logging.error(f"Error de BD al obtener feeds: {db_err}")
return
scheduler = BackgroundScheduler(daemon=True)
run_time = datetime.now() + timedelta(seconds=20)
scheduler.add_job(fetch_and_store, "interval", minutes=15, id="rss_job", next_run_time=run_time)
scheduler.start()
atexit.register(lambda: scheduler.shutdown())
app.logger.info("Scheduler configurado. Primera ejecución en 20 segundos.")
if not feeds_to_process:
logging.info("No hay feeds activos para procesar.")
return
feeds_fallidos, feeds_exitosos, todas_las_noticias, feeds_para_actualizar_headers = [], [], [], []
logging.info(f"Paso 3: Iniciando procesamiento paralelo ({MAX_WORKERS} workers)...")
# Pass the dict form of feed data
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
future_to_feed = {executor.submit(process_single_feed, dict(feed)): feed for feed in feeds_to_process}
progress_bar = tqdm(as_completed(future_to_feed), total=len(feeds_to_process), desc="Procesando Feeds")
for future in progress_bar:
original_feed_data = future_to_feed[future] # This is the original DictRow
feed_id = original_feed_data['id']
try:
_, noticias_encontradas, new_etag, new_modified, success = future.result(timeout=SINGLE_FEED_TIMEOUT)
if success:
feeds_exitosos.append(feed_id)
if noticias_encontradas: todas_las_noticias.extend(noticias_encontradas)
# Only add if etag/modified actually changed or were initially None
if (new_etag is not None and new_etag != original_feed_data.get('last_etag')) or \
(new_modified is not None and new_modified != original_feed_data.get('last_modified')):
feeds_para_actualizar_headers.append({'id': feed_id, 'etag': new_etag, 'modified': new_modified})
else:
feeds_fallidos.append(feed_id)
except TimeoutError:
logging.error(f"!!! TIMEOUT en feed {original_feed_data['url']} (ID: {feed_id})")
feeds_fallidos.append(feed_id)
except Exception as exc:
logging.error(f"Excepción en feed {original_feed_data['url']} (ID: {feed_id}): {exc}", exc_info=True)
feeds_fallidos.append(feed_id)
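# process_single_feed is defined in the new feed_processor module (not shown in this file);
# from the unpacking above its contract is assumed to be roughly
#   process_single_feed(feed: dict) -> (feed_id, noticias, new_etag, new_modified, success)
# where feed carries id, url, categoria_id, pais_id, last_etag and last_modified, and each
# item in noticias is a tuple in the column order used by the INSERT further below.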
logging.info(f"Paso 4: Procesamiento finalizado. Noticias nuevas: {len(todas_las_noticias)}, Feeds fallidos: {len(feeds_fallidos)}, Feeds actualizados: {len(feeds_para_actualizar_headers)}.")
if not any([todas_las_noticias, feeds_fallidos, feeds_exitosos, feeds_para_actualizar_headers]):
logging.info("Sin cambios que aplicar en la base de datos.")
return
try:
with get_conn() as conn: # This connection is for the entire transaction (commits/rolls back everything together)
logging.info("Paso 5: Actualizando BD...")
# --- Update feed status (fallos, activo) ---
if feeds_fallidos or feeds_exitosos: # Only create cursor if there's work
with conn.cursor() as cursor_feeds_status:
if feeds_fallidos:
cursor_feeds_status.execute("UPDATE feeds SET fallos = fallos + 1 WHERE id IN %s", (tuple(feeds_fallidos),))
cursor_feeds_status.execute("UPDATE feeds SET activo = FALSE WHERE fallos >= %s AND id IN %s", (MAX_FALLOS, tuple(feeds_fallidos)))
if feeds_exitosos:
cursor_feeds_status.execute("UPDATE feeds SET fallos = 0 WHERE id IN %s", (tuple(feeds_exitosos),))
# cursor_feeds_status is implicitly closed here
# --- Update feed headers (etag, modified) ---
if feeds_para_actualizar_headers: # Only create cursor if there's work
with conn.cursor() as cursor_headers:
psycopg2.extras.execute_values(
cursor_headers,
"UPDATE feeds SET last_etag = data.etag, last_modified = data.modified FROM (VALUES %s) AS data(id, etag, modified) WHERE feeds.id = data.id",
[(f['id'], f['etag'], f['modified']) for f in feeds_para_actualizar_headers]
)
# cursor_headers is implicitly closed here
# --- Insert new news articles ---
if todas_las_noticias: # Only create cursor if there's work
logging.info(f"Intentando insertar {len(todas_las_noticias)} noticias en la base de datos.")
with conn.cursor() as cursor_news_insert: # A fresh cursor specifically for news insertion
psycopg2.extras.execute_values(
cursor_news_insert,
"INSERT INTO noticias (id, titulo, resumen, url, fecha, imagen_url, categoria_id, pais_id) VALUES %s ON CONFLICT (id) DO NOTHING",
todas_las_noticias
)
rows_inserted = cursor_news_insert.rowcount
logging.info(f"Se insertaron/omitieron {rows_inserted} noticias (ON CONFLICT DO NOTHING).")
# cursor_news_insert is implicitly closed here
logging.info("--- CICLO DE CAPTURA FINALIZADO ---")
except psycopg2.Error as db_err:
logging.error(f"Error de BD en actualización masiva: {db_err}", exc_info=True)
# --- Arranque de la Aplicación (SOLO PARA DESARROLLO LOCAL) ---
if __name__ == "__main__":
app.run(host="0.0.0.0", port=5000, debug=True, use_reloader=False)
if not db_pool:
app.logger.error("La aplicación no puede arrancar sin una conexión a la base de datos.")
sys.exit(1)
app.run(host="0.0.0.0", port=5000, debug=True)