Update of 2025-06-13 at 22:54:59

jlimolina 2025-06-13 22:54:59 +02:00
parent 5846d1310d
commit 075a438fe2
3 changed files with 397 additions and 45 deletions

app.py

@@ -1,8 +1,6 @@
# app.py - Versión final solo para la web
import os
import sys
import hashlib # Keep if used elsewhere (e.g., if generating IDs in other parts of the app), otherwise remove
import hashlib
import csv
import math
from io import StringIO, BytesIO
@@ -20,17 +18,13 @@ import psycopg2.extras
import psycopg2.pool
import bleach
# Import the processing function from the new module
from feed_processor import process_single_feed
# --- Configuración de Logging ---
logging.basicConfig(stream=sys.stdout, level=logging.INFO, format='[%(asctime)s] %(levelname)s in %(module)s: %(message)s')
# --- Inicialización de la App Flask ---
app = Flask(__name__)
app.config['SECRET_KEY'] = os.environ.get('SECRET_KEY', os.urandom(24))
# --- Configuración de la Base de Datos y Constantes ---
DB_CONFIG = {
"host": os.environ.get("DB_HOST", "localhost"),
"port": int(os.environ.get("DB_PORT", 5432)),
@@ -39,21 +33,16 @@ DB_CONFIG = {
"password": os.environ.get("DB_PASS", "x")
}
# Define worker constants here or in a separate config
MAX_WORKERS = int(os.environ.get("RSS_MAX_WORKERS", 20)) # Changed default to 20 concurrent workers
SINGLE_FEED_TIMEOUT = int(os.environ.get("RSS_FEED_TIMEOUT", 30)) # Example: 30 seconds per feed process
MAX_FALLOS = int(os.environ.get("RSS_MAX_FAILURES", 5)) # Number of failures before deactivating a feed
MAX_WORKERS = int(os.environ.get("RSS_MAX_WORKERS", 20))
SINGLE_FEED_TIMEOUT = int(os.environ.get("RSS_FEED_TIMEOUT", 30))
MAX_FALLOS = int(os.environ.get("RSS_MAX_FAILURES", 5))
# --- Pool de Conexiones a la Base de Datos ---
db_pool = None
try:
# Aumentamos el pool para dar cabida a los workers del servidor web
db_pool = psycopg2.pool.SimpleConnectionPool(minconn=1, maxconn=10, **DB_CONFIG)
app.logger.info("Pool de conexiones a la base de datos creado exitosamente.")
except psycopg2.OperationalError as e:
logging.error(f"FATAL: No se pudo conectar a la base de datos para crear el pool: {e}")
# Consider sys.exit(1) here if DB connection is absolutely critical for app startup
@contextmanager
def get_conn():
@@ -69,14 +58,12 @@ def get_conn():
finally:
if conn: db_pool.putconn(conn)
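Note: only the tail of get_conn() is visible in this hunk. For context, a minimal sketch of the pool-backed context manager it implies; the commit/rollback policy is an assumption, since the diff does not show the function body:

from contextlib import contextmanager

@contextmanager
def get_conn():
    # Borrow a connection from the module-level pool and always return it.
    conn = None
    try:
        conn = db_pool.getconn()
        yield conn
        conn.commit()        # assumed: commit when the block exits cleanly
    except Exception:
        if conn:
            conn.rollback()  # assumed: roll back on any error
        raise
    finally:
        if conn:
            db_pool.putconn(conn)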
# --- Hook de Cierre ---
@atexit.register
def shutdown_hooks():
if db_pool:
db_pool.closeall()
app.logger.info("Pool de conexiones de la base de datos cerrado.")
# --- Filtros y Rutas ---
@app.template_filter('safe_html')
def safe_html(text):
if not text: return ""
@@ -279,13 +266,16 @@ def backup_feeds():
if not feeds_:
flash("No hay feeds para exportar.", "warning")
return redirect(url_for("dashboard"))
fieldnames = list(feeds_[0].keys())
output = StringIO()
writer = csv.DictWriter(output, fieldnames=feeds_[0].keys())
writer = csv.DictWriter(output, fieldnames=fieldnames)
writer.writeheader()
writer.writerows(feeds_)
writer.writerows([dict(feed) for feed in feeds_])
return Response(output.getvalue(), mimetype="text/csv", headers={"Content-Disposition": "attachment;filename=feeds_backup.csv"})
except Exception as e:
app.logger.error(f"[ERROR] Al hacer backup de feeds: {e}", exc_info=True)
flash(f"Error interno al generar el backup: {e}", "error")
return redirect(url_for("dashboard"))
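Note: the change above freezes the CSV header order with list(feeds_[0].keys()) and converts each row with dict(...) before handing it to csv.DictWriter. The same pattern in isolation, as a sketch (the helper name and shape are illustrative):

import csv
from io import StringIO

def rows_to_csv(rows):
    # rows: sequence of psycopg2 DictRow objects or plain dicts.
    if not rows:
        return ""
    fieldnames = list(rows[0].keys())  # freeze column order once
    output = StringIO()
    writer = csv.DictWriter(output, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows([dict(row) for row in rows])  # DictRow -> plain dict
    return output.getvalue()

dict(row) works because DictRow exposes keys() and string-keyed lookup, so the conversion is cheap and makes the rows independent of the cursor type.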
@app.route("/backup_noticias")
@@ -298,13 +288,16 @@ def backup_noticias():
if not noticias:
flash("No hay noticias para exportar.", "warning")
return redirect(url_for("dashboard"))
fieldnames_noticias = list(noticias[0].keys())
output = StringIO()
writer = csv.DictWriter(output, fieldnames=noticias[0].keys())
writer = csv.DictWriter(output, fieldnames=fieldnames_noticias)
writer.writeheader()
writer.writerows(noticias)
writer.writerows([dict(noticia) for noticia in noticias])
return Response(output.getvalue(), mimetype="text/csv", headers={"Content-Disposition": "attachment;filename=noticias_backup.csv"})
except Exception as e:
app.logger.error(f"[ERROR] Al hacer backup de noticias: {e}", exc_info=True)
flash(f"Error interno al generar el backup: {e}", "error")
return redirect(url_for("dashboard"))
@app.route("/backup_completo")
@@ -317,23 +310,27 @@ def backup_completo():
cursor.execute("SELECT f.id, f.nombre, f.descripcion, f.url, f.categoria_id, c.nombre AS categoria, f.pais_id, p.nombre AS pais, f.idioma, f.activo, f.fallos FROM feeds f LEFT JOIN categorias c ON f.categoria_id = c.id LEFT JOIN paises p ON f.pais_id = p.id ORDER BY f.id")
feeds_data = cursor.fetchall()
if feeds_data:
fieldnames_feeds = list(feeds_data[0].keys())
output = StringIO()
writer = csv.DictWriter(output, fieldnames=feeds_data[0].keys())
writer = csv.DictWriter(output, fieldnames=fieldnames_feeds)
writer.writeheader()
writer.writerows(feeds_data)
writer.writerows([dict(f) for f in feeds_data])
zipf.writestr("feeds.csv", output.getvalue())
cursor.execute("SELECT n.id, n.titulo, n.resumen, n.url, n.fecha, n.imagen_url, c.nombre AS categoria, p.nombre AS pais, co.nombre AS continente FROM noticias n LEFT JOIN categorias c ON n.categoria_id = c.id LEFT JOIN paises p ON n.pais_id = p.id LEFT JOIN continentes co ON p.continente_id = co.id ORDER BY n.fecha DESC")
noticias_data = cursor.fetchall()
if noticias_data:
fieldnames_noticias = list(noticias_data[0].keys())
output = StringIO()
writer = csv.DictWriter(output, fieldnames=noticias_data[0].keys())
writer = csv.DictWriter(output, fieldnames=fieldnames_noticias)
writer.writeheader()
writer.writerows(noticias_data)
writer.writerows([dict(n) for n in noticias_data])
zipf.writestr("noticias.csv", output.getvalue())
memory_buffer.seek(0)
return Response(memory_buffer, mimetype="application/zip", headers={"Content-Disposition": "attachment;filename=rss_backup_completo.zip"})
except Exception as e:
app.logger.error(f"[ERROR] Al hacer backup completo: {e}", exc_info=True)
flash(f"Error interno al generar el backup: {e}", "error")
return redirect(url_for("dashboard"))
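Note: backup_completo assembles both CSVs into a ZIP held entirely in memory before streaming it as a Response. The pattern in isolation, as a sketch; the ZIP_DEFLATED compression flag is an assumption, since the diff does not show how the archive is opened:

import zipfile
from io import BytesIO

def build_zip(named_payloads):
    # named_payloads: {"feeds.csv": "...", "noticias.csv": "..."}
    memory_buffer = BytesIO()
    with zipfile.ZipFile(memory_buffer, "w", zipfile.ZIP_DEFLATED) as zipf:
        for name, payload in named_payloads.items():
            zipf.writestr(name, payload)
    memory_buffer.seek(0)  # rewind so the web response reads from the start
    return memory_buffer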
@app.route("/restore_feeds", methods=["GET", "POST"])
@@ -381,9 +378,6 @@ def restore_feeds():
return redirect(url_for("dashboard"))
return render_template("restore_feeds.html")
# --- fetch_and_store function (modified slightly) ---
def fetch_and_store():
with app.app_context():
logging.info("--- INICIANDO CICLO DE CAPTURA ---")
@@ -406,19 +400,17 @@ def fetch_and_store():
feeds_fallidos, feeds_exitosos, todas_las_noticias, feeds_para_actualizar_headers = [], [], [], []
logging.info(f"Paso 3: Iniciando procesamiento paralelo ({MAX_WORKERS} workers)...")
# Pass the dict form of feed data
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
future_to_feed = {executor.submit(process_single_feed, dict(feed)): feed for feed in feeds_to_process}
progress_bar = tqdm(as_completed(future_to_feed), total=len(feeds_to_process), desc="Procesando Feeds")
for future in progress_bar:
original_feed_data = future_to_feed[future] # This is the original DictRow
original_feed_data = future_to_feed[future]
feed_id = original_feed_data['id']
try:
_, noticias_encontradas, new_etag, new_modified, success = future.result(timeout=SINGLE_FEED_TIMEOUT)
if success:
feeds_exitosos.append(feed_id)
if noticias_encontradas: todas_las_noticias.extend(noticias_encontradas)
# Only add if etag/modified actually changed or were initially None
if (new_etag is not None and new_etag != original_feed_data.get('last_etag')) or \
(new_modified is not None and new_modified != original_feed_data.get('last_modified')):
feeds_para_actualizar_headers.append({'id': feed_id, 'etag': new_etag, 'modified': new_modified})
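Note: this hunk fans feed processing out over a thread pool and, as futures complete, records successes, failures, and changed conditional-GET headers (ETag / Last-Modified). A condensed sketch of that fan-out/fan-in shape; run_parallel and process_feed are illustrative stand-ins for fetch_and_store() and feed_processor.process_single_feed:

from concurrent.futures import ThreadPoolExecutor, as_completed

def run_parallel(feeds, process_feed, max_workers=20, timeout=30):
    exitosos, fallidos, noticias = [], [], []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_feed = {executor.submit(process_feed, dict(f)): f for f in feeds}
        for future in as_completed(future_to_feed):  # yields futures as they finish
            feed_id = future_to_feed[future]['id']
            try:
                # Same 5-tuple contract as process_single_feed above.
                _, encontradas, etag, modified, success = future.result(timeout=timeout)
            except Exception:
                fallidos.append(feed_id)
                continue
            if success:
                exitosos.append(feed_id)
                noticias.extend(encontradas or [])
            else:
                fallidos.append(feed_id)
    return exitosos, fallidos, noticias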
@@ -437,33 +429,28 @@ def fetch_and_store():
return
try:
with get_conn() as conn: # This connection is for the entire transaction (commits/rolls back everything together)
with get_conn() as conn:
logging.info("Paso 5: Actualizando BD...")
# --- Update feed status (fallos, activo) ---
if feeds_fallidos or feeds_exitosos: # Only create cursor if there's work
if feeds_fallidos or feeds_exitosos:
with conn.cursor() as cursor_feeds_status:
if feeds_fallidos:
cursor_feeds_status.execute("UPDATE feeds SET fallos = fallos + 1 WHERE id IN %s", (tuple(feeds_fallidos),))
cursor_feeds_status.execute("UPDATE feeds SET activo = FALSE WHERE fallos >= %s AND id IN %s", (MAX_FALLOS, tuple(feeds_fallidos)))
if feeds_exitosos:
cursor_feeds_status.execute("UPDATE feeds SET fallos = 0 WHERE id IN %s", (tuple(feeds_exitosos),))
# cursor_feeds_status is implicitly closed here
# --- Update feed headers (etag, modified) ---
if feeds_para_actualizar_headers: # Only create cursor if there's work
if feeds_para_actualizar_headers:
with conn.cursor() as cursor_headers:
psycopg2.extras.execute_values(
cursor_headers,
"UPDATE feeds SET last_etag = data.etag, last_modified = data.modified FROM (VALUES %s) AS data(id, etag, modified) WHERE feeds.id = data.id",
[(f['id'], f['etag'], f['modified']) for f in feeds_para_actualizar_headers]
)
# cursor_headers is implicitly closed here
# --- Insert new news articles ---
if todas_las_noticias: # Only create cursor if there's work
if todas_las_noticias:
logging.info(f"Intentando insertar {len(todas_las_noticias)} noticias en la base de datos.")
with conn.cursor() as cursor_news_insert: # A fresh cursor specifically for news insertion
with conn.cursor() as cursor_news_insert:
psycopg2.extras.execute_values(
cursor_news_insert,
"INSERT INTO noticias (id, titulo, resumen, url, fecha, imagen_url, categoria_id, pais_id) VALUES %s ON CONFLICT (id) DO NOTHING",
@@ -471,16 +458,14 @@ def fetch_and_store():
)
rows_inserted = cursor_news_insert.rowcount
logging.info(f"Se insertaron/omitieron {rows_inserted} noticias (ON CONFLICT DO NOTHING).")
# cursor_news_insert is implicitly closed here
logging.info("--- CICLO DE CAPTURA FINALIZADO ---")
except psycopg2.Error as db_err:
logging.error(f"Error de BD en actualización masiva: {db_err}", exc_info=True)
# --- Arranque de la Aplicación (SOLO PARA DESARROLLO LOCAL) ---
if __name__ == "__main__":
if not db_pool:
app.logger.error("La aplicación no puede arrancar sin una conexión a la base de datos.")
sys.exit(1)
app.run(host="0.0.0.0", port=5000, debug=True)
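Note: two bulk-write idioms in the update block above are worth isolating. psycopg2.extras.execute_values expands a Python list into a single multi-row VALUES clause, so the header refresh is one round trip instead of one UPDATE per feed, and the news insert dedupes with ON CONFLICT (id) DO NOTHING against the primary key. A sketch of the UPDATE-from-VALUES half, with table and column names taken from the diff (connection handling elided):

import psycopg2.extras

def update_feed_headers(conn, pending):
    # pending: [{'id': 1, 'etag': '...', 'modified': '...'}, ...]
    with conn.cursor() as cur:
        psycopg2.extras.execute_values(
            cur,
            "UPDATE feeds SET last_etag = data.etag, last_modified = data.modified "
            "FROM (VALUES %s) AS data(id, etag, modified) WHERE feeds.id = data.id",
            [(p['id'], p['etag'], p['modified']) for p in pending],
        )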

estructura.sql (new file)

@@ -0,0 +1,364 @@
--
-- PostgreSQL database dump
--
-- Dumped from database version 16.9 (Ubuntu 16.9-0ubuntu0.24.04.1)
-- Dumped by pg_dump version 16.9 (Ubuntu 16.9-0ubuntu0.24.04.1)
SET statement_timeout = 0;
SET lock_timeout = 0;
SET idle_in_transaction_session_timeout = 0;
SET client_encoding = 'UTF8';
SET standard_conforming_strings = on;
SELECT pg_catalog.set_config('search_path', '', false);
SET check_function_bodies = false;
SET xmloption = content;
SET client_min_messages = warning;
SET row_security = off;
--
-- Name: noticias_tsv_trigger(); Type: FUNCTION; Schema: public; Owner: rss
--
CREATE FUNCTION public.noticias_tsv_trigger() RETURNS trigger
LANGUAGE plpgsql
AS $$ begin
    new.tsv := setweight(to_tsvector('spanish', coalesce(new.titulo,'')), 'A') ||
               setweight(to_tsvector('spanish', coalesce(new.resumen,'')), 'B');
    return new;
end $$;
ALTER FUNCTION public.noticias_tsv_trigger() OWNER TO rss;
SET default_tablespace = '';
SET default_table_access_method = heap;
--
-- Name: categorias; Type: TABLE; Schema: public; Owner: rss
--
CREATE TABLE public.categorias (
id integer NOT NULL,
nombre character varying(100) NOT NULL
);
ALTER TABLE public.categorias OWNER TO rss;
--
-- Name: categorias_id_seq; Type: SEQUENCE; Schema: public; Owner: rss
--
CREATE SEQUENCE public.categorias_id_seq
AS integer
START WITH 1
INCREMENT BY 1
NO MINVALUE
NO MAXVALUE
CACHE 1;
ALTER SEQUENCE public.categorias_id_seq OWNER TO rss;
--
-- Name: categorias_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: rss
--
ALTER SEQUENCE public.categorias_id_seq OWNED BY public.categorias.id;
--
-- Name: continentes; Type: TABLE; Schema: public; Owner: rss
--
CREATE TABLE public.continentes (
id integer NOT NULL,
nombre character varying(50) NOT NULL
);
ALTER TABLE public.continentes OWNER TO rss;
--
-- Name: continentes_id_seq; Type: SEQUENCE; Schema: public; Owner: rss
--
CREATE SEQUENCE public.continentes_id_seq
AS integer
START WITH 1
INCREMENT BY 1
NO MINVALUE
NO MAXVALUE
CACHE 1;
ALTER SEQUENCE public.continentes_id_seq OWNER TO rss;
--
-- Name: continentes_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: rss
--
ALTER SEQUENCE public.continentes_id_seq OWNED BY public.continentes.id;
--
-- Name: feeds; Type: TABLE; Schema: public; Owner: rss
--
CREATE TABLE public.feeds (
id integer NOT NULL,
nombre character varying(255),
descripcion text,
url text NOT NULL,
categoria_id integer,
pais_id integer,
idioma character(2),
activo boolean DEFAULT true,
fallos integer DEFAULT 0,
last_etag character varying(255),
last_modified character varying(255)
);
ALTER TABLE public.feeds OWNER TO rss;
--
-- Name: feeds_id_seq; Type: SEQUENCE; Schema: public; Owner: rss
--
CREATE SEQUENCE public.feeds_id_seq
AS integer
START WITH 1
INCREMENT BY 1
NO MINVALUE
NO MAXVALUE
CACHE 1;
ALTER SEQUENCE public.feeds_id_seq OWNER TO rss;
--
-- Name: feeds_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: rss
--
ALTER SEQUENCE public.feeds_id_seq OWNED BY public.feeds.id;
--
-- Name: noticias; Type: TABLE; Schema: public; Owner: rss
--
CREATE TABLE public.noticias (
id character varying(32) NOT NULL,
titulo text,
resumen text,
url text NOT NULL,
fecha timestamp without time zone,
imagen_url text,
categoria_id integer,
pais_id integer,
tsv tsvector
);
ALTER TABLE public.noticias OWNER TO rss;
--
-- Name: paises; Type: TABLE; Schema: public; Owner: rss
--
CREATE TABLE public.paises (
id integer NOT NULL,
nombre character varying(100) NOT NULL,
continente_id integer
);
ALTER TABLE public.paises OWNER TO rss;
--
-- Name: paises_id_seq; Type: SEQUENCE; Schema: public; Owner: rss
--
CREATE SEQUENCE public.paises_id_seq
AS integer
START WITH 1
INCREMENT BY 1
NO MINVALUE
NO MAXVALUE
CACHE 1;
ALTER SEQUENCE public.paises_id_seq OWNER TO rss;
--
-- Name: paises_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: rss
--
ALTER SEQUENCE public.paises_id_seq OWNED BY public.paises.id;
--
-- Name: categorias id; Type: DEFAULT; Schema: public; Owner: rss
--
ALTER TABLE ONLY public.categorias ALTER COLUMN id SET DEFAULT nextval('public.categorias_id_seq'::regclass);
--
-- Name: continentes id; Type: DEFAULT; Schema: public; Owner: rss
--
ALTER TABLE ONLY public.continentes ALTER COLUMN id SET DEFAULT nextval('public.continentes_id_seq'::regclass);
--
-- Name: feeds id; Type: DEFAULT; Schema: public; Owner: rss
--
ALTER TABLE ONLY public.feeds ALTER COLUMN id SET DEFAULT nextval('public.feeds_id_seq'::regclass);
--
-- Name: paises id; Type: DEFAULT; Schema: public; Owner: rss
--
ALTER TABLE ONLY public.paises ALTER COLUMN id SET DEFAULT nextval('public.paises_id_seq'::regclass);
--
-- Name: categorias categorias_nombre_key; Type: CONSTRAINT; Schema: public; Owner: rss
--
ALTER TABLE ONLY public.categorias
ADD CONSTRAINT categorias_nombre_key UNIQUE (nombre);
--
-- Name: categorias categorias_pkey; Type: CONSTRAINT; Schema: public; Owner: rss
--
ALTER TABLE ONLY public.categorias
ADD CONSTRAINT categorias_pkey PRIMARY KEY (id);
--
-- Name: continentes continentes_nombre_key; Type: CONSTRAINT; Schema: public; Owner: rss
--
ALTER TABLE ONLY public.continentes
ADD CONSTRAINT continentes_nombre_key UNIQUE (nombre);
--
-- Name: continentes continentes_pkey; Type: CONSTRAINT; Schema: public; Owner: rss
--
ALTER TABLE ONLY public.continentes
ADD CONSTRAINT continentes_pkey PRIMARY KEY (id);
--
-- Name: feeds feeds_pkey; Type: CONSTRAINT; Schema: public; Owner: rss
--
ALTER TABLE ONLY public.feeds
ADD CONSTRAINT feeds_pkey PRIMARY KEY (id);
--
-- Name: feeds feeds_url_key; Type: CONSTRAINT; Schema: public; Owner: rss
--
ALTER TABLE ONLY public.feeds
ADD CONSTRAINT feeds_url_key UNIQUE (url);
--
-- Name: noticias noticias_pkey; Type: CONSTRAINT; Schema: public; Owner: rss
--
ALTER TABLE ONLY public.noticias
ADD CONSTRAINT noticias_pkey PRIMARY KEY (id);
--
-- Name: noticias noticias_url_key; Type: CONSTRAINT; Schema: public; Owner: rss
--
ALTER TABLE ONLY public.noticias
ADD CONSTRAINT noticias_url_key UNIQUE (url);
--
-- Name: paises paises_nombre_key; Type: CONSTRAINT; Schema: public; Owner: rss
--
ALTER TABLE ONLY public.paises
ADD CONSTRAINT paises_nombre_key UNIQUE (nombre);
--
-- Name: paises paises_pkey; Type: CONSTRAINT; Schema: public; Owner: rss
--
ALTER TABLE ONLY public.paises
ADD CONSTRAINT paises_pkey PRIMARY KEY (id);
--
-- Name: noticias_tsv_idx; Type: INDEX; Schema: public; Owner: rss
--
CREATE INDEX noticias_tsv_idx ON public.noticias USING gin (tsv);
--
-- Name: noticias tsvectorupdate; Type: TRIGGER; Schema: public; Owner: rss
--
CREATE TRIGGER tsvectorupdate BEFORE INSERT ON public.noticias FOR EACH ROW EXECUTE FUNCTION public.noticias_tsv_trigger();
--
-- Name: feeds feeds_categoria_id_fkey; Type: FK CONSTRAINT; Schema: public; Owner: rss
--
ALTER TABLE ONLY public.feeds
ADD CONSTRAINT feeds_categoria_id_fkey FOREIGN KEY (categoria_id) REFERENCES public.categorias(id) ON DELETE SET NULL;
--
-- Name: feeds feeds_pais_id_fkey; Type: FK CONSTRAINT; Schema: public; Owner: rss
--
ALTER TABLE ONLY public.feeds
ADD CONSTRAINT feeds_pais_id_fkey FOREIGN KEY (pais_id) REFERENCES public.paises(id) ON DELETE SET NULL;
--
-- Name: noticias noticias_categoria_id_fkey; Type: FK CONSTRAINT; Schema: public; Owner: rss
--
ALTER TABLE ONLY public.noticias
ADD CONSTRAINT noticias_categoria_id_fkey FOREIGN KEY (categoria_id) REFERENCES public.categorias(id) ON DELETE SET NULL;
--
-- Name: noticias noticias_pais_id_fkey; Type: FK CONSTRAINT; Schema: public; Owner: rss
--
ALTER TABLE ONLY public.noticias
ADD CONSTRAINT noticias_pais_id_fkey FOREIGN KEY (pais_id) REFERENCES public.paises(id) ON DELETE SET NULL;
--
-- Name: paises paises_continente_id_fkey; Type: FK CONSTRAINT; Schema: public; Owner: rss
--
ALTER TABLE ONLY public.paises
ADD CONSTRAINT paises_continente_id_fkey FOREIGN KEY (continente_id) REFERENCES public.continentes(id) ON DELETE SET NULL;
--
-- PostgreSQL database dump complete
--
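Note: the dump wires up weighted Spanish full-text search: the tsvectorupdate trigger fills noticias.tsv (titulo weighted 'A', resumen 'B') and noticias_tsv_idx indexes it with GIN. A hedged example of querying it from Python; the query shape, ranking, and LIMIT are illustrative, not part of the commit:

def buscar_noticias(conn, termino):
    # Matches against the trigger-maintained tsv column; the GIN index
    # (noticias_tsv_idx) serves the @@ operator.
    sql = """
        SELECT id, titulo, ts_rank(tsv, q) AS rank
          FROM noticias, plainto_tsquery('spanish', %s) AS q
         WHERE tsv @@ q
         ORDER BY rank DESC
         LIMIT 20
    """
    with conn.cursor() as cur:
        cur.execute(sql, (termino,))
        return cur.fetchall()

One caveat visible in the dump: the trigger fires BEFORE INSERT only, so tsv is not refreshed if titulo or resumen are later updated.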

requirements.txt

@@ -5,3 +5,6 @@ psycopg2-binary==2.9.10
bleach==6.1.0
gunicorn==22.0.0
waitress
tqdm
beautifulsoup4
requests