Actualización del 2025-06-15 a las 22:45:55

2025-06-15 22:45:55 +02:00 · 2025-06-15 22:45:55 +02:00 · eb72ec9e56
commit eb72ec9e56
parent 603149d47a
5 changed files with 114 additions and 94 deletions
--- a/app.py
+++ b/app.py
@ -93,7 +93,8 @@ def home():
                paises = cursor.fetchall()

                sql_params, conditions = [], []
-                sql_base = "SELECT n.fecha, n.titulo, n.resumen, n.url, n.imagen_url, c.nombre AS categoria, p.nombre AS pais, co.nombre AS continente FROM noticias n LEFT JOIN categorias c ON n.categoria_id = c.id LEFT JOIN paises p ON n.pais_id = p.id LEFT JOIN continentes co ON p.continente_id = co.id"
+                # --- CORRECCIÓN: SE AÑADE 'fuente_nombre' AL SELECT ---
+                sql_base = "SELECT n.fecha, n.titulo, n.resumen, n.url, n.imagen_url, n.fuente_nombre, c.nombre AS categoria, p.nombre AS pais, co.nombre AS continente FROM noticias n LEFT JOIN categorias c ON n.categoria_id = c.id LEFT JOIN paises p ON n.pais_id = p.id LEFT JOIN continentes co ON p.continente_id = co.id"

                if q:
                    search_query = " & ".join(q.split())
@ -361,24 +362,29 @@ def scrape_url():
            flash("Debes seleccionar una fuente para procesar.", "error")
            return redirect(url_for('scrape_url'))

+        source = None
        try:
            with get_conn() as conn:
                with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cursor:
                    cursor.execute("SELECT * FROM fuentes_url WHERE id = %s", (source_id,))
                    source = cursor.fetchone()
+        except psycopg2.Error as db_err:
+            app.logger.error(f"[DB ERROR] Al buscar fuente URL: {db_err}", exc_info=True)
+            flash("Error de base de datos al buscar la fuente.", "error")
+            return redirect(url_for('scrape_url'))

        if not source:
            flash("La fuente seleccionada no existe.", "error")
            return redirect(url_for('scrape_url'))

-                lista_noticias, message = process_newspaper_url(source['url'], source['categoria_id'], source['pais_id'], source['idioma'])
+        lista_noticias, message = process_newspaper_url(source['nombre'], source['url'], source['categoria_id'], source['pais_id'], source['idioma'])

        if lista_noticias:
-                    # Se necesita una nueva conexión/cursor para la inserción
-                    with get_conn() as insert_conn:
-                        with insert_conn.cursor() as insert_cursor:
+            try:
+                with get_conn() as conn:
+                    with conn.cursor() as cursor:
                        insert_query = """
-                                INSERT INTO noticias (id, titulo, resumen, url, fecha, imagen_url, categoria_id, pais_id)
+                            INSERT INTO noticias (id, titulo, resumen, url, fecha, imagen_url, fuente_nombre, categoria_id, pais_id)
                            VALUES %s
                            ON CONFLICT (url) DO UPDATE SET
                                titulo = EXCLUDED.titulo,
@ -386,16 +392,15 @@ def scrape_url():
                                fecha = EXCLUDED.fecha,
                                imagen_url = EXCLUDED.imagen_url;
                        """
-                            psycopg2.extras.execute_values(insert_cursor, insert_query, lista_noticias)
+                        psycopg2.extras.execute_values(cursor, insert_query, lista_noticias)
                flash(f"Se encontraron y guardaron {len(lista_noticias)} noticias desde '{source['nombre']}'.", "success")
                return redirect(url_for("home"))
+            except psycopg2.Error as db_err:
+                app.logger.error(f"[DB ERROR] Al insertar noticias scrapeadas: {db_err}", exc_info=True)
+                flash("Error de base de datos al guardar las noticias.", "error")  # generic text only: the full error is logged on the previous line; raw DB errors must not reach the browser
        else:
            flash(message, "warning")
        
-        except psycopg2.Error as db_err:
-            app.logger.error(f"[DB ERROR] Al procesar fuente URL: {db_err}", exc_info=True)
-            flash(f"Error de base de datos al procesar la fuente: {db_err}", "error")
-        
        return redirect(url_for('scrape_url'))

    fuentes = []
@ -437,7 +442,7 @@ def backup_noticias():
    try:
        with get_conn() as conn:
            with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cursor:
-                cursor.execute("SELECT n.id, n.titulo, n.resumen, n.url, n.fecha, n.imagen_url, c.nombre AS categoria, p.nombre AS pais, co.nombre AS continente FROM noticias n LEFT JOIN categorias c ON n.categoria_id = c.id LEFT JOIN paises p ON n.pais_id = p.id LEFT JOIN continentes co ON p.continente_id = co.id ORDER BY n.fecha DESC")
+                cursor.execute("SELECT n.id, n.titulo, n.resumen, n.url, n.fecha, n.imagen_url, n.fuente_nombre, c.nombre AS categoria, p.nombre AS pais, co.nombre AS continente FROM noticias n LEFT JOIN categorias c ON n.categoria_id = c.id LEFT JOIN paises p ON n.pais_id = p.id LEFT JOIN continentes co ON p.continente_id = co.id ORDER BY n.fecha DESC")
                noticias = cursor.fetchall()
        if not noticias:
            flash("No hay noticias para exportar.", "warning")
@ -471,7 +476,7 @@ def backup_completo():
                        writer.writerows([dict(f) for f in feeds_data])
                        zipf.writestr("feeds.csv", output.getvalue())

-                    cursor.execute("SELECT n.id, n.titulo, n.resumen, n.url, n.fecha, n.imagen_url, c.nombre AS categoria, p.nombre AS pais, co.nombre AS continente FROM noticias n LEFT JOIN categorias c ON n.categoria_id = c.id LEFT JOIN paises p ON n.pais_id = p.id LEFT JOIN continentes co ON p.continente_id = co.id ORDER BY n.fecha DESC")
+                    cursor.execute("SELECT n.id, n.titulo, n.resumen, n.url, n.fecha, n.imagen_url, n.fuente_nombre, c.nombre AS categoria, p.nombre AS pais, co.nombre AS continente FROM noticias n LEFT JOIN categorias c ON n.categoria_id = c.id LEFT JOIN paises p ON n.pais_id = p.id LEFT JOIN continentes co ON p.continente_id = co.id ORDER BY n.fecha DESC")
                    noticias_data = cursor.fetchall()
                    if noticias_data:
                        fieldnames_noticias = list(noticias_data[0].keys())
@ -543,7 +548,8 @@ def fetch_and_store():
            with get_conn() as conn:
                with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cursor:
                    logging.info("Paso 1: Obteniendo lista de feeds...")
-                    cursor.execute("SELECT id, url, categoria_id, pais_id, last_etag, last_modified FROM feeds WHERE activo = TRUE")
+                    # --- CORRECCIÓN: Se añade 'nombre' al SELECT ---
+                    cursor.execute("SELECT id, nombre, url, categoria_id, pais_id, last_etag, last_modified FROM feeds WHERE activo = TRUE")
                    feeds_to_process = cursor.fetchall()
                    logging.info(f"Paso 2: {len(feeds_to_process)} feeds para procesar.")
        except psycopg2.Error as db_err:
@ -608,9 +614,10 @@ def fetch_and_store():
                if todas_las_noticias:
                    logging.info(f"Intentando insertar {len(todas_las_noticias)} noticias en la base de datos.")
                    with conn.cursor() as cursor_news_insert:
+                        # --- CORRECCIÓN: Se añade 'fuente_nombre' a la consulta INSERT ---
                        psycopg2.extras.execute_values(
                            cursor_news_insert,
-                            "INSERT INTO noticias (id, titulo, resumen, url, fecha, imagen_url, categoria_id, pais_id) VALUES %s ON CONFLICT (id) DO NOTHING",
+                            "INSERT INTO noticias (id, titulo, resumen, url, fecha, imagen_url, fuente_nombre, categoria_id, pais_id) VALUES %s ON CONFLICT (url) DO NOTHING",
                            todas_las_noticias
                        )
                        rows_inserted = cursor_news_insert.rowcount
--- a/feed_processor.py
+++ b/feed_processor.py
@ -1,15 +1,12 @@
-# /home/x/rss/feed_processor.py
-
 import hashlib
 from datetime import datetime
 import logging
 import feedparser
 from bs4 import BeautifulSoup
 import requests
-import xml.sax._exceptions # Make sure this import is present
+import xml.sax._exceptions

-# You might want to define these constants in a central config or pass them
-NETWORK_TIMEOUT = 15 # seconds for fetching the feed
+NETWORK_TIMEOUT = 15 # segundos

 def process_single_feed(feed_data):
    """
@ -17,6 +14,9 @@ def process_single_feed(feed_data):
    """
    feed_id = feed_data['id']
    feed_url = feed_data['url']
+    # --- KEY LINE ---
+    # Use the feed's name as the news source; `or` also covers a NULL/empty name, which the default argument of .get() would not.
+    feed_nombre = feed_data.get('nombre') or 'Fuente Desconocida'
    etag = feed_data.get('last_etag')
    modified = feed_data.get('last_modified')

@ -25,42 +25,27 @@ def process_single_feed(feed_data):
    success = False

    try:
-        headers = {}
+        headers = {'User-Agent': 'RssApp/1.0'}
        if etag:
            headers['If-None-Match'] = etag
        if modified:
            headers['If-Modified-Since'] = modified

        response = requests.get(feed_url, headers=headers, timeout=NETWORK_TIMEOUT)
-        response.raise_for_status() # Raises HTTPError for bad responses (4xx or 5xx)
+        response.raise_for_status()

        if response.status_code == 304:
            logging.info(f"Feed {feed_url} (ID: {feed_id}) no modificado (304).")
-            # Return existing etag/modified if not modified, as per standard HTTP caching
            return feed_id, [], etag, modified, True

        parsed = feedparser.parse(response.content)

-        # Check if parsed.bozo is set, meaning there was an issue during parsing
-        if parsed.bozo:
-            # feedparser.bozo_exception will contain the actual exception
-            # We catch specific bozo exceptions for better error logging
-            if isinstance(parsed.bozo_exception, (feedparser.CharacterEncodingOverride, feedparser.NonXMLContentType)):
-                # These are usually harmless warnings; we can proceed
-                logging.warning(f"Advertencia al parsear feed {feed_url} (ID: {feed_id}): {parsed.bozo_exception}")
-            elif isinstance(parsed.bozo_exception, xml.sax._exceptions.SAXParseException):
-                # This is a critical parsing error (e.g., invalid XML)
+        if parsed.bozo and isinstance(parsed.bozo_exception, xml.sax._exceptions.SAXParseException):
            logging.error(f"Feed malformado para {feed_url} (ID: {feed_id}): {parsed.bozo_exception}")
-                return feed_id, [], None, None, False # Indicate failure due to parsing error
-            else:
-                # Catch any other unexpected bozo exceptions
-                logging.error(f"Excepción inesperada de bozo en feed {feed_url} (ID: {feed_id}): {parsed.bozo_exception}")
-                return feed_id, [], None, None, False # Indicate failure
+            return feed_id, [], None, None, False
        
-        # Proceed only if parsing was successful or had minor warnings
        if not parsed.entries:
            logging.warning(f"Feed {feed_url} (ID: {feed_id}) no contiene entradas.")
-             # If no entries but parsing was successful, update etag/modified
            return feed_id, [], parsed.get('etag'), parsed.get('modified'), True

        for entry in parsed.entries:
@ -73,9 +58,7 @@ def process_single_feed(feed_data):
            resumen_html = entry.get("summary", "")
            imagen_url = ""

-            # Attempt to get image from media:content or from HTML summary
            if "media_content" in entry and entry.media_content:
-                # Assuming the first media_content is the relevant one with a 'url'
                imagen_url = entry.media_content[0].get("url", "")
            elif resumen_html:
                soup = BeautifulSoup(resumen_html, 'html.parser')
@ -84,17 +67,19 @@ def process_single_feed(feed_data):
                    imagen_url = img_tag['src']

            resumen_texto_plano = BeautifulSoup(resumen_html, 'html.parser').get_text(separator=' ', strip=True)
-            fecha_publicacion = datetime.now() # Default to now if no publication date
+            fecha_publicacion = datetime.now()
            if hasattr(entry, 'published_parsed') and entry.published_parsed:
                fecha_publicacion = datetime(*entry.published_parsed[:6])
            elif hasattr(entry, 'updated_parsed') and entry.updated_parsed:
                fecha_publicacion = datetime(*entry.updated_parsed[:6])

+            # --- LÍNEA CLAVE ---
+            # Añadimos 'feed_nombre' a la tupla de datos que se guardará en la BD.
            noticias_encontradas.append(
-                (noticia_id, titulo, resumen_texto_plano, link, fecha_publicacion, imagen_url, feed_data['categoria_id'], feed_data['pais_id'])
+                (noticia_id, titulo, resumen_texto_plano, link, fecha_publicacion, 
+                 imagen_url, feed_nombre, feed_data['categoria_id'], feed_data['pais_id'])
            )

-        # Get ETag and Last-Modified headers from the response
        new_etag = response.headers.get('ETag')
        new_modified = response.headers.get('Last-Modified')
        success = True
@ -103,7 +88,8 @@ def process_single_feed(feed_data):
        logging.error(f"TIMEOUT al intentar obtener el feed {feed_url} (ID: {feed_id})")
    except requests.exceptions.RequestException as e:
        logging.error(f"Error de HTTP/red para el feed {feed_url} (ID: {feed_id}): {e}")
-    except Exception as e: # General Exception for any other unexpected errors during the process
+    except Exception as e:
        logging.error(f"Excepción inesperada al procesar el feed {feed_url} (ID: {feed_id}): {e}", exc_info=True)

    return feed_id, noticias_encontradas, new_etag, new_modified, success
+
--- a/install.sh
+++ b/install.sh
@ -47,7 +47,7 @@ rm -f /etc/systemd/system/$APP_NAME*
 systemctl daemon-reload
 echo "   -> Servicios systemd limpiados."

-echo "🟢 Paso 1: Instalando dependencias del sistema (PostgreSQL, Python, Gunicorn...)"
+echo "🟢 Paso 1: Instalando dependencias del sistema..."
 apt-get update
 apt-get install -y wget ca-certificates postgresql postgresql-contrib python3-venv python3-pip python3-dev libpq-dev gunicorn

@ -91,7 +91,7 @@ else
    echo "⚠️ ADVERTENCIA: No se encontró download_models.py. El scraping de URLs puede fallar."
 fi

-echo "📐 Paso 4: Creando esquema de BD, configurando FTS y sembrando datos desde archivos .sql..."
+echo "📐 Paso 4: Creando esquema de BD y sembrando datos..."
 export PGPASSWORD="$DB_PASS"

 psql -U "$DB_USER" -h localhost -d "$DB_NAME" <<SQL
@ -100,7 +100,7 @@ CREATE TABLE IF NOT EXISTS categorias (id SERIAL PRIMARY KEY, nombre VARCHAR(100
 CREATE TABLE IF NOT EXISTS paises (id SERIAL PRIMARY KEY, nombre VARCHAR(100) NOT NULL UNIQUE, continente_id INTEGER REFERENCES continentes(id) ON DELETE SET NULL);
 CREATE TABLE IF NOT EXISTS feeds (id SERIAL PRIMARY KEY, nombre VARCHAR(255), descripcion TEXT, url TEXT NOT NULL UNIQUE, categoria_id INTEGER REFERENCES categorias(id) ON DELETE SET NULL, pais_id INTEGER REFERENCES paises(id) ON DELETE SET NULL, idioma CHAR(2), activo BOOLEAN DEFAULT TRUE, fallos INTEGER DEFAULT 0, last_etag TEXT, last_modified TEXT);
 CREATE TABLE IF NOT EXISTS fuentes_url (id SERIAL PRIMARY KEY, nombre VARCHAR(255) NOT NULL, url TEXT NOT NULL UNIQUE, categoria_id INTEGER REFERENCES categorias(id) ON DELETE SET NULL, pais_id INTEGER REFERENCES paises(id) ON DELETE SET NULL, idioma CHAR(2) DEFAULT 'es');
-CREATE TABLE IF NOT EXISTS noticias (id VARCHAR(32) PRIMARY KEY, titulo TEXT, resumen TEXT, url TEXT NOT NULL UNIQUE, fecha TIMESTAMP, imagen_url TEXT, categoria_id INTEGER REFERENCES categorias(id) ON DELETE SET NULL, pais_id INTEGER REFERENCES paises(id) ON DELETE SET NULL, tsv tsvector);
+CREATE TABLE IF NOT EXISTS noticias (id VARCHAR(32) PRIMARY KEY, titulo TEXT, resumen TEXT, url TEXT NOT NULL UNIQUE, fecha TIMESTAMP, imagen_url TEXT, fuente_nombre VARCHAR(255), categoria_id INTEGER REFERENCES categorias(id) ON DELETE SET NULL, pais_id INTEGER REFERENCES paises(id) ON DELETE SET NULL, tsv tsvector); ALTER TABLE noticias ADD COLUMN IF NOT EXISTS fuente_nombre VARCHAR(255); -- upgrade path for tables created before fuente_nombre existed
 ALTER TABLE noticias ADD COLUMN IF NOT EXISTS tsv tsvector;
 CREATE OR REPLACE FUNCTION noticias_tsv_trigger() RETURNS trigger AS \$\$ BEGIN new.tsv := setweight(to_tsvector('spanish', coalesce(new.titulo,'')), 'A') || setweight(to_tsvector('spanish', coalesce(new.resumen,'')), 'B'); return new; END \$\$ LANGUAGE plpgsql;
 DROP TRIGGER IF EXISTS tsvectorupdate ON noticias;
@ -167,7 +167,7 @@ Environment="DB_PORT=5432"
 Environment="DB_NAME=$DB_NAME"
 Environment="DB_USER=$DB_USER"
 Environment="DB_PASS=$DB_PASS"
-ExecStart=$PYTHON_ENV/bin/gunicorn --workers 3 --bind 0.0.0.0:$WEB_PORT $WSGI_APP_ENTRY
+ExecStart=$PYTHON_ENV/bin/gunicorn --workers 3 --bind 0.0.0.0:$WEB_PORT --timeout 120 $WSGI_APP_ENTRY
 Restart=always
 [Install]
 WantedBy=multi-user.target
--- a/templates/_noticias_list.html
+++ b/templates/_noticias_list.html
@ -1,24 +1,37 @@
-<div class="noticias-list">
+<ul class="noticias-list">
    {% for noticia in noticias %}
-        <article class="noticia-item">
+    <li class="noticia-item">
        {% if noticia.imagen_url %}
        <div class="noticia-imagen">
-                <a href="{{ noticia.url }}" target="_blank" rel="noopener noreferrer"><img src="{{ noticia.imagen_url }}" alt="{{ noticia.titulo }}" loading="lazy"></a>
+            <a href="{{ noticia.url }}" target="_blank" rel="noopener noreferrer"><img src="{{ noticia.imagen_url }}" alt="Imagen para {{ noticia.titulo }}" loading="lazy"></a>
        </div>
        {% endif %}
        <div class="noticia-texto">
            <h3><a href="{{ noticia.url }}" target="_blank" rel="noopener noreferrer">{{ noticia.titulo }}</a></h3>
            <div class="noticia-meta">
-                    <span><i class="far fa-calendar-alt"></i> {{ noticia.fecha.strftime('%d-%m-%Y %H:%M') if noticia.fecha else 'N/D' }}</span> |
-                    <span><i class="fas fa-tag"></i> {{ noticia.categoria or 'N/A' }}</span> |
-                    <span><i class="fas fa-globe-americas"></i> {{ noticia.pais or 'Global' }}</span>
+                <span><i class="far fa-calendar-alt"></i> {{ noticia.fecha.strftime('%d-%m-%Y %H:%M') if noticia.fecha else 'N/D' }}</span>
+
+                <!-- INICIO DE LA MODIFICACIÓN: Se añade la fuente de la noticia -->
+                {% if noticia.fuente_nombre %}
+                | <span><i class="fas fa-newspaper"></i> <strong>{{ noticia.fuente_nombre }}</strong></span>
+                {% endif %}
+                <!-- FIN DE LA MODIFICACIÓN -->
+
+                {% if noticia.categoria %}
+                | <span><i class="fas fa-tag"></i> {{ noticia.categoria }}</span>
+                {% endif %}
+
+                {% if noticia.pais %}
+                | <span><i class="fas fa-globe-americas"></i> {{ noticia.pais }}</span>
+                {% endif %}
            </div>
-                <p>{{ noticia.resumen | striptags | safe_html | truncate(280) }}</p>
+            <p>{{ noticia.resumen | safe_html | truncate(280) }}</p>
        </div>
-        </article>
+    </li>
    {% else %}
-        <div class="card" style="text-align:center;">
-            <p><i class="fas fa-info-circle"></i> No hay noticias que mostrar con los filtros seleccionados.</p>
-        </div>
+    <li class="text-center p-4">
+        <i class="fas fa-info-circle"></i> No hay noticias que mostrar con los filtros seleccionados.
+    </li>
    {% endfor %}
-</div>
+</ul>
+
--- a/url_processor.py
+++ b/url_processor.py
@ -11,19 +11,25 @@ def _process_individual_article(article_url, config):
    Está diseñada para ser ejecutada en un hilo separado.
    """
    try:
+        # Es crucial crear un nuevo objeto Article dentro de cada hilo.
        article = newspaper.Article(article_url, config=config)
        article.download()
+        
+        # Un artículo necesita ser parseado para tener título, texto, etc.
        article.parse()

+        # Si no se pudo obtener título o texto, no es un artículo válido.
        if not article.title or not article.text:
            return None

+        # El método nlp() es necesario para el resumen.
        article.nlp()
        return article
    except Exception:
+        # Ignoramos errores en artículos individuales (p.ej., enlaces rotos, etc.)
        return None

-def process_newspaper_url(url, categoria_id, pais_id, idioma='es'):
+def process_newspaper_url(source_name, url, categoria_id, pais_id, idioma='es'):
    """
    Explora la URL de un periódico, extrae los artículos que encuentra
    en paralelo y devuelve una lista de noticias listas para la base de datos.
@ -35,32 +41,39 @@ def process_newspaper_url(url, categoria_id, pais_id, idioma='es'):
    try:
        config = Config()
        config.browser_user_agent = 'RssApp/1.0 (Scraper)'
-        config.request_timeout = 15
-        config.memoize_articles = False
+        config.request_timeout = 15 # Timeout más corto para artículos individuales.
+        config.memoize_articles = False # No guardar en caché para obtener siempre lo último.
        
        # Usamos el idioma proporcionado para mejorar la extracción
        source = newspaper.build(url, config=config, language=idioma)
        
+        # Limitar el número de artículos para no sobrecargar el servidor.
        articles_to_process = source.articles[:25]
        
        logging.info(f"Fuente construida. Procesando {len(articles_to_process)} artículos en paralelo...")

+        # Usamos un ThreadPoolExecutor para procesar los artículos concurrentemente.
        with ThreadPoolExecutor(max_workers=10) as executor:
+            # Creamos un futuro para cada URL de artículo.
            future_to_article = {executor.submit(_process_individual_article, article.url, config): article for article in articles_to_process}
            
            for future in as_completed(future_to_article):
                processed_article = future.result()
                
+                # Si el artículo se procesó correctamente, lo añadimos a la lista.
                if processed_article:
                    noticia_id = hashlib.md5(processed_article.url.encode()).hexdigest()
                    
                    if processed_article.summary:
                        resumen = processed_article.summary
                    else:
+                        # Fallback a un extracto del texto si no hay resumen.
                        resumen = (processed_article.text[:400] + '...') if len(processed_article.text) > 400 else processed_article.text

                    fecha = processed_article.publish_date if processed_article.publish_date else datetime.now()
                    
+                    # --- LÍNEA CLAVE ---
+                    # Añadimos 'source_name' a la tupla de datos
                    todas_las_noticias.append((
                        noticia_id,
                        processed_article.title,
@ -68,6 +81,7 @@ def process_newspaper_url(url, categoria_id, pais_id, idioma='es'):
                        processed_article.url,
                        fecha,
                        processed_article.top_image or '',
+                        source_name,
                        categoria_id,
                        pais_id
                    ))