Actualización del 2025-06-15 a las 22:45:55

2025-06-15 22:45:55 +02:00 · 2025-06-15 22:45:55 +02:00 · eb72ec9e56
commit eb72ec9e56
parent 603149d47a
5 changed files with 114 additions and 94 deletions
--- a/feed_processor.py
+++ b/feed_processor.py
@@ -1,15 +1,12 @@
-# /home/x/rss/feed_processor.py
-
 import hashlib
 from datetime import datetime
 import logging
 import feedparser
 from bs4 import BeautifulSoup
 import requests
-import xml.sax._exceptions # Make sure this import is present
+import xml.sax._exceptions

-# You might want to define these constants in a central config or pass them
-NETWORK_TIMEOUT = 15 # seconds for fetching the feed
+NETWORK_TIMEOUT = 15 # segundos

 def process_single_feed(feed_data):
    """
@@ -17,6 +14,9 @@ def process_single_feed(feed_data):
    """
    feed_id = feed_data['id']
    feed_url = feed_data['url']
+    # --- LÍNEA CLAVE ---
+    # Obtenemos el nombre del feed para usarlo como fuente de la noticia.
+    feed_nombre = feed_data.get('nombre', 'Fuente Desconocida')
    etag = feed_data.get('last_etag')
    modified = feed_data.get('last_modified')

@@ -25,43 +25,28 @@ def process_single_feed(feed_data):
    success = False

    try:
-        headers = {}
+        headers = {'User-Agent': 'RssApp/1.0'}
        if etag:
            headers['If-None-Match'] = etag
        if modified:
            headers['If-Modified-Since'] = modified

        response = requests.get(feed_url, headers=headers, timeout=NETWORK_TIMEOUT)
-        response.raise_for_status() # Raises HTTPError for bad responses (4xx or 5xx)
+        response.raise_for_status()

        if response.status_code == 304:
            logging.info(f"Feed {feed_url} (ID: {feed_id}) no modificado (304).")
-            # Return existing etag/modified if not modified, as per standard HTTP caching
            return feed_id, [], etag, modified, True

        parsed = feedparser.parse(response.content)

-        # Check if parsed.bozo is set, meaning there was an issue during parsing
-        if parsed.bozo:
-            # feedparser.bozo_exception will contain the actual exception
-            # We catch specific bozo exceptions for better error logging
-            if isinstance(parsed.bozo_exception, (feedparser.CharacterEncodingOverride, feedparser.NonXMLContentType)):
-                # These are usually harmless warnings; we can proceed
-                logging.warning(f"Advertencia al parsear feed {feed_url} (ID: {feed_id}): {parsed.bozo_exception}")
-            elif isinstance(parsed.bozo_exception, xml.sax._exceptions.SAXParseException):
-                # This is a critical parsing error (e.g., invalid XML)
-                logging.error(f"Feed malformado para {feed_url} (ID: {feed_id}): {parsed.bozo_exception}")
-                return feed_id, [], None, None, False # Indicate failure due to parsing error
-            else:
-                # Catch any other unexpected bozo exceptions
-                logging.error(f"Excepción inesperada de bozo en feed {feed_url} (ID: {feed_id}): {parsed.bozo_exception}")
-                return feed_id, [], None, None, False # Indicate failure
+        if parsed.bozo and isinstance(parsed.bozo_exception, xml.sax._exceptions.SAXParseException):
+            logging.error(f"Feed malformado para {feed_url} (ID: {feed_id}): {parsed.bozo_exception}")
+            return feed_id, [], None, None, False
        
-        # Proceed only if parsing was successful or had minor warnings
        if not parsed.entries:
-             logging.warning(f"Feed {feed_url} (ID: {feed_id}) no contiene entradas.")
-             # If no entries but parsing was successful, update etag/modified
-             return feed_id, [], parsed.get('etag'), parsed.get('modified'), True
+            logging.warning(f"Feed {feed_url} (ID: {feed_id}) no contiene entradas.")
+            return feed_id, [], parsed.get('etag'), parsed.get('modified'), True

        for entry in parsed.entries:
            link = entry.get("link")
@@ -73,9 +58,7 @@ def process_single_feed(feed_data):
            resumen_html = entry.get("summary", "")
            imagen_url = ""

-            # Attempt to get image from media:content or from HTML summary
            if "media_content" in entry and entry.media_content:
-                # Assuming the first media_content is the relevant one with a 'url'
                imagen_url = entry.media_content[0].get("url", "")
            elif resumen_html:
                soup = BeautifulSoup(resumen_html, 'html.parser')
@@ -84,17 +67,19 @@ def process_single_feed(feed_data):
                    imagen_url = img_tag['src']

            resumen_texto_plano = BeautifulSoup(resumen_html, 'html.parser').get_text(separator=' ', strip=True)
-            fecha_publicacion = datetime.now() # Default to now if no publication date
+            fecha_publicacion = datetime.now()
            if hasattr(entry, 'published_parsed') and entry.published_parsed:
                fecha_publicacion = datetime(*entry.published_parsed[:6])
            elif hasattr(entry, 'updated_parsed') and entry.updated_parsed:
                fecha_publicacion = datetime(*entry.updated_parsed[:6])

+            # --- LÍNEA CLAVE ---
+            # Añadimos 'feed_nombre' a la tupla de datos que se guardará en la BD.
            noticias_encontradas.append(
-                (noticia_id, titulo, resumen_texto_plano, link, fecha_publicacion, imagen_url, feed_data['categoria_id'], feed_data['pais_id'])
+                (noticia_id, titulo, resumen_texto_plano, link, fecha_publicacion, 
+                 imagen_url, feed_nombre, feed_data['categoria_id'], feed_data['pais_id'])
            )

-        # Get ETag and Last-Modified headers from the response
        new_etag = response.headers.get('ETag')
        new_modified = response.headers.get('Last-Modified')
        success = True
@@ -103,7 +88,8 @@ def process_single_feed(feed_data):
        logging.error(f"TIMEOUT al intentar obtener el feed {feed_url} (ID: {feed_id})")
    except requests.exceptions.RequestException as e:
        logging.error(f"Error de HTTP/red para el feed {feed_url} (ID: {feed_id}): {e}")
-    except Exception as e: # General Exception for any other unexpected errors during the process
+    except Exception as e:
        logging.error(f"Excepción inesperada al procesar el feed {feed_url} (ID: {feed_id}): {e}", exc_info=True)

    return feed_id, noticias_encontradas, new_etag, new_modified, success
+