Fix cursor closed error, improve feed parsing, and update worker counts

2025-06-13 14:02:32 +02:00 · 2025-06-13 14:02:32 +02:00 · 824ff0539d
commit 824ff0539d
parent ce19d301e6
523 changed files with 190411 additions and 355 deletions
--- a/feed_processor.py
+++ b/feed_processor.py
@ -0,0 +1,109 @@
+# /home/x/rss/feed_processor.py
+
+import hashlib
+from datetime import datetime
+import logging
+import feedparser
+from bs4 import BeautifulSoup
+import requests
+import xml.sax._exceptions # Make sure this import is present
+
+# Timeout used when fetching a feed over HTTP; could be moved to a central config or passed in as a parameter.
+NETWORK_TIMEOUT = 15 # seconds for fetching the feed
+
+def process_single_feed(feed_data):
+    """
+    Fetch a single RSS/Atom feed over HTTP and extract its news entries.
+
+    Args:
+        feed_data: Mapping describing the feed. The keys 'id' and 'url' are
+            required; 'last_etag' and 'last_modified' are optional and, when
+            present, are sent as If-None-Match / If-Modified-Since headers.
+            'categoria_id' and 'pais_id' are read when an entry is emitted.
+
+    Returns:
+        A 5-tuple ``(feed_id, noticias, etag, modified, success)``:
+
+        - ``noticias``: list of tuples ``(noticia_id, titulo, resumen, link,
+          fecha_publicacion, imagen_url, categoria_id, pais_id)``, where
+          ``noticia_id`` is the MD5 hex digest of the entry link and
+          ``fecha_publicacion`` is a naive datetime in UTC.
+        - ``etag`` / ``modified``: the validators from the HTTP response on
+          success, the values passed in on a 304, or ``None`` on failure.
+        - ``success``: ``True`` when the feed was fetched and parsed (or was
+          not modified), ``False`` otherwise.
+
+    Raises:
+        KeyError: if ``feed_data`` lacks 'id' or 'url'. Errors raised while
+            fetching or parsing are logged and reported via ``success=False``
+            instead of being propagated.
+    """
+    # Local import: the module only imports `datetime` from the datetime package.
+    from datetime import timezone
+
+    feed_id = feed_data['id']
+    feed_url = feed_data['url']
+    etag = feed_data.get('last_etag')
+    modified = feed_data.get('last_modified')
+
+    noticias_encontradas = []
+    new_etag, new_modified = None, None
+    success = False
+
+    try:
+        # Conditional request: let the server answer 304 when nothing changed.
+        headers = {}
+        if etag:
+            headers['If-None-Match'] = etag
+        if modified:
+            headers['If-Modified-Since'] = modified
+
+        response = requests.get(feed_url, headers=headers, timeout=NETWORK_TIMEOUT)
+        response.raise_for_status() # Raises HTTPError for bad responses (4xx or 5xx)
+
+        if response.status_code == 304:
+            logging.info(f"Feed {feed_url} (ID: {feed_id}) no modificado (304).")
+            # Not modified: keep the validators we already had.
+            return feed_id, [], etag, modified, True
+
+        # The cache validators live in the HTTP response. feedparser is only
+        # given the raw body below, so it never sees them.
+        response_etag = response.headers.get('ETag')
+        response_modified = response.headers.get('Last-Modified')
+
+        parsed = feedparser.parse(response.content)
+
+        # bozo is set when feedparser hit a problem while parsing.
+        if parsed.bozo:
+            if isinstance(parsed.bozo_exception, (feedparser.CharacterEncodingOverride, feedparser.NonXMLContentType)):
+                # Usually harmless warnings; keep going with what was parsed.
+                logging.warning(f"Advertencia al parsear feed {feed_url} (ID: {feed_id}): {parsed.bozo_exception}")
+            elif isinstance(parsed.bozo_exception, xml.sax._exceptions.SAXParseException):
+                # Invalid XML: treat the feed as failed.
+                logging.error(f"Feed malformado para {feed_url} (ID: {feed_id}): {parsed.bozo_exception}")
+                return feed_id, [], None, None, False
+            else:
+                # Any other bozo exception is also treated as a failure.
+                logging.error(f"Excepción inesperada de bozo en feed {feed_url} (ID: {feed_id}): {parsed.bozo_exception}")
+                return feed_id, [], None, None, False
+
+        if not parsed.entries:
+            logging.warning(f"Feed {feed_url} (ID: {feed_id}) no contiene entradas.")
+            # Parsed fine but empty: still hand back the response validators so
+            # the next request can be conditional.
+            return feed_id, [], response_etag, response_modified, True
+
+        for entry in parsed.entries:
+            link = entry.get("link")
+            if not link:
+                # Without a link there is nothing to derive the ID from.
+                continue
+
+            noticia_id = hashlib.md5(link.encode()).hexdigest()
+            titulo = entry.get("title", "Sin título")
+            resumen_html = entry.get("summary", "")
+
+            # Parse the summary once; it serves both the image lookup and the
+            # plain-text extraction.
+            soup = BeautifulSoup(resumen_html, 'html.parser')
+
+            # Prefer media:content for the image, else the first <img> in the summary.
+            imagen_url = ""
+            if "media_content" in entry and entry.media_content:
+                imagen_url = entry.media_content[0].get("url", "")
+            else:
+                img_tag = soup.find('img')
+                if img_tag and img_tag.get('src'):
+                    imagen_url = img_tag['src']
+
+            resumen_texto_plano = soup.get_text(separator=' ', strip=True)
+
+            # feedparser's *_parsed values are UTC time tuples, so the fallback
+            # must be naive UTC too; otherwise the list mixes local and UTC times.
+            if hasattr(entry, 'published_parsed') and entry.published_parsed:
+                fecha_publicacion = datetime(*entry.published_parsed[:6])
+            elif hasattr(entry, 'updated_parsed') and entry.updated_parsed:
+                fecha_publicacion = datetime(*entry.updated_parsed[:6])
+            else:
+                fecha_publicacion = datetime.now(timezone.utc).replace(tzinfo=None)
+
+            noticias_encontradas.append(
+                (noticia_id, titulo, resumen_texto_plano, link, fecha_publicacion, imagen_url, feed_data['categoria_id'], feed_data['pais_id'])
+            )
+
+        new_etag = response_etag
+        new_modified = response_modified
+        success = True
+
+    except requests.exceptions.Timeout:
+        logging.error(f"TIMEOUT al intentar obtener el feed {feed_url} (ID: {feed_id})")
+    except requests.exceptions.RequestException as e:
+        logging.error(f"Error de HTTP/red para el feed {feed_url} (ID: {feed_id}): {e}")
+    except Exception as e: # Boundary handler: one bad feed must not abort the caller's batch
+        logging.error(f"Excepción inesperada al procesar el feed {feed_url} (ID: {feed_id}): {e}", exc_info=True)
+
+    return feed_id, noticias_encontradas, new_etag, new_modified, success