Fix cursor closed error, improve feed parsing, and update worker counts
This commit is contained in:
parent
ce19d301e6
commit
824ff0539d
523 changed files with 190411 additions and 355 deletions
109
feed_processor.py
Normal file
109
feed_processor.py
Normal file
|
|
@ -0,0 +1,109 @@
|
|||
# /home/x/rss/feed_processor.py
|
||||
|
||||
import hashlib
|
||||
from datetime import datetime
|
||||
import logging
|
||||
import feedparser
|
||||
from bs4 import BeautifulSoup
|
||||
import requests
|
||||
import xml.sax._exceptions # Make sure this import is present
|
||||
|
||||
# You might want to define these constants in a central config or pass them in as parameters.
|
||||
# Timeout, in seconds, passed to requests.get() when fetching a feed.
NETWORK_TIMEOUT = 15 # seconds for fetching the feed
|
||||
|
||||
def _summary_image_and_text(entry, resumen_html):
    """Return ``(image_url, plain_text)`` extracted from one feed entry.

    The image URL is taken from the first ``media_content`` item when the
    entry has one; otherwise from the first ``<img src=...>`` found in the
    summary HTML. The plain text is the summary with all markup stripped.
    The summary is parsed only once and reused for both results.
    """
    soup = BeautifulSoup(resumen_html, 'html.parser') if resumen_html else None

    imagen_url = ""
    if "media_content" in entry and entry.media_content:
        # Assume the first media_content item is the relevant one.
        imagen_url = entry.media_content[0].get("url", "")
    elif soup is not None:
        img_tag = soup.find('img')
        if img_tag and img_tag.get('src'):
            imagen_url = img_tag['src']

    texto_plano = soup.get_text(separator=' ', strip=True) if soup is not None else ""
    return imagen_url, texto_plano


def _entry_datetime(entry):
    """Return the entry's publication date as a naive datetime, or ``None``.

    ``published_parsed`` is preferred over ``updated_parsed``. A value that
    cannot be turned into a ``datetime`` (for example a struct_time with
    seconds == 60) is skipped instead of aborting the whole feed.
    """
    for attr in ('published_parsed', 'updated_parsed'):
        parsed_time = getattr(entry, attr, None)
        if not parsed_time:
            continue
        try:
            return datetime(*parsed_time[:6])
        except (TypeError, ValueError):
            continue
    return None


def process_single_feed(feed_data):
    """Fetch and parse a single RSS feed and return its news items.

    Args:
        feed_data: Mapping with the keys ``'id'``, ``'url'``,
            ``'categoria_id'`` and ``'pais_id'``, plus the optional keys
            ``'last_etag'`` and ``'last_modified'`` used for a conditional
            GET.

    Returns:
        A tuple ``(feed_id, noticias, etag, modified, success)``.
        ``noticias`` is a list of tuples
        ``(noticia_id, titulo, resumen, link, fecha_publicacion,
        imagen_url, categoria_id, pais_id)``. ``etag`` and ``modified`` are
        the validators to store for the next request. On a 304 response the
        incoming validators are returned unchanged. On failure ``success``
        is ``False`` and both validators are ``None``.

    Network, HTTP and unexpected errors are logged and reported through
    ``success``; they are not raised to the caller.
    """
    # Local import: the module itself only imports ``datetime`` from ``datetime``.
    from datetime import timezone

    feed_id = feed_data['id']
    feed_url = feed_data['url']
    etag = feed_data.get('last_etag')
    modified = feed_data.get('last_modified')

    noticias_encontradas = []
    new_etag, new_modified = None, None
    success = False

    try:
        headers = {}
        if etag:
            headers['If-None-Match'] = etag
        if modified:
            headers['If-Modified-Since'] = modified

        response = requests.get(feed_url, headers=headers, timeout=NETWORK_TIMEOUT)
        response.raise_for_status()  # Raises HTTPError for 4xx/5xx; 304 passes through.

        if response.status_code == 304:
            logging.info(f"Feed {feed_url} (ID: {feed_id}) no modificado (304).")
            # Not modified: keep the validators we already have.
            return feed_id, [], etag, modified, True

        parsed = feedparser.parse(response.content)

        if parsed.bozo:
            if isinstance(parsed.bozo_exception, (feedparser.CharacterEncodingOverride, feedparser.NonXMLContentType)):
                # Usually harmless warnings; carry on with whatever was parsed.
                logging.warning(f"Advertencia al parsear feed {feed_url} (ID: {feed_id}): {parsed.bozo_exception}")
            elif isinstance(parsed.bozo_exception, xml.sax._exceptions.SAXParseException):
                # Invalid XML: treat the feed as failed.
                logging.error(f"Feed malformado para {feed_url} (ID: {feed_id}): {parsed.bozo_exception}")
                return feed_id, [], None, None, False
            else:
                logging.error(f"Excepción inesperada de bozo en feed {feed_url} (ID: {feed_id}): {parsed.bozo_exception}")
                return feed_id, [], None, None, False

        # The feed is parsed from raw bytes, so feedparser never sees the HTTP
        # headers; the cache validators must be read from the response itself.
        response_etag = response.headers.get('ETag')
        response_modified = response.headers.get('Last-Modified')

        if not parsed.entries:
            logging.warning(f"Feed {feed_url} (ID: {feed_id}) no contiene entradas.")
            return feed_id, [], response_etag, response_modified, True

        for entry in parsed.entries:
            link = entry.get("link")
            if not link:
                continue

            # The MD5 of the link serves as a stable identifier, not for security.
            noticia_id = hashlib.md5(link.encode()).hexdigest()
            titulo = entry.get("title", "Sin título")
            resumen_html = entry.get("summary", "")
            imagen_url, resumen_texto_plano = _summary_image_and_text(entry, resumen_html)

            fecha_publicacion = _entry_datetime(entry)
            if fecha_publicacion is None:
                # feedparser's *_parsed values are UTC, so the fallback is a
                # naive UTC "now" to keep all stored dates in one time zone.
                fecha_publicacion = datetime.now(timezone.utc).replace(tzinfo=None)

            noticias_encontradas.append(
                (noticia_id, titulo, resumen_texto_plano, link, fecha_publicacion, imagen_url, feed_data['categoria_id'], feed_data['pais_id'])
            )

        new_etag = response_etag
        new_modified = response_modified
        success = True

    except requests.exceptions.Timeout:
        logging.error(f"TIMEOUT al intentar obtener el feed {feed_url} (ID: {feed_id})")
    except requests.exceptions.RequestException as e:
        logging.error(f"Error de HTTP/red para el feed {feed_url} (ID: {feed_id}): {e}")
    except Exception as e:  # Worker boundary: log with traceback and report failure.
        logging.error(f"Excepción inesperada al procesar el feed {feed_url} (ID: {feed_id}): {e}", exc_info=True)

    return feed_id, noticias_encontradas, new_etag, new_modified, success
|
||||
Loading…
Add table
Add a link
Reference in a new issue