rss/feed_processor.py
2025-06-15 22:45:55 +02:00

95 lines
3.8 KiB
Python

import hashlib
from datetime import datetime
import logging
import feedparser
from bs4 import BeautifulSoup
import requests
import xml.sax._exceptions
# Timeout, in seconds, passed to requests.get() when downloading a feed.
NETWORK_TIMEOUT = 15  # seconds
def process_single_feed(feed_data):
    """
    Download and parse a single RSS feed and return the news items found in it.

    The request is conditional: when the feed record carries a previous ETag
    and/or Last-Modified value, they are sent as ``If-None-Match`` /
    ``If-Modified-Since`` so that an unchanged feed is answered with HTTP 304.

    Args:
        feed_data: Mapping describing the feed. ``id`` and ``url`` are
            required. ``nombre``, ``last_etag`` and ``last_modified`` are
            optional. ``categoria_id`` and ``pais_id`` are read for every
            entry that is turned into a news item.

    Returns:
        A 5-tuple ``(feed_id, noticias, etag, modified, success)``:

        * ``feed_id`` -- ``feed_data['id']``.
        * ``noticias`` -- list of tuples ``(noticia_id, titulo, resumen,
          link, fecha_publicacion, imagen_url, feed_nombre, categoria_id,
          pais_id)``; entries without a link are skipped.
        * ``etag`` / ``modified`` -- the ``ETag`` and ``Last-Modified``
          response headers on a successful download, the values from
          ``feed_data`` on HTTP 304, and ``None`` on failure.
        * ``success`` -- ``True`` when the feed was fetched (or was not
          modified), ``False`` on network errors, malformed XML or any
          unexpected exception.

    Raises:
        KeyError: if ``feed_data`` lacks ``id`` or ``url``. Every other error
            is logged and reported through ``success=False``.
    """
    feed_id = feed_data['id']
    feed_url = feed_data['url']
    # The feed's display name is stored with each news item as its source.
    feed_nombre = feed_data.get('nombre', 'Fuente Desconocida')
    etag = feed_data.get('last_etag')
    modified = feed_data.get('last_modified')

    noticias_encontradas = []
    new_etag, new_modified = None, None
    success = False

    try:
        headers = {'User-Agent': 'RssApp/1.0'}
        if etag:
            headers['If-None-Match'] = etag
        if modified:
            headers['If-Modified-Since'] = modified

        response = requests.get(feed_url, headers=headers, timeout=NETWORK_TIMEOUT)
        # Raises for 4xx/5xx only; a 304 falls through to the check below.
        response.raise_for_status()

        if response.status_code == 304:
            logging.info(f"Feed {feed_url} (ID: {feed_id}) no modificado (304).")
            # Nothing new: hand back the validators we already had.
            return feed_id, [], etag, modified, True

        parsed = feedparser.parse(response.content)
        if parsed.bozo and isinstance(parsed.bozo_exception, xml.sax.SAXParseException):
            logging.error(f"Feed malformado para {feed_url} (ID: {feed_id}): {parsed.bozo_exception}")
            return feed_id, [], None, None, False

        # feedparser only received the raw body, so it never saw the HTTP
        # headers; the cache validators must come from the response itself.
        response_etag = response.headers.get('ETag')
        response_modified = response.headers.get('Last-Modified')

        if not parsed.entries:
            logging.warning(f"Feed {feed_url} (ID: {feed_id}) no contiene entradas.")
            return feed_id, [], response_etag, response_modified, True

        for entry in parsed.entries:
            link = entry.get("link")
            if not link:
                continue

            # The link is the item's identity; MD5 is used only as a stable
            # fixed-length key, not for security.
            noticia_id = hashlib.md5(link.encode()).hexdigest()
            titulo = entry.get("title", "Sin título")
            resumen_html = entry.get("summary", "")

            # Parse the summary once and reuse it for both the image lookup
            # and the plain-text extraction.
            soup = BeautifulSoup(resumen_html, 'html.parser')

            imagen_url = ""
            if "media_content" in entry and entry.media_content:
                imagen_url = entry.media_content[0].get("url", "")
            if not imagen_url:
                # No usable media URL: fall back to the first <img> in the summary.
                img_tag = soup.find('img')
                if img_tag and img_tag.get('src'):
                    imagen_url = img_tag['src']

            resumen_texto_plano = soup.get_text(separator=' ', strip=True)

            # Prefer the published date, then the updated date, then "now".
            # NOTE(review): feedparser documents *_parsed as UTC, whereas this
            # fallback is naive local time -- confirm which one the DB expects.
            fecha_tupla = (getattr(entry, 'published_parsed', None)
                           or getattr(entry, 'updated_parsed', None))
            if fecha_tupla:
                fecha_publicacion = datetime(*fecha_tupla[:6])
            else:
                fecha_publicacion = datetime.now()

            # 'feed_nombre' travels with the row so the DB stores the source.
            noticias_encontradas.append(
                (noticia_id, titulo, resumen_texto_plano, link, fecha_publicacion,
                 imagen_url, feed_nombre, feed_data['categoria_id'], feed_data['pais_id'])
            )

        new_etag = response_etag
        new_modified = response_modified
        success = True
    except requests.exceptions.Timeout:
        logging.error(f"TIMEOUT al intentar obtener el feed {feed_url} (ID: {feed_id})")
    except requests.exceptions.RequestException as e:
        logging.error(f"Error de HTTP/red para el feed {feed_url} (ID: {feed_id}): {e}")
    except Exception as e:
        # Top-level boundary for one feed: log with traceback and report
        # failure instead of letting one bad feed abort the caller.
        logging.error(f"Excepción inesperada al procesar el feed {feed_url} (ID: {feed_id}): {e}", exc_info=True)

    return feed_id, noticias_encontradas, new_etag, new_modified, success