import hashlib
import logging
from datetime import datetime
from xml.sax import SAXParseException

import feedparser
import requests
from bs4 import BeautifulSoup

NETWORK_TIMEOUT = 15  # seconds


def process_single_feed(feed_data):
    """
    Process a single RSS feed, download its news items and return the collected information.
    """
    feed_id = feed_data['id']
    feed_url = feed_data['url']

    # --- KEY LINE ---
    # Get the feed's name so it can be used as the source of each news item.
    feed_nombre = feed_data.get('nombre', 'Fuente Desconocida')

    etag = feed_data.get('last_etag')
    modified = feed_data.get('last_modified')

    noticias_encontradas = []
    new_etag, new_modified = None, None
    success = False

    try:
        headers = {'User-Agent': 'RssApp/1.0'}
        if etag:
            headers['If-None-Match'] = etag
        if modified:
            headers['If-Modified-Since'] = modified

        response = requests.get(feed_url, headers=headers, timeout=NETWORK_TIMEOUT)
        response.raise_for_status()

        if response.status_code == 304:
            logging.info(f"Feed {feed_url} (ID: {feed_id}) not modified (304).")
            return feed_id, [], etag, modified, True

        parsed = feedparser.parse(response.content)

        if parsed.bozo and isinstance(parsed.bozo_exception, SAXParseException):
            logging.error(f"Malformed feed for {feed_url} (ID: {feed_id}): {parsed.bozo_exception}")
            return feed_id, [], None, None, False

        if not parsed.entries:
            logging.warning(f"Feed {feed_url} (ID: {feed_id}) contains no entries.")
            # feedparser only exposes etag/modified when it performs the HTTP fetch itself;
            # since we parsed raw bytes, take the caching headers from the response instead.
            return feed_id, [], response.headers.get('ETag'), response.headers.get('Last-Modified'), True

        for entry in parsed.entries:
            link = entry.get("link")
            if not link:
                continue

            noticia_id = hashlib.md5(link.encode()).hexdigest()
            titulo = entry.get("title", "Sin título")
            resumen_html = entry.get("summary", "")

            # Prefer an explicit media_content image; otherwise fall back to the first
            # <img> tag found inside the HTML summary.
            imagen_url = ""
            if "media_content" in entry and entry.media_content:
                imagen_url = entry.media_content[0].get("url", "")
            elif resumen_html:
                soup = BeautifulSoup(resumen_html, 'html.parser')
                img_tag = soup.find('img')
                if img_tag and img_tag.get('src'):
                    imagen_url = img_tag['src']

            resumen_texto_plano = BeautifulSoup(resumen_html, 'html.parser').get_text(separator=' ', strip=True)

            fecha_publicacion = datetime.now()
            if hasattr(entry, 'published_parsed') and entry.published_parsed:
                fecha_publicacion = datetime(*entry.published_parsed[:6])
            elif hasattr(entry, 'updated_parsed') and entry.updated_parsed:
                fecha_publicacion = datetime(*entry.updated_parsed[:6])

            # --- KEY LINE ---
            # Include 'feed_nombre' in the data tuple that will be stored in the DB.
            noticias_encontradas.append(
                (noticia_id, titulo, resumen_texto_plano, link, fecha_publicacion,
                 imagen_url, feed_nombre, feed_data['categoria_id'], feed_data['pais_id'])
            )

        new_etag = response.headers.get('ETag')
        new_modified = response.headers.get('Last-Modified')
        success = True

    except requests.exceptions.Timeout:
        logging.error(f"TIMEOUT while fetching feed {feed_url} (ID: {feed_id})")
    except requests.exceptions.RequestException as e:
        logging.error(f"HTTP/network error for feed {feed_url} (ID: {feed_id}): {e}")
    except Exception as e:
        logging.error(f"Unexpected exception while processing feed {feed_url} (ID: {feed_id}): {e}", exc_info=True)

    return feed_id, noticias_encontradas, new_etag, new_modified, success
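

# Minimal usage sketch (not part of the original module): it assumes a feed_data
# dict with the keys read above ('id', 'url', 'nombre', 'categoria_id', 'pais_id',
# and optionally 'last_etag'/'last_modified'). The URL and IDs below are
# placeholders; persisting the returned items and caching headers is left to the caller.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    sample_feed = {
        'id': 1,
        'url': 'https://example.com/rss.xml',  # placeholder feed URL
        'nombre': 'Example Source',
        'categoria_id': 1,
        'pais_id': 1,
        'last_etag': None,
        'last_modified': None,
    }

    feed_id, noticias, etag, modified, ok = process_single_feed(sample_feed)
    print(f"feed {feed_id}: ok={ok}, {len(noticias)} items, "
          f"ETag={etag!r}, Last-Modified={modified!r}")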