rss/feed_processor.py

import hashlib
import re
from datetime import datetime
import logging
import feedparser
from bs4 import BeautifulSoup
import requests
import xml.sax._exceptions

NETWORK_TIMEOUT = 15

def process_single_feed(feed_data):
    """
    Procesa un único feed RSS, descarga sus noticias y devuelve la información.
    """
    feed_id = feed_data['id']
    feed_url = feed_data['url']
    feed_nombre = feed_data.get('nombre', 'Fuente Desconocida')
    etag = feed_data.get('last_etag')
    modified = feed_data.get('last_modified')

    noticias_encontradas = []
    new_etag, new_modified = None, None
    success = False

    try:
        headers = {'User-Agent': 'RssApp/1.0'}
        if etag:
            headers['If-None-Match'] = etag
        if modified:
            headers['If-Modified-Since'] = modified

        response = requests.get(feed_url, headers=headers, timeout=NETWORK_TIMEOUT)
        response.raise_for_status()

        if response.status_code == 304:
            logging.info(f"Feed {feed_url} (ID: {feed_id}) no modificado (304).")
            return feed_id, [], etag, modified, True

        parsed = feedparser.parse(response.content)

        if parsed.bozo and isinstance(parsed.bozo_exception, xml.sax._exceptions.SAXParseException):
            logging.error(f"Feed malformado para {feed_url} (ID: {feed_id}): {parsed.bozo_exception}")
            return feed_id, [], None, None, False

        if not parsed.entries:
            logging.warning(f"Feed {feed_url} (ID: {feed_id}) no contiene entradas.")
            return feed_id, [], parsed.get('etag'), parsed.get('modified'), True

        for entry in parsed.entries:
            link = entry.get("link")
            if not link:
                continue

            noticia_id = hashlib.md5(link.encode()).hexdigest()
            titulo = entry.get("title", "Sin título")

            resumen_html = ""
            if hasattr(entry, 'content') and entry.content:
                resumen_html = entry.content[0].value
            elif hasattr(entry, 'summary'):
                resumen_html = entry.summary

            if resumen_html:
                resumen_html = re.sub(r'\[\[\{.*?\}\]\]', '', resumen_html)

            imagen_url = ""
            if "media_content" in entry and entry.media_content:
                imagen_url = entry.media_content[0].get("url", "")
            elif resumen_html:
                soup = BeautifulSoup(resumen_html, 'html.parser')
                img_tag = soup.find('img')
                if img_tag and img_tag.get('src'):
                    imagen_url = img_tag['src']

            resumen_texto_plano = BeautifulSoup(resumen_html, 'html.parser').get_text(separator=' ', strip=True)

            fecha_publicacion = datetime.now()
            if hasattr(entry, 'published_parsed') and entry.published_parsed:
                fecha_publicacion = datetime(*entry.published_parsed[:6])
            elif hasattr(entry, 'updated_parsed') and entry.updated_parsed:
                fecha_publicacion = datetime(*entry.updated_parsed[:6])

            noticias_encontradas.append(
                (
                    noticia_id,
                    titulo,
                    resumen_texto_plano,
                    link,
                    fecha_publicacion,
                    imagen_url,
                    feed_nombre,
                    feed_data['categoria_id'],
                    feed_data['pais_id']
                )
            )

        new_etag = response.headers.get('ETag')
        new_modified = response.headers.get('Last-Modified')
        success = True

    except requests.exceptions.Timeout:
        logging.error(f"TIMEOUT al intentar obtener el feed {feed_url} (ID: {feed_id})")
    except requests.exceptions.RequestException as e:
        logging.error(f"Error de HTTP/red para el feed {feed_url} (ID: {feed_id}): {e}")
    except Exception as e:
        logging.error(f"Excepción inesperada al procesar el feed {feed_url} (ID: {feed_id}): {e}", exc_info=True)

    return feed_id, noticias_encontradas, new_etag, new_modified, success