Initial clean commit

jlimolina 2026-01-13 13:39:51 +01:00
commit 6784d81c2c
141 changed files with 25219 additions and 0 deletions

utils/feed_discovery.py (new file, 263 lines)

@@ -0,0 +1,263 @@
"""
Feed Discovery Utility
Provides functions to automatically discover RSS/Atom feeds from URLs.
"""
import feedfinder2
import feedparser
import requests
from typing import List, Dict, Optional
import logging
logger = logging.getLogger(__name__)


def discover_feeds(url: str, timeout: int = 15) -> List[Dict[str, str]]:
    """
    Discover RSS/Atom feeds from a given URL, including link text context.

    Args:
        url: The URL to search for feeds
        timeout: Request timeout in seconds

    Returns:
        List of dictionaries with feed information:
        [
            {
                'url': 'feed_url',
                'title': 'Feed Title',
                'context_label': 'Text on the link that pointed here',
                'type': 'rss' or 'atom',
                'valid': True/False
            }
        ]
    """
    discovered_feeds = []
    feed_context_map = {}

    try:
        # 1. Fetch the page ourselves so we can parse context (link names)
        logger.info(f"Fetching content from: {url}")
        headers = {'User-Agent': 'Mozilla/5.0 (compatible; RSS2-Feed-Discovery/1.0; +http://localhost)'}
        response = requests.get(url, timeout=timeout, headers=headers)

        if response.status_code == 200:
            from bs4 import BeautifulSoup
            from urllib.parse import urljoin

            soup = BeautifulSoup(response.content, 'html.parser')

            # Find <link rel="alternate"> tags in the document head
            for link in soup.find_all('link', rel='alternate'):
                if link.get('type') in ['application/rss+xml', 'application/atom+xml']:
                    href = link.get('href')
                    title = link.get('title')
                    if href:
                        abs_url = urljoin(url, href)
                        if title:
                            feed_context_map[abs_url] = title

            # Find <a> tags that might point to feeds
            for a in soup.find_all('a', href=True):
                href = a['href']
                # Simple heuristic for potential RSS links
                if any(x in href.lower() for x in ['.rss', '.xml', 'feed', 'rss']):
                    abs_url = urljoin(url, href)
                    text = a.get_text(strip=True)
                    title = a.get('title')
                    # Prefer the anchor text, then the title attribute
                    context = text if text else title
                    if context:
                        # Anchor text on index pages is often more descriptive
                        # ("Politics", "Sports") than the <link> title, which is
                        # usually just "Site Name RSS", so only fill in a label
                        # here if <link> did not already provide one.
                        if abs_url not in feed_context_map:
                            feed_context_map[abs_url] = context

        # 2. Use feedfinder2 for robust discovery
        logger.info(f"Discovering feeds from: {url}")
        feed_urls = feedfinder2.find_feeds(url)

        if not feed_urls:
            # Fallback: feedfinder2 applies strict rules and may have missed
            # feeds we found manually above, so add those as candidates.
            for mapped_url in feed_context_map.keys():
                if mapped_url not in feed_urls:
                    feed_urls.append(mapped_url)

        if not feed_urls:
            logger.warning(f"No feeds found for URL: {url}")
            return []

        logger.info(f"Found {len(feed_urls)} potential feeds")

        # 3. Validate each candidate and merge in the context labels
        for feed_url in feed_urls:
            feed_info = validate_feed(feed_url, timeout=timeout)
            if feed_info:
                # Attach the discovered context label; fall back to a
                # trailing-slash variation if the exact URL is not mapped.
                context = feed_context_map.get(feed_url)
                if not context:
                    context = feed_context_map.get(feed_url.rstrip('/'))
                feed_info['context_label'] = context or ""
                discovered_feeds.append(feed_info)

        # Sort by validity (valid feeds first)
        discovered_feeds.sort(key=lambda x: x['valid'], reverse=True)

        return discovered_feeds

    except Exception as e:
        logger.error(f"Error discovering feeds from {url}: {e}")
        return []
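

# Illustrative example only (hypothetical URL and values; the keys match the
# dictionaries built by discover_feeds() and validate_feed() above):
#
#   discover_feeds("https://news.example.com")
#   -> [
#        {'url': 'https://news.example.com/rss', 'title': 'Example News',
#         'description': 'Latest headlines', 'type': 'rss', 'version': 'rss20',
#         'valid': True, 'entry_count': 25, 'context_label': 'Politics'},
#      ]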


def validate_feed(feed_url: str, timeout: int = 10) -> Optional[Dict[str, str]]:
    """
    Validate and extract information from a feed URL.

    Args:
        feed_url: The feed URL to validate
        timeout: Request timeout in seconds

    Returns:
        Dictionary with feed information if valid, None otherwise
    """
    try:
        # Fetch the feed
        response = requests.get(
            feed_url,
            timeout=timeout,
            headers={'User-Agent': 'RSS2-Feed-Discovery/1.0'}
        )

        if response.status_code != 200:
            logger.warning(f"Feed returned status {response.status_code}: {feed_url}")
            return {
                'url': feed_url,
                'title': 'Unknown Feed',
                'type': 'unknown',
                'valid': False,
                'error': f'HTTP {response.status_code}'
            }

        # Parse the feed
        feed = feedparser.parse(response.content)

        if feed.bozo and not feed.entries:
            # Feed has errors and no entries
            logger.warning(f"Invalid feed (no entries): {feed_url}")
            return {
                'url': feed_url,
                'title': 'Invalid Feed',
                'type': 'unknown',
                'valid': False,
                'error': 'No entries found'
            }

        # Extract feed information
        feed_type = feed.get('version', 'unknown')
        if feed_type.startswith('rss'):
            feed_format = 'rss'
        elif feed_type.startswith('atom'):
            feed_format = 'atom'
        else:
            feed_format = 'unknown'

        # Get feed title
        title = feed.feed.get('title', 'Untitled Feed')

        # Get feed description
        description = feed.feed.get('description', '') or feed.feed.get('subtitle', '')

        return {
            'url': feed_url,
            'title': title,
            'description': description,
            'type': feed_format,
            'version': feed_type,
            'valid': True,
            'entry_count': len(feed.entries)
        }

    except requests.Timeout:
        logger.error(f"Timeout validating feed: {feed_url}")
        return {
            'url': feed_url,
            'title': 'Timeout',
            'type': 'unknown',
            'valid': False,
            'error': 'Request timeout'
        }
    except Exception as e:
        logger.error(f"Error validating feed {feed_url}: {e}")
        return {
            'url': feed_url,
            'title': 'Error',
            'type': 'unknown',
            'valid': False,
            'error': str(e)
        }


def get_feed_metadata(feed_url: str, timeout: int = 10) -> Optional[Dict[str, Any]]:
    """
    Get detailed metadata from a feed URL.

    Args:
        feed_url: The feed URL
        timeout: Request timeout in seconds

    Returns:
        Dictionary with detailed feed metadata
    """
    try:
        response = requests.get(
            feed_url,
            timeout=timeout,
            headers={'User-Agent': 'RSS2-Feed-Discovery/1.0'}
        )

        if response.status_code != 200:
            return None

        feed = feedparser.parse(response.content)

        if feed.bozo and not feed.entries:
            return None

        # Extract comprehensive metadata
        metadata = {
            'title': feed.feed.get('title', ''),
            'description': feed.feed.get('description', '') or feed.feed.get('subtitle', ''),
            'link': feed.feed.get('link', ''),
            'language': feed.feed.get('language', ''),
            'updated': feed.feed.get('updated', ''),
            'image_url': '',
            'entry_count': len(feed.entries),
            'entries': []
        }

        # Extract image if available
        if hasattr(feed.feed, 'image'):
            metadata['image_url'] = feed.feed.image.get('href', '')

        # Get first 5 entries as preview
        for entry in feed.entries[:5]:
            metadata['entries'].append({
                'title': entry.get('title', ''),
                'link': entry.get('link', ''),
                'published': entry.get('published', '')
            })

        return metadata

    except Exception as e:
        logger.error(f"Error getting feed metadata for {feed_url}: {e}")
        return None
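

# Minimal usage sketch (assumptions: run as a script; "https://example.com" is a
# placeholder, not a URL taken from the project). It exercises the helpers above
# and prints what they return.
if __name__ == "__main__":
    import json

    logging.basicConfig(level=logging.INFO)

    target = "https://example.com"  # placeholder site to scan for feeds
    feeds = discover_feeds(target)
    print(f"Discovered {len(feeds)} feed(s) for {target}")

    for feed in feeds:
        label = feed.get('context_label') or feed['title']
        print(f"- {label}: {feed['url']} (valid={feed['valid']})")

    # Fetch richer metadata for the first valid feed, if any
    valid_feeds = [f for f in feeds if f['valid']]
    if valid_feeds:
        metadata = get_feed_metadata(valid_feeds[0]['url'])
        if metadata:
            print(json.dumps(metadata, indent=2, ensure_ascii=False))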