""" URL Feed Discovery Worker This worker automatically discovers RSS feeds from URLs stored in fuentes_url table and creates entries in the feeds table (or feeds_pending for review). Runs every 15 minutes. """ import os import sys import time import logging from datetime import datetime from typing import List, Dict # Add parent directory to path to import modules sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from db import get_conn from utils.feed_discovery import discover_feeds, get_feed_metadata from utils.feed_analysis import ( analyze_feed, get_country_id_by_name, get_category_id_by_name ) logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' ) logger = logging.getLogger(__name__) # Configuration CHECK_INTERVAL = int(os.getenv('URL_DISCOVERY_INTERVAL_MIN', '15')) * 60 # Default: 15 minutes BATCH_SIZE = int(os.getenv('URL_DISCOVERY_BATCH_SIZE', '10')) # Process URLs in batches MAX_FEEDS_PER_URL = int(os.getenv('MAX_FEEDS_PER_URL', '5')) # Max feeds to create per URL def get_pending_urls(limit: int = BATCH_SIZE) -> List[Dict]: """ Get URLs that need to be processed. Priority: never checked > failed checks > oldest successful checks """ with get_conn() as conn: with conn.cursor() as cur: cur.execute(""" SELECT id, nombre, url, categoria_id, pais_id, idioma, last_status FROM fuentes_url WHERE active = TRUE ORDER BY CASE WHEN last_check IS NULL THEN 1 -- Never checked (highest priority) WHEN last_status = 'error' THEN 2 -- Failed checks WHEN last_status = 'no_feeds' THEN 3 -- No feeds found ELSE 4 -- Successful checks (lowest priority) END, last_check ASC NULLS FIRST LIMIT %s """, (limit,)) columns = [desc[0] for desc in cur.description] return [dict(zip(columns, row)) for row in cur.fetchall()] def update_url_status(url_id: int, status: str, message: str = None, http_code: int = None): """Update the status of a URL source""" with get_conn() as conn: with conn.cursor() as cur: cur.execute(""" UPDATE fuentes_url SET last_check = NOW(), last_status = %s, status_message = %s, last_http_code = %s WHERE id = %s """, (status, message, http_code, url_id)) conn.commit() def create_pending_feed( fuente_url_id: int, feed_url: str, metadata: Dict, analysis: Dict, categoria_id: int = None, pais_id: int = None, idioma: str = None ) -> bool: """ Create a pending feed entry for manual review """ try: with get_conn() as conn: # Get detected country ID detected_country_id = None if analysis.get('detected_country'): detected_country_id = get_country_id_by_name(conn, analysis['detected_country']) # Get suggested category ID suggested_categoria_id = None if analysis.get('suggested_category'): suggested_categoria_id = get_category_id_by_name(conn, analysis['suggested_category']) with conn.cursor() as cur: cur.execute(""" INSERT INTO feeds_pending ( fuente_url_id, feed_url, feed_title, feed_description, feed_language, feed_type, entry_count, detected_country_id, suggested_categoria_id, categoria_id, pais_id, idioma, notes ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) ON CONFLICT (feed_url) DO UPDATE SET feed_title = EXCLUDED.feed_title, feed_description = EXCLUDED.feed_description, discovered_at = NOW() RETURNING id """, ( fuente_url_id, feed_url, metadata.get('title', 'Feed sin título'), metadata.get('description', '')[:500], analysis.get('language'), 'rss', # Default type metadata.get('entry_count', 0), detected_country_id, suggested_categoria_id, categoria_id, pais_id, idioma, analysis.get('analysis_notes', '') )) result = cur.fetchone() conn.commit() if result: logger.info(f"Created pending feed for review: {metadata.get('title')} ({feed_url})") return True else: logger.debug(f"Pending feed updated: {feed_url}") return False except Exception as e: logger.error(f"Error creating pending feed {feed_url}: {e}") return False def create_feed_from_metadata( feed_url: str, fuente_url_id: int = None, categoria_id: int = None, pais_id: int = None, idioma: str = None, auto_approve: bool = False, context_title: str = None ) -> Dict: """ Create a feed entry from discovered feed URL with intelligent analysis. Returns: { 'created': True/False, 'pending': True/False, 'status': 'created'/'pending'/'existing'/'error', 'message': 'Description' } """ result = { 'created': False, 'pending': False, 'status': 'error', 'message': '' } try: # Get feed metadata metadata = get_feed_metadata(feed_url, timeout=10) if not metadata: result['message'] = 'No se pudo obtener metadata del feed' logger.warning(f"{result['message']}: {feed_url}") return result # Add URL to metadata for analysis metadata['url'] = feed_url # Use context title if provided, otherwise use metadata title # This helps when feed XML title is generic (e.g. "RSS Feed") but site link had meaningful text feed_title = context_title if context_title else metadata.get('title', 'Feed sin título') # Update metadata for consistency in pending feeds AND analysis metadata['title'] = feed_title # Perform intelligent analysis analysis = analyze_feed(metadata) # Determine if we need manual review needs_review = False # If parent URL has no category or country, we need review if not categoria_id or not pais_id: needs_review = True logger.info(f"Feed needs review (missing categoria_id={categoria_id} or pais_id={pais_id})") # If auto_approve is disabled, we need review if not auto_approve: needs_review = True # Enhance metadata with analysis if not idioma and analysis.get('language'): idioma = analysis['language'] # If needs review, create pending feed if needs_review: created_pending = create_pending_feed( fuente_url_id=fuente_url_id, feed_url=feed_url, metadata=metadata, analysis=analysis, categoria_id=categoria_id, pais_id=pais_id, idioma=idioma ) result['pending'] = created_pending result['status'] = 'pending' result['message'] = f"Feed creado y pendiente de revisión (país: {analysis.get('detected_country', 'N/A')}, categoría sugerida: {analysis.get('suggested_category', 'N/A')})" return result # Otherwise, create feed directly nombre = feed_title descripcion = metadata.get('description', '') with get_conn() as conn: with conn.cursor() as cur: cur.execute(""" INSERT INTO feeds (nombre, descripcion, url, categoria_id, pais_id, idioma, fuente_url_id, activo) VALUES (%s, %s, %s, %s, %s, %s, %s, TRUE) ON CONFLICT (url) DO NOTHING RETURNING id """, ( nombre, descripcion[:500] if descripcion else None, feed_url, categoria_id, pais_id, idioma, fuente_url_id )) feed_result = cur.fetchone() conn.commit() if feed_result: logger.info(f"Created new feed: {nombre} ({feed_url})") result['created'] = True result['status'] = 'created' result['message'] = f"Feed creado exitosamente" else: logger.debug(f"Feed already exists: {feed_url}") result['status'] = 'existing' result['message'] = 'El feed ya existe' except Exception as e: logger.error(f"Error creating feed from {feed_url}: {e}") result['message'] = str(e) result['status'] = 'error' return result def process_url_source(url_data: Dict) -> Dict: """ Process a single URL source to discover and create feeds. Returns statistics about the operation. """ url_id = url_data['id'] source_url = url_data['url'] nombre = url_data['nombre'] categoria_id = url_data['categoria_id'] pais_id = url_data['pais_id'] idioma = url_data['idioma'] logger.info(f"Processing URL source: {nombre} ({source_url})") logger.info(f" Parent metadata: categoria_id={categoria_id}, pais_id={pais_id}, idioma={idioma}") stats = { 'url_id': url_id, 'url': source_url, 'discovered': 0, 'created': 0, 'pending': 0, 'existing': 0, 'errors': 0, 'status': 'unknown' } try: # Discover feeds from URL discovered = discover_feeds(source_url, timeout=15) stats['discovered'] = len(discovered) if not discovered: logger.warning(f"No feeds discovered from: {source_url}") update_url_status(url_id, 'no_feeds', 'No se encontraron feeds RSS', 200) stats['status'] = 'no_feeds' return stats # Filter only valid feeds valid_feeds = [f for f in discovered if f.get('valid', False)] if not valid_feeds: logger.warning(f"No valid feeds found for: {source_url}") update_url_status(url_id, 'no_valid_feeds', f'Se encontraron {len(discovered)} feeds pero ninguno válido') stats['status'] = 'no_valid_feeds' return stats # Limit number of feeds per URL feeds_to_create = valid_feeds[:MAX_FEEDS_PER_URL] logger.info(f"Found {len(valid_feeds)} valid feeds, processing up to {len(feeds_to_create)}") # Determine if auto-approve (parent has category AND country) auto_approve = bool(categoria_id and pais_id) if not auto_approve: logger.info("→ Feeds will require manual review (parent lacks category or country)") else: logger.info("→ Feeds will be auto-approved (parent has complete metadata)") # Create feeds for feed_info in feeds_to_create: feed_url = feed_info['url'] try: result = create_feed_from_metadata( feed_url=feed_url, fuente_url_id=url_id, categoria_id=categoria_id, pais_id=pais_id, idioma=idioma, auto_approve=auto_approve, context_title=feed_info.get('context_label') ) if result['status'] == 'created': stats['created'] += 1 elif result['status'] == 'pending': stats['pending'] += 1 elif result['status'] == 'existing': stats['existing'] += 1 else: stats['errors'] += 1 except Exception as e: logger.error(f"Error creating feed {feed_url}: {e}") stats['errors'] += 1 # Update URL status if stats['created'] > 0 or stats['pending'] > 0: parts = [] if stats['created'] > 0: parts.append(f"{stats['created']} creados") if stats['pending'] > 0: parts.append(f"{stats['pending']} pendientes de revisión") if stats['existing'] > 0: parts.append(f"{stats['existing']} ya existían") message = ", ".join(parts) update_url_status(url_id, 'success', message, 200) stats['status'] = 'success' elif stats['existing'] > 0: message = f"Todos los {stats['existing']} feeds ya existían" update_url_status(url_id, 'existing', message, 200) stats['status'] = 'existing' else: message = f"No se pudieron procesar feeds ({stats['errors']} errores)" update_url_status(url_id, 'error', message) stats['status'] = 'error' logger.info(f"Processed {source_url}: {stats['created']} created, {stats['pending']} pending, {stats['existing']} existing, {stats['errors']} errors") except Exception as e: logger.error(f"Error processing URL {source_url}: {e}") update_url_status(url_id, 'error', str(e)[:200]) stats['status'] = 'error' stats['errors'] += 1 return stats def process_batch(): """Process a batch of URL sources""" logger.info("=" * 80) logger.info(f"Starting URL discovery batch - {datetime.now().isoformat()}") # Get pending URLs urls = get_pending_urls(limit=BATCH_SIZE) if not urls: logger.info("No pending URLs to process") return logger.info(f"Processing {len(urls)} URL sources") # Process statistics total_stats = { 'processed': 0, 'discovered': 0, 'created': 0, 'pending': 0, 'existing': 0, 'errors': 0 } # Process each URL for url_data in urls: stats = process_url_source(url_data) total_stats['processed'] += 1 total_stats['discovered'] += stats['discovered'] total_stats['created'] += stats['created'] total_stats['pending'] += stats['pending'] total_stats['existing'] += stats['existing'] total_stats['errors'] += stats['errors'] # Small delay between URLs to avoid hammering servers time.sleep(2) # Log summary logger.info("-" * 80) logger.info(f"Batch complete:") logger.info(f" - Processed: {total_stats['processed']} URLs") logger.info(f" - Discovered: {total_stats['discovered']} feeds") logger.info(f" - Created: {total_stats['created']} new feeds") logger.info(f" - Pending review: {total_stats['pending']} feeds") logger.info(f" - Already existing: {total_stats['existing']} feeds") logger.info(f" - Errors: {total_stats['errors']}") logger.info("=" * 80) def main(): """Main worker loop""" logger.info("URL Feed Discovery Worker started") logger.info(f"Check interval: {CHECK_INTERVAL} seconds ({CHECK_INTERVAL // 60} minutes)") logger.info(f"Batch size: {BATCH_SIZE}") logger.info(f"Max feeds per URL: {MAX_FEEDS_PER_URL}") # Run immediately on start try: process_batch() except Exception as e: logger.error(f"Error in initial batch: {e}", exc_info=True) # Main loop while True: try: logger.info(f"Waiting {CHECK_INTERVAL} seconds until next batch...") time.sleep(CHECK_INTERVAL) process_batch() except KeyboardInterrupt: logger.info("Worker stopped by user") break except Exception as e: logger.error(f"Error in main loop: {e}", exc_info=True) # Wait a bit before retrying to avoid rapid failure loops time.sleep(60) if __name__ == "__main__": main()