Initial clean commit
Commit 6784d81c2c
141 changed files with 25219 additions and 0 deletions
workers/url_discovery_worker.py (new file, 471 lines)

@@ -0,0 +1,471 @@
"""
|
||||
URL Feed Discovery Worker
|
||||
This worker automatically discovers RSS feeds from URLs stored in fuentes_url table
|
||||
and creates entries in the feeds table (or feeds_pending for review).
|
||||
Runs every 15 minutes.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import logging
|
||||
from datetime import datetime
|
||||
from typing import List, Dict
|
||||
|
||||
# Add parent directory to path to import modules
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
from db import get_conn
|
||||
from utils.feed_discovery import discover_feeds, get_feed_metadata
|
||||
from utils.feed_analysis import (
|
||||
analyze_feed,
|
||||
get_country_id_by_name,
|
||||
get_category_id_by_name
|
||||
)
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Configuration
|
||||
CHECK_INTERVAL = int(os.getenv('URL_DISCOVERY_INTERVAL_MIN', '15')) * 60 # Default: 15 minutes
|
||||
BATCH_SIZE = int(os.getenv('URL_DISCOVERY_BATCH_SIZE', '10')) # Process URLs in batches
|
||||
MAX_FEEDS_PER_URL = int(os.getenv('MAX_FEEDS_PER_URL', '5')) # Max feeds to create per URL
|
||||
|
||||
|
||||
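
# Tuning is done through environment variables; the names come from the
# os.getenv() calls above, and the values below are only an illustration:
#
#   export URL_DISCOVERY_INTERVAL_MIN=30   # run a batch every 30 minutes
#   export URL_DISCOVERY_BATCH_SIZE=20     # process 20 fuentes_url rows per batch
#   export MAX_FEEDS_PER_URL=3             # create at most 3 feeds per source URL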


def get_pending_urls(limit: int = BATCH_SIZE) -> List[Dict]:
    """
    Get URLs that need to be processed.
    Priority: never checked > failed checks > oldest successful checks
    """
    with get_conn() as conn:
        with conn.cursor() as cur:
            cur.execute("""
                SELECT id, nombre, url, categoria_id, pais_id, idioma, last_status
                FROM fuentes_url
                WHERE active = TRUE
                ORDER BY
                    CASE
                        WHEN last_check IS NULL THEN 1        -- Never checked (highest priority)
                        WHEN last_status = 'error' THEN 2     -- Failed checks
                        WHEN last_status = 'no_feeds' THEN 3  -- No feeds found
                        ELSE 4                                -- Successful checks (lowest priority)
                    END,
                    last_check ASC NULLS FIRST
                LIMIT %s
            """, (limit,))

            columns = [desc[0] for desc in cur.description]
            return [dict(zip(columns, row)) for row in cur.fetchall()]
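
# Note: get_pending_urls() returns plain dicts keyed by the selected columns.
# An illustrative row (the values are invented):
#     {'id': 42, 'nombre': 'Example Site', 'url': 'https://example.com',
#      'categoria_id': 3, 'pais_id': 7, 'idioma': 'es', 'last_status': None}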


def update_url_status(url_id: int, status: str, message: str = None, http_code: int = None):
    """Update the status of a URL source"""
    with get_conn() as conn:
        with conn.cursor() as cur:
            cur.execute("""
                UPDATE fuentes_url
                SET last_check = NOW(),
                    last_status = %s,
                    status_message = %s,
                    last_http_code = %s
                WHERE id = %s
            """, (status, message, http_code, url_id))
            conn.commit()
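
# In this worker, update_url_status() is called with the last_status values
# 'success', 'existing', 'no_feeds', 'no_valid_feeds' and 'error'
# (see process_url_source() below).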


def create_pending_feed(
    fuente_url_id: int,
    feed_url: str,
    metadata: Dict,
    analysis: Dict,
    categoria_id: int = None,
    pais_id: int = None,
    idioma: str = None
) -> bool:
    """
    Create a pending feed entry for manual review
    """
    try:
        with get_conn() as conn:
            # Get detected country ID
            detected_country_id = None
            if analysis.get('detected_country'):
                detected_country_id = get_country_id_by_name(conn, analysis['detected_country'])

            # Get suggested category ID
            suggested_categoria_id = None
            if analysis.get('suggested_category'):
                suggested_categoria_id = get_category_id_by_name(conn, analysis['suggested_category'])

            with conn.cursor() as cur:
                cur.execute("""
                    INSERT INTO feeds_pending (
                        fuente_url_id, feed_url, feed_title, feed_description,
                        feed_language, feed_type, entry_count,
                        detected_country_id, suggested_categoria_id,
                        categoria_id, pais_id, idioma, notes
                    )
                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                    ON CONFLICT (feed_url) DO UPDATE
                    SET feed_title = EXCLUDED.feed_title,
                        feed_description = EXCLUDED.feed_description,
                        discovered_at = NOW()
                    RETURNING id
                """, (
                    fuente_url_id,
                    feed_url,
                    metadata.get('title', 'Feed sin título'),
                    metadata.get('description', '')[:500],
                    analysis.get('language'),
                    'rss',  # Default type
                    metadata.get('entry_count', 0),
                    detected_country_id,
                    suggested_categoria_id,
                    categoria_id,
                    pais_id,
                    idioma,
                    analysis.get('analysis_notes', '')
                ))

                result = cur.fetchone()
                conn.commit()

                if result:
                    logger.info(f"Created pending feed for review: {metadata.get('title')} ({feed_url})")
                    return True
                else:
                    logger.debug(f"Pending feed updated: {feed_url}")
                    return False

    except Exception as e:
        logger.error(f"Error creating pending feed {feed_url}: {e}")
        return False
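
# Note: the ON CONFLICT (feed_url) clause in create_pending_feed() assumes that
# feeds_pending has a unique constraint (or unique index) on feed_url; PostgreSQL
# rejects the statement otherwise. The same applies to ON CONFLICT (url) on the
# feeds table in create_feed_from_metadata() below.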


def create_feed_from_metadata(
    feed_url: str,
    fuente_url_id: int = None,
    categoria_id: int = None,
    pais_id: int = None,
    idioma: str = None,
    auto_approve: bool = False,
    context_title: str = None
) -> Dict:
    """
    Create a feed entry from a discovered feed URL with intelligent analysis.

    Returns:
        {
            'created': True/False,
            'pending': True/False,
            'status': 'created'/'pending'/'existing'/'error',
            'message': 'Description'
        }
    """
    result = {
        'created': False,
        'pending': False,
        'status': 'error',
        'message': ''
    }

    try:
        # Get feed metadata
        metadata = get_feed_metadata(feed_url, timeout=10)

        if not metadata:
            result['message'] = 'No se pudo obtener metadata del feed'
            logger.warning(f"{result['message']}: {feed_url}")
            return result

        # Add URL to metadata for analysis
        metadata['url'] = feed_url

        # Use the context title if provided, otherwise use the metadata title.
        # This helps when the feed XML title is generic (e.g. "RSS Feed") but the site link had meaningful text.
        feed_title = context_title if context_title else metadata.get('title', 'Feed sin título')
        # Update metadata for consistency in pending feeds AND analysis
        metadata['title'] = feed_title

        # Perform intelligent analysis
        analysis = analyze_feed(metadata)

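        # analyze_feed() is assumed to return a dict with keys such as
        # 'language', 'detected_country', 'suggested_category' and
        # 'analysis_notes'; only the keys actually read below (and in
        # create_pending_feed() above) are relied upon.
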
        # Determine if we need manual review
        needs_review = False

        # If the parent URL has no category or country, we need review
        if not categoria_id or not pais_id:
            needs_review = True
            logger.info(f"Feed needs review (missing categoria_id={categoria_id} or pais_id={pais_id})")

        # If auto_approve is disabled, we need review
        if not auto_approve:
            needs_review = True

        # Enhance metadata with analysis
        if not idioma and analysis.get('language'):
            idioma = analysis['language']

        # If review is needed, create a pending feed
        if needs_review:
            created_pending = create_pending_feed(
                fuente_url_id=fuente_url_id,
                feed_url=feed_url,
                metadata=metadata,
                analysis=analysis,
                categoria_id=categoria_id,
                pais_id=pais_id,
                idioma=idioma
            )

            result['pending'] = created_pending
            result['status'] = 'pending'
            result['message'] = f"Feed creado y pendiente de revisión (país: {analysis.get('detected_country', 'N/A')}, categoría sugerida: {analysis.get('suggested_category', 'N/A')})"
            return result

        # Otherwise, create the feed directly
        nombre = feed_title
        descripcion = metadata.get('description', '')

        with get_conn() as conn:
            with conn.cursor() as cur:
                cur.execute("""
                    INSERT INTO feeds (nombre, descripcion, url, categoria_id, pais_id, idioma, fuente_url_id, activo)
                    VALUES (%s, %s, %s, %s, %s, %s, %s, TRUE)
                    ON CONFLICT (url) DO NOTHING
                    RETURNING id
                """, (
                    nombre,
                    descripcion[:500] if descripcion else None,
                    feed_url,
                    categoria_id,
                    pais_id,
                    idioma,
                    fuente_url_id
                ))

                feed_result = cur.fetchone()
                conn.commit()

                if feed_result:
                    logger.info(f"Created new feed: {nombre} ({feed_url})")
                    result['created'] = True
                    result['status'] = 'created'
                    result['message'] = "Feed creado exitosamente"
                else:
                    logger.debug(f"Feed already exists: {feed_url}")
                    result['status'] = 'existing'
                    result['message'] = 'El feed ya existe'

    except Exception as e:
        logger.error(f"Error creating feed from {feed_url}: {e}")
        result['message'] = str(e)
        result['status'] = 'error'

    return result


def process_url_source(url_data: Dict) -> Dict:
    """
    Process a single URL source to discover and create feeds.
    Returns statistics about the operation.
    """
    url_id = url_data['id']
    source_url = url_data['url']
    nombre = url_data['nombre']
    categoria_id = url_data['categoria_id']
    pais_id = url_data['pais_id']
    idioma = url_data['idioma']

    logger.info(f"Processing URL source: {nombre} ({source_url})")
    logger.info(f"  Parent metadata: categoria_id={categoria_id}, pais_id={pais_id}, idioma={idioma}")

    stats = {
        'url_id': url_id,
        'url': source_url,
        'discovered': 0,
        'created': 0,
        'pending': 0,
        'existing': 0,
        'errors': 0,
        'status': 'unknown'
    }

    try:
        # Discover feeds from URL
        discovered = discover_feeds(source_url, timeout=15)
        stats['discovered'] = len(discovered)

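        # discover_feeds() is assumed to return a list of dicts with at least a
        # 'url' key, a 'valid' flag and, optionally, a 'context_label' holding the
        # anchor text of the link where the feed was found, e.g. (invented values):
        #     {'url': 'https://example.com/rss', 'valid': True, 'context_label': 'Noticias'}
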
        if not discovered:
            logger.warning(f"No feeds discovered from: {source_url}")
            update_url_status(url_id, 'no_feeds', 'No se encontraron feeds RSS', 200)
            stats['status'] = 'no_feeds'
            return stats

        # Filter only valid feeds
        valid_feeds = [f for f in discovered if f.get('valid', False)]

        if not valid_feeds:
            logger.warning(f"No valid feeds found for: {source_url}")
            update_url_status(url_id, 'no_valid_feeds', f'Se encontraron {len(discovered)} feeds pero ninguno válido')
            stats['status'] = 'no_valid_feeds'
            return stats

        # Limit number of feeds per URL
        feeds_to_create = valid_feeds[:MAX_FEEDS_PER_URL]

        logger.info(f"Found {len(valid_feeds)} valid feeds, processing up to {len(feeds_to_create)}")

        # Determine if auto-approve (parent has category AND country)
        auto_approve = bool(categoria_id and pais_id)

        if not auto_approve:
            logger.info("→ Feeds will require manual review (parent lacks category or country)")
        else:
            logger.info("→ Feeds will be auto-approved (parent has complete metadata)")

        # Create feeds
        for feed_info in feeds_to_create:
            feed_url = feed_info['url']

            try:
                result = create_feed_from_metadata(
                    feed_url=feed_url,
                    fuente_url_id=url_id,
                    categoria_id=categoria_id,
                    pais_id=pais_id,
                    idioma=idioma,
                    auto_approve=auto_approve,
                    context_title=feed_info.get('context_label')
                )

                if result['status'] == 'created':
                    stats['created'] += 1
                elif result['status'] == 'pending':
                    stats['pending'] += 1
                elif result['status'] == 'existing':
                    stats['existing'] += 1
                else:
                    stats['errors'] += 1

            except Exception as e:
                logger.error(f"Error creating feed {feed_url}: {e}")
                stats['errors'] += 1

        # Update URL status
        if stats['created'] > 0 or stats['pending'] > 0:
            parts = []
            if stats['created'] > 0:
                parts.append(f"{stats['created']} creados")
            if stats['pending'] > 0:
                parts.append(f"{stats['pending']} pendientes de revisión")
            if stats['existing'] > 0:
                parts.append(f"{stats['existing']} ya existían")

            message = ", ".join(parts)
            update_url_status(url_id, 'success', message, 200)
            stats['status'] = 'success'
        elif stats['existing'] > 0:
            message = f"Todos los {stats['existing']} feeds ya existían"
            update_url_status(url_id, 'existing', message, 200)
            stats['status'] = 'existing'
        else:
            message = f"No se pudieron procesar feeds ({stats['errors']} errores)"
            update_url_status(url_id, 'error', message)
            stats['status'] = 'error'

        logger.info(f"Processed {source_url}: {stats['created']} created, {stats['pending']} pending, {stats['existing']} existing, {stats['errors']} errors")

    except Exception as e:
        logger.error(f"Error processing URL {source_url}: {e}")
        update_url_status(url_id, 'error', str(e)[:200])
        stats['status'] = 'error'
        stats['errors'] += 1

    return stats


def process_batch():
    """Process a batch of URL sources"""
    logger.info("=" * 80)
    logger.info(f"Starting URL discovery batch - {datetime.now().isoformat()}")

    # Get pending URLs
    urls = get_pending_urls(limit=BATCH_SIZE)

    if not urls:
        logger.info("No pending URLs to process")
        return

    logger.info(f"Processing {len(urls)} URL sources")

    # Process statistics
    total_stats = {
        'processed': 0,
        'discovered': 0,
        'created': 0,
        'pending': 0,
        'existing': 0,
        'errors': 0
    }

    # Process each URL
    for url_data in urls:
        stats = process_url_source(url_data)

        total_stats['processed'] += 1
        total_stats['discovered'] += stats['discovered']
        total_stats['created'] += stats['created']
        total_stats['pending'] += stats['pending']
        total_stats['existing'] += stats['existing']
        total_stats['errors'] += stats['errors']

        # Small delay between URLs to avoid hammering servers
        time.sleep(2)

    # Log summary
    logger.info("-" * 80)
    logger.info("Batch complete:")
    logger.info(f"  - Processed: {total_stats['processed']} URLs")
    logger.info(f"  - Discovered: {total_stats['discovered']} feeds")
    logger.info(f"  - Created: {total_stats['created']} new feeds")
    logger.info(f"  - Pending review: {total_stats['pending']} feeds")
    logger.info(f"  - Already existing: {total_stats['existing']} feeds")
    logger.info(f"  - Errors: {total_stats['errors']}")
    logger.info("=" * 80)


def main():
    """Main worker loop"""
    logger.info("URL Feed Discovery Worker started")
    logger.info(f"Check interval: {CHECK_INTERVAL} seconds ({CHECK_INTERVAL // 60} minutes)")
    logger.info(f"Batch size: {BATCH_SIZE}")
    logger.info(f"Max feeds per URL: {MAX_FEEDS_PER_URL}")

    # Run immediately on start
    try:
        process_batch()
    except Exception as e:
        logger.error(f"Error in initial batch: {e}", exc_info=True)

    # Main loop
    while True:
        try:
            logger.info(f"Waiting {CHECK_INTERVAL} seconds until next batch...")
            time.sleep(CHECK_INTERVAL)
            process_batch()

        except KeyboardInterrupt:
            logger.info("Worker stopped by user")
            break
        except Exception as e:
            logger.error(f"Error in main loop: {e}", exc_info=True)
            # Wait a bit before retrying to avoid rapid failure loops
            time.sleep(60)
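

# The worker is intended to run as a long-lived process, e.g.:
#     python workers/url_discovery_worker.py
# It processes one batch immediately on start and then loops on CHECK_INTERVAL.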
if __name__ == "__main__":
    main()