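"""URL worker.

Reads the active URL sources from the `fuentes_url` table, downloads and
parses each page with the newspaper library, saves new articles into the
`noticias` table, and records the outcome of every fetch on the source row.
"""
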
import logging
import hashlib
from datetime import datetime

import requests
from newspaper import Article, ArticleException, Config

from db import get_write_conn, get_read_conn


# Configuration
logger = logging.getLogger("url_worker")
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
)


def get_active_urls():
    """Get all active URL sources."""
    with get_read_conn() as conn:
        with conn.cursor() as cur:
            cur.execute("""
                SELECT id, nombre, url, categoria_id, pais_id, idioma
                FROM fuentes_url
                WHERE active = true
            """)
            return cur.fetchall()


def update_source_status(source_id, status, message, http_code=0):
    """Update the status of a URL source."""
    with get_write_conn() as conn:
        with conn.cursor() as cur:
            cur.execute("""
                UPDATE fuentes_url
                SET last_check = NOW(),
                    last_status = %s,
                    status_message = %s,
                    last_http_code = %s
                WHERE id = %s
            """, (status, message, http_code, source_id))
        conn.commit()


def save_article(source, article):
    """Save the extracted article to the database."""
    source_id, source_name, source_url, cat_id, pais_id, lang = source

    # Use the article url if possible, otherwise source_url
    final_url = article.url or source_url
    noticia_id = hashlib.md5(final_url.encode("utf-8")).hexdigest()

    with get_write_conn() as conn:
        with conn.cursor() as cur:
            # Check if the article already exists
            cur.execute("SELECT id FROM noticias WHERE id = %s", (noticia_id,))
            if cur.fetchone():
                return False  # Already exists

            # Prepare data
            title = article.title or "Sin título"
            summary = article.summary or article.text[:500]
            image_url = article.top_image
            pub_date = article.publish_date or datetime.utcnow()

            cur.execute("""
                INSERT INTO noticias (
                    id, titulo, resumen, url, fecha, imagen_url,
                    fuente_nombre, categoria_id, pais_id
                ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
                ON CONFLICT (id) DO NOTHING
            """, (
                noticia_id, title, summary, final_url, pub_date, image_url,
                source_name, cat_id, pais_id
            ))
        conn.commit()
    return True


def process_url(source):
    """Process a single URL source."""
    source_id, name, url, _, _, _ = source

    logger.info(f"Processing URL: {url} ({name})")

    try:
        # Browser-like headers
        user_agent = (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        )
        config = Config()
        config.browser_user_agent = user_agent
        config.request_timeout = 30

        article = Article(url, config=config, language='es')
        article.download()

        if not article.html:
            update_source_status(source_id, "ERROR", "No content downloaded (Empty HTML)", 0)
            return

        article.parse()
        try:
            # NLP (summary/keywords) is optional; fall back to raw text if it fails
            article.nlp()
        except Exception:
            pass

        if not article.title:
            update_source_status(source_id, "ERROR_PARSE", "Could not extract title (Page might not be an article)", 200)
            return

        saved = save_article(source, article)

        status_msg = "News created successfully" if saved else "News already exists"
        update_source_status(source_id, "OK", status_msg, 200)
        logger.info(f"Success {url}: {status_msg}")

    except ArticleException as ae:
        logger.error(f"Newspaper Error {url}: {ae}")
        update_source_status(source_id, "ERROR_DOWNLOAD", str(ae)[:200], 0)
    except requests.exceptions.RequestException as req_err:
        logger.error(f"Network Error {url}: {req_err}")
        update_source_status(source_id, "ERROR_NETWORK", str(req_err)[:200], 0)
    except Exception as e:
        logger.error(f"Unexpected Error {url}: {e}")
        update_source_status(source_id, "ERROR_UNKNOWN", str(e)[:200], 500)


def main():
    logger.info("Starting URL Worker")
    urls = get_active_urls()
    logger.info(f"Found {len(urls)} active URLs")
    for source in urls:
        process_url(source)


if __name__ == "__main__":
    main()