Initial clean commit
commit 6784d81c2c
141 changed files with 25219 additions and 0 deletions

workers/url_worker.py | 125 (new file)
@@ -0,0 +1,125 @@
import logging
import hashlib
from datetime import datetime

from newspaper import Article, ArticleException, Config
import requests

from db import get_write_conn, get_read_conn

# Configuration
logger = logging.getLogger("url_worker")
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')


def get_active_urls():
    """Get all active URL sources."""
    with get_read_conn() as conn:
        with conn.cursor() as cur:
            cur.execute("""
                SELECT id, nombre, url, categoria_id, pais_id, idioma
                FROM fuentes_url
                WHERE active = true
            """)
            return cur.fetchall()


def update_source_status(source_id, status, message, http_code=0):
    """Update the status of a URL source."""
    with get_write_conn() as conn:
        with conn.cursor() as cur:
            cur.execute("""
                UPDATE fuentes_url
                SET last_check = NOW(),
                    last_status = %s,
                    status_message = %s,
                    last_http_code = %s
                WHERE id = %s
            """, (status, message, http_code, source_id))
        conn.commit()


def save_article(source, article):
    """Save the extracted article to the database."""
    source_id, source_name, source_url, cat_id, pais_id, lang = source

    # Use the article URL if possible, otherwise the source URL
    final_url = article.url or source_url
    noticia_id = hashlib.md5(final_url.encode("utf-8")).hexdigest()

    with get_write_conn() as conn:
        with conn.cursor() as cur:
            # Check if it already exists
            cur.execute("SELECT id FROM noticias WHERE id = %s", (noticia_id,))
            if cur.fetchone():
                return False  # Already exists

            # Prepare data
            title = article.title or "Sin título"
            summary = article.summary or article.text[:500]
            image_url = article.top_image
            pub_date = article.publish_date or datetime.utcnow()

            cur.execute("""
                INSERT INTO noticias (
                    id, titulo, resumen, url, fecha, imagen_url,
                    fuente_nombre, categoria_id, pais_id
                ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
                ON CONFLICT (id) DO NOTHING
            """, (
                noticia_id, title, summary, final_url, pub_date, image_url,
                source_name, cat_id, pais_id
            ))
        conn.commit()
    return True


def process_url(source):
    """Process a single URL source."""
    source_id, name, url, _, _, _ = source

    logger.info(f"Processing URL: {url} ({name})")

    try:
        # Browser-like headers
        user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        config = Config()
        config.browser_user_agent = user_agent
        config.request_timeout = 30

        article = Article(url, config=config, language='es')
        article.download()

        if not article.html:
            update_source_status(source_id, "ERROR", "No content downloaded (empty HTML)", 0)
            return

        article.parse()
        try:
            article.nlp()  # Optional: only needed for article.summary; safe to skip on failure
        except Exception:
            pass

        if not article.title:
            update_source_status(source_id, "ERROR_PARSE", "Could not extract title (page may not be an article)", 200)
            return

        saved = save_article(source, article)

        status_msg = "News created successfully" if saved else "News already exists"
        update_source_status(source_id, "OK", status_msg, 200)
        logger.info(f"Success {url}: {status_msg}")

    except ArticleException as ae:
        logger.error(f"Newspaper Error {url}: {ae}")
        update_source_status(source_id, "ERROR_DOWNLOAD", str(ae)[:200], 0)
    except requests.exceptions.RequestException as re:
        logger.error(f"Network Error {url}: {re}")
        update_source_status(source_id, "ERROR_NETWORK", str(re)[:200], 0)
    except Exception as e:
        logger.error(f"Unexpected Error {url}: {e}")
        update_source_status(source_id, "ERROR_UNKNOWN", str(e)[:200], 500)


def main():
    logger.info("Starting URL Worker")
    urls = get_active_urls()
    logger.info(f"Found {len(urls)} active URLs")
    for source in urls:
        process_url(source)


if __name__ == "__main__":
    main()
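
The worker imports get_write_conn and get_read_conn from db, a module introduced elsewhere in this commit but not shown here. A minimal sketch of what those helpers could look like, assuming PostgreSQL via psycopg2 and a DATABASE_URL environment variable (both assumptions; the actual db.py in this commit may differ):

from contextlib import contextmanager
import os

import psycopg2

DATABASE_URL = os.environ["DATABASE_URL"]  # assumption: real config may live elsewhere


@contextmanager
def get_read_conn():
    """Yield a connection for SELECTs and close it afterwards."""
    conn = psycopg2.connect(DATABASE_URL)  # could point at a read replica
    try:
        yield conn
    finally:
        conn.close()


@contextmanager
def get_write_conn():
    """Yield a connection for writes; the caller commits explicitly."""
    conn = psycopg2.connect(DATABASE_URL)
    try:
        yield conn
    finally:
        conn.close()

Writing the helpers as context managers matches how the worker uses them (with get_write_conn() as conn: ... conn.commit()) and guarantees each connection is closed after use rather than left open between sources.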