Initial clean commit

jlimolina 2026-01-13 13:39:51 +01:00
commit 6784d81c2c
141 changed files with 25219 additions and 0 deletions

workers/url_worker.py Normal file

@@ -0,0 +1,125 @@
import logging
import hashlib
from datetime import datetime

import requests
from newspaper import Article, ArticleException, Config

from db import get_write_conn, get_read_conn

# Configuration
logger = logging.getLogger("url_worker")
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

def get_active_urls():
    """Get all active URL sources."""
    with get_read_conn() as conn:
        with conn.cursor() as cur:
            cur.execute("""
                SELECT id, nombre, url, categoria_id, pais_id, idioma
                FROM fuentes_url
                WHERE active = true
            """)
            return cur.fetchall()

def update_source_status(source_id, status, message, http_code=0):
    """Update the status of a URL source."""
    with get_write_conn() as conn:
        with conn.cursor() as cur:
            cur.execute("""
                UPDATE fuentes_url
                SET last_check = NOW(),
                    last_status = %s,
                    status_message = %s,
                    last_http_code = %s
                WHERE id = %s
            """, (status, message, http_code, source_id))
        conn.commit()

def save_article(source, article):
    """Save the extracted article to the database."""
    source_id, source_name, source_url, cat_id, pais_id, lang = source

    # Use the article url if possible, otherwise source_url
    final_url = article.url or source_url
    noticia_id = hashlib.md5(final_url.encode("utf-8")).hexdigest()

    with get_write_conn() as conn:
        with conn.cursor() as cur:
            # Check if it already exists
            cur.execute("SELECT id FROM noticias WHERE id = %s", (noticia_id,))
            if cur.fetchone():
                return False  # Already exists

            # Prepare data
            title = article.title or "Sin título"
            summary = article.summary or article.text[:500]
            image_url = article.top_image
            pub_date = article.publish_date or datetime.utcnow()

            cur.execute("""
                INSERT INTO noticias (
                    id, titulo, resumen, url, fecha, imagen_url,
                    fuente_nombre, categoria_id, pais_id
                ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
                ON CONFLICT (id) DO NOTHING
            """, (
                noticia_id, title, summary, final_url, pub_date, image_url,
                source_name, cat_id, pais_id
            ))
        conn.commit()
    return True

def process_url(source):
    """Process a single URL source."""
    source_id, name, url, _, _, _ = source
    logger.info(f"Processing URL: {url} ({name})")

    try:
        # Browser-like headers
        user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        config = Config()
        config.browser_user_agent = user_agent
        config.request_timeout = 30

        article = Article(url, config=config, language='es')
        article.download()

        if not article.html:
            update_source_status(source_id, "ERROR", "No content downloaded (empty HTML)", 0)
            return

        article.parse()

        try:
            article.nlp()
        except Exception:
            # nlp() is optional (it needs NLTK data); save_article falls back to the raw text
            pass

        if not article.title:
            update_source_status(source_id, "ERROR_PARSE", "Could not extract title (page might not be an article)", 200)
            return

        saved = save_article(source, article)
        status_msg = "News created successfully" if saved else "News already exists"
        update_source_status(source_id, "OK", status_msg, 200)
        logger.info(f"Success {url}: {status_msg}")

    except ArticleException as ae:
        logger.error(f"Newspaper Error {url}: {ae}")
        update_source_status(source_id, "ERROR_DOWNLOAD", str(ae)[:200], 0)
    except requests.exceptions.RequestException as re:
        logger.error(f"Network Error {url}: {re}")
        update_source_status(source_id, "ERROR_NETWORK", str(re)[:200], 0)
    except Exception as e:
        logger.error(f"Unexpected Error {url}: {e}")
        update_source_status(source_id, "ERROR_UNKNOWN", str(e)[:200], 500)

def main():
    logger.info("Starting URL Worker")
    urls = get_active_urls()
    logger.info(f"Found {len(urls)} active URLs")
    for source in urls:
        process_url(source)


if __name__ == "__main__":
    main()