import logging
import hashlib
from datetime import datetime

from newspaper import Article, ArticleException, Config
import requests

from db import get_write_conn, get_read_conn

# Configuration
logger = logging.getLogger("url_worker")
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')


def get_active_urls():
    """Get all active URL sources."""
    with get_read_conn() as conn:
        with conn.cursor() as cur:
            cur.execute("""
                SELECT id, nombre, url, categoria_id, pais_id, idioma
                FROM fuentes_url
                WHERE active = true
            """)
            return cur.fetchall()


def update_source_status(source_id, status, message, http_code=0):
    """Update the status of a URL source."""
    with get_write_conn() as conn:
        with conn.cursor() as cur:
            cur.execute("""
                UPDATE fuentes_url
                SET last_check = NOW(),
                    last_status = %s,
                    status_message = %s,
                    last_http_code = %s
                WHERE id = %s
            """, (status, message, http_code, source_id))
        conn.commit()


def save_article(source, article):
    """Save the extracted article to the database."""
    source_id, source_name, source_url, cat_id, pais_id, lang = source

    # Use the article URL if available, otherwise fall back to the source URL
    final_url = article.url or source_url
    noticia_id = hashlib.md5(final_url.encode("utf-8")).hexdigest()

    with get_write_conn() as conn:
        with conn.cursor() as cur:
            # Check if the article already exists
            cur.execute("SELECT id FROM noticias WHERE id = %s", (noticia_id,))
            if cur.fetchone():
                return False  # Already exists

            # Prepare data
            title = article.title or "Sin título"
            summary = article.summary or article.text[:500]
            image_url = article.top_image
            pub_date = article.publish_date or datetime.utcnow()

            cur.execute("""
                INSERT INTO noticias (
                    id, titulo, resumen, url, fecha, imagen_url,
                    fuente_nombre, categoria_id, pais_id
                ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
                ON CONFLICT (id) DO NOTHING
            """, (
                noticia_id, title, summary, final_url, pub_date,
                image_url, source_name, cat_id, pais_id
            ))
        conn.commit()
    return True


def process_url(source):
    """Process a single URL source."""
    source_id, name, url, _, _, _ = source
    logger.info(f"Processing URL: {url} ({name})")

    try:
        # Browser-like headers to avoid trivial bot blocking
        user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        config = Config()
        config.browser_user_agent = user_agent
        config.request_timeout = 30

        article = Article(url, config=config, language='es')
        article.download()

        if not article.html:
            update_source_status(source_id, "ERROR", "No content downloaded (empty HTML)", 0)
            return

        article.parse()
        try:
            # NLP (summary/keywords) is optional; a failure here should not abort processing
            article.nlp()
        except Exception:
            pass

        if not article.title:
            update_source_status(source_id, "ERROR_PARSE", "Could not extract title (page might not be an article)", 200)
            return

        saved = save_article(source, article)
        status_msg = "News created successfully" if saved else "News already exists"
        update_source_status(source_id, "OK", status_msg, 200)
        logger.info(f"Success {url}: {status_msg}")

    except ArticleException as ae:
        logger.error(f"Newspaper Error {url}: {ae}")
        update_source_status(source_id, "ERROR_DOWNLOAD", str(ae)[:200], 0)
    except requests.exceptions.RequestException as re:
        logger.error(f"Network Error {url}: {re}")
        update_source_status(source_id, "ERROR_NETWORK", str(re)[:200], 0)
    except Exception as e:
        logger.error(f"Unexpected Error {url}: {e}")
        update_source_status(source_id, "ERROR_UNKNOWN", str(e)[:200], 500)


def main():
    logger.info("Starting URL Worker")
    urls = get_active_urls()
    logger.info(f"Found {len(urls)} active URLs")
    for source in urls:
        process_url(source)


if __name__ == "__main__":
    main()