import os
import time
import logging
import json
import psycopg2
from psycopg2.extras import execute_values

# Logging
logging.basicConfig(
    level=logging.INFO,
    format='[topics_worker] %(asctime)s %(levelname)s: %(message)s'
)
log = logging.getLogger(__name__)

# Config
DB_CONFIG = {
    "host": os.environ.get("DB_HOST", "localhost"),
    "port": int(os.environ.get("DB_PORT", 5432)),
    "dbname": os.environ.get("DB_NAME", "rss"),
    "user": os.environ.get("DB_USER", "rss"),
    "password": os.environ.get("DB_PASS", "x"),
}

SLEEP_IDLE = 10   # seconds to sleep when there is nothing to process
BATCH_SIZE = 500  # max news rows fetched per pass

def get_conn():
    return psycopg2.connect(**DB_CONFIG)

def load_topics(conn):
    """
    Load topics and their keywords.
    Returns a list of dicts: [{'id': 1, 'weight': 5, 'keywords': ['foo', 'bar']}]
    """
    with conn.cursor() as cur:
        cur.execute("SELECT id, weight, keywords FROM topics")
        rows = cur.fetchall()

    topics = []
    for r in rows:
        tid, weight, kw_str = r
        if not kw_str:
            continue
        # Keywords are stored as one comma-separated string (see the insert script)
        kws = [k.strip().lower() for k in kw_str.split(",") if k.strip()]
        topics.append({
            "id": tid,
            "weight": weight,
            "keywords": kws
        })
    return topics

def load_countries(conn):
    """
    Load countries.
    Returns a list: [{'id': 1, 'name': 'España', 'keywords': ['españa', 'madrid']}]
    """
    with conn.cursor() as cur:
        cur.execute("SELECT id, nombre FROM paises")
        rows = cur.fetchall()

    countries = []
    # Hardcoded aliases for simplicity. A separate table would be better
    # (see the sketch after this function).
    ALIASES = {
        "Estados Unidos": ["eeuu", "ee.uu.", "usa", "estadounidense", "washington"],
        "Rusia": ["ruso", "rusa", "moscú", "kremlin"],
        "China": ["chino", "china", "pekin", "beijing"],
        "Ucrania": ["ucraniano", "kiev", "kyiv"],
        "Israel": ["israelí", "tel aviv", "jerusalén"],
        "España": ["español", "madrid"],
        "Reino Unido": ["uk", "londres", "británico"],
        "Francia": ["francés", "parís"],
        "Alemania": ["alemán", "berlín"],
        "Palestina": ["palestino", "gaza", "cisjordania"],
        "Irán": ["iraní", "teherán"],
    }

    for r in rows:
        cid, name = r
        kws = [name.lower()]
        if name in ALIASES:
            kws.extend(ALIASES[name])
        countries.append({"id": cid, "name": name, "keywords": kws})
    return countries

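# Hedged sketch, not wired in: the hardcoded ALIASES dict above could be
# replaced by a lookup table. The table "pais_aliases" and its columns
# (pais_id, alias) are assumptions for illustration; they do not exist in the
# current schema.
def load_country_aliases(conn):
    """Return {pais_id: [alias, ...]} from a hypothetical pais_aliases table."""
    alias_map = {}
    with conn.cursor() as cur:
        cur.execute("SELECT pais_id, alias FROM pais_aliases")
        for pais_id, alias in cur.fetchall():
            alias_map.setdefault(pais_id, []).append(alias.lower())
    return alias_map
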
def process_batch(conn, topics, countries):
    """
    Fetch a batch of news with topics_processed = FALSE.
    Match each item against topics AND countries.
    Insert matches into news_topics, set pais_id, and mark the items processed.
    """
    with conn.cursor() as cur:
        # Fetch news
        cur.execute("""
            SELECT id, titulo, resumen
            FROM noticias
            WHERE topics_processed = FALSE
            ORDER BY fecha DESC
            LIMIT %s
        """, (BATCH_SIZE,))
        news_items = cur.fetchall()

    if not news_items:
        return 0

    inserts = []  # (noticia_id, topic_id, score)
    processed_ids = []

    # Batched updates for pais_id
    country_updates = []  # (pais_id, noticia_id)

    for item in news_items:
        nid, titulo, resumen = item
        text = (titulo or "") + " " + (resumen or "")
        text_lower = text.lower()

        # 1. Match topics
        for topic in topics:
            matched_count = 0
            for kw in topic["keywords"]:
                if kw in text_lower:
                    matched_count += 1

            if matched_count > 0:
                # e.g. weight 5 with 2 matched keywords -> score 10
                score = topic["weight"] * matched_count
                inserts.append((nid, topic["id"], score))

        # 2. Match country: simple heuristic, keep the country with the
        # most keyword matches.
        best_country = None
        max_matches = 0

        for c in countries:
            matches = 0
            for kw in c["keywords"]:
                # Simple substring matching; could be tightened with regex
                # word boundaries (see the sketch after this function).
                if kw in text_lower:
                    matches += 1

            if matches > max_matches:
                max_matches = matches
                best_country = c["id"]

        if best_country is not None:
            country_updates.append((best_country, nid))

        processed_ids.append(nid)

    with conn.cursor() as cur:
        # Insert topic relations
        if inserts:
            execute_values(cur, """
                INSERT INTO news_topics (noticia_id, topic_id, score)
                VALUES %s
                ON CONFLICT (noticia_id, topic_id) DO UPDATE SET score = EXCLUDED.score
            """, inserts)

        # Update countries
        if country_updates:
            execute_values(cur, """
                UPDATE noticias AS n
                SET pais_id = v.pais_id
                FROM (VALUES %s) AS v(pais_id, noticia_id)
                WHERE n.id = v.noticia_id
            """, country_updates)

        # Mark processed
        cur.execute("""
            UPDATE noticias
            SET topics_processed = TRUE
            WHERE id = ANY(%s)
        """, (processed_ids,))

    conn.commit()
    return len(news_items)

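# Hedged sketch, not wired in: the substring checks in process_batch() also
# match inside longer words (e.g. the alias "uk" matches "ukelele"). A regex
# with \b word boundaries is stricter; swapping it in for the
# `kw in text_lower` checks is an assumption about desired behaviour, not
# something the current worker does.
def keyword_matches(keyword, text_lower):
    """Return True if `keyword` appears as a whole word in `text_lower`."""
    import re  # local import keeps this sketch self-contained
    # Note: \b assumes the keyword starts and ends with word characters,
    # so it is not ideal for aliases like "ee.uu.".
    return re.search(r"\b" + re.escape(keyword) + r"\b", text_lower) is not None
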
def initialize_schema(conn):
    """
    Ensure required tables and columns exist.
    """
    log.info("Checking/Initializing schema...")
    with conn.cursor() as cur:
        cur.execute("""
            CREATE TABLE IF NOT EXISTS topics (
                id SERIAL PRIMARY KEY,
                slug VARCHAR(50) UNIQUE NOT NULL,
                name VARCHAR(100) NOT NULL,
                weight INTEGER DEFAULT 1,
                keywords TEXT,
                group_name VARCHAR(50)
            );

            CREATE TABLE IF NOT EXISTS news_topics (
                noticia_id VARCHAR(32) REFERENCES noticias(id) ON DELETE CASCADE,
                topic_id INTEGER REFERENCES topics(id) ON DELETE CASCADE,
                score INTEGER DEFAULT 0,
                created_at TIMESTAMP DEFAULT NOW(),
                PRIMARY KEY (noticia_id, topic_id)
            );

            ALTER TABLE noticias ADD COLUMN IF NOT EXISTS topics_processed BOOLEAN DEFAULT FALSE;
        """)
    conn.commit()
    log.info("Schema OK.")

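# For reference, an example of the topics row format load_topics() expects
# (keywords stored as one comma-separated string). The values below are
# illustrative only; real rows come from a separate insert script.
#
#   INSERT INTO topics (slug, name, weight, keywords)
#   VALUES ('economia', 'Economía', 2, 'inflación, pib, tipos de interés');
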
def main():
    log.info("Starting topics_worker...")

    # Run migrations once at startup
    try:
        conn = get_conn()
        try:
            initialize_schema(conn)
        finally:
            conn.close()
    except Exception as e:
        log.error(f"Error during schema initialization: {e}")
        # We might want to exit here if the schema is crucial
        # sys.exit(1)

    while True:
        conn = None
        try:
            # psycopg2's `with conn:` only wraps a transaction and does not
            # close the connection, so open and close it explicitly on each
            # pass to avoid leaking connections in a long-running worker.
            conn = get_conn()

            topics = load_topics(conn)
            if not topics:
                log.warning("No topics found in DB. Sleeping.")
                time.sleep(SLEEP_IDLE)
                continue

            # Load countries
            countries = load_countries(conn)

            count = process_batch(conn, topics, countries)

            if count < BATCH_SIZE:
                time.sleep(SLEEP_IDLE)
            else:
                log.info(f"Processed {count} items.")

        except Exception:
            log.exception("Error in topics_worker")
            time.sleep(SLEEP_IDLE)
        finally:
            if conn is not None:
                conn.close()


if __name__ == "__main__":
    main()