Initial clean commit

jlimolina 2026-01-13 13:39:51 +01:00
commit 6784d81c2c
141 changed files with 25219 additions and 0 deletions

workers/topics_worker.py (Normal file, 244 lines added)

@@ -0,0 +1,244 @@
import os
import time
import logging
import psycopg2
from psycopg2.extras import execute_values

# Logging
logging.basicConfig(
    level=logging.INFO,
    format='[topics_worker] %(asctime)s %(levelname)s: %(message)s'
)
log = logging.getLogger(__name__)

# Config
DB_CONFIG = {
    "host": os.environ.get("DB_HOST", "localhost"),
    "port": int(os.environ.get("DB_PORT", 5432)),
    "dbname": os.environ.get("DB_NAME", "rss"),
    "user": os.environ.get("DB_USER", "rss"),
    "password": os.environ.get("DB_PASS", "x"),
}

SLEEP_IDLE = 10
BATCH_SIZE = 500


def get_conn():
    return psycopg2.connect(**DB_CONFIG)


def load_topics(conn):
    """
    Load topics and their keywords.
    Returns a list of dicts: [{'id': 1, 'weight': 5, 'keywords': ['foo', 'bar']}]
    """
    with conn.cursor() as cur:
        cur.execute("SELECT id, weight, keywords FROM topics")
        rows = cur.fetchall()
    topics = []
    for r in rows:
        tid, weight, kw_str = r
        if not kw_str:
            continue
        # Keywords are comma-separated, per the insert script.
        kws = [k.strip().lower() for k in kw_str.split(",") if k.strip()]
        topics.append({
            "id": tid,
            "weight": weight,
            "keywords": kws,
        })
    return topics
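

# Illustrative only: the worker assumes topics rows whose `keywords` column
# holds a comma-separated string, e.g. (slug, name, weight and keyword values
# invented for illustration; only the column names come from this commit's
# initialize_schema below):
#   INSERT INTO topics (slug, name, weight, keywords)
#   VALUES ('economia', 'Economía', 3, 'inflación, ipc, banco central');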


def load_countries(conn):
    """
    Load countries.
    Returns a list: [{'id': 1, 'name': 'España', 'keywords': ['españa', 'madrid']}]
    """
    with conn.cursor() as cur:
        cur.execute("SELECT id, nombre FROM paises")
        rows = cur.fetchall()
    countries = []
    # Hardcoded aliases for simplicity. A separate table would be better.
    # Note: the lowercased country name itself is always added below, so it
    # does not need to be repeated here.
    ALIASES = {
        "Estados Unidos": ["eeuu", "ee.uu.", "usa", "estadounidense", "washington"],
        "Rusia": ["ruso", "rusa", "moscú", "kremlin"],
        "China": ["chino", "pekín", "beijing"],
        "Ucrania": ["ucraniano", "kiev", "kyiv"],
        "Israel": ["israelí", "tel aviv", "jerusalén"],
        "España": ["español", "madrid"],
        "Reino Unido": ["uk", "londres", "británico"],
        "Francia": ["francés", "parís"],
        "Alemania": ["alemán", "berlín"],
        "Palestina": ["palestino", "gaza", "cisjordania"],
        "Irán": ["iraní", "teherán"],
    }
    for r in rows:
        cid, name = r
        kws = [name.lower()]
        if name in ALIASES:
            kws.extend(ALIASES[name])
        countries.append({"id": cid, "name": name, "keywords": kws})
    return countries
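

# For example, with the aliases above a `paises` row (id=7, nombre='Rusia')
# comes back as (shape follows the code above; the id is invented):
#   {'id': 7, 'name': 'Rusia', 'keywords': ['rusia', 'ruso', 'rusa', 'moscú', 'kremlin']}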


def process_batch(conn, topics, countries):
    """
    Fetch a batch of news with topics_processed = FALSE, match it against
    topics and countries, insert the matches into news_topics, and mark the
    rows as processed.
    """
    with conn.cursor() as cur:
        # Fetch news
        cur.execute("""
            SELECT id, titulo, resumen
            FROM noticias
            WHERE topics_processed = FALSE
            ORDER BY fecha DESC
            LIMIT %s
        """, (BATCH_SIZE,))
        news_items = cur.fetchall()

    if not news_items:
        return 0

    inserts = []          # (noticia_id, topic_id, score)
    processed_ids = []
    country_updates = []  # (pais_id, noticia_id), applied in one batch below

    for item in news_items:
        nid, titulo, resumen = item
        text = (titulo or "") + " " + (resumen or "")
        text_lower = text.lower()

        # 1. Match topics: score = topic weight * number of keyword hits.
        for topic in topics:
            matched_count = 0
            for kw in topic["keywords"]:
                if kw in text_lower:
                    matched_count += 1
            if matched_count > 0:
                score = topic["weight"] * matched_count
                inserts.append((nid, topic["id"], score))

        # 2. Match country: keep the country with the most keyword hits.
        best_country = None
        max_matches = 0
        for c in countries:
            matches = 0
            for kw in c["keywords"]:
                # Plain substring matching; regex word boundaries would be
                # stricter (see the sketch after this function).
                if kw in text_lower:
                    matches += 1
            if matches > max_matches:
                max_matches = matches
                best_country = c["id"]
        if best_country:
            country_updates.append((best_country, nid))

        processed_ids.append(nid)

    with conn.cursor() as cur:
        # Insert topic relations
        if inserts:
            execute_values(cur, """
                INSERT INTO news_topics (noticia_id, topic_id, score)
                VALUES %s
                ON CONFLICT (noticia_id, topic_id) DO UPDATE SET score = EXCLUDED.score
            """, inserts)
        # Update countries in one statement via a VALUES join
        if country_updates:
            execute_values(cur, """
                UPDATE noticias AS n
                SET pais_id = v.pais_id
                FROM (VALUES %s) AS v(pais_id, noticia_id)
                WHERE n.id = v.noticia_id
            """, country_updates)
        # Mark processed
        cur.execute("""
            UPDATE noticias
            SET topics_processed = TRUE
            WHERE id = ANY(%s)
        """, (processed_ids,))
    conn.commit()
    return len(news_items)
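

# Sketch of the word-boundary improvement mentioned in process_batch. This is
# illustrative and not wired in anywhere: `keyword_pattern` is a hypothetical
# helper, not something the rest of this commit references. The substring
# tests above also match inside longer words (e.g. the alias "usa" matches
# inside "acusado"); a word-boundary regex avoids that.
def keyword_pattern(keywords):
    """Compile one regex that matches any of the keywords as a whole word."""
    import re  # local import: `re` is not used elsewhere in this module
    joined = "|".join(re.escape(k) for k in keywords)
    return re.compile(r"\b(?:" + joined + r")\b")
    # Usage sketch: keyword_pattern(c["keywords"]).search(text_lower)
    # in place of the `kw in text_lower` substring tests.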


def initialize_schema(conn):
    """
    Ensure required tables and columns exist.
    """
    log.info("Checking/Initializing schema...")
    with conn.cursor() as cur:
        cur.execute("""
            CREATE TABLE IF NOT EXISTS topics (
                id SERIAL PRIMARY KEY,
                slug VARCHAR(50) UNIQUE NOT NULL,
                name VARCHAR(100) NOT NULL,
                weight INTEGER DEFAULT 1,
                keywords TEXT,
                group_name VARCHAR(50)
            );
            CREATE TABLE IF NOT EXISTS news_topics (
                noticia_id VARCHAR(32) REFERENCES noticias(id) ON DELETE CASCADE,
                topic_id INTEGER REFERENCES topics(id) ON DELETE CASCADE,
                score INTEGER DEFAULT 0,
                created_at TIMESTAMP DEFAULT NOW(),
                PRIMARY KEY (noticia_id, topic_id)
            );
            ALTER TABLE noticias ADD COLUMN IF NOT EXISTS topics_processed BOOLEAN DEFAULT FALSE;
        """)
    conn.commit()
    log.info("Schema OK.")


def main():
    log.info("Starting topics_worker...")
    # Run migrations once at startup
    try:
        conn = get_conn()
        try:
            initialize_schema(conn)
        finally:
            conn.close()
    except Exception as e:
        log.error(f"Error during schema initialization: {e}")
        # We might want to exit here if the schema is crucial
        # sys.exit(1)

    while True:
        try:
            conn = get_conn()
            try:
                topics = load_topics(conn)
                if not topics:
                    log.warning("No topics found in DB. Sleeping.")
                    time.sleep(SLEEP_IDLE)
                    continue
                # Load countries
                countries = load_countries(conn)
                count = process_batch(conn, topics, countries)
            finally:
                # Close explicitly: psycopg2's "with conn" only scopes a
                # transaction, so without this every loop iteration would
                # leak one connection.
                conn.close()
            if count < BATCH_SIZE:
                time.sleep(SLEEP_IDLE)
            else:
                log.info(f"Processed {count} items.")
        except Exception:
            log.exception("Error in topics_worker")
            time.sleep(SLEEP_IDLE)


if __name__ == "__main__":
    main()
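
# Example invocation (values illustrative; any unset variable falls back to
# the defaults in DB_CONFIG above):
#   DB_HOST=db DB_PORT=5432 DB_NAME=rss DB_USER=rss DB_PASS=secret \
#       python workers/topics_worker.py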