import os
import time
import logging
import json
import psycopg2
from psycopg2.extras import execute_values

# Logging
logging.basicConfig(
    level=logging.INFO,
    format='[topics_worker] %(asctime)s %(levelname)s: %(message)s'
)
log = logging.getLogger(__name__)

# Config
DB_CONFIG = {
    "host": os.environ.get("DB_HOST", "localhost"),
    "port": int(os.environ.get("DB_PORT", 5432)),
    "dbname": os.environ.get("DB_NAME", "rss"),
    "user": os.environ.get("DB_USER", "rss"),
    "password": os.environ.get("DB_PASS", "x"),
}

SLEEP_IDLE = 10   # seconds to sleep when there is nothing to process
BATCH_SIZE = 500  # max news rows fetched per pass

def get_conn():
    return psycopg2.connect(**DB_CONFIG)

def load_topics(conn):
    """
    Load topics and their keywords.
    Returns a list of dicts: [{'id': 1, 'weight': 5, 'keywords': ['foo', 'bar']}]
    """
    with conn.cursor() as cur:
        cur.execute("SELECT id, weight, keywords FROM topics")
        rows = cur.fetchall()

    topics = []
    for r in rows:
        tid, weight, kw_str = r
        if not kw_str:
            continue
        # Keywords are stored as one comma-separated string (see the insert script)
        kws = [k.strip().lower() for k in kw_str.split(",") if k.strip()]
        topics.append({
            "id": tid,
            "weight": weight,
            "keywords": kws
        })
    return topics

def load_countries(conn):
    """
    Load countries.
    Returns a list: [{'id': 1, 'name': 'España', 'keywords': ['españa', 'madrid']}]
    """
    with conn.cursor() as cur:
        cur.execute("SELECT id, nombre FROM paises")
        rows = cur.fetchall()

    countries = []
    # Hardcoded aliases for simplicity. A separate table would be better
    # (see the sketch after this function).
    ALIASES = {
        "Estados Unidos": ["eeuu", "ee.uu.", "usa", "estadounidense", "washington"],
        "Rusia": ["ruso", "rusa", "moscú", "kremlin"],
        "China": ["chino", "china", "pekin", "beijing"],
        "Ucrania": ["ucraniano", "kiev", "kyiv"],
        "Israel": ["israelí", "tel aviv", "jerusalén"],
        "España": ["español", "madrid"],
        "Reino Unido": ["uk", "londres", "británico"],
        "Francia": ["francés", "parís"],
        "Alemania": ["alemán", "berlín"],
        "Palestina": ["palestino", "gaza", "cisjordania"],
        "Irán": ["iraní", "teherán"],
    }

    for r in rows:
        cid, name = r
        kws = [name.lower()]
        if name in ALIASES:
            kws.extend(ALIASES[name])
        countries.append({"id": cid, "name": name, "keywords": kws})
    return countries

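# Hedged sketch, not wired in: the hardcoded ALIASES dict above could be
# replaced by a lookup table. The table "pais_aliases" and its columns
# (pais_id, alias) are assumptions for illustration; they do not exist in the
# current schema.
def load_country_aliases(conn):
    """Return {pais_id: [alias, ...]} from a hypothetical pais_aliases table."""
    alias_map = {}
    with conn.cursor() as cur:
        cur.execute("SELECT pais_id, alias FROM pais_aliases")
        for pais_id, alias in cur.fetchall():
            alias_map.setdefault(pais_id, []).append(alias.lower())
    return alias_map
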
def process_batch(conn, topics, countries):
    """
    Fetch a batch of news with topics_processed = FALSE.
    Match each item against topics AND countries.
    Insert matches into news_topics, set pais_id, and mark the items processed.
    """
    with conn.cursor() as cur:
        # Fetch news
        cur.execute("""
            SELECT id, titulo, resumen
            FROM noticias
            WHERE topics_processed = FALSE
            ORDER BY fecha DESC
            LIMIT %s
        """, (BATCH_SIZE,))
        news_items = cur.fetchall()

    if not news_items:
        return 0

    inserts = []  # (noticia_id, topic_id, score)
    processed_ids = []

    # Batched updates for pais_id
    country_updates = []  # (pais_id, noticia_id)

    for item in news_items:
        nid, titulo, resumen = item
        text = (titulo or "") + " " + (resumen or "")
        text_lower = text.lower()

        # 1. Match topics
        for topic in topics:
            matched_count = 0
            for kw in topic["keywords"]:
                if kw in text_lower:
                    matched_count += 1

            if matched_count > 0:
                # e.g. weight 5 with 2 matched keywords -> score 10
                score = topic["weight"] * matched_count
                inserts.append((nid, topic["id"], score))

        # 2. Match country: simple heuristic, keep the country with the
        # most keyword matches.
        best_country = None
        max_matches = 0

        for c in countries:
            matches = 0
            for kw in c["keywords"]:
                # Simple substring matching; could be tightened with regex
                # word boundaries (see the sketch after this function).
                if kw in text_lower:
                    matches += 1

            if matches > max_matches:
                max_matches = matches
                best_country = c["id"]

        if best_country is not None:
            country_updates.append((best_country, nid))

        processed_ids.append(nid)

    with conn.cursor() as cur:
        # Insert topic relations
        if inserts:
            execute_values(cur, """
                INSERT INTO news_topics (noticia_id, topic_id, score)
                VALUES %s
                ON CONFLICT (noticia_id, topic_id) DO UPDATE SET score = EXCLUDED.score
            """, inserts)

        # Update countries
        if country_updates:
            execute_values(cur, """
                UPDATE noticias AS n
                SET pais_id = v.pais_id
                FROM (VALUES %s) AS v(pais_id, noticia_id)
                WHERE n.id = v.noticia_id
            """, country_updates)

        # Mark processed
        cur.execute("""
            UPDATE noticias
            SET topics_processed = TRUE
            WHERE id = ANY(%s)
        """, (processed_ids,))

    conn.commit()
    return len(news_items)

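# Hedged sketch, not wired in: the substring checks in process_batch() also
# match inside longer words (e.g. the alias "uk" matches "ukelele"). A regex
# with \b word boundaries is stricter; swapping it in for the
# `kw in text_lower` checks is an assumption about desired behaviour, not
# something the current worker does.
def keyword_matches(keyword, text_lower):
    """Return True if `keyword` appears as a whole word in `text_lower`."""
    import re  # local import keeps this sketch self-contained
    # Note: \b assumes the keyword starts and ends with word characters,
    # so it is not ideal for aliases like "ee.uu.".
    return re.search(r"\b" + re.escape(keyword) + r"\b", text_lower) is not None
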
def initialize_schema(conn):
    """
    Ensure required tables and columns exist.
    """
    log.info("Checking/Initializing schema...")
    with conn.cursor() as cur:
        cur.execute("""
            CREATE TABLE IF NOT EXISTS topics (
                id SERIAL PRIMARY KEY,
                slug VARCHAR(50) UNIQUE NOT NULL,
                name VARCHAR(100) NOT NULL,
                weight INTEGER DEFAULT 1,
                keywords TEXT,
                group_name VARCHAR(50)
            );

            CREATE TABLE IF NOT EXISTS news_topics (
                noticia_id VARCHAR(32) REFERENCES noticias(id) ON DELETE CASCADE,
                topic_id INTEGER REFERENCES topics(id) ON DELETE CASCADE,
                score INTEGER DEFAULT 0,
                created_at TIMESTAMP DEFAULT NOW(),
                PRIMARY KEY (noticia_id, topic_id)
            );

            ALTER TABLE noticias ADD COLUMN IF NOT EXISTS topics_processed BOOLEAN DEFAULT FALSE;
        """)
    conn.commit()
    log.info("Schema OK.")

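# For reference, an example of the topics row format load_topics() expects
# (keywords stored as one comma-separated string). The values below are
# illustrative only; real rows come from a separate insert script.
#
#   INSERT INTO topics (slug, name, weight, keywords)
#   VALUES ('economia', 'Economía', 2, 'inflación, pib, tipos de interés');
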
def main():
    log.info("Starting topics_worker...")

    # Run migrations once at startup
    try:
        conn = get_conn()
        try:
            initialize_schema(conn)
        finally:
            conn.close()
    except Exception as e:
        log.error(f"Error during schema initialization: {e}")
        # We might want to exit here if the schema is crucial
        # sys.exit(1)

    while True:
        conn = None
        try:
            # psycopg2's `with conn:` only wraps a transaction and does not
            # close the connection, so open and close it explicitly on each
            # pass to avoid leaking connections in a long-running worker.
            conn = get_conn()

            topics = load_topics(conn)
            if not topics:
                log.warning("No topics found in DB. Sleeping.")
                time.sleep(SLEEP_IDLE)
                continue

            # Load countries
            countries = load_countries(conn)

            count = process_batch(conn, topics, countries)

            if count < BATCH_SIZE:
                time.sleep(SLEEP_IDLE)
            else:
                log.info(f"Processed {count} items.")

        except Exception:
            log.exception("Error in topics_worker")
            time.sleep(SLEEP_IDLE)
        finally:
            if conn is not None:
                conn.close()


if __name__ == "__main__":
    main()