Initial clean commit
commit 6784d81c2c
141 changed files with 25219 additions and 0 deletions
workers/topics_worker.py (new file, 244 lines)
@@ -0,0 +1,244 @@
import os
import time
import logging
import contextlib
import psycopg2
from psycopg2.extras import execute_values

# Logging
logging.basicConfig(
    level=logging.INFO,
    format='[topics_worker] %(asctime)s %(levelname)s: %(message)s'
)
log = logging.getLogger(__name__)

# Config
DB_CONFIG = {
    "host": os.environ.get("DB_HOST", "localhost"),
    "port": int(os.environ.get("DB_PORT", 5432)),
    "dbname": os.environ.get("DB_NAME", "rss"),
    "user": os.environ.get("DB_USER", "rss"),
    "password": os.environ.get("DB_PASS", "x"),
}

SLEEP_IDLE = 10   # seconds to wait when a cycle finds little or no work
BATCH_SIZE = 500  # news rows fetched per cycle


def get_conn():
    return psycopg2.connect(**DB_CONFIG)
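
# Note: psycopg2's "with conn:" block manages a transaction (commit/rollback);
# it does not close the connection. main() therefore wraps connections in
# contextlib.closing() so the polling loop cannot leak them.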


def load_topics(conn):
    """
    Load topics and their keywords.
    Returns list of dicts: [{'id': 1, 'weight': 5, 'keywords': ['foo', 'bar']}]
    """
    with conn.cursor() as cur:
        cur.execute("SELECT id, weight, keywords FROM topics")
        rows = cur.fetchall()

    topics = []
    for r in rows:
        tid, weight, kw_str = r
        if not kw_str:
            continue
        # Keywords are stored comma-separated, matching the insert script.
        kws = [k.strip().lower() for k in kw_str.split(",") if k.strip()]
        topics.append({
            "id": tid,
            "weight": weight,
            "keywords": kws
        })
    return topics
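
# Worked example of the parse above (values illustrative):
#   keywords = "Fútbol, LaLiga , UEFA"  ->  ["fútbol", "laliga", "uefa"]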


def load_countries(conn):
    """
    Load countries.
    Returns list: [{'id': 1, 'name': 'España', 'keywords': ['españa', 'madrid']}]
    """
    with conn.cursor() as cur:
        cur.execute("SELECT id, nombre FROM paises")
        rows = cur.fetchall()

    countries = []
    # Hardcoded aliases for simplicity; a separate table would be better
    # (see the sketch after this function).
    ALIASES = {
        "Estados Unidos": ["eeuu", "ee.uu.", "usa", "estadounidense", "washington"],
        "Rusia": ["ruso", "rusa", "moscú", "kremlin"],
        "China": ["chino", "china", "pekin", "beijing"],
        "Ucrania": ["ucraniano", "kiev", "kyiv"],
        "Israel": ["israelí", "tel aviv", "jerusalén"],
        "España": ["español", "madrid"],
        "Reino Unido": ["uk", "londres", "británico"],
        "Francia": ["francés", "parís"],
        "Alemania": ["alemán", "berlín"],
        "Palestina": ["palestino", "gaza", "cisjordania"],
        "Irán": ["iraní", "teherán"],
    }

    for r in rows:
        cid, name = r
        kws = [name.lower()]
        if name in ALIASES:
            kws.extend(ALIASES[name])
        countries.append({"id": cid, "name": name, "keywords": kws})
    return countries
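
# (sketch) If the aliases move out of code, a hypothetical table along these
# lines could replace the ALIASES dict -- schema and names are illustrative,
# not part of the current database:
#
#   CREATE TABLE pais_aliases (
#       pais_id INTEGER REFERENCES paises(id) ON DELETE CASCADE,
#       alias   VARCHAR(100) NOT NULL
#   );
#
# load_countries() would then collect aliases with a LEFT JOIN on paises
# instead of consulting the dict.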


def process_batch(conn, topics, countries):
    """
    Fetch a batch of news with topics_processed = FALSE,
    match each item against topics AND countries,
    insert matches into news_topics, and mark the batch processed.
    """
    with conn.cursor() as cur:
        # Fetch news
        cur.execute("""
            SELECT id, titulo, resumen
            FROM noticias
            WHERE topics_processed = FALSE
            ORDER BY fecha DESC
            LIMIT %s
        """, (BATCH_SIZE,))
        news_items = cur.fetchall()

    if not news_items:
        return 0

    inserts = []  # (noticia_id, topic_id, score)
    processed_ids = []

    # Batched updates for pais_id
    country_updates = []  # (pais_id, noticia_id)

    for item in news_items:
        nid, titulo, resumen = item
        text = (titulo or "") + " " + (resumen or "")
        text_lower = text.lower()

        # 1. Match topics: score = topic weight * number of matched keywords
        for topic in topics:
            matched_count = 0
            for kw in topic["keywords"]:
                if kw in text_lower:
                    matched_count += 1

            if matched_count > 0:
                score = topic["weight"] * matched_count
                inserts.append((nid, topic["id"], score))

        # 2. Match country: keep the country with the most keyword hits
        best_country = None
        max_matches = 0

        for c in countries:
            matches = 0
            for kw in c["keywords"]:
                # Simple substring matching; see the word-boundary sketch
                # after this function for a stricter alternative.
                if kw in text_lower:
                    matches += 1

            if matches > max_matches:
                max_matches = matches
                best_country = c["id"]

        if best_country is not None:
            country_updates.append((best_country, nid))

        processed_ids.append(nid)

    with conn.cursor() as cur:
        # Insert topic relations (upsert keeps the latest score)
        if inserts:
            execute_values(cur, """
                INSERT INTO news_topics (noticia_id, topic_id, score)
                VALUES %s
                ON CONFLICT (noticia_id, topic_id) DO UPDATE SET score = EXCLUDED.score
            """, inserts)

        # Update countries via a VALUES join
        if country_updates:
            execute_values(cur, """
                UPDATE noticias AS n
                SET pais_id = v.pais_id
                FROM (VALUES %s) AS v(pais_id, noticia_id)
                WHERE n.id = v.noticia_id
            """, country_updates)

        # Mark processed
        cur.execute("""
            UPDATE noticias
            SET topics_processed = TRUE
            WHERE id = ANY(%s)
        """, (processed_ids,))

    conn.commit()
    return len(news_items)
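
# (sketch) The substring test in process_batch ("kw in text_lower") also
# fires inside longer words -- e.g. "uk" matches "ukelele". A stricter
# word-boundary variant could look like this (not wired in; note that
# aliases ending in punctuation, like "ee.uu.", would need extra care
# because \b anchors on word characters):
#
#   import re
#
#   def contains_word(kw, text):
#       return re.search(r"\b" + re.escape(kw) + r"\b", text) is not None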


def initialize_schema(conn):
    """
    Ensure required tables and columns exist.
    """
    log.info("Checking/initializing schema...")
    with conn.cursor() as cur:
        cur.execute("""
            CREATE TABLE IF NOT EXISTS topics (
                id SERIAL PRIMARY KEY,
                slug VARCHAR(50) UNIQUE NOT NULL,
                name VARCHAR(100) NOT NULL,
                weight INTEGER DEFAULT 1,
                keywords TEXT,
                group_name VARCHAR(50)
            );
            CREATE TABLE IF NOT EXISTS news_topics (
                noticia_id VARCHAR(32) REFERENCES noticias(id) ON DELETE CASCADE,
                topic_id INTEGER REFERENCES topics(id) ON DELETE CASCADE,
                score INTEGER DEFAULT 0,
                created_at TIMESTAMP DEFAULT NOW(),
                PRIMARY KEY (noticia_id, topic_id)
            );
            ALTER TABLE noticias ADD COLUMN IF NOT EXISTS topics_processed BOOLEAN DEFAULT FALSE;
        """)
    conn.commit()
    log.info("Schema OK.")


def main():
    log.info("Starting topics_worker...")

    # Run migrations once at startup
    try:
        with contextlib.closing(get_conn()) as conn:
            initialize_schema(conn)
    except Exception as e:
        log.error(f"Error during schema initialization: {e}")
        # We might want to exit here if the schema is crucial
        # sys.exit(1)

    while True:
        try:
            # contextlib.closing() ensures the connection is closed each
            # cycle; psycopg2's own context manager only ends the
            # transaction and would leak connections in a long-running loop.
            with contextlib.closing(get_conn()) as conn:
                topics = load_topics(conn)
                if not topics:
                    log.warning("No topics found in DB. Sleeping.")
                    time.sleep(SLEEP_IDLE)
                    continue

                # Load countries
                countries = load_countries(conn)

                count = process_batch(conn, topics, countries)

                if count:
                    log.info(f"Processed {count} items.")
                if count < BATCH_SIZE:
                    time.sleep(SLEEP_IDLE)

        except Exception:
            log.exception("Error in topics_worker")
            time.sleep(SLEEP_IDLE)
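
# Usage (illustrative; host and credentials depend on the deployment):
#
#   DB_HOST=db DB_PORT=5432 DB_NAME=rss DB_USER=rss DB_PASS=secret \
#       python workers/topics_worker.py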


if __name__ == "__main__":
    main()