Initial clean commit

This commit is contained in:
jlimolina 2026-01-13 13:39:51 +01:00
commit 6784d81c2c
141 changed files with 25219 additions and 0 deletions

139
utils/wiki.py Normal file
View file

@@ -0,0 +1,139 @@
import requests
import logging
from cache import cache_get, cache_set
from db import get_read_conn, get_write_conn
logger = logging.getLogger(__name__)
# Cache for 24 hours
CACHE_TTL = 86400
def fetch_wiki_data(name, entity_type=None):
    """
    Fetch image URL AND summary from Wikipedia API for any entity.

    Lookup order: cache -> database -> Wikipedia (Spanish first, English
    fallback). The outcome — including a negative result — is persisted to
    the ``entity_images`` table and cached for ``CACHE_TTL`` seconds.

    Args:
        name: Entity name used as the Wikipedia page title and cache key.
        entity_type: Currently unused; kept for interface compatibility.

    Returns:
        Tuple ``(image_url, summary)``; either element may be ``None``.
    """
    # 1. Check cache
    cache_key = f"wiki:data:{name.lower()}"
    cached_data = cache_get(cache_key)
    if cached_data is not None:
        # Current cache format is a dict: {"image": url, "summary": text}
        if isinstance(cached_data, dict):
            return cached_data.get("image"), cached_data.get("summary")
        # Legacy cache entries stored a bare URL string ("NO_IMAGE" marks a
        # negative hit); support them without re-fetching.
        if isinstance(cached_data, str) and cached_data != "NO_IMAGE":
            return cached_data, None
        return None, None
    # 2. Check database
    try:
        with get_read_conn() as conn:
            with conn.cursor() as cur:
                cur.execute(
                    "SELECT image_url, summary, summary_es FROM entity_images WHERE entity_name = %s",
                    (name,),
                )
                row = cur.fetchone()
                if row:
                    image_url, summary, summary_es = row
                    # Prefer the translated summary if it exists
                    final_summary = summary_es if summary_es else summary
                    # Warm the cache before returning
                    cache_value = {"image": image_url, "summary": final_summary}
                    cache_set(cache_key, cache_value, ttl_seconds=CACHE_TTL)
                    return image_url, final_summary
    except Exception as e:
        logger.error("DB read error for %s: %s", name, e)
    # 3. Fetch from Wikipedia (Spanish first, English fallback)
    summary_en = None
    summary_es = None
    status_es = 'none'
    image_url, summary = _query_wikipedia_api_full(name, lang='es')
    if summary:
        summary_es = summary
        status_es = 'done'
        # BUGFIX: the Spanish page may have a summary but no thumbnail.
        # Previously the English fallback only ran when the Spanish summary
        # was missing, so such entities cached image=None forever. Fall back
        # to the English page for the image alone.
        if not image_url:
            img_en, _ = _query_wikipedia_api_full(name, lang='en')
            image_url = img_en
    else:
        # No Spanish summary: try English for both summary and image
        img_en, summ_en = _query_wikipedia_api_full(name, lang='en')
        if summ_en:
            summary = summ_en
            summary_en = summ_en
            status_es = 'pending'  # Spanish translation still outstanding
        if not image_url:
            image_url = img_en
    # 4. Persist to database (found or not), so we don't re-query Wikipedia
    try:
        with get_write_conn() as conn:
            with conn.cursor() as cur:
                cur.execute("""
                    INSERT INTO entity_images (entity_name, image_url, summary, summary_en, summary_es, status_es, last_checked)
                    VALUES (%s, %s, %s, %s, %s, %s, NOW())
                    ON CONFLICT (entity_name) DO UPDATE
                    SET image_url = EXCLUDED.image_url,
                        summary = EXCLUDED.summary,
                        summary_en = EXCLUDED.summary_en,
                        summary_es = EXCLUDED.summary_es,
                        status_es = EXCLUDED.status_es,
                        last_checked = NOW()
                    """, (name, image_url, summary, summary_en, summary_es, status_es))
            conn.commit()
    except Exception as e:
        logger.error("DB write error for %s: %s", name, e)
    # 5. Cache result (negative results included, to avoid hammering the API)
    cache_value = {"image": image_url, "summary": summary}
    cache_set(cache_key, cache_value, ttl_seconds=CACHE_TTL)
    return image_url, summary
def _query_wikipedia_api_full(query, lang='es'):
    """
    Query the Wikipedia API of the given language for a thumbnail and summary.

    Args:
        query: Page title to look up (redirects are followed).
        lang: Wikipedia language subdomain, e.g. ``'es'`` or ``'en'``.

    Returns:
        Tuple ``(image_url, summary)``; ``(None, None)`` on miss or error.
        Errors are logged, never raised — callers treat this as best-effort.
    """
    try:
        url = f"https://{lang}.wikipedia.org/w/api.php"
        params = {
            "action": "query",
            "format": "json",
            "prop": "pageimages|extracts",
            "piprop": "thumbnail",
            "pithumbsize": 300,  # Larger size requested
            "exintro": 1,
            "explaintext": 1,
            "exchars": 400,  # Limit chars
            "titles": query,
            "redirects": 1,
            "origin": "*"
        }
        # Wikipedia requires a User-Agent
        headers = {
            "User-Agent": "NewsEntityStats/1.0 (internal_tool; contact@example.com)"
        }
        response = requests.get(url, params=params, headers=headers, timeout=2)  # Fast timeout
        # Surface HTTP errors as a clear log line instead of a JSON decode error
        response.raise_for_status()
        data = response.json()
        pages = data.get("query", {}).get("pages", {})
        # BUGFIX: disambiguation detection previously only matched the English
        # phrase, letting Spanish disambiguation pages through as summaries.
        disambiguation_markers = ("may refer to:", "puede referirse a")
        for page_id, page_data in pages.items():
            if page_id == "-1":
                continue  # Not found
            image_url = None
            if "thumbnail" in page_data:
                image_url = page_data["thumbnail"]["source"]
            summary = page_data.get("extract")
            if summary and any(marker in summary for marker in disambiguation_markers):
                summary = None  # Disambiguation page, not a real summary
            if image_url or summary:
                return image_url, summary
    except Exception as e:
        logger.error(f"Error fetching wiki data for {query} ({lang}): {e}")
    return None, None