139 lines
4.9 KiB
Python
139 lines
4.9 KiB
Python
import requests
|
|
import logging
|
|
from cache import cache_get, cache_set
|
|
from db import get_read_conn, get_write_conn
|
|
|
|
# Module-wide logger, named after this module.
logger = logging.getLogger(__name__)

# Lifetime of cached wiki lookups: 24 hours, expressed in seconds.
CACHE_TTL = 24 * 60 * 60
|
|
|
|
def fetch_wiki_data(name, entity_type=None):
    """
    Fetch the image URL and summary for an entity, using Wikipedia as source.

    Lookup order:
      1. The cache, under the key ``wiki:data:<lowercased name>``.
      2. The ``entity_images`` table (``summary_es`` is preferred over
         ``summary`` when both are present).
      3. The Wikipedia API: Spanish first, then English when Spanish
         yields no summary.

    Whatever step 3 produces -- including "nothing found" -- is written back
    to the table and to the cache, so later calls are served from there.
    Database errors are logged and never propagate to the caller.

    Args:
        name: Entity name; used as the Wikipedia page title and as the
            ``entity_name`` key in the table.
        entity_type: Currently unused; kept so existing callers keep working.

    Returns:
        Tuple ``(image_url, summary)``. Either element may be ``None``.
        ``(None, None)`` is returned immediately for a missing, non-string
        or blank ``name`` without touching cache, database or network.
    """
    # Guard: a blank name can never match a page, and the original code
    # would have crashed on None and stored a junk row for "".
    if not isinstance(name, str) or not name.strip():
        return None, None

    # 1. Check cache
    cache_key = f"wiki:data:{name.lower()}"
    cached = cache_get(cache_key)
    if cached is not None:
        # Current format: {"image": url, "summary": text}
        if isinstance(cached, dict):
            return cached.get("image"), cached.get("summary")
        # Legacy format: a bare URL string, or the "NO_IMAGE" sentinel
        if isinstance(cached, str) and cached != "NO_IMAGE":
            return cached, None
        return None, None

    # 2. Check database
    try:
        with get_read_conn() as conn, conn.cursor() as cur:
            cur.execute(
                "SELECT image_url, summary, summary_es FROM entity_images WHERE entity_name = %s",
                (name,),
            )
            row = cur.fetchone()

        # The connection is released before the cache is written.
        if row:
            image_url, summary, summary_es = row
            # Prefer the translated summary if it exists
            final_summary = summary_es if summary_es else summary
            cache_set(
                cache_key,
                {"image": image_url, "summary": final_summary},
                ttl_seconds=CACHE_TTL,
            )
            return image_url, final_summary
    except Exception as e:
        # Best effort: a failed read falls through to the Wikipedia lookup.
        logger.error("DB read error for %s: %s", name, e)

    # 3. Fetch from Wikipedia (Spanish first)
    summary_en = None
    summary_es = None
    status_es = 'none'

    image_url, summary = _query_wikipedia_api_full(name, lang='es')
    if summary:
        summary_es = summary
        status_es = 'done'
    else:
        # Try English if Spanish produced no summary
        img_en, summ_en = _query_wikipedia_api_full(name, lang='en')
        if summ_en:
            summary = summ_en
            summary_en = summ_en
            # English text is stored and flagged for later translation
            status_es = 'pending'
        # Use the English thumbnail whenever Spanish had none, even if the
        # English page has no usable summary.
        if not image_url:
            image_url = img_en

    # 4. Persist to database (found or not)
    try:
        with get_write_conn() as conn:
            with conn.cursor() as cur:
                cur.execute(
                    """
                    INSERT INTO entity_images (entity_name, image_url, summary, summary_en, summary_es, status_es, last_checked)
                    VALUES (%s, %s, %s, %s, %s, %s, NOW())
                    ON CONFLICT (entity_name) DO UPDATE
                    SET image_url = EXCLUDED.image_url,
                        summary = EXCLUDED.summary,
                        summary_en = EXCLUDED.summary_en,
                        summary_es = EXCLUDED.summary_es,
                        status_es = EXCLUDED.status_es,
                        last_checked = NOW()
                    """,
                    (name, image_url, summary, summary_en, summary_es, status_es),
                )
            conn.commit()
    except Exception as e:
        # Best effort: the result is still cached and returned below.
        logger.error("DB write error for %s: %s", name, e)

    # 5. Cache result
    cache_set(
        cache_key,
        {"image": image_url, "summary": summary},
        ttl_seconds=CACHE_TTL,
    )

    return image_url, summary
|
|
|
|
|
|
def _query_wikipedia_api_full(query, lang='es'):
|
|
"""
|
|
Query Wikipedia API for thumbnail and summary.
|
|
"""
|
|
try:
|
|
url = f"https://{lang}.wikipedia.org/w/api.php"
|
|
params = {
|
|
"action": "query",
|
|
"format": "json",
|
|
"prop": "pageimages|extracts",
|
|
"piprop": "thumbnail",
|
|
"pithumbsize": 300, # Larger size requested
|
|
"exintro": 1,
|
|
"explaintext": 1,
|
|
"exchars": 400, # Limit chars
|
|
"titles": query,
|
|
"redirects": 1,
|
|
"origin": "*"
|
|
}
|
|
|
|
# Wikipedia requires a User-Agent
|
|
headers = {
|
|
"User-Agent": "NewsEntityStats/1.0 (internal_tool; contact@example.com)"
|
|
}
|
|
|
|
response = requests.get(url, params=params, headers=headers, timeout=2) # Fast timeout
|
|
data = response.json()
|
|
|
|
pages = data.get("query", {}).get("pages", {})
|
|
|
|
for page_id, page_data in pages.items():
|
|
if page_id == "-1":
|
|
continue # Not found
|
|
|
|
image_url = None
|
|
if "thumbnail" in page_data:
|
|
image_url = page_data["thumbnail"]["source"]
|
|
|
|
summary = page_data.get("extract")
|
|
if summary and "may refer to:" in summary: # Disambiguation page
|
|
summary = None
|
|
|
|
if image_url or summary:
|
|
return image_url, summary
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error fetching wiki data for {query} ({lang}): {e}")
|
|
|
|
return None, None
|