Initial clean commit
This commit is contained in:
commit
6784d81c2c
141 changed files with 25219 additions and 0 deletions
139
utils/wiki.py
Normal file
139
utils/wiki.py
Normal file
|
|
@ -0,0 +1,139 @@
|
|||
import requests
|
||||
import logging
|
||||
from cache import cache_get, cache_set
|
||||
from db import get_read_conn, get_write_conn
|
||||
|
||||
# Module-level logger, named after this module so log output can be filtered.
logger = logging.getLogger(__name__)

# Cache for 24 hours (TTL in seconds for cached wiki lookups).
CACHE_TTL = 86400
|
||||
|
||||
def fetch_wiki_data(name, entity_type=None):
    """
    Fetch image URL AND summary from Wikipedia API for any entity.

    Lookup order: shared cache, then the `entity_images` DB table, then the
    live Wikipedia API (Spanish first, English as a fallback). Fresh API
    results are persisted to the DB and cached, including negative results,
    so repeated misses do not re-hit the API.

    Args:
        name: Entity name; used as the Wikipedia page title and DB key.
        entity_type: Currently unused; accepted for caller compatibility.

    Returns tuple: (image_url, summary) — either element may be None.
    """
    # 1. Check Cache
    cache_key = f"wiki:data:{name.lower()}"
    cached_data = cache_get(cache_key)
    if cached_data is not None:
        # Cache stores dict: {"image": url, "summary": text}
        if isinstance(cached_data, dict):
            return cached_data.get("image"), cached_data.get("summary")
        # Legacy cache (string URL only)? Support migration or ignore
        if isinstance(cached_data, str) and cached_data != "NO_IMAGE":
            return cached_data, None
        # Legacy "NO_IMAGE" sentinel (or unrecognized type): treat as a cached miss.
        return None, None

    # 2. Check Database
    try:
        with get_read_conn() as conn:
            with conn.cursor() as cur:
                cur.execute("SELECT image_url, summary, summary_es FROM entity_images WHERE entity_name = %s", (name,))
                row = cur.fetchone()
                if row:
                    image_url, summary, summary_es = row
                    # Prefer the translated summary if it exists
                    final_summary = summary_es if summary_es else summary

                    # Update cache and return
                    cache_value = {"image": image_url, "summary": final_summary}
                    cache_set(cache_key, cache_value, ttl_seconds=CACHE_TTL)
                    return image_url, final_summary
    except Exception as e:
        # Best-effort read: a DB outage degrades to the live API fetch below.
        logger.error(f"DB read error for {name}: {e}")

    # 3. Fetch from Wikipedia
    summary_en = None
    summary_es = None
    # status_es tracks translation state: 'none' (nothing), 'done' (Spanish
    # summary present), 'pending' (English only; translation still needed).
    status_es = 'none'

    image_url, summary = _query_wikipedia_api_full(name, lang='es')
    if summary:
        summary_es = summary
        status_es = 'done'
    else:
        # Try English if Spanish failed
        img_en, summ_en = _query_wikipedia_api_full(name, lang='en')
        if summ_en:
            summary = summ_en
            summary_en = summ_en
            status_es = 'pending'
        # Keep the Spanish thumbnail when present; otherwise use the English one.
        if not image_url:
            image_url = img_en

    # 4. Persist to Database (found or not)
    try:
        with get_write_conn() as conn:
            with conn.cursor() as cur:
                # Upsert keyed on entity_name; NULLs are written on a miss so
                # last_checked still records the attempt.
                cur.execute("""
                    INSERT INTO entity_images (entity_name, image_url, summary, summary_en, summary_es, status_es, last_checked)
                    VALUES (%s, %s, %s, %s, %s, %s, NOW())
                    ON CONFLICT (entity_name) DO UPDATE
                    SET image_url = EXCLUDED.image_url,
                        summary = EXCLUDED.summary,
                        summary_en = EXCLUDED.summary_en,
                        summary_es = EXCLUDED.summary_es,
                        status_es = EXCLUDED.status_es,
                        last_checked = NOW()
                """, (name, image_url, summary, summary_en, summary_es, status_es))
                conn.commit()
    except Exception as e:
        logger.error(f"DB write error for {name}: {e}")

    # 5. Cache Result
    cache_value = {"image": image_url, "summary": summary}
    cache_set(cache_key, cache_value, ttl_seconds=CACHE_TTL)

    return image_url, summary
|
||||
|
||||
|
||||
def _query_wikipedia_api_full(query, lang='es'):
    """
    Query the Wikipedia API of *lang* for a thumbnail and intro summary.

    Args:
        query: Page title to look up (redirects are followed).
        lang: Wikipedia language subdomain, e.g. 'es' or 'en'.

    Returns:
        (image_url, summary) tuple; either element may be None. Returns
        (None, None) for page-not-found, disambiguation-only pages, and
        any network/parse error (errors are logged, never raised).
    """
    try:
        url = f"https://{lang}.wikipedia.org/w/api.php"
        params = {
            "action": "query",
            "format": "json",
            "prop": "pageimages|extracts",
            "piprop": "thumbnail",
            "pithumbsize": 300,  # Larger size requested
            "exintro": 1,        # Intro section only
            "explaintext": 1,    # Plain text, no HTML
            "exchars": 400,      # Limit chars
            "titles": query,
            "redirects": 1,
            "origin": "*",
        }

        # Wikipedia requires a User-Agent
        headers = {
            "User-Agent": "NewsEntityStats/1.0 (internal_tool; contact@example.com)"
        }

        response = requests.get(url, params=params, headers=headers, timeout=2)  # Fast timeout
        # FIX: surface HTTP errors (4xx/5xx) instead of attempting to parse
        # an error body as a normal API response.
        response.raise_for_status()
        data = response.json()

        pages = data.get("query", {}).get("pages", {})

        for page_id, page_data in pages.items():
            if page_id == "-1":
                continue  # Not found

            image_url = None
            if "thumbnail" in page_data:
                image_url = page_data["thumbnail"]["source"]

            summary = page_data.get("extract")
            # Disambiguation page check. FIX: previously only the English
            # phrase was matched even though the default lang is 'es'; also
            # match the Spanish disambiguation phrase.
            if summary and ("may refer to:" in summary or "puede referirse a" in summary):
                summary = None

            if image_url or summary:
                return image_url, summary

    except Exception as e:
        # Lazy %-args: the message is only formatted if the record is emitted.
        logger.error("Error fetching wiki data for %s (%s): %s", query, lang, e)

    return None, None
|
||||
Loading…
Add table
Add a link
Reference in a new issue