Initial clean commit
commit 6784d81c2c
141 changed files with 25219 additions and 0 deletions

routers/stats.py (new file, 911 lines)

@@ -0,0 +1,911 @@
from flask import Blueprint, render_template, jsonify
from db import get_read_conn
from datetime import datetime, timedelta
import os
import subprocess
import time
from cache import cached

stats_bp = Blueprint("stats", __name__, url_prefix="/stats")


# ==================================================================================
# ENTITY NORMALIZATION SYSTEM
# ==================================================================================
# Dictionary to map entity name variations to canonical names
import json

CONFIG_FILE = "entity_config.json"
_config_cache = {"data": None, "mtime": 0}


def load_entity_config():
    """Load entity config from JSON file with simple modification time caching."""
    global _config_cache
    try:
        # Check if file exists
        if not os.path.exists(CONFIG_FILE):
            return {"blacklist": [], "synonyms": {}}

        # Check modification time
        mtime = os.path.getmtime(CONFIG_FILE)
        if _config_cache["data"] is not None and mtime <= _config_cache["mtime"]:
            return _config_cache["data"]

        # Load fresh config
        with open(CONFIG_FILE, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Normalize structure
        if "blacklist" not in data: data["blacklist"] = []
        if "synonyms" not in data: data["synonyms"] = {}

        # Pre-process synonyms for reverse lookup (variation -> canonical)
        lookup = {}
        for canonical, variations in data["synonyms"].items():
            lookup[canonical.lower()] = canonical  # Map canonical to itself
            for var in variations:
                lookup[var.lower()] = canonical

        data["_lookup"] = lookup
        data["_blacklist_set"] = {x.lower() for x in data["blacklist"]}

        _config_cache = {"data": data, "mtime": mtime}
        return data

    except Exception as e:
        print(f"Error loading entity config: {e}")
        # Return fallback or previous cache if available
        return _config_cache["data"] if _config_cache["data"] else {"blacklist": [], "synonyms": {}}


def normalize_entity_name(name: str, config=None) -> str:
    """Normalize entity name to its canonical form."""
    if config is None:
        config = load_entity_config()

    lookup = config.get("_lookup", {})
    return lookup.get(name.lower(), name)


def aggregate_normalized_entities(rows, entity_type='persona'):
    """Aggregate entity counts by normalized names and filter blacklisted items.

    Args:
        rows: List of (name, count) tuples from database
        entity_type: Type of entity for normalization (kept for compatibility but config is global now)

    Returns:
        List of (normalized_name, total_count) tuples sorted by count
    """
    aggregated = {}
    config = load_entity_config()
    blacklist = config.get("_blacklist_set", set())

    for name, count in rows:
        # 1. Check blacklist (exact or lower match)
        if name.lower() in blacklist:
            continue

        # 2. Normalize
        normalized = normalize_entity_name(name, config)

        # 3. Check blacklist again (in case canonical name is blacklisted)
        if normalized.lower() in blacklist:
            continue

        aggregated[normalized] = aggregated.get(normalized, 0) + count

    # Sort by count descending
    sorted_items = sorted(aggregated.items(), key=lambda x: x[1], reverse=True)
    return sorted_items

# ==================================================================================
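# Illustrative example of the expected entity_config.json shape (field names as
# consumed above; the concrete values are project-specific and made up here):
#
#   {
#     "blacklist": ["Reuters"],
#     "synonyms": {
#       "Donald Trump": ["Trump", "Donald J. Trump"]
#     }
#   }
#
# With that config, normalize_entity_name("trump") returns "Donald Trump", and
# aggregate_normalized_entities([("Trump", 3), ("Donald Trump", 2), ("Reuters", 9)])
# yields [("Donald Trump", 5)]: variations are merged and blacklisted names dropped.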


@stats_bp.route("/")
def index():
    """Stats dashboard page."""

    # Calculate translation stats for the banner
    with get_read_conn() as conn:
        with conn.cursor() as cur:
            # Translations per minute (last 5 minutes)
            cur.execute("""
                SELECT COUNT(*) FROM traducciones
                WHERE status = 'done'
                AND created_at > NOW() - INTERVAL '5 minutes'
            """)
            recent_5min = cur.fetchone()[0]
            translations_per_min = round(recent_5min / 5, 1) if recent_5min else 0

            # Status counts
            cur.execute("SELECT COUNT(*) FROM traducciones WHERE status = 'done'")
            traducciones_count = cur.fetchone()[0]

            cur.execute("SELECT COUNT(*) FROM traducciones WHERE status = 'pending'")
            pending_count = cur.fetchone()[0]

            cur.execute("SELECT COUNT(*) FROM traducciones WHERE status = 'processing'")
            processing_count = cur.fetchone()[0]

            cur.execute("SELECT COUNT(*) FROM traducciones WHERE status = 'error'")
            error_count = cur.fetchone()[0]

            # Total noticias (exact count - cached for 5 min in view)
            cur.execute("SELECT COUNT(*) FROM noticias")
            noticias_count = cur.fetchone()[0] or 0

            # News ingested today
            cur.execute("""
                SELECT COUNT(*) FROM noticias
                WHERE DATE(fecha) = CURRENT_DATE
            """)
            noticias_hoy = cur.fetchone()[0] or 0

            # News ingested in the last hour
            cur.execute("""
                SELECT COUNT(*) FROM noticias
                WHERE fecha >= NOW() - INTERVAL '1 hour'
            """)
            noticias_ultima_hora = cur.fetchone()[0] or 0

    return render_template("stats.html",
                           translations_per_min=translations_per_min,
                           noticias_count=noticias_count,
                           traducciones_count=traducciones_count,
                           pending_count=pending_count,
                           processing_count=processing_count,
                           error_count=error_count,
                           noticias_hoy=noticias_hoy,
                           noticias_ultima_hora=noticias_ultima_hora)

@stats_bp.route("/api/activity")
@cached(ttl_seconds=300, prefix="stats")
def activity_data():
    """Get activity data (news count) for the specified range."""
    from flask import request
    range_param = request.args.get("range", "30d")

    # Default: 30d -> group by day
    days = 30
    minutes = 0
    interval_sql = "day"  # For date_trunc or casting
    timedelta_step = timedelta(days=1)
    date_format = "%Y-%m-%d"

    if range_param == "1h":
        minutes = 60
        interval_sql = "minute"
        timedelta_step = timedelta(minutes=1)
        date_format = "%H:%M"
    elif range_param == "8h":
        minutes = 480
        interval_sql = "minute"
        timedelta_step = timedelta(minutes=1)
        date_format = "%H:%M"
    elif range_param in ("1d", "24h"):  # "1d" is an alias for 24h
        minutes = 1440
        interval_sql = "hour"
        timedelta_step = timedelta(hours=1)
        date_format = "%H:%M"
    elif range_param == "7d":
        minutes = 10080
        interval_sql = "hour"
        timedelta_step = timedelta(hours=1)
        # Include month and day for 7d context
        date_format = "%d %H:%M"
    elif range_param == "30d":
        # 30d keeps day granularity and uses date casting below
        minutes = 0
        days = 30
        interval_sql = "day"
        timedelta_step = timedelta(days=1)
        date_format = "%Y-%m-%d"

    # Calculate start time
    if minutes > 0:
        start_time = datetime.utcnow() - timedelta(minutes=minutes)
        # Using the timestamp column directly
        date_column = "fecha"
    else:
        start_time = datetime.utcnow() - timedelta(days=days)
        # For 30d, align the start to the beginning of the day
        start_time = start_time.replace(hour=0, minute=0, second=0, microsecond=0)
        date_column = "fecha"

    with get_read_conn() as conn:
        with conn.cursor() as cur:
            # Construct query based on interval
            if interval_sql == "day":
                # Day granularity: cast the timestamp to a date
                cur.execute("""
                    SELECT
                        fecha::date as time_slot,
                        COUNT(*) as count
                    FROM noticias
                    WHERE fecha >= %s
                    GROUP BY time_slot
                    ORDER BY time_slot
                """, (start_time,))
            else:
                # Minute/hour granularity: truncate with date_trunc
                cur.execute(f"""
                    SELECT
                        date_trunc('{interval_sql}', fecha) as time_slot,
                        COUNT(*) as count
                    FROM noticias
                    WHERE fecha >= %s
                    GROUP BY time_slot
                    ORDER BY time_slot
                """, (start_time,))

            rows = cur.fetchall()

    # Fill gaps
    data_map = {row[0]: row[1] for row in rows}
    labels = []
    data = []

    # Iterate with step
    if minutes > 0:
        # Granular start alignment
        current = start_time.replace(second=0, microsecond=0)
        if interval_sql == "hour":
            current = current.replace(minute=0)

        end = datetime.utcnow().replace(second=0, microsecond=0)
        if interval_sql == "hour":
            end = end.replace(minute=0) + timedelta(hours=1)
    else:
        # Daily start alignment
        current = start_time.date() if isinstance(start_time, datetime) else start_time
        end = datetime.utcnow().date()

    while current <= end:
        # Format label
        labels.append(current.strftime(date_format))

        # The lookup key is a date for the ::date query and a datetime for
        # date_trunc, so handle both types when reading from data_map.
        lookup_key = current

        val = data_map.get(lookup_key, 0)
        # Fallback in case of a date/datetime mismatch on the 'day' interval
        if val == 0 and isinstance(lookup_key, datetime) and interval_sql == 'day':
            val = data_map.get(lookup_key.date(), 0)

        data.append(val)

        current += timedelta_step

    return jsonify({
        "labels": labels,
        "data": data
    })
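# Example (illustrative): GET /stats/api/activity?range=24h buckets the last 1440
# minutes by hour, so the response looks roughly like
#   {"labels": ["13:00", "14:00", ...], "data": [12, 7, ...]}
# with zero-filled entries for hours that had no rows (the counts here are made up);
# range=30d returns one "%Y-%m-%d" label per day instead.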


@stats_bp.route("/api/categories")
@cached(ttl_seconds=300, prefix="stats")
def categories_data():
    """Get news count per category (Top 8 + Others)."""
    with get_read_conn() as conn:
        with conn.cursor() as cur:
            cur.execute("""
                SELECT
                    c.nombre,
                    COUNT(n.id) as count
                FROM noticias n
                JOIN categorias c ON c.id = n.categoria_id
                GROUP BY c.nombre
                ORDER BY count DESC
            """)

            rows = cur.fetchall()

    # Process Top 8 + Others
    labels = []
    data = []
    others_count = 0
    top_limit = 8

    for i, row in enumerate(rows):
        if i < top_limit:
            labels.append(row[0])
            data.append(row[1])
        else:
            others_count += row[1]

    if others_count > 0:
        labels.append("Otros")
        data.append(others_count)

    return jsonify({
        "labels": labels,
        "data": data
    })

@stats_bp.route("/api/countries")
@cached(ttl_seconds=300, prefix="stats")
def countries_data():
    """Get news count per country (Top 10 + Others)."""
    with get_read_conn() as conn:
        with conn.cursor() as cur:
            cur.execute("""
                SELECT
                    p.nombre,
                    COUNT(n.id) as count
                FROM noticias n
                JOIN paises p ON p.id = n.pais_id
                GROUP BY p.nombre
                ORDER BY count DESC
            """)

            rows = cur.fetchall()

    # Process Top 10 + Others
    labels = []
    data = []
    others_count = 0
    top_limit = 10

    for i, row in enumerate(rows):
        if i < top_limit:
            labels.append(row[0])
            data.append(row[1])
        else:
            others_count += row[1]

    if others_count > 0:
        labels.append("Otros")
        data.append(others_count)

    return jsonify({
        "labels": labels,
        "data": data
    })

@stats_bp.route("/api/countries/list")
def countries_list():
    """Get alphabetical list of all countries with flags."""
    from utils import country_flag
    with get_read_conn() as conn:
        with conn.cursor() as cur:
            cur.execute("SELECT nombre FROM paises ORDER BY nombre ASC")
            rows = cur.fetchall()

    return jsonify([
        {"name": row[0], "flag": country_flag(row[0])}
        for row in rows
    ])


@stats_bp.route("/api/translations/activity")
def translations_activity_data():
    """Get translation count per day for the last 30 days."""
    days = 30
    start_date = (datetime.utcnow() - timedelta(days=days)).date()

    with get_read_conn() as conn:
        with conn.cursor() as cur:
            cur.execute("""
                SELECT
                    created_at::date as day,
                    COUNT(*) as count
                FROM traducciones
                WHERE created_at >= %s
                GROUP BY day
                ORDER BY day
            """, (start_date,))

            rows = cur.fetchall()

    # Fill gaps
    data_map = {row[0]: row[1] for row in rows}
    labels = []
    data = []

    current = start_date
    end = datetime.utcnow().date()

    while current <= end:
        labels.append(current.strftime("%Y-%m-%d"))
        data.append(data_map.get(current, 0))
        current += timedelta(days=1)

    return jsonify({
        "labels": labels,
        "data": data
    })


@stats_bp.route("/api/translations/languages")
@cached(ttl_seconds=60, prefix="stats")
def translations_languages_data():
    """Get translation count per source language."""
    # Friendly names for common languages
    LANG_NAMES = {
        'en': 'Inglés',
        'es': 'Español',
        'fr': 'Francés',
        'de': 'Alemán',
        'it': 'Italiano',
        'pt': 'Portugués',
        'ru': 'Ruso',
        'zh': 'Chino',
        'ja': 'Japonés',
        'ar': 'Árabe'
    }

    with get_read_conn() as conn:
        with conn.cursor() as cur:
            cur.execute("""
                SELECT
                    lang_from,
                    COUNT(*) as count
                FROM translation_stats
                WHERE lang_from IS NOT NULL
                GROUP BY lang_from
                ORDER BY count DESC
            """)

            rows = cur.fetchall()

    labels = []
    data = []
    for code, count in rows:
        code = code.strip().lower()
        labels.append(LANG_NAMES.get(code, code.upper()))
        data.append(count)

    return jsonify({
        "labels": labels,
        "data": data
    })
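# Example (illustrative): rows like [('en', 1200), ('fr', 300), ('nl', 40)] become
# labels ["Inglés", "Francés", "NL"] and data [1200, 300, 40]; codes missing from
# LANG_NAMES fall back to the upper-cased code, and the counts here are made up.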


def get_system_uptime():
    try:
        with open('/proc/uptime', 'r') as f:
            uptime_seconds = float(f.readline().split()[0])
        days = int(uptime_seconds // (24 * 3600))
        hours = int((uptime_seconds % (24 * 3600)) // 3600)
        minutes = int((uptime_seconds % 3600) // 60)
        if days > 0:
            return f"{days}d {hours}h {minutes}m"
        return f"{hours}h {minutes}m"
    except:
        return "N/A"


def get_gpu_info():
    try:
        cmd = "nvidia-smi --query-gpu=name,temperature.gpu,utilization.gpu,memory.used,memory.total --format=csv,noheader,nounits"
        with open(os.devnull, 'w') as devnull:
            res = subprocess.check_output(cmd, shell=True, stderr=devnull).decode().strip()
        parts = [p.strip() for p in res.split(',')]
        if len(parts) >= 5:
            return {
                "name": parts[0],
                "temp": f"{parts[1]}°C",
                "util": f"{parts[2]}%",
                "mem": f"{parts[3]} MB / {parts[4]} MB"
            }
    except:
        pass
    return None
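# For reference, --format=csv,noheader,nounits makes nvidia-smi print one
# comma-separated line per GPU in the order queried, e.g. (illustrative values):
#   NVIDIA GeForce RTX 3090, 62, 35, 8123, 24576
# i.e. name, temperature (°C), utilization (%), memory used and total (MiB).
# The parsing above assumes a single GPU, since extra lines would add more fields.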


def get_cpu_info():
    try:
        load = os.getloadavg()
        cores = os.cpu_count()
        return {
            "load": f"{load[0]:.2f}, {load[1]:.2f}, {load[2]:.2f}",
            "cores": cores
        }
    except:
        return None


@stats_bp.route("/api/system/info")
def system_info_api():
    """Endpoint for real-time system monitoring."""
    return jsonify({
        "uptime": get_system_uptime(),
        "gpu": get_gpu_info(),
        "cpu": get_cpu_info(),
        "timestamp": datetime.now().strftime("%H:%M:%S")
    })
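# Example (illustrative) response from /stats/api/system/info on a GPU host:
#   {"uptime": "3d 4h 12m",
#    "gpu": {"name": "NVIDIA GeForce RTX 3090", "temp": "62°C", "util": "35%", "mem": "8123 MB / 24576 MB"},
#    "cpu": {"load": "1.20, 0.80, 0.60", "cores": 16},
#    "timestamp": "14:03:27"}
# "gpu" and "cpu" are null when the corresponding probe fails; all values above are made up.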


@stats_bp.route("/api/translations/rate")
@cached(ttl_seconds=60, prefix="stats")
def translations_rate_data():
    """Get translation count for the specified range (1h, 8h, 24h, 7d)."""
    # Parameters
    from flask import request

    range_param = request.args.get("range", "1h")

    # Default: 1h -> group by minute
    minutes = 60
    interval_sql = "minute"
    timedelta_step = timedelta(minutes=1)
    date_format = "%H:%M"

    if range_param == "8h":
        minutes = 8 * 60
        # Keep minute granularity: 480 points is still manageable for the chart
        interval_sql = "minute"
        timedelta_step = timedelta(minutes=1)
        date_format = "%H:%M"

    elif range_param == "24h":
        minutes = 24 * 60
        # Group by hour to keep the series compact (24 points instead of 1440)
        interval_sql = "hour"
        timedelta_step = timedelta(hours=1)
        date_format = "%H:%M"

    elif range_param == "7d":
        minutes = 7 * 24 * 60
        interval_sql = "hour"  # 7 * 24 = 168 points
        timedelta_step = timedelta(hours=1)
        date_format = "%Y-%m-%d %H:%M"

    start_time = datetime.utcnow() - timedelta(minutes=minutes)

    with get_read_conn() as conn:
        with conn.cursor() as cur:
            # Query translation_stats instead of traducciones
            cur.execute(f"""
                SELECT
                    date_trunc('{interval_sql}', created_at) as time_slot,
                    COUNT(*) as count
                FROM translation_stats
                WHERE created_at >= %s
                GROUP BY time_slot
                ORDER BY time_slot
            """, (start_time,))

            rows = cur.fetchall()

    # Fill gaps
    data_map = {row[0]: row[1] for row in rows}
    labels = []
    data = []

    # Iterate by step, aligning start_time to the step first
    current = start_time.replace(second=0, microsecond=0)
    if interval_sql == "hour":
        current = current.replace(minute=0)

    end = datetime.utcnow().replace(second=0, microsecond=0)
    if interval_sql == "hour":
        end = end.replace(minute=0) + timedelta(hours=1)  # Ensure we cover the current partial hour

    while current <= end:
        labels.append(current.strftime(date_format))
        data.append(data_map.get(current, 0))
        current += timedelta_step

    return jsonify({
        "labels": labels,
        "data": data
    })


@stats_bp.route("/entities")
def entities_dashboard():
    """Dashboard for Named Entities statistics."""
    return render_template("stats_entities.html")

@stats_bp.route("/api/entities/people")
def entities_people():
    """Top 50 mentioned people, optionally filtered by country and/or date."""
    from flask import request
    from datetime import datetime
    from cache import cache_get, cache_set

    # 1. Check config mtime for cache invalidation
    try:
        config_mtime = os.path.getmtime(CONFIG_FILE)
    except OSError:
        config_mtime = 0

    country_filter = request.args.get("country")
    date_filter = request.args.get("date")

    # 2. Build cache key with mtime
    cache_key = f"entities:people:{country_filter}:{date_filter}:{config_mtime}"

    # 3. Try cache
    cached_data = cache_get(cache_key)
    if cached_data:
        return jsonify(cached_data)

    # Determine time range
    if date_filter:
        # Single-day query
        try:
            target_date = datetime.strptime(date_filter, "%Y-%m-%d").date()
            time_condition = "DATE(tr.created_at) = %s"
            time_params = [target_date]
        except ValueError:
            # Invalid date format, fall back to 30 days
            time_condition = "tr.created_at >= NOW() - INTERVAL '30 days'"
            time_params = []
    else:
        # Default: last 30 days
        time_condition = "tr.created_at >= NOW() - INTERVAL '30 days'"
        time_params = []

    if country_filter and country_filter != 'global':
        # Filtered by country
        query = f"""
            SELECT t.valor, COUNT(*) as menciones
            FROM tags t
            JOIN tags_noticia tn ON tn.tag_id = t.id
            JOIN traducciones tr ON tn.traduccion_id = tr.id
            JOIN noticias n ON tr.noticia_id = n.id
            WHERE t.tipo = 'persona'
            AND {time_condition}
            AND n.pais_id = (SELECT id FROM paises WHERE nombre ILIKE %s LIMIT 1)
            GROUP BY t.valor
            ORDER BY menciones DESC
        """
        params = tuple(time_params + [country_filter])
    else:
        # Global view
        query = f"""
            SELECT t.valor, COUNT(*) as menciones
            FROM tags t
            JOIN tags_noticia tn ON tn.tag_id = t.id
            JOIN traducciones tr ON tn.traduccion_id = tr.id
            WHERE t.tipo = 'persona'
            AND {time_condition}
            GROUP BY t.valor
            ORDER BY menciones DESC
        """
        params = tuple(time_params)

    with get_read_conn() as conn:
        with conn.cursor() as cur:
            cur.execute(query, params)
            rows = cur.fetchall()

    # Normalize and aggregate
    normalized_rows = aggregate_normalized_entities(rows, entity_type='persona')

    # Take top 50
    top_50 = normalized_rows[:50]

    # Enrich with Wikipedia images (parallel execution)
    from concurrent.futures import ThreadPoolExecutor
    from utils.wiki import fetch_wiki_data

    images = []
    summaries = []

    def get_image_safe(name):
        try:
            return fetch_wiki_data(name)
        except Exception:
            return None, None

    if top_50:
        names = [row[0] for row in top_50]
        with ThreadPoolExecutor(max_workers=10) as executor:
            try:
                results = list(executor.map(get_image_safe, names))

                # Unpack results
                for img, smry in results:
                    images.append(img)
                    summaries.append(smry)
            except Exception as e:
                import logging
                logging.error(f"Error fetching wiki data: {e}")
                # Fall back to empty values if threading fails
                images = [None] * len(names)
                summaries = [None] * len(names)
    else:
        images = []
        summaries = []

    result = {
        "labels": [row[0] for row in top_50],
        "data": [row[1] for row in top_50],
        "images": images,
        "summaries": summaries
    }

    # 4. Set cache
    cache_set(cache_key, result, ttl_seconds=600)

    return jsonify(result)
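# Example (illustrative): with ?country=Francia&date=2024-05-01 the cache key is
# "entities:people:Francia:2024-05-01:<config mtime>", so editing entity_config.json
# changes the mtime component and transparently invalidates the cached result.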


@stats_bp.route("/api/entities/orgs")
def entities_orgs():
    """Top mentioned organizations, optionally filtered by country."""
    from flask import request
    from cache import cache_get, cache_set

    country_filter = request.args.get("country")

    try:
        config_mtime = os.path.getmtime(CONFIG_FILE)
    except OSError:
        config_mtime = 0

    cache_key = f"entities:orgs:{country_filter}:{config_mtime}"

    cached_data = cache_get(cache_key)
    if cached_data:
        return jsonify(cached_data)

    if country_filter and country_filter != 'global':
        query = """
            SELECT t.valor, COUNT(*) as menciones
            FROM tags t
            JOIN tags_noticia tn ON tn.tag_id = t.id
            JOIN traducciones tr ON tn.traduccion_id = tr.id
            JOIN noticias n ON tr.noticia_id = n.id
            WHERE t.tipo = 'organizacion'
            AND tr.created_at >= NOW() - INTERVAL '30 days'
            AND n.pais_id = (SELECT id FROM paises WHERE nombre ILIKE %s LIMIT 1)
            GROUP BY t.valor
            ORDER BY menciones DESC
            LIMIT 50
        """
        params = (country_filter,)
    else:
        query = """
            SELECT t.valor, COUNT(*) as menciones
            FROM tags t
            JOIN tags_noticia tn ON tn.tag_id = t.id
            JOIN traducciones tr ON tn.traduccion_id = tr.id
            WHERE t.tipo = 'organizacion'
            AND tr.created_at >= NOW() - INTERVAL '30 days'
            GROUP BY t.valor
            ORDER BY menciones DESC
            LIMIT 50
        """
        params = ()

    with get_read_conn() as conn:
        with conn.cursor() as cur:
            cur.execute(query, params)
            rows = cur.fetchall()

    normalized_rows = aggregate_normalized_entities(rows, entity_type='organizacion')

    # Enrich with Wikipedia images
    from concurrent.futures import ThreadPoolExecutor
    from utils.wiki import fetch_wiki_data

    images = []
    summaries = []

    def get_info_safe(name):
        try:
            return fetch_wiki_data(name)
        except Exception:
            return None, None

    if normalized_rows:
        names = [row[0] for row in normalized_rows]
        with ThreadPoolExecutor(max_workers=10) as executor:
            results = list(executor.map(get_info_safe, names))
            for img, smry in results:
                images.append(img)
                summaries.append(smry)

    result = {
        "labels": [row[0] for row in normalized_rows],
        "data": [row[1] for row in normalized_rows],
        "images": images,
        "summaries": summaries
    }

    cache_set(cache_key, result, ttl_seconds=600)
    return jsonify(result)


@stats_bp.route("/api/entities/places")
def entities_places():
    """Top mentioned places, optionally filtered by country."""
    from flask import request
    from cache import cache_get, cache_set

    country_filter = request.args.get("country")

    try:
        config_mtime = os.path.getmtime(CONFIG_FILE)
    except OSError:
        config_mtime = 0

    cache_key = f"entities:places:{country_filter}:{config_mtime}"

    cached_data = cache_get(cache_key)
    if cached_data:
        return jsonify(cached_data)

    if country_filter and country_filter != 'global':
        query = """
            SELECT t.valor, COUNT(*) as menciones
            FROM tags t
            JOIN tags_noticia tn ON tn.tag_id = t.id
            JOIN traducciones tr ON tn.traduccion_id = tr.id
            JOIN noticias n ON tr.noticia_id = n.id
            WHERE t.tipo = 'lugar'
            AND tr.created_at >= NOW() - INTERVAL '30 days'
            AND n.pais_id = (SELECT id FROM paises WHERE nombre ILIKE %s LIMIT 1)
            AND n.pais_id != (SELECT id FROM paises WHERE nombre = 'España')
            GROUP BY t.valor
            ORDER BY menciones DESC
            LIMIT 50
        """
        params = (country_filter,)
    else:
        query = """
            SELECT t.valor, COUNT(*) as menciones
            FROM tags t
            JOIN tags_noticia tn ON tn.tag_id = t.id
            JOIN traducciones tr ON tn.traduccion_id = tr.id
            JOIN noticias n ON tr.noticia_id = n.id
            WHERE t.tipo = 'lugar'
            AND tr.created_at >= NOW() - INTERVAL '30 days'
            AND n.pais_id != (SELECT id FROM paises WHERE nombre = 'España')
            GROUP BY t.valor
            ORDER BY menciones DESC
            LIMIT 50
        """
        params = ()

    with get_read_conn() as conn:
        with conn.cursor() as cur:
            cur.execute(query, params)
            rows = cur.fetchall()

    # Normalize
    normalized_rows = aggregate_normalized_entities(rows, entity_type='lugar')

    # Enrich with Wikipedia images
    from concurrent.futures import ThreadPoolExecutor
    from utils.wiki import fetch_wiki_data

    images = []
    summaries = []

    def get_info_safe(name):
        try:
            return fetch_wiki_data(name)
        except Exception:
            return None, None

    if normalized_rows:
        names = [row[0] for row in normalized_rows]
        with ThreadPoolExecutor(max_workers=10) as executor:
            results = list(executor.map(get_info_safe, names))
            for img, smry in results:
                images.append(img)
                summaries.append(smry)

    result = {
        "labels": [row[0] for row in normalized_rows],
        "data": [row[1] for row in normalized_rows],
        "images": images,
        "summaries": summaries
    }

    cache_set(cache_key, result, ttl_seconds=600)
    return jsonify(result)