from flask import Blueprint, render_template, jsonify

from db import get_read_conn
from datetime import datetime, timedelta
import os
import subprocess
import time

from cache import cached

stats_bp = Blueprint("stats", __name__, url_prefix="/stats")


# ==================================================================================
# ENTITY NORMALIZATION SYSTEM
# ==================================================================================
# Maps entity name variations to canonical names and keeps a blacklist of
# entries to drop from the rankings. Both are loaded from a JSON config file.
import json

CONFIG_FILE = "entity_config.json"
_config_cache = {"data": None, "mtime": 0}
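
# Illustrative layout of entity_config.json (the entries below are made-up
# examples; only the "blacklist" and "synonyms" keys are assumed by this module):
#
# {
#     "blacklist": ["Reuters"],
#     "synonyms": {
#         "Donald Trump": ["Trump", "Donald J. Trump"]
#     }
# }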


def load_entity_config():
    """Load the entity config from JSON with simple modification-time caching."""
    global _config_cache
    try:
        # Check if file exists
        if not os.path.exists(CONFIG_FILE):
            return {"blacklist": [], "synonyms": {}}

        # Check modification time
        mtime = os.path.getmtime(CONFIG_FILE)
        if _config_cache["data"] is not None and mtime <= _config_cache["mtime"]:
            return _config_cache["data"]

        # Load fresh config
        with open(CONFIG_FILE, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Normalize structure
        if "blacklist" not in data:
            data["blacklist"] = []
        if "synonyms" not in data:
            data["synonyms"] = {}

        # Pre-process synonyms for reverse lookup (variation -> canonical)
        lookup = {}
        for canonical, variations in data["synonyms"].items():
            lookup[canonical.lower()] = canonical  # Map canonical to itself
            for var in variations:
                lookup[var.lower()] = canonical

        data["_lookup"] = lookup
        data["_blacklist_set"] = {x.lower() for x in data["blacklist"]}

        _config_cache = {"data": data, "mtime": mtime}
        return data

    except Exception as e:
        print(f"Error loading entity config: {e}")
        # Return the previous cache if available, otherwise an empty fallback
        return _config_cache["data"] if _config_cache["data"] else {"blacklist": [], "synonyms": {}}


def normalize_entity_name(name: str, config=None) -> str:
    """Normalize an entity name to its canonical form."""
    if config is None:
        config = load_entity_config()

    lookup = config.get("_lookup", {})
    return lookup.get(name.lower(), name)


def aggregate_normalized_entities(rows, entity_type='persona'):
    """Aggregate entity counts under normalized names and drop blacklisted items.

    Args:
        rows: List of (name, count) tuples from the database.
        entity_type: Entity type (kept for API compatibility; the config is global now).

    Returns:
        List of (normalized_name, total_count) tuples sorted by count descending.
    """
    aggregated = {}
    config = load_entity_config()
    blacklist = config.get("_blacklist_set", set())

    for name, count in rows:
        # 1. Skip names on the blacklist (case-insensitive match)
        if name.lower() in blacklist:
            continue

        # 2. Normalize to the canonical name
        normalized = normalize_entity_name(name, config)

        # 3. Check the blacklist again, in case the canonical name is blacklisted
        if normalized.lower() in blacklist:
            continue

        aggregated[normalized] = aggregated.get(normalized, 0) + count

    # Sort by count descending
    sorted_items = sorted(aggregated.items(), key=lambda x: x[1], reverse=True)
    return sorted_items
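
# Worked example with made-up data: given rows [("J. Doe", 3), ("John Doe", 5), ("Spam Corp", 9)],
# synonyms {"John Doe": ["J. Doe"]} and blacklist ["Spam Corp"], the result is [("John Doe", 8)].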

# ==================================================================================


@stats_bp.route("/")
@cached(ttl_seconds=600, prefix="stats_index")
def index():
    """Stats dashboard page."""
    # Calculate translation stats for the banner
    with get_read_conn() as conn:
        with conn.cursor() as cur:
            # Translations per minute (last 5 minutes)
            cur.execute("""
                SELECT COUNT(*) FROM traducciones
                WHERE status = 'done'
                AND created_at > NOW() - INTERVAL '5 minutes'
            """)
            recent_5min = cur.fetchone()[0]
            translations_per_min = round(recent_5min / 5, 1) if recent_5min else 0

            # Status counts
            cur.execute("SELECT COUNT(*) FROM traducciones WHERE status = 'done'")
            traducciones_count = cur.fetchone()[0]

            cur.execute("SELECT COUNT(*) FROM traducciones WHERE status = 'pending'")
            pending_count = cur.fetchone()[0]

            cur.execute("SELECT COUNT(*) FROM traducciones WHERE status = 'processing'")
            processing_count = cur.fetchone()[0]

            cur.execute("SELECT COUNT(*) FROM traducciones WHERE status = 'error'")
            error_count = cur.fetchone()[0]

            # Total noticias (exact count; the whole view response is cached for 10 min)
            cur.execute("SELECT COUNT(*) FROM noticias")
            noticias_count = cur.fetchone()[0] or 0

            # News ingested today
            cur.execute("""
                SELECT COUNT(*) FROM noticias
                WHERE DATE(fecha) = CURRENT_DATE
            """)
            noticias_hoy = cur.fetchone()[0] or 0

            # News ingested in the last hour
            cur.execute("""
                SELECT COUNT(*) FROM noticias
                WHERE fecha >= NOW() - INTERVAL '1 hour'
            """)
            noticias_ultima_hora = cur.fetchone()[0] or 0

    return render_template("stats.html",
                           translations_per_min=translations_per_min,
                           noticias_count=noticias_count,
                           traducciones_count=traducciones_count,
                           pending_count=pending_count,
                           processing_count=processing_count,
                           error_count=error_count,
                           noticias_hoy=noticias_hoy,
                           noticias_ultima_hora=noticias_ultima_hora)


@stats_bp.route("/api/activity")
@cached(ttl_seconds=300, prefix="stats")
def activity_data():
    """Get activity data (news count) for the specified range."""
    from flask import request
    range_param = request.args.get("range", "30d")

    # Default: 30d -> group by day
    days = 30
    minutes = 0
    interval_sql = "day"  # For date_trunc or ::date casting
    timedelta_step = timedelta(days=1)
    date_format = "%Y-%m-%d"

    if range_param == "1h":
        minutes = 60
        interval_sql = "minute"
        timedelta_step = timedelta(minutes=1)
        date_format = "%H:%M"
    elif range_param == "8h":
        minutes = 480
        interval_sql = "minute"
        timedelta_step = timedelta(minutes=1)
        date_format = "%H:%M"
    elif range_param == "1d":  # Alias for 24h
        minutes = 1440
        interval_sql = "hour"
        timedelta_step = timedelta(hours=1)
        date_format = "%H:%M"
    elif range_param == "24h":
        minutes = 1440
        interval_sql = "hour"
        timedelta_step = timedelta(hours=1)
        date_format = "%H:%M"
    elif range_param == "7d":
        minutes = 10080
        interval_sql = "hour"
        timedelta_step = timedelta(hours=1)
        # Include the day of month for 7d context
        date_format = "%d %H:%M"
    elif range_param == "30d":
        # Same as the defaults above; kept explicit for readability
        minutes = 0
        days = 30
        interval_sql = "day"
        timedelta_step = timedelta(days=1)
        date_format = "%Y-%m-%d"

    # Calculate start time
    if minutes > 0:
        start_time = datetime.utcnow() - timedelta(minutes=minutes)
        # Using the timestamp column directly
        date_column = "fecha"
    else:
        start_time = datetime.utcnow() - timedelta(days=days)
        # For daily buckets, start at midnight
        start_time = start_time.replace(hour=0, minute=0, second=0, microsecond=0)
        date_column = "fecha"

    with get_read_conn() as conn:
        with conn.cursor() as cur:
            # Construct the query based on the interval
            if interval_sql == "day":
                # Daily buckets via a ::date cast
                cur.execute("""
                    SELECT
                        fecha::date as time_slot,
                        COUNT(*) as count
                    FROM noticias
                    WHERE fecha >= %s
                    GROUP BY time_slot
                    ORDER BY time_slot
                """, (start_time,))
            else:
                # Sub-daily buckets via date_trunc
                cur.execute(f"""
                    SELECT
                        date_trunc('{interval_sql}', fecha) as time_slot,
                        COUNT(*) as count
                    FROM noticias
                    WHERE fecha >= %s
                    GROUP BY time_slot
                    ORDER BY time_slot
                """, (start_time,))

            rows = cur.fetchall()

    # Fill gaps
    data_map = {row[0]: row[1] for row in rows}
    labels = []
    data = []

    # Align the iteration start/end to the bucket size
    if minutes > 0:
        current = start_time.replace(second=0, microsecond=0)
        if interval_sql == "hour":
            current = current.replace(minute=0)

        end = datetime.utcnow().replace(second=0, microsecond=0)
        if interval_sql == "hour":
            end = end.replace(minute=0) + timedelta(hours=1)
    else:
        # Daily alignment: iterate over dates
        current = start_time.date() if isinstance(start_time, datetime) else start_time
        end = datetime.utcnow().date()

    while current <= end:
        # Format label
        labels.append(current.strftime(date_format))

        # The lookup key is a date object for ::date buckets and a datetime for
        # date_trunc buckets, matching the type of `current` built above.
        lookup_key = current
        val = data_map.get(lookup_key, 0)
        # Fallback in case a datetime key has to match a date bucket
        if val == 0 and isinstance(lookup_key, datetime) and interval_sql == 'day':
            val = data_map.get(lookup_key.date(), 0)

        data.append(val)

        current += timedelta_step

    return jsonify({
        "labels": labels,
        "data": data
    })
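
# Example response shape for /stats/api/activity?range=24h (values are illustrative):
# {"labels": ["00:00", "01:00", ...], "data": [12, 7, ...]}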


@stats_bp.route("/api/categories")
@cached(ttl_seconds=300, prefix="stats")
def categories_data():
    """Get news count per category (Top 8 + Others)."""
    with get_read_conn() as conn:
        with conn.cursor() as cur:
            cur.execute("""
                SELECT
                    c.nombre,
                    COUNT(n.id) as count
                FROM noticias n
                JOIN categorias c ON c.id = n.categoria_id
                GROUP BY c.nombre
                ORDER BY count DESC
            """)

            rows = cur.fetchall()

    # Process Top 8 + Others
    labels = []
    data = []
    others_count = 0
    top_limit = 8

    for i, row in enumerate(rows):
        if i < top_limit:
            labels.append(row[0])
            data.append(row[1])
        else:
            others_count += row[1]

    if others_count > 0:
        labels.append("Otros")
        data.append(others_count)

    return jsonify({
        "labels": labels,
        "data": data
    })


@stats_bp.route("/api/countries")
@cached(ttl_seconds=300, prefix="stats")
def countries_data():
    """Get news count per country (Top 10 + Others)."""
    with get_read_conn() as conn:
        with conn.cursor() as cur:
            cur.execute("""
                SELECT
                    p.nombre,
                    COUNT(n.id) as count
                FROM noticias n
                JOIN paises p ON p.id = n.pais_id
                GROUP BY p.nombre
                ORDER BY count DESC
            """)

            rows = cur.fetchall()

    # Process Top 10 + Others
    labels = []
    data = []
    others_count = 0
    top_limit = 10

    for i, row in enumerate(rows):
        if i < top_limit:
            labels.append(row[0])
            data.append(row[1])
        else:
            others_count += row[1]

    # Append the aggregated "Otros" bucket, as the docstring promises
    if others_count > 0:
        labels.append("Otros")
        data.append(others_count)

    return jsonify({
        "labels": labels,
        "data": data
    })


@stats_bp.route("/api/countries/list")
def countries_list():
    """Get an alphabetical list of all countries with their flags."""
    from utils import country_flag

    with get_read_conn() as conn:
        with conn.cursor() as cur:
            cur.execute("SELECT nombre FROM paises ORDER BY nombre ASC")
            rows = cur.fetchall()

    return jsonify([
        {"name": row[0], "flag": country_flag(row[0])}
        for row in rows
    ])


@stats_bp.route("/api/translations/activity")
def translations_activity_data():
    """Get translation count per day for the last 30 days."""
    days = 30
    start_date = (datetime.utcnow() - timedelta(days=days)).date()

    with get_read_conn() as conn:
        with conn.cursor() as cur:
            cur.execute("""
                SELECT
                    created_at::date as day,
                    COUNT(*) as count
                FROM traducciones
                WHERE created_at >= %s
                GROUP BY day
                ORDER BY day
            """, (start_date,))

            rows = cur.fetchall()

    # Fill gaps
    data_map = {row[0]: row[1] for row in rows}
    labels = []
    data = []

    current = start_date
    end = datetime.utcnow().date()

    while current <= end:
        labels.append(current.strftime("%Y-%m-%d"))
        data.append(data_map.get(current, 0))
        current += timedelta(days=1)

    return jsonify({
        "labels": labels,
        "data": data
    })


@stats_bp.route("/api/translations/languages")
@cached(ttl_seconds=60, prefix="stats")
def translations_languages_data():
    """Get translation count per source language."""
    # Friendly names for common languages
    LANG_NAMES = {
        'en': 'Inglés',
        'es': 'Español',
        'fr': 'Francés',
        'de': 'Alemán',
        'it': 'Italiano',
        'pt': 'Portugués',
        'ru': 'Ruso',
        'zh': 'Chino',
        'ja': 'Japonés',
        'ar': 'Árabe'
    }

    with get_read_conn() as conn:
        with conn.cursor() as cur:
            cur.execute("""
                SELECT
                    lang_from,
                    COUNT(*) as count
                FROM translation_stats
                WHERE lang_from IS NOT NULL
                GROUP BY lang_from
                ORDER BY count DESC
            """)

            rows = cur.fetchall()

    labels = []
    data = []
    for code, count in rows:
        code = code.strip().lower()
        labels.append(LANG_NAMES.get(code, code.upper()))
        data.append(count)

    return jsonify({
        "labels": labels,
        "data": data
    })


def get_system_uptime():
    try:
        with open('/proc/uptime', 'r') as f:
            uptime_seconds = float(f.readline().split()[0])
        days = int(uptime_seconds // (24 * 3600))
        hours = int((uptime_seconds % (24 * 3600)) // 3600)
        minutes = int((uptime_seconds % 3600) // 60)
        if days > 0:
            return f"{days}d {hours}h {minutes}m"
        return f"{hours}h {minutes}m"
    except Exception:
        return "N/A"


def get_gpu_info():
    try:
        cmd = "nvidia-smi --query-gpu=name,temperature.gpu,utilization.gpu,memory.used,memory.total --format=csv,noheader,nounits"
        with open(os.devnull, 'w') as devnull:
            res = subprocess.check_output(cmd, shell=True, stderr=devnull).decode().strip()
        parts = [p.strip() for p in res.split(',')]
        if len(parts) >= 5:
            return {
                "name": parts[0],
                "temp": f"{parts[1]}°C",
                "util": f"{parts[2]}%",
                "mem": f"{parts[3]} MB / {parts[4]} MB"
            }
    except Exception:
        pass
    return None
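
# With --format=csv,noheader,nounits, nvidia-smi prints one comma-separated line per GPU,
# roughly "NVIDIA GeForce RTX 3090, 45, 12, 3456, 24576" (name, temp, util %, mem used, mem total);
# the model name and figures shown here are only illustrative.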


def get_cpu_info():
    try:
        load = os.getloadavg()
        cores = os.cpu_count()
        return {
            "load": f"{load[0]:.2f}, {load[1]:.2f}, {load[2]:.2f}",
            "cores": cores
        }
    except Exception:
        return None


@stats_bp.route("/api/system/info")
@cached(ttl_seconds=40, prefix="system_info")
def system_info_api():
    """Endpoint for real-time system monitoring."""
    return jsonify({
        "uptime": get_system_uptime(),
        "gpu": get_gpu_info(),
        "cpu": get_cpu_info(),
        "timestamp": datetime.now().strftime("%H:%M:%S")
    })
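
# Example payload (illustrative values; "gpu" and "cpu" are null when unavailable):
# {"uptime": "3d 4h 12m",
#  "gpu": {"name": "...", "temp": "45°C", "util": "12%", "mem": "3456 MB / 24576 MB"},
#  "cpu": {"load": "0.42, 0.35, 0.30", "cores": 8}, "timestamp": "14:03:22"}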


@stats_bp.route("/api/translations/rate")
@cached(ttl_seconds=60, prefix="stats")
def translations_rate_data():
    """Get translation counts for the specified range (1h, 8h, 24h, 7d)."""
    from flask import request

    range_param = request.args.get("range", "1h")

    # Default: 1h -> group by minute
    minutes = 60
    interval_sql = "minute"
    timedelta_step = timedelta(minutes=1)
    date_format = "%H:%M"

    if range_param == "8h":
        minutes = 8 * 60
        # Still group by minute: 480 points is dense but manageable
        interval_sql = "minute"
        timedelta_step = timedelta(minutes=1)
        date_format = "%H:%M"

    elif range_param == "24h":
        minutes = 24 * 60
        # Group by hour to keep the series readable (24 points instead of 1440)
        interval_sql = "hour"
        timedelta_step = timedelta(hours=1)
        date_format = "%H:%M"

    elif range_param == "7d":
        minutes = 7 * 24 * 60
        interval_sql = "hour"  # 7 * 24 = 168 points
        timedelta_step = timedelta(hours=1)
        date_format = "%Y-%m-%d %H:%M"

    start_time = datetime.utcnow() - timedelta(minutes=minutes)

    with get_read_conn() as conn:
        with conn.cursor() as cur:
            # Query translation_stats instead of traducciones
            cur.execute(f"""
                SELECT
                    date_trunc('{interval_sql}', created_at) as time_slot,
                    COUNT(*) as count
                FROM translation_stats
                WHERE created_at >= %s
                GROUP BY time_slot
                ORDER BY time_slot
            """, (start_time,))

            rows = cur.fetchall()

    # Fill gaps
    data_map = {row[0]: row[1] for row in rows}
    labels = []
    data = []

    # Align start_time to the bucket boundary before iterating
    current = start_time.replace(second=0, microsecond=0)
    if interval_sql == "hour":
        current = current.replace(minute=0)

    end = datetime.utcnow().replace(second=0, microsecond=0)
    if interval_sql == "hour":
        end = end.replace(minute=0) + timedelta(hours=1)  # Cover the current partial hour

    while current <= end:
        labels.append(current.strftime(date_format))
        data.append(data_map.get(current, 0))
        current += timedelta_step

    return jsonify({
        "labels": labels,
        "data": data
    })


@stats_bp.route("/entities")
def entities_dashboard():
    """Dashboard for named-entity statistics."""
    return render_template("stats_entities.html")


@stats_bp.route("/api/entities/people")
def entities_people():
    """Top 50 mentioned people, optionally filtered by country and/or date."""
    from flask import request
    from cache import cache_get, cache_set

    # 1. Check the config mtime for cache invalidation
    try:
        config_mtime = os.path.getmtime(CONFIG_FILE)
    except OSError:
        config_mtime = 0

    country_filter = request.args.get("country")
    date_filter = request.args.get("date")

    # 2. Build the cache key with the config mtime
    cache_key = f"entities:people:{country_filter}:{date_filter}:{config_mtime}"
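    # Including config_mtime in the key invalidates cached results whenever
    # entity_config.json changes, e.g. (illustrative) "entities:people:Francia:None:1715612400.0".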

    # 3. Try the cache
    cached_data = cache_get(cache_key)
    if cached_data:
        return jsonify(cached_data)

    # Determine the time range
    if date_filter:
        # Single-day query
        try:
            target_date = datetime.strptime(date_filter, "%Y-%m-%d").date()
            time_condition = "DATE(tr.created_at) = %s"
            time_params = [target_date]
        except ValueError:
            # Invalid date format, fall back to the last 30 days
            time_condition = "tr.created_at >= NOW() - INTERVAL '30 days'"
            time_params = []
    else:
        # Default: last 30 days
        time_condition = "tr.created_at >= NOW() - INTERVAL '30 days'"
        time_params = []

    if country_filter and country_filter != 'global':
        # Filtered by country
        query = f"""
            SELECT t.valor, COUNT(*) as menciones
            FROM tags t
            JOIN tags_noticia tn ON tn.tag_id = t.id
            JOIN traducciones tr ON tn.traduccion_id = tr.id
            JOIN noticias n ON tr.noticia_id = n.id
            WHERE t.tipo = 'persona'
            AND {time_condition}
            AND n.pais_id = (SELECT id FROM paises WHERE nombre ILIKE %s LIMIT 1)
            GROUP BY t.valor
            ORDER BY menciones DESC
        """
        params = tuple(time_params + [country_filter])
    else:
        # Global view
        query = f"""
            SELECT t.valor, COUNT(*) as menciones
            FROM tags t
            JOIN tags_noticia tn ON tn.tag_id = t.id
            JOIN traducciones tr ON tn.traduccion_id = tr.id
            WHERE t.tipo = 'persona'
            AND {time_condition}
            GROUP BY t.valor
            ORDER BY menciones DESC
        """
        params = tuple(time_params)

    with get_read_conn() as conn:
        with conn.cursor() as cur:
            cur.execute(query, params)
            rows = cur.fetchall()

    # Normalize and aggregate
    normalized_rows = aggregate_normalized_entities(rows, entity_type='persona')

    # Take the top 50
    top_50 = normalized_rows[:50]

    # Enrich with Wikipedia images (parallel execution)
    from concurrent.futures import ThreadPoolExecutor
    from utils.wiki import fetch_wiki_data

    images = []
    summaries = []

    def get_image_safe(name):
        try:
            return fetch_wiki_data(name)
        except Exception:
            return None, None

    if top_50:
        names = [row[0] for row in top_50]
        with ThreadPoolExecutor(max_workers=10) as executor:
            try:
                results = list(executor.map(get_image_safe, names))

                # Unpack results
                for img, smry in results:
                    images.append(img)
                    summaries.append(smry)
            except Exception as e:
                import logging
                logging.error(f"Error fetching wiki data: {e}")
                # Fall back to empty values if the parallel fetch fails
                images = [None] * len(names)
                summaries = [None] * len(names)
    else:
        images = []
        summaries = []

    result = {
        "labels": [row[0] for row in top_50],
        "data": [row[1] for row in top_50],
        "images": images,
        "summaries": summaries
    }

    # 4. Store in the cache
    cache_set(cache_key, result, ttl_seconds=600)

    return jsonify(result)
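
# Example request (illustrative values): GET /stats/api/entities/people?country=Francia&date=2024-05-01
# Response shape: {"labels": [...], "data": [...], "images": [...], "summaries": [...]}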


@stats_bp.route("/api/entities/orgs")
def entities_orgs():
    """Top mentioned organizations, optionally filtered by country."""
    from flask import request
    from cache import cache_get, cache_set

    country_filter = request.args.get("country")

    try:
        config_mtime = os.path.getmtime(CONFIG_FILE)
    except OSError:
        config_mtime = 0

    cache_key = f"entities:orgs:{country_filter}:{config_mtime}"

    cached_data = cache_get(cache_key)
    if cached_data:
        return jsonify(cached_data)

    if country_filter and country_filter != 'global':
        query = """
            SELECT t.valor, COUNT(*) as menciones
            FROM tags t
            JOIN tags_noticia tn ON tn.tag_id = t.id
            JOIN traducciones tr ON tn.traduccion_id = tr.id
            JOIN noticias n ON tr.noticia_id = n.id
            WHERE t.tipo = 'organizacion'
            AND tr.created_at >= NOW() - INTERVAL '30 days'
            AND n.pais_id = (SELECT id FROM paises WHERE nombre ILIKE %s LIMIT 1)
            GROUP BY t.valor
            ORDER BY menciones DESC
            LIMIT 50
        """
        params = (country_filter,)
    else:
        query = """
            SELECT t.valor, COUNT(*) as menciones
            FROM tags t
            JOIN tags_noticia tn ON tn.tag_id = t.id
            JOIN traducciones tr ON tn.traduccion_id = tr.id
            WHERE t.tipo = 'organizacion'
            AND tr.created_at >= NOW() - INTERVAL '30 days'
            GROUP BY t.valor
            ORDER BY menciones DESC
            LIMIT 50
        """
        params = ()

    with get_read_conn() as conn:
        with conn.cursor() as cur:
            cur.execute(query, params)
            rows = cur.fetchall()

    normalized_rows = aggregate_normalized_entities(rows, entity_type='organizacion')

    # Enrich with Wikipedia images
    from concurrent.futures import ThreadPoolExecutor
    from utils.wiki import fetch_wiki_data

    images = []
    summaries = []

    def get_info_safe(name):
        try:
            return fetch_wiki_data(name)
        except Exception:
            return None, None

    if normalized_rows:
        names = [row[0] for row in normalized_rows]
        with ThreadPoolExecutor(max_workers=10) as executor:
            results = list(executor.map(get_info_safe, names))
        for img, smry in results:
            images.append(img)
            summaries.append(smry)

    result = {
        "labels": [row[0] for row in normalized_rows],
        "data": [row[1] for row in normalized_rows],
        "images": images,
        "summaries": summaries
    }

    cache_set(cache_key, result, ttl_seconds=600)
    return jsonify(result)


@stats_bp.route("/api/entities/places")
def entities_places():
    """Top mentioned places, optionally filtered by country."""
    from flask import request
    from cache import cache_get, cache_set

    country_filter = request.args.get("country")

    try:
        config_mtime = os.path.getmtime(CONFIG_FILE)
    except OSError:
        config_mtime = 0

    cache_key = f"entities:places:{country_filter}:{config_mtime}"

    cached_data = cache_get(cache_key)
    if cached_data:
        return jsonify(cached_data)

    if country_filter and country_filter != 'global':
        query = """
            SELECT t.valor, COUNT(*) as menciones
            FROM tags t
            JOIN tags_noticia tn ON tn.tag_id = t.id
            JOIN traducciones tr ON tn.traduccion_id = tr.id
            JOIN noticias n ON tr.noticia_id = n.id
            WHERE t.tipo = 'lugar'
            AND tr.created_at >= NOW() - INTERVAL '30 days'
            AND n.pais_id = (SELECT id FROM paises WHERE nombre ILIKE %s LIMIT 1)
            AND n.pais_id != (SELECT id FROM paises WHERE nombre = 'España')
            GROUP BY t.valor
            ORDER BY menciones DESC
            LIMIT 50
        """
        params = (country_filter,)
    else:
        query = """
            SELECT t.valor, COUNT(*) as menciones
            FROM tags t
            JOIN tags_noticia tn ON tn.tag_id = t.id
            JOIN traducciones tr ON tn.traduccion_id = tr.id
            JOIN noticias n ON tr.noticia_id = n.id
            WHERE t.tipo = 'lugar'
            AND tr.created_at >= NOW() - INTERVAL '30 days'
            AND n.pais_id != (SELECT id FROM paises WHERE nombre = 'España')
            GROUP BY t.valor
            ORDER BY menciones DESC
            LIMIT 50
        """
        params = ()

    with get_read_conn() as conn:
        with conn.cursor() as cur:
            cur.execute(query, params)
            rows = cur.fetchall()

    # Normalize
    normalized_rows = aggregate_normalized_entities(rows, entity_type='lugar')

    # Enrich with Wikipedia images
    from concurrent.futures import ThreadPoolExecutor
    from utils.wiki import fetch_wiki_data

    images = []
    summaries = []

    def get_info_safe(name):
        try:
            return fetch_wiki_data(name)
        except Exception:
            return None, None

    if normalized_rows:
        names = [row[0] for row in normalized_rows]
        with ThreadPoolExecutor(max_workers=10) as executor:
            results = list(executor.map(get_info_safe, names))
        for img, smry in results:
            images.append(img)
            summaries.append(smry)

    result = {
        "labels": [row[0] for row in normalized_rows],
        "data": [row[1] for row in normalized_rows],
        "images": images,
        "summaries": summaries
    }

    cache_set(cache_key, result, ttl_seconds=600)
    return jsonify(result)