# rss2/routers/stats.py
from flask import Blueprint, render_template, jsonify
from db import get_read_conn
from datetime import datetime, timedelta
import os
import subprocess
import time
import json

from cache import cached

stats_bp = Blueprint("stats", __name__, url_prefix="/stats")

# ==================================================================================
# ENTITY NORMALIZATION SYSTEM
# ==================================================================================
# Maps entity name variations to canonical names; the mapping and the blacklist
# are loaded from entity_config.json.
CONFIG_FILE = "entity_config.json"
_config_cache = {"data": None, "mtime": 0}
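
# Illustrative example of the entity_config.json structure this module expects
# (the actual file contents are project-specific; the names below are made up):
#
#   {
#     "blacklist": ["Redacción", "EFE"],
#     "synonyms": {
#       "Estados Unidos": ["EEUU", "EE.UU.", "USA"],
#       "Unión Europea": ["UE", "EU"]
#     }
#   }
#
# "blacklist" entries are dropped from every ranking; each "synonyms" key is the
# canonical name, and its list holds the variations that get merged into it.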

def load_entity_config():
    """Load entity config from JSON file with simple modification-time caching."""
    global _config_cache
    try:
        # Check if file exists
        if not os.path.exists(CONFIG_FILE):
            return {"blacklist": [], "synonyms": {}}

        # Reuse the cached config if the file has not been modified
        mtime = os.path.getmtime(CONFIG_FILE)
        if _config_cache["data"] is not None and mtime <= _config_cache["mtime"]:
            return _config_cache["data"]

        # Load fresh config
        with open(CONFIG_FILE, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Normalize structure
        if "blacklist" not in data:
            data["blacklist"] = []
        if "synonyms" not in data:
            data["synonyms"] = {}

        # Pre-process synonyms for reverse lookup (variation -> canonical)
        lookup = {}
        for canonical, variations in data["synonyms"].items():
            lookup[canonical.lower()] = canonical  # Map canonical to itself
            for var in variations:
                lookup[var.lower()] = canonical
        data["_lookup"] = lookup
        data["_blacklist_set"] = {x.lower() for x in data["blacklist"]}

        _config_cache = {"data": data, "mtime": mtime}
        return data
    except Exception as e:
        print(f"Error loading entity config: {e}")
        # Return the previous cache if available, otherwise an empty fallback
        return _config_cache["data"] if _config_cache["data"] else {"blacklist": [], "synonyms": {}}

def normalize_entity_name(name: str, config=None) -> str:
    """Normalize an entity name to its canonical form."""
    if config is None:
        config = load_entity_config()
    lookup = config.get("_lookup", {})
    return lookup.get(name.lower(), name)

def aggregate_normalized_entities(rows, entity_type='persona'):
    """Aggregate entity counts by normalized names and filter blacklisted items.

    Args:
        rows: List of (name, count) tuples from the database.
        entity_type: Entity type (kept for backwards compatibility; the config is global now).

    Returns:
        List of (normalized_name, total_count) tuples sorted by count, descending.
    """
    aggregated = {}
    config = load_entity_config()
    blacklist = config.get("_blacklist_set", set())
    for name, count in rows:
        # 1. Check blacklist (case-insensitive match)
        if name.lower() in blacklist:
            continue
        # 2. Normalize
        normalized = normalize_entity_name(name, config)
        # 3. Check blacklist again (in case the canonical name is blacklisted)
        if normalized.lower() in blacklist:
            continue
        aggregated[normalized] = aggregated.get(normalized, 0) + count
    # Sort by count descending
    return sorted(aggregated.items(), key=lambda x: x[1], reverse=True)
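
# Sketch of the aggregation behaviour, assuming a hypothetical config in which
# "EEUU" is a synonym of "Estados Unidos" and "Redacción" is blacklisted:
#
#   aggregate_normalized_entities([("EEUU", 10), ("Estados Unidos", 5), ("Redacción", 7)])
#   -> [("Estados Unidos", 15)]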

# ==================================================================================


@stats_bp.route("/")
def index():
    """Stats dashboard page."""
    # Calculate translation stats for the banner
    with get_read_conn() as conn:
        with conn.cursor() as cur:
            # Translations per minute (last 5 minutes)
            cur.execute("""
                SELECT COUNT(*) FROM traducciones
                WHERE status = 'done'
                  AND created_at > NOW() - INTERVAL '5 minutes'
            """)
            recent_5min = cur.fetchone()[0]
            translations_per_min = round(recent_5min / 5, 1) if recent_5min else 0

            # Status counts
            cur.execute("SELECT COUNT(*) FROM traducciones WHERE status = 'done'")
            traducciones_count = cur.fetchone()[0]
            cur.execute("SELECT COUNT(*) FROM traducciones WHERE status = 'pending'")
            pending_count = cur.fetchone()[0]
            cur.execute("SELECT COUNT(*) FROM traducciones WHERE status = 'processing'")
            processing_count = cur.fetchone()[0]
            cur.execute("SELECT COUNT(*) FROM traducciones WHERE status = 'error'")
            error_count = cur.fetchone()[0]

            # Total noticias (exact count - cached for 5 min in view)
            cur.execute("SELECT COUNT(*) FROM noticias")
            noticias_count = cur.fetchone()[0] or 0

            # News ingested today
            cur.execute("""
                SELECT COUNT(*) FROM noticias
                WHERE DATE(fecha) = CURRENT_DATE
            """)
            noticias_hoy = cur.fetchone()[0] or 0

            # News ingested in the last hour
            cur.execute("""
                SELECT COUNT(*) FROM noticias
                WHERE fecha >= NOW() - INTERVAL '1 hour'
            """)
            noticias_ultima_hora = cur.fetchone()[0] or 0

    return render_template(
        "stats.html",
        translations_per_min=translations_per_min,
        noticias_count=noticias_count,
        traducciones_count=traducciones_count,
        pending_count=pending_count,
        processing_count=processing_count,
        error_count=error_count,
        noticias_hoy=noticias_hoy,
        noticias_ultima_hora=noticias_ultima_hora,
    )

@stats_bp.route("/api/activity")
@cached(ttl_seconds=300, prefix="stats")
def activity_data():
    """Get activity data (news count) for the specified range."""
    from flask import request

    range_param = request.args.get("range", "30d")

    # Default: 30d -> group by day
    days = 30
    minutes = 0
    interval_sql = "day"  # For date_trunc or date casting
    timedelta_step = timedelta(days=1)
    date_format = "%Y-%m-%d"

    if range_param == "1h":
        minutes = 60
        interval_sql = "minute"
        timedelta_step = timedelta(minutes=1)
        date_format = "%H:%M"
    elif range_param == "8h":
        minutes = 480
        interval_sql = "minute"
        timedelta_step = timedelta(minutes=1)
        date_format = "%H:%M"
    elif range_param in ("1d", "24h"):  # "1d" is an alias for 24h
        minutes = 1440
        interval_sql = "hour"
        timedelta_step = timedelta(hours=1)
        date_format = "%H:%M"
    elif range_param == "7d":
        minutes = 10080
        interval_sql = "hour"
        timedelta_step = timedelta(hours=1)
        # Include day of month for 7d context
        date_format = "%d %H:%M"
    elif range_param == "30d":
        # Same as the defaults above; kept explicit for clarity
        minutes = 0
        days = 30
        interval_sql = "day"
        timedelta_step = timedelta(days=1)
        date_format = "%Y-%m-%d"

    # Calculate start time
    if minutes > 0:
        start_time = datetime.utcnow() - timedelta(minutes=minutes)
    else:
        # For daily granularity, start at midnight
        start_time = datetime.utcnow() - timedelta(days=days)
        start_time = start_time.replace(hour=0, minute=0, second=0, microsecond=0)

    with get_read_conn() as conn:
        with conn.cursor() as cur:
            if interval_sql == "day":
                # Daily buckets via ::date cast
                cur.execute("""
                    SELECT
                        fecha::date as time_slot,
                        COUNT(*) as count
                    FROM noticias
                    WHERE fecha >= %s
                    GROUP BY time_slot
                    ORDER BY time_slot
                """, (start_time,))
            else:
                # Granular buckets. interval_sql is always one of the fixed values
                # set above, so the f-string interpolation is safe.
                cur.execute(f"""
                    SELECT
                        date_trunc('{interval_sql}', fecha) as time_slot,
                        COUNT(*) as count
                    FROM noticias
                    WHERE fecha >= %s
                    GROUP BY time_slot
                    ORDER BY time_slot
                """, (start_time,))
            rows = cur.fetchall()

    # Fill gaps so every time slot appears, even when it has no news
    data_map = {row[0]: row[1] for row in rows}
    labels = []
    data = []

    if minutes > 0:
        # Granular alignment: snap start/end to the bucket boundary
        current = start_time.replace(second=0, microsecond=0)
        if interval_sql == "hour":
            current = current.replace(minute=0)
        end = datetime.utcnow().replace(second=0, microsecond=0)
        if interval_sql == "hour":
            end = end.replace(minute=0) + timedelta(hours=1)
    else:
        # Daily alignment: iterate over dates
        current = start_time.date() if isinstance(start_time, datetime) else start_time
        end = datetime.utcnow().date()

    while current <= end:
        labels.append(current.strftime(date_format))
        # The DB returns date objects for the ::date cast and datetimes for
        # date_trunc, matching the type of `current` in each branch.
        val = data_map.get(current, 0)
        # Defensive fallback in case of a datetime/date mismatch
        if val == 0 and isinstance(current, datetime) and interval_sql == 'day':
            val = data_map.get(current.date(), 0)
        data.append(val)
        current += timedelta_step

    return jsonify({
        "labels": labels,
        "data": data
    })
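
# The time-series endpoints in this module all return two parallel arrays,
# "labels" and "data". Illustrative example (values are made up):
#
#   GET /stats/api/activity?range=24h
#   -> {"labels": ["00:00", "01:00", ...], "data": [12, 8, ...]}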

@stats_bp.route("/api/categories")
@cached(ttl_seconds=300, prefix="stats")
def categories_data():
    """Get news count per category (Top 8 + Others)."""
    with get_read_conn() as conn:
        with conn.cursor() as cur:
            cur.execute("""
                SELECT
                    c.nombre,
                    COUNT(n.id) as count
                FROM noticias n
                JOIN categorias c ON c.id = n.categoria_id
                GROUP BY c.nombre
                ORDER BY count DESC
            """)
            rows = cur.fetchall()

    # Keep the top 8 categories and bucket the rest into "Otros"
    labels = []
    data = []
    others_count = 0
    top_limit = 8
    for i, row in enumerate(rows):
        if i < top_limit:
            labels.append(row[0])
            data.append(row[1])
        else:
            others_count += row[1]
    if others_count > 0:
        labels.append("Otros")
        data.append(others_count)

    return jsonify({
        "labels": labels,
        "data": data
    })

@stats_bp.route("/api/countries")
@cached(ttl_seconds=300, prefix="stats")
def countries_data():
    """Get news count per country (Top 10 + Others)."""
    with get_read_conn() as conn:
        with conn.cursor() as cur:
            cur.execute("""
                SELECT
                    p.nombre,
                    COUNT(n.id) as count
                FROM noticias n
                JOIN paises p ON p.id = n.pais_id
                GROUP BY p.nombre
                ORDER BY count DESC
            """)
            rows = cur.fetchall()

    # Keep the top 10 countries and bucket the rest into "Otros"
    labels = []
    data = []
    others_count = 0
    top_limit = 10
    for i, row in enumerate(rows):
        if i < top_limit:
            labels.append(row[0])
            data.append(row[1])
        else:
            others_count += row[1]
    if others_count > 0:
        labels.append("Otros")
        data.append(others_count)

    return jsonify({
        "labels": labels,
        "data": data
    })

@stats_bp.route("/api/countries/list")
def countries_list():
    """Get an alphabetical list of all countries with their flags."""
    from utils import country_flag

    with get_read_conn() as conn:
        with conn.cursor() as cur:
            cur.execute("SELECT nombre FROM paises ORDER BY nombre ASC")
            rows = cur.fetchall()

    return jsonify([
        {"name": row[0], "flag": country_flag(row[0])}
        for row in rows
    ])

@stats_bp.route("/api/translations/activity")
def translations_activity_data():
    """Get translation count per day for the last 30 days."""
    days = 30
    start_date = (datetime.utcnow() - timedelta(days=days)).date()

    with get_read_conn() as conn:
        with conn.cursor() as cur:
            cur.execute("""
                SELECT
                    created_at::date as day,
                    COUNT(*) as count
                FROM traducciones
                WHERE created_at >= %s
                GROUP BY day
                ORDER BY day
            """, (start_date,))
            rows = cur.fetchall()

    # Fill gaps with zeros so every day appears in the chart
    data_map = {row[0]: row[1] for row in rows}
    labels = []
    data = []
    current = start_date
    end = datetime.utcnow().date()
    while current <= end:
        labels.append(current.strftime("%Y-%m-%d"))
        data.append(data_map.get(current, 0))
        current += timedelta(days=1)

    return jsonify({
        "labels": labels,
        "data": data
    })

@stats_bp.route("/api/translations/languages")
@cached(ttl_seconds=60, prefix="stats")
def translations_languages_data():
    """Get translation count per source language."""
    # Friendly names for common languages
    LANG_NAMES = {
        'en': 'Inglés',
        'es': 'Español',
        'fr': 'Francés',
        'de': 'Alemán',
        'it': 'Italiano',
        'pt': 'Portugués',
        'ru': 'Ruso',
        'zh': 'Chino',
        'ja': 'Japonés',
        'ar': 'Árabe'
    }

    with get_read_conn() as conn:
        with conn.cursor() as cur:
            cur.execute("""
                SELECT
                    lang_from,
                    COUNT(*) as count
                FROM translation_stats
                WHERE lang_from IS NOT NULL
                GROUP BY lang_from
                ORDER BY count DESC
            """)
            rows = cur.fetchall()

    labels = []
    data = []
    for code, count in rows:
        code = code.strip().lower()
        labels.append(LANG_NAMES.get(code, code.upper()))
        data.append(count)

    return jsonify({
        "labels": labels,
        "data": data
    })

def get_system_uptime():
    """Return system uptime as a human-readable string, e.g. "3d 4h 12m"."""
    try:
        with open('/proc/uptime', 'r') as f:
            uptime_seconds = float(f.readline().split()[0])
        days = int(uptime_seconds // (24 * 3600))
        hours = int((uptime_seconds % (24 * 3600)) // 3600)
        minutes = int((uptime_seconds % 3600) // 60)
        if days > 0:
            return f"{days}d {hours}h {minutes}m"
        return f"{hours}h {minutes}m"
    except Exception:
        return "N/A"


def get_gpu_info():
    """Query nvidia-smi for GPU name, temperature, utilization and memory usage."""
    try:
        cmd = "nvidia-smi --query-gpu=name,temperature.gpu,utilization.gpu,memory.used,memory.total --format=csv,noheader,nounits"
        with open(os.devnull, 'w') as devnull:
            res = subprocess.check_output(cmd, shell=True, stderr=devnull).decode().strip()
        parts = [p.strip() for p in res.split(',')]
        if len(parts) >= 5:
            return {
                "name": parts[0],
                "temp": f"{parts[1]}°C",
                "util": f"{parts[2]}%",
                "mem": f"{parts[3]} MB / {parts[4]} MB"
            }
    except Exception:
        pass
    return None


def get_cpu_info():
    """Return load averages (1, 5, 15 min) and the CPU core count."""
    try:
        load = os.getloadavg()
        cores = os.cpu_count()
        return {
            "load": f"{load[0]:.2f}, {load[1]:.2f}, {load[2]:.2f}",
            "cores": cores
        }
    except Exception:
        return None

@stats_bp.route("/api/system/info")
def system_info_api():
    """Endpoint for real-time system monitoring."""
    return jsonify({
        "uptime": get_system_uptime(),
        "gpu": get_gpu_info(),
        "cpu": get_cpu_info(),
        "timestamp": datetime.now().strftime("%H:%M:%S")
    })
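
# Illustrative response shape (values depend on the host; "gpu" is null when
# nvidia-smi is unavailable):
#
#   GET /stats/api/system/info
#   -> {"uptime": "3d 4h 12m",
#       "gpu": {"name": "...", "temp": "62°C", "util": "35%", "mem": "8123 MB / 24576 MB"},
#       "cpu": {"load": "1.23, 1.05, 0.98", "cores": 16},
#       "timestamp": "13:39:51"}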

@stats_bp.route("/api/translations/rate")
@cached(ttl_seconds=60, prefix="stats")
def translations_rate_data():
    """Get translation count for the specified range (1h, 8h, 24h, 7d)."""
    from flask import request

    range_param = request.args.get("range", "1h")

    # Default: 1h -> group by minute
    minutes = 60
    interval_sql = "minute"
    timedelta_step = timedelta(minutes=1)
    date_format = "%H:%M"

    if range_param == "8h":
        # Per-minute buckets: 480 points is still manageable for the chart
        minutes = 8 * 60
        interval_sql = "minute"
        timedelta_step = timedelta(minutes=1)
        date_format = "%H:%M"
    elif range_param == "24h":
        # Per-hour buckets keep the 24h chart readable (24 points instead of 1440)
        minutes = 24 * 60
        interval_sql = "hour"
        timedelta_step = timedelta(hours=1)
        date_format = "%H:%M"
    elif range_param == "7d":
        # Per-hour buckets: 7 * 24 = 168 points
        minutes = 7 * 24 * 60
        interval_sql = "hour"
        timedelta_step = timedelta(hours=1)
        date_format = "%Y-%m-%d %H:%M"

    start_time = datetime.utcnow() - timedelta(minutes=minutes)

    with get_read_conn() as conn:
        with conn.cursor() as cur:
            # Query translation_stats instead of traducciones. interval_sql is one
            # of the fixed values set above, so the f-string interpolation is safe.
            cur.execute(f"""
                SELECT
                    date_trunc('{interval_sql}', created_at) as time_slot,
                    COUNT(*) as count
                FROM translation_stats
                WHERE created_at >= %s
                GROUP BY time_slot
                ORDER BY time_slot
            """, (start_time,))
            rows = cur.fetchall()

    # Fill gaps, aligning start and end to the bucket boundary
    data_map = {row[0]: row[1] for row in rows}
    labels = []
    data = []
    current = start_time.replace(second=0, microsecond=0)
    if interval_sql == "hour":
        current = current.replace(minute=0)
    end = datetime.utcnow().replace(second=0, microsecond=0)
    if interval_sql == "hour":
        end = end.replace(minute=0) + timedelta(hours=1)  # Cover the current partial hour

    while current <= end:
        labels.append(current.strftime(date_format))
        data.append(data_map.get(current, 0))
        current += timedelta_step

    return jsonify({
        "labels": labels,
        "data": data
    })

@stats_bp.route("/entities")
def entities_dashboard():
    """Dashboard for named-entity statistics."""
    return render_template("stats_entities.html")

@stats_bp.route("/api/entities/people")
def entities_people():
    """Top 50 mentioned people, optionally filtered by country and/or date."""
    from flask import request
    from cache import cache_get, cache_set

    # 1. Check config mtime so edits to entity_config.json invalidate the cache
    try:
        config_mtime = os.path.getmtime(CONFIG_FILE)
    except OSError:
        config_mtime = 0

    country_filter = request.args.get("country")
    date_filter = request.args.get("date")

    # 2. Build cache key including the mtime
    cache_key = f"entities:people:{country_filter}:{date_filter}:{config_mtime}"

    # 3. Try cache
    cached_data = cache_get(cache_key)
    if cached_data:
        return jsonify(cached_data)

    # Determine time range
    if date_filter:
        # Single-day query
        try:
            target_date = datetime.strptime(date_filter, "%Y-%m-%d").date()
            time_condition = "DATE(tr.created_at) = %s"
            time_params = [target_date]
        except ValueError:
            # Invalid date format: fall back to the last 30 days
            time_condition = "tr.created_at >= NOW() - INTERVAL '30 days'"
            time_params = []
    else:
        # Default: last 30 days
        time_condition = "tr.created_at >= NOW() - INTERVAL '30 days'"
        time_params = []

    if country_filter and country_filter != 'global':
        # Filtered by country
        query = f"""
            SELECT t.valor, COUNT(*) as menciones
            FROM tags t
            JOIN tags_noticia tn ON tn.tag_id = t.id
            JOIN traducciones tr ON tn.traduccion_id = tr.id
            JOIN noticias n ON tr.noticia_id = n.id
            WHERE t.tipo = 'persona'
              AND {time_condition}
              AND n.pais_id = (SELECT id FROM paises WHERE nombre ILIKE %s LIMIT 1)
            GROUP BY t.valor
            ORDER BY menciones DESC
        """
        params = tuple(time_params + [country_filter])
    else:
        # Global view
        query = f"""
            SELECT t.valor, COUNT(*) as menciones
            FROM tags t
            JOIN tags_noticia tn ON tn.tag_id = t.id
            JOIN traducciones tr ON tn.traduccion_id = tr.id
            WHERE t.tipo = 'persona'
              AND {time_condition}
            GROUP BY t.valor
            ORDER BY menciones DESC
        """
        params = tuple(time_params)

    with get_read_conn() as conn:
        with conn.cursor() as cur:
            cur.execute(query, params)
            rows = cur.fetchall()

    # Normalize, aggregate and keep the top 50
    normalized_rows = aggregate_normalized_entities(rows, entity_type='persona')
    top_50 = normalized_rows[:50]

    # Enrich with Wikipedia images and summaries (parallel execution)
    from concurrent.futures import ThreadPoolExecutor
    from utils.wiki import fetch_wiki_data

    images = []
    summaries = []

    def get_image_safe(name):
        try:
            return fetch_wiki_data(name)
        except Exception:
            return None, None

    if top_50:
        names = [row[0] for row in top_50]
        with ThreadPoolExecutor(max_workers=10) as executor:
            try:
                results = list(executor.map(get_image_safe, names))
                for img, smry in results:
                    images.append(img)
                    summaries.append(smry)
            except Exception as e:
                import logging
                logging.error(f"Error fetching wiki data: {e}")
                # Fall back to empty placeholders if the thread pool fails
                images = [None] * len(names)
                summaries = [None] * len(names)

    result = {
        "labels": [row[0] for row in top_50],
        "data": [row[1] for row in top_50],
        "images": images,
        "summaries": summaries
    }

    # 4. Cache the result
    cache_set(cache_key, result, ttl_seconds=600)
    return jsonify(result)
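
# The entity endpoints below follow the same response pattern as entities_people:
# four parallel arrays, e.g. (illustrative values):
#
#   {"labels": ["Nombre Apellido", ...],
#    "data": [42, ...],
#    "images": ["https://...", ...],
#    "summaries": ["Short Wikipedia extract...", ...]}
#
# "images" and "summaries" come from fetch_wiki_data and may contain nulls when
# no Wikipedia match is found or the lookup fails.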

@stats_bp.route("/api/entities/orgs")
def entities_orgs():
    """Top mentioned organizations, optionally filtered by country."""
    from flask import request
    from cache import cache_get, cache_set

    country_filter = request.args.get("country")
    try:
        config_mtime = os.path.getmtime(CONFIG_FILE)
    except OSError:
        config_mtime = 0

    cache_key = f"entities:orgs:{country_filter}:{config_mtime}"
    cached_data = cache_get(cache_key)
    if cached_data:
        return jsonify(cached_data)

    if country_filter and country_filter != 'global':
        query = """
            SELECT t.valor, COUNT(*) as menciones
            FROM tags t
            JOIN tags_noticia tn ON tn.tag_id = t.id
            JOIN traducciones tr ON tn.traduccion_id = tr.id
            JOIN noticias n ON tr.noticia_id = n.id
            WHERE t.tipo = 'organizacion'
              AND tr.created_at >= NOW() - INTERVAL '30 days'
              AND n.pais_id = (SELECT id FROM paises WHERE nombre ILIKE %s LIMIT 1)
            GROUP BY t.valor
            ORDER BY menciones DESC
            LIMIT 50
        """
        params = (country_filter,)
    else:
        query = """
            SELECT t.valor, COUNT(*) as menciones
            FROM tags t
            JOIN tags_noticia tn ON tn.tag_id = t.id
            JOIN traducciones tr ON tn.traduccion_id = tr.id
            WHERE t.tipo = 'organizacion'
              AND tr.created_at >= NOW() - INTERVAL '30 days'
            GROUP BY t.valor
            ORDER BY menciones DESC
            LIMIT 50
        """
        params = ()

    with get_read_conn() as conn:
        with conn.cursor() as cur:
            cur.execute(query, params)
            rows = cur.fetchall()

    normalized_rows = aggregate_normalized_entities(rows, entity_type='organizacion')

    # Enrich with Wikipedia images and summaries
    from concurrent.futures import ThreadPoolExecutor
    from utils.wiki import fetch_wiki_data

    images = []
    summaries = []

    def get_info_safe(name):
        try:
            return fetch_wiki_data(name)
        except Exception:
            return None, None

    if normalized_rows:
        names = [row[0] for row in normalized_rows]
        with ThreadPoolExecutor(max_workers=10) as executor:
            results = list(executor.map(get_info_safe, names))
            for img, smry in results:
                images.append(img)
                summaries.append(smry)

    result = {
        "labels": [row[0] for row in normalized_rows],
        "data": [row[1] for row in normalized_rows],
        "images": images,
        "summaries": summaries
    }

    cache_set(cache_key, result, ttl_seconds=600)
    return jsonify(result)

@stats_bp.route("/api/entities/places")
def entities_places():
    """Top mentioned places, optionally filtered by country."""
    from flask import request
    from cache import cache_get, cache_set

    country_filter = request.args.get("country")
    try:
        config_mtime = os.path.getmtime(CONFIG_FILE)
    except OSError:
        config_mtime = 0

    cache_key = f"entities:places:{country_filter}:{config_mtime}"
    cached_data = cache_get(cache_key)
    if cached_data:
        return jsonify(cached_data)

    # Note: news items whose country is 'España' are always excluded from this ranking.
    if country_filter and country_filter != 'global':
        query = """
            SELECT t.valor, COUNT(*) as menciones
            FROM tags t
            JOIN tags_noticia tn ON tn.tag_id = t.id
            JOIN traducciones tr ON tn.traduccion_id = tr.id
            JOIN noticias n ON tr.noticia_id = n.id
            WHERE t.tipo = 'lugar'
              AND tr.created_at >= NOW() - INTERVAL '30 days'
              AND n.pais_id = (SELECT id FROM paises WHERE nombre ILIKE %s LIMIT 1)
              AND n.pais_id != (SELECT id FROM paises WHERE nombre = 'España')
            GROUP BY t.valor
            ORDER BY menciones DESC
            LIMIT 50
        """
        params = (country_filter,)
    else:
        query = """
            SELECT t.valor, COUNT(*) as menciones
            FROM tags t
            JOIN tags_noticia tn ON tn.tag_id = t.id
            JOIN traducciones tr ON tn.traduccion_id = tr.id
            JOIN noticias n ON tr.noticia_id = n.id
            WHERE t.tipo = 'lugar'
              AND tr.created_at >= NOW() - INTERVAL '30 days'
              AND n.pais_id != (SELECT id FROM paises WHERE nombre = 'España')
            GROUP BY t.valor
            ORDER BY menciones DESC
            LIMIT 50
        """
        params = ()

    with get_read_conn() as conn:
        with conn.cursor() as cur:
            cur.execute(query, params)
            rows = cur.fetchall()

    # Normalize and aggregate
    normalized_rows = aggregate_normalized_entities(rows, entity_type='lugar')

    # Enrich with Wikipedia images and summaries
    from concurrent.futures import ThreadPoolExecutor
    from utils.wiki import fetch_wiki_data

    images = []
    summaries = []

    def get_info_safe(name):
        try:
            return fetch_wiki_data(name)
        except Exception:
            return None, None

    if normalized_rows:
        names = [row[0] for row in normalized_rows]
        with ThreadPoolExecutor(max_workers=10) as executor:
            results = list(executor.map(get_info_safe, names))
            for img, smry in results:
                images.append(img)
                summaries.append(smry)

    result = {
        "labels": [row[0] for row in normalized_rows],
        "data": [row[1] for row in normalized_rows],
        "images": images,
        "summaries": summaries
    }

    cache_set(cache_key, result, ttl_seconds=600)
    return jsonify(result)