# rss2/routers/stats.py
from flask import Blueprint, render_template, jsonify
from db import get_read_conn
from datetime import datetime, timedelta
import os
import subprocess
import time
import json

from cache import cached

stats_bp = Blueprint("stats", __name__, url_prefix="/stats")

# ==================================================================================
# ENTITY NORMALIZATION SYSTEM
# ==================================================================================
# Maps entity name variations to canonical names; the mapping and the blacklist
# are loaded from entity_config.json.
CONFIG_FILE = "entity_config.json"
_config_cache = {"data": None, "mtime": 0}
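
# Illustrative example of the entity_config.json structure this module expects
# (the actual file contents are project-specific; the names below are made up):
#
#   {
#     "blacklist": ["Redacción", "EFE"],
#     "synonyms": {
#       "Estados Unidos": ["EEUU", "EE.UU.", "USA"],
#       "Unión Europea": ["UE", "EU"]
#     }
#   }
#
# "blacklist" entries are dropped from every ranking; each "synonyms" key is the
# canonical name, and its list holds the variations that get merged into it.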

def load_entity_config():
    """Load entity config from JSON file with simple modification-time caching."""
    global _config_cache
    try:
        # Check if file exists
        if not os.path.exists(CONFIG_FILE):
            return {"blacklist": [], "synonyms": {}}

        # Reuse the cached config if the file has not been modified
        mtime = os.path.getmtime(CONFIG_FILE)
        if _config_cache["data"] is not None and mtime <= _config_cache["mtime"]:
            return _config_cache["data"]

        # Load fresh config
        with open(CONFIG_FILE, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Normalize structure
        if "blacklist" not in data:
            data["blacklist"] = []
        if "synonyms" not in data:
            data["synonyms"] = {}

        # Pre-process synonyms for reverse lookup (variation -> canonical)
        lookup = {}
        for canonical, variations in data["synonyms"].items():
            lookup[canonical.lower()] = canonical  # Map canonical to itself
            for var in variations:
                lookup[var.lower()] = canonical
        data["_lookup"] = lookup
        data["_blacklist_set"] = {x.lower() for x in data["blacklist"]}

        _config_cache = {"data": data, "mtime": mtime}
        return data
    except Exception as e:
        print(f"Error loading entity config: {e}")
        # Return the previous cache if available, otherwise an empty fallback
        return _config_cache["data"] if _config_cache["data"] else {"blacklist": [], "synonyms": {}}

def normalize_entity_name(name: str, config=None) -> str:
    """Normalize an entity name to its canonical form."""
    if config is None:
        config = load_entity_config()
    lookup = config.get("_lookup", {})
    return lookup.get(name.lower(), name)

def aggregate_normalized_entities(rows, entity_type='persona'):
    """Aggregate entity counts by normalized names and filter blacklisted items.

    Args:
        rows: List of (name, count) tuples from the database.
        entity_type: Entity type (kept for backwards compatibility; the config is global now).

    Returns:
        List of (normalized_name, total_count) tuples sorted by count, descending.
    """
    aggregated = {}
    config = load_entity_config()
    blacklist = config.get("_blacklist_set", set())
    for name, count in rows:
        # 1. Check blacklist (case-insensitive match)
        if name.lower() in blacklist:
            continue
        # 2. Normalize
        normalized = normalize_entity_name(name, config)
        # 3. Check blacklist again (in case the canonical name is blacklisted)
        if normalized.lower() in blacklist:
            continue
        aggregated[normalized] = aggregated.get(normalized, 0) + count
    # Sort by count descending
    return sorted(aggregated.items(), key=lambda x: x[1], reverse=True)
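
# Sketch of the aggregation behaviour, assuming a hypothetical config in which
# "EEUU" is a synonym of "Estados Unidos" and "Redacción" is blacklisted:
#
#   aggregate_normalized_entities([("EEUU", 10), ("Estados Unidos", 5), ("Redacción", 7)])
#   -> [("Estados Unidos", 15)]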

# ==================================================================================


@stats_bp.route("/")
def index():
    """Stats dashboard page."""
    # Calculate translation stats for the banner
    with get_read_conn() as conn:
        with conn.cursor() as cur:
            # Translations per minute (last 5 minutes)
            cur.execute("""
                SELECT COUNT(*) FROM traducciones
                WHERE status = 'done'
                  AND created_at > NOW() - INTERVAL '5 minutes'
            """)
            recent_5min = cur.fetchone()[0]
            translations_per_min = round(recent_5min / 5, 1) if recent_5min else 0

            # Status counts
            cur.execute("SELECT COUNT(*) FROM traducciones WHERE status = 'done'")
            traducciones_count = cur.fetchone()[0]
            cur.execute("SELECT COUNT(*) FROM traducciones WHERE status = 'pending'")
            pending_count = cur.fetchone()[0]
            cur.execute("SELECT COUNT(*) FROM traducciones WHERE status = 'processing'")
            processing_count = cur.fetchone()[0]
            cur.execute("SELECT COUNT(*) FROM traducciones WHERE status = 'error'")
            error_count = cur.fetchone()[0]

            # Total noticias (exact count - cached for 5 min in view)
            cur.execute("SELECT COUNT(*) FROM noticias")
            noticias_count = cur.fetchone()[0] or 0

            # News ingested today
            cur.execute("""
                SELECT COUNT(*) FROM noticias
                WHERE DATE(fecha) = CURRENT_DATE
            """)
            noticias_hoy = cur.fetchone()[0] or 0

            # News ingested in the last hour
            cur.execute("""
                SELECT COUNT(*) FROM noticias
                WHERE fecha >= NOW() - INTERVAL '1 hour'
            """)
            noticias_ultima_hora = cur.fetchone()[0] or 0

    return render_template(
        "stats.html",
        translations_per_min=translations_per_min,
        noticias_count=noticias_count,
        traducciones_count=traducciones_count,
        pending_count=pending_count,
        processing_count=processing_count,
        error_count=error_count,
        noticias_hoy=noticias_hoy,
        noticias_ultima_hora=noticias_ultima_hora,
    )

@stats_bp.route("/api/activity")
@cached(ttl_seconds=300, prefix="stats")
def activity_data():
    """Get activity data (news count) for the specified range."""
    from flask import request

    range_param = request.args.get("range", "30d")

    # Default: 30d -> group by day
    days = 30
    minutes = 0
    interval_sql = "day"  # For date_trunc or date casting
    timedelta_step = timedelta(days=1)
    date_format = "%Y-%m-%d"

    if range_param == "1h":
        minutes = 60
        interval_sql = "minute"
        timedelta_step = timedelta(minutes=1)
        date_format = "%H:%M"
    elif range_param == "8h":
        minutes = 480
        interval_sql = "minute"
        timedelta_step = timedelta(minutes=1)
        date_format = "%H:%M"
    elif range_param in ("1d", "24h"):  # "1d" is an alias for 24h
        minutes = 1440
        interval_sql = "hour"
        timedelta_step = timedelta(hours=1)
        date_format = "%H:%M"
    elif range_param == "7d":
        minutes = 10080
        interval_sql = "hour"
        timedelta_step = timedelta(hours=1)
        # Include day of month for 7d context
        date_format = "%d %H:%M"
    elif range_param == "30d":
        # Same as the defaults above; kept explicit for clarity
        minutes = 0
        days = 30
        interval_sql = "day"
        timedelta_step = timedelta(days=1)
        date_format = "%Y-%m-%d"

    # Calculate start time
    if minutes > 0:
        start_time = datetime.utcnow() - timedelta(minutes=minutes)
    else:
        # For daily granularity, start at midnight
        start_time = datetime.utcnow() - timedelta(days=days)
        start_time = start_time.replace(hour=0, minute=0, second=0, microsecond=0)

    with get_read_conn() as conn:
        with conn.cursor() as cur:
            if interval_sql == "day":
                # Daily buckets via ::date cast
                cur.execute("""
                    SELECT
                        fecha::date as time_slot,
                        COUNT(*) as count
                    FROM noticias
                    WHERE fecha >= %s
                    GROUP BY time_slot
                    ORDER BY time_slot
                """, (start_time,))
            else:
                # Granular buckets. interval_sql is always one of the fixed values
                # set above, so the f-string interpolation is safe.
                cur.execute(f"""
                    SELECT
                        date_trunc('{interval_sql}', fecha) as time_slot,
                        COUNT(*) as count
                    FROM noticias
                    WHERE fecha >= %s
                    GROUP BY time_slot
                    ORDER BY time_slot
                """, (start_time,))
            rows = cur.fetchall()

    # Fill gaps so every time slot appears, even when it has no news
    data_map = {row[0]: row[1] for row in rows}
    labels = []
    data = []

    if minutes > 0:
        # Granular alignment: snap start/end to the bucket boundary
        current = start_time.replace(second=0, microsecond=0)
        if interval_sql == "hour":
            current = current.replace(minute=0)
        end = datetime.utcnow().replace(second=0, microsecond=0)
        if interval_sql == "hour":
            end = end.replace(minute=0) + timedelta(hours=1)
    else:
        # Daily alignment: iterate over dates
        current = start_time.date() if isinstance(start_time, datetime) else start_time
        end = datetime.utcnow().date()

    while current <= end:
        labels.append(current.strftime(date_format))
        # The DB returns date objects for the ::date cast and datetimes for
        # date_trunc, matching the type of `current` in each branch.
        val = data_map.get(current, 0)
        # Defensive fallback in case of a datetime/date mismatch
        if val == 0 and isinstance(current, datetime) and interval_sql == 'day':
            val = data_map.get(current.date(), 0)
        data.append(val)
        current += timedelta_step

    return jsonify({
        "labels": labels,
        "data": data
    })
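
# The time-series endpoints in this module all return two parallel arrays,
# "labels" and "data". Illustrative example (values are made up):
#
#   GET /stats/api/activity?range=24h
#   -> {"labels": ["00:00", "01:00", ...], "data": [12, 8, ...]}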

@stats_bp.route("/api/categories")
@cached(ttl_seconds=300, prefix="stats")
def categories_data():
    """Get news count per category (Top 8 + Others)."""
    with get_read_conn() as conn:
        with conn.cursor() as cur:
            cur.execute("""
                SELECT
                    c.nombre,
                    COUNT(n.id) as count
                FROM noticias n
                JOIN categorias c ON c.id = n.categoria_id
                GROUP BY c.nombre
                ORDER BY count DESC
            """)
            rows = cur.fetchall()

    # Keep the top 8 categories and bucket the rest into "Otros"
    labels = []
    data = []
    others_count = 0
    top_limit = 8
    for i, row in enumerate(rows):
        if i < top_limit:
            labels.append(row[0])
            data.append(row[1])
        else:
            others_count += row[1]
    if others_count > 0:
        labels.append("Otros")
        data.append(others_count)

    return jsonify({
        "labels": labels,
        "data": data
    })

@stats_bp.route("/api/countries")
@cached(ttl_seconds=300, prefix="stats")
def countries_data():
    """Get news count per country (Top 10 + Others)."""
    with get_read_conn() as conn:
        with conn.cursor() as cur:
            cur.execute("""
                SELECT
                    p.nombre,
                    COUNT(n.id) as count
                FROM noticias n
                JOIN paises p ON p.id = n.pais_id
                GROUP BY p.nombre
                ORDER BY count DESC
            """)
            rows = cur.fetchall()

    # Keep the top 10 countries and bucket the rest into "Otros"
    labels = []
    data = []
    others_count = 0
    top_limit = 10
    for i, row in enumerate(rows):
        if i < top_limit:
            labels.append(row[0])
            data.append(row[1])
        else:
            others_count += row[1]
    if others_count > 0:
        labels.append("Otros")
        data.append(others_count)

    return jsonify({
        "labels": labels,
        "data": data
    })

@stats_bp.route("/api/countries/list")
def countries_list():
    """Get an alphabetical list of all countries with their flags."""
    from utils import country_flag

    with get_read_conn() as conn:
        with conn.cursor() as cur:
            cur.execute("SELECT nombre FROM paises ORDER BY nombre ASC")
            rows = cur.fetchall()

    return jsonify([
        {"name": row[0], "flag": country_flag(row[0])}
        for row in rows
    ])

@stats_bp.route("/api/translations/activity")
def translations_activity_data():
    """Get translation count per day for the last 30 days."""
    days = 30
    start_date = (datetime.utcnow() - timedelta(days=days)).date()

    with get_read_conn() as conn:
        with conn.cursor() as cur:
            cur.execute("""
                SELECT
                    created_at::date as day,
                    COUNT(*) as count
                FROM traducciones
                WHERE created_at >= %s
                GROUP BY day
                ORDER BY day
            """, (start_date,))
            rows = cur.fetchall()

    # Fill gaps with zeros so every day appears in the chart
    data_map = {row[0]: row[1] for row in rows}
    labels = []
    data = []
    current = start_date
    end = datetime.utcnow().date()
    while current <= end:
        labels.append(current.strftime("%Y-%m-%d"))
        data.append(data_map.get(current, 0))
        current += timedelta(days=1)

    return jsonify({
        "labels": labels,
        "data": data
    })

@stats_bp.route("/api/translations/languages")
@cached(ttl_seconds=60, prefix="stats")
def translations_languages_data():
    """Get translation count per source language."""
    # Friendly names for common languages
    LANG_NAMES = {
        'en': 'Inglés',
        'es': 'Español',
        'fr': 'Francés',
        'de': 'Alemán',
        'it': 'Italiano',
        'pt': 'Portugués',
        'ru': 'Ruso',
        'zh': 'Chino',
        'ja': 'Japonés',
        'ar': 'Árabe'
    }

    with get_read_conn() as conn:
        with conn.cursor() as cur:
            cur.execute("""
                SELECT
                    lang_from,
                    COUNT(*) as count
                FROM translation_stats
                WHERE lang_from IS NOT NULL
                GROUP BY lang_from
                ORDER BY count DESC
            """)
            rows = cur.fetchall()

    labels = []
    data = []
    for code, count in rows:
        code = code.strip().lower()
        labels.append(LANG_NAMES.get(code, code.upper()))
        data.append(count)

    return jsonify({
        "labels": labels,
        "data": data
    })

def get_system_uptime():
    """Return system uptime as a human-readable string, e.g. "3d 4h 12m"."""
    try:
        with open('/proc/uptime', 'r') as f:
            uptime_seconds = float(f.readline().split()[0])
        days = int(uptime_seconds // (24 * 3600))
        hours = int((uptime_seconds % (24 * 3600)) // 3600)
        minutes = int((uptime_seconds % 3600) // 60)
        if days > 0:
            return f"{days}d {hours}h {minutes}m"
        return f"{hours}h {minutes}m"
    except Exception:
        return "N/A"


def get_gpu_info():
    """Query nvidia-smi for GPU name, temperature, utilization and memory usage."""
    try:
        cmd = "nvidia-smi --query-gpu=name,temperature.gpu,utilization.gpu,memory.used,memory.total --format=csv,noheader,nounits"
        with open(os.devnull, 'w') as devnull:
            res = subprocess.check_output(cmd, shell=True, stderr=devnull).decode().strip()
        parts = [p.strip() for p in res.split(',')]
        if len(parts) >= 5:
            return {
                "name": parts[0],
                "temp": f"{parts[1]}°C",
                "util": f"{parts[2]}%",
                "mem": f"{parts[3]} MB / {parts[4]} MB"
            }
    except Exception:
        pass
    return None


def get_cpu_info():
    """Return load averages (1, 5, 15 min) and the CPU core count."""
    try:
        load = os.getloadavg()
        cores = os.cpu_count()
        return {
            "load": f"{load[0]:.2f}, {load[1]:.2f}, {load[2]:.2f}",
            "cores": cores
        }
    except Exception:
        return None

@stats_bp.route("/api/system/info")
def system_info_api():
    """Endpoint for real-time system monitoring."""
    return jsonify({
        "uptime": get_system_uptime(),
        "gpu": get_gpu_info(),
        "cpu": get_cpu_info(),
        "timestamp": datetime.now().strftime("%H:%M:%S")
    })
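
# Illustrative response shape (values depend on the host; "gpu" is null when
# nvidia-smi is unavailable):
#
#   GET /stats/api/system/info
#   -> {"uptime": "3d 4h 12m",
#       "gpu": {"name": "...", "temp": "62°C", "util": "35%", "mem": "8123 MB / 24576 MB"},
#       "cpu": {"load": "1.23, 1.05, 0.98", "cores": 16},
#       "timestamp": "13:39:51"}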

@stats_bp.route("/api/translations/rate")
@cached(ttl_seconds=60, prefix="stats")
def translations_rate_data():
    """Get translation count for the specified range (1h, 8h, 24h, 7d)."""
    from flask import request

    range_param = request.args.get("range", "1h")

    # Default: 1h -> group by minute
    minutes = 60
    interval_sql = "minute"
    timedelta_step = timedelta(minutes=1)
    date_format = "%H:%M"

    if range_param == "8h":
        # Per-minute buckets: 480 points is still manageable for the chart
        minutes = 8 * 60
        interval_sql = "minute"
        timedelta_step = timedelta(minutes=1)
        date_format = "%H:%M"
    elif range_param == "24h":
        # Per-hour buckets keep the 24h chart readable (24 points instead of 1440)
        minutes = 24 * 60
        interval_sql = "hour"
        timedelta_step = timedelta(hours=1)
        date_format = "%H:%M"
    elif range_param == "7d":
        # Per-hour buckets: 7 * 24 = 168 points
        minutes = 7 * 24 * 60
        interval_sql = "hour"
        timedelta_step = timedelta(hours=1)
        date_format = "%Y-%m-%d %H:%M"

    start_time = datetime.utcnow() - timedelta(minutes=minutes)

    with get_read_conn() as conn:
        with conn.cursor() as cur:
            # Query translation_stats instead of traducciones. interval_sql is one
            # of the fixed values set above, so the f-string interpolation is safe.
            cur.execute(f"""
                SELECT
                    date_trunc('{interval_sql}', created_at) as time_slot,
                    COUNT(*) as count
                FROM translation_stats
                WHERE created_at >= %s
                GROUP BY time_slot
                ORDER BY time_slot
            """, (start_time,))
            rows = cur.fetchall()

    # Fill gaps, aligning start and end to the bucket boundary
    data_map = {row[0]: row[1] for row in rows}
    labels = []
    data = []
    current = start_time.replace(second=0, microsecond=0)
    if interval_sql == "hour":
        current = current.replace(minute=0)
    end = datetime.utcnow().replace(second=0, microsecond=0)
    if interval_sql == "hour":
        end = end.replace(minute=0) + timedelta(hours=1)  # Cover the current partial hour

    while current <= end:
        labels.append(current.strftime(date_format))
        data.append(data_map.get(current, 0))
        current += timedelta_step

    return jsonify({
        "labels": labels,
        "data": data
    })

@stats_bp.route("/entities")
def entities_dashboard():
    """Dashboard for named-entity statistics."""
    return render_template("stats_entities.html")

@stats_bp.route("/api/entities/people")
def entities_people():
    """Top 50 mentioned people, optionally filtered by country and/or date."""
    from flask import request
    from cache import cache_get, cache_set

    # 1. Check config mtime so edits to entity_config.json invalidate the cache
    try:
        config_mtime = os.path.getmtime(CONFIG_FILE)
    except OSError:
        config_mtime = 0

    country_filter = request.args.get("country")
    date_filter = request.args.get("date")

    # 2. Build cache key including the mtime
    cache_key = f"entities:people:{country_filter}:{date_filter}:{config_mtime}"

    # 3. Try cache
    cached_data = cache_get(cache_key)
    if cached_data:
        return jsonify(cached_data)

    # Determine time range
    if date_filter:
        # Single-day query
        try:
            target_date = datetime.strptime(date_filter, "%Y-%m-%d").date()
            time_condition = "DATE(tr.created_at) = %s"
            time_params = [target_date]
        except ValueError:
            # Invalid date format: fall back to the last 30 days
            time_condition = "tr.created_at >= NOW() - INTERVAL '30 days'"
            time_params = []
    else:
        # Default: last 30 days
        time_condition = "tr.created_at >= NOW() - INTERVAL '30 days'"
        time_params = []

    if country_filter and country_filter != 'global':
        # Filtered by country
        query = f"""
            SELECT t.valor, COUNT(*) as menciones
            FROM tags t
            JOIN tags_noticia tn ON tn.tag_id = t.id
            JOIN traducciones tr ON tn.traduccion_id = tr.id
            JOIN noticias n ON tr.noticia_id = n.id
            WHERE t.tipo = 'persona'
              AND {time_condition}
              AND n.pais_id = (SELECT id FROM paises WHERE nombre ILIKE %s LIMIT 1)
            GROUP BY t.valor
            ORDER BY menciones DESC
        """
        params = tuple(time_params + [country_filter])
    else:
        # Global view
        query = f"""
            SELECT t.valor, COUNT(*) as menciones
            FROM tags t
            JOIN tags_noticia tn ON tn.tag_id = t.id
            JOIN traducciones tr ON tn.traduccion_id = tr.id
            WHERE t.tipo = 'persona'
              AND {time_condition}
            GROUP BY t.valor
            ORDER BY menciones DESC
        """
        params = tuple(time_params)

    with get_read_conn() as conn:
        with conn.cursor() as cur:
            cur.execute(query, params)
            rows = cur.fetchall()

    # Normalize, aggregate and keep the top 50
    normalized_rows = aggregate_normalized_entities(rows, entity_type='persona')
    top_50 = normalized_rows[:50]

    # Enrich with Wikipedia images and summaries (parallel execution)
    from concurrent.futures import ThreadPoolExecutor
    from utils.wiki import fetch_wiki_data

    images = []
    summaries = []

    def get_image_safe(name):
        try:
            return fetch_wiki_data(name)
        except Exception:
            return None, None

    if top_50:
        names = [row[0] for row in top_50]
        with ThreadPoolExecutor(max_workers=10) as executor:
            try:
                results = list(executor.map(get_image_safe, names))
                for img, smry in results:
                    images.append(img)
                    summaries.append(smry)
            except Exception as e:
                import logging
                logging.error(f"Error fetching wiki data: {e}")
                # Fall back to empty placeholders if the thread pool fails
                images = [None] * len(names)
                summaries = [None] * len(names)

    result = {
        "labels": [row[0] for row in top_50],
        "data": [row[1] for row in top_50],
        "images": images,
        "summaries": summaries
    }

    # 4. Cache the result
    cache_set(cache_key, result, ttl_seconds=600)
    return jsonify(result)
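
# The entity endpoints below follow the same response pattern as entities_people:
# four parallel arrays, e.g. (illustrative values):
#
#   {"labels": ["Nombre Apellido", ...],
#    "data": [42, ...],
#    "images": ["https://...", ...],
#    "summaries": ["Short Wikipedia extract...", ...]}
#
# "images" and "summaries" come from fetch_wiki_data and may contain nulls when
# no Wikipedia match is found or the lookup fails.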

@stats_bp.route("/api/entities/orgs")
def entities_orgs():
    """Top mentioned organizations, optionally filtered by country."""
    from flask import request
    from cache import cache_get, cache_set

    country_filter = request.args.get("country")
    try:
        config_mtime = os.path.getmtime(CONFIG_FILE)
    except OSError:
        config_mtime = 0

    cache_key = f"entities:orgs:{country_filter}:{config_mtime}"
    cached_data = cache_get(cache_key)
    if cached_data:
        return jsonify(cached_data)

    if country_filter and country_filter != 'global':
        query = """
            SELECT t.valor, COUNT(*) as menciones
            FROM tags t
            JOIN tags_noticia tn ON tn.tag_id = t.id
            JOIN traducciones tr ON tn.traduccion_id = tr.id
            JOIN noticias n ON tr.noticia_id = n.id
            WHERE t.tipo = 'organizacion'
              AND tr.created_at >= NOW() - INTERVAL '30 days'
              AND n.pais_id = (SELECT id FROM paises WHERE nombre ILIKE %s LIMIT 1)
            GROUP BY t.valor
            ORDER BY menciones DESC
            LIMIT 50
        """
        params = (country_filter,)
    else:
        query = """
            SELECT t.valor, COUNT(*) as menciones
            FROM tags t
            JOIN tags_noticia tn ON tn.tag_id = t.id
            JOIN traducciones tr ON tn.traduccion_id = tr.id
            WHERE t.tipo = 'organizacion'
              AND tr.created_at >= NOW() - INTERVAL '30 days'
            GROUP BY t.valor
            ORDER BY menciones DESC
            LIMIT 50
        """
        params = ()

    with get_read_conn() as conn:
        with conn.cursor() as cur:
            cur.execute(query, params)
            rows = cur.fetchall()

    normalized_rows = aggregate_normalized_entities(rows, entity_type='organizacion')

    # Enrich with Wikipedia images and summaries
    from concurrent.futures import ThreadPoolExecutor
    from utils.wiki import fetch_wiki_data

    images = []
    summaries = []

    def get_info_safe(name):
        try:
            return fetch_wiki_data(name)
        except Exception:
            return None, None

    if normalized_rows:
        names = [row[0] for row in normalized_rows]
        with ThreadPoolExecutor(max_workers=10) as executor:
            results = list(executor.map(get_info_safe, names))
            for img, smry in results:
                images.append(img)
                summaries.append(smry)

    result = {
        "labels": [row[0] for row in normalized_rows],
        "data": [row[1] for row in normalized_rows],
        "images": images,
        "summaries": summaries
    }

    cache_set(cache_key, result, ttl_seconds=600)
    return jsonify(result)

@stats_bp.route("/api/entities/places")
def entities_places():
    """Top mentioned places, optionally filtered by country."""
    from flask import request
    from cache import cache_get, cache_set

    country_filter = request.args.get("country")
    try:
        config_mtime = os.path.getmtime(CONFIG_FILE)
    except OSError:
        config_mtime = 0

    cache_key = f"entities:places:{country_filter}:{config_mtime}"
    cached_data = cache_get(cache_key)
    if cached_data:
        return jsonify(cached_data)

    # Note: news items whose country is 'España' are always excluded from this ranking.
    if country_filter and country_filter != 'global':
        query = """
            SELECT t.valor, COUNT(*) as menciones
            FROM tags t
            JOIN tags_noticia tn ON tn.tag_id = t.id
            JOIN traducciones tr ON tn.traduccion_id = tr.id
            JOIN noticias n ON tr.noticia_id = n.id
            WHERE t.tipo = 'lugar'
              AND tr.created_at >= NOW() - INTERVAL '30 days'
              AND n.pais_id = (SELECT id FROM paises WHERE nombre ILIKE %s LIMIT 1)
              AND n.pais_id != (SELECT id FROM paises WHERE nombre = 'España')
            GROUP BY t.valor
            ORDER BY menciones DESC
            LIMIT 50
        """
        params = (country_filter,)
    else:
        query = """
            SELECT t.valor, COUNT(*) as menciones
            FROM tags t
            JOIN tags_noticia tn ON tn.tag_id = t.id
            JOIN traducciones tr ON tn.traduccion_id = tr.id
            JOIN noticias n ON tr.noticia_id = n.id
            WHERE t.tipo = 'lugar'
              AND tr.created_at >= NOW() - INTERVAL '30 days'
              AND n.pais_id != (SELECT id FROM paises WHERE nombre = 'España')
            GROUP BY t.valor
            ORDER BY menciones DESC
            LIMIT 50
        """
        params = ()

    with get_read_conn() as conn:
        with conn.cursor() as cur:
            cur.execute(query, params)
            rows = cur.fetchall()

    # Normalize and aggregate
    normalized_rows = aggregate_normalized_entities(rows, entity_type='lugar')

    # Enrich with Wikipedia images and summaries
    from concurrent.futures import ThreadPoolExecutor
    from utils.wiki import fetch_wiki_data

    images = []
    summaries = []

    def get_info_safe(name):
        try:
            return fetch_wiki_data(name)
        except Exception:
            return None, None

    if normalized_rows:
        names = [row[0] for row in normalized_rows]
        with ThreadPoolExecutor(max_workers=10) as executor:
            results = list(executor.map(get_info_safe, names))
            for img, smry in results:
                images.append(img)
                summaries.append(smry)

    result = {
        "labels": [row[0] for row in normalized_rows],
        "data": [row[1] for row in normalized_rows],
        "images": images,
        "summaries": summaries
    }

    cache_set(cache_key, result, ttl_seconds=600)
    return jsonify(result)