From 0975f44a0ea18cb0952edf1207de91ea16d7552c Mon Sep 17 00:00:00 2001
From: SITO <sito@ransomsito.ransomsito>
Date: Sun, 29 Mar 2026 19:41:11 +0200
Subject: [PATCH] =?UTF-8?q?feat:=20scraper=20de=20im=C3=A1genes=20Wikipedi?=
 =?UTF-8?q?a=20+=20debug?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- wikipedia_image_scraper.py: descarga imágenes de Wikipedia por tema
  usando Wikimedia API, con filtros de tamaño/extensión y metadatos
  (autor, licencia, dimensiones, artículo origen)
- debug_wiki.py: script de diagnóstico para verificar API responses
- .gitignore: excluye output/ y __pycache__

Fix: normalizar prefijo "Archivo:" → "File:" para Wikimedia Commons API

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 BACK_BACK/IMAGENES/.gitignore                 |   3 +
 BACK_BACK/IMAGENES/debug_wiki.py              |  30 ++
 BACK_BACK/IMAGENES/wikipedia_image_scraper.py | 446 ++++++++++++++++++
 3 files changed, 479 insertions(+)
 create mode 100644 BACK_BACK/IMAGENES/.gitignore
 create mode 100644 BACK_BACK/IMAGENES/debug_wiki.py
 create mode 100644 BACK_BACK/IMAGENES/wikipedia_image_scraper.py

diff --git a/BACK_BACK/IMAGENES/.gitignore b/BACK_BACK/IMAGENES/.gitignore
new file mode 100644
index 00000000..d8c14e4b
--- /dev/null
+++ b/BACK_BACK/IMAGENES/.gitignore
@@ -0,0 +1,3 @@
+__pycache__/
+output/
+*.pyc
diff --git a/BACK_BACK/IMAGENES/debug_wiki.py b/BACK_BACK/IMAGENES/debug_wiki.py
new file mode 100644
index 00000000..73fb032a
--- /dev/null
+++ b/BACK_BACK/IMAGENES/debug_wiki.py
@@ -0,0 +1,30 @@
+"""Debug script: prints what the Wikipedia/Wikimedia API returns for a sample topic."""
+import requests
+from wikipedia_image_scraper import (
+    search_articles, get_article_images, get_image_info, should_skip, SKIP_PATTERNS
+)
+
+# 1. Search for articles
+print("=== ARTÍCULOS ===")
+articles = search_articles("cambio climático", lang="es", limit=2)
+for a in articles:
+    print(f"  {a['title']}")
+
+# 2. Images of the first article (empty list when the search found nothing)
+print("\n=== IMÁGENES DEL ARTÍCULO ===")
+img_titles = get_article_images(articles[0]["title"], lang="es", limit=10) if articles else []
+for t in img_titles:
+    print(f"  {t}")
+
+# 3. Info for the first 5 images
+print("\n=== INFO DE CADA IMAGEN ===")
+for title in img_titles[:5]:
+    print(f"\n  Título: {title}")
+    info = get_image_info(title)
+    if info is None:
+        print("    → get_image_info devolvió None")
+        continue
+    print(f"    url:    {(info.get('url') or 'N/A')[:80]}")  # "url" may be present but None
+    print(f"    size:   {info.get('width')}x{info.get('height')}  {info.get('size_bytes')}B")
+    skip, motivo = should_skip(title, info)
+    print(f"    skip:   {skip}  ({motivo})")
diff --git a/BACK_BACK/IMAGENES/wikipedia_image_scraper.py b/BACK_BACK/IMAGENES/wikipedia_image_scraper.py
new file mode 100644
index 00000000..b5840e99
--- /dev/null
+++ b/BACK_BACK/IMAGENES/wikipedia_image_scraper.py
@@ -0,0 +1,446 @@
+"""
+wikipedia_image_scraper.py
+--------------------------
+Downloads images from Wikipedia articles by topic using the Wikimedia API.
+Saves them to a local folder and records the metadata in JSON / MongoDB.
+
+Flow:
+  1. Search Wikipedia articles by topic/keyword
+  2. Extract the images of each article (Wikimedia API)
+  3. Filter out irrelevant images (icons, flags, small logos...)
+  4. Download the images to the destination folder
+  5. Save metadata: article title, topic, url, description, date
+  6. Optional: save metadata to the MongoDB collection 'imagenes_wiki'
+
+Usage:
+    python wikipedia_image_scraper.py --tema "cambio climático" --max 30
+    python wikipedia_image_scraper.py --tema "geopolítica" --lang es --max 50
+    python wikipedia_image_scraper.py --temas temas.txt --max 20
+    python wikipedia_image_scraper.py --tema "climate change" --lang en --max 40
+
+Requirements:
+    pip install requests Pillow pymongo python-dotenv
+"""
+
+import argparse
+import json
+import os
+import time
+from datetime import datetime
+from pathlib import Path
+from urllib.parse import quote, urlparse
+
+import requests
+from PIL import Image, UnidentifiedImageError
+
+# ── Configuration ──────────────────────────────────────────────────────────────
+
+WIKI_API_ES  = "https://es.wikipedia.org/w/api.php"
+WIKI_API_EN  = "https://en.wikipedia.org/w/api.php"
+WIKIMEDIA_API = "https://commons.wikimedia.org/w/api.php"
+
+OUTPUT_BASE  = Path(__file__).parent / "output" / "wiki_images"  # default root, next to this file
+
+# Minimum size for an image to be considered relevant (pixels)
+MIN_WIDTH  = 200
+MIN_HEIGHT = 200
+MIN_BYTES  = 20_000   # 20 KB minimum
+
+# Accepted file extensions (compared against the lower-cased suffix)
+VALID_EXTENSIONS = {".jpg", ".jpeg", ".png", ".webp"}
+
+# Filename substrings to ignore (icons, flags, etc.); matched case-sensitively
+SKIP_PATTERNS = [
+    "flag_", "Flag_", "icon", "Icon", "logo", "Logo",
+    "symbol", "Symbol", "coat_of_arms", "Coat_of_arms",
+    "commons-logo", "wiki", "Wiki", "question_mark",
+    "edit-", "nuvola", "Nuvola", "pictogram", "Pictogram",
+    "OOjs", "Ambox", "Portal-", "Disambig",
+]
+
+HEADERS = {  # sent with every API request and image download in this module
+    "User-Agent": "FLUJOS-Project/1.0 (https://gitea.laenre.net/hacklab/FLUJOS; educational research)"
+}
+
+
+# ── Funciones de búsqueda Wikipedia ───────────────────────────────────────────
+
+def search_articles(tema: str, lang: str = "es", limit: int = 10) -> list[dict]:
+    """Search Wikipedia for `tema` and return a list of {title, pageid, snippet}.
+
+    Queries English Wikipedia when `lang == "en"`, Spanish Wikipedia otherwise.
+    Snippets arrive as HTML, so tags are stripped and entities unescaped.
+    Raises requests.HTTPError when the API answers with an error status.
+    """
+    import html  # local imports: only needed to clean the snippet text
+    import re
+
+    api_url = WIKI_API_EN if lang == "en" else WIKI_API_ES
+    params = {
+        "action": "query", "list": "search", "format": "json",
+        "srsearch": tema, "srlimit": limit,
+        "srinfo": "totalhits", "srprop": "snippet|titlesnippet",
+    }
+    resp = requests.get(api_url, params=params, headers=HEADERS, timeout=15)
+    resp.raise_for_status()
+
+    return [
+        {
+            "title":   item["title"],
+            "pageid":  item["pageid"],
+            "snippet": html.unescape(re.sub(r"<[^>]+>", "", item.get("snippet", ""))),
+        }
+        for item in resp.json().get("query", {}).get("search", [])
+    ]
+
+
+def get_article_images(title: str, lang: str = "es", limit: int = 20) -> list[str]:
+    """Return the file titles of the images listed for one Wikipedia article.
+
+    Uses English Wikipedia when `lang == "en"` and Spanish Wikipedia otherwise;
+    `limit` is sent to the API as ``imlimit``.
+    """
+    query = {
+        "action":  "query",
+        "titles":  title,
+        "prop":    "images",
+        "imlimit": limit,
+        "format":  "json",
+    }
+    endpoint = WIKI_API_ES if lang != "en" else WIKI_API_EN
+
+    response = requests.get(endpoint, params=query, headers=HEADERS, timeout=15)
+    response.raise_for_status()
+    page_map = response.json().get("query", {}).get("pages", {})
+
+    # Flatten the per-page "images" lists into a single list of titles.
+    return [
+        entry["title"] for page in page_map.values() for entry in page.get("images", [])
+    ]
+
+
+def get_image_info(file_title: str) -> dict | None:
+    """
+    Query the Wikimedia Commons API for one file.
+
+    Returns a dict with the download URL (the API thumbnail requested at width
+    1200 when present, otherwise the original URL), the original URL, width,
+    height, byte size, and description / author / licence / original date
+    from ``extmetadata``; None when the response has no ``imageinfo``.
+    """
+    # Wikipedia ES uses localized namespaces ("Archivo:"); Commons uses "File:".
+    if file_title.startswith(("Archivo:", "Fichero:", "Image:", "Imagen:")):
+        file_title = "File:" + file_title.split(":", 1)[1]
+
+    query = {
+        "action":     "query",
+        "titles":     file_title,
+        "prop":       "imageinfo",
+        "iiprop":     "url|size|extmetadata",
+        "iiurlwidth": 1200,
+        "format":     "json",
+    }
+    response = requests.get(WIKIMEDIA_API, params=query, headers=HEADERS, timeout=15)
+    response.raise_for_status()
+
+    # Only the first page of the response is inspected.
+    first_page = next(iter(response.json().get("query", {}).get("pages", {}).values()), None)
+    if first_page is None or not first_page.get("imageinfo", []):
+        return None
+    details = first_page["imageinfo"][0]
+    ext_meta = details.get("extmetadata", {})
+
+    def meta_value(key: str) -> str:
+        return ext_meta.get(key, {}).get("value", "")
+
+    return {
+        "url":          details.get("thumburl") or details.get("url"),
+        "url_original": details.get("url"),
+        "width":        details.get("width", 0),
+        "height":       details.get("height", 0),
+        "size_bytes":   details.get("size", 0),
+        "descripcion":  meta_value("ImageDescription"),
+        "autor":        meta_value("Artist"),
+        "licencia":     meta_value("LicenseShortName"),
+        "fecha_orig":   meta_value("DateTimeOriginal"),
+    }
+
+
+# ── Filtros ────────────────────────────────────────────────────────────────────
+
+def should_skip(file_title: str, img_info: dict) -> tuple[bool, str]:
+    """Return (skip, reason): skip is True when the image must be discarded.
+
+    Checks in order: extension, SKIP_PATTERNS, pixel size, byte size."""
+    # API titles use spaces ("Flag of Spain.png") while SKIP_PATTERNS are
+    # written with underscores, so normalize before matching.
+    filename = Path(file_title).name.replace(" ", "_")
+
+    ext = Path(filename).suffix.lower()
+    if ext not in VALID_EXTENSIONS:
+        return True, f"extensión no válida: {ext}"
+
+    hit = next((p for p in SKIP_PATTERNS if p in filename), None)
+    if hit is not None:
+        return True, f"patrón ignorado: {hit}"
+
+    width, height = img_info.get("width") or 0, img_info.get("height") or 0  # None-safe
+    if width < MIN_WIDTH or height < MIN_HEIGHT:
+        return True, f"demasiado pequeña: {img_info.get('width')}x{img_info.get('height')}"
+    if (img_info.get("size_bytes") or 0) < MIN_BYTES:
+        return True, f"archivo demasiado pequeño: {img_info.get('size_bytes')} bytes"
+    return False, ""
+
+
+# ── Descarga ───────────────────────────────────────────────────────────────────
+
+def download_image(url: str, dest_path: Path) -> bool:
+    """Download `url` to `dest_path`; return True only for a valid image.
+
+    Best-effort: any network, filesystem or Pillow validation error removes
+    the partial file and yields False instead of raising.
+    """
+    try:
+        # Context manager releases the streamed connection even on failure.
+        with requests.get(url, headers=HEADERS, timeout=30, stream=True) as resp:
+            resp.raise_for_status()
+            with open(dest_path, "wb") as f:
+                for chunk in resp.iter_content(chunk_size=8192):
+                    f.write(chunk)
+
+        with Image.open(dest_path) as img:
+            img.verify()  # raises if the file is not a readable image
+        return True
+    except Exception:  # UnidentifiedImageError is already an Exception subclass
+        dest_path.unlink(missing_ok=True)
+        return False
+
+
+# ── Pipeline principal ─────────────────────────────────────────────────────────
+
+class WikipediaImageScraper:
+    """Search Wikipedia by topic, download the images and collect their metadata."""
+
+    def __init__(self, output_dir: Path = OUTPUT_BASE, lang: str = "es"):
+        self.output_dir = output_dir
+        self.lang = lang
+        self.output_dir.mkdir(parents=True, exist_ok=True)
+        self.session = requests.Session()
+
+    def scrape_tema(self, tema: str, max_images: int = 30, max_articles: int = 10) -> list[dict]:
+        """
+        Download Wikipedia images about one topic.
+
+        Args:
+            tema: topic to search for (e.g. "cambio climático")
+            max_images: maximum number of images to download
+            max_articles: maximum number of articles to explore
+
+        Returns:
+            Metadata dicts for the images downloaded by this call. Files that
+            already exist on disk count towards max_images but get no entry.
+        """
+        # One folder per topic
+        tema_slug = tema.lower().replace(" ", "_").replace("/", "-")[:40]
+        tema_dir = self.output_dir / tema_slug
+        tema_dir.mkdir(exist_ok=True)
+
+        print(f"\n[WikiScraper] Tema: '{tema}'  |  max_images={max_images}  |  lang={self.lang}")
+        print("-" * 50)
+
+        # 1. Search for articles
+        articles = search_articles(tema, lang=self.lang, limit=max_articles)
+        print(f"  Artículos encontrados: {len(articles)}")
+        for a in articles[:5]:
+            print(f"    · {a['title']}")
+        if len(articles) > 5:
+            print(f"    ... y {len(articles)-5} más")
+
+        downloaded = []
+        total_downloaded = 0
+        seen_titles = set()  # articles on one topic share images; handle each file once
+
+        for article in articles:
+            if total_downloaded >= max_images:
+                break
+            print(f"\n  → {article['title']}")
+
+            # 2. Images used by the article
+            try:
+                img_titles = get_article_images(article["title"], lang=self.lang, limit=25)
+            except Exception as e:
+                print(f"     ERROR obteniendo imágenes: {e}")
+                continue
+            print(f"     {len(img_titles)} imágenes en el artículo")
+
+            for img_title in img_titles:
+                if total_downloaded >= max_images:
+                    break
+                if img_title in seen_titles:
+                    continue
+                seen_titles.add(img_title)
+
+                # 3. Image info; report failures instead of hiding them
+                try:
+                    img_info = get_image_info(img_title)
+                except Exception as e:
+                    print(f"     ERROR obteniendo info de '{img_title}': {e}")
+                    img_info = None
+                time.sleep(0.2)  # respect the Wikimedia rate limit, also after an error
+                if not img_info or not img_info.get("url"):
+                    continue
+
+                # 4. Filter
+                skip, motivo = should_skip(img_title, img_info)
+                if skip:
+                    continue
+
+                # 5. Local file name, numbered by download position
+                original_name = Path(urlparse(img_info["url"]).path).name
+                ext = Path(original_name).suffix.lower() or ".jpg"
+                safe_name = f"{tema_slug}_{total_downloaded:03d}{ext}"
+                dest_path = tema_dir / safe_name
+                if dest_path.exists():
+                    print(f"     ↳ ya existe: {safe_name}")
+                    total_downloaded += 1
+                    continue
+
+                # 6. Download
+                print(f"     ↓ {safe_name}  ({img_info['width']}x{img_info['height']}  {img_info['size_bytes']//1024}KB)")
+                if not download_image(img_info["url"], dest_path):
+                    print("     ✗ fallo descarga")
+                    continue
+                downloaded.append({
+                    "archivo":          safe_name,
+                    "image_path":       str(dest_path.resolve()),
+                    "tema":             tema.lower(),
+                    "subtema":          article["title"].lower(),
+                    "texto":            article.get("snippet", ""),
+                    "descripcion_wiki": img_info.get("descripcion", ""),
+                    "autor":            img_info.get("autor", ""),
+                    "licencia":         img_info.get("licencia", ""),
+                    "url_original":     img_info.get("url_original", ""),
+                    "width":            img_info["width"],
+                    "height":           img_info["height"],
+                    "size_bytes":       img_info["size_bytes"],
+                    "source_type":      "wikipedia_imagen",
+                    "lang":             self.lang,
+                    "fecha":            datetime.now().strftime("%Y-%m-%d"),
+                    "articulo_wiki":    article["title"],
+                    "keywords":         [],  # filled in later by image_analyzer.py
+                })
+                total_downloaded += 1
+
+        print(f"\n[WikiScraper] Descargadas: {total_downloaded} imágenes en {tema_dir}")
+        return downloaded
+
+    def scrape_multitema(self, temas: list[str], max_per_tema: int = 20) -> list[dict]:
+        """Download images for several topics; a topic whose requests fail is reported and skipped."""
+        all_results = []
+        for tema in temas:
+            try:
+                all_results.extend(self.scrape_tema(tema, max_images=max_per_tema))
+            except requests.RequestException as e:
+                print(f"[WikiScraper] ERROR en tema '{tema}': {e}")
+            time.sleep(1)  # pause between topics
+        return all_results
+
+    def save_metadata(self, metadata: list[dict], json_path: Path | None = None) -> Path:
+        """Write `metadata` as JSON (default: output_dir/metadata_<timestamp>.json); return the path."""
+        if json_path is None:
+            ts = datetime.now().strftime("%Y%m%d_%H%M%S")
+            json_path = self.output_dir / f"metadata_{ts}.json"
+
+        with open(json_path, "w", encoding="utf-8") as f:
+            json.dump(metadata, f, ensure_ascii=False, indent=2)
+
+        print(f"[WikiScraper] Metadatos guardados: {json_path}")
+        return json_path
+
+    def save_to_mongo(self, metadata: list[dict]) -> dict:
+        """Upsert `metadata` into the MongoDB collection 'imagenes_wiki'; return insert/update counts."""
+        from mongo_helper import MongoHelper
+        mongo = MongoHelper()
+
+        if not mongo.is_available():
+            print("[WikiScraper] MongoDB no disponible — solo JSON local")
+            return {"inserted": 0, "updated": 0}
+
+        from pymongo import UpdateOne
+        stats = {"inserted": 0, "updated": 0}
+        db = mongo.connect()
+        try:
+            # Dedicated collection so these docs do not mix with the analyzed images
+            col = db["imagenes_wiki"]
+            col.create_index("archivo", unique=True)
+            ops = [
+                UpdateOne({"archivo": doc["archivo"]}, {"$set": doc}, upsert=True)
+                for doc in metadata
+            ]
+            if ops:
+                result = col.bulk_write(ops)
+                stats = {"inserted": result.upserted_count, "updated": result.modified_count}
+        finally:
+            mongo.disconnect()  # always release the connection, even if the write fails
+        print(f"[WikiScraper] MongoDB imagenes_wiki → {stats}")
+        return stats
+
+
+# ── CLI ────────────────────────────────────────────────────────────────────────
+
+# Default FLUJOS topics
+TEMAS_FLUJOS = [
+    "cambio climático",
+    "geopolítica conflictos",
+    "seguridad internacional espionaje",
+    "libertad de prensa periodismo",
+    "corporaciones poder económico",
+    "populismo extremismo",
+    "desinformación redes sociales",
+    "privacidad vigilancia masiva",
+    "biodiversidad medioambiente",
+    "inteligencia artificial algoritmos",
+]
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Descarga imágenes de Wikipedia por tema")
+
+    group = parser.add_mutually_exclusive_group(required=True)
+    group.add_argument("--tema",   help="Tema único a buscar (ej: 'cambio climático')")
+    group.add_argument("--temas",  help="Fichero .txt con un tema por línea")
+    group.add_argument("--flujos", action="store_true", help="Usar los temas de FLUJOS por defecto")
+
+    parser.add_argument("--max",    type=int, default=20,  help="Máximo imágenes por tema (default: 20)")
+    # Only es and en have API endpoints; other values would silently query the Spanish wiki.
+    parser.add_argument("--lang",   default="es", choices=("es", "en"), help="Idioma Wikipedia: es | en (default: es)")
+    parser.add_argument("--output", default=str(OUTPUT_BASE), help="Carpeta de destino")
+    parser.add_argument("--mongo",  action="store_true",   help="Guardar metadatos en MongoDB")
+
+    args = parser.parse_args()
+    scraper = WikipediaImageScraper(output_dir=Path(args.output), lang=args.lang)
+
+    # Build the list of topics
+    if args.flujos:
+        temas = TEMAS_FLUJOS
+    elif args.temas:
+        with open(args.temas, encoding="utf-8") as f:
+            temas = [line.strip() for line in f if line.strip()]
+    else:
+        temas = [args.tema]
+
+    # Run
+    if len(temas) == 1:
+        metadata = scraper.scrape_tema(temas[0], max_images=args.max)
+    else:
+        metadata = scraper.scrape_multitema(temas, max_per_tema=args.max)
+
+    # Save results
+    if metadata:
+        json_path = scraper.save_metadata(metadata)
+        print(f"\n  Total imágenes descargadas: {len(metadata)}")
+        print(f"  JSON: {json_path}")
+
+        if args.mongo:
+            scraper.save_to_mongo(metadata)
+    else:
+        print("\n  No se descargaron imágenes.")