"""
|
|
wikipedia_image_scraper.py
|
|
--------------------------
|
|
Descarga imágenes de artículos de Wikipedia por tema usando la Wikimedia API.
|
|
Las guarda en una carpeta local y registra los metadatos en JSON / MongoDB.
|
|
|
|
Flujo:
|
|
1. Busca artículos en Wikipedia por tema/keyword
|
|
2. Para cada artículo extrae las imágenes (Wikimedia API)
|
|
3. Filtra imágenes no relevantes (iconos, banderas, logos pequeños...)
|
|
4. Descarga las imágenes a la carpeta de destino
|
|
5. Guarda metadatos: título del artículo, tema, url, descripción, fecha
|
|
6. Opcional: guarda metadatos en MongoDB colección 'imagenes_wiki'
|
|
|
|
Uso:
|
|
python wikipedia_image_scraper.py --tema "cambio climático" --max 30
|
|
python wikipedia_image_scraper.py --tema "geopolítica" --lang es --max 50
|
|
python wikipedia_image_scraper.py --temas temas.txt --max 20
|
|
python wikipedia_image_scraper.py --tema "climate change" --lang en --max 40
|
|
|
|
Requisitos:
|
|
pip install requests Pillow pymongo python-dotenv
|
|
"""
|
|
|
|
# Standard library
import argparse
import json
import os
import time
from datetime import datetime
from pathlib import Path
from urllib.parse import quote, urlparse

# Third-party
import requests
from PIL import Image, UnidentifiedImageError

# NOTE(review): `os` and `urllib.parse.quote` appear unused in this file —
# confirm before removing.
# ── Configuration ──────────────────────────────────────────────────────────────

WIKI_API_ES = "https://es.wikipedia.org/w/api.php"
WIKI_API_EN = "https://en.wikipedia.org/w/api.php"
WIKIMEDIA_API = "https://commons.wikimedia.org/w/api.php"

# Default download root, relative to this script's location.
OUTPUT_BASE = Path(__file__).parent / "output" / "wiki_images"

# Minimum size for an image to be considered relevant (pixels)
MIN_WIDTH = 200
MIN_HEIGHT = 200
MIN_BYTES = 20_000  # 20KB minimum

# Accepted file extensions
VALID_EXTENSIONS = {".jpg", ".jpeg", ".png", ".webp"}

# Filename substrings that mark an image as skippable (icons, flags, etc.).
# NOTE(review): matching is a plain substring test, so e.g. "wiki" also skips
# any filename merely containing that word — confirm this is intended.
SKIP_PATTERNS = [
    "flag_", "Flag_", "icon", "Icon", "logo", "Logo",
    "symbol", "Symbol", "coat_of_arms", "Coat_of_arms",
    "commons-logo", "wiki", "Wiki", "question_mark",
    "edit-", "nuvola", "Nuvola", "pictogram", "Pictogram",
    "OOjs", "Ambox", "Portal-", "Disambig",
]

# Identifying User-Agent, as required by Wikimedia API etiquette.
HEADERS = {
    "User-Agent": "FLUJOS-Project/1.0 (https://gitea.laenre.net/hacklab/FLUJOS; educational research)"
}


# ── Wikipedia search helpers ───────────────────────────────────────────────────
def search_articles(tema: str, lang: str = "es", limit: int = 10) -> list[dict]:
    """Search Wikipedia for articles about a topic.

    Returns a list of dicts with keys: title, pageid, snippet (the snippet
    has the search-highlight markup stripped out).
    """
    endpoint = WIKI_API_EN if lang == "en" else WIKI_API_ES

    query = {
        "action": "query",
        "list": "search",
        "srsearch": tema,
        "srlimit": limit,
        "format": "json",
        "srinfo": "totalhits",
        "srprop": "snippet|titlesnippet",
    }

    response = requests.get(endpoint, params=query, headers=HEADERS, timeout=15)
    response.raise_for_status()
    payload = response.json()

    def _strip_highlight(snippet: str) -> str:
        # Remove the <span class="searchmatch"> markers the API injects.
        return snippet.replace('<span class="searchmatch">', "").replace("</span>", "")

    hits = payload.get("query", {}).get("search", [])
    return [
        {
            "title": hit["title"],
            "pageid": hit["pageid"],
            "snippet": _strip_highlight(hit.get("snippet", "")),
        }
        for hit in hits
    ]
def get_article_images(title: str, lang: str = "es", limit: int = 20) -> list[str]:
    """Return the image file titles referenced by a Wikipedia article."""
    endpoint = WIKI_API_EN if lang == "en" else WIKI_API_ES

    query = {
        "action": "query",
        "titles": title,
        "prop": "images",
        "imlimit": limit,
        "format": "json",
    }

    response = requests.get(endpoint, params=query, headers=HEADERS, timeout=15)
    response.raise_for_status()
    payload = response.json()

    pages = payload.get("query", {}).get("pages", {})
    # Flatten all pages' image entries into a single list of titles.
    return [
        entry["title"]
        for page in pages.values()
        for entry in page.get("images", [])
    ]
def get_image_info(file_title: str) -> dict | None:
    """Fetch image details from the Wikimedia API.

    Returns a dict with the direct download url, dimensions, description,
    author and license — or None when the file has no image info.
    """
    # Normalize the namespace: Spanish Wikipedia uses "Archivo:", Commons "File:".
    for prefix in ("Archivo:", "Fichero:", "Image:", "Imagen:"):
        if file_title.startswith(prefix):
            file_title = "File:" + file_title[len(prefix):]
            break

    query = {
        "action": "query",
        "titles": file_title,
        "prop": "imageinfo",
        "iiprop": "url|size|extmetadata",
        "iiurlwidth": 1200,
        "format": "json",
    }

    response = requests.get(WIKIMEDIA_API, params=query, headers=HEADERS, timeout=15)
    response.raise_for_status()
    payload = response.json()

    pages = payload.get("query", {}).get("pages", {})
    # Only the first returned page matters (we queried a single title).
    page = next(iter(pages.values()), None)
    if page is None:
        return None

    infos = page.get("imageinfo", [])
    if not infos:
        return None
    info = infos[0]

    meta = info.get("extmetadata", {})

    def _field(key: str) -> str:
        # extmetadata entries are dicts of the form {"value": ...}.
        return meta.get(key, {}).get("value", "")

    return {
        "url": info.get("thumburl") or info.get("url"),
        "url_original": info.get("url"),
        "width": info.get("width", 0),
        "height": info.get("height", 0),
        "size_bytes": info.get("size", 0),
        "descripcion": _field("ImageDescription"),
        "autor": _field("Artist"),
        "licencia": _field("LicenseShortName"),
        "fecha_orig": _field("DateTimeOriginal"),
    }
# ── Filters ────────────────────────────────────────────────────────────────────

def should_skip(file_title: str, img_info: dict) -> tuple[bool, str]:
    """Return (skip, reason) — True when the image should be discarded."""
    filename = Path(file_title).name
    extension = Path(filename).suffix.lower()

    # Reject anything that is not a plain raster image format.
    if extension not in VALID_EXTENSIONS:
        return True, f"extensión no válida: {extension}"

    # Reject filenames matching known icon/flag/logo substrings.
    matched = next((p for p in SKIP_PATTERNS if p in filename), None)
    if matched is not None:
        return True, f"patrón ignorado: {matched}"

    # Reject images below the minimum pixel dimensions.
    if img_info.get("width", 0) < MIN_WIDTH or img_info.get("height", 0) < MIN_HEIGHT:
        return True, f"demasiado pequeña: {img_info.get('width')}x{img_info.get('height')}"

    # Reject files below the minimum byte size.
    if img_info.get("size_bytes", 0) < MIN_BYTES:
        return True, f"archivo demasiado pequeño: {img_info.get('size_bytes')} bytes"

    return False, ""
# ── Download ───────────────────────────────────────────────────────────────────

def download_image(url: str, dest_path: Path) -> bool:
    """Download an image to *dest_path*. Returns True on success.

    The file is streamed in chunks, then validated with Pillow; on any
    failure the partial/invalid file is removed and False is returned.
    """
    try:
        # `with` ensures the streamed connection is released even on errors
        # (the original leaked the response when stream=True).
        with requests.get(url, headers=HEADERS, timeout=30, stream=True) as resp:
            resp.raise_for_status()
            with open(dest_path, "wb") as f:
                for chunk in resp.iter_content(chunk_size=8192):
                    f.write(chunk)

        # Confirm the payload really is a decodable image; verify() raises
        # (e.g. UnidentifiedImageError) on corrupt or non-image data.
        with Image.open(dest_path) as img:
            img.verify()

        return True

    except Exception:
        # Best-effort cleanup: never leave a partial/invalid file behind.
        # (The original caught `(UnidentifiedImageError, Exception)`, which
        # is redundant — `Exception` already covers it.)
        if dest_path.exists():
            dest_path.unlink()
        return False
# ── Main pipeline ──────────────────────────────────────────────────────────────

class WikipediaImageScraper:
    """Downloads topic-related Wikipedia images and records their metadata."""

    def __init__(self, output_dir: Path = OUTPUT_BASE, lang: str = "es"):
        # Destination root is created eagerly so later writes cannot fail on
        # a missing directory.
        self.output_dir = output_dir
        self.lang = lang
        self.output_dir.mkdir(parents=True, exist_ok=True)
        # NOTE(review): this session appears unused — the module-level helpers
        # call requests.get directly, so there is no connection reuse; confirm.
        self.session = requests.Session()

    def scrape_tema(self, tema: str, max_images: int = 30, max_articles: int = 10) -> list[dict]:
        """Download Wikipedia images about a topic.

        Args:
            tema: topic to search for (e.g. "cambio climático")
            max_images: maximum number of images to download
            max_articles: maximum number of articles to explore

        Returns:
            List of metadata dicts, one per downloaded image.
        """
        # One subfolder per topic; slug is lowercased, space→underscore,
        # slash→dash, truncated to 40 chars.
        tema_slug = tema.lower().replace(" ", "_").replace("/", "-")[:40]
        tema_dir = self.output_dir / tema_slug
        tema_dir.mkdir(exist_ok=True)

        print(f"\n[WikiScraper] Tema: '{tema}' | max_images={max_images} | lang={self.lang}")
        print("-" * 50)

        # 1. Search for candidate articles
        articles = search_articles(tema, lang=self.lang, limit=max_articles)
        print(f" Artículos encontrados: {len(articles)}")
        for a in articles[:5]:
            print(f" · {a['title']}")
        if len(articles) > 5:
            print(f" ... y {len(articles)-5} más")

        downloaded = []
        total_downloaded = 0

        for article in articles:
            if total_downloaded >= max_images:
                break

            print(f"\n → {article['title']}")

            # 2. List the article's images
            try:
                img_titles = get_article_images(article["title"], lang=self.lang, limit=25)
            except Exception as e:
                print(f" ERROR obteniendo imágenes: {e}")
                continue

            print(f" {len(img_titles)} imágenes en el artículo")

            for img_title in img_titles:
                if total_downloaded >= max_images:
                    break

                # 3. Fetch per-image info (url, size, license...)
                try:
                    img_info = get_image_info(img_title)
                    time.sleep(0.2)  # respect Wikimedia rate limits
                except Exception as e:
                    # Best-effort: a single failing image should not stop the run.
                    continue

                if not img_info or not img_info.get("url"):
                    continue

                # 4. Apply relevance filters (extension, patterns, size)
                skip, motivo = should_skip(img_title, img_info)
                if skip:
                    continue

                # 5. Derive a stable local file name: slug + running counter
                original_name = Path(urlparse(img_info["url"]).path).name
                ext = Path(original_name).suffix.lower() or ".jpg"
                safe_name = f"{tema_slug}_{total_downloaded:03d}{ext}"
                dest_path = tema_dir / safe_name

                # Skip files downloaded on a previous run (still counts
                # towards max_images so numbering stays consistent).
                if dest_path.exists():
                    print(f" ↳ ya existe: {safe_name}")
                    total_downloaded += 1
                    continue

                # 6. Download and record metadata
                print(f" ↓ {safe_name} ({img_info['width']}x{img_info['height']} {img_info['size_bytes']//1024}KB)")
                success = download_image(img_info["url"], dest_path)

                if success:
                    meta = {
                        "archivo": safe_name,
                        "image_path": str(dest_path.resolve()),
                        "tema": tema.lower(),
                        "subtema": article["title"].lower(),
                        "texto": article.get("snippet", ""),
                        "descripcion_wiki": img_info.get("descripcion", ""),
                        "autor": img_info.get("autor", ""),
                        "licencia": img_info.get("licencia", ""),
                        "url_original": img_info.get("url_original", ""),
                        "width": img_info["width"],
                        "height": img_info["height"],
                        "size_bytes": img_info["size_bytes"],
                        "source_type": "wikipedia_imagen",
                        "lang": self.lang,
                        "fecha": datetime.now().strftime("%Y-%m-%d"),
                        "articulo_wiki": article["title"],
                        "keywords": [],  # filled in later by image_analyzer.py
                    }
                    downloaded.append(meta)
                    total_downloaded += 1
                else:
                    print(f" ✗ fallo descarga")

        print(f"\n[WikiScraper] Descargadas: {total_downloaded} imágenes en {tema_dir}")
        return downloaded

    def scrape_multitema(self, temas: list[str], max_per_tema: int = 20) -> list[dict]:
        """Download images for multiple topics; returns the combined metadata."""
        all_results = []
        for tema in temas:
            results = self.scrape_tema(tema, max_images=max_per_tema)
            all_results.extend(results)
            time.sleep(1)  # pause between topics to stay polite with the API
        return all_results

    def save_metadata(self, metadata: list[dict], json_path: Path | None = None) -> Path:
        """Save metadata as JSON; returns the path written.

        When json_path is None, a timestamped file is created under output_dir.
        """
        if json_path is None:
            ts = datetime.now().strftime("%Y%m%d_%H%M%S")
            json_path = self.output_dir / f"metadata_{ts}.json"

        with open(json_path, "w", encoding="utf-8") as f:
            json.dump(metadata, f, ensure_ascii=False, indent=2)

        print(f"[WikiScraper] Metadatos guardados: {json_path}")
        return json_path

    def save_to_mongo(self, metadata: list[dict]) -> dict:
        """Upsert metadata into the MongoDB collection 'imagenes_wiki'.

        Returns {"inserted": n, "updated": m}; a no-op when Mongo is down.
        """
        from mongo_helper import MongoHelper
        mongo = MongoHelper()

        if not mongo.is_available():
            print("[WikiScraper] MongoDB no disponible — solo JSON local")
            return {"inserted": 0, "updated": 0}

        # Dedicated collection so raw downloads don't mix with analyzed images.
        db = mongo.connect()
        from pymongo import UpdateOne
        col = db["imagenes_wiki"]
        col.create_index("archivo", unique=True)

        # Upsert keyed on the local file name so re-runs update in place.
        ops = [
            UpdateOne({"archivo": doc["archivo"]}, {"$set": doc}, upsert=True)
            for doc in metadata
        ]
        if ops:
            result = col.bulk_write(ops)
            stats = {"inserted": result.upserted_count, "updated": result.modified_count}
        else:
            stats = {"inserted": 0, "updated": 0}

        print(f"[WikiScraper] MongoDB imagenes_wiki → {stats}")
        mongo.disconnect()
        return stats
# ── CLI ────────────────────────────────────────────────────────────────────────

# Default FLUJOS topic list (used with --flujos)
TEMAS_FLUJOS = [
    "cambio climático",
    "geopolítica conflictos",
    "seguridad internacional espionaje",
    "libertad de prensa periodismo",
    "corporaciones poder económico",
    "populismo extremismo",
    "desinformación redes sociales",
    "privacidad vigilancia masiva",
    "biodiversidad medioambiente",
    "inteligencia artificial algoritmos",
]
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Descarga imágenes de Wikipedia por tema")

    # Exactly one topic source must be chosen.
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("--tema", help="Tema único a buscar (ej: 'cambio climático')")
    group.add_argument("--temas", help="Fichero .txt con un tema por línea")
    group.add_argument("--flujos", action="store_true", help="Usar los temas de FLUJOS por defecto")

    parser.add_argument("--max", type=int, default=20, help="Máximo imágenes por tema (default: 20)")
    parser.add_argument("--lang", default="es", help="Idioma Wikipedia: es | en (default: es)")
    parser.add_argument("--output", default=str(OUTPUT_BASE), help="Carpeta de destino")
    parser.add_argument("--mongo", action="store_true", help="Guardar metadatos en MongoDB")

    args = parser.parse_args()

    scraper = WikipediaImageScraper(output_dir=Path(args.output), lang=args.lang)

    # Resolve the topic list from whichever option was supplied.
    if args.flujos:
        temas = TEMAS_FLUJOS
    elif args.temas:
        with open(args.temas, encoding="utf-8") as fh:
            temas = [stripped for line in fh if (stripped := line.strip())]
    else:
        temas = [args.tema]

    # Run: single-topic vs multi-topic pipeline.
    metadata = (
        scraper.scrape_tema(temas[0], max_images=args.max)
        if len(temas) == 1
        else scraper.scrape_multitema(temas, max_per_tema=args.max)
    )

    # Persist results.
    if not metadata:
        print("\n No se descargaron imágenes.")
    else:
        json_path = scraper.save_metadata(metadata)
        print(f"\n Total imágenes descargadas: {len(metadata)}")
        print(f" JSON: {json_path}")

        if args.mongo:
            scraper.save_to_mongo(metadata)