código completo FLUJOS — snapshot limpio sin datos scrapeados

Incluye: backend Node.js/Express, visualización 3D (Three.js/3d-force-graph), scrapers Wikipedia/noticias/imágenes, analizador Qwen3-VL, pipeline maestro con systemd timer, fixes de seguridad (NoSQL injection, XSS, ReDoS, port binding) y documentación técnica completa en docs/. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-21 23:45:29 +02:00 · 2026-04-21 23:45:29 +02:00 · 83f67b76b4
commit 83f67b76b4
parent 013fe673f3
190 changed files with 193337 additions and 2 deletions
--- a/FLUJOS_DATOS/IMAGENES/image_analyzer.py
+++ b/FLUJOS_DATOS/IMAGENES/image_analyzer.py
@ -0,0 +1,324 @@
+"""
+image_analyzer.py
+-----------------
+Analiza imágenes con Qwen3-VL-8B-Instruct (HuggingFace transformers).
+Extrae tema, subtema, keywords, descripción y entidades.
+
+Mejoras:
+  - Opción 3: Resume — salta imágenes ya analizadas en MongoDB
+  - Opción 4: Prioriza imágenes cuyos artículos ya están en MongoDB
+  - Opción 5: Batch inference — procesa N imágenes a la vez (ahorra RAM en activaciones)
+
+Uso:
+    analyzer = ImageAnalyzer()
+    result   = analyzer.analyze("foto.jpg")
+    results  = analyzer.analyze_folder("./mis_imagenes/", batch_size=4)
+    results  = analyzer.analyze_folder("./mis_imagenes/", resume=True)
+"""
+
+import json
+import os
+import re
+from datetime import datetime
+from pathlib import Path
+
+import torch
+from PIL import Image
+from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
+
+# ── Configuración ──────────────────────────────────────────────────────────────
+
+# Hugging Face model id; override with the VISION_MODEL environment variable.
+MODEL_ID  = os.getenv("VISION_MODEL", "Qwen/Qwen3-VL-8B-Instruct")
+# Directory passed as cache_dir when loading the model; override with HF_HOME.
+CACHE_DIR = os.getenv("HF_HOME", "/var/www/theflows.net/flujos/FLUJOS_DATOS/IMAGENES/model_cache")
+
+# Lower-cased file suffixes that are treated as images.
+SUPPORTED_EXTENSIONS = {".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp"}
+
+# Approximate RAM per image in a batch: ~500MB of encoder activations
+# Base model in bfloat16: ~16GB
+# Batch of 4: ~18GB total → safe with 64GB
+DEFAULT_BATCH_SIZE = 4
+
+# Instruction text sent together with each image. It asks the model for a single
+# JSON object; the keys named here are the ones read back when building a result.
+KEYWORD_PROMPT = """Analiza esta imagen en detalle.
+Devuelve ÚNICAMENTE un objeto JSON válido con esta estructura exacta, sin texto adicional:
+
+{
+  "tema": "tema principal de la imagen (1-3 palabras en español)",
+  "subtema": "subtema específico (1-4 palabras en español)",
+  "keywords": ["palabra1", "palabra2", "palabra3"],
+  "descripcion": "descripción breve y objetiva de lo que muestra la imagen (1-2 frases)",
+  "entidades": ["nombre_propio1", "organizacion1", "lugar1"],
+  "idioma_detectado": "es/en/fr/..."
+}
+
+Requisitos:
+- keywords: entre 8 y 15 palabras clave relevantes, en minúsculas
+- entidades: solo si son claramente visibles/identificables, puede estar vacío []
+- todo el contenido en español salvo entidades propias
+- SOLO el JSON, sin markdown ni explicaciones"""
+
+
+# ── Clase principal ────────────────────────────────────────────────────────────
+
+class ImageAnalyzer:
+
+    def __init__(self, model_id: str = MODEL_ID):
+        """Remember which model to use; nothing is loaded at construction time."""
+        # Model and processor start out as None and are filled in on first use.
+        self._processor = None
+        self._model     = None
+        self.model_id   = model_id
+
+    def _load_model(self):
+        """Load the model and its processor on first use; later calls are no-ops.
+
+        Both objects are built into locals and only stored on ``self`` once both
+        loads have succeeded. If loading the processor fails, the analyzer is not
+        left half-initialised (model set, processor ``None``), so the next call
+        retries instead of returning early.
+        """
+        if self._model is not None and self._processor is not None:
+            return
+
+        print(f"[ImageAnalyzer] Cargando modelo {self.model_id}...")
+        print(f"[ImageAnalyzer] Cache: {CACHE_DIR}")
+
+        model = Qwen3VLForConditionalGeneration.from_pretrained(
+            self.model_id,
+            torch_dtype=torch.bfloat16,
+            device_map="cpu",
+            cache_dir=CACHE_DIR,
+        )
+        processor = AutoProcessor.from_pretrained(
+            self.model_id,
+            cache_dir=CACHE_DIR,
+        )
+        self._model, self._processor = model, processor
+        print("[ImageAnalyzer] Modelo cargado.")
+
+    # ── Opción 3: Resume — obtener archivos ya analizados en MongoDB ───────────
+
+    @staticmethod
+    def get_already_analyzed(mongo_url: str = None, db_name: str = None) -> set[str]:
+        """Return the file names already stored in the MongoDB 'imagenes' collection.
+
+        Args:
+            mongo_url: connection string; falls back to the MONGO_URL environment
+                variable, then to "mongodb://localhost:27017".
+            db_name: database name; falls back to DB_NAME, then "FLUJOS_DATOS".
+
+        Returns:
+            The set of 'archivo' values found, or an empty set when MongoDB (or
+            pymongo) is unavailable, in which case every image gets analysed.
+        """
+        try:
+            from pymongo import MongoClient
+            url    = mongo_url or os.getenv("MONGO_URL", "mongodb://localhost:27017")
+            dbname = db_name   or os.getenv("DB_NAME",   "FLUJOS_DATOS")
+            # The context manager closes the client even when ping/find raises.
+            with MongoClient(url, serverSelectionTimeoutMS=3000) as client:
+                client.admin.command("ping")
+                cursor = client[dbname]["imagenes"].find({}, {"archivo": 1, "_id": 0})
+                # Skip documents without 'archivo': a KeyError here would land in
+                # the except below and throw away the whole resume set.
+                done = {doc["archivo"] for doc in cursor if doc.get("archivo")}
+            print(f"[ImageAnalyzer] Resume: {len(done)} imágenes ya analizadas en MongoDB")
+            return done
+        except Exception as e:
+            # Best effort: resuming is an optimisation, never a hard requirement.
+            print(f"[ImageAnalyzer] Resume: MongoDB no disponible ({e}) — se analizarán todas")
+            return set()
+
+    # ── Opción 4: Priorizar imágenes cuyos artículos existen en MongoDB ────────
+
+    @staticmethod
+    def get_known_article_titles(mongo_url: str = None, db_name: str = None) -> set[str]:
+        """Return lower-cased 'titulo' and 'subtema' values from the 'wikipedia' collection.
+
+        Args:
+            mongo_url: connection string; falls back to the MONGO_URL environment
+                variable, then to "mongodb://localhost:27017".
+            db_name: database name; falls back to DB_NAME, then "FLUJOS_DATOS".
+
+        Returns:
+            The set of known titles, or an empty set when MongoDB (or pymongo) is
+            unavailable; the reason is printed instead of being swallowed.
+        """
+        try:
+            from pymongo import MongoClient
+            url    = mongo_url or os.getenv("MONGO_URL", "mongodb://localhost:27017")
+            dbname = db_name   or os.getenv("DB_NAME",   "FLUJOS_DATOS")
+            titles = set()
+            # The context manager closes the client even when the query raises.
+            with MongoClient(url, serverSelectionTimeoutMS=3000) as client:
+                cursor = client[dbname]["wikipedia"].find({}, {"titulo": 1, "subtema": 1, "_id": 0})
+                for doc in cursor:
+                    for field in ("titulo", "subtema"):
+                        value = doc.get(field)
+                        # Non-string values have no .lower(); ignore them.
+                        if value and isinstance(value, str):
+                            titles.add(value.lower())
+            print(f"[ImageAnalyzer] Priorización: {len(titles)} títulos conocidos en MongoDB")
+            return titles
+        except Exception as e:
+            # Best effort: without titles every image simply gets the same priority.
+            print(f"[ImageAnalyzer] Priorización: MongoDB no disponible ({e}) — sin priorización")
+            return set()
+
+    @staticmethod
+    def _priority_score(img_path: Path, known_titles: set[str]) -> int:
+        """Return the sort key for an image: 0 (process first) or 1.
+
+        The image's parent folder name (lower-cased, underscores turned into
+        spaces) is compared with every known title; a substring match in either
+        direction yields 0.
+
+        Empty strings are never allowed to match: ``"" in anything`` is always
+        true, so an image without a parent folder name (or an empty title) would
+        otherwise be promoted unconditionally.
+        """
+        stem = img_path.parent.name.lower().replace("_", " ")
+        if not stem:
+            return 1
+        return 0 if any(t and (stem in t or t in stem) for t in known_titles) else 1
+
+    # ── Helpers ────────────────────────────────────────────────────────────────
+
+    def _parse_json_response(self, raw: str) -> dict:
+        """Extract the first JSON object embedded in the model's reply.
+
+        Every ``{`` is tried in order as the start of a JSON value and the first
+        one that decodes to a dict is returned. Unlike a greedy "first ``{`` to
+        last ``}``" match, this tolerates text (even text containing braces)
+        after the object, as well as markdown fences around it.
+
+        NOTE: if the outer object is truncated but contains a complete nested
+        object, that nested object is what gets returned. The prompt asks for a
+        flat object, so this is not expected in practice.
+
+        Raises:
+            ValueError: when no JSON object can be decoded from *raw*.
+        """
+        raw = raw.strip()
+        decoder = json.JSONDecoder()
+        start = raw.find("{")
+        while start != -1:
+            try:
+                candidate, _ = decoder.raw_decode(raw, start)
+            except json.JSONDecodeError:
+                candidate = None
+            if isinstance(candidate, dict):
+                return candidate
+            start = raw.find("{", start + 1)
+        raise ValueError(f"No se encontró JSON válido:\n{raw[:300]}")
+
+    def _build_result(self, img_path: Path, parsed: dict) -> dict:
+        """Turn the model's parsed JSON into the document stored for an image.
+
+        The model output is not trusted to follow the schema: a ``null`` or
+        non-string field falls back to its default instead of crashing on
+        ``.lower()``, a bare string given for a list field becomes a one-element
+        list, and non-string list items are dropped.
+
+        Args:
+            img_path: path of the analysed image.
+            parsed: dict decoded from the model's reply.
+
+        Returns:
+            A dict with archivo, image_path, tema, subtema, texto, keywords,
+            entidades, idioma, source_type, fecha and modelo_usado.
+        """
+        def text(key: str, default: str = "") -> str:
+            value = parsed.get(key, default)
+            return value if isinstance(value, str) else default
+
+        def strings(key: str) -> list:
+            value = parsed.get(key)
+            if isinstance(value, str):
+                value = [value]
+            elif not isinstance(value, (list, tuple)):
+                return []
+            return [item for item in value if isinstance(item, str)]
+
+        return {
+            "archivo":      img_path.name,
+            "image_path":   str(img_path.resolve()),
+            "tema":         text("tema", "sin_clasificar").lower(),
+            "subtema":      text("subtema").lower(),
+            "texto":        text("descripcion"),
+            "keywords":     [k.lower().strip() for k in strings("keywords")],
+            "entidades":    strings("entidades"),
+            "idioma":       text("idioma_detectado", "es"),
+            "source_type":  "imagen",
+            "fecha":        datetime.now().strftime("%Y-%m-%d"),
+            "modelo_usado": self.model_id,
+        }
+
+    # ── Análisis de una imagen (individual) ───────────────────────────────────
+
+    def analyze(self, image_path: str, extra_context: str = "") -> dict:
+        """Analyse a single image and return its result document.
+
+        Args:
+            image_path: path of the image file.
+            extra_context: optional text prepended to the prompt.
+
+        Returns:
+            The dict produced by ``_build_result`` for this image.
+
+        Raises:
+            FileNotFoundError: when *image_path* does not exist.
+            ValueError: when the model's reply contains no JSON object.
+        """
+        if not os.path.exists(image_path):
+            raise FileNotFoundError(f"Imagen no encontrada: {image_path}")
+
+        self._load_model()
+        prompt = (f"Contexto adicional: {extra_context}\n\n" + KEYWORD_PROMPT) if extra_context else KEYWORD_PROMPT
+        # convert() returns an independent in-memory image, so the file handle can
+        # be released immediately instead of waiting for garbage collection.
+        with Image.open(image_path) as source:
+            image = source.convert("RGB")
+
+        messages = [{"role": "user", "content": [
+            {"type": "image", "image": image},
+            {"type": "text",  "text": prompt},
+        ]}]
+
+        inputs = self._processor.apply_chat_template(
+            messages, tokenize=True, add_generation_prompt=True,
+            return_dict=True, return_tensors="pt",
+        )
+        inputs = {k: v.to(self._model.device) for k, v in inputs.items()}
+
+        print(f"  → Analizando: {Path(image_path).name}")
+        with torch.no_grad():
+            generated_ids = self._model.generate(**inputs, max_new_tokens=512, do_sample=False)
+
+        # Keep only the newly generated tokens (drop the echoed prompt).
+        trimmed = [out[len(inp):] for inp, out in zip(inputs["input_ids"], generated_ids)]
+        raw     = self._processor.batch_decode(trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        return self._build_result(Path(image_path), self._parse_json_response(raw))
+
+    # ── Opción 5: Batch inference ──────────────────────────────────────────────
+
+    def analyze_batch(self, image_paths: list[str], extra_context: str = "") -> list[dict]:
+        """Analyse several images with a single ``generate`` call.
+
+        The processor builds the whole batch (token ids, attention mask AND the
+        image tensors), so the vision inputs reach the model. Hand-padding only
+        ``input_ids``/``attention_mask`` and passing just those two to
+        ``generate`` would leave the model with image placeholder tokens but no
+        pixel data.
+
+        Args:
+            image_paths: paths of the images in this batch.
+            extra_context: optional text prepended to the prompt.
+
+        Returns:
+            One dict per image that could be opened: either a full result or an
+            error record (``archivo``, ``error``, ``source_type``, ``fecha``) when
+            the reply could not be parsed. Images that fail to open are reported
+            on stdout and omitted.
+        """
+        self._load_model()
+        prompt = (f"Contexto adicional: {extra_context}\n\n" + KEYWORD_PROMPT) if extra_context else KEYWORD_PROMPT
+
+        batch_messages = []
+        valid_paths    = []
+        for path in image_paths:
+            try:
+                # convert() copies the pixels, so the file handle is released here.
+                with Image.open(path) as source:
+                    img = source.convert("RGB")
+            except Exception as e:
+                print(f"  ✗ Error abriendo {path}: {e}")
+                continue
+            batch_messages.append([{"role": "user", "content": [
+                {"type": "image", "image": img},
+                {"type": "text",  "text": prompt},
+            ]}])
+            valid_paths.append(Path(path))
+
+        if not batch_messages:
+            return []
+
+        # Generation continues from the last position of each row, so the padding
+        # must sit on the left.
+        self._processor.tokenizer.padding_side = "left"
+        inputs = self._processor.apply_chat_template(
+            batch_messages, tokenize=True, add_generation_prompt=True,
+            return_dict=True, return_tensors="pt", padding=True,
+        )
+        inputs = {k: v.to(self._model.device) for k, v in inputs.items()}
+        # All rows share the padded prompt length; generated tokens start after it.
+        prompt_len = inputs["input_ids"].shape[1]
+
+        with torch.no_grad():
+            generated = self._model.generate(**inputs, max_new_tokens=512, do_sample=False)
+
+        results = []
+        for img_path, out_ids in zip(valid_paths, generated):
+            raw = self._processor.decode(out_ids[prompt_len:], skip_special_tokens=True)
+            try:
+                parsed = self._parse_json_response(raw)
+                results.append(self._build_result(img_path, parsed))
+                print(f"  ✓ {img_path.name} → tema={parsed.get('tema','?')}")
+            except Exception as e:
+                # One unparsable reply must not abort the rest of the batch.
+                print(f"  ✗ {img_path.name}: {e}")
+                results.append({
+                    "archivo": img_path.name, "error": str(e),
+                    "source_type": "imagen", "fecha": datetime.now().strftime("%Y-%m-%d"),
+                })
+        return results
+
+    # ── Análisis de carpeta con todas las mejoras ──────────────────────────────
+
+    def analyze_folder(
+        self,
+        folder_path: str,
+        extra_context: str  = "",
+        resume: bool        = True,
+        batch_size: int     = DEFAULT_BATCH_SIZE,
+        prioritize: bool    = True,
+    ) -> list[dict]:
+        """Analyse every supported image found (recursively) under *folder_path*.
+
+        Args:
+            folder_path:   folder to scan.
+            extra_context: optional text prepended to the prompt of every image.
+            resume:        if True, skip images whose file name is already in MongoDB (option 3)
+            prioritize:    if True, process first the images whose parent folder
+                           matches an article already in MongoDB (option 4)
+            batch_size:    images per model call (option 5). Default: 4
+
+        Returns:
+            The concatenated results of every batch.
+
+        Raises:
+            FileNotFoundError: when *folder_path* does not exist.
+            ValueError: when *batch_size* is smaller than 1.
+        """
+        folder = Path(folder_path)
+        if not folder.exists():
+            raise FileNotFoundError(f"Carpeta no encontrada: {folder_path}")
+        # A zero step makes range() raise an obscure error and a negative one
+        # silently processes nothing; reject both up front.
+        if batch_size < 1:
+            raise ValueError(f"batch_size debe ser >= 1 (recibido: {batch_size})")
+
+        images = sorted(
+            p for p in folder.rglob("*")
+            if p.is_file() and p.suffix.lower() in SUPPORTED_EXTENSIONS
+        )
+        print(f"\n[ImageAnalyzer] {len(images)} imágenes encontradas en {folder_path}")
+
+        # Option 3: resume — drop images that were already analysed
+        if resume:
+            done   = self.get_already_analyzed()
+            before = len(images)
+            images = [p for p in images if p.name not in done]
+            print(f"[ImageAnalyzer] Resume: {before - len(images)} saltadas, {len(images)} pendientes")
+
+        if not images:
+            print("[ImageAnalyzer] Nada que analizar.")
+            return []
+
+        # Option 4: prioritise by known articles (stable sort keeps path order within a priority)
+        if prioritize:
+            known  = self.get_known_article_titles()
+            images = sorted(images, key=lambda p: self._priority_score(p, known))
+            print("[ImageAnalyzer] Priorización activada")
+
+        # Option 5: batch inference
+        results = []
+        total   = len(images)
+        for start in range(0, total, batch_size):
+            batch = images[start:start + batch_size]
+            end   = min(start + batch_size, total)
+            print(f"\n  [Batch {start//batch_size + 1}] imágenes {start+1}-{end}/{total}")
+            results.extend(self.analyze_batch([str(p) for p in batch], extra_context))
+
+        ok = sum(1 for r in results if "error" not in r)
+        print(f"\n[ImageAnalyzer] Completado: {ok}/{total} OK\n")
+        return results
+
+    @staticmethod
+    def save_json(results: list[dict], output_path: str):
+        """Dump *results* to *output_path* as indented UTF-8 JSON and report the record count."""
+        target = Path(output_path)
+        with target.open("w", encoding="utf-8") as handle:
+            json.dump(results, handle, ensure_ascii=False, indent=2)
+        count = len(results)
+        print(f"[ImageAnalyzer] Guardado: {output_path} ({count} registros)")
--- a/FLUJOS_DATOS/IMAGENES/image_comparator.py
+++ b/FLUJOS_DATOS/IMAGENES/image_comparator.py
@ -0,0 +1,158 @@
+"""
+image_comparator.py
+-------------------
+Compara keywords de imágenes con documentos de texto (noticias, wikipedia, torrents)
+usando similitud TF-IDF coseno.
+
+Produce documentos para la colección 'comparaciones' de MongoDB,
+con la misma estructura que las comparaciones texto-texto ya existentes:
+  { noticia1, noticia2, porcentaje_similitud }
+
+Ampliado con campos opcionales: source1_type, source2_type (para saber qué se comparó).
+
+Uso:
+    comp = ImageComparator()
+    resultados = comp.compare_image_vs_collection(imagen_doc, lista_docs_texto)
+    top = comp.top_n(resultados, n=10)
+"""
+
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+import numpy as np
+
+
+# ── Clase principal ─────────────────────────────────────────────────────────────
+
+class ImageComparator:
+
+    def __init__(self, threshold: float = 5.0):
+        """Create the comparator and its TF-IDF vectorizer (word unigrams and bigrams).
+
+        Args:
+            threshold: minimum similarity percentage (0-100) a pair needs to be kept
+        """
+        tfidf_options = {
+            "analyzer": "word",
+            "ngram_range": (1, 2),
+            "min_df": 1,
+            "strip_accents": "unicode",
+            "lowercase": True,
+        }
+        self.vectorizer = TfidfVectorizer(**tfidf_options)
+        self.threshold = threshold
+
+    # ── Conversión de documentos a texto ──────────────────────────────────────
+
+    @staticmethod
+    def doc_to_text(doc: dict) -> str:
+        """Flatten the relevant fields of a document into one string for TF-IDF.
+
+        Fields used, in order: 'keywords' (list, repeated three times to give it
+        extra weight), then 'tema', 'subtema' and 'texto' (non-empty strings
+        only), then 'entidades'.
+
+        Non-string items in 'keywords'/'entidades' are skipped so the final join
+        cannot fail, and a bare string in 'entidades' counts as a single entity
+        instead of being split into characters.
+        """
+        parts = []
+        # image keywords (list) — the most informative field, repeated for weight
+        keywords = doc.get("keywords")
+        if isinstance(keywords, list):
+            kws = [kw for kw in keywords if isinstance(kw, str)]
+            parts.extend(kws * 3)
+        # standard text fields
+        for field in ("tema", "subtema", "texto"):
+            val = doc.get(field)
+            if val and isinstance(val, str):
+                parts.append(val)
+        # entities
+        entidades = doc.get("entidades")
+        if isinstance(entidades, str):
+            entidades = [entidades]
+        if isinstance(entidades, (list, tuple, set)):
+            parts.extend(e for e in entidades if isinstance(e, str))
+        return " ".join(parts)
+
+    # ── Comparación imagen vs lista de documentos ─────────────────────────────
+
+    def compare_image_vs_collection(
+        self,
+        image_doc: dict,
+        text_docs: list[dict],
+    ) -> list[dict]:
+        """Compare one image against a list of text documents.
+
+        The vectorizer is fitted on the image plus all text documents on every
+        call, then the image row is compared with each text row by cosine
+        similarity.
+
+        Returns:
+            List of dicts sorted by porcentaje_similitud (descending), keeping only
+            pairs at or above ``self.threshold``. Empty when *text_docs* is empty
+            or the vectorizer rejects the corpus (ValueError).
+        """
+        if not text_docs:
+            return []
+
+        texts = [self.doc_to_text(d) for d in [image_doc] + text_docs]
+
+        try:
+            matrix = self.vectorizer.fit_transform(texts)
+        except ValueError:
+            return []
+
+        # Similarity of the image (row 0) against every other row
+        sims = cosine_similarity(matrix[0:1], matrix[1:]).flatten()
+
+        image_name = image_doc.get("archivo", "imagen")
+        image_tema = image_doc.get("tema", "")
+
+        comparaciones = []
+        for doc, sim in zip(text_docs, sims):
+            pct = round(float(sim) * 100, 2)
+            if pct < self.threshold:
+                continue
+
+            # Identifier of the text document: file name, else _id, else title.
+            # Without the last fallbacks, documents lacking both 'archivo' and
+            # '_id' would all be identified as "" and be merged as duplicates.
+            doc_id = doc.get("archivo") or doc.get("_id") or doc.get("titulo") or ""
+
+            comparaciones.append({
+                # Fields compatible with the existing 'comparaciones' collection
+                "noticia1":             image_name,
+                "noticia2":             doc_id if isinstance(doc_id, str) else str(doc_id),
+                "porcentaje_similitud": pct,
+                # Extended fields (optional — they do not break existing queries)
+                "source1_type":         "imagen",
+                "source2_type":         doc.get("source_type", "texto"),
+                "tema_imagen":          image_tema,
+                "tema_doc":             doc.get("tema", ""),
+            })
+
+        comparaciones.sort(key=lambda x: x["porcentaje_similitud"], reverse=True)
+        return comparaciones
+
+    # ── Comparación muchas imágenes vs colección ───────────────────────────────
+
+    def compare_batch(
+        self,
+        image_docs: list[dict],
+        text_docs: list[dict],
+    ) -> list[dict]:
+        """Compare several images against one collection of documents.
+
+        Returns:
+            Every pair at or above the threshold, sorted by similarity
+            (descending); for a repeated (noticia1, noticia2) pair only the first
+            occurrence is kept.
+        """
+        first_by_pair = {}
+        for img_doc in image_docs:
+            for row in self.compare_image_vs_collection(img_doc, text_docs):
+                # setdefault keeps the earliest row for a pair and ignores later ones.
+                first_by_pair.setdefault((row["noticia1"], row["noticia2"]), row)
+
+        return sorted(
+            first_by_pair.values(),
+            key=lambda row: row["porcentaje_similitud"],
+            reverse=True,
+        )
+
+    # ── Helpers ────────────────────────────────────────────────────────────────
+
+    @staticmethod
+    def top_n(comparaciones: list[dict], n: int = 20) -> list[dict]:
+        """Return the *n* rows with the highest porcentaje_similitud, best first."""
+        ranked = list(comparaciones)
+        ranked.sort(key=lambda row: row["porcentaje_similitud"], reverse=True)
+        return ranked[:n]
+
+    @staticmethod
+    def stats(comparaciones: list[dict]) -> dict:
+        """Summarise similarity percentages: count, mean, max, min and how many reach 50 / 70."""
+        if not comparaciones:
+            return {"total": 0}
+
+        values = [row["porcentaje_similitud"] for row in comparaciones]
+
+        def at_least(limit):
+            return sum(1 for v in values if v >= limit)
+
+        summary = {"total": len(values)}
+        summary["media"] = round(np.mean(values), 2)
+        summary["max"] = round(max(values), 2)
+        summary["min"] = round(min(values), 2)
+        summary["sobre_50"] = at_least(50)
+        summary["sobre_70"] = at_least(70)
+        return summary
--- a/FLUJOS_DATOS/IMAGENES/mongo_helper.py
+++ b/FLUJOS_DATOS/IMAGENES/mongo_helper.py
@ -0,0 +1,172 @@
+"""
+mongo_helper.py
+---------------
+Operaciones MongoDB para la colección 'imagenes' y extensión de 'comparaciones'.
+Compatible con la estructura existente de FLUJOS_DATOS.
+
+Uso:
+    mongo = MongoHelper()
+    mongo.upsert_imagenes(lista_docs)
+    mongo.insert_comparaciones(lista_comparaciones)
+    docs = mongo.get_collection_sample("noticias", limit=100)
+"""
+
+import os
+from pymongo import MongoClient, UpdateOne
+from pymongo.errors import ConnectionFailure
+
+# Connection string and database name; each one can be overridden through the
+# environment variable of the same name.
+MONGO_URL = os.getenv("MONGO_URL", "mongodb://localhost:27017")
+DB_NAME   = os.getenv("DB_NAME", "FLUJOS_DATOS")
+
+
+class MongoHelper:
+    def __init__(self, mongo_url: str = MONGO_URL, db_name: str = DB_NAME):
+        """Store the connection settings; no connection is opened here."""
+        # Client and database handles are created by connect().
+        self._client = None
+        self._db = None
+        self.mongo_url, self.db_name = mongo_url, db_name
+
+    # ── Conexión ───────────────────────────────────────────────────────────────
+
+    def connect(self):
+        """Return the database handle, opening and pinging the connection on first use.
+
+        The client is stored on ``self`` only after the ping succeeds. If the
+        ping fails, the client is closed and the exception propagates, so a later
+        call retries instead of returning a ``None`` database from a half-open
+        state.
+
+        Raises:
+            Whatever the ping raises when the server cannot be reached
+            (``is_available`` relies on ``ConnectionFailure``).
+        """
+        if self._client is None:
+            client = MongoClient(self.mongo_url, serverSelectionTimeoutMS=5000)
+            try:
+                client.admin.command("ping")
+            except Exception:
+                client.close()
+                raise
+            self._client = client
+            self._db = client[self.db_name]
+            # Drop any "user:password@" prefix so credentials never reach the logs.
+            safe_url = self.mongo_url.rsplit("@", 1)[-1]
+            print(f"[MongoDB] Conectado a {safe_url} / {self.db_name}")
+        return self._db
+
+    def disconnect(self):
+        """Close the client, if there is one, and forget the cached handles."""
+        client = self._client
+        if not client:
+            return
+        client.close()
+        self._client = self._db = None
+
+    def is_available(self) -> bool:
+        """Return True when connect() succeeds and False when it raises ConnectionFailure."""
+        try:
+            self.connect()
+        except ConnectionFailure:
+            return False
+        else:
+            return True
+
+    # ── Colección IMAGENES ─────────────────────────────────────────────────────
+
+    def upsert_imagenes(self, docs: list[dict]) -> dict:
+        """
+        Insert or update documents in the 'imagenes' collection.
+        'archivo' is the unique key (upsert by file name); documents carrying an
+        'error' key are skipped.
+
+        Returns: {'inserted': N, 'updated': N}
+        """
+        collection = self.connect()["imagenes"]
+        collection.create_index("archivo", unique=True)
+
+        ops = []
+        for doc in docs:
+            if "error" in doc:
+                continue
+            ops.append(UpdateOne({"archivo": doc["archivo"]}, {"$set": doc}, upsert=True))
+
+        if not ops:
+            return {"inserted": 0, "updated": 0}
+
+        outcome = collection.bulk_write(ops)
+        stats = {
+            "inserted": outcome.upserted_count,
+            "updated":  outcome.modified_count,
+        }
+        print(f"[MongoDB] imagenes → {stats}")
+        return stats
+
+    def get_imagenes(self, tema: str = None, limit: int = 500) -> list[dict]:
+        """Fetch documents from the 'imagenes' collection.
+
+        Args:
+            tema: optional topic; matched case-insensitively as a literal
+                substring of the 'tema' field. The text is escaped so regex
+                metacharacters in it cannot change the query or trigger a
+                pathological pattern.
+            limit: maximum number of documents returned.
+
+        Returns:
+            The matching documents without their '_id'.
+        """
+        import re
+
+        db = self.connect()
+        query = {"tema": {"$regex": re.escape(tema), "$options": "i"}} if tema else {}
+        return list(db["imagenes"].find(query, {"_id": 0}).limit(limit))
+
+    # ── Colección COMPARACIONES ────────────────────────────────────────────────
+
+    def insert_comparaciones(self, comparaciones: list[dict], replace_existing: bool = False) -> int:
+        """
+        Upsert image-text comparisons into the 'comparaciones' collection, keyed
+        by (noticia1, noticia2) so the same pair is not stored twice. Existing
+        pairs are overwritten only when replace_existing is True.
+
+        Returns: number of newly inserted documents
+        """
+        collection = self.connect()["comparaciones"]
+
+        # "$setOnInsert" leaves an already stored pair untouched.
+        operator = "$set" if replace_existing else "$setOnInsert"
+        ops = [
+            UpdateOne(
+                {"noticia1": comp["noticia1"], "noticia2": comp["noticia2"]},
+                {operator: comp},
+                upsert=True,
+            )
+            for comp in comparaciones
+        ]
+        if not ops:
+            return 0
+
+        outcome = collection.bulk_write(ops)
+        inserted = outcome.upserted_count
+        print(f"[MongoDB] comparaciones → {inserted} nuevas, {outcome.modified_count} actualizadas")
+        return inserted
+
+    # ── Leer colecciones existentes (para comparar) ────────────────────────────
+
+    def get_collection_sample(
+        self,
+        collection_name: str,
+        tema: str = None,
+        limit: int = 200,
+        fields: list[str] = None,
+    ) -> list[dict]:
+        """
+        Read up to *limit* documents from an existing collection.
+
+        Args:
+            collection_name: collection to read.
+            tema: optional topic; matched case-insensitively as a literal
+                substring of 'tema', 'subtema' or 'texto'. The text is escaped so
+                regex metacharacters in it cannot change the query or trigger a
+                pathological pattern.
+            limit: maximum number of documents returned.
+            fields: optional field names to include in the projection.
+
+        Returns:
+            The documents without '_id'; each one gets 'source_type' set to the
+            collection name when it does not carry that key already.
+        """
+        import re
+
+        db = self.connect()
+        query = {}
+        if tema:
+            pattern = {"$regex": re.escape(tema), "$options": "i"}
+            query["$or"] = [
+                {"tema":    pattern},
+                {"subtema": pattern},
+                {"texto":   pattern},
+            ]
+
+        projection = {"_id": 0}
+        for f in fields or ():
+            projection[f] = 1
+
+        docs = list(db[collection_name].find(query, projection).limit(limit))
+        for doc in docs:
+            doc.setdefault("source_type", collection_name)
+        return docs
+
+    def get_all_text_docs(self, tema: str = None, limit_per_collection: int = 200) -> list[dict]:
+        """
+        Gather documents from noticias + wikipedia + torrents into one list.
+        A collection that cannot be read is reported and skipped.
+        """
+        combined = []
+        for col in ("noticias", "wikipedia", "torrents"):
+            try:
+                docs = self.get_collection_sample(col, tema=tema, limit=limit_per_collection)
+            except Exception as e:
+                print(f"[MongoDB] WARNING: no se pudo leer '{col}': {e}")
+                continue
+            combined.extend(docs)
+            print(f"[MongoDB] {col}: {len(docs)} docs cargados")
+        return combined
+
+    # ── Info de la BD ──────────────────────────────────────────────────────────
+
+    def collection_stats(self) -> dict:
+        """Map every collection name in the database to its document count."""
+        db = self.connect()
+        return {
+            col_name: db[col_name].count_documents({})
+            for col_name in db.list_collection_names()
+        }
--- a/FLUJOS_DATOS/IMAGENES/pipeline_imagenes.py
+++ b/FLUJOS_DATOS/IMAGENES/pipeline_imagenes.py
@ -0,0 +1,134 @@
+"""
+pipeline_imagenes.py
+--------------------
+Pipeline end-to-end:
+  1. Scraping de imágenes Wikipedia por temas de FLUJOS
+  2. Análisis con Qwen3-VL-8B (keywords + metadata)
+  3. Comparación con corpus texto (noticias/wikipedia/torrents)
+  4. Guardado en MongoDB
+
+Ejecutar:
+    python pipeline_imagenes.py --scrape --analizar --mongo
+    python pipeline_imagenes.py --scrape --tema "cambio climático" --max 10
+    python pipeline_imagenes.py --analizar --carpeta ./output/wiki_images/
+    python pipeline_imagenes.py --solo-json   # sin MongoDB
+"""
+
+import argparse
+import json
+import sys
+from datetime import datetime
+from pathlib import Path
+
+from image_analyzer import ImageAnalyzer
+from image_comparator import ImageComparator
+from mongo_helper import MongoHelper
+from wikipedia_image_scraper import WikipediaImageScraper, TEMAS_FLUJOS
+
+# Output locations, both relative to the directory that contains this script.
+OUTPUT_DIR   = Path(__file__).parent / "output"
+IMAGES_DIR   = OUTPUT_DIR / "wiki_images"
+
+
+def fase_scraping(temas: list[str], max_per_tema: int, lang: str, usar_mongo: bool) -> list[dict]:
+    """Phase 1: download Wikipedia images for the given topics.
+
+    Args:
+        temas: topics to scrape; a single topic goes through ``scrape_tema``,
+            several through ``scrape_multitema``.
+        max_per_tema: maximum number of images per topic.
+        lang: Wikipedia language code handed to the scraper.
+        usar_mongo: when True, the metadata is also saved through the scraper's
+            ``save_to_mongo``.
+
+    Returns:
+        The metadata returned by the scraper.
+    """
+    print("\n" + "="*60)
+    print("  FASE 1 — Scraping de imágenes Wikipedia")
+    print("="*60)
+
+    scraper = WikipediaImageScraper(output_dir=IMAGES_DIR, lang=lang)
+
+    if len(temas) == 1:
+        metadata = scraper.scrape_tema(temas[0], max_images=max_per_tema)
+    else:
+        metadata = scraper.scrape_multitema(temas, max_per_tema=max_per_tema)
+
+    if metadata:
+        # The return value of save_metadata is not needed here.
+        scraper.save_metadata(metadata)
+        print(f"\n  Total imágenes descargadas: {len(metadata)}")
+        if usar_mongo:
+            scraper.save_to_mongo(metadata)
+
+    return metadata
+
+
+def fase_analisis(carpeta: str, usar_mongo: bool, threshold: float) -> tuple[list, list]:
+    """Phase 2: analyse the images in *carpeta* and compare them with the text corpus.
+
+    Analysis results are always written to a timestamped JSON file in
+    OUTPUT_DIR. When MongoDB is requested and reachable, the analysed images are
+    stored even if the text corpus is empty (so a later run can resume), and the
+    connection is closed on every path.
+
+    Args:
+        carpeta: folder with the images to analyse.
+        usar_mongo: whether MongoDB may be used for the corpus and for storage.
+        threshold: minimum similarity percentage for a comparison to be kept.
+
+    Returns:
+        ``(image_docs, comparaciones)``; both empty when nothing was analysed.
+    """
+    print("\n" + "="*60)
+    print("  FASE 2 — Análisis con Qwen3-VL-8B")
+    print("="*60)
+
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    OUTPUT_DIR.mkdir(exist_ok=True)
+
+    analyzer = ImageAnalyzer()
+    image_docs = analyzer.analyze_folder(carpeta)
+
+    if not image_docs:
+        print("  No se analizaron imágenes.")
+        return [], []
+
+    json_imagenes = OUTPUT_DIR / f"imagenes_{timestamp}.json"
+    ImageAnalyzer.save_json(image_docs, str(json_imagenes))
+
+    # Error records are kept in the JSON file but never compared or stored.
+    valid = [d for d in image_docs if "error" not in d]
+    comparaciones = []
+
+    mongo = MongoHelper()
+    try:
+        # Load the text corpus to compare against
+        print("\n  Cargando corpus de texto para comparar...")
+        mongo_ok = usar_mongo and mongo.is_available()
+        if mongo_ok:
+            text_docs = mongo.get_all_text_docs(limit_per_collection=300)
+        else:
+            print("  MongoDB no disponible — comparación omitida")
+            text_docs = []
+
+        if text_docs:
+            comparador = ImageComparator(threshold=threshold)
+            comparaciones = comparador.compare_batch(valid, text_docs)
+
+            stats = comparador.stats(comparaciones)
+            print(f"  Similitud media: {stats.get('media', 0)}%  |  max: {stats.get('max', 0)}%")
+
+            json_comp = OUTPUT_DIR / f"comparaciones_{timestamp}.json"
+            with open(json_comp, "w", encoding="utf-8") as f:
+                json.dump(comparaciones, f, ensure_ascii=False, indent=2)
+            print(f"  Guardado: {json_comp}")
+
+        if mongo_ok:
+            # Store the images regardless of whether there was a corpus to compare with.
+            mongo.upsert_imagenes(valid)
+            if comparaciones:
+                mongo.insert_comparaciones(comparaciones)
+    finally:
+        mongo.disconnect()
+
+    return image_docs, comparaciones
+
+
+# Command-line entry point: parse the flags, then run the requested phases in order.
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Pipeline imágenes FLUJOS")
+
+    parser.add_argument("--scrape",    action="store_true", help="Ejecutar fase de scraping")
+    parser.add_argument("--analizar",  action="store_true", help="Ejecutar fase de análisis VLM")
+    parser.add_argument("--tema",      default=None,        help="Tema único (ej: 'cambio climático')")
+    parser.add_argument("--max",       type=int, default=20, help="Máx imágenes por tema (default: 20)")
+    parser.add_argument("--lang",      default="es",        help="Idioma Wikipedia: es|en")
+    parser.add_argument("--carpeta",   default=str(IMAGES_DIR), help="Carpeta para analizar")
+    parser.add_argument("--umbral",    type=float, default=5.0, help="Umbral similitud (default: 5.0)")
+    parser.add_argument("--mongo",     action="store_true", help="Guardar en MongoDB")
+    parser.add_argument("--solo-json", action="store_true", help="Solo JSON local, sin MongoDB")
+
+    args = parser.parse_args()
+    # --solo-json wins over --mongo when both are given.
+    usar_mongo = args.mongo and not args.solo_json
+
+    # With no phase selected there is nothing to do: show the help and exit.
+    if not args.scrape and not args.analizar:
+        parser.print_help()
+        sys.exit(0)
+
+    # A single --tema replaces the default topic list.
+    temas = [args.tema] if args.tema else TEMAS_FLUJOS
+
+    if args.scrape:
+        fase_scraping(temas, args.max, args.lang, usar_mongo)
+
+    if args.analizar:
+        carpeta = args.carpeta
+        # After scraping a single topic, analyse only that topic's sub-folder.
+        # NOTE(review): the slug presumably mirrors the folder name created by
+        # WikipediaImageScraper — confirm against the scraper.
+        if args.scrape and args.tema:
+            tema_slug = args.tema.lower().replace(" ", "_").replace("/", "-")[:40]
+            carpeta = str(IMAGES_DIR / tema_slug)
+        fase_analisis(carpeta, usar_mongo, args.umbral)
+
+    print("\n  Pipeline completado.")
--- a/FLUJOS_DATOS/IMAGENES/requirements_imagenes.txt
+++ b/FLUJOS_DATOS/IMAGENES/requirements_imagenes.txt
@ -0,0 +1,26 @@
+# Pipeline de imágenes FLUJOS — Qwen3-VL-8B
+# pip install -r requirements_imagenes.txt
+
+# HuggingFace / modelo Qwen3-VL
+transformers @ git+https://github.com/huggingface/transformers
+torch>=2.3.0
+torchvision>=0.18.0
+accelerate>=0.30.0
+huggingface_hub>=0.23.0
+qwen-vl-utils>=0.0.8
+
+# MongoDB
+pymongo>=4.6
+
+# ML / comparación
+scikit-learn>=1.3
+numpy>=1.24
+
+# Imágenes
+Pillow>=10.0
+
+# HTTP
+requests>=2.31
+
+# Utilidades
+python-dotenv>=1.0
--- a/FLUJOS_DATOS/IMAGENES/wikipedia_image_scraper.py
+++ b/FLUJOS_DATOS/IMAGENES/wikipedia_image_scraper.py
@ -0,0 +1,446 @@
+"""
+wikipedia_image_scraper.py
+--------------------------
+Descarga imágenes de artículos de Wikipedia por tema usando la Wikimedia API.
+Las guarda en una carpeta local y registra los metadatos en JSON / MongoDB.
+
+Flujo:
+  1. Busca artículos en Wikipedia por tema/keyword
+  2. Para cada artículo extrae las imágenes (Wikimedia API)
+  3. Filtra imágenes no relevantes (iconos, banderas, logos pequeños...)
+  4. Descarga las imágenes a la carpeta de destino
+  5. Guarda metadatos: título del artículo, tema, url, descripción, fecha
+  6. Opcional: guarda metadatos en MongoDB colección 'imagenes_wiki'
+
+Uso:
+    python wikipedia_image_scraper.py --tema "cambio climático" --max 30
+    python wikipedia_image_scraper.py --tema "geopolítica" --lang es --max 50
+    python wikipedia_image_scraper.py --temas temas.txt --max 20
+    python wikipedia_image_scraper.py --tema "climate change" --lang en --max 40
+    python wikipedia_image_scraper.py --flujos --max 20 --mongo
+
+Requisitos:
+    pip install requests Pillow pymongo python-dotenv
+"""
+
+import argparse
+import json
+import os
+import time
+from datetime import datetime
+from pathlib import Path
+from urllib.parse import quote, urlparse
+
+import requests
+from PIL import Image, UnidentifiedImageError
+
+# ── Configuración ──────────────────────────────────────────────────────────────
+
+# MediaWiki API endpoints: per-language Wikipedia for article search and
+# image lists, Wikimedia Commons for per-file information.
+WIKI_API_ES  = "https://es.wikipedia.org/w/api.php"
+WIKI_API_EN  = "https://en.wikipedia.org/w/api.php"
+WIKIMEDIA_API = "https://commons.wikimedia.org/w/api.php"
+
+# Default download root, located next to this file.
+OUTPUT_BASE  = Path(__file__).parent / "output" / "wiki_images"
+
+# Minimum size for an image to be considered relevant (pixels)
+MIN_WIDTH  = 200
+MIN_HEIGHT = 200
+MIN_BYTES  = 20_000   # 20 KB minimum
+
+# Accepted file extensions (compared against the lower-cased suffix)
+VALID_EXTENSIONS = {".jpg", ".jpeg", ".png", ".webp"}
+
+# File-name fragments to ignore (icons, flags, etc.). should_skip() tests
+# each one as a case-sensitive substring of the file name, which is why both
+# capitalisations are listed.
+SKIP_PATTERNS = [
+    "flag_", "Flag_", "icon", "Icon", "logo", "Logo",
+    "symbol", "Symbol", "coat_of_arms", "Coat_of_arms",
+    "commons-logo", "wiki", "Wiki", "question_mark",
+    "edit-", "nuvola", "Nuvola", "pictogram", "Pictogram",
+    "OOjs", "Ambox", "Portal-", "Disambig",
+]
+
+# Headers sent with every HTTP request made by this module; they identify
+# the project through the User-Agent string.
+HEADERS = {
+    "User-Agent": "FLUJOS-Project/1.0 (https://gitea.laenre.net/hacklab/FLUJOS; educational research)"
+}
+
+
+# ── Funciones de búsqueda Wikipedia ───────────────────────────────────────────
+
+def search_articles(tema: str, lang: str = "es", limit: int = 10) -> list[dict]:
+    """Search Wikipedia for articles about a topic.
+
+    Args:
+        tema: free-text search query.
+        lang: "en" selects English Wikipedia; any other value uses Spanish.
+        limit: maximum number of search hits to request.
+
+    Returns:
+        One dict per hit with the keys "title", "pageid" and "snippet"; the
+        snippet has its search-match highlight tags removed.
+
+    Raises:
+        requests.RequestException: on a network failure or an HTTP error
+            status.
+    """
+    endpoint = WIKI_API_ES
+    if lang == "en":
+        endpoint = WIKI_API_EN
+
+    query = {
+        "action": "query",
+        "list": "search",
+        "srsearch": tema,
+        "srlimit": limit,
+        "format": "json",
+        "srinfo": "totalhits",
+        "srprop": "snippet|titlesnippet",
+    }
+
+    response = requests.get(endpoint, params=query, headers=HEADERS, timeout=15)
+    response.raise_for_status()
+    hits = response.json().get("query", {}).get("search", [])
+
+    def _plain(snippet: str) -> str:
+        # Drop the highlight markup the search API wraps around matches.
+        return snippet.replace("<span class=\"searchmatch\">", "").replace("</span>", "")
+
+    return [
+        {
+            "title": hit["title"],
+            "pageid": hit["pageid"],
+            "snippet": _plain(hit.get("snippet", "")),
+        }
+        for hit in hits
+    ]
+
+
+def get_article_images(title: str, lang: str = "es", limit: int = 20) -> list[str]:
+    """List the image file titles used by a Wikipedia article.
+
+    Args:
+        title: article title to query.
+        lang: "en" selects English Wikipedia; any other value uses Spanish.
+        limit: maximum number of images to request (``imlimit``).
+
+    Returns:
+        The "title" of every image entry of every page in the response.
+
+    Raises:
+        requests.RequestException: on a network failure or an HTTP error
+            status.
+    """
+    endpoint = WIKI_API_ES
+    if lang == "en":
+        endpoint = WIKI_API_EN
+
+    query = {
+        "action": "query",
+        "titles": title,
+        "prop": "images",
+        "imlimit": limit,
+        "format": "json",
+    }
+
+    response = requests.get(endpoint, params=query, headers=HEADERS, timeout=15)
+    response.raise_for_status()
+    pages = response.json().get("query", {}).get("pages", {})
+
+    return [
+        image["title"]
+        for page in pages.values()
+        for image in page.get("images", [])
+    ]
+
+
+def get_image_info(file_title: str) -> dict | None:
+    """Fetch download URL, size and attribution data for one image file.
+
+    The file is looked up on Wikimedia Commons. A 1200px-wide rendition is
+    requested and its URL is preferred when the API returns one.
+
+    Args:
+        file_title: file page title, with either a Commons ("File:") or a
+            localized namespace prefix.
+
+    Returns:
+        A dict with the keys "url", "url_original", "width", "height",
+        "size_bytes", "descripcion", "autor", "licencia" and "fecha_orig",
+        or None when the response carries no image info.
+
+    Raises:
+        requests.RequestException: on a network failure or an HTTP error
+            status.
+    """
+    # Commons only knows the "File:" namespace; rewrite localized prefixes.
+    localized = next(
+        (p for p in ("Archivo:", "Fichero:", "Image:", "Imagen:") if file_title.startswith(p)),
+        None,
+    )
+    if localized is not None:
+        file_title = "File:" + file_title[len(localized):]
+
+    query = {
+        "action": "query",
+        "titles": file_title,
+        "prop": "imageinfo",
+        "iiprop": "url|size|extmetadata",
+        "iiurlwidth": 1200,
+        "format": "json",
+    }
+
+    response = requests.get(WIKIMEDIA_API, params=query, headers=HEADERS, timeout=15)
+    response.raise_for_status()
+    pages = response.json().get("query", {}).get("pages", {})
+
+    # Only the first page of the response is ever inspected.
+    first_page = next(iter(pages.values()), None)
+    if first_page is None:
+        return None
+
+    infos = first_page.get("imageinfo", [])
+    if not infos:
+        return None
+
+    info = infos[0]
+    ext_meta = info.get("extmetadata", {})
+
+    def _meta(field: str) -> str:
+        # extmetadata entries are dicts; the text lives under "value".
+        return ext_meta.get(field, {}).get("value", "")
+
+    return {
+        "url": info.get("thumburl") or info.get("url"),
+        "url_original": info.get("url"),
+        "width": info.get("width", 0),
+        "height": info.get("height", 0),
+        "size_bytes": info.get("size", 0),
+        "descripcion": _meta("ImageDescription"),
+        "autor": _meta("Artist"),
+        "licencia": _meta("LicenseShortName"),
+        "fecha_orig": _meta("DateTimeOriginal"),
+    }
+
+
+# ── Filtros ────────────────────────────────────────────────────────────────────
+
+def should_skip(file_title: str, img_info: dict) -> tuple[bool, str]:
+    """Decide whether an image should be discarded before download.
+
+    Checks run in this order: file extension, ignored name patterns, pixel
+    dimensions, byte size. The first failing check decides the result.
+
+    Args:
+        file_title: file page title; only its final path component is used.
+        img_info: dict read for "width", "height" and "size_bytes"
+            (missing keys count as 0).
+
+    Returns:
+        (True, reason) when the image must be skipped, (False, "") otherwise.
+    """
+    filename = Path(file_title).name
+    ext = Path(filename).suffix.lower()
+
+    if ext not in VALID_EXTENSIONS:
+        return True, f"extensión no válida: {ext}"
+
+    # First ignore-pattern contained in the name (case-sensitive), if any.
+    matched = next((p for p in SKIP_PATTERNS if p in filename), None)
+    if matched is not None:
+        return True, f"patrón ignorado: {matched}"
+
+    if img_info.get("width", 0) < MIN_WIDTH or img_info.get("height", 0) < MIN_HEIGHT:
+        dims = f"{img_info.get('width')}x{img_info.get('height')}"
+        return True, f"demasiado pequeña: {dims}"
+
+    if img_info.get("size_bytes", 0) < MIN_BYTES:
+        return True, f"archivo demasiado pequeño: {img_info.get('size_bytes')} bytes"
+
+    return False, ""
+
+
+# ── Descarga ───────────────────────────────────────────────────────────────────
+
+def download_image(url: str, dest_path: Path) -> bool:
+    """Download an image to ``dest_path`` and validate it with Pillow.
+
+    This is a best-effort helper: it never raises for download or
+    validation problems. On any failure the partially written file is
+    removed and the reason is printed.
+
+    Args:
+        url: direct URL of the image.
+        dest_path: file to write; its parent folder must already exist.
+
+    Returns:
+        True if the file was written and Pillow accepted it, False otherwise.
+    """
+    try:
+        # With stream=True the connection stays open until the response is
+        # closed; the context manager releases it on every path.
+        with requests.get(url, headers=HEADERS, timeout=30, stream=True) as resp:
+            resp.raise_for_status()
+
+            with open(dest_path, "wb") as f:
+                for chunk in resp.iter_content(chunk_size=8192):
+                    f.write(chunk)
+
+        # Check that the file is a valid image with Pillow
+        with Image.open(dest_path) as img:
+            img.verify()
+
+        return True
+
+    except Exception as e:
+        # Deliberately broad: network, filesystem and Pillow errors all mean
+        # "this image is unusable", and the caller just moves on.
+        print(f"     ✗ error en descarga ({type(e).__name__}): {e}")
+        dest_path.unlink(missing_ok=True)
+        return False
+
+
+# ── Pipeline principal ─────────────────────────────────────────────────────────
+
+class WikipediaImageScraper:
+    """Download Wikipedia images by topic and persist their metadata.
+
+    Images are stored under ``output_dir/<topic-slug>/``. Metadata can be
+    written to a JSON file (``save_metadata``) or upserted into the MongoDB
+    collection ``imagenes_wiki`` (``save_to_mongo``).
+    """
+
+    def __init__(self, output_dir: Path = OUTPUT_BASE, lang: str = "es"):
+        """Store the destination folder and language; create the folder.
+
+        Args:
+            output_dir: root folder for downloads (created if missing).
+            lang: Wikipedia language passed to the search helpers.
+        """
+        self.output_dir = output_dir
+        self.lang = lang
+        self.output_dir.mkdir(parents=True, exist_ok=True)
+        # NOTE: nothing in this class reads the session; the module-level
+        # helpers call requests.get directly. Kept so external code that
+        # accesses the attribute keeps working.
+        self.session = requests.Session()
+
+    def _build_meta(self, tema: str, article: dict, img_info: dict,
+                    safe_name: str, dest_path: Path) -> dict:
+        """Assemble the metadata record for one downloaded image."""
+        return {
+            "archivo":          safe_name,
+            "image_path":       str(dest_path.resolve()),
+            "tema":             tema.lower(),
+            "subtema":          article["title"].lower(),
+            "texto":            article.get("snippet", ""),
+            "descripcion_wiki": img_info.get("descripcion", ""),
+            "autor":            img_info.get("autor", ""),
+            "licencia":         img_info.get("licencia", ""),
+            "url_original":     img_info.get("url_original", ""),
+            "width":            img_info["width"],
+            "height":           img_info["height"],
+            "size_bytes":       img_info["size_bytes"],
+            "source_type":      "wikipedia_imagen",
+            "lang":             self.lang,
+            "fecha":            datetime.now().strftime("%Y-%m-%d"),
+            "articulo_wiki":    article["title"],
+            "keywords":         [],  # filled in later by image_analyzer.py
+        }
+
+    def scrape_tema(self, tema: str, max_images: int = 30, max_articles: int = 10) -> list[dict]:
+        """Download Wikipedia images about one topic.
+
+        Args:
+            tema: topic to search for (e.g. "cambio climático").
+            max_images: maximum number of images to download.
+            max_articles: maximum number of articles to explore.
+
+        Returns:
+            Metadata dicts for the images downloaded in this call. Files that
+            already existed on disk count towards ``max_images`` but produce
+            no metadata entry.
+
+        Raises:
+            requests.RequestException: if the initial article search fails.
+        """
+        # One folder per topic
+        tema_slug = tema.lower().replace(" ", "_").replace("/", "-")[:40]
+        tema_dir = self.output_dir / tema_slug
+        tema_dir.mkdir(parents=True, exist_ok=True)
+
+        print(f"\n[WikiScraper] Tema: '{tema}'  |  max_images={max_images}  |  lang={self.lang}")
+        print("-" * 50)
+
+        # 1. Search for articles
+        articles = search_articles(tema, lang=self.lang, limit=max_articles)
+        print(f"  Artículos encontrados: {len(articles)}")
+        for a in articles[:5]:
+            print(f"    · {a['title']}")
+        if len(articles) > 5:
+            print(f"    ... y {len(articles)-5} más")
+
+        downloaded = []
+        total_downloaded = 0
+        # The same file is often embedded in several articles; handling each
+        # title once avoids repeated API calls and duplicate downloads.
+        seen_titles: set[str] = set()
+
+        for article in articles:
+            if total_downloaded >= max_images:
+                break
+
+            print(f"\n  → {article['title']}")
+
+            # 2. List the images of the article
+            try:
+                img_titles = get_article_images(article["title"], lang=self.lang, limit=25)
+            except Exception as e:
+                print(f"     ERROR obteniendo imágenes: {e}")
+                continue
+
+            print(f"     {len(img_titles)} imágenes en el artículo")
+
+            for img_title in img_titles:
+                if total_downloaded >= max_images:
+                    break
+
+                if img_title in seen_titles:
+                    continue
+                seen_titles.add(img_title)
+
+                # 3. Fetch the image info
+                try:
+                    img_info = get_image_info(img_title)
+                except Exception as e:
+                    print(f"     ERROR obteniendo info de '{img_title}': {e}")
+                    continue
+                finally:
+                    # Respect the Wikimedia rate limit, also after a failure.
+                    time.sleep(0.2)
+
+                if not img_info or not img_info.get("url"):
+                    continue
+
+                # 4. Filter out irrelevant images
+                skip, _motivo = should_skip(img_title, img_info)
+                if skip:
+                    continue
+
+                # 5. Local file name: <slug>_<running index><ext>
+                original_name = Path(urlparse(img_info["url"]).path).name
+                ext = Path(original_name).suffix.lower() or ".jpg"
+                safe_name = f"{tema_slug}_{total_downloaded:03d}{ext}"
+                dest_path = tema_dir / safe_name
+
+                # Skip if it already exists. Names are positional, so an
+                # existing file is assumed to be the image for this slot.
+                if dest_path.exists():
+                    print(f"     ↳ ya existe: {safe_name}")
+                    total_downloaded += 1
+                    continue
+
+                # 6. Download
+                print(f"     ↓ {safe_name}  ({img_info['width']}x{img_info['height']}  {img_info['size_bytes']//1024}KB)")
+                success = download_image(img_info["url"], dest_path)
+
+                if success:
+                    downloaded.append(
+                        self._build_meta(tema, article, img_info, safe_name, dest_path)
+                    )
+                    total_downloaded += 1
+                else:
+                    print("     ✗ fallo descarga")
+
+        print(f"\n[WikiScraper] Descargadas: {total_downloaded} imágenes en {tema_dir}")
+        return downloaded
+
+    def scrape_multitema(self, temas: list[str], max_per_tema: int = 20) -> list[dict]:
+        """Download images for several topics, one after another.
+
+        Args:
+            temas: topics to process in order.
+            max_per_tema: maximum number of images per topic.
+
+        Returns:
+            The concatenated metadata lists of all topics.
+        """
+        all_results = []
+        for tema in temas:
+            results = self.scrape_tema(tema, max_images=max_per_tema)
+            all_results.extend(results)
+            time.sleep(1)  # pause between topics
+        return all_results
+
+    def save_metadata(self, metadata: list[dict], json_path: Path | None = None) -> Path:
+        """Write the metadata list to a UTF-8 JSON file.
+
+        Args:
+            metadata: records to serialize.
+            json_path: destination file; defaults to a timestamped
+                ``metadata_<YYYYmmdd_HHMMSS>.json`` inside ``output_dir``.
+
+        Returns:
+            The path of the file that was written.
+        """
+        if json_path is None:
+            ts = datetime.now().strftime("%Y%m%d_%H%M%S")
+            json_path = self.output_dir / f"metadata_{ts}.json"
+
+        with open(json_path, "w", encoding="utf-8") as f:
+            json.dump(metadata, f, ensure_ascii=False, indent=2)
+
+        print(f"[WikiScraper] Metadatos guardados: {json_path}")
+        return json_path
+
+    def save_to_mongo(self, metadata: list[dict]) -> dict:
+        """Upsert the metadata into the MongoDB collection 'imagenes_wiki'.
+
+        Documents are keyed by "archivo". If MongoDB is unavailable nothing
+        is written and zero counts are returned.
+
+        Args:
+            metadata: records to upsert.
+
+        Returns:
+            ``{"inserted": <upserted count>, "updated": <modified count>}``.
+        """
+        from mongo_helper import MongoHelper
+        mongo = MongoHelper()
+
+        if not mongo.is_available():
+            print("[WikiScraper] MongoDB no disponible — solo JSON local")
+            return {"inserted": 0, "updated": 0}
+
+        # Separate collection so raw scraper records are not mixed with the
+        # analyzed images.
+        db = mongo.connect()
+        try:
+            from pymongo import UpdateOne
+            col = db["imagenes_wiki"]
+            col.create_index("archivo", unique=True)
+
+            ops = [
+                UpdateOne({"archivo": doc["archivo"]}, {"$set": doc}, upsert=True)
+                for doc in metadata
+            ]
+            if ops:
+                result = col.bulk_write(ops)
+                stats = {"inserted": result.upserted_count, "updated": result.modified_count}
+            else:
+                stats = {"inserted": 0, "updated": 0}
+
+            print(f"[WikiScraper] MongoDB imagenes_wiki → {stats}")
+            return stats
+        finally:
+            # Always release the connection, even if the write fails.
+            mongo.disconnect()
+
+
+# ── CLI ────────────────────────────────────────────────────────────────────────
+
+# Default FLUJOS topics; each entry is used verbatim as a Wikipedia search
+# query when the CLI is run with --flujos.
+TEMAS_FLUJOS = [
+    "cambio climático",
+    "geopolítica conflictos",
+    "seguridad internacional espionaje",
+    "libertad de prensa periodismo",
+    "corporaciones poder económico",
+    "populismo extremismo",
+    "desinformación redes sociales",
+    "privacidad vigilancia masiva",
+    "biodiversidad medioambiente",
+    "inteligencia artificial algoritmos",
+]
+
+if __name__ == "__main__":
+    # Command-line entry point: pick the topics, download, persist metadata.
+    cli = argparse.ArgumentParser(description="Descarga imágenes de Wikipedia por tema")
+
+    # Exactly one topic source has to be given.
+    topic_source = cli.add_mutually_exclusive_group(required=True)
+    topic_source.add_argument("--tema", help="Tema único a buscar (ej: 'cambio climático')")
+    topic_source.add_argument("--temas", help="Fichero .txt con un tema por línea")
+    topic_source.add_argument("--flujos", action="store_true", help="Usar los temas de FLUJOS por defecto")
+
+    cli.add_argument("--max", type=int, default=20, help="Máximo imágenes por tema (default: 20)")
+    cli.add_argument("--lang", default="es", help="Idioma Wikipedia: es | en (default: es)")
+    cli.add_argument("--output", default=str(OUTPUT_BASE), help="Carpeta de destino")
+    cli.add_argument("--mongo", action="store_true", help="Guardar metadatos en MongoDB")
+
+    opts = cli.parse_args()
+
+    scraper = WikipediaImageScraper(output_dir=Path(opts.output), lang=opts.lang)
+
+    # Resolve the list of topics from whichever source was selected.
+    if opts.flujos:
+        temas = TEMAS_FLUJOS
+    elif opts.temas:
+        with open(opts.temas, encoding="utf-8") as topics_file:
+            stripped = (line.strip() for line in topics_file)
+            temas = [topic for topic in stripped if topic]
+    else:
+        temas = [opts.tema]
+
+    # A single topic is scraped directly; several go through the batch helper.
+    if len(temas) != 1:
+        metadata = scraper.scrape_multitema(temas, max_per_tema=opts.max)
+    else:
+        metadata = scraper.scrape_tema(temas[0], max_images=opts.max)
+
+    # Persist the results: JSON always, MongoDB only on request.
+    if not metadata:
+        print("\n  No se descargaron imágenes.")
+    else:
+        json_path = scraper.save_metadata(metadata)
+        print(f"\n  Total imágenes descargadas: {len(metadata)}")
+        print(f"  JSON: {json_path}")
+
+        if opts.mongo:
+            scraper.save_to_mongo(metadata)