Initial commit - FLUJOS codebase (production branch)

Includes: FLUJOS app (Node/Flask/Python), FLUJOS_DATOS scripts (scrapers, Keras, Django). Excludes: MongoDB, scraped data, Wikipedia/WikiLeaks dumps, Python venv, node_modules.
2026-03-31 14:10:02 +02:00 · 2026-03-31 14:10:02 +02:00 · a40b946163
commit a40b946163
158 changed files with 196645 additions and 0 deletions
--- a/FLUJOS_DATOS/SCRIPTS/check.py
+++ b/FLUJOS_DATOS/SCRIPTS/check.py
@ -0,0 +1,101 @@
+#!/usr/bin/env python3
+import time
+import shutil
+from pymongo import MongoClient
+
+def print_header(title):
+    """Print `title` centered in a "─" rule as wide as the terminal.
+
+    The banner is preceded by a newline and followed by one extra newline
+    (in addition to the one `print` appends).
+    """
+    columns = shutil.get_terminal_size().columns
+    banner = title.center(columns, "─")
+    print(f"\n{banner}\n")
+
+def _format_amount(value):
+    """Return `value` as text: floats with one decimal, anything else via str()."""
+    return f"{value:.1f}" if isinstance(value, float) else str(value)
+
+def draw_bar(progress, total, length=40):
+    """Render a text progress bar followed by "progress/total (pct%)".
+
+    Args:
+        progress: Amount completed (int or float).
+        total: Amount that represents 100%; a falsy total yields a 0% bar.
+        length: Number of cells drawn between the brackets.
+
+    Returns:
+        The bar as a string, e.g. "[##--] 1/2 ( 50.0%)".
+
+    The drawn cells are clamped to the range [0, length] so the bar keeps a
+    fixed width even when `progress` is negative or exceeds `total`; the
+    printed percentage itself is not clamped.
+    """
+    proportion = progress / total if total else 0.0
+    # Clamp only the drawing, never the reported figures.
+    filled = min(length, max(0, int(proportion * length)))
+    bar = "[" + "#" * filled + "-" * (length - filled) + "]"
+    return f"{bar} {_format_amount(progress)}/{_format_amount(total)} ({proportion*100:5.1f}%)"
+
+def _print_collection_stats(db, idx, col, existing):
+    """Print document count, storage sizes and per-`tema` counts for one collection.
+
+    Args:
+        db: Database handle the collection belongs to.
+        idx: Section number shown in the header.
+        col: Collection name.
+        existing: Names of the collections present in `db`.
+    """
+    print_header(f"{idx}) Estadísticas de «{col.upper()}»")
+    if col not in existing:
+        # collStats may error out on a namespace that does not exist;
+        # report the gap and keep going with the remaining collections.
+        print(" Colección no encontrada; se omite.")
+        return
+
+    t1 = time.perf_counter()
+    coll = db[col]
+    cnt = coll.estimated_document_count()
+    stats = db.command("collStats", col)
+    size_mb = stats['size'] / (1024*1024)
+    idx_mb  = stats['totalIndexSize'] / (1024*1024)
+    total_mb = size_mb + idx_mb
+    print(f" Total documentos : {cnt:,}")
+    print(f" Tamaño data      : {size_mb:8.1f} MB {draw_bar(size_mb, total_mb)}")
+    print(f" Tamaño índices   : {idx_mb:8.1f} MB {draw_bar(idx_mb, total_mb)}")
+    for tema in coll.distinct("tema"):
+        n = coll.count_documents({"tema": tema})
+        # str() keeps the alignment spec valid for None / non-string values.
+        print(f"   • {str(tema):>25}: {n:6,} {draw_bar(n, cnt)}")
+    print(f" Tiempo tramo     : {(time.perf_counter()-t1):.2f} s")
+
+def _print_similarity_buckets(db, section_no):
+    """Print the link total and how `porcentaje_similitud` spreads over fixed buckets.
+
+    Args:
+        db: Database handle holding the `comparaciones` collection.
+        section_no: Section number shown in the header.
+    """
+    print_header(f"{section_no}) Enlaces y Buckets de Similitud")
+    comp = db['comparaciones']
+    total_links = comp.estimated_document_count()
+    print(f" Total enlaces: {total_links:,}\n")
+
+    # Bucket limits: each bucket is [low, high), the last one is open-ended.
+    boundaries = [0, 1, 2, 5, 10, 20, 50]
+    labels = [f"{low}–{high}%" for low, high in zip(boundaries, boundaries[1:])] + ["50%+"]
+    uppers = boundaries[1:] + [None]
+    bucket_counts = []
+
+    t2 = time.perf_counter()
+    print(" Calculando buckets:")
+    for step, (label, low, high) in enumerate(zip(labels, boundaries, uppers), start=1):
+        similarity = {"$gte": low}
+        if high is not None:
+            similarity["$lt"] = high
+        count = comp.count_documents({"porcentaje_similitud": similarity})
+        bucket_counts.append((label, count))
+
+        # Redraw the progress bar in place on the same terminal line.
+        print("\r " + draw_bar(step, len(labels)), end="", flush=True)
+    print()  # line break after the progress bar
+
+    for label, count in bucket_counts:
+        # An empty collection reports 0 links; avoid dividing by zero.
+        share = count / total_links * 100 if total_links else 0.0
+        print(f"   • {label:>7}: {count:>10,}  ({share:5.2f}%)")
+
+    print(f"\n Tiempo tramo: {(time.perf_counter()-t2):.2f} s")
+
+def main():
+    """Print a health and content report for the local FLUJOS_DATOS database.
+
+    Connects to mongodb://localhost:27017 and prints, in order: server
+    status figures and database-wide totals, statistics for the
+    `noticias`, `wikipedia` and `torrents` collections, and the bucketed
+    distribution of `porcentaje_similitud` in `comparaciones`. The client
+    is closed even if one of the queries raises.
+    """
+    # 1) Connection and overall status
+    print_header("Conexión a MongoDB")
+    t0 = time.perf_counter()
+    client = MongoClient('mongodb://localhost:27017')
+    try:
+        admin = client.admin.command('serverStatus')
+        print(f" Ping: {client.admin.command('ping')['ok']}")
+        print(f" Uptime (s): {admin['uptime']:.0f}")
+        print(f" Conexiones actuales: {admin['connections']['current']}")
+        mem = admin.get('mem', {})
+        resident = mem.get('resident', 0)
+        print(f" Memoria resident (MB): {resident:.2f}")
+        db = client['FLUJOS_DATOS']
+        cols = db.list_collection_names()
+        print(f" Collections: {len(cols)}")
+        total_docs = sum(db[c].estimated_document_count() for c in cols)
+        print(f" Documents: {total_docs:,}")
+        print(f" Tiempo tramo: {(time.perf_counter()-t0):.2f} s")
+
+        # 2) Statistics per collection and per topic
+        collections = ['noticias','wikipedia','torrents']
+        existing = set(cols)
+        for idx, col in enumerate(collections, start=1):
+            _print_collection_stats(db, idx, col, existing)
+
+        # 3) Total links and porcentaje_similitud buckets (with a live bar)
+        _print_similarity_buckets(db, len(collections) + 1)
+    finally:
+        client.close()
+
+if __name__ == "__main__":
+    main()
--- a/FLUJOS_DATOS/SCRIPTS/double_check_coincidencias.py
+++ b/FLUJOS_DATOS/SCRIPTS/double_check_coincidencias.py
@ -0,0 +1,23 @@
+# double_check_coincidencias.py
+from pymongo import MongoClient
+
+# Cross-check the names referenced by `comparaciones` against the `archivo`
+# values stored in the source collections, and report how many appear on
+# only one side.
+client = MongoClient('mongodb://localhost:27017')
+try:
+    db = client['FLUJOS_DATOS']
+
+    # 1) Every unique name that appears in comparaciones (either endpoint)
+    comp = db.comparaciones.distinct('noticia1') + db.comparaciones.distinct('noticia2')
+    comp = set(comp)
+
+    # 2) Every unique name in the node collections
+    fuentes = ['noticias','wikipedia','torrents','leaks']
+    # Listing collections is a server round-trip; do it once, not per source.
+    existentes = set(db.list_collection_names())
+    todos = set()
+    for col in fuentes:
+        if col in existentes:
+            todos |= set(db[col].distinct('archivo'))
+finally:
+    # All data is in memory at this point; release the connection either way.
+    client.close()
+
+sólo_en_comparaciones = comp - todos
+sólo_en_fuentes        = todos - comp
+
+print(f"Mismatches comparaciones→fuentes: {len(sólo_en_comparaciones)}")
+print(f"Mismatches fuentes→comparaciones: {len(sólo_en_fuentes)}")
+print("Ejemplos (up to 10):", list(sólo_en_comparaciones)[:10])