#!/usr/bin/env python3
"""Console report for a local MongoDB server and its FLUJOS_DATOS database.

Prints server status, per-collection size and `tema` statistics, and a
histogram of `porcentaje_similitud` values in the `comparaciones` collection.
"""
import time
import shutil


def print_header(title):
    """Print `title` centred on a terminal-wide rule, with a blank line around it."""
    width = shutil.get_terminal_size().columns
    print("\n" + title.center(width, "─") + "\n")


def _format_amount(value):
    """Render a bar operand: floats with one decimal, anything else unchanged."""
    return f"{value:.1f}" if isinstance(value, float) else f"{value}"


def draw_bar(progress, total, length=40):
    """Return a text progress bar `length` cells wide plus a numeric summary.

    Args:
        progress: Amount completed (int or float).
        total: Amount representing 100%. A falsy total yields an empty bar
            and 0.0% instead of dividing by zero.
        length: Number of cells between the brackets.

    Returns:
        A string such as ``"[#####-----] 5/10 ( 50.0%)"``. Float operands are
        shown with one decimal so the summary stays readable.
    """
    proportion = progress / total if total else 0
    # Clamp the fill so an out-of-range ratio (progress > total or negative)
    # cannot overflow the brackets; the percentage text still shows the
    # real ratio.
    filled = max(0, min(length, int(proportion * length)))
    bar = "[" + "#" * filled + "-" * (length - filled) + "]"
    summary = f"{_format_amount(progress)}/{_format_amount(total)}"
    return f"{bar} {summary} ({proportion*100:5.1f}%)"


def _report_server(client, db, started):
    """Print section 1: server status figures and database-wide totals."""
    status = client.admin.command('serverStatus')
    print(f" Ping: {client.admin.command('ping')['ok']}")
    print(f" Uptime (s): {status['uptime']:.0f}")
    print(f" Conexiones actuales: {status['connections']['current']}")
    resident = status.get('mem', {}).get('resident', 0)
    print(f" Memoria resident (MB): {resident:.2f}")

    names = db.list_collection_names()
    print(f" Collections: {len(names)}")
    total_docs = sum(db[name].estimated_document_count() for name in names)
    print(f" Documents: {total_docs:,}")
    print(f" Tiempo tramo: {(time.perf_counter()-started):.2f} s")


def _report_collection(db, idx, col):
    """Print section 2 for one collection: sizes and document count per `tema`."""
    print_header(f"{idx}) Estadísticas de «{col.upper()}»")
    t1 = time.perf_counter()
    coll = db[col]
    cnt = coll.estimated_document_count()
    stats = db.command("collStats", col)
    size_mb = stats['size'] / (1024*1024)
    idx_mb = stats['totalIndexSize'] / (1024*1024)
    total_mb = size_mb + idx_mb

    print(f" Total documentos : {cnt:,}")
    print(f" Tamaño data : {size_mb:8.1f} MB {draw_bar(size_mb, total_mb)}")
    print(f" Tamaño índices : {idx_mb:8.1f} MB {draw_bar(idx_mb, total_mb)}")

    for tema in coll.distinct("tema"):
        n = coll.count_documents({"tema": tema})
        # str() keeps the width spec valid for values such as None, which
        # reject string format specs.
        print(f" • {str(tema):>25}: {n:6,} {draw_bar(n, cnt)}")

    print(f" Tiempo tramo : {(time.perf_counter()-t1):.2f} s")


def _report_similarity(db, section_no):
    """Print section 3: total links and `porcentaje_similitud` bucket counts."""
    print_header(f"{section_no}) Enlaces y Buckets de Similitud")
    comp = db['comparaciones']
    total_links = comp.estimated_document_count()
    print(f" Total enlaces: {total_links:,}\n")

    # Bucket edges: each bucket is [low, high); the last one is open-ended.
    boundaries = [0, 1, 2, 5, 10, 20, 50]
    uppers = boundaries[1:] + [None]
    labels = [f"{low}–{high}%" for low, high in zip(boundaries, boundaries[1:])]
    labels.append(f"{boundaries[-1]}%+")

    bucket_counts = []
    t2 = time.perf_counter()
    print(" Calculando buckets:")
    for step, (label, low, high) in enumerate(zip(labels, boundaries, uppers), start=1):
        bounds = {"$gte": low}
        if high is not None:
            bounds["$lt"] = high
        count = comp.count_documents({"porcentaje_similitud": bounds})
        bucket_counts.append((label, count))
        # "\r" rewrites the same terminal line so the bar advances in place.
        print("\r " + draw_bar(step, len(labels)), end="", flush=True)
    print()  # line break after the live bar

    for label, count in bucket_counts:
        # An empty collection reports zero links; avoid dividing by it.
        share = count / total_links * 100 if total_links else 0.0
        print(f" • {label:>7}: {count:>10,} ({share:5.2f}%)")

    print(f"\n Tiempo tramo: {(time.perf_counter()-t2):.2f} s")


def main():
    """Connect to mongodb://localhost:27017 and print the three report sections.

    The client is closed even if one of the queries raises.
    """
    # Imported here rather than at module level so the pure helpers above can
    # be imported without the MongoDB driver installed.
    from pymongo import MongoClient

    # 1) Connection and overall status
    print_header("Conexión a MongoDB")
    t0 = time.perf_counter()
    client = MongoClient('mongodb://localhost:27017')
    try:
        db = client['FLUJOS_DATOS']
        _report_server(client, db, t0)

        # 2) Statistics per collection and per topic
        collections = ['noticias', 'wikipedia', 'torrents']
        for idx, col in enumerate(collections, start=1):
            _report_collection(db, idx, col)

        # 3) Total links and porcentaje_similitud buckets (with live bar)
        _report_similarity(db, len(collections) + 1)
    finally:
        client.close()


if __name__ == "__main__":
    main()