# FLUJOS/FLUJOS_DATOS/SCRIPTS/check.py

#!/usr/bin/env python3
import time
import shutil
from pymongo import MongoClient

def print_header(title):
    """Print `title` as a section banner spanning the terminal width.

    The title is centered and padded on both sides with the "─" character
    up to the column count reported by `shutil.get_terminal_size()`. One
    blank line is emitted before and after the banner.
    """
    columns = shutil.get_terminal_size().columns
    banner = title.center(columns, "─")
    print(f"\n{banner}\n")

def draw_bar(progress, total, length=40):
    """Return a text progress bar of `length` cells followed by the ratio.

    Args:
        progress: Amount completed so far.
        total: Amount that represents 100%. A falsy total (0, None) is
            treated as 0% instead of dividing by zero.
        length: Number of cells between the brackets.

    Returns:
        A string such as ``"[#####-----] 5/10 ( 50.0%)"``.

    The printed ratio and percentage always reflect the real values, but
    the number of filled cells is clamped to ``[0, length]`` so the bar
    keeps a fixed width even when `progress` exceeds `total` or is negative.
    """
    proportion = progress / total if total else 0
    # Clamp so an out-of-range proportion cannot stretch the bar.
    filled = max(0, min(length, int(proportion * length)))
    bar = "[" + "#" * filled + "-" * (length - filled) + "]"
    return f"{bar} {progress}/{total} ({proportion*100:5.1f}%)"

def _report_server(client, db, started):
    """Print server status figures and database totals; return collection names.

    `started` is the `time.perf_counter()` reading taken before the client
    was created, so the elapsed time printed here covers the whole section.
    """
    admin = client.admin.command('serverStatus')
    print(f" Ping: {client.admin.command('ping')['ok']}")
    print(f" Uptime (s): {admin['uptime']:.0f}")
    print(f" Conexiones actuales: {admin['connections']['current']}")
    mem = admin.get('mem', {})
    resident = mem.get('resident', 0)
    print(f" Memoria resident (MB): {resident:.2f}")
    cols = db.list_collection_names()
    print(f" Collections: {len(cols)}")
    total_docs = sum(db[c].estimated_document_count() for c in cols)
    print(f" Documents: {total_docs:,}")
    print(f" Tiempo tramo: {(time.perf_counter()-started):.2f} s")
    return cols


def _report_collection(db, col, idx, existing):
    """Print size statistics and the per-`tema` breakdown for collection `col`.

    `existing` is the list of collection names present in the database;
    a collection that is not in it is reported as missing and skipped.
    """
    print_header(f"{idx}) Estadísticas de «{col.upper()}»")
    t1 = time.perf_counter()
    if col not in existing:
        # NOTE(review): depending on the server version, collStats on a
        # missing collection may fail instead of returning zeros, so skip
        # it explicitly rather than abort the whole report.
        print(" Colección no encontrada; se omite.")
        return
    coll = db[col]
    cnt = coll.estimated_document_count()
    stats = db.command("collStats", col)
    size_mb = stats['size'] / (1024*1024)
    idx_mb = stats['totalIndexSize'] / (1024*1024)
    total_mb = size_mb + idx_mb
    print(f" Total documentos : {cnt:,}")
    print(f" Tamaño data      : {size_mb:8.1f} MB {draw_bar(size_mb, total_mb)}")
    print(f" Tamaño índices   : {idx_mb:8.1f} MB {draw_bar(idx_mb, total_mb)}")
    for tema in coll.distinct("tema"):
        n = coll.count_documents({"tema": tema})
        # `!s` keeps the alignment spec working for non-string values
        # (e.g. None), which would otherwise raise TypeError.
        print(f"   • {tema!s:>25}: {n:6,} {draw_bar(n, cnt)}")
    print(f" Tiempo tramo     : {(time.perf_counter()-t1):.2f} s")


def _report_similarity(comp, section_no):
    """Print the link total and the `porcentaje_similitud` bucket counts.

    One count query is issued per bucket, and a progress bar is redrawn on
    the same terminal line after each query.
    """
    print_header(f"{section_no}) Enlaces y Buckets de Similitud")
    total_links = comp.estimated_document_count()
    print(f" Total enlaces: {total_links:,}\n")

    # Bucket limits: each bucket is [low, high); the last one is open-ended.
    boundaries = [0, 1, 2, 5, 10, 20, 50]
    uppers = boundaries[1:] + [None]
    labels = [f"{low}–{high}%" for low, high in zip(boundaries, boundaries[1:])]
    labels.append(f"{boundaries[-1]}%+")
    bucket_counts = []

    t2 = time.perf_counter()
    print(" Calculando buckets:")
    for done, (label, low, high) in enumerate(zip(labels, boundaries, uppers), start=1):
        limits = {"$gte": low}
        if high is not None:
            limits["$lt"] = high
        count = comp.count_documents({"porcentaje_similitud": limits})
        bucket_counts.append((label, count))

        # Redraw the progress bar in place.
        print("\r " + draw_bar(done, len(labels)), end="", flush=True)
    print()  # newline after the bar

    for label, count in bucket_counts:
        # An empty collection reports 0 links; avoid dividing by zero.
        pct = count/total_links*100 if total_links else 0.0
        print(f"   • {label:>7}: {count:>10,}  ({pct:5.2f}%)")

    print(f"\n Tiempo tramo: {(time.perf_counter()-t2):.2f} s")


def main():
    """Print a status report for the local MongoDB `FLUJOS_DATOS` database.

    The report has three parts: server and database totals, per-collection
    statistics with a per-`tema` breakdown, and the distribution of
    `porcentaje_similitud` values in the `comparaciones` collection.
    The client is closed even if one of the commands raises.
    """
    # ─────────────────────────────────────────────────────────────────────────────
    #                     1) Connection and general status
    # ─────────────────────────────────────────────────────────────────────────────
    print_header("Conexión a MongoDB")
    t0 = time.perf_counter()
    client = MongoClient('mongodb://localhost:27017')
    try:
        db = client['FLUJOS_DATOS']
        existing = _report_server(client, db, t0)

        # ─────────────────────────────────────────────────────────────────────────
        #             2) Statistics per collection and per topic
        # ─────────────────────────────────────────────────────────────────────────
        collections = ['noticias', 'wikipedia', 'torrents']
        for idx, col in enumerate(collections, start=1):
            _report_collection(db, col, idx, existing)

        # ─────────────────────────────────────────────────────────────────────────
        #   3) Total links and porcentaje_similitud buckets (with live bar)
        # ─────────────────────────────────────────────────────────────────────────
        _report_similarity(db['comparaciones'], len(collections) + 1)
    finally:
        client.close()

# Run the report only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()