Includes: FLUJOS app (Node/Flask/Python), FLUJOS_DATOS scripts (scrapers, Keras, Django). Excludes: MongoDB, scraped data, Wikipedia/WikiLeaks dumps, Python venv, node_modules.
101 lines
5.1 KiB
Python
101 lines
5.1 KiB
Python
#!/usr/bin/env python3
|
||
import time
|
||
import shutil
|
||
from pymongo import MongoClient
|
||
|
||
def print_header(title):
    """Print `title` centered on a terminal-wide rule, framed by blank lines.

    The title is padded on both sides with "─" up to the current terminal
    width as reported by `shutil.get_terminal_size()`.
    """
    columns = shutil.get_terminal_size().columns
    banner = title.center(columns, "─")
    print(f"\n{banner}\n")
|
||
|
||
def draw_bar(progress, total, length=40):
    """Return a text progress bar followed by a `progress/total (pct%)` summary.

    Args:
        progress: Amount completed so far.
        total: Amount that represents 100%. A falsy total (0, None) is
            treated as 0% instead of dividing by zero.
        length: Number of cells between the brackets.

    Returns:
        A string such as ``[#####-----] 5/10 ( 50.0%)``. The bar is always
        exactly `length` cells wide; the numeric summary reports the raw
        values, so it may show more than 100% or a negative percentage.
    """
    proportion = progress / total if total else 0
    # Clamp the filled cell count: without this, progress > total overflows
    # the bar and a negative progress makes it longer than `length`.
    filled = max(0, min(length, int(proportion * length)))
    bar = "[" + "#" * filled + "-" * (length - filled) + "]"
    return f"{bar} {progress}/{total} ({proportion * 100:5.1f}%)"
|
||
|
||
def main():
    """Print a console report about the local MongoDB `FLUJOS_DATOS` database.

    Three sections are printed, each followed by its elapsed time:

    1. Server status (ping, uptime, current connections, resident memory)
       plus the number of collections and the summed estimated document
       count of the database.
    2. For each of `noticias`, `wikipedia` and `torrents`: estimated document
       count, data and index sizes from `collStats`, and the number of
       documents per distinct `tema` value.
    3. The estimated number of documents in `comparaciones` and how many of
       them fall in each `porcentaje_similitud` bucket.

    The client is closed on exit, including when a command raises.
    """
    # ─────────────────────────────────────────────────────────────────────────
    # 1) Connection and overall state
    # ─────────────────────────────────────────────────────────────────────────
    print_header("Conexión a MongoDB")
    t0 = time.perf_counter()
    # The context manager closes the client even if a later command fails.
    with MongoClient('mongodb://localhost:27017') as client:
        admin = client.admin.command('serverStatus')
        print(f" Ping: {client.admin.command('ping')['ok']}")
        print(f" Uptime (s): {admin['uptime']:.0f}")
        print(f" Conexiones actuales: {admin['connections']['current']}")
        mem = admin.get('mem', {})
        resident = mem.get('resident', 0)
        print(f" Memoria resident (MB): {resident:.2f}")
        db = client['FLUJOS_DATOS']
        cols = db.list_collection_names()
        print(f" Collections: {len(cols)}")
        total_docs = sum(db[c].estimated_document_count() for c in cols)
        print(f" Documents: {total_docs:,}")
        print(f" Tiempo tramo: {(time.perf_counter()-t0):.2f} s")

        # ─────────────────────────────────────────────────────────────────────
        # 2) Statistics per collection and per topic ("tema")
        # ─────────────────────────────────────────────────────────────────────
        collections = ['noticias', 'wikipedia', 'torrents']
        for idx, col in enumerate(collections, start=1):
            print_header(f"{idx}) Estadísticas de «{col.upper()}»")
            t1 = time.perf_counter()
            coll = db[col]
            cnt = coll.estimated_document_count()
            stats = db.command("collStats", col)
            size_mb = stats['size'] / (1024*1024)
            idx_mb = stats['totalIndexSize'] / (1024*1024)
            total_mb = size_mb + idx_mb
            print(f" Total documentos : {cnt:,}")
            print(f" Tamaño data : {size_mb:8.1f} MB {draw_bar(size_mb, total_mb)}")
            print(f" Tamaño índices : {idx_mb:8.1f} MB {draw_bar(idx_mb, total_mb)}")
            for tema in coll.distinct("tema"):
                n = coll.count_documents({"tema": tema})
                # str() keeps the width spec valid when a stored value is not
                # a string (e.g. None), which would otherwise raise TypeError.
                print(f" • {str(tema):>25}: {n:6,} {draw_bar(n, cnt)}")
            print(f" Tiempo tramo : {(time.perf_counter()-t1):.2f} s")

        # ─────────────────────────────────────────────────────────────────────
        # 3) Total links and porcentaje_similitud buckets (with a live bar)
        # ─────────────────────────────────────────────────────────────────────
        print_header(f"{len(collections)+1}) Enlaces y Buckets de Similitud")
        comp = db['comparaciones']
        total_links = comp.estimated_document_count()
        print(f" Total enlaces: {total_links:,}\n")

        # Bucket boundaries: each bucket is [low, high); the last one is
        # open-ended and therefore has no upper bound.
        boundaries = [0, 1, 2, 5, 10, 20, 50]
        labels = [f"{lo}–{hi}%" for lo, hi in zip(boundaries, boundaries[1:])]
        labels.append(f"{boundaries[-1]}%+")
        uppers = boundaries[1:] + [None]
        bucket_counts = []

        t2 = time.perf_counter()
        print(" Calculando buckets:")
        for step, (label, low, high) in enumerate(zip(labels, boundaries, uppers), start=1):
            # Build the range query for this bucket.
            bounds = {"$gte": low}
            if high is not None:
                bounds["$lt"] = high
            count = comp.count_documents({"porcentaje_similitud": bounds})
            bucket_counts.append((label, count))

            # Redraw the progress bar in place on the same terminal line.
            print("\r " + draw_bar(step, len(labels)), end="", flush=True)
        print()  # line break after the bar

        # Show the results.
        for label, count in bucket_counts:
            # An empty `comparaciones` collection would otherwise divide by zero.
            pct = count / total_links * 100 if total_links else 0.0
            print(f" • {label:>7}: {count:>10,} ({pct:5.2f}%)")

        print(f"\n Tiempo tramo: {(time.perf_counter()-t2):.2f} s")
|
||
|
||
# Run the report only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|