FLUJOS/FLUJOS_DATOS/SCRIPTS/check.py
CAPITANSITO a40b946163 Initial commit - FLUJOS codebase (production branch)
Includes: FLUJOS app (Node/Flask/Python), FLUJOS_DATOS scripts (scrapers, Keras, Django)
Excludes: MongoDB, scraped data, Wikipedia/WikiLeaks dumps, Python venv, node_modules
2026-03-31 14:10:02 +02:00

101 lines
5.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
import time
import shutil
from pymongo import MongoClient
def print_header(title, fill="─"):
    """Print `title` centred on a full-width rule, with a blank line on each side.

    Args:
        title: Text to show in the middle of the rule.
        fill: Single character used to pad the title out to the terminal
            width. `str.center` rejects anything that is not exactly one
            character long, so an empty string is not allowed here.
    """
    # get_terminal_size() falls back to 80 columns when no terminal is attached.
    width = shutil.get_terminal_size().columns
    print("\n" + title.center(width, fill) + "\n")
def draw_bar(progress, total, length=40):
    """Render a text progress bar that is always `length` characters wide.

    Args:
        progress: Amount completed so far.
        total: Amount that represents 100%. A falsy value (such as 0) is
            reported as 0% instead of dividing by zero.
        length: Number of characters between the square brackets.

    Returns:
        A string such as ``[#####-----] 5/10 ( 50.0%)``. The numeric part
        reports the real ratio even when it falls outside 0-100%.
    """
    proportion = progress / total if total else 0
    # Clamp the drawn portion: `progress` can exceed `total` (e.g. an exact
    # count compared against an estimated total) or be negative, and the bar
    # must not grow beyond `length` in either case.
    filled = min(max(int(proportion * length), 0), length)
    bar = "[" + "#" * filled + "-" * (length - filled) + "]"
    return f"{bar} {progress}/{total} ({proportion*100:5.1f}%)"
def _report_server(client, db):
    """Print section 1: server status and database-wide totals."""
    print_header("Conexión a MongoDB")
    started = time.perf_counter()
    status = client.admin.command('serverStatus')
    print(f" Ping: {client.admin.command('ping')['ok']}")
    print(f" Uptime (s): {status['uptime']:.0f}")
    print(f" Conexiones actuales: {status['connections']['current']}")
    # The 'mem' section may be missing from serverStatus; report 0 then.
    resident = status.get('mem', {}).get('resident', 0)
    print(f" Memoria resident (MB): {resident:.2f}")
    names = db.list_collection_names()
    print(f" Collections: {len(names)}")
    total_docs = sum(db[name].estimated_document_count() for name in names)
    print(f" Documents: {total_docs:,}")
    print(f" Tiempo tramo: {(time.perf_counter()-started):.2f} s")


def _report_collections(db, collections):
    """Print section 2: size and per-`tema` document counts of each collection."""
    for idx, col in enumerate(collections, start=1):
        print_header(f"{idx}) Estadísticas de «{col.upper()}»")
        started = time.perf_counter()
        coll = db[col]
        cnt = coll.estimated_document_count()
        stats = db.command("collStats", col)
        size_mb = stats['size'] / (1024*1024)
        idx_mb = stats['totalIndexSize'] / (1024*1024)
        total_mb = size_mb + idx_mb
        print(f" Total documentos : {cnt:,}")
        print(f" Tamaño data : {size_mb:8.1f} MB {draw_bar(size_mb, total_mb)}")
        print(f" Tamaño índices : {idx_mb:8.1f} MB {draw_bar(idx_mb, total_mb)}")
        for tema in coll.distinct("tema"):
            n = coll.count_documents({"tema": tema})
            # `!s` keeps the alignment working for non-string values such as
            # None, which distinct() returns for documents with a null tema.
            print(f"{tema!s:>25}: {n:6,} {draw_bar(n, cnt)}")
        print(f" Tiempo tramo : {(time.perf_counter()-started):.2f} s")


def _report_similarity(db, section_number):
    """Print section 3: link total and `porcentaje_similitud` bucket counts."""
    print_header(f"{section_number}) Enlaces y Buckets de Similitud")
    comp = db['comparaciones']
    total_links = comp.estimated_document_count()
    print(f" Total enlaces: {total_links:,}\n")

    # Bucket limits: consecutive pairs form half-open ranges [low, high),
    # and the last limit opens an unbounded final bucket.
    boundaries = [0, 1, 2, 5, 10, 20, 50]
    buckets = [
        (f"{low}-{high}%", low, high)
        for low, high in zip(boundaries, boundaries[1:])
    ]
    buckets.append((f"{boundaries[-1]}%+", boundaries[-1], None))

    bucket_counts = []
    started = time.perf_counter()
    print(" Calculando buckets:")
    for done, (label, low, high) in enumerate(buckets, start=1):
        limits = {"$gte": low}
        if high is not None:
            limits["$lt"] = high
        count = comp.count_documents({"porcentaje_similitud": limits})
        bucket_counts.append((label, count))
        # Redraw the progress bar in place on the same terminal line.
        print("\r " + draw_bar(done, len(buckets)), end="", flush=True)
    print()  # end the progress-bar line

    for label, count in bucket_counts:
        # An empty collection would otherwise divide by zero.
        share = count / total_links * 100 if total_links else 0.0
        print(f"{label:>7}: {count:>10,} ({share:5.2f}%)")
    print(f"\n Tiempo tramo: {(time.perf_counter()-started):.2f} s")


def main():
    """Print a status and statistics report for the FLUJOS_DATOS database.

    Connects to the MongoDB server at localhost:27017 and prints three
    sections: server status and database totals, statistics for the
    `noticias`, `wikipedia` and `torrents` collections, and the distribution
    of `porcentaje_similitud` in the `comparaciones` collection.

    The client is closed even when one of the queries raises.
    """
    client = MongoClient('mongodb://localhost:27017')
    try:
        db = client['FLUJOS_DATOS']
        _report_server(client, db)
        collections = ['noticias', 'wikipedia', 'torrents']
        _report_collections(db, collections)
        _report_similarity(db, section_number=len(collections) + 1)
    finally:
        client.close()
# Run the report only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()