FLUJOS/FLUJOS_DATOS/SCRIPTS/check.py
CAPITANSITO 83f67b76b4 código completo FLUJOS — snapshot limpio sin datos scrapeados
Incluye: backend Node.js/Express, visualización 3D (Three.js/3d-force-graph),
scrapers Wikipedia/noticias/imágenes, analizador Qwen3-VL, pipeline maestro
con systemd timer, fixes de seguridad (NoSQL injection, XSS, ReDoS, port
binding) y documentación técnica completa en docs/.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-21 23:45:29 +02:00

101 lines
5.1 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
import time
import shutil
from pymongo import MongoClient
def print_header(title):
    """Print `title` centered across the terminal width, framed by blank lines.

    The title is padded on both sides with a horizontal rule character up to
    the column count reported by `shutil.get_terminal_size()`.

    Args:
        title: Text to display as the section heading.
    """
    width = shutil.get_terminal_size().columns
    # str.center() requires a fill string of exactly one character; an empty
    # fill raises TypeError, so an explicit rule character is used.
    print("\n" + title.center(width, "─") + "\n")
def draw_bar(progress, total, length=40):
    """Return a text progress bar of `length` cells followed by a summary.

    Args:
        progress: Amount completed so far (int or float).
        total: Amount that represents 100%. A falsy total (e.g. 0) yields an
            empty bar and 0.0%.
        length: Number of cells drawn between the brackets.

    Returns:
        A string such as ``[#####-----] 5/10 ( 50.0%)``.
    """

    def fmt(value):
        # Floats (e.g. sizes in MB) are shown with one decimal instead of
        # their full repr; everything else is printed unchanged.
        return f"{value:.1f}" if isinstance(value, float) else str(value)

    proportion = progress / total if total else 0
    # Clamp so the bar is always exactly `length` cells wide, even when
    # progress exceeds total or is negative.
    filled = min(max(int(proportion * length), 0), length)
    bar = "[" + "#" * filled + "-" * (length - filled) + "]"
    return f"{bar} {fmt(progress)}/{fmt(total)} ({proportion * 100:5.1f}%)"
def main():
    """Print a status report for the local ``FLUJOS_DATOS`` MongoDB database.

    Connects to ``mongodb://localhost:27017`` and prints three sections:

    1. Server status (ping, uptime, connections, resident memory) plus the
       number of collections and the estimated total document count.
    2. For each of ``noticias``, ``wikipedia`` and ``torrents``: document
       count, data/index size, and a per-``tema`` document breakdown.
    3. For ``comparaciones``: total links and how many fall into each
       ``porcentaje_similitud`` bucket.

    The client is always closed, even if a query fails part-way through.
    """
    # ─────────────────────────────────────────────────────────────────────────
    # 1) Connection and overall status
    # ─────────────────────────────────────────────────────────────────────────
    print_header("Conexión a MongoDB")
    t0 = time.perf_counter()
    client = MongoClient('mongodb://localhost:27017')
    try:
        admin = client.admin.command('serverStatus')
        print(f" Ping: {client.admin.command('ping')['ok']}")
        print(f" Uptime (s): {admin['uptime']:.0f}")
        print(f" Conexiones actuales: {admin['connections']['current']}")
        mem = admin.get('mem', {})
        resident = mem.get('resident', 0)
        print(f" Memoria resident (MB): {resident:.2f}")
        db = client['FLUJOS_DATOS']
        cols = db.list_collection_names()
        print(f" Collections: {len(cols)}")
        total_docs = sum(db[c].estimated_document_count() for c in cols)
        print(f" Documents: {total_docs:,}")
        print(f" Tiempo tramo: {(time.perf_counter()-t0):.2f} s")

        # ─────────────────────────────────────────────────────────────────────
        # 2) Per-collection and per-topic statistics
        # ─────────────────────────────────────────────────────────────────────
        collections = ['noticias', 'wikipedia', 'torrents']
        for idx, col in enumerate(collections, start=1):
            print_header(f"{idx}) Estadísticas de «{col.upper()}»")
            t1 = time.perf_counter()
            coll = db[col]
            cnt = coll.estimated_document_count()
            stats = db.command("collStats", col)
            size_mb = stats['size'] / (1024*1024)
            idx_mb = stats['totalIndexSize'] / (1024*1024)
            total_mb = size_mb + idx_mb
            print(f" Total documentos : {cnt:,}")
            print(f" Tamaño data : {size_mb:8.1f} MB {draw_bar(size_mb, total_mb)}")
            print(f" Tamaño índices : {idx_mb:8.1f} MB {draw_bar(idx_mb, total_mb)}")
            for tema in coll.distinct("tema"):
                n = coll.count_documents({"tema": tema})
                # str() keeps the alignment spec valid when a stored `tema`
                # is None or some other non-string value.
                print(f"{str(tema):>25}: {n:6,} {draw_bar(n, cnt)}")
            print(f" Tiempo tramo : {(time.perf_counter()-t1):.2f} s")

        # ─────────────────────────────────────────────────────────────────────
        # 3) Total links and porcentaje_similitud buckets (with a live bar)
        # ─────────────────────────────────────────────────────────────────────
        print_header(f"{len(collections)+1}) Enlaces y Buckets de Similitud")
        comp = db['comparaciones']
        total_links = comp.estimated_document_count()
        print(f" Total enlaces: {total_links:,}\n")

        # Bucket edges: each bucket is [low, high); the last one is open-ended.
        boundaries = [0, 1, 2, 5, 10, 20, 50]
        buckets = [
            (f"{low}-{high}%", {"$gte": low, "$lt": high})
            for low, high in zip(boundaries, boundaries[1:])
        ]
        buckets.append((f"{boundaries[-1]}%+", {"$gte": boundaries[-1]}))

        bucket_counts = []
        t2 = time.perf_counter()
        print(" Calculando buckets:")
        for done, (label, value_range) in enumerate(buckets, start=1):
            count = comp.count_documents({"porcentaje_similitud": value_range})
            bucket_counts.append((label, count))
            # Redraw the progress bar in place on the same terminal line.
            print("\r " + draw_bar(done, len(buckets)), end="", flush=True)
        print()  # newline after the in-place bar

        for label, count in bucket_counts:
            # Guard against an empty `comparaciones` collection.
            share = count / total_links * 100 if total_links else 0.0
            print(f"{label:>7}: {count:>10,} ({share:5.2f}%)")
        print(f"\n Tiempo tramo: {(time.perf_counter()-t2):.2f} s")
    finally:
        client.close()
# Run the report only when executed as a script, not when imported.
if __name__ == "__main__":
    main()