Initial clean commit

jlimolina 2026-01-13 13:39:51 +01:00
commit 6784d81c2c
141 changed files with 25219 additions and 0 deletions

@@ -0,0 +1,67 @@
#!/usr/bin/env python3
"""
Clean <unk> tokens out of the translations.
"""
import re

from db import get_conn


def clean_text(text):
    """Remove <unk> tokens and other problematic characters."""
    if not text:
        return text
    # Remove <unk> tokens and U+FFFD replacement characters
    # (the latter showed up as raw EF BF BD bytes in the dump)
    text = text.replace('<unk>', '')
    text = text.replace('\ufffd', '')
    # Remove other problematic control characters
    text = re.sub(r'[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F-\x9F]', '', text)
    return text.strip()


def main():
    """Clean all translations that contain <unk> tokens."""
    print("🧹 Cleaning <unk> tokens from translations...")
    with get_conn() as conn:
        with conn.cursor() as cur:
            # Find translations with <unk> or replacement-character tokens
            cur.execute("""
                SELECT id, titulo_trad, resumen_trad
                FROM traducciones
                WHERE titulo_trad LIKE '%<unk>%'
                   OR resumen_trad LIKE '%<unk>%'
                   OR titulo_trad LIKE '%\ufffd%'
                   OR resumen_trad LIKE '%\ufffd%'
            """)
            translations = cur.fetchall()
            print(f"📊 Found {len(translations)} translations with problematic tokens")
            if not translations:
                print("✅ Nothing to clean")
                return
            updated_count = 0
            for row in translations:
                tr_id, titulo, resumen = row
                # Clean both fields
                new_titulo = clean_text(titulo) if titulo else titulo
                new_resumen = clean_text(resumen) if resumen else resumen
                # Update only if something changed
                if new_titulo != titulo or new_resumen != resumen:
                    cur.execute("""
                        UPDATE traducciones
                        SET titulo_trad = %s,
                            resumen_trad = %s
                        WHERE id = %s
                    """, (new_titulo, new_resumen, tr_id))
                    updated_count += 1
                    if updated_count % 100 == 0:
                        print(f"  ⏳ Processed {updated_count} translations...")
        conn.commit()
    print(f"✅ Cleanup complete: {updated_count} translations updated")


if __name__ == "__main__":
    main()

scripts/convert_model.sh Executable file
@@ -0,0 +1,39 @@
#!/bin/bash
# Convert the NLLB model from Hugging Face format to CTranslate2 format.
# Run once before using the translation_worker with CTranslate2.

set -e

MODEL=${UNIVERSAL_MODEL:-"facebook/nllb-200-distilled-600M"}
OUTPUT_DIR=${CT2_MODEL_PATH:-"./models/nllb-ct2"}
QUANTIZATION=${CT2_QUANTIZATION:-"int8_float16"}

echo "=== NLLB model conversion to CTranslate2 ==="
echo "Source model: $MODEL"
echo "Output directory: $OUTPUT_DIR"
echo "Quantization: $QUANTIZATION"
echo ""

# Check that ctranslate2 is installed
if ! command -v ct2-transformers-converter &> /dev/null; then
    echo "Error: ct2-transformers-converter not found."
    echo "Install it with: pip install ctranslate2"
    exit 1
fi

# Create the parent directory if it does not exist
mkdir -p "$(dirname "$OUTPUT_DIR")"

# Convert the model
echo "Starting conversion (may take 5-10 minutes)..."
ct2-transformers-converter \
    --model "$MODEL" \
    --output_dir "$OUTPUT_DIR" \
    --quantization "$QUANTIZATION" \
    --force

echo ""
echo "✓ Conversion complete: $OUTPUT_DIR"
echo ""
echo "To use the model, set:"
echo "  export CT2_MODEL_PATH=$OUTPUT_DIR"

@@ -0,0 +1,77 @@
#!/bin/bash
# Example script that creates video playlists ("parrillas").
# The SQL data values stay in Spanish: they are content for a Spanish-voice system.

echo "🎬 Creating example playlists..."

# 1. News from Bulgaria
docker-compose exec -T db psql -U rss -d rss << EOF
INSERT INTO video_parrillas (
    nombre, descripcion, tipo_filtro,
    pais_id, max_noticias, duracion_maxima,
    idioma_voz, template, include_images, include_subtitles,
    frecuencia, activo
) VALUES (
    'Noticias de Bulgaria',
    'Resumen diario de las noticias más importantes de Bulgaria',
    'pais',
    (SELECT id FROM paises WHERE nombre ILIKE '%bulgaria%' LIMIT 1),
    5, 180,
    'es', 'standard', true, true,
    'daily', true
) ON CONFLICT DO NOTHING;
EOF

# 2. Science in Europe
docker-compose exec -T db psql -U rss -d rss << EOF
INSERT INTO video_parrillas (
    nombre, descripcion, tipo_filtro,
    categoria_id, continente_id, max_noticias,
    idioma_voz, template, include_subtitles,
    frecuencia, activo
) VALUES (
    'Ciencia en Europa',
    'Las últimas noticias científicas de Europa',
    'categoria',
    (SELECT id FROM categorias WHERE nombre ILIKE '%ciencia%' LIMIT 1),
    (SELECT id FROM continentes WHERE nombre = 'Europa' LIMIT 1),
    7,
    'es', 'modern', true,
    'daily', true
) ON CONFLICT DO NOTHING;
EOF

# 3. Global technology
docker-compose exec -T db psql -U rss -d rss << EOF
INSERT INTO video_parrillas (
    nombre, descripcion, tipo_filtro,
    categoria_id, max_noticias, duracion_maxima,
    idioma_voz, template, include_subtitles,
    frecuencia, activo
) VALUES (
    'Tech News Daily',
    'Resumen diario de tecnología mundial',
    'categoria',
    (SELECT id FROM categorias WHERE nombre ILIKE '%tecnolog%' LIMIT 1),
    8, 300,
    'es', 'modern', true,
    'daily', true
) ON CONFLICT DO NOTHING;
EOF

echo "✅ Playlists created!"
echo ""
echo "📊 View the created playlists:"
docker-compose exec -T db psql -U rss -d rss -c "
SELECT id, nombre, tipo_filtro, max_noticias, frecuencia, activo
FROM video_parrillas
ORDER BY id DESC;
"
echo ""
echo "🎥 Open the web UI at: http://localhost:8001/parrillas/"
echo ""
echo "💡 To generate a video manually:"
echo "  docker-compose exec web python generar_videos_noticias.py <id_parrilla>"
echo ""
echo "📅 To generate all of today's videos:"
echo "  docker-compose exec web python generar_videos_noticias.py"

scripts/diagnose_rss.py Normal file
@@ -0,0 +1,64 @@
import os

import psycopg2

# Database configuration
DB_WRITE_HOST = os.environ.get("DB_WRITE_HOST", "db")
DB_NAME = os.environ.get("DB_NAME", "rss")
DB_USER = os.environ.get("DB_USER", "rss")
DB_PASS = os.environ.get("DB_PASS", "x")
DB_PORT = os.environ.get("DB_PORT", "5432")


def check_db():
    try:
        conn = psycopg2.connect(
            host=DB_WRITE_HOST,
            database=DB_NAME,
            user=DB_USER,
            password=DB_PASS,
            port=DB_PORT,
            connect_timeout=5
        )
        print("✅ Database connection successful.")
        with conn.cursor() as cur:
            # 1. Total news count and latest date
            cur.execute("SELECT COUNT(*), MAX(fecha) FROM noticias;")
            count, latest = cur.fetchone()
            print(f"📊 Total news: {count}")
            print(f"🕒 Latest news date: {latest}")
            # 2. Feed status
            cur.execute("SELECT COUNT(*) FROM feeds WHERE activo = TRUE;")
            active_feeds = cur.fetchone()[0]
            cur.execute("SELECT COUNT(*) FROM feeds WHERE activo = FALSE;")
            inactive_feeds = cur.fetchone()[0]
            print(f"📡 Active feeds: {active_feeds}")
            print(f"🚫 Inactive feeds: {inactive_feeds}")
            # 3. Feeds with the most failures
            cur.execute("SELECT id, nombre, url, fallos, last_error FROM feeds WHERE fallos > 0 ORDER BY fallos DESC LIMIT 5;")
            failures = cur.fetchall()
            if failures:
                print("\n⚠️ Feeds with most failures:")
                for f in failures:
                    print(f"  - ID {f[0]}: {f[1]} ({f[3]} failures) - Error: {f[4]}")
            else:
                print("\n✅ No feeds with reported failures.")
            # 4. Pending translations. 'noticias' has no translated flag, so
            # count rows with no entry in 'traducciones' (if that table exists).
            cur.execute("SELECT EXISTS (SELECT FROM information_schema.tables WHERE table_name = 'traducciones');")
            if cur.fetchone()[0]:
                cur.execute("SELECT COUNT(*) FROM noticias WHERE id NOT IN (SELECT noticia_id FROM traducciones);")
                pending_trans = cur.fetchone()[0]
                print(f"🌎 News pending translation: {pending_trans}")
        conn.close()
    except Exception as e:
        print(f"❌ Database error: {e}")


if __name__ == "__main__":
    check_db()

@@ -0,0 +1,99 @@
import logging
import os
import ssl
import urllib.request

import nltk
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# ================================================================
# Logging
# ================================================================
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s"
)
LOG = logging.getLogger("download_models")

# ================================================================
# SSL fix: fall back to an unverified HTTPS context on platforms
# without a usable default certificate store.
# ================================================================
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# ================================================================
# Paths and models
# ================================================================
NLTK_PACKAGES = ["punkt", "punkt_tab", "stopwords"]
NLLB_MODEL = "facebook/nllb-200-distilled-600M"
# NOTE: verify this URL if the download fails. Meta publishes the
# 218-language LID model as lid218e.bin under
# https://dl.fbaipublicfiles.com/nllb/lid/, while the path below follows
# the older fasttext supervised-models layout.
FASTTEXT_URL = "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.218.bin"
FASTTEXT_DEST = "/app/models/lid.218.bin"  # where the worker expects it


# ================================================================
# Download NLTK data
# ================================================================
def download_nltk():
    for pkg in NLTK_PACKAGES:
        try:
            path = f"tokenizers/{pkg}" if pkg.startswith("punkt") else f"corpora/{pkg}"
            nltk.data.find(path)
            LOG.info(f"NLTK '{pkg}' already installed")
        except LookupError:
            LOG.info(f"Downloading NLTK '{pkg}'...")
            nltk.download(pkg, quiet=True)
            LOG.info(f"Downloaded OK: {pkg}")


# ================================================================
# Download NLLB
# ================================================================
def download_nllb(model_name: str):
    LOG.info(f"Downloading NLLB model: {model_name}")
    try:
        AutoTokenizer.from_pretrained(model_name)
        AutoModelForSeq2SeqLM.from_pretrained(model_name)
        LOG.info(f"Downloaded OK: {model_name}")
    except Exception as e:
        LOG.error(f"Failed downloading NLLB model {model_name}: {e}")


# ================================================================
# Download fastText LID.218
# ================================================================
def download_fasttext():
    # Create /app/models if it does not exist
    dest_dir = os.path.dirname(FASTTEXT_DEST)
    os.makedirs(dest_dir, exist_ok=True)
    # Skip the download if the file is already there
    if os.path.exists(FASTTEXT_DEST):
        LOG.info(f"fastText LID already exists at {FASTTEXT_DEST}")
        return
    LOG.info(f"Downloading fastText LID model from {FASTTEXT_URL}")
    try:
        urllib.request.urlretrieve(FASTTEXT_URL, FASTTEXT_DEST)
        LOG.info(f"Downloaded fastText LID model to {FASTTEXT_DEST}")
    except Exception as e:
        LOG.error(f"Failed to download fastText LID model: {e}")


# ================================================================
# Main
# ================================================================
if __name__ == "__main__":
    LOG.info("Downloading NLTK data...")
    download_nltk()
    LOG.info("Downloading NLLB model...")
    download_nllb(NLLB_MODEL)
    LOG.info("Downloading fastText LID model...")
    download_fasttext()
    LOG.info("All downloads completed successfully.")

@@ -0,0 +1,71 @@
import html

from db import get_conn


def fix_entities():
    print("🔧 Fixing HTML entities in database...")
    with get_conn() as conn:
        with conn.cursor() as cur:
            # 1. Update noticias
            print("Processing 'noticias' table...")
            cur.execute("""
                SELECT id, titulo, resumen
                FROM noticias
                WHERE titulo LIKE '%&%;%' OR resumen LIKE '%&%;%'
            """)
            rows = cur.fetchall()
            print(f"Found {len(rows)} rows in 'noticias' to check.")
            count = 0
            for r in rows:
                nid, tit, res = r
                new_tit = html.unescape(tit) if tit else tit
                new_res = html.unescape(res) if res else res
                if new_tit != tit or new_res != res:
                    cur.execute("""
                        UPDATE noticias
                        SET titulo = %s, resumen = %s
                        WHERE id = %s
                    """, (new_tit, new_res, nid))
                    count += 1
                    if count % 100 == 0:
                        print(f"Updated {count} noticias...")
            print(f"Updated {count} rows in 'noticias'.")
            # 2. Update traducciones
            print("\nProcessing 'traducciones' table...")
            cur.execute("""
                SELECT id, titulo_trad, resumen_trad
                FROM traducciones
                WHERE titulo_trad LIKE '%&%;%' OR resumen_trad LIKE '%&%;%'
            """)
            rows = cur.fetchall()
            print(f"Found {len(rows)} translations to check.")
            count_tr = 0
            for r in rows:
                tid, tit, res = r
                new_tit = html.unescape(tit) if tit else tit
                new_res = html.unescape(res) if res else res
                if new_tit != tit or new_res != res:
                    cur.execute("""
                        UPDATE traducciones
                        SET titulo_trad = %s, resumen_trad = %s
                        WHERE id = %s
                    """, (new_tit, new_res, tid))
                    count_tr += 1
            print(f"Updated {count_tr} rows in 'traducciones'.")
        conn.commit()
    print("✅ Database cleaning complete.")


if __name__ == "__main__":
    fix_entities()
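The recursive variant that follows exists because some feeds double-escape entities, which a single html.unescape pass cannot fully repair. A quick illustration:

import html

s = "Tom &amp;amp; Jerry"                   # '&' was escaped twice upstream
print(html.unescape(s))                     # 'Tom &amp; Jerry': still escaped once
print(html.unescape(html.unescape(s)))      # 'Tom & Jerry'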

@@ -0,0 +1,92 @@
import html

from db import get_conn


def recursive_unescape(text):
    if not text:
        return text
    # Cap the number of passes to avoid pathological loops on weird edge cases
    max_loops = 5
    current = text
    for _ in range(max_loops):
        new_text = html.unescape(current)
        if new_text == current:
            break
        current = new_text
    return current


def fix_entities_recursive():
    print("🔧 Fixing HTML entities RECURSIVELY in database...")
    with get_conn() as conn:
        with conn.cursor() as cur:
            # 1. Update noticias.
            # Select every row containing '&' so any entity is caught; with
            # only ~13k rows this broad scan is acceptable.
            print("Processing 'noticias' table...")
            cur.execute("""
                SELECT id, titulo, resumen
                FROM noticias
                WHERE titulo LIKE '%&%' OR resumen LIKE '%&%'
            """)
            rows = cur.fetchall()
            print(f"Found {len(rows)} candidates in 'noticias'.")
            count = 0
            for r in rows:
                nid, tit, res = r
                new_tit = recursive_unescape(tit)
                new_res = recursive_unescape(res)
                if new_tit != tit or new_res != res:
                    cur.execute("""
                        UPDATE noticias
                        SET titulo = %s, resumen = %s
                        WHERE id = %s
                    """, (new_tit, new_res, nid))
                    count += 1
                    if count % 100 == 0:
                        print(f"Updated {count} noticias...")
            print(f"Total updated in 'noticias': {count}")
            # 2. Update traducciones
            print("\nProcessing 'traducciones' table...")
            cur.execute("""
                SELECT id, titulo_trad, resumen_trad
                FROM traducciones
                WHERE titulo_trad LIKE '%&%' OR resumen_trad LIKE '%&%'
            """)
            rows = cur.fetchall()
            print(f"Found {len(rows)} candidates in 'traducciones'.")
            count_tr = 0
            for r in rows:
                tid, tit, res = r
                new_tit = recursive_unescape(tit)
                new_res = recursive_unescape(res)
                if new_tit != tit or new_res != res:
                    cur.execute("""
                        UPDATE traducciones
                        SET titulo_trad = %s, resumen_trad = %s
                        WHERE id = %s
                    """, (new_tit, new_res, tid))
                    count_tr += 1
                    if count_tr % 100 == 0:
                        print(f"Updated {count_tr} traducciones...")
            print(f"Total updated in 'traducciones': {count_tr}")
        conn.commit()
    print("✅ Database cleaning complete.")


if __name__ == "__main__":
    fix_entities_recursive()

scripts/migrate_to_qdrant.py Executable file
@@ -0,0 +1,244 @@
#!/usr/bin/env python3
"""
Migration script that vectorizes existing news into Qdrant.

Usage:
    # Show statistics
    python scripts/migrate_to_qdrant.py --stats

    # Vectorize news (full run)
    python scripts/migrate_to_qdrant.py --vectorize --batch-size 200

    # Wipe everything and start over
    python scripts/migrate_to_qdrant.py --reset
"""
import argparse
import os
import sys
import time

# Add the project root to the path
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from db import get_read_conn, get_write_conn


def get_statistics():
    """Print system statistics."""
    print("\n" + "=" * 80)
    print("📊 SYSTEM STATISTICS")
    print("=" * 80)
    with get_read_conn() as conn:
        with conn.cursor() as cur:
            # Total translations
            cur.execute("""
                SELECT
                    COUNT(*) as total,
                    COUNT(*) FILTER (WHERE lang_to = 'es') as es,
                    COUNT(*) FILTER (WHERE status = 'done') as completadas
                FROM traducciones
            """)
            row = cur.fetchone()
            print("\n📰 TRANSLATIONS:")
            print(f"  Total: {row[0]:,}")
            print(f"  In Spanish: {row[1]:,}")
            print(f"  Completed: {row[2]:,}")
            # Vectorization status
            cur.execute("""
                SELECT
                    COUNT(*) as total,
                    COUNT(*) FILTER (WHERE vectorized = TRUE) as vectorizadas,
                    COUNT(*) FILTER (WHERE vectorized = FALSE AND status = 'done') as pendientes
                FROM traducciones
                WHERE lang_to = 'es'
            """)
            row = cur.fetchone()
            print("\n🔧 VECTORIZATION:")
            print(f"  Total (ES): {row[0]:,}")
            print(f"  Vectorized: {row[1]:,}")
            print(f"  Pending: {row[2]:,}")
    # Qdrant info (if reachable)
    try:
        from qdrant_client import QdrantClient
        qdrant_host = os.environ.get("QDRANT_HOST", "localhost")
        qdrant_port = int(os.environ.get("QDRANT_PORT", "6333"))
        collection_name = os.environ.get("QDRANT_COLLECTION_NAME", "news_vectors")
        client = QdrantClient(host=qdrant_host, port=qdrant_port)
        collection_info = client.get_collection(collection_name)
        print("\n🔍 QDRANT:")
        print(f"  Collection: {collection_name}")
        print(f"  Points: {collection_info.points_count:,}")
        print(f"  Vectors: {collection_info.vectors_count:,}")
    except Exception as e:
        print(f"\n⚠️ Could not connect to Qdrant: {e}")
    print("\n" + "=" * 80 + "\n")


def vectorize_all(batch_size: int = 200):
    """Vectorize all pending translated news."""
    print("\n" + "=" * 80)
    print("🔍 STARTING BULK VECTORIZATION")
    print("=" * 80)
    print(f"Batch size: {batch_size}")
    print("=" * 80 + "\n")
    # Import the Qdrant worker helpers
    from workers.qdrant_worker import (
        init_qdrant_client,
        init_embedding_model,
        get_pending_news,
        upload_to_qdrant
    )
    # Initialize
    print("🔌 Initializing Qdrant...")
    init_qdrant_client()
    print("🤖 Loading embedding model...")
    init_embedding_model()
    total_processed = 0
    start_time = time.time()
    while True:
        # Fetch the next pending batch
        news_batch = get_pending_news(limit=batch_size)
        if not news_batch:
            print("\n✅ No more news pending vectorization")
            break
        print(f"\n📋 Processing batch of {len(news_batch)} news items...")
        try:
            upload_to_qdrant(news_batch)
            total_processed += len(news_batch)
            elapsed = time.time() - start_time
            rate = total_processed / elapsed if elapsed > 0 else 0
            print(f"\n📊 Progress: {total_processed:,} vectorized")
            print(f"⏱️ Rate: {rate:.2f} news/second")
            print(f"⏳ Elapsed: {elapsed/60:.1f} minutes")
        except Exception as e:
            print(f"❌ Error processing batch: {e}")
            break
    elapsed = time.time() - start_time
    print("\n" + "=" * 80)
    print("✅ VECTORIZATION COMPLETE")
    print("=" * 80)
    print(f"Total vectorized: {total_processed:,}")
    print(f"Total time: {elapsed/60:.1f} minutes")
    print(f"Average rate: {total_processed/elapsed:.2f} news/second")
    print("=" * 80 + "\n")


def reset_all():
    """Reset the vectorization state and wipe Qdrant."""
    print("\n" + "=" * 80)
    print("⚠️ FULL RESET OF THE VECTOR SYSTEM")
    print("=" * 80)
    response = input("\nAre you sure? This deletes ALL vectors and resets the state (y/N): ")
    if response.lower() not in ("y", "s"):  # accept 'y' and the legacy Spanish 's'
        print("❌ Operation cancelled")
        return
    print("\n🗑️ Resetting database...")
    with get_write_conn() as conn:
        with conn.cursor() as cur:
            # Clear the vectorization flags
            cur.execute("""
                UPDATE traducciones
                SET vectorized = FALSE,
                    qdrant_point_id = NULL,
                    vectorization_date = NULL
            """)
        conn.commit()
    print("✅ Vectorization flags reset in PostgreSQL")
    # Wipe Qdrant
    try:
        from qdrant_client import QdrantClient
        from qdrant_client.models import Distance, VectorParams
        qdrant_host = os.environ.get("QDRANT_HOST", "localhost")
        qdrant_port = int(os.environ.get("QDRANT_PORT", "6333"))
        collection_name = os.environ.get("QDRANT_COLLECTION_NAME", "news_vectors")
        client = QdrantClient(host=qdrant_host, port=qdrant_port)
        # Drop the collection
        client.delete_collection(collection_name)
        print(f"✅ Collection '{collection_name}' deleted from Qdrant")
        # Recreate it
        client.create_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(size=384, distance=Distance.COSINE)
        )
        print(f"✅ Collection '{collection_name}' recreated")
    except Exception as e:
        print(f"⚠️ Error cleaning Qdrant: {e}")
    print("\n✅ Reset complete\n")


def main():
    parser = argparse.ArgumentParser(
        description="Qdrant migration script (direct)",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__
    )
    parser.add_argument("--stats", action="store_true", help="Show statistics")
    parser.add_argument("--vectorize", action="store_true", help="Vectorize translated news")
    parser.add_argument("--reset", action="store_true", help="Wipe Qdrant and reset state")
    parser.add_argument("--batch-size", type=int, default=200, help="Batch size (default: 200)")
    args = parser.parse_args()
    # Default to showing statistics when no option is given
    if not any([args.stats, args.vectorize, args.reset]):
        args.stats = True
    try:
        if args.stats:
            get_statistics()
        if args.reset:
            reset_all()
        if args.vectorize:
            vectorize_all(batch_size=args.batch_size)
    except KeyboardInterrupt:
        print("\n\n⏹️ Interrupted by user")
    except Exception as e:
        print(f"\n❌ Error: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()
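workers.qdrant_worker is imported above but not shown in this commit view. For orientation only, here is a hypothetical sketch of what its helpers plausibly do; the function names come from the import, everything else is an assumption (the 384-dimension cosine collection created by --reset matches MiniLM-class sentence-transformers models):

from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct
from sentence_transformers import SentenceTransformer

from db import get_read_conn

_client = None
_model = None


def init_qdrant_client():
    global _client
    _client = QdrantClient(host="qdrant", port=6333)  # host name is an assumption


def init_embedding_model():
    global _model
    # 384 dimensions, matching the collection created by --reset
    _model = SentenceTransformer("all-MiniLM-L6-v2")


def get_pending_news(limit=200):
    with get_read_conn() as conn, conn.cursor() as cur:
        cur.execute("""
            SELECT id, titulo_trad, resumen_trad
            FROM traducciones
            WHERE lang_to = 'es' AND status = 'done' AND vectorized = FALSE
            LIMIT %s
        """, (limit,))
        return cur.fetchall()


def upload_to_qdrant(batch):
    texts = [f"{t or ''} {r or ''}".strip() for _, t, r in batch]
    vectors = _model.encode(texts)
    points = [PointStruct(id=row_id, vector=vec.tolist(), payload={})
              for (row_id, _, _), vec in zip(batch, vectors)]
    _client.upsert(collection_name="news_vectors", points=points)
    # The real worker must also set vectorized = TRUE in PostgreSQL,
    # otherwise the migration loop above would never terminate.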

@@ -0,0 +1,70 @@
import logging
import os
import sys
from concurrent.futures import ThreadPoolExecutor

# Add the app root to the path
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from db import get_read_conn
from utils.wiki import fetch_wiki_data

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


def get_top_entities():
    """Get the top 100 people, 50 organizations and 50 places from the last 30 days."""
    entities = []
    query = """
        SELECT t.valor, COUNT(*) as c
        FROM tags t
        JOIN tags_noticia tn ON t.id = tn.tag_id
        JOIN traducciones tr ON tn.traduccion_id = tr.id
        WHERE tr.created_at > NOW() - INTERVAL '30 days'
          AND t.tipo = %s
        GROUP BY t.valor
        ORDER BY c DESC
        LIMIT %s
    """
    try:
        with get_read_conn() as conn:
            with conn.cursor() as cur:
                # People
                cur.execute(query, ('persona', 100))
                entities.extend([row[0] for row in cur.fetchall()])
                # Organizations
                cur.execute(query, ('organizacion', 50))
                entities.extend([row[0] for row in cur.fetchall()])
                # Places
                cur.execute(query, ('lugar', 50))
                entities.extend([row[0] for row in cur.fetchall()])
    except Exception as e:
        logger.error(f"Error fetching top entities: {e}")
    return list(set(entities))


def precache_entity(name):
    try:
        img, summary = fetch_wiki_data(name)
        if img or summary:
            logger.info(f"✓ Cached: {name}")
        else:
            logger.info(f"✗ No data for: {name}")
    except Exception as e:
        logger.error(f"Error caching {name}: {e}")


def run_precache():
    logger.info("Starting entity pre-cache...")
    entities = get_top_entities()
    logger.info(f"Found {len(entities)} unique top entities to cache.")
    with ThreadPoolExecutor(max_workers=10) as executor:
        executor.map(precache_entity, entities)
    logger.info("Pre-cache complete.")


if __name__ == "__main__":
    run_precache()
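utils.wiki.fetch_wiki_data is not shown in this commit view either; judging by the (img, summary) tuple it returns, it plausibly wraps the Wikipedia REST summary endpoint. A hypothetical sketch only:

import requests


def fetch_wiki_data(name, lang="es"):
    # Hypothetical implementation; the language choice, caching and error
    # handling in the real utils.wiki likely differ.
    url = f"https://{lang}.wikipedia.org/api/rest_v1/page/summary/{name.replace(' ', '_')}"
    resp = requests.get(url, timeout=10)
    if resp.status_code != 200:
        return None, None
    data = resp.json()
    img = (data.get("thumbnail") or {}).get("source")
    return img, data.get("extract")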

scripts/recover_system.py Normal file
@@ -0,0 +1,44 @@
import logging
import os
from datetime import datetime

import psycopg2

logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s: %(message)s")
logger = logging.getLogger("recover_system")

DB_CONFIG = {
    "host": os.environ.get("DB_HOST", "localhost"),
    "port": int(os.environ.get("DB_PORT", 5432)),
    "dbname": os.environ.get("DB_NAME", "rss"),
    "user": os.environ.get("DB_USER", "rss"),
    "password": os.environ.get("DB_PASS", "x"),
}


def recover():
    try:
        conn = psycopg2.connect(**DB_CONFIG)
        conn.autocommit = True
        with conn.cursor() as cur:
            # 1. Reset stuck translations
            logger.info("Resetting stuck 'processing' translations to 'pending'...")
            cur.execute("UPDATE traducciones SET status = 'pending' WHERE status = 'processing';")
            logger.info(f"Reset {cur.rowcount} translations.")
            # 2. Correct future-dated news
            logger.info("Correcting future-dated news...")
            now = datetime.utcnow()
            cur.execute("UPDATE noticias SET fecha = %s WHERE fecha > %s;", (now, now))
            logger.info(f"Corrected {cur.rowcount} news items.")
            # 3. Reactivate inactive feeds with fewer than 30 failures,
            # giving them another chance
            logger.info("Reactivating inactive feeds with fewer than 30 failures...")
            cur.execute("UPDATE feeds SET activo = TRUE, fallos = 0 WHERE activo = FALSE AND fallos < 30;")
            logger.info(f"Reactivated {cur.rowcount} feeds.")
        conn.close()
        logger.info("Recovery complete!")
    except Exception as e:
        logger.error(f"Error during recovery: {e}")


if __name__ == "__main__":
    recover()

@@ -0,0 +1,95 @@
#!/usr/bin/env python3
"""
Diagnostic script that checks connectivity to Qdrant.
Run it from the rss2_web container to debug network problems.
"""
import os
import socket
import sys


def test_qdrant_connection():
    """Test the Qdrant connection and print diagnostic information."""
    # Configuration
    qdrant_host = os.environ.get("QDRANT_HOST", "localhost")
    qdrant_port = int(os.environ.get("QDRANT_PORT", "6333"))
    print("=" * 60)
    print("🔍 QDRANT CONNECTION DIAGNOSTICS")
    print("=" * 60)
    print(f"Host: {qdrant_host}")
    print(f"Port: {qdrant_port}")
    print()
    # 1. DNS resolution test
    print("1️⃣ Testing DNS resolution...")
    try:
        ip = socket.gethostbyname(qdrant_host)
        print(f"   ✅ Host '{qdrant_host}' resolves to: {ip}")
    except Exception as e:
        print(f"   ❌ ERROR: could not resolve '{qdrant_host}': {e}")
        return False
    # 2. TCP connectivity test
    print("\n2️⃣ Testing TCP connectivity...")
    try:
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        sock.settimeout(5)
        result = sock.connect_ex((ip, qdrant_port))
        sock.close()
        if result == 0:
            print(f"   ✅ Port {qdrant_port} is open")
        else:
            print(f"   ❌ ERROR: port {qdrant_port} is closed or unreachable")
            return False
    except Exception as e:
        print(f"   ❌ ERROR in TCP test: {e}")
        return False
    # 3. Qdrant client test
    print("\n3️⃣ Testing the Qdrant client...")
    try:
        from qdrant_client import QdrantClient
        client = QdrantClient(host=qdrant_host, port=qdrant_port, timeout=5)
        collections = client.get_collections()
        print("   ✅ Qdrant client connected successfully")
        print(f"   📊 Available collections: {[c.name for c in collections.collections]}")
        # Per-collection point counts
        for collection in collections.collections:
            try:
                info = client.get_collection(collection.name)
                print(f"   📁 {collection.name}: {info.points_count} vectors")
            except Exception as e:
                print(f"   ⚠️ Could not get info for {collection.name}: {e}")
        return True
    except Exception as e:
        print(f"   ❌ ERROR in Qdrant client: {e}")
        import traceback
        traceback.print_exc()
        return False


if __name__ == "__main__":
    success = test_qdrant_connection()
    if success:
        print("\n✅ DIAGNOSTICS PASSED: Qdrant is reachable")
        sys.exit(0)
    else:
        print("\n❌ DIAGNOSTICS FAILED: connectivity problems with Qdrant")
        print("\n💡 POSSIBLE FIXES:")
        print("   1. Check that the 'qdrant' container is running:")
        print("      docker ps | grep qdrant")
        print("   2. Check that both containers share the same network:")
        print("      docker network inspect rss2_default")
        print("   3. Restart the Qdrant container:")
        print("      docker restart rss2_qdrant")
        print("   4. Check the QDRANT_HOST and QDRANT_PORT environment variables")
        sys.exit(1)

@@ -0,0 +1,54 @@
import sys

# Add app to path
sys.path.append('/home/x/rss2')

try:
    from db import get_conn, get_read_conn, get_write_conn
    from cache import get_redis
    import psycopg2
    print("Imports successful.")
except ImportError as e:
    print(f"Import failed: {e}")
    sys.exit(1)


def test_db():
    print("\n--- Testing Database Connections ---")
    print("Testing Primary (Write) Connection...")
    try:
        with get_write_conn() as conn:
            with conn.cursor() as cur:
                cur.execute("SELECT 1")
        print("  [OK] Primary reachable.")
    except Exception as e:
        print(f"  [FAIL] Primary unreachable: {e}")
    print("Testing Replica (Read) Connection...")
    try:
        with get_read_conn() as conn:
            with conn.cursor() as cur:
                cur.execute("SELECT 1")
        # Only connectivity is checked here; the replica usually runs in
        # read-only mode, but that is not asserted.
        print("  [OK] Replica reachable.")
    except Exception as e:
        print(f"  [FAIL] Replica unreachable: {e}")


def test_redis():
    print("\n--- Testing Redis Connection ---")
    try:
        r = get_redis()
        if r:
            r.ping()
            print("  [OK] Redis reachable.")
        else:
            print("  [FAIL] Redis client returned None (likely connection failed).")
    except Exception as e:
        print(f"  [FAIL] Redis error: {e}")


if __name__ == "__main__":
    test_db()
    test_redis()
    print("\nVerification complete.")