Initial clean commit

commit 6784d81c2c
141 changed files with 25219 additions and 0 deletions
scripts/clean_unk_tokens.py (new file, 67 lines)
@@ -0,0 +1,67 @@
#!/usr/bin/env python3
"""
Clean <unk> tokens out of stored translations.
"""
import re

from db import get_conn


def clean_text(text):
    """Remove <unk> tokens and other problematic characters."""
    if not text:
        return text
    # Remove <unk> tokens and U+FFFD replacement characters
    # (\ufffd is the character whose raw UTF-8 bytes are EF BF BD)
    text = text.replace('<unk>', '')
    text = text.replace('\ufffd', '')
    # Remove other problematic Unicode control characters
    text = re.sub(r'[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F-\x9F]', '', text)
    return text.strip()


def main():
    """Clean all translations that contain <unk> tokens."""
    print("🧹 Cleaning <unk> tokens from translations...")

    with get_conn() as conn:
        with conn.cursor() as cur:
            # Find translations with <unk> tokens or replacement characters
            cur.execute("""
                SELECT id, titulo_trad, resumen_trad
                FROM traducciones
                WHERE titulo_trad LIKE '%<unk>%'
                   OR resumen_trad LIKE '%<unk>%'
                   OR titulo_trad LIKE '%\ufffd%'
                   OR resumen_trad LIKE '%\ufffd%'
            """)

            translations = cur.fetchall()
            print(f"📊 Found {len(translations)} translations with problematic tokens")

            if not translations:
                print("✅ Nothing to clean")
                return

            updated_count = 0
            for row in translations:
                tr_id, titulo, resumen = row

                # Clean the fields
                new_titulo = clean_text(titulo) if titulo else titulo
                new_resumen = clean_text(resumen) if resumen else resumen

                # Update only if something changed
                if new_titulo != titulo or new_resumen != resumen:
                    cur.execute("""
                        UPDATE traducciones
                        SET titulo_trad = %s,
                            resumen_trad = %s
                        WHERE id = %s
                    """, (new_titulo, new_resumen, tr_id))
                    updated_count += 1

                    if updated_count % 100 == 0:
                        print(f"  ⏳ Processed {updated_count} translations...")

            conn.commit()
            print(f"✅ Cleanup complete: {updated_count} translations updated")


if __name__ == "__main__":
    main()
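A quick sanity check of the cleaning logic, as a sketch (it assumes scripts/ is on PYTHONPATH so clean_text can be imported directly):

from clean_unk_tokens import clean_text

assert clean_text('Hola <unk>mundo\x00') == 'Hola mundo'  # token and control char removed
assert clean_text('T\ufffdtulo') == 'Ttulo'               # replacement character removed
assert clean_text(None) is None                           # empty input passes through
print("clean_text OK")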
scripts/convert_model.sh (new executable file, 39 lines)
@@ -0,0 +1,39 @@
#!/bin/bash
# Convert the NLLB model from HuggingFace format to CTranslate2 format.
# Run once before using the translation_worker with CTranslate2.

set -e

MODEL=${UNIVERSAL_MODEL:-"facebook/nllb-200-distilled-600M"}
OUTPUT_DIR=${CT2_MODEL_PATH:-"./models/nllb-ct2"}
QUANTIZATION=${CT2_QUANTIZATION:-"int8_float16"}

echo "=== NLLB to CTranslate2 model conversion ==="
echo "Source model: $MODEL"
echo "Output directory: $OUTPUT_DIR"
echo "Quantization: $QUANTIZATION"
echo ""

# Check that ctranslate2 is installed
if ! command -v ct2-transformers-converter &> /dev/null; then
    echo "Error: ct2-transformers-converter not found."
    echo "Install it with: pip install ctranslate2"
    exit 1
fi

# Create the parent directory if it does not exist
mkdir -p "$(dirname "$OUTPUT_DIR")"

# Convert the model
echo "Starting conversion (may take 5-10 minutes)..."
ct2-transformers-converter \
    --model "$MODEL" \
    --output_dir "$OUTPUT_DIR" \
    --quantization "$QUANTIZATION" \
    --force

echo ""
echo "✓ Conversion complete: $OUTPUT_DIR"
echo ""
echo "To use the model, set:"
echo "  export CT2_MODEL_PATH=$OUTPUT_DIR"
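Once converted, the model can be loaded with the CTranslate2 Python API. A minimal sketch following the standard CTranslate2 NLLB recipe (paths and language codes are illustrative; this is not necessarily how translation_worker wires it up):

import ctranslate2
import transformers

# Load the converted model and the matching HF tokenizer
translator = ctranslate2.Translator("./models/nllb-ct2", device="cpu")
tokenizer = transformers.AutoTokenizer.from_pretrained(
    "facebook/nllb-200-distilled-600M", src_lang="eng_Latn"
)

# NLLB expects tokenized input and a target-language prefix token
source = tokenizer.convert_ids_to_tokens(tokenizer.encode("Hello, world!"))
results = translator.translate_batch([source], target_prefix=[["spa_Latn"]])
target = results[0].hypotheses[0][1:]  # drop the spa_Latn prefix token
print(tokenizer.decode(tokenizer.convert_tokens_to_ids(target)))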
scripts/crear_parrillas_ejemplo.sh (new file, 77 lines)
@@ -0,0 +1,77 @@
#!/bin/bash
# Example script that creates video "parrillas" (news-video grids).
# Note: ON CONFLICT DO NOTHING only prevents duplicates if video_parrillas
# has a unique constraint (e.g. on nombre); otherwise re-running inserts again.

echo "🎬 Creating example parrillas..."

# 1. News from Bulgaria
docker-compose exec -T db psql -U rss -d rss << EOF
INSERT INTO video_parrillas (
    nombre, descripcion, tipo_filtro,
    pais_id, max_noticias, duracion_maxima,
    idioma_voz, template, include_images, include_subtitles,
    frecuencia, activo
) VALUES (
    'Noticias de Bulgaria',
    'Resumen diario de las noticias más importantes de Bulgaria',
    'pais',
    (SELECT id FROM paises WHERE nombre ILIKE '%bulgaria%' LIMIT 1),
    5, 180,
    'es', 'standard', true, true,
    'daily', true
) ON CONFLICT DO NOTHING;
EOF

# 2. Science in Europe
docker-compose exec -T db psql -U rss -d rss << EOF
INSERT INTO video_parrillas (
    nombre, descripcion, tipo_filtro,
    categoria_id, continente_id, max_noticias,
    idioma_voz, template, include_subtitles,
    frecuencia, activo
) VALUES (
    'Ciencia en Europa',
    'Las últimas noticias científicas de Europa',
    'categoria',
    (SELECT id FROM categorias WHERE nombre ILIKE '%ciencia%' LIMIT 1),
    (SELECT id FROM continentes WHERE nombre = 'Europa' LIMIT 1),
    7,
    'es', 'modern', true,
    'daily', true
) ON CONFLICT DO NOTHING;
EOF

# 3. Global technology
docker-compose exec -T db psql -U rss -d rss << EOF
INSERT INTO video_parrillas (
    nombre, descripcion, tipo_filtro,
    categoria_id, max_noticias, duracion_maxima,
    idioma_voz, template, include_subtitles,
    frecuencia, activo
) VALUES (
    'Tech News Daily',
    'Resumen diario de tecnología mundial',
    'categoria',
    (SELECT id FROM categorias WHERE nombre ILIKE '%tecnolog%' LIMIT 1),
    8, 300,
    'es', 'modern', true,
    'daily', true
) ON CONFLICT DO NOTHING;
EOF

echo "✅ Parrillas created!"
echo ""
echo "📊 List the parrillas just created:"
docker-compose exec -T db psql -U rss -d rss -c "
SELECT id, nombre, tipo_filtro, max_noticias, frecuencia, activo
FROM video_parrillas
ORDER BY id DESC;
"

echo ""
echo "🎥 Open the web interface at: http://localhost:8001/parrillas/"
echo ""
echo "💡 To generate one video manually:"
echo "   docker-compose exec web python generar_videos_noticias.py <id_parrilla>"
echo ""
echo "📅 To generate all of today's videos:"
echo "   docker-compose exec web python generar_videos_noticias.py"
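The same "list parrillas" check can also be run from Python, as a sketch (it assumes the db.get_conn helper used by the other scripts in this commit):

from db import get_conn

with get_conn() as conn:
    with conn.cursor() as cur:
        cur.execute("""
            SELECT id, nombre, tipo_filtro, max_noticias, frecuencia, activo
            FROM video_parrillas
            ORDER BY id DESC
        """)
        for row in cur.fetchall():
            print(row)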
scripts/diagnose_rss.py (new file, 64 lines)
@@ -0,0 +1,64 @@
import os
import psycopg2

# Database configuration
DB_WRITE_HOST = os.environ.get("DB_WRITE_HOST", "db")
DB_NAME = os.environ.get("DB_NAME", "rss")
DB_USER = os.environ.get("DB_USER", "rss")
DB_PASS = os.environ.get("DB_PASS", "x")
DB_PORT = os.environ.get("DB_PORT", "5432")

def check_db():
    try:
        conn = psycopg2.connect(
            host=DB_WRITE_HOST,
            database=DB_NAME,
            user=DB_USER,
            password=DB_PASS,
            port=DB_PORT,
            connect_timeout=5
        )
        print("✅ Database connection successful.")

        with conn.cursor() as cur:
            # 1. Total news and latest date
            cur.execute("SELECT COUNT(*), MAX(fecha) FROM noticias;")
            count, latest = cur.fetchone()
            print(f"📊 Total news: {count}")
            print(f"🕒 Latest news date: {latest}")

            # 2. Feed status
            cur.execute("SELECT COUNT(*) FROM feeds WHERE activo = TRUE;")
            active_feeds = cur.fetchone()[0]
            cur.execute("SELECT COUNT(*) FROM feeds WHERE activo = FALSE;")
            inactive_feeds = cur.fetchone()[0]
            print(f"📡 Active feeds: {active_feeds}")
            print(f"🚫 Inactive feeds: {inactive_feeds}")

            # 3. Feeds with the most failures
            cur.execute("SELECT id, nombre, url, fallos, last_error FROM feeds WHERE fallos > 0 ORDER BY fallos DESC LIMIT 5;")
            failures = cur.fetchall()
            if failures:
                print("\n⚠️ Feeds with most failures:")
                for f in failures:
                    print(f"  - ID {f[0]}: {f[1]} ({f[3]} failures) - Error: {f[4]}")
            else:
                print("\n✅ No feeds with reported failures.")

            # 4. Check for unprocessed translations. 'noticias' has no
            # translated flag of its own, so count against 'traducciones'
            # if that table exists.
            cur.execute("SELECT EXISTS (SELECT FROM information_schema.tables WHERE table_name = 'traducciones');")
            if cur.fetchone()[0]:
                cur.execute("SELECT COUNT(*) FROM noticias WHERE id NOT IN (SELECT noticia_id FROM traducciones);")
                pending_trans = cur.fetchone()[0]
                print(f"🌎 News pending translation: {pending_trans}")

        conn.close()
    except Exception as e:
        print(f"❌ Database error: {e}")

if __name__ == "__main__":
    check_db()
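One caveat in step 4: NOT IN against a subquery matches nothing at all if traducciones.noticia_id ever contains a NULL. An anti-join is a safer equivalent, sketched here under the same schema assumptions:

# Sketch: NULL-safe replacement for the NOT IN pending-translation count
cur.execute("""
    SELECT COUNT(*)
    FROM noticias n
    LEFT JOIN traducciones t ON t.noticia_id = n.id
    WHERE t.id IS NULL;
""")
pending_trans = cur.fetchone()[0]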
scripts/download_models.py (new file, 99 lines)
@@ -0,0 +1,99 @@
import logging
import ssl
import nltk
import os
import urllib.request
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# ================================================================
# Logging
# ================================================================
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s"
)
LOG = logging.getLogger("download_models")

# ================================================================
# SSL fix: fall back to unverified HTTPS contexts for environments
# with broken certificate stores
# ================================================================
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# ================================================================
# Paths and models
# ================================================================
NLTK_PACKAGES = ["punkt", "punkt_tab", "stopwords"]

NLLB_MODEL = "facebook/nllb-200-distilled-600M"

FASTTEXT_URL = "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.218.bin"
FASTTEXT_DEST = "/app/models/lid.218.bin"  # where the worker expects it


# ================================================================
# Download NLTK data
# ================================================================
def download_nltk():
    for pkg in NLTK_PACKAGES:
        try:
            path = f"tokenizers/{pkg}" if pkg.startswith("punkt") else f"corpora/{pkg}"
            nltk.data.find(path)
            LOG.info(f"NLTK '{pkg}' already installed")
        except LookupError:
            LOG.info(f"Downloading NLTK '{pkg}'...")
            nltk.download(pkg, quiet=True)
            LOG.info(f"Downloaded OK: {pkg}")

# ================================================================
# Download NLLB
# ================================================================
def download_nllb(model_name: str):
    LOG.info(f"Downloading NLLB model: {model_name}")
    try:
        AutoTokenizer.from_pretrained(model_name)
        AutoModelForSeq2SeqLM.from_pretrained(model_name)
        LOG.info(f"Downloaded OK: {model_name}")
    except Exception as e:
        LOG.error(f"Failed downloading NLLB model {model_name}: {e}")

# ================================================================
# Download fastText LID.218
# ================================================================
def download_fasttext():
    # Create /app/models if it does not exist
    dest_dir = os.path.dirname(FASTTEXT_DEST)
    os.makedirs(dest_dir, exist_ok=True)

    # Skip the download if the file already exists
    if os.path.exists(FASTTEXT_DEST):
        LOG.info(f"fastText LID already exists at {FASTTEXT_DEST}")
        return

    LOG.info(f"Downloading fastText LID model from {FASTTEXT_URL}")

    try:
        urllib.request.urlretrieve(FASTTEXT_URL, FASTTEXT_DEST)
        LOG.info(f"Downloaded fastText LID model to {FASTTEXT_DEST}")
    except Exception as e:
        LOG.error(f"Failed to download fastText LID model: {e}")

# ================================================================
# Main
# ================================================================
if __name__ == "__main__":
    LOG.info("Downloading NLTK data...")
    download_nltk()

    LOG.info("Downloading NLLB model...")
    download_nllb(NLLB_MODEL)

    LOG.info("Downloading fastText LID model...")
    download_fasttext()

    LOG.info("All downloads completed successfully.")
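A quick way to confirm the LID model downloaded intact, as a sketch (it assumes the fasttext pip package; predict returns fastText's __label__-prefixed language labels):

import fasttext

model = fasttext.load_model("/app/models/lid.218.bin")
labels, probs = model.predict("Hello, world!")
print(labels[0], round(float(probs[0]), 3))  # e.g. __label__eng_Latn 0.98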
scripts/fix_html_entities.py (new file, 71 lines)
@@ -0,0 +1,71 @@
import html

from db import get_conn


def fix_entities():
    print("🔧 Fixing HTML entities in database...")

    with get_conn() as conn:
        with conn.cursor() as cur:
            # 1. Update noticias
            print("Processing 'noticias' table...")
            cur.execute("""
                SELECT id, titulo, resumen
                FROM noticias
                WHERE titulo LIKE '%&%;%' OR resumen LIKE '%&%;%'
            """)
            rows = cur.fetchall()
            print(f"Found {len(rows)} rows in 'noticias' to check.")

            count = 0
            for r in rows:
                nid, tit, res = r

                new_tit = html.unescape(tit) if tit else tit
                new_res = html.unescape(res) if res else res

                if new_tit != tit or new_res != res:
                    cur.execute("""
                        UPDATE noticias
                        SET titulo = %s, resumen = %s
                        WHERE id = %s
                    """, (new_tit, new_res, nid))
                    count += 1
                    if count % 100 == 0:
                        print(f"Updated {count} noticias...")

            print(f"Updated {count} rows in 'noticias'.")

            # 2. Update traducciones
            print("\nProcessing 'traducciones' table...")
            cur.execute("""
                SELECT id, titulo_trad, resumen_trad
                FROM traducciones
                WHERE titulo_trad LIKE '%&%;%' OR resumen_trad LIKE '%&%;%'
            """)
            rows = cur.fetchall()
            print(f"Found {len(rows)} translations to check.")

            count_tr = 0
            for r in rows:
                tid, tit, res = r

                new_tit = html.unescape(tit) if tit else tit
                new_res = html.unescape(res) if res else res

                if new_tit != tit or new_res != res:
                    cur.execute("""
                        UPDATE traducciones
                        SET titulo_trad = %s, resumen_trad = %s
                        WHERE id = %s
                    """, (new_tit, new_res, tid))
                    count_tr += 1

            print(f"Updated {count_tr} rows in 'traducciones'.")

            conn.commit()
            print("✅ Database cleaning complete.")


if __name__ == "__main__":
    fix_entities()
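Note that html.unescape resolves exactly one level of escaping per call, so double-escaped text keeps one layer of entities; that is the gap the recursive variant below closes. A small illustration:

import html

print(html.unescape("Caf&eacute;"))          # Café
print(html.unescape("Tom &amp;amp; Jerry"))  # Tom &amp; Jerry (still escaped once)
print(html.unescape(html.unescape("Tom &amp;amp; Jerry")))  # Tom & Jerry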
scripts/fix_html_recursive.py (new file, 92 lines)
@@ -0,0 +1,92 @@
import html

from db import get_conn


def recursive_unescape(text):
    if not text:
        return text

    # Bound the loop to avoid spinning on weird edge cases
    max_loops = 5
    current = text

    for _ in range(max_loops):
        new_text = html.unescape(current)
        if new_text == current:
            break
        current = new_text

    return current


def fix_entities_recursive():
    print("🔧 Fixing HTML entities RECURSIVELY in database...")

    with get_conn() as conn:
        with conn.cursor() as cur:
            # 1. Update noticias
            print("Processing 'noticias' table...")
            # Select every row containing '&' so any entity is caught.
            # This could be slow on a huge table, but at ~13k rows it is fine.
            cur.execute("""
                SELECT id, titulo, resumen
                FROM noticias
                WHERE titulo LIKE '%&%' OR resumen LIKE '%&%'
            """)
            rows = cur.fetchall()
            print(f"Found {len(rows)} candidates in 'noticias'.")

            count = 0
            for r in rows:
                nid, tit, res = r

                new_tit = recursive_unescape(tit)
                new_res = recursive_unescape(res)

                if new_tit != tit or new_res != res:
                    cur.execute("""
                        UPDATE noticias
                        SET titulo = %s, resumen = %s
                        WHERE id = %s
                    """, (new_tit, new_res, nid))
                    count += 1
                    if count % 100 == 0:
                        print(f"Updated {count} noticias...")

            print(f"Total updated in 'noticias': {count}")

            # 2. Update traducciones
            print("\nProcessing 'traducciones' table...")
            cur.execute("""
                SELECT id, titulo_trad, resumen_trad
                FROM traducciones
                WHERE titulo_trad LIKE '%&%' OR resumen_trad LIKE '%&%'
            """)
            rows = cur.fetchall()
            print(f"Found {len(rows)} candidates in 'traducciones'.")

            count_tr = 0
            for r in rows:
                tid, tit, res = r

                new_tit = recursive_unescape(tit)
                new_res = recursive_unescape(res)

                if new_tit != tit or new_res != res:
                    cur.execute("""
                        UPDATE traducciones
                        SET titulo_trad = %s, resumen_trad = %s
                        WHERE id = %s
                    """, (new_tit, new_res, tid))
                    count_tr += 1
                    if count_tr % 100 == 0:
                        print(f"Updated {count_tr} traducciones...")

            print(f"Total updated in 'traducciones': {count_tr}")

            conn.commit()
            print("✅ Database cleaning complete.")


if __name__ == "__main__":
    fix_entities_recursive()
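The bounded loop converges as soon as a pass changes nothing, so up to five levels of stacked escaping unwind in one call. A sketch (assumes scripts/ is on PYTHONPATH):

from fix_html_recursive import recursive_unescape

print(recursive_unescape("Tom &amp;amp;amp; Jerry"))   # Tom & Jerry (three levels deep)
print(recursive_unescape("already clean & simple"))    # unchanged; loop exits after one pass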
scripts/migrate_to_qdrant.py (new executable file, 244 lines)
@@ -0,0 +1,244 @@
#!/usr/bin/env python3
"""
Migration script that vectorizes existing news into Qdrant.

Usage:
    # Show statistics
    python scripts/migrate_to_qdrant.py --stats

    # Vectorize news (full run)
    python scripts/migrate_to_qdrant.py --vectorize --batch-size 200

    # Wipe everything and start over
    python scripts/migrate_to_qdrant.py --reset
"""

import os
import sys
import argparse
import time

# Add the project root to the path
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from db import get_read_conn, get_write_conn


def get_statistics():
    """
    Print system statistics.
    """
    print("\n" + "=" * 80)
    print("📊 SYSTEM STATISTICS")
    print("=" * 80)

    with get_read_conn() as conn:
        with conn.cursor() as cur:
            # Total translations
            cur.execute("""
                SELECT
                    COUNT(*) as total,
                    COUNT(*) FILTER (WHERE lang_to = 'es') as es,
                    COUNT(*) FILTER (WHERE status = 'done') as completadas
                FROM traducciones
            """)
            row = cur.fetchone()
            print(f"\n📰 TRANSLATIONS:")
            print(f"  Total: {row[0]:,}")
            print(f"  In Spanish: {row[1]:,}")
            print(f"  Completed: {row[2]:,}")

            # Vectorization status
            cur.execute("""
                SELECT
                    COUNT(*) as total,
                    COUNT(*) FILTER (WHERE vectorized = TRUE) as vectorizadas,
                    COUNT(*) FILTER (WHERE vectorized = FALSE AND status = 'done') as pendientes
                FROM traducciones
                WHERE lang_to = 'es'
            """)
            row = cur.fetchone()
            print(f"\n🔧 VECTORIZATION:")
            print(f"  Total (ES): {row[0]:,}")
            print(f"  Vectorized: {row[1]:,}")
            print(f"  Pending: {row[2]:,}")

            # Qdrant info (if reachable)
            try:
                from qdrant_client import QdrantClient
                qdrant_host = os.environ.get("QDRANT_HOST", "localhost")
                qdrant_port = int(os.environ.get("QDRANT_PORT", "6333"))
                collection_name = os.environ.get("QDRANT_COLLECTION_NAME", "news_vectors")

                client = QdrantClient(host=qdrant_host, port=qdrant_port)
                collection_info = client.get_collection(collection_name)

                print(f"\n🔍 QDRANT:")
                print(f"  Collection: {collection_name}")
                print(f"  Points: {collection_info.points_count:,}")
                print(f"  Vectors: {collection_info.vectors_count:,}")
            except Exception as e:
                print(f"\n⚠️ Could not connect to Qdrant: {e}")

    print("\n" + "=" * 80 + "\n")


def vectorize_all(batch_size: int = 200):
    """
    Vectorize every pending translated news item.
    """
    print("\n" + "=" * 80)
    print("🔍 STARTING BULK VECTORIZATION")
    print("=" * 80)
    print(f"Batch size: {batch_size}")
    print("=" * 80 + "\n")

    # Import the Qdrant worker
    from workers.qdrant_worker import (
        init_qdrant_client,
        init_embedding_model,
        get_pending_news,
        upload_to_qdrant
    )

    # Initialize
    print("🔌 Initializing Qdrant...")
    init_qdrant_client()

    print("🤖 Loading embedding model...")
    init_embedding_model()

    total_processed = 0
    start_time = time.time()

    while True:
        # Fetch the next pending batch
        news_batch = get_pending_news(limit=batch_size)

        if not news_batch:
            print("\n✅ No more news pending vectorization")
            break

        print(f"\n📋 Processing batch of {len(news_batch)} news items...")

        try:
            upload_to_qdrant(news_batch)
            total_processed += len(news_batch)

            elapsed = time.time() - start_time
            rate = total_processed / elapsed if elapsed > 0 else 0

            print(f"\n📊 Progress: {total_processed:,} vectorized")
            print(f"⏱️ Rate: {rate:.2f} news/second")
            print(f"⏳ Elapsed: {elapsed/60:.1f} minutes")

        except Exception as e:
            print(f"❌ Error processing batch: {e}")
            break

    elapsed = time.time() - start_time
    print("\n" + "=" * 80)
    print("✅ VECTORIZATION COMPLETE")
    print("=" * 80)
    print(f"Total vectorized: {total_processed:,}")
    print(f"Total time: {elapsed/60:.1f} minutes")
    print(f"Average rate: {(total_processed / elapsed if elapsed > 0 else 0):.2f} news/second")
    print("=" * 80 + "\n")


def reset_all():
    """
    Reset the vectorization state and wipe Qdrant.
    """
    print("\n" + "=" * 80)
    print("⚠️ FULL RESET OF THE VECTOR SYSTEM")
    print("=" * 80)

    response = input("\nAre you sure? This deletes ALL vectors and resets the state (y/N): ")

    if response.lower() != 'y':
        print("❌ Operation cancelled")
        return

    print("\n🗑️ Resetting database...")

    with get_write_conn() as conn:
        with conn.cursor() as cur:
            # Reset the vectorization flags
            cur.execute("""
                UPDATE traducciones
                SET vectorized = FALSE,
                    qdrant_point_id = NULL,
                    vectorization_date = NULL
            """)
            conn.commit()

    print("✅ Vectorization flags reset in PostgreSQL")

    # Clean Qdrant
    try:
        from qdrant_client import QdrantClient
        qdrant_host = os.environ.get("QDRANT_HOST", "localhost")
        qdrant_port = int(os.environ.get("QDRANT_PORT", "6333"))
        collection_name = os.environ.get("QDRANT_COLLECTION_NAME", "news_vectors")

        client = QdrantClient(host=qdrant_host, port=qdrant_port)

        # Delete the collection
        client.delete_collection(collection_name)
        print(f"✅ Collection '{collection_name}' deleted from Qdrant")

        # Recreate the collection
        from qdrant_client.models import Distance, VectorParams
        client.create_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(size=384, distance=Distance.COSINE)
        )
        print(f"✅ Collection '{collection_name}' recreated")

    except Exception as e:
        print(f"⚠️ Error cleaning Qdrant: {e}")

    print("\n✅ Reset complete\n")


def main():
    parser = argparse.ArgumentParser(
        description="Qdrant migration script (direct)",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__
    )

    parser.add_argument("--stats", action="store_true", help="Show statistics")
    parser.add_argument("--vectorize", action="store_true", help="Vectorize translated news")
    parser.add_argument("--reset", action="store_true", help="Wipe Qdrant and reset state")
    parser.add_argument("--batch-size", type=int, default=200, help="Batch size (default: 200)")

    args = parser.parse_args()

    # With no options given, default to showing statistics
    if not any([args.stats, args.vectorize, args.reset]):
        args.stats = True

    try:
        if args.stats:
            get_statistics()

        if args.reset:
            reset_all()

        if args.vectorize:
            vectorize_all(batch_size=args.batch_size)

    except KeyboardInterrupt:
        print("\n\n⏹️ Interrupted by the user")
    except Exception as e:
        print(f"\n❌ Error: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()
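Once the collection is populated, a semantic lookup runs against the same collection. A sketch; the embedding model name is an assumption inferred from VectorParams(size=384), and whichever 384-dimensional model qdrant_worker actually loads would take its place:

from qdrant_client import QdrantClient
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")  # 384-dim; assumed
client = QdrantClient(host="localhost", port=6333)

hits = client.search(
    collection_name="news_vectors",
    query_vector=model.encode("elecciones en Europa").tolist(),
    limit=5,
)
for hit in hits:
    print(f"{hit.score:.3f}", hit.payload)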
scripts/precache_entities.py (new file, 70 lines)
@@ -0,0 +1,70 @@
import logging
import sys
import os
from concurrent.futures import ThreadPoolExecutor

# Add app to path
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from db import get_read_conn
from utils.wiki import fetch_wiki_data

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

def get_top_entities():
    """Get top 100 people, 50 orgs, 50 places from last 30 days."""
    entities = []
    query = """
        SELECT t.valor, COUNT(*) as c
        FROM tags t
        JOIN tags_noticia tn ON t.id = tn.tag_id
        JOIN traducciones tr ON tn.traduccion_id = tr.id
        WHERE tr.created_at > NOW() - INTERVAL '30 days'
          AND t.tipo = %s
        GROUP BY t.valor
        ORDER BY c DESC
        LIMIT %s
    """

    try:
        with get_read_conn() as conn:
            with conn.cursor() as cur:
                # People
                cur.execute(query, ('persona', 100))
                entities.extend([row[0] for row in cur.fetchall()])

                # Orgs
                cur.execute(query, ('organizacion', 50))
                entities.extend([row[0] for row in cur.fetchall()])

                # Places
                cur.execute(query, ('lugar', 50))
                entities.extend([row[0] for row in cur.fetchall()])
    except Exception as e:
        logger.error(f"Error fetching top entities: {e}")

    return list(set(entities))

def precache_entity(name):
    try:
        img, summary = fetch_wiki_data(name)
        if img or summary:
            logger.info(f"✓ Cached: {name}")
        else:
            logger.info(f"✗ No data for: {name}")
    except Exception as e:
        logger.error(f"Error caching {name}: {e}")

def run_precache():
    logger.info("Starting entity pre-cache...")
    entities = get_top_entities()
    logger.info(f"Found {len(entities)} unique top entities to cache.")

    with ThreadPoolExecutor(max_workers=10) as executor:
        executor.map(precache_entity, entities)

    logger.info("Pre-cache complete.")

if __name__ == "__main__":
    run_precache()
scripts/recover_system.py (new file, 44 lines)
@@ -0,0 +1,44 @@
import os
import psycopg2
import logging
from datetime import datetime

logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s: %(message)s")
logger = logging.getLogger("recover_system")

DB_CONFIG = {
    "host": os.environ.get("DB_HOST", "localhost"),
    "port": int(os.environ.get("DB_PORT", 5432)),
    "dbname": os.environ.get("DB_NAME", "rss"),
    "user": os.environ.get("DB_USER", "rss"),
    "password": os.environ.get("DB_PASS", "x"),
}

def recover():
    try:
        conn = psycopg2.connect(**DB_CONFIG)
        conn.autocommit = True
        with conn.cursor() as cur:
            # 1. Reset stuck translations
            logger.info("Resetting stuck 'processing' translations to 'pending'...")
            cur.execute("UPDATE traducciones SET status = 'pending' WHERE status = 'processing';")
            logger.info(f"Reset {cur.rowcount} translations.")

            # 2. Correct future-dated news
            logger.info("Correcting future-dated news...")
            now = datetime.utcnow()
            cur.execute("UPDATE noticias SET fecha = %s WHERE fecha > %s;", (now, now))
            logger.info(f"Corrected {cur.rowcount} news items.")

            # 3. Reactivate disabled feeds with fewer than 30 failures
            logger.info("Reactivating disabled feeds with fewer than 30 failures (second chance)...")
            cur.execute("UPDATE feeds SET activo = TRUE, fallos = 0 WHERE activo = FALSE AND fallos < 30;")
            logger.info(f"Reactivated {cur.rowcount} feeds.")

        conn.close()
        logger.info("Recovery complete!")
    except Exception as e:
        logger.error(f"Error during recovery: {e}")

if __name__ == "__main__":
    recover()
scripts/test_qdrant_connection.py (new executable file, 95 lines)
@@ -0,0 +1,95 @@
#!/usr/bin/env python3
"""
Diagnostic script that checks connectivity to Qdrant.
Run it from the rss2_web container to debug network problems.
"""
import os
import sys
import socket

def test_qdrant_connection():
    """Test the connection to Qdrant and print diagnostic information."""

    # Configuration
    qdrant_host = os.environ.get("QDRANT_HOST", "localhost")
    qdrant_port = int(os.environ.get("QDRANT_PORT", "6333"))

    print("=" * 60)
    print("🔍 QDRANT CONNECTION DIAGNOSTICS")
    print("=" * 60)
    print(f"Host: {qdrant_host}")
    print(f"Port: {qdrant_port}")
    print()

    # 1. DNS resolution test
    print("1️⃣ Testing DNS resolution...")
    try:
        ip = socket.gethostbyname(qdrant_host)
        print(f"   ✅ Host '{qdrant_host}' resolves to: {ip}")
    except Exception as e:
        print(f"   ❌ ERROR: Could not resolve '{qdrant_host}': {e}")
        return False

    # 2. TCP connectivity test
    print("\n2️⃣ Testing TCP connectivity...")
    try:
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        sock.settimeout(5)
        result = sock.connect_ex((ip, qdrant_port))
        sock.close()

        if result == 0:
            print(f"   ✅ Port {qdrant_port} is open")
        else:
            print(f"   ❌ ERROR: Port {qdrant_port} is closed or unreachable")
            return False
    except Exception as e:
        print(f"   ❌ ERROR in TCP test: {e}")
        return False

    # 3. Qdrant client test
    print("\n3️⃣ Testing the Qdrant client...")
    try:
        from qdrant_client import QdrantClient

        client = QdrantClient(host=qdrant_host, port=qdrant_port, timeout=5)
        collections = client.get_collections()

        print(f"   ✅ Qdrant client connected successfully")
        print(f"   📊 Available collections: {[c.name for c in collections.collections]}")

        # Per-collection lookup
        for collection in collections.collections:
            try:
                info = client.get_collection(collection.name)
                print(f"   📁 {collection.name}: {info.points_count} vectors")
            except Exception as e:
                print(f"   ⚠️ Could not get info for {collection.name}: {e}")

        return True

    except Exception as e:
        print(f"   ❌ ERROR in Qdrant client: {e}")
        import traceback
        traceback.print_exc()
        return False

if __name__ == "__main__":
    success = test_qdrant_connection()

    if success:
        print("\n✅ DIAGNOSTICS PASSED: Qdrant is reachable")
        sys.exit(0)
    else:
        print("\n❌ DIAGNOSTICS FAILED: connectivity problems with Qdrant")
        print("\n💡 POSSIBLE FIXES:")
        print("   1. Check that the 'qdrant' container is running:")
        print("      docker ps | grep qdrant")
        print("   2. Check that both containers are on the same network:")
        print("      docker network inspect rss2_default")
        print("   3. Restart the Qdrant container:")
        print("      docker restart rss2_qdrant")
        print("   4. Check the QDRANT_HOST and QDRANT_PORT environment variables")
        sys.exit(1)
scripts/verify_connectivity.py (new file, 54 lines)
@@ -0,0 +1,54 @@
import sys
import os

# Add app to path
sys.path.append('/home/x/rss2')

try:
    from db import get_conn, get_read_conn, get_write_conn
    from cache import get_redis
    import psycopg2
    print("Imports successful.")
except ImportError as e:
    print(f"Import failed: {e}")
    sys.exit(1)

def test_db():
    print("\n--- Testing Database Connections ---")

    print("Testing Primary (Write) Connection...")
    try:
        with get_write_conn() as conn:
            with conn.cursor() as cur:
                cur.execute("SELECT 1")
        print("  [OK] Primary reachable.")
    except Exception as e:
        print(f"  [FAIL] Primary unreachable: {e}")

    print("Testing Replica (Read) Connection...")
    try:
        with get_read_conn() as conn:
            with conn.cursor() as cur:
                cur.execute("SELECT 1")
        # This only checks connectivity, not that the host is really the
        # read-only replica; see the sketch below for a true replica check.
        print("  [OK] Replica reachable.")
    except Exception as e:
        print(f"  [FAIL] Replica unreachable: {e}")

def test_redis():
    print("\n--- Testing Redis Connection ---")
    try:
        r = get_redis()
        if r:
            r.ping()
            print("  [OK] Redis reachable.")
        else:
            print("  [FAIL] Redis client returned None (likely connection failed).")
    except Exception as e:
        print(f"  [FAIL] Redis error: {e}")

if __name__ == "__main__":
    test_db()
    test_redis()
    print("\nVerification complete.")
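To verify that the read connection really lands on a standby rather than on the primary, Postgres can be asked directly; pg_is_in_recovery() returns true on a streaming-replication replica. A sketch using the same helper:

from db import get_read_conn

with get_read_conn() as conn:
    with conn.cursor() as cur:
        cur.execute("SELECT pg_is_in_recovery()")
        print("replica" if cur.fetchone()[0] else "primary")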