go integration and wikipedia

This commit is contained in:
jlimolina 2026-03-28 18:30:07 +01:00
parent 47a252e339
commit ee90335b92
7828 changed files with 1307913 additions and 20807 deletions

View file

@ -1,108 +0,0 @@
#!/usr/bin/env python3
"""
Script to detect and clean repetitive/low-quality translations.
Run this periodically or as a maintenance task.
"""
import os
import re
import sys
import psycopg2
from psycopg2.extras import execute_values
from dotenv import load_dotenv
# Load environment variables from a local .env file (if present).
load_dotenv()

# PostgreSQL connection settings; every field is overridable via environment.
DB_CONFIG = {
    "host": os.environ.get("DB_HOST", "localhost"),
    "port": int(os.environ.get("DB_PORT", 5432)),
    "dbname": os.environ.get("DB_NAME", "rss"),
    "user": os.environ.get("DB_USER", "rss"),
    "password": os.environ.get("DB_PASS", ""),
}
def is_repetitive(text: str, threshold: float = 0.25) -> bool:
    """Check if text has repetitive patterns or low word diversity."""
    if not text or len(text) < 50:
        return False

    # Obvious repetition artifacts typical of degenerate MT output.
    patterns = (
        r'(\b\w+\b)( \1){3,}',      # same word repeated 4+ times
        r'(\b\w+ \w+\b)( \1){2,}',  # same 2-word phrase repeated 3+ times
        r'de la la ',
        r'la línea de la línea',
        r'de Internet de Internet',
        r'de la de la',
        r'en el en el',
    )
    if any(re.search(p, text, re.IGNORECASE) for p in patterns):
        return True

    # Fall back to a vocabulary-diversity heuristic: a very low ratio of
    # unique tokens means the text keeps repeating itself.
    tokens = text.lower().split()
    if len(tokens) < 10:
        return False
    return len(set(tokens)) / len(tokens) < threshold
def main():
    """Scan completed translations, report repetitive ones and reset them.

    Repetitive translations are set back to status='pending' so the
    translation worker regenerates them.
    """
    print("🔍 Scanning for repetitive translations...")
    conn = psycopg2.connect(**DB_CONFIG)
    try:
        with conn.cursor() as cur:
            # Fetch all completed translations.
            cur.execute("""
                SELECT id, titulo_trad, resumen_trad
                FROM traducciones
                WHERE status='done'
            """)
            rows = cur.fetchall()
            total = len(rows)
            print(f"📊 Checking {total} translations...")
            bad_ids = [tr_id for tr_id, titulo, resumen in rows
                       if is_repetitive(titulo) or is_repetitive(resumen)]
            # Guard against ZeroDivisionError when the table has no 'done' rows
            # (the original divided by `total` unconditionally).
            pct = (len(bad_ids) / total * 100) if total else 0.0
            print(f"❌ Found {len(bad_ids)} repetitive translations ({pct:.2f}%)")
            if bad_ids:
                # Show a few samples so the operator can sanity-check the heuristic.
                cur.execute("""
                    SELECT id, LEFT(resumen_trad, 150) as sample
                    FROM traducciones
                    WHERE id = ANY(%s)
                    LIMIT 5
                """, (bad_ids,))
                print("\n📝 Sample bad translations:")
                for row in cur.fetchall():
                    print(f" ID {row[0]}: {row[1]}...")
                # Reset bad rows so the worker re-translates them.
                print(f"\n🔄 Resetting {len(bad_ids)} translations to pending...")
                cur.execute("""
                    UPDATE traducciones
                    SET status='pending',
                        titulo_trad=NULL,
                        resumen_trad=NULL,
                        error='Repetitive output - auto-cleaned'
                    WHERE id = ANY(%s)
                """, (bad_ids,))
                conn.commit()
                print(f"✅ Successfully reset {len(bad_ids)} translations")
            else:
                print("✅ No repetitive translations found!")
    finally:
        # Always release the connection, even if a query fails
        # (the original leaked it on any exception).
        conn.close()
    print("\n✨ Cleanup complete!")


if __name__ == "__main__":
    main()

View file

@ -1,67 +0,0 @@
#!/usr/bin/env python3
"""
Script para limpiar caracteres <unk> de las traducciones.
"""
import re
from db import get_conn
def clean_text(text):
    """Remove <unk> tokens and other problematic characters."""
    if not text:
        return text
    # Strip literal tokenizer/encoding artifacts first.
    for artifact in ('<unk>', '<EFBFBD>'):
        text = text.replace(artifact, '')
    # Drop ASCII control characters and the C1 control range.
    text = re.sub(r'[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F-\x9F]', '', text)
    return text.strip()
def main():
    """Clean all translations with <unk> tokens."""
    print("🧹 Limpiando tokens <unk> de traducciones...")
    with get_conn() as conn:
        with conn.cursor() as cur:
            # Find translations that still contain tokenizer/encoding artifacts.
            cur.execute("""
                SELECT id, titulo_trad, resumen_trad
                FROM traducciones
                WHERE titulo_trad LIKE '%<unk>%'
                OR resumen_trad LIKE '%<unk>%'
                OR titulo_trad LIKE '%<EFBFBD>%'
                OR resumen_trad LIKE '%<EFBFBD>%'
            """)
            translations = cur.fetchall()
            print(f"📊 Encontradas {len(translations)} traducciones con tokens problemáticos")
            if not translations:
                print("✅ No hay traducciones que limpiar")
                return
            updated_count = 0
            for row in translations:
                tr_id, titulo, resumen = row
                # Clean both translated fields (leave None/empty untouched).
                new_titulo = clean_text(titulo) if titulo else titulo
                new_resumen = clean_text(resumen) if resumen else resumen
                # Write back only when the cleaning actually changed something.
                if new_titulo != titulo or new_resumen != resumen:
                    cur.execute("""
                        UPDATE traducciones
                        SET titulo_trad = %s,
                            resumen_trad = %s
                        WHERE id = %s
                    """, (new_titulo, new_resumen, tr_id))
                    updated_count += 1
                    if updated_count % 100 == 0:
                        print(f" ⏳ Procesadas {updated_count} traducciones...")
            # Single commit at the end keeps the cleanup atomic.
            conn.commit()
            print(f"✅ Limpieza completada: {updated_count} traducciones actualizadas")


if __name__ == "__main__":
    main()

View file

@ -1,39 +0,0 @@
#!/bin/bash
# Convert the HuggingFace NLLB model to CTranslate2 format.
# Run once before using the translation_worker with CTranslate2.
set -e

SRC_MODEL=${UNIVERSAL_MODEL:-"facebook/nllb-200-distilled-600M"}
DEST_DIR=${CT2_MODEL_PATH:-"./models/nllb-ct2"}
QUANT=${CT2_QUANTIZATION:-"int8_float16"}

echo "=== Conversión de modelo NLLB a CTranslate2 ==="
echo "Modelo origen: $SRC_MODEL"
echo "Directorio destino: $DEST_DIR"
echo "Quantización: $QUANT"
echo ""

# Abort early if the converter CLI is not installed.
if ! command -v ct2-transformers-converter &> /dev/null; then
    echo "Error: ct2-transformers-converter no encontrado."
    echo "Instala con: pip install ctranslate2"
    exit 1
fi

# Make sure the parent directory of the destination exists.
mkdir -p "$(dirname "$DEST_DIR")"

# Run the conversion (can take several minutes).
echo "Iniciando conversión (puede tardar 5-10 minutos)..."
ct2-transformers-converter \
    --model "$SRC_MODEL" \
    --output_dir "$DEST_DIR" \
    --quantization "$QUANT" \
    --force

echo ""
echo "✓ Conversión completada: $DEST_DIR"
echo ""
echo "Para usar el modelo, establece:"
echo " export CT2_MODEL_PATH=$DEST_DIR"

View file

@ -1,77 +0,0 @@
#!/bin/bash
# Example script that seeds a few demo video grids (parrillas).
# Each INSERT runs via psql inside the 'db' container; ON CONFLICT DO NOTHING
# makes the script safe to re-run.
echo "🎬 Creando parrillas de ejemplo..."

# 1. Bulgaria news — country filter, daily, up to 5 items, 180 s cap.
docker-compose exec -T db psql -U rss -d rss << EOF
INSERT INTO video_parrillas (
nombre, descripcion, tipo_filtro,
pais_id, max_noticias, duracion_maxima,
idioma_voz, template, include_images, include_subtitles,
frecuencia, activo
) VALUES (
'Noticias de Bulgaria',
'Resumen diario de las noticias más importantes de Bulgaria',
'pais',
(SELECT id FROM paises WHERE nombre ILIKE '%bulgaria%' LIMIT 1),
5, 180,
'es', 'standard', true, true,
'daily', true
) ON CONFLICT DO NOTHING;
EOF

# 2. Science in Europe — category + continent filter, daily, up to 7 items.
docker-compose exec -T db psql -U rss -d rss << EOF
INSERT INTO video_parrillas (
nombre, descripcion, tipo_filtro,
categoria_id, continente_id, max_noticias,
idioma_voz, template, include_subtitles,
frecuencia, activo
) VALUES (
'Ciencia en Europa',
'Las últimas noticias científicas de Europa',
'categoria',
(SELECT id FROM categorias WHERE nombre ILIKE '%ciencia%' LIMIT 1),
(SELECT id FROM continentes WHERE nombre = 'Europa' LIMIT 1),
7,
'es', 'modern', true,
'daily', true
) ON CONFLICT DO NOTHING;
EOF

# 3. Global technology — category filter, daily, up to 8 items, 300 s cap.
docker-compose exec -T db psql -U rss -d rss << EOF
INSERT INTO video_parrillas (
nombre, descripcion, tipo_filtro,
categoria_id, max_noticias, duracion_maxima,
idioma_voz, template, include_subtitles,
frecuencia, activo
) VALUES (
'Tech News Daily',
'Resumen diario de tecnología mundial',
'categoria',
(SELECT id FROM categorias WHERE nombre ILIKE '%tecnolog%' LIMIT 1),
8, 300,
'es', 'modern', true,
'daily', true
) ON CONFLICT DO NOTHING;
EOF

echo "✅ Parrillas creadas!"
echo ""
# Show what was created so the operator can verify.
echo "📊 Ver parrillas creadas:"
docker-compose exec -T db psql -U rss -d rss -c "
SELECT id, nombre, tipo_filtro, max_noticias, frecuencia, activo
FROM video_parrillas
ORDER BY id DESC;
"
echo ""
echo "🎥 Accede a la interfaz web en: http://localhost:8001/parrillas/"
echo ""
echo "💡 Para generar un video manualmente:"
echo " docker-compose exec web python generar_videos_noticias.py <id_parrilla>"
echo ""
echo "📅 Para generar todos los videos del día:"
echo " docker-compose exec web python generar_videos_noticias.py"

View file

@ -1,64 +0,0 @@
import os
import psycopg2
from datetime import datetime
# Database configuration — all values overridable via environment variables.
# DB_WRITE_HOST defaults to the docker-compose service name 'db'.
DB_WRITE_HOST = os.environ.get("DB_WRITE_HOST", "db")
DB_NAME = os.environ.get("DB_NAME", "rss")
DB_USER = os.environ.get("DB_USER", "rss")
DB_PASS = os.environ.get("DB_PASS", "x")
DB_PORT = os.environ.get("DB_PORT", "5432")
def check_db():
    """Run a quick health report against the news database.

    Prints totals and latest date for 'noticias', active/inactive feed
    counts, the five feeds with most failures, and (when the table exists)
    how many news items still lack a translation. All errors are reported,
    never raised.
    """
    conn = None
    try:
        conn = psycopg2.connect(
            host=DB_WRITE_HOST,
            database=DB_NAME,
            user=DB_USER,
            password=DB_PASS,
            port=DB_PORT,
            connect_timeout=5
        )
        print("✅ Database connection successful.")
        with conn.cursor() as cur:
            # 1. Total news and latest date
            cur.execute("SELECT COUNT(*), MAX(fecha) FROM noticias;")
            count, latest = cur.fetchone()
            print(f"📊 Total news: {count}")
            print(f"🕒 Latest news date: {latest}")
            # 2. Feed status
            cur.execute("SELECT COUNT(*) FROM feeds WHERE activo = TRUE;")
            active_feeds = cur.fetchone()[0]
            cur.execute("SELECT COUNT(*) FROM feeds WHERE activo = FALSE;")
            inactive_feeds = cur.fetchone()[0]
            print(f"📡 Active feeds: {active_feeds}")
            print(f"🚫 Inactive feeds: {inactive_feeds}")
            # 3. Feeds with most failures
            cur.execute("SELECT id, nombre, url, fallos, last_error FROM feeds WHERE fallos > 0 ORDER BY fallos DESC LIMIT 5;")
            failures = cur.fetchall()
            if failures:
                print("\n⚠️ Feeds with most failures:")
                for f in failures:
                    print(f" - ID {f[0]}: {f[1]} ({f[3]} fallos) - Error: {f[4]}")
            else:
                print("\n✅ No feeds with reported failures.")
            # 4. Pending translations — only if the 'traducciones' table exists.
            cur.execute("SELECT EXISTS (SELECT FROM information_schema.tables WHERE table_name = 'traducciones');")
            if cur.fetchone()[0]:
                cur.execute("SELECT COUNT(*) FROM noticias WHERE id NOT IN (SELECT noticia_id FROM traducciones);")
                pending_trans = cur.fetchone()[0]
                print(f"🌎 News pending translation: {pending_trans}")
    except Exception as e:
        # Diagnostics script: report the failure instead of crashing.
        print(f"❌ Database error: {e}")
    finally:
        # Close the connection even when a query raised
        # (the original only closed it on the success path, leaking it).
        if conn is not None:
            conn.close()


if __name__ == "__main__":
    check_db()

View file

@ -1,93 +0,0 @@
#!/bin/bash
# Download an LLM suitable for an RTX 3060 12GB (news categorization).
# Interactive: prompts the user to pick one of four quantized 7B models,
# then fetches the whole repository snapshot from HuggingFace.
set -e

MODEL_DIR="/home/x/rss2/models/llm"
export PATH="$HOME/.local/bin:$PATH"

echo "=== Descarga de Modelo LLM para Categorización de Noticias ==="
echo ""
echo "Para RTX 3060 12GB, se recomienda un modelo 7B cuantizado."
echo ""
echo "Opciones disponibles:"
echo ""
echo "1) Mistral-7B-Instruct-v0.2 (GPTQ 4-bit) - RECOMENDADO"
echo " - Tamaño: ~4.5GB"
echo " - Calidad: Excelente para clasificación"
echo " - VRAM: ~6-7GB"
echo ""
echo "2) Mistral-7B-Instruct-v0.2 (EXL2 4.0bpw)"
echo " - Tamaño: ~4.2GB"
echo " - Calidad: Excelente (optimizado para ExLlamaV2)"
echo " - VRAM: ~5-6GB"
echo ""
echo "3) OpenHermes-2.5-Mistral-7B (GPTQ 4-bit)"
echo " - Tamaño: ~4.5GB"
echo " - Calidad: Muy buena para tareas generales"
echo " - VRAM: ~6-7GB"
echo ""
echo "4) Neural-Chat-7B-v3-1 (GPTQ 4-bit)"
echo " - Tamaño: ~4.5GB"
echo " - Calidad: Buena para español"
echo " - VRAM: ~6-7GB"
echo ""

# Ask which model to download; option 1 is the default.
read -p "Selecciona una opción (1-4) [1]: " CHOICE
CHOICE=${CHOICE:-1}

# Map the choice to a HuggingFace repo. MODEL_FILE is not used later in
# this script (snapshot_download fetches the whole repository).
case $CHOICE in
1)
MODEL_REPO="TheBloke/Mistral-7B-Instruct-v0.2-GPTQ"
MODEL_FILE="model.safetensors"
;;
2)
MODEL_REPO="turboderp/Mistral-7B-instruct-exl2"
MODEL_FILE="4.0bpw"
;;
3)
MODEL_REPO="TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ"
MODEL_FILE="model.safetensors"
;;
4)
MODEL_REPO="TheBloke/neural-chat-7B-v3-1-GPTQ"
MODEL_FILE="model.safetensors"
;;
*)
echo "Opción inválida"
exit 1
;;
esac

echo ""
echo "Descargando: $MODEL_REPO"
echo "Destino: $MODEL_DIR"
echo ""

# Create the target directory if it does not exist.
mkdir -p "$MODEL_DIR"

# Make sure a recent huggingface_hub (with CLI support) is installed;
# we upgrade unconditionally to avoid stale-CLI issues.
echo "Actualizando huggingface-hub..."
pip3 install -U "huggingface_hub[cli]>=0.23.0" --break-system-packages

# Download via a direct Python call to snapshot_download to sidestep
# CLI-version incompatibilities.
echo "Iniciando descarga..."
python3 -c "
from huggingface_hub import snapshot_download
print(f'Descargando { \"$MODEL_REPO\" } a { \"$MODEL_DIR\" }...')
snapshot_download(repo_id='$MODEL_REPO', local_dir='$MODEL_DIR', local_dir_use_symlinks=False)
"

echo ""
echo "✓ Modelo descargado exitosamente en: $MODEL_DIR"
echo ""
echo "Información del modelo:"
echo "----------------------"
ls -lh "$MODEL_DIR"
echo ""
echo "Para usar este modelo, actualiza docker-compose.yml con:"
echo " LLM_MODEL_PATH=/app/models/llm"
echo ""

View file

@ -1,99 +0,0 @@
import logging
import ssl
import nltk
import os
import urllib.request
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# ================================================================
# Logging
# ================================================================
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s"
)
LOG = logging.getLogger("download_models")

# ================================================================
# SSL FIX
# ================================================================
# Disable HTTPS certificate verification globally so the downloads work
# in containers/proxies without a proper CA bundle.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # Older Python without the private helper: keep normal verification.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# ================================================================
# Paths and models
# ================================================================
NLTK_PACKAGES = ["punkt", "punkt_tab", "stopwords"]
NLLB_MODEL = "facebook/nllb-200-distilled-600M"
# NOTE(review): the published fastText/NLLB language-ID model is usually
# named 'lid.176.bin' or 'lid218e.bin' — confirm this URL actually resolves.
FASTTEXT_URL = "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.218.bin"
FASTTEXT_DEST = "/app/models/lid.218.bin"  # where the worker expects it
# ================================================================
# Descargar NLTK
# ================================================================
def download_nltk():
    """Download the required NLTK data packages, skipping installed ones."""
    for package in NLTK_PACKAGES:
        # 'punkt'/'punkt_tab' live under tokenizers/, the rest under corpora/.
        lookup = f"tokenizers/{package}" if package.startswith("punkt") else f"corpora/{package}"
        try:
            nltk.data.find(lookup)
        except LookupError:
            LOG.info(f"Downloading NLTK '{package}'...")
            nltk.download(package, quiet=True)
            LOG.info(f"Downloaded OK: {package}")
        else:
            LOG.info(f"NLTK '{package}' already installed")
# ================================================================
# Descargar NLLB
# ================================================================
def download_nllb(model_name: str):
    """Fetch the NLLB tokenizer and seq2seq weights into the local HF cache."""
    LOG.info(f"Downloading NLLB model: {model_name}")
    try:
        AutoTokenizer.from_pretrained(model_name)
        AutoModelForSeq2SeqLM.from_pretrained(model_name)
    except Exception as e:
        # Log and continue — the caller proceeds to the next download.
        LOG.error(f"Failed downloading NLLB model {model_name}: {e}")
    else:
        LOG.info(f"Downloaded OK: {model_name}")
# ================================================================
# Descargar fastText LID.218
# ================================================================
def download_fasttext():
    """Download the fastText language-ID model unless it is already on disk."""
    # Ensure the destination directory (/app/models) exists.
    os.makedirs(os.path.dirname(FASTTEXT_DEST), exist_ok=True)
    # A previous run may have fetched the file already — skip if so.
    if os.path.exists(FASTTEXT_DEST):
        LOG.info(f"fastText LID already exists at {FASTTEXT_DEST}")
        return
    LOG.info(f"Downloading fastText LID model from {FASTTEXT_URL}")
    try:
        urllib.request.urlretrieve(FASTTEXT_URL, FASTTEXT_DEST)
    except Exception as e:
        LOG.error(f"Failed to download fastText LID model: {e}")
    else:
        LOG.info(f"Downloaded fastText LID model to {FASTTEXT_DEST}")
# ================================================================
# Main
# ================================================================
# Run all downloads sequentially; individual failures are logged by the
# helpers rather than raised, so later steps still run.
if __name__ == "__main__":
    LOG.info("Downloading NLTK data...")
    download_nltk()
    LOG.info("Downloading NLLB model...")
    download_nllb(NLLB_MODEL)
    LOG.info("Downloading fastText LID model...")
    download_fasttext()
    LOG.info("All downloads completed successfully.")

View file

@ -1,71 +0,0 @@
import html
import psycopg2
from db import get_conn
import re
def fix_entities():
    """Unescape HTML entities stored in 'noticias' and 'traducciones'.

    Finds rows whose text contains an entity-looking '&...;' sequence,
    rewrites them with a single html.unescape() pass, and commits once
    at the end.
    """
    print("🔧 Fixing HTML entities in database...")
    with get_conn() as conn:
        with conn.cursor() as cur:
            # 1. Update Noticias
            print("Processing 'noticias' table...")
            # '%&%;%' matches any '&'...';' sequence, i.e. a likely entity.
            cur.execute("""
                SELECT id, titulo, resumen
                FROM noticias
                WHERE titulo LIKE '%&%;%' OR resumen LIKE '%&%;%'
            """)
            rows = cur.fetchall()
            print(f"Found {len(rows)} rows in 'noticias' to check.")
            count = 0
            for r in rows:
                nid, tit, res = r
                # Unescape, leaving None/empty values untouched.
                new_tit = html.unescape(tit) if tit else tit
                new_res = html.unescape(res) if res else res
                # Only write back rows that actually changed.
                if new_tit != tit or new_res != res:
                    cur.execute("""
                        UPDATE noticias
                        SET titulo = %s, resumen = %s
                        WHERE id = %s
                    """, (new_tit, new_res, nid))
                    count += 1
                    if count % 100 == 0:
                        print(f"Updated {count} noticias...")
            print(f"Updated {count} rows in 'noticias'.")
            # 2. Update Traducciones
            print("\nProcessing 'traducciones' table...")
            cur.execute("""
                SELECT id, titulo_trad, resumen_trad
                FROM traducciones
                WHERE titulo_trad LIKE '%&%;%' OR resumen_trad LIKE '%&%;%'
            """)
            rows = cur.fetchall()
            print(f"Found {len(rows)} translations to check.")
            count_tr = 0
            for r in rows:
                tid, tit, res = r
                new_tit = html.unescape(tit) if tit else tit
                new_res = html.unescape(res) if res else res
                if new_tit != tit or new_res != res:
                    cur.execute("""
                        UPDATE traducciones
                        SET titulo_trad = %s, resumen_trad = %s
                        WHERE id = %s
                    """, (new_tit, new_res, tid))
                    count_tr += 1
            print(f"Updated {count_tr} rows in 'traducciones'.")
            # Single commit covers both tables.
            conn.commit()
            print("✅ Database cleaning complete.")


if __name__ == "__main__":
    fix_entities()

View file

@ -1,92 +0,0 @@
import html
import psycopg2
from db import get_conn
import sys
def recursive_unescape(text):
    """Repeatedly HTML-unescape *text* until it stops changing.

    Handles double/triple-escaped entities such as '&amp;amp;lt;'.
    Capped at 5 passes to avoid looping forever on pathological input.
    """
    if not text:
        return text
    current = text
    for _ in range(5):
        unescaped = html.unescape(current)
        if unescaped == current:
            break
        current = unescaped
    return current
def fix_entities_recursive():
    """Unescape HTML entities in the DB, repeating until fully decoded.

    Unlike the single-pass fixer, this handles double/triple-escaped
    entities via recursive_unescape(). Commits once at the end.
    """
    print("🔧 Fixing HTML entities RECURSIVELY in database...")
    with get_conn() as conn:
        with conn.cursor() as cur:
            # 1. Update Noticias
            print("Processing 'noticias' table...")
            # Select ALL rows containing '&' to catch any entity form.
            # Note: may be slow on a huge table, but ~13k rows is fine.
            cur.execute("""
                SELECT id, titulo, resumen
                FROM noticias
                WHERE titulo LIKE '%&%' OR resumen LIKE '%&%'
            """)
            rows = cur.fetchall()
            print(f"Found {len(rows)} candidates in 'noticias'.")
            count = 0
            for r in rows:
                nid, tit, res = r
                new_tit = recursive_unescape(tit)
                new_res = recursive_unescape(res)
                # Only write back rows that actually changed.
                if new_tit != tit or new_res != res:
                    cur.execute("""
                        UPDATE noticias
                        SET titulo = %s, resumen = %s
                        WHERE id = %s
                    """, (new_tit, new_res, nid))
                    count += 1
                    if count % 100 == 0:
                        print(f"Updated {count} noticias...")
            print(f"Total updated in 'noticias': {count}")
            # 2. Update Traducciones
            print("\nProcessing 'traducciones' table...")
            cur.execute("""
                SELECT id, titulo_trad, resumen_trad
                FROM traducciones
                WHERE titulo_trad LIKE '%&%' OR resumen_trad LIKE '%&%'
            """)
            rows = cur.fetchall()
            print(f"Found {len(rows)} candidates in 'traducciones'.")
            count_tr = 0
            for r in rows:
                tid, tit, res = r
                new_tit = recursive_unescape(tit)
                new_res = recursive_unescape(res)
                if new_tit != tit or new_res != res:
                    cur.execute("""
                        UPDATE traducciones
                        SET titulo_trad = %s, resumen_trad = %s
                        WHERE id = %s
                    """, (new_tit, new_res, tid))
                    count_tr += 1
                    if count_tr % 100 == 0:
                        print(f"Updated {count_tr} traducciones...")
            print(f"Total updated in 'traducciones': {count_tr}")
            # Single commit covers both tables.
            conn.commit()
            print("✅ Database cleaning complete.")


if __name__ == "__main__":
    fix_entities_recursive()

View file

@ -1,369 +0,0 @@
#!/usr/bin/env python3
"""
Generador de videos de noticias a partir de parrillas.
Este script procesa parrillas pendientes y genera videos con TTS.
"""
import os
import sys
import json
import logging
from datetime import datetime
from pathlib import Path
import requests
from db import get_conn
from psycopg2 import extras
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Output locations inside the container volume; created up-front so the
# generation steps can write without checking.
OUTPUT_DIR = Path("/app/data/videos")
AUDIO_DIR = Path("/app/data/audio")
SUBTITLES_DIR = Path("/app/data/subtitles")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
AUDIO_DIR.mkdir(parents=True, exist_ok=True)
SUBTITLES_DIR.mkdir(parents=True, exist_ok=True)

# Base URL of the AllTalk TTS service (override via ALLTALK_URL env var).
ALLTALK_URL = os.getenv("ALLTALK_URL", "http://alltalk:7851")
def obtener_noticias_parrilla(parrilla, conn):
    """Return the news rows to include in the video for this grid.

    Builds a WHERE clause from the grid's optional filters (country,
    category, tagged entity), restricted to the last 24 hours, and joins
    each news item with its completed translation in the grid's language.

    Args:
        parrilla: dict-like row with the grid configuration.
        conn: open psycopg2 connection.

    Returns:
        List of DictCursor rows, newest first, capped at max_noticias.
    """
    with conn.cursor(cursor_factory=extras.DictCursor) as cur:
        where_clauses = []
        params = []
        if parrilla['pais_id']:
            where_clauses.append("n.pais_id = %s")
            params.append(parrilla['pais_id'])
        if parrilla['categoria_id']:
            where_clauses.append("n.categoria_id = %s")
            params.append(parrilla['categoria_id'])
        if parrilla['entidad_nombre']:
            # Match news whose translation carries the requested entity tag.
            where_clauses.append("""
                EXISTS (
                    SELECT 1 FROM tags_noticia tn
                    JOIN tags t ON t.id = tn.tag_id
                    WHERE tn.traduccion_id = tr.id
                    AND t.tipo = %s
                    AND t.valor ILIKE %s
                )
            """)
            params.append(parrilla['entidad_tipo'])
            params.append(f"%{parrilla['entidad_nombre']}%")
        # Only recent news (last 24 hours).
        where_clauses.append("n.fecha >= NOW() - INTERVAL '1 day'")
        where_sql = " AND ".join(where_clauses) if where_clauses else "1=1"
        # Parameter order matters: the JOIN placeholder (language) precedes
        # the WHERE placeholders in the SQL text, and LIMIT comes last.
        cur.execute(f"""
            SELECT
                n.id,
                n.titulo,
                n.imagen_url,
                n.url,
                n.fecha,
                n.fuente_nombre,
                tr.id as traduccion_id,
                tr.titulo_trad,
                tr.resumen_trad,
                p.nombre as pais,
                c.nombre as categoria
            FROM noticias n
            LEFT JOIN traducciones tr ON tr.noticia_id = n.id
                AND tr.lang_to = %s
                AND tr.status = 'done'
            LEFT JOIN paises p ON p.id = n.pais_id
            LEFT JOIN categorias c ON c.id = n.categoria_id
            WHERE {where_sql}
            AND tr.id IS NOT NULL
            ORDER BY n.fecha DESC
            LIMIT %s
        """, [parrilla['idioma_voz']] + params + [parrilla['max_noticias']])
        return cur.fetchall()
def generar_audio_tts(texto, output_path, idioma='es'):
    """Generate narration audio through the AllTalk TTS HTTP API.

    Args:
        texto: full narration script to synthesize.
        output_path: pathlib.Path where the WAV is expected to appear.
        idioma: language code passed to AllTalk (default 'es').

    Returns:
        True when the request succeeded and output_path exists, else False.
    """
    try:
        # Payload for AllTalk's /api/tts-generate endpoint.
        payload = {
            "text_input": texto,
            "text_filtering": "standard",
            "character_voice_gen": "irene2.wav",
            "narrator_enabled": False,
            "narrator_voice_gen": "male_01.wav",
            "text_not_inside": "character",
            "language": idioma,
            "output_file_name": output_path.stem,
            "output_file_timestamp": False,
            "autoplay": False,
            "autoplay_volume": 0.8
        }
        response = requests.post(
            f"{ALLTALK_URL}/api/tts-generate",
            json=payload,
            timeout=60
        )
        response.raise_for_status()
        # NOTE(review): this assumes AllTalk writes its output file into
        # output_path's directory — confirm the service's output folder is
        # mounted/configured to match, otherwise this check always fails.
        if output_path.exists():
            logger.info(f"Audio generado: {output_path}")
            return True
        else:
            logger.error(f"Audio no encontrado después de generación: {output_path}")
            return False
    except Exception as e:
        logger.error(f"Error generating TTS audio: {e}")
        return False
def generar_subtitulos(noticias, output_path):
    """Write an SRT subtitle file for the given news items.

    Each cue shows the (translated) headline; its duration is estimated
    from the narrated word count. Returns True on success, False on error.
    """
    try:
        cursor = 0
        with open(output_path, 'w', encoding='utf-8') as srt:
            for idx, item in enumerate(noticias, 1):
                headline = item['titulo_trad'] or item['titulo']
                summary = item['resumen_trad'] or ''
                # Duration estimate: ~150 narrated words per minute.
                word_count = len((headline + " " + summary).split())
                duration = max(5, word_count / 2.5)  # seconds
                begin = cursor
                finish = cursor + duration
                srt.write(f"{idx}\n")
                srt.write(f"{format_srt_time(begin)} --> {format_srt_time(finish)}\n")
                srt.write(f"{headline}\n\n")
                cursor = finish
        logger.info(f"Subtítulos generados: {output_path}")
        return True
    except Exception as e:
        logger.error(f"Error generating subtitles: {e}")
        return False
def format_srt_time(seconds):
    """Format a (non-negative) seconds value as SRT time: HH:MM:SS,mmm."""
    whole = int(seconds)
    millis = int((seconds % 1) * 1000)
    hours, remainder = divmod(whole, 3600)
    minutes, secs = divmod(remainder, 60)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"
def procesar_parrilla(parrilla_id):
    """Generate one video for the given grid (parrilla).

    Loads the grid config, selects the news, writes the narration script,
    requests TTS audio, optionally writes SRT subtitles, and records the
    result in video_generados / video_noticias.

    Returns:
        True on success, False when the grid is missing/inactive, has no
        news, or generation failed (the video row is then marked 'error').
    """
    logger.info(f"Procesando parrilla {parrilla_id}")
    with get_conn() as conn:
        with conn.cursor(cursor_factory=extras.DictCursor) as cur:
            # Load the grid configuration.
            cur.execute("SELECT * FROM video_parrillas WHERE id = %s", (parrilla_id,))
            parrilla = cur.fetchone()
            if not parrilla or not parrilla['activo']:
                logger.warning(f"Parrilla {parrilla_id} no encontrada o inactiva")
                return False
            # Select the news items to narrate.
            noticias = obtener_noticias_parrilla(parrilla, conn)
            if not noticias:
                logger.warning(f"No hay noticias disponibles para parrilla {parrilla_id}")
                return False
            logger.info(f"Encontradas {len(noticias)} noticias para el video")
            # Create the video row up-front in 'processing' state and commit
            # immediately so its id/state is visible while we work.
            cur.execute("""
                INSERT INTO video_generados (
                    parrilla_id, titulo, descripcion, status, num_noticias
                ) VALUES (
                    %s, %s, %s, 'processing', %s
                ) RETURNING id
            """, (
                parrilla_id,
                f"{parrilla['nombre']} - {datetime.now().strftime('%Y-%m-%d')}",
                f"Noticias de {parrilla['nombre']}",
                len(noticias)
            ))
            video_id = cur.fetchone()[0]
            conn.commit()
            # Per-video working directory for all generated assets.
            video_dir = OUTPUT_DIR / str(video_id)
            video_dir.mkdir(exist_ok=True, parents=True)
            # --- SETUP LOGGING FOR THIS VIDEO ---
            # A dedicated file handler captures this generation's log output.
            log_file = video_dir / "generation.log"
            file_handler = logging.FileHandler(log_file, mode='w')
            file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
            logger.addHandler(file_handler)
            try:
                logger.info(f"Iniciando generación de video {video_id}")
                logger.info(f"Directorio: {video_dir}")
                # Build the narration script: intro, one section per item, outro.
                logger.info("Generando guion narrativo...")
                script_parts = []
                script_parts.append(f"Hola, bienvenidos a {parrilla['nombre']}.")
                script_parts.append(f"Estas son las noticias más importantes de hoy, {datetime.now().strftime('%d de %B de %Y')}.")
                for i, noticia in enumerate(noticias, 1):
                    # Prefer the translated title/summary, fall back to originals.
                    titulo = noticia['titulo_trad'] or noticia['titulo']
                    resumen = noticia['resumen_trad'] or ''
                    script_parts.append(f"Noticia número {i}.")
                    script_parts.append(titulo)
                    if resumen:
                        script_parts.append(resumen[:500])  # cap summary length
                    script_parts.append("")  # blank line = narration pause
                script_parts.append("Esto ha sido todo por hoy. Gracias por su atención.")
                full_script = "\n".join(script_parts)
                # Persist the script next to the other video assets.
                script_path = video_dir / "script.txt"
                with open(script_path, 'w', encoding='utf-8') as f:
                    f.write(full_script)
                # Generate the narration audio via AllTalk.
                logger.info(f"Generando audio TTS con AllTalk en: {ALLTALK_URL}")
                audio_path = video_dir / "audio.wav"
                if not generar_audio_tts(full_script, audio_path, parrilla['idioma_voz']):
                    raise Exception(f"Fallo al generar audio TTS en {ALLTALK_URL}")
                # Optional SRT subtitles.
                if parrilla['include_subtitles']:
                    logger.info("Generando subtítulos SRT...")
                    subtitles_path = video_dir / "subtitles.srt"
                    generar_subtitulos(noticias, subtitles_path)
                else:
                    subtitles_path = None
                # Link every news item to the video, preserving narration order.
                for i, noticia in enumerate(noticias, 1):
                    cur.execute("""
                        INSERT INTO video_noticias (
                            video_id, noticia_id, traduccion_id, orden
                        ) VALUES (%s, %s, %s, %s)
                    """, (video_id, noticia['id'], noticia['traduccion_id'], i))
                # Mark the video completed and store the asset paths.
                cur.execute("""
                    UPDATE video_generados
                    SET status = 'completed',
                        audio_path = %s,
                        subtitles_path = %s,
                        noticias_ids = %s
                    WHERE id = %s
                """, (
                    str(audio_path),
                    str(subtitles_path) if subtitles_path else None,
                    [n['id'] for n in noticias],
                    video_id
                ))
                # Record when this grid last produced a video.
                cur.execute("""
                    UPDATE video_parrillas
                    SET ultima_generacion = NOW()
                    WHERE id = %s
                """, (parrilla_id,))
                conn.commit()
                logger.info(f"Video {video_id} generado exitosamente")
                # Cleanup handler
                logger.removeHandler(file_handler)
                file_handler.close()
                return True
            except Exception as e:
                logger.error(f"Error processing video: {e}", exc_info=True)
                # Mark the video row as failed.
                # NOTE(review): if the exception came from the DB, the
                # transaction may be aborted and this UPDATE could fail too —
                # confirm whether a rollback is needed first.
                cur.execute("""
                    UPDATE video_generados
                    SET status = 'error',
                        error_message = %s
                    WHERE id = %s
                """, (str(e), video_id))
                conn.commit()
                # Cleanup handler
                logger.removeHandler(file_handler)
                file_handler.close()
                return False
def main():
    """Entry point: process the grids that need a new video.

    With a CLI argument, processes that single grid id; without one,
    processes every active daily grid whose last generation was more
    than 24 hours ago (or never ran).
    """
    logger.info("Iniciando generador de videos de noticias")
    with get_conn() as conn:
        with conn.cursor(cursor_factory=extras.DictCursor) as cur:
            # TODO: Implementar lógica de programación automática
            if len(sys.argv) > 1:
                # Manual mode: process one specific grid.
                parrilla_id = int(sys.argv[1])
                procesar_parrilla(parrilla_id)
            else:
                # Batch mode: all active daily grids that are due.
                cur.execute("""
                    SELECT id FROM video_parrillas
                    WHERE activo = true
                    AND frecuencia = 'daily'
                    AND (ultima_generacion IS NULL
                         OR ultima_generacion < NOW() - INTERVAL '1 day')
                    ORDER BY id
                """)
                parrillas = cur.fetchall()
                logger.info(f"Encontradas {len(parrillas)} parrillas para procesar")
                for p in parrillas:
                    try:
                        procesar_parrilla(p['id'])
                    except Exception as e:
                        # One failing grid must not stop the whole batch.
                        logger.error(f"Error procesando parrilla {p['id']}: {e}")
                        continue


if __name__ == "__main__":
    main()

View file

@ -1,244 +0,0 @@
#!/usr/bin/env python3
"""
Script de migración para vectorizar noticias existentes en Qdrant.
Uso:
# Ver estadísticas
python scripts/migrate_to_qdrant.py --stats
# Vectorizar noticias (proceso completo)
python scripts/migrate_to_qdrant.py --vectorize --batch-size 200
# Limpiar y empezar de nuevo
python scripts/migrate_to_qdrant.py --reset
"""
import os
import sys
import argparse
import time
from datetime import datetime
# Añadir el directorio raíz al path
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from db import get_read_conn, get_write_conn
def get_statistics():
    """Print a summary of translation and vectorization state.

    Reports translation counts from PostgreSQL and, when reachable,
    point/vector counts from the Qdrant collection. Qdrant being down is
    tolerated and only reported.
    """
    print("\n" + "=" * 80)
    print("📊 ESTADÍSTICAS DEL SISTEMA")
    print("=" * 80)
    with get_read_conn() as conn:
        with conn.cursor() as cur:
            # Overall translation counts.
            cur.execute("""
                SELECT
                    COUNT(*) as total,
                    COUNT(*) FILTER (WHERE lang_to = 'es') as es,
                    COUNT(*) FILTER (WHERE status = 'done') as completadas
                FROM traducciones
            """)
            row = cur.fetchone()
            print(f"\n📰 TRADUCCIONES:")
            print(f" Total: {row[0]:,}")
            print(f" En español: {row[1]:,}")
            print(f" Completadas: {row[2]:,}")
            # Vectorization progress (Spanish translations only).
            cur.execute("""
                SELECT
                    COUNT(*) as total,
                    COUNT(*) FILTER (WHERE vectorized = TRUE) as vectorizadas,
                    COUNT(*) FILTER (WHERE vectorized = FALSE AND status = 'done') as pendientes
                FROM traducciones
                WHERE lang_to = 'es'
            """)
            row = cur.fetchone()
            print(f"\n🔧 VECTORIZACIÓN:")
            print(f" Total (ES): {row[0]:,}")
            print(f" Vectorizadas: {row[1]:,}")
            print(f" Pendientes: {row[2]:,}")
    # Qdrant collection info (optional; imported lazily so the script works
    # without qdrant_client installed).
    try:
        from qdrant_client import QdrantClient
        qdrant_host = os.environ.get("QDRANT_HOST", "localhost")
        qdrant_port = int(os.environ.get("QDRANT_PORT", "6333"))
        collection_name = os.environ.get("QDRANT_COLLECTION_NAME", "news_vectors")
        client = QdrantClient(host=qdrant_host, port=qdrant_port)
        collection_info = client.get_collection(collection_name)
        print(f"\n🔍 QDRANT:")
        print(f" Colección: {collection_name}")
        print(f" Puntos: {collection_info.points_count:,}")
        print(f" Vectores: {collection_info.vectors_count:,}")
    except Exception as e:
        print(f"\n⚠️ No se pudo conectar a Qdrant: {e}")
    print("\n" + "=" * 80 + "\n")
def vectorize_all(batch_size: int = 200):
    """Vectorize all pending translated news into Qdrant.

    Repeatedly pulls batches of pending rows via the qdrant worker and
    uploads them until none remain, printing progress after each batch
    and a final summary.

    Args:
        batch_size: number of rows fetched and uploaded per batch.
    """
    print("\n" + "=" * 80)
    print("🔍 INICIANDO VECTORIZACIÓN MASIVA")
    print("=" * 80)
    print(f"Tamaño de lote: {batch_size}")
    print("=" * 80 + "\n")
    # Imported lazily so --stats/--reset work without the worker's deps.
    from workers.qdrant_worker import (
        init_qdrant_client,
        init_embedding_model,
        get_pending_news,
        upload_to_qdrant
    )
    # Initialize the client and the embedding model once, up front.
    print("🔌 Inicializando Qdrant...")
    init_qdrant_client()
    print("🤖 Cargando modelo de embeddings...")
    init_embedding_model()
    total_processed = 0
    start_time = time.time()
    while True:
        # Fetch the next pending batch; an empty result means we are done.
        news_batch = get_pending_news(limit=batch_size)
        if not news_batch:
            print("\n✅ No hay más noticias pendientes de vectorizar")
            break
        print(f"\n📋 Procesando lote de {len(news_batch)} noticias...")
        try:
            upload_to_qdrant(news_batch)
            total_processed += len(news_batch)
            elapsed = time.time() - start_time
            rate = total_processed / elapsed if elapsed > 0 else 0
            print(f"\n📊 Progreso: {total_processed:,} vectorizadas")
            print(f"⏱️ Velocidad: {rate:.2f} noticias/segundo")
            print(f"⏳ Tiempo transcurrido: {elapsed/60:.1f} minutos")
        except Exception as e:
            # Stop on the first failed batch; partial progress is kept.
            print(f"❌ Error procesando lote: {e}")
            break
    elapsed = time.time() - start_time
    # Guard the summary division: the loop already guarded elapsed == 0,
    # but the original's final line could still raise ZeroDivisionError.
    avg_rate = total_processed / elapsed if elapsed > 0 else 0.0
    print("\n" + "=" * 80)
    print("✅ VECTORIZACIÓN COMPLETADA")
    print("=" * 80)
    print(f"Total vectorizadas: {total_processed:,}")
    print(f"Tiempo total: {elapsed/60:.1f} minutos")
    print(f"Velocidad promedio: {avg_rate:.2f} noticias/segundo")
    print("=" * 80 + "\n")
def reset_all():
    """
    Wipe all vectorization state after interactive confirmation:
    clear the per-row flags in PostgreSQL, then drop and recreate the
    Qdrant collection so it starts empty.
    """
    banner = "=" * 80
    print("\n" + banner)
    print("⚠️ RESET COMPLETO DEL SISTEMA DE VECTORES")
    print(banner)
    answer = input("\n¿Estás seguro? Esto eliminará TODOS los vectores y reiniciará el estado (s/N): ")
    if answer.lower() != 's':
        print("❌ Operación cancelada")
        return
    print("\n🗑️ Reseteando base de datos...")
    with get_write_conn() as conn:
        with conn.cursor() as cur:
            # Mark every translation as not-yet-vectorized again.
            cur.execute("""
                UPDATE traducciones
                SET vectorized = FALSE,
                    qdrant_point_id = NULL,
                    vectorization_date = NULL
            """)
        conn.commit()
    print("✅ Flags de vectorización reseteados en PostgreSQL")
    # Rebuild the Qdrant collection; failures here are reported but not fatal.
    try:
        from qdrant_client import QdrantClient
        host = os.environ.get("QDRANT_HOST", "localhost")
        port = int(os.environ.get("QDRANT_PORT", "6333"))
        collection = os.environ.get("QDRANT_COLLECTION_NAME", "news_vectors")
        client = QdrantClient(host=host, port=port)
        client.delete_collection(collection)
        print(f"✅ Colección '{collection}' eliminada de Qdrant")
        from qdrant_client.models import Distance, VectorParams
        # 384 dims matches the embedding size used by the vectorization worker.
        client.create_collection(
            collection_name=collection,
            vectors_config=VectorParams(size=384, distance=Distance.COSINE)
        )
        print(f"✅ Colección '{collection}' recreada")
    except Exception as e:
        print(f"⚠️ Error limpiando Qdrant: {e}")
    print("\n✅ Reset completado\n")
def main():
    """CLI entry point: parse flags and dispatch to stats / reset / vectorize."""
    parser = argparse.ArgumentParser(
        description="Script de migración para Qdrant (Directo)",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__
    )
    parser.add_argument("--stats", action="store_true", help="Mostrar estadísticas")
    parser.add_argument("--vectorize", action="store_true", help="Vectorizar noticias traducidas")
    parser.add_argument("--reset", action="store_true", help="Limpiar Qdrant y reiniciar estado")
    parser.add_argument("--batch-size", type=int, default=200, help="Tamaño de lote (default: 200)")
    args = parser.parse_args()
    # With no explicit action, default to showing statistics.
    if not (args.stats or args.vectorize or args.reset):
        args.stats = True
    try:
        # Order matters: report first, destructive reset next, vectorize last.
        if args.stats:
            get_statistics()
        if args.reset:
            reset_all()
        if args.vectorize:
            vectorize_all(batch_size=args.batch_size)
    except KeyboardInterrupt:
        print("\n\n⏹️ Proceso interrumpido por el usuario")
    except Exception as e:
        print(f"\n❌ Error: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()

View file

@ -1,134 +0,0 @@
#!/usr/bin/env python3
"""
Monitor translation quality in real-time.
Shows statistics about translation quality and detects issues.
"""
import os
import sys
import time
import psycopg2
from datetime import datetime, timedelta
from dotenv import load_dotenv
load_dotenv()
# PostgreSQL connection parameters, read from the environment;
# defaults target a local development instance.
DB_CONFIG = {
    "host": os.environ.get("DB_HOST", "localhost"),
    "port": int(os.environ.get("DB_PORT", 5432)),
    "dbname": os.environ.get("DB_NAME", "rss"),
    "user": os.environ.get("DB_USER", "rss"),
    "password": os.environ.get("DB_PASS", ""),
}
def get_stats(conn, hours=24):
    """
    Collect translation statistics for the trailing *hours* window.

    Returns a dict with the per-status counts, the number of 'done' rows
    matching known repetitive-output artifacts, and the five most frequent
    error messages as (error, count) tuples.
    """
    with conn.cursor() as cur:
        # Status breakdown for the window.
        cur.execute("""
            SELECT
                COUNT(*) as total,
                COUNT(CASE WHEN status='done' THEN 1 END) as done,
                COUNT(CASE WHEN status='pending' THEN 1 END) as pending,
                COUNT(CASE WHEN status='processing' THEN 1 END) as processing,
                COUNT(CASE WHEN status='error' THEN 1 END) as errors
            FROM traducciones
            WHERE created_at > NOW() - INTERVAL '%s hours'
        """, (hours,))
        total, done, pending, processing, errors = cur.fetchone()
        # Heuristic scan for degenerate machine-translation patterns.
        cur.execute("""
            SELECT COUNT(*)
            FROM traducciones
            WHERE status='done'
            AND created_at > NOW() - INTERVAL '%s hours'
            AND (
                resumen_trad LIKE '%%la línea de la línea%%'
                OR resumen_trad LIKE '%%de la la %%'
                OR resumen_trad LIKE '%%de Internet de Internet%%'
            )
        """, (hours,))
        repetitive = cur.fetchone()[0]
        # Most frequent error messages in the window.
        cur.execute("""
            SELECT error, COUNT(*) as count
            FROM traducciones
            WHERE status='error'
            AND created_at > NOW() - INTERVAL '%s hours'
            GROUP BY error
            ORDER BY count DESC
            LIMIT 5
        """, (hours,))
        error_details = cur.fetchall()
    return {
        'total': total,
        'done': done,
        'pending': pending,
        'processing': processing,
        'errors': errors,
        'repetitive': repetitive,
        'error_details': error_details,
    }
def print_stats(stats, hours):
    """Pretty-print the quality report built by get_stats()."""
    rule = '=' * 60
    total = max(stats['total'], 1)
    done_floor = max(stats['done'], 1)
    print(f"\n{rule}")
    print(f"📊 Translation Quality Report - Last {hours}h")
    print(f"{rule}")
    print(f"Total Translations: {stats['total']}")
    print(f" ✅ Done: {stats['done']:>6} ({stats['done']/total*100:>5.1f}%)")
    print(f" ⏳ Pending: {stats['pending']:>6} ({stats['pending']/total*100:>5.1f}%)")
    print(f" 🔄 Processing: {stats['processing']:>6} ({stats['processing']/total*100:>5.1f}%)")
    print(f" ❌ Errors: {stats['errors']:>6} ({stats['errors']/total*100:>5.1f}%)")
    print(f"\n🔍 Quality Issues:")
    print(f" ⚠️ Repetitive: {stats['repetitive']:>6} ({stats['repetitive']/done_floor*100:>5.1f}% of done)")
    if stats['error_details']:
        print(f"\n📋 Top Error Messages:")
        for error, count in stats['error_details']:
            # Truncate long messages; a NULL error renders as 'Unknown'.
            if error and len(error) > 50:
                error_short = error[:50] + '...'
            else:
                error_short = error or 'Unknown'
            print(f"{error_short}: {count}")
    # Score = share of finished translations that are not repetitive.
    if stats['done'] > 0:
        quality_score = (1 - stats['repetitive'] / stats['done']) * 100
        if quality_score > 95:
            quality_emoji = "🟢"
        elif quality_score > 90:
            quality_emoji = "🟡"
        else:
            quality_emoji = "🔴"
        print(f"\n{quality_emoji} Quality Score: {quality_score:.1f}%")
    print(f"{rule}\n")
def main():
    """Parse CLI flags and run a one-shot or continuous quality report."""
    import argparse
    parser = argparse.ArgumentParser(description='Monitor translation quality')
    parser.add_argument('--hours', type=int, default=24, help='Hours to look back (default: 24)')
    parser.add_argument('--watch', action='store_true', help='Continuous monitoring mode')
    parser.add_argument('--interval', type=int, default=60, help='Update interval in seconds (default: 60)')
    args = parser.parse_args()
    conn = psycopg2.connect(**DB_CONFIG)
    try:
        if not args.watch:
            # One-shot report.
            print_stats(get_stats(conn, args.hours), args.hours)
        else:
            print("🔄 Starting continuous monitoring (Ctrl+C to stop)...")
            while True:
                snapshot = get_stats(conn, args.hours)
                print(f"\033[2J\033[H")  # ANSI clear-screen + cursor-home
                print(f"Last updated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
                print_stats(snapshot, args.hours)
                time.sleep(args.interval)
    except KeyboardInterrupt:
        print("\n\n👋 Monitoring stopped")
    finally:
        conn.close()


if __name__ == "__main__":
    main()

View file

@ -1,70 +0,0 @@
import logging
import sys
import os
from concurrent.futures import ThreadPoolExecutor
# Add app to path
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from db import get_read_conn
from utils.wiki import fetch_wiki_data
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
def get_top_entities():
    """
    Return the most-tagged entity names from the last 30 days:
    up to 100 people, 50 organizations and 50 places, deduplicated.
    """
    query = """
        SELECT t.valor, COUNT(*) as c
        FROM tags t
        JOIN tags_noticia tn ON t.id = tn.tag_id
        JOIN traducciones tr ON tn.traduccion_id = tr.id
        WHERE tr.created_at > NOW() - INTERVAL '30 days'
        AND t.tipo = %s
        GROUP BY t.valor
        ORDER BY c DESC
        LIMIT %s
    """
    found = []
    try:
        with get_read_conn() as conn:
            with conn.cursor() as cur:
                # One ranked query per entity type, each with its own cap.
                for tipo, cap in (('persona', 100), ('organizacion', 50), ('lugar', 50)):
                    cur.execute(query, (tipo, cap))
                    found.extend(row[0] for row in cur.fetchall())
    except Exception as e:
        logger.error(f"Error fetching top entities: {e}")
    # Deduplicate across types; ordering is not preserved.
    return list(set(found))
def precache_entity(name):
    """Fetch (and thereby cache) wiki image/summary data for one entity."""
    try:
        image, summary = fetch_wiki_data(name)
        if not (image or summary):
            logger.info(f"✗ No data for: {name}")
        else:
            logger.info(f"✓ Cached: {name}")
    except Exception as exc:
        logger.error(f"Error caching {name}: {exc}")
def run_precache():
    """Warm the wiki cache for the current top entities, ten fetches at a time."""
    logger.info("Starting entity pre-cache...")
    targets = get_top_entities()
    logger.info(f"Found {len(targets)} unique top entities to cache.")
    # I/O-bound work: a small thread pool overlaps the HTTP fetches.
    with ThreadPoolExecutor(max_workers=10) as pool:
        pool.map(precache_entity, targets)
    logger.info("Pre-cache complete.")


if __name__ == "__main__":
    run_precache()

View file

@ -1,44 +0,0 @@
import os
import psycopg2
import logging
from datetime import datetime
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s: %(message)s")
logger = logging.getLogger("recover_system")
# PostgreSQL connection parameters, read from the environment.
# NOTE(review): fallback password "x" differs from the "" default used by
# sibling scripts — confirm which is intended.
DB_CONFIG = {
    "host": os.environ.get("DB_HOST", "localhost"),
    "port": int(os.environ.get("DB_PORT", 5432)),
    "dbname": os.environ.get("DB_NAME", "rss"),
    "user": os.environ.get("DB_USER", "rss"),
    "password": os.environ.get("DB_PASS", "x"),
}
def recover():
    """
    One-shot maintenance pass over the RSS database:

    1. Requeue translations stuck in 'processing' back to 'pending'.
    2. Clamp future-dated news rows to the current UTC time.
    3. Reactivate disabled feeds with fewer than 30 accumulated failures.

    Errors are logged, never raised; the connection is always closed.
    """
    conn = None
    try:
        conn = psycopg2.connect(**DB_CONFIG)
        conn.autocommit = True  # each statement takes effect immediately
        with conn.cursor() as cur:
            # 1. Reset stuck translations so workers pick them up again.
            logger.info("Resetting stuck 'processing' translations to 'pending'...")
            cur.execute("UPDATE traducciones SET status = 'pending' WHERE status = 'processing';")
            logger.info(f"Reset {cur.rowcount} translations.")
            # 2. Correct future-dated news.
            # NOTE(review): utcnow() is naive (and deprecated in 3.12); this
            # assumes 'fecha' is a naive-UTC timestamp column — confirm before
            # switching to timezone-aware datetimes.
            logger.info("Correcting future-dated news...")
            now = datetime.utcnow()
            cur.execute("UPDATE noticias SET fecha = %s WHERE fecha > %s;", (now, now))
            logger.info(f"Corrected {cur.rowcount} news items.")
            # 3. Give feeds with fewer than 30 failures another chance.
            # (Log message fixed: the predicate covers 0-29 failures, not 10-29.)
            logger.info("Reactivating feeds with fewer than 30 failures (giving them another chance)...")
            cur.execute("UPDATE feeds SET activo = TRUE, fallos = 0 WHERE activo = FALSE AND fallos < 30;")
            logger.info(f"Reactivated {cur.rowcount} feeds.")
        logger.info("Recovery complete!")
    except Exception as e:
        logger.error(f"Error during recovery: {e}")
    finally:
        # Fix: the original leaked the connection whenever a statement raised,
        # because close() only ran on the success path.
        if conn is not None:
            conn.close()


if __name__ == "__main__":
    recover()

View file

@ -1,140 +0,0 @@
#!/usr/bin/env python3
"""
Script de prueba para el LLM Categorizer
Prueba la categorización con datos de ejemplo sin necesidad del contenedor
"""
import os
import sys
# Datos de prueba
# Five hand-written Spanish sample items, one per expected category
# (politics/economy, health, sports, technology, international), used by
# both the keyword baseline and the LLM categorization test.
TEST_NEWS = [
    {
        'id': 'test_1',
        'titulo': 'El gobierno anuncia nuevas medidas económicas para combatir la inflación',
        'resumen': 'El presidente del gobierno ha presentado un paquete de medidas económicas destinadas a reducir la inflación y proteger el poder adquisitivo de las familias.'
    },
    {
        'id': 'test_2',
        'titulo': 'Nueva vacuna contra el cáncer muestra resultados prometedores',
        'resumen': 'Investigadores de la Universidad de Stanford han desarrollado una vacuna experimental que ha mostrado una eficacia del 85% en ensayos clínicos con pacientes con melanoma.'
    },
    {
        'id': 'test_3',
        'titulo': 'El Real Madrid gana la Champions League por decimoquinta vez',
        'resumen': 'El equipo blanco se impuso por 2-1 en la final celebrada en Wembley, consolidándose como el club más laureado de la competición europea.'
    },
    {
        'id': 'test_4',
        'titulo': 'OpenAI lanza GPT-5 con capacidades multimodales mejoradas',
        'resumen': 'La nueva versión del modelo de lenguaje incorpora mejor comprensión de imágenes, video y audio, además de un razonamiento más avanzado.'
    },
    {
        'id': 'test_5',
        'titulo': 'Crisis diplomática entre Estados Unidos y China por aranceles',
        'resumen': 'Las tensiones comerciales se intensifican después de que Washington impusiera nuevos aranceles del 25% a productos tecnológicos chinos.'
    }
]
def test_without_llm():
    """Keyword-overlap categorization baseline (no model required)."""
    print("=== Prueba de Categorización Básica (sin LLM) ===\n")
    # Minimal keyword lists per category; matching is plain substring search
    # on the lowercased title + summary.
    CATEGORIES_KEYWORDS = {
        'Política': ['gobierno', 'presidente', 'político', 'parlamento', 'elecciones'],
        'Economía': ['económic', 'inflación', 'aranceles', 'bolsa', 'financiero'],
        'Salud': ['vacuna', 'hospital', 'médico', 'tratamiento', 'enfermedad'],
        'Deportes': ['fútbol', 'champions', 'equipo', 'partido', 'gana'],
        'Tecnología': ['tecnológic', 'digital', 'software', 'ai', 'gpt', 'openai'],
        'Internacional': ['estados unidos', 'china', 'rusia', 'diplomática', 'crisis'],
    }
    for item in TEST_NEWS:
        haystack = (item['titulo'] + ' ' + item['resumen']).lower()
        # Score each category by keyword hits; first maximum wins,
        # and a zero top score falls back to 'Otros'.
        scores = {
            cat: sum(1 for kw in kws if kw in haystack)
            for cat, kws in CATEGORIES_KEYWORDS.items()
        }
        winner = max(scores, key=scores.get)
        top = scores[winner]
        if top == 0:
            winner = 'Otros'
        print(f"ID: {item['id']}")
        print(f"Título: {item['titulo']}")
        print(f"Categoría: {winner} (score: {top})")
        print()
def test_with_llm():
    """
    Exercise the real LLM categorizer against TEST_NEWS.

    Requires the quantized model to be downloaded (see
    scripts/download_llm_model.sh) and the exllamav2 package installed;
    bails out early with instructions otherwise.
    """
    print("\n=== Prueba de Categorización con LLM ===\n")
    # Model location is configurable; the default matches the deployment host.
    MODEL_PATH = os.environ.get("LLM_MODEL_PATH", "/home/x/rss2/models/llm")
    if not os.path.exists(MODEL_PATH):
        print(f"❌ Error: No se encuentra el modelo en {MODEL_PATH}")
        print(f"Por favor ejecuta primero: ./scripts/download_llm_model.sh")
        return
    # Verify the inference backend is importable before loading anything heavy.
    try:
        import exllamav2
        print(f"✓ ExLlamaV2 instalado: {exllamav2.__version__}")
    except ImportError:
        print("❌ Error: ExLlamaV2 no está instalado")
        print("Instalar con: pip install exllamav2")
        return
    # Make the project root importable so the worker module resolves.
    sys.path.insert(0, '/home/x/rss2')
    from workers.llm_categorizer_worker import ExLlamaV2Categorizer
    print(f"Cargando modelo desde: {MODEL_PATH}")
    print("(Esto puede tardar unos minutos...)\n")
    try:
        # Loading the model is the slow step (GPU allocation + weights).
        categorizer = ExLlamaV2Categorizer(MODEL_PATH)
        print("✓ Modelo cargado exitosamente\n")
        results = categorizer.categorize_news(TEST_NEWS)
        print("\n=== Resultados ===\n")
        # Results are positional: results[i] corresponds to TEST_NEWS[i].
        for i, news in enumerate(TEST_NEWS):
            result = results[i]
            print(f"ID: {news['id']}")
            print(f"Título: {news['titulo']}")
            print(f"Categoría: {result['categoria']}")
            print(f"Confianza: {result['confianza']:.2f}")
            print()
    except Exception as e:
        print(f"❌ Error: {e}")
        import traceback
        traceback.print_exc()
def main():
    """Run the keyword baseline, then optionally the real LLM test."""
    header = "=" * 60
    print(header)
    print("Script de Prueba del LLM Categorizer")
    print(header)
    print()
    # The keyword baseline has no dependencies, so it always runs.
    test_without_llm()
    # The LLM pass loads a model onto the GPU, so ask before proceeding.
    print("\n¿Deseas probar con el LLM real? (requiere modelo descargado)")
    print("Esto cargará el modelo en GPU y puede tardar varios minutos.")
    answer = input("Continuar? [s/N]: ").strip().lower()
    if answer not in ('s', 'si', 'y', 'yes'):
        print("\nPrueba finalizada. Para probar con el LLM:")
        print("1. Descarga el modelo: ./scripts/download_llm_model.sh")
        print("2. Ejecuta este script de nuevo y acepta probar con LLM")
    else:
        test_with_llm()


if __name__ == "__main__":
    main()

View file

@ -1,95 +0,0 @@
#!/usr/bin/env python3
"""
Script de diagnóstico para verificar la conectividad con Qdrant.
Ejecutar desde el contenedor rss2_web para diagnosticar problemas de red.
"""
import os
import sys
def test_qdrant_connection():
    """
    Run a three-stage Qdrant connectivity diagnostic (DNS, TCP, client API),
    printing findings as it goes.

    Returns:
        bool: True when the Qdrant client connected and listed collections;
        False at the first failed stage.
    """
    # Target comes from the environment, with docker-compose-friendly defaults.
    qdrant_host = os.environ.get("QDRANT_HOST", "localhost")
    qdrant_port = int(os.environ.get("QDRANT_PORT", "6333"))
    print("=" * 60)
    print("🔍 DIAGNÓSTICO DE CONEXIÓN QDRANT")
    print("=" * 60)
    print(f"Host: {qdrant_host}")
    print(f"Port: {qdrant_port}")
    print()
    # 1. DNS resolution: catches wrong service names / missing docker network.
    print("1⃣ Probando resolución DNS...")
    try:
        import socket
        ip = socket.gethostbyname(qdrant_host)
        print(f" ✅ Host '{qdrant_host}' resuelve a: {ip}")
    except Exception as e:
        print(f" ❌ ERROR: No se pudo resolver '{qdrant_host}': {e}")
        return False
    # 2. Raw TCP connect: distinguishes "host exists" from "port reachable".
    print("\n2⃣ Probando conectividad TCP...")
    try:
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        sock.settimeout(5)
        result = sock.connect_ex((ip, qdrant_port))  # 0 means the port accepted
        sock.close()
        if result == 0:
            print(f" ✅ Puerto {qdrant_port} está abierto")
        else:
            print(f" ❌ ERROR: Puerto {qdrant_port} está cerrado o inaccesible")
            return False
    except Exception as e:
        print(f" ❌ ERROR en test TCP: {e}")
        return False
    # 3. Real client handshake: list collections and per-collection point counts.
    print("\n3⃣ Probando cliente Qdrant...")
    try:
        from qdrant_client import QdrantClient
        client = QdrantClient(host=qdrant_host, port=qdrant_port, timeout=5)
        collections = client.get_collections()
        print(f" ✅ Cliente Qdrant conectado exitosamente")
        print(f" 📊 Colecciones disponibles: {[c.name for c in collections.collections]}")
        # Per-collection info is best-effort; a failure here is not fatal.
        for collection in collections.collections:
            try:
                info = client.get_collection(collection.name)
                print(f" 📁 {collection.name}: {info.points_count} vectores")
            except Exception as e:
                print(f" ⚠️ No se pudo obtener info de {collection.name}: {e}")
        return True
    except Exception as e:
        print(f" ❌ ERROR en cliente Qdrant: {e}")
        import traceback
        traceback.print_exc()
        return False
print("\n" + "=" * 60)
if __name__ == "__main__":
    # Exit code mirrors the diagnostic result so this works in health checks.
    success = test_qdrant_connection()
    if not success:
        print("\n❌ DIAGNÓSTICO FALLIDO: Problemas de conectividad con Qdrant")
        print("\n💡 SOLUCIONES POSIBLES:")
        print(" 1. Verificar que el contenedor 'qdrant' esté corriendo:")
        print(" docker ps | grep qdrant")
        print(" 2. Verificar que ambos contenedores estén en la misma red:")
        print(" docker network inspect rss2_default")
        print(" 3. Reiniciar el contenedor de Qdrant:")
        print(" docker restart rss2_qdrant")
        print(" 4. Verificar variables de entorno QDRANT_HOST y QDRANT_PORT")
        sys.exit(1)
    print("\n✅ DIAGNÓSTICO EXITOSO: Qdrant está accesible")
    sys.exit(0)

View file

@ -1,54 +0,0 @@
import sys
import os
# Add app to path
sys.path.append('/home/x/rss2')
# Sanity-check that project modules and drivers import before running any
# checks; bail out early with the offending error otherwise.
try:
    from db import get_conn, get_read_conn, get_write_conn
    from cache import get_redis
    import psycopg2
    print("Imports successful.")  # fixed typo: was "successfull"
except ImportError as e:
    print(f"Import failed: {e}")
    sys.exit(1)
def test_db():
    """Verify that the primary and the replica each accept a trivial query."""
    print("\n--- Testing Database Connections ---")

    def _ping(open_conn):
        # Raises if the endpoint is unreachable or rejects the query.
        with open_conn() as conn:
            with conn.cursor() as cur:
                cur.execute("SELECT 1")

    print("Testing Primary (Write) Connection...")
    try:
        _ping(get_write_conn)
        print(" [OK] Primary reachable.")
    except Exception as e:
        print(f" [FAIL] Primary unreachable: {e}")
    print("Testing Replica (Read) Connection...")
    try:
        _ping(get_read_conn)
        # Connectivity only — read-only mode on the replica is not asserted.
        print(" [OK] Replica reachable.")
    except Exception as e:
        print(f" [FAIL] Replica unreachable: {e}")
def test_redis():
    """Verify that Redis answers a PING."""
    print("\n--- Testing Redis Connection ---")
    try:
        client = get_redis()
        if not client:
            print(" [FAIL] Redis client returned None (likely connection failed).")
        else:
            client.ping()
            print(" [OK] Redis reachable.")
    except Exception as e:
        print(f" [FAIL] Redis error: {e}")
if __name__ == "__main__":
    # Run every connectivity check in order, then report completion.
    for check in (test_db, test_redis):
        check()
    print("\nVerification complete.")