Cambios en la búsqueda AJAX y correcciones en traducciones

This commit is contained in:
jlimolina 2026-01-28 11:20:19 +01:00
parent 95adc07f37
commit 47a252e339
9 changed files with 1152 additions and 449 deletions

View file

@ -0,0 +1,108 @@
#!/usr/bin/env python3
"""
Script to detect and clean repetitive/low-quality translations.
Run this periodically or as a maintenance task.
"""
import os
import re
import sys
import psycopg2
from psycopg2.extras import execute_values
from dotenv import load_dotenv
load_dotenv()
# PostgreSQL connection settings, read from the environment (populated from
# .env by load_dotenv above). DB_PASS falls back to an empty string, so it
# must be set explicitly in any real deployment.
DB_CONFIG = {
"host": os.environ.get("DB_HOST", "localhost"),
"port": int(os.environ.get("DB_PORT", 5432)),
"dbname": os.environ.get("DB_NAME", "rss"),
"user": os.environ.get("DB_USER", "rss"),
"password": os.environ.get("DB_PASS", ""),
}
def is_repetitive(text: str, threshold: float = 0.25) -> bool:
    """Check if text has repetitive patterns or low word diversity."""
    # Missing or very short text (under 50 chars) cannot be judged reliably.
    if not text or len(text) < 50:
        return False

    # Obvious repetition: generic backreference patterns plus literal bad
    # phrases known to be produced by the translator.
    known_bad = (
        r'(\b\w+\b)( \1){3,}',      # Same word repeated 4+ times
        r'(\b\w+ \w+\b)( \1){2,}',  # Same 2-word phrase repeated 3+ times
        r'de la la ',
        r'la línea de la línea',
        r'de Internet de Internet',
        r'de la de la',
        r'en el en el',
    )
    if any(re.search(pat, text, re.IGNORECASE) for pat in known_bad):
        return True

    # Low vocabulary diversity is a second repetition signal, but it is only
    # meaningful once there are enough words to measure a ratio.
    tokens = text.lower().split()
    if len(tokens) < 10:
        return False
    return len(set(tokens)) / len(tokens) < threshold
def main():
    """Scan all completed translations, flag repetitive ones and reset them.

    Flagged rows are set back to status='pending' with their translated
    fields cleared, so the translation worker picks them up again on its
    next pass. Commits only when something was actually reset.
    """
    print("🔍 Scanning for repetitive translations...")
    conn = psycopg2.connect(**DB_CONFIG)
    try:
        with conn.cursor() as cur:
            # Fetch all done translations
            cur.execute("""
                SELECT id, titulo_trad, resumen_trad
                FROM traducciones
                WHERE status='done'
            """)
            rows = cur.fetchall()

            total = len(rows)
            print(f"📊 Checking {total} translations...")

            bad_ids = [tr_id for tr_id, titulo, resumen in rows
                       if is_repetitive(titulo) or is_repetitive(resumen)]

            # max(total, 1) guards against ZeroDivisionError when the table
            # has no 'done' rows (the original divided by total directly).
            print(f"❌ Found {len(bad_ids)} repetitive translations ({len(bad_ids)/max(total, 1)*100:.2f}%)")

            if bad_ids:
                # Show samples
                cur.execute("""
                    SELECT id, LEFT(resumen_trad, 150) as sample
                    FROM traducciones
                    WHERE id = ANY(%s)
                    LIMIT 5
                """, (bad_ids,))
                print("\n📝 Sample bad translations:")
                for row in cur.fetchall():
                    print(f" ID {row[0]}: {row[1]}...")

                # Reset to pending so the worker retries them; keep a marker
                # in `error` explaining why the row was cleaned.
                print(f"\n🔄 Resetting {len(bad_ids)} translations to pending...")
                cur.execute("""
                    UPDATE traducciones
                    SET status='pending',
                        titulo_trad=NULL,
                        resumen_trad=NULL,
                        error='Repetitive output - auto-cleaned'
                    WHERE id = ANY(%s)
                """, (bad_ids,))
                conn.commit()
                print(f"✅ Successfully reset {len(bad_ids)} translations")
            else:
                print("✅ No repetitive translations found!")
    finally:
        # Always release the connection, even if a query raised.
        conn.close()
    print("\n✨ Cleanup complete!")


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,134 @@
#!/usr/bin/env python3
"""
Monitor translation quality in real-time.
Shows statistics about translation quality and detects issues.
"""
import os
import sys
import time
import psycopg2
from datetime import datetime, timedelta
from dotenv import load_dotenv
load_dotenv()
# PostgreSQL connection settings, read from the environment (populated from
# .env by load_dotenv above). DB_PASS falls back to an empty string, so it
# must be set explicitly in any real deployment.
DB_CONFIG = {
"host": os.environ.get("DB_HOST", "localhost"),
"port": int(os.environ.get("DB_PORT", 5432)),
"dbname": os.environ.get("DB_NAME", "rss"),
"user": os.environ.get("DB_USER", "rss"),
"password": os.environ.get("DB_PASS", ""),
}
def get_stats(conn, hours=24):
    """Get translation statistics for the last N hours.

    Args:
        conn: open psycopg2 connection.
        hours: size of the look-back window, in hours.

    Returns:
        dict with per-status totals, a count of translations matching known
        repetitive phrases, and the top 5 error messages with counts.
    """
    with conn.cursor() as cur:
        # NOTE: psycopg2 placeholders must never appear inside quoted SQL
        # literals (the original used INTERVAL '%s hours', which only worked
        # by accident for ints). Build the interval arithmetically instead:
        # NOW() - %s * INTERVAL '1 hour'.
        cur.execute("""
            SELECT
                COUNT(*) as total,
                COUNT(CASE WHEN status='done' THEN 1 END) as done,
                COUNT(CASE WHEN status='pending' THEN 1 END) as pending,
                COUNT(CASE WHEN status='processing' THEN 1 END) as processing,
                COUNT(CASE WHEN status='error' THEN 1 END) as errors
            FROM traducciones
            WHERE created_at > NOW() - %s * INTERVAL '1 hour'
        """, (hours,))
        stats = cur.fetchone()

        # Check for repetitive patterns in recent translations
        # (%% is a literal % inside a LIKE pattern for psycopg2).
        cur.execute("""
            SELECT COUNT(*)
            FROM traducciones
            WHERE status='done'
              AND created_at > NOW() - %s * INTERVAL '1 hour'
              AND (
                  resumen_trad LIKE '%%la línea de la línea%%'
                  OR resumen_trad LIKE '%%de la la %%'
                  OR resumen_trad LIKE '%%de Internet de Internet%%'
              )
        """, (hours,))
        repetitive = cur.fetchone()[0]

        # Most frequent error messages in the window.
        cur.execute("""
            SELECT error, COUNT(*) as count
            FROM traducciones
            WHERE status='error'
              AND created_at > NOW() - %s * INTERVAL '1 hour'
            GROUP BY error
            ORDER BY count DESC
            LIMIT 5
        """, (hours,))
        errors = cur.fetchall()

    return {
        'total': stats[0],
        'done': stats[1],
        'pending': stats[2],
        'processing': stats[3],
        'errors': stats[4],
        'repetitive': repetitive,
        'error_details': errors,
    }
def print_stats(stats, hours):
    """Pretty print statistics."""
    bar = '=' * 60
    # Denominator guarded against an empty window.
    total_den = max(stats['total'], 1)

    print(f"\n{bar}")
    print(f"📊 Translation Quality Report - Last {hours}h")
    print(bar)
    print(f"Total Translations: {stats['total']}")

    # One line per status, each with its share of the window total.
    for label, key in (
        ("✅ Done:", 'done'),
        ("⏳ Pending:", 'pending'),
        ("🔄 Processing:", 'processing'),
        ("❌ Errors:", 'errors'),
    ):
        share = stats[key] / total_den * 100
        print(f" {label} {stats[key]:>6} ({share:>5.1f}%)")

    print("\n🔍 Quality Issues:")
    rep = stats['repetitive']
    rep_share = rep / max(stats['done'], 1) * 100
    print(" ⚠️ Repetitive: {:>6} ({:>5.1f}% of done)".format(rep, rep_share))

    details = stats['error_details']
    if details:
        print("\n📋 Top Error Messages:")
        for error, count in details:
            if not error:
                error_short = 'Unknown'
            elif len(error) > 50:
                error_short = error[:50] + '...'
            else:
                error_short = error
            print(f"{error_short}: {count}")

    # Quality score: fraction of done translations that are not repetitive.
    done = stats['done']
    if done > 0:
        quality_score = (1 - rep / done) * 100
        if quality_score > 95:
            quality_emoji = "🟢"
        elif quality_score > 90:
            quality_emoji = "🟡"
        else:
            quality_emoji = "🔴"
        print(f"\n{quality_emoji} Quality Score: {quality_score:.1f}%")
    print(f"{bar}\n")
def main():
    """Entry point: parse CLI flags and run a one-shot or continuous report."""
    import argparse

    parser = argparse.ArgumentParser(description='Monitor translation quality')
    parser.add_argument('--hours', type=int, default=24,
                        help='Hours to look back (default: 24)')
    parser.add_argument('--watch', action='store_true',
                        help='Continuous monitoring mode')
    parser.add_argument('--interval', type=int, default=60,
                        help='Update interval in seconds (default: 60)')
    args = parser.parse_args()

    conn = psycopg2.connect(**DB_CONFIG)
    try:
        if not args.watch:
            # One-shot report and exit.
            print_stats(get_stats(conn, args.hours), args.hours)
        else:
            print("🔄 Starting continuous monitoring (Ctrl+C to stop)...")
            while True:
                snapshot = get_stats(conn, args.hours)
                print("\033[2J\033[H")  # Clear screen
                print(f"Last updated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
                print_stats(snapshot, args.hours)
                time.sleep(args.interval)
    except KeyboardInterrupt:
        print("\n\n👋 Monitoring stopped")
    finally:
        conn.close()


if __name__ == "__main__":
    main()