Cambios en la búsqueda AJAX y correcciones en traducciones

This commit is contained in:
jlimolina 2026-01-28 11:20:19 +01:00
parent 95adc07f37
commit 47a252e339
9 changed files with 1152 additions and 449 deletions

View file

@ -0,0 +1,108 @@
#!/usr/bin/env python3
"""
Script to detect and clean repetitive/low-quality translations.
Run this periodically or as a maintenance task.
"""
import os
import re
import sys
import psycopg2
from psycopg2.extras import execute_values
from dotenv import load_dotenv
load_dotenv()
# PostgreSQL connection settings, read from the environment (populated from
# .env by load_dotenv above). DB_PASS falls back to an empty string, so it
# must be set explicitly in any real deployment.
DB_CONFIG = {
"host": os.environ.get("DB_HOST", "localhost"),
"port": int(os.environ.get("DB_PORT", 5432)),
"dbname": os.environ.get("DB_NAME", "rss"),
"user": os.environ.get("DB_USER", "rss"),
"password": os.environ.get("DB_PASS", ""),
}
def is_repetitive(text: str, threshold: float = 0.25) -> bool:
    """Check if text has repetitive patterns or low word diversity."""
    # Missing or very short text (under 50 chars) cannot be judged reliably.
    if not text or len(text) < 50:
        return False

    # Obvious repetition: generic backreference patterns plus literal bad
    # phrases known to be produced by the translator.
    known_bad = (
        r'(\b\w+\b)( \1){3,}',      # Same word repeated 4+ times
        r'(\b\w+ \w+\b)( \1){2,}',  # Same 2-word phrase repeated 3+ times
        r'de la la ',
        r'la línea de la línea',
        r'de Internet de Internet',
        r'de la de la',
        r'en el en el',
    )
    if any(re.search(pat, text, re.IGNORECASE) for pat in known_bad):
        return True

    # Low vocabulary diversity is a second repetition signal, but it is only
    # meaningful once there are enough words to measure a ratio.
    tokens = text.lower().split()
    if len(tokens) < 10:
        return False
    return len(set(tokens)) / len(tokens) < threshold
def main():
    """Scan all completed translations, flag repetitive ones and reset them.

    Flagged rows are set back to status='pending' with their translated
    fields cleared, so the translation worker picks them up again on its
    next pass. Commits only when something was actually reset.
    """
    print("🔍 Scanning for repetitive translations...")
    conn = psycopg2.connect(**DB_CONFIG)
    try:
        with conn.cursor() as cur:
            # Fetch all done translations
            cur.execute("""
                SELECT id, titulo_trad, resumen_trad
                FROM traducciones
                WHERE status='done'
            """)
            rows = cur.fetchall()

            total = len(rows)
            print(f"📊 Checking {total} translations...")

            bad_ids = [tr_id for tr_id, titulo, resumen in rows
                       if is_repetitive(titulo) or is_repetitive(resumen)]

            # max(total, 1) guards against ZeroDivisionError when the table
            # has no 'done' rows (the original divided by total directly).
            print(f"❌ Found {len(bad_ids)} repetitive translations ({len(bad_ids)/max(total, 1)*100:.2f}%)")

            if bad_ids:
                # Show samples
                cur.execute("""
                    SELECT id, LEFT(resumen_trad, 150) as sample
                    FROM traducciones
                    WHERE id = ANY(%s)
                    LIMIT 5
                """, (bad_ids,))
                print("\n📝 Sample bad translations:")
                for row in cur.fetchall():
                    print(f" ID {row[0]}: {row[1]}...")

                # Reset to pending so the worker retries them; keep a marker
                # in `error` explaining why the row was cleaned.
                print(f"\n🔄 Resetting {len(bad_ids)} translations to pending...")
                cur.execute("""
                    UPDATE traducciones
                    SET status='pending',
                        titulo_trad=NULL,
                        resumen_trad=NULL,
                        error='Repetitive output - auto-cleaned'
                    WHERE id = ANY(%s)
                """, (bad_ids,))
                conn.commit()
                print(f"✅ Successfully reset {len(bad_ids)} translations")
            else:
                print("✅ No repetitive translations found!")
    finally:
        # Always release the connection, even if a query raised.
        conn.close()
    print("\n✨ Cleanup complete!")


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,134 @@
#!/usr/bin/env python3
"""
Monitor translation quality in real-time.
Shows statistics about translation quality and detects issues.
"""
import os
import sys
import time
import psycopg2
from datetime import datetime, timedelta
from dotenv import load_dotenv
load_dotenv()
# PostgreSQL connection settings, read from the environment (populated from
# .env by load_dotenv above). DB_PASS falls back to an empty string, so it
# must be set explicitly in any real deployment.
DB_CONFIG = {
"host": os.environ.get("DB_HOST", "localhost"),
"port": int(os.environ.get("DB_PORT", 5432)),
"dbname": os.environ.get("DB_NAME", "rss"),
"user": os.environ.get("DB_USER", "rss"),
"password": os.environ.get("DB_PASS", ""),
}
def get_stats(conn, hours=24):
    """Get translation statistics for the last N hours.

    Args:
        conn: open psycopg2 connection.
        hours: size of the look-back window, in hours.

    Returns:
        dict with per-status totals, a count of translations matching known
        repetitive phrases, and the top 5 error messages with counts.
    """
    with conn.cursor() as cur:
        # NOTE: psycopg2 placeholders must never appear inside quoted SQL
        # literals (the original used INTERVAL '%s hours', which only worked
        # by accident for ints). Build the interval arithmetically instead:
        # NOW() - %s * INTERVAL '1 hour'.
        cur.execute("""
            SELECT
                COUNT(*) as total,
                COUNT(CASE WHEN status='done' THEN 1 END) as done,
                COUNT(CASE WHEN status='pending' THEN 1 END) as pending,
                COUNT(CASE WHEN status='processing' THEN 1 END) as processing,
                COUNT(CASE WHEN status='error' THEN 1 END) as errors
            FROM traducciones
            WHERE created_at > NOW() - %s * INTERVAL '1 hour'
        """, (hours,))
        stats = cur.fetchone()

        # Check for repetitive patterns in recent translations
        # (%% is a literal % inside a LIKE pattern for psycopg2).
        cur.execute("""
            SELECT COUNT(*)
            FROM traducciones
            WHERE status='done'
              AND created_at > NOW() - %s * INTERVAL '1 hour'
              AND (
                  resumen_trad LIKE '%%la línea de la línea%%'
                  OR resumen_trad LIKE '%%de la la %%'
                  OR resumen_trad LIKE '%%de Internet de Internet%%'
              )
        """, (hours,))
        repetitive = cur.fetchone()[0]

        # Most frequent error messages in the window.
        cur.execute("""
            SELECT error, COUNT(*) as count
            FROM traducciones
            WHERE status='error'
              AND created_at > NOW() - %s * INTERVAL '1 hour'
            GROUP BY error
            ORDER BY count DESC
            LIMIT 5
        """, (hours,))
        errors = cur.fetchall()

    return {
        'total': stats[0],
        'done': stats[1],
        'pending': stats[2],
        'processing': stats[3],
        'errors': stats[4],
        'repetitive': repetitive,
        'error_details': errors,
    }
def print_stats(stats, hours):
    """Pretty print statistics."""
    bar = '=' * 60
    # Denominator guarded against an empty window.
    total_den = max(stats['total'], 1)

    print(f"\n{bar}")
    print(f"📊 Translation Quality Report - Last {hours}h")
    print(bar)
    print(f"Total Translations: {stats['total']}")

    # One line per status, each with its share of the window total.
    for label, key in (
        ("✅ Done:", 'done'),
        ("⏳ Pending:", 'pending'),
        ("🔄 Processing:", 'processing'),
        ("❌ Errors:", 'errors'),
    ):
        share = stats[key] / total_den * 100
        print(f" {label} {stats[key]:>6} ({share:>5.1f}%)")

    print("\n🔍 Quality Issues:")
    rep = stats['repetitive']
    rep_share = rep / max(stats['done'], 1) * 100
    print(" ⚠️ Repetitive: {:>6} ({:>5.1f}% of done)".format(rep, rep_share))

    details = stats['error_details']
    if details:
        print("\n📋 Top Error Messages:")
        for error, count in details:
            if not error:
                error_short = 'Unknown'
            elif len(error) > 50:
                error_short = error[:50] + '...'
            else:
                error_short = error
            print(f"{error_short}: {count}")

    # Quality score: fraction of done translations that are not repetitive.
    done = stats['done']
    if done > 0:
        quality_score = (1 - rep / done) * 100
        if quality_score > 95:
            quality_emoji = "🟢"
        elif quality_score > 90:
            quality_emoji = "🟡"
        else:
            quality_emoji = "🔴"
        print(f"\n{quality_emoji} Quality Score: {quality_score:.1f}%")
    print(f"{bar}\n")
def main():
    """Entry point: parse CLI flags and run a one-shot or continuous report."""
    import argparse

    parser = argparse.ArgumentParser(description='Monitor translation quality')
    parser.add_argument('--hours', type=int, default=24,
                        help='Hours to look back (default: 24)')
    parser.add_argument('--watch', action='store_true',
                        help='Continuous monitoring mode')
    parser.add_argument('--interval', type=int, default=60,
                        help='Update interval in seconds (default: 60)')
    args = parser.parse_args()

    conn = psycopg2.connect(**DB_CONFIG)
    try:
        if not args.watch:
            # One-shot report and exit.
            print_stats(get_stats(conn, args.hours), args.hours)
        else:
            print("🔄 Starting continuous monitoring (Ctrl+C to stop)...")
            while True:
                snapshot = get_stats(conn, args.hours)
                print("\033[2J\033[H")  # Clear screen
                print(f"Last updated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
                print_stats(snapshot, args.hours)
                time.sleep(args.interval)
    except KeyboardInterrupt:
        print("\n\n👋 Monitoring stopped")
    finally:
        conn.close()


if __name__ == "__main__":
    main()