rss2/scripts/clean_repetitive_translations.py

108 lines
3.2 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Script to detect and clean repetitive/low-quality translations.
Run this periodically or as a maintenance task.
"""
import os
import re
import sys
import psycopg2
from psycopg2.extras import execute_values
from dotenv import load_dotenv
load_dotenv()
# Keyword arguments handed to psycopg2.connect(); each value comes from the
# environment and falls back to the default shown here when the variable is unset.
DB_CONFIG = dict(
    host=os.getenv("DB_HOST", "localhost"),
    port=int(os.getenv("DB_PORT", "5432")),
    dbname=os.getenv("DB_NAME", "rss"),
    user=os.getenv("DB_USER", "rss"),
    password=os.getenv("DB_PASS", ""),
)
# Patterns that flag looping, degenerate output. Compiled once at import time
# because is_repetitive() is evaluated for every row that gets scanned.
_REPETITIVE_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        # The \b after each backreference stops the last "repeat" from being
        # only the prefix of a longer word (e.g. "to to to tomorrow" is three
        # repeats of "to", not four).
        r'(\b\w+\b)( \1\b){3,}',  # Same word repeated 4+ times
        r'(\b\w+ \w+\b)( \1\b){2,}',  # Same 2-word phrase repeated 3+ times
        r'de la la ',
        r'la línea de la línea',
        r'de Internet de Internet',
        r'de la de la',
        r'en el en el',
    )
)


def is_repetitive(text: str, threshold: float = 0.25) -> bool:
    """Check if text has repetitive patterns or low word diversity.

    Args:
        text: The text to inspect. ``None``, empty strings and anything
            shorter than 50 characters are never reported as repetitive.
        threshold: Minimum acceptable ratio of unique words to total words.
            Only applied to texts with at least 10 whitespace-separated words.

    Returns:
        True when a known repetition pattern matches (case-insensitively), or
        when the lowercase unique-word ratio falls below ``threshold``;
        False otherwise.
    """
    if not text or len(text) < 50:
        return False

    # Check for obvious repetitive patterns first.
    if any(pattern.search(text) for pattern in _REPETITIVE_PATTERNS):
        return True

    # Fall back to word diversity; too few words make the ratio meaningless.
    words = text.lower().split()
    if len(words) < 10:
        return False
    return len(set(words)) / len(words) < threshold
def main():
    """Scan finished translations and send the repetitive ones back to pending.

    Reads every row of ``traducciones`` with ``status='done'``, runs
    ``is_repetitive`` on the translated title and summary, prints up to five
    sample summaries of the flagged rows, and then resets those rows to
    ``status='pending'`` with both translated fields cleared and an
    explanatory ``error`` message. Progress is reported on stdout.

    The database connection is closed even when a query fails.
    """
    print("🔍 Scanning for repetitive translations...")

    conn = psycopg2.connect(**DB_CONFIG)
    try:
        with conn.cursor() as cur:
            # Fetch all done translations
            cur.execute("""
                SELECT id, titulo_trad, resumen_trad
                FROM traducciones
                WHERE status='done'
            """)
            rows = cur.fetchall()
            total = len(rows)
            print(f"📊 Checking {total} translations...")

            bad_ids = [
                tr_id
                for tr_id, titulo, resumen in rows
                if is_repetitive(titulo) or is_repetitive(resumen)
            ]

            # With no 'done' rows there is nothing to divide by; report 0%.
            bad_pct = len(bad_ids) / total * 100 if total else 0.0
            print(f"❌ Found {len(bad_ids)} repetitive translations ({bad_pct:.2f}%)")

            if bad_ids:
                # Show samples
                cur.execute("""
                    SELECT id, LEFT(resumen_trad, 150) as sample
                    FROM traducciones
                    WHERE id = ANY(%s)
                    LIMIT 5
                """, (bad_ids,))
                print("\n📝 Sample bad translations:")
                for row in cur.fetchall():
                    print(f" ID {row[0]}: {row[1]}...")

                # Reset to pending
                print(f"\n🔄 Resetting {len(bad_ids)} translations to pending...")
                cur.execute("""
                    UPDATE traducciones
                    SET status='pending',
                        titulo_trad=NULL,
                        resumen_trad=NULL,
                        error='Repetitive output - auto-cleaned'
                    WHERE id = ANY(%s)
                """, (bad_ids,))
                conn.commit()
                print(f"✅ Successfully reset {len(bad_ids)} translations")
            else:
                print("✅ No repetitive translations found!")
    finally:
        # Runs on success and on failure so the connection is never leaked.
        conn.close()

    print("\n✨ Cleanup complete!")
# Run the cleanup only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()