108 lines
3.2 KiB
Python
Executable file
108 lines
3.2 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Script to detect and clean repetitive/low-quality translations.
|
|
Run this periodically or as a maintenance task.
|
|
"""
|
|
import os
|
|
import re
|
|
import sys
|
|
import psycopg2
|
|
from psycopg2.extras import execute_values
|
|
from dotenv import load_dotenv
|
|
|
|
# Runs before DB_CONFIG is built so the os.environ lookups below can see
# values supplied through python-dotenv (typically a local .env file).
load_dotenv()
|
|
|
|
# Keyword arguments handed to psycopg2.connect(); each value comes from the
# environment and falls back to a local-development default when unset.
DB_CONFIG = dict(
    host=os.getenv("DB_HOST", "localhost"),
    port=int(os.getenv("DB_PORT", 5432)),
    dbname=os.getenv("DB_NAME", "rss"),
    user=os.getenv("DB_USER", "rss"),
    password=os.getenv("DB_PASS", ""),
)
|
|
|
|
# Compiled once at import time instead of on every call. Every pattern is
# anchored on word boundaries so that a match can never start or end in the
# middle of a longer word (e.g. "a a a abril" is three repeats, not four, and
# "bien el en el" does not contain the phrase "en el en el").
_REPETITIVE_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r'(\b\w+\b)( \1\b){3,}',       # Same word repeated 4+ times
        r'(\b\w+ \w+\b)( \1\b){2,}',   # Same 2-word phrase repeated 3+ times
        r'\bde la la ',
        r'\bla línea de la línea\b',
        r'\bde Internet de Internet\b',
        r'\bde la de la\b',
        r'\ben el en el\b',
    )
)


def is_repetitive(text: str, threshold: float = 0.25) -> bool:
    """Check if text has repetitive patterns or low word diversity.

    Args:
        text: Text to inspect. Falsy values (``None``, ``""``) and texts
            shorter than 50 characters are never reported as repetitive.
        threshold: Minimum ratio of unique words to total words. Texts of
            10 or more words whose ratio falls below it are reported.

    Returns:
        ``True`` when a known repetition pattern is found, or when the text
        has at least 10 whitespace-separated words and its unique-word ratio
        is below ``threshold``; ``False`` otherwise.
    """
    if not text or len(text) < 50:
        return False

    # Check for obvious repetitive patterns
    if any(pattern.search(text) for pattern in _REPETITIVE_PATTERNS):
        return True

    # Check word diversity; too few words make the ratio meaningless.
    words = text.lower().split()
    if len(words) < 10:
        return False

    unique_ratio = len(set(words)) / len(words)
    return unique_ratio < threshold
|
|
|
|
def main():
    """Scan finished translations and reset the repetitive ones to pending.

    Reads every row of ``traducciones`` with ``status='done'`` and runs
    ``is_repetitive`` on the translated title and summary. It prints a short
    report with up to five sample summaries, then sets the flagged rows back
    to ``status='pending'`` with both translated fields cleared and an
    explanatory ``error`` message, and commits.

    The connection is always closed, including when a query raises.
    """
    print("🔍 Scanning for repetitive translations...")

    conn = psycopg2.connect(**DB_CONFIG)
    try:
        with conn.cursor() as cur:
            # Fetch all done translations
            cur.execute("""
                SELECT id, titulo_trad, resumen_trad
                FROM traducciones
                WHERE status='done'
            """)

            rows = cur.fetchall()
            total = len(rows)
            print(f"📊 Checking {total} translations...")

            bad_ids = [
                tr_id
                for tr_id, titulo, resumen in rows
                if is_repetitive(titulo) or is_repetitive(resumen)
            ]

            # An empty table must not crash the report with a division by zero.
            percentage = len(bad_ids) / total * 100 if total else 0.0
            print(f"❌ Found {len(bad_ids)} repetitive translations ({percentage:.2f}%)")

            if bad_ids:
                # Show samples
                cur.execute("""
                    SELECT id, LEFT(resumen_trad, 150) as sample
                    FROM traducciones
                    WHERE id = ANY(%s)
                    LIMIT 5
                """, (bad_ids,))

                print("\n📝 Sample bad translations:")
                for row in cur.fetchall():
                    print(f" ID {row[0]}: {row[1]}...")

                # Reset to pending
                print(f"\n🔄 Resetting {len(bad_ids)} translations to pending...")
                cur.execute("""
                    UPDATE traducciones
                    SET status='pending',
                        titulo_trad=NULL,
                        resumen_trad=NULL,
                        error='Repetitive output - auto-cleaned'
                    WHERE id = ANY(%s)
                """, (bad_ids,))

                conn.commit()
                print(f"✅ Successfully reset {len(bad_ids)} translations")
            else:
                print("✅ No repetitive translations found!")
    finally:
        # Release the connection on both the success and the failure path.
        conn.close()

    print("\n✨ Cleanup complete!")
|
|
|
|
# Run the cleanup only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|