Cambios en la búsqueda AJAX y correcciones en traducciones
This commit is contained in:
parent
95adc07f37
commit
47a252e339
9 changed files with 1152 additions and 449 deletions
108
scripts/clean_repetitive_translations.py
Executable file
108
scripts/clean_repetitive_translations.py
Executable file
|
|
@ -0,0 +1,108 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Script to detect and clean repetitive/low-quality translations.
|
||||
Run this periodically or as a maintenance task.
|
||||
"""
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import psycopg2
|
||||
from psycopg2.extras import execute_values
|
||||
from dotenv import load_dotenv
|
||||
|
||||
# Runs before DB_CONFIG is built below, which reads DB_* from os.environ.
# NOTE(review): presumably this loads a local .env file into the environment
# (python-dotenv) - confirm.
load_dotenv()
|
||||
|
||||
# Keyword arguments for psycopg2.connect(), taken from the environment with
# local-development fallbacks for any variable that is not set.
DB_CONFIG = dict(
    host=os.getenv("DB_HOST", "localhost"),
    port=int(os.getenv("DB_PORT", 5432)),
    dbname=os.getenv("DB_NAME", "rss"),
    user=os.getenv("DB_USER", "rss"),
    password=os.getenv("DB_PASS", ""),
)
|
||||
|
||||
# Patterns that indicate degenerate (looping) translation output. Compiled once
# at import time so scanning many rows does not repeat the pattern lookup.
_REPETITIVE_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        # Same word repeated 4+ times. The trailing \b after the
        # back-reference keeps a longer word that merely starts with the
        # repeated token ("go go go gone") from counting as a repetition.
        r'(\b\w+\b)( \1\b){3,}',
        # Same 2-word phrase repeated 3+ times (same trailing-\b reasoning).
        r'(\b\w+ \w+\b)( \1\b){2,}',
        # Known stutters, matched literally.
        r'de la la ',
        r'la línea de la línea',
        r'de Internet de Internet',
        r'de la de la',
        r'en el en el',
    )
)


def is_repetitive(text: str, threshold: float = 0.25) -> bool:
    """Check if text has repetitive patterns or low word diversity.

    Args:
        text: Text to inspect. ``None`` and empty strings are tolerated
            and are never considered repetitive.
        threshold: Minimum acceptable ratio of unique words to total words.
            Text whose ratio is strictly below this value is flagged.

    Returns:
        True if the text matches a known repetition pattern, or if it has at
        least 10 words and its unique-word ratio is below ``threshold``.
        False for missing text and for text shorter than 50 characters.
    """
    # Very short strings carry too little signal to judge.
    if not text or len(text) < 50:
        return False

    # Check for obvious repetitive patterns.
    if any(pattern.search(text) for pattern in _REPETITIVE_PATTERNS):
        return True

    # Check word diversity (case-insensitive, whitespace-separated tokens).
    words = text.lower().split()
    if len(words) < 10:
        return False

    unique_ratio = len(set(words)) / len(words)
    return unique_ratio < threshold
|
||||
|
||||
def main():
    """Scan finished translations and requeue the repetitive ones.

    Connects to PostgreSQL using ``DB_CONFIG``, loads every row of
    ``traducciones`` with ``status='done'`` and runs ``is_repetitive`` on the
    translated title and summary. Rows that are flagged have their translated
    fields cleared, are set back to ``status='pending'`` with an explanatory
    ``error`` value, and the change is committed. Progress is printed to
    stdout.

    The connection is closed even if a query fails; the exception is then
    propagated to the caller.
    """
    print("🔍 Scanning for repetitive translations...")

    conn = psycopg2.connect(**DB_CONFIG)
    try:
        with conn.cursor() as cur:
            # Fetch all done translations
            cur.execute("""
                SELECT id, titulo_trad, resumen_trad
                FROM traducciones
                WHERE status='done'
            """)

            rows = cur.fetchall()
            total = len(rows)
            print(f"📊 Checking {total} translations...")

            bad_ids = [
                tr_id
                for tr_id, titulo, resumen in rows
                if is_repetitive(titulo) or is_repetitive(resumen)
            ]

            # An empty table must not crash the report with a division by zero.
            bad_pct = len(bad_ids) / total * 100 if total else 0.0
            print(f"❌ Found {len(bad_ids)} repetitive translations ({bad_pct:.2f}%)")

            if bad_ids:
                # Show samples
                cur.execute("""
                    SELECT id, LEFT(resumen_trad, 150) as sample
                    FROM traducciones
                    WHERE id = ANY(%s)
                    LIMIT 5
                """, (bad_ids,))

                print("\n📝 Sample bad translations:")
                for row in cur.fetchall():
                    print(f" ID {row[0]}: {row[1]}...")

                # Reset to pending
                print(f"\n🔄 Resetting {len(bad_ids)} translations to pending...")
                cur.execute("""
                    UPDATE traducciones
                    SET status='pending',
                        titulo_trad=NULL,
                        resumen_trad=NULL,
                        error='Repetitive output - auto-cleaned'
                    WHERE id = ANY(%s)
                """, (bad_ids,))

                conn.commit()
                print(f"✅ Successfully reset {len(bad_ids)} translations")
            else:
                print("✅ No repetitive translations found!")
    finally:
        # Release the connection on both the success and the failure path.
        conn.close()

    print("\n✨ Cleanup complete!")
|
||||
|
||||
# Run the cleanup only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue