rss2/scripts/fix_html_recursive.py
2026-01-13 13:39:51 +01:00

92 lines
3.1 KiB
Python

import html
import psycopg2
from db import get_conn
import sys
def recursive_unescape(text, *, max_loops=5):
    """Repeatedly HTML-unescape *text* until it stops changing.

    Text that was escaped more than once (for example ``&amp;amp;lt;``)
    needs several ``html.unescape`` passes before every entity is decoded.

    Args:
        text: The string to decode. Falsy values (``None``, ``""``) are
            returned unchanged.
        max_loops: Upper bound on the number of unescape passes. The cap
            guards against looping forever on unexpected input; a value
            of 0 (or less) returns *text* untouched.

    Returns:
        The decoded string, or *text* itself when it is falsy.
    """
    if not text:
        return text

    current = text
    for _ in range(max_loops):
        decoded = html.unescape(current)
        if decoded == current:
            # Fixed point reached: another pass would change nothing.
            break
        current = decoded
    return current
def _unescape_table(cur, table, select_sql, update_sql):
    """Unescape the two text columns of every candidate row in one table.

    Args:
        cur: Open database cursor, used for the SELECT and for each UPDATE.
        table: Table name; used only in the progress messages.
        select_sql: Query returning ``(id, title, summary)`` rows to inspect.
        update_sql: Statement with three ``%s`` placeholders, filled in the
            order ``(title, summary, id)``.

    Returns:
        The number of rows that were rewritten.
    """
    # NOTE(review): the SELECT is run without a parameter tuple so that the
    # literal '%' wildcards in its LIKE patterns are sent as-is (psycopg2
    # behaviour; assumes `cur` is a psycopg2 cursor -- confirm in db.get_conn).
    cur.execute(select_sql)
    rows = cur.fetchall()
    print(f"Found {len(rows)} candidates in '{table}'.")

    updated = 0
    for row_id, title, summary in rows:
        new_title = recursive_unescape(title)
        new_summary = recursive_unescape(summary)
        if new_title == title and new_summary == summary:
            # Row contained '&' but no decodable entity; leave it alone.
            continue
        cur.execute(update_sql, (new_title, new_summary, row_id))
        updated += 1
        if updated % 100 == 0:
            print(f"Updated {updated} {table}...")

    print(f"Total updated in '{table}': {updated}")
    return updated


def fix_entities_recursive():
    """Decode multiply-escaped HTML entities stored in the database.

    Processes ``noticias`` (``titulo``, ``resumen``) and then
    ``traducciones`` (``titulo_trad``, ``resumen_trad``). Only rows whose
    text contains ``&`` are fetched, and only rows whose text actually
    changes after ``recursive_unescape`` are written back. All updates are
    committed together at the end.

    Each SELECT is loaded fully into memory with ``fetchall``; the original
    author's note puts the table at roughly 13k rows, where this is fine.
    """
    print("🔧 Fixing HTML entities RECURSIVELY in database...")
    with get_conn() as conn:
        with conn.cursor() as cur:
            # 1. noticias
            print("Processing 'noticias' table...")
            _unescape_table(
                cur,
                "noticias",
                """
                SELECT id, titulo, resumen
                FROM noticias
                WHERE titulo LIKE '%&%' OR resumen LIKE '%&%'
                """,
                """
                UPDATE noticias
                SET titulo = %s, resumen = %s
                WHERE id = %s
                """,
            )

            # 2. traducciones
            print("\nProcessing 'traducciones' table...")
            _unescape_table(
                cur,
                "traducciones",
                """
                SELECT id, titulo_trad, resumen_trad
                FROM traducciones
                WHERE titulo_trad LIKE '%&%' OR resumen_trad LIKE '%&%'
                """,
                """
                UPDATE traducciones
                SET titulo_trad = %s, resumen_trad = %s
                WHERE id = %s
                """,
            )

        # Single commit so both tables are persisted together.
        conn.commit()
        print("✅ Database cleaning complete.")
# Run the clean-up only when this file is executed as a script, not on import.
if __name__ == "__main__":
    fix_entities_recursive()