varios cambios estéticos y optimizaciones

This commit is contained in:
jlimolina 2025-11-25 03:13:54 +01:00
parent e3a99d9604
commit 9a243db633
8 changed files with 64 additions and 105 deletions

View file

@ -105,6 +105,8 @@ CHUNK_BY_SENTENCES = _env_bool("CHUNK_BY_SENTENCES", default=True)
# Chunking settings, each read through the _env_int helper with the given
# default (presumably overridable via an environment variable of the same
# name -- confirm against _env_int).
CHUNK_MAX_TOKENS = _env_int("CHUNK_MAX_TOKENS", default=900)
CHUNK_OVERLAP_SENTS = _env_int("CHUNK_OVERLAP_SENTS", default=0)
# Single pais_id (despite the plural name) used by ensure_identity_spanish()
# to select the news rows that receive a verbatim Spanish "identity"
# translation.
IDENTITY_PAISES_ES = _env_int("IDENTITY_PAISES_ES", default=58)
# Spanish abbreviations written with a trailing period.
# NOTE(review): presumably consulted by the sentence splitter so these periods
# are not treated as sentence ends -- confirm where _ABBR is used.
_ABBR = ("Sr", "Sra", "Dr", "Dra", "Ing", "Lic", "pág", "etc")
# NOTE(review): looks like a temporary placeholder character paired with
# _ABBR -- confirm at the usage site.
_ABBR_MARK = "§"
@ -216,6 +218,8 @@ def ensure_indexes(conn):
def ensure_pending(conn, lang_to: str, enqueue_limit: int):
if enqueue_limit <= 0:
return
with conn.cursor() as cur:
cur.execute(
"""
@ -236,7 +240,44 @@ def ensure_pending(conn, lang_to: str, enqueue_limit: int):
conn.commit()
def ensure_identity_spanish(conn, lang_to: str, enqueue_limit: int):
    """Create pass-through ("identity") Spanish translations for one country.

    For news rows whose ``pais_id`` equals ``IDENTITY_PAISES_ES`` and that do
    not yet have a ``traducciones`` row for the target language, insert a row
    that copies the original title and summary unchanged, with ``lang_from``
    set to ``'es'`` and ``status`` set to ``'done'``.

    Args:
        conn: Open database connection; the insert is committed on it.
        lang_to: Target language code; normalized with ``normalize_lang``
            (falling back to ``"es"``) before being checked.
        enqueue_limit: Maximum number of rows to insert in this call.

    The function returns without logging or touching the database when the
    normalized target language is not ``"es"`` or when ``enqueue_limit`` is
    not positive. Candidate rows are taken newest ``fecha`` first (NULL dates
    last), with ``id`` as the tie-breaker.
    """
    target = normalize_lang(lang_to, "es") or "es"
    if target != "es" or enqueue_limit <= 0:
        return

    LOG.info(
        "Creando traducciones identidad ES para pais_id=%s (hasta %s noticias)…",
        IDENTITY_PAISES_ES,
        enqueue_limit,
    )

    # The SQL text is kept flush-left inside the literal so the statement sent
    # to the server is unchanged.
    insert_sql = """
INSERT INTO traducciones (noticia_id, lang_from, lang_to, titulo_trad, resumen_trad, status)
SELECT sub.id, 'es', %s, sub.titulo, sub.resumen, 'done'
FROM (
SELECT n.id, n.titulo, n.resumen
FROM noticias n
LEFT JOIN traducciones t
ON t.noticia_id = n.id AND t.lang_to = %s
WHERE t.id IS NULL
AND n.pais_id = %s
ORDER BY n.fecha DESC NULLS LAST, n.id
LIMIT %s
) AS sub;
"""
    # Placeholder order: inserted lang_to, join lang_to, country id, row limit.
    sql_params = (target, target, IDENTITY_PAISES_ES, enqueue_limit)

    with conn.cursor() as cur:
        cur.execute(insert_sql, sql_params)
    conn.commit()
def fetch_pending_batch(conn, lang_to: str, batch_size: int):
if batch_size <= 0:
return []
with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
cur.execute(
"""
@ -865,7 +906,10 @@ def main():
ensure_indexes(conn)
for lt in TARGET_LANGS:
lt = normalize_lang(lt, "es") or "es"
ensure_identity_spanish(conn, lt, ENQUEUE_MAX)
ensure_pending(conn, lt, ENQUEUE_MAX)
while True:
rows = fetch_pending_batch(conn, lt, BATCH_SIZE)
if not rows: