Initial clean commit
This commit is contained in:
commit
6784d81c2c
141 changed files with 25219 additions and 0 deletions
109
translation_ops.py
Normal file
109
translation_ops.py
Normal file
|
|
@ -0,0 +1,109 @@
|
|||
import os
|
||||
import logging
|
||||
from db import get_conn
|
||||
|
||||
# Configuration (moved or duplicated from the worker): read from the environment here, falling back to hardcoded defaults.
|
||||
# Value matched against noticias.pais_id when inserting identity (no-translation-needed)
# Spanish rows; overridable via the IDENTITY_PAISES_ES environment variable, default 58.
IDENTITY_PAISES_ES = int(os.environ.get("IDENTITY_PAISES_ES", 58))
|
||||
# Module-level logger used by the functions in this file.
LOG = logging.getLogger("translation_ops")
|
||||
|
||||
def ensure_indexes(conn):
    """Create the lookup indexes on ``traducciones`` when they are missing.

    Runs two ``CREATE INDEX IF NOT EXISTS`` statements -- one on
    ``(lang_to, status)`` and one on ``status`` -- and commits on ``conn``.
    """
    index_statements = (
        "CREATE INDEX IF NOT EXISTS traducciones_lang_to_status_idx ON traducciones (lang_to, status);",
        "CREATE INDEX IF NOT EXISTS traducciones_status_idx ON traducciones (status);",
    )
    with conn.cursor() as cursor:
        for ddl in index_statements:
            cursor.execute(ddl)
    conn.commit()
|
||||
|
||||
def cleanup_stuck_translations(conn, timeout_hours=2):
    """Reset stale 'processing' translations back to 'pending'.

    A row is considered stale when its ``created_at`` is more than
    ``timeout_hours`` hours in the past.  ``created_at`` is only a stand-in
    for "time processing started": according to the original author's note
    it defaults to NOW() at insertion and the table has no ``updated_at``
    column, so the age check is an approximation.

    Args:
        conn: Open database connection; the change is committed before
            returning.
        timeout_hours: Age threshold, in hours, beyond which a
            'processing' row is reset.
    """
    with conn.cursor() as cur:
        # Bind the hour count as a genuine query parameter and scale a fixed
        # one-hour interval by it.  Placing the placeholder inside a quoted
        # SQL literal ('%s hours') relies on client-side text substitution
        # of an integer and fails with server-side parameter binding.
        cur.execute(
            """
            UPDATE traducciones
            SET status = 'pending'
            WHERE status = 'processing'
              AND created_at < NOW() - %s * INTERVAL '1 hour';
            """,
            (timeout_hours,),
        )
        reset_count = cur.rowcount
    if reset_count > 0:
        # Lazy %-formatting: the message is only built if INFO is enabled.
        LOG.info("Cleaned up %s stuck translations.", reset_count)
    conn.commit()
|
||||
|
||||
def ensure_pending(conn, lang_to: str, limit: int):
    """Enqueue up to ``limit`` news items as 'pending' translations.

    Selects rows from ``noticias`` that have no ``traducciones`` row for
    ``lang_to`` (newest ``fecha`` first, NULL dates last) and inserts a
    'pending' row with ``lang_from`` NULL for each.  Does nothing when
    ``limit`` is zero or negative.  Commits on ``conn``.
    """
    if limit <= 0:
        return

    # ON CONFLICT DO NOTHING keeps a concurrent run from raising, even
    # though the scheduler is expected to call this serially.
    enqueue_sql = """
        INSERT INTO traducciones (noticia_id, lang_from, lang_to, status)
        SELECT n.id, NULL, %s, 'pending'
        FROM noticias n
        LEFT JOIN traducciones t ON t.noticia_id=n.id AND t.lang_to=%s
        WHERE t.id IS NULL
        ORDER BY n.fecha DESC NULLS LAST
        LIMIT %s
        ON CONFLICT DO NOTHING;
        """
    enqueue_args = (lang_to, lang_to, limit)

    with conn.cursor() as cursor:
        cursor.execute(enqueue_sql, enqueue_args)
    conn.commit()
|
||||
|
||||
def ensure_identity_spanish(conn, lang_to: str, limit: int):
    """Insert 'done' identity translations for news matching IDENTITY_PAISES_ES.

    Only acts when ``lang_to`` is ``"es"`` and ``limit`` is positive.  For up
    to ``limit`` rows of ``noticias`` whose ``pais_id`` equals
    ``IDENTITY_PAISES_ES`` and that have no ``traducciones`` row for
    ``lang_to`` yet, copies ``titulo`` / ``resumen`` unchanged into
    ``titulo_trad`` / ``resumen_trad`` with ``lang_from`` 'es' and status
    'done'.  Commits on ``conn``.
    """
    if lang_to != "es":
        return
    if limit <= 0:
        return

    identity_sql = """
        INSERT INTO traducciones (noticia_id, lang_from, lang_to,
        titulo_trad, resumen_trad, status)
        SELECT n.id, 'es', %s, n.titulo, n.resumen, 'done'
        FROM noticias n
        LEFT JOIN traducciones t ON t.noticia_id=n.id AND t.lang_to=%s
        WHERE t.id IS NULL AND n.pais_id=%s
        ORDER BY n.fecha DESC NULLS LAST
        LIMIT %s
        ON CONFLICT DO NOTHING;
        """
    identity_args = (lang_to, lang_to, IDENTITY_PAISES_ES, limit)

    with conn.cursor() as cursor:
        cursor.execute(identity_sql, identity_args)
        # NOTE: no stats are persisted for these auto-completed rows.  They
        # bypass the worker, whose 'process_batch' only records stats for
        # the items it processes itself.  Counting them here (e.g. via
        # INSERT ... RETURNING id) was considered but left out for
        # simplicity.

    conn.commit()
|
||||
|
||||
|
||||
def run_producer_cycle():
    """Scheduler entry point: generate translation work for every target language.

    Reads ``TARGET_LANGS`` (comma-separated, default ``"es"``) and
    ``ENQUEUE`` (default 200) from the environment on each call, then, on a
    single connection from ``get_conn()``: ensures the indexes exist, resets
    stuck translations, and for each language inserts identity rows (Spanish
    only) followed by pending rows.
    """
    # Environment is consulted on every call so changes are picked up
    # without restarting the scheduler.
    raw_langs = os.environ.get("TARGET_LANGS", "es")
    enqueue_max = int(os.environ.get("ENQUEUE", 200))
    target_langs = [code.strip() for code in raw_langs.split(",") if code.strip()]

    with get_conn() as conn:
        ensure_indexes(conn)
        cleanup_stuck_translations(conn)
        for lang in target_langs:
            # Identity rows go in first so the pending pass below skips them.
            if lang == "es":
                ensure_identity_spanish(conn, lang, enqueue_max)
            ensure_pending(conn, lang, enqueue_max)

    LOG.info(f"Producer cycle completed for langs: {target_langs}")
|
||||
Loading…
Add table
Add a link
Reference in a new issue