This commit is contained in:
jlimolina 2025-11-24 23:06:26 +01:00
parent 86ee083b90
commit e3a99d9604
8 changed files with 489 additions and 483 deletions

.gitignore vendored
View file

@@ -1,24 +1,33 @@
-# Virtual environment
+# --- Virtual environments ---
venv/
.venv/
env/
.env/
+.env
-# Byte-code files
+# --- Python bytecode ---
*.pyc
__pycache__/
-# IDE specific files
+# --- IDE project folders ---
.vscode/
.idea/
-# Operating System files
+# --- OS-generated files ---
.DS_Store
Thumbs.db
-# Database files (if SQLite)
+# --- SQLite / misc DB files ---
*.sqlite3
*.db
-# Logs
+# --- Postgres Docker data directory ---
+pgdata/
+# --- HuggingFace models cache ---
+hf_cache/
+# --- Logs ---
*.log
+logs/

View file

@@ -6,6 +6,7 @@ from typing import List, Dict, Any, Optional
import numpy as np
import psycopg2
import psycopg2.extras
from psycopg2.extras import Json
logging.basicConfig(
level=logging.INFO,
@@ -43,29 +44,12 @@ def get_conn():
def ensure_schema(conn):
"""
-Ensures that the eventos table and the required columns exist.
-The original eventos schema with a JSONB centroid is assumed here.
+We assume that the tables and columns (eventos, traducciones.evento_id,
+eventos_noticias, function/trigger) already exist via the init-db scripts.
+Here we only make sure that certain key indexes exist
+(idempotent).
"""
with conn.cursor() as cur:
cur.execute(
"""
CREATE TABLE IF NOT EXISTS eventos (
id SERIAL PRIMARY KEY,
creado_en TIMESTAMP NOT NULL DEFAULT NOW(),
actualizado_en TIMESTAMP NOT NULL DEFAULT NOW(),
centroid JSONB NOT NULL,
total_traducciones INTEGER NOT NULL DEFAULT 1
);
"""
)
cur.execute(
"""
ALTER TABLE traducciones
ADD COLUMN IF NOT EXISTS evento_id INTEGER REFERENCES eventos(id);
"""
)
cur.execute(
"""
CREATE INDEX IF NOT EXISTS idx_traducciones_evento
@@ -78,27 +62,6 @@ def ensure_schema(conn):
ON traducciones(evento_id, noticia_id);
"""
)
cur.execute(
"""
CREATE OR REPLACE FUNCTION actualizar_evento_modificado()
RETURNS TRIGGER AS $$
BEGIN
NEW.actualizado_en = NOW();
RETURN NEW;
END;
$$ LANGUAGE plpgsql;
"""
)
cur.execute("DROP TRIGGER IF EXISTS trg_evento_modificado ON eventos;")
cur.execute(
"""
CREATE TRIGGER trg_evento_modificado
BEFORE UPDATE ON eventos
FOR EACH ROW
EXECUTE FUNCTION actualizar_evento_modificado();
"""
)
conn.commit()
@@ -161,6 +124,7 @@ def fetch_embeddings_for(conn, tr_ids: List[int]) -> Dict[int, np.ndarray]:
def fetch_centroids(conn) -> List[Dict[str, Any]]:
"""
Load all current centroids from eventos.
We only use the clustering fields: id, centroid, total_traducciones.
"""
with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
cur.execute(
@@ -178,6 +142,7 @@ def fetch_centroids(conn) -> List[Dict[str, Any]]:
raw = r["centroid"]
cnt = int(r["total_traducciones"] or 1)
if not isinstance(raw, (list, tuple)):
# centroid is stored as a JSONB array → in Python it usually arrives as a list
continue
arr = np.array([float(x or 0.0) for x in raw], dtype="float32")
if arr.size == 0:
@@ -201,6 +166,54 @@ def cosine_distance(a: np.ndarray, b: np.ndarray) -> float:
return 1.0 - cos
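For reference, the event-threshold comparison uses cosine distance, i.e. 1 minus cosine similarity. Below is a minimal sketch consistent with the return 1.0 - cos line above; it is illustrative only, and the zero-norm guard is an assumption, not necessarily what the committed helper does.

import numpy as np

def cosine_distance_sketch(a: np.ndarray, b: np.ndarray) -> float:
    # Illustrative re-implementation, not the committed code.
    denom = float(np.linalg.norm(a) * np.linalg.norm(b))
    if denom == 0.0:
        return 1.0  # assumption: treat degenerate vectors as maximally distant
    cos = float(np.dot(a, b)) / denom
    return 1.0 - cos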
def fetch_traduccion_info(conn, tr_id: int) -> Optional[Dict[str, Any]]:
"""
Returns basic info for a tr_id:
- noticia_id
- the date of the noticia
- a representative title for the event (translated or original).
"""
with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
cur.execute(
"""
SELECT
t.id AS traduccion_id,
t.noticia_id AS noticia_id,
n.fecha AS fecha,
COALESCE(NULLIF(t.titulo_trad, ''), n.titulo) AS titulo_evento
FROM traducciones t
JOIN noticias n ON n.id = t.noticia_id
WHERE t.id = %s;
""",
(tr_id,),
)
row = cur.fetchone()
if not row:
return None
return {
"traduccion_id": int(row["traduccion_id"]),
"noticia_id": row["noticia_id"],
"fecha": row["fecha"],
"titulo_evento": row["titulo_evento"],
}
def _insert_evento_noticia(cur, evento_id: int, info: Dict[str, Any]) -> None:
"""
Inserts the relation into eventos_noticias (idempotent).
"""
if not info or not info.get("noticia_id"):
return
cur.execute(
"""
INSERT INTO eventos_noticias (evento_id, noticia_id, traduccion_id)
VALUES (%s, %s, %s)
ON CONFLICT (evento_id, traduccion_id) DO NOTHING;
""",
(evento_id, info["noticia_id"], info["traduccion_id"]),
)
def assign_to_event(
conn,
tr_id: int,
@@ -210,15 +223,39 @@ def assign_to_event(
"""
Assigns a translation to an existing event (if distance <= threshold)
or creates a new event with this vector as its centroid.
-"""
-from psycopg2.extras import Json
+Additionally:
+- Updates fecha_inicio, fecha_fin, n_noticias on the event.
+- Fills in eventos_noticias (evento_id, noticia_id, traduccion_id).
+"""
if vec is None or vec.size == 0:
return
info = fetch_traduccion_info(conn, tr_id)
# If there are no centroids yet → first event
if not centroids:
centroid_list = [float(x) for x in vec.tolist()]
with conn.cursor() as cur:
if info and info.get("fecha"):
cur.execute(
"""
INSERT INTO eventos (centroid, total_traducciones,
fecha_inicio, fecha_fin, n_noticias, titulo)
VALUES (%s, %s, %s, %s, %s, %s)
RETURNING id;
""",
(
Json(centroid_list),
1,
info["fecha"],
info["fecha"],
1,
info.get("titulo_evento"),
),
)
else:
# Minimal fallback if there is no noticia info
cur.execute(
"""
INSERT INTO eventos (centroid, total_traducciones)
@@ -227,14 +264,22 @@ def assign_to_event(
""",
(Json(centroid_list), 1),
)
new_id = cur.fetchone()[0]
# Link the translation to the event
cur.execute(
"UPDATE traducciones SET evento_id = %s WHERE id = %s;",
(new_id, tr_id),
)
# Fill in the relation table
_insert_evento_noticia(cur, new_id, info or {})
centroids.append({"id": new_id, "vec": vec.copy(), "n": 1})
return
# Find the closest centroid
best_idx: Optional[int] = None
best_dist: float = 1.0
@@ -244,6 +289,8 @@ def assign_to_event(
best_dist = d
best_idx = i
with conn.cursor() as cur:
# Assign to an existing event if it is below the threshold
if best_idx is not None and best_dist <= EVENT_DIST_THRESHOLD:
c = centroids[best_idx]
n_old = c["n"]
@@ -254,7 +301,29 @@ def assign_to_event(
c["n"] = new_n
centroid_list = [float(x) for x in new_vec.tolist()]
with conn.cursor() as cur:
if info and info.get("fecha"):
cur.execute(
"""
UPDATE eventos
SET centroid = %s,
total_traducciones = total_traducciones + 1,
fecha_inicio = COALESCE(LEAST(fecha_inicio, %s), %s),
fecha_fin = COALESCE(GREATEST(fecha_fin, %s), %s),
n_noticias = n_noticias + 1
WHERE id = %s;
""",
(
Json(centroid_list),
info["fecha"],
info["fecha"],
info["fecha"],
info["fecha"],
c["id"],
),
)
else:
# No date info: only update the centroid/counter
cur.execute(
"""
UPDATE eventos
@@ -264,14 +333,37 @@ def assign_to_event(
""",
(Json(centroid_list), c["id"]),
)
# Link the translation and the relation
cur.execute(
"UPDATE traducciones SET evento_id = %s WHERE id = %s;",
(c["id"], tr_id),
)
_insert_evento_noticia(cur, c["id"], info or {})
return
# If there is no suitable event → create a new one
centroid_list = [float(x) for x in vec.tolist()]
with conn.cursor() as cur:
if info and info.get("fecha"):
cur.execute(
"""
INSERT INTO eventos (centroid, total_traducciones,
fecha_inicio, fecha_fin, n_noticias, titulo)
VALUES (%s, %s, %s, %s, %s, %s)
RETURNING id;
""",
(
Json(centroid_list),
1,
info["fecha"],
info["fecha"],
1,
info.get("titulo_evento"),
),
)
else:
cur.execute(
"""
INSERT INTO eventos (centroid, total_traducciones)
@@ -280,11 +372,15 @@ def assign_to_event(
""",
(Json(centroid_list), 1),
)
new_id = cur.fetchone()[0]
cur.execute(
"UPDATE traducciones SET evento_id = %s WHERE id = %s;",
(new_id, tr_id),
)
_insert_evento_noticia(cur, new_id, info or {})
centroids.append({"id": new_id, "vec": vec.copy(), "n": 1})
@@ -309,11 +405,16 @@ def main():
time.sleep(EVENT_SLEEP_IDLE)
continue
log.info(
"Traducciones pendientes de asignar evento: %d",
len(pending_ids),
)
emb_by_tr = fetch_embeddings_for(conn, pending_ids)
if not emb_by_tr:
log.warning(
"No se encontraron embeddings para las traducciones pendientes."
)
time.sleep(EVENT_SLEEP_IDLE)
continue
@@ -329,7 +430,10 @@ def main():
processed += 1
conn.commit()
log.info(
"Asignación de eventos completada. Traducciones procesadas: %d",
processed,
)
except Exception as e:
log.exception("Error en cluster_worker: %s", e)
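A note on the update assign_to_event performs when a translation joins an existing event: the centroid behaves as a mean of the member embeddings, so folding in one more vector is a running-mean step. The sketch below is illustrative only; the exact update lines are elided from the hunk above and the helper name is an assumption.

import numpy as np

def fold_into_centroid(centroid: np.ndarray, n_old: int, vec: np.ndarray) -> np.ndarray:
    # Running mean: new_centroid = (centroid * n_old + vec) / (n_old + 1)
    return (centroid * n_old + vec) / float(n_old + 1)

# Example: an event with 3 members absorbs a fourth embedding.
c = fold_into_centroid(np.array([0.2, 0.4], dtype="float32"), 3,
                       np.array([0.6, 0.0], dtype="float32"))
# c is now [0.3, 0.3]; the updated list is what gets written back as JSONB via Json(...).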

View file

@@ -13,7 +13,9 @@ services:
PGDATA: /var/lib/postgresql/data/18/main
command: ["postgres", "-c", "max_connections=400"]
volumes:
-- /datos/rss/postgres/18:/var/lib/postgresql/data
+# Postgres data inside the project
+- ./pgdata:/var/lib/postgresql/data
+# Initialization scripts
- ./init-db:/docker-entrypoint-initdb.d:ro
restart: always
healthcheck:
@@ -61,18 +63,19 @@ services:
- DB_USER=${DB_USER}
- DB_PASS=${DB_PASS}
- SECRET_KEY=${SECRET_KEY}
-- RSS_MAX_WORKERS=8
+- RSS_MAX_WORKERS=16
depends_on:
db:
condition: service_healthy
restart: always
-translator:
+# --- GPU translation worker: enqueues + translates ---
+translator_gpu:
build:
context: .
args:
TORCH_CUDA: cu121
-container_name: rss_translator
+container_name: rss_translator_gpu
command: bash -lc "python translation_worker.py"
environment:
- DB_HOST=db
@@ -81,19 +84,19 @@ services:
- DB_USER=${DB_USER}
- DB_PASS=${DB_PASS}
- TARGET_LANGS=es
-- TRANSLATOR_BATCH=32
+- TRANSLATOR_BATCH=16
-- ENQUEUE=200
+- ENQUEUE=200 # THIS one enqueues new translations
- TRANSLATOR_SLEEP_IDLE=5
- MAX_SRC_TOKENS=680
- MAX_NEW_TOKENS=400
-- NUM_BEAMS_TITLE=2
+- NUM_BEAMS_TITLE=1
- NUM_BEAMS_BODY=1
-- UNIVERSAL_MODEL=facebook/nllb-200-1.3B
+- UNIVERSAL_MODEL=facebook/nllb-200-distilled-600M
- CHUNK_BY_SENTENCES=True
- CHUNK_MAX_TOKENS=400
- CHUNK_OVERLAP_SENTS=1
- CLEAN_ARTICLE=1
-- DEVICE=cuda
+- DEVICE=cuda # GPU
- PYTHONUNBUFFERED=1
- HF_HOME=/root/.cache/huggingface
- TOKENIZERS_PARALLELISM=false
@@ -101,13 +104,52 @@ services:
- NVIDIA_VISIBLE_DEVICES=all
- NVIDIA_DRIVER_CAPABILITIES=compute,utility
volumes:
-- /datos/rss/hf_cache:/root/.cache/huggingface
+# HF model cache inside the project
+- ./hf_cache:/root/.cache/huggingface
depends_on:
db:
condition: service_healthy
restart: always
gpus: all
# --- CPU translation worker: ONLY processes pending items ---
translator_cpu:
build:
context: .
args:
TORCH_CUDA: cu121
container_name: rss_translator_cpu
command: bash -lc "python translation_worker.py"
environment:
- DB_HOST=db
- DB_PORT=5432
- DB_NAME=${DB_NAME}
- DB_USER=${DB_USER}
- DB_PASS=${DB_PASS}
- TARGET_LANGS=es
- TRANSLATOR_BATCH=8 # smaller batch for CPU
- ENQUEUE=0 # does NOT enqueue new translations
- TRANSLATOR_SLEEP_IDLE=5
- MAX_SRC_TOKENS=680
- MAX_NEW_TOKENS=400
- NUM_BEAMS_TITLE=1
- NUM_BEAMS_BODY=1
- UNIVERSAL_MODEL=facebook/nllb-200-distilled-600M
- CHUNK_BY_SENTENCES=True
- CHUNK_MAX_TOKENS=400
- CHUNK_OVERLAP_SENTS=1
- CLEAN_ARTICLE=1
- DEVICE=cpu # Forces CPU
- PYTHONUNBUFFERED=1
- HF_HOME=/root/.cache/huggingface
- TOKENIZERS_PARALLELISM=false
volumes:
- ./hf_cache:/root/.cache/huggingface
depends_on:
db:
condition: service_healthy
restart: always
ner:
build:
context: .
@@ -141,7 +183,7 @@ services:
- DB_NAME=${DB_NAME}
- DB_USER=${DB_USER}
- DB_PASS=${DB_PASS}
-- EMB_MODEL=sentence-transformers/paraphrase-multilingual-mpnet-base-v2
+- EMB_MODEL=sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2
- EMB_BATCH=256
- EMB_SLEEP_IDLE=5
- EMB_LANGS=es
@@ -151,7 +193,8 @@ services:
- HF_HOME=/root/.cache/huggingface
- TOKENIZERS_PARALLELISM=false
volumes:
-- /datos/rss/hf_cache:/root/.cache/huggingface
+# Reuses the same HF cache
+- ./hf_cache:/root/.cache/huggingface
depends_on:
db:
condition: service_healthy
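The GPU/CPU split above relies on the workers honouring DEVICE and ENQUEUE: the GPU service both enqueues and translates, while the CPU service only drains already-queued rows. A rough sketch of how translation_worker.py could read these variables follows; the parsing code itself is not part of this diff, so the names, defaults, and behaviour shown here are assumptions.

import os

# Illustrative only: how the compose environment above maps to worker behaviour.
DEVICE = os.getenv("DEVICE", "cuda")                 # "cpu" forces CPU inference
ENQUEUE = int(os.getenv("ENQUEUE", "0"))             # rows to enqueue per cycle; 0 = only drain pending work
TRANSLATOR_BATCH = int(os.getenv("TRANSLATOR_BATCH", "16"))

def should_enqueue() -> bool:
    # translator_gpu runs with ENQUEUE=200, translator_cpu with ENQUEUE=0,
    # so only the GPU worker feeds new rows into the queue.
    return ENQUEUE > 0

print(f"device={DEVICE} enqueue={ENQUEUE} batch={TRANSLATOR_BATCH} enqueues_new_work={should_enqueue()}")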

View file

@@ -1,364 +0,0 @@
--
-- PostgreSQL database dump
--
-- Dumped from database version 16.9 (Ubuntu 16.9-0ubuntu0.24.04.1)
-- Dumped by pg_dump version 16.9 (Ubuntu 16.9-0ubuntu0.24.04.1)
SET statement_timeout = 0;
SET lock_timeout = 0;
SET idle_in_transaction_session_timeout = 0;
SET client_encoding = 'UTF8';
SET standard_conforming_strings = on;
SELECT pg_catalog.set_config('search_path', '', false);
SET check_function_bodies = false;
SET xmloption = content;
SET client_min_messages = warning;
SET row_security = off;
--
-- Name: noticias_tsv_trigger(); Type: FUNCTION; Schema: public; Owner: rss
--
CREATE FUNCTION public.noticias_tsv_trigger() RETURNS trigger
LANGUAGE plpgsql
AS $$ begin new.tsv := setweight(to_tsvector('spanish', coalesce(new.titulo,'')), 'A') || setweight(to_tsvector('spanish', coalesce(new.resumen,'')), 'B'); return new; end $$;
ALTER FUNCTION public.noticias_tsv_trigger() OWNER TO rss;
SET default_tablespace = '';
SET default_table_access_method = heap;
--
-- Name: categorias; Type: TABLE; Schema: public; Owner: rss
--
CREATE TABLE public.categorias (
id integer NOT NULL,
nombre character varying(100) NOT NULL
);
ALTER TABLE public.categorias OWNER TO rss;
--
-- Name: categorias_id_seq; Type: SEQUENCE; Schema: public; Owner: rss
--
CREATE SEQUENCE public.categorias_id_seq
AS integer
START WITH 1
INCREMENT BY 1
NO MINVALUE
NO MAXVALUE
CACHE 1;
ALTER SEQUENCE public.categorias_id_seq OWNER TO rss;
--
-- Name: categorias_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: rss
--
ALTER SEQUENCE public.categorias_id_seq OWNED BY public.categorias.id;
--
-- Name: continentes; Type: TABLE; Schema: public; Owner: rss
--
CREATE TABLE public.continentes (
id integer NOT NULL,
nombre character varying(50) NOT NULL
);
ALTER TABLE public.continentes OWNER TO rss;
--
-- Name: continentes_id_seq; Type: SEQUENCE; Schema: public; Owner: rss
--
CREATE SEQUENCE public.continentes_id_seq
AS integer
START WITH 1
INCREMENT BY 1
NO MINVALUE
NO MAXVALUE
CACHE 1;
ALTER SEQUENCE public.continentes_id_seq OWNER TO rss;
--
-- Name: continentes_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: rss
--
ALTER SEQUENCE public.continentes_id_seq OWNED BY public.continentes.id;
--
-- Name: feeds; Type: TABLE; Schema: public; Owner: rss
--
CREATE TABLE public.feeds (
id integer NOT NULL,
nombre character varying(255),
descripcion text,
url text NOT NULL,
categoria_id integer,
pais_id integer,
idioma character(2),
activo boolean DEFAULT true,
fallos integer DEFAULT 0,
last_etag character varying(255),
last_modified character varying(255)
);
ALTER TABLE public.feeds OWNER TO rss;
--
-- Name: feeds_id_seq; Type: SEQUENCE; Schema: public; Owner: rss
--
CREATE SEQUENCE public.feeds_id_seq
AS integer
START WITH 1
INCREMENT BY 1
NO MINVALUE
NO MAXVALUE
CACHE 1;
ALTER SEQUENCE public.feeds_id_seq OWNER TO rss;
--
-- Name: feeds_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: rss
--
ALTER SEQUENCE public.feeds_id_seq OWNED BY public.feeds.id;
--
-- Name: noticias; Type: TABLE; Schema: public; Owner: rss
--
CREATE TABLE public.noticias (
id character varying(32) NOT NULL,
titulo text,
resumen text,
url text NOT NULL,
fecha timestamp without time zone,
imagen_url text,
categoria_id integer,
pais_id integer,
tsv tsvector
);
ALTER TABLE public.noticias OWNER TO rss;
--
-- Name: paises; Type: TABLE; Schema: public; Owner: rss
--
CREATE TABLE public.paises (
id integer NOT NULL,
nombre character varying(100) NOT NULL,
continente_id integer
);
ALTER TABLE public.paises OWNER TO rss;
--
-- Name: paises_id_seq; Type: SEQUENCE; Schema: public; Owner: rss
--
CREATE SEQUENCE public.paises_id_seq
AS integer
START WITH 1
INCREMENT BY 1
NO MINVALUE
NO MAXVALUE
CACHE 1;
ALTER SEQUENCE public.paises_id_seq OWNER TO rss;
--
-- Name: paises_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: rss
--
ALTER SEQUENCE public.paises_id_seq OWNED BY public.paises.id;
--
-- Name: categorias id; Type: DEFAULT; Schema: public; Owner: rss
--
ALTER TABLE ONLY public.categorias ALTER COLUMN id SET DEFAULT nextval('public.categorias_id_seq'::regclass);
--
-- Name: continentes id; Type: DEFAULT; Schema: public; Owner: rss
--
ALTER TABLE ONLY public.continentes ALTER COLUMN id SET DEFAULT nextval('public.continentes_id_seq'::regclass);
--
-- Name: feeds id; Type: DEFAULT; Schema: public; Owner: rss
--
ALTER TABLE ONLY public.feeds ALTER COLUMN id SET DEFAULT nextval('public.feeds_id_seq'::regclass);
--
-- Name: paises id; Type: DEFAULT; Schema: public; Owner: rss
--
ALTER TABLE ONLY public.paises ALTER COLUMN id SET DEFAULT nextval('public.paises_id_seq'::regclass);
--
-- Name: categorias categorias_nombre_key; Type: CONSTRAINT; Schema: public; Owner: rss
--
ALTER TABLE ONLY public.categorias
ADD CONSTRAINT categorias_nombre_key UNIQUE (nombre);
--
-- Name: categorias categorias_pkey; Type: CONSTRAINT; Schema: public; Owner: rss
--
ALTER TABLE ONLY public.categorias
ADD CONSTRAINT categorias_pkey PRIMARY KEY (id);
--
-- Name: continentes continentes_nombre_key; Type: CONSTRAINT; Schema: public; Owner: rss
--
ALTER TABLE ONLY public.continentes
ADD CONSTRAINT continentes_nombre_key UNIQUE (nombre);
--
-- Name: continentes continentes_pkey; Type: CONSTRAINT; Schema: public; Owner: rss
--
ALTER TABLE ONLY public.continentes
ADD CONSTRAINT continentes_pkey PRIMARY KEY (id);
--
-- Name: feeds feeds_pkey; Type: CONSTRAINT; Schema: public; Owner: rss
--
ALTER TABLE ONLY public.feeds
ADD CONSTRAINT feeds_pkey PRIMARY KEY (id);
--
-- Name: feeds feeds_url_key; Type: CONSTRAINT; Schema: public; Owner: rss
--
ALTER TABLE ONLY public.feeds
ADD CONSTRAINT feeds_url_key UNIQUE (url);
--
-- Name: noticias noticias_pkey; Type: CONSTRAINT; Schema: public; Owner: rss
--
ALTER TABLE ONLY public.noticias
ADD CONSTRAINT noticias_pkey PRIMARY KEY (id);
--
-- Name: noticias noticias_url_key; Type: CONSTRAINT; Schema: public; Owner: rss
--
ALTER TABLE ONLY public.noticias
ADD CONSTRAINT noticias_url_key UNIQUE (url);
--
-- Name: paises paises_nombre_key; Type: CONSTRAINT; Schema: public; Owner: rss
--
ALTER TABLE ONLY public.paises
ADD CONSTRAINT paises_nombre_key UNIQUE (nombre);
--
-- Name: paises paises_pkey; Type: CONSTRAINT; Schema: public; Owner: rss
--
ALTER TABLE ONLY public.paises
ADD CONSTRAINT paises_pkey PRIMARY KEY (id);
--
-- Name: noticias_tsv_idx; Type: INDEX; Schema: public; Owner: rss
--
CREATE INDEX noticias_tsv_idx ON public.noticias USING gin (tsv);
--
-- Name: noticias tsvectorupdate; Type: TRIGGER; Schema: public; Owner: rss
--
CREATE TRIGGER tsvectorupdate BEFORE INSERT ON public.noticias FOR EACH ROW EXECUTE FUNCTION public.noticias_tsv_trigger();
--
-- Name: feeds feeds_categoria_id_fkey; Type: FK CONSTRAINT; Schema: public; Owner: rss
--
ALTER TABLE ONLY public.feeds
ADD CONSTRAINT feeds_categoria_id_fkey FOREIGN KEY (categoria_id) REFERENCES public.categorias(id) ON DELETE SET NULL;
--
-- Name: feeds feeds_pais_id_fkey; Type: FK CONSTRAINT; Schema: public; Owner: rss
--
ALTER TABLE ONLY public.feeds
ADD CONSTRAINT feeds_pais_id_fkey FOREIGN KEY (pais_id) REFERENCES public.paises(id) ON DELETE SET NULL;
--
-- Name: noticias noticias_categoria_id_fkey; Type: FK CONSTRAINT; Schema: public; Owner: rss
--
ALTER TABLE ONLY public.noticias
ADD CONSTRAINT noticias_categoria_id_fkey FOREIGN KEY (categoria_id) REFERENCES public.categorias(id) ON DELETE SET NULL;
--
-- Name: noticias noticias_pais_id_fkey; Type: FK CONSTRAINT; Schema: public; Owner: rss
--
ALTER TABLE ONLY public.noticias
ADD CONSTRAINT noticias_pais_id_fkey FOREIGN KEY (pais_id) REFERENCES public.paises(id) ON DELETE SET NULL;
--
-- Name: paises paises_continente_id_fkey; Type: FK CONSTRAINT; Schema: public; Owner: rss
--
ALTER TABLE ONLY public.paises
ADD CONSTRAINT paises_continente_id_fkey FOREIGN KEY (continente_id) REFERENCES public.continentes(id) ON DELETE SET NULL;
--
-- PostgreSQL database dump complete
--

View file

@@ -29,7 +29,7 @@ SELECT
te.dim,
te.embedding AS vec
FROM traduccion_embeddings te
-WHERE te.model = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2';
+WHERE te.model = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2';
CREATE TABLE IF NOT EXISTS related_noticias (
traduccion_id INT NOT NULL REFERENCES traducciones(id) ON DELETE CASCADE,

View file

@@ -31,11 +31,13 @@ ALTER TABLE traducciones
-- ---------------------------------------------
-- 3. EVENT <-> NEWS ITEM <-> TRANSLATION RELATION TABLE
+-- (types aligned with noticias.id (VARCHAR(32))
+-- and traducciones.id (INTEGER))
-- ---------------------------------------------
CREATE TABLE IF NOT EXISTS eventos_noticias (
evento_id BIGINT NOT NULL REFERENCES eventos(id) ON DELETE CASCADE,
-noticia_id CHAR(32) NOT NULL REFERENCES noticias(id) ON DELETE CASCADE,
+noticia_id VARCHAR(32) NOT NULL REFERENCES noticias(id) ON DELETE CASCADE,
-traduccion_id BIGINT NOT NULL REFERENCES traducciones(id) ON DELETE CASCADE,
+traduccion_id INTEGER NOT NULL REFERENCES traducciones(id) ON DELETE CASCADE,
PRIMARY KEY (evento_id, traduccion_id)
);
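For illustration only (not part of the commit): once eventos_noticias is populated, the noticias grouped under one event can be listed with a join like the one below, using psycopg2 as the rest of the project does. The connection parameters are placeholders.

import psycopg2

# Placeholder credentials; in the compose setup these come from the DB_* environment variables.
conn = psycopg2.connect(host="db", port=5432, dbname="rss", user="rss", password="change-me")

with conn.cursor() as cur:
    cur.execute(
        """
        SELECT n.id, n.titulo, n.fecha
        FROM eventos_noticias en
        JOIN noticias n ON n.id = en.noticia_id
        WHERE en.evento_id = %s
        ORDER BY n.fecha;
        """,
        (42,),  # example evento_id
    )
    for noticia_id, titulo, fecha in cur.fetchall():
        print(noticia_id, titulo, fecha)

conn.close()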

View file

@@ -24,7 +24,7 @@ if __name__ == '__main__':
scheduler.add_job(
fetch_and_store_all,
"interval",
-minutes=10,
+minutes=3,
id="rss_job",
next_run_time=datetime.utcnow() + timedelta(seconds=10)
)

View file

@@ -360,7 +360,7 @@ def _token_chunks(tokenizer, text: str, max_tokens: int) -> List[str]:
return [text]
chunks = []
for i in range(0, len(ids), max_tokens):
-sub = ids[i : i + max_tokens]
+sub = ids[i: i + max_tokens]
piece = tokenizer.decode(sub, skip_special_tokens=True, clean_up_tokenization_spaces=True)
if piece.strip():
chunks.append(piece.strip())
@@ -413,6 +413,90 @@ def _forced_bos_id(tokenizer: AutoTokenizer, model: AutoModelForSeq2SeqLM, tgt_c
return getattr(tokenizer, "eos_token_id", None) or getattr(tokenizer, "bos_token_id", None) or 0
@torch.inference_mode()
def _translate_texts_simple(
src_lang: str,
tgt_lang: str,
texts: List[str],
num_beams: int = 1,
_tries: int = 0,
) -> List[str]:
if not texts:
return []
cleaned = [(t or "").strip() for t in texts]
if all(not t for t in cleaned):
return ["" for _ in cleaned]
tok, mdl, device = get_universal_components()
src_code = map_to_nllb(src_lang) or "eng_Latn"
tgt_code = map_to_nllb(tgt_lang) or "spa_Latn"
try:
tok.src_lang = src_code
except Exception:
pass
forced_bos = _forced_bos_id(tok, mdl, tgt_code)
safe_len = _safe_src_len(tok)
try:
autocast_ctx = (
torch.amp.autocast("cuda", dtype=torch.float16)
if device.type == "cuda"
else contextlib.nullcontext()
)
enc = tok(
cleaned,
return_tensors="pt",
padding=True,
truncation=True,
max_length=safe_len,
)
enc = {k: v.to(device) for k, v in enc.items()}
gen_kwargs = dict(
forced_bos_token_id=forced_bos,
max_new_tokens=MAX_NEW_TOKENS,
num_beams=max(1, int(num_beams)),
do_sample=False,
use_cache=False,
)
if int(num_beams) > 1:
gen_kwargs["early_stopping"] = True
with autocast_ctx:
generated = mdl.generate(**enc, **gen_kwargs)
outs = tok.batch_decode(generated, skip_special_tokens=True)
outs = [o.strip() for o in outs]
del enc, generated
if device.type == "cuda":
_free_cuda()
return outs
except Exception as e:
if device.type == "cuda" and _is_cuda_mem_error(e) and _tries < 2:
LOG.warning("CUDA OOM/allocator (batch): intento de recuperación %d. Detalle: %s", _tries + 1, e)
global _MODEL, _DEVICE, _CUDA_DISABLED
_CUDA_DISABLED = True
try:
if _MODEL is not None:
del _MODEL
except Exception:
pass
_free_cuda()
_MODEL = None
_DEVICE = None
time.sleep(1.0)
return _translate_texts_simple(src_lang, tgt_lang, texts, num_beams=num_beams, _tries=_tries + 1)
raise
@torch.inference_mode()
def translate_text(src_lang: str, tgt_lang: str, text: str, num_beams: int = 1, _tries: int = 0) -> str:
if not text or not text.strip():
@@ -495,7 +579,7 @@ def _pack_sentences_to_token_chunks(
ids = tokenizer(s, add_special_tokens=False).input_ids
step = max_tokens
for i in range(0, len(ids), step):
-sub = tokenizer.decode(ids[i : i + step], skip_special_tokens=True)
+sub = tokenizer.decode(ids[i: i + step], skip_special_tokens=True)
if cur:
chunks.append(cur)
cur = []
@@ -536,6 +620,75 @@ def _smart_concatenate(parts: List[str], tail_window: int = 120) -> str:
return out
def translate_articles_full_batch(
src_lang: str,
tgt_lang: str,
texts: List[str],
num_beams: int,
) -> List[str]:
if not texts:
return []
if not CHUNK_BY_SENTENCES:
return _translate_texts_simple(src_lang, tgt_lang, texts, num_beams=num_beams)
tok, _, _ = get_universal_components()
safe_len = _safe_src_len(tok)
max_chunk_tokens = min(CHUNK_MAX_TOKENS, safe_len)
all_chunk_texts: List[str] = []
per_article_chunk_ids: List[List[int]] = []
for text in texts:
text = (text or "").strip()
if not text:
per_article_chunk_ids.append([])
continue
sents = split_into_sentences(text)
if not sents:
per_article_chunk_ids.append([])
continue
chunks_sents = _pack_sentences_to_token_chunks(
tok,
sents,
max_tokens=max_chunk_tokens,
overlap_sents=CHUNK_OVERLAP_SENTS,
)
ids_for_this_article: List[int] = []
for group in chunks_sents:
chunk_text = " ".join(group).strip()
if not chunk_text:
continue
idx = len(all_chunk_texts)
all_chunk_texts.append(chunk_text)
ids_for_this_article.append(idx)
per_article_chunk_ids.append(ids_for_this_article)
if not all_chunk_texts:
return ["" for _ in texts]
translated_chunks = _translate_texts_simple(
src_lang,
tgt_lang,
all_chunk_texts,
num_beams=num_beams,
)
outs: List[str] = []
for chunk_ids in per_article_chunk_ids:
if not chunk_ids:
outs.append("")
continue
parts = [translated_chunks[i] for i in chunk_ids]
outs.append(_smart_concatenate([p for p in parts if p]))
return outs
def translate_article_full(
src_lang: str,
tgt_lang: str,
@@ -570,9 +723,15 @@ def translate_article_full(
def process_batch(conn, rows):
batch_size = len(rows)
LOG.info("Iniciando traducción de batch con %d filas…", batch_size)
t0 = time.time()
done_rows = []
error_rows = []
enriched_rows = []
for r in rows:
tr_id = r["tr_id"]
lang_to = normalize_lang(r["lang_to"], "es") or "es"
@@ -581,23 +740,54 @@ def process_batch(conn, rows):
title = (r["titulo"] or "").strip()
body = (r["resumen"] or "").strip()
-if (map_to_nllb(lang_from) or "eng_Latn") == (map_to_nllb(lang_to) or "spa_Latn"):
+src_code = map_to_nllb(lang_from) or "eng_Latn"
+tgt_code = map_to_nllb(lang_to) or "spa_Latn"
+if src_code == tgt_code:
done_rows.append((title, body, lang_from, tr_id))
continue
enriched_rows.append(
{
"tr_id": tr_id,
"lang_from": lang_from,
"lang_to": lang_to,
"title": title,
"body": body,
}
)
from collections import defaultdict
groups = defaultdict(list)
for er in enriched_rows:
key = (er["lang_from"], er["lang_to"])
groups[key].append(er)
for (lang_from, lang_to), items in groups.items():
titles = [it["title"] for it in items]
bodies = [it["body"] for it in items]
try:
-title_tr = translate_text(lang_from, lang_to, title, num_beams=NUM_BEAMS_TITLE) if title else ""
-body_tr = translate_article_full(lang_from, lang_to, body, num_beams=NUM_BEAMS_BODY) if body else ""
-if _norm(title_tr) == _norm(title):
-title_tr = ""
-if _norm(body_tr) == _norm(body):
-body_tr = ""
-done_rows.append((title_tr, body_tr, lang_from, tr_id))
+titles_tr = _translate_texts_simple(lang_from, lang_to, titles, num_beams=NUM_BEAMS_TITLE)
+bodies_tr = translate_articles_full_batch(lang_from, lang_to, bodies, num_beams=NUM_BEAMS_BODY)
+for it, t_tr, b_tr in zip(items, titles_tr, bodies_tr):
+title_orig = it["title"]
+body_orig = it["body"]
+if _norm(t_tr) == _norm(title_orig):
+t_tr = ""
+if _norm(b_tr) == _norm(body_orig):
+b_tr = ""
+done_rows.append((t_tr, b_tr, lang_from, it["tr_id"]))
except Exception as e:
-LOG.exception("Error traduciendo fila")
-error_rows.append((str(e)[:1500], tr_id))
+LOG.exception("Error traduciendo lote %s -> %s", lang_from, lang_to)
+err_msg = str(e)[:1500]
+for it in items:
+error_rows.append((err_msg, it["tr_id"]))
with conn.cursor() as cur:
if done_rows:
@@ -630,6 +820,28 @@ def process_batch(conn, rows):
)
conn.commit()
dt = time.time() - t0
try:
_, _, device = get_universal_components()
dev_label = device.type.upper() if device is not None else "UNK"
except Exception:
dev_label = "UNK"
if batch_size > 0:
LOG.info(
"[%s] Batch de %d filas traducido en %.2f s (%.2f s/noticia)",
dev_label,
batch_size,
dt,
dt / batch_size,
)
else:
LOG.info(
"[%s] Batch vacío, nada que traducir (%.2f s)",
dev_label,
dt,
)
def main():
LOG.info(
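As a usage illustration of the batched path added in this file (a sketch only; it assumes the module is imported with a loaded NLLB model and that "en"/"es" are accepted language codes):

# Illustrative only; relies on the helpers defined above in translation_worker.py.
titles = ["Breaking news: markets rally", ""]
bodies = ["First paragraph of the article. Second paragraph.", ""]

titles_tr = _translate_texts_simple("en", "es", titles, num_beams=NUM_BEAMS_TITLE)
bodies_tr = translate_articles_full_batch("en", "es", bodies, num_beams=NUM_BEAMS_BODY)

# Both helpers preserve input order, and translate_articles_full_batch maps empty
# bodies to empty strings, so results can be zipped back onto the source rows
# exactly as process_batch does for each (lang_from, lang_to) group.
for t, b in zip(titles_tr, bodies_tr):
    print(repr(t), repr(b))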