rss/docker-compose.yml
2025-11-24 00:45:10 +01:00

206 lines
4.9 KiB
YAML

services:
  # PostgreSQL 18 — single shared database for the web app and all workers.
  db:
    image: postgres:18
    container_name: rss_db
    environment:
      POSTGRES_DB: ${DB_NAME}
      POSTGRES_USER: ${DB_USER}
      POSTGRES_PASSWORD: ${DB_PASS}
      POSTGRES_INITDB_ARGS: "--encoding=UTF8 --locale=C.UTF-8"
      LANG: C.UTF-8
      LC_ALL: C.UTF-8
      TZ: Europe/Madrid
      # Pin the cluster directory inside the mounted volume.
      # NOTE(review): the postgres:18 image changed its default data layout —
      # confirm this path matches the existing on-disk cluster before upgrading.
      PGDATA: /var/lib/postgresql/data/18/main
    # Raised connection cap: seven services share this one instance.
    command: ["postgres", "-c", "max_connections=400"]
    volumes:
      - /datos/rss/postgres/18:/var/lib/postgresql/data
      - ./init-db:/docker-entrypoint-initdb.d:ro
    restart: always
    healthcheck:
      # $$ escapes Compose interpolation so the CONTAINER's env vars are read,
      # not the host's .env values.
      test: ["CMD-SHELL", "pg_isready -h 127.0.0.1 -p 5432 -U $$POSTGRES_USER -d $$POSTGRES_DB || exit 1"]
      interval: 5s
      timeout: 5s
      retries: 30
      start_period: 20s
web:
build:
context: .
args:
TORCH_CUDA: cu121
container_name: rss_web
command: bash -lc "gunicorn --bind 0.0.0.0:8000 --workers 3 --timeout 120 app:app"
ports:
- "8001:8000"
environment:
- DB_HOST=db
- DB_PORT=5432
- DB_NAME=${DB_NAME}
- DB_USER=${DB_USER}
- DB_PASS=${DB_PASS}
- SECRET_KEY=${SECRET_KEY}
- WEB_TRANSLATED_DEFAULT=1
- DEFAULT_LANG=es
- TRANSLATION_PREFERRED_LANGS=es
depends_on:
db:
condition: service_healthy
restart: always
scheduler:
build:
context: .
args:
TORCH_CUDA: cu121
container_name: rss_scheduler
command: bash -lc "python scheduler.py"
environment:
- DB_HOST=db
- DB_PORT=5432
- DB_NAME=${DB_NAME}
- DB_USER=${DB_USER}
- DB_PASS=${DB_PASS}
- SECRET_KEY=${SECRET_KEY}
- RSS_MAX_WORKERS=8
depends_on:
db:
condition: service_healthy
restart: always
translator:
build:
context: .
args:
TORCH_CUDA: cu121
container_name: rss_translator
command: bash -lc "python translation_worker.py"
environment:
- DB_HOST=db
- DB_PORT=5432
- DB_NAME=${DB_NAME}
- DB_USER=${DB_USER}
- DB_PASS=${DB_PASS}
- TARGET_LANGS=es
- TRANSLATOR_BATCH=32
- ENQUEUE=200
- TRANSLATOR_SLEEP_IDLE=5
- MAX_SRC_TOKENS=680
- MAX_NEW_TOKENS=400
- NUM_BEAMS_TITLE=2
- NUM_BEAMS_BODY=1
- UNIVERSAL_MODEL=facebook/nllb-200-1.3B
- CHUNK_BY_SENTENCES=True
- CHUNK_MAX_TOKENS=400
- CHUNK_OVERLAP_SENTS=1
- CLEAN_ARTICLE=1
- DEVICE=cuda
- PYTHONUNBUFFERED=1
- HF_HOME=/root/.cache/huggingface
- TOKENIZERS_PARALLELISM=false
- PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:64,garbage_collection_threshold:0.9
- NVIDIA_VISIBLE_DEVICES=all
- NVIDIA_DRIVER_CAPABILITIES=compute,utility
volumes:
- /datos/rss/hf_cache:/root/.cache/huggingface
depends_on:
db:
condition: service_healthy
restart: always
gpus: all
ner:
build:
context: .
args:
TORCH_CUDA: cu121
container_name: rss_ner
command: bash -lc "python ner_worker.py"
environment:
- DB_HOST=db
- DB_PORT=5432
- DB_NAME=${DB_NAME}
- DB_USER=${DB_USER}
- DB_PASS=${DB_PASS}
- NER_LANG=es
- NER_BATCH=64
depends_on:
db:
condition: service_healthy
restart: always
embeddings:
build:
context: .
args:
TORCH_CUDA: cu121
container_name: rss_embeddings
command: bash -lc "python embeddings_worker.py"
environment:
- DB_HOST=db
- DB_PORT=5432
- DB_NAME=${DB_NAME}
- DB_USER=${DB_USER}
- DB_PASS=${DB_PASS}
- EMB_MODEL=sentence-transformers/paraphrase-multilingual-mpnet-base-v2
- EMB_BATCH=256
- EMB_SLEEP_IDLE=5
- EMB_LANGS=es
- EMB_LIMIT=5000
- DEVICE=cuda
- PYTHONUNBUFFERED=1
- HF_HOME=/root/.cache/huggingface
- TOKENIZERS_PARALLELISM=false
volumes:
- /datos/rss/hf_cache:/root/.cache/huggingface
depends_on:
db:
condition: service_healthy
restart: always
gpus: all
related:
build:
context: .
args:
TORCH_CUDA: cu121
container_name: rss_related
command: bash -lc "python related_worker.py"
environment:
- DB_HOST=db
- DB_PORT=5432
- DB_NAME=${DB_NAME}
- DB_USER=${DB_USER}
- DB_PASS=${DB_PASS}
- RELATED_TOPK=10
- RELATED_BATCH_IDS=200
- RELATED_BATCH_SIM=2000
- RELATED_SLEEP=10
- RELATED_MIN_SCORE=0.0
- RELATED_WINDOW_H=0
depends_on:
db:
condition: service_healthy
restart: always
cluster:
build:
context: .
args:
TORCH_CUDA: cu121
container_name: rss_cluster
command: bash -lc "python cluster_worker.py"
environment:
- DB_HOST=db
- DB_PORT=5432
- DB_NAME=${DB_NAME}
- DB_USER=${DB_USER}
- DB_PASS=${DB_PASS}
depends_on:
db:
condition: service_healthy
restart: always
# Fixed network name so external tooling can attach predictably.
networks:
  default:
    name: rss_default