Mejoras: NER, embeddings, dashboard, docker-compose y limpieza

2025-11-17 19:37:05 +01:00 · 2025-11-17 19:37:05 +01:00 · d508dc2058
commit d508dc2058
parent 6c5aff9936
19 changed files with 2218 additions and 1185 deletions
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,20 +1,27 @@
 services:
  db:
-    image: postgres:15
+    image: postgres:18
    container_name: rss_db
    environment:
-      - POSTGRES_DB=${DB_NAME}
-      - POSTGRES_USER=${DB_USER}
-      - POSTGRES_PASSWORD=${DB_PASS}
+      POSTGRES_DB: ${DB_NAME}
+      POSTGRES_USER: ${DB_USER}
+      POSTGRES_PASSWORD: ${DB_PASS}
+      POSTGRES_INITDB_ARGS: "--encoding=UTF8 --locale=C.UTF-8"
+      LANG: C.UTF-8
+      LC_ALL: C.UTF-8
+      TZ: Europe/Madrid
+      PGDATA: /var/lib/postgresql/data/18/main
+    command: ["postgres", "-c", "max_connections=400"]
    volumes:
-      - postgres_data:/var/lib/postgresql/data
-      - ./init-db:/docker-entrypoint-initdb.d
+      - /datos/rss/postgres/18:/var/lib/postgresql/data
+      - ./init-db:/docker-entrypoint-initdb.d:ro
    restart: always
    healthcheck:
-      test: ["CMD-SHELL", "pg_isready -U ${DB_USER} -d ${DB_NAME}"]
+      test: ["CMD-SHELL", "pg_isready -h 127.0.0.1 -p 5432 -U $$POSTGRES_USER -d $$POSTGRES_DB || exit 1"]
      interval: 5s
      timeout: 5s
-      retries: 5
+      retries: 30
+      start_period: 20s

  web:
    build:
@@ -22,7 +29,7 @@ services:
      args:
        TORCH_CUDA: cu121
    container_name: rss_web
-    command: gunicorn --bind 0.0.0.0:8000 --workers 3 app:app
+    command: bash -lc "gunicorn --bind 0.0.0.0:8000 --workers 3 --timeout 120 app:app"
    ports:
      - "8001:8000"
    environment:
@@ -46,7 +53,7 @@ services:
      args:
        TORCH_CUDA: cu121
    container_name: rss_scheduler
-    command: python scheduler.py
+    command: bash -lc "python scheduler.py"
    environment:
      - DB_HOST=db
      - DB_PORT=5432
@@ -54,6 +61,7 @@ services:
      - DB_USER=${DB_USER}
      - DB_PASS=${DB_PASS}
      - SECRET_KEY=${SECRET_KEY}
+      - RSS_MAX_WORKERS=8
    depends_on:
      db:
        condition: service_healthy
@@ -65,7 +73,7 @@ services:
      args:
        TORCH_CUDA: cu121
    container_name: rss_translator
-    command: python translation_worker.py
+    command: bash -lc "python translation_worker.py"
    environment:
      - DB_HOST=db
      - DB_PORT=5432
@@ -101,7 +109,7 @@ services:
      - NVIDIA_VISIBLE_DEVICES=all
      - NVIDIA_DRIVER_CAPABILITIES=compute,utility
    volumes:
-      - hf_cache:/root/.cache/huggingface
+      - /datos/rss/hf_cache:/root/.cache/huggingface
    depends_on:
      db:
        condition: service_healthy
@@ -114,7 +122,7 @@ services:
      args:
        TORCH_CUDA: cu121
    container_name: rss_ner
-    command: python ner_worker.py
+    command: bash -lc "python ner_worker.py"
    environment:
      - DB_HOST=db
      - DB_PORT=5432
@@ -128,7 +136,61 @@ services:
        condition: service_healthy
    restart: always

-volumes:
-  postgres_data:
-  hf_cache:
+  embeddings:
+    build:
+      context: .
+      args:
+        TORCH_CUDA: cu121
+    container_name: rss_embeddings
+    command: bash -lc "python embeddings_worker.py"
+    environment:
+      - DB_HOST=db
+      - DB_PORT=5432
+      - DB_NAME=${DB_NAME}
+      - DB_USER=${DB_USER}
+      - DB_PASS=${DB_PASS}
+      # EMB_*: worker settings (presumably model id, batch size, idle sleep; confirm in embeddings_worker.py)
+      - EMB_MODEL=sentence-transformers/all-MiniLM-L6-v2
+      - EMB_BATCH=64
+      - EMB_SLEEP=5
+      # Python / Hugging Face runtime settings; HF_HOME is the same path as the cache mount under `volumes`
+      - PYTHONUNBUFFERED=1
+      - HF_HOME=/root/.cache/huggingface
+      - TOKENIZERS_PARALLELISM=false
+    volumes:
+      - /datos/rss/hf_cache:/root/.cache/huggingface
+    depends_on:
+      db:
+        condition: service_healthy
+    restart: always
+    # gpus: all  # GPU request left commented out; TODO confirm this worker is meant to run on CPU
+
+  related:
+    build:
+      context: .
+      args:
+        TORCH_CUDA: cu121
+    container_name: rss_related
+    command: bash -lc "python related_worker.py"
+    environment:
+      - DB_HOST=db
+      - DB_PORT=5432
+      - DB_NAME=${DB_NAME}
+      - DB_USER=${DB_USER}
+      - DB_PASS=${DB_PASS}
+      # RELATED_*: worker settings; meanings are not visible here (presumably top-K, batch sizes, sleep, score floor, time window; confirm in related_worker.py)
+      - RELATED_TOPK=10
+      - RELATED_BATCH_IDS=200
+      - RELATED_BATCH_SIM=2000
+      - RELATED_SLEEP=10
+      - RELATED_MIN_SCORE=0.0
+      - RELATED_WINDOW_H=0
+    depends_on:
+      db:
+        condition: service_healthy
+    restart: always
+
+networks:
+  default:
+    name: rss_default