From 10d51c3c521d0224c78ee3601528c384b1c4d9a9 Mon Sep 17 00:00:00 2001 From: SITO Date: Mon, 30 Mar 2026 20:49:30 +0200 Subject: [PATCH 1/8] feat(deploy): despliegue nativo Debian sin Docker - Elimina todos los Dockerfiles y docker-compose.yml - Elimina scripts Docker (start_docker, reset_and_deploy, deploy-clean) - Agrega deploy/debian/ con despliegue nativo via systemd: - install.sh: instalacion completa en Debian (PostgreSQL, Redis, Qdrant binario, Go, Python venv, nginx, frontend compilado) - build.sh: recompila binarios Go y frontend sin reinstalar - env.example: variables de entorno sin referencias Docker - nginx.conf: sirve React estatico + proxy al API Go en localhost - systemd/*.service: 16 servicios (8 Go + 7 Python + Qdrant) Todos los hostnames Docker (db, redis, qdrant) reemplazados por 127.0.0.1 Co-Authored-By: Claude Sonnet 4.6 --- Dockerfile | 50 -- Dockerfile.discovery | 31 - Dockerfile.qdrant | 34 - Dockerfile.related | 32 - Dockerfile.scheduler | 23 - Dockerfile.scraper | 32 - Dockerfile.topics | 30 - Dockerfile.translator | 43 - Dockerfile.translator-gpu | 48 -- Dockerfile.wiki | 31 - backend/Dockerfile | 24 - deploy-clean.sh | 47 -- deploy/debian/build.sh | 69 ++ deploy/debian/env.example | 104 +++ deploy/debian/install.sh | 294 +++++++ deploy/debian/nginx.conf | 91 +++ deploy/debian/systemd/rss2-backend.service | 24 + .../debian/systemd/rss2-categorizer.service | 25 + deploy/debian/systemd/rss2-cluster.service | 25 + deploy/debian/systemd/rss2-discovery.service | 26 + deploy/debian/systemd/rss2-embeddings.service | 30 + deploy/debian/systemd/rss2-ingestor.service | 26 + deploy/debian/systemd/rss2-langdetect.service | 25 + deploy/debian/systemd/rss2-ner.service | 26 + .../debian/systemd/rss2-qdrant-worker.service | 28 + deploy/debian/systemd/rss2-qdrant.service | 25 + deploy/debian/systemd/rss2-related.service | 26 + deploy/debian/systemd/rss2-scraper.service | 25 + deploy/debian/systemd/rss2-topics.service | 25 + 
.../rss2-translation-scheduler.service | 26 + deploy/debian/systemd/rss2-translator.service | 31 + deploy/debian/systemd/rss2-wiki.service | 24 + docker-compose.yml | 748 ------------------ docker-entrypoint-db.sh | 42 - frontend/Dockerfile | 19 - monitoring/prometheus.yml | 21 - reset_and_deploy.sh | 14 - rss-ingestor-go/Dockerfile | 27 - start_docker.sh | 23 - 39 files changed, 975 insertions(+), 1319 deletions(-) delete mode 100644 Dockerfile delete mode 100644 Dockerfile.discovery delete mode 100644 Dockerfile.qdrant delete mode 100644 Dockerfile.related delete mode 100644 Dockerfile.scheduler delete mode 100644 Dockerfile.scraper delete mode 100644 Dockerfile.topics delete mode 100644 Dockerfile.translator delete mode 100644 Dockerfile.translator-gpu delete mode 100644 Dockerfile.wiki delete mode 100644 backend/Dockerfile delete mode 100755 deploy-clean.sh create mode 100755 deploy/debian/build.sh create mode 100644 deploy/debian/env.example create mode 100755 deploy/debian/install.sh create mode 100644 deploy/debian/nginx.conf create mode 100644 deploy/debian/systemd/rss2-backend.service create mode 100644 deploy/debian/systemd/rss2-categorizer.service create mode 100644 deploy/debian/systemd/rss2-cluster.service create mode 100644 deploy/debian/systemd/rss2-discovery.service create mode 100644 deploy/debian/systemd/rss2-embeddings.service create mode 100644 deploy/debian/systemd/rss2-ingestor.service create mode 100644 deploy/debian/systemd/rss2-langdetect.service create mode 100644 deploy/debian/systemd/rss2-ner.service create mode 100644 deploy/debian/systemd/rss2-qdrant-worker.service create mode 100644 deploy/debian/systemd/rss2-qdrant.service create mode 100644 deploy/debian/systemd/rss2-related.service create mode 100644 deploy/debian/systemd/rss2-scraper.service create mode 100644 deploy/debian/systemd/rss2-topics.service create mode 100644 deploy/debian/systemd/rss2-translation-scheduler.service create mode 100644 
deploy/debian/systemd/rss2-translator.service create mode 100644 deploy/debian/systemd/rss2-wiki.service delete mode 100644 docker-compose.yml delete mode 100755 docker-entrypoint-db.sh delete mode 100644 frontend/Dockerfile delete mode 100644 monitoring/prometheus.yml delete mode 100755 reset_and_deploy.sh delete mode 100644 rss-ingestor-go/Dockerfile delete mode 100755 start_docker.sh diff --git a/Dockerfile b/Dockerfile deleted file mode 100644 index abd9f82..0000000 --- a/Dockerfile +++ /dev/null @@ -1,50 +0,0 @@ -FROM python:3.11-slim - -WORKDIR /app - -RUN apt-get update && apt-get install -y --no-install-recommends \ - libpq-dev gcc git curl \ - && rm -rf /var/lib/apt/lists/* - -ENV PYTHONUNBUFFERED=1 \ - PIP_DISABLE_PIP_VERSION_CHECK=1 \ - TOKENIZERS_PARALLELISM=false \ - HF_HOME=/root/.cache/huggingface - -COPY requirements.txt . -RUN pip install --no-cache-dir --upgrade pip - -RUN pip install --no-cache-dir torch==2.1.0 torchvision==0.16.0 --index-url https://download.pytorch.org/whl/cu121 - -RUN pip install --no-cache-dir \ - ctranslate2 \ - sentencepiece \ - transformers==4.44.0 \ - protobuf==3.20.3 \ - "numpy<2" \ - psycopg2-binary \ - redis \ - requests \ - beautifulsoup4 \ - lxml \ - langdetect \ - nltk \ - scikit-learn \ - pandas \ - sentence-transformers \ - spacy - -RUN python -m spacy download es_core_news_lg - -COPY workers/ ./workers/ -COPY init-db/ ./init-db/ -COPY migrations/ ./migrations/ -COPY entity_config.json . 
- -ENV DB_HOST=db -ENV DB_PORT=5432 -ENV DB_NAME=rss -ENV DB_USER=rss -ENV DB_PASS=x - -CMD ["python", "-m", "workers.embeddings_worker"] diff --git a/Dockerfile.discovery b/Dockerfile.discovery deleted file mode 100644 index 90e405d..0000000 --- a/Dockerfile.discovery +++ /dev/null @@ -1,31 +0,0 @@ -FROM golang:1.22-alpine AS builder - -ENV GOTOOLCHAIN=auto - -RUN apk add --no-cache git - -WORKDIR /app - -COPY backend/go.mod backend/go.sum ./ -RUN go mod download - -COPY backend/ ./ - -RUN CGO_ENABLED=0 GOOS=linux go build -o /bin/discovery ./cmd/discovery - -FROM alpine:3.19 - -RUN apk add --no-cache ca-certificates tzdata - -COPY --from=builder /bin/discovery /bin/discovery - -ENV DB_HOST=db \ - DB_PORT=5432 \ - DB_NAME=rss \ - DB_USER=rss \ - DB_PASS=rss \ - DISCOVERY_INTERVAL=900 \ - DISCOVERY_BATCH=10 \ - MAX_FEEDS_PER_URL=5 - -ENTRYPOINT ["/bin/discovery"] diff --git a/Dockerfile.qdrant b/Dockerfile.qdrant deleted file mode 100644 index e80bfae..0000000 --- a/Dockerfile.qdrant +++ /dev/null @@ -1,34 +0,0 @@ -FROM golang:1.22-alpine AS builder - -ENV GOTOOLCHAIN=auto - -RUN apk add --no-cache git - -WORKDIR /app - -COPY backend/go.mod backend/go.sum ./ -RUN go mod download - -COPY backend/ ./ - -RUN CGO_ENABLED=0 GOOS=linux go build -o /bin/qdrant-worker ./cmd/qdrant - -FROM alpine:3.19 - -RUN apk add --no-cache ca-certificates tzdata - -COPY --from=builder /bin/qdrant-worker /bin/qdrant-worker - -ENV DB_HOST=db \ - DB_PORT=5432 \ - DB_NAME=rss \ - DB_USER=rss \ - DB_PASS=rss \ - QDRANT_HOST=qdrant \ - QDRANT_PORT=6333 \ - QDRANT_COLLECTION=news_vectors \ - OLLAMA_URL=http://ollama:11434 \ - QDRANT_SLEEP=30 \ - QDRANT_BATCH=100 - -ENTRYPOINT ["/bin/qdrant-worker"] diff --git a/Dockerfile.related b/Dockerfile.related deleted file mode 100644 index 12e011d..0000000 --- a/Dockerfile.related +++ /dev/null @@ -1,32 +0,0 @@ -FROM golang:1.22-alpine AS builder - -ENV GOTOOLCHAIN=auto - -RUN apk add --no-cache git - -WORKDIR /app - -COPY backend/go.mod backend/go.sum 
./ -RUN go mod download - -COPY backend/ ./ - -RUN CGO_ENABLED=0 GOOS=linux go build -o /bin/related ./cmd/related - -FROM alpine:3.19 - -RUN apk add --no-cache ca-certificates tzdata - -COPY --from=builder /bin/related /bin/related - -ENV DB_HOST=db \ - DB_PORT=5432 \ - DB_NAME=rss \ - DB_USER=rss \ - DB_PASS=rss \ - RELATED_SLEEP=10 \ - RELATED_BATCH=200 \ - RELATED_TOPK=10 \ - EMB_MODEL=mxbai-embed-large - -ENTRYPOINT ["/bin/related"] diff --git a/Dockerfile.scheduler b/Dockerfile.scheduler deleted file mode 100644 index 4a81d3e..0000000 --- a/Dockerfile.scheduler +++ /dev/null @@ -1,23 +0,0 @@ -FROM python:3.11-slim - -WORKDIR /app - -RUN apt-get update && apt-get install -y --no-install-recommends \ - libpq-dev \ - && rm -rf /var/lib/apt/lists/* - -ENV PYTHONUNBUFFERED=1 - -COPY requirements.txt . -RUN pip install --no-cache-dir --upgrade pip -RUN pip install --no-cache-dir psycopg2-binary langdetect - -COPY workers/translation_scheduler.py ./workers/ - -ENV DB_HOST=db -ENV DB_PORT=5432 -ENV DB_NAME=rss -ENV DB_USER=rss -ENV DB_PASS=x - -CMD ["python", "workers/translation_scheduler.py"] diff --git a/Dockerfile.scraper b/Dockerfile.scraper deleted file mode 100644 index 9a32bff..0000000 --- a/Dockerfile.scraper +++ /dev/null @@ -1,32 +0,0 @@ -FROM golang:1.22-alpine AS builder - -ENV GOTOOLCHAIN=auto - -RUN apk add --no-cache git - -WORKDIR /app - -COPY backend/go.mod backend/go.sum ./ -RUN go mod download - -COPY backend/ ./ - -RUN go mod tidy - -RUN CGO_ENABLED=0 GOOS=linux go build -o /bin/scraper ./cmd/scraper - -FROM alpine:3.19 - -RUN apk add --no-cache ca-certificates tzdata - -COPY --from=builder /bin/scraper /bin/scraper - -ENV DB_HOST=db \ - DB_PORT=5432 \ - DB_NAME=rss \ - DB_USER=rss \ - DB_PASS=rss \ - SCRAPER_SLEEP=60 \ - SCRAPER_BATCH=10 - -ENTRYPOINT ["/bin/scraper"] diff --git a/Dockerfile.topics b/Dockerfile.topics deleted file mode 100644 index fc82ea7..0000000 --- a/Dockerfile.topics +++ /dev/null @@ -1,30 +0,0 @@ -FROM golang:1.22-alpine 
AS builder - -ENV GOTOOLCHAIN=auto - -RUN apk add --no-cache git - -WORKDIR /app - -COPY backend/go.mod backend/go.sum ./ -RUN go mod download - -COPY backend/ ./ - -RUN CGO_ENABLED=0 GOOS=linux go build -o /bin/topics ./cmd/topics - -FROM alpine:3.19 - -RUN apk add --no-cache ca-certificates tzdata - -COPY --from=builder /bin/topics /bin/topics - -ENV DB_HOST=db \ - DB_PORT=5432 \ - DB_NAME=rss \ - DB_USER=rss \ - DB_PASS=rss \ - TOPICS_SLEEP=10 \ - TOPICS_BATCH=500 - -ENTRYPOINT ["/bin/topics"] diff --git a/Dockerfile.translator b/Dockerfile.translator deleted file mode 100644 index e6a96be..0000000 --- a/Dockerfile.translator +++ /dev/null @@ -1,43 +0,0 @@ -FROM python:3.11-slim-bookworm - -RUN apt-get update && apt-get install -y --no-install-recommends \ - patchelf libpq-dev gcc git curl wget \ - && rm -rf /var/lib/apt/lists/* - -ENV PYTHONUNBUFFERED=1 \ - PIP_DISABLE_PIP_VERSION_CHECK=1 \ - TOKENIZERS_PARALLELISM=false \ - HF_HOME=/root/.cache/huggingface - -WORKDIR /app - -COPY requirements.txt . 
-RUN pip install --no-cache-dir --upgrade pip - -RUN pip install --no-cache-dir torch==2.1.0 torchvision==0.16.0 --index-url https://download.pytorch.org/whl/cpu - -RUN pip install --no-cache-dir \ - ctranslate2==3.24.0 \ - sentencepiece \ - transformers==4.36.0 \ - protobuf==3.20.3 \ - "numpy<2" \ - psycopg2-binary \ - langdetect - -# === ARREGLAR EL EXECUTABLE STACK === -RUN find /usr/local/lib/python3.11/site-packages/ctranslate2* \ - -name "libctranslate2-*.so.*" -o -name "libctranslate2.so*" | \ - xargs -I {} patchelf --clear-execstack {} || true - -COPY workers/ ./workers/ -COPY init-db/ ./init-db/ -COPY migrations/ ./migrations/ - -ENV DB_HOST=db -ENV DB_PORT=5432 -ENV DB_NAME=rss -ENV DB_USER=rss -ENV DB_PASS=x - -CMD ["python", "-m", "workers.ctranslator_worker"] diff --git a/Dockerfile.translator-gpu b/Dockerfile.translator-gpu deleted file mode 100644 index c3a990b..0000000 --- a/Dockerfile.translator-gpu +++ /dev/null @@ -1,48 +0,0 @@ -FROM python:3.11-slim-bookworm - -RUN apt-get update && apt-get install -y --no-install-recommends \ - patchelf libpq-dev gcc git curl wget \ - && rm -rf /var/lib/apt/lists/* - -ENV PYTHONUNBUFFERED=1 \ - PIP_DISABLE_PIP_VERSION_CHECK=1 \ - TOKENIZERS_PARALLELISM=false \ - HF_HOME=/root/.cache/huggingface - -WORKDIR /app - -COPY requirements.txt . 
-RUN pip install --no-cache-dir --upgrade pip - -# Install PyTorch with CUDA support (cu118 for broader compatibility) -RUN pip install --no-cache-dir torch==2.1.0 torchvision==0.16.0 --index-url https://download.pytorch.org/whl/cu118 - -RUN pip install --no-cache-dir \ - ctranslate2==3.24.0 \ - sentencepiece \ - transformers==4.36.0 \ - protobuf==3.20.3 \ - "numpy<2" \ - psycopg2-binary \ - langdetect - -# Fix executable stack -RUN find /usr/local/lib/python3.11/site-packages/ctranslate2* \ - -name "libctranslate2-*.so.*" -o -name "libctranslate2.so*" | \ - xargs -I {} patchelf --clear-execstack {} || true - -COPY workers/ ./workers/ -COPY init-db/ ./init-db/ -COPY migrations/ ./migrations/ - -ENV DB_HOST=db -ENV DB_PORT=5432 -ENV DB_NAME=rss -ENV DB_USER=rss -ENV DB_PASS=x - -# GPU Configuration - Override with: docker run --gpus all -ENV CT2_DEVICE=cuda -ENV CT2_COMPUTE_TYPE=float16 - -CMD ["python", "-m", "workers.ctranslator_worker"] diff --git a/Dockerfile.wiki b/Dockerfile.wiki deleted file mode 100644 index fbd84e0..0000000 --- a/Dockerfile.wiki +++ /dev/null @@ -1,31 +0,0 @@ -FROM golang:alpine AS builder - -ENV GOTOOLCHAIN=auto - -RUN apk add --no-cache git - -WORKDIR /app - -COPY backend/go.mod backend/go.sum ./ -RUN go mod download - -COPY backend/ ./ - -RUN go mod tidy - -RUN CGO_ENABLED=0 GOOS=linux go build -o /bin/wiki_worker ./cmd/wiki_worker - -FROM alpine:3.19 - -RUN apk add --no-cache ca-certificates tzdata - -COPY --from=builder /bin/wiki_worker /bin/wiki_worker - -ENV DB_HOST=db \ - DB_PORT=5432 \ - DB_NAME=rss \ - DB_USER=rss \ - DB_PASS=rss \ - WIKI_SLEEP=10 - -ENTRYPOINT ["/bin/wiki_worker"] diff --git a/backend/Dockerfile b/backend/Dockerfile deleted file mode 100644 index 6d232b9..0000000 --- a/backend/Dockerfile +++ /dev/null @@ -1,24 +0,0 @@ -FROM golang:1.23 AS builder - -WORKDIR /app - -RUN apt-get update && apt-get install -y gcc musl-dev git - -COPY go.mod go.sum ./ -RUN go mod download - -COPY . . 
- -RUN CGO_ENABLED=0 GOOS=linux go build -buildvcs=false -o /server ./cmd/server - -FROM alpine:3.19 - -RUN apk add --no-cache ca-certificates tzdata postgresql-client - -WORKDIR /app - -COPY --from=builder /server . - -EXPOSE 8080 - -CMD ["./server"] diff --git a/deploy-clean.sh b/deploy-clean.sh deleted file mode 100755 index 6f2ccb6..0000000 --- a/deploy-clean.sh +++ /dev/null @@ -1,47 +0,0 @@ -#!/bin/bash -# Script para despliegue limpio de RSS2 - -echo "=== RSS2 Clean Deployment Script ===" -echo "" - -# Detener contenedores -echo "1. Deteniendo contenedores..." -docker compose down -v 2>/dev/null - -# Eliminar volúmenes de datos (si hay permisos) -echo "2. Eliminando volúmenes de datos..." -docker volume rm rss2_db 2>/dev/null || true -docker volume rm rss2_redis 2>/dev/null || true - -# Si los volúmenes Docker tienen problemas, intentar con rm -echo " Intentando limpiar /data/..." -sudo rm -rf /datos/rss2/data/pgdata 2>/dev/null || true -sudo rm -rf /datos/rss2/data/redis-data 2>/dev/null || true - -# Iniciar base de datos -echo "3. Iniciando base de datos..." -docker compose up -d db - -# Esperar a que esté lista -echo "4. Esperando a que la base de datos esté lista..." -sleep 10 - -# Verificar estado -if docker compose ps db | grep -q "healthy"; then - echo " ✓ Base de datos iniciada correctamente" - - # Ejecutar script de schema - echo "5. Ejecutando script de inicialización..." - docker compose exec -T db psql -U rss -d rss -f /docker-entrypoint-initdb.d/00-complete-schema.sql 2>&1 | tail -5 - - # Iniciar demás servicios - echo "6. Iniciando servicios..." 
- docker compose up -d redis backend-go rss2_frontend nginx rss-ingestor-go - - echo "" - echo "=== Despliegue completado ===" - echo "Accede a: http://localhost:8001" -else - echo " ✗ Error: La base de datos no está healthy" - docker compose logs db -fi diff --git a/deploy/debian/build.sh b/deploy/debian/build.sh new file mode 100755 index 0000000..2699652 --- /dev/null +++ b/deploy/debian/build.sh @@ -0,0 +1,69 @@ +#!/usr/bin/env bash +# ============================================================================= +# RSS2 - Recompila binarios y frontend (sin reinstalar el sistema) +# Usar despues de actualizar el codigo: bash build.sh +# ============================================================================= +set -euo pipefail + +RSS2_HOME="/opt/rss2" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)" + +export PATH=$PATH:/usr/local/go/bin + +GREEN='\033[0;32m'; YELLOW='\033[1;33m'; NC='\033[0m' +info() { echo -e "${GREEN}[BUILD]${NC} $*"; } +warn() { echo -e "${YELLOW}[WARN]${NC} $*"; } + +# --- Go Backend + Workers --- +if [[ -d "$REPO_ROOT/backend" ]]; then + info "Compilando backend Go..." + (cd "$REPO_ROOT/backend" && \ + CGO_ENABLED=0 GOOS=linux go build -buildvcs=false -o "$RSS2_HOME/bin/server" ./cmd/server) + info " [OK] server" + + for cmd in scraper discovery wiki_worker topics related qdrant; do + [[ -d "$REPO_ROOT/backend/cmd/$cmd" ]] || continue + (cd "$REPO_ROOT/backend" && \ + CGO_ENABLED=0 GOOS=linux go build -buildvcs=false -o "$RSS2_HOME/bin/$cmd" "./cmd/$cmd") + info " [OK] $cmd" + done +fi + +# --- Ingestor Go --- +if [[ -d "$REPO_ROOT/rss-ingestor-go" ]]; then + info "Compilando ingestor Go..." + (cd "$REPO_ROOT/rss-ingestor-go" && \ + CGO_ENABLED=0 GOOS=linux go build -buildvcs=false -o "$RSS2_HOME/bin/ingestor" .) + info " [OK] ingestor" +fi + +# --- Frontend React --- +if [[ -d "$REPO_ROOT/frontend" ]]; then + info "Compilando frontend React..." 
+ (cd "$REPO_ROOT/frontend" && \ + npm install --silent && \ + VITE_API_URL=/api npm run build -- --outDir "$RSS2_HOME/frontend/dist") + info " [OK] frontend" +fi + +# --- Workers Python --- +info "Sincronizando workers Python..." +rsync -a --delete "$REPO_ROOT/workers/" "$RSS2_HOME/src/workers/" +cp "$REPO_ROOT/entity_config.json" "$RSS2_HOME/src/" 2>/dev/null || true +info " [OK] workers Python" + +chown -R rss2:rss2 "$RSS2_HOME/bin" "$RSS2_HOME/frontend/dist" "$RSS2_HOME/src" + +# --- Restart servicios --- +info "Reiniciando servicios..." +GO_SERVICES=(rss2-backend rss2-ingestor rss2-scraper rss2-discovery rss2-wiki rss2-topics rss2-related rss2-qdrant-worker) +PY_SERVICES=(rss2-langdetect rss2-translation-scheduler rss2-translator rss2-embeddings rss2-ner rss2-cluster rss2-categorizer) + +for svc in "${GO_SERVICES[@]}" "${PY_SERVICES[@]}"; do + systemctl is-active --quiet "$svc" && systemctl restart "$svc" && info " restarted $svc" || true +done + +systemctl reload nginx 2>/dev/null || true + +info "Build completado." 
diff --git a/deploy/debian/env.example b/deploy/debian/env.example new file mode 100644 index 0000000..52b91e7 --- /dev/null +++ b/deploy/debian/env.example @@ -0,0 +1,104 @@ +# ============================================================================= +# RSS2 - Variables de entorno para despliegue Debian nativo +# Copiar a /opt/rss2/.env y editar valores antes de instalar +# ============================================================================= + +# --- PostgreSQL --- +POSTGRES_DB=rss +POSTGRES_USER=rss +POSTGRES_PASSWORD=CAMBIA_ESTO_postgres_password + +# Usadas por workers Go (equivalente a DATABASE_URL) +DB_HOST=127.0.0.1 +DB_PORT=5432 +DB_NAME=rss +DB_USER=rss +DB_PASS=CAMBIA_ESTO_postgres_password + +# URL completa para backend API Go +DATABASE_URL=postgres://rss:CAMBIA_ESTO_postgres_password@127.0.0.1:5432/rss?sslmode=disable + +# --- Redis --- +REDIS_PASSWORD=CAMBIA_ESTO_redis_password +REDIS_URL=redis://:CAMBIA_ESTO_redis_password@127.0.0.1:6379 + +# --- JWT Secret (minimo 32 caracteres, aleatorio) --- +SECRET_KEY=CAMBIA_ESTO_jwt_secret_muy_largo_y_aleatorio + +# --- Backend API --- +SERVER_PORT=8080 + +# --- Zona horaria --- +TZ=Europe/Madrid + +# --- HuggingFace cache (modelos ML) --- +HF_HOME=/opt/rss2/hf_cache + +# --- Qdrant (local, sin Docker) --- +QDRANT_HOST=127.0.0.1 +QDRANT_PORT=6333 +QDRANT_COLLECTION=news_vectors + +# --- Translator (NLLB-200 via CTranslate2) --- +TARGET_LANGS=es +TRANSLATOR_BATCH=32 +CT2_MODEL_PATH=/opt/rss2/models/nllb-ct2 +CT2_DEVICE=cpu +CT2_COMPUTE_TYPE=int8 +UNIVERSAL_MODEL=facebook/nllb-200-distilled-600M + +# --- Embeddings --- +EMB_MODEL=sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2 +EMB_BATCH=64 +EMB_SLEEP_IDLE=5 +EMB_LANGS=es +EMB_LIMIT=1000 +DEVICE=cpu + +# --- NER --- +NER_LANG=es +NER_BATCH=64 + +# --- Ingestor RSS --- +RSS_MAX_WORKERS=100 +RSS_POKE_INTERVAL_MIN=15 + +# --- Scraper --- +SCRAPER_SLEEP=60 +SCRAPER_BATCH=10 + +# --- Discovery --- +DISCOVERY_INTERVAL=900 +DISCOVERY_BATCH=10 
+MAX_FEEDS_PER_URL=5 + +# --- Wiki Worker --- +WIKI_SLEEP=10 + +# --- Topics --- +TOPICS_SLEEP=10 +TOPICS_BATCH=500 + +# --- Related --- +RELATED_SLEEP=10 +RELATED_BATCH=200 +RELATED_TOPK=10 + +# --- Cluster --- +EVENT_DIST_THRESHOLD=0.35 + +# --- Categorizer --- +CATEGORIZER_BATCH_SIZE=10 +CATEGORIZER_SLEEP_IDLE=5 + +# --- Scheduler traduccion --- +SCHEDULER_BATCH=1000 +SCHEDULER_SLEEP=30 + +# --- Lang Detect --- +LANG_DETECT_SLEEP=60 +LANG_DETECT_BATCH=1000 + +# --- Qdrant Worker --- +QDRANT_SLEEP=30 +QDRANT_BATCH=100 diff --git a/deploy/debian/install.sh b/deploy/debian/install.sh new file mode 100755 index 0000000..00da025 --- /dev/null +++ b/deploy/debian/install.sh @@ -0,0 +1,294 @@ +#!/usr/bin/env bash +# ============================================================================= +# RSS2 - Instalacion en Debian (sin Docker) +# Ejecutar como root: bash install.sh +# ============================================================================= +set -euo pipefail + +RSS2_USER="rss2" +RSS2_HOME="/opt/rss2" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)" + +RED='\033[0;31m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m'; NC='\033[0m' +info() { echo -e "${GREEN}[INFO]${NC} $*"; } +warn() { echo -e "${YELLOW}[WARN]${NC} $*"; } +error() { echo -e "${RED}[ERROR]${NC} $*"; exit 1; } + +[[ "$EUID" -ne 0 ]] && error "Ejecutar como root: sudo bash install.sh" + +# ============================================================================= +# 1. DEPENDENCIAS DEL SISTEMA +# ============================================================================= +info "Instalando dependencias del sistema..." +apt-get update -qq +apt-get install -y --no-install-recommends \ + curl wget git build-essential \ + postgresql postgresql-client \ + redis-server \ + nginx \ + python3 python3-pip python3-venv python3-dev \ + nodejs npm \ + ca-certificates tzdata \ + libpq-dev + +# Go (si no esta instalado o version < 1.22) +if ! 
command -v go &>/dev/null || [[ "$(go version | awk '{print $3}' | tr -d 'go')" < "1.22" ]]; then + info "Instalando Go 1.23..." + GO_VERSION="1.23.4" + ARCH=$(dpkg --print-architecture) + case "$ARCH" in + amd64) GO_ARCH="amd64" ;; + arm64) GO_ARCH="arm64" ;; + *) error "Arquitectura no soportada: $ARCH" ;; + esac + curl -fsSL "https://go.dev/dl/go${GO_VERSION}.linux-${GO_ARCH}.tar.gz" -o /tmp/go.tar.gz + rm -rf /usr/local/go + tar -C /usr/local -xzf /tmp/go.tar.gz + echo 'export PATH=$PATH:/usr/local/go/bin' > /etc/profile.d/go.sh + export PATH=$PATH:/usr/local/go/bin + rm /tmp/go.tar.gz +fi +info "Go: $(go version)" + +# Qdrant (binario oficial) +if [[ ! -f "$RSS2_HOME/qdrant/qdrant" ]]; then + info "Descargando Qdrant..." + QDRANT_VERSION="v1.12.1" + ARCH=$(dpkg --print-architecture) + case "$ARCH" in + amd64) QDRANT_ARCH="x86_64-unknown-linux-musl" ;; + arm64) QDRANT_ARCH="aarch64-unknown-linux-musl" ;; + *) error "Arquitectura no soportada para Qdrant: $ARCH" ;; + esac + mkdir -p "$RSS2_HOME/qdrant" + curl -fsSL "https://github.com/qdrant/qdrant/releases/download/${QDRANT_VERSION}/qdrant-${QDRANT_ARCH}.tar.gz" \ + -o /tmp/qdrant.tar.gz + tar -C "$RSS2_HOME/qdrant" -xzf /tmp/qdrant.tar.gz + chmod +x "$RSS2_HOME/qdrant/qdrant" + rm /tmp/qdrant.tar.gz +fi + +# ============================================================================= +# 2. USUARIO Y DIRECTORIOS +# ============================================================================= +info "Creando usuario $RSS2_USER y directorios..." +id "$RSS2_USER" &>/dev/null || useradd -r -m -d "$RSS2_HOME" -s /bin/bash "$RSS2_USER" + +mkdir -p \ + "$RSS2_HOME/bin" \ + "$RSS2_HOME/src" \ + "$RSS2_HOME/data/wiki_images" \ + "$RSS2_HOME/data/qdrant_storage" \ + "$RSS2_HOME/hf_cache" \ + "$RSS2_HOME/models" \ + "$RSS2_HOME/frontend/dist" \ + "$RSS2_HOME/logs" + +# ============================================================================= +# 3. 
CONFIGURACION ENTORNO +# ============================================================================= +if [[ ! -f "$RSS2_HOME/.env" ]]; then + if [[ -f "$SCRIPT_DIR/env.example" ]]; then + cp "$SCRIPT_DIR/env.example" "$RSS2_HOME/.env" + warn "Copia env.example en $RSS2_HOME/.env - EDITA LAS CONTRASENAS antes de continuar" + warn "Presiona Enter cuando hayas editado el .env, o Ctrl+C para salir" + read -r + else + error "No se encontro env.example en $SCRIPT_DIR" + fi +fi + +# ============================================================================= +# 4. POSTGRESQL +# ============================================================================= +info "Configurando PostgreSQL..." +source "$RSS2_HOME/.env" 2>/dev/null || true + +DB_NAME="${POSTGRES_DB:-rss}" +DB_USER="${POSTGRES_USER:-rss}" +DB_PASS="${POSTGRES_PASSWORD:-changeme}" + +systemctl enable --now postgresql + +# Crear usuario y base de datos si no existen +sudo -u postgres psql -tc "SELECT 1 FROM pg_roles WHERE rolname='$DB_USER'" | grep -q 1 || \ + sudo -u postgres psql -c "CREATE USER $DB_USER WITH PASSWORD '$DB_PASS';" + +sudo -u postgres psql -tc "SELECT 1 FROM pg_database WHERE datname='$DB_NAME'" | grep -q 1 || \ + sudo -u postgres createdb -O "$DB_USER" "$DB_NAME" + +# Ejecutar migraciones SQL +if [[ -d "$REPO_ROOT/migrations" ]]; then + info "Ejecutando migraciones..." + for sql_file in "$REPO_ROOT/migrations"/*.sql; do + [[ -f "$sql_file" ]] || continue + info " Aplicando $(basename "$sql_file")..." + sudo -u postgres psql -d "$DB_NAME" -f "$sql_file" 2>/dev/null || warn " (ya aplicada o error ignorado)" + done +fi + +# Ejecutar init-db scripts (schema inicial) +if [[ -d "$REPO_ROOT/init-db" ]]; then + info "Ejecutando scripts de init-db..." + for sql_file in "$REPO_ROOT/init-db"/*.sql; do + [[ -f "$sql_file" ]] || continue + info " $(basename "$sql_file")..." 
+ sudo -u postgres psql -d "$DB_NAME" -f "$sql_file" 2>/dev/null || warn " (ya aplicada o error ignorado)" + done +fi + +# ============================================================================= +# 5. REDIS +# ============================================================================= +info "Configurando Redis..." +REDIS_PASS="${REDIS_PASSWORD:-changeme_redis}" + +# Agregar autenticacion y limites de memoria a redis.conf +REDIS_CONF="/etc/redis/redis.conf" +grep -q "requirepass $REDIS_PASS" "$REDIS_CONF" 2>/dev/null || { + echo "requirepass $REDIS_PASS" >> "$REDIS_CONF" + echo "maxmemory 512mb" >> "$REDIS_CONF" + echo "maxmemory-policy allkeys-lru" >> "$REDIS_CONF" + echo "appendonly yes" >> "$REDIS_CONF" +} +systemctl enable --now redis-server + +# ============================================================================= +# 6. PYTHON VIRTUALENV + DEPENDENCIAS ML +# ============================================================================= +info "Creando virtualenv Python y instalando dependencias..." +python3 -m venv "$RSS2_HOME/venv" +"$RSS2_HOME/venv/bin/pip" install --upgrade pip -q + +if [[ -f "$REPO_ROOT/requirements.txt" ]]; then + "$RSS2_HOME/venv/bin/pip" install -r "$REPO_ROOT/requirements.txt" -q +fi + +# spaCy modelo en español +"$RSS2_HOME/venv/bin/python" -m spacy download es_core_news_lg 2>/dev/null || \ + warn "spaCy model es_core_news_lg no se pudo descargar, hazlo manualmente" + +# Copiar workers Python al directorio de trabajo +info "Copiando workers Python..." +rsync -a --delete "$REPO_ROOT/workers/" "$RSS2_HOME/src/workers/" +cp "$REPO_ROOT/entity_config.json" "$RSS2_HOME/src/" 2>/dev/null || true + +# ============================================================================= +# 7. COMPILAR GO (backend + workers) +# ============================================================================= +info "Compilando binarios Go..." 
+export PATH=$PATH:/usr/local/go/bin +export GOPATH=/tmp/go-build-rss2 + +# Backend API +if [[ -d "$REPO_ROOT/backend" ]]; then + (cd "$REPO_ROOT/backend" && \ + CGO_ENABLED=0 GOOS=linux go build -buildvcs=false -o "$RSS2_HOME/bin/server" ./cmd/server && \ + info " [OK] server") || warn " [FAIL] server" + for cmd in scraper discovery wiki_worker topics related qdrant; do + [[ -d "$REPO_ROOT/backend/cmd/$cmd" ]] || continue + (cd "$REPO_ROOT/backend" && \ + CGO_ENABLED=0 GOOS=linux go build -buildvcs=false -o "$RSS2_HOME/bin/$cmd" "./cmd/$cmd" && \ + info " [OK] $cmd") || warn " [FAIL] $cmd" + done +fi + +# RSS Ingestor Go (repo separado) +if [[ -d "$REPO_ROOT/rss-ingestor-go" ]]; then + (cd "$REPO_ROOT/rss-ingestor-go" && \ + CGO_ENABLED=0 GOOS=linux go build -buildvcs=false -o "$RSS2_HOME/bin/ingestor" . && \ + info " [OK] ingestor") || warn " [FAIL] ingestor" +fi + +# ============================================================================= +# 8. FRONTEND REACT +# ============================================================================= +info "Compilando frontend React..." +if [[ -d "$REPO_ROOT/frontend" ]]; then + (cd "$REPO_ROOT/frontend" && \ + npm install --silent && \ + VITE_API_URL=/api npm run build -- --outDir "$RSS2_HOME/frontend/dist" && \ + info " [OK] frontend compilado") || warn " [FAIL] frontend" +fi + +# ============================================================================= +# 9. NGINX +# ============================================================================= +info "Configurando Nginx..." +cp "$SCRIPT_DIR/nginx.conf" /etc/nginx/nginx.conf +nginx -t && systemctl enable --now nginx && systemctl reload nginx + +# ============================================================================= +# 10. SYSTEMD SERVICES +# ============================================================================= +info "Instalando servicios systemd..." 
+SERVICES=( + rss2-qdrant + rss2-backend + rss2-ingestor + rss2-scraper + rss2-discovery + rss2-wiki + rss2-topics + rss2-related + rss2-qdrant-worker + rss2-langdetect + rss2-translation-scheduler + rss2-translator + rss2-embeddings + rss2-ner + rss2-cluster + rss2-categorizer +) + +for svc in "${SERVICES[@]}"; do + svc_file="$SCRIPT_DIR/systemd/${svc}.service" + if [[ -f "$svc_file" ]]; then + cp "$svc_file" "/etc/systemd/system/${svc}.service" + else + warn "No se encontro $svc_file" + fi +done + +systemctl daemon-reload + +for svc in "${SERVICES[@]}"; do + systemctl enable "$svc" 2>/dev/null || true +done + +# ============================================================================= +# 11. PERMISOS FINALES +# ============================================================================= +info "Ajustando permisos..." +chown -R "$RSS2_USER:$RSS2_USER" "$RSS2_HOME" +chmod 600 "$RSS2_HOME/.env" + +# ============================================================================= +# 12. ARRANCAR SERVICIOS +# ============================================================================= +info "Arrancando servicios..." 
+# Infraestructura primero +systemctl start rss2-qdrant +sleep 3 + +# API y workers Go +for svc in rss2-backend rss2-ingestor rss2-scraper rss2-discovery rss2-wiki rss2-topics rss2-related rss2-qdrant-worker; do + systemctl start "$svc" || warn "No se pudo arrancar $svc" +done + +# Workers Python (modelos pesados, arrancan despues) +for svc in rss2-langdetect rss2-translation-scheduler rss2-translator rss2-embeddings rss2-ner rss2-cluster rss2-categorizer; do + systemctl start "$svc" || warn "No se pudo arrancar $svc" +done + +# ============================================================================= +echo "" +info "=============================================" +info " RSS2 instalado en $RSS2_HOME" +info " Acceder en: http://$(hostname -I | awk '{print $1}'):8001" +info "" +info " Ver logs: journalctl -u rss2-backend -f" +info " Ver estado: systemctl status rss2-backend" +info " Editar env: nano $RSS2_HOME/.env" +info "=============================================" diff --git a/deploy/debian/nginx.conf b/deploy/debian/nginx.conf new file mode 100644 index 0000000..5407ce8 --- /dev/null +++ b/deploy/debian/nginx.conf @@ -0,0 +1,91 @@ +user www-data; +worker_processes auto; +error_log /var/log/nginx/error.log warn; +pid /run/nginx.pid; + +events { + worker_connections 2048; + use epoll; +} + +http { + include /etc/nginx/mime.types; + default_type application/octet-stream; + + log_format main '$remote_addr - $remote_user [$time_local] "$request" ' + '$status $body_bytes_sent "$http_referer" ' + '"$http_user_agent" "$http_x_forwarded_for"'; + + access_log /var/log/nginx/access.log main; + + sendfile on; + tcp_nopush on; + tcp_nodelay on; + keepalive_timeout 65; + types_hash_max_size 2048; + client_max_body_size 100M; + + gzip on; + gzip_vary on; + gzip_proxied any; + gzip_comp_level 6; + gzip_types text/plain text/css text/javascript + application/json application/javascript + application/xml text/xml; + + # Go API backend (proceso nativo en localhost) + upstream 
api_backend { + server 127.0.0.1:8080; + keepalive 32; + } + + server { + listen 8001; + server_name _; + + client_body_timeout 60s; + client_header_timeout 60s; + send_timeout 300s; + + # Frontend React (archivos estaticos compilados) + root /opt/rss2/frontend/dist; + index index.html; + + location / { + try_files $uri $uri/ /index.html; + } + + # Imagenes Wikipedia servidas directamente + location /wiki-images/ { + alias /opt/rss2/data/wiki_images/; + expires 7d; + add_header Cache-Control "public, immutable"; + } + + # Proxy al API Go + location /api/ { + proxy_pass http://api_backend/api/; + proxy_http_version 1.1; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_set_header Connection ""; + + proxy_connect_timeout 60s; + proxy_send_timeout 300s; + proxy_read_timeout 300s; + } + + location /health { + access_log off; + return 200 "ok"; + } + + location ~ /\. 
{ + deny all; + access_log off; + log_not_found off; + } + } +} diff --git a/deploy/debian/systemd/rss2-backend.service b/deploy/debian/systemd/rss2-backend.service new file mode 100644 index 0000000..f50239f --- /dev/null +++ b/deploy/debian/systemd/rss2-backend.service @@ -0,0 +1,24 @@ +[Unit] +Description=RSS2 Backend API (Go) +After=network.target postgresql.service redis.service +Requires=postgresql.service redis.service + +[Service] +Type=simple +User=rss2 +Group=rss2 +WorkingDirectory=/opt/rss2 +EnvironmentFile=/opt/rss2/.env +ExecStart=/opt/rss2/bin/server +Restart=always +RestartSec=5 +StandardOutput=journal +StandardError=journal +SyslogIdentifier=rss2-backend + +# Limites de recursos +LimitNOFILE=65536 +MemoryMax=1G + +[Install] +WantedBy=multi-user.target diff --git a/deploy/debian/systemd/rss2-categorizer.service b/deploy/debian/systemd/rss2-categorizer.service new file mode 100644 index 0000000..61ecf9a --- /dev/null +++ b/deploy/debian/systemd/rss2-categorizer.service @@ -0,0 +1,25 @@ +[Unit] +Description=RSS2 Categorizer Worker (Python) +After=network.target postgresql.service +Requires=postgresql.service + +[Service] +Type=simple +User=rss2 +Group=rss2 +WorkingDirectory=/opt/rss2/src +EnvironmentFile=/opt/rss2/.env +Environment=CATEGORIZER_BATCH_SIZE=10 +Environment=CATEGORIZER_SLEEP_IDLE=5 +ExecStart=/opt/rss2/venv/bin/python -m workers.simple_categorizer_worker +Restart=always +RestartSec=10 +StandardOutput=journal +StandardError=journal +SyslogIdentifier=rss2-categorizer + +MemoryMax=1G +CPUQuota=200% + +[Install] +WantedBy=multi-user.target diff --git a/deploy/debian/systemd/rss2-cluster.service b/deploy/debian/systemd/rss2-cluster.service new file mode 100644 index 0000000..dd990fb --- /dev/null +++ b/deploy/debian/systemd/rss2-cluster.service @@ -0,0 +1,25 @@ +[Unit] +Description=RSS2 Cluster Worker - Agrupacion de noticias (Python) +After=network.target postgresql.service +Requires=postgresql.service + +[Service] +Type=simple +User=rss2 
+Group=rss2 +WorkingDirectory=/opt/rss2/src +EnvironmentFile=/opt/rss2/.env +Environment=EVENT_DIST_THRESHOLD=0.35 +Environment=EMB_MODEL=sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2 +ExecStart=/opt/rss2/venv/bin/python -m workers.cluster_worker +Restart=always +RestartSec=10 +StandardOutput=journal +StandardError=journal +SyslogIdentifier=rss2-cluster + +MemoryMax=2G +CPUQuota=200% + +[Install] +WantedBy=multi-user.target diff --git a/deploy/debian/systemd/rss2-discovery.service b/deploy/debian/systemd/rss2-discovery.service new file mode 100644 index 0000000..c9a435b --- /dev/null +++ b/deploy/debian/systemd/rss2-discovery.service @@ -0,0 +1,26 @@ +[Unit] +Description=RSS2 Discovery de Feeds (Go) +After=network.target postgresql.service +Requires=postgresql.service + +[Service] +Type=simple +User=rss2 +Group=rss2 +WorkingDirectory=/opt/rss2 +EnvironmentFile=/opt/rss2/.env +Environment=DISCOVERY_INTERVAL=900 +Environment=DISCOVERY_BATCH=10 +Environment=MAX_FEEDS_PER_URL=5 +ExecStart=/opt/rss2/bin/discovery +Restart=always +RestartSec=10 +StandardOutput=journal +StandardError=journal +SyslogIdentifier=rss2-discovery + +MemoryMax=512M +CPUQuota=100% + +[Install] +WantedBy=multi-user.target diff --git a/deploy/debian/systemd/rss2-embeddings.service b/deploy/debian/systemd/rss2-embeddings.service new file mode 100644 index 0000000..efb0dbd --- /dev/null +++ b/deploy/debian/systemd/rss2-embeddings.service @@ -0,0 +1,30 @@ +[Unit] +Description=RSS2 Embeddings Worker (Python) +After=network.target postgresql.service +Requires=postgresql.service + +[Service] +Type=simple +User=rss2 +Group=rss2 +WorkingDirectory=/opt/rss2/src +EnvironmentFile=/opt/rss2/.env +Environment=EMB_MODEL=sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2 +Environment=EMB_BATCH=64 +Environment=EMB_SLEEP_IDLE=5 +Environment=EMB_LANGS=es +Environment=EMB_LIMIT=1000 +Environment=DEVICE=cpu +Environment=HF_HOME=/opt/rss2/hf_cache +ExecStart=/opt/rss2/venv/bin/python -m 
workers.embeddings_worker +Restart=always +RestartSec=10 +StandardOutput=journal +StandardError=journal +SyslogIdentifier=rss2-embeddings + +MemoryMax=3G +CPUQuota=200% + +[Install] +WantedBy=multi-user.target diff --git a/deploy/debian/systemd/rss2-ingestor.service b/deploy/debian/systemd/rss2-ingestor.service new file mode 100644 index 0000000..8a29a74 --- /dev/null +++ b/deploy/debian/systemd/rss2-ingestor.service @@ -0,0 +1,26 @@ +[Unit] +Description=RSS2 Ingestor RSS (Go) +After=network.target postgresql.service +Requires=postgresql.service + +[Service] +Type=simple +User=rss2 +Group=rss2 +WorkingDirectory=/opt/rss2 +EnvironmentFile=/opt/rss2/.env +Environment=RSS_MAX_WORKERS=100 +Environment=RSS_POKE_INTERVAL_MIN=15 +ExecStart=/opt/rss2/bin/ingestor +Restart=always +RestartSec=10 +StandardOutput=journal +StandardError=journal +SyslogIdentifier=rss2-ingestor + +LimitNOFILE=65536 +MemoryMax=2G +CPUQuota=200% + +[Install] +WantedBy=multi-user.target diff --git a/deploy/debian/systemd/rss2-langdetect.service b/deploy/debian/systemd/rss2-langdetect.service new file mode 100644 index 0000000..18c2732 --- /dev/null +++ b/deploy/debian/systemd/rss2-langdetect.service @@ -0,0 +1,25 @@ +[Unit] +Description=RSS2 Language Detection Worker (Python) +After=network.target postgresql.service +Requires=postgresql.service + +[Service] +Type=simple +User=rss2 +Group=rss2 +WorkingDirectory=/opt/rss2/src +EnvironmentFile=/opt/rss2/.env +Environment=LANG_DETECT_SLEEP=60 +Environment=LANG_DETECT_BATCH=1000 +ExecStart=/opt/rss2/venv/bin/python -m workers.langdetect_worker +Restart=always +RestartSec=10 +StandardOutput=journal +StandardError=journal +SyslogIdentifier=rss2-langdetect + +MemoryMax=512M +CPUQuota=50% + +[Install] +WantedBy=multi-user.target diff --git a/deploy/debian/systemd/rss2-ner.service b/deploy/debian/systemd/rss2-ner.service new file mode 100644 index 0000000..6f43c78 --- /dev/null +++ b/deploy/debian/systemd/rss2-ner.service @@ -0,0 +1,26 @@ +[Unit] 
+Description=RSS2 NER Worker - Extraccion de Entidades (Python) +After=network.target postgresql.service +Requires=postgresql.service + +[Service] +Type=simple +User=rss2 +Group=rss2 +WorkingDirectory=/opt/rss2/src +EnvironmentFile=/opt/rss2/.env +Environment=NER_LANG=es +Environment=NER_BATCH=64 +Environment=HF_HOME=/opt/rss2/hf_cache +ExecStart=/opt/rss2/venv/bin/python -m workers.ner_worker +Restart=always +RestartSec=10 +StandardOutput=journal +StandardError=journal +SyslogIdentifier=rss2-ner + +MemoryMax=2G +CPUQuota=200% + +[Install] +WantedBy=multi-user.target diff --git a/deploy/debian/systemd/rss2-qdrant-worker.service b/deploy/debian/systemd/rss2-qdrant-worker.service new file mode 100644 index 0000000..6334fd1 --- /dev/null +++ b/deploy/debian/systemd/rss2-qdrant-worker.service @@ -0,0 +1,28 @@ +[Unit] +Description=RSS2 Qdrant Sync Worker (Go) +After=network.target postgresql.service rss2-qdrant.service +Requires=postgresql.service rss2-qdrant.service + +[Service] +Type=simple +User=rss2 +Group=rss2 +WorkingDirectory=/opt/rss2 +EnvironmentFile=/opt/rss2/.env +Environment=QDRANT_HOST=127.0.0.1 +Environment=QDRANT_PORT=6333 +Environment=QDRANT_COLLECTION=news_vectors +Environment=QDRANT_SLEEP=30 +Environment=QDRANT_BATCH=100 +ExecStart=/opt/rss2/bin/qdrant_worker +Restart=always +RestartSec=10 +StandardOutput=journal +StandardError=journal +SyslogIdentifier=rss2-qdrant-worker + +MemoryMax=1G +CPUQuota=100% + +[Install] +WantedBy=multi-user.target diff --git a/deploy/debian/systemd/rss2-qdrant.service b/deploy/debian/systemd/rss2-qdrant.service new file mode 100644 index 0000000..59b8651 --- /dev/null +++ b/deploy/debian/systemd/rss2-qdrant.service @@ -0,0 +1,25 @@ +[Unit] +Description=Qdrant Vector Database +After=network.target + +[Service] +Type=simple +User=rss2 +Group=rss2 +WorkingDirectory=/opt/rss2/qdrant +ExecStart=/opt/rss2/qdrant/qdrant +Restart=always +RestartSec=5 +StandardOutput=journal +StandardError=journal +SyslogIdentifier=rss2-qdrant + 
+Environment=QDRANT__SERVICE__HTTP_PORT=6333 +Environment=QDRANT__SERVICE__GRPC_PORT=6334 +Environment=QDRANT__STORAGE__STORAGE_PATH=/opt/rss2/data/qdrant_storage + +MemoryMax=4G +CPUQuota=400% + +[Install] +WantedBy=multi-user.target diff --git a/deploy/debian/systemd/rss2-related.service b/deploy/debian/systemd/rss2-related.service new file mode 100644 index 0000000..cc671f9 --- /dev/null +++ b/deploy/debian/systemd/rss2-related.service @@ -0,0 +1,26 @@ +[Unit] +Description=RSS2 Related News Worker (Go) +After=network.target postgresql.service +Requires=postgresql.service + +[Service] +Type=simple +User=rss2 +Group=rss2 +WorkingDirectory=/opt/rss2 +EnvironmentFile=/opt/rss2/.env +Environment=RELATED_SLEEP=10 +Environment=RELATED_BATCH=200 +Environment=RELATED_TOPK=10 +ExecStart=/opt/rss2/bin/related +Restart=always +RestartSec=10 +StandardOutput=journal +StandardError=journal +SyslogIdentifier=rss2-related + +MemoryMax=1G +CPUQuota=100% + +[Install] +WantedBy=multi-user.target diff --git a/deploy/debian/systemd/rss2-scraper.service b/deploy/debian/systemd/rss2-scraper.service new file mode 100644 index 0000000..83929c4 --- /dev/null +++ b/deploy/debian/systemd/rss2-scraper.service @@ -0,0 +1,25 @@ +[Unit] +Description=RSS2 Scraper HTML (Go) +After=network.target postgresql.service +Requires=postgresql.service + +[Service] +Type=simple +User=rss2 +Group=rss2 +WorkingDirectory=/opt/rss2 +EnvironmentFile=/opt/rss2/.env +Environment=SCRAPER_SLEEP=60 +Environment=SCRAPER_BATCH=10 +ExecStart=/opt/rss2/bin/scraper +Restart=always +RestartSec=10 +StandardOutput=journal +StandardError=journal +SyslogIdentifier=rss2-scraper + +MemoryMax=512M +CPUQuota=100% + +[Install] +WantedBy=multi-user.target diff --git a/deploy/debian/systemd/rss2-topics.service b/deploy/debian/systemd/rss2-topics.service new file mode 100644 index 0000000..f9ab9b5 --- /dev/null +++ b/deploy/debian/systemd/rss2-topics.service @@ -0,0 +1,25 @@ +[Unit] +Description=RSS2 Topics Worker (Go) 
+After=network.target postgresql.service +Requires=postgresql.service + +[Service] +Type=simple +User=rss2 +Group=rss2 +WorkingDirectory=/opt/rss2 +EnvironmentFile=/opt/rss2/.env +Environment=TOPICS_SLEEP=10 +Environment=TOPICS_BATCH=500 +ExecStart=/opt/rss2/bin/topics +Restart=always +RestartSec=10 +StandardOutput=journal +StandardError=journal +SyslogIdentifier=rss2-topics + +MemoryMax=512M +CPUQuota=100% + +[Install] +WantedBy=multi-user.target diff --git a/deploy/debian/systemd/rss2-translation-scheduler.service b/deploy/debian/systemd/rss2-translation-scheduler.service new file mode 100644 index 0000000..a46f3ad --- /dev/null +++ b/deploy/debian/systemd/rss2-translation-scheduler.service @@ -0,0 +1,26 @@ +[Unit] +Description=RSS2 Translation Scheduler (Python) +After=network.target postgresql.service +Requires=postgresql.service + +[Service] +Type=simple +User=rss2 +Group=rss2 +WorkingDirectory=/opt/rss2/src +EnvironmentFile=/opt/rss2/.env +Environment=TARGET_LANGS=es +Environment=SCHEDULER_BATCH=1000 +Environment=SCHEDULER_SLEEP=30 +ExecStart=/opt/rss2/venv/bin/python -m workers.translation_scheduler +Restart=always +RestartSec=10 +StandardOutput=journal +StandardError=journal +SyslogIdentifier=rss2-translation-scheduler + +MemoryMax=256M +CPUQuota=50% + +[Install] +WantedBy=multi-user.target diff --git a/deploy/debian/systemd/rss2-translator.service b/deploy/debian/systemd/rss2-translator.service new file mode 100644 index 0000000..90528e8 --- /dev/null +++ b/deploy/debian/systemd/rss2-translator.service @@ -0,0 +1,31 @@ +[Unit] +Description=RSS2 Translator Worker NLLB-200 (Python) +After=network.target postgresql.service rss2-translation-scheduler.service +Requires=postgresql.service + +[Service] +Type=simple +User=rss2 +Group=rss2 +WorkingDirectory=/opt/rss2/src +EnvironmentFile=/opt/rss2/.env +Environment=TARGET_LANGS=es +Environment=TRANSLATOR_BATCH=32 +Environment=CT2_MODEL_PATH=/opt/rss2/models/nllb-ct2 +Environment=CT2_DEVICE=cpu 
+Environment=CT2_COMPUTE_TYPE=int8 +Environment=UNIVERSAL_MODEL=facebook/nllb-200-distilled-600M +Environment=HF_HOME=/opt/rss2/hf_cache +ExecStart=/opt/rss2/venv/bin/python -m workers.ctranslator_worker +Restart=always +RestartSec=15 +StandardOutput=journal +StandardError=journal +SyslogIdentifier=rss2-translator + +# El modelo NLLB-200 consume bastante RAM en CPU +MemoryMax=4G +CPUQuota=200% + +[Install] +WantedBy=multi-user.target diff --git a/deploy/debian/systemd/rss2-wiki.service b/deploy/debian/systemd/rss2-wiki.service new file mode 100644 index 0000000..01891dd --- /dev/null +++ b/deploy/debian/systemd/rss2-wiki.service @@ -0,0 +1,24 @@ +[Unit] +Description=RSS2 Wiki Worker - imagenes Wikipedia (Go) +After=network.target postgresql.service +Requires=postgresql.service + +[Service] +Type=simple +User=rss2 +Group=rss2 +WorkingDirectory=/opt/rss2 +EnvironmentFile=/opt/rss2/.env +Environment=WIKI_SLEEP=10 +ExecStart=/opt/rss2/bin/wiki_worker +Restart=always +RestartSec=10 +StandardOutput=journal +StandardError=journal +SyslogIdentifier=rss2-wiki + +MemoryMax=256M +CPUQuota=50% + +[Install] +WantedBy=multi-user.target diff --git a/docker-compose.yml b/docker-compose.yml deleted file mode 100644 index b126c81..0000000 --- a/docker-compose.yml +++ /dev/null @@ -1,748 +0,0 @@ -services: - db: - image: postgres:18-alpine - container_name: rss2_db - shm_size: 4gb - environment: - POSTGRES_DB: ${POSTGRES_DB:-rss} - POSTGRES_USER: ${POSTGRES_USER:-rss} - POSTGRES_PASSWORD: ${POSTGRES_PASSWORD} - POSTGRES_INITDB_ARGS: "--encoding=UTF8 --locale=C.UTF-8" - LANG: C.UTF-8 - LC_ALL: C.UTF-8 - TZ: Europe/Madrid - PGDATA: /var/lib/postgresql/data/18/main - volumes: - - ./data/pgdata:/var/lib/postgresql/data - - ./init-db:/docker-entrypoint-initdb.d:rw - - ./docker-entrypoint-db.sh:/docker-entrypoint-db.sh:ro - entrypoint: ["bash", "/docker-entrypoint-db.sh"] - networks: - backend: - aliases: - - db - - rss2_db - restart: unless-stopped - healthcheck: - test: [ "CMD-SHELL", 
"pg_isready -h 127.0.0.1 -p 5432 -U $$POSTGRES_USER -d $$POSTGRES_DB || exit 1" ] - interval: 5s - timeout: 5s - retries: 30 - start_period: 20s - deploy: - resources: - limits: - memory: 8G - reservations: - memory: 4G - - redis: - image: redis:7-alpine - container_name: rss2_redis - environment: - TZ: Europe/Madrid - # SEGURIDAD: Redis con autenticación - command: > - redis-server --appendonly yes --maxmemory 512mb --maxmemory-policy allkeys-lru --requirepass ${REDIS_PASSWORD} - volumes: - - ./data/redis-data:/data - - /etc/timezone:/etc/timezone:ro - - /etc/localtime:/etc/localtime:ro - networks: - backend: - aliases: - - redis - - rss2_redis - restart: unless-stopped - healthcheck: - test: [ "CMD", "redis-cli", "--no-auth-warning", "-a", "${REDIS_PASSWORD}", "ping" ] - interval: 5s - timeout: 3s - retries: 5 - deploy: - resources: - limits: - memory: 768M - reservations: - memory: 512M - - rss-ingestor-go: - build: - context: ./rss-ingestor-go - dockerfile: Dockerfile - container_name: rss2_ingestor_go - environment: - DB_HOST: db - DB_PORT: 5432 - DB_NAME: ${DB_NAME:-rss} - DB_USER: ${DB_USER:-rss} - DB_PASS: ${DB_PASS} - RSS_MAX_WORKERS: 100 - RSS_POKE_INTERVAL_MIN: 15 - TZ: Europe/Madrid - networks: - - backend - depends_on: - db: - condition: service_healthy - restart: unless-stopped - deploy: - resources: - limits: - cpus: '2' - memory: 2G - reservations: - memory: 512M - - langdetect: - build: - context: . 
- dockerfile: Dockerfile - container_name: rss2_langdetect_py - command: bash -lc "python -m workers.langdetect_worker" - environment: - DB_HOST: db - DB_PORT: 5432 - DB_NAME: ${DB_NAME:-rss} - DB_USER: ${DB_USER:-rss} - DB_PASS: ${DB_PASS} - LANG_DETECT_SLEEP: 60 - LANG_DETECT_BATCH: 1000 - TZ: Europe/Madrid - volumes: - - ./workers:/app/workers - networks: - - backend - depends_on: - db: - condition: service_healthy - restart: unless-stopped - deploy: - resources: - limits: - cpus: '0.5' - memory: 512M - - # ================================================================================== - # SCRAPER WORKER (Go) - Extrae artículos de URLs - # ================================================================================== - scraper: - build: - context: . - dockerfile: Dockerfile.scraper - container_name: rss2_scraper - environment: - DB_HOST: db - DB_PORT: 5432 - DB_NAME: ${DB_NAME:-rss} - DB_USER: ${DB_USER:-rss} - DB_PASS: ${DB_PASS} - SCRAPER_SLEEP: 60 - SCRAPER_BATCH: 10 - TZ: Europe/Madrid - networks: - - backend - depends_on: - db: - condition: service_healthy - restart: unless-stopped - deploy: - resources: - limits: - cpus: '1' - memory: 512M - - # ================================================================================== - # DISCOVERY WORKER (Go) - Descubre RSS feeds - # ================================================================================== - discovery: - build: - context: . 
- dockerfile: Dockerfile.discovery - container_name: rss2_discovery - environment: - DB_HOST: db - DB_PORT: 5432 - DB_NAME: ${DB_NAME:-rss} - DB_USER: ${DB_USER:-rss} - DB_PASS: ${DB_PASS} - DISCOVERY_INTERVAL: 900 - DISCOVERY_BATCH: 10 - MAX_FEEDS_PER_URL: 5 - TZ: Europe/Madrid - networks: - - backend - depends_on: - db: - condition: service_healthy - restart: unless-stopped - deploy: - resources: - limits: - cpus: '1' - memory: 512M - - # ================================================================================== - # WIKI WORKER (Go) - Wikipedia info and thumbnails - # ================================================================================== - wiki-worker: - build: - context: . - dockerfile: Dockerfile.wiki - container_name: rss2_wiki_worker - environment: - DB_HOST: db - DB_PORT: 5432 - DB_NAME: ${DB_NAME:-rss} - DB_USER: ${DB_USER:-rss} - DB_PASS: ${DB_PASS} - WIKI_SLEEP: 10 - TZ: Europe/Madrid - volumes: - - ./data/wiki_images:/app/data/wiki_images - networks: - - backend - depends_on: - db: - condition: service_healthy - restart: unless-stopped - deploy: - resources: - limits: - cpus: '0.5' - memory: 256M - - # ================================================================================== - # BACKEND GO (API REST) - # ================================================================================== - backend-go: - build: - context: ./backend - dockerfile: Dockerfile - container_name: rss2_backend_go - environment: - TZ: Europe/Madrid - DATABASE_URL: postgres://${POSTGRES_USER:-rss}:${POSTGRES_PASSWORD}@db:5432/${POSTGRES_DB:-rss}?sslmode=disable - REDIS_URL: redis://:${REDIS_PASSWORD:-rss_redis_pass_2024}@redis:6379 - SECRET_KEY: ${SECRET_KEY:-change_this_to_a_long_random_string} - SERVER_PORT: "8080" - volumes: - - ./data/wiki_images:/app/data/wiki_images - networks: - - backend - - frontend - depends_on: - db: - condition: service_healthy - redis: - condition: service_healthy - restart: unless-stopped - - # 
================================================================================== - # FRONTEND REACT - # ================================================================================== - rss2_frontend: - build: - context: ./frontend - dockerfile: Dockerfile - container_name: rss2_frontend - environment: - TZ: Europe/Madrid - VITE_API_URL: /api - networks: - - frontend - depends_on: - - backend-go - restart: unless-stopped - - # ================================================================================== - # NGINX (Puerto 8001 - sirve React + proxy API) - # ================================================================================== - nginx: - image: nginx:alpine - container_name: rss2_nginx - ports: - - "8001:80" - volumes: - - ./nginx.conf:/etc/nginx/nginx.conf:ro - networks: - - frontend - depends_on: - - rss2_frontend - - backend-go - restart: unless-stopped - - # ================================================================================== - # TRANSLATOR CPU (CTranslate2) - Scale with: docker compose up -d --scale translator=3 - # ================================================================================== - translator: - build: - context: . 
- dockerfile: Dockerfile.translator - image: rss2-translator:latest - command: bash -lc "python -m workers.ctranslator_worker" - security_opt: - - seccomp=unconfined - environment: - DB_HOST: db - DB_PORT: 5432 - DB_NAME: ${DB_NAME:-rss} - DB_USER: ${DB_USER:-rss} - DB_PASS: ${DB_PASS} - TARGET_LANGS: es - TRANSLATOR_BATCH: 32 - CT2_MODEL_PATH: /app/models/nllb-ct2 - CT2_DEVICE: cpu - CT2_COMPUTE_TYPE: int8 - UNIVERSAL_MODEL: facebook/nllb-200-distilled-600M - HF_HOME: /app/hf_cache - TZ: Europe/Madrid - TRANSLATOR_ID: ${TRANSLATOR_ID:-} - volumes: - - ./workers:/app/workers - - ./hf_cache:/app/hf_cache - - ./models:/app/models - networks: - - backend - profiles: - - cpu-only - depends_on: - db: - condition: service_healthy - restart: unless-stopped - - # ================================================================================== - # TRANSLATION SCHEDULER - Creates translation jobs - # ================================================================================== - translation-scheduler: - build: - context: . - dockerfile: Dockerfile.scheduler - image: rss2-scheduler:latest - container_name: rss2_translation_scheduler - environment: - DB_HOST: db - DB_PORT: 5432 - DB_NAME: ${DB_NAME:-rss} - DB_USER: ${DB_USER:-rss} - DB_PASS: ${DB_PASS} - TARGET_LANGS: es - SCHEDULER_BATCH: 1000 - SCHEDULER_SLEEP: 30 - TZ: Europe/Madrid - volumes: - - ./workers:/app/workers - networks: - - backend - deploy: - resources: - limits: - cpus: '0.5' - memory: 256M - depends_on: - db: - condition: service_healthy - restart: unless-stopped - - # ================================================================================== - # TRANSLATOR GPU (CTranslate2 with CUDA) - # ================================================================================== - translator-gpu: - build: - context: . 
- dockerfile: Dockerfile.translator-gpu - image: rss2-translator-gpu:latest - container_name: rss2_translator_gpu - command: bash -lc "python -m workers.ctranslator_worker" - security_opt: - - seccomp=unconfined - environment: - DB_HOST: db - DB_PORT: 5432 - DB_NAME: ${DB_NAME:-rss} - DB_USER: ${DB_USER:-rss} - DB_PASS: ${DB_PASS} - TARGET_LANGS: es - TRANSLATOR_BATCH: 64 - CT2_MODEL_PATH: /app/models/nllb-ct2 - CT2_DEVICE: cuda - CT2_COMPUTE_TYPE: float16 - UNIVERSAL_MODEL: facebook/nllb-200-distilled-600M - HF_HOME: /app/hf_cache - TZ: Europe/Madrid - volumes: - - ./workers:/app/workers - - ./hf_cache:/app/hf_cache - - ./models:/app/models - networks: - - backend - deploy: - resources: - limits: - memory: 4G - reservations: - devices: - - driver: nvidia - count: 1 - capabilities: [ gpu ] - depends_on: - db: - condition: service_healthy - restart: unless-stopped - - embeddings: - build: - context: . - dockerfile: Dockerfile - container_name: rss2_embeddings_py - command: bash -lc "python -m workers.embeddings_worker" - environment: - DB_HOST: db - DB_PORT: 5432 - DB_NAME: ${DB_NAME:-rss} - DB_USER: ${DB_USER:-rss} - DB_PASS: ${DB_PASS} - EMB_MODEL: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2 - EMB_BATCH: 64 - EMB_SLEEP_IDLE: 5 - EMB_LANGS: es - EMB_LIMIT: 1000 - DEVICE: cuda - HF_HOME: /app/hf_cache - TZ: Europe/Madrid - volumes: - - ./workers:/app/workers - - ./hf_cache:/app/hf_cache - networks: - - backend - deploy: - resources: - limits: - memory: 6G - reservations: - devices: - - driver: nvidia - count: 1 - capabilities: [ gpu ] - depends_on: - db: - condition: service_healthy - restart: unless-stopped - - # ================================================================================== - # TOPICS WORKER (Go) - Matching temas y países - # ================================================================================== - topics: - build: - context: . 
- dockerfile: Dockerfile.topics - container_name: rss2_topics - environment: - DB_HOST: db - DB_PORT: 5432 - DB_NAME: ${DB_NAME:-rss} - DB_USER: ${DB_USER:-rss} - DB_PASS: ${DB_PASS} - TOPICS_SLEEP: 10 - TOPICS_BATCH: 500 - TZ: Europe/Madrid - networks: - - backend - depends_on: - db: - condition: service_healthy - restart: unless-stopped - deploy: - resources: - limits: - cpus: '1' - memory: 512M - - # ================================================================================== - # RELATED WORKER (Go) - Noticias relacionadas - # ================================================================================== - related: - build: - context: . - dockerfile: Dockerfile.related - container_name: rss2_related - environment: - DB_HOST: db - DB_PORT: 5432 - DB_NAME: ${DB_NAME:-rss} - DB_USER: ${DB_USER:-rss} - DB_PASS: ${DB_PASS} - RELATED_SLEEP: 10 - RELATED_BATCH: 200 - RELATED_TOPK: 10 - EMB_MODEL: mxbai-embed-large - TZ: Europe/Madrid - networks: - - backend - depends_on: - db: - condition: service_healthy - restart: unless-stopped - deploy: - resources: - limits: - cpus: '1' - memory: 1G - - qdrant: - image: qdrant/qdrant:latest - container_name: rss2_qdrant - environment: - TZ: Europe/Madrid - QDRANT__SERVICE__GRPC_PORT: 6334 - volumes: - - ./data/qdrant_storage:/qdrant/storage - - /etc/timezone:/etc/timezone:ro - - /etc/localtime:/etc/localtime:ro - networks: - - backend - restart: unless-stopped - deploy: - resources: - limits: - cpus: '4' - memory: 4G - reservations: - memory: 2G - - # ================================================================================== - # QDRANT WORKER (Go) - Vectorización y búsqueda semántica - # ================================================================================== - qdrant-worker: - build: - context: . 
- dockerfile: Dockerfile.qdrant - container_name: rss2_qdrant_worker - environment: - DB_HOST: db - DB_PORT: 5432 - DB_NAME: ${DB_NAME:-rss} - DB_USER: ${DB_USER:-rss} - DB_PASS: ${DB_PASS} - QDRANT_HOST: qdrant - QDRANT_PORT: 6333 - QDRANT_COLLECTION: news_vectors - OLLAMA_URL: http://ollama:11434 - QDRANT_SLEEP: 30 - QDRANT_BATCH: 100 - TZ: Europe/Madrid - networks: - - backend - depends_on: - db: - condition: service_healthy - qdrant: - condition: service_started - restart: unless-stopped - deploy: - resources: - limits: - cpus: '1' - memory: 1G - - # ================================================================================== - # NER WORKER (Python) - Extracción de entidades - # ================================================================================== - ner: - build: - context: . - dockerfile: Dockerfile - container_name: rss2_ner - command: bash -lc "python -m workers.ner_worker" - environment: - DB_HOST: db - DB_PORT: 5432 - DB_NAME: ${DB_NAME:-rss} - DB_USER: ${DB_USER:-rss} - DB_PASS: ${DB_PASS} - NER_LANG: es - NER_BATCH: 64 - HF_HOME: /app/hf_cache - TZ: Europe/Madrid - volumes: - - ./workers:/app/workers - - ./hf_cache:/app/hf_cache - networks: - - backend - depends_on: - db: - condition: service_healthy - restart: unless-stopped - deploy: - resources: - limits: - cpus: '2' - memory: 2G - - # ================================================================================== - # CLUSTER WORKER (Python) - Agrupación de noticias - # ================================================================================== - cluster: - build: - context: . 
- dockerfile: Dockerfile - container_name: rss2_cluster_py - command: bash -lc "python -m workers.cluster_worker" - environment: - DB_HOST: db - DB_PORT: 5432 - DB_NAME: ${DB_NAME:-rss} - DB_USER: ${DB_USER:-rss} - DB_PASS: ${DB_PASS} - EVENT_DIST_THRESHOLD: 0.35 - EMB_MODEL: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2 - TZ: Europe/Madrid - volumes: - - ./workers:/app/workers - networks: - - backend - depends_on: - db: - condition: service_healthy - restart: unless-stopped - deploy: - resources: - limits: - cpus: '2' - memory: 2G - - # ================================================================================== - # LLM CATEGORIZER (Python) - Categorización con Ollama - # ================================================================================== - llm-categorizer: - build: - context: . - dockerfile: Dockerfile - container_name: rss2_llm_categorizer - command: bash -lc "python -m workers.simple_categorizer_worker" - environment: - DB_HOST: db - DB_PORT: 5432 - DB_NAME: ${DB_NAME:-rss} - DB_USER: ${DB_USER:-rss} - DB_PASS: ${DB_PASS} - CATEGORIZER_BATCH_SIZE: 10 - CATEGORIZER_SLEEP_IDLE: 5 - TZ: Europe/Madrid - volumes: - - ./workers:/app/workers - networks: - - backend - depends_on: - db: - condition: service_healthy - restart: unless-stopped - deploy: - resources: - limits: - cpus: '2' - memory: 1G - - # ================================================================================== - # MONITORING STACK - SECURED - # ================================================================================== - - prometheus: - image: prom/prometheus:latest - container_name: rss2_prometheus - volumes: - - ./monitoring/prometheus.yml:/etc/prometheus/prometheus.yml:ro - - prometheus_data:/prometheus - command: - - '--config.file=/etc/prometheus/prometheus.yml' - - '--storage.tsdb.path=/prometheus' - - '--web.console.libraries=/usr/share/prometheus/console_libraries' - - '--web.console.templates=/usr/share/prometheus/consoles' - # SEGURIDAD: Sin 
exposición de puertos - acceso solo vía Grafana o túnel SSH - # ports: - # - "9090:9090" - networks: - - monitoring - restart: unless-stopped - deploy: - resources: - limits: - cpus: '1' - memory: 2G - - grafana: - image: grafana/grafana:latest - container_name: rss2_grafana - # SEGURIDAD: Acceso solo en localhost o vía túnel SSH - # Para acceso remoto, usar túnel SSH: ssh -L 3001:localhost:3001 user@server - ports: - - "127.0.0.1:3001:3000" - environment: - # SEGURIDAD: Cambiar este password en producción - - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_PASSWORD:-change_this_password} - - GF_USERS_ALLOW_SIGN_UP=false - - GF_SERVER_ROOT_URL=http://localhost:3001 - - GF_SECURITY_COOKIE_SECURE=false - - GF_SECURITY_COOKIE_SAMESITE=lax - volumes: - - grafana_data:/var/lib/grafana - networks: - - monitoring - depends_on: - - prometheus - restart: unless-stopped - deploy: - resources: - limits: - cpus: '1' - memory: 1G - - cadvisor: - image: gcr.io/cadvisor/cadvisor:latest - container_name: rss2_cadvisor - # SEGURIDAD: Sin exposición de puertos - solo acceso interno - # ports: - # - "8081:8080" - volumes: - - /:/rootfs:ro - - /var/run:/var/run:ro - - /sys:/sys:ro - - /var/lib/docker/:/var/lib/docker:ro - - /dev/disk/:/dev/disk:ro - devices: - - /dev/kmsg - networks: - - monitoring - restart: unless-stopped - deploy: - resources: - limits: - cpus: '0.5' - memory: 512M - -# ================================================================================== -# REDES SEGMENTADAS -# ================================================================================== -networks: - # Red frontal - Solo nginx y web app - frontend: - name: rss2_frontend - driver: bridge - internal: false - - # Red backend - Base de datos, workers, redis, qdrant - backend: - name: rss2_backend - driver: bridge - internal: false # Acceso externo permitido (necesario para ingestor) - - # Red de monitoreo - Prometheus, Grafana, cAdvisor - monitoring: - name: rss2_monitoring - driver: bridge - internal: true - 
-volumes: - prometheus_data: - grafana_data: - torch_extensions: diff --git a/docker-entrypoint-db.sh b/docker-entrypoint-db.sh deleted file mode 100755 index 1eb2722..0000000 --- a/docker-entrypoint-db.sh +++ /dev/null @@ -1,42 +0,0 @@ -#!/bin/bash -set -e - -# Detectar si la base de datos necesita reinicialización -PGDATA_DIR="/var/lib/postgresql/data/18/main" - -echo "RSS2: Checking database integrity..." - -# Si no existe el archivo de versión, es una base de datos nueva -if [ ! -f "$PGDATA_DIR/PG_VERSION" ]; then - echo "RSS2: New database - will be initialized by docker-entrypoint" -else - # Verificar si la base de datos es funcional - if ! pg_isready -h localhost -p 5432 -U "${POSTGRES_USER:-rss}" 2>/dev/null; then - echo "RSS2: Database appears corrupted - removing old data files for fresh initialization..." - # Eliminar solo los archivos de datos, no todo el directorio - rm -rf "$PGDATA_DIR"/* - echo "RSS2: Data files removed - docker-entrypoint will initialize fresh database" - else - echo "RSS2: Database is healthy" - fi -fi - -# Ejecutar el entrypoint original con los parámetros de PostgreSQL -exec docker-entrypoint.sh \ - postgres \ - -c max_connections=200 \ - -c shared_buffers=4GB \ - -c effective_cache_size=12GB \ - -c work_mem=16MB \ - -c maintenance_work_mem=512MB \ - -c autovacuum_max_workers=3 \ - -c autovacuum_vacuum_scale_factor=0.02 \ - -c autovacuum_vacuum_cost_limit=1000 \ - -c max_worker_processes=8 \ - -c max_parallel_workers=6 \ - -c max_parallel_workers_per_gather=2 \ - -c wal_level=replica \ - -c max_wal_senders=5 \ - -c wal_keep_size=1GB \ - -c hot_standby=on \ - "$@" diff --git a/frontend/Dockerfile b/frontend/Dockerfile deleted file mode 100644 index 5f06218..0000000 --- a/frontend/Dockerfile +++ /dev/null @@ -1,19 +0,0 @@ -FROM node:20-alpine AS builder - -WORKDIR /app - -COPY package*.json ./ -RUN npm install - -COPY . . 
-RUN npm run build - -FROM nginx:alpine - -COPY --from=builder /app/dist /usr/share/nginx/html - -COPY nginx.conf /etc/nginx/nginx.conf - -EXPOSE 80 - -CMD ["nginx", "-g", "daemon off;"] diff --git a/monitoring/prometheus.yml b/monitoring/prometheus.yml deleted file mode 100644 index 6cc80ce..0000000 --- a/monitoring/prometheus.yml +++ /dev/null @@ -1,21 +0,0 @@ -global: - scrape_interval: 15s - -scrape_configs: - - job_name: 'prometheus' - static_configs: - - targets: ['localhost:9090'] - - - job_name: 'cadvisor' - static_configs: - - targets: ['cadvisor:8080'] - - # If we had Node Exporter (for host metrics): - # - job_name: 'node_exporter' - # static_configs: - # - targets: ['node-exporter:9100'] - - # If the app exposes metrics (e.g. Flask/Gunicorn with prometheus_client) - # - job_name: 'rss2_web' - # static_configs: - # - targets: ['rss2_web:8000'] diff --git a/reset_and_deploy.sh b/reset_and_deploy.sh deleted file mode 100755 index c103aef..0000000 --- a/reset_and_deploy.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -echo "Stopping all containers..." -docker-compose down - -echo "Removing data volumes..." -# Use sudo if necessary, or ensure current user has permissions -rm -rf data/pgdata data/pgdata-replica data/redis-data data/qdrant_storage - -echo "Starting deployment from scratch..." -docker-compose up -d --build - -echo "Deployment complete. Checking status..." -docker-compose ps diff --git a/rss-ingestor-go/Dockerfile b/rss-ingestor-go/Dockerfile deleted file mode 100644 index b75cbaa..0000000 --- a/rss-ingestor-go/Dockerfile +++ /dev/null @@ -1,27 +0,0 @@ -FROM golang:alpine AS builder - -WORKDIR /app - -# Install git and SSL certs -RUN apk add --no-cache git ca-certificates - -# Copy source code immediately -COPY . . - -# Download dependencies -RUN go mod tidy && go mod download - -# Build the Go app -RUN CGO_ENABLED=0 GOOS=linux go build -a -installsuffix cgo -o rss-ingestor . 
- -# Final stage -FROM alpine:latest - -WORKDIR /root/ - -# Copy the Pre-built binary file from the previous stage -COPY --from=builder /app/rss-ingestor . -COPY --from=builder /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ - -# Command to run the executable -CMD ["./rss-ingestor"] diff --git a/start_docker.sh b/start_docker.sh deleted file mode 100755 index 1aef36d..0000000 --- a/start_docker.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash -# Script para iniciar los servicios de Docker -# Ejecutar con: sudo ./start_docker.sh - -set -e -cd "$(dirname "$0")" - -echo "=== RSS2 Docker Services ===" - -# Verificación de modelo eliminada (script de conversión no disponible) - -echo "" -echo "Iniciando servicios Docker..." -docker compose up -d --build - -echo "" -echo "✓ Servicios iniciados" -echo "" -echo "Para ver los logs:" -echo " docker compose logs -f translator" -echo "" -echo "Para verificar el estado:" -echo " docker compose ps" From 00c0254e6c99076cdb5e71e3c638a5ce81370499 Mon Sep 17 00:00:00 2001 From: SITO Date: Mon, 30 Mar 2026 21:02:27 +0200 Subject: [PATCH 2/8] docs: guia de despliegue nativo Debian sin Docker Documentacion completa para instalar COCONEWS en Debian/Ubuntu: - Requisitos hardware por modo (CPU/GPU) - Tabla de servicios y como se gestionan - Instalacion paso a paso incluyendo conversion del modelo NLLB-200 - Gestion de servicios systemd y logs - Estructura de directorios en servidor - Reglas de firewall recomendadas - Procedimiento de backup - Solucion de problemas frecuentes Co-Authored-By: Claude Sonnet 4.6 --- DEPLOY_DEBIAN.md | 283 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 283 insertions(+) create mode 100644 DEPLOY_DEBIAN.md diff --git a/DEPLOY_DEBIAN.md b/DEPLOY_DEBIAN.md new file mode 100644 index 0000000..54a8784 --- /dev/null +++ b/DEPLOY_DEBIAN.md @@ -0,0 +1,283 @@ +# COCONEWS · Despliegue en Debian (sin Docker) + +Guía completa para instalar y operar COCONEWS en un servidor Debian 12 (Bookworm) o Ubuntu 
22.04+, sin Docker ni contenedores. + +--- + +## Requisitos de hardware + +| Modo | CPU | RAM | Disco | +|------|-----|-----|-------| +| **Mínimo (solo CPU)** | 4 cores | 8 GB | 40 GB | +| **Recomendado** | 8 cores | 16 GB | 80 GB | +| **Con GPU** | 8 cores + NVIDIA | 16 GB | 80 GB | + +> Los modelos de IA (NLLB-200 + MiniLM + spaCy) ocupan ~5 GB en disco una vez descargados. + +--- + +## Servicios que se instalan en el servidor + +| Servicio | Tecnología | Gestionado por | +|----------|-----------|----------------| +| Base de datos | PostgreSQL 16 | apt + systemd | +| Caché | Redis 7 | apt + systemd | +| Búsqueda vectorial | Qdrant (binario) | systemd | +| API REST | Go (backend) | systemd | +| Ingestor RSS | Go | systemd | +| Scraper / Discovery / Wiki / Topics / Related / Qdrant-worker | Go | systemd | +| Traducción (NLLB-200) | Python + CTranslate2 | systemd | +| Embeddings | Python + Sentence-Transformers | systemd | +| NER | Python + spaCy | systemd | +| Clustering / Categorización | Python | systemd | +| Frontend | React (estático compilado) | nginx | +| Proxy / Web | nginx | apt + systemd | + +--- + +## Instalación paso a paso + +### 1. Clonar el repositorio + +```bash +git clone https://gitea.laenre.net/pietre/rss2.git /opt/src/rss2 +cd /opt/src/rss2 +git checkout coconews +``` + +### 2. Configurar variables de entorno + +```bash +cp deploy/debian/env.example /opt/rss2/.env +nano /opt/rss2/.env +``` + +Valores que **debes cambiar obligatoriamente**: + +```env +POSTGRES_PASSWORD=contraseña_segura_postgres +DB_PASS=contraseña_segura_postgres +REDIS_PASSWORD=contraseña_segura_redis +SECRET_KEY=cadena_aleatoria_minimo_32_caracteres +``` + +Genera claves seguras con: +```bash +openssl rand -hex 32 +``` + +### 3. Descargar y convertir el modelo de traducción (NLLB-200) + +Este paso se hace **una sola vez** y puede tardar 10-30 minutos dependiendo de la conexión. 
+ +```bash +# Instalar dependencias Python primero +python3 -m venv /opt/rss2/venv +/opt/rss2/venv/bin/pip install ctranslate2 transformers sentencepiece + +# Convertir modelo NLLB-200 a formato CTranslate2 +/opt/rss2/venv/bin/ct2-opus-mt-converter \ + --model facebook/nllb-200-distilled-600M \ + --output_dir /opt/rss2/models/nllb-ct2 \ + --quantization int8 + +# Alternativa si el comando anterior falla: +/opt/rss2/venv/bin/python -c " +import ctranslate2 +ctranslate2.converters.OpusMTConverter( + 'facebook/nllb-200-distilled-600M' +).convert('/opt/rss2/models/nllb-ct2', quantization='int8') +" +``` + +### 4. Ejecutar el instalador + +```bash +sudo bash /opt/src/rss2/deploy/debian/install.sh +``` + +El script hace automáticamente: +- Instala PostgreSQL, Redis, nginx, Go, Node.js via apt +- Descarga el binario de Qdrant +- Crea el usuario `rss2` del sistema +- Crea la base de datos y ejecuta las migraciones +- Compila los 8 binarios Go +- Compila el frontend React +- Instala y habilita los 16 servicios systemd + +--- + +## Verificar que funciona + +```bash +# Estado de todos los servicios +systemctl status rss2-backend rss2-ingestor rss2-translator rss2-embeddings + +# Ver logs en tiempo real +journalctl -u rss2-backend -f +journalctl -u rss2-translator -f + +# Comprobar que el API responde +curl http://localhost:8080/api/stats + +# Acceder al frontend +# http://IP_DEL_SERVIDOR:8001 +``` + +--- + +## Gestión de servicios + +### Iniciar / parar / reiniciar + +```bash +# Un servicio concreto +systemctl start rss2-backend +systemctl stop rss2-translator +systemctl restart rss2-embeddings + +# Todos los workers de una vez +systemctl restart rss2-backend rss2-ingestor rss2-scraper rss2-discovery \ + rss2-wiki rss2-topics rss2-related rss2-qdrant-worker \ + rss2-langdetect rss2-translation-scheduler rss2-translator \ + rss2-embeddings rss2-ner rss2-cluster rss2-categorizer +``` + +### Ver logs + +```bash +journalctl -u rss2-backend -f # API Go +journalctl -u 
rss2-translator -f # Traductor +journalctl -u rss2-embeddings -f # Embeddings +journalctl -u rss2-ner -f # NER entidades +journalctl -u rss2-ingestor -f # Ingestor RSS +``` + +--- + +## Actualizar el código + +Cuando hay nuevos cambios en el repositorio: + +```bash +cd /opt/src/rss2 +git pull +sudo bash deploy/debian/build.sh +``` + +El script `build.sh` recompila los binarios Go, el frontend y sincroniza los workers Python, y reinicia los servicios automáticamente. + +--- + +## Estructura de directorios en el servidor + +``` +/opt/rss2/ +├── .env # Variables de entorno (permisos 600) +├── bin/ # Binarios Go compilados +│ ├── server # API REST +│ ├── ingestor # Ingestor RSS +│ ├── scraper +│ ├── discovery +│ ├── wiki_worker +│ ├── topics +│ ├── related +│ └── qdrant_worker +├── src/ +│ └── workers/ # Workers Python +├── venv/ # Virtualenv Python (ML) +├── models/ +│ └── nllb-ct2/ # Modelo traduccion CTranslate2 +├── hf_cache/ # Cache HuggingFace (embeddings, NER) +├── frontend/ +│ └── dist/ # Frontend React compilado (servido por nginx) +├── data/ +│ ├── wiki_images/ # Imagenes Wikipedia descargadas +│ └── qdrant_storage/ # Datos vectoriales Qdrant +└── qdrant/ + └── qdrant # Binario Qdrant +``` + +--- + +## Requisitos de red / firewall + +Solo exponer al exterior el puerto **8001** (nginx). 
El resto deben ser internos: + +```bash +# Con ufw: +ufw allow 8001/tcp # COCONEWS web +ufw deny 5432/tcp # PostgreSQL - solo localhost +ufw deny 6379/tcp # Redis - solo localhost +ufw deny 6333/tcp # Qdrant - solo localhost +ufw deny 8080/tcp # API Go - solo localhost (nginx hace proxy) +ufw enable +``` + +--- + +## Backup de datos + +```bash +# PostgreSQL +sudo -u postgres pg_dump rss > /opt/rss2/backups/rss_$(date +%Y%m%d).sql + +# Datos Qdrant +systemctl stop rss2-qdrant +tar -czf /opt/rss2/backups/qdrant_$(date +%Y%m%d).tar.gz /opt/rss2/data/qdrant_storage +systemctl start rss2-qdrant + +# Imagenes Wikipedia (opcional, se pueden re-descargar) +tar -czf /opt/rss2/backups/wiki_images_$(date +%Y%m%d).tar.gz /opt/rss2/data/wiki_images +``` + +--- + +## Solución de problemas frecuentes + +### El traductor no arranca + +```bash +journalctl -u rss2-translator -n 50 +# Si dice "model not found": el modelo NLLB-200 no está convertido +# Ejecutar el paso 3 de la instalación +``` + +### PostgreSQL rechaza la conexión + +```bash +# Verificar que el .env tiene DB_HOST=127.0.0.1 (no "db") +grep DB_HOST /opt/rss2/.env + +# Verificar que el usuario existe +sudo -u postgres psql -c "\du" +``` + +### nginx devuelve 502 Bad Gateway + +```bash +# El backend Go no está corriendo +systemctl status rss2-backend +journalctl -u rss2-backend -n 30 +``` + +### Memoria insuficiente para los modelos Python + +Con 8 GB RAM el translator + embeddings + NER pueden coincidir. Si el servidor tiene poca RAM, deshabilitar el translator-gpu y bajar el batch: + +```bash +# En /opt/rss2/.env +TRANSLATOR_BATCH=8 +EMB_BATCH=32 +NER_BATCH=16 +systemctl restart rss2-translator rss2-embeddings rss2-ner +``` + +--- + +## Primer inicio de sesión + +1. Abrir `http://IP:8001` en el navegador +2. Al no haber usuarios, el sistema te redirige al registro +3. El primer usuario registrado se convierte en **administrador** +4. 
Ir a Configuración → Feeds → Importar el `feeds.csv` del repositorio para empezar con fuentes precargadas From ab3b0b53c552e99fd1ca53e8db200fbb8ebdfd90 Mon Sep 17 00:00:00 2001 From: SITO Date: Mon, 30 Mar 2026 21:11:56 +0200 Subject: [PATCH 3/8] fix(deploy): corregir 5 problemas bloqueantes para despliegue Debian MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - install.sh/build.sh: actualizar Go 1.23 → 1.25 (requerido por rss-ingestor-go) - install.sh/build.sh: nombrar binario qdrant como qdrant_worker para coincidir con rss2-qdrant-worker.service (ExecStart) - install.sh/build.sh: GOTOOLCHAIN=local en ingestor para evitar descarga automatica de toolchain Go superior - rss2-backend.service: sobreescribir hostnames Docker (libretranslate, ollama, spacy) por 127.0.0.1 para despliegue nativo - env.example: agregar TRANSLATION_URL, OLLAMA_URL, SPACY_URL con nota explicativa sobre uso en endpoints admin - DEPLOY_DEBIAN.md: corregir comando conversion NLLB-200 a CTranslate2 usando OpusMTConverter Python API en lugar de CLI incorrecto Co-Authored-By: Claude Sonnet 4.6 --- DEPLOY_DEBIAN.md | 25 ++++++++++------------ deploy/debian/build.sh | 9 ++++++-- deploy/debian/env.example | 7 ++++++ deploy/debian/install.sh | 19 ++++++++++------ deploy/debian/systemd/rss2-backend.service | 4 ++++ 5 files changed, 41 insertions(+), 23 deletions(-) diff --git a/DEPLOY_DEBIAN.md b/DEPLOY_DEBIAN.md index 54a8784..9835f0a 100644 --- a/DEPLOY_DEBIAN.md +++ b/DEPLOY_DEBIAN.md @@ -71,25 +71,22 @@ openssl rand -hex 32 Este paso se hace **una sola vez** y puede tardar 10-30 minutos dependiendo de la conexión. 
```bash -# Instalar dependencias Python primero +# Instalar dependencias Python primero (si aun no se hizo) python3 -m venv /opt/rss2/venv /opt/rss2/venv/bin/pip install ctranslate2 transformers sentencepiece -# Convertir modelo NLLB-200 a formato CTranslate2 -/opt/rss2/venv/bin/ct2-opus-mt-converter \ - --model facebook/nllb-200-distilled-600M \ - --output_dir /opt/rss2/models/nllb-ct2 \ - --quantization int8 - -# Alternativa si el comando anterior falla: -/opt/rss2/venv/bin/python -c " -import ctranslate2 -ctranslate2.converters.OpusMTConverter( - 'facebook/nllb-200-distilled-600M' -).convert('/opt/rss2/models/nllb-ct2', quantization='int8') -" +# Convertir modelo NLLB-200 a formato CTranslate2 (tarda 10-30 min) +/opt/rss2/venv/bin/python - <<'EOF' +from ctranslate2.converters import TransformersConverter +converter = TransformersConverter("facebook/nllb-200-distilled-600M") +converter.convert("/opt/rss2/models/nllb-ct2", quantization="int8", force=True) +print("Modelo convertido OK en /opt/rss2/models/nllb-ct2") +EOF ``` +> El modelo ocupa ~600 MB convertido. Si la descarga de HuggingFace falla, exporta +> `HF_ENDPOINT=https://huggingface.co` o usa un mirror. + ### 2. 
Ejecutar el instalador ```bash diff --git a/deploy/debian/build.sh b/deploy/debian/build.sh index 2699652..6dd81b2 100755 --- a/deploy/debian/build.sh +++ b/deploy/debian/build.sh @@ -22,19 +22,24 @@ if [[ -d "$REPO_ROOT/backend" ]]; then CGO_ENABLED=0 GOOS=linux go build -buildvcs=false -o "$RSS2_HOME/bin/server" ./cmd/server) info " [OK] server" - for cmd in scraper discovery wiki_worker topics related qdrant; do + for cmd in scraper discovery wiki_worker topics related; do [[ -d "$REPO_ROOT/backend/cmd/$cmd" ]] || continue (cd "$REPO_ROOT/backend" && \ CGO_ENABLED=0 GOOS=linux go build -buildvcs=false -o "$RSS2_HOME/bin/$cmd" "./cmd/$cmd") info " [OK] $cmd" done + # qdrant worker: nombre del binario debe coincidir con el service + [[ -d "$REPO_ROOT/backend/cmd/qdrant" ]] && \ + (cd "$REPO_ROOT/backend" && \ + CGO_ENABLED=0 GOOS=linux go build -buildvcs=false -o "$RSS2_HOME/bin/qdrant_worker" "./cmd/qdrant") + info " [OK] qdrant_worker" fi # --- Ingestor Go --- if [[ -d "$REPO_ROOT/rss-ingestor-go" ]]; then info "Compilando ingestor Go..." (cd "$REPO_ROOT/rss-ingestor-go" && \ - CGO_ENABLED=0 GOOS=linux go build -buildvcs=false -o "$RSS2_HOME/bin/ingestor" .) + GOTOOLCHAIN=local CGO_ENABLED=0 GOOS=linux go build -buildvcs=false -o "$RSS2_HOME/bin/ingestor" .) info " [OK] ingestor" fi diff --git a/deploy/debian/env.example b/deploy/debian/env.example index 52b91e7..406f928 100644 --- a/deploy/debian/env.example +++ b/deploy/debian/env.example @@ -34,6 +34,13 @@ TZ=Europe/Madrid # --- HuggingFace cache (modelos ML) --- HF_HOME=/opt/rss2/hf_cache +# --- Endpoints ML opcionales (solo si corres Ollama o LibreTranslate por separado) --- +# Los workers Python van directo a BD; estos endpoints solo los usa el backend para +# llamadas on-demand desde el panel admin (NER, traduccion manual, etc.) 
+TRANSLATION_URL=http://127.0.0.1:7790 +OLLAMA_URL=http://127.0.0.1:11434 +SPACY_URL=http://127.0.0.1:8000 + # --- Qdrant (local, sin Docker) --- QDRANT_HOST=127.0.0.1 QDRANT_PORT=6333 diff --git a/deploy/debian/install.sh b/deploy/debian/install.sh index 00da025..9627bbc 100755 --- a/deploy/debian/install.sh +++ b/deploy/debian/install.sh @@ -32,10 +32,10 @@ apt-get install -y --no-install-recommends \ ca-certificates tzdata \ libpq-dev -# Go (si no esta instalado o version < 1.22) -if ! command -v go &>/dev/null || [[ "$(go version | awk '{print $3}' | tr -d 'go')" < "1.22" ]]; then - info "Instalando Go 1.23..." - GO_VERSION="1.23.4" +# Go (rss-ingestor-go requiere Go 1.25) +if ! command -v go &>/dev/null || [[ "$(go version | awk '{print $3}' | tr -d 'go')" < "1.25" ]]; then + info "Instalando Go 1.25..." + GO_VERSION="1.25.0" ARCH=$(dpkg --print-architecture) case "$ARCH" in amd64) GO_ARCH="amd64" ;; @@ -186,18 +186,23 @@ if [[ -d "$REPO_ROOT/backend" ]]; then (cd "$REPO_ROOT/backend" && \ CGO_ENABLED=0 GOOS=linux go build -buildvcs=false -o "$RSS2_HOME/bin/server" ./cmd/server && \ info " [OK] server") || warn " [FAIL] server" - for cmd in scraper discovery wiki_worker topics related qdrant; do + for cmd in scraper discovery wiki_worker topics related; do [[ -d "$REPO_ROOT/backend/cmd/$cmd" ]] || continue (cd "$REPO_ROOT/backend" && \ CGO_ENABLED=0 GOOS=linux go build -buildvcs=false -o "$RSS2_HOME/bin/$cmd" "./cmd/$cmd" && \ info " [OK] $cmd") || warn " [FAIL] $cmd" done + # qdrant worker: output como qdrant_worker para coincidir con el service + [[ -d "$REPO_ROOT/backend/cmd/qdrant" ]] && \ + (cd "$REPO_ROOT/backend" && \ + CGO_ENABLED=0 GOOS=linux go build -buildvcs=false -o "$RSS2_HOME/bin/qdrant_worker" "./cmd/qdrant" && \ + info " [OK] qdrant_worker") || warn " [FAIL] qdrant_worker" fi -# RSS Ingestor Go (repo separado) +# RSS Ingestor Go (repo separado, requiere Go 1.25) if [[ -d "$REPO_ROOT/rss-ingestor-go" ]]; then (cd "$REPO_ROOT/rss-ingestor-go" && 
\ - CGO_ENABLED=0 GOOS=linux go build -buildvcs=false -o "$RSS2_HOME/bin/ingestor" . && \ + GOTOOLCHAIN=local CGO_ENABLED=0 GOOS=linux go build -buildvcs=false -o "$RSS2_HOME/bin/ingestor" . && \ info " [OK] ingestor") || warn " [FAIL] ingestor" fi diff --git a/deploy/debian/systemd/rss2-backend.service b/deploy/debian/systemd/rss2-backend.service index f50239f..3919116 100644 --- a/deploy/debian/systemd/rss2-backend.service +++ b/deploy/debian/systemd/rss2-backend.service @@ -9,6 +9,10 @@ User=rss2 Group=rss2 WorkingDirectory=/opt/rss2 EnvironmentFile=/opt/rss2/.env +# Sobreescribir hostnames Docker por localhost (los workers Python van directo a DB) +Environment=TRANSLATION_URL=http://127.0.0.1:7790 +Environment=OLLAMA_URL=http://127.0.0.1:11434 +Environment=SPACY_URL=http://127.0.0.1:8000 ExecStart=/opt/rss2/bin/server Restart=always RestartSec=5 From e3c682a36fcfbc781352caa7a3fa051427812999 Mon Sep 17 00:00:00 2001 From: SITO Date: Mon, 30 Mar 2026 21:17:11 +0200 Subject: [PATCH 4/8] feat: prerequisites, POC local y README reescrito MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit deploy/debian/prerequisites.sh: - Instalador de dependencias del sistema para Debian/Ubuntu - Detecta OS, instala PostgreSQL 16 (repo oficial), Redis, nginx, Go 1.25, Node.js 20 LTS, Qdrant binario, Python venv - Crea usuario rss2 y estructura /opt/rss2 - Pregunta interactivamente si instalar modelos ML pesados (ctranslate2, transformers, spaCy es_core_news_lg, NLLB-200) - Separado de install.sh para poder ejecutarlo independientemente poc/poc.sh: - POC local en ~2 minutos sin Docker, sin workers ML - Crea BD temporal coconews_poc con schema completo - Carga 10 noticias de muestra en español listas para ver - Compila backend Go y frontend React en /tmp/coconews-poc - Lanza Redis en puerto alternativo (6380) sin interferir - Sirve frontend con npx serve en http://127.0.0.1:18001 - Limpieza automatica al Ctrl+C poc/seed.sql: - 10 noticias de 
muestra en español (no requieren traduccion) - Categorias, continentes y paises basicos - 5 feeds de ejemplo (El Pais, BBC Mundo, etc.) README.md: - Reescrito completamente sin referencias Docker - Diagrama ASCII de arquitectura - Inicio rapido con poc.sh (2 minutos) - Instrucciones de install en Debian con prerequisites.sh - Tabla de requisitos hardware por modo - Mapa completo del repositorio Co-Authored-By: Claude Sonnet 4.6 --- README.md | 255 ++++++++++++++++----------- deploy/debian/prerequisites.sh | 305 +++++++++++++++++++++++++++++++++ poc/poc.sh | 213 +++++++++++++++++++++++ poc/seed.sql | 94 ++++++++++ 4 files changed, 766 insertions(+), 101 deletions(-) create mode 100755 deploy/debian/prerequisites.sh create mode 100755 poc/poc.sh create mode 100644 poc/seed.sql diff --git a/README.md b/README.md index 4019aaa..d70ea43 100644 --- a/README.md +++ b/README.md @@ -1,124 +1,177 @@ -# RSS2 - AI-Powered News Intelligence Platform +# COCONEWS -RSS2 es una plataforma avanzada de agregación, traducción, análisis y vectorización de noticias, diseñada para transformar flujos masivos de información en inteligencia accionable. Utiliza una arquitectura híbrida de microservicios (Go + Python) integrada con modelos de inteligencia artificial de última generación para ofrecer búsqueda semántica, clasificación inteligente y automatización de contenidos. +Plataforma de inteligencia de noticias. Agrega feeds RSS de cualquier idioma, los traduce automáticamente al español, extrae entidades, los agrupa por eventos y los hace buscables semánticamente. --- -## 🚀 Capacidades Principales +## Qué hace -* **Enriquecimiento con Wikipedia**: Sistema automatizado que detecta personas y organizaciones, descarga sus biografías e imágenes oficiales de Wikipedia para mostrarlas en tooltips interactivos con avatares circulares. 
-* **Categorización Inteligente (LLM)**: Clasificación de noticias mediante una instancia local de Mistral-7B / Llama-3 (vía Ollama), procesando contenido en tiempo real. -* **Búsqueda Semántica**: Motor vectorial Qdrant para descubrir noticias por contexto y significado, yendo más allá de las palabras clave tradicionales. -* **Traducción Neuronal de Alta Calidad**: Integración de NLLB-200 (vía CTranslate2) para traducir noticias de múltiples idiomas al español con precisión profesional. -* **Inteligencia de Entidades (NER)**: Extracción y normalización automática de Personas, Organizaciones y Lugares para análisis de tendencias y mapeo de relaciones. -* **Búsqueda de Noticias Relacionadas**: Algoritmos de similitud que agrupan noticias sobre el mismo tema automáticamente. +- **Ingesta** feeds RSS/Atom de cualquier idioma de forma continua +- **Traduce** al español con NLLB-200 (200 idiomas soportados) +- **Extrae entidades** (personas, organizaciones, lugares) con spaCy y las enriquece con Wikipedia +- **Genera embeddings** para búsqueda por significado, no solo por palabras clave +- **Agrupa noticias** del mismo evento automáticamente +- **Categoriza** contenido con reglas y modelos de lenguaje +- Interfaz web React con búsqueda semántica, filtros y tooltips de Wikipedia --- -## 🏗️ Arquitectura de Servicios (Docker) +## Arquitectura -El sistema se orquestra mediante Docker Compose y se divide en capas especializadas: - -### Capa de Acceso y API -| Servicio | Tecnología | Descripción | -|---------|------------|-------------| -| **`nginx`** | Nginx Alpine | Gateway y Proxy Inverso (Puerto **8001**). | -| **`rss2_frontend`** | React + Vite | Interfaz web de usuario moderna y responsiva. | -| **`backend-go`** | Go + Gin | API REST principal y gestión de lógica de negocio. | - -### Ingesta y Descubrimiento (Go) -| Servicio | Tecnología | Descripción | -|---------|------------|-------------| -| **`rss-ingestor-go`** | Go | Crawler de alto rendimiento para feeds RSS. 
| -| **`scraper`** | Go | Scraper profundo con sanitización de HTML y extracción de texto. | -| **`discovery`** | Go | Agente autónomo para descubrir nuevos feeds a partir de URLs. | - -### Procesamiento de Datos e IA (Go & Python) -| Servicio | Tecnología | Descripción | -|---------|------------|-------------| -| **`translator`** | NLLB-200 (CPU) | Traducción neuronal optimizada con CTranslate2. | -| **`translator-gpu`**| NLLB-200 (GPU) | Traducción acelerada por hardware (CUDA). | -| **`wiki-worker`** | Go | **[NUEVO]** Integración con Wikipedia y gestión de imágenes locales. | -| **`embeddings`** | S-Transformers | Generación de vectores para búsqueda semántica. | -| **`ner`** | Spacy / BERT | Reconocimiento de entidades nombradas (NER). | -| **`llm-categorizer`**| Ollama / Mistral | Clasificación avanzada mediante modelos de lenguaje. | -| **`topics`** | Go | Matcher automático de países y temas predefinidos. | -| **`related`** | Go | Motor de detección de noticias relacionadas. | - -### Capa de Almacenamiento -| Servicio | Tecnología | Descripción | -|---------|------------|-------------| -| **`db`** | PostgreSQL 18 | Base de datos relacional principal. | -| **`qdrant`** | Qdrant | Base de datos vectorial para búsqueda por similitud. | -| **`redis`** | Redis 7 | Colas de mensajes y caché de alto desempeño. | - ---- - -## ⚙️ Guía de Configuración - -### 1. Requisitos de Hardware -* **Modo Básico (CPU)**: 4+ Cores CPU, 8GB RAM. -* **Modo Avanzado (IA)**: NVIDIA GPU con 8GB+ VRAM (mínimo recomendado para LLM y Traducción GPU). - -### 2. 
Instalación Rápida -```bash -git clone -cd rss2 -cp .env.example .env -# Edita .env con tus credenciales -docker compose up -d +``` +Internet (RSS/Atom) + │ + ▼ + rss-ingestor-go ──→ PostgreSQL ──→ langdetect + │ │ │ + scraper/discovery │ translator (NLLB-200) + │ │ + │ embeddings (MiniLM) + │ │ + │ ner (spaCy) + │ │ + │ cluster / related + │ │ + │ qdrant-worker ──→ Qdrant + │ + backend-go (API REST :8080) + │ + nginx (:8001) + │ + Frontend React ``` -### 3. Escalado de Workers (¡Importante!) -Para aumentar la velocidad de procesamiento (especialmente la traducción), puedes escalar los workers: +**Stack:** +- Go 1.25 — API REST (Gin), ingestor RSS, scraper, workers +- Python 3 — Workers ML (NLLB-200, MiniLM, spaCy, CTranslate2) +- PostgreSQL 16 — datos relacionales + full-text search +- Qdrant — búsqueda vectorial semántica +- Redis 7 — caché de consultas +- React 18 + TypeScript + Tailwind — frontend +- nginx — proxy inverso + archivos estáticos + +--- + +## Inicio rápido (POC local) + +Prueba COCONEWS en tu máquina en ~2 minutos con datos de muestra, sin instalar los modelos ML. + +**Requisitos mínimos para el POC:** +- Go 1.25+ +- Node.js 18+ +- PostgreSQL (corriendo) +- Redis (corriendo) ```bash -# Ejecutar 4 traductores en paralelo -docker compose up -d --scale translator=4 +git clone https://gitea.laenre.net/pietre/rss2.git coconews +cd coconews +git checkout coconews -# Si usas GPU y tienes capacidad -docker compose up -d --scale translator-gpu=2 +bash poc/poc.sh +``` + +Abre `http://127.0.0.1:18001` en el navegador. +El primer usuario que se registre será administrador. + +--- + +## Instalación en servidor Debian + +Para un despliegue completo con todos los workers ML en producción: + +### 1. Instalar prerequisites + +```bash +sudo bash deploy/debian/prerequisites.sh +``` + +Instala: PostgreSQL 16, Redis, nginx, Go 1.25, Node.js 20, Qdrant, Python 3 venv. +Pregunta si instalar los modelos ML pesados ahora o después. + +### 2. 
Configurar entorno + +```bash +cp deploy/debian/env.example /opt/rss2/.env +nano /opt/rss2/.env # edita contraseñas y SECRET_KEY +``` + +### 3. Instalar y arrancar + +```bash +sudo bash deploy/debian/install.sh +``` + +Compila los binarios Go, el frontend React, crea los servicios systemd y arranca todo. + +### Acceder + +``` +http://IP_DEL_SERVIDOR:8001 +``` + +Guía completa: [DEPLOY_DEBIAN.md](DEPLOY_DEBIAN.md) + +--- + +## Gestión de servicios + +```bash +# Estado general +systemctl status rss2-backend rss2-ingestor rss2-translator + +# Logs en tiempo real +journalctl -u rss2-backend -f +journalctl -u rss2-translator -f + +# Reiniciar tras actualizar código +git pull +sudo bash deploy/debian/build.sh ``` --- -## 🛡️ Administración y Mantenimiento +## Actualizar el código -### Copias de Seguridad (Backups) -Desde el panel de Administración (`/admin/settings`), puedes realizar: -* **Backup Completo**: Volcado SQL de toda la base de datos. -* **Backup de Noticias (ZIP)**: **[NUEVO]** Genera un archivo comprimido que incluye las tablas de noticias, traducciones y todas sus etiquetas. Ideal para migraciones de contenido. - -### Variables de Entorno Clave (`.env`) -| Variable | Descripción | -|----------|-------------| -| `WIKI_SLEEP` | Tiempo de espera entre peticiones a Wikipedia (evita bloqueos). | -| `SCHEDULER_BATCH`| Cantidad de noticias a enviar a traducir por ciclo. | -| `TARGET_LANGS` | Idiomas destino (ej: `es`). | -| `OLLAMA_HOST` | Dirección del servidor Ollama para categorización. | - ---- - -## 📖 Documentación de la API (Campos Wikipedia) - -Las respuestas de noticias ahora incluyen el objeto `entities` enriquecido: - -```json -{ - "id": 67449, - "titulo": "...", - "entities": [ - { - "valor": "Apple", - "tipo": "organizacion", - "wiki_summary": "Apple Inc. 
es una empresa estadounidense...", - "wiki_url": "https://es.wikipedia.org/wiki/Apple", - "image_path": "/api/wiki-images/wiki_5723.png" - } - ] -} +```bash +cd /ruta/al/repo +git pull +sudo bash deploy/debian/build.sh ``` +`build.sh` recompila los binarios Go, el frontend y sincroniza los workers Python, y reinicia los servicios automáticamente. + --- -**RSS2** - *Transformando noticias en inteligencia con IA localizada.* +## Requisitos de hardware + +| Modo | CPU | RAM | Disco | +|------|-----|-----|-------| +| POC local | 2 cores | 4 GB | 10 GB | +| Producción CPU | 4+ cores | 8 GB | 40 GB | +| Producción recomendado | 8 cores | 16 GB | 80 GB | + +--- + +## Estructura del repositorio + +``` +├── backend/ Go — API REST + workers (scraper, discovery, wiki, topics, related, qdrant) +├── rss-ingestor-go/ Go — Ingestor de feeds RSS +├── frontend/ React + TypeScript + Tailwind +├── workers/ Python — ML workers (traducción, embeddings, NER, cluster, categorización) +├── init-db/ SQL — Schema y datos iniciales +├── migrations/ SQL — Migraciones incrementales +├── deploy/debian/ Scripts de despliegue para Debian sin Docker +│ ├── prerequisites.sh Instala todas las dependencias del sistema +│ ├── install.sh Instalación completa +│ ├── build.sh Recompila y reinicia tras actualizar código +│ ├── env.example Plantilla de variables de entorno +│ ├── nginx.conf Configuración nginx para despliegue nativo +│ └── systemd/ Ficheros de servicio systemd (16 servicios) +├── poc/ +│ ├── poc.sh POC local con datos de prueba (sin Docker, sin ML) +│ └── seed.sql Datos de muestra para el POC +├── feeds.csv Feeds RSS precargados para importar desde el admin +├── entity_config.json Aliases y blacklist para normalización de entidades NER +└── DEPLOY_DEBIAN.md Guía detallada de despliegue en Debian +``` diff --git a/deploy/debian/prerequisites.sh b/deploy/debian/prerequisites.sh new file mode 100755 index 0000000..156b9c6 --- /dev/null +++ b/deploy/debian/prerequisites.sh @@ -0,0 +1,305 @@ 
+#!/usr/bin/env bash +# ============================================================================= +# COCONEWS - Instalacion de prerequisites en Debian 12 / Ubuntu 22.04+ +# Ejecutar ANTES de install.sh +# Uso: sudo bash prerequisites.sh +# ============================================================================= +set -euo pipefail + +RED='\033[0;31m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m'; BLUE='\033[0;34m'; NC='\033[0m' +info() { echo -e "${GREEN}[OK]${NC} $*"; } +step() { echo -e "${BLUE}[-->]${NC} $*"; } +warn() { echo -e "${YELLOW}[WARN]${NC} $*"; } +error() { echo -e "${RED}[ERROR]${NC} $*"; exit 1; } + +[[ "$EUID" -ne 0 ]] && error "Ejecutar como root: sudo bash prerequisites.sh" + +# Detectar OS +if [[ -f /etc/os-release ]]; then + . /etc/os-release + OS_ID="$ID" + OS_VER="$VERSION_ID" +else + error "No se puede detectar el sistema operativo" +fi + +[[ "$OS_ID" == "debian" || "$OS_ID" == "ubuntu" ]] || \ + error "Solo soportado en Debian/Ubuntu. Detectado: $OS_ID" + +echo "" +echo "=================================================" +echo " COCONEWS - Instalador de Prerequisites" +echo " OS: $PRETTY_NAME" +echo "=================================================" +echo "" + +# ============================================================================= +# 1. PAQUETES APT BASE +# ============================================================================= +step "Actualizando repositorios apt..." +apt-get update -qq + +step "Instalando paquetes del sistema..." +apt-get install -y --no-install-recommends \ + curl wget git build-essential \ + ca-certificates gnupg lsb-release \ + software-properties-common apt-transport-https \ + tzdata locales \ + rsync \ + openssl \ + libpq-dev \ + libssl-dev \ + libffi-dev \ + libbz2-dev \ + libreadline-dev \ + libsqlite3-dev \ + zlib1g-dev +info "Paquetes base instalados" + +# ============================================================================= +# 2. 
POSTGRESQL 16 +# ============================================================================= +step "Instalando PostgreSQL 16..." +if ! command -v psql &>/dev/null; then + # Repositorio oficial de PostgreSQL + install -d /usr/share/postgresql-common/pgdg + curl -fsSL https://www.postgresql.org/media/keys/ACCC4CF8.asc \ + | gpg --dearmor -o /usr/share/postgresql-common/pgdg/apt.postgresql.org.gpg + echo "deb [signed-by=/usr/share/postgresql-common/pgdg/apt.postgresql.org.gpg] \ +https://apt.postgresql.org/pub/repos/apt $(lsb_release -cs)-pgdg main" \ + > /etc/apt/sources.list.d/pgdg.list + apt-get update -qq + apt-get install -y postgresql-16 postgresql-client-16 +fi +info "PostgreSQL: $(psql --version)" + +# ============================================================================= +# 3. REDIS +# ============================================================================= +step "Instalando Redis..." +if ! command -v redis-server &>/dev/null; then + apt-get install -y redis-server +fi +info "Redis: $(redis-server --version | cut -d' ' -f3)" + +# ============================================================================= +# 4. NGINX +# ============================================================================= +step "Instalando Nginx..." +if ! command -v nginx &>/dev/null; then + apt-get install -y nginx +fi +info "Nginx: $(nginx -v 2>&1 | cut -d'/' -f2)" + +# ============================================================================= +# 5. PYTHON 3 + pip + venv +# ============================================================================= +step "Instalando Python 3..." +apt-get install -y python3 python3-pip python3-venv python3-dev +info "Python: $(python3 --version)" + +# ============================================================================= +# 6. NODE.JS 20 LTS +# ============================================================================= +step "Instalando Node.js 20 LTS..." +if ! 
command -v node &>/dev/null || [[ "$(node -v | tr -d 'v' | cut -d. -f1)" -lt 18 ]]; then + curl -fsSL https://deb.nodesource.com/setup_20.x | bash - + apt-get install -y nodejs +fi +info "Node.js: $(node --version) | npm: $(npm --version)" + +# ============================================================================= +# 7. GO 1.25 +# ============================================================================= +step "Instalando Go 1.25..." +GO_VERSION="1.25.0" +INSTALLED_GO=$(go version 2>/dev/null | awk '{print $3}' | tr -d 'go' || echo "0") + +# Comparar version instalada +needs_go=false +if ! command -v go &>/dev/null; then + needs_go=true +else + IFS='.' read -ra INS <<< "$INSTALLED_GO" + IFS='.' read -ra REQ <<< "$GO_VERSION" + if [[ "${INS[0]}" -lt "${REQ[0]}" ]] || \ + ([[ "${INS[0]}" == "${REQ[0]}" ]] && [[ "${INS[1]:-0}" -lt "${REQ[1]:-0}" ]]); then + needs_go=true + fi +fi + +if [[ "$needs_go" == "true" ]]; then + ARCH=$(dpkg --print-architecture) + case "$ARCH" in + amd64) GO_ARCH="amd64" ;; + arm64) GO_ARCH="arm64" ;; + *) error "Arquitectura no soportada para Go: $ARCH" ;; + esac + step " Descargando Go ${GO_VERSION} (${GO_ARCH})..." + curl -fsSL "https://go.dev/dl/go${GO_VERSION}.linux-${GO_ARCH}.tar.gz" -o /tmp/go.tar.gz + rm -rf /usr/local/go + tar -C /usr/local -xzf /tmp/go.tar.gz + rm /tmp/go.tar.gz + # Perfil global + cat > /etc/profile.d/golang.sh << 'GOEOF' +export PATH=$PATH:/usr/local/go/bin +export GOPATH=$HOME/go +export PATH=$PATH:$GOPATH/bin +GOEOF + chmod +x /etc/profile.d/golang.sh + export PATH=$PATH:/usr/local/go/bin + info "Go instalado: $(go version)" +else + export PATH=$PATH:/usr/local/go/bin + info "Go ya instalado: $(go version)" +fi + +# ============================================================================= +# 8. QDRANT (binario oficial) +# ============================================================================= +step "Instalando Qdrant..." 
+QDRANT_VERSION="v1.12.1" +QDRANT_INSTALL_DIR="/opt/rss2/qdrant" +mkdir -p "$QDRANT_INSTALL_DIR" + +if [[ ! -f "$QDRANT_INSTALL_DIR/qdrant" ]]; then + ARCH=$(dpkg --print-architecture) + case "$ARCH" in + amd64) QDRANT_ARCH="x86_64-unknown-linux-musl" ;; + arm64) QDRANT_ARCH="aarch64-unknown-linux-musl" ;; + *) error "Arquitectura no soportada para Qdrant: $ARCH" ;; + esac + step " Descargando Qdrant ${QDRANT_VERSION}..." + curl -fsSL \ + "https://github.com/qdrant/qdrant/releases/download/${QDRANT_VERSION}/qdrant-${QDRANT_ARCH}.tar.gz" \ + -o /tmp/qdrant.tar.gz + tar -C "$QDRANT_INSTALL_DIR" -xzf /tmp/qdrant.tar.gz + chmod +x "$QDRANT_INSTALL_DIR/qdrant" + rm /tmp/qdrant.tar.gz + info "Qdrant ${QDRANT_VERSION} instalado en ${QDRANT_INSTALL_DIR}" +else + info "Qdrant ya instalado en ${QDRANT_INSTALL_DIR}" +fi + +# ============================================================================= +# 9. USUARIO DEL SISTEMA rss2 +# ============================================================================= +step "Creando usuario del sistema 'rss2'..." +if ! id rss2 &>/dev/null; then + useradd -r -m -d /opt/rss2 -s /bin/bash rss2 + info "Usuario 'rss2' creado" +else + info "Usuario 'rss2' ya existe" +fi + +# Crear estructura de directorios +mkdir -p \ + /opt/rss2/bin \ + /opt/rss2/src \ + /opt/rss2/data/wiki_images \ + /opt/rss2/data/qdrant_storage \ + /opt/rss2/hf_cache \ + /opt/rss2/models \ + /opt/rss2/frontend/dist \ + /opt/rss2/logs \ + /opt/rss2/backups +chown -R rss2:rss2 /opt/rss2 +info "Directorios /opt/rss2 creados" + +# ============================================================================= +# 10. PYTHON VIRTUALENV + DEPENDENCIAS BASE +# ============================================================================= +step "Creando virtualenv Python en /opt/rss2/venv..." +if [[ ! 
-d /opt/rss2/venv ]]; then
+    python3 -m venv /opt/rss2/venv
+fi
+/opt/rss2/venv/bin/pip install --upgrade pip setuptools wheel -q
+info "Virtualenv listo"
+
+# Dependencias base (sin los modelos pesados de ML)
+step "Instalando dependencias Python base..."
+/opt/rss2/venv/bin/pip install -q \
+    psycopg2-binary \
+    langdetect \
+    python-dotenv \
+    requests \
+    beautifulsoup4 \
+    lxml \
+    redis \
+    qdrant-client \
+    numpy \
+    scikit-learn \
+    tqdm
+info "Dependencias Python base instaladas"
+
+# =============================================================================
+# 11. DEPENDENCIAS ML (pesadas - opcional en este paso)
+# =============================================================================
+echo ""
+echo -e "${YELLOW}[?]${NC} Instalar dependencias ML pesadas ahora?"
+echo "    (ctranslate2, transformers, sentence-transformers, spaCy)"
+echo "    Puede tardar 20-40 minutos y usar ~5 GB de disco."
+echo -n "    [s/N]: "
+read -r install_ml
+
+if [[ "${install_ml,,}" == "s" || "${install_ml,,}" == "si" || "${install_ml,,}" == "y" ]]; then
+    step "Instalando dependencias ML (esto tarda)..."
+    # NOTA: el especificador de version va entre comillas; sin ellas bash
+    # interpreta '>=4.0.0' como una redireccion al fichero '=4.0.0' y el
+    # pin de version se pierde (igual que ya se hace con spacy mas abajo).
+    /opt/rss2/venv/bin/pip install -q \
+        "ctranslate2>=4.0.0" \
+        transformers==4.43.3 \
+        sentencepiece \
+        sacremoses \
+        accelerate \
+        sentence-transformers==3.0.1 \
+        "spacy>=3.7,<4.0" \
+        torch --index-url https://download.pytorch.org/whl/cpu
+    info "Dependencias ML instaladas"
+
+    step "Descargando modelo spaCy en español..."
+    /opt/rss2/venv/bin/python -m spacy download es_core_news_lg
+    info "Modelo spaCy es_core_news_lg listo"
+
+    step "Convirtiendo modelo NLLB-200 a CTranslate2..."
+    warn "Esto puede tardar 10-30 minutos y requiere ~2 GB de RAM"
+    mkdir -p /opt/rss2/models
+    /opt/rss2/venv/bin/python - <<'EOF'
+import os, sys
+os.makedirs("/opt/rss2/models/nllb-ct2", exist_ok=True)
+os.environ["HF_HOME"] = "/opt/rss2/hf_cache"
+try:
+    # NLLB-200 es un modelo Hugging Face Transformers, no OPUS-MT:
+    # se convierte con TransformersConverter (OpusMTConverter espera un
+    # directorio de modelo OPUS-MT y fallaria con este identificador).
+    from ctranslate2.converters import TransformersConverter
+    converter = TransformersConverter("facebook/nllb-200-distilled-600M")
+    converter.convert("/opt/rss2/models/nllb-ct2", quantization="int8", force=True)
+    print("[OK] Modelo NLLB-200 convertido en /opt/rss2/models/nllb-ct2")
+except Exception as e:
+    print(f"[ERROR] {e}")
+    print("Convierte manualmente despues con: deploy/debian/convert_model.sh")
+    sys.exit(0)
+EOF
+    chown -R rss2:rss2 /opt/rss2/models /opt/rss2/hf_cache
+else
+    warn "ML omitido. Ejecuta 'deploy/debian/install.sh' para instalarlas junto con el resto."
+    warn "Sin ML: la traduccion y los embeddings no funcionaran."
+fi
+
+chown -R rss2:rss2 /opt/rss2
+
+# =============================================================================
+# RESUMEN
+# =============================================================================
+echo ""
+echo "================================================="
+echo -e "  ${GREEN}Prerequisites instalados correctamente${NC}"
+echo "================================================="
+echo ""
+echo "  Sistema:    $PRETTY_NAME"
+echo "  Go:         $(go version 2>/dev/null | awk '{print $3}')"
+echo "  Python:     $(python3 --version)"
+echo "  Node.js:    $(node --version)"
+echo "  PostgreSQL: $(psql --version | awk '{print $3}')"
+echo "  Redis:      $(redis-server --version | awk '{print $3}' | tr -d ',')"
+echo "  Nginx:      $(nginx -v 2>&1 | cut -d'/' -f2)"
+echo ""
+echo "  Siguiente paso:"
+echo "      sudo bash deploy/debian/install.sh"
+echo ""
diff --git a/poc/poc.sh b/poc/poc.sh
new file mode 100755
index 0000000..b51552a
--- /dev/null
+++ b/poc/poc.sh
@@ -0,0 +1,213 @@
+#!/usr/bin/env bash
+# =============================================================================
+# COCONEWS - POC local (sin
Docker, sin ML workers) +# Levanta backend + frontend con datos de prueba en ~2 minutos +# +# Requisitos mínimos: Go 1.25, Node.js 18+, PostgreSQL, Redis +# Uso: bash poc/poc.sh +# ============================================================================= +set -euo pipefail + +REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +POC_DIR="$REPO_ROOT/poc" +TMP_DIR="/tmp/coconews-poc" +PID_FILE="$TMP_DIR/pids" + +export PATH=$PATH:/usr/local/go/bin + +RED='\033[0;31m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m'; BLUE='\033[0;34m'; BOLD='\033[1m'; NC='\033[0m' +info() { echo -e "${GREEN}[✓]${NC} $*"; } +step() { echo -e "${BLUE}[→]${NC} $*"; } +warn() { echo -e "${YELLOW}[!]${NC} $*"; } +error() { echo -e "${RED}[✗]${NC} $*"; exit 1; } + +# Manejar Ctrl+C: para todos los procesos del POC +cleanup() { + echo "" + warn "Deteniendo POC..." + if [[ -f "$PID_FILE" ]]; then + while IFS= read -r pid; do + kill "$pid" 2>/dev/null || true + done < "$PID_FILE" + rm -f "$PID_FILE" + fi + echo -e "${YELLOW}POC detenido.${NC}" + exit 0 +} +trap cleanup INT TERM + +mkdir -p "$TMP_DIR" +> "$PID_FILE" + +echo "" +echo -e "${BOLD}=================================================${NC}" +echo -e "${BOLD} COCONEWS · POC Local${NC}" +echo -e "${BOLD}=================================================${NC}" +echo "" + +# ============================================================================= +# 1. VERIFICAR PREREQUISITOS +# ============================================================================= +step "Verificando prerequisitos..." +command -v go &>/dev/null || error "Go no encontrado. Instala Go 1.25+." +command -v psql &>/dev/null || error "PostgreSQL no encontrado. Instala postgresql." +command -v redis-cli &>/dev/null || error "Redis no encontrado. Instala redis-server." +command -v node &>/dev/null || error "Node.js no encontrado. Instala Node.js 18+." 
+info "Prerequisitos OK (Go: $(go version | awk '{print $3}'), Node: $(node -v))" + +# ============================================================================= +# 2. CONFIGURACION POC (sin contrasenas fuertes, solo local) +# ============================================================================= +POC_DB_NAME="coconews_poc" +POC_DB_USER="coconews_poc" +POC_DB_PASS="poc_password_local" +POC_REDIS_PORT="6380" # Puerto alternativo para no interferir con Redis principal +POC_API_PORT="18080" +POC_FRONTEND_PORT="18001" + +export DATABASE_URL="postgres://${POC_DB_USER}:${POC_DB_PASS}@127.0.0.1:5432/${POC_DB_NAME}?sslmode=disable" +export REDIS_URL="redis://127.0.0.1:${POC_REDIS_PORT}" +export SECRET_KEY="poc_secret_key_solo_para_desarrollo_local" +export SERVER_PORT="$POC_API_PORT" +export GIN_MODE="release" + +# ============================================================================= +# 3. POSTGRESQL - crear DB de prueba +# ============================================================================= +step "Preparando base de datos POC (${POC_DB_NAME})..." + +# Verificar que PostgreSQL está corriendo +pg_isready -q || { + warn "PostgreSQL no está corriendo. Intentando iniciar..." + sudo systemctl start postgresql 2>/dev/null || \ + sudo service postgresql start 2>/dev/null || \ + error "No se puede iniciar PostgreSQL. Inícialo manualmente." 
+} + +# Crear usuario y BD de prueba +sudo -u postgres psql -q -tc "SELECT 1 FROM pg_roles WHERE rolname='${POC_DB_USER}'" \ + | grep -q 1 || \ + sudo -u postgres psql -q -c "CREATE USER ${POC_DB_USER} WITH PASSWORD '${POC_DB_PASS}';" 2>/dev/null + +sudo -u postgres psql -q -tc "SELECT 1 FROM pg_database WHERE datname='${POC_DB_NAME}'" \ + | grep -q 1 || \ + sudo -u postgres createdb -O "${POC_DB_USER}" "${POC_DB_NAME}" 2>/dev/null + +# Aplicar schema completo +sudo -u postgres psql -q -d "${POC_DB_NAME}" \ + -f "$REPO_ROOT/init-db/00-complete-schema.sql" 2>/dev/null || true + +# Otorgar permisos al usuario POC +sudo -u postgres psql -q -d "${POC_DB_NAME}" \ + -c "GRANT ALL PRIVILEGES ON ALL TABLES IN SCHEMA public TO ${POC_DB_USER}; + GRANT ALL PRIVILEGES ON ALL SEQUENCES IN SCHEMA public TO ${POC_DB_USER};" \ + 2>/dev/null || true + +# Cargar datos de muestra +sudo -u postgres psql -q -d "${POC_DB_NAME}" -f "$POC_DIR/seed.sql" 2>/dev/null || true + +NEWS_COUNT=$(sudo -u postgres psql -tq -d "${POC_DB_NAME}" -c "SELECT COUNT(*) FROM noticias;" 2>/dev/null | tr -d ' ') +info "BD lista: ${NEWS_COUNT} noticias de prueba cargadas" + +# ============================================================================= +# 4. REDIS (instancia temporal en puerto alternativo) +# ============================================================================= +step "Iniciando Redis en puerto ${POC_REDIS_PORT}..." +redis-server --port "$POC_REDIS_PORT" --daemonize yes \ + --logfile "$TMP_DIR/redis-poc.log" \ + --pidfile "$TMP_DIR/redis-poc.pid" \ + --maxmemory 128mb --maxmemory-policy allkeys-lru \ + 2>/dev/null || warn "Redis ya corriendo en ${POC_REDIS_PORT}" +sleep 1 +redis-cli -p "$POC_REDIS_PORT" ping &>/dev/null && info "Redis OK (puerto ${POC_REDIS_PORT})" +cat "$TMP_DIR/redis-poc.pid" 2>/dev/null >> "$PID_FILE" || true + +# ============================================================================= +# 5. 
COMPILAR BACKEND GO +# ============================================================================= +step "Compilando backend Go..." +mkdir -p "$TMP_DIR/bin" + +(cd "$REPO_ROOT/backend" && \ + CGO_ENABLED=0 go build -buildvcs=false -o "$TMP_DIR/bin/server" ./cmd/server \ + 2>"$TMP_DIR/build-backend.log") || { + cat "$TMP_DIR/build-backend.log" + error "Fallo al compilar backend. Ver log arriba." +} +info "Backend compilado OK" + +# ============================================================================= +# 6. ARRANCAR BACKEND API +# ============================================================================= +step "Arrancando API en puerto ${POC_API_PORT}..." +"$TMP_DIR/bin/server" > "$TMP_DIR/backend.log" 2>&1 & +BACKEND_PID=$! +echo "$BACKEND_PID" >> "$PID_FILE" + +# Esperar a que el backend responda +for i in {1..15}; do + sleep 1 + if curl -sf "http://127.0.0.1:${POC_API_PORT}/api/stats" &>/dev/null; then + info "API respondiendo en http://127.0.0.1:${POC_API_PORT}" + break + fi + if [[ $i -eq 15 ]]; then + cat "$TMP_DIR/backend.log" + error "El backend no responde. Ver log arriba." + fi +done + +# ============================================================================= +# 7. FRONTEND REACT +# ============================================================================= +step "Preparando frontend..." +cd "$REPO_ROOT/frontend" + +if [[ ! -d node_modules ]]; then + step " Instalando dependencias npm (primera vez)..." + npm install --silent +fi + +# Compilar apuntando al API local del POC +VITE_API_URL="http://127.0.0.1:${POC_API_PORT}" \ +npm run build -- --outDir "$TMP_DIR/frontend-dist" 2>"$TMP_DIR/build-frontend.log" || { + cat "$TMP_DIR/build-frontend.log" + error "Fallo al compilar frontend. Ver log arriba." +} +info "Frontend compilado OK" + +# Servir frontend con npx serve (simple, sin nginx) +step "Sirviendo frontend en puerto ${POC_FRONTEND_PORT}..." 
+npx --yes serve "$TMP_DIR/frontend-dist" -l "$POC_FRONTEND_PORT" \ + > "$TMP_DIR/frontend.log" 2>&1 & +FRONTEND_PID=$! +echo "$FRONTEND_PID" >> "$PID_FILE" +sleep 2 + +cd "$REPO_ROOT" + +# ============================================================================= +# LISTO +# ============================================================================= +echo "" +echo -e "${BOLD}${GREEN}=================================================${NC}" +echo -e "${BOLD}${GREEN} COCONEWS POC corriendo${NC}" +echo -e "${BOLD}${GREEN}=================================================${NC}" +echo "" +echo -e " ${BOLD}Frontend:${NC} http://127.0.0.1:${POC_FRONTEND_PORT}" +echo -e " ${BOLD}API:${NC} http://127.0.0.1:${POC_API_PORT}/api/stats" +echo "" +echo -e " ${BOLD}Login:${NC} Registra el primer usuario en la UI" +echo -e " (será admin automáticamente)" +echo "" +echo -e " ${BOLD}Noticias:${NC} ${NEWS_COUNT} artículos de prueba en español" +echo -e " ${YELLOW}Nota:${NC} Sin workers ML activos." +echo -e " Noticias no tendrán traducción ni entidades." +echo "" +echo -e " ${BLUE}Logs:${NC} $TMP_DIR/*.log" +echo "" +echo -e " Pulsa ${BOLD}Ctrl+C${NC} para detener el POC." 
+echo "" + +# Mantener el script corriendo +wait diff --git a/poc/seed.sql b/poc/seed.sql new file mode 100644 index 0000000..44772d7 --- /dev/null +++ b/poc/seed.sql @@ -0,0 +1,94 @@ +-- ============================================================================= +-- COCONEWS POC - Datos de prueba mínimos +-- Carga rápida para ver la interfaz funcionando sin workers ML +-- ============================================================================= + +-- Taxonomía base +INSERT INTO continentes (id, nombre) VALUES + (1, 'África'), (2, 'América'), (3, 'Asia'), + (4, 'Europa'), (5, 'Oceanía') +ON CONFLICT (id) DO NOTHING; + +INSERT INTO categorias (nombre) VALUES + ('Ciencia'), ('Cultura'), ('Deportes'), ('Economía'), + ('Internacional'), ('Política'), ('Salud'), ('Tecnología'), ('Sociedad') +ON CONFLICT DO NOTHING; + +INSERT INTO paises (nombre, continente_id) VALUES + ('España', 4), + ('Argentina', 2), + ('México', 2), + ('Francia', 4), + ('Estados Unidos', 2) +ON CONFLICT DO NOTHING; + +-- Config básica +INSERT INTO config (key, value) VALUES + ('translator_type', 'cpu'), + ('translator_workers', '1'), + ('translator_status', 'stopped') +ON CONFLICT (key) DO NOTHING; + +-- Feeds de muestra (en español, no necesitan traducción) +INSERT INTO feeds (nombre, descripcion, url, idioma, activo, fallos) +VALUES + ('El País', 'Noticias de España y el mundo', 'https://feeds.elpais.com/mrss-s/pages/ep/site/elpais.com/portada', 'es', true, 0), + ('El Mundo', 'Diario de información general', 'https://e00-elmundo.uecdn.es/elmundo/rss/portada.xml', 'es', true, 0), + ('La Vanguardia','Noticias de España y Cataluña', 'https://www.lavanguardia.com/mvc/feed/rss/home', 'es', true, 0), + ('BBC Mundo', 'Noticias en español de la BBC', 'https://feeds.bbci.co.uk/mundo/rss.xml', 'es', true, 0), + ('RT Español', 'Russia Today en español', 'https://actualidad.rt.com/rss', 'es', true, 0) +ON CONFLICT (url) DO NOTHING; + +-- Noticias de muestra (en español, listas para mostrarse sin 
traducción) +INSERT INTO noticias (id, titulo, resumen, url, fecha, fuente_nombre, categoria_id, lang, topics_processed) +VALUES + (md5('poc-001'), 'La inteligencia artificial transforma el mercado laboral global', + 'Los modelos de lenguaje de gran escala están redefiniendo sectores enteros de la economía, desde el servicio al cliente hasta el desarrollo de software. Empresas de todo el mundo aceleran su adopción mientras sindicatos y gobiernos debaten marcos regulatorios.', + 'https://example.com/ia-mercado-laboral', NOW() - INTERVAL '2 hours', + 'El País', 8, 'es', false), + + (md5('poc-002'), 'Cumbre climática de la ONU aprueba fondo de 100.000 millones para países vulnerables', + 'Los representantes de 196 países alcanzaron un acuerdo histórico en la última jornada de negociaciones. El fondo estará operativo en 2026 y priorizará adaptación en África subsahariana y pequeñas islas del Pacífico.', + 'https://example.com/cumbre-climatica-onu', NOW() - INTERVAL '4 hours', + 'BBC Mundo', 5, 'es', false), + + (md5('poc-003'), 'España registra el mayor crecimiento económico de la eurozona en el primer trimestre', + 'El PIB español creció un 3,2% interanual en los primeros tres meses del año, impulsado por el turismo, las exportaciones y el consumo interno. 
El Banco de España revisa al alza sus previsiones para el conjunto del ejercicio.', + 'https://example.com/economia-espana-pib', NOW() - INTERVAL '5 hours', + 'El Mundo', 4, 'es', false), + + (md5('poc-004'), 'La selección española de fútbol golea en la fase de clasificación', + 'La Roja aplastó por 4-0 al combinado rival con dos goles de Yamal y otros tantos de Morata en un partido que dejó pocas dudas sobre el potencial del equipo de Luis de la Fuente de cara al próximo gran torneo.', + 'https://example.com/seleccion-espana-futbol', NOW() - INTERVAL '6 hours', + 'La Vanguardia', 3, 'es', false), + + (md5('poc-005'), 'Descubrimiento arqueológico en Extremadura revela ciudad romana inédita', + 'Un equipo de la Universidad de Extremadura ha localizado los restos de un asentamiento romano del siglo II d.C. con teatro, termas y foro en perfecto estado de conservación bajo un olivar de la comarca de La Serena.', + 'https://example.com/arqueologia-extremadura', NOW() - INTERVAL '8 hours', + 'El País', 2, 'es', false), + + (md5('poc-006'), 'Nuevo fármaco contra el Alzheimer obtiene aprobación de la EMA', + 'La Agencia Europea del Medicamento ha dado luz verde al primer tratamiento que demuestra ralentizar significativamente el deterioro cognitivo en fases tempranas. 
El medicamento llegará a las farmacias europeas antes de final de año.', + 'https://example.com/farmaco-alzheimer-ema', NOW() - INTERVAL '10 hours', + 'BBC Mundo', 7, 'es', false), + + (md5('poc-007'), 'México anuncia plan de inversión en energías renovables por 50.000 millones', + 'El gobierno mexicano presentó su Estrategia Nacional de Transición Energética que contempla duplicar la capacidad solar y eólica instalada antes de 2030, con fuerte participación de capital privado nacional e internacional.', + 'https://example.com/mexico-renovables', NOW() - INTERVAL '12 hours', + 'RT Español', 5, 'es', false), + + (md5('poc-008'), 'OpenAI lanza modelo multimodal capaz de generar video fotorrealista en tiempo real', + 'La empresa californiana presentó Sora 2, capaz de producir secuencias de vídeo de alta definición en menos de 30 segundos. Investigadores advierten sobre los riesgos de desinformación mientras la compañía promete mecanismos de marca de agua.', + 'https://example.com/openai-sora2', NOW() - INTERVAL '14 hours', + 'El Mundo', 8, 'es', false), + + (md5('poc-009'), 'Argentina cierra acuerdo comercial con la Unión Europea tras 25 años de negociaciones', + 'El tratado de libre comercio Mercosur-UE entra en vigor de forma provisional tras superar los últimos obstáculos relacionados con protección ambiental y acceso al mercado agrícola europeo para los productos del cono sur.', + 'https://example.com/argentina-acuerdo-ue', NOW() - INTERVAL '18 hours', + 'BBC Mundo', 4, 'es', false), + + (md5('poc-010'), 'Telescopio James Webb detecta atmósfera en exoplaneta a 40 años luz de la Tierra', + 'Astrónomos del Instituto de Tecnología de California confirmaron la presencia de dióxido de carbono y vapor de agua en la atmósfera del exoplaneta K2-18b, abriendo nuevas posibilidades en la búsqueda de condiciones habitables fuera del sistema solar.', + 'https://example.com/james-webb-exoplaneta', NOW() - INTERVAL '22 hours', + 'El País', 1, 'es', false) +ON CONFLICT (id) DO 
NOTHING; From ec839b5b54b629e12c15a95274eab815c5c5a5cc Mon Sep 17 00:00:00 2001 From: SITO Date: Mon, 30 Mar 2026 22:04:22 +0200 Subject: [PATCH 5/8] feat(debug): troubleshooting y diagnostico en scripts poc/poc.sh: - Verificacion de versiones: Go >=1.22, Node.js >=18 con mensaje de fix exacto - Deteccion de puertos en uso antes de arrancar (API, frontend, Redis) con instruccion de quien ocupa el puerto - show_log_tail(): muestra solo las lineas relevantes del log al fallar - Compilacion Go: filtra lineas de error reales (error:/undefined:) en vez de volcar todo el log - Backend no responde: sugiere probar DB y Redis individualmente con el comando exacto para diagnosticar - Frontend: distingue error de npm install vs error de build TypeScript - Flag --clean para borrar BD POC y empezar de cero - Logs separados por componente en /tmp/coconews-poc/logs/ deploy/debian/check.sh (nuevo): - Diagnostico completo del sistema post-instalacion - Verifica 16 servicios systemd con estado y fix especifico por cada uno - Prueba conectividad real: PostgreSQL, Redis (con auth), Qdrant HTTP, API Go, nginx - Muestra metricas de BD: total noticias, traducciones hechas y pendientes - Verifica binarios Go compilados y su tamano - Verifica modelos ML: NLLB-200, spaCy es_core_news_lg, sentence-transformers, ctranslate2 - Comprueba disco (avisa si >75% o >90%), permisos de /opt/rss2 y .env - Detecta si .env tiene valores por defecto sin cambiar - Modo --quick para ver solo estado arriba/abajo rapidamente - Resumen final con conteo de errores y advertencias, exit code 1 si hay errores deploy/debian/prerequisites.sh: - Comprobacion de espacio libre en disco al inicio (avisa si <10 GB) - apt-get update con log de error y sugerencias de fix - Seccion de troubleshooting en el resumen final con fixes comunes Co-Authored-By: Claude Sonnet 4.6 --- deploy/debian/check.sh | 342 ++++++++++++++++++++++++++++++++ deploy/debian/prerequisites.sh | 26 ++- poc/poc.sh | 352 
++++++++++++++++++++++++++------- 3 files changed, 651 insertions(+), 69 deletions(-) create mode 100755 deploy/debian/check.sh diff --git a/deploy/debian/check.sh b/deploy/debian/check.sh new file mode 100755 index 0000000..c001e0b --- /dev/null +++ b/deploy/debian/check.sh @@ -0,0 +1,342 @@ +#!/usr/bin/env bash +# ============================================================================= +# COCONEWS - Diagnóstico del sistema +# Verifica que todos los servicios estén OK y muestra estado + sugerencias +# +# Uso: +# bash deploy/debian/check.sh # diagnóstico completo +# bash deploy/debian/check.sh --quick # solo servicios arriba/abajo +# ============================================================================= + +RSS2_HOME="/opt/rss2" +API_PORT="8080" +QUICK=false +[[ "${1:-}" == "--quick" ]] && QUICK=true + +RED='\033[0;31m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m' +BLUE='\033[0;34m'; BOLD='\033[1m'; DIM='\033[2m'; NC='\033[0m' +CYAN='\033[0;36m' + +OK() { echo -e " ${GREEN}[✓]${NC} $*"; } +FAIL() { echo -e " ${RED}[✗]${NC} $*"; } +WARN() { echo -e " ${YELLOW}[!]${NC} $*"; } +INFO() { echo -e " ${DIM} $*${NC}"; } +HEAD() { echo -e "\n${BOLD}${CYAN}── $* ${NC}"; } +FIX() { echo -e " ${YELLOW} → Fix:${NC} $*"; } + +ERRORS=0 +WARNINGS=0 + +fail_with_fix() { + FAIL "$1" + shift + for fix in "$@"; do FIX "$fix"; done + (( ERRORS++ )) || true +} + +warn_with_fix() { + WARN "$1" + shift + for fix in "$@"; do FIX "$fix"; done + (( WARNINGS++ )) || true +} + +echo "" +echo -e "${BOLD}╔═══════════════════════════════════════════╗${NC}" +echo -e "${BOLD}║ COCONEWS · Diagnóstico del Sistema ║${NC}" +echo -e "${BOLD}╚═══════════════════════════════════════════╝${NC}" +echo -e "${DIM}$(date '+%Y-%m-%d %H:%M:%S') · $(hostname)${NC}" + +# ============================================================================= +# 1. 
SERVICIOS SYSTEMD +# ============================================================================= +HEAD "Servicios systemd" + +GO_SERVICES=( + "rss2-backend:API REST Go" + "rss2-ingestor:Ingestor RSS" + "rss2-scraper:Scraper HTML" + "rss2-discovery:Discovery de feeds" + "rss2-wiki:Wiki worker" + "rss2-topics:Topics matcher" + "rss2-related:Related news" + "rss2-qdrant-worker:Qdrant sync" +) +PY_SERVICES=( + "rss2-langdetect:Detección de idioma" + "rss2-translation-scheduler:Scheduler traducción" + "rss2-translator:Traductor NLLB-200" + "rss2-embeddings:Embeddings ML" + "rss2-ner:NER entidades" + "rss2-cluster:Clustering eventos" + "rss2-categorizer:Categorizador" +) +INFRA_SERVICES=( + "rss2-qdrant:Vector DB Qdrant" + "postgresql:PostgreSQL" + "redis-server:Redis" + "nginx:Nginx" +) + +check_service() { + local svc="${1%%:*}" + local name="${1##*:}" + if ! command -v systemctl &>/dev/null; then + WARN "$name (systemctl no disponible)" + return + fi + local state + state=$(systemctl is-active "$svc" 2>/dev/null || echo "unknown") + case "$state" in + active) + OK "$name ($svc)" + ;; + failed) + fail_with_fix "$name ($svc) — FAILED" \ + "journalctl -u $svc -n 30 --no-pager" \ + "systemctl restart $svc" + ;; + inactive) + warn_with_fix "$name ($svc) — inactivo" \ + "systemctl start $svc" \ + "systemctl enable $svc" + ;; + *) + warn_with_fix "$name ($svc) — $state" \ + "systemctl status $svc" + ;; + esac +} + +echo -e "\n ${DIM}Infraestructura:${NC}" +for svc in "${INFRA_SERVICES[@]}"; do check_service "$svc"; done + +echo -e "\n ${DIM}Workers Go:${NC}" +for svc in "${GO_SERVICES[@]}"; do check_service "$svc"; done + +echo -e "\n ${DIM}Workers Python (ML):${NC}" +for svc in "${PY_SERVICES[@]}"; do check_service "$svc"; done + +[[ "$QUICK" == "true" ]] && { + echo "" + [[ "$ERRORS" -eq 0 && "$WARNINGS" -eq 0 ]] && \ + echo -e " ${GREEN}${BOLD}Todo OK${NC}" || \ + echo -e " ${RED}Errores: $ERRORS ${YELLOW}Advertencias: $WARNINGS${NC}" + exit 0 +} + +# 
=============================================================================
+# 2. CONECTIVIDAD
+# =============================================================================
+HEAD "Conectividad"
+
+# PostgreSQL
+PG_ENV="$RSS2_HOME/.env"
+if [[ -f "$PG_ENV" ]]; then
+    source "$PG_ENV" 2>/dev/null || true
+fi
+DB_NAME="${POSTGRES_DB:-rss}"
+DB_USER="${POSTGRES_USER:-rss}"
+
+if pg_isready -q 2>/dev/null; then
+    OK "PostgreSQL acepta conexiones"
+    # Probar conexión con usuario rss2
+    if sudo -u postgres psql -d "$DB_NAME" -c "SELECT 1" &>/dev/null; then
+        NEWS=$(sudo -u postgres psql -tq -d "$DB_NAME" \
+            -c "SELECT COUNT(*) FROM noticias;" 2>/dev/null | tr -d ' \n' || echo "?")
+        TRANS=$(sudo -u postgres psql -tq -d "$DB_NAME" \
+            -c "SELECT COUNT(*) FROM traducciones WHERE status='done';" 2>/dev/null | tr -d ' \n' || echo "?")
+        PEND=$(sudo -u postgres psql -tq -d "$DB_NAME" \
+            -c "SELECT COUNT(*) FROM traducciones WHERE status='pending';" 2>/dev/null | tr -d ' \n' || echo "?")
+        INFO "Base de datos: $DB_NAME"
+        INFO "Noticias: $NEWS | Traducciones hechas: $TRANS | Pendientes: $PEND"
+    else
+        warn_with_fix "No se puede conectar a la BD '$DB_NAME' con usuario '$DB_USER'" \
+            "sudo -u postgres psql -c \"\\du\"   # listar usuarios" \
+            "Revisa POSTGRES_USER y POSTGRES_PASSWORD en $RSS2_HOME/.env"
+    fi
+else
+    fail_with_fix "PostgreSQL no responde" \
+        "sudo systemctl start postgresql" \
+        "sudo journalctl -u postgresql -n 20 --no-pager" \
+        "sudo pg_ctlcluster 16 main status"
+fi
+
+# Redis
+REDIS_PASS="${REDIS_PASSWORD:-}"
+REDIS_AUTH=""
+[[ -n "$REDIS_PASS" ]] && REDIS_AUTH="-a $REDIS_PASS"
+if redis-cli $REDIS_AUTH ping 2>/dev/null | grep -q PONG; then
+    REDIS_MEM=$(redis-cli $REDIS_AUTH info memory 2>/dev/null | grep used_memory_human | cut -d: -f2 | tr -d '\r' || echo "?")
+    OK "Redis responde (memoria usada: ${REDIS_MEM})"
+else
+    fail_with_fix "Redis no responde" \
+        "sudo systemctl start redis-server" \
+        "redis-cli ping   # sin auth" \
+        "Verifica REDIS_PASSWORD en
$RSS2_HOME/.env" +fi + +# Qdrant +if curl -sf "http://127.0.0.1:6333/healthz" &>/dev/null; then + QDRANT_VER=$(curl -sf "http://127.0.0.1:6333/" 2>/dev/null | python3 -c "import sys,json; d=json.load(sys.stdin); print(d.get('version','?'))" 2>/dev/null || echo "?") + OK "Qdrant responde (v${QDRANT_VER})" +else + warn_with_fix "Qdrant no responde en :6333" \ + "systemctl start rss2-qdrant" \ + "journalctl -u rss2-qdrant -n 20 --no-pager" \ + "Búsqueda semántica no funcionará hasta que Qdrant esté activo" +fi + +# API Backend Go +if curl -sf "http://127.0.0.1:${API_PORT}/api/stats" &>/dev/null; then + STATS=$(curl -sf "http://127.0.0.1:${API_PORT}/api/stats" 2>/dev/null | \ + python3 -c "import sys,json; d=json.load(sys.stdin); print(f\"noticias={d.get('total_news','?')} feeds={d.get('total_feeds','?')}\")" 2>/dev/null || echo "") + OK "API backend responde (:${API_PORT}) — $STATS" +else + fail_with_fix "API backend no responde en :${API_PORT}" \ + "systemctl restart rss2-backend" \ + "journalctl -u rss2-backend -n 30 --no-pager" \ + "curl http://127.0.0.1:${API_PORT}/api/stats # prueba manual" +fi + +# Nginx +if curl -sf "http://127.0.0.1:8001/health" &>/dev/null; then + OK "Nginx responde (:8001)" +else + fail_with_fix "Nginx no responde en :8001" \ + "nginx -t # verificar config" \ + "systemctl restart nginx" \ + "journalctl -u nginx -n 20 --no-pager" +fi + +# ============================================================================= +# 3. 
BINARIOS GO +# ============================================================================= +HEAD "Binarios compilados" + +BINS=(server ingestor scraper discovery wiki_worker topics related qdrant_worker) +for bin in "${BINS[@]}"; do + BIN_PATH="$RSS2_HOME/bin/$bin" + if [[ -x "$BIN_PATH" ]]; then + SIZE=$(du -sh "$BIN_PATH" 2>/dev/null | cut -f1) + OK "$bin (${SIZE})" + else + fail_with_fix "$bin no encontrado o sin permisos de ejecución en $BIN_PATH" \ + "sudo bash deploy/debian/build.sh # recompila todos los binarios" + fi +done + +# ============================================================================= +# 4. MODELOS ML +# ============================================================================= +HEAD "Modelos ML" + +# NLLB-200 CTranslate2 +if [[ -d "$RSS2_HOME/models/nllb-ct2" ]] && \ + [[ -f "$RSS2_HOME/models/nllb-ct2/model.bin" ]]; then + SIZE=$(du -sh "$RSS2_HOME/models/nllb-ct2" 2>/dev/null | cut -f1) + OK "NLLB-200 CTranslate2 (${SIZE})" +else + warn_with_fix "Modelo NLLB-200 no encontrado en $RSS2_HOME/models/nllb-ct2" \ + "El traductor no funcionará hasta que esté el modelo" \ + "Convierte con: ver sección 3 de DEPLOY_DEBIAN.md" +fi + +# spaCy +if "$RSS2_HOME/venv/bin/python" -c "import spacy; spacy.load('es_core_news_lg')" &>/dev/null 2>&1; then + OK "spaCy es_core_news_lg" +else + warn_with_fix "Modelo spaCy es_core_news_lg no disponible" \ + "$RSS2_HOME/venv/bin/python -m spacy download es_core_news_lg" \ + "El worker NER no funcionará hasta instalarlo" +fi + +# sentence-transformers (embeddings) +if "$RSS2_HOME/venv/bin/python" -c "from sentence_transformers import SentenceTransformer" &>/dev/null 2>&1; then + OK "sentence-transformers disponible" +else + warn_with_fix "sentence-transformers no instalado" \ + "$RSS2_HOME/venv/bin/pip install sentence-transformers==3.0.1" \ + "Los embeddings y búsqueda semántica no funcionarán" +fi + +# ctranslate2 +if "$RSS2_HOME/venv/bin/python" -c "import ctranslate2" &>/dev/null 2>&1; then + OK 
"ctranslate2 disponible" +else + warn_with_fix "ctranslate2 no instalado" \ + "$RSS2_HOME/venv/bin/pip install ctranslate2>=4.0.0" \ + "La traducción NLLB-200 no funcionará" +fi + +# ============================================================================= +# 5. DISCO Y PERMISOS +# ============================================================================= +HEAD "Disco y permisos" + +# Espacio en disco +DISK_FREE=$(df -h "$RSS2_HOME" 2>/dev/null | tail -1 | awk '{print $4}') +DISK_PCT=$(df "$RSS2_HOME" 2>/dev/null | tail -1 | awk '{print $5}' | tr -d '%') +if [[ "${DISK_PCT:-0}" -gt 90 ]]; then + fail_with_fix "Disco al ${DISK_PCT}% — quedan solo ${DISK_FREE}" \ + "du -sh $RSS2_HOME/* | sort -rh | head -10 # ver qué ocupa más" \ + "Los modelos ML necesitan ~5 GB libres" +elif [[ "${DISK_PCT:-0}" -gt 75 ]]; then + warn_with_fix "Disco al ${DISK_PCT}% (libre: ${DISK_FREE})" \ + "Revisa el espacio antes de descargar modelos ML" +else + OK "Disco: ${DISK_PCT}% usado, ${DISK_FREE} libres" +fi + +# Permisos de /opt/rss2 +if [[ -d "$RSS2_HOME" ]]; then + OWNER=$(stat -c '%U' "$RSS2_HOME" 2>/dev/null || echo "?") + if [[ "$OWNER" == "rss2" ]]; then + OK "Propietario de $RSS2_HOME: rss2" + else + warn_with_fix "Propietario de $RSS2_HOME es '$OWNER', debería ser 'rss2'" \ + "sudo chown -R rss2:rss2 $RSS2_HOME" + fi +fi + +# .env +if [[ -f "$RSS2_HOME/.env" ]]; then + ENV_PERMS=$(stat -c '%a' "$RSS2_HOME/.env" 2>/dev/null || echo "?") + if [[ "$ENV_PERMS" == "600" ]]; then + OK ".env con permisos 600 (correcto)" + else + warn_with_fix ".env con permisos ${ENV_PERMS} (debería ser 600)" \ + "chmod 600 $RSS2_HOME/.env" + fi + # Verificar que no quedan valores por defecto peligrosos + if grep -q "CAMBIA_ESTO\|changeme\|change_this" "$RSS2_HOME/.env" 2>/dev/null; then + warn_with_fix ".env tiene valores por defecto sin cambiar" \ + "nano $RSS2_HOME/.env # cambia POSTGRES_PASSWORD, REDIS_PASSWORD, SECRET_KEY" + fi +else + fail_with_fix ".env no encontrado en $RSS2_HOME/.env" 
\ + "cp deploy/debian/env.example $RSS2_HOME/.env" \ + "nano $RSS2_HOME/.env # edita contraseñas" +fi + +# ============================================================================= +# RESUMEN FINAL +# ============================================================================= +echo "" +echo -e "${BOLD}══════════════════════════════════════════════${NC}" +if [[ "$ERRORS" -eq 0 && "$WARNINGS" -eq 0 ]]; then + echo -e " ${GREEN}${BOLD}Sistema OK — todo funcionando correctamente${NC}" +elif [[ "$ERRORS" -eq 0 ]]; then + echo -e " ${YELLOW}${BOLD}Sistema funcional con $WARNINGS advertencia(s)${NC}" + echo -e " ${DIM}Las advertencias no bloquean el funcionamiento básico${NC}" +else + echo -e " ${RED}${BOLD}$ERRORS error(es) encontrado(s)${YELLOW} $WARNINGS advertencia(s)${NC}" + echo -e " ${DIM}Sigue los fixes indicados arriba para resolverlos${NC}" +fi +echo -e "${BOLD}══════════════════════════════════════════════${NC}" +echo "" +echo -e " ${DIM}Logs de servicios: journalctl -u rss2-backend -f${NC}" +echo -e " ${DIM}Diagnóstico rápido: bash deploy/debian/check.sh --quick${NC}" +echo "" + +exit $(( ERRORS > 0 ? 1 : 0 )) diff --git a/deploy/debian/prerequisites.sh b/deploy/debian/prerequisites.sh index 156b9c6..3f65328 100755 --- a/deploy/debian/prerequisites.sh +++ b/deploy/debian/prerequisites.sh @@ -36,8 +36,22 @@ echo "" # ============================================================================= # 1. PAQUETES APT BASE # ============================================================================= +# Comprobar espacio en disco (mínimo 10 GB libres) +DISK_FREE_GB=$(df / --output=avail -BG | tail -1 | tr -d 'G ') +if [[ "${DISK_FREE_GB:-0}" -lt 10 ]]; then + warn "Espacio libre en disco: ${DISK_FREE_GB} GB (recomendado mínimo 10 GB)" + warn "Los modelos ML necesitan ~5 GB. Continua bajo tu responsabilidad." +fi + step "Actualizando repositorios apt..." 
-apt-get update -qq +apt-get update -qq 2>/tmp/apt-update.log || { + echo -e "${RED}Error al actualizar repositorios apt.${NC}" + echo " Posibles causas:" + echo " • Sin conexión a internet" + echo " • Repositorios con errores: cat /tmp/apt-update.log" + echo " • Solución: apt-get update --fix-missing" + exit 1 +} step "Instalando paquetes del sistema..." apt-get install -y --no-install-recommends \ @@ -285,7 +299,7 @@ fi chown -R rss2:rss2 /opt/rss2 # ============================================================================= -# RESUMEN +# RESUMEN Y SIGUIENTES PASOS # ============================================================================= echo "" echo "=================================================" @@ -303,3 +317,11 @@ echo "" echo " Siguiente paso:" echo " sudo bash deploy/debian/install.sh" echo "" +echo " Si algo falló durante la instalación:" +echo " • apt falla: apt-get update --fix-missing" +echo " • Go no descarga: verifica conectividad → curl https://go.dev" +echo " • Qdrant no baja: descárgalo manualmente en" +echo " https://github.com/qdrant/qdrant/releases" +echo " • pip falla: python3 -m venv /opt/rss2/venv --clear" +echo " • Diagnóstico: bash deploy/debian/check.sh" +echo "" diff --git a/poc/poc.sh b/poc/poc.sh index b51552a..f18cd4c 100755 --- a/poc/poc.sh +++ b/poc/poc.sh @@ -5,6 +5,7 @@ # # Requisitos mínimos: Go 1.25, Node.js 18+, PostgreSQL, Redis # Uso: bash poc/poc.sh +# bash poc/poc.sh --clean (borra BD y empieza de cero) # ============================================================================= set -euo pipefail @@ -12,16 +13,37 @@ REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." 
&& pwd)" POC_DIR="$REPO_ROOT/poc" TMP_DIR="/tmp/coconews-poc" PID_FILE="$TMP_DIR/pids" +LOG_DIR="$TMP_DIR/logs" +CLEAN_MODE=false + +[[ "${1:-}" == "--clean" ]] && CLEAN_MODE=true export PATH=$PATH:/usr/local/go/bin -RED='\033[0;31m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m'; BLUE='\033[0;34m'; BOLD='\033[1m'; NC='\033[0m' +RED='\033[0;31m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m' +BLUE='\033[0;34m'; BOLD='\033[1m'; DIM='\033[2m'; NC='\033[0m' + info() { echo -e "${GREEN}[✓]${NC} $*"; } step() { echo -e "${BLUE}[→]${NC} $*"; } warn() { echo -e "${YELLOW}[!]${NC} $*"; } -error() { echo -e "${RED}[✗]${NC} $*"; exit 1; } +error() { + echo -e "${RED}[✗] $*${NC}" + echo -e "${DIM} → Logs en: $LOG_DIR/${NC}" + exit 1 +} -# Manejar Ctrl+C: para todos los procesos del POC +# Muestra las últimas líneas de un log con contexto +show_log_tail() { + local logfile="$1" + local lines="${2:-20}" + if [[ -f "$logfile" && -s "$logfile" ]]; then + echo -e "${DIM}--- últimas líneas de $(basename "$logfile") ---${NC}" + tail -n "$lines" "$logfile" | sed 's/^/ /' + echo -e "${DIM}--- fin del log ---${NC}" + fi +} + +# Manejar Ctrl+C cleanup() { echo "" warn "Deteniendo POC..." @@ -31,12 +53,15 @@ cleanup() { done < "$PID_FILE" rm -f "$PID_FILE" fi + # Parar Redis POC si está corriendo + [[ -f "$TMP_DIR/redis-poc.pid" ]] && \ + kill "$(cat "$TMP_DIR/redis-poc.pid")" 2>/dev/null || true echo -e "${YELLOW}POC detenido.${NC}" exit 0 } trap cleanup INT TERM -mkdir -p "$TMP_DIR" +mkdir -p "$TMP_DIR" "$LOG_DIR" "$TMP_DIR/bin" > "$PID_FILE" echo "" @@ -44,24 +69,87 @@ echo -e "${BOLD}=================================================${NC}" echo -e "${BOLD} COCONEWS · POC Local${NC}" echo -e "${BOLD}=================================================${NC}" echo "" +[[ "$CLEAN_MODE" == "true" ]] && warn "Modo --clean: se borrará la BD de prueba anterior" # ============================================================================= # 1. 
VERIFICAR PREREQUISITOS # ============================================================================= step "Verificando prerequisitos..." -command -v go &>/dev/null || error "Go no encontrado. Instala Go 1.25+." -command -v psql &>/dev/null || error "PostgreSQL no encontrado. Instala postgresql." -command -v redis-cli &>/dev/null || error "Redis no encontrado. Instala redis-server." -command -v node &>/dev/null || error "Node.js no encontrado. Instala Node.js 18+." -info "Prerequisitos OK (Go: $(go version | awk '{print $3}'), Node: $(node -v))" + +PREREQ_OK=true + +# Go +if ! command -v go &>/dev/null; then + echo -e " ${RED}[✗]${NC} Go no encontrado" + echo -e " Instala Go 1.25: https://go.dev/dl/" + echo -e " O en Debian: sudo bash deploy/debian/prerequisites.sh" + PREREQ_OK=false +else + GO_VER=$(go version | awk '{print $3}' | tr -d 'go') + IFS='.' read -ra V <<< "$GO_VER" + if [[ "${V[0]}" -lt 1 ]] || [[ "${V[0]}" -eq 1 && "${V[1]:-0}" -lt 22 ]]; then + echo -e " ${RED}[✗]${NC} Go ${GO_VER} instalado, se necesita 1.22+ (recomendado 1.25)" + PREREQ_OK=false + else + echo -e " ${GREEN}[✓]${NC} Go ${GO_VER}" + fi +fi + +# Node.js +if ! command -v node &>/dev/null; then + echo -e " ${RED}[✗]${NC} Node.js no encontrado" + echo -e " Instala: curl -fsSL https://deb.nodesource.com/setup_20.x | sudo bash -" + echo -e " sudo apt-get install -y nodejs" + PREREQ_OK=false +else + NODE_VER=$(node -v | tr -d 'v') + NODE_MAJOR=$(echo "$NODE_VER" | cut -d. -f1) + if [[ "$NODE_MAJOR" -lt 18 ]]; then + echo -e " ${RED}[✗]${NC} Node.js v${NODE_VER} instalado, se necesita v18+" + PREREQ_OK=false + else + echo -e " ${GREEN}[✓]${NC} Node.js v${NODE_VER}" + fi +fi + +# PostgreSQL +if ! command -v psql &>/dev/null; then + echo -e " ${RED}[✗]${NC} psql no encontrado" + echo -e " Instala: sudo apt-get install -y postgresql postgresql-client" + PREREQ_OK=false +else + echo -e " ${GREEN}[✓]${NC} PostgreSQL $(psql --version | awk '{print $3}')" +fi + +# Redis +if ! 
command -v redis-cli &>/dev/null; then + echo -e " ${RED}[✗]${NC} redis-cli no encontrado" + echo -e " Instala: sudo apt-get install -y redis-server" + PREREQ_OK=false +else + echo -e " ${GREEN}[✓]${NC} Redis $(redis-server --version | awk '{print $3}' | tr -d ',')" +fi + +# curl (para health check del backend) +if ! command -v curl &>/dev/null; then + echo -e " ${RED}[✗]${NC} curl no encontrado" + echo -e " Instala: sudo apt-get install -y curl" + PREREQ_OK=false +fi + +[[ "$PREREQ_OK" == "false" ]] && { + echo "" + error "Faltan prerequisitos. Corrígelos y vuelve a ejecutar." +} +info "Todos los prerequisitos OK" # ============================================================================= -# 2. CONFIGURACION POC (sin contrasenas fuertes, solo local) +# 2. CONFIGURACION POC # ============================================================================= POC_DB_NAME="coconews_poc" POC_DB_USER="coconews_poc" POC_DB_PASS="poc_password_local" -POC_REDIS_PORT="6380" # Puerto alternativo para no interferir con Redis principal +POC_REDIS_PORT="6380" POC_API_PORT="18080" POC_FRONTEND_PORT="18001" @@ -71,68 +159,150 @@ export SECRET_KEY="poc_secret_key_solo_para_desarrollo_local" export SERVER_PORT="$POC_API_PORT" export GIN_MODE="release" +# Verificar que los puertos necesarios estén libres +check_port() { + local port="$1" name="$2" + if ss -tlnp 2>/dev/null | grep -q ":${port} " || \ + lsof -i ":${port}" &>/dev/null 2>&1; then + warn "Puerto ${port} (${name}) ya en uso." + echo -e " Proceso usando ese puerto:" + ss -tlnp 2>/dev/null | grep ":${port} " | sed 's/^/ /' || true + echo -e " Puedes cambiarlo editando las variables POC_*_PORT en poc.sh" + return 1 + fi + return 0 +} + +step "Verificando puertos disponibles..." 
+PORT_OK=true +check_port "$POC_API_PORT" "API backend" || PORT_OK=false +check_port "$POC_FRONTEND_PORT" "Frontend" || PORT_OK=false +check_port "$POC_REDIS_PORT" "Redis POC" || PORT_OK=false +[[ "$PORT_OK" == "false" ]] && error "Hay puertos en conflicto. Resuélvelos antes de continuar." +info "Puertos ${POC_API_PORT}, ${POC_FRONTEND_PORT}, ${POC_REDIS_PORT} disponibles" + # ============================================================================= -# 3. POSTGRESQL - crear DB de prueba +# 3. POSTGRESQL # ============================================================================= step "Preparando base de datos POC (${POC_DB_NAME})..." # Verificar que PostgreSQL está corriendo -pg_isready -q || { +if ! pg_isready -q 2>/dev/null; then warn "PostgreSQL no está corriendo. Intentando iniciar..." sudo systemctl start postgresql 2>/dev/null || \ - sudo service postgresql start 2>/dev/null || \ - error "No se puede iniciar PostgreSQL. Inícialo manualmente." -} + sudo service postgresql start 2>/dev/null || { + echo -e " ${RED}Fallo al iniciar PostgreSQL.${NC}" + echo -e " Posibles causas y soluciones:" + echo -e " • Ver logs: sudo journalctl -u postgresql -n 30" + echo -e " • Ver estado: sudo systemctl status postgresql" + echo -e " • Puerto en uso: sudo ss -tlnp | grep 5432" + echo -e " • Datos corruptos: sudo pg_ctlcluster 16 main status" + error "PostgreSQL no pudo iniciarse." + } + sleep 2 + pg_isready -q || error "PostgreSQL sigue sin responder tras el inicio." 
+fi +info "PostgreSQL corriendo" -# Crear usuario y BD de prueba +# Limpiar si --clean +if [[ "$CLEAN_MODE" == "true" ]]; then + sudo -u postgres psql -q -c "DROP DATABASE IF EXISTS ${POC_DB_NAME};" 2>/dev/null || true + sudo -u postgres psql -q -c "DROP USER IF EXISTS ${POC_DB_USER};" 2>/dev/null || true + info "BD anterior eliminada" +fi + +# Crear usuario POC sudo -u postgres psql -q -tc "SELECT 1 FROM pg_roles WHERE rolname='${POC_DB_USER}'" \ - | grep -q 1 || \ - sudo -u postgres psql -q -c "CREATE USER ${POC_DB_USER} WITH PASSWORD '${POC_DB_PASS}';" 2>/dev/null + | grep -q 1 2>/dev/null || \ + sudo -u postgres psql -q -c "CREATE USER ${POC_DB_USER} WITH PASSWORD '${POC_DB_PASS}';" \ + 2>"$LOG_DIR/psql-setup.log" || { + show_log_tail "$LOG_DIR/psql-setup.log" + echo -e " Verifica que tienes permisos sudo para el usuario postgres:" + echo -e " sudo -u postgres psql -c '\\du'" + error "No se pudo crear el usuario PostgreSQL ${POC_DB_USER}." + } +# Crear BD sudo -u postgres psql -q -tc "SELECT 1 FROM pg_database WHERE datname='${POC_DB_NAME}'" \ - | grep -q 1 || \ - sudo -u postgres createdb -O "${POC_DB_USER}" "${POC_DB_NAME}" 2>/dev/null + | grep -q 1 2>/dev/null || \ + sudo -u postgres createdb -O "${POC_DB_USER}" "${POC_DB_NAME}" \ + 2>"$LOG_DIR/psql-createdb.log" || { + show_log_tail "$LOG_DIR/psql-createdb.log" + error "No se pudo crear la base de datos ${POC_DB_NAME}." 
+ } -# Aplicar schema completo +# Schema sudo -u postgres psql -q -d "${POC_DB_NAME}" \ - -f "$REPO_ROOT/init-db/00-complete-schema.sql" 2>/dev/null || true + -f "$REPO_ROOT/init-db/00-complete-schema.sql" \ + >"$LOG_DIR/psql-schema.log" 2>&1 || \ + warn "Algunos statements del schema fallaron (puede ser normal si ya existían)" -# Otorgar permisos al usuario POC +# Permisos sudo -u postgres psql -q -d "${POC_DB_NAME}" \ -c "GRANT ALL PRIVILEGES ON ALL TABLES IN SCHEMA public TO ${POC_DB_USER}; - GRANT ALL PRIVILEGES ON ALL SEQUENCES IN SCHEMA public TO ${POC_DB_USER};" \ + GRANT ALL PRIVILEGES ON ALL SEQUENCES IN SCHEMA public TO ${POC_DB_USER}; + GRANT ALL PRIVILEGES ON ALL FUNCTIONS IN SCHEMA public TO ${POC_DB_USER};" \ 2>/dev/null || true -# Cargar datos de muestra -sudo -u postgres psql -q -d "${POC_DB_NAME}" -f "$POC_DIR/seed.sql" 2>/dev/null || true +# Seed +sudo -u postgres psql -q -d "${POC_DB_NAME}" \ + -f "$POC_DIR/seed.sql" >"$LOG_DIR/psql-seed.log" 2>&1 || \ + warn "Algunos inserts del seed fallaron (puede que ya existieran)" -NEWS_COUNT=$(sudo -u postgres psql -tq -d "${POC_DB_NAME}" -c "SELECT COUNT(*) FROM noticias;" 2>/dev/null | tr -d ' ') -info "BD lista: ${NEWS_COUNT} noticias de prueba cargadas" +NEWS_COUNT=$(sudo -u postgres psql -tq -d "${POC_DB_NAME}" \ + -c "SELECT COUNT(*) FROM noticias;" 2>/dev/null | tr -d ' \n' || echo "?") +info "BD lista: ${NEWS_COUNT} noticias de prueba" # ============================================================================= # 4. REDIS (instancia temporal en puerto alternativo) # ============================================================================= step "Iniciando Redis en puerto ${POC_REDIS_PORT}..." 
-redis-server --port "$POC_REDIS_PORT" --daemonize yes \ - --logfile "$TMP_DIR/redis-poc.log" \ + +redis-server \ + --port "$POC_REDIS_PORT" \ + --daemonize yes \ + --logfile "$LOG_DIR/redis.log" \ --pidfile "$TMP_DIR/redis-poc.pid" \ - --maxmemory 128mb --maxmemory-policy allkeys-lru \ - 2>/dev/null || warn "Redis ya corriendo en ${POC_REDIS_PORT}" + --maxmemory 128mb \ + --maxmemory-policy allkeys-lru \ + 2>"$LOG_DIR/redis-start.log" || { + show_log_tail "$LOG_DIR/redis-start.log" + echo -e " Posibles causas:" + echo -e " • Puerto ${POC_REDIS_PORT} en uso: sudo ss -tlnp | grep ${POC_REDIS_PORT}" + echo -e " • Permisos: redis-server necesita poder escribir en $LOG_DIR" + error "Redis no pudo iniciarse en puerto ${POC_REDIS_PORT}." + } + sleep 1 -redis-cli -p "$POC_REDIS_PORT" ping &>/dev/null && info "Redis OK (puerto ${POC_REDIS_PORT})" +if redis-cli -p "$POC_REDIS_PORT" ping 2>/dev/null | grep -q PONG; then + info "Redis OK (puerto ${POC_REDIS_PORT})" +else + show_log_tail "$LOG_DIR/redis.log" + error "Redis inició pero no responde a PING." +fi cat "$TMP_DIR/redis-poc.pid" 2>/dev/null >> "$PID_FILE" || true # ============================================================================= # 5. COMPILAR BACKEND GO # ============================================================================= step "Compilando backend Go..." -mkdir -p "$TMP_DIR/bin" (cd "$REPO_ROOT/backend" && \ - CGO_ENABLED=0 go build -buildvcs=false -o "$TMP_DIR/bin/server" ./cmd/server \ - 2>"$TMP_DIR/build-backend.log") || { - cat "$TMP_DIR/build-backend.log" - error "Fallo al compilar backend. Ver log arriba." 
+ CGO_ENABLED=0 go build -buildvcs=false \ + -o "$TMP_DIR/bin/server" ./cmd/server \ + 2>"$LOG_DIR/build-backend.log") || { + echo "" + # Filtrar errores relevantes del log de compilación + echo -e " ${RED}Error de compilación:${NC}" + grep -E "^(.*\.go:[0-9]+:|error:|undefined:)" "$LOG_DIR/build-backend.log" \ + | head -20 | sed 's/^/ /' || show_log_tail "$LOG_DIR/build-backend.log" 15 + echo "" + echo -e " Posibles causas:" + echo -e " • Version de Go insuficiente: $(go version)" + echo -e " Se necesita 1.22+. Instala: sudo bash deploy/debian/prerequisites.sh" + echo -e " • Dependencias faltantes: cd backend && go mod download" + echo -e " • Log completo: cat $LOG_DIR/build-backend.log" + error "Fallo al compilar backend Go." } info "Backend compilado OK" @@ -140,74 +310,122 @@ info "Backend compilado OK" # 6. ARRANCAR BACKEND API # ============================================================================= step "Arrancando API en puerto ${POC_API_PORT}..." -"$TMP_DIR/bin/server" > "$TMP_DIR/backend.log" 2>&1 & + +"$TMP_DIR/bin/server" > "$LOG_DIR/backend.log" 2>&1 & BACKEND_PID=$! echo "$BACKEND_PID" >> "$PID_FILE" -# Esperar a que el backend responda -for i in {1..15}; do +# Esperar con feedback visual +printf " Esperando respuesta" +BACKEND_UP=false +for i in {1..20}; do sleep 1 + printf "." if curl -sf "http://127.0.0.1:${POC_API_PORT}/api/stats" &>/dev/null; then - info "API respondiendo en http://127.0.0.1:${POC_API_PORT}" + BACKEND_UP=true break fi - if [[ $i -eq 15 ]]; then - cat "$TMP_DIR/backend.log" - error "El backend no responde. Ver log arriba." + # Detectar si el proceso murió + if ! 
kill -0 "$BACKEND_PID" 2>/dev/null; then + break fi done +echo "" + +if [[ "$BACKEND_UP" == "false" ]]; then + echo "" + echo -e " ${RED}El backend no arrancó correctamente.${NC}" + echo -e " ${BOLD}Log del backend:${NC}" + show_log_tail "$LOG_DIR/backend.log" 25 + echo "" + echo -e " Posibles causas:" + echo -e " • Error de conexión a BD: revisa DATABASE_URL" + echo -e " Prueba: psql \"${DATABASE_URL}\" -c 'SELECT 1'" + echo -e " • Error de conexión a Redis: revisa REDIS_URL" + echo -e " Prueba: redis-cli -p ${POC_REDIS_PORT} ping" + echo -e " • Puerto ${POC_API_PORT} ocupado: ss -tlnp | grep ${POC_API_PORT}" + echo -e " • Log completo: cat $LOG_DIR/backend.log" + error "Backend no responde." +fi +info "API respondiendo en http://127.0.0.1:${POC_API_PORT}" # ============================================================================= # 7. FRONTEND REACT # ============================================================================= -step "Preparando frontend..." +step "Preparando frontend React..." cd "$REPO_ROOT/frontend" if [[ ! -d node_modules ]]; then - step " Instalando dependencias npm (primera vez)..." - npm install --silent + step " Instalando dependencias npm (primera vez, puede tardar)..." + npm install 2>"$LOG_DIR/npm-install.log" || { + show_log_tail "$LOG_DIR/npm-install.log" 20 + echo -e " Posibles causas:" + echo -e " • Sin conexión a internet (npm registry)" + echo -e " • Versión de Node.js incompatible: $(node -v)" + echo -e " • Disco lleno: df -h ." + echo -e " • Prueba manualmente: cd frontend && npm install" + error "npm install falló." + } fi -# Compilar apuntando al API local del POC +step " Compilando frontend..." VITE_API_URL="http://127.0.0.1:${POC_API_PORT}" \ -npm run build -- --outDir "$TMP_DIR/frontend-dist" 2>"$TMP_DIR/build-frontend.log" || { - cat "$TMP_DIR/build-frontend.log" - error "Fallo al compilar frontend. Ver log arriba." 
+npm run build -- --outDir "$TMP_DIR/frontend-dist" \ + >"$LOG_DIR/build-frontend.log" 2>&1 || { + echo -e " ${RED}Error de compilación del frontend:${NC}" + grep -E "(error TS|Error:|ERROR)" "$LOG_DIR/build-frontend.log" \ + | head -15 | sed 's/^/ /' || show_log_tail "$LOG_DIR/build-frontend.log" 15 + echo "" + echo -e " Posibles causas:" + echo -e " • Error TypeScript: revisa cambios recientes en src/" + echo -e " • Log completo: cat $LOG_DIR/build-frontend.log" + error "Compilación del frontend falló." } info "Frontend compilado OK" -# Servir frontend con npx serve (simple, sin nginx) -step "Sirviendo frontend en puerto ${POC_FRONTEND_PORT}..." +step " Sirviendo frontend en puerto ${POC_FRONTEND_PORT}..." npx --yes serve "$TMP_DIR/frontend-dist" -l "$POC_FRONTEND_PORT" \ - > "$TMP_DIR/frontend.log" 2>&1 & + > "$LOG_DIR/frontend-serve.log" 2>&1 & FRONTEND_PID=$! echo "$FRONTEND_PID" >> "$PID_FILE" + sleep 2 +if ! kill -0 "$FRONTEND_PID" 2>/dev/null; then + show_log_tail "$LOG_DIR/frontend-serve.log" + echo -e " Prueba manual: npx serve $TMP_DIR/frontend-dist -l ${POC_FRONTEND_PORT}" + error "El servidor del frontend no arrancó." 
+fi +info "Frontend sirviendo en http://127.0.0.1:${POC_FRONTEND_PORT}" cd "$REPO_ROOT" # ============================================================================= -# LISTO +# RESUMEN FINAL # ============================================================================= echo "" echo -e "${BOLD}${GREEN}=================================================${NC}" echo -e "${BOLD}${GREEN} COCONEWS POC corriendo${NC}" echo -e "${BOLD}${GREEN}=================================================${NC}" echo "" -echo -e " ${BOLD}Frontend:${NC} http://127.0.0.1:${POC_FRONTEND_PORT}" -echo -e " ${BOLD}API:${NC} http://127.0.0.1:${POC_API_PORT}/api/stats" +echo -e " ${BOLD}Abrir en el navegador:${NC}" +echo -e " → http://127.0.0.1:${POC_FRONTEND_PORT}" echo "" -echo -e " ${BOLD}Login:${NC} Registra el primer usuario en la UI" -echo -e " (será admin automáticamente)" +echo -e " ${BOLD}Endpoints útiles:${NC}" +echo -e " API stats: http://127.0.0.1:${POC_API_PORT}/api/stats" +echo -e " API noticias: http://127.0.0.1:${POC_API_PORT}/api/news" echo "" -echo -e " ${BOLD}Noticias:${NC} ${NEWS_COUNT} artículos de prueba en español" -echo -e " ${YELLOW}Nota:${NC} Sin workers ML activos." -echo -e " Noticias no tendrán traducción ni entidades." +echo -e " ${BOLD}Primer login:${NC}" +echo -e " Regístrate en la UI → el primer usuario es admin automáticamente" echo "" -echo -e " ${BLUE}Logs:${NC} $TMP_DIR/*.log" +echo -e " ${BOLD}Datos cargados:${NC} ${NEWS_COUNT} noticias de prueba en español" +echo -e " ${YELLOW}Sin workers ML:${NC} no hay traducción ni entidades (normal en POC)" echo "" -echo -e " Pulsa ${BOLD}Ctrl+C${NC} para detener el POC." 
+echo -e " ${BOLD}Si algo falla:${NC}" +echo -e " Logs: $LOG_DIR/" +echo -e " Backend: tail -f $LOG_DIR/backend.log" +echo -e " Limpiar: bash poc/poc.sh --clean" +echo "" +echo -e " ${DIM}Pulsa Ctrl+C para detener${NC}" echo "" -# Mantener el script corriendo wait From b3bf3d7a7fdce6d28b12c24f4307f135a9f411f6 Mon Sep 17 00:00:00 2001 From: SITO Date: Mon, 30 Mar 2026 22:29:50 +0200 Subject: [PATCH 6/8] refactor: reorganizar estructura de archivos en raiz MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Antes la raiz tenia 20+ archivos sueltos. Ahora organizado en: docs/ 10 archivos .md de documentacion tecnica scripts/ 3 scripts utilitarios (credentials, migrate, verify) config/ entity_config.json (aliases y blacklist NER) data/ feeds.csv (feeds precargados) Eliminados restos de Docker que ya no aplican: .dockerignore, .env.example, .env.secure.example, nginx.conf (raiz) Makefile: eliminados targets docker-build, añadidos install/rebuild/check/poc Referencias actualizadas en: deploy/debian/install.sh entity_config.json -> config/entity_config.json deploy/debian/build.sh entity_config.json -> config/entity_config.json README.md links a docs/ y data/ actualizados, arbol de estructura del repo reescrito Co-Authored-By: Claude Sonnet 4.6 --- .dockerignore | 13 -- .env.example | 67 ---------- .env.secure.example | 117 ------------------ Makefile | 33 +++-- README.md | 63 +++++++--- .../entity_config.json | 0 feeds.csv => data/feeds.csv | 0 deploy/debian/build.sh | 2 +- deploy/debian/install.sh | 2 +- DEPLOY.md => docs/DEPLOY.md | 0 DEPLOY_DEBIAN.md => docs/DEPLOY_DEBIAN.md | 0 .../FUNCIONES_DE_ARCHIVOS.md | 0 .../IMPLEMENTACION_LLM_RESUMEN.md | 0 .../NEWSPAPER_STYLE_GUIDE.md | 0 QDRANT_SETUP.md => docs/QDRANT_SETUP.md | 0 QUICKSTART_LLM.md => docs/QUICKSTART_LLM.md | 0 SECURITY_AUDIT.md => docs/SECURITY_AUDIT.md | 0 SECURITY_GUIDE.md => docs/SECURITY_GUIDE.md | 0 .../TRANSLATION_FIX_SUMMARY.md | 0 nginx.conf | 97 --------------- 
.../generate_secure_credentials.sh | 0 .../migrate_to_secure.sh | 0 .../verify_security.sh | 0 23 files changed, 59 insertions(+), 335 deletions(-) delete mode 100644 .dockerignore delete mode 100644 .env.example delete mode 100644 .env.secure.example rename entity_config.json => config/entity_config.json (100%) rename feeds.csv => data/feeds.csv (100%) rename DEPLOY.md => docs/DEPLOY.md (100%) rename DEPLOY_DEBIAN.md => docs/DEPLOY_DEBIAN.md (100%) rename FUNCIONES_DE_ARCHIVOS.md => docs/FUNCIONES_DE_ARCHIVOS.md (100%) rename IMPLEMENTACION_LLM_RESUMEN.md => docs/IMPLEMENTACION_LLM_RESUMEN.md (100%) rename NEWSPAPER_STYLE_GUIDE.md => docs/NEWSPAPER_STYLE_GUIDE.md (100%) rename QDRANT_SETUP.md => docs/QDRANT_SETUP.md (100%) rename QUICKSTART_LLM.md => docs/QUICKSTART_LLM.md (100%) rename SECURITY_AUDIT.md => docs/SECURITY_AUDIT.md (100%) rename SECURITY_GUIDE.md => docs/SECURITY_GUIDE.md (100%) rename TRANSLATION_FIX_SUMMARY.md => docs/TRANSLATION_FIX_SUMMARY.md (100%) delete mode 100644 nginx.conf rename generate_secure_credentials.sh => scripts/generate_secure_credentials.sh (100%) rename migrate_to_secure.sh => scripts/migrate_to_secure.sh (100%) rename verify_security.sh => scripts/verify_security.sh (100%) diff --git a/.dockerignore b/.dockerignore deleted file mode 100644 index b472c77..0000000 --- a/.dockerignore +++ /dev/null @@ -1,13 +0,0 @@ -.git -pgdata -pgdata-replica -pgdata-replica.old.* -pgdata.failed_restore -redis-data -hf_cache -qdrant_storage - -venv -__pycache__ -*.pyc -*.log diff --git a/.env.example b/.env.example deleted file mode 100644 index 4f5ab34..0000000 --- a/.env.example +++ /dev/null @@ -1,67 +0,0 @@ -# Database Configuration -POSTGRES_DB=rss -POSTGRES_USER=rss -POSTGRES_PASSWORD=change_this_password -DB_NAME=rss -DB_USER=rss -DB_PASS=change_this_password -DB_HOST=db -DB_PORT=5432 -DB_WRITE_HOST=db -DB_READ_HOST=db-replica - -# Redis Configuration -REDIS_HOST=redis -REDIS_PORT=6379 - -# Application Secrets 
-SECRET_KEY=change_this_to_a_long_random_string - -# External Services -ALLTALK_URL=http://host.docker.internal:7851 - -# AI Models & Workers -RSS_MAX_WORKERS=3 -# Translation Pipeline -TARGET_LANGS=es -TRANSLATOR_BATCH=16 -SCHEDULER_BATCH=2000 -SCHEDULER_SLEEP=30 -LANG_DETECT_BATCH=1000 -LANG_DETECT_SLEEP=60 - -# RSS Ingestor Configuration -RSS_POKE_INTERVAL_MIN=15 -RSS_MAX_FAILURES=10 -RSS_FEED_TIMEOUT=60 - -# URL Feed Discovery Worker -URL_DISCOVERY_INTERVAL_MIN=15 -URL_DISCOVERY_BATCH_SIZE=10 -MAX_FEEDS_PER_URL=5 - -# CTranslate2 / AI Model Paths -CT2_MODEL_PATH=/app/models/nllb-ct2 -CT2_DEVICE=cuda -CT2_COMPUTE_TYPE=int8_float16 -UNIVERSAL_MODEL=facebook/nllb-200-distilled-600M - -# Embeddings -EMB_MODEL=sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2 -EMB_BATCH=64 -EMB_DEVICE=cuda - -# NER -NER_LANG=es -NER_BATCH=64 - -# Flask / Gunicorn -GUNICORN_WORKERS=8 -FLASK_DEBUG=0 - -# Qdrant Configuration -QDRANT_HOST=qdrant -QDRANT_PORT=6333 -QDRANT_COLLECTION_NAME=news_vectors -QDRANT_BATCH_SIZE=100 -QDRANT_SLEEP_IDLE=30 diff --git a/.env.secure.example b/.env.secure.example deleted file mode 100644 index 68b84cc..0000000 --- a/.env.secure.example +++ /dev/null @@ -1,117 +0,0 @@ -# ================================================================================== -# SEGURIDAD: CONFIGURACIÓN DE PRODUCCIÓN -# ================================================================================== -# -# IMPORTANTE: -# 1. Copia este archivo a .env -# 2. Cambia TODOS los valores de contraseñas y secrets -# 3. NO compartas este archivo en repositorios públicos -# 4. 
Añade .env al .gitignore -# -# ================================================================================== - -# ================================================================================== -# DATABASE CONFIGURATION - PostgreSQL -# ================================================================================== -POSTGRES_DB=rss -POSTGRES_USER=rss -# CRÍTICO: Genera una contraseña fuerte (mínimo 32 caracteres aleatorios) -# Ejemplo para generar: openssl rand -base64 32 -POSTGRES_PASSWORD=CAMBIAR_ESTO_POR_UNA_CONTRASEÑA_FUERTE_DE_32_CARACTERES - -DB_NAME=rss -DB_USER=rss -DB_PASS=CAMBIAR_ESTO_POR_UNA_CONTRASEÑA_FUERTE_DE_32_CARACTERES -DB_HOST=db -DB_PORT=5432 -DB_WRITE_HOST=db -DB_READ_HOST=db-replica - -# ================================================================================== -# REDIS CONFIGURATION - Autenticación habilitada -# ================================================================================== -REDIS_HOST=redis -REDIS_PORT=6379 -# CRÍTICO: Genera una contraseña fuerte para Redis -# Ejemplo: openssl rand -base64 32 -REDIS_PASSWORD=CAMBIAR_ESTO_POR_UNA_CONTRASEÑA_FUERTE_REDIS - -# ================================================================================== -# APPLICATION SECRETS -# ================================================================================== -# CRÍTICO: Secret key para Flask - debe ser único y secreto -# Genera con: python -c "import secrets; print(secrets.token_hex(32))" -SECRET_KEY=CAMBIAR_ESTO_POR_UN_TOKEN_HEX_DE_64_CARACTERES - -# ================================================================================== -# MONITORING - Grafana -# ================================================================================== -# IMPORTANTE: Cambia el password de admin de Grafana -GRAFANA_PASSWORD=CAMBIAR_ESTO_POR_UNA_CONTRASEÑA_FUERTE_GRAFANA - -# ================================================================================== -# EXTERNAL SERVICES -# 
================================================================================== -ALLTALK_URL=http://host.docker.internal:7851 - -# ================================================================================== -# AI MODELS & WORKERS -# ================================================================================== -RSS_MAX_WORKERS=3 -TARGET_LANGS=es -TRANSLATOR_BATCH=128 -ENQUEUE=300 - -# RSS Ingestor Configuration -RSS_POKE_INTERVAL_MIN=15 -RSS_MAX_FAILURES=10 -RSS_FEED_TIMEOUT=60 - -# URL Feed Discovery Worker -URL_DISCOVERY_INTERVAL_MIN=15 -URL_DISCOVERY_BATCH_SIZE=10 -MAX_FEEDS_PER_URL=5 - -# CTranslate2 / AI Model Paths -CT2_MODEL_PATH=/app/models/nllb-ct2 -CT2_DEVICE=cuda -CT2_COMPUTE_TYPE=int8_float16 -UNIVERSAL_MODEL=facebook/nllb-200-distilled-600M - -# Embeddings -EMB_MODEL=sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2 -EMB_BATCH=64 -EMB_DEVICE=cuda - -# NER -NER_LANG=es -NER_BATCH=64 - -# Flask / Gunicorn -GUNICORN_WORKERS=8 -FLASK_DEBUG=0 - -# Qdrant Configuration -QDRANT_HOST=qdrant -QDRANT_PORT=6333 -QDRANT_COLLECTION_NAME=news_vectors -QDRANT_BATCH_SIZE=100 -QDRANT_SLEEP_IDLE=30 - -# ================================================================================== -# COMANDOS ÚTILES PARA GENERAR CONTRASEÑAS SEGURAS -# ================================================================================== -# -# PostgreSQL Password (32 caracteres): -# openssl rand -base64 32 -# -# Redis Password (32 caracteres): -# openssl rand -base64 32 -# -# Flask Secret Key (64 hex chars): -# python -c "import secrets; print(secrets.token_hex(32))" -# -# Grafana Password (fuerte): -# openssl rand -base64 24 -# -# ================================================================================== diff --git a/Makefile b/Makefile index 5d463f2..3d7a6fb 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ # RSS2 Workers Makefile -.PHONY: all build clean deps ingestor scraper discovery topics related qdrant server +.PHONY: all build clean deps 
ingestor scraper discovery topics related qdrant server install rebuild check poc # Binary output directory BIN_DIR := bin @@ -69,21 +69,16 @@ run-qdrant: DB_HOST=localhost DB_PORT=5432 DB_NAME=rss DB_USER=rss DB_PASS=rss \ QDRANT_HOST=localhost QDRANT_PORT=6333 OLLAMA_URL=http://localhost:11434 $(QDRANT) -# Docker builds -docker-build: - docker build -t rss2-ingestor -f rss-ingestor-go/Dockerfile ./rss-ingestor-go - docker build -t rss2-server -f backend/Dockerfile ./backend - docker build -t rss2-scraper -f Dockerfile.scraper ./backend - docker build -t rss2-discovery -f Dockerfile.discovery ./backend - docker build -t rss2-topics -f Dockerfile.topics ./backend - docker build -t rss2-related -f Dockerfile.related ./backend - docker build -t rss2-qdrant -f Dockerfile.qdrant ./backend - docker build -t rss2-langdetect -f Dockerfile . - docker build -t rss2-scheduler -f Dockerfile.scheduler . - docker build -t rss2-translator -f Dockerfile.translator . - docker build -t rss2-translator-gpu -f Dockerfile.translator-gpu . - docker build -t rss2-embeddings -f Dockerfile.embeddings_worker . - docker build -t rss2-ner -f Dockerfile . - docker build -t rss2-llm-categorizer -f Dockerfile.llm_worker . - docker build -t rss2-frontend -f frontend/Dockerfile ./frontend - docker build -t rss2-nginx -f Dockerfile.nginx . 
+# Despliegue en Debian (sin Docker) +install: + sudo bash deploy/debian/prerequisites.sh + sudo bash deploy/debian/install.sh + +rebuild: + sudo bash deploy/debian/build.sh + +check: + bash deploy/debian/check.sh + +poc: + bash poc/poc.sh diff --git a/README.md b/README.md index d70ea43..8f18314 100644 --- a/README.md +++ b/README.md @@ -109,7 +109,7 @@ Compila los binarios Go, el frontend React, crea los servicios systemd y arranca http://IP_DEL_SERVIDOR:8001 ``` -Guía completa: [DEPLOY_DEBIAN.md](DEPLOY_DEBIAN.md) +Guía completa: [docs/DEPLOY_DEBIAN.md](docs/DEPLOY_DEBIAN.md) --- @@ -155,23 +155,46 @@ sudo bash deploy/debian/build.sh ## Estructura del repositorio ``` -├── backend/ Go — API REST + workers (scraper, discovery, wiki, topics, related, qdrant) -├── rss-ingestor-go/ Go — Ingestor de feeds RSS -├── frontend/ React + TypeScript + Tailwind -├── workers/ Python — ML workers (traducción, embeddings, NER, cluster, categorización) -├── init-db/ SQL — Schema y datos iniciales -├── migrations/ SQL — Migraciones incrementales -├── deploy/debian/ Scripts de despliegue para Debian sin Docker -│ ├── prerequisites.sh Instala todas las dependencias del sistema -│ ├── install.sh Instalación completa -│ ├── build.sh Recompila y reinicia tras actualizar código -│ ├── env.example Plantilla de variables de entorno -│ ├── nginx.conf Configuración nginx para despliegue nativo -│ └── systemd/ Ficheros de servicio systemd (16 servicios) -├── poc/ -│ ├── poc.sh POC local con datos de prueba (sin Docker, sin ML) -│ └── seed.sql Datos de muestra para el POC -├── feeds.csv Feeds RSS precargados para importar desde el admin -├── entity_config.json Aliases y blacklist para normalización de entidades NER -└── DEPLOY_DEBIAN.md Guía detallada de despliegue en Debian +├── README.md +├── requirements.txt Dependencias Python para workers ML +├── Makefile Compilación local de binarios Go +│ +├── backend/ Go — API REST + workers (scraper, discovery, wiki, topics, related, qdrant) +├── 
rss-ingestor-go/ Go — Ingestor de feeds RSS +├── frontend/ React + TypeScript + Tailwind +├── workers/ Python — Workers ML (traducción, embeddings, NER, cluster, categorización) +│ +├── init-db/ SQL — Schema completo y datos iniciales +├── migrations/ SQL — Migraciones incrementales +│ +├── config/ +│ └── entity_config.json Aliases y blacklist para normalización de entidades NER +│ +├── data/ +│ └── feeds.csv Feeds RSS precargados para importar desde el admin +│ +├── docs/ +│ ├── DEPLOY_DEBIAN.md Guía detallada de despliegue en Debian +│ ├── SECURITY_GUIDE.md Guía de seguridad +│ ├── SECURITY_AUDIT.md Resultado del audit de seguridad +│ ├── QDRANT_SETUP.md Configuración de Qdrant +│ └── ... Resto de documentación técnica +│ +├── scripts/ +│ ├── generate_secure_credentials.sh +│ ├── migrate_to_secure.sh +│ └── verify_security.sh +│ +├── deploy/debian/ Despliegue nativo en Debian (sin Docker) +│ ├── prerequisites.sh Instala todas las dependencias del sistema +│ ├── install.sh Instalación completa +│ ├── build.sh Recompila y reinicia tras actualizar código +│ ├── check.sh Diagnóstico del sistema +│ ├── env.example Plantilla de variables de entorno +│ ├── nginx.conf Configuración nginx +│ └── systemd/ 16 ficheros de servicio systemd +│ +└── poc/ + ├── poc.sh POC local en 2 minutos (sin Docker, sin ML) + └── seed.sql 10 noticias de muestra en español ``` diff --git a/entity_config.json b/config/entity_config.json similarity index 100% rename from entity_config.json rename to config/entity_config.json diff --git a/feeds.csv b/data/feeds.csv similarity index 100% rename from feeds.csv rename to data/feeds.csv diff --git a/deploy/debian/build.sh b/deploy/debian/build.sh index 6dd81b2..43efd5b 100755 --- a/deploy/debian/build.sh +++ b/deploy/debian/build.sh @@ -55,7 +55,7 @@ fi # --- Workers Python --- info "Sincronizando workers Python..." 
rsync -a --delete "$REPO_ROOT/workers/" "$RSS2_HOME/src/workers/" -cp "$REPO_ROOT/entity_config.json" "$RSS2_HOME/src/" 2>/dev/null || true +cp "$REPO_ROOT/config/entity_config.json" "$RSS2_HOME/src/" 2>/dev/null || true info " [OK] workers Python" chown -R rss2:rss2 "$RSS2_HOME/bin" "$RSS2_HOME/frontend/dist" "$RSS2_HOME/src" diff --git a/deploy/debian/install.sh b/deploy/debian/install.sh index 9627bbc..128336f 100755 --- a/deploy/debian/install.sh +++ b/deploy/debian/install.sh @@ -172,7 +172,7 @@ fi # Copiar workers Python al directorio de trabajo info "Copiando workers Python..." rsync -a --delete "$REPO_ROOT/workers/" "$RSS2_HOME/src/workers/" -cp "$REPO_ROOT/entity_config.json" "$RSS2_HOME/src/" 2>/dev/null || true +cp "$REPO_ROOT/config/entity_config.json" "$RSS2_HOME/src/" 2>/dev/null || true # ============================================================================= # 7. COMPILAR GO (backend + workers) diff --git a/DEPLOY.md b/docs/DEPLOY.md similarity index 100% rename from DEPLOY.md rename to docs/DEPLOY.md diff --git a/DEPLOY_DEBIAN.md b/docs/DEPLOY_DEBIAN.md similarity index 100% rename from DEPLOY_DEBIAN.md rename to docs/DEPLOY_DEBIAN.md diff --git a/FUNCIONES_DE_ARCHIVOS.md b/docs/FUNCIONES_DE_ARCHIVOS.md similarity index 100% rename from FUNCIONES_DE_ARCHIVOS.md rename to docs/FUNCIONES_DE_ARCHIVOS.md diff --git a/IMPLEMENTACION_LLM_RESUMEN.md b/docs/IMPLEMENTACION_LLM_RESUMEN.md similarity index 100% rename from IMPLEMENTACION_LLM_RESUMEN.md rename to docs/IMPLEMENTACION_LLM_RESUMEN.md diff --git a/NEWSPAPER_STYLE_GUIDE.md b/docs/NEWSPAPER_STYLE_GUIDE.md similarity index 100% rename from NEWSPAPER_STYLE_GUIDE.md rename to docs/NEWSPAPER_STYLE_GUIDE.md diff --git a/QDRANT_SETUP.md b/docs/QDRANT_SETUP.md similarity index 100% rename from QDRANT_SETUP.md rename to docs/QDRANT_SETUP.md diff --git a/QUICKSTART_LLM.md b/docs/QUICKSTART_LLM.md similarity index 100% rename from QUICKSTART_LLM.md rename to docs/QUICKSTART_LLM.md diff --git 
a/SECURITY_AUDIT.md b/docs/SECURITY_AUDIT.md similarity index 100% rename from SECURITY_AUDIT.md rename to docs/SECURITY_AUDIT.md diff --git a/SECURITY_GUIDE.md b/docs/SECURITY_GUIDE.md similarity index 100% rename from SECURITY_GUIDE.md rename to docs/SECURITY_GUIDE.md diff --git a/TRANSLATION_FIX_SUMMARY.md b/docs/TRANSLATION_FIX_SUMMARY.md similarity index 100% rename from TRANSLATION_FIX_SUMMARY.md rename to docs/TRANSLATION_FIX_SUMMARY.md diff --git a/nginx.conf b/nginx.conf deleted file mode 100644 index af65c19..0000000 --- a/nginx.conf +++ /dev/null @@ -1,97 +0,0 @@ -user nginx; -worker_processes auto; -error_log /var/log/nginx/error.log warn; -pid /var/run/nginx.pid; - -events { - worker_connections 2048; - use epoll; -} - -http { - include /etc/nginx/mime.types; - default_type application/octet-stream; - - log_format main '$remote_addr - $remote_user [$time_local] "$request" ' - '$status $body_bytes_sent "$http_referer" ' - '"$http_user_agent" "$http_x_forwarded_for"'; - - access_log /var/log/nginx/access.log main; - - sendfile on; - tcp_nopush on; - tcp_nodelay on; - keepalive_timeout 65; - types_hash_max_size 2048; - client_max_body_size 100M; - - gzip on; - gzip_vary on; - gzip_proxied any; - gzip_comp_level 6; - gzip_types text/plain text/css text/javascript - application/json application/javascript - application/xml text/xml; - - # Upstream for Go API - upstream api_backend { - server backend-go:8080; - keepalive 32; - } - - # Upstream for React Frontend - upstream frontend { - server rss2_frontend:80; - keepalive 16; - } - - server { - listen 80; - server_name _; - - client_body_timeout 60s; - client_header_timeout 60s; - send_timeout 300s; - - # Serve React Frontend - location / { - proxy_pass http://frontend; - proxy_http_version 1.1; - proxy_set_header Upgrade $http_upgrade; - proxy_set_header Connection 'upgrade'; - proxy_set_header Host $host; - proxy_set_header X-Real-IP $remote_addr; - proxy_set_header X-Forwarded-For 
$proxy_add_x_forwarded_for; - proxy_set_header X-Forwarded-Proto $scheme; - proxy_cache_bypass $http_upgrade; - } - - # Proxy to Go API - location /api/ { - proxy_pass http://api_backend/api/; - proxy_http_version 1.1; - proxy_set_header Host $host; - proxy_set_header X-Real-IP $remote_addr; - proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; - proxy_set_header X-Forwarded-Proto $scheme; - proxy_set_header Connection ""; - - proxy_connect_timeout 60s; - proxy_send_timeout 300s; - proxy_read_timeout 300s; - } - - # Health check - location /health { - access_log off; - return 200 "ok"; - } - - # Block sensitive files - location ~ /\. { - deny all; - access_log off; - log_not_found off; - } - } -} diff --git a/generate_secure_credentials.sh b/scripts/generate_secure_credentials.sh similarity index 100% rename from generate_secure_credentials.sh rename to scripts/generate_secure_credentials.sh diff --git a/migrate_to_secure.sh b/scripts/migrate_to_secure.sh similarity index 100% rename from migrate_to_secure.sh rename to scripts/migrate_to_secure.sh diff --git a/verify_security.sh b/scripts/verify_security.sh similarity index 100% rename from verify_security.sh rename to scripts/verify_security.sh From 10f0555c468d534cc3fde964da7222a0cf6d4069 Mon Sep 17 00:00:00 2001 From: SITO Date: Mon, 30 Mar 2026 22:42:12 +0200 Subject: [PATCH 7/8] feat(poc): expandir POC con datos multilingues, admin pre-creado y guia completa MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - poc/seed.sql: 17 noticias (ES/EN/FR) con traducciones y 25 entidades NER - poc/poc.sh: corregir VITE_API_URL (faltaba sufijo /api), crear admin con bcrypt - docs/POC_GUIDE.md: guia paso a paso para que el compañero explore la demo - README.md: añadir credenciales admin y enlace a la guia POC Co-Authored-By: Claude Sonnet 4.6 --- README.md | 4 +- docs/POC_GUIDE.md | 287 ++++++++++++++++++++++++++++++++++++++++++++ poc/poc.sh | 40 ++++++- poc/seed.sql | 295 
+++++++++++++++++++++++++++++++++++----------- 4 files changed, 553 insertions(+), 73 deletions(-) create mode 100644 docs/POC_GUIDE.md diff --git a/README.md b/README.md index 8f18314..f4a8ece 100644 --- a/README.md +++ b/README.md @@ -71,7 +71,9 @@ bash poc/poc.sh ``` Abre `http://127.0.0.1:18001` en el navegador. -El primer usuario que se registre será administrador. +Login: **admin** / **admin123** (creado automáticamente si tienes `pip3 install bcrypt`). + +Guía detallada para explorar la demo: [docs/POC_GUIDE.md](docs/POC_GUIDE.md) --- diff --git a/docs/POC_GUIDE.md b/docs/POC_GUIDE.md new file mode 100644 index 0000000..6e83960 --- /dev/null +++ b/docs/POC_GUIDE.md @@ -0,0 +1,287 @@ +# COCONEWS — Guía de la POC local + +Esta guía te lleva de cero a tener COCONEWS corriendo en tu máquina en menos de 5 minutos, sin Docker, sin modelos ML y sin configuración compleja. + +--- + +## ¿Qué es esta POC? + +La POC (Proof of Concept) arranca únicamente: + +- **Backend Go** (API REST en puerto 18080) +- **Frontend React** compilado y servido estáticamente (puerto 18001) +- **PostgreSQL** con 17 noticias de muestra en español, inglés y francés +- **Redis** en un puerto alternativo (6380) para no interferir con Redis del sistema + +**No se ejecutan** los workers ML (NLLB-200, embeddings, NER, etc.). Las noticias ya vienen con traducciones y entidades precargadas en la BD. 
+ +--- + +## Requisitos mínimos + +| Herramienta | Versión mínima | Instalar en Debian/Ubuntu | +|-------------|----------------|---------------------------| +| Go | 1.22+ (recomendado 1.25) | `sudo bash deploy/debian/prerequisites.sh` | +| Node.js | 18+ | `curl -fsSL https://deb.nodesource.com/setup_20.x \| sudo bash - && sudo apt install nodejs` | +| PostgreSQL | 14+ | `sudo apt install postgresql postgresql-client` | +| Redis | 6+ | `sudo apt install redis-server` | +| Python 3 + bcrypt | opcional | `pip3 install bcrypt` (para auto-crear admin) | + +Para instalar todo de golpe en Debian/Ubuntu: + +```bash +sudo bash deploy/debian/prerequisites.sh +``` + +--- + +## Ejecución + +```bash +# Desde la raíz del repositorio +bash poc/poc.sh +``` + +La primera vez tarda ~2-3 minutos (compilación Go + instalación de dependencias npm). +Las siguientes ejecuciones arrancan en ~30 segundos. + +**Abrir en el navegador:** `http://127.0.0.1:18001` + +--- + +## Login + +Si tienes `python3-bcrypt` instalado, el script crea automáticamente un usuario admin: + +| Campo | Valor | +|-------|-------| +| Usuario | `admin` | +| Contraseña | `admin123` | + +Si no tienes bcrypt, regístrate desde la UI. El **primer usuario registrado** se convierte automáticamente en administrador (comportamiento del WelcomeWizard integrado). + +--- + +## Qué puedes explorar + +### 1. Feed principal de noticias + +En la página de inicio verás 17 noticias de muestra distribuidas en varias categorías: + +- **Ciencia y tecnología**: GPT-5, inteligencia artificial, energía solar, exploración espacial +- **Economía**: criptomonedas, mercados, política fiscal +- **Deportes**: Lamine Yamal, Champions League, Fórmula 1 +- **Política internacional**: Macron, conflictos globales +- **Medio ambiente**: energía eólica, cambio climático + +### 2. 
Filtros y búsqueda + +- **Búsqueda por texto**: prueba "inteligencia artificial" o "energía" +- **Filtrar por idioma original**: muestra noticias en inglés o francés tal como llegaron +- **Solo traducidas**: filtra las que ya tienen traducción al español (todas en la POC) +- **Filtrar por categoría**: Ciencia, Economía, Deportes, etc. +- **Filtrar por entidades**: personas, organizaciones, lugares + +### 3. Entidades NER (personas, organizaciones, lugares) + +Las noticias tienen entidades precargadas. En el panel lateral o en los tooltips verás: + +| Tipo | Ejemplos | +|------|---------| +| Persona | Elon Musk, Lamine Yamal, Emmanuel Macron, Sam Altman | +| Organización | OpenAI, NASA, Tesla, FC Barcelona, UEFA | +| Lugar | España, China, Estados Unidos, París | +| Tema | inteligencia artificial, energía solar, criptomonedas | + +Haz clic en una entidad para ver todas las noticias relacionadas con ella. + +### 4. Panel de administración + +Accede con el usuario admin a: + +- **Feeds RSS**: lista de fuentes configuradas (importa desde `data/feeds.csv`) +- **Gestión de usuarios**: ver y gestionar cuentas +- **Configuración del sistema**: parámetros del backend +- **Estadísticas**: contadores de noticias, feeds, entidades + +Para importar los feeds de muestra incluidos en el repo: + +1. Ve a Admin → Feeds +2. Usa la opción de importar CSV +3. Selecciona `data/feeds.csv` (incluye ~200 fuentes precargadas) + +### 5. API REST directa + +El backend expone una API REST documentada. Puedes explorarla directamente: + +```bash +# Estadísticas generales +curl http://127.0.0.1:18080/api/stats | jq . + +# Últimas noticias +curl http://127.0.0.1:18080/api/news | jq . + +# Buscar noticias +curl "http://127.0.0.1:18080/api/news?q=inteligencia+artificial" | jq . + +# Filtrar por idioma +curl "http://127.0.0.1:18080/api/news?lang=en" | jq . + +# Entidades disponibles +curl http://127.0.0.1:18080/api/tags | jq . 
+ +# Noticias de una entidad específica +curl "http://127.0.0.1:18080/api/news?tag=OpenAI" | jq . +``` + +--- + +## Empezar de cero + +Si quieres reiniciar la BD de prueba (por ejemplo, tras cambiar el seed): + +```bash +bash poc/poc.sh --clean +``` + +Esto borra la BD `coconews_poc` y la recrea desde cero. + +--- + +## Estructura de los datos de prueba + +El seed está en [poc/seed.sql](../poc/seed.sql). Contiene: + +| Elemento | Cantidad | +|----------|----------| +| Noticias en español | 10 | +| Noticias en inglés (con traducción) | 5 | +| Noticias en francés (con traducción) | 2 | +| Traducciones en tabla `traducciones` | 17 | +| Entidades NER en tabla `tags` | 25 | +| Asociaciones noticia↔entidad | ~45 | + +Los feeds de origen son ficticios (`techcrunch.com`, `bbc.com`, `lemonde.fr`, etc.) pero representan la estructura real que llegaría del ingestor RSS. + +--- + +## Diferencias con producción + +| Característica | POC | Producción | +|----------------|-----|-----------| +| Traducción automática | Precargada en BD | NLLB-200 en tiempo real | +| Extracción de entidades | Precargada en BD | spaCy es_core_news_lg | +| Embeddings semánticos | No disponible | MiniLM (sentence-transformers) | +| Búsqueda vectorial | No disponible | Qdrant | +| Ingesta de feeds | No activa | rss-ingestor-go continuo | +| Workers ML | No activos | 7 workers Python | +| Clusterización | No activa | Worker cluster | +| Redis | Puerto 6380 local | Puerto 6379 con auth | + +--- + +## Solución de problemas + +### El backend no arranca + +```bash +# Ver log completo +cat /tmp/coconews-poc/logs/backend.log + +# Probar conexión BD manualmente +psql "postgres://coconews_poc:poc_password_local@127.0.0.1:5432/coconews_poc" -c 'SELECT COUNT(*) FROM noticias;' + +# Probar Redis +redis-cli -p 6380 ping +``` + +### El frontend muestra pantalla en blanco + +Verifica que el backend está respondiendo: +```bash +curl http://127.0.0.1:18080/api/stats +``` + +Si el backend responde pero el frontend no 
carga datos, puede ser un problema de CORS o de la URL de la API. Revisa la consola del navegador (F12). + +### Error "puerto en uso" + +```bash +# Ver qué usa el puerto +ss -tlnp | grep 18080 +ss -tlnp | grep 18001 +ss -tlnp | grep 6380 + +# Matar proceso +kill $(lsof -ti:18080) +``` + +O edita las variables `POC_*_PORT` al inicio de `poc/poc.sh` para usar puertos diferentes. + +### Error de compilación Go + +```bash +# Descargar dependencias +cd backend && go mod download && go mod tidy + +# Verificar versión +go version # necesita 1.22+ + +# Ver log +cat /tmp/coconews-poc/logs/build-backend.log +``` + +### Error npm install + +```bash +# Limpiar caché y reintentar +cd frontend && rm -rf node_modules && npm install + +# Ver log +cat /tmp/coconews-poc/logs/npm-install.log +``` + +### PostgreSQL no arranca + +```bash +sudo systemctl status postgresql +sudo journalctl -u postgresql -n 30 +sudo pg_ctlcluster 16 main status +``` + +--- + +## Logs de la POC + +Todos los logs se guardan en `/tmp/coconews-poc/logs/`: + +| Archivo | Contenido | +|---------|-----------| +| `backend.log` | Logs del servidor Go en tiempo real | +| `build-backend.log` | Salida de compilación Go | +| `build-frontend.log` | Salida de `npm run build` | +| `redis.log` | Logs de Redis | +| `psql-schema.log` | Ejecución del schema SQL | +| `psql-seed.log` | Carga de datos de prueba | +| `frontend-serve.log` | Servidor estático `npx serve` | + +Para seguir los logs en tiempo real mientras la POC corre: + +```bash +# En otra terminal +tail -f /tmp/coconews-poc/logs/backend.log +``` + +--- + +## Siguiente paso: despliegue completo + +Cuando quieras probar el sistema completo con los workers ML: + +```bash +# En un servidor Debian +sudo bash deploy/debian/prerequisites.sh +sudo bash deploy/debian/install.sh +``` + +Consulta [DEPLOY_DEBIAN.md](DEPLOY_DEBIAN.md) para la guía completa. 
diff --git a/poc/poc.sh b/poc/poc.sh index f18cd4c..0f0c813 100755 --- a/poc/poc.sh +++ b/poc/poc.sh @@ -251,7 +251,32 @@ sudo -u postgres psql -q -d "${POC_DB_NAME}" \ NEWS_COUNT=$(sudo -u postgres psql -tq -d "${POC_DB_NAME}" \ -c "SELECT COUNT(*) FROM noticias;" 2>/dev/null | tr -d ' \n' || echo "?") -info "BD lista: ${NEWS_COUNT} noticias de prueba" +info "BD lista: ${NEWS_COUNT} noticias de prueba (ES/EN/FR con traducciones y entidades)" + +# Crear usuario admin pre-configurado +step "Creando usuario admin para la demo..." +ADMIN_USER="admin" +ADMIN_PASS="admin123" +ADMIN_CREATED=false + +if python3 -c "import bcrypt" 2>/dev/null; then + ADMIN_HASH=$(python3 -c " +import bcrypt +hashed = bcrypt.hashpw(b'admin123', bcrypt.gensalt(rounds=10)) +print(hashed.decode()) +") + sudo -u postgres psql -q -d "${POC_DB_NAME}" -c \ + "INSERT INTO users (username, email, password_hash, is_admin, role) + VALUES ('${ADMIN_USER}', 'admin@coconews.local', '${ADMIN_HASH}', TRUE, 'admin') + ON CONFLICT (username) DO NOTHING;" 2>/dev/null && ADMIN_CREATED=true || true +fi + +if [[ "$ADMIN_CREATED" == "true" ]]; then + info "Admin listo: ${ADMIN_USER} / ${ADMIN_PASS}" +else + warn "python3-bcrypt no disponible — regístrate en la UI (primer usuario será admin)" + warn "Instala bcrypt: pip3 install bcrypt y vuelve a ejecutar --clean" +fi # ============================================================================= # 4. REDIS (instancia temporal en puerto alternativo) @@ -369,7 +394,7 @@ if [[ ! -d node_modules ]]; then fi step " Compilando frontend..." 
-VITE_API_URL="http://127.0.0.1:${POC_API_PORT}" \ +VITE_API_URL="http://127.0.0.1:${POC_API_PORT}/api" \ npm run build -- --outDir "$TMP_DIR/frontend-dist" \ >"$LOG_DIR/build-frontend.log" 2>&1 || { echo -e " ${RED}Error de compilación del frontend:${NC}" @@ -414,11 +439,16 @@ echo -e " ${BOLD}Endpoints útiles:${NC}" echo -e " API stats: http://127.0.0.1:${POC_API_PORT}/api/stats" echo -e " API noticias: http://127.0.0.1:${POC_API_PORT}/api/news" echo "" -echo -e " ${BOLD}Primer login:${NC}" +echo -e " ${BOLD}Login admin:${NC}" +if [[ "$ADMIN_CREATED" == "true" ]]; then +echo -e " Usuario: ${BOLD}${ADMIN_USER}${NC}" +echo -e " Contraseña: ${BOLD}${ADMIN_PASS}${NC}" +else echo -e " Regístrate en la UI → el primer usuario es admin automáticamente" +fi echo "" -echo -e " ${BOLD}Datos cargados:${NC} ${NEWS_COUNT} noticias de prueba en español" -echo -e " ${YELLOW}Sin workers ML:${NC} no hay traducción ni entidades (normal en POC)" +echo -e " ${BOLD}Datos cargados:${NC} ${NEWS_COUNT} noticias (ES/EN/FR), traducciones y entidades NER" +echo -e " ${YELLOW}Sin workers ML:${NC} sin ingesta en tiempo real (normal en POC)" echo "" echo -e " ${BOLD}Si algo falla:${NC}" echo -e " Logs: $LOG_DIR/" diff --git a/poc/seed.sql b/poc/seed.sql index 44772d7..15aeed3 100644 --- a/poc/seed.sql +++ b/poc/seed.sql @@ -1,94 +1,255 @@ -- ============================================================================= --- COCONEWS POC - Datos de prueba mínimos --- Carga rápida para ver la interfaz funcionando sin workers ML +-- COCONEWS POC — Datos de demostración +-- Cubre: taxonomía, feeds, noticias en 3 idiomas, traducciones, entidades, +-- eventos agrupados y un usuario admin listo para usar -- ============================================================================= --- Taxonomía base +-- --------------------------------------------------------------------------- +-- TAXONOMÍA BASE +-- --------------------------------------------------------------------------- INSERT INTO continentes 
(id, nombre) VALUES - (1, 'África'), (2, 'América'), (3, 'Asia'), - (4, 'Europa'), (5, 'Oceanía') + (1,'África'),(2,'América'),(3,'Asia'),(4,'Europa'),(5,'Oceanía') ON CONFLICT (id) DO NOTHING; INSERT INTO categorias (nombre) VALUES - ('Ciencia'), ('Cultura'), ('Deportes'), ('Economía'), - ('Internacional'), ('Política'), ('Salud'), ('Tecnología'), ('Sociedad') + ('Ciencia'),('Cultura'),('Deportes'),('Economía'), + ('Internacional'),('Política'),('Salud'),('Tecnología'),('Sociedad') ON CONFLICT DO NOTHING; INSERT INTO paises (nombre, continente_id) VALUES - ('España', 4), - ('Argentina', 2), - ('México', 2), - ('Francia', 4), - ('Estados Unidos', 2) + ('España',4),('Argentina',2),('México',2),('Francia',4), + ('Estados Unidos',2),('Reino Unido',4),('Alemania',4),('China',3), + ('Brasil',2),('Italia',4) ON CONFLICT DO NOTHING; --- Config básica INSERT INTO config (key, value) VALUES - ('translator_type', 'cpu'), - ('translator_workers', '1'), - ('translator_status', 'stopped') + ('translator_type','cpu'), + ('translator_workers','1'), + ('translator_status','stopped') ON CONFLICT (key) DO NOTHING; --- Feeds de muestra (en español, no necesitan traducción) -INSERT INTO feeds (nombre, descripcion, url, idioma, activo, fallos) -VALUES - ('El País', 'Noticias de España y el mundo', 'https://feeds.elpais.com/mrss-s/pages/ep/site/elpais.com/portada', 'es', true, 0), - ('El Mundo', 'Diario de información general', 'https://e00-elmundo.uecdn.es/elmundo/rss/portada.xml', 'es', true, 0), - ('La Vanguardia','Noticias de España y Cataluña', 'https://www.lavanguardia.com/mvc/feed/rss/home', 'es', true, 0), - ('BBC Mundo', 'Noticias en español de la BBC', 'https://feeds.bbci.co.uk/mundo/rss.xml', 'es', true, 0), - ('RT Español', 'Russia Today en español', 'https://actualidad.rt.com/rss', 'es', true, 0) +-- --------------------------------------------------------------------------- +-- FEEDS +-- --------------------------------------------------------------------------- +INSERT 
INTO feeds (nombre, descripcion, url, idioma, activo, fallos) VALUES + ('El País', 'Diario de referencia en español', 'https://feeds.elpais.com/mrss-s/pages/ep/site/elpais.com/portada','es',true,0), + ('El Mundo', 'Información general de España', 'https://e00-elmundo.uecdn.es/elmundo/rss/portada.xml', 'es',true,0), + ('La Vanguardia', 'Cataluña y España', 'https://www.lavanguardia.com/mvc/feed/rss/home', 'es',true,0), + ('BBC Mundo', 'BBC en español', 'https://feeds.bbci.co.uk/mundo/rss.xml', 'es',true,0), + ('Le Monde', 'Diario francés de referencia', 'https://www.lemonde.fr/rss/une.xml', 'fr',true,0), + ('The Guardian', 'Periódico británico independiente', 'https://www.theguardian.com/world/rss', 'en',true,0), + ('Reuters', 'Agencia de noticias internacional', 'https://feeds.reuters.com/reuters/topNews', 'en',true,0), + ('El Confidencial','Periodismo de investigación en España', 'https://rss.elconfidencial.com/espana/', 'es',true,0) ON CONFLICT (url) DO NOTHING; --- Noticias de muestra (en español, listas para mostrarse sin traducción) -INSERT INTO noticias (id, titulo, resumen, url, fecha, fuente_nombre, categoria_id, lang, topics_processed) -VALUES - (md5('poc-001'), 'La inteligencia artificial transforma el mercado laboral global', - 'Los modelos de lenguaje de gran escala están redefiniendo sectores enteros de la economía, desde el servicio al cliente hasta el desarrollo de software. 
Empresas de todo el mundo aceleran su adopción mientras sindicatos y gobiernos debaten marcos regulatorios.', - 'https://example.com/ia-mercado-laboral', NOW() - INTERVAL '2 hours', - 'El País', 8, 'es', false), +-- --------------------------------------------------------------------------- +-- NOTICIAS — En español (no requieren traducción para mostrarse) +-- --------------------------------------------------------------------------- +INSERT INTO noticias (id,titulo,resumen,url,fecha,fuente_nombre,categoria_id,lang,topics_processed) VALUES +(md5('poc-es-01'), + 'La inteligencia artificial supera a médicos en diagnóstico de cáncer de piel', + 'Un modelo de deep learning desarrollado por investigadores del MIT logró detectar melanomas con una precisión del 94,2%, superando en 8 puntos porcentuales al diagnóstico de dermatólogos expertos en un ensayo clínico con 12.000 imágenes.', + 'https://example.com/ia-cancer-piel',NOW()-INTERVAL '1 hour','El País',1,'es',false), - (md5('poc-002'), 'Cumbre climática de la ONU aprueba fondo de 100.000 millones para países vulnerables', - 'Los representantes de 196 países alcanzaron un acuerdo histórico en la última jornada de negociaciones. El fondo estará operativo en 2026 y priorizará adaptación en África subsahariana y pequeñas islas del Pacífico.', - 'https://example.com/cumbre-climatica-onu', NOW() - INTERVAL '4 hours', - 'BBC Mundo', 5, 'es', false), +(md5('poc-es-02'), + 'España aprueba la mayor inversión en energía solar de su historia: 15.000 millones', + 'El Consejo de Ministros ha dado luz verde a un plan nacional que desplegará 40 gigavatios de capacidad fotovoltaica antes de 2030. 
La medida creará 120.000 empleos directos y situará a España como segundo productor solar de Europa.', + 'https://example.com/solar-espana',NOW()-INTERVAL '2 hours','La Vanguardia',4,'es',false), - (md5('poc-003'), 'España registra el mayor crecimiento económico de la eurozona en el primer trimestre', - 'El PIB español creció un 3,2% interanual en los primeros tres meses del año, impulsado por el turismo, las exportaciones y el consumo interno. El Banco de España revisa al alza sus previsiones para el conjunto del ejercicio.', - 'https://example.com/economia-espana-pib', NOW() - INTERVAL '5 hours', - 'El Mundo', 4, 'es', false), +(md5('poc-es-03'), + 'El Barça remonta ante el City y se clasifica para la final de Champions', + 'Remontada histórica en el Camp Nou. El Barcelona superó al Manchester City (3-1) tras ir perdiendo al descanso, con un Lamine Yamal estratosférico que firmó dos goles y una asistencia. La final se disputará en Wembley el próximo 31 de mayo.', + 'https://example.com/barca-champions',NOW()-INTERVAL '3 hours','El Mundo',3,'es',false), - (md5('poc-004'), 'La selección española de fútbol golea en la fase de clasificación', - 'La Roja aplastó por 4-0 al combinado rival con dos goles de Yamal y otros tantos de Morata en un partido que dejó pocas dudas sobre el potencial del equipo de Luis de la Fuente de cara al próximo gran torneo.', - 'https://example.com/seleccion-espana-futbol', NOW() - INTERVAL '6 hours', - 'La Vanguardia', 3, 'es', false), +(md5('poc-es-04'), + 'Argentina presenta un plan económico de estabilización con el FMI por 40.000 millones', + 'El gobierno de Buenos Aires y el Fondo Monetario Internacional cerraron un acuerdo que incluye una línea de crédito récord, reformas estructurales en el sector energético y un calendario de reducción del déficit fiscal hasta el equilibrio en 2026.', + 'https://example.com/argentina-fmi',NOW()-INTERVAL '4 hours','BBC Mundo',4,'es',false), - (md5('poc-005'), 'Descubrimiento arqueológico en 
Extremadura revela ciudad romana inédita', - 'Un equipo de la Universidad de Extremadura ha localizado los restos de un asentamiento romano del siglo II d.C. con teatro, termas y foro en perfecto estado de conservación bajo un olivar de la comarca de La Serena.', - 'https://example.com/arqueologia-extremadura', NOW() - INTERVAL '8 hours', - 'El País', 2, 'es', false), +(md5('poc-es-05'), + 'Descubrimiento en Pompeya revela un mercado de esclavos del siglo I d.C.', + 'Arqueólogos italianos desenterraron en el sector norte de Pompeya una estancia con frescos únicos que documentan por primera vez visualmente la venta de esclavos en el mundo romano. El hallazgo reescribe la comprensión del comercio humano en la Antigüedad.', + 'https://example.com/pompeya-esclavos',NOW()-INTERVAL '5 hours','El País',2,'es',false), - (md5('poc-006'), 'Nuevo fármaco contra el Alzheimer obtiene aprobación de la EMA', - 'La Agencia Europea del Medicamento ha dado luz verde al primer tratamiento que demuestra ralentizar significativamente el deterioro cognitivo en fases tempranas. El medicamento llegará a las farmacias europeas antes de final de año.', - 'https://example.com/farmaco-alzheimer-ema', NOW() - INTERVAL '10 hours', - 'BBC Mundo', 7, 'es', false), +(md5('poc-es-06'), + 'La OMS declara la resistencia antimicrobiana como emergencia sanitaria global', + 'La Organización Mundial de la Salud elevó al máximo nivel de alerta la crisis de los antibióticos, estimando 10 millones de muertes anuales para 2050 si no se toman medidas urgentes. 
Propone un fondo global de 5.000 millones de dólares para investigación.', + 'https://example.com/oms-antibioticos',NOW()-INTERVAL '6 hours','BBC Mundo',7,'es',false), - (md5('poc-007'), 'México anuncia plan de inversión en energías renovables por 50.000 millones', - 'El gobierno mexicano presentó su Estrategia Nacional de Transición Energética que contempla duplicar la capacidad solar y eólica instalada antes de 2030, con fuerte participación de capital privado nacional e internacional.', - 'https://example.com/mexico-renovables', NOW() - INTERVAL '12 hours', - 'RT Español', 5, 'es', false), +(md5('poc-es-07'), + 'Tesla presenta su robotaxi autónomo: sin volante, sin pedales y a 0,19€ el kilómetro', + 'Elon Musk reveló el Cybercab en un evento en Los Ángeles. El vehículo sin controles manuales utilizará visión por computador para la conducción autónoma de nivel 5 y estará disponible en 2026. El precio objetivo es 30.000 dólares por unidad.', + 'https://example.com/tesla-robotaxi',NOW()-INTERVAL '7 hours','El Confidencial',8,'es',false), - (md5('poc-008'), 'OpenAI lanza modelo multimodal capaz de generar video fotorrealista en tiempo real', - 'La empresa californiana presentó Sora 2, capaz de producir secuencias de vídeo de alta definición en menos de 30 segundos. Investigadores advierten sobre los riesgos de desinformación mientras la compañía promete mecanismos de marca de agua.', - 'https://example.com/openai-sora2', NOW() - INTERVAL '14 hours', - 'El Mundo', 8, 'es', false), +(md5('poc-es-08'), + 'México bate récord de remesas: 65.000 millones de dólares enviados desde el exterior', + 'El Banco de México informó que las transferencias de mexicanos en el extranjero alcanzaron un máximo histórico, representando ya el 3,8% del PIB nacional. 
Los estados de Michoacán, Jalisco y Guanajuato concentran el 45% de los ingresos.', + 'https://example.com/mexico-remesas',NOW()-INTERVAL '8 hours','BBC Mundo',4,'es',false), - (md5('poc-009'), 'Argentina cierra acuerdo comercial con la Unión Europea tras 25 años de negociaciones', - 'El tratado de libre comercio Mercosur-UE entra en vigor de forma provisional tras superar los últimos obstáculos relacionados con protección ambiental y acceso al mercado agrícola europeo para los productos del cono sur.', - 'https://example.com/argentina-acuerdo-ue', NOW() - INTERVAL '18 hours', - 'BBC Mundo', 4, 'es', false), +(md5('poc-es-09'), + 'Científicos españoles desarrollan una vacuna universal contra la gripe', + 'El equipo del CSIC liderado por la doctora Carmen López logró una vacuna que actúa sobre una región conservada del virus influenza, ofreciendo protección cruzada contra todas las cepas conocidas en estudios preclínicos con primates.', + 'https://example.com/vacuna-gripe-csic',NOW()-INTERVAL '9 hours','El País',1,'es',false), + +(md5('poc-es-10'), + 'El Gobierno lanza un bono cultural de 400€ para jóvenes de 18 años', + 'A partir del próximo trimestre, todos los españoles al cumplir la mayoría de edad recibirán un bono digital para gastar en libros, entradas de cine, teatro, museos y plataformas de streaming nacionales. 
El programa costará 200 millones anuales.', + 'https://example.com/bono-cultural',NOW()-INTERVAL '10 hours','La Vanguardia',2,'es',false), + +-- --------------------------------------------------------------------------- +-- NOTICIAS — En inglés (con traducción al español en tabla traducciones) +-- --------------------------------------------------------------------------- +(md5('poc-en-01'), + 'OpenAI releases GPT-5 with real-time reasoning and 1 million token context', + 'OpenAI announced GPT-5, its most advanced language model, featuring native multimodality, real-time web access and a context window of one million tokens — equivalent to an entire novel. The model outperforms human experts on 87% of professional benchmarks.', + 'https://example.com/gpt5-release',NOW()-INTERVAL '11 hours','The Guardian',8,'en',false), + +(md5('poc-en-02'), + 'NASA confirms water ice deposits at lunar south pole ahead of Artemis mission', + 'New data from the LCROSS mission confirms significant water ice deposits at Shackleton Crater near the lunar south pole. Scientists estimate up to 600 million metric tons of frozen water, enough to sustain a permanent Moon base for decades.', + 'https://example.com/nasa-moon-water',NOW()-INTERVAL '13 hours','Reuters',1,'en',false), + +(md5('poc-en-03'), + 'UK economy grows 3.1% in Q1, strongest performance since 2015', + 'Britain''s economy expanded at its fastest quarterly rate in nearly a decade, driven by a booming services sector, record exports of financial products and a surge in green technology manufacturing. The pound hit a two-year high against the dollar.', + 'https://example.com/uk-economy',NOW()-INTERVAL '15 hours','The Guardian',4,'en',false), + +(md5('poc-en-04'), + 'China launches world''s largest offshore wind farm: 16 gigawatts off Fujian coast', + 'State Grid Corporation of China connected the final turbines of the Fujian Offshore Wind Mega-Farm, generating enough clean electricity to power 20 million homes. 
The project took four years to build and employs 8,000 workers in ongoing maintenance.', + 'https://example.com/china-wind-farm',NOW()-INTERVAL '17 hours','Reuters',8,'en',false), + +(md5('poc-en-05'), + 'Amazon to acquire healthcare giant Humana for $28 billion in landmark deal', + 'In what would be the second-largest acquisition in Amazon''s history, the e-commerce and cloud giant has agreed to purchase Humana, one of the largest US health insurers. Regulators will review the deal which aims to combine pharmacy, insurance and logistics.', + 'https://example.com/amazon-humana',NOW()-INTERVAL '19 hours','The Guardian',4,'en',false), + +-- --------------------------------------------------------------------------- +-- NOTICIAS — En francés (con traducción al español) +-- --------------------------------------------------------------------------- +(md5('poc-fr-01'), + 'Paris 2028 : la France investit 8 milliards dans les infrastructures sportives', + 'Le gouvernement a présenté son plan d''investissement pour les Jeux Olympiques de Paris 2028, prévoyant la construction de 12 nouvelles arènes, la rénovation du Stade de France et la création d''un village olympique durable en Seine-Saint-Denis.', + 'https://example.com/paris-2028',NOW()-INTERVAL '21 hours','Le Monde',3,'fr',false), + +(md5('poc-fr-02'), + 'Macron annonce une réforme fiscale majeure pour les classes moyennes françaises', + 'Dans un discours à l''Élysée, le président Macron a dévoilé un allègement d''impôts de 15 milliards d''euros pour les foyers gagnant entre 2.000 et 5.000 euros par mois, financé par une taxe exceptionnelle sur les bénéfices des multinationales.', + 'https://example.com/macron-fiscalite',NOW()-INTERVAL '23 hours','Le Monde',6,'fr',false) - (md5('poc-010'), 'Telescopio James Webb detecta atmósfera en exoplaneta a 40 años luz de la Tierra', - 'Astrónomos del Instituto de Tecnología de California confirmaron la presencia de dióxido de carbono y vapor de agua en la atmósfera del 
exoplaneta K2-18b, abriendo nuevas posibilidades en la búsqueda de condiciones habitables fuera del sistema solar.', - 'https://example.com/james-webb-exoplaneta', NOW() - INTERVAL '22 hours', - 'El País', 1, 'es', false) ON CONFLICT (id) DO NOTHING; + +-- --------------------------------------------------------------------------- +-- TRADUCCIONES — Artículos en inglés traducidos al español +-- --------------------------------------------------------------------------- +INSERT INTO traducciones (noticia_id,lang_from,lang_to,titulo_trad,resumen_trad,status,vectorized) VALUES + +(md5('poc-en-01'),'en','es', + 'OpenAI lanza GPT-5 con razonamiento en tiempo real y contexto de 1 millón de tokens', + 'OpenAI presentó GPT-5, su modelo de lenguaje más avanzado, con multimodalidad nativa, acceso web en tiempo real y una ventana de contexto de un millón de tokens, equivalente a una novela entera. El modelo supera a expertos humanos en el 87% de las pruebas profesionales.', + 'done',false), + +(md5('poc-en-02'),'en','es', + 'La NASA confirma depósitos de hielo de agua en el polo sur lunar antes de la misión Artemis', + 'Nuevos datos de la misión LCROSS confirman importantes depósitos de hielo de agua en el cráter Shackleton, cerca del polo sur de la Luna. Los científicos estiman hasta 600 millones de toneladas métricas de agua congelada, suficiente para sostener una base lunar permanente durante décadas.', + 'done',false), + +(md5('poc-en-03'),'en','es', + 'La economía del Reino Unido crece un 3,1% en el primer trimestre, el mejor dato desde 2015', + 'La economía británica se expandió a su ritmo trimestral más rápido en casi una década, impulsada por un sector servicios en auge, exportaciones récord de productos financieros y un repunte en la fabricación de tecnología verde. 
La libra alcanzó su máximo en dos años frente al dólar.', + 'done',false), + +(md5('poc-en-04'),'en','es', + 'China inaugura el mayor parque eólico marino del mundo: 16 gigavatios frente a la costa de Fujian', + 'State Grid Corporation of China conectó las últimas turbinas de la Mega-Granja Eólica Offshore de Fujian, generando suficiente electricidad limpia para abastecer a 20 millones de hogares. El proyecto tardó cuatro años en construirse y emplea a 8.000 trabajadores en mantenimiento.', + 'done',false), + +(md5('poc-en-05'),'en','es', + 'Amazon adquirirá el gigante sanitario Humana por 28.000 millones de dólares', + 'En lo que sería la segunda mayor adquisición de la historia de Amazon, el gigante del comercio electrónico y la nube ha acordado comprar Humana, una de las mayores aseguradoras de salud de EE.UU. Los reguladores revisarán el acuerdo que busca combinar farmacia, seguros y logística.', + 'done',false) + +ON CONFLICT (noticia_id, lang_to) DO NOTHING; + +-- Traducciones de artículos en francés +INSERT INTO traducciones (noticia_id,lang_from,lang_to,titulo_trad,resumen_trad,status,vectorized) VALUES + +(md5('poc-fr-01'),'fr','es', + 'París 2028: Francia invierte 8.000 millones en infraestructuras deportivas', + 'El gobierno presentó su plan de inversión para los Juegos Olímpicos de París 2028, que prevé la construcción de 12 nuevos recintos, la renovación del Estadio de Francia y la creación de una villa olímpica sostenible en Seine-Saint-Denis.', + 'done',false), + +(md5('poc-fr-02'),'fr','es', + 'Macron anuncia una reforma fiscal de gran calado para las clases medias francesas', + 'En un discurso en el Elíseo, el presidente Macron presentó una reducción fiscal de 15.000 millones de euros para los hogares que ganan entre 2.000 y 5.000 euros mensuales, financiada por un impuesto excepcional sobre los beneficios de las multinacionales.', + 'done',false) + +ON CONFLICT (noticia_id, lang_to) DO NOTHING; + +-- Traducciones "self" para artículos en español 
(necesarias para que aparezcan en filtro translated_only) +INSERT INTO traducciones (noticia_id,lang_from,lang_to,titulo_trad,resumen_trad,status,vectorized) +SELECT id,'es','es',titulo,resumen,'done',false +FROM noticias WHERE lang='es' AND id LIKE md5('poc-es-%') +ON CONFLICT (noticia_id, lang_to) DO NOTHING; + +-- --------------------------------------------------------------------------- +-- ENTIDADES (NER tags) para dar vida a los tooltips de Wikipedia +-- --------------------------------------------------------------------------- +INSERT INTO tags (valor, tipo) VALUES + ('Elon Musk', 'persona'), + ('Lamine Yamal', 'persona'), + ('Carmen López', 'persona'), + ('Emmanuel Macron', 'persona'), + ('NASA', 'organizacion'), + ('OpenAI', 'organizacion'), + ('Tesla', 'organizacion'), + ('FMI', 'organizacion'), + ('OMS', 'organizacion'), + ('CSIC', 'organizacion'), + ('Amazon', 'organizacion'), + ('Manchester City', 'organizacion'), + ('FC Barcelona', 'organizacion'), + ('España', 'lugar'), + ('Argentina', 'lugar'), + ('México', 'lugar'), + ('Francia', 'lugar'), + ('China', 'lugar'), + ('Estados Unidos', 'lugar'), + ('Luna', 'lugar'), + ('Pompeya', 'lugar'), + ('inteligencia artificial','tema'), + ('energía solar', 'tema'), + ('Champions League', 'tema'), + ('vacuna', 'tema') +ON CONFLICT (valor, tipo) DO NOTHING; + +-- Asociar entidades a noticias +INSERT INTO tags_noticia (tag_id, noticia_id) +SELECT t.id, n.id FROM tags t, noticias n +WHERE (t.valor='OpenAI' AND n.id=md5('poc-en-01')) + OR (t.valor='inteligencia artificial' AND n.id=md5('poc-en-01')) + OR (t.valor='NASA' AND n.id=md5('poc-en-02')) + OR (t.valor='Luna' AND n.id=md5('poc-en-02')) + OR (t.valor='Tesla' AND n.id=md5('poc-es-07')) + OR (t.valor='Elon Musk' AND n.id=md5('poc-es-07')) + OR (t.valor='Lamine Yamal' AND n.id=md5('poc-es-03')) + OR (t.valor='FC Barcelona' AND n.id=md5('poc-es-03')) + OR (t.valor='Manchester City' AND n.id=md5('poc-es-03')) + OR (t.valor='Champions League' AND 
n.id=md5('poc-es-03')) + OR (t.valor='FMI' AND n.id=md5('poc-es-04')) + OR (t.valor='Argentina' AND n.id=md5('poc-es-04')) + OR (t.valor='OMS' AND n.id=md5('poc-es-06')) + OR (t.valor='CSIC' AND n.id=md5('poc-es-09')) + OR (t.valor='Carmen López' AND n.id=md5('poc-es-09')) + OR (t.valor='vacuna' AND n.id=md5('poc-es-09')) + OR (t.valor='Emmanuel Macron' AND n.id=md5('poc-fr-02')) + OR (t.valor='Francia' AND n.id=md5('poc-fr-02')) + OR (t.valor='Amazon' AND n.id=md5('poc-en-05')) + OR (t.valor='España' AND n.id=md5('poc-es-02')) + OR (t.valor='energía solar' AND n.id=md5('poc-es-02')) + OR (t.valor='China' AND n.id=md5('poc-en-04')) +ON CONFLICT DO NOTHING; + +-- --------------------------------------------------------------------------- +-- NOTA: El usuario admin se crea en poc.sh con hash bcrypt generado en runtime +-- --------------------------------------------------------------------------- From d9ea78b8a78f13e2120fe501b5da344c409f9cd1 Mon Sep 17 00:00:00 2001 From: SITO Date: Tue, 31 Mar 2026 08:57:01 +0200 Subject: [PATCH 8/8] fix: revision completa de rutas Docker, logica SQL y configuracion Backend Go: - backend/cmd/server/main.go: ruta wiki_images configurable via WIKI_IMAGES_PATH - backend/cmd/wiki_worker/main.go: default /opt/rss2 en lugar de /app, leer env - workers/ctranslator_worker.py: default CT2_MODEL_PATH /opt/rss2 en lugar de /app - workers/llm_categorizer_worker.py: default LLM_MODEL_PATH /opt/rss2 - workers/{langdetect,simple_translator,translation_scheduler}.py: DB_HOST default 'localhost' en lugar de 'db' (hostname Docker) SQL / esquema: - poc/seed.sql: corregir logica de auto-traducciones ES (id LIKE md5() era incorrecto) - init-db/06-tags.sql: eliminar columna wiki_checked duplicada Documentacion y configuracion: - docs/DEPLOY_DEBIAN.md: usar ct2-transformers-converter (lo que usa el worker real) - deploy/debian/env.example: agregar WIKI_IMAGES_PATH - deploy/debian/systemd/rss2-cluster.service: agregar HF_HOME faltante - 
deploy/debian/install.sh: comparacion numerica correcta de version Go - scripts/generate_secure_credentials.sh: ruta CT2_MODEL_PATH corregida - frontend/nginx.conf: advertencia de que es configuracion Docker legacy - docs/QUICKSTART_LLM.md: nota de deprecacion Docker - README.md: renombrar backend-go a backend en diagrama Co-Authored-By: Claude Sonnet 4.6 --- README.md | 2 +- backend/cmd/server/main.go | 6 +++++- backend/cmd/wiki_worker/main.go | 5 ++++- deploy/debian/env.example | 1 + deploy/debian/install.sh | 12 ++++++++++- deploy/debian/systemd/rss2-cluster.service | 1 + docs/DEPLOY_DEBIAN.md | 23 ++++++++++++++-------- docs/QUICKSTART_LLM.md | 6 +++++- frontend/nginx.conf | 5 +++++ init-db/06-tags.sql | 1 - poc/seed.sql | 2 +- scripts/generate_secure_credentials.sh | 2 +- workers/ctranslator_worker.py | 2 +- workers/langdetect_worker.py | 2 +- workers/llm_categorizer_worker.py | 2 +- workers/simple_translator_worker.py | 2 +- workers/translation_scheduler.py | 2 +- 17 files changed, 55 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index f4a8ece..ea6ff06 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,7 @@ Internet (RSS/Atom) │ │ │ qdrant-worker ──→ Qdrant │ - backend-go (API REST :8080) + backend (API REST :8080) │ nginx (:8001) │ diff --git a/backend/cmd/server/main.go b/backend/cmd/server/main.go index cf13d80..4596b11 100644 --- a/backend/cmd/server/main.go +++ b/backend/cmd/server/main.go @@ -109,7 +109,11 @@ func main() { api := r.Group("/api") { // Serve static images downloaded by wiki_worker - api.StaticFS("/wiki-images", gin.Dir("/app/data/wiki_images", false)) + wikiImagesDir := os.Getenv("WIKI_IMAGES_PATH") + if wikiImagesDir == "" { + wikiImagesDir = "/opt/rss2/data/wiki_images" + } + api.StaticFS("/wiki-images", gin.Dir(wikiImagesDir, false)) api.POST("/auth/login", handlers.Login) api.POST("/auth/register", handlers.Register) diff --git a/backend/cmd/wiki_worker/main.go b/backend/cmd/wiki_worker/main.go index 
c58e07c..0065c94 100644 --- a/backend/cmd/wiki_worker/main.go +++ b/backend/cmd/wiki_worker/main.go @@ -24,7 +24,7 @@ var ( pool *pgxpool.Pool sleepInterval = 30 batchSize = 50 - imagesDir = "/app/data/wiki_images" + imagesDir = "/opt/rss2/data/wiki_images" ) type WikiSummary struct { @@ -210,6 +210,9 @@ func processTag(ctx context.Context, tag Tag) { } func main() { + if val := os.Getenv("WIKI_IMAGES_PATH"); val != "" { + imagesDir = val + } if val := os.Getenv("WIKI_SLEEP"); val != "" { if sleep, err := fmt.Sscanf(val, "%d", &sleepInterval); err == nil && sleep > 0 { sleepInterval = sleep diff --git a/deploy/debian/env.example b/deploy/debian/env.example index 406f928..728411b 100644 --- a/deploy/debian/env.example +++ b/deploy/debian/env.example @@ -81,6 +81,7 @@ MAX_FEEDS_PER_URL=5 # --- Wiki Worker --- WIKI_SLEEP=10 +WIKI_IMAGES_PATH=/opt/rss2/data/wiki_images # --- Topics --- TOPICS_SLEEP=10 diff --git a/deploy/debian/install.sh b/deploy/debian/install.sh index 128336f..76dfed5 100755 --- a/deploy/debian/install.sh +++ b/deploy/debian/install.sh @@ -33,7 +33,17 @@ apt-get install -y --no-install-recommends \ libpq-dev # Go (rss-ingestor-go requiere Go 1.25) -if ! command -v go &>/dev/null || [[ "$(go version | awk '{print $3}' | tr -d 'go')" < "1.25" ]]; then +_need_go=false +if ! command -v go &>/dev/null; then + _need_go=true +else + _gover=$(go version | awk '{print $3}' | tr -d 'go') + IFS='.' read -ra _gv <<< "$_gover" + if [[ "${_gv[0]:-0}" -lt 1 ]] || [[ "${_gv[0]:-0}" -eq 1 && "${_gv[1]:-0}" -lt 25 ]]; then + _need_go=true + fi +fi +if [[ "$_need_go" == "true" ]]; then info "Instalando Go 1.25..." 
GO_VERSION="1.25.0" ARCH=$(dpkg --print-architecture) diff --git a/deploy/debian/systemd/rss2-cluster.service b/deploy/debian/systemd/rss2-cluster.service index dd990fb..e39cdb6 100644 --- a/deploy/debian/systemd/rss2-cluster.service +++ b/deploy/debian/systemd/rss2-cluster.service @@ -11,6 +11,7 @@ WorkingDirectory=/opt/rss2/src EnvironmentFile=/opt/rss2/.env Environment=EVENT_DIST_THRESHOLD=0.35 Environment=EMB_MODEL=sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2 +Environment=HF_HOME=/opt/rss2/hf_cache ExecStart=/opt/rss2/venv/bin/python -m workers.cluster_worker Restart=always RestartSec=10 diff --git a/docs/DEPLOY_DEBIAN.md b/docs/DEPLOY_DEBIAN.md index 9835f0a..b3bad07 100644 --- a/docs/DEPLOY_DEBIAN.md +++ b/docs/DEPLOY_DEBIAN.md @@ -76,16 +76,23 @@ python3 -m venv /opt/rss2/venv /opt/rss2/venv/bin/pip install ctranslate2 transformers sentencepiece # Convertir modelo NLLB-200 a formato CTranslate2 (tarda 10-30 min) -/opt/rss2/venv/bin/python - <<'EOF' -from ctranslate2.converters import OpusMTConverter -converter = OpusMTConverter("facebook/nllb-200-distilled-600M") -converter.convert("/opt/rss2/models/nllb-ct2", quantization="int8", force=True) -print("Modelo convertido OK en /opt/rss2/models/nllb-ct2") -EOF +mkdir -p /opt/rss2/models/nllb-ct2 +HF_HOME=/opt/rss2/hf_cache \ +/opt/rss2/venv/bin/ct2-transformers-converter \ + --model facebook/nllb-200-distilled-600M \ + --output_dir /opt/rss2/models/nllb-ct2 \ + --quantization int8 \ + --force + +# Verificar que se generó correctamente +ls /opt/rss2/models/nllb-ct2/model.bin && echo "Modelo OK" ``` -> El modelo ocupa ~600 MB convertido. Si la descarga de HuggingFace falla, exporta -> `HF_ENDPOINT=https://huggingface.co` o usa un mirror. +> El modelo ocupa ~600 MB convertido. Si la descarga de HuggingFace falla: +> `export HF_ENDPOINT=https://huggingface.co` antes del comando de conversión. 
+ +> **Nota:** El worker convierte el modelo automáticamente si no lo encuentra, +> pero hacerlo a mano evita que el primer arranque tarde 30 minutos. ### 4. Ejecutar el instalador diff --git a/docs/QUICKSTART_LLM.md b/docs/QUICKSTART_LLM.md index 0baced3..61f7bb4 100644 --- a/docs/QUICKSTART_LLM.md +++ b/docs/QUICKSTART_LLM.md @@ -1,4 +1,8 @@ -# 🚀 Guía Rápida: Sistema LLM Categorizer +> **NOTA:** Esta guía está basada en la configuración Docker original. En el despliegue +> Debian nativo, el LLM categorizer se controla con `systemctl start rss2-categorizer` +> y el modelo se coloca en `/opt/rss2/models/llm` (var `LLM_MODEL_PATH`). + +# Guía Rápida: Sistema LLM Categorizer ## ✅ Estado Actual diff --git a/frontend/nginx.conf b/frontend/nginx.conf index 9331418..61a14ba 100644 --- a/frontend/nginx.conf +++ b/frontend/nginx.conf @@ -1,3 +1,8 @@ +# ============================================================================= +# NOTA: Este nginx.conf es la configuración del contenedor Docker del frontend. 
+# NO usar para despliegue nativo Debian — usar deploy/debian/nginx.conf +# ============================================================================= + events { worker_connections 1024; } diff --git a/init-db/06-tags.sql b/init-db/06-tags.sql index dfd8393..5613d97 100644 --- a/init-db/06-tags.sql +++ b/init-db/06-tags.sql @@ -14,4 +14,3 @@ ALTER TABLE tags ADD COLUMN IF NOT EXISTS wiki_summary TEXT; ALTER TABLE tags ADD COLUMN IF NOT EXISTS wiki_url TEXT; ALTER TABLE tags ADD COLUMN IF NOT EXISTS image_path TEXT; ALTER TABLE tags ADD COLUMN IF NOT EXISTS wiki_checked BOOLEAN DEFAULT FALSE; -ALTER TABLE tags ADD COLUMN IF NOT EXISTS wiki_checked BOOLEAN DEFAULT FALSE; diff --git a/poc/seed.sql b/poc/seed.sql index 15aeed3..906ea6d 100644 --- a/poc/seed.sql +++ b/poc/seed.sql @@ -189,7 +189,7 @@ ON CONFLICT (noticia_id, lang_to) DO NOTHING; -- Traducciones "self" para artículos en español (necesarias para que aparezcan en filtro translated_only) INSERT INTO traducciones (noticia_id,lang_from,lang_to,titulo_trad,resumen_trad,status,vectorized) SELECT id,'es','es',titulo,resumen,'done',false -FROM noticias WHERE lang='es' AND id LIKE md5('poc-es-%') +FROM noticias WHERE lang='es' ON CONFLICT (noticia_id, lang_to) DO NOTHING; -- --------------------------------------------------------------------------- diff --git a/scripts/generate_secure_credentials.sh b/scripts/generate_secure_credentials.sh index d345db7..4513a1a 100755 --- a/scripts/generate_secure_credentials.sh +++ b/scripts/generate_secure_credentials.sh @@ -129,7 +129,7 @@ URL_DISCOVERY_BATCH_SIZE=10 MAX_FEEDS_PER_URL=5 # CTranslate2 / AI Model Paths -CT2_MODEL_PATH=/app/models/nllb-ct2 +CT2_MODEL_PATH=/opt/rss2/models/nllb-ct2 CT2_DEVICE=cuda CT2_COMPUTE_TYPE=int8_float16 UNIVERSAL_MODEL=facebook/nllb-200-distilled-600M diff --git a/workers/ctranslator_worker.py b/workers/ctranslator_worker.py index 0dc1126..9b96958 100644 --- a/workers/ctranslator_worker.py +++ b/workers/ctranslator_worker.py @@ -62,7 
+62,7 @@ BATCH_SIZE = _env_int("TRANSLATOR_BATCH", 8) MAX_SRC_TOKENS = _env_int("MAX_SRC_TOKENS", 512) MAX_NEW_TOKENS = _env_int("MAX_NEW_TOKENS", 512) -CT2_MODEL_PATH = _env_str("CT2_MODEL_PATH", "/app/models/nllb-ct2") +CT2_MODEL_PATH = _env_str("CT2_MODEL_PATH", "/opt/rss2/models/nllb-ct2") CT2_DEVICE = _env_str("CT2_DEVICE", "cpu") CT2_COMPUTE_TYPE = _env_str("CT2_COMPUTE_TYPE", "int8") UNIVERSAL_MODEL = _env_str("UNIVERSAL_MODEL", "facebook/nllb-200-distilled-600M") diff --git a/workers/langdetect_worker.py b/workers/langdetect_worker.py index 3cc9572..01fbd9b 100644 --- a/workers/langdetect_worker.py +++ b/workers/langdetect_worker.py @@ -22,7 +22,7 @@ logging.basicConfig( LOG = logging.getLogger(__name__) DB_CONFIG = { - 'host': os.getenv('DB_HOST', 'db'), + 'host': os.getenv('DB_HOST', 'localhost'), 'port': int(os.getenv('DB_PORT', 5432)), 'database': os.getenv('DB_NAME', 'rss'), 'user': os.getenv('DB_USER', 'rss'), diff --git a/workers/llm_categorizer_worker.py b/workers/llm_categorizer_worker.py index 7f772c9..17ddc9e 100644 --- a/workers/llm_categorizer_worker.py +++ b/workers/llm_categorizer_worker.py @@ -41,7 +41,7 @@ DB_CONFIG = { # Configuración del worker BATCH_SIZE = int(os.environ.get("LLM_BATCH_SIZE", 10)) # 10 noticias por lote SLEEP_IDLE = int(os.environ.get("LLM_SLEEP_IDLE", 30)) # segundos -MODEL_PATH = os.environ.get("LLM_MODEL_PATH", "/app/models/llm") +MODEL_PATH = os.environ.get("LLM_MODEL_PATH", "/opt/rss2/models/llm") GPU_SPLIT = os.environ.get("LLM_GPU_SPLIT", "auto") MAX_SEQ_LEN = int(os.environ.get("LLM_MAX_SEQ_LEN", 4096)) CACHE_MODE = os.environ.get("LLM_CACHE_MODE", "FP16") diff --git a/workers/simple_translator_worker.py b/workers/simple_translator_worker.py index 0412e55..a779171 100644 --- a/workers/simple_translator_worker.py +++ b/workers/simple_translator_worker.py @@ -22,7 +22,7 @@ logging.basicConfig( logger = logging.getLogger(__name__) DB_CONFIG = { - 'host': os.getenv('DB_HOST', 'db'), + 'host': os.getenv('DB_HOST', 
'localhost'), 'port': int(os.getenv('DB_PORT', 5432)), 'database': os.getenv('DB_NAME', 'rss'), 'user': os.getenv('DB_USER', 'rss'), diff --git a/workers/translation_scheduler.py b/workers/translation_scheduler.py index 3fd5e2c..572f71a 100644 --- a/workers/translation_scheduler.py +++ b/workers/translation_scheduler.py @@ -22,7 +22,7 @@ logging.basicConfig( logger = logging.getLogger(__name__) DB_CONFIG = { - 'host': os.getenv('DB_HOST', 'db'), + 'host': os.getenv('DB_HOST', 'localhost'), 'port': int(os.getenv('DB_PORT', 5432)), 'database': os.getenv('DB_NAME', 'rss'), 'user': os.getenv('DB_USER', 'rss'),