From b3bf3d7a7fdce6d28b12c24f4307f135a9f411f6 Mon Sep 17 00:00:00 2001 From: SITO Date: Mon, 30 Mar 2026 22:29:50 +0200 Subject: [PATCH] refactor: reorganizar estructura de archivos en raiz MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Antes la raíz tenía 20+ archivos sueltos. Ahora organizado en: docs/ 10 archivos .md de documentación técnica scripts/ 3 scripts utilitarios (credentials, migrate, verify) config/ entity_config.json (aliases y blacklist NER) data/ feeds.csv (feeds precargados) Eliminados restos de Docker que ya no aplican: .dockerignore, .env.example, .env.secure.example, nginx.conf (raíz) Makefile: eliminados targets docker-build, añadidos install/rebuild/check/poc Referencias actualizadas en: deploy/debian/install.sh entity_config.json -> config/entity_config.json deploy/debian/build.sh entity_config.json -> config/entity_config.json README.md links a docs/ y data/ actualizados, árbol de estructura del repo reescrito Co-Authored-By: Claude Sonnet 4.6 --- .dockerignore | 13 -- .env.example | 67 ---------- .env.secure.example | 117 ------------------ Makefile | 33 +++-- README.md | 63 +++++++--- .../entity_config.json | 0 feeds.csv => data/feeds.csv | 0 deploy/debian/build.sh | 2 +- deploy/debian/install.sh | 2 +- DEPLOY.md => docs/DEPLOY.md | 0 DEPLOY_DEBIAN.md => docs/DEPLOY_DEBIAN.md | 0 .../FUNCIONES_DE_ARCHIVOS.md | 0 .../IMPLEMENTACION_LLM_RESUMEN.md | 0 .../NEWSPAPER_STYLE_GUIDE.md | 0 QDRANT_SETUP.md => docs/QDRANT_SETUP.md | 0 QUICKSTART_LLM.md => docs/QUICKSTART_LLM.md | 0 SECURITY_AUDIT.md => docs/SECURITY_AUDIT.md | 0 SECURITY_GUIDE.md => docs/SECURITY_GUIDE.md | 0 .../TRANSLATION_FIX_SUMMARY.md | 0 nginx.conf | 97 --------------- .../generate_secure_credentials.sh | 0 .../migrate_to_secure.sh | 0 .../verify_security.sh | 0 23 files changed, 59 insertions(+), 335 deletions(-) delete mode 100644 .dockerignore delete mode 100644 .env.example delete mode 100644 .env.secure.example rename 
entity_config.json => config/entity_config.json (100%) rename feeds.csv => data/feeds.csv (100%) rename DEPLOY.md => docs/DEPLOY.md (100%) rename DEPLOY_DEBIAN.md => docs/DEPLOY_DEBIAN.md (100%) rename FUNCIONES_DE_ARCHIVOS.md => docs/FUNCIONES_DE_ARCHIVOS.md (100%) rename IMPLEMENTACION_LLM_RESUMEN.md => docs/IMPLEMENTACION_LLM_RESUMEN.md (100%) rename NEWSPAPER_STYLE_GUIDE.md => docs/NEWSPAPER_STYLE_GUIDE.md (100%) rename QDRANT_SETUP.md => docs/QDRANT_SETUP.md (100%) rename QUICKSTART_LLM.md => docs/QUICKSTART_LLM.md (100%) rename SECURITY_AUDIT.md => docs/SECURITY_AUDIT.md (100%) rename SECURITY_GUIDE.md => docs/SECURITY_GUIDE.md (100%) rename TRANSLATION_FIX_SUMMARY.md => docs/TRANSLATION_FIX_SUMMARY.md (100%) delete mode 100644 nginx.conf rename generate_secure_credentials.sh => scripts/generate_secure_credentials.sh (100%) rename migrate_to_secure.sh => scripts/migrate_to_secure.sh (100%) rename verify_security.sh => scripts/verify_security.sh (100%) diff --git a/.dockerignore b/.dockerignore deleted file mode 100644 index b472c77..0000000 --- a/.dockerignore +++ /dev/null @@ -1,13 +0,0 @@ -.git -pgdata -pgdata-replica -pgdata-replica.old.* -pgdata.failed_restore -redis-data -hf_cache -qdrant_storage - -venv -__pycache__ -*.pyc -*.log diff --git a/.env.example b/.env.example deleted file mode 100644 index 4f5ab34..0000000 --- a/.env.example +++ /dev/null @@ -1,67 +0,0 @@ -# Database Configuration -POSTGRES_DB=rss -POSTGRES_USER=rss -POSTGRES_PASSWORD=change_this_password -DB_NAME=rss -DB_USER=rss -DB_PASS=change_this_password -DB_HOST=db -DB_PORT=5432 -DB_WRITE_HOST=db -DB_READ_HOST=db-replica - -# Redis Configuration -REDIS_HOST=redis -REDIS_PORT=6379 - -# Application Secrets -SECRET_KEY=change_this_to_a_long_random_string - -# External Services -ALLTALK_URL=http://host.docker.internal:7851 - -# AI Models & Workers -RSS_MAX_WORKERS=3 -# Translation Pipeline -TARGET_LANGS=es -TRANSLATOR_BATCH=16 -SCHEDULER_BATCH=2000 -SCHEDULER_SLEEP=30 
-LANG_DETECT_BATCH=1000 -LANG_DETECT_SLEEP=60 - -# RSS Ingestor Configuration -RSS_POKE_INTERVAL_MIN=15 -RSS_MAX_FAILURES=10 -RSS_FEED_TIMEOUT=60 - -# URL Feed Discovery Worker -URL_DISCOVERY_INTERVAL_MIN=15 -URL_DISCOVERY_BATCH_SIZE=10 -MAX_FEEDS_PER_URL=5 - -# CTranslate2 / AI Model Paths -CT2_MODEL_PATH=/app/models/nllb-ct2 -CT2_DEVICE=cuda -CT2_COMPUTE_TYPE=int8_float16 -UNIVERSAL_MODEL=facebook/nllb-200-distilled-600M - -# Embeddings -EMB_MODEL=sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2 -EMB_BATCH=64 -EMB_DEVICE=cuda - -# NER -NER_LANG=es -NER_BATCH=64 - -# Flask / Gunicorn -GUNICORN_WORKERS=8 -FLASK_DEBUG=0 - -# Qdrant Configuration -QDRANT_HOST=qdrant -QDRANT_PORT=6333 -QDRANT_COLLECTION_NAME=news_vectors -QDRANT_BATCH_SIZE=100 -QDRANT_SLEEP_IDLE=30 diff --git a/.env.secure.example b/.env.secure.example deleted file mode 100644 index 68b84cc..0000000 --- a/.env.secure.example +++ /dev/null @@ -1,117 +0,0 @@ -# ================================================================================== -# SEGURIDAD: CONFIGURACIÓN DE PRODUCCIÓN -# ================================================================================== -# -# IMPORTANTE: -# 1. Copia este archivo a .env -# 2. Cambia TODOS los valores de contraseñas y secrets -# 3. NO compartas este archivo en repositorios públicos -# 4. 
Añade .env al .gitignore -# -# ================================================================================== - -# ================================================================================== -# DATABASE CONFIGURATION - PostgreSQL -# ================================================================================== -POSTGRES_DB=rss -POSTGRES_USER=rss -# CRÍTICO: Genera una contraseña fuerte (mínimo 32 caracteres aleatorios) -# Ejemplo para generar: openssl rand -base64 32 -POSTGRES_PASSWORD=CAMBIAR_ESTO_POR_UNA_CONTRASEÑA_FUERTE_DE_32_CARACTERES - -DB_NAME=rss -DB_USER=rss -DB_PASS=CAMBIAR_ESTO_POR_UNA_CONTRASEÑA_FUERTE_DE_32_CARACTERES -DB_HOST=db -DB_PORT=5432 -DB_WRITE_HOST=db -DB_READ_HOST=db-replica - -# ================================================================================== -# REDIS CONFIGURATION - Autenticación habilitada -# ================================================================================== -REDIS_HOST=redis -REDIS_PORT=6379 -# CRÍTICO: Genera una contraseña fuerte para Redis -# Ejemplo: openssl rand -base64 32 -REDIS_PASSWORD=CAMBIAR_ESTO_POR_UNA_CONTRASEÑA_FUERTE_REDIS - -# ================================================================================== -# APPLICATION SECRETS -# ================================================================================== -# CRÍTICO: Secret key para Flask - debe ser único y secreto -# Genera con: python -c "import secrets; print(secrets.token_hex(32))" -SECRET_KEY=CAMBIAR_ESTO_POR_UN_TOKEN_HEX_DE_64_CARACTERES - -# ================================================================================== -# MONITORING - Grafana -# ================================================================================== -# IMPORTANTE: Cambia el password de admin de Grafana -GRAFANA_PASSWORD=CAMBIAR_ESTO_POR_UNA_CONTRASEÑA_FUERTE_GRAFANA - -# ================================================================================== -# EXTERNAL SERVICES -# 
================================================================================== -ALLTALK_URL=http://host.docker.internal:7851 - -# ================================================================================== -# AI MODELS & WORKERS -# ================================================================================== -RSS_MAX_WORKERS=3 -TARGET_LANGS=es -TRANSLATOR_BATCH=128 -ENQUEUE=300 - -# RSS Ingestor Configuration -RSS_POKE_INTERVAL_MIN=15 -RSS_MAX_FAILURES=10 -RSS_FEED_TIMEOUT=60 - -# URL Feed Discovery Worker -URL_DISCOVERY_INTERVAL_MIN=15 -URL_DISCOVERY_BATCH_SIZE=10 -MAX_FEEDS_PER_URL=5 - -# CTranslate2 / AI Model Paths -CT2_MODEL_PATH=/app/models/nllb-ct2 -CT2_DEVICE=cuda -CT2_COMPUTE_TYPE=int8_float16 -UNIVERSAL_MODEL=facebook/nllb-200-distilled-600M - -# Embeddings -EMB_MODEL=sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2 -EMB_BATCH=64 -EMB_DEVICE=cuda - -# NER -NER_LANG=es -NER_BATCH=64 - -# Flask / Gunicorn -GUNICORN_WORKERS=8 -FLASK_DEBUG=0 - -# Qdrant Configuration -QDRANT_HOST=qdrant -QDRANT_PORT=6333 -QDRANT_COLLECTION_NAME=news_vectors -QDRANT_BATCH_SIZE=100 -QDRANT_SLEEP_IDLE=30 - -# ================================================================================== -# COMANDOS ÚTILES PARA GENERAR CONTRASEÑAS SEGURAS -# ================================================================================== -# -# PostgreSQL Password (32 caracteres): -# openssl rand -base64 32 -# -# Redis Password (32 caracteres): -# openssl rand -base64 32 -# -# Flask Secret Key (64 hex chars): -# python -c "import secrets; print(secrets.token_hex(32))" -# -# Grafana Password (fuerte): -# openssl rand -base64 24 -# -# ================================================================================== diff --git a/Makefile b/Makefile index 5d463f2..3d7a6fb 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ # RSS2 Workers Makefile -.PHONY: all build clean deps ingestor scraper discovery topics related qdrant server +.PHONY: all build clean deps 
ingestor scraper discovery topics related qdrant server install rebuild check poc # Binary output directory BIN_DIR := bin @@ -69,21 +69,16 @@ run-qdrant: DB_HOST=localhost DB_PORT=5432 DB_NAME=rss DB_USER=rss DB_PASS=rss \ QDRANT_HOST=localhost QDRANT_PORT=6333 OLLAMA_URL=http://localhost:11434 $(QDRANT) -# Docker builds -docker-build: - docker build -t rss2-ingestor -f rss-ingestor-go/Dockerfile ./rss-ingestor-go - docker build -t rss2-server -f backend/Dockerfile ./backend - docker build -t rss2-scraper -f Dockerfile.scraper ./backend - docker build -t rss2-discovery -f Dockerfile.discovery ./backend - docker build -t rss2-topics -f Dockerfile.topics ./backend - docker build -t rss2-related -f Dockerfile.related ./backend - docker build -t rss2-qdrant -f Dockerfile.qdrant ./backend - docker build -t rss2-langdetect -f Dockerfile . - docker build -t rss2-scheduler -f Dockerfile.scheduler . - docker build -t rss2-translator -f Dockerfile.translator . - docker build -t rss2-translator-gpu -f Dockerfile.translator-gpu . - docker build -t rss2-embeddings -f Dockerfile.embeddings_worker . - docker build -t rss2-ner -f Dockerfile . - docker build -t rss2-llm-categorizer -f Dockerfile.llm_worker . - docker build -t rss2-frontend -f frontend/Dockerfile ./frontend - docker build -t rss2-nginx -f Dockerfile.nginx . 
+# Despliegue en Debian (sin Docker) +install: + sudo bash deploy/debian/prerequisites.sh + sudo bash deploy/debian/install.sh + +rebuild: + sudo bash deploy/debian/build.sh + +check: + bash deploy/debian/check.sh + +poc: + bash poc/poc.sh diff --git a/README.md b/README.md index d70ea43..8f18314 100644 --- a/README.md +++ b/README.md @@ -109,7 +109,7 @@ Compila los binarios Go, el frontend React, crea los servicios systemd y arranca http://IP_DEL_SERVIDOR:8001 ``` -Guía completa: [DEPLOY_DEBIAN.md](DEPLOY_DEBIAN.md) +Guía completa: [docs/DEPLOY_DEBIAN.md](docs/DEPLOY_DEBIAN.md) --- @@ -155,23 +155,46 @@ sudo bash deploy/debian/build.sh ## Estructura del repositorio ``` -├── backend/ Go — API REST + workers (scraper, discovery, wiki, topics, related, qdrant) -├── rss-ingestor-go/ Go — Ingestor de feeds RSS -├── frontend/ React + TypeScript + Tailwind -├── workers/ Python — ML workers (traducción, embeddings, NER, cluster, categorización) -├── init-db/ SQL — Schema y datos iniciales -├── migrations/ SQL — Migraciones incrementales -├── deploy/debian/ Scripts de despliegue para Debian sin Docker -│ ├── prerequisites.sh Instala todas las dependencias del sistema -│ ├── install.sh Instalación completa -│ ├── build.sh Recompila y reinicia tras actualizar código -│ ├── env.example Plantilla de variables de entorno -│ ├── nginx.conf Configuración nginx para despliegue nativo -│ └── systemd/ Ficheros de servicio systemd (16 servicios) -├── poc/ -│ ├── poc.sh POC local con datos de prueba (sin Docker, sin ML) -│ └── seed.sql Datos de muestra para el POC -├── feeds.csv Feeds RSS precargados para importar desde el admin -├── entity_config.json Aliases y blacklist para normalización de entidades NER -└── DEPLOY_DEBIAN.md Guía detallada de despliegue en Debian +├── README.md +├── requirements.txt Dependencias Python para workers ML +├── Makefile Compilación local de binarios Go +│ +├── backend/ Go — API REST + workers (scraper, discovery, wiki, topics, related, qdrant) +├── 
rss-ingestor-go/ Go — Ingestor de feeds RSS +├── frontend/ React + TypeScript + Tailwind +├── workers/ Python — Workers ML (traducción, embeddings, NER, cluster, categorización) +│ +├── init-db/ SQL — Schema completo y datos iniciales +├── migrations/ SQL — Migraciones incrementales +│ +├── config/ +│ └── entity_config.json Aliases y blacklist para normalización de entidades NER +│ +├── data/ +│ └── feeds.csv Feeds RSS precargados para importar desde el admin +│ +├── docs/ +│ ├── DEPLOY_DEBIAN.md Guía detallada de despliegue en Debian +│ ├── SECURITY_GUIDE.md Guía de seguridad +│ ├── SECURITY_AUDIT.md Resultado del audit de seguridad +│ ├── QDRANT_SETUP.md Configuración de Qdrant +│ └── ... Resto de documentación técnica +│ +├── scripts/ +│ ├── generate_secure_credentials.sh +│ ├── migrate_to_secure.sh +│ └── verify_security.sh +│ +├── deploy/debian/ Despliegue nativo en Debian (sin Docker) +│ ├── prerequisites.sh Instala todas las dependencias del sistema +│ ├── install.sh Instalación completa +│ ├── build.sh Recompila y reinicia tras actualizar código +│ ├── check.sh Diagnóstico del sistema +│ ├── env.example Plantilla de variables de entorno +│ ├── nginx.conf Configuración nginx +│ └── systemd/ 16 ficheros de servicio systemd +│ +└── poc/ + ├── poc.sh POC local en 2 minutos (sin Docker, sin ML) + └── seed.sql 10 noticias de muestra en español ``` diff --git a/entity_config.json b/config/entity_config.json similarity index 100% rename from entity_config.json rename to config/entity_config.json diff --git a/feeds.csv b/data/feeds.csv similarity index 100% rename from feeds.csv rename to data/feeds.csv diff --git a/deploy/debian/build.sh b/deploy/debian/build.sh index 6dd81b2..43efd5b 100755 --- a/deploy/debian/build.sh +++ b/deploy/debian/build.sh @@ -55,7 +55,7 @@ fi # --- Workers Python --- info "Sincronizando workers Python..." 
rsync -a --delete "$REPO_ROOT/workers/" "$RSS2_HOME/src/workers/" -cp "$REPO_ROOT/entity_config.json" "$RSS2_HOME/src/" 2>/dev/null || true +cp "$REPO_ROOT/config/entity_config.json" "$RSS2_HOME/src/" 2>/dev/null || true info " [OK] workers Python" chown -R rss2:rss2 "$RSS2_HOME/bin" "$RSS2_HOME/frontend/dist" "$RSS2_HOME/src" diff --git a/deploy/debian/install.sh b/deploy/debian/install.sh index 9627bbc..128336f 100755 --- a/deploy/debian/install.sh +++ b/deploy/debian/install.sh @@ -172,7 +172,7 @@ fi # Copiar workers Python al directorio de trabajo info "Copiando workers Python..." rsync -a --delete "$REPO_ROOT/workers/" "$RSS2_HOME/src/workers/" -cp "$REPO_ROOT/entity_config.json" "$RSS2_HOME/src/" 2>/dev/null || true +cp "$REPO_ROOT/config/entity_config.json" "$RSS2_HOME/src/" 2>/dev/null || true # ============================================================================= # 7. COMPILAR GO (backend + workers) diff --git a/DEPLOY.md b/docs/DEPLOY.md similarity index 100% rename from DEPLOY.md rename to docs/DEPLOY.md diff --git a/DEPLOY_DEBIAN.md b/docs/DEPLOY_DEBIAN.md similarity index 100% rename from DEPLOY_DEBIAN.md rename to docs/DEPLOY_DEBIAN.md diff --git a/FUNCIONES_DE_ARCHIVOS.md b/docs/FUNCIONES_DE_ARCHIVOS.md similarity index 100% rename from FUNCIONES_DE_ARCHIVOS.md rename to docs/FUNCIONES_DE_ARCHIVOS.md diff --git a/IMPLEMENTACION_LLM_RESUMEN.md b/docs/IMPLEMENTACION_LLM_RESUMEN.md similarity index 100% rename from IMPLEMENTACION_LLM_RESUMEN.md rename to docs/IMPLEMENTACION_LLM_RESUMEN.md diff --git a/NEWSPAPER_STYLE_GUIDE.md b/docs/NEWSPAPER_STYLE_GUIDE.md similarity index 100% rename from NEWSPAPER_STYLE_GUIDE.md rename to docs/NEWSPAPER_STYLE_GUIDE.md diff --git a/QDRANT_SETUP.md b/docs/QDRANT_SETUP.md similarity index 100% rename from QDRANT_SETUP.md rename to docs/QDRANT_SETUP.md diff --git a/QUICKSTART_LLM.md b/docs/QUICKSTART_LLM.md similarity index 100% rename from QUICKSTART_LLM.md rename to docs/QUICKSTART_LLM.md diff --git 
a/SECURITY_AUDIT.md b/docs/SECURITY_AUDIT.md similarity index 100% rename from SECURITY_AUDIT.md rename to docs/SECURITY_AUDIT.md diff --git a/SECURITY_GUIDE.md b/docs/SECURITY_GUIDE.md similarity index 100% rename from SECURITY_GUIDE.md rename to docs/SECURITY_GUIDE.md diff --git a/TRANSLATION_FIX_SUMMARY.md b/docs/TRANSLATION_FIX_SUMMARY.md similarity index 100% rename from TRANSLATION_FIX_SUMMARY.md rename to docs/TRANSLATION_FIX_SUMMARY.md diff --git a/nginx.conf b/nginx.conf deleted file mode 100644 index af65c19..0000000 --- a/nginx.conf +++ /dev/null @@ -1,97 +0,0 @@ -user nginx; -worker_processes auto; -error_log /var/log/nginx/error.log warn; -pid /var/run/nginx.pid; - -events { - worker_connections 2048; - use epoll; -} - -http { - include /etc/nginx/mime.types; - default_type application/octet-stream; - - log_format main '$remote_addr - $remote_user [$time_local] "$request" ' - '$status $body_bytes_sent "$http_referer" ' - '"$http_user_agent" "$http_x_forwarded_for"'; - - access_log /var/log/nginx/access.log main; - - sendfile on; - tcp_nopush on; - tcp_nodelay on; - keepalive_timeout 65; - types_hash_max_size 2048; - client_max_body_size 100M; - - gzip on; - gzip_vary on; - gzip_proxied any; - gzip_comp_level 6; - gzip_types text/plain text/css text/javascript - application/json application/javascript - application/xml text/xml; - - # Upstream for Go API - upstream api_backend { - server backend-go:8080; - keepalive 32; - } - - # Upstream for React Frontend - upstream frontend { - server rss2_frontend:80; - keepalive 16; - } - - server { - listen 80; - server_name _; - - client_body_timeout 60s; - client_header_timeout 60s; - send_timeout 300s; - - # Serve React Frontend - location / { - proxy_pass http://frontend; - proxy_http_version 1.1; - proxy_set_header Upgrade $http_upgrade; - proxy_set_header Connection 'upgrade'; - proxy_set_header Host $host; - proxy_set_header X-Real-IP $remote_addr; - proxy_set_header X-Forwarded-For 
$proxy_add_x_forwarded_for; - proxy_set_header X-Forwarded-Proto $scheme; - proxy_cache_bypass $http_upgrade; - } - - # Proxy to Go API - location /api/ { - proxy_pass http://api_backend/api/; - proxy_http_version 1.1; - proxy_set_header Host $host; - proxy_set_header X-Real-IP $remote_addr; - proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; - proxy_set_header X-Forwarded-Proto $scheme; - proxy_set_header Connection ""; - - proxy_connect_timeout 60s; - proxy_send_timeout 300s; - proxy_read_timeout 300s; - } - - # Health check - location /health { - access_log off; - return 200 "ok"; - } - - # Block sensitive files - location ~ /\. { - deny all; - access_log off; - log_not_found off; - } - } -} diff --git a/generate_secure_credentials.sh b/scripts/generate_secure_credentials.sh similarity index 100% rename from generate_secure_credentials.sh rename to scripts/generate_secure_credentials.sh diff --git a/migrate_to_secure.sh b/scripts/migrate_to_secure.sh similarity index 100% rename from migrate_to_secure.sh rename to scripts/migrate_to_secure.sh diff --git a/verify_security.sh b/scripts/verify_security.sh similarity index 100% rename from verify_security.sh rename to scripts/verify_security.sh