Initial clean commit

jlimolina 2026-01-13 13:39:51 +01:00
commit 6784d81c2c
141 changed files with 25219 additions and 0 deletions

10
.dockerignore Normal file

@@ -0,0 +1,10 @@
.git
pgdata
pgdata-replica
redis-data
hf_cache
venv
__pycache__
*.pyc
*.log

63
.env.example Normal file

@@ -0,0 +1,63 @@
# Database Configuration
POSTGRES_DB=rss
POSTGRES_USER=rss
POSTGRES_PASSWORD=change_this_password
DB_NAME=rss
DB_USER=rss
DB_PASS=change_this_password
DB_HOST=db
DB_PORT=5432
DB_WRITE_HOST=db
DB_READ_HOST=db-replica
# Redis Configuration
REDIS_HOST=redis
REDIS_PORT=6379
# Application Secrets
SECRET_KEY=change_this_to_a_long_random_string
# External Services
ALLTALK_URL=http://host.docker.internal:7851
# AI Models & Workers
RSS_MAX_WORKERS=3
TARGET_LANGS=es
TRANSLATOR_BATCH=128
ENQUEUE=300
# RSS Ingestor Configuration
RSS_POKE_INTERVAL_MIN=15
RSS_MAX_FAILURES=10
RSS_FEED_TIMEOUT=60
# URL Feed Discovery Worker
URL_DISCOVERY_INTERVAL_MIN=15
URL_DISCOVERY_BATCH_SIZE=10
MAX_FEEDS_PER_URL=5
# CTranslate2 / AI Model Paths
CT2_MODEL_PATH=/app/models/nllb-ct2
CT2_DEVICE=cuda
CT2_COMPUTE_TYPE=int8_float16
UNIVERSAL_MODEL=facebook/nllb-200-distilled-600M
# Embeddings
EMB_MODEL=sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2
EMB_BATCH=64
EMB_DEVICE=cuda
# NER
NER_LANG=es
NER_BATCH=64
# Flask / Gunicorn
GUNICORN_WORKERS=8
FLASK_DEBUG=0
# Qdrant Configuration
QDRANT_HOST=qdrant
QDRANT_PORT=6333
QDRANT_COLLECTION_NAME=news_vectors
QDRANT_BATCH_SIZE=100
QDRANT_SLEEP_IDLE=30

117
.env.secure.example Normal file

@@ -0,0 +1,117 @@
# ==================================================================================
# SECURITY: PRODUCTION CONFIGURATION
# ==================================================================================
#
# IMPORTANT:
# 1. Copy this file to .env
# 2. Change ALL password and secret values
# 3. Do NOT share this file in public repositories
# 4. Add .env to .gitignore
#
# ==================================================================================
# ==================================================================================
# DATABASE CONFIGURATION - PostgreSQL
# ==================================================================================
POSTGRES_DB=rss
POSTGRES_USER=rss
# CRITICAL: generate a strong password (at least 32 random characters)
# Example generator: openssl rand -base64 32
POSTGRES_PASSWORD=CHANGE_THIS_TO_A_STRONG_32_CHARACTER_PASSWORD
DB_NAME=rss
DB_USER=rss
DB_PASS=CHANGE_THIS_TO_A_STRONG_32_CHARACTER_PASSWORD
DB_HOST=db
DB_PORT=5432
DB_WRITE_HOST=db
DB_READ_HOST=db-replica
# ==================================================================================
# REDIS CONFIGURATION - Authentication enabled
# ==================================================================================
REDIS_HOST=redis
REDIS_PORT=6379
# CRITICAL: generate a strong password for Redis
# Example: openssl rand -base64 32
REDIS_PASSWORD=CHANGE_THIS_TO_A_STRONG_REDIS_PASSWORD
# ==================================================================================
# APPLICATION SECRETS
# ==================================================================================
# CRITICAL: Flask secret key - must be unique and kept secret
# Generate with: python -c "import secrets; print(secrets.token_hex(32))"
SECRET_KEY=CHANGE_THIS_TO_A_64_CHARACTER_HEX_TOKEN
# ==================================================================================
# MONITORING - Grafana
# ==================================================================================
# IMPORTANT: change the Grafana admin password
GRAFANA_PASSWORD=CHANGE_THIS_TO_A_STRONG_GRAFANA_PASSWORD
# ==================================================================================
# EXTERNAL SERVICES
# ==================================================================================
ALLTALK_URL=http://host.docker.internal:7851
# ==================================================================================
# AI MODELS & WORKERS
# ==================================================================================
RSS_MAX_WORKERS=3
TARGET_LANGS=es
TRANSLATOR_BATCH=128
ENQUEUE=300
# RSS Ingestor Configuration
RSS_POKE_INTERVAL_MIN=15
RSS_MAX_FAILURES=10
RSS_FEED_TIMEOUT=60
# URL Feed Discovery Worker
URL_DISCOVERY_INTERVAL_MIN=15
URL_DISCOVERY_BATCH_SIZE=10
MAX_FEEDS_PER_URL=5
# CTranslate2 / AI Model Paths
CT2_MODEL_PATH=/app/models/nllb-ct2
CT2_DEVICE=cuda
CT2_COMPUTE_TYPE=int8_float16
UNIVERSAL_MODEL=facebook/nllb-200-distilled-600M
# Embeddings
EMB_MODEL=sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2
EMB_BATCH=64
EMB_DEVICE=cuda
# NER
NER_LANG=es
NER_BATCH=64
# Flask / Gunicorn
GUNICORN_WORKERS=8
FLASK_DEBUG=0
# Qdrant Configuration
QDRANT_HOST=qdrant
QDRANT_PORT=6333
QDRANT_COLLECTION_NAME=news_vectors
QDRANT_BATCH_SIZE=100
QDRANT_SLEEP_IDLE=30
# ==================================================================================
# USEFUL COMMANDS FOR GENERATING SECURE CREDENTIALS
# ==================================================================================
#
# PostgreSQL password (32 characters):
#   openssl rand -base64 32
#
# Redis password (32 characters):
#   openssl rand -base64 32
#
# Flask secret key (64 hex chars):
#   python -c "import secrets; print(secrets.token_hex(32))"
#
# Grafana password (strong):
#   openssl rand -base64 24
#
# ==================================================================================

64
.gitignore vendored Normal file

@@ -0,0 +1,64 @@
__pycache__/
*.pyc
*.pyo
*.pyd
.Python
*.so
*.egg
*.egg-info/
dist/
build/
venv/
.env
.vscode/
.idea/
*.swp
*.swo
*~
.DS_Store
pgdata/
pgdata-replica/
pgdata.failed_restore/
pgdata-replica.old.*/
redis-data/
hf_cache/
models/nllb-ct2/
qdrant_storage/
*.log
*.db
*.sqlite
*.sqlite3
data/
*.mp4
*.mp3
*.wav
*.srt
*.tar.gz
*.zip
*.bak
*.old
# ==================================================================================
# SECURITY FILES - NEVER COMMIT THESE
# ==================================================================================
# Environment files with credentials
.env
.env.backup*
.env.generated
.env.local
.env.*.local
# Database backups
*.sql
backup_*.sql
# Redis backups
*.rdb
redis_backup_*.rdb
# Qdrant backups
qdrant_backup_*.tar.gz
# Docker compose with real credentials (if you create variations)
docker-compose.override.yml

54
DEPLOY.md Normal file

@@ -0,0 +1,54 @@
# Deployment Guide
This guide describes how to deploy the application to a new server.
## Prerequisites
* **Linux Server** (Ubuntu 22.04+ recommended)
* **NVIDIA GPU**: Required for translation, embeddings, and NER services.
* **NVIDIA Container Toolkit**: Must be installed to allow Docker to access the GPU.
* **Docker** & **Docker Compose**: Latest versions.
* **Git**: To clone the repository.
* **External Service**: An instance of [AllTalk](https://github.com/erew123/alltalk_tts) running externally or on the host (port 7851 by default).
## Deployment Steps
1. **Clone the Repository**
   ```bash
   git clone <your-repo-url>
   cd <your-repo-name>
   ```
2. **Configure Environment Variables**
   Copy the example configuration file:
   ```bash
   cp .env.example .env
   ```
   Edit `.env` and set secure passwords and configuration:
   ```bash
   nano .env
   ```
   * Change `POSTGRES_PASSWORD` and `DB_PASS` to a strong, unique password.
   * Change `SECRET_KEY` to a long random string.
   * Verify `ALLTALK_URL` points to your AllTalk instance (the default assumes host-machine access).
3. **Start the Services**
   Run the following command to build and start the application:
   ```bash
   docker compose up -d --build
   ```
4. **Database Initialization**
   The database initializes automatically on the first run using the scripts in `init-db/`. This may take a few minutes. Check the logs with:
   ```bash
   docker compose logs -f db
   ```
5. **Verify Deployment**
   Access the application at `http://<your-server-ip>:8001`.
## Important Notes
* **Models**: The application mounts `./models` and `./hf_cache` to persist AI models. On the first run, it will attempt to download necessary models (NLLB, BERT, etc.), which requires significant bandwidth and time.
* **Data Persistence**: Database data is stored in `./pgdata` (mapped in docker-compose). Ensure this directory is backed up.
* **Security**: Ensure ports 5432 (PostgreSQL) and 6379 (Redis) are firewall-protected and not exposed to the public internet unless intended (they are only reachable externally if Docker maps them onto the host network).

67
Dockerfile Normal file

@@ -0,0 +1,67 @@
FROM python:3.11-slim
# CUDA or CPU
ARG TORCH_CUDA=cu121
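# Build for CPU-only hosts by overriding the arg (any value other than
# "cu121" selects the CPU wheels below), e.g.:
#   docker build --build-arg TORCH_CUDA=cpu -t rss2 .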
WORKDIR /app
# --------------------------------------------------------
# System dependencies
# --------------------------------------------------------
RUN apt-get update && apt-get install -y --no-install-recommends \
    libpq-dev \
    gcc \
    git \
    libcairo2 \
    libpango-1.0-0 \
    libpangocairo-1.0-0 \
    libgdk-pixbuf-2.0-0 \
    libffi-dev \
    shared-mime-info \
    && rm -rf /var/lib/apt/lists/*
ENV PYTHONUNBUFFERED=1 \
    PIP_DISABLE_PIP_VERSION_CHECK=1 \
    TOKENIZERS_PARALLELISM=false \
    HF_HUB_DISABLE_SYMLINKS_WARNING=1 \
    HF_HOME=/root/.cache/huggingface
# --------------------------------------------------------
# Install requirements
# --------------------------------------------------------
COPY requirements.txt .
RUN python -m pip install --no-cache-dir --upgrade pip setuptools wheel
# Install PyTorch for GPU or CPU
RUN if [ "$TORCH_CUDA" = "cu121" ]; then \
        pip install --no-cache-dir --index-url https://download.pytorch.org/whl/cu121 \
            torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1 ; \
    else \
        pip install --no-cache-dir --index-url https://download.pytorch.org/whl/cpu \
            torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1 ; \
    fi
RUN pip install --no-cache-dir -r requirements.txt
# Install ctranslate2 (the same wheel serves both the CUDA and CPU cases,
# so no branching on TORCH_CUDA is needed here)
RUN pip install --no-cache-dir ctranslate2
# Download the Spanish spaCy model (best effort)
RUN python -m spacy download es_core_news_md || true
# --------------------------------------------------------
# Copy the ENTIRE rss2/ project
# --------------------------------------------------------
COPY . .
# --------------------------------------------------------
# May pre-download NLLB or Sentence-BERT models if the script exists
# --------------------------------------------------------
RUN python download_models.py || true
EXPOSE 8000

12
Dockerfile.replica Normal file

@@ -0,0 +1,12 @@
FROM postgres:18-alpine
# Copy initialization script
COPY init-replica/init-replica.sh /docker-entrypoint-initdb.d/
# Make script executable
RUN chmod +x /docker-entrypoint-initdb.d/init-replica.sh
# Set environment for replication
ENV PRIMARY_HOST=db
ENV REPLICATION_USER=replicator
ENV REPLICATION_PASSWORD=replica_password

29
Dockerfile.url_worker Normal file

@@ -0,0 +1,29 @@
FROM python:3.10-slim
WORKDIR /app
# Install system dependencies for lxml and general build
RUN apt-get update && apt-get install -y \
    gcc \
    libxml2-dev \
    libxslt-dev \
    python3-dev \
    && rm -rf /var/lib/apt/lists/*
# Install python dependencies
RUN pip install --no-cache-dir \
    psycopg2-binary \
    requests \
    newspaper3k \
    lxml_html_clean \
    python-dotenv
# Copy application code
COPY . /app
# Set environment
ENV PYTHONPATH=/app
ENV PYTHONUNBUFFERED=1
# Run the worker daemon
CMD ["python", "-m", "workers.url_worker_daemon"]

257
QDRANT_SETUP.md Normal file

@@ -0,0 +1,257 @@
# ✅ Qdrant System - Semantic Search
## 🎯 Current Architecture
The system vectorizes the translated news items **directly** and provides semantic search in real time.
```
Original news (RSS)
  ↓
Translation (translator workers)
  ↓
PostgreSQL (table 'traducciones')
  ↓
Qdrant worker (direct vectorization)
  ↓
Qdrant (semantic search)
  ↓
Search API (utils/qdrant_search.py)
  ↓
General search + conflict monitor
```
## ✅ Services and Components
| Component | Port/Location | Description |
|----------|--------|-------------|
| **Qdrant** | 6333 | Vector database |
| **Qdrant Worker** | - | Continuous vectorization |
| **Semantic Search** | `utils/qdrant_search.py` | Vector search API |
| **General Search** | `routers/search.py` | Search with PostgreSQL fallback |
| **Conflict Monitor** | `routers/conflicts.py` | Keyword search backed by vectors |
### Worker Configuration
- **Source**: `traducciones` table
- **Model**: `paraphrase-multilingual-MiniLM-L12-v2`
- **Dimensions**: 384
- **Device**: CPU (GPU available)
- **Throughput**: ~100+ news items/minute
- **Total vectorized**: ~507,000 news items
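In essence, the worker loop looks like the sketch below. This is illustrative, not the actual `workers/qdrant_worker.py`: the `titulo`/`texto` column names are assumptions (only `vectorized`, `lang_to` and `status` are confirmed by the progress query further down), the collection is assumed to exist already, and error handling is omitted.
```python
import os
import time

import psycopg2
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct
from sentence_transformers import SentenceTransformer

BATCH = int(os.getenv("QDRANT_BATCH_SIZE", "100"))
IDLE = int(os.getenv("QDRANT_SLEEP_IDLE", "30"))
COLLECTION = os.getenv("QDRANT_COLLECTION_NAME", "news_vectors")

model = SentenceTransformer(os.getenv("EMB_MODEL", "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"))
client = QdrantClient(host=os.getenv("QDRANT_HOST", "qdrant"), port=int(os.getenv("QDRANT_PORT", "6333")))

while True:
    with psycopg2.connect(host=os.getenv("DB_HOST", "db"), dbname="rss",
                          user="rss", password=os.getenv("DB_PASS", "")) as conn, conn.cursor() as cur:
        # Grab a batch of finished translations that have not been vectorized yet.
        cur.execute("""SELECT id, titulo, texto FROM traducciones
                       WHERE lang_to = 'es' AND status = 'done' AND vectorized = FALSE
                       LIMIT %s""", (BATCH,))
        rows = cur.fetchall()
        if not rows:
            time.sleep(IDLE)  # nothing pending; back off
            continue
        # 384-dimensional vectors from the multilingual MiniLM model.
        vectors = model.encode([f"{title}. {body}" for _, title, body in rows])
        client.upsert(collection_name=COLLECTION, points=[
            PointStruct(id=row_id, vector=vec.tolist(), payload={"titulo": title})
            for (row_id, title, _), vec in zip(rows, vectors)
        ])
        cur.execute("UPDATE traducciones SET vectorized = TRUE WHERE id = ANY(%s)",
                    ([row_id for row_id, _, _ in rows],))
```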
## 🚀 Full Integration
### General Search (`/api/search`)
Search now hits **Qdrant first** for speed and semantic precision (see the sketch after this list):
1. **Semantic search** (default): uses Qdrant vectors
2. **PostgreSQL fallback**: used when Qdrant fails or returns nothing
3. **Enrichment**: combines data from both sources
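A condensed sketch of that flow follows; the helper names are hypothetical, since `routers/search.py` and `utils/qdrant_search.py` are not shown in this commit view:
```python
from qdrant_client import QdrantClient
from sentence_transformers import SentenceTransformer

# Hypothetical module-level singletons (the real ones live in utils/qdrant_search.py).
_model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
_qdrant = QdrantClient(host="qdrant", port=6333)

def semantic_search(q: str, limit: int = 10) -> list[dict]:
    # Embed the query with the same model the worker used, then ask Qdrant
    # for the nearest stored news vectors.
    hits = _qdrant.search(collection_name="news_vectors",
                          query_vector=_model.encode(q).tolist(),
                          limit=limit)
    return [{"id": h.id, "score": h.score, **(h.payload or {})} for h in hits]

def search_news(q: str, limit: int = 10, semantic: bool = True) -> list[dict]:
    if semantic:
        try:
            results = semantic_search(q, limit)
            if results:
                return results              # then enriched with PostgreSQL metadata
        except Exception:
            pass                            # Qdrant unavailable: fall through
    return postgres_fulltext_search(q, limit)  # the pre-existing ILIKE/FTS path

def postgres_fulltext_search(q: str, limit: int) -> list[dict]:
    ...  # elided: the original query against 'traducciones'
```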
**Advantages:**
- ✅ 10-100x faster searches (no scan over 500k rows)
- ✅ Understands synonyms and context ("protestas" also finds "manifestaciones")
- ✅ Multilingual out of the box
- ✅ No dependency on exact word matches
**Parameters:**
- `q`: search text
- `limit`: maximum number of results (default: 10, max: 50)
- `semantic`: `true/false` (default: `true`)
### Conflict Monitor (`/conflicts/<id>`)
Now uses semantic keyword search:
**Before (PostgreSQL ILIKE):**
- ❌ "irán protestas" required an exact match of the whole phrase
- ❌ Slow with 500k news items
- ❌ Missed variations ("manifestación", "protesta")
**Now (Qdrant):**
- ✅ "irán protestas" searches both words independently
- ✅ Fast (vector search)
- ✅ Finds semantically similar content
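That per-keyword behavior can be sketched on top of the `semantic_search` helper from the previous snippet (again hypothetical; the real logic lives in `routers/conflicts.py`, which is not shown here):
```python
def conflict_keyword_search(keywords: list[str], limit: int = 20) -> list[dict]:
    # Search each keyword independently and keep the best score per news item.
    best: dict[int, dict] = {}
    for word in keywords:                       # e.g. ["irán", "protestas"]
        for hit in semantic_search(word, limit=limit):
            seen = best.get(hit["id"])
            if seen is None or hit["score"] > seen["score"]:
                best[hit["id"]] = hit
    return sorted(best.values(), key=lambda h: h["score"], reverse=True)[:limit]
```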
## 🔧 Useful Commands
### View Logs
```bash
docker-compose logs -f qdrant-worker
docker-compose logs -f qdrant
docker-compose logs -f rss2_web # search logs
```
### Statistics
```bash
docker exec -it rss2_web python scripts/migrate_to_qdrant.py --stats
```
### Vectorize Pending Items (Manual)
```bash
docker exec -it rss2_web python scripts/migrate_to_qdrant.py --vectorize --batch-size 200
```
### Full Reset (⚠️ Destructive)
```bash
docker exec -it rss2_web python scripts/migrate_to_qdrant.py --reset
```
### Try Semantic Search
```bash
# Semantic search
curl "http://localhost:8001/api/search?q=protestas+en+iran&semantic=true"
# Traditional search (fallback)
curl "http://localhost:8001/api/search?q=protestas+en+iran&semantic=false"
```
## 📊 Checking State
### Database
```sql
-- Vectorization progress
SELECT
    COUNT(*) AS total,
    COUNT(*) FILTER (WHERE vectorized = TRUE) AS vectorized,
    COUNT(*) FILTER (WHERE vectorized = FALSE) AS pending
FROM traducciones
WHERE lang_to = 'es' AND status = 'done';
```
### Qdrant API
```bash
# Collection status
curl http://localhost:6333/collections/news_vectors
# Health check
curl http://localhost:6333/healthz
# Point count
curl http://localhost:6333/collections/news_vectors | jq '.result.points_count'
```
## 🔍 Environment Variables
```bash
# .env
QDRANT_HOST=qdrant
QDRANT_PORT=6333
QDRANT_COLLECTION_NAME=news_vectors
QDRANT_BATCH_SIZE=100
QDRANT_SLEEP_IDLE=30
EMB_MODEL=sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2
```
## 📁 Relevant Files
| File | Purpose |
|---------|---------|
| `workers/qdrant_worker.py` | Continuous vectorization worker |
| `utils/qdrant_search.py` | **NEW**: semantic search API |
| `routers/search.py` | **UPDATED**: search backed by Qdrant |
| `routers/conflicts.py` | **UPDATED**: conflict monitor backed by Qdrant |
| `scripts/migrate_to_qdrant.py` | Migration/statistics |
| `docker-compose.yml` | Service configuration + timezone sync |
## ⏰ Time Synchronization
**Problem solved:** all Docker containers now share a synchronized clock (TZ=Europe/Madrid).
**Changes:**
- ✅ `TZ=Europe/Madrid` variable on every service
- ✅ `/etc/timezone` and `/etc/localtime` volumes on key services
- ✅ Consistent log timestamps across all workers
## 🚀 Deploying on New Machines
### Prerequisites
1. Docker and Docker Compose installed
2. At least 8 GB RAM (16 GB recommended)
3. NVIDIA GPU (optional, for the translation workers)
### Installation Steps
```bash
# 1. Clone the repository
git clone <repo-url>
cd rss2
# 2. Configure environment variables
cp .env.example .env
# Edit .env with your credentials
# 3. Start the services
docker-compose up -d
# 4. Check that Qdrant is up
curl http://localhost:6333/healthz
# 5. Watch the vectorization progress
docker-compose logs -f qdrant-worker
```
### Migrating Existing Data
If you already have translated news items that are not vectorized yet:
```bash
# Show statistics
docker exec -it rss2_web python scripts/migrate_to_qdrant.py --stats
# Vectorize everything pending (can take hours with 500k news items)
# The worker does this automatically, but you can force it:
docker-compose restart qdrant-worker
```
## 🔍 Troubleshooting
### Search is not using Qdrant
```bash
# Check that Qdrant is running
docker ps | grep qdrant
# Worker logs
docker-compose logs qdrant-worker
# Check connectivity
curl http://localhost:6333/collections/news_vectors
```
### Search is still slow
```bash
# Check how many news items are vectorized
docker exec -it rss2_web python scripts/migrate_to_qdrant.py --stats
# If many are pending, the worker will process them automatically.
# To speed it up, raise the batch size in docker-compose.yml:
# QDRANT_BATCH_SIZE=200
```
### Error "No module named 'qdrant_client'"
```bash
# Rebuild the web image
docker-compose build rss2_web
docker-compose restart rss2_web
```
## 📈 Performance
**Before (PostgreSQL only):**
- Simple search: 2-5 seconds (500k rows)
- Complex search: 10-30 seconds
- Conflict monitor: 5-15 seconds
**Now (Qdrant + PostgreSQL):**
- Semantic search: 50-200 ms
- PostgreSQL enrichment: +50 ms
- Conflict monitor: 100-300 ms
- **Improvement: 10-100x faster**
## 🎯 Next Steps
- [ ] Advanced filters (date, country, category)
- [ ] Cache frequent results in Redis
- [ ] Hybrid search (combine PostgreSQL FTS + Qdrant)
- [ ] Search metrics dashboard

155
README.md Normal file

@@ -0,0 +1,155 @@
# RSS2 - AI-Powered News Intelligence Platform
RSS2 is an advanced news aggregation, translation and analysis platform designed to process large volumes of information in real time. It uses a **hybrid microservices architecture (Go + Python/Flask)** and local **AI** models to turn raw RSS feeds into actionable intelligence.
---
## 🏗️ Architecture and Services
The system consists of multiple orchestrated Docker containers, split across 3 isolated networks (`frontend`, `backend`, `monitoring`) for maximum security.
### 🌐 Core & Frontend
| Service | Technology | Port | Description |
|----------|------------|--------|-------------|
| **`nginx`** | Nginx Alpine | **8001** | **Single public entry point**. Reverse proxy, SSL, static assets. |
| **`rss2_web`** | Python/Flask | - | Main backend. REST API, Jinja2, business logic. |
| **`rss-web-go`** | Go/Gin | - | (Optional) High-performance web microservice. |
### 🤖 AI & Workers
| Service | Description | Resources |
|----------|-------------|----------|
| **`translator`** (x3) | Neural translation (NLLB-200) from any language into Spanish. | GPU/CPU |
| **`embeddings`** | Semantic vector generation for intelligent search. | GPU/CPU |
| **`ner`** | Named-entity recognition (people, organizations, places). | CPU |
| **`cluster`** | Grouping of news items into similar events. | CPU |
| **`topics`** | Topic classification of news items. | CPU |
| **`qdrant-worker`** | Vector synchronization with Qdrant. | CPU |
### 📥 Data Ingestion
| Service | Description |
|----------|-------------|
| **`rss-ingestor-go`** | High-performance crawler written in Go. Fetches hundreds of RSS feeds/min. |
| **`url-worker`** | Scraper that downloads and cleans the full article content (HTML). |
| **`url-discovery`** | Automatic discovery of new RSS feeds. |
### 💾 Data Storage
| Service | Technology | Description |
|----------|------------|-------------|
| **`db`** | PostgreSQL 18 | Primary database (writes). Strong passwords. |
| **`db-replica`** | PostgreSQL 18 | Read replica (currently in standby). |
| **`qdrant`** | Qdrant | **Vector database**. Stores embeddings for semantic search. |
| **`redis`** | Redis 7 | Message broker and cache. **Authenticated**. |
### 📊 Monitoring (Observability Stack)
| Service | Access | Description |
|----------|--------|-------------|
| **`grafana`** | `localhost:3001` | Visual dashboard of system and container metrics. |
| **`prometheus`** | *Internal* | Metric collection from every service. |
| **`cadvisor`** | *Internal* | Docker resource-usage metrics (CPU, RAM). |
---
## 🚀 Installation and Deployment
### 1. Clone and Configure
```bash
git clone <repo>
cd rss2
```
### 2. Generate Secure Credentials (IMPORTANT)
The system ships a script that generates strong passwords automatically:
```bash
./generate_secure_credentials.sh
```
This creates a `.env` file with random passwords for the DB, Redis and Grafana.
### 3. Start the Services
Deployment is handled by the secure master script:
```bash
./migrate_to_secure.sh
```
Or manually:
```bash
docker-compose up -d
```
### 4. Access
* **Public web**: [http://localhost:8001](http://localhost:8001)
* **Grafana (monitoring)**: [http://localhost:3001](http://localhost:3001)
  * *User*: `admin`
  * *Password*: (see the `GRAFANA_PASSWORD` variable in `.env`, or the generator's output)
---
## 🔒 Security
The system was audited and hardened (January 2026):
1. **Segmented networks**:
   * `rss2_frontend`: only Nginx and the web app.
   * `rss2_backend`: database and workers (no external access).
   * `rss2_monitoring`: observability stack.
2. **Closed ports**:
   * Qdrant (6333), Prometheus (9090) and Redis (6379) are NOT exposed to the host.
   * Only ports **8001** (web) and **3001** (Grafana, localhost) are open.
3. **Authentication**:
   * Redis requires a password (`requirepass`).
   * PostgreSQL uses strict authentication.
4. **Security scripts**:
   * `verify_security.sh`: runs a full test of the security configuration.
   * `SECURITY_GUIDE.md`: detailed secure-administration guide.
---
## 🛠️ Maintenance Commands
### Check system status
```bash
docker-compose ps
```
### View logs
```bash
docker-compose logs -f rss2_web # web app
docker-compose logs -f translator # translator
```
### Backup
```bash
# Database backup
docker exec rss2_db pg_dump -U rss rss > backup_$(date +%Y%m%d).sql
# Vector backup (Qdrant)
# (stopping the service first is recommended)
tar -czf qdrant_backup.tar.gz qdrant_storage/
```
### Update
```bash
git pull
docker-compose up -d --build
```
---
## 🧠 AI Features
* **Semantic search**: finds news by meaning ("conflictos en oriente medio") even when the exact words are absent, thanks to the vector embeddings in Qdrant.
* **Language detection**: automatic, to route each item to the right translator.
* **Entities**: visual explorer of *who* and *where* in the news.
---
## 📂 Directory Layout
* `/routers`: API endpoints (Python).
* `/workers`: background logic (translation, ingestion, AI).
* `/rss-ingestor-go`: Go crawler source.
* `/monitoring`: Prometheus and Grafana configuration.
* `/templates`: HTML views (Jinja2).
* `/static`: frontend assets.
* `docker-compose.yml`: infrastructure definition.

201
SECURITY_AUDIT.md Normal file

@@ -0,0 +1,201 @@
# 🔒 Network Security Audit - Executive Summary
**Date**: 2026-01-12
**System**: RSS2 News Aggregator
**Auditor**: Automated Security Analysis
---
## 📊 EXECUTIVE SUMMARY
**Multiple critical vulnerabilities** were identified in the network configuration of the Docker containers. The current setup exposes internal services without authentication and uses weak credentials that severely compromise the security of the application.
**Overall risk level**: 🔴 **CRITICAL**
---
## 🚨 CRITICAL VULNERABILITIES (Priority 1)
### 1. Compromised Credentials
- **Severity**: 🔴 CRITICAL
- **CVSS Score**: 9.8 (Critical)
- **Description**:
  - PostgreSQL uses the password `x` (1 character)
  - The Flask SECRET_KEY is `secret` (a well-known default)
  - Grafana uses the password `admin` (default credential)
- **Impact**:
  - Full database access
  - Possible forging of signed sessions
  - Total compromise of the authentication system
- **Fix**: generate random credentials of 32+ characters
### 2. Exposed Vector Database (Qdrant)
- **Severity**: 🔴 CRITICAL
- **CVSS Score**: 8.6 (High)
- **Exposed ports**: 6333, 6334
- **Description**: Qdrant is publicly reachable without authentication
- **Impact**:
  - Reading/modifying news vectors
  - Potential data exfiltration
  - Manipulation of semantic search results
- **Fix**: remove the exposed ports; internal network access only
### 3. Redis Without Authentication
- **Severity**: 🔴 HIGH
- **CVSS Score**: 7.5 (High)
- **Description**: Redis is reachable without a password
- **Impact**:
  - Unauthorized access to the cache
  - Possible injection of malicious data
  - DoS via cache flush
- **Fix**: enable requirepass in Redis
### 4. Exposed Prometheus and cAdvisor
- **Severity**: 🟠 HIGH
- **CVSS Score**: 7.2 (High)
- **Exposed ports**: 9090 (Prometheus), 8081 (cAdvisor)
- **Description**: system metrics are publicly reachable
- **Impact**:
  - Sensitive information about the architecture
  - Attack vectors (uptime, resources, vulnerabilities)
  - Reconnaissance of internal services
- **Fix**: internalize the ports; access only via SSH tunnel
---
## ⚠️ MEDIUM-RISK VULNERABILITIES (Priority 2)
### 5. No Network Segmentation
- **Severity**: 🟠 MEDIUM
- **Description**: all services share a single Docker network
- **Impact**: easy lateral movement if one container is compromised
- **Fix**: split into 3 segmented networks (frontend, backend, monitoring)
### 6. No Resource Limits
- **Severity**: 🟡 MEDIUM-LOW
- **Description**: containers have no CPU/memory limits
- **Impact**: possible DoS through excessive resource consumption
- **Fix**: set resource limits and reservations
### 7. Volumes Mounted With Excessive Permissions
- **Severity**: 🟡 LOW
- **Description**: source code mounted read-write
- **Impact**: code modification from a compromised container
- **Fix**: mount critical volumes read-only
---
## ✅ IMPLEMENTED FIXES
### Files Created
1. **`docker-compose.secure.yml`**
   - Segmented networks (frontend, backend, monitoring)
   - Internalized ports
   - Redis authentication
   - Resource limits on every service
   - Read-only volumes where applicable
2. **`.env.secure.example`**
   - Template with security instructions
   - Placeholders for strong credentials
3. **`generate_secure_credentials.sh`**
   - Automated credential-generation script
   - Generates 32-character passwords
   - Creates a .env with the secure configuration
4. **`SECURITY_GUIDE.md`**
   - Complete migration guide
   - Troubleshooting
   - Best practices
5. **Updated Python code**
   - `config.py`: REDIS_PASSWORD support
   - `cache.py`: Redis authentication
---
## 📈 SECURITY IMPROVEMENTS
| Metric | BEFORE | AFTER | Improvement |
|---------|-------|---------|--------|
| Public ports | 7 | 1 | **-85%** |
| Authenticated services | 1/4 | 4/4 | **+300%** |
| Isolated networks | 1 | 3 | **+200%** |
| Services with resource limits | 0% | 100% | **+100%** |
| Password strength (bits) | ~4 bits | ~256 bits | **+6300%** |
---
## 🎯 RECOMMENDED ACTION PLAN
### Phase 1: IMMEDIATE (today)
1. ✅ Review the generated files
2. ✅ Read SECURITY_GUIDE.md
3. ⏳ Run `./generate_secure_credentials.sh`
4. ⏳ Store the credentials in a password manager
### Phase 2: SHORT TERM (this week)
5. ⏳ Take a full data backup
6. ⏳ Migrate to `docker-compose.secure.yml`
7. ⏳ Validate everything in development
8. ⏳ Set up SSH access to Grafana
### Phase 3: MEDIUM TERM (this month)
9. ⏳ Set up security monitoring
10. ⏳ Configure encrypted automatic backups
11. ⏳ Add rate limiting in nginx
12. ⏳ Configure fail2ban
---
## 📋 VALIDATION CHECKLIST
Before marking this as resolved, verify:
- [ ] All credentials changed and stored safely
- [ ] Only port 8001 publicly exposed
- [ ] Qdrant NOT reachable from the internet
- [ ] Prometheus NOT reachable from the internet
- [ ] cAdvisor NOT reachable from the internet
- [ ] Redis requires authentication
- [ ] Grafana bound to localhost only (127.0.0.1:3001)
- [ ] Web app works correctly
- [ ] Workers connect to their services
- [ ] Search works
- [ ] Backups configured
- [ ] Server firewall active
---
## 🔗 REFERENCES
- [Docker Security Best Practices](https://docs.docker.com/develop/security-best-practices/)
- [OWASP Top 10](https://owasp.org/www-project-top-ten/)
- [CIS Docker Benchmark](https://www.cisecurity.org/benchmark/docker)
- [NIST Cybersecurity Framework](https://www.nist.gov/cyberframework)
---
## 📞 CONTACT AND SUPPORT
For help with the migration:
- Check `SECURITY_GUIDE.md` (full troubleshooting section)
- Check the logs: `docker-compose logs -f`
- Check network connectivity: `docker network inspect rss2_backend`
---
**Last updated**: 2026-01-12 18:18 CET
**Next recommended review**: 2026-02-12 (monthly)
---
## 🏆 CONCLUSION
Implementing the proposed fixes will lower the security risk from **CRITICAL to LOW**, closing every identified vulnerability and establishing a solid security baseline for the RSS2 application.
**Estimated implementation time**: 2-4 hours
**Complexity**: medium
**Security ROI**: extremely high

383
SECURITY_GUIDE.md Normal file

@@ -0,0 +1,383 @@
# 🔒 SECURITY AND MIGRATION GUIDE - RSS2 Application
## ⚠️ SUMMARY OF VULNERABILITIES FOUND
### CRITICAL (fix IMMEDIATELY)
1. **Weak credentials in .env**
   - PostgreSQL password: `x`
   - Flask SECRET_KEY: `secret`
   - Grafana password: `admin`
2. **Services publicly exposed without authentication**
   - Qdrant (ports 6333, 6334) - vector database
   - Prometheus (port 9090) - system metrics
   - cAdvisor (port 8081) - container statistics
3. **Redis without authentication**
   - Reachable by every container without a password
### HIGH RISK
4. **No network segmentation**
   - All services share a single Docker network
5. **No resource limits**
   - Containers without CPU/memory limits (DoS risk)
6. **Volumes with excessive permissions**
   - Source code mounted with write permissions
---
## 🛠️ IMPLEMENTED FIXES
### 1. `docker-compose.secure.yml`
**Security improvements implemented:**
#### 🔹 Segmented Networks
```yaml
networks:
  frontend:   # only nginx and rss2_web
  backend:    # DB, workers, redis, qdrant (internal)
  monitoring: # Prometheus, Grafana, cAdvisor (internal)
```
#### 🔹 Internalized Ports
- ❌ **Public ports removed from:**
  - Qdrant (6333, 6334) → internal access only
  - Prometheus (9090) → internal access only
  - cAdvisor (8081) → internal access only
  - rss-web-go (8002) → service commented out (duplicate)
- ✅ **Single public port:**
  - Nginx (8001) → hardened reverse proxy
- ✅ **Localhost-only port:**
  - Grafana (127.0.0.1:3001) → local access or SSH tunnel only
#### 🔹 Redis Authentication
```yaml
redis:
  command: >
    redis-server
    --requirepass ${REDIS_PASSWORD}
```
#### 🔹 Resource Limits
Every container gets CPU and memory limits:
```yaml
deploy:
  resources:
    limits:
      cpus: '2'
      memory: 2G
    reservations:
      memory: 512M
```
#### 🔹 Read-Only Volumes
Source code mounted read-only where possible:
```yaml
volumes:
  - ./app.py:/app/app.py:ro
  - ./routers:/app/routers:ro
  - ./templates:/app/templates:ro
```
---
## 📋 MIGRATION STEPS
### Option A: Gradual Migration (RECOMMENDED for production)
#### Step 1: Generate Secure Credentials
```bash
# 1. Generate the PostgreSQL password
POSTGRES_PASSWORD=$(openssl rand -base64 32)
echo "POSTGRES_PASSWORD=$POSTGRES_PASSWORD"
# 2. Generate the Redis password
REDIS_PASSWORD=$(openssl rand -base64 32)
echo "REDIS_PASSWORD=$REDIS_PASSWORD"
# 3. Generate the Flask SECRET_KEY
SECRET_KEY=$(python3 -c "import secrets; print(secrets.token_hex(32))")
echo "SECRET_KEY=$SECRET_KEY"
# 4. Generate the Grafana password
GRAFANA_PASSWORD=$(openssl rand -base64 24)
echo "GRAFANA_PASSWORD=$GRAFANA_PASSWORD"
# Store these values somewhere safe (password manager)
```
#### Step 2: Copy and Configure the Secure .env
```bash
# Copy the secure example
cp .env.secure.example .env
# Edit .env and paste in the generated passwords
nano .env # or use your preferred editor
```
#### Step 3: Back Up Your Data
```bash
# PostgreSQL backup
docker exec rss2_db pg_dump -U rss rss > backup_$(date +%Y%m%d_%H%M%S).sql
# Qdrant backup
tar -czf qdrant_backup_$(date +%Y%m%d_%H%M%S).tar.gz qdrant_storage/
# Redis backup (optional)
docker exec rss2_redis redis-cli --rdb /data/dump.rdb
cp redis-data/dump.rdb redis_backup_$(date +%Y%m%d_%H%M%S).rdb
```
#### Step 4: Stop the Current Services
```bash
docker-compose down
```
#### Step 5: Switch to the Secure Configuration
```bash
# Rename the current file (backup)
mv docker-compose.yml docker-compose.yml.insecure.bak
# Use the secure version
cp docker-compose.secure.yml docker-compose.yml
# Validate the configuration
docker-compose config
```
#### Step 6: Start With the New Configuration
```bash
# Start the services
docker-compose up -d
# Watch the logs
docker-compose logs -f
# Check that every container is running
docker-compose ps
```
#### Step 7: Verify Connectivity
```bash
# Test the web app
curl http://localhost:8001
# Test Redis (from inside a container)
docker exec rss2_web bash -c 'python3 -c "import redis; r = redis.Redis(host=\"redis\", port=6379, password=\"$REDIS_PASSWORD\"); print(r.ping())"'
# Check the worker logs
docker-compose logs rss2_tasks_py | tail -20
```
---
### Option B: Direct Migration (for development/testing)
```bash
# 1. Back up your data (as in Option A, step 3)
# 2. Stop everything
docker-compose down -v # CAREFUL: -v deletes volumes
# 3. Generate credentials and configure .env
cp .env.secure.example .env
# Edit .env with the generated credentials
# 4. Restore data if needed
# (restore the SQL dump, qdrant_storage, etc.)
# 5. Start with the secure configuration
cp docker-compose.secure.yml docker-compose.yml
docker-compose up -d
```
---
## 🔐 ACCESSING PROTECTED SERVICES
### Grafana (Monitoring)
Now reachable on localhost only. For remote access:
```bash
# Option 1: SSH tunnel (RECOMMENDED)
ssh -L 3001:localhost:3001 user@server
# Then open in your local browser:
# http://localhost:3001
# User: admin
# Password: the one you set in GRAFANA_PASSWORD
```
### Qdrant (Vector Database)
No longer publicly reachable. For development access:
```bash
# Option 1: temporarily expose the port (debug ONLY)
# Edit docker-compose.yml and uncomment:
# ports:
#   - "127.0.0.1:6333:6333"
# Option 2: access from inside the Docker network
docker exec -it rss2_qdrant_worker bash
curl http://qdrant:6333/collections
```
### Prometheus (Metrics)
```bash
# Access via SSH tunnel
ssh -L 9090:localhost:9090 user@server
# Or expose it temporarily on localhost:
# In docker-compose.yml, prometheus service:
# ports:
#   - "127.0.0.1:9090:9090"
```
---
## 🧪 SECURITY TESTING
### Verify that ports are NOT publicly reachable:
```bash
# From OUTSIDE the server (your local machine)
# These should NOT respond:
curl http://server:6333 # Qdrant - must fail
curl http://server:9090 # Prometheus - must fail
curl http://server:8081 # cAdvisor - must fail
# This one SHOULD respond:
curl http://server:8001 # Nginx - must work
```
### Verify network segmentation:
```bash
# Containers should NOT reach services outside their own network.
# From a backend worker, nginx must be unreachable:
docker exec rss2_cluster_py curl http://nginx # should fail
# From monitoring, the db must be unreachable:
docker exec rss2_prometheus curl http://db:5432 # should fail
```
---
## 📊 COMPARISON: BEFORE vs AFTER
| Aspect | BEFORE (insecure) | AFTER (secure) |
|---------|------------------|------------------|
| **Exposed ports** | 7 public ports | 1 public port + 1 localhost |
| **Redis authentication** | ❌ No password | ✅ Authenticated |
| **PostgreSQL password** | `x` (weak) | 32+ random characters |
| **Flask SECRET_KEY** | `secret` | 64 hex characters |
| **Network segmentation** | ❌ One flat network | ✅ 3 isolated networks |
| **Resource limits** | ❌ None | ✅ CPU and RAM capped |
| **Volumes** | Read-write | Read-only where possible |
| **Qdrant public** | ⚠️ Yes (port 6333) | ✅ Internal only |
| **Prometheus public** | ⚠️ Yes (port 9090) | ✅ Internal only |
---
## 🚨 FINAL SECURITY CHECKLIST
Before going to production, verify:
- [ ] All passwords generated randomly (min. 32 characters)
- [ ] `.env` file NOT in the repository (check .gitignore)
- [ ] Only port 8001 publicly exposed
- [ ] Grafana reachable on localhost only
- [ ] Redis requires authentication
- [ ] Every worker can connect to Redis
- [ ] Every worker can connect to PostgreSQL
- [ ] Web app works correctly
- [ ] Semantic search (Qdrant) works
- [ ] Automatic backups configured
- [ ] Monitoring (Grafana) accessible via SSH tunnel
- [ ] Server firewall configured (allow only 8001, 22)
---
## 🔧 TROUBLESHOOTING
### Error: Redis authentication failed
```bash
# Check that REDIS_PASSWORD is in .env
grep REDIS_PASSWORD .env
# Check that the workers see the variable
docker exec rss2_web env | grep REDIS
# Restart the services
docker-compose restart
```
### Error: Cannot reach Grafana from my machine
```bash
# Make sure the SSH tunnel is up
ssh -L 3001:localhost:3001 user@server
# Check that Grafana is running
docker-compose ps | grep grafana
# Grafana logs
docker-compose logs grafana
```
### Workers cannot connect to Qdrant
```bash
# Check that Qdrant is on the backend network
docker network inspect rss2_backend | grep qdrant
# Qdrant logs
docker-compose logs qdrant
# Connectivity test from a worker
docker exec rss2_qdrant_worker curl http://qdrant:6333/collections
```
---
## 📚 ADDITIONAL RESOURCES
- [Docker Networks Security Best Practices](https://docs.docker.com/network/network-tutorial-standalone/)
- [Redis Security](https://redis.io/docs/management/security/)
- [PostgreSQL Security](https://www.postgresql.org/docs/current/security.html)
- [OWASP Docker Security Cheat Sheet](https://cheatsheetseries.owasp.org/cheatsheets/Docker_Security_Cheat_Sheet.html)
---
## 📝 IMPORTANT NOTES
1. **Regular backups**: set up automatic backups BEFORE migrating
2. **Testing**: try everything in a development environment first
3. **Downtime**: plan a maintenance window
4. **Monitoring**: verify that Grafana works after the migration
5. **Documentation**: store the passwords in a secure manager
---
**Last updated**: 2026-01-12
**Author**: Automated Security Audit

65
app.py Normal file

@@ -0,0 +1,65 @@
from flask import Flask
from config import SECRET_KEY
from utils import safe_html, format_date, country_flag
from routers.home import home_bp
from routers.feeds import feeds_bp
from routers.urls import urls_bp
from routers.noticia import noticia_bp
from routers.backup import backup_bp
# from routers.eventos import eventos_bp
from routers.config import config_bp
from routers.favoritos import favoritos_bp
from routers.search import search_bp
from routers.rss import rss_bp
from routers.resumen import resumen_bp
from routers.stats import stats_bp
from routers.pdf import pdf_bp
from routers.notifications import notifications_bp
from routers.auth import auth_bp
from routers.account import account_bp
from routers.parrillas import parrillas_bp
def create_app() -> Flask:
    app = Flask(__name__)
    app.config["SECRET_KEY"] = SECRET_KEY
    app.jinja_env.filters["safe_html"] = safe_html
    app.jinja_env.filters["format_date"] = format_date
    app.jinja_env.filters["country_flag"] = country_flag
    app.register_blueprint(home_bp)
    app.register_blueprint(feeds_bp)
    app.register_blueprint(urls_bp)
    app.register_blueprint(noticia_bp)
    app.register_blueprint(backup_bp)
    # app.register_blueprint(eventos_bp)  # Removed
    app.register_blueprint(config_bp)
    app.register_blueprint(favoritos_bp)
    app.register_blueprint(search_bp)
    app.register_blueprint(rss_bp)
    # app.register_blueprint(resumen_bp)  # Removed
    app.register_blueprint(stats_bp)
    app.register_blueprint(pdf_bp)
    app.register_blueprint(notifications_bp)
    from routers.conflicts import conflicts_bp
    from routers.topics import topics_bp
    app.register_blueprint(conflicts_bp)
    app.register_blueprint(topics_bp)
    app.register_blueprint(auth_bp)
    app.register_blueprint(account_bp)
    app.register_blueprint(parrillas_bp)
    return app

app = create_app()

if __name__ == "__main__":
    app.run(host="0.0.0.0", port=8001, debug=True)

177
cache.py Normal file

@@ -0,0 +1,177 @@
"""
Redis cache module for high-traffic endpoints.
Provides caching decorator and invalidation utilities.
"""
import redis
import json
import logging
import hashlib
from functools import wraps
from config import REDIS_HOST, REDIS_PORT, REDIS_PASSWORD, REDIS_TTL_DEFAULT
logger = logging.getLogger(__name__)
_redis_client = None
def get_redis():
    """Get Redis client singleton."""
    global _redis_client
    if _redis_client is None:
        try:
            redis_config = {
                'host': REDIS_HOST,
                'port': REDIS_PORT,
                'decode_responses': True,
                'socket_connect_timeout': 2,
                'socket_timeout': 2
            }
            # Add authentication if configured
            if REDIS_PASSWORD:
                redis_config['password'] = REDIS_PASSWORD
            _redis_client = redis.Redis(**redis_config)
            _redis_client.ping()
        except redis.ConnectionError as e:
            logger.warning(f"Redis connection failed: {e}. Caching disabled.")
            _redis_client = None
    return _redis_client
def cached(ttl_seconds=None, prefix="cache"):
    """
    Decorator for caching function results in Redis.
    Falls back to calling the function directly if Redis is unavailable.

    Args:
        ttl_seconds: Time to live in seconds (default from config)
        prefix: Key prefix for cache entries
    """
    if ttl_seconds is None:
        ttl_seconds = REDIS_TTL_DEFAULT

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            r = get_redis()
            if r is None:
                # Redis unavailable, call function directly
                return func(*args, **kwargs)
            # Build cache key from function name and arguments.
            # Use md5 for a deterministic hash across processes.
            key_data = f"{args}:{sorted(kwargs.items())}"
            # Add Flask request args if available to prevent collisions on filtered routes
            try:
                from flask import request
                if request:
                    key_data += f":args:{sorted(request.args.items())}"
            except Exception:
                pass
            key_hash = hashlib.md5(key_data.encode('utf-8')).hexdigest()
            cache_key = f"cache:{prefix}:{func.__name__}:{key_hash}"
            try:
                # Try to get from cache
                cached_value = r.get(cache_key)
                if cached_value is not None:
                    # If it's a JSON response, we might need to return it correctly
                    try:
                        data = json.loads(cached_value)
                        # Detect if we should return as JSON
                        from flask import jsonify
                        return jsonify(data)
                    except (json.JSONDecodeError, ImportError):
                        return cached_value
                # Cache miss - call function and cache result
                result = func(*args, **kwargs)
                # Handle Flask Response objects
                cache_data = result
                try:
                    from flask import Response
                    if isinstance(result, Response):
                        if result.is_json:
                            cache_data = result.get_json()
                        else:
                            cache_data = result.get_data(as_text=True)
                except Exception:
                    pass
                r.setex(cache_key, ttl_seconds, json.dumps(cache_data, default=str))
                return result
            except (redis.RedisError, json.JSONDecodeError) as e:
                logger.warning(f"Cache error for {func.__name__}: {e}")
                return func(*args, **kwargs)
        return wrapper
    return decorator
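
# Example usage (illustrative; the blueprint and route names are made up):
#
#     @stats_bp.route("/api/stats/summary")
#     @cached(ttl_seconds=120, prefix="stats")
#     def stats_summary():
#         ...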
def invalidate_pattern(pattern):
    """
    Invalidate all cache keys matching pattern.

    Args:
        pattern: Pattern to match (e.g., "home:*" or "stats:*")
    """
    r = get_redis()
    if r is None:
        return
    try:
        cursor = 0
        deleted = 0
        while True:
            cursor, keys = r.scan(cursor, match=f"cache:{pattern}", count=100)
            if keys:
                r.delete(*keys)
                deleted += len(keys)
            if cursor == 0:
                break
        if deleted > 0:
            logger.info(f"Invalidated {deleted} cache keys matching '{pattern}'")
    except redis.RedisError as e:
        logger.warning(f"Cache invalidation failed: {e}")
def cache_get(key):
    """Get value from cache by key."""
    r = get_redis()
    if r is None:
        return None
    try:
        value = r.get(f"cache:{key}")
        return json.loads(value) if value else None
    except (redis.RedisError, json.JSONDecodeError):
        return None

def cache_set(key, value, ttl_seconds=None):
    """Set value in cache with optional TTL."""
    if ttl_seconds is None:
        ttl_seconds = REDIS_TTL_DEFAULT
    r = get_redis()
    if r is None:
        return False
    try:
        r.setex(f"cache:{key}", ttl_seconds, json.dumps(value, default=str))
        return True
    except redis.RedisError:
        return False

def cache_del(key):
    """Delete a key from cache."""
    r = get_redis()
    if r is None:
        return False
    try:
        r.delete(f"cache:{key}")
        return True
    except redis.RedisError:
        return False

69
config.py Normal file

@@ -0,0 +1,69 @@
import os
from dotenv import load_dotenv
load_dotenv()
DB_CONFIG = {
    "dbname": os.getenv("DB_NAME", "rss"),
    "user": os.getenv("DB_USER", "rss"),
    "password": os.getenv("DB_PASS", ""),
    "host": os.getenv("DB_HOST", "localhost"),
    "port": int(os.getenv("DB_PORT", 5432)),
}

# Write DB (primary) - for workers/ingestion
DB_WRITE_CONFIG = {
    "dbname": os.getenv("DB_NAME", "rss"),
    "user": os.getenv("DB_USER", "rss"),
    "password": os.getenv("DB_PASS", ""),
    "host": os.getenv("DB_WRITE_HOST", os.getenv("DB_HOST", "localhost")),
    "port": int(os.getenv("DB_PORT", 5432)),
}

# Read DB (replica) - for web queries
DB_READ_CONFIG = {
    "dbname": os.getenv("DB_NAME", "rss"),
    "user": os.getenv("DB_USER", "rss"),
    "password": os.getenv("DB_PASS", ""),
    "host": os.getenv("DB_READ_HOST", os.getenv("DB_HOST", "localhost")),
    "port": int(os.getenv("DB_PORT", 5432)),
}

# Redis cache
REDIS_HOST = os.getenv("REDIS_HOST", "localhost")
REDIS_PORT = int(os.getenv("REDIS_PORT", 6379))
REDIS_PASSWORD = os.getenv("REDIS_PASSWORD", None)  # None = no authentication (for compatibility)
REDIS_TTL_DEFAULT = int(os.getenv("REDIS_TTL_DEFAULT", 60))

SECRET_KEY = os.getenv("SECRET_KEY", "CAMBIA_ESTA_CLAVE_POR_ALGO_LARGO_Y_ALEATORIO")
DEFAULT_LANG = os.getenv("DEFAULT_LANG", "es")
DEFAULT_TRANSLATION_LANG = os.getenv("DEFAULT_TRANSLATION_LANG", "es")
WEB_TRANSLATED_DEFAULT = int(os.getenv("WEB_TRANSLATED_DEFAULT", "1"))

# Pagination settings
NEWS_PER_PAGE_DEFAULT = 30  # reduced from 50 for better performance

RSS_MAX_WORKERS = int(os.getenv("RSS_MAX_WORKERS", "3"))  # reduced from 10 to 3
RSS_FEED_TIMEOUT = int(os.getenv("RSS_FEED_TIMEOUT", "60"))  # increased timeout
RSS_MAX_FAILURES = int(os.getenv("RSS_MAX_FAILURES", "5"))
TARGET_LANGS = os.getenv("TARGET_LANGS", "es")
TRANSLATOR_BATCH = int(os.getenv("TRANSLATOR_BATCH", "2"))  # reduced from 4 to 2
ENQUEUE = int(os.getenv("ENQUEUE", "50"))  # reduced from 200 to 50
TRANSLATOR_SLEEP_IDLE = float(os.getenv("TRANSLATOR_SLEEP_IDLE", "10"))  # increased from 5 to 10
MAX_SRC_TOKENS = int(os.getenv("MAX_SRC_TOKENS", "512"))
MAX_NEW_TOKENS = int(os.getenv("MAX_NEW_TOKENS", "256"))
NUM_BEAMS_TITLE = int(os.getenv("NUM_BEAMS_TITLE", "1"))  # fewer beams to use less CPU
NUM_BEAMS_BODY = int(os.getenv("NUM_BEAMS_BODY", "1"))
UNIVERSAL_MODEL = os.getenv("UNIVERSAL_MODEL", "facebook/nllb-200-1.3B")
DEVICE = os.getenv("DEVICE", "cpu")
TOKENIZERS_PARALLELISM = os.getenv("TOKENIZERS_PARALLELISM", "false")
PYTHONUNBUFFERED = os.getenv("PYTHONUNBUFFERED", "1")
PYTORCH_CUDA_ALLOC_CONF = os.getenv("PYTORCH_CUDA_ALLOC_CONF", "")

76
db.py Normal file

@@ -0,0 +1,76 @@
import os
import psycopg2
from contextlib import contextmanager
# Database configuration
DB_HOST = os.environ.get("DB_HOST", "db")
DB_NAME = os.environ.get("DB_NAME", "rss")
DB_USER = os.environ.get("DB_USER", "rss")
DB_PASS = os.environ.get("DB_PASS", "x")
DB_PORT = os.environ.get("DB_PORT", "5432")
DB_READ_HOST = os.environ.get("DB_READ_HOST", "db-replica")
DB_WRITE_HOST = os.environ.get("DB_WRITE_HOST", "db")
@contextmanager
def get_conn():
    """Get a database connection (Default: Primary/Write)."""
    conn = None
    try:
        conn = psycopg2.connect(
            host=DB_HOST,
            database=DB_NAME,
            user=DB_USER,
            password=DB_PASS,
            port=DB_PORT
        )
        yield conn
    finally:
        if conn:
            conn.close()

@contextmanager
def get_read_conn():
    """Get a read-only database connection (Replica)."""
    conn = None
    try:
        try:
            # Attempt to connect to Replica first
            conn = psycopg2.connect(
                host=DB_READ_HOST,
                database=DB_NAME,
                user=DB_USER,
                password=DB_PASS,
                port=DB_PORT,
                connect_timeout=5
            )
        except (psycopg2.OperationalError, psycopg2.InterfaceError) as e:
            # Fallback to Primary if Replica is down on initial connection
            print(f"Warning: Replica unreachable ({e}), falling back to Primary for read.")
            conn = psycopg2.connect(
                host=DB_WRITE_HOST,
                database=DB_NAME,
                user=DB_USER,
                password=DB_PASS,
                port=DB_PORT
            )
        yield conn
    finally:
        if conn:
            conn.close()

@contextmanager
def get_write_conn():
    """Get a write database connection (Primary)."""
    conn = None
    try:
        conn = psycopg2.connect(
            host=DB_WRITE_HOST,
            database=DB_NAME,
            user=DB_USER,
            password=DB_PASS,
            port=DB_PORT
        )
        yield conn
    finally:
        if conn:
            conn.close()
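
# Usage sketch (illustrative; table/column names taken from the docs above):
#
#     with get_read_conn() as conn, conn.cursor() as cur:
#         cur.execute("SELECT COUNT(*) FROM traducciones WHERE vectorized = FALSE")
#         pending = cur.fetchone()[0]
#
# Note: these context managers only close the connection; writers obtained
# via get_write_conn() must call conn.commit() themselves.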

790
docker-compose.yml Normal file

@@ -0,0 +1,790 @@
services:
  db:
    image: postgres:18-alpine
    container_name: rss2_db
    shm_size: 4gb
    environment:
      POSTGRES_DB: ${POSTGRES_DB:-rss}
      POSTGRES_USER: ${POSTGRES_USER:-rss}
      POSTGRES_PASSWORD: ${POSTGRES_PASSWORD}
      POSTGRES_INITDB_ARGS: "--encoding=UTF8 --locale=C.UTF-8"
      LANG: C.UTF-8
      LC_ALL: C.UTF-8
      TZ: Europe/Madrid
      PGDATA: /var/lib/postgresql/data/18/main
    command:
      [
        "postgres",
        "-c",
        "max_connections=200",
        "-c",
        "shared_buffers=4GB",
        "-c",
        "effective_cache_size=12GB",
        "-c",
        "work_mem=16MB",
        "-c",
        "maintenance_work_mem=512MB",
        "-c",
        "autovacuum_max_workers=3",
        "-c",
        "autovacuum_vacuum_scale_factor=0.02",
        "-c",
        "autovacuum_vacuum_cost_limit=1000",
        # Parallel Query Optimization (Adjusted)
        "-c",
        "max_worker_processes=8",
        "-c",
        "max_parallel_workers=6",
        "-c",
        "max_parallel_workers_per_gather=2",
        # Streaming Replication
        "-c",
        "wal_level=replica",
        "-c",
        "max_wal_senders=5",
        "-c",
        "wal_keep_size=1GB",
        "-c",
        "hot_standby=on"
      ]
    volumes:
      - ./pgdata:/var/lib/postgresql/data
      - ./init-db:/docker-entrypoint-initdb.d:ro
    networks:
      - backend
    restart: unless-stopped
    healthcheck:
      test: [ "CMD-SHELL", "pg_isready -h 127.0.0.1 -p 5432 -U $$POSTGRES_USER -d $$POSTGRES_DB || exit 1" ]
      interval: 5s
      timeout: 5s
      retries: 30
      start_period: 20s
    deploy:
      resources:
        limits:
          memory: 8G
        reservations:
          memory: 4G

  db-replica:
    build:
      context: .
      dockerfile: Dockerfile.replica
    container_name: rss2_db_replica
    shm_size: 2gb
    environment:
      POSTGRES_DB: ${POSTGRES_DB:-rss}
      POSTGRES_USER: ${POSTGRES_USER:-rss}
      POSTGRES_PASSWORD: ${POSTGRES_PASSWORD}
      PGDATA: /var/lib/postgresql/data
      TZ: Europe/Madrid
    command: [ "postgres", "-c", "max_connections=200", "-c", "shared_buffers=256MB", "-c", "effective_cache_size=2GB", "-c", "hot_standby=on", "-c", "max_worker_processes=16", "-c", "hot_standby_feedback=on", "-c", "max_standby_streaming_delay=300s" ]
    volumes:
      - ./pgdata-replica:/var/lib/postgresql/data
    networks:
      - backend
    depends_on:
      db:
        condition: service_healthy
    restart: unless-stopped
    healthcheck:
      test: [ "CMD-SHELL", "pg_isready -h 127.0.0.1 -p 5432 -U rss -d rss || exit 1" ]
      interval: 5s
      timeout: 5s
      retries: 30
      start_period: 30s
    deploy:
      resources:
        limits:
          memory: 4G
        reservations:
          memory: 2G
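  # Quick way to confirm streaming replication is active once both database
  # containers are healthy (illustrative; run on the host):
  #   docker exec rss2_db psql -U rss -d rss -c "SELECT client_addr, state FROM pg_stat_replication;"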
redis:
image: redis:7-alpine
container_name: rss2_redis
environment:
TZ: Europe/Madrid
# SEGURIDAD: Redis con autenticación
command: >
redis-server --appendonly yes --maxmemory 512mb --maxmemory-policy allkeys-lru --requirepass ${REDIS_PASSWORD}
volumes:
- ./redis-data:/data
- /etc/timezone:/etc/timezone:ro
- /etc/localtime:/etc/localtime:ro
networks:
- backend
restart: unless-stopped
healthcheck:
test: [ "CMD", "redis-cli", "--no-auth-warning", "-a", "${REDIS_PASSWORD}", "ping" ]
interval: 5s
timeout: 3s
retries: 5
deploy:
resources:
limits:
memory: 768M
reservations:
memory: 512M
rss-ingestor-go:
build:
context: ./rss-ingestor-go
dockerfile: Dockerfile
container_name: rss2_ingestor_go
environment:
DB_HOST: db
DB_PORT: 5432
DB_NAME: ${DB_NAME:-rss}
DB_USER: ${DB_USER:-rss}
DB_PASS: ${DB_PASS}
RSS_MAX_WORKERS: 100
RSS_POKE_INTERVAL_MIN: 15
TZ: Europe/Madrid
networks:
- backend
depends_on:
db:
condition: service_healthy
restart: unless-stopped
deploy:
resources:
limits:
cpus: '2'
memory: 2G
reservations:
memory: 512M
rss-tasks:
build: .
container_name: rss2_tasks_py
command: bash -lc "python -m scheduler"
environment:
DB_HOST: db
DB_PORT: 5432
DB_NAME: ${DB_NAME:-rss}
DB_USER: ${DB_USER:-rss}
DB_PASS: ${DB_PASS}
REDIS_HOST: redis
REDIS_PORT: 6379
REDIS_PASSWORD: ${REDIS_PASSWORD}
TZ: Europe/Madrid
networks:
- backend
depends_on:
db:
condition: service_healthy
redis:
condition: service_healthy
restart: unless-stopped
deploy:
resources:
limits:
cpus: '1'
memory: 1G
url-worker:
build:
context: .
dockerfile: Dockerfile.url_worker
container_name: rss2_url_worker
command: bash -lc "python -m workers.url_worker_daemon"
environment:
DB_HOST: db
DB_PORT: 5432
DB_NAME: ${DB_NAME:-rss}
DB_USER: ${DB_USER:-rss}
DB_PASS: ${DB_PASS}
DB_READ_HOST: db
DB_WRITE_HOST: db
TZ: Europe/Madrid
networks:
- backend
depends_on:
db:
condition: service_healthy
restart: unless-stopped
deploy:
resources:
limits:
cpus: '2'
memory: 2G
url-discovery-worker:
build: .
container_name: rss2_url_discovery
command: bash -lc "python -m workers.url_discovery_worker"
environment:
DB_HOST: db
DB_PORT: 5432
DB_NAME: ${DB_NAME:-rss}
DB_USER: ${DB_USER:-rss}
DB_PASS: ${DB_PASS}
URL_DISCOVERY_INTERVAL_MIN: 15
URL_DISCOVERY_BATCH_SIZE: 10
MAX_FEEDS_PER_URL: 5
TZ: Europe/Madrid
networks:
- backend
depends_on:
db:
condition: service_healthy
restart: unless-stopped
deploy:
resources:
limits:
cpus: '1'
memory: 1G
# rss-web-go deshabilitado por duplicar funcionalidad
# Si es necesario, habilitar pero SIN exposición de puertos
# rss-web-go:
# build:
# context: ./rss-web-go
# dockerfile: Dockerfile
# container_name: rss2_web_go
# # SEGURIDAD: Sin exposición de puertos - solo acceso interno
# # ports:
# # - "8002:8001"
# environment:
# - DB_HOST=db
# - DB_PORT=5432
# - DB_NAME=${DB_NAME:-rss}
# - DB_USER=${DB_USER:-rss}
# - DB_PASS=${DB_PASS}
# - REDIS_HOST=redis
# - REDIS_PORT=6379
# - REDIS_PASSWORD=${REDIS_PASSWORD}
# - PORT=8001
# - TZ=Europe/Madrid
# volumes:
# - ./static:/root/static:ro
# - ./templates:/root/templates:ro
# networks:
# - backend
# depends_on:
# db:
# condition: service_healthy
# restart: unless-stopped
rss2_web:
build: .
container_name: rss2_web
command: bash -lc "gunicorn --config gunicorn_config.py app:app"
volumes:
# SEGURIDAD: Código en read-only donde sea posible
- ./app.py:/app/app.py:ro
- ./routers:/app/routers:ro
- ./models:/app/models:ro
- ./utils:/app/utils:ro
- ./templates:/app/templates:ro
- ./static:/app/static:ro
- ./config.py:/app/config.py:ro
- ./db.py:/app/db.py:ro
- ./cache.py:/app/cache.py:ro
- ./gunicorn_config.py:/app/gunicorn_config.py:ro
# Directorios escribibles
- ./hf_cache:/app/hf_cache
- ./data:/app/data
environment:
DB_HOST: db
DB_PORT: 5432
DB_NAME: ${DB_NAME:-rss}
DB_USER: ${DB_USER:-rss}
DB_PASS: ${DB_PASS}
DB_READ_HOST: db
DB_WRITE_HOST: db
REDIS_HOST: redis
REDIS_PORT: 6379
REDIS_PASSWORD: ${REDIS_PASSWORD}
QDRANT_HOST: qdrant
QDRANT_PORT: 6333
QDRANT_COLLECTION_NAME: ${QDRANT_COLLECTION_NAME:-news_vectors}
EMB_MODEL: ${EMB_MODEL:-sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2}
SECRET_KEY: ${SECRET_KEY}
GUNICORN_WORKERS: 8
ALLTALK_URL: http://host.docker.internal:7851
TZ: Europe/Madrid
extra_hosts:
- "host.docker.internal:host-gateway"
networks:
- frontend
- backend
depends_on:
db:
condition: service_healthy
# db-replica:
# condition: service_healthy
redis:
condition: service_healthy
qdrant:
condition: service_started
restart: unless-stopped
deploy:
resources:
limits:
cpus: '8'
memory: 8G
reservations:
memory: 4G
nginx:
image: nginx:alpine
container_name: rss2_nginx
environment:
TZ: Europe/Madrid
ports:
# ÚNICO puerto expuesto públicamente
- "8001:80"
volumes:
- ./nginx.conf:/etc/nginx/nginx.conf:ro
- ./static:/app/static:ro
- /etc/timezone:/etc/timezone:ro
- /etc/localtime:/etc/localtime:ro
networks:
- frontend
depends_on:
- rss2_web
restart: unless-stopped
deploy:
resources:
limits:
cpus: '2'
memory: 512M
translator:
build:
context: .
dockerfile: Dockerfile
image: rss2-translator:latest
container_name: rss2_translator_py
command: bash -lc "python -m workers.translation_worker"
environment:
DB_HOST: db
DB_PORT: 5432
DB_NAME: ${DB_NAME:-rss}
DB_USER: ${DB_USER:-rss}
DB_PASS: ${DB_PASS}
TARGET_LANGS: es
TRANSLATOR_BATCH: 128
ENQUEUE: 300
# CTranslate2 configuration
CT2_MODEL_PATH: /app/models/nllb-ct2
CT2_DEVICE: cuda
CT2_COMPUTE_TYPE: int8_float16
UNIVERSAL_MODEL: facebook/nllb-200-distilled-600M
HF_HOME: /app/hf_cache
TZ: Europe/Madrid
volumes:
- ./hf_cache:/app/hf_cache
- ./models:/app/models
networks:
- backend
deploy:
resources:
limits:
memory: 8G
reservations:
devices:
- driver: nvidia
count: 1
capabilities: [ gpu ]
depends_on:
db:
condition: service_healthy
restart: unless-stopped
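# translator2 y translator3 replican la configuración de translator para procesar
# la cola de traducción en paralelo; comparten ./models y ./hf_cache como volúmenes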
translator2:
build:
context: .
dockerfile: Dockerfile
image: rss2-translator2:latest
container_name: rss2_translator_py2
command: bash -lc "python -m workers.translation_worker"
environment:
DB_HOST: db
DB_PORT: 5432
DB_NAME: ${DB_NAME:-rss}
DB_USER: ${DB_USER:-rss}
DB_PASS: ${DB_PASS}
TARGET_LANGS: es
TRANSLATOR_BATCH: 128
ENQUEUE: 300
CT2_MODEL_PATH: /app/models/nllb-ct2
CT2_DEVICE: cuda
CT2_COMPUTE_TYPE: int8_float16
UNIVERSAL_MODEL: facebook/nllb-200-distilled-600M
HF_HOME: /app/hf_cache
TZ: Europe/Madrid
volumes:
- ./hf_cache:/app/hf_cache
- ./models:/app/models
networks:
- backend
deploy:
resources:
limits:
memory: 8G
reservations:
devices:
- driver: nvidia
count: 1
capabilities: [ gpu ]
depends_on:
db:
condition: service_healthy
restart: unless-stopped
translator3:
build:
context: .
dockerfile: Dockerfile
image: rss2-translator3:latest
container_name: rss2_translator_py3
command: bash -lc "python -m workers.translation_worker"
environment:
DB_HOST: db
DB_PORT: 5432
DB_NAME: ${DB_NAME:-rss}
DB_USER: ${DB_USER:-rss}
DB_PASS: ${DB_PASS}
TARGET_LANGS: es
TRANSLATOR_BATCH: 128
ENQUEUE: 300
CT2_MODEL_PATH: /app/models/nllb-ct2
CT2_DEVICE: cuda
CT2_COMPUTE_TYPE: int8_float16
UNIVERSAL_MODEL: facebook/nllb-200-distilled-600M
HF_HOME: /app/hf_cache
TZ: Europe/Madrid
volumes:
- ./hf_cache:/app/hf_cache
- ./models:/app/models
networks:
- backend
deploy:
resources:
limits:
memory: 8G
reservations:
devices:
- driver: nvidia
count: 1
capabilities: [ gpu ]
depends_on:
db:
condition: service_healthy
restart: unless-stopped
embeddings:
build:
context: .
dockerfile: Dockerfile
container_name: rss2_embeddings_py
command: bash -lc "python -m workers.embeddings_worker"
environment:
DB_HOST: db
DB_PORT: 5432
DB_NAME: ${DB_NAME:-rss}
DB_USER: ${DB_USER:-rss}
DB_PASS: ${DB_PASS}
EMB_MODEL: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2
EMB_BATCH: 64
EMB_SLEEP_IDLE: 5
EMB_LANGS: es
EMB_LIMIT: 1000
DEVICE: cuda
HF_HOME: /app/hf_cache
TZ: Europe/Madrid
volumes:
- ./hf_cache:/app/hf_cache
networks:
- backend
deploy:
resources:
limits:
memory: 6G
reservations:
devices:
- driver: nvidia
count: 1
capabilities: [ gpu ]
depends_on:
db:
condition: service_healthy
restart: unless-stopped
related:
build:
context: .
dockerfile: Dockerfile
container_name: rss2_related_py
command: bash -lc "python -m workers.related_worker"
environment:
DB_HOST: db
DB_PORT: 5432
DB_NAME: ${DB_NAME:-rss}
DB_USER: ${DB_USER:-rss}
DB_PASS: ${DB_PASS}
RELATED_WINDOW_H: 168
TZ: Europe/Madrid
networks:
- backend
depends_on:
db:
condition: service_healthy
restart: unless-stopped
deploy:
resources:
limits:
cpus: '1'
memory: 1G
cluster:
build:
context: .
dockerfile: Dockerfile
container_name: rss2_cluster_py
command: bash -lc "python -m workers.cluster_worker"
environment:
DB_HOST: db
DB_PORT: 5432
DB_NAME: ${DB_NAME:-rss}
DB_USER: ${DB_USER:-rss}
DB_PASS: ${DB_PASS}
EVENT_DIST_THRESHOLD: 0.35
EMB_MODEL: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2
TZ: Europe/Madrid
networks:
- backend
depends_on:
db:
condition: service_healthy
restart: unless-stopped
deploy:
resources:
limits:
cpus: '2'
memory: 2G
ner:
build: .
container_name: rss2_ner
command: bash -lc "python -m workers.ner_worker"
environment:
DB_HOST: db
DB_PORT: 5432
DB_NAME: ${DB_NAME:-rss}
DB_USER: ${DB_USER:-rss}
DB_PASS: ${DB_PASS}
NER_LANG: es
NER_BATCH: 64
HF_HOME: /app/hf_cache
TZ: Europe/Madrid
volumes:
- ./hf_cache:/app/hf_cache
networks:
- backend
depends_on:
db:
condition: service_healthy
restart: unless-stopped
deploy:
resources:
limits:
cpus: '2'
memory: 2G
topics:
build:
context: .
dockerfile: Dockerfile
container_name: rss2_topics_worker
command: bash -lc "python -m workers.topics_worker"
environment:
DB_HOST: db
DB_PORT: 5432
DB_NAME: ${DB_NAME:-rss}
DB_USER: ${DB_USER:-rss}
DB_PASS: ${DB_PASS}
TZ: Europe/Madrid
networks:
- backend
depends_on:
db:
condition: service_healthy
restart: unless-stopped
deploy:
resources:
limits:
cpus: '1'
memory: 1G
qdrant:
image: qdrant/qdrant:latest
container_name: rss2_qdrant
environment:
TZ: Europe/Madrid
QDRANT__SERVICE__GRPC_PORT: 6334
# SEGURIDAD: Puertos NO expuestos - solo acceso interno
# ports:
# - "6333:6333"
# - "6334:6334"
volumes:
- ./qdrant_storage:/qdrant/storage
- /etc/timezone:/etc/timezone:ro
- /etc/localtime:/etc/localtime:ro
networks:
- backend
restart: unless-stopped
deploy:
resources:
limits:
cpus: '4'
memory: 4G
reservations:
memory: 2G
qdrant-worker:
build:
context: .
dockerfile: Dockerfile
container_name: rss2_qdrant_worker
command: bash -lc "python -m workers.qdrant_worker"
environment:
DB_HOST: db
DB_PORT: 5432
DB_NAME: ${DB_NAME:-rss}
DB_USER: ${DB_USER:-rss}
DB_PASS: ${DB_PASS}
DB_READ_HOST: db
DB_WRITE_HOST: db
QDRANT_HOST: qdrant
QDRANT_PORT: 6333
QDRANT_COLLECTION_NAME: ${QDRANT_COLLECTION_NAME:-news_vectors}
EMB_MODEL: ${EMB_MODEL:-sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2}
EMB_DEVICE: cpu
QDRANT_BATCH_SIZE: ${QDRANT_BATCH_SIZE:-100}
QDRANT_SLEEP_IDLE: ${QDRANT_SLEEP_IDLE:-30}
HF_HOME: /app/hf_cache
TZ: Europe/Madrid
volumes:
- ./hf_cache:/app/hf_cache
networks:
- backend
depends_on:
db:
condition: service_healthy
# db-replica:
# condition: service_healthy
qdrant:
condition: service_started
restart: unless-stopped
deploy:
resources:
limits:
cpus: '2'
memory: 4G
# ==================================================================================
# MONITORING STACK - SECURED
# ==================================================================================
prometheus:
image: prom/prometheus:latest
container_name: rss2_prometheus
volumes:
- ./monitoring/prometheus.yml:/etc/prometheus/prometheus.yml:ro
- prometheus_data:/prometheus
command:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus'
- '--web.console.libraries=/usr/share/prometheus/console_libraries'
- '--web.console.templates=/usr/share/prometheus/consoles'
# SEGURIDAD: Sin exposición de puertos - acceso solo vía Grafana o túnel SSH
# ports:
# - "9090:9090"
networks:
- monitoring
restart: unless-stopped
deploy:
resources:
limits:
cpus: '1'
memory: 2G
grafana:
image: grafana/grafana:latest
container_name: rss2_grafana
# SEGURIDAD: Acceso solo en localhost o vía túnel SSH
# Para acceso remoto, usar túnel SSH: ssh -L 3001:localhost:3001 user@server
ports:
- "127.0.0.1:3001:3000"
environment:
# SEGURIDAD: Cambiar este password en producción
- GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_PASSWORD:-change_this_password}
- GF_USERS_ALLOW_SIGN_UP=false
- GF_SERVER_ROOT_URL=http://localhost:3001
- GF_SECURITY_COOKIE_SECURE=false
- GF_SECURITY_COOKIE_SAMESITE=lax
volumes:
- grafana_data:/var/lib/grafana
networks:
- monitoring
depends_on:
- prometheus
restart: unless-stopped
deploy:
resources:
limits:
cpus: '1'
memory: 1G
cadvisor:
image: gcr.io/cadvisor/cadvisor:latest
container_name: rss2_cadvisor
# SEGURIDAD: Sin exposición de puertos - solo acceso interno
# ports:
# - "8081:8080"
volumes:
- /:/rootfs:ro
- /var/run:/var/run:ro
- /sys:/sys:ro
- /var/lib/docker/:/var/lib/docker:ro
- /dev/disk/:/dev/disk:ro
devices:
- /dev/kmsg
networks:
- monitoring
restart: unless-stopped
deploy:
resources:
limits:
cpus: '0.5'
memory: 512M
# ==================================================================================
# REDES SEGMENTADAS
# ==================================================================================
networks:
# Red frontal - Solo nginx y web app
frontend:
name: rss2_frontend
driver: bridge
internal: false
# Red backend - Base de datos, workers, redis, qdrant
backend:
name: rss2_backend
driver: bridge
internal: false # Acceso externo permitido (necesario para ingestor)
# Red de monitoreo - Prometheus, Grafana, cAdvisor
monitoring:
name: rss2_monitoring
driver: bridge
internal: true
volumes:
prometheus_data:
grafana_data:

391
docs/FEED_DISCOVERY.md Normal file
View file

@ -0,0 +1,391 @@
# Sistema de Descubrimiento y Gestión de Feeds RSS
Este documento describe el sistema mejorado de descubrimiento automático y gestión de feeds RSS implementado en RSS2.
## 📋 Visión General
El sistema ahora incluye dos mecanismos para gestionar feeds RSS:
1. **Gestión Manual Mejorada**: Interfaz web para descubrir y añadir feeds desde cualquier URL
2. **Worker Automático**: Proceso en segundo plano que descubre feeds desde URLs almacenadas
## 🎯 Componentes del Sistema
### 1. Utilidad de Descubrimiento (`utils/feed_discovery.py`)
Módulo Python que proporciona funciones para:
- **`discover_feeds(url)`**: Descubre automáticamente todos los feeds RSS/Atom desde una URL
- **`validate_feed(feed_url)`**: Valida un feed y extrae su información básica
- **`get_feed_metadata(feed_url)`**: Obtiene metadatos detallados de un feed
```python
from utils.feed_discovery import discover_feeds
# Descubrir feeds desde una URL
feeds = discover_feeds('https://elpais.com')
# Retorna: [{'url': '...', 'title': '...', 'valid': True, ...}, ...]
```
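Para validar un feed concreto antes de añadirlo, un esbozo mínimo (las claves exactas del diccionario devuelto son ilustrativas; consulta `utils/feed_discovery.py`):
```python
from utils.feed_discovery import validate_feed, get_feed_metadata

# Validar un feed concreto antes de añadirlo
info = validate_feed('https://elpais.com/rss/elpais/portada.xml')
if info and info.get('valid'):
    # Metadatos detallados: título, descripción, número de entradas, etc.
    print(get_feed_metadata(info['url']))
```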
### 2. Router de Feeds Mejorado (`routers/feeds.py`)
Nuevos endpoints añadidos:
#### Interfaz Web
- **`GET/POST /feeds/discover`**: Interfaz para descubrir feeds desde una URL
- Muestra todos los feeds encontrados
- Permite seleccionar cuáles añadir
- Aplica configuración global (categoría, país, idioma)
- **`POST /feeds/discover_and_add`**: Añade múltiples feeds seleccionados
- Extrae automáticamente título y descripción
- Evita duplicados
- Muestra estadísticas de feeds añadidos
#### API JSON
- **`POST /feeds/api/discover`**: API para descubrir feeds
```json
{
"url": "https://example.com"
}
```
Retorna:
```json
{
"feeds": [...],
"count": 5
}
```
- **`POST /feeds/api/validate`**: API para validar un feed específico
```json
{
"url": "https://example.com/rss"
}
```
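Ejemplo de llamada a la API desde Python (esbozo; asume el puerto 8001 publicado por nginx en `docker-compose.yml` y las claves mostradas arriba):
```python
import requests

resp = requests.post(
    'http://localhost:8001/feeds/api/discover',
    json={'url': 'https://example.com'},
    timeout=60,
)
resp.raise_for_status()
data = resp.json()
print(f"{data['count']} feeds encontrados")
for feed in data['feeds']:
    print(feed.get('url'), feed.get('valid'))
```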
### 3. Worker de Descubrimiento (`workers/url_discovery_worker.py`)
Worker automático que:
1. **Procesa URLs de la tabla `fuentes_url`**
- Prioriza URLs nunca procesadas
- Reintenta URLs con errores
- Actualiza URLs antiguas
2. **Descubre y Crea Feeds Automáticamente**
- Encuentra todos los feeds RSS en cada URL
- Valida cada feed encontrado
- Crea entradas en la tabla `feeds`
- Evita duplicados
3. **Gestión de Estado**
- Actualiza `last_check`, `last_status`, `status_message`
- Maneja errores de forma controlada
- Registra estadísticas detalladas
#### Configuración del Worker
Variables de entorno:
```bash
# Intervalo de ejecución (minutos)
URL_DISCOVERY_INTERVAL_MIN=15
# Número de URLs a procesar por ciclo
URL_DISCOVERY_BATCH_SIZE=10
# Máximo de feeds a crear por URL
MAX_FEEDS_PER_URL=5
```
#### Estados de URLs en `fuentes_url`
| Estado | Descripción |
|--------|-------------|
| `success` | Feeds creados exitosamente |
| `existing` | Feeds encontrados pero ya existían |
| `no_feeds` | No se encontraron feeds RSS |
| `no_valid_feeds` | Se encontraron feeds pero ninguno válido |
| `error` | Error al procesar la URL |
## 🚀 Uso del Sistema
### Método 1: Interfaz Web Manual
1. **Navega a `/feeds/discover`**
2. **Ingresa una URL** (ej: `https://elpais.com`)
3. **Haz clic en "Buscar Feeds"**
4. El sistema mostrará todos los feeds encontrados con:
- Estado de validación
- Título y descripción
- Número de entradas
- Tipo de feed (RSS/Atom)
5. **Configura opciones globales**:
- Categoría
- País
- Idioma
6. **Selecciona los feeds deseados** y haz clic en "Añadir Feeds Seleccionados"
### Método 2: Worker Automático
1. **Añade URLs a la tabla `fuentes_url`**:
```sql
INSERT INTO fuentes_url (nombre, url, categoria_id, pais_id, idioma, active)
VALUES ('El País', 'https://elpais.com', 1, 1, 'es', TRUE);
```
2. **El worker procesará automáticamente**:
- Cada 15 minutos (configurable)
- Descubrirá todos los feeds
- Creará entradas en `feeds`
- Actualizará el estado
3. **Monitorea el progreso**:
```sql
SELECT nombre, url, last_check, last_status, status_message
FROM fuentes_url
ORDER BY last_check DESC;
```
### Método 3: Interfaz de URLs (Existente)
Usa la interfaz web existente en `/urls/add_source` para añadir URLs que serán procesadas por el worker.
## 🔄 Flujo de Trabajo Completo
```
┌─────────────────┐
│ Usuario añade │
│ URL del sitio │
└────────┬────────┘
v
┌─────────────────────────┐
│ URL guardada en │
│ tabla fuentes_url │
└────────┬────────────────┘
v
┌─────────────────────────┐
│ Worker ejecuta cada │
│ 15 minutos │
└────────┬────────────────┘
v
┌─────────────────────────┐
│ Descubre feeds RSS │
│ usando feedfinder2 │
└────────┬────────────────┘
v
┌─────────────────────────┐
│ Valida cada feed │
│ encontrado │
└────────┬────────────────┘
v
┌─────────────────────────┐
│ Crea entradas en │
│ tabla feeds │
└────────┬────────────────┘
v
┌─────────────────────────┐
│ Ingestor Go procesa │
│ feeds cada 15 minutos │
└────────┬────────────────┘
v
┌─────────────────────────┐
│ Noticias descargadas │
│ y procesadas │
└─────────────────────────┘
```
## 📊 Tablas de Base de Datos
### `fuentes_url`
Almacena URLs de sitios web para descubrimiento automático:
```sql
CREATE TABLE fuentes_url (
id SERIAL PRIMARY KEY,
nombre VARCHAR(255) NOT NULL,
url TEXT NOT NULL UNIQUE,
categoria_id INTEGER REFERENCES categorias(id),
pais_id INTEGER REFERENCES paises(id),
idioma CHAR(2) DEFAULT 'es',
last_check TIMESTAMP,
last_status VARCHAR(50),
status_message TEXT,
last_http_code INTEGER,
active BOOLEAN DEFAULT TRUE
);
```
### `feeds`
Almacena feeds RSS descubiertos y validados:
```sql
CREATE TABLE feeds (
id SERIAL PRIMARY KEY,
nombre VARCHAR(255),
descripcion TEXT,
url TEXT NOT NULL UNIQUE,
categoria_id INTEGER REFERENCES categorias(id),
pais_id INTEGER REFERENCES paises(id),
idioma CHAR(2),
activo BOOLEAN DEFAULT TRUE,
fallos INTEGER DEFAULT 0,
last_etag TEXT,
last_modified TEXT,
last_error TEXT
);
```
## ⚙️ Configuración del Sistema
### Variables de Entorno
Añade al archivo `.env`:
```bash
# RSS Ingestor
RSS_POKE_INTERVAL_MIN=15 # Intervalo de ingesta (minutos)
RSS_MAX_FAILURES=10 # Fallos máximos antes de desactivar feed
RSS_FEED_TIMEOUT=60 # Timeout para descargar feeds (segundos)
# URL Discovery Worker
URL_DISCOVERY_INTERVAL_MIN=15 # Intervalo de descubrimiento (minutos)
URL_DISCOVERY_BATCH_SIZE=10 # URLs a procesar por ciclo
MAX_FEEDS_PER_URL=5 # Máximo de feeds por URL
```
### Docker Compose
El worker ya está configurado en `docker-compose.yml`:
```yaml
url-discovery-worker:
build: .
container_name: rss2_url_discovery
command: bash -lc "python -m workers.url_discovery_worker"
environment:
DB_HOST: db
URL_DISCOVERY_INTERVAL_MIN: 15
URL_DISCOVERY_BATCH_SIZE: 10
MAX_FEEDS_PER_URL: 5
depends_on:
db:
condition: service_healthy
restart: unless-stopped
```
## 🔧 Comandos Útiles
### Ver logs del worker de descubrimiento
```bash
docker logs -f rss2_url_discovery
```
### Reiniciar el worker
```bash
docker restart rss2_url_discovery
```
### Ejecutar manualmente el worker (testing)
```bash
docker exec -it rss2_url_discovery python -m workers.url_discovery_worker
```
### Ver estadísticas de descubrimiento
```sql
-- URLs procesadas recientemente
SELECT nombre, url, last_check, last_status, status_message
FROM fuentes_url
WHERE last_check > NOW() - INTERVAL '1 day'
ORDER BY last_check DESC;
-- Feeds creados recientemente
SELECT nombre, url, fecha_creacion
FROM feeds
WHERE fecha_creacion > NOW() - INTERVAL '1 day'
ORDER BY fecha_creacion DESC;
```
## 🛠️ Troubleshooting
### El worker no encuentra feeds
1. Verifica que la URL sea accesible:
```bash
curl -I https://example.com
```
2. Verifica los logs del worker:
```bash
docker logs rss2_url_discovery
```
3. Prueba manualmente el descubrimiento:
```python
from utils.feed_discovery import discover_feeds
feeds = discover_feeds('https://example.com')
print(feeds)
```
### Feeds duplicados
El sistema previene duplicados usando `ON CONFLICT (url) DO NOTHING`. Si un feed ya existe, simplemente se omite.
### Worker consume muchos recursos
Ajusta las configuraciones:
```bash
# Reduce el batch size
URL_DISCOVERY_BATCH_SIZE=5
# Aumenta el intervalo
URL_DISCOVERY_INTERVAL_MIN=30
# Reduce feeds por URL
MAX_FEEDS_PER_URL=3
```
## 📝 Mejores Prácticas
1. **Añade URLs de sitios, no feeds directos**
- ✅ `https://elpais.com`
- ❌ `https://elpais.com/rss/feed.xml`
2. **Configura categoría y país correctamente**
- Facilita la organización
- Mejora la experiencia del usuario
3. **Monitorea el estado de las URLs**
- Revisa periódicamente `fuentes_url`
- Desactiva URLs que fallan consistentemente
4. **Limita el número de feeds por URL**
- Evita sobrecarga de feeds similares
- Mantén `MAX_FEEDS_PER_URL` entre 3-5
## 🎉 Ventajas del Sistema
**Automatización completa**: Solo añade URLs, el sistema hace el resto
**Descubrimiento inteligente**: Encuentra todos los feeds disponibles
**Validación automática**: Solo crea feeds válidos y funcionales
**Sin duplicados**: Gestión inteligente de feeds existentes
**Escalable**: Procesa múltiples URLs en lotes
**Resiliente**: Manejo robusto de errores y reintentos
**Monitoreable**: Logs detallados y estados claros
## 📚 Referencias
- **feedfinder2**: https://github.com/dfm/feedfinder2
- **feedparser**: https://feedparser.readthedocs.io/
- **Tabla fuentes_url**: `/init-db/01.schema.sql`
- **Worker**: `/workers/url_discovery_worker.py`
- **Utilidades**: `/utils/feed_discovery.py`

223
docs/PARRILLAS_VIDEOS.md Normal file
View file

@ -0,0 +1,223 @@
# 🎬 Sistema de Parrillas de Videos Automatizados
## 📋 Descripción
Este sistema permite generar videos automáticos de noticias filtradas según diferentes criterios:
- **Por País**: Noticias de Bulgaria, España, Estados Unidos, etc.
- **Por Categoría**: Ciencia, Tecnología, Deportes, Política, etc.
- **Por Entidad**: Personas u organizaciones específicas (ej: "Donald Trump", "OpenAI")
- **Por Continente**: Europa, Asia, América, etc.
## 🎯 Características
### ✅ Sistema Implementado
1. **Base de Datos**
- Tabla `video_parrillas`: Configuraciones de parrillas
- Tabla `video_generados`: Registro de videos creados
- Tabla `video_noticias`: Relación entre videos y noticias
2. **API REST**
- `GET /parrillas/` - Listado de parrillas
- `GET /parrillas/<id>` - Detalle de parrilla
- `POST /parrillas/nueva` - Crear parrilla
- `GET /parrillas/api/<id>/preview` - Preview de noticias
- `POST /parrillas/api/<id>/generar` - Generar video
- `POST /parrillas/api/<id>/toggle` - Activar/desactivar
- `DELETE /parrillas/api/<id>` - Eliminar parrilla
3. **Generador de Videos**
- Script: `generar_videos_noticias.py`
- Integración con AllTalk TTS
- Generación automática de subtítulos SRT
- Soporte para múltiples idiomas
## 🚀 Uso Rápido
### 1. Crear una Parrilla
```bash
# Acceder a la interfaz web
http://localhost:8001/parrillas/
# O usar SQL directo
docker-compose exec -T db psql -U rss -d rss -c "
INSERT INTO video_parrillas (nombre, descripcion, tipo_filtro, pais_id, max_noticias, frecuencia, activo)
VALUES ('Noticias de Bulgaria', 'Resumen diario de noticias de Bulgaria', 'pais',
(SELECT id FROM paises WHERE nombre = 'Bulgaria'), 5, 'daily', true);
"
```
### 2. Generar Video Manualmente
```bash
# Generar video para parrilla con ID 1
docker-compose exec web python generar_videos_noticias.py 1
```
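Alternativamente, vía la API REST documentada arriba (esbozo; asume el puerto 8001 de nginx y que el endpoint responde JSON):
```python
import requests

# Lanzar la generación del video de la parrilla con ID 1
resp = requests.post('http://localhost:8001/parrillas/api/1/generar', timeout=600)
resp.raise_for_status()
print(resp.json())
```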
### 3. Generación Automática (Diaria)
```bash
# Procesar todas las parrillas activas con frecuencia 'daily'
docker-compose exec web python generar_videos_noticias.py
```
## 📝 Ejemplos de Parrillas
### Ejemplo 1: Noticias de Ciencia en Europa
```sql
INSERT INTO video_parrillas (
nombre, descripcion, tipo_filtro,
categoria_id, continente_id,
max_noticias, duracion_maxima, idioma_voz,
frecuencia, activo
) VALUES (
'Ciencia en Europa',
'Las últimas noticias científicas de Europa',
'categoria',
(SELECT id FROM categorias WHERE nombre ILIKE '%ciencia%'),
(SELECT id FROM continentes WHERE nombre = 'Europa'),
7, 300, 'es',
'daily', true
);
```
### Ejemplo 2: Noticias sobre una Persona
```sql
INSERT INTO video_parrillas (
nombre, descripcion, tipo_filtro,
entidad_nombre, entidad_tipo,
max_noticias, idioma_voz,
frecuencia, activo
) VALUES (
'Donald Trump en las Noticias',
'Todas las menciones de Donald Trump',
'entidad',
'Donald Trump', 'persona',
10, 'es',
'daily', true
);
```
### Ejemplo 3: Noticias de Tecnología
```sql
INSERT INTO video_parrillas (
nombre, descripcion, tipo_filtro,
categoria_id,
max_noticias, idioma_voz,
include_subtitles, template,
frecuencia, activo
) VALUES (
'Tech News Daily',
'Resumen diario de tecnología',
'categoria',
(SELECT id FROM categorias WHERE nombre ILIKE '%tecnolog%'),
8, 'es',
true, 'modern',
'daily', true
);
```
## 🔧 Configuración Avanzada
### Opciones de Parrilla
| Campo | Tipo | Descripción |
|-------|------|-------------|
| `nombre` | string | Nombre único de la parrilla |
| `descripcion` | text | Descripción detallada |
| `tipo_filtro` | enum | 'pais', 'categoria', 'entidad', 'continente', 'custom' |
| `pais_id` | int | ID del país (si tipo_filtro='pais') |
| `categoria_id` | int | ID de categoría |
| `continente_id` | int | ID de continente |
| `entidad_nombre` | string | Nombre de persona/organización |
| `entidad_tipo` | string | 'persona' o 'organizacion' |
| `max_noticias` | int | Máximo de noticias por video (default: 5) |
| `duracion_maxima` | int | Duración máxima en segundos (default: 180) |
| `idioma_voz` | string | Idioma del TTS ('es', 'en', etc.) |
| `template` | string | 'standard', 'modern', 'minimal' |
| `include_images` | bool | Incluir imágenes en el video |
| `include_subtitles` | bool | Generar subtítulos SRT |
| `frecuencia` | string | 'daily', 'weekly', 'manual' |
### Configuración de AllTalk
El sistema utiliza AllTalk para generar la narración con voz. Configurar en docker-compose.yml:
```yaml
environment:
ALLTALK_URL: http://alltalk:7851
```
## 📊 Estructura de Archivos Generados
```
data/
videos/
<video_id>/
script.txt # Libreto completo del video
audio.wav # Audio generado con TTS
subtitles.srt # Subtítulos (si enabled)
metadata.json # Metadata del video
```
## 🔄 Workflow de Generación
1. **Consulta de Noticias**: Filtra noticias según criterios de la parrilla
2. **Construcción de Script**: Genera libreto narrativo
3. **Síntesis de Voz**: Envía texto a AllTalk TTS
4. **Generación de Subtítulos**: Crea archivo SRT con timestamps (ver esbozo tras esta lista)
5. **Registro en BD**: Guarda paths y metadata en `video_generados`
6. **Relación de Noticias**: Vincula noticias incluidas en `video_noticias`
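Los timestamps SRT usan el formato `HH:MM:SS,mmm`; el formateo que aplica el generador (`generar_videos_noticias.py`) equivale a:
```python
def format_srt_time(seconds: float) -> str:
    """Formatea segundos a formato SRT (HH:MM:SS,mmm)."""
    hours = int(seconds // 3600)
    minutes = int((seconds % 3600) // 60)
    secs = int(seconds % 60)
    millis = int((seconds % 1) * 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"

print(format_srt_time(75.5))  # -> 00:01:15,500
```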
## 🎨 Próximas Mejoras
- [ ] Integración con generador de videos (combinar audio + imágenes)
- [ ] Templates visuales personalizados
- [ ] Transiciones entre noticias
- [ ] Música de fondo
- [ ] Logo/branding personalizado
- [ ] Exportación directa a YouTube/TikTok
- [ ] Programación automática con cron
- [ ] Dashboard de analíticas de videos
- [ ] Sistema de thumbnails automáticos
## 🐛 Troubleshooting
### Error: "No hay noticias disponibles"
- Verificar que existan noticias traducidas (`traducciones.status = 'done'`)
- Ajustar filtros de la parrilla
- Verificar rango de fechas (por defecto últimas 24h)
### Error en AllTalk TTS
- Verificar que el servicio AllTalk esté corriendo
- Revisar URL en variable de entorno `ALLTALK_URL`
- Comprobar logs: `docker-compose logs alltalk`
### Video no se genera
- Revisar estado en tabla `video_generados`
- Ver columna `error_message` si `status = 'error'`
- Verificar permisos en directorio `/app/data/videos`
## 📞 Soporte
Para problemas o sugerencias, revisar los logs:
```bash
# Logs del generador
docker-compose exec web python generar_videos_noticias.py <id> 2>&1 | tee video_generation.log
# Ver videos en cola
docker-compose exec -T db psql -U rss -d rss -c "
SELECT id, parrilla_id, titulo, status, fecha_generacion
FROM video_generados
ORDER BY fecha_generacion DESC LIMIT 10;
"
```
## 📄 Licencia
Este módulo es parte del sistema RSS2 News Aggregator.

View file

@ -0,0 +1,426 @@
# 📖 PROCESO COMPLETO: Descubrimiento y Gestión de Feeds RSS
## 🎯 Problema Resuelto
**Pregunta:** ¿Cómo asigno país y categoría a los feeds descubiertos automáticamente?
**Respuesta:** El sistema ahora usa un flujo inteligente de 3 niveles:
1. **Auto-aprobación** (feeds con categoría/país)
2. **Revisión manual** (feeds sin metadata completa)
3. **Análisis automático** (sugerencias inteligentes)
---
## 🔄 FLUJO COMPLETO DEL SISTEMA
### Paso 1: Añadir URL Fuente
Tienes 2 opciones para añadir URLs:
#### Opción A: Con Categoría y País (AUTO-APROBACIÓN)
```sql
INSERT INTO fuentes_url (nombre, url, categoria_id, pais_id, idioma, active)
VALUES ('El País', 'https://elpais.com', 1, 44, 'es', TRUE);
-- ^ ^
-- categoria_id pais_id
```
**Resultado**: Feeds se crean **AUTOMÁTICAMENTE** y se activan
- Worker descubre feeds
- Hereda categoría (1) y país (44) del padre
- Crea feeds en tabla `feeds` directamente
- Ingestor empieza a descargar noticias
#### Opción B: Sin Categoría o País (REQUIERE REVISIÓN)
```sql
INSERT INTO fuentes_url (nombre, url, active)
VALUES ('BBC News', 'https://www.bbc.com/news', TRUE);
-- Sin categoria_id ni pais_id
```
⚠️ **Resultado**: Feeds van a **REVISIÓN MANUAL**
- Worker descubre feeds
- Analiza automáticamente:
- Detecta país desde dominio (.com → Reino Unido)
- Detecta idioma (en)
- Sugiere categoría ("Internacional")
- Crea feeds en tabla `feeds_pending`
- **ESPERA APROBACIÓN MANUAL** antes de activar
---
### Paso 2: Worker Descubre Feeds (cada 15 min)
El worker `url_discovery_worker` se ejecuta automáticamente:
```
1. Lee fuentes_url activas
2. Para cada URL:
a. Descubre todos los feeds RSS
b. Valida cada feed
c. Analiza metadata:
- Idioma del feed
- País (desde dominio: .es, .uk, .fr, etc.)
- Categoría sugerida (keywords en título/descripción)
d. DECIDE EL FLUJO:
┌─────────────────────────────────────┐
│ ¿Parent tiene categoria_id Y pais_id? │
└──────────┬──────────────────────────┘
┌────────┴────────┐
│ SÍ │ NO
▼ ▼
┌──────────────┐ ┌─────────────────┐
│ AUTO-APROBAR │ │ REQUIERE REVISIÓN│
└───────┬──────┘ └─────────┬───────┘
│ │
▼ ▼
tabla: feeds tabla: feeds_pending
activo: TRUE reviewed: FALSE
✅ Listo para ⏳ Espera aprobación
ingestor
```
---
### Paso 3A: Feeds AUTO-APROBADOS
Si la URL padre tiene `categoria_id` y `pais_id`:
```sql
-- Ejemplo: URL con metadata completa
fuentes_url:
id=1, url='https://elpais.com',
categoria_id=1 (Noticias),
pais_id=44 (España)
↓ Worker descubre 3 feeds:
- https://elpais.com/rss/portada.xml
- https://elpais.com/rss/internacional.xml
- https://elpais.com/rss/deportes.xml
↓ Se crean DIRECTAMENTE en tabla feeds:
INSERT INTO feeds (nombre, url, categoria_id, pais_id, activo)
VALUES
('El País - Portada', 'https://elpais.com/rss/portada.xml', 1, 44, TRUE),
('El País - Internacional', 'https://elpais.com/rss/internacional.xml', 1, 44, TRUE),
('El País - Deportes', 'https://elpais.com/rss/deportes.xml', 1, 44, TRUE);
✅ Feeds están ACTIVOS inmediatamente
✅ Ingestor Go los procesa en siguiente ciclo (15 min)
✅ Noticias empiezan a llegar
```
---
### Paso 3B: Feeds PENDIENTES (requieren revisión)
Si la URL padre NO tiene `categoria_id` o `pais_id`:
```sql
-- Ejemplo: URL sin metadata
fuentes_url:
id=2, url='https://www.bbc.com/news',
categoria_id=NULL,
pais_id=NULL
↓ Worker descubre 2 feeds y ANALIZA automáticamente:
Feed 1: https://www.bbc.com/news/world/rss.xml
- Título: "BBC News - World"
- Idioma detectado: 'en'
- País detectado: 'Reino Unido' (desde .com + idioma inglés)
- Categoría sugerida: 'Internacional' (keyword "world")
Feed 2: https://www.bbc.com/sport/rss.xml
- Título: "BBC Sport"
- Idioma detectado: 'en'
- País detectado: 'Reino Unido'
- Categoría sugerida: 'Deportes' (keyword "sport")
↓ Se crean en tabla feeds_pending:
INSERT INTO feeds_pending (
fuente_url_id, feed_url, feed_title,
feed_language, detected_country_id, suggested_categoria_id,
reviewed, approved, notes
) VALUES (
2,
'https://www.bbc.com/news/world/rss.xml',
'BBC News - World',
'en',
74, -- Reino Unido (ID detectado)
2, -- Internacional (ID sugerido)
FALSE, FALSE,
'Country from domain: Reino Unido | Suggested category: Internacional (confidence: 85%)'
);
⏳ Feeds están PENDIENTES
⏳ NO están activos aún
⏳ Requieren revisión manual en /feeds/pending
```
---
## 📊 Tabla Comparativa
| Aspecto | Auto-Aprobación | Revisión Manual |
|---------|----------------|-----------------|
| **Requisito** | URL padre con `categoria_id` Y `pais_id` | URL padre sin uno o ambos |
| **Tabla destino** | `feeds` (directa) | `feeds_pending` (temporal) |
| **Estado inicial** | `activo = TRUE` | `reviewed = FALSE, approved = FALSE` |
| **Análisis automático** | Hereda valores del padre | Detecta país, sugiere categoría |
| **Intervención manual** | ❌ No necesaria | ✅ Requerida |
| **Tiempo hasta activación** | Inmediato | Después de aprobación |
| **Ingestor procesa** | Sí (próximo ciclo) | No (hasta aprobar) |
---
## 🛠️ Interfaces de Gestión
### 1. Añadir URL con Metadata (Auto-aprobación)
**Ruta:** `/urls/add_source`
```
Formulario:
┌─────────────────────────────────────┐
│ Nombre: El País │
│ URL: https://elpais.com │
│ Categoría: [Noticias ▼] ← IMPORTANTE
│ País: [España ▼] ← IMPORTANTE
│ Idioma: es │
│ │
│ [Añadir Fuente] │
└─────────────────────────────────────┘
Resultado: Feeds se crearán AUTOMÁTICAMENTE
```
### 2. Revisar Feeds Pendientes (Nueva interfaz)
**Ruta:** `/feeds/pending` (próximamente)
```
Feeds Pendientes de Revisión
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
Feed: BBC News - World
URL: https://www.bbc.com/news/world/rss.xml
Fuente: BBC News (https://www.bbc.com/news)
Análisis Automático:
├─ Idioma: English (en)
├─ País detectado: Reino Unido (.com domain + language)
└─ Categoría sugerida: Internacional (85% confianza)
Keywords: "world", "international", "global"
┌─────────────────────────────────────┐
│ Categoría: [Internacional ▼] │ ← Pre-seleccionada
│ País: [Reino Unido ▼] │ ← Pre-seleccionado
│ Idioma: [en] │ ← Auto-detectado
│ │
│ [✓ Aprobar Feed] [✗ Rechazar] │
└─────────────────────────────────────┘
```
### 3. Descubrir Feeds Manualmente
**Ruta:** `/feeds/discover`
```
Perfecto para cuando quieres control total:
1. Ingresar URL
2. Ver todos los feeds encontrados
3. Seleccionar cuáles añadir
4. Asignar categoría/país globalmente
5. Feeds se crean directamente (no van a pending)
```
---
## 💡 RECOMENDACIONES DE USO
### Estrategia 1: Auto-aprobación Total
**Para fuentes conocidas y confiables:**
```sql
-- Añadir fuentes con metadata completa
INSERT INTO fuentes_url (nombre, url, categoria_id, pais_id, idioma) VALUES
('El País', 'https://elpais.com', 1, 44, 'es'),
('Le Monde', 'https://lemonde.fr', 1, 60, 'fr'),
('The Guardian', 'https://theguardian.com', 1, 74, 'en');
-- Worker creará feeds automáticamente
-- Sin intervención manual necesaria
```
### Estrategia 2: Revisión Manual
**Para fuentes nuevas o desconocidas:**
```sql
-- Añadir sin metadata
INSERT INTO fuentes_url (nombre, url) VALUES
('Sitio Desconocido', 'https://ejemplo.com');
-- Worker crea feeds en feeds_pending
-- Revisar en /feeds/pending
-- Aprobar/rechazar manualmente
```
### Estrategia 3: Híbrida (Recomendada)
**Combinar ambas:**
- URLs conocidas → Con categoría/país
- URLs nuevas → Sin metadata (revisión)
- Usar análisis automático como guía
- Ajustar manualmente si es necesario
---
## 🔍 Análisis Automático Explicado
### Detección de País
```
# 1. Desde dominio (TLD)
.es → España
.uk, .co.uk → Reino Unido
.fr → Francia
.de → Alemania
.mx → México
.ar → Argentina
# 2. Desde idioma (si no hay dominio claro)
es → España (país principal)
en → Reino Unido
fr → Francia
pt → Portugal
# 3. Desde subdominios
es.example.com → España
uk.example.com → Reino Unido
```
### Sugerencia de Categoría
```
# Análisis de keywords en título + descripción
Keywords encontrados → Categoría sugerida (% confianza)
"política", "gobierno", "elecciones" → Política (75%)
"economía", "bolsa", "mercado" → Economía (82%)
"tecnología", "software", "digital" → Tecnología (90%)
"deportes", "fútbol", "liga" → Deportes (95%)
"internacional", "mundo", "global" → Internacional (70%)
```
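Un esbozo mínimo en Python de estas heurísticas (los mapas y el nombre de la función son ilustrativos, no la implementación exacta del worker):
```python
from urllib.parse import urlparse

# Mapas ilustrativos: la implementación real vive en workers/url_discovery_worker.py
TLD_PAIS = {'.es': 'España', '.uk': 'Reino Unido', '.fr': 'Francia',
            '.de': 'Alemania', '.mx': 'México', '.ar': 'Argentina'}
KEYWORDS_CATEGORIA = {'Deportes': ['deportes', 'fútbol', 'sport'],
                      'Tecnología': ['tecnología', 'software', 'tech'],
                      'Internacional': ['internacional', 'mundo', 'world']}

def analizar_feed(feed_url: str, titulo: str, descripcion: str):
    host = urlparse(feed_url).hostname or ''
    # 1. País desde el TLD del dominio
    pais = next((p for tld, p in TLD_PAIS.items() if host.endswith(tld)), None)
    # 2. Categoría por keywords en título + descripción
    texto = f"{titulo} {descripcion}".lower()
    categoria = next((c for c, kws in KEYWORDS_CATEGORIA.items()
                      if any(kw in texto for kw in kws)), None)
    return pais, categoria

print(analizar_feed('https://www.bbc.com/sport/rss.xml', 'BBC Sport', ''))
# (None, 'Deportes')  <- .com no mapea a país; ahí se recurre al idioma
```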
---
## 📝 Ejemplos Completos
### Ejemplo 1: Periódico Español (Auto-aprobación)
```sql
-- 1. Añadir fuente con metadata
INSERT INTO fuentes_url (nombre, url, categoria_id, pais_id, idioma)
VALUES ('El Mundo', 'https://elmundo.es', 1, 44, 'es');
-- 2. Worker ejecuta (15 min después):
-- - Descubre: elmundo.es/rss/portada.xml
-- - Descubre: elmundo.es/rss/deportes.xml
-- - Hereda: categoria_id=1, pais_id=44
-- - Crea en feeds directamente
-- 3. Resultado en tabla feeds:
SELECT id, nombre, url, categoria_id, pais_id, activo
FROM feeds
WHERE fuente_nombre LIKE '%El Mundo%';
-- id | nombre | url | cat | pais | activo
-- 1 | El Mundo - Portada | elmundo.es/rss/portada.xml | 1 | 44 | TRUE
-- 2 | El Mundo - Deportes | elmundo.es/rss/deportes.xml | 1 | 44 | TRUE
-- ✅ Feeds activos, ingestor procesando
```
### Ejemplo 2: Sitio Internacional (Revisión Manual)
```sql
-- 1. Añadir fuente SIN metadata
INSERT INTO fuentes_url (nombre, url)
VALUES ('Reuters', 'https://www.reuters.com');
-- 2. Worker ejecuta (15 min después):
-- - Descubre: reuters.com/rssfeed/worldNews
-- - Analiza: idioma=en, país=Reino Unido (dominio+idioma)
-- - Sugiere: categoría=Internacional (keyword "world")
-- - Crea en feeds_pending
-- 3. Resultado en tabla feeds_pending:
SELECT feed_title, detected_country_id, suggested_categoria_id, notes
FROM feeds_pending
WHERE fuente_url_id = 3;
-- feed_title | detected_country_id | suggested_cat | notes
-- Reuters World News | 74 (Reino Unido) | 2 (Int.) | "Country from domain..."
-- ⏳ Requiere aprobación en /feeds/pending
```
---
## ✅ CHECKLIST: Añadir Nueva Fuente
**Para auto-aprobación (recomendado si sabes país/categoría):**
- [ ] Ir a `/urls/add_source`
- [ ] Ingresar nombre descriptivo
- [ ] Ingresar URL del sitio (NO del feed RSS)
- [ ] **IMPORTANTE:** Seleccionar categoría
- [ ] **IMPORTANTE:** Seleccionar país
- [ ] Ingresar idioma (opcional, se detecta)
- [ ] Guardar
- [ ] Esperar 15 minutos (máximo)
- [ ] Ver feeds en `/feeds/` (activos automáticamente)
**Para revisión manual (si no estás seguro):**
- [ ] Ir a `/urls/add_source`
- [ ] Ingresar nombre y URL
- [ ] Dejar categoría/país vacíos
- [ ] Guardar
- [ ] Esperar 15 minutos
- [ ] Ir a `/feeds/pending`
- [ ] Revisar sugerencias automáticas
- [ ] Ajustar categoría/país si necesario
- [ ] Aprobar feeds
- [ ] Feeds se activan inmediatamente
---
## 🎓 Resumen Ejecutivo
**3 Niveles de Automatización:**
| Nivel | Descripción | Cuándo Usar |
|-------|-------------|-------------|
| **Nivel 1: Totalmente Manual** | Descubrir en `/feeds/discover` | Control total, pocas URLs |
| **Nivel 2: Auto-aprobación** | URL con cat/país → feeds activos | URLs confiables, muchas fuentes |
| **Nivel 3: Revisión Asistida** | URL sin cat/país → análisis → aprobar | URLs nuevas, verificación |
**Flujo Recomendado:**
1. Añade URL con categoría/país si la conoces
2. Si no, déjalo vacío y revisa sugerencias automáticas
3. Worker descubre y analiza todo automáticamente
4. Tú solo apruebas/ajustas lo necesario
**Resultado:** Gestión eficiente de cientos de fuentes RSS con mínima intervención manual.
---
**📅 Fecha de última actualización:** 2026-01-07
**📌 Versión del sistema:** 2.0 - Análisis Inteligente de Feeds

262
entity_config.json Normal file
View file

@ -0,0 +1,262 @@
{
"blacklist": [
"Indicó",
"Insistió",
"levier",
"Dios",
"Elefantes",
"AbidjanTV.net",
"Gabón",
"París",
"Bingerville",
"Biankouma",
"Bouaké",
"Cote",
"Costa",
"Pugh",
"Gracias",
"Tesla",
"Netflix",
"Disney",
"Ford",
"Continúe",
"Cenizas",
"Victoria",
"Fiscalía",
"Detalles",
"Presiones",
"Fuertes",
"Industria",
"Dios",
"Nuestro",
"Según",
"Finanzas",
"Enterate",
"Presupuesto",
"Adiós",
"Quién",
"Cuándo",
"Hablando",
"Términos",
"Agregó",
"Blockchain",
"blockchain",
"Bitcoin",
"Trono",
"Robotaxi",
"Penalidades",
"Deportes",
"Candidato",
"Parecía",
"Declaraciones",
"Había",
"Hermanos",
"Hombre de Dios",
"Creyendo",
"Renacimiento",
"Cristo",
"Señor",
"Jefe de Estado",
"Educación",
"Dice",
"Estoy",
"Expresaron",
"Bondi",
"Suspechoso",
"Sydney",
"Amazon",
"L'FC Andorra",
"Delanteros",
"Lee",
"Leé",
"Lea",
"Presione",
"Carrera",
"Alejandría",
"Javier Romero Communications and Marketing Officer Statistical Institute of Belize jromero@mail.sib.org.bz",
"Copyright",
"Allah",
"Crack",
"Akbar",
"Getty Images",
"Buffalo Bills",
"Comandantes",
"Gigantes",
"Anónimo",
"Raadiouudised",
"Eswatini",
"Stock",
"Ethio Telecom",
"DPWH",
"Obras Públicas",
"Malacañang",
"Sigue",
"Becas",
"Crédito",
"Claro",
"Reaccione",
"Indique",
"Pakistán",
"Tolo News",
"Gladbach",
"Alger"
],
"synonyms": {
"Alassane Ouattara": [
"Ouattara",
"M. Ouattara",
"Presidente Ouattara"
],
"Alexander Stubb": [
"Stubb"
],
"Brice Clotaire Oligui Nguema": [
"Oligui Nguema"
],
"Dr. Mahamudu Bawumia": [
"Dr. Bawumia"
],
"John Dramani Mahama": [
"John Mahama",
"Mahama"
],
"Antoine Semenyo": [
"Semenyo"
],
"Kim Jong Un": [
"Kim Jong"
],
"Daniel Noboa": [
"Noboa"
],
"Donald Trump": [
"Trump",
"Mr. Trump"
],
"Nayib Bukele": [
"Bukele"
],
"Peter Pellegrini": [
"Pellegrini"
],
"Robert Fico": [
"Fico"
],
"Vladimir Putin": [
"Putin",
"V. Putin"
],
"Emmanuel Macron": [
"Macron",
"Presidente Macron"
],
"Pedro Sánchez": [
"Sánchez",
"Pedro Sanchez"
],
"Nicolás Maduro": [
"Maduro"
],
"Lula da Silva": [
"Lula",
"Luiz Inácio Lula da Silva"
],
"Jeffrey Epstein": [
"Epstein"
],
"José Antonio Kast": [
"Kast"
],
"Jake Paul": [
"Paul"
],
"Lionel Messi": [
"Messi",
"Lionel Andres Messi",
"Leo Messi"
],
"Luis Caputo": [
"Caputo"
],
"Javier Milei": [
"Milei"
],
"Mia Amor Mottley": [
"Mia Mottley",
"Mottley"
],
"Elon Musk": [
"Musk"
],
"Ibrahim Traoré": [
"Traoré",
"Ibrahim Traore",
"Capitán Ibrahim TRAORÉ"
],
"Friedrich Merz": [
"Merz"
],
"Edi Rama": [
"Rama"
],
"Fabiola Hoxha": [
"Hoxha"
],
"Hugo Broos": [
"Broos"
],
"Himad Abdelli": [
"Abdelli"
],
"Abdelmadjid Tebboune": [
"Tebboune",
"Abdul Majid Tabon",
"Tabon"
],
"Ibrahim Maza": [
"Maza"
],
"Shehbaz Sharif": [
"Sharif"
],
"Lindsey Vonn": [
"Vonn"
],
"Erdogan": [
"Rəcəb Tayyib Erdoğan"
],
"Evo Morales": [
"Morales"
],
"Petteri Orpo": [
"Orpo"
],
"Abdel Fattah Al-Sisi": [
"Sisi"
],
"Sharif Osman Hadi": [
"Osman Hadi",
"Hadi",
"Hadir",
"Osmán Hadi"
],
"Nikol Pashinyan": [
"Pashinyan"
],
"René Benko": [
"Benko"
],
"Bashar al-Assad": [
"Assad"
],
"Hugo Motta": [
"Motta"
],
"Viktor Orbán": [
"Víctor Orbán",
"Orbán",
"Orban"
]
}
}

369
generar_videos_noticias.py Normal file
View file

@ -0,0 +1,369 @@
#!/usr/bin/env python3
"""
Generador de videos de noticias a partir de parrillas.
Este script procesa parrillas pendientes y genera videos con TTS.
"""
import os
import sys
import json
import logging
from datetime import datetime
from pathlib import Path
import requests
from db import get_conn
from psycopg2 import extras
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Configuración
OUTPUT_DIR = Path("/app/data/videos")
AUDIO_DIR = Path("/app/data/audio")
SUBTITLES_DIR = Path("/app/data/subtitles")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
AUDIO_DIR.mkdir(parents=True, exist_ok=True)
SUBTITLES_DIR.mkdir(parents=True, exist_ok=True)
# URL del servicio AllTalk TTS (ajustar según configuración)
ALLTALK_URL = os.getenv("ALLTALK_URL", "http://alltalk:7851")
def obtener_noticias_parrilla(parrilla, conn):
"""
Obtiene las noticias que se incluirán en el video según los filtros de la parrilla.
"""
with conn.cursor(cursor_factory=extras.DictCursor) as cur:
where_clauses = []
params = []
if parrilla['pais_id']:
where_clauses.append("n.pais_id = %s")
params.append(parrilla['pais_id'])
if parrilla['categoria_id']:
where_clauses.append("n.categoria_id = %s")
params.append(parrilla['categoria_id'])
if parrilla['entidad_nombre']:
where_clauses.append("""
EXISTS (
SELECT 1 FROM tags_noticia tn
JOIN tags t ON t.id = tn.tag_id
WHERE tn.traduccion_id = tr.id
AND t.tipo = %s
AND t.valor ILIKE %s
)
""")
params.append(parrilla['entidad_tipo'])
params.append(f"%{parrilla['entidad_nombre']}%")
# Solo noticias recientes (últimas 24 horas)
where_clauses.append("n.fecha >= NOW() - INTERVAL '1 day'")
where_sql = " AND ".join(where_clauses) if where_clauses else "1=1"
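# Orden de parámetros: tr.lang_to (%s del JOIN) va primero, luego los filtros del WHERE y por último el LIMIT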
cur.execute(f"""
SELECT
n.id,
n.titulo,
n.imagen_url,
n.url,
n.fecha,
n.fuente_nombre,
tr.id as traduccion_id,
tr.titulo_trad,
tr.resumen_trad,
p.nombre as pais,
c.nombre as categoria
FROM noticias n
LEFT JOIN traducciones tr ON tr.noticia_id = n.id
AND tr.lang_to = %s
AND tr.status = 'done'
LEFT JOIN paises p ON p.id = n.pais_id
LEFT JOIN categorias c ON c.id = n.categoria_id
WHERE {where_sql}
AND tr.id IS NOT NULL
ORDER BY n.fecha DESC
LIMIT %s
""", [parrilla['idioma_voz']] + params + [parrilla['max_noticias']])
return cur.fetchall()
def generar_audio_tts(texto, output_path, idioma='es'):
"""
Genera audio usando el servicio AllTalk TTS.
"""
try:
# Preparar request para AllTalk
payload = {
"text_input": texto,
"text_filtering": "standard",
"character_voice_gen": "irene2.wav",
"narrator_enabled": False,
"narrator_voice_gen": "male_01.wav",
"text_not_inside": "character",
"language": idioma,
"output_file_name": output_path.stem,
"output_file_timestamp": False,
"autoplay": False,
"autoplay_volume": 0.8
}
response = requests.post(
f"{ALLTALK_URL}/api/tts-generate",
json=payload,
timeout=60
)
response.raise_for_status()
# El audio se guarda automáticamente por AllTalk
# Verificar que existe
if output_path.exists():
logger.info(f"Audio generado: {output_path}")
return True
else:
logger.error(f"Audio no encontrado después de generación: {output_path}")
return False
except Exception as e:
logger.error(f"Error generating TTS audio: {e}")
return False
def generar_subtitulos(noticias, output_path):
"""
Genera archivo SRT de subtítulos.
"""
try:
with open(output_path, 'w', encoding='utf-8') as f:
timestamp = 0
for i, noticia in enumerate(noticias, 1):
titulo = noticia['titulo_trad'] or noticia['titulo']
resumen = noticia['resumen_trad'] or ''
# Estimar duración basada en longitud de texto (aprox 150 palabras/min)
palabras = len((titulo + " " + resumen).split())
duracion = max(5, palabras / 2.5) # segundos
# Formatear timestamp SRT
start_time = timestamp
end_time = timestamp + duracion
f.write(f"{i}\n")
f.write(f"{format_srt_time(start_time)} --> {format_srt_time(end_time)}\n")
f.write(f"{titulo}\n\n")
timestamp = end_time
logger.info(f"Subtítulos generados: {output_path}")
return True
except Exception as e:
logger.error(f"Error generating subtitles: {e}")
return False
def format_srt_time(seconds):
"""Formatea segundos a formato SRT (HH:MM:SS,mmm)."""
hours = int(seconds // 3600)
minutes = int((seconds % 3600) // 60)
secs = int(seconds % 60)
millis = int((seconds % 1) * 1000)
return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"
def procesar_parrilla(parrilla_id):
"""
Procesa una parrilla y genera el video.
"""
logger.info(f"Procesando parrilla {parrilla_id}")
with get_conn() as conn:
with conn.cursor(cursor_factory=extras.DictCursor) as cur:
# Obtener configuración de parrilla
cur.execute("SELECT * FROM video_parrillas WHERE id = %s", (parrilla_id,))
parrilla = cur.fetchone()
if not parrilla or not parrilla['activo']:
logger.warning(f"Parrilla {parrilla_id} no encontrada o inactiva")
return False
# Obtener noticias
noticias = obtener_noticias_parrilla(parrilla, conn)
if not noticias:
logger.warning(f"No hay noticias disponibles para parrilla {parrilla_id}")
return False
logger.info(f"Encontradas {len(noticias)} noticias para el video")
# Crear registro de video
cur.execute("""
INSERT INTO video_generados (
parrilla_id, titulo, descripcion, status, num_noticias
) VALUES (
%s, %s, %s, 'processing', %s
) RETURNING id
""", (
parrilla_id,
f"{parrilla['nombre']} - {datetime.now().strftime('%Y-%m-%d')}",
f"Noticias de {parrilla['nombre']}",
len(noticias)
))
video_id = cur.fetchone()[0]
conn.commit()
# Preparar directorios
video_dir = OUTPUT_DIR / str(video_id)
video_dir.mkdir(exist_ok=True, parents=True)
# --- SETUP LOGGING FOR THIS VIDEO ---
log_file = video_dir / "generation.log"
file_handler = logging.FileHandler(log_file, mode='w')
file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
logger.addHandler(file_handler)
try:
logger.info(f"Iniciando generación de video {video_id}")
logger.info(f"Directorio: {video_dir}")
# Generar script de narración
logger.info("Generando guion narrativo...")
script_parts = []
script_parts.append(f"Hola, bienvenidos a {parrilla['nombre']}.")
script_parts.append(f"Estas son las noticias más importantes de hoy, {datetime.now().strftime('%d de %B de %Y')}.")
for i, noticia in enumerate(noticias, 1):
titulo = noticia['titulo_trad'] or noticia['titulo']
resumen = noticia['resumen_trad'] or ''
script_parts.append(f"Noticia número {i}.")
script_parts.append(titulo)
if resumen:
script_parts.append(resumen[:500]) # Limitar longitud
script_parts.append("") # Pausa
script_parts.append("Esto ha sido todo por hoy. Gracias por su atención.")
full_script = "\n".join(script_parts)
# Guardar script
script_path = video_dir / "script.txt"
with open(script_path, 'w', encoding='utf-8') as f:
f.write(full_script)
# Generar audio
logger.info(f"Generando audio TTS con AllTalk en: {ALLTALK_URL}")
audio_path = video_dir / "audio.wav"
if not generar_audio_tts(full_script, audio_path, parrilla['idioma_voz']):
raise Exception(f"Fallo al generar audio TTS en {ALLTALK_URL}")
# Generar subtítulos
if parrilla['include_subtitles']:
logger.info("Generando subtítulos SRT...")
subtitles_path = video_dir / "subtitles.srt"
generar_subtitulos(noticias, subtitles_path)
else:
subtitles_path = None
# Registrar noticias en el video
for i, noticia in enumerate(noticias, 1):
cur.execute("""
INSERT INTO video_noticias (
video_id, noticia_id, traduccion_id, orden
) VALUES (%s, %s, %s, %s)
""", (video_id, noticia['id'], noticia['traduccion_id'], i))
# Actualizar registro de video
cur.execute("""
UPDATE video_generados
SET status = 'completed',
audio_path = %s,
subtitles_path = %s,
noticias_ids = %s
WHERE id = %s
""", (
str(audio_path),
str(subtitles_path) if subtitles_path else None,
[n['id'] for n in noticias],
video_id
))
# Actualizar parrilla
cur.execute("""
UPDATE video_parrillas
SET ultima_generacion = NOW()
WHERE id = %s
""", (parrilla_id,))
conn.commit()
logger.info(f"Video {video_id} generado exitosamente")
# Cleanup handler
logger.removeHandler(file_handler)
file_handler.close()
return True
except Exception as e:
logger.error(f"Error processing video: {e}", exc_info=True)
# Revertir la transacción abortada antes de registrar el error
conn.rollback()
# Marcar como error
cur.execute("""
UPDATE video_generados
SET status = 'error',
error_message = %s
WHERE id = %s
""", (str(e), video_id))
conn.commit()
# Cleanup handler
logger.removeHandler(file_handler)
file_handler.close()
return False
def main():
"""
Función principal: procesa parrillas activas que necesitan generación.
"""
logger.info("Iniciando generador de videos de noticias")
with get_conn() as conn:
with conn.cursor(cursor_factory=extras.DictCursor) as cur:
# Buscar parrillas activas que necesitan generación
# Por ahora, procesar todas las activas manualmente
# TODO: Implementar lógica de programación automática
if len(sys.argv) > 1:
# Modo manual: procesar parrilla específica
parrilla_id = int(sys.argv[1])
procesar_parrilla(parrilla_id)
else:
# Modo batch: procesar todas las parrillas activas
cur.execute("""
SELECT id FROM video_parrillas
WHERE activo = true
AND frecuencia = 'daily'
AND (ultima_generacion IS NULL
OR ultima_generacion < NOW() - INTERVAL '1 day')
ORDER BY id
""")
parrillas = cur.fetchall()
logger.info(f"Encontradas {len(parrillas)} parrillas para procesar")
for p in parrillas:
try:
procesar_parrilla(p['id'])
except Exception as e:
logger.error(f"Error procesando parrilla {p['id']}: {e}")
continue
if __name__ == "__main__":
main()

190
generate_secure_credentials.sh Executable file
View file

@ -0,0 +1,190 @@
#!/bin/bash
# ==================================================================================
# Script de Generación de Credenciales Seguras
# ==================================================================================
#
# Este script genera credenciales aleatorias seguras para todos los servicios
# y crea un archivo .env con las configuraciones necesarias.
#
# Uso:
# ./generate_secure_credentials.sh
#
# El script creará:
# - .env.generated (con las credenciales nuevas)
# - .env.backup (backup de .env actual si existe)
#
# ==================================================================================
set -e # Exit on error
# Colores para output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color
echo -e "${GREEN}=================================="
echo -e "🔒 Generador de Credenciales Seguras"
echo -e "==================================${NC}\n"
# Verificar dependencias
command -v openssl >/dev/null 2>&1 || { echo -e "${RED}❌ Error: openssl no está instalado${NC}"; exit 1; }
command -v python3 >/dev/null 2>&1 || { echo -e "${RED}❌ Error: python3 no está instalado${NC}"; exit 1; }
# Backup del .env actual si existe
if [ -f .env ]; then
echo -e "${YELLOW}⚠️ Encontrado archivo .env existente${NC}"
BACKUP_FILE=".env.backup.$(date +%Y%m%d_%H%M%S)"
cp .env "$BACKUP_FILE"
echo -e "${GREEN}✅ Backup creado: $BACKUP_FILE${NC}\n"
fi
echo -e "${GREEN}🔑 Generando credenciales seguras...${NC}\n"
# Generar credenciales
POSTGRES_PASSWORD=$(openssl rand -base64 32 | tr -d "=+/" | cut -c1-32)
REDIS_PASSWORD=$(openssl rand -base64 32 | tr -d "=+/" | cut -c1-32)
SECRET_KEY=$(python3 -c "import secrets; print(secrets.token_hex(32))")
GRAFANA_PASSWORD=$(openssl rand -base64 24 | tr -d "=+/" | cut -c1-24)
# Mostrar credenciales generadas (para que el usuario las guarde)
echo -e "${YELLOW}⚠️ IMPORTANTE: Guarda estas credenciales en un lugar seguro${NC}\n"
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo -e "${GREEN}POSTGRES_PASSWORD:${NC} $POSTGRES_PASSWORD"
echo -e "${GREEN}REDIS_PASSWORD:${NC} $REDIS_PASSWORD"
echo -e "${GREEN}SECRET_KEY:${NC} $SECRET_KEY"
echo -e "${GREEN}GRAFANA_PASSWORD:${NC} $GRAFANA_PASSWORD"
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo ""
# Crear archivo .env.generated
ENV_FILE=".env.generated"
cat > "$ENV_FILE" << EOF
# ==================================================================================
# CONFIGURACIÓN SEGURA - Generado automáticamente
# Fecha: $(date +"%Y-%m-%d %H:%M:%S")
# ==================================================================================
#
# IMPORTANTE:
# - NO compartas este archivo
# - Guarda las credenciales en un gestor de contraseñas
# - Añade .env al .gitignore
#
# ==================================================================================
# ==================================================================================
# DATABASE CONFIGURATION - PostgreSQL
# ==================================================================================
POSTGRES_DB=rss
POSTGRES_USER=rss
POSTGRES_PASSWORD=$POSTGRES_PASSWORD
DB_NAME=rss
DB_USER=rss
DB_PASS=$POSTGRES_PASSWORD
DB_HOST=db
DB_PORT=5432
DB_WRITE_HOST=db
DB_READ_HOST=db-replica
# ==================================================================================
# REDIS CONFIGURATION - Con autenticación
# ==================================================================================
REDIS_HOST=redis
REDIS_PORT=6379
REDIS_PASSWORD=$REDIS_PASSWORD
# ==================================================================================
# APPLICATION SECRETS
# ==================================================================================
SECRET_KEY=$SECRET_KEY
# ==================================================================================
# MONITORING - Grafana
# ==================================================================================
GRAFANA_PASSWORD=$GRAFANA_PASSWORD
# ==================================================================================
# EXTERNAL SERVICES
# ==================================================================================
ALLTALK_URL=http://host.docker.internal:7851
# ==================================================================================
# AI MODELS & WORKERS
# ==================================================================================
RSS_MAX_WORKERS=3
TARGET_LANGS=es
TRANSLATOR_BATCH=128
ENQUEUE=300
# RSS Ingestor Configuration
RSS_POKE_INTERVAL_MIN=15
RSS_MAX_FAILURES=10
RSS_FEED_TIMEOUT=60
# URL Feed Discovery Worker
URL_DISCOVERY_INTERVAL_MIN=15
URL_DISCOVERY_BATCH_SIZE=10
MAX_FEEDS_PER_URL=5
# CTranslate2 / AI Model Paths
CT2_MODEL_PATH=/app/models/nllb-ct2
CT2_DEVICE=cuda
CT2_COMPUTE_TYPE=int8_float16
UNIVERSAL_MODEL=facebook/nllb-200-distilled-600M
# Embeddings
EMB_MODEL=sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2
EMB_BATCH=64
EMB_DEVICE=cuda
# NER
NER_LANG=es
NER_BATCH=64
# Flask / Gunicorn
GUNICORN_WORKERS=8
FLASK_DEBUG=0
# Qdrant Configuration
QDRANT_HOST=qdrant
QDRANT_PORT=6333
QDRANT_COLLECTION_NAME=news_vectors
QDRANT_BATCH_SIZE=100
QDRANT_SLEEP_IDLE=30
EOF
echo -e "${GREEN}✅ Archivo generado: $ENV_FILE${NC}\n"
# Preguntar si quiere reemplazar .env
echo -e "${YELLOW}¿Deseas reemplazar el archivo .env actual con el generado?${NC}"
echo -e "${YELLOW}(Recomendado: revisa $ENV_FILE primero)${NC}"
read -p "¿Continuar? (s/N): " -n 1 -r
echo
if [[ $REPLY =~ ^[SsYy]$ ]]; then
mv "$ENV_FILE" .env
echo -e "${GREEN}✅ Archivo .env actualizado${NC}"
else
echo -e "${YELLOW}⚠️ Archivo guardado como: $ENV_FILE${NC}"
echo -e "${YELLOW} Para usarlo: mv $ENV_FILE .env${NC}"
fi
echo ""
echo -e "${GREEN}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
echo -e "${GREEN}✅ ¡Credenciales generadas exitosamente!${NC}"
echo -e "${GREEN}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
echo ""
echo -e "${YELLOW}📋 PRÓXIMOS PASOS:${NC}"
echo ""
echo -e " 1. Revisa las credenciales generadas arriba"
echo -e " 2. Guárdalas en un gestor de contraseñas seguro"
echo -e " 3. Migra a docker-compose.secure.yml:"
echo -e " ${GREEN}cp docker-compose.secure.yml docker-compose.yml${NC}"
echo -e " 4. Haz backup de tus datos (ver SECURITY_GUIDE.md)"
echo -e " 5. Reinicia los servicios:"
echo -e " ${GREEN}docker-compose down && docker-compose up -d${NC}"
echo -e " 6. Verifica que todo funciona correctamente"
echo ""
echo -e "${YELLOW}📖 Para más detalles, revisa: SECURITY_GUIDE.md${NC}"
echo ""

74
gunicorn_config.py Normal file
View file

@ -0,0 +1,74 @@
"""
Configuración de Gunicorn optimizada para alta capacidad de proceso
"""
import multiprocessing
import os
# Bind
bind = "0.0.0.0:8000"
# Workers
# Fórmula recomendada: (2 x $num_cores) + 1
workers = int(os.getenv("GUNICORN_WORKERS", multiprocessing.cpu_count() * 2 + 1))
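# Ejemplo: con 4 cores, 2*4+1 = 9 workers (salvo que GUNICORN_WORKERS lo fije explícitamente)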
# Worker class - sync como fallback si gevent no está disponible
# Para máximo rendimiento, cambiar a "gevent" después de instalar gevent
worker_class = "sync"
worker_connections = 1000
# Timeouts
timeout = 300 # 5 minutos para queries pesadas
graceful_timeout = 30
keepalive = 5
# Reiniciar workers después de N requests para prevenir memory leaks
max_requests = 0 # Desactivado para evitar matar hilos de backup
max_requests_jitter = 0
# Logging
accesslog = "-"
errorlog = "-"
loglevel = "info"
access_log_format = '%(h)s %(l)s %(u)s %(t)s "%(r)s" %(s)s %(b)s "%(f)s" "%(a)s" %(D)s'
# Preload app para compartir memoria entre workers
preload_app = True
# Threading
threads = 2
# Process naming
proc_name = "rss2_gunicorn"
# Server mechanics
daemon = False
pidfile = None
umask = 0
user = None
group = None
tmp_upload_dir = None
# SSL (no usado, NGINX maneja SSL)
keyfile = None
certfile = None
# Configuración de seguridad
limit_request_line = 4094
limit_request_fields = 100
limit_request_field_size = 8190
def on_starting(server):
"""Callback cuando el servidor arranca"""
server.log.info("Starting RSS2 Gunicorn server with %d workers", workers)
def on_reload(server):
"""Callback cuando el servidor recarga"""
server.log.info("Reloading RSS2 Gunicorn server")
def worker_int(worker):
"""Callback cuando un worker recibe SIGINT o SIGQUIT"""
worker.log.info("Worker received INT or QUIT signal")
def worker_abort(worker):
"""Callback cuando un worker es abortado"""
worker.log.info("Worker received SIGABRT signal")

53
init-replica/init-replica.sh Executable file
View file

@ -0,0 +1,53 @@
#!/bin/bash
# Initialization script for PostgreSQL streaming replica
# This script sets up the replica from the primary using pg_basebackup
set -e
PGDATA="${PGDATA:-/var/lib/postgresql/data/18/main}"
PRIMARY_HOST="${PRIMARY_HOST:-db}"
REPLICATION_USER="${REPLICATION_USER:-replicator}"
REPLICATION_PASSWORD="${REPLICATION_PASSWORD:-replica_password}"
echo "=== PostgreSQL Replica Initialization ==="
# Check if PGDATA already has data (replica already initialized)
if [ -f "$PGDATA/standby.signal" ]; then
echo "Replica already initialized (standby.signal exists). Skipping initialization."
exit 0
fi
if [ -f "$PGDATA/PG_VERSION" ]; then
echo "PGDATA already contains data. Checking if it's a replica..."
if [ -f "$PGDATA/standby.signal" ] || grep -q "primary_conninfo" "$PGDATA/postgresql.auto.conf" 2>/dev/null; then
echo "Already configured as replica. Skipping."
exit 0
else
echo "WARNING: PGDATA contains data but is NOT a replica."
echo "Cleaning up existing data to initialize replica..."
rm -rf "$PGDATA"/*
# Continue to basebackup
fi
fi
echo "Waiting for primary at $PRIMARY_HOST to be ready..."
until pg_isready -h "$PRIMARY_HOST" -p 5432 -U postgres; do
echo "Primary not ready yet. Waiting 2 seconds..."
sleep 2
done
echo "Primary is ready. Starting pg_basebackup..."
# Use pg_basebackup to copy data from primary
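# Flags: -Fp = plain (file-per-file) format, -Xs = stream WAL during the copy,
#        -P = report progress, -R = write standby.signal and append
#        primary_conninfo to postgresql.auto.conf automatically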
PGPASSWORD="$REPLICATION_PASSWORD" pg_basebackup \
-h "$PRIMARY_HOST" \
-p 5432 \
-U "$REPLICATION_USER" \
-D "$PGDATA" \
-Fp \
-Xs \
-P \
-R
echo "pg_basebackup complete. Replica initialized successfully."
echo "standby.signal and postgresql.auto.conf with primary_conninfo created."

119
migrate_to_secure.sh Executable file
View file

@ -0,0 +1,119 @@
#!/bin/bash
# ==================================================================================
# Script de Migración a Configuración Segura - TODO EN UNO
# ==================================================================================
set -e
# Colors
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
RED='\033[0;31m'
NC='\033[0m'
echo -e "${BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
echo -e "${BLUE}🔒 Migración a Configuración Segura - TODO EN UNO${NC}"
echo -e "${BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}\n"
echo -e "${YELLOW}⚠️ Este script hará lo siguiente:${NC}"
echo " 1. Detener los servicios actuales"
echo " 2. Iniciar con la configuración segura"
echo " 3. Verificar que todo funciona"
echo ""
echo -e "${YELLOW}📊 Tiempo estimado: 3-5 minutos${NC}\n"
read -p "¿Deseas continuar? (s/N): " -n 1 -r
echo
if [[ ! $REPLY =~ ^[SsYy]$ ]]; then
echo -e "${RED}❌ Operación cancelada${NC}"
exit 1
fi
echo ""
echo -e "${GREEN}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
echo -e "${GREEN}PASO 1: Deteniendo servicios actuales...${NC}"
echo -e "${GREEN}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}\n"
docker-compose down
echo ""
echo -e "${GREEN}✅ Servicios detenidos${NC}\n"
echo -e "${GREEN}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
echo -e "${GREEN}PASO 2: Iniciando con configuración segura...${NC}"
echo -e "${GREEN}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}\n"
docker-compose up -d
echo ""
echo -e "${GREEN}✅ Servicios iniciados${NC}\n"
echo -e "${GREEN}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
echo -e "${GREEN}PASO 3: Esperando que los servicios se inicialicen...${NC}"
echo -e "${GREEN}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}\n"
echo -n "Esperando 30 segundos"
for i in {1..30}; do
echo -n "."
sleep 1
done
echo ""
echo ""
echo -e "${GREEN}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
echo -e "${GREEN}PASO 4: Verificando servicios...${NC}"
echo -e "${GREEN}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}\n"
docker-compose ps
echo ""
echo -e "${GREEN}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
echo -e "${GREEN}PASO 5: Ejecutando verificación de seguridad...${NC}"
echo -e "${GREEN}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}\n"
./verify_security.sh
echo ""
echo -e "${GREEN}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
echo -e "${GREEN}PASO 6: Verificando web app...${NC}"
echo -e "${GREEN}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}\n"
if curl -s http://localhost:8001 > /dev/null 2>&1; then
echo -e "${GREEN}✅ Web app responde correctamente${NC}"
else
echo -e "${RED}❌ Web app no responde - revisar logs:${NC}"
echo " docker-compose logs nginx"
echo " docker-compose logs rss2_web"
fi
echo ""
echo -e "${BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
echo -e "${BLUE}🎉 ¡Migración completada!${NC}"
echo -e "${BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}\n"
echo -e "${GREEN}✅ Tu sistema ahora está configurado de forma segura:${NC}\n"
echo " 🔒 Credenciales fuertes configuradas"
echo " 🌐 Redes segmentadas (frontend, backend, monitoring)"
echo " 🚪 Solo puerto 8001 expuesto públicamente"
echo " 🔐 Redis con autenticación"
echo " 📊 Límites de recursos configurados"
echo ""
echo -e "${YELLOW}📋 PRÓXIMOS PASOS:${NC}\n"
echo " 1. Verifica que puedes acceder a: http://localhost:8001"
echo " 2. Prueba búsqueda y funcionalidades principales"
echo " 3. Para Grafana (monitoring):"
echo " - Acceso local: http://localhost:3001"
echo " - Usuario: admin"
echo " - Password: Ver EJECUTAR_AHORA.md"
echo ""
echo -e "${YELLOW}📖 Documentación:${NC}"
echo " - EJECUTAR_AHORA.md → Instrucciones detalladas"
echo " - SECURITY_GUIDE.md → Guía completa de seguridad"
echo " - SECURITY_AUDIT.md → Resumen de auditoría"
echo ""
echo -e "${GREEN}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}\n"

3
models/__init__.py Normal file
View file

@ -0,0 +1,3 @@
# models/__init__.py
# Marks this directory as a Python package.

9
models/categorias.py Normal file
View file

@ -0,0 +1,9 @@
from psycopg2 import extras
from typing import List, Dict
def get_categorias(conn) -> List[Dict]:
with conn.cursor(cursor_factory=extras.DictCursor) as cur:
cur.execute("SELECT id, nombre FROM categorias ORDER BY nombre;")
return cur.fetchall()

337
models/describe.txt Normal file
View file

@ -0,0 +1,337 @@
models/
├── __init__.py        # Python package marker (empty)
├── categorias.py      # Category operations
├── feeds.py           # RSS feed operations
├── noticias.py        # News search and querying
├── paises.py          # Country operations
└── traducciones.py    # Translation operations
__init__.py
Purpose: Required so Python recognizes this directory as a package.
Content: Empty, or an explanatory comment.
Usage: Lets you import modules from models:
python
from models.noticias import buscar_noticias
categorias.py
Purpose: Handles every operation related to news categories.
Main functions:
get_categorias(conn) -> List[Dict]
Description: Returns all available categories in alphabetical order.
Parameters:
conn: Active PostgreSQL connection
SQL query:
sql
SELECT id, nombre FROM categorias ORDER BY nombre;
Returns: A list of dictionaries shaped like:
python
[
    {"id": 1, "nombre": "Política"},
    {"id": 2, "nombre": "Deportes"},
    ...
]
Typical use: Populating filter dropdowns in the web interface.
feeds.py
Purpose: Handles operations on RSS feeds.
Main functions:
get_feed_by_id(conn, feed_id: int) -> Optional[Dict]
Description: Fetches a specific feed by its ID.
Parameters:
conn: PostgreSQL connection
feed_id: Numeric feed ID
SQL query:
sql
SELECT * FROM feeds WHERE id = %s;
Returns: A dictionary with all of the feed's columns, or None if it does not exist.
get_feeds_activos(conn) -> List[Dict]
Description: Returns every feed that is active and not marked as down.
Criteria for "active":
activo = TRUE
fallos < 5 (or NULL)
SQL query:
sql
SELECT id, nombre, url, categoria_id, pais_id, fallos, activo
FROM feeds
WHERE activo = TRUE
  AND (fallos IS NULL OR fallos < 5)
ORDER BY id;
Returns: The list of active feeds for the RSS ingestor.
Critical use: rss_ingestor.py relies on this function to decide which feeds to process.
noticias.py
Purpose: The most complex module; it handles all news search and query operations.
Helper functions:
_extraer_tags_por_traduccion(cur, traduccion_ids: List[int]) -> Dict[int, List[tuple]]
Description: Private helper that returns tags grouped by translation ID.
Parameters:
cur: Database cursor
traduccion_ids: List of translation IDs
SQL query:
sql
SELECT tn.traduccion_id, tg.valor, tg.tipo
FROM tags_noticia tn
JOIN tags tg ON tg.id = tn.tag_id
WHERE tn.traduccion_id = ANY(%s);
Returns: A dictionary where:
Key: traduccion_id
Value: List of (tag_value, tag_type) tuples
Optimization: Avoids the N+1 problem when loading tags.
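An illustrative return value (the tag values are invented for the example):
python
{
    456: [("Madrid", "LOC"), ("ONU", "ORG")],
    789: [("OTAN", "ORG")]
}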
Main functions:
buscar_noticias(...) -> Tuple[List[Dict], int, int, Dict]
Description: Advanced search with multiple filters, pagination, and translations.
Parameters:
conn: PostgreSQL connection
page: Page number (1-based)
per_page: News items per page
q: Search term (optional)
categoria_id: Filter by category (optional)
continente_id: Filter by continent (optional)
pais_id: Filter by country (optional)
fecha: Filter by exact date, YYYY-MM-DD (optional)
lang: Target language for translations (default: "es")
use_tr: Include translations in the search (default: True)
Returns: A tuple with 4 elements:
noticias: List of news items with complete data
total_results: Total number of results (before pagination)
total_pages: Computed total page count
tags_por_tr: Dictionary of tags per translation
Search features:
Date filter: Exact date match
Geographic filter: Country or continent (hierarchical)
Category filter: Single selection
Text search:
PostgreSQL full-text search (websearch_to_tsquery)
ILIKE matching across several fields
Covers both original and translated fields
Pagination: Standard offset/limit
Translations: Conditional JOIN against the traducciones table
Optimization: Count and data fetch run within the same transaction
Main SQL query (simplified):
sql
-- Count total
SELECT COUNT(DISTINCT n.id)
FROM noticias n
-- joins with categorias, paises, traducciones
WHERE [dynamic conditions]
-- Fetch paginated data
SELECT
    n.id, n.titulo, n.resumen, n.url, n.fecha,
    n.imagen_url, n.fuente_nombre,
    c.nombre AS categoria,
    p.nombre AS pais,
    t.id AS traduccion_id,
    t.titulo_trad AS titulo_traducido,
    t.resumen_trad AS resumen_traducido,
    -- translation-available flag
    CASE WHEN t.id IS NOT NULL THEN TRUE ELSE FALSE END AS tiene_traduccion,
    -- original fields
    n.titulo AS titulo_original,
    n.resumen AS resumen_original
FROM noticias n
-- joins...
WHERE [dynamic conditions]
ORDER BY n.fecha DESC NULLS LAST, n.id DESC
LIMIT %s OFFSET %s
Fields returned per news item:
python
{
    "id": 123,
    "titulo": "Título original",
    "resumen": "Resumen original",
    "url": "https://ejemplo.com/noticia",
    "fecha": datetime(...),
    "imagen_url": "https://.../imagen.jpg",
    "fuente_nombre": "BBC News",
    "categoria": "Política",
    "pais": "España",
    "traduccion_id": 456,  # or None
    "titulo_traducido": "Título en español",
    "resumen_traducido": "Resumen en español",
    "tiene_traduccion": True,  # or False
    "titulo_original": "Original title",
    "resumen_original": "Original summary"
}
Use in the application: This function is the heart of the site's search and is called from the Flask blueprints.
paises.py
Purpose: Handles country-related operations.
Main functions:
get_paises(conn) -> List[Dict]
Description: Returns all countries in alphabetical order.
Parameters:
conn: PostgreSQL connection
SQL query:
sql
SELECT id, nombre FROM paises ORDER BY nombre;
Returns: A list of dictionaries with each country's id and nombre.
Typical use: Country filter dropdowns in the web interface.
traducciones.py
Purpose: Handles operations on specific translations.
Main functions:
get_traduccion(conn, traduccion_id: int) -> Optional[Dict]
Description: Fetches a specific translation by its ID.
Parameters:
conn: PostgreSQL connection
traduccion_id: Numeric translation ID
SQL query:
sql
SELECT * FROM traducciones WHERE id = %s;
Returns: A dictionary with all of the translation's columns, or None.
Included fields: id, noticia_id, lang_from, lang_to, titulo_trad, resumen_trad, status, error, created_at, etc.
Typical use: Translation detail pages or debugging.
Observed Design Patterns
1. Separation of Responsibilities
Each file handles one specific database entity
Query logic is kept separate from business logic
2. Consistent Interface
Every function takes conn as its first parameter
All return dictionaries (via DictCursor)
Descriptive, consistent naming
3. Query Optimization
_extraer_tags_por_traduccion avoids N+1 queries
COUNT and SELECT queries run in the same transaction
Implicit reliance on indexes for ORDER BY fecha DESC
4. Translation Handling
Conditional JOIN against the traducciones table
A tiene_traduccion flag for easy checks in the frontend
Original fields always available as a fallback
5. Security
Prepared parameters (%s) throughout
No direct string concatenation in SQL
Implicit type validation
Typical Data Flow
python
# In a Flask blueprint
from db import get_conn
from models.noticias import buscar_noticias
def ruta_buscar():
    conn = get_conn()
    try:
        noticias, total, paginas, tags = buscar_noticias(
            conn=conn,
            page=request.args.get('page', 1, type=int),
            per_page=20,
            q=request.args.get('q', ''),
            categoria_id=request.args.get('categoria_id'),
            pais_id=request.args.get('pais_id'),
            lang='es'
        )
        # Process results...
    finally:
        conn.close()
Dependencies and Relationships
Requirement: psycopg2.extras.DictCursor so results come back as dictionaries
Used by: Every blueprint under routers/
Database: Assumes a specific table layout (feeds, noticias, traducciones, etc.)
Required indexes: To keep searches fast, indexes are recommended on the following (a DDL sketch follows this list):
noticias(fecha DESC, id DESC)
traducciones(noticia_id, lang_to, status)
feeds(activo, fallos)
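A minimal DDL sketch for those recommendations (the index names are illustrative, not part of the schema):
sql
-- Supports ORDER BY fecha DESC NULLS LAST, id DESC
CREATE INDEX IF NOT EXISTS idx_noticias_fecha_id ON noticias (fecha DESC, id DESC);
-- Supports the conditional translation JOIN
CREATE INDEX IF NOT EXISTS idx_traducciones_lookup ON traducciones (noticia_id, lang_to, status);
-- Supports the get_feeds_activos() filter
CREATE INDEX IF NOT EXISTS idx_feeds_activos ON feeds (activo, fallos);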

24
models/feeds.py Normal file
View file

@ -0,0 +1,24 @@
from psycopg2 import extras
from typing import List, Dict, Optional
def get_feed_by_id(conn, feed_id: int) -> Optional[Dict]:
with conn.cursor(cursor_factory=extras.DictCursor) as cur:
cur.execute("SELECT * FROM feeds WHERE id = %s;", (feed_id,))
return cur.fetchone()
def get_feeds_activos(conn) -> List[Dict]:
"""Feeds activos y no caídos, usados por el ingestor RSS."""
with conn.cursor(cursor_factory=extras.DictCursor) as cur:
cur.execute(
"""
SELECT id, nombre, url, categoria_id, pais_id, fallos, activo
FROM feeds
WHERE activo = TRUE
AND (fallos IS NULL OR fallos < 5)
ORDER BY id;
"""
)
return cur.fetchall()

282
models/noticias.py Normal file
View file

@ -0,0 +1,282 @@
from psycopg2 import extras
from typing import List, Dict, Optional, Tuple, Any
import os
import torch
from sentence_transformers import SentenceTransformer
def _extraer_tags_por_traduccion(cur, traduccion_ids: List[int]) -> Dict[int, List[tuple]]:
"""Obtiene tags agrupados por traducción."""
tags_por_tr = {}
if not traduccion_ids:
return tags_por_tr
cur.execute(
"""
SELECT tn.traduccion_id, tg.valor, tg.tipo
FROM tags_noticia tn
JOIN tags tg ON tg.id = tn.tag_id
WHERE tn.traduccion_id = ANY(%s);
""",
(traduccion_ids,),
)
rows = cur.fetchall()
for tr_id, valor, tipo in rows:
tags_por_tr.setdefault(tr_id, []).append((valor, tipo))
return tags_por_tr
def buscar_noticias(
conn,
page: int,
per_page: int,
q: str = "",
categoria_id: Optional[str] = None,
continente_id: Optional[str] = None,
pais_id: Optional[str] = None,
fecha: Optional[str] = None,
lang: str = "es",
use_tr: bool = True,
skip_count: bool = False,
) -> Tuple[List[Dict], int, int, Dict]:
"""
Búsqueda avanzada de noticias con filtros:
- fecha
- país / continente
- categoría
- búsqueda fulltext + ILIKE
- traducciones
- paginación
"""
offset = (page - 1) * per_page
where = ["1=1"]
params = []
    # Exact-date filter
    if fecha:
        where.append("n.fecha::date = %s")
        params.append(fecha)
    # Category
    if categoria_id:
        where.append("n.categoria_id = %s")
        params.append(int(categoria_id))
    # Country or continent
    if pais_id:
        where.append("n.pais_id = %s")
        params.append(int(pais_id))
    elif continente_id:
        where.append("p.continente_id = %s")
        params.append(int(continente_id))
    # Text search
if q:
search_like = f"%{q}%"
if use_tr:
where.append(
"""
(
n.tsv @@ websearch_to_tsquery('spanish', %s)
OR t.titulo_trad ILIKE %s
OR t.resumen_trad ILIKE %s
OR n.titulo ILIKE %s
OR n.resumen ILIKE %s
)
"""
)
params.extend([q, search_like, search_like, search_like, search_like])
else:
where.append(
"""
(
n.tsv @@ websearch_to_tsquery('spanish', %s)
OR n.titulo ILIKE %s
OR n.resumen ILIKE %s
)
"""
)
params.extend([q, search_like, search_like])
where_sql = " AND ".join(where)
with conn.cursor(cursor_factory=extras.DictCursor) as cur:
        # =====================================================================
        # TOTAL RESULT COUNT (OPTIMIZED)
        # =====================================================================
total_results = 0
total_pages = 0
if not skip_count:
            # With no text search and no other filters, use the planner's fast row estimate
if not q and not categoria_id and not pais_id and not continente_id and not fecha:
cur.execute("SELECT reltuples::bigint FROM pg_class WHERE relname = 'noticias'")
row = cur.fetchone()
total_results = row[0] if row else 0
            else:
                # Exact count when filters are present (needed for filtered pagination)
cur.execute(
f"""
SELECT COUNT(n.id)
FROM noticias n
LEFT JOIN categorias c ON c.id = n.categoria_id
LEFT JOIN paises p ON p.id = n.pais_id
LEFT JOIN traducciones t
ON t.noticia_id = n.id
AND t.lang_to = %s
AND t.status = 'done'
WHERE {where_sql}
""",
[lang] + params,
)
total_results = cur.fetchone()[0]
total_pages = (total_results // per_page) + (1 if total_results % per_page else 0)
        # =====================================================================
        # PAGINATED NEWS LIST
        # =====================================================================
cur.execute(
f"""
SELECT
n.id,
n.titulo,
n.resumen,
n.url,
n.fecha,
n.imagen_url,
n.fuente_nombre,
c.nombre AS categoria,
p.nombre AS pais,
                -- translations
                t.id AS traduccion_id,
                t.titulo_trad AS titulo_traducido,
                t.resumen_trad AS resumen_traducido,
                CASE WHEN t.id IS NOT NULL THEN TRUE ELSE FALSE END AS tiene_traduccion,
                -- originals
n.titulo AS titulo_original,
n.resumen AS resumen_original
FROM noticias n
LEFT JOIN categorias c ON c.id = n.categoria_id
LEFT JOIN paises p ON p.id = n.pais_id
LEFT JOIN traducciones t
ON t.noticia_id = n.id
AND t.lang_to = %s
AND t.status = 'done'
WHERE {where_sql}
ORDER BY n.fecha DESC NULLS LAST, n.id DESC
LIMIT %s OFFSET %s
""",
[lang] + params + [per_page, offset],
)
noticias = cur.fetchall()
        # =====================================================================
        # TAGS PER TRANSLATION
        # =====================================================================
tr_ids = [n["traduccion_id"] for n in noticias if n["traduccion_id"]]
tags_por_tr = _extraer_tags_por_traduccion(cur, tr_ids)
return noticias, total_results, total_pages, tags_por_tr
# Model cache so the embedding model is not reloaded on every request
_model_cache = {}
def _get_emb_model():
model_name = os.environ.get("EMB_MODEL", "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
if model_name not in _model_cache:
device = "cuda" if torch.cuda.is_available() else "cpu"
_model_cache[model_name] = SentenceTransformer(model_name, device=device)
return _model_cache[model_name], model_name
def buscar_noticias_semantica(
conn,
page: int,
per_page: int,
q: str,
categoria_id: Optional[str] = None,
continente_id: Optional[str] = None,
pais_id: Optional[str] = None,
fecha: Optional[str] = None,
lang: str = "es",
) -> Tuple[List[Dict], int, int, Dict]:
"""
Búsqueda semántica usando embeddings y similitud coseno (vía producto punto si están normalizados).
"""
if not q.strip():
return buscar_noticias(conn, page, per_page, "", categoria_id, continente_id, pais_id, fecha, lang)
offset = (page - 1) * per_page
model, model_name = _get_emb_model()
    # Encode the query (normalized embeddings make the dot product equal cosine similarity)
q_emb = model.encode([q], normalize_embeddings=True)[0].tolist()
where = ["t.status = 'done'", "t.lang_to = %s"]
params = [lang]
if fecha:
where.append("n.fecha::date = %s")
params.append(fecha)
if categoria_id:
where.append("n.categoria_id = %s")
params.append(int(categoria_id))
if pais_id:
where.append("n.pais_id = %s")
params.append(int(pais_id))
elif continente_id:
where.append("p.continente_id = %s")
params.append(int(continente_id))
where_sql = " AND ".join(where)
with conn.cursor(cursor_factory=extras.DictCursor) as cur:
        # Vector search query: dot product between double precision arrays via unnest
        # (assumes embeddings are stored as float arrays; pgvector is not required here)
query_sql = f"""
WITH similarity AS (
SELECT
te.traduccion_id,
(
SELECT SUM(a*b)
FROM unnest(te.embedding, %s::double precision[]) AS t(a,b)
) AS score
FROM traduccion_embeddings te
WHERE te.model = %s
)
SELECT
n.id, n.titulo, n.resumen, n.url, n.fecha, n.imagen_url, n.fuente_nombre,
c.nombre AS categoria, p.nombre AS pais,
t.id AS traduccion_id, t.titulo_trad AS titulo_traducido, t.resumen_trad AS resumen_traducido,
TRUE AS tiene_traduccion, s.score
FROM similarity s
JOIN traducciones t ON t.id = s.traduccion_id
JOIN noticias n ON n.id = t.noticia_id
LEFT JOIN categorias c ON c.id = n.categoria_id
LEFT JOIN paises p ON p.id = n.pais_id
WHERE {where_sql}
            ORDER BY s.score DESC, n.fecha DESC NULLS LAST  -- rank by similarity; recency breaks ties
LIMIT %s OFFSET %s
"""
        # For the semantic total count we can reuse the same WHERE clause
cur.execute(f"SELECT COUNT(*) FROM traducciones t JOIN noticias n ON n.id = t.noticia_id LEFT JOIN paises p ON p.id = n.pais_id WHERE {where_sql}", params)
total_results = cur.fetchone()[0]
total_pages = (total_results // per_page) + (1 if total_results % per_page else 0)
cur.execute(query_sql, [q_emb, model_name] + params + [per_page, offset])
noticias = cur.fetchall()
tr_ids = [n["traduccion_id"] for n in noticias]
tags_por_tr = _extraer_tags_por_traduccion(cur, tr_ids)
return noticias, total_results, total_pages, tags_por_tr

9
models/paises.py Normal file
View file

@ -0,0 +1,9 @@
from typing import List, Dict
from psycopg2 import extras
def get_paises(conn) -> List[Dict]:
with conn.cursor(cursor_factory=extras.DictCursor) as cur:
cur.execute("SELECT id, nombre FROM paises ORDER BY nombre;")
return cur.fetchall()

16
models/traducciones.py Normal file
View file

@ -0,0 +1,16 @@
from psycopg2 import extras
from typing import Optional, Dict
def get_traduccion(conn, traduccion_id: int) -> Optional[Dict]:
with conn.cursor(cursor_factory=extras.DictCursor) as cur:
cur.execute(
"""
SELECT *
FROM traducciones
WHERE id = %s;
""",
(traduccion_id,),
)
return cur.fetchone()

21
monitoring/prometheus.yml Normal file
View file

@ -0,0 +1,21 @@
global:
scrape_interval: 15s
scrape_configs:
- job_name: 'prometheus'
static_configs:
- targets: ['localhost:9090']
- job_name: 'cadvisor'
static_configs:
- targets: ['cadvisor:8080']
# If we had Node Exporter (for host metrics):
# - job_name: 'node_exporter'
# static_configs:
# - targets: ['node-exporter:9100']
# If the app exposes metrics (e.g. Flask/Gunicorn with prometheus_client)
# - job_name: 'rss2_web'
# static_configs:
# - targets: ['rss2_web:8000']

112
nginx.conf Normal file
View file

@ -0,0 +1,112 @@
user nginx;
worker_processes auto;
error_log /var/log/nginx/error.log warn;
pid /var/run/nginx.pid;
events {
    worker_connections 4096;  # High concurrent-connection capacity
    use epoll;                # Best-performing event model on Linux
}
http {
include /etc/nginx/mime.types;
default_type application/octet-stream;
log_format main '$remote_addr - $remote_user [$time_local] "$request" '
'$status $body_bytes_sent "$http_referer" '
'"$http_user_agent" "$http_x_forwarded_for"';
access_log /var/log/nginx/access.log main;
    # Performance optimizations
sendfile on;
tcp_nopush on;
tcp_nodelay on;
keepalive_timeout 65;
types_hash_max_size 2048;
client_max_body_size 1000M;
    # gzip compression to reduce bandwidth
gzip on;
gzip_vary on;
gzip_proxied any;
gzip_comp_level 6;
gzip_types text/plain text/css text/xml text/javascript
application/json application/javascript application/xml+rss
application/atom+xml image/svg+xml;
gzip_disable "msie6";
    # Static file cache
    open_file_cache max=1000 inactive=20s;
    open_file_cache_valid 30s;
    open_file_cache_min_uses 2;
    open_file_cache_errors on;
    # Upstream configuration for Gunicorn
upstream gunicorn_backend {
server rss2_web:8000 max_fails=3 fail_timeout=30s;
keepalive 32;
}
server {
listen 80;
server_name _;
        # Site-specific logs
        access_log /var/log/nginx/rss2_access.log main;
        error_log /var/log/nginx/rss2_error.log warn;
        # Safety timeouts
client_body_timeout 60s;
client_header_timeout 60s;
send_timeout 300s;
# Servir archivos estáticos directamente desde NGINX
location /static/ {
alias /app/static/;
expires 7d;
add_header Cache-Control "public, immutable";
access_log off;
}
        # Proxy everything else to Gunicorn
        location / {
            proxy_pass http://gunicorn_backend;
            proxy_redirect off;
            # Required headers
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto $scheme;
            # Timeouts for slow queries
proxy_connect_timeout 60s;
proxy_send_timeout 300s;
proxy_read_timeout 300s;
# Buffering
proxy_buffering on;
proxy_buffer_size 4k;
proxy_buffers 8 4k;
proxy_busy_buffers_size 8k;
            # HTTP/1.1 for upstream keepalive
proxy_http_version 1.1;
proxy_set_header Connection "";
}
# Health check endpoint
location /health {
access_log off;
proxy_pass http://gunicorn_backend;
}
        # Block access to hidden/sensitive files
location ~ /\. {
deny all;
access_log off;
log_not_found off;
}
}
}

33
requirements.txt Executable file
View file

@ -0,0 +1,33 @@
Flask==2.3.3
feedparser==6.0.11
APScheduler==3.10.4
psycopg2-binary==2.9.10
bleach==6.1.0
gunicorn==22.0.0
gevent>=23.9.1
waitress==2.1.2
bcrypt>=4.1.0
email-validator>=2.1.0
tqdm>=4.66
beautifulsoup4>=4.12
requests>=2.31
newspaper3k==0.2.8
lxml[html_clean]>=4.9.3
langdetect==1.0.9
transformers==4.43.3
sentencepiece==0.2.0
sacremoses==0.1.1
accelerate==0.33.0
ctranslate2>=4.0.0
spacy>=3.7,<4.0
pgvector==0.2.5
sentence-transformers==3.0.1
numpy>=1.26
scikit-learn>=1.4
python-dotenv>=1.0
weasyprint==60.1
pydyf==0.10.0
redis>=5.0.0
qdrant-client==1.11.0
feedfinder2>=0.0.4

14
reset_and_deploy.sh Executable file
View file

@ -0,0 +1,14 @@
#!/bin/bash
echo "Stopping all containers..."
docker-compose down
echo "Removing data volumes..."
# Use sudo if necessary, or ensure current user has permissions
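# WARNING: this irreversibly deletes the PostgreSQL data (primary and replica),
# the Redis cache, and the Qdrant vector store.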
rm -rf pgdata pgdata-replica redis-data qdrant_storage
echo "Starting deployment from scratch..."
docker-compose up -d --build
echo "Deployment complete. Checking status..."
docker-compose ps

3
routers/__init__.py Normal file
View file

@ -0,0 +1,3 @@
# routers/__init__.py
# Required so Python treats this directory as a package.

267
routers/account.py Normal file
View file

@ -0,0 +1,267 @@
"""
Account management router - User profile and account settings.
"""
from flask import Blueprint, render_template, request, redirect, url_for, flash, jsonify
from psycopg2 import extras
from db import get_conn
from utils.auth import get_current_user, login_required, hash_password, verify_password, validate_password
from datetime import datetime
account_bp = Blueprint("account", __name__, url_prefix="/account")
@account_bp.route("/")
@login_required
def index():
"""User account dashboard."""
user = get_current_user()
if not user:
return redirect(url_for('auth.login'))
with get_conn() as conn:
with conn.cursor(cursor_factory=extras.DictCursor) as cur:
# Get favorites count
cur.execute("""
SELECT COUNT(*) as count
FROM favoritos
WHERE user_id = %s
""", (user['id'],))
favorites_count = cur.fetchone()['count']
# Get search history count
cur.execute("""
SELECT COUNT(*) as count
FROM search_history
WHERE user_id = %s
""", (user['id'],))
searches_count = cur.fetchone()['count']
# Get recent searches (last 10)
cur.execute("""
SELECT query, results_count, searched_at
FROM search_history
WHERE user_id = %s
ORDER BY searched_at DESC
LIMIT 10
""", (user['id'],))
recent_searches = cur.fetchall()
# Get recent favorites (last 5)
cur.execute("""
SELECT n.id, n.titulo, n.imagen_url, f.created_at,
t.titulo_trad, t.id AS traduccion_id
FROM favoritos f
JOIN noticias n ON n.id = f.noticia_id
LEFT JOIN traducciones t ON t.noticia_id = n.id AND t.lang_to = 'es' AND t.status = 'done'
WHERE f.user_id = %s
ORDER BY f.created_at DESC
LIMIT 5
""", (user['id'],))
recent_favorites = cur.fetchall()
return render_template("account.html",
user=user,
favorites_count=favorites_count,
searches_count=searches_count,
recent_searches=recent_searches,
recent_favorites=recent_favorites)
@account_bp.route("/search-history")
@login_required
def search_history():
"""Full search history page."""
user = get_current_user()
if not user:
return redirect(url_for('auth.login'))
page = max(1, int(request.args.get('page', 1)))
per_page = 50
offset = (page - 1) * per_page
with get_conn() as conn:
with conn.cursor(cursor_factory=extras.DictCursor) as cur:
# Get total count
cur.execute("""
SELECT COUNT(*) as count
FROM search_history
WHERE user_id = %s
""", (user['id'],))
total = cur.fetchone()['count']
# Get paginated results
cur.execute("""
SELECT query, results_count, searched_at
FROM search_history
WHERE user_id = %s
ORDER BY searched_at DESC
LIMIT %s OFFSET %s
""", (user['id'], per_page, offset))
searches = cur.fetchall()
total_pages = (total + per_page - 1) // per_page
return render_template("search_history.html",
user=user,
searches=searches,
page=page,
total_pages=total_pages,
total=total)
@account_bp.route("/change-password", methods=["POST"])
@login_required
def change_password():
"""Change user password."""
user = get_current_user()
if not user:
return redirect(url_for('auth.login'))
current_password = request.form.get("current_password", "")
new_password = request.form.get("new_password", "")
new_password_confirm = request.form.get("new_password_confirm", "")
# Validation
if not current_password or not new_password:
flash("Por favor completa todos los campos", "danger")
return redirect(url_for('account.index'))
valid_password, password_error = validate_password(new_password)
if not valid_password:
flash(password_error, "danger")
return redirect(url_for('account.index'))
if new_password != new_password_confirm:
flash("Las contraseñas nuevas no coinciden", "danger")
return redirect(url_for('account.index'))
try:
with get_conn() as conn:
with conn.cursor(cursor_factory=extras.DictCursor) as cur:
# Verify current password
cur.execute("""
SELECT password_hash
FROM usuarios
WHERE id = %s
""", (user['id'],))
result = cur.fetchone()
if not result or not verify_password(current_password, result['password_hash']):
flash("La contraseña actual es incorrecta", "danger")
return redirect(url_for('account.index'))
# Update password
new_hash = hash_password(new_password)
cur.execute("""
UPDATE usuarios
SET password_hash = %s, updated_at = NOW()
WHERE id = %s
""", (new_hash, user['id']))
conn.commit()
flash("Contraseña actualizada exitosamente", "success")
except Exception as e:
flash("Error al actualizar la contraseña", "danger")
return redirect(url_for('account.index'))
@account_bp.route("/upload-avatar", methods=["POST"])
@login_required
def upload_avatar():
"""Upload user avatar."""
import os
import secrets
from werkzeug.utils import secure_filename
from flask import current_app
user = get_current_user()
if not user:
return redirect(url_for('auth.login'))
if 'avatar' not in request.files:
flash("No se seleccionó ningún archivo", "danger")
return redirect(url_for('account.index'))
file = request.files['avatar']
if file.filename == '':
flash("No se seleccionó ningún archivo", "danger")
return redirect(url_for('account.index'))
if file:
# Check extension
allowed_extensions = {'.png', '.jpg', '.jpeg', '.gif', '.webp'}
_, ext = os.path.splitext(file.filename)
if ext.lower() not in allowed_extensions:
flash("Formato de imagen no permitido. Usa JPG, PNG, GIF o WEBP.", "danger")
return redirect(url_for('account.index'))
# Save file
try:
            # Create a filename from the user ID plus a random suffix to avoid caching issues
random_hex = secrets.token_hex(4)
filename = f"user_{user['id']}_{random_hex}{ext.lower()}"
# Ensure upload folder exists
upload_folder = os.path.join(current_app.root_path, 'static/uploads/avatars')
os.makedirs(upload_folder, exist_ok=True)
# Delete old avatar if exists
if user.get('avatar_url'):
old_path = os.path.join(current_app.root_path, user['avatar_url'].lstrip('/'))
if os.path.exists(old_path) and 'user_' in old_path: # Safety check
                    try:
                        os.remove(old_path)
                    except OSError:
                        pass
file_path = os.path.join(upload_folder, filename)
file.save(file_path)
# Update DB
relative_path = f"/static/uploads/avatars/{filename}"
with get_conn() as conn:
with conn.cursor() as cur:
cur.execute("""
UPDATE usuarios
SET avatar_url = %s, updated_at = NOW()
WHERE id = %s
""", (relative_path, user['id']))
conn.commit()
# Update session
from flask import session
session['avatar_url'] = relative_path
flash("Foto de perfil actualizada", "success")
except Exception as e:
print(f"Error uploading avatar: {e}")
flash("Error al subir la imagen", "danger")
return redirect(url_for('account.index'))
@account_bp.route("/stats")
@login_required
def stats():
"""Get user statistics as JSON."""
user = get_current_user()
if not user:
return jsonify({"error": "Not authenticated"}), 401
with get_conn() as conn:
with conn.cursor(cursor_factory=extras.DictCursor) as cur:
cur.execute("""
SELECT
(SELECT COUNT(*) FROM favoritos WHERE user_id = %s) as favorites_count,
(SELECT COUNT(*) FROM search_history WHERE user_id = %s) as searches_count,
(SELECT MAX(searched_at) FROM search_history WHERE user_id = %s) as last_search
""", (user['id'], user['id'], user['id']))
stats = cur.fetchone()
return jsonify({
"favorites_count": stats['favorites_count'],
"searches_count": stats['searches_count'],
"last_search": stats['last_search'].isoformat() if stats['last_search'] else None
})

203
routers/auth.py Normal file
View file

@ -0,0 +1,203 @@
"""
Authentication router - User registration, login, and logout.
"""
from flask import Blueprint, request, render_template, redirect, url_for, session, flash
from psycopg2 import extras, IntegrityError
from db import get_conn
from utils.auth import (
hash_password, verify_password, is_authenticated,
validate_username, validate_password, validate_email
)
from datetime import datetime
auth_bp = Blueprint("auth", __name__, url_prefix="/auth")
def migrate_anonymous_favorites(session_id: str, user_id: int):
"""Migrate anonymous favorites to user account.
Args:
session_id: Anonymous session ID
user_id: User ID to migrate favorites to
"""
if not session_id:
return
with get_conn() as conn:
with conn.cursor() as cur:
# Migrate favorites, avoiding duplicates
cur.execute("""
UPDATE favoritos
SET user_id = %s, session_id = NULL
WHERE session_id = %s
AND noticia_id NOT IN (
SELECT noticia_id FROM favoritos WHERE user_id = %s
)
""", (user_id, session_id, user_id))
# Delete any remaining duplicates
cur.execute("""
DELETE FROM favoritos
WHERE session_id = %s
""", (session_id,))
conn.commit()
# ============================================================
# Registration
# ============================================================
@auth_bp.route("/register", methods=["GET", "POST"])
def register():
"""User registration page and handler."""
if is_authenticated():
return redirect(url_for('account.index'))
if request.method == "POST":
username = request.form.get("username", "").strip()
email = request.form.get("email", "").strip().lower()
password = request.form.get("password", "")
password_confirm = request.form.get("password_confirm", "")
# Validation
valid_username, username_error = validate_username(username)
if not valid_username:
flash(username_error, "danger")
return render_template("register.html", username=username, email=email)
valid_email, email_error = validate_email(email)
if not valid_email:
flash(email_error, "danger")
return render_template("register.html", username=username, email=email)
valid_password, password_error = validate_password(password)
if not valid_password:
flash(password_error, "danger")
return render_template("register.html", username=username, email=email)
if password != password_confirm:
flash("Las contraseñas no coinciden", "danger")
return render_template("register.html", username=username, email=email)
# Create user
try:
password_hash = hash_password(password)
with get_conn() as conn:
with conn.cursor() as cur:
cur.execute("""
INSERT INTO usuarios (username, email, password_hash, last_login)
VALUES (%s, %s, %s, NOW())
RETURNING id
""", (username, email, password_hash))
user_id = cur.fetchone()[0]
conn.commit()
# Auto-login after registration
old_session_id = session.get('user_session')
session['user_id'] = user_id
session['username'] = username
# Migrate anonymous favorites if any
if old_session_id:
migrate_anonymous_favorites(old_session_id, user_id)
session.pop('user_session', None)
flash(f"¡Bienvenido {username}! Tu cuenta ha sido creada exitosamente.", "success")
return redirect(url_for('account.index'))
except IntegrityError as e:
if 'username' in str(e):
flash("Este nombre de usuario ya está en uso", "danger")
elif 'email' in str(e):
flash("Este email ya está registrado", "danger")
else:
flash("Error al crear la cuenta. Por favor intenta de nuevo.", "danger")
return render_template("register.html", username=username, email=email)
except Exception as e:
flash("Error al crear la cuenta. Por favor intenta de nuevo.", "danger")
return render_template("register.html", username=username, email=email)
return render_template("register.html")
# ============================================================
# Login
# ============================================================
@auth_bp.route("/login", methods=["GET", "POST"])
def login():
"""User login page and handler."""
if is_authenticated():
return redirect(url_for('account.index'))
if request.method == "POST":
username_or_email = request.form.get("username", "").strip()
password = request.form.get("password", "")
if not username_or_email or not password:
flash("Por favor ingresa tu usuario/email y contraseña", "danger")
return render_template("login.html", username=username_or_email)
try:
with get_conn() as conn:
with conn.cursor(cursor_factory=extras.DictCursor) as cur:
# Try login with username or email
cur.execute("""
SELECT id, username, email, password_hash, is_active, avatar_url
FROM usuarios
WHERE (username = %s OR email = %s) AND is_active = TRUE
""", (username_or_email, username_or_email.lower()))
user = cur.fetchone()
if not user:
flash("Usuario o contraseña incorrectos", "danger")
return render_template("login.html", username=username_or_email)
if not verify_password(password, user['password_hash']):
flash("Usuario o contraseña incorrectos", "danger")
return render_template("login.html", username=username_or_email)
# Update last login
cur.execute("""
UPDATE usuarios SET last_login = NOW() WHERE id = %s
""", (user['id'],))
conn.commit()
# Create session
old_session_id = session.get('user_session')
session['user_id'] = user['id']
session['username'] = user['username']
session['avatar_url'] = user.get('avatar_url')
# Migrate anonymous favorites
if old_session_id:
migrate_anonymous_favorites(old_session_id, user['id'])
session.pop('user_session', None)
flash(f"¡Bienvenido de vuelta, {user['username']}!", "success")
# Redirect to 'next' parameter if exists
next_page = request.args.get('next')
if next_page and next_page.startswith('/'):
return redirect(next_page)
return redirect(url_for('account.index'))
except Exception as e:
flash("Error al iniciar sesión. Por favor intenta de nuevo.", "danger")
return render_template("login.html", username=username_or_email)
return render_template("login.html")
# ============================================================
# Logout
# ============================================================
@auth_bp.route("/logout", methods=["POST", "GET"])
def logout():
"""Log out the current user."""
username = session.get('username', 'Usuario')
session.clear()
flash(f"Hasta luego, {username}. Has cerrado sesión exitosamente.", "info")
return redirect(url_for('home.index'))

353
routers/backup.py Normal file
View file

@ -0,0 +1,353 @@
from flask import Blueprint, send_file, render_template, request, flash, redirect, url_for
import csv
import io
from psycopg2 import extras
from db import get_conn
backup_bp = Blueprint("backup", __name__)
# ============================================================
# EXPORT FEEDS → CSV
# ============================================================
@backup_bp.route("/backup_feeds")
def backup_feeds():
with get_conn() as conn, conn.cursor(cursor_factory=extras.DictCursor) as cur:
cur.execute("""
SELECT f.id, f.nombre, f.descripcion, f.url,
f.categoria_id, c.nombre AS categoria,
f.pais_id, p.nombre AS pais,
f.idioma, f.activo, f.fallos
FROM feeds f
LEFT JOIN categorias c ON c.id=f.categoria_id
LEFT JOIN paises p ON p.id=f.pais_id
ORDER BY f.id;
""")
rows = cur.fetchall()
output = io.StringIO()
writer = csv.writer(output)
writer.writerow([
"id", "nombre", "descripcion", "url",
"categoria_id", "categoria",
"pais_id", "pais",
"idioma", "activo", "fallos"
])
for r in rows:
writer.writerow([
r["id"],
r["nombre"],
r["descripcion"] or "",
r["url"],
r["categoria_id"] or "",
r["categoria"] or "",
r["pais_id"] or "",
r["pais"] or "",
r["idioma"] or "",
r["activo"],
r["fallos"],
])
output.seek(0)
return send_file(
io.BytesIO(output.getvalue().encode("utf-8")),
mimetype="text/csv",
as_attachment=True,
download_name="feeds_backup.csv",
)
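# Usage sketch (assumes the stack's default public port 8001):
#   curl -OJ http://localhost:8001/backup_feeds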
# ============================================================
# EXPORT FILTERED FEEDS → CSV
# ============================================================
@backup_bp.route("/export_feeds_filtered")
def export_feeds_filtered():
    """Export feeds with optional filters (country, category, status)."""
    pais_id = request.args.get("pais_id")
    categoria_id = request.args.get("categoria_id")
    estado = request.args.get("estado") or ""
    # Build the WHERE filters (same logic as list_feeds)
where = []
params = []
if pais_id:
where.append("f.pais_id = %s")
params.append(int(pais_id))
if categoria_id:
where.append("f.categoria_id = %s")
params.append(int(categoria_id))
if estado == "activos":
where.append("f.activo = TRUE")
elif estado == "inactivos":
where.append("f.activo = FALSE")
elif estado == "errores":
where.append("COALESCE(f.fallos, 0) > 0")
where_sql = "WHERE " + " AND ".join(where) if where else ""
    # SQL query with the filters applied
with get_conn() as conn, conn.cursor(cursor_factory=extras.DictCursor) as cur:
cur.execute(f"""
SELECT f.id, f.nombre, f.descripcion, f.url,
f.categoria_id, c.nombre AS categoria,
f.pais_id, p.nombre AS pais,
f.idioma, f.activo, f.fallos
FROM feeds f
LEFT JOIN categorias c ON c.id=f.categoria_id
LEFT JOIN paises p ON p.id=f.pais_id
{where_sql}
ORDER BY p.nombre NULLS LAST, c.nombre NULLS LAST, f.nombre;
""", params)
rows = cur.fetchall()
        # Fetch display names for the output filename
pais_nombre = None
categoria_nombre = None
if pais_id:
cur.execute("SELECT nombre FROM paises WHERE id = %s", (int(pais_id),))
result = cur.fetchone()
if result:
pais_nombre = result["nombre"]
if categoria_id:
cur.execute("SELECT nombre FROM categorias WHERE id = %s", (int(categoria_id),))
result = cur.fetchone()
if result:
categoria_nombre = result["nombre"]
    # Generate the CSV
output = io.StringIO()
writer = csv.writer(output)
writer.writerow([
"id", "nombre", "descripcion", "url",
"categoria_id", "categoria",
"pais_id", "pais",
"idioma", "activo", "fallos"
])
for r in rows:
writer.writerow([
r["id"],
r["nombre"],
r["descripcion"] or "",
r["url"],
r["categoria_id"] or "",
r["categoria"] or "",
r["pais_id"] or "",
r["pais"] or "",
r["idioma"] or "",
r["activo"],
r["fallos"],
])
    # Build a dynamic filename
    filename_parts = ["feeds"]
    if pais_nombre:
        # Sanitize the country name for use in a filename
clean_pais = pais_nombre.lower().replace(" ", "_").replace("/", "_")
filename_parts.append(clean_pais)
if categoria_nombre:
clean_cat = categoria_nombre.lower().replace(" ", "_").replace("/", "_")
filename_parts.append(clean_cat)
if estado:
filename_parts.append(estado)
filename = "_".join(filename_parts) + ".csv"
output.seek(0)
return send_file(
io.BytesIO(output.getvalue().encode("utf-8")),
mimetype="text/csv",
as_attachment=True,
download_name=filename,
)
# ============================================================
# RESTORE FEEDS FROM CSV
# ============================================================
@backup_bp.route("/restore_feeds", methods=["GET", "POST"])
def restore_feeds():
if request.method == "GET":
return render_template("restore_feeds.html")
file = request.files.get("file")
if not file:
flash("Debes seleccionar un archivo CSV.", "error")
return redirect(url_for("backup.restore_feeds"))
    # 1) Read the CSV
try:
raw = file.read().decode("utf-8-sig").replace("\ufeff", "")
reader = csv.DictReader(io.StringIO(raw))
except Exception as e:
flash(f"Error al procesar CSV: {e}", "error")
return redirect(url_for("backup.restore_feeds"))
expected_fields = [
"id", "nombre", "descripcion", "url",
"categoria_id", "categoria",
"pais_id", "pais",
"idioma", "activo", "fallos"
]
if reader.fieldnames != expected_fields:
flash("El CSV no tiene el encabezado correcto.", "error")
return redirect(url_for("backup.restore_feeds"))
    # Counters
imported = 0
skipped = 0
failed = 0
with get_conn() as conn:
with conn.cursor() as cur:
            # Table truncation removed on purpose so existing feeds are preserved
            # cur.execute("TRUNCATE feeds RESTART IDENTITY CASCADE;")
            for row in reader:
                # General cleanup (strip whitespace and stray "ç" characters)
                row = {k: (v.strip().rstrip("ç") if isinstance(v, str) else v) for k, v in row.items()}
                # Minimal validation
if not row["url"] or not row["nombre"]:
skipped += 1
continue
try:
# Creating a savepoint to isolate this row's transaction
cur.execute("SAVEPOINT row_savepoint")
                    # Normalize values
categoria_id = int(row["categoria_id"]) if row["categoria_id"] else None
pais_id = int(row["pais_id"]) if row["pais_id"] else None
idioma = (row["idioma"] or "").lower().strip()
idioma = idioma[:2] if idioma else None
activo = str(row["activo"]).lower() in ("true", "1", "t", "yes", "y")
fallos = int(row["fallos"] or 0)
                    # Check whether a feed with this URL already exists
                    cur.execute("SELECT id FROM feeds WHERE url = %s", (row["url"],))
                    existing_feed = cur.fetchone()
                    if existing_feed:
                        # URL exists -> UPDATE the existing feed
cur.execute("""
UPDATE feeds SET
nombre=%s,
descripcion=%s,
categoria_id=%s,
pais_id=%s,
idioma=%s,
activo=%s,
fallos=%s
WHERE id=%s
""", (
row["nombre"],
row["descripcion"] or None,
categoria_id,
pais_id,
idioma,
activo,
fallos,
existing_feed[0]
))
else:
                        # URL not found -> INSERT a new feed (ignore the CSV ID; let auto-increment assign one)
cur.execute("""
INSERT INTO feeds (nombre, descripcion, url, categoria_id, pais_id, idioma, activo, fallos)
VALUES (%s,%s,%s,%s,%s,%s,%s,%s)
""", (
row["nombre"],
row["descripcion"] or None,
row["url"],
categoria_id,
pais_id,
idioma,
activo,
fallos
))
cur.execute("RELEASE SAVEPOINT row_savepoint")
imported += 1
except Exception as e:
# If any error happens, rollback to the savepoint so the main transaction isn't aborted
cur.execute("ROLLBACK TO SAVEPOINT row_savepoint")
failed += 1
continue
# No need to reset sequence - auto-increment handles it
conn.commit()
flash(
f"Restauración completada. "
f"Importados: {imported} | Saltados: {skipped} | Fallidos: {failed}",
"success"
)
return redirect(url_for("feeds.list_feeds"))
# ============================================================
# EXPORT METADATA (COUNTRIES / CATEGORIES)
# ============================================================
@backup_bp.route("/export_paises")
def export_paises():
"""Exportar listado de países a CSV."""
with get_conn() as conn, conn.cursor() as cur:
cur.execute("SELECT id, nombre FROM paises ORDER BY id;")
rows = cur.fetchall()
output = io.StringIO()
writer = csv.writer(output)
writer.writerow(["id", "nombre"])
for r in rows:
writer.writerow([r[0], r[1]])
output.seek(0)
return send_file(
io.BytesIO(output.getvalue().encode("utf-8")),
mimetype="text/csv",
as_attachment=True,
download_name="paises.csv",
)
@backup_bp.route("/export_categorias")
def export_categorias():
"""Exportar listado de categorías a CSV."""
with get_conn() as conn, conn.cursor() as cur:
cur.execute("SELECT id, nombre FROM categorias ORDER BY id;")
rows = cur.fetchall()
output = io.StringIO()
writer = csv.writer(output)
writer.writerow(["id", "nombre"])
for r in rows:
writer.writerow([r[0], r[1]])
output.seek(0)
return send_file(
io.BytesIO(output.getvalue().encode("utf-8")),
mimetype="text/csv",
as_attachment=True,
download_name="categorias.csv",
)

216
routers/config.py Normal file
View file

@ -0,0 +1,216 @@
from flask import Blueprint, render_template, request, redirect, url_for, flash, Response, stream_with_context
from datetime import datetime
import json
import zipfile
import io
from db import get_conn
from psycopg2 import extras
config_bp = Blueprint("config", __name__, url_prefix="/config")
@config_bp.route("/")
def config_home():
return render_template("config.html")
import tempfile
import os
import shutil
import threading
import uuid
import time
from flask import send_file, jsonify
from cache import cache_set, cache_get
# Global dictionary to store temporary file paths (optional, but Redis is safer for clustered env)
# Since we are in a single-server Docker setup, a global dict is fine for paths if we don't restart.
# But for absolute safety, we'll store paths in Redis too.
BACKUP_TASKS = {}
@config_bp.route("/backup/start")
def backup_start():
task_id = str(uuid.uuid4())
cache_set(f"backup_status:{task_id}", {"progress": 0, "total": 0, "status": "initializing"})
# Start thread
thread = threading.Thread(target=_backup_worker, args=(task_id,))
thread.daemon = True
thread.start()
return jsonify({"task_id": task_id})
@config_bp.route("/backup/status/<task_id>")
def backup_status(task_id):
status = cache_get(f"backup_status:{task_id}")
if not status:
return jsonify({"error": "Task not found"}), 404
return jsonify(status)
@config_bp.route("/backup/download/<task_id>")
def backup_download(task_id):
status = cache_get(f"backup_status:{task_id}")
if not status or status.get("status") != "completed":
return "Archivo no listo o expirado", 404
file_path = status.get("file_path")
if not file_path or not os.path.exists(file_path):
return "Archivo no encontrado", 404
filename = f"backup_noticias_{datetime.now().strftime('%Y%m%d_%H%M%S')}.zip"
return send_file(file_path, as_attachment=True, download_name=filename)
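# Typical client flow across these three endpoints (sketch):
#   1. GET /config/backup/start              -> {"task_id": "..."}
#   2. GET /config/backup/status/<task_id>   -> poll until status == "completed"
#   3. GET /config/backup/download/<task_id> -> streams the finished ZIP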
def _backup_worker(task_id):
"""Background thread to generate the backup ZIP with direct streaming."""
print(f"[BACKUP {task_id}] Inicia proceso...")
try:
tmp_dir = tempfile.mkdtemp()
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
zip_path = os.path.join(tmp_dir, f"backup_{timestamp}.zip")
from db import get_read_conn # Use replica for large reads
with get_read_conn() as conn:
# 1. Count totals for progress
print(f"[BACKUP {task_id}] Contando registros...")
with conn.cursor() as cur:
cur.execute("SELECT count(*) FROM noticias")
total_n = cur.fetchone()[0]
cur.execute("SELECT count(*) FROM traducciones WHERE status = 'done'")
total_t = cur.fetchone()[0]
total_total = total_n + total_t
print(f"[BACKUP {task_id}] Total registros: {total_total}")
cache_set(f"backup_status:{task_id}", {"progress": 0, "total": total_total, "status": "processing"})
processed = 0
with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
# --- NOTICIAS ---
print(f"[BACKUP {task_id}] Exportando noticias...")
with zf.open("noticias.jsonl", "w") as bf:
# Wrap binary file for text writing
with io.TextIOWrapper(bf, encoding='utf-8') as f:
with conn.cursor(name=f'bak_n_{task_id}', cursor_factory=extras.DictCursor) as cur:
cur.itersize = 2000
cur.execute("SELECT id, titulo, resumen, url, fecha, imagen_url, fuente_nombre, categoria_id, pais_id FROM noticias")
for row in cur:
item = dict(row)
if item.get("fecha"): item["fecha"] = item["fecha"].isoformat()
f.write(json.dumps(item, ensure_ascii=False) + "\n")
processed += 1
if processed % 2000 == 0:
cache_set(f"backup_status:{task_id}", {"progress": processed, "total": total_total, "status": "processing"})
# --- TRADUCCIONES ---
print(f"[BACKUP {task_id}] Exportando traducciones...")
with zf.open("traducciones.jsonl", "w") as bf:
with io.TextIOWrapper(bf, encoding='utf-8') as f:
with conn.cursor(name=f'bak_t_{task_id}', cursor_factory=extras.DictCursor) as cur:
cur.itersize = 2000
cur.execute("SELECT id, noticia_id, lang_from, lang_to, titulo_trad, resumen_trad, status, created_at FROM traducciones WHERE status = 'done'")
for row in cur:
item = dict(row)
if item.get("created_at"): item["created_at"] = item["created_at"].isoformat()
f.write(json.dumps(item, ensure_ascii=False) + "\n")
processed += 1
if processed % 2000 == 0:
cache_set(f"backup_status:{task_id}", {"progress": processed, "total": total_total, "status": "processing"})
print(f"[BACKUP {task_id}] Finalizado con éxito: {zip_path}")
cache_set(f"backup_status:{task_id}", {
"progress": total_total,
"total": total_total,
"status": "completed",
"file_path": zip_path
}, ttl_seconds=3600)
except Exception as e:
import traceback
error_msg = traceback.format_exc()
print(f"[BACKUP {task_id}] ERROR: {error_msg}")
cache_set(f"backup_status:{task_id}", {"status": "error", "error": str(e)})
@config_bp.route("/restore/noticias", methods=["GET", "POST"])
def restore_noticias():
    # Keep the current restore logic; progress reporting could be added here too.
    # For now, the focus is on fixing the client's immediate backup download issue.
if request.method == "GET":
return render_template("config_restore.html")
file = request.files.get("file")
if not file:
flash("Debes seleccionar un archivo ZIP.", "error")
return redirect(url_for("config.restore_noticias"))
if not file.filename.endswith(".zip"):
flash("El formato debe ser .zip", "error")
return redirect(url_for("config.restore_noticias"))
imported_n = 0
imported_t = 0
tmp_zip = tempfile.NamedTemporaryFile(delete=False, suffix=".zip")
file.save(tmp_zip.name)
tmp_zip.close()
try:
with zipfile.ZipFile(tmp_zip.name, "r") as zf:
if "noticias.jsonl" in zf.namelist():
with zf.open("noticias.jsonl") as f:
chunk = []
for line in f:
chunk.append(json.loads(line.decode("utf-8")))
if len(chunk) >= 500:
_import_noticias_chunk(chunk)
imported_n += len(chunk)
chunk = []
if chunk:
_import_noticias_chunk(chunk)
imported_n += len(chunk)
if "traducciones.jsonl" in zf.namelist():
with zf.open("traducciones.jsonl") as f:
chunk = []
for line in f:
chunk.append(json.loads(line.decode("utf-8")))
if len(chunk) >= 500:
_import_traducciones_chunk(chunk)
imported_t += len(chunk)
chunk = []
if chunk:
_import_traducciones_chunk(chunk)
imported_t += len(chunk)
finally:
if os.path.exists(tmp_zip.name):
os.remove(tmp_zip.name)
flash(f"Restauración completada: {imported_n} noticias, {imported_t} traducciones.", "success")
return redirect(url_for("config.config_home"))
def _import_noticias_chunk(chunk):
with get_conn() as conn:
with conn.cursor() as cur:
cur.executemany("""
INSERT INTO noticias (id, titulo, resumen, url, fecha, imagen_url, fuente_nombre, categoria_id, pais_id)
VALUES (%(id)s, %(titulo)s, %(resumen)s, %(url)s, %(fecha)s, %(imagen_url)s, %(fuente_nombre)s, %(categoria_id)s, %(pais_id)s)
ON CONFLICT (id) DO UPDATE SET
titulo = EXCLUDED.titulo,
resumen = EXCLUDED.resumen
""", chunk)
conn.commit()
def _import_traducciones_chunk(chunk):
with get_conn() as conn:
with conn.cursor() as cur:
cur.executemany("""
INSERT INTO traducciones (id, noticia_id, lang_from, lang_to, titulo_trad, resumen_trad, status, created_at)
VALUES (%(id)s, %(noticia_id)s, %(lang_from)s, %(lang_to)s, %(titulo_trad)s, %(resumen_trad)s, %(status)s, %(created_at)s)
ON CONFLICT (id) DO UPDATE SET
titulo_trad = EXCLUDED.titulo_trad,
resumen_trad = EXCLUDED.resumen_trad
""", chunk)
conn.commit()
@config_bp.route("/translator")
def translator_config():
return "Pagina de configuracion del modelo (pendiente de implementar)"

141
routers/conflicts.py Normal file
View file

@ -0,0 +1,141 @@
from flask import Blueprint, render_template, request, flash, redirect, url_for
from db import get_conn, get_read_conn
import psycopg2.extras
from utils.qdrant_search import search_by_keywords
conflicts_bp = Blueprint("conflicts", __name__, url_prefix="/conflicts")
def ensure_table(conn):
with conn.cursor() as cur:
cur.execute("""
CREATE TABLE IF NOT EXISTS conflicts (
id SERIAL PRIMARY KEY,
name VARCHAR(100) NOT NULL,
keywords TEXT,
description TEXT,
created_at TIMESTAMP DEFAULT NOW()
);
""")
conn.commit()
@conflicts_bp.route("/")
def index():
with get_conn() as conn:
ensure_table(conn)
with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
cur.execute("SELECT * FROM conflicts ORDER BY id DESC")
conflicts = cur.fetchall()
return render_template("conflicts_list.html", conflicts=conflicts)
@conflicts_bp.route("/create", methods=["POST"])
def create():
name = request.form.get("name")
keywords = request.form.get("keywords")
description = request.form.get("description", "")
with get_conn() as conn:
with conn.cursor() as cur:
cur.execute(
"INSERT INTO conflicts (name, keywords, description) VALUES (%s, %s, %s)",
(name, keywords, description)
)
conn.commit()
flash("Conflicto creado correctamente.", "success")
return redirect(url_for("conflicts.index"))
@conflicts_bp.route("/<int:id>")
def timeline(id):
with get_conn() as conn:
with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
cur.execute("SELECT * FROM conflicts WHERE id = %s", (id,))
conflict = cur.fetchone()
if not conflict:
flash("Conflicto no encontrado.", "error")
return redirect(url_for("conflicts.index"))
# Keywords logic: comma separated
kw_raw = conflict['keywords'] or ""
kw_list = [k.strip() for k in kw_raw.split(',') if k.strip()]
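# e.g. keywords = "Ucrania, Rusia, Donbás" -> kw_list = ["Ucrania", "Rusia", "Donbás"]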
noticias = []
if kw_list:
try:
# Usar búsqueda semántica por keywords (mucho más rápido y efectivo)
semantic_results = search_by_keywords(
keywords=kw_list,
limit=200,
score_threshold=0.35
)
# Enriquecer con datos de PostgreSQL
if semantic_results:
news_ids = [r['news_id'] for r in semantic_results]
with get_read_conn() as conn:
with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
cur.execute("""
SELECT
t.id AS tr_id,
t.lang_to,
COALESCE(t.titulo_trad, n.titulo) as titulo,
COALESCE(t.resumen_trad, n.resumen) as resumen,
n.id AS noticia_id,
n.fecha,
n.imagen_url,
n.fuente_nombre,
p.nombre as pais
FROM noticias n
LEFT JOIN traducciones t ON n.id = t.noticia_id AND t.lang_to = 'es'
LEFT JOIN paises p ON p.id = n.pais_id
WHERE n.id = ANY(%s)
ORDER BY n.fecha DESC
""", (news_ids,))
noticias = cur.fetchall()
except Exception as e:
print(f"⚠️ Error en búsqueda semántica de conflictos, usando fallback: {e}")
# Fallback a búsqueda tradicional ILIKE
patterns = [f"%{k}%" for k in kw_list]
with get_read_conn() as conn:
with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
cur.execute("""
SELECT
t.id AS tr_id,
t.lang_to,
COALESCE(t.titulo_trad, n.titulo) as titulo,
COALESCE(t.resumen_trad, n.resumen) as resumen,
n.id AS noticia_id,
n.fecha,
n.imagen_url,
n.fuente_nombre,
p.nombre as pais
FROM noticias n
LEFT JOIN traducciones t ON n.id = t.noticia_id AND t.lang_to = 'es'
LEFT JOIN paises p ON p.id = n.pais_id
WHERE
(t.titulo_trad ILIKE ANY(%s) OR n.titulo ILIKE ANY(%s))
OR
(t.resumen_trad ILIKE ANY(%s) OR n.resumen ILIKE ANY(%s))
ORDER BY n.fecha DESC
LIMIT 200
""", (patterns, patterns, patterns, patterns))
noticias = cur.fetchall()
return render_template("conflict_timeline.html", conflict=conflict, noticias=noticias)
@conflicts_bp.route("/delete/<int:id>", methods=["POST"])
def delete(id):
with get_conn() as conn:
with conn.cursor() as cur:
cur.execute("DELETE FROM conflicts WHERE id = %s", (id,))
conn.commit()
flash("Conflicto eliminado.", "success")
return redirect(url_for("conflicts.index"))

494
routers/describe.txt Normal file
View file

@ -0,0 +1,494 @@
routers/
├── __init__.py # Paquete Python (vacío)
├── home.py # Página principal y búsqueda de noticias
├── feeds.py # Gestión de feeds RSS
├── urls.py # Gestión de fuentes de URL
├── noticia.py # Página de detalle de noticia
├── eventos.py # Visualización de eventos por país
└── backup.py # Importación/exportación de feeds
__init__.py
Propósito: Archivo necesario para que Python reconozca este directorio como un paquete.
Contenido: Vacío o comentario explicativo.
Uso: Permite importar blueprints desde routers:
python
from routers.home import home_bp
home.py
Propósito: Blueprint para la página principal y búsqueda de noticias.
Ruta base: / y /home
Blueprints definidos:
home_bp = Blueprint("home", __name__)
Rutas:
@home_bp.route("/") y @home_bp.route("/home")
Método: GET
Descripción: Página principal con sistema de búsqueda avanzada.
Parámetros de consulta soportados:
page: Número de página (default: 1)
per_page: Resultados por página (default: 20, range: 10-100)
q: Término de búsqueda
categoria_id: Filtrar por categoría
continente_id: Filtrar por continente
pais_id: Filtrar por país
fecha: Filtrar por fecha (YYYY-MM-DD)
lang: Idioma para mostrar (default: "es")
orig: Si está presente, mostrar sólo originales sin traducciones
Funcionalidades:
Paginación: Sistema robusto con límites
Búsqueda avanzada: Usa models.noticias.buscar_noticias()
Soporte AJAX: Si X-Requested-With: XMLHttpRequest, retorna solo _noticias_list.html
Filtros combinados: Todos los filtros pueden usarse simultáneamente
Manejo de fechas: Conversión segura de strings a date
Variables de contexto para template:
noticias: Lista de noticias con datos completos
total_results: Total de resultados
total_pages: Total de páginas
categorias, paises: Para dropdowns de filtros
tags_por_tr: Diccionario de tags por traducción
Templates utilizados:
noticias.html: Página completa (HTML)
_noticias_list.html: Fragmento para AJAX (solo lista de noticias)
Características especiales:
use_tr = not bool(request.args.get("orig")): Controla si mostrar traducciones
lang = (request.args.get("lang") or DEFAULT_TRANSLATION_LANG or DEFAULT_LANG).lower()[:5]: Manejo seguro de idioma
feeds.py
Propósito: Blueprint para la gestión completa de feeds RSS.
Ruta base: /feeds
Blueprints definidos:
feeds_bp = Blueprint("feeds", __name__, url_prefix="/feeds")
Rutas:
@feeds_bp.route("/") - list_feeds()
Método: GET
Descripción: Listado paginado de feeds con filtros avanzados.
Parámetros de filtro:
pais_id: Filtrar por país
categoria_id: Filtrar por categoría
estado: "activos", "inactivos", "errores" o vacío para todos
Características:
Paginación (50 feeds por página)
Contador de totales
Ordenamiento: país → categoría → nombre
@feeds_bp.route("/add", methods=["GET", "POST"]) - add_feed()
Método: GET y POST
Descripción: Formulario para añadir nuevo feed.
Campos del formulario:
nombre: Nombre del feed (requerido)
descripcion: Descripción opcional
url: URL del feed RSS (requerido)
categoria_id: Categoría (select dropdown)
pais_id: País (select dropdown)
idioma: Código de idioma (2 letras, opcional)
Validaciones:
idioma se normaliza a minúsculas y máximo 2 caracteres
Campos opcionales convertidos a None si vacíos
@feeds_bp.route("/<int:feed_id>/edit", methods=["GET", "POST"]) - edit_feed(feed_id)
Método: GET y POST
Descripción: Editar feed existente.
Funcionalidades:
Pre-carga datos actuales del feed
Mismo formulario que add_feed pero con datos existentes
Campo adicional: activo (checkbox)
@feeds_bp.route("/<int:feed_id>/delete") - delete_feed(feed_id)
Método: GET
Descripción: Eliminar feed por ID.
Nota: DELETE simple sin confirmación en frontend (depende de template).
@feeds_bp.route("/<int:feed_id>/reactivar") - reactivar_feed(feed_id)
Método: GET
Descripción: Reactivar feed que tiene fallos.
Acción: Establece activo=TRUE y fallos=0.
Templates utilizados:
feeds_list.html: Listado principal
add_feed.html: Formulario de añadir
edit_feed.html: Formulario de editar
urls.py
Propósito: Blueprint para gestión de fuentes de URL (no feeds RSS).
Ruta base: /urls
Blueprints definidos:
urls_bp = Blueprint("urls", __name__, url_prefix="/urls")
Rutas:
@urls_bp.route("/") - manage_urls()
Método: GET
Descripción: Lista todas las fuentes de URL registradas.
Datos mostrados: ID, nombre, URL, categoría, país, idioma.
@urls_bp.route("/add_source", methods=["GET", "POST"]) - add_url_source()
Método: GET y POST
Descripción: Añadir/actualizar fuente de URL.
Características únicas:
Usa ON CONFLICT (url) DO UPDATE: Si la URL ya existe, actualiza
idioma default: "es" si no se especifica
Mismos campos que feeds pero para URLs individuales
Templates utilizados:
urls_list.html: Listado
add_url_source.html: Formulario
noticia.py
Propósito: Blueprint para página de detalle de noticia individual.
Ruta base: /noticia
Blueprints definidos:
noticia_bp = Blueprint("noticia", __name__)
Rutas:
@noticia_bp.route("/noticia") - noticia()
Método: GET
Descripción: Muestra detalle completo de una noticia.
Parámetros de consulta:
tr_id: ID de traducción (prioritario)
id: ID de noticia original (si no hay tr_id)
Flujo de datos:
Si hay tr_id: Obtiene datos combinados de traducción y noticia original
Si solo hay id: Obtiene solo datos originales
Si no hay ninguno: Redirige a home con mensaje de error
Datos obtenidos:
Información básica: título, resumen, URL, fecha, imagen, fuente
Datos de traducción (si aplica): idiomas, títulos/resúmenes traducidos
Metadatos: categoría, país
Tags: Etiquetas asociadas a la traducción
Noticias relacionadas: Hasta 8, ordenadas por score de similitud
Consultas adicionales (solo si hay traducción):
Tags: SELECT tg.valor, tg.tipo FROM tags_noticia...
Noticias relacionadas: SELECT n2.url, n2.titulo... FROM related_noticias...
Templates utilizados:
noticia.html: Página de detalle completa
eventos.py
Propósito: Blueprint para visualización de eventos agrupados por país.
Ruta base: /eventos_pais
Blueprints definidos:
eventos_bp = Blueprint("eventos", __name__, url_prefix="/eventos_pais")
Rutas:
@eventos_bp.route("/") - eventos_pais()
Método: GET
Descripción: Lista eventos (clusters de noticias) filtrados por país.
Parámetros de consulta:
pais_id: ID del país (obligatorio para ver eventos)
page: Número de página (default: 1)
lang: Idioma para traducciones (default: "es")
Funcionalidades:
Lista de países: Siempre visible para selección
Eventos paginados: 30 por página
Noticias por evento: Agrupadas bajo cada evento
Datos completos: Cada noticia con originales y traducidos
Estructura de datos:
Países: Lista completa para dropdown
Eventos: Paginados, con título, fechas, conteo de noticias
Noticias por evento: Diccionario {evento_id: [noticias...]}
Consultas complejas:
Agrupación con GROUP BY y MAX(p.nombre)
JOIN múltiple: eventos ↔ traducciones ↔ noticias ↔ países
Subconsulta para noticias por evento usando ANY(%s)
Variables de contexto:
paises, eventos, noticias_por_evento
pais_nombre: Nombre del país seleccionado
total_eventos, total_pages, page, lang
Templates utilizados:
eventos_pais.html: Página principal
backup.py
Propósito: Blueprint para importación y exportación de feeds en CSV.
Ruta base: /backup_feeds y /restore_feeds
Blueprints definidos:
backup_bp = Blueprint("backup", __name__)
Rutas:
@backup_bp.route("/backup_feeds") - backup_feeds()
Método: GET
Descripción: Exporta todos los feeds a CSV.
Características:
Incluye joins con categorías y países para nombres legibles
Codificación UTF-8 con BOM
Nombre de archivo: feeds_backup.csv
Usa io.StringIO y io.BytesIO para evitar archivos temporales
Campos exportados:
Todos los campos de feeds más nombres de categoría y país
@backup_bp.route("/restore_feeds", methods=["GET", "POST"]) - restore_feeds()
Método: GET y POST
Descripción: Restaura feeds desde CSV (reemplazo completo).
Flujo de restauración:
GET: Muestra formulario de subida
POST:
Valida archivo y encabezados CSV
TRUNCATE feeds RESTART IDENTITY CASCADE: Borra todo antes de importar
Procesa cada fila con validación
Estadísticas: importados, saltados, fallidos
Validaciones:
Encabezados exactos esperados
URL y nombre no vacíos
Conversión segura de tipos (int, bool)
Normalización de idioma (2 caracteres minúsculas)
Limpieza de datos:
python
row = {k: (v.strip().rstrip("ç") if v else "") for k, v in row.items()}
Manejo de booleanos:
python
activo = str(row["activo"]).lower() in ("true", "1", "t", "yes", "y")
Templates utilizados:
restore_feeds.html: Formulario de subida
Patrones de Diseño Comunes
1. Estructura de Blueprints
python
# Definición estándar
bp = Blueprint("nombre", __name__, url_prefix="/ruta")
# Registro en app.py
app.register_blueprint(bp)
2. Manejo de Conexiones a BD
python
with get_conn() as conn:
# Usar conn para múltiples operaciones
# conn.autocommit = True si es necesario
3. Paginación Consistente
python
page = max(int(request.args.get("page", 1)), 1)
per_page = 50 # o variable
offset = (page - 1) * per_page
4. Manejo de Parámetros de Filtro
python
where = []
params = []
if pais_id:
where.append("f.pais_id = %s")
params.append(int(pais_id))
where_sql = "WHERE " + " AND ".join(where) if where else ""
5. Flash Messages
python
flash("Operación exitosa", "success")
flash("Error: algo salió mal", "error")
6. Redirecciones
python
return redirect(url_for("blueprint.funcion"))
7. Manejo de Formularios
python
if request.method == "POST":
# Procesar datos
return redirect(...)
# GET: mostrar formulario
return render_template("form.html", datos=...)
Seguridad y Validaciones
1. SQL Injection
Todos los parámetros usan %s con psycopg2
No hay concatenación de strings en SQL
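Ejemplo mínimo del patrón (tabla y valor ilustrativos):
python
# Correcto: psycopg2 escapa el parámetro
cur.execute("SELECT id FROM feeds WHERE url = %s", (url,))
# Incorrecto: vulnerable a inyección SQL
# cur.execute(f"SELECT id FROM feeds WHERE url = '{url}'")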
2. Validación de Entrada
Conversión segura a int: int(valor) if valor else None
Limpieza de strings: .strip(), normalización
Rangos: min(max(per_page, 10), 100)
3. Manejo de Archivos
Validación de tipo de contenido
Decodificación UTF-8 con manejo de BOM
Uso de io para evitar archivos temporales
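Boceto mínimo del patrón de CSV en memoria (campos ilustrativos):
python
import csv, io
buf = io.StringIO()
writer = csv.writer(buf)
writer.writerow(["id", "nombre", "url"])
writer.writerow([1, "Feed demo", "https://example.com/rss"])
# BOM UTF-8 para que Excel detecte la codificación correctamente
data = io.BytesIO(("\ufeff" + buf.getvalue()).encode("utf-8"))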
Optimizaciones
1. JOINs Eficientes
LEFT JOIN para datos opcionales
GROUP BY cuando es necesario
Uso de índices implícitos en ORDER BY
2. Batch Operations
TRUNCATE ... RESTART IDENTITY más rápido que DELETE
Inserción fila por fila con validación
3. Manejo de Memoria
io.StringIO para CSV en memoria
Cursors con DictCursor para acceso por nombre
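Ejemplo del acceso por nombre con DictCursor:
python
from psycopg2 import extras
with get_conn() as conn, conn.cursor(cursor_factory=extras.DictCursor) as cur:
    cur.execute("SELECT id, nombre FROM paises ORDER BY nombre")
    fila = cur.fetchone()
    print(fila["nombre"])  # acceso por nombre de columna, no por índice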
Dependencias entre Blueprints
text
home.py
└── usa: models.noticias.buscar_noticias()
└── usa: _extraer_tags_por_traduccion()
feeds.py
└── usa: models.categorias.get_categorias()
└── usa: models.paises.get_paises()
urls.py
└── usa: models.categorias.get_categorias()
└── usa: models.paises.get_paises()
noticia.py
└── consultas directas (no usa models/)
eventos.py
└── consultas directas (no usa models/)
backup.py
└── consultas directas (no usa models/)

203
routers/favoritos.py Normal file
View file

@ -0,0 +1,203 @@
"""
Favorites router - Save and manage favorite news.
"""
from flask import Blueprint, request, jsonify, session, render_template
from psycopg2 import extras
from db import get_read_conn, get_write_conn
from utils.auth import get_current_user, is_authenticated
import secrets
favoritos_bp = Blueprint("favoritos", __name__, url_prefix="/favoritos")
def get_user_or_session_id():
"""Get user ID if authenticated, otherwise session ID.
Returns:
Tuple of (user_id, session_id)
"""
user = get_current_user()
if user:
return (user['id'], None)
# Anonymous user - use session_id
if "user_session" not in session:
session["user_session"] = secrets.token_hex(16)
return (None, session["user_session"])
def ensure_favoritos_table(conn):
"""Create/update favoritos table to support both users and sessions."""
with conn.cursor() as cur:
# Table is created by init-db scripts, just ensure it exists
cur.execute("""
CREATE TABLE IF NOT EXISTS favoritos (
id SERIAL PRIMARY KEY,
user_id INTEGER REFERENCES usuarios(id) ON DELETE CASCADE,
session_id VARCHAR(64),
noticia_id VARCHAR(32) REFERENCES noticias(id) ON DELETE CASCADE,
created_at TIMESTAMP DEFAULT NOW()
);
""")
cur.execute("CREATE INDEX IF NOT EXISTS idx_favoritos_session ON favoritos(session_id);")
cur.execute("CREATE INDEX IF NOT EXISTS idx_favoritos_user_id ON favoritos(user_id);")
# Commit the CREATE TABLE / CREATE INDEX statements first, so a failed
# ALTER below cannot roll them back along with it.
conn.commit()
# Ensure session_id can be null (for logged in users)
try:
cur.execute("ALTER TABLE favoritos ALTER COLUMN session_id DROP NOT NULL;")
except Exception:
conn.rollback()
else:
conn.commit()
# ============================================================
# API: Toggle Favorite
# ============================================================
@favoritos_bp.route("/toggle/<noticia_id>", methods=["POST"])
def toggle_favorite(noticia_id):
"""Toggle favorite status for a news item."""
user_id, session_id = get_user_or_session_id()
with get_write_conn() as conn:
ensure_favoritos_table(conn)
with conn.cursor() as cur:
# Check if already favorited (by user_id OR session_id)
if user_id:
cur.execute(
"SELECT id FROM favoritos WHERE user_id = %s AND noticia_id = %s",
(user_id, noticia_id)
)
else:
cur.execute(
"SELECT id FROM favoritos WHERE session_id = %s AND noticia_id = %s",
(session_id, noticia_id)
)
existing = cur.fetchone()
if existing:
# Remove favorite
if user_id:
cur.execute(
"DELETE FROM favoritos WHERE user_id = %s AND noticia_id = %s",
(user_id, noticia_id)
)
else:
cur.execute(
"DELETE FROM favoritos WHERE session_id = %s AND noticia_id = %s",
(session_id, noticia_id)
)
is_favorite = False
else:
# Add favorite
cur.execute(
"INSERT INTO favoritos (user_id, session_id, noticia_id) VALUES (%s, %s, %s) ON CONFLICT DO NOTHING",
(user_id, session_id, noticia_id)
)
is_favorite = True
conn.commit()
return jsonify({"success": True, "is_favorite": is_favorite})
# ============================================================
# API: Check if Favorite
# ============================================================
@favoritos_bp.route("/check/<noticia_id>")
def check_favorite(noticia_id):
"""Check if a news item is favorited."""
user_id, session_id = get_user_or_session_id()
with get_read_conn() as conn:
with conn.cursor() as cur:
if user_id:
cur.execute(
"SELECT id FROM favoritos WHERE user_id = %s AND noticia_id = %s",
(user_id, noticia_id)
)
else:
cur.execute(
"SELECT id FROM favoritos WHERE session_id = %s AND noticia_id = %s",
(session_id, noticia_id)
)
is_favorite = cur.fetchone() is not None
return jsonify({"is_favorite": is_favorite})
# ============================================================
# API: Get User's Favorites IDs
# ============================================================
@favoritos_bp.route("/ids")
def get_favorite_ids():
"""Get list of favorite noticia IDs for current user."""
user_id, session_id = get_user_or_session_id()
with get_read_conn() as conn:
with conn.cursor() as cur:
if user_id:
cur.execute(
"SELECT noticia_id FROM favoritos WHERE user_id = %s",
(user_id,)
)
else:
cur.execute(
"SELECT noticia_id FROM favoritos WHERE session_id = %s",
(session_id,)
)
ids = [row[0] for row in cur.fetchall()]
return jsonify({"ids": ids})
# ============================================================
# Page: View Favorites
# ============================================================
@favoritos_bp.route("/")
def view_favorites():
"""View all favorited news items."""
user_id, session_id = get_user_or_session_id()
user = get_current_user()
with get_read_conn() as conn:
with conn.cursor(cursor_factory=extras.DictCursor) as cur:
if user_id:
cur.execute("""
SELECT n.id, n.titulo, n.resumen, n.url, n.fecha, n.imagen_url,
n.fuente_nombre, c.nombre AS categoria, p.nombre AS pais,
t.titulo_trad, t.resumen_trad, t.lang_to,
f.created_at AS favorito_at
FROM favoritos f
JOIN noticias n ON n.id = f.noticia_id
LEFT JOIN categorias c ON c.id = n.categoria_id
LEFT JOIN paises p ON p.id = n.pais_id
LEFT JOIN traducciones t ON t.noticia_id = n.id AND t.lang_to = 'es' AND t.status = 'done'
WHERE f.user_id = %s
ORDER BY f.created_at DESC
LIMIT 100;
""", (user_id,))
else:
cur.execute("""
SELECT n.id, n.titulo, n.resumen, n.url, n.fecha, n.imagen_url,
n.fuente_nombre, c.nombre AS categoria, p.nombre AS pais,
t.titulo_trad, t.resumen_trad, t.lang_to,
f.created_at AS favorito_at
FROM favoritos f
JOIN noticias n ON n.id = f.noticia_id
LEFT JOIN categorias c ON c.id = n.categoria_id
LEFT JOIN paises p ON p.id = n.pais_id
LEFT JOIN traducciones t ON t.noticia_id = n.id AND t.lang_to = 'es' AND t.status = 'done'
WHERE f.session_id = %s
ORDER BY f.created_at DESC
LIMIT 100;
""", (session_id,))
noticias = cur.fetchall()
return render_template("favoritos.html", noticias=noticias, user=user)

428
routers/feeds.py Normal file
View file

@ -0,0 +1,428 @@
from flask import Blueprint, render_template, request, redirect, flash, url_for, jsonify
from db import get_conn
from psycopg2 import extras
from models.categorias import get_categorias
from models.paises import get_paises
from utils.feed_discovery import discover_feeds, validate_feed, get_feed_metadata
# Blueprint correcto
feeds_bp = Blueprint("feeds", __name__, url_prefix="/feeds")
@feeds_bp.route("/")
def list_feeds():
"""Listado con filtros"""
page = max(int(request.args.get("page", 1)), 1)
per_page = 50
offset = (page - 1) * per_page
pais_id = request.args.get("pais_id")
categoria_id = request.args.get("categoria_id")
estado = request.args.get("estado") or ""
where = []
params = []
if pais_id:
where.append("f.pais_id = %s")
params.append(int(pais_id))
if categoria_id:
where.append("f.categoria_id = %s")
params.append(int(categoria_id))
if estado == "activos":
where.append("f.activo = TRUE")
elif estado == "inactivos":
where.append("f.activo = FALSE")
elif estado == "errores":
where.append("COALESCE(f.fallos, 0) > 0")
where_sql = "WHERE " + " AND ".join(where) if where else ""
with get_conn() as conn, conn.cursor(cursor_factory=extras.DictCursor) as cur:
# Total
cur.execute(f"SELECT COUNT(*) FROM feeds f {where_sql}", params)
total_feeds = cur.fetchone()[0]
# Caídos: feeds inactivos o con demasiados fallos, contados dentro del
# mismo contexto de filtros para ver cuántos de ESTOS están caídos.
caidos_condition = "(f.activo = FALSE OR f.fallos >= 5)"
if where_sql:
# where_sql ya incluye "WHERE ..."
caidos_sql = f"SELECT COUNT(*) FROM feeds f {where_sql} AND {caidos_condition}"
else:
caidos_sql = f"SELECT COUNT(*) FROM feeds f WHERE {caidos_condition}"
cur.execute(caidos_sql, params)
feeds_caidos = cur.fetchone()[0]
total_pages = (total_feeds // per_page) + (1 if total_feeds % per_page else 0)
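# p. ej. total_feeds=101, per_page=50 -> total_pages=3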
# Lista paginada
cur.execute(
f"""
SELECT
f.id, f.nombre, f.descripcion, f.url,
f.activo, f.fallos, f.last_error,
c.nombre AS categoria,
p.nombre AS pais,
(SELECT COUNT(*) FROM noticias n WHERE n.fuente_nombre = f.nombre) as noticias_count
FROM feeds f
LEFT JOIN categorias c ON c.id = f.categoria_id
LEFT JOIN paises p ON p.id = f.pais_id
{where_sql}
ORDER BY p.nombre NULLS LAST, f.activo DESC, f.fallos ASC, c.nombre NULLS LAST, f.nombre
LIMIT %s OFFSET %s
""",
params + [per_page, offset],
)
feeds = cur.fetchall()
# Selects
cur.execute("SELECT id, nombre FROM categorias ORDER BY nombre;")
categorias = cur.fetchall()
cur.execute("SELECT id, nombre FROM paises ORDER BY nombre;")
paises = cur.fetchall()
if request.headers.get("X-Requested-With") == "XMLHttpRequest":
return render_template(
"_feeds_table.html",
feeds=feeds,
total_feeds=total_feeds,
feeds_caidos=feeds_caidos,
total_pages=total_pages,
page=page,
filtro_pais_id=pais_id,
filtro_categoria_id=categoria_id,
filtro_estado=estado,
)
return render_template(
"feeds_list.html",
feeds=feeds,
total_feeds=total_feeds,
feeds_caidos=feeds_caidos,
total_pages=total_pages,
page=page,
categorias=categorias,
paises=paises,
filtro_pais_id=pais_id,
filtro_categoria_id=categoria_id,
filtro_estado=estado,
)
@feeds_bp.route("/add", methods=["GET", "POST"])
def add_feed():
"""Añadir feed"""
with get_conn() as conn:
categorias = get_categorias(conn)
paises = get_paises(conn)
if request.method == "POST":
nombre = request.form.get("nombre")
descripcion = request.form.get("descripcion") or None
url = request.form.get("url")
categoria_id = request.form.get("categoria_id")
pais_id = request.form.get("pais_id")
idioma = (request.form.get("idioma") or "").strip().lower()[:2] or None
try:
with conn.cursor() as cur:
cur.execute(
"""
INSERT INTO feeds (nombre, descripcion, url, categoria_id, pais_id, idioma)
VALUES (%s, %s, %s, %s, %s, %s)
""",
(
nombre,
descripcion,
url,
int(categoria_id) if categoria_id else None,
int(pais_id) if pais_id else None,
idioma,
),
)
conn.commit()
flash("Feed añadido correctamente.", "success")
return redirect(url_for("feeds.list_feeds"))
except Exception as e:
flash(f"Error al añadir feed: {e}", "error")
return render_template("add_feed.html", categorias=categorias, paises=paises)
@feeds_bp.route("/<int:feed_id>/edit", methods=["GET", "POST"])
def edit_feed(feed_id):
"""Editar feed"""
with get_conn() as conn, conn.cursor(cursor_factory=extras.DictCursor) as cur:
cur.execute("SELECT * FROM feeds WHERE id = %s;", (feed_id,))
feed = cur.fetchone()
if not feed:
flash("Feed no encontrado.", "error")
return redirect(url_for("feeds.list_feeds"))
categorias = get_categorias(conn)
paises = get_paises(conn)
if request.method == "POST":
nombre = request.form.get("nombre")
descripcion = request.form.get("descripcion") or None
url = request.form.get("url")
categoria_id = request.form.get("categoria_id")
pais_id = request.form.get("pais_id")
idioma = (request.form.get("idioma") or "").strip().lower()[:2] or None
activo = bool(request.form.get("activo"))
try:
cur.execute(
"""
UPDATE feeds
SET nombre=%s, descripcion=%s, url=%s,
categoria_id=%s, pais_id=%s, idioma=%s, activo=%s
WHERE id=%s;
""",
(
nombre,
descripcion,
url,
int(categoria_id) if categoria_id else None,
int(pais_id) if pais_id else None,
idioma,
activo,
feed_id,
),
)
conn.commit()
flash("Feed actualizado.", "success")
return redirect(url_for("feeds.list_feeds"))
except Exception as e:
flash(f"Error al actualizar: {e}", "error")
return render_template("edit_feed.html", feed=feed, categorias=categorias, paises=paises)
@feeds_bp.route("/<int:feed_id>/delete")
def delete_feed(feed_id):
"""Eliminar feed"""
with get_conn() as conn, conn.cursor() as cur:
try:
cur.execute("DELETE FROM feeds WHERE id=%s;", (feed_id,))
conn.commit()
flash("Feed eliminado.", "success")
except Exception as e:
flash(f"No se pudo eliminar: {e}", "error")
return redirect(url_for("feeds.list_feeds"))
@feeds_bp.route("/<int:feed_id>/reactivar")
def reactivar_feed(feed_id):
"""Reactivar feed KO"""
with get_conn() as conn, conn.cursor() as cur:
try:
cur.execute(
"UPDATE feeds SET activo=TRUE, fallos=0 WHERE id=%s;",
(feed_id,),
)
conn.commit()
flash("Feed reactivado.", "success")
except Exception as e:
flash(f"No se pudo reactivar: {e}", "error")
return redirect(url_for("feeds.list_feeds"))
@feeds_bp.route("/discover", methods=["GET", "POST"])
def discover_feed():
"""Descubrir feeds RSS desde una URL"""
discovered_feeds = []
source_url = ""
with get_conn() as conn:
categorias = get_categorias(conn)
paises = get_paises(conn)
if request.method == "POST":
source_url = request.form.get("source_url", "").strip()
if not source_url:
flash("Por favor, ingresa una URL válida.", "error")
else:
try:
# Discover feeds from the URL
discovered_feeds = discover_feeds(source_url, timeout=15)
if not discovered_feeds:
flash(f"No se encontraron feeds RSS en la URL: {source_url}", "warning")
else:
# Check which feeds already exist in DB
found_urls = [f['url'] for f in discovered_feeds]
existing_urls = set()
try:
with conn.cursor() as cur:
cur.execute("SELECT url FROM feeds WHERE url = ANY(%s)", (found_urls,))
rows = cur.fetchall()
existing_urls = {r[0] for r in rows}
except Exception as db_e:
# Fallback if DB fails, though unlikely
print(f"Error checking existing feeds: {db_e}")
for feed in discovered_feeds:
feed['exists'] = feed['url'] in existing_urls
new_count = len(discovered_feeds) - len(existing_urls)
flash(f"Feeds disponibles: {new_count} de {len(discovered_feeds)} encontrados.", "success")
except Exception as e:
flash(f"Error al descubrir feeds: {e}", "error")
return render_template(
"discover_feeds.html",
discovered_feeds=discovered_feeds,
source_url=source_url,
categorias=categorias,
paises=paises
)
@feeds_bp.route("/discover_and_add", methods=["POST"])
def discover_and_add():
"""Añadir múltiples feeds descubiertos"""
selected_feeds = request.form.getlist("selected_feeds")
categoria_id = request.form.get("categoria_id")
pais_id = request.form.get("pais_id")
idioma = (request.form.get("idioma") or "").strip().lower()[:2] or None
if not selected_feeds:
flash("No se seleccionó ningún feed.", "warning")
return redirect(url_for("feeds.discover_feed"))
added_count = 0
errors = []
with get_conn() as conn:
for feed_url in selected_feeds:
try:
# Get individual settings for this feed
# The form uses the feed URL as part of the field name
item_cat_id = request.form.get(f"cat_{feed_url}")
item_country_id = request.form.get(f"country_{feed_url}")
item_lang = request.form.get(f"lang_{feed_url}")
# Get feed metadata
metadata = get_feed_metadata(feed_url, timeout=10)
if not metadata:
errors.append(f"No se pudo obtener metadata del feed: {feed_url}")
continue
# Use context title from discovery if available, otherwise use metadata title
context_title = request.form.get(f"context_{feed_url}")
nombre = context_title if context_title else metadata.get('title', 'Feed sin título')
descripcion = metadata.get('description', '')
with conn.cursor() as cur:
cur.execute(
"""
INSERT INTO feeds (nombre, descripcion, url, categoria_id, pais_id, idioma)
VALUES (%s, %s, %s, %s, %s, %s)
ON CONFLICT (url) DO NOTHING
""",
(
nombre,
descripcion[:500] if descripcion else None,
feed_url,
int(item_cat_id) if item_cat_id else None,
int(item_country_id) if item_country_id else None,
(item_lang or "").strip().lower()[:2] or None,
),
)
if cur.rowcount > 0:
added_count += 1
conn.commit()
except Exception as e:
errors.append(f"Error al añadir {feed_url}: {e}")
is_ajax = request.headers.get("X-Requested-With") == "XMLHttpRequest"
if added_count > 0:
msg = f"Se añadieron {added_count} feeds correctamente."
if not is_ajax:
flash(msg, "success")
else:
msg = "No se añadieron feeds nuevos."
if not is_ajax:
# Only flash warning if not ajax, or handle differently
if not errors:
flash(msg, "warning")
if errors:
for error in errors[:5]: # Mostrar solo los primeros 5 errores
if not is_ajax:
flash(error, "error")
if is_ajax:
return jsonify({
"success": added_count > 0,
"added_count": added_count,
"message": msg,
"errors": errors
})
return redirect(url_for("feeds.list_feeds"))
@feeds_bp.route("/api/validate", methods=["POST"])
def api_validate_feed():
"""API endpoint para validar una URL de feed"""
data = request.get_json()
feed_url = data.get("url", "").strip()
if not feed_url:
return jsonify({"error": "URL no proporcionada"}), 400
try:
feed_info = validate_feed(feed_url, timeout=10)
if not feed_info:
return jsonify({"error": "No se pudo validar el feed"}), 400
return jsonify(feed_info), 200
except Exception as e:
return jsonify({"error": str(e)}), 500
@feeds_bp.route("/api/discover", methods=["POST"])
def api_discover_feeds():
"""API endpoint para descubrir feeds desde una URL"""
data = request.get_json()
source_url = data.get("url", "").strip()
if not source_url:
return jsonify({"error": "URL no proporcionada"}), 400
try:
discovered = discover_feeds(source_url, timeout=15)
return jsonify({"feeds": discovered, "count": len(discovered)}), 200
except Exception as e:
return jsonify({"error": str(e)}), 500

294
routers/home.py Normal file
View file

@ -0,0 +1,294 @@
from flask import Blueprint, render_template, request
from datetime import datetime
from psycopg2 import extras
from db import get_read_conn, get_write_conn
from utils.auth import get_current_user
from config import DEFAULT_TRANSLATION_LANG, DEFAULT_LANG, NEWS_PER_PAGE_DEFAULT
from models.categorias import get_categorias
from models.paises import get_paises
from models.noticias import buscar_noticias, buscar_noticias_semantica
from cache import cached
home_bp = Blueprint("home", __name__)
@home_bp.route("/")
@home_bp.route("/home")
def home():
page = max(int(request.args.get("page", 1)), 1)
per_page = int(request.args.get("per_page", NEWS_PER_PAGE_DEFAULT))
per_page = min(max(per_page, 10), 100)
q = (request.args.get("q") or "").strip()
categoria_id = request.args.get("categoria_id")
continente_id = request.args.get("continente_id")
pais_id = request.args.get("pais_id")
fecha_str = request.args.get("fecha") or ""
lang = (request.args.get("lang") or DEFAULT_TRANSLATION_LANG or DEFAULT_LANG).lower()[:5]
use_tr = not bool(request.args.get("orig"))
fecha_filtro = None
if fecha_str:
try:
fecha_filtro = datetime.strptime(fecha_str, "%Y-%m-%d").date()
except ValueError:
fecha_filtro = None
from utils.qdrant_search import semantic_search
# Semantic search is enabled by default when a query exists. HTML checkboxes
# send nothing when unchecked, so a missing 'semantic' key means "use the
# default" (True); an empty string or an explicit "false"/"0"/"off" disables it.
raw_semantic = request.args.get("semantic")
if raw_semantic is None:
use_semantic = True # Default to semantic if not specified
elif raw_semantic == "" or raw_semantic.lower() in ["false", "0", "off"]:
use_semantic = False
else:
use_semantic = True
with get_read_conn() as conn:
conn.autocommit = True
categorias = get_categorias(conn)
paises = get_paises(conn)
noticias = []
total_results = 0
total_pages = 0
tags_por_tr = {}
# 1. Intentar búsqueda semántica si hay query y está habilitado
semantic_success = False
if use_semantic and q:
try:
# Obtener más resultados para 'llenar' la página si hay IDs no encontrados
limit_fetch = per_page * 2
sem_results = semantic_search(
query=q,
limit=limit_fetch, # Pedimos más para asegurar
score_threshold=0.30
)
if sem_results:
# Extraer IDs
news_ids = [r['news_id'] for r in sem_results]
# Traer datos completos de PostgreSQL (igual que en search.py)
with conn.cursor(cursor_factory=extras.DictCursor) as cur:
query_sql = """
SELECT
n.id,
n.titulo,
n.resumen,
n.url,
n.fecha,
n.imagen_url,
n.fuente_nombre,
c.nombre AS categoria,
p.nombre AS pais,
-- traducciones
t.id AS traduccion_id,
t.titulo_trad AS titulo_traducido,
t.resumen_trad AS resumen_traducido,
CASE WHEN t.id IS NOT NULL THEN TRUE ELSE FALSE END AS tiene_traduccion,
-- originales
n.titulo AS titulo_original,
n.resumen AS resumen_original
FROM noticias n
LEFT JOIN categorias c ON c.id = n.categoria_id
LEFT JOIN paises p ON p.id = n.pais_id
LEFT JOIN traducciones t
ON t.noticia_id = n.id
AND t.lang_to = %s
AND t.status = 'done'
WHERE n.id = ANY(%s)
"""
cur.execute(query_sql, (lang, news_ids))
rows = cur.fetchall()
# Convertimos a lista para poder ordenar por fecha
rows_list = list(rows)
# Ordenar cronológicamente (más reciente primero)
sorted_rows = sorted(
rows_list,
key=lambda x: x['fecha'] if x['fecha'] else datetime.min,
reverse=True
)
# Aplicar paginación manual sobre los resultados ordenados
# Nota sobre paginación: semantic_search devuelve solo el "top" global
# (aproximado) y utils/qdrant_search.py no soporta offset, así que esto
# solo funciona bien para la página 1. Para paginar en profundidad habría
# que añadir offset/scroll a la búsqueda en Qdrant, o caer a la búsqueda
# tradicional cuando page > 1; de momento usamos lo que devolvió
# semantic_search y cortamos localmente.
if len(sorted_rows) > 0:
noticias = sorted_rows
total_results = len(noticias) # Aproximado
total_pages = 1 # Qdrant simple no pagina bien aun
# Extraer tags
tr_ids = [n["traduccion_id"] for n in noticias if n["traduccion_id"]]
from models.noticias import _extraer_tags_por_traduccion
tags_por_tr = _extraer_tags_por_traduccion(cur, tr_ids)
semantic_success = True
except Exception as e:
print(f"⚠️ Error en semántica home, fallback: {e}")
semantic_success = False
# 2. Si no hubo búsqueda semántica (o falló, o no había query, o usuario la desactivó), usar la tradicional
if not semantic_success:
noticias, total_results, total_pages, tags_por_tr = buscar_noticias(
conn=conn,
page=page,
per_page=per_page,
q=q,
categoria_id=categoria_id,
continente_id=continente_id,
pais_id=pais_id,
fecha=fecha_filtro,
lang=lang,
use_tr=use_tr,
)
# Record search history for logged-in users (only on first page to avoid dupes)
if (q or categoria_id or pais_id) and page == 1:
user = get_current_user()
if user:
try:
with get_write_conn() as w_conn:
with w_conn.cursor() as w_cur:
# Check if it's the same as the last search to avoid immediate duplicates
w_cur.execute("""
SELECT query, pais_id, categoria_id
FROM search_history
WHERE user_id = %s
ORDER BY searched_at DESC LIMIT 1
""", (user['id'],))
last_search = w_cur.fetchone()
current_search = (q or None, int(pais_id) if pais_id else None, int(categoria_id) if categoria_id else None)
if not last_search or (last_search[0], last_search[1], last_search[2]) != current_search:
w_cur.execute("""
INSERT INTO search_history (user_id, query, pais_id, categoria_id, results_count)
VALUES (%s, %s, %s, %s, %s)
""", (user['id'], current_search[0], current_search[1], current_search[2], total_results))
w_conn.commit()
except Exception as e:
# Log error but don't break the page load
print(f"Error saving search history: {e}")
pass
user = get_current_user()
recent_searches_with_results = []
if user and not q and not categoria_id and not pais_id and page == 1:
with get_read_conn() as conn:
with conn.cursor(cursor_factory=extras.DictCursor) as cur:
# Fetch unique latest searches using DISTINCT ON
cur.execute("""
SELECT sub.id, query, pais_id, categoria_id, results_count, searched_at,
p.nombre as pais_nombre, c.nombre as categoria_nombre
FROM (
SELECT DISTINCT ON (COALESCE(query, ''), COALESCE(pais_id, 0), COALESCE(categoria_id, 0))
id, query, pais_id, categoria_id, results_count, searched_at
FROM search_history
WHERE user_id = %s
ORDER BY COALESCE(query, ''), COALESCE(pais_id, 0), COALESCE(categoria_id, 0), searched_at DESC
) sub
LEFT JOIN paises p ON p.id = sub.pais_id
LEFT JOIN categorias c ON c.id = sub.categoria_id
ORDER BY searched_at DESC
LIMIT 6
""", (user['id'],))
recent_searches = cur.fetchall()
for s in recent_searches:
# Fetch top 6 news for this search
news_items, _, _, _ = buscar_noticias(
conn=conn,
page=1,
per_page=6,
q=s['query'] or "",
pais_id=s['pais_id'],
categoria_id=s['categoria_id'],
lang=lang,
use_tr=use_tr,
skip_count=True
)
recent_searches_with_results.append({
'id': s['id'],
'query': s['query'],
'pais_id': s['pais_id'],
'pais_nombre': s['pais_nombre'],
'categoria_id': s['categoria_id'],
'categoria_nombre': s['categoria_nombre'],
'results_count': s['results_count'],
'searched_at': s['searched_at'],
'noticias': news_items
})
context = dict(
noticias=noticias,
total_results=total_results,
total_pages=total_pages,
page=page,
per_page=per_page,
categorias=categorias,
paises=paises,
q=q,
cat_id=int(categoria_id) if categoria_id else None,
pais_id=int(pais_id) if pais_id else None,
fecha_filtro=fecha_str,
lang=lang,
use_tr=use_tr,
use_semantic=use_semantic,
tags_por_tr=tags_por_tr,
recent_searches_with_results=recent_searches_with_results,
)
if request.headers.get("X-Requested-With") == "XMLHttpRequest":
return render_template("_noticias_list.html", **context)
return render_template("noticias.html", **context)
@home_bp.route("/delete_search/<int:search_id>", methods=["POST"])
def delete_search(search_id):
user = get_current_user()
if not user:
return {"error": "No autenticado"}, 401
try:
with get_write_conn() as conn:
with conn.cursor() as cur:
# Direct deletion ensuring ownership
cur.execute(
"DELETE FROM search_history WHERE id = %s AND user_id = %s",
(search_id, user["id"])
)
conn.commit()
return {"success": True}
except Exception as e:
print(f"Error deleting search {search_id}: {e}")
return {"error": str(e)}, 500

116
routers/noticia.py Normal file
View file

@ -0,0 +1,116 @@
from flask import Blueprint, render_template, request, redirect, flash, url_for
from db import get_read_conn
from psycopg2 import extras
noticia_bp = Blueprint("noticia", __name__)
@noticia_bp.route("/noticia")
def noticia():
tr_id = request.args.get("tr_id")
noticia_id = request.args.get("id")
if not tr_id and not noticia_id:
flash("No se ha indicado ninguna noticia.", "warning")
return redirect(url_for("home.home"))
with get_read_conn() as conn:
conn.autocommit = True
with conn.cursor(cursor_factory=extras.DictCursor) as cur:
dato = None
if tr_id:
cur.execute(
"""
SELECT
t.id AS traduccion_id,
t.lang_from,
t.lang_to,
t.titulo_trad,
t.resumen_trad,
n.id AS noticia_id,
n.titulo AS titulo_orig,
n.resumen AS resumen_orig,
n.url,
n.fecha,
n.imagen_url,
n.fuente_nombre,
c.nombre AS categoria,
p.nombre AS pais
FROM traducciones t
JOIN noticias n ON n.id = t.noticia_id
LEFT JOIN categorias c ON c.id = n.categoria_id
LEFT JOIN paises p ON p.id = n.pais_id
WHERE t.id = %s
""",
(int(tr_id),),
)
dato = cur.fetchone()
else:
cur.execute(
"""
SELECT
NULL AS traduccion_id,
NULL AS lang_from,
NULL AS lang_to,
NULL AS titulo_trad,
NULL AS resumen_trad,
n.id AS noticia_id,
n.titulo AS titulo_orig,
n.resumen AS resumen_orig,
n.url,
n.fecha,
n.imagen_url,
n.fuente_nombre,
c.nombre AS categoria,
p.nombre AS pais
FROM noticias n
LEFT JOIN categorias c ON c.id = n.categoria_id
LEFT JOIN paises p ON p.id = n.pais_id
WHERE n.id = %s
""",
(noticia_id,),
)
dato = cur.fetchone()
tags = []
relacionadas = []
if dato and dato["traduccion_id"]:
cur.execute(
"""
SELECT tg.valor, tg.tipo
FROM tags_noticia tn
JOIN tags tg ON tg.id = tn.tag_id
WHERE tn.traduccion_id = %s
ORDER BY tg.tipo, tg.valor;
""",
(dato["traduccion_id"],),
)
tags = cur.fetchall()
cur.execute(
"""
SELECT
n2.url,
n2.titulo,
n2.fecha,
n2.imagen_url,
n2.fuente_nombre,
rn.score,
t2.titulo_trad,
t2.id AS related_tr_id
FROM related_noticias rn
JOIN traducciones t2 ON t2.id = rn.related_traduccion_id
JOIN noticias n2 ON n2.id = t2.noticia_id
WHERE rn.traduccion_id = %s
ORDER BY rn.score DESC
LIMIT 8;
""",
(dato["traduccion_id"],),
)
relacionadas = cur.fetchall()
return render_template("noticia.html", dato=dato, tags=tags, relacionadas=relacionadas)

44
routers/notifications.py Normal file
View file

@ -0,0 +1,44 @@
"""
Notifications router - Check for new important news.
"""
from flask import Blueprint, jsonify, request
from db import get_conn
from datetime import datetime
notifications_bp = Blueprint("notifications", __name__, url_prefix="/api/notifications")
@notifications_bp.route("/check")
def check_notifications():
"""Check for new news since a given timestamp."""
last_check = request.args.get("last_check")
if not last_check:
return jsonify({"has_news": False, "timestamp": datetime.utcnow().isoformat()})
try:
# Check for news created after last_check
# We define "important" as having translation or high score (if score existed)
# For now, just any new news to demonstrate functionality
with get_conn() as conn:
with conn.cursor() as cur:
cur.execute("""
SELECT COUNT(*), MAX(fecha)
FROM noticias
WHERE fecha > %s
""", (last_check,))
row = cur.fetchone()
count = row[0]
latest = row[1]
if count > 0:
return jsonify({
"has_news": True,
"count": count,
"timestamp": latest.isoformat() if latest else datetime.utcnow().isoformat(),
"message": f"¡{count} noticias nuevas encontradas!"
})
except Exception as e:
print(f"Error checking notifications: {e}")
return jsonify({"has_news": False, "timestamp": datetime.utcnow().isoformat()})

325
routers/parrillas.py Normal file
View file

@ -0,0 +1,325 @@
"""
Router para gestionar parrillas de videos de noticias.
"""
from flask import Blueprint, render_template, request, jsonify, redirect, url_for, flash
from db import get_conn
from psycopg2 import extras
from datetime import datetime, timedelta
import logging
logger = logging.getLogger(__name__)
parrillas_bp = Blueprint("parrillas", __name__, url_prefix="/parrillas")
@parrillas_bp.route("/")
def index():
"""Dashboard principal de parrillas."""
with get_conn() as conn:
with conn.cursor(cursor_factory=extras.DictCursor) as cur:
# Obtener todas las parrillas
cur.execute("""
SELECT
p.*,
pa.nombre as pais_nombre,
c.nombre as categoria_nombre,
(SELECT COUNT(*) FROM video_generados WHERE parrilla_id = p.id) as total_videos
FROM video_parrillas p
LEFT JOIN paises pa ON pa.id = p.pais_id
LEFT JOIN categorias c ON c.id = p.categoria_id
ORDER BY p.created_at DESC
""")
parrillas = cur.fetchall()
return render_template("parrillas/index.html", parrillas=parrillas)
@parrillas_bp.route("/nueva", methods=["GET", "POST"])
def nueva():
"""Crear una nueva parrilla."""
if request.method == "GET":
# Cargar datos para el formulario
with get_conn() as conn:
with conn.cursor(cursor_factory=extras.DictCursor) as cur:
cur.execute("SELECT id, nombre FROM paises ORDER BY nombre")
paises = cur.fetchall()
cur.execute("SELECT id, nombre FROM categorias ORDER BY nombre")
categorias = cur.fetchall()
return render_template("parrillas/form.html",
paises=paises,
categorias=categorias)
# POST: Crear parrilla
try:
data = request.form
with get_conn() as conn:
with conn.cursor() as cur:
cur.execute("""
INSERT INTO video_parrillas (
nombre, descripcion, tipo_filtro,
pais_id, categoria_id, entidad_nombre, entidad_tipo,
max_noticias, duracion_maxima, idioma_voz,
template, include_images, include_subtitles,
frecuencia, activo
) VALUES (
%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s
) RETURNING id
""", (
data.get('nombre'),
data.get('descripcion'),
data.get('tipo_filtro'),
data.get('pais_id') or None,
data.get('categoria_id') or None,
data.get('entidad_nombre') or None,
data.get('entidad_tipo') or None,
int(data.get('max_noticias', 5)),
int(data.get('duracion_maxima', 180)),
data.get('idioma_voz', 'es'),
data.get('template', 'standard'),
data.get('include_images') == 'on',
data.get('include_subtitles') == 'on',
data.get('frecuencia', 'manual'),
data.get('activo') == 'on'
))
parrilla_id = cur.fetchone()[0]
conn.commit()
flash(f"Parrilla '{data.get('nombre')}' creada exitosamente", "success")
return redirect(url_for('parrillas.ver', id=parrilla_id))
except Exception as e:
logger.error(f"Error creating parrilla: {e}", exc_info=True)
flash(f"Error al crear parrilla: {str(e)}", "error")
return redirect(url_for('parrillas.nueva'))
@parrillas_bp.route("/<int:id>")
def ver(id):
"""Ver detalles de una parrilla específica."""
with get_conn() as conn:
with conn.cursor(cursor_factory=extras.DictCursor) as cur:
# Obtener parrilla
cur.execute("""
SELECT
p.*,
pa.nombre as pais_nombre,
c.nombre as categoria_nombre
FROM video_parrillas p
LEFT JOIN paises pa ON pa.id = p.pais_id
LEFT JOIN categorias c ON c.id = p.categoria_id
WHERE p.id = %s
""", (id,))
parrilla = cur.fetchone()
if not parrilla:
flash("Parrilla no encontrada", "error")
return redirect(url_for('parrillas.index'))
# Obtener videos generados
cur.execute("""
SELECT * FROM video_generados
WHERE parrilla_id = %s
ORDER BY fecha_generacion DESC
LIMIT 50
""", (id,))
videos = cur.fetchall()
return render_template("parrillas/detail.html", parrilla=parrilla, videos=videos)
@parrillas_bp.route("/api/<int:id>/preview")
def preview_noticias(id):
"""Preview de noticias que se incluirían en el siguiente video."""
with get_conn() as conn:
with conn.cursor(cursor_factory=extras.DictCursor) as cur:
# Obtener configuración de parrilla
cur.execute("SELECT * FROM video_parrillas WHERE id = %s", (id,))
parrilla = cur.fetchone()
if not parrilla:
return jsonify({"error": "Parrilla no encontrada"}), 404
# Construir query según filtros
where_clauses = []
params = []
if parrilla['pais_id']:
where_clauses.append("n.pais_id = %s")
params.append(parrilla['pais_id'])
if parrilla['categoria_id']:
where_clauses.append("n.categoria_id = %s")
params.append(parrilla['categoria_id'])
if parrilla['entidad_nombre']:
# Filtrar por entidad
where_clauses.append("""
EXISTS (
SELECT 1 FROM tags_noticia tn
JOIN tags t ON t.id = tn.tag_id
WHERE tn.traduccion_id = tr.id
AND t.tipo = %s
AND t.valor ILIKE %s
)
""")
params.append(parrilla['entidad_tipo'])
params.append(f"%{parrilla['entidad_nombre']}%")
# Solo noticias de hoy o ayer
where_clauses.append("n.fecha >= NOW() - INTERVAL '1 day'")
where_sql = " AND ".join(where_clauses) if where_clauses else "1=1"
# Obtener noticias
cur.execute(f"""
SELECT
n.id,
n.titulo,
n.imagen_url,
n.fecha,
tr.titulo_trad,
tr.resumen_trad,
LENGTH(tr.resumen_trad) as longitud_texto
FROM noticias n
LEFT JOIN traducciones tr ON tr.noticia_id = n.id AND tr.lang_to = %s AND tr.status = 'done'
WHERE {where_sql}
AND tr.id IS NOT NULL
ORDER BY n.fecha DESC
LIMIT %s
""", [parrilla['idioma_voz']] + params + [parrilla['max_noticias']])
noticias = cur.fetchall()
return jsonify({
"noticias": [dict(n) for n in noticias],
"total": len(noticias),
"config": {
"max_noticias": parrilla['max_noticias'],
"duracion_maxima": parrilla['duracion_maxima']
}
})
@parrillas_bp.route("/api/<int:id>/generar", methods=["POST"])
def generar_video(id):
"""Iniciar generación de video para una parrilla."""
try:
with get_conn() as conn:
with conn.cursor(cursor_factory=extras.DictCursor) as cur:
# Verificar que la parrilla existe
cur.execute("SELECT * FROM video_parrillas WHERE id = %s", (id,))
parrilla = cur.fetchone()
if not parrilla:
return jsonify({"error": "Parrilla no encontrada"}), 404
# Crear registro de video
cur.execute("""
INSERT INTO video_generados (
parrilla_id, titulo, descripcion, status
) VALUES (
%s, %s, %s, 'pending'
) RETURNING id
""", (
id,
f"{parrilla['nombre']} - {datetime.now().strftime('%Y-%m-%d %H:%M')}",
f"Video generado automáticamente para {parrilla['nombre']}"
))
video_id = cur.fetchone()[0]
# Actualizar fecha de última generación
cur.execute("""
UPDATE video_parrillas
SET ultima_generacion = NOW()
WHERE id = %s
""", (id,))
conn.commit()
# Lanzar el proceso de generación en segundo plano
import subprocess
import sys
# Ejecutamos el script generador pasando el ID de la parrilla
# Usamos Popen para no bloquear la respuesta HTTP (fire and forget)
cmd = [sys.executable, "generar_videos_noticias.py", str(id)]
subprocess.Popen(cmd, cwd="/app")
return jsonify({
"success": True,
"video_id": video_id,
"message": "Generación de video iniciada en segundo plano"
})
except Exception as e:
logger.error(f"Error queuing video: {e}", exc_info=True)
return jsonify({"error": str(e)}), 500
@parrillas_bp.route("/api/<int:id>", methods=["DELETE"])
def eliminar(id):
"""Eliminar una parrilla."""
try:
with get_conn() as conn:
with conn.cursor() as cur:
cur.execute("DELETE FROM video_parrillas WHERE id = %s", (id,))
conn.commit()
return jsonify({"success": True})
except Exception as e:
logger.error(f"Error deleting parrilla: {e}", exc_info=True)
return jsonify({"error": str(e)}), 500
@parrillas_bp.route("/api/<int:id>/toggle", methods=["POST"])
def toggle_activo(id):
"""Activar/desactivar una parrilla."""
try:
with get_conn() as conn:
with conn.cursor() as cur:
cur.execute("""
UPDATE video_parrillas
SET activo = NOT activo
WHERE id = %s
RETURNING activo
""", (id,))
nuevo_estado = cur.fetchone()[0]
conn.commit()
return jsonify({"success": True, "activo": nuevo_estado})
except Exception as e:
logger.error(f"Error toggling parrilla: {e}", exc_info=True)
return jsonify({"error": str(e)}), 500
@parrillas_bp.route("/files/<int:video_id>/<filename>")
def serve_file(video_id, filename):
"""Servir archivos generados (audio, script, srt)."""
from flask import send_from_directory
import os
# Directorio base de videos
base_dir = "/app/data/videos"
video_dir = os.path.join(base_dir, str(video_id))
# Validar que sea un archivo permitido para evitar Path Traversal
allowed_files = ['audio.wav', 'script.txt', 'subtitles.srt', 'generation.log']
if filename not in allowed_files:
logger.warning(f"File download attempt blocked: {filename}")
return "File not allowed", 403
full_path = os.path.join(video_dir, filename)
if not os.path.exists(full_path):
logger.error(f"File not found: {full_path}")
return "File not found", 404
try:
return send_from_directory(video_dir, filename)
except Exception as e:
logger.error(f"Error serving file {full_path}: {e}")
return f"Error serving file: {e}", 500

88
routers/pdf.py Normal file
View file

@ -0,0 +1,88 @@
"""
PDF Export router.
"""
from flask import Blueprint, make_response, render_template, url_for
from db import get_conn
from psycopg2 import extras
from weasyprint import HTML
import logging
import re
from io import BytesIO
logger = logging.getLogger(__name__)
pdf_bp = Blueprint("pdf", __name__, url_prefix="/pdf")
def clean_text(text):
"""Clean text from problematic characters for PDF generation."""
if not text:
return ""
# Remove <unk> tokens
text = text.replace('<unk>', '')
text = text.replace('<EFBFBD>', '')
# Remove other problematic Unicode characters
text = re.sub(r'[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F-\x9F]', '', text)
return text.strip()
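# Quick illustration: artifact tokens and control characters are stripped,
# e.g. clean_text("Hola<unk> mundo\x07") returns "Hola mundo".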
@pdf_bp.route("/noticia/<noticia_id>")
def export_noticia(noticia_id):
"""Exportar noticia a PDF."""
try:
with get_conn() as conn:
with conn.cursor(cursor_factory=extras.DictCursor) as cur:
cur.execute("""
SELECT
n.*,
t.titulo_trad, t.resumen_trad, t.lang_to,
c.nombre as categoria_nombre,
p.nombre as pais_nombre
FROM noticias n
LEFT JOIN traducciones t ON t.noticia_id = n.id AND t.status = 'done' AND t.lang_to = 'es'
LEFT JOIN categorias c ON c.id = n.categoria_id
LEFT JOIN paises p ON p.id = n.pais_id
WHERE n.id = %s
""", (noticia_id,))
noticia = cur.fetchone()
if not noticia:
return "Noticia no encontrada", 404
# Prepare data for template
d = dict(noticia)
# Use translated content if available and clean it
titulo = clean_text(d.get('titulo_trad') or d.get('titulo', ''))
resumen = clean_text(d.get('resumen_trad') or d.get('resumen', ''))
# Don't include external images to avoid SSL/network errors
# imagen_url = d.get('imagen_url') if d.get('imagen_url', '').startswith('http') else None
html_content = render_template(
"pdf_template.html",
titulo=titulo,
resumen=resumen,
fecha=d.get('fecha', ''),
fuente=d.get('fuente_nombre', ''), # Esta columna existe directamente en noticias
categoria=d.get('categoria_nombre', ''),
url=d.get('url', ''),
imagen_url=None # Disable images for now to avoid errors
)
# Convert to PDF using WeasyPrint
logger.info(f"Generating PDF for noticia {noticia_id}")
# Create PDF in memory
pdf_file = BytesIO()
HTML(string=html_content).write_pdf(pdf_file)
pdf_bytes = pdf_file.getvalue()
response = make_response(pdf_bytes)
response.headers['Content-Type'] = 'application/pdf'
response.headers['Content-Disposition'] = f'attachment; filename=noticia_{noticia_id}.pdf'
logger.info(f"PDF generated successfully for noticia {noticia_id}")
return response
except Exception as e:
logger.error(f"Error generando PDF para noticia {noticia_id}: {str(e)}", exc_info=True)
return f"Error generando PDF: {str(e)}", 500

76
routers/resumen.py Normal file
View file

@ -0,0 +1,76 @@
"""
Resumen router - Daily summary of news.
"""
from flask import Blueprint, render_template, request
from psycopg2 import extras
from db import get_conn
from datetime import datetime, timedelta
resumen_bp = Blueprint("resumen", __name__, url_prefix="/resumen")
@resumen_bp.route("/")
def diario():
"""Daily summary page."""
# Default to today
date_str = request.args.get("date")
if date_str:
try:
target_date = datetime.strptime(date_str, "%Y-%m-%d").date()
except ValueError:
target_date = datetime.utcnow().date()
else:
target_date = datetime.utcnow().date()
prev_date = target_date - timedelta(days=1)
next_date = target_date + timedelta(days=1)
if next_date > datetime.utcnow().date():
next_date = None
with get_conn() as conn:
with conn.cursor(cursor_factory=extras.DictCursor) as cur:
# Fetch top news for the day grouped by category
# We'll limit to 5 per category to keep it concise
cur.execute("""
WITH ranked_news AS (
SELECT
n.id, n.titulo, n.resumen, n.url, n.fecha, n.imagen_url, n.fuente_nombre,
c.id as cat_id, c.nombre as categoria,
t.titulo_trad, t.resumen_trad,
ROW_NUMBER() OVER (PARTITION BY n.categoria_id ORDER BY n.fecha DESC) as rn
FROM noticias n
LEFT JOIN categorias c ON c.id = n.categoria_id
LEFT JOIN traducciones t ON t.noticia_id = n.id
AND t.lang_to = 'es' AND t.status = 'done'
WHERE n.fecha >= %s AND n.fecha < %s + INTERVAL '1 day'
)
SELECT * FROM ranked_news WHERE rn <= 5 ORDER BY categoria, rn
""", (target_date, target_date))
rows = cur.fetchall()
# Group by category
noticias_by_cat = {}
for r in rows:
cat = r["categoria"] or "Sin Categoría"
if cat not in noticias_by_cat:
noticias_by_cat[cat] = []
noticias_by_cat[cat].append({
"id": r["id"],
"titulo": r["titulo_trad"] or r["titulo"],
"resumen": r["resumen_trad"] or r["resumen"],
"url": r["url"],
"fecha": r["fecha"],
"imagen_url": r["imagen_url"],
"fuente": r["fuente_nombre"]
})
return render_template(
"resumen.html",
noticias_by_cat=noticias_by_cat,
current_date=target_date,
prev_date=prev_date,
next_date=next_date
)

186
routers/rss.py Normal file
View file

@ -0,0 +1,186 @@
"""
RSS Feed router - Generate custom RSS feeds with filters.
"""
from flask import Blueprint, request, Response
from psycopg2 import extras
from db import get_read_conn
from datetime import datetime
import html
rss_bp = Blueprint("rss", __name__, url_prefix="/rss")
def escape_xml(text):
"""Escape text for XML."""
if not text:
return ""
return html.escape(str(text))
def build_rss_xml(title, description, link, items):
"""Build RSS 2.0 XML feed."""
now = datetime.utcnow().strftime("%a, %d %b %Y %H:%M:%S +0000")
xml = f'''<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">
<channel>
<title>{escape_xml(title)}</title>
<description>{escape_xml(description)}</description>
<link>{escape_xml(link)}</link>
<lastBuildDate>{now}</lastBuildDate>
<language>es</language>
'''
for item in items:
pub_date = ""
if item.get("fecha"):
try:
pub_date = item["fecha"].strftime("%a, %d %b %Y %H:%M:%S +0000")
except Exception:
pass
xml += f''' <item>
<title>{escape_xml(item.get("titulo", ""))}</title>
<description><![CDATA[{item.get("resumen", "")}]]></description>
<link>{escape_xml(item.get("url", ""))}</link>
<guid isPermaLink="false">{escape_xml(item.get("id", ""))}</guid>
<pubDate>{pub_date}</pubDate>
</item>
'''
xml += '''</channel>
</rss>'''
return xml
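# Illustrative usage (hypothetical data), showing the expected item shape:
#   build_rss_xml("Tech", "Latest tech", "https://example.com",
#       [{"id": "abc123", "titulo": "Hola", "resumen": "<p>Texto</p>",
#         "url": "https://example.com/n/abc123", "fecha": datetime(2026, 1, 1)}])
# yields an RSS 2.0 document with one <item>, its description CDATA-wrapped.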
@rss_bp.route("/custom")
def custom_feed():
"""
Generate a custom RSS feed with filters.
Query params:
- pais_id: Filter by country ID
- categoria_id: Filter by category ID
- lang: Translation language (default: es)
- limit: Number of items (default: 50, max: 100)
"""
pais_id = request.args.get("pais_id")
categoria_id = request.args.get("categoria_id")
lang = (request.args.get("lang") or "es").lower()[:5]
limit = min(int(request.args.get("limit", 50)), 100)
# Build description based on filters
filters_desc = []
with get_read_conn() as conn:
with conn.cursor(cursor_factory=extras.DictCursor) as cur:
# Get filter names for description
if pais_id:
cur.execute("SELECT nombre FROM paises WHERE id = %s", (pais_id,))
row = cur.fetchone()
if row:
filters_desc.append(f"País: {row['nombre']}")
if categoria_id:
cur.execute("SELECT nombre FROM categorias WHERE id = %s", (categoria_id,))
row = cur.fetchone()
if row:
filters_desc.append(f"Categoría: {row['nombre']}")
# Build query
query = """
SELECT
n.id, n.titulo, n.resumen, n.url, n.fecha,
n.imagen_url, n.fuente_nombre,
t.titulo_trad, t.resumen_trad
FROM noticias n
LEFT JOIN traducciones t ON t.noticia_id = n.id
AND t.lang_to = %s AND t.status = 'done'
WHERE 1=1
"""
params = [lang]
if pais_id:
query += " AND n.pais_id = %s"
params.append(pais_id)
if categoria_id:
query += " AND n.categoria_id = %s"
params.append(categoria_id)
query += " ORDER BY n.fecha DESC LIMIT %s"
params.append(limit)
cur.execute(query, tuple(params))
rows = cur.fetchall()
# Build items
items = []
for r in rows:
items.append({
"id": r["id"],
"titulo": r["titulo_trad"] or r["titulo"],
"resumen": r["resumen_trad"] or r["resumen"] or "",
"url": r["url"],
"fecha": r["fecha"],
})
# Build feed metadata
title = "The Daily Feed"
if filters_desc:
title += " - " + ", ".join(filters_desc)
description = "Noticias personalizadas"
if filters_desc:
description = "Feed personalizado: " + ", ".join(filters_desc)
link = request.host_url.rstrip("/")
xml = build_rss_xml(title, description, link, items)
return Response(xml, mimetype="application/rss+xml")
@rss_bp.route("/favoritos")
def favoritos_feed():
"""Generate RSS feed of user's favorites."""
from routers.favoritos import get_session_id, ensure_favoritos_table
session_id = get_session_id()
with get_read_conn() as conn:
ensure_favoritos_table(conn)
with conn.cursor(cursor_factory=extras.DictCursor) as cur:
cur.execute("""
SELECT n.id, n.titulo, n.resumen, n.url, n.fecha,
t.titulo_trad, t.resumen_trad
FROM favoritos f
JOIN noticias n ON n.id = f.noticia_id
LEFT JOIN traducciones t ON t.noticia_id = n.id
AND t.lang_to = 'es' AND t.status = 'done'
WHERE f.session_id = %s
ORDER BY f.created_at DESC
LIMIT 50;
""", (session_id,))
rows = cur.fetchall()
items = []
for r in rows:
items.append({
"id": r["id"],
"titulo": r["titulo_trad"] or r["titulo"],
"resumen": r["resumen_trad"] or r["resumen"] or "",
"url": r["url"],
"fecha": r["fecha"],
})
xml = build_rss_xml(
"The Daily Feed - Mis Favoritos",
"Noticias guardadas en favoritos",
request.host_url.rstrip("/"),
items
)
return Response(xml, mimetype="application/rss+xml")

257
routers/search.py Normal file
View file

@ -0,0 +1,257 @@
"""
Search API router - Real-time search with semantic search (Qdrant) and autocomplete.
"""
from flask import Blueprint, request, jsonify
from psycopg2 import extras
from db import get_read_conn, get_write_conn
from utils.auth import get_current_user
from utils.qdrant_search import semantic_search
from datetime import datetime
search_bp = Blueprint("search", __name__, url_prefix="/api/search")
@search_bp.route("/")
def search():
"""Search noticias using semantic search (Qdrant) with PostgreSQL fallback."""
q = (request.args.get("q") or "").strip()
limit = min(int(request.args.get("limit", 10)), 50)
page = max(int(request.args.get("page", 1)), 1)  # Current page (1-indexed)
offset = (page - 1) * limit  # Offset of the first row on this page
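# e.g. page=3, limit=10 -> offset=20, so rows 21-30 are returned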
lang = (request.args.get("lang") or "es").lower()[:5]
use_semantic = request.args.get("semantic", "true").lower() == "true"
if not q or len(q) < 2:
return jsonify({
"results": [],
"total": 0,
"page": page,
"limit": limit,
"total_pages": 0
})
results = []
total = 0
# Try semantic search first (faster and higher quality)
if use_semantic:
try:
# For pagination we over-fetch from Qdrant, which is fast enough
# that grabbing extra results is cheap.
max_qdrant_results = min(offset + limit * 3, 200)  # Fetch up to 3 pages ahead
semantic_results = semantic_search(
query=q,
limit=max_qdrant_results,
score_threshold=0.3  # Lower threshold to capture more results
)
if semantic_results:
# Total found (capped by the over-fetch limit above)
total = len(semantic_results)
# Keep only the current page of results
page_results = semantic_results[offset : offset + limit]
if page_results:
# Enrich this page with additional data from PostgreSQL
news_ids = [r['news_id'] for r in page_results]
with get_read_conn() as conn:
with conn.cursor(cursor_factory=extras.DictCursor) as cur:
# Fetch additional fields (category, country)
cur.execute("""
SELECT
n.id,
n.titulo,
n.resumen,
n.url,
n.fecha,
n.imagen_url,
n.fuente_nombre,
c.nombre AS categoria,
p.nombre AS pais,
t.titulo_trad,
t.resumen_trad,
t.id AS traduccion_id
FROM noticias n
LEFT JOIN categorias c ON c.id = n.categoria_id
LEFT JOIN paises p ON p.id = n.pais_id
LEFT JOIN traducciones t ON t.noticia_id = n.id
AND t.lang_to = %s AND t.status = 'done'
WHERE n.id = ANY(%s)
""", (lang, news_ids))
db_rows = {row['id']: row for row in cur.fetchall()}
# Merge semantic hits with their PostgreSQL rows
for sem_result in page_results:
news_id = sem_result['news_id']
db_row = db_rows.get(news_id)
if db_row:
results.append({
"id": db_row["id"],
"titulo": db_row["titulo_trad"] or db_row["titulo"],
"resumen": (db_row["resumen_trad"] or db_row["resumen"] or "")[:150],
"url": db_row["url"],
"fecha": db_row["fecha"].isoformat() if db_row["fecha"] else None,
"imagen_url": db_row["imagen_url"],
"fuente": db_row["fuente_nombre"],
"categoria": db_row["categoria"],
"pais": db_row["pais"],
"traduccion_id": db_row["traduccion_id"],
"semantic_score": sem_result['score'],
"fecha_raw": db_row["fecha"] # Para ordenación
})
# Sort chronologically (newest first); datetime.min stands in for missing
# dates, since mixing None or "" with datetimes would raise a TypeError
results.sort(key=lambda x: x.get("fecha_raw") or datetime.min, reverse=True)
# Drop the temporary sort field
for r in results:
r.pop("fecha_raw", None)
except Exception as e:
print(f"⚠️ Error en búsqueda semántica, usando fallback: {e}")
import traceback
traceback.print_exc()
# Fall through to the traditional search below
# Fall back to traditional search when there are no semantic results and no fatal error
if not results and total == 0:
with get_read_conn() as conn:
with conn.cursor(cursor_factory=extras.DictCursor) as cur:
print(f"⚠️ Usando fallback PostgreSQL para búsqueda: '{q}'")
# Traditional search using Postgres full-text search
# Note: slower than Qdrant but needed as a fallback
cur.execute("""
WITH ranked_news AS (
-- Search original news
SELECT
n.id,
ts_rank(n.search_vector_es, websearch_to_tsquery('spanish', %s)) as rank
FROM noticias n
WHERE n.search_vector_es @@ websearch_to_tsquery('spanish', %s)
UNION ALL
-- Search translations
SELECT
t.noticia_id as id,
ts_rank(t.search_vector_es, websearch_to_tsquery('spanish', %s)) as rank
FROM traducciones t
WHERE t.search_vector_es @@ websearch_to_tsquery('spanish', %s)
AND t.lang_to = 'es'
AND t.status = 'done'
),
best_ranks AS (
SELECT id, MAX(rank) as max_rank
FROM ranked_news
GROUP BY id
)
SELECT
n.id,
n.titulo,
n.resumen,
n.url,
n.fecha,
n.imagen_url,
n.fuente_nombre,
c.nombre AS categoria,
p.nombre AS pais,
t.titulo_trad,
t.resumen_trad,
t.id AS traduccion_id,
br.max_rank AS rank
FROM best_ranks br
JOIN noticias n ON n.id = br.id
LEFT JOIN categorias c ON c.id = n.categoria_id
LEFT JOIN paises p ON p.id = n.pais_id
LEFT JOIN traducciones t ON t.noticia_id = n.id
AND t.lang_to = %s AND t.status = 'done'
ORDER BY n.fecha DESC, br.max_rank DESC
LIMIT %s OFFSET %s
""", (q, q, q, q, lang, limit, offset))
rows = cur.fetchall()
print(f"✅ PostgreSQL retornó {len(rows)} resultados")
# Count total with a simplified query
cur.execute("""
SELECT COUNT(DISTINCT id) FROM (
SELECT id FROM noticias
WHERE search_vector_es @@ websearch_to_tsquery('spanish', %s)
UNION
SELECT noticia_id as id FROM traducciones
WHERE search_vector_es @@ websearch_to_tsquery('spanish', %s)
AND lang_to = 'es' AND status = 'done'
) as all_hits
""", (q, q))
total_row = cur.fetchone()
total = total_row[0] if total_row else 0
for r in rows:
results.append({
"id": r["id"],
"titulo": r["titulo_trad"] or r["titulo"],
"resumen": (r["resumen_trad"] or r["resumen"] or "")[:150],
"url": r["url"],
"fecha": r["fecha"].isoformat() if r["fecha"] else None,
"imagen_url": r["imagen_url"],
"fuente": r["fuente_nombre"],
"categoria": r["categoria"],
"pais": r["pais"],
"traduccion_id": r["traduccion_id"],
})
# Save search history for authenticated users
user = get_current_user()
if user and q and page == 1:  # Only record page-1 queries
try:
with get_write_conn() as conn:
with conn.cursor() as cur:
cur.execute("""
INSERT INTO search_history (user_id, query, results_count)
VALUES (%s, %s, %s)
""", (user['id'], q, total))
conn.commit()
except Exception as e:
print(f"ERROR SAVING SEARCH HISTORY: {e}")
pass
total_pages = (total + limit - 1) // limit if limit > 0 else 0
return jsonify({
"results": results,
"total": total,
"query": q,
"page": page,
"limit": limit,
"total_pages": total_pages
})
@search_bp.route("/suggestions")
def suggestions():
"""Get search suggestions based on recent/popular searches and tags."""
q = (request.args.get("q") or "").strip()
limit = min(int(request.args.get("limit", 5)), 10)
if not q or len(q) < 2:
return jsonify({"suggestions": []})
with get_read_conn() as conn:
with conn.cursor() as cur:
# Get matching tags as suggestions
cur.execute("""
SELECT DISTINCT valor
FROM tags
WHERE valor ILIKE %s
ORDER BY valor
LIMIT %s;
""", (f"%{q}%", limit))
suggestions = [row[0] for row in cur.fetchall()]
return jsonify({"suggestions": suggestions, "query": q})

911
routers/stats.py Normal file
View file

@ -0,0 +1,911 @@
from flask import Blueprint, render_template, jsonify
from db import get_read_conn
from datetime import datetime, timedelta
import os
import subprocess
import time
from cache import cached
stats_bp = Blueprint("stats", __name__, url_prefix="/stats")
# ==================================================================================
# ENTITY NORMALIZATION SYSTEM
# ==================================================================================
# Dictionary to map entity name variations to canonical names
import json
CONFIG_FILE = "entity_config.json"
_config_cache = {"data": None, "mtime": 0}
def load_entity_config():
"""Load entity config from JSON file with simple modification time caching."""
global _config_cache
try:
# Check if file exists
if not os.path.exists(CONFIG_FILE):
return {"blacklist": [], "synonyms": {}}
# Check modification time
mtime = os.path.getmtime(CONFIG_FILE)
if _config_cache["data"] is not None and mtime <= _config_cache["mtime"]:
return _config_cache["data"]
# Load fresh config
with open(CONFIG_FILE, 'r', encoding='utf-8') as f:
data = json.load(f)
# Normalize structure
if "blacklist" not in data: data["blacklist"] = []
if "synonyms" not in data: data["synonyms"] = {}
# Pre-process synonyms for reverse lookup (variation -> canonical)
lookup = {}
for canonical, variations in data["synonyms"].items():
lookup[canonical.lower()] = canonical # Map canonical to itself
for var in variations:
lookup[var.lower()] = canonical
data["_lookup"] = lookup
data["_blacklist_set"] = {x.lower() for x in data["blacklist"]}
_config_cache = {"data": data, "mtime": mtime}
return data
except Exception as e:
print(f"Error loading entity config: {e}")
# Return fallback or previous cache if available
return _config_cache["data"] if _config_cache["data"] else {"blacklist": [], "synonyms": {}}
def normalize_entity_name(name: str, config=None) -> str:
"""Normalize entity name to its canonical form."""
if config is None:
config = load_entity_config()
lookup = config.get("_lookup", {})
return lookup.get(name.lower(), name)
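# Illustrative entity_config.json (hypothetical values):
# {
#   "blacklist": ["Agencia EFE"],
#   "synonyms": {
#     "Estados Unidos": ["EEUU", "EE.UU.", "United States", "USA"]
#   }
# }
# With that file, normalize_entity_name("EE.UU.") returns "Estados Unidos",
# and "Agencia EFE" rows are dropped by aggregate_normalized_entities.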
def aggregate_normalized_entities(rows, entity_type='persona'):
"""Aggregate entity counts by normalized names and filter blacklisted items.
Args:
rows: List of (name, count) tuples from database
entity_type: Type of entity for normalization (kept for compatibility but config is global now)
Returns:
List of (normalized_name, total_count) tuples sorted by count
"""
aggregated = {}
config = load_entity_config()
blacklist = config.get("_blacklist_set", set())
for name, count in rows:
# 1. Check blacklist (exact or lower match)
if name.lower() in blacklist:
continue
# 2. Normalize
normalized = normalize_entity_name(name, config)
# 3. Check blacklist again (in case canonical name is blacklisted)
if normalized.lower() in blacklist:
continue
aggregated[normalized] = aggregated.get(normalized, 0) + count
# Sort by count descending
sorted_items = sorted(aggregated.items(), key=lambda x: x[1], reverse=True)
return sorted_items
# ==================================================================================
@stats_bp.route("/")
def index():
"""Stats dashboard page."""
# Calculate translation stats for the banner
with get_read_conn() as conn:
with conn.cursor() as cur:
# Translations per minute (last 5 minutes)
cur.execute("""
SELECT COUNT(*) FROM traducciones
WHERE status = 'done'
AND created_at > NOW() - INTERVAL '5 minutes'
""")
recent_5min = cur.fetchone()[0]
translations_per_min = round(recent_5min / 5, 1) if recent_5min else 0
# Status counts
cur.execute("SELECT COUNT(*) FROM traducciones WHERE status = 'done'")
traducciones_count = cur.fetchone()[0]
cur.execute("SELECT COUNT(*) FROM traducciones WHERE status = 'pending'")
pending_count = cur.fetchone()[0]
cur.execute("SELECT COUNT(*) FROM traducciones WHERE status = 'processing'")
processing_count = cur.fetchone()[0]
cur.execute("SELECT COUNT(*) FROM traducciones WHERE status = 'error'")
error_count = cur.fetchone()[0]
# Total noticias (exact count - cached for 5 min in view)
cur.execute("SELECT COUNT(*) FROM noticias")
noticias_count = cur.fetchone()[0] or 0
# News ingested today
cur.execute("""
SELECT COUNT(*) FROM noticias
WHERE DATE(fecha) = CURRENT_DATE
""")
noticias_hoy = cur.fetchone()[0] or 0
# News ingested in the last hour
cur.execute("""
SELECT COUNT(*) FROM noticias
WHERE fecha >= NOW() - INTERVAL '1 hour'
""")
noticias_ultima_hora = cur.fetchone()[0] or 0
return render_template("stats.html",
translations_per_min=translations_per_min,
noticias_count=noticias_count,
traducciones_count=traducciones_count,
pending_count=pending_count,
processing_count=processing_count,
error_count=error_count,
noticias_hoy=noticias_hoy,
noticias_ultima_hora=noticias_ultima_hora)
@stats_bp.route("/api/activity")
@cached(ttl_seconds=300, prefix="stats")
def activity_data():
"""Get activity data (news count) for the specified range."""
from flask import request
range_param = request.args.get("range", "30d")
# Default: 30d -> group by day
days = 30
minutes = 0
interval_sql = "day" # For date_trunc or casting
timedelta_step = timedelta(days=1)
date_format = "%Y-%m-%d"
if range_param == "1h":
minutes = 60
interval_sql = "minute"
timedelta_step = timedelta(minutes=1)
date_format = "%H:%M"
elif range_param == "8h":
minutes = 480
interval_sql = "minute"
timedelta_step = timedelta(minutes=1)
date_format = "%H:%M"
elif range_param == "1d": # Alias for 24h
minutes = 1440
interval_sql = "hour"
timedelta_step = timedelta(hours=1)
date_format = "%H:%M"
elif range_param == "24h":
minutes = 1440
interval_sql = "hour"
timedelta_step = timedelta(hours=1)
date_format = "%H:%M"
elif range_param == "7d":
minutes = 10080
interval_sql = "hour"
timedelta_step = timedelta(hours=1)
# Include Month-Day for 7d context
date_format = "%d %H:%M"
elif range_param == "30d":
# The 30d view keeps the original date-cast grouping
minutes = 0
days = 30
interval_sql = "day"
timedelta_step = timedelta(days=1)
date_format = "%Y-%m-%d"
# Calculate start time
if minutes > 0:
start_time = datetime.utcnow() - timedelta(minutes=minutes)
# Using timestamp column directly
date_column = "fecha"
else:
start_time = datetime.utcnow() - timedelta(days=days)
# For 30d we might just use date part start
start_time = start_time.replace(hour=0, minute=0, second=0, microsecond=0)
date_column = "fecha"
with get_read_conn() as conn:
with conn.cursor() as cur:
# Construct query based on interval
if interval_sql == "day":
# Original logic style for 30d, but generalized
cur.execute("""
SELECT
fecha::date as time_slot,
COUNT(*) as count
FROM noticias
WHERE fecha >= %s
GROUP BY time_slot
ORDER BY time_slot
""", (start_time,))
else:
# Granular logic
cur.execute(f"""
SELECT
date_trunc('{interval_sql}', fecha) as time_slot,
COUNT(*) as count
FROM noticias
WHERE fecha >= %s
GROUP BY time_slot
ORDER BY time_slot
""", (start_time,))
rows = cur.fetchall()
# Fill gaps
data_map = {row[0]: row[1] for row in rows}
labels = []
data = []
# Iterate with step
if minutes > 0:
# Granular start alignment
current = start_time.replace(second=0, microsecond=0)
if interval_sql == "hour":
current = current.replace(minute=0)
end = datetime.utcnow().replace(second=0, microsecond=0)
if interval_sql == "hour":
end = end.replace(minute=0) + timedelta(hours=1)
else:
# Daily start alignment
current = start_time.date() if isinstance(start_time, datetime) else start_time
end = datetime.utcnow().date()
while current <= end:
# Format label
labels.append(current.strftime(date_format))
# Lookup keys differ by query: ::date returns date objects while
# date_trunc returns datetimes, so look up with the type we iterate with.
lookup_key = current
val = data_map.get(lookup_key, 0)
# Defensive fallback for a datetime/date mismatch on daily ranges
if val == 0 and isinstance(lookup_key, datetime) and interval_sql == 'day':
val = data_map.get(lookup_key.date(), 0)
data.append(val)
current += timedelta_step
return jsonify({
"labels": labels,
"data": data
})
@stats_bp.route("/api/categories")
@cached(ttl_seconds=300, prefix="stats")
def categories_data():
"""Get news count per category (Top 8 + Others)."""
with get_read_conn() as conn:
with conn.cursor() as cur:
cur.execute("""
SELECT
c.nombre,
COUNT(n.id) as count
FROM noticias n
JOIN categorias c ON c.id = n.categoria_id
GROUP BY c.nombre
ORDER BY count DESC
""")
rows = cur.fetchall()
# Process Top 8 + Others
labels = []
data = []
others_count = 0
top_limit = 8
for i, row in enumerate(rows):
if i < top_limit:
labels.append(row[0])
data.append(row[1])
else:
others_count += row[1]
if others_count > 0:
labels.append("Otros")
data.append(others_count)
return jsonify({
"labels": labels,
"data": data
})
@stats_bp.route("/api/countries")
@cached(ttl_seconds=300, prefix="stats")
def countries_data():
"""Get news count per country (Top 10 + Others)."""
with get_read_conn() as conn:
with conn.cursor() as cur:
cur.execute("""
SELECT
p.nombre,
COUNT(n.id) as count
FROM noticias n
JOIN paises p ON p.id = n.pais_id
GROUP BY p.nombre
ORDER BY count DESC
""")
rows = cur.fetchall()
# Process Top 10 + Others
labels = []
data = []
others_count = 0
top_limit = 10
for i, row in enumerate(rows):
if i < top_limit:
labels.append(row[0])
data.append(row[1])
else:
others_count += row[1]
if others_count > 0:
labels.append("Otros")
data.append(others_count)
return jsonify({
"labels": labels,
"data": data
})
@stats_bp.route("/api/countries/list")
def countries_list():
"""Get alphabetical list of all countries with flags."""
from utils import country_flag
with get_read_conn() as conn:
with conn.cursor() as cur:
cur.execute("SELECT nombre FROM paises ORDER BY nombre ASC")
rows = cur.fetchall()
return jsonify([
{"name": row[0], "flag": country_flag(row[0])}
for row in rows
])
@stats_bp.route("/api/translations/activity")
def translations_activity_data():
"""Get translation count per day for the last 30 days."""
days = 30
start_date = (datetime.utcnow() - timedelta(days=days)).date()
with get_read_conn() as conn:
with conn.cursor() as cur:
cur.execute("""
SELECT
created_at::date as day,
COUNT(*) as count
FROM traducciones
WHERE created_at >= %s
GROUP BY day
ORDER BY day
""", (start_date,))
rows = cur.fetchall()
# Fill gaps
data_map = {row[0]: row[1] for row in rows}
labels = []
data = []
current = start_date
end = datetime.utcnow().date()
while current <= end:
labels.append(current.strftime("%Y-%m-%d"))
data.append(data_map.get(current, 0))
current += timedelta(days=1)
return jsonify({
"labels": labels,
"data": data
})
@stats_bp.route("/api/translations/languages")
@cached(ttl_seconds=60, prefix="stats")
def translations_languages_data():
"""Get translation count per source language."""
# Friendly names for common languages
LANG_NAMES = {
'en': 'Inglés',
'es': 'Español',
'fr': 'Francés',
'de': 'Alemán',
'it': 'Italiano',
'pt': 'Portugués',
'ru': 'Ruso',
'zh': 'Chino',
'ja': 'Japonés',
'ar': 'Árabe'
}
with get_read_conn() as conn:
with conn.cursor() as cur:
cur.execute("""
SELECT
lang_from,
COUNT(*) as count
FROM translation_stats
WHERE lang_from IS NOT NULL
GROUP BY lang_from
ORDER BY count DESC
""")
rows = cur.fetchall()
labels = []
data = []
for code, count in rows:
code = code.strip().lower()
labels.append(LANG_NAMES.get(code, code.upper()))
data.append(count)
return jsonify({
"labels": labels,
"data": data
})
def get_system_uptime():
try:
with open('/proc/uptime', 'r') as f:
uptime_seconds = float(f.readline().split()[0])
days = int(uptime_seconds // (24 * 3600))
hours = int((uptime_seconds % (24 * 3600)) // 3600)
minutes = int((uptime_seconds % 3600) // 60)
if days > 0:
return f"{days}d {hours}h {minutes}m"
return f"{hours}h {minutes}m"
except Exception:
return "N/A"
def get_gpu_info():
try:
cmd = "nvidia-smi --query-gpu=name,temperature.gpu,utilization.gpu,memory.used,memory.total --format=csv,noheader,nounits"
with open(os.devnull, 'w') as devnull:
res = subprocess.check_output(cmd, shell=True, stderr=devnull).decode().strip()
parts = [p.strip() for p in res.split(',')]
if len(parts) >= 5:
return {
"name": parts[0],
"temp": f"{parts[1]}°C",
"util": f"{parts[2]}%",
"mem": f"{parts[3]} MB / {parts[4]} MB"
}
except Exception:
pass
return None
def get_cpu_info():
try:
load = os.getloadavg()
cores = os.cpu_count()
return {
"load": f"{load[0]:.2f}, {load[1]:.2f}, {load[2]:.2f}",
"cores": cores
}
except Exception:
return None
@stats_bp.route("/api/system/info")
def system_info_api():
"""Endpoint for real-time system monitoring."""
return jsonify({
"uptime": get_system_uptime(),
"gpu": get_gpu_info(),
"cpu": get_cpu_info(),
"timestamp": datetime.now().strftime("%H:%M:%S")
})
@stats_bp.route("/api/translations/rate")
@cached(ttl_seconds=60, prefix="stats")
def translations_rate_data():
"""Get translation count for the specified range (1h, 8h, 24h, 7d)."""
# Parameters
from flask import request
range_param = request.args.get("range", "1h")
# Default: 1h -> group by minute
minutes = 60
interval_sql = "minute"
timedelta_step = timedelta(minutes=1)
date_format = "%H:%M"
if range_param == "8h":
minutes = 8 * 60
interval_sql = "minute" # Still group by minute for detailed graph? Or 5 mins?
# Let's simple group by minute but it might be dense. 480 points. Fine.
timedelta_step = timedelta(minutes=1)
date_format = "%H:%M"
elif range_param == "24h":
minutes = 24 * 60
# Group by hour: 1440 per-minute points would be too dense
interval_sql = "hour"
timedelta_step = timedelta(hours=1)
date_format = "%H:%M"
elif range_param == "7d":
minutes = 7 * 24 * 60
interval_sql = "hour" # 7 * 24 = 168 points
timedelta_step = timedelta(hours=1)
date_format = "%Y-%m-%d %H:%M"
start_time = datetime.utcnow() - timedelta(minutes=minutes)
with get_read_conn() as conn:
with conn.cursor() as cur:
# Query translation_stats instead of traducciones
cur.execute(f"""
SELECT
date_trunc('{interval_sql}', created_at) as time_slot,
COUNT(*) as count
FROM translation_stats
WHERE created_at >= %s
GROUP BY time_slot
ORDER BY time_slot
""", (start_time,))
rows = cur.fetchall()
# Fill gaps
data_map = {row[0]: row[1] for row in rows}
labels = []
data = []
# Iterate by step
# Align start_time to step if possible (lazy alignment)
current = start_time.replace(second=0, microsecond=0)
if interval_sql == "hour":
current = current.replace(minute=0)
end = datetime.utcnow().replace(second=0, microsecond=0)
if interval_sql == "hour":
end = end.replace(minute=0) + timedelta(hours=1) # Ensure we cover current partial hour
while current <= end:
labels.append(current.strftime(date_format))
data.append(data_map.get(current, 0))
current += timedelta_step
return jsonify({
"labels": labels,
"data": data
})
@stats_bp.route("/entities")
def entities_dashboard():
"""Dashboard for Named Entities statistics."""
return render_template("stats_entities.html")
@stats_bp.route("/api/entities/people")
def entities_people():
"""Top 25 mentioned people, optionally filtered by country and/or date."""
from flask import request
from datetime import datetime
from cache import cache_get, cache_set
# 1. Check config mtime for cache invalidation
try:
config_mtime = os.path.getmtime(CONFIG_FILE)
except OSError:
config_mtime = 0
country_filter = request.args.get("country")
date_filter = request.args.get("date")
# 2. Build cache key with mtime
cache_key = f"entities:people:{country_filter}:{date_filter}:{config_mtime}"
# 3. Try cache
cached_data = cache_get(cache_key)
if cached_data:
return jsonify(cached_data)
# Determine time range
if date_filter:
# Single day query
try:
target_date = datetime.strptime(date_filter, "%Y-%m-%d").date()
time_condition = "DATE(tr.created_at) = %s"
time_params = [target_date]
except ValueError:
# Invalid date format, fallback to 30 days
time_condition = "tr.created_at >= NOW() - INTERVAL '30 days'"
time_params = []
else:
# Default: last 30 days
time_condition = "tr.created_at >= NOW() - INTERVAL '30 days'"
time_params = []
if country_filter and country_filter != 'global':
# Filtered by country
query = f"""
SELECT t.valor, COUNT(*) as menciones
FROM tags t
JOIN tags_noticia tn ON tn.tag_id = t.id
JOIN traducciones tr ON tn.traduccion_id = tr.id
JOIN noticias n ON tr.noticia_id = n.id
WHERE t.tipo = 'persona'
AND {time_condition}
AND n.pais_id = (SELECT id FROM paises WHERE nombre ILIKE %s LIMIT 1)
GROUP BY t.valor
ORDER BY menciones DESC
"""
params = tuple(time_params + [country_filter])
else:
# Global view
query = f"""
SELECT t.valor, COUNT(*) as menciones
FROM tags t
JOIN tags_noticia tn ON tn.tag_id = t.id
JOIN traducciones tr ON tn.traduccion_id = tr.id
WHERE t.tipo = 'persona'
AND {time_condition}
GROUP BY t.valor
ORDER BY menciones DESC
"""
params = tuple(time_params)
with get_read_conn() as conn:
with conn.cursor() as cur:
cur.execute(query, params)
rows = cur.fetchall()
# Normalize and aggregate
normalized_rows = aggregate_normalized_entities(rows, entity_type='persona')
# Take top 50
top_50 = normalized_rows[:50]
# Enrich with Wikipedia Images (Parallel Execution)
from concurrent.futures import ThreadPoolExecutor
from utils.wiki import fetch_wiki_data
images = []
summaries = []
def get_image_safe(name):
try:
return fetch_wiki_data(name)
except Exception:
return None, None
if top_50:
names = [row[0] for row in top_50]
with ThreadPoolExecutor(max_workers=10) as executor:
try:
results = list(executor.map(get_image_safe, names))
# Unpack results
for img, smry in results:
images.append(img)
summaries.append(smry)
except Exception as e:
import logging
logging.error(f"Error fetching wiki data: {e}")
# Fallback to empty if threading fails
images = [None] * len(names)
summaries = [None] * len(names)
else:
images = []
summaries = []
result = {
"labels": [row[0] for row in top_50],
"data": [row[1] for row in top_50],
"images": images,
"summaries": summaries
}
# 4. Set cache
cache_set(cache_key, result, ttl_seconds=600)
return jsonify(result)
@stats_bp.route("/api/entities/orgs")
def entities_orgs():
"""Top mentioned organizations, optionally filtered by country."""
from flask import request
from cache import cache_get, cache_set
country_filter = request.args.get("country")
try:
config_mtime = os.path.getmtime(CONFIG_FILE)
except OSError:
config_mtime = 0
cache_key = f"entities:orgs:{country_filter}:{config_mtime}"
cached_data = cache_get(cache_key)
if cached_data:
return jsonify(cached_data)
if country_filter and country_filter != 'global':
query = """
SELECT t.valor, COUNT(*) as menciones
FROM tags t
JOIN tags_noticia tn ON tn.tag_id = t.id
JOIN traducciones tr ON tn.traduccion_id = tr.id
JOIN noticias n ON tr.noticia_id = n.id
WHERE t.tipo = 'organizacion'
AND tr.created_at >= NOW() - INTERVAL '30 days'
AND n.pais_id = (SELECT id FROM paises WHERE nombre ILIKE %s LIMIT 1)
GROUP BY t.valor
ORDER BY menciones DESC
LIMIT 50
"""
params = (country_filter,)
else:
query = """
SELECT t.valor, COUNT(*) as menciones
FROM tags t
JOIN tags_noticia tn ON tn.tag_id = t.id
JOIN traducciones tr ON tn.traduccion_id = tr.id
WHERE t.tipo = 'organizacion'
AND tr.created_at >= NOW() - INTERVAL '30 days'
GROUP BY t.valor
ORDER BY menciones DESC
LIMIT 50
"""
params = ()
with get_read_conn() as conn:
with conn.cursor() as cur:
cur.execute(query, params)
rows = cur.fetchall()
normalized_rows = aggregate_normalized_entities(rows, entity_type='organizacion')
# Enrich with Wikipedia Images
from concurrent.futures import ThreadPoolExecutor
from utils.wiki import fetch_wiki_data
images = []
summaries = []
def get_info_safe(name):
try:
return fetch_wiki_data(name)
except Exception:
return None, None
if normalized_rows:
names = [row[0] for row in normalized_rows]
with ThreadPoolExecutor(max_workers=10) as executor:
results = list(executor.map(get_info_safe, names))
for img, smry in results:
images.append(img)
summaries.append(smry)
result = {
"labels": [row[0] for row in normalized_rows],
"data": [row[1] for row in normalized_rows],
"images": images,
"summaries": summaries
}
cache_set(cache_key, result, ttl_seconds=600)
return jsonify(result)
@stats_bp.route("/api/entities/places")
def entities_places():
"""Top mentioned places, optionally filtered by country."""
from flask import request
from cache import cache_get, cache_set
country_filter = request.args.get("country")
try:
config_mtime = os.path.getmtime(CONFIG_FILE)
except OSError:
config_mtime = 0
cache_key = f"entities:places:{country_filter}:{config_mtime}"
cached_data = cache_get(cache_key)
if cached_data:
return jsonify(cached_data)
if country_filter and country_filter != 'global':
query = """
SELECT t.valor, COUNT(*) as menciones
FROM tags t
JOIN tags_noticia tn ON tn.tag_id = t.id
JOIN traducciones tr ON tn.traduccion_id = tr.id
JOIN noticias n ON tr.noticia_id = n.id
WHERE t.tipo = 'lugar'
AND tr.created_at >= NOW() - INTERVAL '30 days'
AND n.pais_id = (SELECT id FROM paises WHERE nombre ILIKE %s LIMIT 1)
AND n.pais_id != (SELECT id FROM paises WHERE nombre = 'España')
GROUP BY t.valor
ORDER BY menciones DESC
LIMIT 50
"""
params = (country_filter,)
else:
query = """
SELECT t.valor, COUNT(*) as menciones
FROM tags t
JOIN tags_noticia tn ON tn.tag_id = t.id
JOIN traducciones tr ON tn.traduccion_id = tr.id
JOIN noticias n ON tr.noticia_id = n.id
WHERE t.tipo = 'lugar'
AND tr.created_at >= NOW() - INTERVAL '30 days'
AND n.pais_id != (SELECT id FROM paises WHERE nombre = 'España')
GROUP BY t.valor
ORDER BY menciones DESC
LIMIT 50
"""
params = ()
with get_read_conn() as conn:
with conn.cursor() as cur:
cur.execute(query, params)
rows = cur.fetchall()
# Normalize
normalized_rows = aggregate_normalized_entities(rows, entity_type='lugar')
# Enrich with Wikipedia Images
from concurrent.futures import ThreadPoolExecutor
from utils.wiki import fetch_wiki_data
images = []
summaries = []
def get_info_safe(name):
try:
return fetch_wiki_data(name)
except Exception:
return None, None
if normalized_rows:
names = [row[0] for row in normalized_rows]
with ThreadPoolExecutor(max_workers=10) as executor:
results = list(executor.map(get_info_safe, names))
for img, smry in results:
images.append(img)
summaries.append(smry)
result = {
"labels": [row[0] for row in normalized_rows],
"data": [row[1] for row in normalized_rows],
"images": images,
"summaries": summaries
}
cache_set(cache_key, result, ttl_seconds=600)
return jsonify(result)

81
routers/topics.py Normal file
View file

@ -0,0 +1,81 @@
from flask import Blueprint, render_template, request
from db import get_read_conn
from psycopg2 import extras
import datetime
topics_bp = Blueprint("topics", __name__, url_prefix="/topics")
@topics_bp.route("/")
def monitor():
# Country impact monitor
days = int(request.args.get("days", 3))
with get_read_conn() as conn, conn.cursor(cursor_factory=extras.DictCursor) as cur:
# Rank countries by "heat" (sum of topic scores of recent news)
cur.execute("""
SELECT p.id, p.nombre,
COUNT(DISTINCT n.id) as news_count,
SUM(nt.score) as total_impact
FROM paises p
JOIN noticias n ON n.pais_id = p.id
JOIN news_topics nt ON nt.noticia_id = n.id
WHERE n.fecha > NOW() - INTERVAL '%s days'
GROUP BY p.id, p.nombre
HAVING SUM(nt.score) > 0
ORDER BY total_impact DESC
LIMIT 50;
""", (days,))
countries = cur.fetchall()
return render_template("monitor_list.html", countries=countries, days=days)
@topics_bp.route("/country/<int:pais_id>")
def country_detail(pais_id):
days = int(request.args.get("days", 3))
with get_read_conn() as conn, conn.cursor(cursor_factory=extras.DictCursor) as cur:
# Country info
cur.execute("SELECT * FROM paises WHERE id = %s", (pais_id,))
pais = cur.fetchone()
if not pais:
return "País no encontrado", 404
# Top impact news (last few days)
# News ranked by the sum of their topic scores
cur.execute("""
SELECT n.id,
COALESCE(t.titulo_trad, n.titulo) as titulo,
COALESCE(t.resumen_trad, n.resumen) as resumen,
n.fecha, n.imagen_url, n.fuente_nombre, n.url,
SUM(nt.score) as impact_score
FROM noticias n
JOIN news_topics nt ON nt.noticia_id = n.id
LEFT JOIN traducciones t ON t.noticia_id = n.id AND t.lang_to = 'es' AND t.status = 'done'
WHERE n.pais_id = %s
AND n.fecha > NOW() - INTERVAL '%s days'
GROUP BY n.id, n.titulo, n.resumen, n.fecha, n.imagen_url, n.fuente_nombre, n.url, t.titulo_trad, t.resumen_trad
ORDER BY impact_score DESC
LIMIT 20;
""", (pais_id, days))
top_news = cur.fetchall()
# Active topics in this country
cur.execute("""
SELECT t.name, SUM(nt.score) as topic_volume
FROM topics t
JOIN news_topics nt ON nt.topic_id = t.id
JOIN noticias n ON n.id = nt.noticia_id
WHERE n.pais_id = %s
AND n.fecha > NOW() - INTERVAL '%s days'
GROUP BY t.id, t.name
ORDER BY topic_volume DESC
LIMIT 10;
""", (pais_id, days))
active_topics = cur.fetchall()
return render_template("monitor_detail.html",
pais=pais,
news=top_news,
active_topics=active_topics,
days=days)

59
routers/traducciones.py Normal file
View file

@ -0,0 +1,59 @@
from flask import Blueprint, render_template, request
from db import get_read_conn
traducciones_bp = Blueprint("traducciones", __name__)
@traducciones_bp.route("/traducciones")
def ultimas_traducciones():
"""Muestra las últimas noticias traducidas."""
page = max(int(request.args.get("page", 1)), 1)
per_page = min(max(int(request.args.get("per_page", 20)), 10), 100)
offset = (page - 1) * per_page
with get_read_conn() as conn:
conn.autocommit = True
with conn.cursor() as cur:
# Total count
cur.execute("""
SELECT COUNT(*) FROM traducciones WHERE status = 'done'
""")
total = cur.fetchone()[0]
# Fetch latest translations
cur.execute("""
SELECT
t.id,
t.noticia_id,
t.titulo_trad,
t.resumen_trad,
t.lang_from,
t.lang_to,
t.created_at AS updated_at,
n.url AS link,
n.imagen_url AS imagen,
n.fuente_nombre AS feed_nombre,
c.nombre AS categoria_nombre,
p.nombre AS pais_nombre
FROM traducciones t
JOIN noticias n ON n.id = t.noticia_id
LEFT JOIN categorias c ON c.id = n.categoria_id
LEFT JOIN paises p ON p.id = n.pais_id
WHERE t.status = 'done'
ORDER BY t.created_at DESC
LIMIT %s OFFSET %s
""", (per_page, offset))
columns = [desc[0] for desc in cur.description]
traducciones = [dict(zip(columns, row)) for row in cur.fetchall()]
total_pages = (total + per_page - 1) // per_page
return render_template(
"traducciones.html",
traducciones=traducciones,
page=page,
per_page=per_page,
total=total,
total_pages=total_pages,
)

81
routers/urls.py Normal file
View file

@ -0,0 +1,81 @@
from flask import Blueprint, render_template, request, redirect, flash, url_for
from psycopg2 import extras
from db import get_conn
from models.categorias import get_categorias
from models.paises import get_paises
urls_bp = Blueprint("urls", __name__, url_prefix="/urls")
@urls_bp.route("/")
def manage_urls():
with get_conn() as conn, conn.cursor(cursor_factory=extras.DictCursor) as cur:
cur.execute(
"""
SELECT fu.id, fu.nombre, fu.url,
c.nombre AS categoria,
p.nombre AS pais,
fu.idioma,
fu.last_check,
fu.last_status,
fu.status_message,
fu.last_http_code,
COALESCE((
SELECT COUNT(*)
FROM noticias n
JOIN feeds f ON n.fuente_nombre = f.nombre
WHERE f.fuente_url_id = fu.id
), 0) as noticias_count
FROM fuentes_url fu
LEFT JOIN categorias c ON c.id=fu.categoria_id
LEFT JOIN paises p ON p.id=fu.pais_id
ORDER BY fu.nombre;
"""
)
fuentes = cur.fetchall()
return render_template("urls_list.html", fuentes=fuentes)
@urls_bp.route("/add_source", methods=["GET", "POST"])
def add_url_source():
with get_conn() as conn:
categorias = get_categorias(conn)
paises = get_paises(conn)
if request.method == "POST":
nombre = request.form.get("nombre")
url = request.form.get("url")
categoria_id = request.form.get("categoria_id")
pais_id = request.form.get("pais_id")
idioma = (request.form.get("idioma", "es") or "es").strip().lower()[:2]
try:
with conn.cursor() as cur:
cur.execute(
"""
INSERT INTO fuentes_url (nombre, url, categoria_id, pais_id, idioma)
VALUES (%s, %s, %s, %s, %s)
ON CONFLICT (url) DO UPDATE
SET nombre=EXCLUDED.nombre,
categoria_id=EXCLUDED.categoria_id,
pais_id=EXCLUDED.pais_id,
idioma=EXCLUDED.idioma;
""",
(
nombre,
url,
int(categoria_id) if categoria_id else None,
int(pais_id) if pais_id else None,
idioma,
),
)
conn.commit()
flash("Fuente añadida/actualizada.", "success")
return redirect(url_for("urls.manage_urls"))
except Exception as e:
flash(f"Error: {e}", "error")
return render_template("add_url_source.html", categorias=categorias, paises=paises)

28
rss-ingestor-go/Dockerfile Normal file
View file

@ -0,0 +1,28 @@
# Build stage
FROM golang:1.21-alpine AS builder
WORKDIR /app
# Install git and SSL certs
RUN apk add --no-cache git ca-certificates
# Copy source code immediately
COPY . .
# Download dependencies
RUN go mod tidy && go mod download
# Build the Go app
RUN CGO_ENABLED=0 GOOS=linux go build -a -installsuffix cgo -o rss-ingestor .
# Final stage
FROM alpine:latest
WORKDIR /root/
# Copy the Pre-built binary file from the previous stage
COPY --from=builder /app/rss-ingestor .
COPY --from=builder /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/
# Command to run the executable
CMD ["./rss-ingestor"]

8
rss-ingestor-go/go.mod Normal file
View file

@ -0,0 +1,8 @@
module rss-ingestor-go
go 1.21
require (
github.com/lib/pq v1.10.9
github.com/mmcdole/gofeed v1.2.1
)

458
rss-ingestor-go/main.go Normal file
View file

@ -0,0 +1,458 @@
package main
import (
"crypto/md5"
"database/sql"
"encoding/hex"
"fmt"
"log"
"net/http"
"os"
"strconv"
"strings"
"sync"
"time"
"github.com/lib/pq"
"github.com/mmcdole/gofeed"
)
// Config holds the configuration loaded from environment variables
type Config struct {
DBHost string
DBPort string
DBUser string
DBPass string
DBName string
MaxWorkers int
MaxFailures int
PokeInterval time.Duration
FeedTimeout int
}
// Feed represents a row in the feeds table
type Feed struct {
ID int
Nombre string
URL string
CategoriaID sql.NullInt64
PaisID sql.NullInt64
LastEtag sql.NullString
LastModified sql.NullString
Fallos int
}
// Noticia represents a news item to be inserted
type Noticia struct {
ID string
Titulo string
Resumen string
URL string
Fecha time.Time
ImagenURL string
FuenteNombre string
CategoriaID sql.NullInt64
PaisID sql.NullInt64
}
var (
db *sql.DB
config Config
)
func loadConfig() {
config = Config{
DBHost: getEnv("DB_HOST", "localhost"),
DBPort: getEnv("DB_PORT", "5432"),
DBUser: getEnv("DB_USER", "rss"),
DBPass: getEnv("DB_PASS", "x"),
DBName: getEnv("DB_NAME", "rss"),
MaxWorkers: getEnvInt("RSS_MAX_WORKERS", 20), // Default to higher concurrency in Go
MaxFailures: getEnvInt("RSS_MAX_FAILURES", 10),
PokeInterval: time.Duration(getEnvInt("RSS_POKE_INTERVAL_MIN", 8)) * time.Minute,
FeedTimeout: getEnvInt("RSS_FEED_TIMEOUT", 60),
}
}
func getEnv(key, fallback string) string {
if value, ok := os.LookupEnv(key); ok {
return value
}
return fallback
}
func getEnvInt(key string, fallback int) int {
strValue := getEnv(key, "")
if strValue == "" {
return fallback
}
val, err := strconv.Atoi(strValue)
if err != nil {
return fallback
}
return val
}
func initDB() {
connStr := fmt.Sprintf("host=%s port=%s user=%s password=%s dbname=%s sslmode=disable",
config.DBHost, config.DBPort, config.DBUser, config.DBPass, config.DBName)
var err error
db, err = sql.Open("postgres", connStr)
if err != nil {
log.Fatalf("Error opening DB: %v", err)
}
db.SetMaxOpenConns(config.MaxWorkers + 5)
db.SetMaxIdleConns(config.MaxWorkers)
db.SetConnMaxLifetime(5 * time.Minute)
if err = db.Ping(); err != nil {
log.Fatalf("Error connecting to DB: %v", err)
}
log.Println("Database connection established")
}
func getActiveFeeds() ([]Feed, error) {
query := `
SELECT id, nombre, url, categoria_id, pais_id, last_etag, last_modified, COALESCE(fallos, 0)
FROM feeds
WHERE activo = TRUE AND (fallos IS NULL OR fallos < $1)
ORDER BY id
`
rows, err := db.Query(query, config.MaxFailures)
if err != nil {
return nil, err
}
defer rows.Close()
var feeds []Feed
for rows.Next() {
var f Feed
if err := rows.Scan(&f.ID, &f.Nombre, &f.URL, &f.CategoriaID, &f.PaisID, &f.LastEtag, &f.LastModified, &f.Fallos); err != nil {
log.Printf("Error scanning feed: %v", err)
continue
}
feeds = append(feeds, f)
}
return feeds, nil
}
func generateID(link string) string {
hash := md5.Sum([]byte(link))
return hex.EncodeToString(hash[:])
}
// cleanHTML normalizes a summary string. The Python version used
// BeautifulSoup to strip tags; a strict Go port would need a sanitizer
// such as bluemonday. To keep dependencies low we only trim whitespace
// here and let the frontend handle rendering and sanitization.
func cleanHTML(input string) string {
return strings.TrimSpace(input)
}
func extractImage(item *gofeed.Item) string {
if item.Image != nil && item.Image.URL != "" {
return item.Image.URL
}
if len(item.Enclosures) > 0 {
for _, enc := range item.Enclosures {
if strings.HasPrefix(enc.Type, "image/") {
return enc.URL
}
}
}
// Try extensions
if ex, ok := item.Extensions["media"]; ok {
if content, ok := ex["content"]; ok {
for _, c := range content {
if url, ok := c.Attrs["url"]; ok {
return url
}
}
}
if thumb, ok := ex["thumbnail"]; ok {
for _, c := range thumb {
if url, ok := c.Attrs["url"]; ok {
return url
}
}
}
}
return ""
}
func processFeed(fp *gofeed.Parser, feed Feed, results chan<- int) {
// Configure custom HTTP client with timeout and User-Agent
client := &http.Client{
Timeout: time.Duration(config.FeedTimeout) * time.Second,
}
// Create request to set User-Agent
req, err := http.NewRequest("GET", feed.URL, nil)
if err != nil {
log.Printf("[Feed %d] Error creating request: %v", feed.ID, err)
updateFeedStatus(feed.ID, "", "", false, err.Error())
results <- 0
return
}
req.Header.Set("User-Agent", "RSS2-Ingestor-Go/1.0")
// NOTE: We INTENTIONALLY SKIP ETag/Last-Modified headers based on user issues
// If needed in future, uncomment:
// if feed.LastEtag.Valid { req.Header.Set("If-None-Match", feed.LastEtag.String) }
// if feed.LastModified.Valid { req.Header.Set("If-Modified-Since", feed.LastModified.String) }
resp, err := client.Do(req)
if err != nil {
log.Printf("[Feed %d] Error fetching: %v", feed.ID, err)
updateFeedStatus(feed.ID, "", "", false, err.Error())
results <- 0
return
}
defer resp.Body.Close()
if resp.StatusCode == 304 {
log.Printf("[Feed %d] Not Modified (304)", feed.ID)
// Keep the stored validators and reset the failure counter
updateFeedStatus(feed.ID, feed.LastEtag.String, feed.LastModified.String, true, "")
results <- 0
return
}
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
errMsg := fmt.Sprintf("HTTP %d", resp.StatusCode)
log.Printf("[Feed %d] Error: %s", feed.ID, errMsg)
updateFeedStatus(feed.ID, "", "", false, errMsg)
results <- 0
return
}
parsedFeed, err := fp.Parse(resp.Body)
if err != nil {
log.Printf("[Feed %d] Parser Error: %v", feed.ID, err)
updateFeedStatus(feed.ID, "", "", false, err.Error())
results <- 0
return
}
// Prepare news items
var noticias []Noticia
for _, item := range parsedFeed.Items {
if item.Link == "" {
continue
}
pubDate := time.Now()
if item.PublishedParsed != nil {
pubDate = *item.PublishedParsed
} else if item.UpdatedParsed != nil {
pubDate = *item.UpdatedParsed
}
// HTML cleanup simply takes Description or Content
resumen := item.Description
if resumen == "" {
resumen = item.Content
}
noticia := Noticia{
ID: generateID(item.Link),
Titulo: item.Title,
Resumen: cleanHTML(resumen),
URL: item.Link,
Fecha: pubDate,
ImagenURL: extractImage(item),
FuenteNombre: feed.Nombre,
CategoriaID: feed.CategoriaID,
PaisID: feed.PaisID,
}
noticias = append(noticias, noticia)
}
inserted := insertNoticias(noticias)
// Get new headers
newEtag := resp.Header.Get("ETag")
newModified := resp.Header.Get("Last-Modified")
updateFeedStatus(feed.ID, newEtag, newModified, true, "")
if inserted > 0 {
log.Printf("[Feed %d] Inserted %d new items", feed.ID, inserted)
}
results <- inserted
}
func insertNoticias(noticias []Noticia) int {
if len(noticias) == 0 {
return 0
}
// lib/pq's CopyIn does not support ON CONFLICT DO NOTHING, and RSS feeds
// repeat items constantly, so COPY straight into the main table would
// abort on the first duplicate. Use a multi-row INSERT ... ON CONFLICT
// DO NOTHING instead; at 10-20 items per feed this is fast enough when
// feeds are processed in parallel.
return insertNoticiasWithConflict(noticias)
}
func insertNoticiasWithConflict(noticias []Noticia) int {
// Efficient bulk insert for Postgres using unnest
// Or standard multi-value insert.
count := 0
// Chunking to avoid parameter limit (65535)
chunkSize := 500
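// 9 bind parameters per row against a Postgres cap of 65535 parameters
// would allow ~7281 rows per statement; 500 keeps statements small.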
for i := 0; i < len(noticias); i += chunkSize {
end := i + chunkSize
if end > len(noticias) {
end = len(noticias)
}
chunk := noticias[i:end]
placeholders := []string{}
vals := []interface{}{}
for j, n := range chunk {
offset := j * 9
placeholders = append(placeholders, fmt.Sprintf("($%d, $%d, $%d, $%d, $%d, $%d, $%d, $%d, $%d)",
offset+1, offset+2, offset+3, offset+4, offset+5, offset+6, offset+7, offset+8, offset+9))
vals = append(vals, n.ID, n.Titulo, n.Resumen, n.URL, n.Fecha, n.ImagenURL, n.FuenteNombre, n.CategoriaID, n.PaisID)
}
query := fmt.Sprintf(`
INSERT INTO noticias (id, titulo, resumen, url, fecha, imagen_url, fuente_nombre, categoria_id, pais_id)
VALUES %s
ON CONFLICT (url) DO NOTHING
`, strings.Join(placeholders, ","))
res, err := db.Exec(query, vals...)
if err != nil {
log.Printf("Batch insert error: %v", err)
continue
}
rowsAff, _ := res.RowsAffected()
count += int(rowsAff)
}
return count
}
func updateFeedStatus(id int, etag, modified string, success bool, lastError string) {
var query string
var args []interface{}
if success {
query = `UPDATE feeds SET fallos = 0, last_etag = $1, last_modified = $2, last_error = NULL WHERE id = $3`
args = []interface{}{etag, modified, id}
} else {
// Increment failure count
query = `
UPDATE feeds
SET fallos = COALESCE(fallos, 0) + 1,
last_error = $1,
activo = CASE WHEN COALESCE(fallos, 0) + 1 >= $2 THEN FALSE ELSE activo END
WHERE id = $3`
args = []interface{}{lastError, config.MaxFailures, id}
}
_, err := db.Exec(query, args...)
if err != nil {
log.Printf("Error updating feed %d status: %v", id, err)
}
}
func ingestCycle() {
log.Println("Starting Ingestion Cycle...")
start := time.Now()
feeds, err := getActiveFeeds()
if err != nil {
log.Printf("Error getting feeds: %v", err)
return
}
if len(feeds) == 0 {
log.Println("No active feeds found.")
return
}
log.Printf("Processing %d feeds with %d workers...", len(feeds), config.MaxWorkers)
jobs := make(chan Feed, len(feeds))
results := make(chan int, len(feeds))
// Start workers
var wg sync.WaitGroup
for w := 0; w < config.MaxWorkers; w++ {
wg.Add(1)
go func() {
defer wg.Done()
fp := gofeed.NewParser()
for feed := range jobs {
processFeed(fp, feed, results)
}
}()
}
// Send jobs
for _, f := range feeds {
jobs <- f
}
close(jobs)
// Wait for workers in background to close results when done
go func() {
wg.Wait()
close(results)
}()
// Count results
totalNew := 0
for inserted := range results {
totalNew += inserted
}
duration := time.Since(start)
log.Printf("Ingestion Cycle Complete. Processed %d feeds in %v. New items: %d", len(feeds), duration, totalNew)
}
func main() {
loadConfig()
initDB()
// Run immediately on start
ingestCycle()
// Scheduler loop
ticker := time.NewTicker(config.PokeInterval)
defer ticker.Stop()
for range ticker.C {
ingestCycle()
}
}

27
rss-web-go/Dockerfile Normal file
View file

@ -0,0 +1,27 @@
FROM golang:1.21-alpine AS builder
WORKDIR /app
# Install git
RUN apk add --no-cache git
# Copy files
COPY . .
# Download dependencies
RUN go mod tidy && go mod download
# Build
RUN CGO_ENABLED=0 GOOS=linux go build -o rss-web .
# Final stage
FROM alpine:latest
WORKDIR /root/
COPY --from=builder /app/rss-web .
# Copy static assets and templates manually for now (assuming context is root)
# In docker-compose we will mount volumes or copy them
# COPY templates ./templates
# COPY static ./static
CMD ["./rss-web"]

9
rss-web-go/go.mod Normal file
View file

@ -0,0 +1,9 @@
module rss-web-go
go 1.21
require (
github.com/gin-gonic/gin v1.9.1
github.com/lib/pq v1.10.9
github.com/joho/godotenv v1.5.1
)

108
rss-web-go/main.go Normal file
View file

@ -0,0 +1,108 @@
package main
import (
"database/sql"
"fmt"
"log"
"os"
"html/template"
"time"
"github.com/gin-gonic/gin"
_ "github.com/lib/pq"
)
var db *sql.DB
func initDB() {
connStr := fmt.Sprintf("host=%s port=%s user=%s password=%s dbname=%s sslmode=disable",
os.Getenv("DB_HOST"), os.Getenv("DB_PORT"), os.Getenv("DB_USER"), os.Getenv("DB_PASS"), os.Getenv("DB_NAME"))
var err error
db, err = sql.Open("postgres", connStr)
if err != nil {
log.Fatal(err)
}
db.SetMaxOpenConns(25)
db.SetMaxIdleConns(25)
db.SetConnMaxLifetime(5 * time.Minute)
if err = db.Ping(); err != nil {
log.Fatalf("Cannot connect to DB: %v", err)
}
log.Println("Connected to Database")
}
// Template Functions (to replace Jinja filters)
var funcMap = template.FuncMap{
"safe_html": func(s string) template.HTML {
return template.HTML(s)
},
"format_date": func(t time.Time) string {
return t.Format("02/01/2006")
},
"country_flag": func(code string) string {
// Placeholder logic, real logic needs country mapping
return "🏳️"
},
}
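// A minimal sketch of a real flag filter, assuming ISO 3166-1 alpha-2
// input ("ES" -> 🇪🇸): map each letter to its Unicode regional indicator
// symbol (U+1F1E6 + offset from 'A'). Hypothetical helper, not wired in yet.
func countryFlagFromISO(code string) string {
	if len(code) != 2 {
		return "🏳️"
	}
	flag := make([]rune, 0, 2)
	for _, c := range code {
		if c >= 'a' && c <= 'z' {
			c -= 'a' - 'A' // uppercase ASCII letters
		}
		if c < 'A' || c > 'Z' {
			return "🏳️" // not a two-letter ASCII code
		}
		flag = append(flag, 0x1F1E6+c-'A')
	}
	return string(flag)
}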
func main() {
// Debug mode for now
gin.SetMode(gin.DebugMode)
initDB()
r := gin.Default()
// Load templates with FuncMap.
// The Jinja templates need their syntax adapted before Go's html/template
// can parse them, so template loading stays disabled for now:
// r.SetFuncMap(funcMap)
// r.LoadHTMLGlob("templates/*")
// Static Files
r.Static("/static", "./static")
// Routes
r.GET("/", homeHandler)
r.GET("/health", func(c *gin.Context) {
c.JSON(200, gin.H{"status": "ok", "engine": "golang"})
})
port := os.Getenv("PORT")
if port == "" {
port = "8001"
}
log.Printf("Starting Server on port %s", port)
if err := r.Run("0.0.0.0:" + port); err != nil {
log.Fatalf("Server failed to start: %v", err)
}
}
func homeHandler(c *gin.Context) {
// Simple query for testing connectivity
rows, err := db.Query("SELECT titulo, url FROM noticias ORDER BY fecha DESC LIMIT 10")
if err != nil {
c.String(500, "DB Error: %v", err)
return
}
defer rows.Close()
var news []map[string]string
for rows.Next() {
var titulo, url string
if err := rows.Scan(&titulo, &url); err != nil {
continue
}
news = append(news, map[string]string{"titulo": titulo, "url": url})
}
// For now, return JSON to prove it works before porting the complex HTML
c.JSON(200, gin.H{
"message": "Welcome to RSS2 Go Web Server",
"news": news,
})
}

87
scheduler.py Normal file
View file

@ -0,0 +1,87 @@
import time
import logging
import atexit
from datetime import datetime, timedelta
import sys
from apscheduler.schedulers.background import BackgroundScheduler
# Load the Flask app
from app import app
# Import the translation producer operations
from translation_ops import run_producer_cycle
logging.basicConfig(
stream=sys.stdout,
level=logging.INFO,
format='[%(asctime)s] %(levelname)s in %(module)s: %(message)s'
)
scheduler = BackgroundScheduler(
daemon=True,
timezone="UTC"
)
def shutdown_scheduler():
"""Detiene el planificador al salir del proceso de forma segura."""
try:
if scheduler.running:
scheduler.shutdown(wait=False)
logging.info("Scheduler detenido correctamente.")
except Exception as e:
logging.error(f"Error al detener el scheduler: {e}")
# Register clean shutdown
atexit.register(shutdown_scheduler)
if __name__ == "__main__":
# Enter the Flask app context (required by parts of the project)
with app.app_context():
try:
# Job 1: RSS fetching -> MOVED TO GO (rss-ingestor-go)
# This scheduler no longer handles news ingestion.
# Job 2: Translation Producer
scheduler.add_job(
run_producer_cycle,
trigger="interval",
minutes=1,
id="translation_producer_job",
next_run_time=datetime.utcnow() + timedelta(seconds=5),
max_instances=1,
coalesce=True,
)
# Job 3: Precache Entities
from scripts.precache_entities import run_precache
scheduler.add_job(
run_precache,
trigger="interval",
hours=6,
id="precache_entities_job",
next_run_time=datetime.utcnow() + timedelta(seconds=20),
max_instances=1,
coalesce=True,
)
scheduler.start()
logging.info("Scheduler iniciado correctamente.")
logging.info("Tareas activas: translation_producer_job, precache_entities_job")
except Exception as e:
logging.exception(f"Error inicializando el scheduler: {e}")
sys.exit(1)
# Keep the process alive (required for Docker)
try:
while True:
time.sleep(60)
except (KeyboardInterrupt, SystemExit):
logging.info("Apagando el scheduler worker...")

View file

@ -0,0 +1,67 @@
#!/usr/bin/env python3
"""
Script para limpiar caracteres <unk> de las traducciones.
"""
import re
from db import get_conn
def clean_text(text):
"""Remove <unk> tokens and other problematic characters."""
if not text:
return text
# Remove <unk> tokens and stray replacement characters
text = text.replace('<unk>', '')
# Assumption: the literal shown as <EFBFBD> in the diff viewer is the
# U+FFFD replacement character (UTF-8 bytes EF BF BD)
text = text.replace('\ufffd', '')
# Remove other problematic Unicode characters
text = re.sub(r'[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F-\x9F]', '', text)
return text.strip()
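# Example (hypothetical input):
#   clean_text('Breaking<unk> news\x07 today ') -> 'Breaking news today'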
def main():
"""Clean all translations with <unk> tokens."""
print("🧹 Limpiando tokens <unk> de traducciones...")
with get_conn() as conn:
with conn.cursor() as cur:
# Find translations containing <unk> tokens or U+FFFD characters
cur.execute("""
SELECT id, titulo_trad, resumen_trad
FROM traducciones
WHERE titulo_trad LIKE '%<unk>%'
OR resumen_trad LIKE '%<unk>%'
OR titulo_trad LIKE '%\ufffd%'
OR resumen_trad LIKE '%\ufffd%'
""")
translations = cur.fetchall()
print(f"📊 Encontradas {len(translations)} traducciones con tokens problemáticos")
if not translations:
print("✅ No hay traducciones que limpiar")
return
updated_count = 0
for row in translations:
tr_id, titulo, resumen = row
# Clean the fields
new_titulo = clean_text(titulo) if titulo else titulo
new_resumen = clean_text(resumen) if resumen else resumen
# Update only if something changed
if new_titulo != titulo or new_resumen != resumen:
cur.execute("""
UPDATE traducciones
SET titulo_trad = %s,
resumen_trad = %s
WHERE id = %s
""", (new_titulo, new_resumen, tr_id))
updated_count += 1
if updated_count % 100 == 0:
print(f" ⏳ Processed {updated_count} translations...")
conn.commit()
print(f"✅ Limpieza completada: {updated_count} traducciones actualizadas")
if __name__ == "__main__":
main()

39
scripts/convert_model.sh Executable file
View file

@ -0,0 +1,39 @@
#!/bin/bash
# Convert the NLLB model from HuggingFace to CTranslate2 format.
# Run once before using the translation_worker with CTranslate2.
set -e
MODEL=${UNIVERSAL_MODEL:-"facebook/nllb-200-distilled-600M"}
OUTPUT_DIR=${CT2_MODEL_PATH:-"./models/nllb-ct2"}
QUANTIZATION=${CT2_QUANTIZATION:-"int8_float16"}
echo "=== Conversión de modelo NLLB a CTranslate2 ==="
echo "Modelo origen: $MODEL"
echo "Directorio destino: $OUTPUT_DIR"
echo "Quantización: $QUANTIZATION"
echo ""
# Verificar que ctranslate2 está instalado
if ! command -v ct2-transformers-converter &> /dev/null; then
echo "Error: ct2-transformers-converter no encontrado."
echo "Instala con: pip install ctranslate2"
exit 1
fi
# Create the directory if it doesn't exist
mkdir -p "$(dirname "$OUTPUT_DIR")"
# Convert the model
echo "Starting conversion (this can take 5-10 minutes)..."
ct2-transformers-converter \
--model "$MODEL" \
--output_dir "$OUTPUT_DIR" \
--quantization "$QUANTIZATION" \
--force
echo ""
echo "✓ Conversión completada: $OUTPUT_DIR"
echo ""
echo "Para usar el modelo, establece:"
echo " export CT2_MODEL_PATH=$OUTPUT_DIR"

View file

@ -0,0 +1,77 @@
#!/bin/bash
# Example script to create video grids (parrillas)
echo "🎬 Creating example grids..."
# 1. Bulgaria news
docker-compose exec -T db psql -U rss -d rss << EOF
INSERT INTO video_parrillas (
nombre, descripcion, tipo_filtro,
pais_id, max_noticias, duracion_maxima,
idioma_voz, template, include_images, include_subtitles,
frecuencia, activo
) VALUES (
'Noticias de Bulgaria',
'Resumen diario de las noticias más importantes de Bulgaria',
'pais',
(SELECT id FROM paises WHERE nombre ILIKE '%bulgaria%' LIMIT 1),
5, 180,
'es', 'standard', true, true,
'daily', true
) ON CONFLICT DO NOTHING;
EOF
# 2. Science in Europe
docker-compose exec -T db psql -U rss -d rss << EOF
INSERT INTO video_parrillas (
nombre, descripcion, tipo_filtro,
categoria_id, continente_id, max_noticias,
idioma_voz, template, include_subtitles,
frecuencia, activo
) VALUES (
'Ciencia en Europa',
'Las últimas noticias científicas de Europa',
'categoria',
(SELECT id FROM categorias WHERE nombre ILIKE '%ciencia%' LIMIT 1),
(SELECT id FROM continentes WHERE nombre = 'Europa' LIMIT 1),
7,
'es', 'modern', true,
'daily', true
) ON CONFLICT DO NOTHING;
EOF
# 3. Global technology
docker-compose exec -T db psql -U rss -d rss << EOF
INSERT INTO video_parrillas (
nombre, descripcion, tipo_filtro,
categoria_id, max_noticias, duracion_maxima,
idioma_voz, template, include_subtitles,
frecuencia, activo
) VALUES (
'Tech News Daily',
'Resumen diario de tecnología mundial',
'categoria',
(SELECT id FROM categorias WHERE nombre ILIKE '%tecnolog%' LIMIT 1),
8, 300,
'es', 'modern', true,
'daily', true
) ON CONFLICT DO NOTHING;
EOF
echo "✅ Parrillas creadas!"
echo ""
echo "📊 Ver parrillas creadas:"
docker-compose exec -T db psql -U rss -d rss -c "
SELECT id, nombre, tipo_filtro, max_noticias, frecuencia, activo
FROM video_parrillas
ORDER BY id DESC;
"
echo ""
echo "🎥 Accede a la interfaz web en: http://localhost:8001/parrillas/"
echo ""
echo "💡 Para generar un video manualmente:"
echo " docker-compose exec web python generar_videos_noticias.py <id_parrilla>"
echo ""
echo "📅 Para generar todos los videos del día:"
echo " docker-compose exec web python generar_videos_noticias.py"

64
scripts/diagnose_rss.py Normal file
View file

@ -0,0 +1,64 @@
import os
import psycopg2
from datetime import datetime
# Database configuration
DB_WRITE_HOST = os.environ.get("DB_WRITE_HOST", "db")
DB_NAME = os.environ.get("DB_NAME", "rss")
DB_USER = os.environ.get("DB_USER", "rss")
DB_PASS = os.environ.get("DB_PASS", "x")
DB_PORT = os.environ.get("DB_PORT", "5432")
def check_db():
try:
conn = psycopg2.connect(
host=DB_WRITE_HOST,
database=DB_NAME,
user=DB_USER,
password=DB_PASS,
port=DB_PORT,
connect_timeout=5
)
print("✅ Database connection successful.")
with conn.cursor() as cur:
# 1. Total news and latest date
cur.execute("SELECT COUNT(*), MAX(fecha) FROM noticias;")
count, latest = cur.fetchone()
print(f"📊 Total news: {count}")
print(f"🕒 Latest news date: {latest}")
# 2. Feed status
cur.execute("SELECT COUNT(*) FROM feeds WHERE activo = TRUE;")
active_feeds = cur.fetchone()[0]
cur.execute("SELECT COUNT(*) FROM feeds WHERE activo = FALSE;")
inactive_feeds = cur.fetchone()[0]
print(f"📡 Active feeds: {active_feeds}")
print(f"🚫 Inactive feeds: {inactive_feeds}")
# 3. Feeds with most failures
cur.execute("SELECT id, nombre, url, fallos, last_error FROM feeds WHERE fallos > 0 ORDER BY fallos DESC LIMIT 5;")
failures = cur.fetchall()
if failures:
print("\n⚠️ Feeds with most failures:")
for f in failures:
print(f" - ID {f[0]}: {f[1]} ({f[3]} fallos) - Error: {f[4]}")
else:
print("\n✅ No feeds with reported failures.")
# 4. Check for unprocessed translations (if applicable).
# The 'noticias' table has no 'translated' flag, so look at the
# 'traducciones' table instead, if it exists.
cur.execute("SELECT EXISTS (SELECT FROM information_schema.tables WHERE table_name = 'traducciones');")
if cur.fetchone()[0]:
cur.execute("SELECT COUNT(*) FROM noticias WHERE id NOT IN (SELECT noticia_id FROM traducciones);")
pending_trans = cur.fetchone()[0]
print(f"🌎 News pending translation: {pending_trans}")
conn.close()
except Exception as e:
print(f"❌ Database error: {e}")
if __name__ == "__main__":
check_db()

View file

@ -0,0 +1,99 @@
import logging
import ssl
import nltk
import os
import urllib.request
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# ================================================================
# Logging
# ================================================================
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(levelname)s - %(message)s"
)
LOG = logging.getLogger("download_models")
# ================================================================
# SSL FIX
# ================================================================
try:
_create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
pass
else:
ssl._create_default_https_context = _create_unverified_https_context
# ================================================================
# Paths and models
# ================================================================
NLTK_PACKAGES = ["punkt", "punkt_tab", "stopwords"]
NLLB_MODEL = "facebook/nllb-200-distilled-600M"
FASTTEXT_URL = "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.218.bin"
FASTTEXT_DEST = "/app/models/lid.218.bin" # where the worker expects it
# ================================================================
# Download NLTK data
# ================================================================
def download_nltk():
for pkg in NLTK_PACKAGES:
try:
path = f"tokenizers/{pkg}" if pkg.startswith("punkt") else f"corpora/{pkg}"
nltk.data.find(path)
LOG.info(f"NLTK '{pkg}' already installed")
except LookupError:
LOG.info(f"Downloading NLTK '{pkg}'...")
nltk.download(pkg, quiet=True)
LOG.info(f"Downloaded OK: {pkg}")
# ================================================================
# Download NLLB
# ================================================================
def download_nllb(model_name: str):
LOG.info(f"Downloading NLLB model: {model_name}")
try:
AutoTokenizer.from_pretrained(model_name)
AutoModelForSeq2SeqLM.from_pretrained(model_name)
LOG.info(f"Downloaded OK: {model_name}")
except Exception as e:
LOG.error(f"Failed downloading NLLB model {model_name}: {e}")
# ================================================================
# Descargar fastText LID.218
# ================================================================
def download_fasttext():
# Create /app/models if it doesn't exist
dest_dir = os.path.dirname(FASTTEXT_DEST)
os.makedirs(dest_dir, exist_ok=True)
# Skip the download if the file already exists
if os.path.exists(FASTTEXT_DEST):
LOG.info(f"fastText LID already exists at {FASTTEXT_DEST}")
return
LOG.info(f"Downloading fastText LID model from {FASTTEXT_URL}")
try:
urllib.request.urlretrieve(FASTTEXT_URL, FASTTEXT_DEST)
LOG.info(f"Downloaded fastText LID model to {FASTTEXT_DEST}")
except Exception as e:
LOG.error(f"Failed to download fastText LID model: {e}")
# ================================================================
# Main
# ================================================================
if __name__ == "__main__":
LOG.info("Downloading NLTK data...")
download_nltk()
LOG.info("Downloading NLLB model...")
download_nllb(NLLB_MODEL)
LOG.info("Downloading fastText LID model...")
download_fasttext()
LOG.info("All downloads completed successfully.")

View file

@ -0,0 +1,71 @@
import html
from db import get_conn
def fix_entities():
print("🔧 Fixing HTML entities in database...")
with get_conn() as conn:
with conn.cursor() as cur:
# 1. Update Noticias
print("Processing 'noticias' table...")
cur.execute("""
SELECT id, titulo, resumen
FROM noticias
WHERE titulo LIKE '%&%;%' OR resumen LIKE '%&%;%'
""")
rows = cur.fetchall()
print(f"Found {len(rows)} rows in 'noticias' to check.")
count = 0
for r in rows:
nid, tit, res = r
new_tit = html.unescape(tit) if tit else tit
new_res = html.unescape(res) if res else res
if new_tit != tit or new_res != res:
cur.execute("""
UPDATE noticias
SET titulo = %s, resumen = %s
WHERE id = %s
""", (new_tit, new_res, nid))
count += 1
if count % 100 == 0:
print(f"Updated {count} noticias...")
print(f"Updated {count} rows in 'noticias'.")
# 2. Update Traducciones
print("\nProcessing 'traducciones' table...")
cur.execute("""
SELECT id, titulo_trad, resumen_trad
FROM traducciones
WHERE titulo_trad LIKE '%&%;%' OR resumen_trad LIKE '%&%;%'
""")
rows = cur.fetchall()
print(f"Found {len(rows)} translations to check.")
count_tr = 0
for r in rows:
tid, tit, res = r
new_tit = html.unescape(tit) if tit else tit
new_res = html.unescape(res) if res else res
if new_tit != tit or new_res != res:
cur.execute("""
UPDATE traducciones
SET titulo_trad = %s, resumen_trad = %s
WHERE id = %s
""", (new_tit, new_res, tid))
count_tr += 1
print(f"Updated {count_tr} rows in 'traducciones'.")
conn.commit()
print("✅ Database cleaning complete.")
if __name__ == "__main__":
fix_entities()

View file

@ -0,0 +1,92 @@
import html
from db import get_conn
def recursive_unescape(text):
if not text:
return text
# Cap the iterations to avoid looping forever on odd edge cases
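# Example: '&amp;amp;quot;' unescapes to '&amp;quot;', then '&quot;', then '"'.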
max_loops = 5
current = text
for _ in range(max_loops):
new_text = html.unescape(current)
if new_text == current:
break
current = new_text
return current
def fix_entities_recursive():
print("🔧 Fixing HTML entities RECURSIVELY in database...")
with get_conn() as conn:
with conn.cursor() as cur:
# 1. Update Noticias
print("Processing 'noticias' table...")
# Select all rows containing '&' so any entity is caught.
# Note: this could be slow on a huge table, but at ~13k rows it's fine.
cur.execute("""
SELECT id, titulo, resumen
FROM noticias
WHERE titulo LIKE '%&%' OR resumen LIKE '%&%'
""")
rows = cur.fetchall()
print(f"Found {len(rows)} candidates in 'noticias'.")
count = 0
for r in rows:
nid, tit, res = r
new_tit = recursive_unescape(tit)
new_res = recursive_unescape(res)
if new_tit != tit or new_res != res:
cur.execute("""
UPDATE noticias
SET titulo = %s, resumen = %s
WHERE id = %s
""", (new_tit, new_res, nid))
count += 1
if count % 100 == 0:
print(f"Updated {count} noticias...")
print(f"Total updated in 'noticias': {count}")
# 2. Update Traducciones
print("\nProcessing 'traducciones' table...")
cur.execute("""
SELECT id, titulo_trad, resumen_trad
FROM traducciones
WHERE titulo_trad LIKE '%&%' OR resumen_trad LIKE '%&%'
""")
rows = cur.fetchall()
print(f"Found {len(rows)} candidates in 'traducciones'.")
count_tr = 0
for r in rows:
tid, tit, res = r
new_tit = recursive_unescape(tit)
new_res = recursive_unescape(res)
if new_tit != tit or new_res != res:
cur.execute("""
UPDATE traducciones
SET titulo_trad = %s, resumen_trad = %s
WHERE id = %s
""", (new_tit, new_res, tid))
count_tr += 1
if count_tr % 100 == 0:
print(f"Updated {count_tr} traducciones...")
print(f"Total updated in 'traducciones': {count_tr}")
conn.commit()
print("✅ Database cleaning complete.")
if __name__ == "__main__":
fix_entities_recursive()

244
scripts/migrate_to_qdrant.py Executable file
View file

@ -0,0 +1,244 @@
#!/usr/bin/env python3
"""
Script de migración para vectorizar noticias existentes en Qdrant.
Uso:
# Ver estadísticas
python scripts/migrate_to_qdrant.py --stats
# Vectorizar noticias (proceso completo)
python scripts/migrate_to_qdrant.py --vectorize --batch-size 200
# Limpiar y empezar de nuevo
python scripts/migrate_to_qdrant.py --reset
"""
import os
import sys
import argparse
import time
from datetime import datetime
# Add the repo root to the path
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from db import get_read_conn, get_write_conn
def get_statistics():
"""
Print system statistics.
"""
print("\n" + "=" * 80)
print("📊 SYSTEM STATISTICS")
print("=" * 80)
with get_read_conn() as conn:
with conn.cursor() as cur:
# Total translations
cur.execute("""
SELECT
COUNT(*) as total,
COUNT(*) FILTER (WHERE lang_to = 'es') as es,
COUNT(*) FILTER (WHERE status = 'done') as completadas
FROM traducciones
""")
row = cur.fetchone()
print(f"\n📰 TRADUCCIONES:")
print(f" Total: {row[0]:,}")
print(f" En español: {row[1]:,}")
print(f" Completadas: {row[2]:,}")
# Estado vectorización
cur.execute("""
SELECT
COUNT(*) as total,
COUNT(*) FILTER (WHERE vectorized = TRUE) as vectorizadas,
COUNT(*) FILTER (WHERE vectorized = FALSE AND status = 'done') as pendientes
FROM traducciones
WHERE lang_to = 'es'
""")
row = cur.fetchone()
print(f"\n🔧 VECTORIZACIÓN:")
print(f" Total (ES): {row[0]:,}")
print(f" Vectorizadas: {row[1]:,}")
print(f" Pendientes: {row[2]:,}")
# Info de Qdrant (si existe)
try:
from qdrant_client import QdrantClient
qdrant_host = os.environ.get("QDRANT_HOST", "localhost")
qdrant_port = int(os.environ.get("QDRANT_PORT", "6333"))
collection_name = os.environ.get("QDRANT_COLLECTION_NAME", "news_vectors")
client = QdrantClient(host=qdrant_host, port=qdrant_port)
collection_info = client.get_collection(collection_name)
print(f"\n🔍 QDRANT:")
print(f" Colección: {collection_name}")
print(f" Puntos: {collection_info.points_count:,}")
print(f" Vectores: {collection_info.vectors_count:,}")
except Exception as e:
print(f"\n⚠️ No se pudo conectar a Qdrant: {e}")
print("\n" + "=" * 80 + "\n")
def vectorize_all(batch_size: int = 200):
"""
Vectorize all pending translated news.
"""
print("\n" + "=" * 80)
print("🔍 STARTING BULK VECTORIZATION")
print("=" * 80)
print(f"Batch size: {batch_size}")
print("=" * 80 + "\n")
# Import the Qdrant worker
from workers.qdrant_worker import (
init_qdrant_client,
init_embedding_model,
get_pending_news,
upload_to_qdrant
)
# Initialize
print("🔌 Initializing Qdrant...")
init_qdrant_client()
print("🤖 Loading embedding model...")
init_embedding_model()
total_processed = 0
start_time = time.time()
while True:
# Fetch the next pending batch
news_batch = get_pending_news(limit=batch_size)
if not news_batch:
print("\n✅ No more news pending vectorization")
break
print(f"\n📋 Processing batch of {len(news_batch)} news items...")
try:
upload_to_qdrant(news_batch)
total_processed += len(news_batch)
elapsed = time.time() - start_time
rate = total_processed / elapsed if elapsed > 0 else 0
print(f"\n📊 Progreso: {total_processed:,} vectorizadas")
print(f"⏱️ Velocidad: {rate:.2f} noticias/segundo")
print(f"⏳ Tiempo transcurrido: {elapsed/60:.1f} minutos")
except Exception as e:
print(f"❌ Error procesando lote: {e}")
break
elapsed = time.time() - start_time
print("\n" + "=" * 80)
print("✅ VECTORIZACIÓN COMPLETADA")
print("=" * 80)
print(f"Total vectorizadas: {total_processed:,}")
print(f"Tiempo total: {elapsed/60:.1f} minutos")
print(f"Velocidad promedio: {total_processed/elapsed:.2f} noticias/segundo")
print("=" * 80 + "\n")
def reset_all():
"""
Reset the vectorization state and wipe Qdrant.
"""
print("\n" + "=" * 80)
print("⚠️ FULL RESET OF THE VECTOR SYSTEM")
print("=" * 80)
response = input("\nAre you sure? This deletes ALL vectors and resets the state (y/N): ")
if response.lower() not in ('y', 's'):
print("❌ Operation cancelled")
return
print("\n🗑️ Resetting the database...")
with get_write_conn() as conn:
with conn.cursor() as cur:
# Reset the vectorization flags
cur.execute("""
UPDATE traducciones
SET vectorized = FALSE,
qdrant_point_id = NULL,
vectorization_date = NULL
""")
conn.commit()
print("✅ Flags de vectorización reseteados en PostgreSQL")
# Limpiar Qdrant
try:
from qdrant_client import QdrantClient
qdrant_host = os.environ.get("QDRANT_HOST", "localhost")
qdrant_port = int(os.environ.get("QDRANT_PORT", "6333"))
collection_name = os.environ.get("QDRANT_COLLECTION_NAME", "news_vectors")
client = QdrantClient(host=qdrant_host, port=qdrant_port)
# Drop the collection
client.delete_collection(collection_name)
print(f"✅ Collection '{collection_name}' deleted from Qdrant")
# Recreate the collection
from qdrant_client.models import Distance, VectorParams
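# NOTE: the vector size (384) must match the embedding model's output
# dimension; paraphrase-multilingual-MiniLM-L12-v2 (EMB_MODEL) emits
# 384-dimensional vectors.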
client.create_collection(
collection_name=collection_name,
vectors_config=VectorParams(size=384, distance=Distance.COSINE)
)
print(f"✅ Colección '{collection_name}' recreada")
except Exception as e:
print(f"⚠️ Error limpiando Qdrant: {e}")
print("\n✅ Reset completado\n")
def main():
parser = argparse.ArgumentParser(
description="Script de migración para Qdrant (Directo)",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog=__doc__
)
parser.add_argument("--stats", action="store_true", help="Mostrar estadísticas")
parser.add_argument("--vectorize", action="store_true", help="Vectorizar noticias traducidas")
parser.add_argument("--reset", action="store_true", help="Limpiar Qdrant y reiniciar estado")
parser.add_argument("--batch-size", type=int, default=200, help="Tamaño de lote (default: 200)")
args = parser.parse_args()
# Default to showing statistics if no option is given
if not any([args.stats, args.vectorize, args.reset]):
args.stats = True
try:
if args.stats:
get_statistics()
if args.reset:
reset_all()
if args.vectorize:
vectorize_all(batch_size=args.batch_size)
except KeyboardInterrupt:
print("\n\n⏹️ Proceso interrumpido por el usuario")
except Exception as e:
print(f"\n❌ Error: {e}")
import traceback
traceback.print_exc()
sys.exit(1)
if __name__ == "__main__":
main()

70
scripts/precache_entities.py Normal file
View file

@ -0,0 +1,70 @@
import logging
import sys
import os
from concurrent.futures import ThreadPoolExecutor
# Add app to path
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from db import get_read_conn
from utils.wiki import fetch_wiki_data
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
def get_top_entities():
"""Get top 100 people, 50 orgs, 50 places from last 30 days."""
entities = []
query = """
SELECT t.valor, COUNT(*) as c
FROM tags t
JOIN tags_noticia tn ON t.id = tn.tag_id
JOIN traducciones tr ON tn.traduccion_id = tr.id
WHERE tr.created_at > NOW() - INTERVAL '30 days'
AND t.tipo = %s
GROUP BY t.valor
ORDER BY c DESC
LIMIT %s
"""
try:
with get_read_conn() as conn:
with conn.cursor() as cur:
# People
cur.execute(query, ('persona', 100))
entities.extend([row[0] for row in cur.fetchall()])
# Orgs
cur.execute(query, ('organizacion', 50))
entities.extend([row[0] for row in cur.fetchall()])
# Places
cur.execute(query, ('lugar', 50))
entities.extend([row[0] for row in cur.fetchall()])
except Exception as e:
logger.error(f"Error fetching top entities: {e}")
return list(set(entities))
def precache_entity(name):
try:
img, summary = fetch_wiki_data(name)
if img or summary:
logger.info(f"✓ Cached: {name}")
else:
logger.info(f"✗ No data for: {name}")
except Exception as e:
logger.error(f"Error caching {name}: {e}")
def run_precache():
logger.info("Starting entity pre-cache...")
entities = get_top_entities()
logger.info(f"Found {len(entities)} unique top entities to cache.")
with ThreadPoolExecutor(max_workers=10) as executor:
executor.map(precache_entity, entities)
logger.info("Pre-cache complete.")
if __name__ == "__main__":
run_precache()

44
scripts/recover_system.py Normal file
View file

@ -0,0 +1,44 @@
import os
import psycopg2
import logging
from datetime import datetime
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s: %(message)s")
logger = logging.getLogger("recover_system")
DB_CONFIG = {
"host": os.environ.get("DB_HOST", "localhost"),
"port": int(os.environ.get("DB_PORT", 5432)),
"dbname": os.environ.get("DB_NAME", "rss"),
"user": os.environ.get("DB_USER", "rss"),
"password": os.environ.get("DB_PASS", "x"),
}
def recover():
try:
conn = psycopg2.connect(**DB_CONFIG)
conn.autocommit = True
with conn.cursor() as cur:
# 1. Reset stuck translations
logger.info("Resetting stuck 'processing' translations to 'pending'...")
cur.execute("UPDATE traducciones SET status = 'pending' WHERE status = 'processing';")
logger.info(f"Reset {cur.rowcount} translations.")
# 2. Correct future-dated news
logger.info("Correcting future-dated news...")
now = datetime.utcnow()
cur.execute("UPDATE noticias SET fecha = %s WHERE fecha > %s;", (now, now))
logger.info(f"Corrected {cur.rowcount} news items.")
# 3. Reactivate feeds (Optional - only those with few failures)
logger.info("Reactivating feeds with 10-29 failures (giving them another chance)...")
cur.execute("UPDATE feeds SET activo = TRUE, fallos = 0 WHERE activo = FALSE AND fallos < 30;")
logger.info(f"Reactivated {cur.rowcount} feeds.")
conn.close()
logger.info("Recovery complete!")
except Exception as e:
logger.error(f"Error during recovery: {e}")
if __name__ == "__main__":
recover()

View file

@ -0,0 +1,95 @@
#!/usr/bin/env python3
"""
Script de diagnóstico para verificar la conectividad con Qdrant.
Ejecutar desde el contenedor rss2_web para diagnosticar problemas de red.
"""
import os
import sys
def test_qdrant_connection():
"""Prueba la conexión a Qdrant y muestra información de diagnóstico."""
# Configuración
qdrant_host = os.environ.get("QDRANT_HOST", "localhost")
qdrant_port = int(os.environ.get("QDRANT_PORT", "6333"))
print("=" * 60)
print("🔍 DIAGNÓSTICO DE CONEXIÓN QDRANT")
print("=" * 60)
print(f"Host: {qdrant_host}")
print(f"Port: {qdrant_port}")
print()
# 1. DNS resolution test
print("1⃣ Testing DNS resolution...")
try:
import socket
ip = socket.gethostbyname(qdrant_host)
print(f" ✅ Host '{qdrant_host}' resolves to: {ip}")
except Exception as e:
print(f" ❌ ERROR: could not resolve '{qdrant_host}': {e}")
return False
# 2. TCP connectivity test
print("\n2⃣ Testing TCP connectivity...")
try:
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
sock.settimeout(5)
result = sock.connect_ex((ip, qdrant_port))
sock.close()
if result == 0:
print(f" ✅ Port {qdrant_port} is open")
else:
print(f" ❌ ERROR: port {qdrant_port} is closed or unreachable")
return False
except Exception as e:
print(f" ❌ ERROR en test TCP: {e}")
return False
# 3. Qdrant client test
print("\n3⃣ Testing the Qdrant client...")
try:
from qdrant_client import QdrantClient
client = QdrantClient(host=qdrant_host, port=qdrant_port, timeout=5)
collections = client.get_collections()
print(f" ✅ Cliente Qdrant conectado exitosamente")
print(f" 📊 Colecciones disponibles: {[c.name for c in collections.collections]}")
# Test de búsqueda
for collection in collections.collections:
try:
info = client.get_collection(collection.name)
print(f" 📁 {collection.name}: {info.points_count} vectores")
except Exception as e:
print(f" ⚠️ No se pudo obtener info de {collection.name}: {e}")
return True
except Exception as e:
print(f" ❌ ERROR en cliente Qdrant: {e}")
import traceback
traceback.print_exc()
return False
print("\n" + "=" * 60)
if __name__ == "__main__":
success = test_qdrant_connection()
if success:
print("\n✅ DIAGNÓSTICO EXITOSO: Qdrant está accesible")
sys.exit(0)
else:
print("\n❌ DIAGNÓSTICO FALLIDO: Problemas de conectividad con Qdrant")
print("\n💡 SOLUCIONES POSIBLES:")
print(" 1. Verificar que el contenedor 'qdrant' esté corriendo:")
print(" docker ps | grep qdrant")
print(" 2. Verificar que ambos contenedores estén en la misma red:")
print(" docker network inspect rss2_default")
print(" 3. Reiniciar el contenedor de Qdrant:")
print(" docker restart rss2_qdrant")
print(" 4. Verificar variables de entorno QDRANT_HOST y QDRANT_PORT")
sys.exit(1)

View file

@ -0,0 +1,54 @@
import sys
import os
# Add app to path
sys.path.append('/home/x/rss2')
try:
from db import get_conn, get_read_conn, get_write_conn
from cache import get_redis
import psycopg2
print("Imports successfull.")
except ImportError as e:
print(f"Import failed: {e}")
sys.exit(1)
def test_db():
print("\n--- Testing Database Connections ---")
print("Testing Primary (Write) Connection...")
try:
with get_write_conn() as conn:
with conn.cursor() as cur:
cur.execute("SELECT 1")
print(" [OK] Primary reachable.")
except Exception as e:
print(f" [FAIL] Primary unreachable: {e}")
print("Testing Replica (Read) Connection...")
try:
with get_read_conn() as conn:
with conn.cursor() as cur:
cur.execute("SELECT 1")
# Check if it's actually the replica (read-only mode is usually set in replica,
# but here we just check connectivity)
print(" [OK] Replica reachable.")
except Exception as e:
print(f" [FAIL] Replica unreachable: {e}")
def test_redis():
print("\n--- Testing Redis Connection ---")
try:
r = get_redis()
if r:
r.ping()
print(" [OK] Redis reachable.")
else:
print(" [FAIL] Redis client returned None (likely connection failed).")
except Exception as e:
print(f" [FAIL] Redis error: {e}")
if __name__ == "__main__":
test_db()
test_redis()
print("\nVerification complete.")

39
start_docker.sh Executable file
View file

@ -0,0 +1,39 @@
#!/bin/bash
# Script to start the Docker services
# Run with: sudo ./start_docker.sh
set -e
cd "$(dirname "$0")"
echo "=== RSS2 Docker Services ==="
# Check whether the CTranslate2 model exists
CT2_MODEL="./models/nllb-ct2"
if [ ! -d "$CT2_MODEL" ]; then
echo ""
echo "⚠️ CTranslate2 model not found at $CT2_MODEL"
echo " Converting the model (this can take 5-10 minutes)..."
echo ""
# Check that ctranslate2 is installed
if ! python3 -c "import ctranslate2" 2>/dev/null; then
echo "Installing ctranslate2..."
pip install ctranslate2
fi
# Convert the model (the converter lives under scripts/)
./scripts/convert_model.sh
fi
echo ""
echo "Iniciando servicios Docker..."
docker compose up -d --build
echo ""
echo "✓ Servicios iniciados"
echo ""
echo "Para ver los logs:"
echo " docker compose logs -f translator"
echo ""
echo "Para verificar el estado:"
echo " docker compose ps"

7
static/placeholder.svg Normal file
View file

@ -0,0 +1,7 @@
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 400 300" fill="none">
<rect width="400" height="300" fill="#f0f0f0"/>
<rect x="150" y="100" width="100" height="80" rx="8" fill="#ddd"/>
<circle cx="180" cy="130" r="15" fill="#bbb"/>
<path d="M165 165 L200 140 L235 165" stroke="#bbb" stroke-width="3" fill="none"/>
<text x="200" y="210" text-anchor="middle" font-family="Arial" font-size="14" fill="#999">Sin imagen</text>
</svg>


1043
static/style.css Normal file

File diff suppressed because it is too large.

Binary file not shown (image, 101 KiB).

128
templates/_feeds_table.html Normal file
View file

@ -0,0 +1,128 @@
<!-- Table -->
<div id="feeds-table-container" class="feed-body" style="padding: 0;">
<div class="mt-2" style="margin: 10px 15px;">
{% set activos = total_feeds - feeds_caidos %}
<strong style="color: #27ae60;">{{ activos }} Activos</strong>
<span class="text-muted" style="margin-left: 5px;">(de {{ total_feeds }} Feeds)</span>
{% if filtro_pais_id or filtro_categoria_id or filtro_estado %}
<span class="text-muted" style="font-size:0.9em; margin-left: 10px;">(con filtros aplicados)</span>
{% endif %}
</div>
<table style="width:100%; border-collapse: collapse;">
<thead>
<tr style="background-color: rgba(0,0,0,0.05);">
<th style="padding: 12px 15px; text-align: left;">Nombre</th>
<th style="padding: 12px 15px; text-align: left;">Categoría</th>
<th style="padding: 12px 15px; text-align: left;">País</th>
<th style="padding: 12px 15px; text-align: center;">Noticias</th>
<th style="padding: 12px 15px; text-align: center;">Estado</th>
<th style="padding: 12px 15px; text-align: center;">Fallos</th>
<th style="padding: 12px 15px; text-align: right;">Acciones</th>
</tr>
</thead>
<tbody>
{% for feed in feeds %}
<tr {% if feed.fallos and feed.fallos > 0 %}style="background-color: rgba(192,57,43,0.05);" {% endif %}>
<td style="padding: 12px 15px; border-top: 1px solid var(--border-color);">
<a href="{{ feed.url }}" target="_blank" title="{{ feed.url }}">{{ feed.nombre }}</a>
</td>
<td style="padding: 12px 15px; border-top: 1px solid var(--border-color);">
{{ feed.categoria or 'N/A' }}
</td>
<td style="padding: 12px 15px; border-top: 1px solid var(--border-color);">
{{ feed.pais or 'Global' }}
</td>
<td style="padding: 12px 15px; text-align:center; border-top: 1px solid var(--border-color);">
<span class="badge"
style="background: rgba(52, 152, 219, 0.1); color: #3498db; padding: 2px 8px; border-radius: 10px;">
{{ feed.noticias_count or 0 }}
</span>
</td>
<td style="padding: 12px 15px; text-align: center; border-top: 1px solid var(--border-color);">
{% if not feed.activo %}
<span style="color: #c0392b; font-weight: bold;" title="Inactivo">KO</span>
{% elif feed.fallos and feed.fallos >= 5 %}
<span style="color: #e67e22; font-weight: bold; cursor: help;"
title="{{ feed.last_error or (feed.fallos ~ ' fallos') }}">⚠️</span>
{% elif feed.fallos and feed.fallos > 0 %}
<span style="color: #f39c12; font-weight: bold; cursor: help;"
title="{{ feed.last_error or (feed.fallos ~ ' fallos') }}">OK</span>
{% else %}
<span style="color: #27ae60; font-weight: bold;">OK</span>
{% endif %}
</td>
<td style="padding: 12px 15px; text-align:center; border-top: 1px solid var(--border-color);">
{{ feed.fallos or 0 }}
</td>
<td style="padding: 12px 15px; text-align:right; border-top: 1px solid var(--border-color);">
<a href="{{ url_for('feeds.edit_feed', feed_id=feed.id) }}" class="btn btn-small btn-info">
<i class="fas fa-edit"></i>
</a>
<a href="{{ url_for('feeds.delete_feed', feed_id=feed.id) }}" class="btn btn-small btn-danger"
onclick="return confirm('¿Estás seguro?')">
<i class="fas fa-trash"></i>
</a>
{% if not feed.activo %}
<a href="{{ url_for('feeds.reactivar_feed', feed_id=feed.id) }}" class="btn btn-small">
<i class="fas fa-sync-alt"></i>
</a>
{% endif %}
</td>
</tr>
{% else %}
<tr>
<td colspan="6" style="padding:20px; text-align:center;">
No hay feeds para mostrar.
<a href="{{ url_for('feeds.add_feed') }}">Añade el primero</a>.
</td>
</tr>
{% endfor %}
</tbody>
</table>
</div>
<!-- Pagination -->
{% if total_pages > 1 %}
<nav class="pagination">
{% if page > 1 %}
<a href="{{ url_for('feeds.list_feeds',
page=page-1,
pais_id=filtro_pais_id,
categoria_id=filtro_categoria_id,
estado=filtro_estado) }}" class="page-link"
onclick="handlePageClick(event, this.href)">&laquo; Anterior</a>
{% endif %}
{% for p in range(1, total_pages + 1) %}
{% if p == page %}
<a href="#" class="page-link active">{{ p }}</a>
{% else %}
<a href="{{ url_for('feeds.list_feeds',
page=p,
pais_id=filtro_pais_id,
categoria_id=filtro_categoria_id,
estado=filtro_estado) }}" class="page-link"
onclick="handlePageClick(event, this.href)">{{ p }}</a>
{% endif %}
{% endfor %}
{% if page < total_pages %} <a href="{{ url_for('feeds.list_feeds',
page=page+1,
pais_id=filtro_pais_id,
categoria_id=filtro_categoria_id,
estado=filtro_estado) }}" class="page-link" onclick="handlePageClick(event, this.href)">
Siguiente &raquo;</a>
{% endif %}
</nav>
{% endif %}

View file

@ -0,0 +1,79 @@
{% for n in noticias %}
{% if n.traduccion_id %}
{% set detalle_url = url_for('noticia.noticia', tr_id=n.traduccion_id) %}
{% else %}
{% set detalle_url = url_for('noticia.noticia', id=n.id) %}
{% endif %}
<article class="noticia-card">
<div class="noticia-card-image-wrapper">
<a href="{{ detalle_url }}">
{% if n.imagen_url %}
<img src="{{ n.imagen_url }}" alt="{{ n.titulo }}" loading="lazy"
onerror="this.style.display='none'; this.parentElement.querySelector('.no-image-placeholder').style.display='flex';">
<div class="no-image-placeholder" style="display:none;"></div>
{% else %}
<div class="no-image-placeholder"></div>
{% endif %}
</a>
</div>
<div class="noticia-card-content">
<div class="noticia-meta">
{{ n.fuente_nombre }}
{% if n.fecha %} &bull; {{ n.fecha|format_date }}{% endif %}
{% if n.pais %} &bull; {{ n.pais }}{% endif %}
</div>
<h3>
<a href="{{ detalle_url }}">
{% if use_tr and n.tiene_traduccion %}
{{ n.titulo_traducido }}
{% else %}
{{ n.titulo_original or n.titulo }}
{% endif %}
</a>
</h3>
<div class="noticia-summary">
{% if use_tr and n.tiene_traduccion %}
{{ (n.resumen_traducido or '') | striptags | truncate(200) }}
{% else %}
{{ (n.resumen_original or n.resumen) | striptags | truncate(200) }}
{% endif %}
</div>
<div class="noticia-actions">
<button class="btn-fav" data-id="{{ n.id }}" onclick="toggleFav(this)" title="Guardar">
<i class="far fa-star"></i>
</button>
<a href="{{ detalle_url }}" class="btn btn-sm">Leer más</a>
</div>
</div>
</article>
{% else %}
<div style="grid-column: 1 / -1; text-align: center; padding: 50px;">
<p>No hay noticias para mostrar.</p>
</div>
{% endfor %}
{# Pagination Logic #}
{% if total_pages and total_pages > 1 %}
<div
style="grid-column: 1 / -1; margin-top: 30px; text-align: center; padding-top: 20px; border-top: 1px solid var(--border-color);">
{% set current = page %}
{% if current > 1 %}
<button class="btn" data-page="{{ current - 1 }}"
onclick="setPage(this.getAttribute('data-page')); cargarNoticias(true);">Newer</button>
{% endif %}
<span style="margin: 0 15px; font-weight: bold; font-family: var(--secondary-font);">
Page {{ current }} of {{ total_pages }}
</span>
{% if current < total_pages %} <button class="btn" data-page="{{ current + 1 }}"
onclick="setPage(this.getAttribute('data-page')); cargarNoticias(true);">
Older</button>
{% endif %}
</div>
{% endif %}

183
templates/account.html Normal file
View file

@ -0,0 +1,183 @@
{% extends "base.html" %}
{% block title %}Tu Cuenta - {{ user.username }}{% endblock %}
{% block content %}
<div style="max-width: 900px; margin: 20px auto;">
<h2 style="margin-bottom: 30px;"><i class="fas fa-user-circle"></i> Tu Cuenta</h2>
<!-- User Info Card -->
<div style="background: #f9f9f9; padding: 25px; border-radius: 10px; margin-bottom: 25px;">
<h3 style="margin-top: 0; color: #6c63ff;">Información del Perfil</h3>
<div style="display: grid; grid-template-columns: 1fr 1fr; gap: 15px;">
<div style="text-align: center; margin-bottom: 15px;">
{% if user.avatar_url %}
<img src="{{ user.avatar_url }}" alt="Avatar" style="width: 120px; height: 120px; object-fit: cover; border-radius: 50%; border: 3px solid #6c63ff; box-shadow: 0 4px 6px rgba(0,0,0,0.1);">
{% else %}
<div style="width: 120px; height: 120px; background: #e0e0e0; border-radius: 50%; display: flex; align-items: center; justify-content: center; margin: 0 auto; color: #888; font-size: 50px;">
<i class="fas fa-user"></i>
</div>
{% endif %}
<form action="{{ url_for('account.upload_avatar') }}" method="post" enctype="multipart/form-data" style="margin-top: 15px;">
<input type="file" name="avatar" id="avatar" accept="image/*" style="display: none;" onchange="this.form.submit()">
<label for="avatar" style="cursor: pointer; padding: 6px 12px; border: 1px solid #6c63ff; color: #6c63ff; background: transparent; border-radius: 4px; font-size: 14px; transition: all 0.2s;">
<i class="fas fa-camera"></i> Cambiar foto
</label>
</form>
</div>
</div>
<div>
<strong>Usuario:</strong> {{ user.username }}
</div>
<div>
<strong>Email:</strong> {{ user.email }}
</div>
<div>
<strong>Miembro desde:</strong> {{ user.created_at.strftime('%d/%m/%Y') }}
</div>
<div>
<strong>Último acceso:</strong> {{ user.last_login.strftime('%d/%m/%Y %H:%M') if user.last_login else
'N/A' }}
</div>
</div>
</div>
<!-- Statistics Card -->
<div style="background: #f9f9f9; padding: 25px; border-radius: 10px; margin-bottom: 25px;">
<h3 style="margin-top: 0; color: #6c63ff;">Estadísticas</h3>
<div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 20px;">
<div style="text-align: center; padding: 15px; background: white; border-radius: 8px;">
<div style="font-size: 32px; font-weight: bold; color: #6c63ff;">{{ favorites_count }}</div>
<div style="color: #666; margin-top: 5px;">Favoritos guardados</div>
</div>
<div style="text-align: center; padding: 15px; background: white; border-radius: 8px;">
<div style="font-size: 32px; font-weight: bold; color: #6c63ff;">{{ searches_count }}</div>
<div style="color: #666; margin-top: 5px;">Búsquedas realizadas</div>
</div>
</div>
</div>
<!-- Recent Searches -->
{% if recent_searches %}
<div style="background: #f9f9f9; padding: 25px; border-radius: 10px; margin-bottom: 25px;">
<h3 style="margin-top: 0; color: #6c63ff;">Búsquedas Recientes</h3>
<div style="overflow-x: auto;">
<table style="width: 100%; border-collapse: collapse;">
<thead>
<tr style="border-bottom: 2px solid #ddd;">
<th style="padding: 10px; text-align: left;">Búsqueda</th>
<th style="padding: 10px; text-align: center;">Resultados</th>
<th style="padding: 10px; text-align: right;">Fecha</th>
</tr>
</thead>
<tbody>
{% for search in recent_searches %}
<tr style="border-bottom: 1px solid #eee;">
<td style="padding: 10px;">
<a href="/api/search?q={{ search.query | urlencode }}"
style="color: #6c63ff; text-decoration: none;">
{{ search.query }}
</a>
</td>
<td style="padding: 10px; text-align: center;">{{ search.results_count }}</td>
<td style="padding: 10px; text-align: right; color: #666; font-size: 14px;">
{{ search.searched_at.strftime('%d/%m/%Y %H:%M') }}
</td>
</tr>
{% endfor %}
</tbody>
</table>
</div>
{% if searches_count > 10 %}
<div style="text-align: center; margin-top: 15px;">
<a href="{{ url_for('account.search_history') }}"
style="color: #6c63ff; text-decoration: none; font-weight: 500;">
Ver historial completo →
</a>
</div>
{% endif %}
</div>
{% endif %}
<!-- Recent Favorites -->
{% if recent_favorites %}
<div style="background: #f9f9f9; padding: 25px; border-radius: 10px; margin-bottom: 25px;">
<h3 style="margin-top: 0; color: #6c63ff;">Favoritos Recientes</h3>
<div style="display: grid; gap: 15px;">
{% for noticia in recent_favorites %}
<div style="display: flex; gap: 15px; padding: 15px; background: white; border-radius: 8px;">
{% if noticia.imagen_url %}
<img src="{{ noticia.imagen_url }}" alt=""
style="width: 100px; height: 70px; object-fit: cover; border-radius: 5px;">
{% endif %}
<div style="flex: 1;">
{% if noticia.traduccion_id %}
<a href="/noticia?tr_id={{ noticia.traduccion_id }}"
style="color: #333; text-decoration: none; font-weight: 500;">
{{ noticia.titulo_trad or noticia.titulo }}
</a>
{% else %}
<a href="/noticia?id={{ noticia.id }}"
style="color: #333; text-decoration: none; font-weight: 500;">
{{ noticia.titulo }}
</a>
{% endif %}
<div style="color: #666; font-size: 12px; margin-top: 5px;">
Guardado: {{ noticia.created_at.strftime('%d/%m/%Y') }}
</div>
</div>
</div>
{% endfor %}
</div>
<div style="text-align: center; margin-top: 15px;">
<a href="{{ url_for('favoritos.view_favorites') }}"
style="color: #6c63ff; text-decoration: none; font-weight: 500;">
Ver todos los favoritos →
</a>
</div>
</div>
{% endif %}
<!-- Account Actions -->
<div style="background: #f9f9f9; padding: 25px; border-radius: 10px;">
<h3 style="margin-top: 0; color: #6c63ff;">Acciones</h3>
<!-- Change Password Form -->
<details style="margin-bottom: 20px;">
<summary style="cursor: pointer; font-weight: 500; padding: 10px; background: white; border-radius: 5px;">
<i class="fas fa-key"></i> Cambiar contraseña
</summary>
<form method="post" action="{{ url_for('account.change_password') }}"
style="margin-top: 15px; padding: 15px; background: white; border-radius: 5px;">
<div style="margin-bottom: 15px;">
<label for="current_password" style="display: block; margin-bottom: 5px;">Contraseña actual</label>
<input type="password" id="current_password" name="current_password" required
style="width: 100%; padding: 8px; border: 1px solid #ddd; border-radius: 4px;">
</div>
<div style="margin-bottom: 15px;">
<label for="new_password" style="display: block; margin-bottom: 5px;">Nueva contraseña</label>
<input type="password" id="new_password" name="new_password" required minlength="6"
style="width: 100%; padding: 8px; border: 1px solid #ddd; border-radius: 4px;">
</div>
<div style="margin-bottom: 15px;">
<label for="new_password_confirm" style="display: block; margin-bottom: 5px;">Confirmar nueva
contraseña</label>
<input type="password" id="new_password_confirm" name="new_password_confirm" required minlength="6"
style="width: 100%; padding: 8px; border: 1px solid #ddd; border-radius: 4px;">
</div>
<button type="submit"
style="padding: 10px 20px; background: #6c63ff; color: white; border: none; border-radius: 5px; cursor: pointer;">
Actualizar contraseña
</button>
</form>
</details>
<form method="post" action="{{ url_for('auth.logout') }}">
<button type="submit"
style="padding: 12px 24px; background: #dc3545; color: white; border: none; border-radius: 5px; cursor: pointer; font-size: 14px;">
<i class="fas fa-sign-out-alt"></i> Cerrar sesión
</button>
</form>
</div>
</div>
{% endblock %}

102
templates/add_feed.html Normal file
View file

@ -0,0 +1,102 @@
{% extends "base.html" %}
{% block title %}Añadir Feed{% endblock %}
{% block content %}
<div class="card feed-detail-card"
style="padding: 40px; border-radius: 15px; background-color: var(--glass-bg); box-shadow: 0 10px 30px rgba(0,0,0,0.05); backdrop-filter: blur(10px);">
<h1
style="font-family: var(--primary-font); font-weight: 700; margin-bottom: 30px; border-bottom: 2px solid var(--accent-color); display: inline-block; padding-bottom: 10px;">
Añadir Feed
</h1>
<form action="{{ url_for('feeds.add_feed') }}" method="post" id="addFeedForm" class="form-grid">
<!-- Name -->
<div class="floating-label-group">
<input type="text" id="nombre" name="nombre" placeholder=" " required>
<label for="nombre">Nombre del feed</label>
</div>
<!-- Description -->
<div class="floating-label-group">
<textarea id="descripcion" name="descripcion" placeholder=" " rows="3"></textarea>
<label for="descripcion">Descripción (opcional)</label>
</div>
<!-- URL -->
<div class="floating-label-group">
<input type="url" id="url" name="url" placeholder=" " required>
<label for="url">URL del feed</label>
</div>
<div class="form-row" style="display: grid; grid-template-columns: 1fr 1fr; gap: 20px;">
<!-- Category (searchable) -->
<div class="form-group">
<label for="categoria_id"
style="display: block; margin-bottom: 8px; font-weight: 600;">Categoría</label>
<select id="categoria_id" name="categoria_id" class="searchable"
placeholder="Selecciona una categoría...">
<option value="">— Sin categoría —</option>
{% for c in categorias %}
<option value="{{ c.id }}">{{ c.nombre }}</option>
{% endfor %}
</select>
</div>
<!-- Country (searchable) -->
<div class="form-group">
<label for="pais_id" style="display: block; margin-bottom: 8px; font-weight: 600;">País</label>
<select id="pais_id" name="pais_id" class="searchable" placeholder="Selecciona un país...">
<option value="">Global</option>
{% for p in paises %}
<option value="{{ p.id }}">{{ p.nombre }}</option>
{% endfor %}
</select>
</div>
</div>
<!-- Language & Submit -->
<div class="form-row"
style="display: grid; grid-template-columns: 1fr 2fr; gap: 20px; align-items: end; margin-top: 20px;">
<div class="floating-label-group" style="margin-bottom: 0;">
<input type="text" id="idioma" name="idioma" placeholder=" " maxlength="5">
<label for="idioma">Idioma (ej: es)</label>
</div>
<div class="form-actions" style="text-align: right;">
<a href="{{ url_for('feeds.list_feeds') }}" class="btn btn-secondary"
style="margin-right: 10px; background: transparent; color: var(--text-color); border: 1px solid var(--border-color);">
Cancelar
</a>
<button class="btn btn-primary" type="submit" id="submitBtn">
<i class="fas fa-plus"></i> Añadir Feed
</button>
</div>
</div>
</form>
</div>
<script>
document.addEventListener("DOMContentLoaded", function () {
// Real-time URL Validation
const urlInput = document.getElementById('url');
const submitBtn = document.getElementById('submitBtn');
urlInput.addEventListener('input', function () {
if (this.value && this.validity.valid) {
this.style.borderColor = "#2ecc71"; // Green
} else if (this.value) {
this.style.borderColor = "#e74c3c"; // Red
} else {
this.style.borderColor = ""; // Reset
}
});
// Form Submit State
document.getElementById('addFeedForm').addEventListener('submit', function () {
submitBtn.disabled = true;
submitBtn.innerHTML = '<i class="fas fa-spinner fa-spin"></i> Guardando...';
});
});
</script>
{% endblock %}

57
templates/add_feeds.html Normal file
View file

@ -0,0 +1,57 @@
{% extends "base.html" %}
{% block title %}Añadir Nuevo Feed{% endblock %}
{% block content %}
<header>
<h1>Añadir Nuevo Feed</h1>
<p class="subtitle">Introduce los detalles de la nueva fuente de noticias RSS.</p>
<a href="{{ url_for('dashboard') }}" class="top-link" style="margin-top:15px;">← Volver al Dashboard</a>
</header>
<div class="form-section">
<form action="{{ url_for('add_feed') }}" method="post" autocomplete="off">
<div>
<label for="nombre">Nombre del feed</label>
<input id="nombre" name="nombre" type="text" placeholder="Ej: Noticias de Tecnología" required>
</div>
<div style="margin-top:15px;">
<label for="url">URL del RSS</label>
<input id="url" name="url" type="url" placeholder="https://ejemplo.com/rss" required>
</div>
<div style="margin-top:15px;">
<label for="descripcion">Descripción</label>
<textarea id="descripcion" name="descripcion" rows="2" placeholder="Breve descripción del contenido del feed"></textarea>
</div>
<div style="display: grid; grid-template-columns: 1fr 1fr 1fr; gap: 15px; margin-top: 15px;">
<div>
<label for="categoria_id">Categoría</label>
<select id="categoria_id" name="categoria_id" required>
<option value="">— Elige categoría —</option>
{% for cat in categorias %}
<option value="{{ cat.id }}">{{ cat.nombre }}</option>
{% endfor %}
</select>
</div>
<div>
<label for="pais_id">País</label>
<select name="pais_id" id="pais_id">
<option value="">— Global / No aplica —</option>
{% for pais in paises %}
<option value="{{ pais.id }}">{{ pais.nombre }}</option>
{% endfor %}
</select>
</div>
<div>
<label for="idioma">Idioma (código)</label>
<input id="idioma" name="idioma" type="text" maxlength="2" placeholder="ej: es, en">
</div>
</div>
<button type="submit" class="btn" style="margin-top: 25px; width: 100%;">Añadir Feed</button>
</form>
</div>
{% endblock %}

59
templates/add_url.html Normal file
View file

@ -0,0 +1,59 @@
{% extends "base.html" %}
{% block title %}Añadir Noticia desde URL{% endblock %}
{% block content %}
<div class="container mt-4">
<div class="row justify-content-center">
<div class="col-md-8">
<div class="card">
<div class="card-header bg-info text-white">
<h4 class="mb-0">Añadir Noticia desde URL</h4>
</div>
<div class="card-body">
<p class="card-text text-muted">Pega la URL de un artículo de noticias. El sistema intentará extraer el título, resumen e imagen automáticamente.</p>
<form action="{{ url_for('add_url') }}" method="post" class="mt-3">
<!-- URL field -->
<div class="mb-3">
<label for="url" class="form-label"><strong>URL de la Noticia</strong></label>
<input type="url" class="form-control" id="url" name="url" required placeholder="https://ejemplo.com/noticia-a-scrapear">
</div>
<!-- Category selector -->
<div class="mb-3">
<label for="categoria_id" class="form-label"><strong>Categoría</strong></label>
<select class="form-select" id="categoria_id" name="categoria_id" required>
<option value="" disabled selected>-- Selecciona una categoría --</option>
{% for categoria in categorias %}
<option value="{{ categoria.id }}">{{ categoria.nombre }}</option>
{% endfor %}
</select>
</div>
<!-- Country selector -->
<div class="mb-3">
<label for="pais_id" class="form-label"><strong>País</strong></label>
<select class="form-select" id="pais_id" name="pais_id" required>
<option value="" disabled selected>-- Selecciona un país --</option>
{% for pais in paises %}
<option value="{{ pais.id }}">{{ pais.nombre }}</option>
{% endfor %}
</select>
</div>
<!-- Action buttons -->
<div class="d-flex justify-content-end pt-3">
<a href="{{ url_for('dashboard') }}" class="btn btn-secondary me-2">Cancelar</a>
<button type="submit" class="btn btn-primary">Añadir Noticia</button>
</div>
</form>
</div>
</div>
</div>
</div>
</div>
{% endblock %}

View file

@ -0,0 +1,74 @@
{% extends "base.html" %}
{% block title %}Añadir Fuente URL{% endblock %}
{% block content %}
<h1>Añadir Fuente URL</h1>
<div style="margin-bottom: 25px;">
<div class="tabs" style="display: flex; gap: 10px; border-bottom: 2px solid #ddd; padding-bottom: 1px;">
<button class="tab-btn active" onclick="switchTab('manual')"
style="padding: 10px 20px; border: none; background: #fff; cursor: pointer; border-bottom: 3px solid #007bff; color: #007bff; font-weight: bold;">
<i class="fas fa-edit"></i> Añadir Manualmente
</button>
<a href="{{ url_for('feeds.discover_feed') }}" class="tab-btn"
style="padding: 10px 20px; border: none; background: #f8f9fa; cursor: pointer; text-decoration: none; color: #555; display: flex; align-items: center; gap: 8px;">
<i class="fas fa-search"></i> Analizar Web (Descubrimiento Automático)
<span class="badge"
style="background: #e9ecef; color: #555; font-size: 10px; padding: 2px 6px; border-radius: 4px;">RECOMENDADO</span>
</a>
</div>
</div>
<div class="card" id="manual-tab">
<div
style="margin-bottom: 20px; padding: 15px; background: #e3f2fd; border-radius: 8px; border-left: 4px solid #1976D2;">
<i class="fas fa-info-circle" style="color: #1976D2;"></i>
Utiliza esta opción para añadir una fuente de URL monitorizada manualmente. Si quieres buscar todos los feeds
RSS dentro de un sitio web, usa la pestaña <strong>Analizar Web</strong>.
</div>
<form method="post" action="{{ url_for('urls.add_url_source') }}" autocomplete="off">
<label for="nombre">Nombre</label>
<input id="nombre" name="nombre" type="text" required placeholder="Ej. El País">
<label for="url" style="margin-top:15px;">URL</label>
<input id="url" name="url" type="url" required placeholder="https://elpais.com">
<div style="display:grid;grid-template-columns:1fr 1fr;gap:15px;margin-top:15px;">
<div>
<label for="categoria_id">Categoría</label>
<select id="categoria_id" name="categoria_id">
<option value="">— Sin categoría —</option>
{% for c in categorias %}
<option value="{{ c.id }}">{{ c.nombre }}</option>
{% endfor %}
</select>
</div>
<div>
<label for="pais_id">País</label>
<select id="pais_id" name="pais_id">
<option value="">— Global —</option>
{% for p in paises %}
<option value="{{ p.id }}">{{ p.nombre }}</option>
{% endfor %}
</select>
</div>
</div>
<label for="idioma" style="margin-top:15px;">Idioma (2 letras)</label>
<input id="idioma" name="idioma" type="text" maxlength="2" value="es">
<div style="margin-top:20px;display:flex;gap:10px;justify-content:flex-end;">
<a href="{{ url_for('urls.manage_urls') }}" class="btn btn-secondary">Cancelar</a>
<button type="submit" class="btn btn-primary">
<i class="fas fa-save"></i> Guardar Fuente
</button>
</div>
</form>
</div>
<a href="{{ url_for('urls.manage_urls') }}" class="top-link">← Volver</a>
{% endblock %}

448
templates/base.html Normal file
View file

@ -0,0 +1,448 @@
<!DOCTYPE html>
<html lang="es">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>{% block title %}Agregador de Noticias RSS{% endblock %}</title>
<link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link
href="https://fonts.googleapis.com/css2?family=Poppins:wght@300;400;500;600;700&family=Roboto:wght@300;400;500;700&display=swap"
rel="stylesheet">
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.0/css/all.min.css">
<link rel="stylesheet" href="{{ url_for('static', filename='style.css') }}?v=10">
<!-- TomSelect CSS -->
<link href="https://cdn.jsdelivr.net/npm/tom-select@2.2.2/dist/css/tom-select.css" rel="stylesheet">
<style>
.badge {
display: inline-block;
font-size: .75rem;
line-height: 1;
padding: .35rem .5rem;
border-radius: .5rem;
background: var(--secondary-color, #6c63ff);
color: #fff;
margin-left: .4rem;
}
.switch {
position: relative;
display: inline-block;
width: 42px;
height: 22px;
}
.switch input {
opacity: 0;
width: 0;
height: 0;
}
.slider {
position: absolute;
cursor: pointer;
top: 0;
left: 0;
right: 0;
bottom: 0;
background: #ccc;
transition: .2s;
border-radius: 999px;
}
.slider:before {
position: absolute;
content: "";
height: 16px;
width: 16px;
left: 3px;
bottom: 3px;
background: #fff;
transition: .2s;
border-radius: 50%;
}
.switch input:checked+.slider {
background: var(--secondary-color, #6c63ff);
}
.switch input:checked+.slider:before {
transform: translateX(20px);
}
</style>
</head>
<body class="theme-rss2">
<div class="container">
<!-- Mobile/Global Nav Elements -->
<div class="mobile-header">
<div class="logo-mobile">
<a href="/">THE DAILY FEED</a>
</div>
<button class="mobile-menu-toggle" id="mobile-menu-toggle" aria-label="Abrir menú">
<i class="fas fa-bars"></i>
</button>
</div>
<div class="nav-overlay" id="nav-overlay"></div>
<!-- Desktop Header -->
<header class="desktop-header">
<div class="header-top-row">
<div class="header-title-wrapper">
<h1><a href="/" style="text-decoration: none; color: inherit;">THE DAILY FEED</a></h1>
</div>
<div class="header-user-menu">
<div class="dropdown">
{% if session.get('user_id') %}
<button class="nav-link dropbtn user-menu-large">
{% if session.get('avatar_url') %}
<img src="{{ session.get('avatar_url') }}" alt="Avatar"
style="width: 24px; height: 24px; border-radius: 50%; vertical-align: middle; object-fit: cover; margin-right: 5px;">
{% else %}
<i class="fas fa-user-circle"></i>
{% endif %}
{{ session.get('username') }} <i class="fas fa-chevron-down"></i>
</button>
<div class="dropdown-content dropdown-right">
<a href="{{ url_for('account.index') }}"><i class="fas fa-user"></i> Tu Cuenta</a>
<a href="{{ url_for('favoritos.view_favorites') }}"><i class="fas fa-star"></i> Mis Favoritos</a>
<a href="{{ url_for('account.index', _anchor='search-history') }}"><i class="fas fa-history"></i>
Historial</a>
<div class="dropdown-divider"></div>
<form action="{{ url_for('auth.logout') }}" method="post" style="margin: 0;">
<button type="submit" class="dropdown-logout">
<i class="fas fa-sign-out-alt"></i> Cerrar Sesión
</button>
</form>
</div>
{% else %}
<button class="nav-link dropbtn user-menu-large">
<i class="fas fa-user"></i> Cuenta <i class="fas fa-chevron-down"></i>
</button>
<div class="dropdown-content dropdown-right">
<a href="{{ url_for('auth.login') }}"><i class="fas fa-sign-in-alt"></i> Iniciar Sesión</a>
<a href="{{ url_for('auth.register') }}"><i class="fas fa-user-plus"></i> Registrarse</a>
</div>
{% endif %}
</div>
</div>
</div>
<div class="header-date">
<span id="current-date-header"></span> |
MADRID: <span id="madrid-time">--:--:--</span>
</div>
</header>
<nav class="main-nav" id="main-nav">
<div class="nav-content-wrapper">
<div class="nav-left">
<a href="{{ url_for('home.home') }}" class="nav-link">Inicio</a>
<div class="dropdown">
<button class="nav-link dropbtn">Noticias <i class="fas fa-chevron-down"></i></button>
<div class="dropdown-content">
<a href="{{ url_for('home.home') }}">Todas las Noticias</a>
<a href="{{ url_for('topics.monitor') }}">Temas</a>
<a href="{{ url_for('favoritos.view_favorites') }}">Favoritos</a>
</div>
</div>
<div class="dropdown">
<button class="nav-link dropbtn">Análisis <i class="fas fa-chevron-down"></i></button>
<div class="dropdown-content">
<a href="{{ url_for('stats.index') }}">Estadísticas</a>
<a href="{{ url_for('stats.entities_dashboard') }}">Monitor de Entidades</a>
<a href="{{ url_for('conflicts.index') }}">Conflictos</a>
</div>
</div>
<div class="dropdown">
<button class="nav-link dropbtn">Admin <i class="fas fa-chevron-down"></i></button>
<div class="dropdown-content">
<a href="{{ url_for('feeds.list_feeds') }}">Gestión de Feeds</a>
<a href="{{ url_for('feeds.discover_feed') }}"><i class="fas fa-search-plus"></i> Descubrir Feeds</a>
<a href="{{ url_for('urls.manage_urls') }}">Gestión de URLs</a>
<a href="{{ url_for('backup.restore_feeds') }}">Importar Feeds</a>
<a href="{{ url_for('backup.backup_feeds') }}">Exportar Feeds</a>
<a href="{{ url_for('config.config_home') }}">Configuración</a>
</div>
</div>
</div>
<div class="nav-right">
<button id="dark-mode-toggle" class="icon-btn" title="Cambiar tema">
<i class="fas fa-moon"></i>
</button>
</div>
</div>
</nav>
<script>
(function () {
const options = { weekday: 'long', year: 'numeric', month: 'long', day: 'numeric' };
const dateStr = new Date().toLocaleDateString('es-ES', options);
const dateHeader = document.getElementById('current-date-header');
if (dateHeader) dateHeader.textContent = dateStr;
function updateMadridTime() {
const now = new Date();
const timeString = now.toLocaleTimeString('es-ES', { timeZone: 'Europe/Madrid' });
const el = document.getElementById('madrid-time');
if (el) el.textContent = timeString;
}
setInterval(updateMadridTime, 1000);
updateMadridTime();
})();
</script>
{% with messages = get_flashed_messages(with_categories=true) %}
{% if messages %}
<ul class="flash-messages">
{% for category, message in messages %}
<li class="{{ category }}">{{ message }}</li>
{% endfor %}
</ul>
{% endif %}
{% endwith %}
{% block content %}{% endblock %}
</div>
<!-- TomSelect JS -->
<script src="https://cdn.jsdelivr.net/npm/tom-select@2.2.2/dist/js/tom-select.complete.min.js"></script>
<script>
// Global Tool to Init Selects
function initSearchableSelects(selector = 'select.searchable') {
document.querySelectorAll(selector).forEach((el) => {
if (!el.tomselect) {
new TomSelect(el, {
create: false,
sortField: { field: "text", direction: "asc" },
plugins: ['dropdown_input'],
maxItems: 1
});
}
});
}
// Dark Mode Toggle
const darkModeToggle = document.getElementById('dark-mode-toggle');
const icon = darkModeToggle.querySelector('i');
// Check saved preference
if (localStorage.getItem('darkMode') === 'true') {
document.body.classList.add('dark-mode');
icon.classList.replace('fa-moon', 'fa-sun');
}
darkModeToggle.addEventListener('click', () => {
document.body.classList.toggle('dark-mode');
const isDark = document.body.classList.contains('dark-mode');
localStorage.setItem('darkMode', isDark);
icon.classList.replace(isDark ? 'fa-moon' : 'fa-sun', isDark ? 'fa-sun' : 'fa-moon');
});
// ========== FAVORITES ==========
async function toggleFav(btn) {
const id = btn.dataset.id;
try {
const response = await fetch(`/favoritos/toggle/${id}`, { method: 'POST' });
if (response.ok) {
const data = await response.json();
btn.classList.toggle('active', data.is_favorite);
const i = btn.querySelector('i');
i.className = data.is_favorite ? 'fas fa-star' : 'far fa-star';
}
} catch (e) { console.error("Error favoritos", e); }
}
// Load saved favorites on page load
async function loadFavorites() {
try {
const response = await fetch('/favoritos/ids');
if (response.ok) {
const data = await response.json();
const favIds = new Set(data.ids.map(String)); // normalize: btn.dataset.id is always a string
document.querySelectorAll('.btn-fav').forEach(btn => {
if (favIds.has(btn.dataset.id)) {
btn.classList.add('active');
btn.querySelector('i').className = 'fas fa-star';
}
});
}
} catch (e) { /* ignore */ }
}
// Run on page load
if (document.querySelector('.btn-fav')) {
loadFavorites();
}
// ========== READ HISTORY ==========
const READ_STORAGE_KEY = 'readHistory';
const MAX_READ_ITEMS = 500;
function getReadHistory() {
try {
return JSON.parse(localStorage.getItem(READ_STORAGE_KEY)) || [];
} catch { return []; }
}
function markAsRead(id) {
const history = getReadHistory();
if (!history.includes(id)) {
history.unshift(id);
if (history.length > MAX_READ_ITEMS) history.pop();
localStorage.setItem(READ_STORAGE_KEY, JSON.stringify(history));
}
}
function applyReadStyles() {
const history = new Set(getReadHistory());
document.querySelectorAll('.noticia-card').forEach(card => {
const link = card.querySelector('a[href*="/noticia"]');
if (link) {
const href = link.getAttribute('href');
// Extract ID from URL
const match = href.match(/[?&](?:id|tr_id)=([^&]+)/);
if (match && history.has(match[1])) {
card.classList.add('is-read');
}
}
});
}
// Track clicks on news links
document.addEventListener('click', (e) => {
const link = e.target.closest('a[href*="/noticia"]');
if (link) {
const href = link.getAttribute('href');
const match = href.match(/[?&](?:id|tr_id)=([^&]+)/);
if (match) {
markAsRead(match[1]);
}
}
});
// ========== NOTIFICATIONS ==========
let lastCheck = new Date().toISOString();
async function checkNotifications() {
if (Notification.permission !== "granted") return;
try {
const response = await fetch(`/api/notifications/check?last_check=${lastCheck}`);
const data = await response.json();
if (data.has_news) {
lastCheck = data.timestamp;
new Notification("The Daily Feed", {
body: data.message,
icon: "/static/favicon.ico" // Assuming generic icon
});
} else {
// Update timestamp to now to avoid checking old news if server time drifts
lastCheck = new Date().toISOString();
}
} catch (e) {
console.error("Notification check failed", e);
}
}
// Request permission on load
if ("Notification" in window) {
if (Notification.permission === "default") {
// Avoid auto-prompting, which browsers penalize and users find annoying;
// the bell button added on DOMContentLoaded below lets the user opt in.
console.log("Notifications available, waiting for permission.");
}
// Check every 60 seconds
setInterval(checkNotifications, 60000);
}
// Add bell icon to enable notifications if not granted
document.addEventListener('DOMContentLoaded', () => {
if (Notification.permission !== 'granted' && Notification.permission !== 'denied') {
const nav = document.querySelector('.main-nav');
const btn = document.createElement('button');
btn.className = 'nav-link';
btn.innerHTML = '<i class="fas fa-bell"></i>';
btn.style.background = 'none';
btn.style.border = 'none';
btn.style.cursor = 'pointer';
btn.title = 'Activar notificaciones';
btn.onclick = () => {
Notification.requestPermission().then(permission => {
if (permission === "granted") {
btn.style.display = 'none';
new Notification("The Daily Feed", { body: "¡Notificaciones activadas!" });
}
});
};
nav.appendChild(btn);
}
// Init Selects
initSearchableSelects();
});
// Apply read styles on load
applyReadStyles();
// ========== MOBILE NAVIGATION (BULLETPROOF) ==========
const mobileMenuToggle = document.getElementById('mobile-menu-toggle');
const mainNav = document.getElementById('main-nav');
const navOverlay = document.getElementById('nav-overlay');
function toggleMenu() {
const isOpen = mainNav.classList.toggle('active');
navOverlay.classList.toggle('active');
document.body.classList.toggle('no-scroll', isOpen);
const icon = mobileMenuToggle.querySelector('i');
icon.className = isOpen ? 'fas fa-times' : 'fas fa-bars';
}
if (mobileMenuToggle) {
mobileMenuToggle.onclick = (e) => {
e.preventDefault();
e.stopPropagation();
toggleMenu();
};
}
if (navOverlay) {
navOverlay.onclick = toggleMenu;
}
// Interactive Dropdowns for Touch/Click
document.querySelectorAll('.dropbtn').forEach(btn => {
btn.onclick = (e) => {
if (window.innerWidth <= 768) {
e.preventDefault();
e.stopPropagation();
const dropdown = btn.parentNode;
const wasOpen = dropdown.classList.contains('is-open');
// Close other open ones
document.querySelectorAll('.dropdown.is-open').forEach(d => d.classList.remove('is-open'));
// Toggle current
if (!wasOpen) dropdown.classList.add('is-open');
}
};
});
// Close on escape
document.addEventListener('keydown', (e) => {
if (e.key === 'Escape' && mainNav.classList.contains('active')) toggleMenu();
});
</script>
</body>
</html>

452
templates/config.html Normal file
View file

@ -0,0 +1,452 @@
{% extends "base.html" %}
{% block title %}Configuración{% endblock %}
{% block content %}
<div id="backup-overlay"
style="display:none; position:fixed; top:0; left:0; width:100%; height:100%; background:rgba(0,0,0,0.9); z-index:9999; flex-direction:column; justify-content:center; align-items:center; color:white; padding: 2rem; text-align: center;">
<i class="fas fa-database fa-spin fa-3x" style="margin-bottom:20px; color: var(--accent-color);"></i>
<h2 id="backup-title">Preparando backup...</h2>
<div
style="width: 100%; max-width: 400px; background: #333; border-radius: 10px; height: 10px; margin: 20px 0; overflow: hidden;">
<div id="backup-progress-bar"
style="width: 0%; height: 100%; background: var(--accent-color); transition: width 0.3s;"></div>
</div>
<p id="backup-status-text">Calculando noticias...</p>
<button id="btn-close-backup" onclick="hideBackupLoading()" class="btn btn-secondary"
style="margin-top:20px; display: none;">Cerrar</button>
</div>
<script>
let isBackupRunning = false;
let autoReloadTimer = null;
function startAutoReload() {
if (!isBackupRunning) {
autoReloadTimer = setTimeout(() => location.reload(), 30000);
}
}
function stopAutoReload() {
if (autoReloadTimer) clearTimeout(autoReloadTimer);
}
function startBackup() {
isBackupRunning = true;
stopAutoReload();
document.getElementById('backup-overlay').style.display = 'flex';
document.getElementById('backup-title').innerText = "Iniciando Backup...";
document.getElementById('backup-progress-bar').style.width = '0%';
document.getElementById('btn-close-backup').style.display = 'none';
fetch('/config/backup/start')
.then(r => r.json())
.then(data => {
pollBackupStatus(data.task_id);
})
.catch(err => {
alert("Error al iniciar backup");
hideBackupLoading();
});
}
function pollBackupStatus(taskId) {
fetch(`/config/backup/status/${taskId}`)
.then(r => r.json())
.then(data => {
if (data.status === 'processing' || data.status === 'initializing') {
updateBackupUI(data);
setTimeout(() => pollBackupStatus(taskId), 2000);
} else if (data.status === 'completed') {
updateBackupUI(data);
document.getElementById('backup-title').innerText = "¡Backup Completado!";
document.getElementById('backup-status-text').innerText = "Iniciando descarga...";
window.location.href = `/config/backup/download/${taskId}`;
document.getElementById('btn-close-backup').style.display = 'block';
isBackupRunning = false;
startAutoReload();
} else if (data.status === 'error') {
alert("Error: " + data.error);
hideBackupLoading();
}
})
.catch(() => setTimeout(() => pollBackupStatus(taskId), 5000)); // retry after transient network errors
}
function updateBackupUI(data) {
if (data.total > 0) {
const percent = Math.round((data.progress / data.total) * 100);
document.getElementById('backup-progress-bar').style.width = percent + '%';
document.getElementById('backup-status-text').innerText = `Procesando: ${data.progress.toLocaleString()} / ${data.total.toLocaleString()} (${percent}%)`;
document.getElementById('backup-title').innerText = "Generando archivo ZIP...";
}
}
function hideBackupLoading() {
document.getElementById('backup-overlay').style.display = 'none';
isBackupRunning = false;
startAutoReload();
}
// Initialize auto-reload
startAutoReload();
</script>
<div class="config-page">
<h2><i class="fas fa-cog"></i> Configuración</h2>
<div class="config-grid">
<!-- Translator Card -->
<div class="config-card card-wide"
style="display: flex; align-items: center; justify-content: center; min-height: 120px;">
<a href="{{ url_for('config.translator_config') }}" class="btn btn-dark-outline"
style="font-size: 1.1rem; padding: 1rem 2rem;">
<i class="fas fa-robot"></i> Configurar Modelo
</a>
</div>
<!-- Backup Card -->
<div class="config-card">
<div class="card-header">
<div class="card-icon"><i class="fas fa-file-archive"></i></div>
</div>
<h3>Backup (ZIP)</h3>
<p>Exporta todas las noticias y traducciones en un archivo comprimido (ZIP) para ahorrar espacio.</p>
<button onclick="startBackup()" class="btn btn-dark" id="btn-start-backup">
<i class="fas fa-file-download"></i> Descargar ZIP (con progreso)
</button>
<div style="margin-top:10px; padding-top:10px; border-top:1px solid #eee;">
<small style="display:block; margin-bottom:5px; color:#666;">Metadatos:</small>
<div style="display:flex; gap:10px;">
<a href="{{ url_for('backup.export_paises') }}" class="btn btn-small btn-secondary_outline"
style="font-size:0.8em; padding:5px 10px;">
<i class="fas fa-file-csv"></i> Países
</a>
<a href="{{ url_for('backup.export_categorias') }}" class="btn btn-small btn-secondary_outline"
style="font-size:0.8em; padding:5px 10px;">
<i class="fas fa-file-csv"></i> Categorías
</a>
</div>
</div>
</div>
<!-- Restore Card -->
<div class="config-card">
<div class="card-header">
<div class="card-icon"><i class="fas fa-upload"></i></div>
</div>
<h3>Restaurar</h3>
<p>Importa datos desde un backup en formato <strong>JSON</strong> o <strong>ZIP</strong>.</p>
<a href="{{ url_for('config.restore_noticias') }}" class="btn btn-dark-outline">
<i class="fas fa-upload"></i> Subir Backup
</a>
</div>
</div>
</div>
<style>
.config-page h2 {
margin-bottom: 1.5rem;
display: flex;
align-items: center;
gap: 0.5rem;
font-family: 'Playfair Display', 'Times New Roman', serif;
font-weight: 700;
letter-spacing: -0.02em;
}
/* Stats Banner */
.stats-banner {
display: flex;
align-items: center;
justify-content: center;
gap: 1.5rem;
padding: 1rem 1.5rem;
background: #111;
border-radius: 8px;
margin-bottom: 2rem;
box-shadow: 0 4px 20px rgba(0, 0, 0, 0.2);
}
.stat-item {
text-align: center;
}
.stat-value {
font-size: 1.75rem;
font-weight: 700;
color: #fff;
font-family: 'Poppins', sans-serif;
line-height: 1;
}
.stat-value.stat-warning {
color: #ffc107;
}
.stat-value.stat-processing {
color: #17a2b8;
}
.stat-value.stat-error {
color: #dc3545;
}
.stat-label {
font-size: 0.7rem;
color: rgba(255, 255, 255, 0.6);
text-transform: uppercase;
letter-spacing: 0.05em;
margin-top: 0.25rem;
}
.stat-divider {
width: 1px;
height: 40px;
background: rgba(255, 255, 255, 0.15);
}
/* Cards Grid */
.config-grid {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(280px, 1fr));
gap: 1.5rem;
}
.config-card {
background: var(--card-bg, #fff);
border-radius: 12px;
padding: 1.5rem;
box-shadow: 0 2px 8px rgba(0, 0, 0, 0.08);
display: flex;
flex-direction: column;
gap: 0.75rem;
transition: transform 0.2s, box-shadow 0.2s;
border-top: 3px solid #111;
}
.config-card.card-wide {
grid-column: span 2;
}
@media (max-width: 700px) {
.config-card.card-wide {
grid-column: span 1;
}
}
.config-card:hover {
transform: translateY(-2px);
box-shadow: 0 8px 24px rgba(0, 0, 0, 0.12);
}
.card-header {
display: flex;
justify-content: space-between;
align-items: flex-start;
}
.card-icon {
font-size: 1.75rem;
color: #111;
}
.card-status {
display: flex;
align-items: center;
gap: 0.35rem;
font-size: 0.7rem;
text-transform: uppercase;
letter-spacing: 0.05em;
color: #28a745;
font-weight: 600;
}
.card-status .pulse {
width: 8px;
height: 8px;
background: #28a745;
border-radius: 50%;
animation: pulse 2s infinite;
}
@keyframes pulse {
0%,
100% {
opacity: 1;
transform: scale(1);
}
50% {
opacity: 0.5;
transform: scale(1.2);
}
}
.config-card h3 {
margin: 0;
font-size: 1.25rem;
font-family: 'Playfair Display', 'Times New Roman', serif;
font-weight: 600;
}
.config-card p {
color: var(--text-muted, #666);
font-size: 0.9rem;
margin: 0;
line-height: 1.5;
}
/* Big Stats */
.big-stats {
display: flex;
gap: 2rem;
margin: 1rem 0;
padding: 1rem 0;
border-top: 1px solid var(--border-color, #eee);
border-bottom: 1px solid var(--border-color, #eee);
}
.big-stat {
text-align: center;
flex: 1;
}
.big-stat .big-number {
font-size: 2.5rem;
font-weight: 800;
color: #111;
font-family: 'Poppins', sans-serif;
line-height: 1;
}
.big-stat .big-label {
font-size: 0.75rem;
color: var(--text-muted, #666);
text-transform: uppercase;
letter-spacing: 0.05em;
margin-top: 0.35rem;
}
.big-stat.highlight .big-number {
color: #28a745;
}
.card-values {
display: flex;
flex-wrap: wrap;
gap: 0.5rem;
}
.card-values code {
background: #f4f4f4;
padding: 0.25rem 0.5rem;
border-radius: 4px;
font-size: 0.75rem;
font-family: 'JetBrains Mono', monospace;
}
/* Buttons - Black Theme */
.btn {
display: inline-flex;
align-items: center;
justify-content: center;
gap: 0.5rem;
padding: 0.75rem 1.5rem;
border: none;
border-radius: 6px;
font-size: 0.85rem;
font-weight: 600;
cursor: pointer;
text-decoration: none;
text-transform: uppercase;
letter-spacing: 0.03em;
transition: all 0.2s;
}
.btn:hover {
transform: translateY(-1px);
box-shadow: 0 4px 12px rgba(0, 0, 0, 0.2);
}
.btn-dark {
background: #111;
color: #fff;
}
.btn-dark:hover {
background: #333;
}
.btn-dark-outline {
background: transparent;
color: #111;
border: 2px solid #111;
}
.btn-dark-outline:hover {
background: #111;
color: #fff;
}
/* Dark Mode */
.dark-mode .config-card {
background: var(--card-bg-dark, #1e1e1e);
border-top-color: #fff;
}
.dark-mode .card-icon {
color: #fff;
}
.dark-mode .big-stat .big-number {
color: #fff;
}
.dark-mode .card-values code {
background: #333;
}
.dark-mode .stats-banner {
background: #000;
}
.dark-mode .btn-dark {
background: #fff;
color: #111;
}
.dark-mode .btn-dark-outline {
border-color: #fff;
color: #fff;
}
.dark-mode .btn-dark-outline:hover {
background: #fff;
color: #111;
}
/* Responsive */
@media (max-width: 600px) {
.stats-banner {
flex-wrap: wrap;
gap: 1rem;
}
.stat-divider {
display: none;
}
.stat-item {
flex: 0 0 45%;
}
.big-stats {
flex-direction: column;
gap: 1rem;
}
}
</style>
{% endblock %}

View file

@ -0,0 +1,160 @@
{% extends "base.html" %}
{% block title %}Restaurar Noticias{% endblock %}
{% block content %}
<div class="config-form-page">
<h2><i class="fas fa-upload"></i> Restaurar Noticias</h2>
<div class="restore-warning">
<i class="fas fa-exclamation-triangle"></i>
<p>Esta acción importará noticias y traducciones desde un archivo de backup.
Las noticias existentes con el mismo ID serán actualizadas.</p>
</div>
<form method="POST" enctype="multipart/form-data" class="config-form">
<div class="form-group">
<label for="file">
<i class="fas fa-file-upload"></i> Archivo de Backup (JSON)
</label>
<input type="file" id="file" name="file" required>
<small>Selecciona un archivo JSON generado por el backup</small>
</div>
<div class="form-actions">
<a href="{{ url_for('config.config_home') }}" class="btn btn-secondary">
<i class="fas fa-arrow-left"></i> Cancelar
</a>
<button type="submit" class="btn btn-primary" onclick="showLoading()">
<i class="fas fa-upload"></i> Restaurar Backup
</button>
</div>
</form>
<div id="loading-overlay"
style="display:none; position:fixed; top:0; left:0; width:100%; height:100%; background:rgba(0,0,0,0.8); z-index:9999; flex-direction:column; justify-content:center; align-items:center; color:white;">
<i class="fas fa-spinner fa-spin fa-3x" style="margin-bottom:20px;"></i>
<h2>Restaurando noticias...</h2>
<p>Por favor espere, esto puede tardar unos minutos.</p>
</div>
<script>
function showLoading() {
const fileInput = document.querySelector('input[name="file"]');
if (fileInput && fileInput.files.length > 0) {
document.getElementById('loading-overlay').style.display = 'flex';
}
}
</script>
</div>
<style>
.config-form-page h2 {
margin-bottom: 1.5rem;
display: flex;
align-items: center;
gap: 0.5rem;
}
.restore-warning {
display: flex;
align-items: flex-start;
gap: 1rem;
padding: 1rem;
background: #fff3cd;
border-left: 4px solid #ffc107;
border-radius: 8px;
margin-bottom: 1.5rem;
max-width: 500px;
}
.restore-warning i {
color: #856404;
font-size: 1.25rem;
flex-shrink: 0;
}
.restore-warning p {
margin: 0;
color: #856404;
font-size: 0.9rem;
}
.config-form {
max-width: 500px;
background: var(--card-bg, #fff);
padding: 1.5rem;
border-radius: 12px;
box-shadow: 0 2px 8px rgba(0, 0, 0, 0.08);
}
.form-group {
margin-bottom: 1.25rem;
}
.form-group label {
display: flex;
align-items: center;
gap: 0.5rem;
font-weight: 500;
margin-bottom: 0.5rem;
}
.form-group input[type="file"] {
width: 100%;
padding: 0.6rem;
border: 2px dashed var(--border-color, #ddd);
border-radius: 8px;
background: var(--input-bg, #fafafa);
}
.form-group small {
display: block;
margin-top: 0.35rem;
color: var(--text-muted, #666);
font-size: 0.8rem;
}
.form-actions {
display: flex;
gap: 1rem;
justify-content: flex-end;
margin-top: 1.5rem;
}
.btn {
display: inline-flex;
align-items: center;
gap: 0.5rem;
padding: 0.6rem 1rem;
border: none;
border-radius: 8px;
font-size: 0.9rem;
cursor: pointer;
text-decoration: none;
}
.btn-secondary {
background: #6c757d;
color: #fff;
}
.btn-warning {
background: #ffc107;
color: #212529;
}
.dark-mode .config-form {
background: var(--card-bg-dark, #1e1e1e);
}
.dark-mode .restore-warning {
background: #332701;
border-color: #ffc107;
}
.dark-mode .restore-warning p,
.dark-mode .restore-warning i {
color: #ffc107;
}
</style>
{% endblock %}

View file

@ -0,0 +1,187 @@
{% extends "base.html" %}
{% block title %}Configurar Traductor{% endblock %}
{% block content %}
<div class="config-form-page">
<h2><i class="fas fa-language"></i> Configuración del Traductor</h2>
<form method="POST" class="config-form">
<div class="form-group">
<label for="target_langs">
<i class="fas fa-globe"></i> Idiomas Destino
</label>
<input type="text" id="target_langs" name="target_langs" value="{{ config.target_langs }}"
placeholder="es,en,fr">
<small>Separados por coma. Ej: es,en,fr</small>
</div>
<div class="form-group">
<label for="translator_batch">
<i class="fas fa-layer-group"></i> Tamaño de Batch
</label>
<select id="translator_batch" name="translator_batch">
{% for b in [8, 16, 32, 64, 128] %}
<option value="{{ b }}" {% if config.translator_batch|int==b %}selected{% endif %}>{{ b }}</option>
{% endfor %}
</select>
<small>Número de textos a traducir por lote (8-128)</small>
</div>
<div class="form-group">
<label for="universal_model">
<i class="fas fa-brain"></i> Modelo Universal
</label>
<select id="universal_model" name="universal_model">
{% set models = [
('facebook/nllb-200-distilled-600M', 'NLLB-200 Distilled 600M (Rápido / Default)'),
('facebook/nllb-200-distilled-1.3B', 'NLLB-200 Distilled 1.3B (Mejor Calidad / Lento)'),
('facebook/nllb-200-1.3B', 'NLLB-200 1.3B (Raw / Lento)'),
('facebook/nllb-200-3.3B', 'NLLB-200 3.3B (Máxima Calidad / Muy Lento / Requiere mucha RAM)')
] %}
{% for m_id, m_name in models %}
<option value="{{ m_id }}" {% if config.universal_model==m_id %}selected{% endif %}>{{ m_name }}
</option>
{% endfor %}
</select>
<small>Selecciona el modelo de traducción. Actualmente usando: <strong>{{ config.universal_model
}}</strong></small>
<div class="alert alert-warning" style="margin-top: 10px; font-size: 0.9em; display: none;"
id="model-warning">
<i class="fas fa-exclamation-triangle"></i> <strong>Atención:</strong> Cambiar el modelo eliminará todas
las traducciones existentes para regenerarlas con el nuevo modelo.
</div>
</div>
<script>
document.getElementById('universal_model').addEventListener('change', function () {
var current = "{{ config.universal_model }}";
var warning = document.getElementById('model-warning');
if (this.value !== current) {
warning.style.display = 'block';
} else {
warning.style.display = 'none';
}
});
</script>
<div class="form-group">
<label for="ct2_compute_type">
<i class="fas fa-microchip"></i> Tipo de Cuantización
</label>
<select id="ct2_compute_type" name="ct2_compute_type">
<option value="auto" {% if config.ct2_compute_type=='auto' %}selected{% endif %}>auto</option>
<option value="int8" {% if config.ct2_compute_type=='int8' %}selected{% endif %}>int8 (más rápido, menos
preciso)</option>
<option value="float16" {% if config.ct2_compute_type=='float16' %}selected{% endif %}>float16 (más
preciso)</option>
<option value="int8_float16" {% if config.ct2_compute_type=='int8_float16' %}selected{% endif %}>
int8_float16 (balance)</option>
</select>
<small>Requiere reiniciar el contenedor</small>
</div>
<div class="form-actions">
<a href="{{ url_for('config.config_home') }}" class="btn btn-secondary">
<i class="fas fa-arrow-left"></i> Volver
</a>
<button type="submit" class="btn btn-primary">
<i class="fas fa-save"></i> Guardar
</button>
</div>
</form>
</div>
<style>
.config-form-page h2 {
margin-bottom: 1.5rem;
display: flex;
align-items: center;
gap: 0.5rem;
}
.config-form {
max-width: 500px;
background: var(--card-bg, #fff);
padding: 1.5rem;
border-radius: 12px;
box-shadow: 0 2px 8px rgba(0, 0, 0, 0.08);
}
.form-group {
margin-bottom: 1.25rem;
}
.form-group label {
display: flex;
align-items: center;
gap: 0.5rem;
font-weight: 500;
margin-bottom: 0.5rem;
}
.form-group input,
.form-group select {
width: 100%;
padding: 0.6rem 0.75rem;
border: 1px solid var(--border-color, #ddd);
border-radius: 8px;
font-size: 1rem;
background: var(--input-bg, #fff);
color: var(--text-color, #333);
}
.form-group input:focus,
.form-group select:focus {
outline: none;
border-color: var(--secondary-color, #6c63ff);
box-shadow: 0 0 0 3px rgba(108, 99, 255, 0.15);
}
.form-group small {
display: block;
margin-top: 0.35rem;
color: var(--text-muted, #666);
font-size: 0.8rem;
}
.form-actions {
display: flex;
gap: 1rem;
justify-content: flex-end;
margin-top: 1.5rem;
}
.btn {
display: inline-flex;
align-items: center;
gap: 0.5rem;
padding: 0.6rem 1rem;
border: none;
border-radius: 8px;
font-size: 0.9rem;
cursor: pointer;
text-decoration: none;
}
.btn-primary {
background: var(--secondary-color, #6c63ff);
color: #fff;
}
.btn-secondary {
background: #6c757d;
color: #fff;
}
.dark-mode .config-form {
background: var(--card-bg-dark, #1e1e1e);
}
.dark-mode .form-group input,
.dark-mode .form-group select {
background: #2a2a2a;
border-color: #444;
color: #eee;
}
</style>
{% endblock %}

View file

@ -0,0 +1,163 @@
{% extends "base.html" %}
{% block title %}Timeline: {{ conflict.name }}{% endblock %}
{% block content %}
<div style="margin-bottom: 2rem;">
<div style="display: flex; justify-content: space-between; align-items: center;">
<h2>
<a href="{{ url_for('conflicts.index') }}" style="text-decoration: none; color: inherit;">
<i class="fas fa-arrow-left" style="font-size: 1rem; vertical-align: middle;"></i>
</a>
Timeline: {{ conflict.name }}
</h2>
<span class="badge" style="font-size: 1rem;">{{ noticias|length }} eventos</span>
</div>
<p style="color: #666;">
Keywords: {{ conflict.keywords }}
</p>
</div>
{% if not noticias %}
<div class="card" style="text-align: center; padding: 3rem;">
<p>No se encontraron noticias recientes con las palabras clave especificadas.</p>
</div>
{% else %}
<div class="timeline">
{% for n in noticias %}
<div class="timeline-item">
<div class="timeline-date">
{% if n.fecha %}
<span class="date-day">{{ n.fecha.strftime('%d') }}</span>
<span class="date-month">{{ n.fecha.strftime('%b') }}</span>
<span class="date-year">{{ n.fecha.strftime('%Y') }}</span>
{% endif %}
</div>
<div class="timeline-marker"></div>
<div class="timeline-content card">
{% if n.imagen_url %}
<div class="timeline-img">
<img src="{{ n.imagen_url }}" loading="lazy" onerror="this.style.display='none'">
</div>
{% endif %}
<h3>
<a
href="{{ url_for('noticia.noticia', tr_id=n.tr_id if n.tr_id else None, id=n.id if not n.tr_id else None) }}">
{{ n.titulo }}
</a>
</h3>
<div class="noticia-meta">
{{ n.fuente_nombre }}
{% if n.pais %}| {{ n.pais }}{% endif %}
</div>
<p>{{ (n.resumen or '') | safe_html | truncate(150) }}</p>
</div>
</div>
{% endfor %}
</div>
{% endif %}
<style>
/* Timeline Styles */
.timeline {
position: relative;
max-width: 900px;
margin: 0 auto;
padding: 20px 0;
}
/* Vertical Line */
.timeline::before {
content: '';
position: absolute;
top: 0;
bottom: 0;
left: 80px;
/* Position of line */
width: 2px;
background: var(--border-color);
}
.timeline-item {
position: relative;
margin-bottom: 40px;
padding-left: 120px;
/* Space for date and line */
}
.timeline-date {
position: absolute;
left: 0;
top: 0;
width: 60px;
text-align: center;
line-height: 1.2;
}
.date-day {
display: block;
font-size: 1.5rem;
font-weight: bold;
color: var(--accent-color);
}
.date-month {
display: block;
font-size: 0.9rem;
text-transform: uppercase;
font-weight: bold;
}
.date-year {
display: block;
font-size: 0.8rem;
color: #888;
}
.timeline-marker {
position: absolute;
left: 74px;
/* On the line (80px - 6px radius) */
top: 10px;
width: 14px;
height: 14px;
background: var(--paper-color);
border: 3px solid var(--accent-color);
border-radius: 50%;
z-index: 1;
}
.timeline-content {
position: relative;
padding: 1.5rem;
border-left: 4px solid var(--accent-color);
}
.timeline-content h3 {
font-size: 1.3rem;
margin-top: 0;
}
.timeline-img img {
width: 100%;
height: 150px;
object-fit: cover;
border-radius: 4px;
margin-bottom: 10px;
}
/* Dark Mode Overrides */
.dark-mode .timeline::before {
background: #444;
}
.dark-mode .timeline-marker {
background: #1a1a2e;
}
</style>
{% endblock %}

View file

@ -0,0 +1,118 @@
{% extends "base.html" %}
{% block title %}Conflictos{% endblock %}
{% block content %}
<div class="row">
<div style="margin-bottom: 2rem;">
<h2><i class="fas fa-exclamation-triangle"></i> Conflictos Monitorizados</h2>
<p>Define temas o conflictos para generar líneas de tiempo automáticas basadas en palabras clave.</p>
</div>
<!-- Create Form -->
<div class="card" style="margin-bottom: 2rem;">
<h3>Crear Nuevo Conflicto</h3>
<form action="{{ url_for('conflicts.create') }}" method="POST">
<div class="filter-row">
<div class="filter-group" style="flex: 2;">
<label>Nombre del Conflicto</label>
<input type="text" name="name" placeholder="Ej: Camboya vs Tailandia" required>
</div>
<div class="filter-group" style="flex: 3;">
<label>Palabras Clave (separadas por coma)</label>
<input type="text" name="keywords" placeholder="Ej: Camboya, Tailandia, Preah Vihear" required>
</div>
<div class="filter-group">
<label>&nbsp;</label>
<button type="submit" class="btn btn-primary" style="width: 100%;">Crear</button>
</div>
</div>
<div style="margin-top: 10px;">
<label style="font-weight: 600; font-size: 0.9rem;">Descripción (Opcional)</label>
<input type="text" name="description" style="width: 100%;"
placeholder="Breve descripción del contexto...">
</div>
</form>
</div>
<!-- List -->
<div class="conflicts-grid">
{% for c in conflicts %}
<div class="card conflict-card">
<div class="conflict-header">
<h3>{{ c.name }}</h3>
<form action="{{ url_for('conflicts.delete', id=c.id) }}" method="POST"
onsubmit="return confirm('¿Eliminar este conflicto?');">
<button type="submit" class="btn-icon" title="Eliminar"><i class="fas fa-trash"></i></button>
</form>
</div>
<p class="conflict-desc">{{ c.description or 'Sin descripción' }}</p>
<div class="keyword-tags">
{% for k in c.keywords.split(',') %}
{% if k.strip() %}
<span class="badge">{{ k.strip() }}</span>
{% endif %}
{% endfor %}
</div>
<div style="margin-top: 1rem; text-align: right;">
<a href="{{ url_for('conflicts.timeline', id=c.id) }}" class="btn">
<i class="fas fa-stream"></i> Ver Línea de Tiempo
</a>
</div>
</div>
{% else %}
<p style="text-align: center; color: #666;">No hay conflictos definidos.</p>
{% endfor %}
</div>
</div>
<style>
.conflicts-grid {
display: grid;
grid-template-columns: repeat(auto-fill, minmax(350px, 1fr));
gap: 20px;
}
.conflict-header {
display: flex;
justify-content: space-between;
align-items: flex-start;
margin-bottom: 10px;
}
.conflict-header h3 {
margin: 0;
font-size: 1.4rem;
color: var(--accent-color);
}
.conflict-desc {
color: #666;
font-size: 0.9rem;
margin-bottom: 15px;
min-height: 40px;
}
.keyword-tags {
display: flex;
flex-wrap: wrap;
gap: 5px;
}
.btn-icon {
background: none;
border: none;
color: #999;
cursor: pointer;
padding: 5px;
}
.btn-icon:hover {
color: #e74c3c;
}
.dark-mode .conflict-desc {
color: #aaa;
}
</style>
{% endblock %}

144
templates/dashboard.html Normal file
View file

@ -0,0 +1,144 @@
{% extends "base.html" %}
{% block title %}Dashboard{% endblock %}
{% block content %}
<div class="dashboard-grid">
<div class="stat-card">
<div class="stat-number">{{ stats.feeds_totales }}</div>
<div class="stat-label">Feeds Totales</div>
</div>
<div class="stat-card">
<div class="stat-number">{{ stats.noticias_totales }}</div>
<div class="stat-label">Noticias Totales</div>
</div>
<div class="stat-card">
<div class="stat-number">{{ stats.feeds_caidos }}</div>
<div class="stat-label">Feeds Caídos</div>
</div>
</div>
<div class="row">
<div class="col-md-6 mb-4">
<div class="card">
<div class="card-header">
<h3>Gestión de Feeds RSS</h3>
</div>
<div class="card-body">
<p>
Exporta tu lista de feeds RSS o restaura/importa desde un archivo CSV.
Además, puedes ir al organizador avanzado de feeds para filtrarlos
por país, categoría y estado.
</p>
<div style="display:flex; flex-wrap:wrap; gap:10px; margin-bottom:10px;">
<a href="{{ url_for('feeds.list_feeds') }}" class="btn btn-secondary">
<i class="fas fa-list"></i> Ver / Gestionar Feeds
</a>
</div>
<div style="display:flex; flex-wrap:wrap; gap:8px; margin-bottom:15px;">
<a href="{{ url_for('feeds.list_feeds', estado='activos') }}" class="btn btn-small">
<i class="fas fa-check-circle"></i> Feeds activos
</a>
<a href="{{ url_for('feeds.list_feeds', estado='inactivos') }}" class="btn btn-small btn-danger">
<i class="fas fa-times-circle"></i> Feeds caídos/inactivos
</a>
<a href="{{ url_for('feeds.list_feeds', estado='errores') }}" class="btn btn-small btn-info">
<i class="fas fa-exclamation-triangle"></i> Feeds con errores
</a>
</div>
<hr style="margin: 15px 0; border: 0; border-top: 1px solid var(--border-color);">
<a href="{{ url_for('backup_feeds') }}" class="btn">
<i class="fas fa-download"></i> Exportar Feeds
</a>
<a href="{{ url_for('restore_feeds') }}" class="btn btn-info">
<i class="fas fa-upload"></i> Importar Feeds
</a>
</div>
</div>
</div>
<div class="col-md-6 mb-4">
<div class="card">
<div class="card-header">
<h3>Gestión de Fuentes URL</h3>
</div>
<div class="card-body">
<p>Exporta tu lista de fuentes URL o restaura/importa desde un archivo CSV.</p>
<a href="{{ url_for('backup_urls') }}" class="btn">
<i class="fas fa-download"></i> Exportar URLs
</a>
<a href="{{ url_for('restore_urls') }}" class="btn btn-info">
<i class="fas fa-upload"></i> Importar Fuentes URL
</a>
</div>
</div>
</div>
</div>
<div class="card">
<div class="card-header">
<h3>Operaciones del Sistema</h3>
</div>
<div class="card-body">
<p>Genera o restaura una copia de seguridad completa de todas tus fuentes y noticias.</p>
<div style="display:flex; gap:10px; flex-wrap:wrap;">
<a href="{{ url_for('backup_completo') }}" class="btn btn-secondary">
<i class="fas fa-archive"></i> Backup Completo (.zip)
</a>
<a href="{{ url_for('restore_completo') }}" class="btn btn-info">
<i class="fas fa-upload"></i> Restaurar Backup (.zip)
</a>
</div>
</div>
</div>
{% if top_tags and top_tags|length > 0 %}
<div class="card">
<div class="card-header">
<h3>Top tags (últimas 24h)</h3>
</div>
<div class="card-body" style="padding:0;">
<table style="width:100%; border-collapse: collapse;">
<thead>
<tr style="background-color: rgba(0,0,0,0.05);">
<th style="padding: 12px 15px; text-align: left;">Tag</th>
<th style="padding: 12px 15px; text-align: left;">Tipo</th>
<th style="padding: 12px 15px; text-align: right;">Apariciones</th>
</tr>
</thead>
<tbody>
{% for t in top_tags %}
<tr>
<td style="padding: 12px 15px; border-top: 1px solid var(--border-color);">
{{ t.valor }}
</td>
<td style="padding: 12px 15px; border-top: 1px solid var(--border-color); text-transform: capitalize;">
{{ t.tipo }}
</td>
<td style="padding: 12px 15px; border-top: 1px solid var(--border-color); text-align: right;">
{{ t.apariciones }}
</td>
</tr>
{% endfor %}
</tbody>
</table>
</div>
</div>
{% else %}
<div class="card">
<div class="card-header">
<h3>Top tags (últimas 24h)</h3>
</div>
<div class="card-body">
<p style="color: var(--text-color-light); margin: 0;">No hay tags para mostrar todavía.</p>
</div>
</div>
{% endif %}
{% endblock %}

View file

@ -0,0 +1,409 @@
{% extends "base.html" %}
{% block title %}Descubrir Feeds RSS{% endblock %}
{% block content %}
<div class="card feed-detail-card"
style="padding: 40px; border-radius: 15px; background-color: #fdfdfd; box-shadow: 0 10px 30px rgba(0,0,0,0.05);">
<h1
style="font-family: var(--primary-font); font-weight: 700; margin-bottom: 30px; border-bottom: 2px solid var(--accent-color); display: inline-block; padding-bottom: 10px;">
<i class="fas fa-search"></i> Descubrir Feeds RSS
</h1>
<p style="margin-bottom: 30px; color: #666;">
Ingresa la URL de un sitio web y automáticamente descubriremos todos los feeds RSS disponibles.
</p>
<!-- Loading Overlay -->
<div id="searching-overlay"
style="display: none; position: fixed; top: 0; left: 0; width: 100%; height: 100%; background: rgba(255,255,255,0.9); z-index: 1000; flex-direction: column; align-items: center; justify-content: center;">
<div class="spinner"
style="border: 4px solid #f3f3f3; border-top: 4px solid var(--accent-color); border-radius: 50%; width: 50px; height: 50px; animation: spin 1s linear infinite;">
</div>
<h3 style="margin-top: 20px; color: #333;">Analizando sitio web...</h3>
<p style="color: #666;">Buscando feeds RSS, esto puede tardar unos segundos.</p>
</div>
<!-- Discovery Form -->
<form action="{{ url_for('feeds.discover_feed') }}" method="post" class="form-grid" style="margin-bottom: 40px;"
onsubmit="document.getElementById('searching-overlay').style.display = 'flex';">
<div class="form-row">
<label for="source_url">URL del sitio web</label>
<input type="url" id="source_url" name="source_url" placeholder="https://ejemplo.com"
value="{{ source_url }}" required style="font-size: 16px;">
</div>
<div class="form-row" style="border: none; padding-top: 20px;">
<div></div>
<div class="form-actions">
<button class="btn btn-primary" type="submit">
<i class="fas fa-search"></i> Buscar Feeds
</button>
<a href="{{ url_for('feeds.list_feeds') }}" class="btn btn-secondary">
<i class="fas fa-arrow-left"></i> Volver
</a>
</div>
</div>
</form>
<!-- Results -->
{% if discovered_feeds %}
<hr style="margin: 40px 0; border: none; border-top: 1px solid #e0e0e0;">
{% set new_feeds_count = discovered_feeds | rejectattr('exists') | list | length %}
<h2 style="font-size: 24px; margin-bottom: 20px; color: #333;">
<i class="fas fa-rss"></i> Feeds Disponibles: <strong>{{ new_feeds_count }}</strong> <span
style="font-size: 16px; color: #777; font-weight: normal;">(de {{ discovered_feeds|length }} encontrados en
total)</span>
</h2>
<form action="{{ url_for('feeds.discover_and_add') }}" method="post">
<!-- Global Settings -->
<div class="form-grid"
style="background: #f8f9fa; padding: 20px; border-radius: 10px; margin-bottom: 30px; border: 1px solid #e0e0e0;">
<h3
style="grid-column: 1 / -1; font-size: 16px; margin-bottom: 15px; color: #555; text-transform: uppercase; font-weight: 700; letter-spacing: 0.5px;">
<i class="fas fa-sliders-h"></i> Configuración Masiva
</h3>
<div class="form-row">
<label for="global_categoria_id">Aplicar Categoría a todos:</label>
<div style="display: flex; gap: 10px;">
<select id="global_categoria_id" class="form-control">
<option value="">— Seleccionar —</option>
{% for c in categorias %}
<option value="{{ c.id }}">{{ c.nombre }}</option>
{% endfor %}
</select>
<button type="button" class="btn btn-secondary btn-sm" onclick="applyGlobalCategory()"
title="Aplicar a todos">
<i class="fas fa-arrow-down"></i>
</button>
</div>
</div>
<div class="form-row">
<label for="global_pais_id">Aplicar País a todos:</label>
<div style="display: flex; gap: 10px;">
<select id="global_pais_id" class="form-control">
<option value="">— Seleccionar —</option>
{% for p in paises %}
<option value="{{ p.id }}">{{ p.nombre }}</option>
{% endfor %}
</select>
<button type="button" class="btn btn-secondary btn-sm" onclick="applyGlobalCountry()"
title="Aplicar a todos">
<i class="fas fa-arrow-down"></i>
</button>
</div>
</div>
<div class="form-row">
<label for="global_idioma">Aplicar Idioma a todos:</label>
<div style="display: flex; gap: 10px;">
<input type="text" id="global_idioma" class="form-control" maxlength="5" placeholder="es" value="es"
style="width: 80px;">
<button type="button" class="btn btn-secondary btn-sm" onclick="applyGlobalLanguage()"
title="Aplicar a todos">
<i class="fas fa-arrow-down"></i>
</button>
</div>
</div>
</div>
<!-- Feed List -->
<div style="margin-bottom: 30px;">
{% for feed in discovered_feeds %}
<div class="feed-discovery-item" style="
background: {{ 'white' if feed.valid and not feed.exists else ('#f0f0f0' if feed.exists else '#fff5f5') }};
border: 1px solid {{ '#e0e0e0' if feed.valid else '#ffcdd2' }};
border-left: 5px solid {{ '#4CAF50' if feed.valid and not feed.exists else ('#9e9e9e' if feed.exists else '#ff5252') }};
border-radius: 8px;
padding: 20px;
margin-bottom: 15px;
display: grid;
grid-template-columns: 40px 1fr 280px auto;
gap: 20px;
align-items: start;
transition: all 0.2s ease;
box-shadow: 0 2px 5px rgba(0,0,0,0.05);
opacity: {{ '0.8' if feed.exists else '1' }};
">
<!-- Checkbox -->
<div style="padding-top: 5px; display: flex; justify-content: center;">
{% if feed.exists %}
<i class="fas fa-check-circle" style="color: #4CAF50; font-size: 22px;"
title="Ya existe en la base de datos"></i>
{% elif feed.valid %}
<input type="checkbox" name="selected_feeds" value="{{ feed.url }}" id="feed_{{ loop.index }}"
checked style="width: 22px; height: 22px; cursor: pointer; border-radius: 4px;">
<input type="hidden" name="context_{{ feed.url }}" value="{{ feed.context_label }}">
{% else %}
<i class="fas fa-exclamation-triangle" style="color: #ff5252; font-size: 20px;"
title="{{ feed.error }}"></i>
{% endif %}
</div>
<!-- Feed Info -->
<div>
<label for="feed_{{ loop.index }}" style="cursor: pointer; display: block; margin-bottom: 8px;">
<strong
style="font-size: 18px; color: #{{ '555' if feed.exists else '333' }}; line-height: 1.3;">
{{ feed.title }}
</strong>
{% if feed.exists %}
<span class="badge"
style="background: #e0e0e0; color: #555; font-size: 11px; vertical-align: middle; margin-left: 8px;">YA
INSTALADO</span>
{% endif %}
</label>
{% if feed.description %}
<p style="color: #666; margin-bottom: 12px; font-size: 14px; line-height: 1.5;">
{{ feed.description[:250] }}{% if feed.description|length > 250 %}...{% endif %}
</p>
{% endif %}
<div style="font-size: 13px; color: #888; display: flex; flex-direction: column; gap: 6px;">
<div>
<i class="fas fa-link" style="width: 16px; text-align: center;"></i>
<a href="{{ feed.url }}" target="_blank"
style="color: #888; text-decoration: none; border-bottom: 1px dotted #ccc;">
{{ feed.url[:60] }}{% if feed.url|length > 60 %}...{% endif %}
</a>
</div>
{% if feed.context_label %}
<div style="color: #1976D2; font-weight: 500;">
<i class="fas fa-tag" style="width: 16px; text-align: center;"></i> Encontrado en: "{{
feed.context_label }}"
</div>
{% endif %}
{% if feed.valid %}
<div style="display: flex; gap: 15px; margin-top: 5px;">
{% if feed.type %}
<span class="badge"
style="background: #e3f2fd; color: #1565c0; padding: 2px 8px; border-radius: 4px;">
{{ feed.type|upper }}
</span>
{% endif %}
{% if feed.entry_count is defined %}
<span class="badge"
style="background: #f3e5f5; color: #7b1fa2; padding: 2px 8px; border-radius: 4px;">
{{ feed.entry_count }} items
</span>
{% endif %}
</div>
{% else %}
<div style="color: #d32f2f; margin-top: 5px;">
<i class="fas fa-info-circle"></i> Error: {{ feed.error }}
</div>
{% endif %}
</div>
</div>
<!-- Individual Configurations -->
{% if feed.valid %}
<div style="background: #fdfdfd; padding: 15px; border-radius: 8px; border: 1px solid #eee;">
<div style="margin-bottom: 10px;">
<label
style="font-size: 12px; font-weight: 600; color: #555; display: block; margin-bottom: 4px;">Categoría</label>
<select name="cat_{{ feed.url }}" class="item-category-select"
style="width: 100%; padding: 6px; border: 1px solid #ddd; border-radius: 4px; font-size: 13px;">
<option value="">— Seleccionar —</option>
{% for c in categorias %}
<option value="{{ c.id }}">{{ c.nombre }}</option>
{% endfor %}
</select>
</div>
<div style="margin-bottom: 10px;">
<label
style="font-size: 12px; font-weight: 600; color: #555; display: block; margin-bottom: 4px;">País</label>
<select name="country_{{ feed.url }}" class="item-country-select"
style="width: 100%; padding: 6px; border: 1px solid #ddd; border-radius: 4px; font-size: 13px;">
<option value="">— Seleccionar —</option>
{% for p in paises %}
<option value="{{ p.id }}">{{ p.nombre }}</option>
{% endfor %}
</select>
</div>
<div>
<label
style="font-size: 12px; font-weight: 600; color: #555; display: block; margin-bottom: 4px;">Idioma</label>
<input type="text" name="lang_{{ feed.url }}" class="item-language-input" value="es"
maxlength="5"
style="width: 100%; padding: 6px; border: 1px solid #ddd; border-radius: 4px; font-size: 13px;">
</div>
</div>
{% else %}
<div></div>
{% endif %}
<!-- Actions -->
<div style="display: flex; flex-direction: column; gap: 10px; justify-content: flex-start;">
{% if feed.valid %}
<button type="button" class="btn btn-primary btn-sm" onclick="addSingleFeed('{{ feed.url }}')"
style="white-space: nowrap; box-shadow: 0 2px 5px rgba(0,0,0,0.1);">
<i class="fas fa-plus"></i> Añadir
</button>
<a href="{{ feed.url }}" target="_blank" class="btn btn-outline-secondary btn-sm"
style="white-space: nowrap;">
<i class="fas fa-external-link-alt"></i> Ver XML
</a>
{% endif %}
</div>
</div>
{% endfor %}
</div>
<!-- Action Buttons -->
<div class="form-actions"
style="display: flex; gap: 15px; justify-content: flex-end; padding-top: 20px; border-top: 1px solid #e0e0e0; position: sticky; bottom: 0; background: white; z-index: 10; padding-bottom: 20px;">
<div style="margin-right: auto; align-self: center; color: #666; font-size: 14px;">
<span id="selected_count">{{ discovered_feeds|selectattr('valid')|list|length }}</span> feeds
seleccionados
</div>
<button type="button" class="btn btn-secondary" onclick="toggleAllFeeds(true)">
<i class="fas fa-check-square"></i> Todos
</button>
<button type="button" class="btn btn-secondary" onclick="toggleAllFeeds(false)">
<i class="fas fa-square"></i> Ninguno
</button>
<button class="btn btn-primary" type="submit"
style="padding: 10px 25px; font-weight: 600; box-shadow: 0 4px 10px rgba(0,0,0,0.1);">
<i class="fas fa-plus-circle"></i> AÑADIR SELECCIONADOS
</button>
</div>
</form>
{% endif %}
</div>
<script>
function toggleAllFeeds(select) {
const checkboxes = document.querySelectorAll('input[name="selected_feeds"]');
checkboxes.forEach(cb => {
cb.checked = select;
});
updateCount();
}
function addSingleFeed(btn, url) {
// Collect the per-feed values for this row
const cat = document.querySelector(`select[name="cat_${url}"]`).value;
const country = document.querySelector(`select[name="country_${url}"]`).value;
const lang = document.querySelector(`input[name="lang_${url}"]`).value;
const contextInput = document.querySelector(`input[name="context_${url}"]`);
const context = contextInput ? contextInput.value : '';
const formData = new FormData();
formData.append('selected_feeds', url);
formData.append(`cat_${url}`, cat);
formData.append(`country_${url}`, country);
formData.append(`lang_${url}`, lang);
if (context) formData.append(`context_${url}`, context);
const originalContent = btn.innerHTML;
btn.innerHTML = '<i class="fas fa-spinner fa-spin"></i> ...';
btn.disabled = true;
fetch('{{ url_for("feeds.discover_and_add") }}', {
method: 'POST',
headers: {
'X-Requested-With': 'XMLHttpRequest'
},
body: formData
})
.then(response => response.json())
.then(data => {
if (data.success) {
btn.innerHTML = '<i class="fas fa-check"></i> Añadido';
btn.classList.remove('btn-primary');
btn.classList.add('btn-success');
// Disable inputs for this row
document.querySelector(`select[name="cat_${url}"]`).disabled = true;
document.querySelector(`select[name="country_${url}"]`).disabled = true;
document.querySelector(`input[name="lang_${url}"]`).disabled = true;
} else {
btn.innerHTML = '<i class="fas fa-times"></i> Error';
btn.classList.remove('btn-primary');
btn.classList.add('btn-danger');
alert('No se pudo añadir: ' + (data.errors ? data.errors.join(', ') : 'Error desconocido'));
setTimeout(() => {
btn.innerHTML = originalContent;
btn.disabled = false;
btn.classList.remove('btn-danger');
btn.classList.add('btn-primary');
}, 3000);
}
})
.catch(error => {
console.error('Error:', error);
btn.innerHTML = originalContent;
btn.disabled = false;
alert('Error de conexión');
});
}
function updateCount() {
const count = document.querySelectorAll('input[name="selected_feeds"]:checked').length;
document.getElementById('selected_count').innerText = count;
}
// Update count on individual clicks
document.addEventListener('change', function (e) {
if (e.target.name === 'selected_feeds') {
updateCount();
}
});
// Mass Update Functions
function applyGlobalCategory() {
const val = document.getElementById('global_categoria_id').value;
document.querySelectorAll('.item-category-select').forEach(el => el.value = val);
}
function applyGlobalCountry() {
const val = document.getElementById('global_pais_id').value;
document.querySelectorAll('.item-country-select').forEach(el => el.value = val);
}
function applyGlobalLanguage() {
const val = document.getElementById('global_idioma').value;
document.querySelectorAll('.item-language-input').forEach(el => el.value = val);
}
// Add hover effect to feed items
document.addEventListener('DOMContentLoaded', function () {
const feedItems = document.querySelectorAll('.feed-discovery-item');
feedItems.forEach(item => {
item.addEventListener('mouseenter', function () {
this.style.boxShadow = '0 8px 20px rgba(0,0,0,0.08)';
this.style.transform = 'translateY(-2px)';
});
item.addEventListener('mouseleave', function () {
this.style.boxShadow = '0 2px 5px rgba(0,0,0,0.05)';
this.style.transform = 'translateY(0)';
});
});
updateCount();
});
</script>
<style>
.feed-discovery-item input[type="checkbox"] {
accent-color: var(--accent-color, #4CAF50);
}
.form-control {
border: 1px solid #ddd;
border-radius: 4px;
padding: 8px;
font-size: 14px;
}
.btn-sm {
padding: 5px 10px;
font-size: 12px;
}
</style>
{% endblock %}

87
templates/edit_feed.html Normal file
View file

@ -0,0 +1,87 @@
{% extends "base.html" %}
{% block title %}Editar Feed RSS{% endblock %}
{% block content %}
<div class="card feed-detail-card"
style="padding: 40px; border-radius: 15px; background-color: #fdfdfd; box-shadow: 0 10px 30px rgba(0,0,0,0.05);">
<h1
style="font-family: var(--primary-font); font-weight: 700; margin-bottom: 30px; border-bottom: 2px solid var(--accent-color); display: inline-block; padding-bottom: 10px;">
Editar Feed
</h1>
<p class="subtitle" style="margin-bottom: 30px; font-style: italic; color: #666;">
Modificando fuente: <strong>{{ feed.nombre }}</strong>
</p>
<form method="post" action="{{ url_for('feeds.edit_feed', feed_id=feed.id) }}" autocomplete="off" class="form-grid">
<div class="form-row">
<label for="nombre">Nombre del feed</label>
<input id="nombre" name="nombre" type="text" value="{{ feed.nombre }}" required>
</div>
<div class="form-row">
<label for="descripcion">Descripción</label>
<textarea id="descripcion" name="descripcion" rows="3">{{ feed.descripcion or '' }}</textarea>
</div>
<div class="form-row">
<label for="url">URL del RSS</label>
<input id="url" name="url" type="url" value="{{ feed.url }}" required>
</div>
<div class="form-row">
<label for="categoria_id">Categoría</label>
<select id="categoria_id" name="categoria_id">
<option value="">— Sin categoría —</option>
{% for c in categorias %}
<option value="{{ c.id }}" {% if c.id==feed.categoria_id %}selected{% endif %}>
{{ c.nombre }}
</option>
{% endfor %}
</select>
</div>
<div class="form-row">
<label for="pais_id">País</label>
<select id="pais_id" name="pais_id">
<option value="">— Global —</option>
{% for p in paises %}
<option value="{{ p.id }}" {% if p.id==feed.pais_id %}selected{% endif %}>
{{ p.nombre }}
</option>
{% endfor %}
</select>
</div>
<div class="form-row">
<label for="idioma">Idioma (2 letras)</label>
<input id="idioma" name="idioma" type="text" value="{{ feed.idioma or '' }}" maxlength="2">
</div>
<div class="form-row">
<div></div> <!-- Alignment -->
<div style="display: flex; align-items: center; gap: 15px;">
<input type="checkbox" id="activo" name="activo" {% if feed.activo %}checked{% endif %}
style="width: 24px; height: 24px; margin: 0;">
<label for="activo" style="margin: 0; text-align: left; font-size: 1.1rem;">Feed activo</label>
</div>
</div>
<div class="form-row" style="border: none; padding-top: 20px;">
<div></div> <!-- Alignment -->
<div class="form-actions">
<button class="btn btn-primary" type="submit">
<i class="fas fa-save"></i> Guardar Cambios
</button>
<a href="{{ url_for('feeds.list_feeds') }}" class="btn btn-secondary">
<i class="fas fa-times"></i> Cancelar
</a>
</div>
</div>
</form>
</div>
<div style="margin-top: 20px;">
<a href="{{ url_for('feeds.list_feeds') }}" class="top-link">← Volver al listado</a>
</div>
{% endblock %}

View file

@ -0,0 +1,48 @@
{% extends "base.html" %}
{% block title %}Editar Fuente URL{% endblock %}
{% block content %}
<h1>Editar Fuente: {{ fuente.nombre }}</h1>
<div class="card">
<form action="{{ url_for('edit_url_source', url_id=fuente.id) }}" method="post">
<label for="nombre">Nombre</label>
<input type="text" id="nombre" name="nombre" value="{{ fuente.nombre }}" required>
<label for="url" style="margin-top:15px;">URL</label>
<input type="url" id="url" name="url" value="{{ fuente.url }}" required>
<label for="categoria_id" style="margin-top:15px;">Categoría</label>
<select id="categoria_id" name="categoria_id">
<option value="">— Sin categoría —</option>
{% for c in categorias %}
<option value="{{ c.id }}" {% if c.id == fuente.categoria_id %}selected{% endif %}>
{{ c.nombre }}
</option>
{% endfor %}
</select>
<label for="pais_id" style="margin-top:15px;">País</label>
<select id="pais_id" name="pais_id">
<option value="">— Sin país —</option>
{% for p in paises %}
<option value="{{ p.id }}" {% if p.id == fuente.pais_id %}selected{% endif %}>
{{ p.nombre }}
</option>
{% endfor %}
</select>
<label for="idioma" style="margin-top:15px;">Idioma (2 letras)</label>
<input id="idioma" name="idioma" value="{{ fuente.idioma }}" maxlength="2" required>
<div style="display:flex;justify-content:end;gap:10px;margin-top:20px;">
<a href="{{ url_for('manage_urls') }}" class="btn btn-secondary">Cancelar</a>
<button class="btn" type="submit">Actualizar</button>
</div>
</form>
</div>
{% endblock %}

114
templates/favoritos.html Normal file
View file

@ -0,0 +1,114 @@
{% extends "base.html" %}
{% block title %}Mis Favoritos{% endblock %}
{% block content %}
<div class="favoritos-page">
<h2><i class="fas fa-star"></i> Mis Favoritos</h2>
{% if noticias %}
<p class="favoritos-count">{{ noticias|length }} noticia{{ 's' if noticias|length > 1 else '' }} guardada{{ 's' if
noticias|length > 1 else '' }}</p>
<ul class="noticias-list">
{% for n in noticias %}
<li class="noticia-item">
{% if n.imagen_url %}
<div class="noticia-imagen">
<img src="{{ n.imagen_url }}" loading="lazy" onerror="this.parentElement.style.display='none'">
</div>
{% endif %}
<div class="noticia-texto">
<h3 class="m0">
<a href="{{ url_for('noticia.noticia', id=n.id) }}">
{{ n.titulo_trad or n.titulo }}
</a>
</h3>
<div class="noticia-meta">
{% if n.fecha %}
<i class="far fa-calendar-alt"></i>
{{ n.fecha.strftime('%d-%m-%Y %H:%M') if n.fecha else '' }}
{% endif %}
{% if n.fuente_nombre %} | {{ n.fuente_nombre }}{% endif %}
{% if n.pais %} | {{ n.pais|country_flag }} {{ n.pais }}{% endif %}
</div>
<p class="noticia-resumen">{{ (n.resumen_trad or n.resumen or '')[:200] }}...</p>
<button class="btn-remove-fav" onclick="removeFavorite('{{ n.id }}', this)" title="Quitar de favoritos">
<i class="fas fa-trash"></i> Quitar
</button>
</div>
</li>
{% endfor %}
</ul>
{% else %}
<div class="empty-state">
<i class="far fa-star"></i>
<p>No tienes noticias guardadas.</p>
<a href="{{ url_for('home.home') }}" class="btn btn-dark">Ver noticias</a>
</div>
{% endif %}
</div>
<style>
.favoritos-page h2 {
display: flex;
align-items: center;
gap: 0.5rem;
margin-bottom: 0.5rem;
}
.favoritos-count {
color: var(--text-muted, #666);
margin-bottom: 1.5rem;
}
.empty-state {
text-align: center;
padding: 3rem;
color: var(--text-muted, #666);
}
.empty-state i {
font-size: 3rem;
margin-bottom: 1rem;
opacity: 0.5;
}
.btn-remove-fav {
margin-top: 0.5rem;
padding: 0.4rem 0.8rem;
background: transparent;
border: 1px solid #dc3545;
color: #dc3545;
border-radius: 4px;
cursor: pointer;
font-size: 0.8rem;
transition: all 0.2s;
}
.btn-remove-fav:hover {
background: #dc3545;
color: #fff;
}
</style>
<script>
async function removeFavorite(noticiaId, btn) {
    try {
        const response = await fetch(`/favoritos/toggle/${noticiaId}`, { method: 'POST' });
        if (!response.ok) {
            throw new Error(`HTTP ${response.status}`);
        }
        btn.closest('.noticia-item').remove();
        // Update the counter; reload when the list empties so the empty state renders
        const remaining = document.querySelectorAll('.noticia-item').length;
        if (remaining === 0) {
            location.reload();
        } else {
            document.querySelector('.favoritos-count').textContent =
                `${remaining} noticia${remaining > 1 ? 's' : ''} guardada${remaining > 1 ? 's' : ''}`;
        }
    } catch (error) {
        console.error('Error removing favorite:', error);
        alert('No se pudo quitar el favorito.');
    }
}
</script>
{% endblock %}
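
removeFavorite() above only requires the endpoint to answer with a 2xx status. A minimal sketch of what /favoritos/toggle/<id> could look like; the real blueprint and per-user storage live elsewhere in this commit, and the in-memory set below is purely illustrative:

# Sketch under stated assumptions; _favs stands in for a favourites table.
from flask import Blueprint, jsonify

favoritos = Blueprint("favoritos", __name__)
_favs = set()  # illustrative placeholder, not real per-user storage

@favoritos.route("/favoritos/toggle/<int:noticia_id>", methods=["POST"])
def toggle_favorito(noticia_id):
    # Toggle semantics: a second POST for the same id un-favourites it,
    # which is why the template can reuse the endpoint for "Quitar".
    if noticia_id in _favs:
        _favs.discard(noticia_id)
        return jsonify(favorito=False)
    _favs.add(noticia_id)
    return jsonify(favorito=True)

The template also pipes n.pais through a country_flag filter defined elsewhere in the repo. One common way to build such a filter, assuming the value can be reduced to an ISO 3166-1 alpha-2 code (the tiny name table below is illustrative only):

# Sketch: the repo's actual filter may differ; NAME_TO_CODE is made up.
NAME_TO_CODE = {"España": "ES", "Francia": "FR", "Alemania": "DE"}

def country_flag(value):
    code = NAME_TO_CODE.get(value, value or "")
    if len(code) != 2 or not code.isalpha():
        return ""
    # Each letter maps to a Unicode regional-indicator symbol;
    # a pair of them renders as the country's flag emoji.
    return "".join(chr(0x1F1E6 + ord(c) - ord("A")) for c in code.upper())

# Registered on the app elsewhere, e.g.:
# app.add_template_filter(country_flag, "country_flag")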

151
templates/feeds_list.html Normal file
View file

@@ -0,0 +1,151 @@
{% extends "base.html" %}
{% block title %}Gestionar Feeds RSS{% endblock %}
{% block content %}
<div class="card feed-detail-card">
<div class="feed-header">
<h2>Lista de Feeds RSS</h2>
<div class="nav-actions" style="display:flex; gap:8px; align-items:center;">
<!-- 🔵 Export feeds as CSV (with the active filters applied) -->
<a href="#" id="export-btn" class="btn btn-small btn-secondary" onclick="exportFilteredFeeds(event)">
<i class="fas fa-download"></i> Exportar Feeds
</a>
<!-- 🟣 Import feeds from CSV -->
<a href="{{ url_for('backup.restore_feeds') }}" class="btn btn-small btn-secondary">
<i class="fas fa-upload"></i> Importar Feeds
</a>
<!-- 🟢 Add a feed -->
<a href="{{ url_for('feeds.add_feed') }}" class="btn btn-small">
<i class="fas fa-plus"></i> Añadir Feed
</a>
</div>
</div>
<!-- Advanced filters -->
<div class="feed-body" style="padding: 15px 15px 0 15px;">
<form class="feed-filters" method="get" action="{{ url_for('feeds.list_feeds') }}" id="filter-form">
<div class="filter-row">
<div class="filter-group">
<label for="pais_id">País</label>
<select name="pais_id" id="pais_id" onchange="reloadTable()">
<option value="">Todos los países</option>
{% for p in paises %}
<option value="{{ p.id }}" {% if filtro_pais_id is not none and p.id==filtro_pais_id|int
%}selected{% endif %}>
{{ p.nombre }}
</option>
{% endfor %}
</select>
</div>
<div class="filter-group">
<label for="categoria_id">Categoría</label>
<select name="categoria_id" id="categoria_id" onchange="reloadTable()">
<option value="">Todas las categorías</option>
{% for c in categorias %}
<option value="{{ c.id }}" {% if filtro_categoria_id is not none and
c.id==filtro_categoria_id|int %}selected{% endif %}>
{{ c.nombre }}
</option>
{% endfor %}
</select>
</div>
<div class="filter-group">
<label for="estado">Estado</label>
<select name="estado" id="estado" onchange="reloadTable()">
<option value="" {% if not filtro_estado %}selected{% endif %}>Todos</option>
<option value="activos" {% if filtro_estado=="activos" %}selected{% endif %}>Activos</option>
<option value="inactivos" {% if filtro_estado=="inactivos" %}selected{% endif %}>Inactivos
</option>
<option value="errores" {% if filtro_estado=="errores" %}selected{% endif %}>Con errores
</option>
</select>
</div>
<div class="filter-group" style="flex: 0 0 auto; display:flex; gap:10px; align-self: flex-end;">
<!-- No submit button needed: the selects reload the table via onchange; keep this link to clear filters (and as a no-JS fallback) -->
<a href="{{ url_for('feeds.list_feeds') }}" class="btn btn-secondary">
Limpiar
</a>
</div>
</div>
</form>
</div>
<!-- Container for dynamic table -->
<div id="table-container">
{% include '_feeds_table.html' %}
</div>
</div>
<script>
async function reloadTable(urlOverride) {
const form = document.getElementById('filter-form');
const container = document.getElementById('table-container');
// Visual indicator
container.style.opacity = '0.5';
let url;
if (urlOverride) {
url = urlOverride;
} else {
const formData = new FormData(form);
const params = new URLSearchParams(formData);
url = `${form.action}?${params.toString()}`;
}
try {
    const response = await fetch(url, {
        headers: { 'X-Requested-With': 'XMLHttpRequest' }
    });
    if (!response.ok) {
        throw new Error(`HTTP ${response.status}`);
    }
    const html = await response.text();
    container.innerHTML = html;
    // Update the URL bar without a full reload so filtered views stay shareable
    window.history.pushState({}, '', url);
} catch (error) {
console.error('Error reloading table:', error);
alert('Error al actualizar la lista.');
} finally {
container.style.opacity = '1';
}
}
function handlePageClick(event, url) {
event.preventDefault();
reloadTable(url);
}
function exportFilteredFeeds(event) {
event.preventDefault();
// Capture the current filter values
const paisId = document.getElementById('pais_id').value;
const categoriaId = document.getElementById('categoria_id').value;
const estado = document.getElementById('estado').value;
// Build the export URL with query parameters
const params = new URLSearchParams();
if (paisId) params.append('pais_id', paisId);
if (categoriaId) params.append('categoria_id', categoriaId);
if (estado) params.append('estado', estado);
const exportUrl = `/export_feeds_filtered?${params.toString()}`;
// Navigate to trigger the download
window.location.href = exportUrl;
}
</script>
{% endblock %}
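
Both scripts above encode a contract with server code that this hunk does not show: reloadTable() expects only the _feeds_table.html partial back when the request carries X-Requested-With: XMLHttpRequest, and exportFilteredFeeds() expects /export_feeds_filtered to honour the same three filter parameters. A sketch of both views under those assumptions, with query_feeds() as a hypothetical stand-in for the real filtered query:

# Sketch under stated assumptions: query_feeds(), the CSV columns and the
# app wiring are hypothetical; the header check and the filter names come
# from the template above.
import csv
import io
from flask import Flask, Response, render_template, request

app = Flask(__name__)

def query_feeds(pais_id=None, categoria_id=None, estado=None):
    """Stand-in for the real filtered SELECT (assumption); returns dicts."""
    return []

def _filters():
    return {
        "pais_id": request.args.get("pais_id") or None,
        "categoria_id": request.args.get("categoria_id") or None,
        "estado": request.args.get("estado") or None,
    }

@app.route("/feeds")
def list_feeds():
    rows = query_feeds(**_filters())
    if request.headers.get("X-Requested-With") == "XMLHttpRequest":
        # AJAX path: return only the fragment that reloadTable()
        # swaps into #table-container.
        return render_template("_feeds_table.html", feeds=rows)
    return render_template("feeds_list.html", feeds=rows)

@app.route("/export_feeds_filtered")
def export_feeds_filtered():
    buf = io.StringIO()
    writer = csv.writer(buf)
    writer.writerow(["nombre", "url", "idioma", "activo"])  # illustrative columns
    for feed in query_feeds(**_filters()):
        writer.writerow([feed.get("nombre"), feed.get("url"),
                         feed.get("idioma"), feed.get("activo")])
    return Response(
        buf.getvalue(),
        mimetype="text/csv",
        headers={"Content-Disposition": "attachment; filename=feeds.csv"},
    )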

Some files were not shown because too many files have changed in this diff.