go integration and wikipedia

This commit is contained in:
jlimolina 2026-03-28 18:30:07 +01:00
parent 47a252e339
commit ee90335b92
7828 changed files with 1307913 additions and 20807 deletions

View file

@ -1,67 +1,50 @@
# Base image for every following instruction.
FROM python:3.11-slim
# CUDA or CPU: build argument read by the conditional install steps in this file
# (defaults to cu121).
ARG TORCH_CUDA=cu121
# Relative paths in later COPY/RUN/CMD instructions resolve against /app.
WORKDIR /app
# --------------------------------------------------------
# System dependencies
# --------------------------------------------------------
# Installs OS packages without recommended extras and removes the apt package
# lists in the same layer.
# NOTE(review): this listing looks like a diff with removed and added lines
# interleaved; the one-package-per-line list and the single-line
# "libpq-dev gcc git curl" list both appear here (libpq-dev, gcc and git twice).
# Confirm which list the committed file actually keeps.
RUN apt-get update && apt-get install -y --no-install-recommends \
libpq-dev \
gcc \
git \
libcairo2 \
libpango-1.0-0 \
libpangocairo-1.0-0 \
libgdk-pixbuf-2.0-0 \
libffi-dev \
shared-mime-info \
libpq-dev gcc git curl \
&& rm -rf /var/lib/apt/lists/*
# Environment defaults persisted into the image: unbuffered Python output,
# pip version check disabled, tokenizers parallelism disabled, Hugging Face
# symlink warning disabled, and HF_HOME set to /root/.cache/huggingface.
ENV PYTHONUNBUFFERED=1 \
PIP_DISABLE_PIP_VERSION_CHECK=1 \
TOKENIZERS_PARALLELISM=false \
HF_HUB_DISABLE_SYMLINKS_WARNING=1 \
HF_HOME=/root/.cache/huggingface
# --------------------------------------------------------
# Install requirements
# --------------------------------------------------------
# Only the manifest is copied here, ahead of the rest of the project sources.
COPY requirements.txt .
# NOTE(review): pip is upgraded by two consecutive instructions (the first also
# upgrades setuptools and wheel). This is presumably the old and new side of a
# diff shown together; confirm that only one of them is in the real file.
RUN python -m pip install --no-cache-dir --upgrade pip setuptools wheel
RUN pip install --no-cache-dir --upgrade pip
# Install PyTorch for GPU or CPU.
# When TORCH_CUDA is "cu121" the wheels come from the cu121 index; any other
# value falls back to the CPU index. Both branches pin the same versions.
RUN if [ "$TORCH_CUDA" = "cu121" ]; then \
pip install --no-cache-dir --index-url https://download.pytorch.org/whl/cu121 \
torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1 ; \
else \
pip install --no-cache-dir --index-url https://download.pytorch.org/whl/cpu \
torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1 ; \
fi
# NOTE(review): this unconditional install pins torch 2.1.0 / torchvision 0.16.0
# from the cu121 index, ignores TORCH_CUDA, and conflicts with the 2.4.1 / 0.19.1
# pins above. Presumably one of the two is the removed side of a diff; confirm
# which PyTorch version is intended.
RUN pip install --no-cache-dir torch==2.1.0 torchvision==0.16.0 --index-url https://download.pytorch.org/whl/cu121
# Install the packages listed in the requirements.txt copied earlier.
RUN pip install --no-cache-dir -r requirements.txt
# Install additional packages directly. Only transformers, protobuf and numpy
# carry a version constraint here; the rest resolve to whatever pip selects at
# build time.
# NOTE(review): it is not visible from here whether these overlap with
# requirements.txt — confirm.
RUN pip install --no-cache-dir \
ctranslate2 \
sentencepiece \
transformers==4.44.0 \
protobuf==3.20.3 \
"numpy<2" \
psycopg2-binary \
redis \
requests \
beautifulsoup4 \
lxml \
langdetect \
nltk \
scikit-learn \
pandas \
sentence-transformers \
spacy
# Install ctranslate2 with CUDA support.
# NOTE(review): both branches run the identical command, so the value of
# TORCH_CUDA makes no difference to this step.
RUN if [ "$TORCH_CUDA" = "cu121" ]; then \
pip install --no-cache-dir ctranslate2 ; \
else \
pip install --no-cache-dir ctranslate2 ; \
fi
# Download the es_core_news_lg spaCy model; a failure here fails the build.
RUN python -m spacy download es_core_news_lg
# Download the Spanish spaCy model (es_core_news_md).
# "|| true" forces a zero exit status, so the build continues even when this
# download fails and the model is missing from the image.
RUN python -m spacy download es_core_news_md || true
# Copy selected project paths into the working directory.
COPY workers/ ./workers/
COPY init-db/ ./init-db/
COPY migrations/ ./migrations/
COPY entity_config.json .
# --------------------------------------------------------
# Copy the WHOLE rss2/ project
# --------------------------------------------------------
# NOTE(review): this copies the entire build context and would normally also
# cover the paths copied individually above. Presumably the selective copies
# and this one are different sides of a diff; confirm which is intended.
COPY . .
# --------------------------------------------------------
# May download NLLB or Sentence-BERT models, if the script exists
# --------------------------------------------------------
# "|| true" forces a zero exit status: a missing script or a failed download
# does not stop the build.
RUN python download_models.py || true
# Documents port 8000 as the container's listening port.
EXPOSE 8000
# Default database connection settings baked into the image environment.
# NOTE(review): DB_PASS is set through ENV, so it is stored in the image
# configuration. Confirm that "x" is only a placeholder overridden at runtime.
ENV DB_HOST=db
ENV DB_PORT=5432
ENV DB_NAME=rss
ENV DB_USER=rss
ENV DB_PASS=x
# Default command (exec form): run the workers.embeddings_worker module.
CMD ["python", "-m", "workers.embeddings_worker"]