rss2/scripts/download_models.py
2026-01-13 13:39:51 +01:00

99 lines
3.4 KiB
Python

import logging
import ssl
import nltk
import os
import urllib.request
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# ================================================================
# Logging
# ================================================================
# Root logger: INFO and above, each record prefixed with a timestamp and its level.
logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s", level=logging.INFO)

# Named logger used by every step of this script.
LOG = logging.getLogger("download_models")
# ================================================================
# SSL FIX
# ================================================================
try:
_create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
pass
else:
ssl._create_default_https_context = _create_unverified_https_context
# ================================================================
# Paths and models
# ================================================================
# NLTK resources fetched by download_nltk().
NLTK_PACKAGES = [
    "punkt",
    "punkt_tab",
    "stopwords",
]

# Model identifier passed to the transformers from_pretrained() loaders.
NLLB_MODEL = "facebook/nllb-200-distilled-600M"

# Source URL of the fastText language-identification model.
FASTTEXT_URL = "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.218.bin"

# Local destination of the fastText model: where the worker expects it.
FASTTEXT_DEST = "/app/models/lid.218.bin"
# ================================================================
# Download NLTK
# ================================================================
def download_nltk() -> bool:
    """Ensure every package in NLTK_PACKAGES is available locally.

    Each package is looked up under ``tokenizers/<pkg>`` when its name starts
    with ``punkt`` and under ``corpora/<pkg>`` otherwise; packages that are
    not found are downloaded.

    Returns:
        True if every package was already present or downloaded successfully,
        False if at least one download reported failure.
    """
    all_ok = True
    for pkg in NLTK_PACKAGES:
        resource = f"tokenizers/{pkg}" if pkg.startswith("punkt") else f"corpora/{pkg}"
        try:
            nltk.data.find(resource)
        except LookupError:
            pass  # Not installed yet: fall through to the download below.
        else:
            LOG.info("NLTK '%s' already installed", pkg)
            continue

        LOG.info("Downloading NLTK '%s'...", pkg)
        # nltk.download() signals failure through its return value instead of
        # raising, so the result must be checked before reporting success.
        if nltk.download(pkg, quiet=True):
            LOG.info("Downloaded OK: %s", pkg)
        else:
            LOG.error("Failed downloading NLTK '%s'", pkg)
            all_ok = False
    return all_ok
# ================================================================
# Download NLLB
# ================================================================
def download_nllb(model_name: str) -> bool:
    """Load the tokenizer and seq2seq model for ``model_name``.

    Both ``from_pretrained`` results are discarded; the calls are made only
    for their effect of fetching the files (presumably into the local Hugging
    Face cache -- confirm against the worker's cache configuration).

    Args:
        model_name: Model identifier passed to ``from_pretrained``.

    Returns:
        True if both loads succeeded, False if either raised. Failures are
        logged with their traceback and never propagated.
    """
    LOG.info("Downloading NLLB model: %s", model_name)
    try:
        AutoTokenizer.from_pretrained(model_name)
        AutoModelForSeq2SeqLM.from_pretrained(model_name)
    except Exception as e:
        # Best-effort step: keep the script running, but record the traceback
        # and tell the caller that this step did not succeed.
        LOG.exception(f"Failed downloading NLLB model {model_name}: {e}")
        return False
    LOG.info("Downloaded OK: %s", model_name)
    return True
# ================================================================
# Download fastText LID.218
# ================================================================
def download_fasttext() -> bool:
    """Download the fastText LID model to FASTTEXT_DEST unless it already exists.

    The file is first written to ``FASTTEXT_DEST + ".part"`` and only renamed
    to its final name after the transfer finishes. Without this, an
    interrupted download would leave a truncated file at FASTTEXT_DEST that
    the "already exists" shortcut would accept on every later run.

    Returns:
        True if the model is present at FASTTEXT_DEST when the function
        returns, False if the download failed. Download errors are logged and
        never propagated.
    """
    # Create the destination directory if it does not exist.
    dest_dir = os.path.dirname(FASTTEXT_DEST)
    if dest_dir:  # os.makedirs("") raises, so skip it for a bare filename.
        os.makedirs(dest_dir, exist_ok=True)

    # If the model is already there, do not download it again.
    if os.path.exists(FASTTEXT_DEST):
        LOG.info(f"fastText LID already exists at {FASTTEXT_DEST}")
        return True

    LOG.info(f"Downloading fastText LID model from {FASTTEXT_URL}")
    partial_path = FASTTEXT_DEST + ".part"
    try:
        urllib.request.urlretrieve(FASTTEXT_URL, partial_path)
        # Same-directory rename: the final path only ever holds a complete file.
        os.replace(partial_path, FASTTEXT_DEST)
    except Exception as e:
        LOG.error(f"Failed to download fastText LID model: {e}")
        try:
            os.remove(partial_path)
        except OSError:
            pass  # Nothing was written, or it cannot be removed; nothing more to do.
        return False
    LOG.info(f"Downloaded fastText LID model to {FASTTEXT_DEST}")
    return True
# ================================================================
# Main
# ================================================================
if __name__ == "__main__":
    # (label used in log messages, step function, positional arguments)
    steps = (
        ("NLTK data", download_nltk, ()),
        ("NLLB model", download_nllb, (NLLB_MODEL,)),
        ("fastText LID model", download_fasttext, ()),
    )

    failed = []
    for label, step, args in steps:
        LOG.info("Downloading %s...", label)
        # A step reports failure by returning False; any other value,
        # including None from a step that reports no status, counts as
        # success. A failed step does not prevent the remaining ones from running.
        if step(*args) is False:
            failed.append(label)

    # Only claim success when no step reported a failure. The process exit
    # status is deliberately left unchanged.
    if failed:
        LOG.error("Downloads finished with failures: %s", ", ".join(failed))
    else:
        LOG.info("All downloads completed successfully.")