FLUJOS/FLUJOS_DATOS/NOTICIAS/main_noticias.py
CAPITANSITO a40b946163 Initial commit - FLUJOS codebase (production branch)
Includes: FLUJOS app (Node/Flask/Python), FLUJOS_DATOS scripts (scrapers, Keras, Django)
Excludes: MongoDB, scraped data, Wikipedia/WikiLeaks dumps, Python venv, node_modules
2026-03-31 14:10:02 +02:00

625 lines
22 KiB
Python
Executable file

from deep_translator import GoogleTranslator
from deep_translator import GoogleTranslator
import os
import re
import hashlib
import requests
import json
import time
import logging
from requests_html import HTMLSession
from bs4 import BeautifulSoup
from PyPDF2 import PdfReader
import csv
import docx
import openpyxl
import zipfile
import html2text
from transformers import BertTokenizer
from tqdm import tqdm
from urllib.parse import urlparse, urljoin
# Logging configuration: emit INFO-level messages to the terminal.
logging.basicConfig(level=logging.INFO)
# Shared Spanish BERT (BETO) tokenizer used by every tokenization helper below.
tokenizer = BertTokenizer.from_pretrained('dccuchile/bert-base-spanish-wwm-cased')
# Spanish stopwords removed by clean_text(): articles, pronouns, prepositions
# and the conjugations of "estar".  Three empty-string entries present in the
# original list were dropped — str.split() never yields "", so they were dead.
STOPWORDS = {
    "de", "la", "que", "el", "en", "y", "a", "los", "del", "se", "las", "por",
    "un", "para", "con", "no", "una", "su", "al", "es", "lo", "como", "más",
    "pero", "sus", "le", "ya", "o", "fue", "este", "ha", "porque",
    "esta", "son", "entre", "cuando", "muy", "sin", "sobre", "también", "me",
    "hasta", "hay", "donde", "quien", "desde", "todo", "nos", "durante",
    "todos", "uno", "les", "ni", "contra", "otros", "ese", "eso", "ante",
    "ellos", "e", "esto", "antes", "algunos", "qué", "unos", "yo",
    "otro", "otras", "otra", "él", "tanto", "esa", "estos", "mucho",
    "quienes", "nada", "muchos", "cual", "poco", "ella", "estar", "estas",
    "algunas", "algo", "nosotros", "mi", "mis", "te", "ti", "tu",
    "tus", "ellas", "nosotras", "vosotros", "vosotras", "os", "mío", "mía",
    "míos", "mías", "tuyo", "tuya", "tuyos", "tuyas", "suyo", "suya",
    "suyos", "suyas", "nuestro", "nuestra", "nuestros", "nuestras",
    "vuestro", "vuestra", "vuestros", "vuestras", "esos", "esas", "estoy",
    "estás", "está", "estamos", "estáis", "están", "esté", "estés",
    "estemos", "estéis", "estén", "estaré", "estarás", "estará",
    "estaremos", "estaréis", "estarán", "estaría", "estarías",
    "estaríamos", "estaríais", "estarían", "estaba", "estabas",
    "estábamos", "estabais", "estaban", "estuve", "estuviste", "estuvo",
    "estuvimos", "estuvisteis", "estuvieron", "estuviera", "estuvieras",
    "estuviéramos", "estuvierais", "estuvieran", "estuviese",
    "estuvieses", "estuviésemos", "estuvieseis", "estuviesen", "estando",
    "estado", "estada", "estados", "estadas", "estad",
}
def translate_text(text):
    """
    Translate *text* to Spanish with deep-translator.

    The Google endpoint used by deep-translator rejects payloads larger
    than ~5000 characters, so long article bodies are translated in
    chunks and re-joined.  On any failure the original text is returned
    unchanged (best effort, never raises).
    """
    max_chunk = 4500  # stay safely under the ~5000-char API limit
    try:
        translator = GoogleTranslator(source='auto', target='es')
        if len(text) <= max_chunk:
            return translator.translate(text)
        pieces = []
        for start in range(0, len(text), max_chunk):
            pieces.append(translator.translate(text[start:start + max_chunk]))
        # translate() may return None for empty/whitespace chunks; drop those.
        return ' '.join(piece for piece in pieces if piece)
    except Exception as e:
        logging.error(f"Error al traducir con deep-translator: {e}")
        return text
def clean_text(text):
    """
    Normalize raw article text for tokenization.

    Pipeline: strip CDATA wrappers (tolerating extra whitespace inside
    the marker), drop HTML markup, lowercase, remove URLs, keep only
    letters (including Spanish accented ones) and spaces, collapse
    whitespace, and finally filter out STOPWORDS.
    """
    # 1) Remove any CDATA variant, e.g. '<![ CDATA [ ... ]]>'.
    without_cdata = re.sub(r'<\!\[\s*CDATA\s*\[.*?\]\]>', '', text, flags=re.S)
    # 2) Strip HTML tags, 3) lowercase.
    plain = BeautifulSoup(without_cdata, 'html.parser').get_text(separator=" ").lower()
    # 4) Drop URLs.
    plain = re.sub(r'http\S+', '', plain)
    # 5) Keep only letters (with Spanish diacritics) and whitespace.
    plain = re.sub(r'[^a-záéíóúñü\s]', '', plain)
    # 6) Collapse runs of whitespace.
    plain = re.sub(r'\s+', ' ', plain).strip()
    # 7) Remove stopwords.
    return ' '.join(word for word in plain.split() if word not in STOPWORDS)
def tokenize_and_save(text, filename, destination_folder):
    """
    Encode *text* with the shared BERT tokenizer and write the resulting
    token ids, space separated, to *destination_folder*/*filename*.
    """
    # BERT accepts at most 512 tokens; longer inputs are truncated.
    token_ids = tokenizer.encode(
        text,
        truncation=True,
        max_length=512,
        add_special_tokens=True,
    )
    os.makedirs(destination_folder, exist_ok=True)
    # The incoming filename is deliberately used unchanged for the output.
    output_path = os.path.join(destination_folder, filename)
    with open(output_path, 'w', encoding='utf-8') as out_file:
        out_file.write(' '.join(str(token_id) for token_id in token_ids))
def tokenize_all_articles(articles_folder, destination_folder):
    """
    Tokenize every .txt article found under *articles_folder* (recursively)
    into *destination_folder*, logging counts and total input size.
    """
    os.makedirs(destination_folder, exist_ok=True)
    logging.info("Iniciando proceso de tokenización...")
    article_count = 0
    byte_count = 0
    for root, _dirs, filenames in os.walk(articles_folder):
        for name in filenames:
            if not name.endswith('.txt'):
                continue
            path = os.path.join(root, name)
            with open(path, 'r', encoding='utf-8') as handle:
                tokenize_and_save(handle.read(), name, destination_folder)
            article_count += 1
            byte_count += os.path.getsize(path)
    total_size_mb = byte_count / (1024 * 1024)
    logging.info(f"Tokenización completada para {article_count} artículos.")
    logging.info(f"Tamaño total de artículos tokenizados: {total_size_mb:.2f} MB.")
def read_pdf(pdf_path):
    """
    Extract the text of every page of the PDF at *pdf_path*.

    Pages with no extractable text are skipped.  Returns whatever was
    accumulated (possibly '') if reading fails part-way; never raises.
    """
    extracted_parts = []
    try:
        with open(pdf_path, 'rb') as handle:
            for page in PdfReader(handle).pages:
                page_text = page.extract_text()
                if page_text:
                    extracted_parts.append(page_text + '\n')
    except Exception as e:
        logging.error(f"Error al leer PDF {pdf_path}: {e}")
    return ''.join(extracted_parts)
def read_csv(csv_path):
    """
    Read the CSV at *csv_path* and return its cells as plain text.

    Each row becomes one output line with its cells joined by single
    spaces.  Returns '' (or partial content) on failure; never raises.
    """
    content = ''
    try:
        # newline='' is required by the csv module so quoted fields that
        # contain embedded newlines are parsed correctly.
        with open(csv_path, 'r', encoding='utf-8', newline='') as f:
            reader = csv.reader(f)
            for row in reader:
                content += ' '.join(row) + '\n'
    except Exception as e:
        logging.error(f"Error al leer CSV {csv_path}: {e}")
    return content
def read_docx(docx_path):
    """
    Return the paragraph text of the DOCX at *docx_path*, one paragraph
    per line.  Returns '' on failure; never raises.
    """
    text = ''
    try:
        for paragraph in docx.Document(docx_path).paragraphs:
            text += paragraph.text + '\n'
    except Exception as e:
        logging.error(f"Error al leer DOCX {docx_path}: {e}")
    return text
def read_xlsx(xlsx_path):
    """
    Return every cell of every sheet of the workbook at *xlsx_path* as
    plain text: one line per row, cells joined by spaces (empty string
    for blank cells).  Returns '' on failure; never raises.
    """
    content = ''
    try:
        workbook = openpyxl.load_workbook(xlsx_path)
        for sheet_name in workbook.sheetnames:
            worksheet = workbook[sheet_name]
            for row in worksheet.iter_rows():
                cell_texts = [str(cell.value) if cell.value is not None else '' for cell in row]
                content += ' '.join(cell_texts) + '\n'
    except Exception as e:
        logging.error(f"Error al leer XLSX {xlsx_path}: {e}")
    return content
def read_zip(zip_path):
    """
    Concatenate the contents of every member of the ZIP at *zip_path*,
    decoded as UTF-8 with undecodable bytes dropped, one member per
    line.  Returns '' (or partial content) on failure; never raises.
    """
    content = ''
    try:
        with zipfile.ZipFile(zip_path, 'r') as archive:
            for member_name in archive.namelist():
                with archive.open(member_name) as member:
                    content += member.read().decode('utf-8', errors='ignore') + '\n'
    except Exception as e:
        logging.error(f"Error al leer ZIP {zip_path}: {e}")
    return content
def read_html_md(file_path):
    """
    Return the raw text of the HTML or Markdown file at *file_path*,
    decoding as UTF-8 with replacement characters for invalid bytes.
    Returns '' on failure; never raises.
    """
    try:
        with open(file_path, 'r', encoding='utf-8', errors='replace') as handle:
            return handle.read()
    except Exception as e:
        logging.error(f"Error al leer HTML/MD {file_path}: {e}")
        return ''
def format_content(html_content):
    """
    Convert *html_content* (an HTML string) to plain text, dropping
    hyperlinks and images from the output.
    """
    converter = html2text.HTML2Text()
    converter.ignore_links = True
    converter.ignore_images = True
    return converter.handle(html_content)
def process_files(files_folder, destination_folder):
    """
    Walk *files_folder*, extract text from every supported file
    (PDF/CSV/TXT/DOCX/XLSX/ZIP/HTML/MD), translate + clean it, and
    tokenize the result into *destination_folder*.  Unsupported formats
    are logged and skipped; totals are logged at the end.
    """
    os.makedirs(destination_folder, exist_ok=True)
    logging.info("Procesando archivos descargados...")
    # Extension -> extractor for the formats handled by a plain reader.
    readers = {
        '.pdf': read_pdf,
        '.csv': read_csv,
        '.docx': read_docx,
        '.xlsx': read_xlsx,
        '.zip': read_zip,
    }
    processed_count = 0
    processed_bytes = 0
    for root, _dirs, filenames in os.walk(files_folder):
        for name in filenames:
            path = os.path.join(root, name)
            text = ''
            if name.endswith('.txt'):
                try:
                    with open(path, 'r', encoding='utf-8') as handle:
                        text = handle.read()
                except Exception as e:
                    logging.error(f"Error al leer TXT {path}: {e}")
            elif name.endswith('.html') or name.endswith('.md'):
                # HTML/Markdown additionally goes through html2text.
                text = format_content(read_html_md(path))
            else:
                for suffix, reader in readers.items():
                    if name.endswith(suffix):
                        text = reader(path)
                        break
                else:
                    logging.info(f"Formato de archivo no soportado: {name}")
                    continue
            if text:
                cleaned = clean_text(translate_text(text))
                tokenize_and_save(cleaned, name, destination_folder)
                processed_count += 1
                processed_bytes += os.path.getsize(path)
    total_size_mb = processed_bytes / (1024 * 1024)
    logging.info(f"Procesamiento completado para {processed_count} archivos.")
    logging.info(f"Tamaño total de archivos procesados: {total_size_mb:.2f} MB.")
def download_and_save_file(url, destination_folder):
    """
    Stream the resource at *url* into *destination_folder*.

    The output filename is the sanitized last URL segment, with a
    generic fallback when the URL ends in '/'.  Non-200 responses and
    network errors are logged, never raised.
    """
    try:
        logging.info(f"Descargando archivo: {url}")
        response = requests.get(url, stream=True, timeout=30)
        if response.status_code != 200:
            logging.info(f"Error al descargar {url}: Código de estado {response.status_code}")
            return
        filename = clean_filename(url.split('/')[-1]) or 'archivo_descargado'
        file_path = os.path.join(destination_folder, filename)
        with open(file_path, 'wb') as out_file:
            # 8 KiB chunks keep memory flat for arbitrarily large files.
            for chunk in response.iter_content(chunk_size=8192):
                out_file.write(chunk)
        logging.info(f"Archivo descargado: {file_path}")
    except Exception as e:
        logging.error(f"Error al descargar {url}: {e}")
def extract_and_save_article(url, articles_folder):
    """
    Fetch *url*, concatenate the text of its <p> elements, translate and
    clean it, and save the result as a .txt file in *articles_folder*.

    The filename comes from the page <title>, or from the last URL path
    segment when there is no title.  All failures are logged, never
    raised.
    """
    try:
        logging.info(f"Extrayendo artículo: {url}")
        response = requests.get(url, timeout=30)
        if response.status_code != 200:
            logging.info(f"Error al acceder a {url}: Código de estado {response.status_code}")
            return
        soup = BeautifulSoup(response.content, 'html.parser')
        title_tag = soup.find('title')
        title = title_tag.get_text().strip() if title_tag else None
        body = ' '.join(paragraph.get_text() for paragraph in soup.find_all('p'))
        if not body.strip():
            logging.info(f"No se encontró contenido en {url}")
            return
        cleaned_text = clean_text(translate_text(body))
        if title:
            filename = clean_filename(title) + '.txt'
        else:
            filename = clean_filename(urlparse(url).path.split('/')[-1]) + '.txt'
        file_path = os.path.join(articles_folder, filename)
        with open(file_path, 'w', encoding='utf-8') as out_file:
            out_file.write(cleaned_text)
        logging.info(f"Artículo guardado: {file_path}")
    except Exception as e:
        logging.error(f"Error al extraer artículo de {url}: {e}")
def get_page_title(url):
    """
    Return the stripped <title> text of the page at *url*, or None when
    the request fails, the status is not 200, or there is no title.
    """
    try:
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            return None
        title_tag = BeautifulSoup(response.content, 'html.parser').find('title')
        return title_tag.get_text().strip() if title_tag else None
    except Exception as e:
        logging.error(f"Error al obtener el título de la página {url}: {e}")
        return None
def clean_filename(name):
    """
    Sanitize *name* for use as a filename.

    Characters forbidden on common filesystems are replaced with '_',
    whitespace runs collapse to a single '_', and the result is capped
    at 100 characters.  A None input yields the placeholder
    'sin_nombre'.
    """
    if name is None:
        return 'sin_nombre'
    sanitized = re.sub(r'[\\/*?:"<>|]', "_", name)
    sanitized = re.sub(r'\s+', '_', sanitized)
    return sanitized[:100]
def register_processed_notifications(base_folder, urls):
    """
    Filter *urls* down to the ones not seen before and record them.

    The registry is ``processed_articles.txt`` inside *base_folder*, one
    URL per line.  New URLs are appended so subsequent runs skip them.

    Args:
        base_folder: Directory holding the registry file (created if missing).
        urls: Candidate URLs; input order is preserved in the result.

    Returns:
        The subset of *urls* not yet present in the registry.
    """
    os.makedirs(base_folder, exist_ok=True)
    txt_path = os.path.join(base_folder, "processed_articles.txt")
    processed_urls = set()
    if os.path.exists(txt_path):
        # Explicit UTF-8: URLs may contain non-ASCII and the platform
        # default encoding is not reliable (e.g. cp1252 on Windows).
        with open(txt_path, 'r', encoding='utf-8') as f:
            processed_urls = set(f.read().splitlines())
    urls_to_process = [url for url in urls if url not in processed_urls]
    with open(txt_path, 'a', encoding='utf-8') as f:
        for url in urls_to_process:
            f.write(url + "\n")
    if processed_urls:
        logging.info(f"Artículos ya procesados: {len(processed_urls)}")
    else:
        logging.info("No hay artículos procesados previamente.")
    return urls_to_process
def explore_wayback_machine(url, articles_folder):
    """
    Query the Wayback Machine availability API for *url* and, when a
    snapshot exists, extract the archived page like a regular article.
    Failures are logged, never raised.
    """
    try:
        logging.info(f"Explorando Wayback Machine para: {url}")
        api_url = f"http://archive.org/wayback/available?url={url}"
        data = requests.get(api_url, timeout=10).json()
        snapshots = data['archived_snapshots'] if 'archived_snapshots' in data else {}
        if 'closest' in snapshots:
            archive_url = snapshots['closest']['url']
            logging.info(f"Descargando desde Wayback Machine: {archive_url}")
            extract_and_save_article(archive_url, articles_folder)
        else:
            logging.info(f"No se encontró versión archivada para {url}")
    except Exception as e:
        logging.error(f"Error al explorar Wayback Machine para {url}: {e}")
def get_folder_info(path):
    """
    Return ``(total_bytes, file_count)`` for every regular file found
    recursively under *path*.
    """
    size_bytes = 0
    file_count = 0
    for dirpath, _dirnames, filenames in os.walk(path):
        for name in filenames:
            size_bytes += os.path.getsize(os.path.join(dirpath, name))
            file_count += 1
    return size_bytes, file_count
def explore_and_extract_articles(url, articles_folder, files_folder, processed_urls, size_limit, depth=0, max_depth=6):
    """
    Recursively crawl *url*, saving articles and downloadable files.

    The page is rendered with requests-html (JavaScript executed) and
    every absolute link found is either downloaded (known file
    extension), skipped (mailto:/tel:), or treated as an article and
    crawled further, up to *max_depth* levels.  *processed_urls* is a
    set shared across the whole crawl to avoid revisiting links.

    Fix vs. previous version: the *size_limit* check now runs at the top
    of every call (i.e. before each page is fetched and before each
    recursive descent) instead of only after all links of a page were
    already processed, so the crawl actually stops near the budget
    instead of overshooting it arbitrarily.
    """
    if depth > max_depth:
        return
    # Enforce the disk budget BEFORE doing any more network work.
    total_size_articles, _ = get_folder_info(articles_folder)
    total_size_files, _ = get_folder_info(files_folder)
    if total_size_articles + total_size_files >= size_limit:
        logging.info("Se ha alcanzado el límite de tamaño de 50 GB. Deteniendo exploración.")
        return
    logging.info(f"Explorando {url} en profundidad {depth}...")
    try:
        session = HTMLSession()
        response = session.get(url, timeout=30)
        # Render JavaScript so dynamically inserted links are captured.
        response.html.render(timeout=30, sleep=1)
        links = response.html.absolute_links
        session.close()
    except Exception as e:
        logging.error(f"Error al acceder a {url}: {e}")
        return
    for link in links:
        if link in processed_urls:
            continue
        processed_urls.add(link)
        parsed_link = urlparse(link)
        file_extension = os.path.splitext(parsed_link.path)[1].lower()
        if file_extension in ['.pdf', '.csv', '.txt', '.xlsx', '.docx', '.html', '.md', '.zip']:
            download_and_save_file(link, files_folder)
        elif 'mailto:' in link or 'tel:' in link:
            continue
        else:
            extract_and_save_article(link, articles_folder)
            explore_and_extract_articles(link, articles_folder, files_folder, processed_urls, size_limit, depth + 1, max_depth)
def main():
    """
    Entry point: crawl the configured news sites, archive their articles
    and downloadable files under the NOTICIAS base folder, then tokenize
    everything and log a summary.
    """
    logging.info("Función: main")
    urls = [
        'https://reactionary.international/database/',
        'https://aleph.occrp.org/',
        'https://offshoreleaks.icij.org/',
        'https://www.publico.es/',
        'https://www.elsaltodiario.com/',
        'https://www.nytimes.com/',
        'https://www.theguardian.com/',
        'https://www.lemonde.fr/',
        'https://www.spiegel.de/',
        'https://elpais.com/',
        'https://www.repubblica.it/',
        'https://www.scmp.com/',
        'https://www.smh.com.au/',
        'https://www.globo.com/',
        'https://timesofindia.indiatimes.com/',
        'https://www.asahi.com/',
        'https://www.washingtonpost.com/',
        'https://www.aljazeera.com/',
        'https://www.folha.uol.com.br/',
        'https://www.telegraph.co.uk/',
        'https://www.corriere.it/',
        'https://www.clarin.com/',
        'https://www.eluniversal.com.mx/',
        'https://www.welt.de/',
        'https://www.lanacion.com.ar/',
        'https://www.bbc.com/',
        'https://www.elconfidencial.com/',
        'https://www.expansion.com/',
        'https://www.lavanguardia.com/',
        'https://www.elperiodico.com/',
        'https://www.abc.es/',
        'https://www.elespanol.com/',
        'https://www.lainformacion.com/',
        'https://www.elcorreo.com/',
        'https://www.canarias7.es/',
        'https://www.diariovasco.com/',
        'https://www.farodevigo.es/',
        'https://www.lavozdegalicia.es/',
        'https://www.marca.com/',
        'https://www.mundodeportivo.com/',
        'https://www.elmundo.es/',
        'https://www.cnbc.com/',
        'https://www.bloomberg.com/',
        'https://www.forbes.com/',
        'https://www.economist.com/',
        'https://www.ft.com/',
        'https://www.wsj.com/',
        'https://www.technologyreview.com/',
        'https://www.cyberdefensemagazine.com/',
        'https://www.securityweek.com/',
        'https://www.darkreading.com/',
        'https://www.infosecurity-magazine.com/',
        'https://www.helpnetsecurity.com/',
        'https://www.computerweekly.com/',
        'https://www.csoonline.com/',
        'https://www.zdnet.com/',
        'https://www.itpro.co.uk/',
        'https://www.theregister.com/',
        'https://www.datacenterdynamics.com/',
        'https://www.scmagazine.com/',
        'https://www.teiss.co.uk/',
        'https://www.tripwire.com/',
        'https://www.infoworld.com/',
        'https://www.cnet.com/',
        'https://www.tomsguide.com/',
        'https://www.theverge.com/',
        'https://www.arstechnica.com/',
        'https://www.engadget.com/',
        'https://www.gizmodo.com/',
        'https://www.wired.com/',
        'https://www.vice.com/',
        'https://www.politico.com/',
        'https://www.theatlantic.com/',
        'https://www.newyorker.com/',
        'https://www.rollingstone.com/',
        'https://www.thedailybeast.com/',
        'https://www.salon.com/',
        'https://www.slate.com/',
        'https://www.huffpost.com/',
        'https://www.vox.com/',
        'https://www.bbc.co.uk/news',
        'https://www.dailymail.co.uk/home/index.html',
        'https://www.independent.co.uk/',
        'https://www.irishtimes.com/',
        'https://www.thejournal.ie/',
        'https://www.thetimes.co.uk/',
        'https://www.thesun.co.uk/',
        'https://www.telegraph.co.uk/',
        'https://www.euronews.com/',
        'https://www.reuters.com/',
        'https://www.dw.com/',
        'https://www.france24.com/',
        'https://www.lefigaro.fr/',
        'https://www.lemonde.fr/',
        'https://www.derstandard.at/',
        'https://www.nzz.ch/',
        'https://www.eldiario.es/',
        'https://www.rtve.es/',
        'https://www.rt.com/',
        'https://www.elciudadano.com/',
        'https://www.apnews.com/',
        'https://www.univision.com/',
        'https://www.televisa.com/',
        'https://www.bbc.com/',
        'https://www.cnn.com/',
        'https://www.foxnews.com/',
        'https://www.aljazeera.com/',
        'https://www.trtworld.com/',
        'https://www.newsweek.com/',
        'https://www.time.com/',
        'https://www.spectator.co.uk/'
    ]
    # The literal above contains duplicates (telegraph.co.uk, lemonde.fr,
    # aljazeera.com, bbc.com); deduplicate order-preservingly so a first
    # run does not crawl the same site twice.
    urls = list(dict.fromkeys(urls))
    # NOTE(review): deployment-specific absolute path — consider making
    # this configurable (env var / CLI argument).
    base_folder = '/var/www/theflows.net/flujos/FLUJOS_DATOS/NOTICIAS'
    articles_folder = os.path.join(base_folder, 'articulos')
    files_folder = os.path.join(base_folder, 'archivos')
    tokenized_folder = os.path.join(base_folder, 'tokenized')
    for folder in (articles_folder, files_folder, tokenized_folder):
        os.makedirs(folder, exist_ok=True)
    FOLDER_SIZE_LIMIT = 50 * 1024 * 1024 * 1024  # 50 GB disk budget for the crawl
    urls_to_process = register_processed_notifications(base_folder, urls)
    processed_urls = set()  # shared across the whole crawl to avoid revisits
    for url in urls_to_process:
        logging.info(f"\nProcesando URL: {url}")
        explore_and_extract_articles(url, articles_folder, files_folder, processed_urls, FOLDER_SIZE_LIMIT)
        explore_wayback_machine(url, articles_folder)
    process_files(files_folder, tokenized_folder)
    tokenize_all_articles(articles_folder, tokenized_folder)
    total_size_articles, total_files_articles = get_folder_info(articles_folder)
    total_size_files, total_files_files = get_folder_info(files_folder)
    total_size_tokenized, total_files_tokenized = get_folder_info(tokenized_folder)
    logging.info("\nResumen del proceso:")
    logging.info(f"Artículos descargados: {total_files_articles}")
    logging.info(f"Tamaño total de artículos: {total_size_articles / (1024 * 1024):.2f} MB")
    logging.info(f"Archivos descargados: {total_files_files}")
    logging.info(f"Tamaño total de archivos: {total_size_files / (1024 * 1024):.2f} MB")
    logging.info(f"Archivos tokenizados: {total_files_tokenized}")
    logging.info(f"Tamaño total de archivos tokenizados: {total_size_tokenized / (1024 * 1024):.2f} MB.")


if __name__ == "__main__":
    main()