"""Pipeline that crawls news sites, downloads articles/files, translates them to
Spanish, cleans the text, and tokenizes it with a Spanish BERT tokenizer."""

from deep_translator import GoogleTranslator
import os
import re
import hashlib
import requests
import json
import time
import logging
from requests_html import HTMLSession
from bs4 import BeautifulSoup
from PyPDF2 import PdfReader
import csv
import docx
import openpyxl
import zipfile
import html2text
from transformers import BertTokenizer
from tqdm import tqdm
from urllib.parse import urlparse, urljoin

# Logging configuration so progress is visible in the terminal.
logging.basicConfig(level=logging.INFO)

# Spanish BERT tokenizer (downloaded from the Hugging Face hub at import time).
tokenizer = BertTokenizer.from_pretrained('dccuchile/bert-base-spanish-wwm-cased')

# Spanish stopwords removed during text cleaning.
STOPWORDS = set([
    "de", "la", "que", "el", "en", "y", "a", "los", "del", "se", "las", "por",
    "un", "para", "con", "no", "una", "su", "al", "es", "lo", "como", "más",
    "pero", "sus", "le", "ya", "o", "fue", "este", "ha", "sí", "porque",
    "esta", "son", "entre", "cuando", "muy", "sin", "sobre", "también", "me",
    "hasta", "hay", "donde", "quien", "desde", "todo", "nos", "durante",
    "todos", "uno", "les", "ni", "contra", "otros", "ese", "eso", "ante",
    "ellos", "e", "esto", "mí", "antes", "algunos", "qué", "unos", "yo",
    "otro", "otras", "otra", "él", "tanto", "esa", "estos", "mucho",
    "quienes", "nada", "muchos", "cual", "poco", "ella", "estar", "estas",
    "algunas", "algo", "nosotros", "mi", "mis", "tú", "te", "ti", "tu",
    "tus", "ellas", "nosotras", "vosotros", "vosotras", "os", "mío", "mía",
    "míos", "mías", "tuyo", "tuya", "tuyos", "tuyas", "suyo", "suya",
    "suyos", "suyas", "nuestro", "nuestra", "nuestros", "nuestras",
    "vuestro", "vuestra", "vuestros", "vuestras", "esos", "esas", "estoy",
    "estás", "está", "estamos", "estáis", "están", "esté", "estés",
    "estemos", "estéis", "estén", "estaré", "estarás", "estará",
    "estaremos", "estaréis", "estarán", "estaría", "estarías",
    "estaríamos", "estaríais", "estarían", "estaba", "estabas",
    "estábamos", "estabais", "estaban", "estuve", "estuviste", "estuvo",
    "estuvimos", "estuvisteis", "estuvieron", "estuviera", "estuvieras",
    "estuviéramos", "estuvierais", "estuvieran", "estuviese", "estuvieses",
    "estuviésemos", "estuvieseis", "estuviesen", "estando", "estado",
    "estada", "estados", "estadas", "estad"
])

# The Google Translate backend rejects requests longer than ~5000 characters;
# stay safely under that per request.
_TRANSLATE_CHUNK_SIZE = 4500


def translate_text(text):
    """Translate ``text`` into Spanish with deep-translator.

    Long inputs are split into chunks of at most ``_TRANSLATE_CHUNK_SIZE``
    characters because the underlying Google Translate endpoint rejects
    requests over ~5000 characters. On any failure the original text is
    returned unchanged (best-effort behavior).
    """
    try:
        translator = GoogleTranslator(source='auto', target='es')
        if len(text) <= _TRANSLATE_CHUNK_SIZE:
            return translator.translate(text)
        chunks = [
            text[i:i + _TRANSLATE_CHUNK_SIZE]
            for i in range(0, len(text), _TRANSLATE_CHUNK_SIZE)
        ]
        # translate() may return None for empty/whitespace chunks.
        return ' '.join(translator.translate(chunk) or '' for chunk in chunks)
    except Exception as e:
        logging.error(f"Error al traducir con deep-translator: {e}")
        return text


def clean_text(text):
    """Clean text: strip CDATA blocks (even with extra whitespace), then HTML,
    punctuation and Spanish stopwords.
    """
    # 1) Remove any CDATA variant (e.g. '<![ CDATA [...]]>').
    text = re.sub(r'<\!\[\s*CDATA\s*\[.*?\]\]>', '', text, flags=re.S)
    # 2) Parse HTML and keep only the visible text.
    soup = BeautifulSoup(text, 'html.parser')
    text = soup.get_text(separator=" ")
    # 3) Lowercase.
    text = text.lower()
    # 4) Remove URLs.
    text = re.sub(r'http\S+', '', text)
    # 5) Keep only letters (incl. Spanish accented chars) and whitespace.
    text = re.sub(r'[^a-záéíóúñü\s]', '', text)
    # 6) Collapse runs of whitespace.
    text = re.sub(r'\s+', ' ', text).strip()
    # 7) Drop stopwords.
    words = text.split()
    filtered = [w for w in words if w not in STOPWORDS]
    return ' '.join(filtered)


def tokenize_and_save(text, filename, destination_folder):
    """Tokenize ``text`` with the BERT tokenizer and save the token ids
    (space-separated) to ``destination_folder/filename``.
    """
    tokens = tokenizer.encode(
        text,
        truncation=True,
        max_length=512,
        add_special_tokens=True
    )
    tokens_str = ' '.join(map(str, tokens))
    # Make sure the destination directory exists.
    os.makedirs(destination_folder, exist_ok=True)
    # ``filename`` is used as-is for the output file.
    out_path = os.path.join(destination_folder, filename)
    with open(out_path, 'w', encoding='utf-8') as f:
        f.write(tokens_str)


def tokenize_all_articles(articles_folder, destination_folder):
    """Tokenize every ``.txt`` article under ``articles_folder`` into
    ``destination_folder``, logging totals at the end.
    """
    os.makedirs(destination_folder, exist_ok=True)
    logging.info("Iniciando proceso de tokenización...")
    total_articles = 0
    total_size = 0
    for root, dirs, files in os.walk(articles_folder):
        for file in files:
            if file.endswith('.txt'):
                file_path = os.path.join(root, file)
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
                tokenize_and_save(content, file, destination_folder)
                total_articles += 1
                total_size += os.path.getsize(file_path)
    total_size_mb = total_size / (1024 * 1024)
    logging.info(f"Tokenización completada para {total_articles} artículos.")
    logging.info(f"Tamaño total de artículos tokenizados: {total_size_mb:.2f} MB.")


def read_pdf(pdf_path):
    """Extract and return text from a PDF file ('' on failure)."""
    content = ''
    try:
        with open(pdf_path, 'rb') as f:
            pdf_reader = PdfReader(f)
            for page in pdf_reader.pages:
                text = page.extract_text()
                if text:
                    content += text + '\n'
    except Exception as e:
        logging.error(f"Error al leer PDF {pdf_path}: {e}")
    return content


def read_csv(csv_path):
    """Extract and return text from a CSV file, one row per line."""
    content = ''
    try:
        with open(csv_path, 'r', encoding='utf-8') as f:
            reader = csv.reader(f)
            for row in reader:
                content += ' '.join(row) + '\n'
    except Exception as e:
        logging.error(f"Error al leer CSV {csv_path}: {e}")
    return content


def read_docx(docx_path):
    """Extract and return paragraph text from a DOCX file."""
    content = ''
    try:
        doc = docx.Document(docx_path)
        for paragraph in doc.paragraphs:
            content += paragraph.text + '\n'
    except Exception as e:
        logging.error(f"Error al leer DOCX {docx_path}: {e}")
    return content


def read_xlsx(xlsx_path):
    """Extract and return cell text from every sheet of an XLSX workbook."""
    content = ''
    try:
        wb = openpyxl.load_workbook(xlsx_path)
        for sheet in wb.sheetnames:
            ws = wb[sheet]
            for row in ws.iter_rows():
                row_text = ' '.join(
                    [str(cell.value) if cell.value is not None else '' for cell in row]
                )
                content += row_text + '\n'
    except Exception as e:
        logging.error(f"Error al leer XLSX {xlsx_path}: {e}")
    return content


def read_zip(zip_path):
    """Extract and return text from every member of a ZIP archive,
    decoding as UTF-8 and ignoring undecodable bytes.
    """
    content = ''
    try:
        with zipfile.ZipFile(zip_path, 'r') as z:
            for filename in z.namelist():
                with z.open(filename) as f:
                    file_content = f.read().decode('utf-8', errors='ignore')
                    content += file_content + '\n'
    except Exception as e:
        logging.error(f"Error al leer ZIP {zip_path}: {e}")
    return content


def read_html_md(file_path):
    """Read and return the raw contents of an HTML or Markdown file."""
    content = ''
    try:
        with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
            content = f.read()
    except Exception as e:
        logging.error(f"Error al leer HTML/MD {file_path}: {e}")
    return content


def format_content(html_content):
    """Convert HTML content to plain text (links and images dropped)."""
    h = html2text.HTML2Text()
    h.ignore_links = True
    h.ignore_images = True
    text = h.handle(html_content)
    return text


def process_files(files_folder, destination_folder):
    """Read, translate, clean and tokenize every supported file under
    ``files_folder``; unsupported formats are skipped with a log line.
    """
    os.makedirs(destination_folder, exist_ok=True)
    logging.info("Procesando archivos descargados...")
    total_files = 0
    total_size = 0
    for root, dirs, files in os.walk(files_folder):
        for file in files:
            file_path = os.path.join(root, file)
            content = ''
            if file.endswith('.pdf'):
                content = read_pdf(file_path)
            elif file.endswith('.csv'):
                content = read_csv(file_path)
            elif file.endswith('.txt'):
                try:
                    with open(file_path, 'r', encoding='utf-8') as f:
                        content = f.read()
                except Exception as e:
                    logging.error(f"Error al leer TXT {file_path}: {e}")
            elif file.endswith('.docx'):
                content = read_docx(file_path)
            elif file.endswith('.xlsx'):
                content = read_xlsx(file_path)
            elif file.endswith('.zip'):
                content = read_zip(file_path)
            elif file.endswith('.html') or file.endswith('.md'):
                content = read_html_md(file_path)
                content = format_content(content)
            else:
                logging.info(f"Formato de archivo no soportado: {file}")
                continue
            if content:
                translated_text = translate_text(content)
                cleaned_text = clean_text(translated_text)
                tokenize_and_save(cleaned_text, file, destination_folder)
                total_files += 1
                total_size += os.path.getsize(file_path)
    total_size_mb = total_size / (1024 * 1024)
    logging.info(f"Procesamiento completado para {total_files} archivos.")
    logging.info(f"Tamaño total de archivos procesados: {total_size_mb:.2f} MB.")


def download_and_save_file(url, destination_folder):
    """Stream-download ``url`` into ``destination_folder``, deriving the file
    name from the last URL path segment ('archivo_descargado' as fallback).
    """
    try:
        logging.info(f"Descargando archivo: {url}")
        response = requests.get(url, stream=True, timeout=30)
        if response.status_code == 200:
            filename = clean_filename(url.split('/')[-1])
            if not filename:
                filename = 'archivo_descargado'
            # Make sure the destination directory exists before writing.
            os.makedirs(destination_folder, exist_ok=True)
            file_path = os.path.join(destination_folder, filename)
            with open(file_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
            logging.info(f"Archivo descargado: {file_path}")
        else:
            logging.info(f"Error al descargar {url}: Código de estado {response.status_code}")
    except Exception as e:
        logging.error(f"Error al descargar {url}: {e}")


def extract_and_save_article(url, articles_folder):
    """Fetch ``url``, join its <p> text, translate/clean it, and save it as a
    .txt file named after the page title (or the URL path as fallback).
    """
    try:
        logging.info(f"Extrayendo artículo: {url}")
        response = requests.get(url, timeout=30)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            title_tag = soup.find('title')
            title = title_tag.get_text().strip() if title_tag else None
            paragraphs = soup.find_all('p')
            content = ' '.join([para.get_text() for para in paragraphs])
            if content.strip():
                translated_text = translate_text(content)
                cleaned_text = clean_text(translated_text)
                if title:
                    filename = clean_filename(title) + '.txt'
                else:
                    parsed_url = urlparse(url)
                    filename = clean_filename(parsed_url.path.split('/')[-1]) + '.txt'
                file_path = os.path.join(articles_folder, filename)
                with open(file_path, 'w', encoding='utf-8') as f:
                    f.write(cleaned_text)
                logging.info(f"Artículo guardado: {file_path}")
            else:
                logging.info(f"No se encontró contenido en {url}")
        else:
            logging.info(f"Error al acceder a {url}: Código de estado {response.status_code}")
    except Exception as e:
        logging.error(f"Error al extraer artículo de {url}: {e}")


def get_page_title(url):
    """Return the <title> text of the page at ``url``, or None on any failure."""
    try:
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            title_tag = soup.find('title')
            return title_tag.get_text().strip() if title_tag else None
        else:
            return None
    except Exception as e:
        logging.error(f"Error al obtener el título de la página {url}: {e}")
        return None


def clean_filename(name):
    """Sanitize ``name`` for use as a file name: replace forbidden characters
    and whitespace with underscores, truncate to 100 chars; None maps to
    'sin_nombre'.
    """
    if name is None:
        return 'sin_nombre'
    name = re.sub(r'[\\/*?:"<>|]', "_", name)
    name = re.sub(r'\s+', '_', name)
    return name[:100]


def register_processed_notifications(base_folder, urls):
    """Record URLs in ``base_folder/processed_articles.txt`` and return only
    the ones not previously recorded (deduplication across runs).
    """
    os.makedirs(base_folder, exist_ok=True)
    txt_path = os.path.join(base_folder, "processed_articles.txt")
    processed_urls = set()
    if os.path.exists(txt_path):
        with open(txt_path, 'r') as f:
            processed_urls = set(f.read().splitlines())
    urls_to_process = [url for url in urls if url not in processed_urls]
    # NOTE: URLs are registered before they are actually crawled, so a failed
    # run will not retry them.
    with open(txt_path, 'a') as f:
        for url in urls_to_process:
            f.write(url + "\n")
    if processed_urls:
        logging.info(f"Artículos ya procesados: {len(processed_urls)}")
    else:
        logging.info("No hay artículos procesados previamente.")
    return urls_to_process


def explore_wayback_machine(url, articles_folder):
    """Query the Wayback Machine availability API for ``url`` and, if an
    archived snapshot exists, extract it as an article.
    """
    try:
        logging.info(f"Explorando Wayback Machine para: {url}")
        api_url = f"http://archive.org/wayback/available?url={url}"
        response = requests.get(api_url, timeout=10)
        data = response.json()
        if 'archived_snapshots' in data and 'closest' in data['archived_snapshots']:
            archive_url = data['archived_snapshots']['closest']['url']
            logging.info(f"Descargando desde Wayback Machine: {archive_url}")
            extract_and_save_article(archive_url, articles_folder)
        else:
            logging.info(f"No se encontró versión archivada para {url}")
    except Exception as e:
        logging.error(f"Error al explorar Wayback Machine para {url}: {e}")


def get_folder_info(path):
    """Return (total_size_bytes, file_count) for the tree rooted at ``path``."""
    total_size = 0
    total_files = 0
    for dirpath, dirnames, filenames in os.walk(path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            total_size += os.path.getsize(fp)
            total_files += 1
    return total_size, total_files


def explore_and_extract_articles(url, articles_folder, files_folder,
                                 processed_urls, size_limit,
                                 depth=0, max_depth=6):
    """Recursively crawl ``url`` (JS-rendered via requests-html), downloading
    linked files and extracting articles, up to ``max_depth`` levels and until
    the combined folder size reaches ``size_limit`` bytes.
    """
    if depth > max_depth:
        return
    logging.info(f"Explorando {url} en profundidad {depth}...")
    try:
        session = HTMLSession()
        response = session.get(url, timeout=30)
        # Render JavaScript so dynamically-inserted links are discovered too.
        response.html.render(timeout=30, sleep=1)
        links = response.html.absolute_links
        session.close()
    except Exception as e:
        logging.error(f"Error al acceder a {url}: {e}")
        return
    for link in links:
        if link in processed_urls:
            continue
        processed_urls.add(link)
        parsed_link = urlparse(link)
        file_extension = os.path.splitext(parsed_link.path)[1].lower()
        if file_extension in ['.pdf', '.csv', '.txt', '.xlsx', '.docx', '.html', '.md', '.zip']:
            download_and_save_file(link, files_folder)
        elif 'mailto:' in link or 'tel:' in link:
            continue
        else:
            extract_and_save_article(link, articles_folder)
            explore_and_extract_articles(link, articles_folder, files_folder,
                                         processed_urls, size_limit,
                                         depth + 1, max_depth)
        total_size_articles, _ = get_folder_info(articles_folder)
        total_size_files, _ = get_folder_info(files_folder)
        total_size = total_size_articles + total_size_files
        if total_size >= size_limit:
            logging.info("Se ha alcanzado el límite de tamaño de 50 GB. Deteniendo exploración.")
            return


def main():
    """Entry point: crawl the seed URLs, download/extract content, then
    translate, clean and tokenize everything, logging a summary at the end.
    """
    logging.info("Función: main")
    urls = [
        'https://reactionary.international/database/',
        'https://aleph.occrp.org/',
        'https://offshoreleaks.icij.org/',
        'https://www.publico.es/',
        'https://www.elsaltodiario.com/',
        'https://www.nytimes.com/',
        'https://www.theguardian.com/',
        'https://www.lemonde.fr/',
        'https://www.spiegel.de/',
        'https://elpais.com/',
        'https://www.repubblica.it/',
        'https://www.scmp.com/',
        'https://www.smh.com.au/',
        'https://www.globo.com/',
        'https://timesofindia.indiatimes.com/',
        'https://www.asahi.com/',
        'https://www.washingtonpost.com/',
        'https://www.aljazeera.com/',
        'https://www.folha.uol.com.br/',
        'https://www.telegraph.co.uk/',
        'https://www.corriere.it/',
        'https://www.clarin.com/',
        'https://www.eluniversal.com.mx/',
        'https://www.welt.de/',
        'https://www.lanacion.com.ar/',
        'https://www.bbc.com/',
        'https://www.elconfidencial.com/',
        'https://www.expansion.com/',
        'https://www.lavanguardia.com/',
        'https://www.elperiodico.com/',
        'https://www.abc.es/',
        'https://www.elespanol.com/',
        'https://www.lainformacion.com/',
        'https://www.elcorreo.com/',
        'https://www.canarias7.es/',
        'https://www.diariovasco.com/',
        'https://www.farodevigo.es/',
        'https://www.lavozdegalicia.es/',
        'https://www.marca.com/',
        'https://www.mundodeportivo.com/',
        'https://www.elmundo.es/',
        'https://www.cnbc.com/',
        'https://www.bloomberg.com/',
        'https://www.forbes.com/',
        'https://www.economist.com/',
        'https://www.ft.com/',
        'https://www.wsj.com/',
        'https://www.technologyreview.com/',
        'https://www.cyberdefensemagazine.com/',
        'https://www.securityweek.com/',
        'https://www.darkreading.com/',
        'https://www.infosecurity-magazine.com/',
        'https://www.helpnetsecurity.com/',
        'https://www.computerweekly.com/',
        'https://www.csoonline.com/',
        'https://www.zdnet.com/',
        'https://www.itpro.co.uk/',
        'https://www.theregister.com/',
        'https://www.datacenterdynamics.com/',
        'https://www.scmagazine.com/',
        'https://www.teiss.co.uk/',
        'https://www.tripwire.com/',
        'https://www.infoworld.com/',
        'https://www.cnet.com/',
        'https://www.tomsguide.com/',
        'https://www.theverge.com/',
        'https://www.arstechnica.com/',
        'https://www.engadget.com/',
        'https://www.gizmodo.com/',
        'https://www.wired.com/',
        'https://www.vice.com/',
        'https://www.politico.com/',
        'https://www.theatlantic.com/',
        'https://www.newyorker.com/',
        'https://www.rollingstone.com/',
        'https://www.thedailybeast.com/',
        'https://www.salon.com/',
        'https://www.slate.com/',
        'https://www.huffpost.com/',
        'https://www.vox.com/',
        'https://www.bbc.co.uk/news',
        'https://www.dailymail.co.uk/home/index.html',
        'https://www.independent.co.uk/',
        'https://www.irishtimes.com/',
        'https://www.thejournal.ie/',
        'https://www.thetimes.co.uk/',
        'https://www.thesun.co.uk/',
        'https://www.telegraph.co.uk/',
        'https://www.euronews.com/',
        'https://www.reuters.com/',
        'https://www.dw.com/',
        'https://www.france24.com/',
        'https://www.lefigaro.fr/',
        'https://www.lemonde.fr/',
        'https://www.derstandard.at/',
        'https://www.nzz.ch/',
        'https://www.eldiario.es/',
        'https://www.rtve.es/',
        'https://www.rt.com/',
        'https://www.elciudadano.com/',
        'https://www.apnews.com/',
        'https://www.univision.com/',
        'https://www.televisa.com/',
        'https://www.bbc.com/',
        'https://www.cnn.com/',
        'https://www.foxnews.com/',
        'https://www.aljazeera.com/',
        'https://www.trtworld.com/',
        'https://www.newsweek.com/',
        'https://www.time.com/',
        'https://www.spectator.co.uk/'
    ]
    base_folder = '/var/www/theflows.net/flujos/FLUJOS_DATOS/NOTICIAS'
    articles_folder = os.path.join(base_folder, 'articulos')
    files_folder = os.path.join(base_folder, 'archivos')
    tokenized_folder = os.path.join(base_folder, 'tokenized')
    for folder in [articles_folder, files_folder, tokenized_folder]:
        os.makedirs(folder, exist_ok=True)
    FOLDER_SIZE_LIMIT = 50 * 1024 * 1024 * 1024  # 50 GB
    urls_to_process = register_processed_notifications(base_folder, urls)
    processed_urls = set()
    for url in urls_to_process:
        logging.info(f"\nProcesando URL: {url}")
        explore_and_extract_articles(url, articles_folder, files_folder,
                                     processed_urls, FOLDER_SIZE_LIMIT)
        explore_wayback_machine(url, articles_folder)
    process_files(files_folder, tokenized_folder)
    tokenize_all_articles(articles_folder, tokenized_folder)
    total_size_articles, total_files_articles = get_folder_info(articles_folder)
    total_size_files, total_files_files = get_folder_info(files_folder)
    total_size_tokenized, total_files_tokenized = get_folder_info(tokenized_folder)
    logging.info("\nResumen del proceso:")
    logging.info(f"Artículos descargados: {total_files_articles}")
    logging.info(f"Tamaño total de artículos: {total_size_articles / (1024 * 1024):.2f} MB")
    logging.info(f"Archivos descargados: {total_files_files}")
    logging.info(f"Tamaño total de archivos: {total_size_files / (1024 * 1024):.2f} MB")
    logging.info(f"Archivos tokenizados: {total_files_tokenized}")
    logging.info(f"Tamaño total de archivos tokenizados: {total_size_tokenized / (1024 * 1024):.2f} MB.")


if __name__ == "__main__":
    main()