Includes: FLUJOS app (Node/Flask/Python), FLUJOS_DATOS scripts (scrapers, Keras, Django) Excludes: MongoDB, scraped data, Wikipedia/WikiLeaks dumps, Python venv, node_modules
625 lines
22 KiB
Python
Executable file
625 lines
22 KiB
Python
Executable file
from deep_translator import GoogleTranslator
|
|
from deep_translator import GoogleTranslator
|
|
import os
|
|
import re
|
|
import hashlib
|
|
import requests
|
|
import json
|
|
import time
|
|
import logging
|
|
from requests_html import HTMLSession
|
|
from bs4 import BeautifulSoup
|
|
from PyPDF2 import PdfReader
|
|
import csv
|
|
import docx
|
|
import openpyxl
|
|
import zipfile
|
|
import html2text
|
|
from transformers import BertTokenizer
|
|
from tqdm import tqdm
|
|
from urllib.parse import urlparse, urljoin
|
|
|
|
# Logging configuration: INFO-level messages go to the terminal.
logging.basicConfig(level=logging.INFO)

# Spanish BERT tokenizer (whole-word-masking, cased variant) used to
# turn cleaned article text into token ids.
tokenizer = BertTokenizer.from_pretrained('dccuchile/bert-base-spanish-wwm-cased')

# Spanish stopword list (articles, prepositions, pronouns and the full
# conjugation of "estar"); these words are stripped during cleaning.
STOPWORDS = set([
    "de", "la", "que", "el", "en", "y", "a", "los", "del", "se", "las", "por",
    "un", "para", "con", "no", "una", "su", "al", "es", "lo", "como", "más",
    "pero", "sus", "le", "ya", "o", "fue", "este", "ha", "sí", "porque",
    "esta", "son", "entre", "cuando", "muy", "sin", "sobre", "también", "me",
    "hasta", "hay", "donde", "quien", "desde", "todo", "nos", "durante",
    "todos", "uno", "les", "ni", "contra", "otros", "ese", "eso", "ante",
    "ellos", "e", "esto", "mí", "antes", "algunos", "qué", "unos", "yo",
    "otro", "otras", "otra", "él", "tanto", "esa", "estos", "mucho",
    "quienes", "nada", "muchos", "cual", "poco", "ella", "estar", "estas",
    "algunas", "algo", "nosotros", "mi", "mis", "tú", "te", "ti", "tu",
    "tus", "ellas", "nosotras", "vosotros", "vosotras", "os", "mío", "mía",
    "míos", "mías", "tuyo", "tuya", "tuyos", "tuyas", "suyo", "suya",
    "suyos", "suyas", "nuestro", "nuestra", "nuestros", "nuestras",
    "vuestro", "vuestra", "vuestros", "vuestras", "esos", "esas", "estoy",
    "estás", "está", "estamos", "estáis", "están", "esté", "estés",
    "estemos", "estéis", "estén", "estaré", "estarás", "estará",
    "estaremos", "estaréis", "estarán", "estaría", "estarías",
    "estaríamos", "estaríais", "estarían", "estaba", "estabas",
    "estábamos", "estabais", "estaban", "estuve", "estuviste", "estuvo",
    "estuvimos", "estuvisteis", "estuvieron", "estuviera", "estuvieras",
    "estuviéramos", "estuvierais", "estuvieran", "estuviese",
    "estuvieses", "estuviésemos", "estuvieseis", "estuviesen", "estando",
    "estado", "estada", "estados", "estadas", "estad"
])
|
|
|
|
|
|
def translate_text(text):
    """
    Translate *text* into Spanish using deep-translator.

    Fix: Google Translate rejects payloads over ~5000 characters, so
    long documents used to fail wholesale and fall back untranslated.
    Long texts are now split into chunks and translated piece by piece.
    On any error the original text is returned unchanged.
    """
    CHUNK_SIZE = 4500  # stay safely under the ~5000-char API limit
    try:
        translator = GoogleTranslator(source='auto', target='es')
        if len(text) <= CHUNK_SIZE:
            return translator.translate(text)
        pieces = [text[i:i + CHUNK_SIZE] for i in range(0, len(text), CHUNK_SIZE)]
        # translate() may return None for empty/whitespace chunks.
        translated = [translator.translate(p) or '' for p in pieces]
        return ' '.join(translated)
    except Exception as e:
        logging.error(f"Error al traducir con deep-translator: {e}")
        return text
|
|
|
|
def clean_text(text):
    """
    Clean raw text: strip CDATA blocks (even with stray whitespace),
    then HTML markup, URLs, punctuation and Spanish stopwords.
    """
    # Drop any CDATA variant such as '<![ CDATA [ ... ]]>'.
    text = re.sub(r'<\!\[\s*CDATA\s*\[.*?\]\]>', '', text, flags=re.S)

    # Strip HTML tags, keeping the visible text.
    text = BeautifulSoup(text, 'html.parser').get_text(separator=" ")

    # Normalize: lowercase, drop URLs, keep only letters and spaces,
    # then collapse whitespace runs.
    text = text.lower()
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'[^a-záéíóúñü\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()

    # Finally, drop stopwords.
    kept = (word for word in text.split() if word not in STOPWORDS)
    return ' '.join(kept)
|
|
|
|
def tokenize_and_save(text, filename, destination_folder):
    """
    Encode *text* with the BERT tokenizer and write the token ids
    (space-separated) to *filename* inside *destination_folder*.
    """
    token_ids = tokenizer.encode(
        text,
        truncation=True,
        max_length=512,
        add_special_tokens=True,
    )

    # Make sure the destination directory exists before writing.
    os.makedirs(destination_folder, exist_ok=True)

    # The output file keeps the incoming filename unchanged.
    output_path = os.path.join(destination_folder, filename)
    with open(output_path, 'w', encoding='utf-8') as out:
        out.write(' '.join(str(t) for t in token_ids))
|
|
|
|
|
|
def tokenize_all_articles(articles_folder, destination_folder):
    """
    Tokenize every .txt article found under *articles_folder*,
    writing the results into *destination_folder*, then log a summary.
    """
    os.makedirs(destination_folder, exist_ok=True)

    logging.info("Iniciando proceso de tokenización...")
    article_count = 0
    bytes_processed = 0

    for current_dir, _subdirs, filenames in os.walk(articles_folder):
        for name in filenames:
            if not name.endswith('.txt'):
                continue
            path = os.path.join(current_dir, name)
            with open(path, 'r', encoding='utf-8') as handle:
                tokenize_and_save(handle.read(), name, destination_folder)
            article_count += 1
            bytes_processed += os.path.getsize(path)

    total_size_mb = bytes_processed / (1024 * 1024)
    logging.info(f"Tokenización completada para {article_count} artículos.")
    logging.info(f"Tamaño total de artículos tokenizados: {total_size_mb:.2f} MB.")
|
|
|
|
def read_pdf(pdf_path):
    """
    Extract the text of a PDF file, one page per line.
    Returns whatever was extracted before any error (possibly '');
    errors are logged, never raised.
    """
    pages_text = []
    try:
        with open(pdf_path, 'rb') as handle:
            reader = PdfReader(handle)
            for page in reader.pages:
                extracted = page.extract_text()
                if extracted:
                    pages_text.append(extracted + '\n')
    except Exception as e:
        logging.error(f"Error al leer PDF {pdf_path}: {e}")
    return ''.join(pages_text)
|
|
|
|
def read_csv(csv_path):
    """
    Read a CSV file and return its cells as text, one row per line,
    cells joined by single spaces.

    Returns whatever was read before the first error (possibly an
    empty string); errors are logged, never raised.
    """
    content = ''
    try:
        # Fix: newline='' is required by the csv module so that
        # newlines embedded in quoted fields are parsed correctly.
        with open(csv_path, 'r', encoding='utf-8', newline='') as f:
            reader = csv.reader(f)
            for row in reader:
                content += ' '.join(row) + '\n'
    except Exception as e:
        logging.error(f"Error al leer CSV {csv_path}: {e}")
    return content
|
|
|
|
def read_docx(docx_path):
    """
    Extract the text of a DOCX document, one paragraph per line.
    Returns whatever was read before any error (possibly '');
    errors are logged, never raised.
    """
    parts = []
    try:
        document = docx.Document(docx_path)
        for paragraph in document.paragraphs:
            parts.append(paragraph.text + '\n')
    except Exception as e:
        logging.error(f"Error al leer DOCX {docx_path}: {e}")
    return ''.join(parts)
|
|
|
|
def read_xlsx(xlsx_path):
    """
    Extract all cell values of an XLSX workbook as text, one row per
    line with cells separated by spaces (empty cells become '').
    Errors are logged, never raised.
    """
    parts = []
    try:
        workbook = openpyxl.load_workbook(xlsx_path)
        for sheet_name in workbook.sheetnames:
            worksheet = workbook[sheet_name]
            for row in worksheet.iter_rows():
                cells = [str(cell.value) if cell.value is not None else '' for cell in row]
                parts.append(' '.join(cells) + '\n')
    except Exception as e:
        logging.error(f"Error al leer XLSX {xlsx_path}: {e}")
    return ''.join(parts)
|
|
|
|
def read_zip(zip_path):
    """
    Concatenate the contents of every entry of a ZIP archive,
    decoded as UTF-8 (undecodable bytes dropped), one entry per line.
    Errors are logged, never raised.
    """
    parts = []
    try:
        with zipfile.ZipFile(zip_path, 'r') as archive:
            for entry in archive.namelist():
                with archive.open(entry) as member:
                    parts.append(member.read().decode('utf-8', errors='ignore') + '\n')
    except Exception as e:
        logging.error(f"Error al leer ZIP {zip_path}: {e}")
    return ''.join(parts)
|
|
|
|
def read_html_md(file_path):
    """
    Return the raw text of an HTML or Markdown file (UTF-8,
    undecodable bytes replaced). Errors are logged and yield ''.
    """
    try:
        with open(file_path, 'r', encoding='utf-8', errors='replace') as handle:
            return handle.read()
    except Exception as e:
        logging.error(f"Error al leer HTML/MD {file_path}: {e}")
        return ''
|
|
|
|
def format_content(html_content):
    """
    Convert HTML content to plain text, dropping links and images.
    """
    converter = html2text.HTML2Text()
    converter.ignore_links = True
    converter.ignore_images = True
    return converter.handle(html_content)
|
|
|
|
def _read_txt(txt_path):
    """Read a plain-text file as UTF-8; log and return '' on failure."""
    try:
        with open(txt_path, 'r', encoding='utf-8') as f:
            return f.read()
    except Exception as e:
        logging.error(f"Error al leer TXT {txt_path}: {e}")
        return ''


def process_files(files_folder, destination_folder):
    """
    Walk *files_folder*, extract text from every supported file,
    translate + clean it, and save the tokenized result into
    *destination_folder*.

    Supported formats: .pdf, .csv, .txt, .docx, .xlsx, .zip, .html, .md.
    Fix: extension matching is now case-insensitive (the old
    `.endswith('.pdf')` chain silently skipped files like FOO.PDF).
    Unsupported files are logged and skipped.
    """
    os.makedirs(destination_folder, exist_ok=True)

    logging.info("Procesando archivos descargados...")
    total_files = 0
    total_size = 0

    # Extension -> reader dispatch table; every reader returns plain text.
    readers = {
        '.pdf': read_pdf,
        '.csv': read_csv,
        '.txt': _read_txt,
        '.docx': read_docx,
        '.xlsx': read_xlsx,
        '.zip': read_zip,
        '.html': lambda p: format_content(read_html_md(p)),
        '.md': lambda p: format_content(read_html_md(p)),
    }

    for root, _dirs, files in os.walk(files_folder):
        for file in files:
            file_path = os.path.join(root, file)
            extension = os.path.splitext(file)[1].lower()

            reader = readers.get(extension)
            if reader is None:
                logging.info(f"Formato de archivo no soportado: {file}")
                continue

            content = reader(file_path)
            if content:
                translated_text = translate_text(content)
                cleaned_text = clean_text(translated_text)
                tokenize_and_save(cleaned_text, file, destination_folder)
                total_files += 1
                total_size += os.path.getsize(file_path)

    total_size_mb = total_size / (1024 * 1024)
    logging.info(f"Procesamiento completado para {total_files} archivos.")
    logging.info(f"Tamaño total de archivos procesados: {total_size_mb:.2f} MB.")
|
|
|
|
def download_and_save_file(url, destination_folder):
    """
    Download the file at *url* and stream it (8 KiB chunks) into
    *destination_folder*.

    Fix: the destination folder is created if missing — previously
    open() failed with FileNotFoundError for a non-existent folder.
    HTTP and network failures are logged, never raised.
    """
    try:
        logging.info(f"Descargando archivo: {url}")
        response = requests.get(url, stream=True, timeout=30)
        if response.status_code == 200:
            # Derive a safe filename from the URL's last path segment.
            filename = clean_filename(url.split('/')[-1])
            if not filename:
                filename = 'archivo_descargado'
            os.makedirs(destination_folder, exist_ok=True)
            file_path = os.path.join(destination_folder, filename)
            with open(file_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
            logging.info(f"Archivo descargado: {file_path}")
        else:
            logging.info(f"Error al descargar {url}: Código de estado {response.status_code}")
    except Exception as e:
        logging.error(f"Error al descargar {url}: {e}")
|
|
|
|
def extract_and_save_article(url, articles_folder):
    """
    Fetch *url*, extract the page title and all <p> paragraph text,
    translate + clean it, and save the result as a .txt file in
    *articles_folder*. All failures are logged, never raised.
    """
    try:
        logging.info(f"Extrayendo artículo: {url}")
        response = requests.get(url, timeout=30)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            title_tag = soup.find('title')
            title = title_tag.get_text().strip() if title_tag else None
            paragraphs = soup.find_all('p')
            content = ' '.join([para.get_text() for para in paragraphs])

            if content.strip():
                translated_text = translate_text(content)
                cleaned_text = clean_text(translated_text)

                # Prefer the page title for the filename; otherwise use
                # the URL's last path segment.
                if title:
                    base_name = clean_filename(title)
                else:
                    parsed_url = urlparse(url)
                    base_name = clean_filename(parsed_url.path.split('/')[-1])
                # Fix: an empty title/segment used to produce a file
                # literally named '.txt' (and collisions overwrote it).
                if not base_name:
                    base_name = hashlib.md5(url.encode('utf-8')).hexdigest()
                filename = base_name + '.txt'

                # Fix: make sure the target folder exists.
                os.makedirs(articles_folder, exist_ok=True)
                file_path = os.path.join(articles_folder, filename)

                with open(file_path, 'w', encoding='utf-8') as f:
                    f.write(cleaned_text)

                logging.info(f"Artículo guardado: {file_path}")
            else:
                logging.info(f"No se encontró contenido en {url}")
        else:
            logging.info(f"Error al acceder a {url}: Código de estado {response.status_code}")
    except Exception as e:
        logging.error(f"Error al extraer artículo de {url}: {e}")
|
|
|
|
def get_page_title(url):
    """
    Return the <title> text of the page at *url*, or None when the
    request fails, the status is not 200, or there is no title tag.
    """
    try:
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            return None
        title_tag = BeautifulSoup(response.content, 'html.parser').find('title')
        return title_tag.get_text().strip() if title_tag else None
    except Exception as e:
        logging.error(f"Error al obtener el título de la página {url}: {e}")
        return None
|
|
|
|
def clean_filename(name):
    """
    Sanitize *name* for use as a filename: replace forbidden
    characters and whitespace runs with underscores and cap the
    length at 100 characters. None becomes 'sin_nombre'.
    """
    if name is None:
        return 'sin_nombre'
    sanitized = re.sub(r'[\\/*?:"<>|]', "_", name)
    sanitized = re.sub(r'\s+', '_', sanitized)
    return sanitized[:100]
|
|
|
|
def register_processed_notifications(base_folder, urls):
    """
    Filter *urls* down to those not yet recorded in
    processed_articles.txt under *base_folder*, append the new ones
    to that file, and return them (preserving the input order).
    """
    os.makedirs(base_folder, exist_ok=True)

    txt_path = os.path.join(base_folder, "processed_articles.txt")
    processed_urls = set()

    if os.path.exists(txt_path):
        # Fix: read/write with an explicit encoding so URLs containing
        # non-ASCII characters round-trip on every platform.
        with open(txt_path, 'r', encoding='utf-8') as f:
            processed_urls = set(f.read().splitlines())

    urls_to_process = [url for url in urls if url not in processed_urls]

    with open(txt_path, 'a', encoding='utf-8') as f:
        for url in urls_to_process:
            f.write(url + "\n")

    if processed_urls:
        logging.info(f"Artículos ya procesados: {len(processed_urls)}")
    else:
        logging.info("No hay artículos procesados previamente.")

    return urls_to_process
|
|
|
|
def explore_wayback_machine(url, articles_folder):
    """
    Query the Wayback Machine availability API for *url* and, when an
    archived snapshot exists, extract that snapshot as an article.
    Failures are logged, never raised.
    """
    try:
        logging.info(f"Explorando Wayback Machine para: {url}")
        api_url = f"http://archive.org/wayback/available?url={url}"
        data = requests.get(api_url, timeout=10).json()

        snapshots = data.get('archived_snapshots', {})
        if 'closest' in snapshots:
            archive_url = snapshots['closest']['url']
            logging.info(f"Descargando desde Wayback Machine: {archive_url}")
            extract_and_save_article(archive_url, articles_folder)
        else:
            logging.info(f"No se encontró versión archivada para {url}")
    except Exception as e:
        logging.error(f"Error al explorar Wayback Machine para {url}: {e}")
|
|
|
|
def get_folder_info(path):
    """
    Return (total_size_in_bytes, number_of_files) for everything
    under *path*, walked recursively.
    """
    total_size = 0
    total_files = 0
    for dirpath, _dirnames, filenames in os.walk(path):
        for name in filenames:
            total_size += os.path.getsize(os.path.join(dirpath, name))
            total_files += 1
    return total_size, total_files
|
|
|
|
def explore_and_extract_articles(url, articles_folder, files_folder, processed_urls, size_limit, depth=0, max_depth=6):
    """
    Recursively crawl *url* (up to *max_depth* levels deep),
    downloading linked files with known extensions and extracting
    every other link as an article.

    *processed_urls* is a set shared across the recursion so each
    link is visited at most once. Crawling stops once the combined
    size of *articles_folder* and *files_folder* reaches *size_limit*
    bytes.

    Fix: the size-limit check used to run only AFTER an entire level
    had been crawled, so the cap never actually stopped any
    downloads. It is now checked up front and before each link.
    """
    if depth > max_depth:
        return

    def _over_limit():
        # Combined on-disk size of everything downloaded so far.
        size_articles, _ = get_folder_info(articles_folder)
        size_files, _ = get_folder_info(files_folder)
        return size_articles + size_files >= size_limit

    if _over_limit():
        logging.info("Se ha alcanzado el límite de tamaño de 50 GB. Deteniendo exploración.")
        return

    logging.info(f"Explorando {url} en profundidad {depth}...")
    try:
        session = HTMLSession()
        response = session.get(url, timeout=30)
        # Render JavaScript so dynamically-inserted links are visible.
        response.html.render(timeout=30, sleep=1)
        links = response.html.absolute_links
        session.close()
    except Exception as e:
        logging.error(f"Error al acceder a {url}: {e}")
        return

    for link in links:
        if link in processed_urls:
            continue

        if _over_limit():
            logging.info("Se ha alcanzado el límite de tamaño de 50 GB. Deteniendo exploración.")
            return

        processed_urls.add(link)

        parsed_link = urlparse(link)
        file_extension = os.path.splitext(parsed_link.path)[1].lower()

        if file_extension in ['.pdf', '.csv', '.txt', '.xlsx', '.docx', '.html', '.md', '.zip']:
            download_and_save_file(link, files_folder)
        elif 'mailto:' in link or 'tel:' in link:
            continue
        else:
            extract_and_save_article(link, articles_folder)
            explore_and_extract_articles(link, articles_folder, files_folder, processed_urls, size_limit, depth + 1, max_depth)
|
|
|
|
def main():
    """Entry point: crawl the configured seed sites, download articles
    and files, then translate/clean/tokenize everything for BERT."""
    logging.info("Función: main")

    # Seed URLs: leak/OCCRP databases, international and Spanish press,
    # and tech/security outlets.
    # NOTE(review): the list contains duplicates (telegraph.co.uk,
    # lemonde.fr, bbc.com, aljazeera.com) — harmless, but worth pruning.
    urls = [
        'https://reactionary.international/database/',
        'https://aleph.occrp.org/',
        'https://offshoreleaks.icij.org/',
        'https://www.publico.es/',
        'https://www.elsaltodiario.com/',
        'https://www.nytimes.com/',
        'https://www.theguardian.com/',
        'https://www.lemonde.fr/',
        'https://www.spiegel.de/',
        'https://elpais.com/',
        'https://www.repubblica.it/',
        'https://www.scmp.com/',
        'https://www.smh.com.au/',
        'https://www.globo.com/',
        'https://timesofindia.indiatimes.com/',
        'https://www.asahi.com/',
        'https://www.washingtonpost.com/',
        'https://www.aljazeera.com/',
        'https://www.folha.uol.com.br/',
        'https://www.telegraph.co.uk/',
        'https://www.corriere.it/',
        'https://www.clarin.com/',
        'https://www.eluniversal.com.mx/',
        'https://www.welt.de/',
        'https://www.lanacion.com.ar/',
        'https://www.bbc.com/',
        'https://www.elconfidencial.com/',
        'https://www.expansion.com/',
        'https://www.lavanguardia.com/',
        'https://www.elperiodico.com/',
        'https://www.abc.es/',
        'https://www.elespanol.com/',
        'https://www.lainformacion.com/',
        'https://www.elcorreo.com/',
        'https://www.canarias7.es/',
        'https://www.diariovasco.com/',
        'https://www.farodevigo.es/',
        'https://www.lavozdegalicia.es/',
        'https://www.marca.com/',
        'https://www.mundodeportivo.com/',
        'https://www.elmundo.es/',
        'https://www.cnbc.com/',
        'https://www.bloomberg.com/',
        'https://www.forbes.com/',
        'https://www.economist.com/',
        'https://www.ft.com/',
        'https://www.wsj.com/',
        'https://www.technologyreview.com/',
        'https://www.cyberdefensemagazine.com/',
        'https://www.securityweek.com/',
        'https://www.darkreading.com/',
        'https://www.infosecurity-magazine.com/',
        'https://www.helpnetsecurity.com/',
        'https://www.computerweekly.com/',
        'https://www.csoonline.com/',
        'https://www.zdnet.com/',
        'https://www.itpro.co.uk/',
        'https://www.theregister.com/',
        'https://www.datacenterdynamics.com/',
        'https://www.scmagazine.com/',
        'https://www.teiss.co.uk/',
        'https://www.tripwire.com/',
        'https://www.infoworld.com/',
        'https://www.cnet.com/',
        'https://www.tomsguide.com/',
        'https://www.theverge.com/',
        'https://www.arstechnica.com/',
        'https://www.engadget.com/',
        'https://www.gizmodo.com/',
        'https://www.wired.com/',
        'https://www.vice.com/',
        'https://www.politico.com/',
        'https://www.theatlantic.com/',
        'https://www.newyorker.com/',
        'https://www.rollingstone.com/',
        'https://www.thedailybeast.com/',
        'https://www.salon.com/',
        'https://www.slate.com/',
        'https://www.huffpost.com/',
        'https://www.vox.com/',
        'https://www.bbc.co.uk/news',
        'https://www.dailymail.co.uk/home/index.html',
        'https://www.independent.co.uk/',
        'https://www.irishtimes.com/',
        'https://www.thejournal.ie/',
        'https://www.thetimes.co.uk/',
        'https://www.thesun.co.uk/',
        'https://www.telegraph.co.uk/',
        'https://www.euronews.com/',
        'https://www.reuters.com/',
        'https://www.dw.com/',
        'https://www.france24.com/',
        'https://www.lefigaro.fr/',
        'https://www.lemonde.fr/',
        'https://www.derstandard.at/',
        'https://www.nzz.ch/',
        'https://www.eldiario.es/',
        'https://www.rtve.es/',
        'https://www.rt.com/',
        'https://www.elciudadano.com/',
        'https://www.apnews.com/',
        'https://www.univision.com/',
        'https://www.televisa.com/',
        'https://www.bbc.com/',
        'https://www.cnn.com/',
        'https://www.foxnews.com/',
        'https://www.aljazeera.com/',
        'https://www.trtworld.com/',
        'https://www.newsweek.com/',
        'https://www.time.com/',
        'https://www.spectator.co.uk/'
    ]

    # Output layout under the project's data folder.
    base_folder = '/var/www/theflows.net/flujos/FLUJOS_DATOS/NOTICIAS'
    articles_folder = os.path.join(base_folder, 'articulos')
    files_folder = os.path.join(base_folder, 'archivos')
    tokenized_folder = os.path.join(base_folder, 'tokenized')

    for folder in [articles_folder, files_folder, tokenized_folder]:
        if not os.path.exists(folder):
            os.makedirs(folder)

    # Hard cap on the combined size of downloaded data.
    FOLDER_SIZE_LIMIT = 50 * 1024 * 1024 * 1024  # 50 GB

    # Skip URLs already recorded by a previous run; the set below
    # deduplicates links discovered during this crawl.
    urls_to_process = register_processed_notifications(base_folder, urls)
    processed_urls = set()

    for url in urls_to_process:
        logging.info(f"\nProcesando URL: {url}")
        explore_and_extract_articles(url, articles_folder, files_folder, processed_urls, FOLDER_SIZE_LIMIT)
        # Also try archived snapshots of the same page.
        explore_wayback_machine(url, articles_folder)

    # Post-process everything that was downloaded.
    process_files(files_folder, tokenized_folder)
    tokenize_all_articles(articles_folder, tokenized_folder)

    # Final summary: per-folder sizes and counts.
    total_size_articles, total_files_articles = get_folder_info(articles_folder)
    total_size_files, total_files_files = get_folder_info(files_folder)
    total_size_tokenized, total_files_tokenized = get_folder_info(tokenized_folder)

    logging.info("\nResumen del proceso:")
    logging.info(f"Artículos descargados: {total_files_articles}")
    logging.info(f"Tamaño total de artículos: {total_size_articles / (1024 * 1024):.2f} MB")
    logging.info(f"Archivos descargados: {total_files_files}")
    logging.info(f"Tamaño total de archivos: {total_size_files / (1024 * 1024):.2f} MB")
    logging.info(f"Archivos tokenizados: {total_files_tokenized}")
    logging.info(f"Tamaño total de archivos tokenizados: {total_size_tokenized / (1024 * 1024):.2f} MB.")
|
|
|
|
# Run the crawler only when executed as a script.
if __name__ == "__main__":
    main()
|