FLUJOS/FLUJOS_DATOS/NOTICIAS/main_noticias.py
CAPITANSITO a40b946163 Initial commit - FLUJOS codebase (production branch)
Includes: FLUJOS app (Node/Flask/Python), FLUJOS_DATOS scripts (scrapers, Keras, Django)
Excludes: MongoDB, scraped data, Wikipedia/WikiLeaks dumps, Python venv, node_modules
2026-03-31 14:10:02 +02:00

625 lines
22 KiB
Python
Executable file

from deep_translator import GoogleTranslator
from deep_translator import GoogleTranslator
import os
import re
import hashlib
import requests
import json
import time
import logging
from requests_html import HTMLSession
from bs4 import BeautifulSoup
from PyPDF2 import PdfReader
import csv
import docx
import openpyxl
import zipfile
import html2text
from transformers import BertTokenizer
from tqdm import tqdm
from urllib.parse import urlparse, urljoin
# Logging configuration: emit INFO-level messages to the terminal.
logging.basicConfig(level=logging.INFO)
# Shared Spanish BERT (BETO) tokenizer used by every tokenization helper below.
tokenizer = BertTokenizer.from_pretrained('dccuchile/bert-base-spanish-wwm-cased')
# Spanish stopwords removed by clean_text(): articles, pronouns, prepositions
# and the conjugations of "estar".  Three empty-string entries present in the
# original list were dropped — str.split() never yields "", so they were dead.
STOPWORDS = {
    "de", "la", "que", "el", "en", "y", "a", "los", "del", "se", "las", "por",
    "un", "para", "con", "no", "una", "su", "al", "es", "lo", "como", "más",
    "pero", "sus", "le", "ya", "o", "fue", "este", "ha", "porque",
    "esta", "son", "entre", "cuando", "muy", "sin", "sobre", "también", "me",
    "hasta", "hay", "donde", "quien", "desde", "todo", "nos", "durante",
    "todos", "uno", "les", "ni", "contra", "otros", "ese", "eso", "ante",
    "ellos", "e", "esto", "antes", "algunos", "qué", "unos", "yo",
    "otro", "otras", "otra", "él", "tanto", "esa", "estos", "mucho",
    "quienes", "nada", "muchos", "cual", "poco", "ella", "estar", "estas",
    "algunas", "algo", "nosotros", "mi", "mis", "te", "ti", "tu",
    "tus", "ellas", "nosotras", "vosotros", "vosotras", "os", "mío", "mía",
    "míos", "mías", "tuyo", "tuya", "tuyos", "tuyas", "suyo", "suya",
    "suyos", "suyas", "nuestro", "nuestra", "nuestros", "nuestras",
    "vuestro", "vuestra", "vuestros", "vuestras", "esos", "esas", "estoy",
    "estás", "está", "estamos", "estáis", "están", "esté", "estés",
    "estemos", "estéis", "estén", "estaré", "estarás", "estará",
    "estaremos", "estaréis", "estarán", "estaría", "estarías",
    "estaríamos", "estaríais", "estarían", "estaba", "estabas",
    "estábamos", "estabais", "estaban", "estuve", "estuviste", "estuvo",
    "estuvimos", "estuvisteis", "estuvieron", "estuviera", "estuvieras",
    "estuviéramos", "estuvierais", "estuvieran", "estuviese",
    "estuvieses", "estuviésemos", "estuvieseis", "estuviesen", "estando",
    "estado", "estada", "estados", "estadas", "estad",
}
def translate_text(text):
    """
    Translate *text* to Spanish with deep-translator.

    The Google endpoint used by deep-translator rejects payloads larger
    than ~5000 characters, so long article bodies are translated in
    chunks and re-joined.  On any failure the original text is returned
    unchanged (best effort, never raises).
    """
    max_chunk = 4500  # stay safely under the ~5000-char API limit
    try:
        translator = GoogleTranslator(source='auto', target='es')
        if len(text) <= max_chunk:
            return translator.translate(text)
        pieces = []
        for start in range(0, len(text), max_chunk):
            pieces.append(translator.translate(text[start:start + max_chunk]))
        # translate() may return None for empty/whitespace chunks; drop those.
        return ' '.join(piece for piece in pieces if piece)
    except Exception as e:
        logging.error(f"Error al traducir con deep-translator: {e}")
        return text
def clean_text(text):
    """
    Normalize raw article text for tokenization.

    Pipeline: strip CDATA wrappers (tolerating extra whitespace inside
    the marker), drop HTML markup, lowercase, remove URLs, keep only
    letters (including Spanish accented ones) and spaces, collapse
    whitespace, and finally filter out STOPWORDS.
    """
    # 1) Remove any CDATA variant, e.g. '<![ CDATA [ ... ]]>'.
    without_cdata = re.sub(r'<\!\[\s*CDATA\s*\[.*?\]\]>', '', text, flags=re.S)
    # 2) Strip HTML tags, 3) lowercase.
    plain = BeautifulSoup(without_cdata, 'html.parser').get_text(separator=" ").lower()
    # 4) Drop URLs.
    plain = re.sub(r'http\S+', '', plain)
    # 5) Keep only letters (with Spanish diacritics) and whitespace.
    plain = re.sub(r'[^a-záéíóúñü\s]', '', plain)
    # 6) Collapse runs of whitespace.
    plain = re.sub(r'\s+', ' ', plain).strip()
    # 7) Remove stopwords.
    return ' '.join(word for word in plain.split() if word not in STOPWORDS)
def tokenize_and_save(text, filename, destination_folder):
    """
    Encode *text* with the shared BERT tokenizer and write the resulting
    token ids, space separated, to *destination_folder*/*filename*.
    """
    # BERT accepts at most 512 tokens; longer inputs are truncated.
    token_ids = tokenizer.encode(
        text,
        truncation=True,
        max_length=512,
        add_special_tokens=True,
    )
    os.makedirs(destination_folder, exist_ok=True)
    # The incoming filename is deliberately used unchanged for the output.
    output_path = os.path.join(destination_folder, filename)
    with open(output_path, 'w', encoding='utf-8') as out_file:
        out_file.write(' '.join(str(token_id) for token_id in token_ids))
def tokenize_all_articles(articles_folder, destination_folder):
    """
    Tokenize every .txt article found under *articles_folder* (recursively)
    into *destination_folder*, logging counts and total input size.
    """
    os.makedirs(destination_folder, exist_ok=True)
    logging.info("Iniciando proceso de tokenización...")
    article_count = 0
    byte_count = 0
    for root, _dirs, filenames in os.walk(articles_folder):
        for name in filenames:
            if not name.endswith('.txt'):
                continue
            path = os.path.join(root, name)
            with open(path, 'r', encoding='utf-8') as handle:
                tokenize_and_save(handle.read(), name, destination_folder)
            article_count += 1
            byte_count += os.path.getsize(path)
    total_size_mb = byte_count / (1024 * 1024)
    logging.info(f"Tokenización completada para {article_count} artículos.")
    logging.info(f"Tamaño total de artículos tokenizados: {total_size_mb:.2f} MB.")
def read_pdf(pdf_path):
    """
    Extract the text of every page of the PDF at *pdf_path*.

    Pages with no extractable text are skipped.  Returns whatever was
    accumulated (possibly '') if reading fails part-way; never raises.
    """
    extracted_parts = []
    try:
        with open(pdf_path, 'rb') as handle:
            for page in PdfReader(handle).pages:
                page_text = page.extract_text()
                if page_text:
                    extracted_parts.append(page_text + '\n')
    except Exception as e:
        logging.error(f"Error al leer PDF {pdf_path}: {e}")
    return ''.join(extracted_parts)
def read_csv(csv_path):
    """
    Read the CSV at *csv_path* and return its cells as plain text.

    Each row becomes one output line with its cells joined by single
    spaces.  Returns '' (or partial content) on failure; never raises.
    """
    content = ''
    try:
        # newline='' is required by the csv module so quoted fields that
        # contain embedded newlines are parsed correctly.
        with open(csv_path, 'r', encoding='utf-8', newline='') as f:
            reader = csv.reader(f)
            for row in reader:
                content += ' '.join(row) + '\n'
    except Exception as e:
        logging.error(f"Error al leer CSV {csv_path}: {e}")
    return content
def read_docx(docx_path):
    """
    Return the paragraph text of the DOCX at *docx_path*, one paragraph
    per line.  Returns '' on failure; never raises.
    """
    text = ''
    try:
        for paragraph in docx.Document(docx_path).paragraphs:
            text += paragraph.text + '\n'
    except Exception as e:
        logging.error(f"Error al leer DOCX {docx_path}: {e}")
    return text
def read_xlsx(xlsx_path):
    """
    Return every cell of every sheet of the workbook at *xlsx_path* as
    plain text: one line per row, cells joined by spaces (empty string
    for blank cells).  Returns '' on failure; never raises.
    """
    content = ''
    try:
        workbook = openpyxl.load_workbook(xlsx_path)
        for sheet_name in workbook.sheetnames:
            worksheet = workbook[sheet_name]
            for row in worksheet.iter_rows():
                cell_texts = [str(cell.value) if cell.value is not None else '' for cell in row]
                content += ' '.join(cell_texts) + '\n'
    except Exception as e:
        logging.error(f"Error al leer XLSX {xlsx_path}: {e}")
    return content
def read_zip(zip_path):
    """
    Concatenate the contents of every member of the ZIP at *zip_path*,
    decoded as UTF-8 with undecodable bytes dropped, one member per
    line.  Returns '' (or partial content) on failure; never raises.
    """
    content = ''
    try:
        with zipfile.ZipFile(zip_path, 'r') as archive:
            for member_name in archive.namelist():
                with archive.open(member_name) as member:
                    content += member.read().decode('utf-8', errors='ignore') + '\n'
    except Exception as e:
        logging.error(f"Error al leer ZIP {zip_path}: {e}")
    return content
def read_html_md(file_path):
    """
    Return the raw text of the HTML or Markdown file at *file_path*,
    decoding as UTF-8 with replacement characters for invalid bytes.
    Returns '' on failure; never raises.
    """
    try:
        with open(file_path, 'r', encoding='utf-8', errors='replace') as handle:
            return handle.read()
    except Exception as e:
        logging.error(f"Error al leer HTML/MD {file_path}: {e}")
        return ''
def format_content(html_content):
    """
    Convert *html_content* (an HTML string) to plain text, dropping
    hyperlinks and images from the output.
    """
    converter = html2text.HTML2Text()
    converter.ignore_links = True
    converter.ignore_images = True
    return converter.handle(html_content)
def process_files(files_folder, destination_folder):
    """
    Walk *files_folder*, extract text from every supported file
    (PDF/CSV/TXT/DOCX/XLSX/ZIP/HTML/MD), translate + clean it, and
    tokenize the result into *destination_folder*.  Unsupported formats
    are logged and skipped; totals are logged at the end.
    """
    os.makedirs(destination_folder, exist_ok=True)
    logging.info("Procesando archivos descargados...")
    # Extension -> extractor for the formats handled by a plain reader.
    readers = {
        '.pdf': read_pdf,
        '.csv': read_csv,
        '.docx': read_docx,
        '.xlsx': read_xlsx,
        '.zip': read_zip,
    }
    processed_count = 0
    processed_bytes = 0
    for root, _dirs, filenames in os.walk(files_folder):
        for name in filenames:
            path = os.path.join(root, name)
            text = ''
            if name.endswith('.txt'):
                try:
                    with open(path, 'r', encoding='utf-8') as handle:
                        text = handle.read()
                except Exception as e:
                    logging.error(f"Error al leer TXT {path}: {e}")
            elif name.endswith('.html') or name.endswith('.md'):
                # HTML/Markdown additionally goes through html2text.
                text = format_content(read_html_md(path))
            else:
                for suffix, reader in readers.items():
                    if name.endswith(suffix):
                        text = reader(path)
                        break
                else:
                    logging.info(f"Formato de archivo no soportado: {name}")
                    continue
            if text:
                cleaned = clean_text(translate_text(text))
                tokenize_and_save(cleaned, name, destination_folder)
                processed_count += 1
                processed_bytes += os.path.getsize(path)
    total_size_mb = processed_bytes / (1024 * 1024)
    logging.info(f"Procesamiento completado para {processed_count} archivos.")
    logging.info(f"Tamaño total de archivos procesados: {total_size_mb:.2f} MB.")
def download_and_save_file(url, destination_folder):
    """
    Stream the resource at *url* into *destination_folder*.

    The output filename is the sanitized last URL segment, with a
    generic fallback when the URL ends in '/'.  Non-200 responses and
    network errors are logged, never raised.
    """
    try:
        logging.info(f"Descargando archivo: {url}")
        response = requests.get(url, stream=True, timeout=30)
        if response.status_code != 200:
            logging.info(f"Error al descargar {url}: Código de estado {response.status_code}")
            return
        filename = clean_filename(url.split('/')[-1]) or 'archivo_descargado'
        file_path = os.path.join(destination_folder, filename)
        with open(file_path, 'wb') as out_file:
            # 8 KiB chunks keep memory flat for arbitrarily large files.
            for chunk in response.iter_content(chunk_size=8192):
                out_file.write(chunk)
        logging.info(f"Archivo descargado: {file_path}")
    except Exception as e:
        logging.error(f"Error al descargar {url}: {e}")
def extract_and_save_article(url, articles_folder):
    """
    Fetch *url*, concatenate the text of its <p> elements, translate and
    clean it, and save the result as a .txt file in *articles_folder*.

    The filename comes from the page <title>, or from the last URL path
    segment when there is no title.  All failures are logged, never
    raised.
    """
    try:
        logging.info(f"Extrayendo artículo: {url}")
        response = requests.get(url, timeout=30)
        if response.status_code != 200:
            logging.info(f"Error al acceder a {url}: Código de estado {response.status_code}")
            return
        soup = BeautifulSoup(response.content, 'html.parser')
        title_tag = soup.find('title')
        title = title_tag.get_text().strip() if title_tag else None
        body = ' '.join(paragraph.get_text() for paragraph in soup.find_all('p'))
        if not body.strip():
            logging.info(f"No se encontró contenido en {url}")
            return
        cleaned_text = clean_text(translate_text(body))
        if title:
            filename = clean_filename(title) + '.txt'
        else:
            filename = clean_filename(urlparse(url).path.split('/')[-1]) + '.txt'
        file_path = os.path.join(articles_folder, filename)
        with open(file_path, 'w', encoding='utf-8') as out_file:
            out_file.write(cleaned_text)
        logging.info(f"Artículo guardado: {file_path}")
    except Exception as e:
        logging.error(f"Error al extraer artículo de {url}: {e}")
def get_page_title(url):
    """
    Return the stripped <title> text of the page at *url*, or None when
    the request fails, the status is not 200, or there is no title.
    """
    try:
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            return None
        title_tag = BeautifulSoup(response.content, 'html.parser').find('title')
        return title_tag.get_text().strip() if title_tag else None
    except Exception as e:
        logging.error(f"Error al obtener el título de la página {url}: {e}")
        return None
def clean_filename(name):
    """
    Sanitize *name* for use as a filename.

    Characters forbidden on common filesystems are replaced with '_',
    whitespace runs collapse to a single '_', and the result is capped
    at 100 characters.  A None input yields the placeholder
    'sin_nombre'.
    """
    if name is None:
        return 'sin_nombre'
    sanitized = re.sub(r'[\\/*?:"<>|]', "_", name)
    sanitized = re.sub(r'\s+', '_', sanitized)
    return sanitized[:100]
def register_processed_notifications(base_folder, urls):
    """
    Filter *urls* down to the ones not seen before and record them.

    The registry is ``processed_articles.txt`` inside *base_folder*, one
    URL per line.  New URLs are appended so subsequent runs skip them.

    Args:
        base_folder: Directory holding the registry file (created if missing).
        urls: Candidate URLs; input order is preserved in the result.

    Returns:
        The subset of *urls* not yet present in the registry.
    """
    os.makedirs(base_folder, exist_ok=True)
    txt_path = os.path.join(base_folder, "processed_articles.txt")
    processed_urls = set()
    if os.path.exists(txt_path):
        # Explicit UTF-8: URLs may contain non-ASCII and the platform
        # default encoding is not reliable (e.g. cp1252 on Windows).
        with open(txt_path, 'r', encoding='utf-8') as f:
            processed_urls = set(f.read().splitlines())
    urls_to_process = [url for url in urls if url not in processed_urls]
    with open(txt_path, 'a', encoding='utf-8') as f:
        for url in urls_to_process:
            f.write(url + "\n")
    if processed_urls:
        logging.info(f"Artículos ya procesados: {len(processed_urls)}")
    else:
        logging.info("No hay artículos procesados previamente.")
    return urls_to_process
def explore_wayback_machine(url, articles_folder):
    """
    Query the Wayback Machine availability API for *url* and, when a
    snapshot exists, extract the archived page like a regular article.
    Failures are logged, never raised.
    """
    try:
        logging.info(f"Explorando Wayback Machine para: {url}")
        api_url = f"http://archive.org/wayback/available?url={url}"
        data = requests.get(api_url, timeout=10).json()
        snapshots = data['archived_snapshots'] if 'archived_snapshots' in data else {}
        if 'closest' in snapshots:
            archive_url = snapshots['closest']['url']
            logging.info(f"Descargando desde Wayback Machine: {archive_url}")
            extract_and_save_article(archive_url, articles_folder)
        else:
            logging.info(f"No se encontró versión archivada para {url}")
    except Exception as e:
        logging.error(f"Error al explorar Wayback Machine para {url}: {e}")
def get_folder_info(path):
    """
    Return ``(total_bytes, file_count)`` for every regular file found
    recursively under *path*.
    """
    size_bytes = 0
    file_count = 0
    for dirpath, _dirnames, filenames in os.walk(path):
        for name in filenames:
            size_bytes += os.path.getsize(os.path.join(dirpath, name))
            file_count += 1
    return size_bytes, file_count
def explore_and_extract_articles(url, articles_folder, files_folder, processed_urls, size_limit, depth=0, max_depth=6):
    """
    Recursively crawl *url*, saving articles and downloadable files.

    The page is rendered with requests-html (JavaScript executed) and
    every absolute link found is either downloaded (known file
    extension), skipped (mailto:/tel:), or treated as an article and
    crawled further, up to *max_depth* levels.  *processed_urls* is a
    set shared across the whole crawl to avoid revisiting links.

    Fix vs. previous version: the *size_limit* check now runs at the top
    of every call (i.e. before each page is fetched and before each
    recursive descent) instead of only after all links of a page were
    already processed, so the crawl actually stops near the budget
    instead of overshooting it arbitrarily.
    """
    if depth > max_depth:
        return
    # Enforce the disk budget BEFORE doing any more network work.
    total_size_articles, _ = get_folder_info(articles_folder)
    total_size_files, _ = get_folder_info(files_folder)
    if total_size_articles + total_size_files >= size_limit:
        logging.info("Se ha alcanzado el límite de tamaño de 50 GB. Deteniendo exploración.")
        return
    logging.info(f"Explorando {url} en profundidad {depth}...")
    try:
        session = HTMLSession()
        response = session.get(url, timeout=30)
        # Render JavaScript so dynamically inserted links are captured.
        response.html.render(timeout=30, sleep=1)
        links = response.html.absolute_links
        session.close()
    except Exception as e:
        logging.error(f"Error al acceder a {url}: {e}")
        return
    for link in links:
        if link in processed_urls:
            continue
        processed_urls.add(link)
        parsed_link = urlparse(link)
        file_extension = os.path.splitext(parsed_link.path)[1].lower()
        if file_extension in ['.pdf', '.csv', '.txt', '.xlsx', '.docx', '.html', '.md', '.zip']:
            download_and_save_file(link, files_folder)
        elif 'mailto:' in link or 'tel:' in link:
            continue
        else:
            extract_and_save_article(link, articles_folder)
            explore_and_extract_articles(link, articles_folder, files_folder, processed_urls, size_limit, depth + 1, max_depth)
def main():
    """
    Entry point: crawl the configured news sites, archive their articles
    and downloadable files under the NOTICIAS base folder, then tokenize
    everything and log a summary.
    """
    logging.info("Función: main")
    urls = [
        'https://reactionary.international/database/',
        'https://aleph.occrp.org/',
        'https://offshoreleaks.icij.org/',
        'https://www.publico.es/',
        'https://www.elsaltodiario.com/',
        'https://www.nytimes.com/',
        'https://www.theguardian.com/',
        'https://www.lemonde.fr/',
        'https://www.spiegel.de/',
        'https://elpais.com/',
        'https://www.repubblica.it/',
        'https://www.scmp.com/',
        'https://www.smh.com.au/',
        'https://www.globo.com/',
        'https://timesofindia.indiatimes.com/',
        'https://www.asahi.com/',
        'https://www.washingtonpost.com/',
        'https://www.aljazeera.com/',
        'https://www.folha.uol.com.br/',
        'https://www.telegraph.co.uk/',
        'https://www.corriere.it/',
        'https://www.clarin.com/',
        'https://www.eluniversal.com.mx/',
        'https://www.welt.de/',
        'https://www.lanacion.com.ar/',
        'https://www.bbc.com/',
        'https://www.elconfidencial.com/',
        'https://www.expansion.com/',
        'https://www.lavanguardia.com/',
        'https://www.elperiodico.com/',
        'https://www.abc.es/',
        'https://www.elespanol.com/',
        'https://www.lainformacion.com/',
        'https://www.elcorreo.com/',
        'https://www.canarias7.es/',
        'https://www.diariovasco.com/',
        'https://www.farodevigo.es/',
        'https://www.lavozdegalicia.es/',
        'https://www.marca.com/',
        'https://www.mundodeportivo.com/',
        'https://www.elmundo.es/',
        'https://www.cnbc.com/',
        'https://www.bloomberg.com/',
        'https://www.forbes.com/',
        'https://www.economist.com/',
        'https://www.ft.com/',
        'https://www.wsj.com/',
        'https://www.technologyreview.com/',
        'https://www.cyberdefensemagazine.com/',
        'https://www.securityweek.com/',
        'https://www.darkreading.com/',
        'https://www.infosecurity-magazine.com/',
        'https://www.helpnetsecurity.com/',
        'https://www.computerweekly.com/',
        'https://www.csoonline.com/',
        'https://www.zdnet.com/',
        'https://www.itpro.co.uk/',
        'https://www.theregister.com/',
        'https://www.datacenterdynamics.com/',
        'https://www.scmagazine.com/',
        'https://www.teiss.co.uk/',
        'https://www.tripwire.com/',
        'https://www.infoworld.com/',
        'https://www.cnet.com/',
        'https://www.tomsguide.com/',
        'https://www.theverge.com/',
        'https://www.arstechnica.com/',
        'https://www.engadget.com/',
        'https://www.gizmodo.com/',
        'https://www.wired.com/',
        'https://www.vice.com/',
        'https://www.politico.com/',
        'https://www.theatlantic.com/',
        'https://www.newyorker.com/',
        'https://www.rollingstone.com/',
        'https://www.thedailybeast.com/',
        'https://www.salon.com/',
        'https://www.slate.com/',
        'https://www.huffpost.com/',
        'https://www.vox.com/',
        'https://www.bbc.co.uk/news',
        'https://www.dailymail.co.uk/home/index.html',
        'https://www.independent.co.uk/',
        'https://www.irishtimes.com/',
        'https://www.thejournal.ie/',
        'https://www.thetimes.co.uk/',
        'https://www.thesun.co.uk/',
        'https://www.telegraph.co.uk/',
        'https://www.euronews.com/',
        'https://www.reuters.com/',
        'https://www.dw.com/',
        'https://www.france24.com/',
        'https://www.lefigaro.fr/',
        'https://www.lemonde.fr/',
        'https://www.derstandard.at/',
        'https://www.nzz.ch/',
        'https://www.eldiario.es/',
        'https://www.rtve.es/',
        'https://www.rt.com/',
        'https://www.elciudadano.com/',
        'https://www.apnews.com/',
        'https://www.univision.com/',
        'https://www.televisa.com/',
        'https://www.bbc.com/',
        'https://www.cnn.com/',
        'https://www.foxnews.com/',
        'https://www.aljazeera.com/',
        'https://www.trtworld.com/',
        'https://www.newsweek.com/',
        'https://www.time.com/',
        'https://www.spectator.co.uk/'
    ]
    # The literal above contains duplicates (telegraph.co.uk, lemonde.fr,
    # aljazeera.com, bbc.com); deduplicate order-preservingly so a first
    # run does not crawl the same site twice.
    urls = list(dict.fromkeys(urls))
    # NOTE(review): deployment-specific absolute path — consider making
    # this configurable (env var / CLI argument).
    base_folder = '/var/www/theflows.net/flujos/FLUJOS_DATOS/NOTICIAS'
    articles_folder = os.path.join(base_folder, 'articulos')
    files_folder = os.path.join(base_folder, 'archivos')
    tokenized_folder = os.path.join(base_folder, 'tokenized')
    for folder in (articles_folder, files_folder, tokenized_folder):
        os.makedirs(folder, exist_ok=True)
    FOLDER_SIZE_LIMIT = 50 * 1024 * 1024 * 1024  # 50 GB disk budget for the crawl
    urls_to_process = register_processed_notifications(base_folder, urls)
    processed_urls = set()  # shared across the whole crawl to avoid revisits
    for url in urls_to_process:
        logging.info(f"\nProcesando URL: {url}")
        explore_and_extract_articles(url, articles_folder, files_folder, processed_urls, FOLDER_SIZE_LIMIT)
        explore_wayback_machine(url, articles_folder)
    process_files(files_folder, tokenized_folder)
    tokenize_all_articles(articles_folder, tokenized_folder)
    total_size_articles, total_files_articles = get_folder_info(articles_folder)
    total_size_files, total_files_files = get_folder_info(files_folder)
    total_size_tokenized, total_files_tokenized = get_folder_info(tokenized_folder)
    logging.info("\nResumen del proceso:")
    logging.info(f"Artículos descargados: {total_files_articles}")
    logging.info(f"Tamaño total de artículos: {total_size_articles / (1024 * 1024):.2f} MB")
    logging.info(f"Archivos descargados: {total_files_files}")
    logging.info(f"Tamaño total de archivos: {total_size_files / (1024 * 1024):.2f} MB")
    logging.info(f"Archivos tokenizados: {total_files_tokenized}")
    logging.info(f"Tamaño total de archivos tokenizados: {total_size_tokenized / (1024 * 1024):.2f} MB.")


if __name__ == "__main__":
    main()