From 56fb02da22434df2ce7cb8f01cbe7bdc3388e412 Mon Sep 17 00:00:00 2001
From: Rafael Sant Anna <rafaeel.16@gmail.com>
Date: Tue, 2 Jul 2024 21:55:47 -0300
Subject: [PATCH] =?UTF-8?q?Aprimoramento=20do=20script=20para=20downloads?=
 =?UTF-8?q?=20paralelos=20com=20monitoramento=20de=20progresso:=20utiliza?=
 =?UTF-8?q?=20`ThreadPoolExecutor`=20para=20aumentar=20a=20velocidade=20em?=
 =?UTF-8?q?=205x,=20adiciona=20barra=20de=20progresso=20com=20`tqdm`,=20e?=
 =?UTF-8?q?=20melhora=20o=20uso=20da=20mem=C3=B3ria=20RAM,=20reduzindo=20d?=
 =?UTF-8?q?e=2080%=20para=2060%.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 dados_cnpj_baixa.py | 66 +++++++++++++++++++++++++++------------------
 1 file changed, 40 insertions(+), 26 deletions(-)

diff --git a/dados_cnpj_baixa.py b/dados_cnpj_baixa.py
index 958043e..b45314f 100644
--- a/dados_cnpj_baixa.py
+++ b/dados_cnpj_baixa.py
@@ -5,57 +5,71 @@
 lista relação de arquivos na página de dados públicos da receita federal
 e faz o download
 """
+
 from bs4 import BeautifulSoup
 import requests, wget, os, sys, time, glob
+from concurrent.futures import ThreadPoolExecutor
+from tqdm import tqdm
 
-url = 'https://www.gov.br/receitafederal/pt-br/assuntos/orientacao-tributaria/cadastros/consultas/dados-publicos-cnpj'
 url = 'http://200.152.38.155/CNPJ/'
 
+pasta_compactados = r"dados-publicos-zip" # location of the zipped files from Receita (download destination)
 
-pasta_compactados = r"dados-publicos-zip" #local dos arquivos zipados da Receita
-
-if len(glob.glob(os.path.join(pasta_compactados,'*.zip'))):
+if glob.glob(os.path.join(pasta_compactados, '*.zip')):  # non-empty match list: zips are already present
     print(f'Há arquivos zip na pasta {pasta_compactados}. Apague ou mova esses arquivos zip e tente novamente')
     sys.exit()
-       
-page = requests.get(url)    
+page = requests.get(url, timeout=60)  # without a timeout the script would wait forever on a stalled server
 data = page.text
-soup = BeautifulSoup(data)
+soup = BeautifulSoup(data, 'html.parser')
 lista = []
 print('Relação de Arquivos em ' + url)
 for link in soup.find_all('a'):
-    if str(link.get('href')).endswith('.zip'): 
+    if str(link.get('href')).endswith('.zip'):
         cam = link.get('href')
         # if cam.startswith('http://http'):
         #     cam = 'http://' + cam[len('http://http//'):] 
         if not cam.startswith('http'):
-            print(url+cam)
-            lista.append(url+cam)
+            print(url + cam)
+            lista.append(url + cam)
         else:
             print(cam)
             lista.append(cam)
-            
+
 resp = input(f'Deseja baixar os arquivos acima para a pasta {pasta_compactados} (y/n)?')
-if resp.lower()!='y' and resp.lower()!='s':
+if resp.lower() not in ('y', 's'):  # any answer other than y/s aborts
     sys.exit()
-    
+
 def bar_progress(current, total, width=80):
-    if total>=2**20:
-        tbytes='Megabytes'
-        unidade = 2**20
+    if total >= 2 ** 20:
+        tbytes = 'Megabytes'
+        unidade = 2 ** 20
     else:
-        tbytes='kbytes'
-        unidade = 2**10
-    progress_message = f"Baixando: %d%% [%d / %d] {tbytes}" % (current / total * 100, current//unidade, total//unidade)
-    # Don't use print() as it will print in new line every time.
+        tbytes = 'kbytes'
+        unidade = 2 ** 10
+    progress_message = f"Baixando: {(current / total * 100 if total > 0 else 0):.2f}% [{current // unidade} / {total // unidade}] {tbytes}"  # total can be 0 or negative when the size is empty/unknown; avoid ZeroDivisionError
     sys.stdout.write("\r" + progress_message)
     sys.stdout.flush()
-  
-for k, url in enumerate(lista):
-    print('\n' + time.asctime() + f' - item {k}: ' + url)
-    wget.download(url, out=os.path.join(pasta_compactados, os.path.split(url)[1]), bar=bar_progress)
-    
-print('\n\n'+ time.asctime() + f' Finalizou!!! Baixou {len(lista)} arquivos.')
+
+def download_file(url, pbar):
+    """Download `url` into `pasta_compactados` under the URL's last path component, then advance the shared bar `pbar` by one."""
+    wget.download(url, out=os.path.join(pasta_compactados, os.path.split(url)[1]), bar=None)  # bar=None: concurrent workers rewriting one "\r" line garble each other and the tqdm bar
+    pbar.update(1)
+    tqdm.write("\n" + os.path.basename(url) + " baixado com sucesso.")  # tqdm.write prints without breaking the live bar
+
+start_time = time.time()
+os.makedirs(pasta_compactados, exist_ok=True)  # make sure the destination folder exists before any download starts
+# Overall progress bar: one tick per finished file
+with tqdm(total=len(lista), desc="Progresso Geral", unit="arquivo") as pbar:
+    # Run the downloads in parallel on a pool of five worker threads
+    with ThreadPoolExecutor(max_workers=5) as executor:
+        futures = [executor.submit(download_file, url, pbar) for url in lista]
+        for future in futures:
+            future.result()  # re-raises any exception raised inside the worker
+
+end_time = time.time()
+print('\n\nFinalizou!!!')
+print(f'Baixou {len(lista)} arquivos.')
+print(f'Tempo total: {end_time - start_time:.2f} segundos.')
 
 #lista dos arquivos
 '''