-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathutils.py
More file actions
100 lines (98 loc) · 5.65 KB
/
utils.py
File metadata and controls
100 lines (98 loc) · 5.65 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
from selenium.common.exceptions import StaleElementReferenceException
from selenium.common.exceptions import NoSuchElementException
import requests
import os
import time
from bs4 import BeautifulSoup
# Characters replaced by "_" when turning the link into a directory name.
_VORTAL_DIR_TRANSLATION = str.maketrans({ch: "_" for ch in "/: .=?"})

# Extensions looked for in the server-supplied filename, in this order
# (".xlsx" must be tested before ".xls" because it contains it).
_VORTAL_KNOWN_EXTENSIONS = (".pdf", ".xlsx", ".xls")

# Request headers sent with every Vortal HTTP call.
_VORTAL_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.3",
    "Referer": "https://community.vortal.biz/",
}

# NOTE(review): these look like values captured from one browser session and
# are hard-coded; they presumably expire -- confirm whether they should be read
# from the live Selenium session (driver.get_cookies()) instead.
_VORTAL_COOKIES = {
    "PublicSessionCookie": "gnk1anpt4mbgyum3dkhdaqyk",
    "HAPRXSID": "10.101.2.33",
}


def _vortal_extract(text, start_marker, end_marker):
    """Return the text between start_marker and the next end_marker, or None if either is absent."""
    start = text.find(start_marker)
    if start == -1:
        return None
    start += len(start_marker)
    end = text.find(end_marker, start)
    if end == -1:
        return None
    return text[start:end]


def _vortal_target_filename(content_disposition, fallback):
    """Build a safe local filename from a Content-Disposition header value.

    The first double-quoted string in the header is used as the name (an
    unquoted ``filename=`` token is accepted too); ``fallback`` is used when
    the header is missing or carries no name. Directory components are
    stripped so a server-chosen name cannot escape the download folder. The
    extension is chosen as ".pdf", ".xlsx", ".xls" (first one contained in the
    name, case-insensitively) or ".zip" otherwise, and is appended only when
    the name does not already end with it.
    """
    name = ""
    if content_disposition:
        quoted = content_disposition.split('"')
        if len(quoted) > 1:
            name = quoted[1]
        else:
            _, _, tail = content_disposition.partition("filename=")
            name = tail.split(";")[0].strip()
    if not name.strip():
        name = str(fallback)
    # Normalise backslashes first so Windows-style paths are stripped as well.
    name = os.path.basename(name.replace("\\", "/")).strip()
    if name in ("", ".", ".."):
        name = "download"
    lowered = name.lower()
    extension = next((ext for ext in _VORTAL_KNOWN_EXTENSIONS if ext in lowered), ".zip")
    if not lowered.endswith(extension):
        name += extension
    return name


def _vortal_download_document(document_file_id, mkey, target_dir):
    """Follow Vortal's download redirect chain for one document and save it into target_dir."""
    url = f"https://community.vortal.biz/PRODPublic/Tendering/OpportunityDetail/DownloadFile?documentFileId={document_file_id}&mkey={mkey}"
    print(url)
    # The context manager closes the session's pooled connections when done.
    with requests.Session() as session:
        # make the initial request to get the redirect URL
        response = session.get(url, headers=_VORTAL_HEADERS, cookies=_VORTAL_COOKIES, timeout=30)
        if response.status_code != 200:
            print("Failed to access the initial URL.")
            return
        redirect_url = response.url  # final URL after any redirects
        print("redirect url", redirect_url)
        # NOTE(review): this request is deliberately left outside the session,
        # as in the original code -- confirm whether it should share it.
        file_response = requests.get(redirect_url, headers=_VORTAL_HEADERS, cookies=_VORTAL_COOKIES, timeout=30)
        print("file response")
        print(file_response)
        soup = BeautifulSoup(file_response.content, "html.parser")
        print("soup: ", soup)
        script_tag = soup.find("script")
        print("script tag: ", script_tag)
        if script_tag is None:
            print("No script tag found in the response.")
            return
        # The download path is the first single-quoted string in the script.
        script_parts = script_tag.text.strip().split("'")
        if len(script_parts) < 2:
            print("No download link found in the script tag.")
            return
        final_href = script_parts[1]
        print("final href ", final_href)
        # make the final request to download the file
        final_url = f"https://community.vortal.biz{final_href}"
        print("final url")
        print(final_url)
        final_response = session.get(final_url, headers=_VORTAL_HEADERS, cookies=_VORTAL_COOKIES, timeout=30)
        print("final response", final_response)
        if not final_response.ok:
            # Do not store an HTTP error page as if it were the document.
            print("Failed to download the file.")
            return
        # save the file
        filename_vortal = _vortal_target_filename(
            final_response.headers.get('Content-Disposition'),
            document_file_id,
        )
        with open(f"{target_dir}/{filename_vortal}", "wb") as f:
            f.write(final_response.content)


def download_vortal(link_value, driver, link_element, wait):
    """Download every document listed on a Vortal opportunity page.

    Clicks ``link_element``, waits until three browser windows exist, switches
    to the newly opened one, and then, for each row of the
    ``grdGridDocumentList_tbl`` table, extracts the ``documentFileId`` and
    ``mkey`` parameters from the download cell and saves the document under
    ``Filedump/<sanitised link_value>/``.

    Args:
        link_value: Link text/URL; its basename (with ``/ : space . = ?``
            replaced by ``_``) names the output directory.
        driver: Selenium driver; ``window_handles`` and ``switch_to.window``
            are used.
        link_element: Element that opens the Vortal window when clicked.
        wait: Object exposing ``until(condition)`` (presumably a
            ``WebDriverWait`` bound to ``driver``).

    Returns:
        None. The driver is left focused on the Vortal window.

    Rows without a download cell, or whose download parameters cannot be
    parsed, are reported and skipped. Exceptions raised by ``wait.until`` or
    by the HTTP requests propagate to the caller.
    """
    print("É VORTAL!")
    link_value = os.path.basename(link_value).translate(_VORTAL_DIR_TRANSLATION)
    target_dir = f'Filedump/{link_value}'
    os.makedirs(target_dir, exist_ok=True)

    # Remember the existing windows so the new one can be identified reliably.
    handles_before = set(driver.window_handles)
    link_element.click()
    # wait for the new window to open
    # (The former `wait.until(EC.new_window_is_opened)` passed the condition
    # factory itself, which always evaluated truthy and never waited, so it
    # has been dropped; the window-count wait below is the effective one.)
    wait.until(EC.number_of_windows_to_be(3))
    time.sleep(5)
    # switch to the new window, falling back to the last handle if none is new
    new_handles = [handle for handle in driver.window_handles if handle not in handles_before]
    driver.switch_to.window(new_handles[-1] if new_handles else driver.window_handles[-1])
    print(len(driver.window_handles))

    # wait for the table to be present in the DOM
    vortal_table = wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="grdGridDocumentList_tbl"]')))
    tbody_vortal = vortal_table.find_element(By.TAG_NAME, "tbody")
    rows_vortal = tbody_vortal.find_elements(By.TAG_NAME, "tr")

    for x, row_vortal in enumerate(rows_vortal):
        try:
            last_column_element = row_vortal.find_element(By.XPATH, './/*[@id="grdGridDocumentListtd_thColumnDownloadDocument"]')
        except NoSuchElementException:
            print("Could not find last column element for row", x)
            continue

        last_column_contents = last_column_element.get_attribute("outerHTML") or ""
        raw_document_id = _vortal_extract(last_column_contents, "documentFileId=", "&")
        mkey = _vortal_extract(last_column_contents, "&mkey=", "'")
        if raw_document_id is None or mkey is None:
            # Without both parameters the download URL would be garbage.
            print("Could not parse download parameters for row", x)
            continue

        # The id may be split by JavaScript string concatenation in the markup.
        document_file_id = raw_document_id.strip().replace("' + '", "")
        print("document file", document_file_id)
        print("mkey", mkey)
        _vortal_download_document(document_file_id, mkey, target_dir)