From 5064b29cf9c5ef45f38cf7278b4263910336b7e2 Mon Sep 17 00:00:00 2001
From: Dinan Xiao <2962142013@qq.com>
Date: Wed, 16 Apr 2025 20:51:28 +0800
Subject: [PATCH 1/2] Update SwissCrawler in Targets_Prediction.py

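Update SwissCrawler:
- crawl SwissTargetPrediction again by downloading the site's CSV export
- add random_sleep between page actions to avoid anti-crawler measures
- keep the original SwissCrawler API, apart from the new tmp_dir (download directory) argument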
---
 Targets_Prediction.py | 164 +++++++++++++++++++++---------------------
 1 file changed, 82 insertions(+), 82 deletions(-)

diff --git a/Targets_Prediction.py b/Targets_Prediction.py
index fde886f..8c12415 100644
--- a/Targets_Prediction.py
+++ b/Targets_Prediction.py
@@ -1,6 +1,8 @@
 ### IMPORT LIBRARIES ###
 # make sure libraries are installed on your PC
 # install libraries via 'pip install xxx'
+import os
+
 from selenium import webdriver
 from selenium.webdriver.common.by import By
 from selenium.webdriver.support.ui import WebDriverWait
@@ -11,88 +13,78 @@
 import argparse
 import pandas as pd
 from concurrent.futures import ThreadPoolExecutor
-from bioservices import UniProt
-
+import sys
+import random
+import time
+from pathlib import Path
+from bioservices import UniProt
 ### DEFINE FUNCTIONs ###
+
+def random_sleep(min_time=1, max_time=3):
+    """Pause for a uniformly random number of seconds in [min_time, max_time] and return it."""
+    delay = random.uniform(min_time, max_time)
+    time.sleep(delay)
+    return delay
+
 ## Crawl data from SwissTargetPrediction
-def SwissCrawler (smiles, CpdName):
-    SwissUrl = 'http://www.swisstargetprediction.ch/index.php'
-    platform = 'SwissTargetPrediction' 
-    driver.get(SwissUrl) 
-    SearchField = driver.find_element(By.NAME, 'smiles') 
-    SearchField.send_keys(smiles) 
-    SearchField.submit() 
-    dfs = []  
-    max_retries = 3  
-    retries = 0  
-    all_pages_processed = False
-    while retries < max_retries and not all_pages_processed:
-        try:
-            WebDriverWait(driver, 200).until(EC.presence_of_element_located((By.XPATH, '//*[@id="resultTable"]/tbody')))
-            CurrUrl = driver.current_url 
-            df = pd.read_html(CurrUrl) 
-            df = df[0] 
-            cols = [col for col in df.columns if col in ['Uniprot ID', 'Probability*']]
-            df = df[cols]          
-            df.insert(0, 'compound', CpdName)   
-            df.insert(1, 'platform', platform)  
-            df = df.rename(columns={"Uniprot ID": "uniprotID", "Probability*": "prob"}) 
-            dfs.append(df)  
-            ## Determine whether the current page is the last page. If it is not the last page, click the "Next" button to load the next page.
-            try:
-                next_button = driver.find_element(By.XPATH, '//*[@id="resultTable_next"]')
-                if (df['prob'] == 0).any():  
-                    all_pages_processed = True
-                elif next_button.get_attribute("class") == "paginate_button next disabled":
-                    all_pages_processed = True  
-            except NoSuchElementException:
-                all_pages_processed = True  
-            ## If the current page is not the last page, click the "Next" button to load the next page.
-            if not all_pages_processed:
-                next_button.click()
-                WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, '//*[@id="resultTable"]/tbody')))
-            else:
-                break
-        ## Handling exceptional situations: page loading timeout, pop-up warning boxes, and other exceptions.
-        except TimeoutException:
-            retries += 1
-            if retries >= max_retries:
-                all_pages_processed = True
-                CurrUrl = driver.current_url
-                dfs = pd.DataFrame(columns=['compound', 'platform', 'uniprotID', 'prob'])
-                dfs = pd.concat([dfs, pd.DataFrame({'compound': [CpdName],
-                                                'platform': [platform],
-                                                'uniprotID': ['result page reached timeout'],
-                                                'prob': [CurrUrl]})], ignore_index=True)
-                return dfs
-        except UnexpectedAlertPresentException:
-            retries += 1
-            if retries >= max_retries:
-                all_pages_processed = True
-                alert = driver.switch_to.alert
-                dfs = pd.DataFrame(columns=['compound', 'platform', 'uniprotID', 'prob'])
-                dfs = pd.concat([dfs, pd.DataFrame({'compound': [CpdName],
-                                                'platform': [platform],
-                                                'uniprotID': ['error message'],
-                                                'prob': [alert.text]})], ignore_index=True)
-                alert.accept()
-                return dfs
-        except Exception as e:
-            print(f"Error occurred: {e}")
-            all_pages_processed = True
-            break
-    ## If the page is loaded normally, the data of all pages is merged into one dataframe.        
-    df = pd.concat(dfs, ignore_index=True)
-    ## Retain target data from SwissTargetPrediction database with "Probability*" greater than or equal to 0.6.
+def SwissCrawler (smiles, CpdName, tmp_dir):
+    """Query SwissTargetPrediction for one compound through the site's CSV export.
+
+    smiles is typed into the search form, CpdName fills the 'compound' column, and tmp_dir
+    is the pathlib.Path the browser downloads into. Uses the module-level `driver`.
+    """
+    # CONFIG
+    SwissUrl = 'http://www.swisstargetprediction.ch/'
+    platform = 'SwissTargetPrediction'
+    wait_predict_clickable = 10
+    wait_csv_download_clickable = 300
+    wait_download_finish = 10
+    csv_path = tmp_dir / 'SwissTargetPrediction.csv'
+
+    # Drop a CSV left by an aborted run so stale rows are never read for this compound
+    if csv_path.exists():
+        os.remove(csv_path)
+
+    # Open the SwissTargetPrediction website
+    driver.get(SwissUrl)
+    random_sleep()
+
+    # Input SMILES
+    driver.find_element(By.NAME, 'smiles').send_keys(smiles)
+
+    # Wait until the "Predict" button is clickable, then click the element the wait returned
+    PredictButton = WebDriverWait(driver, wait_predict_clickable).until(
+        EC.element_to_be_clickable((By.ID, 'submitButton')))
+    random_sleep()
+    PredictButton.click()
+
+    # Prediction is done once "Download CSV" is clickable; the button is found via its icon src
+    CsvButtonElement = WebDriverWait(driver, wait_csv_download_clickable).until(
+        EC.element_to_be_clickable((By.XPATH, '//button[./img[@src="/images/csv-24.png"]]')))
+    random_sleep()
+    CsvButtonElement.click()
+
+    # Give the browser time to finish the download, then load the CSV from tmp_dir
+    time.sleep(wait_download_finish)
+    SwissTargetPred_df = pd.read_csv(csv_path)
+    os.remove(csv_path)  # the download is only a temporary artefact
+    df_row_n = SwissTargetPred_df.shape[0]
+    df = pd.DataFrame({
+        'compound': [CpdName] * df_row_n,
+        'platform': [platform] * df_row_n,
+        'uniprotID': SwissTargetPred_df['Uniprot ID'],
+        'prob': SwissTargetPred_df['Probability*']
+    })
+    # ## Retain target data from SwissTargetPrediction database with "Probability*" greater than or equal to 0.6.
     df = df[df['prob'] >= 0.6]
-    ## Retrieve the entry name corresponding to the UniProt ID from the UniProt database.      
+    ## Retrieve the entry name corresponding to the UniProt ID from the UniProt database.
     def get_uniprot_name(entry):
         u = UniProt(verbose=False)
-        res = u.search(f"{entry}+AND+organism_id:9606", frmt="tsv", columns="id", limit=1)        
+        res = u.search(f"{entry}+AND+organism_id:9606", frmt="tsv", columns="id", limit=1)
         if len(res.split('\n')) < 2:
             Entr = 'no_entry_found_in_uniprot'
         else:
-            Entr = res.split('\n')[1].split('\t')[0]        
+            Entr = res.split('\n')[1].split('\t')[0]
         return Entr
     def get_uniprot_names(df):
         def process_entry(entry):
@@ -115,7 +107,7 @@ def process_entry(entry):
         df = df.drop('uniprotID', axis=1)
         return df
     df = get_uniprot_names(df)
-    return df    
+    return df
 
 ## Crawl data from SEA       
 def SEACrawler (smiles, CpdName):
@@ -321,10 +313,16 @@ def process_entry(entry):
     ## The following code is used to start crawling.
     options = webdriver.ChromeOptions()
     options.add_experimental_option('excludeSwitches', ['enable-logging'])
+    out_dir = Path(os.path.abspath(args.output)).parent  # absolute: relative download dirs are unreliable in Chrome
+    out_dir.mkdir(parents=True, exist_ok=True)
+    options.add_experimental_option("prefs", {  # Set download directory for SwissTargetPrediction
+        "download.default_directory": str(out_dir),
+        "download.prompt_for_download": False,
+        "download.directory_upgrade": True,
+        "safebrowsing.enabled": True
+    })
     driver = webdriver.Chrome(options=options)
-
-    # driver = webdriver.Chrome('C:\\Windows\\chromedriver.exe', options=options)
-    cols = ['compound','platform','prob','UniProt_name'] 
+    cols = ['compound','platform','prob','UniProt_name']
     results = pd.DataFrame(columns=cols)
     results.to_csv(args.output,sep=',')     
     ## The following code is used to crawl through the 3 target prediction servers.
@@ -332,16 +330,18 @@ def process_entry(entry):
     for index, row in data.iterrows():
         CpdName = row['name']
         smiles = row['smiles']
-        SwissResult = SwissCrawler(smiles, CpdName)
+        SwissResult = SwissCrawler(smiles, CpdName, tmp_dir=out_dir)
         SEAResult = SEACrawler(smiles, CpdName)
-        SuperPredResult = SuperPredCrawler(smiles, CpdName)        
+        SuperPredResult = SuperPredCrawler(smiles, CpdName)
         with open (args.output,'a',newline='') as f:
+            # Append this compound's rows from all three crawlers to the output CSV
             SwissResult.to_csv(f,sep=',',header=False)
             SEAResult.to_csv(f,sep=',',header=False)
-            SuperPredResult.to_csv(f,sep=',',header=False)           
+            SuperPredResult.to_csv(f,sep=',',header=False)
         print('         screened {} of {} molecules ({})'.format(index+1, rowcount, CpdName))
     ## The following code is used to close the browser.
-    driver.quit() 
+    # driver.quit()
     print('') 
     print('     Finished Analysis')
     print('     Results are now available in "{}"'.format(args.output))
+

From 99db19981a7b04a97b01e2f94756bb8c4865ca39 Mon Sep 17 00:00:00 2001
From: Dinan Xiao <2962142013@qq.com>
Date: Wed, 16 Apr 2025 20:55:17 +0800
Subject: [PATCH 2/2] Re-enable driver.quit() in Targets_Prediction.py

---
 Targets_Prediction.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Targets_Prediction.py b/Targets_Prediction.py
index 8c12415..d9c75be 100644
--- a/Targets_Prediction.py
+++ b/Targets_Prediction.py
@@ -340,7 +340,7 @@ def process_entry(entry):
             SuperPredResult.to_csv(f,sep=',',header=False)
         print('         screened {} of {} molecules ({})'.format(index+1, rowcount, CpdName))
     ## The following code is used to close the browser.
-    # driver.quit()
+    driver.quit()
     print('') 
     print('     Finished Analysis')
     print('     Results are now available in "{}"'.format(args.output))