|
4 | 4 | from .HTMLparsers import getSchiHubPDF, SciHubUrls |
5 | 5 | import random |
6 | 6 | from .NetInfo import NetInfo |
| 7 | +from .Utils import URLjoin |
| 8 | + |
7 | 9 |
|
8 | 10 | # Import enhanced downloader for improved experience |
9 | 11 | try: |
|
14 | 16 | print("Enhanced downloader not available. Install pySmartDL for better downloading experience.") |
15 | 17 |
|
16 | 18 | def setSciHubUrl(): |
| 19 | + print("Searching for a sci-hub mirror") |
17 | 20 | r = requests.get(NetInfo.SciHub_URLs_repo, headers=NetInfo.HEADERS) |
18 | 21 | links = SciHubUrls(r.text) |
19 | | - found = False |
20 | 22 |
|
21 | 23 | for l in links: |
22 | 24 | try: |
| 25 | + print("Trying with {}...".format(l)) |
23 | 26 | r = requests.get(l, headers=NetInfo.HEADERS) |
24 | 27 | if r.status_code == 200: |
25 | | - found = True |
26 | 28 | NetInfo.SciHub_URL = l |
27 | 29 | break |
28 | 30 | except: |
29 | 31 | pass |
30 | | - if found: |
31 | | - print("\nUsing {} as Sci-Hub instance".format(NetInfo.SciHub_URL)) |
32 | 32 | else: |
33 | | - print("\nNo working Sci-Hub instance found!\nIf in your country Sci-Hub is not available consider using a VPN or a proxy") |
| 33 | + print( |
| 34 | +        "\nNo working Sci-Hub instance found!\nIf in your country Sci-Hub is not available consider using a VPN or a proxy\nYou can use a specific mirror with the --scihub-mirror argument")
34 | 35 | NetInfo.SciHub_URL = "https://sci-hub.st" |
35 | 36 |
|
36 | 37 |
|
37 | 38 | def getSaveDir(folder, fname): |
38 | 39 | dir_ = path.join(folder, fname) |
39 | 40 | n = 1 |
40 | 41 | while path.exists(dir_): |
41 | | - n += 1 |
42 | | - dir_ = path.join(folder, "("+str(n)+")"+fname) |
| 42 | + n += 1 |
| 43 | + dir_ = path.join(folder, f"({n}){fname}") |
43 | 44 |
|
44 | 45 | return dir_ |
45 | 46 |
|
46 | | -def saveFile(file_name,content, paper,dwn_source): |
| 47 | + |
| 48 | +def saveFile(file_name, content, paper, dwn_source): |
47 | 49 | f = open(file_name, 'wb') |
48 | 50 | f.write(content) |
49 | 51 | f.close() |
@@ -86,49 +88,59 @@ def URLjoin(*args): |
86 | 88 | return "/".join(map(lambda x: str(x).rstrip('/'), args)) |
87 | 89 |
|
88 | 90 | NetInfo.SciHub_URL = SciHub_URL |
89 | | - if NetInfo.SciHub_URL==None: |
| 91 | + if NetInfo.SciHub_URL is None: |
90 | 92 | setSciHubUrl() |
| 93 | + if SciDB_URL is not None: |
| 94 | + NetInfo.SciDB_URL = SciDB_URL |
| 95 | + |
| 96 | + print("\nUsing Sci-Hub mirror {}".format(NetInfo.SciHub_URL)) |
| 97 | + print("Using Sci-DB mirror {}".format(NetInfo.SciDB_URL)) |
| 98 | +    print("You can use --scihub-mirror and --scidb-mirror to specify your desired mirror URL\n")
91 | 99 |
|
92 | 100 | num_downloaded = 0 |
93 | 101 | paper_number = 1 |
94 | 102 | paper_files = [] |
95 | 103 | for p in papers: |
96 | | - if p.canBeDownloaded() and (num_limit==None or num_downloaded<num_limit): |
97 | | - print("Download {} of {} -> {}".format(paper_number, scholar_results, p.title)) |
| 104 | + if p.canBeDownloaded() and (num_limit is None or num_downloaded < num_limit): |
| 105 | + print("Download {} of {} -> {}".format(paper_number, len(papers), p.title)) |
98 | 106 | paper_number += 1 |
99 | 107 |
|
100 | 108 | pdf_dir = getSaveDir(dwnl_dir, p.getFileName()) |
101 | 109 |
|
102 | | - faild = 0 |
103 | | - while p.downloaded==False and faild!=4: |
| 110 | + failed = 0 |
| 111 | + url = "" |
| 112 | + while not p.downloaded and failed != 5: |
104 | 113 | try: |
105 | | - dwn_source = 1 #1 scihub 2 scholar |
106 | | - if faild==0 and p.DOI!=None: |
| 114 | + dwn_source = 1 # 1 scidb - 2 scihub - 3 scholar |
| 115 | + if failed == 0 and p.DOI is not None: |
| 116 | + url = URLjoin(NetInfo.SciDB_URL, p.DOI) |
| 117 | + if failed == 1 and p.DOI is not None: |
107 | 118 | url = URLjoin(NetInfo.SciHub_URL, p.DOI) |
108 | | - if faild==1 and p.scholar_link!=None: |
| 119 | + dwn_source = 2 |
| 120 | + if failed == 2 and p.scholar_link is not None: |
109 | 121 | url = URLjoin(NetInfo.SciHub_URL, p.scholar_link) |
110 | | - if faild==2 and p.scholar_link!=None and p.scholar_link[-3:]=="pdf": |
| 122 | + if failed == 3 and p.scholar_link is not None and p.scholar_link[-3:] == "pdf": |
111 | 123 | url = p.scholar_link |
112 | | - dwn_source = 2 |
113 | | - if faild==3 and p.pdf_link!=None: |
| 124 | + dwn_source = 3 |
| 125 | + if failed == 4 and p.pdf_link is not None: |
114 | 126 | url = p.pdf_link |
115 | | - dwn_source = 2 |
| 127 | + dwn_source = 3 |
116 | 128 |
|
117 | | - if url!="": |
| 129 | + if url != "": |
118 | 130 | r = requests.get(url, headers=NetInfo.HEADERS) |
119 | 131 | content_type = r.headers.get('content-type') |
120 | 132 |
|
121 | | - if dwn_source==1 and 'application/pdf' not in content_type: |
122 | | - time.sleep(random.randint(1,5)) |
| 133 | + if (dwn_source == 1 or dwn_source == 2) and 'application/pdf' not in content_type and "application/octet-stream" not in content_type: |
| 134 | + time.sleep(random.randint(1, 4)) |
123 | 135 |
|
124 | 136 | pdf_link = getSchiHubPDF(r.text) |
125 | | - if(pdf_link != None): |
| 137 | + if pdf_link is not None: |
126 | 138 | r = requests.get(pdf_link, headers=NetInfo.HEADERS) |
127 | 139 | content_type = r.headers.get('content-type') |
128 | 140 |
|
129 | | - if 'application/pdf' in content_type: |
130 | | - paper_files.append(saveFile(pdf_dir,r.content,p,dwn_source)) |
| 141 | + if 'application/pdf' in content_type or "application/octet-stream" in content_type: |
| 142 | + paper_files.append(saveFile(pdf_dir, r.content, p, dwn_source)) |
131 | 143 | except Exception: |
132 | 144 | pass |
133 | 145 |
|
134 | | - faild += 1 |
| 146 | + failed += 1 |
0 commit comments