Skip to content

Commit 77c445f

Browse files
author
Test User
committed
Merge remote-tracking branch 'remotes/origin/HEAD' into enhanced-downloader-pysmartdl
2 parents b442b72 + a65bf70 commit 77c445f

14 files changed

Lines changed: 470 additions & 225 deletions

PyPaperBot/Crossref.py

Lines changed: 40 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,12 @@ def getBibtex(DOI):
1212
try:
1313
url_bibtex = "http://api.crossref.org/works/" + DOI + "/transform/application/x-bibtex"
1414
x = requests.get(url_bibtex)
15+
if x.status_code == 404:
16+
return ""
1517
return str(x.text)
16-
except:
17-
return None
18+
except Exception as e:
19+
print(e)
20+
return ""
1821

1922

2023
def getPapersInfoFromDOIs(DOI, restrict):
@@ -23,60 +26,63 @@ def getPapersInfoFromDOIs(DOI, restrict):
2326

2427
try:
2528
paper = get_entity(DOI, EntityType.PUBLICATION, OutputType.JSON)
26-
if paper!=None and len(paper)>0:
29+
if paper is not None and len(paper) > 0:
2730
if "title" in paper:
2831
paper_found.title = paper["title"][0]
29-
if "short-container-title" in paper and len(paper["short-container-title"])>0:
32+
if "short-container-title" in paper and len(paper["short-container-title"]) > 0:
3033
paper_found.jurnal = paper["short-container-title"][0]
3134

32-
if restrict==None or restrict!=1:
35+
if restrict is None or restrict != 1:
3336
paper_found.setBibtex(getBibtex(paper_found.DOI))
3437
except:
35-
print("Paper not found "+DOI)
38+
print("Paper not found " + DOI)
3639

3740
return paper_found
3841

3942

40-
#Get paper information from Crossref and return a list of Paper
43+
# Get paper information from Crossref and return a list of Paper
4144
def getPapersInfo(papers, scholar_search_link, restrict, scholar_results):
4245
papers_return = []
4346
num = 1
4447
for paper in papers:
45-
while num <= scholar_results:
46-
title = paper['title']
47-
queries = {'query.bibliographic': title.lower(),'sort':'relevance',"select":"DOI,title,deposited,author,short-container-title"}
48+
# while num <= scholar_results:
49+
title = paper['title']
50+
queries = {'query.bibliographic': title.lower(), 'sort': 'relevance',
51+
"select": "DOI,title,deposited,author,short-container-title"}
4852

49-
print("Searching paper {} of {} on Crossref...".format(num,scholar_results))
50-
num += 1
53+
print("Searching paper {} of {} on Crossref...".format(num, len(papers)))
54+
num += 1
5155

52-
found_timestamp = 0
53-
paper_found = Paper(title,paper['link'],scholar_search_link, paper['cites'], paper['link_pdf'], paper['year'], paper['authors'])
54-
while True:
55-
try:
56-
for el in iterate_publications_as_json(max_results=30, queries=queries):
56+
found_timestamp = 0
57+
paper_found = Paper(title, paper['link'], scholar_search_link, paper['cites'], paper['link_pdf'], paper['year'],
58+
paper['authors'])
59+
while True:
60+
try:
61+
for el in iterate_publications_as_json(max_results=30, queries=queries):
5762

58-
el_date = 0
59-
if "deposited" in el and "timestamp" in el["deposited"]:
60-
el_date = int(el["deposited"]["timestamp"])
63+
el_date = 0
64+
if "deposited" in el and "timestamp" in el["deposited"]:
65+
el_date = int(el["deposited"]["timestamp"])
6166

62-
if (paper_found.DOI==None or el_date>found_timestamp) and "title" in el and similarStrings(title.lower() ,el["title"][0].lower())>0.75:
63-
found_timestamp = el_date
67+
if (paper_found.DOI is None or el_date > found_timestamp) and "title" in el and similarStrings(
68+
title.lower(), el["title"][0].lower()) > 0.75:
69+
found_timestamp = el_date
6470

65-
if "DOI" in el:
66-
paper_found.DOI = el["DOI"].strip().lower()
67-
if "short-container-title" in el and len(el["short-container-title"])>0:
68-
paper_found.jurnal = el["short-container-title"][0]
71+
if "DOI" in el:
72+
paper_found.DOI = el["DOI"].strip().lower()
73+
if "short-container-title" in el and len(el["short-container-title"]) > 0:
74+
paper_found.jurnal = el["short-container-title"][0]
6975

70-
if restrict==None or restrict!=1:
71-
paper_found.setBibtex(getBibtex(paper_found.DOI))
76+
if restrict is None or restrict != 1:
77+
paper_found.setBibtex(getBibtex(paper_found.DOI))
7278

73-
break
74-
except ConnectionError as e:
75-
print("Wait 10 seconds and try again...")
76-
time.sleep(10)
79+
break
80+
except ConnectionError as e:
81+
print("Wait 10 seconds and try again...")
82+
time.sleep(10)
7783

78-
papers_return.append(paper_found)
84+
papers_return.append(paper_found)
7985

80-
time.sleep(random.randint(1,10))
86+
time.sleep(random.randint(1, 10))
8187

8288
return papers_return

PyPaperBot/Downloader.py

Lines changed: 44 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
from .HTMLparsers import getSchiHubPDF, SciHubUrls
55
import random
66
from .NetInfo import NetInfo
7+
from .Utils import URLjoin
8+
79

810
# Import enhanced downloader for improved experience
911
try:
@@ -14,43 +16,44 @@
1416
print("Enhanced downloader not available. Install pySmartDL for better downloading experience.")
1517

1618
def setSciHubUrl():
19+
print("Searching for a sci-hub mirror")
1720
r = requests.get(NetInfo.SciHub_URLs_repo, headers=NetInfo.HEADERS)
1821
links = SciHubUrls(r.text)
19-
found = False
2022

2123
for l in links:
2224
try:
25+
print("Trying with {}...".format(l))
2326
r = requests.get(l, headers=NetInfo.HEADERS)
2427
if r.status_code == 200:
25-
found = True
2628
NetInfo.SciHub_URL = l
2729
break
2830
except:
2931
pass
30-
if found:
31-
print("\nUsing {} as Sci-Hub instance".format(NetInfo.SciHub_URL))
3232
else:
33-
print("\nNo working Sci-Hub instance found!\nIf in your country Sci-Hub is not available consider using a VPN or a proxy")
33+
print(
34+
"\nNo working Sci-Hub instance found!\nIf in your country Sci-Hub is not available consider using a VPN or a proxy\nYou can use a specific mirror with the --scihub-mirror argument")
3435
NetInfo.SciHub_URL = "https://sci-hub.st"
3536

3637

3738
def getSaveDir(folder, fname):
3839
dir_ = path.join(folder, fname)
3940
n = 1
4041
while path.exists(dir_):
41-
n += 1
42-
dir_ = path.join(folder, "("+str(n)+")"+fname)
42+
n += 1
43+
dir_ = path.join(folder, f"({n}){fname}")
4344

4445
return dir_
4546

46-
def saveFile(file_name,content, paper,dwn_source):
47+
48+
def saveFile(file_name, content, paper, dwn_source):
4749
f = open(file_name, 'wb')
4850
f.write(content)
4951
f.close()
5052

5153
paper.downloaded = True
5254
paper.downloadedFrom = dwn_source
5355

56+
<<<<<<< HEAD
5457
def downloadPapers(papers, dwnl_dir, num_limit, scholar_results, SciHub_URL=None, use_enhanced=True):
5558
"""
5659
Download papers with option to use enhanced downloader
@@ -84,51 +87,65 @@ def _downloadPapersOriginal(papers, dwnl_dir, num_limit, scholar_results, SciHub
8487
"""Original download function (renamed for backward compatibility)"""
8588
def URLjoin(*args):
8689
return "/".join(map(lambda x: str(x).rstrip('/'), args))
90+
=======
91+
92+
def downloadPapers(papers, dwnl_dir, num_limit, SciHub_URL=None, SciDB_URL=None):
93+
>>>>>>> remotes/origin/HEAD
8794

8895
NetInfo.SciHub_URL = SciHub_URL
89-
if NetInfo.SciHub_URL==None:
96+
if NetInfo.SciHub_URL is None:
9097
setSciHubUrl()
98+
if SciDB_URL is not None:
99+
NetInfo.SciDB_URL = SciDB_URL
100+
101+
print("\nUsing Sci-Hub mirror {}".format(NetInfo.SciHub_URL))
102+
print("Using Sci-DB mirror {}".format(NetInfo.SciDB_URL))
103+
print("You can use --scihub-mirror and --scidb-mirror to specify your desired mirror URL\n")
91104

92105
num_downloaded = 0
93106
paper_number = 1
94107
paper_files = []
95108
for p in papers:
96-
if p.canBeDownloaded() and (num_limit==None or num_downloaded<num_limit):
97-
print("Download {} of {} -> {}".format(paper_number, scholar_results, p.title))
109+
if p.canBeDownloaded() and (num_limit is None or num_downloaded < num_limit):
110+
print("Download {} of {} -> {}".format(paper_number, len(papers), p.title))
98111
paper_number += 1
99112

100113
pdf_dir = getSaveDir(dwnl_dir, p.getFileName())
101114

102-
faild = 0
103-
while p.downloaded==False and faild!=4:
115+
failed = 0
116+
url = ""
117+
while not p.downloaded and failed != 5:
104118
try:
105-
dwn_source = 1 #1 scihub 2 scholar
106-
if faild==0 and p.DOI!=None:
119+
dwn_source = 1 # 1 scidb - 2 scihub - 3 scholar
120+
if failed == 0 and p.DOI is not None:
121+
url = URLjoin(NetInfo.SciDB_URL, p.DOI)
122+
if failed == 1 and p.DOI is not None:
107123
url = URLjoin(NetInfo.SciHub_URL, p.DOI)
108-
if faild==1 and p.scholar_link!=None:
124+
dwn_source = 2
125+
if failed == 2 and p.scholar_link is not None:
109126
url = URLjoin(NetInfo.SciHub_URL, p.scholar_link)
110-
if faild==2 and p.scholar_link!=None and p.scholar_link[-3:]=="pdf":
127+
if failed == 3 and p.scholar_link is not None and p.scholar_link[-3:] == "pdf":
111128
url = p.scholar_link
112-
dwn_source = 2
113-
if faild==3 and p.pdf_link!=None:
129+
dwn_source = 3
130+
if failed == 4 and p.pdf_link is not None:
114131
url = p.pdf_link
115-
dwn_source = 2
132+
dwn_source = 3
116133

117-
if url!="":
134+
if url != "":
118135
r = requests.get(url, headers=NetInfo.HEADERS)
119136
content_type = r.headers.get('content-type')
120137

121-
if dwn_source==1 and 'application/pdf' not in content_type:
122-
time.sleep(random.randint(1,5))
138+
if (dwn_source == 1 or dwn_source == 2) and 'application/pdf' not in content_type and "application/octet-stream" not in content_type:
139+
time.sleep(random.randint(1, 4))
123140

124141
pdf_link = getSchiHubPDF(r.text)
125-
if(pdf_link != None):
142+
if pdf_link is not None:
126143
r = requests.get(pdf_link, headers=NetInfo.HEADERS)
127144
content_type = r.headers.get('content-type')
128145

129-
if 'application/pdf' in content_type:
130-
paper_files.append(saveFile(pdf_dir,r.content,p,dwn_source))
146+
if 'application/pdf' in content_type or "application/octet-stream" in content_type:
147+
paper_files.append(saveFile(pdf_dir, r.content, p, dwn_source))
131148
except Exception:
132149
pass
133150

134-
faild += 1
151+
failed += 1

PyPaperBot/HTMLparsers.py

Lines changed: 31 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,14 @@
55
@author: Vito
66
"""
77
from bs4 import BeautifulSoup
8+
import re
9+
810

911
def schoolarParser(html):
1012
result = []
1113
soup = BeautifulSoup(html, "html.parser")
1214
for element in soup.findAll("div", class_="gs_r gs_or gs_scl"):
13-
if isBook(element) == False:
15+
if not isBook(element):
1416
title = None
1517
link = None
1618
link_pdf = None
@@ -20,15 +22,15 @@ def schoolarParser(html):
2022
for h3 in element.findAll("h3", class_="gs_rt"):
2123
found = False
2224
for a in h3.findAll("a"):
23-
if found == False:
25+
if not found:
2426
title = a.text
2527
link = a.get("href")
2628
found = True
2729
for a in element.findAll("a"):
28-
if "Cited by" in a.text:
29-
cites = int(a.text[8:])
30-
if "[PDF]" in a.text:
31-
link_pdf = a.get("href")
30+
if "Cited by" in a.text:
31+
cites = int(a.text[8:])
32+
if "[PDF]" in a.text:
33+
link_pdf = a.get("href")
3234
for div in element.findAll("div", class_="gs_a"):
3335
try:
3436
authors, source_and_year, source = div.text.replace('\u00A0', ' ').split(" - ")
@@ -48,45 +50,52 @@ def schoolarParser(html):
4850
year = None
4951
else:
5052
year = str(year)
51-
if title!=None:
53+
if title is not None:
5254
result.append({
53-
'title' : title,
54-
'link' : link,
55-
'cites' : cites,
56-
'link_pdf' : link_pdf,
57-
'year' : year,
58-
'authors' : authors})
55+
'title': title,
56+
'link': link,
57+
'cites': cites,
58+
'link_pdf': link_pdf,
59+
'year': year,
60+
'authors': authors})
5961
return result
6062

6163

62-
6364
def isBook(tag):
6465
result = False
6566
for span in tag.findAll("span", class_="gs_ct2"):
66-
if span.text=="[B]":
67+
if span.text == "[B]":
6768
result = True
6869
return result
6970

7071

71-
7272
def getSchiHubPDF(html):
7373
result = None
7474
soup = BeautifulSoup(html, "html.parser")
7575

76-
iframe = soup.find(id='pdf')
77-
plugin = soup.find(id='plugin')
76+
iframe = soup.find(id='pdf') #scihub logic
77+
plugin = soup.find(id='plugin') #scihub logic
78+
download_scidb = soup.find("a", text=lambda text: text and "Download" in text, href=re.compile(r"\.pdf$")) #scidb logic
79+
embed_scihub = soup.find("embed") #scihub logic
7880

79-
if iframe!=None:
81+
if iframe is not None:
8082
result = iframe.get("src")
8183

82-
if plugin!=None and result==None:
84+
if plugin is not None and result is None:
8385
result = plugin.get("src")
8486

85-
if result!=None and result[0]!="h":
86-
result = "https:"+result
87+
if result is not None and result[0] != "h":
88+
result = "https:" + result
89+
90+
if download_scidb is not None and result is None:
91+
result = download_scidb.get("href")
92+
93+
if embed_scihub is not None and result is None:
94+
result = embed_scihub.get("original-url")
8795

8896
return result
8997

98+
9099
def SciHubUrls(html):
91100
result = []
92101
soup = BeautifulSoup(html, "html.parser")
@@ -98,4 +107,3 @@ def SciHubUrls(html):
98107
result.append(link)
99108

100109
return result
101-

PyPaperBot/NetInfo.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
class NetInfo:
22
SciHub_URL = None
3+
SciDB_URL = "https://annas-archive.se/scidb/"
34
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'}
45
SciHub_URLs_repo = "https://sci-hub.41610.org/"

0 commit comments

Comments
 (0)