|
4 | 4 | from .HTMLparsers import getSchiHubPDF, SciHubUrls |
5 | 5 | import random |
6 | 6 | from .NetInfo import NetInfo |
| 7 | +from .Utils import URLjoin |
| 8 | + |
7 | 9 |
|
8 | 10 | # Import enhanced downloader for improved experience |
9 | 11 | try: |
|
14 | 16 | print("Enhanced downloader not available. Install pySmartDL for better downloading experience.") |
15 | 17 |
|
def setSciHubUrl():
    """Locate a working Sci-Hub mirror and store it in NetInfo.SciHub_URL.

    Fetches the mirror list from NetInfo.SciHub_URLs_repo and probes each
    candidate; the first one answering HTTP 200 is kept. If none respond,
    falls back to https://sci-hub.st.
    """
    print("Searching for a sci-hub mirror")
    r = requests.get(NetInfo.SciHub_URLs_repo, headers=NetInfo.HEADERS)
    links = SciHubUrls(r.text)

    for link in links:
        try:
            print("Trying with {}...".format(link))
            r = requests.get(link, headers=NetInfo.HEADERS)
            if r.status_code == 200:
                NetInfo.SciHub_URL = link
                break
        except requests.RequestException:
            # Mirror unreachable or refused the connection; try the next one.
            pass
    else:
        # for/else: the loop finished without break, i.e. no mirror answered 200.
        print(
            "\nNo working Sci-Hub instance found!\nIf in your country Sci-Hub is not available consider using a VPN or a proxy\nYou can use a specific mirror with the --scihub-mirror argument")
        NetInfo.SciHub_URL = "https://sci-hub.st"
35 | 36 |
|
36 | 37 |
|
def getSaveDir(folder, fname):
    """Return a path inside *folder* for *fname* that does not exist yet.

    On a name collision the file name is prefixed with "(n)" for
    n = 2, 3, ... until an unused path is found.
    """
    candidate = path.join(folder, fname)
    counter = 1
    while path.exists(candidate):
        counter += 1
        candidate = path.join(folder, "({}){}".format(counter, fname))

    return candidate
45 | 46 |
|
def saveFile(file_name, content, paper, dwn_source):
    """Write *content* to *file_name* and mark *paper* as downloaded.

    Args:
        file_name: Destination path for the PDF bytes.
        content: Raw bytes to write.
        paper: Paper object; its downloaded/downloadedFrom flags are set.
        dwn_source: Numeric id of the source the file came from.

    Returns:
        The path the file was written to, so callers collecting the
        results (e.g. into paper_files) get something useful, not None.
    """
    # Context manager guarantees the handle is closed even if write() raises.
    with open(file_name, 'wb') as out:
        out.write(content)

    paper.downloaded = True
    paper.downloadedFrom = dwn_source
    return file_name
53 | 55 |
|
def downloadPapers(papers, dwnl_dir, num_limit, SciHub_URL=None, SciDB_URL=None):
    """Download every downloadable paper in *papers* into *dwnl_dir*.

    For each paper the sources are tried in order until one yields a PDF:
    1) Sci-DB by DOI, 2) Sci-Hub by DOI, 3) Sci-Hub by scholar link,
    4) the scholar link itself when it ends in "pdf", 5) the direct
    pdf_link scraped from Scholar.

    Args:
        papers: Iterable of Paper objects.
        dwnl_dir: Directory the PDFs are saved into.
        num_limit: Maximum number of papers to download (None = no limit).
        SciHub_URL: Optional Sci-Hub mirror URL; autodetected when None.
        SciDB_URL: Optional Sci-DB mirror URL; NetInfo default kept when None.
    """
    NetInfo.SciHub_URL = SciHub_URL
    if NetInfo.SciHub_URL is None:
        setSciHubUrl()
    if SciDB_URL is not None:
        NetInfo.SciDB_URL = SciDB_URL

    print("\nUsing Sci-Hub mirror {}".format(NetInfo.SciHub_URL))
    print("Using Sci-DB mirror {}".format(NetInfo.SciDB_URL))
    print("You can use --scihub-mirror and --scidb-mirror to specify your desired mirror URL\n")

    num_downloaded = 0
    paper_number = 1
    paper_files = []
    for p in papers:
        if p.canBeDownloaded() and (num_limit is None or num_downloaded < num_limit):
            print("Download {} of {} -> {}".format(paper_number, len(papers), p.title))
            paper_number += 1

            pdf_dir = getSaveDir(dwnl_dir, p.getFileName())

            failed = 0
            url = ""
            # `failed` selects the next fallback source on each retry.
            while not p.downloaded and failed != 5:
                try:
                    dwn_source = 1  # 1 scidb - 2 scihub - 3 scholar
                    if failed == 0 and p.DOI is not None:
                        url = URLjoin(NetInfo.SciDB_URL, p.DOI)
                    if failed == 1 and p.DOI is not None:
                        url = URLjoin(NetInfo.SciHub_URL, p.DOI)
                        dwn_source = 2
                    if failed == 2 and p.scholar_link is not None:
                        url = URLjoin(NetInfo.SciHub_URL, p.scholar_link)
                        # This URL points at Sci-Hub, so label it as such.
                        dwn_source = 2
                    if failed == 3 and p.scholar_link is not None and p.scholar_link[-3:] == "pdf":
                        url = p.scholar_link
                        dwn_source = 3
                    if failed == 4 and p.pdf_link is not None:
                        url = p.pdf_link
                        dwn_source = 3

                    if url != "":
                        r = requests.get(url, headers=NetInfo.HEADERS)
                        content_type = r.headers.get('content-type')

                        # Sci-DB/Sci-Hub may answer with an HTML page embedding the
                        # real PDF link; scrape it and fetch again after a pause.
                        if (dwn_source == 1 or dwn_source == 2) and 'application/pdf' not in content_type and "application/octet-stream" not in content_type:
                            time.sleep(random.randint(1, 4))

                            pdf_link = getSchiHubPDF(r.text)
                            if pdf_link is not None:
                                r = requests.get(pdf_link, headers=NetInfo.HEADERS)
                                content_type = r.headers.get('content-type')

                        if 'application/pdf' in content_type or "application/octet-stream" in content_type:
                            paper_files.append(saveFile(pdf_dir, r.content, p, dwn_source))
                except Exception:
                    # Best-effort: any failure just advances to the next source.
                    pass

                failed += 1

            # Count successes so num_limit actually caps the downloads.
            if p.downloaded:
                num_downloaded += 1
0 commit comments