This repository was archived by the owner on Mar 1, 2026. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpapers_that_are_cited_by_pmid.py
More file actions
56 lines (42 loc) · 1.56 KB
/
papers_that_are_cited_by_pmid.py
File metadata and controls
56 lines (42 loc) · 1.56 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import re
import sys
import codecs
import os.path
import argparse
from urllib.request import urlopen
from tqdm import tqdm
from sciknowmap.utils import add_boolean_argument
from bs4 import BeautifulSoup,Tag,Comment,NavigableString
import re
def nmxl_files(members):
    """Yield the members whose file name has the ``.nxml`` extension.

    Args:
        members: Iterable of objects exposing a ``name`` attribute
            (presumably ``tarfile.TarInfo`` entries -- confirm against callers).

    Yields:
        Every member, in input order, for which
        ``os.path.splitext(member.name)[1]`` is exactly ``".nxml"``
        (the comparison is case-sensitive).

    Note:
        An ``IOError`` raised while iterating ``members`` is swallowed and
        simply ends the generator; members yielded before the error are kept.
    """
    try:
        for tarinfo in members:
            if os.path.splitext(tarinfo.name)[1] == ".nxml":
                yield tarinfo
    except IOError:
        # Deliberate best-effort: a read error while walking the members stops
        # the iteration quietly instead of propagating to the caller.
        return
if __name__ == '__main__':
    # Script entry point. For every PMID listed in the input file, query the
    # NCBI E-utilities ELink endpoint (linkname=pubmed_pubmed_refs) and write
    # one "<linked id>\t<input pmid>" line to the output file for each <Id>
    # element in the response that differs from the queried PMID.
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--inFile', required=True,
                        help='Input file with one PMID per line; an optional '
                             'second whitespace-separated column holds a flag')
    parser.add_argument('-o', '--outFile', required=True, help='Output file')
    add_boolean_argument(parser, 'reviews_only')
    args = parser.parse_args()

    elink_url = ('https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?'
                 'dbfrom=pubmed&linkname=pubmed_pubmed_refs&id=')

    # Read everything up front so tqdm can show a total.
    with codecs.open(args.inFile, 'r', 'utf-8') as in_file:
        id_list = in_file.readlines()

    # Context manager guarantees the output is flushed and closed even when a
    # request or the XML parsing raises part-way through the list.
    with codecs.open(args.outFile, 'w', 'utf-8') as out_file:
        for line in tqdm(id_list):
            fields = line.split()
            if not fields:
                # Blank line: there is no PMID to query.
                continue
            pmid = fields[0]
            # With reviews_only set, rows whose second column is the literal
            # string 'False' are skipped.
            if args.reviews_only and len(fields) > 1 and fields[1] == 'False':
                continue

            with urlopen(elink_url + pmid) as pmid_response:
                pmid_data = pmid_response.read().decode('utf-8')

            soup = BeautifulSoup(pmid_data, "lxml-xml")
            for pmid_tag in soup.find_all('Id'):
                # The response also echoes the queried PMID as an <Id>;
                # do not emit it as a link to itself.
                if pmid_tag.text != pmid:
                    out_file.write(pmid_tag.text + '\t' + pmid + '\n')