This repository was archived by the owner on Mar 1, 2026. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathget_abstracts_from_pmid_list.py
More file actions
90 lines (70 loc) · 2.7 KB
/
get_abstracts_from_pmid_list.py
File metadata and controls
90 lines (70 loc) · 2.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import re
import sys
import codecs
import os.path
import argparse
import time
from urllib.request import urlopen
from urllib.error import URLError
from tqdm import tqdm
from bs4 import BeautifulSoup,Tag,Comment,NavigableString
import nltk
nltk.download('punkt')
from nltk import word_tokenize, sent_tokenize
def eprint(*args, **kwargs):
    """Write a diagnostic message to standard error.

    Accepts the same positional/keyword arguments as print(); the output
    stream is forced to sys.stderr.
    """
    stream = sys.stderr
    print(*args, file=stream, **kwargs)
def nmxl_files(members):
    """Yield only the ``.nxml`` entries from an iterable of tar members.

    Parameters
    ----------
    members : iterable
        Tar archive members (anything exposing a ``name`` attribute,
        e.g. tarfile.TarInfo objects).

    Yields
    ------
    The members whose filename extension is exactly ``.nxml``.
    """
    try:
        for tarinfo in members:
            if os.path.splitext(tarinfo.name)[1] == ".nxml":
                yield tarinfo
    except IOError:
        # Best-effort: a corrupt/truncated archive silently ends the
        # iteration instead of aborting the caller's loop.
        pass
def write_text_from_medline_record_to_disk(record, f):
    """Parse a PubMed efetch XML response and append one TSV row per citation.

    Parameters
    ----------
    record : str
        XML text returned by the NCBI efetch endpoint (MedlineCitation set).
    f : file object
        Open, writable text stream. One tab-separated line
        (pmid, type, title, abstract, MeSH) is written per citation that
        has all of PMID, ArticleTitle and AbstractText.
    """
    def _clean(text):
        # Collapse all whitespace runs (including tabs/newlines) to single
        # spaces so free-text fields cannot corrupt the TSV layout.
        return re.sub(r'\s+', ' ', text).strip()

    soup2 = BeautifulSoup(record, "lxml-xml")
    for citation_tag in soup2.find_all('MedlineCitation'):
        pmid_tag = citation_tag.find('PMID')
        title_tag = citation_tag.find('ArticleTitle')
        abstract_tag = citation_tag.find('AbstractText')
        # Skip incomplete records: all three core fields are required.
        if pmid_tag is None or title_tag is None or abstract_tag is None:
            continue
        mesh_data = ",".join(_clean(x.text) for x in citation_tag.find_all('MeshHeading'))
        # 'R' marks review articles, 'D' everything else.
        is_review = "R" if any(x.text == "Review"
                               for x in citation_tag.find_all('PublicationType')) else "D"
        f.write('\t'.join([pmid_tag.text.strip(), is_review,
                           _clean(title_tag.text), _clean(abstract_tag.text),
                           mesh_data]) + '\n')
if __name__ == '__main__':
    # Batch-download PubMed titles/abstracts for a list of PMIDs via the
    # NCBI efetch endpoint, writing one TSV row per article.
    efetch_stem = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml&id='
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--inFile', help='Directory of robot-scraped files')
    parser.add_argument('-o', '--outFile', help='Output file')
    args = parser.parse_args()
    eprint(args.inFile)

    if os.path.isfile(args.outFile):
        os.remove(args.outFile)

    with codecs.open(args.inFile, 'r', 'utf-8') as in_f:
        id_list = in_f.readlines()

    # NCBI suggests batching ids; 200 per request keeps URLs well under limits.
    page_size = 200

    with codecs.open(args.outFile, 'w', 'utf-8') as out_f:
        out_f.write('pmid\ttype\ttitle\tabstract\tMeSH\n')

        def flush(pmids):
            """Fetch one batch of PMIDs and append their records to out_f."""
            if not pmids:
                # Guard: never hit the endpoint with an empty id= parameter.
                return
            url = efetch_stem + ','.join(pmids)
            try:
                efetch_response = urlopen(url)
                write_text_from_medline_record_to_disk(
                    efetch_response.read().decode('utf-8'), out_f)
            except URLError as e:
                # Best-effort: back off, report, and skip this batch.
                time.sleep(10)
                print("URLError({0}): {1}".format(e.errno, e.strerror))

        batch = []
        for line in tqdm(id_list):
            pmid = re.split(r'\s+', line)[0].strip()
            if not pmid:
                continue  # ignore blank lines
            batch.append(pmid)
            if len(batch) >= page_size:
                flush(batch)
                batch = []
        # Final partial batch (no-op when id count is a multiple of page_size).
        flush(batch)