This repository was archived by the owner on Mar 1, 2026. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathget_abstracts_from_pubmed.py
More file actions
81 lines (62 loc) · 2.39 KB
/
get_abstracts_from_pubmed.py
File metadata and controls
81 lines (62 loc) · 2.39 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
from elasticsearch import Elasticsearch, helpers
from robot_biocurator import pmc, utils, sd2
import re
import sys
import codecs
import os.path
import argparse
from urllib.request import urlopen
from tqdm import tqdm
from bs4 import BeautifulSoup,Tag,Comment,NavigableString
def nmxl_files(members):
    """Yield the members whose file name carries the ``.nxml`` extension.

    Args:
        members: Iterable of objects exposing a ``name`` attribute. The loop
            variable is called ``tarinfo``, so these are presumably
            ``tarfile.TarInfo`` entries -- confirm against the caller.

    Yields:
        Each member, in input order, for which
        ``os.path.splitext(member.name)[1]`` equals ``".nxml"`` exactly
        (the comparison is case-sensitive).

    Note:
        An ``IOError`` raised while iterating ``members`` ends the generator
        quietly instead of propagating; members already yielded are kept.
    """
    try:
        for tarinfo in members:
            if os.path.splitext(tarinfo.name)[1] == ".nxml":
                yield tarinfo
    except IOError:
        # Deliberate best-effort: stop producing members when the underlying
        # iterable can no longer be read, rather than crashing the consumer.
        return
# Base URL of the NCBI EFetch endpoint; a comma-separated list of PubMed IDs
# is appended directly after the trailing ``id=``.
EFETCH_STEM = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml&id='


def extract_pmids(lines):
    """Return the first whitespace-delimited token of every non-blank line.

    Args:
        lines: Iterable of text lines (trailing newlines allowed).

    Returns:
        List of tokens in input order. Blank or whitespace-only lines are
        skipped so they can never contribute an empty ID to a request.
    """
    token_lists = (line.split() for line in lines)
    return [tokens[0] for tokens in token_lists if tokens]


def build_efetch_urls(pmids, page_size=100, stem=EFETCH_STEM):
    """Build one EFetch URL per batch of at most ``page_size`` IDs.

    Args:
        pmids: Iterable of ID strings.
        page_size: Maximum number of IDs placed in a single URL.
        stem: URL prefix the comma-joined IDs are appended to.

    Returns:
        List of URLs in input order; empty when ``pmids`` is empty, so no
        request with an empty ``id=`` parameter is ever produced.

    Raises:
        ValueError: If ``page_size`` is smaller than 1.
    """
    if page_size < 1:
        raise ValueError("page_size must be at least 1")
    pmids = list(pmids)
    return [
        stem + ','.join(pmids[start:start + page_size])
        for start in range(0, len(pmids), page_size)
    ]


def fetch_records(urls):
    """Download every URL and return the response bodies decoded as UTF-8.

    Args:
        urls: Iterable of URLs to request, one after another.

    Returns:
        List of response texts, in the same order as ``urls``.
    """
    records = []
    for url in tqdm(urls):
        # The context manager closes the HTTP response even if read() fails.
        with urlopen(url) as response:
            records.append(response.read().decode('utf-8'))
    return records


def write_abstracts(records, out):
    """Write one tab-separated line per usable ``MedlineCitation`` element.

    Each line holds: PMID, ``True``/``False`` review flag, article title and
    abstract text. Citations missing a PMID, title or abstract are skipped.

    Args:
        records: Iterable of XML documents (strings) to parse.
        out: Writable text file object receiving the lines.
    """
    for record in tqdm(records):
        soup = BeautifulSoup(record, "lxml-xml")
        for citation_tag in soup.find_all('MedlineCitation'):
            pmid_tag = citation_tag.find('PMID')
            title_tag = citation_tag.find('ArticleTitle')
            # find() returns only the first AbstractText element.
            abstract_tag = citation_tag.find('AbstractText')
            if pmid_tag is None or title_tag is None or abstract_tag is None:
                continue
            is_review = any(
                pub_type.text == "Review"
                for pub_type in citation_tag.find_all('PublicationType')
            )
            fields = [pmid_tag.text, str(is_review), title_tag.text, abstract_tag.text]
            out.write('\t'.join(fields) + '\n')


def main():
    """Read PubMed IDs from a file, fetch their records and save a TSV file."""
    parser = argparse.ArgumentParser()
    # Both options are mandatory: the script cannot do anything without them,
    # and argparse reports a clear usage error instead of a later TypeError.
    parser.add_argument('-i', '--inFile', required=True,
                        help='File listing PubMed IDs, one per line '
                             '(the first whitespace-delimited token is used)')
    parser.add_argument('-o', '--outFile', required=True, help='Output file')
    args = parser.parse_args()

    # Start from a clean slate: drop any output left over from a previous run.
    if os.path.isfile(args.outFile):
        os.remove(args.outFile)

    with codecs.open(args.inFile, 'r', 'utf-8') as id_file:
        pmids = extract_pmids(id_file.readlines())

    records = fetch_records(build_efetch_urls(pmids, page_size=100))

    print("\n\nSaving records to output: " + args.outFile)
    # The context manager guarantees the output is flushed and closed even
    # when parsing one of the records raises.
    with codecs.open(args.outFile, 'w', 'utf-8') as out:
        write_abstracts(records, out)


if __name__ == '__main__':
    main()