# sciknowmap/papers_that_are_cited_by_pmid.py (SciKnowEngine/sciknowmap, master branch)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import re
import sys
import codecs
import os.path
import argparse
from urllib.request import urlopen
from tqdm import tqdm
from sciknowmap.utils import add_boolean_argument

from bs4 import BeautifulSoup,Tag,Comment,NavigableString

import re

def nmxl_files(members):
    """Yield the entries of *members* whose name has the ``.nxml`` extension.

    Args:
        members: Iterable of objects exposing a ``name`` attribute (the
            loop variable is called ``tarinfo`` in the original, so these
            are presumably ``tarfile.TarInfo`` entries).

    Yields:
        Each entry, in input order, for which
        ``os.path.splitext(entry.name)[1]`` is exactly ``".nxml"``
        (the comparison is case-sensitive).

    An ``IOError`` raised while iterating *members* is swallowed and simply
    ends the generator early.
    """
    try:
        for entry in members:
            _, extension = os.path.splitext(entry.name)
            if extension != ".nxml":
                continue
            yield entry
    except IOError:
        # Best effort: stop producing entries instead of propagating the error.
        return

def parse_id_line(line):
    """Split one line of the input file into ``(pmid, flag)``.

    The first whitespace-separated field is the PMID; the optional second
    field is the flag that the ``reviews_only`` filter compares against the
    string ``'False'``.

    Args:
        line: One raw line read from the input file.

    Returns:
        A ``(pmid, flag)`` tuple. ``flag`` is ``None`` when the line has only
        one field, and both values are ``None`` for a blank line.
    """
    # str.split() with no argument ignores leading/trailing whitespace, so an
    # indented or blank line can no longer produce an empty PMID.
    fields = line.split()
    if not fields:
        return None, None
    return fields[0], (fields[1] if len(fields) > 1 else None)


def main():
    """Query NCBI ELink for every PMID in the input file and write id pairs.

    For each PMID the ``pubmed_pubmed_refs`` links are requested, and every
    ``Id`` element in the response that differs from the query PMID is
    written to the output file as ``<linked id>\\t<query pmid>\\n``.

    Raises:
        urllib.error.URLError: If a request to the ELink service fails.
        OSError: If the input or output file cannot be opened.
    """
    parser = argparse.ArgumentParser()
    # NOTE(review): the help text says "Directory", but the value is opened
    # below as a single text file with one PMID per line.
    parser.add_argument('-i', '--inFile', required=True,
                        help='Directory of robot-scraped files')
    parser.add_argument('-o', '--outFile', required=True, help='Output file')
    add_boolean_argument(parser, 'reviews_only')

    args = parser.parse_args()

    # Read everything up front so tqdm knows the total for its progress bar.
    with codecs.open(args.inFile, 'r', 'utf-8') as in_handle:
        id_list = in_handle.readlines()

    base_url = ('https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?'
                'dbfrom=pubmed&linkname=pubmed_pubmed_refs&id=')

    # The context manager guarantees the output is flushed and closed even
    # when a request or parse step raises part-way through the run.
    with codecs.open(args.outFile, 'w', 'utf-8') as out_handle:
        for line in tqdm(id_list):
            pmid, flag = parse_id_line(line)
            if pmid is None:
                # Blank line: nothing to look up.
                continue
            if flag == 'False' and args.reviews_only:
                continue

            # NOTE(review): NCBI is understood to rate-limit E-utilities
            # requests; no throttling is applied here -- confirm this is
            # acceptable for the input sizes used.
            with urlopen(base_url + pmid) as pmid_response:
                pmid_data = pmid_response.read().decode('utf-8')
            soup2 = BeautifulSoup(pmid_data, "lxml-xml")

            for pmid_tag in soup2.find_all('Id'):
                # Skip Id elements that just repeat the PMID we asked about.
                if pmid_tag.text != pmid:
                    out_handle.write(pmid_tag.text + '\t' + pmid + '\n')


if __name__ == '__main__':
    main()