# scripts/extract_ip.py -- from the greencommons/scripts repository (master branch)

import re, uuid, os.path
from lxml import etree
from dateutil.parser import parse
import requests, pytz

from pdb import set_trace

import sys

# Python 2 only: make UTF-8 the interpreter-wide default encoding used for
# implicit str <-> unicode conversions. reload(sys) comes first so that
# sys.setdefaultencoding is available to call (CPython 2's site module
# removes it from sys at startup).
# NOTE(review): this changes global interpreter state for every module in the
# process; the implicit conversions later in this script rely on it.
reload(sys)
sys.setdefaultencoding('utf8')

# Path (relative to the current working directory) of the ONIX XML feed.
# The script opens this feed and, for each Product in it, prints out the
# title and related metadata.

onix_file = './gccorpus/ip/BV_ONIX_feed_island_nodatelimit_ISBNGroup_364.xml'


# Regular-expression source (a unicode pattern string, not a compiled
# pattern) used with re.sub to replace characters in extracted text.
# Its alternatives match, in order:
#   1. control characters U+0000-U+0008, U+000B-U+000C, U+000E-U+001F
#      (i.e. everything below U+0020 except tab, LF and CR) and the
#      code points U+FFFE-U+FFFF;
#   2. a high surrogate (U+D800-U+DBFF) followed by anything that is not a
#      low surrogate (U+DC00-U+DFFF);
#   3. a low surrogate preceded by anything that is not a high surrogate;
#   4. a high surrogate at the end of the string;
#   5. a low surrogate at the start of the string.
# Alternatives 2 and 3 consume the neighbouring character as well, so a
# substitution replaces both characters of such a match.
RE_XML_ILLEGAL = u'([\u0000-\u0008\u000b-\u000c\u000e-\u001f\ufffe-\uffff])' + \
                 u'|' + \
                 u'([%s-%s][^%s-%s])|([^%s-%s][%s-%s])|([%s-%s]$)|(^[%s-%s])' % \
                 (unichr(0xd800),unichr(0xdbff),unichr(0xdc00),unichr(0xdfff),
                  unichr(0xd800),unichr(0xdbff),unichr(0xdc00),unichr(0xdfff),
                  unichr(0xd800),unichr(0xdbff),unichr(0xdc00),unichr(0xdfff))

# Products before this index in the feed are skipped.
# NOTE(review): presumably a resume point left over from an interrupted run;
# set to 0 to submit the whole feed -- confirm before re-running.
FIRST_PRODUCT_INDEX = 477

# Only this many bytes of extracted text are read per product.
# AWS CloudSearch seems to have a size limit of 1048576 bytes.
MAX_CONTENT_BYTES = 1000000

# Endpoint that receives the document batches (placeholder value).
url = 'our cloudsearch batch endpoint'


def _first_text(node, path):
    """Return the text of the first element matching *path* under *node*.

    Returns None when nothing matches or the first match has no text, so
    callers can test the result instead of indexing into an empty list.
    """
    matches = node.xpath(path)
    if matches and matches[0].text:
        return matches[0].text
    return None


# Parse the feed and release the file handle before the long-running loop.
with open(onix_file, 'rb') as fp:
    root = etree.fromstring(fp.read())

# Publication dates are interpreted as US/Eastern local time (loop-invariant).
eastern = pytz.timezone("US/Eastern")

for product in root.xpath('/ONIXMessage/Product')[FIRST_PRODUCT_INDEX:]:

    # title and id: without them the record can neither be labelled nor
    # matched to its extracted text, so skip it instead of aborting the run
    title = _first_text(product, 'Title/TitleText')
    product_id = _first_text(product, 'ProductIdentifier/IDValue')
    if not title or not product_id:
        print("skipping product with missing title or identifier")
        continue

    titles = [title]
    print(title.encode('utf-8'))

    subtitle = _first_text(product, 'Title/Subtitle')
    if subtitle:
        titles.append(subtitle)
        print(subtitle.encode('utf-8'))

    # creators and contributors: decided by the ONIX ContributorRole code
    # ("A.." codes are authorship roles), not by the person's name
    creators = []
    contributors = []
    for contributor in product.xpath('Contributor'):
        name = _first_text(contributor, 'PersonName')
        if not name:
            continue
        is_creator = any(
            role.text and role.text[:1] in ('A', 'a')
            for role in contributor.xpath('ContributorRole')
        )
        if is_creator:
            creators.append(name)
        else:
            contributors.append(name)
        print(name.encode('utf-8'))

    # publisher
    publisher = _first_text(product, 'Publisher/PublisherName')
    if publisher:
        print(publisher.encode('utf-8'))

    # pub date, converted from US/Eastern local time to a UTC timestamp
    pub_dates = []
    pub_date = _first_text(product, 'PublicationDate')
    if pub_date:
        try:
            local_dt = eastern.localize(parse(pub_date), is_dst=None)
            utc_dt = local_dt.astimezone(pytz.utc)
            pub_dates.append(utc_dt.strftime("%Y-%m-%dT%H:%M:%SZ"))
        except (ValueError, OverflowError) as err:
            # an unusable date is left out rather than stopping the batch
            print("ignoring PublicationDate %r: %s" % (pub_date, err))

    # lang
    langs = []
    for lang in product.xpath('Language/LanguageCode'):
        if lang.text:
            langs.append(lang.text)
            print(lang.text)

    # content: extracted text stored next to the feed, keyed by product id
    content = u''
    content_file = 'gccorpus/ip-extracted/%s.txt' % product_id
    if os.path.exists(content_file):
        with open(content_file, 'rb') as epub:
            raw = epub.read(MAX_CONTENT_BYTES)

        print("")

        # Decode before filtering so the pattern sees characters, not bytes.
        # 'ignore' drops undecodable bytes, including a multi-byte sequence
        # cut in half by the byte limit above.
        content = re.sub(RE_XML_ILLEGAL, u"?", raw.decode('utf-8', 'ignore'))

    # NOTE(review): the document id is a fresh uuid1 on every run, so
    # re-running the script adds new documents rather than replacing old ones.
    doc = {
        'type': 'add',
        'id': str(uuid.uuid1()),
        'fields': {
            'title': titles,
            'identifier': [product_id],
            'creator': creators,
            'contributor': contributors,
            'publisher': [publisher] if publisher else [],
            'date': pub_dates,
            'language': langs,
            'content': [content],
        },
    }

    # One request per product; the batch is rebuilt every iteration so a
    # product without extracted text never re-sends an earlier document.
    r = requests.post(url, json=[doc])
    print(titles[0].encode('utf-8'))
    print(r.status_code)
    print(r.text)
    print("")