-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathextract_ted.py
More file actions
113 lines (79 loc) · 2.69 KB
/
extract_ted.py
File metadata and controls
113 lines (79 loc) · 2.69 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import re, uuid, os.path
from datetime import datetime
import requests, pytz, json
from pdb import set_trace
import sys
from HTMLParser import HTMLParser
# thanks, http://stackoverflow.com/a/925630
class MLStripper(HTMLParser):
    """HTML parser that keeps only the text found between tags.

    Feed markup in with ``feed()`` and read the accumulated text back with
    ``get_data()``.
    """

    def __init__(self):
        # Run the base-class initializer rather than calling reset() directly.
        # On Python 2 the base initializer only calls reset(), so behavior is
        # unchanged there; on Python 3 it also sets parser state that feed()
        # relies on. The explicit HTMLParser.__init__ form is used instead of
        # super() because it works with both old- and new-style base classes.
        HTMLParser.__init__(self)
        # Text chunks, in the order the parser reported them.
        self.fed = []

    def handle_data(self, d):
        """Parser callback: remember one chunk of text content."""
        self.fed.append(d)

    def get_data(self):
        """Return all text collected so far, joined into one string."""
        return ''.join(self.fed)
def strip_tags(html):
    """Return the text content of *html*, with markup removed by MLStripper."""
    stripper = MLStripper()
    stripper.feed(html)
    text = stripper.get_data()
    return text
# Python 2 only: make UTF-8 the interpreter-wide default string encoding.
# reload(sys) comes first because sys.setdefaultencoding is removed from the
# sys module during interpreter start-up and only reappears after a reload.
# NOTE(review): this is a process-global setting; presumably it is here so the
# non-ASCII talk text handled below does not raise UnicodeDecodeError — confirm.
reload(sys)
sys.setdefaultencoding('utf8')
# Path of the JSON input file that is opened and parsed below.
# NOTE(review): the earlier comments at this spot referred to "onix"; they look
# like leftovers from another script — the only input visible here is this
# TED JSON file. Confirm and drop this note.
ted_file = './greenTED.json'
# Regex source text matching characters that are illegal in XML:
#   * control characters U+0000-U+001F other than tab, LF and CR,
#   * U+FFFE and U+FFFF,
#   * unpaired UTF-16 surrogates (a high surrogate not followed by a low one,
#     a low surrogate not preceded by a high one, or one at the string edge).
# The % formatting applies only to the last string literal (it binds tighter
# than +), filling its twelve %s slots with the surrogate range bounds.
# NOTE(review): not referenced anywhere in the visible part of this file.
RE_XML_ILLEGAL = u'([\u0000-\u0008\u000b-\u000c\u000e-\u001f\ufffe-\uffff])' + \
u'|' + \
u'([%s-%s][^%s-%s])|([^%s-%s][%s-%s])|([%s-%s]$)|(^[%s-%s])' % \
(unichr(0xd800),unichr(0xdbff),unichr(0xdc00),unichr(0xdfff),
unichr(0xd800),unichr(0xdbff),unichr(0xdc00),unichr(0xdfff),
unichr(0xd800),unichr(0xdbff),unichr(0xdc00),unichr(0xdfff))
# Read the talks from disk.
# NOTE(review): assumes the file holds a JSON array of talk objects carrying
# the keys read below — confirm against the actual data file.
with open(ted_file, 'r') as fp:
    talks = json.load(fp)

# Talk dates carry no timezone of their own; they are interpreted as
# US/Eastern local time. Built once here instead of once per talk.
local = pytz.timezone("US/Eastern")

# One upload document per talk.
docs = []
for talk in talks:
    # title and subtitle
    titles = [talk['title']]
    print(titles)

    # id (named talk_id so the builtin id() is not shadowed)
    talk_id = talk['talkid']

    # creators and contributors
    creators = [talk['author']]

    # date: parsed from "<abbreviated month> <year>", so strptime fills in
    # day 1 at 00:00; that instant is taken as Eastern time, converted to UTC
    # and formatted as a "Z"-suffixed timestamp.
    date = datetime.strptime(talk['ddate'].strip(), "%b %Y")
    local_dt = local.localize(date, is_dst=None)
    utc_dt = local_dt.astimezone(pytz.utc)
    dates = [utc_dt.strftime("%Y-%m-%dT%H:%M:%SZ")]

    # description
    description = [talk['description']]

    # content: the transcript with its HTML tags stripped
    content = [strip_tags(talk['transcript'])]

    # subjects: the tags arrive as a single ", "-separated string
    subject = talk['tags'].split(', ')

    # Every document gets a freshly generated UUID as its id; the talk's own
    # id is carried in the 'identifier' field.
    doc = {
        'type': 'add',
        'id': str(uuid.uuid1()),
        'fields': {
            'title': titles,
            'identifier': [talk_id],
            'creator': creators,
            'publisher': ['TED'],
            'date': dates,
            'language': ['eng'],
            'description': description,
            'content': content,
            'subject': subject,
        },
    }
    docs.append(doc)

# Send all documents in one request and print the server's response.
# NOTE(review): the upload is placed after the loop so the batch is posted
# exactly once — confirm this matches the intended structure.
url = 'our cloudsearch batch endpoint'
r = requests.post(url, json=docs)
print(r.status_code)
print(r.text)
print("")