-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathprepare.py
More file actions
52 lines (47 loc) · 1.64 KB
/
prepare.py
File metadata and controls
52 lines (47 loc) · 1.64 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
from bs4 import BeautifulSoup as BS
from BetacodeConvertor.BetacodeConvertor import *
import re
from settings import NO_SPACES
"""
Parse a Perseus XML text and return its author, its title, and a list of the text of each requested unit (e.g. chapter).
"""
def parse_perseus(xml_text, unit):
    """Parse a Perseus XML text into its author, title, and unit texts.

    Args:
        xml_text: The Perseus XML markup; handed to BeautifulSoup's
            ``'xml'`` parser.
        unit: Name of the tag whose occurrences are collected as passages;
            passed straight to ``find_all``.

    Returns:
        A tuple ``(author, title, books)``. ``books`` holds one entry per
        ``unit`` element, in document order, each with its whitespace
        collapsed and run through ``BetacodeConvertor.convert``.

    Raises:
        AttributeError: If the document has no ``author`` element
            (``parsed_text.author`` is then ``None``).
    """
    parsed_text = BS(xml_text, 'xml')
    con = BetacodeConvertor()
    author = parsed_text.author.text

    # Build the title from every <title> inside <titleStmt>. Calling a tag
    # is BeautifulSoup shorthand for ``find_all``.
    title_tags = parsed_text.titleStmt('title')
    title = " ".join(t.text for t in title_tags)
    # Drop the boilerplate phrase, full stops and the "(Greek)" marker.
    # Raw string: the backslashes are regex escapes, not string escapes.
    title = re.sub(r'Machine readable text|\.|\(Greek\)', '', title).strip()

    # Remove these elements from the tree so their text does not end up in
    # the extracted passages. This happens after author/title are read.
    for tag in ('note', 'bibl', 'head'):
        for element in parsed_text.find_all(tag):
            element.extract()

    # Collapse every whitespace run to one space, then convert. Only the
    # first element of the converter's result is kept (presumably the
    # Unicode text -- confirm against BetacodeConvertor).
    books = [
        con.convert(re.sub(r'\s+', ' ', item.text))[0]
        for item in parsed_text.find_all(unit)
    ]
    return author, title, books
"""
For texts I prepare myself
"""
def parse_xml(xml_text):
    """Parse a self-prepared XML text into author, title, and section texts.

    Args:
        xml_text: XML markup handed to BeautifulSoup's ``'xml'`` parser.

    Returns:
        A tuple ``(author, title, books)`` where ``books`` lists the text
        of every ``section`` element in document order.
    """
    soup = BS(xml_text, 'xml')
    author = soup.author.text
    title = soup.title.text

    books = []
    for section in soup.find_all('section'):
        books.append(section.text)

    return author, title, books
def create_chunks(text_in, chunk_size):
    """Split ``text_in`` into consecutive chunks of ``chunk_size`` units.

    The unit depends on the ``NO_SPACES`` setting: when it is truthy the
    text is sliced directly (character by character for a string);
    otherwise the text is split on whitespace and each chunk is
    ``chunk_size`` words re-joined with single spaces. The last chunk may
    be shorter than ``chunk_size``.

    Args:
        text_in: The text to divide.
        chunk_size: Number of units per chunk.

    Returns:
        A list of chunks in their original order.
    """
    units = text_in if NO_SPACES else text_in.split()
    pieces = (
        units[start:start + chunk_size]
        for start in range(0, len(units), chunk_size)
    )
    if NO_SPACES:
        return list(pieces)
    return [' '.join(piece) for piece in pieces]
def remove_spaces(text_in):
    """Return ``text_in`` with every space character (U+0020) removed.

    Only literal spaces are dropped; tabs, newlines and other whitespace
    are kept.

    Args:
        text_in: The string to strip spaces from.

    Returns:
        A copy of ``text_in`` containing no space characters.
    """
    # A plain substring replacement needs no regular expression.
    return text_in.replace(' ', '')