-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathload.py
More file actions
43 lines (40 loc) · 1.88 KB
/
load.py
File metadata and controls
43 lines (40 loc) · 1.88 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
from database.models import Author, Text, Section, SectionContent, SectionNgrams, GlobalNgrams
from database import Session
from log import log
from database.utilities import get_or_create
from prepare import parse_perseus, parse_xml, create_chunks
from generate import generate_ngrams
from update import update_global_counts, update_vector_space
from settings import USE_ORIGINAL_DIVISIONS, DIVISION_LENGTH
def load(file_path):
    """Parse one XML source file and store its sections and n-grams.

    Paths that do not end in ``xml`` are reported on stdout and skipped.
    Paths ending in ``DIY.xml`` are parsed with ``parse_xml``; every other
    path is parsed with ``parse_perseus`` using ``'div1'``.

    The author and text rows are fetched or created first.  When
    ``USE_ORIGINAL_DIVISIONS`` is falsy the parsed sections are joined with
    single spaces into one section, and when ``DIVISION_LENGTH`` is also
    truthy that joined text is re-split with ``create_chunks``.  Each
    resulting section, numbered from 1, gets ``Section``, ``SectionContent``
    and ``SectionNgrams`` rows via ``get_or_create`` and is fed to
    ``update_global_counts``.  Finally the session is committed and
    ``update_vector_space`` is called.

    Args:
        file_path: Path of the file to load, as a string.

    Returns:
        None.
    """
    if not file_path.endswith('xml'):
        print('Not an XML file:' + file_path)
        # Stop here: falling through would hand a non-XML file to the parser.
        return

    # Open the source in a context manager so the handle is always closed.
    with open(file_path) as source:
        if file_path.endswith('DIY.xml'):
            author, title, sections = parse_xml(source)
        else:
            author, title, sections = parse_perseus(source, 'div1')
        # Materialise the sections before the file closes, in case the parser
        # hands back a lazy iterable -- TODO confirm what the parsers return.
        sections = list(sections)

    session = Session()
    author_row = get_or_create(session, Author, name=author)
    session.commit()
    text_row = get_or_create(session, Text, name=title, author=author_row.id)
    session.commit()
    global_ngrams = session.query(GlobalNgrams).first()
    log('Loading: ' + text_row.name)

    # NOTE(review): the DIVISION_LENGTH check is nested here because it reads
    # sections[0], i.e. the joined text built just above it.
    if not USE_ORIGINAL_DIVISIONS:
        sections = [' '.join(sections)]
        if DIVISION_LENGTH:
            sections = create_chunks(sections[0], DIVISION_LENGTH)

    for section_number, section_text in enumerate(sections, start=1):
        section_row = get_or_create(
            session, Section, source_text=text_row.id, number=section_number
        )
        log('Loading section ' + str(section_number))
        # Commit before section_row.id is used as a foreign key below.
        session.commit()
        content_row = get_or_create(
            session, SectionContent, section=section_row.id, content=section_text
        )
        log('Creating ngrams of ' + str(section_number))
        ngrams_row = get_or_create(
            session,
            SectionNgrams,
            section=section_row.id,
            ngrams=generate_ngrams(content_row.content),
        )
        log('Updating global ngram counts.')
        update_global_counts(session, global_ngrams, ngrams_row.ngrams)

    # NOTE(review): final commit and vector-space refresh are run once per
    # file, after all sections have been processed.
    session.commit()
    update_vector_space(session, global_ngrams)