-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathner.py
More file actions
executable file
·106 lines (90 loc) · 3.37 KB
/
ner.py
File metadata and controls
executable file
·106 lines (90 loc) · 3.37 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import os
from itertools import groupby

from nltk.tag import StanfordNERTagger

from util import isnumber
# File name presumably intended for caching extracted entities; it is not
# read or written anywhere in this part of the module -- TODO confirm usage.
cache_file_path = "entities.json"
# Directory holding the Stanford NER model and jar, given as a relative path
# (so it is resolved against the current working directory).
NER_dir = os.path.join('NER_Tagger')
# Serialized CRF classifier handed to the tagger as its model file.
model_name = os.path.join(NER_dir, 'english.all.3class.distsim.crf.ser.gz')
# Stanford NER jar handed to the tagger.
jar_name = os.path.join(NER_dir, 'stanford-ner.jar')
# Shared tagger instance; it is constructed at import time, so importing this
# module requires the two files above to be usable by NLTK.
st = StanfordNERTagger(model_filename=model_name, path_to_jar=jar_name)
# Tagged sentences produced by the most recent extract_all() call; indexed by
# sentence id in get_entities(). Empty until extract_all() has been called.
tagged_sents = []
def extract_all(sentences):
    """Tag every sentence in one batch and store the result module-wide.

    ``sentences`` is passed unchanged to ``st.tag_sents``; presumably it is
    a list of token lists -- TODO confirm against the callers.  The tagger's
    return value replaces the module-level ``tagged_sents`` (any previous
    contents are discarded).  Nothing is returned.
    """
    global tagged_sents
    tagged_sents = st.tag_sents(sentences)
def get_trans_entities(tagged_sent):
    """Translate raw NER tags of one sentence into the project's tag set.

    Args:
        tagged_sent: sequence of ``(token, tag)`` pairs for one sentence.

    Returns:
        A list with one single-entry dict ``{token: tag}`` per input pair,
        in input order, where the tag has been rewritten as follows:

        * ``'O'`` tokens accepted by ``isnumber`` become ``'NUMBER'``;
        * other ``'O'`` tokens that are purely alphabetic, contain an
          upper-case letter and are not the first token of the sentence
          become ``'OTHER'``;
        * ``'ORGANIZATION'`` becomes ``'OTHER'``;
        * every other tag is kept unchanged.
    """
    entity_dict = []
    for position, entity in enumerate(tagged_sent):
        token, tag = entity[0], entity[1]
        if tag == 'O':
            if isnumber(token):
                tag = "NUMBER"
            # The first token is capitalised by convention, so capitalisation
            # says nothing about it.  Test the *position*: comparing the pair
            # to tagged_sent[0] by value would also skip any later token that
            # merely repeats the sentence-initial word.
            elif position > 0 and token.isalpha() and not token.islower():
                tag = "OTHER"
        elif tag == 'ORGANIZATION':
            tag = "OTHER"
        entity_dict.append({token: tag})
    return entity_dict
def get_merged_entities(entities):
    """Merge consecutive tokens that share a tag into multi-word entities.

    Args:
        entities: list of ``{token: tag}`` dicts in sentence order (the
            shape produced by ``get_trans_entities``).  Dicts holding more
            than one pair are flattened in their own iteration order.

    Returns:
        A list of single-entry dicts ``{name: tag}``, one per maximal run
        of adjacent tokens with the same tag, where ``name`` is the run's
        tokens joined by single spaces.  Runs tagged ``'O'`` are omitted.
        An empty input yields an empty list.
    """
    pairs = (pair for entity in entities for pair in entity.items())
    merged_entities = []
    # groupby only groups *adjacent* equal keys, which is exactly the
    # "run of the same tag" notion needed here; no sorting on purpose.
    for tag, run in groupby(pairs, key=lambda pair: pair[1]):
        if tag == 'O':
            continue
        name = ' '.join(token for token, _ in run)
        merged_entities.append({name: tag})
    return merged_entities
def get_entities(sentence_id):
    """Return the merged entities of one sentence tagged by ``extract_all``.

    ``sentence_id`` is used as an index into the module-level
    ``tagged_sents``, so ``extract_all`` must have been called beforehand
    for the lookup to find anything.  The tagged sentence is first passed
    through ``get_trans_entities`` and the result through
    ``get_merged_entities``, whose return value is returned as is.
    """
    # Look up the cached tagger output for this sentence.
    tagged_sent = tagged_sents[sentence_id]
    # Rewrite the raw tags, then collapse adjacent same-tag tokens.
    entity_dict = get_trans_entities(tagged_sent)
    return get_merged_entities(entity_dict)
# Keyword sets per answer type.  Order matters: when two types collect the
# same number of keyword hits, the one listed first wins.
_QUESTION_KEYWORDS = (
    ('PERSON', frozenset({
        'who', 'who\'s', 'whom', 'whose', 'person', 'name', 'president',
    })),
    ('LOCATION', frozenset({
        'where', 'place', 'district', 'city', 'country',
    })),
    ('NUMBER', frozenset({
        'many', 'much', 'amount', 'length', 'time', 'day', 'year', 'decade',
        'decades', 'long', 'old', 'range', 'percent', 'level', 'average',
        'population', 'wavelength',
    })),
)


def question_type(question):
    """Guess the expected answer type of a tokenised question.

    Args:
        question: iterable of word strings; matching is case-insensitive.

    Returns:
        ``'PERSON'``, ``'LOCATION'`` or ``'NUMBER'`` -- whichever type has
        the most keyword occurrences in the question, ties going to the
        earlier type in that order -- or ``'OTHER'`` when no keyword occurs
        at all (including for an empty question).
    """
    hits = [0] * len(_QUESTION_KEYWORDS)
    for word in question:
        token = word.lower()
        for slot, (_, keywords) in enumerate(_QUESTION_KEYWORDS):
            if token in keywords:
                hits[slot] += 1

    best = max(hits)
    # Without this guard an all-zero score list would resolve to the first
    # slot and label keyword-free (or empty) questions as PERSON.
    if best == 0:
        return 'OTHER'
    # list.index returns the first maximum, which implements the tie-break.
    return _QUESTION_KEYWORDS[hits.index(best)][0]