diff --git a/corpustools/corpus/classes/lexicon.py b/corpustools/corpus/classes/lexicon.py index 2e21ab2e..7fa4bb1f 100644 --- a/corpustools/corpus/classes/lexicon.py +++ b/corpustools/corpus/classes/lexicon.py @@ -845,8 +845,9 @@ class EnvironmentFilter(object): Environments are strings of the form "[+feature,-feature]_[+feature]" or "[+feature]_" or "a_b" or "_b" """ - def __init__(self, corpus, env): - + def __init__(self, corpus, env, long_distance = False, segment_set = None): + self.segment_set = segment_set + self.long_distance = long_distance #there's a problem where some feature names have underscores in them #so doing lhs,rhs=env.split('_') causes unpacking problems #this in an awakward work-around that checks to see if either side of @@ -897,8 +898,50 @@ def __init__(self, corpus, env): self.rhs_string = rhs self.rhs = [rhs] + def apply(self, sequence): + results = list() + for i, s in enumerate(sequence): + if not s in self.segment_set: #Doesn't match + continue + if not self.long_distance: + #FIXME! Transcription.get_env should just return an Environment + env = Environment(*sequence.get_env(i)) #Basic local environment check + if env not in self: + continue + else: + if self.lhs and self.lhs != '#': + if i == 0: #First elements + continue + for j in range(i-1, -1,-1): + if sequence[j] in self.lhs: + break + else: + continue + if self.rhs and self.rhs != '#': + if i == len(sequence) - 1: + continue + for j in range(i+1, len(sequence)): + if sequence[j] in self.rhs: + break + else: + continue + results.append((s, self)) + return results + def __str__(self): - return '_'.join([self.lhs_string,self.rhs_string]) + if not self.long_distance: + to_join = [self.lhs_string,self.rhs_string] + else: + to_join = list() + if self.lhs: + to_join.append(self.lhs_string+'*') + else: + to_join.append('') + if self.rhs: + to_join.append('*'+self.rhs_string) + else: + to_join.append('') + return '_'.join(to_join) def __eq__(self, other): if not hasattr(other,'lhs'): @@ -915,14 +958,15 @@ def __hash__(self): return hash((self.rhs_string, self.lhs_string)) def __contains__(self, item): - if not isinstance(item, Environment): + if isinstance(item, Environment): + if self.rhs: + if item.rhs not in self.rhs: + return False + if self.lhs: + if item.lhs not in self.lhs: + return False + else: return False - if self.rhs: - if item.rhs not in self.rhs: - return False - if self.lhs: - if item.lhs not in self.lhs: - return False return True class Attribute(object): @@ -1625,6 +1669,7 @@ def check_coverage(self): return [x for x in self._inventory.keys() if x not in self.specifier] def phonological_search(self,seg_list,envs=None, sequence_type = 'transcription', + long_distance = False, call_back = None, stop_check = None): """ Perform a search of a corpus for segments, with the option of only @@ -1665,7 +1710,9 @@ def phonological_search(self,seg_list,envs=None, sequence_type = 'transcription' call_back(0,len(self)) cur = 0 if envs is not None: - envs = [EnvironmentFilter(self, env) for env in envs] + envs = [EnvironmentFilter(self, env, long_distance, seg_list) for env in envs] + else: + envs = [EnvironmentFilter(self, '#_#', True, seg_list)] results = list() for word in self: if stop_check is not None and stop_check(): @@ -1675,17 +1722,9 @@ def phonological_search(self,seg_list,envs=None, sequence_type = 'transcription' if cur % 20 == 0: call_back(cur) founds = list() - for pos,seg in enumerate(getattr(word, sequence_type)): - if not seg in seg_list: - continue - if envs is None: - founds.append((seg,'')) - continue - word_env = word.get_env(pos, sequence_type) - for env in envs: - if word_env in env: - founds.append((seg,env)) - break + tier = getattr(word, sequence_type) + for env in envs: + founds.extend(env.apply(tier)) if founds: results.append((word, founds)) return results diff --git a/corpustools/gui/models.py b/corpustools/gui/models.py index cd7e4e20..883f0db9 100644 --- a/corpustools/gui/models.py +++ b/corpustools/gui/models.py @@ -40,7 +40,7 @@ def data(self, index, role=None): else: data = 'No' elif isinstance(data,list): - data = ', '.join(data) + data = ', '.join(map(str,data)) else: data = str(data) except IndexError: @@ -453,18 +453,22 @@ def __init__(self, header, summary_header, results, settings, parent=None): self.summary_header = summary_header self.columns = self.header - self.rows = results - self.allData = self.rows + self.rows = [x[1] for x in results] #Tuples of feature specification, result_line) + self.allData = results self.summarized = False def _summarize(self): typefreq = defaultdict(float) tokenfreq = defaultdict(float) for line in self.allData: + features, line = line segs = line[2] envs = line[3] for i,seg in enumerate(segs): - segenv = seg,envs[i] + if features is None: + segenv = seg,envs[i] + else: + segenv = features,envs[i] typefreq[segenv] += 1 tokenfreq[segenv] += line[0].frequency @@ -481,7 +485,7 @@ def setSummarized(self, b): if self.summarized: self._summarize() else: - self.rows = self.allData + self.rows = [x[1] for x in self.allData] self.columns = self.header self.layoutChanged.emit() diff --git a/corpustools/gui/psgui.py b/corpustools/gui/psgui.py index 8d63f2eb..a66f390b 100644 --- a/corpustools/gui/psgui.py +++ b/corpustools/gui/psgui.py @@ -76,11 +76,21 @@ def __init__(self, parent, corpus, showToolTips): pslayout.addWidget(self.envWidget) - optionLayout = QVBoxLayout() + optionLayout = QFormLayout() self.tierWidget = TierWidget(corpus,include_spelling=False) - optionLayout.addWidget(self.tierWidget) + optionLayout.addRow(self.tierWidget) + + self.longDistanceCheck = QCheckBox() + + optionLayout.addRow('Allow for intervening segments', self.longDistanceCheck) + + self.forceSegmentsCheck = QCheckBox() + + self.forceSegmentsCheck.setChecked(True) + + optionLayout.addRow('Force summary by segments', self.forceSegmentsCheck) optionFrame = QGroupBox('Options') @@ -120,12 +130,17 @@ def generateKwargs(self): "Missing information", "Please specify at least one {}.".format(targetType[:-1].lower())) return if targetType == 'Features': + self.features = targetList targetList = targetList[1:-1] kwargs['seg_list'] = self.corpus.features_to_segments(targetList) else: + self.features = None kwargs['seg_list'] = targetList + if self.forceSegmentsCheck.isChecked(): + self.features = None kwargs['corpus'] = self.corpus kwargs['sequence_type'] = self.tierWidget.value() + kwargs['long_distance'] = self.longDistanceCheck.isChecked() envs = self.envWidget.value() if len(envs) > 0: kwargs['envs'] = envs @@ -152,5 +167,5 @@ def setResults(self,results): envs = [str(x[1]) for x in f] except IndexError: envs = [] - self.results.append([w, str(getattr(w,self.tierWidget.value())),segs, - envs]) + self.results.append((self.features, [w, str(getattr(w,self.tierWidget.value())),segs, + envs])) diff --git a/corpustools/gui/views.py b/corpustools/gui/views.py index fd83055b..7e8b3ca6 100644 --- a/corpustools/gui/views.py +++ b/corpustools/gui/views.py @@ -834,8 +834,11 @@ def redo(self): if self.dialog.update: self.table.model().addRows(self.dialog.results) else: + dataModel = PhonoSearchResultsModel(self.dialog.header, + self.dialog.summary_header, self.dialog.results, self._parent.settings) + dataModel.setSummarized(self.summarized) self.table.setModel(dataModel) self.raise_() self.activateWindow() diff --git a/tests/test_lexicon.py b/tests/test_lexicon.py index 15e1a22c..46b914fc 100644 --- a/tests/test_lexicon.py +++ b/tests/test_lexicon.py @@ -408,9 +408,18 @@ def test_contains(self): self.assertFalse(env2 in envfilt) envfilt = EnvironmentFilter(self.corpus,'[+feature1]_[+feature1]') - self.assertTrue(env1 in envfilt) - self.assertFalse(env2 in envfilt) - self.assertFalse(env3 in envfilt) + assert(env1 in envfilt) + assert(env2 not in envfilt) + assert(env3 not in envfilt) + + def test_apply(self): + envfilt = EnvironmentFilter(self.corpus,'[-feature1]_[+feature1]',False,['a']) + c = self.corpus['c'] + assert(envfilt.apply(c.transcription) == [('a',envfilt)]) + + envfilt = EnvironmentFilter(self.corpus,'[-feature1]_[+feature1]',True,['a']) + c = self.corpus['c'] + assert(envfilt.apply(c.transcription) == [('a',envfilt)]) def test_categories_spe(specified_test_corpus):