From 70a9657ce256a7ae4cc9901c5f31885702ea30c0 Mon Sep 17 00:00:00 2001 From: Unknown Date: Fri, 11 May 2018 18:35:41 +0200 Subject: [PATCH] error checking + compound stems + nominal categories *) Error checking when loading a .dic, so the program logs the line of the .dic where the error comes from. *) Stems can now be compound words (more than one word per stem); * is not allowed in compound words. *) Categories can be nominal, which means that they don't have a number. They are for humans to read. --- py_lex/liwc_parser.py | 44 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 37 insertions(+), 7 deletions(-) diff --git a/py_lex/liwc_parser.py b/py_lex/liwc_parser.py index 9919b66..e37de23 100644 --- a/py_lex/liwc_parser.py +++ b/py_lex/liwc_parser.py @@ -71,7 +71,11 @@ def get_refs(self, key): if stems is not None: for stem in stems: - categories |= self.stem_to_categories[stem] + if stem in self.stem_to_categories: + categories |= self.stem_to_categories[stem] + else: + raise Exception("for key {2} stem {0} is not in stem to categories: {1}" + .format(stem, self.stem_to_categories, key)) return categories @@ -120,15 +124,33 @@ def _get_category_and_stem_lines(self, liwc_lines): category_lines: List[List[str, str]] -> Bidict ''' def _build_category_ref_bidict(self, category_lines): - return frozenbidict({ int(cat[0]):cat[1] for cat in category_lines }) + d = {} + try: + for cat in category_lines: + if len(cat) > 1 and cat[0].isdigit(): + d[int(cat[0])] = cat[1] + return frozenbidict(d) + except Exception as e: + raise Exception("category line {0} raise the exception: {1}".format(cat, e)) ''' [['stem*', '1', '3', ...], ...] 
-> { 'stem': set([1, 3, ...]), ...} ''' def _build_stem_cat_dict(self, stem_lines): - return { stem[0].rstrip('*'): set(int(x) for x in stem[1:]) - for stem in stem_lines } + try: + d = {} + for stem in stem_lines: + word_index = 0 + word = "" + while not stem[word_index].isdigit(): + word += stem[word_index] + word_index += 1 + + d[word.rstrip('*')] = set(int(x) for x in stem[word_index:]) + return d + except Exception as e: + raise Exception("stem line {0} raise exception: {1}".format(stem, e)) ''' Separate simple word lookup (where we can use a dictionary directly) from @@ -139,11 +161,19 @@ def _build_stem_cat_dict(self, stem_lines): ''' def _build_stem_key_lists(self, stem_lines): stems, keys = [], [] - endswith_star = lambda s: str.endswith(s, '*') for stem in stem_lines: - (stems if endswith_star(stem[0]) else keys) \ - .append(stem[0].rstrip('*')) + word_index = 0 + word = "" + while not stem[word_index].isdigit(): + word += stem[word_index] + " " + word_index += 1 + word = word.strip() + + if word.endswith('*'): + stems.append(word.rstrip('*')) + else: + keys.append(word) return set(stems), set(keys)