diff --git a/notebooks/textbrew-examples.ipynb b/notebooks/textbrew-examples.ipynb
index 79b1148..4cf69f2 100644
--- a/notebooks/textbrew-examples.ipynb
+++ b/notebooks/textbrew-examples.ipynb
@@ -2,7 +2,21 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 1,
+   "metadata": {
+    "collapsed": true,
+    "deletable": true,
+    "editable": true
+   },
+   "outputs": [],
+   "source": [
+    "import sys\n",
+    "sys.path.insert(0, '../')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
    "metadata": {
     "collapsed": false,
     "deletable": true,
@@ -11,12 +25,13 @@
    "outputs": [],
    "source": [
     "from textbrew.format import Format\n",
-    "from textbrew.regex import RegexStudio , transforms"
+    "from textbrew.regex import transforms\n",
+    "from textbrew.regex.studio import RegexStudio\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 37,
    "metadata": {
     "collapsed": true,
     "deletable": true,
@@ -30,7 +45,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 38,
    "metadata": {
     "collapsed": false,
     "deletable": true,
@@ -43,7 +58,7 @@
        "'@microsoft Bill Gates is the richest man on Wall Street 09, who also does most charity (or so he claims)!'"
       ]
      },
-     "execution_count": 4,
+     "execution_count": 38,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -57,7 +72,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 43,
    "metadata": {
     "collapsed": false,
     "deletable": true,
@@ -67,10 +82,10 @@
     {
      "data": {
       "text/plain": [
-       "'@microsoft bill_gates is the|richest|man on wall_street 09 , who also does most|charity ( or so he claims ) ! '"
+       "u'@microsoft bill_gates is the|richest|man on wall_street 09 , who also does most|charity ( or so he claims ) ! '"
      ]
     },
-     "execution_count": 5,
+     "execution_count": 43,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -78,7 +93,7 @@
    "source": [
     "# Create compund nouns out out of text(probable applications - Training Compound Word Vectors)\n",
     "formatter = Format()\n",
-    "formatter.compound_nouns(cleaned_text)"
+    "formatter.compound_nouns(unicode(cleaned_text))"
    ]
   },
   {
@@ -93,21 +108,21 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 2",
    "language": "python",
-   "name": "python3"
+   "name": "python2"
   },
   "language_info": {
    "codemirror_mode": {
     "name": "ipython",
-    "version": 3
+    "version": 2
    },
    "file_extension": ".py",
    "mimetype": "text/x-python",
    "name": "python",
    "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.5.2+"
+   "pygments_lexer": "ipython2",
+   "version": "2.7.12"
   }
  },
  "nbformat": 4,
diff --git a/textbrew/__init__.pyc b/textbrew/__init__.pyc
new file mode 100644
index 0000000..fa738c2
Binary files /dev/null and b/textbrew/__init__.pyc differ
diff --git a/textbrew/format.py b/textbrew/format.py
index 05a4ffa..80d5bce 100644
--- a/textbrew/format.py
+++ b/textbrew/format.py
@@ -1,3 +1,5 @@
+from __future__ import absolute_import
+
 import re
 import numpy as np
 from spacy.en import English
@@ -5,18 +7,18 @@
 
 PARSER = English()
 
 
-class Format():
+class Format(object):
 
-    def __init__(self, lower=True):
-        """
+    def __init__(self, lower = True):
+        u"""
         Constructor for 'Format' class. 
 
         :param lower: bool: Lowercases the text
         """
         self.lower = lower
 
-    def compound_nouns(self, text, nnp_sep="_", nn_sep="|"):
-        """
+    def compound_nouns(self, text, nnp_sep=u"_", nn_sep=u"|"):
+        u"""
         Combines Noun Phrases into a single token
         (with separators as provided in arguments)
@@ -35,24 +37,23 @@ def compound_nouns(self, text, nnp_sep="_", nn_sep="|"):
             phrase.merge(phrase.root.tag_, phrase.text, phrase.root.ent_type_)
 
         # Placeholder for formatted text
-        text = ""
+        text = u""
         for token in parsed_doc:
             # Reformatted phrase
-            phrase = ""
+            phrase = u""
 
             # Replaces white spaces between words in a phrase
             # with appropriate separators
-            if token.tag_ == "NNP":
-                phrase = token.text.replace(" ", nnp_sep)
-            elif token.tag_ == "NN":
-                phrase = token.text.replace(" ", nn_sep)
+            if token.tag_ == u"NNP":
+                phrase = token.text.replace(u" ", nnp_sep)
+            elif token.tag_ == u"NN":
+                phrase = token.text.replace(u" ", nn_sep)
             else:
                 phrase = token.text
 
-            text += phrase + " "
+            text += phrase + u" "
 
         # If 'lower' argument is true then lowercase the text
         if self.lower:
             text = text.lower()
-
         return text
diff --git a/textbrew/regex/__init__.py b/textbrew/regex/__init__.py
index 528eaf8..83cd9aa 100644
--- a/textbrew/regex/__init__.py
+++ b/textbrew/regex/__init__.py
@@ -1,2 +1,4 @@
+from __future__ import absolute_import
+
 from .studio import RegexStudio
 from . import transforms
diff --git a/textbrew/regex/studio.py b/textbrew/regex/studio.py
index a4ee2a5..c2e8ff9 100644
--- a/textbrew/regex/studio.py
+++ b/textbrew/regex/studio.py
@@ -1,5 +1,6 @@
+from __future__ import absolute_import
+
 import re
-from functools import reduce
 
 from .transforms import BaseRegex, MergeSpaces
 
@@ -8,19 +9,19 @@ def process_regex(data, regex):
 
 
 class RegexStudio(object):
-    """
+    u"""
     Common Regex operations for text cleaning and matching.
     """
 
-    def __init__(self, spl_chars=''):
-        """
+    def __init__(self, spl_chars=u''):
+        u"""
         Constructor for RegexStudio, sets up regex patterns
 
         :param spl_chars: str: special characters to ignore for
                                cleaning purposes(eg: '_|$')
         """
 
     def add_escape_chars(self, text):
-        """
+        u"""
         Adds escape characters in a string to make it
         regex complaint
@@ -29,16 +30,16 @@ def add_escape_chars(self, text):
         :returns text
         """
         # Get a list of all unique special characters
-        spl_chars = list(set(re.findall("[^A-Za-z0-9,\s]", text)))
+        spl_chars = list(set(re.findall(u"[^A-Za-z0-9,\s]", text)))
 
         # Append special characters with escape characters
         for char in spl_chars:
-            text = text.replace(char, '\\' + char)
+            text = text.replace(char, u'\\' + char)
 
         return text
 
-    def extract_substrings(self, text, start='^', end='$'):
-        """
+    def extract_substrings(self, text, start=u'^', end=u'$'):
+        u"""
         Extracts sub strings between two words.
         By default the initial sub-string is set to start
 
@@ -53,13 +54,13 @@ def extract_substrings(self, text, start='^', end='$'):
         start = self.add_escape_chars(start)
         end = self.add_escape_chars(end)
 
-        substring_regex = '.*' + start + '(.*?)' + end
+        substring_regex = u'.*' + start + u'(.*?)' + end
 
         matches = re.findall(substring_regex, text)
 
         return matches
 
     def cleaner(self, text, regexes=[MergeSpaces]):
-        """
+        u"""
         Removes charactes with 'True' values in the argument
         from the input string. 
@@ -108,7 +109,7 @@ def cleaner(self, text, regexes=[MergeSpaces]):
         return reduce(process_regex, regexes, text)
 
     def findall(self, regex, text):
-        """
+        u"""
         Finds all regex matches in a text string
 
         :param regex: str: regex pattern to be searched for
@@ -120,7 +121,7 @@ def findall(self, regex, text):
         return matches
 
     def matcher(self, text):
-        """
+        u"""
         Create a dictionary for all the properties(parts of text)
         and matches in constructor
         :param text: str
@@ -131,10 +132,10 @@ def matcher(self, text):
 
         # Iterate through all the properties on this class(see constructor)
         for arg, regex in self.__dict__.items():
-            key = "_".join(arg.split("_")[1:])
+            key = u"_".join(arg.split(u"_")[1:])
             matches[key] = self.findall(regex, text)
 
         # Pop 'merge_spaces' from the dictionary
-        matches.pop('merge_spaces')
+        matches.pop(u'merge_spaces')
 
         return matches
diff --git a/textbrew/regex/transforms.py b/textbrew/regex/transforms.py
index 54d9f80..aa377f3 100644
--- a/textbrew/regex/transforms.py
+++ b/textbrew/regex/transforms.py
@@ -1,5 +1,7 @@
+from __future__ import absolute_import
+
 import re
-from .utils import extract_sub_args
+from utils import extract_sub_args
 
 
 class BaseRegex:
diff --git a/textbrew/regex/utils.py b/textbrew/regex/utils.py
index f646639..efd5f62 100644
--- a/textbrew/regex/utils.py
+++ b/textbrew/regex/utils.py
@@ -1,9 +1,5 @@
-
-
 def extract_args(arg_list , **kwargs):
-    return {
-        i : kwargs.get(i , None) for i in arg_list
-    }
+    return dict((i, kwargs.get(i , None)) for i in arg_list)
 def extract_sub_args( *args ,**kwargs):
-    arg_list = ["replace" , "count" , "flags"]
+    arg_list = [u"replace" , u"count" , u"flags"]
     return extract_args(arg_list , **kwargs)
\ No newline at end of file
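For reference, the notebook cells touched by this diff boil down to the following Python 2 usage sketch. It assumes the textbrew package root sits one directory up (as in the notebook's sys.path cell) and that spaCy's English model is installed; the sample string is the cleaned text shown in the notebook output above, so treat the expected result as illustrative rather than guaranteed.

# Python 2 usage sketch mirroring the notebook cells in this diff.
# Assumes the textbrew package root is one directory up, as in the notebook.
import sys
sys.path.insert(0, '../')

from textbrew.format import Format
from textbrew.regex import transforms
from textbrew.regex.studio import RegexStudio

# 'cleaned_text' is taken from the notebook output shown in this diff.
cleaned_text = '@microsoft Bill Gates is the richest man on Wall Street 09, who also does most charity (or so he claims)!'

# Join noun phrases: proper nouns with "_" and common nouns with "|",
# lowercasing the result (lower=True is the Format default).
formatter = Format()
print(formatter.compound_nouns(unicode(cleaned_text)))
# Expected (per the notebook output):
# u'@microsoft bill_gates is the|richest|man on wall_street 09 , who also does most|charity ( or so he claims ) ! '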