Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 29 additions & 14 deletions notebooks/textbrew-examples.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,21 @@
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 1,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"import sys\n",
"sys.path.insert(0, '../')"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {
"collapsed": false,
"deletable": true,
Expand All @@ -11,12 +25,13 @@
"outputs": [],
"source": [
"from textbrew.format import Format\n",
"from textbrew.regex import RegexStudio , transforms"
"from textbrew.regex import transforms\n",
"from textbrew.regex.studio import RegexStudio\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 37,
"metadata": {
"collapsed": true,
"deletable": true,
Expand All @@ -30,7 +45,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 38,
"metadata": {
"collapsed": false,
"deletable": true,
Expand All @@ -43,7 +58,7 @@
"'@microsoft Bill Gates is the richest man on Wall Street 09, who also does most charity (or so he claims)!'"
]
},
"execution_count": 4,
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -57,7 +72,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 43,
"metadata": {
"collapsed": false,
"deletable": true,
Expand All @@ -67,18 +82,18 @@
{
"data": {
"text/plain": [
"'@microsoft bill_gates is the|richest|man on wall_street 09 , who also does most|charity ( or so he claims ) ! '"
"u'@microsoft bill_gates is the|richest|man on wall_street 09 , who also does most|charity ( or so he claims ) ! '"
]
},
"execution_count": 5,
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
    "# Create compound nouns out of text (probable applications - Training Compound Word Vectors)\n",
"formatter = Format()\n",
"formatter.compound_nouns(cleaned_text)"
"formatter.compound_nouns(unicode(cleaned_text))"
]
},
{
Expand All @@ -93,21 +108,21 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python 2",
"language": "python",
"name": "python3"
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2+"
"pygments_lexer": "ipython2",
"version": "2.7.12"
}
},
"nbformat": 4,
Expand Down
Binary file added textbrew/__init__.pyc
Binary file not shown.
27 changes: 14 additions & 13 deletions textbrew/format.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,24 @@
from __future__ import absolute_import

import re
import numpy as np
from spacy.en import English

PARSER = English()


class Format():
class Format(object):

def __init__(self, lower=True):
"""
def __init__(self, lower = True):
u"""
Constructor for 'Format' class.

:param lower: bool: Lowercases the text
"""
self.lower = lower

def compound_nouns(self, text, nnp_sep="_", nn_sep="|"):
"""
def compound_nouns(self, text, nnp_sep=u"_", nn_sep=u"|"):
u"""
Combines Noun Phrases into a single token
(with separators as provided in arguments)

Expand All @@ -35,24 +37,23 @@ def compound_nouns(self, text, nnp_sep="_", nn_sep="|"):
phrase.merge(phrase.root.tag_, phrase.text, phrase.root.ent_type_)

# Placeholder for formatted text
text = ""
text = u""

for token in parsed_doc:
# Reformatted phrase
phrase = ""
phrase = u""

# Replaces white spaces between words in a phrase
# with appropriate separators
if token.tag_ == "NNP":
phrase = token.text.replace(" ", nnp_sep)
elif token.tag_ == "NN":
phrase = token.text.replace(" ", nn_sep)
if token.tag_ == u"NNP":
phrase = token.text.replace(u" ", nnp_sep)
elif token.tag_ == u"NN":
phrase = token.text.replace(u" ", nn_sep)
else:
phrase = token.text
text += phrase + " "
text += phrase + u" "

# If 'lower' argument is true then lowercase the text
if self.lower:
text = text.lower()

return text
2 changes: 2 additions & 0 deletions textbrew/regex/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
from __future__ import absolute_import

from .studio import RegexStudio
from . import transforms
31 changes: 16 additions & 15 deletions textbrew/regex/studio.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import absolute_import

import re
from functools import reduce
from .transforms import BaseRegex, MergeSpaces


Expand All @@ -8,19 +9,19 @@ def process_regex(data, regex):


class RegexStudio(object):
"""
u"""
Common Regex operations for text cleaning and matching.
"""

def __init__(self, spl_chars=''):
"""
def __init__(self, spl_chars=u''):
u"""
Constructor for RegexStudio, sets up regex patterns

:param spl_chars: str: special characters to ignore for cleaning purposes(eg: '_|$')
"""

def add_escape_chars(self, text):
"""
u"""
        Adds escape characters in a string
        to make it regex compliant

Expand All @@ -29,16 +30,16 @@ def add_escape_chars(self, text):
:returns text
"""
# Get a list of all unique special characters
spl_chars = list(set(re.findall("[^A-Za-z0-9,\s]", text)))
spl_chars = list(set(re.findall(u"[^A-Za-z0-9,\s]", text)))

# Append special characters with escape characters
for char in spl_chars:
text = text.replace(char, '\\' + char)
text = text.replace(char, u'\\' + char)

return text

def extract_substrings(self, text, start='^', end='$'):
"""
def extract_substrings(self, text, start=u'^', end=u'$'):
u"""
Extracts sub strings between two words.

By default the initial sub-string is set to start
Expand All @@ -53,13 +54,13 @@ def extract_substrings(self, text, start='^', end='$'):
start = self.add_escape_chars(start)
end = self.add_escape_chars(end)

substring_regex = '.*' + start + '(.*?)' + end
substring_regex = u'.*' + start + u'(.*?)' + end
matches = re.findall(substring_regex, text)

return matches

def cleaner(self, text, regexes=[MergeSpaces]):
"""
u"""
        Removes characters with 'True' values in the argument
from the input string.

Expand Down Expand Up @@ -108,7 +109,7 @@ def cleaner(self, text, regexes=[MergeSpaces]):
return reduce(process_regex, regexes, text)

def findall(self, regex, text):
"""
u"""
Finds all regex matches in a text string

:param regex: str: regex pattern to be searched for
Expand All @@ -120,7 +121,7 @@ def findall(self, regex, text):
return matches

def matcher(self, text):
"""
u"""
Create a dictionary for all the properties(parts of text) and matches in constructor

:param text: str
Expand All @@ -131,10 +132,10 @@ def matcher(self, text):

# Iterate through all the properties on this class(see constructor)
for arg, regex in self.__dict__.items():
key = "_".join(arg.split("_")[1:])
key = u"_".join(arg.split(u"_")[1:])
matches[key] = self.findall(regex, text)

# Pop 'merge_spaces' from the dictionary
matches.pop('merge_spaces')
matches.pop(u'merge_spaces')

return matches
4 changes: 3 additions & 1 deletion textbrew/regex/transforms.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from __future__ import absolute_import

import re
from .utils import extract_sub_args
from utils import extract_sub_args


class BaseRegex:
Expand Down
8 changes: 2 additions & 6 deletions textbrew/regex/utils.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,5 @@


def extract_args(arg_list , **kwargs):
return {
i : kwargs.get(i , None) for i in arg_list
}
return dict((i, kwargs.get(i , None)) for i in arg_list)
def extract_sub_args( *args ,**kwargs):
arg_list = ["replace" , "count" , "flags"]
arg_list = [u"replace" , u"count" , u"flags"]
return extract_args(arg_list , **kwargs)