Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 29 additions & 14 deletions notebooks/textbrew-examples.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,21 @@
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 1,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"import sys\n",
"sys.path.insert(0, '../')"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {
"collapsed": false,
"deletable": true,
Expand All @@ -11,12 +25,13 @@
"outputs": [],
"source": [
"from textbrew.format import Format\n",
"from textbrew.regex import RegexStudio , transforms"
"from textbrew.regex import transforms\n",
"from textbrew.regex.studio import RegexStudio\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 37,
"metadata": {
"collapsed": true,
"deletable": true,
Expand All @@ -30,7 +45,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 38,
"metadata": {
"collapsed": false,
"deletable": true,
Expand All @@ -43,7 +58,7 @@
"'@microsoft Bill Gates is the richest man on Wall Street 09, who also does most charity (or so he claims)!'"
]
},
"execution_count": 4,
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -57,7 +72,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 43,
"metadata": {
"collapsed": false,
"deletable": true,
Expand All @@ -67,18 +82,18 @@
{
"data": {
"text/plain": [
"'@microsoft bill_gates is the|richest|man on wall_street 09 , who also does most|charity ( or so he claims ) ! '"
"u'@microsoft bill_gates is the|richest|man on wall_street 09 , who also does most|charity ( or so he claims ) ! '"
]
},
"execution_count": 5,
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
    "# Create compound nouns out of text (probable applications - Training Compound Word Vectors)\n",
"formatter = Format()\n",
"formatter.compound_nouns(cleaned_text)"
"formatter.compound_nouns(unicode(cleaned_text))"
]
},
{
Expand All @@ -93,21 +108,21 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python 2",
"language": "python",
"name": "python3"
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2+"
"pygments_lexer": "ipython2",
"version": "2.7.12"
}
},
"nbformat": 4,
Expand Down
Binary file added textbrew/__init__.pyc
Binary file not shown.
27 changes: 14 additions & 13 deletions textbrew/format.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,24 @@
from __future__ import absolute_import

import re
import numpy as np
from spacy.en import English

PARSER = English()


class Format():
class Format(object):

def __init__(self, lower=True):
"""
def __init__(self, lower = True):
u"""
Constructor for 'Format' class.

:param lower: bool: Lowercases the text
"""
self.lower = lower

def compound_nouns(self, text, nnp_sep="_", nn_sep="|"):
"""
def compound_nouns(self, text, nnp_sep=u"_", nn_sep=u"|"):
u"""
Combines Noun Phrases into a single token
(with separators as provided in arguments)

Expand All @@ -35,24 +37,23 @@ def compound_nouns(self, text, nnp_sep="_", nn_sep="|"):
phrase.merge(phrase.root.tag_, phrase.text, phrase.root.ent_type_)

# Placeholder for formatted text
text = ""
text = u""

for token in parsed_doc:
# Reformatted phrase
phrase = ""
phrase = u""

# Replaces white spaces between words in a phrase
# with appropriate separators
if token.tag_ == "NNP":
phrase = token.text.replace(" ", nnp_sep)
elif token.tag_ == "NN":
phrase = token.text.replace(" ", nn_sep)
if token.tag_ == u"NNP":
phrase = token.text.replace(u" ", nnp_sep)
elif token.tag_ == u"NN":
phrase = token.text.replace(u" ", nn_sep)
else:
phrase = token.text
text += phrase + " "
text += phrase + u" "

# If 'lower' argument is true then lowercase the text
if self.lower:
text = text.lower()

return text
2 changes: 2 additions & 0 deletions textbrew/regex/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
from __future__ import absolute_import

from .studio import RegexStudio
from . import transforms
31 changes: 16 additions & 15 deletions textbrew/regex/studio.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import absolute_import

import re
from functools import reduce
from .transforms import BaseRegex, MergeSpaces


Expand All @@ -8,19 +9,19 @@ def process_regex(data, regex):


class RegexStudio(object):
"""
u"""
Common Regex operations for text cleaning and matching.
"""

def __init__(self, spl_chars=''):
"""
def __init__(self, spl_chars=u''):
u"""
Constructor for RegexStudio, sets up regex patterns

:param spl_chars: str: special characters to ignore for cleaning purposes(eg: '_|$')
"""

def add_escape_chars(self, text):
"""
u"""
        Adds escape characters in a string
        to make it regex compliant

Expand All @@ -29,16 +30,16 @@ def add_escape_chars(self, text):
:returns text
"""
# Get a list of all unique special characters
spl_chars = list(set(re.findall("[^A-Za-z0-9,\s]", text)))
spl_chars = list(set(re.findall(u"[^A-Za-z0-9,\s]", text)))

# Append special characters with escape characters
for char in spl_chars:
text = text.replace(char, '\\' + char)
text = text.replace(char, u'\\' + char)

return text

def extract_substrings(self, text, start='^', end='$'):
"""
def extract_substrings(self, text, start=u'^', end=u'$'):
u"""
Extracts sub strings between two words.

By default the initial sub-string is set to start
Expand All @@ -53,13 +54,13 @@ def extract_substrings(self, text, start='^', end='$'):
start = self.add_escape_chars(start)
end = self.add_escape_chars(end)

substring_regex = '.*' + start + '(.*?)' + end
substring_regex = u'.*' + start + u'(.*?)' + end
matches = re.findall(substring_regex, text)

return matches

def cleaner(self, text, regexes=[MergeSpaces]):
"""
u"""
        Removes characters with 'True' values in the argument
from the input string.

Expand Down Expand Up @@ -108,7 +109,7 @@ def cleaner(self, text, regexes=[MergeSpaces]):
return reduce(process_regex, regexes, text)

def findall(self, regex, text):
"""
u"""
Finds all regex matches in a text string

:param regex: str: regex pattern to be searched for
Expand All @@ -120,7 +121,7 @@ def findall(self, regex, text):
return matches

def matcher(self, text):
"""
u"""
Create a dictionary for all the properties(parts of text) and matches in constructor

:param text: str
Expand All @@ -131,10 +132,10 @@ def matcher(self, text):

# Iterate through all the properties on this class(see constructor)
for arg, regex in self.__dict__.items():
key = "_".join(arg.split("_")[1:])
key = u"_".join(arg.split(u"_")[1:])
matches[key] = self.findall(regex, text)

# Pop 'merge_spaces' from the dictionary
matches.pop('merge_spaces')
matches.pop(u'merge_spaces')

return matches
4 changes: 3 additions & 1 deletion textbrew/regex/transforms.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from __future__ import absolute_import

import re
from .utils import extract_sub_args
from utils import extract_sub_args


class BaseRegex:
Expand Down
8 changes: 2 additions & 6 deletions textbrew/regex/utils.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,5 @@


def extract_args(arg_list , **kwargs):
return {
i : kwargs.get(i , None) for i in arg_list
}
return dict((i, kwargs.get(i , None)) for i in arg_list)
def extract_sub_args( *args ,**kwargs):
arg_list = ["replace" , "count" , "flags"]
arg_list = [u"replace" , u"count" , u"flags"]
return extract_args(arg_list , **kwargs)