diff --git a/.gitignore b/.gitignore index 7e99e36..f987842 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,7 @@ -*.pyc \ No newline at end of file +*.pyc +build/ +dist/ +inflector.egg-info.DS_Store +.idea/ +.DS_Store +inflector.egg-info/ diff --git a/README.markdown b/README.md similarity index 56% rename from README.markdown rename to README.md index 580285f..04f96cf 100644 --- a/README.markdown +++ b/README.md @@ -2,48 +2,85 @@ The Inflector is used for getting the plural and singular form of nouns. This piece of code helps on creating code that favors convention over configuration. -Only English and Spanish nouns are supported. The English version is a port of Ruby on Rails Inflector, while the Spanish Version has been developed from scratch with the help of Carles Sadurní. +Only English, French and Spanish nouns are supported. The English version is a port of Ruby on Rails Inflector, while the Spanish Version has been developed from scratch with the help of Carles Sadurní. +The French version was implemented by [sblondon](https://github.com/sblondon/pluralizefr). -Apart from converting singulars and plurals, this module also handles necessary string conversion for convention based applications like: +Apart from converting singulars and plurals, this module also handles necessary string + conversion for convention based applications like: *tableize*, *urlize*, and so forth. -Available methods are: -## pluralize(word) +## Requirements -Pluralizes nouns. + * Python 3.x -## singularize(word) +## Getting started -Singularizes nouns. +To install the inflector package, move to *inflector* directory, then run -## conditionalPlural(numer_of_records, word) + $ pip install . + +or if necessary -Returns the plural form of a word if first parameter is greater than 1 + $ pip3 install . 
+ +To work with the inflector, import the `Inflector` and the language support modules: + +```{python} +>>> from inflector import Inflector, French, English, Spanish +``` + +Then, to pluralize, run the following code: +```{python} +>>> Inflector(English()).pluralize("matrix") +'matrices' +>>> Inflector(French()).pluralize("cheval") +'chevaux' +>>> Inflector(Spanish()).pluralize("arbol") +'arboles' +``` + +Lastly, if you want to singularize, run: +```{python} +>>> Inflector(English()).singularize("matrices") +'matrix' +>>> Inflector(French()).singularize("bijoux") +'bijou' +>>> Inflector(Spanish()).singularize("Regímenes") +'Régimen' +``` + + +## Methods available + + * **pluralize(word)** +Pluralizes nouns. + + * **singularize(word)** +Singularizes nouns. -## titleize(word, uppercase = '') + * **conditionalPlural(numer_of_records, word)** +Returns the plural form of a word if first parameter is greater than 1 + * **titleize(word, uppercase = '')** Converts an underscored or CamelCase word into a sentence. The titleize function converts text like "WelcomePage", "welcome_page" or "welcome page" to this "Welcome Page". If the "uppercase" parameter is set to 'first' it will only capitalize the first character of the title. -## camelize(word): - + * **camelize(word):** Returns given word as CamelCased Converts a word like "send_email" to "SendEmail". It will remove non alphanumeric character from the word, so "who's online" will be converted to "WhoSOnline" -## underscore(word) - + * **underscore(word)** Converts a word "into_it_s_underscored_version" Convert any "CamelCased" or "ordinary Word" into an "underscored_word". This can be really useful for creating friendly URLs. -## humanize(word, uppercase = '') - + * **humanize(word, uppercase = '')** Returns a human-readable string from word Returns a human-readable string from word, by replacing underscores with a space, and by upper-casing the initial @@ -51,40 +88,33 @@ character by default. 
If you need to uppercase all the words you just have to pass 'all' as a second parameter. - -## variablize(word) - + * **variablize(word)** Same as camelize but first char is lowercased Converts a word like "send_email" to "sendEmail". It will remove non alphanumeric character from the word, so "who's online" will be converted to "whoSOnline" return self.Inflector.variablize(word) -## tableize(class_name) - + * **tableize(class_name)** Converts a class name to its table name according to rails naming conventions. Example. Converts "Person" to "people" -## classify(table_name) - + * **classify(table_name)** Converts a table name to its class name according to rails naming conventions. Example: Converts "people" to "Person" - -## ordinalize(number) + + * **ordinalize(number)** Converts number to its ordinal form. This method converts 13 to 13th, 2 to 2nd ... -## unaccent(text) - + * **unaccent(text)** Transforms a string to its unaccented version. This might be useful for generating "friendly" URLs -## urlize(text) - + * **urlize(text)** Transform a string its unaccented and underscored version ready to be inserted in friendly URLs -## foreignKey(class_name, separate_class_name_and_id_with_underscore = 1) - + * **foreignKey(class_name, separate_class_name_and_id_with_underscore = 1)** Returns class_name in underscored form, with "_id" tacked on at the end. This is for use in dealing with the database. 
diff --git a/inflector/__init__.py b/inflector/__init__.py new file mode 100644 index 0000000..f650bbc --- /dev/null +++ b/inflector/__init__.py @@ -0,0 +1,7 @@ +#coding=utf-8 + +from .rules.english import English +from .rules.french import French +from .rules.spanish import Spanish + +from .inflector import Inflector \ No newline at end of file diff --git a/inflector.py b/inflector/inflector.py similarity index 94% rename from inflector.py rename to inflector/inflector.py index c9be1c8..643ae5a 100644 --- a/inflector.py +++ b/inflector/inflector.py @@ -1,127 +1,126 @@ -#!/usr/bin/env python - -# Copyright (c) 2006 Bermi Ferrer Martinez -# -# bermi a-t bermilabs - com -# See the end of this file for the free software, open source license -# (BSD-style). - -from rules.english import English - - -class Inflector: - """ - Inflector for pluralizing and singularizing nouns. - - It provides methods for helping on creating programs - based on naming conventions like on Ruby on Rails. - """ - - def __init__(self, Inflector=English): - assert callable(Inflector), "Inflector should be a callable obj" - self.Inflector = apply(Inflector) - - def pluralize(self, word): - '''Pluralizes nouns.''' - return self.Inflector.pluralize(word) - - def singularize(self, word): - '''Singularizes nouns.''' - return self.Inflector.singularize(word) - - def conditionalPlural(self, numer_of_records, word): - '''Returns the plural form of a word if first parameter is greater than 1''' - return self.Inflector.conditionalPlural(numer_of_records, word) - - def titleize(self, word, uppercase=''): - '''Converts an underscored or CamelCase word into a sentence. - The titleize function converts text like "WelcomePage", - "welcome_page" or "welcome page" to this "Welcome Page". 
- If the "uppercase" parameter is set to 'first' it will only - capitalize the first character of the title.''' - return self.Inflector.titleize(word, uppercase) - - def camelize(self, word): - ''' Returns given word as CamelCased - Converts a word like "send_email" to "SendEmail". It - will remove non alphanumeric character from the word, so - "who's online" will be converted to "WhoSOnline"''' - return self.Inflector.camelize(word) - - def underscore(self, word): - ''' Converts a word "into_it_s_underscored_version" - Convert any "CamelCased" or "ordinary Word" into an - "underscored_word". - This can be really useful for creating friendly URLs.''' - return self.Inflector.underscore(word) - - def humanize(self, word, uppercase=''): - '''Returns a human-readable string from word - Returns a human-readable string from word, by replacing - underscores with a space, and by upper-casing the initial - character by default. - If you need to uppercase all the words you just have to - pass 'all' as a second parameter.''' - return self.Inflector.humanize(word, uppercase) - - def variablize(self, word): - '''Same as camelize but first char is lowercased - Converts a word like "send_email" to "sendEmail". It - will remove non alphanumeric character from the word, so - "who's online" will be converted to "whoSOnline"''' - return self.Inflector.variablize(word) - - def tableize(self, class_name): - ''' Converts a class name to its table name according to rails - naming conventions. Example. Converts "Person" to "people" ''' - return self.Inflector.tableize(class_name) - - def classify(self, table_name): - '''Converts a table name to its class name according to rails - naming conventions. Example: Converts "people" to "Person" ''' - return self.Inflector.classify(table_name) - - def ordinalize(self, number): - '''Converts number to its ordinal form. 
- This method converts 13 to 13th, 2 to 2nd ...''' - return self.Inflector.ordinalize(number) - - def unaccent(self, text): - '''Transforms a string to its unaccented version. - This might be useful for generating "friendly" URLs''' - return self.Inflector.unaccent(text) - - def urlize(self, text): - '''Transform a string its unaccented and underscored - version ready to be inserted in friendly URLs''' - return self.Inflector.urlize(text) - - def demodulize(self, module_name): - return self.Inflector.demodulize(module_name) - - def modulize(self, module_description): - return self.Inflector.modulize(module_description) - - def foreignKey(self, class_name, separate_class_name_and_id_with_underscore=1): - ''' Returns class_name in underscored form, with "_id" tacked on at the end. - This is for use in dealing with the database.''' - return self.Inflector.foreignKey(class_name, separate_class_name_and_id_with_underscore) - - - - -# Copyright (c) 2006 Bermi Ferrer Martinez -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software to deal in this software without restriction, including -# without limitation the rights to use, copy, modify, merge, publish, -# distribute, sublicense, and/or sell copies of this software, and to permit -# persons to whom this software is furnished to do so, subject to the following -# condition: -# -# THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THIS SOFTWARE OR THE USE OR OTHER DEALINGS IN -# THIS SOFTWARE. 
+#!/usr/bin/env python
+
+# Copyright (c) 2006 Bermi Ferrer Martinez
+#
+# bermi a-t bermilabs - com
+# See the end of this file for the free software, open source license
+# (BSD-style).
+
+from .rules.english import English
+
+
+class Inflector:
+    """
+    Inflector for pluralizing and singularizing nouns.
+
+    It provides methods for helping on creating programs
+    based on naming conventions like on Ruby on Rails.
+    """
+
+    def __init__(self, Inflector=English):
+        # Accept a rules class (the pre-package API instantiated its argument)
+        # or an already-built rules instance such as English().
+        self.Inflector = Inflector() if isinstance(Inflector, type) else Inflector
+
+    def pluralize(self, word):
+        '''Pluralizes nouns.'''
+        return self.Inflector.pluralize(word)
+
+    def singularize(self, word):
+        '''Singularizes nouns.'''
+        return self.Inflector.singularize(word)
+
+    def conditionalPlural(self, numer_of_records, word):
+        '''Returns the plural form of a word if first parameter is greater than 1'''
+        return self.Inflector.conditionalPlural(numer_of_records, word)
+
+    def titleize(self, word, uppercase=''):
+        '''Converts an underscored or CamelCase word into a sentence.
+        The titleize function converts text like "WelcomePage",
+        "welcome_page" or "welcome page" to this "Welcome Page".
+        If the "uppercase" parameter is set to 'first' it will only
+        capitalize the first character of the title.'''
+        return self.Inflector.titleize(word, uppercase)
+
+    def camelize(self, word):
+        ''' Returns given word as CamelCased
+        Converts a word like "send_email" to "SendEmail". It
+        will remove non alphanumeric character from the word, so
+        "who's online" will be converted to "WhoSOnline"'''
+        return self.Inflector.camelize(word)
+
+    def underscore(self, word):
+        ''' Converts a word "into_it_s_underscored_version"
+        Convert any "CamelCased" or "ordinary Word" into an
+        "underscored_word".
+        This can be really useful for creating friendly URLs.'''
+        return self.Inflector.underscore(word)
+
+    def humanize(self, word, uppercase=''):
+        '''Returns a human-readable string from word
+        Returns a human-readable string from word, by replacing
+        underscores with a space, and by upper-casing the initial
+        character by default.
+        If you need to uppercase all the words you just have to
+        pass 'all' as a second parameter.'''
+        return self.Inflector.humanize(word, uppercase)
+
+    def variablize(self, word):
+        '''Same as camelize but first char is lowercased
+        Converts a word like "send_email" to "sendEmail". It
+        will remove non alphanumeric character from the word, so
+        "who's online" will be converted to "whoSOnline"'''
+        return self.Inflector.variablize(word)
+
+    def tableize(self, class_name):
+        ''' Converts a class name to its table name according to rails
+        naming conventions. Example. Converts "Person" to "people" '''
+        return self.Inflector.tableize(class_name)
+
+    def classify(self, table_name):
+        '''Converts a table name to its class name according to rails
+        naming conventions. Example: Converts "people" to "Person" '''
+        return self.Inflector.classify(table_name)
+
+    def ordinalize(self, number):
+        '''Converts number to its ordinal form.
+        This method converts 13 to 13th, 2 to 2nd ...'''
+        return self.Inflector.ordinalize(number)
+
+    def unaccent(self, text):
+        '''Transforms a string to its unaccented version.
+        This might be useful for generating "friendly" URLs'''
+        return self.Inflector.unaccent(text)
+
+    def urlize(self, text):
+        '''Transforms a string into its unaccented and underscored
+        version ready to be inserted in friendly URLs'''
+        return self.Inflector.urlize(text)
+
+    def demodulize(self, module_name):
+        return self.Inflector.demodulize(module_name)
+
+    def modulize(self, module_description):
+        return self.Inflector.modulize(module_description)
+
+    def foreignKey(self, class_name, separate_class_name_and_id_with_underscore=1):
+        ''' Returns class_name in underscored form, with "_id" tacked on at the end.
+        This is for use in dealing with the database.'''
+        return self.Inflector.foreignKey(class_name, separate_class_name_and_id_with_underscore)
+
+
+# Copyright (c) 2006 Bermi Ferrer Martinez
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software to deal in this software without restriction, including
+# without limitation the rights to use, copy, modify, merge, publish,
+# distribute, sublicense, and/or sell copies of this software, and to permit
+# persons to whom this software is furnished to do so, subject to the following
+# condition:
+#
+# THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THIS SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THIS SOFTWARE.
diff --git a/__init__.py b/inflector/rules/__init__.py
similarity index 100%
rename from __init__.py
rename to inflector/rules/__init__.py
diff --git a/rules/base.py b/inflector/rules/base.py
similarity index 100%
rename from rules/base.py
rename to inflector/rules/base.py
diff --git a/rules/english.py b/inflector/rules/english.py
similarity index 99%
rename from rules/english.py
rename to inflector/rules/english.py
index 42d1b2e..412d9a7 100644
--- a/rules/english.py
+++ b/inflector/rules/english.py
@@ -7,7 +7,7 @@
 # (BSD-style).
 
 import re
-from base import Base
+from .base import Base
 
 
 class English (Base):
diff --git a/inflector/rules/french.py b/inflector/rules/french.py
new file mode 100644
index 0000000..8f449e1
--- /dev/null
+++ b/inflector/rules/french.py
@@ -0,0 +1,153 @@
+#!/usr/bin/env python
+
+# Copyright (c) 2006 Bermi Ferrer Martinez
+# bermi a-t bermilabs - com
+#
+# See the end of this file for the free software, open source license
+# (BSD-style).
+
+import re
+from .base import Base
+
+
+class French (Base):
+    """
+    Inflector for pluralizing and singularizing French nouns.
+
+    Every ending (-ail, -al, -au, ...) has its own rule method holding
+    the exceptions to the regular plural of that ending.
+    """
+
+    # Irregular plurals (produced by _ail_word / _eil_word) that no suffix rule can undo.
+    _IRREGULAR_SINGULARS = {"aulx": "ail", "vieux": "vieil"}
+
+    # Plurals in -aux whose singular ends in -ail instead of -al.
+    _AIL_PLURALS = frozenset((
+        "baux", "coraux", "émaux", "fermaux", "soupiraux", "travaux",
+        "vantaux", "ventaux", "vitraux"))
+
+    def pluralize(self, word):
+        '''Pluralizes French nouns.
+
+        The ending rules are tried in order and the first one that
+        recognises the word decides; _default appends "s" otherwise.'''
+        if not word:
+            # Nothing to inflect; an empty word is returned unchanged.
+            return word
+        grammar_rules = (
+            self._ail_word, self._al_word, self._au_word, self._eil_word,
+            self._eu_word, self._ou_word, self._s_word, self._x_word,
+            self._z_word, self._default)
+        for grammar_rule in grammar_rules:
+            plural = grammar_rule(word)
+            if plural:
+                return plural
+
+    def _ail_word(self, word):
+        # -ail: "aulx" for "ail", -aux for the listed nouns, otherwise +s.
+        if word.endswith("ail"):
+            if word == "ail":
+                return "aulx"
+            elif word in (
+                    "bail", "corail", u"émail", "fermail", "soupirail",
+                    "travail", "vantail", "ventail", "vitrail"):
+                return word[:-3] + "aux"
+            return word + "s"
+
+    def _al_word(self, word):
+        # -al: the listed words take +s, every other one becomes -aux.
+        if word.endswith("al"):
+            if word in (
+                    "bal", "carnaval", "chacal", "festival", u"récital", u"régal",
+                    "bancal", "fatal", "fractal", "final", "morfal", "natal", "naval",
+                    u"aéronaval",
+                    u"anténatal", u"néonatal", u"périnatal", u"postnatal", u"prénatal",
+                    "tonal", "atonal", "bitonal", "polytonal",
+                    "corral", "deal", "goal", "autogoal", "revival", "serial", "spiritual", "trial",
+                    "caracal", "chacal", "gavial", "gayal", "narval", "quetzal", "rorqual", "serval",
+                    "metical", "rial", "riyal", "ryal",
+                    "cantal", "emmental", "emmenthal",
+                    u"floréal", "germinal", "prairial",
+                    ):
+                return word + "s"
+            return word[:-2] + "aux"
+
+    def _au_word(self, word):
+        # -au: the listed words take +s, every other one takes +x.
+        if word.endswith("au"):
+            if word in ("berimbau", "donau", "karbau", "landau", "pilau", "sarrau", "unau"):
+                return word + "s"
+            return word + "x"
+
+    def _eil_word(self, word):
+        # -eil: +s, except "vieil" which becomes "vieux".
+        if word.endswith("eil"):
+            return "vieux" if word == "vieil" else word + "s"
+
+    def _eu_word(self, word):
+        # -eu: the listed words take +s, every other one takes +x.
+        if word.endswith("eu"):
+            if word in ("bleu", u"émeu", "enfeu", "pneu", "rebeu"):
+                return word + "s"
+            return word + "x"
+
+    def _ou_word(self, word):
+        # -ou: the listed words take +x, every other one takes +s.
+        if word.endswith("ou"):
+            if word in ("bijou", "caillou", "chou", "genou", "hibou", "joujou", "pou"):
+                return word + "x"
+            return word + "s"
+
+    def _s_word(self, word):
+        # A word already ending in "s" is returned unchanged.
+        if word.endswith("s"):
+            return word
+
+    def _x_word(self, word):
+        # A word already ending in "x" is returned unchanged.
+        if word.endswith("x"):
+            return word
+
+    def _z_word(self, word):
+        # A word already ending in "z" is returned unchanged.
+        if word.endswith("z"):
+            return word
+
+    def _default(self, word):
+        # Fallback rule: append "s".
+        return word + "s"
+
+    def singularize(self, word):
+        '''Singularizes French nouns.
+
+        The word is lower-cased first, so the result is lower case.
+        NOTE: any other plural in -aux (except -eaux) is assumed to come
+        from -al, so "tuyaux" gives "tuyal".'''
+        word = word.lower()
+        if word in self._IRREGULAR_SINGULARS:
+            # Must run before the suffix rules, which would only drop the "x".
+            return self._IRREGULAR_SINGULARS[word]
+        if word in self._AIL_PLURALS:
+            return word[:-3] + "ail"
+        if word.endswith(("als", "aux")) and not word.endswith("eaux"):
+            return word[:-3] + "al"
+        if word.endswith(("x", "s")):
+            return word[:-1]
+        return word
+
+
+# Copyright (c) 2006 Bermi Ferrer Martinez
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software to deal in this software without restriction, including
+# without limitation the rights to use, copy, modify, merge, publish,
+# distribute, sublicense, and/or sell copies of this software, and to permit
+# persons to whom this software is furnished to do so, subject to the following
+# condition:
+#
+# THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THIS SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THIS SOFTWARE.
diff --git a/inflector/rules/spanish.py b/inflector/rules/spanish.py
new file mode 100644
index 0000000..2d95fb1
--- /dev/null
+++ b/inflector/rules/spanish.py
@@ -0,0 +1,183 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# coding=utf-8
+# Copyright (c) 2006 Bermi Ferrer Martinez
+# Copyright (c) 2006 Carles Sadurní Anguita
+#
+# bermi a-t bermilabs - com
+#
+# See the end of this file for the free software, open source license
+# (BSD-style).
+
+import re
+from .base import Base
+
+
+class Spanish (Base):
+    '''
+    Inflector for pluralizing and singularizing Spanish nouns.
+    '''
+
+    irregular_words = {
+        'base': 'bases',
+        'carácter': 'caracteres',
+        'champú': 'champús',
+        'curriculum': 'currículos',
+        'espécimen': 'especímenes',
+        'jersey': 'jerséis',
+        'memorándum': 'memorandos',
+        'menú': 'menús',
+        'no': 'noes',
+        'país': 'países',
+        'referéndum': 'referendos',
+        'régimen': 'regímenes',
+        'sándwich': 'sándwiches',
+        'si': 'sis',  # Musical note. WARNING: causes side effects!
+        'taxi': 'taxis',
+        'ultimátum': 'ultimatos',
+    }
+
+    # These words either have the same form in singular and plural, or have no singular form at all
+    non_changing_words = [
+        'lunes', 'martes', 'miércoles', 'jueves', 'viernes',
+        'paraguas', 'tijeras', 'gafas', 'vacaciones', 'víveres',
+        'cumpleaños', 'virus', 'atlas', 'sms', 'hummus',
+    ]
+
+
+    def pluralize(self, word):
+        '''
+        Pluralizes Spanish nouns.
+        Input must be a (Unicode) str; byte strings are not decoded here.
+        The result is returned as a str.
+        '''
+
+        # The input is used as-is; there is no utils.unicodify() round trip here.
+
+        rules = [
+            ['(?i)([aeiou])x$', '\\1x'],
+            # This could fail if the word is oxytone.
+            ['(?i)([áéíóú])([ns])$', '|1\\2es'],
+            ['(?i)(^[bcdfghjklmnñpqrstvwxyz]*)an$', '\\1anes'],  # clan->clanes
+            ['(?i)([áéíóú])s$', '|1ses'],
+            ['(?i)(^[bcdfghjklmnñpqrstvwxyz]*)([aeiou])([ns])$', '\\1\\2\\3es'],  # tren->trenes
+            ['(?i)([aeiouáéó])$', '\\1s'],  # casa->casas, padre->padres, papá->papás
+            ['(?i)([aeiou])s$', '\\1s'],  # atlas->atlas, virus->virus, etc.
+            ['(?i)([éí])(s)$', '|1\\2es'],  # inglés->ingleses
+            ['(?i)z$', 'ces'],  # luz->luces
+            ['(?i)([íú])$', '\\1es'],  # ceutí->ceutíes, tabú->tabúes
+            ['(?i)(ng|[wckgtp])$', '\\1s'],  # Anglicisms such as puenting, frac, crack, show (in which cases could this fail?)
+            ['(?i)$', 'es']  # ELSE +es (e.g. árbol->árboles)
+        ]
+
+        lower_cased_word = word.lower()
+
+        for uncountable_word in self.non_changing_words:
+            if lower_cased_word[-1 * len(uncountable_word):] == uncountable_word:
+                return word
+
+        for irregular_singular, irregular_plural in self.irregular_words.items():
+            match = re.search('(?i)(^' + irregular_singular + ')$', word, re.IGNORECASE)
+            if match:
+                result = re.sub('(?i)' + irregular_singular + '$', match.expand('\\1')[0] + irregular_plural[1:], word)
+                return result
+
+        for rule in rules:
+            match = re.search(rule[0], word, re.IGNORECASE)
+            if match:
+                groups = match.groups()
+                replacement = rule[1]
+                if re.match(r'\|', replacement):
+                    for k in range(1, len(groups) + 1):  # "|k" markers are 1-based and include the last group
+                        replacement = replacement.replace('|' + str(k), self.string_replace(groups[k - 1], 'ÁÉÍÓÚáéíóú', 'AEIOUaeiou'))
+
+                result = re.sub(rule[0], replacement, word)
+                # This accents nouns that become proparoxytone when
+                # pluralized, such as esmóquines, jóvenes...
+                match = re.search('(?i)([aeiou]).{1,3}([aeiou])nes$', result)
+
+                if match and len(match.groups()) > 1 and not re.search('(?i)[áéíóú]', word):
+                    result = result.replace(match.group(0), self.string_replace(
+                        match.group(1), 'AEIOUaeiou', 'ÁÉÍÓÚáéíóú') + match.group(0)[1:])
+
+                return result
+
+        return word
+
+
+    def singularize(self, word):
+        '''
+        Singularizes Spanish nouns.
+        Input must be a (Unicode) str; byte strings are not decoded here.
+        The result is returned as a str.
+        '''
+
+        # all internal calculations are done in Unicode
+
+        rules = [
+            [r'(?i)^([bcdfghjklmnñpqrstvwxyz]*)([aeiou])([ns])es$', '\\1\\2\\3'],
+            [r'(?i)([aeiou])([ns])es$', '~1\\2'],
+            [r'(?i)shes$', 'sh'],  # flashes->flash
+            [r'(?i)oides$', 'oide'],  # androides->androide
+            [r'(?i)(sis|tis|xis)$', '\\1'],  # crisis, apendicitis, praxis
+            [r'(?i)(é)s$', '\\1'],  # bebés->bebé
+            [r'(?i)(ces)$', 'z'],  # luces->luz
+            [r'(?i)([^e])s$', '\\1'],  # casas->casa
+            [r'(?i)([bcdfghjklmnñprstvwxyz]{2,}e)s$', '\\1'],  # cofres->cofre
+            [r'(?i)([ghñptv]e)s$', '\\1'],  # llaves->llave, radiocasetes->radiocasete
+            [r'(?i)jes$', 'je'],  # ejes->eje
+            [r'(?i)ques$', 'que'],  # tanques->tanque
+            [r'(?i)es$', '']  # ELSE remove _es_ monitores->monitor
+        ]
+
+        lower_cased_word = word.lower()
+
+        for uncountable_word in self.non_changing_words:
+            if lower_cased_word[-1 * len(uncountable_word):] == uncountable_word:
+                return word
+
+        for irregular_singular, irregular_plural in self.irregular_words.items():
+            match = re.search('(^' + irregular_plural + ')$', word, re.IGNORECASE)
+            if match:
+                result = re.sub('(?i)' + irregular_plural + '$', match.expand('\\1')[0] + irregular_singular[1:], word)
+                return result
+
+        for rule in rules:
+            match = re.search(rule[0], word, re.IGNORECASE)
+            if match:
+                groups = match.groups()
+                replacement = rule[1]
+                if re.match('~', replacement):
+                    for k in range(1, len(groups) + 1):  # "~k" markers are 1-based and include the last group
+                        replacement = replacement.replace('~' + str(k), self.string_replace(groups[k - 1], 'AEIOUaeiou', 'ÁÉÍÓÚáéíóú'))
+
+                result = re.sub(rule[0], replacement, word)
+                # This is a possible solution to the double-accent
+                # problem. A bit hacky, but it works
+                match = re.search('(?i)([áéíóú]).*([áéíóú])', result)
+
+                if match and len(match.groups()) > 1 and not re.search('(?i)[áéíóú]', word):
+                    result = self.string_replace(
+                        result, 'ÁÉÍÓÚáéíóú', 'AEIOUaeiou')
+
+                return result
+
+        return word
+
+
+# Copyright (c) 2006 Bermi Ferrer Martinez
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software to deal in this software without restriction, including
+# without limitation the rights to use, copy, modify, merge, publish,
+# distribute, sublicense, and/or sell copies of this software, and to permit
+# persons to whom this software is furnished to do so, subject to the following
+# condition:
+#
+# THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THIS SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THIS SOFTWARE.
+
diff --git a/rules/__init__.py b/rules/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/rules/spanish.py b/rules/spanish.py
deleted file mode 100644
index 2c9f283..0000000
--- a/rules/spanish.py
+++ /dev/null
@@ -1,186 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-# coding=utf-8
-# Copyright (c) 2006 Bermi Ferrer Martinez
-# Copyright (c) 2006 Carles Sadurní Anguita
-#
-# bermi a-t bermilabs - com
-#
-# See the end of this file for the free software, open source license
-# (BSD-style).
-
-import re
-from base import Base
-import utils
-
-
-class Spanish (Base):
-    '''
-    Inflector for pluralize and singularize Spanish nouns.
- ''' - - irregular_words = { - u'base': u'bases', - u'carácter': u'caracteres', - u'champú': u'champús', - u'curriculum': u'currículos', - u'espécimen': u'especímenes', - u'jersey': u'jerséis', - u'memorándum': u'memorandos', - u'menú': u'menús', - u'no': u'noes', - u'país': u'países', - u'referéndum': u'referendos', - u'régimen': u'regímenes', - u'sándwich': u'sándwiches', - u'si': u'sis', # Nota musical ALERTA: ¡provoca efectos secundarios! - u'taxi': u'taxis', - u'ultimátum': u'ultimatos', - } - - # These words either have the same form in singular and plural, or have no singular form at all - non_changing_words = [ - u'lunes', u'martes', u'miércoles', u'jueves', u'viernes', - u'paraguas', u'tijeras', u'gafas', u'vacaciones', u'víveres', - u'cumpleaños', u'virus', u'atlas', u'sms', u'hummus', - ] - - - def pluralize(self, word): - ''' - Pluralizes Spanish nouns. - Input string can be Unicode (e.g. u"palabra"), or a str encoded in UTF-8 or Latin-1. - Output string will be encoded the same way as the input. - ''' - - word, origType = utils.unicodify(word) # all internal calculations are done in Unicode - - rules = [ - [u'(?i)([aeiou])x$', u'\\1x'], - # This could fail if the word is oxytone. - [u'(?i)([áéíóú])([ns])$', u'|1\\2es'], - [u'(?i)(^[bcdfghjklmnñpqrstvwxyz]*)an$', u'\\1anes'], # clan->clanes - [u'(?i)([áéíóú])s$', u'|1ses'], - [u'(?i)(^[bcdfghjklmnñpqrstvwxyz]*)([aeiou])([ns])$', u'\\1\\2\\3es'], # tren->trenes - [u'(?i)([aeiouáéó])$', u'\\1s'], # casa->casas, padre->padres, papá->papás - [u'(?i)([aeiou])s$', u'\\1s'], # atlas->atlas, virus->virus, etc. - [u'(?i)([éí])(s)$', u'|1\\2es'], # inglés->ingleses - [u'(?i)z$', u'ces'], # luz->luces - [u'(?i)([íú])$', u'\\1es'], # ceutí->ceutíes, tabú->tabúes - [u'(?i)(ng|[wckgtp])$', u'\\1s'], # Anglicismos como puenting, frac, crack, show (En que casos podría fallar esto?) - [u'(?i)$', u'es'] # ELSE +es (v.g. 
árbol->árboles) - ] - - lower_cased_word = word.lower() - - for uncountable_word in self.non_changing_words: - if lower_cased_word[-1 * len(uncountable_word):] == uncountable_word: - return utils.deunicodify(word, origType) - - for irregular_singular, irregular_plural in self.irregular_words.iteritems(): - match = re.search(u'(?i)(^' + irregular_singular + u')$', word, re.IGNORECASE) - if match: - result = re.sub(u'(?i)' + irregular_singular + u'$', match.expand(u'\\1')[0] + irregular_plural[1:], word) - return utils.deunicodify(result, origType) - - for rule in rules: - match = re.search(rule[0], word, re.IGNORECASE) - if match: - groups = match.groups() - replacement = rule[1] - if re.match(u'\|', replacement): - for k in range(1, len(groups)): - replacement = replacement.replace(u'|' + unicode( - k), self.string_replace(groups[k - 1], u'ÁÉÍÓÚáéíóú', u'AEIOUaeiou')) - - result = re.sub(rule[0], replacement, word) - # Esto acentúa los sustantivos que al pluralizarse se - # convierten en esdrújulos como esmóquines, jóvenes... - match = re.search(u'(?i)([aeiou]).{1,3}([aeiou])nes$', result) - - if match and len(match.groups()) > 1 and not re.search(u'(?i)[áéíóú]', word): - result = result.replace(match.group(0), self.string_replace( - match.group(1), u'AEIOUaeiou', u'ÁÉÍÓÚáéíóú') + match.group(0)[1:]) - - return utils.deunicodify(result, origType) - - return utils.deunicodify(word, origType) - - - def singularize(self, word): - ''' - Singularizes Spanish nouns. - Input string can be Unicode (e.g. u"palabras"), or a str encoded in UTF-8 or Latin-1. - Output string will be encoded the same way as the input. 
- ''' - - word, origType = utils.unicodify(word) # all internal calculations are done in Unicode - - rules = [ - [ur'(?i)^([bcdfghjklmnñpqrstvwxyz]*)([aeiou])([ns])es$', u'\\1\\2\\3'], - [ur'(?i)([aeiou])([ns])es$', u'~1\\2'], - [ur'(?i)shes$', u'sh'], # flashes->flash - [ur'(?i)oides$', u'oide'], # androides->androide - [ur'(?i)(sis|tis|xis)$', u'\\1'], # crisis, apendicitis, praxis - [ur'(?i)(é)s$', u'\\1'], # bebés->bebé - [ur'(?i)(ces)$', u'z'], # luces->luz - [ur'(?i)([^e])s$', u'\\1'], # casas->casa - [ur'(?i)([bcdfghjklmnñprstvwxyz]{2,}e)s$', u'\\1'], # cofres->cofre - [ur'(?i)([ghñptv]e)s$', u'\\1'], # llaves->llave, radiocasetes->radiocasete - [ur'(?i)jes$', u'je'], # ejes->eje - [ur'(?i)ques$', u'que'], # tanques->tanque - [ur'(?i)es$', u''] # ELSE remove _es_ monitores->monitor - ] - - lower_cased_word = word.lower() - - for uncountable_word in self.non_changing_words: - if lower_cased_word[-1 * len(uncountable_word):] == uncountable_word: - return utils.deunicodify(word, origType) - - for irregular_singular, irregular_plural in self.irregular_words.iteritems(): - match = re.search(u'(^' + irregular_plural + u')$', word, re.IGNORECASE) - if match: - result = re.sub(u'(?i)' + irregular_plural + u'$', match.expand(u'\\1')[0] + irregular_singular[1:], word) - return utils.deunicodify(result, origType) - - for rule in rules: - match = re.search(rule[0], word, re.IGNORECASE) - if match: - groups = match.groups() - replacement = rule[1] - if re.match(u'~', replacement): - for k in range(1, len(groups)): - replacement = replacement.replace(u'~' + unicode( - k), self.string_replace(groups[k - 1], u'AEIOUaeiou', u'ÁÉÍÓÚáéíóú')) - - result = re.sub(rule[0], replacement, word) - # Esta es una posible solución para el problema de dobles - # acentos. 
Un poco guarrillo pero funciona - match = re.search(u'(?i)([áéíóú]).*([áéíóú])', result) - - if match and len(match.groups()) > 1 and not re.search(u'(?i)[áéíóú]', word): - result = self.string_replace( - result, u'ÁÉÍÓÚáéíóú', u'AEIOUaeiou') - - return utils.deunicodify(result, origType) - - return utils.deunicodify(word, origType) - - -# Copyright (c) 2006 Bermi Ferrer Martinez -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software to deal in this software without restriction, including -# without limitation the rights to use, copy, modify, merge, publish, -# distribute, sublicense, and/or sell copies of this software, and to permit -# persons to whom this software is furnished to do so, subject to the following -# condition: -# -# THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THIS SOFTWARE OR THE USE OR OTHER DEALINGS IN -# THIS SOFTWARE. - diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..92d14b6 --- /dev/null +++ b/setup.py @@ -0,0 +1,13 @@ +# coding = utf-8 + +from setuptools import setup, find_packages + + +setup(name='inflector', + version='0.1a', + description='Inflects English, French and Spanish nouns. 
Similar to Rails inflector but for Python', + url='https://github.com/bermi/Python-Inflector', + author='Bermi Ferrer', + license='BSD', + packages=['inflector','inflector.rules'], + zip_safe=False) \ No newline at end of file diff --git a/tests_es.py b/tests_es.py index 6f75030..55cf732 100755 --- a/tests_es.py +++ b/tests_es.py @@ -7,7 +7,7 @@ # import unittest from inflector import Inflector -from rules.spanish import Spanish +from inflector import Spanish class SpanishInflectorTestCase(unittest.TestCase): diff --git a/utils.py b/utils.py deleted file mode 100644 index e4f572e..0000000 --- a/utils.py +++ /dev/null @@ -1,44 +0,0 @@ -__author__ = 'omrio' - -import unicodedata - - -def unicodify(st): - ''' - Convert the given string to normalized Unicode (i.e. combining characters such as accents are combined) - If given arg is not a string, it's returned as is, and origType is 'noConversion'. - @return a tuple with the unicodified string and the original string encoding. - ''' - - # Convert 'st' to Unicode - if isinstance(st, unicode): - origType = 'unicode' - elif isinstance(st, str): - try: - st = st.decode('utf8') - origType = 'utf8' - except UnicodeDecodeError: - try: - st = st.decode('latin1') - origType = 'latin1' - except: - raise UnicodeEncodeError('Given string %s must be either Unicode, UTF-8 or Latin-1' % repr(st)) - else: - origType = 'noConversion' - - # Normalize the Unicode (to combine any combining characters, e.g. accents, into the previous letter) - if origType != 'noConversion': - st = unicodedata.normalize('NFKC', st) - - return st, origType - - -def deunicodify(unicodifiedStr, origType): - ''' - Convert the given unicodified string back to its original type and encoding - ''' - - if origType == 'unicode': - return unicodifiedStr - - return unicodifiedStr.encode(origType)