EXmatcher/cologne_phonetics.py at master · exciteproject/EXmatcher · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Cologne_phonetics is a Python implementation of the cologne-phonetics, a phonetic
algorithm similar to soundex but optimized for the german language

Documentation can be found at https://github.com/provinzkraut/cologne_phonetics

An online version and API are available at https://phonetics.provinzkraut.de

A detailed explanation of the cologne phonetics can be found at:
https://en.wikipedia.org/wiki/Cologne_phonetics
"""

__author__ = "Janek Nouvertné"
__version__ = "1.2.3"
__license__ = "MIT"

import re
import sys
from argparse import ArgumentParser, ArgumentTypeError

try:
    # The ABC aliases in ``collections`` were removed in Python 3.10; the
    # abstract base classes live in ``collections.abc``.
    from collections.abc import Iterable
except ImportError:  # pragma: no cover - fallback for very old interpreters
    from collections import Iterable


# Cheap pre-check: matches any character that has an entry in the
# replacement table below.
RGX_SPECIAL_CHARS = re.compile(r"[äüößéèáàç]")

# Ordered (compiled pattern, replacement) pairs that transliterate umlauts,
# sharp s and accented letters to plain ASCII letters.
RGX_SPECIAL_CHAR_REPLACEMENTS = [
    (re.compile(special), plain)
    for special, plain in (
        ("ä", "ae"),
        ("ö", "oe"),
        ("ü", "ue"),
        ("ß", "s"),
        ("é", "e"),
        ("è", "e"),
        ("á", "a"),
        ("à", "a"),
        ("ç", "c"),
    )
]

# Ordered (compiled pattern, replacement) pairs implementing the cologne
# phonetics. The rules are meant to be applied one after another, top to
# bottom, to a lowercased word; later rules rely on what earlier rules left
# behind, so the order must not be changed.
RGX_RULES = [
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        # Drop everything that is not a lowercase ASCII letter (special
        # characters that have not been replaced at this point).
        (r"[^a-z]", ""),

        # d, t: "2" unless followed by c, s or z ...
        (r"[dt](?![csz])", "2"),
        # ... and "8" when followed by c, s or z.
        (r"[dt](?=[csz])", "8"),

        # x not preceded by c, k or q becomes "48".
        (r"(?<![ckq])x", "48"),
        # x preceded by c, k or q becomes "8"; a literal x is re-inserted so
        # the c rules below can still look ahead at it. It is removed again
        # by the [hx] rule further down.
        (r"(?<=[ckq])x", "x8"),

        # c becomes "4" at the start of the word before a,h,k,l,o,q,r,u,x,
        # or anywhere before a,h,k,o,q,u,x when not preceded by s or z.
        (r"^c(?=[ahkloqrux])|(?<![sz])c(?=[ahkoqux])", "4"),
        # c becomes "8" when not followed by a,h,k,o,q,u,x, when preceded by
        # s or z, or at the start when not followed by a,h,k,l,o,q,r,u,x.
        (r"c(?![ahkoqux])|(?<=[sz])c|^c(?![ahkloqrux])", "8"),

        # p not followed by h, and b.
        (r"p(?!h)|b", "1"),
        # p followed by h, and f, v, w.
        (r"p(?=h)|[fvw]", "3"),
        # h and any x left over from the x rules carry no code.
        (r"[hx]", ""),
        (r"[aeijouy]", "0"),
        (r"[gkq]", "4"),
        (r"l", "5"),
        (r"[mn]", "6"),
        (r"r", "7"),
        (r"[sz]", "8"),

        # Collapse runs of the same digit into a single digit.
        (r"(\d)(?=\1)", ""),
        # Remove every 0 that does not sit at a word boundary, which keeps
        # only a leading 0.
        (r"\B0", ""),
    )
]


def encode(data, concat=False):
    """
    Encode every word of a string with the cologne phonetics.

    :param data str: Input to be encoded. Every whitespace character is
        interpreted as a wordbreak.
    :param concat bool: The intended behaviour of the cologne-phonetics
        is to ignore special characters, which concatenates the parts of a
        hyphenated string. With the default ``False`` hyphens are turned
        into wordbreaks, so every part is encoded on its own. With ``True``
        hyphens are left in place and dropped during encoding, so a
        hyphenated string is encoded as one single word.

    :rtype: list
    :return: A list with one ``(word, code)`` tuple per word. ``word`` is
        the lowercased substring with special characters (umlauts, accents)
        already transliterated, ``code`` is its phonetic encoding.
        Consecutive wordbreaks yield ``("", "")`` entries.

    :note: Contrary to many other implementations, in the final pass only
        repeated **digits** are removed, not repeated **numbers**. Resulting
        e.g. in ``xx`` being encoded as ``4848`` and not ``48``.
    """

    def _replace_by_rules(rules, s):
        # Rules are order dependent, so they are applied strictly in sequence.
        for pattern, replacement in rules:
            s = pattern.sub(replacement, s)
        return s

    def _enc(word):
        normalized = word.lower()
        # Only walk the transliteration table when there is something to do.
        if RGX_SPECIAL_CHARS.search(normalized):
            normalized = _replace_by_rules(RGX_SPECIAL_CHAR_REPLACEMENTS,
                                           normalized)
        return normalized, _replace_by_rules(RGX_RULES, normalized)

    if not concat:
        data = data.replace("-", " ")

    # Split on every single whitespace character (not only " "), as the
    # contract above promises. Splitting per character instead of per run
    # keeps the result for space-separated input unchanged, including the
    # empty entries produced by consecutive spaces and by empty input.
    return [_enc(word) for word in re.split(r"\s", data)]


def compare(*data, concat=False):
    """
    Encode and compare strings.

    :param *data: Data to compare. Either at least 2 positional arguments or
        a single iterable (list, tuple, generator, ...) of strings.
    :param concat bool: Passed to `encode()`

    :returns: True if all inputs have the same encoding, otherwise False

    :raises: ValueError if fewer than two input strings are given.
    """

    # A lone non-string iterable is treated as the collection of values to
    # compare. It is materialised so that generators can be measured with
    # len() and iterated afterwards.
    if (len(data) == 1
            and not isinstance(data[0], str)
            and isinstance(data[0], Iterable)):
        data = tuple(data[0])

    if not data:
        raise ValueError("Compare called without any values")
    if len(data) == 1:
        # Wrap in a tuple so a tuple-valued item cannot break the formatting.
        raise ValueError('Compare called with only one value: "%s"' % (data[0],))

    reference = None
    for s in data:
        codes = [code for _, code in encode(s, concat=concat)]
        if reference is None:
            # First input: everything that follows is compared against it.
            reference = codes
        elif codes != reference:
            return False
    return True


def cli(argv=None):
    """
    Command line entry point: encode one string and print the result.

    :param argv list: Arguments to parse. ``None`` (the default) lets
        argparse read them from ``sys.argv``.

    Prints the codes of all words separated by ``", "``. With ``--verbose``
    every code is prefixed with the word it belongs to; ``--pretty``
    additionally puts every word on its own line.
    """
    parser = ArgumentParser(description=__doc__)
    parser.add_argument("data",
                        help="string to be encoded")
    parser.add_argument("-c", "--concat",
                        action="store_true",
                        help="treat words connected by hyphens as a single "
                             "word instead of separate words")
    parser.add_argument("-v", "--verbose",
                        action="store_true",
                        help="show detailed information")
    parser.add_argument("-p", "--pretty",
                        action="store_true",
                        help="use in combination with --verbose to format output nicely")
    args = parser.parse_args(argv)

    encoded = encode(args.data, concat=args.concat)
    if args.verbose:
        sep = '\n' if args.pretty else ', '
        out = sep.join(word + ": " + code for word, code in encoded)
    else:
        out = ', '.join(code for _, code in encoded)
    print(out)

# Run the command line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__": # pragma: no cover
    cli()