# Source code for eWRT.lib.thirdparty.advas.phonetics

# ----------------------------------------------------------
# AdvaS Advanced Search
# module for phonetic algorithms
#
# (C) 2002 - 2005 Frank Hofmann, Chemnitz, Germany
# email fh@efho.de
# ----------------------------------------------------------

# changed 2005-01-24

import string
import re

# Soundex digit for every upper-case letter; "0" marks letters without a code.
# Built once at import time and shared by all soundex() calls.
_SOUNDEX_CODES = dict(zip('ABCDEFGHIJKLMNOPQRSTUVWXYZ',
                          '01230120022455012623010202'))

def soundex(term):
    """Return the four-character soundex code of ``term``.

    Soundex hashes an English word into an alpha-numerical value that
    represents what the word sounds like.  For more information on soundex
    and some notes on the differences between implementations visit:
    http://www.bluepoof.com/Soundex/info.html

    The term is upper-cased and its first character is kept verbatim.  Each
    following letter is mapped to its soundex digit, letters without a code
    are dropped, and a digit equal to the previously kept character is
    skipped.  Characters outside A-Z are passed through unchanged.  The
    result is padded with "0" and cut to four characters.

    Note that uncoded letters are dropped before equal digits are collapsed,
    so equal digits separated only by such letters are merged as well, and
    the digit of the first letter is not compared with the digit after it.

    Based on the version by Nathan Heagy (Front Logic Inc., Jul 29 2000),
    changed by Frank Hofmann (Jan 02 2005).

    :param term: the word to encode
    :returns: the soundex code, or "0000" for an empty ``term``
    """
    if not term:
        return "0000"  # could be Z000 for compatibility with other implementations

    term = term.upper()
    code = term[0]

    for char in term[1:]:
        digit = _SOUNDEX_CODES.get(char, char)
        # skip uncoded letters and digits repeated in a row
        if digit != "0" and digit != code[-1]:
            code += digit

    # always pad to at least four characters: padding by len(code) zeros
    # would leave a one-character code only two characters long
    return (code + "000")[:4]

def metaphone(term):
    """Return the metaphone code for ``term``.

    Implementation of the original algorithm from Lawrence Philips,
    extended/rewritten by M. Kuhn, with improvements thanks to
    John Machin <sjmachin@lexicon.net>.

    Processing steps:

    1. lower-case the term and drop every character outside a-z
    2. conflate repeated letters
    3. remove all vowels (a, e, i, o, u) except a leading one
    4. translate a special first-letter combination (gn, kn, pn, wr, wh,
       or a leading x), if present
    5. translate the remaining letters one by one, looking at the
       neighbouring letters for context

    NOTE(review): because step 3 runs before steps 4 and 5, context rules
    that look for a following vowel (e.g. ``c[iey]``, ``si[ao]``, the
    initial ``ae``) can only ever match on ``y``.  This ordering is kept
    as-is to preserve existing codes; confirm whether it is intended.

    :param term: the word to encode
    :returns: the metaphone code; an empty string if ``term`` is empty or
        contains no letters a-z
    """
    code = ""

    if not term:
        return code

    # extension #1 and #2 (added 2005-01-28):
    # convert to lowercase and remove all non-english characters
    term = re.sub(r'[^a-z]', '', term.lower())
    if not term:
        # nothing left
        return code

    # extension #3 (added 2005-01-24): conflate repeated letters
    conflated = term[0]
    for char in term[1:]:
        if char != conflated[-1]:
            conflated += char

    # extension #4 (added 2005-01-24):
    # remove any vowels unless a vowel is the first letter
    term = conflated[0] + re.sub(r'[aeiou]', '', conflated[1:])

    # exceptions at the beginning of the word
    initial_trans = {
        "ae": "e",
        "gn": "n",
        "kn": "n",
        "pn": "n",
        "wr": "r",
        "wh": "w",
    }

    first_chars = term[:2]
    if first_chars in initial_trans:
        code = initial_trans[first_chars]
        term = term[2:]
    elif term[0] == "x":
        # a leading x sounds like s, whatever follows it
        code = "s"
        term = term[1:]
    term_length = len(term)

    # standard translation table
    st_trans = {
        "b": "b",
        "c": "k",
        "d": "t",
        "g": "k",
        "h": "h",
        "k": "k",
        "p": "p",
        "q": "k",
        "s": "s",
        "t": "t",
        "v": "f",
        "w": "w",
        "x": "ks",
        "y": "y",
        "z": "s",
    }

    for i in range(term_length):
        char = term[i]
        has_next = i < term_length - 1

        # context windows: part_n_* start at the current letter,
        # part_c_* start at the previous letter
        part_n_2 = term[i:i+2] if has_next else ""
        part_n_3 = term[i:i+3] if i < term_length - 2 else ""
        part_n_4 = term[i:i+4] if i < term_length - 3 else ""
        # previous + current letter; also needed for the last letter of
        # the term, otherwise a final "ck" or "th" keeps its second letter
        part_c_2 = term[i-1:i+1] if i > 0 else ""
        part_c_3 = term[i-1:i+2] if (i > 0 and has_next) else ""

        if char == "b":
            add_char = st_trans["b"]
            # silent b in a final "mb"
            if i == term_length - 1 and i > 0 and term[i-1] == "m":
                add_char = ""
        elif char == "c":
            add_char = st_trans["c"]
            if part_n_2 == "ch":
                add_char = "x"
            elif re.search(r'c[iey]', part_n_2):
                add_char = "s"

            if part_n_3 == "cia":
                add_char = "x"

            if re.search(r'sc[iey]', part_c_3):
                add_char = ""
        elif char == "d":
            add_char = st_trans["d"]
            if re.search(r'dg[eyi]', part_n_3):
                add_char = "j"
        elif char == "g":
            add_char = st_trans["g"]

            if part_n_2 == "gh":
                if i == term_length - 2:
                    add_char = ""
            elif re.search(r'gh[aeiouy]', part_n_3):
                # never reached: any "gh" is handled by the branch above
                add_char = ""
            elif part_n_2 == "gn":
                add_char = ""
            elif part_n_4 == "gned":
                # never reached: any "gn" is handled by the branch above
                add_char = ""
            elif re.search(r'dg[eyi]', part_c_3):
                add_char = ""
            elif part_n_2 == "gi":
                if part_c_3 != "ggi":
                    add_char = "j"
            elif part_n_2 == "ge":
                if part_c_3 != "gge":
                    add_char = "j"
            elif part_n_2 == "gy":
                if part_c_3 != "ggy":
                    add_char = "j"
            elif part_n_2 == "gg":
                add_char = ""
        elif char == "h":
            add_char = st_trans["h"]
            if re.search(r'[aeiouy]h[^aeiouy]', part_c_3):
                add_char = ""
            elif re.search(r'[csptg]h', part_c_2):
                add_char = ""
        elif char == "k":
            add_char = st_trans["k"]
            if part_c_2 == "ck":
                add_char = ""
        elif char == "p":
            add_char = st_trans["p"]
            if part_n_2 == "ph":
                add_char = "f"
        elif char == "s":
            add_char = st_trans["s"]
            if part_n_2 == "sh":
                add_char = "x"

            if re.search(r'si[ao]', part_n_3):
                add_char = "x"
        elif char == "t":
            add_char = st_trans["t"]
            if part_n_2 == "th":
                add_char = "0"

            if re.search(r'ti[ao]', part_n_3):
                add_char = "x"
        elif char == "w":
            add_char = st_trans["w"]
            if re.search(r'w[^aeiouy]', part_n_2):
                add_char = ""
        elif char in st_trans:
            # q, v, x, y, z: plain table lookup without context rules
            add_char = st_trans[char]
        else:
            # every other letter is copied unchanged
            add_char = char

        code = code + add_char

    # return metaphone code
    return code

def nysiis(term):
    """Return the New York State Identification and Intelligence
    Algorithm (NYSIIS) code for the given term.

    The term is lower-cased, then processed in this order:

    1. one prefix rule is applied (mac, sch, kn, ph, pf, k)
    2. one suffix rule is applied (ee, ie, dt, rt, rd, nt, nd)
    3. a fixed sequence of letter substitutions is applied
    4. "h" after a non-vowel and "w" after a vowel are replaced by the
       preceding letter
    5. a trailing "s" and then a trailing "a" are removed

    :param term: the word to encode
    :returns: the NYSIIS code; an empty string if ``term`` is empty
    """
    if not term:
        return ""

    # the rules below are written for lower-case letters only
    term = term.lower()

    # translation of the first characters; an ordered sequence (longest
    # entries first) so that "kn" is always tried before "k"
    prefix_rules = (
        ("mac", "mcc"),
        ("sch", "sss"),
        ("kn", "nn"),
        ("ph", "ff"),
        ("pf", "ff"),
        ("k", "c"),
    )
    for old, new in prefix_rules:
        if term.startswith(old):
            term = new + term[len(old):]
            break

    # translation of the last characters
    suffix_rules = (
        ("ee", "y"),
        ("ie", "y"),
        ("dt", "d"),
        ("rt", "d"),
        ("rd", "d"),
        ("nt", "d"),
        ("nd", "d"),
    )
    for old, new in suffix_rules:
        if term.endswith(old):
            # cut exactly the matched suffix, independent of which
            # prefix rule was applied before
            term = term[:-len(old)] + new
            break

    code = term

    # transform ev->af
    code = code.replace("ev", "af")

    # transform a,e,i,o,u,y->a
    code = re.sub(r'[aeiouy]', r'a', code)

    # transform q->g, z->s, m->n
    code = code.replace("q", "g")
    code = code.replace("z", "s")
    code = code.replace("m", "n")

    # transform kn->n, then k->c
    code = code.replace("kn", "n")
    code = code.replace("k", "c")

    # transform sch->sss
    code = code.replace("sch", "sss")

    # transform ph->ff
    code = code.replace("ph", "ff")

    # transform h-> if previous or next is nonvowel -> previous
    # NOTE(review): the second group is lazy and sits at the end of the
    # pattern, so it always captures an empty string; in effect only the
    # preceding letter is tested.  Kept as-is; confirm intended behaviour.
    occur = re.findall(r'([a-z]{0,1}?)h([a-z]{0,1}?)', code)
    for previous, following in occur:
        if re.match(r'[^aeiouy]', previous) or re.match(r'[^aeiouy]', following):
            if previous != "":
                # make substitution (first occurrence only)
                code = code.replace(previous + "h", previous * 2, 1)

    # transform w-> if previous is vowel -> previous
    for previous in re.findall(r'([aeiouy]{1}?)w', code):
        # make substitution (first occurrence only)
        code = code.replace(previous + "w", previous * 2, 1)

    # check last character
    # -s, remove
    code = re.sub(r's$', r'', code)
    # -ay, replace by -y
    code = re.sub(r'ay$', r'y', code)
    # -a, remove
    code = re.sub(r'a$', r'', code)

    # return nysiis code
    return code

# Caverphone 2.0 substitutions as (pattern, replacement) pairs.  They are
# applied one after another, so the order of the entries matters.
_CAVERPHONE_RULES = (
    # cough, rough, tough, enough, trough at the start -> cou2f, rou2f, ...
    (r'^([crt]|(en)|(tr))ough', r'\1ou2f'),
    # initial gn -> 2n
    (r'^gn', r'2n'),
    # final mb -> m2
    (r'mb$', r'm2'),
    # cq -> 2q
    (r'cq', r'2q'),
    # c[i,e,y] -> s[i,e,y]
    (r'c([iey])', r's\1'),
    # tch -> 2ch
    (r'tch', r'2ch'),
    # c,q,x -> k
    (r'[cqx]', r'k'),
    # v -> f
    (r'v', r'f'),
    # dg -> 2g
    (r'dg', r'2g'),
    # ti[o,a] -> si[o,a]
    (r'ti([oa])', r'si\1'),
    # d -> t
    (r'd', r't'),
    # ph -> fh
    (r'ph', r'fh'),
    # b -> p
    (r'b', r'p'),
    # sh -> s2
    (r'sh', r's2'),
    # z -> s
    (r'z', r's'),
    # initial vowel -> A, all other vowels -> 3
    (r'^[aeiou]', r'A'),
    (r'[aeiou]', r'3'),
    # j -> y
    (r'j', r'y'),
    # initial y3 -> Y3, any other initial y -> A, remaining y -> 3
    (r'^y3', r'Y3'),
    (r'^y', r'A'),
    (r'y', r'3'),
    # 3gh3 -> 3kh3, remaining gh -> 22, remaining g -> k
    (r'3gh3', r'3kh3'),
    (r'gh', r'22'),
    (r'g', r'k'),
    # groups of s,t,p,k,f,m,n -> their single, upper-case equivalent
    (r's+', r'S'),
    (r't+', r'T'),
    (r'p+', r'P'),
    (r'k+', r'K'),
    (r'f+', r'F'),
    (r'm+', r'M'),
    (r'n+', r'N'),
    # w[3,h3] -> W[3,h3], final w -> 3, remaining w -> 2
    (r'w(h?3)', r'W\1'),
    (r'w$', r'3'),
    (r'w', r'2'),
    # initial h -> A, remaining h -> 2
    (r'^h', r'A'),
    (r'h', r'2'),
    # r3 -> R3, final r -> 3, remaining r -> 2
    (r'r3', r'R3'),
    (r'r$', r'3'),
    (r'r', r'2'),
    # l3 -> L3, final l -> 3, remaining l -> 2
    (r'l3', r'L3'),
    (r'l$', r'3'),
    (r'l', r'2'),
    # remove all 2's
    (r'2', r''),
    # final 3 -> A, then remove all remaining 3's
    (r'3$', r'A'),
    (r'3', r''),
)

def caverphone(term):
    """Return the language key of ``term`` using the caverphone algorithm 2.0.

    Developed at the University of Otago, New Zealand.
    Project: Caversham Project (http://caversham.otago.ac.nz)
    Developer: David Hood, University of Otago, New Zealand
    Contact: caversham@otago.ac.nz
    Project Technical Paper: http://caversham.otago.ac.nz/files/working/ctp150804.pdf
    Version 2.0 (2004-08-15)

    The term is lower-cased, everything outside a-z and a final "e" are
    removed, the substitution rules are applied in order, and the result
    is padded with "1" and cut to ten characters.

    :param term: the word to encode
    :returns: the ten-character caverphone code; an empty string if
        ``term`` is empty
    """
    if not term:
        return ""

    # convert to lowercase, remove anything not in the standard alphabet (a-z)
    code = re.sub(r'[^a-z]', '', term.lower())

    # remove final e
    if code.endswith("e"):
        code = code[:-1]

    for pattern, replacement in _CAVERPHONE_RULES:
        code = re.sub(pattern, replacement, code)

    # extend the code by 10 '1' (one) and take the first 10 characters
    return (code + '1' * 10)[:10]