NYSIIS: Difference between revisions
(+ D entry) |
(→{{header|Java}}: added Java) |
||
Line 266:
mitchell: MATCAL
o'daniel: ODANAL</pre>
=={{header|Java}}==
{{works with|Java|8}}
<lang java>import static java.util.Arrays.*;
import static java.lang.System.out;
public class NYSIIS {
final static String[][] first = {{"MAC", "MCC"}, {"KN", "NN"}, {"K", "C"},
{"PH", "FF"}, {"PF", "FF"}, {"SCH", "SSS"}};
final static String[][] last = {{"EE", "Y"}, {"IE", "Y"}, {"DT", "D"},
{"RT", "D"}, {"RD", "D"}, {"NT", "D"}, {"ND", "D"}};
final static String Vowels = "AEIOU";
public static void main(String[] args) {
stream(args).parallel().map(NYSIIS::transcode).forEach(out::println);
}
static String transcode(String s) {
int len = s.length();
StringBuilder sb = new StringBuilder(len);
for (int i = 0; i < len; i++) {
char c = s.charAt(i);
if (c >= 'a' && c <= 'z')
sb.append((char) (c - 32));
else if (c >= 'A' && c <= 'Z')
sb.append(c);
}
replace(sb, 0, first);
replace(sb, sb.length() - 2, last);
len = sb.length();
sb.append(" ");
for (int i = 1; i < len; i++) {
char prev = sb.charAt(i - 1);
char curr = sb.charAt(i);
char next = sb.charAt(i + 1);
if (curr == 'E' && next == 'V')
sb.replace(i, i + 2, "AF");
else if (isVowel(curr))
sb.setCharAt(i, 'A');
else if (curr == 'Q')
sb.setCharAt(i, 'G');
else if (curr == 'Z')
sb.setCharAt(i, 'S');
else if (curr == 'M')
sb.setCharAt(i, 'N');
else if (curr == 'K' && next == 'N')
sb.setCharAt(i, 'N');
else if (curr == 'K')
sb.setCharAt(i, 'C');
else if (sb.indexOf("SCH", i) == i)
sb.replace(i, i + 3, "SSS");
else if (curr == 'P' && next == 'H')
sb.replace(i, i + 2, "FF");
else if (curr == 'H' && (!isVowel(prev) || !isVowel(next)))
sb.setCharAt(i, prev);
else if (curr == 'W' && isVowel(prev))
sb.setCharAt(i, prev);
if (sb.charAt(i) == prev) {
sb.deleteCharAt(i--);
len--;
}
}
sb.setLength(sb.length() - 1); // We've added a space
int lastPos = sb.length() - 1;
if (lastPos > 1) {
if (sb.lastIndexOf("AY") == lastPos - 1)
sb.delete(lastPos - 1, lastPos + 1).append("Y");
else if (sb.lastIndexOf("AS") == lastPos - 1)
sb.setLength(lastPos - 1);
else if (sb.charAt(lastPos) == 'S')
sb.setLength(lastPos);
else if (sb.charAt(lastPos) == 'A')
sb.setLength(lastPos);
}
if (sb.length() > 6)
sb.insert(6, '[').append(']');
return String.format("%s -> %s", s, sb);
}
private static void replace(StringBuilder sb, int start, String[][] maps) {
if (start >= 0)
for (String[] map : maps) {
if (sb.indexOf(map[0]) == start) {
sb.replace(start, start + map[0].length(), map[1]);
break;
}
}
}
private static boolean isVowel(char c) {
return Vowels.indexOf(c) != -1;
}
}</lang>
<pre>Matthews -> MAT
Carr -> CAR
Lawson -> LASAN
Chapman -> CAPNAN
O'Brien -> OBRAN
LouisXVI -> LASXV
Carlson -> CARLSA[N]
Lawrence -> LARANC
Greene -> GRAN
Jacobs -> JACAB
Willis -> WAL
browneIV -> BRANAV
Harper -> HARPAR
browneIII -> BRAN
Mclaughlin -> MCLAGL[AN]
mitchell -> MATCAL
Morrison -> MARASA[N]
o'daniel -> ODANAL
Franklin -> FRANCL[AN]
McDonald -> MCDANA[LD]
knight -> NAGT
McDaniel -> MCDANA[L]
Larson -> LARSAN
Silva -> SALV
Wheeler -> WALAR
Richards -> RACARD
Bishop -> BASAP
O'Banion -> OBANAN
McCormack -> MCARNA[C]
Watkins -> WATCAN
Lynch -> LYNC
Mackenzie -> MCANSY
brownsr -> BRANSR</pre>
=={{header|Perl 6}}==
|
Revision as of 03:30, 21 August 2014
This page uses content from Wikipedia. The original article was at NYSIIS. The list of authors can be seen in the page history. As with Rosetta Code, the text of Wikipedia is available under the GNU FDL. (See links for details on variance) |
The New York State Identification and Intelligence System phonetic code, commonly known as NYSIIS, is a phonetic algorithm for creating indices for words based on their pronunciation. The goal is for homophones to be encoded to the same representation so that they can be matched despite minor differences in spelling.
The task here is to implement the original NYSIIS algorithm, shown in Wikipedia, rather than any other subsequent modification. Also, before the algorithm is applied the input string should be converted to upper case with all white space removed.
An optional step is to handle multiple names, including double-barrelled names or double surnames (e.g. 'Hoyle-Johnson' or 'Vaughan Williams') and unnecessary suffixes/honours that are not required for indexing purposes (e.g. 'Jnr', 'Sr', 'III', etc) - a small selection will suffice. The original implementation is also restricted to six characters, but this is not a requirement.
- See also
Caché ObjectScript
Refactored code based on other examples to reduce footprint.
<lang cos> Class Utils.Phonetic [ Abstract ] {
ClassMethod EncodeName(pAlgorithm As %String = "", pName As %String = "", Output pCode As %String, pSuffixRem As %Boolean = 1, pTruncate As %Integer = 0) As %Status { // check algorithm and name Set pAlgorithm=$ZConvert(pAlgorithm, "l") If pAlgorithm="" Quit $$$ERROR($$$GeneralError, "No algorithm specified.") If $Case(pAlgorithm, "nysiis":1, :0)=0 Quit $$$ERROR($$$GeneralError, "Unknown algorithm specified.") If $Match(pName, ".*\d.*# no numbers") Quit $$$ERROR($$$GeneralError, "Name cannot contain numerics.")
// remove apostrophes, find punctuation and replace with spaces (exclude hyphens) Set pName=$Translate(pName, "'") Set pun=$ZStrip(pName, "*E'P", "-") Set pName=$Translate(pName, pun, $Justify(" ", $Length(pun)))
// convert name(s) to uppercase and remove all white space Set pName=$ZStrip($ZConvert(pName, "U"), "<=>W")
// remove suffixes (e.g. 'Jnr', 'OBE', 'DSC', etc), including roman numerals (e.g. 'II', 'VIII') // - http://en.wikipedia.org/wiki/List_of_post-nominal_letters_(United_Kingdom) If pSuffixRem { Set ords=$ListBuild("KG", "LG", "KT", "LT", "GCB", "KCB", "DCB", "CB", "GCMG", "KCMG", "DCMG", "CMG", "DSO", "GCVO", "KCVO", "DCVO", "CVO", "LVO", "MVO", "OM", "ISO", "GBE", "KBE", "DBE", "CBE", "OBE", "MBE", "CH") Set decs=$ListBuild("VC", "GC", "CGC", "RRC", "DSC", "MC", "DFC", "AFC", "ARRC", "OBI", "IOM") Set regexp="( )(SNR|SR|JNR|JR|ESQ|"_$ListToString(ords, "|")_"|"_$ListToString(decs, "|")_"|[IVX]+)" Set rem=##class(%Regex.Matcher).%New(regexp, pName) Set pName=rem.ReplaceAll("") }
// replace hyphen and white space, plus some final validation Set pName=$ZStrip($Translate(pName, "-", " "), "<=>W") If $Length($Piece(pName, " "))<2 Quit $$$ERROR($$$GeneralError, "Invalid name.")
// begin algorithm and truncate result, if necessary Set pCode="" For piece=1:1:$Length(pName, " ") { If pAlgorithm="nysiis" Set pCode=pCode_..ToNYSIIS($Piece(pName, " ", piece)) } If pTruncate { Set pName=pCode Set pCode=$Extract(pCode, 1, pTruncate) Set $Extract(pName, 1, pTruncate)="" If $Length(pName) Set pCode=pCode_"["_pName_"]" }
// finished Quit $$$OK }
ClassMethod ToNYSIIS(pName As %String) As %String { /* New York State Identification and Intelligence System (NYSIIS) Phonetic Encoder - http://en.wikipedia.org/wiki/New_York_State_Identification_and_Intelligence_System - http://www.dropby.com/indexLF.html?content=/NYSIIS.html */
// create regexp matcher instance, remove punctuation and convert all to upper case Set rem=##class(%Regex.Matcher).%New(" ") Set rem.Text=$ZConvert($ZStrip(pName, "*P"), "U")
// translate first characters of name: // => MAC->MCC, KN->N, K->C, PH/PF->FF, SCH->SSS For rule="^MAC->MCC", "^KN->N", "^K->C", "^(PH|PF)->FF", "^SCH->SSS" { Set rem.Pattern=$Piece(rule, "->") If rem.Locate() Set rem.Text=rem.ReplaceFirst($Piece(rule, "->", 2)) Quit }
// translate last characters of name: // => EE/IE->Y, DT/RT/RD/NT/ND->D For rule="(EE|IE)$->Y", "(DT|RT|RD|NT|ND)$->D" { Set rem.Pattern=$Piece(rule, "->") If rem.Locate() Set rem.Text=rem.ReplaceFirst($Piece(rule, "->", 2)) Quit }
// first character of key = first character of name Set pName1=$Extract(rem.Text, 1), rem.Text=$Extract(rem.Text, 2, *)
// translate remaining characters by following rules, incrementing by one character each time: // => EV->AF else A,E,I,O,U->A // => Q->G, Z->S, M->N // => KN->N else K->C // => SCH->SSS, PH->FF // => H->if previous or next is non-vowel, previous // => W->if previous is vowel, A (A is the only vowel left) // => add current to key if current is not same as the last key character Set ptr=0, rules=$ListBuild("EV->AF", "(A|E|I|O|U)->A", "Q->G", "Z->S", "M->N", "KN->N", "K->C", "SCH->SSS", "PH->FF", "H[^A]", "[^A]H", "AW->A") While $ListNext(rules, ptr, rule) { Set rem.Pattern=$Piece(rule, "->") If $Piece(rule, "->", 2)="", rem.Locate() { Set $Piece(rule, "->", 2)=$Translate(rem.Group, "H") } Set rem.Text=rem.ReplaceAll($Piece(rule, "->", 2)) } Set pName=$ZStrip(rem.Text, "=U") // remove duplicates
// if last character is S, remove it If $Extract(pName, *)="S" Set pName=$Extract(pName, 1, *-1)
// if last characters are AY, replace with Y If $Extract(pName, *-1, *)="AY" Set pName=$Extract(pName, 1, *-2)_"Y"
// if last character is A, remove it If $Extract(pName, *)="A" Set pName=$Extract(pName, 1, *-1)
// append translated key to removed first character Quit pName1_pName }
} </lang>
- Examples:
USER>For { Read !, name Quit:name="" Set sc=##class(Utils.Phonetic).EncodeName("nysiis", name, .code,, 6) If sc Write " -> ", code } knight -> NAGT mitchell -> MATCAL o'daniel -> ODANAL brown sr -> BRAN browne III -> BRAN browne IV -> BRAN O'Banion -> OBANAN Mclaughlin -> MCLAGL[AN] McCormack -> MCARNA[C] Chapman -> CHAPNA[N] Silva -> SALV McDonald -> MCDANA[LD] Lawson -> LASAN Jacobs -> JACAB Greene -> GRAN O'Brien -> OBRAN Morrison -> MARASA[N] Larson -> LARSAN Willis -> WAL Mackenzie -> MCANSY Carr -> CAR Lawrence -> LARANC Matthews -> MAT Richards -> RACARD Bishop -> BASAP Franklin -> FRANCL[AN] McDaniel -> MCDANA[L] Harper -> HARPAR Lynch -> LYNC Watkins -> WATCAN Carlson -> CARLSA[N] Wheeler -> WHALAR Louis XVI -> L Hoyle-Johnson -> HAYLJA[NSAN] Vaughan Williams -> VAGANW[ALAN] D'Souza -> DSAS de Sousa -> DSAS
D
<lang d>import std.stdio, std.regex, std.algorithm, std.range, std.string;
string replaceAt(in string text, in uint pos, in string[] fromList,
in string[] toList) pure /*nothrow*/ { foreach (const f, const t; zip(fromList, toList)) if (text[pos .. $].startsWith(f)) return [text[0 .. pos], t, text[pos + f.length .. $]].join; return text;
}
string replaceEnd(in string text, in string[] fromList,
in string[] toList) pure /*nothrow*/ { foreach (const f, const t; zip(fromList, toList)) if (text.endsWith(f)) return text[0 .. $ - f.length] ~ t; return text;
}
string nysiis(string name) /*pure nothrow*/ {
enum vowels = "AEIOU"; name = name.replaceAll(r"\W".regex, "").toUpper; name = name.replaceAt(0, ["MAC", "KN", "K", "PH", "PF", "SCH"], ["MCC", "N", "C", "FF", "FF", "SSS"]); name = name.replaceEnd(["EE", "IE", "DT", "RT", "RD", "NT", "ND"], ["Y", "Y", "D", "D", "D", "D", "D"]); string key = name[0 .. 1]; string key1;
foreach (immutable i; 1 .. name.length) { immutable n_1 = name[i - 1 .. i]; immutable n = name[i]; immutable n1b = (i + 1 < name.length) ? name[i+1 .. i+2] : ""; name = name.replaceAt(i, ["EV"] ~ vowels.split(""), ["AF"] ~ ["A"].replicate(5)); name = name.replaceAt(i, "QZM".split(""), "GSN".split("")); name = name.replaceAt(i, ["KN", "K"], ["N", "C"]); name = name.replaceAt(i, ["SCH", "PH"], ["SSS", "FF"]); if (n == 'H' && (!vowels.canFind(n_1) || !vowels.canFind(n1b))) name = [name[0 .. i], n_1, name[i + 1 .. $]].join; if (n == 'W' && vowels.canFind(n_1)) name = [name[0 .. i], "A", name[i + 1 .. $]].join; if (!key.empty && key[$ - 1] != name[i]) key ~= name[i]; }
return key1 ~ replaceEnd(key, ["S", "AY", "A"], ["", "Y", ""]);
}
void main() {
immutable names = ["Bishop", "Carlson", "Carr", "Chapman", "Franklin", "Greene", "Harper", "Jacobs", "Larson", "Lawrence", "Lawson", "Louis, XVI", "Lynch", "Mackenzie", "Matthews", "McCormack", "McDaniel", "McDonald", "Mclaughlin", "Morrison", "O'Banion", "O'Brien", "Richards", "Silva", "Watkins", "Wheeler", "Willis", "brown, sr", "browne, III", "browne, IV", "knight", "mitchell", "o'daniel"];
foreach (immutable name; names) writefln("%11s: %s", name, name.nysiis);
}</lang>
- Output:
Bishop: BASAP Carlson: CARLSAN Carr: CAR Chapman: CAPNAN Franklin: FRANCLAN Greene: GRAN Harper: HARPAR Jacobs: JACAB Larson: LARSAN Lawrence: LARANC Lawson: LASAN Louis, XVI: LASXV Lynch: LYNC Mackenzie: MCANSY Matthews: MATA McCormack: MCARNAC McDaniel: MCDANAL McDonald: MCDANALD Mclaughlin: MCLAGLAN Morrison: MARASAN O'Banion: OBANAN O'Brien: OBRAN Richards: RACARD Silva: SALV Watkins: WATCAN Wheeler: WALAR Willis: WALA brown, sr: BRANSR browne, III: BRAN browne, IV: BRANAV knight: NAGT mitchell: MATCAL o'daniel: ODANAL
Java
<lang java>import static java.util.Arrays.*; import static java.lang.System.out;
public class NYSIIS {
final static String[][] first = {{"MAC", "MCC"}, {"KN", "NN"}, {"K", "C"}, {"PH", "FF"}, {"PF", "FF"}, {"SCH", "SSS"}};
final static String[][] last = {{"EE", "Y"}, {"IE", "Y"}, {"DT", "D"}, {"RT", "D"}, {"RD", "D"}, {"NT", "D"}, {"ND", "D"}};
final static String Vowels = "AEIOU";
public static void main(String[] args) { stream(args).parallel().map(NYSIIS::transcode).forEach(out::println); }
static String transcode(String s) { int len = s.length(); StringBuilder sb = new StringBuilder(len);
for (int i = 0; i < len; i++) { char c = s.charAt(i); if (c >= 'a' && c <= 'z') sb.append((char) (c - 32)); else if (c >= 'A' && c <= 'Z') sb.append(c); }
replace(sb, 0, first); replace(sb, sb.length() - 2, last);
len = sb.length(); sb.append(" "); for (int i = 1; i < len; i++) { char prev = sb.charAt(i - 1); char curr = sb.charAt(i); char next = sb.charAt(i + 1);
if (curr == 'E' && next == 'V') sb.replace(i, i + 2, "AF");
else if (isVowel(curr)) sb.setCharAt(i, 'A');
else if (curr == 'Q') sb.setCharAt(i, 'G');
else if (curr == 'Z') sb.setCharAt(i, 'S');
else if (curr == 'M') sb.setCharAt(i, 'N');
else if (curr == 'K' && next == 'N') sb.setCharAt(i, 'N');
else if (curr == 'K') sb.setCharAt(i, 'C');
else if (sb.indexOf("SCH", i) == i) sb.replace(i, i + 3, "SSS");
else if (curr == 'P' && next == 'H') sb.replace(i, i + 2, "FF");
else if (curr == 'H' && (!isVowel(prev) || !isVowel(next))) sb.setCharAt(i, prev);
else if (curr == 'W' && isVowel(prev)) sb.setCharAt(i, prev);
if (sb.charAt(i) == prev) { sb.deleteCharAt(i--); len--; } } sb.setLength(sb.length() - 1); // We've added a space
int lastPos = sb.length() - 1; if (lastPos > 1) {
if (sb.lastIndexOf("AY") == lastPos - 1) sb.delete(lastPos - 1, lastPos + 1).append("Y");
else if (sb.lastIndexOf("AS") == lastPos - 1) sb.setLength(lastPos - 1);
else if (sb.charAt(lastPos) == 'S') sb.setLength(lastPos);
else if (sb.charAt(lastPos) == 'A') sb.setLength(lastPos); }
if (sb.length() > 6) sb.insert(6, '[').append(']');
return String.format("%s -> %s", s, sb); }
private static void replace(StringBuilder sb, int start, String[][] maps) { if (start >= 0) for (String[] map : maps) { if (sb.indexOf(map[0]) == start) { sb.replace(start, start + map[0].length(), map[1]); break; } } }
private static boolean isVowel(char c) { return Vowels.indexOf(c) != -1; }
}</lang>
Matthews -> MAT Carr -> CAR Lawson -> LASAN Chapman -> CAPNAN O'Brien -> OBRAN LouisXVI -> LASXV Carlson -> CARLSA[N] Lawrence -> LARANC Greene -> GRAN Jacobs -> JACAB Willis -> WAL browneIV -> BRANAV Harper -> HARPAR browneIII -> BRAN Mclaughlin -> MCLAGL[AN] mitchell -> MATCAL Morrison -> MARASA[N] o'daniel -> ODANAL Franklin -> FRANCL[AN] McDonald -> MCDANA[LD] knight -> NAGT McDaniel -> MCDANA[L] Larson -> LARSAN Silva -> SALV Wheeler -> WALAR Richards -> RACARD Bishop -> BASAP O'Banion -> OBANAN McCormack -> MCARNA[C] Watkins -> WATCAN Lynch -> LYNC Mackenzie -> MCANSY brownsr -> BRANSR
Perl 6
This implementation removes common name suffixes similar to the reference implementation, even though it is not specified in the task description or on the linked NYSIIS page. This algorithm isn't too friendly to certain French kings. :)
<lang perl6>sub no_suffix ($name) {
$name.uc.subst: /\h (<[JS]>R) | (<[IVX]>+) $/, ;
}
sub nysiis ($name is copy) {
given $name .= uc { s:g/<-[A..Z]>//; s/^MAC/MCC/; s/^P<[FH]>/FF/; s/^SCH/SSS/; s/^KN/N/; s/<[IE]>E$ /Y/; s/<[DRN]>T$ /D/; s/<[RN]>D$ /D/; s:c(1):g/EV/AF/; s:c(1):g/<[AEIOU]>+/A/; s:c(1):g/Q/G/; s:c(1):g/Z/S/; s:c(1):g/M/N/; s:c(1):g/KN/N/; s:c(1):g/K/C/; s:c(1):g/SCH/S/; s:c(1):g/PF/F/; s:c(1):g/K/C/; s:c(1):g/H(<-[AEIOU]>)/$0/; s:g/(<-[AEIOU]>)H/$0/; s:g/(<-[AEIOU]>)W/$0/; s/ AY$ /Y/; s/ S$ //; s/ A$ //; s:g/ (.)$0+ /$0/; }
}
for «
knight mitchell "o'daniel" "brown sr" "browne III" "browne IV" "O'Banion" Mclaughlin McCormack Chapman Silva McDonald Lawson Jacobs Greene "O'Brien" Morrison Larson Willis Mackenzie Carr Lawrence Matthews Richards Bishop Franklin McDaniel Harper Lynch Watkins Carlson Wheeler "Louis XVI"
» {
my $nysiis = nysiis no_suffix $_; if $nysiis.chars > 6 { $nysiis .= subst: rx/ <after .**6> .+ /, -> $/ { "[$/]" }; } printf "%10s, %s\n", $_, $nysiis;
}</lang>
Output:
knight, NAGT mitchell, MATCAL o'daniel, ODANAL brown sr, BRAN browne III, BRAN browne IV, BRAN O'Banion, OBANAN Mclaughlin, MCLAGL[AN] McCormack, MCARNA[C] Chapman, CAPNAN Silva, SALV McDonald, MCDANA[LD] Lawson, LASAN Jacobs, JACAB Greene, GRAN O'Brien, OBRAN Morrison, MARASA[N] Larson, LARSAN Willis, WAL Mackenzie, MCANSY Carr, CAR Lawrence, LARANC Matthews, MAT Richards, RACARD Bishop, BASAP Franklin, FRANCL[AN] McDaniel, MCDANA[L] Harper, HARPAR Lynch, LYNC Watkins, WATCAN Carlson, CARLSA[N] Wheeler, WALAR Louis XVI, L
Python
A literal translation of the algorithm from the Wikipedia article. <lang python>import re
_vowels = 'AEIOU'
def replace_at(text, position, fromlist, tolist):
for f, t in zip(fromlist, tolist): if text[position:].startswith(f): return .join([text[:position], t, text[position+len(f):]]) return text
def replace_end(text, fromlist, tolist):
for f, t in zip(fromlist, tolist): if text.endswith(f): return text[:-len(f)] + t return text
def nysiis(name):
name = re.sub(r'\W', , name).upper() name = replace_at(name, 0, ['MAC', 'KN', 'K', 'PH', 'PF', 'SCH'], ['MCC', 'N', 'C', 'FF', 'FF', 'SSS']) name = replace_end(name, ['EE', 'IE', 'DT', 'RT', 'RD', 'NT', 'ND'], ['Y', 'Y', 'D', 'D', 'D', 'D', 'D']) key, key1 = name[0], i = 1 while i < len(name): #print(i, name, key1, key) n_1, n = name[i-1], name[i] n1_ = name[i+1] if i+1 < len(name) else name = replace_at(name, i, ['EV'] + list(_vowels), ['AF'] + ['A']*5) name = replace_at(name, i, 'QZM', 'GSN') name = replace_at(name, i, ['KN', 'K'], ['N', 'C']) name = replace_at(name, i, ['SCH', 'PH'], ['SSS', 'FF']) if n == 'H' and (n_1 not in _vowels or n1_ not in _vowels): name = .join([name[:i], n_1, name[i+1:]]) if n == 'W' and n_1 in _vowels: name = .join([name[:i], 'A', name[i+1:]]) if key and key[-1] != name[i]: key += name[i] i += 1 key = replace_end(key, ['S', 'AY', 'A'], [, 'Y', ]) return key1 + key
if __name__ == '__main__':
names = ['Bishop', 'Carlson', 'Carr', 'Chapman', 'Franklin', 'Greene', 'Harper', 'Jacobs', 'Larson', 'Lawrence', 'Lawson', 'Louis, XVI', 'Lynch', 'Mackenzie', 'Matthews', 'McCormack', 'McDaniel', 'McDonald', 'Mclaughlin', 'Morrison', "O'Banion", "O'Brien", 'Richards', 'Silva', 'Watkins', 'Wheeler', 'Willis', 'brown, sr', 'browne, III', 'browne, IV', 'knight', 'mitchell', "o'daniel"] for name in names: print('%15s: %s' % (name, nysiis(name)))</lang>
- Output:
Bishop: BASAP Carlson: CARLSAN Carr: CAR Chapman: CAPNAN Franklin: FRANCLAN Greene: GRAN Harper: HARPAR Jacobs: JACAB Larson: LARSAN Lawrence: LARANC Lawson: LASAN Louis, XVI: LASXV Lynch: LYNC Mackenzie: MCANSY Matthews: MATA McCormack: MCARNAC McDaniel: MCDANAL McDonald: MCDANALD Mclaughlin: MCLAGLAN Morrison: MARASAN O'Banion: OBANAN O'Brien: OBRAN Richards: RACARD Silva: SALV Watkins: WATCAN Wheeler: WALAR Willis: WALA brown, sr: BRANSR browne, III: BRAN browne, IV: BRANAV knight: NAGT mitchell: MATCAL o'daniel: ODANAL
REXX
This REXX version allows a blank to be inserted into names by using an underscore [_].
Code was added to the REXX program to allow various titles.
Any title ending in a period is ignored as well as some Roman numeral titles.
An "extra" RETURN statement (at the end of the NYSIIS subroutine) was included to show how to restrict the key to six characters.
<lang rexx>/*REXX program implements the NYSIIS phonetic algorithm for names. */
names="Bishop brown_sr browne_III browne_IV Carlson Carr Chapman D'Souza de_Sousa Franklin",
"Greene Harper Hoyle-Johnson Jacobs knight Larson Lawrence Lawson Louis_XVI Lynch", "Mackenzie Marshall,ESQ Matthews McCormack McDaniel McDonald Mclaughlin mitchell Morrison", "O'Banion O'Brien o'daniel Richards Silva Vaughan_Williams Watkins Wheeler Willis Xavier,MD."
arg z; if z= then z=names /*get optional list of names. */
do i=1 for words(z) /*process each name in the list. */ xx=translate(word(z,i),,'_') /*reconstitute any blanks. */ say right(xx,30) ' ──► ' nysiis(xx) /*show and tell stuff. */ end /*i*/
exit /*stick a fork in it, we're done.*/ /*──────────────────────────────────NYSIIS subroutine───────────────────*/ nysiis: procedure; arg x; x=space(x); x=translate(x,,','); w=words(x) lw=word(x,words(x)) /*pick off the last word in name.*/ titles = 'ESQ JNR JR SNR SR' /* [↓] is the last word special?*/ if w\==1 then if pos('IL',lw)==0 then /*disallow IL as Roman#*/
if right(lw,1)=='.' |, /*Sr. Jr. Esq. ... ?*/ datatype(left(lw,1),'W') |, /*2nd 3rd 4th ... ?*/ verify(lw,'IVXL')==0 |, /*Roman numeral suffix?*/ wordpos(x,titles)\==0 then x=subword(x,1,w-1)
x=space(x,0) /*remove all whitespace from name*/ if left(x,3)=='MAC' then x='MCC'substr(x,4) if left(x,2)=='KN' then x= 'N'substr(x,3) if left(x,1)=='K' then x= 'C'substr(x,2) if left(x,2)=='PH' | left(x,2)=='PF' then x= 'FF'substr(x,3) if left(x,3)=='SCH' then x='SSS'substr(x,4) r2=right(x,2) if wordpos(r2,'EE IE') \==0 then x=left(x,length(x)-2)"Y" if wordpos(r2,'DT RT RD NT ND')\==0 then x=left(x,length(x)-2)"D" key=left(x,1)
do j=2 to length(x); c2=substr(x,j,2); c=left(c2,1) if \datatype(c,'Upper') then iterate /*Not a Latin char? Ignore it*/ if c2=='EV' then x=overlay("F",x,j+1) else x=overlay(translate(c,'AAAAGSN',"EIOUQZM"),x,j) if c2=='KN' then x=left(x,j-1)"N"substr(x,j+1) else if c1=="K" then x=overlay('C',x,j) c3=substr(x,j,3) if c3=='SCH' then x=overlay("SSS",x,j) c2=substr(x,j,2) if c2=='PH' then x=overlay("FF",x,j) c=substr(x,j,1); p=substr(x,j-1,1); n=substr(x,j+1,1) if c=='H' then if \vowel(p) | \vowel(n) then x=overlay(p,x,j) c=substr(x,j,1); p=substr(x,j-1,1) if c=='W' then if vowel(p) then x=overlay("A",x,j) c=substr(x,j,1) if c\==right(key,1) then key=key||c end /*j*/
if right(key,1)=='S' then key=left(key, max(1, length(key)-1)) if right(key,2)=='AY' then key=left(key, length(key)-2)"Y" if right(key,1)=='A' then key=left(key, max(1, length(key)-1))
return strip(key) /*return the whole key. */ return strip(left(key, 6)) /*return leftmost six chars. */ /*──────────────────────────────────VOWEL subroutine────────────────────*/ vowel: return pos(arg(1), 'AEIOU') \== 0</lang> output when using the default input(s):
Bishop ──► BASAP brown sr ──► BRANSR browne III ──► BRAN browne IV ──► BRAN Carlson ──► CARLSAN Carr ──► CAR Chapman ──► CAPNAN D'Souza ──► DSAS de Sousa ──► DASAS Franklin ──► FRANKLAN Greene ──► GRAN Harper ──► HARPAR Hoyle-Johnson ──► HAYLAJANSAN Jacobs ──► JACAB knight ──► NAGT Larson ──► LARSAN Lawrence ──► LARANC Lawson ──► LASAN Louis XVI ──► L Lynch ──► LYNC Mackenzie ──► MCKANSY Marshall,ESQ ──► MARSALASG Matthews ──► MAT McCormack ──► MCARNACK McDaniel ──► MCDANAL McDonald ──► MCDANALD Mclaughlin ──► MCLAGLAN mitchell ──► MATCAL Morrison ──► MARASAN O'Banion ──► OBANAN O'Brien ──► OBRAN o'daniel ──► ODANAL Richards ──► RACARD Silva ──► SALV Vaughan Williams ──► VAGANWALAN Watkins ──► WATKAN Wheeler ──► WALAR Willis ──► WAL Xavier,MD. ──► XAVAR
Tcl
<lang tcl>proc nysiis {name {truncate false}} {
# Normalize to first word, uppercased, without non-letters set name [regsub -all {[^A-Z]+} [string toupper [regexp -inline {\S+} $name]] ""] # Prefix map foreach {from to} {MAC MCC KN N K C PH FF PF FF SCH SSS} {
if {[regsub ^$from $name $to name]} break
} # Suffix map foreach {from to} {EE Y IE Y DT D RT D NT D ND D} {
if {[regsub $from$ $name $to name]} break
} # Split regexp (.)(.*) $name -> name rest # Reduce suffix regsub -all {[AEIOU]} [regsub -all EV $rest AF] A rest set rest [string map {Q G Z S M N KN N K C SCH SSS PH FF} $rest] regsub -all {([^A])H|(.)H(?=[^A])} $rest {\1\2} rest regsub -all AW $rest A rest regsub -all {(.)\1+} $rest {\1} rest regsub {S$} $rest "" rest regsub {A(Y?)$} $rest {\1} rest append name $rest # Apply truncation if needed if {$truncate} {
set name [string range $name 0 5]
} return $name
}</lang> Demonstrating: <lang tcl>foreach name {
knight mitchell "o'daniel" "brown sr" "browne III" "browne IV" "O'Banion" Mclaughlin McCormack Chapman Silva McDonald Lawson Jacobs Greene "O'Brien" Morrison Larson Willis Mackenzie Carr Lawrence Matthews Richards Bishop Franklin McDaniel Harper Lynch Watkins Carlson Wheeler "Louis XVI"
} {
puts "$name -> [nysiis $name]"
}</lang>
- Output:
knight -> NAGT mitchell -> MATCAL o'daniel -> ODANAL brown sr -> BRAN browne III -> BRAN browne IV -> BRAN O'Banion -> OBANAN Mclaughlin -> MCLAGLAN McCormack -> MCARNAC Chapman -> CHAPNAN Silva -> SALV McDonald -> MCDANALD Lawson -> LASAN Jacobs -> JACAB Greene -> GRAN O'Brien -> OBRAN Morrison -> MARASAN Larson -> LARSAN Willis -> WAL Mackenzie -> MCANSY Carr -> CAR Lawrence -> LARANC Matthews -> MAT Richards -> RACARD Bishop -> BASAP Franklin -> FRANCLAN McDaniel -> MCDANAL Harper -> HARPAR Lynch -> LYNC Watkins -> WATCAN Carlson -> CARLSAN Wheeler -> WHALAR Louis XVI -> L