Natural sorting

You are encouraged to solve this task according to the task description, using any language you may know.
Natural sorting is the sorting of text that does more than rely on the order of individual characters codes to make the finding of individual strings easier for a human reader.
There is no "one true way" to do this, but for the purpose of this task 'natural' orderings might include:
- 1. Ignore leading, trailing and multiple adjacent spaces
- 2. Make all whitespace characters equivalent.
- 3. Sorting without regard to case.
- 4. Sorting numeric portions of strings in numeric order. That is split the string into fields on numeric boundaries, then sort on each field, with the rightmost fields being the most significant, and numeric fields of integers treated as numbers.
- foo9.txt before foo10.txt
- As well as ... x9y99 before x9y100, before x10y0
- ... (for any number of groups of integers in a string).
- 5. Title sorts: without regard to a leading, very common, word such
- as 'The' in "The thirty-nine steps".
- 6. Sort letters without regard to accents.
- 7. Sort ligatures as separate letters.
- 8. Replacements:
- Sort german scharfes S (ß) as ss
- Sort ſ, LATIN SMALL LETTER LONG S as s
- Sort ʒ, LATIN SMALL LETTER EZH as s
- ...
- Task Description
- Implement the first four of the eight given features in a natural sorting routine/function/method...
- Test each feature implemented separately with an ordered list of test strings from the 'Sample inputs' section below, and make sure your naturally sorted output is in the same order as other language outputs such as Python.
- Print and display your output.
- For extra credit implement more than the first four.
Note: It is not necessary to have individual control of which features are active in the natural sorting routine at any time.
- Sample input
# Ignoring leading spaces Text strings: ['ignore leading spaces: 2-2', ' ignore leading spaces: 2-1', ' ignore leading spaces: 2+0', ' ignore leading spaces: 2+1'] # Ignoring multiple adjacent spaces (m.a.s) Text strings: ['ignore m.a.s spaces: 2-2', 'ignore m.a.s spaces: 2-1', 'ignore m.a.s spaces: 2+0', 'ignore m.a.s spaces: 2+1'] # Equivalent whitespace characters Text strings: ['Equiv. spaces: 3-3', 'Equiv.\rspaces: 3-2', 'Equiv.\x0cspaces: 3-1', 'Equiv.\x0bspaces: 3+0', 'Equiv.\nspaces: 3+1', 'Equiv.\tspaces: 3+2'] # Case Indepenent sort Text strings: ['cASE INDEPENENT: 3-2', 'caSE INDEPENENT: 3-1', 'casE INDEPENENT: 3+0', 'case INDEPENENT: 3+1'] # Numeric fields as numerics Text strings: ['foo100bar99baz0.txt', 'foo100bar10baz0.txt', 'foo1000bar99baz10.txt', 'foo1000bar99baz9.txt'] # Title sorts Text strings: ['The Wind in the Willows', 'The 40th step more', 'The 39 steps', 'Wanda'] # Equivalent accented characters (and case) Text strings: [u'Equiv. \xfd accents: 2-2', u'Equiv. \xdd accents: 2-1', u'Equiv. y accents: 2+0', u'Equiv. Y accents: 2+1'] # Separated ligatures Text strings: [u'\u0132 ligatured ij', 'no ligature'] # Character replacements Text strings: [u'Start with an \u0292: 2-2', u'Start with an \u017f: 2-1', u'Start with an \xdf: 2+0', u'Start with an s: 2+1']
C
Some differences from task requirement:
- req 1 and 2 are not separated. I can't imagine a situation where I'd want one but not the other.
- req 5 is implemented differently: not only leading "the", but some common words like "it", "to" etc are discarded everywhere in the string
- req 6, 7, 8 are pretty much the same thing, so the implementation doesn't make too much of a distinction of it ("ß" is a ligature, you know.)
Besides the numeric part, everything else was done in a uniform way by transforming input strings into some normalized format and comparing those instead. All sort options flags can be freely mixed together. C source is written in UTF-8 for easier reading here: don't do this for serious code. <lang c>#include <stdio.h>
- include <stdlib.h>
- include <wchar.h>
- include <wctype.h>
- include <string.h>
- include <locale.h>
typedef struct wstr { wchar_t *s; int n, alloc; } wstr;
- define w_del(w) { free(w->s); free(w); }
- define forchars(i, c, w) for(i = 0, c = w->s[0]; i < w->n && c; c = w->s[++i])
wstr *w_new() { wstr *w = malloc(sizeof(wstr)); w->alloc = 1; w->n = 0; w->s = malloc(sizeof(wchar_t)); w->s[0] = 0; return w; }
void w_append(wstr *w, wchar_t c) { int n = w->n + 1; if (n >= w->alloc) { w->alloc *= 2; w->s = realloc(w->s, w->alloc * sizeof(wchar_t)); } w->s[w->n++] = c; w->s[w->n] = 0; }
wstr *w_make(wchar_t *s) { int i, len = wcslen(s); wstr *w = w_new(); for (i = 0; i < len; i++) w_append(w, s[i]); return w; }
typedef void (*wtrans_func)(wstr *, wstr *); void w_transform(wstr *in, wtrans_func f) { wstr t, *out = w_new(); f(in, out); t = *in; *in = *out; *out = t; w_del(out); }
- define transfunc(x) void w_##x(wstr *in, wstr *out)
transfunc(nocase) { int i; wchar_t c; forchars(i, c, in) w_append(out, towlower(c)); }
transfunc(despace) { int i, gotspace = 0; wchar_t c; forchars(i, c, in) { if (!iswspace(c)) { if (gotspace && out->n) w_append(out, L' '); w_append(out, c); gotspace = 0; } else gotspace = 1; } }
static const wchar_t *const tbl_accent[] = { /* copied from Perl6 code */ L"Þ", L"TH", L"þ", L"th", L"Ð", L"TH", L"ð", L"th", L"À", L"A", L"Á", L"A", L"Â", L"A", L"Ã", L"A", L"Ä", L"A", L"Å", L"A", L"à", L"a", L"á", L"a", L"â", L"a", L"ã", L"a", L"ä", L"a", L"å", L"a", L"Ç", L"C", L"ç", L"c", L"È", L"E", L"É", L"E", L"Ê", L"E", L"Ë", L"E", L"è", L"e", L"é", L"e", L"ê", L"e", L"ë", L"e", L"Ì", L"I", L"Í", L"I", L"Î", L"I", L"Ï", L"I", L"ì", L"i", L"í", L"i", L"î", L"i", L"ï", L"i", L"Ò", L"O", L"Ó", L"O", L"Ô", L"O", L"Õ", L"O", L"Ö", L"O", L"Ø", L"O", L"ò", L"o", L"ó", L"o", L"ô", L"o", L"õ", L"o", L"ö", L"o", L"ø", L"o", L"Ñ", L"N", L"ñ", L"n", L"Ù", L"U", L"Ú", L"U", L"Û", L"U", L"Ü", L"U", L"ù", L"u", L"ú", L"u", L"û", L"u", L"ü", L"u", L"Ý", L"Y", L"ÿ", L"y", L"ý", L"y" };
static const wchar_t *const tbl_ligature[] = { L"Æ", L"AE", L"æ", L"ae", L"ß", L"ss", L"ffl", L"ffl", L"ffi", L"ffi", L"fi", L"fi", L"ff", L"ff", L"fl", L"fl", L"ſ", L"s", L"ʒ", L"z", L"st", L"st", /* ... come on ... */ };
void w_char_repl(wstr *in, wstr *out, const wchar_t *const *tbl, int len) { int i, j, k; wchar_t c; forchars(i, c, in) { for (j = k = 0; j < len; j += 2) { if (c != tbl[j][0]) continue; for (k = 0; tbl[j + 1][k]; k++) w_append(out, tbl[j + 1][k]); break; } if (!k) w_append(out, c); } }
transfunc(noaccent) { w_char_repl(in, out, tbl_accent, sizeof(tbl_accent)/sizeof(wchar_t*)); }
transfunc(noligature) { w_char_repl(in, out, tbl_ligature, sizeof(tbl_ligature)/sizeof(wchar_t*)); }
static const wchar_t *const tbl_article[] = { L"the", L"a", L"of", L"to", L"is", L"it" };
- define N_ARTICLES sizeof(tbl_article)/sizeof(tbl_article[0])
transfunc(noarticle) { int i, j, n; wchar_t c, c0 = 0; forchars(i, c, in) { if (!c0 || (iswalnum(c) && !iswalnum(c0))) { /* word boundary */ for (j = N_ARTICLES - 1; j >= 0; j--) { n = wcslen(tbl_article[j]); if (wcsncasecmp(in->s + i, tbl_article[j], n)) continue; if (iswalnum(in->s[i + n])) continue; i += n; break; } if (j < 0) w_append(out, c); } else w_append(out, c); c0 = c; } }
enum { wi_space = 0, wi_case, wi_accent, wi_lig, wi_article, wi_numeric };
- define WS_NOSPACE (1 << wi_space)
- define WS_NOCASE (1 << wi_case)
- define WS_ACCENT (1 << wi_accent)
- define WS_LIGATURE (1 << wi_lig)
- define WS_NOARTICLE (1 << wi_article)
- define WS_NUMERIC (1 << wi_numeric)
const wtrans_func trans_funcs[] = { w_despace, w_nocase, w_noaccent, w_noligature, w_noarticle, 0 }; const char *const flagnames[] = { "collapse spaces", "case insensitive", "disregard accent", "decompose ligatures", "discard common words", "numeric", };
typedef struct { wchar_t* s; wstr *w; } kw_t; int w_numcmp(const void *a, const void *b) { wchar_t *pa = ((const kw_t*)a)->w->s, *pb = ((const kw_t*)b)->w->s; int sa, sb, ea, eb; while (*pa && *pb) { if (iswdigit(*pa) && iswdigit(*pb)) { /* skip leading zeros */ sa = sb = 0; while (pa[sa] == L'0') sa++; while (pb[sb] == L'0') sb++; /* find end of numbers */ ea = sa; eb = sb; while (iswdigit(pa[ea])) ea++; while (iswdigit(pb[eb])) eb++; if (eb - sb > ea - sa) return -1; if (eb - sb < ea - sa) return 1; while (sb < eb) { if (pa[sa] > pb[sb]) return 1; if (pa[sa] < pb[sb]) return -1; sa++; sb++; }
pa += ea; pb += eb; } else if (iswdigit(*pa)) return 1; else if (iswdigit(*pb)) return -1; else { if (*pa > *pb) return 1; if (*pa < *pb) return -1; pa++; pb++; } } return (!*pa && !*pb) ? 0 : *pa ? 1 : -1; }
int w_cmp(const void *a, const void *b) { return wcscmp(((const kw_t*)a)->w->s, ((const kw_t*)b)->w->s); }
void natural_sort(wchar_t **strings, int len, int flags) { int i, j; kw_t *kws = malloc(sizeof(kw_t) * len);
for (i = 0; i < len; i++) { kws[i].s = strings[i]; kws[i].w = w_make(strings[i]); for (j = 0; j < wi_numeric; j++) if (flags & (1 << j) && trans_funcs[j]) w_transform(kws[i].w, trans_funcs[j]); }
qsort(kws, len, sizeof(kw_t), (flags & WS_NUMERIC) ? w_numcmp : w_cmp); for (i = 0; i < len; i++) { w_del(kws[i].w); strings[i] = kws[i].s; } free(kws); }
const wchar_t *const test[] = { L" 0000098 nina", L"100 niño", L"99 Ninja", L"100 NINA", L" The work is so difficult to do it took ſome 100 aeons. ", L"The work is so difficult it took some 100 aeons.", L" The work is so difficult it took ſome 99 æons. ", };
- define N_STRINGS sizeof(test)/sizeof(*test)
void test_sort(int flags) { int i, j; const wchar_t *str[N_STRINGS]; memcpy(str, test, sizeof(test));
printf("Sort flags: ("); for (i = 0, j = flags; j; i++, j >>= 1) if ((j & 1)) printf("%s%s", flagnames[i], j > 1 ? ", ":")\n");
natural_sort((wchar_t **)str, N_STRINGS, flags);
for (i = 0; i < N_STRINGS; i++) printf("%ls\n", str[i]); printf("\n"); }
int main() { setlocale(LC_CTYPE, "");
test_sort(WS_NOSPACE); test_sort(WS_NOCASE); test_sort(WS_NUMERIC); test_sort(WS_NOARTICLE|WS_NOSPACE); test_sort(WS_NOCASE|WS_NOSPACE|WS_ACCENT); test_sort(WS_LIGATURE|WS_NOCASE|WS_NOSPACE|WS_NUMERIC|WS_ACCENT|WS_NOARTICLE);
return 0; }</lang>output<lang>Sort flags: (collapse spaces)
0000098 nina
100 NINA 100 niño 99 Ninja The work is so difficult it took some 100 aeons.
The work is so difficult it took ſome 99 æons. The work is so difficult to do it took ſome 100 aeons.
Sort flags: (case insensitive)
The work is so difficult it took ſome 99 æons. 0000098 nina The work is so difficult to do it took ſome 100 aeons.
100 NINA 100 niño 99 Ninja The work is so difficult it took some 100 aeons.
Sort flags: (numeric)
The work is so difficult it took ſome 99 æons. The work is so difficult to do it took ſome 100 aeons. 0000098 nina
The work is so difficult it took some 100 aeons. 99 Ninja 100 NINA 100 niño
Sort flags: (collapse spaces, discard common words)
0000098 nina
100 NINA 100 niño 99 Ninja The work is so difficult it took some 100 aeons.
The work is so difficult to do it took ſome 100 aeons. The work is so difficult it took ſome 99 æons.
Sort flags: (collapse spaces, case insensitive, disregard accent)
0000098 nina
100 NINA 100 niño 99 Ninja The work is so difficult it took some 100 aeons.
The work is so difficult it took ſome 99 æons. The work is so difficult to do it took ſome 100 aeons.
Sort flags: (collapse spaces, case insensitive, disregard accent, decompose ligatures, discard common words, numeric)
The work is so difficult to do it took ſome 100 aeons. The work is so difficult it took ſome 99 æons.
The work is so difficult it took some 100 aeons.
0000098 nina
99 Ninja 100 NINA 100 niño</lang>
D
Implements requests 1-5. <lang d>import std.stdio, std.string, std.algorithm, std.array, std.conv,
std.ascii, std.range;
string[] naturalSort(string[] arr) {
static struct Part { string s; bool isNumber = false; ulong n;
int opCmp(in ref Part other) const pure { return (isNumber && other.isNumber) ? cmp([n], [other.n]) : cmp(s, other.s); } }
static Part[] mapper(in string s) pure { // groupBy!isDigit could shorten this function. enum Kind { nothing, digit, notDigit }
auto lk = Kind.nothing; // Last Kind. string[] parts; foreach (c; s.strip.tr(whitespace, " ", "s").toLower) { if (lk != Kind.nothing && c.isDigit == (lk == Kind.digit)) parts.back ~= c; else parts ~= [c]; lk = c.isDigit ? Kind.digit : Kind.notDigit; }
auto r = parts.map!(p => p[0].isDigit ? Part(p, true, p.to!ulong) : Part(p)).array; return (r.length > 1 && r[0].s == "the") ? r.dropOne : r; }
return arr.schwartzSort!mapper.release;
}
void main() {
auto tests = [ // Ignoring leading spaces. ["ignore leading spaces: 2-2", " ignore leading spaces: 2-1", " ignore leading spaces: 2+1", " ignore leading spaces: 2+0"],
// Ignoring multiple adjacent spaces (m.a.s). ["ignore m.a.s spaces: 2-2", "ignore m.a.s spaces: 2-1", "ignore m.a.s spaces: 2+0", "ignore m.a.s spaces: 2+1"],
// Equivalent whitespace characters. ["Equiv. spaces: 3-3", "Equiv.\rspaces: 3-2", "Equiv.\x0cspaces: 3-1", "Equiv.\x0bspaces: 3+0", "Equiv.\nspaces: 3+1", "Equiv.\tspaces: 3+2"],
// Case Indepenent sort. ["cASE INDEPENENT: 3-2", "caSE INDEPENENT: 3-1", "casE INDEPENENT: 3+0", "case INDEPENENT: 3+1"],
// Numeric fields as numerics. ["foo100bar99baz0.txt", "foo100bar10baz0.txt", "foo1000bar99baz10.txt", "foo1000bar99baz9.txt"],
// Title sorts. ["The Wind in the Willows", "The 40th step more", "The 39 steps", "Wanda"]];
foreach (test; tests) { test.writeln; writeln(test.naturalSort, "\n"); }
}</lang>
- Output:
["ignore leading spaces: 2-2", " ignore leading spaces: 2-1", " ignore leading spaces: 2+1", " ignore leading spaces: 2+0"] [" ignore leading spaces: 2+0", " ignore leading spaces: 2+1", " ignore leading spaces: 2-1", "ignore leading spaces: 2-2"] ["ignore m.a.s spaces: 2-2", "ignore m.a.s spaces: 2-1", "ignore m.a.s spaces: 2+0", "ignore m.a.s spaces: 2+1"] ["ignore m.a.s spaces: 2+0", "ignore m.a.s spaces: 2+1", "ignore m.a.s spaces: 2-1", "ignore m.a.s spaces: 2-2"] ["Equiv. spaces: 3-3", "Equiv.\rspaces: 3-2", "Equiv.\fspaces: 3-1", "Equiv.\vspaces: 3+0", "Equiv.\nspaces: 3+1", "Equiv.\tspaces: 3+2"] ["Equiv.\vspaces: 3+0", "Equiv.\nspaces: 3+1", "Equiv.\tspaces: 3+2", "Equiv.\fspaces: 3-1", "Equiv.\rspaces: 3-2", "Equiv. spaces: 3-3"] ["cASE INDEPENENT: 3-2", "caSE INDEPENENT: 3-1", "casE INDEPENENT: 3+0", "case INDEPENENT: 3+1"] ["casE INDEPENENT: 3+0", "case INDEPENENT: 3+1", "caSE INDEPENENT: 3-1", "cASE INDEPENENT: 3-2"] ["foo100bar99baz0.txt", "foo100bar10baz0.txt", "foo1000bar99baz10.txt", "foo1000bar99baz9.txt"] ["foo100bar10baz0.txt", "foo100bar99baz0.txt", "foo1000bar99baz9.txt", "foo1000bar99baz10.txt"] ["The Wind in the Willows", "The 40th step more", "The 39 steps", "Wanda"] ["The 39 steps", "The 40th step more", "The Wind in the Willows", "Wanda"]
Go
This solution varies from the task in interpretation of rule 4, describing numeric fields. This solution follows other solutions on the page by treating the left-most fields as most significant. See talk page.
First four rules, no extra credit: <lang go>package main
import (
"fmt" "regexp" "sort" "strconv" "strings"
)
var tests = []struct {
descr string list []string
}{
{"Ignoring leading spaces", []string{ "ignore leading spaces: 2-2", " ignore leading spaces: 2-1", " ignore leading spaces: 2+0", " ignore leading spaces: 2+1", }}, {"Ignoring multiple adjacent spaces", []string{ "ignore m.a.s spaces: 2-2", "ignore m.a.s spaces: 2-1", "ignore m.a.s spaces: 2+0", "ignore m.a.s spaces: 2+1", }}, {"Equivalent whitespace characters", []string{ "Equiv. spaces: 3-3", "Equiv.\rspaces: 3-2", "Equiv.\fspaces: 3-1", "Equiv.\bspaces: 3+0", "Equiv.\nspaces: 3+1", "Equiv.\tspaces: 3+2", }}, {"Case Indepenent sort", []string{ "cASE INDEPENENT: 3-2", "caSE INDEPENENT: 3-1", "casE INDEPENENT: 3+0", "case INDEPENENT: 3+1", }}, {"Numeric fields as numerics", []string{ "foo100bar99baz0.txt", "foo100bar10baz0.txt", "foo1000bar99baz10.txt", "foo1000bar99baz9.txt", }},
}
func main() {
for _, test := range tests { fmt.Println(test.descr) fmt.Println("Input order:") for _, s := range test.list { fmt.Printf(" %q\n", s) } fmt.Println("Natural order:") l := make(list, len(test.list)) for i, s := range test.list { l[i] = newNatStr(s) } sort.Sort(l) for _, s := range l { fmt.Printf(" %q\n", s.s) } fmt.Println() }
}
// natStr associates a string with a preprocessed form type natStr struct {
s string // original t []tok // preprocessed "sub-fields"
}
func newNatStr(s string) (t natStr) {
t.s = s s = strings.ToLower(strings.Join(strings.Fields(s), " ")) x := dx.FindAllString(s, -1) t.t = make([]tok, len(x)) for i, s := range x { if n, err := strconv.Atoi(s); err == nil { t.t[i].n = n } else { t.t[i].s = s } } return t
}
var dx = regexp.MustCompile(`\d+|\D+`)
// rule is to use s unless it is empty, then use n type tok struct {
s string n int
}
// rule 2 of "numeric sub-fields" from talk page func (f1 tok) Cmp(f2 tok) int {
switch { case f1.s == "": switch { case f2.s > "" || f1.n < f2.n: return -1 case f1.n > f2.n: return 1 } case f2.s == "" || f1.s > f2.s: return 1 case f1.s < f2.s: return -1 } return 0
}
type list []natStr
func (l list) Len() int { return len(l) } func (l list) Swap(i, j int) { l[i], l[j] = l[j], l[i] } func (l list) Less(i, j int) bool {
ti := l[i].t for k, t := range l[j].t { if k == len(ti) { return true } switch ti[k].Cmp(t) { case -1: return true case 1: return false } } return false
}</lang>
- Output:
Ignoring leading spaces Input order: "ignore leading spaces: 2-2" " ignore leading spaces: 2-1" " ignore leading spaces: 2+0" " ignore leading spaces: 2+1" Natural order: " ignore leading spaces: 2+0" " ignore leading spaces: 2+1" " ignore leading spaces: 2-1" "ignore leading spaces: 2-2" Ignoring multiple adjacent spaces Input order: "ignore m.a.s spaces: 2-2" "ignore m.a.s spaces: 2-1" "ignore m.a.s spaces: 2+0" "ignore m.a.s spaces: 2+1" Natural order: "ignore m.a.s spaces: 2+0" "ignore m.a.s spaces: 2+1" "ignore m.a.s spaces: 2-1" "ignore m.a.s spaces: 2-2" Equivalent whitespace characters Input order: "Equiv. spaces: 3-3" "Equiv.\rspaces: 3-2" "Equiv.\fspaces: 3-1" "Equiv.\bspaces: 3+0" "Equiv.\nspaces: 3+1" "Equiv.\tspaces: 3+2" Natural order: "Equiv.\bspaces: 3+0" "Equiv.\nspaces: 3+1" "Equiv.\tspaces: 3+2" "Equiv.\fspaces: 3-1" "Equiv.\rspaces: 3-2" "Equiv. spaces: 3-3" Case Indepenent sort Input order: "cASE INDEPENENT: 3-2" "caSE INDEPENENT: 3-1" "casE INDEPENENT: 3+0" "case INDEPENENT: 3+1" Natural order: "casE INDEPENENT: 3+0" "case INDEPENENT: 3+1" "caSE INDEPENENT: 3-1" "cASE INDEPENENT: 3-2" Numeric fields as numerics Input order: "foo100bar99baz0.txt" "foo100bar10baz0.txt" "foo1000bar99baz10.txt" "foo1000bar99baz9.txt" Natural order: "foo100bar10baz0.txt" "foo100bar99baz0.txt" "foo1000bar99baz9.txt" "foo1000bar99baz10.txt"
J
The natural way of approaching this task in J is to normalize the text based on the rules desired for sorting. Here, we limit ourselves to ascii, for portability, and decide that our domain shall be terminated strings (where each string ends with the same character - typically a newline):
<lang j>require'strings regex'
lines=: <;.2 titleFix=: ('^\s*(the|a|an)\b';)&rxrplc isNum=: e.&'0123456789' num=: ".^:(isNum@{.) split=: <@num/.~ [:+/\1,2 ~:/\ isNum norm=: [: split (32 9 12 13 14 15{a.) -.~ [: titleFix tolower
natSor=: lines ;@/: norm&.>@lines</lang>
Example data:
<lang j>IgnoringLeadingSpaces=:0 :0 ignore leading spaces: 2-2
ignore leading spaces: 2-1 ignore leading spaces: 2+0 ignore leading spaces: 2+1
)
IgnoringMultipleAdjacentSpaces=: 0 :0
ignore m.a.s spaces: 2-2 ignore m.a.s spaces: 2-1 ignore m.a.s spaces: 2+0 ignore m.a.s spaces: 2+1
)
bsSubst=: rplc&((LF;'\'),('\r';13{a.),('\x0c';12{a.),('\x0b';11{a.),('\n';LF),:'\t';TAB)
EquivalentWhitespaceCharacters=: bsSubst 0 :0
Equiv. spaces: 3-3 Equiv.\rspaces: 3-2 Equiv.\x0cspaces: 3-1 Equiv.\x0bspaces: 3+0 Equiv.\nspaces: 3+1 Equiv.\tspaces: 3+2
)
CaseIndepenent=: 0 :0
cASE INDEPENENT: 3-2 caSE INDEPENENT: 3-1 casE INDEPENENT: 3+0 case INDEPENENT: 3+1
)
NumericFieldsAsNumerics=: 0 :0
foo100bar99baz0.txt foo100bar10baz0.txt foo1000bar99baz10.txt foo1000bar99baz9.txt
)
Titles=: 0 :0
The Wind in the Willows The 40th step more The 39 steps Wanda
)</lang>
Note that the required example which contains equivalent whitespace characters includes a '\n' in the data. So, for that example, we use a backslash as our terminator.
Example use:
<lang j> natSor IgnoringLeadingSpaces
ignore leading spaces: 2+0 ignore leading spaces: 2+1 ignore leading spaces: 2-1
ignore leading spaces: 2-2
natSor IgnoringMultipleAdjacentSpaces ignore m.a.s spaces: 2+0 ignore m.a.s spaces: 2+1 ignore m.a.s spaces: 2-1 ignore m.a.s spaces: 2-2
natSor EquivalentWhitespaceCharacters Equiv.
spaces: 3+1\ Equiv.�spaces: 3+0\ Equiv. spaces: 3+2\ Equiv.�spaces: 3-1\ Equiv. spaces: 3-2\ Equiv. spaces: 3-3\
natSor CaseIndepenent casE INDEPENENT: 3+0 case INDEPENENT: 3+1 caSE INDEPENENT: 3-1 cASE INDEPENENT: 3-2
natSor NumericFieldsAsNumerics foo100bar10baz0.txt foo100bar99baz0.txt foo1000bar99baz9.txt foo1000bar99baz10.txt
natSor Titles The 39 steps The 40th step more Wanda The Wind in the Willows
</lang>
Perl
This implements all 8 requirements*:
<lang perl> use feature 'fc'; use Unicode::Normalize;
sub natural_sort {
my @items = map { my $str = fc(NFKD($_)); $str =~ s/\s+/ /; $str =~ s/|^(?:the|a|an) \b|\p{Nonspacing_Mark}| $//g; my @fields = $str =~ /(?!\z) ([^0-9]*+) ([0-9]*+)/gx; [$_, \@fields] } @_; return map { $_->[0] } sort { my @x = @{$a->[1]}; my @y = @{$b->[1]}; my $numeric; while (@x && @y) { my ($x, $y) = (shift @x, shift @y); return (($numeric = !$numeric) ? $x cmp $y : $x <=> $y or next); } return @x <=> @y; } @items;
} </lang>
- *) Note that decomposing the strings to the NFKD normalization form and subsequently stripping off all code points of the
Nonspacing_Mark
category, removes differences caused by accents / ligatures / alternate character forms / etc. in a standards-compliant way. This coincides with all the examples given in the task description, with the exception that it does not replace "ʒ" with "s" — one could add$str =~ tr/ʒ/s/;
for that but it seems a bit whimsical.)
Testing:
<lang perl> use utf8; # interpret this script's source code as UTF8 use Test::More; # for plan(), is_deeply() use Data::Dump; # for dd()
my @testcases = (
['Leading spaces', '%sleading spaces: %i', map {' ' x $_} 2, 3, 1, 0 ], ['Adjacent spaces', 'adjacent%s spaces: %i', map {' ' x $_} 2, 3, 1, 0 ], ['Different spaces', 'equiv.%sspaces: %i', split //, "\x0b\n\t\x0c\r " ], ['Case differences', '%s INDEPENENT: %i', 'casE', 'case', 'caSE', 'cASE' ], ['Numeric fields', 'foo%ibar%ibaz%i.txt', [100, 10, 0], [100, 99, 0], [1000,99,9], [1000,99,10] ], ['Title case', '%s', 'The 39 steps', 'The 40th step more', 'Wanda', 'The Wind in the Willows' ], ['Accents', 'Equiv. %s accents: %i', 'y', 'Y', "\x{dd}", "\x{fd}" ], ['Ligatures', '%s', "IJ ligatured ij", 'no ligature' ], ['Alternate forms', 'Start with an %s: %i', 's', 'ſ', 'ß' ],
);
plan tests => scalar @testcases;
foreach (@testcases) {
my ($name, $pattern, @args) = @$_; my $i = 0; my @strings = map { sprintf $pattern, ref $_ ? @$_ : $_, $i++ } @args; is_deeply( [natural_sort(reverse sort @strings)], \@strings, $name ); dd @strings; print "\n";
} </lang>
- Output:
1..9 ok 1 - Leading spaces ( " leading spaces: 0", " leading spaces: 1", " leading spaces: 2", "leading spaces: 3", ) ok 2 - Adjacent spaces ( "adjacent spaces: 0", "adjacent spaces: 1", "adjacent spaces: 2", "adjacent spaces: 3", ) ok 3 - Different spaces ( "equiv.\13spaces: 0", "equiv.\nspaces: 1", "equiv.\tspaces: 2", "equiv.\fspaces: 3", "equiv.\rspaces: 4", "equiv. spaces: 5", ) ok 4 - Case differences ( "casE INDEPENENT: 0", "case INDEPENENT: 1", "caSE INDEPENENT: 2", "cASE INDEPENENT: 3", ) ok 5 - Numeric fields ( "foo100bar10baz0.txt", "foo100bar99baz0.txt", "foo1000bar99baz9.txt", "foo1000bar99baz10.txt", ) ok 6 - Title case ( "The 39 steps", "The 40th step more", "Wanda", "The Wind in the Willows", ) ok 7 - Accents ( "Equiv. y accents: 0", "Equiv. Y accents: 1", "Equiv. \xDD accents: 2", "Equiv. \xFD accents: 3", ) ok 8 - Ligatures ("\x{132} ligatured ij", "no ligature") ok 9 - Alternate forms ( "Start with an s: 0", "Start with an \x{17F}: 1", "Start with an \xDF: 2", )
Perl 6
Tested with Rakudo 2011.06
In Perl6 it is very easy to modify the default sorting order by passing in a transform routine to the sort function. If the transform routines are arity one, the sort function will apply a Schwartzian Transform so it only needs to calculate the transform once. Note that the transforms are non-destructive; The sort function returns the original strings.
The following are a series of subroutines to perform the various natural sorting transforms. They may be applied individually or mixed and matched to get the particular result desired. When more than one is strung together, they apply left to right. Some combinations may yield different results depending on the order they are applied.
<lang perl6># Sort groups of digits in number order. Sort by order of magnitude then lexically. sub naturally ($a) { $a.lc.subst(/(\d+)/, ->$/ {0~$0.chars.chr~$0},:g) ~"\x0"~$a }
- Collapse multiple ws characters to a single.
sub collapse ($a) { $a.subst( / ( \s ) $0+ /, -> $/ { $0 }, :g ) }
- Convert all ws characters to a space.
sub normalize ($a) { $a.subst( / ( \s ) /, ' ', :g ) }
- Ignore common leading articles for title sorts
sub title ($a) { $a.subst( / :i ^ ( a | an | the ) >> \s* /, ) }
- Decompose ISO-Latin1 glyphs to their base character.
sub latin1_decompose ($a) {
my %tr = < Æ AE æ ae Þ TH þ th Ð TH ð th ß ss À A Á A Â A Ã A Ä A Å A à a á a â a ã a ä a å a Ç C ç c È E É E Ê E Ë E è e é e ê e ë e Ì I Í I Î I Ï I ì i í i î i ï i Ò O Ó O Ô O Õ O Ö O Ø O ò o ó o ô o õ o ö o ø o Ñ N ñ n Ù U Ú U Û U Ü U ù u ú u û u ü u Ý Y ÿ y ý y >;
- Would probably be better implemented as
- $a.trans( [%tr.keys] => [%tr.values] );
- but the underlying Parrot VM leaks through in current Rakudo.
my $re = '<[' ~ %tr.keys.join('|') ~ ']>'; $a.subst(/ (<$re>) /, -> $/ { %tr{$0} }, :g)
}</lang>
Used as:
<lang perl6>my @tests = (
[ "Task 1a\nSort while ignoring leading spaces.", [ 'ignore leading spaces: 1', ' ignore leading spaces: 4', ' ignore leading spaces: 3', ' ignore leading spaces: 2' ], {.trim} # builtin method. ], [ "Task 1b\nSort while ignoring multiple adjacent spaces.", [ 'ignore m.a.s spaces: 3', 'ignore m.a.s spaces: 1', 'ignore m.a.s spaces: 4', 'ignore m.a.s spaces: 2' ], {.&collapse} ], [ "Task 2\nSort with all white space normalized to regular spaces.", [ "Normalized\tspaces: 4", "Normalized\xa0spaces: 1", "Normalized\x20spaces: 2", "Normalized\nspaces: 3" ], {.&normalize} ], [ "Task 3\nSort case independently.", [ 'caSE INDEPENDENT: 3', 'casE INDEPENDENT: 2', 'cASE INDEPENDENT: 4', 'case INDEPENDENT: 1' ], {.lc} # builtin method ], [ "Task 4\nSort groups of digits in natural number order.", [ <Foo100bar99baz0.txt foo100bar10baz0.txt foo1000bar99baz10.txt foo1000bar99baz9.txt 201st 32nd 3rd 144th 17th 2 95> ], {.&naturally} ], [ "Task 5 ( mixed with 1, 2, 3 & 4 )\n" ~ "Sort titles, normalize white space, collapse multiple spaces to\n" ~ "single, trim leading white space, ignore common leading articles\n" ~ 'and sort digit groups in natural order.', [ 'The Wind in the Willows 8', ' The 39 Steps 3', 'The 7th Seal 1', 'Wanda 6', 'A Fish Called Wanda 5', ' The Wind and the Lion 7', 'Any Which Way But Loose 4', '12 Monkeys 2' ], {.&normalize.&collapse.trim.&title.&naturally} ], [ "Task 6, 7, 8\nMap letters in Latin1 that have accents or decompose to two\n" ~ 'characters to their base characters for sorting.', [ <apple Ball bald car Card above Æon æon aether niño nina e-mail Évian evoke außen autumn> ], {.&latin1_decompose.&naturally} ]
);
for @tests -> $case {
my $code_ref = $case.pop; my @array = $case.pop.list; say $case.pop, "\n";
say "Standard Sort:\n"; .say for @array.sort;
say "\nNatural Sort:\n"; .say for @array.sort: {.$code_ref};
say "\n" ~ '*' x 40 ~ "\n";
}</lang>
Sample output:
Task 1a Sort while ignoring leading spaces. Standard Sort: ignore leading spaces: 4 ignore leading spaces: 3 ignore leading spaces: 2 ignore leading spaces: 1 Natural Sort: ignore leading spaces: 1 ignore leading spaces: 2 ignore leading spaces: 3 ignore leading spaces: 4 **************************************** Task 1b Sort while ignoring multiple adjacent spaces. Standard Sort: ignore m.a.s spaces: 4 ignore m.a.s spaces: 3 ignore m.a.s spaces: 2 ignore m.a.s spaces: 1 Natural Sort: ignore m.a.s spaces: 1 ignore m.a.s spaces: 2 ignore m.a.s spaces: 3 ignore m.a.s spaces: 4 **************************************** Task 2 Sort with all white space normalized to regular spaces. Standard Sort: Normalized spaces: 4 Normalized spaces: 3 Normalized spaces: 2 Normalized spaces: 1 Natural Sort: Normalized spaces: 1 Normalized spaces: 2 Normalized spaces: 3 Normalized spaces: 4 **************************************** Task 3 Sort case independently. Standard Sort: cASE INDEPENDENT: 4 caSE INDEPENDENT: 3 casE INDEPENDENT: 2 case INDEPENDENT: 1 Natural Sort: case INDEPENDENT: 1 casE INDEPENDENT: 2 caSE INDEPENDENT: 3 cASE INDEPENDENT: 4 **************************************** Task 4 Sort groups of digits in natural number order. Standard Sort: 144th 17th 2 201st 32nd 3rd 95 Foo100bar99baz0.txt foo1000bar99baz10.txt foo1000bar99baz9.txt foo100bar10baz0.txt Natural Sort: 2 3rd 17th 32nd 95 144th 201st foo100bar10baz0.txt Foo100bar99baz0.txt foo1000bar99baz9.txt foo1000bar99baz10.txt **************************************** Task 5 ( mixed with 1, 2, 3 & 4 ) Sort titles, normalize white space, collapse multiple spaces to single, trim leading white space, ignore common leading articles and sort digit groups in natural order. Standard Sort: The 39 Steps 3 The Wind and the Lion 7 12 Monkeys 2 A Fish Called Wanda 5 Any Which Way But Loose 4 The 7th Seal 1 The Wind in the Willows 8 Wanda 6 Natural Sort: The 7th Seal 1 12 Monkeys 2 The 39 Steps 3 Any Which Way But Loose 4 A Fish Called Wanda 5 Wanda 6 The Wind and the Lion 7 The Wind in the Willows 8 **************************************** Task 6, 7, 8 Map letters in Latin1 that have accents or decompose to two characters to their base characters for sorting. Standard Sort: Ball Card above aether apple autumn außen bald car e-mail evoke nina niño Æon Évian æon Natural Sort: above Æon æon aether apple außen autumn bald Ball car Card e-mail Évian evoke nina niño ****************************************
PicoLisp
This parser takes care of features 1,2,3,4,5 and 8: <lang PicoLisp>(de parseNatural (Str)
(clip (make (for (L (chop Str) L) (cond ((sp? (car L)) (link " ") (while (and L (sp? (car L))) (pop 'L) ) ) ((>= "9" (car L) "0") (link (format (make (loop (link (pop 'L)) (NIL (>= "9" (car L) "0")) ) ) ) ) ) (T (let Word (pack (replace (make (loop (link (lowc (pop 'L))) (NIL L) (T (sp? (car L))) (T (>= "9" (car L) "0")) ) ) "ß" "ss" "ſ" "s" "ʒ" "s" ) ) (unless (member Word '(the it to)) (link Word) ) ) ) ) ) ) ) )</lang>
Test: <lang PicoLisp>: (parseNatural " ^MThe abc123Defß ^I Ghi ") -> ("abc" 123 "defss" " " "ghi")</lang> Sorting is trivial then: <lang PicoLisp>(de naturalSort (Lst)
(by parseNatural sort Lst) )</lang>
Test: <lang PicoLisp>(de *TestData
"# Ignoring leading spaces" ("ignore leading spaces: 2-2" " ignore leading spaces: 2-1" " ignore leading spaces: 2+0" " ignore leading spaces: 2+1")
"# Ignoring multiple adjacent spaces (m.a.s)" ("ignore m.a.s spaces: 2-2" "ignore m.a.s spaces: 2-1" "ignore m.a.s spaces: 2+0" "ignore m.a.s spaces: 2+1")
"# Equivalent whitespace characters" ("Equiv. spaces: 3-3" "Equiv.^Mspaces: 3-2" "Equiv.^Acspaces: 3-1" "Equiv.^Kbspaces: 3+0" "Equiv.^Jspaces: 3+1" "Equiv.^Ispaces: 3+2")
"# Case Indepenent sort" ("cASE INDEPENENT: 3-2" "caSE INDEPENENT: 3-1" "casE INDEPENENT: 3+0" "case INDEPENENT: 3+1")
"# Numeric fields as numerics" ("foo100bar99baz0.txt" "foo100bar10baz0.txt" "foo1000bar99baz10.txt" "foo1000bar99baz9.txt")
"# Title sorts" ("The Wind in the Willows" "The 40th step more" "The 39 steps" "Wanda")
"# Equivalent accented characters (and case)" ("Equiv. ý accents: 2-2" "Equiv. Ý accents: 2-1" "Equiv. y accents: 2+0" "Equiv. Y accents: 2+1")
# "Separated ligatures" ### ("IJ ligatured ij" "no ligature")
"# Character replacements" ("Start with an ʒ: 2-2" "Start with an ſ: 2-1" "Start with an ß: 2+0" "Start with an s: 2+1") )
(de pythonOut (Ttl Lst)
(prinl Ttl) (prin "['" (car Lst)) (for S (cdr Lst) (prin "',^J '" S) ) (prinl "']") )
(for X *TestData
(if (atom X) (prinl X) (pythonOut "Text strings:" X) (pythonOut "Normally sorted :" (sort (copy X))) (pythonOut "Naturally sorted:" (naturalSort X)) (prinl) ) )</lang>
Output:
# Ignoring leading spaces Text strings: ['ignore leading spaces: 2-2', ' ignore leading spaces: 2-1', ' ignore leading spaces: 2+0', ' ignore leading spaces: 2+1'] Normally sorted : [' ignore leading spaces: 2+1', ' ignore leading spaces: 2+0', ' ignore leading spaces: 2-1', 'ignore leading spaces: 2-2'] Naturally sorted: [' ignore leading spaces: 2+0', ' ignore leading spaces: 2+1', ' ignore leading spaces: 2-1', 'ignore leading spaces: 2-2'] # Ignoring multiple adjacent spaces (m.a.s) Text strings: ['ignore m.a.s spaces: 2-2', 'ignore m.a.s spaces: 2-1', 'ignore m.a.s spaces: 2+0', 'ignore m.a.s spaces: 2+1'] Normally sorted : ['ignore m.a.s spaces: 2+1', 'ignore m.a.s spaces: 2+0', 'ignore m.a.s spaces: 2-1', 'ignore m.a.s spaces: 2-2'] Naturally sorted: ['ignore m.a.s spaces: 2+0', 'ignore m.a.s spaces: 2+1', 'ignore m.a.s spaces: 2-1', 'ignore m.a.s spaces: 2-2'] # Equivalent whitespace characters Text strings: ['Equiv. spaces: 3-3', 'Equiv. spaces: 3-2', 'Equiv.�cspaces: 3-1', 'Equiv.�bspaces: 3+0', 'Equiv. spaces: 3+1', 'Equiv. spaces: 3+2'] Normally sorted : ['Equiv.�cspaces: 3-1', 'Equiv. spaces: 3+2', 'Equiv. spaces: 3+1', 'Equiv.�bspaces: 3+0', 'Equiv. spaces: 3-2', 'Equiv. spaces: 3-3'] Naturally sorted: ['Equiv.�bspaces: 3+0', 'Equiv.�cspaces: 3-1', 'Equiv. spaces: 3+1', 'Equiv. spaces: 3+2', 'Equiv. spaces: 3-2', 'Equiv. spaces: 3-3'] # Case Indepenent sort Text strings: ['cASE INDEPENENT: 3-2', 'caSE INDEPENENT: 3-1', 'casE INDEPENENT: 3+0', 'case INDEPENENT: 3+1'] Normally sorted : ['cASE INDEPENENT: 3-2', 'caSE INDEPENENT: 3-1', 'casE INDEPENENT: 3+0', 'case INDEPENENT: 3+1'] Naturally sorted: ['casE INDEPENENT: 3+0', 'case INDEPENENT: 3+1', 'caSE INDEPENENT: 3-1', 'cASE INDEPENENT: 3-2'] # Numeric fields as numerics Text strings: ['foo100bar99baz0.txt', 'foo100bar10baz0.txt', 'foo1000bar99baz10.txt', 'foo1000bar99baz9.txt'] Normally sorted : ['foo1000bar99baz10.txt', 'foo1000bar99baz9.txt', 'foo100bar10baz0.txt', 'foo100bar99baz0.txt'] Naturally sorted: ['foo100bar10baz0.txt', 'foo100bar99baz0.txt', 'foo1000bar99baz9.txt', 'foo1000bar99baz10.txt'] # Title sorts Text strings: ['The Wind in the Willows', 'The 40th step more', 'The 39 steps', 'Wanda'] Normally sorted : ['The 39 steps', 'The 40th step more', 'The Wind in the Willows', 'Wanda'] Naturally sorted: ['The 39 steps', 'The 40th step more', 'Wanda', 'The Wind in the Willows'] # Equivalent accented characters (and case) Text strings: ['Equiv. ý accents: 2-2', 'Equiv. Ý accents: 2-1', 'Equiv. y accents: 2+0', 'Equiv. Y accents: 2+1'] Normally sorted : ['Equiv. Y accents: 2+1', 'Equiv. y accents: 2+0', 'Equiv. Ý accents: 2-1', 'Equiv. ý accents: 2-2'] Naturally sorted: ['Equiv. y accents: 2+0', 'Equiv. Y accents: 2+1', 'Equiv. Ý accents: 2-1', 'Equiv. ý accents: 2-2'] # Character replacements Text strings: ['Start with an ʒ: 2-2', 'Start with an ſ: 2-1', 'Start with an ß: 2+0', 'Start with an s: 2+1'] Normally sorted : ['Start with an s: 2+1', 'Start with an ß: 2+0', 'Start with an ſ: 2-1', 'Start with an ʒ: 2-2'] Naturally sorted: ['Start with an s: 2+1', 'Start with an ſ: 2-1', 'Start with an ʒ: 2-2', 'Start with an ß: 2+0']
Python
All eight features: <lang python># -*- coding: utf-8 -*-
- Not Python 3.x (Can't compare str and int)
from itertools import groupby
from unicodedata import decomposition, name
from pprint import pprint as pp
commonleaders = ['the'] # lowercase leading words to ignore replacements = {u'ß': 'ss', # Map single char to replacement string
u'ſ': 's', u'ʒ': 's', }
hexdigits = set('0123456789abcdef') decdigits = set('0123456789') # Don't use str.isnumeric
def splitchar(c):
' De-ligature. De-accent a char' de = decomposition(c) if de: # Just the words that are also hex numbers de = [d for d in de.split() if all(c.lower() in hexdigits for c in d)] n = name(c, c).upper() # (Gosh it's onerous) if len(de)> 1 and 'PRECEDE' in n: # E.g. ʼn LATIN SMALL LETTER N PRECEDED BY APOSTROPHE de[1], de[0] = de[0], de[1] tmp = [ unichr(int(k, 16)) for k in de] base, others = tmp[0], tmp[1:] if 'LIGATURE' in n: # Assume two character ligature base += others.pop(0) else: base = c return base
def sortkeygen(s):
Generate 'natural' sort key for s
Doctests: >>> sortkeygen(' some extra spaces ') [u'some extra spaces'] >>> sortkeygen('CasE InseNsItIve') [u'case insensitive'] >>> sortkeygen('The Wind in the Willows') [u'wind in the willows'] >>> sortkeygen(u'\462 ligature') [u'ij ligature'] >>> sortkeygen(u'\335\375 upper/lower case Y with acute accent') [u'yy upper/lower case y with acute accent'] >>> sortkeygen('foo9.txt') [u'foo', 9, u'.txt'] >>> sortkeygen('x9y99') [u'x', 9, u'y', 99] # Ignore leading and trailing spaces s = unicode(s).strip() # All space types are equivalent s = ' '.join(s.split()) # case insentsitive s = s.lower() # Title words = s.split() if len(words) > 1 and words[0] in commonleaders: s = ' '.join( words[1:]) # accent and ligatures s = .join(splitchar(c) for c in s) # Replacements (single char replaced by one or more) s = .join( replacements.get(ch, ch) for ch in s ) # Numeric sections as numerics s = [ int("".join(g)) if isinteger else "".join(g) for isinteger,g in groupby(s, lambda x: x in decdigits)]
return s
def naturalsort(items):
Naturally sort a series of strings
Doctests: >>> naturalsort(['The Wind in the Willows','The 40th step more', 'The 39 steps', 'Wanda']) ['The 39 steps', 'The 40th step more', 'Wanda', 'The Wind in the Willows']
return sorted(items, key=sortkeygen)
if __name__ == '__main__':
import string ns = naturalsort
print '\n# Ignoring leading spaces' txt = ['%signore leading spaces: 2%+i' % (' '*i, i-2) for i in range(4)] print 'Text strings:'; pp(txt) print 'Normally sorted :'; pp(sorted(txt)) print 'Naturally sorted:'; pp(ns(txt))
print '\n# Ignoring multiple adjacent spaces (m.a.s)' txt = ['ignore m.a.s%s spaces: 2%+i' % (' '*i, i-2) for i in range(4)] print 'Text strings:'; pp(txt) print 'Normally sorted :'; pp(sorted(txt)) print 'Naturally sorted:'; pp(ns(txt))
print '\n# Equivalent whitespace characters' txt = ['Equiv.%sspaces: 3%+i' % (ch, i-3) for i,ch in enumerate(reversed(string.whitespace))] print 'Text strings:'; pp(txt) print 'Normally sorted :'; pp(sorted(txt)) print 'Naturally sorted:'; pp(ns(txt))
print '\n# Case Indepenent sort' s = 'CASE INDEPENENT' txt = [s[:i].lower() + s[i:] + ': 3%+i' % (i-3) for i in range(1,5)] print 'Text strings:'; pp(txt) print 'Normally sorted :'; pp(sorted(txt)) print 'Naturally sorted:'; pp(ns(txt))
print '\n# Numeric fields as numerics' txt = ['foo100bar99baz0.txt', 'foo100bar10baz0.txt', 'foo1000bar99baz10.txt', 'foo1000bar99baz9.txt'] print 'Text strings:'; pp(txt) print 'Normally sorted :'; pp(sorted(txt)) print 'Naturally sorted:'; pp(ns(txt))
print '\n# Title sorts' txt = ['The Wind in the Willows','The 40th step more', 'The 39 steps', 'Wanda'] print 'Text strings:'; pp(txt) print 'Normally sorted :'; pp(sorted(txt)) print 'Naturally sorted:'; pp(ns(txt))
print '\n# Equivalent accented characters (and case)' txt = ['Equiv. %s accents: 2%+i' % (ch, i-2) for i,ch in enumerate(u'\xfd\xddyY')] print 'Text strings:'; pp(txt) print 'Normally sorted :'; pp(sorted(txt)) print 'Naturally sorted:'; pp(ns(txt))
print '\n# Separated ligatures' txt = [u'\462 ligatured ij', 'no ligature',] print 'Text strings:'; pp(txt) print 'Normally sorted :'; pp(sorted(txt)) print 'Naturally sorted:'; pp(ns(txt)) print '\n# Character replacements' s = u'ʒſßs' # u'\u0292\u017f\xdfs' txt = ['Start with an %s: 2%+i' % (ch, i-2) for i,ch in enumerate(s)] print 'Text strings:'; pp(txt) print 'Normally sorted :'; print '\n'.join(sorted(txt)) print 'Naturally sorted:'; print '\n'.join(ns(txt))</lang>
Sample Python output
# Ignoring leading spaces Text strings: ['ignore leading spaces: 2-2', ' ignore leading spaces: 2-1', ' ignore leading spaces: 2+0', ' ignore leading spaces: 2+1'] Normally sorted : [' ignore leading spaces: 2+1', ' ignore leading spaces: 2+0', ' ignore leading spaces: 2-1', 'ignore leading spaces: 2-2'] Naturally sorted: [' ignore leading spaces: 2+0', ' ignore leading spaces: 2+1', ' ignore leading spaces: 2-1', 'ignore leading spaces: 2-2'] # Ignoring multiple adjacent spaces (m.a.s) Text strings: ['ignore m.a.s spaces: 2-2', 'ignore m.a.s spaces: 2-1', 'ignore m.a.s spaces: 2+0', 'ignore m.a.s spaces: 2+1'] Normally sorted : ['ignore m.a.s spaces: 2+1', 'ignore m.a.s spaces: 2+0', 'ignore m.a.s spaces: 2-1', 'ignore m.a.s spaces: 2-2'] Naturally sorted: ['ignore m.a.s spaces: 2+0', 'ignore m.a.s spaces: 2+1', 'ignore m.a.s spaces: 2-1', 'ignore m.a.s spaces: 2-2'] # Equivalent whitespace characters Text strings: ['Equiv. spaces: 3-3', 'Equiv.\rspaces: 3-2', 'Equiv.\x0cspaces: 3-1', 'Equiv.\x0bspaces: 3+0', 'Equiv.\nspaces: 3+1', 'Equiv.\tspaces: 3+2'] Normally sorted : ['Equiv.\tspaces: 3+2', 'Equiv.\nspaces: 3+1', 'Equiv.\x0bspaces: 3+0', 'Equiv.\x0cspaces: 3-1', 'Equiv.\rspaces: 3-2', 'Equiv. spaces: 3-3'] Naturally sorted: ['Equiv.\x0bspaces: 3+0', 'Equiv.\nspaces: 3+1', 'Equiv.\tspaces: 3+2', 'Equiv.\x0cspaces: 3-1', 'Equiv.\rspaces: 3-2', 'Equiv. spaces: 3-3'] # Case Indepenent sort Text strings: ['cASE INDEPENENT: 3-2', 'caSE INDEPENENT: 3-1', 'casE INDEPENENT: 3+0', 'case INDEPENENT: 3+1'] Normally sorted : ['cASE INDEPENENT: 3-2', 'caSE INDEPENENT: 3-1', 'casE INDEPENENT: 3+0', 'case INDEPENENT: 3+1'] Naturally sorted: ['casE INDEPENENT: 3+0', 'case INDEPENENT: 3+1', 'caSE INDEPENENT: 3-1', 'cASE INDEPENENT: 3-2'] # Numeric fields as numerics Text strings: ['foo100bar99baz0.txt', 'foo100bar10baz0.txt', 'foo1000bar99baz10.txt', 'foo1000bar99baz9.txt'] Normally sorted : ['foo1000bar99baz10.txt', 'foo1000bar99baz9.txt', 'foo100bar10baz0.txt', 'foo100bar99baz0.txt'] Naturally sorted: ['foo100bar10baz0.txt', 'foo100bar99baz0.txt', 'foo1000bar99baz9.txt', 'foo1000bar99baz10.txt'] # Title sorts Text strings: ['The Wind in the Willows', 'The 40th step more', 'The 39 steps', 'Wanda'] Normally sorted : ['The 39 steps', 'The 40th step more', 'The Wind in the Willows', 'Wanda'] Naturally sorted: ['The 39 steps', 'The 40th step more', 'Wanda', 'The Wind in the Willows'] # Equivalent accented characters (and case) Text strings: [u'Equiv. \xfd accents: 2-2', u'Equiv. \xdd accents: 2-1', u'Equiv. y accents: 2+0', u'Equiv. Y accents: 2+1'] Normally sorted : [u'Equiv. Y accents: 2+1', u'Equiv. y accents: 2+0', u'Equiv. \xdd accents: 2-1', u'Equiv. \xfd accents: 2-2'] Naturally sorted: [u'Equiv. y accents: 2+0', u'Equiv. Y accents: 2+1', u'Equiv. \xdd accents: 2-1', u'Equiv. \xfd accents: 2-2'] # Separated ligatures Text strings: [u'\u0132 ligatured ij', 'no ligature'] Normally sorted : ['no ligature', u'\u0132 ligatured ij'] Naturally sorted: [u'\u0132 ligatured ij', 'no ligature'] # Character replacements Text strings: [u'Start with an \u0292: 2-2', u'Start with an \u017f: 2-1', u'Start with an \xdf: 2+0', u'Start with an s: 2+1'] Normally sorted : Start with an s: 2+1 Start with an ß: 2+0 Start with an ſ: 2-1 Start with an ʒ: 2-2 Naturally sorted: Start with an s: 2+1 Start with an ſ: 2-1 Start with an ʒ: 2-2 Start with an ß: 2+0
Racket
Implements 1-4 (but only normalize spaces -- don't ignore spaces at the beginning/end, easy to implement, but sounds wrong).
<lang racket>
- lang racket
(define (natural-sort l)
(define (list<? l1 l2) (cond [(null? l2) #f] [(null? l1) #t] [(number? (car l1)) (cond [(< (car l1) (car l2)) #t] [(< (car l2) (car l1)) #f] [else (list<? (cdr l1) (cdr l2))])] [(string? (car l1)) (cond [(string<? (car l1) (car l2)) #t] [(string<? (car l2) (car l1)) #f] [else (list<? (cdr l1) (cdr l2))])])) (define (->keys s) (define s* (string-normalize-spaces (string-foldcase s))) (for/list ([x (regexp-match* #px"\\d+" s* #:gap-select? #t)] [i (in-naturals)]) (if (odd? i) (string->number x) x))) (sort l list<? #:key ->keys #:cache-keys? #t))
(natural-sort
(shuffle '("foo9.txt" "foo10.txt" "x9y99" "x9y100" "x10y0" "x z" "x y")))
- => '("foo9.txt" "foo10.txt" "x9y99" "x9y100" "x10y0" "x y" "x z")
</lang>
Ruby
Requirements 1,2,3 and 5 are met in one line of code: <lang ruby>ar.sort_by{|str| str.downcase.gsub(/\Athe |\Aa |\Aan /, "").lstrip.gsub(/\s+/, " ")}</lang> Almost all of the code below is handling requirement 4. The problem is that Ruby will happily sort ["a",1] against ["a",2] or even ["b"], but it does not know how to handle [1, "a"] against ["a", 2] and raises an ArgumentError. The code below does not define a new sort method, it defines a new class which is sortable by the existing method (falling back on string comparison). <lang ruby>class NatSortString
include Comparable attr_reader :scrubbed, :ints_and_strings, :i_s_pattern
def initialize(str) @str = str @scrubbed = str.downcase.gsub(/\Athe |\Aa |\Aan /, "").lstrip.gsub(/\s+/, " ") @ints_and_strings = @scrubbed.scan(/\d+|\D+/).map{|s| s =~ /\d/ ? s.to_i : s} @i_s_pattern = @ints_and_strings.map{|el| el.is_a?(Integer) ? :i : :s}.join end
def <=> (other) if i_s_pattern.start_with?(other.i_s_pattern) or other.i_s_pattern.start_with?(i_s_pattern) then ints_and_strings <=> other.ints_and_strings else scrubbed <=> other.scrubbed end end def to_s @str.dup end
end </lang> Demo: <lang ruby>tests =
{"Ignoring leading spaces" => [ "ignore leading spaces: 2-2 ", " ignore leading spaces: 2-1 ", " ignore leading spaces: 2+0 ", " ignore leading spaces: 2+1 "], "Ignoring multiple adjacent spaces" => [ "ignore m.a.s spaces: 2-2 ", "ignore m.a.s spaces: 2-1 ", "ignore m.a.s spaces: 2+0 ", "ignore m.a.s spaces: 2+1 "], "Equivalent whitespace characters" => ["Equiv. spaces: 3-3", "Equiv.\rspaces: 3-2", "Equiv.\x0cspaces: 3-1", "Equiv.\x0bspaces: 3+0", "Equiv.\nspaces: 3+1", "Equiv.\tspaces: 3+2"], "Case Indepenent sort" => [ "cASE INDEPENENT: 3-2 ", "caSE INDEPENENT: 3-1 ", "casE INDEPENENT: 3+0 ", "case INDEPENENT: 3+1 "], "Numeric fields as numerics" => [ "foo100bar99baz0.txt ", "foo100bar10baz0.txt ", "foo1000bar99baz10.txt ", "foo1000bar99baz9.txt "], "Title sorts" => [ "The Wind in the Willows ", "The 40th step more ", "The 39 steps ", "Wanda "]}
tests.each do |title, ar|
nat_sorts = ar.map{|s| NatSortString.new(s)} puts [title,"--input--", ar, "--normal sort--", ar.sort, "--natural sort--", nat_sorts.sort, "\n"]
end </lang>
- Output:
Ignoring leading spaces --input-- ignore leading spaces: 2-2 ignore leading spaces: 2-1 ignore leading spaces: 2+0 ignore leading spaces: 2+1 --normal sort-- ignore leading spaces: 2+1 ignore leading spaces: 2+0 ignore leading spaces: 2-1 ignore leading spaces: 2-2 --natural sort-- ignore leading spaces: 2+0 ignore leading spaces: 2+1 ignore leading spaces: 2-1 ignore leading spaces: 2-2 Ignoring multiple adjacent spaces --input-- ignore m.a.s spaces: 2-2 ignore m.a.s spaces: 2-1 ignore m.a.s spaces: 2+0 ignore m.a.s spaces: 2+1 --normal sort-- ignore m.a.s spaces: 2+1 ignore m.a.s spaces: 2+0 ignore m.a.s spaces: 2-1 ignore m.a.s spaces: 2-2 --natural sort-- ignore m.a.s spaces: 2+0 ignore m.a.s spaces: 2+1 ignore m.a.s spaces: 2-1 ignore m.a.s spaces: 2-2 Equivalent whitespace characters --input-- Equiv. spaces: 3-3 spaces: 3-2 Equiv. spaces: 3-1 Equiv. spaces: 3+0 Equiv. spaces: 3+1 Equiv. spaces: 3+2 --normal sort-- Equiv. spaces: 3+2 Equiv. spaces: 3+1 Equiv. spaces: 3+0 Equiv. spaces: 3-1 spaces: 3-2 Equiv. spaces: 3-3 --natural sort-- Equiv. spaces: 3+0 Equiv. spaces: 3+1 Equiv. spaces: 3+2 Equiv. spaces: 3-1 spaces: 3-2 Equiv. spaces: 3-3 Case Indepenent sort --input-- cASE INDEPENENT: 3-2 caSE INDEPENENT: 3-1 casE INDEPENENT: 3+0 case INDEPENENT: 3+1 --normal sort-- cASE INDEPENENT: 3-2 caSE INDEPENENT: 3-1 casE INDEPENENT: 3+0 case INDEPENENT: 3+1 --natural sort-- casE INDEPENENT: 3+0 case INDEPENENT: 3+1 caSE INDEPENENT: 3-1 cASE INDEPENENT: 3-2 Numeric fields as numerics --input-- foo100bar99baz0.txt foo100bar10baz0.txt foo1000bar99baz10.txt foo1000bar99baz9.txt --normal sort-- foo1000bar99baz10.txt foo1000bar99baz9.txt foo100bar10baz0.txt foo100bar99baz0.txt --natural sort-- foo100bar10baz0.txt foo100bar99baz0.txt foo1000bar99baz9.txt foo1000bar99baz10.txt Title sorts --input-- The Wind in the Willows The 40th step more The 39 steps Wanda --normal sort-- The 39 steps The 40th step more The Wind in the Willows Wanda --natural sort-- The 39 steps The 40th step more Wanda The Wind in the Willows
Tcl
Tcl supports two methods of doing sorting by non-natural keys in the lsort
command: the -command option allows the specification of code that makes the ordering decision for a pair of values, but instead the code below demonstrates sorting through the use of collation keys, strings that when sorted in their normal order result in the natural order being used. (These are handled through the use of the -indices option which makes it easy to generate a sorted original list without any need to build compound intermediate tuples.)
Note also that Tcl supports case-insensitive sorting and “treat digit sequences as numbers” as native sorting options. (The latter is particularly useful for handling filenames.) <lang tcl>package require Tcl 8.5
proc sortWithCollationKey {keyBuilder list} {
if {![llength $list]} return foreach value $list {
lappend toSort [{*}$keyBuilder $value]
} foreach idx [lsort -indices $toSort] {
lappend result [lindex $list $idx]
} return $result
} proc normalizeSpaces {str} {
regsub -all {[ ]+} [string trim $str " "] " "
} proc equivalentWhitespace {str} {
regsub -all {\s} $str " "
}
proc show {description sorter strings} {
puts "Input:\n\t[join $strings \n\t]" set sorted [lsort $strings] puts "Normally sorted:\n\t[join $sorted \n\t]" set sorted [{*}$sorter $strings] puts "Naturally sorted with ${description}:\n\t[join $sorted \n\t]"
}
- Two demonstrations of the space normalizer
show "normalized spaces" {sortWithCollationKey normalizeSpaces} {
{ignore leading spaces: 2-2} { ignore leading spaces: 2-1} { ignore leading spaces: 2+0} { ignore leading spaces: 2+1}}
show "normalized spaces" {sortWithCollationKey normalizeSpaces} {
{ignore m.a.s spaces: 2-2} {ignore m.a.s spaces: 2-1} {ignore m.a.s spaces: 2+0} {ignore m.a.s spaces: 2+1}}
- Use a collation key that maps all whitespace to spaces
show "all whitespace equivalent" {sortWithCollationKey equivalentWhitespace} {
"Equiv. spaces: 3-3" "Equiv.\rspaces: 3-2" "Equiv.\u000cspaces: 3-1" "Equiv.\u000bspaces: 3+0" "Equiv.\nspaces: 3+1" "Equiv.\tspaces: 3+2"}
- These are built-in modes
show "(built-in) case insensitivity" {lsort -nocase} {
{cASE INDEPENENT: 3-2} {caSE INDEPENENT: 3-1} {casE INDEPENENT: 3+0} {case INDEPENENT: 3+1}}
show "digit sequences as numbers" {lsort -dictionary} {
foo100bar99baz0.txt foo100bar10baz0.txt foo1000bar99baz10.txt foo1000bar99baz9.txt}</lang>
Output:
Input: ignore leading spaces: 2-2 ignore leading spaces: 2-1 ignore leading spaces: 2+0 ignore leading spaces: 2+1 Normally sorted: ignore leading spaces: 2+1 ignore leading spaces: 2+0 ignore leading spaces: 2-1 ignore leading spaces: 2-2 Naturally sorted with normalized spaces: ignore leading spaces: 2+0 ignore leading spaces: 2+1 ignore leading spaces: 2-1 ignore leading spaces: 2-2 Input: ignore m.a.s spaces: 2-2 ignore m.a.s spaces: 2-1 ignore m.a.s spaces: 2+0 ignore m.a.s spaces: 2+1 Normally sorted: ignore m.a.s spaces: 2+1 ignore m.a.s spaces: 2+0 ignore m.a.s spaces: 2-1 ignore m.a.s spaces: 2-2 Naturally sorted with normalized spaces: ignore m.a.s spaces: 2+0 ignore m.a.s spaces: 2+1 ignore m.a.s spaces: 2-1 ignore m.a.s spaces: 2-2 Input: Equiv. spaces: 3-3 spaces: 3-2iv. Equiv. spaces: 3-1 Equiv. spaces: 3+0 Equiv. spaces: 3+1 Equiv. spaces: 3+2 Normally sorted: Equiv. spaces: 3+2 Equiv. spaces: 3+1 Equiv. spaces: 3+0 Equiv. spaces: 3-1 spaces: 3-2iv. Equiv. spaces: 3-3 Naturally sorted with all whitespace equivalent: Equiv. spaces: 3+0 Equiv. spaces: 3+1 Equiv. spaces: 3+2 Equiv. spaces: 3-1 spaces: 3-2iv. Equiv. spaces: 3-3 Input: cASE INDEPENENT: 3-2 caSE INDEPENENT: 3-1 casE INDEPENENT: 3+0 case INDEPENENT: 3+1 Normally sorted: cASE INDEPENENT: 3-2 caSE INDEPENENT: 3-1 casE INDEPENENT: 3+0 case INDEPENENT: 3+1 Naturally sorted with (built-in) case insensitivity: casE INDEPENENT: 3+0 case INDEPENENT: 3+1 caSE INDEPENENT: 3-1 cASE INDEPENENT: 3-2 Input: foo100bar99baz0.txt foo100bar10baz0.txt foo1000bar99baz10.txt foo1000bar99baz9.txt Normally sorted: foo1000bar99baz10.txt foo1000bar99baz9.txt foo100bar10baz0.txt foo100bar99baz0.txt Naturally sorted with digit sequences as numbers: foo100bar10baz0.txt foo100bar99baz0.txt foo1000bar99baz9.txt foo1000bar99baz10.txt
zkl
These are all immutable list sorts. If using mutable lists, mangle a copy and sort that.
First, a tag-mangled-fields sort, use tags to pull from the original list in sorted order function: <lang zkl>fcn dsuSort(x,orig){ // decorate-sort-undecorate sort
x.enumerate().sort(fcn([(_,a)],[(_,b)]){a<b}) .apply('wrap([(n,_)]){orig[n]});
}</lang> Now, mangle a copy of the original list to "normalize" it to the task requirements and sort. <lang zkl># Ignoring leading spaces ts1:=T("ignore leading spaces: 2-2", " ignore leading spaces: 2-1",
" ignore leading spaces: 2+0", " ignore leading spaces: 2+1");
dsuSort(ts1.apply("strip"),ts1).println();</lang>
- Output:
" ignore leading spaces: 2+0" " ignore leading spaces: 2+1" " ignore leading spaces: 2-1" "ignore leading spaces: 2-2"
<lang zkl># Ignoring multiple adjacent spaces (m.a.s) ts2:=T("ignore m.a.s spaces: 2-2", "ignore m.a.s spaces: 2-1",
"ignore m.a.s spaces: 2+0", "ignore m.a.s spaces: 2+1");
dsuSort(ts2.apply('-(" ")),ts2).println();</lang>
- Output:
"ignore m.a.s spaces: 2+0" "ignore m.a.s spaces: 2+1" "ignore m.a.s spaces: 2-1" "ignore m.a.s spaces: 2-2"
<lang zkl># Equivalent whitespace characters ts3:=T("Equiv. spaces: 3-3", "Equiv.\rspaces: 3-2", "Equiv.\x0cspaces: 3-1",
"Equiv.\x0bspaces: 3+0", "Equiv.\nspaces: 3+1", "Equiv.\tspaces: 3+2");
dsuSort(ts3.apply('-.fp1("\n\r\t\f\b\x0b ")),ts3).println();</lang>
- Output:
"Equiv.?spaces: 3+0" "Equiv.?spaces: 3+1" "Equiv. spaces: 3+2" "Equiv.?spaces: 3-1" "Equiv.?spaces: 3-2" "Equiv. spaces: 3-3"
<lang zkl># Case Indepenent sort ts4:=T("cASE INDEPENENT: 3-2", "caSE INDEPENENT: 3-1",
"casE INDEPENENT: 3+0", "case INDEPENENT: 3+1");
dsuSort(ts4.apply("toLower"),ts4).println();</lang>
- Output:
"casE INDEPENENT: 3+0" "case INDEPENENT: 3+1" "caSE INDEPENENT: 3-1" "cASE INDEPENENT: 3-2"
And the nasty one: multiple numeric fields. Each key is blown apart into a list of ints, where each int is consecutive numeric characters. To compare a pair of fields (ie two lists of ints), walk the two in parallel, stopping when a difference is found. Ugly but it works. Another tactic would be to normalize each field to the same length, concat the fields and use that as the mangled key. But that might have issues if the number of fields differ, the former doesn't care. <lang zkl># Numeric fields as numerics fcn fieldize(s){
s.apply(fcn(c){"1234567890".holds(c) and c or "."}).split(".") .filter().apply("toInt");
} fcn fcmp(a,b){ // T(key, T(numeric fields)), eg L(0, L(100,99,0))
a[1].zip(b[1]).reduce(fcn(_,[(a,b)]){ if(a==b)return(True); // continue to next field return(Void.Stop,a<b); },True);
} fcn fsort(list){
list.apply(fieldize).enumerate().sort(fcmp) .apply('wrap([(n,_)]){list[n]});
}</lang> <lang zkl>ts5:=T("foo100bar99baz0.txt", "foo100bar10baz0.txt", "foo1000bar99baz10.txt",
"foo1000bar99baz9.txt");
fsort(ts5).println();
x:=T("x9y99","foo10.txt","x10y0","foo9.txt","x9y100"); fsort(x).println();</lang>
- Output:
"foo100bar10baz0.txt" "foo100bar99baz0.txt" "foo1000bar99baz9.txt" "foo1000bar99baz10.txt" L("foo9.txt","x9y99","x9y100","x10y0","foo10.txt")