Natural sorting

From Rosetta Code
Jump to: navigation, search
Task
Natural sorting
You are encouraged to solve this task according to the task description, using any language you may know.

Natural sorting is the sorting of text that does more than rely on the order of individual characters codes to make the finding of individual strings easier for a human reader.

There is no "one true way" to do this, but for the purpose of this task 'natural' orderings might include:

1. Ignore leading, trailing and multiple adjacent spaces
2. Make all whitespace characters equivalent.
3. Sorting without regard to case.
4. Sorting numeric portions of strings in numeric order. That is split the string into fields on numeric boundaries, then sort on each field, with the rightmost fields being the most significant, and numeric fields of integers treated as numbers.
foo9.txt before foo10.txt
As well as ... x9y99 before x9y100, before x10y0
... (for any number of groups of integers in a string).
5. Title sorts: without regard to a leading, very common, word such
as 'The' in "The thirty-nine steps".
6. Sort letters without regard to accents.
7. Sort ligatures as separate letters.
8. Replacements:
Sort german scharfes S (ß) as ss
Sort ſ, LATIN SMALL LETTER LONG S as s
Sort ʒ, LATIN SMALL LETTER EZH as s
...
Task Description
  • Implement the first four of the eight given features in a natural sorting routine/function/method...
  • Test each feature implemented separately with an ordered list of test strings from the 'Sample inputs' section below, and make sure your naturally sorted output is in the same order as other language outputs such as Python.
  • Print and display your output.
  • For extra credit implement more than the first four.

Note: It is not necessary to have individual control of which features are active in the natural sorting routine at any time.

Sample input
# Ignoring leading spaces
Text strings:
['ignore leading spaces: 2-2', ' ignore leading spaces: 2-1', '  ignore leading spaces: 2+0', '   ignore leading spaces: 2+1']

# Ignoring multiple adjacent spaces (m.a.s)
Text strings:
['ignore m.a.s spaces: 2-2', 'ignore m.a.s  spaces: 2-1', 'ignore m.a.s   spaces: 2+0', 'ignore m.a.s    spaces: 2+1']


# Equivalent whitespace characters
Text strings:
['Equiv. spaces: 3-3', 'Equiv.\rspaces: 3-2', 'Equiv.\x0cspaces: 3-1', 'Equiv.\x0bspaces: 3+0', 'Equiv.\nspaces: 3+1', 'Equiv.\tspaces: 3+2']

# Case Indepenent sort
Text strings:
['cASE INDEPENENT: 3-2', 'caSE INDEPENENT: 3-1', 'casE INDEPENENT: 3+0', 'case INDEPENENT: 3+1']

# Numeric fields as numerics
Text strings:
['foo100bar99baz0.txt', 'foo100bar10baz0.txt', 'foo1000bar99baz10.txt', 'foo1000bar99baz9.txt']

# Title sorts
Text strings:
['The Wind in the Willows', 'The 40th step more', 'The 39 steps', 'Wanda']

# Equivalent accented characters (and case)
Text strings:
[u'Equiv. \xfd accents: 2-2', u'Equiv. \xdd accents: 2-1', u'Equiv. y accents: 2+0', u'Equiv. Y accents: 2+1']


# Separated ligatures
Text strings:
[u'\u0132 ligatured ij', 'no ligature']

# Character replacements
Text strings:
[u'Start with an \u0292: 2-2', u'Start with an \u017f: 2-1', u'Start with an \xdf: 2+0', u'Start with an s: 2+1']


Contents

[edit] C

Some differences from task requirement:

  1. req 1 and 2 are not separated. I can't imagine a situation where I'd want one but not the other.
  2. req 5 is implemented differently: not only leading "the", but some common words like "it", "to" etc are discarded everywhere in the string
  3. req 6, 7, 8 are pretty much the same thing, so the implementation doesn't make too much of a distinction of it ("ß" is a ligature, you know.)

Besides the numeric part, everything else was done in a uniform way by transforming input strings into some normalized format and comparing those instead. All sort options flags can be freely mixed together. C source is written in UTF-8 for easier reading here: don't do this for serious code.

#include <stdio.h>
#include <stdlib.h>
#include <wchar.h>
#include <wctype.h>
#include <string.h>
#include <locale.h>
 
typedef struct wstr {
wchar_t *s;
int n, alloc;
} wstr;
 
#define w_del(w) { free(w->s); free(w); }
#define forchars(i, c, w) for(i = 0, c = w->s[0]; i < w->n && c; c = w->s[++i])
wstr *w_new()
{
wstr *w = malloc(sizeof(wstr));
w->alloc = 1;
w->n = 0;
w->s = malloc(sizeof(wchar_t));
w->s[0] = 0;
return w;
}
 
void w_append(wstr *w, wchar_t c)
{
int n = w->n + 1;
if (n >= w->alloc) {
w->alloc *= 2;
w->s = realloc(w->s, w->alloc * sizeof(wchar_t));
}
w->s[w->n++] = c;
w->s[w->n] = 0;
}
 
wstr *w_make(wchar_t *s)
{
int i, len = wcslen(s);
wstr *w = w_new();
for (i = 0; i < len; i++) w_append(w, s[i]);
return w;
}
 
typedef void (*wtrans_func)(wstr *, wstr *);
void w_transform(wstr *in, wtrans_func f)
{
wstr t, *out = w_new();
f(in, out);
t = *in; *in = *out; *out = t;
w_del(out);
}
#define transfunc(x) void w_##x(wstr *in, wstr *out)
 
transfunc(nocase) {
int i;
wchar_t c;
forchars(i, c, in) w_append(out, towlower(c));
}
 
transfunc(despace) {
int i, gotspace = 0;
wchar_t c;
forchars(i, c, in) {
if (!iswspace(c)) {
if (gotspace && out->n)
w_append(out, L' ');
w_append(out, c);
gotspace = 0;
} else gotspace = 1;
}
}
 
static const wchar_t *const tbl_accent[] = { /* copied from Perl6 code */
L"Þ", L"TH", L"þ", L"th", L"Ð", L"TH", L"ð", L"th", L"À", L"A",
L"Á", L"A", L"Â", L"A", L"Ã", L"A", L"Ä", L"A", L"Å", L"A", L"à",
L"a", L"á", L"a", L"â", L"a", L"ã", L"a", L"ä", L"a", L"å", L"a",
L"Ç", L"C", L"ç", L"c", L"È", L"E", L"É", L"E", L"Ê", L"E", L"Ë",
L"E", L"è", L"e", L"é", L"e", L"ê", L"e", L"ë", L"e", L"Ì",
L"I", L"Í", L"I", L"Î", L"I", L"Ï", L"I", L"ì", L"i", L"í",
L"i", L"î", L"i", L"ï", L"i", L"Ò", L"O", L"Ó", L"O", L"Ô",
L"O", L"Õ", L"O", L"Ö", L"O", L"Ø", L"O", L"ò", L"o", L"ó", L"o",
L"ô", L"o", L"õ", L"o", L"ö", L"o", L"ø", L"o", L"Ñ", L"N", L"ñ", L"n",
L"Ù", L"U", L"Ú", L"U", L"Û", L"U", L"Ü", L"U", L"ù", L"u", L"ú", L"u",
L"û", L"u", L"ü", L"u", L"Ý", L"Y", L"ÿ", L"y", L"ý", L"y" };
 
static const wchar_t *const tbl_ligature[] = {
L"Æ", L"AE", L"æ", L"ae", L"ß", L"ss",
L"ffl", L"ffl", L"ffi", L"ffi", L"fi", L"fi", L"ff", L"ff", L"fl", L"fl",
L"ſ", L"s", L"ʒ", L"z", L"st", L"st", /* ... come on ... */
};
 
void w_char_repl(wstr *in, wstr *out, const wchar_t *const *tbl, int len)
{
int i, j, k;
wchar_t c;
forchars(i, c, in) {
for (j = k = 0; j < len; j += 2) {
if (c != tbl[j][0]) continue;
for (k = 0; tbl[j + 1][k]; k++)
w_append(out, tbl[j + 1][k]);
break;
}
if (!k) w_append(out, c);
}
}
 
transfunc(noaccent) {
w_char_repl(in, out, tbl_accent, sizeof(tbl_accent)/sizeof(wchar_t*));
}
 
transfunc(noligature) {
w_char_repl(in, out, tbl_ligature, sizeof(tbl_ligature)/sizeof(wchar_t*));
}
 
static const wchar_t *const tbl_article[] = {
L"the", L"a", L"of", L"to", L"is", L"it" };
#define N_ARTICLES sizeof(tbl_article)/sizeof(tbl_article[0])
transfunc(noarticle) {
int i, j, n;
wchar_t c, c0 = 0;
forchars(i, c, in) {
if (!c0 || (iswalnum(c) && !iswalnum(c0))) { /* word boundary */
for (j = N_ARTICLES - 1; j >= 0; j--) {
n = wcslen(tbl_article[j]);
if (wcsncasecmp(in->s + i, tbl_article[j], n))
continue;
if (iswalnum(in->s[i + n])) continue;
i += n;
break;
}
if (j < 0) w_append(out, c);
} else
w_append(out, c);
c0 = c;
}
}
 
enum { wi_space = 0, wi_case, wi_accent, wi_lig, wi_article, wi_numeric };
#define WS_NOSPACE (1 << wi_space)
#define WS_NOCASE (1 << wi_case)
#define WS_ACCENT (1 << wi_accent)
#define WS_LIGATURE (1 << wi_lig)
#define WS_NOARTICLE (1 << wi_article)
#define WS_NUMERIC (1 << wi_numeric)
const wtrans_func trans_funcs[] = {
w_despace, w_nocase, w_noaccent, w_noligature, w_noarticle, 0
};
const char *const flagnames[] = {
"collapse spaces",
"case insensitive",
"disregard accent",
"decompose ligatures",
"discard common words",
"numeric",
};
 
typedef struct { wchar_t* s; wstr *w; } kw_t;
int w_numcmp(const void *a, const void *b)
{
wchar_t *pa = ((const kw_t*)a)->w->s, *pb = ((const kw_t*)b)->w->s;
int sa, sb, ea, eb;
while (*pa && *pb) {
if (iswdigit(*pa) && iswdigit(*pb)) {
/* skip leading zeros */
sa = sb = 0;
while (pa[sa] == L'0') sa++;
while (pb[sb] == L'0') sb++;
/* find end of numbers */
ea = sa; eb = sb;
while (iswdigit(pa[ea])) ea++;
while (iswdigit(pb[eb])) eb++;
if (eb - sb > ea - sa) return -1;
if (eb - sb < ea - sa) return 1;
while (sb < eb) {
if (pa[sa] > pb[sb]) return 1;
if (pa[sa] < pb[sb]) return -1;
sa++; sb++;
}
 
pa += ea; pb += eb;
}
else if (iswdigit(*pa)) return 1;
else if (iswdigit(*pb)) return -1;
else {
if (*pa > *pb) return 1;
if (*pa < *pb) return -1;
pa++; pb++;
}
}
return (!*pa && !*pb) ? 0 : *pa ? 1 : -1;
}
 
int w_cmp(const void *a, const void *b)
{
return wcscmp(((const kw_t*)a)->w->s, ((const kw_t*)b)->w->s);
}
 
void natural_sort(wchar_t **strings, int len, int flags)
{
int i, j;
kw_t *kws = malloc(sizeof(kw_t) * len);
 
for (i = 0; i < len; i++) {
kws[i].s = strings[i];
kws[i].w = w_make(strings[i]);
for (j = 0; j < wi_numeric; j++)
if (flags & (1 << j) && trans_funcs[j])
w_transform(kws[i].w, trans_funcs[j]);
}
 
qsort(kws, len, sizeof(kw_t), (flags & WS_NUMERIC) ? w_numcmp : w_cmp);
for (i = 0; i < len; i++) {
w_del(kws[i].w);
strings[i] = kws[i].s;
}
free(kws);
}
 
const wchar_t *const test[] = {
L" 0000098 nina", L"100 niño", L"99 Ninja", L"100 NINA",
L" The work is so difficult to do it took ſome 100 aeons. ",
L"The work is so difficult it took some 100 aeons.",
L" The work is so difficult it took ſome 99 æons. ",
};
#define N_STRINGS sizeof(test)/sizeof(*test)
 
void test_sort(int flags)
{
int i, j;
const wchar_t *str[N_STRINGS];
memcpy(str, test, sizeof(test));
 
printf("Sort flags: (");
for (i = 0, j = flags; j; i++, j >>= 1)
if ((j & 1))
printf("%s%s", flagnames[i], j > 1 ? ", ":")\n");
 
natural_sort((wchar_t **)str, N_STRINGS, flags);
 
for (i = 0; i < N_STRINGS; i++)
printf("%ls\n", str[i]);
printf("\n");
}
 
int main()
{
setlocale(LC_CTYPE, "");
 
test_sort(WS_NOSPACE);
test_sort(WS_NOCASE);
test_sort(WS_NUMERIC);
test_sort(WS_NOARTICLE|WS_NOSPACE);
test_sort(WS_NOCASE|WS_NOSPACE|WS_ACCENT);
test_sort(WS_LIGATURE|WS_NOCASE|WS_NOSPACE|WS_NUMERIC|WS_ACCENT|WS_NOARTICLE);
 
return 0;
}
output
Sort flags: (collapse spaces)
0000098 nina
100 NINA
100 niño
99 Ninja
The work is so difficult it took some 100 aeons.
The work is so difficult it took ſome 99 æons.
The work is so difficult to do it took ſome 100 aeons.
 
Sort flags: (case insensitive)
The work is so difficult it took ſome 99 æons.
0000098 nina
The work is so difficult to do it took ſome 100 aeons.
100 NINA
100 niño
99 Ninja
The work is so difficult it took some 100 aeons.
 
Sort flags: (numeric)
The work is so difficult it took ſome 99 æons.
The work is so difficult to do it took ſome 100 aeons.
0000098 nina
The work is so difficult it took some 100 aeons.
99 Ninja
100 NINA
100 niño
 
Sort flags: (collapse spaces, discard common words)
0000098 nina
100 NINA
100 niño
99 Ninja
The work is so difficult it took some 100 aeons.
The work is so difficult to do it took ſome 100 aeons.
The work is so difficult it took ſome 99 æons.
 
Sort flags: (collapse spaces, case insensitive, disregard accent)
0000098 nina
100 NINA
100 niño
99 Ninja
The work is so difficult it took some 100 aeons.
The work is so difficult it took ſome 99 æons.
The work is so difficult to do it took ſome 100 aeons.
 
Sort flags: (collapse spaces, case insensitive, disregard accent, decompose ligatures, discard common words, numeric)
The work is so difficult to do it took ſome 100 aeons.
The work is so difficult it took ſome 99 æons.
The work is so difficult it took some 100 aeons.
0000098 nina
99 Ninja
100 NINA
100 niño

[edit] D

Implements requests 1-5.

import std.stdio, std.string, std.algorithm, std.array, std.conv,
std.ascii, std.range;
 
string[] naturalSort(string[] arr) {
static struct Part {
string s;
bool isNumber = false;
ulong n;
 
int opCmp(in ref Part other) const pure {
return (isNumber && other.isNumber) ?
cmp([n], [other.n]) : cmp(s, other.s);
}
}
 
static Part[] mapper(in string s) pure {
// groupBy!isDigit could shorten this function.
enum Kind { nothing, digit, notDigit }
 
auto lk = Kind.nothing; // Last Kind.
string[] parts;
foreach (c; s.strip.tr(whitespace, " ", "s").toLower) {
if (lk != Kind.nothing && c.isDigit == (lk == Kind.digit))
parts.back ~= c;
else
parts ~= [c];
lk = c.isDigit ? Kind.digit : Kind.notDigit;
}
 
auto r = parts.map!(p => p[0].isDigit ?
Part(p, true, p.to!ulong) :
Part(p)).array;
return (r.length > 1 && r[0].s == "the") ? r.dropOne : r;
}
 
return arr.schwartzSort!mapper.release;
}
 
void main() {
auto tests = [
// Ignoring leading spaces.
["ignore leading spaces: 2-2", " ignore leading spaces: 2-1", "
ignore leading spaces: 2+1"
, " ignore leading spaces: 2+0"],
 
// Ignoring multiple adjacent spaces (m.a.s).
["ignore m.a.s spaces: 2-2", "ignore m.a.s spaces: 2-1",
"ignore m.a.s spaces: 2+0", "ignore m.a.s spaces: 2+1"],
 
// Equivalent whitespace characters.
["Equiv. spaces: 3-3", "Equiv.\rspaces: 3-2",
"Equiv.\x0cspaces: 3-1", "Equiv.\x0bspaces: 3+0",
"Equiv.\nspaces: 3+1", "Equiv.\tspaces: 3+2"],
 
// Case Indepenent sort.
["cASE INDEPENENT: 3-2", "caSE INDEPENENT: 3-1",
"casE INDEPENENT: 3+0", "case INDEPENENT: 3+1"],
 
// Numeric fields as numerics.
["foo100bar99baz0.txt", "foo100bar10baz0.txt",
"foo1000bar99baz10.txt", "foo1000bar99baz9.txt"],
 
// Title sorts.
["The Wind in the Willows", "The 40th step more",
"The 39 steps", "Wanda"]];
 
foreach (test; tests) {
test.writeln;
writeln(test.naturalSort, "\n");
}
}
Output:
["ignore leading spaces: 2-2", " ignore leading spaces: 2-1", "   ignore leading spaces: 2+1", "  ignore leading spaces: 2+0"]
["  ignore leading spaces: 2+0", "   ignore leading spaces: 2+1", " ignore leading spaces: 2-1", "ignore leading spaces: 2-2"]

["ignore m.a.s spaces: 2-2", "ignore m.a.s  spaces: 2-1", "ignore m.a.s   spaces: 2+0", "ignore m.a.s    spaces: 2+1"]
["ignore m.a.s   spaces: 2+0", "ignore m.a.s    spaces: 2+1", "ignore m.a.s  spaces: 2-1", "ignore m.a.s spaces: 2-2"]

["Equiv. spaces: 3-3", "Equiv.\rspaces: 3-2", "Equiv.\fspaces: 3-1", "Equiv.\vspaces: 3+0", "Equiv.\nspaces: 3+1", "Equiv.\tspaces: 3+2"]
["Equiv.\vspaces: 3+0", "Equiv.\nspaces: 3+1", "Equiv.\tspaces: 3+2", "Equiv.\fspaces: 3-1", "Equiv.\rspaces: 3-2", "Equiv. spaces: 3-3"]

["cASE INDEPENENT: 3-2", "caSE INDEPENENT: 3-1", "casE INDEPENENT: 3+0", "case INDEPENENT: 3+1"]
["casE INDEPENENT: 3+0", "case INDEPENENT: 3+1", "caSE INDEPENENT: 3-1", "cASE INDEPENENT: 3-2"]

["foo100bar99baz0.txt", "foo100bar10baz0.txt", "foo1000bar99baz10.txt", "foo1000bar99baz9.txt"]
["foo100bar10baz0.txt", "foo100bar99baz0.txt", "foo1000bar99baz9.txt", "foo1000bar99baz10.txt"]

["The Wind in the Willows", "The 40th step more", "The 39 steps", "Wanda"]
["The 39 steps", "The 40th step more", "The Wind in the Willows", "Wanda"]

[edit] Go

This solution varies from the task in interpretation of rule 4, describing numeric fields. This solution follows other solutions on the page by treating the left-most fields as most significant. See talk page.

First four rules, no extra credit:

package main
 
import (
"fmt"
"regexp"
"sort"
"strconv"
"strings"
)
 
var tests = []struct {
descr string
list []string
}{
{"Ignoring leading spaces", []string{
"ignore leading spaces: 2-2",
" ignore leading spaces: 2-1",
" ignore leading spaces: 2+0",
" ignore leading spaces: 2+1",
}},
{"Ignoring multiple adjacent spaces", []string{
"ignore m.a.s spaces: 2-2",
"ignore m.a.s spaces: 2-1",
"ignore m.a.s spaces: 2+0",
"ignore m.a.s spaces: 2+1",
}},
{"Equivalent whitespace characters", []string{
"Equiv. spaces: 3-3",
"Equiv.\rspaces: 3-2",
"Equiv.\fspaces: 3-1",
"Equiv.\bspaces: 3+0",
"Equiv.\nspaces: 3+1",
"Equiv.\tspaces: 3+2",
}},
{"Case Indepenent sort", []string{
"cASE INDEPENENT: 3-2",
"caSE INDEPENENT: 3-1",
"casE INDEPENENT: 3+0",
"case INDEPENENT: 3+1",
}},
{"Numeric fields as numerics", []string{
"foo100bar99baz0.txt",
"foo100bar10baz0.txt",
"foo1000bar99baz10.txt",
"foo1000bar99baz9.txt",
}},
}
 
func main() {
for _, test := range tests {
fmt.Println(test.descr)
fmt.Println("Input order:")
for _, s := range test.list {
fmt.Printf("  %q\n", s)
}
fmt.Println("Natural order:")
l := make(list, len(test.list))
for i, s := range test.list {
l[i] = newNatStr(s)
}
sort.Sort(l)
for _, s := range l {
fmt.Printf("  %q\n", s.s)
}
fmt.Println()
}
}
 
// natStr associates a string with a preprocessed form
type natStr struct {
s string // original
t []tok // preprocessed "sub-fields"
}
 
func newNatStr(s string) (t natStr) {
t.s = s
s = strings.ToLower(strings.Join(strings.Fields(s), " "))
x := dx.FindAllString(s, -1)
t.t = make([]tok, len(x))
for i, s := range x {
if n, err := strconv.Atoi(s); err == nil {
t.t[i].n = n
} else {
t.t[i].s = s
}
}
return t
}
 
var dx = regexp.MustCompile(`\d+|\D+`)
 
// rule is to use s unless it is empty, then use n
type tok struct {
s string
n int
}
 
// rule 2 of "numeric sub-fields" from talk page
func (f1 tok) Cmp(f2 tok) int {
switch {
case f1.s == "":
switch {
case f2.s > "" || f1.n < f2.n:
return -1
case f1.n > f2.n:
return 1
}
case f2.s == "" || f1.s > f2.s:
return 1
case f1.s < f2.s:
return -1
}
return 0
}
 
type list []natStr
 
func (l list) Len() int { return len(l) }
func (l list) Swap(i, j int) { l[i], l[j] = l[j], l[i] }
func (l list) Less(i, j int) bool {
ti := l[i].t
for k, t := range l[j].t {
if k == len(ti) {
return true
}
switch ti[k].Cmp(t) {
case -1:
return true
case 1:
return false
}
}
return false
}
Output:
Ignoring leading spaces
Input order:
   "ignore leading spaces: 2-2"
   " ignore leading spaces: 2-1"
   "  ignore leading spaces: 2+0"
   "   ignore leading spaces: 2+1"
Natural order:
   "  ignore leading spaces: 2+0"
   "   ignore leading spaces: 2+1"
   " ignore leading spaces: 2-1"
   "ignore leading spaces: 2-2"

Ignoring multiple adjacent spaces
Input order:
   "ignore m.a.s spaces: 2-2"
   "ignore m.a.s  spaces: 2-1"
   "ignore m.a.s   spaces: 2+0"
   "ignore m.a.s    spaces: 2+1"
Natural order:
   "ignore m.a.s   spaces: 2+0"
   "ignore m.a.s    spaces: 2+1"
   "ignore m.a.s  spaces: 2-1"
   "ignore m.a.s spaces: 2-2"

Equivalent whitespace characters
Input order:
   "Equiv. spaces: 3-3"
   "Equiv.\rspaces: 3-2"
   "Equiv.\fspaces: 3-1"
   "Equiv.\bspaces: 3+0"
   "Equiv.\nspaces: 3+1"
   "Equiv.\tspaces: 3+2"
Natural order:
   "Equiv.\bspaces: 3+0"
   "Equiv.\nspaces: 3+1"
   "Equiv.\tspaces: 3+2"
   "Equiv.\fspaces: 3-1"
   "Equiv.\rspaces: 3-2"
   "Equiv. spaces: 3-3"

Case Indepenent sort
Input order:
   "cASE INDEPENENT: 3-2"
   "caSE INDEPENENT: 3-1"
   "casE INDEPENENT: 3+0"
   "case INDEPENENT: 3+1"
Natural order:
   "casE INDEPENENT: 3+0"
   "case INDEPENENT: 3+1"
   "caSE INDEPENENT: 3-1"
   "cASE INDEPENENT: 3-2"

Numeric fields as numerics
Input order:
   "foo100bar99baz0.txt"
   "foo100bar10baz0.txt"
   "foo1000bar99baz10.txt"
   "foo1000bar99baz9.txt"
Natural order:
   "foo100bar10baz0.txt"
   "foo100bar99baz0.txt"
   "foo1000bar99baz9.txt"
   "foo1000bar99baz10.txt"

[edit] J

The natural way of approaching this task in J is to normalize the text based on the rules desired for sorting. Here, we limit ourselves to ascii, for portability, and decide that our domain shall be terminated strings (where each string ends with the same character - typically a newline):

require'strings regex'
 
lines=: <;.2
titleFix=: ('^\s*(the|a|an)\b';'')&rxrplc
isNum=: e.&'0123456789'
num=: ".^:(isNum@{.)
split=: <@num/.~ [:+/\1,2 ~:/\ isNum
norm=: [: split (32 9 12 13 14 15{a.) -.~ [: titleFix tolower
 
natSor=: lines ;@/: norm&.>@lines

Example data:

IgnoringLeadingSpaces=:0 :0
ignore leading spaces: 2-2
ignore leading spaces: 2-1
ignore leading spaces: 2+0
ignore leading spaces: 2+1
)
 
IgnoringMultipleAdjacentSpaces=: 0 :0
ignore m.a.s spaces: 2-2
ignore m.a.s spaces: 2-1
ignore m.a.s spaces: 2+0
ignore m.a.s spaces: 2+1
)
 
bsSubst=: rplc&((LF;'\'),('\r';13{a.),('\x0c';12{a.),('\x0b';11{a.),('\n';LF),:'\t';TAB)
 
EquivalentWhitespaceCharacters=: bsSubst 0 :0
Equiv. spaces: 3-3
Equiv.\rspaces: 3-2
Equiv.\x0cspaces: 3-1
Equiv.\x0bspaces: 3+0
Equiv.\nspaces: 3+1
Equiv.\tspaces: 3+2
)
 
CaseIndepenent=: 0 :0
cASE INDEPENENT: 3-2
caSE INDEPENENT: 3-1
casE INDEPENENT: 3+0
case INDEPENENT: 3+1
)
 
NumericFieldsAsNumerics=: 0 :0
foo100bar99baz0.txt
foo100bar10baz0.txt
foo1000bar99baz10.txt
foo1000bar99baz9.txt
)
 
Titles=: 0 :0
The Wind in the Willows
The 40th step more
The 39 steps
Wanda
)

Note that the required example which contains equivalent whitespace characters includes a '\n' in the data. So, for that example, we use a backslash as our terminator.

Example use:

   natSor IgnoringLeadingSpaces
ignore leading spaces: 2+0
ignore leading spaces: 2+1
ignore leading spaces: 2-1
ignore leading spaces: 2-2
 
natSor IgnoringMultipleAdjacentSpaces
ignore m.a.s spaces: 2+0
ignore m.a.s spaces: 2+1
ignore m.a.s spaces: 2-1
ignore m.a.s spaces: 2-2
 
natSor EquivalentWhitespaceCharacters
Equiv.
spaces: 3+1\ Equiv.�spaces: 3+0\ Equiv. spaces: 3+2\ Equiv.�spaces: 3-1\ Equiv.
spaces: 3-2\ Equiv. spaces: 3-3\
natSor CaseIndepenent
casE INDEPENENT: 3+0
case INDEPENENT: 3+1
caSE INDEPENENT: 3-1
cASE INDEPENENT: 3-2
 
natSor NumericFieldsAsNumerics
foo100bar10baz0.txt
foo100bar99baz0.txt
foo1000bar99baz9.txt
foo1000bar99baz10.txt
 
natSor Titles
The 39 steps
The 40th step more
Wanda
The Wind in the Willows
 
 

[edit] Perl

This implements all 8 requirements*:

 
use feature 'fc';
use Unicode::Normalize;
 
sub natural_sort {
my @items = map {
my $str = fc(NFKD($_));
$str =~ s/\s+/ /;
$str =~ s/|^(?:the|a|an) \b|\p{Nonspacing_Mark}| $//g;
my @fields = $str =~ /(?!\z) ([^0-9]*+) ([0-9]*+)/gx;
[$_, \@fields]
} @_;
return map { $_->[0] } sort {
my @x = @{$a->[1]};
my @y = @{$b->[1]};
my $numeric;
while (@x && @y) {
my ($x, $y) = (shift @x, shift @y);
return (($numeric = !$numeric) ? $x cmp $y : $x <=> $y or next);
}
return @x <=> @y;
} @items;
}
 
*) Note that decomposing the strings to the NFKD normalization form and subsequently stripping off all code points of the Nonspacing_Mark category, removes differences caused by accents / ligatures / alternate character forms / etc. in a standards-compliant way. This coincides with all the examples given in the task description, with the exception that it does not replace "ʒ" with "s" — one could add
$str =~ tr/ʒ/s/;
for that but it seems a bit whimsical.)

Testing:

 
use utf8; # interpret this script's source code as UTF8
use Test::More; # for plan(), is_deeply()
use Data::Dump; # for dd()
 
my @testcases = (
['Leading spaces', '%sleading spaces: %i', map {' ' x $_} 2, 3, 1, 0 ],
['Adjacent spaces', 'adjacent%s spaces: %i', map {' ' x $_} 2, 3, 1, 0 ],
['Different spaces', 'equiv.%sspaces: %i', split //, "\x0b\n\t\x0c\r " ],
['Case differences', '%s INDEPENENT: %i', 'casE', 'case', 'caSE', 'cASE' ],
['Numeric fields', 'foo%ibar%ibaz%i.txt', [100, 10, 0], [100, 99, 0],
[1000,99,9], [1000,99,10] ],
['Title case', '%s', 'The 39 steps', 'The 40th step more',
'Wanda', 'The Wind in the Willows' ],
['Accents', 'Equiv. %s accents: %i', 'y', 'Y', "\x{dd}", "\x{fd}" ],
['Ligatures', '%s', "IJ ligatured ij", 'no ligature' ],
['Alternate forms', 'Start with an %s: %i', 's', 'ſ', 'ß' ],
);
 
plan tests => scalar @testcases;
 
foreach (@testcases) {
my ($name, $pattern, @args) = @$_;
my $i = 0;
my @strings = map { sprintf $pattern, ref $_ ? @$_ : $_, $i++ } @args;
 
is_deeply( [natural_sort(reverse sort @strings)], \@strings, $name );
 
dd @strings;
print "\n";
}
 
Output:
1..9
ok 1 - Leading spaces
(
  "  leading spaces: 0",
  "   leading spaces: 1",
  " leading spaces: 2",
  "leading spaces: 3",
)

ok 2 - Adjacent spaces
(
  "adjacent   spaces: 0",
  "adjacent    spaces: 1",
  "adjacent  spaces: 2",
  "adjacent spaces: 3",
)

ok 3 - Different spaces
(
  "equiv.\13spaces: 0",
  "equiv.\nspaces: 1",
  "equiv.\tspaces: 2",
  "equiv.\fspaces: 3",
  "equiv.\rspaces: 4",
  "equiv. spaces: 5",
)

ok 4 - Case differences
(
  "casE INDEPENENT: 0",
  "case INDEPENENT: 1",
  "caSE INDEPENENT: 2",
  "cASE INDEPENENT: 3",
)

ok 5 - Numeric fields
(
  "foo100bar10baz0.txt",
  "foo100bar99baz0.txt",
  "foo1000bar99baz9.txt",
  "foo1000bar99baz10.txt",
)

ok 6 - Title case
(
  "The 39 steps",
  "The 40th step more",
  "Wanda",
  "The Wind in the Willows",
)

ok 7 - Accents
(
  "Equiv. y accents: 0",
  "Equiv. Y accents: 1",
  "Equiv. \xDD accents: 2",
  "Equiv. \xFD accents: 3",
)

ok 8 - Ligatures
("\x{132} ligatured ij", "no ligature")

ok 9 - Alternate forms
(
  "Start with an s: 0",
  "Start with an \x{17F}: 1",
  "Start with an \xDF: 2",
)

[edit] Perl 6

Tested with Rakudo 2011.06

In Perl6 it is very easy to modify the default sorting order by passing in a transform routine to the sort function. If the transform routines are arity one, the sort function will apply a Schwartzian Transform so it only needs to calculate the transform once. Note that the transforms are non-destructive; The sort function returns the original strings.

The following are a series of subroutines to perform the various natural sorting transforms. They may be applied individually or mixed and matched to get the particular result desired. When more than one is strung together, they apply left to right. Some combinations may yield different results depending on the order they are applied.

# Sort groups of digits in number order. Sort by order of magnitude then lexically.
sub naturally ($a) { $a.lc.subst(/(\d+)/, ->$/ {0~$0.chars.chr~$0},:g) ~"\x0"~$a }
 
# Collapse multiple ws characters to a single.
sub collapse ($a) { $a.subst( / ( \s ) $0+ /, -> $/ { $0 }, :g ) }
 
# Convert all ws characters to a space.
sub normalize ($a) { $a.subst( / ( \s ) /, ' ', :g ) }
 
# Ignore common leading articles for title sorts
sub title ($a) { $a.subst( / :i ^ ( a | an | the ) >> \s* /, '' ) }
 
# Decompose ISO-Latin1 glyphs to their base character.
sub latin1_decompose ($a) {
my %tr = <
Æ AE æ ae Þ TH þ th Ð TH ð th ß ss À A Á A Â A Ã A Ä A Å A à a á a
â a ã a ä a å a Ç C ç c È E É E Ê E Ë E è e é e ê e ë e Ì I Í I Î
I Ï I ì i í i î i ï i Ò O Ó O Ô O Õ O Ö O Ø O ò o ó o ô o õ o ö o
ø o Ñ N ñ n Ù U Ú U Û U Ü U ù u ú u û u ü u Ý Y ÿ y ý y
>;
 
# Would probably be better implemented as
# $a.trans( [%tr.keys] => [%tr.values] );
# but the underlying Parrot VM leaks through in current Rakudo.
my $re = '<[' ~ %tr.keys.join('|') ~ ']>';
$a.subst(/ (<$re>) /, -> $/ { %tr{$0} }, :g)
}


Used as:

my @tests = (
[
"Task 1a\nSort while ignoring leading spaces.",
[
'ignore leading spaces: 1', ' ignore leading spaces: 4',
' ignore leading spaces: 3', ' ignore leading spaces: 2'
],
{.trim} # builtin method.
],
[
"Task 1b\nSort while ignoring multiple adjacent spaces.",
[
'ignore m.a.s spaces: 3', 'ignore m.a.s spaces: 1',
'ignore m.a.s spaces: 4', 'ignore m.a.s spaces: 2'
],
{.&collapse}
],
[
"Task 2\nSort with all white space normalized to regular spaces.",
[
"Normalized\tspaces: 4", "Normalized\xa0spaces: 1",
"Normalized\x20spaces: 2", "Normalized\nspaces: 3"
],
{.&normalize}
],
[
"Task 3\nSort case independently.",
[
'caSE INDEPENDENT: 3', 'casE INDEPENDENT: 2',
'cASE INDEPENDENT: 4', 'case INDEPENDENT: 1'
],
{.lc} # builtin method
],
[
"Task 4\nSort groups of digits in natural number order.",
[
<Foo100bar99baz0.txt foo100bar10baz0.txt foo1000bar99baz10.txt
foo1000bar99baz9.txt 201st 32nd 3rd 144th 17th 2 95>
],
{.&naturally}
],
[
"Task 5 ( mixed with 1, 2, 3 & 4 )\n"
~ "Sort titles, normalize white space, collapse multiple spaces to\n"
~ "single, trim leading white space, ignore common leading articles\n"
~ 'and sort digit groups in natural order.',
[
'The Wind in the Willows 8', ' The 39 Steps 3',
'The 7th Seal 1', 'Wanda 6',
'A Fish Called Wanda 5', ' The Wind and the Lion 7',
'Any Which Way But Loose 4', '12 Monkeys 2'
],
{.&normalize.&collapse.trim.&title.&naturally}
],
[
"Task 6, 7, 8\nMap letters in Latin1 that have accents or decompose to two\n"
~ 'characters to their base characters for sorting.',
[
<apple Ball bald car Card above Æon æon aether
niño nina e-mail Évian evoke außen autumn>
],
{.&latin1_decompose.&naturally}
]
);
 
 
for @tests -> $case {
my $code_ref = $case.pop;
my @array = $case.pop.list;
say $case.pop, "\n";
 
say "Standard Sort:\n";
.say for @array.sort;
 
say "\nNatural Sort:\n";
.say for @array.sort: {.$code_ref};
 
say "\n" ~ '*' x 40 ~ "\n";
}

Sample output:

Task 1a
Sort while ignoring leading spaces.

Standard Sort:

   ignore leading spaces: 4
  ignore leading spaces: 3
 ignore leading spaces: 2
ignore leading spaces: 1

Natural Sort:

ignore leading spaces: 1
 ignore leading spaces: 2
  ignore leading spaces: 3
   ignore leading spaces: 4

****************************************

Task 1b
Sort while ignoring multiple adjacent spaces.

Standard Sort:

ignore m.a.s    spaces: 4
ignore m.a.s   spaces: 3
ignore m.a.s  spaces: 2
ignore m.a.s spaces: 1

Natural Sort:

ignore m.a.s spaces: 1
ignore m.a.s  spaces: 2
ignore m.a.s   spaces: 3
ignore m.a.s    spaces: 4

****************************************

Task 2
Sort with all white space normalized to regular spaces.

Standard Sort:

Normalized	spaces: 4
Normalized
spaces: 3
Normalized spaces: 2
Normalized spaces: 1

Natural Sort:

Normalized spaces: 1
Normalized spaces: 2
Normalized
spaces: 3
Normalized	spaces: 4

****************************************

Task 3
Sort case independently.

Standard Sort:

cASE INDEPENDENT: 4
caSE INDEPENDENT: 3
casE INDEPENDENT: 2
case INDEPENDENT: 1

Natural Sort:

case INDEPENDENT: 1
casE INDEPENDENT: 2
caSE INDEPENDENT: 3
cASE INDEPENDENT: 4

****************************************

Task 4
Sort groups of digits in natural number order.

Standard Sort:

144th
17th
2
201st
32nd
3rd
95
Foo100bar99baz0.txt
foo1000bar99baz10.txt
foo1000bar99baz9.txt
foo100bar10baz0.txt

Natural Sort:

2
3rd
17th
32nd
95
144th
201st
foo100bar10baz0.txt
Foo100bar99baz0.txt
foo1000bar99baz9.txt
foo1000bar99baz10.txt

****************************************

Task 5 ( mixed with 1, 2, 3 & 4 )
Sort titles, normalize white space, collapse multiple spaces to
single, trim leading white space, ignore common leading articles
and sort digit groups in natural order.

Standard Sort:

  The 39 Steps               3
 The Wind and the Lion       7
12 Monkeys                   2
A Fish Called Wanda          5
Any Which Way But Loose      4
The    7th Seal              1
The Wind	in the Willows  8
Wanda                        6

Natural Sort:

The    7th Seal              1
12 Monkeys                   2
  The 39 Steps               3
Any Which Way But Loose      4
A Fish Called Wanda          5
Wanda                        6
 The Wind and the Lion       7
The Wind	in the Willows  8

****************************************

Task 6, 7, 8
Map letters in Latin1 that have accents or decompose to two
characters to their base characters for sorting.

Standard Sort:

Ball
Card
above
aether
apple
autumn
außen
bald
car
e-mail
evoke
nina
niño
Æon
Évian
æon

Natural Sort:

above
Æon
æon
aether
apple
außen
autumn
bald
Ball
car
Card
e-mail
Évian
evoke
nina
niño

****************************************

[edit] PicoLisp

This parser takes care of features 1,2,3,4,5 and 8:

(de parseNatural (Str)
(clip
(make
(for (L (chop Str) L)
(cond
((sp? (car L))
(link " ")
(while (and L (sp? (car L)))
(pop 'L) ) )
((>= "9" (car L) "0")
(link
(format
(make
(loop
(link (pop 'L))
(NIL (>= "9" (car L) "0")) ) ) ) ) )
(T
(let Word
(pack
(replace
(make
(loop
(link (lowc (pop 'L)))
(NIL L)
(T (sp? (car L)))
(T (>= "9" (car L) "0")) ) )
"ß" "ss" "ſ" "s" "ʒ" "s" ) )
(unless (member Word '(the it to))
(link Word) ) ) ) ) ) ) ) )

Test:

: (parseNatural " ^MThe abc123Defß ^I Ghi ")
-> ("abc" 123 "defss" " " "ghi")

Sorting is trivial then:

(de naturalSort (Lst)
(by parseNatural sort Lst) )

Test:

(de *TestData
"# Ignoring leading spaces"
("ignore leading spaces: 2-2" " ignore leading spaces: 2-1" " ignore leading spaces: 2+0" " ignore leading spaces: 2+1")
 
"# Ignoring multiple adjacent spaces (m.a.s)"
("ignore m.a.s spaces: 2-2" "ignore m.a.s spaces: 2-1" "ignore m.a.s spaces: 2+0" "ignore m.a.s spaces: 2+1")
 
"# Equivalent whitespace characters"
("Equiv. spaces: 3-3" "Equiv.^Mspaces: 3-2" "Equiv.^Acspaces: 3-1" "Equiv.^Kbspaces: 3+0" "Equiv.^Jspaces: 3+1" "Equiv.^Ispaces: 3+2")
 
"# Case Indepenent sort"
("cASE INDEPENENT: 3-2" "caSE INDEPENENT: 3-1" "casE INDEPENENT: 3+0" "case INDEPENENT: 3+1")
 
"# Numeric fields as numerics"
("foo100bar99baz0.txt" "foo100bar10baz0.txt" "foo1000bar99baz10.txt" "foo1000bar99baz9.txt")
 
"# Title sorts"
("The Wind in the Willows" "The 40th step more" "The 39 steps" "Wanda")
 
"# Equivalent accented characters (and case)"
("Equiv. ý accents: 2-2" "Equiv. Ý accents: 2-1" "Equiv. y accents: 2+0" "Equiv. Y accents: 2+1")
 
# "Separated ligatures"
### ("IJ ligatured ij" "no ligature")
 
"# Character replacements"
("Start with an ʒ: 2-2" "Start with an ſ: 2-1" "Start with an ß: 2+0" "Start with an s: 2+1") )
 
(de pythonOut (Ttl Lst)
(prinl Ttl)
(prin "['" (car Lst))
(for S (cdr Lst)
(prin "',^J '" S) )
(prinl "']") )
 
(for X *TestData
(if (atom X)
(prinl X)
(pythonOut "Text strings:" X)
(pythonOut "Normally sorted :" (sort (copy X)))
(pythonOut "Naturally sorted:" (naturalSort X))
(prinl) ) )

Output:

# Ignoring leading spaces
Text strings:
['ignore leading spaces: 2-2',
 ' ignore leading spaces: 2-1',
 '  ignore leading spaces: 2+0',
 '   ignore leading spaces: 2+1']
Normally sorted :
['   ignore leading spaces: 2+1',
 '  ignore leading spaces: 2+0',
 ' ignore leading spaces: 2-1',
 'ignore leading spaces: 2-2']
Naturally sorted:
['  ignore leading spaces: 2+0',
 '   ignore leading spaces: 2+1',
 ' ignore leading spaces: 2-1',
 'ignore leading spaces: 2-2']

# Ignoring multiple adjacent spaces (m.a.s)
Text strings:
['ignore m.a.s spaces: 2-2',
 'ignore m.a.s  spaces: 2-1',
 'ignore m.a.s   spaces: 2+0',
 'ignore m.a.s    spaces: 2+1']
Normally sorted :
['ignore m.a.s    spaces: 2+1',
 'ignore m.a.s   spaces: 2+0',
 'ignore m.a.s  spaces: 2-1',
 'ignore m.a.s spaces: 2-2']
Naturally sorted:
['ignore m.a.s   spaces: 2+0',
 'ignore m.a.s    spaces: 2+1',
 'ignore m.a.s  spaces: 2-1',
 'ignore m.a.s spaces: 2-2']

# Equivalent whitespace characters
Text strings:
['Equiv. spaces: 3-3',
 'Equiv.
spaces: 3-2',
 'Equiv.�cspaces: 3-1',
 'Equiv.�bspaces: 3+0',
 'Equiv.
spaces: 3+1',
 'Equiv.	spaces: 3+2']
Normally sorted :
['Equiv.�cspaces: 3-1',
 'Equiv.	spaces: 3+2',
 'Equiv.
spaces: 3+1',
 'Equiv.�bspaces: 3+0',
 'Equiv.
spaces: 3-2',
 'Equiv. spaces: 3-3']
Naturally sorted:
['Equiv.�bspaces: 3+0',
 'Equiv.�cspaces: 3-1',
 'Equiv.
spaces: 3+1',
 'Equiv.	spaces: 3+2',
 'Equiv.
spaces: 3-2',
 'Equiv. spaces: 3-3']

# Case Indepenent sort
Text strings:
['cASE INDEPENENT: 3-2',
 'caSE INDEPENENT: 3-1',
 'casE INDEPENENT: 3+0',
 'case INDEPENENT: 3+1']
Normally sorted :
['cASE INDEPENENT: 3-2',
 'caSE INDEPENENT: 3-1',
 'casE INDEPENENT: 3+0',
 'case INDEPENENT: 3+1']
Naturally sorted:
['casE INDEPENENT: 3+0',
 'case INDEPENENT: 3+1',
 'caSE INDEPENENT: 3-1',
 'cASE INDEPENENT: 3-2']

# Numeric fields as numerics
Text strings:
['foo100bar99baz0.txt',
 'foo100bar10baz0.txt',
 'foo1000bar99baz10.txt',
 'foo1000bar99baz9.txt']
Normally sorted :
['foo1000bar99baz10.txt',
 'foo1000bar99baz9.txt',
 'foo100bar10baz0.txt',
 'foo100bar99baz0.txt']
Naturally sorted:
['foo100bar10baz0.txt',
 'foo100bar99baz0.txt',
 'foo1000bar99baz9.txt',
 'foo1000bar99baz10.txt']

# Title sorts
Text strings:
['The Wind in the Willows',
 'The 40th step more',
 'The 39 steps',
 'Wanda']
Normally sorted :
['The 39 steps',
 'The 40th step more',
 'The Wind in the Willows',
 'Wanda']
Naturally sorted:
['The 39 steps',
 'The 40th step more',
 'Wanda',
 'The Wind in the Willows']

# Equivalent accented characters (and case)
Text strings:
['Equiv. ý accents: 2-2',
 'Equiv. Ý accents: 2-1',
 'Equiv. y accents: 2+0',
 'Equiv. Y accents: 2+1']
Normally sorted :
['Equiv. Y accents: 2+1',
 'Equiv. y accents: 2+0',
 'Equiv. Ý accents: 2-1',
 'Equiv. ý accents: 2-2']
Naturally sorted:
['Equiv. y accents: 2+0',
 'Equiv. Y accents: 2+1',
 'Equiv. Ý accents: 2-1',
 'Equiv. ý accents: 2-2']

# Character replacements
Text strings:
['Start with an ʒ: 2-2',
 'Start with an ſ: 2-1',
 'Start with an ß: 2+0',
 'Start with an s: 2+1']
Normally sorted :
['Start with an s: 2+1',
 'Start with an ß: 2+0',
 'Start with an ſ: 2-1',
 'Start with an ʒ: 2-2']
Naturally sorted:
['Start with an s: 2+1',
 'Start with an ſ: 2-1',
 'Start with an ʒ: 2-2',
 'Start with an ß: 2+0']

[edit] Python

All eight features:

# -*- coding: utf-8 -*-
# Not Python 3.x (Can't compare str and int)
 
 
from itertools import groupby
from unicodedata import decomposition, name
from pprint import pprint as pp
 
commonleaders = ['the'] # lowercase leading words to ignore
replacements = {u'ß': 'ss', # Map single char to replacement string
u'ſ': 's',
u'ʒ': 's',
}
 
hexdigits = set('0123456789abcdef')
decdigits = set('0123456789') # Don't use str.isnumeric
 
def splitchar(c):
' De-ligature. De-accent a char'
de = decomposition(c)
if de:
# Just the words that are also hex numbers
de = [d for d in de.split()
if all(c.lower()
in hexdigits for c in d)]
n = name(c, c).upper()
# (Gosh it's onerous)
if len(de)> 1 and 'PRECEDE' in n:
# E.g. ʼn LATIN SMALL LETTER N PRECEDED BY APOSTROPHE
de[1], de[0] = de[0], de[1]
tmp = [ unichr(int(k, 16)) for k in de]
base, others = tmp[0], tmp[1:]
if 'LIGATURE' in n:
# Assume two character ligature
base += others.pop(0)
else:
base = c
return base
 
 
def sortkeygen(s):
'''Generate 'natural' sort key for s
 
Doctests:
>>> sortkeygen(' some extra spaces ')
[u'some extra spaces']
>>> sortkeygen('CasE InseNsItIve')
[u'case insensitive']
>>> sortkeygen('The Wind in the Willows')
[u'wind in the willows']
>>> sortkeygen(u'\462 ligature')
[u'ij ligature']
>>> sortkeygen(u'\335\375 upper/lower case Y with acute accent')
[u'yy upper/lower case y with acute accent']
>>> sortkeygen('foo9.txt')
[u'foo', 9, u'.txt']
>>> sortkeygen('x9y99')
[u'x', 9, u'y', 99]
'''

# Ignore leading and trailing spaces
s = unicode(s).strip()
# All space types are equivalent
s = ' '.join(s.split())
# case insentsitive
s = s.lower()
# Title
words = s.split()
if len(words) > 1 and words[0] in commonleaders:
s = ' '.join( words[1:])
# accent and ligatures
s = ''.join(splitchar(c) for c in s)
# Replacements (single char replaced by one or more)
s = ''.join( replacements.get(ch, ch) for ch in s )
# Numeric sections as numerics
s = [ int("".join(g)) if isinteger else "".join(g)
for isinteger,g in groupby(s, lambda x: x in decdigits)]
 
return s
 
def naturalsort(items):
''' Naturally sort a series of strings
 
Doctests:
>>> naturalsort(['The Wind in the Willows','The 40th step more',
'The 39 steps', 'Wanda'])
['The 39 steps', 'The 40th step more', 'Wanda', 'The Wind in the Willows']
 
'''

return sorted(items, key=sortkeygen)
 
if __name__ == '__main__':
import string
 
ns = naturalsort
 
print '\n# Ignoring leading spaces'
txt = ['%signore leading spaces: 2%+i' % (' '*i, i-2) for i in range(4)]
print 'Text strings:'; pp(txt)
print 'Normally sorted :'; pp(sorted(txt))
print 'Naturally sorted:'; pp(ns(txt))
 
print '\n# Ignoring multiple adjacent spaces (m.a.s)'
txt = ['ignore m.a.s%s spaces: 2%+i' % (' '*i, i-2) for i in range(4)]
print 'Text strings:'; pp(txt)
print 'Normally sorted :'; pp(sorted(txt))
print 'Naturally sorted:'; pp(ns(txt))
 
print '\n# Equivalent whitespace characters'
txt = ['Equiv.%sspaces: 3%+i' % (ch, i-3)
for i,ch in enumerate(reversed(string.whitespace))]
print 'Text strings:'; pp(txt)
print 'Normally sorted :'; pp(sorted(txt))
print 'Naturally sorted:'; pp(ns(txt))
 
print '\n# Case Indepenent sort'
s = 'CASE INDEPENENT'
txt = [s[:i].lower() + s[i:] + ': 3%+i' % (i-3) for i in range(1,5)]
print 'Text strings:'; pp(txt)
print 'Normally sorted :'; pp(sorted(txt))
print 'Naturally sorted:'; pp(ns(txt))
 
print '\n# Numeric fields as numerics'
txt = ['foo100bar99baz0.txt', 'foo100bar10baz0.txt',
'foo1000bar99baz10.txt', 'foo1000bar99baz9.txt']
print 'Text strings:'; pp(txt)
print 'Normally sorted :'; pp(sorted(txt))
print 'Naturally sorted:'; pp(ns(txt))
 
print '\n# Title sorts'
txt = ['The Wind in the Willows','The 40th step more',
'The 39 steps', 'Wanda']
print 'Text strings:'; pp(txt)
print 'Normally sorted :'; pp(sorted(txt))
print 'Naturally sorted:'; pp(ns(txt))
 
print '\n# Equivalent accented characters (and case)'
txt = ['Equiv. %s accents: 2%+i' % (ch, i-2)
for i,ch in enumerate(u'\xfd\xddyY')]
print 'Text strings:'; pp(txt)
print 'Normally sorted :'; pp(sorted(txt))
print 'Naturally sorted:'; pp(ns(txt))
 
print '\n# Separated ligatures'
txt = [u'\462 ligatured ij', 'no ligature',]
print 'Text strings:'; pp(txt)
print 'Normally sorted :'; pp(sorted(txt))
print 'Naturally sorted:'; pp(ns(txt))
 
print '\n# Character replacements'
s = u'ʒſßs' # u'\u0292\u017f\xdfs'
txt = ['Start with an %s: 2%+i' % (ch, i-2)
for i,ch in enumerate(s)]
print 'Text strings:'; pp(txt)
print 'Normally sorted :'; print '\n'.join(sorted(txt))
print 'Naturally sorted:'; print '\n'.join(ns(txt))

[edit] Sample Python output

# Ignoring leading spaces
Text strings:
['ignore leading spaces: 2-2',
 ' ignore leading spaces: 2-1',
 '  ignore leading spaces: 2+0',
 '   ignore leading spaces: 2+1']
Normally sorted :
['   ignore leading spaces: 2+1',
 '  ignore leading spaces: 2+0',
 ' ignore leading spaces: 2-1',
 'ignore leading spaces: 2-2']
Naturally sorted:
['  ignore leading spaces: 2+0',
 '   ignore leading spaces: 2+1',
 ' ignore leading spaces: 2-1',
 'ignore leading spaces: 2-2']

# Ignoring multiple adjacent spaces (m.a.s)
Text strings:
['ignore m.a.s spaces: 2-2',
 'ignore m.a.s  spaces: 2-1',
 'ignore m.a.s   spaces: 2+0',
 'ignore m.a.s    spaces: 2+1']
Normally sorted :
['ignore m.a.s    spaces: 2+1',
 'ignore m.a.s   spaces: 2+0',
 'ignore m.a.s  spaces: 2-1',
 'ignore m.a.s spaces: 2-2']
Naturally sorted:
['ignore m.a.s   spaces: 2+0',
 'ignore m.a.s    spaces: 2+1',
 'ignore m.a.s  spaces: 2-1',
 'ignore m.a.s spaces: 2-2']

# Equivalent whitespace characters
Text strings:
['Equiv. spaces: 3-3',
 'Equiv.\rspaces: 3-2',
 'Equiv.\x0cspaces: 3-1',
 'Equiv.\x0bspaces: 3+0',
 'Equiv.\nspaces: 3+1',
 'Equiv.\tspaces: 3+2']
Normally sorted :
['Equiv.\tspaces: 3+2',
 'Equiv.\nspaces: 3+1',
 'Equiv.\x0bspaces: 3+0',
 'Equiv.\x0cspaces: 3-1',
 'Equiv.\rspaces: 3-2',
 'Equiv. spaces: 3-3']
Naturally sorted:
['Equiv.\x0bspaces: 3+0',
 'Equiv.\nspaces: 3+1',
 'Equiv.\tspaces: 3+2',
 'Equiv.\x0cspaces: 3-1',
 'Equiv.\rspaces: 3-2',
 'Equiv. spaces: 3-3']

# Case Indepenent sort
Text strings:
['cASE INDEPENENT: 3-2',
 'caSE INDEPENENT: 3-1',
 'casE INDEPENENT: 3+0',
 'case INDEPENENT: 3+1']
Normally sorted :
['cASE INDEPENENT: 3-2',
 'caSE INDEPENENT: 3-1',
 'casE INDEPENENT: 3+0',
 'case INDEPENENT: 3+1']
Naturally sorted:
['casE INDEPENENT: 3+0',
 'case INDEPENENT: 3+1',
 'caSE INDEPENENT: 3-1',
 'cASE INDEPENENT: 3-2']

# Numeric fields as numerics
Text strings:
['foo100bar99baz0.txt',
 'foo100bar10baz0.txt',
 'foo1000bar99baz10.txt',
 'foo1000bar99baz9.txt']
Normally sorted :
['foo1000bar99baz10.txt',
 'foo1000bar99baz9.txt',
 'foo100bar10baz0.txt',
 'foo100bar99baz0.txt']
Naturally sorted:
['foo100bar10baz0.txt',
 'foo100bar99baz0.txt',
 'foo1000bar99baz9.txt',
 'foo1000bar99baz10.txt']

# Title sorts
Text strings:
['The Wind in the Willows', 'The 40th step more', 'The 39 steps', 'Wanda']
Normally sorted :
['The 39 steps', 'The 40th step more', 'The Wind in the Willows', 'Wanda']
Naturally sorted:
['The 39 steps', 'The 40th step more', 'Wanda', 'The Wind in the Willows']

# Equivalent accented characters (and case)
Text strings:
[u'Equiv. \xfd accents: 2-2',
 u'Equiv. \xdd accents: 2-1',
 u'Equiv. y accents: 2+0',
 u'Equiv. Y accents: 2+1']
Normally sorted :
[u'Equiv. Y accents: 2+1',
 u'Equiv. y accents: 2+0',
 u'Equiv. \xdd accents: 2-1',
 u'Equiv. \xfd accents: 2-2']
Naturally sorted:
[u'Equiv. y accents: 2+0',
 u'Equiv. Y accents: 2+1',
 u'Equiv. \xdd accents: 2-1',
 u'Equiv. \xfd accents: 2-2']

# Separated ligatures
Text strings:
[u'\u0132 ligatured ij', 'no ligature']
Normally sorted :
['no ligature', u'\u0132 ligatured ij']
Naturally sorted:
[u'\u0132 ligatured ij', 'no ligature']

# Character replacements
Text strings:
[u'Start with an \u0292: 2-2',
 u'Start with an \u017f: 2-1',
 u'Start with an \xdf: 2+0',
 u'Start with an s: 2+1']
Normally sorted :
Start with an s: 2+1
Start with an ß: 2+0
Start with an ſ: 2-1
Start with an ʒ: 2-2
Naturally sorted:
Start with an s: 2+1
Start with an ſ: 2-1
Start with an ʒ: 2-2
Start with an ß: 2+0

[edit] Racket

Implements 1-4 (but only normalize spaces -- don't ignore spaces at the beginning/end, easy to implement, but sounds wrong).

 
#lang racket
(define (natural-sort l)
(define (list<? l1 l2)
(cond [(null? l2) #f]
[(null? l1) #t]
[(number? (car l1)) (cond [(< (car l1) (car l2)) #t]
[(< (car l2) (car l1)) #f]
[else (list<? (cdr l1) (cdr l2))])]
[(string? (car l1)) (cond [(string<? (car l1) (car l2)) #t]
[(string<? (car l2) (car l1)) #f]
[else (list<? (cdr l1) (cdr l2))])]))
(define (->keys s)
(define s* (string-normalize-spaces (string-foldcase s)))
(for/list ([x (regexp-match* #px"\\d+" s* #:gap-select? #t)]
[i (in-naturals)])
(if (odd? i) (string->number x) x)))
(sort l list<? #:key ->keys #:cache-keys? #t))
 
(natural-sort
(shuffle '("foo9.txt" "foo10.txt" "x9y99" "x9y100" "x10y0" "x z" "x y")))
;; => '("foo9.txt" "foo10.txt" "x9y99" "x9y100" "x10y0" "x y" "x z")
 

[edit] Ruby

Requirements 1,2,3 and 5 are met in one line of code:

ar.sort_by{|str| str.downcase.gsub(/\Athe |\Aa |\Aan /, "").lstrip.gsub(/\s+/, " ")}

Almost all of the code below is handling requirement 4. The problem is that Ruby will happily sort ["a",1] against ["a",2] or even ["b"], but it does not know how to handle [1, "a"] against ["a", 2] and raises an ArgumentError. The code below does not define a new sort method, it defines a new class which is sortable by the existing method (falling back on string comparison).

class NatSortString 
include Comparable
attr_reader :scrubbed, :ints_and_strings, :i_s_pattern
 
def initialize(str)
@str = str
@scrubbed = str.downcase.gsub(/\Athe |\Aa |\Aan /, "").lstrip.gsub(/\s+/, " ")
@ints_and_strings = @scrubbed.scan(/\d+|\D+/).map{|s| s =~ /\d/ ? s.to_i : s}
@i_s_pattern = @ints_and_strings.map{|el| el.is_a?(Integer) ? :i : :s}.join
end
 
def <=> (other)
if i_s_pattern.start_with?(other.i_s_pattern) or other.i_s_pattern.start_with?(i_s_pattern) then
ints_and_strings <=> other.ints_and_strings
else
scrubbed <=> other.scrubbed
end
end
 
def to_s
@str.dup
end
 
end
 

Demo:

tests = 
{"Ignoring leading spaces" =>
[ "ignore leading spaces: 2-2 ", " ignore leading spaces: 2-1 ", " ignore leading spaces: 2+0 ", " ignore leading spaces: 2+1 "],
"Ignoring multiple adjacent spaces" =>
[ "ignore m.a.s spaces: 2-2 ", "ignore m.a.s spaces: 2-1 ", "ignore m.a.s spaces: 2+0 ", "ignore m.a.s spaces: 2+1 "],
"Equivalent whitespace characters" =>
["Equiv. spaces: 3-3", "Equiv.\rspaces: 3-2", "Equiv.\x0cspaces: 3-1", "Equiv.\x0bspaces: 3+0", "Equiv.\nspaces: 3+1", "Equiv.\tspaces: 3+2"],
"Case Indepenent sort" =>
[ "cASE INDEPENENT: 3-2 ", "caSE INDEPENENT: 3-1 ", "casE INDEPENENT: 3+0 ", "case INDEPENENT: 3+1 "],
"Numeric fields as numerics" =>
[ "foo100bar99baz0.txt ", "foo100bar10baz0.txt ", "foo1000bar99baz10.txt ", "foo1000bar99baz9.txt "],
"Title sorts" =>
[ "The Wind in the Willows ", "The 40th step more ", "The 39 steps ", "Wanda "]}
 
tests.each do |title, ar|
nat_sorts = ar.map{|s| NatSortString.new(s)}
puts [title,"--input--", ar, "--normal sort--", ar.sort, "--natural sort--", nat_sorts.sort, "\n"]
end
 
Output:
Ignoring leading spaces
--input--
ignore leading spaces: 2-2 
 ignore leading spaces: 2-1 
  ignore leading spaces: 2+0 
   ignore leading spaces: 2+1 
--normal sort--
   ignore leading spaces: 2+1 
  ignore leading spaces: 2+0 
 ignore leading spaces: 2-1 
ignore leading spaces: 2-2 
--natural sort--
  ignore leading spaces: 2+0 
   ignore leading spaces: 2+1 
 ignore leading spaces: 2-1 
ignore leading spaces: 2-2 

Ignoring multiple adjacent spaces
--input--
ignore m.a.s spaces: 2-2 
ignore m.a.s  spaces: 2-1 
ignore m.a.s   spaces: 2+0 
ignore m.a.s    spaces: 2+1 
--normal sort--
ignore m.a.s    spaces: 2+1 
ignore m.a.s   spaces: 2+0 
ignore m.a.s  spaces: 2-1 
ignore m.a.s spaces: 2-2 
--natural sort--
ignore m.a.s   spaces: 2+0 
ignore m.a.s    spaces: 2+1 
ignore m.a.s  spaces: 2-1 
ignore m.a.s spaces: 2-2 

Equivalent whitespace characters
--input--
Equiv. spaces: 3-3
spaces: 3-2
Equiv.
      spaces: 3-1
Equiv.
      spaces: 3+0
Equiv.
spaces: 3+1
Equiv.	spaces: 3+2
--normal sort--
Equiv.	spaces: 3+2
Equiv.
spaces: 3+1
Equiv.
      spaces: 3+0
Equiv.
      spaces: 3-1
spaces: 3-2
Equiv. spaces: 3-3
--natural sort--
Equiv.
      spaces: 3+0
Equiv.
spaces: 3+1
Equiv.	spaces: 3+2
Equiv.
      spaces: 3-1
spaces: 3-2
Equiv. spaces: 3-3

Case Indepenent sort
--input--
cASE INDEPENENT: 3-2 
caSE INDEPENENT: 3-1 
casE INDEPENENT: 3+0 
case INDEPENENT: 3+1 
--normal sort--
cASE INDEPENENT: 3-2 
caSE INDEPENENT: 3-1 
casE INDEPENENT: 3+0 
case INDEPENENT: 3+1 
--natural sort--
casE INDEPENENT: 3+0 
case INDEPENENT: 3+1 
caSE INDEPENENT: 3-1 
cASE INDEPENENT: 3-2 

Numeric fields as numerics
--input--
foo100bar99baz0.txt 
foo100bar10baz0.txt 
foo1000bar99baz10.txt 
foo1000bar99baz9.txt 
--normal sort--
foo1000bar99baz10.txt 
foo1000bar99baz9.txt 
foo100bar10baz0.txt 
foo100bar99baz0.txt 
--natural sort--
foo100bar10baz0.txt 
foo100bar99baz0.txt 
foo1000bar99baz9.txt 
foo1000bar99baz10.txt 

Title sorts
--input--
The Wind in the Willows 
The 40th step more 
The 39 steps 
Wanda 
--normal sort--
The 39 steps 
The 40th step more 
The Wind in the Willows 
Wanda 
--natural sort--
The 39 steps 
The 40th step more 
Wanda 
The Wind in the Willows 

[edit] Tcl

Tcl supports two methods of doing sorting by non-natural keys in the lsort command: the -command option allows the specification of code that makes the ordering decision for a pair of values, but instead the code below demonstrates sorting through the use of collation keys, strings that when sorted in their normal order result in the natural order being used. (These are handled through the use of the -indices option which makes it easy to generate a sorted original list without any need to build compound intermediate tuples.)

Note also that Tcl supports case-insensitive sorting and “treat digit sequences as numbers” as native sorting options. (The latter is particularly useful for handling filenames.)

package require Tcl 8.5
 
proc sortWithCollationKey {keyBuilder list} {
if {![llength $list]} return
foreach value $list {
lappend toSort [{*}$keyBuilder $value]
}
foreach idx [lsort -indices $toSort] {
lappend result [lindex $list $idx]
}
return $result
}
proc normalizeSpaces {str} {
regsub -all {[ ]+} [string trim $str " "] " "
}
proc equivalentWhitespace {str} {
regsub -all {\s} $str " "
}
 
proc show {description sorter strings} {
puts "Input:\n\t[join $strings \n\t]"
set sorted [lsort $strings]
puts "Normally sorted:\n\t[join $sorted \n\t]"
set sorted [{*}$sorter $strings]
puts "Naturally sorted with ${description}:\n\t[join $sorted \n\t]"
}
 
# Two demonstrations of the space normalizer
show "normalized spaces" {sortWithCollationKey normalizeSpaces} {
{ignore leading spaces: 2-2}
{ ignore leading spaces: 2-1}
{ ignore leading spaces: 2+0}
{ ignore leading spaces: 2+1}}
show "normalized spaces" {sortWithCollationKey normalizeSpaces} {
{ignore m.a.s spaces: 2-2}
{ignore m.a.s spaces: 2-1}
{ignore m.a.s spaces: 2+0}
{ignore m.a.s spaces: 2+1}}
 
# Use a collation key that maps all whitespace to spaces
show "all whitespace equivalent" {sortWithCollationKey equivalentWhitespace} {
"Equiv. spaces: 3-3"
"Equiv.\rspaces: 3-2"
"Equiv.\u000cspaces: 3-1"
"Equiv.\u000bspaces: 3+0"
"Equiv.\nspaces: 3+1"
"Equiv.\tspaces: 3+2"}
 
# These are built-in modes
show "(built-in) case insensitivity" {lsort -nocase} {
{cASE INDEPENENT: 3-2}
{caSE INDEPENENT: 3-1}
{casE INDEPENENT: 3+0}
{case INDEPENENT: 3+1}}
show "digit sequences as numbers" {lsort -dictionary} {
foo100bar99baz0.txt
foo100bar10baz0.txt
foo1000bar99baz10.txt
foo1000bar99baz9.txt}

Output:

Input:
	ignore leading spaces: 2-2
	 ignore leading spaces: 2-1
	  ignore leading spaces: 2+0
	   ignore leading spaces: 2+1
Normally sorted:
	   ignore leading spaces: 2+1
	  ignore leading spaces: 2+0
	 ignore leading spaces: 2-1
	ignore leading spaces: 2-2
Naturally sorted with normalized spaces:
	  ignore leading spaces: 2+0
	   ignore leading spaces: 2+1
	 ignore leading spaces: 2-1
	ignore leading spaces: 2-2
Input:
	ignore m.a.s spaces: 2-2
	ignore m.a.s  spaces: 2-1
	ignore m.a.s   spaces: 2+0
	ignore m.a.s    spaces: 2+1
Normally sorted:
	ignore m.a.s    spaces: 2+1
	ignore m.a.s   spaces: 2+0
	ignore m.a.s  spaces: 2-1
	ignore m.a.s spaces: 2-2
Naturally sorted with normalized spaces:
	ignore m.a.s   spaces: 2+0
	ignore m.a.s    spaces: 2+1
	ignore m.a.s  spaces: 2-1
	ignore m.a.s spaces: 2-2
Input:
	Equiv. spaces: 3-3
spaces: 3-2iv.
	Equiv.
              spaces: 3-1
	Equiv.
              spaces: 3+0
	Equiv.
spaces: 3+1
	Equiv.	spaces: 3+2
Normally sorted:
	Equiv.	spaces: 3+2
	Equiv.
spaces: 3+1
	Equiv.
              spaces: 3+0
	Equiv.
              spaces: 3-1
spaces: 3-2iv.
	Equiv. spaces: 3-3
Naturally sorted with all whitespace equivalent:
	Equiv.
              spaces: 3+0
	Equiv.
spaces: 3+1
	Equiv.	spaces: 3+2
	Equiv.
              spaces: 3-1
spaces: 3-2iv.
	Equiv. spaces: 3-3
Input:
	cASE INDEPENENT: 3-2
	caSE INDEPENENT: 3-1
	casE INDEPENENT: 3+0
	case INDEPENENT: 3+1
Normally sorted:
	cASE INDEPENENT: 3-2
	caSE INDEPENENT: 3-1
	casE INDEPENENT: 3+0
	case INDEPENENT: 3+1
Naturally sorted with (built-in) case insensitivity:
	casE INDEPENENT: 3+0
	case INDEPENENT: 3+1
	caSE INDEPENENT: 3-1
	cASE INDEPENENT: 3-2
Input:
	foo100bar99baz0.txt
	foo100bar10baz0.txt
	foo1000bar99baz10.txt
	foo1000bar99baz9.txt
Normally sorted:
	foo1000bar99baz10.txt
	foo1000bar99baz9.txt
	foo100bar10baz0.txt
	foo100bar99baz0.txt
Naturally sorted with digit sequences as numbers:
	foo100bar10baz0.txt
	foo100bar99baz0.txt
	foo1000bar99baz9.txt
	foo1000bar99baz10.txt

[edit] zkl

These are all immutable list sorts. If using mutable lists, mangle a copy and sort that.

First, a tag-mangled-fields sort, use tags to pull from the original list in sorted order function:

fcn dsuSort(x,orig){ // decorate-sort-undecorate sort
x.enumerate().sort(fcn([(_,a)],[(_,b)]){a<b})
.apply('wrap([(n,_)]){orig[n]});
}

Now, mangle a copy of the original list to "normalize" it to the task requirements and sort.

# Ignoring leading spaces
ts1:=T("ignore leading spaces: 2-2", " ignore leading spaces: 2-1",
" ignore leading spaces: 2+0", " ignore leading spaces: 2+1");
dsuSort(ts1.apply("strip"),ts1).println();
Output:
"  ignore leading spaces: 2+0"
"   ignore leading spaces: 2+1"
" ignore leading spaces: 2-1"
"ignore leading spaces: 2-2"
# Ignoring multiple adjacent spaces (m.a.s)
ts2:=T("ignore m.a.s spaces: 2-2", "ignore m.a.s spaces: 2-1",
"ignore m.a.s spaces: 2+0", "ignore m.a.s spaces: 2+1");
dsuSort(ts2.apply('-(" ")),ts2).println();
Output:
"ignore m.a.s   spaces: 2+0"
"ignore m.a.s    spaces: 2+1"
"ignore m.a.s  spaces: 2-1"
"ignore m.a.s spaces: 2-2"
# Equivalent whitespace characters
ts3:=T("Equiv. spaces: 3-3", "Equiv.\rspaces: 3-2", "Equiv.\x0cspaces: 3-1",
"Equiv.\x0bspaces: 3+0", "Equiv.\nspaces: 3+1", "Equiv.\tspaces: 3+2");
dsuSort(ts3.apply('-.fp1("\n\r\t\f\b\x0b ")),ts3).println();
Output:
"Equiv.?spaces: 3+0"
"Equiv.?spaces: 3+1"
"Equiv.	spaces: 3+2"
"Equiv.?spaces: 3-1"
"Equiv.?spaces: 3-2"
"Equiv. spaces: 3-3"
# Case Indepenent sort
ts4:=T("cASE INDEPENENT: 3-2", "caSE INDEPENENT: 3-1",
"casE INDEPENENT: 3+0", "case INDEPENENT: 3+1");
dsuSort(ts4.apply("toLower"),ts4).println();
Output:
"casE INDEPENENT: 3+0"
"case INDEPENENT: 3+1"
"caSE INDEPENENT: 3-1"
"cASE INDEPENENT: 3-2"

And the nasty one: multiple numeric fields. Each key is blown apart into a list of ints, where each int is consecutive numeric characters. To compare a pair of fields (ie two lists of ints), walk the two in parallel, stopping when a difference is found. Ugly but it works. Another tactic would be to normalize each field to the same length, concat the fields and use that as the mangled key. But that might have issues if the number of fields differ, the former doesn't care.

# Numeric fields as numerics
fcn fieldize(s){
s.apply(fcn(c){"1234567890".holds(c) and c or "."}).split(".")
.filter().apply("toInt");
}
fcn fcmp(a,b){ // T(key, T(numeric fields)), eg L(0, L(100,99,0))
a[1].zip(b[1]).reduce(fcn(_,[(a,b)]){
if(a==b)return(True); // continue to next field
return(Void.Stop,a<b);
},True);
}
fcn fsort(list){
list.apply(fieldize).enumerate().sort(fcmp)
.apply('wrap([(n,_)]){list[n]});
}
ts5:=T("foo100bar99baz0.txt", "foo100bar10baz0.txt", "foo1000bar99baz10.txt",
"foo1000bar99baz9.txt");
fsort(ts5).println();
 
x:=T("x9y99","foo10.txt","x10y0","foo9.txt","x9y100");
fsort(x).println();
Output:
"foo100bar10baz0.txt"
"foo100bar99baz0.txt"
"foo1000bar99baz9.txt"
"foo1000bar99baz10.txt"

L("foo9.txt","x9y99","x9y100","x10y0","foo10.txt")
Personal tools
Namespaces

Variants
Actions
Community
Explore
Misc
Toolbox