Category talk:Wren-str: Difference between revisions

→‎Source code: Added Str.lastIndexOf method.
(→‎Source code: Added a Str.indexOf method and a Utf8 class.)
(→‎Source code: Added Str.lastIndexOf method.)
 
(28 intermediate revisions by the same user not shown)
Line 1:
===Source code===
 
<langsyntaxhighlight ecmascriptlang="wren">/* Module "str.wren" */
 
/*
Char contains routines to perform various operations on characters.
A 'character' for this purpose is a single Unicode codepoint.
Categorization and casing are supported for characters < 256 (Latin-1) but no higher.
The 'symbol' category includes 'other letter', 'other number' and soft hyphen (ªº¹²³¼½¾¯).
For convenience a string containing more than one character can be passed
as an argument but the methods will only operate on the first character.
*/
Line 18 ⟶ 21:
// Checks if the first character of a string falls into a particular category.
static isAscii(c) { code(c) < 128 }
// Checks if the first character of 'c' is a Latin-1 character (codepoint below 256).
// Relies on the 'code' helper of this class (defined outside this view) to obtain the codepoint.
static isLatin1(c) { code(c) < 256 }
static isControl(c) { (c = code(c)) && (c < 32 || c == 127) }
static isDigit(c) { (c = code(c)) && c >= 48 && c <= 57 }
static isLower(c) { (c = code(c)) && c >= 97 && c <= 122 }
static isUpper(c) { (c = code(c)) && c >= 65 && c <= 90 }
static isPrintable(c) { (c = code(c)) && c >= 32 && c < 127 }
static isSpace(c) { (c = code(c)) && (c == 32 || c == 9 || c == 10 || c == 13) }
static isWhitespace(c) { (c = code(c)) && (c == 32 || (c >= 9 && c <= 13)) }
 
// ASCII categories.
/* Rather than use combinations of the above, these only call the 'code' method once. */
static isDigit(c) { (c = code(c)) && c >= 48 && c <= 57 }
static isAsciiLower(c) { (c = code(c)) && c >= 97 && c <= 122 }
static isAsciiUpper(c) { (c = code(c)) && c >= 65 && c <= 90 }
static isAsciiLetter(c) { isAsciiLower(c) || isAsciiUpper(c) }
static isAsciiAlphaNum(c) { isAsciiLower(c) || isAsciiUpper(c) || isDigit(c) }
static isSpace(c) { (c = code(c)) && (c == 32 || c == 9 || c == 10 || c == 13) }
 
// Latin-1 categories.
// Checks if the first character of 'c' is a Latin-1 lower case letter:
// a-z, the micro sign (181), 223-246 or 248-255 (247 is the division sign).
static isLower(c) {
    var d = code(c)
    return (d >= 97 && d <= 122) || (d == 181) || (d >= 223 && d <= 246) ||
           (d >= 248 && d <= 255)
}
 
// Checks if the first character of 'c' is a Latin-1 upper case letter:
// A-Z, 192-214 or 216-222 (215 is the multiplication sign).
static isUpper(c) {
    var d = code(c)
    return (d >= 65 && d <= 90) || (d >= 192 && d <= 214) || (d >= 216 && d <= 222)
}
 
// Checks if the first character of 'c' is a Latin-1 letter of either case.
static isLetter(c) { isLower(c) || isUpper(c) }
static isAlphaNumeric(c) { isLower(c) || isUpper(c) || isDigit(c) }
 
// Checks if the first character of 'c' is a control character:
// codepoints below 32 or in the range 127..159.
static isControl(c) {
    var d = code(c)
    return d < 32 || (d >= 127 && d < 160)
}
 
// Checks if the first character of 'c' is printable: 32..126 or 160..255.
static isPrintable(c) {
    var cp = code(c)
    if (cp >= 32 && cp < 127) return true
    return cp >= 160 && cp < 256
}
 
// Checks if the first character of 'c' is graphic: 33..126 or 161..255
// (i.e. the printable ranges without their leading space characters 32 and 160).
static isGraphic(c) {
    var cp = code(c)
    if (cp >= 33 && cp < 127) return true
    return cp >= 161 && cp < 256
}
 
// Checks if the first character of 'c' is whitespace:
// space (32), the range 9..13 or the no-break space (160).
static isWhitespace(c) {
    var cp = code(c)
    if (cp == 32 || cp == 160) return true
    return cp >= 9 && cp <= 13
}
static isPunctuation(c) { code(c) && "!\"#\%&'()*,-./:;?@[\\]_{}¡§«¶·»¿".contains(c[0]) }
 
// Checks if the first character of 'c' is a symbol, i.e. a graphic character
// which is neither alphanumeric nor punctuation.
static isSymbol(c) { isGraphic(c) && !isAlphaNumeric(c) && !isPunctuation(c) }
 
// Returns the name of the category into which the first character of 'c' falls:
// "control", "space", "digit", "upper", "lower", "punctuation", "symbol"
// or "non-latin1" for codepoints of 256 and above.
static category(c) {
    var d = code(c)
    return (d <  32)               ? "control"    :
           (d == 32)               ? "space"      :
           (d >= 48 && d <= 57)    ? "digit"      :
           (d >= 65 && d <= 90)    ? "upper"      :
           (d >= 97 && d <= 122)   ? "lower"      :
           (d >= 127 && d <= 159)  ? "control"    :
           (d == 160)              ? "space"      :
           (d == 181)              ? "lower"      :
           (d >= 192 && d <= 214)  ? "upper"      :
           (d >= 216 && d <= 222)  ? "upper"      :
           (d >= 223 && d <= 246)  ? "lower"      :
           (d >= 248 && d <= 255)  ? "lower"      :
           (d >= 256)              ? "non-latin1" :
           // Everything left is a graphic, non-alphanumeric Latin-1 character.
           isPunctuation(c)        ? "punctuation" : "symbol"
}
 
// Returns the first character of a string converted to lower case.
// Latin-1 upper case letters (A-Z, 192-214, 216-222) are shifted up by 32;
// any other character is returned unchanged.
static lower(c) {
    var d = code(c)
    if ((d >= 65 && d <= 90) || (d >= 192 && d <= 214) || (d >= 216 && d <= 222)) {
        return fromCode(d+32)
    }
    return c[0]
}
 
// Returns the first character of a string converted to upper case.
static upper(c) {
    var cp = code(c)
    // Lower case letters with a Latin-1 upper case partner sit exactly 32 above it.
    var ascii = cp >= 97 && cp <= 122
    var latin = (cp >= 224 && cp <= 246) || (cp >= 248 && cp <= 254)
    return (ascii || latin) ? fromCode(cp-32) : c[0]
}
 
// Swaps the case of the first character in a string.
static swapCase(c) {
    var d = code(c)
    // Upper case (A-Z, 192-214, 216-222) becomes lower case.
    if ((d >= 65 && d <= 90) || (d >= 192 && d <= 214) || (d >= 216 && d <= 222)) {
        return fromCode(d+32)
    }
    // Lower case with a Latin-1 upper case partner (a-z, 224-246, 248-254) becomes upper case.
    if ((d >= 97 && d <= 122) || (d >= 224 && d <= 246) || (d >= 248 && d <= 254)) {
        return fromCode(d-32)
    }
    // Anything else has no case to swap.
    return c[0]
}
Line 84 ⟶ 133:
static compare(s1, s2) {
if (s1 == s2) return 0
var cp1 = s1.codePoints.toList
var cp2 = s2.codePoints.toList
var len = (cp1.count <= cp2.count) ? cp1.count : cp2.count
for (i in 0...len) {
Line 95 ⟶ 144:
 
// Checks if a string falls into a particular category.
// Each predicate is false for the empty string and otherwise true only when
// every character of 's' satisfies the named test.

// Tests made directly on codepoints.
static allAscii(s)         { s != "" && s.codePoints.all { |c| c < 128 } }
static allLatin1(s)        { s != "" && s.codePoints.all { |c| c < 256 } }
static allDigits(s)        { s != "" && s.codePoints.all { |c| c >= 48 && c <= 57 } }
static allAsciiLower(s)    { s != "" && s.codePoints.all { |c| c >= 97 && c <= 122 } }
static allAsciiUpper(s)    { s != "" && s.codePoints.all { |c| c >= 65 && c <= 90 } }

// Tests delegated to the matching 'Char' method for each character.
static allAsciiLetters(s)  { s != "" && s.toList.all { |c| Char.isAsciiLetter(c) } }
static allAsciiAlphaNum(s) { s != "" && s.toList.all { |c| Char.isAsciiAlphaNum(c) } }
static allSpace(s)         { s != "" && s.toList.all { |c| Char.isSpace(c) } }
static allLower(s)         { s != "" && s.toList.all { |c| Char.isLower(c) } }
static allUpper(s)         { s != "" && s.toList.all { |c| Char.isUpper(c) } }
static allLetters(s)       { s != "" && s.toList.all { |c| Char.isLetter(c) } }
static allAlphaNumeric(s)  { s != "" && s.toList.all { |c| Char.isAlphaNumeric(c) } }
static allPrintable(s)     { s != "" && s.toList.all { |c| Char.isPrintable(c) } }
static allGraphic(s)       { s != "" && s.toList.all { |c| Char.isGraphic(c) } }
static allWhitespace(s)    { s != "" && s.toList.all { |c| Char.isWhitespace(c) } }
 
// Checks whether a string can be parsed to a number, an integer or a non-integer (float).
Line 119 ⟶ 169:
if (!(s is String)) s = "%(s)"
if (s == "") return s
var cpschars = s.codePoints.toList
forvar (icount in= 0...cpschars.count) {
var ci = cps[i]0
iffor (c >=in 65 && c <= 90s.codePoints) cps[i] = c + 32{
if ((c >= 65 && c <= 90) || (c >= 192 && c <= 214) || (c >= 216 && c <= 222)) {
chars[i] = String.fromCodePoint(c + 32)
}
i = i + 1
}
return cps.reduce("")count {< |acc,1000) c|? accStrs.concat_(chars) +: StringStrs.fromCodePointconcat(c)chars, }1000)
}
 
Line 131 ⟶ 185:
if (!(s is String)) s = "%(s)"
if (s == "") return s
var cpschars = s.codePoints.toList
forvar (icount in= 0...cpschars.count) {
var ci = cps[i]0
iffor (c >=in 97 && c <= 122s.codePoints) cps[i] = c - 32{
if ((c >= 97 && c <= 122) || (c >= 224 && c <= 246) || (c >= 248 && c <= 254)) {
chars[i] = String.fromCodePoint(c - 32)
}
i = i + 1
}
return cps.reduce("")count {< |acc,1000) c|? accStrs.concat_(chars) +: StringStrs.fromCodePointconcat(c)chars, }1000)
}
 
Line 143 ⟶ 201:
if (!(s is String)) s = "%(s)"
if (s == "") return s
var cpschars = s.codePoints.toList
forvar (icount in= 0...cpschars.count) {
var ci = cps[i]0
iffor (c >= 65 && c <=in 90s.codePoints) {
if ((c >= 65 cps[i]&& c <= 90) || (c +>= 192 && c <= 214) || (c >= 216 && c <= 222)) 32{
} else if (c >chars[i] = 97 && String.fromCodePoint(c <=+ 12232) {
} else if ((c cps[i]>= 97 && c <= 122) || (c ->= 224 && c <= 246) 32||
(c >= 248 && c <= 254)) {
chars[i] = String.fromCodePoint(c - 32)
}
i = i + 1
}
return cps.reduce("")count {< |acc,1000) c|? accStrs.concat_(chars) +: StringStrs.fromCodePointconcat(c)chars, }1000)
}
 
Line 159 ⟶ 220:
if (!(s is String)) s = "%(s)"
if (s == "") return s
var cpsstart = (s.codePointsstartsWith("[") && s.toListcount > 1) ? 1 : 0
var startc = (s[start].startsWith("codePoints[") && cps.count > 1) ? 1 : 0]
if ((c >= 97 && c <= 122) || (c >= 224 && c <= 246) || (c >= 248 && c <= 254)) {
var c = cps[start]
if (c >= 97 &&var ccs <= 122String.fromCodePoint(c - 32) {+ s[start+1..-1]
cps[if (start] == c1) cs = "[" -+ 32cs
return cps.reduce("") { |acc, c| acc + String.fromCodePoint(c) }cs
}
return s
Line 174 ⟶ 235:
if (s == "") return s
var words = s.split(" ")
return Strs.join(words.map { |w| capitalize(w) }.join(toList, " ")
}
 
// Removes accents and cedillas from all Latin-1 supplement characters in a string
// and also expands digraphs before returning the result.
static unaccent(s) {
    if (!(s is String)) s = "%(s)"
    if (s == "") return s
    // accented[j] holds every accented form of the plain letter unaccented[j].
    var accented = [
        "àáâãäå", "ÀÁÂÃÄÅ", "ç", "Ç", "ð", "Ð", "èéêë", "ÈÉÊË", "ìíîï", "ÌÍÎÏ",
        "ñ", "Ñ", "òóôõöø", "ÒÓÔÕÖØ", "ùúûü", "ÙÚÛÜ", "ýÿ", "Ý"
    ]
    var unaccented = "aAcCdDeEiInNoOuUyY"
    // Characters which expand to two plain letters.
    var digraphs = { "æ": "ae", "Æ": "AE", "þ": "th", "Þ": "TH", "ß": "ss" }
    var chars = s.toList
    var count = chars.count
    var i = 0
    for (c in s.codePoints) {
        // Only codepoints 0xc0..0xff can be accented letters or digraphs.
        if (c >= 0xc0 && c <= 0xff) {
            var found = false
            for (j in 0...accented.count) {
                if (accented[j].indexOf(chars[i]) >= 0) {
                    chars[i] = unaccented[j]
                    found = true
                    break
                }
            }
            if (!found && digraphs.containsKey(chars[i])) chars[i] = digraphs[chars[i]]
        }
        i = i + 1
    }
    return (count < 1000) ? Strs.concat_(chars) : Strs.concat(chars, 1000)
}
 
Line 183 ⟶ 276:
}
 
// Performs a circular shift of the characters of 's' 'n' places to the left.
// If 'n' is negative performs a circular right shift by '-n' places instead.
// Aborts the fiber if 'n' is not an integer. Strings of fewer than two
// characters are returned unchanged.
static lshift(s, n) {
    if (!(s is String)) s = "%(s)"
    if (!(n is Num) || !n.isInteger) Fiber.abort("'n' must be an integer.")
    var chars = s.toList
    var count = chars.count
    if (count < 2) return s
    if (n < 0) return rshift(s, -n)
    // Shifting by a multiple of the length leaves the string unchanged.
    n = n % count
    if (n == 0) return s
    for (i in 1..n) {
        // Rotate one place: the first character moves to the end.
        var t = chars[0]
        for (j in 0..count-2) chars[j] = chars[j+1]
        chars[-1] = t
    }
    return (count < 1000) ? Strs.concat_(chars) : Strs.concat(chars, 1000)
}
 
// Performs a circular shift of the characters of 's' 'n' places to the right.
// If 'n' is negative performs a circular left shift by '-n' places instead.
// Aborts the fiber if 'n' is not an integer. Strings of fewer than two
// characters are returned unchanged.
static rshift(s, n) {
    if (!(s is String)) s = "%(s)"
    if (!(n is Num) || !n.isInteger) Fiber.abort("'n' must be an integer.")
    var chars = s.toList
    var count = chars.count
    if (count < 2) return s
    if (n < 0) return lshift(s, -n)
    // Shifting by a multiple of the length leaves the string unchanged.
    n = n % count
    if (n == 0) return s
    for (i in 1..n) {
        // Rotate one place: the last character moves to the front.
        var t = chars[-1]
        for (j in count-2..0) chars[j+1] = chars[j]
        chars[0] = t
    }
    return (count < 1000) ? Strs.concat_(chars) : Strs.concat(chars, 1000)
}
 
// Convenience versions of the above methods which shift by just 1 place.
static lshift(s) { lshift(s, 1) }
static rshift(s) { rshift(s, 1) }
 
/* The indices (or ranges thereof) for all the following functions are measured in codepoints
(not bytes). Negative indices count backwards from the end of the string. As with core
library methods, the indices must be within bounds or errors will be generated. */
 
// Extracts the sub-string of 's' over the range 'r'.
Line 214 ⟶ 326:
if (!(r is Range)) Fiber.abort("Second argument must be a range.")
if (!(s is String)) s = "%(s)"
return Strs.concat(s.toList[r].join()
}
 
// Private helper method to check whether an index is valid.
// Aborts the fiber unless 'index' is an integer in the half-open range
// [-(s.count + inc), s.count + inc). 'inc' widens the range, e.g. for insertions.
static checkIndex_(s, index, inc) {
    if (!(index.type == Num && index.isInteger)) Fiber.abort("Index must be an integer.")
    var limit = s.count + inc
    if (index < -limit || index >= limit) Fiber.abort("Index is out of bounds.")
}
 
// Gets the character of 's' at index 'i'. Throws an error if 'i is out of bounds.
// A negative 'i' counts backwards from the end of 's'.
// Aborts the fiber if 'i' is not an integer or is out of bounds.
static get(s, i) {
    if (!(s is String)) s = "%(s)"
    checkIndex_(s, i, 0)
    // Convert a negative index into the equivalent offset from the start.
    if (i < 0) i = s.count + i
    return s.toList[i]
}
Line 226 ⟶ 346:
// Gets the character of 's' at index 'i'. Returns null if 'i is out of bounds.
// A negative 'i' counts backwards from the end of 's'.
// Aborts the fiber only if 'i' is not an integer.
static getOrNull(s, i) {
    if (!(s is String)) s = "%(s)"
    if (!(i is Num && i.isInteger)) Fiber.abort("Index must be an integer.")
    if (i < 0) i = s.count + i
    return (i >= 0 && i < s.count) ? s.toList[i] : null
}
Line 246 ⟶ 367:
cpCount = cpCount + 1
}
}
 
// Returns the codepoint index (not byte index) at which 'search' first occurs in 's'
// or -1 if 'search' is not found, starting from codepoint offset 'start'.
static indexOf(s, search, start) {
    // Search only the part of 's' from 'start' onwards, then re-base the result.
    var haystack = s
    if (start > 0) haystack = Str.sub(s, start..-1)
    var found = Str.indexOf(haystack, search)
    if (found < 0) return -1
    return start + found
}
 
// Returns the codepoint index (not byte index) at which 'search' last occurs in 's'
// or -1 if 'search' is not found.
// 'search' may consist of more than one character.
// Returns -1 if either 's' or 'search' is empty.
// Aborts the fiber if 'search' is not a string.
static lastIndexOf(s, search) {
    if (!(search is String)) Fiber.abort("Search argument must be a string.")
    if (!(s is String)) s = "%(s)"
    var chars = s.toList
    var target = search.toList
    var n = target.count
    if (n == 0) return -1
    // Try each candidate start position from the right-most one downwards.
    // A 'while' loop is used because a range such as -1..0 would iterate upwards.
    var i = chars.count - n
    while (i >= 0) {
        var j = 0
        while (j < n && chars[i + j] == target[j]) j = j + 1
        if (j == n) return i
        i = i - 1
    }
    return -1
}
 
// Returns the number of non-overlapping occurrences of the string 't'
// within the string 's'.
static occurs(s, t) {
    // Splitting on 't' yields one more piece than there are occurrences.
    var pieces = s.split(t)
    return pieces.count - 1
}
 
// Returns the number of non-overlapping occurrences of the string 't'
// within the string 's' starting from codepoint offset 'start'.
static occurs(s, t, start) {
    // Drop the first 'start' codepoints (if any) and count within the remainder.
    var rest = (start == 0) ? s : Str.sub(s, start..-1)
    return occurs(rest, t)
}
 
// Changes the character of 's' at index 'i' to the string 't'.
// A negative 'i' counts backwards from the end of 's'.
// Aborts the fiber if 't' is not a string or 'i' is not a valid index.
static change(s, i, t) {
    if (!(t is String)) Fiber.abort("Replacement must be a string.")
    if (!(s is String)) s = "%(s)"
    checkIndex_(s, i, 0)
    if (i < 0) i = s.count + i
    var chars = s.toList
    chars[i] = t
    return Strs.concat(chars)
}
 
// Inserts at index 'i' of 's' the string 't'.
// 'i' may equal the length of 's' (to append). A negative 'i' counts backwards
// from the end, with -1 meaning the position after the last character.
// Aborts the fiber if 't' is not a string or 'i' is not a valid index.
static insert(s, i, t) {
    if (!(t is String)) Fiber.abort("Insertion must be a string.")
    if (!(s is String)) s = "%(s)"
    // One extra position is valid here compared with a plain character index.
    checkIndex_(s, i, 1)
    if (i < 0) i = s.count + i + 1
    var chars = s.toList
    chars.insert(i, t)
    return Strs.concat(chars)
}
 
// Deletes the character of 's' at index 'i'.
// A negative 'i' counts backwards from the end of 's'.
// Aborts the fiber if 'i' is not a valid index.
static delete(s, i) {
    if (!(s is String)) s = "%(s)"
    checkIndex_(s, i, 0)
    if (i < 0) i = s.count + i
    var chars = s.toList
    chars.removeAt(i)
    return Strs.concat(chars)
}
 
// Exchanges the characters of 's' at indices 'i' and 'j'
// Negative indices count backwards from the end of 's'.
// Aborts the fiber if either index is not valid.
static exchange(s, i, j) {
    if (!(s is String)) s = "%(s)"
    checkIndex_(s, i, 0)
    if (i < 0) i = s.count + i
    checkIndex_(s, j, 0)
    if (j < 0) j = s.count + j
    // Nothing to do when both indices refer to the same character.
    if (i == j) return s
    var chars = s.toList
    chars.swap(i, j)
    return Strs.concat(chars)
}

// Returns 's' with 'from' replaced by 'to' up to 'n' times (all times if n is negative)
// but skipping the first 'skip' matches.
// Matches are numbered from 1; those numbered skip+1 to n inclusive are replaced,
// so nothing is replaced when 'skip' >= 'n'.
// Aborts the fiber if 'from' or 'to' is not a string, 'n' is not an integer
// or 'skip' is not a non-negative integer.
static replace(s, from, to, n, skip) {
    if (!(from is String)) Fiber.abort("'from' must be a string.")
    if (!(to is String)) Fiber.abort("'to' must be a string.")
    if (!(n is Num && n.isInteger)) Fiber.abort("'n' must be an integer.")
    if (!(skip is Num && skip.isInteger && skip >= 0)) {
        Fiber.abort("'skip' must be a non-negative integer.")
    }
    if (!(s is String)) s = "%(s)"
    if (n < 0) {
        // Replace everything: use the built-in method when nothing is skipped.
        if (skip == 0) return s.replace(from, to)
        n = Num.maxSafeInteger
    }
    if (n == 0 || skip >= n) return s
    var count = 0
    var split = s.split(from)
    var res = ""
    // Re-join the pieces, choosing for each gap either the original or the new text.
    for (i in 0...split.count-1) {
        count = count + 1
        res = res + split[i] + ((count <= skip || count > n) ? from : to)
    }
    return res + split[-1]
}
 
// Convenience version of 'replace' where 'skip' is always zero.
static replace(s, from, to, n) { replace(s, from, to, n, 0) }
 
// Adds 'by' to the start of each line of 's'
// and returns the result.
static indent(s, by) {
    if (!(s is String)) Fiber.abort("First argument must be a string.")
    if (!(by is String)) Fiber.abort("Second argument must be a string.")
    // Prefix every line (including empty ones) and put the lines back together.
    var out = []
    for (line in s.split("\n")) out.add(by + line)
    return out.join("\n")
}
 
// Removes 'by' from the start of each line of 's' which begins with it
// and returns the result.
static dedent(s, by) {
    if (!(s is String)) Fiber.abort("First argument must be a string.")
    if (!(by is String)) Fiber.abort("Second argument must be a string.")
    // String subscripts are byte based, so the prefix length is measured in bytes.
    var prefixBytes = by.bytes.count
    var out = []
    for (line in s.split("\n")) {
        if (line.startsWith(by)) {
            out.add(line[prefixBytes..-1])
        } else {
            out.add(line)
        }
    }
    return out.join("\n")
}
 
// Removes all spaces and tabs from the end of each line of s
// and returns the result.
static tidy(s) {
    if (!(s is String)) Fiber.abort("Argument must be a string.")
    // Strip trailing spaces and tabs line by line.
    var out = []
    for (line in s.split("\n")) out.add(line.trimEnd(" \t"))
    return out.join("\n")
}
 
// Returns 's' repeated 'reps' times.
static repeat(s, reps) {
    if (!(s is String)) s = "%(s)"
    if (!(reps is Num && reps.isInteger && reps >= 0)) {
        Fiber.abort("Repetitions must be a non-negative integer.")
    }
    var out = ""
    // Few repetitions: plain repeated concatenation.
    if (reps < 10) {
        var k = 0
        while (k < reps) {
            out = out + s
            k = k + 1
        }
        return out
    }
    // Many repetitions: walk the bits of 'reps', doubling the block each step.
    var block = s
    var left = reps
    while (left != 0) {
        if (left % 2 == 1) out = out + block
        left = left >> 1
        if (left != 0) block = block + block
    }
    return out
}
 
Line 311 ⟶ 544:
if (final > 0) res.add(sub(s, first..-1))
return res
}
 
// Splits 's' into a list of one or more strings separated by 'sep' but removes
// any empty elements from the list.
static splitNoEmpty(s, sep) {
    if (!(s is String)) s = "%(s)"
    if (!(sep is String) || sep.isEmpty) Fiber.abort("Separator must be a non-empty string.")
    // Keep only the pieces which actually contain something.
    var kept = []
    for (piece in s.split(sep)) {
        if (!piece.isEmpty) kept.add(piece)
    }
    return kept
}
 
// Splits a CSV 'line' into a list of one or more strings separated by 'sep' which must be
// a single character (except \v). Deals properly with embedded separators in quoted fields.
// Removes leading and trailing quotes from quoted fields if 'dequote' is true.
static splitCsv(line, sep, dequote) {
// Approach: split naively on 'sep', find the separators which fall inside a quoted
// field, overwrite those with "\v" in a character list, then re-split and restore them.
if (!(line is String)) line = "%(line)"
if (!(sep is String) || sep.count != 1) {
Fiber.abort("Separator must be a single character string.")
}
if (!(dequote is Bool)) Fiber.abort("Dequote must be a boolean.")
var fields = line.split(sep)
// 'count' tracks the index within 'chars' of the separator which follows the current field.
var count = 0
// 'quoted' is true while inside a quoted field that was cut by the naive split.
var quoted = false
var chars = line.toList
for (i in 0...fields.count) {
var f = fields[i]
var fc = f.count
if (fc > 0) {
count = count + fc
if (!quoted && f[0] == "\"") {
// Field opens with a quote; if it doesn't also end with one, the separator
// after it belongs to the quoted text.
if (f[-1] != "\"") {
quoted = true
// NOTE(review): for the last field 'count' equals chars.count, so an unterminated
// quote there looks like it would index past the end of 'chars' - confirm.
chars[count] = "\v"
}
} else if (quoted && f[-1] == "\"") {
// Closing quote reached; the following separator is a real one.
quoted = false
} else if (quoted) {
chars[count] = "\v"
}
} else if (quoted) {
// Empty piece inside a quoted field, i.e. consecutive embedded separators.
chars[count] = "\v"
}
// Step over the separator itself.
count = count + 1
}
// Re-split now that embedded separators are masked, then unmask them.
fields = chars.join("").split(sep)
for (i in 0...fields.count) fields[i] = fields[i].replace("\v", sep)
if (dequote) {
for (i in 0...fields.count) {
var f = fields[i]
var fc = f.count
// A field needs at least two characters to carry a pair of quotes.
if (fc < 2) continue
if (f[0] == "\"" && f[-1] == "\"") fields[i] = f[1...-1]
}
}
return fields
}
 
// Convenience versions of the above method which use default parameters.
static splitCsv(line, sep) { splitCsv(line, sep, true) }
static splitCsv(line) { splitCsv(line, ",", true) }
 
// Splits a string 's' into two parts, before and after the first occurrence
// of 'delim' and returns a list of those parts.
// The 'delim' itself can be optionally included in the second part.
// If 'delim' does not occur in 's', returns [s, ""].
static bisect(s, delim, include) {
    if (!(delim is String)) Fiber.abort("Delimiter must be a string.")
    if (!(include is Bool)) Fiber.abort("Include must be true or false.")
    if (!(s is String)) s = "%(s)"
    // 'at' is a byte offset, as are the string subscripts used below.
    var at = s.indexOf(delim)
    if (at == -1) return [s, ""]
    var head = s[0...at]
    // Start the second part either at the delimiter or just after it.
    var tailStart = include ? at : at + delim.bytes.count
    return [head, s[tailStart..-1]]
}
 
// Convenience version of bisect method which never includes the delimiter.
static bisect(s, delim) { bisect(s, delim, false) }
 
// Creates and returns a string from a list of bytes.
static fromBytes(ba) {
    if (!(ba is List)) Fiber.abort("Argument must be list of bytes.")
    var total = ba.count
    if (total == 0) return ""
    // Turn each byte into a one-byte string, then concatenate them.
    var pieces = []
    for (b in ba) pieces.add(String.fromByte(b))
    if (total < 1000) return Strs.concat_(pieces)
    return Strs.concat(pieces, 1000)
}
 
// Creates and returns a string from a list of code points.
static fromCodePoints(ca) {
    if (!(ca is List)) Fiber.abort("Argument must be list of code points.")
    var total = ca.count
    if (total == 0) return ""
    // Turn each code point into a one-character string, then concatenate them.
    var pieces = []
    for (cp in ca) pieces.add(String.fromCodePoint(cp))
    if (total < 1000) return Strs.concat_(pieces)
    return Strs.concat(pieces, 1000)
}
 
// After trimming whitespace from the string 's', takes as many characters as possible
// to form a valid number and converts it thereto using the Num.fromString method.
// Returns null if such a conversion is impossible.
static toNum(s) {
    if (s is Num) return s
    if (!(s is String)) s = "%(s)"
    s = s.trim()
    var whole = Num.fromString(s)
    if (whole) return whole
    if (s.count < 2) return null
    // Keep dropping the last character until what remains parses as a number
    // or only a single (unparseable) character is left.
    var chars = s.toList
    while (chars.count > 1) {
        chars.removeAt(-1)
        var parsed = Num.fromString(chars.join())
        if (parsed) return parsed
    }
    return null
}
 
// Converts a pattern into a list of tokens for processing by the 'isMatch' method.
// Characters within the pattern are represented as follows:
// Non-wildcard characters as themselves (i.e. single character strings);
// * (or **) by the number 0;
// ? (or *?) by the number 1;
// [set] by a list of the tokens within the set:
// single characters by themselves;
// a range of characters, a-b, by a Range of codepoints from 'a' to 'b'.
// If the first character of the set is '!' then the number -1 is inserted
// as a separate token immediately before the list.
static tokenize(pattern) {
// NOTE(review): the loop bound is 'pattern.count' but characters are fetched with the
// subscript operator; this presumably assumes an ASCII-only pattern - confirm for
// patterns containing multi-byte characters.
var tokens = []
var i = 0
var j
while (i < pattern.count) {
var c = pattern[i]
if (c == "*") {
// Collapse a run of '*' into a single 0 token.
if (i == 0 || tokens[-1] != 0) tokens.add(0)
} else if (c == "?") {
// '*?' is reduced to a single 1 token by overwriting the preceding 0.
if (i > 0 && tokens[-1] == 0) tokens[-1] = 1 else tokens.add(1)
} else if (c == "[") {
if (i == pattern.count - 1) {
// A trailing '[' cannot open a set, so it is kept as a literal.
tokens.add(c)
} else if ((j = indexOf(pattern, "]", i + 1)) == -1) {
// No closing ']' anywhere after it: again a literal '['.
tokens.add(c)
} else {
// 'l' collects the tokens of the set; 's' is the text between the brackets.
var l = []
var s = sub(pattern, i+1...j)
var k = 0
while (k < s.count) {
var d = s[k]
if (d == "!") {
// A leading '!' negates the set (emitted as -1 before the list); elsewhere it is literal.
if (k == 0) tokens.add(-1) else l.add(d)
} else if (k < s.count - 2 && s[k+1] == "-") {
// 'a-b' becomes a Range of codepoints; skip over the '-' and the end character.
l.add(d.codePoints[0]..s[k+2].codePoints[0])
k = k + 2
} else {
l.add(d)
}
k = k + 1
}
if (l.count == 0) Fiber.abort("set cannot be empty.")
tokens.add(l)
// Jump to the closing ']'; the increment below then moves past it.
i = i + s.count + 1
}
} else {
tokens.add(c)
}
i = i + 1
}
return tokens
}
 
// Returns whether a string 's' matches a 'pattern' which may already be tokenized
// if many strings are to be matched. Matching is case sensitive.
// Patterns may contain the following wildcards:
// * (or **) matches zero or more characters until the next token (if any) matches
// and doesn't backtrack in the event of subsequent failure;
// ? (or *?) matches exactly one character;
// [set] matches a single character from the set within the brackets e.g. [aeiou].
// The set can also contain ranges of characters separated by '-' e.g. [a-zA-Z].
// If the first character of the set is '!' then only characters NOT within the rest
// of the set are matched e.g. [!0-9] matches any character other than a digit.
static isMatch(s, pattern) {
// NOTE(review): 's' is scanned with 's.count' and the subscript operator together;
// this presumably assumes ASCII-only input - confirm for multi-byte characters.
var tokens = pattern
if (tokens is String) tokens = tokenize(tokens)
if (!((tokens is List) && tokens.count > 0)) {
Fiber.abort("'pattern' must be a non-empty string or list of tokens.")
}
// 'i' indexes the string and 'j' the token list.
var i = 0
var j = 0
// 'star' is set while a '*' is consuming characters until the next token matches.
var star = false
// 'neg' is set when the next set token is to be negated.
var neg = false
while (i < s.count && j < tokens.count) {
var c = s[i]
var t = tokens[j]
if (t is Num) {
if (t == 0) {
// '*': consumes nothing itself, it only switches on skipping mode.
star = true
} else if (t == 1) {
// '?': consume exactly one character, whatever it is.
i = i + 1
star = false
} else if (t == -1) {
// Negation marker which applies to the set token that follows.
neg = true
} else {
Fiber.abort("'%(t)' is not a recognized token.")
}
j = j + 1
} else if (t is String) {
// Literal character: must match unless a '*' is still skipping.
if (!star && c != t) return false
if (star && c == t) star = false
i = i + 1
// While still skipping, stay on the same token.
if (!star) j = j + 1
} else if (t is List) {
// Set: 'matched' records whether 'c' equals a member or lies in one of the ranges.
var matched = false
for (e in t) {
if (e is String) {
if (e == c) {
matched = true
break
}
} else if (e is Range){
var cp = c.codePoints[0]
if (cp >= e.from && cp <= e.to) {
matched = true
break
}
} else {
Fiber.abort("'%(e)' is not a recognized token within a set.")
}
}
if (!star && !neg && !matched) return false
if (!star && neg && matched) return false
if (star && matched) star = false
i = i + 1
neg = false
if (!star) j = j + 1
} else {
Fiber.abort("'%(t)' is not a recognized token.")
}
}
// Success if both string and tokens are exhausted, or if the tokens are exhausted
// (or only the final one remains) and that final token is a '*'.
if (i == s.count && j == tokens.count) return true
if (j == tokens.count && tokens[-1] == 0) return true
if (j == tokens.count - 1 && tokens[-1] == 0) return true
return false
}
}
 
/*
Strs contains routines applicable to lists of strings.
*/
class Strs {
    // Private helper for 'concat': appends every element of 'ls' to an empty string.
    static concat_(ls) {
        var acc = ""
        for (item in ls) acc = acc + item
        return acc
    }

    // Returns the strings in the list 'ls' concatenated together, working in
    // chunks of 'chunkSize' elements. Only the list itself, the chunk size and the
    // type of the first element are checked.
    static concat(ls, chunkSize) {
        if (!(ls is List)) Fiber.abort("First argument must be a list of strings.")
        if (chunkSize.type != Num || !chunkSize.isInteger || chunkSize < 1) {
            Fiber.abort("Second argument must be a positive integer.")
        }
        var total = ls.count
        if (total == 0) return ""
        if (ls[0].type != String) Fiber.abort("First argument must be a list of strings.")
        // Fewer elements than one chunk: no need to slice.
        if (total < chunkSize) return concat_(ls)
        var out = ""
        var from = 0
        while (from < total) {
            // The final chunk may be shorter than 'chunkSize'.
            var to = from + chunkSize
            if (to > total) to = total
            out = out + concat_(ls[from...to])
            from = to
        }
        return out
    }

    // Convenience version of the above which uses a 'chunkSize' of 1000.
    static concat(ls) { concat(ls, 1000) }

    // Private helper for 'join': appends every element of 'ls' to an empty string,
    // placing 'sep' between consecutive elements.
    static join_(ls, sep) {
        var acc = ""
        var needSep = false
        for (item in ls) {
            if (needSep) acc = acc + sep
            acc = acc + item
            needSep = true
        }
        return acc
    }

    // Returns the strings in the list 'ls' joined together using the separator 'sep',
    // working in chunks of 'chunkSize' elements. An empty separator delegates to 'concat'.
    // Only the list itself, the separator, the chunk size and the type of the first
    // element are checked.
    static join(ls, sep, chunkSize) {
        if (!(ls is List)) Fiber.abort("First argument must be a list of strings.")
        if (sep.type != String) Fiber.abort("Second argument must be a string")
        if (sep == "") return concat(ls, chunkSize)
        if (chunkSize.type != Num || !chunkSize.isInteger || chunkSize < 1) {
            Fiber.abort("Third argument must be a positive integer.")
        }
        var total = ls.count
        if (total == 0) return ""
        if (ls[0].type != String) Fiber.abort("First argument must be a list of strings.")
        // Fewer elements than one chunk: no need to slice.
        if (total < chunkSize) return join_(ls, sep)
        var out = ""
        var from = 0
        while (from < total) {
            // Chunks after the first must themselves be separated from what came before.
            if (from > 0) out = out + sep
            var to = from + chunkSize
            if (to > total) to = total
            out = out + join_(ls[from...to], sep)
            from = to
        }
        return out
    }

    // Convenience version of the above which uses a 'chunkSize' of 1000.
    static join(ls, sep) { join(ls, sep, 1000) }
}
/*
Utf8 contains routines which are specific to the UTF-8 encoding of a string's bytes or codepoints.
Line 349 ⟶ 911:
return (b0 & b4Mask) << 18 | (b[1] & mbMask) << 12 | (b[2] & mbMask) << 6 | (b[3] & mbMask)
}
}
 
/* The next four methods extend the casing performed by the corresponding 'Str' methods to include
Latin Extended-A, parts of Latin Extended-B, Latin Extended Additional, Greek, Cyrillic,
Armenian and Georgian. */
 
// Converts a UTF-8 string to lower case.
// Non-string arguments are first converted to their string representation.
// Each codepoint is tested against a chain of ranges; the first matching branch
// decides the replacement and unmatched codepoints are left unchanged.
static lower(s) {
    if (!(s is String)) s = "%(s)"
    if (s == "") return s
    var chars = s.toList
    var count = chars.count
    var i = 0
    for (c in s.codePoints) {
        if ((c >= 65 && c <= 90) || (c >= 192 && c <= 214) || (c >= 216 && c <= 222)) {
            chars[i] = String.fromCodePoint(c + 32)
        } else if (c < 256) {
            // catch other Latin-1 characters quickly.
        } else if ((c >= 0x0100 && c <= 0x0136) && (c % 2 == 0)) {
            // Latin Extended-A/B: upper and lower case forms alternate.
            chars[i] = String.fromCodePoint(c + 1)
        } else if ((c >= 0x0139 && c <= 0x0147) && (c % 2 == 1)) {
            chars[i] = String.fromCodePoint(c + 1)
        } else if ((c >= 0x014A && c <= 0x0176) && (c % 2 == 0)) {
            chars[i] = String.fromCodePoint(c + 1)
        } else if (c == 0x0178) {
            chars[i] = "ÿ"
        } else if (c == 0x0179 || c == 0x017B || c == 0x017D ||
                   c == 0x01A0 || c == 0x01AF || c == 0x01F4) {
            chars[i] = String.fromCodePoint(c + 1)
        } else if (c == 0x01C4 || c == 0x01C7 || c == 0x01CA || c == 0x01F1) {
            // Upper case digraphs: the lower case form is two places on.
            chars[i] = String.fromCodePoint(c + 2)
        } else if (c == 0x01C5 || c == 0x01C8 || c == 0x01CB || c == 0x01F2) {
            // Title case digraphs: the lower case form is one place on.
            chars[i] = String.fromCodePoint(c + 1)
        } else if ((c >= 0x01DE && c <= 0x01EE) && (c % 2 == 0)) {
            chars[i] = String.fromCodePoint(c + 1)
        } else if ((c >= 0x01F8 && c <= 0x021E) && (c % 2 == 0)) {
            chars[i] = String.fromCodePoint(c + 1)
        } else if ((c >= 0x1E00 && c <= 0x1E94) && (c % 2 == 0)) {
            // Latin Extended Additional.
            chars[i] = String.fromCodePoint(c + 1)
        } else if (c == 0x1E9E) {
            chars[i] = "ß"
        } else if ((c >= 0x1EA0 && c <= 0x1EFE) && (c % 2 == 0)) {
            chars[i] = String.fromCodePoint(c + 1)
        } else if (c == 0x0386) {
            // Greek.
            chars[i] = "ά"
        } else if (c == 0x0388 || c == 0x0389 || c == 0x038A) {
            chars[i] = String.fromCodePoint(c + 37)
        } else if (c == 0x038C) {
            chars[i] = "ό"
        } else if (c == 0x038E || c == 0x038F) {
            chars[i] = String.fromCodePoint(c + 63)
        } else if (c >= 0x0391 && c <= 0x03A1) {
            chars[i] = String.fromCodePoint(c + 32)
        } else if (c == 0x03A3) {
            // Sigma takes its final form when it is the last character of the string.
            chars[i] = (i == count - 1) ? "ς" : "σ"
        } else if (c >= 0x03A4 && c <= 0x03AB) {
            chars[i] = String.fromCodePoint(c + 32)
        } else if (c >= 0x0400 && c <= 0x040F) {
            // Cyrillic 0x0400..0x040F pair with 0x0450..0x045F. The upper bound must
            // stop at 0x040F so that 0x0410..0x041F reach the next branch.
            chars[i] = String.fromCodePoint(c + 80)
        } else if (c >= 0x0410 && c <= 0x042F) {
            // Basic Cyrillic alphabet: 0x0410..0x042F pair with 0x0430..0x044F.
            chars[i] = String.fromCodePoint(c + 32)
        } else if ((c >= 0x048A && c <= 0x04BE) && (c % 2 == 0)) {
            chars[i] = String.fromCodePoint(c + 1)
        } else if ((c >= 0x04C1 && c <= 0x04CD) && (c % 2 == 1)) {
            chars[i] = String.fromCodePoint(c + 1)
        } else if ((c >= 0x04D0 && c <= 0x052E) && (c % 2 == 0)) {
            chars[i] = String.fromCodePoint(c + 1)
        } else if (c >= 0x0531 && c <= 0x0556) {
            // Armenian.
            chars[i] = String.fromCodePoint(c + 48)
        } else if (c >= 0x10A0 && c <= 0x10C5) {
            // Georgian.
            chars[i] = String.fromCodePoint(c + 48)
        }
        i = i + 1
    }
    return (count < 1000) ? Strs.concat_(chars) : Strs.concat(chars, 1000)
}
 
// Converts a UTF-8 string to upper case.
static upper(s) {
    // Returns a copy of 's' in which every supported lower case character has been
    // replaced by its upper case equivalent. A non-string argument is first converted
    // to its string representation. Characters with no mapping below are left unchanged.
    // The three title case digraphs (Dž, Lj, Nj, Dz) are mapped to their upper case forms.
    if (!(s is String)) s = "%(s)"
    if (s == "") return s
    var chars = s.toList
    var count = chars.count
    var i = 0  // index into 'chars' of the code point currently being examined
    for (c in s.codePoints) {
        if ((c >= 97 && c <= 122) || (c >= 224 && c <= 246) || (c >= 248 && c <= 254)) {
            // ASCII a-z and the Latin-1 lower case letters (247 is the division sign).
            chars[i] = String.fromCodePoint(c - 32)
        } else if (c == 223) {
            chars[i] = "ẞ"
        } else if (c == 255) {
            // ÿ (U+00FF) pairs with Ÿ (U+0178); Ŷ (U+0176) is the upper case of ŷ (U+0177).
            chars[i] = "\u0178"
        } else if (c < 255) {
            // catch other Latin-1 characters quickly.
        } else if ((c >= 0x0101 && c <= 0x0137) && (c % 2 == 1)) {
            // NOTE(review): this range also maps dotless ı (U+0131) to İ (U+0130), whereas
            // Unicode upper-cases it to 'I' - confirm that this pairing is intended.
            chars[i] = String.fromCodePoint(c - 1)
        } else if ((c >= 0x013A && c <= 0x0148) && (c % 2 == 0)) {
            chars[i] = String.fromCodePoint(c - 1)
        } else if ((c >= 0x014B && c <= 0x0177) && (c % 2 == 1)) {
            chars[i] = String.fromCodePoint(c - 1)
        } else if (c == 0x017A || c == 0x017C || c == 0x017E ||
                   c == 0x01A1 || c == 0x01B0 || c == 0x01F5) {
            chars[i] = String.fromCodePoint(c - 1)
        } else if (c == 0x01C5 || c == 0x01C8 || c == 0x01CB || c == 0x01F2) {
            // title case digraph -> upper case digraph
            chars[i] = String.fromCodePoint(c - 1)
        } else if (c == 0x01C6 || c == 0x01C9 || c == 0x01CC || c == 0x01F3) {
            // lower case digraph -> upper case digraph (skipping the title case form)
            chars[i] = String.fromCodePoint(c - 2)
        } else if ((c >= 0x01DF && c <= 0x01EF) && (c % 2 == 1)) {
            chars[i] = String.fromCodePoint(c - 1)
        } else if ((c >= 0x01F9 && c <= 0x021F) && (c % 2 == 1)) {
            chars[i] = String.fromCodePoint(c - 1)
        } else if ((c >= 0x1E01 && c <= 0x1E95) && (c % 2 == 1)) {
            chars[i] = String.fromCodePoint(c - 1)
        } else if ((c >= 0x1EA1 && c <= 0x1EFF) && (c % 2 == 1)) {
            // Capital sharp s (U+1E9E) is already upper case; being even it does not
            // match here or anywhere else, so it is deliberately left unchanged.
            chars[i] = String.fromCodePoint(c - 1)
        } else if (c == 0x03AC) {
            chars[i] = "Ά"
        } else if (c == 0x03AD || c == 0x03AE || c == 0x03AF) {
            chars[i] = String.fromCodePoint(c - 37)
        } else if (c >= 0x03B1 && c <= 0x03C1) {
            chars[i] = String.fromCodePoint(c - 32)
        } else if (c == 0x03C2) {
            // final sigma shares its upper case form with ordinary sigma
            chars[i] = "Σ"
        } else if (c >= 0x03C3 && c <= 0x03CB) {
            chars[i] = String.fromCodePoint(c - 32)
        } else if (c == 0x03CC) {
            chars[i] = "Ό"
        } else if (c == 0x03CD || c == 0x03CE) {
            chars[i] = String.fromCodePoint(c - 63)
        } else if (c >= 0x0430 && c <= 0x044F) {
            chars[i] = String.fromCodePoint(c - 32)
        } else if (c >= 0x0450 && c <= 0x045F) {
            chars[i] = String.fromCodePoint(c - 80)
        } else if ((c >= 0x048B && c <= 0x04BF) && (c % 2 == 1)) {
            chars[i] = String.fromCodePoint(c - 1)
        } else if ((c >= 0x04C2 && c <= 0x04CE) && (c % 2 == 0)) {
            chars[i] = String.fromCodePoint(c - 1)
        } else if ((c >= 0x04D1 && c <= 0x052F) && (c % 2 == 1)) {
            chars[i] = String.fromCodePoint(c - 1)
        } else if (c >= 0x0561 && c <= 0x0586) {
            chars[i] = String.fromCodePoint(c - 48)
        } else if (c >= 0x10D0 && c <= 0x10F5) {
            chars[i] = String.fromCodePoint(c - 48)
        }
        i = i + 1
    }
    // Strings of fewer than 1000 characters are joined with Strs.concat_,
    // longer ones with the two argument Strs.concat.
    return (count < 1000) ? Strs.concat_(chars) : Strs.concat(chars, 1000)
}
 
// Capitalizes the first character of a UTF-8 string.
// Uses title rather than upper case variant if it's one of 4 supported digraphs.
static capitalize(s) {
    // Returns 's' (converted to a string first if necessary) with its first character
    // upper-cased. When 's' starts with "[" and has more than one character, the
    // character after the bracket is the one that is capitalized.
    if (!(s is String)) s = "%(s)"
    if (s == "") return s
    var bracketed = s.startsWith("[") && s.count > 1
    var pos = bracketed ? 1 : 0
    var head = upper(s[pos])
    var cp = head.codePoints[0]
    // The upper case digraphs DŽ, LJ, NJ and DZ are each followed immediately by
    // their title case form, which is what is wanted at the start of a word.
    if ([0x01C4, 0x01C7, 0x01CA, 0x01F1].contains(cp)) {
        head = String.fromCodePoint(cp + 1)
    }
    var tail = (s.count > pos + 1) ? s[pos+1..-1] : ""
    var result = head + tail
    return bracketed ? "[" + result : result
}
 
// Capitalizes the first character of each word of a UTF-8 string.
// Uses title rather than upper case variant if it's one of 4 supported digraphs.
static title(s) {
    // Splits 's' (converted to a string first if necessary) on single spaces,
    // capitalizes each piece and joins the pieces back together with single spaces.
    if (!(s is String)) s = "%(s)"
    if (s == "") return s
    var capitalized = []
    for (word in s.split(" ")) {
        capitalized.add(capitalize(word))
    }
    return Strs.join(capitalized, " ")
}
 
// Removes accents and other diacritical marks from all characters in a string,
// expands digraphs and removes all combining characters before returning the result.
// As well as Latin-1 Supplement, coverage includes Latin Extended-A and various
// other characters found in modern European languages which use the Latin alphabet.
static unaccent(s) {
    // Returns a copy of 's' (converted to a string first if necessary) in which:
    //   - each accented letter listed in 'accented' is replaced by its plain ASCII letter,
    //   - each character listed in 'digraphs' is expanded to its two letter equivalent,
    //   - each combining mark in the range U+0300..U+036F is removed.
    // All other characters are left unchanged.
    if (!(s is String)) s = "%(s)"
    if (s == "") return s
    // accented[j] holds every supported variant of the letter unaccented[j].
    var accented = [
        "àáâãäåāăą", "ÀÁÂÃÄÅĀĂĄ", "ḃ", "Ḃ", "çćĉċč", "ÇĆĈĊČ", "ðďđḋ", "ÐĎĐḊ",
        "èéêëēĕėęě", "ÈÉÊËĒĔĖĘĚ", "ḟ", "Ḟ", "ĝğġģ", "ĜĞĠĢ", "ĥħ", "ĤĦ",
        "ìíîïĩīĭįı", "ÌÍÎÏĨĪĬĮİ", "ĵ", "Ĵ", "ķĸ", "Ķ", "ĺļľŀł", "ĹĻĽĿŁ",
        "ṁ", "Ṁ", "ñńņň\u0149", "ÑŃŅŇ", "òóôõöøōŏő", "ÒÓÔÕÖØŌŎŐ", "ṗ", "Ṗ",
        "ŕŗř", "ŔŖŘ", "śŝşšșſ", "ŚŜŞŠȘ", "ţťŧṱț", "ŢŤŦṰȚ", "ùúûüũūŭůűų",
        "ÙÚÛÜŨŪŬŮŰŲ", "ŵẁẃẅ", "ŴẀẂẄ", "ýÿỳŷ", "ÝŸỲŶ", "źżž", "ŹŻŽ"
    ]
    var unaccented = "aAbBcCdDeEfFgGhHiIjJkKlLmMnNoOpPrRsStTuUwWyYzZ"
    // The ij/IJ ligatures are written as escapes so that the keys are unambiguously
    // the single code points U+0133 and U+0132.
    var digraphs = {
        "æ": "ae", "Æ": "AE", "þ": "th", "Þ": "TH", "ß": "ss", "ẞ": "SS",
        "\u0133": "ij", "\u0132": "IJ", "ŋ": "ng", "Ŋ": "NG", "œ": "oe", "Œ": "OE"
    }
    var chars = s.toList
    var count = chars.count
    var i = 0  // index into 'chars' of the code point currently being examined
    for (c in s.codePoints) {
        // Only code points that can appear in the tables above are looked up:
        // U+00C0..U+021B spans Latin-1 Supplement letters, Latin Extended-A and ș/ț,
        // U+1E02..U+1E9E spans the dotted letters, the Welsh w's and capital sharp s,
        // and U+1EF2/U+1EF3 are Ỳ/ỳ.
        var candidate = (c >= 0x00C0 && c <= 0x021B) || (c >= 0x1E02 && c <= 0x1E9E) ||
                        c == 0x1EF2 || c == 0x1EF3
        if (candidate) {
            var found = false
            for (j in 0...accented.count) {
                if (accented[j].indexOf(chars[i]) >= 0) {
                    chars[i] = unaccented[j]
                    found = true
                    break
                }
            }
            if (!found && digraphs.containsKey(chars[i])) chars[i] = digraphs[chars[i]]
        } else if (c >= 0x0300 && c <= 0x036F) {
            // combining diacritical mark: drop it
            chars[i] = ""
        }
        i = i + 1
    }
    // Strings of fewer than 1000 characters are joined with Strs.concat_,
    // longer ones with the two argument Strs.concat.
    return (count < 1000) ? Strs.concat_(chars) : Strs.concat(chars, 1000)
}
 
// Converts a Windows-1252 encoded byte string to a UTF-8 encoded string.
static fromWin1252(win1252) {
    // Treats each byte of 'win1252' as one Windows-1252 character and returns the
    // equivalent UTF-8 encoded string.
    // Aborts the current fiber if the argument is not a String, rather than printing
    // a message and then failing later with an unrelated error.
    if (!(win1252 is String)) Fiber.abort("Argument must be a byte string.")
    if (win1252 == "") return ""
    // mapping for Windows 1252 bytes 128-159.
    // Unused bytes are mapped to the corresponding ISO-8859-1 control codes.
    var bm = [
        0x20ac, 0x0081, 0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021,
        0x02c6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008d, 0x017d, 0x008f,
        0x0090, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
        0x02dc, 0x2122, 0x0161, 0x203a, 0x0153, 0x009d, 0x017e, 0x0178
    ]
    var bytes = win1252.bytes
    var utf8 = List.filled(bytes.count, 0)
    for (i in 0...bytes.count) {
        var b = bytes[i]
        // Outside 128..159 the byte value is itself the code point;
        // inside that range the table supplies the code point.
        var codePoint = (b < 128 || b > 159) ? b : bm[b-128]
        utf8[i] = String.fromCodePoint(codePoint)
    }
    return utf8.join()
}
}
 
/*
    'Greek' enables characters from the Greek alphabet to be found from their name.
    These characters are often used as mathematical or scientific symbols.

    The following type aliases, intended for use in case of any name clashes
    with other modules, are commented out here:
        var Str_Char = Char
        var Str_Str = Str
*/
// Alias for the Utf8 class (the stray '</lang>' wiki tag that followed this
// statement was not valid Wren and has been removed).
var Str_Utf8 = Utf8
class Greek {
// Returns the Greek alphabet, lower then upper case characters.
static alphabet {
    // 25 lower case characters (final sigma included) followed by 25 upper case ones.
    // U+03A2 fills the position corresponding to final sigma in the upper case half,
    // so a character's index in one half matches its index in the other.
    var lowerCase = "αβγδεζηθικλμνξοπρςστυφχψω"
    var upperCase = "ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡ\u03a2ΣΤΥΦΧΨΩ"
    return lowerCase + upperCase
}
 
// Returns a list of the names of all Greek letters in alphabetical order.
static names {
    // A new list is built on every call. "sigma final" contains a space,
    // so the names are separated by commas rather than spaces.
    var csv = "alpha,beta,gamma,delta,epsilon,zeta,eta,theta," +
              "iota,kappa,lambda,mu,nu,xi,omicron,pi," +
              "rho,sigma final,sigma,tau,upsilon,phi,chi,psi,omega"
    return csv.split(",")
}
 
// Returns the name of a Greek character or null if not found.
// Upper case characters are returned with the initial letter capitalized.
static name(char) {
    // 'char' must consist of exactly one character, otherwise null is returned.
    // Lower case characters give the name as listed in 'names'; upper case
    // characters give that name passed through Str.capitalize.
    if (char.count != 1) return null
    // U+03A2 is only a placeholder in 'alphabet' (there is no upper case final sigma),
    // so it is not reported as a Greek letter.
    if (char == "\u03a2") return null
    var ix = alphabet.toList.indexOf(char)
    if (ix == -1) return null
    if (ix < 25) return names[ix]
    return Str.capitalize(names[ix-25])
}
 
// Finds and returns a Greek lower case character from its name.
static lower(name) {
    // The name is lower-cased before it is looked up, so matching ignores case.
    // Aborts the fiber with "Name not found." if the name is not in 'names'.
    var pos = names.indexOf(Str.lower(name))
    if (pos < 0) Fiber.abort("Name not found.")
    // The position in 'names' is the offset from lower case alpha (U+03B1).
    return String.fromCodePoint(pos + 0x03b1)
}
 
// Finds and returns a Greek upper case character from its name.
static upper(name) {
    // The name is lower-cased before it is looked up, so matching ignores case.
    // Aborts the fiber with "Name not found." if the name is not in 'names'.
    var wanted = Str.lower(name)
    var pos = names.indexOf(wanted)
    if (pos < 0) Fiber.abort("Name not found.")
    // "sigma final" is moved on one position so that it yields the same
    // upper case character as "sigma".
    var offset = (wanted == "sigma final") ? pos + 1 : pos
    return String.fromCodePoint(offset + 0x0391)
}
}</syntaxhighlight>
9,476

edits