Category talk:Wren-str: Difference between revisions

← Older edit

Category talk:Wren-str (view source)

Revision as of 10:53, 30 March 2024

35,705 bytes added , 1 month ago

→‎Source code: Added Str.lastIndexOf method.

PureFox

9,476

edits

Revision as of 10:36, 17 July 2020 (view source) PureFox (talk \| contribs) (→‎Source code: Added a Str.indexOf method and a Utf8 class.) ← Older edit		Latest revision as of 10:53, 30 March 2024 (view source) PureFox (talk \| contribs) (→‎Source code: Added Str.lastIndexOf method.)
(28 intermediate revisions by the same user not shown)
Line 1: ===Source code=== <~~lang~~syntaxhighlight ~~ecmascript~~lang="wren">/* Module "str.wren" / / Char contains routines to perform various operations on characters. A 'character' for this purpose is a single Unicode codepoint. ~~For convenience a string containing more than one character can be passed~~ Categorization and casing is supported for characters < 256 (Latin-1) but no higher. The 'symbol' category includes 'other letter', 'other number' and soft hyphen (ªº¹²³¼½¾¯). For convenience a string containing more than one character can be passed as an argument but the methods will only operate on the first character. / Line 18 ⟶ 21: // Checks if the first character of a string falls into a particular category. static isAscii(c) { code(c) < 128 } static ~~isSymbol~~isLatin1(c) { code(c) ~~&& "$+~~<~~=>^`\|~".contains(c[0])~~ 256 } ~~static isControl(c) { (c = code(c)) && (c < 32 \|\| c == 127) }~~ ~~static isDigit(c) { (c = code(c)) && c >= 48 && c <= 57 }~~ ~~static isLower(c) { (c = code(c)) && c >= 97 && c <= 122 }~~ ~~static isUpper(c) { (c = code(c)) && c >= 65 && c <= 90 }~~ ~~static isPrintable(c) { (c = code(c)) && c >= 32 && c < 127 }~~ ~~static isSpace(c) { (c = code(c)) && (c == 32 \|\| c == 9 \|\| c == 10 \|\| c == 13) }~~ ~~static isWhitespace(c) { (c = code(c)) && (c == 32 \|\| (c >= 9 && c <= 13)) }~~ // ASCII categories. ~~/ Rather than use combinations of the above, these only call the 'code' method once. /~~ static isDigit(c) { (c = code(c)) && c >= 48 && c <= 57 } static isAsciiLower(c) { (c = code(c)) && c >= 97 && c <= 122 } static isAsciiUpper(c) { (c = code(c)) && c >= 65 && c <= 90 } static isAsciiLetter(c) { isAsciiLower(c) \|\| isAsciiUpper(c) } static isAsciiAlphaNum(c) { isAsciiLower(c) \|\| isAsciiUpper(c) \|\| isDigit(c) } static isSpace(c) { (c = code(c)) && (c == 32 \|\| c == 9 \|\| c == 10 \|\| c == 13) } // Latin-1 categories. 
~~static isLetter(c) {~~ static isLower(c) { var d = code(c) return (d >= 6597 && d <= 90122) \|\| (d == 181) \|\| (d >= 97223 && d <= ~~122~~246) \|\| (d >= 248 && d <= 255) } static ~~isAlphanumeric~~isUpper(c) { var d = code(c) return (d >= 65 && d <= 90) \|\| (d >= 97192 && d <= ~~122~~214) \|\| (d >= 48216 && d <= 57222) } static ~~isPunctuation~~isLetter(c) { isLower(c) \|\| isUpper(c) } static isAlphaNumeric(c) { isLower(c) \|\| isUpper(c) \|\| isDigit(c) } static isControl(c) { var d = code(c) ifreturn (d < 3332 \|\| (d >= ~~126)~~127 ~~return~~&& ~~false~~d < 160) ~~if ((d >= 65 && d <= 90) \|\| (d >= 97 && d <= 122) \|\| (d >= 48 && d <= 57)) return false~~ ~~if ("$+<=>^`\|~".contains(c[0])) return false~~ ~~return true~~ } static isPrintable(c) { var d = code(c) return (d >= 32 && d < 127) \|\| (d >= 160 && d < 256) } static isGraphic(c) { var d = code(c) return (d >= 33 && d < 127) \|\| (d >= 161 && d < 256) } static isWhitespace(c) { var d = code(c) return d == 32 \|\| (d >= 9 && d <= 13) \|\| d == 160 } static isPunctuation(c) { code(c) && "!\"#\%&'(),-./:;?@[\\]_{}¡§«¶·»¿".contains(c[0]) } static isSymbol(c) { isGraphic(c) && !isAlpaNumeric(c) && !isPunctuation(c) } static category(c) { var d = code(c) return (d < 32) \|\| d == ~~127)~~ ? "control" : (d == 32) ? "space" : (d >= 48 && d <= 57) ? "digit" : (d >= 64 65 && d <= 90) ? "upper" : (d >= 97 && d <= 122) ? "lower" : (d >=~~128)~~ 127 && d <= 159) ? "control" ~~? "non-ascii"~~ : ~~"$+<=>^`\|~".contains~~(~~c[0]~~d == 160) ? "~~symbol~~space" : ~~"punctuation"~~ : (d == 181) ? "lower" : (d >= 192 && d <= 214) ? "upper" : (d >= 216 && d <= 222) ? "upper" : (d >= 223 && d <= 246) ? "lower" : (d >= 248 && d <= 255) ? "lower" : (d >= 256) ? "non-latin1" : isPunctuation(c) ? "punctuation" : "symbol" } // ~~Return~~Returns the first character of a string converted to ~~the appropriate~~lower case. static lower(c) { ~~static upper(c) { ((c = code(c)) && c >= 97 && c <= 122) ? 
fromCode(c-32) : fromCode(c) }~~ var d = code(c) ~~static lower(c) { ((c = code(c)) && c >= 65 && c <= 90) ? fromCode(c+32) : fromCode(c) }~~ if ((d >= 65 && d <= 90) \|\| (d >= 192 && d <= 214) \|\| (d >= 216 && d <= 222)) { return fromCode(d+32) } return c[0] } // Returns the first character of a string converted to upper case. static upper(c) { var d = code(c) if ((d >= 97 && d <= 122) \|\| (d >= 224 && d <= 246) \|\| (d >= 248 && d <= 254)) { return fromCode(d-32) } return c[0] } // Swaps the case of the first character in a string. static swapCase(c) { var d = code(c) if ((d >= 65 && d <= 90) ~~return~~\|\| ~~fromCode~~(d~~+32~~ >= 192 && d <= 214) \|\| (d >= 216 && d <= 222)) { if ~~(d >= 97 && d~~ <= ~~122)~~ return fromCode(d-+32) } if ((d >= 97 && d <= 122) \|\| (d >= 224 && d <= 246) \|\| (d >= 248 && d <= 254)) { return fromCode(d-32) } return c[0] } Line 84 ⟶ 133: static compare(s1, s2) { if (s1 == s2) return 0 var cp1 = s1.codePoints.toList var cp2 = s2.codePoints.toList var len = (cp1.count <= cp2.count) ? cp1.count : cp2.count for (i in 0...len) { Line 95 ⟶ 144: // Checks if a string falls into a particular category. 
static allAscii(s) { s != "" && s.codePoints.all { \|c\| c < 128 } } static ~~allDigits~~allLatin1(s) { s != "" && s.codePoints.all { \|c\| ~~c >= 48 &&~~ c <= 57256 } } static ~~allLower~~allDigits(s) { s != "" && s.codePoints.all { \|c\| c >= 9748 && c <= ~~122~~ 57 } } static ~~allUpper~~allAsciiLower(s) { s {!= "" && s.codePoints.all { \|c\| c >= 6597 && c <= 90 122 } } static ~~allPrintable~~allAsciiUpper(s) { s != "" && s.codePoints.all { \|c\| c >= 3265 && c <= ~~127~~ 90 } } static ~~allWhitespace~~allAsciiLetters(s) { s != "" && s.~~codePoints~~toList.all { \|c\| ~~c == 32 \|\|~~ Char.isAsciiLetter(c) >= ~~9 && c <= 13)~~ } } static allAsciiAlphaNum(s) { s != "" && s.toList.all { \|c\| Char.isAsciiAlphaNum(c) } } static ~~allLetters~~allSpace(s) { s != "" && s.~~codePoints~~toList.all { \|c\| Char.isSpace(c) } } static allLower(s) ~~return~~ (c > { s != 65"" && cs.toList.all ~~<= 90)~~{ \|c\| Char.isLower(c) >= 97 && c <= ~~122)~~ } } static allUpper(s) { s != "" && s.toList.all { \|c\| Char.isUpper(c) } } ~~} }~~ static allLetters(s) { s != "" && s.toList.all { \|c\| Char.isLetter(c) } } static ~~allAlphanumeric~~allAlphaNumeric(s) { s != "" && s.~~codepoints~~toList.all { \|c\| Char.isAlphanumeric(c) } } static ~~return~~ allPrintable(cs) >= 65 && c <={ s ~~90) \|\| (c >~~!= 97"" && cs.toList.all ~~<= 122)~~{ \|c\| Char.isPrintable(c) >= 48 && c <=} ~~57)~~} static allGraphic(s) { s != "" && s.toList.all { \|c\| Char.isGraphic(c) } } ~~} }~~ static allWhitespace(s) { s != "" && s.toList.all { \|c\| Char.isWhitespace(c) } } // Checks whether a string can be parsed to a number, an integer or a non-integer (float). 
Line 119 ⟶ 169: if (!(s is String)) s = "%(s)" if (s == "") return s var ~~cps~~chars = s~~.codePoints~~.toList ~~for~~var (icount in= ~~0...cps~~chars.count~~) {~~ var ci = ~~cps[i]~~0 iffor (c >=in ~~65 && c <= 90~~s.codePoints) ~~cps[i] = c + 32~~{ if ((c >= 65 && c <= 90) \|\| (c >= 192 && c <= 214) \|\| (c >= 216 && c <= 222)) { chars[i] = String.fromCodePoint(c + 32) } i = i + 1 } return ~~cps.reduce~~(~~"")~~count {< ~~\|acc,~~1000) c\|? ~~acc~~Strs.concat_(chars) +: ~~String~~Strs.~~fromCodePoint~~concat(c)chars, }1000) } Line 131 ⟶ 185: if (!(s is String)) s = "%(s)" if (s == "") return s var ~~cps~~chars = s~~.codePoints~~.toList ~~for~~var (icount in= ~~0...cps~~chars.count~~) {~~ var ci = ~~cps[i]~~0 iffor (c >=in ~~97 && c <= 122~~s.codePoints) ~~cps[i] = c - 32~~{ if ((c >= 97 && c <= 122) \|\| (c >= 224 && c <= 246) \|\| (c >= 248 && c <= 254)) { chars[i] = String.fromCodePoint(c - 32) } i = i + 1 } return ~~cps.reduce~~(~~"")~~count {< ~~\|acc,~~1000) c\|? ~~acc~~Strs.concat_(chars) +: ~~String~~Strs.~~fromCodePoint~~concat(c)chars, }1000) } Line 143 ⟶ 201: if (!(s is String)) s = "%(s)" if (s == "") return s var ~~cps~~chars = s~~.codePoints~~.toList ~~for~~var (icount in= ~~0...cps~~chars.count~~) {~~ var ci = ~~cps[i]~~0 iffor (c ~~>= 65 && c <=~~in 90s.codePoints) { if ((c >= 65 ~~cps[i]~~&& c <= 90) \|\| (c +>= 192 && c <= 214) \|\| (c >= 216 && c <= 222)) 32{ } ~~else~~ if (c >chars[i] = ~~97 &&~~ String.fromCodePoint(c <=+ ~~122~~32) { } else if ((c ~~cps[i]~~>= 97 && c <= 122) \|\| (c ->= 224 && c <= 246) 32\|\| (c >= 248 && c <= 254)) { chars[i] = String.fromCodePoint(c - 32) } i = i + 1 } return ~~cps.reduce~~(~~"")~~count {< ~~\|acc,~~1000) c\|? ~~acc~~Strs.concat_(chars) +: ~~String~~Strs.~~fromCodePoint~~concat(c)chars, }1000) } Line 159 ⟶ 220: if (!(s is String)) s = "%(s)" if (s == "") return s var ~~cps~~start = (s.~~codePoints~~startsWith("[") && s.~~toList~~count > 1) ? 
1 : 0 var ~~start~~c = (s[start].~~startsWith("~~codePoints[~~") && cps.count > 1) ? 1 :~~ 0] if ((c >= 97 && c <= 122) \|\| (c >= 224 && c <= 246) \|\| (c >= 248 && c <= 254)) { ~~var c = cps[start]~~ if (c >= 97 &&var ccs <= ~~122~~String.fromCodePoint(c - 32) {+ s[start+1..-1] ~~cps[~~if (start] == c1) cs = "[" -+ 32cs return ~~cps.reduce("") { \|acc, c\| acc + String.fromCodePoint(c) }~~cs } return s Line 174 ⟶ 235: if (s == "") return s var words = s.split(" ") return Strs.join(words.map { \|w\| capitalize(w) }.~~join(~~toList, " ") } // Removes accents and cedillas from all Latin-1 supplement characters in a string // and also expands digraphs before returning the result. static unaccent(s) { if (!(s is String)) s = "%(s)" if (s == "") return s var accented = [ "àáâãäå", "ÀÁÂÃÄÅ", "ç", "Ç", "ð", "Ð", "èéêë", "ÈÉÊË", "ìíîï", "ÌÍÎÏ", "ñ", "Ñ", "òóôõöø", "ÒÓÔÕÖØ", "ùúûü", "ÙÚÛÜ", "ýÿ", "Ý" ] var unaccented = "aAcCdDeEiInNoOuUyY" var digraphs = { "æ": "ae", "Æ": "AE", "þ": "th", "Þ": "TH", "ß": "ss" } var r = "" var chars = s.toList var count = chars.count var i = 0 for (c in s.codePoints) { if (c >= 0xc0 && c <= 0xff) { var found = false for (j in 0...accented.count) { if (accented[j].indexOf(chars[i]) >= 0) { chars[i] = unaccented[j] found = true break } } if (!found && digraphs.containsKey(chars[i])) chars[i] = digraphs[chars[i]] } i = i + 1 } return (count < 1000) ? Strs.concat_(chars) : Strs.concat(chars, 1000) } Line 183 ⟶ 276: } // Performs a circular shift of the characters of 's' ~~one~~'n' ~~place~~places to the left. // If 'n' is negative performs a circular right shift by '-n' places instead. 
~~static lshift(s) {~~ static lshift(s, n) { if (!(s is String)) s = "%(s)" if (!(n is Num) \|\| !n.isInteger) Fiber.abort("'n' must be an integer.") var chars = s.toList var count = chars.count if (count < 2) return s ~~var~~if t(n =< ~~chars[~~0]) return rshift(s, -n) ~~for~~n (i= inn % ~~0..~~count~~-2) chars[i] = chars[i+1]~~ ~~chars[-1]~~if (n == 0) return ts ~~return~~for ~~chars.join~~(i in 1..n) { var t = chars[0] for (j in 0..count-2) chars[j] = chars[j+1] chars[-1] = t } return (count < 1000) ? Strs.concat_(chars) : Strs.concat(chars, 1000) } // Performs a circular shift of the characters of 's' ~~one~~'n' ~~place~~places to the right. // If 'n' is negative performs a circular left shift by '-n' places instead. ~~static rshift(s) {~~ static rshift(s, n) { if (!(s is String)) s = "%(s)" if (!(n is Num) \|\| !n.isInteger) Fiber.abort("'n' must be an integer.") var chars = s.toList var count = chars.count if (count < 2) return s ~~var~~if t(n =< 0) return lshift(s, ~~chars[~~-1]n) ~~for~~n (i= inn % count~~-2..0) chars[i+1] = chars[i]~~ ~~chars[0]~~if (n == 0) return ts ~~return~~for ~~chars.join~~(i in 1..n) { var t = chars[-1] for (j in count-2..0) chars[j+1] = chars[j] chars[0] = t } return (count < 1000) ? Strs.concat_(chars) : Strs.concat(chars, 1000) } // Convenience versions of the above methods which shift by just 1 place. ~~/* The indices (or ranges thereof) for all the following functions are measured in codepoints (not bytes).~~ static lshift(s) { lshift(s, 1) } ~~As with core library methods, the indices must be within bounds or errors will be generated. /~~ static rshift(s) { rshift(s, 1) } / The indices (or ranges thereof) for all the following functions are measured in codepoints (not bytes). Negative indices count backwards from the end of the string. As with core library methods, the indices must be within bounds or errors will be generated. / // Extracts the sub-string of 's' over the range 'r'. 
Line 214 ⟶ 326: if (!(r is Range)) Fiber.abort("Second argument must be a range.") if (!(s is String)) s = "%(s)" return Strs.concat(s.toList[r]~~.join(~~) } // Private helper method to check whether an index is valid. static checkIndex_(s, index, inc) { if (index.type != Num \|\| !index.isInteger) Fiber.abort("Index must be an integer.") var c = s.count + inc if (index >= c \|\| index < -c) Fiber.abort("Index is out of bounds.") } // Gets the character of 's' at index 'i'. Throws an error if 'i is out of bounds. static get(s, i) { ~~if (!(i is Num && i.isInteger && i >= 0)) Fiber.abort("Index must be a non-negative integer.")~~ if (!(s is String)) s = "%(s)" checkIndex_(s, i, 0) if (i < 0) i = s.count + i return s.toList[i] } Line 226 ⟶ 346: // Gets the character of 's' at index 'i'. Returns null if 'i is out of bounds. static getOrNull(s, i) { ~~if (!(i is Num && i.isInteger)) Fiber.abort("Index must be an integer.")~~ if (!(s is String)) s = "%(s)" if (!(i is Num && i.isInteger)) Fiber.abort("Index must be an integer.") if (i < 0) i = s.count + i return (i >= 0 && i < s.count) ? s.toList[i] : null } Line 246 ⟶ 367: cpCount = cpCount + 1 } } // Returns the codepoint index (not byte index) at which 'search' first occurs in 's' // or -1 if 'search' is not found, starting from codepoint offset 'start'. static indexOf(s, search, start) { var ss = (start > 0) ? Str.sub(s, start..-1) : s var ix = Str.indexOf(ss, search) return (ix >= 0) ? start + ix : -1 } // Returns the codepoint index (not byte index) at which 'search' last occurs in 's' // or -1 if 'search' is not found. static lastIndexOf(s, search) { if (!(search is String)) Fiber.abort("Search argument must be a string.") if (!(s is String)) s = "%(s)" var l = s.toList for (i in l.count-1..0) { if (l[i] == search) return i } return -1 } // Returns the number of non-overlapping occurrences of the string 't' // within the string 's'. 
static occurs(s, t) { s.split(t).count - 1 } // Returns the number of non-overlapping occurrences of the string 't' // within the string 's' starting from codepoint offset 'start'. static occurs(s, t, start) { if (start == 0) return occurs(s, t) return occurs(Str.sub(s, start..-1), t) } // Changes the character of 's' at index 'i' to the string 't'. static change(s, i, t) { ~~if (!(i is Num && i.isInteger && i >= 0)) Fiber.abort("Index must be a non-negative integer.")~~ if (!(t is String)) Fiber.abort("Replacement must be a string.") if (!(s is String)) s = "%(s)" checkIndex_(s, i, 0) if (i < 0) i = s.count + i var chars = s.toList chars[i] = t return ~~chars~~Strs.~~join~~concat(chars) } // Inserts at index 'i' of 's' the string 't'. static insert(s, i, t) { ~~if (!(i is Num && i.isInteger && i >= 0)) Fiber.abort("Index must be a non-negative integer.")~~ if (!(t is String)) Fiber.abort("Insertion must be a string.") if (!(s is String)) s = "%(s)" checkIndex_(s, i, 1) if (i < 0) i = s.count + i + 1 var chars = s.toList chars.insert(i, t) return ~~chars~~Strs.~~join~~concat(chars) } // Deletes the character of 's' at index 'i'. 
static delete(s, i) { ~~if (!(i is Num && i.isInteger && i >= 0)) Fiber.abort("Index must be a non-negative integer.")~~ if (!(s is String)) s = "%(s)" checkIndex_(s, i, 0) if (i < 0) i = s.count + i var chars = s.toList chars.removeAt(i) return ~~chars~~Strs.~~join~~concat(chars) } // Exchanges the characters of 's' at indices 'i' and 'j' static exchange(s, i, j) { ~~if (!(i is Num && i.isInteger && i >= 0)) Fiber.abort("First index must be a non-negative integer.")~~ ~~if (!(j is Num && j.isInteger && j >= 0)) Fiber.abort("Second index must be a non-negative integer.")~~ if (!(s is String)) s = "%(s)" checkIndex_(s, i, 0) if (i < 0) i = s.count + i checkIndex_(s, j, 0) if (j < 0) j = s.count + j if (i == j) return s var chars = s.toList ~~var t =~~ chars[.swap(i], j) ~~chars[i] =~~return Strs.concat(chars~~[j]~~) } ~~chars[j] = t~~ ~~return chars.join()~~ // Returns 's' with 'from' replaced by 'to' up to 'n' times (all times if n is negative) // but skipping the first 'skip' matches. static replace(s, from, to, n, skip) { if (!(from is String)) Fiber.abort("'from 'must be a string.") if (!(to is String)) Fiber.abort("'to' must be a string.") if (!(n is Num && n.isInteger)) Fiber.abort("'n' must be an integer.") if (!(skip is Num && skip.isInteger && skip >= 0)) { Fiber.abort("'skip' must be a non-negative integer.") } if (!(s is String)) s = "%(s)" if (n < 0) { if (skip == 0) return s.replace(from, to) n = Num.maxSafeInteger } if (n == 0 \|\| skip >= n) return s var count = 0 var split = s.split(from) var res = "" for (i in 0...split.count-1) { count = count + 1 res = res + split[i] + ((count <= skip \|\| count > n) ? from : to) } return res + split[-1] } // Convenience version of 'replace' where 'skip' is always zero. static replace(s, from, to, n) { replace(s, from, to, n, 0) } // Adds 'by' to the start of each line of 's' // and returns the result. 
static indent(s, by) {
    // Both arguments must already be strings; nothing is coerced here.
    if (!(s is String)) Fiber.abort("First argument must be a string.")
    if (!(by is String)) Fiber.abort("Second argument must be a string.")
    var lines = s.split("\n")
    return lines.map { |line| by + line }.join("\n")
}

// Removes 'by' from the start of each line of 's' which begins with it
// and returns the result.
static dedent(s, by) {
    if (!(s is String)) Fiber.abort("First argument must be a string.")
    if (!(by is String)) Fiber.abort("Second argument must be a string.")
    var lines = s.split("\n")
    // Length of the prefix in bytes: the range subscript below is applied to the raw string.
    var c = by.bytes.count
    return lines.map { |line|
        if (line.startsWith(by)) return line[c..-1]
        return line
    }.join("\n")
}

// Removes all spaces and tabs from the end of each line of s
// and returns the result.
static tidy(s) {
    if (!(s is String)) Fiber.abort("Argument must be a string.")
    var lines = s.split("\n")
    return lines.map { |line| line.trimEnd(" \t") }.join("\n")
}

// Returns 's' repeated 'reps' times.
// A non-string 's' is first converted to its string representation.
static repeat(s, reps) {
    if (!(s is String)) s = "%(s)"
    if (!(reps is Num && reps.isInteger && reps >= 0)) {
        Fiber.abort("Repetitions must be a non-negative integer.")
    }
    var rs = ""
    if (reps < 10) {
        // Few repetitions: plain concatenation.
        for (i in 0...reps) rs = rs + s
    } else {
        // Many repetitions: binary doubling. 's' is doubled on each pass and
        // appended to the result whenever the current low bit of 'reps' is set.
        while (true) {
            if (reps % 2 == 1) rs = rs + s
            reps = reps >> 1
            if (reps == 0) break
            s = s + s
        }
    }
    return rs
}

Line 311 ⟶ 544:
    if (final > 0) res.add(sub(s, first..-1))
    return res
}

// Splits 's' into a list of one or more strings separated by 'sep' but removes
// any empty elements from the list.
static splitNoEmpty(s, sep) {
    if (!(s is String)) s = "%(s)"
    if (!(sep is String) || sep.isEmpty) Fiber.abort("Separator must be a non-empty string.")
    var split = s.split(sep)
    return split.where { |e| !e.isEmpty }.toList
}

// Splits a CSV 'line' into a list of one or more strings separated by 'sep' which must be
// a single character (except \v). Deals properly with embedded separators in quoted fields.
// Removes leading and trailing quotes from quoted fields if 'dequote' is true.
static splitCsv(line, sep, dequote) {
    if (!(line is String)) line = "%(line)"
    if (!(sep is String) || sep.count != 1) {
        Fiber.abort("Separator must be a single character string.")
    }
    if (!(dequote is Bool)) Fiber.abort("Dequote must be a boolean.")
    // First pass: split naively, then walk the fields tracking whether we are
    // inside a quoted field. 'count' is the index in 'chars' of the separator
    // which follows the current field; separators found inside quotes are
    // temporarily replaced by "\v" so that the second split ignores them.
    // NOTE(review): if the last field opens a quote that is never closed, 'count'
    // equals chars.count at that point and the subscript is out of bounds - confirm
    // whether unbalanced quotes are meant to be rejected this way.
    var fields = line.split(sep)
    var count = 0
    var quoted = false
    var chars = line.toList
    for (i in 0...fields.count) {
        var f = fields[i]
        var fc = f.count
        if (fc > 0) {
            count = count + fc
            if (!quoted && f[0] == "\"") {
                if (f[-1] != "\"") {
                    quoted = true
                    chars[count] = "\v"
                }
            } else if (quoted && f[-1] == "\"") {
                quoted = false
            } else if (quoted) {
                chars[count] = "\v"
            }
        } else if (quoted) {
            chars[count] = "\v"
        }
        count = count + 1
    }
    // Second pass: split again on the real separators, then restore the hidden ones.
    fields = chars.join("").split(sep)
    for (i in 0...fields.count) fields[i] = fields[i].replace("\v", sep)
    if (dequote) {
        for (i in 0...fields.count) {
            var f = fields[i]
            var fc = f.count
            if (fc < 2) continue
            if (f[0] == "\"" && f[-1] == "\"") fields[i] = f[1...-1]
        }
    }
    return fields
}

// Convenience versions of the above method which use default parameters.
static splitCsv(line, sep) { splitCsv(line, sep, true) }
static splitCsv(line)      { splitCsv(line, ",", true) }

// Splits a string 's' into two parts, before and after the first occurrence
// of 'delim' and returns a list of those parts.
// The 'delim' itself can be optionally included in the second part.
// If 'delim' does not occur in 's', returns [s, ""].
static bisect(s, delim, include) {
    if (!(delim is String)) Fiber.abort("Delimiter must be a string.")
    if (!(include is Bool)) Fiber.abort("Include must be true or false.")
    if (!(s is String)) s = "%(s)"
    var ix = s.indexOf(delim)
    if (ix == -1) return [s, ""]
    if (include) return [s[0...ix], s[ix..-1]]
    // Skip over the delimiter; its length is taken in bytes to match 'ix'.
    var len = delim.bytes.count
    return [s[0...ix], s[ix + len..-1]]
}

// Convenience version of bisect method which never includes the delimiter.
static bisect(s, delim) { bisect(s, delim, false) }

// Creates and returns a string from a list of bytes.
static fromBytes(ba) {
    if (!(ba is List)) Fiber.abort("Argument must be list of bytes.")
    var count = ba.count
    if (count == 0) return ""
    var chars = ba.map { |b| String.fromByte(b) }.toList
    // Long lists are concatenated in chunks of 1000.
    return (count < 1000) ? Strs.concat_(chars) : Strs.concat(chars, 1000)
}

// Creates and returns a string from a list of code points.
static fromCodePoints(ca) {
    if (!(ca is List)) Fiber.abort("Argument must be list of code points.")
    var count = ca.count
    if (count == 0) return ""
    var chars = ca.map { |c| String.fromCodePoint(c) }.toList
    // Long lists are concatenated in chunks of 1000.
    return (count < 1000) ? Strs.concat_(chars) : Strs.concat(chars, 1000)
}

// After trimming whitespace from the string 's', takes as many characters as possible
// to form a valid number and converts it thereto using the Num.fromString method.
// Returns null if such a conversion is impossible.
static toNum(s) {
    if (s is Num) return s
    if (!(s is String)) s = "%(s)"
    s = s.trim()
    var n = Num.fromString(s)
    if (n) return n
    if (s.count < 2) return null
    // Drop one trailing character at a time until the remaining prefix parses.
    var chars = s.toList
    for (i in chars.count-1..1) {
        chars.removeAt(i)
        if (n = Num.fromString(chars.join())) return n
    }
    return null
}

// Converts a pattern into a list of tokens for processing by the 'isMatch' method.
// Characters within the pattern are represented as follows:
//    Non-wildcard characters as themselves (i.e. single character strings);
//    * (or **) by the number 0;
//    ? (or *?) by the number 1;
//    [set] by a list of the tokens within the set:
//      single characters by themselves;
//      a range of characters, a-b, by a Range of codepoints from 'a' to 'b'.
//      If the first character of the set is '!' then the number -1 is inserted
//      as a separate token immediately before the list.
static tokenize(pattern) {
    var tokens = []
    var i = 0
    var j
    while (i < pattern.count) {
        var c = pattern[i]
        if (c == "*") {
            // Consecutive stars collapse into a single 0 token.
            if (i == 0 || tokens[-1] != 0) tokens.add(0)
        } else if (c == "?") {
            // A '?' directly after a star replaces the star token.
            if (i > 0 && tokens[-1] == 0) tokens[-1] = 1 else tokens.add(1)
        } else if (c == "[") {
            if (i == pattern.count - 1) {
                // A trailing '[' cannot open a set: treat it as a literal.
                tokens.add(c)
            } else if ((j = indexOf(pattern, "]", i + 1)) == -1) {
                // No closing bracket: treat '[' as a literal.
                tokens.add(c)
            } else {
                var l = []
                var s = sub(pattern, i+1...j)
                var k = 0
                while (k < s.count) {
                    var d = s[k]
                    if (d == "!") {
                        // A leading '!' negates the set; elsewhere it is a literal.
                        if (k == 0) tokens.add(-1) else l.add(d)
                    } else if (k < s.count - 2 && s[k+1] == "-") {
                        // 'a-b' becomes a Range of codepoints.
                        l.add(d.codePoints[0]..s[k+2].codePoints[0])
                        k = k + 2
                    } else {
                        l.add(d)
                    }
                    k = k + 1
                }
                if (l.count == 0) Fiber.abort("set cannot be empty.")
                tokens.add(l)
                // Skip the set's contents and its closing bracket.
                i = i + s.count + 1
            }
        } else {
            tokens.add(c)
        }
        i = i + 1
    }
    return tokens
}

// Returns whether a string 's' matches a 'pattern' which may already be tokenized
// if many strings are to be matched. Matching is case sensitive.
// Patterns may contain the following wildcards:
//    * (or **) matches zero or more characters until the next token (if any) matches
//              and doesn't backtrack in the event of subsequent failure;
//    ? (or *?) matches exactly one character;
//    [set]     matches a single character from the set within the brackets e.g. [aeiou].
//              The set can also contain ranges of characters separated by '-' e.g. [a-zA-Z].
//              If the first character of the set is '!' then only characters NOT within the rest
//              of the set are matched e.g. [!0-9] matches any character other than a digit.
static isMatch(s, pattern) {
    // Accept either a raw pattern string or a pre-built token list.
    var tokens = pattern
    if (tokens is String) tokens = tokenize(tokens)
    if (!((tokens is List) && tokens.count > 0)) {
        Fiber.abort("'pattern' must be a non-empty string or list of tokens.")
    }
    var i = 0         // position in 's'
    var j = 0         // position in 'tokens'
    var star = false  // true while a preceding star is still consuming characters
    var neg = false   // true when the next set token is negated
    while (i < s.count && j < tokens.count) {
        var c = s[i]
        var t = tokens[j]
        if (t is Num) {
            if (t == 0) {
                star = true
            } else if (t == 1) {
                i = i + 1
                star = false
            } else if (t == -1) {
                neg = true
            } else {
                Fiber.abort("'%(t)' is not a recognized token.")
            }
            j = j + 1
        } else if (t is String) {
            if (!star && c != t) return false
            if (star && c == t) star = false
            i = i + 1
            // While the star is still active, stay on the same token.
            if (!star) j = j + 1
        } else if (t is List) {
            var matched = false
            for (e in t) {
                if (e is String) {
                    if (e == c) {
                        matched = true
                        break
                    }
                } else if (e is Range){
                    var cp = c.codePoints[0]
                    if (cp >= e.from && cp <= e.to) {
                        matched = true
                        break
                    }
                } else {
                    Fiber.abort("'%(e)' is not a recognized token within a set.")
                }
            }
            if (!star && !neg && !matched) return false
            if (!star && neg && matched) return false
            if (star && matched) star = false
            i = i + 1
            neg = false
            if (!star) j = j + 1
        } else {
            Fiber.abort("'%(t)' is not a recognized token.")
        }
    }
    // Success if both are exhausted, or if only a trailing star remains.
    if (i == s.count && j == tokens.count) return true
    if (j == tokens.count && tokens[-1] == 0) return true
    if (j == tokens.count - 1 && tokens[-1] == 0) return true
    return false
}
}

/*
    Strs contains routines applicable to lists of strings.
*/
class Strs {
    // Private helper method for 'concat'.
    // Appends every element of 'ls' to an initially empty string, in order.
    static concat_(ls) {
        var s = ""
        for (e in ls) {
            s = s + e
        }
        return s
    }

    // Returns the strings in the list 'ls' concatenated together.
    // If 'chunkSize' is chosen appropriately, this should be much faster than Sequence.join()
    // for a large list of strings. For extra speed, only minimal type checks are made.
static concat(ls, chunkSize) {
    if (!(ls is List)) Fiber.abort("First argument must be a list of strings.")
    if (chunkSize.type != Num || !chunkSize.isInteger || chunkSize < 1) {
        Fiber.abort("Second argument must be a positive integer.")
    }
    var count = ls.count
    if (count == 0) return ""
    // Only the first element is type-checked.
    if (ls[0].type != String) Fiber.abort("First argument must be a list of strings.")
    var chunks = (count/chunkSize).floor
    if (chunks == 0) return concat_(ls)
    // The final chunk is either a full one or the remainder.
    var lastSize = count % chunkSize
    if (lastSize == 0) {
        lastSize = chunkSize
    } else {
        chunks = chunks + 1
    }
    var s = ""
    for (i in 0...chunks) {
        var endSize = (i < chunks-1) ? chunkSize : lastSize
        var from = i * chunkSize
        s = s + concat_(ls[from...(from + endSize)])
    }
    return s
}

// Convenience version of the above which uses a 'chunkSize' of 1000. This usually gives a good result.
static concat(ls) { concat(ls, 1000) }

// Private helper method for 'join'.
// Appends every element of 'ls' to an initially empty string, placing 'sep'
// between consecutive elements.
static join_(ls, sep) {
    var first = true
    var s = ""
    for (e in ls) {
        if (!first) s = s + sep
        first = false
        s = s + e
    }
    return s
}

// Returns the strings in the list 'ls' joined together using the separator 'sep'.
// If 'chunkSize' is chosen appropriately, this should be much faster than Sequence.join(sep)
// for a large list of strings. For extra speed, only minimal type checks are made.
static join(ls, sep, chunkSize) {
    if (!(ls is List)) Fiber.abort("First argument must be a list of strings.")
    if (sep.type != String) Fiber.abort("Second argument must be a string")
    // An empty separator is plain concatenation.
    if (sep == "") return concat(ls, chunkSize)
    if (chunkSize.type != Num || !chunkSize.isInteger || chunkSize < 1) {
        Fiber.abort("Third argument must be a positive integer.")
    }
    var count = ls.count
    if (count == 0) return ""
    // Only the first element is type-checked.
    if (ls[0].type != String) Fiber.abort("First argument must be a list of strings.")
    var chunks = (count/chunkSize).floor
    if (chunks == 0) return join_(ls, sep)
    // The final chunk is either a full one or the remainder.
    var lastSize = count % chunkSize
    if (lastSize == 0) {
        lastSize = chunkSize
    } else {
        chunks = chunks + 1
    }
    var s = ""
    for (i in 0...chunks) {
        if (i > 0) s = s + sep
        var endSize = (i < chunks-1) ? chunkSize : lastSize
        var from = i * chunkSize
        s = s + join_(ls[from...(from + endSize)], sep)
    }
    return s
}

// Convenience version of the above which uses a 'chunkSize' of 1000. This usually gives a good result.
static join(ls, sep) { join(ls, sep, 1000) }
}

/*
    Utf8 contains routines which are specific to the UTF-8 encoding of a string's bytes or codepoints.
Line 349 ⟶ 911:
        return (b0 & b4Mask) << 18 | (b[1] & mbMask) << 12 | (b[2] & mbMask) << 6 | (b[3] & mbMask)
    }
}

/*
    The next four methods extend the casing performed by the corresponding 'Str' methods to
    include Latin Extended-A, parts of Latin Extended-B, Latin Extended Additional, Greek,
    Cyrillic, Armenian and Georgian.
*/

// Converts a UTF-8 string to lower case.
// A non-string argument is first converted to its string representation.
static lower(s) {
    if (!(s is String)) s = "%(s)"
    if (s == "") return s
    var chars = s.toList
    var count = chars.count
    var i = 0
    for (c in s.codePoints) {
        if ((c >= 65 && c <= 90) || (c >= 192 && c <= 214) || (c >= 216 && c <= 222)) {
            chars[i] = String.fromCodePoint(c + 32)
        } else if (c < 256) {
            // catch other Latin-1 characters quickly.
        } else if ((c >= 0x0100 && c <= 0x0136) && (c % 2 == 0)) {
            chars[i] = String.fromCodePoint(c + 1)
        } else if ((c >= 0x0139 && c <= 0x0147) && (c % 2 == 1)) {
            chars[i] = String.fromCodePoint(c + 1)
        } else if ((c >= 0x014A && c <= 0x0176) && (c % 2 == 0)) {
            chars[i] = String.fromCodePoint(c + 1)
        } else if (c == 0x0178) {
            chars[i] = "ÿ"
        } else if (c == 0x0179 || c == 0x017B || c == 0x017D || c == 0x01A0 || c == 0x01AF || c == 0x01F4) {
            chars[i] = String.fromCodePoint(c + 1)
        } else if (c == 0x01C4 || c == 0x01C7 || c == 0x01CA || c == 0x01F1) {
            // Upper case digraph: the lower case form is two codepoints further on.
            chars[i] = String.fromCodePoint(c + 2)
        } else if (c == 0x01C5 || c == 0x01C8 || c == 0x01CB || c == 0x01F2) {
            // Title case digraph: the lower case form is the next codepoint.
            chars[i] = String.fromCodePoint(c + 1)
        } else if ((c >= 0x01DE && c <= 0x01EE) && (c % 2 == 0)) {
            chars[i] = String.fromCodePoint(c + 1)
        } else if ((c >= 0x01F8 && c <= 0x021E) && (c % 2 == 0)) {
            chars[i] = String.fromCodePoint(c + 1)
        } else if ((c >= 0x1E00 && c <= 0x1E94) && (c % 2 == 0)) {
            chars[i] = String.fromCodePoint(c + 1)
        } else if (c == 0x1E9E) {
            chars[i] = "ß"
        } else if ((c >= 0x1EA0 && c <= 0x1EFE) && (c % 2 == 0)) {
            chars[i] = String.fromCodePoint(c + 1)
        } else if (c == 0x0386) {
            chars[i] = "ά"
        } else if (c == 0x0388 || c == 0x0389 || c == 0x038A) {
            chars[i] = String.fromCodePoint(c + 37)
        } else if (c == 0x038C) {
            chars[i] = "ό"
        } else if (c == 0x038E || c == 0x038F) {
            chars[i] = String.fromCodePoint(c + 63)
        } else if (c >= 0x0391 && c <= 0x03A1) {
            chars[i] = String.fromCodePoint(c + 32)
        } else if (c == 0x03A3) {
            // Capital sigma: the final form is used only for the last character of the string.
            chars[i] = (i == count - 1) ? "ς" : "σ"
        } else if (c >= 0x03A4 && c <= 0x03AB) {
            chars[i] = String.fromCodePoint(c + 32)
        } else if (c >= 0x0400 && c <= 0x040F) {
            // Only U+0400..U+040F shift by 80. The upper bound must stop at 0x040F so that
            // U+0410..U+041F reach the +32 branch below (mirrors the ranges used by 'upper').
            chars[i] = String.fromCodePoint(c + 80)
        } else if (c >= 0x0410 && c <= 0x042F) {
            chars[i] = String.fromCodePoint(c + 32)
        } else if ((c >= 0x048A && c <= 0x04BE) && (c % 2 == 0)) {
            chars[i] = String.fromCodePoint(c + 1)
        } else if ((c >= 0x04C1 && c <= 0x04CD) && (c % 2 == 1)) {
            chars[i] = String.fromCodePoint(c + 1)
        } else if ((c >= 0x04D0 && c <= 0x052E) && (c % 2 == 0)) {
            chars[i] = String.fromCodePoint(c + 1)
        } else if (c >= 0x0531 && c <= 0x0556) {
            chars[i] = String.fromCodePoint(c + 48)
        } else if (c >= 0x10A0 && c <= 0x10C5) {
            chars[i] = String.fromCodePoint(c + 48)
        }
        i = i + 1
    }
    return (count < 1000) ? Strs.concat_(chars) : Strs.concat(chars, 1000)
}

// Converts a UTF-8 string to upper case.
// A non-string argument is first converted to its string representation.
static upper(s) {
    if (!(s is String)) s = "%(s)"
    if (s == "") return s
    var chars = s.toList
    var count = chars.count
    var i = 0
    for (c in s.codePoints) {
        if ((c >= 97 && c <= 122) || (c >= 224 && c <= 246) || (c >= 248 && c <= 254)) {
            chars[i] = String.fromCodePoint(c - 32)
        } else if (c == 223) {
            chars[i] = "ẞ"
        } else if (c == 255) {
            // Inverse of the U+0178 -> "ÿ" mapping used by 'lower'.
            chars[i] = "Ÿ"
        } else if (c < 255) {
            // catch other Latin-1 characters quickly.
        } else if ((c >= 0x0101 && c <= 0x0137) && (c % 2 == 1)) {
            chars[i] = String.fromCodePoint(c - 1)
        } else if ((c >= 0x013A && c <= 0x0148) && (c % 2 == 0)) {
            chars[i] = String.fromCodePoint(c - 1)
        } else if ((c >= 0x014B && c <= 0x0177) && (c % 2 == 1)) {
            chars[i] = String.fromCodePoint(c - 1)
        } else if (c == 0x017A || c == 0x017C || c == 0x017E || c == 0x01A1 || c == 0x01B0 || c == 0x01F5) {
            chars[i] = String.fromCodePoint(c - 1)
        } else if (c == 0x01C5 || c == 0x01C8 || c == 0x01CB || c == 0x01F2) {
            // Title case digraph: the upper case form is the previous codepoint.
            chars[i] = String.fromCodePoint(c - 1)
        } else if (c == 0x01C6 || c == 0x01C9 || c == 0x01CC || c == 0x01F3) {
            // Lower case digraph: the upper case form is two codepoints back.
            chars[i] = String.fromCodePoint(c - 2)
        } else if ((c >= 0x01DF && c <= 0x01EF) && (c % 2 == 1)) {
            chars[i] = String.fromCodePoint(c - 1)
        } else if ((c >= 0x01F9 && c <= 0x021F) && (c % 2 == 1)) {
            chars[i] = String.fromCodePoint(c - 1)
        } else if ((c >= 0x1E01 && c <= 0x1E95) && (c % 2 == 1)) {
            chars[i] = String.fromCodePoint(c - 1)
        } else if ((c >= 0x1EA1 && c <= 0x1EFF) && (c % 2 == 1)) {
            // U+1E9E (capital sharp s) is already upper case and is deliberately left alone.
            chars[i] = String.fromCodePoint(c - 1)
        } else if (c == 0x03AC) {
            chars[i] = "Ά"
        } else if (c == 0x03AD || c == 0x03AE || c == 0x03AF) {
            chars[i] = String.fromCodePoint(c - 37)
        } else if (c >= 0x03B1 && c <= 0x03C1) {
            chars[i] = String.fromCodePoint(c - 32)
        } else if (c == 0x03C2) {
            chars[i] = "Σ"
        } else if (c >= 0x03C3 && c <= 0x03CB) {
            chars[i] = String.fromCodePoint(c - 32)
        } else if (c == 0x03CC) {
            chars[i] = "Ό"
        } else if (c == 0x03CD || c == 0x03CE) {
            chars[i] = String.fromCodePoint(c - 63)
        } else if (c >= 0x0430 && c <= 0x044F) {
            chars[i] = String.fromCodePoint(c - 32)
        } else if (c >= 0x0450 && c <= 0x045F) {
            chars[i] = String.fromCodePoint(c - 80)
        } else if ((c >= 0x048B && c <= 0x04BF) && (c % 2 == 1)) {
            chars[i] = String.fromCodePoint(c - 1)
        } else if ((c >= 0x04C2 && c <= 0x04CE) && (c % 2 == 0)) {
            chars[i] = String.fromCodePoint(c - 1)
        } else if ((c >= 0x04D1 && c <= 0x052F) && (c % 2 == 1)) {
            chars[i] = String.fromCodePoint(c - 1)
        } else if (c >= 0x0561 && c <= 0x0586) {
            chars[i] = String.fromCodePoint(c - 48)
        } else if (c >= 0x10D0 && c <= 0x10F5) {
            chars[i] = String.fromCodePoint(c - 48)
        }
        i = i + 1
    }
    return (count < 1000) ? Strs.concat_(chars) : Strs.concat(chars, 1000)
}

// Capitalizes the first character of a UTF-8 string.
// Uses title rather than upper case variant if it's one of 4 supported digraphs.
static capitalize(s) {
    if (!(s is String)) s = "%(s)"
    if (s == "") return s
    // A leading '[' is skipped so that the character after it is capitalized instead.
    var start = (s.startsWith("[") && s.count > 1) ? 1 : 0
    var cs = upper(s[start])
    var c = cs.codePoints[0]
    if (c == 0x01C4 || c == 0x01C7 || c == 0x01CA || c == 0x01F1) {
        // The title case form is the codepoint after the upper case digraph.
        cs = String.fromCodePoint(c + 1)
    }
    if (s.count > start + 1) cs = cs + s[start+1..-1]
    if (start == 1) cs = "[" + cs
    return cs
}

// Capitalizes the first character of each word of a UTF-8 string.
// Uses title rather than upper case variant if it's one of 4 supported digraphs.
static title(s) {
    if (!(s is String)) s = "%(s)"
    if (s == "") return s
    var words = s.split(" ")
    return Strs.join(words.map { |w| capitalize(w) }.toList, " ")
}

// Removes accents and other diacritical marks from all characters in a string,
// expands digraphs and removes all combining characters before returning the result.
// As well as Latin-1 Supplement, coverage includes Latin Extended-A and various
// other characters found in modern European languages which use the Latin alphabet.
static unaccent(s) {
    // Anything that is not already a String is converted to its string form first.
    if (!(s is String)) s = "%(s)"
    if (s == "") return s

    // accented[j] holds every supported variant of the plain letter unaccented[j];
    // the two tables are therefore parallel and must stay the same length.
    var accented = [
        "àáâãäåāăą", "ÀÁÂÃÄÅĀĂĄ",
        "ḃ", "Ḃ",
        "çćĉċč", "ÇĆĈĊČ",
        "ðďđḋ", "ÐĎĐḊ",
        "èéêëēĕėęě", "ÈÉÊËĒĔĖĘĚ",
        "ḟ", "Ḟ",
        "ĝğġģ", "ĜĞĠĢ",
        "ĥħ", "ĤĦ",
        "ìíîïĩīĭįı", "ÌÍÎÏĨĪĬĮİ",
        "ĵ", "Ĵ",
        "ķĸ", "Ķ",
        "ĺļľŀł", "ĹĻĽĿŁ",
        "ṁ", "Ṁ",
        "ñńņňŉ", "ÑŃŅŇ",
        "òóôõöøōŏő", "ÒÓÔÕÖØŌŎŐ",
        "ṗ", "Ṗ",
        "ŕŗř", "ŔŖŘ",
        "śŝşšșſ", "ŚŜŞŠȘ",
        "ţťŧṱț", "ŢŤŦṰȚ",
        "ùúûüũūŭůűų", "ÙÚÛÜŨŪŬŮŰŲ",
        "ŵẁẃẅ", "ŴẀẂẄ",
        "ýÿỳŷ", "ÝŸỲŶ",
        "źżž", "ŹŻŽ"
    ]
    var unaccented = "aAbBcCdDeEfFgGhHiIjJkKlLmMnNoOpPrRsStTuUwWyYzZ"

    // Characters which expand to two plain letters, keeping the case of the original.
    var digraphs = {
        "æ": "ae", "Æ": "AE", "þ": "th", "Þ": "TH", "ß": "ss", "ẞ": "SS",
        "ĳ": "ij", "Ĳ": "IJ", "ŋ": "ng", "Ŋ": "NG", "œ": "oe", "Œ": "OE"
    }

    var chars = s.toList
    var count = chars.count
    var i = 0
    for (c in s.codePoints) {
        // Only search the tables for code points that can actually occur in them:
        // U+00C0..U+021B (À .. ț) and U+1E02..U+1EF3 (Ḃ .. ỳ).
        if ((c >= 0x00c0 && c <= 0x021b) || (c >= 0x1e02 && c <= 0x1ef3)) {
            var found = false
            for (j in 0...accented.count) {
                if (accented[j].contains(chars[i])) {
                    chars[i] = unaccented[j]
                    found = true
                    break
                }
            }
            if (!found && digraphs.containsKey(chars[i])) chars[i] = digraphs[chars[i]]
        } else if (c >= 0x0300 && c <= 0x036f) {
            // Combining diacritical marks are dropped altogether.
            chars[i] = ""
        }
        i = i + 1
    }
    return (count < 1000) ? Strs.concat_(chars) : Strs.concat(chars, 1000)
}

// Converts a Windows-1252 encoded byte string to a UTF-8 encoded string.
// Aborts the fiber if the argument is not a String.
static fromWin1252(win1252) {
    if (!(win1252 is String)) Fiber.abort("Argument must be a byte string.")
    if (win1252.count == 0) return ""
    // mapping for Windows 1252 bytes 128-159.
    // Unused bytes are mapped to the corresponding ISO-8859-1 control codes.
    var bm = [
        0x20ac, 0x0081, 0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021,
        0x02c6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008d, 0x017d, 0x008f,
        0x0090, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
        0x02dc, 0x2122, 0x0161, 0x203a, 0x0153, 0x009d, 0x017e, 0x0178
    ]
    var bytes = win1252.bytes
    var utf8 = List.filled(bytes.count, 0)
    for (i in 0...bytes.count) {
        var b = bytes[i]
        if (b < 128 || b > 159) {
            // Outside 128-159 the byte value is used directly as the code point.
            utf8[i] = String.fromCodePoint(b)
        } else {
            utf8[i] = String.fromCodePoint(bm[b-128])
        }
    }
    return utf8.join()
}
}

/*
    'Greek' enables characters from the Greek alphabet to be found from their name.
    These characters are often used as mathematical or scientific symbols.
*/
class Greek {
    // Returns the Greek alphabet, lower then upper case characters.
    // Each half has 25 entries; the upper case half has the escape \u03a2 in the
    // position that corresponds to lower case final sigma.
    static alphabet { "αβγδεζηθικλμνξοπρςστυφχψωΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡ\u03a2ΣΤΥΦΧΨΩ" }

    // Returns a list of the names of all Greek letters in alphabetical order.
    static names {
        return [
            "alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta", "iota",
            "kappa", "lambda", "mu", "nu", "xi", "omicron", "pi", "rho", "sigma final",
            "sigma", "tau", "upsilon", "phi", "chi", "psi", "omega"
        ]
    }

    // Returns the name of a Greek character or null if not found.
    // Upper case characters are returned with the initial letter capitalized.
    static name(char) {
        if (char.count != 1) return null
        var ix = alphabet.toList.indexOf(char)
        if (ix == -1) return null
        // The first 25 entries of 'alphabet' are the lower case letters.
        if (ix < 25) return names[ix]
        return Str.capitalize(names[ix-25])
    }

    // Finds and returns a Greek lower case character from its name.
    // The name is matched case-insensitively; aborts the fiber if it is not found.
    static lower(name) {
        name = Str.lower(name)
        var ix = names.indexOf(name)
        if (ix == -1) Fiber.abort("Name not found.")
        return String.fromCodePoint(0x03b1 + ix)
    }

    // Finds and returns a Greek upper case character from its name.
static upper(name) {
    // Names are matched case-insensitively against the 'names' table.
    var key = Str.lower(name)
    var pos = names.indexOf(key)
    if (pos == -1) Fiber.abort("Name not found.")
    // "sigma final" is moved one slot on, so it yields the same capital as "sigma".
    var offset = (key == "sigma final") ? pos + 1 : pos
    return String.fromCodePoint(0x0391 + offset)
}
}</syntaxhighlight>