Category talk:Wren-str: Difference between revisions

← Older edit

Category talk:Wren-str (view source)

Revision as of 10:53, 30 March 2024

9,837 bytes added, 1 month ago

→‎Source code: Added Str.lastIndexOf method.

PureFox

9,476

edits

Revision as of 18:43, 3 January 2023 (view source) PureFox (talk \| contribs) (Bug fixed properly now!) ← Older edit		Latest revision as of 10:53, 30 March 2024 (view source) PureFox (talk \| contribs) (→‎Source code: Added Str.lastIndexOf method.)
(4 intermediate revisions by the same user not shown)
Line 1: ===Source code=== <syntaxhighlight lang="~~ecmascript~~wren">/* Module "str.wren" / /
Line 236:
    var words = s.split(" ")
    return Strs.join(words.map { |w| capitalize(w) }.toList, " ")
}

// Removes accents and cedillas from all Latin-1 supplement characters in a string
// and also expands digraphs before returning the result.
// A non-string argument is first converted to its string representation.
static unaccent(s) {
    if (!(s is String)) s = "%(s)"
    if (s == "") return s
    // accented[j] lists every variant that is replaced by the single letter unaccented[j].
    var accented = [
        "àáâãäå", "ÀÁÂÃÄÅ", "ç", "Ç", "ð", "Ð", "èéêë", "ÈÉÊË", "ìíîï", "ÌÍÎÏ",
        "ñ", "Ñ", "òóôõöø", "ÒÓÔÕÖØ", "ùúûü", "ÙÚÛÜ", "ýÿ", "Ý"
    ]
    var unaccented = "aAcCdDeEiInNoOuUyY"
    var digraphs = { "æ": "ae", "Æ": "AE", "þ": "th", "Þ": "TH", "ß": "ss" }
    var chars = s.toList
    var count = chars.count
    var i = 0
    for (c in s.codePoints) {
        // Only codepoints in 0xc0..0xff are looked up in the tables.
        if (c >= 0xc0 && c <= 0xff) {
            var found = false
            for (j in 0...accented.count) {
                if (accented[j].indexOf(chars[i]) >= 0) {
                    chars[i] = unaccented[j]
                    found = true
                    break
                }
            }
            if (!found && digraphs.containsKey(chars[i])) chars[i] = digraphs[chars[i]]
        }
        i = i + 1
    }
    return (count < 1000) ? Strs.concat_(chars) : Strs.concat(chars, 1000)
}
Line 343 ⟶ 375:
    var ix = Str.indexOf(ss, search)
    return (ix >= 0) ? start + ix : -1
}

// Returns the codepoint index (not byte index) at which 'search' last occurs in 's'
// or -1 if 'search' is not found. 'search' may be one or more codepoints long.
// Aborts the fiber if 'search' is not a string; a non-string 's' is first converted
// to its string representation. An empty 's' or an empty 'search' gives -1.
static lastIndexOf(s, search) {
    if (!(search is String)) Fiber.abort("Search argument must be a string.")
    if (!(s is String)) s = "%(s)"
    var hay = s.toList
    var needle = search.toList
    var n = needle.count
    // An empty search string never matches (it never did when compared character by character).
    if (n == 0) return -1
    // Start at the last position where 'search' still fits and walk backwards.
    // A 'while' loop is used because the range count-1..0 would count *upwards*
    // from -1 when 's' is empty and index an empty list.
    var i = hay.count - n
    while (i >= 0) {
        var j = 0
        while (j < n && hay[i + j] == needle[j]) j = j + 1
        if (j == n) return i
        i = i - 1
    }
    return -1
}
Line 595 ⟶ 639:
    var chars = ca.map { |c| String.fromCodePoint(c) }.toList
    return (count < 1000) ? Strs.concat_(chars) : Strs.concat(chars, 1000)
}

// After trimming whitespace from the string 's', takes as many characters as possible
// to form a valid number and converts it thereto using the Num.fromString method.
// Returns null if such a conversion is impossible.
static toNum(s) {
    if (s is Num) return s
    if (!(s is String)) s = "%(s)"
    s = s.trim()
    var n = Num.fromString(s)
    if (n) return n
    if (s.count < 2) return null
    var chars = s.toList
    // Drop one trailing character at a time until what is left parses as a number.
    for (i in chars.count-1..1) {
        chars.removeAt(i)
        if (n = Num.fromString(chars.join())) return n
    }
    return null
}

// Converts a pattern into a list of tokens for processing by the 'isMatch' method.
// Characters within the pattern are represented as follows:
//   Non-wildcard characters as themselves (i.e. single character strings);
//   * (or **) by the number 0;
//   ? (or *?) by the number 1;
//   [set] by a list of the tokens within the set:
//     single characters by themselves;
//     a range of characters, a-b, by a Range of codepoints from 'a' to 'b'.
//   If the first character of the set is '!' then the number -1 is inserted
//   as a separate token immediately before the list.
// A '[' which is the last character of the pattern, or which has no closing ']',
// is kept as an ordinary character. Aborts the fiber if a set turns out to be empty.
static tokenize(pattern) {
    var tokens = []
    var i = 0
    var j
    while (i < pattern.count) {
        var c = pattern[i]
        if (c == "*") {
            // Consecutive '*'s collapse into a single 0 token.
            if (i == 0 || tokens[-1] != 0) tokens.add(0)
        } else if (c == "?") {
            // A '?' directly after '*' replaces the 0 token with a 1 token.
            if (i > 0 && tokens[-1] == 0) tokens[-1] = 1 else tokens.add(1)
        } else if (c == "[") {
            if (i == pattern.count - 1) {
                tokens.add(c)
            } else if ((j = indexOf(pattern, "]", i + 1)) == -1) {
                tokens.add(c)
            } else {
                var l = []
                var s = sub(pattern, i+1...j)
                var k = 0
                while (k < s.count) {
                    var d = s[k]
                    if (d == "!") {
                        // Only a leading '!' negates the set; elsewhere it is a literal.
                        if (k == 0) tokens.add(-1) else l.add(d)
                    } else if (k < s.count - 2 && s[k+1] == "-") {
                        l.add(d.codePoints[0]..s[k+2].codePoints[0])
                        k = k + 2
                    } else {
                        l.add(d)
                    }
                    k = k + 1
                }
                if (l.count == 0) Fiber.abort("set cannot be empty.")
                tokens.add(l)
                // Skip over the set's contents; the closing ']' is passed by the
                // increment at the bottom of the loop.
                i = i + s.count + 1
            }
        } else {
            tokens.add(c)
        }
        i = i + 1
    }
    return tokens
}

// Returns whether a string 's' matches a 'pattern' which may already be tokenized
// if many strings are to be matched. Matching is case sensitive.
// Patterns may contain the following wildcards:
//   * (or **) matches zero or more characters until the next token (if any) matches
//     and doesn't backtrack in the event of subsequent failure;
//   ? (or *?) matches exactly one character;
//   [set] matches a single character from the set within the brackets e.g. [aeiou].
//   The set can also contain ranges of characters separated by '-' e.g. [a-zA-Z].
//   If the first character of the set is '!' then only characters NOT within the rest
//   of the set are matched e.g. [!0-9] matches any character other than a digit.
// A string pattern is passed through 'tokenize' first. Aborts the fiber if the pattern
// is neither a non-empty string nor a non-empty list, or if an unknown token is met.
static isMatch(s, pattern) {
    var tokens = pattern
    if (tokens is String) tokens = tokenize(tokens)
    if (!((tokens is List) && tokens.count > 0)) {
        Fiber.abort("'pattern' must be a non-empty string or list of tokens.")
    }
    var i = 0          // current position in 's'
    var j = 0          // current position in 'tokens'
    var star = false   // true while a preceding 0 token is still absorbing characters
    var neg = false    // true when a -1 token has just been seen, i.e. the next set is negated
    while (i < s.count && j < tokens.count) {
        var c = s[i]
        var t = tokens[j]
        if (t is Num) {
            if (t == 0) {
                star = true
            } else if (t == 1) {
                // Consumes one character unconditionally and cancels any pending star.
                i = i + 1
                star = false
            } else if (t == -1) {
                neg = true
            } else {
                Fiber.abort("'%(t)' is not a recognized token.")
            }
            j = j + 1
        } else if (t is String) {
            if (!star && c != t) return false
            // Under a star, mismatches are absorbed; the first match ends the star.
            if (star && c == t) star = false
            i = i + 1
            // The token is only passed once it has actually been matched.
            if (!star) j = j + 1
        } else if (t is List) {
            var matched = false
            for (e in t) {
                if (e is String) {
                    if (e == c) {
                        matched = true
                        break
                    }
                } else if (e is Range){
                    var cp = c.codePoints[0]
                    if (cp >= e.from && cp <= e.to) {
                        matched = true
                        break
                    }
                } else {
                    Fiber.abort("'%(e)' is not a recognized token within a set.")
                }
            }
            if (!star && !neg && !matched) return false
            if (!star && neg && matched) return false
            // NOTE(review): while a star is pending, 'neg' is not consulted here, so a
            // negated set after '*' ends the star on a character that IS in the set.
            // Confirm this is intended.
            if (star && matched) star = false
            i = i + 1
            neg = false
            if (!star) j = j + 1
        } else {
            Fiber.abort("'%(t)' is not a recognized token.")
        }
    }
    // Both the string and the token list were used up together.
    if (i == s.count && j == tokens.count) return true
    // All tokens were used and the last one was a star, which takes the rest of 's'.
    if (j == tokens.count && tokens[-1] == 0) return true
    // Only a final star token is left, and it may match zero characters.
    if (j == tokens.count - 1 && tokens[-1] == 0) return true
    return false
}
}
Line 895 ⟶ 1,084:
    var words = s.split(" ")
    return Strs.join(words.map { \|w\| 
capitalize(w) }.toList, " ")
}

// Removes accents and other diacritical marks from all characters in a string,
// expands digraphs and removes all combining characters before returning the result.
// As well as Latin-1 Supplement, coverage includes Latin Extended-A and various
// other characters found in modern European languages which use the Latin alphabet.
// A non-string argument is first converted to its string representation.
static unaccent(s) {
    if (!(s is String)) s = "%(s)"
    if (s == "") return s
    // accented[j] lists every variant that is replaced by the single letter unaccented[j];
    // entries alternate lower case, upper case.
    var accented = [
        "àáâãäåāăą", "ÀÁÂÃÄÅĀĂĄ", "ḃ", "Ḃ", "çćĉċč", "ÇĆĈĊČ", "ðďđḋ", "ÐĎĐḊ",
        "èéêëēĕėęě", "ÈÉÊËĒĔĖĘĚ", "ḟ", "Ḟ", "ĝğġģ", "ĜĞĠĢ", "ĥħ", "ĤĦ",
        "ìíîïĩīĭįı", "ÌÍÎÏĨĪĬĮİ", "ĵ", "Ĵ", "ķĸ", "Ķ", "ĺļľŀł", "ĹĻĽĿŁ", "ṁ", "Ṁ",
        "ñńņňŉ", "ÑŃŅŇ", "òóôõöøōŏő", "ÒÓÔÕÖØŌŎŐ", "ṗ", "Ṗ", "ŕŗř", "ŔŖŘ",
        "śŝşšșſ", "ŚŜŞŠȘ", "ţťŧṱț", "ŢŤŦṰȚ", "ùúûüũūŭůűų", "ÙÚÛÜŨŪŬŮŰŲ",
        "ŵẁẃẅ", "ŴẀẂẄ", "ýÿỳŷ", "ÝŸỲŶ", "źżž", "ŹŻŽ"
    ]
    var unaccented = "aAbBcCdDeEfFgGhHiIjJkKlLmMnNoOpPrRsStTuUwWyYzZ"
    var digraphs = {
        "æ": "ae", "Æ": "AE", "þ": "th", "Þ": "TH", "ß": "ss", "ẞ": "SS",
        "ĳ": "ij", "Ĳ": "IJ", "ŋ": "ng", "Ŋ": "NG", "œ": "oe", "Œ": "OE"
    }
    var chars = s.toList
    var count = chars.count
    var i = 0
    for (c in s.codePoints) {
        // The two ranges must span every character in the tables above:
        // 0x00c0..0x021b reaches 'ț' (U+021B) and 0x1e02..0x1ef3 reaches 'ỳ' (U+1EF3).
        if ((c >= 0x00c0 && c <= 0x021b) || (c >= 0x1e02 && c <= 0x1ef3)) {
            var found = false
            for (j in 0...accented.count) {
                if (accented[j].indexOf(chars[i]) >= 0) {
                    chars[i] = unaccented[j]
                    found = true
                    break
                }
            }
            if (!found && digraphs.containsKey(chars[i])) chars[i] = digraphs[chars[i]]
        } else if (c >= 0x0300 && c <= 0x036f) {
            // Combining diacritical marks are dropped altogether.
            chars[i] = ""
        }
        i = i + 1
    }
    return (count < 1000) ? Strs.concat_(chars) : Strs.concat(chars, 1000)
}