Category talk:Wren-str: Difference between revisions

→‎Source code: Added Str.lastIndexOf method.
(Bug fixed properly now!)
(→‎Source code: Added Str.lastIndexOf method.)
 
(4 intermediate revisions by the same user not shown)
Line 1:
===Source code===
 
<syntaxhighlight lang="wren">/* Module "str.wren" */
 
/*
Line 236:
var words = s.split(" ")
return Strs.join(words.map { |w| capitalize(w) }.toList, " ")
}
 
// Strips accents and cedillas from every Latin-1 Supplement character in a string,
// expands the digraphs (æ, Æ, þ, Þ, ß) to two letters and returns the resulting string.
// A non-string argument is first converted to its string representation.
static unaccent(s) {
    if (!(s is String)) s = "%(s)"
    if (s == "") return s
    // Each group of accented letters is replaced by the letter at the same index in 'plain'.
    var groups = [
        "àáâãäå", "ÀÁÂÃÄÅ", "ç", "Ç", "ð", "Ð", "èéêë", "ÈÉÊË", "ìíîï", "ÌÍÎÏ",
        "ñ", "Ñ", "òóôõöø", "ÒÓÔÕÖØ", "ùúûü", "ÙÚÛÜ", "ýÿ", "Ý"
    ]
    var plain = "aAcCdDeEiInNoOuUyY"
    var expansions = { "æ": "ae", "Æ": "AE", "þ": "th", "Þ": "TH", "ß": "ss" }
    var letters = s.toList
    var total = letters.count
    var pos = 0
    for (cp in s.codePoints) {
        // Only codepoints in U+00C0..U+00FF can appear in the tables above.
        if (cp >= 0xc0 && cp <= 0xff) {
            var ch = letters[pos]
            // Find the first group (if any) containing this character.
            var g = 0
            while (g < groups.count && !groups[g].contains(ch)) g = g + 1
            if (g < groups.count) {
                letters[pos] = plain[g]
            } else if (expansions.containsKey(ch)) {
                letters[pos] = expansions[ch]
            }
        }
        pos = pos + 1
    }
    // Shorter inputs are concatenated with Strs.concat_; longer ones go through Strs.concat.
    if (total < 1000) return Strs.concat_(letters)
    return Strs.concat(letters, 1000)
}
 
Line 343 ⟶ 375:
var ix = Str.indexOf(ss, search)
return (ix >= 0) ? start + ix : -1
}
 
// Returns the codepoint index (not byte index) at which 'search' last occurs in 's'
// or -1 if 'search' is not found. 'search' may be one or more characters long.
// A non-string 's' is first converted to its string representation.
// Returns -1 if 's' is empty, if 'search' is empty or if 'search' is longer than 's'.
// Aborts the fiber if 'search' is not a string.
static lastIndexOf(s, search) {
    if (!(search is String)) Fiber.abort("Search argument must be a string.")
    if (!(s is String)) s = "%(s)"
    // Work on lists of codepoints so that indices are codepoint (not byte) based.
    var chars = s.toList
    var target = search.toList
    var n = target.count
    // Also guards the empty-string case, where a descending range would be ill-formed.
    if (n == 0 || n > chars.count) return -1
    // Slide a window of 'n' codepoints from the end of 's' towards the start.
    var i = chars.count - n
    while (i >= 0) {
        var j = 0
        while (j < n && chars[i + j] == target[j]) j = j + 1
        if (j == n) return i
        i = i - 1
    }
    return -1
}
 
Line 595 ⟶ 639:
var chars = ca.map { |c| String.fromCodePoint(c) }.toList
return (count < 1000) ? Strs.concat_(chars) : Strs.concat(chars, 1000)
}
 
// Trims whitespace from 's' and then converts the longest leading run of characters
// that Num.fromString accepts into a number.
// A Num argument is returned unchanged; any other non-string argument is first
// converted to its string representation.
// Returns null if no leading part of the trimmed string converts to a number.
static toNum(s) {
    if (s is Num) return s
    var text = (s is String) ? s : "%(s)"
    text = text.trim()
    var value = Num.fromString(text)
    if (value != null) return value
    if (text.count < 2) return null
    // Try successively shorter prefixes, always keeping at least the first character.
    var cps = text.toList
    var len = cps.count - 1
    while (len >= 1) {
        value = Num.fromString(cps[0...len].join())
        if (value != null) return value
        len = len - 1
    }
    return null
}
 
// Converts a pattern into a list of tokens for processing by the 'isMatch' method.
// Characters within the pattern are represented as follows:
// Non-wildcard characters as themselves (i.e. single character strings);
// * (or **) by the number 0;
// ? (or *?) by the number 1;
// [set] by a list of the tokens within the set:
// single characters by themselves;
// a range of characters, a-b, by a Range of codepoints from 'a' to 'b'.
// If the first character of the set is '!' then the number -1 is inserted
// as a separate token immediately before the list.
// A '[' which is the last character of the pattern, or for which no later ']' is
// found, is kept as the literal token "[".
// Aborts the fiber if a set produces no tokens (e.g. "[!]").
// NOTE(review): characters are read with 'pattern[i]' while 'i' is bounded by
// 'pattern.count' - presumably patterns are expected to be ASCII; confirm the
// behavior for multi-byte characters.
static tokenize(pattern) {
var tokens = []
var i = 0
var j
while (i < pattern.count) {
var c = pattern[i]
if (c == "*") {
// A '*' directly after another 0 token adds nothing, so runs of '*' collapse.
if (i == 0 || tokens[-1] != 0) tokens.add(0)
} else if (c == "?") {
// A '?' directly after a 0 token replaces it, so "*?" becomes a single 1.
if (i > 0 && tokens[-1] == 0) tokens[-1] = 1 else tokens.add(1)
} else if (c == "[") {
if (i == pattern.count - 1) {
// Trailing '[' cannot open a set: treat it as a literal.
tokens.add(c)
} else if ((j = indexOf(pattern, "]", i + 1)) == -1) {
// No closing ']' found: treat the '[' as a literal.
tokens.add(c)
} else {
// 'l' collects the tokens of the set; 's' is the text between the brackets.
var l = []
var s = sub(pattern, i+1...j)
var k = 0
while (k < s.count) {
var d = s[k]
if (d == "!") {
// Only a leading '!' negates the set; elsewhere it is an ordinary member.
if (k == 0) tokens.add(-1) else l.add(d)
} else if (k < s.count - 2 && s[k+1] == "-") {
// 'd-x' with a character after the '-': store the codepoint range and
// skip over the '-' and the end character.
l.add(d.codePoints[0]..s[k+2].codePoints[0])
k = k + 2
} else {
l.add(d)
}
k = k + 1
}
if (l.count == 0) Fiber.abort("set cannot be empty.")
tokens.add(l)
// Advance over the set's contents; together with the increment at the
// bottom of the loop this presumably leaves 'i' just past the ']'.
i = i + s.count + 1
}
} else {
// Any other character is a literal token.
tokens.add(c)
}
i = i + 1
}
return tokens
}
 
// Returns whether a string 's' matches a 'pattern' which may already be tokenized
// if many strings are to be matched. Matching is case sensitive.
// Patterns may contain the following wildcards:
// * (or **) matches zero or more characters until the next token (if any) matches
// and doesn't backtrack in the event of subsequent failure;
// ? (or *?) matches exactly one character;
// [set] matches a single character from the set within the brackets e.g. [aeiou].
// The set can also contain ranges of characters separated by '-' e.g. [a-zA-Z].
// If the first character of the set is '!' then only characters NOT within the rest
// of the set are matched e.g. [!0-9] matches any character other than a digit.
// 'pattern' may be a String (tokenized here via 'tokenize') or a List of tokens.
// Aborts the fiber if 'pattern' is neither of these or yields no tokens, or if an
// unrecognized token is encountered while matching.
static isMatch(s, pattern) {
var tokens = pattern
if (tokens is String) tokens = tokenize(tokens)
if (!((tokens is List) && tokens.count > 0)) {
Fiber.abort("'pattern' must be a non-empty string or list of tokens.")
}
// 'i' indexes into 's' and 'j' into 'tokens'.
var i = 0
var j = 0
// 'star' is set after a 0 token: characters are then skipped until the next token matches.
var star = false
// 'neg' is set after a -1 token: the next set token must then NOT match.
var neg = false
while (i < s.count && j < tokens.count) {
var c = s[i]
var t = tokens[j]
if (t is Num) {
if (t == 0) {
// '*': consumes no character itself, just arms the skip mode.
star = true
} else if (t == 1) {
// '?': consumes exactly one character and cancels any pending '*'.
i = i + 1
star = false
} else if (t == -1) {
neg = true
} else {
Fiber.abort("'%(t)' is not a recognized token.")
}
j = j + 1
} else if (t is String) {
// Literal character: a mismatch fails unless a '*' is pending.
if (!star && c != t) return false
if (star && c == t) star = false
i = i + 1
// While a '*' is still pending, stay on this token and try the next character.
if (!star) j = j + 1
} else if (t is List) {
// Set: 'matched' records whether 'c' equals a member or lies within a range.
var matched = false
for (e in t) {
if (e is String) {
if (e == c) {
matched = true
break
}
} else if (e is Range){
var cp = c.codePoints[0]
if (cp >= e.from && cp <= e.to) {
matched = true
break
}
} else {
Fiber.abort("'%(e)' is not a recognized token within a set.")
}
}
// Without a pending '*', a plain set must match and a negated set must not.
if (!star && !neg && !matched) return false
if (!star && neg && matched) return false
if (star && matched) star = false
i = i + 1
neg = false
if (!star) j = j + 1
} else {
Fiber.abort("'%(t)' is not a recognized token.")
}
}
// Both the string and the tokens were fully consumed.
if (i == s.count && j == tokens.count) return true
// All tokens were consumed and the last one was '*'.
if (j == tokens.count && tokens[-1] == 0) return true
// Only a final '*' token remains unprocessed.
if (j == tokens.count - 1 && tokens[-1] == 0) return true
return false
}
}
Line 895 ⟶ 1,084:
var words = s.split(" ")
return Strs.join(words.map { |w| capitalize(w) }.toList, " ")
}
 
// Removes accents and other diacritical marks from all characters in a string,
// expands digraphs and removes all combining characters before returning the result.
// As well as Latin-1 Supplement, coverage includes Latin Extended-A and various
// other characters found in modern European languages which use the Latin alphabet.
// A non-string argument is first converted to its string representation.
static unaccent(s) {
    if (!(s is String)) s = "%(s)"
    if (s == "") return s
    // accented[j] lists every variant that is replaced by the single letter unaccented[j].
    // \u0149 is the precomposed 'n preceded by apostrophe'; it is written as an escape so
    // that it cannot be silently decomposed into two characters.
    var accented = [
        "àáâãäåāăą", "ÀÁÂÃÄÅĀĂĄ", "ḃ", "Ḃ", "çćĉċč", "ÇĆĈĊČ", "ðďđḋ", "ÐĎĐḊ",
        "èéêëēĕėęě", "ÈÉÊËĒĔĖĘĚ", "ḟ", "Ḟ", "ĝğġģ", "ĜĞĠĢ", "ĥħ", "ĤĦ",
        "ìíîïĩīĭįı", "ÌÍÎÏĨĪĬĮİ", "ĵ", "Ĵ", "ķĸ", "Ķ", "ĺļľŀł", "ĹĻĽĿŁ",
        "ṁ", "Ṁ", "ñńņň\u0149", "ÑŃŅŇ", "òóôõöøōŏő", "ÒÓÔÕÖØŌŎŐ", "ṗ", "Ṗ",
        "ŕŗř", "ŔŖŘ", "śŝşšșſ", "ŚŜŞŠȘ", "ţťŧṱț", "ŢŤŦṰȚ", "ùúûüũūŭůűų",
        "ÙÚÛÜŨŪŬŮŰŲ", "ŵẁẃẅ", "ŴẀẂẄ", "ýÿỳŷ", "ÝŸỲŶ", "źżž", "ŹŻŽ"
    ]
    var unaccented = "aAbBcCdDeEfFgGhHiIjJkKlLmMnNoOpPrRsStTuUwWyYzZ"
    // Single characters which expand to two letters, preserving case.
    // \u0133 and \u0132 are the lower and upper case 'ij' ligatures.
    var digraphs = {
        "æ": "ae", "Æ": "AE", "þ": "th", "Þ": "TH", "ß": "ss", "ẞ": "SS",
        "\u0133": "ij", "\u0132": "IJ", "ŋ": "ng", "Ŋ": "NG", "œ": "oe", "Œ": "OE"
    }
    var chars = s.toList
    var count = chars.count
    var i = 0
    for (c in s.codePoints) {
        // Cheap pre-filter: these two spans cover every character in the tables above
        // (U+00C0..U+021B reaches the Romanian comma-below letters and U+1E02..U+1EF3
        // reaches the grave-accented 'y'); the tables decide what is actually replaced.
        if ((c >= 0x00c0 && c <= 0x021b) || (c >= 0x1e02 && c <= 0x1ef3)) {
            var found = false
            for (j in 0...accented.count) {
                if (accented[j].indexOf(chars[i]) >= 0) {
                    chars[i] = unaccented[j]
                    found = true
                    break
                }
            }
            if (!found && digraphs.containsKey(chars[i])) chars[i] = digraphs[chars[i]]
        } else if (c >= 0x0300 && c <= 0x036f) {
            // Combining diacritical marks are dropped altogether.
            chars[i] = ""
        }
        i = i + 1
    }
    return (count < 1000) ? Strs.concat_(chars) : Strs.concat(chars, 1000)
}
 
9,476

edits