Jump to content

Category talk:Wren-str: Difference between revisions

→‎Source code: Added a Str.indexOf method and a Utf8 class.
(Copied over source code from previous 'str' module talk page.)
 
(→‎Source code: Added a Str.indexOf method and a Utf8 class.)
Line 27:
// Tests 'c' for whitespace: passes it through 'code' (defined elsewhere in this class;
// presumably it yields the character's codepoint - confirm) and accepts the space (32)
// or any value in the range 9 to 13 (tab, line feed, vertical tab, form feed,
// carriage return). A falsy result from 'code' is returned unchanged.
static isWhitespace(c) {
    var cp = code(c)
    return cp && (cp == 32 || (cp >= 9 && cp <= 13))
}
 
/* Rather than use combinations of the above, these only call the 'code' method once. */
 
static isLetter(c) {
Line 229:
if (!(s is String)) s = "%(s)"
return (i >= 0 && i < s.count) ? s.toList[i] : null
}
 
// Returns the codepoint index (not byte index) at which 'search' first occurs in 's'
// or -1 if 'search' is not found.
// Aborts the fiber if 'search' is not a string; a non-string 's' is first converted
// to its string representation.
static indexOf(s, search) {
    if (!(search is String)) Fiber.abort("Search argument must be a string.")
    if (!(s is String)) s = "%(s)"
    // The built-in String.indexOf reports a byte offset, or -1 when there is no match.
    // Neither -1 nor 0 needs converting.
    var ix = s.indexOf(search)
    if (ix <= 0) return ix
    // A string's iterator values are the byte offsets at which its codepoints start,
    // so the codepoint index is the number of codepoints that start before byte 'ix'.
    // Walking the offsets directly (rather than summing encoded lengths) always yields
    // a number, even when 's' contains bytes that do not form valid UTF-8.
    var cpIndex = 0
    var offset = s.iterate(null)
    while ((offset is Num) && offset < ix) {
        cpIndex = cpIndex + 1
        offset = s.iterate(offset)
    }
    return cpIndex
}
 
Line 234 ⟶ 251:
static change(s, i, t) {
if (!(i is Num && i.isInteger && i >= 0)) Fiber.abort("Index must be a non-negative integer.")
if (!(t is String)) Fiber.abort("ReplacmentReplacement must be a string.")
if (!(s is String)) s = "%(s)"
var chars = s.toList
Line 294 ⟶ 311:
if (final > 0) res.add(sub(s, first..-1))
return res
}
}
 
/*
Utf8 contains routines which are specific to the UTF-8 encoding of a string's bytes or codepoints.
*/
class Utf8 {
    // Returns the number of bytes (1 to 4) in the UTF-8 encoding of its codepoint argument.
    // Aborts the fiber if the codepoint lies outside the range 0 to 0x10ffff.
    static byteCount(cp) {
        if (cp < 0 || cp > 0x10ffff) Fiber.abort("Codepoint is out of range.")
        if (cp < 0x80) return 1
        if (cp < 0x800) return 2
        if (cp < 0x10000) return 3
        return 4
    }

    // Converts a Unicode codepoint into its constituent UTF-8 bytes, returned as a list.
    static encode(cp) { String.fromCodePoint(cp).bytes.toList }

    // Converts a list of UTF-8 encoded bytes into the equivalent Unicode codepoint.
    // The first byte determines how many bytes are decoded; any surplus bytes in the
    // list are ignored. Aborts the fiber if the argument is not a list of 1 to 4 items
    // starting with an integer, if the first byte cannot start a UTF-8 sequence, if the
    // list is shorter than the first byte requires, or if a required trailing byte is
    // not a continuation byte.
    static decode(b) {
        if (!((b is List) && b.count >= 1 && b.count <= 4 && (b[0] is Num) && b[0].isInteger)) {
            Fiber.abort("Argument must be a byte list of length 1 to 4.")
        }
        var b0 = b[0]
        // Continuation bytes (0x80 to 0xbf) and values above 0xf7 can never start a sequence.
        if (b0 < 0 || b0 > 0xf7 || (b0 >= 0x80 && b0 < 0xc0)) {
            Fiber.abort("First byte is not a valid UTF-8 lead byte.")
        }
        // Number of bytes in the sequence, as announced by the lead byte.
        var needed = (b0 < 0x80) ? 1 : ((b0 < 0xe0) ? 2 : ((b0 < 0xf0) ? 3 : 4))
        if (b.count < needed) {
            Fiber.abort("Byte list is too short for the encoding indicated by its first byte.")
        }
        // Data bits carried by the lead byte of a 1, 2, 3 or 4 byte encoding respectively:
        // 0xxxxxxx, 110xxxxx, 1110xxxx and 11110xxx.
        var leadMasks = [0x7f, 0x1f, 0x0f, 0x07]
        var mbMask = 0x3f // non-first bytes start 10 and carry 6 bits of data
        var cp = b0 & leadMasks[needed - 1]
        for (i in 1...needed) {
            var bi = b[i]
            if (!((bi is Num) && bi.isInteger && bi >= 0x80 && bi <= 0xbf)) {
                Fiber.abort("Continuation bytes must be integers in the range 0x80 to 0xbf.")
            }
            // Each continuation byte contributes the next 6 lower-order bits.
            cp = (cp << 6) | (bi & mbMask)
        }
        return cp
    }
}
 
// Type aliases for classes in case of any name clashes with other modules.
// Each alias prefixes the class name with 'Str_' so that an importer can refer
// to the class under a name that is unlikely to collide.
var Str_Char = Char
var Str_Str = Str
var Str_Utf8 = Utf8
9,483

edits

Cookies help us deliver our services. By using our services, you agree to our use of cookies.