String length: Difference between revisions

Add Ecstasy example
(→‎{{header|Vlang}}: Rename "Vlang" in "V (Vlang)")
(Add Ecstasy example)
 
(29 intermediate revisions by 16 users not shown)
Line 579:
===Character Length===
{{works with|QBasic}}
 
{{works with|Liberty BASIC}}
 
{{works with|PowerBASIC|PB/CC, PB/DOS}}
 
Line 587 ⟶ 585:
<syntaxhighlight lang="qbasic"> INPUT a$
PRINT LEN(a$)</syntaxhighlight>
 
==={{header|ANSI BASIC}}===
ANSI BASIC requires line numbers.
<syntaxhighlight lang="basic">
10 INPUT A$
20 PRINT LEN(A$)
</syntaxhighlight>
 
==={{header|Applesoft BASIC}}===
The [[#GW-BASIC|GW-BASIC]] solution works without any changes.
 
==={{header|BASIC256}}===
The [[#GW-BASIC|GW-BASIC]] solution works without any changes.
 
==={{header|Chipmunk Basic}}===
The [[#GW-BASIC|GW-BASIC]] solution works without any changes.
 
==={{header|MSX Basic}}===
{{works with|MSX BASIC|any}}
The [[#GW-BASIC|GW-BASIC]] solution works without any changes.
 
==={{header|Quite BASIC}}===
The [[#GW-BASIC|GW-BASIC]] solution works without any changes.
 
==={{header|True BASIC}}===
The [[#GW-BASIC|GW-BASIC]] solution works without any changes.
 
==={{header|Yabasic}}===
The [[#GW-BASIC|GW-BASIC]] solution works without any changes.
 
==={{header|ZX Spectrum Basic}}===
Line 755 ⟶ 782:
. vap$((=.!arg).!arg):? [?length&!length
);</syntaxhighlight>
 
=={{header|Brainf***}}==
===Byte Length===
There are several limitations Brainf*** has that influence this solution:
*Brainf*** only supports 8-bit numbers in canonical implementations, so it only supports strings of length below 255.
*The rule of thumb in Brainf*** when reading a string is to always store exactly one byte, no matter how many bytes a character occupies. That's why this solution is strictly a byte-length one.
*No way to pass anything to Brainf*** but giving the arguments as input. That's why this program reads a string and outputs the number of bytes in it.
 
The algorithm [https://esolangs.org/wiki/Brainfuck_algorithms#Print_value_of_cell_x_as_number_for_ANY_sized_cell_.28eg_8bit.2C_100000bit_etc.29 "Print value of cell x as number for ANY sized cell"] from the Esolang wiki is used to print the number from memory.
 
<syntaxhighlight lang="bf">
,----- ----- [>,----- -----] ; read a text until a newline
<[+++++ +++++<] ; restore the original text
>[[-]<[>+<-]>+>]< ; add one to the accumulator cell for every byte read
;; from esolang dot org
>[-]>[-]+>[-]+< [>[-<-<<[->+>+<<]>[-<+>]>>]++++++++++>[-]+>[-]>[-]> [-]<<<<<[->-[>+>>]>[[-<+>]+>+>>]<<<<<]>>-[-<<+>>]<[-]++++++++ [-<++++++>]>>[-<<+>>]<<] <[.[-]<]
[-]+++++ +++++. ; print newline
</syntaxhighlight>
 
=={{header|C}}==
Line 1,238 ⟶ 1,283:
===Character Length===
<syntaxhighlight lang="e">"Hello World".size()</syntaxhighlight>
 
=={{header|EasyLang}}==
===Character Length===
<syntaxhighlight lang="easylang">
# 5
print len "møøse"
# 7
print len "𝔘𝔫𝔦𝔠𝔬𝔡𝔢"
# 8
print len "J̲o̲s̲é̲"
# 1
print len "😀"
</syntaxhighlight>
 
=={{header|Ecstasy}}==
<syntaxhighlight lang="ecstasy">
/**
 * Prints, for a given string, its character length (`size`) and the length
 * of its UTF-8 encoding in bytes (`calcUtf8Length()`).
 */
module StrLen {
// The console used for output is obtained by injection.
@Inject Console console;

// Entry point; `s` defaults to "José" when no argument is supplied.
// The report is a single multi-line template literal: each continuation
// line starts with `|`, and `{...}` embeds the value of an expression.
void run(String s = "José") {
console.print($|For the string {s.quoted()}:
| Character length: {s.size}
| UTF-8 byte length: {s.calcUtf8Length()}
);
}
}
</syntaxhighlight>
 
{{out}}
<pre>
For the string "José":
Character length: 4
UTF-8 byte length: 5
</pre>
 
=={{header|Elena}}==
Line 1,307 ⟶ 1,386:
(string-width str)))
;; => (6 18 4) ;; in emacs 23 up</syntaxhighlight>
 
=={{header|EMal}}==
<syntaxhighlight lang="emal">
text moose = "møøse"
text unicode = "𝔘𝔫𝔦𝔠𝔬𝔡𝔢"
text jose = "J" + 0U0332 + "o" + 0U0332 + "s" + 0U0332 + "e" + 0U0301 + 0U0332
text emoji = "𠇰😈🎶🔥é-"
</syntaxhighlight>
===Byte Length===
<syntaxhighlight lang="emal">
writeLine((blob!moose).length)
writeLine((blob!unicode).length)
writeLine((blob!jose).length)
writeLine((blob!emoji).length)
</syntaxhighlight>
{{out}}
<pre>
7
28
14
19
</pre>
===Character Length===
<syntaxhighlight lang="emal">
writeLine(moose.codePointsLength)
writeLine(unicode.codePointsLength)
writeLine(jose.codePointsLength)
writeLine(emoji.codePointsLength)
</syntaxhighlight>
{{out}}
<pre>
5
7
9
6
</pre>
===Grapheme Length===
<syntaxhighlight lang="emal">
writeLine(moose.graphemesLength)
writeLine(unicode.graphemesLength)
writeLine(jose.graphemesLength)
writeLine(emoji.graphemesLength)
</syntaxhighlight>
{{out}}
<pre>
5
7
4
6
</pre>
 
=={{header|Erlang}}==
Line 1,505 ⟶ 1,634:
j := "J̲o̲s̲é̲"
fmt.Printf("%d %s % x\n", len(m), m, m)
fmt.Printf("%d %s % x\n", len(u), u, u)
fmt.Printf("%d %s % x\n", len(j), j, j)
}</syntaxhighlight>
Output:
<pre>
7 møøse  6d c3 b8 c3 b8 73 65
28 𝔘𝔫𝔦𝔠𝔬𝔡𝔢 f0 9d 94 98 f0 9d 94 ab f0 9d 94 a6 f0 9d 94 a0 f0 9d 94 ac f0 9d 94 a1 f0 9d 94 a2
13 J̲o̲s̲é̲ 4a cc b2 6f cc b2 73 cc b2 c3 a9 cc b2
</pre>
 
====Character Length====
<syntaxhighlight lang="go">package main
Line 1,578 ⟶ 1,708:
Calculating "Byte-length" (by which one typically means "in-memory storage size in bytes") is not possible through the facilities of the Groovy language alone. Calculating "Character length" is built into the Groovy extensions to java.lang.String.
===Character Length===
<syntaxhighlight lang="groovy">
println "Hello World!".size()
println "møøse".size()
println "𝔘𝔫𝔦𝔠𝔬𝔡𝔢".size()
println "J̲o̲s̲é̲".size()
</syntaxhighlight>
 
Output:
<pre>
12
5
14
8
</pre>
 
Note: The Java "String.length()" method also works in Groovy, but "size()" is consistent with usage in other sequential or composite types.
Line 1,658 ⟶ 1,798:
5</syntaxhighlight>
Here we have used 16 bit wide character literals. See also the dictionary page for [http://www.jsoftware.com/help/dictionary/duco.htm u:].
 
=={{header|Jakt}}==
===Character Length===
<syntaxhighlight lang="jakt">
// Returns the number of code points in `string`, counted by walking the
// iterator returned by `string.code_points()`.
fn character_length(string: String) -> i64 {
mut length = 0
// The yielded code point itself is not needed; only the number of
// iterations matters.
for _ in string.code_points() {
length++
}
return length
}
 
// Prints each sample string in double quotes, followed by the value
// character_length() returns for it.
fn main() {
for string in [
"Hello world!"
"møøse"
"𝔘𝔫𝔦𝔠𝔬𝔡𝔢"
"J̲o̲s̲é̲"
] {
println("\"{}\" {}", string, character_length(string))
}
}
</syntaxhighlight>
{{out}}
<pre>
"Hello world!" 12
"møøse" 5
"𝔘𝔫𝔦𝔠𝔬𝔡𝔢" 7
"J̲o̲s̲é̲" 8
</pre>
 
===Byte Length===
<syntaxhighlight lang="jakt">
// Prints each sample string in double quotes, followed by the result of
// `string.length()`. The output listed for this section shows these values
// as byte counts (e.g. 7 for "møøse").
fn main() {
for string in [
"Hello world!"
"møøse"
"𝔘𝔫𝔦𝔠𝔬𝔡𝔢"
"J̲o̲s̲é̲"
] {
println("\"{}\" {}", string, string.length())
}
}
</syntaxhighlight>
{{out}}
<pre>
"Hello world!" 12
"møøse" 7
"𝔘𝔫𝔦𝔠𝔬𝔡𝔢" 28
"J̲o̲s̲é̲" 13
</pre>
 
=={{header|Java}}==
Line 1,682 ⟶ 1,873:
int actual_length = str.codePointCount(0, str.length()); // value is 1, which is the length in characters</syntaxhighlight>
===Grapheme Length===
 
Since JDK 20<ref>https://bugs.openjdk.org/browse/JDK-8291660</ref>.
 
<syntaxhighlight lang="java">import java.text.BreakIterator;
 
Line 1,709 ⟶ 1,903:
 
=={{header|JavaScript}}==
 
===Byte length===
JavaScript encodes strings in UTF-16, which represents each character with one or two 16-bit values. The length property of string objects gives the number of 16-bit values used to encode a string, so the number of bytes can be determined by doubling that number.
 
<syntaxhighlight lang="javascript">
var s = "Hello, world!";
var byteCount = s.length * 2; // 26
</syntaxhighlight>
 
It's easier to use Buffer.byteLength (Node.JS specific, not ECMAScript).
 
<syntaxhighlight lang="javascript">
a = '👩‍❤️‍👩'
Buffer.byteLength(a, 'utf16le'); // 16
Buffer.byteLength(a, 'utf8'); // 20
Buffer.byteLength(s, 'utf16le'); // 26
Buffer.byteLength(s, 'utf8'); // 13
</syntaxhighlight>
 
In pure ECMAScript, TextEncoder() can be used to return the UTF-8 byte size:
 
<syntaxhighlight lang="javascript">
(new TextEncoder().encode(a)).length; // 20
(new TextEncoder().encode(s)).length; // 13
</syntaxhighlight>
 
=== Unicode codepoint length ===
 
JavaScript encodes strings in UTF-16, which represents each character with one or two 16-bit values. The most commonly used characters are represented by one 16-bit value, while rarer ones like some mathematical symbols are represented by two.
 
JavaScript strings have no built-in property that reports how many characters they contain. However, if the string only contains commonly used characters, the number of characters will be equal to the number of 16-bit values used to represent the characters.
 
<syntaxhighlight lang="javascript">
var str1 = "Hello, world!";
var len1 = str1.length; // 13
 
var str2 = "\uD834\uDD2A"; // U+1D12A represented by a UTF-16 surrogate pair
var len2 = str2.length; // 2
</syntaxhighlight>
 
More generally, the expansion operator in an array can be used to enumerate Unicode code points:
 
<syntaxhighlight lang="javascript">
[...str2].length // 1
</syntaxhighlight>
 
=== Unicode grapheme length ===
 
Counting Unicode codepoints when using combining characters such as joining sequences or diacritics will return the wrong size, so we must count graphemes instead. Intl.Segmenter() default granularity is grapheme.
 
<syntaxhighlight lang="javascript">
[...new Intl.Segmenter().segment(a)].length; // 1
</syntaxhighlight>

===ES6 destructuring/iterators===
 
ES6 provides several ways to get a string split into an array of code points instead of UTF-16 code units:
<syntaxhighlight lang="javascript">let
Line 1,747 ⟶ 1,982:
}
</syntaxhighlight>
 
=={{header|Joy}}==
;Byte length
<syntaxhighlight lang="joy">"Café" size.</syntaxhighlight>
{{out}}
<pre>5</pre>
 
=={{header|jq}}==
Line 1,770 ⟶ 2,011:
 
=={{header|Julia}}==
 
Julia encodes strings as UTF-8, so the byte length (via <code>sizeof</code>) will be different from the string length (via <code>length</code>) only if the string contains non-ASCII characters.
 
===Byte Length===
 
<syntaxhighlight lang="julia">
sizeof("Hello, world!") # gives 13
sizeof("Hellö, wørld!") # gives 15
sizeof("møøse") # 7
sizeof("𝔘𝔫𝔦𝔠𝔬𝔡𝔢") # 28
sizeof("J̲o̲s̲é̲") # 13
</syntaxhighlight>
 
===Character Length===
 
<syntaxhighlight lang="julia">
length("Hello, world!") # gives 13
length("Hellö, wørld!") # gives 13
length("møøse") # 5
length("𝔘𝔫𝔦𝔠𝔬𝔡𝔢") # 7
length("J̲o̲s̲é̲") # 8
</syntaxhighlight>
 
===Grapheme Length===
 
<syntaxhighlight lang="julia">
import Unicode
length(Unicode.graphemes("møøse")) # 5
length(Unicode.graphemes("𝔘𝔫𝔦𝔠𝔬𝔡𝔢")) # 7
length(Unicode.graphemes("J̲o̲s̲é̲")) # 4
</syntaxhighlight>
 
=={{header|K}}==
Line 1,793 ⟶ 2,052:
 
As each UTF-16 character occupies 2 bytes, it follows that the number of bytes occupied by the string will be twice the length:
<syntaxhighlight lang="kotlin">// version 1.0.6
fun main(args: Array<String>) {
val s = "José"
println("The char length is ${s.length}")
println("The byte length is ${Char.SIZE_BYTES * s.length}")
}</syntaxhighlight>
 
Line 1,947 ⟶ 2,206:
 
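Lua strings are sequences of bytes, so the length operator <code>#</code> and <code>string.len</code> count bytes. The byte length and the character length are therefore the same only when every character is encoded in a single byte (e.g. ASCII).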
 
===Byte Length===
 
Byte length in UTF-8:
 
<syntaxhighlight lang="lua">str = "Hello world"
length = #str</syntaxhighlight>
Line 1,957 ⟶ 2,220:
 
===Character Length===
 
Only valid for ASCII:
 
<syntaxhighlight lang="lua">str = "Hello world"
length = #str</syntaxhighlight>
Line 1,964 ⟶ 2,230:
<syntaxhighlight lang="lua">str = "Hello world"
length = string.len(str)</syntaxhighlight>
 
For Unicode strings, use the <code>utf8</code> module (available since Lua 5.3):
 
<syntaxhighlight lang="lua">
utf8.len("møøse")
utf8.len("𝔘𝔫𝔦𝔠𝔬𝔡𝔢")
utf8.len("J̲o̲s̲é̲")
</syntaxhighlight>
 
{{out}}
 
<pre>
5
7
8
</pre>
 
=={{header|M2000 Interpreter}}==
<syntaxhighlight lang="m2000 interpreter">
module String_length {
A$=format$("J\u0332o\u0332s\u0332e\u0301\u0332")
Print Len(A$) = 9 ' true Utf-16LE
Print Len.Disp(A$) = 4 \\ display length
Buffer Clear Mem as Byte*100
\\ Write at memory at offset 0 or address Mem(0)
Return Mem, 0:=A$
Print Eval$(Mem, 0, 18)
For i=0 to 17 step 2
\\ print hex value and character
Hex Eval(Mem, i as integer), ChrCode$(Eval(Mem, i as integer))
Next i
Document B$=A$
\\ encode to utf-8 with BOM (3 bytes 0xEF,0xBB,0xBF)
Save.Doc B$, "Checklen.doc", 2
Print Filelen("Checklen.doc")=17
\\ So length is 14 bytes + 3 the BOM
Mem=Buffer("Checklen.doc")
Print len(Mem)=17 // len works for buffers too - unit byte
// version 12 can handle strings without suffix $
C=eval$(mem, 3, 14) // from 4th byte get 14 bytes in a string
Print len(C)*2=14 ' bytes // len()) for strings return double type of words (can return 0.5)
C=string$(C as utf8dec) ' decode bytes from utf8 to utf16LE
Print len(C)=9, C=A$, Len.Disp(C)=4
Print C
Report 2, C // proportional print on console - for text center justified rendering (2 - center)
}
String_length
</syntaxhighlight>
 
Line 2,179 ⟶ 2,473:
 
=={{header|Nim}}==
In Nim, <code>len</code> returns the byte length of strings, ignoring the UTF-8 encoding. When dealing with Unicode strings, the module <code>unicode</code> must be used.
<syntaxhighlight lang="nim">import strformat, unicode

var s: string = "Hello, world! ☺"

echo &"“{s}” has byte length {s.len}."
echo &"“{s}” has Unicode char length {s.runeLen}."</syntaxhighlight>
{{out}}
<pre>“Hello, world! ☺” has byte length 17.
“Hello, world! ☺” has Unicode char length 15.</pre>

===Byte Length===

<syntaxhighlight lang="nim">
echo "møøse".len # 7
echo "𝔘𝔫𝔦𝔠𝔬𝔡𝔢".len # 28
echo "J̲o̲s̲é̲".len # 13
</syntaxhighlight>

===Character Length===

<syntaxhighlight lang="nim">
import unicode
echo "møøse".runeLen # 5
echo "𝔘𝔫𝔦𝔠𝔬𝔡𝔢".runeLen # 7
echo "J̲o̲s̲é̲".runeLen # 8
</syntaxhighlight>
 
===Grapheme Length===
 
[https://nim-lang.org/docs/unicode.html#graphemeLen%2Cstring%2CNatural graphemeLen()] does not do what its name suggests. It does not return the number of graphemes in a string; instead it returns the number of bytes belonging to the character at a given byte index of the string, including any combining characters that follow it.
 
=={{header|Oberon-2}}==
Line 2,301 ⟶ 2,605:
7
5
</pre>
 
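Alternatively, you can use the standard <code>Uchar</code> and <code>String</code> modules to do it without additional libraries (the UTF-8 decoding functions used here, <code>String.get_utf_8_uchar</code> and <code>Uchar.utf_decode_length</code>, require OCaml 4.14 or later).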
<syntaxhighlight lang="OCaml">
(* [utf8_length s] is the number of UTF-8 decode steps needed to consume [s]
   from its first byte to its last, i.e. its length in code points when [s]
   is valid UTF-8. Each step advances by the byte length reported for the
   decode at the current position. *)
let utf8_length (s : String.t) =
  let total_bytes = String.length s in
  let rec walk seen pos =
    if pos <> total_bytes then
      let decoded = String.get_utf_8_uchar s pos in
      walk (seen + 1) (pos + Uchar.utf_decode_length decoded)
    else
      seen
  in
  walk 0 0
;;
</syntaxhighlight>
 
<pre>
# utf8_length "møøse"
- : int = 5
</pre>
 
Line 2,482 ⟶ 2,806:
ucs4length := LENGTH4(string);
END;</syntaxhighlight>
 
=={{header|Plain English}}==
===Byte Length===
{{libheader|Plain English-output}}
Plain English does not handle Unicode, so strings return their length in bytes.
<syntaxhighlight lang="text">
To run:
Start up.
Put "møøse" into a string.
Write the string's length to the output.
Wait for the escape key.
Shut down.
</syntaxhighlight>
 
=={{header|Pop11}}==
Line 2,769 ⟶ 3,106:
 
Unfortunately, only character length can be retrieved in this language.
 
=={{header|RPL}}==
RPL strings are all made of 8-bit characters.
"RPL" SIZE
 
=={{header|Ruby}}==
Line 2,905 ⟶ 3,246:
Text is read from standard input e.g. <code>echo "string" | sed -f script.sed</code> or <code>sed -f script.sed file.txt</code> (The solution given would be the contents of a text file <code>script.sed</code> in these cases).
For files with more than one line, sed will give a count for each line.
<syntaxhighlight lang="sed"># create unary numeral (i = 1)
s/./i/g
:loop
# divide by 10 (x = 10)
s/i\{10\}/x/g
# convert remainder to decimal digit
/i/!s/[0-9]*$/0&/
s/i\{9\}/9/
s/i\{8\}/8/
s/i\{7\}/7/
s/i\{6\}/6/
s/iiiii/5/
s/iiii/4/
s/iii/3/
s/ii/2/
s/i/1/
# convert quotient (10s) to 1s
y/x/i/
# start over for the next magnitude (if any)
/i/b loop</syntaxhighlight>

The 'convert to digits' section of the following alternative is based off of [http://unix.stackexchange.com/a/36959/11750 this StackExchange answer].
<syntaxhighlight lang="sed"># Change all characters to '|'.
s/./\|/g;

# Convert to digits
:convert
s/||||||||||/</g
s/<\([0-9]*\)$/<0\1/g
s/|||||||||/9/g;
s/|||||||||/9/g; s/||||||||/8/g; s/|||||||/7/g; s/||||||/6/g;
s/|||||/5/g; s/||||/4/g; s/|||/3/g; s/||/2/g; s/|/1/g;
s/</|/g
t convert
s/^$/0/</syntaxhighlight>
 
=={{header|Seed7}}==
Line 3,366 ⟶ 3,713:
di ustrlen(s)
47</syntaxhighlight>
 
=={{header|Stringle}}==
The only current implementation of Stringle uses 8-bit character sets, meaning character and byte length is always the same.
 
This prints the length of a string from input:
 
<syntaxhighlight lang="stringle">$ #$</syntaxhighlight>
 
=={{header|Swift}}==
Line 3,476 ⟶ 3,830:
 
=={{header|UNIX Shell}}==
====Byte length via external utility:====
 
{{works with|Bourne Shell}}
Line 3,485 ⟶ 3,838:
 
====With [[Unix|SUSv3]] parameter expansion modifier:====
 
This returns the byte count in ash/dash, but the character count in bash, ksh, and zsh:
 
{{works with|Almquist SHell}}
{{works with|Bourne Again SHell|3.2}}
{{works with|Korn Shell|93}}
{{works with|Z SHell}}
<syntaxhighlight lang="bash">string='Hello, world!'
length="${#string}"
echo $length # if you want it printed to the terminal</syntaxhighlight>
 
Line 3,704 ⟶ 4,059:
=={{header|Wren}}==
===Byte Length===
<syntaxhighlight lang="wren">System.print("møøse".bytes.count)
System.print("𝔘𝔫𝔦𝔠𝔬𝔡𝔢".bytes.count)
System.print("J̲o̲s̲é̲".bytes.count)</syntaxhighlight>
Line 3,716 ⟶ 4,071:
 
===Character Length===
<syntaxhighlight lang="wren">System.print("møøse".count)
System.print("𝔘𝔫𝔦𝔠𝔬𝔡𝔢".count)
System.print("J̲o̲s̲é̲".count)</syntaxhighlight>
Line 3,725 ⟶ 4,080:
7
8
</pre>
 
===Grapheme Length===
{{libheader|Wren-upc}}
<syntaxhighlight lang="wren">import "./upc" for Graphemes
 
System.print(Graphemes.clusterCount("møøse"))
System.print(Graphemes.clusterCount("𝔘𝔫𝔦𝔠𝔬𝔡𝔢"))
System.print(Graphemes.clusterCount("J̲o̲s̲é̲"))</syntaxhighlight>
 
{{out}}
<pre>
5
7
4
</pre>
 
Line 3,869 ⟶ 4,239:
const string4: []const u8 = "J\u{332}o\u{332}s\u{332}e\u{301}\u{332}";
try printResults(arena, string4);
// utf8 codepoints = 13, bytes = 13
// utf16 codepoints = 13, bytes = 26
// utf8 codepoints = 5, bytes = 7
// utf16 codepoints = 5, bytes = 10
// utf8 codepoints = 7, bytes = 28
// utf16 codepoints = 7, bytes = 28
// utf8 codepoints = 9, bytes = 14
// utf16 codepoints = 9, bytes = 18
}</syntaxhighlight>
 
{{out}}
 
<pre>
utf8 codepoints = 13, bytes = 13
utf16 codepoints = 13, bytes = 26
utf8 codepoints = 5, bytes = 7
utf16 codepoints = 5, bytes = 10
utf8 codepoints = 7, bytes = 28
utf16 codepoints = 7, bytes = 28
utf8 codepoints = 9, bytes = 14
utf16 codepoints = 9, bytes = 18
</pre>
162

edits