String length: Difference between revisions

From Rosetta Code
Content added Content deleted
(Added solution for Action!)
(Add Ecstasy example)
 
(40 intermediate revisions by 25 users not shown)
Line 30: Line 30:
Assembler 360 uses EBCDIC coding, so one character is one byte.
Assembler 360 uses EBCDIC coding, so one character is one byte.
The L' attribute can be seen as the length function for assembler 360.
The L' attribute can be seen as the length function for assembler 360.
<lang 360asm>* String length 06/07/2016
<syntaxhighlight lang="360asm">* String length 06/07/2016
LEN CSECT
LEN CSECT
USING LEN,15 base register
USING LEN,15 base register
Line 54: Line 54:
D DS D double word 8
D DS D double word 8
PG DS CL12 string 12
PG DS CL12 string 12
END LEN</lang>
END LEN</syntaxhighlight>
{{out}}
{{out}}
<pre>
<pre>
Line 67: Line 67:
{{trans|Z80 Assembly}}
{{trans|Z80 Assembly}}
Most 6502-based computers predate Unicode, so only byte length will be demonstrated for now.
Most 6502-based computers predate Unicode, so only byte length will be demonstrated for now.
<lang 6502asm>GetStringLength: ;$00 and $01 make up the pointer to the string's base address.
<syntaxhighlight lang="6502asm">GetStringLength: ;$00 and $01 make up the pointer to the string's base address.
;(Of course, any two consecutive zero-page memory locations can fulfill this role.)
;(Of course, any two consecutive zero-page memory locations can fulfill this role.)
LDY #0 ;Y is both the index into the string and the length counter.
LDY #0 ;Y is both the index into the string and the length counter.
Line 78: Line 78:


exit:
exit:
RTS ;string length is now loaded into Y.</lang>
RTS ;string length is now loaded into Y.</syntaxhighlight>


=={{header|68000 Assembly}}==
=={{header|68000 Assembly}}==
===Byte Length (ASCII)===
===Byte Length (ASCII)===
<lang 68000devpac>GetStringLength:
<syntaxhighlight lang="68000devpac">GetStringLength:
; INPUT: A3 = BASE ADDRESS OF STRING
; INPUT: A3 = BASE ADDRESS OF STRING
; RETURNS LENGTH IN D1 (MEASURED IN BYTES)
; RETURNS LENGTH IN D1 (MEASURED IN BYTES)
Line 96: Line 96:


done:
done:
RTS</lang>
RTS</syntaxhighlight>


=={{header|8086 Assembly}}==
=={{header|8086 Assembly}}==
{{trans|68000 Assembly}}
{{trans|68000 Assembly}}
===Byte Length===
===Byte Length===
<lang asm>;INPUT: DS:SI = BASE ADDR. OF STRING
<syntaxhighlight lang="asm">;INPUT: DS:SI = BASE ADDR. OF STRING
;TYPICALLY, MS-DOS USES $ TO TERMINATE STRINGS.
;TYPICALLY, MS-DOS USES $ TO TERMINATE STRINGS.
GetStringLength:
GetStringLength:
Line 115: Line 115:


done:
done:
ret</lang>
ret</syntaxhighlight>


=={{header|4D}}==
=={{header|4D}}==
===Byte Length===
===Byte Length===
<lang 4d>$length:=Length("Hello, world!")</lang>
<syntaxhighlight lang="4d">$length:=Length("Hello, world!")</syntaxhighlight>


=={{header|AArch64 Assembly}}==
=={{header|AArch64 Assembly}}==
{{works with|as|Raspberry Pi 3B version Buster 64 bits}}
{{works with|as|Raspberry Pi 3B version Buster 64 bits}}
<syntaxhighlight lang="aarch64 assembly">
<lang AArch64 Assembly>
/* ARM assembly AARCH64 Raspberry PI 3B */
/* ARM assembly AARCH64 Raspberry PI 3B */
/* program stringLength64.s */
/* program stringLength64.s */
Line 222: Line 222:
/* for this file see task include a file in language AArch64 assembly */
/* for this file see task include a file in language AArch64 assembly */
.include "../includeARM64.inc"
.include "../includeARM64.inc"
</syntaxhighlight>
</lang>
=={{header|Action!}}==
=={{header|Action!}}==
<lang Action!>PROC Test(CHAR ARRAY s)
<syntaxhighlight lang="action!">PROC Test(CHAR ARRAY s)
PrintF("Length of ""%S"" is %B%E",s,s(0))
PrintF("Length of ""%S"" is %B%E",s,s(0))
RETURN
RETURN
Line 231: Line 231:
Test("Hello world!")
Test("Hello world!")
Test("")
Test("")
RETURN</lang>
RETURN</syntaxhighlight>
{{out}}
{{out}}
[https://gitlab.com/amarok8bit/action-rosetta-code/-/raw/master/images/String_length.png Screenshot from Atari 8-bit computer]
[https://gitlab.com/amarok8bit/action-rosetta-code/-/raw/master/images/String_length.png Screenshot from Atari 8-bit computer]
Line 242: Line 242:
===Byte length===
===Byte length===
This uses UTF-8 encoding. For other encodings, the ByteArray's <code>writeMultiByte()</code> method can be used.
This uses UTF-8 encoding. For other encodings, the ByteArray's <code>writeMultiByte()</code> method can be used.
<syntaxhighlight lang="actionscript">
<lang ActionScript>
package {
package {
Line 277: Line 277:
}
}
</syntaxhighlight>
</lang>


===Character Length===
===Character Length===
<lang actionscript>
<syntaxhighlight lang="actionscript">
var s1:String = "The quick brown fox jumps over the lazy dog";
var s1:String = "The quick brown fox jumps over the lazy dog";
var s2:String = "𝔘𝔫𝔦𝔠𝔬𝔡𝔢";
var s2:String = "𝔘𝔫𝔦𝔠𝔬𝔡𝔢";
var s3:String = "José";
var s3:String = "José";
trace(s1.length, s2.length, s3.length); // 43, 14, 4
trace(s1.length, s2.length, s3.length); // 43, 14, 4
</syntaxhighlight>
</lang>


=={{header|Ada}}==
=={{header|Ada}}==
{{works with|GCC|4.1.2}}
{{works with|GCC|4.1.2}}
===Byte Length===
===Byte Length===
<lang ada>Str : String := "Hello World";
<syntaxhighlight lang="ada">Str : String := "Hello World";
Length : constant Natural := Str'Size / 8;</lang>
Length : constant Natural := Str'Size / 8;</syntaxhighlight>
The 'Size attribute returns the size of an object in bits. Provided that under "byte" one understands an octet of bits, the length in "bytes" will be 'Size divided by 8. Note that this is not necessarily the machine storage unit. In order to make the program portable, System.Storage_Unit should be used instead of the "magic number" 8. System.Storage_Unit yields the number of bits in a storage unit on the current machine. Further, the length of a string object is not the length of what the string contains in whatever measurement units. String as an object may have a "dope" to keep the array bounds. In fact the object length can even be 0, if the compiler optimized the object away. So in most cases "byte length" makes no sense in Ada.
The 'Size attribute returns the size of an object in bits. Provided that under "byte" one understands an octet of bits, the length in "bytes" will be 'Size divided by 8. Note that this is not necessarily the machine storage unit. In order to make the program portable, System.Storage_Unit should be used instead of the "magic number" 8. System.Storage_Unit yields the number of bits in a storage unit on the current machine. Further, the length of a string object is not the length of what the string contains in whatever measurement units. String as an object may have a "dope" to keep the array bounds. In fact the object length can even be 0, if the compiler optimized the object away. So in most cases "byte length" makes no sense in Ada.


===Character Length===
===Character Length===
<lang ada>Latin_1_Str : String := "Hello World";
<syntaxhighlight lang="ada">Latin_1_Str : String := "Hello World";
UCS_16_Str : Wide_String := "Hello World";
UCS_16_Str : Wide_String := "Hello World";
Unicode_Str : Wide_Wide_String := "Hello World";
Unicode_Str : Wide_Wide_String := "Hello World";
Latin_1_Length : constant Natural := Latin_1_Str'Length;
Latin_1_Length : constant Natural := Latin_1_Str'Length;
UCS_16_Length : constant Natural := UCS_16_Str'Length;
UCS_16_Length : constant Natural := UCS_16_Str'Length;
Unicode_Length : constant Natural := Unicode_Str'Length;</lang>
Unicode_Length : constant Natural := Unicode_Str'Length;</syntaxhighlight>
The attribute 'Length yields the number of elements of an [[array]]. Since strings in Ada are arrays of characters, 'Length is the string length. Ada supports strings of [[Latin-1]], [[UCS-16]] and full [[Unicode]] characters. In the example above character length of all three strings is 11. The length of the objects in bits will differ.
The attribute 'Length yields the number of elements of an [[array]]. Since strings in Ada are arrays of characters, 'Length is the string length. Ada supports strings of [[Latin-1]], [[UCS-16]] and full [[Unicode]] characters. In the example above character length of all three strings is 11. The length of the objects in bits will differ.


=={{header|Aime}}==
=={{header|Aime}}==
===Byte Length===
===Byte Length===
<lang aime>length("Hello, World!")</lang>
<syntaxhighlight lang="aime">length("Hello, World!")</syntaxhighlight>
or
or
<lang aime>~"Hello, World!"</lang>
<syntaxhighlight lang="aime">~"Hello, World!"</syntaxhighlight>


=={{header|ALGOL 68}}==
=={{header|ALGOL 68}}==
===Bits and Bytes Length===
===Bits and Bytes Length===
<lang algol68>BITS bits := bits pack((TRUE, TRUE, FALSE, FALSE)); # packed array of BOOL #
<syntaxhighlight lang="algol68">BITS bits := bits pack((TRUE, TRUE, FALSE, FALSE)); # packed array of BOOL #
BYTES bytes := bytes pack("Hello, world"); # packed array of CHAR #
BYTES bytes := bytes pack("Hello, world"); # packed array of CHAR #
print((
print((
Line 317: Line 317:
"bits width:", bits width, ", max bits: ", max bits, ", bits:", bits, new line,
"bits width:", bits width, ", max bits: ", max bits, ", bits:", bits, new line,
"bytes width: ",bytes width, ", UPB:",UPB STRING(bytes), ", string:", STRING(bytes),"!", new line
"bytes width: ",bytes width, ", UPB:",UPB STRING(bytes), ", string:", STRING(bytes),"!", new line
))</lang>
))</syntaxhighlight>
Output:
Output:
<pre>
<pre>
Line 325: Line 325:
</pre>
</pre>
===Character Length===
===Character Length===
<lang algol68>STRING str := "hello, world";
<syntaxhighlight lang="algol68">STRING str := "hello, world";
INT length := UPB str;
INT length := UPB str;
printf(($"Length of """g""" is "g(3)l$,str,length));
printf(($"Length of """g""" is "g(3)l$,str,length));
Line 331: Line 331:
printf(($l"STRINGS can start at -1, in which case LWB must be used:"l$));
printf(($l"STRINGS can start at -1, in which case LWB must be used:"l$));
STRING s := "abcd"[@-1];
STRING s := "abcd"[@-1];
print(("s:",s, ", LWB:", LWB s, ", UPB:",UPB s, ", LEN:",UPB s - LWB s + 1))</lang>
print(("s:",s, ", LWB:", LWB s, ", UPB:",UPB s, ", LEN:",UPB s - LWB s + 1))</syntaxhighlight>
Output:
Output:
<pre>
<pre>
Line 340: Line 340:


=={{header|Apex}}==
=={{header|Apex}}==
<syntaxhighlight lang="apex">
<lang Apex>
String myString = 'abcd';
String myString = 'abcd';
System.debug('Size of String', myString.length());
System.debug('Size of String', myString.length());
</syntaxhighlight>
</lang>


=={{header|AppleScript}}==
=={{header|AppleScript}}==
===Byte Length===
===Byte Length===
<lang applescript>count of "Hello World"</lang>
<syntaxhighlight lang="applescript">count of "Hello World"</syntaxhighlight>
Mac OS X 10.5 (Leopard) includes AppleScript 2.0 which uses only Unicode (UTF-16) character strings.
Mac OS X 10.5 (Leopard) includes AppleScript 2.0 which uses only Unicode (UTF-16) character strings.
This example has been tested on OSX 10.8.5. Added a combining char for testing.
This example has been tested on OSX 10.8.5. Added a combining char for testing.
<lang applescript>
<syntaxhighlight lang="applescript">
set inString to "Hello é̦世界"
set inString to "Hello é̦世界"
set byteCount to 0
set byteCount to 0
Line 382: Line 382:
return 1
return 1
end if
end if
end doit</lang>
end doit</syntaxhighlight>


===Character Length===
===Character Length===
<lang applescript>count of "Hello World"</lang>
<syntaxhighlight lang="applescript">count of "Hello World"</syntaxhighlight>
Or:
Or:
<lang applescript>count "Hello World"</lang>
<syntaxhighlight lang="applescript">count "Hello World"</syntaxhighlight>


=={{header|Applesoft BASIC}}==
=={{header|Applesoft BASIC}}==
<lang ApplesoftBASIC>? LEN("HELLO, WORLD!")</lang>
<syntaxhighlight lang="applesoftbasic">? LEN("HELLO, WORLD!")</syntaxhighlight>
=={{header|ARM Assembly}}==
=={{header|ARM Assembly}}==
{{works with|as|Raspberry Pi}}
{{works with|as|Raspberry Pi}}
<syntaxhighlight lang="arm assembly">
<lang ARM Assembly>
/* ARM assembly Raspberry PI */
/* ARM assembly Raspberry PI */
/* program stringLength.s */
/* program stringLength.s */
Line 496: Line 496:
/***************************************************/
/***************************************************/
.include "../affichage.inc"
.include "../affichage.inc"
</syntaxhighlight>
</lang>
<pre>
<pre>
møøse€
møøse€
Line 506: Line 506:
===Character Length===
===Character Length===


<lang rebol>str: "Hello World"
<syntaxhighlight lang="rebol">str: "Hello World"


print ["length =" size str]</lang>
print ["length =" size str]</syntaxhighlight>


{{out}}
{{out}}
Line 516: Line 516:
=={{header|AutoHotkey}}==
=={{header|AutoHotkey}}==
===Character Length===
===Character Length===
<lang AutoHotkey>Msgbox % StrLen("Hello World")</lang>
<syntaxhighlight lang="autohotkey">Msgbox % StrLen("Hello World")</syntaxhighlight>
Or:
Or:
<lang AutoHotkey>String := "Hello World"
<syntaxhighlight lang="autohotkey">String := "Hello World"
StringLen, Length, String
StringLen, Length, String
Msgbox % Length</lang>
Msgbox % Length</syntaxhighlight>


=={{header|Avail}}==
=={{header|Avail}}==
===Character Length===
===Character Length===
Avail represents strings as a tuple of characters, with each character representing a single code point.
Avail represents strings as a tuple of characters, with each character representing a single code point.
<lang Avail>|"møøse"|</lang>
<syntaxhighlight lang="avail">|"møøse"|</syntaxhighlight>
===Byte Length===
===Byte Length===
A UTF-8 byte length can be acquired with the standard library's UTF-8 encoder.
A UTF-8 byte length can be acquired with the standard library's UTF-8 encoder.
<lang Avail>nonBMPString ::= "𝔘𝔫𝔦𝔠𝔬𝔡𝔢";
<syntaxhighlight lang="avail">nonBMPString ::= "𝔘𝔫𝔦𝔠𝔬𝔡𝔢";
encoder ::= a UTF8 encoder;
encoder ::= a UTF8 encoder;
bytes ::= encoder process nonBMPString;
bytes ::= encoder process nonBMPString;
Line 534: Line 534:


// or, as a one-liner
// or, as a one-liner
|a UTF8 encoder process "𝔘𝔫𝔦𝔠𝔬𝔡𝔢"|</lang>
|a UTF8 encoder process "𝔘𝔫𝔦𝔠𝔬𝔡𝔢"|</syntaxhighlight>


=={{header|AWK}}==
=={{header|AWK}}==
===Byte Length===
===Byte Length===
From within any code block:
From within any code block:
<lang awk>w=length("Hello, world!") # static string example
<syntaxhighlight lang="awk">w=length("Hello, world!") # static string example
x=length("Hello," s " world!") # dynamic string example
x=length("Hello," s " world!") # dynamic string example
y=length($1) # input field example
y=length($1) # input field example
z=length(s) # variable name example</lang>
z=length(s) # variable name example</syntaxhighlight>
Ad hoc program from command line:
Ad hoc program from command line:
<pre> echo "Hello, wørld!" | awk '{print length($0)}' # 14</pre>
<pre> echo "Hello, wørld!" | awk '{print length($0)}' # 14</pre>
From executable script: (prints for every line arriving on stdin)
From executable script: (prints for every line arriving on stdin)
<lang awk>#!/usr/bin/awk -f
<syntaxhighlight lang="awk">#!/usr/bin/awk -f
{print"The length of this line is "length($0)}</lang>
{print"The length of this line is "length($0)}</syntaxhighlight>


=={{header|Axe}}==
=={{header|Axe}}==
Line 553: Line 553:


===Byte Length===
===Byte Length===
<lang axe>"HELLO, WORLD"→Str1
<syntaxhighlight lang="axe">"HELLO, WORLD"→Str1
Disp length(Str1)▶Dec,i</lang>
Disp length(Str1)▶Dec,i</syntaxhighlight>


=={{header|BaCon}}==
=={{header|BaCon}}==
BaCon has full native support for UTF-8 encoding.
BaCon has full native support for UTF-8 encoding.
<lang qbasic>PRINT "Bytelen of 'hello': ", LEN("hello")
<syntaxhighlight lang="qbasic">PRINT "Bytelen of 'hello': ", LEN("hello")
PRINT "Charlen of 'hello': ", ULEN("hello")
PRINT "Charlen of 'hello': ", ULEN("hello")


Line 565: Line 565:


PRINT "Bytelen of '𝔘𝔫𝔦𝔠𝔬𝔡𝔢': ", LEN("𝔘𝔫𝔦𝔠𝔬𝔡𝔢")
PRINT "Bytelen of '𝔘𝔫𝔦𝔠𝔬𝔡𝔢': ", LEN("𝔘𝔫𝔦𝔠𝔬𝔡𝔢")
PRINT "Charlen of '𝔘𝔫𝔦𝔠𝔬𝔡𝔢': ", ULEN("𝔘𝔫𝔦𝔠𝔬𝔡𝔢")</lang>
PRINT "Charlen of '𝔘𝔫𝔦𝔠𝔬𝔡𝔢': ", ULEN("𝔘𝔫𝔦𝔠𝔬𝔡𝔢")</syntaxhighlight>
{{out}}
{{out}}
<pre>
<pre>
Line 579: Line 579:
===Character Length===
===Character Length===
{{works with|QBasic}}
{{works with|QBasic}}

{{works with|Liberty BASIC}}
{{works with|Liberty BASIC}}

{{works with|PowerBASIC|PB/CC, PB/DOS}}
{{works with|PowerBASIC|PB/CC, PB/DOS}}


BASIC only supports single-byte characters. The character "ø" is converted to "°" for printing to the console and length functions, but will still output to a file as "ø".
BASIC only supports single-byte characters. The character "ø" is converted to "°" for printing to the console and length functions, but will still output to a file as "ø".
<lang qbasic> INPUT a$
<syntaxhighlight lang="qbasic"> INPUT a$
PRINT LEN(a$)</lang>
PRINT LEN(a$)</syntaxhighlight>

==={{header|ANSI BASIC}}===
The ANSI BASIC needs line numbers.
<syntaxhighlight lang="basic">
10 INPUT A$
20 PRINT LEN(A$)
</syntaxhighlight>

==={{header|Applesoft BASIC}}===
The [[#GW-BASIC|GW-BASIC]] solution works without any changes.

==={{header|BASIC256}}===
The [[#GW-BASIC|GW-BASIC]] solution works without any changes.

==={{header|Chipmunk Basic}}===
The [[#GW-BASIC|GW-BASIC]] solution works without any changes.

==={{header|MSX Basic}}===
{{works with|MSX BASIC|any}}
The [[#GW-BASIC|GW-BASIC]] solution works without any changes.

==={{header|Quite BASIC}}===
The [[#GW-BASIC|GW-BASIC]] solution works without any changes.

==={{header|True BASIC}}===
The [[#GW-BASIC|GW-BASIC]] solution works without any changes.

==={{header|Yabasic}}===
The [[#GW-BASIC|GW-BASIC]] solution works without any changes.


==={{header|ZX Spectrum Basic}}===
==={{header|ZX Spectrum Basic}}===
The ZX Spectrum needs line numbers:
The ZX Spectrum needs line numbers:


<lang zxbasic>10 INPUT a$
<syntaxhighlight lang="zxbasic">10 INPUT a$
20 PRINT LEN a$</lang>
20 PRINT LEN a$</syntaxhighlight>


However, it's not quite as trivial as this.
However, it's not quite as trivial as this.
Line 602: Line 629:
Stripping out all entries in the string with codes in the lower 32 will get rid of colour control codes. The character length of a token is not a simple thing to determine, so this version strips them out too by eliminating anything above CHR$ 164 (the last UDG). A 91-entry DATA list of token lengths might be the next step.
Stripping out all entries in the string with codes in the lower 32 will get rid of colour control codes. The character length of a token is not a simple thing to determine, so this version strips them out too by eliminating anything above CHR$ 164 (the last UDG). A 91-entry DATA list of token lengths might be the next step.


<lang zxbasic>10 INPUT a$
<syntaxhighlight lang="zxbasic">10 INPUT a$
20 LET b$=""
20 LET b$=""
30 FOR x=1 TO LEN a$
30 FOR x=1 TO LEN a$
Line 609: Line 636:
60 LET b$=b$+a$(k)
60 LET b$=b$+a$(k)
70 NEXT x
70 NEXT x
80 PRINT LEN b$</lang>
80 PRINT LEN b$</syntaxhighlight>


====Grapheme length====
====Grapheme length====
Line 615: Line 642:
Alternatively, the string might include control codes for backspacing and overwriting;
Alternatively, the string might include control codes for backspacing and overwriting;


<lang zxbasic>10 LET a$=CHR$ 111+CHR$ 8+CHR$ 21+CHR$ 1+CHR$ 34</lang>
<syntaxhighlight lang="zxbasic">10 LET a$=CHR$ 111+CHR$ 8+CHR$ 21+CHR$ 1+CHR$ 34</syntaxhighlight>
will produce an "o" character overprinted with a quotation mark, resulting in a "passable" impression of an umlaut. The above code will reduce this to two characters when the actual printed length is one (byte length is of course five). The other possible workaround is to print the string and calculate the character length based on the resultant change in screen position. (This will only work for a string with a character length that actually fits on the screen, so below about 670.)
will produce an "o" character overprinted with a quotation mark, resulting in a "passable" impression of an umlaut. The above code will reduce this to two characters when the actual printed length is one (byte length is of course five). The other possible workaround is to print the string and calculate the character length based on the resultant change in screen position. (This will only work for a string with a character length that actually fits on the screen, so below about 670.)


<lang zxbasic>10 INPUT a$
<syntaxhighlight lang="zxbasic">10 INPUT a$
20 CLS
20 CLS
30 PRINT a$;
30 PRINT a$;
40 LET x=PEEK 23688: LET y=PEEK 23689
40 LET x=PEEK 23688: LET y=PEEK 23689
50 PRINT CHR$ 13;33-x+32*(24-y)</lang>
50 PRINT CHR$ 13;33-x+32*(24-y)</syntaxhighlight>


==={{header|Commodore BASIC}}===
==={{header|Commodore BASIC}}===
Commodore BASIC needs line numbers too, and can't use mixed case. When in mixed case mode, everything must be in lower case letters. However, the default is UPPERCASE + graphic characters; thus everything appears as upper-case characters.
Commodore BASIC needs line numbers too, and can't use mixed case. When in mixed case mode, everything must be in lower case letters. However, the default is UPPERCASE + graphic characters; thus everything appears as upper-case characters.


<lang basic>10 INPUT A$
<syntaxhighlight lang="basic">10 INPUT A$
20 PRINT LEN(A$)</lang>
20 PRINT LEN(A$)</syntaxhighlight>


==={{header|IS-BASIC}}===
==={{header|IS-BASIC}}===
<lang IS-BASIC>100 INPUT PROMPT "String: ":TX$
<syntaxhighlight lang="is-basic">100 INPUT PROMPT "String: ":TX$
110 PRINT LEN(TX$)</lang>
110 PRINT LEN(TX$)</syntaxhighlight>


==={{header|QB64}}===
==={{header|QB64}}===
In QB64 a String variable is assumed to be UTF-8 and thus the byte length is the same as character length. That said there are methods to map UTF-16 and UTF-32 to the CP437 (ASCII) table (see, _MAPUNICODE).
In QB64 a String variable is assumed to be UTF-8 and thus the byte length is the same as character length. That said there are methods to map UTF-16 and UTF-32 to the CP437 (ASCII) table (see, _MAPUNICODE).
<lang QB64>Print Len(s$)</lang>
<syntaxhighlight lang="qb64">Print Len(s$)</syntaxhighlight>


=={{header|Batch File}}==
=={{header|Batch File}}==
===Byte Length===
===Byte Length===
<lang dos>@echo off
<syntaxhighlight lang="dos">@echo off
setlocal enabledelayedexpansion
setlocal enabledelayedexpansion
call :length %1 res
call :length %1 res
Line 656: Line 683:
set str=!str:~1!
set str=!str:~1!
set /a cnt = cnt + 1
set /a cnt = cnt + 1
goto loop</lang>
goto loop</syntaxhighlight>


=={{header|BBC BASIC}}==
=={{header|BBC BASIC}}==
===Character Length===
===Character Length===
<lang bbcbasic> INPUT text$
<syntaxhighlight lang="bbcbasic"> INPUT text$
PRINT LEN(text$)</lang>
PRINT LEN(text$)</syntaxhighlight>
===Byte Length===
===Byte Length===
{{works with|BBC BASIC for Windows}}
{{works with|BBC BASIC for Windows}}
<lang bbcbasic> CP_ACP = 0
<syntaxhighlight lang="bbcbasic"> CP_ACP = 0
CP_UTF8 = &FDE9
CP_UTF8 = &FDE9
Line 675: Line 702:
PRINT "Length in bytes (ANSI encoding) = " ; LEN(textA$)
PRINT "Length in bytes (ANSI encoding) = " ; LEN(textA$)
PRINT "Length in bytes (UTF-16 encoding) = " ; 2*(nW%-1)
PRINT "Length in bytes (UTF-16 encoding) = " ; 2*(nW%-1)
PRINT "Length in bytes (UTF-8 encoding) = " ; LEN($$!^textU$)</lang>
PRINT "Length in bytes (UTF-8 encoding) = " ; LEN($$!^textU$)</syntaxhighlight>
Output:
Output:
<pre>Length in bytes (ANSI encoding) = 5
<pre>Length in bytes (ANSI encoding) = 5
Length in bytes (UTF-16 encoding) = 10
Length in bytes (UTF-16 encoding) = 10
Length in bytes (UTF-8 encoding) = 7</pre>
Length in bytes (UTF-8 encoding) = 7</pre>

=={{header|BQN}}==
Strings are arrays of characters in BQN.
===Byte Length===
Each character is converted to its codepoint, and compared with the respective UTF boundary.
<syntaxhighlight lang="bqn">BLen ← {(≠𝕩)+´⥊𝕩≥⌜@+128‿2048‿65536}</syntaxhighlight>

===Character Length===
Character length is just array length.
<syntaxhighlight lang="bqn">Len ← ≠</syntaxhighlight>

'''Output'''
<syntaxhighlight lang="bqn">•Show >(⊢⋈⊸∾Len⋈BLen)¨⟨
"møøse"
"𝔘𝔫𝔦𝔠𝔬𝔡𝔢"
"J̲o̲s̲é̲"
⟩</syntaxhighlight>
<syntaxhighlight lang="text">┌─
╵ "møøse" 5 7
"𝔘𝔫𝔦𝔠𝔬𝔡𝔢" 7 28
"J̲o̲s̲é̲" 8 13
┘</syntaxhighlight>


=={{header|Bracmat}}==
=={{header|Bracmat}}==
The solutions work with UTF-8 encoded strings.
The solutions work with UTF-8 encoded strings.
===Byte Length===
===Byte Length===
<lang bracmat>(ByteLength=
<syntaxhighlight lang="bracmat">(ByteLength=
length
length
. @(!arg:? [?length)
. @(!arg:? [?length)
Line 690: Line 739:
);
);


out$ByteLength$𝔘𝔫𝔦𝔠𝔬𝔡𝔢</lang>
out$ByteLength$𝔘𝔫𝔦𝔠𝔬𝔡𝔢</syntaxhighlight>
Answer:
Answer:
<pre>28</pre>
<pre>28</pre>
===Character Length===
===Character Length===
<lang bracmat>(CharacterLength=
<syntaxhighlight lang="bracmat">(CharacterLength=
length c
length c
. 0:?length
. 0:?length
Line 709: Line 758:
);
);


out$CharacterLength$𝔘𝔫𝔦𝔠𝔬𝔡𝔢</lang>
out$CharacterLength$𝔘𝔫𝔦𝔠𝔬𝔡𝔢</syntaxhighlight>
Answer:
Answer:
<pre>7</pre>
<pre>7</pre>
An improved version scans the input string character wise, not byte wise. Thus many string positions that are deemed not to be possible starting positions of UTF-8 are not even tried. The patterns <code>[!p</code> and <code>[?p</code> implement a ratchet mechanism. <code>[!p</code> indicates the start of a character and <code>[?p</code> remembers the end of the character, which becomes the start position of the next byte.
An improved version scans the input string character wise, not byte wise. Thus many string positions that are deemed not to be possible starting positions of UTF-8 are not even tried. The patterns <code>[!p</code> and <code>[?p</code> implement a ratchet mechanism. <code>[!p</code> indicates the start of a character and <code>[?p</code> remembers the end of the character, which becomes the start position of the next byte.
<lang bracmat>(CharacterLength=
<syntaxhighlight lang="bracmat">(CharacterLength=
length c p
length c p
. 0:?length:?p
. 0:?length:?p
Line 726: Line 775:
)
)
| !length
| !length
);</lang>
);</syntaxhighlight>


Later versions of Bracmat have the built in function <code>vap</code> that "vaporises" a string into "atoms". If the string is UTF-8 encoded, then each "atom" is one UTF-8 character, so the length of the list of atoms is the character length of the input string. The first argument to the <code>vap</code> function is a function that will be applied to every UTF-8 encoded character in the input string. The outcomes of these function calls are the elements in the resulting list. In the solution below we choose an anonymous function <code>(=.!arg)</code> that just returns the characters themselves.
Later versions of Bracmat have the built in function <code>vap</code> that "vaporises" a string into "atoms". If the string is UTF-8 encoded, then each "atom" is one UTF-8 character, so the length of the list of atoms is the character length of the input string. The first argument to the <code>vap</code> function is a function that will be applied to every UTF-8 encoded character in the input string. The outcomes of these function calls are the elements in the resulting list. In the solution below we choose an anonymous function <code>(=.!arg)</code> that just returns the characters themselves.
<lang bracmat>(CharacterLength=
<syntaxhighlight lang="bracmat">(CharacterLength=
length
length
. vap$((=.!arg).!arg):? [?length&!length
. vap$((=.!arg).!arg):? [?length&!length
);</lang>
);</syntaxhighlight>

=={{header|Brainf***}}==
===Byte Length===
There are several limitations Brainf*** has that influence this solution:
*Brainf*** only supports 8-bit numbers in canonical implementations, so it only supports strings of length below 255.
*The rule of thumb in Brainf*** when reading a string is to always store exactly one byte per cell, no matter how many bytes a character occupies. That's why this solution is strictly a byte-length one.
*There is no way to pass anything to Brainf*** other than giving the arguments as input. That's why this program reads a string and outputs the number of bytes in it.

[https://esolangs.org/wiki/Brainfuck_algorithms#Print_value_of_cell_x_as_number_for_ANY_sized_cell_.28eg_8bit.2C_100000bit_etc.29 This algorithm from esolangs.org] is used to print the number from memory.

<syntaxhighlight lang="bf">
,----- ----- [>,----- -----] ; read a text until a newline
<[+++++ +++++<] ; restore the original text
>[[-]<[>+<-]>+>]< ; add one to the accumulator cell for every byte read
;; from esolang dot org
>[-]>[-]+>[-]+< [>[-<-<<[->+>+<<]>[-<+>]>>]++++++++++>[-]+>[-]>[-]> [-]<<<<<[->-[>+>>]>[[-<+>]+>+>>]<<<<<]>>-[-<<+>>]<[-]++++++++ [-<++++++>]>>[-<<+>>]<<] <[.[-]<]
[-]+++++ +++++. ; print newline
</syntaxhighlight>


=={{header|C}}==
=={{header|C}}==
Line 739: Line 806:


{{works with|GCC|3.3.3}}
{{works with|GCC|3.3.3}}
<lang c>#include <string.h>
<syntaxhighlight lang="c">#include <string.h>


int main(void)
int main(void)
Line 747: Line 814:
return 0;
return 0;
}</lang>
}</syntaxhighlight>
or by hand:
or by hand:


<lang c>int main(void)
<syntaxhighlight lang="c">int main(void)
{
{
const char *string = "Hello, world!";
const char *string = "Hello, world!";
Line 759: Line 826:
return 0;
return 0;
}</lang>
}</syntaxhighlight>


or (for arrays of char only)
or (for arrays of char only)


<lang c>#include <stdlib.h>
<syntaxhighlight lang="c">#include <stdlib.h>


int main(void)
int main(void)
Line 771: Line 838:
return 0;
return 0;
}</lang>
}</syntaxhighlight>


===Character Length===
===Character Length===
For wide character strings (usually Unicode uniform-width encodings such as UCS-2 or UCS-4):
For wide character strings (usually Unicode uniform-width encodings such as UCS-2 or UCS-4):


<lang c>#include <stdio.h>
<syntaxhighlight lang="c">#include <stdio.h>
#include <wchar.h>
#include <wchar.h>


Line 789: Line 856:
return 0;
return 0;
}</lang>
}</syntaxhighlight>


===Dealing with raw multibyte string===
===Dealing with raw multibyte string===
The following code is written in UTF-8, and the environment locale is assumed to be UTF-8 too. Note that "møøse" is here directly written in the source code for clarity, which is not a good idea in general. <code>mbstowcs()</code>, when passed NULL as the first argument, effectively counts the number of chars in the given string under the current locale.
The following code is written in UTF-8, and the environment locale is assumed to be UTF-8 too. Note that "møøse" is here directly written in the source code for clarity, which is not a good idea in general. <code>mbstowcs()</code>, when passed NULL as the first argument, effectively counts the number of chars in the given string under the current locale.
<lang c>#include <stdio.h>
<syntaxhighlight lang="c">#include <stdio.h>
#include <stdlib.h>
#include <stdlib.h>
#include <locale.h>
#include <locale.h>
Line 805: Line 872:


return 0;
return 0;
}</lang>output<pre>bytes: 7
}</syntaxhighlight>output<pre>bytes: 7
chars: 5</pre>
chars: 5</pre>


Line 813: Line 880:
{{works with|C sharp|C #|1.0+}}
{{works with|C sharp|C #|1.0+}}
===Character Length===
===Character Length===
<lang csharp>string s = "Hello, world!";
<syntaxhighlight lang="csharp">string s = "Hello, world!";
int characterLength = s.Length;</lang>
int characterLength = s.Length;</syntaxhighlight>


===Byte Length===
===Byte Length===
Strings in .NET are stored in Unicode.
Strings in .NET are stored in Unicode.
<lang csharp>using System.Text;
<syntaxhighlight lang="csharp">using System.Text;


string s = "Hello, world!";
string s = "Hello, world!";
int byteLength = Encoding.Unicode.GetByteCount(s);</lang>
int byteLength = Encoding.Unicode.GetByteCount(s);</syntaxhighlight>
To get the number of bytes that the string would require in a different encoding, e.g., UTF8:
To get the number of bytes that the string would require in a different encoding, e.g., UTF8:
<lang csharp>int utf8ByteLength = Encoding.UTF8.GetByteCount(s);</lang>
<syntaxhighlight lang="csharp">int utf8ByteLength = Encoding.UTF8.GetByteCount(s);</syntaxhighlight>


=={{header|C++}}==
=={{header|C++}}==
Line 829: Line 896:
{{works with|ISO C++}}
{{works with|ISO C++}}
{{works with|g++|4.0.2}}
{{works with|g++|4.0.2}}
<lang cpp>#include <string> // (not <string.h>!)
<syntaxhighlight lang="cpp">#include <string> // (not <string.h>!)
using std::string;
using std::string;


Line 839: Line 906:
// In bytes same as above since sizeof(char) == 1
// In bytes same as above since sizeof(char) == 1
string::size_type bytes = s.length() * sizeof(string::value_type);
string::size_type bytes = s.length() * sizeof(string::value_type);
}</lang>
}</syntaxhighlight>
For wide character strings:
For wide character strings:


<lang cpp>#include <string>
<syntaxhighlight lang="cpp">#include <string>
using std::wstring;
using std::wstring;
Line 849: Line 916:
wstring s = L"\u304A\u306F\u3088\u3046";
wstring s = L"\u304A\u306F\u3088\u3046";
wstring::size_type length = s.length() * sizeof(wstring::value_type); // in bytes
wstring::size_type length = s.length() * sizeof(wstring::value_type); // in bytes
}</lang>
}</syntaxhighlight>


===Character Length===
===Character Length===
Line 857: Line 924:
For wide character strings:
For wide character strings:


<lang cpp>#include <string>
<syntaxhighlight lang="cpp">#include <string>
using std::wstring;
using std::wstring;


Line 864: Line 931:
wstring s = L"\u304A\u306F\u3088\u3046";
wstring s = L"\u304A\u306F\u3088\u3046";
wstring::size_type length = s.length();
wstring::size_type length = s.length();
}</lang>
}</syntaxhighlight>


For narrow character strings:
For narrow character strings:
Line 871: Line 938:
{{works with|clang++|3.0}}
{{works with|clang++|3.0}}


<lang cpp>#include <iostream>
<syntaxhighlight lang="cpp">#include <iostream>
#include <codecvt>
#include <codecvt>
int main()
int main()
Line 879: Line 946:
std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> conv;
std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> conv;
std::cout << "Character length: " << conv.from_bytes(utf8).size() << '\n';
std::cout << "Character length: " << conv.from_bytes(utf8).size() << '\n';
}</lang>
}</syntaxhighlight>


{{works with|C++98}}
{{works with|C++98}}
{{works with|g++|4.1.2 20061115 (prerelease) (SUSE Linux)}}
{{works with|g++|4.1.2 20061115 (prerelease) (SUSE Linux)}}
<lang cpp>#include <cwchar> // for mbstate_t
<syntaxhighlight lang="cpp">#include <cwchar> // for mbstate_t
#include <locale>
#include <locale>


Line 919: Line 986:
// return the result
// return the result
return length;
return length;
}</lang>
}</syntaxhighlight>


Example usage (note that the locale names are OS specific):
Example usage (note that the locale names are OS specific):


<lang cpp>#include <iostream>
<syntaxhighlight lang="cpp">#include <iostream>


int main()
int main()
Line 932: Line 999:
// Tür in ISO-8859-1
// Tür in ISO-8859-1
std::cout << char_length("\x54\xfc\x72", "de_DE") << "\n"; // outputs 3
std::cout << char_length("\x54\xfc\x72", "de_DE") << "\n"; // outputs 3
}</lang>
}</syntaxhighlight>


Note that the strings are given as explicit hex sequences, so that the encoding used for the source code won't matter.
Note that the strings are given as explicit hex sequences, so that the encoding used for the source code won't matter.
Line 940: Line 1,007:
Clean Strings are unboxed arrays of characters. Characters are always a single byte. The function size returns the number of elements in an array.
Clean Strings are unboxed arrays of characters. Characters are always a single byte. The function size returns the number of elements in an array.


<lang clean>import StdEnv
<syntaxhighlight lang="clean">import StdEnv


strlen :: String -> Int
strlen :: String -> Int
strlen string = size string
strlen string = size string


Start = strlen "Hello, world!"</lang>
Start = strlen "Hello, world!"</syntaxhighlight>


=={{header|Clojure}}==
=={{header|Clojure}}==
===Byte Length===
===Byte Length===
<lang clojure>(def utf-8-octet-length #(-> % (.getBytes "UTF-8") count))
<syntaxhighlight lang="clojure">(def utf-8-octet-length #(-> % (.getBytes "UTF-8") count))
(map utf-8-octet-length ["møøse" "𝔘𝔫𝔦𝔠𝔬𝔡𝔢" "J\u0332o\u0332s\u0332e\u0301\u0332"]) ; (7 28 14)
(map utf-8-octet-length ["møøse" "𝔘𝔫𝔦𝔠𝔬𝔡𝔢" "J\u0332o\u0332s\u0332e\u0301\u0332"]) ; (7 28 14)


Line 956: Line 1,023:


(def code-unit-length count)
(def code-unit-length count)
(map code-unit-length ["møøse" "𝔘𝔫𝔦𝔠𝔬𝔡𝔢" "J\u0332o\u0332s\u0332e\u0301\u0332"]) ; (5 14 9)</lang>
(map code-unit-length ["møøse" "𝔘𝔫𝔦𝔠𝔬𝔡𝔢" "J\u0332o\u0332s\u0332e\u0301\u0332"]) ; (5 14 9)</syntaxhighlight>


===Character length===
===Character length===
<lang clojure>(def character-length #(.codePointCount % 0 (count %)))
<syntaxhighlight lang="clojure">(def character-length #(.codePointCount % 0 (count %)))
(map character-length ["møøse" "𝔘𝔫𝔦𝔠𝔬𝔡𝔢" "J\u0332o\u0332s\u0332e\u0301\u0332"]) ; (5 7 9)</lang>
(map character-length ["møøse" "𝔘𝔫𝔦𝔠𝔬𝔡𝔢" "J\u0332o\u0332s\u0332e\u0301\u0332"]) ; (5 7 9)</syntaxhighlight>


===Grapheme Length===
===Grapheme Length===
<lang clojure>(def grapheme-length
<syntaxhighlight lang="clojure">(def grapheme-length
#(->> (doto (java.text.BreakIterator/getCharacterInstance)
#(->> (doto (java.text.BreakIterator/getCharacterInstance)
(.setText %))
(.setText %))
Line 970: Line 1,037:
(take-while (partial not= java.text.BreakIterator/DONE))
(take-while (partial not= java.text.BreakIterator/DONE))
count))
count))
(map grapheme-length ["møøse" "𝔘𝔫𝔦𝔠𝔬𝔡𝔢" "J\u0332o\u0332s\u0332e\u0301\u0332"]) ; (5 7 4)</lang>
(map grapheme-length ["møøse" "𝔘𝔫𝔦𝔠𝔬𝔡𝔢" "J\u0332o\u0332s\u0332e\u0301\u0332"]) ; (5 7 4)</syntaxhighlight>


=={{header|COBOL}}==
=={{header|COBOL}}==
===Byte Length===
===Byte Length===
<lang cobol>FUNCTION BYTE-LENGTH(str)</lang>
<syntaxhighlight lang="cobol">FUNCTION BYTE-LENGTH(str)</syntaxhighlight>


Alternative, non-standard extensions:
Alternative, non-standard extensions:
{{works with|GNU Cobol}}
{{works with|GNU Cobol}}
<lang cobol>LENGTH OF str</lang>
<syntaxhighlight lang="cobol">LENGTH OF str</syntaxhighlight>


{{works with|GNU Cobol}}
{{works with|GNU Cobol}}
{{works with|Visual COBOL}}
{{works with|Visual COBOL}}
<lang cobol>FUNCTION LENGTH-AN(str)</lang>
<syntaxhighlight lang="cobol">FUNCTION LENGTH-AN(str)</syntaxhighlight>


===Character Length===
===Character Length===
<lang cobol>FUNCTION LENGTH(str)</lang>
<syntaxhighlight lang="cobol">FUNCTION LENGTH(str)</syntaxhighlight>


=={{header|ColdFusion}}==
=={{header|ColdFusion}}==
===Byte Length===
===Byte Length===
<lang cfm>
<syntaxhighlight lang="cfm">
<cfoutput>
<cfoutput>
<cfset str = "Hello World">
<cfset str = "Hello World">
Line 996: Line 1,063:
<p>#arrayLen(t)#</p>
<p>#arrayLen(t)#</p>
</cfoutput>
</cfoutput>
</syntaxhighlight>
</lang>


===Character Length===
===Character Length===
<lang cfm>#len("Hello World")#</lang>
<syntaxhighlight lang="cfm">#len("Hello World")#</syntaxhighlight>


=={{header|Common Lisp}}==
=={{header|Common Lisp}}==
Line 1,006: Line 1,073:


{{works with|SBCL}}
{{works with|SBCL}}
<lang lisp>(length (sb-ext:string-to-octets "Hello Wørld"))</lang>
<syntaxhighlight lang="lisp">(length (sb-ext:string-to-octets "Hello Wørld"))</syntaxhighlight>
returns 12.
returns 12.
===Character Length===
===Character Length===
Common Lisp represents strings as sequences of characters, not bytes, so there is no ambiguity about the encoding. The [http://www.lispworks.com/documentation/HyperSpec/Body/f_length.htm length] function always returns the number of characters in a string.
Common Lisp represents strings as sequences of characters, not bytes, so there is no ambiguity about the encoding. The [http://www.lispworks.com/documentation/HyperSpec/Body/f_length.htm length] function always returns the number of characters in a string.
<lang lisp>(length "Hello World")</lang>
<syntaxhighlight lang="lisp">(length "Hello World")</syntaxhighlight>
returns 11, and
returns 11, and
<pre>(length "Hello Wørld")</pre>
<pre>(length "Hello Wørld")</pre>
Line 1,019: Line 1,086:


===Character Length===
===Character Length===
<lang oberon2>
<syntaxhighlight lang="oberon2">
MODULE TestLen;
MODULE TestLen;


Line 1,034: Line 1,101:


END TestLen.
END TestLen.
</syntaxhighlight>
</lang>


The ''$'' symbol in ''LEN(s$)'' makes Component Pascal take the sequence of characters up to the null terminator. So ''LEN(s$)'' returns the actual number of characters in the string rather than the size allocated for the variable.
The ''$'' symbol in ''LEN(s$)'' makes Component Pascal take the sequence of characters up to the null terminator. So ''LEN(s$)'' returns the actual number of characters in the string rather than the size allocated for the variable.
Line 1,045: Line 1,112:


===Byte Length===
===Byte Length===
<lang oberon2>
<syntaxhighlight lang="oberon2">
MODULE TestLen;
MODULE TestLen;


Line 1,061: Line 1,128:


END TestLen.
END TestLen.
</syntaxhighlight>
</lang>


Running command ''TestLen.DoByteLength'' gives following output:
Running command ''TestLen.DoByteLength'' gives following output:
Line 1,068: Line 1,135:
Length of characters in bytes: 10
Length of characters in bytes: 10
</pre>
</pre>

=={{header|Crystal}}==
UTF8 is the default encoding in Crystal.
===Byte Length===
<syntaxhighlight lang="crystal">"J̲o̲s̲é̲".bytesize</syntaxhighlight>

===Character Length===
<syntaxhighlight lang="crystal">"J̲o̲s̲é̲".chars.length</syntaxhighlight>


=={{header|D}}==
=={{header|D}}==
===Byte Length===
===Byte Length===
<lang d>import std.stdio;
<syntaxhighlight lang="d">import std.stdio;


void showByteLen(T)(T[] str) {
void showByteLen(T)(T[] str) {
Line 1,101: Line 1,176:
dstring s3c = "J̲o̲s̲é̲";
dstring s3c = "J̲o̲s̲é̲";
showByteLen(s3c);
showByteLen(s3c);
}</lang>
}</syntaxhighlight>
{{out}}
{{out}}
<pre>Byte length: 7 - 6dc3b8c3b87365
<pre>Byte length: 7 - 6dc3b8c3b87365
Line 1,116: Line 1,191:


===Character Length===
===Character Length===
<lang d>import std.stdio, std.range, std.conv;
<syntaxhighlight lang="d">import std.stdio, std.range, std.conv;


void showCodePointsLen(T)(T[] str) {
void showCodePointsLen(T)(T[] str) {
Line 1,146: Line 1,221:
dstring s3c = "J̲o̲s̲é̲";
dstring s3c = "J̲o̲s̲é̲";
showCodePointsLen(s3c);
showCodePointsLen(s3c);
}</lang>
}</syntaxhighlight>
{{out}}
{{out}}
<pre>Character length: 5 - 6d f8 f8 73 65
<pre>Character length: 5 - 6d f8 f8 73 65
Line 1,162: Line 1,237:
=={{header|DataWeave}}==
=={{header|DataWeave}}==
===Character Length===
===Character Length===
<lang DataWeave>sizeOf("foo")</lang>
<syntaxhighlight lang="dataweave">sizeOf("foo")</syntaxhighlight>


{{out}}
{{out}}
Line 1,172: Line 1,247:
===Byte Length===
===Byte Length===
Dc's "P" command prints numbers as strings. The number 22405534230753963835153736737 (hint: look at it in hex) represents "Hello world!". Counting the byte length of it is counting how often it iteratively can be divided by 256 with non zero result. The snippet defines the macro which calculates the length, prints the string 1st and then its length.
Dc's "P" command prints numbers as strings. The number 22405534230753963835153736737 (hint: look at it in hex) represents "Hello world!". Counting the byte length of it is counting how often it iteratively can be divided by 256 with non zero result. The snippet defines the macro which calculates the length, prints the string 1st and then its length.
<lang Dc>[256 / d 0<L 1 + ] sL
<syntaxhighlight lang="dc">[256 / d 0<L 1 + ] sL
22405534230753963835153736737 d P A P
22405534230753963835153736737 d P A P
lL x f</lang>
lL x f</syntaxhighlight>
<pre>
<pre>
Hello world!
Hello world!
Line 1,182: Line 1,257:
===Character Length===
===Character Length===
The following code outputs 5, which is the length of the string "abcde"
The following code outputs 5, which is the length of the string "abcde"
<lang Dc>[abcde]Zp</lang>
<syntaxhighlight lang="dc">[abcde]Zp</syntaxhighlight>


=={{header|Déjà Vu}}==
=={{header|Déjà Vu}}==
===Byte Length===
===Byte Length===
Byte length depends on the encoding, which internally is UTF-8, but users of the language can only get at the raw bytes after encoding a string into a blob.
Byte length depends on the encoding, which internally is UTF-8, but users of the language can only get at the raw bytes after encoding a string into a blob.
<lang dejavu>!. len !encode!utf-8 "møøse"
<syntaxhighlight lang="dejavu">!. len !encode!utf-8 "møøse"
!. len !encode!utf-8 "𝔘𝔫𝔦𝔠𝔬𝔡𝔢"</lang>
!. len !encode!utf-8 "𝔘𝔫𝔦𝔠𝔬𝔡𝔢"</syntaxhighlight>
{{out}}
{{out}}
<pre>
<pre>
Line 1,195: Line 1,270:


===Character Length===
===Character Length===
<lang dejavu>!. len "møøse"
<syntaxhighlight lang="dejavu">!. len "møøse"
!. len "𝔘𝔫𝔦𝔠𝔬𝔡𝔢"</lang>
!. len "𝔘𝔫𝔦𝔠𝔬𝔡𝔢"</syntaxhighlight>
{{out}}
{{out}}
<pre>5
<pre>5
Line 1,203: Line 1,278:
See [https://rosettacode.org/wiki/String_length#Pascal Pascal].
See [https://rosettacode.org/wiki/String_length#Pascal Pascal].
=={{header|Dyalect}}==
=={{header|Dyalect}}==
<lang dyalect>"Hello World".len()</lang>
<syntaxhighlight lang="dyalect">"Hello World".Length()</syntaxhighlight>


=={{header|E}}==
=={{header|E}}==
===Character Length===
===Character Length===
<lang e>"Hello World".size()</lang>
<syntaxhighlight lang="e">"Hello World".size()</syntaxhighlight>

=={{header|EasyLang}}==
===Character Length===
<syntaxhighlight lang="easylang">
# 5
print len "møøse"
# 7
print len "𝔘𝔫𝔦𝔠𝔬𝔡𝔢"
# 8
print len "J̲o̲s̲é̲"
# 1
print len "😀"
</syntaxhighlight>

=={{header|Ecstasy}}==
<syntaxhighlight lang="ecstasy">
module StrLen {
@Inject Console console;

void run(String s = "José") {
console.print($|For the string {s.quoted()}:
| Character length: {s.size}
| UTF-8 byte length: {s.calcUtf8Length()}
);
}
}
</syntaxhighlight>

{{out}}
<pre>
For the string "José":
Character length: 4
UTF-8 byte length: 5
</pre>


=={{header|Elena}}==
=={{header|Elena}}==
===Character Length===
===Character Length===
ELENA 4.x :
ELENA 4.x :
<lang elena>import extensions;
<syntaxhighlight lang="elena">import extensions;
public program()
public program()
Line 1,222: Line 1,331:
var ws_length := ws.Length; // Number of UTF-16 characters
var ws_length := ws.Length; // Number of UTF-16 characters
var u_length := ws.toArray().Length; //Number of UTF-32 characters
var u_length := ws.toArray().Length; //Number of UTF-32 characters
}</lang>
}</syntaxhighlight>


===Byte Length===
===Byte Length===
ELENA 4.x :
ELENA 4.x :
<lang elena>import extensions;
<syntaxhighlight lang="elena">import extensions;
public program()
public program()
Line 1,235: Line 1,344:
var s_byte_length := s.toByteArray().Length; // Number of bytes
var s_byte_length := s.toByteArray().Length; // Number of bytes
var ws_byte_length := ws.toByteArray().Length; // Number of bytes
var ws_byte_length := ws.toByteArray().Length; // Number of bytes
}</lang>
}</syntaxhighlight>


=={{header|Elixir}}==
=={{header|Elixir}}==
===Byte Length===
===Byte Length===
<lang elixir>
<syntaxhighlight lang="elixir">
name = "J\x{332}o\x{332}s\x{332}e\x{301}\x{332}"
name = "J\x{332}o\x{332}s\x{332}e\x{301}\x{332}"
byte_size(name)
byte_size(name)
# => 14
# => 14
</syntaxhighlight>
</lang>
===Character Length===
===Character Length===
<lang elixir>
<syntaxhighlight lang="elixir">
name = "J\x{332}o\x{332}s\x{332}e\x{301}\x{332}"
name = "J\x{332}o\x{332}s\x{332}e\x{301}\x{332}"
Enum.count(String.codepoints(name))
Enum.count(String.codepoints(name))
# => 9
# => 9
</syntaxhighlight>
</lang>
===Grapheme Length===
===Grapheme Length===
<lang elixir>
<syntaxhighlight lang="elixir">
name = "J\x{332}o\x{332}s\x{332}e\x{301}\x{332}"
name = "J\x{332}o\x{332}s\x{332}e\x{301}\x{332}"
String.length(name)
String.length(name)
# => 4
# => 4
</syntaxhighlight>
</lang>


=={{header|Emacs Lisp}}==
=={{header|Emacs Lisp}}==
===Character Length===
===Character Length===
<lang lisp>(length "hello")
<syntaxhighlight lang="lisp">(length "hello")
=> 5</lang>
;; => 5</syntaxhighlight>
===Byte Length===
===Byte Length===
<lang lisp>(string-bytes "\u1D518\u1D52B\u1D526")
<syntaxhighlight lang="lisp">(string-bytes "\u1D518\u1D52B\u1D526")
=> 12</lang>
;; => 12</syntaxhighlight>


<code>string-bytes</code> is the length of Emacs' internal representation. In Emacs 23 up this is utf-8. In earlier versions it was "emacs-mule".
<code>string-bytes</code> is the length of Emacs' internal representation. In Emacs 23 up this is utf-8. In earlier versions it was "emacs-mule".
Line 1,270: Line 1,379:
<code>string-width</code> is the displayed width of a string in the current frame and window. This is not the same as grapheme length since various Asian characters may display in 2 columns, depending on the type of tty or GUI.
<code>string-width</code> is the displayed width of a string in the current frame and window. This is not the same as grapheme length since various Asian characters may display in 2 columns, depending on the type of tty or GUI.


<lang lisp>(let ((str (apply 'string
<syntaxhighlight lang="lisp">(let ((str (apply 'string
(mapcar (lambda (c) (decode-char 'ucs c))
(mapcar (lambda (c) (decode-char 'ucs c))
'(#x1112 #x1161 #x11ab #x1100 #x1173 #x11af)))))
'(#x1112 #x1161 #x11ab #x1100 #x1173 #x11af)))))
Line 1,276: Line 1,385:
(string-bytes str)
(string-bytes str)
(string-width str)))
(string-width str)))
=> (6 18 4) ;; in emacs 23 up</lang>
;; => (6 18 4) ;; in emacs 23 up</syntaxhighlight>

=={{header|EMal}}==
<syntaxhighlight lang="emal">
text moose = "møøse"
text unicode = "𝔘𝔫𝔦𝔠𝔬𝔡𝔢"
text jose = "J" + 0U0332 + "o" + 0U0332 + "s" + 0U0332 + "e" + 0U0301 + 0U0332
text emoji = "𠇰😈🎶🔥é-"
</syntaxhighlight>
===Byte Length===
<syntaxhighlight lang="emal">
writeLine((blob!moose).length)
writeLine((blob!unicode).length)
writeLine((blob!jose).length)
writeLine((blob!emoji).length)
</syntaxhighlight>
{{out}}
<pre>
7
28
14
19
</pre>
===Character Length===
<syntaxhighlight lang="emal">
writeLine(moose.codePointsLength)
writeLine(unicode.codePointsLength)
writeLine(jose.codePointsLength)
writeLine(emoji.codePointsLength)
</syntaxhighlight>
{{out}}
<pre>
5
7
9
6
</pre>
===Grapheme Length===
<syntaxhighlight lang="emal">
writeLine(moose.graphemesLength)
writeLine(unicode.graphemesLength)
writeLine(jose.graphemesLength)
writeLine(emoji.graphemesLength)
</syntaxhighlight>
{{out}}
<pre>
5
7
4
6
</pre>


=={{header|Erlang}}==
=={{header|Erlang}}==
Line 1,290: Line 1,449:
=={{header|Euphoria}}==
=={{header|Euphoria}}==
===Character Length===
===Character Length===
<lang Euphoria>print(1,length("Hello World"))</lang>
<syntaxhighlight lang="euphoria">print(1,length("Hello World"))</syntaxhighlight>


=={{header|F_Sharp|F#}}==
=={{header|F_Sharp|F#}}==
This is delegated to the standard .Net framework string and encoding functions.
This is delegated to the standard .Net framework string and encoding functions.
===Byte Length===
===Byte Length===
<lang fsharp>open System.Text
<syntaxhighlight lang="fsharp">open System.Text
let byte_length str = Encoding.UTF8.GetByteCount(str)</lang>
let byte_length str = Encoding.UTF8.GetByteCount(str)</syntaxhighlight>
===Character Length===
===Character Length===
<lang fsharp>"Hello, World".Length</lang>
<syntaxhighlight lang="fsharp">"Hello, World".Length</syntaxhighlight>


=={{header|Factor}}==
=={{header|Factor}}==
===Byte Length===
===Byte Length===
Here are two words to compute the byte length of strings. The first one doesn't allocate new memory, the second one can easily be adapted to measure the byte length of encodings other than UTF8.
Here are two words to compute the byte length of strings. The first one doesn't allocate new memory, the second one can easily be adapted to measure the byte length of encodings other than UTF8.
<lang factor>: string-byte-length ( string -- n ) [ code-point-length ] map-sum ;
<syntaxhighlight lang="factor">: string-byte-length ( string -- n ) [ code-point-length ] map-sum ;
: string-byte-length-2 ( string -- n ) utf8 encode length ;</lang>
: string-byte-length-2 ( string -- n ) utf8 encode length ;</syntaxhighlight>
===Character Length===
===Character Length===
<code>length</code> works on any sequence, of which strings are one. Strings are UTF8 encoded.
<code>length</code> works on any sequence, of which strings are one. Strings are UTF8 encoded.
<lang factor>length</lang>
<syntaxhighlight lang="factor">length</syntaxhighlight>


=={{header|Fantom}}==
=={{header|Fantom}}==
Line 1,315: Line 1,474:
A string can be converted into an instance of <code>Buf</code> to treat the string as a sequence of bytes according to a given charset: the default is UTF8, but 16-bit representations can also be used.
A string can be converted into an instance of <code>Buf</code> to treat the string as a sequence of bytes according to a given charset: the default is UTF8, but 16-bit representations can also be used.


<lang fantom>
<syntaxhighlight lang="fantom">
fansh> c := "møøse"
fansh> c := "møøse"
møøse
møøse
Line 1,330: Line 1,489:
fansh> c.toBuf(Charset.utf16BE).toHex // display as UTF16 big-endian
fansh> c.toBuf(Charset.utf16BE).toHex // display as UTF16 big-endian
006d00f800f800730065
006d00f800f800730065
</syntaxhighlight>
</lang>


===Character length===
===Character length===


<lang fantom>
<syntaxhighlight lang="fantom">
fansh> c := "møøse"
fansh> c := "møøse"
møøse
møøse
fansh> c.size
fansh> c.size
5
5
</syntaxhighlight>
</lang>


=={{header|Forth}}==
=={{header|Forth}}==
Line 1,350: Line 1,509:
A counted string is a single pointer to a short string in memory. The string's first byte is the count of the number of characters in the string. This is how symbols are stored in a Forth dictionary.
A counted string is a single pointer to a short string in memory. The string's first byte is the count of the number of characters in the string. This is how symbols are stored in a Forth dictionary.


<lang forth>CREATE s ," Hello world" \ create string "s"
<syntaxhighlight lang="forth">CREATE s ," Hello world" \ create string "s"
s C@ ( -- length=11 )
s C@ ( -- length=11 )
s COUNT ( addr len ) \ convert to a stack string, described below</lang>
s COUNT ( addr len ) \ convert to a stack string, described below</syntaxhighlight>


'''Stack string'''
'''Stack string'''
Line 1,358: Line 1,517:
A string on the stack is represented by a pair of cells: the address of the string data and the length of the string data (in characters). The word '''COUNT''' converts a counted string into a stack string. The STRING utility wordset of ANS Forth works on these addr-len pairs. This representation has the advantages of not requiring null-termination, easy representation of substrings, and not being limited to 255 characters.
A string on the stack is represented by a pair of cells: the address of the string data and the length of the string data (in characters). The word '''COUNT''' converts a counted string into a stack string. The STRING utility wordset of ANS Forth works on these addr-len pairs. This representation has the advantages of not requiring null-termination, easy representation of substrings, and not being limited to 255 characters.


<lang forth>S" string" ( addr len)
<syntaxhighlight lang="forth">S" string" ( addr len)
DUP . \ 6</lang>
DUP . \ 6</syntaxhighlight>


===Character Length===
===Character Length===
Line 1,366: Line 1,525:
The following code will count the number of UTF-8 characters in a null-terminated string. It relies on the fact that all bytes of a UTF-8 character except the first have the binary bit pattern "10xxxxxx".
The following code will count the number of UTF-8 characters in a null-terminated string. It relies on the fact that all bytes of a UTF-8 character except the first have the binary bit pattern "10xxxxxx".


<lang forth>2 base !
<syntaxhighlight lang="forth">2 base !
: utf8+ ( str -- str )
: utf8+ ( str -- str )
begin
begin
Line 1,374: Line 1,533:
10000000 <>
10000000 <>
until ;
until ;
decimal</lang>
decimal</syntaxhighlight>


<lang forth>: count-utf8 ( zstr -- n )
<syntaxhighlight lang="forth">: count-utf8 ( zstr -- n )
0
0
begin
begin
Line 1,383: Line 1,542:
utf8+
utf8+
swap 1+
swap 1+
repeat drop ;</lang>
repeat drop ;</syntaxhighlight>


=={{header|Fortran}}==
=={{header|Fortran}}==
Line 1,400: Line 1,559:


=={{header|FreeBASIC}}==
=={{header|FreeBASIC}}==
<lang freebasic>' FB 1.05.0 Win64
<syntaxhighlight lang="freebasic">' FB 1.05.0 Win64


Dim s As String = "moose" '' variable length ascii string
Dim s As String = "moose" '' variable length ascii string
Line 1,422: Line 1,581:
Print "w : " ; w, "Character Length : "; Len(s), "Byte Length : "; SizeOf(w)
Print "w : " ; w, "Character Length : "; Len(s), "Byte Length : "; SizeOf(w)
Print
Print
Sleep</lang>
Sleep</syntaxhighlight>


{{out}}
{{out}}
Line 1,436: Line 1,595:
===Byte Length===
===Byte Length===
A string can be converted to an array of bytes in any supported encoding.
A string can be converted to an array of bytes in any supported encoding.
<lang frink>
<syntaxhighlight lang="frink">
b = "𝔘𝔫𝔦𝔠𝔬𝔡𝔢"
b = "𝔘𝔫𝔦𝔠𝔬𝔡𝔢"
length[stringToBytes[b, "UTF-8"]]
length[stringToBytes[b, "UTF-8"]]
</syntaxhighlight>
</lang>


===Character Length===
===Character Length===
Frink's string operations correctly handle upper-plane Unicode characters as a single codepoint.
Frink's string operations correctly handle upper-plane Unicode characters as a single codepoint.
<lang frink>
<syntaxhighlight lang="frink">
b = "𝔘𝔫𝔦𝔠𝔬𝔡𝔢"
b = "𝔘𝔫𝔦𝔠𝔬𝔡𝔢"
length[b]
length[b]
</syntaxhighlight>
</lang>


===Grapheme Length===
===Grapheme Length===
<lang frink>
<syntaxhighlight lang="frink">
b = "𝔘𝔫𝔦𝔠𝔬𝔡𝔢"
b = "𝔘𝔫𝔦𝔠𝔬𝔡𝔢"
graphemeLength[b]
graphemeLength[b]
</syntaxhighlight>
</lang>


=={{header|GAP}}==
=={{header|GAP}}==
<lang gap>Length("abc");
<syntaxhighlight lang="gap">Length("abc");
# or same result with
# or same result with
Size("abc");</lang>
Size("abc");</syntaxhighlight>


=={{header|Gnuplot}}==
=={{header|Gnuplot}}==
===Byte Length===
===Byte Length===
<lang gnuplot>print strlen("hello")
<syntaxhighlight lang="gnuplot">print strlen("hello")
=> 5</lang>
=> 5</syntaxhighlight>


=={{header|Go}}==
=={{header|Go}}==
====Byte Length====
====Byte Length====
<lang go>package main
<syntaxhighlight lang="go">package main


import "fmt"
import "fmt"
Line 1,475: Line 1,634:
j := "J̲o̲s̲é̲"
j := "J̲o̲s̲é̲"
fmt.Printf("%d %s % x\n", len(m), m, m)
fmt.Printf("%d %s % x\n", len(m), m, m)
fmt.Printf("%d %s %x\n", len(u), u, u)
fmt.Printf("%d %s % x\n", len(u), u, u)
fmt.Printf("%d %s % x\n", len(j), j, j)
fmt.Printf("%d %s % x\n", len(j), j, j)
}</lang>
}</syntaxhighlight>
Output:
Output:
<pre>
<pre>
7 møøse 6d c3 b8 c3 b8 73 65
7 møøse 6d c3 b8 c3 b8 73 65
28 𝔘𝔫𝔦𝔠𝔬𝔡𝔢 f09d9498f09d94abf09d94a6f09d94a0f09d94acf09d94a1f09d94a2
28 𝔘𝔫𝔦𝔠𝔬𝔡𝔢 f0 9d 94 98 f0 9d 94 ab f0 9d 94 a6 f0 9d 94 a0 f0 9d 94 ac f0 9d 94 a1 f0 9d 94 a2
14 J̲o̲s̲é̲ 4a cc b2 6f cc b2 73 cc b2 65 cc 81 cc b2
13 J̲o̲s̲é̲ 4a cc b2 6f cc b2 73 cc b2 c3 a9 cc b2
</pre>
</pre>

====Character Length====
====Character Length====
<lang go>package main
<syntaxhighlight lang="go">package main


import (
import (
Line 1,499: Line 1,659:
fmt.Printf("%d %s %x\n", utf8.RuneCountInString(u), u, []rune(u))
fmt.Printf("%d %s %x\n", utf8.RuneCountInString(u), u, []rune(u))
fmt.Printf("%d %s %x\n", utf8.RuneCountInString(j), j, []rune(j))
fmt.Printf("%d %s %x\n", utf8.RuneCountInString(j), j, []rune(j))
}</lang>
}</syntaxhighlight>
Output:
Output:
<pre>
<pre>
Line 1,508: Line 1,668:
===Grapheme Length===
===Grapheme Length===
Go does not have language or library features to recognize graphemes directly. For example, it does not provide functions implementing [http://www.unicode.org/reports/tr29/ Unicode Standard Annex #29, Unicode Text Segmentation]. It does however have convenient functions for recognizing Unicode character categories, and so an expected subset of grapheme possibilities is easy to recognize. Here is a solution recognizing the category "Mn", which includes the combining characters used in the task example.
Go does not have language or library features to recognize graphemes directly. For example, it does not provide functions implementing [http://www.unicode.org/reports/tr29/ Unicode Standard Annex #29, Unicode Text Segmentation]. It does however have convenient functions for recognizing Unicode character categories, and so an expected subset of grapheme possibilities is easy to recognize. Here is a solution recognizing the category "Mn", which includes the combining characters used in the task example.
<lang go>package main
<syntaxhighlight lang="go">package main


import (
import (
Line 1,537: Line 1,697:
}
}
return gr
return gr
}</lang>
}</syntaxhighlight>
Output:
Output:
<pre>
<pre>
Line 1,548: Line 1,708:
Calculating "Byte-length" (by which one typically means "in-memory storage size in bytes") is not possible through the facilities of the Groovy language alone. Calculating "Character length" is built into the Groovy extensions to java.lang.String.
Calculating "Byte-length" (by which one typically means "in-memory storage size in bytes") is not possible through the facilities of the Groovy language alone. Calculating "Character length" is built into the Groovy extensions to java.lang.String.
===Character Length===
===Character Length===
<syntaxhighlight lang="groovy">
<lang groovy>println "Hello World!".size()</lang>
println "Hello World!".size()
println "møøse".size()
println "𝔘𝔫𝔦𝔠𝔬𝔡𝔢".size()
println "J̲o̲s̲é̲".size()
</syntaxhighlight>


Output:
Output:
<pre>12</pre>
<pre>
12
5
14
8
</pre>


Note: The Java "String.length()" method also works in Groovy, but "size()" is consistent with usage in other sequential or composite types.
Note: The Java "String.length()" method also works in Groovy, but "size()" is consistent with usage in other sequential or composite types.
Line 1,558: Line 1,728:
GW-BASIC only supports single-byte characters.
GW-BASIC only supports single-byte characters.


<lang qbasic>10 INPUT A$
<syntaxhighlight lang="qbasic">10 INPUT A$
20 PRINT LEN(A$)</lang>
20 PRINT LEN(A$)</syntaxhighlight>


=={{header|Haskell}}==
=={{header|Haskell}}==
Line 1,569: Line 1,739:
There are several (non-standard, so far) Unicode encoding libraries available on [http://hackage.haskell.org/ Hackage]. As an example, we'll use [http://hackage.haskell.org/packages/archive/encoding/0.2/doc/html/Data-Encoding.html encoding-0.2], as ''Data.Encoding'':
There are several (non-standard, so far) Unicode encoding libraries available on [http://hackage.haskell.org/ Hackage]. As an example, we'll use [http://hackage.haskell.org/packages/archive/encoding/0.2/doc/html/Data-Encoding.html encoding-0.2], as ''Data.Encoding'':


<lang haskell>import Data.Encoding
<syntaxhighlight lang="haskell">import Data.Encoding
import Data.ByteString as B
import Data.ByteString as B


Line 1,579: Line 1,749:


strlenUTF8 = B.length strUTF8
strlenUTF8 = B.length strUTF8
strlenUTF32 = B.length strUTF32</lang>
strlenUTF32 = B.length strUTF32</syntaxhighlight>
===Character Length===
===Character Length===
{{works with|GHC|GHCi|6.6}}
{{works with|GHC|GHCi|6.6}}
Line 1,585: Line 1,755:
The base type ''Char'' defined by the standard is already intended for (plain) Unicode characters.
The base type ''Char'' defined by the standard is already intended for (plain) Unicode characters.


<lang haskell>strlen = length "Hello, world!"</lang>
<syntaxhighlight lang="haskell">strlen = length "Hello, world!"</syntaxhighlight>


=={{header|HicEst}}==
=={{header|HicEst}}==
<lang hicest>LEN("1 character == 1 byte") ! 21</lang>
<syntaxhighlight lang="hicest">LEN("1 character == 1 byte") ! 21</syntaxhighlight>


=={{header|HolyC}}==
=={{header|HolyC}}==
===Byte Length===
===Byte Length===
<lang holyc>U8 *string = "Hello, world!";
<syntaxhighlight lang="holyc">U8 *string = "Hello, world!";
Print("%d\n", StrLen(string));
Print("%d\n", StrLen(string));
</syntaxhighlight>
</lang>


=={{header|Icon}} and {{header|Unicon}}==
=={{header|Icon}} and {{header|Unicon}}==
==== Character Length ====
==== Character Length ====
<lang Icon> length := *s</lang>
<syntaxhighlight lang="icon"> length := *s</syntaxhighlight>


Note: Neither Icon nor Unicon currently supports double-byte character sets.
Note: Neither Icon nor Unicon currently supports double-byte character sets.
Line 1,607: Line 1,777:
'''Compiler:''' any IDL compiler should do
'''Compiler:''' any IDL compiler should do


<lang idl>length = strlen("Hello, world!")</lang>
<syntaxhighlight lang="idl">length = strlen("Hello, world!")</syntaxhighlight>
===Character Length===
===Character Length===
{{needs-review|IDL}}
{{needs-review|IDL}}
<lang idl>length = strlen("Hello, world!")</lang>
<syntaxhighlight lang="idl">length = strlen("Hello, world!")</syntaxhighlight>


=={{header|Io}}==
=={{header|Io}}==
===Byte Length===
===Byte Length===
<lang io>"møøse" sizeInBytes</lang>
<syntaxhighlight lang="io">"møøse" sizeInBytes</syntaxhighlight>


===Character Length===
===Character Length===
<lang io>"møøse" size</lang>
<syntaxhighlight lang="io">"møøse" size</syntaxhighlight>


=={{header|J}}==
=={{header|J}}==
===Byte Length===
===Byte Length===
<lang j> # 'møøse'
<syntaxhighlight lang="j"> # 'møøse'
7</lang>
7</syntaxhighlight>
Here we use the default encoding for character literals (8 bit wide literals).
Here we use the default encoding for character literals (8 bit wide literals).
===Character Length===
===Character Length===
<lang j> #7 u: 'møøse'
<syntaxhighlight lang="j"> #7 u: 'møøse'
5</lang>
5</syntaxhighlight>
Here we have used 16 bit wide character literals. See also the dictionary page for [http://www.jsoftware.com/help/dictionary/duco.htm u:].
Here we have used 16 bit wide character literals. See also the dictionary page for [http://www.jsoftware.com/help/dictionary/duco.htm u:].

=={{header|Jakt}}==
===Character Length===
<syntaxhighlight lang="jakt">
// Returns the number of Unicode code points in `string`, counted by
// walking the iterator returned by string.code_points().
fn character_length(string: String) -> i64 {
mut length = 0
// One increment per code point yielded; the value itself is not needed.
// Combining marks are counted individually, which is why the sample
// output reports 8 for "J̲o̲s̲é̲".
for _ in string.code_points() {
length++
}
return length
}

// Demo driver: prints each sample string in double quotes followed by the
// code-point count computed by character_length().
fn main() {
for string in [
"Hello world!"
"møøse"
"𝔘𝔫𝔦𝔠𝔬𝔡𝔢"
"J̲o̲s̲é̲"
] {
println("\"{}\" {}", string, character_length(string))
}
}
</syntaxhighlight>
{{out}}
<pre>
"Hello world!" 12
"møøse" 5
"𝔘𝔫𝔦𝔠𝔬𝔡𝔢" 7
"J̲o̲s̲é̲" 8
</pre>

===Byte Length===
<syntaxhighlight lang="jakt">
// Demo driver: prints each sample string in double quotes followed by the
// result of string.length(). The enclosing section presents this value as
// the byte length (e.g. 7 for "møøse" in the sample output).
fn main() {
for string in [
"Hello world!"
"møøse"
"𝔘𝔫𝔦𝔠𝔬𝔡𝔢"
"J̲o̲s̲é̲"
] {
println("\"{}\" {}", string, string.length())
}
}
</syntaxhighlight>
{{out}}
<pre>
"Hello world!" 12
"møøse" 7
"𝔘𝔫𝔦𝔠𝔬𝔡𝔢" 28
"J̲o̲s̲é̲" 13
</pre>


=={{header|Java}}==
=={{header|Java}}==
Line 1,635: Line 1,856:
Another way to know the byte length of a string -who cares- is to explicitly specify the charset we desire.
Another way to know the byte length of a string -who cares- is to explicitly specify the charset we desire.


<lang java5>String s = "Hello, world!";
<syntaxhighlight lang="java5">String s = "Hello, world!";
int byteCountUTF16 = s.getBytes("UTF-16").length; // Incorrect: it yields 28 (that is with the BOM)
int byteCountUTF16 = s.getBytes("UTF-16").length; // Incorrect: it yields 28 (that is with the BOM)
int byteCountUTF16LE = s.getBytes("UTF-16LE").length; // Correct: it yields 26
int byteCountUTF16LE = s.getBytes("UTF-16LE").length; // Correct: it yields 26
int byteCountUTF8 = s.getBytes("UTF-8").length; // yields 13 </lang>
int byteCountUTF8 = s.getBytes("UTF-8").length; // yields 13 </syntaxhighlight>


===Character Length===
===Character Length===
Line 1,644: Line 1,865:


The length method of String objects is not the length of that String in characters. Instead, it only gives the number of 16-bit code units used to encode a string. This is not (always) the number of Unicode characters (code points) in the string.
The length method of String objects is not the length of that String in characters. Instead, it only gives the number of 16-bit code units used to encode a string. This is not (always) the number of Unicode characters (code points) in the string.
<lang java5>String s = "Hello, world!";
<syntaxhighlight lang="java5">String s = "Hello, world!";
int not_really_the_length = s.length(); // XXX: does not (always) count Unicode characters (code points)! </lang>
int not_really_the_length = s.length(); // XXX: does not (always) count Unicode characters (code points)! </syntaxhighlight>


Since Java 1.5, the actual number of characters (code points) can be determined by calling the codePointCount method.
Since Java 1.5, the actual number of characters (code points) can be determined by calling the codePointCount method.
<lang java5>String str = "\uD834\uDD2A"; //U+1D12A
<syntaxhighlight lang="java5">String str = "\uD834\uDD2A"; //U+1D12A
int not_really__the_length = str.length(); // value is 2, which is not the length in characters
int not_really__the_length = str.length(); // value is 2, which is not the length in characters
int actual_length = str.codePointCount(0, str.length()); // value is 1, which is the length in characters</lang>
int actual_length = str.codePointCount(0, str.length()); // value is 1, which is the length in characters</syntaxhighlight>
===Grapheme Length===
===Grapheme Length===

<lang java>import java.text.BreakIterator;
Since JDK 20<ref>https://bugs.openjdk.org/browse/JDK-8291660</ref>.

<syntaxhighlight lang="java">import java.text.BreakIterator;


public class Grapheme {
public class Grapheme {
Line 1,670: Line 1,894:
System.out.println("Grapheme length: " + count+ " " + s);
System.out.println("Grapheme length: " + count+ " " + s);
}
}
}</lang>
}</syntaxhighlight>
Output:
Output:
<pre>
<pre>
Line 1,679: Line 1,903:


=={{header|JavaScript}}==
=={{header|JavaScript}}==

===Byte Length===
===Byte length===
JavaScript encodes strings in UTF-16, which represents each character with one or two 16-bit values. The length property of string objects gives the number of 16-bit values used to encode a string, so the number of bytes can be determined by doubling that number.
JavaScript encodes strings in UTF-16, which represents each character with one or two 16-bit values. The length property of string objects gives the number of 16-bit values used to encode a string, so the number of bytes can be determined by doubling that number.


<syntaxhighlight lang="javascript">
<lang javascript>var s = "Hello, world!";
var s = "Hello, world!";
var byteCount = s.length * 2; //26</lang>
var byteCount = s.length * 2; // 26
===Character Length===
</syntaxhighlight>

It's easier to use Buffer.byteLength (Node.JS specific, not ECMAScript).

<syntaxhighlight lang="javascript">
a = '👩‍❤️‍👩'
Buffer.byteLength(a, 'utf16le'); // 16
Buffer.byteLength(a, 'utf8'); // 20
Buffer.byteLength(s, 'utf16le'); // 26
Buffer.byteLength(s, 'utf8'); // 13
</syntaxhighlight>

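Without Node.js-specific APIs, TextEncoder (defined by the WHATWG Encoding Standard rather than by ECMAScript itself, but available in browsers, Node.js and Deno) can be used to return the UTF-8 byte size: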

<syntaxhighlight lang="javascript">
(new TextEncoder().encode(a)).length; // 20
(new TextEncoder().encode(s)).length; // 13
</syntaxhighlight>

=== Unicode codepoint length ===

JavaScript encodes strings in UTF-16, which represents each character with one or two 16-bit values. The most commonly used characters are represented by one 16-bit value, while rarer ones like some mathematical symbols are represented by two.
JavaScript encodes strings in UTF-16, which represents each character with one or two 16-bit values. The most commonly used characters are represented by one 16-bit value, while rarer ones like some mathematical symbols are represented by two.


JavaScript has no built-in way to determine how many characters are in a string. However, if the string only contains commonly used characters, the number of characters will be equal to the number of 16-bit values used to represent the characters.
If the string only contains commonly used characters, the number of characters will be equal to the number of 16-bit values used to represent the characters.

<lang javascript>var str1 = "Hello, world!";
<syntaxhighlight lang="javascript">
var len1 = str1.length; //13
var str1 = "Hello, world!";
var len1 = str1.length; // 13

var str2 = "\uD834\uDD2A"; // U+1D12A represented by a UTF-16 surrogate pair
var len2 = str2.length; // 2
</syntaxhighlight>

More generally, the spread syntax (<code>...</code>) in an array literal can be used to enumerate Unicode code points:

<syntaxhighlight lang="javascript">
[...str2].length // 1
</syntaxhighlight>

=== Unicode grapheme length ===

Counting Unicode code points in a string that uses combining characters, such as joining sequences or diacritics, gives the wrong size, so we must count graphemes instead. The default granularity of Intl.Segmenter is grapheme.

<syntaxhighlight lang="javascript">
[...new Intl.Segmenter().segment(a)].length; // 1
</syntaxhighlight>


var str2 = "\uD834\uDD2A"; //U+1D12A represented by a UTF-16 surrogate pair
var len2 = str2.length; //2</lang>
===ES6 destructuring/iterators===
===ES6 destructuring/iterators===

ES6 provides several ways to get a string split into an array of code points instead of UTF-16 code units:
ES6 provides several ways to get a string split into an array of code points instead of UTF-16 code units:
<lang javascript>let
<syntaxhighlight lang="javascript">let
str='AöЖ€𝄞'
str='AöЖ€𝄞'
,countofcodeunits=str.length // 6
,countofcodeunits=str.length // 6
Line 1,716: Line 1,981:
countofcodepoints=cparr.length // 5
countofcodepoints=cparr.length // 5
}
}
</syntaxhighlight>
</lang>

=={{header|Joy}}==
;Byte length
<syntaxhighlight lang="joy">"Café" size.</syntaxhighlight>
{{out}}
<pre>5</pre>


=={{header|jq}}==
=={{header|jq}}==
jq strings are JSON strings and are therefore encoded as UTF-8. When given a JSON string, the <tt>length</tt> filter emits the number of Unicode codepoints that it contains:
jq strings are JSON strings and are therefore encoded as UTF-8. When given a JSON string, the <tt>length</tt> filter emits the number of Unicode codepoints that it contains:
<lang jq>$ cat String_length.jq
<syntaxhighlight lang="jq">$ cat String_length.jq
def describe:
def describe:
"length of \(.) is \(length)";
"length of \(.) is \(length)";


("J̲o̲s̲é̲", "𝔘𝔫𝔦𝔠𝔬𝔡𝔢") | describe</lang><lang sh>
("J̲o̲s̲é̲", "𝔘𝔫𝔦𝔠𝔬𝔡𝔢") | describe</syntaxhighlight><syntaxhighlight lang="sh">
$ jq -n -f String_length.jq
$ jq -n -f String_length.jq
"length of J̲o̲s̲é̲ is 8"
"length of J̲o̲s̲é̲ is 8"
"length of 𝔘𝔫𝔦𝔠𝔬𝔡𝔢 is 7"</lang>
"length of 𝔘𝔫𝔦𝔠𝔬𝔡𝔢 is 7"</syntaxhighlight>


=={{header|JudoScript}}==
=={{header|JudoScript}}==
===Byte Length===
===Byte Length===
{{needs-review|JudoScript}}
{{needs-review|JudoScript}}
<lang judoscript>//Store length of hello world in length and print it
<syntaxhighlight lang="judoscript">//Store length of hello world in length and print it
. length = "Hello World".length();</lang>
. length = "Hello World".length();</syntaxhighlight>
===Character Length===
===Character Length===
{{needs-review| JudoScript}}
{{needs-review| JudoScript}}
<lang judoscript>//Store length of hello world in length and print it
<syntaxhighlight lang="judoscript">//Store length of hello world in length and print it
. length = "Hello World".length()</lang>
. length = "Hello World".length()</syntaxhighlight>


=={{header|Julia}}==
=={{header|Julia}}==

Julia encodes strings as UTF-8, so the byte length (via <code>sizeof</code>) will be different from the string length (via <code>length</code>) only if the string contains non-ASCII characters.
Julia encodes strings as UTF-8, so the byte length (via <code>sizeof</code>) will be different from the string length (via <code>length</code>) only if the string contains non-ASCII characters.


===Byte Length===
===Byte Length===

<lang julia>sizeof("Hello, world!") # gives 13
<syntaxhighlight lang="julia">
sizeof("Hellö, wørld!") # gives 15</lang>
sizeof("møøse") # 7
sizeof("𝔘𝔫𝔦𝔠𝔬𝔡𝔢") # 28
sizeof("J̲o̲s̲é̲") # 13
</syntaxhighlight>


===Character Length===
===Character Length===

<lang julia>length("Hello, world!") # gives 13
<syntaxhighlight lang="julia">
length("Hellö, wørld!") # gives 13</lang>
length("møøse") # 5
length("𝔘𝔫𝔦𝔠𝔬𝔡𝔢") # 7
length("J̲o̲s̲é̲") # 8
</syntaxhighlight>

===Grapheme Length===

<syntaxhighlight lang="julia">
import Unicode
length(Unicode.graphemes("møøse")) # 5
length(Unicode.graphemes("𝔘𝔫𝔦𝔠𝔬𝔡𝔢")) # 7
length(Unicode.graphemes("J̲o̲s̲é̲")) # 4
</syntaxhighlight>


=={{header|K}}==
=={{header|K}}==
===Character Length===
===Character Length===
<syntaxhighlight lang="k">
<lang K>
#"Hello, world!"
#"Hello, world!"
13
13
#"Hëllo, world!"
#"Hëllo, world!"
13
13
</syntaxhighlight>
</lang>


=={{header|Kotlin}}==
=={{header|Kotlin}}==
Line 1,763: Line 2,052:


As each UTF-16 character occupies 2 bytes, it follows that the number of bytes occupied by the string will be twice the length:
As each UTF-16 character occupies 2 bytes, it follows that the number of bytes occupied by the string will be twice the length:
<syntaxhighlight lang="kotlin">
<lang scala>// version 1.0.6
fun main(args: Array<String>) {
fun main() {
val s = "José"
val s = "José"
println("The char length is ${s.length}")
println("The char length is ${s.length}")
println("The byte length is ${Character.BYTES * s.length}")
println("The byte length is ${Char.SIZE_BYTES * s.length}")
}</lang>
}</syntaxhighlight>


{{out}}
{{out}}
Line 1,788: Line 2,077:
The lambdatalk {W.length string} function returns the number of bytes in a string. For Unicode characters made of two bytes things are a little bit more tricky. It's easy to add (inline) a new javascript primitive to the dictionary:
The lambdatalk {W.length string} function returns the number of bytes in a string. For Unicode characters made of two bytes things are a little bit more tricky. It's easy to add (inline) a new javascript primitive to the dictionary:


<lang scheme>
<syntaxhighlight lang="scheme">
{script
{script
LAMBDATALK.DICT["W.unicodeLength"] = function() {
LAMBDATALK.DICT["W.unicodeLength"] = function() {
Line 1,822: Line 2,111:
{W.length 𝔘𝔫𝔦𝔠𝔬𝔡𝔢} -> 14
{W.length 𝔘𝔫𝔦𝔠𝔬𝔡𝔢} -> 14
{W.unicodeLength 𝔘𝔫𝔦𝔠𝔬𝔡𝔢} -> 7
{W.unicodeLength 𝔘𝔫𝔦𝔠𝔬𝔡𝔢} -> 7
</syntaxhighlight>
</lang>




=={{header|Lasso}}==
=={{header|Lasso}}==
===Character Length===
===Character Length===
<lang Lasso>'Hello, world!'->size // 13
<syntaxhighlight lang="lasso">'Hello, world!'->size // 13
'møøse'->size // 5
'møøse'->size // 5
'𝔘𝔫𝔦𝔠𝔬𝔡𝔢'->size // 7</lang>
'𝔘𝔫𝔦𝔠𝔬𝔡𝔢'->size // 7</syntaxhighlight>


===Byte Length===
===Byte Length===
<lang Lasso>'Hello, world!'->asBytes->size // 13
<syntaxhighlight lang="lasso">'Hello, world!'->asBytes->size // 13
'møøse'->asBytes->size // 7
'møøse'->asBytes->size // 7
'𝔘𝔫𝔦𝔠𝔬𝔡𝔢'->asBytes->size // 28</lang>
'𝔘𝔫𝔦𝔠𝔬𝔡𝔢'->asBytes->size // 28</syntaxhighlight>


=={{header|LFE}}==
=={{header|LFE}}==
Line 1,840: Line 2,129:
=== Character Length ===
=== Character Length ===


<lang lisp>
<syntaxhighlight lang="lisp">
(length "ASCII text")
(length "ASCII text")
10
10
Line 1,849: Line 2,138:
> (length (unicode:characters_to_list encoded 'utf8))
> (length (unicode:characters_to_list encoded 'utf8))
12
12
</syntaxhighlight>
</lang>


=== Byte Length ===
=== Byte Length ===


<lang lisp>
<syntaxhighlight lang="lisp">
> (set encoded (binary ("𝔘𝔫𝔦𝔠𝔬𝔡𝔢 𝔗𝔢𝒙𝔱" utf8)))
> (set encoded (binary ("𝔘𝔫𝔦𝔠𝔬𝔡𝔢 𝔗𝔢𝒙𝔱" utf8)))
#B(240 157 148 152 240 157 148 171 240 157 ...)
#B(240 157 148 152 240 157 148 171 240 157 ...)
Line 1,866: Line 2,155:
> (byte_size encoded)
> (byte_size encoded)
10
10
</syntaxhighlight>
</lang>


=={{header|Liberty BASIC}}==
=={{header|Liberty BASIC}}==
Line 1,873: Line 2,162:
=={{header|Lingo}}==
=={{header|Lingo}}==
===Character Length===
===Character Length===
<lang lingo>utf8Str = "Hello world äöü"
<syntaxhighlight lang="lingo">utf8Str = "Hello world äöü"
put utf8Str.length
put utf8Str.length
-- 15</lang>
-- 15</syntaxhighlight>
===Byte Length===
===Byte Length===
<lang lingo>utf8Str = "Hello world äöü"
<syntaxhighlight lang="lingo">utf8Str = "Hello world äöü"
put bytearray(utf8Str).length
put bytearray(utf8Str).length
-- 18</lang>
-- 18</syntaxhighlight>


=={{header|LiveCode}}==
=={{header|LiveCode}}==
Line 1,886: Line 2,175:


===Character Length===
===Character Length===
<lang LiveCode >put the length of "Hello World" </lang>
<syntaxhighlight lang="livecode ">put the length of "Hello World" </syntaxhighlight>
or
or
<lang LiveCode >put the number of characters in "Hello World" -- 'chars' short for characters is also valid</lang>
<syntaxhighlight lang="livecode ">put the number of characters in "Hello World" -- 'chars' short for characters is also valid</syntaxhighlight>
or
or
<lang LiveCode >put length("Hello World")</lang>
<syntaxhighlight lang="livecode ">put length("Hello World")</syntaxhighlight>


for Unicode character count use the code units keyword
for Unicode character count use the code units keyword
<lang LiveCode >put the number of codeunits of "Hello World" -- count of unicode characters </lang>
<syntaxhighlight lang="livecode ">put the number of codeunits of "Hello World" -- count of unicode characters </syntaxhighlight>


===Byte Length===
===Byte Length===
Use the 'byte' keyword in LiveCode for an accurate unicode char byte count
Use the 'byte' keyword in LiveCode for an accurate unicode char byte count
<lang LiveCode>put the number of bytes in "Hello World" </lang>
<syntaxhighlight lang="livecode">put the number of bytes in "Hello World" </syntaxhighlight>


=={{header|Logo}}==
=={{header|Logo}}==
Logo is so old that only ASCII encoding is supported. Modern versions of Logo may have enhanced character set support.
Logo is so old that only ASCII encoding is supported. Modern versions of Logo may have enhanced character set support.
<lang logo>print count "|Hello World| ; 11
<syntaxhighlight lang="logo">print count "|Hello World| ; 11
print count "møøse ; 5
print count "møøse ; 5
print char 248 ; ø - implies ISO-Latin character set</lang>
print char 248 ; ø - implies ISO-Latin character set</syntaxhighlight>


=={{header|LSE64}}==
=={{header|LSE64}}==
===Byte Length===
===Byte Length===
LSE stores strings as arrays of characters in 64-bit cells plus a count.
LSE stores strings as arrays of characters in 64-bit cells plus a count.
<lang lse64>" Hello world" @ 1 + 8 * , # 96 = (11+1)*(size of a cell) = 12*8</lang>
<syntaxhighlight lang="lse64">" Hello world" @ 1 + 8 * , # 96 = (11+1)*(size of a cell) = 12*8</syntaxhighlight>
===Character Length===
===Character Length===
LSE uses counted strings: arrays of characters, where the first cell contains the number of characters in the string.
LSE uses counted strings: arrays of characters, where the first cell contains the number of characters in the string.
<lang lse64>" Hello world" @ , # 11</lang>
<syntaxhighlight lang="lse64">" Hello world" @ , # 11</syntaxhighlight>


=={{header|Lua}}==
=={{header|Lua}}==
Line 1,917: Line 2,206:


In Lua, a character is always the size of one byte so there is no difference between byte length and character length.
In Lua, a character is always the size of one byte so there is no difference between byte length and character length.

===Byte Length===
===Byte Length===

<lang lua>str = "Hello world"
length = #str</lang>
Byte length in UTF-8:

<syntaxhighlight lang="lua">str = "Hello world"
length = #str</syntaxhighlight>


or
or


<lang lua>str = "Hello world"
<syntaxhighlight lang="lua">str = "Hello world"
length = string.len(str)</lang>
length = string.len(str)</syntaxhighlight>


===Character Length===
===Character Length===

<lang lua>str = "Hello world"
Only valid for ASCII:
length = #str</lang>

<syntaxhighlight lang="lua">str = "Hello world"
length = #str</syntaxhighlight>


or
or


<lang lua>str = "Hello world"
<syntaxhighlight lang="lua">str = "Hello world"
length = string.len(str)</lang>
length = string.len(str)</syntaxhighlight>

For Unicode strings, use the utf8 module (available since Lua 5.3):

<syntaxhighlight lang="lua">
utf8.len("møøse")
utf8.len("𝔘𝔫𝔦𝔠𝔬𝔡𝔢")
utf8.len("J̲o̲s̲é̲")
</syntaxhighlight>

{{out}}

<pre>
5
7
8
</pre>


=={{header|M2000 Interpreter}}==
=={{header|M2000 Interpreter}}==
<syntaxhighlight lang="m2000 interpreter">
<lang M2000 Interpreter>
module String_length {
A$=format$("J\u0332o\u0332s\u0332e\u0301\u0332")
A$=format$("J\u0332o\u0332s\u0332e\u0301\u0332")
Print Len(A$) = 9 ' true Utf-16LE
Print Len.Disp(A$) = 4 \\ display length
Print Len(A$) = 9 ' true Utf-16LE
Print Len.Disp(A$) = 4 \\ display length
Buffer Clear Mem as Byte*100
Buffer Clear Mem as Byte*100
\\ Write at memory at offset 0 or address Mem(0)
\\ Write at memory at offset 0 or address Mem(0)
Return Mem, 0:=A$
Print Eval$(Mem, 0, 18)
Return Mem, 0:=A$
Print Eval$(Mem, 0, 18)
For i=0 to 17 step 2
For i=0 to 17 step 2
\\ print hex value and character
\\ print hex value and character
Hex Eval(Mem, i as integer), ChrCode$(Eval(Mem, i as integer))
Hex Eval(Mem, i as integer), ChrCode$(Eval(Mem, i as integer))
Next i
Next i
Document B$=A$
Document B$=A$
\\ encode to utf-8 with BOM (3 bytes 0xEF,0xBB,0xBF)
\\ encode to utf-8 with BOM (3 bytes 0xEF,0xBB,0xBF)
Save.Doc B$, "Checklen.doc", 2
Print Filelen("Checklen.doc")=17
Save.Doc B$, "Checklen.doc", 2
Print Filelen("Checklen.doc")=17
\\ So length is 14 bytes + 3 the BOM
\\ So length is 14 bytes + 3 the BOM
</lang>
Mem=Buffer("Checklen.doc")
Print len(Mem)=17 // len works for buffers too - unit byte
// version 12 can handle strings without suffix $
C=eval$(mem, 3, 14) // from 4th byte get 14 bytes in a string
Print len(C)*2=14 ' bytes // len()) for strings return double type of words (can return 0.5)
C=string$(C as utf8dec) ' decode bytes from utf8 to utf16LE
Print len(C)=9, C=A$, Len.Disp(C)=4
Print C
Report 2, C // proportional print on console - for text center justified rendering (2 - center)
}
String_length
</syntaxhighlight>


=={{header|Maple}}==
=={{header|Maple}}==
=== Character length ===
=== Character length ===
<lang maple>length("Hello world");</lang>
<syntaxhighlight lang="maple">length("Hello world");</syntaxhighlight>
=== Byte count ===
=== Byte count ===
<lang maple>nops(convert("Hello world",bytes));</lang>
<syntaxhighlight lang="maple">nops(convert("Hello world",bytes));</syntaxhighlight>


=={{header|Mathematica}}/{{header|Wolfram Language}}==
=={{header|Mathematica}}/{{header|Wolfram Language}}==
=== Character length ===
=== Character length ===
<lang mathematica>StringLength["Hello world"]</lang>
<syntaxhighlight lang="mathematica">StringLength["Hello world"]</syntaxhighlight>
=== Byte length ===
=== Byte length ===
<lang mathematica>StringByteCount["Hello world"]</lang>
<syntaxhighlight lang="mathematica">StringByteCount["Hello world"]</syntaxhighlight>


=={{header|MATLAB}}==
=={{header|MATLAB}}==
===Character Length===
===Character Length===
<lang MATLAB>>> length('møøse')
<syntaxhighlight lang="matlab">>> length('møøse')


ans =
ans =


5</lang>
5</syntaxhighlight>
===Byte Length===
===Byte Length===
MATLAB apparently encodes strings using UTF-16.
MATLAB apparently encodes strings using UTF-16.
<lang MATLAB>>> numel(dec2hex('møøse'))
<syntaxhighlight lang="matlab">>> numel(dec2hex('møøse'))


ans =
ans =


10</lang>
10</syntaxhighlight>


=={{header|Maxima}}==
=={{header|Maxima}}==
<lang maxima>s: "the quick brown fox jumps over the lazy dog";
<syntaxhighlight lang="maxima">s: "the quick brown fox jumps over the lazy dog";
slength(s);
slength(s);
/* 43 */</lang>
/* 43 */</syntaxhighlight>


=={{header|MAXScript}}==
=={{header|MAXScript}}==
===Character Length===
===Character Length===
<lang maxscript>"Hello world".count</lang>
<syntaxhighlight lang="maxscript">"Hello world".count</syntaxhighlight>


=={{header|Mercury}}==
=={{header|Mercury}}==
Line 1,999: Line 2,323:


===Byte Length===
===Byte Length===
<lang mercury>:- module string_byte_length.
<syntaxhighlight lang="mercury">:- module string_byte_length.
:- interface.
:- interface.


Line 2,018: Line 2,342:
write_length(String, !IO):-
write_length(String, !IO):-
NumBytes = count_utf8_code_units(String),
NumBytes = count_utf8_code_units(String),
io.format("%s: %d bytes\n", [s(String), i(NumBytes)], !IO).</lang>
io.format("%s: %d bytes\n", [s(String), i(NumBytes)], !IO).</syntaxhighlight>


Output:
Output:
Line 2,029: Line 2,353:
===Character Length===
===Character Length===
The function <tt>string.count_codepoints/1</tt> returns the number of code points in a string.
The function <tt>string.count_codepoints/1</tt> returns the number of code points in a string.
<lang mercury>:- module string_character_length.
<syntaxhighlight lang="mercury">:- module string_character_length.
:- interface.
:- interface.


Line 2,048: Line 2,372:
write_length(String, !IO) :-
write_length(String, !IO) :-
NumChars = count_codepoints(String),
NumChars = count_codepoints(String),
io.format("%s: %d characters\n", [s(String), i(NumChars)], !IO).</lang>
io.format("%s: %d characters\n", [s(String), i(NumChars)], !IO).</syntaxhighlight>


Output:
Output:
Line 2,061: Line 2,385:
Metafont has no way of handling properly encodings different from ASCII. So it is able to count only the number of bytes in a string.
Metafont has no way of handling properly encodings different from ASCII. So it is able to count only the number of bytes in a string.


<lang metafont>string s;
<syntaxhighlight lang="metafont">string s;
s := "Hello Moose";
s := "Hello Moose";
show length(s); % 11 (ok)
show length(s); % 11 (ok)
s := "Hello Møøse";
s := "Hello Møøse";
show length(s); % 13 (number of bytes when the string is UTF-8 encoded,
show length(s); % 13 (number of bytes when the string is UTF-8 encoded,
% since ø takes two bytes)</lang>
% since ø takes two bytes)</syntaxhighlight>


'''Note''': in the lang tag, Møøse is Latin1-reencoded, showing up two bytes (as Latin1) instead of one
'''Note''': in the lang tag, Møøse is Latin1-reencoded, showing up two bytes (as Latin1) instead of one
Line 2,072: Line 2,396:
=={{header|MIPS Assembly}}==
=={{header|MIPS Assembly}}==
This only supports ASCII encoding, so it'll return both byte length and char length.
This only supports ASCII encoding, so it'll return both byte length and char length.
<lang mips>
<syntaxhighlight lang="mips">
.data
.data
#.asciiz automatically adds the NULL terminator character, \0 for us.
#.asciiz automatically adds the NULL terminator character, \0 for us.
Line 2,094: Line 2,418:
li $v0,10 #set syscall to cleanly exit EXIT_SUCCESS
li $v0,10 #set syscall to cleanly exit EXIT_SUCCESS
syscall
syscall
</syntaxhighlight>
</lang>


=={{header|mIRC Scripting Language}}==
=={{header|mIRC Scripting Language}}==
===Byte Length===
===Byte Length===
{{needs-review|mIRC Scripting Language}}
{{needs-review|mIRC Scripting Language}}
<lang mirc>alias stringlength { echo -a Your Name is: $len($$?="Whats your name") letters long! }</lang>
<syntaxhighlight lang="mirc">alias stringlength { echo -a Your Name is: $len($$?="Whats your name") letters long! }</syntaxhighlight>
===Character Length===
===Character Length===
{{needs-review|mIRC Scripting Language}}
{{needs-review|mIRC Scripting Language}}
''$utfdecode()'' converts a UTF-8 string to the locale encoding, with unrepresentable characters as question marks. Since mIRC is not yet fully Unicode aware, entering Unicode text through a dialog box will automatically convert it to ASCII.
''$utfdecode()'' converts a UTF-8 string to the locale encoding, with unrepresentable characters as question marks. Since mIRC is not yet fully Unicode aware, entering Unicode text through a dialog box will automatically convert it to ASCII.
<lang mirc>alias utf8len { return $len($utfdecode($1)) }
<syntaxhighlight lang="mirc">alias utf8len { return $len($utfdecode($1)) }
alias stringlength2 {
alias stringlength2 {
var %name = Børje
var %name = Børje
echo -a %name is: $utf8len(%name) characters long!
echo -a %name is: $utf8len(%name) characters long!
}</lang>
}</syntaxhighlight>


=={{header|Modula-3}}==
=={{header|Modula-3}}==
===Byte Length===
===Byte Length===
<lang modula3>MODULE ByteLength EXPORTS Main;
<syntaxhighlight lang="modula3">MODULE ByteLength EXPORTS Main;


IMPORT IO, Fmt, Text;
IMPORT IO, Fmt, Text;
Line 2,119: Line 2,443:
BEGIN
BEGIN
IO.Put("Byte length of s: " & Fmt.Int((Text.Length(s) * BYTESIZE(s))) & "\n");
IO.Put("Byte length of s: " & Fmt.Int((Text.Length(s) * BYTESIZE(s))) & "\n");
END ByteLength.</lang>
END ByteLength.</syntaxhighlight>
===Character Length===
===Character Length===
<lang modula3>MODULE StringLength EXPORTS Main;
<syntaxhighlight lang="modula3">MODULE StringLength EXPORTS Main;


IMPORT IO, Fmt, Text;
IMPORT IO, Fmt, Text;
Line 2,129: Line 2,453:
BEGIN
BEGIN
IO.Put("String length of s: " & Fmt.Int(Text.Length(s)) & "\n");
IO.Put("String length of s: " & Fmt.Int(Text.Length(s)) & "\n");
END StringLength.</lang>
END StringLength.</syntaxhighlight>


=={{header|Nemerle}}==
=={{header|Nemerle}}==
Both examples rely on .NET facilities, so they're almost identical to C#
Both examples rely on .NET facilities, so they're almost identical to C#
===Character Length===
===Character Length===
<lang Nemerle>def message = "How long am I anyways?";
<syntaxhighlight lang="nemerle">def message = "How long am I anyways?";
def charlength = message.Length;</lang>
def charlength = message.Length;</syntaxhighlight>


===Byte Length===
===Byte Length===
<lang Nemerle>using System.Text;
<syntaxhighlight lang="nemerle">using System.Text;


def message = "How long am I anyways?";
def message = "How long am I anyways?";
def bytelength = Encoding.Unicode.GetByteCount(message);</lang>
def bytelength = Encoding.Unicode.GetByteCount(message);</syntaxhighlight>


=={{header|NewLISP}}==
=={{header|NewLISP}}==
===Character Length===
===Character Length===
<lang NewLISP>(set 'Str "møøse")
<syntaxhighlight lang="newlisp">(set 'Str "møøse")
(println Str " is " (length Str) " characters long")</lang>
(println Str " is " (length Str) " characters long")</syntaxhighlight>


=={{header|Nim}}==
=={{header|Nim}}==
In Nim, <code>len</code> returns the byte length of strings, ignoring the UTF-8 encoding. When dealing with Unicode strings, the module <code>unicode</code> must be used.
<lang Nim>import strformat, unicode


===Byte Length===
var s: string = "Hello, world! ☺"


<syntaxhighlight lang="nim">
echo &"“{s}” has byte length {s.len}."
echo "møøse".len # 7
echo &"“{s}” has Unicode char length {s.runeLen}."</lang>
echo "𝔘𝔫𝔦𝔠𝔬𝔡𝔢".len # 28
echo "J̲o̲s̲é̲".len # 13
</syntaxhighlight>


===Character Length===
{{out}}

<pre>“Hello, world! ☺” has byte length 17.
<syntaxhighlight lang="nim">
“Hello, world! ☺” has Unicode char length 15.</pre>
import unicode
echo "møøse".runeLen # 5
echo "𝔘𝔫𝔦𝔠𝔬𝔡𝔢".runeLen # 7
echo "J̲o̲s̲é̲".runeLen # 8
</syntaxhighlight>

===Grapheme Length===

[https://nim-lang.org/docs/unicode.html#graphemeLen%2Cstring%2CNatural graphemeLen()] does not do what you might expect. It does not return the number of graphemes in a string; instead it returns the number of bytes belonging to the character at a given byte index of the string, including any following combining characters.


=={{header|Oberon-2}}==
=={{header|Oberon-2}}==


===Byte Length===
===Byte Length===
<lang oberon2>MODULE Size;
<syntaxhighlight lang="oberon2">MODULE Size;


IMPORT Out;
IMPORT Out;
Line 2,177: Line 2,511:
Out.LongInt(s,0);
Out.LongInt(s,0);
Out.Ln;
Out.Ln;
END Size.</lang>
END Size.</syntaxhighlight>


Output:
Output:
Line 2,185: Line 2,519:


===Character Length===
===Character Length===
<lang oberon2>MODULE Length;
<syntaxhighlight lang="oberon2">MODULE Length;


IMPORT Out, Strings;
IMPORT Out, Strings;
Line 2,198: Line 2,532:
Out.Int(l,0);
Out.Int(l,0);
Out.Ln;
Out.Ln;
END Length.</lang>
END Length.</syntaxhighlight>


Output:
Output:
Line 2,209: Line 2,543:


===Character Length===
===Character Length===
<lang objeck>
<syntaxhighlight lang="objeck">
"Foo"->Size()->PrintLine();
"Foo"->Size()->PrintLine();
</syntaxhighlight>
</lang>


===Byte Length===
===Byte Length===
<lang objeck>
<syntaxhighlight lang="objeck">
"Foo"->Size()->PrintLine();
"Foo"->Size()->PrintLine();
</syntaxhighlight>
</lang>


=={{header|Objective-C}}==
=={{header|Objective-C}}==
Line 2,226: Line 2,560:


The length method of NSString objects is not the length of that string in characters. Instead, it only gives the number of 16-bit code units used to encode a string. This is not (always) the number of Unicode characters (code points) in the string.
The length method of NSString objects is not the length of that string in characters. Instead, it only gives the number of 16-bit code units used to encode a string. This is not (always) the number of Unicode characters (code points) in the string.
<lang objc>// Return the length in characters
<syntaxhighlight lang="objc">// Return the length in characters
// XXX: does not (always) count Unicode characters (code points)!
// XXX: does not (always) count Unicode characters (code points)!
unsigned int numberOfCharacters = [@"møøse" length]; // 5</lang>
unsigned int numberOfCharacters = [@"møøse" length]; // 5</syntaxhighlight>


Since Mac OS X 10.6, CFString has methods for converting between supplementary characters and surrogate pairs. However, the easiest way to get the number of characters is probably to encode it in UTF-32 (which is a fixed-length encoding) and divide by 4:
Since Mac OS X 10.6, CFString has methods for converting between supplementary characters and surrogate pairs. However, the easiest way to get the number of characters is probably to encode it in UTF-32 (which is a fixed-length encoding) and divide by 4:
<lang objc>int realCharacterCount = [s lengthOfBytesUsingEncoding: NSUTF32StringEncoding] / 4;</lang>
<syntaxhighlight lang="objc">int realCharacterCount = [s lengthOfBytesUsingEncoding: NSUTF32StringEncoding] / 4;</syntaxhighlight>


===Byte Length===
===Byte Length===
Objective-C encodes strings in UTF-16, which represents each character with one or two 16-bit values. The length method of NSString objects returns the number of 16-bit values used to encode a string, so the number of bytes can be determined by doubling that number.
Objective-C encodes strings in UTF-16, which represents each character with one or two 16-bit values. The length method of NSString objects returns the number of 16-bit values used to encode a string, so the number of bytes can be determined by doubling that number.


<lang objc>int byteCount = [@"møøse" length] * 2; // 10</lang>
<syntaxhighlight lang="objc">int byteCount = [@"møøse" length] * 2; // 10</syntaxhighlight>


Another way to know the byte length of a string is to explicitly specify the charset we desire.
Another way to know the byte length of a string is to explicitly specify the charset we desire.


<lang objc>// Return the number of bytes depending on the encoding,
<syntaxhighlight lang="objc">// Return the number of bytes depending on the encoding,
// here explicitly UTF-8
// here explicitly UTF-8
unsigned numberOfBytes =
unsigned numberOfBytes =
[@"møøse" lengthOfBytesUsingEncoding: NSUTF8StringEncoding]; // 7</lang>
[@"møøse" lengthOfBytesUsingEncoding: NSUTF8StringEncoding]; // 7</syntaxhighlight>


=={{header|OCaml}}==
=={{header|OCaml}}==
Line 2,254: Line 2,588:


Standard OCaml strings are sequences of bytes (classically ASCII or ISO 8859-1 text), so the function String.length returns the byte length, which equals the character length in such single-byte encodings:
Standard OCaml strings are sequences of bytes (classically ASCII or ISO 8859-1 text), so the function String.length returns the byte length, which equals the character length in such single-byte encodings:
<lang ocaml>String.length "Hello world" ;;</lang>
<syntaxhighlight lang="ocaml">String.length "Hello world" ;;</syntaxhighlight>


===Character Length===
===Character Length===


When using the '''UTF8''' module of ''Camomile'', the byte length of a UTF-8 encoded string is obtained with <tt>String.length</tt> and the character length is returned by <tt>UTF8.length</tt>:
When using the '''UTF8''' module of ''Camomile'', the byte length of a UTF-8 encoded string is obtained with <tt>String.length</tt> and the character length is returned by <tt>UTF8.length</tt>:
<lang ocaml>open CamomileLibrary
<syntaxhighlight lang="ocaml">open CamomileLibrary


let () =
let () =
Printf.printf " %d\n" (String.length "møøse");
Printf.printf " %d\n" (String.length "møøse");
Printf.printf " %d\n" (UTF8.length "møøse");
Printf.printf " %d\n" (UTF8.length "møøse");
;;</lang>
;;</syntaxhighlight>


Run this code with the command:
Run this code with the command:
Line 2,271: Line 2,605:
7
7
5
5
</pre>

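Alternatively, you can use the standard library's Uchar module together with <code>String.get_utf_8_uchar</code> and <code>Uchar.utf_decode_length</code> (both available since OCaml 4.14) to do it without additional libraries.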
<syntaxhighlight lang="OCaml">
(* Count the Unicode characters (code points) in the UTF-8 encoded string [s].
   The string is walked one decoded character at a time: each step decodes
   the character starting at the current byte offset and advances by the
   number of bytes that decode consumed. *)
let utf8_length (s: String.t) =
  let total_bytes = String.length s in
  let rec walk pos chars_seen =
    if pos <> total_bytes then begin
      let decoded = String.get_utf_8_uchar s pos in
      let width = Uchar.utf_decode_length decoded in
      walk (pos + width) (chars_seen + 1)
    end else
      chars_seen
  in
  walk 0 0
;;
</syntaxhighlight>

<pre>
# utf8_length "møøse"
- : int = 5
</pre>
</pre>


=={{header|Octave}}==
=={{header|Octave}}==
<lang octave>s = "string";
<syntaxhighlight lang="octave">s = "string";
stringlen = length(s)</lang>
stringlen = length(s)</syntaxhighlight>


This gives the number of bytes, not the number of characters; for example, length("è") is 2 when "è" is encoded as UTF-8.
This gives the number of bytes, not the number of characters; for example, length("è") is 2 when "è" is encoded as UTF-8.
Line 2,288: Line 2,642:


=={{header|Ol}}==
=={{header|Ol}}==
<lang scheme>
<syntaxhighlight lang="scheme">
; Character length
; Character length
(print (string-length "Hello, wørld!"))
(print (string-length "Hello, wørld!"))
Line 2,296: Line 2,650:
(print (length (string->bytes "Hello, wørld!")))
(print (length (string->bytes "Hello, wørld!")))
; ==> 14
; ==> 14
</syntaxhighlight>
</lang>


=={{header|OpenEdge/Progress}}==
=={{header|OpenEdge/Progress}}==
Line 2,302: Line 2,656:


===Character Length===
===Character Length===
<lang progress>DEF VAR lcc AS LONGCHAR.
<syntaxhighlight lang="progress">DEF VAR lcc AS LONGCHAR.
FIX-CODEPAGE( lcc ) = "UTF-8".
FIX-CODEPAGE( lcc ) = "UTF-8".
lcc = "møøse".
lcc = "møøse".


MESSAGE LENGTH( lcc ) VIEW-AS ALERT-BOX.</lang>
MESSAGE LENGTH( lcc ) VIEW-AS ALERT-BOX.</syntaxhighlight>
===Byte Length===
===Byte Length===
<lang progress>DEF VAR lcc AS LONGCHAR.
<syntaxhighlight lang="progress">DEF VAR lcc AS LONGCHAR.
FIX-CODEPAGE( lcc ) = "UTF-8".
FIX-CODEPAGE( lcc ) = "UTF-8".
lcc = "møøse".
lcc = "møøse".


MESSAGE LENGTH( lcc, "RAW" ) VIEW-AS ALERT-BOX.</lang>
MESSAGE LENGTH( lcc, "RAW" ) VIEW-AS ALERT-BOX.</syntaxhighlight>


=={{header|Oz}}==
=={{header|Oz}}==
===Byte Length===
===Byte Length===
<lang oz>{Show {Length "Hello World"}}</lang>
<syntaxhighlight lang="oz">{Show {Length "Hello World"}}</syntaxhighlight>
Oz uses a single-byte encoding by default. So for normal strings, this will also show the correct character length.
Oz uses a single-byte encoding by default. So for normal strings, this will also show the correct character length.


Line 2,324: Line 2,678:
===Character Length===
===Character Length===
Characters = bytes in Pari; the underlying strings are C strings interpreted as US-ASCII.
Characters = bytes in Pari; the underlying strings are C strings interpreted as US-ASCII.
<lang parigp>len(s)=#s; \\ Alternately, len(s)=length(s); or even len=length;</lang>
<syntaxhighlight lang="parigp">len(s)=#s; \\ Alternately, len(s)=length(s); or even len=length;</syntaxhighlight>
===Byte Length===
===Byte Length===
This works on objects of any sort, not just strings, and includes overhead.
This works on objects of any sort, not just strings, and includes overhead.
<lang parigp>len(s)=sizebyte(s);</lang>
<syntaxhighlight lang="parigp">len(s)=sizebyte(s);</syntaxhighlight>


=={{header|Pascal}}==
=={{header|Pascal}}==
===Byte Length===
===Byte Length===
<lang pascal>
<syntaxhighlight lang="pascal">
const
const
s = 'abcdef';
s = 'abcdef';
Line 2,337: Line 2,691:
writeln (length(s))
writeln (length(s))
end.
end.
</syntaxhighlight>
</lang>
Output:
Output:
<pre>
<pre>
Line 2,349: Line 2,703:
Strings in Perl consist of characters. Measuring the byte length therefore requires conversion to some binary representation (called encoding, both noun and verb).
Strings in Perl consist of characters. Measuring the byte length therefore requires conversion to some binary representation (called encoding, both noun and verb).


<lang perl>use utf8; # so we can use literal characters like ☺ in source
<syntaxhighlight lang="perl">use utf8; # so we can use literal characters like ☺ in source
use Encode qw(encode);
use Encode qw(encode);


Line 2,356: Line 2,710:


print length encode 'UTF-16', "Hello, world! ☺";
print length encode 'UTF-16', "Hello, world! ☺";
# 32. 2 bytes for the BOM, then 15 byte pairs for each character.</lang>
# 32. 2 bytes for the BOM, then 15 byte pairs for each character.</syntaxhighlight>


===Character Length===
===Character Length===
{{works with|Perl|5.X}}
{{works with|Perl|5.X}}


<lang perl>my $length = length "Hello, world!";</lang>
<syntaxhighlight lang="perl">my $length = length "Hello, world!";</syntaxhighlight>


===Grapheme Length===
===Grapheme Length===
Line 2,369: Line 2,723:


{{works with|Perl|5.12}}
{{works with|Perl|5.12}}
<lang perl>use v5.12;
<syntaxhighlight lang="perl">use v5.12;
my $string = "\x{1112}\x{1161}\x{11ab}\x{1100}\x{1173}\x{11af}"; # 한글
my $string = "\x{1112}\x{1161}\x{11ab}\x{1100}\x{1173}\x{11af}"; # 한글
my $len;
my $len;
$len++ while ($string =~ /\X/g);
$len++ while ($string =~ /\X/g);
printf "Grapheme length: %d\n", $len;</lang>
printf "Grapheme length: %d\n", $len;</syntaxhighlight>


{{out}}
{{out}}
Line 2,381: Line 2,735:
{{libheader|Phix/basics}}
{{libheader|Phix/basics}}
The standard length function returns the number of bytes; the character length is obtained by converting to UTF-32.
The standard length function returns the number of bytes; the character length is obtained by converting to UTF-32.
<!--<lang Phix>-->
<!--<syntaxhighlight lang="phix">-->
<span style="color: #008080;">constant</span> <span style="color: #000000;">s</span> <span style="color: #0000FF;">=</span> <span style="color: #008000;">"𝔘𝔫𝔦𝔠𝔬𝔡𝔢"</span>
<span style="color: #008080;">constant</span> <span style="color: #000000;">s</span> <span style="color: #0000FF;">=</span> <span style="color: #008000;">"𝔘𝔫𝔦𝔠𝔬𝔡𝔢"</span>
<span style="color: #0000FF;">?<span style="color: #7060A8;">length<span style="color: #0000FF;">(<span style="color: #000000;">s<span style="color: #0000FF;">)</span>
<span style="color: #0000FF;">?<span style="color: #7060A8;">length<span style="color: #0000FF;">(<span style="color: #000000;">s<span style="color: #0000FF;">)</span>
<span style="color: #0000FF;">?<span style="color: #7060A8;">length<span style="color: #0000FF;">(<span style="color: #000000;">utf8_to_utf32<span style="color: #0000FF;">(<span style="color: #000000;">s<span style="color: #0000FF;">)<span style="color: #0000FF;">)
<span style="color: #0000FF;">?<span style="color: #7060A8;">length<span style="color: #0000FF;">(<span style="color: #000000;">utf8_to_utf32<span style="color: #0000FF;">(<span style="color: #000000;">s<span style="color: #0000FF;">)<span style="color: #0000FF;">)
<!--</lang>-->
<!--</syntaxhighlight>-->
{{out}}
{{out}}
<pre>
<pre>
Line 2,394: Line 2,748:
=={{header|PHP}}==
=={{header|PHP}}==
Program run on a UTF-8 Linux system:
Program run on a UTF-8 Linux system:
<lang PHP><?php
<syntaxhighlight lang="php"><?php
foreach (array('møøse', '𝔘𝔫𝔦𝔠𝔬𝔡𝔢', 'J̲o̲s̲é̲') as $s1) {
foreach (array('møøse', '𝔘𝔫𝔦𝔠𝔬𝔡𝔢', 'J̲o̲s̲é̲') as $s1) {
printf('String "%s" measured with strlen: %d mb_strlen: %s grapheme_strlen %s%s',
printf('String "%s" measured with strlen: %d mb_strlen: %s grapheme_strlen %s%s',
$s1, strlen($s1),mb_strlen($s1), grapheme_strlen($s1), PHP_EOL);
$s1, strlen($s1),mb_strlen($s1), grapheme_strlen($s1), PHP_EOL);
}
}
</syntaxhighlight>
</lang>
yields the result:
yields the result:
<pre>
<pre>
Line 2,408: Line 2,762:


=={{header|PicoLisp}}==
=={{header|PicoLisp}}==
<lang PicoLisp>(let Str "møøse"
<syntaxhighlight lang="picolisp">(let Str "møøse"
(prinl "Character Length of \"" Str "\" is " (length Str))
(prinl "Character Length of \"" Str "\" is " (length Str))
(prinl "Byte Length of \"" Str "\" is " (size Str)) )</lang>
(prinl "Byte Length of \"" Str "\" is " (size Str)) )</syntaxhighlight>
Output:
Output:
<pre>Character Length of "møøse" is 5
<pre>Character Length of "møøse" is 5
Line 2,417: Line 2,771:


=={{header|PL/I}}==
=={{header|PL/I}}==
<lang pli>declare WS widechar (13) initial ('Hello world.');
<syntaxhighlight lang="pli">declare WS widechar (13) initial ('Hello world.');
put ('Character length=', length (WS));
put ('Character length=', length (WS));
put skip list ('Byte length=', size(WS));
put skip list ('Byte length=', size(WS));
Line 2,423: Line 2,777:
declare SM graphic (13) initial ('Hello world');
declare SM graphic (13) initial ('Hello world');
put ('Character length=', length(SM));
put ('Character length=', length(SM));
put skip list ('Byte length=', size(trim(SM)));</lang>
put skip list ('Byte length=', size(trim(SM)));</syntaxhighlight>


=={{header|PL/SQL}}==
=={{header|PL/SQL}}==
Line 2,432: Line 2,786:
LENGTH4 uses UCS4 code points.
LENGTH4 uses UCS4 code points.
===Byte Length===
===Byte Length===
<lang plsql>DECLARE
<syntaxhighlight lang="plsql">DECLARE
string VARCHAR2(50) := 'Hello, world!';
string VARCHAR2(50) := 'Hello, world!';
stringlength NUMBER;
stringlength NUMBER;
BEGIN
BEGIN
stringlength := LENGTHB(string);
stringlength := LENGTHB(string);
END;</lang>
END;</syntaxhighlight>


===Character Length===
===Character Length===
<lang plsql>DECLARE
<syntaxhighlight lang="plsql">DECLARE
string VARCHAR2(50) := 'Hello, world!';
string VARCHAR2(50) := 'Hello, world!';
stringlength NUMBER;
stringlength NUMBER;
Line 2,451: Line 2,805:
ucs2length := LENGTH2(string);
ucs2length := LENGTH2(string);
ucs4length := LENGTH4(string);
ucs4length := LENGTH4(string);
END;</lang>
END;</syntaxhighlight>

=={{header|Plain English}}==
===Byte Length===
{{libheader|Plain English-output}}
Plain English does not handle Unicode, so strings return their length in bytes.
<syntaxhighlight lang="text">
To run:
Start up.
Put "møøse" into a string.
Write the string's length to the output.
Wait for the escape key.
Shut down.
</syntaxhighlight>


=={{header|Pop11}}==
=={{header|Pop11}}==
Line 2,457: Line 2,824:
Currently Pop11 supports only strings consisting of 1-byte units. Strings can carry arbitrary binary data, so the user can, for example, use UTF-8 (however, built-in procedures will treat each byte as a single character). The length function for strings returns the length in bytes:
Currently Pop11 supports only strings consisting of 1-byte units. Strings can carry arbitrary binary data, so the user can, for example, use UTF-8 (however, built-in procedures will treat each byte as a single character). The length function for strings returns the length in bytes:


<lang pop11>lvars str = 'Hello, world!';
<syntaxhighlight lang="pop11">lvars str = 'Hello, world!';
lvars len = length(str);</lang>
lvars len = length(str);</syntaxhighlight>


=={{header|PostScript}}==
=={{header|PostScript}}==
===Character Length===
===Character Length===
<lang>
<syntaxhighlight lang="text">
(Hello World) length =
(Hello World) length =
11
11
</syntaxhighlight>
</lang>


=={{header|Potion}}==
=={{header|Potion}}==
===Character Length===
===Character Length===
<lang potion>"møøse" length print
<syntaxhighlight lang="potion">"møøse" length print
"𝔘𝔫𝔦𝔠𝔬𝔡𝔢" length print
"𝔘𝔫𝔦𝔠𝔬𝔡𝔢" length print
"J̲o̲s̲é̲" length print</lang>
"J̲o̲s̲é̲" length print</syntaxhighlight>


=={{header|PowerShell}}==
=={{header|PowerShell}}==
===Character Length===
===Character Length===
<lang powershell>$s = "Hëlló Wørłð"
<syntaxhighlight lang="powershell">$s = "Hëlló Wørłð"
$s.Length</lang>
$s.Length</syntaxhighlight>
===Byte Length===
===Byte Length===
{{trans|C#}}
{{trans|C#}}


For UTF-16, which is the default in .NET and therefore PowerShell:
For UTF-16, which is the default in .NET and therefore PowerShell:
<lang powershell>$s = "Hëlló Wørłð"
<syntaxhighlight lang="powershell">$s = "Hëlló Wørłð"
[System.Text.Encoding]::Unicode.GetByteCount($s)</lang>
[System.Text.Encoding]::Unicode.GetByteCount($s)</syntaxhighlight>
For UTF-8:
For UTF-8:
<lang powershell>[System.Text.Encoding]::UTF8.GetByteCount($s)</lang>
<syntaxhighlight lang="powershell">[System.Text.Encoding]::UTF8.GetByteCount($s)</syntaxhighlight>


=={{header|PureBasic}}==
=={{header|PureBasic}}==
===Character Length===
===Character Length===
<lang PureBasic> a = Len("Hello World") ;a will be 11</lang>
<syntaxhighlight lang="purebasic"> a = Len("Hello World") ;a will be 11</syntaxhighlight>


===Byte Length===
===Byte Length===
Line 2,495: Line 2,862:
Note: The number of bytes returned does not include the terminating Null-Character of the string. The size of the Null-Character is 1 byte for Ascii and UTF8 mode and 2 bytes for Unicode mode.
Note: The number of bytes returned does not include the terminating Null-Character of the string. The size of the Null-Character is 1 byte for Ascii and UTF8 mode and 2 bytes for Unicode mode.


<lang PureBasic>a = StringByteLength("ä", #PB_UTF8) ;a will be 2
<syntaxhighlight lang="purebasic">a = StringByteLength("ä", #PB_UTF8) ;a will be 2
b = StringByteLength("ä", #PB_Ascii) ;b will be 1
b = StringByteLength("ä", #PB_Ascii) ;b will be 1
c = StringByteLength("ä", #PB_Unicode) ;c will be 2
c = StringByteLength("ä", #PB_Unicode) ;c will be 2
</syntaxhighlight>
</lang>


=={{header|Python}}==
=={{header|Python}}==
Line 2,508: Line 2,875:


For 8-bit strings, the byte length is the same as the character length:
For 8-bit strings, the byte length is the same as the character length:
<lang python>print len('ascii')
<syntaxhighlight lang="python">print len('ascii')
# 5</lang>
# 5</syntaxhighlight>


For Unicode strings, length depends on the internal encoding. Since version 2.2 Python shipped with two build options: it either uses 2 or 4 bytes per character. The internal representation is not interesting for the user.
For Unicode strings, length depends on the internal encoding. Since version 2.2 Python shipped with two build options: it either uses 2 or 4 bytes per character. The internal representation is not interesting for the user.


<lang python># The letter Alef
<syntaxhighlight lang="python"># The letter Alef
print len(u'\u05d0'.encode('utf-8'))
print len(u'\u05d0'.encode('utf-8'))
# 2
# 2
print len(u'\u05d0'.encode('iso-8859-8'))
print len(u'\u05d0'.encode('iso-8859-8'))
# 1</lang>
# 1</syntaxhighlight>


Example from the problem statement:
Example from the problem statement:
<lang python>#!/bin/env python
<syntaxhighlight lang="python">#!/bin/env python
# -*- coding: UTF-8 -*-
# -*- coding: UTF-8 -*-
s = u"møøse"
s = u"møøse"
assert len(s) == 5
assert len(s) == 5
assert len(s.encode('UTF-8')) == 7
assert len(s.encode('UTF-8')) == 7
assert len(s.encode('UTF-16-BE')) == 10 # There are 3 different UTF-16 encodings: LE and BE are little endian and big endian respectively, the third one (without suffix) adds 2 extra leading bytes: the byte-order mark (BOM).</lang>
assert len(s.encode('UTF-16-BE')) == 10 # There are 3 different UTF-16 encodings: LE and BE are little endian and big endian respectively, the third one (without suffix) adds 2 extra leading bytes: the byte-order mark (BOM).</syntaxhighlight>
====Character Length====
====Character Length====
{{works with|Python|2.4}}
{{works with|Python|2.4}}
Line 2,531: Line 2,898:
len() returns the number of code units (not code points!) in a Unicode string or plain ASCII string. On a wide build, this is the same as the number of code points, but on a narrow one it is not. Most linux distributions install the wide build by default, you can check the build at runtime with:
len() returns the number of code units (not code points!) in a Unicode string or plain ASCII string. On a wide build, this is the same as the number of code points, but on a narrow one it is not. Most linux distributions install the wide build by default, you can check the build at runtime with:


<lang python>import sys
<syntaxhighlight lang="python">import sys
sys.maxunicode # 1114111 on a wide build, 65535 on a narrow build </lang>
sys.maxunicode # 1114111 on a wide build, 65535 on a narrow build </syntaxhighlight>


To get the length of encoded string, you have to decode it first:
To get the length of encoded string, you have to decode it first:
<lang python>print len('ascii')
<syntaxhighlight lang="python">print len('ascii')
# 5
# 5
print len(u'\u05d0') # the letter Alef as unicode literal
print len(u'\u05d0') # the letter Alef as unicode literal
Line 2,542: Line 2,909:
# 1
# 1
print hex(sys.maxunicode), len(unichr(0x1F4A9))
print hex(sys.maxunicode), len(unichr(0x1F4A9))
# ('0x10ffff', 1)</lang>
# ('0x10ffff', 1)</syntaxhighlight>


On a narrow build, len() gives the wrong answer for non-BMP chars
On a narrow build, len() gives the wrong answer for non-BMP chars


<lang python>print hex(sys.maxunicode), len(unichr(0x1F4A9))
<syntaxhighlight lang="python">print hex(sys.maxunicode), len(unichr(0x1F4A9))
# ('0xffff', 2)</lang>
# ('0xffff', 2)</syntaxhighlight>


===3.x===
===3.x===
Line 2,557: Line 2,924:
You can use len() to get the length of a byte sequence.
You can use len() to get the length of a byte sequence.


<lang python>print(len(b'Hello, World!'))
<syntaxhighlight lang="python">print(len(b'Hello, World!'))
# 13</lang>
# 13</syntaxhighlight>


To get a byte sequence from a string, you have to encode it with the desired encoding:
To get a byte sequence from a string, you have to encode it with the desired encoding:


<lang python># The letter Alef
<syntaxhighlight lang="python"># The letter Alef
print(len('\u05d0'.encode())) # the default encoding is utf-8 in Python3
print(len('\u05d0'.encode())) # the default encoding is utf-8 in Python3
# 2
# 2
print(len('\u05d0'.encode('iso-8859-8')))
print(len('\u05d0'.encode('iso-8859-8')))
# 1</lang>
# 1</syntaxhighlight>


Example from the problem statement:
Example from the problem statement:
<lang python>#!/bin/env python
<syntaxhighlight lang="python">#!/bin/env python
# -*- coding: UTF-8 -*-
# -*- coding: UTF-8 -*-
s = "møøse"
s = "møøse"
Line 2,577: Line 2,944:
u="𝔘𝔫𝔦𝔠𝔬𝔡𝔢"
u="𝔘𝔫𝔦𝔠𝔬𝔡𝔢"
assert len(u.encode()) == 28
assert len(u.encode()) == 28
assert len(u.encode('UTF-16-BE')) == 28</lang>
assert len(u.encode('UTF-16-BE')) == 28</syntaxhighlight>
====Character Length====
====Character Length====


Line 2,584: Line 2,951:
Thus Python is able to avoid memory overhead when dealing with only ASCII strings, while handling correctly all codepoints in Unicode. len() returns the number of characters/codepoints:
Thus Python is able to avoid memory overhead when dealing with only ASCII strings, while handling correctly all codepoints in Unicode. len() returns the number of characters/codepoints:


<lang python>print(len("𝔘𝔫𝔦𝔠𝔬𝔡𝔢"))
<syntaxhighlight lang="python">print(len("𝔘𝔫𝔦𝔠𝔬𝔡𝔢"))
# 7</lang>
# 7</syntaxhighlight>


Until Python 3.2, however, length depended on the internal encoding, since Python shipped with two build options: it used either 2 or 4 bytes per character.
Until Python 3.2, however, length depended on the internal encoding, since Python shipped with two build options: it used either 2 or 4 bytes per character.
Line 2,591: Line 2,958:
len() returned the number of code units in a string, which could be different from the number of characters. In a narrow build, this is not a reliable way to get the number of characters. You can only easily count code points in a wide build. Most linux distributions install the wide build by default, you can check the build at runtime with:
len() returned the number of code units in a string, which could be different from the number of characters. In a narrow build, this is not a reliable way to get the number of characters. You can only easily count code points in a wide build. Most linux distributions install the wide build by default, you can check the build at runtime with:


<lang python>import sys
<syntaxhighlight lang="python">import sys
sys.maxunicode # 1114111 on a wide build, 65535 on a narrow build</lang>
sys.maxunicode # 1114111 on a wide build, 65535 on a narrow build</syntaxhighlight>


<lang python>print(len('ascii'))
<syntaxhighlight lang="python">print(len('ascii'))
# 5
# 5
print(len('\u05d0')) # the letter Alef as unicode literal
print(len('\u05d0')) # the letter Alef as unicode literal
# 1</lang>
# 1</syntaxhighlight>


To get the length of an encoded byte sequence, you have to decode it first:
To get the length of an encoded byte sequence, you have to decode it first:


<lang python>print(len(b'\xd7\x90'.decode('utf-8'))) # Alef encoded as utf-8 byte sequence
<syntaxhighlight lang="python">print(len(b'\xd7\x90'.decode('utf-8'))) # Alef encoded as utf-8 byte sequence
# 1</lang>
# 1</syntaxhighlight>


<lang python>print(hex(sys.maxunicode), len(unichr(0x1F4A9)))
<syntaxhighlight lang="python">print(hex(sys.maxunicode), len(unichr(0x1F4A9)))
# ('0x10ffff', 1)</lang>
# ('0x10ffff', 1)</syntaxhighlight>


On a narrow build, len() gives the wrong answer for non-BMP chars
On a narrow build, len() gives the wrong answer for non-BMP chars


<lang python>print(hex(sys.maxunicode), len(unichr(0x1F4A9)))
<syntaxhighlight lang="python">print(hex(sys.maxunicode), len(unichr(0x1F4A9)))
# ('0xffff', 2)</lang>
# ('0xffff', 2)</syntaxhighlight>


=={{header|R}}==
=={{header|R}}==


===Byte length===
===Byte length===
<lang rsplus>a <- "m\u00f8\u00f8se"
<syntaxhighlight lang="rsplus">a <- "m\u00f8\u00f8se"
print(nchar(a, type="bytes")) # print 7</lang>
print(nchar(a, type="bytes")) # print 7</syntaxhighlight>


===Character length===
===Character length===
<lang rsplus>print(nchar(a, type="chars")) # print 5</lang>
<syntaxhighlight lang="rsplus">print(nchar(a, type="chars")) # print 5</syntaxhighlight>


=={{header|Racket}}==
=={{header|Racket}}==


Using this definition:
Using this definition:
<lang Racket>(define str "J\u0332o\u0332s\u0332e\u0301\u0332")</lang>
<syntaxhighlight lang="racket">(define str "J\u0332o\u0332s\u0332e\u0301\u0332")</syntaxhighlight>
on the REPL, we get the following:
on the REPL, we get the following:


===Character length===
===Character length===
<lang Racket>-> (printf "str has ~a characters" (string-length str))
<syntaxhighlight lang="racket">-> (printf "str has ~a characters" (string-length str))
str has 9 characters</lang>
str has 9 characters</syntaxhighlight>


===Byte length===
===Byte length===
<lang Racket>-> (printf "str has ~a bytes in utf-8" (bytes-length (string->bytes/utf-8 str)))
<syntaxhighlight lang="racket">-> (printf "str has ~a bytes in utf-8" (bytes-length (string->bytes/utf-8 str)))
str has 14 bytes in utf-8</lang>
str has 14 bytes in utf-8</syntaxhighlight>


=={{header|Raku}}==
=={{header|Raku}}==
Line 2,639: Line 3,006:
===Byte Length===
===Byte Length===


<lang perl6>say 'møøse'.encode('UTF-8').bytes;</lang>
<syntaxhighlight lang="raku" line>say 'møøse'.encode('UTF-8').bytes;</syntaxhighlight>


===Character Length===
===Character Length===


<lang perl6>say 'møøse'.codes;</lang>
<syntaxhighlight lang="raku" line>say 'møøse'.codes;</syntaxhighlight>


===Grapheme Length===
===Grapheme Length===


<lang perl6>say 'møøse'.chars;</lang>
<syntaxhighlight lang="raku" line>say 'møøse'.chars;</syntaxhighlight>


=={{header|REBOL}}==
=={{header|REBOL}}==
Line 2,658: Line 3,025:
===Byte Length===
===Byte Length===


<lang REBOL>;; r2
<syntaxhighlight lang="rebol">;; r2
length? "møøse"
length? "møøse"


;; r3
;; r3
length? to-binary "møøse"</lang>
length? to-binary "møøse"</syntaxhighlight>


===Character length===
===Character length===


<lang REBOL>;; r3
<syntaxhighlight lang="rebol">;; r3
length? "møøse"</lang>
length? "møøse"</syntaxhighlight>


=={{header|ReScript}}==
=={{header|ReScript}}==
===Byte Length===
===Byte Length===
<lang ReScript>Js.String2.length("abcd") == 4</lang>
<syntaxhighlight lang="rescript">Js.String2.length("abcd") == 4</syntaxhighlight>


=={{header|Retro}}==
=={{header|Retro}}==
===Byte Length===
===Byte Length===
<lang Retro>'møøse s:length n:put</lang>
<syntaxhighlight lang="retro">'møøse s:length n:put</syntaxhighlight>


===Character Length===
===Character Length===
Retro does not have built-in support for Unicode, but counting of characters can be done with a small amount of effort.
Retro does not have built-in support for Unicode, but counting of characters can be done with a small amount of effort.


<lang Retro>chain: UTF8'
<syntaxhighlight lang="retro">chain: UTF8'
{{
{{
: utf+ ( $-$ )
: utf+ ( $-$ )
Line 2,694: Line 3,061:
;chain
;chain


"møøse" ^UTF8'getLength putn</lang>
"møøse" ^UTF8'getLength putn</syntaxhighlight>


=={{header|REXX}}==
=={{header|REXX}}==
Line 2,701: Line 3,068:
<br>is stored as character strings.
<br>is stored as character strings.
===Byte Length===
===Byte Length===
<lang REXX>/*REXX program displays the lengths (in bytes/characters) for various strings. */
<syntaxhighlight lang="rexx">/*REXX program displays the lengths (in bytes/characters) for various strings. */
/* 1 */ /*a handy-dandy over/under scale.*/
/* 1 */ /*a handy-dandy over/under scale.*/
/* 123456789012345 */
/* 123456789012345 */
Line 2,711: Line 3,078:
sum = 5+1 ; say 'the length of SUM is ' length(sum)
sum = 5+1 ; say 'the length of SUM is ' length(sum)
/* [↑] is, of course, 6. */
/* [↑] is, of course, 6. */
/*stick a fork in it, we're done.*/</lang>
/*stick a fork in it, we're done.*/</syntaxhighlight>
'''output'''
'''output'''
<pre>
<pre>
Line 2,724: Line 3,091:
=={{header|Ring}}==
=={{header|Ring}}==
===Character Length===
===Character Length===
<lang ring>
<syntaxhighlight lang="ring">
aString = "Welcome to the Ring Programming Language"
aString = "Welcome to the Ring Programming Language"
aStringSize = len(aString)
aStringSize = len(aString)
see "Character lenghts : " + aStringSize
see "Character lenghts : " + aStringSize
</syntaxhighlight>
</lang>


=={{header|Robotic}}==
=={{header|Robotic}}==
===Character Length===
===Character Length===
<lang robotic>
<syntaxhighlight lang="robotic">
set "$local1" to "Hello world!"
set "$local1" to "Hello world!"
* "String length: &$local1.length&"
* "String length: &$local1.length&"
end
end
</syntaxhighlight>
</lang>


Unfortunately, only character length can be retrieved in this language.
Unfortunately, only character length can be retrieved in this language.

=={{header|RPL}}==
RPL strings are all made of 8-bit characters.
"RPL" SIZE


=={{header|Ruby}}==
=={{header|Ruby}}==
UTF8 is the default encoding in Ruby.
UTF8 is the default encoding in Ruby.
===Byte Length===
===Byte Length===
<lang ruby>"J̲o̲s̲é̲".bytesize</lang>
<syntaxhighlight lang="ruby">"J̲o̲s̲é̲".bytesize</syntaxhighlight>


===Character Length===
===Character Length===
<lang ruby>"J̲o̲s̲é̲".chars.length</lang>
<syntaxhighlight lang="ruby">"J̲o̲s̲é̲".chars.length</syntaxhighlight>
===Grapheme Length===
===Grapheme Length===
<lang ruby>"J̲o̲s̲é̲".grapheme_clusters.length</lang>
<syntaxhighlight lang="ruby">"J̲o̲s̲é̲".grapheme_clusters.length</syntaxhighlight>
===Code Set Independence===
===Code Set Independence===
The next examples show the '''byte length''' and '''character length''' of "møøse" in different encodings.
The next examples show the '''byte length''' and '''character length''' of "møøse" in different encodings.
Line 2,764: Line 3,135:
! Output
! Output
|-
|-
| <lang ruby># -*- coding: iso-8859-1 -*-
| <syntaxhighlight lang="ruby"># -*- coding: iso-8859-1 -*-
s = "møøse"
s = "møøse"
puts "Byte length: %d" % s.bytesize
puts "Byte length: %d" % s.bytesize
puts "Character length: %d" % s.length</lang>
puts "Character length: %d" % s.length</syntaxhighlight>
| <pre>Byte length: 5
| <pre>Byte length: 5
Character length: 5</pre>
Character length: 5</pre>
|-
|-
| <lang ruby># -*- coding: utf-8 -*-
| <syntaxhighlight lang="ruby"># -*- coding: utf-8 -*-
s = "møøse"
s = "møøse"
puts "Byte length: %d" % s.bytesize
puts "Byte length: %d" % s.bytesize
puts "Character length: %d" % s.length</lang>
puts "Character length: %d" % s.length</syntaxhighlight>
| <pre>Byte length: 7
| <pre>Byte length: 7
Character length: 5</pre>
Character length: 5</pre>
|-
|-
| <lang ruby># -*- coding: gb18030 -*-
| <syntaxhighlight lang="ruby"># -*- coding: gb18030 -*-
s = "møøse"
s = "møøse"
puts "Byte length: %d" % s.bytesize
puts "Byte length: %d" % s.bytesize
puts "Character length: %d" % s.length</lang>
puts "Character length: %d" % s.length</syntaxhighlight>
| <pre>Byte length: 11
| <pre>Byte length: 11
Character length: 5</pre>
Character length: 5</pre>
Line 2,796: Line 3,167:
Then either <code>string.scan(/./u).size</code> or <code>string.gsub(/./u, ' ').size</code> counts the UTF-8 characters in string.
Then either <code>string.scan(/./u).size</code> or <code>string.gsub(/./u, ' ').size</code> counts the UTF-8 characters in string.


<lang ruby># -*- coding: utf-8 -*-
<syntaxhighlight lang="ruby"># -*- coding: utf-8 -*-


class String
class String
Line 2,807: Line 3,178:
s = "文字化け"
s = "文字化け"
puts "Byte length: %d" % s.bytesize
puts "Byte length: %d" % s.bytesize
puts "Character length: %d" % s.gsub(/./u, ' ').size</lang>
puts "Character length: %d" % s.gsub(/./u, ' ').size</syntaxhighlight>


=={{header|Run BASIC}}==
=={{header|Run BASIC}}==
<lang runbasic>input a$
<syntaxhighlight lang="runbasic">input a$
print len(a$)</lang>
print len(a$)</syntaxhighlight>


=={{header|Rust}}==
=={{header|Rust}}==
===Byte Length===
===Byte Length===
<lang>
<syntaxhighlight lang="text">
fn main() {
fn main() {
let s = "文字化け"; // UTF-8
let s = "文字化け"; // UTF-8
println!("Byte Length: {}", s.len());
println!("Byte Length: {}", s.len());
}
}
</syntaxhighlight>
</lang>
===Character Length===
===Character Length===
<lang rust>
<syntaxhighlight lang="rust">
fn main() {
fn main() {
let s = "文字化け"; // UTF-8
let s = "文字化け"; // UTF-8
println!("Character length: {}", s.chars().count());
println!("Character length: {}", s.chars().count());
}
}
</syntaxhighlight>
</lang>


=={{header|SAS}}==
=={{header|SAS}}==
<lang sas>data _null_;
<syntaxhighlight lang="sas">data _null_;
a="Hello, World!";
a="Hello, World!";
b=length(c);
b=length(c);
put _all_;
put _all_;
run;</lang>
run;</syntaxhighlight>


=={{header|Scala}}==
=={{header|Scala}}==
{{libheader|Scala}}
{{libheader|Scala}}
<lang scala>
<syntaxhighlight lang="scala">
object StringLength extends App {
object StringLength extends App {
val s1 = "møøse"
val s1 = "møøse"
Line 2,850: Line 3,221:
} UTF16bytes= ${s.getBytes("UTF-16LE").size}"))
} UTF16bytes= ${s.getBytes("UTF-16LE").size}"))
}
}
</syntaxhighlight>
</lang>
{{out}}
{{out}}
<pre>The string: møøse, characterlength= 5 UTF8bytes= 7 UTF16bytes= 10
<pre>The string: møøse, characterlength= 5 UTF8bytes= 7 UTF16bytes= 10
Line 2,860: Line 3,231:
{{works_with|Gauche|0.8.7 [utf-8,pthreads]}}
{{works_with|Gauche|0.8.7 [utf-8,pthreads]}}
The '''string-size''' function is specific to Gauche.
The '''string-size''' function is specific to Gauche.
<lang scheme>(string-size "Hello world")</lang>
<syntaxhighlight lang="scheme">(string-size "Hello world")</syntaxhighlight>


{{works with|PLT Scheme|4.2.4}}
{{works with|PLT Scheme|4.2.4}}
<lang scheme>(bytes-length #"Hello world")</lang>
<syntaxhighlight lang="scheme">(bytes-length #"Hello world")</syntaxhighlight>


===Character Length===
===Character Length===
{{works_with|Gauche|0.8.7 [utf-8,pthreads]}}
{{works_with|Gauche|0.8.7 [utf-8,pthreads]}}
The '''string-length''' function is defined in [[R5RS]] and [[R6RS]].
The '''string-length''' function is defined in [[R5RS]] and [[R6RS]].
<lang scheme> (string-length "Hello world")</lang>
<syntaxhighlight lang="scheme"> (string-length "Hello world")</syntaxhighlight>


=={{header|sed}}==
=={{header|sed}}==
Line 2,875: Line 3,246:
Text is read from standard input e.g. <code>echo "string" | sed -f script.sed</code> or <code>sed -f script.sed file.txt</code> (The solution given would be the contents of a text file <code>script.sed</code> in these cases).
Text is read from standard input e.g. <code>echo "string" | sed -f script.sed</code> or <code>sed -f script.sed file.txt</code> (The solution given would be the contents of a text file <code>script.sed</code> in these cases).
For files with more than one line, sed will give a count for each line.
For files with more than one line, sed will give a count for each line.
<syntaxhighlight lang="sed"># create unary numeral (i = 1)
The 'convert to digits' section is based off of [http://unix.stackexchange.com/a/36959/11750 this StackExchange answer].
s/./i/g
<lang sed># Change all characters to '|'.
:loop
s/./\|/g;
# divide by 10 (x = 10)

s/i\{10\}/x/g
# Convert to digits
# convert remainder to decimal digit
:convert
/i/!s/[0-9]*$/0&/
s/||||||||||/</g
s/<\([0-9]*\)$/<0\1/g
s/i\{9\}/9/
s/|||||||||/9/g;
s/i\{8\}/8/
s/i\{7\}/7/
s/|||||||||/9/g; s/||||||||/8/g; s/|||||||/7/g; s/||||||/6/g;
s/i\{6\}/6/
s/|||||/5/g; s/||||/4/g; s/|||/3/g; s/||/2/g; s/|/1/g;
s/</|/g
s/iiiii/5/
s/iiii/4/
t convert
s/^$/0/</lang>
s/iii/3/
s/ii/2/
s/i/1/
# convert quotient (10s) to 1s
y/x/i/
# start over for the next magnitude (if any)
/i/b loop</syntaxhighlight>


=={{header|Seed7}}==
=={{header|Seed7}}==
===Character Length===
===Character Length===
<lang seed7>length("Hello, world!")</lang>
<syntaxhighlight lang="seed7">length("Hello, world!")</syntaxhighlight>


=={{header|SETL}}==
=={{header|SETL}}==
===Character Length===
===Character Length===
<lang haskell>print(# "Hello, world!"); -- '#' is the cardinality operator. Works on strings, tuples, and sets.</lang>
<syntaxhighlight lang="haskell">print(# "Hello, world!"); -- '#' is the cardinality operator. Works on strings, tuples, and sets.</syntaxhighlight>


=={{header|Sidef}}==
=={{header|Sidef}}==


<lang ruby>var str = "J\x{332}o\x{332}s\x{332}e\x{301}\x{332}";</lang>
<syntaxhighlight lang="ruby">var str = "J\x{332}o\x{332}s\x{332}e\x{301}\x{332}";</syntaxhighlight>


===Byte Length===
===Byte Length===
UTF-8 byte length (default):
UTF-8 byte length (default):
<lang ruby>say str.bytes.len; #=> 14</lang>
<syntaxhighlight lang="ruby">say str.bytes.len; #=> 14</syntaxhighlight>


UTF-16 byte length:
UTF-16 byte length:
<lang ruby>say str.encode('UTF-16').bytes.len; #=> 20</lang>
<syntaxhighlight lang="ruby">say str.encode('UTF-16').bytes.len; #=> 20</syntaxhighlight>


===Character Length===
===Character Length===
<lang ruby>say str.chars.len; #=> 9</lang>
<syntaxhighlight lang="ruby">say str.chars.len; #=> 9</syntaxhighlight>


===Grapheme Length===
===Grapheme Length===
<lang ruby>say str.graphs.len; #=> 4</lang>
<syntaxhighlight lang="ruby">say str.graphs.len; #=> 4</syntaxhighlight>


=={{header|Simula}}==
=={{header|Simula}}==
Line 2,927: Line 3,304:
</pre>
</pre>
===Byte Length===
===Byte Length===
<lang simula>BEGIN
<syntaxhighlight lang="simula">BEGIN
TEXT LINE;
TEXT LINE;
WHILE NOT LASTITEM DO
WHILE NOT LASTITEM DO
Line 2,941: Line 3,318:
END;
END;
END.
END.
</syntaxhighlight>
</lang>
{{out}}
{{out}}
<pre>
<pre>
Line 2,952: Line 3,329:
===Character Length===
===Character Length===
To calculate the character length, one can do it manually:
To calculate the character length, one can do it manually:
<lang simula>BEGIN
<syntaxhighlight lang="simula">BEGIN


! NUMBER OF UTF8 CHARACTERS IN STRING ;
! NUMBER OF UTF8 CHARACTERS IN STRING ;
Line 2,997: Line 3,374:
END;
END;


END.</lang>
END.</syntaxhighlight>
{{out}}
{{out}}
<pre>"møøse" CHARACTER LENGTH = 5
<pre>"møøse" CHARACTER LENGTH = 5
Line 3,006: Line 3,383:


=={{header|Slate}}==
=={{header|Slate}}==
<lang slate>'Hello, world!' length.</lang>
<syntaxhighlight lang="slate">'Hello, world!' length.</syntaxhighlight>

=={{header|Slope}}==

=== Character Length ===
<syntaxhighlight lang="slope">(length "møøse")</syntaxhighlight>
=== Byte Length ===
<syntaxhighlight lang="slope">(length (string->bytes "møøse"))</syntaxhighlight>


=={{header|Smalltalk}}==
=={{header|Smalltalk}}==
Line 3,012: Line 3,396:


{{works with|Smalltalk/X}}
{{works with|Smalltalk/X}}
<lang smalltalk>'hello' size -> 5
<syntaxhighlight lang="smalltalk">'hello' size -> 5
'hello' utf8Encoded size -> 5
'hello' utf8Encoded size -> 5
'hello' utf8Encoded asByteArray -> #[104 101 108 108 111]
'hello' utf8Encoded asByteArray -> #[104 101 108 108 111]
Line 3,026: Line 3,410:
'𝔘𝔫𝔦𝔠𝔬𝔡𝔢' utf8Encoded asByteArray -> #[240 157 148 152 240 157 148 171 240 157 148 166 240 157 148 160 240 157 148 172 240 157 148 161 240 157 148 162]
'𝔘𝔫𝔦𝔠𝔬𝔡𝔢' utf8Encoded asByteArray -> #[240 157 148 152 240 157 148 171 240 157 148 166 240 157 148 160 240 157 148 172 240 157 148 161 240 157 148 162]
'𝔘𝔫𝔦𝔠𝔬𝔡𝔢' utf16Encoded size -> 14
'𝔘𝔫𝔦𝔠𝔬𝔡𝔢' utf16Encoded size -> 14
'𝔘𝔫𝔦𝔠𝔬𝔡𝔢' utf8Encoded asWordArray -> WordArray(55349 56600 55349 56619 55349 56614 55349 56608 55349 56620 55349 56609 55349 56610)</lang>
'𝔘𝔫𝔦𝔠𝔬𝔡𝔢' utf8Encoded asWordArray -> WordArray(55349 56600 55349 56619 55349 56614 55349 56608 55349 56620 55349 56609 55349 56610)</syntaxhighlight>


===Byte Length===
===Byte Length===
{{works with|GNU Smalltalk}}
{{works with|GNU Smalltalk}}
<lang smalltalk>string := 'Hello, world!'.
<syntaxhighlight lang="smalltalk">string := 'Hello, world!'.
string size.</lang>
string size.</syntaxhighlight>
===Character Length===
===Character Length===
{{works with|GNU Smalltalk}}
{{works with|GNU Smalltalk}}
<lang smalltalk>string := 'Hello, world!'.
<syntaxhighlight lang="smalltalk">string := 'Hello, world!'.
string numberOfCharacters.</lang>
string numberOfCharacters.</syntaxhighlight>


requires loading the Iconv package:
requires loading the Iconv package:


<lang smalltalk>PackageLoader fileInPackage: 'Iconv'</lang>
<syntaxhighlight lang="smalltalk">PackageLoader fileInPackage: 'Iconv'</syntaxhighlight>


=={{header|SNOBOL4}}==
=={{header|SNOBOL4}}==
===Byte Length ===
===Byte Length ===
<lang snobol4>
<syntaxhighlight lang="snobol4">
output = "Byte length: " size(trim(input))
output = "Byte length: " size(trim(input))
end
end
</syntaxhighlight>
</lang>


===Character Length ===
===Character Length ===
The example works AFAIK only with CSnobol4 by Phil Budne
The example works AFAIK only with CSnobol4 by Phil Budne
<lang snobol4>
<syntaxhighlight lang="snobol4">
-include "utf.sno"
-include "utf.sno"
output = "Char length: " utfsize(trim(input))
output = "Char length: " utfsize(trim(input))
end
end
</syntaxhighlight>
</lang>


=={{header|Sparkling}}==
=={{header|Sparkling}}==
===Byte length===
===Byte length===
<lang Sparkling>spn:1> sizeof "Hello, wørld!"
<syntaxhighlight lang="sparkling">spn:1> sizeof "Hello, wørld!"
= 14</lang>
= 14</syntaxhighlight>


=={{header|SPL}}==
=={{header|SPL}}==
Line 3,065: Line 3,449:
All strings in SPL are Unicode. See code below.
All strings in SPL are Unicode. See code below.
===Character Length===
===Character Length===
<lang spl>t = ["abc","J̲o̲s̲é̲","møøse","𝔘𝔫𝔦𝔠𝔬𝔡𝔢"]
<syntaxhighlight lang="spl">t = ["abc","J̲o̲s̲é̲","møøse","𝔘𝔫𝔦𝔠𝔬𝔡𝔢"]


> i, 1..#.size(t,1)
> i, 1..#.size(t,1)
Line 3,088: Line 3,472:
<
<
#.output(s)
#.output(s)
<</lang>
<</syntaxhighlight>
{{out}}
{{out}}
<pre>
<pre>
Line 3,121: Line 3,505:
{{works with|Db2 LUW}}
{{works with|Db2 LUW}}
With SQL only:
With SQL only:
<lang sql pl>
<syntaxhighlight lang="sql pl">
VALUES LENGTH('møøse', CODEUNITS16);
VALUES LENGTH('møøse', CODEUNITS16);
VALUES LENGTH('møøse', CODEUNITS32);
VALUES LENGTH('møøse', CODEUNITS32);
Line 3,137: Line 3,521:
VALUES LENGTH2('J̲o̲s̲é̲');
VALUES LENGTH2('J̲o̲s̲é̲');
VALUES LENGTH4('J̲o̲s̲é̲');
VALUES LENGTH4('J̲o̲s̲é̲');
</syntaxhighlight>
</lang>
Output:
Output:
<pre>
<pre>
Line 3,252: Line 3,636:
{{works with|Db2 LUW}}
{{works with|Db2 LUW}}
With SQL only:
With SQL only:
<lang sql pl>
<syntaxhighlight lang="sql pl">
VALUES LENGTH('møøse');
VALUES LENGTH('møøse');
VALUES LENGTHB('møøse');
VALUES LENGTHB('møøse');
Line 3,259: Line 3,643:
VALUES LENGTH('J̲o̲s̲é̲');
VALUES LENGTH('J̲o̲s̲é̲');
VALUES LENGTHB('J̲o̲s̲é̲');
VALUES LENGTHB('J̲o̲s̲é̲');
</syntaxhighlight>
</lang>
Output:
Output:
<pre>
<pre>
Line 3,313: Line 3,697:
{{works with|Moscow ML|2.01}}
{{works with|Moscow ML|2.01}}
{{works with|MLton|20061107}}
{{works with|MLton|20061107}}
<lang sml>val strlen = size "Hello, world!";</lang>
<syntaxhighlight lang="sml">val strlen = size "Hello, world!";</syntaxhighlight>
===Character Length===
===Character Length===
{{works with|Standard ML of New Jersey|SML/NJ|110.74}}
{{works with|Standard ML of New Jersey|SML/NJ|110.74}}
<lang sml>val strlen = UTF8.size "Hello, world!";</lang>
<syntaxhighlight lang="sml">val strlen = UTF8.size "Hello, world!";</syntaxhighlight>


=={{header|Stata}}==
=={{header|Stata}}==
Line 3,322: Line 3,706:
Use '''[https://www.stata.com/help.cgi?f_strlen strlen]''' for byte length, and '''[https://www.stata.com/help.cgi?f_ustrlen ustrlen]''' for the number of Unicode characters in a string.
Use '''[https://www.stata.com/help.cgi?f_strlen strlen]''' for byte length, and '''[https://www.stata.com/help.cgi?f_ustrlen ustrlen]''' for the number of Unicode characters in a string.


<lang stata>scalar s="Ἐν ἀρχῇ ἐποίησεν ὁ θεὸς τὸν οὐρανὸν καὶ τὴν γῆν"
<syntaxhighlight lang="stata">scalar s="Ἐν ἀρχῇ ἐποίησεν ὁ θεὸς τὸν οὐρανὸν καὶ τὴν γῆν"


di strlen(s)
di strlen(s)
Line 3,328: Line 3,712:


di ustrlen(s)
di ustrlen(s)
47</lang>
47</syntaxhighlight>

=={{header|Stringle}}==
The only current implementation of Stringle uses 8-bit character sets, meaning character length and byte length are always the same.

This prints the length of a string from input:

<syntaxhighlight lang="stringle">$ #$</syntaxhighlight>


=={{header|Swift}}==
=={{header|Swift}}==
Line 3,337: Line 3,728:
To count "characters" (Unicode grapheme clusters):
To count "characters" (Unicode grapheme clusters):
{{works with|Swift|2.x}}
{{works with|Swift|2.x}}
<lang swift>let numberOfCharacters = "møøse".characters.count // 5</lang>
<syntaxhighlight lang="swift">let numberOfCharacters = "møøse".characters.count // 5</syntaxhighlight>
{{works with|Swift|1.2}}
{{works with|Swift|1.2}}
<lang swift>let numberOfCharacters = count("møøse") // 5</lang>
<syntaxhighlight lang="swift">let numberOfCharacters = count("møøse") // 5</syntaxhighlight>
{{works with|Swift|1.0-1.1}}
{{works with|Swift|1.0-1.1}}
<lang swift>let numberOfCharacters = countElements("møøse") // 5</lang>
<syntaxhighlight lang="swift">let numberOfCharacters = countElements("møøse") // 5</syntaxhighlight>


===Character Length===
===Character Length===
To count Unicode code points:
To count Unicode code points:
{{works with|Swift|2.x}}
{{works with|Swift|2.x}}
<lang swift>let numberOfCodePoints = "møøse".unicodeScalars.count // 5</lang>
<syntaxhighlight lang="swift">let numberOfCodePoints = "møøse".unicodeScalars.count // 5</syntaxhighlight>
{{works with|Swift|1.2}}
{{works with|Swift|1.2}}
<lang swift>let numberOfCodePoints = count("møøse".unicodeScalars) // 5</lang>
<syntaxhighlight lang="swift">let numberOfCodePoints = count("møøse".unicodeScalars) // 5</syntaxhighlight>
{{works with|Swift|1.0-1.1}}
{{works with|Swift|1.0-1.1}}
<lang swift>let numberOfCodePoints = countElements("møøse".unicodeScalars) // 5</lang>
<syntaxhighlight lang="swift">let numberOfCodePoints = countElements("møøse".unicodeScalars) // 5</syntaxhighlight>


===Byte Length===
===Byte Length===
Line 3,357: Line 3,748:
For length in UTF-8, count the number of UTF-8 code units:
For length in UTF-8, count the number of UTF-8 code units:
{{works with|Swift|2.x}}
{{works with|Swift|2.x}}
<lang swift>let numberOfBytesUTF8 = "møøse".utf8.count // 7</lang>
<syntaxhighlight lang="swift">let numberOfBytesUTF8 = "møøse".utf8.count // 7</syntaxhighlight>
{{works with|Swift|1.2}}
{{works with|Swift|1.2}}
<lang swift>let numberOfBytesUTF8 = count("møøse".utf8) // 7</lang>
<syntaxhighlight lang="swift">let numberOfBytesUTF8 = count("møøse".utf8) // 7</syntaxhighlight>
{{works with|Swift|1.0-1.1}}
{{works with|Swift|1.0-1.1}}
<lang swift>let numberOfBytesUTF8 = countElements("møøse".utf8) // 7</lang>
<syntaxhighlight lang="swift">let numberOfBytesUTF8 = countElements("møøse".utf8) // 7</syntaxhighlight>


For length in UTF-16, count the number of UTF-16 code units, and multiply by 2:
For length in UTF-16, count the number of UTF-16 code units, and multiply by 2:
{{works with|Swift|2.x}}
{{works with|Swift|2.x}}
<lang swift>let numberOfBytesUTF16 = "møøse".utf16.count * 2 // 10</lang>
<syntaxhighlight lang="swift">let numberOfBytesUTF16 = "møøse".utf16.count * 2 // 10</syntaxhighlight>
{{works with|Swift|1.2}}
{{works with|Swift|1.2}}
<lang swift>let numberOfBytesUTF16 = count("møøse".utf16) * 2 // 10</lang>
<syntaxhighlight lang="swift">let numberOfBytesUTF16 = count("møøse".utf16) * 2 // 10</syntaxhighlight>
{{works with|Swift|1.0-1.1}}
{{works with|Swift|1.0-1.1}}
<lang swift>let numberOfBytesUTF16 = countElements("møøse".utf16) * 2 // 10</lang>
<syntaxhighlight lang="swift">let numberOfBytesUTF16 = countElements("møøse".utf16) * 2 // 10</syntaxhighlight>


=={{header|Symsyn}}==
=={{header|Symsyn}}==
===Byte Length===
===Byte Length===
<lang symsyn>
<syntaxhighlight lang="symsyn">
c : 'abcdefgh'
c : 'abcdefgh'
#c []
#c []
</syntaxhighlight>
</lang>
Output:
Output:
<pre>
<pre>
Line 3,385: Line 3,776:
===Byte Length===
===Byte Length===
Formally, Tcl does not guarantee to use any particular representation for its strings internally (the underlying implementation objects can hold strings in at least three different formats, mutating between them as necessary) so the way to calculate the "byte length" of a string can only be done with respect to some user-selected encoding. This is done this way (for UTF-8):
Formally, Tcl does not guarantee to use any particular representation for its strings internally (the underlying implementation objects can hold strings in at least three different formats, mutating between them as necessary) so the way to calculate the "byte length" of a string can only be done with respect to some user-selected encoding. This is done this way (for UTF-8):
<lang tcl>string length [encoding convertto utf-8 $theString]</lang>
<syntaxhighlight lang="tcl">string length [encoding convertto utf-8 $theString]</syntaxhighlight>
<!-- Yes, there's <tt>string bytelength</tt>; don't use it. It's deeply wrong-headed and will probably go away in future releases. [[DKF]] -->
<!-- Yes, there's <tt>string bytelength</tt>; don't use it. It's deeply wrong-headed and will probably go away in future releases. [[DKF]] -->
Thus, we have these examples:
Thus, we have these examples:
<lang tcl>set s1 "hello, world"
<syntaxhighlight lang="tcl">set s1 "hello, world"
set s2 "\u304A\u306F\u3088\u3046"
set s2 "\u304A\u306F\u3088\u3046"
set enc utf-8
set enc utf-8
Line 3,394: Line 3,785:
$s1 [string length [encoding convertto $enc $s1]]]
$s1 [string length [encoding convertto $enc $s1]]]
puts [format "length of \"%s\" in bytes is %d" \
puts [format "length of \"%s\" in bytes is %d" \
$s2 [string length [encoding convertto $enc $s2]]]</lang>
$s2 [string length [encoding convertto $enc $s2]]]</syntaxhighlight>


===Character Length===
===Character Length===
Basic version:
Basic version:


<lang tcl>string length "Hello, world!"</lang>
<syntaxhighlight lang="tcl">string length "Hello, world!"</syntaxhighlight>


or more elaborately, needs '''Interpreter''' any 8.X. Tested on 8.4.12.
or more elaborately, needs '''Interpreter''' any 8.X. Tested on 8.4.12.


<lang tcl>fconfigure stdout -encoding utf-8; #So that Unicode string will print correctly
<syntaxhighlight lang="tcl">fconfigure stdout -encoding utf-8; #So that Unicode string will print correctly
set s1 "hello, world"
set s1 "hello, world"
set s2 "\u304A\u306F\u3088\u3046"
set s2 "\u304A\u306F\u3088\u3046"
puts [format "length of \"%s\" in characters is %d" $s1 [string length $s1]]
puts [format "length of \"%s\" in characters is %d" $s1 [string length $s1]]
puts [format "length of \"%s\" in characters is %d" $s2 [string length $s2]]</lang>
puts [format "length of \"%s\" in characters is %d" $s2 [string length $s2]]</syntaxhighlight>


=={{header|TI-89 BASIC}}==
=={{header|TI-89 BASIC}}==
Line 3,413: Line 3,804:
The TI-89 uses a fixed 8-bit encoding so there is no difference between character length and byte length.
The TI-89 uses a fixed 8-bit encoding so there is no difference between character length and byte length.


<lang ti89b>■ dim("møøse") 5</lang>
<syntaxhighlight lang="ti89b">■ dim("møøse") 5</syntaxhighlight>


=={{header|Toka}}==
=={{header|Toka}}==
===Byte Length===
===Byte Length===
<lang toka>" hello, world!" string.getLength</lang>
<syntaxhighlight lang="toka">" hello, world!" string.getLength</syntaxhighlight>


=={{header|Trith}}==
=={{header|Trith}}==
===Character Length===
===Character Length===
<lang trith>"møøse" length</lang>
<syntaxhighlight lang="trith">"møøse" length</syntaxhighlight>
===Byte Length===
===Byte Length===
<lang trith>"møøse" size</lang>
<syntaxhighlight lang="trith">"møøse" size</syntaxhighlight>


=={{header|TUSCRIPT}}==
=={{header|TUSCRIPT}}==
===Character Length ===
===Character Length ===
<lang tuscript>
<syntaxhighlight lang="tuscript">
$$ MODE TUSCRIPT
$$ MODE TUSCRIPT
string="hello, world"
string="hello, world"
l=LENGTH (string)
l=LENGTH (string)
PRINT "character length of string '",string,"': ",l
PRINT "character length of string '",string,"': ",l
</syntaxhighlight>
</lang>
Output:
Output:
<pre>
<pre>
Line 3,439: Line 3,830:


=={{header|UNIX Shell}}==
=={{header|UNIX Shell}}==
===Byte Length===
====Byte length via external utility:====
====With external utility:====


{{works with|Bourne Shell}}
{{works with|Bourne Shell}}
<lang bash>string='Hello, world!'
<syntaxhighlight lang="bash">string='Hello, world!'
length=`expr "x$string" : '.*' - 1`
length=`expr "x$string" : '.*' - 1`
echo $length # if you want it printed to the terminal</lang>
echo $length # if you want it printed to the terminal</syntaxhighlight>


====With [[Unix|SUSv3]] parameter expansion modifier:====
====With [[Unix|SUSv3]] parameter expansion modifier:====

This returns the byte count in ash/dash, but the character count in bash, ksh, and zsh:


{{works with|Almquist SHell}}
{{works with|Almquist SHell}}
{{works with|Bourne Again SHell|3.2}}
{{works with|Bourne Again SHell}}
{{works with|pdksh|5.2.14 99/07/13.2}}
{{works with|Korn Shell|93}}
{{works with|Z SHell}}
{{works with|Z SHell}}
<lang bash>string='Hello, world!'
<syntaxhighlight lang="bash">string='Hello, world!'
length="${#string}"
length=${#string}
echo $length # if you want it printed to the terminal</lang>
echo $length # if you want it printed to the terminal</syntaxhighlight>


=={{header|Vala}}==
=={{header|Vala}}==
===Character Length===
===Character Length===
<lang vala>
<syntaxhighlight lang="vala">
string s = "Hello, world!";
string s = "Hello, world!";
int characterLength = s.length;
int characterLength = s.length;
</syntaxhighlight>
</lang>


=={{header|VBA}}==
=={{header|VBA}}==
Line 3,469: Line 3,861:
=={{header|VBScript}}==
=={{header|VBScript}}==
===Byte Length===
===Byte Length===
<lang vbscript>LenB(string|varname)</lang>
<syntaxhighlight lang="vbscript">LenB(string|varname)</syntaxhighlight>


Returns the number of bytes required to store a string in memory. Returns null if string|varname is null.
Returns the number of bytes required to store a string in memory. Returns null if string|varname is null.
===Character Length===
===Character Length===
<lang vbscript>Len(string|varname)</lang>
<syntaxhighlight lang="vbscript">Len(string|varname)</syntaxhighlight>


Returns the length of the string|varname . Returns null if string|varname is null.
Returns the length of the string|varname . Returns null if string|varname is null.
Line 3,490: Line 3,882:
One method of Encoding returns the number of bytes required to encode a .NET string in that encoding (encoding objects can be obtained through readonly static [Shared in VB.NET] properties of the Encoding class).
One method of Encoding returns the number of bytes required to encode a .NET string in that encoding (encoding objects can be obtained through readonly static [Shared in VB.NET] properties of the Encoding class).


<lang vbnet>Module ByteLength
<syntaxhighlight lang="vbnet">Module ByteLength
Function GetByteLength(s As String, encoding As Text.Encoding) As Integer
Function GetByteLength(s As String, encoding As Text.Encoding) As Integer
Return encoding.GetByteCount(s)
Return encoding.GetByteCount(s)
End Function
End Function
End Module</lang>
End Module</syntaxhighlight>


====Character Length====
====Character Length====
Line 3,502: Line 3,894:
An alternative implementation is to count the number of UTF-16 surrogate pairs in a string and subtract that number from the number of UTF-16 code units in the string.
An alternative implementation is to count the number of UTF-16 surrogate pairs in a string and subtract that number from the number of UTF-16 code units in the string.


<lang vbnet>Module CharacterLength
<syntaxhighlight lang="vbnet">Module CharacterLength
Function GetUTF16CodeUnitsLength(s As String) As Integer
Function GetUTF16CodeUnitsLength(s As String) As Integer
Return s.Length
Return s.Length
Line 3,521: Line 3,913:
Return GetByteLength(s, Text.Encoding.UTF32) \ 4
Return GetByteLength(s, Text.Encoding.UTF32) \ 4
End Function
End Function
End Module</lang>
End Module</syntaxhighlight>


====Grapheme Length====
====Grapheme Length====
Line 3,527: Line 3,919:
<code>System.Globalization.StringInfo</code> provides a means of enumerating the text elements of a string, where each "text element" is a Unicode grapheme.
<code>System.Globalization.StringInfo</code> provides a means of enumerating the text elements of a string, where each "text element" is a Unicode grapheme.


<lang vbnet>Module GraphemeLength
<syntaxhighlight lang="vbnet">Module GraphemeLength
' Wraps an IEnumerator, allowing it to be used as an IEnumerable.
' Wraps an IEnumerator, allowing it to be used as an IEnumerable.
Private Iterator Function AsEnumerable(enumerator As IEnumerator) As IEnumerable
Private Iterator Function AsEnumerable(enumerator As IEnumerator) As IEnumerable
Line 3,539: Line 3,931:
Return AsEnumerable(elements).OfType(Of String).Count()
Return AsEnumerable(elements).OfType(Of String).Count()
End Function
End Function
End Module</lang>
End Module</syntaxhighlight>


====Test Code====
====Test Code====
Line 3,545: Line 3,937:
The compiler constant <code>PRINT_TESTCASE</code> toggles whether to write the contents of each test case to the console; disable for inputs that may mess with the console.
The compiler constant <code>PRINT_TESTCASE</code> toggles whether to write the contents of each test case to the console; disable for inputs that may mess with the console.


<lang vbnet>#Const PRINT_TESTCASE = True
<syntaxhighlight lang="vbnet">#Const PRINT_TESTCASE = True


Module Program
Module Program
Line 3,583: Line 3,975:


End Sub
End Sub
End Module</lang>
End Module</syntaxhighlight>


{{out}}
{{out}}
Line 3,630: Line 4,022:
bytes (UTF-16) 18
bytes (UTF-16) 18
bytes (UTF-32) 36
bytes (UTF-32) 36
</pre>+

=={{header|V (Vlang)}}==
{{trans|go}}
====Byte Length====
<syntaxhighlight lang="v (vlang)">fn main() {
m := "møøse"
u := "𝔘𝔫𝔦𝔠𝔬𝔡𝔢"
j := "J̲o̲s̲é̲"
println("$m.len $m ${m.bytes()}")
println("$u.len $u ${u.bytes()}")
println("$j.len $j ${j.bytes()}")
}</syntaxhighlight>
Output:
<pre>
7 møøse [m, 0xc3, 0xb8, 0xc3, 0xb8, s, e]
28 𝔘𝔫𝔦𝔠𝔬𝔡𝔢 [0xf0, 0x9d, 0x94, 0x98, 0xf0, 0x9d, 0x94, 0xab, 0xf0, 0x9d, 0x94, 0xa6, 0xf0, 0x9d, 0x94, 0xa0, 0xf0, 0x9d, 0x94, 0xac, 0xf0, 0x9d, 0x94, 0xa1, 0xf0, 0x9d, 0x94, 0xa2]
13 J̲o̲s̲é̲ [J, 0xcc, 0xb2, o, 0xcc, 0xb2, s, 0xcc, 0xb2, 0xc3, 0xa9, 0xcc, 0xb2]
</pre>
====Character Length====
<syntaxhighlight lang="v (vlang)">fn main() {
m := "møøse"
u := "𝔘𝔫𝔦𝔠𝔬𝔡𝔢"
j := "J̲o̲s̲é̲"
println("$m.runes().len $m ${m.runes()}")
println("$u.runes().len $u ${u.runes()}")
println("$j.runes().len $j ${j.runes()}")
}</syntaxhighlight>
Output:
<pre>
5 møøse [`m`, `ø`, `ø`, `s`, `e`]
7 𝔘𝔫𝔦𝔠𝔬𝔡𝔢 [`𝔘`, `𝔫`, `𝔦`, `𝔠`, `𝔬`, `𝔡`, `𝔢`]
8 J̲o̲s̲é̲ [`J`, `̲`, `o`, `̲`, `s`, `̲`, `é`, `̲`]
</pre>
</pre>


=={{header|Wren}}==
=={{header|Wren}}==
===Byte Length===
===Byte Length===
<lang ecmascript>System.print("møøse".bytes.count)
<syntaxhighlight lang="wren">System.print("møøse".bytes.count)
System.print("𝔘𝔫𝔦𝔠𝔬𝔡𝔢".bytes.count)
System.print("𝔘𝔫𝔦𝔠𝔬𝔡𝔢".bytes.count)
System.print("J̲o̲s̲é̲".bytes.count)</lang>
System.print("J̲o̲s̲é̲".bytes.count)</syntaxhighlight>


{{out}}
{{out}}
Line 3,646: Line 4,071:


===Character Length===
===Character Length===
<lang ecmascript>System.print("møøse".count)
<syntaxhighlight lang="wren">System.print("møøse".count)
System.print("𝔘𝔫𝔦𝔠𝔬𝔡𝔢".count)
System.print("𝔘𝔫𝔦𝔠𝔬𝔡𝔢".count)
System.print("J̲o̲s̲é̲".count)</lang>
System.print("J̲o̲s̲é̲".count)</syntaxhighlight>


{{out}}
{{out}}
Line 3,655: Line 4,080:
7
7
8
8
</pre>

===Grapheme Length===
{{libheader|Wren-upc}}
<syntaxhighlight lang="wren">import "./upc" for Graphemes

System.print(Graphemes.clusterCount("møøse"))
System.print(Graphemes.clusterCount("𝔘𝔫𝔦𝔠𝔬𝔡𝔢"))
System.print(Graphemes.clusterCount("J̲o̲s̲é̲"))</syntaxhighlight>

{{out}}
<pre>
5
7
4
</pre>
</pre>


Line 3,661: Line 4,101:
The following code uses AT&T syntax and was tested using AS (the portable GNU assembler) under Linux.
The following code uses AT&T syntax and was tested using AS (the portable GNU assembler) under Linux.


<syntaxhighlight lang="x86 assembly">
<lang x86 Assembly>
.data
.data
string: .asciz "Test"
string: .asciz "Test"
Line 3,686: Line 4,126:
leave
leave
ret
ret
</syntaxhighlight>
</lang>


=={{header|XPL0}}==
=={{header|XPL0}}==
<lang XPL0>include c:\cxpl\stdlib;
<syntaxhighlight lang="xpl0">include c:\cxpl\stdlib;
IntOut(0, StrLen("Character length = Byte length = String length = "))</lang>
IntOut(0, StrLen("Character length = Byte length = String length = "))</syntaxhighlight>


Output:
Output:
Line 3,699: Line 4,139:
=={{header|XSLT}}==
=={{header|XSLT}}==
===Character Length===
===Character Length===
<lang xml><?xml version="1.0" encoding="UTF-8"?></lang>
<syntaxhighlight lang="xml"><?xml version="1.0" encoding="UTF-8"?></syntaxhighlight>
...
...
<lang xml><xsl:value-of select="string-length('møøse')" /> <!-- 5 --></lang>
<syntaxhighlight lang="xml"><xsl:value-of select="string-length('møøse')" /> <!-- 5 --></syntaxhighlight>


=={{header|xTalk}}==
=={{header|xTalk}}==
Line 3,709: Line 4,149:
LiveCode fully supports multi-byte Unicode characters since version 7. See the LiveCode section for more information.
LiveCode fully supports multi-byte Unicode characters since version 7. See the LiveCode section for more information.


<lang xtalk>put the length of "Hello World" </lang>
<syntaxhighlight lang="xtalk">put the length of "Hello World" </syntaxhighlight>
or
or
<lang xtalk>put the number of characters in "Hello World" -- 'chars' short for characters is also valid</lang>
<syntaxhighlight lang="xtalk">put the number of characters in "Hello World" -- 'chars' short for characters is also valid</syntaxhighlight>


===Byte Length===
===Byte Length===
<lang LiveCode>put the number of bytes in "Hello World" -- use byte keyword in LiveCode for multi-byte Unicode</lang>
<syntaxhighlight lang="livecode">put the number of bytes in "Hello World" -- use byte keyword in LiveCode for multi-byte Unicode</syntaxhighlight>


=={{header|Yorick}}==
=={{header|Yorick}}==
===Character Length===
===Character Length===
<lang yorick>strlen("Hello, world!")</lang>
<syntaxhighlight lang="yorick">strlen("Hello, world!")</syntaxhighlight>


=={{header|Z80 Assembly}}==
=={{header|Z80 Assembly}}==
Line 3,726: Line 4,166:
===Byte Length===
===Byte Length===
Code is called as a subroutine, i.e. <code>CALL getStringLength</code>.
Code is called as a subroutine, i.e. <code>CALL getStringLength</code>.
<lang z80>; input: HL - pointer to the 0th char of a string.
<syntaxhighlight lang="z80">; input: HL - pointer to the 0th char of a string.
; outputs length to B. HL will point to the last character in the string just before the terminator.
; outputs length to B. HL will point to the last character in the string just before the terminator.
; length is one-indexed and does not include the terminator. A null string will return 0 in B.
; length is one-indexed and does not include the terminator. A null string will return 0 in B.
Line 3,743: Line 4,183:
inc hl ;next char
inc hl ;next char
inc b ;increment the byte count
inc b ;increment the byte count
jr loop_getStringLength</lang>
jr loop_getStringLength</syntaxhighlight>


=={{header|zkl}}==
=={{header|zkl}}==
Line 3,750: Line 4,190:
If your terminal/editor deals with UTF-8 (mine doesn't), you don't need to use the escapes; just put the Unicode characters in quotes (i.e. the editor inserts UTF-8 bytes, which are non-zero).
If your terminal/editor deals with UTF-8 (mine doesn't), you don't need to use the escapes; just put the Unicode characters in quotes (i.e. the editor inserts UTF-8 bytes, which are non-zero).
===Character Length===
===Character Length===
<lang zkl>"abc".len() //-->3
<syntaxhighlight lang="zkl">"abc".len() //-->3
"\ufeff\u00A2 \u20ac".len() //-->9 "BOM¢ €"</lang>
"\ufeff\u00A2 \u20ac".len() //-->9 "BOM¢ €"</syntaxhighlight>
===Byte Length===
===Byte Length===
<lang zkl>"abc".len() //-->3
<syntaxhighlight lang="zkl">"abc".len() //-->3
"\ufeff\u00A2 \u20ac".len() //-->9
"\ufeff\u00A2 \u20ac".len() //-->9
Data(0,Int,"\ufeff\u00A2 \u20ac") //-->Data(9) (bytes)
Data(0,Int,"\ufeff\u00A2 \u20ac") //-->Data(9) (bytes)
"J\u0332o\u0332s\u0332e\u0301\u0332".len() //-->14
"J\u0332o\u0332s\u0332e\u0301\u0332".len() //-->14
"\U1D518;\U1D52B;\U1D526;\U1D520;\U1D52C;\U1D521;\U1D522;".len() //-->28</lang>
"\U1D518;\U1D52B;\U1D526;\U1D520;\U1D52C;\U1D521;\U1D522;".len() //-->28</syntaxhighlight>
===Character Length===
===Character Length===
UTF-8 characters are counted, modifiers (such as underscore) are counted as separate characters.
UTF-8 characters are counted, modifiers (such as underscore) are counted as separate characters.
<lang zkl>"abc".len(8) //-->3
<syntaxhighlight lang="zkl">"abc".len(8) //-->3
"\ufeff\u00A2 \u20ac".len(8) //-->4 "BOM¢ €"
"\ufeff\u00A2 \u20ac".len(8) //-->4 "BOM¢ €"
"\U1000;".len(8) //-->Exception thrown: ValueError(Invalid UTF-8 string)
"\U1000;".len(8) //-->Exception thrown: ValueError(Invalid UTF-8 string)
"\uD800" //-->SyntaxError : Line 2: Bad Unicode constant (\uD800-\uDFFF)
"\uD800" //-->SyntaxError : Line 2: Bad Unicode constant (\uD800-\uDFFF)
"J\u0332o\u0332s\u0332e\u0301\u0332".len(8) //-->9 "J̲o̲s̲é̲"
"J\u0332o\u0332s\u0332e\u0301\u0332".len(8) //-->9 "J̲o̲s̲é̲"
"\U1D518;\U1D52B;\U1D526;\U1D520;\U1D52C;\U1D521;\U1D522;".len(8) //-->7 "𝔘𝔫𝔦𝔠𝔬𝔡𝔢"</lang>
"\U1D518;\U1D52B;\U1D526;\U1D520;\U1D52C;\U1D521;\U1D522;".len(8) //-->7 "𝔘𝔫𝔦𝔠𝔬𝔡𝔢"</syntaxhighlight>
[[Wikipedia::https://en.wikipedia.org/wiki/Comparison_of_programming_languages_%28string_functions%29#length]]
[[Wikipedia::https://en.wikipedia.org/wiki/Comparison_of_programming_languages_%28string_functions%29#length]]

=={{header|Zig}}==
<syntaxhighlight lang="zig">const std = @import("std");

// Prints the code-point count and the byte length of `string`, first for the
// UTF-8 bytes as given and then for a UTF-16LE conversion of the same text.
// Any error from the std.unicode calls or from writing to stdout is
// propagated to the caller via `try`.
fn printResults(alloc: std.mem.Allocator, string: []const u8) !void {
const cnt_codepts_utf8 = try std.unicode.utf8CountCodepoints(string);
// There is no sane and portable extended ascii, so the best
// we get is counting the bytes and assume regular ascii.
const cnt_bytes_utf8 = string.len;
const stdout_wr = std.io.getStdOut().writer();
try stdout_wr.print("utf8 codepoints = {d}, bytes = {d}\n", .{ cnt_codepts_utf8, cnt_bytes_utf8 });

// The UTF-16 copy is allocated from `alloc` and is not freed inside this
// function, so the caller's allocator is responsible for reclaiming it.
const utf16str = try std.unicode.utf8ToUtf16LeWithNull(alloc, string);
const cnt_codepts_utf16 = try std.unicode.utf16CountCodepoints(utf16str);
// Presumably a count of 16-bit code units (per the name); it is doubled
// below to report bytes.
const cnt_2bytes_utf16 = try std.unicode.calcUtf16LeLen(string);
try stdout_wr.print("utf16 codepoints = {d}, bytes = {d}\n", .{ cnt_codepts_utf16, 2 * cnt_2bytes_utf16 });
}

// Runs printResults on four sample strings: plain ASCII, Latin-1 range
// letters, non-BMP letters, and a string containing combining marks.
pub fn main() !void {
// All allocations go through one arena that is released on exit from main.
var arena_instance = std.heap.ArenaAllocator.init(std.heap.page_allocator);
defer arena_instance.deinit();
const arena = arena_instance.allocator();
const string1: []const u8 = "Hello, world!";
try printResults(arena, string1);
const string2: []const u8 = "møøse";
try printResults(arena, string2);
const string3: []const u8 = "𝔘𝔫𝔦𝔠𝔬𝔡𝔢";
try printResults(arena, string3);
// \u{332} is underscore of previous character, which the browser may not
// copy correctly
const string4: []const u8 = "J\u{332}o\u{332}s\u{332}e\u{301}\u{332}";
try printResults(arena, string4);
}</syntaxhighlight>

{{out}}

<pre>
utf8 codepoints = 13, bytes = 13
utf16 codepoints = 13, bytes = 26
utf8 codepoints = 5, bytes = 7
utf16 codepoints = 5, bytes = 10
utf8 codepoints = 7, bytes = 28
utf16 codepoints = 7, bytes = 28
utf8 codepoints = 9, bytes = 14
utf16 codepoints = 9, bytes = 18
</pre>

Latest revision as of 19:49, 30 April 2024

Task
String length
You are encouraged to solve this task according to the task description, using any language you may know.
Task

Find the character and byte length of a string.

This means encodings like UTF-8 need to be handled properly, as there is not necessarily a one-to-one relationship between bytes and characters.

By character, we mean an individual Unicode code point, not a user-visible grapheme containing combining characters.

For example, the character length of "møøse" is 5 but the byte length is 7 in UTF-8 and 10 in UTF-16.

Non-BMP code points (those between 0x10000 and 0x10FFFF) must also be handled correctly: answers should produce actual character counts in code points, not in code unit counts.

Therefore a string like "𝔘𝔫𝔦𝔠𝔬𝔡𝔢" (consisting of the 7 Unicode characters U+1D518 U+1D52B U+1D526 U+1D520 U+1D52C U+1D521 U+1D522) is 7 characters long, not 14 UTF-16 code units; and it is 28 bytes long whether encoded in UTF-8 or in UTF-16.

Please mark your examples with ===Character Length=== or ===Byte Length===.

If your language is capable of providing the string length in graphemes, mark those examples with ===Grapheme Length===.

For example, the string "J̲o̲s̲é̲" ("J\x{332}o\x{332}s\x{332}e\x{301}\x{332}") has 4 user-visible graphemes, 9 characters (code points), and 14 bytes when encoded in UTF-8.

Other tasks related to string operations:
Metrics
Counting
Remove/replace
Anagrams/Derangements/shuffling
Find/Search/Determine
Formatting
Song lyrics/poems/Mad Libs/phrases
Tokenize
Sequences



360 Assembly

Assembler 360 uses EBCDIC coding, so one character is one byte. The L' attribute can be seen as the length function for Assembler 360.

*        String length             06/07/2016
LEN      CSECT
         USING  LEN,15             base register
         LA     1,L'C              length of C
         XDECO  1,PG
         XPRNT  PG,12
         LA     1,L'H              length of H
         XDECO  1,PG
         XPRNT  PG,12
         LA     1,L'F              length of F
         XDECO  1,PG
         XPRNT  PG,12
         LA     1,L'D              length of D
         XDECO  1,PG
         XPRNT  PG,12
         LA     1,L'PG             length of PG
         XDECO  1,PG
         XPRNT  PG,12
         BR     14                 exit           length
C        DS     C                  character       1
H        DS     H                  half word       2
F        DS     F                  full word       4    
D        DS     D                  double word     8
PG       DS     CL12               string         12
         END    LEN
Output:
           1
           2
           4
           8
          12

6502 Assembly

Translation of: Z80 Assembly

Most 6502-based computers predate Unicode, so only byte length will be demonstrated for now.

GetStringLength: ;$00 and $01 make up the pointer to the string's base address. 
                 ;(Of course, any two consecutive zero-page memory locations can fulfill this role.)
                 ;Counts the bytes that precede the first zero byte. A is overwritten.
                 ;NOTE(review): Y is the only counter, so strings of 256 bytes or more
                 ;presumably cannot be measured by this routine - confirm.
LDY #0           ;Y is both the index into the string and the length counter.

loop_getStringLength:
LDA ($00),y      ;fetch the byte at offset Y from the string's base address.
BEQ exit         ;a zero byte marks the end of the string.
INY              ;count this byte and advance to the next one.
JMP loop_getStringLength

exit:
RTS               ;string length is now loaded into Y.

68000 Assembly

Byte Length (ASCII)

GetStringLength:
; INPUT: A3 = BASE ADDRESS OF STRING (NULL-TERMINATED)
; RETURNS LENGTH IN D1 (MEASURED IN BYTES, TERMINATOR NOT COUNTED)
; CLOBBERS: D0 (LOW BYTE), A3 (LEFT POINTING JUST PAST THE TERMINATOR)
MOVEQ #0,D1             ;clear the whole 32-bit counter

loop_getStringLength:

MOVE.B (A3)+,D0         ;fetch next byte; MOVE sets the Z flag from the byte moved
BEQ done                ;zero byte = terminator. No separate compare is needed:
                        ;an unsized "CMP #0,D0" would compare the low WORD of D0,
                        ;whose upper byte was never loaded and may hold stale data.
ADDQ.L #1,D1
BRA loop_getStringLength

done:
RTS

8086 Assembly

Translation of: 68000 Assembly

Byte Length

;INPUT: DS:SI = BASE ADDR. OF STRING
;TYPICALLY, MS-DOS USES $ TO TERMINATE STRINGS.
;OUTPUT: CX = LENGTH IN BYTES (THE '$' TERMINATOR IS NOT COUNTED).
;MODIFIES: AL, SI, AND THE DIRECTION FLAG (CLEARED).
GetStringLength:
xor cx,cx      ;this takes fewer bytes to encode than "mov cx,0" 
cld            ;makes string functions post-inc rather than post-dec.

loop_GetStringLength:
lodsb          ;equivalent of "mov al,[ds:si],inc si" except this doesn't alter the flags.
cmp al,'$'     ;CMP needs both operands; the byte just loaded is in AL.
je done        ;if equal, we're finished.
inc cx         ;add 1 to length counter. A null string will have a length of zero.
jmp loop_GetStringLength

done:
ret

4D

Byte Length

$length:=Length("Hello, world!")

AArch64 Assembly

Works with: as version Raspberry Pi 3B version Buster 64 bits
/* ARM assembly AARCH64 Raspberry PI 3B */
/*  program stringLength64.s   */ 

/*******************************************/
/* Constantes file                         */
/*******************************************/
/* for this file see task include a file in language AArch64 assembly*/
.include "../includeConstantesARM64.inc"

/*********************************/
/* Initialized data              */
/*********************************/
.data
sMessResultByte:        .asciz "===Byte Length=== : @ \n"
sMessResultChar:        .asciz "===Character Length=== : @ \n"
szString1:              .asciz "møøse€"
szCarriageReturn:       .asciz "\n"
 
/*********************************/
/* UnInitialized data            */
/*********************************/
.bss
sZoneConv:        .skip 24
/*********************************/
/*  code section                 */
/*********************************/
.text
.global main 
main:                                 // entry of program 
    // Displays szString1, then its length in bytes, then its length in
    // Unicode code points (the string is taken to be NUL-terminated UTF-8).
    // affichageMess, conversion10 and strInsertAtCharInc come from the
    // include file named at the end of this program.
    ldr x0,qAdrszString1
    bl affichageMess                  // display string
    ldr x0,qAdrszCarriageReturn
    bl affichageMess
    
    ldr x0,qAdrszString1
    mov x1,#0                         // x1 = byte index / byte counter
1:                                    // loop compute length bytes
    ldrb w2,[x0,x1]
    cmp w2,#0
    cinc x1,x1,ne                     // count the byte unless it is the terminator
    bne 1b
    
    mov x0,x1                         // result display
    ldr x1,qAdrsZoneConv
    bl conversion10                   // call decimal conversion
    ldr x0,qAdrsMessResultByte
    ldr x1,qAdrsZoneConv              // insert conversion in message
    bl strInsertAtCharInc
    bl affichageMess

    ldr x0,qAdrszString1
    mov x1,#0                         // x1 = byte index
    mov x3,#0                         // x3 = code point counter
2:                                    // loop compute length characters
    ldrb w2,[x0,x1]
    cbz w2,6f                         // terminator reached
    add x1,x1,#1                      // always advance one byte, so the
                                      // terminator can never be stepped over
    and w2,w2,#0b11000000             // keep the two top bits
    cmp w2,#0b10000000                // 10xxxxxx = UTF-8 continuation byte ?
    cinc x3,x3,ne                     // every other byte starts a code point
                                      // (covers 1, 2, 3 and 4 byte sequences)
    b 2b

6:
    mov x0,x3
    ldr x1,qAdrsZoneConv
    bl conversion10                   // call decimal conversion
    ldr x0,qAdrsMessResultChar
    ldr x1,qAdrsZoneConv              // insert conversion in message
    bl strInsertAtCharInc
    bl affichageMess
100:                                  // standard end of the program 
    mov x0,0                          // return code
    mov x8,EXIT                       // request to exit program
    svc 0                             // perform the system call
 
qAdrszCarriageReturn:     .quad szCarriageReturn
qAdrsMessResultByte:      .quad sMessResultByte
qAdrsMessResultChar:      .quad sMessResultChar
qAdrszString1:            .quad szString1
qAdrsZoneConv:            .quad sZoneConv
/********************************************************/
/*        File Include fonctions                        */
/********************************************************/
/* for this file see task include a file in language AArch64 assembly */
.include "../includeARM64.inc"

Action!

PROC Test(CHAR ARRAY s)
  PrintF("Length of ""%S"" is %B%E",s,s(0))
RETURN

PROC Main()
  Test("Hello world!")
  Test("")
RETURN
Output:

Screenshot from Atari 8-bit computer

Length of "Hello world!" is 12
Length of "" is 0

ActionScript

Byte length

This uses UTF-8 encoding. For other encodings, the ByteArray's writeMultiByte() method can be used.

package {
 
    import flash.display.Sprite;
    import flash.events.Event;
    import flash.utils.ByteArray;
 
    public class StringByteLength extends Sprite {
  
        public function StringByteLength() {
            if ( stage ) _init();
            else addEventListener(Event.ADDED_TO_STAGE, _init);
        }
 
        private function _init(e:Event = null):void {
            var s1:String = "The quick brown fox jumps over the lazy dog";
            var s2:String = "𝔘𝔫𝔦𝔠𝔬𝔡𝔢";
            var s3:String = "José";
            
            var b:ByteArray = new ByteArray();
            b.writeUTFBytes(s1);
            trace(b.length);  // 43
            
            b.clear();
            b.writeUTFBytes(s2);
            trace(b.length);  // 28
            
            b.clear();
            b.writeUTFBytes(s3);
            trace(b.length);  // 5
        }
 
    }
 
}

Character Length

var s1:String = "The quick brown fox jumps over the lazy dog";
var s2:String = "𝔘𝔫𝔦𝔠𝔬𝔡𝔢";
var s3:String = "José";
trace(s1.length, s2.length, s3.length);  // 43, 14, 4

Ada

Works with: GCC version 4.1.2

Byte Length

Str    : String := "Hello World";
Length : constant Natural := Str'Size / 8;

The 'Size attribute returns the size of an object in bits. Provided that under "byte" one understands an octet of bits, the length in "bytes" will be 'Size divided by 8. Note that this is not necessarily the machine storage unit. In order to make the program portable, System.Storage_Unit should be used instead of the "magic number" 8. System.Storage_Unit yields the number of bits in a storage unit on the current machine. Further, the length of a string object is not the length of what the string contains in whatever measurement units. String as an object may have a "dope" to keep the array bounds. In fact the object length can even be 0, if the compiler optimized the object away. So in most cases "byte length" makes no sense in Ada.

Character Length

Latin_1_Str    : String           := "Hello World";
UCS_16_Str     : Wide_String      := "Hello World";
Unicode_Str    : Wide_Wide_String := "Hello World";
Latin_1_Length : constant Natural := Latin_1_Str'Length;
UCS_16_Length  : constant Natural := UCS_16_Str'Length;
Unicode_Length : constant Natural := Unicode_Str'Length;

The attribute 'Length yields the number of elements of an array. Since strings in Ada are arrays of characters, 'Length is the string length. Ada supports strings of Latin-1, UCS-16 and full Unicode characters. In the example above character length of all three strings is 11. The length of the objects in bits will differ.

Aime

Byte Length

length("Hello, World!")

or

~"Hello, World!"

ALGOL 68

Bits and Bytes Length

BITS bits := bits pack((TRUE, TRUE, FALSE, FALSE)); # packed array of BOOL #
BYTES bytes := bytes pack("Hello, world"); # packed array of CHAR #
print((
  "BITS and BYTES are fixed width:", new line,
  "bits width:", bits width, ", max bits: ", max bits, ", bits:", bits, new line,
  "bytes width: ",bytes width, ", UPB:",UPB STRING(bytes), ", string:", STRING(bytes),"!", new line
))

Output:

BITS and BYTES are fixed width:
bits width:        +32, max bits: TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT, bits:TTFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
bytes width:         +32, UPB:        +32, string:Hello, world!

Character Length

STRING str := "hello, world";
INT length := UPB str;
printf(($"Length of """g""" is "g(3)l$,str,length));

printf(($l"STRINGS can start at -1, in which case LWB must be used:"l$));
STRING s := "abcd"[@-1];
print(("s:",s, ", LWB:", LWB s, ", UPB:",UPB s, ", LEN:",UPB s - LWB s + 1))

Output:

Length of "hello, world" is +12
STRINGS can start at -1, in which case LWB must be used:
s:abcd, LWB:         -1, UPB:         +2, LEN:         +4

Apex

// Write the length reported by String.length() to the debug log.
// System.debug takes either a single message, or a LoggingLevel followed by a
// message, so the label and the value are joined into one message string.
String myString = 'abcd';
System.debug('Size of String: ' + myString.length());

AppleScript

Byte Length

count of "Hello World"

Mac OS X 10.5 (Leopard) includes AppleScript 2.0 which uses only Unicode (UTF-16) character strings. This example has been tested on OSX 10.8.5. Added a combining char for testing.

set inString to "Hello é̦世界"
set byteCount to 0

repeat with c in inString
	set t to id of c
	if ((count of t) > 0) then
		repeat with i in t
			set byteCount to byteCount + doit(i)
		end repeat
	else
		set byteCount to byteCount + doit(t)
	end if
end repeat

byteCount

on doit(cid)
	set n to (cid as integer)
	if n > 67108863 then -- 0x3FFFFFF
		return 6
	else if n > 2097151 then -- 0x1FFFFF
		return 5
	else if n > 65535 then -- 0xFFFF
		return 4
	else if n > 2047 then -- 0x07FF
		return 3
	else if n > 127 then -- 0x7F
		return 2
	else
		return 1
	end if
end doit

Character Length

count of "Hello World"

Or:

count "Hello World"

Applesoft BASIC

? LEN("HELLO, WORLD!")

ARM Assembly

Works with: as version Raspberry Pi
/* ARM assembly Raspberry PI  */
/*  program stringLength.s   */ 

/* REMARK 1 : this program use routines in a include file 
   see task Include a file language arm assembly 
   for the routine affichageMess conversion10 
   see at end of this program the instruction include */
/* for constantes see task include a file in arm assembly */
/************************************/
/* Constantes                       */
/************************************/
.include "../constantes.inc"

/*********************************/
/* Initialized data              */
/*********************************/
.data
sMessResultByte:        .asciz "===Byte Length=== : @ \n"
sMessResultChar:        .asciz "===Character Length=== : @ \n"
szString1:          .asciz "møøse€"
szCarriageReturn:   .asciz "\n"
 

/*********************************/
/* UnInitialized data            */
/*********************************/
.bss
sZoneConv:        .skip 24
/*********************************/
/*  code section                 */
/*********************************/
.text
.global main 
main:                                 @ entry of program 
    @ Displays szString1, then its length in bytes, then its length in
    @ Unicode code points (the string is taken to be NUL-terminated UTF-8).
    @ affichageMess, conversion10 and strInsertAtCharInc come from the
    @ include file named at the end of this program.
    ldr r0,iAdrszString1
    bl affichageMess                  @ display string
    ldr r0,iAdrszCarriageReturn
    bl affichageMess
    
    ldr r0,iAdrszString1
    mov r1,#0                         @ r1 = byte index / byte counter
1:                                    @ loop compute length bytes
    ldrb r2,[r0,r1]
    cmp r2,#0
    addne r1,#1                       @ count the byte unless it is the terminator
    bne 1b
    
    mov r0,r1                         @ result display
    ldr r1,iAdrsZoneConv
    bl conversion10                   @ call decimal conversion
    ldr r0,iAdrsMessResultByte
    ldr r1,iAdrsZoneConv              @ insert conversion in message
    bl strInsertAtCharInc
    bl affichageMess

    ldr r0,iAdrszString1
    mov r1,#0                         @ r1 = byte index
    mov r3,#0                         @ r3 = code point counter
2:                                    @ loop compute length characters
    ldrb r2,[r0,r1]
    cmp r2,#0
    beq 6f                            @ terminator reached
    add r1,#1                         @ always advance one byte, so the
                                      @ terminator can never be stepped over
    and r2,#0b11000000                @ keep the two top bits
    cmp r2,#0b10000000                @ 10xxxxxx = UTF-8 continuation byte ?
    addne r3,#1                       @ every other byte starts a code point
                                      @ (covers 1, 2, 3 and 4 byte sequences)
    b 2b

6:
    mov r0,r3
    ldr r1,iAdrsZoneConv
    bl conversion10                   @ call decimal conversion
    ldr r0,iAdrsMessResultChar
    ldr r1,iAdrsZoneConv              @ insert conversion in message
    bl strInsertAtCharInc
    bl affichageMess
100:                                  @ standard end of the program 
    mov r0, #0                        @ return code
    mov r7, #EXIT                     @ request to exit program
    svc #0                            @ perform the system call
 
iAdrszCarriageReturn:     .int szCarriageReturn
iAdrsMessResultByte:      .int sMessResultByte
iAdrsMessResultChar:      .int sMessResultChar
iAdrszString1:            .int szString1
iAdrsZoneConv:            .int sZoneConv
/***************************************************/
/*      ROUTINES INCLUDE                           */
/***************************************************/
.include "../affichage.inc"
møøse€
===Byte Length=== : 10
===Character Length=== : 6

Arturo

Character Length

str: "Hello World"

print ["length =" size str]
Output:
length = 11

AutoHotkey

Character Length

Msgbox % StrLen("Hello World")

Or:

String := "Hello World"
StringLen, Length, String
Msgbox % Length

Avail

Character Length

Avail represents strings as a tuple of characters, with each character representing a single code point.

|"møøse"|

Byte Length

A UTF-8 byte length can be acquired with the standard library's UTF-8 encoder.

nonBMPString ::= "𝔘𝔫𝔦𝔠𝔬𝔡𝔢";
encoder ::= a UTF8 encoder;
bytes ::= encoder process nonBMPString;
|bytes|

// or, as a one-liner
|a UTF8 encoder process "𝔘𝔫𝔦𝔠𝔬𝔡𝔢"|

AWK

Byte Length

From within any code block:

w=length("Hello, world!")      # static string example
x=length("Hello," s " world!") # dynamic string example
y=length($1)                   # input field example
z=length(s)                    # variable name example

Ad hoc program from command line:

 echo "Hello, wørld!" | awk '{print length($0)}'   # 14

From executable script: (prints for every line arriving on stdin)

#!/usr/bin/awk -f
{print"The length of this line is "length($0)}

Axe

Axe supports two string encodings: a rough equivalent to ASCII, and a token-based format. These examples are for ASCII.

Byte Length

"HELLO, WORLD"→Str1
Disp length(Str1)▶Dec,i

BaCon

BaCon has full native support for UTF-8 encoding.

PRINT "Bytelen of 'hello': ", LEN("hello")
PRINT "Charlen of 'hello': ", ULEN("hello")

PRINT "Bytelen of 'møøse': ", LEN("møøse")
PRINT "Charlen of 'møøse': ", ULEN("møøse")

PRINT "Bytelen of '𝔘𝔫𝔦𝔠𝔬𝔡𝔢': ", LEN("𝔘𝔫𝔦𝔠𝔬𝔡𝔢")
PRINT "Charlen of '𝔘𝔫𝔦𝔠𝔬𝔡𝔢': ", ULEN("𝔘𝔫𝔦𝔠𝔬𝔡𝔢")
Output:
Bytelen of 'hello': 5
Charlen of 'hello': 5
Bytelen of 'møøse': 7
Charlen of 'møøse': 5
Bytelen of '𝔘𝔫𝔦𝔠𝔬𝔡𝔢': 28
Charlen of '𝔘𝔫𝔦𝔠𝔬𝔡𝔢': 7

BASIC

Character Length

Works with: QBasic
Works with: Liberty BASIC
Works with: PowerBASIC version PB/CC, PB/DOS

BASIC only supports single-byte characters. The character "ø" is converted to "°" for printing to the console and length functions, but will still output to a file as "ø".

 INPUT a$
 PRINT LEN(a$)

ANSI BASIC

The ANSI BASIC needs line numbers.

10 INPUT A$
20 PRINT LEN(A$)

Applesoft BASIC

The GW-BASIC solution works without any changes.

BASIC256

The GW-BASIC solution works without any changes.

Chipmunk Basic

The GW-BASIC solution works without any changes.

MSX Basic

Works with: MSX BASIC version any

The GW-BASIC solution works without any changes.

Quite BASIC

The GW-BASIC solution works without any changes.

True BASIC

The GW-BASIC solution works without any changes.

Yabasic

The GW-BASIC solution works without any changes.

ZX Spectrum Basic

The ZX Spectrum needs line numbers:

10 INPUT a$
20 PRINT LEN a$

However, it's not quite as trivial as this.

Byte length

Strings can contain embedded colour codes; an inline INVERSE (CAPS SHIFT + 4) would be represented as CHR$ 20 + CHR$ 1. The LEN function will account for all these bytes. On the flipside, ZX Spectrum keywords are all tokenised, and there's nothing stopping you using them in a string; " RANDOMIZE ", if the keyword is used, will take a single byte (CHR$ 249) rather than the 11 characters it actually uses. The above version of the code will produce byte length.

Character length

Stripping out all entries in the string with codes in the lower 32 will get rid of colour control codes. The character length of a token is not a simple thing to determine, so this version strips them out too by eliminating anything above CHR$ 164 (the last UDG). A 91-entry DATA list of token lengths might be the next step.

5 REM copy to b$ only the characters of a$ with codes 32 to 164, then print the length of b$
10 INPUT a$
20 LET b$=""
30 FOR x=1 TO LEN a$
40 LET k=CODE a$(x)
50 IF k<32 OR k>164 THEN GOTO 70
55 REM x is the position in a$; k is only the character code and must not be used as an index
60 LET b$=b$+a$(x)
70 NEXT x
80 PRINT LEN b$

Grapheme length

Alternatively, the string might include control codes for backspacing and overwriting;

10 LET a$=CHR$ 111+CHR$ 8+CHR$ 21+CHR$ 1+CHR$ 34

will produce an "o" character overprinted with a quotation mark, resulting in a "passable" impression of an umlaut. The above code will reduce this to two characters when the actual printed length is one (byte length is of course five). The other possible workaround is to print the string and calculate the character length based on the resultant change in screen position. (This will only work for a string with a character length that actually fits on the screen, so below about 670.)

10 INPUT a$
20 CLS
30 PRINT a$;
40 LET x=PEEK 23688: LET y=PEEK 23689
50 PRINT CHR$ 13;33-x+32*(24-y)

Commodore BASIC

Commodore BASIC needs line numbers too, and can't use mixed case. When in mixed case mode, everything must be in lower case letters. However, the default is UPPERCASE + graphic characters; thus everything appears as upper-case characters.

10 INPUT A$
20 PRINT LEN(A$)

IS-BASIC

100 INPUT PROMPT "String: ":TX$
110 PRINT LEN(TX$)

QB64

In QB64 a String variable is assumed to be UTF-8 and thus the byte length is the same as character length. That said there are methods to map UTF-16 and UTF-32 to the CP437 (ASCII) table (see, _MAPUNICODE).

Print Len(s$)

Batch File

Byte Length

@echo off
setlocal enabledelayedexpansion
call :length %1 res
echo length of %1 is %res%
goto :eof

:length
set str=%~1
set cnt=0
:loop
if "%str%" equ "" (
	set %2=%cnt%
	goto :eof
	)
set str=!str:~1!
set /a cnt = cnt + 1
goto loop

BBC BASIC

Character Length

      INPUT text$
      PRINT LEN(text$)

Byte Length

      CP_ACP = 0
      CP_UTF8 = &FDE9
      
      textA$ = "møøse"
      textW$ = "                 "
      textU$ = "                 "
      
      SYS "MultiByteToWideChar", CP_ACP, 0, textA$, -1, !^textW$, LEN(textW$)/2 TO nW%
      SYS "WideCharToMultiByte", CP_UTF8, 0, textW$, -1, !^textU$, LEN(textU$), 0, 0
      PRINT "Length in bytes (ANSI encoding) = " ; LEN(textA$)
      PRINT "Length in bytes (UTF-16 encoding) = " ; 2*(nW%-1)
      PRINT "Length in bytes (UTF-8 encoding) = " ; LEN($$!^textU$)

Output:

Length in bytes (ANSI encoding) = 5
Length in bytes (UTF-16 encoding) = 10
Length in bytes (UTF-8 encoding) = 7

BQN

Strings are arrays of characters in BQN.

Byte Length

Each character is converted to its codepoint, and compared with the respective UTF boundary.

# UTF-8 byte length: one byte per character, plus one more for every
# boundary (128, 2048, 65536) that the character's code point reaches.
BLen ← {(≠𝕩)+´⥊𝕩≥⌜@+128‿2048‿65536}

Character Length

Character length is just array length.

# Character length is the length of the character array.
Len ← ≠

Output

# One row per string: the string, its character length, its byte length.
•Show >(⊢⋈∾Len⋈BLen)¨⟨
  "møøse"
  "𝔘𝔫𝔦𝔠𝔬𝔡𝔢"
  "J̲o̲s̲é̲"
⟩

┌─                 
╵ "møøse"    5 7   
  "𝔘𝔫𝔦𝔠𝔬𝔡𝔢"  7 28  
  "J̲o̲s̲é̲" 8 13  
                  ┘

Bracmat

The solutions work with UTF-8 encoded strings.

Byte Length

(ByteLength=
  length
.   @(!arg:? [?length)
  & !length
);

out$ByteLength$𝔘𝔫𝔦𝔠𝔬𝔡𝔢

Answer:

28

Character Length

(CharacterLength=
  length c 
.     0:?length
    & @( !arg
       :   ?
           ( %?c
           & utf$!c:?k
           & 1+!length:?length
           & ~
           )
           ?
       )
  | !length
);

out$CharacterLength$𝔘𝔫𝔦𝔠𝔬𝔡𝔢

Answer:

7

An improved version scans the input string character wise, not byte wise. Thus many string positions that are deemed not to be possible starting positions of UTF-8 are not even tried. The patterns [!p and [?p implement a ratchet mechanism. [!p indicates the start of a character and [?p remembers the end of the character, which becomes the start position of the next byte.

(CharacterLength=
  length c p
.     0:?length:?p
    & @( !arg
       :   ?
           ( [!p %?c
           & utf$!c:?k
           & 1+!length:?length
           )
           ([?p&~)
           ?
       )
  | !length
);

Later versions of Bracmat have the built in function vap that "vaporises" a string into "atoms". If the string is UTF-8 encoded, then each "atom" is one UTF-8 character, so the length of the list of atoms is the character length of the input string. The first argument to the vap function is a function that will be applied to every UTF-8 encoded character in the input string. The outcomes of these function calls are the elements in the resulting list. In the solution below we choose an anonymous function (=.!arg) that just returns the characters themselves.

(CharacterLength=
  length
. vap$((=.!arg).!arg):? [?length&!length
);

Brainf***

Byte Length

There are several limitations Brainf*** has that influence this solution:

  • Brainf*** only supports 8-bit numbers in canonical implementations, so it only supports strings of length below 255.
  • The rule of thumb in Brainf*** when reading a string is to always store exactly one byte, no matter how many bytes a character represents. That's why this solution is a strictly ByteLength one.
  • No way to pass anything to Brainf*** but giving the arguments as input. That's why this program reads a string and outputs the number of bytes in it.

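An algorithm from esolang dot org (printing the value of a cell as a decimal number, for cells of any size, e.g. 8 bit, 100000 bit, etc.) is used to print the number from memory.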

,----- ----- [>,----- -----] ; read a text until a newline
<[+++++ +++++<] ; restore the original text
>[[-]<[>+<-]>+>]< ; add one to the accumulator cell for every byte read
;; from esolang dot org
>[-]>[-]+>[-]+< [>[-<-<<[->+>+<<]>[-<+>]>>]++++++++++>[-]+>[-]>[-]> [-]<<<<<[->-[>+>>]>[[-<+>]+>+>>]<<<<<]>>-[-<<+>>]<[-]++++++++ [-<++++++>]>>[-<<+>>]<<] <[.[-]<]
[-]+++++ +++++. ; print newline

C

Byte Length

Works with: ANSI C
Works with: GCC version 3.3.3
#include <string.h>

int main(void) 
{
  const char *string = "Hello, world!";
  size_t length = strlen(string);
         
  return 0;
}

or by hand:

int main(void) 
{
  const char *string = "Hello, world!";
  size_t length = 0;
  
  const char *p = string;
  while (*p++ != '\0') length++;                                         
  
  return 0;
}

or (for arrays of char only)

#include <stdlib.h>

int main(void)
{
  char s[] = "Hello, world!";
  size_t length = sizeof s - 1;
  
  return 0;
}

Character Length

For wide character strings (usually Unicode uniform-width encodings such as UCS-2 or UCS-4):

#include <stdio.h>
#include <wchar.h>

/*
 * Prints the length of a wide-character string literal, first in wide
 * characters and then in bytes (characters times sizeof(wchar_t)).
 */
int main(void)
{
   const wchar_t *s = L"\x304A\x306F\x3088\x3046"; /* Japanese hiragana ohayou */
   size_t length;

   length = wcslen(s);

   /* size_t is unsigned and may be wider than int, so it is converted
      explicitly instead of being passed to %d. */
   printf("Length in characters = %lu\n", (unsigned long)length);

   /* s is a pointer, so sizeof(s) would be the size of the pointer, not of
      the string; the byte length is derived from the character count. */
   printf("Length in bytes      = %lu\n", (unsigned long)(length * sizeof(wchar_t)));

   return 0;
}

Dealing with raw multibyte string

Following code is written in UTF-8, and environment locale is assumed to be UTF-8 too. Note that "møøse" is here directly written in the source code for clarity, which is not a good idea in general. mbstowcs(), when passed NULL as the first argument, effectively counts the number of chars in given string under current locale.

#include <stdio.h>
#include <stdlib.h>
#include <locale.h>

/*
 * Prints the byte length and the character length of a multibyte string.
 * The character count is obtained from mbstowcs() under the locale taken
 * from the environment. Returns 1 if the string is not valid in that locale.
 */
int main()
{
	char moose[] = "møøse";
	size_t chars;

	setlocale(LC_CTYPE, "");

	/* sizeof yields a size_t, which must not be printed with %d. */
	printf("bytes: %lu\n", (unsigned long)(sizeof(moose) - 1));

	/* With a null destination mbstowcs only counts; (size_t)-1 signals an
	   invalid multibyte sequence rather than a length. */
	chars = mbstowcs(NULL, moose, 0);
	if (chars == (size_t)-1) {
		fprintf(stderr, "invalid multibyte sequence for the current locale\n");
		return 1;
	}
	printf("chars: %lu\n", (unsigned long)chars);

	return 0;
}

output

bytes: 7
chars: 5

C#

Platform: .NET

Works with: C # version 1.0+

Character Length

string s = "Hello, world!";
int characterLength = s.Length;

Byte Length

Strings in .NET are stored in Unicode.

using System.Text;

string s = "Hello, world!";
int byteLength = Encoding.Unicode.GetByteCount(s);

To get the number of bytes that the string would require in a different encoding, e.g., UTF8:

int utf8ByteLength = Encoding.UTF8.GetByteCount(s);

C++

Byte Length

Works with: ISO C++
Works with: g++ version 4.0.2
#include <string> // (not <string.h>!)
using std::string;

int main()
{
  string s = "Hello, world!";
  string::size_type length = s.length(); // option 1: In Characters/Bytes
  string::size_type size = s.size();     // option 2: In Characters/Bytes
  // In bytes same as above since sizeof(char) == 1
  string::size_type bytes = s.length() * sizeof(string::value_type); 
}

For wide character strings:

#include <string>
using std::wstring;
  
int main()
{
  wstring s = L"\u304A\u306F\u3088\u3046";
  wstring::size_type length = s.length() * sizeof(wstring::value_type); // in bytes
}

Character Length

Works with: C++98
Works with: g++ version 4.0.2

For wide character strings:

#include <string>
using std::wstring;

int main()
{
  wstring s = L"\u304A\u306F\u3088\u3046";
  wstring::size_type length = s.length();
}

For narrow character strings:

Works with: C++11
Works with: clang++ version 3.0
#include <codecvt>
#include <iostream>
#include <locale>   // std::wstring_convert is declared here
#include <string>

// Prints the byte length of a UTF-8 string and its length in code points.
// The code point count is the size of the string after conversion to
// char32_t, where every code point occupies exactly one element.
int main()
{
    const std::string utf8 = "\x7a\xc3\x9f\xe6\xb0\xb4\xf0\x9d\x84\x8b"; // U+007a, U+00df, U+6c34, U+1d10b
    std::cout << "Byte length: " << utf8.size() << '\n';

    std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> conv;
    const std::u32string code_points = conv.from_bytes(utf8);
    std::cout << "Character length: " << code_points.size() << '\n';
}
Works with: C++98
Works with: g++ version 4.1.2 20061115 (prerelease) (SUSE Linux)
#include <cstddef>
#include <cwchar>  // for mbstate_t
#include <locale>
#include <string>
#include <vector>

// Give the character length of `text` for a given named locale.
//
// text        - the narrow (possibly multibyte) string to measure
// locale_name - name passed to the std::locale constructor (OS specific)
//
// Returns the number of wide characters produced by converting `text` with
// the locale's codecvt facet. The conversion result code is not inspected, so
// for input that cannot be fully converted the count covers only the part
// that was converted.
std::size_t char_length(std::string const& text, char const* locale_name)
{
  typedef std::codecvt<wchar_t, char, std::mbstate_t> cvt_type;

  // get the named locale and its conversion facet
  std::locale loc(locale_name);
  cvt_type const& cvt = std::use_facet<cvt_type>(loc);

  // the facet works on pointers into the source bytes
  char const* const src_begin = text.data();
  char const* const src_end = src_begin + text.length();
  char const* src_next = src_begin;

  // Every wide character consumes at least one source byte, so one slot per
  // byte is enough. The extra slot keeps &dest[0] valid for an empty string.
  // The vector releases the buffer on every exit path.
  std::vector<wchar_t> dest(text.length() + 1);
  wchar_t* const dest_begin = &dest[0];
  wchar_t* dest_next = dest_begin;

  // do the conversion
  std::mbstate_t state = std::mbstate_t();
  cvt.in(state, src_begin, src_end, src_next,
         dest_begin, dest_begin + dest.size(), dest_next);

  // the length of the converted sequence is the character length
  return static_cast<std::size_t>(dest_next - dest_begin);
}

Example usage (note that the locale names are OS specific):

#include <iostream>

int main()
{
  // Tür (German for door) in UTF8
  std::cout << char_length("\x54\xc3\xbc\x72", "de_DE.utf8") << "\n"; // outputs 3

  // Tür in ISO-8859-1
  std::cout << char_length("\x54\xfc\x72", "de_DE") << "\n"; // outputs 3
}

Note that the strings are given as explicit hex sequences, so that the encoding used for the source code won't matter.

Clean

Byte Length

Clean Strings are unboxed arrays of characters. Characters are always a single byte. The function size returns the number of elements in an array.

import StdEnv

strlen :: String -> Int
strlen string = size string 

Start = strlen "Hello, world!"

Clojure

Byte Length

(def utf-8-octet-length #(-> % (.getBytes "UTF-8") count))
(map utf-8-octet-length  ["møøse" "𝔘𝔫𝔦𝔠𝔬𝔡𝔢" "J\u0332o\u0332s\u0332e\u0301\u0332"]) ; (7 28 14)

(def utf-16-octet-length (comp (partial * 2) count))
(map utf-16-octet-length ["møøse" "𝔘𝔫𝔦𝔠𝔬𝔡𝔢" "J\u0332o\u0332s\u0332e\u0301\u0332"]) ; (10 28 18)

(def code-unit-length count)
(map code-unit-length    ["møøse" "𝔘𝔫𝔦𝔠𝔬𝔡𝔢" "J\u0332o\u0332s\u0332e\u0301\u0332"]) ; (5 14 9)

Character length

(def character-length #(.codePointCount % 0 (count %)))
(map character-length    ["møøse" "𝔘𝔫𝔦𝔠𝔬𝔡𝔢" "J\u0332o\u0332s\u0332e\u0301\u0332"]) ; (5 7 9)

Grapheme Length

(def grapheme-length
  #(->> (doto (java.text.BreakIterator/getCharacterInstance)
          (.setText %))
        (partial (memfn next))
        repeatedly
        (take-while (partial not= java.text.BreakIterator/DONE))
        count))
(map grapheme-length     ["møøse" "𝔘𝔫𝔦𝔠𝔬𝔡𝔢" "J\u0332o\u0332s\u0332e\u0301\u0332"]) ; (5 7 4)

COBOL

Byte Length

FUNCTION BYTE-LENGTH(str)

Alternative, non-standard extensions:

Works with: GNU Cobol
LENGTH OF str
Works with: GNU Cobol
Works with: Visual COBOL
FUNCTION LENGTH-AN(str)

Character Length

FUNCTION LENGTH(str)

ColdFusion

Byte Length

<cfoutput>
<cfset str = "Hello World">
<cfset j = createObject("java","java.lang.String").init(str)>
<cfset t = j.getBytes()>
<p>#arrayLen(t)#</p>
</cfoutput>

Character Length

#len("Hello World")#

Common Lisp

Byte Length

In Common Lisp, there is no standard way to examine byte representations of characters, except perhaps to write a string to a file, then reopen the file as binary. However, specific implementations will have ways to do so. For example:

Works with: SBCL
;; Encode explicitly as UTF-8 so the result does not depend on SBCL's
;; default external format ("ø" takes two bytes in UTF-8).
(length (sb-ext:string-to-octets "Hello Wørld" :external-format :utf-8))

returns 12.

Character Length

Common Lisp represents strings as sequences of characters, not bytes, so there is no ambiguity about the encoding. The length function always returns the number of characters in a string.

(length "Hello World")

returns 11, and

(length "Hello Wørld")

returns 11 too.

Component Pascal

Component Pascal encodes strings in UTF-16, which represents each character with a 16-bit value.

Character Length

MODULE TestLen;

	IMPORT Out;

	PROCEDURE DoCharLength*;
		VAR s: ARRAY 16 OF CHAR; len: INTEGER;
	BEGIN
		s := "møøse";
		len := LEN(s$);
		Out.String("s: "); Out.String(s); Out.Ln;
		Out.String("Length of characters: "); Out.Int(len, 0); Out.Ln
	END DoCharLength;

END TestLen.

The $ suffix in LEN(s$) selects the sequence of characters held in s up to (but not including) the terminating null character. So LEN(s$) returns the actual number of characters stored, rather than the size allocated for the variable.

Running the command TestLen.DoCharLength gives the following output:

s: møøse
Length of characters: 5

Byte Length

MODULE TestLen;

	IMPORT Out;

	PROCEDURE DoByteLength*;
		VAR s: ARRAY 16 OF CHAR; len, v: INTEGER;
	BEGIN
		s := "møøse";
		len := LEN(s$);
		v := SIZE(CHAR) * len;
		Out.String("s: "); Out.String(s); Out.Ln;
		Out.String("Length of characters in bytes: "); Out.Int(v, 0); Out.Ln
	END DoByteLength;

END TestLen.

Running the command TestLen.DoByteLength gives the following output:

s: møøse
Length of characters in bytes: 10

Crystal

UTF8 is the default encoding in Crystal.

Byte Length

"J̲o̲s̲é̲".bytesize

Character Length

# String#size returns the number of Unicode code points in the string.
"J̲o̲s̲é̲".size

D

Byte Length

import std.stdio;

void showByteLen(T)(T[] str) {
    writefln("Byte length: %2d - %(%02x%)",
             str.length * T.sizeof, cast(ubyte[])str);
}

void main() {
    string s1a = "møøse"; // UTF-8
    showByteLen(s1a);
    wstring s1b = "møøse"; // UTF-16
    showByteLen(s1b);
    dstring s1c = "møøse"; // UTF-32
    showByteLen(s1c);
    writeln();

    string s2a = "𝔘𝔫𝔦𝔠𝔬𝔡𝔢";
    showByteLen(s2a);
    wstring s2b = "𝔘𝔫𝔦𝔠𝔬𝔡𝔢";
    showByteLen(s2b);
    dstring s2c = "𝔘𝔫𝔦𝔠𝔬𝔡𝔢";
    showByteLen(s2c);
    writeln();

    string s3a = "J̲o̲s̲é̲";
    showByteLen(s3a);
    wstring s3b = "J̲o̲s̲é̲";
    showByteLen(s3b);
    dstring s3c = "J̲o̲s̲é̲";
    showByteLen(s3c);
}
Output:
Byte length:  7 - 6dc3b8c3b87365
Byte length: 10 - 6d00f800f80073006500
Byte length: 20 - 6d000000f8000000f80000007300000065000000

Byte length: 28 - f09d9498f09d94abf09d94a6f09d94a0f09d94acf09d94a1f09d94a2
Byte length: 28 - 35d818dd35d82bdd35d826dd35d820dd35d82cdd35d821dd35d822dd
Byte length: 28 - 18d501002bd5010026d5010020d501002cd5010021d5010022d50100

Byte length: 14 - 4accb26fccb273ccb265cc81ccb2
Byte length: 18 - 4a0032036f00320373003203650001033203
Byte length: 36 - 4a000000320300006f000000320300007300000032030000650000000103000032030000

Character Length

import std.stdio, std.range, std.conv;

void showCodePointsLen(T)(T[] str) {
    writefln("Character length: %2d - %(%x %)",
             str.walkLength(), cast(uint[])to!(dchar[])(str));
}

void main() {
    string s1a = "møøse"; // UTF-8
    showCodePointsLen(s1a);
    wstring s1b = "møøse"; // UTF-16
    showCodePointsLen(s1b);
    dstring s1c = "møøse"; // UTF-32
    showCodePointsLen(s1c);
    writeln();

    string s2a = "𝔘𝔫𝔦𝔠𝔬𝔡𝔢";
    showCodePointsLen(s2a);
    wstring s2b = "𝔘𝔫𝔦𝔠𝔬𝔡𝔢";
    showCodePointsLen(s2b);
    dstring s2c = "𝔘𝔫𝔦𝔠𝔬𝔡𝔢";
    showCodePointsLen(s2c);
    writeln();

    string s3a = "J̲o̲s̲é̲";
    showCodePointsLen(s3a);
    wstring s3b = "J̲o̲s̲é̲";
    showCodePointsLen(s3b);
    dstring s3c = "J̲o̲s̲é̲";
    showCodePointsLen(s3c);
}
Output:
Character length:  5 - 6d f8 f8 73 65
Character length:  5 - 6d f8 f8 73 65
Character length:  5 - 6d f8 f8 73 65

Character length:  7 - 1d518 1d52b 1d526 1d520 1d52c 1d521 1d522
Character length:  7 - 1d518 1d52b 1d526 1d520 1d52c 1d521 1d522
Character length:  7 - 1d518 1d52b 1d526 1d520 1d52c 1d521 1d522

Character length:  9 - 4a 332 6f 332 73 332 65 301 332
Character length:  9 - 4a 332 6f 332 73 332 65 301 332
Character length:  9 - 4a 332 6f 332 73 332 65 301 332

DataWeave

Character Length

sizeOf("foo")
Output:
3

Dc

Byte Length

Dc's "P" command prints numbers as strings. The number 22405534230753963835153736737 (hint: look at it in hex) represents "Hello world!". Counting the byte length of it is counting how often it iteratively can be divided by 256 with non zero result. The snippet defines the macro which calculates the length, prints the string 1st and then its length.

[256 / d 0<L 1 + ] sL
22405534230753963835153736737 d P A P
lL x f
Hello world!                                                                                        
12                                                                                                  

Character Length

The following code outputs 5, which is the length of the string "abcde":

[abcde]Zp

Déjà Vu

Byte Length

Byte length depends on the encoding, which internally is UTF-8, but users of the language can only get at the raw bytes after encoding a string into a blob.

!. len !encode!utf-8 "møøse"
!. len !encode!utf-8 "𝔘𝔫𝔦𝔠𝔬𝔡𝔢"
Output:
7
28

Character Length

!. len "møøse"
!. len "𝔘𝔫𝔦𝔠𝔬𝔡𝔢"
Output:
5
7

Delphi

See Pascal.

Dyalect

"Hello World".Length()

E

Character Length

"Hello World".size()

EasyLang

Character Length

# 5
print len "møøse"
# 7
print len "𝔘𝔫𝔦𝔠𝔬𝔡𝔢"
# 8
print len "J̲o̲s̲é̲"
# 1
print len "😀"

Ecstasy

module StrLen {
    @Inject Console console;

    void run(String s = "José") {
        console.print($|For the string {s.quoted()}:
                       |  Character length:  {s.size}
                       |  UTF-8 byte length: {s.calcUtf8Length()}
                     );
    }
}
Output:
For the string "José":
  Character length:  4
  UTF-8 byte length: 5

Elena

Character Length

ELENA 4.x :

import extensions;
 
public program()
{
    var s := "Hello, world!";             // UTF-8 literal
    var ws := "Привет мир!"w;             // UTF-16 literal
 
    var s_length := s.Length;             // Number of UTF-8 characters
    var ws_length := ws.Length;           // Number of UTF-16 characters
    var u_length := ws.toArray().Length;    //Number of UTF-32 characters
}

Byte Length

ELENA 4.x :

import extensions;
 
public program()
{
    var s := "Hello, world!";                     // UTF-8 literal
    var ws := "Привет мир!"w;                     // UTF-16 literal
 
    var s_byte_length := s.toByteArray().Length;  // Number of bytes
    var ws_byte_length := ws.toByteArray().Length;  // Number of bytes
}

Elixir

Byte Length

# \u{...} is the Unicode code point escape; the string is J, o, s, e, each
# followed by combining marks (U+0332, plus U+0301 on the e).
name = "J\u{332}o\u{332}s\u{332}e\u{301}\u{332}"
byte_size(name)
# => 14

Character Length

# \u{...} is the Unicode code point escape; String.codepoints/1 splits the
# string into its individual code points, combining marks included.
name = "J\u{332}o\u{332}s\u{332}e\u{301}\u{332}"
Enum.count(String.codepoints(name))
# => 9

Grapheme Length

# \u{...} is the Unicode code point escape; String.length/1 counts graphemes,
# so each base letter together with its combining marks counts once.
name = "J\u{332}o\u{332}s\u{332}e\u{301}\u{332}"
String.length(name)
# => 4

Emacs Lisp

Character Length

(length "hello")
;; => 5

Byte Length

;; Characters above U+FFFF need the eight-digit \U escape; \u reads exactly
;; four hex digits.  Each of these three characters takes 4 bytes in utf-8.
(string-bytes "\U0001D518\U0001D52B\U0001D526")
;; => 12

string-bytes is the length of Emacs' internal representation. In Emacs 23 up this is utf-8. In earlier versions it was "emacs-mule".

Display Length

string-width is the displayed width of a string in the current frame and window. This is not the same as grapheme length since various Asian characters may display in 2 columns, depending on the type of tty or GUI.

(let ((str (apply 'string
                  (mapcar (lambda (c) (decode-char 'ucs c))
                          '(#x1112 #x1161 #x11ab #x1100 #x1173 #x11af)))))
  (list (length str)
        (string-bytes str)
        (string-width str)))
;; => (6 18 4)  ;; in emacs 23 up

EMal

text moose = "møøse"
text unicode = "𝔘𝔫𝔦𝔠𝔬𝔡𝔢"
text jose = "J" + 0U0332 + "o" + 0U0332 + "s" + 0U0332 + "e" + 0U0301 + 0U0332
text emoji = "𠇰😈🎶🔥é-"

Byte Length

writeLine((blob!moose).length)
writeLine((blob!unicode).length)
writeLine((blob!jose).length)
writeLine((blob!emoji).length)
Output:
7
28
14
19

Character Length

writeLine(moose.codePointsLength)
writeLine(unicode.codePointsLength)
writeLine(jose.codePointsLength)
writeLine(emoji.codePointsLength)
Output:
5
7
9
6

Grapheme Length

writeLine(moose.graphemesLength)
writeLine(unicode.graphemesLength)
writeLine(jose.graphemesLength)
writeLine(emoji.graphemesLength)
Output:
5
7
4
6

Erlang

Character Length

Strings are lists of integers in Erlang. So "𝔘𝔫𝔦𝔠𝔬𝔡𝔢" is the list [120088,120107,120102,120096,120108,120097,120098].

9> U = "𝔘𝔫𝔦𝔠𝔬𝔡𝔢".
[120088,120107,120102,120096,120108,120097,120098]
10> erlang:length(U).
7

Euphoria

Character Length

print(1,length("Hello World"))

F#

This is delegated to the standard .Net framework string and encoding functions.

Byte Length

open System.Text
/// Number of bytes needed to encode `str` as UTF-8.
/// The parameter is annotated as string because GetByteCount is overloaded
/// and the overload cannot be inferred from an untyped argument.
let byte_length (str: string) = Encoding.UTF8.GetByteCount(str)

Character Length

"Hello, World".Length

Factor

Byte Length

Here are two words to compute the byte length of strings. The first one doesn't allocate new memory, the second one can easily be adapted to measure the byte length of encodings other than UTF8.

: string-byte-length ( string -- n ) [ code-point-length ] map-sum ;
: string-byte-length-2 ( string -- n ) utf8 encode length ;

Character Length

length works on any sequence, of which strings are one. Strings are UTF8 encoded.

length

Fantom

Byte length

A string can be converted into an instance of Buf to treat the string as a sequence of bytes according to a given charset: the default is UTF8, but 16-bit representations can also be used.

fansh> c := "møøse"
møøse
fansh> c.toBuf.size   // find the byte length of the string in default (UTF8) encoding
7
fansh> c.toBuf.toHex  // display UTF8 representation
6dc3b8c3b87365  
fansh> c.toBuf(Charset.utf16LE).size    // byte length in UTF16 little-endian
10
fansh> c.toBuf(Charset.utf16LE).toHex   // display as UTF16 little-endian
6d00f800f80073006500
fansh> c.toBuf(Charset.utf16BE).size    // byte length in UTF16 big-endian
10
fansh> c.toBuf(Charset.utf16BE).toHex   // display as UTF16 big-endian
006d00f800f800730065

Character length

fansh> c := "møøse"
møøse
fansh> c.size
5

Forth

Works with: ANS Forth

Byte Length

Strings in Forth come in two forms, neither of which are the null-terminated form commonly used in the C standard library.

Counted string

A counted string is a single pointer to a short string in memory. The string's first byte is the count of the number of characters in the string. This is how symbols are stored in a Forth dictionary.

CREATE s ," Hello world" \ create string "s"
s C@ ( -- length=11 )
s COUNT  ( addr len )   \ convert to a stack string, described below

Stack string

A string on the stack is represented by a pair of cells: the address of the string data and the length of the string data (in characters). The word COUNT converts a counted string into a stack string. The STRING utility wordset of ANS Forth works on these addr-len pairs. This representation has the advantages of not requiring null-termination, easy representation of substrings, and not being limited to 255 characters.

S" string" ( addr len)
DUP .   \ 6

Character Length

The 1994 ANS standard does not have any notion of a particular character encoding, although it distinguishes between character and machine-word addresses. (There is some ongoing work on standardizing an "XCHAR" wordset for dealing with strings in particular encodings such as UTF-8.)

The following code will count the number of UTF-8 characters in a null-terminated string. It relies on the fact that all bytes of a UTF-8 character except the first have the binary bit pattern "10xxxxxx".

2 base !
: utf8+ ( str -- str )
  begin
    char+
    dup c@
    11000000 and
    10000000 <>
  until ;
decimal
: count-utf8 ( zstr -- n )
  0
  begin
    swap dup c@
  while
    utf8+
    swap 1+
  repeat drop ;

Fortran

Fortran 77 introduced variables of type CHARACTER and associated syntax. These are fixed-size entities, declared at compile time as in CHARACTER*66 TEXT, however a subroutine (or function) receiving such a variable could declare it as CHARACTER*(*) TEXT so that any size may be supplied to the routine, and with F90 came the ability within subroutines (or functions) to declare items of a size determined at run time. There is no associated length variable, as with strings that have both a content and a length, nor is there a special character value (such as zero) deemed to mark the end-of-text in such a variable to give string-like facilities. However, with F90 came facilities, standardised in F2003 whereby a CHARACTER variable could be re-allocated exactly the right amount of storage whenever it was assigned to. So, TEXT = "this" would cause TEXT to become a CHARACTER variable of length four, adjusted so at run time. Again, the length information is not associated with the variable itself, for instance as the content of a character zero prefixing the content to enable strings of a length up to 255. The length information must be stored somewhere...

Previously, character data would be stored in arithmetic variables, using format codes such as A1 to store one character per variable, which might be an integer or a floating-point variable of much larger size. Format A2 would store two such characters, and so on. Code A1 would give ease of manipulation, while A8 (say for a REAL*8 variable) would save space. Numerical values would be strange, and word sizes may not be a multiple of eight bits nor character encodements require eight bits, especially on a decimal computer such as the IBM1620 where storage usage was counted in digits, and a character required two.

An intrinsic function LEN(text) reports the number of characters in the variable (with no consideration of any storage needed anywhere to hold the length), while SIZE(array) reports the number of elements in an array and SIZEOF(x) may be available to report the number of bytes of storage of x. Since these days, everyone uses computers with eight-bit characters and this is deemed universal, the result from LEN will be equivalent to both a byte and a character count.

There is no facility for fancy Unicode schemes, other than by writing suitable routines. In that regard, plotting packages often supply a special function that returns the length of a text string, as it would appear on the plot, in plotting units, especially useful when the plotter's rendition of text employs a proportionally-spaced typeface and interprets superscripts and subscripts and so forth, so that the programmer can prepare code to juggle with the layout, perhaps of mathematical expressions. This is of course not in any standard.

Byte Length

LEN(text)

Character Length

LEN(text)

FreeBASIC

' FB 1.05.0 Win64

' Shows the character length and the byte length of each FreeBASIC string type.

Dim s As String      = "moose"  '' variable length ascii string
Dim f As String  * 5 = "moose"  '' fixed length ascii string (in practice a zero byte is appended)
Dim z As ZString * 6 = "moose"  '' fixed length zero terminated ascii string 
Dim w As WString * 6 = "møøse"  '' fixed length zero terminated unicode string

' Variable length strings have a descriptor consisting of 3 Integers (12 bytes on 32 bit, 24 bytes on 64 bit systems)
' In order, the descriptor contains the address of the data, the memory currently used and the memory allocated

' In Windows, WString uses UCS-2 encoding (i.e. 2 bytes per character, surrogates are not supported)
' In Linux,   WString uses UCS-4 encoding (i.e. 4 bytes per character) 

' The Len function always returns the length of the string in characters
' The SizeOf function returns the bytes used (by the descriptor in the case of variable length strings)

' Each row reports the length of the variable it names (f, z and w are
' measured with their own Len, not with the length of s).
Print "s : " ; s, "Character Length : "; Len(s), "Byte Length : "; Len(s); "  (data)"
Print "s : " ; s, "Character Length : "; Len(s), "Byte Length : "; SizeOf(s); " (descriptor)"
Print "f : " ; f, "Character Length : "; Len(f), "Byte Length : "; SizeOf(f) 
Print "z : " ; z, "Character Length : "; Len(z), "Byte Length : "; SizeOf(z) 
Print "w : " ; w, "Character Length : "; Len(w), "Byte Length : "; SizeOf(w) 
Print
Sleep
Output:
s : moose     Character Length :  5       Byte Length :  5  (data)
s : moose     Character Length :  5       Byte Length :  24 (descriptor)
f : moose     Character Length :  5       Byte Length :  6
z : moose     Character Length :  5       Byte Length :  6
w : møøse     Character Length :  5       Byte Length :  12

Frink

Byte Length

A string can be converted to an array of bytes in any supported encoding.

b = "𝔘𝔫𝔦𝔠𝔬𝔡𝔢"
length[stringToBytes[b, "UTF-8"]]

Character Length

Frink's string operations correctly handle upper-plane Unicode characters as a single codepoint.

b = "𝔘𝔫𝔦𝔠𝔬𝔡𝔢"
length[b]

Grapheme Length

b = "𝔘𝔫𝔦𝔠𝔬𝔡𝔢"
graphemeLength[b]

GAP

Length("abc");
# or same result with
Size("abc");

Gnuplot

Byte Length

print strlen("hello")
=> 5

Go

Byte Length

package main

import "fmt"

func main() {
    m := "møøse"
    u := "𝔘𝔫𝔦𝔠𝔬𝔡𝔢"
    j := "J̲o̲s̲é̲"
    fmt.Printf("%d %s % x\n", len(m), m, m)
    fmt.Printf("%d %s % x\n", len(u), u, u)
    fmt.Printf("%d %s % x\n", len(j), j, j)
}

Output:

7 møøse 6d c3 b8 c3 b8 73 65
28 𝔘𝔫𝔦𝔠𝔬𝔡𝔢 f0 9d 94 98 f0 9d 94 ab f0 9d 94 a6 f0 9d 94 a0 f0 9d 94 ac f0 9d 94 a1 f0 9d 94 a2
13 J̲o̲s̲é̲ 4a cc b2 6f cc b2 73 cc b2 c3 a9 cc b2

Character Length

package main

import (
    "fmt"
    "unicode/utf8"
)

func main() {
    m := "møøse"
    u := "𝔘𝔫𝔦𝔠𝔬𝔡𝔢"
    j := "J̲o̲s̲é̲"
    fmt.Printf("%d %s %x\n", utf8.RuneCountInString(m), m, []rune(m))
    fmt.Printf("%d %s %x\n", utf8.RuneCountInString(u), u, []rune(u))
    fmt.Printf("%d %s %x\n", utf8.RuneCountInString(j), j, []rune(j))
}

Output:

5 møøse [6d f8 f8 73 65]
7 𝔘𝔫𝔦𝔠𝔬𝔡𝔢 [1d518 1d52b 1d526 1d520 1d52c 1d521 1d522]
9 J̲o̲s̲é̲ [4a 332 6f 332 73 332 65 301 332]

Grapheme Length

Go does not have language or library features to recognize graphemes directly. For example, it does not provide functions implementing Unicode Standard Annex #29, Unicode Text Segmentation. It does however have convenient functions for recognizing Unicode character categories, and so an expected subset of grapheme possibilities is easy to recognize. Here is a solution recognizing the category "Mn", which includes the combining characters used in the task example.

package main

import (
    "fmt"
    "unicode"
    "unicode/utf8"
)

func main() {
    m := "møøse"
    u := "𝔘𝔫𝔦𝔠𝔬𝔡𝔢"
    j := "J̲o̲s̲é̲"
    fmt.Printf("%d %s %x\n", grLen(m), m, []rune(m))
    fmt.Printf("%d %s %x\n", grLen(u), u, []rune(u))
    fmt.Printf("%d %s %x\n", grLen(j), j, []rune(j))
}

// grLen approximates the number of graphemes in s: the first rune always
// opens a grapheme, and every later rune that is not a nonspacing mark
// (Unicode category Mn) opens another one. An empty string has length 0.
func grLen(s string) int {
    if s == "" {
        return 0
    }
    // Skip past the leading rune; it is accounted for by the initial count.
    _, firstWidth := utf8.DecodeRuneInString(s)
    rest := s[firstWidth:]
    count := 1
    for _, r := range rest {
        if unicode.Is(unicode.Mn, r) {
            continue // combining mark: belongs to the current grapheme
        }
        count++
    }
    return count
}

Output:

5 møøse [6d f8 f8 73 65]
7 𝔘𝔫𝔦𝔠𝔬𝔡𝔢 [1d518 1d52b 1d526 1d520 1d52c 1d521 1d522]
4 J̲o̲s̲é̲ [4a 332 6f 332 73 332 65 301 332]

Groovy

Calculating "Byte-length" (by which one typically means "in-memory storage size in bytes") is not possible through the facilities of the Groovy language alone. Calculating "Character length" is built into the Groovy extensions to java.lang.String.

Character Length

println "Hello World!".size()
println "møøse".size()
println "𝔘𝔫𝔦𝔠𝔬𝔡𝔢".size()
println "J̲o̲s̲é̲".size()

Output:

12
5
14
8

Note: The Java "String.length()" method also works in Groovy, but "size()" is consistent with usage in other sequential or composite types.

GW-BASIC

GW-BASIC only supports single-byte characters.

10 INPUT A$
20 PRINT LEN(A$)

Haskell

Byte Length

It is not possible to determine the "byte length" of an ordinary string, because in Haskell, a string is a boxed list of unicode characters. So each character in a string is represented as whatever the compiler considers as the most efficient representation of a cons-cell and a unicode character, and not as a byte.

For efficient storage of sequences of bytes, there's Data.ByteString, which uses Word8 as a base type. Byte strings have an additional Data.ByteString.Char8 interface, which will truncate each Unicode Char to 8 bits as soon as it is converted to a byte string. However, this is not adequate for the task, because truncation will simply garble characters other than Latin-1, instead of encoding them into UTF-8, say.

There are several (non-standard, so far) Unicode encoding libraries available on Hackage. As an example, we'll use encoding-0.2, as Data.Encoding:

import Data.Encoding
import Data.ByteString as B

strUTF8  :: ByteString 
strUTF8  = encode UTF8  "Hello World!"

strUTF32 :: ByteString 
strUTF32 = encode UTF32 "Hello World!"

strlenUTF8  = B.length strUTF8
strlenUTF32 = B.length strUTF32

Character Length

Works with: GHCi version 6.6
Works with: Hugs

The base type Char defined by the standard is already intended for (plain) Unicode characters.

strlen = length "Hello, world!"

HicEst

LEN("1 character == 1 byte") ! 21

HolyC

Byte Length

U8 *string = "Hello, world!";
Print("%d\n", StrLen(string));

Icon and Unicon

Character Length

   length := *s

Note: Neither Icon nor Unicon currently supports double-byte character sets.

IDL

Byte Length

This example may be incorrect due to a recent change in the task requirements or a lack of testing. Please verify it and remove this message. If the example does not match the requirements or does not work, replace this message with Template:incorrect or fix the code yourself.

Compiler: any IDL compiler should do

length = strlen("Hello, world!")

Character Length

This example may be incorrect due to a recent change in the task requirements or a lack of testing. Please verify it and remove this message. If the example does not match the requirements or does not work, replace this message with Template:incorrect or fix the code yourself.
length = strlen("Hello, world!")

Io

Byte Length

"møøse" sizeInBytes

Character Length

"møøse" size

J

Byte Length

   #     'møøse'
7

Here we use the default encoding for character literals (8 bit wide literals).

Character Length

   #7 u: 'møøse'
5

Here we have used 16 bit wide character literals. See also the dictionary page for u:.

Jakt

Character Length

fn character_length(string: String) -> i64 {
    mut length = 0
    for _ in string.code_points() {
        length++
    }
    return length
}

fn main() {
    for string in [
            "Hello world!"
            "møøse"
            "𝔘𝔫𝔦𝔠𝔬𝔡𝔢"
            "J̲o̲s̲é̲"
        ] {
        println("\"{}\" {}", string, character_length(string))
    }
}
Output:
"Hello world!" 12
"møøse" 5
"𝔘𝔫𝔦𝔠𝔬𝔡𝔢" 7
"J̲o̲s̲é̲" 8

Byte Length

fn main() {
    for string in [
            "Hello world!"
            "møøse"
            "𝔘𝔫𝔦𝔠𝔬𝔡𝔢"
            "J̲o̲s̲é̲"
        ] {
        println("\"{}\" {}", string, string.length())
    }
}
Output:
"Hello world!" 12
"møøse" 7
"𝔘𝔫𝔦𝔠𝔬𝔡𝔢" 28
"J̲o̲s̲é̲" 13

Java

Byte Length

Java encodes strings in UTF-16, which represents each character with one or two 16-bit values.

Another way to obtain the byte length of a string is to explicitly specify the desired charset.

String s = "Hello, world!"; 
int byteCountUTF16 = s.getBytes("UTF-16").length; // Incorrect: it yields 28 (that is with the BOM)
int byteCountUTF16LE = s.getBytes("UTF-16LE").length; // Correct: it yields 26
int byteCountUTF8  = s.getBytes("UTF-8").length; // yields 13

Character Length

Java encodes strings in UTF-16, which represents each character (code point) with one or two 16-bit code units. This is a variable-length encoding scheme. The most commonly used characters are represented by one 16-bit code unit, while rarer ones like some mathematical symbols are represented by two.

The length method of String objects is not the length of that String in characters. Instead, it only gives the number of 16-bit code units used to encode a string. This is not (always) the number of Unicode characters (code points) in the string.

String s = "Hello, world!";
int not_really_the_length = s.length(); // XXX: does not (always) count Unicode characters (code points)!

Since Java 1.5, the actual number of characters (code points) can be determined by calling the codePointCount method.

String str = "\uD834\uDD2A"; //U+1D12A
int not_really__the_length = str.length(); // value is 2, which is not the length in characters
int actual_length = str.codePointCount(0, str.length()); // value is 1, which is the length in characters

Grapheme Length

Since JDK 20[1].

import java.text.BreakIterator;

public class Grapheme {
  public static void main(String[] args) {
    printLength("møøse");
    printLength("𝔘𝔫𝔦𝔠𝔬𝔡𝔢");
    printLength("J̲o̲s̲é̲");
  }
  
  public static void printLength(String s) {
    BreakIterator it = BreakIterator.getCharacterInstance();
    it.setText(s);
    int count = 0;
    while (it.next() != BreakIterator.DONE) {
      count++;
    }
    System.out.println("Grapheme length: " + count+ " " + s);
  }
}

Output:

Grapheme length: 5 møøse
Grapheme length: 7 𝔘𝔫𝔦𝔠𝔬𝔡𝔢
Grapheme length: 4 J̲o̲s̲é̲

JavaScript

Byte length

JavaScript encodes strings in UTF-16, which represents each character with one or two 16-bit values. The length property of string objects gives the number of 16-bit values used to encode a string, so the number of bytes can be determined by doubling that number.

var s = "Hello, world!";
var byteCount = s.length * 2; // 26

It's easier to use Buffer.byteLength (Node.JS specific, not ECMAScript).

a = '👩‍❤️‍👩'
Buffer.byteLength(a, 'utf16le'); // 16
Buffer.byteLength(a, 'utf8'); // 20
Buffer.byteLength(s, 'utf16le'); // 26
Buffer.byteLength(s, 'utf8'); // 13

In pure ECMAScript, TextEncoder() can be used to return the UTF-8 byte size:

(new TextEncoder().encode(a)).length; // 20
(new TextEncoder().encode(s)).length; // 13

Unicode codepoint length

JavaScript encodes strings in UTF-16, which represents each character with one or two 16-bit values. The most commonly used characters are represented by one 16-bit value, while rarer ones like some mathematical symbols are represented by two.

If the string only contains commonly used characters, the number of characters will be equal to the number of 16-bit values used to represent the characters.

var str1 = "Hello, world!";
var len1 = str1.length; // 13

var str2 = "\uD834\uDD2A"; // U+1D12A represented by a UTF-16 surrogate pair
var len2 = str2.length; // 2

More generally, the expansion operator in an array can be used to enumerate Unicode code points:

[...str2].length // 1

Unicode grapheme length

Counting Unicode codepoints when using combining characters such as joining sequences or diacritics will return the wrong size, so we must count graphemes instead. Intl.Segmenter() default granularity is grapheme.

[...new Intl.Segmenter().segment(a)].length; // 1

ES6 destructuring/iterators

ES6 provides several ways to get a string split into an array of code points instead of UTF-16 code units:

// Four ways to count the code points of a string; each one overwrites
// countofcodepoints with the same value.
let
  str='AöЖ€𝄞'
 ,countofcodeunits=str.length // 6
 // 1. spread: the string iterator yields one element per code point
 ,cparr=[...str]
 ,countofcodepoints=cparr.length; // 5
// 2. for...of uses the same string iterator
{ let
    count=0;
  for(let codepoint of str)
    count++;
  countofcodepoints=count; // 5
}
// 3. driving the iterator by hand
{ let
    count=0,
    it=str[Symbol.iterator]();
  while(!it.next().done)
    count++;
  countofcodepoints=count; // 5
}
// 4. Array.from also consumes the string iterator
{ cparr=Array.from(str);
  countofcodepoints=cparr.length; // 5
}

Joy

Byte length
"Café" size.
Output:
5

jq

jq strings are JSON strings and are therefore encoded as UTF-8. When given a JSON string, the length filter emits the number of Unicode codepoints that it contains:

$ cat String_length.jq
def describe:
   "length of \(.) is \(length)";

("J̲o̲s̲é̲", "𝔘𝔫𝔦𝔠𝔬𝔡𝔢") | describe
$ jq -n -f String_length.jq
"length of J̲o̲s̲é̲ is 8"
"length of 𝔘𝔫𝔦𝔠𝔬𝔡𝔢 is 7"

JudoScript

Byte Length

This example may be incorrect due to a recent change in the task requirements or a lack of testing. Please verify it and remove this message. If the example does not match the requirements or does not work, replace this message with Template:incorrect or fix the code yourself.
//Store length of hello world in length and print it
. length = "Hello World".length();

Character Length

This example may be incorrect due to a recent change in the task requirements or a lack of testing. Please verify it and remove this message. If the example does not match the requirements or does not work, replace this message with Template:incorrect or fix the code yourself.
//Store length of hello world in length and print it
. length = "Hello World".length()

Julia

Julia encodes strings as UTF-8, so the byte length (via sizeof) will be different from the string length (via length) only if the string contains non-ASCII characters.

Byte Length

sizeof("møøse") # 7
sizeof("𝔘𝔫𝔦𝔠𝔬𝔡𝔢") # 28
sizeof("J̲o̲s̲é̲") # 13

Character Length

length("møøse") # 5
length("𝔘𝔫𝔦𝔠𝔬𝔡𝔢") # 7
length("J̲o̲s̲é̲") # 8

Grapheme Length

import Unicode
length(Unicode.graphemes("møøse")) # 5
length(Unicode.graphemes("𝔘𝔫𝔦𝔠𝔬𝔡𝔢")) # 7
length(Unicode.graphemes("J̲o̲s̲é̲")) # 4

K

Character Length

    #"Hello, world!"
13
    #"Hëllo, world!"
13

Kotlin

As in Java, a string in Kotlin is essentially a sequence of UTF-16 encoded characters and the 'length' property simply returns the number of such characters in the string. Surrogates or graphemes are not treated specially for this purpose - they are just represented by the appropriate number of UTF-16 characters.

As each UTF-16 character occupies 2 bytes, it follows that the number of bytes occupied by the string will be twice the length:

/** Prints the UTF-16 code-unit count of a sample string and the bytes those units occupy. */
fun main() {
    val text = "José"
    val charCount = text.length
    val byteCount = charCount * Char.SIZE_BYTES
    println("The char length is " + charCount)
    println("The byte length is " + byteCount)
}
Output:
The char length is 4
The byte length is 8

LabVIEW

Byte Length

LabVIEW is using a special variant of UTF-8, so byte length == character length.


Character Length

Lambdatalk

The lambdatalk {W.length string} function returns the number of bytes in a string. For Unicode characters made of two bytes things are a little bit more tricky. It's easy to add (inline) a new javascript primitive to the dictionary:

{script 
// Lambdatalk primitive: number of Unicode code points in its (trimmed) argument.
LAMBDATALK.DICT["W.unicodeLength"] = function() {
  // Count code points rather than UTF-16 code units in str.
  function countCodePoints(str) {
    var point,
        index = 0,
        len = 0;
    while (index < str.length) {
      point = str.codePointAt(index);
      // A code point above U+FFFF is stored as a surrogate pair (two code
      // units); every other code point, including U+0000, takes exactly one.
      // Advancing by at least one unit each time guarantees termination.
      index += point > 0xFFFF ? 2 : 1;
      len += 1;
    }
    return len;
  }
  var args = arguments[0].trim();
  return countCodePoints(args)
};
}

Testing:

{W.length Hello, World!} -> 13

{W.length José}          -> 4
{W.unicodeLength José}   -> 4

{W.length 𝔘𝔫𝔦𝔠𝔬𝔡𝔢}        -> 14 
{W.unicodeLength 𝔘𝔫𝔦𝔠𝔬𝔡𝔢} -> 7


Lasso

Character Length

'Hello, world!'->size // 13
'møøse'->size // 5
'𝔘𝔫𝔦𝔠𝔬𝔡𝔢'->size // 7

Byte Length

'Hello, world!'->asBytes->size // 13
'møøse'->asBytes->size // 7
'𝔘𝔫𝔦𝔠𝔬𝔡𝔢'->asBytes->size // 28

LFE

Character Length

(length "ASCII text")
10
(length "𝔘𝔫𝔦𝔠𝔬𝔡𝔢 𝔗𝔢𝒙𝔱")
12
> (set encoded (binary ("𝔘𝔫𝔦𝔠𝔬𝔡𝔢 𝔗𝔢𝒙𝔱" utf8)))
#B(240 157 148 152 240 157 148 171 240 157 ...)
> (length (unicode:characters_to_list encoded 'utf8))
12

Byte Length

> (set encoded (binary ("𝔘𝔫𝔦𝔠𝔬𝔡𝔢 𝔗𝔢𝒙𝔱" utf8)))
#B(240 157 148 152 240 157 148 171 240 157 ...)
> (byte_size encoded)
45
> (set bytes (binary ("𝔘𝔫𝔦𝔠𝔬𝔡𝔢 𝔗𝔢𝒙𝔱")))
#B(24 43 38 32 44 33 34 32 23 34 153 49)
> (byte_size bytes)
12
> (set encoded (binary ("ASCII text" utf8)))
#B(65 83 67 73 73 32 116 101 120 116)
> (byte_size encoded)
10

Liberty BASIC

See BASIC

Lingo

Character Length

utf8Str = "Hello world äöü"
put utf8Str.length
-- 15

Byte Length

utf8Str = "Hello world äöü"
put bytearray(utf8Str).length
-- 18

LiveCode

LiveCode has fully supported Unicode characters since version 7. Older, ASCII-only xTalk environments such as Apple's HyperCard did not natively support Unicode characters, although, being extensible with externals, they could possibly have had add-on support.

Character Length

put the length of "Hello World"

or

put the number of characters in "Hello World" -- 'chars' short for characters is also valid

or

put length("Hello World")

for Unicode character count use the code units keyword

put the number of codeunits of "Hello World" -- count of unicode characters

Byte Length

Use the 'byte' keyword in LiveCode for an accurate unicode char byte count

put the number of bytes in "Hello World"

Logo

Logo is so old that only ASCII encoding is supported. Modern versions of Logo may have enhanced character set support.

print count "|Hello World|  ; 11
print count "møøse            ; 5
print char 248   ; ø - implies ISO-Latin character set

LSE64

Byte Length

LSE stores strings as arrays of characters in 64-bit cells plus a count.

" Hello world" @ 1 + 8 * ,   # 96 = (11+1)*(size of a cell) = 12*8

Character Length

LSE uses counted strings: arrays of characters, where the first cell contains the number of characters in the string.

" Hello world" @ ,   # 11

Lua

Works with: Lua version 5.0+

In Lua, a character is always the size of one byte so there is no difference between byte length and character length.

Byte Length

Byte length in UTF-8:

str = "Hello world"
length = #str

or

str = "Hello world"
length = string.len(str)

Character Length

Only valid for ASCII:

str = "Hello world"
length = #str

or

str = "Hello world"
length = string.len(str)

For Unicode string, use utf8 module:

utf8.len("møøse")
utf8.len("𝔘𝔫𝔦𝔠𝔬𝔡𝔢")
utf8.len("J̲o̲s̲é̲")
Output:
5
7
8

M2000 Interpreter

module  String_length {
	A$=format$("J\u0332o\u0332s\u0332e\u0301\u0332")
	Print Len(A$) = 9  ' true Utf-16LE
	Print Len.Disp(A$) = 4 \\ display length
	Buffer Clear Mem as Byte*100
	\\ Write at memory at offset 0 or address Mem(0)
	Return Mem, 0:=A$
	Print Eval$(Mem, 0, 18)
	For i=0 to 17 step 2
	      \\ print hex value and character
	      Hex Eval(Mem, i as integer), ChrCode$(Eval(Mem, i as integer))
	Next i
	Document B$=A$
	\\ encode to utf-8 with BOM (3 bytes 0xEF,0xBB,0xBF)
	Save.Doc B$, "Checklen.doc", 2
	Print Filelen("Checklen.doc")=17
	\\ So length is 14 bytes + 3 the BOM
	Mem=Buffer("Checklen.doc")
	Print len(Mem)=17 // len works for buffers too - unit byte
	// version 12 can handle strings without suffix $
	C=eval$(mem, 3, 14) // from 4th byte get 14 bytes in a string
	Print len(C)*2=14 ' bytes   // len()) for strings return double type of words (can return 0.5)
	C=string$(C as utf8dec) ' decode bytes from utf8 to utf16LE
	Print len(C)=9, C=A$, Len.Disp(C)=4
	Print C
	Report 2, C  // proportional print on console - for text center justified rendering (2 - center)
}
String_length

Maple

Character length

length("Hello world");

Byte count

nops(convert("Hello world",bytes));

Mathematica/Wolfram Language

Character length

StringLength["Hello world"]

Byte length

StringByteCount["Hello world"]

MATLAB

Character Length

>> length('møøse')

ans =

     5

Byte Length

MATLAB apparently encodes strings using UTF-16.

>> numel(dec2hex('møøse'))

ans =

    10

Maxima

s: "the quick brown fox jumps over the lazy dog";
slength(s);
/* 43 */

MAXScript

Character Length

"Hello world".count

Mercury

Mercury's C and Erlang backends use UTF-8 encoded strings; the Java and C# backends using the underlying UTF-16 encoding of those languages. The function string.length/1 returns the number of code units in a string in target language encoding. The function string.count_utf8_code_units/1 returns the number of UTF-8 code units in a string regardless of the target language.

Byte Length

% Prints each of a few sample strings followed by its length in UTF-8 bytes.
:- module string_byte_length.
:- interface.

:- import_module io.

:- pred main(io::di, io::uo) is det.

:- implementation.

:- import_module list, string.

main(!IO) :-
    % The third literal spells its combining marks with \x...\ hex escapes.
    Words = ["møøse", "𝔘𝔫𝔦𝔠𝔬𝔡𝔢", "J\x332\o\x332\s\x332\e\x301\\x332\"],
    % An empty separator suffices: write_length ends each line with \n itself.
    io.write_list(Words, "", write_length, !IO).

    % write_length(String, !IO):
    % Writes String and its number of UTF-8 code units (bytes) on one line.
:- pred write_length(string::in, io::di, io::uo) is det.

write_length(String, !IO):-
    NumBytes = count_utf8_code_units(String),
    io.format("%s: %d bytes\n", [s(String), i(NumBytes)], !IO).

Output:

møøse: 7 bytes
𝔘𝔫𝔦𝔠𝔬𝔡𝔢: 28 bytes
J̲o̲s̲é̲: 14 bytes

Character Length

The function string.count_codepoints/1 returns the number of code points in a string.

% Prints each of a few sample strings followed by its length in code points.
:- module string_character_length.
:- interface.

:- import_module io.

:- pred main(io::di, io::uo) is det.

:- implementation.

:- import_module list, string.

main(!IO) :-
    % The third literal spells its combining marks with \x...\ hex escapes.
    Words = ["møøse", "𝔘𝔫𝔦𝔠𝔬𝔡𝔢", "J\x332\o\x332\s\x332\e\x301\\x332\"],
    % An empty separator suffices: write_length ends each line with \n itself.
    io.write_list(Words, "", write_length, !IO).

    % write_length(String, !IO):
    % Writes String and its number of code points on one line.
:- pred write_length(string::in, io::di, io::uo) is det.

write_length(String, !IO) :-
    NumChars = count_codepoints(String),
    io.format("%s: %d characters\n", [s(String), i(NumChars)], !IO).

Output:

møøse: 5 characters
𝔘𝔫𝔦𝔠𝔬𝔡𝔢: 7 characters
J̲o̲s̲é̲: 9 characters

Metafont

Metafont has no way of handling properly encodings different from ASCII. So it is able to count only the number of bytes in a string.

string s;
s := "Hello Moose";
show length(s);          % 11 (ok)
s := "Hello Møøse";
show length(s);          % 13 (number of bytes when the string is UTF-8 encoded,
                         % since ø takes two bytes)

Note: in the lang tag, Møøse is Latin1-reencoded, showing up two bytes (as Latin1) instead of one

MIPS Assembly

This only supports ASCII encoding, so it'll return both byte length and char length.

.data 
	#.asciiz automatically adds the NULL terminator character, \0 for us.
	string: .asciiz "Nice string you got there!"

.text
#Counts the bytes of the NUL-terminated string and prints the count.
main:
	la $a1,string           #load the beginning address of the string.
	li $a0,0                #clear the length counter; don't rely on $a0 being 0 at startup
	
loop:
	lb $a2,($a1)            #load byte (i.e. the char) at $a1 into $a2
	addi $a1,$a1,1          #increment $a1
	beqz $a2,exit_procedure #see if we've hit the NULL char yet
	addi $a0,$a0,1          #increment counter
	j loop                  #back to start
	
exit_procedure:
	li $v0,1                #set syscall to print integer (the count is already in $a0)
	syscall                 
	
	li $v0,10               #set syscall to cleanly exit EXIT_SUCCESS
	syscall

mIRC Scripting Language

Byte Length

This example may be incorrect due to a recent change in the task requirements or a lack of testing. Please verify it and remove this message. If the example does not match the requirements or does not work, replace this message with Template:incorrect or fix the code yourself.
alias stringlength { echo -a Your Name is: $len($$?="Whats your name") letters long! }

Character Length

This example may be incorrect due to a recent change in the task requirements or a lack of testing. Please verify it and remove this message. If the example does not match the requirements or does not work, replace this message with Template:incorrect or fix the code yourself.

$utfdecode() converts a UTF-8 string to the locale encoding, with unrepresentable characters as question marks. Since mIRC is not yet fully Unicode aware, entering Unicode text through a dialog box will automatically convert it to ASCII.

alias utf8len { return $len($utfdecode($1)) }
alias stringlength2 {
  var %name = Børje
  echo -a %name is: $utf8len(%name) characters long!
}

Modula-3

Byte Length

(* Prints the number of bytes occupied by the characters of a sample TEXT. *)
MODULE ByteLength EXPORTS Main;

IMPORT IO, Fmt, Text;

VAR s: TEXT := "Foo bar baz";

BEGIN
  (* BYTESIZE(s) would be the size of the TEXT reference itself, not of a
     character, so scale the character count by the size of one CHAR. *)
  IO.Put("Byte length of s: " & Fmt.Int(Text.Length(s) * BYTESIZE(CHAR)) & "\n");
END ByteLength.

Character Length

MODULE StringLength EXPORTS Main;

IMPORT IO, Fmt, Text;

VAR s: TEXT := "Foo bar baz";

BEGIN
  IO.Put("String length of s: " & Fmt.Int(Text.Length(s)) & "\n");
END StringLength.

Nemerle

Both examples rely on .Net faculties, so they're almost identical to C#

Character Length

def message = "How long am I anyways?";
def charlength = message.Length;

Byte Length

using System.Text;

def message = "How long am I anyways?";
def bytelength = Encoding.Unicode.GetByteCount(message);

NewLISP

Character Length

(set 'Str "møøse")
(println  Str  " is " (length Str) " characters long")

Nim

Byte Length

echo "møøse".len # 7
echo "𝔘𝔫𝔦𝔠𝔬𝔡𝔢".len # 28
echo "J̲o̲s̲é̲".len # 13

Character Length

import unicode
echo "møøse".runeLen # 5
echo "𝔘𝔫𝔦𝔠𝔬𝔡𝔢".runeLen # 7
echo "J̲o̲s̲é̲".runeLen # 8

Grapheme Length

graphemeLen() does not do what you might expect. It does not return the number of graphemes in a string; it returns the number of bytes at a character/codepoint index for a given string.

Oberon-2

Byte Length

MODULE Size;

   IMPORT Out;

   VAR s: LONGINT;
      string: ARRAY 5 OF CHAR;

BEGIN 
   string := "Foo";
   s := LEN(string);
   Out.String("Size: ");
   Out.LongInt(s,0);
   Out.Ln;
END Size.

Output:

Size: 5

Character Length

MODULE Length;

   IMPORT Out, Strings;

   VAR l: INTEGER;
      string: ARRAY 5 OF CHAR;

BEGIN 
   string := "Foo";
   l := Strings.Length(string);
   Out.String("Length: ");
   Out.Int(l,0);
   Out.Ln;
END Length.

Output:

Length: 3

Objeck

All character string elements are 1-byte in size therefore a string's byte size and length are the same.

Character Length

"Foo"->Size()->PrintLine();

Byte Length

"Foo"->Size()->PrintLine();

Objective-C

In order to be not ambiguous about the encoding used in the string, we explicitly provide it in UTF-8 encoding. The string is "møøse" (ø UTF-8 encoded is in hexadecimal C3 B8).

Character Length

Objective-C encodes strings in UTF-16, which represents each character (code point) with one or two 16-bit code units. This is a variable-length encoding scheme. The most commonly used characters are represented by one 16-bit code unit, while "supplementary characters" are represented by two (called a "surrogate pair").

The length method of NSString objects is not the length of that string in characters. Instead, it only gives the number of 16-bit code units used to encode a string. This is not (always) the number of Unicode characters (code points) in the string.

// Return the length in characters
// XXX: does not (always) count Unicode characters (code points)! 
unsigned int numberOfCharacters = [@"møøse" length];  // 5

Since Mac OS X 10.6, CFString has methods for converting between supplementary characters and surrogate pair. However, the easiest way to get the number of characters is probably to encode it in UTF-32 (which is a fixed-length encoding) and divide by 4:

int realCharacterCount = [s lengthOfBytesUsingEncoding: NSUTF32StringEncoding] / 4;

Byte Length

Objective-C encodes strings in UTF-16, which represents each character with one or two 16-bit values. The length method of NSString objects returns the number of 16-bit values used to encode a string, so the number of bytes can be determined by doubling that number.

int byteCount = [@"møøse" length] * 2; // 10

Another way to know the byte length of a string is to explicitly specify the charset we desire.

// Return the number of bytes depending on the encoding,
// here explicitly UTF-8
unsigned numberOfBytes =
   [@"møøse" lengthOfBytesUsingEncoding: NSUTF8StringEncoding]; // 7

OCaml

In OCaml currently, characters inside the standard type string are bytes, and a single character taken alone has the same binary representation as the OCaml int (which is equivalent to a C long) which is a machine word.

For internationalization there is Camomile, a comprehensive Unicode library for OCaml. Camomile provides Unicode character type, UTF-8, UTF-16, and more...

Byte Length

Standard OCaml strings are classic ASCII ISO 8859-1, so the function String.length returns the byte length which is the character length in this encoding:

String.length "Hello world" ;;

Character Length

When using the UTF8 module of Camomile, the byte length of a UTF-8 encoded string is obtained with String.length and the character length is returned by UTF8.length:

open CamomileLibrary

let () =
  Printf.printf " %d\n" (String.length "møøse");
  Printf.printf " %d\n" (UTF8.length "møøse");
;;

Run this code with the command:

$ ocaml bigarray.cma -I $(ocamlfind query camomile)/library/ camomileLibrary.cma strlen.ml 
 7
 5

Alternatively, you can use the Uchar module (available since OCaml 4.03) together with String.get_utf_8_uchar (available since OCaml 4.14) to do it without additional libraries.

(* Counts the UTF-8 decode steps needed to consume [s]: decode at the current
   byte offset, skip the bytes that decode used, repeat until the end. For
   well-formed input this is the number of characters. *)
let utf8_length (s: String.t) =
	let total = String.length s in
	let pos = ref 0 and chars = ref 0 in
	while !pos <> total do
		let decoded = String.get_utf_8_uchar s !pos in
		pos := !pos + Uchar.utf_decode_length decoded;
		incr chars
	done;
	!chars
;;
# utf8_length "møøse"
- : int = 5

Octave

s = "string";
stringlen = length(s)

This gives the number of bytes, not of characters. e.g. length("è") is 2 when "è" is encoded e.g. as UTF-8.

Oforth

Oforth strings are UTF8 encoded.

The size method returns the number of UTF8 characters in a string.

The basicSize method returns the number of bytes in a string.

Ol

; Character length
(print (string-length "Hello, wørld!"))
; ==> 13

; Byte (utf-8 encoded) length
(print (length (string->bytes "Hello, wørld!")))
; ==> 14

OpenEdge/Progress

The codepage can be set independently for input / output and internal operations. The following examples are started from an iso8859-1 session and therefore need to use fix-codepage to adjust the string to utf-8.

Character Length

DEF VAR lcc AS LONGCHAR.
 
FIX-CODEPAGE( lcc ) = "UTF-8".
lcc = "møøse".

MESSAGE LENGTH( lcc ) VIEW-AS ALERT-BOX.

Byte Length

DEF VAR lcc AS LONGCHAR.
 
FIX-CODEPAGE( lcc ) = "UTF-8".
lcc = "møøse".

MESSAGE LENGTH( lcc, "RAW" ) VIEW-AS ALERT-BOX.

Oz

Byte Length

{Show {Length "Hello World"}}

Oz uses a single-byte encoding by default. So for normal strings, this will also show the correct character length.

PARI/GP

Character Length

Characters = bytes in Pari; the underlying strings are C strings interpreted as US-ASCII.

len(s)=#s; \\ Alternately, len(s)=length(s); or even len=length;

Byte Length

This works on objects of any sort, not just strings, and includes overhead.

len(s)=sizebyte(s);

Pascal

Byte Length

const 
  s = 'abcdef';
begin
  writeln (length(s))
end.

Output:

6

Perl

Byte Length

Works with: Perl version 5.8

Strings in Perl consist of characters. Measuring the byte length therefore requires conversion to some binary representation (called encoding, both noun and verb).

use utf8; # so we can use literal characters like ☺ in source
use Encode qw(encode);

print length encode 'UTF-8', "Hello, world! ☺";
# 17. The last character takes 3 bytes, the others 1 byte each.

print length encode 'UTF-16', "Hello, world! ☺";
# 32. 2 bytes for the BOM, then 15 byte pairs for each character.

Character Length

Works with: Perl version 5.X
my $length = length "Hello, world!";

Grapheme Length

Since Perl 5.12, /\X/ matches an extended grapheme cluster. See "Unicode overhaul" in perl5120delta and also UAX #29.

Perl understands that "\x{1112}\x{1161}\x{11ab}\x{1100}\x{1173}\x{11af}" (한글) contains 2 graphemes, just like "\x{d55c}\x{ae00}" (한글). The longer string uses Korean combining jamo characters.

Works with: Perl version 5.12
use v5.12;
my $string = "\x{1112}\x{1161}\x{11ab}\x{1100}\x{1173}\x{11af}";  # 한글
my $len;
$len++ while ($string =~ /\X/g);
printf "Grapheme length: %d\n", $len;
Output:
Grapheme length: 2

Phix

Library: Phix/basics

The standard length function returns the number of bytes, character length is achieved by converting to utf32

constant s = "𝔘𝔫𝔦𝔠𝔬𝔡𝔢"
?length(s)
?length(utf8_to_utf32(s))
Output:
28
7

PHP

Program in a UTF8 linux:

<?php
foreach (array('møøse', '𝔘𝔫𝔦𝔠𝔬𝔡𝔢', 'J̲o̲s̲é̲') as $s1) { 
   printf('String "%s" measured with strlen: %d mb_strlen: %s grapheme_strlen %s%s', 
                  $s1, strlen($s1),mb_strlen($s1), grapheme_strlen($s1), PHP_EOL);
}

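yields the result below. Here mb_strlen reports bytes because no encoding argument is passed and mbstring's internal encoding was not UTF-8 on the test system; with a UTF-8 internal encoding (the default since PHP 5.6) it reports 5, 7 and 8 instead: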

String "møøse" measured with strlen: 7 mb_strlen: 7 grapheme_strlen 5
String "𝔘𝔫𝔦𝔠𝔬𝔡𝔢" measured with strlen: 28 mb_strlen: 28 grapheme_strlen 7
String "J̲o̲s̲é̲" measured with strlen: 13 mb_strlen: 13 grapheme_strlen 4

PicoLisp

(let Str "møøse"
   (prinl "Character Length of \"" Str "\" is " (length Str))
   (prinl "Byte Length of \"" Str "\" is " (size Str)) )

Output:

Character Length of "møøse" is 5
Byte Length of "møøse" is 7
-> 7

PL/I

declare WS widechar (13) initial ('Hello world.');
put ('Character length=', length (WS));
put skip list ('Byte length=', size(WS));

declare SM graphic (13) initial ('Hello world');
put ('Character length=', length(SM));
put skip list ('Byte length=', size(trim(SM)));

PL/SQL

LENGTH calculates length using characters as defined by the input character set. LENGTHB uses bytes instead of characters. LENGTHC uses Unicode complete characters. LENGTH2 uses UCS2 code points. LENGTH4 uses UCS4 code points.

Byte Length

DECLARE
  string VARCHAR2(50) := 'Hello, world!';
  stringlength NUMBER;
BEGIN
  stringlength := LENGTHB(string);
END;

Character Length

DECLARE
  string VARCHAR2(50) := 'Hello, world!';
  stringlength NUMBER;
  unicodelength NUMBER;
  ucs2length NUMBER;
  ucs4length NUMBER;
BEGIN
  stringlength := LENGTH(string);
  unicodelength := LENGTHC(string);
  ucs2length := LENGTH2(string);
  ucs4length := LENGTH4(string);
END;

Plain English

Byte Length

Plain English does not handle Unicode, so strings return their length in bytes.

To run:
Start up.
Put "møøse" into a string.
Write the string's length to the output.
Wait for the escape key.
Shut down.

Pop11

Byte Length

Currently Pop11 supports only strings consisting of 1-byte units. Strings can carry arbitrary binary data, so user can for example use UTF-8 (however builtin procedures will treat each byte as a single character). The length function for strings returns length in bytes:

lvars str = 'Hello, world!';
lvars len = length(str);

PostScript

Character Length

(Hello World) length =
11

Potion

Character Length

"møøse" length print
"𝔘𝔫𝔦𝔠𝔬𝔡𝔢" length print
"J̲o̲s̲é̲" length print

PowerShell

Character Length

$s = "Hëlló Wørłð"
$s.Length

Byte Length

Translation of: C#

For UTF-16, which is the default in .NET and therefore PowerShell:

$s = "Hëlló Wørłð"
[System.Text.Encoding]::Unicode.GetByteCount($s)

For UTF-8:

[System.Text.Encoding]::UTF8.GetByteCount($s)

PureBasic

Character Length

 a = Len("Hello World") ;a will be 11

Byte Length

StringByteLength() returns the number of bytes required to store the string in memory in the given format. 'Format' can be #PB_Ascii, #PB_UTF8 or #PB_Unicode. PureBasic code can be compiled using either Unicode (2-byte) or Ascii (1-byte) encodings for strings. If 'Format' is not specified, the mode of the executable (unicode or ascii) is used.

Note: The number of bytes returned does not include the terminating Null-Character of the string. The size of the Null-Character is 1 byte for Ascii and UTF8 mode and 2 bytes for Unicode mode.

a = StringByteLength("ä", #PB_UTF8)    ;a will be 2
b = StringByteLength("ä", #PB_Ascii)   ;b will be 1
c = StringByteLength("ä", #PB_Unicode) ;c will be 2

Python

2.x

In Python 2.x, there are two types of strings: regular (8-bit) strings, and Unicode strings. Unicode string literals are prefixed with "u".

Byte Length

Works with: Python version 2.x

For 8-bit strings, the byte length is the same as the character length:

print len('ascii')
# 5

For Unicode strings, length depends on the internal encoding. Since version 2.2 Python shipped with two build options: it either uses 2 or 4 bytes per character. The internal representation is not interesting for the user.

# The letter Alef
print len(u'\u05d0'.encode('utf-8'))
# 2
print len(u'\u05d0'.encode('iso-8859-8'))
# 1

Example from the problem statement:

#!/bin/env python
# -*- coding: UTF-8 -*-
s = u"møøse"
assert len(s) == 5
assert len(s.encode('UTF-8')) == 7
assert len(s.encode('UTF-16-BE')) == 10 # There are 3 different UTF-16 encodings: LE and BE are little endian and big endian respectively, the third one (without suffix) adds 2 extra leading bytes: the byte-order mark (BOM).

Character Length

Works with: Python version 2.4

len() returns the number of code units (not code points!) in a Unicode string or plain ASCII string. On a wide build, this is the same as the number of code points, but on a narrow one it is not. Most linux distributions install the wide build by default, you can check the build at runtime with:

import sys
sys.maxunicode # 1114111 on a wide build, 65535 on a narrow build

To get the length of encoded string, you have to decode it first:

print len('ascii')
# 5
print len(u'\u05d0') # the letter Alef as unicode literal
# 1
print len('\xd7\x90'.decode('utf-8')) # Same encoded as utf-8 string
# 1
print hex(sys.maxunicode), len(unichr(0x1F4A9))
# 0x10ffff 1

On a narrow build, len() gives the wrong answer for non-BMP chars

print hex(sys.maxunicode), len(unichr(0x1F4A9))
# 0xffff 2

3.x

In Python 3.x, strings are Unicode strings and a bytes type is available for storing an immutable sequence of bytes (there is also a bytearray type, which is mutable).

Byte Length

You can use len() to get the length of a byte sequence.

print(len(b'Hello, World!'))
# 13

To get a byte sequence from a string, you have to encode it with the desired encoding:

# The letter Alef
print(len('\u05d0'.encode())) # the default encoding is utf-8 in Python3
# 2
print(len('\u05d0'.encode('iso-8859-8')))
# 1

Example from the problem statement:

#!/bin/env python
# -*- coding: UTF-8 -*-
s = "møøse"
assert len(s) == 5
assert len(s.encode('UTF-8')) == 7
assert len(s.encode('UTF-16-BE')) == 10 # There are 3 different UTF-16 encodings: LE and BE are little endian and big endian respectively, the third one (without suffix) adds 2 extra leading bytes: the byte-order mark (BOM).
u="𝔘𝔫𝔦𝔠𝔬𝔡𝔢"
assert len(u.encode()) == 28
assert len(u.encode('UTF-16-BE')) == 28

Character Length

Since Python3.3 the internal storage of unicode strings has been optimized: strings that don't contain characters outside the latin-1 set, are stored with 8 bits for each character, strings that don't contain codepoints outside the BMP (lone surrogates aren't allowed) are stored as UCS-2, while all the others use UCS-4.

Thus Python is able to avoid memory overhead when dealing with only ASCII strings, while handling correctly all codepoints in Unicode. len() returns the number of characters/codepoints:

print(len("𝔘𝔫𝔦𝔠𝔬𝔡𝔢")) 
# 7

Until Python 3.2 instead, length depended on the internal encoding, since it shipped with two build options: it either used 2 or 4 bytes per character.

len() returned the number of code units in a string, which could be different from the number of characters. In a narrow build, this is not a reliable way to get the number of characters. You can only easily count code points in a wide build. Most linux distributions install the wide build by default, you can check the build at runtime with:

import sys
sys.maxunicode # 1114111 on a wide build, 65535 on a narrow build
print(len('ascii'))
# 5
print(len('\u05d0')) # the letter Alef as unicode literal
# 1

To get the length of an encoded byte sequence, you have to decode it first:

print(len(b'\xd7\x90'.decode('utf-8'))) # Alef encoded as utf-8 byte sequence
# 1
# unichr() was removed in Python 3; chr() covers the whole Unicode range.
print(hex(sys.maxunicode), len(chr(0x1F4A9)))
# 0x10ffff 1

On a narrow build, len() gives the wrong answer for non-BMP chars

# unichr() was removed in Python 3; chr() is its replacement.
print(hex(sys.maxunicode), len(chr(0x1F4A9)))
# 0xffff 2

R

Byte length

a <- "m\u00f8\u00f8se"
print(nchar(a, type="bytes"))  # print 7

Character length

print(nchar(a, type="chars"))  # print 5

Racket

Using this definition:

(define str "J\u0332o\u0332s\u0332e\u0301\u0332")

on the REPL, we get the following:

Character length

-> (printf "str has ~a characters" (string-length str))
str has 9 characters

Byte length

-> (printf "str has ~a bytes in utf-8" (bytes-length (string->bytes/utf-8 str)))
str has 14 bytes in utf-8

Raku

(formerly Perl 6)

Byte Length

say 'møøse'.encode('UTF-8').bytes;

Character Length

say 'møøse'.codes;

Grapheme Length

say 'møøse'.chars;

REBOL

Rebol 2 does not natively support UCS (Unicode), so character and byte length are the same. See utf-8.r for an external UTF-8 library.

Rebol 3 natively supports UTF-8.

Byte Length

;; r2
length? "møøse"

;; r3
length? to-binary "møøse"

Character length

;; r3
length? "møøse"

ReScript

Byte Length

Js.String2.length("abcd") == 4

Retro

Byte Length

'møøse s:length n:put

Character Length

Retro does not have built-in support for Unicode, but counting of characters can be done with a small amount of effort.

chain: UTF8'
{{
  : utf+ ( $-$ )
    [ 1+ dup @ %11000000 and %10000000 = ] while ;

  : count ( $-$ )
    0 !here
    repeat dup @ 0; drop utf+ here ++ again ;
---reveal---
  : getLength ( $-n )
    count drop @here ;
}}
;chain

"møøse" ^UTF8'getLength putn

REXX

Classic REXX doesn't support Unicode, so character and byte length are the same.
All characters (in strings) are stored as 8-bit bytes.     Indeed, everything in REXX
is stored as character strings.

Byte Length

/*REXX program displays the lengths  (in bytes/characters)  for various strings.        */
    /*            1         */                         /*a handy-dandy over/under scale.*/
    /*   123456789012345    */
hello = 'Hello, world!'      ;        say  'the length of HELLO is '   length(hello)
happy = 'Hello, world! ☺'    ;        say  'the length of HAPPY is '   length(happy)
jose  = 'José'               ;        say  'the length of  JOSE is '   length(jose)
nill  = ''                   ;        say  'the length of  NILL is '   length(nill)
null  =                      ;        say  'the length of  NULL is '   length(null)
sum   = 5+1                  ;        say  'the length of   SUM is '   length(sum)
                                                       /*   [↑]  is, of course,  6.     */
                                                       /*stick a fork in it, we're done.*/

output

the length of HELLO is  13
the length of HAPPY is  15
the length of  JOSE is  4
the length of  NILL is  0
the length of  NULL is  0
the length of   SUM is  1

Ring

Character Length

# Print the number of characters in a sample string.
aString = "Welcome to the Ring Programming Language"
aStringSize = len(aString)
see  "Character length : " + aStringSize

Robotic

Character Length

set "$local1" to "Hello world!"
* "String length: &$local1.length&"
end

Unfortunately, only character length can be retrieved in this language.

RPL

RPL strings are all made of 8-bit characters.

"RPL" SIZE

Ruby

UTF8 is the default encoding in Ruby.

Byte Length

"J̲o̲s̲é̲".bytesize

Character Length

"J̲o̲s̲é̲".chars.length

Grapheme Length

"J̲o̲s̲é̲".grapheme_clusters.length

Code Set Independence

The next examples show the byte length and character length of "møøse" in different encodings.

To run these programs, you must convert them to different encodings.

  • If you use Emacs: Paste each program into Emacs. The magic comment, like -*- coding: iso-8859-1 -*-, will tell Emacs to save with that encoding.
  • If your text editor saves UTF-8: Convert the file before running it. For example:
    $ ruby -pe '$_.encode!("iso-8859-1", "utf-8")' scratch.rb | ruby
Works with: Ruby version 1.9
Program Output
# -*- coding: iso-8859-1 -*-
s = "møøse"
puts "Byte length: %d" % s.bytesize
puts "Character length: %d" % s.length
Byte length: 5
Character length: 5
# -*- coding: utf-8 -*-
s = "møøse"
puts "Byte length: %d" % s.bytesize
puts "Character length: %d" % s.length
Byte length: 7
Character length: 5
# -*- coding: gb18030 -*-
s = "møøse"
puts "Byte length: %d" % s.bytesize
puts "Character length: %d" % s.length
Byte length: 11
Character length: 5

Ruby 1.8

The next example works with both Ruby 1.8 and Ruby 1.9. In Ruby 1.8, strings have no encodings, and String#length is the byte length. In Ruby 1.8, regular expressions accept one of four encoding flags:

  • /./n uses no multibyte encoding.
  • /./e uses EUC-JP.
  • /./s uses Shift-JIS or Windows-31J.
  • /./u uses UTF-8.

Then either string.scan(/./u).size or string.gsub(/./u, ' ').size counts the UTF-8 characters in string.

# -*- coding: utf-8 -*-

class String
  # Define String#bytesize for Ruby 1.8.6.
  unless method_defined?(:bytesize)
    alias bytesize length
  end
end

s = "文字化け"
puts "Byte length: %d" % s.bytesize
puts "Character length: %d" % s.gsub(/./u, ' ').size

Run BASIC

input a$
print len(a$)

Rust

Byte Length

/// Prints how many bytes the UTF-8 encoding of a sample string occupies.
fn main() {
    let text: &str = "文字化け"; // string literals are UTF-8
    let byte_count = text.as_bytes().len();
    println!("Byte Length: {}", byte_count);
}

Character Length

fn main() {
    // `chars()` walks the string one `char` at a time; counting the items
    // gives the character length rather than the byte length.
    let text = "文字化け";
    let char_count = text.chars().count();
    println!("Character length: {}", char_count);
}

SAS

/* Store a string in a, compute its length into b, and write all
   variables to the log. */
data _null_;
   a="Hello, World!";
   /* Measure a, the string defined above (the original measured an
      undefined variable c). */
   b=length(a);
   put _all_;
run;

Scala

Library: Scala
/** Prints, for each sample string, its `length` together with the size of
  * its UTF-8 and UTF-16LE encodings in bytes.
  */
object StringLength extends App {
  val s1 = "møøse"
  // Each element is written as a pair of \u escapes (a UTF-16 surrogate pair).
  val s3 = List("\uD835\uDD18", "\uD835\uDD2B", "\uD835\uDD26",
    "\uD835\uDD20", "\uD835\uDD2C", "\uD835\uDD21", "\uD835\uDD22").mkString
  // Base letters followed by combining marks.
  val s4 = "J\u0332o\u0332s\u0332e\u0301\u0332"

  for (s <- List(s1, s3, s4)) {
    val utf8Bytes = s.getBytes("UTF-8").size
    val utf16Bytes = s.getBytes("UTF-16LE").size
    // Note: String.length counts UTF-16 code units, so a surrogate pair
    // contributes 2 to the "characterlength" figure.
    println(s"The string: $s, characterlength= ${s.length} UTF8bytes= $utf8Bytes UTF16bytes= $utf16Bytes")
  }
}
Output:
The string: møøse, characterlength= 5 UTF8bytes= 7 UTF16bytes= 10
The string: 𝔘𝔫𝔦𝔠𝔬𝔡𝔢, characterlength= 14 UTF8bytes= 28 UTF16bytes= 28
The string: J̲o̲s̲é̲, characterlength= 9 UTF8bytes= 14 UTF16bytes= 18

Scheme

Byte Length

Works with: Gauche version 0.8.7 [utf-8,pthreads]

The string-size function is specific to Gauche.

(string-size "Hello world")
Works with: PLT Scheme version 4.2.4
(bytes-length #"Hello world")

Character Length

Works with: Gauche version 0.8.7 [utf-8,pthreads]

The string-length function is defined in R5RS and R6RS.

  (string-length "Hello world")

sed

Character Length

Sed breaks strings on newline characters, and doesn't include them in the count. Text is read from standard input e.g. echo "string" | sed -f script.sed or sed -f script.sed file.txt (The solution given would be the contents of a text file script.sed in these cases). For files with more than one line, sed will give a count for each line.

# Replace each input line by the decimal count of its characters.
# Method: rewrite the line in unary (one "i" per character), then repeatedly
# divide by ten, producing one decimal digit per pass, least significant first.

# create unary numeral (i = 1)
s/./i/g
:loop
# divide by 10 (x = 10)
s/i\{10\}/x/g
# convert remainder to decimal digit
# - no "i" left: the remainder is zero, so put a 0 in front of the digits
#   produced so far (also yields "0" for an empty line)
/i/!s/[0-9]*$/0&/
# - otherwise one to nine "i" are left; the first substitution that matches
#   consumes all of them, so the later ones find nothing to replace
s/i\{9\}/9/
s/i\{8\}/8/
s/i\{7\}/7/
s/i\{6\}/6/
s/iiiii/5/
s/iiii/4/
s/iii/3/
s/ii/2/
s/i/1/
# convert quotient (10s) to 1s
y/x/i/
# start over for the next magnitude (if any)
/i/b loop

Seed7

Character Length

length("Hello, world!")

SETL

Character Length

print(# "Hello, world!"); -- '#' is the cardinality operator. Works on strings, tuples, and sets.

Sidef

var str = "J\x{332}o\x{332}s\x{332}e\x{301}\x{332}";

Byte Length

UTF-8 byte length (default):

say str.bytes.len;       #=> 14

UTF-16 byte length:

say str.encode('UTF-16').bytes.len;      #=> 20

Character Length

say str.chars.len;    #=> 9

Grapheme Length

say str.graphs.len;   #=> 4

Simula

Simula has no built-in support for character encodings (Unicode had not even been invented in 1967). The encoding was regarded as the responsibility of the operating system, and one byte must match one character. So character constants encoded in UTF-8 are not possible. But reading from a UTF-8-encoded input file is actually possible.

Input:
møøse
𝔘𝔫𝔦𝔠𝔬𝔡𝔢
J̲o̲s̲é̲
€

Byte Length

BEGIN
    ! PRINT EACH INPUT LINE IN QUOTES FOLLOWED BY ITS LENGTH IN BYTES ;
    TEXT LINE;
    WHILE NOT LASTITEM DO
    BEGIN
        ! COPY THE CURRENT INPUT IMAGE AND STRIP IT ;
        LINE :- COPY(SYSIN.IMAGE).STRIP;
        OUTCHAR('"');
        OUTTEXT(LINE);
        OUTCHAR('"');
        OUTTEXT(" BYTE LENGTH = "); OUTINT(LINE.LENGTH, 0);
        OUTIMAGE;
        ! FETCH THE NEXT INPUT LINE ;
        INIMAGE;
    END;
END.
Output:
"møøse" BYTE LENGTH = 7
"𝔘𝔫𝔦𝔠𝔬𝔡𝔢" BYTE LENGTH = 28
"J̲o̲s̲é̲" BYTE LENGTH = 13
"€" BYTE LENGTH = 3

Character Length

To calculate the character length, one can do it manually:

BEGIN

    ! NUMBER OF UTF8 CHARACTERS IN STRING S. ;
    ! CALLS ERROR IF A LEAD BYTE IS 248 OR ABOVE ;
    INTEGER PROCEDURE UTF8STRLEN(S); TEXT S;
    BEGIN
        INTEGER R, LEN, BYTES;
        CHARACTER BYTE;
        WHILE S.MORE DO
        BEGIN
            ! EACH PASS READS ONE LEAD BYTE AND COUNTS ONE CHARACTER ;
            BYTE := S.GETCHAR;
            R := RANK(BYTE);
            LEN := LEN + 1;
            ! BYTES = TOTAL SIZE OF THE SEQUENCE STARTED BY THIS BYTE ;
            BYTES :=
                IF R >=   0 AND R <= 127 THEN 1 ELSE ! 0....... ASCII ;
                IF R >= 128 AND R <= 191 THEN 0 ELSE ! 10...... CONTINUATION ;
                IF R >= 192 AND R <= 223 THEN 2 ELSE ! 110..... 10x ;
                IF R >= 224 AND R <= 239 THEN 3 ELSE ! 1110.... 10x 10x ;
                IF R >= 240 AND R <= 247 THEN 4 ELSE ! 11110... 10x 10x 10x ;
                  -1;
            IF BYTES = -1 THEN ERROR("ILLEGAL UTF8 STRING");
            ! SKIP THE REMAINING BYTES OF THIS SEQUENCE ;
            WHILE BYTES > 1 DO
            BEGIN
                BYTE := S.GETCHAR;
                BYTES := BYTES - 1;
            END;
        END;
        UTF8STRLEN := LEN;
    END UTF8STRLEN;

    TEXT LINE;
    WHILE NOT LASTITEM DO
    BEGIN
        INTEGER L;
        LINE :- COPY(SYSIN.IMAGE).STRIP;
        OUTCHAR('"');
        OUTTEXT(LINE);
        OUTCHAR('"');
        ! COUNT ONCE AND PRINT THE STORED RESULT ;
        L := UTF8STRLEN(LINE);
        OUTTEXT(" CHARACTER LENGTH = "); OUTINT(L, 0);
        OUTIMAGE;
        INIMAGE;
    END;

END.
Output:
"møøse" CHARACTER LENGTH = 5
"𝔘𝔫𝔦𝔠𝔬𝔡𝔢" CHARACTER LENGTH = 7
"J̲o̲s̲é̲" CHARACTER LENGTH = 8
"€" CHARACTER LENGTH = 1

Slate

'Hello, world!' length.

Slope

Character Length

(length "møøse")

Byte Length

(length (string->bytes "møøse"))

Smalltalk

Internally, Smalltalk represents strings as a collection of characters, with each character representing a single code point. To get at/from bytes, the strings must be en/decoded and converted to/from a byte array. UTFX is only used when communicating with the external world (files/sockets etc.)

Works with: Smalltalk/X
"Each line shows an expression followed by -> and its printed result."
'hello' size -> 5
'hello' utf8Encoded  size -> 5
'hello' utf8Encoded asByteArray -> #[104 101 108 108 111]
#[104 101 108 108 111] asString -> 'hello'

'møøse' size -> 5
'møøse' utf8Encoded size -> 7
'møøse' utf8Encoded asByteArray -> #[109 195 184 195 184 115 101]
#[109 195 184 195 184 115 101] utf8Decoded -> 'møøse'

'𝔘𝔫𝔦𝔠𝔬𝔡𝔢' size -> 7
'𝔘𝔫𝔦𝔠𝔬𝔡𝔢' utf8Encoded size -> 28
'𝔘𝔫𝔦𝔠𝔬𝔡𝔢' utf8Encoded asByteArray -> #[240 157 148 152 240 157 148 171 240 157 148 166 240 157 148 160 240 157 148 172 240 157 148 161 240 157 148 162]
'𝔘𝔫𝔦𝔠𝔬𝔡𝔢' utf16Encoded size -> 14
"The 16-bit values below are surrogate pairs, i.e. the UTF-16 encoding."
'𝔘𝔫𝔦𝔠𝔬𝔡𝔢' utf16Encoded asWordArray -> WordArray(55349 56600 55349 56619 55349 56614 55349 56608 55349 56620 55349 56609 55349 56610)

Byte Length

Works with: GNU Smalltalk
string := 'Hello, world!'.
string size.

Character Length

Works with: GNU Smalltalk
string := 'Hello, world!'.
string numberOfCharacters.

requires loading the Iconv package:

PackageLoader fileInPackage: 'Iconv'

SNOBOL4

Byte Length

	output = "Byte length: " size(trim(input))
end

Character Length

The example works AFAIK only with CSnobol4 by Phil Budne

-include "utf.sno"
	output = "Char length: " utfsize(trim(input))
end

Sparkling

Byte length

spn:1> sizeof "Hello, wørld!"
= 14

SPL

Byte Length

All strings in SPL are Unicode. See code below.

Character Length

t = ["abc","J̲o̲s̲é̲","møøse","𝔘𝔫𝔦𝔠𝔬𝔡𝔢"]

> i, 1..#.size(t,1)
  ? i>1, #.output()
  #.output(#.quot,t[i],#.quot," contains")

  p = #.split(t[i])
  cn = #.size(p,1)
  s = #.str(cn,">3>")+" chars: "
  > j, 1..cn
    ? j>1, s += ", "
    s += p[j]
  <
  #.output(s)

  q = #.array(t[i])
  bn = #.size(q,1)
  s = #.str(bn,">3>")+" bytes: "
  > j, 1..bn
    ? j>1, s += ", "
    s += #.str(q[j],"X2")+"h"
  <
  #.output(s)
<
Output:
"abc" contains
  3 chars: a,b,c
  6 bytes: 61h, 00h, 62h, 00h, 63h, 00h

"J̲o̲s̲é̲" contains
  4 chars: J̲,o̲,s̲,é̲
 16 bytes: 4Ah, 00h, 32h, 03h, 6Fh, 00h, 32h, 03h, 73h, 00h, 32h, 03h, E9h, 00h, 32h, 03h

"møøse" contains
  5 chars: m,ø,ø,s,e
 10 bytes: 6Dh, 00h, F8h, 00h, F8h, 00h, 73h, 00h, 65h, 00h

"𝔘𝔫𝔦𝔠𝔬𝔡𝔢" contains
  7 chars: 𝔘,𝔫,𝔦,𝔠,𝔬,𝔡,𝔢
 28 bytes: 35h, D8h, 18h, DDh, 35h, D8h, 2Bh, DDh, 35h, D8h, 26h, DDh, 35h, D8h, 20h, DDh, 35h, D8h, 2Ch, DDh, 35h, D8h, 21h, DDh, 35h, D8h, 22h, DDh

Grapheme Length

SPL treats grapheme as a single character when splitting text. See code above.

SQL

Byte length

SELECT LENGTH(CAST('møøse' AS BLOB));

Character length

SELECT LENGTH('møøse');

SQL PL

Character Length

Works with: Db2 LUW

With SQL only:

VALUES LENGTH('møøse', CODEUNITS16);
VALUES LENGTH('møøse', CODEUNITS32);
VALUES CHARACTER_LENGTH('møøse', CODEUNITS32);
VALUES LENGTH2('møøse');
VALUES LENGTH4('møøse');
VALUES LENGTH('𝔘𝔫𝔦𝔠𝔬𝔡𝔢', CODEUNITS16);
VALUES LENGTH('𝔘𝔫𝔦𝔠𝔬𝔡𝔢', CODEUNITS32);
VALUES CHARACTER_LENGTH('𝔘𝔫𝔦𝔠𝔬𝔡𝔢', CODEUNITS32);
VALUES LENGTH2('𝔘𝔫𝔦𝔠𝔬𝔡𝔢');
VALUES LENGTH4('𝔘𝔫𝔦𝔠𝔬𝔡𝔢');
VALUES LENGTH('J̲o̲s̲é̲', CODEUNITS16);
VALUES LENGTH('J̲o̲s̲é̲', CODEUNITS32);
VALUES CHARACTER_LENGTH('J̲o̲s̲é̲', CODEUNITS32);
VALUES LENGTH2('J̲o̲s̲é̲');
VALUES LENGTH4('J̲o̲s̲é̲');

Output:

db2 -t
db2 => VALUES LENGTH('møøse', CODEUNITS16);
1          
-----------
          5

  1 record(s) selected.

db2 => VALUES LENGTH('møøse', CODEUNITS32);
1          
-----------
          5

  1 record(s) selected.

db2 => VALUES CHARACTER_LENGTH('møøse', CODEUNITS32);
1          
-----------
          5

  1 record(s) selected.

db2 => VALUES LENGTH2('møøse');
1          
-----------
          5

  1 record(s) selected.

db2 => VALUES LENGTH4('møøse');
1          
-----------
          5

  1 record(s) selected.

db2 => VALUES LENGTH('𝔘𝔫𝔦𝔠𝔬𝔡𝔢', CODEUNITS16);

1          
-----------
         14

  1 record(s) selected.

db2 => VALUES LENGTH('𝔘𝔫𝔦𝔠𝔬𝔡𝔢', CODEUNITS32);
1          
-----------
          7

  1 record(s) selected.

db2 => VALUES CHARACTER_LENGTH('𝔘𝔫𝔦𝔠𝔬𝔡𝔢', CODEUNITS32);
1          
-----------
          7

  1 record(s) selected.

db2 => VALUES LENGTH2('𝔘𝔫𝔦𝔠𝔬𝔡𝔢');
1          
-----------
         14

  1 record(s) selected.

db2 => VALUES LENGTH4('𝔘𝔫𝔦𝔠𝔬𝔡𝔢');
1          
-----------
          7

  1 record(s) selected.

db2 => VALUES LENGTH('J̲o̲s̲é̲', CODEUNITS16);

1          
-----------
          8

  1 record(s) selected.

db2 => VALUES LENGTH('J̲o̲s̲é̲', CODEUNITS32);
1          
-----------
          8

  1 record(s) selected.

db2 => VALUES CHARACTER_LENGTH('J̲o̲s̲é̲', CODEUNITS32);
1          
-----------
          8

  1 record(s) selected.

db2 => VALUES LENGTH2('J̲o̲s̲é̲');
1          
-----------
          8

  1 record(s) selected.

db2 => VALUES LENGTH4('J̲o̲s̲é̲');
1          
-----------
          8

  1 record(s) selected.

Byte Length

Works with: Db2 LUW

With SQL only:

VALUES LENGTH('møøse');
VALUES LENGTHB('møøse');
VALUES LENGTH('𝔘𝔫𝔦𝔠𝔬𝔡𝔢');
VALUES LENGTHB('𝔘𝔫𝔦𝔠𝔬𝔡𝔢');
VALUES LENGTH('J̲o̲s̲é̲');
VALUES LENGTHB('J̲o̲s̲é̲');

Output:

db2 -t
db2 => VALUES LENGTH('møøse');

1          
-----------
          7

  1 record(s) selected.

db2 => VALUES LENGTHB('møøse');
1          
-----------
          7

  1 record(s) selected.

db2 => VALUES LENGTH('𝔘𝔫𝔦𝔠𝔬𝔡𝔢');
1          
-----------
         28

  1 record(s) selected.

db2 => VALUES LENGTHB('𝔘𝔫𝔦𝔠𝔬𝔡𝔢');
1          
-----------
         28

  1 record(s) selected.

db2 => VALUES LENGTH('J̲o̲s̲é̲');
1          
-----------
         13

  1 record(s) selected.

db2 => VALUES LENGTHB('J̲o̲s̲é̲');
1          
-----------
         13

  1 record(s) selected.

Standard ML

Byte Length

Works with: SML/NJ version 110.60
Works with: Moscow ML version 2.01
Works with: MLton version 20061107
val strlen = size "Hello, world!";

Character Length

Works with: SML/NJ version 110.74
val strlen = UTF8.size "Hello, world!";

Stata

Use strlen for byte length, and ustrlen for the number of Unicode characters in a string.

scalar s="Ἐν ἀρχῇ ἐποίησεν ὁ θεὸς τὸν οὐρανὸν καὶ τὴν γῆν"

di strlen(s)
97

di ustrlen(s)
47

Stringle

The only current implementation of Stringle uses 8-bit character sets, meaning character and byte length is always the same.

This prints the length of a string from input:

$ #$

Swift

Grapheme Length

Swift has a concept of "character" that goes beyond Unicode code points. A Character is a "Unicode grapheme cluster", which can consist of one or more Unicode code points.

To count "characters" (Unicode grapheme clusters):

Works with: Swift version 2.x
let numberOfCharacters = "møøse".characters.count  // 5
Works with: Swift version 1.2
let numberOfCharacters = count("møøse")            // 5
Works with: Swift version 1.0-1.1
let numberOfCharacters = countElements("møøse")    // 5

Character Length

To count Unicode code points:

Works with: Swift version 2.x
let numberOfCodePoints = "møøse".unicodeScalars.count           // 5
Works with: Swift version 1.2
let numberOfCodePoints = count("møøse".unicodeScalars)          // 5
Works with: Swift version 1.0-1.1
let numberOfCodePoints = countElements("møøse".unicodeScalars)  // 5

Byte Length

This depends on which encoding you want to use.

For length in UTF-8, count the number of UTF-8 code units:

Works with: Swift version 2.x
let numberOfBytesUTF8 = "møøse".utf8.count           // 7
Works with: Swift version 1.2
let numberOfBytesUTF8 = count("møøse".utf8)          // 7
Works with: Swift version 1.0-1.1
let numberOfBytesUTF8 = countElements("møøse".utf8)  // 7

For length in UTF-16, count the number of UTF-16 code units, and multiply by 2:

Works with: Swift version 2.x
let numberOfBytesUTF16 = "møøse".utf16.count * 2           // 10
Works with: Swift version 1.2
let numberOfBytesUTF16 = count("møøse".utf16) * 2          // 10
Works with: Swift version 1.0-1.1
let numberOfBytesUTF16 = countElements("møøse".utf16) * 2  // 10

Symsyn

Byte Length

c : 'abcdefgh'
 #c []

Output:

8

Tcl

Byte Length

Formally, Tcl does not guarantee to use any particular representation for its strings internally (the underlying implementation objects can hold strings in at least three different formats, mutating between them as necessary), so the "byte length" of a string can only be calculated with respect to some user-selected encoding. This is done as follows (for UTF-8):

string length [encoding convertto utf-8 $theString]

Thus, we have these examples:

# Print the size in bytes of each sample string once it has been converted
# to the encoding named in $enc.
set s1 "hello, world"
set s2 "\u304A\u306F\u3088\u3046"
set enc utf-8
foreach str [list $s1 $s2] {
    set encoded [encoding convertto $enc $str]
    puts [format "length of \"%s\" in bytes is %d" $str [string length $encoded]]
}

Character Length

Basic version:

string length "Hello, world!"

Or, more elaborately (needs any Tcl 8.x interpreter; tested on 8.4.12):

# Write stdout as UTF-8 so that the Unicode string will print correctly.
fconfigure stdout -encoding utf-8
set s1 "hello, world"
set s2 "\u304A\u306F\u3088\u3046"
# [string length] counts characters.
foreach str [list $s1 $s2] {
    puts [format "length of \"%s\" in characters is %d"  $str [string length $str]]
}

TI-89 BASIC

The TI-89 uses a fixed 8-bit encoding, so there is no difference between character length and byte length.

■ dim("møøse")              5

Toka

Byte Length

" hello, world!" string.getLength

Trith

Character Length

"møøse" length

Byte Length

"møøse" size

TUSCRIPT

Character Length

$$ MODE TUSCRIPT
string="hello, world"
l=LENGTH (string)
PRINT "character length of string '",string,"': ",l

Output:

character length of string 'hello, world': 12

UNIX Shell

Byte length via external utility:

Works with: Bourne Shell
string='Hello, world!'
# Match all of "x$string" against the pattern '.*' (expr prints the number of
# characters matched), then subtract 1 to discount the extra leading "x".
# NOTE(review): the "x" prefix presumably keeps expr from mistaking the
# string for one of its own operators or options - confirm.
length=`expr "x$string" : '.*' - 1`
echo $length # if you want it printed to the terminal

With SUSv3 parameter expansion modifier:

This returns the byte count in ash/dash, but the character count in bash, ksh, and zsh:

Works with: Almquist SHell
Works with: Bourne Again SHell
Works with: Korn Shell version 93
Works with: Z SHell
string='Hello, world!'
# ${#string} expands to the length of the value of $string; no external
# command is run.
length=${#string}
echo $length # if you want it printed to the terminal

Vala

Character Length

string s = "Hello, world!";
// string.length is the size of the UTF-8 data in bytes; char_count () is
// the call that counts Unicode characters, which is what this section shows.
int characterLength = s.char_count ();

VBA

Cf. VBScript (below).

VBScript

Byte Length

LenB(string|varname)

Returns the number of bytes required to store a string in memory. Returns null if string|varname is null.

Character Length

Len(string|varname)

Returns the length of the string|varname . Returns null if string|varname is null.

Visual Basic

Works with: Visual Basic version VB6 Standard

same as #VBScript.

Visual Basic .NET

Compiler: Roslyn Visual Basic (language version >=15.5)

Strings in .NET are immutable wrappers around arrays of the Char type, which represents a UTF-16 code unit (with a size of two bytes). Classes for encoding and decoding strings to and from byte arrays in various encodings are located in the System.Text namespace, with System.Text.Encoding representing different string encodings (and providing means of encoding and decoding strings to raw byte arrays). The Length property of a string returns the number of Chars it contains, and is thus the number of UTF-16 code units in that string.

Byte Length

One method of Encoding returns the number of bytes required to encode a .NET string in that encoding (encoding objects can be obtained through readonly static [Shared in VB.NET] properties of the Encoding class).

Module ByteLength
    ''' <summary>
    ''' Returns the number of bytes needed to encode <paramref name="s"/>
    ''' with <paramref name="encoding"/>.
    ''' </summary>
    Function GetByteLength(s As String, encoding As Text.Encoding) As Integer
        Dim byteCount As Integer = encoding.GetByteCount(s)
        Return byteCount
    End Function
End Module

Character Length

There is no intended means of obtaining the number of code points in a string in .NET, though a straightforward implementation is to take one fourth of the string's byte length in UTF-32 (as UTF-32 is a fixed-length encoding where each code point is four bytes).

An alternative implementation is to count the number of UTF-16 surrogate pairs in a string and subtract that number from the number of UTF-16 code units in the string.

Module CharacterLength
    ''' <summary>Returns the number of Chars (UTF-16 code units) in <paramref name="s"/>.</summary>
    Function GetUTF16CodeUnitsLength(s As String) As Integer
        Return s.Length
    End Function

    ''' <summary>Counts the adjacent Char pairs in <paramref name="s"/> that form a surrogate pair.</summary>
    Private Function GetUTF16SurrogatePairCount(s As String) As Integer
        Dim pairs = 0
        ' Examine every Char together with its right-hand neighbour.
        For index = 0 To s.Length - 2
            If Char.IsSurrogatePair(s(index), s(index + 1)) Then
                pairs += 1
            End If
        Next
        Return pairs
    End Function

    ''' <summary>Code point count: UTF-16 code units minus one per surrogate pair.</summary>
    Function GetCharacterLength_FromUTF16(s As String) As Integer
        Dim codeUnits = GetUTF16CodeUnitsLength(s)
        Dim surrogatePairs = GetUTF16SurrogatePairCount(s)
        Return codeUnits - surrogatePairs
    End Function

    ''' <summary>Code point count: the UTF-32 byte length divided by four.</summary>
    Function GetCharacterLength_FromUTF32(s As String) As Integer
        Dim utf32Bytes = GetByteLength(s, Text.Encoding.UTF32)
        Return utf32Bytes \ 4
    End Function
End Module

Grapheme Length

System.Globalization.StringInfo provides a means of enumerating the text elements of a string, where each "text element" is a Unicode grapheme.

Module GraphemeLength
    ''' <summary>
    ''' Returns the number of text elements (Unicode graphemes) in
    ''' <paramref name="s"/>.
    ''' </summary>
    ''' <exception cref="ArgumentNullException"><paramref name="s"/> is Nothing.</exception>
    Function GraphemeCount(s As String) As Integer
        ' StringInfo exposes the text-element count directly, so there is no
        ' need to wrap the enumerator and count its items by hand.
        Return New Globalization.StringInfo(s).LengthInTextElements
    End Function
End Module

Test Code

The compiler constant PRINT_TESTCASE toggles whether to write the contents of each test case to the console; disable for inputs that may mess with the console.

' When True, each test string is echoed to the console next to its index.
#Const PRINT_TESTCASE = True

' Runs every length function over a fixed list of test strings and prints the results.
Module Program
    ReadOnly TestCases As String() =
    {
        "Hello, world!",
        "møøse",
        "𝔘𝔫𝔦𝔠𝔬𝔡𝔢", ' Note for the next entry: string normalization of the file would merge the e and its diacritic into one character, so it is built with VB's char "escapes"
        $"J{ChrW(&H332)}o{ChrW(&H332)}s{ChrW(&H332)}e{ChrW(&H301)}{ChrW(&H332)}"
    }

    Sub Main()
        Const INDENT = "    "
        ' Set the console output encoding to Encoding.Unicode (UTF-16).
        Console.OutputEncoding = Text.Encoding.Unicode

        ' Prints one indented row: the label left-aligned in a 20-character column, then the value.
        Dim writeResult = Sub(s As String, result As Integer) Console.WriteLine("{0}{1,-20}{2}", INDENT, s, result)

        For i = 0 To TestCases.Length - 1
            Dim c = TestCases(i)

            ' Header line: the index, plus the string itself when PRINT_TESTCASE is set.
            Console.Write("Test case " & i)
#If PRINT_TESTCASE Then
            Console.WriteLine(": " & c)
#Else
            Console.WriteLine()
#End If
            ' Counts in graphemes, UTF-16 code units and code points.
            writeResult("graphemes", GraphemeCount(c))
            writeResult("UTF-16 units", GetUTF16CodeUnitsLength(c))
            writeResult("Cd pts from UTF-16", GetCharacterLength_FromUTF16(c))
            writeResult("Cd pts from UTF-32", GetCharacterLength_FromUTF32(c))
            Console.WriteLine()
            ' Encoded sizes in bytes.
            writeResult("bytes (UTF-8)", GetByteLength(c, Text.Encoding.UTF8))
            writeResult("bytes (UTF-16)", GetByteLength(c, Text.Encoding.Unicode))
            writeResult("bytes (UTF-32)", GetByteLength(c, Text.Encoding.UTF32))
            Console.WriteLine()
        Next

    End Sub
End Module
Output:

graphemes corresponds to Grapheme Length in the task description, and either Cd pts value corresponds with Character Length. Byte lengths are given for three Unicode encodings.

Note that the byte length in UTF-16 is always twice the length of a string due to .NET strings using UTF-16.

Test case 0: Hello, world!
    graphemes           13
    UTF-16 units        13
    Cd pts from UTF-16  13
    Cd pts from UTF-32  13

    bytes (UTF-8)       13
    bytes (UTF-16)      26
    bytes (UTF-32)      52

Test case 1: møøse
    graphemes           5
    UTF-16 units        5
    Cd pts from UTF-16  5
    Cd pts from UTF-32  5

    bytes (UTF-8)       7
    bytes (UTF-16)      10
    bytes (UTF-32)      20

Test case 2: 𝔘𝔫𝔦𝔠𝔬𝔡𝔢
    graphemes           7
    UTF-16 units        14
    Cd pts from UTF-16  7
    Cd pts from UTF-32  7

    bytes (UTF-8)       28
    bytes (UTF-16)      28
    bytes (UTF-32)      28

Test case 3: J̲o̲s̲é̲
    graphemes           4
    UTF-16 units        9
    Cd pts from UTF-16  9
    Cd pts from UTF-32  9

    bytes (UTF-8)       14
    bytes (UTF-16)      18
    bytes (UTF-32)      36

+

V (Vlang)

Translation of: go

Byte Length

// Prints, for each sample string: its .len, the string itself, and the
// array returned by bytes().
fn main() {
    m := "møøse"
    u := "𝔘𝔫𝔦𝔠𝔬𝔡𝔢"
    j := "J̲o̲s̲é̲"
    // .len on a string is the figure shown first on each output line (the byte count).
    println("$m.len $m ${m.bytes()}")
    println("$u.len $u ${u.bytes()}")
    println("$j.len $j ${j.bytes()}")
}

Output:

7 møøse [m, 0xc3, 0xb8, 0xc3, 0xb8, s, e]
28 𝔘𝔫𝔦𝔠𝔬𝔡𝔢 [0xf0, 0x9d, 0x94, 0x98, 0xf0, 0x9d, 0x94, 0xab, 0xf0, 0x9d, 0x94, 0xa6, 0xf0, 0x9d, 0x94, 0xa0, 0xf0, 0x9d, 0x94, 0xac, 0xf0, 0x9d, 0x94, 0xa1, 0xf0, 0x9d, 0x94, 0xa2]
13 J̲o̲s̲é̲ [J, 0xcc, 0xb2, o, 0xcc, 0xb2, s, 0xcc, 0xb2, 0xc3, 0xa9, 0xcc, 0xb2]

Character Length

// Prints, for each sample string: the length of its runes() array, the
// string itself, and the runes() array.
fn main() {
    m := "møøse"
    u := "𝔘𝔫𝔦𝔠𝔬𝔡𝔢"
    j := "J̲o̲s̲é̲"
    // runes() splits the string into runes; its .len is the character count.
    println("$m.runes().len $m ${m.runes()}")
    println("$u.runes().len $u ${u.runes()}")
    println("$j.runes().len $j ${j.runes()}")
}

Output:

5 møøse [`m`, `ø`, `ø`, `s`, `e`]
7 𝔘𝔫𝔦𝔠𝔬𝔡𝔢 [`𝔘`, `𝔫`, `𝔦`, `𝔠`, `𝔬`, `𝔡`, `𝔢`]
8 J̲o̲s̲é̲ [`J`, `̲`, `o`, `̲`, `s`, `̲`, `é`, `̲`]

Wren

Byte Length

System.print("møøse".bytes.count)
System.print("𝔘𝔫𝔦𝔠𝔬𝔡𝔢".bytes.count)
System.print("J̲o̲s̲é̲".bytes.count)
Output:
7
28
13

Character Length

System.print("møøse".count)
System.print("𝔘𝔫𝔦𝔠𝔬𝔡𝔢".count)
System.print("J̲o̲s̲é̲".count)
Output:
5
7
8

Grapheme Length

Library: Wren-upc
import "./upc" for Graphemes

System.print(Graphemes.clusterCount("møøse"))
System.print(Graphemes.clusterCount("𝔘𝔫𝔦𝔠𝔬𝔡𝔢"))
System.print(Graphemes.clusterCount("J̲o̲s̲é̲"))
Output:
5
7
4

x86 Assembly

Byte Length

The following code uses AT&T syntax and was tested using AS (the portable GNU assembler) under Linux.

/* Computes the length in bytes of the NUL-terminated string at `string`
   with a `repne scasb` scan. The result is left in %ecx. */
.data
string:         .asciz "Test"

.text
.globl  main

main:
        pushl   %ebp
        movl    %esp, %ebp

        pushl   %edi                    # %edi is the scan pointer below; save the caller's value
        xorl    %eax, %eax              # %al = 0 is the byte searched for; clearing all of %eax also makes main return 0
        movl    $-1, %ecx               # largest possible count, so the counter does not end the scan early
        movl    $string, %edi
        cld                             # scan towards higher addresses
        repne   scasb                   # compare %al with successive bytes at (%edi), decrementing %ecx, until they match
        not     %ecx                    # ~%ecx = number of bytes examined, terminator included
        dec     %ecx                    # drop the terminator from the count
        popl    %edi

        /* string length is stored in %ecx register */

        leave
        ret

XPL0

include c:\cxpl\stdlib;
IntOut(0, StrLen("Character length = Byte length = String length = "))

Output:

49

XSLT

Character Length

<?xml version="1.0" encoding="UTF-8"?>
...
<xsl:value-of select="string-length('møøse')" />   <!-- 5 -->

xTalk

Works with: HyperCard
Works with: LiveCode

Char Length

Note that older xTalk languages such as HyperCard's HyperTalk did not support non-ASCII characters natively. LiveCode has fully supported multi-byte Unicode characters since version 7. See the LiveCode section for more information.

put the length of "Hello World"

or

put the number of characters in "Hello World" -- 'chars' short for characters is also valid

Byte Length

put the number of bytes in "Hello World" -- use byte keyword in LiveCode for multi-byte Unicode

Yorick

Character Length

strlen("Hello, world!")

Z80 Assembly

The majority of Z80-based hardware predates Unicode so only byte length will be demonstrated for now.

Byte Length

Code is called as a subroutine, i.e. CALL GetStringLength.

; input: HL - pointer to the 0th char of a string. 
; outputs length to B. On return HL points at the terminator itself (the loop exits before incrementing past it).
; length is one-indexed and does not include the terminator. A null string will return 0 in B.
; A is overwritten with the last byte read. B is an 8-bit register, so the count wraps for strings longer than 255 bytes.

; "Terminator" is a label for a constant that can be configured in the source code. My code uses 0.
; Sample Usage:
;      ld hl,MyString
;      call GetStringLength

GetStringLength:
		ld b,0
loop_getStringLength:
		ld a,(hl)      ;load the next char
		cp Terminator  ;is it the terminator?
		ret z          ;if so, exit.
		inc hl         ;next char
		inc b          ;increment the byte count
		jr loop_getStringLength

zkl

Although zkl handles 8 bit ASCII natively, it also knows UTF-8 up to 6 bytes. In a string, "\uabcd" (hex) and "\U???;" encode 2 and n byte UTF-8 Unicode characters. It does not handle UTF-16 or 4 byte Unicode. It also knows about the bad spots in UTF-8 (such as overlongs).

If your terminal/editor deals with UTF-8 (mine doesn't), you don't need to use the escapes, just put the unicode characters in quotes (ie the editor inserts UTF bytes, which are non zero).

Character Length

"abc".len() //-->3
"\ufeff\u00A2 \u20ac".len() //-->9 "BOM¢ €"

Byte Length

"abc".len() //-->3
"\ufeff\u00A2 \u20ac".len() //-->9
Data(0,Int,"\ufeff\u00A2 \u20ac") //-->Data(9) (bytes)
"J\u0332o\u0332s\u0332e\u0301\u0332".len()  //-->14
"\U1D518;\U1D52B;\U1D526;\U1D520;\U1D52C;\U1D521;\U1D522;".len() //-->28

Character Length

UTF-8 characters are counted, modifiers (such as underscore) are counted as separate characters.

"abc".len(8) //-->3
"\ufeff\u00A2 \u20ac".len(8) //-->4 "BOM¢ €"
"\U1000;".len(8)  //-->Exception thrown: ValueError(Invalid UTF-8 string)
"\uD800" //-->SyntaxError : Line 2: Bad Unicode constant (\uD800-\uDFFF)
"J\u0332o\u0332s\u0332e\u0301\u0332".len(8) //-->9 "J̲o̲s̲é̲"
"\U1D518;\U1D52B;\U1D526;\U1D520;\U1D52C;\U1D521;\U1D522;".len(8) //-->7 "𝔘𝔫𝔦𝔠𝔬𝔡𝔢"

https://en.wikipedia.org/wiki/Comparison_of_programming_languages_%28string_functions%29#lengthProperty "Wikipedia" (as page type) with input value "https://en.wikipedia.org/wiki/Comparison_of_programming_languages_%28string_functions%29#length" contains invalid characters or is incomplete and therefore can cause unexpected results during a query or annotation process.

Zig

const std = @import("std");

/// Writes two lines to stdout for `string`: its code point and byte counts
/// as UTF-8, then the same two figures for its UTF-16 encoding.
/// The UTF-16 copy is allocated from `alloc` and is not freed here.
fn printResults(alloc: std.mem.Allocator, string: []const u8) !void {
    const out = std.io.getStdOut().writer();

    // UTF-8: the slice length is taken as the byte count (there is no sane,
    // portable extended ASCII to account for).
    const utf8_codepoints = try std.unicode.utf8CountCodepoints(string);
    const utf8_bytes = string.len;
    try out.print("utf8  codepoints = {d}, bytes = {d}\n", .{ utf8_codepoints, utf8_bytes });

    // UTF-16: count code points on a converted copy; the byte count is two
    // bytes per 16-bit code unit.
    const wide = try std.unicode.utf8ToUtf16LeWithNull(alloc, string);
    const utf16_codepoints = try std.unicode.utf16CountCodepoints(wide);
    const utf16_units = try std.unicode.calcUtf16LeLen(string);
    try out.print("utf16 codepoints = {d}, bytes = {d}\n", .{ utf16_codepoints, 2 * utf16_units });
}

/// Runs printResults over four sample strings, using an arena allocator
/// that is released when main returns.
pub fn main() !void {
    var arena_state = std.heap.ArenaAllocator.init(std.heap.page_allocator);
    defer arena_state.deinit();
    const arena = arena_state.allocator();

    const samples = [_][]const u8{
        "Hello, world!",
        "møøse",
        "𝔘𝔫𝔦𝔠𝔬𝔡𝔢",
        // \u{332} is underscore of previous character, which the browser may
        // not copy correctly
        "J\u{332}o\u{332}s\u{332}e\u{301}\u{332}",
    };
    for (samples) |sample| {
        try printResults(arena, sample);
    }
}
Output:
utf8  codepoints = 13, bytes = 13
utf16 codepoints = 13, bytes = 26
utf8  codepoints = 5, bytes = 7
utf16 codepoints = 5, bytes = 10
utf8  codepoints = 7, bytes = 28
utf16 codepoints = 7, bytes = 28
utf8  codepoints = 9, bytes = 14
utf16 codepoints = 9, bytes = 18