Tokenize a string: Difference between revisions

From Rosetta Code
Content added Content deleted
m (Using lang tags now.)
Line 34: Line 34:
</lang>
</lang>
=={{header|ALGOL 68}}==
=={{header|ALGOL 68}}==
main:(
<lang algol>main:(

OP +:= = (REF FLEX[]STRING in out, STRING item)VOID:(
OP +:= = (REF FLEX[]STRING in out, STRING item)VOID:(
[LWB in out: UPB in out+1]STRING new;
[LWB in out: UPB in out+1]STRING new;
new[LWB in out: UPB in out]:=in out;
new[LWB in out: UPB in out]:=in out;
new[UPB new]:=item;
new[UPB new]:=item;
in out := new
in out := new
);
);

PROC string split = (REF STRING beetles, STRING substr)[]STRING:(
PROC string split = (REF STRING beetles, STRING substr)[]STRING:(
""" Split beetles where substr is found """;
""" Split beetles where substr is found """;
FLEX[1:0]STRING out;
FLEX[1:0]STRING out;
INT start := 1, pos;
INT start := 1, pos;
WHILE string in string(substr, pos, beetles[start:]) DO
WHILE string in string(substr, pos, beetles[start:]) DO
out +:= STRING(beetles[start:start+pos-2]);
out +:= STRING(beetles[start:start+pos-2]);
start +:= pos + UPB substr - 1
start +:= pos + UPB substr - 1
OD;
OD;
IF start > LWB beetles THEN
IF start > LWB beetles THEN
out +:= STRING(beetles[start:])
out +:= STRING(beetles[start:])
FI;
FI;
out
out
);
);
PROC char split = (REF STRING beetles, STRING chars)[]STRING: (
PROC char split = (REF STRING beetles, STRING chars)[]STRING: (
""" Split beetles where character is found in chars """;
""" Split beetles where character is found in chars """;
FLEX[1:0]STRING out;
FLEX[1:0]STRING out;
FILE beetlef;
FILE beetlef;
associate(beetlef, beetles); # associate a FILE handle with a STRING #
associate(beetlef, beetles); # associate a FILE handle with a STRING #
make term(beetlef, chars); # make term: assign CSV string terminator #
make term(beetlef, chars); # make term: assign CSV string terminator #

PROC raise logical file end = (REF FILE f)BOOL: except logical file end;
PROC raise logical file end = (REF FILE f)BOOL: except logical file end;
on logical file end(beetlef, raise logical file end);
on logical file end(beetlef, raise logical file end);

STRING solo;
STRING solo;
DO
DO
getf(beetlef, ($g$, solo));
getf(beetlef, ($g$, solo));
out+:=solo;
out+:=solo;
getf(beetlef, ($x$)) # skip CHAR separator #
getf(beetlef, ($x$)) # skip CHAR separator #
OD;
OD;
except logical file end:
except logical file end:
SKIP;
SKIP;
out
out
);
);

STRING beetles := "John Lennon, Paul McCartney, George Harrison, Ringo Starr";
STRING beetles := "John Lennon, Paul McCartney, George Harrison, Ringo Starr";

printf(($g"."$, string split(beetles, ", "),$l$));
printf(($g"."$, string split(beetles, ", "),$l$));
printf(($g"."$, char split(beetles, ", "),$l$))
printf(($g"."$, char split(beetles, ", "),$l$))
)</lang>
)
Output:<pre>
Output:<pre>
John Lennon.Paul McCartney.George Harrison.Ringo Starr.
John Lennon.Paul McCartney.George Harrison.Ringo Starr.
Line 122: Line 122:


=={{header|C sharp|C#}}==
=={{header|C sharp|C#}}==
string str = "Hello,How,Are,You,Today";
<lang csharp>string str = "Hello,How,Are,You,Today";
// or Regex.Split ( "Hello,How,Are,You,Today", "," );
// or Regex.Split ( "Hello,How,Are,You,Today", "," );
// (Regex is in System.Text.RegularExpressions namespace
// (Regex is in System.Text.RegularExpressions namespace
string[] strings = str.Split(',');
string[] strings = str.Split(',');
foreach (string s in strings)
foreach (string s in strings)
{
{
Console.WriteLine (s + ".");
Console.WriteLine (s + ".");
}</lang>
}


=={{header|C++}}==
=={{header|C++}}==
Line 141: Line 141:
Note doxygen tags in comments before function, describing details of interface.
Note doxygen tags in comments before function, describing details of interface.


#include <string>
<lang cpp>#include <string>
#include <vector>
#include <vector>
/// \brief convert input string into vector of string tokens
/// \brief convert input string into vector of string tokens
///
///
/// \note consecutive delimiters will be treated as single delimiter
/// \note consecutive delimiters will be treated as single delimiter
/// \note delimiters are _not_ included in return data
/// \note delimiters are _not_ included in return data
///
///
/// \param input string to be parsed
/// \param input string to be parsed
/// \param delims list of delimiters.
/// \param delims list of delimiters.

std::vector<std::string> tokenize_str(const std::string & str,
std::vector<std::string> tokenize_str(const std::string & str,
const std::string & delims=", \t")
const std::string & delims=", \t")
{
{
using namespace std;
using namespace std;
// Skip delims at beginning, find start of first token
// Skip delims at beginning, find start of first token
string::size_type lastPos = str.find_first_not_of(delims, 0);
string::size_type lastPos = str.find_first_not_of(delims, 0);
// Find next delimiter @ end of token
// Find next delimiter @ end of token
string::size_type pos = str.find_first_of(delims, lastPos);
string::size_type pos = str.find_first_of(delims, lastPos);

// output vector
// output vector
vector<string> tokens;
vector<string> tokens;

while (string::npos != pos || string::npos != lastPos)
while (string::npos != pos || string::npos != lastPos)
{
{
// Found a token, add it to the vector.
// Found a token, add it to the vector.
tokens.push_back(str.substr(lastPos, pos - lastPos));
tokens.push_back(str.substr(lastPos, pos - lastPos));
// Skip delims. Note the "not_of". this is beginning of token
// Skip delims. Note the "not_of". this is beginning of token
lastPos = str.find_first_not_of(delims, pos);
lastPos = str.find_first_not_of(delims, pos);
// Find next delimiter at end of token.
// Find next delimiter at end of token.
pos = str.find_first_of(delims, lastPos);
pos = str.find_first_of(delims, lastPos);
}
}

return tokens;
return tokens;
}</lang>
}




here is sample usage code:
here is sample usage code:


#include <iostream>
<lang cpp>#include <iostream>
int main() {
int main() {
using namespace std;
using namespace std;
string s("Hello,How,Are,You,Today");
string s("Hello,How,Are,You,Today");

vector<string> v(tokenize_str(s));
vector<string> v(tokenize_str(s));

for (unsigned i = 0; i < v.size(); i++)
for (unsigned i = 0; i < v.size(); i++)
cout << v[i] << ".";
cout << v[i] << ".";
cout << endl;
cout << endl;
return 0;
return 0;
}</lang>
}


=={{header|D}}==
=={{header|D}}==


writefln( "Hello,How,Are,You,Today".split(",").join(".") );
<lang D>writefln( "Hello,How,Are,You,Today".split(",").join(".") );</lang>


=={{header|E}}==
=={{header|E}}==
".".rjoin("Hello,How,Are,You,Today".split(","))
<lang e>".".rjoin("Hello,How,Are,You,Today".split(","))</lang>


=={{header|Erlang}}==
=={{header|Erlang}}==
-module(tok).
<lang erlang>-module(tok).
-export([start/0]).
-export([start/0]).

start() ->
start() ->
Lst = string:tokens("Hello,How,Are,You,Today",","),
Lst = string:tokens("Hello,How,Are,You,Today",","),
io:fwrite("~s~n", [string:join(Lst,".")]),
io:fwrite("~s~n", [string:join(Lst,".")]),
ok.
ok.</lang>


=={{header|Forth}}==
=={{header|Forth}}==
There is no standard string split routine, but it is easily written. The results are saved temporarily to the dictionary.
There is no standard string split routine, but it is easily written. The results are saved temporarily to the dictionary.


: split ( str len separator len -- tokens count )
<lang forth>: split ( str len separator len -- tokens count )
here >r 2swap
here >r 2swap
begin
begin
2dup 2, \ save this token ( addr len )
2dup 2, \ save this token ( addr len )
2over search \ find next separator
2over search \ find next separator
while
while
dup negate here 2 cells - +! \ adjust last token length
dup negate here 2 cells - +! \ adjust last token length
2over nip /string \ start next search past separator
2over nip /string \ start next search past separator
repeat
repeat
2drop 2drop
2drop 2drop
r> here over - ( tokens length )
r> here over - ( tokens length )
dup negate allot \ reclaim dictionary
dup negate allot \ reclaim dictionary
2 cells / ; \ turn byte length into token count
2 cells / ; \ turn byte length into token count

: .tokens ( tokens count -- )
: .tokens ( tokens count -- )
1 ?do dup 2@ type ." ." cell+ cell+ loop 2@ type ;
1 ?do dup 2@ type ." ." cell+ cell+ loop 2@ type ;

s" Hello,How,Are,You,Today" s" ," split .tokens \ Hello.How.Are.You.Today
s" Hello,How,Are,You,Today" s" ," split .tokens \ Hello.How.Are.You.Today</lang>


=={{header|Fortran}}==
=={{header|Fortran}}==
{{works with|Fortran|90 and later}}
{{works with|Fortran|90 and later}}
PROGRAM Example
<lang fortran>PROGRAM Example

CHARACTER(23) :: str = "Hello,How,Are,You,Today"
CHARACTER(23) :: str = "Hello,How,Are,You,Today"
CHARACTER(5) :: word(5)
CHARACTER(5) :: word(5)
INTEGER :: pos1 = 1, pos2, n = 0, i
INTEGER :: pos1 = 1, pos2, n = 0, i

DO
pos2 = INDEX(str(pos1:), ",")
IF (pos2 == 0) THEN
n = n + 1
word(n) = str(pos1:)
EXIT
END IF
n = n + 1
word(n) = str(pos1:pos1+pos2-2)
pos1 = pos2+pos1
END DO

DO i = 1, n
WRITE(*,"(2A)", ADVANCE="NO") TRIM(word(i)), "."
END DO
END PROGRAM Example</lang>
DO
pos2 = INDEX(str(pos1:), ",")
IF (pos2 == 0) THEN
n = n + 1
word(n) = str(pos1:)
EXIT
END IF
n = n + 1
word(n) = str(pos1:pos1+pos2-2)
pos1 = pos2+pos1
END DO
DO i = 1, n
WRITE(*,"(2A)", ADVANCE="NO") TRIM(word(i)), "."
END DO
END PROGRAM Example
=={{header|Haskell}}==
=={{header|Haskell}}==


The necessary operations are unfortunately not in the standard library (yet), but simple to write:
The necessary operations are unfortunately not in the standard library (yet), but simple to write:


splitBy :: (a -> Bool) -> [a] -> <nowiki>[[a]]</nowiki>
<lang haskell>splitBy :: (a -> Bool) -> [a] -> <nowiki>[[a]]</nowiki>
splitBy _ [] = []
splitBy _ [] = []
splitBy f list = first : splitBy f (dropWhile f rest) where
splitBy f list = first : splitBy f (dropWhile f rest) where
(first, rest) = break f list
(first, rest) = break f list
splitRegex :: Regex -> String -> [String]
joinWith :: [a] -> <nowiki>[[a]]</nowiki> -> [a]
joinWith d xs = concat $ List.intersperse d xs
-- "concat $ intersperse" can be replaced with "intercalate" from the Data.List in GHC 6.8 and later


splitRegex :: Regex -> String -> [String]
putStrLn $ joinWith "." $ splitBy (== ',') $ "Hello,How,Are,You,Today"

joinWith :: [a] -> <nowiki>[[a]]</nowiki> -> [a]
-- using regular expression to split:
joinWith d xs = concat $ List.intersperse d xs
import Text.Regex
-- "concat $ intersperse" can be replaced with "intercalate" from the Data.List in GHC 6.8 and later
putStrLn $ joinWith "." $ splitRegex (mkRegex ',') $ "Hello,How,Are,You,Today"

putStrLn $ joinWith "." $ splitBy (== ',') $ "Hello,How,Are,You,Today"

-- using regular expression to split:
import Text.Regex
putStrLn $ joinWith "." $ splitRegex (mkRegex ',') $ "Hello,How,Are,You,Today"</lang>


=={{header|Groovy}}==
=={{header|Groovy}}==
Line 281: Line 281:


=={{header|Io}}==
=={{header|Io}}==
"Hello,How,Are,You,Today" split(",") join(".") println
<lang io>"Hello,How,Are,You,Today" split(",") join(".") println</lang>


=={{header|J}}==
=={{header|J}}==
s=: 'Hello,How,Are,You,Today'
<lang j> s=: 'Hello,How,Are,You,Today'
] t=: <;._1 ',',s
] t=: <;._1 ',',s
+-----+---+---+---+-----+
+-----+---+---+---+-----+
|Hello|How|Are|You|Today|
|Hello|How|Are|You|Today|
+-----+---+---+---+-----+
+-----+---+---+---+-----+
; t,&.>'.'
; t,&.>'.'
Hello.How.Are.You.Today.
Hello.How.Are.You.Today.

'.' (I.','=s)}s NB. two steps combined
'.' (I.','=s)}s NB. two steps combined
Hello.How.Are.You.Today
Hello.How.Are.You.Today</lang>


=={{header|Java}}==
=={{header|Java}}==
Line 300: Line 300:
There are multiple ways to tokenize a String in Java. The first is by splitting the String into an array of Strings, and the other way is to use StringTokenizer with a delimiter. The second way given here will skip any empty tokens. So if two commas are given in line, there will be an empty string in the array given by the split function, but no empty string with the StringTokenizer object.
There are multiple ways to tokenize a String in Java. The first is by splitting the String into an array of Strings, and the other way is to use StringTokenizer with a delimiter. The second way given here will skip any empty tokens. So if two commas are given in line, there will be an empty string in the array given by the split function, but no empty string with the StringTokenizer object.


String toTokenize = "Hello,How,Are,You,Today";
<lang java5>String toTokenize = "Hello,How,Are,You,Today";

//First way
//First way
String word[] = toTokenize.split(",");
String word[] = toTokenize.split(",");
for(int i=0; i<word.length; i++) {
for(int i=0; i<word.length; i++) {
System.out.print(word[i] + ".");
System.out.print(word[i] + ".");
}
}

//Second way
//Second way
StringTokenizer tokenizer = new StringTokenizer(toTokenize, ",");
StringTokenizer tokenizer = new StringTokenizer(toTokenize, ",");
while(tokenizer.hasMoreTokens()) {
while(tokenizer.hasMoreTokens()) {
System.out.print(tokenizer.nextToken() + ".");
System.out.print(tokenizer.nextToken() + ".");
}</lang>
}


=={{header|JavaScript}}==
=={{header|JavaScript}}==
{{works with|Firefox|2.0}}
{{works with|Firefox|2.0}}


alert( "Hello,How,Are,You,Today".split(",").join(".") );
<lang javascript>alert( "Hello,How,Are,You,Today".split(",").join(".") );</lang>


=={{header|Logo}}==
=={{header|Logo}}==
{{works with|UCB Logo}}
{{works with|UCB Logo}}
to split :str :sep
<lang logo>to split :str :sep
output parse map [ifelse ? = :sep ["| |] [?]] :str
output parse map [ifelse ? = :sep ["| |] [?]] :str
end
end</lang>

? show split "Hello,How,Are,You,Today ",
<lang logo> ? show split "Hello,How,Are,You,Today ",
[Hello How Are You Today]
[Hello How Are You Today]</lang>


=={{header|MAXScript}}==
=={{header|MAXScript}}==
output = ""
<lang maxscript>output = ""
for word in (filterString "Hello,How,Are,You,Today" ",") do
for word in (filterString "Hello,How,Are,You,Today" ",") do
(
(
output += (word + ".")
output += (word + ".")
)
)
format "%\n" output
format "%\n" output</lang>


=={{header|Objective-C}}==
=={{header|Objective-C}}==
Line 403: Line 403:
As a one liner without a trailing period, and most efficient way of doing it as you don't have to define an array.
As a one liner without a trailing period, and most efficient way of doing it as you don't have to define an array.


print join('.', split(/,/, "Hello,How,Are,You,Today"));
<lang perl>print join('.', split(/,/, "Hello,How,Are,You,Today"));</lang>


If you needed to keep an array for later use, again no trailing period
If you needed to keep an array for later use, again no trailing period


my @words = split(/,/, "Hello,How,Are,You,Today");
<lang perl>my @words = split(/,/, "Hello,How,Are,You,Today");
print join('.', @words);
print join('.', @words);</lang>


If you really want a trailing period, here is an example
If you really want a trailing period, here is an example


my @words = split(/,/, "Hello,How,Are,You,Today");
<lang perl>my @words = split(/,/, "Hello,How,Are,You,Today");
print $_.'.' for (@words);
print $_.'.' for (@words);</lang>


=={{header|PHP}}==
=={{header|PHP}}==
{{works with|PHP|5.x}}
{{works with|PHP|5.x}}


<?php
<lang php><?php
$str = 'Hello,How,Are,You,Today';
$str = 'Hello,How,Are,You,Today';
echo implode('.', explode(',', $str));
echo implode('.', explode(',', $str));
?></lang>
?>


=={{header|Pop11}}==
=={{header|Pop11}}==
Line 430: Line 430:
First show the use of sysparse_string to break up a string and make a list of strings.
First show the use of sysparse_string to break up a string and make a list of strings.


;;; Make a list of strings from a string using space as separator
<lang pop11>;;; Make a list of strings from a string using space as separator
lvars list;
lvars list;
sysparse_string('the cat sat on the mat') -> list;
sysparse_string('the cat sat on the mat') -> list;
;;; print the list of strings
;;; print the list of strings
list =>
list =>
** [the cat sat on the mat]
** [the cat sat on the mat]</lang>


By giving it an extra parameter 'true' we can make it recognize numbers and produce a list of strings and numbers
By giving it an extra parameter 'true' we can make it recognize numbers and produce a list of strings and numbers


lvars list;
<lang pop11>lvars list;
sysparse_string('one 1 two 2 three 3 four 4', true) -> list;
sysparse_string('one 1 two 2 three 3 four 4', true) -> list;
;;; print the list of strings and numbers
;;; print the list of strings and numbers
list =>
list =>
** [one 1 two 2 three 3 four 4]
** [one 1 two 2 three 3 four 4]
;;; check that first item is a string and second an integer
;;; check that first item is a string and second an integer
isstring(list(1))=>
isstring(list(1))=>
** <true>
** <true>
isinteger(list(2))=>
isinteger(list(2))=>
** <true>
** <true></lang>


Now show some uses of the built in procedure sys_parse_string, which allows more options:
Now show some uses of the built in procedure sys_parse_string, which allows more options:


;;; Make pop-11 print strings with quotes
<lang pop11>;;; Make pop-11 print strings with quotes
true -> pop_pr_quotes;
true -> pop_pr_quotes;
;;;
;;;
;;; Create a string of tokens using comma as token separator
;;; Create a string of tokens using comma as token separator
lvars str='Hello,How,Are,You,Today';
lvars str='Hello,How,Are,You,Today';
;;;
;;;
;;; Make a list of strings by applying sys_parse_string
;;; Make a list of strings by applying sys_parse_string
;;; to str, using the character `,` as separator (the default
;;; to str, using the character `,` as separator (the default
;;; separator, if none is provided, is the space character).
;;; separator, if none is provided, is the space character).
lvars strings;
lvars strings;
[% sys_parse_string(str, `,`) %] -> strings;
[% sys_parse_string(str, `,`) %] -> strings;
;;;
;;;
;;; print the list of strings
;;; print the list of strings
strings =>
strings =>
** ['Hello' 'How' 'Are' 'You' 'Today']
** ['Hello' 'How' 'Are' 'You' 'Today']</lang>

If {% ... %} were used instead of [% ... %] the result would be
If {% ... %} were used instead of [% ... %] the result would be
a vector (i.e. array) of strings rather than a list of strings.
a vector (i.e. array) of strings rather than a list of strings.


{% sys_parse_string(str, `,`) %} -> strings;
<lang pop11>{% sys_parse_string(str, `,`) %} -> strings;
;;; print the vector
;;; print the vector
strings =>
strings =>
** {'Hello' 'How' 'Are' 'You' 'Today'}
** {'Hello' 'How' 'Are' 'You' 'Today'}</lang>
It is also possible to give sys_parse_string a 'conversion' procedure, which is applied to each of the tokens.
It is also possible to give sys_parse_string a 'conversion' procedure, which is applied to each of the tokens.
E.g. it could be used to produce a vector of numbers, using the conversion procedure 'strnumber', which converts a string to a number:
E.g. it could be used to produce a vector of numbers, using the conversion procedure 'strnumber', which converts a string to a number:


lvars numbers;
<lang pop11>lvars numbers;
{% sys_parse_string('100 101 102 103 99.9 99.999', strnumber) %} -> numbers;
{% sys_parse_string('100 101 102 103 99.9 99.999', strnumber) %} -> numbers;
;;; the result is a vector containing integers and floats,
;;; the result is a vector containing integers and floats,
;;; which can be printed thus:
;;; which can be printed thus:
numbers =>
numbers =>
** {100 101 102 103 99.9 99.999}
** {100 101 102 103 99.9 99.999}</lang>


Using lower level pop-11 facilities to tokenise the string:
Using lower level pop-11 facilities to tokenise the string:


;;; Declare and initialize variables
<lang pop11>;;; Declare and initialize variables
lvars str='Hello,How,Are,You,Today';
lvars str='Hello,How,Are,You,Today';
;;; Iterate over string
;;; Iterate over string
lvars ls = [], i, j = 1;
lvars ls = [], i, j = 1;
for i from 1 to length(str) do
for i from 1 to length(str) do
;;; If comma
;;; If comma
if str(i) = `,` then
if str(i) = `,` then
;;; Prepend word (substring) to list
;;; Prepend word (substring) to list
cons(substring(j, i - j, str), ls) -> ls;
cons(substring(j, i - j, str), ls) -> ls;
i + 1 -> j;
i + 1 -> j;
endif;
endif;
endfor;
endfor;
;;; Prepend final word (if needed)
;;; Prepend final word (if needed)
if j <= length(str) then
if j <= length(str) then
cons(substring(j, length(str) - j + 1, str), ls) -> ls;
cons(substring(j, length(str) - j + 1, str), ls) -> ls;
endif;
endif;
;;; Reverse the list
;;; Reverse the list
rev(ls) -> ls;
rev(ls) -> ls;</lang>


Since the task requires to use array we convert list to array
Since the task requires to use array we convert list to array


;;; Put list elements and length on the stack
<lang pop11>;;; Put list elements and length on the stack
destlist(ls);
destlist(ls);
;;; Build a vector from them
;;; Build a vector from them
lvars ar = consvector();
lvars ar = consvector();
;;; Display in a loop, putting trailing period
;;; Display in a loop, putting trailing period
for i from 1 to length(ar) do
for i from 1 to length(ar) do
printf(ar(i), '%s.');
printf(ar(i), '%s.');
endfor;
endfor;
printf('\n');
printf('\n');</lang>


We could use list directly for printing:
We could use list directly for printing:


for i in ls do
<lang pop11>for i in ls do
printf(i, '%s.');
printf(i, '%s.');
endfor;
endfor;</lang>


so the conversion to vector is purely to satisfy task formulation.
so the conversion to vector is purely to satisfy task formulation.
Line 530: Line 530:
{{works with|Python|2.5}}
{{works with|Python|2.5}}


text = "Hello,How,Are,You,Today"
<lang python>text = "Hello,How,Are,You,Today"
tokens = text.split(',')
tokens = text.split(',')
print '.'.join(tokens)
print '.'.join(tokens)</lang>


If you want to print each word on its own line:
If you want to print each word on its own line:


for token in tokens:
<lang python>for token in tokens:
print token
print token</lang>


or
or


print "\n".join(tokens)
<lang python>print "\n".join(tokens)</lang>


or the one liner
or the one liner


print '.'.join('Hello,How,Are,You,Today'.split(','))
<lang python>print '.'.join('Hello,How,Are,You,Today'.split(','))</lang>


=={{header|Raven}}==
=={{header|Raven}}==
'Hello,How,Are,You,Today' ',' split '.' join print
<lang raven>'Hello,How,Are,You,Today' ',' split '.' join print</lang>


=={{header|Ruby}}==
=={{header|Ruby}}==
string = "Hello,How,Are,You,Today".split(',')
<lang ruby> string = "Hello,How,Are,You,Today".split(',')
string.each do |w|
string.each do |w|
print "#{w}."
print "#{w}."
end
end


puts "Hello,How,Are,You,Today".split(',').join('.')
puts "Hello,How,Are,You,Today".split(',').join('.')</lang>


=={{header|Seed7}}==
=={{header|Seed7}}==
var array string: tokens is 0 times "";
<lang seed7>var array string: tokens is 0 times "";


tokens := split("Hello,How,Are,You,Today", ",");
tokens := split("Hello,How,Are,You,Today", ",");</lang>


=={{header|Smalltalk}}==
=={{header|Smalltalk}}==
|array |
<lang smalltalk>|array |
array := 'Hello,How,Are,You,Today' subStrings: $,.
array := 'Hello,How,Are,You,Today' subStrings: $,.
array fold: [:concatenation :string | concatenation, '.', string ]
array fold: [:concatenation :string | concatenation, '.', string ]</lang>


Some implementations also have a ''join:'' convenience method that allows the following shorter solution:
Some implementations also have a ''join:'' convenience method that allows the following shorter solution:


('Hello,How,Are,You,Today' subStrings: $,) join: '.'
<lang smalltalk>('Hello,How,Are,You,Today' subStrings: $,) join: '.'</lang>


The solution displaying a trailing period would be:
The solution displaying a trailing period would be:


|array |
<lang smalltalk>|array |
array := 'Hello,How,Are,You,Today' subStrings: $,.
array := 'Hello,How,Are,You,Today' subStrings: $,.
array inject: '' into: [:concatenation :string | concatenation, string, '.' ]
array inject: '' into: [:concatenation :string | concatenation, string, '.' ]</lang>


=={{header|Standard ML}}==
=={{header|Standard ML}}==
val splitter = String.tokens (fn c => c = #",");
<lang sml>val splitter = String.tokens (fn c => c = #",");
val main = (String.concatWith ".") o splitter;
val main = (String.concatWith ".") o splitter;</lang>


Test:
Test:


- main "Hello,How,Are,You,Today"
<lang sml>- main "Hello,How,Are,You,Today"
<i>val it = "Hello.How.Are.You.Today" : string</i>
val it = "Hello.How.Are.You.Today" : string</lang>


=={{header|Tcl}}==
=={{header|Tcl}}==
Generating a list from a string by splitting on a comma:
Generating a list from a string by splitting on a comma:


split string ,
<lang tcl>split string ,</lang>


Joining the elements of a list by a period:
Joining the elements of a list by a period:


join list .
<lang tcl>join list .</lang>


Thus the whole thing would look like this:
Thus the whole thing would look like this:


puts [join [split "Hello,How,Are,You,Today" ,] .]
<lang tcl>puts [join [split "Hello,How,Are,You,Today" ,] .]</lang>


If you'd like to retain the list in a variable with the name "words", it would only be marginally more complex:
If you'd like to retain the list in a variable with the name "words", it would only be marginally more complex:


puts [join [set words [split "Hello,How,Are,You,Today" ,]] .]
<lang tcl>puts [join [set words [split "Hello,How,Are,You,Today" ,]] .]</lang>




=={{header|UnixPipes}}==
=={{header|UnixPipes}}==


rtoken() {
<lang bash>rtoken() {
(IFS=\ read A B ; echo $A; test -n "$B" && (echo $B | token) )
(IFS=\ read A B ; echo $A; test -n "$B" && (echo $B | token) )
}
}


tokens() {
tokens() {
IFS=, read A ; echo $A | rtoken
IFS=, read A ; echo $A | rtoken
}
}


echo "Hello,How,Are,You" | tokens
echo "Hello,How,Are,You" | tokens</lang>

Revision as of 03:24, 13 February 2009

Task
Tokenize a string
You are encouraged to solve this task according to the task description, using any language you may know.

Separate the string "Hello,How,Are,You,Today" by commas into an array (or list) so that each element of it stores a different word. Display the words to the 'user', in the simplest manner possible, separated by a period. To simplify, you may display a trailing period.

ActionScript

<lang actionscript> var hello:String = "Hello,How,Are,You,Today"; var tokens:Array = hello.split(","); trace(tokens.join("."));

// Or as a one-liner
trace("Hello,How,Are,You,Today".split(",").join("."));
</lang>

Ada

<lang ada>

with Ada.Strings.Fixed; use Ada.Strings.Fixed;
with Ada.Text_Io; use Ada.Text_Io;

-- Splits Source_String at every comma and prints each token followed by a
-- period.
--
-- Index_List(K) records the position at which token K starts.  Each pass
-- of the loop locates the next comma at or after the current token start,
-- stores the start of the following token (one past that comma), and
-- prints the slice between the two starts minus the separator.
--
-- At most 255 tokens can be handled because Index_List has 256 slots.
procedure Parse_Commas is
   Source_String : String := "Hello,How,Are,You,Today";
   Index_List : array(1..256) of Natural;
   Next_Index : Natural := 1;
begin
   Index_List(Next_Index) := 1;
   -- "<=" rather than "<": a final token that is a single character starts
   -- exactly at Source_String'Last and must still be emitted (e.g. "a,b",
   -- or the "b" in "a,,b").
   while Index_List(Next_Index) <= Source_String'Last loop
      Next_Index := Next_Index + 1;
      -- Index returns 0 when no comma is left, which makes this value 1.
      Index_List(Next_Index) := 1 + Index(Source_String(Index_List(Next_Index - 1)..Source_String'Last), ",");
      if Index_List(Next_Index) = 1 then
         -- No further comma: pretend one sits just past the end, so the
         -- slice below covers the remainder and the loop guard then fails.
         Index_List(Next_Index) := Source_String'Last + 2;
      end if;
      -- "- 2" steps back over the comma that precedes the next token start.
      Put(Source_String(Index_List(Next_Index - 1)..Index_List(Next_Index)-2) & ".");
   end loop;
end Parse_Commas;

</lang>

ALGOL 68

<lang algol>main:(
 # Demonstrates two ways of splitting a STRING into a row of STRINGs:  #
 # "string split" cuts on a whole separator substring, "char split"    #
 # cuts on any single character from a set. Both results are printed.  #

 # Append operator: builds a row one element longer than "in out",     #
 # copies the existing elements, stores "item" last and assigns the    #
 # new row back through the reference.                                 #
 OP +:=  = (REF FLEX[]STRING in out, STRING item)VOID:(
   [LWB in out: UPB in out+1]STRING new;
   new[LWB in out: UPB in out]:=in out;
   new[UPB new]:=item;
   in out := new
 );
 # Returns the pieces of "beetles" that lie between occurrences of the #
 # separator "substr".                                                 #
 PROC string split = (REF STRING beetles, STRING substr)[]STRING:(
   """ Split beetles where substr is found """;
   FLEX[1:0]STRING out;
   INT start := 1, pos;
   # Each search runs on the remainder beetles[start:]; pos presumably #
   # receives the match position relative to that remainder - confirm  #
   # against the "string in string" documentation.                     #
   WHILE string in string(substr, pos, beetles[start:]) DO
     out +:= STRING(beetles[start:start+pos-2]);
     # move start to the first character after the separator #
     start +:= pos + UPB substr - 1
   OD;
   # The tail is appended only once start has advanced past the lower  #
   # bound. NOTE(review): if substr never matches, start still equals  #
   # its initial value 1, so nothing is appended and the result looks  #
   # empty - confirm that this is intended.                            #
   IF start > LWB beetles THEN
     out +:= STRING(beetles[start:])
   FI;
   out
 );
 
 # Returns the pieces of "beetles" delimited by any character found in #
 # "chars", by reading the string through a FILE associated with it.   #
 PROC char split = (REF STRING beetles, STRING chars)[]STRING: (
   """ Split beetles where character is found in chars """;
   FLEX[1:0]STRING out;
   FILE beetlef;
   associate(beetlef, beetles); # associate a FILE handle with a STRING   #
   make term(beetlef, chars);   # make term: assign CSV string terminator # 
   # The end-of-file handler jumps to the label after the loop; that   #
   # jump is the only way out of the DO ... OD below.                  #
   PROC raise logical file end = (REF FILE f)BOOL: except logical file end;
   on logical file end(beetlef, raise logical file end);
   STRING solo;
   DO
     # read one token up to the next terminator character #
     getf(beetlef, ($g$, solo));
     out+:=solo;
     getf(beetlef, ($x$)) # skip CHAR separator #
   OD;
   except logical file end:
     SKIP;
   out
 );
 STRING beetles := "John Lennon, Paul McCartney, George Harrison, Ringo Starr";
 # First line: split on the two-character separator ", ".              #
 # Second line: "," and " " each act as a separator on their own, so   #
 # the sample output below shows words split apart and empty tokens.   #
 printf(($g"."$, string split(beetles, ", "),$l$));
 printf(($g"."$, char   split(beetles, ", "),$l$))

)</lang>

Output:

John Lennon.Paul McCartney.George Harrison.Ringo Starr.
John.Lennon..Paul.McCartney..George.Harrison..Ringo.Starr.

C

Works with: ANSI C
Works with: gcc version 3.3.3
Library: POSIX

This example uses the strtok() function to separate the tokens. This function is destructive (replacing token separators with '\0'), so we have to make a copy of the string (using strdup()) before tokenizing. strdup() is not part of ANSI C, but is available on most platforms. It can easily be implemented with a combination of strlen(), malloc(), and strcpy().

<lang c>

#include <string.h>
#include <stdio.h>
#include <stdlib.h>

/*
 * Split "Hello,How,Are,You,Today" at the commas into an array of at most
 * five tokens and print each token followed by a period.
 *
 * Returns 0 on success, EXIT_FAILURE if the working copy of the string
 * cannot be allocated.
 */
int main(void)
{
    char *a[5];                 /* room for the five expected tokens */
    const char *s = "Hello,How,Are,You,Today";
    int n = 0, nn;
    char *tok;
    char *ds = strdup(s);       /* strtok() overwrites separators, so tokenize a copy */

    if (ds == NULL) {
        return EXIT_FAILURE;
    }

    /*
     * Store tokens until the input is exhausted or the array is full.
     * Only non-NULL tokens are stored, so n is exactly the token count.
     */
    for (tok = strtok(ds, ","); tok != NULL && n < 5; tok = strtok(NULL, ",")) {
        a[n++] = tok;
    }

    /* Print only the slots that were filled; never hand NULL to "%s". */
    for (nn = 0; nn < n; ++nn) {
        printf("%s.", a[nn]);
    }
    putchar('\n');

    /* The tokens point into ds, so release it only after printing. */
    free(ds);

    return 0;
}
</lang>

C#

<lang csharp>string str = "Hello,How,Are,You,Today";
// or Regex.Split ( "Hello,How,Are,You,Today", "," );
// (Regex is in System.Text.RegularExpressions namespace)
string[] strings = str.Split(',');
foreach (string s in strings)
{

   Console.WriteLine (s + ".");

}</lang>

C++

Works with: ANSI C++
Works with: g++ version 3.4.4 (cygming special)
Library: STL

This is not the most efficient method as it involves redundant copies in the background, but it is very easy to use. In most cases it will be a good choice as long as it is not used as an inner loop in a performance critical system.

Note doxygen tags in comments before function, describing details of interface.

<lang cpp>#include <string>

#include <vector>

/// \brief convert input string into vector of string tokens /// /// \note consecutive delimiters will be treated as single delimiter /// \note delimiters are _not_ included in return data /// /// \param input string to be parsed /// \param delims list of delimiters.

std::vector<std::string> tokenize_str(const std::string & str,

                                     const std::string & delims=", \t")

{

 using namespace std;
 // Skip delims at beginning, find start of first token
 string::size_type lastPos = str.find_first_not_of(delims, 0);
 // Find next delimiter @ end of token
 string::size_type pos     = str.find_first_of(delims, lastPos);
 // output vector
 vector<string> tokens;
 while (string::npos != pos || string::npos != lastPos)
   {
     // Found a token, add it to the vector.
     tokens.push_back(str.substr(lastPos, pos - lastPos));
     // Skip delims.  Note the "not_of". this is beginning of token
     lastPos = str.find_first_not_of(delims, pos);
     // Find next delimiter at end of token.
     pos     = str.find_first_of(delims, lastPos);
   }
 return tokens;

}</lang>


here is sample usage code:

<lang cpp>#include <iostream>

// Tokenizes the sample string with tokenize_str() and prints every token
// followed by a period, then ends the line.
int main()
{
  using namespace std;
  string s("Hello,How,Are,You,Today");
  vector<string> v(tokenize_str(s));
  for (vector<string>::size_type i = 0; i < v.size(); i++)
    cout << v[i] << ".";

  cout << endl;
  return 0;
}</lang>

D

<lang D>writefln( "Hello,How,Are,You,Today".split(",").join(".") );</lang>

E

<lang e>".".rjoin("Hello,How,Are,You,Today".split(","))</lang>

Erlang

<lang erlang>-module(tok).
-export([start/0]).

%% Splits the sample string on commas and prints the tokens joined by
%% periods, followed by a newline. Returns ok.
start() ->
    Tokens = string:tokens("Hello,How,Are,You,Today", ","),
    Joined = string:join(Tokens, "."),
    io:fwrite("~s~n", [Joined]),
    ok.</lang>

Forth

There is no standard string split routine, but it is easily written. The results are saved temporarily to the dictionary.

<lang forth>: split ( str len separator len -- tokens count )
 \ Splits str at every occurrence of separator. Each token is recorded in
 \ the dictionary as an ( addr len ) pair pointing into the original string.
 \ The dictionary space is released again before returning, so the token
 \ table should be consumed before anything else is compiled or allotted.

 here >r 2swap       \ remember where the token table starts
 begin
   2dup 2,             \ save this token ( addr len )
   2over search        \ find next separator
 while
   dup negate  here 2 cells -  +!  \ adjust last token length
   2over nip /string               \ start next search past separator
 repeat
 2drop 2drop
 r>  here over -   ( tokens length )
 dup negate allot           \ reclaim dictionary
 2 cells / ;                \ turn byte length into token count
: .tokens ( tokens count -- )
 \ Prints the tokens separated by "."; no separator follows the last token.
 1 ?do dup 2@ type ." ." cell+ cell+ loop 2@ type ;

s" Hello,How,Are,You,Today" s" ," split .tokens \ Hello.How.Are.You.Today</lang>

Fortran

Works with: Fortran version 90 and later

<lang fortran>PROGRAM Example
  ! Splits a comma-separated string into words and prints each word
  ! followed by a period, all on one line.
  CHARACTER(23) :: str = "Hello,How,Are,You,Today"
  CHARACTER(5)  :: word(5)
  INTEGER :: first, comma, ntok, k

  first = 1
  ntok = 0
  DO
    ! Offset of the next comma relative to FIRST, or 0 when none is left.
    comma = INDEX(str(first:), ",")
    ntok = ntok + 1
    IF (comma == 0) THEN
      word(ntok) = str(first:)        ! last word: take the remainder
      EXIT
    END IF
    word(ntok) = str(first:first+comma-2)
    first = first + comma             ! resume just past the comma
  END DO

  DO k = 1, ntok
    WRITE(*,"(2A)", ADVANCE="NO") TRIM(word(k)), "."
  END DO

END PROGRAM Example</lang>

Haskell

The necessary operations are unfortunately not in the standard library (yet), but simple to write:

<lang haskell>-- Split a list wherever the predicate holds; runs of matching elements
-- count as a single separator.
splitBy :: (a -> Bool) -> [a] -> [[a]]
splitBy _ [] = []
splitBy f list = first : splitBy f (dropWhile f rest) where
  (first, rest) = break f list

-- Provided by Text.Regex (signature shown for reference only):
-- splitRegex :: Regex -> String -> [String]

-- Join the pieces with the delimiter d placed between them.
joinWith :: [a] -> [[a]] -> [a]
joinWith d xs = concat $ List.intersperse d xs
-- "concat $ intersperse" can be replaced with "intercalate" from the Data.List in GHC 6.8 and later

putStrLn $ joinWith "." $ splitBy (== ',') $ "Hello,How,Are,You,Today"

-- using regular expression to split:
import Text.Regex
-- mkRegex takes a String pattern, so the separator must be "," rather than ','
putStrLn $ joinWith "." $ splitRegex (mkRegex ",") $ "Hello,How,Are,You,Today"</lang>

Groovy

println 'Hello,How,Are,You,Today'.split(',').join('.')

Io

<lang io>"Hello,How,Are,You,Today" split(",") join(".") println</lang>

J

<lang j>   s=: 'Hello,How,Are,You,Today'
   ] t=: <;._1 ',',s
+-----+---+---+---+-----+
|Hello|How|Are|You|Today|
+-----+---+---+---+-----+
   ; t,&.>'.'
Hello.How.Are.You.Today.
   '.' (I.','=s)}s  NB. two steps combined
Hello.How.Are.You.Today</lang>

Java

Works with: Java version 1.0+

There are multiple ways to tokenize a String in Java. The first is by splitting the String into an array of Strings, and the other way is to use StringTokenizer with a delimiter. The second way given here will skip any empty tokens. So if two commas are given in line, there will be an empty string in the array given by the split function, but no empty string with the StringTokenizer object.

<lang java5>String toTokenize = "Hello,How,Are,You,Today";

//First way String word[] = toTokenize.split(","); for(int i=0; i<word.length; i++) {

   System.out.print(word[i] + ".");

}

//Second way StringTokenizer tokenizer = new StringTokenizer(toTokenize, ","); while(tokenizer.hasMoreTokens()) {

   System.out.print(tokenizer.nextToken() + ".");

}</lang>

JavaScript

Works with: Firefox version 2.0

<lang javascript>alert( "Hello,How,Are,You,Today".split(",").join(".") );</lang>

Logo

Works with: UCB Logo

<lang logo>to split :str :sep
 ; Replaces every character of :str equal to :sep with a space, then
 ; parses the result into a list of words.

 output parse map [ifelse ? = :sep ["| |] [?]] :str

end</lang>

<lang logo> ? show split "Hello,How,Are,You,Today ",

[Hello How Are You Today]</lang>

MAXScript

<lang maxscript>-- Append every comma-separated token, followed by a period, to output.
output = ""
for word in (filterString "Hello,How,Are,You,Today" ",") do
(
    output += (word + ".")
)
format "%\n" output</lang>

Objective-C

Works with: GNUstep
Works with: Cocoa

<lang objc>NSString *text = @"Hello,How,Are,You,Today";
// Split on commas, then re-join the pieces with periods.
NSArray *tokens = [text componentsSeparatedByString:@","];
NSString *result = [tokens componentsJoinedByString:@"."];
// Never pass data as the format string: a '%' in it would be read as a
// format specifier.
NSLog(@"%@", result);</lang>

OCaml

To split on a single-character separator: <lang ocaml>(* Splits [str] at every occurrence of the character [sep]; the result
   always holds at least one (possibly empty) string. *)
let rec split_char sep str =
  let len = String.length str in
  match (try Some (String.index str sep) with Not_found -> None) with
  | None -> [str]
  | Some pos ->
      let head = String.sub str 0 pos in
      head :: split_char sep (String.sub str (pos + 1) (len - pos - 1))</lang>

Or the tail-recursive equivalent:

<lang ocaml>(* Accumulator variant: tokens are collected in reverse and the list is put
   back in order once no separator remains. *)
let split_char sep str =
  let rec loop rev_tokens rest =
    match (try Some (String.index rest sep) with Not_found -> None) with
    | None -> List.rev (rest :: rev_tokens)
    | Some pos ->
        let token = String.sub rest 0 pos in
        let tail = String.sub rest (pos + 1) (String.length rest - pos - 1) in
        loop (token :: rev_tokens) tail
  in
  loop [] str
</lang>

But both of these versions perform an extraneous String.sub (and therefore an extra string allocation) at every step, to copy the remainder of the string. For N tokens there will be (N - 2) unneeded allocations. To avoid this, here is a version which first collects the indices and then extracts the tokens:

<lang ocaml>(* Index-based variant: first record where every token starts, then cut
   each token straight out of [str]. *)
let split_char sep str =
  let len = String.length str in
  (* Boundaries, most recent first. Each entry is the index just past a
     separator; 0 and [len + 1] act as sentinels for both ends. *)
  let rec collect bounds from =
    match (try Some (String.index_from str from sep) with Not_found -> None) with
    | Some pos -> collect ((pos + 1) :: bounds) (pos + 1)
    | None -> (len + 1) :: bounds
  in
  (* Walking the boundaries from the end rebuilds the tokens in order. *)
  let rec build tokens = function
    | stop :: (start :: _ as rest) ->
        build (String.sub str start (stop - start - 1) :: tokens) rest
    | _ -> tokens
  in
  build [] (collect [0] 0)</lang>

Splitting on a string separator using the regular expressions library: <lang ocaml>#load "str.cma";;

(* Splits [str] with Str.split, treating [sep] as a literal string rather
   than as a regular expression. *)
let split_str sep str =
  let delimiter = Str.regexp_string sep in
  Str.split delimiter str</lang>

There is already a library function for joining: <lang ocaml>String.concat sep strings</lang>

Perl

Works with: Perl version 5.X

As a one-liner without a trailing period. This is the most efficient way of doing it, as you don't have to define an array.

<lang perl>print join('.', split(/,/, "Hello,How,Are,You,Today"));</lang>

If you needed to keep an array for later use, again no trailing period

<lang perl>my @words = split(/,/, "Hello,How,Are,You,Today"); print join('.', @words);</lang>

If you really want a trailing period, here is an example

<lang perl>my @words = split(/,/, "Hello,How,Are,You,Today"); print $_.'.' for (@words);</lang>

PHP

Works with: PHP version 5.x

<lang php><?php $str = 'Hello,How,Are,You,Today'; echo implode('.', explode(',', $str)); ?></lang>

Pop11

The natural solution in Pop11 uses lists.

There are built in libraries for tokenising strings, illustrated below, along with code that the user could create for the task.

First show the use of sysparse_string to break up a string and make a list of strings.

<lang pop11>;;; Make a list of strings from a string using space as separator
lvars list;
sysparse_string('the cat sat on the mat') -> list;
;;; print the list of strings
list =>
** [the cat sat on the mat]</lang>

By giving it an extra parameter 'true' we can make it recognize numbers and produce a list of strings and numbers

<lang pop11>lvars list;
sysparse_string('one 1 two 2 three 3 four 4', true) -> list;
;;; print the list of strings and numbers
list =>
** [one 1 two 2 three 3 four 4]
;;; check that first item is a string and second an integer
isstring(list(1))=>
** <true>
isinteger(list(2))=>
** <true></lang>

Now show some uses of the built in procedure sys_parse_string, which allows more options:

<lang pop11>;;; Make pop-11 print strings with quotes
true -> pop_pr_quotes;

;;; Create a string of tokens using comma as token separator
lvars str='Hello,How,Are,You,Today';

;;; Make a list of strings by applying sys_parse_string
;;; to str, using the character `,` as separator (the default
;;; separator, if none is provided, is the space character).
lvars strings;
[% sys_parse_string(str, `,`) %] -> strings;

;;; print the list of strings
strings =>
** ['Hello' 'How' 'Are' 'You' 'Today']</lang>

If {% ... %} were used instead of [% ... %] the result would be a vector (i.e. array) of strings rather than a list of strings.

<lang pop11>{% sys_parse_string(str, `,`) %} -> strings;
;;; print the vector
strings =>
** {'Hello' 'How' 'Are' 'You' 'Today'}</lang>

It is also possible to give sys_parse_string a 'conversion' procedure, which is applied to each of the tokens. E.g. it could be used to produce a vector of numbers, using the conversion procedure 'strnumber', which converts a string to a number:

<lang pop11>lvars numbers;
{% sys_parse_string('100 101 102 103 99.9 99.999', strnumber) %} -> numbers;
;;; the result is a vector containing integers and floats,
;;; which can be printed thus
numbers =>
** {100 101 102 103 99.9 99.999}</lang>

Using lower level pop-11 facilities to tokenise the string:

<lang pop11>;;; Declare and initialize variables
lvars str='Hello,How,Are,You,Today';
;;; Iterate over string
lvars ls = [], i, j = 1;
for i from 1 to length(str) do
    ;;; If comma
    if str(i) = `,` then
       ;;; Prepend word (substring) to list
       cons(substring(j, i - j, str), ls) -> ls;
       i + 1 -> j;
    endif;
endfor;
;;; Prepend final word (if needed)
if j <= length(str) then
    cons(substring(j, length(str) - j + 1, str), ls) -> ls;
endif;
;;; Reverse the list
rev(ls) -> ls;</lang>

Since the task requires the use of an array, we convert the list to an array:

<lang pop11>;;; Put list elements and length on the stack
destlist(ls);
;;; Build a vector from them
lvars ar = consvector();
;;; Display in a loop, putting trailing period
for i from 1 to length(ar) do
   printf(ar(i), '%s.');
endfor;
printf('\n');</lang>

We could use list directly for printing:

<lang pop11>;;; Print every item of the list ls, each followed by a period
for i in ls do

   printf(i, '%s.');

endfor;</lang>

so the conversion to vector is purely to satisfy task formulation.

Python

Works with: Python version 2.5

<lang python>text = "Hello,How,Are,You,Today" tokens = text.split(',') print '.'.join(tokens)</lang>

If you want to print each word on its own line:

<lang python>for token in tokens:

   print token</lang>

or

<lang python>print "\n".join(tokens)</lang>

or the one liner

<lang python>print '.'.join('Hello,How,Are,You,Today'.split(','))</lang>

Raven

<lang raven>'Hello,How,Are,You,Today' ',' split '.' join print</lang>

Ruby

<lang ruby> string = "Hello,How,Are,You,Today".split(',')

    string.each do |w|
         print "#{w}."
    end
    puts "Hello,How,Are,You,Today".split(',').join('.')</lang>

Seed7

<lang seed7>var array string: tokens is 0 times "";

tokens := split("Hello,How,Are,You,Today", ",");</lang>

Smalltalk

<lang smalltalk>"Split on commas, then fold the tokens together with a period between them"
| tokens |
tokens := 'Hello,How,Are,You,Today' subStrings: $,.
tokens fold: [:joined :each | joined, '.', each ]</lang>

Some implementations also have a join: convenience method that allows the following shorter solution:

<lang smalltalk>('Hello,How,Are,You,Today' subStrings: $,) join: '.'</lang>

The solution displaying a trailing period would be:

<lang smalltalk>"inject:into: needs an initial value; start from the empty string"
| array |
array := 'Hello,How,Are,You,Today' subStrings: $,.
array inject: '' into: [:concatenation :string | concatenation, string, '.' ]</lang>

Standard ML

<lang sml>val splitter = String.tokens (fn c => c = #","); val main = (String.concatWith ".") o splitter;</lang>

Test:

<lang sml>- main "Hello,How,Are,You,Today" val it = "Hello.How.Are.You.Today" : string</lang>

Tcl

Generating a list from a string by splitting on a comma:

<lang tcl>split string ,</lang>

Joining the elements of a list by a period:

<lang tcl>join list .</lang>

Thus the whole thing would look like this:

<lang tcl>puts [join [split "Hello,How,Are,You,Today" ,] .]</lang>

If you'd like to retain the list in a variable with the name "words", it would only be marginally more complex:

<lang tcl>puts [join [set words [split "Hello,How,Are,You,Today" ,]] .]</lang>


UnixPipes

<lang bash>rtoken() {

  (IFS=\ read A B ; echo $A; test -n "$B" && (echo $B | token) )

}

# Reads one comma-separated line from stdin and prints each token on its own line.
tokens() {
   # IFS must still be "," when $A is expanded unquoted, otherwise the line is
   # never split on commas. The subshell keeps the changed IFS from leaking out.
   (IFS=, ; read A ; echo $A) | rtoken
}

echo "Hello,How,Are,You" | tokens</lang>