Tokenize a string: Difference between revisions
Content added Content deleted
m (→{{header|Objective-C}}: works with) |
m (Using lang tags now.) |
||
Line 34: | Line 34: | ||
</lang> |
</lang> |
||
=={{header|ALGOL 68}}== |
=={{header|ALGOL 68}}== |
||
<lang algol>main:( |
|||
OP +:= = (REF FLEX[]STRING in out, STRING item)VOID:( |
|||
[LWB in out: UPB in out+1]STRING new; |
|||
new[LWB in out: UPB in out]:=in out; |
|||
new[UPB new]:=item; |
|||
in out := new |
|||
); |
|||
PROC string split = (REF STRING beetles, STRING substr)[]STRING:( |
|||
""" Split beetles where substr is found """; |
|||
FLEX[1:0]STRING out; |
|||
INT start := 1, pos; |
|||
WHILE string in string(substr, pos, beetles[start:]) DO |
|||
out +:= STRING(beetles[start:start+pos-2]); |
|||
start +:= pos + UPB substr - 1 |
|||
OD; |
|||
IF start > LWB beetles THEN |
|||
out +:= STRING(beetles[start:]) |
|||
FI; |
|||
out |
|||
); |
|||
PROC char split = (REF STRING beetles, STRING chars)[]STRING: ( |
|||
""" Split beetles where character is found in chars """; |
|||
FLEX[1:0]STRING out; |
|||
FILE beetlef; |
|||
associate(beetlef, beetles); # associate a FILE handle with a STRING # |
|||
make term(beetlef, chars); # make term: assign CSV string terminator # |
|||
PROC raise logical file end = (REF FILE f)BOOL: except logical file end; |
|||
on logical file end(beetlef, raise logical file end); |
|||
STRING solo; |
|||
DO |
|||
getf(beetlef, ($g$, solo)); |
|||
out+:=solo; |
|||
getf(beetlef, ($x$)) # skip CHAR separator # |
|||
OD; |
|||
except logical file end: |
|||
SKIP; |
|||
out |
|||
); |
|||
STRING beetles := "John Lennon, Paul McCartney, George Harrison, Ringo Starr"; |
|||
printf(($g"."$, string split(beetles, ", "),$l$)); |
|||
printf(($g"."$, char split(beetles, ", "),$l$)) |
|||
)</lang> |
|||
) |
|||
Output:<pre> |
Output:<pre> |
||
John Lennon.Paul McCartney.George Harrison.Ringo Starr. |
John Lennon.Paul McCartney.George Harrison.Ringo Starr. |
||
Line 122: | Line 122: | ||
=={{header|C sharp|C#}}== |
=={{header|C sharp|C#}}== |
||
<lang csharp>string str = "Hello,How,Are,You,Today"; |
|||
// or Regex.Split ( "Hello,How,Are,You,Today", "," ); |
|||
// (Regex is in the System.Text.RegularExpressions namespace) |
|||
string[] strings = str.Split(','); |
|||
foreach (string s in strings) |
|||
{ |
|||
Console.WriteLine (s + "."); |
|||
}</lang> |
|||
} |
|||
=={{header|C++}}== |
=={{header|C++}}== |
||
Line 141: | Line 141: | ||
Note the Doxygen tags in the comments before the function, which describe the details of its interface. |
Note the Doxygen tags in the comments before the function, which describe the details of its interface. |
||
<lang cpp>#include <string> |
|||
#include <vector> |
|||
/// \brief convert input string into vector of string tokens |
|||
/// |
|||
/// \note consecutive delimiters will be treated as single delimiter |
|||
/// \note delimiters are _not_ included in return data |
|||
/// |
|||
/// \param str string to be parsed |
|||
/// \param delims list of delimiters. |
|||
std::vector<std::string> tokenize_str(const std::string & str, |
|||
const std::string & delims=", \t") |
|||
{ |
|||
using namespace std; |
|||
// Skip delims at beginning, find start of first token |
|||
string::size_type lastPos = str.find_first_not_of(delims, 0); |
|||
// Find next delimiter @ end of token |
|||
string::size_type pos = str.find_first_of(delims, lastPos); |
|||
// output vector |
|||
vector<string> tokens; |
|||
while (string::npos != pos || string::npos != lastPos) |
|||
{ |
|||
// Found a token, add it to the vector. |
|||
tokens.push_back(str.substr(lastPos, pos - lastPos)); |
|||
// Skip delims. Note the "not_of". this is beginning of token |
|||
lastPos = str.find_first_not_of(delims, pos); |
|||
// Find next delimiter at end of token. |
|||
pos = str.find_first_of(delims, lastPos); |
|||
} |
|||
return tokens; |
|||
}</lang> |
|||
} |
|||
Here is sample usage code: |
Here is sample usage code: |
||
<lang cpp>#include <iostream> |
|||
int main() { |
|||
using namespace std; |
|||
string s("Hello,How,Are,You,Today"); |
|||
vector<string> v(tokenize_str(s)); |
|||
for (unsigned i = 0; i < v.size(); i++) |
|||
cout << v[i] << "."; |
|||
cout << endl; |
|||
return 0; |
|||
}</lang> |
|||
} |
|||
=={{header|D}}== |
=={{header|D}}== |
||
<lang D>writefln( "Hello,How,Are,You,Today".split(",").join(".") );</lang> |
|||
=={{header|E}}== |
=={{header|E}}== |
||
<lang e>".".rjoin("Hello,How,Are,You,Today".split(","))</lang> |
|||
=={{header|Erlang}}== |
=={{header|Erlang}}== |
||
<lang erlang>-module(tok). |
|||
-export([start/0]). |
|||
start() -> |
|||
Lst = string:tokens("Hello,How,Are,You,Today",","), |
|||
io:fwrite("~s~n", [string:join(Lst,".")]), |
|||
ok.</lang> |
|||
=={{header|Forth}}== |
=={{header|Forth}}== |
||
There is no standard string split routine, but it is easily written. The results are saved temporarily to the dictionary. |
There is no standard string split routine, but it is easily written. The results are saved temporarily to the dictionary. |
||
<lang forth>: split ( str len separator len -- tokens count ) |
|||
here >r 2swap |
|||
begin |
|||
2dup 2, \ save this token ( addr len ) |
|||
2over search \ find next separator |
|||
while |
|||
dup negate here 2 cells - +! \ adjust last token length |
|||
2over nip /string \ start next search past separator |
|||
repeat |
|||
2drop 2drop |
|||
r> here over - ( tokens length ) |
|||
dup negate allot \ reclaim dictionary |
|||
2 cells / ; \ turn byte length into token count |
|||
: .tokens ( tokens count -- ) |
|||
1 ?do dup 2@ type ." ." cell+ cell+ loop 2@ type ; |
|||
s" Hello,How,Are,You,Today" s" ," split .tokens \ Hello.How.Are.You.Today</lang> |
|||
=={{header|Fortran}}== |
=={{header|Fortran}}== |
||
{{works with|Fortran|90 and later}} |
{{works with|Fortran|90 and later}} |
||
<lang fortran>PROGRAM Example |
|||
CHARACTER(23) :: str = "Hello,How,Are,You,Today" |
|||
CHARACTER(5) :: word(5) |
|||
INTEGER :: pos1 = 1, pos2, n = 0, i |
|||
DO |
|||
pos2 = INDEX(str(pos1:), ",") |
|||
IF (pos2 == 0) THEN |
|||
n = n + 1 |
|||
word(n) = str(pos1:) |
|||
EXIT |
|||
END IF |
|||
n = n + 1 |
|||
word(n) = str(pos1:pos1+pos2-2) |
|||
pos1 = pos2+pos1 |
|||
END DO |
|||
DO i = 1, n |
|||
WRITE(*,"(2A)", ADVANCE="NO") TRIM(word(i)), "." |
|||
END DO |
|||
END PROGRAM Example</lang> |
|||
DO |
|||
pos2 = INDEX(str(pos1:), ",") |
|||
IF (pos2 == 0) THEN |
|||
n = n + 1 |
|||
word(n) = str(pos1:) |
|||
EXIT |
|||
END IF |
|||
n = n + 1 |
|||
word(n) = str(pos1:pos1+pos2-2) |
|||
pos1 = pos2+pos1 |
|||
END DO |
|||
DO i = 1, n |
|||
WRITE(*,"(2A)", ADVANCE="NO") TRIM(word(i)), "." |
|||
END DO |
|||
END PROGRAM Example |
|||
=={{header|Haskell}}== |
=={{header|Haskell}}== |
||
The necessary operations are unfortunately not in the standard library (yet), but simple to write: |
The necessary operations are unfortunately not in the standard library (yet), but simple to write: |
||
<lang haskell>splitBy :: (a -> Bool) -> [a] -> <nowiki>[[a]]</nowiki> |
|||
splitBy _ [] = [] |
|||
splitBy f list = first : splitBy f (dropWhile f rest) where |
|||
(first, rest) = break f list |
|||
splitRegex :: Regex -> String -> [String] |
|||
joinWith :: [a] -> <nowiki>[[a]]</nowiki> -> [a] |
|||
joinWith d xs = concat $ List.intersperse d xs |
|||
-- "concat $ intersperse" can be replaced with "intercalate" from the Data.List in GHC 6.8 and later |
|||
splitRegex :: Regex -> String -> [String] |
|||
putStrLn $ joinWith "." $ splitBy (== ',') $ "Hello,How,Are,You,Today" |
|||
joinWith :: [a] -> <nowiki>[[a]]</nowiki> -> [a] |
|||
-- using regular expression to split: |
|||
joinWith d xs = concat $ List.intersperse d xs |
|||
import Text.Regex |
|||
-- "concat $ intersperse" can be replaced with "intercalate" from the Data.List in GHC 6.8 and later |
|||
putStrLn $ joinWith "." $ splitRegex (mkRegex ',') $ "Hello,How,Are,You,Today" |
|||
putStrLn $ joinWith "." $ splitBy (== ',') $ "Hello,How,Are,You,Today" |
|||
-- using regular expression to split: |
|||
import Text.Regex |
|||
putStrLn $ joinWith "." $ splitRegex (mkRegex ',') $ "Hello,How,Are,You,Today"</lang> |
|||
=={{header|Groovy}}== |
=={{header|Groovy}}== |
||
Line 281: | Line 281: | ||
=={{header|Io}}== |
=={{header|Io}}== |
||
<lang io>"Hello,How,Are,You,Today" split(",") join(".") println</lang> |
|||
=={{header|J}}== |
=={{header|J}}== |
||
s=: 'Hello,How,Are,You,Today' |
<lang j> s=: 'Hello,How,Are,You,Today' |
||
] t=: <;._1 ',',s |
|||
+-----+---+---+---+-----+ |
|||
|Hello|How|Are|You|Today| |
|||
+-----+---+---+---+-----+ |
|||
; t,&.>'.' |
|||
Hello.How.Are.You.Today. |
|||
'.' (I.','=s)}s NB. two steps combined |
|||
Hello.How.Are.You.Today</lang> |
|||
=={{header|Java}}== |
=={{header|Java}}== |
||
Line 300: | Line 300: | ||
There are multiple ways to tokenize a String in Java. The first is by splitting the String into an array of Strings, and the other way is to use StringTokenizer with a delimiter. The second way given here will skip any empty tokens. So if two commas are given in a row, there will be an empty string in the array given by the split function, but no empty string with the StringTokenizer object. |
There are multiple ways to tokenize a String in Java. The first is by splitting the String into an array of Strings, and the other way is to use StringTokenizer with a delimiter. The second way given here will skip any empty tokens. So if two commas are given in a row, there will be an empty string in the array given by the split function, but no empty string with the StringTokenizer object. |
||
<lang java5>String toTokenize = "Hello,How,Are,You,Today"; |
|||
//First way |
|||
String word[] = toTokenize.split(","); |
|||
for(int i=0; i<word.length; i++) { |
|||
System.out.print(word[i] + "."); |
|||
} |
|||
//Second way |
|||
StringTokenizer tokenizer = new StringTokenizer(toTokenize, ","); |
|||
while(tokenizer.hasMoreTokens()) { |
|||
System.out.print(tokenizer.nextToken() + "."); |
|||
}</lang> |
|||
} |
|||
=={{header|JavaScript}}== |
=={{header|JavaScript}}== |
||
{{works with|Firefox|2.0}} |
{{works with|Firefox|2.0}} |
||
<lang javascript>alert( "Hello,How,Are,You,Today".split(",").join(".") );</lang> |
|||
=={{header|Logo}}== |
=={{header|Logo}}== |
||
{{works with|UCB Logo}} |
{{works with|UCB Logo}} |
||
<lang logo>to split :str :sep |
|||
output parse map [ifelse ? = :sep ["| |] [?]] :str |
|||
end</lang> |
|||
<lang logo> ? show split "Hello,How,Are,You,Today ", |
|||
[Hello How Are You Today] |
[Hello How Are You Today]</lang> |
||
=={{header|MAXScript}}== |
=={{header|MAXScript}}== |
||
<lang maxscript>output = "" |
|||
for word in (filterString "Hello,How,Are,You,Today" ",") do |
|||
( |
|||
output += (word + ".") |
|||
) |
|||
format "%\n" output</lang> |
|||
=={{header|Objective-C}}== |
=={{header|Objective-C}}== |
||
Line 403: | Line 403: | ||
As a one-liner without a trailing period; this is the most efficient way of doing it, as you don't have to define an array. |
As a one-liner without a trailing period; this is the most efficient way of doing it, as you don't have to define an array. |
||
<lang perl>print join('.', split(/,/, "Hello,How,Are,You,Today"));</lang> |
|||
If you needed to keep an array for later use, again no trailing period |
If you needed to keep an array for later use, again no trailing period |
||
<lang perl>my @words = split(/,/, "Hello,How,Are,You,Today"); |
|||
print join('.', @words);</lang> |
|||
If you really want a trailing period, here is an example |
If you really want a trailing period, here is an example |
||
<lang perl>my @words = split(/,/, "Hello,How,Are,You,Today"); |
|||
print $_.'.' for (@words);</lang> |
|||
=={{header|PHP}}== |
=={{header|PHP}}== |
||
{{works with|PHP|5.x}} |
{{works with|PHP|5.x}} |
||
<lang php><?php |
|||
$str = 'Hello,How,Are,You,Today'; |
|||
echo implode('.', explode(',', $str)); |
|||
?></lang> |
|||
?> |
|||
=={{header|Pop11}}== |
=={{header|Pop11}}== |
||
Line 430: | Line 430: | ||
First show the use of sysparse_string to break up a string and make a list of strings. |
First show the use of sysparse_string to break up a string and make a list of strings. |
||
<lang pop11>;;; Make a list of strings from a string using space as separator |
|||
lvars list; |
|||
sysparse_string('the cat sat on the mat') -> list; |
|||
;;; print the list of strings |
|||
list => |
|||
** [the cat sat on the mat]</lang> |
|||
By giving it an extra parameter 'true' we can make it recognize numbers and produce a list of strings and numbers |
By giving it an extra parameter 'true' we can make it recognize numbers and produce a list of strings and numbers |
||
<lang pop11>lvars list; |
|||
sysparse_string('one 1 two 2 three 3 four 4', true) -> list; |
|||
;;; print the list of strings and numbers |
|||
list => |
|||
** [one 1 two 2 three 3 four 4] |
|||
;;; check that first item is a string and second an integer |
|||
isstring(list(1))=> |
|||
** <true> |
|||
isinteger(list(2))=> |
|||
** <true></lang> |
|||
Now show some uses of the built in procedure sys_parse_string, which allows more options: |
Now show some uses of the built in procedure sys_parse_string, which allows more options: |
||
<lang pop11>;;; Make pop-11 print strings with quotes |
|||
true -> pop_pr_quotes; |
|||
;;; |
|||
;;; Create a string of tokens using comma as token separator |
|||
lvars str='Hello,How,Are,You,Today'; |
|||
;;; |
|||
;;; Make a list of strings by applying sys_parse_string |
|||
;;; to str, using the character `,` as separator (the default |
|||
;;; separator, if none is provided, is the space character). |
|||
lvars strings; |
|||
[% sys_parse_string(str, `,`) %] -> strings; |
|||
;;; |
|||
;;; print the list of strings |
|||
strings => |
|||
** ['Hello' 'How' 'Are' 'You' 'Today']</lang> |
|||
If {% ... %} were used instead of [% ... %] the result would be |
If {% ... %} were used instead of [% ... %] the result would be |
||
a vector (i.e. array) of strings rather than a list of strings. |
a vector (i.e. array) of strings rather than a list of strings. |
||
<lang pop11>{% sys_parse_string(str, `,`) %} -> strings; |
|||
;;; print the vector |
|||
strings => |
|||
** {'Hello' 'How' 'Are' 'You' 'Today'}</lang> |
|||
It is also possible to give sys_parse_string a 'conversion' procedure, which is applied to each of the tokens. |
It is also possible to give sys_parse_string a 'conversion' procedure, which is applied to each of the tokens. |
||
E.g. it could be used to produce a vector of numbers, using the conversion procedure 'strnumber', which converts a string to a number: |
E.g. it could be used to produce a vector of numbers, using the conversion procedure 'strnumber', which converts a string to a number: |
||
<lang pop11>lvars numbers; |
|||
{% sys_parse_string('100 101 102 103 99.9 99.999', strnumber) %} -> numbers; |
|||
;;; the result is a vector containing integers and floats, |
|||
;;; which can be printed thus: |
|||
numbers => |
|||
** {100 101 102 103 99.9 99.999}</lang> |
|||
Using lower level pop-11 facilities to tokenise the string: |
Using lower level pop-11 facilities to tokenise the string: |
||
<lang pop11>;;; Declare and initialize variables |
|||
lvars str='Hello,How,Are,You,Today'; |
|||
;;; Iterate over string |
|||
lvars ls = [], i, j = 1; |
|||
for i from 1 to length(str) do |
|||
;;; If comma |
|||
if str(i) = `,` then |
|||
;;; Prepend word (substring) to list |
|||
cons(substring(j, i - j, str), ls) -> ls; |
|||
i + 1 -> j; |
|||
endif; |
|||
endfor; |
|||
;;; Prepend final word (if needed) |
|||
if j <= length(str) then |
|||
cons(substring(j, length(str) - j + 1, str), ls) -> ls; |
|||
endif; |
|||
;;; Reverse the list |
|||
rev(ls) -> ls;</lang> |
|||
Since the task requires the use of an array, we convert the list to an array: |
Since the task requires the use of an array, we convert the list to an array: |
||
<lang pop11>;;; Put list elements and length on the stack |
|||
destlist(ls); |
|||
;;; Build a vector from them |
|||
lvars ar = consvector(); |
|||
;;; Display in a loop, putting trailing period |
|||
for i from 1 to length(ar) do |
|||
printf(ar(i), '%s.'); |
|||
endfor; |
|||
printf('\n');</lang> |
|||
We could use list directly for printing: |
We could use list directly for printing: |
||
<lang pop11>for i in ls do |
|||
printf(i, '%s.'); |
|||
endfor;</lang> |
|||
so the conversion to vector is purely to satisfy task formulation. |
so the conversion to vector is purely to satisfy task formulation. |
||
Line 530: | Line 530: | ||
{{works with|Python|2.5}} |
{{works with|Python|2.5}} |
||
<lang python>text = "Hello,How,Are,You,Today" |
|||
tokens = text.split(',') |
|||
print '.'.join(tokens)</lang> |
|||
If you want to print each word on its own line: |
If you want to print each word on its own line: |
||
<lang python>for token in tokens: |
|||
print token</lang> |
|||
or |
or |
||
<lang python>print "\n".join(tokens)</lang> |
|||
or the one liner |
or the one liner |
||
<lang python>print '.'.join('Hello,How,Are,You,Today'.split(','))</lang> |
|||
=={{header|Raven}}== |
=={{header|Raven}}== |
||
<lang raven>'Hello,How,Are,You,Today' ',' split '.' join print</lang> |
|||
=={{header|Ruby}}== |
=={{header|Ruby}}== |
||
string = "Hello,How,Are,You,Today".split(',') |
<lang ruby> string = "Hello,How,Are,You,Today".split(',') |
||
string.each do |w| |
string.each do |w| |
||
print "#{w}." |
print "#{w}." |
||
end |
end |
||
puts "Hello,How,Are,You,Today".split(',').join('.') |
puts "Hello,How,Are,You,Today".split(',').join('.')</lang> |
||
=={{header|Seed7}}== |
=={{header|Seed7}}== |
||
<lang seed7>var array string: tokens is 0 times ""; |
|||
tokens := split("Hello,How,Are,You,Today", ",");</lang> |
|||
=={{header|Smalltalk}}== |
=={{header|Smalltalk}}== |
||
<lang smalltalk>|array | |
|||
array := 'Hello,How,Are,You,Today' subStrings: $,. |
|||
array fold: [:concatenation :string | concatenation, '.', string ]</lang> |
|||
Some implementations also have a ''join:'' convenience method that allows the following shorter solution: |
Some implementations also have a ''join:'' convenience method that allows the following shorter solution: |
||
<lang smalltalk>('Hello,How,Are,You,Today' subStrings: $,) join: '.'</lang> |
|||
The solution displaying a trailing period would be: |
The solution displaying a trailing period would be: |
||
<lang smalltalk>|array | |
|||
array := 'Hello,How,Are,You,Today' subStrings: $,. |
|||
array inject: '' into: [:concatenation :string | concatenation, string, '.' ]</lang> |
|||
=={{header|Standard ML}}== |
=={{header|Standard ML}}== |
||
<lang sml>val splitter = String.tokens (fn c => c = #","); |
|||
val main = (String.concatWith ".") o splitter;</lang> |
|||
Test: |
Test: |
||
<lang sml>- main "Hello,How,Are,You,Today" |
|||
val it = "Hello.How.Are.You.Today" : string</lang> |
|||
=={{header|Tcl}}== |
=={{header|Tcl}}== |
||
Generating a list from a string by splitting on a comma: |
Generating a list from a string by splitting on a comma: |
||
<lang tcl>split string ,</lang> |
|||
Joining the elements of a list by a period: |
Joining the elements of a list by a period: |
||
<lang tcl>join list .</lang> |
|||
Thus the whole thing would look like this: |
Thus the whole thing would look like this: |
||
<lang tcl>puts [join [split "Hello,How,Are,You,Today" ,] .]</lang> |
|||
If you'd like to retain the list in a variable with the name "words", it would only be marginally more complex: |
If you'd like to retain the list in a variable with the name "words", it would only be marginally more complex: |
||
<lang tcl>puts [join [set words [split "Hello,How,Are,You,Today" ,]] .]</lang> |
|||
=={{header|UnixPipes}}== |
=={{header|UnixPipes}}== |
||
<lang bash>rtoken() { |
|||
(IFS=\ read A B ; echo $A; test -n "$B" && (echo $B | token) ) |
|||
} |
|||
tokens() { |
|||
IFS=, read A ; echo $A | rtoken |
|||
} |
|||
echo "Hello,How,Are,You" | tokens</lang> |