Tokenize a string: Difference between revisions
Content added Content deleted
(→{{header|MATLAB}}: works also in Octave) |
(→{{header|C++}}: replaced old code which does manual string searching with a more typical approach (getline) and a more C++-specific (ctype). Also added a boost example.) |
||
Line 289: | Line 289: | ||
=={{header|C++}}== |
=={{header|C++}}== |
||
⚫ | |||
{{ |
{{works with|C++98}} |
||
std::getline() is typically used to tokenize strings on a single-character delimiter. |
|||
This is not the most efficient method, as it involves redundant copies in the background, but it is very easy to use. In most cases it will be a good choice as long as it is not used as an inner loop in a performance-critical system. |
|||
Note the Doxygen tags in the comments before the function, which describe the details of its interface. |
|||
<lang cpp>#include <string> |
<lang cpp>#include <string> |
||
#include <sstream> |
|||
#include <vector> |
#include <vector> |
||
#include <iterator> |
|||
/// \brief convert input string into vector of string tokens |
|||
#include <iostream> |
|||
/// |
|||
#include <algorithm> |
|||
/// \note consecutive delimiters will be treated as single delimiter |
|||
⚫ | |||
/// \note delimiters are _not_ included in return data |
|||
/// |
|||
/// \param input string to be parsed |
|||
/// \param delims list of delimiters. |
|||
⚫ | |||
const std::string & delims=", \t") |
|||
{ |
{ |
||
⚫ | |||
using namespace std; |
|||
⚫ | |||
// Skip delims at beginning, find start of first token |
|||
std::istringstream buf(s); |
|||
string::size_type lastPos = str.find_first_not_of(delims, 0); |
|||
for(std::string token; getline(buf, token, ','); ) |
|||
// Find next delimiter @ end of token |
|||
v.push_back(token); |
|||
string::size_type pos = str.find_first_of(delims, lastPos); |
|||
copy(v.begin(), v.end(), std::ostream_iterator<std::string>(std::cout, ".")); |
|||
⚫ | |||
}</lang> |
|||
⚫ | |||
// output vector |
|||
C++ allows the user to redefine what is considered whitespace. If the delimiter is whitespace, tokenization becomes effortless. |
|||
⚫ | |||
<lang cpp>#include <string> |
|||
while (string::npos != pos || string::npos != lastPos) |
|||
#include <locale> |
|||
{ |
|||
#include <sstream> |
|||
// Found a token, add it to the vector. |
|||
#include <vector> |
|||
tokens.push_back(str.substr(lastPos, pos - lastPos)); |
|||
#include <iterator> |
|||
// Skip delims. Note the "not_of". this is beginning of token |
|||
#include <iostream> |
|||
lastPos = str.find_first_not_of(delims, pos); |
|||
#include <algorithm> |
|||
// Find next delimiter at end of token. |
|||
struct comma_ws : std::ctype<char> { |
|||
pos = str.find_first_of(delims, lastPos); |
|||
static const mask* make_table() { |
|||
static std::vector<mask> v(classic_table(), classic_table() + table_size); |
|||
v[','] |= space; // comma will be classified as whitespace |
|||
⚫ | |||
} |
} |
||
comma_ws(std::size_t refs = 0) : ctype<char>(make_table(), false, refs) {} |
|||
}; |
|||
return tokens; |
|||
int main() |
|||
{ |
|||
std::string s = "Hello,How,Are,You,Today"; |
|||
std::istringstream buf(s); |
|||
buf.imbue(std::locale(buf.getloc(), new comma_ws)); |
|||
std::istream_iterator<std::string> beg(buf), end; |
|||
⚫ | |||
copy(v.begin(), v.end(), std::ostream_iterator<std::string>(std::cout, ".")); |
|||
std::cout << '\n'; |
|||
}</lang> |
}</lang> |
||
{{works with|C++98}} |
|||
{{libheader|boost}} |
|||
The Boost library has multiple options for easy tokenization. |
|||
<lang cpp>#include <string> |
|||
Here is sample usage code: |
|||
#include <vector> |
|||
#include <iterator> |
|||
#include <algorithm> |
|||
⚫ | |||
#include <iostream> |
|||
using namespace std; |
|||
#include <boost/tokenizer.hpp> |
|||
⚫ | |||
int main() |
|||
{ |
|||
⚫ | |||
std::string s = "Hello,How,Are,You,Today"; |
|||
boost::tokenizer<> tok(s); |
|||
for (unsigned i = 0; i < v.size(); i++) |
|||
⚫ | |||
⚫ | |||
copy(v.begin(), v.end(), std::ostream_iterator<std::string>(std::cout, ".")) |
|||
cout << |
std::cout << '\n'; |
||
⚫ | |||
}</lang> |
}</lang> |
||