Tokenize a string: Difference between revisions

Content added Content deleted
(→‎{{header|MATLAB}}: works also in Octave)
(→‎{{header|C++}}: replaced old code which does manual string searching with a more typical approach (getline) and a more C++-specific (ctype). Also added a boost example.)
Line 289: Line 289:


=={{header|C++}}==
=={{header|C++}}==
{{works with|ANSI C++}}


{{libheader|STL}}
{{works with|C++98}}
std::getline() is typically used to tokenize a string on a single-character delimiter.

This is not the most efficient method, as it involves redundant copies behind the scenes, but it is very easy to use. In most cases it is a good choice, as long as it is not used in the inner loop of a performance-critical system.

Note the Doxygen tags in the comments before the function, which describe the details of its interface.


<lang cpp>#include <string>
<lang cpp>#include <string>
#include <sstream>
#include <vector>
#include <vector>
#include <iterator>
/// \brief convert input string into vector of string tokens
#include <iostream>
///
#include <algorithm>
/// \note consecutive delimiters will be treated as single delimiter
int main()
/// \note delimiters are _not_ included in return data
///
/// \param input string to be parsed
/// \param delims list of delimiters.

std::vector<std::string> tokenize_str(const std::string & str,
const std::string & delims=", \t")
{
{
std::string s = "Hello,How,Are,You,Today";
using namespace std;
std::vector<std::string> v;
// Skip delims at beginning, find start of first token
std::istringstream buf(s);
string::size_type lastPos = str.find_first_not_of(delims, 0);
for(std::string token; getline(buf, token, ','); )
// Find next delimiter @ end of token
v.push_back(token);
string::size_type pos = str.find_first_of(delims, lastPos);
copy(v.begin(), v.end(), std::ostream_iterator<std::string>(std::cout, "."));
std::cout << '\n';
}</lang>


{{works with|C++98}}
// output vector
C++ allows the user to redefine what is considered whitespace. If the delimiter is whitespace, tokenization becomes effortless.
vector<string> tokens;


<lang cpp>#include <string>
while (string::npos != pos || string::npos != lastPos)
#include <locale>
{
#include <sstream>
// Found a token, add it to the vector.
#include <vector>
tokens.push_back(str.substr(lastPos, pos - lastPos));
#include <iterator>
// Skip delims. Note the "not_of". this is beginning of token
#include <iostream>
lastPos = str.find_first_not_of(delims, pos);
#include <algorithm>
// Find next delimiter at end of token.
struct comma_ws : std::ctype<char> {
pos = str.find_first_of(delims, lastPos);
static const mask* make_table() {
    // Start from a copy of the classic character-classification table;
    // the copy is a function-local static, so the returned pointer stays
    // valid after this function returns.
    static std::vector<mask> table(classic_table(), classic_table() + table_size);
    // Additionally flag ',' as whitespace so that stream extraction
    // splits on commas as well.
    table[','] |= space;
    return &table.front();
}
}
comma_ws(std::size_t refs = 0) : ctype<char>(make_table(), false, refs) {}

};
return tokens;
int main()
{
    // Sample input: five comma-separated words.
    std::string text = "Hello,How,Are,You,Today";
    std::istringstream stream(text);

    // Install a locale whose ctype facet is comma_ws, so that operator>>
    // on this stream stops at commas as well as ordinary whitespace.
    stream.imbue(std::locale(stream.getloc(), new comma_ws));

    // Reading strings until end-of-stream yields one token per element.
    std::istream_iterator<std::string> first(stream), last;
    std::vector<std::string> tokens(first, last);

    // Print every token followed by a period, then end the line.
    std::copy(tokens.begin(), tokens.end(), std::ostream_iterator<std::string>(std::cout, "."));
    std::cout << '\n';
}</lang>
}</lang>


{{works with|C++98}}
{{libheader|boost}}
The boost library has multiple options for easy tokenization.


<lang cpp>#include <string>
Here is sample usage code:
#include <vector>

<lang cpp>#include <iostream>
#include <iterator>
#include <algorithm>
int main() {
#include <iostream>
using namespace std;
#include <boost/tokenizer.hpp>
string s("Hello,How,Are,You,Today");
int main()

{
vector<string> v(tokenize_str(s));
std::string s = "Hello,How,Are,You,Today";

boost::tokenizer<> tok(s);
for (unsigned i = 0; i < v.size(); i++)
std::vector<std::string> v(tok.begin(), tok.end());
cout << v[i] << ".";
copy(v.begin(), v.end(), std::ostream_iterator<std::string>(std::cout, "."));
cout << endl;
std::cout << '\n';
return 0;
}</lang>
}</lang>