Tokenize a string: Difference between revisions

Content added Content deleted

Inline

@@ Line 289: / Line 289: @@
 =={{header|C++}}==
-{{works with|ANSI C++}}
-{{libheader|STL}}
+{{works with|C++98}}
+std::getline() is typically used to tokenize strings on a single-character delimiter.
-This is not the most efficient method as it involves redundant copies in the background, but it is very easy to use. In most cases it will be a good choice as long as it is not used as an inner loop in a performance critical system.
-Note doxygen tags in comments before function, describing details of interface.
 <lang cpp>#include <string>
+#include <sstream>
 #include <vector>
+#include <iterator>
-/// \brief convert input string into vector of string tokens
+#include <iostream>
-///
+#include <algorithm>
-/// \note consecutive delimiters will be treated as single delimiter
+int main()
-/// \note delimiters are _not_ included in return data
-///
-/// \param input string to be parsed
-/// \param delims list of delimiters.
-std::vector<std::string> tokenize_str(const std::string & str,
-                                      const std::string & delims=", \t")
 {
+    std::string s = "Hello,How,Are,You,Today";
-  using namespace std;
+    std::vector<std::string> v;
-  // Skip delims at beginning, find start of first token
+    std::istringstream buf(s);
-  string::size_type lastPos = str.find_first_not_of(delims, 0);
+    for(std::string token; getline(buf, token, ','); )
-  // Find next delimiter @ end of token
+        v.push_back(token);
-  string::size_type pos     = str.find_first_of(delims, lastPos);
+    copy(v.begin(), v.end(), std::ostream_iterator<std::string>(std::cout, "."));
+    std::cout << '\n';
+}</lang>
+{{works with|C++98}}
-  // output vector
+C++ allows the user to redefine what is considered whitespace. If the delimiter is whitespace, tokenization becomes effortless.
-  vector<string> tokens;
+<lang cpp>#include <string>
-  while (string::npos != pos || string::npos != lastPos)
+#include <locale>
-    {
+#include <sstream>
-      // Found a token, add it to the vector.
+#include <vector>
-      tokens.push_back(str.substr(lastPos, pos - lastPos));
+#include <iterator>
-      // Skip delims.  Note the "not_of". this is beginning of token
+#include <iostream>
-      lastPos = str.find_first_not_of(delims, pos);
+#include <algorithm>
-      // Find next delimiter at end of token.
+struct comma_ws : std::ctype<char> {
-      pos     = str.find_first_of(delims, lastPos);
+    static const mask* make_table() {
+    static std::vector<mask> v(classic_table(), classic_table() + table_size);
+        v[','] |= space;  // comma will be classified as whitespace
+        return &v[0];
     }
+    comma_ws(std::size_t refs = 0) : ctype<char>(make_table(), false, refs) {}
+};
-  return tokens;
+int main()
+{
+    std::string s = "Hello,How,Are,You,Today";
+    std::istringstream buf(s);
+    buf.imbue(std::locale(buf.getloc(), new comma_ws));
+    std::istream_iterator<std::string> beg(buf), end;
+    std::vector<std::string> v(beg, end);
+    copy(v.begin(), v.end(), std::ostream_iterator<std::string>(std::cout, "."));
+    std::cout << '\n';
 }</lang>
+{{works with|C++98}}
+{{libheader|boost}}
+The Boost library offers several options for easy tokenization.
+<lang cpp>#include <string>
-here is sample usage code:
+#include <vector>
-<lang cpp>#include <iostream>
+#include <iterator>
+#include <algorithm>
-int main() {
+#include <iostream>
-  using namespace std;
+#include <boost/tokenizer.hpp>
-  string s("Hello,How,Are,You,Today");
+int main()
+{
-  vector<string> v(tokenize_str(s));
+    std::string s = "Hello,How,Are,You,Today";
+    boost::tokenizer<> tok(s);
-  for (unsigned i  = 0; i < v.size(); i++)
+    std::vector<std::string> v(tok.begin(), tok.end());
-    cout << v[i] << ".";
+    copy(v.begin(), v.end(), std::ostream_iterator<std::string>(std::cout, "."));
-  cout << endl;
+    std::cout << '\n';
-  return 0;
 }</lang>