Sorensen–Dice coefficient: Difference between revisions

Legend: content added / content deleted
(Added C++ solution)
m (C++ - minor performance improvement)
Line 57: Line 57:
#include <vector>
#include <vector>


using bigram = std::pair<char, char>;
std::multiset<std::string> split(const std::string& phrase) {

std::multiset<std::string> result;
std::multiset<bigram> split(const std::string& phrase) {
std::multiset<bigram> result;
std::istringstream is(phrase);
std::istringstream is(phrase);
std::string word;
std::string word;
Line 67: Line 69:
size_t length = word.size();
size_t length = word.size();
if (length == 1) {
if (length == 1) {
result.emplace(1, word[0]);
result.emplace(word[0], '\0');
} else {
} else {
for (size_t i = 0; i + 1 < length; ++i) {
for (size_t i = 0; i + 1 < length; ++i) {
result.insert(std::string{word[i], word[i + 1]});
result.emplace(word[i], word[i + 1]);
}
}
}
}
Line 80: Line 82:
auto a = split(s1);
auto a = split(s1);
auto b = split(s2);
auto b = split(s2);
std::multiset<std::string> c;
std::multiset<bigram> c;
std::set_intersection(a.begin(), a.end(), b.begin(), b.end(),
std::set_intersection(a.begin(), a.end(), b.begin(), b.end(),
std::inserter(c, c.begin()));
std::inserter(c, c.begin()));