Longest common subsequence: Difference between revisions

Content added Content deleted
m (Switched std::map to C++11 std::unordered_map to reduce initial Match() overhead.)
(Resorted to verbose typedef and parameter names to help clarify their purpose.)
Line 490: Line 490:
typedef deque<uint32_t> THRESHOLD;
typedef deque<uint32_t> THRESHOLD;
typedef deque<uint32_t> INDEXES;
typedef deque<uint32_t> INDEXES;
typedef unordered_map<char, INDEXES> CHAR2INDEXES;
typedef unordered_map<char, INDEXES> CHAR_TO_INDEXES_MAP;
typedef deque<INDEXES*> MATCHES;
typedef deque<INDEXES*> MATCHES;


uint32_t Pairs(MATCHES& indexesOf2MatchedByIndex1, shared_ptr<Pair>* pairs) {
// The following implements the Hunt and Szymanski algorithm:
uint32_t Pairs(MATCHES& matches, shared_ptr<Pair>* pairs) {
auto trace = pairs != nullptr;
auto trace = pairs != nullptr;
PAIRS traces;
PAIRS traces;
Line 504: Line 503:
//
//
uint32_t index1 = 0;
uint32_t index1 = 0;
for (const auto& it1 : matches) {
for (const auto& it1 : indexesOf2MatchedByIndex1) {
if (!it1->empty()) {
if (!it1->empty()) {
auto dq2 = *it1;
auto dq2 = *it1;
Line 571: Line 570:


//
//
// Match() avoids incurring m*n comparisons by using the associative
// Match() avoids m*n comparisons by using CHAR_TO_INDEXES_MAP to
// memory implemented by CHAR2INDEXES to achieve O(m+n) performance,
// achieve O(m+n) performance, where m and n are the input lengths.
// where m and n are the input lengths.
//
//
// The lookup time can be assumed constant in the case of characters.
// The lookup time can be assumed constant in the case of characters.
Line 579: Line 577:
// time will be O(log(m+n)), at most.
// time will be O(log(m+n)), at most.
//
//
void Match(CHAR2INDEXES& indexes, MATCHES& matches,
void Match(CHAR_TO_INDEXES_MAP& indexesOf2MatchedByChar, MATCHES& indexesOf2MatchedByIndex1,
const string& s1, const string& s2) {
const string& s1, const string& s2) {
uint32_t index = 0;
uint32_t index = 0;
for (const auto& it : s2)
for (const auto& it : s2)
indexes[it].push_back(index++);
indexesOf2MatchedByChar[it].push_back(index++);


for (const auto& it : s1) {
for (const auto& it : s1) {
auto& dq2 = indexes[it];
auto& dq2 = indexesOf2MatchedByChar[it];
matches.push_back(&dq2);
indexesOf2MatchedByIndex1.push_back(&dq2);
}
}
}
}
Line 604: Line 602:
public:
public:
string Correspondence(const string& s1, const string& s2) {
string Correspondence(const string& s1, const string& s2) {
CHAR_TO_INDEXES_MAP indexesOf2MatchedByChar;
CHAR2INDEXES indexes;
MATCHES matches; // holds references into indexes
MATCHES indexesOf2MatchedByIndex1; // holds references into indexesOf2MatchedByChar
Match(indexes, matches, s1, s2);
Match(indexesOf2MatchedByChar, indexesOf2MatchedByIndex1, s1, s2);
shared_ptr<Pair> pairs; // obtain the LCS as index pairs
shared_ptr<Pair> pairs; // obtain the LCS as index pairs
auto length = Pairs(matches, &pairs);
auto length = Pairs(indexesOf2MatchedByIndex1, &pairs);
return Select(pairs, length, false, s1, s2);
return Select(pairs, length, false, s1, s2);
}
}