Bioinformatics/Global alignment: Difference between revisions
Content added Content deleted
m (Clarified a comment.) |
(New post.) |
||
Line 182: | Line 182: | ||
Total length 300 |
Total length 300 |
||
</pre> |
</pre> |
||
=={{header|C++}}== |
|||
<syntaxhighlight lang="c++"> |
|||
#include <algorithm> |
|||
#include <cstdint> |
|||
#include <iostream> |
|||
#include <numeric> |
|||
#include <unordered_map> |
|||
#include <unordered_set> |
|||
#include <string> |
|||
#include <vector> |
|||
// Print a report of the given string to the standard output device. |
|||
void print_report(const std::string& text) { |
|||
std::unordered_map<char, int32_t> bases; |
|||
for ( const char& ch : text ) { |
|||
bases[ch]++; |
|||
} |
|||
const int32_t total = std::accumulate(bases.begin(), bases.end(), 0, |
|||
[&](int32_t previous_sum, std::pair<char, int32_t> entry) { |
|||
return previous_sum + entry.second; |
|||
}); |
|||
std::cout << "Nucleotide counts for: " << ( ( text.length() > 50 ) ? "\n" : "" ); |
|||
std::cout << text << std::endl; |
|||
std::cout << "Bases: A " << bases['A'] << ", C: " << bases['C'] << ", G: " << bases['G'] << ", T: " << bases['T'] |
|||
<< ", total: " << total << "\n" << std::endl; |
|||
} |
|||
// Return all permutations of the given list of strings. |
|||
std::vector<std::vector<std::string>> permutations(std::vector<std::string>& list) { |
|||
int32_t indexes[list.size()] = {}; |
|||
std::vector<std::vector<std::string>> result; |
|||
result.push_back(list); |
|||
int32_t i = 0; |
|||
while ( (uint64_t) i < list.size() ) { |
|||
if ( indexes[i] < i ) { |
|||
const int j = ( i % 2 == 0 ) ? 0 : indexes[i]; |
|||
std::swap(list[i], list[j]); |
|||
result.push_back(list); |
|||
indexes[i]++; |
|||
i = 0; |
|||
} else { |
|||
indexes[i] = 0; |
|||
i++; |
|||
} |
|||
} |
|||
return result; |
|||
} |
|||
// Return 'before' concatenated with 'after', removing the longest suffix of 'before' that matches a prefix of 'after'. |
|||
std::string concatenate(const std::string& before, const std::string& after) { |
|||
for ( uint64_t i = 0; i < before.length(); ++i ) { |
|||
if ( after.starts_with(before.substr(i, before.length())) ) { |
|||
return before.substr(0, i) + after; |
|||
} |
|||
} |
|||
return before + after; |
|||
} |
|||
// Remove duplicate strings and strings which are substrings of other strings in the given list. |
|||
std::vector<std::string> deduplicate(const std::vector<std::string>& list) { |
|||
std::vector<std::string> singletons(list); |
|||
std::sort(singletons.begin(), singletons.end()); |
|||
singletons.erase(std::unique(singletons.begin(), singletons.end()), singletons.end()); |
|||
std::vector<std::string> result(singletons); |
|||
std::unordered_set<std::string> marked_for_removal; |
|||
for ( const std::string& test_word : result ) { |
|||
for ( const std::string& word : singletons ) { |
|||
if ( word != test_word && word.find(test_word) != std::string::npos ) { |
|||
marked_for_removal.emplace(test_word); |
|||
} |
|||
} |
|||
} |
|||
result.erase(std::remove_if(result.begin(), result.end(), |
|||
[&](std::string& word) { |
|||
return marked_for_removal.count(word) != 0; |
|||
} |
|||
), result.end()); |
|||
return result; |
|||
} |
|||
// Return a set containing all of the shortest common superstrings of the given list of strings. |
|||
std::unordered_set<std::string> shortest_common_superstrings(const std::vector<std::string>& list) { |
|||
std::vector<std::string> deduplicated = deduplicate(list); |
|||
std::unordered_set<std::string> shortest; |
|||
shortest.insert(std::reduce(list.begin(), list.end(), std::string(""))); |
|||
uint64_t shortest_length; |
|||
for ( const std::string& word : list ) { |
|||
shortest_length += word.length(); |
|||
} |
|||
for ( std::vector<std::string> permutation : permutations(deduplicated) ) { |
|||
std::string candidate; |
|||
for ( const std::string& word : permutation ) { |
|||
candidate = concatenate(candidate, word); |
|||
} |
|||
if ( candidate.length() < shortest_length ) { |
|||
shortest.clear(); |
|||
shortest.insert(candidate); |
|||
shortest_length = candidate.length(); |
|||
} else if ( candidate.length() == shortest_length ) { |
|||
shortest.insert(candidate); |
|||
} |
|||
} |
|||
return shortest; |
|||
} |
|||
int main() { |
|||
const std::vector<std::vector<std::string>> test_sequences = { |
|||
{ "TA", "AAG", "TA", "GAA", "TA" }, |
|||
{ "CATTAGGG", "ATTAG", "GGG", "TA" }, |
|||
{ "AAGAUGGA", "GGAGCGCAUC", "AUCGCAAUAAGGA" }, |
|||
{ "ATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTAT", |
|||
"GGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGT", |
|||
"CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA", |
|||
"TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC", |
|||
"AACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT", |
|||
"GCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTC", |
|||
"CGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATTCTGCTTATAACACTATGTTCT", |
|||
"TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC", |
|||
"CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATGCTCGTGC", |
|||
"GATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATT", |
|||
"TTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC", |
|||
"CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA", |
|||
"TCTCTTAAACTCCTGCTAAATGCTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGA" } }; |
|||
for ( const std::vector<std::string>& test : test_sequences ) { |
|||
for ( const std::string& superstring : shortest_common_superstrings(test) ) { |
|||
print_report(superstring); |
|||
} |
|||
} |
|||
} |
|||
</syntaxhighlight> |
|||
<pre> |
|||
Nucleotide counts for: TAGAAG |
|||
Bases: A 3, C: 0, G: 2, T: 1, total: 6 |
|||
Nucleotide counts for: TAAGAA |
|||
Bases: A 4, C: 0, G: 1, T: 1, total: 6 |
|||
Nucleotide counts for: GAAGTA |
|||
Bases: A 3, C: 0, G: 2, T: 1, total: 6 |
|||
Nucleotide counts for: CATTAGGG |
|||
Bases: A 2, C: 1, G: 3, T: 2, total: 8 |
|||
Nucleotide counts for: AAGAUGGAGCGCAUCGCAAUAAGGA |
|||
Bases: A 10, C: 4, G: 8, T: 0, total: 25 |
|||
Nucleotide counts for: |
|||
CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATGCTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA |
|||
Bases: A 74, C: 57, G: 75, T: 94, total: 300 |
|||
</pre> |
|||
=={{header|Go}}== |
=={{header|Go}}== |
||
{{trans|Julia}} |
{{trans|Julia}} |