Bioinformatics/Global alignment: Difference between revisions

← Older edit

Bioinformatics/Global alignment (view source)

Revision as of 09:33, 8 November 2023

5,933 bytes added , 6 months ago

m

→‎{{header|Wren}}: Minor tidy

PureFox

9,479

edits

Revision as of 09:46, 1 July 2023 (view source) PSNOW123 (talk \| contribs) m (Renamed a variable.) ← Older edit		Latest revision as of 09:33, 8 November 2023 (view source) PureFox (talk \| contribs) m (→‎{{header\|Wren}}: Minor tidy)
(4 intermediate revisions by one other user not shown)
Line 182: Total length 300 </pre> =={{header\|C++}}== <syntaxhighlight lang="c++"> #include <algorithm> #include <cstdint> #include <iostream> #include <numeric> #include <unordered_map> #include <unordered_set> #include <string> #include <vector> // Print a report of the given string to the standard output device. void print_report(const std::string& text) { std::unordered_map<char, int32_t> bases; for ( const char& ch : text ) { bases[ch]++; } const int32_t total = std::accumulate(bases.begin(), bases.end(), 0, [&](int32_t previous_sum, std::pair<char, int32_t> entry) { return previous_sum + entry.second; }); std::cout << "Nucleotide counts for: " << ( ( text.length() > 50 ) ? "\n" : "" ); std::cout << text << std::endl; std::cout << "Bases: A " << bases['A'] << ", C: " << bases['C'] << ", G: " << bases['G'] << ", T: " << bases['T'] << ", total: " << total << "\n" << std::endl; } // Return all permutations of the given list of strings. std::vector<std::vector<std::string>> permutations(std::vector<std::string>& list) { int32_t indexes[list.size()] = {}; std::vector<std::vector<std::string>> result; result.push_back(list); int32_t i = 0; while ( (uint64_t) i < list.size() ) { if ( indexes[i] < i ) { const int j = ( i % 2 == 0 ) ? 0 : indexes[i]; std::swap(list[i], list[j]); result.push_back(list); indexes[i]++; i = 0; } else { indexes[i] = 0; i++; } } return result; } // Return 'before' concatenated with 'after', removing the longest suffix of 'before' that matches a prefix of 'after'. std::string concatenate(const std::string& before, const std::string& after) { for ( uint64_t i = 0; i < before.length(); ++i ) { if ( after.starts_with(before.substr(i, before.length())) ) { return before.substr(0, i) + after; } } return before + after; } // Remove duplicate strings and strings which are substrings of other strings in the given list. std::vector<std::string> deduplicate(const std::vector<std::string>& list) { std::vector<std::string> singletons(list); std::sort(singletons.begin(), singletons.end()); singletons.erase(std::unique(singletons.begin(), singletons.end()), singletons.end()); std::vector<std::string> result(singletons); std::unordered_set<std::string> marked_for_removal; for ( const std::string& test_word : result ) { for ( const std::string& word : singletons ) { if ( word != test_word && word.find(test_word) != std::string::npos ) { marked_for_removal.emplace(test_word); } } } result.erase(std::remove_if(result.begin(), result.end(), [&](std::string& word) { return marked_for_removal.count(word) != 0; } ), result.end()); return result; } // Return a set containing all of the shortest common superstrings of the given list of strings. std::unordered_set<std::string> shortest_common_superstrings(const std::vector<std::string>& list) { std::vector<std::string> deduplicated = deduplicate(list); std::unordered_set<std::string> shortest; shortest.emplace(std::reduce(list.begin(), list.end(), std::string(""))); uint64_t shortest_length; for ( const std::string& word : list ) { shortest_length += word.length(); } for ( std::vector<std::string> permutation : permutations(deduplicated) ) { std::string candidate; for ( const std::string& word : permutation ) { candidate = concatenate(candidate, word); } if ( candidate.length() < shortest_length ) { shortest.clear(); shortest.emplace(candidate); shortest_length = candidate.length(); } else if ( candidate.length() == shortest_length ) { shortest.emplace(candidate); } } return shortest; } int main() { const std::vector<std::vector<std::string>> test_sequences = { { "TA", "AAG", "TA", "GAA", "TA" }, { "CATTAGGG", "ATTAG", "GGG", "TA" }, { "AAGAUGGA", "GGAGCGCAUC", "AUCGCAAUAAGGA" }, { "ATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTAT", "GGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGT", "CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA", "TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC", "AACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT", "GCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTC", "CGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATTCTGCTTATAACACTATGTTCT", "TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC", "CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATGCTCGTGC", "GATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATT", "TTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC", "CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA", "TCTCTTAAACTCCTGCTAAATGCTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGA" } }; for ( const std::vector<std::string>& test : test_sequences ) { for ( const std::string& superstring : shortest_common_superstrings(test) ) { print_report(superstring); } } } </syntaxhighlight> <pre> Nucleotide counts for: TAGAAG Bases: A 3, C: 0, G: 2, T: 1, total: 6 Nucleotide counts for: TAAGAA Bases: A 4, C: 0, G: 1, T: 1, total: 6 Nucleotide counts for: GAAGTA Bases: A 3, C: 0, G: 2, T: 1, total: 6 Nucleotide counts for: CATTAGGG Bases: A 2, C: 1, G: 3, T: 2, total: 8 Nucleotide counts for: AAGAUGGAGCGCAUCGCAAUAAGGA Bases: A 10, C: 4, G: 8, T: 0, total: 25 Nucleotide counts for: CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATGCTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA Bases: A 74, C: 57, G: 75, T: 94, total: 300 </pre> =={{header\|Go}}== {{trans\|Julia}} Line 440 ⟶ 603: } // Return a set ofcontaining all of the shortest common superstrings of the given list of strings. private static Set<String> shortestCommonSuperstrings(List<String> aList) { List<String> deduplicated = deduplicate(aList); Line 460 ⟶ 623: } // Remove duplicate ~~words~~strings and ~~words~~strings which are substrings of other ~~words~~strings in the given list. private static List<String> deduplicate(List<String> aList) { List<String> unique = aList.stream().distinct().collect(Collectors.toList()); Line 503 ⟶ 666: } else { indexes[i] = 0; i ++= 1; } } Line 1,752 ⟶ 1,915: {{libheader\|Wren-str}} {{libheader\|Wren-math}} <syntaxhighlight lang="~~ecmascript~~wren">import "./fmt" for Fmt import "./seq" for Lst import "./str" for Str import "./math" for Int /* Gets all permutations of a list of strings. */