Bioinformatics/Global alignment: Difference between revisions

Content added Content deleted

Inline

@@ Line 182: / Line 182: @@
   Total length     300
 </pre>
+=={{header|C++}}==
+<syntaxhighlight lang="c++">
+#include <algorithm>
+#include <cstdint>
+#include <iostream>
+#include <numeric>
+#include <unordered_map>
+#include <unordered_set>
+#include <string>
+#include <vector>
+// Print a report of the given string to the standard output device.
+void print_report(const std::string& text) {
+	std::unordered_map<char, int32_t> bases;
+	for ( const char& ch : text ) {
+		bases[ch]++;
+	}
+	const int32_t total = std::accumulate(bases.begin(), bases.end(), 0,
+            [&](int32_t previous_sum, std::pair<char, int32_t> entry) {
+                return previous_sum + entry.second;
+            });
+	std::cout << "Nucleotide counts for: " << ( ( text.length() > 50 ) ? "\n" : "" );
+	std::cout << text << std::endl;
+	std::cout << "Bases: A " << bases['A'] << ", C: " << bases['C'] << ", G: " << bases['G'] << ", T: " << bases['T']
+			  << ", total: " << total << "\n" << std::endl;
+}
+// Return all permutations of the given list of strings.
+std::vector<std::vector<std::string>> permutations(std::vector<std::string>& list) {
+	int32_t indexes[list.size()] = {};
+	std::vector<std::vector<std::string>> result;
+	result.push_back(list);
+	int32_t i = 0;
+	while ( (uint64_t) i < list.size() ) {
+		if ( indexes[i] < i ) {
+			const int j = ( i % 2 == 0 ) ? 0 : indexes[i];
+			std::swap(list[i], list[j]);
+			result.push_back(list);
+			indexes[i]++;
+			i = 0;
+		} else {
+			indexes[i] = 0;
+			i++;
+		}
+	}
+	return result;
+}
+// Return 'before' concatenated with 'after', removing the longest suffix of 'before' that matches a prefix of 'after'.
+std::string concatenate(const std::string& before, const std::string& after) {
+	for ( uint64_t i = 0; i < before.length(); ++i ) {
+		if ( after.starts_with(before.substr(i, before.length())) ) {
+			return before.substr(0, i) + after;
+		}
+	}
+	return before + after;
+}
+// Remove duplicate strings and strings which are substrings of other strings in the given list.
+std::vector<std::string> deduplicate(const std::vector<std::string>& list) {
+	std::vector<std::string> singletons(list);
+	std::sort(singletons.begin(), singletons.end());
+	singletons.erase(std::unique(singletons.begin(), singletons.end()), singletons.end());
+	std::vector<std::string> result(singletons);
+	std::unordered_set<std::string> marked_for_removal;
+	for ( const std::string& test_word : result ) {
+		for ( const std::string& word : singletons ) {
+			if ( word != test_word && word.find(test_word) != std::string::npos ) {
+				marked_for_removal.emplace(test_word);
+			}
+		}
+	}
+	result.erase(std::remove_if(result.begin(), result.end(),
+		[&](std::string& word) {
+	    	return marked_for_removal.count(word) != 0;
+		}
+	), result.end());
+	return result;
+}
+// Return a set containing all of the shortest common superstrings of the given list of strings.
+std::unordered_set<std::string> shortest_common_superstrings(const std::vector<std::string>& list) {
+	std::vector<std::string> deduplicated = deduplicate(list);
+	std::unordered_set<std::string> shortest;
+	shortest.insert(std::reduce(list.begin(), list.end(), std::string("")));
+	uint64_t shortest_length;
+	for ( const std::string& word : list ) {
+		shortest_length += word.length();
+	}
+	for ( std::vector<std::string> permutation : permutations(deduplicated) ) {
+		std::string candidate;
+		for ( const std::string& word : permutation ) {
+			candidate = concatenate(candidate, word);
+		}
+		if ( candidate.length() < shortest_length ) {
+			shortest.clear();
+			shortest.insert(candidate);
+			shortest_length = candidate.length();
+		} else if ( candidate.length() == shortest_length ) {
+			shortest.insert(candidate);
+		}
+	}
+	return shortest;
+}
+int main() {
+	const std::vector<std::vector<std::string>> test_sequences = {
+		{ "TA", "AAG", "TA", "GAA", "TA" },
+		{ "CATTAGGG", "ATTAG", "GGG", "TA" },
+		{ "AAGAUGGA", "GGAGCGCAUC", "AUCGCAAUAAGGA" },
+		{ "ATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTAT",
+		  "GGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGT",
+		  "CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA",
+		  "TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC",
+		  "AACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT",
+		  "GCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTC",
+		  "CGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATTCTGCTTATAACACTATGTTCT",
+		  "TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC",
+		  "CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATGCTCGTGC",
+		  "GATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATT",
+		  "TTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC",
+		  "CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA",
+		  "TCTCTTAAACTCCTGCTAAATGCTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGA" } };
+	for ( const std::vector<std::string>& test : test_sequences ) {
+		for ( const std::string& superstring : shortest_common_superstrings(test) ) {
+			print_report(superstring);
+		}
+	}
+}
+</syntaxhighlight>
+<pre>
+Nucleotide counts for: TAGAAG
+Bases: A 3, C: 0, G: 2, T: 1, total: 6
+Nucleotide counts for: TAAGAA
+Bases: A 4, C: 0, G: 1, T: 1, total: 6
+Nucleotide counts for: GAAGTA
+Bases: A 3, C: 0, G: 2, T: 1, total: 6
+Nucleotide counts for: CATTAGGG
+Bases: A 2, C: 1, G: 3, T: 2, total: 8
+Nucleotide counts for: AAGAUGGAGCGCAUCGCAAUAAGGA
+Bases: A 10, C: 4, G: 8, T: 0, total: 25
+Nucleotide counts for:
+CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATGCTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA
+Bases: A 74, C: 57, G: 75, T: 94, total: 300
+</pre>
 =={{header|Go}}==
 {{trans|Julia}}