Bioinformatics/Global alignment: Difference between revisions

← Older edit

Bioinformatics/Global alignment (view source)

Revision as of 09:33, 8 November 2023

11,993 bytes added, 7 months ago

m

→‎{{header|Wren}}: Minor tidy

PureFox

9,485

edits

Revision as of 11:08, 31 October 2022 (view source) Thundergnat (talk \| contribs) m (Re-add task markup) ← Older edit		Latest revision as of 09:33, 8 November 2023 (view source) PureFox (talk \| contribs) m (→‎{{header\|Wren}}: Minor tidy)
(6 intermediate revisions by one other user not shown)
Line 182: Total length 300 </pre> =={{header\|C++}}== <syntaxhighlight lang="c++"> #include <algorithm> #include <cstdint> #include <iostream> #include <numeric> #include <unordered_map> #include <unordered_set> #include <string> #include <vector> // Print a report of the given string to the standard output device. void print_report(const std::string& text) { std::unordered_map<char, int32_t> bases; for ( const char& ch : text ) { bases[ch]++; } const int32_t total = std::accumulate(bases.begin(), bases.end(), 0, [&](int32_t previous_sum, std::pair<char, int32_t> entry) { return previous_sum + entry.second; }); std::cout << "Nucleotide counts for: " << ( ( text.length() > 50 ) ? "\n" : "" ); std::cout << text << std::endl; std::cout << "Bases: A " << bases['A'] << ", C: " << bases['C'] << ", G: " << bases['G'] << ", T: " << bases['T'] << ", total: " << total << "\n" << std::endl; } // Return all permutations of the given list of strings. std::vector<std::vector<std::string>> permutations(std::vector<std::string>& list) { int32_t indexes[list.size()] = {}; std::vector<std::vector<std::string>> result; result.push_back(list); int32_t i = 0; while ( (uint64_t) i < list.size() ) { if ( indexes[i] < i ) { const int j = ( i % 2 == 0 ) ? 0 : indexes[i]; std::swap(list[i], list[j]); result.push_back(list); indexes[i]++; i = 0; } else { indexes[i] = 0; i++; } } return result; } // Return 'before' concatenated with 'after', removing the longest suffix of 'before' that matches a prefix of 'after'. std::string concatenate(const std::string& before, const std::string& after) { for ( uint64_t i = 0; i < before.length(); ++i ) { if ( after.starts_with(before.substr(i, before.length())) ) { return before.substr(0, i) + after; } } return before + after; } // Remove duplicate strings and strings which are substrings of other strings in the given list. 
std::vector<std::string> deduplicate(const std::vector<std::string>& list) { std::vector<std::string> singletons(list); std::sort(singletons.begin(), singletons.end()); singletons.erase(std::unique(singletons.begin(), singletons.end()), singletons.end()); std::vector<std::string> result(singletons); std::unordered_set<std::string> marked_for_removal; for ( const std::string& test_word : result ) { for ( const std::string& word : singletons ) { if ( word != test_word && word.find(test_word) != std::string::npos ) { marked_for_removal.emplace(test_word); } } } result.erase(std::remove_if(result.begin(), result.end(), [&](std::string& word) { return marked_for_removal.count(word) != 0; } ), result.end()); return result; } // Return a set containing all of the shortest common superstrings of the given list of strings. std::unordered_set<std::string> shortest_common_superstrings(const std::vector<std::string>& list) { std::vector<std::string> deduplicated = deduplicate(list); std::unordered_set<std::string> shortest; shortest.emplace(std::reduce(list.begin(), list.end(), std::string(""))); uint64_t shortest_length; for ( const std::string& word : list ) { shortest_length += word.length(); } for ( std::vector<std::string> permutation : permutations(deduplicated) ) { std::string candidate; for ( const std::string& word : permutation ) { candidate = concatenate(candidate, word); } if ( candidate.length() < shortest_length ) { shortest.clear(); shortest.emplace(candidate); shortest_length = candidate.length(); } else if ( candidate.length() == shortest_length ) { shortest.emplace(candidate); } } return shortest; } int main() { const std::vector<std::vector<std::string>> test_sequences = { { "TA", "AAG", "TA", "GAA", "TA" }, { "CATTAGGG", "ATTAG", "GGG", "TA" }, { "AAGAUGGA", "GGAGCGCAUC", "AUCGCAAUAAGGA" }, { "ATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTAT", "GGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGT", 
"CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA", "TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC", "AACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT", "GCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTC", "CGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATTCTGCTTATAACACTATGTTCT", "TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC", "CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATGCTCGTGC", "GATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATT", "TTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC", "CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA", "TCTCTTAAACTCCTGCTAAATGCTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGA" } }; for ( const std::vector<std::string>& test : test_sequences ) { for ( const std::string& superstring : shortest_common_superstrings(test) ) { print_report(superstring); } } } </syntaxhighlight> <pre> Nucleotide counts for: TAGAAG Bases: A 3, C: 0, G: 2, T: 1, total: 6 Nucleotide counts for: TAAGAA Bases: A 4, C: 0, G: 1, T: 1, total: 6 Nucleotide counts for: GAAGTA Bases: A 3, C: 0, G: 2, T: 1, total: 6 Nucleotide counts for: CATTAGGG Bases: A 2, C: 1, G: 3, T: 2, total: 8 Nucleotide counts for: AAGAUGGAGCGCAUCGCAAUAAGGA Bases: A 10, C: 4, G: 8, T: 0, total: 25 Nucleotide counts for: CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATGCTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA Bases: A 74, C: 57, G: 75, T: 94, total: 300 </pre> =={{header\|Go}}== {{trans\|Julia}} Line 398 ⟶ 561: Total length 300 </pre> =={{header\|Java}}== <syntaxhighlight lang="java"> import java.util.ArrayList; import 
java.util.Arrays; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.stream.Collectors; public final class BioinformaticsGlobalAlignment { public static void main(String[] aArgs) { List<List<String>> testSequences = Arrays.asList( Arrays.asList( "TA", "AAG", "TA", "GAA", "TA" ), Arrays.asList( "CATTAGGG", "ATTAG", "GGG", "TA" ), Arrays.asList( "AAGAUGGA", "GGAGCGCAUC", "AUCGCAAUAAGGA" ), Arrays.asList( "ATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTAT", "GGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGT", "CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA", "TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC", "AACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT", "GCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTC", "CGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATTCTGCTTATAACACTATGTTCT", "TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC", "CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATGCTCGTGC", "GATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATT", "TTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC", "CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA", "TCTCTTAAACTCCTGCTAAATGCTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGA" ) ); for ( List<String> test : testSequences ) { for ( String superstring : shortestCommonSuperstrings(test) ) { printReport(superstring); } } } // Return a set containing all of the shortest common superstrings of the given list of strings. 
private static Set<String> shortestCommonSuperstrings(List<String> aList) { List<String> deduplicated = deduplicate(aList); Set<String> shortest = new HashSet<String>(); shortest.add(String.join("", deduplicated)); int shortestLength = aList.stream().mapToInt( s -> s.length() ).sum(); for ( List<String> permutation : permutations(deduplicated) ) { String candidate = permutation.stream().reduce("", (a, b) -> concatenate(a, b) ); if ( candidate.length() < shortestLength ) { shortest.clear(); shortest.add(candidate); shortestLength = candidate.length(); } else if ( candidate.length() == shortestLength ) { shortest.add(candidate); } } return shortest; } // Remove duplicate strings and strings which are substrings of other strings in the given list. private static List<String> deduplicate(List<String> aList) { List<String> unique = aList.stream().distinct().collect(Collectors.toList()); List<String> result = new ArrayList<String>(unique); List<String> markedForRemoval = new ArrayList<String>(); for ( String testWord : result ) { for ( String word : unique ) { if ( ! word.equals(testWord) && word.contains(testWord) ) { markedForRemoval.add(testWord); } } } result.removeAll(markedForRemoval); return result; } // Return aBefore concatenated with aAfter, removing the longest suffix of aBefore that matches a prefix of aAfter. private static String concatenate(String aBefore, String aAfter) { for ( int i = 0; i < aBefore.length(); i++ ) { if ( aAfter.startsWith(aBefore.substring(i, aBefore.length())) ) { return aBefore.substring(0, i) + aAfter; } } return aBefore + aAfter; } // Return all permutations of the given list of strings. private static List<List<String>> permutations(List<String> aList) { int[] indexes = new int[aList.size()]; List<List<String>> result = new ArrayList<List<String>>(); result.add( new ArrayList<String>(aList) ); int i = 0; while ( i < aList.size() ) { if ( indexes[i] < i ) { final int j = ( i % 2 == 0 ) ? 
0 : indexes[i]; String temp = aList.get(j); aList.set(j, aList.get(i)); aList.set(i, temp); result.add( new ArrayList<String>(aList) ); indexes[i]++; i = 0; } else { indexes[i] = 0; i += 1; } } return result; } // Print a report of the given string to the standard output device. private static void printReport(String aText) { char[] nucleotides = new char[] {'A', 'C', 'G', 'T' }; Map<Character, Integer> bases = new HashMap<Character, Integer>(); for ( char base : nucleotides ) { bases.put(base, 0); } for ( char ch : aText.toCharArray() ) { bases.merge(ch, 1, Integer::sum); } final int total = bases.values().stream().reduce(0, Integer::sum); System.out.print("Nucleotide counts for: " + ( ( aText.length() > 50 ) ? System.lineSeparator() : "") ); System.out.println(aText); System.out.print(String.format("%s%d%s%d%s%d%s%d", "Bases: A: ", bases.get('A'), ", C: ", bases.get('C'), ", G: ", bases.get('G'), ", T: ", bases.get('T'))); System.out.println(", total: " + total + System.lineSeparator()); } } </syntaxhighlight> {{ out }} <pre> Nucleotide counts for: TAGAAG Bases: A: 3, C: 0, G: 2, T: 1, total: 6 Nucleotide counts for: GAAGTA Bases: A: 3, C: 0, G: 2, T: 1, total: 6 Nucleotide counts for: TAAGAA Bases: A: 4, C: 0, G: 1, T: 1, total: 6 Nucleotide counts for: CATTAGGG Bases: A: 2, C: 1, G: 3, T: 2, total: 8 Nucleotide counts for: AAGAUGGAGCGCAUCGCAAUAAGGA Bases: A: 10, C: 4, G: 8, T: 0, total: 25 Nucleotide counts for: CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATGCTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA Bases: A: 74, C: 57, G: 75, T: 94, total: 300 </pre> =={{header\|jq}}== {{works with\|jq}} Line 1,597 ⟶ 1,915: {{libheader\|Wren-str}} {{libheader\|Wren-math}} <syntaxhighlight lang="~~ecmascript~~wren">import "./fmt" for Fmt import "./seq" for Lst import 
"./str" for Str import "./math" for Int /* Gets all permutations of a list of strings. */