Bioinformatics/Global alignment: Difference between revisions

← Older edit

Bioinformatics/Global alignment (view source)

Revision as of 09:33, 8 November 2023

12,003 bytes added , 7 months ago

m

→‎{{header|Wren}}: Minor tidy

PureFox

9,485

edits

Revision as of 18:00, 26 August 2022 (view source) Thundergnat (talk \| contribs) m (syntax highlighting fixup automation) ← Older edit		Latest revision as of 09:33, 8 November 2023 (view source) PureFox (talk \| contribs) m (→‎{{header\|Wren}}: Minor tidy)
(9 intermediate revisions by 3 users not shown)
Line 1: {{task}} ~~{{Draft task}}[[Category:Bioinfomatics]][[Category:Strings]]~~ [[Category:Bioinfomatics]] [[Category:Strings]] Global alignment is designed to search for highly similar regions in two or more DNA sequences, where the sequences appear in the same order and orientation, fitting the sequences in as pieces in a puzzle. Line 53 ⟶ 55: :* [[Bioinformatics/Sequence_mutation\|Bioinformatics sequence mutation]]. <br><br> =={{header\|11l}}== {{trans\|Nim}} <syntaxhighlight lang="11l">-V ACGT = [‘A’, ‘C’, ‘G’, ‘T’] F permutations(slist) Line 181: -------------------- Total length 300 </pre> =={{header\|C++}}== <syntaxhighlight lang="c++"> #include <algorithm> #include <cstdint> #include <iostream> #include <numeric> #include <unordered_map> #include <unordered_set> #include <string> #include <vector> // Print a report of the given string to the standard output device. void print_report(const std::string& text) { std::unordered_map<char, int32_t> bases; for ( const char& ch : text ) { bases[ch]++; } const int32_t total = std::accumulate(bases.begin(), bases.end(), 0, [&](int32_t previous_sum, std::pair<char, int32_t> entry) { return previous_sum + entry.second; }); std::cout << "Nucleotide counts for: " << ( ( text.length() > 50 ) ? "\n" : "" ); std::cout << text << std::endl; std::cout << "Bases: A " << bases['A'] << ", C: " << bases['C'] << ", G: " << bases['G'] << ", T: " << bases['T'] << ", total: " << total << "\n" << std::endl; } // Return all permutations of the given list of strings. std::vector<std::vector<std::string>> permutations(std::vector<std::string>& list) { int32_t indexes[list.size()] = {}; std::vector<std::vector<std::string>> result; result.push_back(list); int32_t i = 0; while ( (uint64_t) i < list.size() ) { if ( indexes[i] < i ) { const int j = ( i % 2 == 0 ) ? 
0 : indexes[i]; std::swap(list[i], list[j]); result.push_back(list); indexes[i]++; i = 0; } else { indexes[i] = 0; i++; } } return result; } // Return 'before' concatenated with 'after', removing the longest suffix of 'before' that matches a prefix of 'after'. std::string concatenate(const std::string& before, const std::string& after) { for ( uint64_t i = 0; i < before.length(); ++i ) { if ( after.starts_with(before.substr(i, before.length())) ) { return before.substr(0, i) + after; } } return before + after; } // Remove duplicate strings and strings which are substrings of other strings in the given list. std::vector<std::string> deduplicate(const std::vector<std::string>& list) { std::vector<std::string> singletons(list); std::sort(singletons.begin(), singletons.end()); singletons.erase(std::unique(singletons.begin(), singletons.end()), singletons.end()); std::vector<std::string> result(singletons); std::unordered_set<std::string> marked_for_removal; for ( const std::string& test_word : result ) { for ( const std::string& word : singletons ) { if ( word != test_word && word.find(test_word) != std::string::npos ) { marked_for_removal.emplace(test_word); } } } result.erase(std::remove_if(result.begin(), result.end(), [&](std::string& word) { return marked_for_removal.count(word) != 0; } ), result.end()); return result; } // Return a set containing all of the shortest common superstrings of the given list of strings. 
std::unordered_set<std::string> shortest_common_superstrings(const std::vector<std::string>& list) { std::vector<std::string> deduplicated = deduplicate(list); std::unordered_set<std::string> shortest; shortest.emplace(std::reduce(list.begin(), list.end(), std::string(""))); uint64_t shortest_length; for ( const std::string& word : list ) { shortest_length += word.length(); } for ( std::vector<std::string> permutation : permutations(deduplicated) ) { std::string candidate; for ( const std::string& word : permutation ) { candidate = concatenate(candidate, word); } if ( candidate.length() < shortest_length ) { shortest.clear(); shortest.emplace(candidate); shortest_length = candidate.length(); } else if ( candidate.length() == shortest_length ) { shortest.emplace(candidate); } } return shortest; } int main() { const std::vector<std::vector<std::string>> test_sequences = { { "TA", "AAG", "TA", "GAA", "TA" }, { "CATTAGGG", "ATTAG", "GGG", "TA" }, { "AAGAUGGA", "GGAGCGCAUC", "AUCGCAAUAAGGA" }, { "ATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTAT", "GGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGT", "CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA", "TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC", "AACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT", "GCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTC", "CGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATTCTGCTTATAACACTATGTTCT", "TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC", "CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATGCTCGTGC", "GATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATT", "TTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC", "CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA", 
"TCTCTTAAACTCCTGCTAAATGCTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGA" } }; for ( const std::vector<std::string>& test : test_sequences ) { for ( const std::string& superstring : shortest_common_superstrings(test) ) { print_report(superstring); } } } </syntaxhighlight> <pre> Nucleotide counts for: TAGAAG Bases: A 3, C: 0, G: 2, T: 1, total: 6 Nucleotide counts for: TAAGAA Bases: A 4, C: 0, G: 1, T: 1, total: 6 Nucleotide counts for: GAAGTA Bases: A 3, C: 0, G: 2, T: 1, total: 6 Nucleotide counts for: CATTAGGG Bases: A 2, C: 1, G: 3, T: 2, total: 8 Nucleotide counts for: AAGAUGGAGCGCAUCGCAAUAAGGA Bases: A 10, C: 4, G: 8, T: 0, total: 25 Nucleotide counts for: CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATGCTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA Bases: A 74, C: 57, G: 75, T: 94, total: 300 </pre> =={{header\|Go}}== {{trans\|Julia}} <syntaxhighlight lang="go">package main import ( Line 398 ⟶ 560: ____________________ Total length 300 </pre> =={{header\|Java}}== <syntaxhighlight lang="java"> import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.stream.Collectors; public final class BioinformaticsGlobalAlignment { public static void main(String[] aArgs) { List<List<String>> testSequences = Arrays.asList( Arrays.asList( "TA", "AAG", "TA", "GAA", "TA" ), Arrays.asList( "CATTAGGG", "ATTAG", "GGG", "TA" ), Arrays.asList( "AAGAUGGA", "GGAGCGCAUC", "AUCGCAAUAAGGA" ), Arrays.asList( "ATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTAT", "GGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGT", 
"CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA", "TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC", "AACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT", "GCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTC", "CGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATTCTGCTTATAACACTATGTTCT", "TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC", "CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATGCTCGTGC", "GATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATT", "TTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC", "CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA", "TCTCTTAAACTCCTGCTAAATGCTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGA" ) ); for ( List<String> test : testSequences ) { for ( String superstring : shortestCommonSuperstrings(test) ) { printReport(superstring); } } } // Return a set containing all of the shortest common superstrings of the given list of strings. private static Set<String> shortestCommonSuperstrings(List<String> aList) { List<String> deduplicated = deduplicate(aList); Set<String> shortest = new HashSet<String>(); shortest.add(String.join("", deduplicated)); int shortestLength = aList.stream().mapToInt( s -> s.length() ).sum(); for ( List<String> permutation : permutations(deduplicated) ) { String candidate = permutation.stream().reduce("", (a, b) -> concatenate(a, b) ); if ( candidate.length() < shortestLength ) { shortest.clear(); shortest.add(candidate); shortestLength = candidate.length(); } else if ( candidate.length() == shortestLength ) { shortest.add(candidate); } } return shortest; } // Remove duplicate strings and strings which are substrings of other strings in the given list. 
private static List<String> deduplicate(List<String> aList) { List<String> unique = aList.stream().distinct().collect(Collectors.toList()); List<String> result = new ArrayList<String>(unique); List<String> markedForRemoval = new ArrayList<String>(); for ( String testWord : result ) { for ( String word : unique ) { if ( ! word.equals(testWord) && word.contains(testWord) ) { markedForRemoval.add(testWord); } } } result.removeAll(markedForRemoval); return result; } // Return aBefore concatenated with aAfter, removing the longest suffix of aBefore that matches a prefix of aAfter. private static String concatenate(String aBefore, String aAfter) { for ( int i = 0; i < aBefore.length(); i++ ) { if ( aAfter.startsWith(aBefore.substring(i, aBefore.length())) ) { return aBefore.substring(0, i) + aAfter; } } return aBefore + aAfter; } // Return all permutations of the given list of strings. private static List<List<String>> permutations(List<String> aList) { int[] indexes = new int[aList.size()]; List<List<String>> result = new ArrayList<List<String>>(); result.add( new ArrayList<String>(aList) ); int i = 0; while ( i < aList.size() ) { if ( indexes[i] < i ) { final int j = ( i % 2 == 0 ) ? 0 : indexes[i]; String temp = aList.get(j); aList.set(j, aList.get(i)); aList.set(i, temp); result.add( new ArrayList<String>(aList) ); indexes[i]++; i = 0; } else { indexes[i] = 0; i += 1; } } return result; } // Print a report of the given string to the standard output device. private static void printReport(String aText) { char[] nucleotides = new char[] {'A', 'C', 'G', 'T' }; Map<Character, Integer> bases = new HashMap<Character, Integer>(); for ( char base : nucleotides ) { bases.put(base, 0); } for ( char ch : aText.toCharArray() ) { bases.merge(ch, 1, Integer::sum); } final int total = bases.values().stream().reduce(0, Integer::sum); System.out.print("Nucleotide counts for: " + ( ( aText.length() > 50 ) ? 
System.lineSeparator() : "") ); System.out.println(aText); System.out.print(String.format("%s%d%s%d%s%d%s%d", "Bases: A: ", bases.get('A'), ", C: ", bases.get('C'), ", G: ", bases.get('G'), ", T: ", bases.get('T'))); System.out.println(", total: " + total + System.lineSeparator()); } } </syntaxhighlight> {{ out }} <pre> Nucleotide counts for: TAGAAG Bases: A: 3, C: 0, G: 2, T: 1, total: 6 Nucleotide counts for: GAAGTA Bases: A: 3, C: 0, G: 2, T: 1, total: 6 Nucleotide counts for: TAAGAA Bases: A: 4, C: 0, G: 1, T: 1, total: 6 Nucleotide counts for: CATTAGGG Bases: A: 2, C: 1, G: 3, T: 2, total: 8 Nucleotide counts for: AAGAUGGAGCGCAUCGCAAUAAGGA Bases: A: 10, C: 4, G: 8, T: 0, total: 25 Nucleotide counts for: CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATGCTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA Bases: A: 74, C: 57, G: 75, T: 94, total: 300 </pre> Line 403 ⟶ 719: {{works with\|jq}} '''Works with gojq, the Go implementation of jq''' <syntaxhighlight lang="jq"> ### Generic helper functions Line 415 ⟶ 731: range(0;length) as $i \| [.[$i]] + (del(.[$i])\|permutations) end ;</syntaxhighlight><syntaxhighlight lang="jq"> # Give a synoptic view of the input string, # highlighting the occurrence of ACGTU letters Line 477 ⟶ 793: </syntaxhighlight> '''The specific tasks''' <syntaxhighlight lang="jq"> def examples: [ Line 556 ⟶ 872: Σ: 300 </pre> =={{header\|Julia}}== <syntaxhighlight lang="julia">using Combinatorics """ Given a DNA sequence, report the sequence, length and base counts""" Line 681 ⟶ 996: Total length 300 </pre> =={{header\|Nim}}== {{trans\|Wren}} <syntaxhighlight lang=~~Nim~~"nim">import algorithm, sequtils, strformat, strutils, tables const ACGT = ['A', 'C', 'G', 'T'] # Four DNA bases. 
Line 807 ⟶ 1,121: ———————————————————— Total length 300</pre> =={{header\|Pascal}}== Uses a matrix of head-tail overlaps and a modified n-queens search to generate the permutations.<BR> The runtime here is negligible, but see [[N-queens_problem]]: enumerating permutations is not practical for n > 17.<BR> Of course, this is really more of a traveling salesman problem. <syntaxhighlight lang="pascal"> program BaseInDNA; {$IFDEF FPC} Line 1,125 ⟶ 1,438: A : 74 C : 57 G : 75 T : 94 U : 0 </pre> =={{header\|Perl}}== <syntaxhighlight lang="perl">#!/usr/bin/perl use strict; # https://rosettacode.org/wiki/Bioinformatics/global_alignment Line 1,225 ⟶ 1,537: { A => 74, C => 57, G => 75, T => 94 } </pre> =={{header\|Phix}}== <!--<syntaxhighlight lang=~~Phix~~"phix">(phixonline)--> <span style="color: #008080;">procedure</span> <span style="color: #000000;">printcounts</span><span style="color: #0000FF;">(</span><span style="color: #004080;">sequence</span> <span style="color: #000000;">ss</span><span style="color: #0000FF;">)</span> <span style="color: #000080;font-style:italic;">-- Given DNA sequence(s), report the sequence, length and base counts</span> Line 1,339 ⟶ 1,650: Base counts: Other:0, A:74, C:57, G:75, T:94, total:300 </pre> =={{header\|Python}}== {{trans\|Go}} <syntaxhighlight lang="python">import os from collections import Counter Line 1,523 ⟶ 1,833: Total length 300 </pre> =={{header\|Raku}}== {{trans\|Go}} {{trans\|Julia}} <syntaxhighlight lang="raku" line># 20210209 Raku programming solution sub printCounts(\seq) { Line 1,600 ⟶ 1,909: (C 57 G 75 A 74 T 94) and total length = 300 </pre> =={{header\|Wren}}== {{trans\|Julia}} Line 1,607 ⟶ 1,915: {{libheader\|Wren-str}} {{libheader\|Wren-math}} <syntaxhighlight lang=~~ecmascript~~"wren">import "./fmt" for Fmt import "./seq" for Lst import "./str" for Str import "./math" for Int /* Gets all permutations of a list of strings. */