Bioinformatics/Global alignment: Difference between revisions

m
m (syntax highlighting fixup automation)
m (→‎{{header|Wren}}: Minor tidy)
 
(9 intermediate revisions by 3 users not shown)
Line 1:
{{task}}
{{Draft task}}[[Category:Bioinfomatics]][[Category:Strings]]
 
[[Category:Bioinfomatics]]
[[Category:Strings]]
Global alignment is designed to search for highly similar regions in two or more DNA sequences, where the
sequences appear in the same order and orientation, fitting the sequences in as pieces in a puzzle.
Line 53 ⟶ 55:
:* [[Bioinformatics/Sequence_mutation|Bioinformatics sequence mutation]].
<br><br>
 
 
=={{header|11l}}==
{{trans|Nim}}
 
<syntaxhighlight lang="11l">-V ACGT = [‘A’, ‘C’, ‘G’, ‘T’]
 
F permutations(slist)
Line 181:
--------------------
Total length 300
</pre>
 
=={{header|C++}}==
<syntaxhighlight lang="c++">
 
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <numeric>
#include <unordered_map>
#include <unordered_set>
#include <string>
#include <vector>
 
// Write a nucleotide report for 'text' to standard output.
// The text itself is printed first (on its own line when it is longer than 50 characters),
// followed by the number of 'A', 'C', 'G' and 'T' characters and the count of all characters.
void print_report(const std::string& text) {
    // Tally every character of the text, not only the four DNA bases.
    std::unordered_map<char, int32_t> counts;
    for ( const char symbol : text ) {
        ++counts[symbol];
    }

    // The total covers every tallied character, so symbols such as 'U' are included.
    int32_t total = 0;
    for ( const auto& [symbol, count] : counts ) {
        total += count;
    }

    std::cout << "Nucleotide counts for: ";
    if ( text.length() > 50 ) {
        std::cout << "\n";
    }
    std::cout << text << std::endl;

    // operator[] yields 0 for a base that never occurred in the text.
    std::cout << "Bases: A " << counts['A']
              << ", C: " << counts['C']
              << ", G: " << counts['G']
              << ", T: " << counts['T']
              << ", total: " << total << "\n" << std::endl;
}
 
// Return all permutations of the given list of strings.
// The first entry of the result is the list in its original order; every further entry is
// produced by one swap of two elements of 'list'.
// Note: 'list' is permuted in place and is left in the order of the last permutation generated.
std::vector<std::vector<std::string>> permutations(std::vector<std::string>& list) {
    using Index = std::vector<std::string>::size_type;

    // One counter per position. A std::vector replaces the original variable-length array,
    // which is not standard C++ and cannot portably be given an initializer.
    std::vector<Index> counters(list.size(), 0);

    std::vector<std::vector<std::string>> result;
    result.push_back(list);

    Index i = 0;
    while ( i < list.size() ) {
        if ( counters[i] < i ) {
            // Even positions always swap with the first element, odd positions with the counter's slot.
            const Index j = ( i % 2 == 0 ) ? 0 : counters[i];
            std::swap(list[i], list[j]);
            result.push_back(list);
            ++counters[i];
            i = 0;
        } else {
            // Position i is exhausted: reset its counter and move on to the next position.
            counters[i] = 0;
            ++i;
        }
    }
    return result;
}
 
// Return 'before' concatenated with 'after', removing the longest suffix of 'before' that matches a prefix of 'after'.
// If no suffix of 'before' is a prefix of 'after', the plain concatenation is returned.
std::string concatenate(const std::string& before, const std::string& after) {
    using Size = std::string::size_type;

    // A suffix longer than 'after' can never be a prefix of it, so begin at the first offset
    // whose suffix fits. Offsets are tried in increasing order, i.e. longest suffix first.
    const Size first = ( before.length() > after.length() ) ? before.length() - after.length() : 0;

    for ( Size i = first; i < before.length(); ++i ) {
        const Size overlap = before.length() - i;
        // Compare in place instead of materialising a temporary substring for each offset.
        if ( after.compare(0, overlap, before, i, overlap) == 0 ) {
            return before.substr(0, i) + after;
        }
    }
    return before + after;
}
 
// Remove duplicate strings and strings which are substrings of other strings in the given list.
// The surviving strings are returned in ascending sorted order.
std::vector<std::string> deduplicate(const std::vector<std::string>& list) {
    // Sort a copy so that equal strings become adjacent, then collapse each run to one entry.
    std::vector<std::string> distinct_words(list.begin(), list.end());
    std::sort(distinct_words.begin(), distinct_words.end());
    const auto duplicates_begin = std::unique(distinct_words.begin(), distinct_words.end());
    distinct_words.erase(duplicates_begin, distinct_words.end());

    // Keep only the words that do not occur inside some other, different word.
    std::vector<std::string> kept;
    for ( const std::string& candidate : distinct_words ) {
        const bool contained_elsewhere = std::any_of(distinct_words.begin(), distinct_words.end(),
            [&candidate](const std::string& other) {
                return other != candidate && other.find(candidate) != std::string::npos;
            });

        if ( ! contained_elsewhere ) {
            kept.push_back(candidate);
        }
    }
    return kept;
}
 
// Return a set containing all of the shortest common superstrings of the given list of strings.
std::unordered_set<std::string> shortest_common_superstrings(const std::vector<std::string>& list) {
std::vector<std::string> deduplicated = deduplicate(list);
 
std::unordered_set<std::string> shortest;
shortest.emplace(std::reduce(list.begin(), list.end(), std::string("")));
 
uint64_t shortest_length;
for ( const std::string& word : list ) {
shortest_length += word.length();
}
 
for ( std::vector<std::string> permutation : permutations(deduplicated) ) {
std::string candidate;
for ( const std::string& word : permutation ) {
candidate = concatenate(candidate, word);
}
 
if ( candidate.length() < shortest_length ) {
shortest.clear();
shortest.emplace(candidate);
shortest_length = candidate.length();
} else if ( candidate.length() == shortest_length ) {
shortest.emplace(candidate);
}
}
return shortest;
}
 
int main() {
const std::vector<std::vector<std::string>> test_sequences = {
{ "TA", "AAG", "TA", "GAA", "TA" },
{ "CATTAGGG", "ATTAG", "GGG", "TA" },
{ "AAGAUGGA", "GGAGCGCAUC", "AUCGCAAUAAGGA" },
{ "ATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTAT",
"GGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGT",
"CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA",
"TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC",
"AACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT",
"GCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTC",
"CGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATTCTGCTTATAACACTATGTTCT",
"TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC",
"CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATGCTCGTGC",
"GATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATT",
"TTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC",
"CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA",
"TCTCTTAAACTCCTGCTAAATGCTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGA" } };
 
for ( const std::vector<std::string>& test : test_sequences ) {
for ( const std::string& superstring : shortest_common_superstrings(test) ) {
print_report(superstring);
}
}
}
</syntaxhighlight>
<pre>
Nucleotide counts for: TAGAAG
Bases: A 3, C: 0, G: 2, T: 1, total: 6
 
Nucleotide counts for: TAAGAA
Bases: A 4, C: 0, G: 1, T: 1, total: 6
 
Nucleotide counts for: GAAGTA
Bases: A 3, C: 0, G: 2, T: 1, total: 6
 
Nucleotide counts for: CATTAGGG
Bases: A 2, C: 1, G: 3, T: 2, total: 8
 
Nucleotide counts for: AAGAUGGAGCGCAUCGCAAUAAGGA
Bases: A 10, C: 4, G: 8, T: 0, total: 25
 
Nucleotide counts for:
CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATGCTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA
Bases: A 74, C: 57, G: 75, T: 94, total: 300
</pre>
 
=={{header|Go}}==
{{trans|Julia}}
<syntaxhighlight lang="go">package main
 
import (
Line 398 ⟶ 560:
____________________
Total length 300
</pre>
 
=={{header|Java}}==
<syntaxhighlight lang="java">
 
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
 
public final class BioinformaticsGlobalAlignment {

    // Find and report every shortest common superstring for each built-in test set.
    public static void main(String[] aArgs) {
        List<List<String>> testSequences = Arrays.asList(
            Arrays.asList( "TA", "AAG", "TA", "GAA", "TA" ),
            Arrays.asList( "CATTAGGG", "ATTAG", "GGG", "TA" ),
            Arrays.asList( "AAGAUGGA", "GGAGCGCAUC", "AUCGCAAUAAGGA" ),
            Arrays.asList(
                "ATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTAT",
                "GGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGT",
                "CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA",
                "TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC",
                "AACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT",
                "GCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTC",
                "CGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATTCTGCTTATAACACTATGTTCT",
                "TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC",
                "CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATGCTCGTGC",
                "GATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATT",
                "TTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC",
                "CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA",
                "TCTCTTAAACTCCTGCTAAATGCTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGA" )
        );

        for ( List<String> fragments : testSequences ) {
            Set<String> superstrings = shortestCommonSuperstrings(fragments);
            for ( String superstring : superstrings ) {
                printReport(superstring);
            }
        }
    }

    // Return a set containing all of the shortest common superstrings of the given list of strings.
    // Every ordering of the deduplicated words is merged with concatenate(); the shortest results are kept.
    private static Set<String> shortestCommonSuperstrings(List<String> aList) {
        List<String> words = deduplicate(aList);

        // Start from the deduplicated words joined without merging any overlap.
        Set<String> best = new HashSet<String>();
        best.add(String.join("", words));

        // The initial bound is the combined length of all the original strings.
        int bestLength = 0;
        for ( String word : aList ) {
            bestLength += word.length();
        }

        for ( List<String> ordering : permutations(words) ) {
            String merged = "";
            for ( String word : ordering ) {
                merged = concatenate(merged, word);
            }

            final int mergedLength = merged.length();
            if ( mergedLength > bestLength ) {
                continue;
            }
            if ( mergedLength < bestLength ) {
                // Strictly shorter: discard everything collected so far.
                best.clear();
                bestLength = mergedLength;
            }
            best.add(merged);
        }
        return best;
    }

    // Remove duplicate strings and strings which are substrings of other strings in the given list.
    // The surviving strings keep the order of their first occurrence in the given list.
    private static List<String> deduplicate(List<String> aList) {
        List<String> distinct = aList.stream().distinct().collect(Collectors.toList());

        List<String> kept = new ArrayList<String>();
        for ( String candidate : distinct ) {
            boolean containedElsewhere = false;
            for ( String other : distinct ) {
                if ( ! other.equals(candidate) && other.contains(candidate) ) {
                    containedElsewhere = true;
                    break;
                }
            }
            if ( ! containedElsewhere ) {
                kept.add(candidate);
            }
        }
        return kept;
    }

    // Return aBefore concatenated with aAfter, removing the longest suffix of aBefore that matches a prefix of aAfter.
    private static String concatenate(String aBefore, String aAfter) {
        final int length = aBefore.length();

        // Advance past every offset whose suffix is not a prefix of aAfter; longest suffix is tried first.
        int start = 0;
        while ( start < length && ! aAfter.startsWith(aBefore.substring(start)) ) {
            start++;
        }

        // When no suffix matched, start equals length and the whole of aBefore is kept.
        return aBefore.substring(0, start) + aAfter;
    }

    // Return all permutations of the given list of strings.
    // The given list is reordered in place; each result entry is an independent copy.
    private static List<List<String>> permutations(List<String> aList) {
        int[] counters = new int[aList.size()];
        List<List<String>> orderings = new ArrayList<List<String>>();
        orderings.add( new ArrayList<String>(aList) );

        for ( int i = 0; i < aList.size(); ) {
            if ( counters[i] >= i ) {
                // Position i is exhausted: reset it and move to the next position.
                counters[i] = 0;
                i++;
                continue;
            }

            final int j = ( i % 2 == 0 ) ? 0 : counters[i];
            // List.set returns the element it replaces, which makes this a swap of positions i and j.
            aList.set(i, aList.set(j, aList.get(i)));
            orderings.add( new ArrayList<String>(aList) );
            counters[i]++;
            i = 0;
        }
        return orderings;
    }

    // Print a report of the given string to the standard output device.
    // The total counts every character of the text, not only 'A', 'C', 'G' and 'T'.
    private static void printReport(String aText) {
        Map<Character, Integer> counts = new HashMap<Character, Integer>();
        // Pre-seed the four bases so that each one is reported even when absent from the text.
        for ( char base : "ACGT".toCharArray() ) {
            counts.put(base, 0);
        }
        for ( int i = 0; i < aText.length(); i++ ) {
            counts.merge(aText.charAt(i), 1, Integer::sum);
        }

        int total = 0;
        for ( int count : counts.values() ) {
            total += count;
        }

        // Long sequences are printed on a line of their own.
        final String separator = ( aText.length() > 50 ) ? System.lineSeparator() : "";
        System.out.print("Nucleotide counts for: " + separator);
        System.out.println(aText);
        System.out.print(String.format("Bases: A: %d, C: %d, G: %d, T: %d",
            counts.get('A'), counts.get('C'), counts.get('G'), counts.get('T')));
        System.out.println(", total: " + total + System.lineSeparator());
    }

}
</syntaxhighlight>
{{ out }}
<pre>
Nucleotide counts for: TAGAAG
Bases: A: 3, C: 0, G: 2, T: 1, total: 6
 
Nucleotide counts for: GAAGTA
Bases: A: 3, C: 0, G: 2, T: 1, total: 6
 
Nucleotide counts for: TAAGAA
Bases: A: 4, C: 0, G: 1, T: 1, total: 6
 
Nucleotide counts for: CATTAGGG
Bases: A: 2, C: 1, G: 3, T: 2, total: 8
 
Nucleotide counts for: AAGAUGGAGCGCAUCGCAAUAAGGA
Bases: A: 10, C: 4, G: 8, T: 0, total: 25
 
Nucleotide counts for:
CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATGCTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA
Bases: A: 74, C: 57, G: 75, T: 94, total: 300
</pre>
 
Line 403 ⟶ 719:
{{works with|jq}}
'''Works with gojq, the Go implementation of jq'''
<syntaxhighlight lang="jq">
### Generic helper functions
 
Line 415 ⟶ 731:
range(0;length) as $i
| [.[$i]] + (del(.[$i])|permutations)
end ;</syntaxhighlight><syntaxhighlight lang="jq">
# Give a synoptic view of the input string,
# highlighting the occurrence of ACGTU letters
Line 477 ⟶ 793:
</syntaxhighlight>
'''The specific tasks'''
<syntaxhighlight lang="jq">
def examples:
[
Line 556 ⟶ 872:
Σ: 300
</pre>
 
=={{header|Julia}}==
<syntaxhighlight lang="julia">using Combinatorics
 
""" Given a DNA sequence, report the sequence, length and base counts"""
Line 681 ⟶ 996:
Total length 300
</pre>
 
=={{header|Nim}}==
{{trans|Wren}}
<syntaxhighlight lang="nim">import algorithm, sequtils, strformat, strutils, tables
 
const ACGT = ['A', 'C', 'G', 'T'] # Four DNA bases.
Line 807 ⟶ 1,121:
————————————————————
Total length 300</pre>
 
=={{header|Pascal}}==
Used a matrix of head-tail overlapping and modified n-queens to generate the permutations.<BR>
Here this takes nearly no runtime. But see [[N-queens_problem]]: using permutations is not the way to go for > 17<BR>
Of course this is more a traveling salesman problem.
<syntaxhighlight lang="pascal">
program BaseInDNA;
{$IFDEF FPC}
Line 1,125 ⟶ 1,438:
A : 74 C : 57 G : 75 T : 94 U : 0
</pre>
 
=={{header|Perl}}==
<syntaxhighlight lang="perl">#!/usr/bin/perl
 
use strict; # https://rosettacode.org/wiki/Bioinformatics/global_alignment
Line 1,225 ⟶ 1,537:
{ A => 74, C => 57, G => 75, T => 94 }
</pre>
 
=={{header|Phix}}==
<!--<syntaxhighlight lang="phix">(phixonline)-->
<span style="color: #008080;">procedure</span> <span style="color: #000000;">printcounts</span><span style="color: #0000FF;">(</span><span style="color: #004080;">sequence</span> <span style="color: #000000;">ss</span><span style="color: #0000FF;">)</span>
<span style="color: #000080;font-style:italic;">-- Given DNA sequence(s), report the sequence, length and base counts</span>
Line 1,339 ⟶ 1,650:
Base counts: Other:0, A:74, C:57, G:75, T:94, total:300
</pre>
 
=={{header|Python}}==
{{trans|Go}}
 
<syntaxhighlight lang="python">import os
 
from collections import Counter
Line 1,523 ⟶ 1,833:
Total length 300
</pre>
 
=={{header|Raku}}==
{{trans|Go}}
{{trans|Julia}}
<syntaxhighlight lang="raku" line># 20210209 Raku programming solution
 
sub printCounts(\seq) {
Line 1,600 ⟶ 1,909:
(C 57 G 75 A 74 T 94) and total length = 300
</pre>
 
=={{header|Wren}}==
{{trans|Julia}}
Line 1,607 ⟶ 1,915:
{{libheader|Wren-str}}
{{libheader|Wren-math}}
<syntaxhighlight lang="wren">import "./fmt" for Fmt
import "./seq" for Lst
import "./str" for Str
import "./math" for Int
 
/* Gets all permutations of a list of strings. */
9,485

edits