Bioinformatics/Global alignment: Difference between revisions

m
m (Re-add task markup)
m (→‎{{header|Wren}}: Minor tidy)
 
(6 intermediate revisions by one other user not shown)
Line 182:
Total length 300
</pre>
 
=={{header|C++}}==
<syntaxhighlight lang="c++">
 
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <numeric>
#include <unordered_map>
#include <unordered_set>
#include <string>
#include <vector>
 
// Print a report of the given string to the standard output device.
void print_report(const std::string& text) {
std::unordered_map<char, int32_t> bases;
for ( const char& ch : text ) {
bases[ch]++;
}
 
const int32_t total = std::accumulate(bases.begin(), bases.end(), 0,
[&](int32_t previous_sum, std::pair<char, int32_t> entry) {
return previous_sum + entry.second;
});
 
std::cout << "Nucleotide counts for: " << ( ( text.length() > 50 ) ? "\n" : "" );
std::cout << text << std::endl;
std::cout << "Bases: A " << bases['A'] << ", C: " << bases['C'] << ", G: " << bases['G'] << ", T: " << bases['T']
<< ", total: " << total << "\n" << std::endl;
}
 
// Return all permutations of the given list of strings.
std::vector<std::vector<std::string>> permutations(std::vector<std::string>& list) {
int32_t indexes[list.size()] = {};
std::vector<std::vector<std::string>> result;
result.push_back(list);
int32_t i = 0;
while ( (uint64_t) i < list.size() ) {
if ( indexes[i] < i ) {
const int j = ( i % 2 == 0 ) ? 0 : indexes[i];
std::swap(list[i], list[j]);
result.push_back(list);
indexes[i]++;
i = 0;
} else {
indexes[i] = 0;
i++;
}
}
return result;
}
 
// Return 'before' concatenated with 'after', removing the longest suffix of 'before' that matches a prefix of 'after'.
std::string concatenate(const std::string& before, const std::string& after) {
for ( uint64_t i = 0; i < before.length(); ++i ) {
if ( after.starts_with(before.substr(i, before.length())) ) {
return before.substr(0, i) + after;
}
}
return before + after;
}
 
// Remove duplicate strings and strings which are substrings of other strings in the given list.
std::vector<std::string> deduplicate(const std::vector<std::string>& list) {
std::vector<std::string> singletons(list);
std::sort(singletons.begin(), singletons.end());
singletons.erase(std::unique(singletons.begin(), singletons.end()), singletons.end());
 
std::vector<std::string> result(singletons);
std::unordered_set<std::string> marked_for_removal;
for ( const std::string& test_word : result ) {
for ( const std::string& word : singletons ) {
if ( word != test_word && word.find(test_word) != std::string::npos ) {
marked_for_removal.emplace(test_word);
}
}
}
 
result.erase(std::remove_if(result.begin(), result.end(),
[&](std::string& word) {
return marked_for_removal.count(word) != 0;
}
), result.end());
 
return result;
}
 
// Return a set containing all of the shortest common superstrings of the given list of strings.
std::unordered_set<std::string> shortest_common_superstrings(const std::vector<std::string>& list) {
std::vector<std::string> deduplicated = deduplicate(list);
 
std::unordered_set<std::string> shortest;
shortest.emplace(std::reduce(list.begin(), list.end(), std::string("")));
 
uint64_t shortest_length;
for ( const std::string& word : list ) {
shortest_length += word.length();
}
 
for ( std::vector<std::string> permutation : permutations(deduplicated) ) {
std::string candidate;
for ( const std::string& word : permutation ) {
candidate = concatenate(candidate, word);
}
 
if ( candidate.length() < shortest_length ) {
shortest.clear();
shortest.emplace(candidate);
shortest_length = candidate.length();
} else if ( candidate.length() == shortest_length ) {
shortest.emplace(candidate);
}
}
return shortest;
}
 
int main() {
const std::vector<std::vector<std::string>> test_sequences = {
{ "TA", "AAG", "TA", "GAA", "TA" },
{ "CATTAGGG", "ATTAG", "GGG", "TA" },
{ "AAGAUGGA", "GGAGCGCAUC", "AUCGCAAUAAGGA" },
{ "ATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTAT",
"GGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGT",
"CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA",
"TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC",
"AACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT",
"GCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTC",
"CGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATTCTGCTTATAACACTATGTTCT",
"TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC",
"CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATGCTCGTGC",
"GATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATT",
"TTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC",
"CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA",
"TCTCTTAAACTCCTGCTAAATGCTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGA" } };
 
for ( const std::vector<std::string>& test : test_sequences ) {
for ( const std::string& superstring : shortest_common_superstrings(test) ) {
print_report(superstring);
}
}
}
</syntaxhighlight>
<pre>
Nucleotide counts for: TAGAAG
Bases: A 3, C: 0, G: 2, T: 1, total: 6
 
Nucleotide counts for: TAAGAA
Bases: A 4, C: 0, G: 1, T: 1, total: 6
 
Nucleotide counts for: GAAGTA
Bases: A 3, C: 0, G: 2, T: 1, total: 6
 
Nucleotide counts for: CATTAGGG
Bases: A 2, C: 1, G: 3, T: 2, total: 8
 
Nucleotide counts for: AAGAUGGAGCGCAUCGCAAUAAGGA
Bases: A 10, C: 4, G: 8, T: 0, total: 25
 
Nucleotide counts for:
CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATGCTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA
Bases: A 74, C: 57, G: 75, T: 94, total: 300
</pre>
 
=={{header|Go}}==
{{trans|Julia}}
Line 398 ⟶ 561:
Total length 300
</pre>
 
=={{header|Java}}==
<syntaxhighlight lang="java">
 
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
 
public final class BioinformaticsGlobalAlignment {
 
public static void main(String[] aArgs) {
List<List<String>> testSequences = Arrays.asList(
Arrays.asList( "TA", "AAG", "TA", "GAA", "TA" ),
Arrays.asList( "CATTAGGG", "ATTAG", "GGG", "TA" ),
Arrays.asList( "AAGAUGGA", "GGAGCGCAUC", "AUCGCAAUAAGGA" ),
Arrays.asList( "ATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTAT",
"GGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGT",
"CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA",
"TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC",
"AACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT",
"GCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTC",
"CGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATTCTGCTTATAACACTATGTTCT",
"TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC",
"CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATGCTCGTGC",
"GATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATT",
"TTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC",
"CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA",
"TCTCTTAAACTCCTGCTAAATGCTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGA" )
);
for ( List<String> test : testSequences ) {
for ( String superstring : shortestCommonSuperstrings(test) ) {
printReport(superstring);
}
}
}
// Return a set containing all of the shortest common superstrings of the given list of strings.
private static Set<String> shortestCommonSuperstrings(List<String> aList) {
List<String> deduplicated = deduplicate(aList);
Set<String> shortest = new HashSet<String>();
shortest.add(String.join("", deduplicated));
int shortestLength = aList.stream().mapToInt( s -> s.length() ).sum();
for ( List<String> permutation : permutations(deduplicated) ) {
String candidate = permutation.stream().reduce("", (a, b) -> concatenate(a, b) );
if ( candidate.length() < shortestLength ) {
shortest.clear();
shortest.add(candidate);
shortestLength = candidate.length();
} else if ( candidate.length() == shortestLength ) {
shortest.add(candidate);
}
}
return shortest;
}
 
// Remove duplicate strings and strings which are substrings of other strings in the given list.
private static List<String> deduplicate(List<String> aList) {
List<String> unique = aList.stream().distinct().collect(Collectors.toList());
List<String> result = new ArrayList<String>(unique);
List<String> markedForRemoval = new ArrayList<String>();
for ( String testWord : result ) {
for ( String word : unique ) {
if ( ! word.equals(testWord) && word.contains(testWord) ) {
markedForRemoval.add(testWord);
}
}
}
result.removeAll(markedForRemoval);
return result;
}
// Return aBefore concatenated with aAfter, removing the longest suffix of aBefore that matches a prefix of aAfter.
private static String concatenate(String aBefore, String aAfter) {
for ( int i = 0; i < aBefore.length(); i++ ) {
if ( aAfter.startsWith(aBefore.substring(i, aBefore.length())) ) {
return aBefore.substring(0, i) + aAfter;
}
}
return aBefore + aAfter;
}
// Return all permutations of the given list of strings.
private static List<List<String>> permutations(List<String> aList) {
int[] indexes = new int[aList.size()];
List<List<String>> result = new ArrayList<List<String>>();
result.add( new ArrayList<String>(aList) );
int i = 0;
while ( i < aList.size() ) {
if ( indexes[i] < i ) {
final int j = ( i % 2 == 0 ) ? 0 : indexes[i];
String temp = aList.get(j);
aList.set(j, aList.get(i));
aList.set(i, temp);
result.add( new ArrayList<String>(aList) );
indexes[i]++;
i = 0;
} else {
indexes[i] = 0;
i += 1;
}
}
return result;
}
// Print a report of the given string to the standard output device.
private static void printReport(String aText) {
char[] nucleotides = new char[] {'A', 'C', 'G', 'T' };
Map<Character, Integer> bases = new HashMap<Character, Integer>();
for ( char base : nucleotides ) {
bases.put(base, 0);
}
for ( char ch : aText.toCharArray() ) {
bases.merge(ch, 1, Integer::sum);
}
final int total = bases.values().stream().reduce(0, Integer::sum);
System.out.print("Nucleotide counts for: " + ( ( aText.length() > 50 ) ? System.lineSeparator() : "") );
System.out.println(aText);
System.out.print(String.format("%s%d%s%d%s%d%s%d",
"Bases: A: ", bases.get('A'), ", C: ", bases.get('C'), ", G: ", bases.get('G'), ", T: ", bases.get('T')));
System.out.println(", total: " + total + System.lineSeparator());
}
 
}
</syntaxhighlight>
{{ out }}
<pre>
Nucleotide counts for: TAGAAG
Bases: A: 3, C: 0, G: 2, T: 1, total: 6
 
Nucleotide counts for: GAAGTA
Bases: A: 3, C: 0, G: 2, T: 1, total: 6
 
Nucleotide counts for: TAAGAA
Bases: A: 4, C: 0, G: 1, T: 1, total: 6
 
Nucleotide counts for: CATTAGGG
Bases: A: 2, C: 1, G: 3, T: 2, total: 8
 
Nucleotide counts for: AAGAUGGAGCGCAUCGCAAUAAGGA
Bases: A: 10, C: 4, G: 8, T: 0, total: 25
 
Nucleotide counts for:
CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATGCTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA
Bases: A: 74, C: 57, G: 75, T: 94, total: 300
</pre>
 
=={{header|jq}}==
{{works with|jq}}
Line 1,597 ⟶ 1,915:
{{libheader|Wren-str}}
{{libheader|Wren-math}}
<syntaxhighlight lang="ecmascriptwren">import "./fmt" for Fmt
import "./seq" for Lst
import "./str" for Str
import "./math" for Int
 
/* Gets all permutations of a list of strings. */
9,485

edits