Bioinformatics/Global alignment
Global alignment is designed to search for highly similar regions in two or more DNA sequences, where the sequences appear in the same order and orientation, fitting the sequences in as pieces in a puzzle.
Current DNA sequencers find the sequence for multiple small segments of DNA which have mostly randomly formed by splitting a much larger DNA molecule into shorter segments. When re-assembling such segments of DNA sequences into a larger sequence to form, for example, the DNA coding for the relevant gene, the overlaps between multiple shorter sequences are commonly used to decide how the longer sequence is to be assembled. For example, "AAGATGGA", GGAGCGCATC", and "ATCGCAATAAGGA" can be assembled into the sequence "AAGATGGAGCGCATCGCAATAAGGA" by noting that "GGA" is at the tail of the first string and head of the second string and "ATC" likewise is at the tail of the second and head of the third string.
When looking for the best global alignment in the output strings produced by DNA sequences, there are typically a large number of such overlaps among a large number of sequences. In such a case, the ordering that results in the shortest common superstring is generrally preferred.
Finding such a supersequence is an NP-hard problem, and many algorithms have been proposed to shorten calculations, especially when many very long sequences are matched.
The shortest common superstring as used in bioinfomatics here differs from the string task Shortest_common_supersequence. In that task, a supersequence may have other characters interposed as long as the characters of each subsequence appear in order, so that (abcbdab, abdcaba) -> abdcabdab. In this task, (abcbdab, abdcaba) -> abcbdabdcaba.
- Task
-
- Given N non-identical strings of characters A, C, G, and T representing N DNA sequences, find the shortest DNA sequence containing all N sequences.
- Handle cases where two sequences are identical or one sequence is entirely contained in another.
- Print the resulting sequence along with its size (its base count) and a count of each base in the sequence.
- Find the shortest common superstring for the following four examples:
("TA", "AAG", "TA", "GAA", "TA")
("CATTAGGG", "ATTAG", "GGG", "TA")
("AAGAUGGA", "GGAGCGCAUC", "AUCGCAAUAAGGA")
("ATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTAT", "GGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGT", "CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA", "TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC", "AACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT", "GCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTC", "CGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATTCTGCTTATAACACTATGTTCT", "TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC", "CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATGCTCGTGC", "GATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATT", "TTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC", "CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA", "TCTCTTAAACTCCTGCTAAATGCTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGA")
- Related tasks
Go
<lang go>package main
import (
"fmt" "strings"
)
/* Gets n! for small n. */ func factorial(n int) int {
fact := 1 for i := 2; i <= n; i++ { fact *= i } return fact
}
/* Gets all permutations of a list of strings. */ func getPerms(input []string) [][]string {
perms := [][]string{input} le := len(input) a := make([]string, le) copy(a, input) n := le - 1 fact := factorial(n + 1)
for c := 1; c < fact; c++ { i := n - 1 j := n for i >= 0 && a[i] > a[i+1] { i-- } if i == -1 { i = n } for a[j] < a[i] { j-- } a[i], a[j] = a[j], a[i] j = n i++ if i == n+1 { i = 0 } for i < j { a[i], a[j] = a[j], a[i] i++ j-- } b := make([]string, le) copy(b, a) perms = append(perms, b) } return perms
}
/* Returns all distinct elements from a list of strings. */ func distinct(slist []string) []string {
distinctSet := make(map[string]int, len(slist)) i := 0 for _, s := range slist { if _, ok := distinctSet[s]; !ok { distinctSet[s] = i i++ } } result := make([]string, len(distinctSet)) for s, i := range distinctSet { result[i] = s } return result
}
/* Given a DNA sequence, report the sequence, length and base counts. */ func printCounts(seq string) {
bases := [][]rune{{'A', 0}, {'C', 0}, {'G', 0}, {'T', 0}} for _, c := range seq { for _, base := range bases { if c == base[0] { base[1]++ } } } sum := 0 fmt.Println("\nNucleotide counts for", seq, "\b:\n") for _, base := range bases { fmt.Printf("%10c%12d\n", base[0], base[1]) sum += int(base[1]) } le := len(seq) fmt.Printf("%10s%12d\n", "Other", le-sum) fmt.Printf(" ____________________\n%14s%8d\n", "Total length", le)
}
/* Return the position in s1 of the start of overlap of tail of string s1 with head of string s2. */ func headTailOverlap(s1, s2 string) int {
for start := 0; ; start++ { ix := strings.IndexByte(s1[start:], s2[0]) if ix == -1 { return 0 } else { start += ix } if strings.HasPrefix(s2, s1[start:]) { return len(s1) - start } }
}
/* Remove duplicates and strings contained within a larger string from a list of strings. */ func deduplicate(slist []string) []string {
var filtered []string arr := distinct(slist) for i, s1 := range arr { withinLarger := false for j, s2 := range arr { if j != i && strings.Contains(s2, s1) { withinLarger = true break } } if !withinLarger { filtered = append(filtered, s1) } } return filtered
}
/* Returns shortest common superstring of a list of strings. */ func shortestCommonSuperstring(slist []string) string {
ss := deduplicate(slist) shortestSuper := strings.Join(ss, "") for _, perm := range getPerms(ss) { sup := perm[0] for i := 0; i < len(ss)-1; i++ { overlapPos := headTailOverlap(perm[i], perm[i+1]) sup += perm[i+1][overlapPos:] } if len(sup) < len(shortestSuper) { shortestSuper = sup } } return shortestSuper
}
func main() {
testSequences := [][]string{ {"TA", "AAG", "TA", "GAA", "TA"}, {"CATTAGGG", "ATTAG", "GGG", "TA"}, {"AAGAUGGA", "GGAGCGCAUC", "AUCGCAAUAAGGA"}, { "ATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTAT", "GGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGT", "CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA", "TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC", "AACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT", "GCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTC", "CGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATTCTGCTTATAACACTATGTTCT", "TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC", "CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATGCTCGTGC", "GATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATT", "TTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC", "CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA", "TCTCTTAAACTCCTGCTAAATGCTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGA", }, }
for _, test := range testSequences { scs := shortestCommonSuperstring(test) printCounts(scs) }
}</lang>
- Output:
Nucleotide counts for TAAGAA: A 4 C 0 G 1 T 1 Other 0 ____________________ Total length 6 Nucleotide counts for CATTAGGG: A 2 C 1 G 3 T 2 Other 0 ____________________ Total length 8 Nucleotide counts for AAGAUGGAGCGCAUCGCAAUAAGGA: A 10 C 4 G 8 T 0 Other 3 ____________________ Total length 25 Nucleotide counts for CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATGCTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA: A 74 C 57 G 75 T 94 Other 0 ____________________ Total length 300
Julia
<lang julia>using Combinatorics
""" Given a DNA sequence, report the sequence, length and base counts""" function printcounts(seq)
bases = [['A', 0], ['C', 0], ['G', 0], ['T', 0]] for c in seq, base in bases if c == base[1] base[2] += 1 end end println("\nNucleotide counts for $seq:\n") for base in bases println(lpad(base[1], 10), lpad(string(base[2]), 12)) end println(lpad("Other", 10), lpad(string(length(seq) - sum(x[2] for x in bases)), 12)) println(" _________________\n", lpad("Total length", 14), lpad(string(length(seq)), 8))
end
"""Return the position in s1 of the start of overlap of tail of string s1 with head of string s2""" function headtailoverlap(s1, s2, minimumoverlap=1)
start = 1 while true range = findnext(s2[1:minimumoverlap], s1, start) range == nothing && return 0 start = range.start startswith(s2, s1[start:end]) && return length(s1) - start + 1 start += 1 end
end
"""Remove duplicates and strings contained within a larger string from vector of strings""" function deduplicate(svect)
filtered = empty(svect) arr = unique(svect) for (i, s1) in enumerate(arr) any(p -> p[1] != i && occursin(s1, p[2]), enumerate(arr)) && continue push!(filtered, s1) end return filtered
end
"""Returns shortest common superstring of a vector of strings""" function shortest_common_superstring(svect)
ss = deduplicate(svect) shortestsuper = prod(ss) for perm in permutations(ss) sup = first(perm) for i in 1:length(ss)-1 overlap_position = headtailoverlap(perm[i], perm[i+1], 1) sup *= perm[i + 1][overlap_position+1:end] end if length(sup) < length(shortestsuper) shortestsuper = sup end end return shortestsuper
end
testsequences = [ ["TA", "AAG", "TA", "GAA", "TA"], ["CATTAGGG", "ATTAG", "GGG", "TA"], ["AAGAUGGA", "GGAGCGCAUC", "AUCGCAAUAAGGA"], ["ATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTAT", "GGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGT", "CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA", "TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC", "AACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT", "GCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTC", "CGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATTCTGCTTATAACACTATGTTCT", "TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC", "CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATGCTCGTGC", "GATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATT", "TTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC", "CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA", "TCTCTTAAACTCCTGCTAAATGCTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGA"] ]
for test in testsequences
scs = shortest_common_superstring(test) printcounts(scs)
end
</lang>
- Output:
Nucleotide counts for TAAGAA: A 4 C 0 G 1 T 1 Other 0 _________________ Total length 6 Nucleotide counts for CATTAGGG: A 2 C 1 G 3 T 2 Other 0 _________________ Total length 8 Nucleotide counts for AAGAUGGAGCGCAUCGCAAUAAGGA: A 10 C 4 G 8 T 0 Other 3 _________________ Total length 25 Nucleotide counts for CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATGCTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA: A 74 C 57 G 75 T 94 Other 0 _________________ Total length 300
Nim
<lang Nim>import algorithm, sequtils, strformat, strutils, tables
const ACGT = ['A', 'C', 'G', 'T'] # Four DNA bases.
iterator permutations(slist: seq[string]): seq[string] =
var slist = sorted(slist) yield slist while slist.nextPermutation(): yield slist
proc printCounts(dnaSeq: string) =
## Given a DNA sequence, report the sequence, length and base counts. let counts = dnaSeq.toCountTable() echo &"\nNucleotide counts for {dnaSeq}:\n" for base in ACGT: echo &"{($base):>10} {counts[base]:11}" var others = 0 for base in counts.keys: if base notin ACGT: inc others, counts[base] echo &" Other {others:11}" echo &" ————————————————————" echo &" Total length {dnaSeq.len: 7}"
func headTailOverlap(s1, s2: string): int =
## Return the position in "s1" of the start of overlap ## of tail of string "s1" with head of string "s2". var start = 0 while true: start = s1.find(s2[0], start) if start < 0: return 0 if s2.startsWith(s1[start..^1]): return s1.len - start inc start
proc deduplicate(slist: seq[string]): seq[string] =
## Remove duplicates and strings contained within a larger string from a list of strings. let slist = sequtils.deduplicate(slist) for i, s1 in slist: block check: for j, s2 in slist: if j != i and s1 in s2: break check # "s1" is not contained in another string. result.add s1
func shortestCommonSuperstring(slist: seq[string]): string =
## Return shortest common superstring of a list of strings.
let slist = slist.deduplicate() result = slist.join() for perm in slist.permutations(): var sup = perm[0] for i in 0..<slist.high: let overlapPos = headTailOverlap(perm[i], perm[i+1]) sup &= perm[i+1][overlapPos..^1] if sup.len < result.len: result = sup
const TestSequences = [
@["TA", "AAG", "TA", "GAA", "TA"], @["CATTAGGG", "ATTAG", "GGG", "TA"], @["AAGAUGGA", "GGAGCGCAUC", "AUCGCAAUAAGGA"], @["ATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTAT", "GGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGT", "CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA", "TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC", "AACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT", "GCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTC", "CGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATTCTGCTTATAACACTATGTTCT", "TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC", "CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATGCTCGTGC", "GATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATT", "TTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC", "CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA", "TCTCTTAAACTCCTGCTAAATGCTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGA"]]
for test in TestSequences:
let scs = test.shortestCommonSuperstring scs.printCounts()</lang>
- Output:
Nucleotide counts for GAAGTA: A 3 C 0 G 2 T 1 Other 0 ———————————————————— Total length 6 Nucleotide counts for CATTAGGG: A 2 C 1 G 3 T 2 Other 0 ———————————————————— Total length 8 Nucleotide counts for AAGAUGGAGCGCAUCGCAAUAAGGA: A 10 C 4 G 8 T 0 Other 3 ———————————————————— Total length 25 Nucleotide counts for CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATGCTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA: A 74 C 57 G 75 T 94 Other 0 ———————————————————— Total length 300
Perl
<lang perl>#!/usr/bin/perl
use strict; # https://rosettacode.org/wiki/Bioinformatics/global_alignment use warnings; use List::Util qw( first uniq );
my @seq = (
[ qw( TA AAG TA GAA TA ) ],
[ qw( CATTAGGG ATTAG GGG TA) ],
[ qw( AAGAUGGA GGAGCGCAUC AUCGCAAUAAGGA ) ],
[ qw( ATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTAT GGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGT CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC AACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT GCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTC CGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATTCTGCTTATAACACTATGTTCT TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATGCTCGTGC GATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATT TTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA TCTCTTAAACTCCTGCTAAATGCTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGA ) ], );
sub removedups # remove dups and subseqs
{ local $_ = join ' ', sort { length $a <=> length $b } split ' ', shift; 1 while s/\b(\w+) (?=.*\1)//; return $_; }
for ( @seq )
{ local $_ = removedups join ' ', @$_; my @queue = $_; my @best;
while( @queue ) { local $_ = shift @queue; my @seq = split ' ', $_; my @over; for my $left ( @seq ) { for my $right ( @seq ) { $left eq $right and next; "$left $right" =~ /(.+) \1/ or next; my $len = length $1; $over[$len] .= "$left $right\n"; } } if( @over ) { for my $join ( split /\n/, $over[-1] ) { my ($left, $right) = split ' ', $join; my @newseq = grep $_ ne $left && $_ ne $right, @seq; # remove used push @queue, removedups "$left $right" =~ s/(.+) (?=\1)//r . join ' ', , @newseq; } } else { tr/ //d; $best[length] .= "$_\n"; next; } }
for ( uniq split /\n/, first {defined} @best ) { printf "\nlength %d - %s\n", length, $_; my %ch; $ch{$_}++ for /./g; use Data::Dump 'dd'; dd \%ch; } }</lang>
- Output:
length 6 - TAGAAG { A => 3, G => 2, T => 1 } length 8 - CATTAGGG { A => 2, C => 1, G => 3, T => 2 } length 25 - AAGAUGGAGCGCAUCGCAAUAAGGA { A => 10, C => 4, G => 8, U => 3 } length 300 - CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATGCTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA { A => 74, C => 57, G => 75, T => 94 }
Phix
procedure printcounts(sequence ss) -- Given DNA sequence(s), report the sequence, length and base counts for i=1 to length(ss) do string dna = ss[i] sequence acgt = repeat(0,6) for j=1 to length(dna) do acgt[find(dna[j],"ACGT")+1] += 1 end for acgt[$] = sum(acgt) string ncf = "Nucleotide counts for :" printf(1,"%s%s\n",{ncf,join(split_by(dna,50),"\n"&repeat(' ',length(ncf)))}) printf(1,"Base counts: Other:%d, A:%d, C:%d, G:%d, T:%d, total:%d\n\n",acgt) end for end procedure function deduplicate(sequence ss) -- Remove any strings contained within a larger string from a set of strings sequence filtered = {} for i=1 to length(ss) do string si = ss[i] bool found = false for j=1 to length(ss) do if i!=j and match(si,ss[j]) then found = true exit end if end for if not found then filtered = append(filtered, si) end if end for return filtered end function procedure shortest_common_superstring(sequence ss) -- Returns shortest common superstring of a set of strings ss = deduplicate(unique(ss)) sequence shortestsuper = {join(ss,"")} integer shortest = length(shortestsuper[1]) for p=1 to factorial(length(ss)) do sequence perm = permute(p,ss) string sup = perm[1] for i=2 to length(perm) do string pi = perm[i] for j=-min(length(pi),length(sup)) to 0 do string overlap = sup[j..$] if overlap = pi[1..length(overlap)] then sup &= pi[length(overlap)+1..$] pi = "" exit end if end for if length(pi) then ?9/0 end if -- (sanity chk) end for if length(sup) < shortest then shortest = length(sup) shortestsuper = {sup} elsif length(sup) = shortest and not find(sup,shortestsuper) then shortestsuper = append(shortestsuper,sup) end if end for printcounts(shortestsuper) end procedure constant tests = { {"TA", "AAG", "TA", "GAA", "TA"}, {"CATTAGGG", "ATTAG", "GGG", "TA"}, {"AAGAUGGA", "GGAGCGCAUC", "AUCGCAAUAAGGA"}, {"ATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTAT", "GGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGT", "CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA", "TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC", "AACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT", "GCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTC"