Bioinformatics/Global alignment: Difference between revisions
Content added Content deleted
(A Python implementation) |
|||
Line 270: | Line 270: | ||
____________________ |
____________________ |
||
Total length 300 |
Total length 300 |
||
</pre> |
|||
=={{header|jq}}== |
|||
{{works with|jq}} |
|||
'''Works with gojq, the Go implementation of jq''' |
|||
<lang jq> |
|||
### Generic helper functions |
|||
# bag-of-words: tally the items emitted by `stream`.
# Each item is converted with `tostring` and used as an object key;
# the result maps every distinct key to the number of times it occurred.
# An empty stream produces {}.
def bow(stream):
  reduce (stream | tostring) as $key ({};
    .[$key] = (.[$key] + 1));   # null + 1 == 1, so unseen keys start at 1
|||
# Emit every permutation of the input array, one array per output.
# For each position, that element is placed first and followed by each
# permutation of the remaining elements. An empty input emits [] once.
def permutations:
  if length > 0
  then range(0; length) as $pos
       | (del(.[$pos]) | permutations) as $rest
       | [.[$pos]] + $rest
  else []
  end;</lang><lang jq>
|||
# Input: a string.
# Output: a stream of report lines summarising the string:
#   - a heading that quotes the string,
#   - one "<letter>: <count>" line for each of A, C, G, T, U (0 if absent),
#     followed by one line for every other character that occurs,
#   - a "__" separator,
#   - the total length of the string.
def synopsis:
  . as $sequence
  | ["A", "C", "G", "T", "U"] as $canonical
  # Count single-character strings, one per codepoint of the input.
  | bow(explode[] | [.] | implode) as $tally
  # Characters present in the input that are not in the canonical list.
  | ($tally | (keys - $canonical)) as $extras
  | "Nucleotide counts for \($sequence):\n",
    (($canonical + $extras)[] | "\(.): \($tally[.] // 0)"),
    "__",
    "Σ: \($sequence | length)";
|||
# Look for a suffix of $s1, at least $minimumoverlap characters long,
# that is also a prefix of $s2.
# Start positions in $s1 are tried from 0 upwards, so the first match is
# the longest such suffix.
# Returns {i1: <start index of the suffix in $s1>, overlap: <the suffix>},
# or null when no suffix qualifies.
def overlap_info($s1; $s2; $minimumoverlap):
  ($s1 | length) as $len
  | first(
      range(0; $len + 1 - $minimumoverlap)
      | {i1: ., overlap: ($s1[.:])}
      | select(.overlap as $suffix | $s2 | startswith($suffix))
    ) // null;
|||
# Input: an array of strings.
# Output: the sorted distinct strings of the input, minus every string
# that occurs as a substring of some other distinct string.
def deduplicate:
  unique as $distinct
  | $distinct
  | map(
      . as $candidate
      # Keep $candidate only if no *other* string contains it.
      | select(all($distinct[] | select(. != $candidate);
                   index($candidate) | not))
    );
|||
# Input: an array of strings.
# Merge the strings left to right, keeping their order: at each step the
# string built so far is joined to the next string by the longest
# suffix/prefix overlap reported by overlap_info, which must be at least
# $min characters long.
# Returns the merged string, or null if any step finds no such overlap.
# A one-element array returns that element; an empty array returns null.
def relevant($min):
  . as $strings
  | reduce range(1; length) as $j (.[0];
      if (. | not)
      then .                                   # an earlier step failed
      else overlap_info(.; $strings[$j]; $min) as $hit
        | if $hit
          # Append only the part of the next string beyond the overlap.
          then . + $strings[$j][($hit.overlap | length):]
          else null
          end
      end);
|||
# Input: an array of strings.
# Output: the shortest superstring found by trying every ordering of the
# deduplicated strings and merging each with `relevant(1)`.
# The starting candidate is the plain concatenation of the deduplicated
# strings; a merged ordering replaces it only if it is strictly shorter,
# so among equally short results the earliest one found is kept.
# NOTE: orderings in which some step has no overlap of at least one
# character yield null from `relevant(1)` and are ignored.
def shortest_common_superstring:
  deduplicate as $fragments
  | reduce ($fragments | permutations | relevant(1)) as $candidate
      ($fragments | add;
       if $candidate and ($candidate | length) < length
       then $candidate
       else .
       end);
|||
</lang> |
|||
'''The specific tasks''' |
|||
<lang jq> |
|||
# Task 1 input: short fragments, with "TA" listed three times.
def task1: ["TA", "AAG", "TA", "GAA", "TA"];
|||
# Task 2 input: one long fragment plus three shorter ones.
def task2: ["CATTAGGG", "ATTAG", "GGG", "TA"];
|||
# Task 3 input: three fragments written with U rather than T.
def task3: ["AAGAUGGA", "GGAGCGCAUC", "AUCGCAAUAAGGA"];
|||
# Task 4 input: thirteen longer fragments; some entries appear more than once.
def task4:
  [
    "ATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTAT",
    "GGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGT",
    "CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA",
    "TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC",
    "AACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT",
    "GCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTC",
    "CGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATTCTGCTTATAACACTATGTTCT",
    "TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC",
    "CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATGCTCGTGC",
    "GATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATT",
    "TTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC",
    "CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA",
    "TCTCTTAAACTCCTGCTAAATGCTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGA"
  ];
|||
# Run all four tasks in order. For each one, emit its title followed by
# the synopsis lines of the shortest common superstring of its fragments.
def tasks:
  # Input: an array of fragments. Emits $title, then the report lines.
  def report($title):
    $title, (shortest_common_superstring | synopsis);

  (task1 | report("Task 1:")),
  (task2 | report("\nTask 2:")),
  (task3 | report("\nTask 3:")),
  (task4 | report("\nTask 4:"));
|||
tasks</lang> |
|||
{{out}} |
|||
<pre> |
|||
Task 1: |
|||
Nucleotide counts for TAAGAA: |
|||
A: 4 |
|||
C: 0 |
|||
G: 1 |
|||
T: 1 |
|||
U: 0 |
|||
__ |
|||
Σ: 6 |
|||
Task 2: |
|||
Nucleotide counts for CATTAGGG: |
|||
A: 2 |
|||
C: 1 |
|||
G: 3 |
|||
T: 2 |
|||
U: 0 |
|||
__ |
|||
Σ: 8 |
|||
Task 3: |
|||
Nucleotide counts for AAGAUGGAGCGCAUCGCAAUAAGGA: |
|||
A: 10 |
|||
C: 4 |
|||
G: 8 |
|||
T: 0 |
|||
U: 3 |
|||
__ |
|||
Σ: 25 |
|||
Task 4: |
|||
Nucleotide counts for CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATGCTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA: |
|||
A: 74 |
|||
C: 57 |
|||
G: 75 |
|||
T: 94 |
|||
U: 0 |
|||
__ |
|||
Σ: 300 |
|||
</pre> |
</pre> |
||