Bioinformatics/Global alignment: Difference between revisions

Content added Content deleted

Inline

@@ Line 270: / Line 270: @@
   ____________________
   Total length     300
+</pre>
+=={{header|jq}}==
+{{works with|jq}}
+'''Works with gojq, the Go implementation of jq'''
+<lang jq>
+### Generic helper functions
+# bow(stream): "bag of words".
+# Tally the items emitted by `stream` into an object whose keys are the items
+# (converted with `tostring`) and whose values are their occurrence counts.
+def bow(stream):
+  reduce (stream | tostring) as $key ({};
+    .[$key] = (.[$key] + 1));
+# Input: an array.
+# Output: a stream of all permutations of the input array; the empty array
+# yields the single permutation [].
+def permutations:
+  if length > 0
+  then range(0; length) as $k
+       | .[$k] as $head            # element chosen to go first
+       | del(.[$k])                # remaining elements
+       | permutations
+       | [$head] + .
+  else []
+  end ;</lang><lang jq>
+# Input: a string.
+# Output: a stream of report lines for that string:
+#   - a heading naming the string,
+#   - one "letter: count" line for each of A, C, G, T, U (0 if absent),
+#     followed by one line for every other character that occurs,
+#   - a separator line, and
+#   - the total length of the string.
+def synopsis:
+  . as $seq
+  | bow(explode[] | [.] | implode) as $bases      # per-character counts
+  | ["A", "C", "G", "T", "U"] as $standard
+  | ($bases | keys - $standard) as $others        # characters outside ACGTU
+  | "Nucleotide counts for \($seq):\n",
+    ( ($standard + $others)[] as $base
+      | "\($base): \($bases[$base] // 0)" ),
+    "__",
+    "Σ: \($seq | length)" ;
+# overlap_info($s1; $s2; $minimumoverlap):
+# Look for the longest suffix of $s1, at least $minimumoverlap characters long,
+# that is also a prefix of $s2.
+# If one exists, return {i1: <index in $s1 where it starts>, overlap: <the shared string>};
+# otherwise return null.
+def overlap_info($s1; $s2; $minimumoverlap):
+  ($s1 | length) as $n
+  | first(
+      # Smaller start indices mean longer suffixes, so the first hit is the longest.
+      range(0; $n - $minimumoverlap + 1) as $start
+      | { i1: $start, overlap: $s1[$start:] }
+      | select(.overlap as $suffix | $s2 | startswith($suffix))
+    ) // null ;
+# Input: an array of strings.
+# Output: the distinct input strings (in the sorted order produced by `unique`)
+# that are not contained within any other distinct input string.
+def deduplicate:
+  unique as $distinct
+  | $distinct
+  | map( . as $candidate
+         # keep $candidate only if no *other* string contains it
+         | select( all( $distinct[];
+                        . == $candidate or (index($candidate) | not) ) ) ) ;
+# relevant($min):
+# Input: an array of strings.
+# Merge the strings left to right, in the given order: each next string is
+# appended to the running result minus the part that overlap_info reports as
+# shared (an overlap of at least $min characters is required).
+# Output: the merged string, or null as soon as one step has no such overlap.
+def relevant($min):
+  . as $strings
+  | reduce range(1; length) as $j (.[0];
+      if (. | not) then .                      # an earlier step already failed
+      else $strings[$j] as $next
+        | overlap_info(.; $next; $min) as $hit
+        | if $hit == null then null
+          else . + $next[($hit.overlap | length):]
+          end
+      end) ;
+# Input: an array of strings
+# Output: a shortest common superstring found by trying every ordering of the
+# deduplicated strings (null if the input array is empty).
+#
+# For each ordering, a chain in which every adjacent pair overlaps by at least
+# one character (relevant(1)) is tried first.  If no such chain exists for that
+# ordering, zero-length overlaps are allowed (relevant(0)), i.e. strings that
+# share nothing are simply placed side by side.  Without this fallback, inputs
+# such as ["AB","BC","XY"] would yield the plain concatenation "ABBCXY" rather
+# than "ABCXY", because no ordering overlaps at every joint.
+#
+# Ties on length are resolved in favour of a fully-overlapping chain (the first
+# one encountered), so the result is unchanged whenever the fully-overlapping
+# search alone already produced a minimum-length answer.
+# Note: relevant/1 may be evaluated twice per permutation.
+def shortest_common_superstring:
+  deduplicate as $ss
+  | reduce ($ss | permutations) as $perm
+      ({shortestsuper: ($ss | add), chained: false};
+      ($perm | relevant(1)) as $strict
+      | (if $strict
+         then {s: $strict, chained: true}
+         else {s: ($perm | relevant(0)), chained: false}
+         end) as $cand
+      | if ($cand.s | not) then .              # empty permutation: nothing to merge
+        else ($cand.s | length) as $n
+          | (.shortestsuper | length) as $best
+          | if $n < $best or ($n == $best and $cand.chained and (.chained | not))
+            then {shortestsuper: $cand.s, chained: $cand.chained}
+            else .
+            end
+        end)
+  | .shortestsuper;
+</lang>
+'''The specific tasks'''
+<lang jq>
+# Sample input 1: short fragments over the letters A, G and T;
+# "TA" appears three times.
+def task1:
+  ["TA", "AAG", "TA", "GAA", "TA"];
+# Sample input 2: "ATTAG", "GGG" and "TA" all occur inside "CATTAGGG".
+def task2:
+  ["CATTAGGG", "ATTAG", "GGG", "TA"];
+# Sample input 3: three fragments that use the letter U instead of T.
+def task3:
+  ["AAGAUGGA", "GGAGCGCAUC", "AUCGCAAUAAGGA"];
+# Sample input 4: thirteen longer fragments over the letters A, C, G and T;
+# some entries are repeated verbatim.
+def task4:
+  ["ATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTAT",
+   "GGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGT",
+   "CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA",
+   "TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC",
+   "AACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT",
+   "GCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTC",
+   "CGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATTCTGCTTATAACACTATGTTCT",
+   "TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC",
+   "CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATGCTCGTGC",
+   "GATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATT",
+   "TTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC",
+   "CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA",
+   "TCTCTTAAACTCCTGCTAAATGCTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGA"];
+# Emit, for each of the four sample inputs in turn, a "Task N:" heading
+# followed by the synopsis of its shortest common superstring.
+# Every heading after the first is preceded by a newline.
+def tasks:
+  def report: shortest_common_superstring | synopsis;
+  [task1, task2, task3, task4]
+  | range(0; length) as $k
+  | (if $k == 0 then "" else "\n" end) + "Task \($k + 1):",
+    (.[$k] | report);
+tasks</lang>
+{{out}}
+<pre>
+Task 1:
+Nucleotide counts for TAAGAA:
+A: 4
+C: 0
+G: 1
+T: 1
+U: 0
+__
+Σ: 6
+Task 2:
+Nucleotide counts for CATTAGGG:
+A: 2
+C: 1
+G: 3
+T: 2
+U: 0
+__
+Σ: 8
+Task 3:
+Nucleotide counts for AAGAUGGAGCGCAUCGCAAUAAGGA:
+A: 10
+C: 4
+G: 8
+T: 0
+U: 3
+__
+Σ: 25
+Task 4:
+Nucleotide counts for CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATGCTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA:
+A: 74
+C: 57
+G: 75
+T: 94
+U: 0
+__
+Σ: 300
 </pre>