Bioinformatics/Subsequence: Difference between revisions

m
 
(14 intermediate revisions by 7 users not shown)
Line 5:
Write a routine to find all the positions of a randomly generated subsequence   (four letters).
<br><br>
=={{header|11l}}==
{{trans|Python}}
 
<syntaxhighlight lang="11l">UInt32 seed = 34
F nonrandom_choice(lst)
:seed = (1664525 * :seed + 1013904223) [&] FFFF'FFFF
R lst[Int(:seed >> 16) % lst.len]
 
F generate_sequence(Int n)
R ((0 .< n).map(_ -> nonrandom_choice([‘A’, ‘C’, ‘G’, ‘T’]))).join(‘’)
 
F positions(dnaSeq, subSeq)
[Int] r
V start = 0
L
V? pos = dnaSeq.find(subSeq, start)
I pos == N
L.break
r.append(pos)
start = pos + 1
R r
 
F dna_findall(String needle, haystack) -> Void
V pp = positions(haystack, needle)
I pp.empty
print(‘No matches found’)
E
print(‘Found ’needle‘ at the following indices:’)
L(p) pp
print(p‘:’(p + needle.len))
 
V dna_seq = generate_sequence(200)
V sample_seq = generate_sequence(4)
 
V c = 1
L(i) dna_seq
I c % 20 != 0 {print(i, end' ‘’)} E print(i)
c++
print("\nSearch Sample: "sample_seq)
 
dna_findall(sample_seq, dna_seq)</syntaxhighlight>
 
{{out}}
<pre>
GAAGTGCTCAAACCCTTTTT
CCTTGCCGTAGGTTGTGCTG
CCGCCGCACACCCGCAACAG
CTTTTAGGCATAAGTATACG
GACCGCGGACGGGGCGTAAC
GGTGAACATTTTGCTAAATT
GGCTCTAGGGATGAGCCCTA
TAGCGCTGGGGACTACGCCC
CGGTAAAGATCGAGGCGACT
CACCGATTGCGCTAGGGACA
 
Search Sample: CGTA
Found CGTA at the following indices:
26:30
94:98
</pre>
 
=={{header|Action!}}==
<syntaxhighlight lang="action!">DEFINE SEQLEN="200"
DEFINE SUBLEN="4"
 
PROC RandomSeq(CHAR ARRAY s BYTE len)
CHAR ARRAY letters="ACGT"
BYTE i
 
FOR i=1 TO len
DO
s(i)=letters(Rand(4)+1)
OD
s(0)=len
RETURN
 
PROC PrintSeq(CHAR ARRAY s)
BYTE i
 
FOR i=1 TO s(0)
DO
IF i MOD 20=1 THEN
IF i<10 THEN Put(32) FI
IF i<100 THEN Put(32) FI
PrintB(i)
Print(": ")
FI
Put(s(i))
IF i MOD 20=0 THEN
PutE()
FI
OD
RETURN
 
BYTE FUNC StartsWith(CHAR ARRAY s,prefix BYTE start)
BYTE i
 
FOR i=1 TO prefix(0)
DO
IF s(start+i-1)#prefix(i) THEN
RETURN (0)
FI
OD
RETURN (1)
 
PROC Main()
CHAR ARRAY seq(SEQLEN+1),sub(SUBLEN+1)
BYTE i,notfirst
 
RandomSeq(seq,SEQLEN)
RandomSeq(sub,SUBLEN)
 
PrintE("Search sequence:")
PrintSeq(seq)
PutE()
PrintF("Subsequence to find: %S%E%E",sub)
 
PrintE("Found subsequence at positions:")
notfirst=0
FOR i=1 TO SEQLEN-SUBLEN
DO
IF StartsWith(seq,sub,i) THEN
IF notfirst THEN
Print(", ")
FI
notfirst=1
PrintF("%I-%I",i,i+SUBLEN-1)
FI
OD
IF notfirst=0 THEN
PrintE("Not found")
FI
RETURN</syntaxhighlight>
{{out}}
[https://gitlab.com/amarok8bit/action-rosetta-code/-/raw/master/images/Bioinformatics_subsequence.png Screenshot from Atari 8-bit computer]
<pre>
Search sequence:
1: CGACTCAGGAAGGCCACGTG
21: GTAACTTCTTAGTTACCGTA
41: AGGCTAATAGCTAGCGCTGC
61: GTGACCAGGCATAGTAACCG
81: GCACGCACGTTCACCAAGGG
101: GTCCCGATGGGAGGCACGTT
121: ACTACTCCAAGAACTGTAGT
141: AAGTTACCGAAAAGTTCTCA
161: TCCTTGGGTAGTGAGTACTT
181: TGTGCTATGAAAAATAAGGA
 
Subsequence to find: ACGC
 
Found subsequence at positions:
83-86
</pre>
=={{header|Ada}}==
<langsyntaxhighlight Adalang="ada">with Ada.Text_Io;
with Ada.Strings.Fixed;
with Ada.Numerics.Discrete_Random;
Line 71 ⟶ 223:
New_Line;
end loop;
end Sub_Sequence;</langsyntaxhighlight>
{{out}}
<pre>Search sequence:
Line 89 ⟶ 241:
Found at position: 371..374
Found at position: 380..383</pre>
 
=={{header|Arturo}}==
 
<langsyntaxhighlight lang="rebol">bases: [`A` `G` `C` `T`]
randSeq: join map 1..200 => [sample bases]
randSub: join map 1..4 => [sample bases]
Line 110 ⟶ 261:
print ["Found subsequence at position:" idx]
idx: idx + 1
]</langsyntaxhighlight>
 
{{out}}
Line 131 ⟶ 282:
Found subsequence at position: 71
Found subsequence at position: 169</pre>
 
=={{header|Factor}}==
{{works with|Factor|0.99 2021-02-05}}
<langsyntaxhighlight lang="factor">USING: accessors formatting grouping io kernel math
math.functions.integer-logs math.parser random regexp sequences ;
 
Line 160 ⟶ 310:
 
80 10 .biosub nl
600 39 .biosub nl</langsyntaxhighlight>
{{out}}
<pre> 0: ATTCAAGGAC
0: ATTCAAGGAC
10: CACTATTAAC
20: CTGCATTGTG
Line 196 ⟶ 345:
145..149
289..293
312..316</pre>
 
</pre>
=={{header|FreeBASIC}}==
{{trans|Wren}}
<syntaxhighlight lang="vb">Const base_ = "ACGT"
 
Sub findDnaSubsequence(dnaSize As Integer, chunkSize As Integer)
Dim As String dnaSeq(1 To dnaSize)
Dim As Integer i, chunk
For i = 1 To dnaSize
dnaSeq(i) = Mid(base_, Int(Rnd * 4)+1, 1)
Next
Dim As String dnaStr
For i = 1 To dnaSize
dnaStr += dnaSeq(i)
Next
Dim As String dnaSubseq(1 To 4)
For i = 1 To 4
dnaSubseq(i) = Mid(base_, Int(Rnd * 4)+1, 1)
Next
Dim As String dnaSubstr
For i = 1 To 4
dnaSubstr += dnaSubseq(i)
Next
Print "DNA sequence:"
For chunk = 1 To Len(dnaStr) Step chunkSize
Print Using "###_._.###: &"; chunk; chunk+chunkSize-1; Mid(dnaStr, chunk, chunkSize)
Next
Print !"\nSubsequence to locate: "; dnaSubstr
Dim As Integer idx = Instr(dnaStr, dnaSubstr)
Print Iif(idx <> 0, "Matches found at the following indices:", "No matches found.")
Do While idx > 0
If idx <> 0 Then Print Using "###_._.###"; idx; idx + 3
idx = Instr(idx+4, dnaStr, dnaSubstr)
Loop
End Sub
 
findDnaSubsequence(200, 20)
Print
findDnaSubsequence(600, 40)
 
Sleep</syntaxhighlight>
{{out}}
<pre>DNA sequence:
1.. 20: TTATAGTCTTGGAGGCATGT
21.. 40: TAACTTATGCGGAGCAGACA
41.. 60: CGGAGTATGCATTCCTCTTA
61.. 80: CCAAACGGTGCTGCCCGCGC
81..100: ACTCGCTGTATTCCGTATCG
101..120: TCACATTATCTAAACCACGA
121..140: TTTCCAGCGTGCGTGGGAAG
141..160: GCCATGTTTAGTCGGGGGCC
161..180: AAGGTCTTTGGCTTATGCTG
181..200: TTTTTTTTTCTTCGGTTACA
 
Subsequence to locate: ATTT
Matches found at the following indices:
120..123
 
DNA sequence:
1.. 40: GTGCGGGCCGTTAGCAGCTACGAGTGCTAGATGGAACTAG
41.. 80: TCCCCGCTCCCAAATGCAAAGCGTCCCAGACCAGTCTTGA
81..120: AGCCCGTTAAATTACACCTGAACCGTTGCAAATGATCGAT
121..160: AGACGGGGTATAATAGCGGAAAACACAGGGGAACTGCATG
161..200: CAAGCTCGAGCCGCTGAAGGATGGCTCCCCCCCGAGTGTA
201..240: AGTGGATCTCGCCCAAATAGCGGGGGAACAAAGAAAGGTA
241..280: AGTCTTACTTCGCACGTCCCCTCTCATACACGCCAGGACT
281..320: AATGGATCATTCATAGGTGACGGGTGACTTGCGGTGTTTC
321..360: TAGTTGGAGTCACCCGTCAGCTTAGATCTAAGTATGAACC
361..400: GTAAGAGTTTGTAACTGCACCTTCCGTCTCTTCCTCTGTA
401..440: GGAACGCTTTTGCTTGTTATCAGATAGTGTCTCCTTATCA
441..480: TAGGACAGGTTCCTTGTGAAGGTCCACAGAGTTTGCCCGG
481..520: GGTTCGAATATACGACGCTTGTGGTTCCGGCACTATAACT
521..560: TCCGCAGTGTTGTCGACGCCCCTAGCTCCCGGGGTCTTTT
561..600: CGCTTCCCTATAGCGCGAAATGAGTGCAAGGGTACCGGCC
 
Subsequence to locate: GCAC
Matches found at the following indices:
252..255
377..380
510..513</pre>
 
=={{header|Go}}==
{{trans|Wren}}
<langsyntaxhighlight lang="go">package main
 
import (
Line 246 ⟶ 475:
fmt.Println()
findDnaSubsequence(600, 40)
}</langsyntaxhighlight>
 
{{out}}
Line 290 ⟶ 519:
388..391
</pre>
=={{header|jq}}==
{{works with|jq}}
'''Works with gojq, the Go implementation of jq'''
 
Neither jq nor gojq currently has any PRNG built-ins so one
possibility is to use a jq-coded PRNG function such
as can be found at https://rosettacode.org/wiki/Random_numbers#jq
 
In practice, it's usually more convenient to use a utility such as
gshuf or jot to provide the source of randomness. Here we use
`jot -r N MIN MAX` but a fourth argument can also be
used to specify a seed. An alternative would be to use `gshuf` along the lines of:
<syntaxhighlight lang="sh">
# For 200 pseudo-random integers in the range 0 to 3 inclusive:
gshuf -i 0-3 -r -n 200 --random-source=/dev/random
</syntaxhighlight>
 
Note that the indices shown below are offsets (i.e., the index origin is taken to be 0).
<syntaxhighlight lang="sh">
#!/bin/bash
 
jot -r 200 0 3 | jq -nr --slurpfile four <(jot -r 4 0 3) '
 
# input: an array of integers
def toDNA:
def base: . as $in | "ACGT" | .[$in : $in+1];
map(base) | join("");
 
([inputs] | toDNA) as $strand
| ($four | toDNA) as $four
| "Strand of length \($strand|length):",
$strand,
"Zero-based indices of \($four):",
($strand | indices($four) | join(" "))
'</syntaxhighlight>
{{out}}
<pre>
./bioinformatics-subsequence.sh
Strand of length 200:
TGGGCCCAAGCATTGCCACGTAGCTTTGTCAGTGGGCTTGTAAGGGACGAACACAAACTCACAGACCAGGAATTCTCGAGTTCCAGTCCCCCCACTTGTCGCTATTTAGTTAAGACGTTCAGTTTCGTTGCGAACTGTGTCCCCCAGGCTAACGTGATGGGTGTCAGGAATCAATGGCCAACTTTCAGTTAGACTTGACC
Zero-based indices of CAAC:
178
 
./bioinformatics-subsequence.sh
Strand of length 200:
TAAGACTGCAGGGTACGAAGAGTGGAAGATTGGCTCGTACTTGTCGACGTCGCGTGACATAATCTCTGTGCTCGCCTCGCAGTAAGGGACTAGGTCCCGTTCGAGCGCCCTGCTAGAAGGAGCATCCTACCATGCTCTGATGACATCCTGTCGGCATTAGAGTTTCTACGACATCTAAAGAGTACGATCGACTTCCCAGT
Zero-based indices of GACA:
55 141 169
</pre>
=={{header|Julia}}==
<langsyntaxhighlight lang="julia">DNArand(n, bases=['A', 'T', 'C', 'G']) = String(rand(bases, n))
 
DNAsearch(needle, haystack, lap=true) = findall(needle, haystack, overlap=lap)
Line 301 ⟶ 578:
println("Search sequence:\n$rand_string\nfor substring $subseq. Found at positions: ")
foreach(p -> print(rpad(p[2], 8), p[1] % 10 == 0 ? "\n" : ""), enumerate(DNAsearch(subseq, rand_string)))
</langsyntaxhighlight>{{out}}
<pre>
Search sequence:
Line 308 ⟶ 585:
21:24 74:77 99:102
</pre>
 
=={{header|Nim}}==
<langsyntaxhighlight Nimlang="nim">import random, sequtils, strutils
 
proc dnaSequence(n: Positive): string =
Line 349 ⟶ 625:
else:
let tail = if pos.len == 1: ": " else: "s: "
echo "Subsequence found at position", tail, pos.join(", ")</langsyntaxhighlight>
 
{{out}}
Line 367 ⟶ 643:
 
Subsequence found at positions: 61, 122, 170</pre>
 
=={{header|Perl}}==
<langsyntaxhighlight lang="perl">use strict;
use warnings;
use feature 'say';
Line 381 ⟶ 656:
say "Target: $target";
say 'Matches at these positions:';
say (($string =~ s/.{1,40}\K/\n/gr) =~ s/($target)/ >$1< /gr);</langsyntaxhighlight>
{{out}}
<pre>Target: CCTG
Line 392 ⟶ 667:
TGCGAG >CCTG< TAGAGCCGGGCCTCAAATTAAACGAAAAAT
ATAAGTTTGCTTGGCACGCTGTACTACTTATCC >CCTG< ACT</pre>
 
=={{header|Phix}}==
Currently only searches for non-overlapped sequences, but it should be pretty obvious how to change that, in which case the next underline will simply partially overwrite the previous, so you'll get eg "<=<==>".
<!--<langsyntaxhighlight Phixlang="phix">(phixonline)-->
<span style="color: #008080;">with</span> <span style="color: #008080;">javascript_semantics</span>
Line 450 ⟶ 724:
<span style="color: #004080;">sequence</span> <span style="color: #000000;">idx</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">match_all</span><span style="color: #0000FF;">(</span><span style="color: #000000;">test</span><span style="color: #0000FF;">,</span><span style="color: #000000;">dna</span><span style="color: #0000FF;">)</span>
<span style="color: #000000;">show</span><span style="color: #0000FF;">(</span><span style="color: #000000;">dna</span><span style="color: #0000FF;">,</span><span style="color: #000000;">test</span><span style="color: #0000FF;">,</span><span style="color: #000000;">idx</span><span style="color: #0000FF;">)</span>
<!--</langsyntaxhighlight>-->
{{out}}
with cheat enabled
Line 471 ⟶ 745:
GCTA does not occur
</pre>
 
=={{header|Python}}==
 
Line 477 ⟶ 750:
{{libheader|regex}}
 
<langsyntaxhighlight lang="python">
from random import choice
import regex as re
import time
 
def generate_sequence(n: int ) -> liststr:
return "".join([ choice(['A','C','G','T']) for _ in range(n) ])
 
Line 504 ⟶ 777:
 
dna_findall(sample_seq, dna_seq)
</syntaxhighlight>
</lang>
{{out}}
 
Line 525 ⟶ 798:
167:171
</pre>
=={{header|Racket}}==
 
<syntaxhighlight lang="racket">#lang racket
 
(define (rand-seq n)
(build-string n (lambda _ (string-ref "TGAC" (random 4)))))
 
(define (subsequence-indices full part)
(let ((part-length (string-length part)) (full-length (string-length full)))
(for/list ((i (- full-length part-length))
#:when (for/and ((p part) (f (in-string full i))) (eq? p f)))
(cons i (+ i part-length -1)))))
 
(define (report-sequence s (l 50))
(string-join (for/list ((i (in-range 0 (string-length s) l)))
(format "~a: ~a" (~a #:width 4 i)
(substring s i (min (string-length s) (+ i l)))))
"\n"))
(define (Bioinformatics/Subsequence (full (rand-seq 400)) (sub (rand-seq 4)))
(printf "Indices of ~a in~%~a~%~a~%"
sub (report-sequence full) (subsequence-indices full sub)))
 
(module+ main (for ((i 4)) (Bioinformatics/Subsequence)))</syntaxhighlight>
 
{{out}}
<pre>Indices of TTAC in
0 : TTATCCTACCGCGTAAGTTCAATGCTCACCGCAGTTTGCTAACCGTTCCT
50 : AAATTCACTTCCTAAGGTATCTTTCGCTTAATTGATGCCGATTGAATTCC
100 : ACGGAGGGCGTAATTGTTTCGGACTTTAGACCTGACATAAGGGCACACTA
150 : GTCCTATTGAATTTGGTGCTATTCGGCGACCTACTAACCTTAGTCAGTGA
200 : AGAGCCATCTCAAAAGTACAGTCATCCTCAAGTGTTACATACGGCACCAT
250 : GACAGTGTATAAGCATGGAGGTTGGCCTATCGTCATATCGAGGCGGCGCC
300 : ATAGACCGGCCAGGTGATGAGATCGACTTTAATGTTGTTGCTTAGCTTGA
350 : CCTCTAGTTTGGATTAAGACGGTCATAGATAGATAGACCGTAAAGTATTC
((234 . 237))
Indices of GTAA in
0 : GTCAGTCCACGCAAGAATAGCAGTTGAGTGGACAATTTATGAGACGGAGA
50 : TAAGTAACCCGCTCCGAGATAAACGTCAGCCGGATTCCGCTGAGTCGGTC
100 : GCCTTCCAAGTGGCAGCTTGTTTGCATTGCTTACAGTGACTTGAACGATC
150 : ACCTACTCGAGGACTCTGCGGGTATTCCAGTTGCCTTGCACTCAGCGATG
200 : CACAAACTTTAAATTATCACAGAAAGAATGTGATTCGGGTGGTCACCCTT
250 : ATCGGTGAAACCAGTCCTTCCATGGGCATATTCTGCGTCGAAATGAGCCC
300 : GCTGTTTACGTTGTACGAACTGGGGACCTAAGGAAACGGGCCGTTCTTAG
350 : GTGATGTCAGCTGCAACGAACTACTGTTAACCTTCTCGATCTGTTGAAAA
((53 . 56))
Indices of AACG in
0 : TTTACAGTACGATTCCGAAGACACAAGAATGCGCCGGCTGTGGGTAGGGG
50 : CGACCCTGCGCGACCTATAAAAGGGGCGACTCAATTTTAGGCCCACCACG
100 : GACCCAGCCCTGTGCACAGAGCGGGGCATTTTTACCTCGCGTGCGCACCA
150 : ACTGCGATCTGCCTTGTCACATAATCCCACATACGAGTTGTATCTCTAAG
200 : AAGGGATGAGGCCAATTTAAATCCGGGTGCATTTCTCGGGGGGAGACACC
250 : AATGAGAGTGGGGCAAGGTGGCGTAGAGAGCTAATCGGGTTTTATGACCG
300 : CGGAAGACCTGGGATACGTCTGGGTGATAACTGAGGGCAGGTCAACGAAC
350 : CCTGATGCGTAGCCACGTCTCAGCTATCGGGCCTGTTTTCATAGTCCATG
((343 . 346))
Indices of CAGC in
0 : TGTGAACCACTATGACACGCTACACGCCTCAAGTTGGCCCCCATATAAGA
50 : ATATCCATCGGTTAATGTGTCTCGCGGCCGTTAGAACAAGCACTAAAGTT
100 : AGAGAAACCAACCATTGGACTAGATCAACATCAACGTCGCTGATAATAAA
150 : TGTATATCTGATGTGGCCGTTCATAAAATCGTTAACTACAGGTATCAACA
200 : TAGTCTCCCAACTTATATAATTGGTTAACTTAGGAGGAGCTTGCACAGCT
250 : CAGCTATATGCTATCTGGCCCTGGGCTTGGTAGGCATCACGTCGTTATGC
300 : TGCGAACATCTCAAAGACAAACGTTGATCCAGCCCCTAGAGAGGTCATTA
350 : GGCCTCGACCCAATTTAACCTCCCACTCCGTGGGTACAGCTTGAACCCCC
((245 . 248) (250 . 253) (329 . 332) (386 . 389))</pre>
=={{header|Raku}}==
Chances are actually pretty small that a random 4 codon string will show up at all in a random 200 codon sequence. Bump up the sequence size to get a reasonable chance of multiple matches.
<syntaxhighlight lang="raku" perl6line>use String::Splice:ver<0.0.3+>;
 
my $line = 80;
Line 549 ⟶ 887:
}
 
say $disp;</langsyntaxhighlight>
{{out}}
Show in custom div to better display highlighting.
Line 578 ⟶ 916:
:* &nbsp; DNA proteins to be searched in the data &nbsp; &nbsp; &nbsp; &nbsp; (the default is four unique random proteins).
:* &nbsp; the seed for the RANDOM function so runs can be repeated with the same data &nbsp; &nbsp; (no default).
<langsyntaxhighlight lang="rexx">/*REXX pgm gens random DNA (ACGT) sequence & finds positions of a random 4─protein seq. */
parse arg totLen rndLen basePr oWidth Bevery rndDNA seed .
if totLen=='' | totLen=="," then totLen= 200 /*Not specified? Then use the default.*/
Line 624 ⟶ 962:
if $\=='' then say right(idx, 7)"│" strip($, 'T') /*show residual protein data*/
say "───────┴"center('' , oWidth+10, '─')
say; return</langsyntaxhighlight>
{{out|output|text=&nbsp; when using the default inputs:}}
<pre>
Line 660 ⟶ 998:
the random DNA proteins were found in positions: 5 6 16 69 157 158 159 340 796 797 962 963
</pre>
 
=={{header|Ring}}==
<langsyntaxhighlight lang="ring">
/*-----------------------------------
# Project : DNA subsequences
Line 910 ⟶ 1,247:
 
//-----------------------------------------
</syntaxhighlight>
</lang>
 
'''Output:'''
 
[https://i.imgur.com/5hhbRBK.mp4 Bioinformatics/Subsequence - video]
 
=={{header|Wren}}==
{{libheader|Wren-pattern}}
{{libheader|Wren-str}}
{{libheader|Wren-fmt}}
<langsyntaxhighlight ecmascriptlang="wren">import "random" for Random
import "./pattern" for Pattern
import "./str" for Str
import "./fmt" for Fmt
 
var rand = Random.new()
Line 956 ⟶ 1,292:
findDnaSubsequence.call(200, 20)
System.print()
findDnaSubsequence.call(600, 40)</langsyntaxhighlight>
 
{{out}}
1,480

edits