Bioinformatics/Subsequence: Difference between revisions

m
 
(8 intermediate revisions by 5 users not shown)
Line 5:
Write a routine to find all the positions of a randomly generated subsequence   (four letters).
<br><br>
=={{header|11l}}==
{{trans|Python}}
 
<syntaxhighlight lang="11l">UInt32 seed = 34
F nonrandom_choice(lst)
:seed = (1664525 * :seed + 1013904223) [&] FFFF'FFFF
R lst[Int(:seed >> 16) % lst.len]
 
F generate_sequence(Int n)
R ((0 .< n).map(_ -> nonrandom_choice([‘A’, ‘C’, ‘G’, ‘T’]))).join(‘’)
 
F positions(dnaSeq, subSeq)
[Int] r
V start = 0
L
V? pos = dnaSeq.find(subSeq, start)
I pos == N
L.break
r.append(pos)
start = pos + 1
R r
 
F dna_findall(String needle, haystack) -> Void
V pp = positions(haystack, needle)
I pp.empty
print(‘No matches found’)
E
print(‘Found ’needle‘ at the following indices:’)
L(p) pp
print(p‘:’(p + needle.len))
 
V dna_seq = generate_sequence(200)
V sample_seq = generate_sequence(4)
 
V c = 1
L(i) dna_seq
I c % 20 != 0 {print(i, end' ‘’)} E print(i)
c++
print("\nSearch Sample: "sample_seq)
 
dna_findall(sample_seq, dna_seq)</syntaxhighlight>
 
{{out}}
<pre>
GAAGTGCTCAAACCCTTTTT
CCTTGCCGTAGGTTGTGCTG
CCGCCGCACACCCGCAACAG
CTTTTAGGCATAAGTATACG
GACCGCGGACGGGGCGTAAC
GGTGAACATTTTGCTAAATT
GGCTCTAGGGATGAGCCCTA
TAGCGCTGGGGACTACGCCC
CGGTAAAGATCGAGGCGACT
CACCGATTGCGCTAGGGACA
 
Search Sample: CGTA
Found CGTA at the following indices:
26:30
94:98
</pre>
 
=={{header|Action!}}==
<syntaxhighlight lang="action!">DEFINE SEQLEN="200"
DEFINE SUBLEN="4"
 
PROC RandomSeq(CHAR ARRAY s BYTE len)
CHAR ARRAY letters="ACGT"
BYTE i
 
FOR i=1 TO len
DO
s(i)=letters(Rand(4)+1)
OD
s(0)=len
RETURN
 
PROC PrintSeq(CHAR ARRAY s)
BYTE i
 
FOR i=1 TO s(0)
DO
IF i MOD 20=1 THEN
IF i<10 THEN Put(32) FI
IF i<100 THEN Put(32) FI
PrintB(i)
Print(": ")
FI
Put(s(i))
IF i MOD 20=0 THEN
PutE()
FI
OD
RETURN
 
BYTE FUNC StartsWith(CHAR ARRAY s,prefix BYTE start)
BYTE i
 
FOR i=1 TO prefix(0)
DO
IF s(start+i-1)#prefix(i) THEN
RETURN (0)
FI
OD
RETURN (1)
 
PROC Main()
CHAR ARRAY seq(SEQLEN+1),sub(SUBLEN+1)
BYTE i,notfirst
 
RandomSeq(seq,SEQLEN)
RandomSeq(sub,SUBLEN)
 
PrintE("Search sequence:")
PrintSeq(seq)
PutE()
PrintF("Subsequence to find: %S%E%E",sub)
 
PrintE("Found subsequence at positions:")
notfirst=0
FOR i=1 TO SEQLEN-SUBLEN
DO
IF StartsWith(seq,sub,i) THEN
IF notfirst THEN
Print(", ")
FI
notfirst=1
PrintF("%I-%I",i,i+SUBLEN-1)
FI
OD
IF notfirst=0 THEN
PrintE("Not found")
FI
RETURN</syntaxhighlight>
{{out}}
[https://gitlab.com/amarok8bit/action-rosetta-code/-/raw/master/images/Bioinformatics_subsequence.png Screenshot from Atari 8-bit computer]
<pre>
Search sequence:
1: CGACTCAGGAAGGCCACGTG
21: GTAACTTCTTAGTTACCGTA
41: AGGCTAATAGCTAGCGCTGC
61: GTGACCAGGCATAGTAACCG
81: GCACGCACGTTCACCAAGGG
101: GTCCCGATGGGAGGCACGTT
121: ACTACTCCAAGAACTGTAGT
141: AAGTTACCGAAAAGTTCTCA
161: TCCTTGGGTAGTGAGTACTT
181: TGTGCTATGAAAAATAAGGA
 
Subsequence to find: ACGC
 
Found subsequence at positions:
83-86
</pre>
=={{header|Ada}}==
<langsyntaxhighlight Adalang="ada">with Ada.Text_Io;
with Ada.Strings.Fixed;
with Ada.Numerics.Discrete_Random;
Line 71 ⟶ 223:
New_Line;
end loop;
end Sub_Sequence;</langsyntaxhighlight>
{{out}}
<pre>Search sequence:
Line 89 ⟶ 241:
Found at position: 371..374
Found at position: 380..383</pre>
 
=={{header|Arturo}}==
 
<langsyntaxhighlight lang="rebol">bases: [`A` `G` `C` `T`]
randSeq: join map 1..200 => [sample bases]
randSub: join map 1..4 => [sample bases]
Line 110 ⟶ 261:
print ["Found subsequence at position:" idx]
idx: idx + 1
]</langsyntaxhighlight>
 
{{out}}
Line 131 ⟶ 282:
Found subsequence at position: 71
Found subsequence at position: 169</pre>
 
=={{header|Factor}}==
{{works with|Factor|0.99 2021-02-05}}
<langsyntaxhighlight lang="factor">USING: accessors formatting grouping io kernel math
math.functions.integer-logs math.parser random regexp sequences ;
 
Line 160 ⟶ 310:
 
80 10 .biosub nl
600 39 .biosub nl</langsyntaxhighlight>
{{out}}
<pre> 0: ATTCAAGGAC
0: ATTCAAGGAC
10: CACTATTAAC
20: CTGCATTGTG
Line 196 ⟶ 345:
145..149
289..293
312..316</pre>
 
</pre>
=={{header|FreeBASIC}}==
{{trans|Wren}}
<syntaxhighlight lang="vb">Const base_ = "ACGT"
 
Sub findDnaSubsequence(dnaSize As Integer, chunkSize As Integer)
Dim As String dnaSeq(1 To dnaSize)
Dim As Integer i, chunk
For i = 1 To dnaSize
dnaSeq(i) = Mid(base_, Int(Rnd * 4)+1, 1)
Next
Dim As String dnaStr
For i = 1 To dnaSize
dnaStr += dnaSeq(i)
Next
Dim As String dnaSubseq(1 To 4)
For i = 1 To 4
dnaSubseq(i) = Mid(base_, Int(Rnd * 4)+1, 1)
Next
Dim As String dnaSubstr
For i = 1 To 4
dnaSubstr += dnaSubseq(i)
Next
Print "DNA sequence:"
For chunk = 1 To Len(dnaStr) Step chunkSize
Print Using "###_._.###: &"; chunk; chunk+chunkSize-1; Mid(dnaStr, chunk, chunkSize)
Next
Print !"\nSubsequence to locate: "; dnaSubstr
Dim As Integer idx = Instr(dnaStr, dnaSubstr)
Print Iif(idx <> 0, "Matches found at the following indices:", "No matches found.")
Do While idx > 0
If idx <> 0 Then Print Using "###_._.###"; idx; idx + 3
idx = Instr(idx+4, dnaStr, dnaSubstr)
Loop
End Sub
 
findDnaSubsequence(200, 20)
Print
findDnaSubsequence(600, 40)
 
Sleep</syntaxhighlight>
{{out}}
<pre>DNA sequence:
1.. 20: TTATAGTCTTGGAGGCATGT
21.. 40: TAACTTATGCGGAGCAGACA
41.. 60: CGGAGTATGCATTCCTCTTA
61.. 80: CCAAACGGTGCTGCCCGCGC
81..100: ACTCGCTGTATTCCGTATCG
101..120: TCACATTATCTAAACCACGA
121..140: TTTCCAGCGTGCGTGGGAAG
141..160: GCCATGTTTAGTCGGGGGCC
161..180: AAGGTCTTTGGCTTATGCTG
181..200: TTTTTTTTTCTTCGGTTACA
 
Subsequence to locate: ATTT
Matches found at the following indices:
120..123
 
DNA sequence:
1.. 40: GTGCGGGCCGTTAGCAGCTACGAGTGCTAGATGGAACTAG
41.. 80: TCCCCGCTCCCAAATGCAAAGCGTCCCAGACCAGTCTTGA
81..120: AGCCCGTTAAATTACACCTGAACCGTTGCAAATGATCGAT
121..160: AGACGGGGTATAATAGCGGAAAACACAGGGGAACTGCATG
161..200: CAAGCTCGAGCCGCTGAAGGATGGCTCCCCCCCGAGTGTA
201..240: AGTGGATCTCGCCCAAATAGCGGGGGAACAAAGAAAGGTA
241..280: AGTCTTACTTCGCACGTCCCCTCTCATACACGCCAGGACT
281..320: AATGGATCATTCATAGGTGACGGGTGACTTGCGGTGTTTC
321..360: TAGTTGGAGTCACCCGTCAGCTTAGATCTAAGTATGAACC
361..400: GTAAGAGTTTGTAACTGCACCTTCCGTCTCTTCCTCTGTA
401..440: GGAACGCTTTTGCTTGTTATCAGATAGTGTCTCCTTATCA
441..480: TAGGACAGGTTCCTTGTGAAGGTCCACAGAGTTTGCCCGG
481..520: GGTTCGAATATACGACGCTTGTGGTTCCGGCACTATAACT
521..560: TCCGCAGTGTTGTCGACGCCCCTAGCTCCCGGGGTCTTTT
561..600: CGCTTCCCTATAGCGCGAAATGAGTGCAAGGGTACCGGCC
 
Subsequence to locate: GCAC
Matches found at the following indices:
252..255
377..380
510..513</pre>
 
=={{header|Go}}==
{{trans|Wren}}
<langsyntaxhighlight lang="go">package main
 
import (
Line 246 ⟶ 475:
fmt.Println()
findDnaSubsequence(600, 40)
}</langsyntaxhighlight>
 
{{out}}
Line 290 ⟶ 519:
388..391
</pre>
 
=={{header|jq}}==
{{works with|jq}}
Line 303 ⟶ 531:
`jot -r N MIN MAX` but a fourth argument can also be
used to specify a seed. An alternative would be to use `gshuf` along the lines of:
<syntaxhighlight lang="sh">
<lang sh>
# For 200 pseudo-random integers in the range 0 to 3 inclusive:
gshuf -i 0-3 -r -n 200 --random-source=/dev/random
</syntaxhighlight>
</lang>
 
Note that the indices shown below are offsets (i.e., the index origin is taken to be 0).
<syntaxhighlight lang="sh">
<lang sh>
#!/bin/bash
 
Line 325 ⟶ 553:
"Zero-based indices of \($four):",
($strand | indices($four) | join(" "))
'</langsyntaxhighlight>
{{out}}
<pre>
Line 340 ⟶ 568:
55 141 169
</pre>
 
=={{header|Julia}}==
<langsyntaxhighlight lang="julia">DNArand(n, bases=['A', 'T', 'C', 'G']) = String(rand(bases, n))
 
DNAsearch(needle, haystack, lap=true) = findall(needle, haystack, overlap=lap)
Line 351 ⟶ 578:
println("Search sequence:\n$rand_string\nfor substring $subseq. Found at positions: ")
foreach(p -> print(rpad(p[2], 8), p[1] % 10 == 0 ? "\n" : ""), enumerate(DNAsearch(subseq, rand_string)))
</langsyntaxhighlight>{{out}}
<pre>
Search sequence:
Line 358 ⟶ 585:
21:24 74:77 99:102
</pre>
 
=={{header|Nim}}==
<langsyntaxhighlight Nimlang="nim">import random, sequtils, strutils
 
proc dnaSequence(n: Positive): string =
Line 399 ⟶ 625:
else:
let tail = if pos.len == 1: ": " else: "s: "
echo "Subsequence found at position", tail, pos.join(", ")</langsyntaxhighlight>
 
{{out}}
Line 417 ⟶ 643:
 
Subsequence found at positions: 61, 122, 170</pre>
 
=={{header|Perl}}==
<langsyntaxhighlight lang="perl">use strict;
use warnings;
use feature 'say';
Line 431 ⟶ 656:
say "Target: $target";
say 'Matches at these positions:';
say (($string =~ s/.{1,40}\K/\n/gr) =~ s/($target)/ >$1< /gr);</langsyntaxhighlight>
{{out}}
<pre>Target: CCTG
Line 442 ⟶ 667:
TGCGAG >CCTG< TAGAGCCGGGCCTCAAATTAAACGAAAAAT
ATAAGTTTGCTTGGCACGCTGTACTACTTATCC >CCTG< ACT</pre>
 
=={{header|Phix}}==
Currently only searches for non-overlapped sequences, but it should be pretty obvious how to change that, in which case the next underline will simply partially overwrite the previous, so you'll get eg "<=<==>".
<!--<langsyntaxhighlight Phixlang="phix">(phixonline)-->
<span style="color: #008080;">with</span> <span style="color: #008080;">javascript_semantics</span>
Line 500 ⟶ 724:
<span style="color: #004080;">sequence</span> <span style="color: #000000;">idx</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">match_all</span><span style="color: #0000FF;">(</span><span style="color: #000000;">test</span><span style="color: #0000FF;">,</span><span style="color: #000000;">dna</span><span style="color: #0000FF;">)</span>
<span style="color: #000000;">show</span><span style="color: #0000FF;">(</span><span style="color: #000000;">dna</span><span style="color: #0000FF;">,</span><span style="color: #000000;">test</span><span style="color: #0000FF;">,</span><span style="color: #000000;">idx</span><span style="color: #0000FF;">)</span>
<!--</langsyntaxhighlight>-->
{{out}}
with cheat enabled
Line 521 ⟶ 745:
GCTA does not occur
</pre>
 
=={{header|Python}}==
 
Line 527 ⟶ 750:
{{libheader|regex}}
 
<langsyntaxhighlight lang="python">
from random import choice
import regex as re
Line 554 ⟶ 777:
 
dna_findall(sample_seq, dna_seq)
</syntaxhighlight>
</lang>
{{out}}
 
Line 575 ⟶ 798:
167:171
</pre>
 
=={{header|Racket}}==
 
<langsyntaxhighlight lang="racket">#lang racket
 
(define (rand-seq n)
Line 599 ⟶ 821:
sub (report-sequence full) (subsequence-indices full sub)))
 
(module+ main (for ((i 4)) (Bioinformatics/Subsequence)))</langsyntaxhighlight>
 
{{out}}
Line 642 ⟶ 864:
350 : GGCCTCGACCCAATTTAACCTCCCACTCCGTGGGTACAGCTTGAACCCCC
((245 . 248) (250 . 253) (329 . 332) (386 . 389))</pre>
 
=={{header|Raku}}==
Chances are actually pretty small that a random 4 codon string will show up at all in a random 200 codon sequence. Bump up the sequence size to get a reasonable chance of multiple matches.
<syntaxhighlight lang="raku" perl6line>use String::Splice:ver<0.0.3+>;
 
my $line = 80;
Line 666 ⟶ 887:
}
 
say $disp;</langsyntaxhighlight>
{{out}}
Show in custom div to better display highlighting.
Line 695 ⟶ 916:
:* &nbsp; DNA proteins to be searched in the data &nbsp; &nbsp; &nbsp; &nbsp; (the default is four unique random proteins).
:* &nbsp; the seed for the RANDOM function so runs can be repeated with the same data &nbsp; &nbsp; (no default).
<langsyntaxhighlight lang="rexx">/*REXX pgm gens random DNA (ACGT) sequence & finds positions of a random 4─protein seq. */
parse arg totLen rndLen basePr oWidth Bevery rndDNA seed .
if totLen=='' | totLen=="," then totLen= 200 /*Not specified? Then use the default.*/
Line 741 ⟶ 962:
if $\=='' then say right(idx, 7)"│" strip($, 'T') /*show residual protein data*/
say "───────┴"center('' , oWidth+10, '─')
say; return</langsyntaxhighlight>
{{out|output|text=&nbsp; when using the default inputs:}}
<pre>
Line 777 ⟶ 998:
the random DNA proteins were found in positions: 5 6 16 69 157 158 159 340 796 797 962 963
</pre>
 
=={{header|Ring}}==
<langsyntaxhighlight lang="ring">
/*-----------------------------------
# Project : DNA subsequences
Line 1,027 ⟶ 1,247:
 
//-----------------------------------------
</syntaxhighlight>
</lang>
 
'''Output:'''
 
[https://i.imgur.com/5hhbRBK.mp4 Bioinformatics/Subsequence - video]
 
=={{header|Wren}}==
{{libheader|Wren-pattern}}
{{libheader|Wren-str}}
{{libheader|Wren-fmt}}
<langsyntaxhighlight ecmascriptlang="wren">import "random" for Random
import "./pattern" for Pattern
import "./str" for Str
import "./fmt" for Fmt
 
var rand = Random.new()
Line 1,073 ⟶ 1,292:
findDnaSubsequence.call(200, 20)
System.print()
findDnaSubsequence.call(600, 40)</langsyntaxhighlight>
 
{{out}}
1,480

edits