Bioinformatics/Subsequence: Difference between revisions

From Rosetta Code
Content added Content deleted
m (→‎{{header|Phix}}: added syntax colouring the hard way)
Line 94: Line 94:
Note: match_all() is due to become a builtin in the next release, so the version below may or may not need renaming/deleting before it will run.<br>
Note: match_all() is due to become a builtin in the next release, so the version below may or may not need renaming/deleting before it will run.<br>
Currently only searches for non-overlapped sequences, but it should be pretty obvious how to change that, in which case the next underline will simply partially overwrite the previous, so you'll get eg "<=<==>".
Currently only searches for non-overlapped sequences, but it should be pretty obvious how to change that, in which case the next underline will simply partially overwrite the previous, so you'll get eg "<=<==>".
<lang Phix>constant cheat = false
<!--lang Phix-->
<span style="color: #008080;">constant</span> <span style="color: #000000;">cheat</span> <span style="color: #0000FF;">=</span> <span style="color: #004600;">false</span>

function grandna(integer len)
string dna = repeat(' ',len)
for i=1 to len do dna[i] = "ACGT"[rand(4)] end for
return dna
end function

procedure show(string dna, sequence idx)
idx &= length(dna)+100 -- (add an otherwise unused sentinel)
sequence s = split(trim(join_by(split(join_by(dna,1,10,""),"\n"),1,5," ")),"\n")
integer ii = 1, -- idx index
i = idx[ii], -- current target
ux = 1, -- underline index (1..4)
ldx = 1 -- line index (1, 51, 101, etc)
for si=1 to length(s) do
printf(1,"%3d: %s\n",{ldx,s[si]})
ldx += 50
if i and i<ldx then
string ul = repeat(' ',59)
while i and i<ldx do
integer up = i-ldx+51 -- underline pos (relative to ldx)
up += floor((up-1)/10)+5 -- (plus any needed spacing)
ul[up] = "<==>"[ux]
ux += 1
i += 1
if ux>4 then
ux = 1
ii += 1
i = idx[ii]
end if
end while
printf(1,"%s\n",ul)
end if
end for
if length(idx) then
string s = iff(length(idx)>1?"s":""),
t = join(apply(idx,sprint),", ")
printf(1,"%s occurs at location%s: %s\n",{test,s,t})
else
printf(1,"%s does not occur\n",{test})
end if
end procedure

function match_all(object needle, sequence haystack, bool bOverlap = false)
if atom(needle) then return find_all(needle,haystack) end if
integer start = 1
sequence res = {}
while 1 do
start = match(needle,haystack,start)
if start=0 then exit end if
res = append(res,start)
start += iff(bOverlap?1:length(needle))
end while
return res
end function
<span style="color: #008080;">function</span> <span style="color: #000000;">grandna<span style="color: #0000FF;">(<span style="color: #004080;">integer</span> <span style="color: #000000;">len<span style="color: #0000FF;">)</span>
string dna = grandna(200),
<span style="color: #004080;">string</span> <span style="color: #000000;">dna</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">repeat<span style="color: #0000FF;">(<span style="color: #008000;">' '<span style="color: #0000FF;">,<span style="color: #000000;">len<span style="color: #0000FF;">)</span>
test = grandna(4)
<span style="color: #008080;">for</span> <span style="color: #000000;">i<span style="color: #0000FF;">=<span style="color: #000000;">1</span> <span style="color: #008080;">to</span> <span style="color: #000000;">len</span> <span style="color: #008080;">do</span> <span style="color: #000000;">dna<span style="color: #0000FF;">[<span style="color: #000000;">i<span style="color: #0000FF;">]</span> <span style="color: #0000FF;">=</span> <span style="color: #008000;">"ACGT"<span style="color: #0000FF;">[<span style="color: #7060A8;">rand<span style="color: #0000FF;">(<span style="color: #000000;">4<span style="color: #0000FF;">)<span style="color: #0000FF;">]</span> <span style="color: #008080;">end</span> <span style="color: #008080;">for</span>
constant cheats = iff(cheat?{9,13,49,60,64,68}:{})
<span style="color: #008080;">return</span> <span style="color: #000000;">dna</span>
for i=1 to length(cheats) do
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
dna[cheats[i]..cheats[i]+3] = test
end for
<span style="color: #008080;">procedure</span> <span style="color: #000000;">show<span style="color: #0000FF;">(<span style="color: #004080;">string</span> <span style="color: #000000;">dna<span style="color: #0000FF;">,</span> <span style="color: #004080;">sequence</span> <span style="color: #000000;">idx<span style="color: #0000FF;">)</span>
sequence idx = match_all(test,dna)
<span style="color: #000000;">idx</span> <span style="color: #0000FF;">&=</span> <span style="color: #7060A8;">length<span style="color: #0000FF;">(<span style="color: #000000;">dna<span style="color: #0000FF;">)<span style="color: #0000FF;">+<span style="color: #000000;">100</span> <span style="color: #000080;font-style:italic;">-- (add an otherwise unused sentinel)</span>
show(dna,idx)</lang>
<span style="color: #004080;">sequence</span> <span style="color: #000000;">s</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">split<span style="color: #0000FF;">(<span style="color: #7060A8;">trim<span style="color: #0000FF;">(<span style="color: #7060A8;">join_by<span style="color: #0000FF;">(<span style="color: #7060A8;">split<span style="color: #0000FF;">(<span style="color: #7060A8;">join_by<span style="color: #0000FF;">(<span style="color: #000000;">dna<span style="color: #0000FF;">,<span style="color: #000000;">1<span style="color: #0000FF;">,<span style="color: #000000;">10<span style="color: #0000FF;">,<span style="color: #008000;">""<span style="color: #0000FF;">)<span style="color: #0000FF;">,<span style="color: #008000;">"\n"<span style="color: #0000FF;">)<span style="color: #0000FF;">,<span style="color: #000000;">1<span style="color: #0000FF;">,<span style="color: #000000;">5<span style="color: #0000FF;">,<span style="color: #008000;">" "<span style="color: #0000FF;">)<span style="color: #0000FF;">)<span style="color: #0000FF;">,<span style="color: #008000;">"\n"<span style="color: #0000FF;">)</span>
<span style="color: #004080;">integer</span> <span style="color: #000000;">ii</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">1<span style="color: #0000FF;">,</span> <span style="color: #000080;font-style:italic;">-- idx index</span>
<span style="color: #000000;">i</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">idx<span style="color: #0000FF;">[<span style="color: #000000;">ii<span style="color: #0000FF;">]<span style="color: #0000FF;">,</span> <span style="color: #000080;font-style:italic;">-- current target</span>
<span style="color: #000000;">ux</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">1<span style="color: #0000FF;">,</span> <span style="color: #000080;font-style:italic;">-- underline index (1..4)</span>
<span style="color: #000000;">ldx</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">1</span> <span style="color: #000080;font-style:italic;">-- line index (1, 51, 101, etc)</span>
<span style="color: #008080;">for</span> <span style="color: #000000;">si<span style="color: #0000FF;">=<span style="color: #000000;">1</span> <span style="color: #008080;">to</span> <span style="color: #7060A8;">length<span style="color: #0000FF;">(<span style="color: #000000;">s<span style="color: #0000FF;">)</span> <span style="color: #008080;">do</span>
<span style="color: #7060A8;">printf<span style="color: #0000FF;">(<span style="color: #000000;">1<span style="color: #0000FF;">,<span style="color: #008000;">"%3d: %s\n"<span style="color: #0000FF;">,<span style="color: #0000FF;">{<span style="color: #000000;">ldx<span style="color: #0000FF;">,<span style="color: #000000;">s<span style="color: #0000FF;">[<span style="color: #000000;">si<span style="color: #0000FF;">]<span style="color: #0000FF;">}<span style="color: #0000FF;">)</span>
<span style="color: #000000;">ldx</span> <span style="color: #0000FF;">+=</span> <span style="color: #000000;">50</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">i</span> <span style="color: #008080;">and</span> <span style="color: #000000;">i<span style="color: #0000FF;"><<span style="color: #000000;">ldx</span> <span style="color: #008080;">then</span>
<span style="color: #004080;">string</span> <span style="color: #000000;">ul</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">repeat<span style="color: #0000FF;">(<span style="color: #008000;">' '<span style="color: #0000FF;">,<span style="color: #000000;">59<span style="color: #0000FF;">)</span>
<span style="color: #008080;">while</span> <span style="color: #000000;">i</span> <span style="color: #008080;">and</span> <span style="color: #000000;">i<span style="color: #0000FF;"><<span style="color: #000000;">ldx</span> <span style="color: #008080;">do</span>
<span style="color: #004080;">integer</span> <span style="color: #000000;">up</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">i<span style="color: #0000FF;">-<span style="color: #000000;">ldx<span style="color: #0000FF;">+<span style="color: #000000;">51</span> <span style="color: #000080;font-style:italic;">-- underline pos (relative to ldx)</span>
<span style="color: #000000;">up</span> <span style="color: #0000FF;">+=</span> <span style="color: #7060A8;">floor<span style="color: #0000FF;">(<span style="color: #0000FF;">(<span style="color: #000000;">up<span style="color: #0000FF;">-<span style="color: #000000;">1<span style="color: #0000FF;">)<span style="color: #0000FF;">/<span style="color: #000000;">10<span style="color: #0000FF;">)<span style="color: #0000FF;">+<span style="color: #000000;">5</span> <span style="color: #000080;font-style:italic;">-- (plus any needed spacing)</span>
<span style="color: #000000;">ul<span style="color: #0000FF;">[<span style="color: #000000;">up<span style="color: #0000FF;">]</span> <span style="color: #0000FF;">=</span> <span style="color: #008000;">"<==>"<span style="color: #0000FF;">[<span style="color: #000000;">ux<span style="color: #0000FF;">]</span>
<span style="color: #000000;">ux</span> <span style="color: #0000FF;">+=</span> <span style="color: #000000;">1</span>
<span style="color: #000000;">i</span> <span style="color: #0000FF;">+=</span> <span style="color: #000000;">1</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">ux<span style="color: #0000FF;">><span style="color: #000000;">4</span> <span style="color: #008080;">then</span>
<span style="color: #000000;">ux</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">1</span>
<span style="color: #000000;">ii</span> <span style="color: #0000FF;">+=</span> <span style="color: #000000;">1</span>
<span style="color: #000000;">i</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">idx<span style="color: #0000FF;">[<span style="color: #000000;">ii<span style="color: #0000FF;">]</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">while</span>
<span style="color: #7060A8;">printf<span style="color: #0000FF;">(<span style="color: #000000;">1<span style="color: #0000FF;">,<span style="color: #008000;">"%s\n"<span style="color: #0000FF;">,<span style="color: #000000;">ul<span style="color: #0000FF;">)</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">for</span>
<span style="color: #008080;">if</span> <span style="color: #7060A8;">length<span style="color: #0000FF;">(<span style="color: #000000;">idx<span style="color: #0000FF;">)</span> <span style="color: #008080;">then</span>
<span style="color: #004080;">string</span> <span style="color: #000000;">s</span> <span style="color: #0000FF;">=</span> <span style="color: #008080;">iff<span style="color: #0000FF;">(<span style="color: #7060A8;">length<span style="color: #0000FF;">(<span style="color: #000000;">idx<span style="color: #0000FF;">)<span style="color: #0000FF;">><span style="color: #000000;">1<span style="color: #0000FF;">?<span style="color: #008000;">"s"<span style="color: #0000FF;">:<span style="color: #008000;">""<span style="color: #0000FF;">)<span style="color: #0000FF;">,</span>
<span style="color: #000000;">t</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">join<span style="color: #0000FF;">(<span style="color: #7060A8;">apply<span style="color: #0000FF;">(<span style="color: #000000;">idx<span style="color: #0000FF;">,<span style="color: #7060A8;">sprint<span style="color: #0000FF;">)<span style="color: #0000FF;">,<span style="color: #008000;">", "<span style="color: #0000FF;">)</span>
<span style="color: #7060A8;">printf<span style="color: #0000FF;">(<span style="color: #000000;">1<span style="color: #0000FF;">,<span style="color: #008000;">"%s occurs at location%s: %s\n"<span style="color: #0000FF;">,<span style="color: #0000FF;">{<span style="color: #000000;">test<span style="color: #0000FF;">,<span style="color: #000000;">s<span style="color: #0000FF;">,<span style="color: #000000;">t<span style="color: #0000FF;">}<span style="color: #0000FF;">)</span>
<span style="color: #008080;">else</span>
<span style="color: #7060A8;">printf<span style="color: #0000FF;">(<span style="color: #000000;">1<span style="color: #0000FF;">,<span style="color: #008000;">"%s does not occur\n"<span style="color: #0000FF;">,<span style="color: #0000FF;">{<span style="color: #000000;">test<span style="color: #0000FF;">}<span style="color: #0000FF;">)</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">procedure</span>
<span style="color: #008080;">function</span> <span style="color: #000000;">match_all<span style="color: #0000FF;">(<span style="color: #004080;">object</span> <span style="color: #000000;">needle<span style="color: #0000FF;">,</span> <span style="color: #004080;">sequence</span> <span style="color: #000000;">haystack<span style="color: #0000FF;">,</span> <span style="color: #004080;">bool</span> <span style="color: #000000;">bOverlap</span> <span style="color: #0000FF;">=</span> <span style="color: #004600;">false<span style="color: #0000FF;">)</span>
<span style="color: #008080;">if</span> <span style="color: #004080;">atom<span style="color: #0000FF;">(<span style="color: #000000;">needle<span style="color: #0000FF;">)</span> <span style="color: #008080;">then</span> <span style="color: #008080;">return</span> <span style="color: #7060A8;">find_all<span style="color: #0000FF;">(<span style="color: #000000;">needle<span style="color: #0000FF;">,<span style="color: #000000;">haystack<span style="color: #0000FF;">)</span> <span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #004080;">integer</span> <span style="color: #000000;">start</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">1</span>
<span style="color: #004080;">sequence</span> <span style="color: #000000;">res</span> <span style="color: #0000FF;">=</span> <span style="color: #0000FF;">{<span style="color: #0000FF;">}</span>
<span style="color: #008080;">while</span> <span style="color: #000000;">1</span> <span style="color: #008080;">do</span>
<span style="color: #000000;">start</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">match<span style="color: #0000FF;">(<span style="color: #000000;">needle<span style="color: #0000FF;">,<span style="color: #000000;">haystack<span style="color: #0000FF;">,<span style="color: #000000;">start<span style="color: #0000FF;">)</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">start<span style="color: #0000FF;">=<span style="color: #000000;">0</span> <span style="color: #008080;">then</span> <span style="color: #008080;">exit</span> <span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #000000;">res</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">append<span style="color: #0000FF;">(<span style="color: #000000;">res<span style="color: #0000FF;">,<span style="color: #000000;">start<span style="color: #0000FF;">)</span>
<span style="color: #000000;">start</span> <span style="color: #0000FF;">+=</span> <span style="color: #008080;">iff<span style="color: #0000FF;">(<span style="color: #000000;">bOverlap<span style="color: #0000FF;">?<span style="color: #000000;">1<span style="color: #0000FF;">:<span style="color: #7060A8;">length<span style="color: #0000FF;">(<span style="color: #000000;">needle<span style="color: #0000FF;">)<span style="color: #0000FF;">)</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">while</span>
<span style="color: #008080;">return</span> <span style="color: #000000;">res</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
<span style="color: #004080;">string</span> <span style="color: #000000;">dna</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">grandna<span style="color: #0000FF;">(<span style="color: #000000;">200<span style="color: #0000FF;">)<span style="color: #0000FF;">,</span>
<span style="color: #000000;">test</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">grandna<span style="color: #0000FF;">(<span style="color: #000000;">4<span style="color: #0000FF;">)</span>
<span style="color: #008080;">constant</span> <span style="color: #000000;">cheats</span> <span style="color: #0000FF;">=</span> <span style="color: #008080;">iff<span style="color: #0000FF;">(<span style="color: #000000;">cheat<span style="color: #0000FF;">?<span style="color: #0000FF;">{<span style="color: #000000;">9<span style="color: #0000FF;">,<span style="color: #000000;">13<span style="color: #0000FF;">,<span style="color: #000000;">49<span style="color: #0000FF;">,<span style="color: #000000;">60<span style="color: #0000FF;">,<span style="color: #000000;">64<span style="color: #0000FF;">,<span style="color: #000000;">68<span style="color: #0000FF;">}<span style="color: #0000FF;">:<span style="color: #0000FF;">{<span style="color: #0000FF;">}<span style="color: #0000FF;">)</span>
<span style="color: #008080;">for</span> <span style="color: #000000;">i<span style="color: #0000FF;">=<span style="color: #000000;">1</span> <span style="color: #008080;">to</span> <span style="color: #7060A8;">length<span style="color: #0000FF;">(<span style="color: #000000;">cheats<span style="color: #0000FF;">)</span> <span style="color: #008080;">do</span>
<span style="color: #000000;">dna<span style="color: #0000FF;">[<span style="color: #000000;">cheats<span style="color: #0000FF;">[<span style="color: #000000;">i<span style="color: #0000FF;">]<span style="color: #0000FF;">..<span style="color: #000000;">cheats<span style="color: #0000FF;">[<span style="color: #000000;">i<span style="color: #0000FF;">]<span style="color: #0000FF;">+<span style="color: #000000;">3<span style="color: #0000FF;">]</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">test</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">for</span>
<span style="color: #004080;">sequence</span> <span style="color: #000000;">idx</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">match_all<span style="color: #0000FF;">(<span style="color: #000000;">test<span style="color: #0000FF;">,<span style="color: #000000;">dna<span style="color: #0000FF;">)</span>
<span style="color: #000000;">show<span style="color: #0000FF;">(<span style="color: #000000;">dna<span style="color: #0000FF;">,<span style="color: #000000;">idx<span style="color: #0000FF;">)
<!--</lang>-->
{{out}}
{{out}}
with cheat enabled
with cheat enabled

Revision as of 04:57, 23 March 2021

Bioinformatics/Subsequence is a draft programming task. It is not yet considered ready to be promoted as a complete task, for reasons that should be found in its talk page.
Task

Randomly generate a string of   200   DNA bases   (represented by   A,  C,  G,  and  T).

Write a routine to find all the positions of a randomly generated subsequence   (four letters).

Factor

Works with: Factor version 0.99 2021-02-05

<lang factor>USING: accessors formatting grouping io kernel math math.functions.integer-logs math.parser random regexp sequences ;

new-dna ( n -- str ) [ "ACGT" random ] "" replicate-as ;
pad ( n d -- str ) [ number>string ] dip 32 pad-head ;
.dna ( seq n -- )
   seq length integer-log10 1 + :> d seq n group
   [ n * d pad write ": " write write nl ] each-index ;
.match ( slice -- ) [ from>> ] [ to>> ] bi "%d..%d\n" printf ;
.matches ( slices -- )
   "Matches found at the following indices:" print
   [ .match ] each ;
.locate ( slices -- )
   [ "No matches found." print ] [ .matches ] if-empty ;
.biosub ( dna-size row-size -- )
   [ new-dna dup ] [ .dna nl ] bi*
   4 new-dna dup "Subsequence to locate: %s\n" printf
   <regexp> all-matching-slices .locate ;

80 10 .biosub nl 600 39 .biosub nl</lang>

Output:
 0: ATTCAAGGAC
10: CACTATTAAC
20: CTGCATTGTG
30: AGAACTTGCA
40: GTGTACCGAG
50: AGCGAGTTTA
60: AAGCAACACA
70: TCTTTACCGA

Subsequence to locate: GTAG
No matches found.

  0: GATCTCGTCATGGTCCATCCTAACATTTCGGTTGTGGGC
 39: GCATCCCGATAGGCGAAGTTAAATCTACGTAGTCCTACG
 78: TCACGACGGAACATGATTGCCCACCGAAGTCGTAGGCGA
117: GCTAAAGTCGGTACATACACGATCTGCTATATTCGTTCT
156: CCGACACACGACATGCAATCCGAGAAGCTCTCGAAGTGC
195: GGTCAGATCCTCAGACTCGAACAGAGGAGACCTTAACTG
234: ATACCCACAGTACTTCTCGCATAACCTAAGCACCTATGC
273: TTACACCATCGTCCTGATATTGAGTGAGTCTGGTCGGAG
312: ATATTATCTAGCACCCTCAAGCTCTGTGTGCCACACCAG
351: GATTCCACTTCGCGCTTGCCTAGAGAAAGTAGAGTAGGT
390: GGTGTCATTAGTACACTGTTTGCGATGCACCAACCAAAC
429: CCGACCGCCATGATGACTGCTTTTCGGCCAACGTCAGAT
468: TAAGAGTACTTTTAGTAGCACCGCAAGCCAGCCGGTTTA
507: GCAAGATCCTGCAGCCTCCACGTTATTTCAGGTCTCTAA
546: GCGTTCTTTCCATGGAAGTAGTCACCGCTCCCGTTGCCA
585: ATGGACACAGACGTT

Subsequence to locate: ATAT
Matches found at the following indices:
145..149
289..293
312..316

Julia

<lang julia>DNArand(n, bases=['A', 'T', 'C', 'G']) = String(rand(bases, n))

DNAsearch(needle, haystack, lap=true) = findall(needle, haystack, overlap=lap)

const rand_string = DNArand(200) const subseq = DNArand(4)

println("Search sequence:\n$rand_string\nfor substring $subseq. Found at positions: ") foreach(p -> print(rpad(p[2], 8), p[1] % 10 == 0 ? "\n" : ""), enumerate(DNAsearch(subseq, rand_string)))

</lang>

Output:
Search sequence:
CCGAAGCCAGGAGGACTGAGCGCTTGCGTCCCGAGTTCTGCGACGAGTCTCTTCATTATAAGGCCACTGATTGCGCTCATCATGAGTGCCAGAAGCACCGCTAAACATAAGTGTCCTTTCTTCCTGACGCACTTGAAGATTGTGACCATTTGTGCGGGTTGTGAGTTAGGGGCTCTCATTGTACACGATCTATAGTGTGC
for substring CGCT. Found at positions:
21:24   74:77   99:102

Phix

Note: match_all() is due to become a builtin in the next release, so the version below may or may not need renaming/deleting before it will run.
Currently only searches for non-overlapped sequences, but it should be pretty obvious how to change that, in which case the next underline will simply partially overwrite the previous, so you'll get eg "<=<==>".

constant cheat = false

function grandna(integer len)
    string dna = repeat(' ',len)
    for i=1 to len do dna[i] = "ACGT"[rand(4)] end for
    return dna
end function

procedure show(string dna, sequence idx)
    idx &= length(dna)+100 -- (add an otherwise unused sentinel)
    sequence s = split(trim(join_by(split(join_by(dna,1,10,""),"\n"),1,5," ")),"\n")
    integer ii = 1,         -- idx index
            i = idx[ii],    -- current target
            ux = 1,         -- underline index (1..4)
            ldx = 1         -- line index (1, 51, 101, etc)
    for si=1 to length(s) do
        printf(1,"%3d: %s\n",{ldx,s[si]})
        ldx += 50
        if i and i<ldx then
            string ul = repeat(' ',59)
            while i and i<ldx do
                integer up = i-ldx+51       -- underline pos (relative to ldx)
                up += floor((up-1)/10)+5    -- (plus any needed spacing)
                ul[up] = "<==>"[ux]
                ux += 1
                i += 1
                if ux>4 then
                    ux = 1
                    ii += 1
                    i = idx[ii]
                end if
            end while
            printf(1,"%s\n",ul)
        end if
    end for
    if length(idx) then
        string s = iff(length(idx)>1?"s":""),
               t = join(apply(idx,sprint),", ")
        printf(1,"%s occurs at location%s: %s\n",{test,s,t})
    else
        printf(1,"%s does not occur\n",{test})
    end if
end procedure

function match_all(object needle, sequence haystack, bool bOverlap = false)
    if atom(needle) then return find_all(needle,haystack) end if
    integer start = 1
    sequence res = {}
    while 1 do
        start = match(needle,haystack,start)
        if start=0 then exit end if
        res = append(res,start)
        start += iff(bOverlap?1:length(needle))
    end while
    return res
end function
 
string dna = grandna(200),
       test = grandna(4)
constant cheats = iff(cheat?{9,13,49,60,64,68}:{})
for i=1 to length(cheats) do
    dna[cheats[i]..cheats[i]+3] = test
end for
sequence idx = match_all(test,dna)
show(dna,idx)
Output:

with cheat enabled

  1: GGAGATATCG ACCGACCGAA GTAAAGTCAA AGTCGTCCAA TCCACGGACG
             <= =><==>                                   <=
 51: ACTTCAGCAC GACCGACCGA CCTATTTAAG AGACCACACT TAAGGAATCC
     =>       < ==><==><== >
101: ATGCGAAATA AAAATGGGCG AGTAGCCGTG GGGCGCTAAA GCACCCACCT
151: AGTTTTCGCC GAAGTACTAG ACCACCTTCG GATCGACAAA GCTTTCACCA
                                         <==>
CGAC occurs at locations: 9, 13, 49, 60, 64, 68, 184

with cheat disabled

  1: TGATTTAAAC CGTGGTGCAA TTTATAAACA CTGCGATATG CCTCCTGATG
 51: GCATGGTATT CGACACCAAG ACGCTGGTGG GCACACTGGC TTTCAGAATA
101: GGAGTCACAA TCCCTCTATG ATGTCCTCTA GCGGGTGTGT GTTCAGTGCC
151: AGCGCTTACT TCCGGCGTGG CCGACTCTTT TTAAAGCGTA TAGCTGGGGT
GCTA does not occur

Raku

Chances are actually pretty small that a random 4 codon string will show up at all in a random 200 codon sequence. Bump up the sequence size to get a reasonable chance of multiple matches. <lang perl6>use String::Splice:ver<0.0.3>;

my $line = 80;

my $haystack = [~] <A C G T>.roll($line * 8);

say 'Needle: ' ~ my $needle = [~] <A C G T>.roll(4);

my $these = $haystack ~~ m:g/<$needle>/;

my @match = $these.map: { .from, .pos }

printf "From: %3s to %3s\n", |$_ for @match;

my $disp = $haystack.comb.batch($line)».join.join("\n");

for @match.reverse {

   $disp.=&splice(.[1] + .[1] div $line, "\e[0m" );
   $disp.=&splice(.[0] + .[0] div $line, "\e[31m");

}

say $disp;</lang>

Output:

Show in custom div to better display highlighting.

Needle: TAGC
From: 159 to 163
From: 262 to 266
From: 315 to 319
From: 505 to 509
From: 632 to 636
CATATGTGACACTGACAGCTCGCGCGAAAATCCGTGTGACGGTCTGAACACTATACTATAGGCCCGGTCGGCATTTGTGG
CTCCCCAGTGGAGAGACCACTCGTCAATTGCTGACGACTTAACACAAATCGAGTCGCCCTTAGTGCCAGACGGGACTCCT
AGCAAAGGGCGGCACGTGGTGACTCCCAATATGTGAGCATGCCATCTAATTGATCTGGGGGGTTTCGCGGGAATACCTAG
GGGCGTTCTGTCCATGGATCTCTAGCCCTGCGAAGAGATACCCGCAGTGAGTTGCACGTGCAAAGAACTTGTAACTAGCG
TATTCTGTATCCGCCGCGCGATATGCTTCTGCGGGATGTACTTCTTGTGACTAAGACTTTGTTATCCAAATTGACCAATA
TTCAACGGTCGACTCTCCGAGGCAGTATCGGTACGCCGAAAAATGGTTACTTCGGCCATACGTAACCTCTCAAGTCACGA
TTACAGCCCACGGGGGCTTACAGCATAGCTCCAAAGACATTCCAATTGAGCTACAACGTGTTCAGTGCGGAGCAGTATCC
AGTACTCGACTGTTATGGTAAAAGGGCATCGTGATCGTTTATATTAATCATTGGGACAGGTGGTTAATGTCATAGCTTAG

REXX

This REXX version allows the user to specify:

  •   length of the (random) DNA data sequence     (default is 200).
  •   length of the (random) DNA sequence             (default is four).
  •   DNA proteins to be used in the sequence         (default is ACGT).
  •   width of the output lines of (random DNA)         (default is 100).
  •   often (if ever) to add a blank to the output       (default is every 10 proteins).
  •   DNA proteins to be searched in the data         (the default is four unique random proteins).
  •   the seed for the RANDOM function so runs can be repeated with the same data     (no default).

<lang rexx>/*REXX pgm gens random DNA (ACGT) sequence & finds positions of a random 4─protein seq. */ parse arg totLen rndLen basePr oWidth Bevery rndDNA seed . if totLen== | totLen=="," then totLen= 200 /*Not specified? Then use the default.*/ if rndLen== | rndLen=="," then rndLen= 4 /* " " " " " " */ if basePr== | basePr=="," then basePr= 'acgt' /* " " " " " " */ if oWidth== | oWidth=="," then oWidth= 100 /* " " " " " " */ if Bevery== | Bevery=="," then Bevery= 10 /* " " " " " " */ if rndDNA== | rndDNA=="," then rndDNA= copies(., rndLen) /*what we're looking for*/ if datatype(seed, 'W') then call random ,,seed /*used to generate repeatable random #s*/ call genRnd /*gen data field of random proteins. */ call show /*show " " " " " */ say ' base DNA proteins used: ' basePr say 'random DNA proteins used: ' dna? call findRnd if @== then do; say "the random DNA proteins weren't found."; exit 4; end say 'the random DNA proteins were found in positions:' strip(@) exit 0 /*stick a fork in it, we're all done. */ /*──────────────────────────────────────────────────────────────────────────────────────*/ commas: parse arg ?; do jc=length(?)-3 to 1 by -3; ?=insert(',', ?, jc); end; return ? /*──────────────────────────────────────────────────────────────────────────────────────*/ findRnd: @=; p=0 /*@: list of the found target proteins*/

             do until p==0;    p= pos(dna?, $$, p+1);     if p>0  then @= @ commas(p)
  /*Found one?  Append it to the "Found"s*/
             end   /*p*/;             return

/*──────────────────────────────────────────────────────────────────────────────────────*/ genRnd: dna?=; use= basePr; upper use basePr rndDNA; lenB= length(basePr)

             do k=1  for rndLEN;      x= substr(rndDNA, k, 1)
             if x==.  then  do;  ?= random(1, length(use) );         x= substr(use, ?, 1)
                                 use= delstr(use, ?, 1)   /*elide so no protein repeats*/
                            end
             dna?= dna? || x                              /*build a random protein seq.*/
             end   /*k*/
       return

/*──────────────────────────────────────────────────────────────────────────────────────*/ show: say " index │"center('DNA sequence of ' commas(totLen) " proteins", oWidth+10)

     say "───────┼"center(                                            , oWidth+10, '─')
     $=; $$=;                 idx= 1               /*gen data field of random proteins.*/
        do j=1  for totLen;   c= substr( basePr, random(1, lenB), 1)
         $$= $$ || c                                       /*append a random protein.  */
         if Bevery\==0  then if j//Bevery==0  then $= $' ' /*possibly add a blank.     */
         if length( space($ || c, 0) )<oWidth  then do;   $= $  ||  c;   iterate;   end
         say strip( right(idx, 7)'│' $, 'T');  $=          /*display line ──► terminal.*/
         idx= idx + oWidth                                 /*bump the index number.    */
         end   /*j*/
     if $\==  then say right(idx, 7)"│" strip($, 'T')    /*show residual protein data*/
     say;                return</lang>
output   when using the default inputs:
 index │                                                 DNA sequence of  200  proteins
───────┼──────────────────────────────────────────────────────────────────────────────────────────────────────────────
      1│ TTTTTAGCG CGTTTTGTAG CGCTCTAAAA ACCGTAGCTA TATTTCTCGA AGTTTCACCC AGCTCTTTTG CCCCAGGGTT GCGCTAAGCC CAGCTTCGAG
    101│ GGGGCACAG GTAAAATACT ACCGTCCGTG GAGGGGGATG AATTGACCCG ACATTTTTTG AAGCATAACT CGTGACTCAA TATTGCATGA TTACACCAGC

  base DNA proteins used:  ACGT
random DNA proteins used:  GCAT

the random DNA proteins were found in positions: 162 184
output   when using the inputs of:     1000   ,   ,   ,   ,   tttt
 index │                                       DNA sequence of  1,000  proteins
───────┼──────────────────────────────────────────────────────────────────────────────────────────────────────────────
      1│ GTGATTTTT AGCGCGTTTT GTAGCGCTCT AAAAACCGTA GCTATATTTC TCGAAGTTTC ACCCAGCTCT TTTGCCCCAG GGTTGCGCTA AGCCCAGCTT
    101│ GAGGGGGGC ACAGGTAAAA TACTACCGTC CGTGGAGGGG GATGAATTGA CCCGACATTT TTTGAAGCAT AACTCGTGAC TCAATATTGC ATGATTACAC
    201│ AGCTAGGTT AGTGTAAAAA CCCCCCTATC TTCCTGATCA ATGGCGAGTA AAACATGCAA CCAATTTGTG AGCGAGTACT GGAAATTATT GTTTACGGGA
    301│ AGGCACATG CTACGCGCAA CAGATATCTT AGACTGACCC TTTTAGAGTC ATAAGCCCCT GTCGCCTACA TGCTACTAAT ACTCCAACTA GCGGCGCACC
    401│ TCAACCGGA TCATGGCGCC AGGGAAAATG TGGCGTAGCG ACGTGCTCAT CGCTCGCCGG GGAGAGCCTT TCAGAATCTC GAATAAAACC TGGTAATGAC
    501│ TCATCAATC GTAATGGTCG TCTGGGGCAA GAAGCCGATA TTATAGACTC AGGTCAGACG TGTGCACAAC GGCAGAATTT ATAGTAATTC GCGTGAACTA
    601│ GTTTCGGGA TAGGCCTACG ACCAATCATA GGACATTCGA TGCACGGTGT AGAAACAGTT CTCTGATGTT ACTCGGGATA ACACTCGCAA TCCCCTAGGA
    701│ ACCGTGAGC GTCGCTAGTA TCTGAGATAG TCGCGACTGC CCAGCGGTCT TTAAGTTCGC ACACTACGGG ACTCCTAGTT CGCCCATTCA TGGCTATTTT
    801│ CCTATCAGT CCAATCCCAC GGGGAGGGCA CTCGCGCAAT TCATTCAAAG AGGGCCATTT GCCGATATAA GGTCCATCAT CGGGAGGAAT ATGACTCCTG
    901│ TTAGTATTA GAGCAGCCTC GCTGCGTACT ACTGTCAGTG GCCCGTCAGG GAAGGCAAAA CGTTTTTCCT CTAGGAATCC GTCAATTGGA CTTCTAGACT

  base DNA proteins used:  ACGT
random DNA proteins used:  TTTT

the random DNA proteins were found in positions: 5 6 16 69 157 158 159 340 796 797 962 963

Ring

<lang ring> load "consolecolors.ring"

row = 0 dnaList = [] dnaSeq = [] base = ["A","C","G","T"] long = 20 plus = 0 see "DNA sequence:" + nl + nl see " 12345678901234567890" + nl see " " + long + ": "

for nr = 1 to 200

   row = row + 1
   rnd = random(3)+1
   baseStr = base[rnd]
   see baseStr
   plusLine()
   add(dnaList,baseStr)

next see nl+ " 12345678901234567890" + nl

strDna = list2str(dnaList) strDna = substr(strDna,nl,"")

while true

     strBase = ""
     for n = 1 to 4
         rnd = random(3)+1
         strBase = strBase + base[rnd]
     next
     ind = substr(strDna,strBase)
     if ind > 0
        exit
     ok

end

see nl + "subsequence to search: " + strBase + nl

seqok = 0 see "start positions of subsequence : "

for n = 1 to 196

   flag = 1
   for m = 0 to 3
       if dnaList[n+m] != strBase[m+1]
          flag = 0
          exit
       ok
   next
   if flag = 1
      add(dnaSeq,n)
      seqok = 1
      see "" + n + " "
   ok

next

if seqok = 0

  see "sequence not found" + nl

ok

row = 0 showDna(dnaList)

func showDna(dnaList)

    long = 20
    see nl + "found subsequences:" + nl + nl
    see "     12345678901234567890" + nl
    see " " + long + ": "
    for nr = 1 to len(dnaList)
        if plus = 0
           row = row + 1
        ok
        if plus = 1
           nr = nr + 3
           row = row + 1
           plusLine()
        ok
        ind = find(dnaSeq,nr)
        if ind > 0
           for n = nr to nr + 3
               cc_print(CC_BG_DARK_RED | CC_FG_WHITE,dnaList[n])
               if n != nr
                   row = row + 1
               ok 
               plusLine()
           next
           plus = 1
           if (row%20) = 0
               row = row + 1
               nr = nr + 1
           ok
        else
           plus = 0
           see dnaList[nr]
        ok
        plusLine()
    next
    see nl+ "     12345678901234567890" + nl

func plusLine()

    if (row%20) = 0 and long < 200
        long = long + 20
        see nl
        if long < 100
           see " " + long + ": "
        else
           see "" + long + ": "
        ok
    ok

</lang>

Output:
DNA sequence:

     12345678901234567890
 20: CAGTAAATAAGGAGAACAGG
 40: GATCTATCTGCGCAGTTGTT
 60: CAAATCAAGAGGAAAAAGTT
 80: AAATCCAACACGGTAGGATG
100: CATTGAAAGGTTGCGTAAGA
120: AAAAAGGAGGGAAATGATCG
140: AAACAAAGTACGTCAATTAG
160: ATGCCAAAGACCGATAAAAG
180: GTATTAGTATTAGAGCAGCG
200: AATGAGGAAGACTTCGAGAA
     12345678901234567890

subsequence to search: AAGA
start positions of subsequence : 47 97 147 188
found subsequences:

     12345678901234567890
 20: CAGTAAATAAGGAGAACAGG
 40: GATCTATCTGCGCAGTTGTT
 60: CAAATCAAGAGGAAAAAGTT
 80: AAATCCAACACGGTAGGATG
100: CATTGAAAGGTTGCGTAAGA
120: AAAAGGAGGGAAATGATCG
140: AAACAAAGTACGTCAATTAG
160: ATGCCAAAGACCGATAAAAG
180: GTATTAGTATTAGAGCAGCG
200: AATGAGGAAGACTTCGAGAA
     12345678901234567890

Wren

Library: Wren-pattern
Library: Wren-str
Library: Wren-fmt

<lang ecmascript>import "random" for Random import "/pattern" for Pattern import "/str" for Str import "/fmt" for Fmt

var rand = Random.new() var base = "ACGT"

var findDnaSubsequence = Fn.new { |dnaSize, chunkSize|

   var dnaSeq = List.filled(dnaSize, null)
   for (i in 0...dnaSize) dnaSeq[i] = base[rand.int(4)]
   var dnaStr = dnaSeq.join()
   var dnaSubseq = List.filled(4, null)
   for (i in 0...4) dnaSubseq[i] = base[rand.int(4)]
   var dnaSubstr = dnaSubseq.join()
   System.print("DNA sequence:")
   var i = chunkSize
   for (chunk in Str.chunks(dnaStr, chunkSize)) {
        Fmt.print("$3d..$3d: $s", i - chunkSize + 1, i, chunk)
        i = i + chunkSize
   }
   System.print("\nSubsequence to locate: %(dnaSubstr)")
   var p = Pattern.new(dnaSubstr)
   var matches = p.findAll(dnaStr)
   if (matches.count == 0) {
       System.print("No matches found.")
   } else {
       System.print("Matches found at the following indices:")
       for (m in matches) {
           Fmt.print("$3d..$3d", m.index + 1, m.index + 4)
       }
   }

}

findDnaSubsequence.call(200, 20) System.print() findDnaSubsequence.call(600, 40)</lang>

Output:
DNA sequence:
  1.. 20: TATGGGCGCATTATGACAAC
 21.. 40: GGCTACTGAAACGAAAATTC
 41.. 60: ATGCCTTCGGAGGCTAGACC
 61.. 80: ACTCATACATGATTTACAGC
 81..100: TAGTCAGTTGCGTCCGCCAT
101..120: CCCGCATAACTATGTATTAC
121..140: GAGCATGTTCTGGCAACCTT
141..160: TCAGTGACAGTTCCTCAGGC
161..180: GCGTTCGCGTTGAAGGCCTC
181..200: CCCACACCGCACCCCTGCCG

Subsequence to locate: AATT
Matches found at the following indices:
 36.. 39

DNA sequence:
  1.. 40: GCGCTGAGCGCCCCAGTACAGCGGGTTAAACCGAGCCCGC
 41.. 80: TCCGATGAACCAACTCCCATTCCTATAATGGTGCCCCGAC
 81..120: ATATTGAATTCGGCGGGTCCGCTATCGGGCTGAGGATGCC
121..160: AATATCTAGGCGCTACCCTGAAGATCCTCAGTTGTGGTGT
161..200: CGCGGAGTGTCGATCCCAGAGCTCCCAATTGACTCAATTA
201..240: CTTTTTCCGTCCTCTTGCTTACGGATTTATGTTTGTGGCA
241..280: GAGGTTATGCTTCAGGCATCCCCATGTTTCCTGAGATACG
281..320: ACCACTGTCAGGTGGCTTGAATCTACCTTGTATTTCCTCT
321..360: AGTACCAGTCACTGTCATCTACTGGAAGCCATATCAGCGT
361..400: TGAAATGTCTATAATTTACTCTCCGGTTGTACCCAAGCGA
401..440: TAACAGCAACGTGTGGGTCTAAAGAGTTCCGCGTTTCGAC
441..480: ATAACGTGCTCCTATTTATCTACCGAAACACCCTATTTTC
481..520: CATCTAACCGGCACCCAATGCGCAGGTGTACGCGTCCTAC
521..560: TACGTTTGAAACGGTTCCATCTCGCCATGTACAATTGTGG
561..600: GGCTACGATTAAGTGTAGTCGGTAATTCAGGGTGAAGTTG

Subsequence to locate: TTCG
Matches found at the following indices:
 89.. 92
435..438