Bioinformatics/Subsequence: Difference between revisions
No edit summary |
Alextretyak (talk | contribs) m (→{{header|11l}}: Void) |
||
(82 intermediate revisions by 17 users not shown) | |||
Line 1: | Line 1: | ||
{{Draft task}} |
{{Draft task}} |
||
;Task: |
;Task: |
||
Randomly generate a string of '''200''' DNA bases (represented by '''A, C, G,''' and '''T'''). |
|||
Genarate randomly a string (200 elements) of characters A, C, G, and T representing a DNA sequence write a routine to find the position of subsequence (also generating randomly). |
|||
<br> Let length of subsequence equal to '''4''' |
|||
Write a routine to find all the positions of a randomly generated subsequence (four letters). |
|||
<br><br> |
|||
=={{header|11l}}== |
|||
{{trans|Python}} |
|||
<syntaxhighlight lang="11l">UInt32 seed = 34 |
|||
F nonrandom_choice(lst) |
|||
:seed = (1664525 * :seed + 1013904223) [&] FFFF'FFFF |
|||
R lst[Int(:seed >> 16) % lst.len] |
|||
F generate_sequence(Int n) |
|||
R ((0 .< n).map(_ -> nonrandom_choice([‘A’, ‘C’, ‘G’, ‘T’]))).join(‘’) |
|||
F positions(dnaSeq, subSeq) |
|||
[Int] r |
|||
V start = 0 |
|||
L |
|||
V? pos = dnaSeq.find(subSeq, start) |
|||
I pos == N |
|||
L.break |
|||
r.append(pos) |
|||
start = pos + 1 |
|||
R r |
|||
F dna_findall(String needle, haystack) -> Void |
|||
V pp = positions(haystack, needle) |
|||
I pp.empty |
|||
print(‘No matches found’) |
|||
E |
|||
print(‘Found ’needle‘ at the following indices:’) |
|||
L(p) pp |
|||
print(p‘:’(p + needle.len)) |
|||
V dna_seq = generate_sequence(200) |
|||
V sample_seq = generate_sequence(4) |
|||
V c = 1 |
|||
L(i) dna_seq |
|||
I c % 20 != 0 {print(i, end' ‘’)} E print(i) |
|||
c++ |
|||
print("\nSearch Sample: "sample_seq) |
|||
dna_findall(sample_seq, dna_seq)</syntaxhighlight> |
|||
{{out}} |
|||
<pre> |
|||
GAAGTGCTCAAACCCTTTTT |
|||
CCTTGCCGTAGGTTGTGCTG |
|||
CCGCCGCACACCCGCAACAG |
|||
CTTTTAGGCATAAGTATACG |
|||
GACCGCGGACGGGGCGTAAC |
|||
GGTGAACATTTTGCTAAATT |
|||
GGCTCTAGGGATGAGCCCTA |
|||
TAGCGCTGGGGACTACGCCC |
|||
CGGTAAAGATCGAGGCGACT |
|||
CACCGATTGCGCTAGGGACA |
|||
Search Sample: CGTA |
|||
Found CGTA at the following indices: |
|||
26:30 |
|||
94:98 |
|||
</pre> |
|||
=={{header|Action!}}== |
|||
<syntaxhighlight lang="action!">DEFINE SEQLEN="200" |
|||
DEFINE SUBLEN="4" |
|||
PROC RandomSeq(CHAR ARRAY s BYTE len) |
|||
CHAR ARRAY letters="ACGT" |
|||
BYTE i |
|||
FOR i=1 TO len |
|||
DO |
|||
s(i)=letters(Rand(4)+1) |
|||
OD |
|||
s(0)=len |
|||
RETURN |
|||
PROC PrintSeq(CHAR ARRAY s) |
|||
BYTE i |
|||
FOR i=1 TO s(0) |
|||
DO |
|||
IF i MOD 20=1 THEN |
|||
IF i<10 THEN Put(32) FI |
|||
IF i<100 THEN Put(32) FI |
|||
PrintB(i) |
|||
Print(": ") |
|||
FI |
|||
Put(s(i)) |
|||
IF i MOD 20=0 THEN |
|||
PutE() |
|||
FI |
|||
OD |
|||
RETURN |
|||
BYTE FUNC StartsWith(CHAR ARRAY s,prefix BYTE start) |
|||
BYTE i |
|||
FOR i=1 TO prefix(0) |
|||
DO |
|||
IF s(start+i-1)#prefix(i) THEN |
|||
RETURN (0) |
|||
FI |
|||
OD |
|||
RETURN (1) |
|||
PROC Main() |
|||
CHAR ARRAY seq(SEQLEN+1),sub(SUBLEN+1) |
|||
BYTE i,notfirst |
|||
RandomSeq(seq,SEQLEN) |
|||
RandomSeq(sub,SUBLEN) |
|||
PrintE("Search sequence:") |
|||
PrintSeq(seq) |
|||
PutE() |
|||
PrintF("Subsequence to find: %S%E%E",sub) |
|||
PrintE("Found subsequence at positions:") |
|||
notfirst=0 |
|||
FOR i=1 TO SEQLEN-SUBLEN |
|||
DO |
|||
IF StartsWith(seq,sub,i) THEN |
|||
IF notfirst THEN |
|||
Print(", ") |
|||
FI |
|||
notfirst=1 |
|||
PrintF("%I-%I",i,i+SUBLEN-1) |
|||
FI |
|||
OD |
|||
IF notfirst=0 THEN |
|||
PrintE("Not found") |
|||
FI |
|||
RETURN</syntaxhighlight> |
|||
{{out}} |
|||
[https://gitlab.com/amarok8bit/action-rosetta-code/-/raw/master/images/Bioinformatics_subsequence.png Screenshot from Atari 8-bit computer] |
|||
<pre> |
|||
Search sequence: |
|||
1: CGACTCAGGAAGGCCACGTG |
|||
21: GTAACTTCTTAGTTACCGTA |
|||
41: AGGCTAATAGCTAGCGCTGC |
|||
61: GTGACCAGGCATAGTAACCG |
|||
81: GCACGCACGTTCACCAAGGG |
|||
101: GTCCCGATGGGAGGCACGTT |
|||
121: ACTACTCCAAGAACTGTAGT |
|||
141: AAGTTACCGAAAAGTTCTCA |
|||
161: TCCTTGGGTAGTGAGTACTT |
|||
181: TGTGCTATGAAAAATAAGGA |
|||
Subsequence to find: ACGC |
|||
Found subsequence at positions: |
|||
83-86 |
|||
</pre> |
|||
=={{header|Ada}}== |
|||
<syntaxhighlight lang="ada">with Ada.Text_Io; |
|||
with Ada.Strings.Fixed; |
|||
with Ada.Numerics.Discrete_Random; |
|||
procedure Sub_Sequence is |
|||
type Nucleotide is (A, C, G, T); |
|||
function To_Character (N : Nucleotide) return Character |
|||
is (case N is |
|||
when A => 'A', when C => 'C', |
|||
when G => 'G', when T => 'T'); |
|||
package Random_Nucleotide is new Ada.Numerics.Discrete_Random (Nucleotide); |
|||
use Random_Nucleotide; |
|||
package Position_Io is new Ada.Text_Io.Integer_Io (Natural); |
|||
use Ada.Text_Io; |
|||
procedure Put_Bases (Seq : String; Width : Positive) is |
|||
First : Natural := Seq'First; |
|||
begin |
|||
while First < Seq'Last loop |
|||
declare |
|||
Last : constant Natural := |
|||
Natural'Min (First + Width - 1, Seq'Last); |
|||
begin |
|||
Position_Io.Put (First); Put (".."); |
|||
Position_Io.Put (Last); Put (" "); |
|||
Put (Seq (First .. Last)); |
|||
New_Line; |
|||
First := Last + 1; |
|||
end; |
|||
end loop; |
|||
end Put_Bases; |
|||
Gen : Generator; |
|||
Sequence : String (1 .. 405); |
|||
Substring : String (1 .. 4); |
|||
Pos : Natural := 0; |
|||
begin |
|||
Position_Io.Default_Width := 3; |
|||
Reset (Gen); |
|||
Sequence := (others => To_Character (Random (Gen))); |
|||
Substring := (others => To_Character (Random (Gen))); |
|||
Put_Line ("Search sequence:"); |
|||
Put_Bases (Sequence, Width => 50); |
|||
New_Line; |
|||
Put ("Substring to search: "); |
|||
Put (Substring); |
|||
New_Line; |
|||
loop |
|||
Pos := Ada.Strings.Fixed.Index (Sequence, Substring, Pos + 1); |
|||
exit when Pos = 0; |
|||
Put ("Found at position: "); |
|||
Position_Io.Put (Pos); Put (".."); |
|||
Position_Io.Put (Pos + Substring'Length - 1); |
|||
New_Line; |
|||
end loop; |
|||
end Sub_Sequence;</syntaxhighlight> |
|||
{{out}} |
|||
<pre>Search sequence: |
|||
1.. 50 CCTACGGAAAAGTGATAAGGACAGATACATAATCCTAAAACCCTGGAAAA |
|||
51..100 CTTGTCTCGCCAGAGTAGGGCTCGGCAGGGGGGGCAGTGTTTTAAAACGT |
|||
101..150 CAGAGAATAGGCTCTACCTTGTTAGACTGCGAGTACTGGAGCGTAGTTCC |
|||
151..200 TATATTGCAAGCTGCTACAGTAAGTATCAAAGTATGCCACACATCCTTCT |
|||
201..250 ACAACCGGATTGGTTGCCCAGTAGAAGGCTCGTAGTCACCGGACACGCTG |
|||
251..300 TTCTTAAGGTCGGTAAGCTATTACGTCCATGGGAGATTCTCAAGGGTGCG |
|||
301..350 TTAGCGGACCCCCGTTACGTCCACGTATCTTCCGTCCAACTACCCCCTAA |
|||
351..400 TGTCATTGACATCGCCCGAGTATTTAATTTATTTGAACGGCACCAATTTA |
|||
401..405 GAGCT |
|||
Substring to search: TATT |
|||
Found at position: 153..156 |
|||
Found at position: 269..272 |
|||
Found at position: 371..374 |
|||
Found at position: 380..383</pre> |
|||
=={{header|Arturo}}== |
|||
<syntaxhighlight lang="rebol">bases: [`A` `G` `C` `T`] |
|||
randSeq: join map 1..200 => [sample bases] |
|||
randSub: join map 1..4 => [sample bases] |
|||
idx: 0 |
|||
print "Random sequence:" |
|||
print join.with:"\n" split.every: 20 randSeq |
|||
print "" |
|||
print "Looking for subsequence:" |
|||
print randSub |
|||
print "" |
|||
while [(size randSeq) > idx + 4][ |
|||
if prefix? slice randSeq idx idx+4 randSub -> |
|||
print ["Found subsequence at position:" idx] |
|||
idx: idx + 1 |
|||
]</syntaxhighlight> |
|||
{{out}} |
|||
<pre>Random sequence: |
|||
CACGCGCGTTAACCCTGCAT |
|||
CTTTTCTCTAAGATGATGCG |
|||
CTACTCTGCCCGATTACTAT |
|||
GATGTCACCGGCGGTTCGGC |
|||
GACTGGCGCTGGCAGAAAGC |
|||
GCATGTCAAATTGCCCCAGT |
|||
GTGCAAGTCCAAGTATTAGT |
|||
GAGGTGCTCCGCTTCGTCCG |
|||
GGGTCGACTCGGTCCCACTT |
|||
CATTACATGTTGGTAATAGT |
|||
Looking for subsequence: |
|||
CGGT |
|||
Found subsequence at position: 71 |
|||
Found subsequence at position: 169</pre> |
|||
=={{header|Factor}}== |
|||
{{works with|Factor|0.99 2021-02-05}} |
|||
<syntaxhighlight lang="factor">USING: accessors formatting grouping io kernel math |
|||
math.functions.integer-logs math.parser random regexp sequences ; |
|||
: new-dna ( n -- str ) [ "ACGT" random ] "" replicate-as ; |
|||
: pad ( n d -- str ) [ number>string ] dip 32 pad-head ; |
|||
:: .dna ( seq n -- ) |
|||
seq length integer-log10 1 + :> d seq n group |
|||
[ n * d pad write ": " write write nl ] each-index ; |
|||
: .match ( slice -- ) [ from>> ] [ to>> ] bi "%d..%d\n" printf ; |
|||
: .matches ( slices -- ) |
|||
"Matches found at the following indices:" print |
|||
[ .match ] each ; |
|||
: .locate ( slices -- ) |
|||
[ "No matches found." print ] [ .matches ] if-empty ; |
|||
: .biosub ( dna-size row-size -- ) |
|||
[ new-dna dup ] [ .dna nl ] bi* |
|||
4 new-dna dup "Subsequence to locate: %s\n" printf |
|||
<regexp> all-matching-slices .locate ; |
|||
80 10 .biosub nl |
|||
600 39 .biosub nl</syntaxhighlight> |
|||
{{out}} |
|||
<pre> 0: ATTCAAGGAC |
|||
10: CACTATTAAC |
|||
20: CTGCATTGTG |
|||
30: AGAACTTGCA |
|||
40: GTGTACCGAG |
|||
50: AGCGAGTTTA |
|||
60: AAGCAACACA |
|||
70: TCTTTACCGA |
|||
Subsequence to locate: GTAG |
|||
No matches found. |
|||
0: GATCTCGTCATGGTCCATCCTAACATTTCGGTTGTGGGC |
|||
39: GCATCCCGATAGGCGAAGTTAAATCTACGTAGTCCTACG |
|||
78: TCACGACGGAACATGATTGCCCACCGAAGTCGTAGGCGA |
|||
117: GCTAAAGTCGGTACATACACGATCTGCTATATTCGTTCT |
|||
156: CCGACACACGACATGCAATCCGAGAAGCTCTCGAAGTGC |
|||
195: GGTCAGATCCTCAGACTCGAACAGAGGAGACCTTAACTG |
|||
234: ATACCCACAGTACTTCTCGCATAACCTAAGCACCTATGC |
|||
273: TTACACCATCGTCCTGATATTGAGTGAGTCTGGTCGGAG |
|||
312: ATATTATCTAGCACCCTCAAGCTCTGTGTGCCACACCAG |
|||
351: GATTCCACTTCGCGCTTGCCTAGAGAAAGTAGAGTAGGT |
|||
390: GGTGTCATTAGTACACTGTTTGCGATGCACCAACCAAAC |
|||
429: CCGACCGCCATGATGACTGCTTTTCGGCCAACGTCAGAT |
|||
468: TAAGAGTACTTTTAGTAGCACCGCAAGCCAGCCGGTTTA |
|||
507: GCAAGATCCTGCAGCCTCCACGTTATTTCAGGTCTCTAA |
|||
546: GCGTTCTTTCCATGGAAGTAGTCACCGCTCCCGTTGCCA |
|||
585: ATGGACACAGACGTT |
|||
Subsequence to locate: ATAT |
|||
Matches found at the following indices: |
|||
145..149 |
|||
289..293 |
|||
312..316</pre> |
|||
=={{header|FreeBASIC}}== |
|||
{{trans|Wren}} |
|||
<syntaxhighlight lang="vb">Const base_ = "ACGT" |
|||
Sub findDnaSubsequence(dnaSize As Integer, chunkSize As Integer) |
|||
Dim As String dnaSeq(1 To dnaSize) |
|||
Dim As Integer i, chunk |
|||
For i = 1 To dnaSize |
|||
dnaSeq(i) = Mid(base_, Int(Rnd * 4)+1, 1) |
|||
Next |
|||
Dim As String dnaStr |
|||
For i = 1 To dnaSize |
|||
dnaStr += dnaSeq(i) |
|||
Next |
|||
Dim As String dnaSubseq(1 To 4) |
|||
For i = 1 To 4 |
|||
dnaSubseq(i) = Mid(base_, Int(Rnd * 4)+1, 1) |
|||
Next |
|||
Dim As String dnaSubstr |
|||
For i = 1 To 4 |
|||
dnaSubstr += dnaSubseq(i) |
|||
Next |
|||
Print "DNA sequence:" |
|||
For chunk = 1 To Len(dnaStr) Step chunkSize |
|||
Print Using "###_._.###: &"; chunk; chunk+chunkSize-1; Mid(dnaStr, chunk, chunkSize) |
|||
Next |
|||
Print !"\nSubsequence to locate: "; dnaSubstr |
|||
Dim As Integer idx = Instr(dnaStr, dnaSubstr) |
|||
Print Iif(idx <> 0, "Matches found at the following indices:", "No matches found.") |
|||
Do While idx > 0 |
|||
If idx <> 0 Then Print Using "###_._.###"; idx; idx + 3 |
|||
idx = Instr(idx+4, dnaStr, dnaSubstr) |
|||
Loop |
|||
End Sub |
|||
findDnaSubsequence(200, 20) |
|||
Print |
|||
findDnaSubsequence(600, 40) |
|||
Sleep</syntaxhighlight> |
|||
{{out}} |
|||
<pre>DNA sequence: |
|||
1.. 20: TTATAGTCTTGGAGGCATGT |
|||
21.. 40: TAACTTATGCGGAGCAGACA |
|||
41.. 60: CGGAGTATGCATTCCTCTTA |
|||
61.. 80: CCAAACGGTGCTGCCCGCGC |
|||
81..100: ACTCGCTGTATTCCGTATCG |
|||
101..120: TCACATTATCTAAACCACGA |
|||
121..140: TTTCCAGCGTGCGTGGGAAG |
|||
141..160: GCCATGTTTAGTCGGGGGCC |
|||
161..180: AAGGTCTTTGGCTTATGCTG |
|||
181..200: TTTTTTTTTCTTCGGTTACA |
|||
Subsequence to locate: ATTT |
|||
Matches found at the following indices: |
|||
120..123 |
|||
DNA sequence: |
|||
1.. 40: GTGCGGGCCGTTAGCAGCTACGAGTGCTAGATGGAACTAG |
|||
41.. 80: TCCCCGCTCCCAAATGCAAAGCGTCCCAGACCAGTCTTGA |
|||
81..120: AGCCCGTTAAATTACACCTGAACCGTTGCAAATGATCGAT |
|||
121..160: AGACGGGGTATAATAGCGGAAAACACAGGGGAACTGCATG |
|||
161..200: CAAGCTCGAGCCGCTGAAGGATGGCTCCCCCCCGAGTGTA |
|||
201..240: AGTGGATCTCGCCCAAATAGCGGGGGAACAAAGAAAGGTA |
|||
241..280: AGTCTTACTTCGCACGTCCCCTCTCATACACGCCAGGACT |
|||
281..320: AATGGATCATTCATAGGTGACGGGTGACTTGCGGTGTTTC |
|||
321..360: TAGTTGGAGTCACCCGTCAGCTTAGATCTAAGTATGAACC |
|||
361..400: GTAAGAGTTTGTAACTGCACCTTCCGTCTCTTCCTCTGTA |
|||
401..440: GGAACGCTTTTGCTTGTTATCAGATAGTGTCTCCTTATCA |
|||
441..480: TAGGACAGGTTCCTTGTGAAGGTCCACAGAGTTTGCCCGG |
|||
481..520: GGTTCGAATATACGACGCTTGTGGTTCCGGCACTATAACT |
|||
521..560: TCCGCAGTGTTGTCGACGCCCCTAGCTCCCGGGGTCTTTT |
|||
561..600: CGCTTCCCTATAGCGCGAAATGAGTGCAAGGGTACCGGCC |
|||
Subsequence to locate: GCAC |
|||
Matches found at the following indices: |
|||
252..255 |
|||
377..380 |
|||
510..513</pre> |
|||
=={{header|Go}}== |
|||
{{trans|Wren}} |
|||
<syntaxhighlight lang="go">package main |
|||
import ( |
|||
"fmt" |
|||
"math/rand" |
|||
"regexp" |
|||
"time" |
|||
) |
|||
const base = "ACGT" |
|||
func findDnaSubsequence(dnaSize, chunkSize int) { |
|||
dnaSeq := make([]byte, dnaSize) |
|||
for i := 0; i < dnaSize; i++ { |
|||
dnaSeq[i] = base[rand.Intn(4)] |
|||
} |
|||
dnaStr := string(dnaSeq) |
|||
dnaSubseq := make([]byte, 4) |
|||
for i := 0; i < 4; i++ { |
|||
dnaSubseq[i] = base[rand.Intn(4)] |
|||
} |
|||
dnaSubstr := string(dnaSubseq) |
|||
fmt.Println("DNA sequnence:") |
|||
for i := chunkSize; i <= len(dnaStr); i += chunkSize { |
|||
start := i - chunkSize |
|||
fmt.Printf("%3d..%3d: %s\n", start+1, i, dnaStr[start:i]) |
|||
} |
|||
fmt.Println("\nSubsequence to locate:", dnaSubstr) |
|||
var r = regexp.MustCompile(dnaSubstr) |
|||
var matches = r.FindAllStringIndex(dnaStr, -1) |
|||
if len(matches) == 0 { |
|||
fmt.Println("No matches found.") |
|||
} else { |
|||
fmt.Println("Matches found at the following indices:") |
|||
for _, m := range matches { |
|||
fmt.Printf("%3d..%-3d\n", m[0]+1, m[1]) |
|||
} |
|||
} |
|||
} |
|||
func main() { |
|||
rand.Seed(time.Now().UnixNano()) |
|||
findDnaSubsequence(200, 20) |
|||
fmt.Println() |
|||
findDnaSubsequence(600, 40) |
|||
}</syntaxhighlight> |
|||
{{out}} |
|||
Sample run: |
|||
<pre> |
|||
DNA sequnence: |
|||
1.. 20: GTTGCCCACACGTCTTATTG |
|||
21.. 40: TAAAAATCACCGTGCAGCGA |
|||
41.. 60: GGTTAAAAATGGTAGGAAAA |
|||
61.. 80: TATCCTCAGCCAGCGGTGCC |
|||
81..100: GGCCAACAAAAGGGACGTTG |
|||
101..120: GATTAAAGTAGGTCTAGGTA |
|||
121..140: TCTCGTATCCGGTTGATCCG |
|||
141..160: GGATGGTGGACGATATTGGA |
|||
161..180: GACCGGAGTGTACATCGGTG |
|||
181..200: TTGTCGCTTGCAGCTACGGT |
|||
Subsequence to locate: AATA |
|||
Matches found at the following indices: |
|||
59..62 |
|||
DNA sequnence: |
|||
1.. 40: GTACAGCCACTGTTAGTAGACGGATGCTATTGGGACGCAA |
|||
41.. 80: CACATCAGTACACTGCTTGTTCGTAATCGCGTACCCAGCG |
|||
81..120: CAAAAGGAGGGGAGGAACCTGCTCAGACTGTCGCTAAAAA |
|||
121..160: CGAGCACGTGTCCTTACGCAGTGATGGTAGCGGTCCACGA |
|||
161..200: CTTCCACTGGCATAAGGAGAATGTTTAGTAACGCCCCTCA |
|||
201..240: TAGGTGCAATTCTACAGGTTAAGGGACCGTGGGATGTTTC |
|||
241..280: TATAAAAGTTGAAGAGATTACTAATCCGTCCCGTGCGCGT |
|||
281..320: GCCGCAATTTAGCGCCCGTTCTTGAGTAAACATACATGCA |
|||
321..360: CGCTCTTGAGTTTTCTAAAACCTGATCAAAACGGTCGCCC |
|||
361..400: ACATGCAGGAGCGCCGCAGGGTTTCAGAGGTCAACCATCG |
|||
401..440: GCAGCACACGTGAACCCTCTGTACTGACCAGGGGCTTGCT |
|||
441..480: CCTTGGTAGGAGATGGTGGAGAATGCGTCGATGCACTGAA |
|||
481..520: GCAGACCGCTGATAGCATGTACGATGTTTACGGGTTGACG |
|||
521..560: ATAGCTTTGCTAGTGATCGAACATATGATGAAAAACGCTT |
|||
561..600: CCATTGATAGAGCATCTTAGGAGCTCAGTCCAGTGACCTC |
|||
Subsequence to locate: AGGT |
|||
Matches found at the following indices: |
|||
202..205 |
|||
216..219 |
|||
388..391 |
|||
</pre> |
|||
=={{header|jq}}== |
|||
{{works with|jq}} |
|||
'''Works with gojq, the Go implementation of jq''' |
|||
Neither jq nor gojq currently has any PRNG built-ins so one |
|||
possibility is to use a jq-coded PRNG function such |
|||
as can be found at https://rosettacode.org/wiki/Random_numbers#jq |
|||
In practice, it's usually more convenient to use a utility such as |
|||
gshuf or jot to provide the source of randomness. Here we use |
|||
`jot -r N MIN MAX` but a fourth argument can also be |
|||
used to specify a seed. An alternative would be to use `gshuf` along the lines of: |
|||
<syntaxhighlight lang="sh"> |
|||
# For 200 pseudo-random integers in the range 0 to 3 inclusive: |
|||
gshuf -i 0-3 -r -n 200 --random-source=/dev/random |
|||
</syntaxhighlight> |
|||
Note that the indices shown below are offsets (i.e., the index origin is taken to be 0). |
|||
<syntaxhighlight lang="sh"> |
|||
#!/bin/bash |
|||
jot -r 200 0 3 | jq -nr --slurpfile four <(jot -r 4 0 3) ' |
|||
# input: an array of integers |
|||
def toDNA: |
|||
def base: . as $in | "ACGT" | .[$in : $in+1]; |
|||
map(base) | join(""); |
|||
([inputs] | toDNA) as $strand |
|||
| ($four | toDNA) as $four |
|||
| "Strand of length \($strand|length):", |
|||
$strand, |
|||
"Zero-based indices of \($four):", |
|||
($strand | indices($four) | join(" ")) |
|||
'</syntaxhighlight> |
|||
{{out}} |
|||
<pre> |
|||
./bioinformatics-subsequence.sh |
|||
Strand of length 200: |
|||
TGGGCCCAAGCATTGCCACGTAGCTTTGTCAGTGGGCTTGTAAGGGACGAACACAAACTCACAGACCAGGAATTCTCGAGTTCCAGTCCCCCCACTTGTCGCTATTTAGTTAAGACGTTCAGTTTCGTTGCGAACTGTGTCCCCCAGGCTAACGTGATGGGTGTCAGGAATCAATGGCCAACTTTCAGTTAGACTTGACC |
|||
Zero-based indices of CAAC: |
|||
178 |
|||
./bioinformatics-subsequence.sh |
|||
Strand of length 200: |
|||
TAAGACTGCAGGGTACGAAGAGTGGAAGATTGGCTCGTACTTGTCGACGTCGCGTGACATAATCTCTGTGCTCGCCTCGCAGTAAGGGACTAGGTCCCGTTCGAGCGCCCTGCTAGAAGGAGCATCCTACCATGCTCTGATGACATCCTGTCGGCATTAGAGTTTCTACGACATCTAAAGAGTACGATCGACTTCCCAGT |
|||
Zero-based indices of GACA: |
|||
55 141 169 |
|||
</pre> |
|||
=={{header|Julia}}== |
|||
<syntaxhighlight lang="julia">DNArand(n, bases=['A', 'T', 'C', 'G']) = String(rand(bases, n)) |
|||
DNAsearch(needle, haystack, lap=true) = findall(needle, haystack, overlap=lap) |
|||
const rand_string = DNArand(200) |
|||
const subseq = DNArand(4) |
|||
println("Search sequence:\n$rand_string\nfor substring $subseq. Found at positions: ") |
|||
foreach(p -> print(rpad(p[2], 8), p[1] % 10 == 0 ? "\n" : ""), enumerate(DNAsearch(subseq, rand_string))) |
|||
</syntaxhighlight>{{out}} |
|||
<pre> |
|||
Search sequence: |
|||
CCGAAGCCAGGAGGACTGAGCGCTTGCGTCCCGAGTTCTGCGACGAGTCTCTTCATTATAAGGCCACTGATTGCGCTCATCATGAGTGCCAGAAGCACCGCTAAACATAAGTGTCCTTTCTTCCTGACGCACTTGAAGATTGTGACCATTTGTGCGGGTTGTGAGTTAGGGGCTCTCATTGTACACGATCTATAGTGTGC |
|||
for substring CGCT. Found at positions: |
|||
21:24 74:77 99:102 |
|||
</pre> |
|||
=={{header|Nim}}== |
|||
<syntaxhighlight lang="nim">import random, sequtils, strutils |
|||
proc dnaSequence(n: Positive): string = |
|||
## Create a random DNA sequence of length "n". |
|||
newSeqWith(n, sample("ACGT")).join() |
|||
proc positions(dnaSeq, subSeq: string): seq[int] = |
|||
## Return the list of starting positions of a subsequence |
|||
## "subSeq" in a sequence "dnaSeq". Positions start at 1. |
|||
var start = 0 |
|||
while true: |
|||
let pos = dnaSeq.find(subSeq, start) |
|||
if pos < 0: break |
|||
result.add pos + 1 |
|||
start = pos + 1 |
|||
when isMainModule: |
|||
const |
|||
N = 200 |
|||
Step = 20 |
|||
randomize() |
|||
let dnaSeq = dnaSequence(N) |
|||
echo "DNA sequence:" |
|||
for i in countup(0, N - 1, Step): |
|||
echo ($(i+1)).align(3), ' ', dnaSeq[i..i+(Step-1)] |
|||
let subSeq = dnaSequence(3) |
|||
echo "\nDNA subsequence: ", subSeq |
|||
echo() |
|||
let pos = dnaSeq.positions(subSeq) |
|||
if pos.len == 0: |
|||
echo "Subsequence not found." |
|||
else: |
|||
let tail = if pos.len == 1: ": " else: "s: " |
|||
echo "Subsequence found at position", tail, pos.join(", ")</syntaxhighlight> |
|||
{{out}} |
|||
<pre>DNA sequence: |
|||
1 CACATACGATGAGCTGGGCG |
|||
21 CCTAAGAGGCGGAAAGACAA |
|||
41 CCGTGTGTGTCTAACCCATG |
|||
61 GTTTAATTGCAGATAGTCTC |
|||
81 TAGACTACAAACATTAGAGC |
|||
101 AATGCACCGGGGTGCACGTG |
|||
121 TGTTTTGACTTCCCATGAAA |
|||
141 GCCCTTATCCTAGAGTACAG |
|||
161 TCGGCAAATGTTCGCTCCTT |
|||
181 GGCCCACTCCATTTGGACGG |
|||
DNA subsequence: GTT |
|||
Subsequence found at positions: 61, 122, 170</pre> |
|||
=={{header|Perl}}== |
|||
<syntaxhighlight lang="perl">use strict; |
|||
use warnings; |
|||
use feature 'say'; |
|||
my @bases = <A C G T>; |
|||
my $basecnt = 160; |
|||
my($string,$target); |
|||
$string .= $bases[ int rand @bases ] for 1 .. $basecnt; |
|||
$target .= $bases[ int rand @bases ] for 1 .. 4; |
|||
say "Target: $target"; |
|||
say 'Matches at these positions:'; |
|||
say (($string =~ s/.{1,40}\K/\n/gr) =~ s/($target)/ >$1< /gr);</syntaxhighlight> |
|||
{{out}} |
|||
<pre>Target: CCTG |
|||
Matches at these positions: |
|||
9 |
|||
90 |
|||
157 |
|||
TTGCC >CCTG< CAAAGTTAATAAGTAAACAATTAAGTGAGTG |
|||
CTCTAGGGTAAGGTGAGGGCGGGAAGGGGAAAAATACCGA |
|||
TGCGAG >CCTG< TAGAGCCGGGCCTCAAATTAAACGAAAAAT |
|||
ATAAGTTTGCTTGGCACGCTGTACTACTTATCC >CCTG< ACT</pre> |
|||
=={{header|Phix}}== |
|||
Currently only searches for non-overlapped sequences, but it should be pretty obvious how to change that, in which case the next underline will simply partially overwrite the previous, so you'll get eg "<=<==>". |
|||
<!--<syntaxhighlight lang="phix">(phixonline)--> |
|||
<span style="color: #008080;">with</span> <span style="color: #008080;">javascript_semantics</span> |
|||
<span style="color: #008080;">constant</span> <span style="color: #000000;">cheat</span> <span style="color: #0000FF;">=</span> <span style="color: #004600;">false</span> |
|||
<span style="color: #008080;">function</span> <span style="color: #000000;">grandna</span><span style="color: #0000FF;">(</span><span style="color: #004080;">integer</span> <span style="color: #000000;">len</span><span style="color: #0000FF;">)</span> |
|||
<span style="color: #004080;">string</span> <span style="color: #000000;">dna</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">repeat</span><span style="color: #0000FF;">(</span><span style="color: #008000;">' '</span><span style="color: #0000FF;">,</span><span style="color: #000000;">len</span><span style="color: #0000FF;">)</span> |
|||
<span style="color: #008080;">for</span> <span style="color: #000000;">i</span><span style="color: #0000FF;">=</span><span style="color: #000000;">1</span> <span style="color: #008080;">to</span> <span style="color: #000000;">len</span> <span style="color: #008080;">do</span> <span style="color: #000000;">dna</span><span style="color: #0000FF;">[</span><span style="color: #000000;">i</span><span style="color: #0000FF;">]</span> <span style="color: #0000FF;">=</span> <span style="color: #008000;">"ACGT"</span><span style="color: #0000FF;">[</span><span style="color: #7060A8;">rand</span><span style="color: #0000FF;">(</span><span style="color: #000000;">4</span><span style="color: #0000FF;">)]</span> <span style="color: #008080;">end</span> <span style="color: #008080;">for</span> |
|||
<span style="color: #008080;">return</span> <span style="color: #000000;">dna</span> |
|||
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span> |
|||
<span style="color: #008080;">procedure</span> <span style="color: #000000;">show</span><span style="color: #0000FF;">(</span><span style="color: #004080;">string</span> <span style="color: #000000;">dna</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">test</span><span style="color: #0000FF;">,</span> <span style="color: #004080;">sequence</span> <span style="color: #000000;">idx</span><span style="color: #0000FF;">)</span> |
|||
<span style="color: #000000;">idx</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">deep_copy</span><span style="color: #0000FF;">(</span><span style="color: #000000;">idx</span><span style="color: #0000FF;">)</span> <span style="color: #0000FF;">&</span> <span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">dna</span><span style="color: #0000FF;">)+</span><span style="color: #000000;">100</span> <span style="color: #000080;font-style:italic;">-- (add an otherwise unused sentinel)</span> |
|||
<span style="color: #004080;">sequence</span> <span style="color: #000000;">s</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">split</span><span style="color: #0000FF;">(</span><span style="color: #7060A8;">trim</span><span style="color: #0000FF;">(</span><span style="color: #7060A8;">join_by</span><span style="color: #0000FF;">(</span><span style="color: #7060A8;">split</span><span style="color: #0000FF;">(</span><span style="color: #7060A8;">join_by</span><span style="color: #0000FF;">(</span><span style="color: #000000;">dna</span><span style="color: #0000FF;">,</span><span style="color: #000000;">1</span><span style="color: #0000FF;">,</span><span style="color: #000000;">10</span><span style="color: #0000FF;">,</span><span style="color: #008000;">""</span><span style="color: #0000FF;">),</span><span style="color: #008000;">"\n"</span><span style="color: #0000FF;">),</span><span style="color: #000000;">1</span><span style="color: #0000FF;">,</span><span style="color: #000000;">5</span><span style="color: #0000FF;">,</span><span style="color: #008000;">" "</span><span style="color: #0000FF;">)),</span><span style="color: #008000;">"\n"</span><span style="color: #0000FF;">)</span> |
|||
<span style="color: #004080;">integer</span> <span style="color: #000000;">ii</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">1</span><span style="color: #0000FF;">,</span> <span style="color: #000080;font-style:italic;">-- idx index</span> |
|||
<span style="color: #000000;">i</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">idx</span><span style="color: #0000FF;">[</span><span style="color: #000000;">ii</span><span style="color: #0000FF;">],</span> <span style="color: #000080;font-style:italic;">-- current target</span> |
|||
<span style="color: #000000;">ux</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">1</span><span style="color: #0000FF;">,</span> <span style="color: #000080;font-style:italic;">-- underline index (1..4)</span> |
|||
<span style="color: #000000;">ldx</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">1</span> <span style="color: #000080;font-style:italic;">-- line index (1, 51, 101, etc)</span> |
|||
<span style="color: #008080;">for</span> <span style="color: #000000;">si</span><span style="color: #0000FF;">=</span><span style="color: #000000;">1</span> <span style="color: #008080;">to</span> <span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">s</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">do</span> |
|||
<span style="color: #7060A8;">printf</span><span style="color: #0000FF;">(</span><span style="color: #000000;">1</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"%3d: %s\n"</span><span style="color: #0000FF;">,{</span><span style="color: #000000;">ldx</span><span style="color: #0000FF;">,</span><span style="color: #000000;">s</span><span style="color: #0000FF;">[</span><span style="color: #000000;">si</span><span style="color: #0000FF;">]})</span> |
|||
<span style="color: #000000;">ldx</span> <span style="color: #0000FF;">+=</span> <span style="color: #000000;">50</span> |
|||
<span style="color: #008080;">if</span> <span style="color: #000000;">i</span> <span style="color: #008080;">and</span> <span style="color: #000000;">i</span><span style="color: #0000FF;"><</span><span style="color: #000000;">ldx</span> <span style="color: #008080;">then</span> |
|||
<span style="color: #004080;">string</span> <span style="color: #000000;">ul</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">repeat</span><span style="color: #0000FF;">(</span><span style="color: #008000;">' '</span><span style="color: #0000FF;">,</span><span style="color: #000000;">59</span><span style="color: #0000FF;">)</span> |
|||
<span style="color: #008080;">while</span> <span style="color: #000000;">i</span> <span style="color: #008080;">and</span> <span style="color: #000000;">i</span><span style="color: #0000FF;"><</span><span style="color: #000000;">ldx</span> <span style="color: #008080;">do</span> |
|||
<span style="color: #004080;">integer</span> <span style="color: #000000;">up</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">i</span><span style="color: #0000FF;">-</span><span style="color: #000000;">ldx</span><span style="color: #0000FF;">+</span><span style="color: #000000;">51</span> <span style="color: #000080;font-style:italic;">-- underline pos (relative to ldx)</span> |
|||
<span style="color: #000000;">up</span> <span style="color: #0000FF;">+=</span> <span style="color: #7060A8;">floor</span><span style="color: #0000FF;">((</span><span style="color: #000000;">up</span><span style="color: #0000FF;">-</span><span style="color: #000000;">1</span><span style="color: #0000FF;">)/</span><span style="color: #000000;">10</span><span style="color: #0000FF;">)+</span><span style="color: #000000;">5</span> <span style="color: #000080;font-style:italic;">-- (plus any needed spacing)</span> |
|||
<span style="color: #000000;">ul</span><span style="color: #0000FF;">[</span><span style="color: #000000;">up</span><span style="color: #0000FF;">]</span> <span style="color: #0000FF;">=</span> <span style="color: #008000;">"<==>"</span><span style="color: #0000FF;">[</span><span style="color: #000000;">ux</span><span style="color: #0000FF;">]</span> |
|||
<span style="color: #000000;">ux</span> <span style="color: #0000FF;">+=</span> <span style="color: #000000;">1</span> |
|||
<span style="color: #000000;">i</span> <span style="color: #0000FF;">+=</span> <span style="color: #000000;">1</span> |
|||
<span style="color: #008080;">if</span> <span style="color: #000000;">ux</span><span style="color: #0000FF;">></span><span style="color: #000000;">4</span> <span style="color: #008080;">then</span> |
|||
<span style="color: #000000;">ux</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">1</span> |
|||
<span style="color: #000000;">ii</span> <span style="color: #0000FF;">+=</span> <span style="color: #000000;">1</span> |
|||
<span style="color: #000000;">i</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">idx</span><span style="color: #0000FF;">[</span><span style="color: #000000;">ii</span><span style="color: #0000FF;">]</span> |
|||
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span> |
|||
<span style="color: #008080;">end</span> <span style="color: #008080;">while</span> |
|||
<span style="color: #7060A8;">printf</span><span style="color: #0000FF;">(</span><span style="color: #000000;">1</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"%s\n"</span><span style="color: #0000FF;">,</span><span style="color: #000000;">ul</span><span style="color: #0000FF;">)</span> |
|||
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span> |
|||
<span style="color: #008080;">end</span> <span style="color: #008080;">for</span> |
|||
<span style="color: #008080;">if</span> <span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">idx</span><span style="color: #0000FF;">)></span><span style="color: #000000;">1</span> <span style="color: #008080;">then</span> |
|||
<span style="color: #004080;">string</span> <span style="color: #000000;">p</span> <span style="color: #0000FF;">=</span> <span style="color: #008080;">iff</span><span style="color: #0000FF;">(</span><span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">idx</span><span style="color: #0000FF;">)></span><span style="color: #000000;">1</span><span style="color: #0000FF;">?</span><span style="color: #008000;">"s"</span><span style="color: #0000FF;">:</span><span style="color: #008000;">""</span><span style="color: #0000FF;">),</span> |
|||
<span style="color: #000000;">t</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">join</span><span style="color: #0000FF;">(</span><span style="color: #7060A8;">apply</span><span style="color: #0000FF;">(</span><span style="color: #000000;">idx</span><span style="color: #0000FF;">[</span><span style="color: #000000;">1</span><span style="color: #0000FF;">..$-</span><span style="color: #000000;">1</span><span style="color: #0000FF;">],</span><span style="color: #7060A8;">sprint</span><span style="color: #0000FF;">),</span><span style="color: #008000;">", "</span><span style="color: #0000FF;">)</span> |
|||
<span style="color: #7060A8;">printf</span><span style="color: #0000FF;">(</span><span style="color: #000000;">1</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"%s occurs at location%s: %s\n"</span><span style="color: #0000FF;">,{</span><span style="color: #000000;">test</span><span style="color: #0000FF;">,</span><span style="color: #000000;">p</span><span style="color: #0000FF;">,</span><span style="color: #000000;">t</span><span style="color: #0000FF;">})</span> |
|||
<span style="color: #008080;">else</span> |
|||
<span style="color: #7060A8;">printf</span><span style="color: #0000FF;">(</span><span style="color: #000000;">1</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"%s does not occur\n"</span><span style="color: #0000FF;">,{</span><span style="color: #000000;">test</span><span style="color: #0000FF;">})</span> |
|||
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span> |
|||
<span style="color: #008080;">end</span> <span style="color: #008080;">procedure</span> |
|||
<span style="color: #004080;">string</span> <span style="color: #000000;">dna</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">grandna</span><span style="color: #0000FF;">(</span><span style="color: #000000;">200</span><span style="color: #0000FF;">),</span> |
|||
<span style="color: #000000;">test</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">grandna</span><span style="color: #0000FF;">(</span><span style="color: #000000;">4</span><span style="color: #0000FF;">)</span> |
|||
<span style="color: #008080;">constant</span> <span style="color: #000000;">cheats</span> <span style="color: #0000FF;">=</span> <span style="color: #008080;">iff</span><span style="color: #0000FF;">(</span><span style="color: #000000;">cheat</span><span style="color: #0000FF;">?{</span><span style="color: #000000;">9</span><span style="color: #0000FF;">,</span><span style="color: #000000;">13</span><span style="color: #0000FF;">,</span><span style="color: #000000;">49</span><span style="color: #0000FF;">,</span><span style="color: #000000;">60</span><span style="color: #0000FF;">,</span><span style="color: #000000;">64</span><span style="color: #0000FF;">,</span><span style="color: #000000;">68</span><span style="color: #0000FF;">}:{})</span> |
|||
<span style="color: #008080;">for</span> <span style="color: #000000;">i</span><span style="color: #0000FF;">=</span><span style="color: #000000;">1</span> <span style="color: #008080;">to</span> <span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">cheats</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">do</span> |
|||
<span style="color: #000000;">dna</span><span style="color: #0000FF;">[</span><span style="color: #000000;">cheats</span><span style="color: #0000FF;">[</span><span style="color: #000000;">i</span><span style="color: #0000FF;">]..</span><span style="color: #000000;">cheats</span><span style="color: #0000FF;">[</span><span style="color: #000000;">i</span><span style="color: #0000FF;">]+</span><span style="color: #000000;">3</span><span style="color: #0000FF;">]</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">test</span> |
|||
<span style="color: #008080;">end</span> <span style="color: #008080;">for</span> |
|||
<span style="color: #004080;">sequence</span> <span style="color: #000000;">idx</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">match_all</span><span style="color: #0000FF;">(</span><span style="color: #000000;">test</span><span style="color: #0000FF;">,</span><span style="color: #000000;">dna</span><span style="color: #0000FF;">)</span> |
|||
<span style="color: #000000;">show</span><span style="color: #0000FF;">(</span><span style="color: #000000;">dna</span><span style="color: #0000FF;">,</span><span style="color: #000000;">test</span><span style="color: #0000FF;">,</span><span style="color: #000000;">idx</span><span style="color: #0000FF;">)</span> |
|||
<!--</syntaxhighlight>--> |
|||
{{out}} |
|||
with cheat enabled |
|||
<pre> |
|||
1: GGAGATATCG ACCGACCGAA GTAAAGTCAA AGTCGTCCAA TCCACGGACG |
|||
<= =><==> <= |
|||
51: ACTTCAGCAC GACCGACCGA CCTATTTAAG AGACCACACT TAAGGAATCC |
|||
=> < ==><==><== > |
|||
101: ATGCGAAATA AAAATGGGCG AGTAGCCGTG GGGCGCTAAA GCACCCACCT |
|||
151: AGTTTTCGCC GAAGTACTAG ACCACCTTCG GATCGACAAA GCTTTCACCA |
|||
<==> |
|||
CGAC occurs at locations: 9, 13, 49, 60, 64, 68, 184 |
|||
</pre> |
|||
with cheat disabled |
|||
<pre> |
|||
1: TGATTTAAAC CGTGGTGCAA TTTATAAACA CTGCGATATG CCTCCTGATG |
|||
51: GCATGGTATT CGACACCAAG ACGCTGGTGG GCACACTGGC TTTCAGAATA |
|||
101: GGAGTCACAA TCCCTCTATG ATGTCCTCTA GCGGGTGTGT GTTCAGTGCC |
|||
151: AGCGCTTACT TCCGGCGTGG CCGACTCTTT TTAAAGCGTA TAGCTGGGGT |
|||
GCTA does not occur |
|||
</pre> |
|||
=={{header|Python}}== |
|||
{{works with|Python|3.8}} |
|||
{{libheader|regex}} |
|||
<syntaxhighlight lang="python"> |
|||
from random import choice |
|||
import regex as re |
|||
import time |
|||
def generate_sequence(n: int ) -> str: |
|||
return "".join([ choice(['A','C','G','T']) for _ in range(n) ]) |
|||
def dna_findall(needle: str, haystack: str) -> None: |
|||
if sum(1 for _ in re.finditer(needle, haystack, overlapped=True)) == 0: |
|||
print("No matches found") |
|||
else: |
|||
print(f"Found {needle} at the following indices: ") |
|||
for match in re.finditer(needle, haystack, overlapped=True): |
|||
print(f"{match.start()}:{match.end()} ") |
|||
dna_seq = generate_sequence(200) |
|||
sample_seq = generate_sequence(4) |
|||
c = 1 |
|||
for i in dna_seq: |
|||
print(i, end="") if c % 20 != 0 else print(f"{i}") |
|||
c += 1 |
|||
print(f"\nSearch Sample: {sample_seq}") |
|||
dna_findall(sample_seq, dna_seq) |
|||
</syntaxhighlight> |
|||
{{out}} |
|||
<pre> |
|||
TTGCCCCTGTACTGAGCCCA |
|||
TAAGCTTGCACTCAAGGTTT |
|||
TGCCCCCTCATATTATAACG |
|||
CATCCATTATACAAAACCGA |
|||
TACCCTTCCGCATATTATGA |
|||
AAAGTGGCGAAGTGCCTTGA |
|||
TTTGCATTCATAGTACAACG |
|||
GTGCAAAAGCATTGTATGTC |
|||
TCACATTTACATGGGAAATG |
|||
CCTAGTAGGTGCAAGACCTG |
|||
Search Sample: TACA |
|||
Found TACA at the following indices: |
|||
69:73 |
|||
133:137 |
|||
167:171 |
|||
</pre> |
|||
=={{header|Racket}}== |
|||
<syntaxhighlight lang="racket">#lang racket |
|||
(define (rand-seq n) |
|||
(build-string n (lambda _ (string-ref "TGAC" (random 4))))) |
|||
(define (subsequence-indices full part) |
|||
(let ((part-length (string-length part)) (full-length (string-length full))) |
|||
(for/list ((i (- full-length part-length)) |
|||
#:when (for/and ((p part) (f (in-string full i))) (eq? p f))) |
|||
(cons i (+ i part-length -1))))) |
|||
(define (report-sequence s (l 50)) |
|||
(string-join (for/list ((i (in-range 0 (string-length s) l))) |
|||
(format "~a: ~a" (~a #:width 4 i) |
|||
(substring s i (min (string-length s) (+ i l))))) |
|||
"\n")) |
|||
(define (Bioinformatics/Subsequence (full (rand-seq 400)) (sub (rand-seq 4))) |
|||
(printf "Indices of ~a in~%~a~%~a~%" |
|||
sub (report-sequence full) (subsequence-indices full sub))) |
|||
(module+ main (for ((i 4)) (Bioinformatics/Subsequence)))</syntaxhighlight> |
|||
{{out}} |
|||
<pre>Indices of TTAC in |
|||
0 : TTATCCTACCGCGTAAGTTCAATGCTCACCGCAGTTTGCTAACCGTTCCT |
|||
50 : AAATTCACTTCCTAAGGTATCTTTCGCTTAATTGATGCCGATTGAATTCC |
|||
100 : ACGGAGGGCGTAATTGTTTCGGACTTTAGACCTGACATAAGGGCACACTA |
|||
150 : GTCCTATTGAATTTGGTGCTATTCGGCGACCTACTAACCTTAGTCAGTGA |
|||
200 : AGAGCCATCTCAAAAGTACAGTCATCCTCAAGTGTTACATACGGCACCAT |
|||
250 : GACAGTGTATAAGCATGGAGGTTGGCCTATCGTCATATCGAGGCGGCGCC |
|||
300 : ATAGACCGGCCAGGTGATGAGATCGACTTTAATGTTGTTGCTTAGCTTGA |
|||
350 : CCTCTAGTTTGGATTAAGACGGTCATAGATAGATAGACCGTAAAGTATTC |
|||
((234 . 237)) |
|||
Indices of GTAA in |
|||
0 : GTCAGTCCACGCAAGAATAGCAGTTGAGTGGACAATTTATGAGACGGAGA |
|||
50 : TAAGTAACCCGCTCCGAGATAAACGTCAGCCGGATTCCGCTGAGTCGGTC |
|||
100 : GCCTTCCAAGTGGCAGCTTGTTTGCATTGCTTACAGTGACTTGAACGATC |
|||
150 : ACCTACTCGAGGACTCTGCGGGTATTCCAGTTGCCTTGCACTCAGCGATG |
|||
200 : CACAAACTTTAAATTATCACAGAAAGAATGTGATTCGGGTGGTCACCCTT |
|||
250 : ATCGGTGAAACCAGTCCTTCCATGGGCATATTCTGCGTCGAAATGAGCCC |
|||
300 : GCTGTTTACGTTGTACGAACTGGGGACCTAAGGAAACGGGCCGTTCTTAG |
|||
350 : GTGATGTCAGCTGCAACGAACTACTGTTAACCTTCTCGATCTGTTGAAAA |
|||
((53 . 56)) |
|||
Indices of AACG in |
|||
0 : TTTACAGTACGATTCCGAAGACACAAGAATGCGCCGGCTGTGGGTAGGGG |
|||
50 : CGACCCTGCGCGACCTATAAAAGGGGCGACTCAATTTTAGGCCCACCACG |
|||
100 : GACCCAGCCCTGTGCACAGAGCGGGGCATTTTTACCTCGCGTGCGCACCA |
|||
150 : ACTGCGATCTGCCTTGTCACATAATCCCACATACGAGTTGTATCTCTAAG |
|||
200 : AAGGGATGAGGCCAATTTAAATCCGGGTGCATTTCTCGGGGGGAGACACC |
|||
250 : AATGAGAGTGGGGCAAGGTGGCGTAGAGAGCTAATCGGGTTTTATGACCG |
|||
300 : CGGAAGACCTGGGATACGTCTGGGTGATAACTGAGGGCAGGTCAACGAAC |
|||
350 : CCTGATGCGTAGCCACGTCTCAGCTATCGGGCCTGTTTTCATAGTCCATG |
|||
((343 . 346)) |
|||
Indices of CAGC in |
|||
0 : TGTGAACCACTATGACACGCTACACGCCTCAAGTTGGCCCCCATATAAGA |
|||
50 : ATATCCATCGGTTAATGTGTCTCGCGGCCGTTAGAACAAGCACTAAAGTT |
|||
100 : AGAGAAACCAACCATTGGACTAGATCAACATCAACGTCGCTGATAATAAA |
|||
150 : TGTATATCTGATGTGGCCGTTCATAAAATCGTTAACTACAGGTATCAACA |
|||
200 : TAGTCTCCCAACTTATATAATTGGTTAACTTAGGAGGAGCTTGCACAGCT |
|||
250 : CAGCTATATGCTATCTGGCCCTGGGCTTGGTAGGCATCACGTCGTTATGC |
|||
300 : TGCGAACATCTCAAAGACAAACGTTGATCCAGCCCCTAGAGAGGTCATTA |
|||
350 : GGCCTCGACCCAATTTAACCTCCCACTCCGTGGGTACAGCTTGAACCCCC |
|||
((245 . 248) (250 . 253) (329 . 332) (386 . 389))</pre> |
|||
=={{header|Raku}}== |
|||
Chances are actually pretty small that a random 4 codon string will show up at all in a random 200 codon sequence. Bump up the sequence size to get a reasonable chance of multiple matches. |
|||
<syntaxhighlight lang="raku" line>use String::Splice:ver<0.0.3+>; |
|||
my $line = 80; |
|||
my $haystack = [~] <A C G T>.roll($line * 8); |
|||
say 'Needle: ' ~ my $needle = [~] <A C G T>.roll(4); |
|||
my $these = $haystack ~~ m:g/<$needle>/; |
|||
my @match = $these.map: { .from, .pos } |
|||
printf "From: %3s to %3s\n", |$_ for @match; |
|||
my $disp = $haystack.comb.batch($line)».join.join("\n"); |
|||
for @match.reverse { |
|||
$disp.=&splice(.[1] + .[1] div $line, "\e[0m" ); |
|||
$disp.=&splice(.[0] + .[0] div $line, "\e[31m"); |
|||
} |
|||
say $disp;</syntaxhighlight> |
|||
{{out}} |
|||
Show in custom div to better display highlighting. |
|||
<div style="font-family: monospace,Courier; line-height: 1.2em; background-color: #f9f9f9; border: 1px solid #ddd; padding: 1em;"> |
|||
Needle: TAGC<br> |
|||
From: 159 to 163<br> |
|||
From: 262 to 266<br> |
|||
From: 315 to 319<br> |
|||
From: 505 to 509<br> |
|||
From: 632 to 636<br> |
|||
CATATGTGACACTGACAGCTCGCGCGAAAATCCGTGTGACGGTCTGAACACTATACTATAGGCCCGGTCGGCATTTGTGG<br> |
|||
CTCCCCAGTGGAGAGACCACTCGTCAATTGCTGACGACTTAACACAAATCGAGTCGCCCTTAGTGCCAGACGGGACTCC<span style="color: #CC0000;">T</span><br> |
|||
<span style="color: #CC0000;">AGC</span>AAAGGGCGGCACGTGGTGACTCCCAATATGTGAGCATGCCATCTAATTGATCTGGGGGGTTTCGCGGGAATACCTAG<br> |
|||
GGGCGTTCTGTCCATGGATCTC<span style="color: #CC0000;">TAGC</span>CCTGCGAAGAGATACCCGCAGTGAGTTGCACGTGCAAAGAACTTGTAAC<span style="color: #CC0000;">TAGC</span>G<br> |
|||
TATTCTGTATCCGCCGCGCGATATGCTTCTGCGGGATGTACTTCTTGTGACTAAGACTTTGTTATCCAAATTGACCAATA<br> |
|||
TTCAACGGTCGACTCTCCGAGGCAGTATCGGTACGCCGAAAAATGGTTACTTCGGCCATACGTAACCTCTCAAGTCACGA<br> |
|||
TTACAGCCCACGGGGGCTTACAGCA<span style="color: #CC0000;">TAGC</span>TCCAAAGACATTCCAATTGAGCTACAACGTGTTCAGTGCGGAGCAGTATCC<br> |
|||
AGTACTCGACTGTTATGGTAAAAGGGCATCGTGATCGTTTATATTAATCATTGGGACAGGTGGTTAATGTCA<span style="color: #CC0000;">TAGC</span>TTAG<br> |
|||
</div> |
|||
=={{header|REXX}}== |
|||
This REXX version allows the user to specify: |
|||
:* length of the (random) DNA data sequence (default is 200). |
|||
:* length of the (random) DNA sequence (default is four). |
|||
:* DNA proteins to be used in the sequence (default is '''ACGT'''). |
|||
:* width of the output lines of (random DNA) (default is 100). |
|||
:* often (if ever) to add a blank to the output (default is every 10 proteins). |
|||
:* DNA proteins to be searched in the data (the default is four unique random proteins). |
|||
:* the seed for the RANDOM function so runs can be repeated with the same data (no default). |
|||
<syntaxhighlight lang="rexx">/*REXX pgm gens random DNA (ACGT) sequence & finds positions of a random 4─protein seq. */ |
|||
parse arg totLen rndLen basePr oWidth Bevery rndDNA seed . |
|||
if totLen=='' | totLen=="," then totLen= 200 /*Not specified? Then use the default.*/ |
|||
if rndLen=='' | rndLen=="," then rndLen= 4 /* " " " " " " */ |
|||
if basePr=='' | basePr=="," then basePr= 'acgt' /* " " " " " " */ |
|||
if oWidth=='' | oWidth=="," then oWidth= 100 /* " " " " " " */ |
|||
if Bevery=='' | Bevery=="," then Bevery= 10 /* " " " " " " */ |
|||
if rndDNA=='' | rndDNA=="," then rndDNA= copies(., rndLen) /*what we're looking for*/ |
|||
if datatype(seed, 'W') then call random ,,seed /*used to generate repeatable random #s*/ |
|||
call genRnd /*gen data field of random proteins. */ |
|||
call show /*show " " " " " */ |
|||
say ' base DNA proteins used: ' basePr |
|||
say 'random DNA proteins used: ' dna? |
|||
call findRnd |
|||
if @=='' then do; say "the random DNA proteins weren't found."; exit 4; end |
|||
say 'the random DNA proteins were found in positions:' strip(@) |
|||
exit 0 /*stick a fork in it, we're all done. */ |
|||
/*──────────────────────────────────────────────────────────────────────────────────────*/ |
|||
commas: parse arg ?; do jc=length(?)-3 to 1 by -3; ?=insert(',', ?, jc); end; return ? |
|||
/*──────────────────────────────────────────────────────────────────────────────────────*/ |
|||
findRnd: @=; p=0 /*@: list of the found target proteins*/ |
|||
do until p==0; p= pos(dna?, $$, p+1); if p>0 then @= @ commas(p) |
|||
/*Found one? Append it to the "Found"s*/ |
|||
end /*p*/; return |
|||
/*──────────────────────────────────────────────────────────────────────────────────────*/ |
|||
genRnd: dna?=; use= basePr; upper use basePr rndDNA; lenB= length(basePr) |
|||
do k=1 for rndLEN; x= substr(rndDNA, k, 1) |
|||
if x==. then do; ?= random(1, length(use) ); x= substr(use, ?, 1) |
|||
use= delstr(use, ?, 1) /*elide so no protein repeats*/ |
|||
end |
|||
dna?= dna? || x /*build a random protein seq.*/ |
|||
end /*k*/ |
|||
return |
|||
/*──────────────────────────────────────────────────────────────────────────────────────*/ |
|||
show: say " index │"center('DNA sequence of ' commas(totLen) " proteins", oWidth+10) |
|||
say "───────┼"center('' , oWidth+10, '─') |
|||
$=; $$=; idx= 1 /*gen data field of random proteins.*/ |
|||
do j=1 for totLen; c= substr( basePr, random(1, lenB), 1) |
|||
$$= $$ || c /*append a random protein. */ |
|||
if Bevery\==0 then if j//Bevery==0 then $= $' ' /*possibly add a blank. */ |
|||
if length( space($ || c, 0) )<oWidth then do; $= $ || c; iterate; end |
|||
say strip( right(idx, 7)'│' $, 'T'); $= /*display line ──► terminal.*/ |
|||
idx= idx + oWidth /*bump the index number. */ |
|||
end /*j*/ |
|||
if $\=='' then say right(idx, 7)"│" strip($, 'T') /*show residual protein data*/ |
|||
say "───────┴"center('' , oWidth+10, '─') |
|||
say; return</syntaxhighlight> |
|||
{{out|output|text= when using the default inputs:}} |
|||
<pre> |
|||
index │ DNA sequence of 200 proteins |
|||
───────┼────────────────────────────────────────────────────────────────────────────────────────────────────────────── |
|||
1│ TTTTTAGCG CGTTTTGTAG CGCTCTAAAA ACCGTAGCTA TATTTCTCGA AGTTTCACCC AGCTCTTTTG CCCCAGGGTT GCGCTAAGCC CAGCTTCGAG |
|||
101│ GGGGCACAG GTAAAATACT ACCGTCCGTG GAGGGGGATG AATTGACCCG ACATTTTTTG AAGCATAACT CGTGACTCAA TATTGCATGA TTACACCAGC |
|||
───────┴────────────────────────────────────────────────────────────────────────────────────────────────────────────── |
|||
base DNA proteins used: ACGT |
|||
random DNA proteins used: GCAT |
|||
the random DNA proteins were found in positions: 162 184 |
|||
</pre> |
|||
{{out|output|text= when using the inputs of: <tt> 1000 , , , , tttt </tt>}} |
|||
<pre> |
|||
index │ DNA sequence of 1,000 proteins |
|||
───────┼────────────────────────────────────────────────────────────────────────────────────────────────────────────── |
|||
1│ GTGATTTTT AGCGCGTTTT GTAGCGCTCT AAAAACCGTA GCTATATTTC TCGAAGTTTC ACCCAGCTCT TTTGCCCCAG GGTTGCGCTA AGCCCAGCTT |
|||
101│ GAGGGGGGC ACAGGTAAAA TACTACCGTC CGTGGAGGGG GATGAATTGA CCCGACATTT TTTGAAGCAT AACTCGTGAC TCAATATTGC ATGATTACAC |
|||
201│ AGCTAGGTT AGTGTAAAAA CCCCCCTATC TTCCTGATCA ATGGCGAGTA AAACATGCAA CCAATTTGTG AGCGAGTACT GGAAATTATT GTTTACGGGA |
|||
301│ AGGCACATG CTACGCGCAA CAGATATCTT AGACTGACCC TTTTAGAGTC ATAAGCCCCT GTCGCCTACA TGCTACTAAT ACTCCAACTA GCGGCGCACC |
|||
401│ TCAACCGGA TCATGGCGCC AGGGAAAATG TGGCGTAGCG ACGTGCTCAT CGCTCGCCGG GGAGAGCCTT TCAGAATCTC GAATAAAACC TGGTAATGAC |
|||
501│ TCATCAATC GTAATGGTCG TCTGGGGCAA GAAGCCGATA TTATAGACTC AGGTCAGACG TGTGCACAAC GGCAGAATTT ATAGTAATTC GCGTGAACTA |
|||
601│ GTTTCGGGA TAGGCCTACG ACCAATCATA GGACATTCGA TGCACGGTGT AGAAACAGTT CTCTGATGTT ACTCGGGATA ACACTCGCAA TCCCCTAGGA |
|||
701│ ACCGTGAGC GTCGCTAGTA TCTGAGATAG TCGCGACTGC CCAGCGGTCT TTAAGTTCGC ACACTACGGG ACTCCTAGTT CGCCCATTCA TGGCTATTTT |
|||
801│ CCTATCAGT CCAATCCCAC GGGGAGGGCA CTCGCGCAAT TCATTCAAAG AGGGCCATTT GCCGATATAA GGTCCATCAT CGGGAGGAAT ATGACTCCTG |
|||
901│ TTAGTATTA GAGCAGCCTC GCTGCGTACT ACTGTCAGTG GCCCGTCAGG GAAGGCAAAA CGTTTTTCCT CTAGGAATCC GTCAATTGGA CTTCTAGACT |
|||
───────┴────────────────────────────────────────────────────────────────────────────────────────────────────────────── |
|||
base DNA proteins used: ACGT |
|||
random DNA proteins used: TTTT |
|||
the random DNA proteins were found in positions: 5 6 16 69 157 158 159 340 796 797 962 963 |
|||
</pre> |
|||
=={{header|Ring}}== |
=={{header|Ring}}== |
||
< |
<syntaxhighlight lang="ring"> |
||
/*----------------------------------- |
|||
row = 0 |
|||
# Project : DNA subsequences |
|||
dnaList = [] |
|||
# Date : 2021/03/23 |
|||
# Author : Gal Zsolt (~ CalmoSoft ~) |
|||
# Email : <calmosoft@gmail.com> |
|||
-----------------------------------*/ |
|||
//----------------------------------------- |
|||
load "stdlibcore.ring" |
|||
load "guilib.ring" |
|||
start = 0 |
|||
base = ["A","C","G","T"] |
base = ["A","C","G","T"] |
||
dnaList = [] |
|||
dnaSeq = [] |
|||
see "DNA sequence:" + nl |
|||
ColLine = list(21) |
|||
see " " + long + ": " |
|||
C_Spacing = 2 |
|||
for nr = 1 to 200 |
|||
row = row + 1 |
|||
rnd = random(3)+1 |
|||
baseStr = base[rnd] |
|||
see baseStr # + " " |
|||
if (row%20) = 0 and long < 200 |
|||
long = long + 20 |
|||
see nl |
|||
if long < 100 |
|||
see " " + long + ": " |
|||
else |
|||
see "" + long + ": " |
|||
ok |
|||
ok |
|||
add(dnaList,baseStr) |
|||
next |
|||
C_ButtonDnaStyle = ' background-color: Red; border-radius: 8px;' |
|||
strBase = "" |
|||
C_ButtonStyle = '"background-color:white"; border-radius: 8px;' |
|||
for n = 1 to 4 |
|||
Button = newlist(10,20) |
|||
rnd = random(3)+1 |
|||
LayoutButtonRow = list(10) |
|||
strBase = strBase + base[rnd] |
|||
next |
|||
//----------------------------------------- |
|||
see "subsequence to search: " + strBase + nl |
|||
app = new qApp |
|||
seqok = 0 |
|||
{ |
|||
win = new qWidget() { |
|||
setWindowTitle('DNA subsequences') |
|||
setWinIcon(self,AppFile("white.jpg")) |
|||
setStyleSheet('background-color:White') |
|||
setgeometry(560,180,300,300) |
|||
//reSize(400,400) |
|||
winheight = 10 |
|||
fontSize = 8 # + (winheight / 100) |
|||
LayoutButtonMain = new QVBoxLayout() |
|||
for n = 1 to 196 |
|||
LayoutButtonMain.setSpacing(C_Spacing) |
|||
flag = 1 |
|||
LayoutButtonMain.setContentsmargins(0,0,0,0) |
|||
for m = 0 to 3 |
|||
if dnaList[n+m] != strBase[m+1] |
|||
LabelInd = new qLabel(win) { settext(" DNA subsequences start positions:") |
|||
flag = 0 |
|||
setAlignment(Qt_AlignHCenter | Qt_AlignVCenter) |
|||
exit |
|||
setStyleSheet("background-color:yellow") } |
|||
ok |
|||
next |
|||
ButtonInd = new QPushButton(win) { setStyleSheet("background-color:yellow") } |
|||
if flag = 1 |
|||
seqok = 1 |
|||
LabelFind = new qLabel(win) { settext(" DNA subsequence to find:") |
|||
see "start position of sequence = " + n + nl |
|||
setStyleSheet("background-color:yellow") } |
|||
ok |
|||
next |
|||
ButtonFind = new QPushButton(win) |
|||
DnaSearch = new QPushButton(win) { setclickevent("pstart()") |
|||
setStyleSheet("background-color:yellow") |
|||
settext("Find") |
|||
} |
|||
for Col = 1 to 21 |
|||
ColLine[Col] = new qLabel(win) { |
|||
setmaximumheight(20) |
|||
setAlignment(Qt_AlignHCenter | Qt_AlignVCenter) |
|||
setStyleSheet("background-color:darkgray") |
|||
setText(string(Col-1)) |
|||
} |
|||
next |
|||
LayoutInd = new QHBoxLayout() { setSpacing(C_Spacing) setContentsMargins(0,0,0,0) } |
|||
LayoutInd.AddWidget(LabelInd) |
|||
LayoutInd.AddWidget(ButtonInd) |
|||
LayoutButtonMain.AddLayout(LayoutInd) |
|||
LayoutTitleRow = new QHBoxLayout() { setSpacing(C_Spacing) setContentsMargins(0,0,0,0) } |
|||
for Col = 1 to 21 |
|||
LayoutTitleRow.AddWidget(ColLine[Col]) |
|||
next |
|||
LayoutButtonMain.AddLayout(LayoutTitleRow) |
|||
RowLine = list(10) |
|||
for Row = 1 to 10 |
|||
Letter = "" + Row*20 |
|||
if Row*20 < 100 |
|||
Letter = " " + Row*20 |
|||
ok |
|||
RowLine[Row] = new qLabel(win) { setFont(new qFont("Verdana",fontSize,40,0)) |
|||
setAlignment(Qt_AlignHCenter | Qt_AlignVCenter) |
|||
setStyleSheet("background-color:darkgray") |
|||
setText(Letter) |
|||
} |
|||
next |
|||
for Row = 1 to 10 |
|||
LayoutButtonRow[Row] = new QHBoxLayout() |
|||
{ |
|||
setSpacing(C_Spacing) |
|||
setContentsmargins(0,0,0,0) |
|||
} |
|||
LayoutButtonRow[Row].AddWidget(RowLine[Row]) |
|||
for Col = 1 to 20 |
|||
Button[Row][Col] = new QPushButton(win) { |
|||
setmaximumwidth(20) |
|||
} |
|||
LayoutButtonRow[Row].AddWidget(Button[Row][Col]) |
|||
next |
|||
LayoutButtonMain.AddLayout(LayoutButtonRow[Row]) |
|||
next |
|||
LayoutDataRow = new QHBoxLayout() { setSpacing(C_Spacing) setContentsMargins(0,0,0,0) } |
|||
LayoutDataRow.AddWidget(LabelFind) |
|||
LayoutDataRow.AddWidget(ButtonFind) |
|||
LayoutDataRow.AddWidget(DnaSearch) |
|||
LayoutButtonMain.AddLayout(LayoutDataRow) |
|||
setLayout(LayoutButtonMain) |
|||
pStart() |
|||
show() |
|||
} |
|||
exec() |
|||
} |
|||
//----------------------------------------- |
|||
func pStart() |
|||
start = start + 1 |
|||
dnaList = [] |
|||
for row = 1 to 10 |
|||
for col = 1 to 20 |
|||
Button[row][col].settext("") |
|||
next |
|||
next |
|||
for nr = 1 to 200 |
|||
rnd = random(3)+1 |
|||
baseStr = base[rnd] |
|||
row = ceil(nr/20) |
|||
col = nr%20 |
|||
if col = 0 |
|||
col = 20 |
|||
ok |
|||
Button[row][col].settext(baseStr) |
|||
add(dnaList,baseStr) |
|||
next |
|||
startDna() |
|||
//----------------------------------------- |
|||
func startDna() |
|||
strDna = list2str(dnaList) |
|||
strDna = substr(strDna,nl,"") |
|||
while true |
|||
strBase = "" |
|||
for n = 1 to 4 |
|||
rnd = random(3)+1 |
|||
strBase = strBase + base[rnd] |
|||
next |
|||
ind = substr(strDna,strBase) |
|||
if ind > 0 |
|||
exit |
|||
ok |
|||
end |
|||
showDna(dnaList) |
|||
//----------------------------------------- |
|||
func showDna(dnaList) |
|||
if start > 1 |
|||
see nl |
|||
for n = 1 to len(dnaSeq) |
|||
for m = 0 to 3 |
|||
ind = dnaSeq[n] + m |
|||
row = ceil(ind/20) |
|||
col = ind%20 |
|||
if col = 0 |
|||
col = 20 |
|||
ok |
|||
Button[row][col].setstylesheet(C_ButtonStyle) |
|||
next |
|||
next |
|||
ok |
|||
dnaSeq = [] |
|||
strDna = list2str(dnaList) |
|||
strDna = substr(strDna,nl,"") |
|||
while true |
|||
strBase = "" |
|||
for n = 1 to 4 |
|||
rnd = random(3)+1 |
|||
strBase = strBase + base[rnd] |
|||
next |
|||
ind = substr(strDna,strBase) |
|||
if ind > 0 |
|||
exit |
|||
ok |
|||
end |
|||
ButtonFind.setStyleSheet("background-color:yellow") |
|||
ButtonFind.settext(strBase) |
|||
for n = 1 to 196 |
|||
flag = 1 |
|||
for m = 0 to 3 |
|||
if dnaList[n+m] != strBase[m+1] |
|||
flag = 0 |
|||
exit |
|||
ok |
|||
next |
|||
if flag = 1 |
|||
add(dnaSeq,n) |
|||
ok |
|||
next |
|||
temp = "" |
|||
ButtonInd.settext("") |
|||
for nr = 1 to len(dnaList) |
|||
ind = find(dnaSeq,nr) |
|||
if ind > 0 |
|||
temp = temp + string(dnaSeq[ind]) + " " |
|||
ButtonInd.settext(temp) |
|||
for n = nr to nr + 3 |
|||
row = ceil(n/20) |
|||
col = n%20 |
|||
if col = 0 |
|||
col = 20 |
|||
ok |
|||
Button[row][col].setStyleSheet(C_ButtonDnaStyle) |
|||
Button[row][col].settext(dnaList[n]) |
|||
next |
|||
ok |
|||
next |
|||
//----------------------------------------- |
|||
</syntaxhighlight> |
|||
'''Output:''' |
|||
[https://i.imgur.com/5hhbRBK.mp4 Bioinformatics/Subsequence - video] |
|||
=={{header|Wren}}== |
|||
{{libheader|Wren-pattern}} |
|||
{{libheader|Wren-str}} |
|||
{{libheader|Wren-fmt}} |
|||
<syntaxhighlight lang="wren">import "random" for Random |
|||
import "./pattern" for Pattern |
|||
import "./str" for Str |
|||
import "./fmt" for Fmt |
|||
var rand = Random.new() |
|||
var base = "ACGT" |
|||
var findDnaSubsequence = Fn.new { |dnaSize, chunkSize| |
|||
var dnaSeq = List.filled(dnaSize, null) |
|||
for (i in 0...dnaSize) dnaSeq[i] = base[rand.int(4)] |
|||
var dnaStr = dnaSeq.join() |
|||
var dnaSubseq = List.filled(4, null) |
|||
for (i in 0...4) dnaSubseq[i] = base[rand.int(4)] |
|||
var dnaSubstr = dnaSubseq.join() |
|||
System.print("DNA sequence:") |
|||
var i = chunkSize |
|||
for (chunk in Str.chunks(dnaStr, chunkSize)) { |
|||
Fmt.print("$3d..$3d: $s", i - chunkSize + 1, i, chunk) |
|||
i = i + chunkSize |
|||
} |
|||
System.print("\nSubsequence to locate: %(dnaSubstr)") |
|||
var p = Pattern.new(dnaSubstr) |
|||
var matches = p.findAll(dnaStr) |
|||
if (matches.count == 0) { |
|||
System.print("No matches found.") |
|||
} else { |
|||
System.print("Matches found at the following indices:") |
|||
for (m in matches) { |
|||
Fmt.print("$3d..$3d", m.index + 1, m.index + 4) |
|||
} |
|||
} |
|||
} |
|||
findDnaSubsequence.call(200, 20) |
|||
System.print() |
|||
findDnaSubsequence.call(600, 40)</syntaxhighlight> |
|||
if seqok = 0 |
|||
see "subsequence not found" + nl |
|||
ok |
|||
</lang> |
|||
{{out}} |
{{out}} |
||
<pre> |
<pre> |
||
DNA sequence: |
DNA sequence: |
||
1.. 20: TATGGGCGCATTATGACAAC |
|||
20: GAGTATAAAAAGCGACATAG |
|||
21.. 40: GGCTACTGAAACGAAAATTC |
|||
40: AAGCAGGGGGGGAACAGACA |
|||
41.. 60: ATGCCTTCGGAGGCTAGACC |
|||
60: ACAATTGTGAAAACTAATCA |
|||
61.. 80: ACTCATACATGATTTACAGC |
|||
80: ATACGGAAAAGGATAAACAT |
|||
81..100: TAGTCAGTTGCGTCCGCCAT |
|||
100: GAGGGACTGCGGTTGGTAGG |
|||
101..120: CCCGCATAACTATGTATTAC |
|||
120: CGATGAAACCTAAGAATGAA |
|||
121..140: GAGCATGTTCTGGCAACCTT |
|||
140: AACGAGGAAGGTGTAAAGTG |
|||
141..160: TCAGTGACAGTTCCTCAGGC |
|||
160: ATGGGGTCATGGGACAGACA |
|||
161..180: GCGTTCGCGTTGAAGGCCTC |
|||
180: TAGCTAAATGGATAAAAGCG |
|||
181..200: CCCACACCGCACCCCTGCCG |
|||
200: GGTGAAGTCGGTCGCAAACG |
|||
subsequence to search: ATGA |
|||
Subsequence to locate: AATT |
|||
start position of subsequence = 79 |
|||
Matches found at the following indices: |
|||
start position of subsequence = 103 |
|||
36.. 39 |
|||
start position of subsequence = 116 |
|||
DNA sequence: |
|||
1.. 40: GCGCTGAGCGCCCCAGTACAGCGGGTTAAACCGAGCCCGC |
|||
41.. 80: TCCGATGAACCAACTCCCATTCCTATAATGGTGCCCCGAC |
|||
81..120: ATATTGAATTCGGCGGGTCCGCTATCGGGCTGAGGATGCC |
|||
121..160: AATATCTAGGCGCTACCCTGAAGATCCTCAGTTGTGGTGT |
|||
161..200: CGCGGAGTGTCGATCCCAGAGCTCCCAATTGACTCAATTA |
|||
201..240: CTTTTTCCGTCCTCTTGCTTACGGATTTATGTTTGTGGCA |
|||
241..280: GAGGTTATGCTTCAGGCATCCCCATGTTTCCTGAGATACG |
|||
281..320: ACCACTGTCAGGTGGCTTGAATCTACCTTGTATTTCCTCT |
|||
321..360: AGTACCAGTCACTGTCATCTACTGGAAGCCATATCAGCGT |
|||
361..400: TGAAATGTCTATAATTTACTCTCCGGTTGTACCCAAGCGA |
|||
401..440: TAACAGCAACGTGTGGGTCTAAAGAGTTCCGCGTTTCGAC |
|||
441..480: ATAACGTGCTCCTATTTATCTACCGAAACACCCTATTTTC |
|||
481..520: CATCTAACCGGCACCCAATGCGCAGGTGTACGCGTCCTAC |
|||
521..560: TACGTTTGAAACGGTTCCATCTCGCCATGTACAATTGTGG |
|||
561..600: GGCTACGATTAAGTGTAGTCGGTAATTCAGGGTGAAGTTG |
|||
Subsequence to locate: TTCG |
|||
Matches found at the following indices: |
|||
89.. 92 |
|||
435..438 |
|||
</pre> |
</pre> |
Latest revision as of 08:45, 7 May 2024
- Task
Randomly generate a string of 200 DNA bases (represented by A, C, G, and T).
Write a routine to find all the positions of a randomly generated subsequence (four letters).
11l
UInt32 seed = 34
F nonrandom_choice(lst)
:seed = (1664525 * :seed + 1013904223) [&] FFFF'FFFF
R lst[Int(:seed >> 16) % lst.len]
F generate_sequence(Int n)
R ((0 .< n).map(_ -> nonrandom_choice([‘A’, ‘C’, ‘G’, ‘T’]))).join(‘’)
F positions(dnaSeq, subSeq)
[Int] r
V start = 0
L
V? pos = dnaSeq.find(subSeq, start)
I pos == N
L.break
r.append(pos)
start = pos + 1
R r
F dna_findall(String needle, haystack) -> Void
V pp = positions(haystack, needle)
I pp.empty
print(‘No matches found’)
E
print(‘Found ’needle‘ at the following indices:’)
L(p) pp
print(p‘:’(p + needle.len))
V dna_seq = generate_sequence(200)
V sample_seq = generate_sequence(4)
V c = 1
L(i) dna_seq
I c % 20 != 0 {print(i, end' ‘’)} E print(i)
c++
print("\nSearch Sample: "sample_seq)
dna_findall(sample_seq, dna_seq)
- Output:
GAAGTGCTCAAACCCTTTTT CCTTGCCGTAGGTTGTGCTG CCGCCGCACACCCGCAACAG CTTTTAGGCATAAGTATACG GACCGCGGACGGGGCGTAAC GGTGAACATTTTGCTAAATT GGCTCTAGGGATGAGCCCTA TAGCGCTGGGGACTACGCCC CGGTAAAGATCGAGGCGACT CACCGATTGCGCTAGGGACA Search Sample: CGTA Found CGTA at the following indices: 26:30 94:98
Action!
DEFINE SEQLEN="200"
DEFINE SUBLEN="4"
PROC RandomSeq(CHAR ARRAY s BYTE len)
CHAR ARRAY letters="ACGT"
BYTE i
FOR i=1 TO len
DO
s(i)=letters(Rand(4)+1)
OD
s(0)=len
RETURN
PROC PrintSeq(CHAR ARRAY s)
BYTE i
FOR i=1 TO s(0)
DO
IF i MOD 20=1 THEN
IF i<10 THEN Put(32) FI
IF i<100 THEN Put(32) FI
PrintB(i)
Print(": ")
FI
Put(s(i))
IF i MOD 20=0 THEN
PutE()
FI
OD
RETURN
BYTE FUNC StartsWith(CHAR ARRAY s,prefix BYTE start)
BYTE i
FOR i=1 TO prefix(0)
DO
IF s(start+i-1)#prefix(i) THEN
RETURN (0)
FI
OD
RETURN (1)
PROC Main()
CHAR ARRAY seq(SEQLEN+1),sub(SUBLEN+1)
BYTE i,notfirst
RandomSeq(seq,SEQLEN)
RandomSeq(sub,SUBLEN)
PrintE("Search sequence:")
PrintSeq(seq)
PutE()
PrintF("Subsequence to find: %S%E%E",sub)
PrintE("Found subsequence at positions:")
notfirst=0
FOR i=1 TO SEQLEN-SUBLEN
DO
IF StartsWith(seq,sub,i) THEN
IF notfirst THEN
Print(", ")
FI
notfirst=1
PrintF("%I-%I",i,i+SUBLEN-1)
FI
OD
IF notfirst=0 THEN
PrintE("Not found")
FI
RETURN
- Output:
Screenshot from Atari 8-bit computer
Search sequence: 1: CGACTCAGGAAGGCCACGTG 21: GTAACTTCTTAGTTACCGTA 41: AGGCTAATAGCTAGCGCTGC 61: GTGACCAGGCATAGTAACCG 81: GCACGCACGTTCACCAAGGG 101: GTCCCGATGGGAGGCACGTT 121: ACTACTCCAAGAACTGTAGT 141: AAGTTACCGAAAAGTTCTCA 161: TCCTTGGGTAGTGAGTACTT 181: TGTGCTATGAAAAATAAGGA Subsequence to find: ACGC Found subsequence at positions: 83-86
Ada
with Ada.Text_Io;
with Ada.Strings.Fixed;
with Ada.Numerics.Discrete_Random;
procedure Sub_Sequence is
type Nucleotide is (A, C, G, T);
function To_Character (N : Nucleotide) return Character
is (case N is
when A => 'A', when C => 'C',
when G => 'G', when T => 'T');
package Random_Nucleotide is new Ada.Numerics.Discrete_Random (Nucleotide);
use Random_Nucleotide;
package Position_Io is new Ada.Text_Io.Integer_Io (Natural);
use Ada.Text_Io;
procedure Put_Bases (Seq : String; Width : Positive) is
First : Natural := Seq'First;
begin
while First < Seq'Last loop
declare
Last : constant Natural :=
Natural'Min (First + Width - 1, Seq'Last);
begin
Position_Io.Put (First); Put ("..");
Position_Io.Put (Last); Put (" ");
Put (Seq (First .. Last));
New_Line;
First := Last + 1;
end;
end loop;
end Put_Bases;
Gen : Generator;
Sequence : String (1 .. 405);
Substring : String (1 .. 4);
Pos : Natural := 0;
begin
Position_Io.Default_Width := 3;
Reset (Gen);
Sequence := (others => To_Character (Random (Gen)));
Substring := (others => To_Character (Random (Gen)));
Put_Line ("Search sequence:");
Put_Bases (Sequence, Width => 50);
New_Line;
Put ("Substring to search: ");
Put (Substring);
New_Line;
loop
Pos := Ada.Strings.Fixed.Index (Sequence, Substring, Pos + 1);
exit when Pos = 0;
Put ("Found at position: ");
Position_Io.Put (Pos); Put ("..");
Position_Io.Put (Pos + Substring'Length - 1);
New_Line;
end loop;
end Sub_Sequence;
- Output:
Search sequence: 1.. 50 CCTACGGAAAAGTGATAAGGACAGATACATAATCCTAAAACCCTGGAAAA 51..100 CTTGTCTCGCCAGAGTAGGGCTCGGCAGGGGGGGCAGTGTTTTAAAACGT 101..150 CAGAGAATAGGCTCTACCTTGTTAGACTGCGAGTACTGGAGCGTAGTTCC 151..200 TATATTGCAAGCTGCTACAGTAAGTATCAAAGTATGCCACACATCCTTCT 201..250 ACAACCGGATTGGTTGCCCAGTAGAAGGCTCGTAGTCACCGGACACGCTG 251..300 TTCTTAAGGTCGGTAAGCTATTACGTCCATGGGAGATTCTCAAGGGTGCG 301..350 TTAGCGGACCCCCGTTACGTCCACGTATCTTCCGTCCAACTACCCCCTAA 351..400 TGTCATTGACATCGCCCGAGTATTTAATTTATTTGAACGGCACCAATTTA 401..405 GAGCT Substring to search: TATT Found at position: 153..156 Found at position: 269..272 Found at position: 371..374 Found at position: 380..383
Arturo
bases: [`A` `G` `C` `T`]
randSeq: join map 1..200 => [sample bases]
randSub: join map 1..4 => [sample bases]
idx: 0
print "Random sequence:"
print join.with:"\n" split.every: 20 randSeq
print ""
print "Looking for subsequence:"
print randSub
print ""
while [(size randSeq) > idx + 4][
if prefix? slice randSeq idx idx+4 randSub ->
print ["Found subsequence at position:" idx]
idx: idx + 1
]
- Output:
Random sequence: CACGCGCGTTAACCCTGCAT CTTTTCTCTAAGATGATGCG CTACTCTGCCCGATTACTAT GATGTCACCGGCGGTTCGGC GACTGGCGCTGGCAGAAAGC GCATGTCAAATTGCCCCAGT GTGCAAGTCCAAGTATTAGT GAGGTGCTCCGCTTCGTCCG GGGTCGACTCGGTCCCACTT CATTACATGTTGGTAATAGT Looking for subsequence: CGGT Found subsequence at position: 71 Found subsequence at position: 169
Factor
USING: accessors formatting grouping io kernel math
math.functions.integer-logs math.parser random regexp sequences ;
: new-dna ( n -- str ) [ "ACGT" random ] "" replicate-as ;
: pad ( n d -- str ) [ number>string ] dip 32 pad-head ;
:: .dna ( seq n -- )
seq length integer-log10 1 + :> d seq n group
[ n * d pad write ": " write write nl ] each-index ;
: .match ( slice -- ) [ from>> ] [ to>> ] bi "%d..%d\n" printf ;
: .matches ( slices -- )
"Matches found at the following indices:" print
[ .match ] each ;
: .locate ( slices -- )
[ "No matches found." print ] [ .matches ] if-empty ;
: .biosub ( dna-size row-size -- )
[ new-dna dup ] [ .dna nl ] bi*
4 new-dna dup "Subsequence to locate: %s\n" printf
<regexp> all-matching-slices .locate ;
80 10 .biosub nl
600 39 .biosub nl
- Output:
0: ATTCAAGGAC 10: CACTATTAAC 20: CTGCATTGTG 30: AGAACTTGCA 40: GTGTACCGAG 50: AGCGAGTTTA 60: AAGCAACACA 70: TCTTTACCGA Subsequence to locate: GTAG No matches found. 0: GATCTCGTCATGGTCCATCCTAACATTTCGGTTGTGGGC 39: GCATCCCGATAGGCGAAGTTAAATCTACGTAGTCCTACG 78: TCACGACGGAACATGATTGCCCACCGAAGTCGTAGGCGA 117: GCTAAAGTCGGTACATACACGATCTGCTATATTCGTTCT 156: CCGACACACGACATGCAATCCGAGAAGCTCTCGAAGTGC 195: GGTCAGATCCTCAGACTCGAACAGAGGAGACCTTAACTG 234: ATACCCACAGTACTTCTCGCATAACCTAAGCACCTATGC 273: TTACACCATCGTCCTGATATTGAGTGAGTCTGGTCGGAG 312: ATATTATCTAGCACCCTCAAGCTCTGTGTGCCACACCAG 351: GATTCCACTTCGCGCTTGCCTAGAGAAAGTAGAGTAGGT 390: GGTGTCATTAGTACACTGTTTGCGATGCACCAACCAAAC 429: CCGACCGCCATGATGACTGCTTTTCGGCCAACGTCAGAT 468: TAAGAGTACTTTTAGTAGCACCGCAAGCCAGCCGGTTTA 507: GCAAGATCCTGCAGCCTCCACGTTATTTCAGGTCTCTAA 546: GCGTTCTTTCCATGGAAGTAGTCACCGCTCCCGTTGCCA 585: ATGGACACAGACGTT Subsequence to locate: ATAT Matches found at the following indices: 145..149 289..293 312..316
FreeBASIC
Const base_ = "ACGT"
Sub findDnaSubsequence(dnaSize As Integer, chunkSize As Integer)
Dim As String dnaSeq(1 To dnaSize)
Dim As Integer i, chunk
For i = 1 To dnaSize
dnaSeq(i) = Mid(base_, Int(Rnd * 4)+1, 1)
Next
Dim As String dnaStr
For i = 1 To dnaSize
dnaStr += dnaSeq(i)
Next
Dim As String dnaSubseq(1 To 4)
For i = 1 To 4
dnaSubseq(i) = Mid(base_, Int(Rnd * 4)+1, 1)
Next
Dim As String dnaSubstr
For i = 1 To 4
dnaSubstr += dnaSubseq(i)
Next
Print "DNA sequence:"
For chunk = 1 To Len(dnaStr) Step chunkSize
Print Using "###_._.###: &"; chunk; chunk+chunkSize-1; Mid(dnaStr, chunk, chunkSize)
Next
Print !"\nSubsequence to locate: "; dnaSubstr
Dim As Integer idx = Instr(dnaStr, dnaSubstr)
Print Iif(idx <> 0, "Matches found at the following indices:", "No matches found.")
Do While idx > 0
If idx <> 0 Then Print Using "###_._.###"; idx; idx + 3
idx = Instr(idx+4, dnaStr, dnaSubstr)
Loop
End Sub
findDnaSubsequence(200, 20)
Print
findDnaSubsequence(600, 40)
Sleep
- Output:
DNA sequence: 1.. 20: TTATAGTCTTGGAGGCATGT 21.. 40: TAACTTATGCGGAGCAGACA 41.. 60: CGGAGTATGCATTCCTCTTA 61.. 80: CCAAACGGTGCTGCCCGCGC 81..100: ACTCGCTGTATTCCGTATCG 101..120: TCACATTATCTAAACCACGA 121..140: TTTCCAGCGTGCGTGGGAAG 141..160: GCCATGTTTAGTCGGGGGCC 161..180: AAGGTCTTTGGCTTATGCTG 181..200: TTTTTTTTTCTTCGGTTACA Subsequence to locate: ATTT Matches found at the following indices: 120..123 DNA sequence: 1.. 40: GTGCGGGCCGTTAGCAGCTACGAGTGCTAGATGGAACTAG 41.. 80: TCCCCGCTCCCAAATGCAAAGCGTCCCAGACCAGTCTTGA 81..120: AGCCCGTTAAATTACACCTGAACCGTTGCAAATGATCGAT 121..160: AGACGGGGTATAATAGCGGAAAACACAGGGGAACTGCATG 161..200: CAAGCTCGAGCCGCTGAAGGATGGCTCCCCCCCGAGTGTA 201..240: AGTGGATCTCGCCCAAATAGCGGGGGAACAAAGAAAGGTA 241..280: AGTCTTACTTCGCACGTCCCCTCTCATACACGCCAGGACT 281..320: AATGGATCATTCATAGGTGACGGGTGACTTGCGGTGTTTC 321..360: TAGTTGGAGTCACCCGTCAGCTTAGATCTAAGTATGAACC 361..400: GTAAGAGTTTGTAACTGCACCTTCCGTCTCTTCCTCTGTA 401..440: GGAACGCTTTTGCTTGTTATCAGATAGTGTCTCCTTATCA 441..480: TAGGACAGGTTCCTTGTGAAGGTCCACAGAGTTTGCCCGG 481..520: GGTTCGAATATACGACGCTTGTGGTTCCGGCACTATAACT 521..560: TCCGCAGTGTTGTCGACGCCCCTAGCTCCCGGGGTCTTTT 561..600: CGCTTCCCTATAGCGCGAAATGAGTGCAAGGGTACCGGCC Subsequence to locate: GCAC Matches found at the following indices: 252..255 377..380 510..513
Go
package main
import (
"fmt"
"math/rand"
"regexp"
"time"
)
const base = "ACGT"
func findDnaSubsequence(dnaSize, chunkSize int) {
dnaSeq := make([]byte, dnaSize)
for i := 0; i < dnaSize; i++ {
dnaSeq[i] = base[rand.Intn(4)]
}
dnaStr := string(dnaSeq)
dnaSubseq := make([]byte, 4)
for i := 0; i < 4; i++ {
dnaSubseq[i] = base[rand.Intn(4)]
}
dnaSubstr := string(dnaSubseq)
fmt.Println("DNA sequnence:")
for i := chunkSize; i <= len(dnaStr); i += chunkSize {
start := i - chunkSize
fmt.Printf("%3d..%3d: %s\n", start+1, i, dnaStr[start:i])
}
fmt.Println("\nSubsequence to locate:", dnaSubstr)
var r = regexp.MustCompile(dnaSubstr)
var matches = r.FindAllStringIndex(dnaStr, -1)
if len(matches) == 0 {
fmt.Println("No matches found.")
} else {
fmt.Println("Matches found at the following indices:")
for _, m := range matches {
fmt.Printf("%3d..%-3d\n", m[0]+1, m[1])
}
}
}
func main() {
rand.Seed(time.Now().UnixNano())
findDnaSubsequence(200, 20)
fmt.Println()
findDnaSubsequence(600, 40)
}
- Output:
Sample run:
DNA sequnence: 1.. 20: GTTGCCCACACGTCTTATTG 21.. 40: TAAAAATCACCGTGCAGCGA 41.. 60: GGTTAAAAATGGTAGGAAAA 61.. 80: TATCCTCAGCCAGCGGTGCC 81..100: GGCCAACAAAAGGGACGTTG 101..120: GATTAAAGTAGGTCTAGGTA 121..140: TCTCGTATCCGGTTGATCCG 141..160: GGATGGTGGACGATATTGGA 161..180: GACCGGAGTGTACATCGGTG 181..200: TTGTCGCTTGCAGCTACGGT Subsequence to locate: AATA Matches found at the following indices: 59..62 DNA sequnence: 1.. 40: GTACAGCCACTGTTAGTAGACGGATGCTATTGGGACGCAA 41.. 80: CACATCAGTACACTGCTTGTTCGTAATCGCGTACCCAGCG 81..120: CAAAAGGAGGGGAGGAACCTGCTCAGACTGTCGCTAAAAA 121..160: CGAGCACGTGTCCTTACGCAGTGATGGTAGCGGTCCACGA 161..200: CTTCCACTGGCATAAGGAGAATGTTTAGTAACGCCCCTCA 201..240: TAGGTGCAATTCTACAGGTTAAGGGACCGTGGGATGTTTC 241..280: TATAAAAGTTGAAGAGATTACTAATCCGTCCCGTGCGCGT 281..320: GCCGCAATTTAGCGCCCGTTCTTGAGTAAACATACATGCA 321..360: CGCTCTTGAGTTTTCTAAAACCTGATCAAAACGGTCGCCC 361..400: ACATGCAGGAGCGCCGCAGGGTTTCAGAGGTCAACCATCG 401..440: GCAGCACACGTGAACCCTCTGTACTGACCAGGGGCTTGCT 441..480: CCTTGGTAGGAGATGGTGGAGAATGCGTCGATGCACTGAA 481..520: GCAGACCGCTGATAGCATGTACGATGTTTACGGGTTGACG 521..560: ATAGCTTTGCTAGTGATCGAACATATGATGAAAAACGCTT 561..600: CCATTGATAGAGCATCTTAGGAGCTCAGTCCAGTGACCTC Subsequence to locate: AGGT Matches found at the following indices: 202..205 216..219 388..391
jq
Works with gojq, the Go implementation of jq
Neither jq nor gojq currently has any PRNG built-ins so one possibility is to use a jq-coded PRNG function such as can be found at https://rosettacode.org/wiki/Random_numbers#jq
In practice, it's usually more convenient to use a utility such as gshuf or jot to provide the source of randomness. Here we use `jot -r N MIN MAX` but a fourth argument can also be used to specify a seed. An alternative would be to use `gshuf` along the lines of:
# For 200 pseudo-random integers in the range 0 to 3 inclusive:
gshuf -i 0-3 -r -n 200 --random-source=/dev/random
Note that the indices shown below are offsets (i.e., the index origin is taken to be 0).
#!/bin/bash
jot -r 200 0 3 | jq -nr --slurpfile four <(jot -r 4 0 3) '
# input: an array of integers
def toDNA:
def base: . as $in | "ACGT" | .[$in : $in+1];
map(base) | join("");
([inputs] | toDNA) as $strand
| ($four | toDNA) as $four
| "Strand of length \($strand|length):",
$strand,
"Zero-based indices of \($four):",
($strand | indices($four) | join(" "))
'
- Output:
./bioinformatics-subsequence.sh Strand of length 200: TGGGCCCAAGCATTGCCACGTAGCTTTGTCAGTGGGCTTGTAAGGGACGAACACAAACTCACAGACCAGGAATTCTCGAGTTCCAGTCCCCCCACTTGTCGCTATTTAGTTAAGACGTTCAGTTTCGTTGCGAACTGTGTCCCCCAGGCTAACGTGATGGGTGTCAGGAATCAATGGCCAACTTTCAGTTAGACTTGACC Zero-based indices of CAAC: 178 ./bioinformatics-subsequence.sh Strand of length 200: TAAGACTGCAGGGTACGAAGAGTGGAAGATTGGCTCGTACTTGTCGACGTCGCGTGACATAATCTCTGTGCTCGCCTCGCAGTAAGGGACTAGGTCCCGTTCGAGCGCCCTGCTAGAAGGAGCATCCTACCATGCTCTGATGACATCCTGTCGGCATTAGAGTTTCTACGACATCTAAAGAGTACGATCGACTTCCCAGT Zero-based indices of GACA: 55 141 169
Julia
DNArand(n, bases=['A', 'T', 'C', 'G']) = String(rand(bases, n))
DNAsearch(needle, haystack, lap=true) = findall(needle, haystack, overlap=lap)
const rand_string = DNArand(200)
const subseq = DNArand(4)
println("Search sequence:\n$rand_string\nfor substring $subseq. Found at positions: ")
foreach(p -> print(rpad(p[2], 8), p[1] % 10 == 0 ? "\n" : ""), enumerate(DNAsearch(subseq, rand_string)))
- Output:
Search sequence: CCGAAGCCAGGAGGACTGAGCGCTTGCGTCCCGAGTTCTGCGACGAGTCTCTTCATTATAAGGCCACTGATTGCGCTCATCATGAGTGCCAGAAGCACCGCTAAACATAAGTGTCCTTTCTTCCTGACGCACTTGAAGATTGTGACCATTTGTGCGGGTTGTGAGTTAGGGGCTCTCATTGTACACGATCTATAGTGTGC for substring CGCT. Found at positions: 21:24 74:77 99:102
Nim
import random, sequtils, strutils
proc dnaSequence(n: Positive): string =
## Create a random DNA sequence of length "n".
newSeqWith(n, sample("ACGT")).join()
proc positions(dnaSeq, subSeq: string): seq[int] =
## Return the list of starting positions of a subsequence
## "subSeq" in a sequence "dnaSeq". Positions start at 1.
var start = 0
while true:
let pos = dnaSeq.find(subSeq, start)
if pos < 0: break
result.add pos + 1
start = pos + 1
when isMainModule:
const
N = 200
Step = 20
randomize()
let dnaSeq = dnaSequence(N)
echo "DNA sequence:"
for i in countup(0, N - 1, Step):
echo ($(i+1)).align(3), ' ', dnaSeq[i..i+(Step-1)]
let subSeq = dnaSequence(3)
echo "\nDNA subsequence: ", subSeq
echo()
let pos = dnaSeq.positions(subSeq)
if pos.len == 0:
echo "Subsequence not found."
else:
let tail = if pos.len == 1: ": " else: "s: "
echo "Subsequence found at position", tail, pos.join(", ")
- Output:
DNA sequence: 1 CACATACGATGAGCTGGGCG 21 CCTAAGAGGCGGAAAGACAA 41 CCGTGTGTGTCTAACCCATG 61 GTTTAATTGCAGATAGTCTC 81 TAGACTACAAACATTAGAGC 101 AATGCACCGGGGTGCACGTG 121 TGTTTTGACTTCCCATGAAA 141 GCCCTTATCCTAGAGTACAG 161 TCGGCAAATGTTCGCTCCTT 181 GGCCCACTCCATTTGGACGG DNA subsequence: GTT Subsequence found at positions: 61, 122, 170
Perl
use strict;
use warnings;
use feature 'say';
my @bases = <A C G T>;
my $basecnt = 160;
my($string,$target);
$string .= $bases[ int rand @bases ] for 1 .. $basecnt;
$target .= $bases[ int rand @bases ] for 1 .. 4;
say "Target: $target";
say 'Matches at these positions:';
say (($string =~ s/.{1,40}\K/\n/gr) =~ s/($target)/ >$1< /gr);
- Output:
Target: CCTG Matches at these positions: 9 90 157 TTGCC >CCTG< CAAAGTTAATAAGTAAACAATTAAGTGAGTG CTCTAGGGTAAGGTGAGGGCGGGAAGGGGAAAAATACCGA TGCGAG >CCTG< TAGAGCCGGGCCTCAAATTAAACGAAAAAT ATAAGTTTGCTTGGCACGCTGTACTACTTATCC >CCTG< ACT
Phix
Currently only searches for non-overlapped sequences, but it should be pretty obvious how to change that, in which case the next underline will simply partially overwrite the previous, so you'll get eg "<=<==>".
with javascript_semantics constant cheat = false function grandna(integer len) string dna = repeat(' ',len) for i=1 to len do dna[i] = "ACGT"[rand(4)] end for return dna end function procedure show(string dna, test, sequence idx) idx = deep_copy(idx) & length(dna)+100 -- (add an otherwise unused sentinel) sequence s = split(trim(join_by(split(join_by(dna,1,10,""),"\n"),1,5," ")),"\n") integer ii = 1, -- idx index i = idx[ii], -- current target ux = 1, -- underline index (1..4) ldx = 1 -- line index (1, 51, 101, etc) for si=1 to length(s) do printf(1,"%3d: %s\n",{ldx,s[si]}) ldx += 50 if i and i<ldx then string ul = repeat(' ',59) while i and i<ldx do integer up = i-ldx+51 -- underline pos (relative to ldx) up += floor((up-1)/10)+5 -- (plus any needed spacing) ul[up] = "<==>"[ux] ux += 1 i += 1 if ux>4 then ux = 1 ii += 1 i = idx[ii] end if end while printf(1,"%s\n",ul) end if end for if length(idx)>1 then string p = iff(length(idx)>1?"s":""), t = join(apply(idx[1..$-1],sprint),", ") printf(1,"%s occurs at location%s: %s\n",{test,p,t}) else printf(1,"%s does not occur\n",{test}) end if end procedure string dna = grandna(200), test = grandna(4) constant cheats = iff(cheat?{9,13,49,60,64,68}:{}) for i=1 to length(cheats) do dna[cheats[i]..cheats[i]+3] = test end for sequence idx = match_all(test,dna) show(dna,test,idx)
- Output:
with cheat enabled
1: GGAGATATCG ACCGACCGAA GTAAAGTCAA AGTCGTCCAA TCCACGGACG <= =><==> <= 51: ACTTCAGCAC GACCGACCGA CCTATTTAAG AGACCACACT TAAGGAATCC => < ==><==><== > 101: ATGCGAAATA AAAATGGGCG AGTAGCCGTG GGGCGCTAAA GCACCCACCT 151: AGTTTTCGCC GAAGTACTAG ACCACCTTCG GATCGACAAA GCTTTCACCA <==> CGAC occurs at locations: 9, 13, 49, 60, 64, 68, 184
with cheat disabled
1: TGATTTAAAC CGTGGTGCAA TTTATAAACA CTGCGATATG CCTCCTGATG 51: GCATGGTATT CGACACCAAG ACGCTGGTGG GCACACTGGC TTTCAGAATA 101: GGAGTCACAA TCCCTCTATG ATGTCCTCTA GCGGGTGTGT GTTCAGTGCC 151: AGCGCTTACT TCCGGCGTGG CCGACTCTTT TTAAAGCGTA TAGCTGGGGT GCTA does not occur
Python
from random import choice
import regex as re
import time
def generate_sequence(n: int ) -> str:
return "".join([ choice(['A','C','G','T']) for _ in range(n) ])
def dna_findall(needle: str, haystack: str) -> None:
if sum(1 for _ in re.finditer(needle, haystack, overlapped=True)) == 0:
print("No matches found")
else:
print(f"Found {needle} at the following indices: ")
for match in re.finditer(needle, haystack, overlapped=True):
print(f"{match.start()}:{match.end()} ")
dna_seq = generate_sequence(200)
sample_seq = generate_sequence(4)
c = 1
for i in dna_seq:
print(i, end="") if c % 20 != 0 else print(f"{i}")
c += 1
print(f"\nSearch Sample: {sample_seq}")
dna_findall(sample_seq, dna_seq)
- Output:
TTGCCCCTGTACTGAGCCCA TAAGCTTGCACTCAAGGTTT TGCCCCCTCATATTATAACG CATCCATTATACAAAACCGA TACCCTTCCGCATATTATGA AAAGTGGCGAAGTGCCTTGA TTTGCATTCATAGTACAACG GTGCAAAAGCATTGTATGTC TCACATTTACATGGGAAATG CCTAGTAGGTGCAAGACCTG Search Sample: TACA Found TACA at the following indices: 69:73 133:137 167:171
Racket
#lang racket
(define (rand-seq n)
(build-string n (lambda _ (string-ref "TGAC" (random 4)))))
(define (subsequence-indices full part)
(let ((part-length (string-length part)) (full-length (string-length full)))
(for/list ((i (- full-length part-length))
#:when (for/and ((p part) (f (in-string full i))) (eq? p f)))
(cons i (+ i part-length -1)))))
(define (report-sequence s (l 50))
(string-join (for/list ((i (in-range 0 (string-length s) l)))
(format "~a: ~a" (~a #:width 4 i)
(substring s i (min (string-length s) (+ i l)))))
"\n"))
(define (Bioinformatics/Subsequence (full (rand-seq 400)) (sub (rand-seq 4)))
(printf "Indices of ~a in~%~a~%~a~%"
sub (report-sequence full) (subsequence-indices full sub)))
(module+ main (for ((i 4)) (Bioinformatics/Subsequence)))
- Output:
Indices of TTAC in 0 : TTATCCTACCGCGTAAGTTCAATGCTCACCGCAGTTTGCTAACCGTTCCT 50 : AAATTCACTTCCTAAGGTATCTTTCGCTTAATTGATGCCGATTGAATTCC 100 : ACGGAGGGCGTAATTGTTTCGGACTTTAGACCTGACATAAGGGCACACTA 150 : GTCCTATTGAATTTGGTGCTATTCGGCGACCTACTAACCTTAGTCAGTGA 200 : AGAGCCATCTCAAAAGTACAGTCATCCTCAAGTGTTACATACGGCACCAT 250 : GACAGTGTATAAGCATGGAGGTTGGCCTATCGTCATATCGAGGCGGCGCC 300 : ATAGACCGGCCAGGTGATGAGATCGACTTTAATGTTGTTGCTTAGCTTGA 350 : CCTCTAGTTTGGATTAAGACGGTCATAGATAGATAGACCGTAAAGTATTC ((234 . 237)) Indices of GTAA in 0 : GTCAGTCCACGCAAGAATAGCAGTTGAGTGGACAATTTATGAGACGGAGA 50 : TAAGTAACCCGCTCCGAGATAAACGTCAGCCGGATTCCGCTGAGTCGGTC 100 : GCCTTCCAAGTGGCAGCTTGTTTGCATTGCTTACAGTGACTTGAACGATC 150 : ACCTACTCGAGGACTCTGCGGGTATTCCAGTTGCCTTGCACTCAGCGATG 200 : CACAAACTTTAAATTATCACAGAAAGAATGTGATTCGGGTGGTCACCCTT 250 : ATCGGTGAAACCAGTCCTTCCATGGGCATATTCTGCGTCGAAATGAGCCC 300 : GCTGTTTACGTTGTACGAACTGGGGACCTAAGGAAACGGGCCGTTCTTAG 350 : GTGATGTCAGCTGCAACGAACTACTGTTAACCTTCTCGATCTGTTGAAAA ((53 . 56)) Indices of AACG in 0 : TTTACAGTACGATTCCGAAGACACAAGAATGCGCCGGCTGTGGGTAGGGG 50 : CGACCCTGCGCGACCTATAAAAGGGGCGACTCAATTTTAGGCCCACCACG 100 : GACCCAGCCCTGTGCACAGAGCGGGGCATTTTTACCTCGCGTGCGCACCA 150 : ACTGCGATCTGCCTTGTCACATAATCCCACATACGAGTTGTATCTCTAAG 200 : AAGGGATGAGGCCAATTTAAATCCGGGTGCATTTCTCGGGGGGAGACACC 250 : AATGAGAGTGGGGCAAGGTGGCGTAGAGAGCTAATCGGGTTTTATGACCG 300 : CGGAAGACCTGGGATACGTCTGGGTGATAACTGAGGGCAGGTCAACGAAC 350 : CCTGATGCGTAGCCACGTCTCAGCTATCGGGCCTGTTTTCATAGTCCATG ((343 . 346)) Indices of CAGC in 0 : TGTGAACCACTATGACACGCTACACGCCTCAAGTTGGCCCCCATATAAGA 50 : ATATCCATCGGTTAATGTGTCTCGCGGCCGTTAGAACAAGCACTAAAGTT 100 : AGAGAAACCAACCATTGGACTAGATCAACATCAACGTCGCTGATAATAAA 150 : TGTATATCTGATGTGGCCGTTCATAAAATCGTTAACTACAGGTATCAACA 200 : TAGTCTCCCAACTTATATAATTGGTTAACTTAGGAGGAGCTTGCACAGCT 250 : CAGCTATATGCTATCTGGCCCTGGGCTTGGTAGGCATCACGTCGTTATGC 300 : TGCGAACATCTCAAAGACAAACGTTGATCCAGCCCCTAGAGAGGTCATTA 350 : GGCCTCGACCCAATTTAACCTCCCACTCCGTGGGTACAGCTTGAACCCCC ((245 . 248) (250 . 253) (329 . 332) (386 . 389))
Raku
Chances are actually pretty small that a random 4 codon string will show up at all in a random 200 codon sequence. Bump up the sequence size to get a reasonable chance of multiple matches.
use String::Splice:ver<0.0.3+>;
my $line = 80;
my $haystack = [~] <A C G T>.roll($line * 8);
say 'Needle: ' ~ my $needle = [~] <A C G T>.roll(4);
my $these = $haystack ~~ m:g/<$needle>/;
my @match = $these.map: { .from, .pos }
printf "From: %3s to %3s\n", |$_ for @match;
my $disp = $haystack.comb.batch($line)».join.join("\n");
for @match.reverse {
$disp.=&splice(.[1] + .[1] div $line, "\e[0m" );
$disp.=&splice(.[0] + .[0] div $line, "\e[31m");
}
say $disp;
- Output:
Show in custom div to better display highlighting.
Needle: TAGC
From: 159 to 163
From: 262 to 266
From: 315 to 319
From: 505 to 509
From: 632 to 636
CATATGTGACACTGACAGCTCGCGCGAAAATCCGTGTGACGGTCTGAACACTATACTATAGGCCCGGTCGGCATTTGTGG
CTCCCCAGTGGAGAGACCACTCGTCAATTGCTGACGACTTAACACAAATCGAGTCGCCCTTAGTGCCAGACGGGACTCCT
AGCAAAGGGCGGCACGTGGTGACTCCCAATATGTGAGCATGCCATCTAATTGATCTGGGGGGTTTCGCGGGAATACCTAG
GGGCGTTCTGTCCATGGATCTCTAGCCCTGCGAAGAGATACCCGCAGTGAGTTGCACGTGCAAAGAACTTGTAACTAGCG
TATTCTGTATCCGCCGCGCGATATGCTTCTGCGGGATGTACTTCTTGTGACTAAGACTTTGTTATCCAAATTGACCAATA
TTCAACGGTCGACTCTCCGAGGCAGTATCGGTACGCCGAAAAATGGTTACTTCGGCCATACGTAACCTCTCAAGTCACGA
TTACAGCCCACGGGGGCTTACAGCATAGCTCCAAAGACATTCCAATTGAGCTACAACGTGTTCAGTGCGGAGCAGTATCC
AGTACTCGACTGTTATGGTAAAAGGGCATCGTGATCGTTTATATTAATCATTGGGACAGGTGGTTAATGTCATAGCTTAG
REXX
This REXX version allows the user to specify:
- length of the (random) DNA data sequence (default is 200).
- length of the (random) DNA sequence (default is four).
- DNA proteins to be used in the sequence (default is ACGT).
- width of the output lines of (random DNA) (default is 100).
- often (if ever) to add a blank to the output (default is every 10 proteins).
- DNA proteins to be searched in the data (the default is four unique random proteins).
- the seed for the RANDOM function so runs can be repeated with the same data (no default).
/*REXX pgm gens random DNA (ACGT) sequence & finds positions of a random 4─protein seq. */
parse arg totLen rndLen basePr oWidth Bevery rndDNA seed .
if totLen=='' | totLen=="," then totLen= 200 /*Not specified? Then use the default.*/
if rndLen=='' | rndLen=="," then rndLen= 4 /* " " " " " " */
if basePr=='' | basePr=="," then basePr= 'acgt' /* " " " " " " */
if oWidth=='' | oWidth=="," then oWidth= 100 /* " " " " " " */
if Bevery=='' | Bevery=="," then Bevery= 10 /* " " " " " " */
if rndDNA=='' | rndDNA=="," then rndDNA= copies(., rndLen) /*what we're looking for*/
if datatype(seed, 'W') then call random ,,seed /*used to generate repeatable random #s*/
call genRnd /*gen data field of random proteins. */
call show /*show " " " " " */
say ' base DNA proteins used: ' basePr
say 'random DNA proteins used: ' dna?
call findRnd
if @=='' then do; say "the random DNA proteins weren't found."; exit 4; end
say 'the random DNA proteins were found in positions:' strip(@)
exit 0 /*stick a fork in it, we're all done. */
/*──────────────────────────────────────────────────────────────────────────────────────*/
commas: parse arg ?; do jc=length(?)-3 to 1 by -3; ?=insert(',', ?, jc); end; return ?
/*──────────────────────────────────────────────────────────────────────────────────────*/
findRnd: @=; p=0 /*@: list of the found target proteins*/
do until p==0; p= pos(dna?, $$, p+1); if p>0 then @= @ commas(p)
/*Found one? Append it to the "Found"s*/
end /*p*/; return
/*──────────────────────────────────────────────────────────────────────────────────────*/
genRnd: dna?=; use= basePr; upper use basePr rndDNA; lenB= length(basePr)
do k=1 for rndLEN; x= substr(rndDNA, k, 1)
if x==. then do; ?= random(1, length(use) ); x= substr(use, ?, 1)
use= delstr(use, ?, 1) /*elide so no protein repeats*/
end
dna?= dna? || x /*build a random protein seq.*/
end /*k*/
return
/*──────────────────────────────────────────────────────────────────────────────────────*/
show: say " index │"center('DNA sequence of ' commas(totLen) " proteins", oWidth+10)
say "───────┼"center('' , oWidth+10, '─')
$=; $$=; idx= 1 /*gen data field of random proteins.*/
do j=1 for totLen; c= substr( basePr, random(1, lenB), 1)
$$= $$ || c /*append a random protein. */
if Bevery\==0 then if j//Bevery==0 then $= $' ' /*possibly add a blank. */
if length( space($ || c, 0) )<oWidth then do; $= $ || c; iterate; end
say strip( right(idx, 7)'│' $, 'T'); $= /*display line ──► terminal.*/
idx= idx + oWidth /*bump the index number. */
end /*j*/
if $\=='' then say right(idx, 7)"│" strip($, 'T') /*show residual protein data*/
say "───────┴"center('' , oWidth+10, '─')
say; return
- output when using the default inputs:
index │ DNA sequence of 200 proteins ───────┼────────────────────────────────────────────────────────────────────────────────────────────────────────────── 1│ TTTTTAGCG CGTTTTGTAG CGCTCTAAAA ACCGTAGCTA TATTTCTCGA AGTTTCACCC AGCTCTTTTG CCCCAGGGTT GCGCTAAGCC CAGCTTCGAG 101│ GGGGCACAG GTAAAATACT ACCGTCCGTG GAGGGGGATG AATTGACCCG ACATTTTTTG AAGCATAACT CGTGACTCAA TATTGCATGA TTACACCAGC ───────┴────────────────────────────────────────────────────────────────────────────────────────────────────────────── base DNA proteins used: ACGT random DNA proteins used: GCAT the random DNA proteins were found in positions: 162 184
- output when using the inputs of: 1000 , , , , tttt
index │ DNA sequence of 1,000 proteins ───────┼────────────────────────────────────────────────────────────────────────────────────────────────────────────── 1│ GTGATTTTT AGCGCGTTTT GTAGCGCTCT AAAAACCGTA GCTATATTTC TCGAAGTTTC ACCCAGCTCT TTTGCCCCAG GGTTGCGCTA AGCCCAGCTT 101│ GAGGGGGGC ACAGGTAAAA TACTACCGTC CGTGGAGGGG GATGAATTGA CCCGACATTT TTTGAAGCAT AACTCGTGAC TCAATATTGC ATGATTACAC 201│ AGCTAGGTT AGTGTAAAAA CCCCCCTATC TTCCTGATCA ATGGCGAGTA AAACATGCAA CCAATTTGTG AGCGAGTACT GGAAATTATT GTTTACGGGA 301│ AGGCACATG CTACGCGCAA CAGATATCTT AGACTGACCC TTTTAGAGTC ATAAGCCCCT GTCGCCTACA TGCTACTAAT ACTCCAACTA GCGGCGCACC 401│ TCAACCGGA TCATGGCGCC AGGGAAAATG TGGCGTAGCG ACGTGCTCAT CGCTCGCCGG GGAGAGCCTT TCAGAATCTC GAATAAAACC TGGTAATGAC 501│ TCATCAATC GTAATGGTCG TCTGGGGCAA GAAGCCGATA TTATAGACTC AGGTCAGACG TGTGCACAAC GGCAGAATTT ATAGTAATTC GCGTGAACTA 601│ GTTTCGGGA TAGGCCTACG ACCAATCATA GGACATTCGA TGCACGGTGT AGAAACAGTT CTCTGATGTT ACTCGGGATA ACACTCGCAA TCCCCTAGGA 701│ ACCGTGAGC GTCGCTAGTA TCTGAGATAG TCGCGACTGC CCAGCGGTCT TTAAGTTCGC ACACTACGGG ACTCCTAGTT CGCCCATTCA TGGCTATTTT 801│ CCTATCAGT CCAATCCCAC GGGGAGGGCA CTCGCGCAAT TCATTCAAAG AGGGCCATTT GCCGATATAA GGTCCATCAT CGGGAGGAAT ATGACTCCTG 901│ TTAGTATTA GAGCAGCCTC GCTGCGTACT ACTGTCAGTG GCCCGTCAGG GAAGGCAAAA CGTTTTTCCT CTAGGAATCC GTCAATTGGA CTTCTAGACT ───────┴────────────────────────────────────────────────────────────────────────────────────────────────────────────── base DNA proteins used: ACGT random DNA proteins used: TTTT the random DNA proteins were found in positions: 5 6 16 69 157 158 159 340 796 797 962 963
Ring
/*-----------------------------------
# Project : DNA subsequences
# Date : 2021/03/23
# Author : Gal Zsolt (~ CalmoSoft ~)
# Email : <calmosoft@gmail.com>
-----------------------------------*/
//-----------------------------------------
load "stdlibcore.ring"
load "guilib.ring"
start = 0
base = ["A","C","G","T"]
dnaList = []
dnaSeq = []
ColLine = list(21)
C_Spacing = 2
C_ButtonDnaStyle = ' background-color: Red; border-radius: 8px;'
C_ButtonStyle = '"background-color:white"; border-radius: 8px;'
Button = newlist(10,20)
LayoutButtonRow = list(10)
//-----------------------------------------
app = new qApp
{
win = new qWidget() {
setWindowTitle('DNA subsequences')
setWinIcon(self,AppFile("white.jpg"))
setStyleSheet('background-color:White')
setgeometry(560,180,300,300)
//reSize(400,400)
winheight = 10
fontSize = 8 # + (winheight / 100)
LayoutButtonMain = new QVBoxLayout()
LayoutButtonMain.setSpacing(C_Spacing)
LayoutButtonMain.setContentsmargins(0,0,0,0)
LabelInd = new qLabel(win) { settext(" DNA subsequences start positions:")
setAlignment(Qt_AlignHCenter | Qt_AlignVCenter)
setStyleSheet("background-color:yellow") }
ButtonInd = new QPushButton(win) { setStyleSheet("background-color:yellow") }
LabelFind = new qLabel(win) { settext(" DNA subsequence to find:")
setStyleSheet("background-color:yellow") }
ButtonFind = new QPushButton(win)
DnaSearch = new QPushButton(win) { setclickevent("pstart()")
setStyleSheet("background-color:yellow")
settext("Find")
}
for Col = 1 to 21
ColLine[Col] = new qLabel(win) {
setmaximumheight(20)
setAlignment(Qt_AlignHCenter | Qt_AlignVCenter)
setStyleSheet("background-color:darkgray")
setText(string(Col-1))
}
next
LayoutInd = new QHBoxLayout() { setSpacing(C_Spacing) setContentsMargins(0,0,0,0) }
LayoutInd.AddWidget(LabelInd)
LayoutInd.AddWidget(ButtonInd)
LayoutButtonMain.AddLayout(LayoutInd)
LayoutTitleRow = new QHBoxLayout() { setSpacing(C_Spacing) setContentsMargins(0,0,0,0) }
for Col = 1 to 21
LayoutTitleRow.AddWidget(ColLine[Col])
next
LayoutButtonMain.AddLayout(LayoutTitleRow)
RowLine = list(10)
for Row = 1 to 10
Letter = "" + Row*20
if Row*20 < 100
Letter = " " + Row*20
ok
RowLine[Row] = new qLabel(win) { setFont(new qFont("Verdana",fontSize,40,0))
setAlignment(Qt_AlignHCenter | Qt_AlignVCenter)
setStyleSheet("background-color:darkgray")
setText(Letter)
}
next
for Row = 1 to 10
LayoutButtonRow[Row] = new QHBoxLayout()
{
setSpacing(C_Spacing)
setContentsmargins(0,0,0,0)
}
LayoutButtonRow[Row].AddWidget(RowLine[Row])
for Col = 1 to 20
Button[Row][Col] = new QPushButton(win) {
setmaximumwidth(20)
}
LayoutButtonRow[Row].AddWidget(Button[Row][Col])
next
LayoutButtonMain.AddLayout(LayoutButtonRow[Row])
next
LayoutDataRow = new QHBoxLayout() { setSpacing(C_Spacing) setContentsMargins(0,0,0,0) }
LayoutDataRow.AddWidget(LabelFind)
LayoutDataRow.AddWidget(ButtonFind)
LayoutDataRow.AddWidget(DnaSearch)
LayoutButtonMain.AddLayout(LayoutDataRow)
setLayout(LayoutButtonMain)
pStart()
show()
}
exec()
}
//-----------------------------------------
func pStart()
start = start + 1
dnaList = []
for row = 1 to 10
for col = 1 to 20
Button[row][col].settext("")
next
next
for nr = 1 to 200
rnd = random(3)+1
baseStr = base[rnd]
row = ceil(nr/20)
col = nr%20
if col = 0
col = 20
ok
Button[row][col].settext(baseStr)
add(dnaList,baseStr)
next
startDna()
//-----------------------------------------
func startDna()
strDna = list2str(dnaList)
strDna = substr(strDna,nl,"")
while true
strBase = ""
for n = 1 to 4
rnd = random(3)+1
strBase = strBase + base[rnd]
next
ind = substr(strDna,strBase)
if ind > 0
exit
ok
end
showDna(dnaList)
//-----------------------------------------
func showDna(dnaList)
if start > 1
see nl
for n = 1 to len(dnaSeq)
for m = 0 to 3
ind = dnaSeq[n] + m
row = ceil(ind/20)
col = ind%20
if col = 0
col = 20
ok
Button[row][col].setstylesheet(C_ButtonStyle)
next
next
ok
dnaSeq = []
strDna = list2str(dnaList)
strDna = substr(strDna,nl,"")
while true
strBase = ""
for n = 1 to 4
rnd = random(3)+1
strBase = strBase + base[rnd]
next
ind = substr(strDna,strBase)
if ind > 0
exit
ok
end
ButtonFind.setStyleSheet("background-color:yellow")
ButtonFind.settext(strBase)
for n = 1 to 196
flag = 1
for m = 0 to 3
if dnaList[n+m] != strBase[m+1]
flag = 0
exit
ok
next
if flag = 1
add(dnaSeq,n)
ok
next
temp = ""
ButtonInd.settext("")
for nr = 1 to len(dnaList)
ind = find(dnaSeq,nr)
if ind > 0
temp = temp + string(dnaSeq[ind]) + " "
ButtonInd.settext(temp)
for n = nr to nr + 3
row = ceil(n/20)
col = n%20
if col = 0
col = 20
ok
Button[row][col].setStyleSheet(C_ButtonDnaStyle)
Button[row][col].settext(dnaList[n])
next
ok
next
//-----------------------------------------
Output:
Bioinformatics/Subsequence - video
Wren
import "random" for Random
import "./pattern" for Pattern
import "./str" for Str
import "./fmt" for Fmt
var rand = Random.new()
var base = "ACGT"
var findDnaSubsequence = Fn.new { |dnaSize, chunkSize|
var dnaSeq = List.filled(dnaSize, null)
for (i in 0...dnaSize) dnaSeq[i] = base[rand.int(4)]
var dnaStr = dnaSeq.join()
var dnaSubseq = List.filled(4, null)
for (i in 0...4) dnaSubseq[i] = base[rand.int(4)]
var dnaSubstr = dnaSubseq.join()
System.print("DNA sequence:")
var i = chunkSize
for (chunk in Str.chunks(dnaStr, chunkSize)) {
Fmt.print("$3d..$3d: $s", i - chunkSize + 1, i, chunk)
i = i + chunkSize
}
System.print("\nSubsequence to locate: %(dnaSubstr)")
var p = Pattern.new(dnaSubstr)
var matches = p.findAll(dnaStr)
if (matches.count == 0) {
System.print("No matches found.")
} else {
System.print("Matches found at the following indices:")
for (m in matches) {
Fmt.print("$3d..$3d", m.index + 1, m.index + 4)
}
}
}
findDnaSubsequence.call(200, 20)
System.print()
findDnaSubsequence.call(600, 40)
- Output:
DNA sequence: 1.. 20: TATGGGCGCATTATGACAAC 21.. 40: GGCTACTGAAACGAAAATTC 41.. 60: ATGCCTTCGGAGGCTAGACC 61.. 80: ACTCATACATGATTTACAGC 81..100: TAGTCAGTTGCGTCCGCCAT 101..120: CCCGCATAACTATGTATTAC 121..140: GAGCATGTTCTGGCAACCTT 141..160: TCAGTGACAGTTCCTCAGGC 161..180: GCGTTCGCGTTGAAGGCCTC 181..200: CCCACACCGCACCCCTGCCG Subsequence to locate: AATT Matches found at the following indices: 36.. 39 DNA sequence: 1.. 40: GCGCTGAGCGCCCCAGTACAGCGGGTTAAACCGAGCCCGC 41.. 80: TCCGATGAACCAACTCCCATTCCTATAATGGTGCCCCGAC 81..120: ATATTGAATTCGGCGGGTCCGCTATCGGGCTGAGGATGCC 121..160: AATATCTAGGCGCTACCCTGAAGATCCTCAGTTGTGGTGT 161..200: CGCGGAGTGTCGATCCCAGAGCTCCCAATTGACTCAATTA 201..240: CTTTTTCCGTCCTCTTGCTTACGGATTTATGTTTGTGGCA 241..280: GAGGTTATGCTTCAGGCATCCCCATGTTTCCTGAGATACG 281..320: ACCACTGTCAGGTGGCTTGAATCTACCTTGTATTTCCTCT 321..360: AGTACCAGTCACTGTCATCTACTGGAAGCCATATCAGCGT 361..400: TGAAATGTCTATAATTTACTCTCCGGTTGTACCCAAGCGA 401..440: TAACAGCAACGTGTGGGTCTAAAGAGTTCCGCGTTTCGAC 441..480: ATAACGTGCTCCTATTTATCTACCGAAACACCCTATTTTC 481..520: CATCTAACCGGCACCCAATGCGCAGGTGTACGCGTCCTAC 521..560: TACGTTTGAAACGGTTCCATCTCGCCATGTACAATTGTGG 561..600: GGCTACGATTAAGTGTAGTCGGTAATTCAGGGTGAAGTTG Subsequence to locate: TTCG Matches found at the following indices: 89.. 92 435..438