Bioinformatics/Subsequence
- Task
Randomly generate a string of 200 DNA bases (represented by A, C, G, and T).
Write a routine to find all the positions of a randomly generated subsequence (four letters).
Ada
<lang Ada>with Ada.Text_Io; with Ada.Strings.Fixed; with Ada.Numerics.Discrete_Random;
procedure Sub_Sequence is
type Nucleotide is (A, C, G, T);
function To_Character (N : Nucleotide) return Character is (case N is when A => 'A', when C => 'C', when G => 'G', when T => 'T');
package Random_Nucleotide is new Ada.Numerics.Discrete_Random (Nucleotide); use Random_Nucleotide;
package Position_Io is new Ada.Text_Io.Integer_Io (Natural); use Ada.Text_Io;
procedure Put_Bases (Seq : String; Width : Positive) is First : Natural := Seq'First; begin while First < Seq'Last loop declare Last : constant Natural := Natural'Min (First + Width - 1, Seq'Last); begin Position_Io.Put (First); Put (".."); Position_Io.Put (Last); Put (" "); Put (Seq (First .. Last)); New_Line; First := Last + 1; end; end loop; end Put_Bases;
Gen : Generator; Sequence : String (1 .. 405); Substring : String (1 .. 4); Pos : Natural := 0;
begin
Position_Io.Default_Width := 3;
Reset (Gen);
Sequence := (others => To_Character (Random (Gen))); Substring := (others => To_Character (Random (Gen)));
Put_Line ("Search sequence:"); Put_Bases (Sequence, Width => 50); New_Line;
Put ("Substring to search: "); Put (Substring); New_Line;
loop Pos := Ada.Strings.Fixed.Index (Sequence, Substring, Pos + 1); exit when Pos = 0; Put ("Found at position: "); Position_Io.Put (Pos); Put (".."); Position_Io.Put (Pos + Substring'Length - 1); New_Line; end loop;
end Sub_Sequence;</lang>
- Output:
Search sequence: 1.. 50 CCTACGGAAAAGTGATAAGGACAGATACATAATCCTAAAACCCTGGAAAA 51..100 CTTGTCTCGCCAGAGTAGGGCTCGGCAGGGGGGGCAGTGTTTTAAAACGT 101..150 CAGAGAATAGGCTCTACCTTGTTAGACTGCGAGTACTGGAGCGTAGTTCC 151..200 TATATTGCAAGCTGCTACAGTAAGTATCAAAGTATGCCACACATCCTTCT 201..250 ACAACCGGATTGGTTGCCCAGTAGAAGGCTCGTAGTCACCGGACACGCTG 251..300 TTCTTAAGGTCGGTAAGCTATTACGTCCATGGGAGATTCTCAAGGGTGCG 301..350 TTAGCGGACCCCCGTTACGTCCACGTATCTTCCGTCCAACTACCCCCTAA 351..400 TGTCATTGACATCGCCCGAGTATTTAATTTATTTGAACGGCACCAATTTA 401..405 GAGCT Substring to search: TATT Found at position: 153..156 Found at position: 269..272 Found at position: 371..374 Found at position: 380..383
Arturo
<lang rebol>bases: [`A` `G` `C` `T`] randSeq: join map 1..200 => [sample bases] randSub: join map 1..4 => [sample bases]
idx: 0
print "Random sequence:" print join.with:"\n" split.every: 20 randSeq print ""
print "Looking for subsequence:" print randSub print ""
while [(size randSeq) > idx + 4][
if prefix? slice randSeq idx idx+4 randSub -> print ["Found subsequence at position:" idx] idx: idx + 1
]</lang>
- Output:
Random sequence: CACGCGCGTTAACCCTGCAT CTTTTCTCTAAGATGATGCG CTACTCTGCCCGATTACTAT GATGTCACCGGCGGTTCGGC GACTGGCGCTGGCAGAAAGC GCATGTCAAATTGCCCCAGT GTGCAAGTCCAAGTATTAGT GAGGTGCTCCGCTTCGTCCG GGGTCGACTCGGTCCCACTT CATTACATGTTGGTAATAGT Looking for subsequence: CGGT Found subsequence at position: 71 Found subsequence at position: 169
Factor
<lang factor>USING: accessors formatting grouping io kernel math math.functions.integer-logs math.parser random regexp sequences ;
- new-dna ( n -- str ) [ "ACGT" random ] "" replicate-as ;
- pad ( n d -- str ) [ number>string ] dip 32 pad-head ;
- .dna ( seq n -- )
seq length integer-log10 1 + :> d seq n group [ n * d pad write ": " write write nl ] each-index ;
- .match ( slice -- ) [ from>> ] [ to>> ] bi "%d..%d\n" printf ;
- .matches ( slices -- )
"Matches found at the following indices:" print [ .match ] each ;
- .locate ( slices -- )
[ "No matches found." print ] [ .matches ] if-empty ;
- .biosub ( dna-size row-size -- )
[ new-dna dup ] [ .dna nl ] bi* 4 new-dna dup "Subsequence to locate: %s\n" printf <regexp> all-matching-slices .locate ;
80 10 .biosub nl 600 39 .biosub nl</lang>
- Output:
0: ATTCAAGGAC 10: CACTATTAAC 20: CTGCATTGTG 30: AGAACTTGCA 40: GTGTACCGAG 50: AGCGAGTTTA 60: AAGCAACACA 70: TCTTTACCGA Subsequence to locate: GTAG No matches found. 0: GATCTCGTCATGGTCCATCCTAACATTTCGGTTGTGGGC 39: GCATCCCGATAGGCGAAGTTAAATCTACGTAGTCCTACG 78: TCACGACGGAACATGATTGCCCACCGAAGTCGTAGGCGA 117: GCTAAAGTCGGTACATACACGATCTGCTATATTCGTTCT 156: CCGACACACGACATGCAATCCGAGAAGCTCTCGAAGTGC 195: GGTCAGATCCTCAGACTCGAACAGAGGAGACCTTAACTG 234: ATACCCACAGTACTTCTCGCATAACCTAAGCACCTATGC 273: TTACACCATCGTCCTGATATTGAGTGAGTCTGGTCGGAG 312: ATATTATCTAGCACCCTCAAGCTCTGTGTGCCACACCAG 351: GATTCCACTTCGCGCTTGCCTAGAGAAAGTAGAGTAGGT 390: GGTGTCATTAGTACACTGTTTGCGATGCACCAACCAAAC 429: CCGACCGCCATGATGACTGCTTTTCGGCCAACGTCAGAT 468: TAAGAGTACTTTTAGTAGCACCGCAAGCCAGCCGGTTTA 507: GCAAGATCCTGCAGCCTCCACGTTATTTCAGGTCTCTAA 546: GCGTTCTTTCCATGGAAGTAGTCACCGCTCCCGTTGCCA 585: ATGGACACAGACGTT Subsequence to locate: ATAT Matches found at the following indices: 145..149 289..293 312..316
Go
<lang go>package main
import (
"fmt" "math/rand" "regexp" "time"
)
const base = "ACGT"
func findDnaSubsequence(dnaSize, chunkSize int) {
dnaSeq := make([]byte, dnaSize) for i := 0; i < dnaSize; i++ { dnaSeq[i] = base[rand.Intn(4)] } dnaStr := string(dnaSeq) dnaSubseq := make([]byte, 4) for i := 0; i < 4; i++ { dnaSubseq[i] = base[rand.Intn(4)] } dnaSubstr := string(dnaSubseq) fmt.Println("DNA sequnence:") for i := chunkSize; i <= len(dnaStr); i += chunkSize { start := i - chunkSize fmt.Printf("%3d..%3d: %s\n", start+1, i, dnaStr[start:i]) } fmt.Println("\nSubsequence to locate:", dnaSubstr) var r = regexp.MustCompile(dnaSubstr) var matches = r.FindAllStringIndex(dnaStr, -1) if len(matches) == 0 { fmt.Println("No matches found.") } else { fmt.Println("Matches found at the following indices:") for _, m := range matches { fmt.Printf("%3d..%-3d\n", m[0]+1, m[1]) } }
}
func main() {
rand.Seed(time.Now().UnixNano()) findDnaSubsequence(200, 20) fmt.Println() findDnaSubsequence(600, 40)
}</lang>
- Output:
Sample run:
DNA sequnence: 1.. 20: GTTGCCCACACGTCTTATTG 21.. 40: TAAAAATCACCGTGCAGCGA 41.. 60: GGTTAAAAATGGTAGGAAAA 61.. 80: TATCCTCAGCCAGCGGTGCC 81..100: GGCCAACAAAAGGGACGTTG 101..120: GATTAAAGTAGGTCTAGGTA 121..140: TCTCGTATCCGGTTGATCCG 141..160: GGATGGTGGACGATATTGGA 161..180: GACCGGAGTGTACATCGGTG 181..200: TTGTCGCTTGCAGCTACGGT Subsequence to locate: AATA Matches found at the following indices: 59..62 DNA sequnence: 1.. 40: GTACAGCCACTGTTAGTAGACGGATGCTATTGGGACGCAA 41.. 80: CACATCAGTACACTGCTTGTTCGTAATCGCGTACCCAGCG 81..120: CAAAAGGAGGGGAGGAACCTGCTCAGACTGTCGCTAAAAA 121..160: CGAGCACGTGTCCTTACGCAGTGATGGTAGCGGTCCACGA 161..200: CTTCCACTGGCATAAGGAGAATGTTTAGTAACGCCCCTCA 201..240: TAGGTGCAATTCTACAGGTTAAGGGACCGTGGGATGTTTC 241..280: TATAAAAGTTGAAGAGATTACTAATCCGTCCCGTGCGCGT 281..320: GCCGCAATTTAGCGCCCGTTCTTGAGTAAACATACATGCA 321..360: CGCTCTTGAGTTTTCTAAAACCTGATCAAAACGGTCGCCC 361..400: ACATGCAGGAGCGCCGCAGGGTTTCAGAGGTCAACCATCG 401..440: GCAGCACACGTGAACCCTCTGTACTGACCAGGGGCTTGCT 441..480: CCTTGGTAGGAGATGGTGGAGAATGCGTCGATGCACTGAA 481..520: GCAGACCGCTGATAGCATGTACGATGTTTACGGGTTGACG 521..560: ATAGCTTTGCTAGTGATCGAACATATGATGAAAAACGCTT 561..600: CCATTGATAGAGCATCTTAGGAGCTCAGTCCAGTGACCTC Subsequence to locate: AGGT Matches found at the following indices: 202..205 216..219 388..391
jq
Works with gojq, the Go implementation of jq
Neither jq nor gojq currently has any PRNG built-ins so one possibility is to use a jq-coded PRNG function such as can be found at https://rosettacode.org/wiki/Random_numbers#jq
In practice, it's usually more convenient to use a utility such as gshuf or jot to provide the source of randomness. Here we use `jot -r N MIN MAX` but a fourth argument can also be used to specify a seed. An alternative would be to use `gshuf` along the lines of: <lang sh>
- For 200 pseudo-random integers in the range 0 to 3 inclusive:
gshuf -i 0-3 -r -n 200 --random-source=/dev/random </lang>
Note that the indices shown below are offsets (i.e., the index origin is taken to be 0). <lang sh>
- !/bin/bash
jot -r 200 0 3 | jq -nr --slurpfile four <(jot -r 4 0 3) '
# input: an array of integers def toDNA: def base: . as $in | "ACGT" | .[$in : $in+1]; map(base) | join("");
([inputs] | toDNA) as $strand | ($four | toDNA) as $four | "Strand of length \($strand|length):", $strand, "Zero-based indices of \($four):", ($strand | indices($four) | join(" "))
'</lang>
- Output:
./bioinformatics-subsequence.sh Strand of length 200: TGGGCCCAAGCATTGCCACGTAGCTTTGTCAGTGGGCTTGTAAGGGACGAACACAAACTCACAGACCAGGAATTCTCGAGTTCCAGTCCCCCCACTTGTCGCTATTTAGTTAAGACGTTCAGTTTCGTTGCGAACTGTGTCCCCCAGGCTAACGTGATGGGTGTCAGGAATCAATGGCCAACTTTCAGTTAGACTTGACC Zero-based indices of CAAC: 178 ./bioinformatics-subsequence.sh Strand of length 200: TAAGACTGCAGGGTACGAAGAGTGGAAGATTGGCTCGTACTTGTCGACGTCGCGTGACATAATCTCTGTGCTCGCCTCGCAGTAAGGGACTAGGTCCCGTTCGAGCGCCCTGCTAGAAGGAGCATCCTACCATGCTCTGATGACATCCTGTCGGCATTAGAGTTTCTACGACATCTAAAGAGTACGATCGACTTCCCAGT Zero-based indices of GACA: 55 141 169
Julia
<lang julia>DNArand(n, bases=['A', 'T', 'C', 'G']) = String(rand(bases, n))
DNAsearch(needle, haystack, lap=true) = findall(needle, haystack, overlap=lap)
const rand_string = DNArand(200) const subseq = DNArand(4)
println("Search sequence:\n$rand_string\nfor substring $subseq. Found at positions: ") foreach(p -> print(rpad(p[2], 8), p[1] % 10 == 0 ? "\n" : ""), enumerate(DNAsearch(subseq, rand_string)))
</lang>
- Output:
Search sequence: CCGAAGCCAGGAGGACTGAGCGCTTGCGTCCCGAGTTCTGCGACGAGTCTCTTCATTATAAGGCCACTGATTGCGCTCATCATGAGTGCCAGAAGCACCGCTAAACATAAGTGTCCTTTCTTCCTGACGCACTTGAAGATTGTGACCATTTGTGCGGGTTGTGAGTTAGGGGCTCTCATTGTACACGATCTATAGTGTGC for substring CGCT. Found at positions: 21:24 74:77 99:102
Nim
<lang Nim>import random, sequtils, strutils
proc dnaSequence(n: Positive): string =
## Create a random DNA sequence of length "n". newSeqWith(n, sample("ACGT")).join()
proc positions(dnaSeq, subSeq: string): seq[int] =
## Return the list of starting positions of a subsequence ## "subSeq" in a sequence "dnaSeq". Positions start at 1. var start = 0 while true: let pos = dnaSeq.find(subSeq, start) if pos < 0: break result.add pos + 1 start = pos + 1
when isMainModule:
const N = 200 Step = 20
randomize()
let dnaSeq = dnaSequence(N) echo "DNA sequence:" for i in countup(0, N - 1, Step): echo ($(i+1)).align(3), ' ', dnaSeq[i..i+(Step-1)]
let subSeq = dnaSequence(3) echo "\nDNA subsequence: ", subSeq
echo() let pos = dnaSeq.positions(subSeq) if pos.len == 0: echo "Subsequence not found." else: let tail = if pos.len == 1: ": " else: "s: " echo "Subsequence found at position", tail, pos.join(", ")</lang>
- Output:
DNA sequence: 1 CACATACGATGAGCTGGGCG 21 CCTAAGAGGCGGAAAGACAA 41 CCGTGTGTGTCTAACCCATG 61 GTTTAATTGCAGATAGTCTC 81 TAGACTACAAACATTAGAGC 101 AATGCACCGGGGTGCACGTG 121 TGTTTTGACTTCCCATGAAA 141 GCCCTTATCCTAGAGTACAG 161 TCGGCAAATGTTCGCTCCTT 181 GGCCCACTCCATTTGGACGG DNA subsequence: GTT Subsequence found at positions: 61, 122, 170
Perl
<lang perl>use strict; use warnings; use feature 'say';
my @bases = <A C G T>; my $basecnt = 160;
my($string,$target); $string .= $bases[ int rand @bases ] for 1 .. $basecnt; $target .= $bases[ int rand @bases ] for 1 .. 4; say "Target: $target"; say 'Matches at these positions:'; say (($string =~ s/.{1,40}\K/\n/gr) =~ s/($target)/ >$1< /gr);</lang>
- Output:
Target: CCTG Matches at these positions: 9 90 157 TTGCC >CCTG< CAAAGTTAATAAGTAAACAATTAAGTGAGTG CTCTAGGGTAAGGTGAGGGCGGGAAGGGGAAAAATACCGA TGCGAG >CCTG< TAGAGCCGGGCCTCAAATTAAACGAAAAAT ATAAGTTTGCTTGGCACGCTGTACTACTTATCC >CCTG< ACT
Phix
Currently only searches for non-overlapped sequences, but it should be pretty obvious how to change that, in which case the next underline will simply partially overwrite the previous, so you'll get eg "<=<==>".
with javascript_semantics constant cheat = false function grandna(integer len) string dna = repeat(' ',len) for i=1 to len do dna[i] = "ACGT"[rand(4)] end for return dna end function procedure show(string dna, test, sequence idx) idx = deep_copy(idx) & length(dna)+100 -- (add an otherwise unused sentinel) sequence s = split(trim(join_by(split(join_by(dna,1,10,""),"\n"),1,5," ")),"\n") integer ii = 1, -- idx index i = idx[ii], -- current target ux = 1, -- underline index (1..4) ldx = 1 -- line index (1, 51, 101, etc) for si=1 to length(s) do printf(1,"%3d: %s\n",{ldx,s[si]}) ldx += 50 if i and i<ldx then string ul = repeat(' ',59) while i and i<ldx do integer up = i-ldx+51 -- underline pos (relative to ldx) up += floor((up-1)/10)+5 -- (plus any needed spacing) ul[up] = "<==>"[ux] ux += 1 i += 1 if ux>4 then ux = 1 ii += 1 i = idx[ii] end if end while printf(1,"%s\n",ul) end if end for if length(idx)>1 then string p = iff(length(idx)>1?"s":""), t = join(apply(idx[1..$-1],sprint),", ") printf(1,"%s occurs at location%s: %s\n",{test,p,t}) else printf(1,"%s does not occur\n",{test}) end if end procedure string dna = grandna(200), test = grandna(4) constant cheats = iff(cheat?{9,13,49,60,64,68}:{}) for i=1 to length(cheats) do dna[cheats[i]..cheats[i]+3] = test end for sequence idx = match_all(test,dna) show(dna,test,idx)
- Output:
with cheat enabled
1: GGAGATATCG ACCGACCGAA GTAAAGTCAA AGTCGTCCAA TCCACGGACG <= =><==> <= 51: ACTTCAGCAC GACCGACCGA CCTATTTAAG AGACCACACT TAAGGAATCC => < ==><==><== > 101: ATGCGAAATA AAAATGGGCG AGTAGCCGTG GGGCGCTAAA GCACCCACCT 151: AGTTTTCGCC GAAGTACTAG ACCACCTTCG GATCGACAAA GCTTTCACCA <==> CGAC occurs at locations: 9, 13, 49, 60, 64, 68, 184
with cheat disabled
1: TGATTTAAAC CGTGGTGCAA TTTATAAACA CTGCGATATG CCTCCTGATG 51: GCATGGTATT CGACACCAAG ACGCTGGTGG GCACACTGGC TTTCAGAATA 101: GGAGTCACAA TCCCTCTATG ATGTCCTCTA GCGGGTGTGT GTTCAGTGCC 151: AGCGCTTACT TCCGGCGTGG CCGACTCTTT TTAAAGCGTA TAGCTGGGGT GCTA does not occur
Python
<lang python> from random import choice import regex as re import time
def generate_sequence(n: int ) -> list:
return "".join([ choice(['A','C','G','T']) for _ in range(n) ])
def dna_findall(needle: str, haystack: str) -> None:
if sum(1 for _ in re.finditer(needle, haystack, overlapped=True)) == 0: print("No matches found") else: print(f"Found {needle} at the following indices: ") for match in re.finditer(needle, haystack, overlapped=True): print(f"{match.start()}:{match.end()} ")
dna_seq = generate_sequence(200) sample_seq = generate_sequence(4)
c = 1 for i in dna_seq:
print(i, end="") if c % 20 != 0 else print(f"{i}") c += 1
print(f"\nSearch Sample: {sample_seq}")
dna_findall(sample_seq, dna_seq) </lang>
- Output:
TTGCCCCTGTACTGAGCCCA TAAGCTTGCACTCAAGGTTT TGCCCCCTCATATTATAACG CATCCATTATACAAAACCGA TACCCTTCCGCATATTATGA AAAGTGGCGAAGTGCCTTGA TTTGCATTCATAGTACAACG GTGCAAAAGCATTGTATGTC TCACATTTACATGGGAAATG CCTAGTAGGTGCAAGACCTG Search Sample: TACA Found TACA at the following indices: 69:73 133:137 167:171
Raku
Chances are actually pretty small that a random 4 codon string will show up at all in a random 200 codon sequence. Bump up the sequence size to get a reasonable chance of multiple matches. <lang perl6>use String::Splice:ver<0.0.3>;
my $line = 80;
my $haystack = [~] <A C G T>.roll($line * 8);
say 'Needle: ' ~ my $needle = [~] <A C G T>.roll(4);
my $these = $haystack ~~ m:g/<$needle>/;
my @match = $these.map: { .from, .pos }
printf "From: %3s to %3s\n", |$_ for @match;
my $disp = $haystack.comb.batch($line)».join.join("\n");
for @match.reverse {
$disp.=&splice(.[1] + .[1] div $line, "\e[0m" ); $disp.=&splice(.[0] + .[0] div $line, "\e[31m");
}
say $disp;</lang>
- Output:
Show in custom div to better display highlighting.
Needle: TAGC
From: 159 to 163
From: 262 to 266
From: 315 to 319
From: 505 to 509
From: 632 to 636
CATATGTGACACTGACAGCTCGCGCGAAAATCCGTGTGACGGTCTGAACACTATACTATAGGCCCGGTCGGCATTTGTGG
CTCCCCAGTGGAGAGACCACTCGTCAATTGCTGACGACTTAACACAAATCGAGTCGCCCTTAGTGCCAGACGGGACTCCT
AGCAAAGGGCGGCACGTGGTGACTCCCAATATGTGAGCATGCCATCTAATTGATCTGGGGGGTTTCGCGGGAATACCTAG
GGGCGTTCTGTCCATGGATCTCTAGCCCTGCGAAGAGATACCCGCAGTGAGTTGCACGTGCAAAGAACTTGTAACTAGCG
TATTCTGTATCCGCCGCGCGATATGCTTCTGCGGGATGTACTTCTTGTGACTAAGACTTTGTTATCCAAATTGACCAATA
TTCAACGGTCGACTCTCCGAGGCAGTATCGGTACGCCGAAAAATGGTTACTTCGGCCATACGTAACCTCTCAAGTCACGA
TTACAGCCCACGGGGGCTTACAGCATAGCTCCAAAGACATTCCAATTGAGCTACAACGTGTTCAGTGCGGAGCAGTATCC
AGTACTCGACTGTTATGGTAAAAGGGCATCGTGATCGTTTATATTAATCATTGGGACAGGTGGTTAATGTCATAGCTTAG
REXX
This REXX version allows the user to specify:
- length of the (random) DNA data sequence (default is 200).
- length of the (random) DNA sequence (default is four).
- DNA proteins to be used in the sequence (default is ACGT).
- width of the output lines of (random DNA) (default is 100).
- often (if ever) to add a blank to the output (default is every 10 proteins).
- DNA proteins to be searched in the data (the default is four unique random proteins).
- the seed for the RANDOM function so runs can be repeated with the same data (no default).
<lang rexx>/*REXX pgm gens random DNA (ACGT) sequence & finds positions of a random 4─protein seq. */ parse arg totLen rndLen basePr oWidth Bevery rndDNA seed . if totLen== | totLen=="," then totLen= 200 /*Not specified? Then use the default.*/ if rndLen== | rndLen=="," then rndLen= 4 /* " " " " " " */ if basePr== | basePr=="," then basePr= 'acgt' /* " " " " " " */ if oWidth== | oWidth=="," then oWidth= 100 /* " " " " " " */ if Bevery== | Bevery=="," then Bevery= 10 /* " " " " " " */ if rndDNA== | rndDNA=="," then rndDNA= copies(., rndLen) /*what we're looking for*/ if datatype(seed, 'W') then call random ,,seed /*used to generate repeatable random #s*/ call genRnd /*gen data field of random proteins. */ call show /*show " " " " " */ say ' base DNA proteins used: ' basePr say 'random DNA proteins used: ' dna? call findRnd if @== then do; say "the random DNA proteins weren't found."; exit 4; end say 'the random DNA proteins were found in positions:' strip(@) exit 0 /*stick a fork in it, we're all done. */ /*──────────────────────────────────────────────────────────────────────────────────────*/ commas: parse arg ?; do jc=length(?)-3 to 1 by -3; ?=insert(',', ?, jc); end; return ? /*──────────────────────────────────────────────────────────────────────────────────────*/ findRnd: @=; p=0 /*@: list of the found target proteins*/
do until p==0; p= pos(dna?, $$, p+1); if p>0 then @= @ commas(p) /*Found one? Append it to the "Found"s*/ end /*p*/; return
/*──────────────────────────────────────────────────────────────────────────────────────*/ genRnd: dna?=; use= basePr; upper use basePr rndDNA; lenB= length(basePr)
do k=1 for rndLEN; x= substr(rndDNA, k, 1) if x==. then do; ?= random(1, length(use) ); x= substr(use, ?, 1) use= delstr(use, ?, 1) /*elide so no protein repeats*/ end dna?= dna? || x /*build a random protein seq.*/ end /*k*/ return
/*──────────────────────────────────────────────────────────────────────────────────────*/ show: say " index │"center('DNA sequence of ' commas(totLen) " proteins", oWidth+10)
say "───────┼"center( , oWidth+10, '─') $=; $$=; idx= 1 /*gen data field of random proteins.*/ do j=1 for totLen; c= substr( basePr, random(1, lenB), 1) $$= $$ || c /*append a random protein. */ if Bevery\==0 then if j//Bevery==0 then $= $' ' /*possibly add a blank. */ if length( space($ || c, 0) )<oWidth then do; $= $ || c; iterate; end say strip( right(idx, 7)'│' $, 'T'); $= /*display line ──► terminal.*/ idx= idx + oWidth /*bump the index number. */ end /*j*/ if $\== then say right(idx, 7)"│" strip($, 'T') /*show residual protein data*/ say "───────┴"center( , oWidth+10, '─') say; return</lang>
- output when using the default inputs:
index │ DNA sequence of 200 proteins ───────┼────────────────────────────────────────────────────────────────────────────────────────────────────────────── 1│ TTTTTAGCG CGTTTTGTAG CGCTCTAAAA ACCGTAGCTA TATTTCTCGA AGTTTCACCC AGCTCTTTTG CCCCAGGGTT GCGCTAAGCC CAGCTTCGAG 101│ GGGGCACAG GTAAAATACT ACCGTCCGTG GAGGGGGATG AATTGACCCG ACATTTTTTG AAGCATAACT CGTGACTCAA TATTGCATGA TTACACCAGC ───────┴────────────────────────────────────────────────────────────────────────────────────────────────────────────── base DNA proteins used: ACGT random DNA proteins used: GCAT the random DNA proteins were found in positions: 162 184
- output when using the inputs of: 1000 , , , , tttt
index │ DNA sequence of 1,000 proteins ───────┼────────────────────────────────────────────────────────────────────────────────────────────────────────────── 1│ GTGATTTTT AGCGCGTTTT GTAGCGCTCT AAAAACCGTA GCTATATTTC TCGAAGTTTC ACCCAGCTCT TTTGCCCCAG GGTTGCGCTA AGCCCAGCTT 101│ GAGGGGGGC ACAGGTAAAA TACTACCGTC CGTGGAGGGG GATGAATTGA CCCGACATTT TTTGAAGCAT AACTCGTGAC TCAATATTGC ATGATTACAC 201│ AGCTAGGTT AGTGTAAAAA CCCCCCTATC TTCCTGATCA ATGGCGAGTA AAACATGCAA CCAATTTGTG AGCGAGTACT GGAAATTATT GTTTACGGGA 301│ AGGCACATG CTACGCGCAA CAGATATCTT AGACTGACCC TTTTAGAGTC ATAAGCCCCT GTCGCCTACA TGCTACTAAT ACTCCAACTA GCGGCGCACC 401│ TCAACCGGA TCATGGCGCC AGGGAAAATG TGGCGTAGCG ACGTGCTCAT CGCTCGCCGG GGAGAGCCTT TCAGAATCTC GAATAAAACC TGGTAATGAC 501│ TCATCAATC GTAATGGTCG TCTGGGGCAA GAAGCCGATA TTATAGACTC AGGTCAGACG TGTGCACAAC GGCAGAATTT ATAGTAATTC GCGTGAACTA 601│ GTTTCGGGA TAGGCCTACG ACCAATCATA GGACATTCGA TGCACGGTGT AGAAACAGTT CTCTGATGTT ACTCGGGATA ACACTCGCAA TCCCCTAGGA 701│ ACCGTGAGC GTCGCTAGTA TCTGAGATAG TCGCGACTGC CCAGCGGTCT TTAAGTTCGC ACACTACGGG ACTCCTAGTT CGCCCATTCA TGGCTATTTT 801│ CCTATCAGT CCAATCCCAC GGGGAGGGCA CTCGCGCAAT TCATTCAAAG AGGGCCATTT GCCGATATAA GGTCCATCAT CGGGAGGAAT ATGACTCCTG 901│ TTAGTATTA GAGCAGCCTC GCTGCGTACT ACTGTCAGTG GCCCGTCAGG GAAGGCAAAA CGTTTTTCCT CTAGGAATCC GTCAATTGGA CTTCTAGACT ───────┴────────────────────────────────────────────────────────────────────────────────────────────────────────────── base DNA proteins used: ACGT random DNA proteins used: TTTT the random DNA proteins were found in positions: 5 6 16 69 157 158 159 340 796 797 962 963
Ring
<lang ring> /*-----------------------------------
- Project : DNA subsequences
- Date : 2021/03/23
- Author : Gal Zsolt (~ CalmoSoft ~)
- Email : <calmosoft@gmail.com>
*/
//-----------------------------------------
load "stdlibcore.ring" load "guilib.ring"
start = 0 base = ["A","C","G","T"] dnaList = [] dnaSeq = [] ColLine = list(21)
C_Spacing = 2
C_ButtonDnaStyle = ' background-color: Red; border-radius: 8px;' C_ButtonStyle = '"background-color:white"; border-radius: 8px;' Button = newlist(10,20) LayoutButtonRow = list(10)
//-----------------------------------------
app = new qApp {
win = new qWidget() { setWindowTitle('DNA subsequences')
setWinIcon(self,AppFile("white.jpg"))
setStyleSheet('background-color:White') setgeometry(560,180,300,300) //reSize(400,400) winheight = 10 fontSize = 8 # + (winheight / 100)
LayoutButtonMain = new QVBoxLayout() LayoutButtonMain.setSpacing(C_Spacing) LayoutButtonMain.setContentsmargins(0,0,0,0)
LabelInd = new qLabel(win) { settext(" DNA subsequences start positions:") setAlignment(Qt_AlignHCenter | Qt_AlignVCenter) setStyleSheet("background-color:yellow") }
ButtonInd = new QPushButton(win) { setStyleSheet("background-color:yellow") }
LabelFind = new qLabel(win) { settext(" DNA subsequence to find:") setStyleSheet("background-color:yellow") }
ButtonFind = new QPushButton(win)
DnaSearch = new QPushButton(win) { setclickevent("pstart()") setStyleSheet("background-color:yellow") settext("Find") } for Col = 1 to 21 ColLine[Col] = new qLabel(win) { setmaximumheight(20) setAlignment(Qt_AlignHCenter | Qt_AlignVCenter) setStyleSheet("background-color:darkgray") setText(string(Col-1)) } next
LayoutInd = new QHBoxLayout() { setSpacing(C_Spacing) setContentsMargins(0,0,0,0) } LayoutInd.AddWidget(LabelInd) LayoutInd.AddWidget(ButtonInd) LayoutButtonMain.AddLayout(LayoutInd) LayoutTitleRow = new QHBoxLayout() { setSpacing(C_Spacing) setContentsMargins(0,0,0,0) }
for Col = 1 to 21 LayoutTitleRow.AddWidget(ColLine[Col]) next LayoutButtonMain.AddLayout(LayoutTitleRow) RowLine = list(10)
for Row = 1 to 10 Letter = "" + Row*20 if Row*20 < 100 Letter = " " + Row*20 ok RowLine[Row] = new qLabel(win) { setFont(new qFont("Verdana",fontSize,40,0)) setAlignment(Qt_AlignHCenter | Qt_AlignVCenter) setStyleSheet("background-color:darkgray") setText(Letter) } next
for Row = 1 to 10 LayoutButtonRow[Row] = new QHBoxLayout() { setSpacing(C_Spacing) setContentsmargins(0,0,0,0) }
LayoutButtonRow[Row].AddWidget(RowLine[Row]) for Col = 1 to 20 Button[Row][Col] = new QPushButton(win) { setmaximumwidth(20) } LayoutButtonRow[Row].AddWidget(Button[Row][Col]) next LayoutButtonMain.AddLayout(LayoutButtonRow[Row]) next
LayoutDataRow = new QHBoxLayout() { setSpacing(C_Spacing) setContentsMargins(0,0,0,0) }
LayoutDataRow.AddWidget(LabelFind) LayoutDataRow.AddWidget(ButtonFind) LayoutDataRow.AddWidget(DnaSearch) LayoutButtonMain.AddLayout(LayoutDataRow) setLayout(LayoutButtonMain) pStart() show() } exec() }
//-----------------------------------------
func pStart()
start = start + 1
dnaList = [] for row = 1 to 10 for col = 1 to 20 Button[row][col].settext("") next next for nr = 1 to 200 rnd = random(3)+1 baseStr = base[rnd] row = ceil(nr/20) col = nr%20 if col = 0 col = 20 ok Button[row][col].settext(baseStr) add(dnaList,baseStr) next
startDna()
//-----------------------------------------
func startDna()
strDna = list2str(dnaList) strDna = substr(strDna,nl,"")
while true strBase = "" for n = 1 to 4 rnd = random(3)+1 strBase = strBase + base[rnd] next ind = substr(strDna,strBase) if ind > 0 exit ok end
showDna(dnaList)
//-----------------------------------------
func showDna(dnaList)
if start > 1 see nl for n = 1 to len(dnaSeq) for m = 0 to 3 ind = dnaSeq[n] + m row = ceil(ind/20) col = ind%20 if col = 0 col = 20 ok Button[row][col].setstylesheet(C_ButtonStyle) next next ok
dnaSeq = [] strDna = list2str(dnaList) strDna = substr(strDna,nl,"")
while true strBase = "" for n = 1 to 4 rnd = random(3)+1 strBase = strBase + base[rnd] next ind = substr(strDna,strBase) if ind > 0 exit ok end
ButtonFind.setStyleSheet("background-color:yellow") ButtonFind.settext(strBase)
for n = 1 to 196 flag = 1 for m = 0 to 3 if dnaList[n+m] != strBase[m+1] flag = 0 exit ok next if flag = 1 add(dnaSeq,n) ok next temp = "" ButtonInd.settext("") for nr = 1 to len(dnaList) ind = find(dnaSeq,nr) if ind > 0 temp = temp + string(dnaSeq[ind]) + " " ButtonInd.settext(temp) for n = nr to nr + 3 row = ceil(n/20) col = n%20 if col = 0 col = 20 ok Button[row][col].setStyleSheet(C_ButtonDnaStyle) Button[row][col].settext(dnaList[n]) next ok next
//----------------------------------------- </lang>
Output:
Bioinformatics/Subsequence - video
Wren
<lang ecmascript>import "random" for Random import "/pattern" for Pattern import "/str" for Str import "/fmt" for Fmt
var rand = Random.new() var base = "ACGT"
var findDnaSubsequence = Fn.new { |dnaSize, chunkSize|
var dnaSeq = List.filled(dnaSize, null) for (i in 0...dnaSize) dnaSeq[i] = base[rand.int(4)] var dnaStr = dnaSeq.join() var dnaSubseq = List.filled(4, null) for (i in 0...4) dnaSubseq[i] = base[rand.int(4)] var dnaSubstr = dnaSubseq.join() System.print("DNA sequence:") var i = chunkSize for (chunk in Str.chunks(dnaStr, chunkSize)) { Fmt.print("$3d..$3d: $s", i - chunkSize + 1, i, chunk) i = i + chunkSize } System.print("\nSubsequence to locate: %(dnaSubstr)") var p = Pattern.new(dnaSubstr) var matches = p.findAll(dnaStr) if (matches.count == 0) { System.print("No matches found.") } else { System.print("Matches found at the following indices:") for (m in matches) { Fmt.print("$3d..$3d", m.index + 1, m.index + 4) } }
}
findDnaSubsequence.call(200, 20) System.print() findDnaSubsequence.call(600, 40)</lang>
- Output:
DNA sequence: 1.. 20: TATGGGCGCATTATGACAAC 21.. 40: GGCTACTGAAACGAAAATTC 41.. 60: ATGCCTTCGGAGGCTAGACC 61.. 80: ACTCATACATGATTTACAGC 81..100: TAGTCAGTTGCGTCCGCCAT 101..120: CCCGCATAACTATGTATTAC 121..140: GAGCATGTTCTGGCAACCTT 141..160: TCAGTGACAGTTCCTCAGGC 161..180: GCGTTCGCGTTGAAGGCCTC 181..200: CCCACACCGCACCCCTGCCG Subsequence to locate: AATT Matches found at the following indices: 36.. 39 DNA sequence: 1.. 40: GCGCTGAGCGCCCCAGTACAGCGGGTTAAACCGAGCCCGC 41.. 80: TCCGATGAACCAACTCCCATTCCTATAATGGTGCCCCGAC 81..120: ATATTGAATTCGGCGGGTCCGCTATCGGGCTGAGGATGCC 121..160: AATATCTAGGCGCTACCCTGAAGATCCTCAGTTGTGGTGT 161..200: CGCGGAGTGTCGATCCCAGAGCTCCCAATTGACTCAATTA 201..240: CTTTTTCCGTCCTCTTGCTTACGGATTTATGTTTGTGGCA 241..280: GAGGTTATGCTTCAGGCATCCCCATGTTTCCTGAGATACG 281..320: ACCACTGTCAGGTGGCTTGAATCTACCTTGTATTTCCTCT 321..360: AGTACCAGTCACTGTCATCTACTGGAAGCCATATCAGCGT 361..400: TGAAATGTCTATAATTTACTCTCCGGTTGTACCCAAGCGA 401..440: TAACAGCAACGTGTGGGTCTAAAGAGTTCCGCGTTTCGAC 441..480: ATAACGTGCTCCTATTTATCTACCGAAACACCCTATTTTC 481..520: CATCTAACCGGCACCCAATGCGCAGGTGTACGCGTCCTAC 521..560: TACGTTTGAAACGGTTCCATCTCGCCATGTACAATTGTGG 561..600: GGCTACGATTAAGTGTAGTCGGTAATTCAGGGTGAAGTTG Subsequence to locate: TTCG Matches found at the following indices: 89.. 92 435..438