Bioinformatics/Subsequence: Difference between revisions
m →{{header|REXX}}: added commas to the numbers in the output, enhanced title with the length that was used for data field width. |
m →{{header|REXX}}: moved some code to subroutines. |
||
Line 65: | Line 65: | ||
if rndDNA=='' | rndDNA=="," then rndDNA= copies(., rndLen) /*what we're looking for*/ |
if rndDNA=='' | rndDNA=="," then rndDNA= copies(., rndLen) /*what we're looking for*/ |
||
if datatype(seed, 'W') then call random ,,seed /*used to generate repeatable random #s*/ |
if datatype(seed, 'W') then call random ,,seed /*used to generate repeatable random #s*/ |
||
call genRnd /*gen data field of random proteins. |
call genRnd /*gen data field of random proteins. */ |
||
⚫ | |||
⚫ | |||
⚫ | |||
⚫ | |||
⚫ | |||
⚫ | |||
⚫ | |||
⚫ | |||
⚫ | |||
⚫ | |||
⚫ | |||
idx= idx + oWidth /*bump the index number. */ |
|||
⚫ | |||
⚫ | |||
say |
|||
say ' base DNA proteins used: ' basePr |
say ' base DNA proteins used: ' basePr |
||
say 'random DNA proteins used: ' dna? |
say 'random DNA proteins used: ' dna? |
||
call findRnd |
|||
⚫ | |||
⚫ | |||
⚫ | |||
⚫ | |||
say |
|||
if @=='' then do; say "the random DNA proteins weren't found."; exit 4; end |
if @=='' then do; say "the random DNA proteins weren't found."; exit 4; end |
||
say 'the random DNA proteins were found in positions:' strip(@) |
say 'the random DNA proteins were found in positions:' strip(@) |
||
Line 92: | Line 75: | ||
/*──────────────────────────────────────────────────────────────────────────────────────*/ |
/*──────────────────────────────────────────────────────────────────────────────────────*/ |
||
commas: parse arg ?; do jc=length(?)-3 to 1 by -3; ?=insert(',', ?, jc); end; return ? |
commas: parse arg ?; do jc=length(?)-3 to 1 by -3; ?=insert(',', ?, jc); end; return ? |
||
/*──────────────────────────────────────────────────────────────────────────────────────*/ |
|||
⚫ | |||
⚫ | |||
⚫ | |||
⚫ | |||
/*──────────────────────────────────────────────────────────────────────────────────────*/ |
/*──────────────────────────────────────────────────────────────────────────────────────*/ |
||
genRnd: dna?=; use= basePr; upper use basePr rndDNA; lenB= length(basePr) |
genRnd: dna?=; use= basePr; upper use basePr rndDNA; lenB= length(basePr) |
||
Line 100: | Line 88: | ||
dna?= dna? || x /*build a random protein seq.*/ |
dna?= dna? || x /*build a random protein seq.*/ |
||
end /*k*/ |
end /*k*/ |
||
return |
return |
||
/*──────────────────────────────────────────────────────────────────────────────────────*/ |
|||
⚫ | |||
⚫ | |||
⚫ | |||
⚫ | |||
⚫ | |||
⚫ | |||
⚫ | |||
⚫ | |||
⚫ | |||
⚫ | |||
⚫ | |||
say; return</lang> |
|||
{{out|output|text= when using the default inputs:}} |
{{out|output|text= when using the default inputs:}} |
||
<pre> |
<pre> |
Revision as of 20:44, 20 March 2021
- Task
Genarate randomly a string (200 elements) of characters A, C, G, and T representing a DNA sequence write a routine to find the position of subsequence (also generating randomly).
Let length of subsequence equal to 4
Raku
Chances are actually pretty small that a random 4 codon string will show up at all in a random 200 codon sequence. Bump up the sequence size to get a reasonable chance of multiple matches. <lang perl6>use String::Splice:ver<0.0.3>;
my $line = 80;
my $haystack = [~] <A C G T>.roll($line * 8);
say 'Needle: ' ~ my $needle = [~] <A C G T>.roll(4);
my $these = $haystack ~~ m:g/<$needle>/;
my @match = $these.map: { .from, .pos }
printf "From: %3s to %3s\n", |$_ for @match;
my $disp = $haystack.comb.batch($line)».join.join("\n");
for @match.reverse {
$disp.=&splice(.[1] + .[1] div $line, "\e[0m" ); $disp.=&splice(.[0] + .[0] div $line, "\e[31m");
}
say $disp;</lang>
- Output:
Show in custom div to better display highlighting.
Needle: TAGC
From: 159 to 163
From: 262 to 266
From: 315 to 319
From: 505 to 509
From: 632 to 636
CATATGTGACACTGACAGCTCGCGCGAAAATCCGTGTGACGGTCTGAACACTATACTATAGGCCCGGTCGGCATTTGTGG
CTCCCCAGTGGAGAGACCACTCGTCAATTGCTGACGACTTAACACAAATCGAGTCGCCCTTAGTGCCAGACGGGACTCCT
AGCAAAGGGCGGCACGTGGTGACTCCCAATATGTGAGCATGCCATCTAATTGATCTGGGGGGTTTCGCGGGAATACCTAG
GGGCGTTCTGTCCATGGATCTCTAGCCCTGCGAAGAGATACCCGCAGTGAGTTGCACGTGCAAAGAACTTGTAACTAGCG
TATTCTGTATCCGCCGCGCGATATGCTTCTGCGGGATGTACTTCTTGTGACTAAGACTTTGTTATCCAAATTGACCAATA
TTCAACGGTCGACTCTCCGAGGCAGTATCGGTACGCCGAAAAATGGTTACTTCGGCCATACGTAACCTCTCAAGTCACGA
TTACAGCCCACGGGGGCTTACAGCATAGCTCCAAAGACATTCCAATTGAGCTACAACGTGTTCAGTGCGGAGCAGTATCC
AGTACTCGACTGTTATGGTAAAAGGGCATCGTGATCGTTTATATTAATCATTGGGACAGGTGGTTAATGTCATAGCTTAG
REXX
This REXX version allows the user to specify:
- length of the (random) DNA data sequence (default is 200).
- length of the (random) DNA sequence (default is four).
- DNA proteins to be used in the sequence (default is ACGT).
- width of the output lines of (random DNA) (default is 100).
- often (if ever) to add a blank to the output (default is every 10 proteins).
- DNA proteins to be searched in the data (the default is four unique random proteins).
- the seed for the RANDOM function so runs can be repeated with the same data (no default).
<lang rexx>/*REXX pgm gens random DNA (ACGT) sequence & finds positions of a random 4─protein seq. */ parse arg totLen rndLen basePr oWidth Bevery rndDNA seed . if totLen== | totLen=="," then totLen= 200 /*Not specified? Then use the default.*/ if rndLen== | rndLen=="," then rndLen= 4 /* " " " " " " */ if basePr== | basePr=="," then basePr= 'acgt' /* " " " " " " */ if oWidth== | oWidth=="," then oWidth= 100 /* " " " " " " */ if Bevery== | Bevery=="," then Bevery= 10 /* " " " " " " */ if rndDNA== | rndDNA=="," then rndDNA= copies(., rndLen) /*what we're looking for*/ if datatype(seed, 'W') then call random ,,seed /*used to generate repeatable random #s*/ call genRnd /*gen data field of random proteins. */ call show /*show " " " " " */ say ' base DNA proteins used: ' basePr say 'random DNA proteins used: ' dna? call findRnd if @== then do; say "the random DNA proteins weren't found."; exit 4; end say 'the random DNA proteins were found in positions:' strip(@) exit 0 /*stick a fork in it, we're all done. */ /*──────────────────────────────────────────────────────────────────────────────────────*/ commas: parse arg ?; do jc=length(?)-3 to 1 by -3; ?=insert(',', ?, jc); end; return ? /*──────────────────────────────────────────────────────────────────────────────────────*/ findRnd: @=; p=0 /*@: list of the found target proteins*/
do until p==0; p= pos(dna?, $$, p+1); if p>0 then @= @ commas(p) /*Found one? Append it to the "Found"s*/ end /*p*/; return
/*──────────────────────────────────────────────────────────────────────────────────────*/ genRnd: dna?=; use= basePr; upper use basePr rndDNA; lenB= length(basePr)
do k=1 for rndLEN; x= substr(rndDNA, k, 1) if x==. then do; ?= random(1, length(use) ); x= substr(use, ?, 1) use= delstr(use, ?, 1) /*elide so no protein repeats*/ end dna?= dna? || x /*build a random protein seq.*/ end /*k*/ return
/*──────────────────────────────────────────────────────────────────────────────────────*/ show: say " index │"center('DNA sequence of ' commas(totLen) " proteins", oWidth+10)
say "───────┼"center( , oWidth+10, '─') $=; $$=; idx= 1 /*gen data field of random proteins.*/ do j=1 for totLen; c= substr( basePr, random(1, lenB), 1) $$= $$ || c /*append a random protein. */ if Bevery\==0 then if j//Bevery==0 then $= $' ' /*possibly add a blank. */ if length( space($ || c, 0) )<oWidth then do; $= $ || c; iterate; end say strip( right(idx, 7)'│' $, 'T'); $= /*display line ──► terminal.*/ idx= idx + oWidth /*bump the index number. */ end /*j*/ if $\== then say right(idx, 7)"│" strip($, 'T') /*show residual protein data*/ say; return</lang>
- output when using the default inputs:
index │ DNA sequence of 200 proteins ───────┼────────────────────────────────────────────────────────────────────────────────────────────────────────────── 1│ TTTTTAGCG CGTTTTGTAG CGCTCTAAAA ACCGTAGCTA TATTTCTCGA AGTTTCACCC AGCTCTTTTG CCCCAGGGTT GCGCTAAGCC CAGCTTCGAG 101│ GGGGCACAG GTAAAATACT ACCGTCCGTG GAGGGGGATG AATTGACCCG ACATTTTTTG AAGCATAACT CGTGACTCAA TATTGCATGA TTACACCAGC base DNA proteins used: ACGT random DNA proteins used: GCAT the random DNA proteins were found in positions: 162 184
- output when using the inputs of: 1000 , , , , tttt
index │ DNA sequence of 1,000 proteins ───────┼────────────────────────────────────────────────────────────────────────────────────────────────────────────── 1│ GTGATTTTT AGCGCGTTTT GTAGCGCTCT AAAAACCGTA GCTATATTTC TCGAAGTTTC ACCCAGCTCT TTTGCCCCAG GGTTGCGCTA AGCCCAGCTT 101│ GAGGGGGGC ACAGGTAAAA TACTACCGTC CGTGGAGGGG GATGAATTGA CCCGACATTT TTTGAAGCAT AACTCGTGAC TCAATATTGC ATGATTACAC 201│ AGCTAGGTT AGTGTAAAAA CCCCCCTATC TTCCTGATCA ATGGCGAGTA AAACATGCAA CCAATTTGTG AGCGAGTACT GGAAATTATT GTTTACGGGA 301│ AGGCACATG CTACGCGCAA CAGATATCTT AGACTGACCC TTTTAGAGTC ATAAGCCCCT GTCGCCTACA TGCTACTAAT ACTCCAACTA GCGGCGCACC 401│ TCAACCGGA TCATGGCGCC AGGGAAAATG TGGCGTAGCG ACGTGCTCAT CGCTCGCCGG GGAGAGCCTT TCAGAATCTC GAATAAAACC TGGTAATGAC 501│ TCATCAATC GTAATGGTCG TCTGGGGCAA GAAGCCGATA TTATAGACTC AGGTCAGACG TGTGCACAAC GGCAGAATTT ATAGTAATTC GCGTGAACTA 601│ GTTTCGGGA TAGGCCTACG ACCAATCATA GGACATTCGA TGCACGGTGT AGAAACAGTT CTCTGATGTT ACTCGGGATA ACACTCGCAA TCCCCTAGGA 701│ ACCGTGAGC GTCGCTAGTA TCTGAGATAG TCGCGACTGC CCAGCGGTCT TTAAGTTCGC ACACTACGGG ACTCCTAGTT CGCCCATTCA TGGCTATTTT 801│ CCTATCAGT CCAATCCCAC GGGGAGGGCA CTCGCGCAAT TCATTCAAAG AGGGCCATTT GCCGATATAA GGTCCATCAT CGGGAGGAAT ATGACTCCTG 901│ TTAGTATTA GAGCAGCCTC GCTGCGTACT ACTGTCAGTG GCCCGTCAGG GAAGGCAAAA CGTTTTTCCT CTAGGAATCC GTCAATTGGA CTTCTAGACT base DNA proteins used: ACGT random DNA proteins used: TTTT the random DNA proteins were found in positions: 5 6 16 69 157 158 159 340 796 797 962 963
Ring
<lang ring> row = 0 dnaList = [] base = ["A","C","G","T"] long = 20 see "DNA sequence:" + nl see " " + long + ": "
for nr = 1 to 200
row = row + 1 rnd = random(3)+1 baseStr = base[rnd] see baseStr # + " " if (row%20) = 0 and long < 200 long = long + 20 see nl if long < 100 see " " + long + ": " else see "" + long + ": " ok ok add(dnaList,baseStr)
next
strBase = "" for n = 1 to 4
rnd = random(3)+1 strBase = strBase + base[rnd]
next
see "subsequence to search: " + strBase + nl
seqok = 0
for n = 1 to 196
flag = 1 for m = 0 to 3 if dnaList[n+m] != strBase[m+1] flag = 0 exit ok next if flag = 1 seqok = 1 see "start position of sequence = " + n + nl ok
next
if seqok = 0
see "subsequence not found" + nl
ok </lang>
- Output:
DNA sequence: 20: GAGTATAAAAAGCGACATAG 40: AAGCAGGGGGGGAACAGACA 60: ACAATTGTGAAAACTAATCA 80: ATACGGAAAAGGATAAACAT 100: GAGGGACTGCGGTTGGTAGG 120: CGATGAAACCTAAGAATGAA 140: AACGAGGAAGGTGTAAAGTG 160: ATGGGGTCATGGGACAGACA 180: TAGCTAAATGGATAAAAGCG 200: GGTGAAGTCGGTCGCAAACG subsequence to search: ATGA start position of subsequence = 79 start position of subsequence = 103 start position of subsequence = 116
Wren
<lang ecmascript>import "random" for Random import "/pattern" for Pattern import "/str" for Str import "/fmt" for Fmt
var rand = Random.new() var base = "ACGT"
var findDnaSubsequence = Fn.new { |dnaSize, chunkSize|
var dnaSeq = List.filled(dnaSize, null) for (i in 0...dnaSize) dnaSeq[i] = base[rand.int(4)] var dnaStr = dnaSeq.join() var dnaSubseq = List.filled(4, null) for (i in 0...4) dnaSubseq[i] = base[rand.int(4)] var dnaSubstr = dnaSubseq.join() System.print("DNA sequence:") var i = chunkSize for (chunk in Str.chunks(dnaStr, chunkSize)) { Fmt.print("$3d..$3d: $s", i - chunkSize + 1, i, chunk) i = i + chunkSize } System.print("\nSubsequence to locate: %(dnaSubstr)") var p = Pattern.new(dnaSubstr) var matches = p.findAll(dnaStr) if (matches.count == 0) { System.print("No matches found.") } else { System.print("Matches found at the following indices:") for (m in matches) { Fmt.print("$3d..$3d", m.index + 1, m.index + 4) } }
}
findDnaSubsequence.call(200, 20) System.print() findDnaSubsequence.call(600, 40)</lang>
- Output:
DNA sequence: 1.. 20: TATGGGCGCATTATGACAAC 21.. 40: GGCTACTGAAACGAAAATTC 41.. 60: ATGCCTTCGGAGGCTAGACC 61.. 80: ACTCATACATGATTTACAGC 81..100: TAGTCAGTTGCGTCCGCCAT 101..120: CCCGCATAACTATGTATTAC 121..140: GAGCATGTTCTGGCAACCTT 141..160: TCAGTGACAGTTCCTCAGGC 161..180: GCGTTCGCGTTGAAGGCCTC 181..200: CCCACACCGCACCCCTGCCG Subsequence to locate: AATT Matches found at the following indices: 36.. 39 DNA sequence: 1.. 40: GCGCTGAGCGCCCCAGTACAGCGGGTTAAACCGAGCCCGC 41.. 80: TCCGATGAACCAACTCCCATTCCTATAATGGTGCCCCGAC 81..120: ATATTGAATTCGGCGGGTCCGCTATCGGGCTGAGGATGCC 121..160: AATATCTAGGCGCTACCCTGAAGATCCTCAGTTGTGGTGT 161..200: CGCGGAGTGTCGATCCCAGAGCTCCCAATTGACTCAATTA 201..240: CTTTTTCCGTCCTCTTGCTTACGGATTTATGTTTGTGGCA 241..280: GAGGTTATGCTTCAGGCATCCCCATGTTTCCTGAGATACG 281..320: ACCACTGTCAGGTGGCTTGAATCTACCTTGTATTTCCTCT 321..360: AGTACCAGTCACTGTCATCTACTGGAAGCCATATCAGCGT 361..400: TGAAATGTCTATAATTTACTCTCCGGTTGTACCCAAGCGA 401..440: TAACAGCAACGTGTGGGTCTAAAGAGTTCCGCGTTTCGAC 441..480: ATAACGTGCTCCTATTTATCTACCGAAACACCCTATTTTC 481..520: CATCTAACCGGCACCCAATGCGCAGGTGTACGCGTCCTAC 521..560: TACGTTTGAAACGGTTCCATCTCGCCATGTACAATTGTGG 561..600: GGCTACGATTAAGTGTAGTCGGTAATTCAGGGTGAAGTTG Subsequence to locate: TTCG Matches found at the following indices: 89.. 92 435..438