Bioinformatics/Subsequence: Difference between revisions
Line 316: | Line 316: | ||
=={{header|Ring}}== |
=={{header|Ring}}== |
||
<lang ring> |
<lang ring> |
||
/*----------------------------------- |
|||
load "consolecolors.ring" |
|||
# Project : DNA subsequences |
|||
# Date : 2021/03/23 |
|||
# Author : Gal Zsolt (~ CalmoSoft ~) |
|||
# Email : <calmosoft@gmail.com> |
|||
-----------------------------------*/ |
|||
load "stdlibcore.ring" |
|||
row = 0 |
|||
load "guilib.ring" |
|||
base = ["A","C","G","T"] |
|||
dnaList = [] |
dnaList = [] |
||
dnaSeq = [] |
dnaSeq = [] |
||
ColLine = list(21) |
|||
base = ["A","C","G","T"] |
|||
long = 20 |
|||
plus = 0 |
|||
see "DNA sequence:" + nl + nl |
|||
see " 12345678901234567890" + nl |
|||
see " " + long + ": " |
|||
C_Spacing = 2 |
|||
for nr = 1 to 200 |
|||
row = row + 1 |
|||
rnd = random(3)+1 |
|||
baseStr = base[rnd] |
|||
see baseStr |
|||
plusLine() |
|||
add(dnaList,baseStr) |
|||
next |
|||
see nl+ " 12345678901234567890" + nl |
|||
C_ButtonDnaStyle = ' background-color: Red; border-radius: 8px;' |
|||
strDna = list2str(dnaList) |
|||
strDna = substr(strDna,nl,"") |
|||
Button = newlist(10,20) |
|||
while true |
|||
LayoutButtonRow = list(10) |
|||
strBase = "" |
|||
for n = 1 to 4 |
|||
rnd = random(3)+1 |
|||
strBase = strBase + base[rnd] |
|||
next |
|||
ind = substr(strDna,strBase) |
|||
if ind > 0 |
|||
exit |
|||
ok |
|||
end |
|||
app = new qApp |
|||
see nl + "subsequence to search: " + strBase + nl |
|||
{ |
|||
win = new qWidget() { |
|||
setWindowTitle('DNA subsequences') |
|||
setWinIcon(self,AppFile("white.jpg")) |
|||
setStyleSheet('background-color:White') |
|||
setgeometry(560,180,300,300) |
|||
//reSize(400,400) |
|||
winheight = 10 |
|||
fontSize = 8 # + (winheight / 100) |
|||
LayoutButtonMain = new QVBoxLayout() |
|||
seqok = 0 |
|||
LayoutButtonMain.setSpacing(C_Spacing) |
|||
see "start positions of subsequence : " |
|||
LayoutButtonMain.setContentsmargins(0,0,0,0) |
|||
for Col = 1 to 21 |
|||
ColLine[Col] = new qLabel(win) { |
|||
setmaximumheight(20) |
|||
setAlignment(Qt_AlignHCenter | Qt_AlignVCenter) |
|||
setStyleSheet("background-color:darkgray") |
|||
setText(string(Col-1)) |
|||
} |
|||
next |
|||
LayoutTitleRow = new QHBoxLayout() { setSpacing(C_Spacing) setContentsMargins(0,0,0,0) } |
|||
for |
for Col = 1 to 21 |
||
LayoutTitleRow.AddWidget(ColLine[Col]) |
|||
flag = 1 |
|||
next |
|||
LayoutButtonMain.AddLayout(LayoutTitleRow) |
|||
RowLine = list(10) |
|||
next |
|||
if flag = 1 |
|||
add(dnaSeq,n) |
|||
seqok = 1 |
|||
see "" + n + " " |
|||
ok |
|||
next |
|||
for Row = 1 to 10 |
|||
if seqok = 0 |
|||
Letter = "" + Row*20 |
|||
see "sequence not found" + nl |
|||
if Row*20 < 100 |
|||
ok |
|||
Letter = " " + Row*20 |
|||
ok |
|||
RowLine[Row] = new qLabel(win) { //setFont(new qFont("Verdana",fontSize,40,0)) |
|||
setAlignment(Qt_AlignHCenter | Qt_AlignVCenter) |
|||
setStyleSheet("background-color:darkgray") |
|||
setText(Letter) |
|||
} |
|||
next |
|||
for Row = 1 to 10 |
|||
LayoutButtonRow[Row] = new QHBoxLayout() |
|||
{ |
|||
setSpacing(C_Spacing) |
|||
setContentsmargins(0,0,0,0) |
|||
} |
|||
LayoutButtonRow[Row].AddWidget(RowLine[Row]) |
|||
row = 0 |
|||
showDna(dnaList) |
|||
for Col = 1 to 20 |
|||
Button[Row][Col] = new QPushButton(win) { |
|||
setmaximumwidth(20) |
|||
} |
|||
LayoutButtonRow[Row].AddWidget(Button[Row][Col]) |
|||
next |
|||
LayoutButtonMain.AddLayout(LayoutButtonRow[Row]) |
|||
next |
|||
LayoutDataRow = new QHBoxLayout() { setSpacing(C_Spacing) setContentsMargins(0,0,0,0) } |
|||
//LayoutDataRow.AddWidget(PlayScoreBlack) |
|||
setLayout(LayoutButtonMain) |
|||
pStart() |
|||
show() |
|||
} |
|||
exec() |
|||
} |
|||
//----------------------------------------- |
|||
func pStart() |
|||
for nr = 1 to 200 |
|||
rnd = random(3)+1 |
|||
baseStr = base[rnd] |
|||
row = ceil(nr/20) |
|||
col = nr%20 |
|||
if col = 0 |
|||
col = 20 |
|||
ok |
|||
Button[row][col].settext(baseStr) |
|||
add(dnaList,baseStr) |
|||
next |
|||
startDna() |
|||
//----------------------------------------- |
|||
func startDna() |
|||
strDna = list2str(dnaList) |
|||
strDna = substr(strDna,nl,"") |
|||
while true |
|||
strBase = "" |
|||
for n = 1 to 4 |
|||
rnd = random(3)+1 |
|||
strBase = strBase + base[rnd] |
|||
next |
|||
ind = substr(strDna,strBase) |
|||
if ind > 0 |
|||
exit |
|||
ok |
|||
end |
|||
seqok = 0 |
|||
for n = 1 to 196 |
|||
flag = 1 |
|||
for m = 0 to 3 |
|||
if dnaList[n+m] != strBase[m+1] |
|||
flag = 0 |
|||
exit |
|||
ok |
|||
next |
|||
/*if flag = 1 |
|||
add(dnaSeq,n) |
|||
seqok = 1 |
|||
//see "" + n + " " |
|||
ok*/ |
|||
next |
|||
showDna(dnaList) |
|||
//----------------------------------------- |
|||
func showDna(dnaList) |
func showDna(dnaList) |
||
long = 20 |
|||
strDna = list2str(dnaList) |
|||
see nl + "found subsequences:" + nl + nl |
|||
strDna = substr(strDna,nl,"") |
|||
see " 12345678901234567890" + nl |
|||
see " " + long + ": " |
|||
while true |
|||
for nr = 1 to len(dnaList) |
|||
strBase = "" |
|||
for n = 1 to 4 |
|||
rnd = random(3)+1 |
|||
strBase = strBase + base[rnd] |
|||
next |
|||
ind = substr(strDna,strBase) |
|||
if ind > 0 |
|||
exit |
|||
ok |
|||
end |
|||
for n = 1 to 196 |
|||
flag = 1 |
|||
for m = 0 to 3 |
|||
if dnaList[n+m] != strBase[m+1] |
|||
flag = 0 |
|||
exit |
|||
ok |
|||
next |
|||
if flag = 1 |
|||
add(dnaSeq,n) |
|||
seqok = 1 |
|||
see "" + n + " " |
|||
ok |
ok |
||
next |
|||
for nr = 1 to len(dnaList) |
|||
ind = find(dnaSeq,nr) |
ind = find(dnaSeq,nr) |
||
if ind > 0 |
if ind > 0 |
||
for n = nr to nr + 3 |
for n = nr to nr + 3 |
||
row = ceil(n/20) |
|||
col = n%20 |
|||
if col = 0 |
|||
col = 20 |
|||
ok |
|||
Button[row][col].setStyleSheet(C_ButtonDnaStyle) |
|||
Button[row][col].settext(dnaList[n]) |
|||
next |
next |
||
plus = 1 |
|||
if (row%20) = 0 |
|||
row = row + 1 |
|||
nr = nr + 1 |
|||
ok |
|||
else |
|||
plus = 0 |
|||
see dnaList[nr] |
|||
ok |
ok |
||
next |
|||
next |
|||
see nl+ " 12345678901234567890" + nl |
|||
//----------------------------------------- |
|||
func plusLine() |
|||
if (row%20) = 0 and long < 200 |
|||
long = long + 20 |
|||
see nl |
|||
if long < 100 |
|||
see " " + long + ": " |
|||
else |
|||
see "" + long + ": " |
|||
ok |
|||
ok |
|||
</lang> |
</lang> |
||
[https://youtu.be/mwzp3qsgvZk Bioinformatics/Subsequence - video] |
[https://youtu.be/mwzp3qsgvZk Bioinformatics/Subsequence - video] |
||
{{out}} |
|||
<pre> |
|||
DNA sequence: |
|||
12345678901234567890 |
|||
20: CAGTAAATAAGGAGAACAGG |
|||
40: GATCTATCTGCGCAGTTGTT |
|||
60: CAAATCAAGAGGAAAAAGTT |
|||
80: AAATCCAACACGGTAGGATG |
|||
100: CATTGAAAGGTTGCGTAAGA |
|||
120: AAAAAGGAGGGAAATGATCG |
|||
140: AAACAAAGTACGTCAATTAG |
|||
160: ATGCCAAAGACCGATAAAAG |
|||
180: GTATTAGTATTAGAGCAGCG |
|||
200: AATGAGGAAGACTTCGAGAA |
|||
12345678901234567890 |
|||
subsequence to search: AAGA |
|||
start positions of subsequence : 47 97 147 188 |
|||
found subsequences: |
|||
12345678901234567890 |
|||
20: CAGTAAATAAGGAGAACAGG |
|||
40: GATCTATCTGCGCAGTTGTT |
|||
60: CAAATC<span style="color: #CC0000;">AAGA</span>GGAAAAAGTT |
|||
80: AAATCCAACACGGTAGGATG |
|||
100: CATTGAAAGGTTGCGTAAGA |
|||
120: AAAAGGAGGGAAATGATCG |
|||
140: AAACAAAGTACGTCAATTAG |
|||
160: ATGCCAAAGACCGATAAAAG |
|||
180: GTATTAGTATTAGAGCAGCG |
|||
200: AATGAGGAAGACTTCGAGAA |
|||
12345678901234567890 |
|||
</pre> |
|||
=={{header|Wren}}== |
=={{header|Wren}}== |
Revision as of 11:27, 23 March 2021
- Task
Randomly generate a string of 200 DNA bases (represented by A, C, G, and T).
Write a routine to find all the positions of a randomly generated subsequence (four letters).
Factor
<lang factor>USING: accessors formatting grouping io kernel math math.functions.integer-logs math.parser random regexp sequences ;
! Build a random DNA string of n bases drawn from "ACGT".
: new-dna ( n -- str ) [ "ACGT" random ] "" replicate-as ;

! Render the number n as a string left-padded with spaces to d columns.
: pad ( n d -- str ) [ number>string ] dip 32 pad-head ;

! Print seq in rows of n bases, each row prefixed by its starting offset.
! Uses named inputs and a local binding (:>), hence the :: definition form.
! NOTE(review): :: is assumed to be available without an explicit locals
! import in the Factor release this sample targets -- confirm.
:: .dna ( seq n -- )
    seq length integer-log10 1 + :> d  ! column width of the offset prefix
    seq n group [ n * d pad write ": " write write nl ] each-index ;

! Print a single match as "from..to".
: .match ( slice -- ) [ from>> ] [ to>> ] bi "%d..%d\n" printf ;

! Print a heading followed by every match.
: .matches ( slices -- )
    "Matches found at the following indices:" print [ .match ] each ;

! Report the matches, or say so when there are none.
: .locate ( slices -- )
    [ "No matches found." print ] [ .matches ] if-empty ;

! Generate dna-size random bases, print them row-size per row, then pick a
! random 4-base subsequence and report where it occurs.
: .biosub ( dna-size row-size -- )
    [ new-dna dup ] [ .dna nl ] bi*
    4 new-dna dup "Subsequence to locate: %s\n" printf
    <regexp> all-matching-slices .locate ;
80 10 .biosub nl 600 39 .biosub nl</lang>
- Output:
0: ATTCAAGGAC
10: CACTATTAAC
20: CTGCATTGTG
30: AGAACTTGCA
40: GTGTACCGAG
50: AGCGAGTTTA
60: AAGCAACACA
70: TCTTTACCGA

Subsequence to locate: GTAG
No matches found.

0: GATCTCGTCATGGTCCATCCTAACATTTCGGTTGTGGGC
39: GCATCCCGATAGGCGAAGTTAAATCTACGTAGTCCTACG
78: TCACGACGGAACATGATTGCCCACCGAAGTCGTAGGCGA
117: GCTAAAGTCGGTACATACACGATCTGCTATATTCGTTCT
156: CCGACACACGACATGCAATCCGAGAAGCTCTCGAAGTGC
195: GGTCAGATCCTCAGACTCGAACAGAGGAGACCTTAACTG
234: ATACCCACAGTACTTCTCGCATAACCTAAGCACCTATGC
273: TTACACCATCGTCCTGATATTGAGTGAGTCTGGTCGGAG
312: ATATTATCTAGCACCCTCAAGCTCTGTGTGCCACACCAG
351: GATTCCACTTCGCGCTTGCCTAGAGAAAGTAGAGTAGGT
390: GGTGTCATTAGTACACTGTTTGCGATGCACCAACCAAAC
429: CCGACCGCCATGATGACTGCTTTTCGGCCAACGTCAGAT
468: TAAGAGTACTTTTAGTAGCACCGCAAGCCAGCCGGTTTA
507: GCAAGATCCTGCAGCCTCCACGTTATTTCAGGTCTCTAA
546: GCGTTCTTTCCATGGAAGTAGTCACCGCTCCCGTTGCCA
585: ATGGACACAGACGTT

Subsequence to locate: ATAT
Matches found at the following indices:
145..149
289..293
312..316
Julia
<lang julia>DNArand(n, bases=['A', 'T', 'C', 'G']) = String(rand(bases, n))
# Return every range in `haystack` where `needle` occurs, via `findall`.
# `lap` is forwarded as the `overlap` keyword (default `true`).
DNAsearch(needle, haystack, lap=true) = findall(needle, haystack, overlap=lap)
# Generate the 200-base sequence to search and the 4-base subsequence to find.
const rand_string = DNArand(200)
const subseq = DNArand(4)

println("Search sequence:\n$rand_string\nfor substring $subseq. Found at positions: ")
# Print each match range padded to 8 columns, breaking the line after every 10th match.
foreach(p -> print(rpad(p[2], 8), p[1] % 10 == 0 ? "\n" : ""), enumerate(DNAsearch(subseq, rand_string)))
</lang>
- Output:
Search sequence:
CCGAAGCCAGGAGGACTGAGCGCTTGCGTCCCGAGTTCTGCGACGAGTCTCTTCATTATAAGGCCACTGATTGCGCTCATCATGAGTGCCAGAAGCACCGCTAAACATAAGTGTCCTTTCTTCCTGACGCACTTGAAGATTGTGACCATTTGTGCGGGTTGTGAGTTAGGGGCTCTCATTGTACACGATCTATAGTGTGC
for substring CGCT. Found at positions:
21:24   74:77   99:102
Phix
Note: match_all() is due to become a builtin in the next release, so the version below may or may not need renaming/deleting before it will run.
Currently this only searches for non-overlapping sequences, but it should be pretty obvious how to change that; in that case the next underline will simply partially overwrite the previous one, so you'll get e.g. "<=<==>".
constant cheat = false function grandna(integer len) string dna = repeat(' ',len) for i=1 to len do dna[i] = "ACGT"[rand(4)] end for return dna end function procedure show(string dna, sequence idx) idx &= length(dna)+100 -- (add an otherwise unused sentinel) sequence s = split(trim(join_by(split(join_by(dna,1,10,""),"\n"),1,5," ")),"\n") integer ii = 1, -- idx index i = idx[ii], -- current target ux = 1, -- underline index (1..4) ldx = 1 -- line index (1, 51, 101, etc) for si=1 to length(s) do printf(1,"%3d: %s\n",{ldx,s[si]}) ldx += 50 if i and i<ldx then string ul = repeat(' ',59) while i and i<ldx do integer up = i-ldx+51 -- underline pos (relative to ldx) up += floor((up-1)/10)+5 -- (plus any needed spacing) ul[up] = "<==>"[ux] ux += 1 i += 1 if ux>4 then ux = 1 ii += 1 i = idx[ii] end if end while printf(1,"%s\n",ul) end if end for if length(idx) then string s = iff(length(idx)>1?"s":""), t = join(apply(idx,sprint),", ") printf(1,"%s occurs at location%s: %s\n",{test,s,t}) else printf(1,"%s does not occur\n",{test}) end if end procedure function match_all(object needle, sequence haystack, bool bOverlap = false) if atom(needle) then return find_all(needle,haystack) end if integer start = 1 sequence res = {} while 1 do start = match(needle,haystack,start) if start=0 then exit end if res = append(res,start) start += iff(bOverlap?1:length(needle)) end while return res end function string dna = grandna(200), test = grandna(4) constant cheats = iff(cheat?{9,13,49,60,64,68}:{}) for i=1 to length(cheats) do dna[cheats