Bioinformatics/Subsequence: Difference between revisions
No edit summary |
(Ada version) |
||
Line 5: | Line 5: | ||
Write a routine to find all the positions of a randomly generated subsequence (four letters). |
Write a routine to find all the positions of a randomly generated subsequence (four letters). |
||
<br><br> |
<br><br> |
||
=={{header|Ada}}== |
|||
<lang Ada>with Ada.Text_Io; |
|||
with Ada.Strings.Fixed; |
|||
with Ada.Numerics.Discrete_Random; |
|||
procedure Sub_Sequence is |
|||
type Nucleotide is (A, C, G, T); |
|||
function To_Character (N : Nucleotide) return Character |
|||
is (case N is |
|||
when A => 'A', when C => 'C', |
|||
when G => 'G', when T => 'T'); |
|||
package Random_Nucleotide is new Ada.Numerics.Discrete_Random (Nucleotide); |
|||
use Random_Nucleotide; |
|||
package Position_Io is new Ada.Text_Io.Integer_Io (Natural); |
|||
use Ada.Text_Io; |
|||
procedure Put_Bases (Seq : String; Width : Positive) is |
|||
First : Natural := Seq'First; |
|||
begin |
|||
while First < Seq'Last loop |
|||
declare |
|||
Last : constant Natural := |
|||
Natural'Min (First + Width - 1, Seq'Last); |
|||
begin |
|||
Position_Io.Put (First); Put (".."); |
|||
Position_Io.Put (Last); Put (" "); |
|||
Put (Seq (First .. Last)); |
|||
New_Line; |
|||
First := Last + 1; |
|||
end; |
|||
end loop; |
|||
end Put_Bases; |
|||
Gen : Generator; |
|||
Sequence : String (1 .. 405); |
|||
Substring : String (1 .. 4); |
|||
Pos : Natural := 0; |
|||
begin |
|||
Position_Io.Default_Width := 3; |
|||
Reset (Gen); |
|||
Sequence := (others => To_Character (Random (Gen))); |
|||
Substring := (others => To_Character (Random (Gen))); |
|||
Put_Line ("Search sequence:"); |
|||
Put_Bases (Sequence, Width => 50); |
|||
New_Line; |
|||
Put ("Substring to search: "); |
|||
Put (Substring); |
|||
New_Line; |
|||
loop |
|||
Pos := Ada.Strings.Fixed.Index (Sequence, Substring, Pos + 1); |
|||
exit when Pos = 0; |
|||
Put ("Found at position: "); |
|||
Position_Io.Put (Pos); Put (".."); |
|||
Position_Io.Put (Pos + Substring'Length - 1); |
|||
New_Line; |
|||
end loop; |
|||
end Sub_Sequence;</lang> |
|||
{{out}} |
|||
<pre>Search sequence: |
|||
1.. 50 CCTACGGAAAAGTGATAAGGACAGATACATAATCCTAAAACCCTGGAAAA |
|||
51..100 CTTGTCTCGCCAGAGTAGGGCTCGGCAGGGGGGGCAGTGTTTTAAAACGT |
|||
101..150 CAGAGAATAGGCTCTACCTTGTTAGACTGCGAGTACTGGAGCGTAGTTCC |
|||
151..200 TATATTGCAAGCTGCTACAGTAAGTATCAAAGTATGCCACACATCCTTCT |
|||
201..250 ACAACCGGATTGGTTGCCCAGTAGAAGGCTCGTAGTCACCGGACACGCTG |
|||
251..300 TTCTTAAGGTCGGTAAGCTATTACGTCCATGGGAGATTCTCAAGGGTGCG |
|||
301..350 TTAGCGGACCCCCGTTACGTCCACGTATCTTCCGTCCAACTACCCCCTAA |
|||
351..400 TGTCATTGACATCGCCCGAGTATTTAATTTATTTGAACGGCACCAATTTA |
|||
401..405 GAGCT |
|||
Substring to search: TATT |
|||
Found at position: 153..156 |
|||
Found at position: 269..272 |
|||
Found at position: 371..374 |
|||
Found at position: 380..383</pre> |
|||
=={{header|Factor}}== |
=={{header|Factor}}== |
Revision as of 16:11, 9 April 2021
- Task
Randomly generate a string of 200 DNA bases (represented by A, C, G, and T).
Write a routine to find all the positions of a randomly generated subsequence (four letters).
Ada
<lang Ada>with Ada.Text_Io; with Ada.Strings.Fixed; with Ada.Numerics.Discrete_Random;
procedure Sub_Sequence is
type Nucleotide is (A, C, G, T);
function To_Character (N : Nucleotide) return Character is (case N is when A => 'A', when C => 'C', when G => 'G', when T => 'T');
package Random_Nucleotide is new Ada.Numerics.Discrete_Random (Nucleotide); use Random_Nucleotide;
package Position_Io is new Ada.Text_Io.Integer_Io (Natural); use Ada.Text_Io;
procedure Put_Bases (Seq : String; Width : Positive) is First : Natural := Seq'First; begin while First < Seq'Last loop declare Last : constant Natural := Natural'Min (First + Width - 1, Seq'Last); begin Position_Io.Put (First); Put (".."); Position_Io.Put (Last); Put (" "); Put (Seq (First .. Last)); New_Line; First := Last + 1; end; end loop; end Put_Bases;
Gen : Generator; Sequence : String (1 .. 405); Substring : String (1 .. 4); Pos : Natural := 0;
begin
Position_Io.Default_Width := 3;
Reset (Gen);
Sequence := (others => To_Character (Random (Gen))); Substring := (others => To_Character (Random (Gen)));
Put_Line ("Search sequence:"); Put_Bases (Sequence, Width => 50); New_Line;
Put ("Substring to search: "); Put (Substring); New_Line;
loop Pos := Ada.Strings.Fixed.Index (Sequence, Substring, Pos + 1); exit when Pos = 0; Put ("Found at position: "); Position_Io.Put (Pos); Put (".."); Position_Io.Put (Pos + Substring'Length - 1); New_Line; end loop;
end Sub_Sequence;</lang>
- Output:
Search sequence: 1.. 50 CCTACGGAAAAGTGATAAGGACAGATACATAATCCTAAAACCCTGGAAAA 51..100 CTTGTCTCGCCAGAGTAGGGCTCGGCAGGGGGGGCAGTGTTTTAAAACGT 101..150 CAGAGAATAGGCTCTACCTTGTTAGACTGCGAGTACTGGAGCGTAGTTCC 151..200 TATATTGCAAGCTGCTACAGTAAGTATCAAAGTATGCCACACATCCTTCT 201..250 ACAACCGGATTGGTTGCCCAGTAGAAGGCTCGTAGTCACCGGACACGCTG 251..300 TTCTTAAGGTCGGTAAGCTATTACGTCCATGGGAGATTCTCAAGGGTGCG 301..350 TTAGCGGACCCCCGTTACGTCCACGTATCTTCCGTCCAACTACCCCCTAA 351..400 TGTCATTGACATCGCCCGAGTATTTAATTTATTTGAACGGCACCAATTTA 401..405 GAGCT Substring to search: TATT Found at position: 153..156 Found at position: 269..272 Found at position: 371..374 Found at position: 380..383
Factor
<lang factor>USING: accessors formatting grouping io kernel math math.functions.integer-logs math.parser random regexp sequences ;
- new-dna ( n -- str ) [ "ACGT" random ] "" replicate-as ;
- pad ( n d -- str ) [ number>string ] dip 32 pad-head ;
- .dna ( seq n -- )
seq length integer-log10 1 + :> d seq n group [ n * d pad write ": " write write nl ] each-index ;
- .match ( slice -- ) [ from>> ] [ to>> ] bi "%d..%d\n" printf ;
- .matches ( slices -- )
"Matches found at the following indices:" print [ .match ] each ;
- .locate ( slices -- )
[ "No matches found." print ] [ .matches ] if-empty ;
- .biosub ( dna-size row-size -- )
[ new-dna dup ] [ .dna nl ] bi* 4 new-dna dup "Subsequence to locate: %s\n" printf <regexp> all-matching-slices .locate ;
80 10 .biosub nl 600 39 .biosub nl</lang>
- Output:
0: ATTCAAGGAC 10: CACTATTAAC 20: CTGCATTGTG 30: AGAACTTGCA 40: GTGTACCGAG 50: AGCGAGTTTA 60: AAGCAACACA 70: TCTTTACCGA Subsequence to locate: GTAG No matches found. 0: GATCTCGTCATGGTCCATCCTAACATTTCGGTTGTGGGC 39: GCATCCCGATAGGCGAAGTTAAATCTACGTAGTCCTACG 78: TCACGACGGAACATGATTGCCCACCGAAGTCGTAGGCGA 117: GCTAAAGTCGGTACATACACGATCTGCTATATTCGTTCT 156: CCGACACACGACATGCAATCCGAGAAGCTCTCGAAGTGC 195: GGTCAGATCCTCAGACTCGAACAGAGGAGACCTTAACTG 234: ATACCCACAGTACTTCTCGCATAACCTAAGCACCTATGC 273: TTACACCATCGTCCTGATATTGAGTGAGTCTGGTCGGAG 312: ATATTATCTAGCACCCTCAAGCTCTGTGTGCCACACCAG 351: GATTCCACTTCGCGCTTGCCTAGAGAAAGTAGAGTAGGT 390: GGTGTCATTAGTACACTGTTTGCGATGCACCAACCAAAC 429: CCGACCGCCATGATGACTGCTTTTCGGCCAACGTCAGAT 468: TAAGAGTACTTTTAGTAGCACCGCAAGCCAGCCGGTTTA 507: GCAAGATCCTGCAGCCTCCACGTTATTTCAGGTCTCTAA 546: GCGTTCTTTCCATGGAAGTAGTCACCGCTCCCGTTGCCA 585: ATGGACACAGACGTT Subsequence to locate: ATAT Matches found at the following indices: 145..149 289..293 312..316
Julia
<lang julia>DNArand(n, bases=['A', 'T', 'C', 'G']) = String(rand(bases, n))
DNAsearch(needle, haystack, lap=true) = findall(needle, haystack, overlap=lap)
const rand_string = DNArand(200) const subseq = DNArand(4)
println("Search sequence:\n$rand_string\nfor substring $subseq. Found at positions: ") foreach(p -> print(rpad(p[2], 8), p[1] % 10 == 0 ? "\n" : ""), enumerate(DNAsearch(subseq, rand_string)))
</lang>
- Output:
Search sequence: CCGAAGCCAGGAGGACTGAGCGCTTGCGTCCCGAGTTCTGCGACGAGTCTCTTCATTATAAGGCCACTGATTGCGCTCATCATGAGTGCCAGAAGCACCGCTAAACATAAGTGTCCTTTCTTCCTGACGCACTTGAAGATTGTGACCATTTGTGCGGGTTGTGAGTTAGGGGCTCTCATTGTACACGATCTATAGTGTGC for substring CGCT. Found at positions: 21:24 74:77 99:102
Perl
<lang perl>use strict; use warnings; use feature 'say';
my @bases = <A C G T>; my $basecnt = 160;
my($string,$target); $string .= $bases[ int rand @bases ] for 1 .. $basecnt; $target .= $bases[ int rand @bases ] for 1 .. 4; say "Target: $target"; say 'Matches at these positions:'; say (($string =~ s/.{1,40}\K/\n/gr) =~ s/($target)/ >$1< /gr);</lang>
- Output:
Target: CCTG Matches at these positions: 9 90 157 TTGCC >CCTG< CAAAGTTAATAAGTAAACAATTAAGTGAGTG CTCTAGGGTAAGGTGAGGGCGGGAAGGGGAAAAATACCGA TGCGAG >CCTG< TAGAGCCGGGCCTCAAATTAAACGAAAAAT ATAAGTTTGCTTGGCACGCTGTACTACTTATCC >CCTG< ACT
Phix
Note: match_all() is due to become a builtin in the next release, so the version below may or may not need renaming/deleting before it will run.
Currently only searches for non-overlapped sequences, but it should be pretty obvious how to change that, in which case the next underline will simply partially overwrite the previous, so you'll get eg "<=<==>".
constant cheat = false function grandna(integer len) string dna = repeat(' ',len) for i=1 to len do dna[i] = "ACGT"[rand(4)] end for return dna end function procedure show(string dna, sequence idx) idx &= length(dna)+100 -- (add an otherwise unused sentinel) sequence s = split(trim(join_by(split(join_by(dna,1,10,""),"\n"),1,5," ")),"\n") integer ii = 1, -- idx index i = idx[ii], -- current target ux = 1, -- underline index (1..4) ldx = 1 -- line index (1, 51, 101, etc) for si=1 to length(s) do printf(1,"%3d: %s\n",{ldx,s[si]}) ldx += 50 if i and i<ldx then string ul = repeat(' ',59) while i and i<ldx do integer up = i-ldx+51 -- underline pos (relative to ldx) up += floor((up-1)/10)+5 -- (plus any needed spacing) ul[up] = "<==>"[ux] ux += 1 i += 1 if ux>4 then ux = 1 ii += 1 i = idx[ii] end if end while printf(1,"%s\n",ul) end if end for if length(idx) then string s = iff(length(idx)>1?"s":""), t = join(apply(idx,sprint),", ") printf(1,"%s occurs at location%s: %s\n",{test,s,t}) else printf(1,"%s does not occur\n",{test}) end if end procedure function match_all(object needle, sequence haystack, bool bOverlap = false) if atom(needle) then return find_all(needle,haystack) end if integer start = 1 sequence res = {} while 1 do start = match(needle,haystack,start) if start=0 then exit end if res = append(res,start) start += iff(bOverlap?1:length(needle)) end while return res end function string dna = grandna(200), test = grandna(4) constant cheats = iff(cheat?{9,13,49,60,64,68}:{}) for i=1 to length(cheats) do dna[cheats