Bioinformatics/Sequence mutation
You are encouraged to solve this task according to the task description, using any language you may know.
- Task
Given a string of characters A, C, G, and T representing a DNA sequence write a routine to mutate the sequence, (string) by:
- Choosing a random base position in the sequence.
- Mutate the sequence by doing one of either:
- Swap the base at that position by changing it to one of A, C, G, or T. (which has a chance of swapping the base for the same base)
- Delete the chosen base at the position.
- Insert another base randomly chosen from A,C, G, or T into the sequence at that position.
- Randomly generate a test DNA sequence of at least 200 bases
- "Pretty print" the sequence and a count of its size, and the count of each base in the sequence
- Mutate the sequence ten times.
- "Pretty print" the sequence after all mutations, and a count of its size, and the count of each base in the sequence.
- Extra credit
- Give more information on the individual mutations applied.
- Allow mutations to be weighted and/or chosen.
11l
UInt32 seed = 0
F nonrandom(n)
:seed = 1664525 * :seed + 1013904223
R Int(:seed >> 16) % n
F nonrandom_choice(lst)
R lst[nonrandom(lst.len)]
F basecount(dna)
DefaultDict[Char, Int] d
L(c) dna
d[c]++
R sorted(d.items())
F seq_split(dna, n = 50)
R (0 .< dna.len).step(n).map(i -> @dna[i .< i + @n])
F seq_pp(dna, n = 50)
L(part) seq_split(dna, n)
print(‘#5: #.’.format(L.index * n, part))
print("\n BASECOUNT:")
V tot = 0
L(base, count) basecount(dna)
print(‘ #3: #.’.format(base, count))
tot += count
V (base, count) = (‘TOT’, tot)
print(‘ #3= #.’.format(base, count))
F seq_mutate(String =dna; count = 1, kinds = ‘IDSSSS’, choice = ‘ATCG’)
[(String, Int)] mutation
V k2txt = [‘I’ = ‘Insert’, ‘D’ = ‘Delete’, ‘S’ = ‘Substitute’]
L 0 .< count
V kind = nonrandom_choice(kinds)
V index = nonrandom(dna.len + 1)
I kind == ‘I’
dna = dna[0 .< index]‘’nonrandom_choice(choice)‘’dna[index..]
E I kind == ‘D’ & !dna.empty
dna = dna[0 .< index]‘’dna[index+1..]
E I kind == ‘S’ & !dna.empty
dna = dna[0 .< index]‘’nonrandom_choice(choice)‘’dna[index+1..]
mutation.append((k2txt[kind], index))
R (dna, mutation)
print(‘SEQUENCE:’)
V sequence = ‘TCAATCATTAATCGATTAATACATTCAATTTGAACATCTCCAGGAGAAGGCAGGGTAATCTCGTGTAGCCGTGCTTGGGGCCTCCGATATGGCCGGGGAATTTCAAAGTATAGTGTGCATCCCCTCATAATACATAGATCTATAGGTAAGTATATGGGTTGACGTTGTTAGATGCGATACACGTGCACACTTTATGAATTTTACGTTCCTCTGCCTAGAGTGCCAAGTTTCAATTTGCTACGGTTCCTCA’
seq_pp(sequence)
print("\n\nMUTATIONS:")
V (mseq, m) = seq_mutate(sequence, 10)
L(kind, index) m
print(‘ #10 @#.’.format(kind, index))
print()
seq_pp(mseq)
- Output:
SEQUENCE: 0: TCAATCATTAATCGATTAATACATTCAATTTGAACATCTCCAGGAGAAGG 50: CAGGGTAATCTCGTGTAGCCGTGCTTGGGGCCTCCGATATGGCCGGGGAA 100: TTTCAAAGTATAGTGTGCATCCCCTCATAATACATAGATCTATAGGTAAG 150: TATATGGGTTGACGTTGTTAGATGCGATACACGTGCACACTTTATGAATT 200: TTACGTTCCTCTGCCTAGAGTGCCAAGTTTCAATTTGCTACGGTTCCTCA BASECOUNT: A: 66 C: 51 G: 55 T: 78 TOT= 250 MUTATIONS: Substitute @184 Substitute @70 Substitute @28 Substitute @6 Substitute @25 Substitute @197 Substitute @81 Substitute @130 Substitute @76 Delete @76 0: TCAATCTTTAATCGATTAATACATTCAATTTGAACATCTCCAGGAGAAGG 50: CAGGGTAATCTCGTGTAGCCCTGCTTGGGCATCCGATATGGCCGGGGAAT 100: TTCAAAGTATAGTGTGCATCCCCTCATAACACATAGATCTATAGGTAAGT 150: ATATGGGTTGACGTTGTTAGATGCGATACACGTACACACTTTATGATTTT 200: TACGTTCCTCTGCCTAGAGTGCCAAGTTTCAATTTGCTACGGTTCCTCA BASECOUNT: A: 66 C: 52 G: 52 T: 79 TOT= 249
Ada
with Ada.Containers.Vectors;
with Ada.Numerics.Discrete_Random;
with Ada.Text_Io;
procedure Mutations is
Width : constant := 60;
type Nucleotide_Type is (A, C, G, T);
type Operation_Type is (Delete, Insert, Swap);
type Position_Type is new Natural;
package Position_Io is new Ada.Text_Io.Integer_Io (Position_Type);
package Nucleotide_Io is new Ada.Text_Io.Enumeration_Io (Nucleotide_Type);
package Operation_Io is new Ada.Text_Io.Enumeration_Io (Operation_Type);
use Ada.Text_Io, Position_Io, Nucleotide_Io, Operation_Io;
package Sequence_Vectors is new Ada.Containers.Vectors (Index_Type => Position_Type,
Element_Type => Nucleotide_Type);
package Nucleotide_Generators is new Ada.Numerics.Discrete_Random (Result_Subtype => Nucleotide_Type);
package Operation_Generators is new Ada.Numerics.Discrete_Random (Result_Subtype => Operation_Type);
procedure Pretty_Print (Sequence : Sequence_Vectors.Vector) is
First : Position_Type := Sequence.First_Index;
Last : Position_Type;
Count : array (Nucleotide_Type) of Natural := (others => 0);
begin
Last := Position_Type'Min (First + Width - 1,
Sequence.Last_Index);
loop
Position_Io.Put (First, Width => 4);
Put (": ");
for N in First .. Last loop
declare
Nucleotide : Nucleotide_Type renames Sequence (N);
begin
Put (Nucleotide);
Count (Nucleotide) := Count (Nucleotide) + 1;
end;
end loop;
New_Line;
exit when Last = Sequence.Last_Index;
First := Last + 1;
Last := Position_Type'Min (First + Width - 1,
Sequence.Last_Index);
end loop;
for N in Count'Range loop
Put ("Count of "); Put (N); Put (" is "); Put (Natural'Image (Count (N))); New_Line;
end loop;
end Pretty_Print;
function Random_Position (First, Last : Position_Type) return Position_Type is
subtype Position_Range is Position_Type range First .. Last;
package Position_Generators is new Ada.Numerics.Discrete_Random (Result_Subtype => Position_Range);
Generator : Position_Generators.Generator;
begin
Position_Generators.Reset (Generator);
return Position_Generators.Random (Generator);
end Random_Position;
Nucleotide_Generator : Nucleotide_Generators.Generator;
Operation_Generator : Operation_Generators.Generator;
Sequence : Sequence_Vectors.Vector;
Position : Position_Type;
Nucleotide : Nucleotide_Type;
Operation : Operation_Type;
begin
Nucleotide_Generators.Reset (Nucleotide_Generator);
Operation_Generators.Reset (Operation_Generator);
for A in 1 .. 200 loop
Sequence.Append (Nucleotide_Generators.Random (Nucleotide_Generator));
end loop;
Put_Line ("Initial sequence:");
Pretty_Print (Sequence);
New_Line;
Put_Line ("Mutations:");
for Mutate in 1 .. 10 loop
Operation := Operation_Generators.Random (Operation_Generator);
case Operation is
when Delete =>
Position := Random_Position (Sequence.First_Index, Sequence.Last_Index);
Sequence.Delete (Index => Position);
Put (Operation); Put (" at position "); Put (Position, Width => 0); New_Line;
when Insert =>
Position := Random_Position (Sequence.First_Index, Sequence.Last_Index + 1);
Nucleotide := Nucleotide_Generators.Random (Nucleotide_Generator);
Sequence.Insert (Before => Position,
New_Item => Nucleotide);
Put (Operation); Put (" "); Put (Nucleotide); Put (" at position ");
Put (Position, Width => 0); New_Line;
when Swap =>
Position := Random_Position (Sequence.First_Index, Sequence.Last_Index);
Nucleotide := Nucleotide_Generators.Random (Nucleotide_Generator);
Sequence.Replace_Element (Index => Position,
New_Item => Nucleotide);
Put (Operation); Put (" at position "); Put (Position, Width => 0);
Put (" to "); Put (Nucleotide); New_Line;
end case;
end loop;
New_Line;
Put_Line ("Mutated sequence:");
Pretty_Print (Sequence);
end Mutations;
- Output:
Initial sequence: 0: GCTGAGTCCGAATTAGTATTCATGAGATACGCATGTCAGTACGGCGACGACACGGGAAGA 60: GCAGATGAAAACTACTGGGGAGCTACCGAGCTGCCGTCGATTGTACGGATGTTATATTTC 120: CCATAGAACTACGAAGTTTTAGGATCCTTTCGGCGATGTGATAAGCAGGTATCAGTAGTA 180: AGCGAAGCGTTGACGTTTTT Count of A is 55 Count of C is 37 Count of G is 56 Count of T is 52 Mutations: DELETE at position 129 SWAP at position 172 to T SWAP at position 28 to T INSERT A at position 193 DELETE at position 164 SWAP at position 165 to G DELETE at position 91 INSERT A at position 169 INSERT C at position 72 DELETE at position 146 Mutated sequence: 0: GCTGAGTCCGAATTAGTATTCATGAGATTCGCATGTCAGTACGGCGACGACACGGGAAGA 60: GCAGATGAAAACCTACTGGGGAGCTACCGAGCGCCGTCGATTGTACGGATGTTATATTTC 120: CCATAGAACACGAAGTTTTAGGATCCTTCGGCGATGTGATAAGAGGTATACTGTAGTAAG 180: CGAAGCGTTGACAGTTTTT Count of A is 55 Count of C is 37 Count of G is 56 Count of T is 51
Arturo
bases: ["A" "T" "G" "C"]
dna: map 1..200 => [sample bases]
prettyPrint: function [in][
count: #[ A: 0, T: 0, G: 0, C: 0 ]
loop.with:'i split.every:50 in 'line [
prints [pad to :string i*50 3 ":"]
print map split.every:10 line => join
loop split line 'ch [
case [ch=]
when? -> "A" -> count\A: count\A + 1
when? -> "T" -> count\T: count\T + 1
when? -> "G" -> count\G: count\G + 1
when? -> "C" -> count\C: count\C + 1
else []
]
]
print ["Total count => A:" count\A, "T:" count\T "G:" count\G "C:" count\C]
]
performRandomModifications: function [seq,times][
result: new seq
loop times [x][
what: random 1 3
case [what=]
when? -> 1 [
ind: random 0 (size result)
previous: get result ind
next: sample bases
set result ind next
print ["changing base at position" ind "from" previous "to" next]
]
when? -> 2 [
ind: random 0 (size result)
next: sample bases
result: insert result ind next
print ["inserting base" next "at position" ind]
]
else [
ind: random 0 (size result)
previous: get result ind
result: remove result .index ind
print ["deleting base" previous "at position" ind]
]
]
return result
]
print "------------------------------"
print " Initial sequence"
print "------------------------------"
prettyPrint dna
print ""
print "------------------------------"
print " Modifying sequence"
print "------------------------------"
dna: performRandomModifications dna 10
print ""
print "------------------------------"
print " Final sequence"
print "------------------------------"
prettyPrint dna
print ""
- Output:
------------------------------ Initial sequence ------------------------------ 0 : GGCCGAAGGC GGCATCAGTG GACTGGGTTG TGGAGCAAAA CGAACACGCC 50 : GAGAGCCGGA GGGTTCGGAA GATTTATTTA GGACGAAATC CCGGACATGT 100 : CGCCCTAGAT TGGCCTCCCT ACACCTAGTA TTATTACTCC TACGCGTTTG 150 : CCTACTGGGT GTCATCTCGT GTTAATCGCA AAATCACCTA CGAATTGCCC Total count => A: 48 T: 47 G: 53 C: 52 ------------------------------ Modifying sequence ------------------------------ deleting base A at position 180 changing base at position 110 from T to G inserting base T at position 104 inserting base C at position 180 changing base at position 183 from A to A deleting base C at position 90 changing base at position 6 from A to T inserting base C at position 146 inserting base G at position 4 changing base at position 150 from T to C ------------------------------ Final sequence ------------------------------ 0 : GGCCGGATGG CGGCATCAGT GGACTGGGTT GTGGAGCAAA ACGAACACGC 50 : CGAGAGCCGG AGGGTTCGGA AGATTTATTT AGGACGAAAT CCGGACATGT 100 : CGCCTCTAGA TGGGCCTCCC TACACCTAGT ATTATTACTC CTACGCGCTT 150 : CGCCTACTGG GTGTCATCTC GTGTTAATCG CCAAATCACC TACGAATTGC 200 : CC Total count => A: 46 T: 47 G: 55 C: 54
BBC BASIC
Mutations = 10
InitialLength = 400
@%=3
REM Generate sequence and Pretty Print result.
FOR I%=1 TO InitialLength
Sequence$ += FNRandomBase
NEXT
PROCDisplaySequence(Sequence$, 50)
REM Make mutations and Pretty Print result.
PRINT '"Mutating..."
FOR I%=1 TO Mutations
Position = RND(LENSequence$)
CurBase$ = MID$(Sequence$, Position, 1)
NewBase$ = FNRandomBase
CASE RND(3) OF
WHEN 1 REM Change a base
PRINT "Change base " CurBase$ " at position " Position " to base " NewBase$
MID$(Sequence$, Position, 1)=NewBase$
WHEN 2 REM Delete a base
PRINT "Delete base " CurBase$ " at position " Position
Sequence$=LEFT$(Sequence$, Position - 1) + MID$(Sequence$, Position + 1)
WHEN 3 REM Insert a base
PRINT "Insert base " NewBase$ " at position " Position
Sequence$=LEFT$(Sequence$, Position) + NewBase$ + MID$(Sequence$, Position + 1)
ENDCASE
NEXT
PROCDisplaySequence(Sequence$, 50)
END
DEF FNRandomBase = MID$("ACGT", RND(4), 1)
DEF PROCDisplaySequence(seq$, snap%)
LOCAL a, c, g, t, i%, p%
p% = !^seq$
FOR i%=0 TO LENseq$ - 1
IF i% MOD snap% == 0 PRINT 'i% ": ";
VDU p%?i%
CASE p%?i% OF
WHEN ASC"A" a += 1
WHEN ASC"C" c += 1
WHEN ASC"G" g += 1
WHEN ASC"T" t += 1
ENDCASE
NEXT
PRINT ' "A: " a ' "C: " c ' "G: " g ' "T: " t
PRINT "Total: "; a + c + g + t
ENDPROC
- Output:
0: CATGGAAGCTACGTGACTGAGGTACCCGTCGCAGGTTCGAATAAATGATA 50: CTAAAATATCGACGCTAGATACAATATAATGTCTGTAGAAAGCGTCCCTT 100: ATGTTTACATAGGAAAGTATGTGTCGGGCGCCCATGCATTTTCTTAGGCA 150: GCGGAAGCCCCGTGGCGCTCGGCCTCCGCTTTTATTACTTTTAACGTAAC 200: GAGGCGCGGGCGTTGCTTTCTTCCGGCTACCGGCGTCGCACCTAACGCCG 250: GCTGCGAATCGCGCGTTTGTAATTACAAGTTAATTACGATATGCCTCGCA 300: AGTTTTGGCTACCGCTGCCCGGATACTTGGGACGTACGGTATTTCACGCA 350: TCAACAGGTATCCCCCTCCCCTTAGTCTTCCACGACTACTTATTTGAGGG A: 90 C: 104 G: 97 T: 109 Total: 400 Mutating... Delete base A at position 69 Change base A at position 154 to base T Delete base T at position 342 Delete base T at position 83 Insert base G at position 278 Insert base G at position 336 Delete base A at position 48 Insert base T at position 233 Change base T at position 233 to base C Delete base G at position 148 0: CATGGAAGCTACGTGACTGAGGTACCCGTCGCAGGTTCGAATAAATGTAC 50: TAAAATATCGACGCTAGTACAATATAATGTCGTAGAAAGCGTCCCTTATG 100: TTTACATAGGAAAGTATGTGTCGGGCGCCCATGCATTTTCTTAGGCACGG 150: TAGCCCCGTGGCGCTCGGCCTCCGCTTTTATTACTTTTAACGTAACGAGG 200: CGCGGGCGTTGCTTTCTTCCGGCTACCGGCGCTCGCACCTAACGCCGGCT 250: GCGAATCGCGCGTTTGTAATTACAAGTGTAATTACGATATGCCTCGCAAG 300: TTTTGGCTACCGCTGCCCGGATACTTGGGACGTACGGGTATTCACGCATC 350: AACAGGTATCCCCCTCCCCTTAGTCTTCCACGACTACTTATTTGAGGG A: 87 C: 105 G: 98 T: 108 Total: 398
C
Adenine ( A ) is always swapped for Thymine ( T ) and vice versa. Similarly with Cytosine ( C ) and Guanine ( G ).
#include<stdlib.h>
#include<stdio.h>
#include<time.h>
typedef struct genome{
char base;
struct genome *next;
}genome;
typedef struct{
char mutation;
int position;
}genomeChange;
typedef struct{
int adenineCount,thymineCount,cytosineCount,guanineCount;
}baseCounts;
genome *strand;
baseCounts baseData;
int genomeLength = 100, lineLength = 50;
int numDigits(int num){
int len = 1;
while(num>10){
num /= 10;
len++;
}
return len;
}
void generateStrand(){
int baseChoice = rand()%4, i;
genome *strandIterator, *newStrand;
baseData.adenineCount = 0;
baseData.thymineCount = 0;
baseData.cytosineCount = 0;
baseData.guanineCount = 0;
strand = (genome*)malloc(sizeof(genome));
strand->base = baseChoice==0?'A':(baseChoice==1?'T':(baseChoice==2?'C':'G'));
baseChoice==0?baseData.adenineCount++:(baseChoice==1?baseData.thymineCount++:(baseChoice==2?baseData.cytosineCount++:baseData.guanineCount++));
strand->next = NULL;
strandIterator = strand;
for(i=1;i<genomeLength;i++){
baseChoice = rand()%4;
newStrand = (genome*)malloc(sizeof(genome));
newStrand->base = baseChoice==0?'A':(baseChoice==1?'T':(baseChoice==2?'C':'G'));
baseChoice==0?baseData.adenineCount++:(baseChoice==1?baseData.thymineCount++:(baseChoice==2?baseData.cytosineCount++:baseData.guanineCount++));
newStrand->next = NULL;
strandIterator->next = newStrand;
strandIterator = newStrand;
}
}
genomeChange generateMutation(int swapWeight, int insertionWeight, int deletionWeight){
int mutationChoice = rand()%(swapWeight + insertionWeight + deletionWeight);
genomeChange mutationCommand;
mutationCommand.mutation = mutationChoice<swapWeight?'S':((mutationChoice>=swapWeight && mutationChoice<swapWeight+insertionWeight)?'I':'D');
mutationCommand.position = rand()%genomeLength;
return mutationCommand;
}
void printGenome(){
int rows, width = numDigits(genomeLength), len = 0,i,j;
lineLength = (genomeLength<lineLength)?genomeLength:lineLength;
rows = genomeLength/lineLength + (genomeLength%lineLength!=0);
genome* strandIterator = strand;
printf("\n\nGenome : \n--------\n");
for(i=0;i<rows;i++){
printf("\n%*d%3s",width,len,":");
for(j=0;j<lineLength && strandIterator!=NULL;j++){
printf("%c",strandIterator->base);
strandIterator = strandIterator->next;
}
len += lineLength;
}
while(strandIterator!=NULL){
printf("%c",strandIterator->base);
strandIterator = strandIterator->next;
}
printf("\n\nBase Counts\n-----------");
printf("\n%*c%3s%*d",width,'A',":",width,baseData.adenineCount);
printf("\n%*c%3s%*d",width,'T',":",width,baseData.thymineCount);
printf("\n%*c%3s%*d",width,'C',":",width,baseData.cytosineCount);
printf("\n%*c%3s%*d",width,'G',":",width,baseData.guanineCount);
printf("\n\nTotal:%*d",width,baseData.adenineCount + baseData.thymineCount + baseData.cytosineCount + baseData.guanineCount);
printf("\n");
}
void mutateStrand(int numMutations, int swapWeight, int insertionWeight, int deletionWeight){
int i,j,width,baseChoice;
genomeChange newMutation;
genome *strandIterator, *strandFollower, *newStrand;
for(i=0;i<numMutations;i++){
strandIterator = strand;
strandFollower = strand;
newMutation = generateMutation(swapWeight,insertionWeight,deletionWeight);
width = numDigits(genomeLength);
for(j=0;j<newMutation.position;j++){
strandFollower = strandIterator;
strandIterator = strandIterator->next;
}
if(newMutation.mutation=='S'){
if(strandIterator->base=='A'){
strandIterator->base='T';
printf("\nSwapping A at position : %*d with T",width,newMutation.position);
}
else if(strandIterator->base=='A'){
strandIterator->base='T';
printf("\nSwapping A at position : %*d with T",width,newMutation.position);
}
else if(strandIterator->base=='C'){
strandIterator->base='G';
printf("\nSwapping C at position : %*d with G",width,newMutation.position);
}
else{
strandIterator->base='C';
printf("\nSwapping G at position : %*d with C",width,newMutation.position);
}
}
else if(newMutation.mutation=='I'){
baseChoice = rand()%4;
newStrand = (genome*)malloc(sizeof(genome));
newStrand->base = baseChoice==0?'A':(baseChoice==1?'T':(baseChoice==2?'C':'G'));
printf("\nInserting %c at position : %*d",newStrand->base,width,newMutation.position);
baseChoice==0?baseData.adenineCount++:(baseChoice==1?baseData.thymineCount++:(baseChoice==2?baseData.cytosineCount++:baseData.guanineCount++));
newStrand->next = strandIterator;
strandFollower->next = newStrand;
genomeLength++;
}
else{
strandFollower->next = strandIterator->next;
strandIterator->next = NULL;
printf("\nDeleting %c at position : %*d",strandIterator->base,width,newMutation.position);
free(strandIterator);
genomeLength--;
}
}
}
int main(int argc,char* argv[])
{
int numMutations = 10, swapWeight = 10, insertWeight = 10, deleteWeight = 10;
if(argc==1||argc>6){
printf("Usage : %s <Genome Length> <Optional number of mutations> <Optional Swapping weight> <Optional Insertion weight> <Optional Deletion weight>\n",argv[0]);
return 0;
}
switch(argc){
case 2: genomeLength = atoi(argv[1]);
break;
case 3: genomeLength = atoi(argv[1]);
numMutations = atoi(argv[2]);
break;
case 4: genomeLength = atoi(argv[1]);
numMutations = atoi(argv[2]);
swapWeight = atoi(argv[3]);
break;
case 5: genomeLength = atoi(argv[1]);
numMutations = atoi(argv[2]);
swapWeight = atoi(argv[3]);
insertWeight = atoi(argv[4]);
break;
case 6: genomeLength = atoi(argv[1]);
numMutations = atoi(argv[2]);
swapWeight = atoi(argv[3]);
insertWeight = atoi(argv[4]);
deleteWeight = atoi(argv[5]);
break;
};
srand(time(NULL));
generateStrand();
printf("\nOriginal:");
printGenome();
mutateStrand(numMutations,swapWeight,insertWeight,deleteWeight);
printf("\n\nMutated:");
printGenome();
return 0;
}
Sample run :
C:\My Projects\networks>a 500 30 15 10 5 Original: Genome : -------- 0 :CGATGAGTTTCCTCCAAGGAGCAGGGCGTGACGGAAGGGAGGCTTAGGTC 50 :CGCATGCTCGTCGGCAGCCGGCTGGTGCCGTCGTAACCTTCACATTATTC 100 :TAGAATTTCGATGCACCTGATGACTCATACCCAGATGTAGGGGTACGCGA 150 :TGCAGATGCGGGCACGAGGAATTGTGGGCAAGCCGGCAGGTCTTTTGTAA 200 :GTTGTCACTAACTAAATAGAGGGATGGATGTTATAGCACACTACTGTCGA 250 :TTACGGACAGCGTCCCGATTCGTCATACGACCAGGATATATACTCGACGT 300 :CCAACAGGAGATTCACGTAGTGAACGCAGTTGACAGCCTGCTCGTATCTC 350 :CAGGGGTGGACTGCACCGTTCGTTAACTGCTGCCACATTAAACAGCTTCC 400 :CACTCCTTGACGCCAGACTCGGTACCACAGACCGTCAAGCTCCTATTTCC 450 :TTTGCAGTTAAAAAACACTATGGTGAAGGTCGGAGAGATGACCTCATCTA Base Counts ----------- A :124 T :118 C :126 G :132 Total:500 Inserting G at position : 205 Inserting G at position : 144 Inserting C at position : 171 Swapping A at position : 335 with T Inserting A at position : 101 Swapping C at position : 109 with G Swapping A at position : 306 with T Inserting G at position : 51 Swapping G at position : 1 with C Deleting G at position : 60 Swapping G at position : 66 with C Inserting C at position : 41 Inserting C at position : 425 Swapping C at position : 173 with G Inserting A at position : 319 Swapping G at position : 460 with C Deleting T at position : 61 Swapping C at position : 160 with G Inserting C at position : 251 Swapping G at position : 337 with C Inserting G at position : 43 Inserting T at position : 146 Inserting T at position : 181 Deleting G at position : 53 Deleting A at position : 464 Swapping G at position : 362 with C Swapping G at position : 190 with C Swapping C at position : 280 with G Inserting T at position : 479 Deleting C at position : 400 Mutated: Genome : -------- 0 :CCATGAGTTTCCTCCAAGGAGCAGGGCGTGACGGAAGGGAGCGGCTTAGG 50 :TCCGCATGCTCCGGCACCCGGCTGGTGCCGTCGTAACCTTCACATTATTC 100 :TAAGAATTTGGATGCACCTGATGACTCATACCCAGATGTAGGGGTTGACG 150 :CGATGCAGATGGGGGCACGAGGAGATTGTGTGGCAAGCCGCCAGGTCTTT 200 :TGTAAGTTGTGCACTAACTAAATAGAGGGATGGATGTTATAGCACACTAC 250 :TGTCCGATTACGGACAGCGTCCCGATTCGTGATACGACCAGGATATATAC 300 :TCGACGTCCTACAGGAGATTCAACGTAGTGAACGCAGTTCTCAGCCTGCT 350 :CGTATCTCCAGGCGTGGACTGCACCGTTCGTTAACTGCTGCCACATTAAA 400 :AGCTTCCCACTCCTTGACGCCAGACTCCGGTACCACAGACCGTCAAGCTC 450 :CTATTTCCTTTCCGTTAAAAAACACTATTGGTGAAGGTCGGAGAGATGAC 500 :CTCATCTA Base Counts ----------- A :126 T :121 C :130 G :136 Total:513
C++
#include <array>
#include <iomanip>
#include <iostream>
#include <random>
#include <string>
class sequence_generator {
public:
sequence_generator();
std::string generate_sequence(size_t length);
void mutate_sequence(std::string&);
static void print_sequence(std::ostream&, const std::string&);
enum class operation { change, erase, insert };
void set_weight(operation, unsigned int);
private:
char get_random_base() {
return bases_[base_dist_(engine_)];
}
operation get_random_operation();
static const std::array<char, 4> bases_;
std::mt19937 engine_;
std::uniform_int_distribution<size_t> base_dist_;
std::array<unsigned int, 3> operation_weight_;
unsigned int total_weight_;
};
const std::array<char, 4> sequence_generator::bases_{ 'A', 'C', 'G', 'T' };
sequence_generator::sequence_generator() : engine_(std::random_device()()),
base_dist_(0, bases_.size() - 1),
total_weight_(operation_weight_.size()) {
operation_weight_.fill(1);
}
sequence_generator::operation sequence_generator::get_random_operation() {
std::uniform_int_distribution<unsigned int> op_dist(0, total_weight_ - 1);
unsigned int n = op_dist(engine_), op = 0, weight = 0;
for (; op < operation_weight_.size(); ++op) {
weight += operation_weight_[op];
if (n < weight)
break;
}
return static_cast<operation>(op);
}
void sequence_generator::set_weight(operation op, unsigned int weight) {
total_weight_ -= operation_weight_[static_cast<size_t>(op)];
operation_weight_[static_cast<size_t>(op)] = weight;
total_weight_ += weight;
}
std::string sequence_generator::generate_sequence(size_t length) {
std::string sequence;
sequence.reserve(length);
for (size_t i = 0; i < length; ++i)
sequence += get_random_base();
return sequence;
}
void sequence_generator::mutate_sequence(std::string& sequence) {
std::uniform_int_distribution<size_t> dist(0, sequence.length() - 1);
size_t pos = dist(engine_);
char b;
switch (get_random_operation()) {
case operation::change:
b = get_random_base();
std::cout << "Change base at position " << pos << " from "
<< sequence[pos] << " to " << b << '\n';
sequence[pos] = b;
break;
case operation::erase:
std::cout << "Erase base " << sequence[pos] << " at position "
<< pos << '\n';
sequence.erase(pos, 1);
break;
case operation::insert:
b = get_random_base();
std::cout << "Insert base " << b << " at position "
<< pos << '\n';
sequence.insert(pos, 1, b);
break;
}
}
void sequence_generator::print_sequence(std::ostream& out, const std::string& sequence) {
constexpr size_t base_count = bases_.size();
std::array<size_t, base_count> count = { 0 };
for (size_t i = 0, n = sequence.length(); i < n; ++i) {
if (i % 50 == 0) {
if (i != 0)
out << '\n';
out << std::setw(3) << i << ": ";
}
out << sequence[i];
for (size_t j = 0; j < base_count; ++j) {
if (bases_[j] == sequence[i]) {
++count[j];
break;
}
}
}
out << '\n';
out << "Base counts:\n";
size_t total = 0;
for (size_t j = 0; j < base_count; ++j) {
total += count[j];
out << bases_[j] << ": " << count[j] << ", ";
}
out << "Total: " << total << '\n';
}
int main() {
sequence_generator gen;
gen.set_weight(sequence_generator::operation::change, 2);
std::string sequence = gen.generate_sequence(250);
std::cout << "Initial sequence:\n";
sequence_generator::print_sequence(std::cout, sequence);
constexpr int count = 10;
for (int i = 0; i < count; ++i)
gen.mutate_sequence(sequence);
std::cout << "After " << count << " mutations:\n";
sequence_generator::print_sequence(std::cout, sequence);
return 0;
}
- Output:
Initial sequence: 0: CATATCTGCGTAAGGCGTCGAATCCTTAGAGAAAACTCGCCAAACGCGCT 50: AGCCAAGACTTAATTAAAGGCTGGCCACATAACAGTAGTACTGCAAGGAT 100: GACGTGACTACAACGTGGAATACTCTATCTGATGAGCCCCACGTGGGCCA 150: ACCTTCCAATGCGGCGTCTTGCAGTCTTCGGACTTTGCCTCTACTAGGAG 200: TAGCCATGACGAGTGGTGAGGCGGAGGGACCAATTCCGCACTTCGAATCG Base counts: A: 67, C: 65, G: 64, T: 54, Total: 250 Change base at position 39 from C to C Erase base T at position 194 Insert base T at position 70 Insert base C at position 190 Insert base T at position 45 Erase base A at position 111 Change base at position 96 from A to C Change base at position 113 from A to C Change base at position 5 from C to A Change base at position 44 from C to T After 10 mutations: 0: CATATATGCGTAAGGCGTCGAATCCTTAGAGAAAACTCGCCAAATTGCGC 50: TAGCCAAGACTTAATTAAAGGTCTGGCCACATAACAGTAGTACTGCCAGG 100: ATGACGTGACTCACCGTGGAATACTCTATCTGATGAGCCCCACGTGGGCC 150: AACCTTCCAATGCGGCGTCTTGCAGTCTTCGGACTTTGCCCTCTACAGGA 200: GTAGCCATGACGAGTGGTGAGGCGGAGGGACCAATTCCGCACTTCGAATC 250: G Base counts: A: 65, C: 66, G: 64, T: 56, Total: 251
Common Lisp
Usage :
- (mutate (<Genome length> <Number of mutations>
- :ins_w <Insertion weight> :swp_w <Swap weight> :del_w <Delete weight>
- :genome <Genome Sequence>)
All keys are optional. <Genome length> is discarded when :genome is set.
(defun random_base ()
(random 4))
(defun basechar (base)
(char "ACTG" base))
(defun generate_genome (genome_length)
(let (genome '())
(loop for i below genome_length do
(push (random_base) genome))
(return-from generate_genome genome)))
(defun map_genome (genome)
(let (seq '())
(loop for n from (1- (length genome)) downto 0 do
(push (position (char genome n) "ACTG") seq))
seq))
(defun output_genome_info (genome &optional (genome_name "ORIGINAL"))
(let ((ac 0) (tc 0) (cc 0) (gc 0))
(format t "~% ---- ~a ----" genome_name)
(do ((n 0 (1+ n)))
((= n (length genome)))
(when (= 0 (mod n 50)) (format t "~& ~4d: " (1+ n)))
(case (nth n genome)
(0 (incf ac))
(1 (incf tc))
(2 (incf cc))
(3 (incf gc)))
(format t "~c" (basechar (nth n genome))))
(format t "~2%- Total : ~3d~%A : ~d C : ~d~%T : ~d G : ~d~2%" (length genome) ac tc cc gc)))
(defun insert_base (genome)
(let ((place (random (length genome)))
(base (random_base)))
(format t "Insert + ~c at ~3d~%"
(basechar base) (+ 1 place))
(if (= 0 place)
(push base genome)
(push base (cdr (nthcdr (1- place) genome))))
(return-from insert_base genome)))
(defun swap_base (genome)
(let ((place (random (length genome)))
(base (random_base)))
(format t "Swap ~c -> ~c at ~3d~%"
(basechar (nth place genome)) (basechar base) (+ 1 place))
(setf (nth place genome) base)
(return-from swap_base genome)))
(defun delete_base (genome)
(let ((place (random (length genome))))
(format t "Delete - ~c at ~3d~%"
(basechar (nth place genome)) (+ 1 place))
(if (= 0 place) (pop genome)
(pop (cdr (nthcdr (1- place) genome))))
(return-from delete_base genome)))
(defun mutate (genome_length n_mutations
&key (ins_w 10) (swp_w 10) (del_w 10)
(genome (generate_genome genome_length) has_genome))
(if has_genome (setf genome (map_genome genome)))
(output_genome_info genome)
(format t " ---- MUTATION SEQUENCE ----~%")
(do ((n 0 (1+ n)))
((= n n_mutations))
(setf mutation_type (random (+ ins_w swp_w del_w)))
(format t "~3d : " (1+ n))
(setf genome
(cond ((< mutation_type ins_w) (insert_base genome))
((< mutation_type (+ ins_w swp_w)) (swap_base genome))
(t (delete_base genome)))))
(output_genome_info genome "MUTATED"))
- Output:
[CLISP]> (mutate 500 30 :ins_w 5 :swp_w 10 :del_w 15) ---- ORIGINAL ---- 1: CTGCCATGCGTAAGATAGCAGAAGTGTTCGCGTATATGTTCATTTTGGCT 51: TCAACCTACGGGCGTATACACATTTCAGCTCGGAGCTCGGGCCCCAGTAC 101: CTAGTTTTTCTTTCAAGCTGGAACTACGGCGCCTTGTGCCGTAATCCGTC 151: GGGGTGATAATTCTAACTTGCTAACACGCGCAGATGGTCCGGTCGCGGGT 201: CAAACATTCGCCAAGGTCAACTTACCCTTAAAAGGCTTGCAACAGGGACC 251: AGTACTAGGAAGTAGACTTCATGGATCTTGGGCATAGTGGACTAGCTTAT 301: TTACCAGCGACGTTCTTGCACCCGAGACATTATCATAGTTCGACAGCGTT 351: GAACTGTGCTTAGGTTAATGCCGCGCTTCCTACCTTGAACTAAACACAGC 401: ACACAGTGAGAACGTAGCGGCCTCTTTTCCTGCCTGCAATTCGTAAGCCT 451: GATTTGACGGGTCTGGAGTTTGGCTCGGAGTAGGTCTGCTACTTAAATTC - Total : 500 A : 116 C : 124 T : 137 G : 123 ---- MUTATION SEQUENCE ---- 1 : Swap T -> C at 74 2 : Delete - T at 208 3 : Insert + C at 332 4 : Insert + G at 287 5 : Delete - T at 188 6 : Swap G -> A at 263 7 : Delete - G at 323 8 : Delete - A at 336 9 : Swap T -> A at 426 10 : Swap G -> G at 38 11 : Swap G -> C at 288 12 : Delete - T at 11 13 : Swap G -> A at 197 14 : Insert + T at 476 15 : Swap C -> A at 5 16 : Swap A -> T at 211 17 : Swap A -> T at 248 18 : Delete - C at 471 19 : Delete - C at 455 20 : Swap T -> T at 184 21 : Insert + T at 224 22 : Delete - T at 224 23 : Insert + T at 333 24 : Delete - C at 18 25 : Delete - G at 139 26 : Delete - T at 333 27 : Insert + T at 80 28 : Insert + T at 480 29 : Swap A -> T at 341 30 : Swap T -> C at 73 ---- MUTATED ---- 1: CTGCAATGCGAAGATAGAGAAGTGTTCGCGTATATGTTCATTTTGGCTTC 51: AACCTACGGGCGTATACACATCCCAGCTCTGGAGCTCGGGCCCCAGTACC 101: TAGTTTTTCTTTCAAGCTGGAACTACGGCGCCTTGTGCCTAATCCGTCGG 151: GGTGATAATTCTAACTTGCTAACACGCGCAGATGGCCGGTCGCGGATCAA 201: ACATCGCCATGGTCAACTTACCCTTAAAAGGCTTGCAACAGGGACCTGTA 251: CTAGGAAGTAAACTTCATGGATCTTGGGCATAGGTCGACTAGCTTATTTA 301: CCAGCGACGTTCTTGCACCCAGACATTACTCTAGTTCGACTGCGTTGAAC 351: TGTGCTTAGGTTAATGCCGCGCTTCCTACCTTGAACTAAACACAGCACAC 401: AGTGAGAACGTAGCGGCCTCTTTACCTGCCTGCAATTCGTAAGCCTGATT 451: TGAGGGTCTGGAGTTTGGTCGGTAGTAGGTTCTGCTACTTAAATTC - Total : 496 A : 116 C : 124 T : 137 G : 119
EasyLang
base$[] = [ "A" "C" "T" "G" ]
global seq[] seqnx[] seqpr[] .
proc prseq . .
len cnt[] 4
numfmt 0 3
ind = 1
while seqnx[ind] <> 1
pos += 1
ind = seqnx[ind]
if pos mod 40 = 1
print ""
.
if pos mod 40 = 1
write pos & ":"
.
if pos mod 4 = 1
write " "
.
cnt[seq[ind]] += 1
write base$[seq[ind]]
.
print ""
print ""
for i to 4
print base$[i] & ":" & cnt[i]
sum += cnt[i]
.
print "====="
print " " & sum
print ""
.
proc init . .
seq[] = [ 0 ]
seqnx[] = [ 2 ]
seqpr[] = [ 0 ]
for i = 2 to 201
seq[] &= random 4
seqnx[] &= i + 1
seqpr[] &= i - 1
.
seqpr[1] = len seq[]
seqnx[$] = 1
.
proc delete pos . .
nx = seqnx[pos]
pre = seqpr[pos]
seqnx[pre] = nx
seqpr[nx] = pre
last = len seq[]
seq[pos] = seq[last]
seqnx[pos] = seqnx[last]
seqpr[pos] = seqpr[last]
seqpr[seqnx[pos]] = pos
seqnx[seqpr[pos]] = pos
len seq[] -1
len seqnx[] -1
len seqpr[] -1
.
proc insert pos . .
seq[] &= random 4
last = len seq[]
seqnx[] &= pos
seqpr[] &= seqpr[pos]
seqnx[seqpr[pos]] = last
seqpr[pos] = last
.
proc mutate . .
op = random 3
pos = random (len seq[] - 1) + 1
if op = 1
seq[pos] = random 4
elif op = 2
insert pos
else
delete pos
.
.
init
print "Original:"
prseq
for i to 10
mutate
.
print "Mutated:"
prseq
- Output:
Original: 1: GGCA CGTG TGTA CAAC GCAT CTTT TGTC ATTG CCCT GATC 41: CGCA GTAA CTCC ACAT GCTC GTAC AAAT CGTT ATCT GAGC 81: GCAC GGCC GCCG TGCC AAAA GGAG AAAT CCGA GTAA GTAG 121: CTGA AGTG AGTC GCAT CAAC GTTC ATTA AGAG GAGC GAAA 161: TGTA GGGT AGCT CTTC CTCT TCGT AGTC CCTT ACCG ATGC A: 50 C: 52 T: 49 G: 49 ===== 200 Mutated: 1: GGCA CGTG TGTG CAAC GATC TTTT GTCA TTGC CCTG ATCA 41: CGCA TTAA CTCC ACAT GCTC GTAC AAAT CGTA TCTG AGCG 81: CACG GCCG CCGT GCCA AAAG GAGA ATCC GAGG TAAG TAGC 121: TGAA TGAG TCGC ATCA ACGT TCAT TGAG AGGA GCGA AATG 161: TAGG GTAA CTCT TCCT CTTC GTAG TCCC TTAC CGAT GC A: 49 C: 51 T: 49 G: 49 ===== 198
Factor
USING: assocs combinators.random formatting grouping io kernel
macros math math.statistics namespaces prettyprint quotations
random sequences sorting ;
IN: sequence-mutation
SYMBOL: verbose? ! Turn on to show mutation details.
! Off by default.
! Return a random base as a character.
: rand-base ( -- n ) "ACGT" random ;
! Generate a random dna sequence of length n.
: <dna> ( n -- seq ) [ rand-base ] "" replicate-as ;
! Prettyprint a dna sequence in blocks of n.
: .dna ( seq n -- )
"SEQUENCE:" print [ group ] keep
[ * swap " %3d: %s\n" printf ] curry each-index ;
! Show a histogram of bases in a dna sequence and their total.
: show-counts ( seq -- )
"BASE COUNTS:" print histogram >alist [ first ] sort-with
[ [ " %c: %3d\n" printf ] assoc-each ]
[ "TOTAL: " write [ second ] [ + ] map-reduce . ] bi ;
! Prettyprint the overall state of a dna sequence.
: show-dna ( seq -- ) [ 50 .dna nl ] [ show-counts nl ] bi ;
! Call a quotation only if verbose? is on.
: log ( quot -- ) verbose? get [ call ] [ drop ] if ; inline
! Set index n to a random base.
: bswap ( n seq -- seq' )
[ rand-base ] 2dip 3dup [ nth ] keepd spin
[ " index %3d: swapping %c with %c\n" printf ] 3curry log
[ set-nth ] keep ;
! Remove the base at index n.
: bdelete ( n seq -- seq' )
2dup dupd nth [ " index %3d: deleting %c\n" printf ]
2curry log remove-nth ;
! Insert a random base at index n.
: binsert ( n seq -- seq' )
[ rand-base ] 2dip over reach
[ " index %3d: inserting %c\n" printf ] 2curry log
insert-nth ;
! Allow "passing" probabilities to casep. This is necessary
! because casep is a macro.
MACRO: build-casep-seq ( seq -- quot )
{ [ bswap ] [ bdelete ] [ binsert ] } zip 1quotation ;
! Mutate a dna sequence according to some weights.
! For example,
! "ACGT" { 0.1 0.3 0.6 } mutate
! means swap with 0.1 probability, delete with 0.3 probability,
! and insert with 0.6 probability.
: mutate ( dna-seq weights-seq -- dna-seq' )
[ [ length random ] keep ] [ build-casep-seq ] bi* casep ;
inline
! Prettyprint a sequence of weights.
: show-weights ( seq -- )
"MUTATION PROBABILITIES:" print
" swap: %.2f\n delete: %.2f\n insert: %.2f\n\n" vprintf
;
: main ( -- )
verbose? on "ORIGINAL " write 200 <dna> dup show-dna 10
{ 0.2 0.2 0.6 } dup show-weights "MUTATION LOG:" print
[ mutate ] curry times nl "MUTATED " write show-dna ;
MAIN: main
- Output:
ORIGINAL SEQUENCE: 0: CACAGGCAAGGGTCGTATGCTACTATAGATGTTTCAGAACCGTATTTCGA 50: CTCCGACGCGGTCATGAAGCAGACACTCCGTCACCGATTGCAAGTGTGCA 100: GTTGGGAGAATGCATTAAAATTCTGGGTTATGAAACGGGCAGCCTTGATT 150: GACAGGTGGTCCAGCGACAGTTTAACATACCAAACTCTTTGAGTACGCAG BASE COUNTS: A: 55 C: 44 G: 52 T: 49 TOTAL: 200 MUTATION PROBABILITIES: swap: 0.20 delete: 0.20 insert: 0.60 MUTATION LOG: index 82: deleting A index 161: inserting C index 48: deleting G index 10: swapping G with T index 184: swapping T with C index 137: inserting T index 60: inserting T index 135: inserting C index 32: inserting T index 201: inserting A MUTATED SEQUENCE: 0: CACAGGCAAGTGTCGTATGCTACTATAGATGTTTTCAGAACCGTATTTCA 50: CTCCGACGCGGTTCATGAAGCAGACACTCCGTCCCGATTGCAAGTGTGCA 100: GTTGGGAGAATGCATTAAAATTCTGGGTTATGAAACCGGGTCAGCCTTGA 150: TTGACAGGTGGTCCCAGCGACAGTTTAACATACCAAACCCTTTGAGTACG 200: CAAG BASE COUNTS: A: 55 C: 47 G: 50 T: 52 TOTAL: 204
FreeBASIC
'' Rosetta Code problem: https://rosettacode.org/wiki/Bioinformatics/Sequence_mutation
'' by Jjuanhdez, 05/2023
Randomize Timer
Dim As Integer r, i
r = Int(Rnd * (300))
Dim Shared As String dnaS
For i = 1 To 200 + r : dnaS += Mid("ACGT", Int(Rnd * (4))+1, 1) : Next
Sub show()
Dim As Integer acgt(4), i, j, x, total
For i = 1 To Len(dnaS)
x = Instr("ACGT", Mid(dnaS, i, 1))
acgt(x) += 1
Next
For i = 1 To 4 : total += acgt(i) : Next
For i = 1 To Len(dnaS) Step 50
Print i; ":"; !"\t";
For j = 0 To 49 Step 10
Print Mid(dnaS, i+j, 10); " ";
Next
Print
Next
Print !"\nBase counts: A:"; acgt(1); ", C:"; acgt(2); ", G:"; acgt(3); ", T:"; acgt(4); ", total:"; total
End Sub
Sub mutate()
Dim As Integer i, p
Dim As String sdiS, repS, wasS
Print
For i = 1 To 10
p = Int(Rnd * (Len(dnaS))) + 1
sdiS = Mid("SDI", Int(Rnd * (3)) + 1, 1)
repS = Mid("ACGT", Int(Rnd * (4)) + 1, 1)
wasS = Mid(dnaS, p, 1)
Select Case sdiS
Case "S"
Mid(dnaS, p, 1) = repS
Print "swapped "; wasS; " at "; p; " for "; repS
Case "D"
dnaS = Left(dnaS, p - 1) + Right(dnaS, Len(dnaS) - p)
Print "deleted "; wasS; " at "; p
Case "I"
dnaS = Left(dnaS, p - 1) + repS + Right(dnaS, (Len(dnaS) - p + 1))
Print "inserted "; repS; " at "; p; ", before "; wasS
End Select
Next
Print
End Sub
show()
mutate()
show()
Sleep
- Output:
1: GAAATGATTT GTATCGAGCA GACTGGAGAA AGCACTTATT TAAGCACCGT 51: TTCAAAGCCA CTCTGTTAGG AAGCTAATCC GTAGGTACGT AGGGACGACT 101: CGATCGGACC CTTGCTTCGG TGTCTTCGTT CATCCCGGTT TCCGCGCTCA 151: GCTGCATTTT GGTCGAGCCA GGCGATCGAC AATGTTCGAC GCAATAACGC 201: GCCGGATAGG CACCTGGTGT AGTTTAGGCT GTGTCCGCTT CTGCATCTCC 251: GTTTTGAACA ATGAATTTCC ACGCGTCCAA CAGAAAGATT TGCGCCTGTC 301: TGGAGTGGTC GGAACTTAGG TATTCCGTCG TCAGTCGCGC AGAGATCAGC 351: GACCCTCTTG CTCGTGGCCC TGGACGCGTT TCCTCGTTTT AACTCGACAT 401: CCCTGACCAG CATCACTA Base counts: A: 88, C: 112, G: 106, T: 112, total: 418 swapped T at 246 for C swapped T at 90 for G inserted C at 141, before T deleted T at 62 swapped T at 63 for G deleted T at 381 deleted T at 389 swapped T at 81 for G inserted G at 149, before C swapped T at 256 for T 1: GAAATGATTT GTATCGAGCA GACTGGAGAA AGCACTTATT TAAGCACCGT 51: TTCAAAGCCA CCGGTTAGGA AGCTAATCCG GAGGTACGGA GGGACGACTC 101: GATCGGACCC TTGCTTCGGT GTCTTCGTTC ATCCCGGTTC TCCGCGCTGC 151: AGCTGCATTT TGGTCGAGCC AGGCGATCGA CAATGTTCGA CGCAATAACG 201: CGCCGGATAG GCACCTGGTG TAGTTTAGGC TGTGTCCGCT TCTGCACCTC 251: CGTTTTGAAC AATGAATTTC CACGCGTCCA ACAGAAAGAT TTGCGCCTGT 301: CTGGAGTGGT CGGAACTTAG GTATTCCGTC GTCAGTCGCG CAGAGATCAG 351: CGACCCTCTT GCTCGTGGCC CTGGACGCGT TCCTCGTTTA ACTCGACATC 401: CCTGACCAGC ATCACTA Base counts: A: 88, C: 114, G: 110, T: 105, total: 417
Go
package main
import (
"fmt"
"math/rand"
"sort"
"time"
)
const bases = "ACGT"
// 'w' contains the weights out of 300 for each
// of swap, delete or insert in that order.
func mutate(dna string, w [3]int) string {
le := len(dna)
// get a random position in the dna to mutate
p := rand.Intn(le)
// get a random number between 0 and 299 inclusive
r := rand.Intn(300)
bytes := []byte(dna)
switch {
case r < w[0]: // swap
base := bases[rand.Intn(4)]
fmt.Printf(" Change @%3d %q to %q\n", p, bytes[p], base)
bytes[p] = base
case r < w[0]+w[1]: // delete
fmt.Printf(" Delete @%3d %q\n", p, bytes[p])
copy(bytes[p:], bytes[p+1:])
bytes = bytes[0 : le-1]
default: // insert
base := bases[rand.Intn(4)]
bytes = append(bytes, 0)
copy(bytes[p+1:], bytes[p:])
fmt.Printf(" Insert @%3d %q\n", p, base)
bytes[p] = base
}
return string(bytes)
}
// Generate a random dna sequence of given length.
func generate(le int) string {
bytes := make([]byte, le)
for i := 0; i < le; i++ {
bytes[i] = bases[rand.Intn(4)]
}
return string(bytes)
}
// Pretty print dna and stats.
func prettyPrint(dna string, rowLen int) {
fmt.Println("SEQUENCE:")
le := len(dna)
for i := 0; i < le; i += rowLen {
k := i + rowLen
if k > le {
k = le
}
fmt.Printf("%5d: %s\n", i, dna[i:k])
}
baseMap := make(map[byte]int) // allows for 'any' base
for i := 0; i < le; i++ {
baseMap[dna[i]]++
}
var bases []byte
for k := range baseMap {
bases = append(bases, k)
}
sort.Slice(bases, func(i, j int) bool { // get bases into alphabetic order
return bases[i] < bases[j]
})
fmt.Println("\nBASE COUNT:")
for _, base := range bases {
fmt.Printf(" %c: %3d\n", base, baseMap[base])
}
fmt.Println(" ------")
fmt.Println(" Σ:", le)
fmt.Println(" ======\n")
}
// Express weights as a string.
func wstring(w [3]int) string {
return fmt.Sprintf(" Change: %d\n Delete: %d\n Insert: %d\n", w[0], w[1], w[2])
}
func main() {
rand.Seed(time.Now().UnixNano())
dna := generate(250)
prettyPrint(dna, 50)
muts := 10
w := [3]int{100, 100, 100} // use e.g. {0, 300, 0} to choose only deletions
fmt.Printf("WEIGHTS (ex 300):\n%s\n", wstring(w))
fmt.Printf("MUTATIONS (%d):\n", muts)
for i := 0; i < muts; i++ {
dna = mutate(dna, w)
}
fmt.Println()
prettyPrint(dna, 50)
}
- Output:
Sample run:
SEQUENCE: 0: AATCCAGAAGTTGCGGGAACCGTCGAATAGTGTTCATTAAGTGTCCCGCG 50: GAGTAGCCTCGTAATATAGAATGACCGGGCTTCCCAGCTAGACTTGTCCG 100: CCACGTTTGTGTAGGGCGCAGCGAGACTGCTCTTGATACTCGTTATGTTC 150: CTCACCGGATTATTGAATAGAGTCGAGGGGCTGACGTGACTGAACATTGC 200: CTCCTTTGCGACTAATCTTTCCTTCAATGAACAGGCGCTACCCGTCATCG BASE COUNT: A: 56 C: 63 G: 64 T: 67 ------ Σ: 250 ====== WEIGHTS (ex 300): Change: 100 Delete: 100 Insert: 100 MUTATIONS (10): Change @195 'A' to 'C' Insert @ 95 'G' Change @137 'T' to 'C' Delete @207 'T' Insert @148 'C' Insert @113 'A' Change @ 45 'C' to 'T' Delete @ 93 'T' Insert @ 51 'C' Delete @248 'A' SEQUENCE: 0: AATCCAGAAGTTGCGGGAACCGTCGAATAGTGTTCATTAAGTGTCTCGCG 50: GCAGTAGCCTCGTAATATAGAATGACCGGGCTTCCCAGCTAGACTGGTCC 100: GCCACGTTTGTGTAAGGGCGCAGCGAGACTGCTCTTGACACTCGTTATGC 150: TTCCTCACCGGATTATTGAATAGAGTCGAGGGGCTGACGTGACTGAACCT 200: TGCCTCCTTGCGACTAATCTTTCCTTCAATGAACAGGCGCTACCCGTCTC 250: G BASE COUNT: A: 55 C: 66 G: 65 T: 65 ------ Σ: 251 ======
Haskell
import Data.List (group, sort)
import Data.List.Split (chunksOf)
import System.Random (Random, randomR, random, newStdGen, randoms, getStdRandom)
import Text.Printf (PrintfArg(..), fmtChar, fmtPrecision, formatString, IsChar(..), printf)
data Mutation = Swap | Delete | Insert deriving (Show, Eq, Ord, Enum, Bounded)
data DNABase = A | C | G | T deriving (Show, Read, Eq, Ord, Enum, Bounded)
type DNASequence = [DNABase]
data Result = Swapped Mutation Int (DNABase, DNABase)
| InsertDeleted Mutation Int DNABase
instance Random DNABase where
randomR (a, b) g = case randomR (fromEnum a, fromEnum b) g of (x, y) -> (toEnum x, y)
random = randomR (minBound, maxBound)
instance Random Mutation where
randomR (a, b) g = case randomR (fromEnum a, fromEnum b) g of (x, y) -> (toEnum x, y)
random = randomR (minBound, maxBound)
instance PrintfArg DNABase where
formatArg x fmt = formatString (show x) (fmt { fmtChar = 's', fmtPrecision = Nothing })
instance PrintfArg Mutation where
formatArg x fmt = formatString (show x) (fmt { fmtChar = 's', fmtPrecision = Nothing })
instance IsChar DNABase where
toChar = head . show
fromChar = read . pure
chunkedDNASequence :: DNASequence -> [(Int, [DNABase])]
chunkedDNASequence = zip [50,100..] . chunksOf 50
baseCounts :: DNASequence -> [(DNABase, Int)]
baseCounts = fmap ((,) . head <*> length) . group . sort
newSequence :: Int -> IO DNASequence
newSequence n = take n . randoms <$> newStdGen
mutateSequence :: DNASequence -> IO (Result, DNASequence)
mutateSequence [] = fail "empty dna sequence"
mutateSequence ds = randomMutation >>= mutate ds
where
randomMutation = head . randoms <$> newStdGen
mutate xs m = do
i <- randomIndex (length xs)
case m of
Swap -> randomDNA >>= \d -> pure (Swapped Swap i (xs !! pred i, d), swapElement i d xs)
Insert -> randomDNA >>= \d -> pure (InsertDeleted Insert i d, insertElement i d xs)
Delete -> pure (InsertDeleted Delete i (xs !! pred i), dropElement i xs)
where
dropElement i xs = take (pred i) xs <> drop i xs
insertElement i e xs = take i xs <> [e] <> drop i xs
swapElement i a xs = take (pred i) xs <> [a] <> drop i xs
randomIndex n = getStdRandom (randomR (1, n))
randomDNA = head . randoms <$> newStdGen
mutate :: Int -> DNASequence -> IO DNASequence
mutate 0 s = pure s
mutate n s = do
(r, ms) <- mutateSequence s
case r of
Swapped m i (a, b) -> printf "%6s @ %-3d : %s -> %s \n" m i a b
InsertDeleted m i a -> printf "%6s @ %-3d : %s\n" m i a
mutate (pred n) ms
main :: IO ()
main = do
ds <- newSequence 200
putStrLn "\nInitial Sequence:" >> showSequence ds
putStrLn "\nBase Counts:" >> showBaseCounts ds
showSumBaseCounts ds
ms <- mutate 10 ds
putStrLn "\nMutated Sequence:" >> showSequence ms
putStrLn "\nBase Counts:" >> showBaseCounts ms
showSumBaseCounts ms
where
showSequence = mapM_ (uncurry (printf "%3d: %s\n")) . chunkedDNASequence
showBaseCounts = mapM_ (uncurry (printf "%s: %3d\n")) . baseCounts
showSumBaseCounts xs = putStrLn (replicate 6 '-') >> printf "Σ: %d\n\n" (length xs)
- Output:
Initial Sequence: 50: CCGGCGAACTGGTAGGTCTTTAATTATGCGGCCGCGATCGCGACACAGGT 100: GCAGGAGGAAAATAGGCCCCCGTTCTGGGCAGCCTGATTGCACACTCCCG 150: ATACCAGACGTGTGGCGGCTTTTTCGCAAGATCTTACCAAACATTAAGAT 200: TCGAAATACCAACTGTCGAAAGCAGAACGTGAATGTACCACCCGGATGCG Base Counts: A: 53 C: 53 G: 53 T: 41 ------ Σ: 200 Insert @ 104 : C Delete @ 133 : T Insert @ 60 : A Insert @ 42 : G Swap @ 14 : A -> C Insert @ 88 : A Delete @ 9 : C Swap @ 185 : A -> G Insert @ 27 : G Swap @ 102 : C -> T Mutated Sequence: 50: CCGGCGAATGGTCGGTCTTTAATTATGGCGGCCGCGATCGCGGACACAGG 100: TGCAGGAGGAAAAATAGGCCCCCGTTCTGGGCAGCCTGAATTGCACACTC 150: CTGATACCCAGACGTGTGGCGGCTTTTTCGCAAGACTTACCAAACATTAA 200: GATTCGAAATACCAACTGTCGAAAGCAGAACGTGAGTGTACCACCCGGAT 250: GCG Base Counts: A: 53 C: 53 G: 56 T: 41 ------ Σ: 203
J
ACGT=: 'ACGT'
MUTS=: ;: 'del ins mut'
NB. generate sequence of size y of uniformly selected nucleotides.
NB. represent sequences as ints in range i.4 pretty printed. nuc
NB. defined separately to avoid fixing value inside mutation
NB. functions.
nuc=: monad : '?4'
dna=: nuc"0 @ i.
NB. randomly mutate nucleotide at a random index by deletion insertion
NB. or mutation of a nucleotide.
del=: {.,[:}.}.
ins=: {.,nuc@],}.
mut=: {.,nuc@],[:}.}.
NB. pretty print nucleotides in rows of 50 with numbering
seq=: [: (;~ [: (4&":"0) 50*i.@#) _50]\{&ACGT
sim=: monad define
'n k ws'=. y NB. initial size, mutations, and weights for mutations
ws=. (% +/) ws NB. normalize weights
A=.0$]D0=.D=. dna n NB. initial dna and history of actions
NB. k times do a random action according to weights and record it
for. i.k do.
D=.". action=. (":?#D),' ',(":MUTS{::~(+/\ws)I.?0),' D'
A=. action ; A
end.
echo 'actions';,. A-.a:
echo ('mutation';'probability') , MUTS ,. <"0 ws
('start';'end'),.(seq D0) ,: seq D
)
simulate=: (sim@(1 1 1&; &. |. ))`sim@.(3=#)
- Output:
simulate 200 ; 10 ┌─────────┐ │actions │ ├─────────┤ │60 mut D │ ├─────────┤ │156 del D│ ├─────────┤ │44 mut D │ ├─────────┤ │64 mut D │ ├─────────┤ │167 mut D│ ├─────────┤ │40 ins D │ ├─────────┤ │39 mut D │ ├─────────┤ │187 del D│ ├─────────┤ │186 del D│ ├─────────┤ │150 del D│ └─────────┘ ┌────────┬───────────┐ │mutation│probability│ ├────────┼───────────┤ │del │0.333333 │ ├────────┼───────────┤ │ins │0.333333 │ ├────────┼───────────┤ │mut │0.333333 │ └────────┴───────────┘ ┌─────┬────┬──────────────────────────────────────────────────┐ │start│ 0│GGCTGTTGGCCGCCAATCTACAATGATAGCGCGTGAGGAGGGCTAATGTA│ │ │ 50│GAGCCAATAATGGATGCTCGCGCTTCTGCTTATGCTGGTTACTGCTGCCC│ │ │ 100│AAAAACGGGGTACATTGAGCGATAAGCCCGCAAGGTTACTGCTCGTGACA│ │ │ 150│GTCCGAACACCACATTCGTGGTTACTCGACTCTGCCACCTCTTAGCGGAT│ ├─────┼────┼──────────────────────────────────────────────────┤ │end │ 0│GGCTGTTGGCCGCCAATCTACAATGATAGCGCGTGAGGACCGGCAAATGT│ │ │ 50│AGAGCCAATACTGGATGCTCGCGCTTCTGCTTATGCTGGTTACTGCTGCC│ │ │ 100│CAAAAACGGGGTACATTGAGCGATAAGCCCGCAAGGTTACTGCTCGTGAC│ │ │ 150│ATCCGACACCACATTCCTGGTTACTCGACTCTGCCACCTTAGCGGAT │ └─────┴────┴──────────────────────────────────────────────────┘ simulate 200 ; 10 ; 1 3 1 ┌─────────┐ │actions │ ├─────────┤ │120 ins D│ ├─────────┤ │199 ins D│ ├─────────┤ │138 mut D│ ├─────────┤ │15 ins D │ ├─────────┤ │8 del D │ ├─────────┤ │135 ins D│ ├─────────┤ │29 ins D │ ├─────────┤ │118 del D│ ├─────────┤ │111 ins D│ ├─────────┤ │10 del D │ └─────────┘ ┌────────┬───────────┐ │mutation│probability│ ├────────┼───────────┤ │del │0.2 │ ├────────┼───────────┤ │ins │0.6 │ ├────────┼───────────┤ │mut │0.2 │ └────────┴───────────┘ ┌─────┬────┬──────────────────────────────────────────────────┐ │start│ 0│GAACATACAATATCGTGTGGGTGGTAAGGTGCGCCGATTTGGCAGTGTAG│ │ │ 50│AGCGGCCTCTGGCCGGGCCCATACTGACATATCTTTTATCTCCGTGCTAG│ │ │ 100│CAGAAGAATCAAACGCGTCAAGATGCTGGCGCGGGCTGATATGCGCCCGG│ │ │ 150│CAGTGGAGAACTGCGTTGATACACCTCAAAGATAAGCGGACGATATTAGC│ ├─────┼────┼──────────────────────────────────────────────────┤ │end │ 0│GAACATACAATCGTGCTGGGTGGTAAGGTTGCGCCGATTTGGCAGTGTAG│ │ │ 50│AGCGGCCTCTGGCCGGGCCCATACTGACATATCTTTTATCTCCGTGCTAG│ │ │ 100│CAGAAGAATCAACACGCGTAGAGATGCTGGCGCGGGACTGATATGCGCCC│ │ │ 150│GGCAGTGGAGAACTGCGTTGATACACCTCAAAGATAAGCGGACGATATTA│ │ │ 200│GGC │ └─────┴────┴──────────────────────────────────────────────────┘
Java
This example use a List
to hold the base values.
The Random
class is used to generate random integer values.
A record
is used to hold the counts of each base.
The "pretty print" is defined within the toString
method.
Which uses a StringBuilder
to generate a string of sequential bases.
A BufferedReader
to read the augmented string line-for-line.
Finally, a string formatter is used to justify and format the output text.
import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;
class Program {
List<Character> sequence;
Random random;
SequenceMutation() {
sequence = new ArrayList<>();
random = new Random();
}
void generate(int amount) {
for (int count = 0; count < amount; count++)
sequence.add(randomBase());
}
void mutate(int amount) {
int index;
for (int count = 0; count < amount; count++) {
index = random.nextInt(0, sequence.size());
switch (random.nextInt(0, 3)) {
case 0 -> sequence.set(index, randomBase());
case 1 -> sequence.remove(index);
case 2 -> sequence.add(index, randomBase());
}
}
}
private char randomBase() {
return switch (random.nextInt(0, 4)) {
case 0 -> 'A';
case 1 -> 'C';
case 2 -> 'G';
case 3 -> 'T';
default -> '?';
};
}
private Base count(String string) {
int a = 0, c = 0, g = 0, t = 0;
for (char base : string.toCharArray()) {
switch (base) {
case 'A' -> a++;
case 'C' -> c++;
case 'G' -> g++;
case 'T' -> t++;
}
}
return new Base(a, c, g, t);
}
/* used exclusively for count totals */
private record Base(int a, int c, int g, int t) {
int total() {
return a + c + g + t;
}
@Override
public String toString() {
return "[A %2d, C %2d, G %2d, T %2d]".formatted(a, c, g, t);
}
}
@Override
public String toString() {
StringBuilder string = new StringBuilder();
StringBuilder stringB = new StringBuilder();
String newline = System.lineSeparator();
for (int index = 0; index < sequence.size(); index++) {
if (index != 0 && index % 50 == 0)
string.append(newline);
string.append(sequence.get(index));
stringB.append(sequence.get(index));
}
try {
BufferedReader reader = new BufferedReader(new StringReader(string.toString()));
string = new StringBuilder();
int count = 0;
String line;
while ((line = reader.readLine()) != null) {
string.append(count++);
string.append(" %-50s ".formatted(line));
string.append(count(line));
string.append(newline);
}
} catch (IOException exception) {
/* ignore */
}
string.append(newline);
Base bases = count(stringB.toString());
int total = bases.total();
string.append("Total of %d bases%n".formatted(total));
string.append("A %3d (%.2f%%)%n".formatted(bases.a, ((double) bases.a / total) * 100));
string.append("C %3d (%.2f%%)%n".formatted(bases.c, ((double) bases.c / total) * 100));
string.append("G %3d (%.2f%%)%n".formatted(bases.g, ((double) bases.g / total) * 100));
string.append("T %3d (%.2f%%)%n".formatted(bases.t, ((double) bases.t / total) * 100));
return string.toString();
}
}
Here is a sequence of 200 mutated 10 times.
Before mutation 0 TCGCTTGGGGGGAGCAAGGTGTTCGCAATAGATCACAGCCGGTCTCGCAT [A 10, C 12, G 17, T 11] 1 AGCTATTTCTACGCTATCGAGCCTGTACTGTGTCAGTAGACCATGTACTC [A 11, C 13, G 10, T 16] 2 CCCAGACTCGTCTTGCCAAGTGACTGCTCAAGGGGAGCGCCCACAGGGTA [A 11, C 16, G 15, T 8] 3 TCTAGAGCATTCGATCACACGGAAAAATTTTATTCGGCAGATCCAGTTAA [A 17, C 10, G 9, T 14] Total of 200 bases A 49 (24.50%) C 51 (25.50%) G 51 (25.50%) T 49 (24.50%) After mutation 0 TCGCTTGGGGGGAGTAAGGTGTTCGCAATAGTCACAGCCGGTCTCGCATA [A 10, C 11, G 17, T 12] 1 GCTTTTCTACGCATCGAGCCTGTACTGTGTCAGTAGACATGTACTCCCCA [A 10, C 15, G 10, T 15] 2 GACTCGTTTGCCAAGTGACCTGCTCAAGGGGAGCGCCCACAGGGTACTAG [A 11, C 14, G 16, T 9] 3 AGCAGTTCGATCACACGGAAAAATTTTTTCGGCAGATCCAGTTAA [A 15, C 9, G 9, T 12] Total of 195 bases A 46 (23.59%) C 49 (25.13%) G 52 (26.67%) T 48 (24.62%)
Here is a sequence of 200 mutated 90,000 times
Before mutation 0 CGCACCCTCCTTCGGGCGAAGCGGGGTTATTTACCCGATTCACCGCACCT [A 8, C 19, G 12, T 11] 1 CGCGGCTCTAAAAGTTCGAAGATCCCTGCGTAGACTGGACCTCATAACAA [A 15, C 14, G 11, T 10] 2 CCGTATTACGCTCCGTACGAATAACTCGGTTGTGCGATGCGGAAAGCGAC [A 12, C 13, G 14, T 11] 3 ATTTCTCAGGCCGAACGTACGCTTCTCTCCTACACCTCGCCTCGAGTATG [A 9, C 18, G 9, T 14] Total of 200 bases A 44 (22.00%) C 64 (32.00%) G 46 (23.00%) T 46 (23.00%) After mutation 0 CGTTTAAGCGGGAAGGTCGTCCACCACACGAAGGCCCCCCTCCAGCACTA [A 12, C 19, G 12, T 7] 1 CCCTGGGCGAGTGCGACCGGCTACAAGAATACGGACAACCGCACTTCGTA [A 13, C 16, G 14, T 7] 2 GTTGCGACGCCAAACCGAGGTTTGAAAGGCAGCCGAAACTCCTAGCCATC [A 14, C 15, G 13, T 8] 3 CGGGCAGCCCACTGGTTTAGATGTTACGTGATGGAAAGGTGGATCATCGT [A 11, C 9, G 17, T 13] 4 GGTTGCCCTGGCGTTGCGTACTTCGTGTCTGAATATTGGTTACAATCGCT [A 7, C 11, G 14, T 18] 5 CGACGACCTGACGATTCTGGATCAACCAACTGCCTAAAGTCGCGAATTAA [A 16, C 14, G 10, T 10] 6 TAATCGACTGCATCACATGTTAGTCTAGTCATCACGAGTACATAGTGTGG [A 14, C 10, G 11, T 15] 7 CCACCTCCTAACGTACTATTTACATAGGATATGGCAGCCCTAACGCACAC [A 15, C 17, G 7, T 11] 8 TGTACGAAAGTGAGACTCCTTACCGAGATTCTAGGCTTAGTGATCCTTGA [A 13, C 10, G 12, T 15] 9 AAACGCTAGCCTAGGAATGACGGGGACTTGATCGGCC [A 10, C 9, G 12, T 6] Total of 487 bases A 125 (25.67%) C 130 (26.69%) G 122 (25.05%) T 110 (22.59%)
Here is an alternate demonstration
import java.util.Arrays;
import java.util.Random;
public class SequenceMutation {
public static void main(String[] args) {
SequenceMutation sm = new SequenceMutation();
sm.setWeight(OP_CHANGE, 3);
String sequence = sm.generateSequence(250);
System.out.println("Initial sequence:");
printSequence(sequence);
int count = 10;
for (int i = 0; i < count; ++i)
sequence = sm.mutateSequence(sequence);
System.out.println("After " + count + " mutations:");
printSequence(sequence);
}
public SequenceMutation() {
totalWeight_ = OP_COUNT;
Arrays.fill(operationWeight_, 1);
}
public String generateSequence(int length) {
char[] ch = new char[length];
for (int i = 0; i < length; ++i)
ch[i] = getRandomBase();
return new String(ch);
}
public void setWeight(int operation, int weight) {
totalWeight_ -= operationWeight_[operation];
operationWeight_[operation] = weight;
totalWeight_ += weight;
}
public String mutateSequence(String sequence) {
char[] ch = sequence.toCharArray();
int pos = random_.nextInt(ch.length);
int operation = getRandomOperation();
if (operation == OP_CHANGE) {
char b = getRandomBase();
System.out.println("Change base at position " + pos + " from "
+ ch[pos] + " to " + b);
ch[pos] = b;
} else if (operation == OP_ERASE) {
System.out.println("Erase base " + ch[pos] + " at position " + pos);
char[] newCh = new char[ch.length - 1];
System.arraycopy(ch, 0, newCh, 0, pos);
System.arraycopy(ch, pos + 1, newCh, pos, ch.length - pos - 1);
ch = newCh;
} else if (operation == OP_INSERT) {
char b = getRandomBase();
System.out.println("Insert base " + b + " at position " + pos);
char[] newCh = new char[ch.length + 1];
System.arraycopy(ch, 0, newCh, 0, pos);
System.arraycopy(ch, pos, newCh, pos + 1, ch.length - pos);
newCh[pos] = b;
ch = newCh;
}
return new String(ch);
}
public static void printSequence(String sequence) {
int[] count = new int[BASES.length];
for (int i = 0, n = sequence.length(); i < n; ++i) {
if (i % 50 == 0) {
if (i != 0)
System.out.println();
System.out.printf("%3d: ", i);
}
char ch = sequence.charAt(i);
System.out.print(ch);
for (int j = 0; j < BASES.length; ++j) {
if (BASES[j] == ch) {
++count[j];
break;
}
}
}
System.out.println();
System.out.println("Base counts:");
int total = 0;
for (int j = 0; j < BASES.length; ++j) {
total += count[j];
System.out.print(BASES[j] + ": " + count[j] + ", ");
}
System.out.println("Total: " + total);
}
private char getRandomBase() {
return BASES[random_.nextInt(BASES.length)];
}
private int getRandomOperation() {
int n = random_.nextInt(totalWeight_), op = 0;
for (int weight = 0; op < OP_COUNT; ++op) {
weight += operationWeight_[op];
if (n < weight)
break;
}
return op;
}
private final Random random_ = new Random();
private int[] operationWeight_ = new int[OP_COUNT];
private int totalWeight_ = 0;
private static final int OP_CHANGE = 0;
private static final int OP_ERASE = 1;
private static final int OP_INSERT = 2;
private static final int OP_COUNT = 3;
private static final char[] BASES = {'A', 'C', 'G', 'T'};
}
- Output:
Initial sequence: 0: TCCCCTCCAGTTAGCAGAAATATTAGCTAACGATACCTCGACACGGAGGG 50: GTGGGGCCAACTCTTAACACAATTACGAGAACCATCCTTCGAAAGCAAAA 100: AAGTTTATGCCTGTTGTGTCAGGAACCCCCCGCGACGGACAACACAGTAA 150: GCACCTGCGGATACTGTGGTTGCCCTGAAAGACGGAGGATGCCTCCTATG 200: TCATTTAGAACTATCGAACGTACGGTTCTTAAATGGTCGTAGTTAGATAG Base counts: A: 73, C: 61, G: 59, T: 57, Total: 250 Insert base T at position 196 Change base at position 19 from A to G Erase base T at position 204 Change base at position 223 from G to T Change base at position 183 from G to C Change base at position 21 from A to T Insert base T at position 40 Change base at position 20 from T to G Insert base T at position 69 Change base at position 19 from G to T After 10 mutations: 0: TCCCCTCCAGTTAGCAGAATGTTTAGCTAACGATACCTCGTACACGGAGG 50: GGTGGGGCCAACTCTTAACTACAATTACGAGAACCATCCTTCGAAAGCAA 100: AAAAGTTTATGCCTGTTGTGTCAGGAACCCCCCGCGACGGACAACACAGT 150: AAGCACCTGCGGATACTGTGGTTGCCCTGAAAGACCGAGGATGCCTCCTT 200: ATGTCATTAGAACTATCGAACGTACTGTTCTTAAATGGTCGTAGTTAGAT 250: AG Base counts: A: 71, C: 62, G: 58, T: 61, Total: 252
JavaScript
// Basic set-up
const numBases = 250
const numMutations = 30
const bases = ['A', 'C', 'G', 'T'];
// Utility functions
/**
* Return a shallow copy of an array
* @param {Array<*>} arr
* @returns {*[]}
*/
const copy = arr => [...arr];
/**
* Get a random int up to but excluding the the given number
* @param {number} max
* @returns {number}
*/
const randTo = max => (Math.random() * max) | 0;
/**
* Given an array return a random element and the index of that element from
* the array.
* @param {Array<*>} arr
* @returns {[*[], number]}
*/
const randSelect = arr => {
const at = randTo(arr.length);
return [arr[at], at];
};
/**
* Given a number or string, return a left padded string
* @param {string|number} v
* @returns {string}
*/
const pad = v => ('' + v).padStart(4, ' ');
/**
* Count the number of elements that match the given value in an array
* @param {Array<string>} arr
* @returns {function(string): number}
*/
const filterCount = arr => s => arr.filter(e => e === s).length;
/**
* Utility logging function
* @param {string|number} v
* @param {string|number} n
*/
const print = (v, n) => console.log(`${pad(v)}:\t${n}`)
/**
* Utility function to randomly select a new base, and an index in the given
* sequence.
* @param {Array<string>} seq
* @param {Array<string>} bases
* @returns {[string, string, number]}
*/
const getVars = (seq, bases) => {
const [newBase, _] = randSelect(bases);
const [extBase, randPos] = randSelect(seq);
return [newBase, extBase, randPos];
};
// Bias the operations
/**
* Given a map of function to ratio, return an array of those functions
* appearing ratio number of times in the array.
* @param weightMap
* @returns {Array<function>}
*/
const weightedOps = weightMap => {
return [...weightMap.entries()].reduce((p, [op, weight]) =>
[...p, ...(Array(weight).fill(op))], []);
};
// Pretty Print functions
const prettyPrint = seq => {
let idx = 0;
const rem = seq.reduce((p, c) => {
const s = p + c;
if (s.length === 50) {
print(idx, s);
idx = idx + 50;
return '';
}
return s;
}, '');
if (rem !== '') {
print(idx, rem);
}
}
const printBases = seq => {
const filterSeq = filterCount(seq);
let tot = 0;
[...bases].forEach(e => {
const cnt = filterSeq(e);
print(e, cnt);
tot = tot + cnt;
})
print('Σ', tot);
}
// Mutation definitions
const swap = ([hist, seq]) => {
const arr = copy(seq);
const [newBase, extBase, randPos] = getVars(arr, bases);
arr.splice(randPos, 1, newBase);
return [[...hist, `Swapped ${extBase} for ${newBase} at ${randPos}`], arr];
};
const del = ([hist, seq]) => {
const arr = copy(seq);
const [newBase, extBase, randPos] = getVars(arr, bases);
arr.splice(randPos, 1);
return [[...hist, `Deleted ${extBase} at ${randPos}`], arr];
}
const insert = ([hist, seq]) => {
const arr = copy(seq);
const [newBase, extBase, randPos] = getVars(arr, bases);
arr.splice(randPos, 0, newBase);
return [[...hist, `Inserted ${newBase} at ${randPos}`], arr];
}
// Create the starting sequence
const seq = Array(numBases).fill(undefined).map(
() => randSelect(bases)[0]);
// Create a weighted set of mutations
const weightMap = new Map()
.set(swap, 1)
.set(del, 1)
.set(insert, 1);
const operations = weightedOps(weightMap);
const mutations = Array(numMutations).fill(undefined).map(
() => randSelect(operations)[0]);
// Mutate the sequence
const [hist, mut] = mutations.reduce((p, c) => c(p), [[], seq]);
console.log('ORIGINAL SEQUENCE:')
prettyPrint(seq);
console.log('\nBASE COUNTS:')
printBases(seq);
console.log('\nMUTATION LOG:')
hist.forEach((e, i) => console.log(`${i}:\t${e}`));
console.log('\nMUTATED SEQUENCE:')
prettyPrint(mut);
console.log('\nMUTATED BASE COUNTS:')
printBases(mut);
- Output:
ORIGINAL SEQUENCE: 0: GTGATCGTAGCGTATCACACGTGCGGGCAGATTGCGGGGCTTCCGTAAGT 50: CTCTCGATTGGCTAAACCTAACGTATGGAGTCGGGCCTGTTCAGAACTGC 100: GCCATAACCACAAACGTCGACAGATAAAGTTCGTGAAGGGACAGGATGAG 150: ATTTTTTTCCCGGTCTGTGCTCAGGGCTTAAATTAGTGCCTTTCCCGAAT 200: GGATACCAACGATTCTGACTGGTTATTTTAATCACCTAATGCCAGTAGTC BASE COUNTS: A: 61 C: 57 G: 64 T: 68 Σ: 250 MUTATION LOG: 0: Inserted A at 231 1: Inserted C at 67 2: Deleted T at 192 3: Inserted A at 106 4: Inserted C at 226 5: Swapped C for G at 34 6: Swapped T for A at 165 7: Inserted G at 11 8: Inserted G at 75 9: Inserted C at 219 10: Swapped A for A at 205 11: Inserted C at 67 12: Inserted C at 34 13: Deleted A at 31 14: Inserted A at 171 15: Deleted G at 0 16: Deleted A at 170 17: Deleted C at 67 18: Deleted T at 173 19: Inserted T at 109 20: Inserted C at 232 21: Inserted G at 137 22: Inserted T at 151 23: Deleted C at 93 24: Deleted A at 95 25: Inserted C at 229 26: Inserted C at 65 27: Inserted G at 84 28: Inserted G at 212 29: Inserted G at 161 MUTATED SEQUENCE: 0: TGATCGTAGCGGTATCACACGTGCGGGCAGTTCGGGGGGCTTCCGTAAGT 50: CTCTCGATTGGCTAACACCCTAACGGTATGGAGTGCGGGCCTGTTAGACT 100: GCGCCATAATACCACAAACGTCGACAGATAAAGTTCGGTGAAGGGACAGG 150: ATTGAGATTTTGTTTCCCGGACTGTGCCAGGGCTTAAATTAGTGCCTTCC 200: CGAATGGATACCAGACGATTCTCGACTGGTTACCTTTCTAAATCACCTAA 250: TGCCAGTAGTC MUTATED BASE COUNTS: A: 62 C: 62 G: 70 T: 67 Σ: 261
jq
Works with gojq, the Go implementation of jq
Adapted from Wren
Since jq does not include a PRNG, the following assumes that an external source of entropy such as /dev/urandom is available. See the "Invocation" section below for details.
### Generic utilities
# Output: a PRN in range(0; .)
def prn:
if . == 1 then 0
else . as $n
| (($n-1)|tostring|length) as $w
| [limit($w; inputs)] | join("") | tonumber
| if . < $n then . else ($n | prn) end
end;
# bag of words
def bow(stream):
reduce stream as $word ({}; .[($word|tostring)] += 1);
# Emit a stream of the constituent characters of the input string
def chars: explode[] | [.] | implode;
def lpad($len): tostring | ($len - length) as $l | (" " * $l) + .;
# Print $n-character segments at a time, each prefixed by a 1-based index
def pretty_nwise($n):
(length | tostring | length) as $len
| def _n($i):
if length == 0 then empty
else "\($i|lpad($len)): \(.[:$n])",
(.[$n:] | _n($i+$n))
end;
_n(1);
### Biology
def bases: ["A", "C", "G", "T"];
def randomBase:
bases | .[length|prn];
# $w is an array [weightSwap, weightDelete, weightInsert]
# specifying the weights out of 300 for each of swap, delete and insert
# Input: an object {dna}
# Output: an object {dna, message}
def mutate($w):
def removeAt($p): .[:$p] + .[$p+1:];
(.dna|length) as $le
# get a random position in the dna to mutate
| ($le | prn) as $p
# get a random number between 0 and 299 inclusive
| (300 | prn) as $r
| .dna |= [chars]
| if $r < $w[0]
then # swap
randomBase as $base
| .message = " Change @\($p) \(.dna[$p]) to \($base)"
| .dna[$p] = $base
elif $r < $w[0] + $w[1]
then # delete
.message = " Delete @\($p) \(.dna[$p])"
| .dna |= removeAt($p)
else # insert
randomBase as $base
| .message = " Insert @\($p) \($base)"
| .dna |= .[:$p] + [$base] + .[$p:]
end
| .dna |= join("") ;
# Generate a random dna sequence of given length:
def generate($n):
[range(0; $n) | randomBase] | join("");
# Pretty print dna and stats.
def prettyPrint($rowLen):
"SEQUENCE:", pretty_nwise($rowLen),
( bow(chars) as $baseMap
| "\nBASE COUNT:",
( bases[] as $c | " \($c): \($baseMap[$c] // 0)" ),
" ------",
" Σ: \(length)",
" ======\n"
) ;
# For displaying the weights
def pretty_weights:
" Change: \(.[0])\n Delete: \(.[1])\n Insert: \(.[2])";
# Arguments are length, weights, mutations
def task($n; $w; $muts ):
generate($n)
| . as $dna
| prettyPrint(50),
"\nWEIGHTS (0 .. 300):", ($w|pretty_weights),
"\nMUTATIONS (\($muts)):",
(reduce range(0;$muts) as $i ({$dna};
mutate($w)
| .emit += [.message] )
| (.emit | join("\n")),
"",
(.dna | prettyPrint(50)) ) ;
task(250; # length
[100, 100, 100]; # use e.g. [0, 300, 0] to choose only deletions
10 # mutations
)
Invocation:
< /dev/urandom tr -cd '0-9' | fold -w 1 | $JQ -cnr -f rc-sequence-mutation.jq
- Output:
SEQUENCE: 1: AGGACACTGCCTTATTTTGTTTCAACAGAAGCCATCTCGAGCAACTACGT 51: GGCCACACAAGCTAATACGAATGACCTTGTATGGGGAGTTACGGGGGGTT 101: TATCTTGAGAAATGGTATAACGATACCCCAAGTGGCGTGATAGGCCGCGC 151: GGGCCTCAGAATAGGTCGTAGATCCGTAAGGGCACCGGGAGCCTTTCTTC 201: TCGTATAATCCGCCGAGATGTTAAAAGACAGCTATGGATTCCCGTAATGC BASE COUNT: A: 66 C: 57 G: 67 T: 60 ------ Σ: 250 ====== WEIGHTS (0 .. 300): Change: 100 Delete: 100 Insert: 100 MUTATIONS (10): Insert @76 T Delete @104 C Change @197 T to T Insert @206 A Delete @184 C Change @69 A to C Insert @211 G Delete @31 C Insert @165 G Insert @234 T SEQUENCE: 1: AGGACACTGCCTTATTTTGTTTCAACAGAAGCATCTCGAGCAACTACGTG 51: GCCACACAAGCTAATACGCATGACCTTTGTATGGGGAGTTACGGGGGGTT 101: TATTTGAGAAATGGTATAACGATACCCCAAGTGGCGTGATAGGCCGCGCG 151: GGCCTCAGAATAGGTGCGTAGATCCGTAAGGGCACGGGAGCCTTTCTTCT 201: CGTATAAATCCGGCCGAGATGTTAAAAGACAGCTTATGGATTCCCGTAAT 251: GC BASE COUNT: A: 66 C: 55 G: 69 T: 62 ------ Σ: 252 ======
Julia
dnabases = ['A', 'C', 'G', 'T']
randpos(seq) = rand(1:length(seq)) # 1
mutateat(pos, seq) = (s = seq[:]; s[pos] = rand(dnabases); s) # 2-1
deleteat(pos, seq) = [seq[1:pos-1]; seq[pos+1:end]] # 2-2
randinsertat(pos, seq) = [seq[1:pos]; rand(dnabases); seq[pos+1:end]] # 2-3
function weightedmutation(seq, pos, weights=[1, 1, 1], verbose=true) # Extra credit
p, r = weights ./ sum(weights), rand()
f = (r <= p[1]) ? mutateat : (r < p[1] + p[2]) ? deleteat : randinsertat
verbose && print("Mutate by ", f == mutateat ? "swap" :
f == deleteat ? "delete" : "insert")
return f(pos, seq)
end
function weightedrandomsitemutation(seq, weights=[1, 1, 1], verbose=true)
position = randpos(seq)
newseq = weightedmutation(seq, position, weights, verbose)
verbose && println(" at position $position")
return newseq
end
randdnasequence(n) = rand(dnabases, n) # 3
function dnasequenceprettyprint(seq, colsize=50) # 4
println(length(seq), "nt DNA sequence:\n")
rows = [seq[i:min(length(seq), i + colsize - 1)] for i in 1:colsize:length(seq)]
for (i, r) in enumerate(rows)
println(lpad(colsize * (i - 1), 5), " ", String(r))
end
bases = [[c, 0] for c in dnabases]
for c in seq, base in bases
if c == base[1]
base[2] += 1
end
end
println("\nNucleotide counts:\n")
for base in bases
println(lpad(base[1], 10), lpad(string(base[2]), 12))
end
println(lpad("Other", 10), lpad(string(length(seq) - sum(x[2] for x in bases)), 12))
println(" _________________\n", lpad("Total", 10), lpad(string(length(seq)), 12))
end
function testbioseq()
sequence = randdnasequence(500)
dnasequenceprettyprint(sequence)
for _ in 1:10 # 5
sequence = weightedrandomsitemutation(sequence)
end
println("\n Mutated:"); dnasequenceprettyprint(sequence) # 6
end
testbioseq()
- Output:
500nt DNA sequence: 0 TCACGTAACAGAGGTATAGTTGATCTTGAGCGGGCTGGCTCCCGGGTTTC 50 TAGCAAGAAAAGGGGAGGGAAGTGCGCCTGCTTTTGCCCCGGGCACCCCA 100 ATCGAAGACAGCTCCGGGGTCGCACATTTTTATGGCCACATAATGAGGGA 150 ATGCACGCATCACTCCTATCACTAACTGCGAACTCATGTGACTGTGCAAA 200 ACACCTTTAACACTGCGATGCCGTGGGGACGGGCCCCCCCAGCGGTATAG 250 CGCGCACACGCGACAGATGTTAACTCGAATGGTCGCGCCGGGGAGTGCAC 300 CCCTTGACATATACTCCAGATGCAATGCGCTATACTTTATGAACTTGCAT 350 AAGCTGCGCAGGGGGGATTGACTTATACTACATATTAACTACCGATATCG 400 ACGCAAATATTCGGGCGGTCTAAAGTGTGTCAGAACGGACATGCCGCCCA 450 GAATCACGGCTACTGAGGACAAATACGCATTCCCGGTGCTGCATTCATTC Nucleotide counts: A 127 C 133 G 131 T 109 Other 0 _________________ Total 500 Mutate by swap at position 253 Mutate by swap at position 448 Mutate by insert at position 379 Mutate by delete at position 311 Mutate by delete at position 335 Mutate by insert at position 132 Mutate by swap at position 191 Mutate by swap at position 481 Mutate by insert at position 189 Mutate by insert at position 423 Mutated: 502nt DNA sequence: 0 TCACGTAACAGAGGTATAGTTGATCTTGAGCGGGCTGGCTCCCGGGTTTC 50 TAGCAAGAAAAGGGGAGGGAAGTGCGCCTGCTTTTGCCCCGGGCACCCCA 100 ATCGAAGACAGCTCCGGGGTCGCACATTTTTACTGGCCACATAATGAGGG 150 AATGCACGCATCACTCCTATCACTAACTGCGAACTCATGATCACTGTGCA 200 AAACACCTTTAACACTGCGATGCCGTGGGGACGGGCCCCCCCAGCGGTAT 250 AGCGAGCACACGCGACAGATGTTAACTCGAATGGTCGCGCCGGGGAGTGC 300 ACCCCTTGACATTACTCCAGATGCAATGCGCTATACTTATGAACTTGCAT 350 AAGCTGCGCAGGGGGGATTGACTTATACTGACATATTAACTACCGATATC 400 GACGCAAATATTCGGGCGGTCTAGAAGTGTGTCAGAACGGACATGCCGCT 450 CAGAATCACGGCTACTGAGGACAAATACGCATTCCCGGTGCTGCATTCAT 500 TC Nucleotide counts: A 128 C 133 G 132 T 109 Other 0 _________________ Total 502
Lua
Using the prettyprint()
function from Bioinformatics/base_count#Lua (not replicated here)
math.randomseed(os.time())
bases = {"A","C","T","G"}
function randbase() return bases[math.random(#bases)] end
function mutate(seq)
local i,h = math.random(#seq), "%-6s %3s at %3d"
local old,new = seq:sub(i,i), randbase()
local ops = {
function(s) h=h:format("Swap", old..">"..new, i) return s:sub(1,i-1)..new..s:sub(i+1) end,
function(s) h=h:format("Delete", " -"..old, i) return s:sub(1,i-1)..s:sub(i+1) end,
function(s) h=h:format("Insert", " +"..new, i) return s:sub(1,i-1)..new..s:sub(i) end,
}
local weighted = { 1,1,2,3 }
local n = weighted[math.random(#weighted)]
return ops[n](seq), h
end
local seq,hist="",{} for i = 1, 200 do seq=seq..randbase() end
print("ORIGINAL:")
prettyprint(seq)
print()
for i = 1, 10 do seq,h=mutate(seq) hist[#hist+1]=h end
print("MUTATIONS:")
for i,h in ipairs(hist) do print(" "..h) end
print()
print("MUTATED:")
prettyprint(seq)
- Output:
ORIGINAL: LOCUS AB000000 200 bp mRNA linear HUM 01-JAN-2001 BASE COUNT 50 a 47 c 51 g 52 t ORIGIN 1 atggatccga cgtgattata ttcactatgg ggcaatcgca cattagtttt atctccatca 61 gcgacacgat ggggatcaat gggctgctac tggagacgtc cgatgcgatg attggtaatt 121 gcatagagtg gatctccttt aacctagtag aaacgccctt ccggttcagc atggcgagtg 181 cgtacaacgt cacccagact MUTATIONS: Insert +A at 190 Delete -C at 134 Swap A>G at 57 Delete -G at 83 Insert +T at 81 Swap T>T at 164 Delete -C at 199 Swap T>G at 147 Swap C>G at 33 Swap C>G at 191 MUTATED: LOCUS AB000000 199 bp mRNA linear HUM 01-JAN-2001 BASE COUNT 50 a 43 c 54 g 52 t ORIGIN 1 atggatccga cgtgattata ttcactatgg gggaatcgca cattagtttt atctccgtca 61 gcgacacgat ggggatcaat tggctgctac tggagacgtc cgatgcgatg attggtaatt 121 gcatagagtg gattccttta acctaggaga aacgcccttc cggttcagca tggcgagtgc 181 gtacaacgat gacccagat
Mathematica / Wolfram Language
BioSequence is a fundamental data type in Mathematica:
SeedRandom[13122345];
seq = BioSequence["DNA", "ATAAACGTACGTTTTTAGGCT"];
randompos = RandomInteger[seq["SequenceLength"]];
seq = StringReplacePart[seq, RandomChoice[{"A", "T", "C", "G"}], {randompos, randompos}];
randompos = RandomInteger[seq["SequenceLength"]];
seq = StringReplacePart[seq, "", {randompos, randompos}];
randompos = RandomInteger[seq["SequenceLength"]];
seq = StringInsert[seq, RandomChoice[{"A", "T", "C", "G"}], randompos];
seq = BioSequence["DNA", StringJoin@RandomChoice[{"A", "T", "C", "G"}, 250]];
size = 50;
parts = StringPartition[seq["SequenceString"], UpTo[size]];
begins = Most[Accumulate[Prepend[StringLength /@ parts, 1]]];
ends = Rest[Accumulate[Prepend[StringLength /@ parts, 0]]];
StringRiffle[MapThread[ToString[#1] <> "-" <> ToString[#2] <> ": " <> #3 &, {begins, ends, parts}], "\n"]
Tally[Characters[seq["SequenceString"]]]
Do[
type = RandomChoice[{1, 2, 3}];
Switch[type, 1,
randompos = RandomInteger[seq["SequenceLength"]];
seq = StringReplacePart[seq, RandomChoice[{"A", "T", "C", "G"}], {randompos, randompos}];
, 2,
randompos = RandomInteger[seq["SequenceLength"]];
seq = StringReplacePart[seq, "", {randompos, randompos}];
, 3,
randompos = RandomInteger[seq["SequenceLength"]];
seq = StringInsert[seq, RandomChoice[{"A", "T", "C", "G"}], randompos];
]
,
{10}
]
parts = StringPartition[seq["SequenceString"], UpTo[size]];
begins = Most[Accumulate[Prepend[StringLength /@ parts, 1]]];
ends = Rest[Accumulate[Prepend[StringLength /@ parts, 0]]];
StringRiffle[MapThread[ToString[#1] <> "-" <> ToString[#2] <> ": " <> #3 &, {begins, ends, parts}], "\n"]
Tally[Characters[seq["SequenceString"]]]
- Output:
1-50: TAGCAGGGGAATTGTCGACTCCCGGGTTTCAATTGCCAACCAAGCATATT 51-100: GTACGCTCGTTCATTATAGGGGAAATGCGAGGGGCTAGAACGTTAGCTTC 101-150: GAGAGGTCGCGGCAATTTAGGGGGGCACCAAACGGTTTATAATACAGGGA 151-200: CTGATACATTCGCTGGAAAACAATTCTGCCCAGCAGCGACTCCGGACAAC 201-250: GTGACTTTGGTCCAAGATATTAGATTATCAATCCGTATTAATGTAGGCTT {{"T", 63}, {"A", 69}, {"G", 66}, {"C", 52}} 1-50: TAGCAGGGGAATTGTCGACTCCCGGGTTCAATTGCCAACCAAGATATTGT 51-100: ACGCTCGTTCATTATAGGGGAAATGCGAGGGGCTAGAAACGTTAGTTCGA 101-150: GAGGTCGCGGAAATTTAGGGGGGCACCAACGGTTTATAATACAGGGACTG 151-200: ATACATTCGCTGGAAAACAATTCTGCCCAGCAGCGACTCCGGACAACGTG 201-246: ACTTTGGTCCAAGATAGTTAGATATCAATCCGTATAATGTAGGCTT {{"T", 60}, {"A", 70}, {"G", 67}, {"C", 49}}
Nim
import random
import strformat
import strutils
type
# Enumeration type for bases.
Base {.pure.} = enum A, C, G, T, Other = "other"
# Sequence of bases.
DnaSequence = string
# Kind of mutation.
Mutation = enum mutSwap, mutDelete, mutInsert
const MaxBaseVal = ord(Base.high) - 1 # Maximum base value.
#---------------------------------------------------------------------------------------------------
template toChar(base: Base): char = ($base)[0]
#---------------------------------------------------------------------------------------------------
proc newDnaSeq(length: Natural): DnaSequence =
## Create a DNA sequence of given length.
result = newStringOfCap(length)
for _ in 1..length:
result.add($Base(rand(MaxBaseVal)))
#---------------------------------------------------------------------------------------------------
proc mutate(dnaSeq: var DnaSequence) =
## Mutate a sequence (it is changed in place).
# Choose randomly the position of mutation.
let idx = rand(dnaSeq.high)
# Choose randomly the kind of mutation.
let mut = Mutation(rand(ord(Mutation.high)))
# Apply the mutation.
case mut
of mutSwap:
let newBase = Base(rand(MaxBaseVal))
echo fmt"Changing base at position {idx + 1} from {dnaSeq[idx]} to {newBase}"
dnaSeq[idx] = newBase.toChar
of mutDelete:
echo fmt"Deleting base {dnaSeq[idx]} at position {idx + 1}"
dnaSeq.delete(idx, idx)
of mutInsert:
let newBase = Base(rand(MaxBaseVal))
echo fmt"Inserting base {newBase} at position {idx + 1}"
dnaSeq.insert($newBase, idx)
#---------------------------------------------------------------------------------------------------
proc display(dnaSeq: DnaSequence) =
## Display a DNA sequence using EMBL format.
var counts: array[Base, Natural] # Count of bases.
for c in dnaSeq:
inc counts[parseEnum[Base]($c, Other)] # Use Other as default value.
# Display the SQ line.
var sqline = fmt"SQ {dnaSeq.len} BP; "
for (base, count) in counts.pairs:
sqline &= fmt"{count} {base}; "
echo sqline
# Display the sequence.
var idx = 0
var row = newStringOfCap(80)
var remaining = dnaSeq.len
while remaining > 0:
row.setLen(0)
row.add(" ")
# Add groups of 10 bases.
for group in 1..6:
let nextIdx = idx + min(10, remaining)
for i in idx..<nextIdx:
row.add($dnaSeq[i])
row.add(' ')
dec remaining, nextIdx - idx
idx = nextIdx
if remaining == 0:
break
# Append the number of the last base in the row.
row.add(spaces(72 - row.len))
row.add(fmt"{idx:>8}")
echo row
# Add termination.
echo "//"
#———————————————————————————————————————————————————————————————————————————————————————————————————
randomize()
var dnaSeq = newDnaSeq(200)
echo "Initial sequence"
echo "———————————————\n"
dnaSeq.display()
echo "\nMutations"
echo "—————————\n"
for _ in 1..10:
dnaSeq.mutate()
echo "\nMutated sequence"
echo "————————————————\n"
dnaSeq.display()
- Output:
Initial sequence ——————————————— SQ 200 BP; 53 A; 52 C; 45 G; 50 T; 0 other; ATGGATAGAA TGGCACCTCA GTGTCACATG TCCTAGGCAC ATTCTACGTA CTAGTTTCTG 60 GAGGTCGATA AATACAAGAT GGAATACTCT TATCAACCGC TAGCAATGAA ACTTAGATAG 120 CCACCCCTCG ATCGCGGTTC GCTATGCGGC ATCGTCAACT GCGTCAAAGC ACTACGTCGT 180 TTCGTGACTG CCAGTCGAGC 200 // Mutations ————————— Inserting base A at position 121 Changing base at position 101 from T to G Deleting base A at position 115 Changing base at position 126 from C to G Deleting base T at position 155 Deleting base G at position 198 Deleting base T at position 159 Changing base at position 144 from A to T Inserting base C at position 34 Changing base at position 127 from G to G Mutated sequence ———————————————— SQ 198 BP; 52 A; 52 C; 46 G; 48 T; 0 other; ATGGATAGAA TGGCACCTCA GTGTCACATG TCCCTAGGCA CATTCTACGT ACTAGTTTCT 60 GGAGGTCGAT AAATACAAGA TGGAATACTC TTATCAACCG CGAGCAATGA AACTTGATAG 120 ACCACCGCTC GATCGCGGTT CGCTTTGCGG CATCGCAACG CGTCAAAGCA CTACGTCGTT 180 TCGTGACTGC CAGTCGAC 198 //
Perl
use strict;
use warnings;
use feature 'say';
my @bases = <A C G T>;
my $dna;
$dna .= $bases[int rand 4] for 1..200;
my %cnt;
$cnt{$_}++ for split //, $dna;
sub pretty {
my($string) = @_;
my $chunk = 10;
my $wrap = 5 * ($chunk+1);
($string =~ s/(.{$chunk})/$1 /gr) =~ s/(.{$wrap})/$1\n/gr;
}
sub mutate {
my($dna,$count) = @_;
my $orig = $dna;
substr($dna,rand length $dna,1) = $bases[int rand 4] while $count > diff($orig, $dna) =~ tr/acgt//;
$dna
}
sub diff {
my($orig, $repl) = @_;
for my $i (0 .. -1+length $orig) {
substr($repl,$i,1, lc substr $repl,$i,1) if substr($orig,$i,1) ne substr($repl,$i,1);
}
$repl;
}
say "Original DNA strand:\n" . pretty($dna);
say "Total bases: ". length $dna;
say "$_: $cnt{$_}" for @bases;
my $mutate = mutate($dna, 10);
%cnt = ();
$cnt{$_}++ for split //, $mutate;
say "\nMutated DNA strand:\n" . pretty diff $dna, $mutate;
say "Total bases: ". length $mutate;
say "$_: $cnt{$_}" for @bases;
- Output:
Original DNA strand: TGGAACATGT CCCAACGAGT TCTTCTTGCT AGCAGATTTT TTCAGTTGAT CGTCACATGC GGTAGACTAC CCAAGGTGTG ACTACTCGCA TGCCTGATCT AAATGGACAG TCGGCAGGCT AGTGCTAATT ACCGGAAGTA CGAACGAGCC ATGCTGAGCG ACTCATCATT GTGAAATCGA GCCTATCTGC ATGACCTAAT Total bases: 200 A: 52 C: 48 G: 47 T: 53 Mutated DNA strand: TGGAACATGT CCCAACGAGT cCTTCTTGCT AGCcGATTTT TTCAGTTGgT gGTCACATGC aGTAGACTAC CCgAGGTGTG ACTACTCGCA TGCCTGATCT AAATGGACAG TCGGCAGGCT AGTGCTAATT ACCGGAAGTA CGAACGAGCt ATGCaGAGCG ACTCATCgTT GTGAAATCGA GCCTATCTGC AgGACCTAAT Total bases: 200 A: 50 C: 48 G: 51 T: 51
Phix
string dna = repeat(' ',200+rand(300)) for i=1 to length(dna) do dna[i] = "ACGT"[rand(4)] end for procedure show() sequence acgt = repeat(0,5) for i=1 to length(dna) do acgt[find(dna[i],"ACGT")] += 1 end for acgt[$] = sum(acgt) sequence s = split(trim(join_by(split(join_by(dna,1,10,""),"\n"),1,5," ")),"\n") for i=1 to length(s) do printf(1,"%3d: %s\n",{(i-1)*50+1,s[i]}) end for printf(1,"\nBase counts: A:%d, C:%d, G:%d, T:%d, total:%d\n",acgt) end procedure procedure mutate() printf(1,"\n") for i=1 to 10 do integer p = rand(length(dna)), sdi = "SDI"[rand(3)], rep = "ACGT"[rand(4)], was = dna[p] switch sdi do case 'S':dna[p] = rep printf(1,"swapped %c at %d for %c\n",{was,p,rep}) case 'D':dna[p..p] = "" printf(1,"deleted %c at %d\n",{was,p}) case 'I':dna[p..p-1] = ""&rep printf(1,"inserted %c at %d, before %c\n",{rep,p,was}) end switch end for printf(1,"\n") end procedure show() mutate() show()
- Output:
1: ATAGACCGAT GTGTAGGTCT CGAACATCCC TGGGGTAGCT CAGCTTGGGG 51: GTTGACCTGT CTTGCTCCCA TGAACTGAGG GATTTGGAAA TAACGCTTAT 101: AACTGCGGGG GATTGATATG GGACATTGTT GCTGTAGGGC TTCGGCGTGC 151: TTAGAAACAA GAGAACACCA ATTTCGATAG ACCAGGTTTC GTCCCGCTAC 201: GAGTGATAGT AGCGCGTTAG GATTAATAAT CAGGGAGAGC ATTAAACATT 251: CTAAAAACTG ACATTCCCGA GGTGGAACCC GAGTTGATAA CGAGTATGCT 301: CTGAAAAATT AATTGATTGA TCCGCGACAC TATCACACCG TCTTCGCCGT 351: TGTAATGCAT GCTGGCTTAG CATCCGGATG CTCTTCTACC GATCTTAAGG 401: CGCGCACTCC CCAAGGAGCA TAGAAGCATC CCCGGCCCTC GACAGAGTCT 451: CCCAGTGTAA GTGTCTTTAT CCAAAATC Base counts: A:125, C:111, G:120, T:122, total:478 inserted C at 58, before T deleted T at 2 deleted C at 169 deleted G at 80 inserted A at 331, before T swapped C at 27 for T inserted A at 22, before A inserted T at 190, before G swapped C at 195 for C inserted A at 274, before G 1: AAGACCGATG TGTAGGTCTC GAAACATTCC TGGGGTAGCT CAGCTTGGGG 51: GTTGACCCTG TCTTGCTCCC ATGAACTGAG GATTTGGAAA TAACGCTTAT 101: AACTGCGGGG GATTGATATG GGACATTGTT GCTGTAGGGC TTCGGCGTGC 151: TTAGAAACAA GAGAACACAA TTTCGATAGA CCAGGTTTCT GTCCCGCTAC 201: GAGTGATAGT AGCGCGTTAG GATTAATAAT CAGGGAGAGC ATTAAACATT 251: CTAAAAACTG ACATTCCCGA GGTAGGAACC CGAGTTGATA ACGAGTATGC 301: TCTGAAAAAT TAATTGATTG ATCCGCGACA CTAATCACAC CGTCTTCGCC 351: GTTGTAATGC ATGCTGGCTT AGCATCCGGA TGCTCTTCTA CCGATCTTAA 401: GGCGCGCACT CCCCAAGGAG CATAGAAGCA TCCCCGGCCC TCGACAGAGT 451: CTCCCAGTGT AAGTGTCTTT ATCCAAAATC Base counts: A:128, C:110, G:119, T:123, total:480
PureBasic
#BASE$="ACGT"
#SEQLEN=200
#PROTOCOL=#True
Global dna.s
Define i.i
Procedure pprint()
Define p.i, cnt.i, sum.i
For p=1 To Len(dna) Step 50
Print(RSet(Str(p-1)+": ",5))
PrintN(Mid(dna,p,50))
Next
PrintN("Base counts:")
For p=1 To 4
cnt=CountString(dna,Mid(#BASE$,p,1)) : sum+cnt
Print(Mid(#BASE$,p,1)+": "+Str(cnt)+", ")
Next
PrintN("Total: "+Str(sum))
EndProcedure
Procedure InsertAtPos(basenr.i,position.i)
If #PROTOCOL : PrintN("Insert base "+Mid(#BASE$,basenr,1)+" at position "+Str(position)) : EndIf
dna=InsertString(dna,Mid(#BASE$,basenr,1),position)
EndProcedure
Procedure EraseAtPos(position.i)
If #PROTOCOL : PrintN("Erase base "+Mid(dna,position,1)+" at position "+Str(position)) : EndIf
If position>0 And position<=Len(dna)
dna=Left(dna,position-1)+Right(dna,Len(dna)-position)
EndIf
EndProcedure
Procedure OverwriteAtPos(basenr.i,position.i)
If #PROTOCOL : PrintN("Change base at position "+Str(position)+" from "+Mid(dna,position,1)+" to "+Mid(#BASE$,basenr,1)) : EndIf
If position>0 And position<=Len(dna)
position-1
PokeS(@dna+2*position,Mid(#BASE$,basenr,1),-1,#PB_String_NoZero)
EndIf
EndProcedure
If OpenConsole()=0 : End 1 : EndIf
For i=1 To #SEQLEN : dna+Mid(#BASE$,Random(4,1),1) : Next
PrintN("Initial sequence:")
pprint()
For i=1 To 10
Select Random(2)
Case 0 : InsertAtPos(Random(4,1),Random(Len(dna),1))
Case 1 : EraseAtPos(Random(Len(dna),1))
Case 2 : OverwriteAtPos(Random(4,1),Random(Len(dna),1))
EndSelect
Next
PrintN("After 10 mutations:")
pprint()
Input()
- Output:
Initial sequence: 0: AAGTTTACGTCGGACTTCATTAATCGGTTTAGTCAGACCCGATCCAAATC 50: TTGCTTTCACTCCGCATTCTTCTCATGAGTAAAAGGCTGCTCCTGCACTA 100: AAGCGTTCTCAACACCTTGGAGAGCCATCTCGGTACTCCGCGCAAAATAG 150: CCATAGAGGGTATCAGGAAACGCATCGAAGGTTTAGCCGAACTAAGGTCT Base counts: A: 54, C: 52, G: 42, T: 52, Total: 200 Change base at position 7 from A to T Insert base T at position 66 Erase base G at position 198 Insert base C at position 32 Change base at position 80 from A to G Erase base A at position 2 Insert base C at position 33 Insert base C at position 201 Insert base G at position 70 Erase base T at position 187 After 10 mutations: 0: AGTTTTCGTCGGACTTCATTAATCGGTTTACGCTCAGACCCGATCCAAAT 50: CTTGCTTTCACTCCGCTATGTCTTCTCATGGGTAAAAGGCTGCTCCTGCA 100: CTAAAGCGTTCTCAACACCTTGGAGAGCCATCTCGGTACTCCGCGCAAAA 150: TAGCCATAGAGGGTATCAGGAAACGCATCGAAGGTTAGCCGAACTAAGTC 200: CT Base counts: A: 51, C: 55, G: 43, T: 53, Total: 202
Python
In function seq_mutate argument kinds selects between the three kinds of mutation. The characters I, D, and S are chosen from the string to give the kind of mutation to perform, so the more of that character, the more of that type of mutation performed.
Similarly parameter choice is chosen from to give the base for substitution or insertion - the more any base appears, the more likely it is to be chosen in any insertion/substitution.
import random
from collections import Counter
def basecount(dna):
return sorted(Counter(dna).items())
def seq_split(dna, n=50):
return [dna[i: i+n] for i in range(0, len(dna), n)]
def seq_pp(dna, n=50):
for i, part in enumerate(seq_split(dna, n)):
print(f"{i*n:>5}: {part}")
print("\n BASECOUNT:")
tot = 0
for base, count in basecount(dna):
print(f" {base:>3}: {count}")
tot += count
base, count = 'TOT', tot
print(f" {base:>3}= {count}")
def seq_mutate(dna, count=1, kinds="IDSSSS", choice="ATCG" ):
mutation = []
k2txt = dict(I='Insert', D='Delete', S='Substitute')
for _ in range(count):
kind = random.choice(kinds)
index = random.randint(0, len(dna))
if kind == 'I': # Insert
dna = dna[:index] + random.choice(choice) + dna[index:]
elif kind == 'D' and dna: # Delete
dna = dna[:index] + dna[index+1:]
elif kind == 'S' and dna: # Substitute
dna = dna[:index] + random.choice(choice) + dna[index+1:]
mutation.append((k2txt[kind], index))
return dna, mutation
if __name__ == '__main__':
length = 250
print("SEQUENCE:")
sequence = ''.join(random.choices('ACGT', weights=(1, 0.8, .9, 1.1), k=length))
seq_pp(sequence)
print("\n\nMUTATIONS:")
mseq, m = seq_mutate(sequence, 10)
for kind, index in m:
print(f" {kind:>10} @{index}")
print()
seq_pp(mseq)
- Output:
SEQUENCE: 0: GGAAGATTAGGTCACGGGCCTCATCTTGTGCGAGATAAATAATAACACTC 50: AGCGATCATTAGAATGTATATTGTACGGGCATGTTTATCTACCATAGGTC 100: CTGTCAAAAGATGGCTAGCTGCAATTTTTTCTTCTAGATCCCGATTACTG 150: CGGTATTTTTGTATAACGTGCTAAACGGTGTGTTTTCAGGTCGGCCTGCT 200: AATCTAACGCCAGTGGACTTGGGATGGACGCCCAACAACTGAGAGCGCGG BASECOUNT: A: 64 C: 51 G: 62 T: 73 TOT= 250 MUTATIONS: Substitute @138 Substitute @72 Insert @103 Insert @129 Insert @124 Delete @52 Delete @202 Substitute @200 Insert @158 Delete @32 0: GGAAGATTAGGTCACGGGCCTCATCTTGTGCGGATAAATAATAACACTCA 50: GGATCATTAGAATGTATATTATACGGGCATGTTTATCTACCATAGGTCCT 100: GCTCAAAAGATGGCTAGCTGCAGATTTTGTTCTTCTAGAGCCCGATTACT 150: GCGGTATGTTTTGTATAACGTGCTAAACGGTGTGTTTTCAGGTCGGCCTG 200: CTATCTAACGCCAGTGGACTTGGGATGGACGCCCAACAACTGAGAGCGCG 250: G BASECOUNT: A: 63 C: 51 G: 65 T: 72 TOT= 251
Quackery
prettyprint
and tallybases
are defined at Bioinformatics/base count#Quackery.
[ $ "ACGT" 4 random peek ] is randomgene ( --> c )
[ $ "" swap times
[ randomgene join ] ] is randomsequence ( n --> $ )
[ dup size random
3 random
[ table
[ pluck drop ]
[ randomgene unrot stuff ]
[ randomgene unrot poke ] ]
do ] is mutate ( $ --> $ )
200 randomsequence
dup prettyprint cr cr dup tallybases
cr cr say "Mutating..." cr
10 times mutate
dup prettyprint cr cr tallybases
- Output:
0 CGCCCGCACC TAGAACCATT AAGCTGTCTG GTGTCGGGAT CGTCACATTA 50 CGCCCCTTGT TGCGTCGGCG CCGATGCGAA GGCATAATAT GTGGTCTAAT 100 GTCATGCGTG CCCGGGGAAT CTGGCGCGAC CGTCATGGCA AACCGCATCC 150 CCTCAGCAAA TTACTAGCTG GTTGATTTTC ATCATAGGCC TGATCATGTG adenine 41 cytosine 56 guanine 53 thymine 50 total 200 Mutating... 0 AGGCCCGCAC CTAGAACCAT TAAGCTGTCT GGTGTCGGGA TCGTCACATT 50 ACGCCCCTTG TGCGTCGGCG CCGATGCGAA GGCATAATAT GTGGTCTGAT 100 GTCATGCGTG CGCGGGGAAT CTGGCGCGAC CGTCATGGCA AACCGCATCC 150 CCTCAGCAAA ATTCTAGCTG GTTGATTTTA TCTATAGGCC TGACTCATGT 200 G adenine 41 cytosine 54 guanine 56 thymine 50 total 201
Racket
#lang racket
(define current-S-weight (make-parameter 1))
(define current-D-weight (make-parameter 1))
(define current-I-weight (make-parameter 1))
(define bases '(#\A #\C #\G #\T))
(define (fold-sequence seq kons #:finalise (finalise (λ x (apply values x))) . k0s)
(define (recur seq . ks)
(if (null? seq)
(call-with-values (λ () (apply finalise ks)) (λ vs (apply values vs)))
(call-with-values (λ () (apply kons (car seq) ks)) (λ ks+ (apply recur (cdr seq) ks+)))))
(apply recur (if (string? seq) (string->list (regexp-replace* #px"[^ACGT]" seq "")) seq) k0s))
(define (sequence->pretty-printed-string seq)
(define (fmt idx cs-rev) (format "~a: ~a" (~a idx #:width 3 #:align 'right) (list->string (reverse cs-rev))))
(fold-sequence
seq
(λ (b n start-idx lns-rev cs-rev)
(if (zero? (modulo n 50))
(values (+ n 1) n (if (pair? cs-rev) (cons (fmt start-idx cs-rev) lns-rev) lns-rev) (cons b null))
(values (+ n 1) start-idx lns-rev (cons b cs-rev))))
0 0 null null
#:finalise (λ (n idx lns-rev cs-rev)
(string-join (reverse (if (null? cs-rev) lns-rev (cons (fmt idx cs-rev) lns-rev))) "\n"))))
(define (count-bases b as cs gs ts n)
(values (+ as (if (eq? b #\A) 1 0)) (+ cs (if (eq? b #\C) 1 0)) (+ gs (if (eq? b #\T) 1 0)) (+ ts (if (eq? b #\G) 1 0)) (add1 n)))
(define (report-sequence s)
(define-values (as cs gs ts n) (fold-sequence s count-bases 0 0 0 0 0))
(printf "SEQUENCE:~%~a~%" (sequence->pretty-printed-string s))
(printf "BASE COUNT:~%-----------~%~a~%"
(string-join (map (λ (c n) (format " ~a :~a" c (~a #:width 4 #:align 'right n))) bases (list as ts cs gs)) "\n"))
(printf "TOTAL: ~a~%" n))
(define (make-random-sequence-string l)
(list->string (for/list ((_ l)) (list-ref bases (random 4)))))
(define (weighted-random-call weights-and-functions . args)
(let loop ((r (random)) (wfs weights-and-functions))
(if (<= r (car wfs)) (apply (cadr wfs) args) (loop (- r (car wfs)) (cddr wfs)))))
(define (mutate-S s)
(let ((r (random (string-length s))) (i (string (list-ref bases (random 4)))))
(printf "Mutate at ~a -> ~a~%" r i)
(string-append (substring s 0 r) i (substring s (add1 r)))))
(define (mutate-D s)
(let ((r (random (string-length s))))
(printf "Delete at ~a~%" r)
(string-append (substring s 0 r) (substring s (add1 r)))))
(define (mutate-I s)
(let ((r (random (string-length s))) (i (string (list-ref bases (random 4)))))
(printf "Insert at ~a -> ~a~%" r i)
(string-append (substring s 0 r) i (substring s r))))
(define (mutate s)
(define W (+ (current-S-weight) (current-D-weight) (current-I-weight)))
(weighted-random-call
(list (/ (current-S-weight) W) mutate-S (/ (current-D-weight) W) mutate-D (/ (current-I-weight) W) mutate-I)
s))
(module+
main
(define initial-sequence (make-random-sequence-string 200))
(report-sequence initial-sequence)
(newline)
(define s+ (for/fold ((s initial-sequence)) ((_ 10)) (mutate s)))
(newline)
(report-sequence s+)
(newline)
(define s+d (parameterize ((current-D-weight 5)) (for/fold ((s initial-sequence)) ((_ 10)) (mutate s))))
(newline)
(report-sequence s+d))
- Output:
SEQUENCE: 0: AATGCTTGTGTCACCGGCCACCCCCTATGTACATACACGGTTCAGCTGGC 50: CTGGCAGAGCGTCTGACAACGGAGCTCTAAAACTTCCTTCTGACTAGAGC 100: TCATCCTCTCCCGGCTGAACTCCGTGGCTTACAATCAGACCCTACCCGTG 150: TCCAACTACGCAGTCGCCCGATGAGGCTGGGCTCACCACGTTGGGCGAGG BASE COUNT: ----------- A : 41 C : 49 G : 67 T : 43 TOTAL: 200 Insert at 176 -> T Insert at 99 -> G Delete at 154 Mutate at 188 -> A Insert at 14 -> A Mutate at 62 -> G Delete at 110 Mutate at 79 -> T Insert at 109 -> A Insert at 46 -> C SEQUENCE: 0: AATGCTTGTGTCACACGGCCACCCCCTATGTACATACACGGTTCAGCCTG 50: GCCTGGCAGAGCGGCTGACAACGGAGCTCTTAAACTTCCTTCTGACTAGA 100: GGCTCATCCTACCCCGGCTGAACTCCGTGGCTTACAATCAGACCCTACCC 150: GTGTCCACTACGCAGTCGCCCGATGAGGTCTGGGCTCACCACGTTGGGCG 200: AGG BASE COUNT: ----------- A : 41 C : 51 G : 68 T : 43 TOTAL: 203 Delete at 79 Delete at 105 Delete at 63 Delete at 37 Delete at 166 Mutate at 42 -> C Delete at 142 Delete at 23 Delete at 91 Insert at 4 -> T SEQUENCE: 0: AATGTCTTGTGTCACCGGCCACCCCTATGTACATACAGGTTCCGCTGGCC 50: TGGCAGAGCGTCGACAACGGAGCTCTAAACTTCCTTCTGACTGAGCTCAT 100: CCCTCCCGGCTGAACTCCGTGGCTTACAATCAGACCCTACCGTGTCCAAC 150: TACGCAGTCGCCCGTGAGGCTGGGCTCACCACGTTGGGCGAGG BASE COUNT: ----------- A : 37 C : 49 G : 65 T : 42 TOTAL: 193
Raku
(formerly Perl 6)
Unweighted mutations at this point. The mutated DNA strand has a "diff" operation performed on it which (in this specific case) renders the mutated base in lower case so it may be picked out more easily.
my @bases = <A C G T>;
# The DNA strand
my $dna = @bases.roll(200).join;
# The Task
put "ORIGINAL DNA STRAND:";
put pretty $dna;
put "\nTotal bases: ", +my $bases = $dna.comb.Bag;
put $bases.sort( ~*.key ).join: "\n";
put "\nMUTATED DNA STRAND:";
my $mutate = $dna.&mutate(10);
put pretty diff $dna, $mutate;
put "\nTotal bases: ", +my $mutated = $mutate.comb.Bag;
put $mutated.sort( ~*.key ).join: "\n";
# Helper subs
sub pretty ($string, $wrap = 50) {
$string.comb($wrap).map( { sprintf "%8d: %s", $++ * $wrap, $_ } ).join: "\n"
}
sub mutate ($dna is copy, $count = 1) {
$dna.substr-rw((^$dna.chars).roll, 1) = @bases.roll for ^$count;
$dna
}
sub diff ($orig, $repl) {
($orig.comb Z $repl.comb).map( -> ($o, $r) { $o eq $r ?? $o !! $r.lc }).join
}
- Output:
ORIGINAL DNA STRAND: 0: ACGGATAGACCGTTCCTGCAAGCTGGTACGGTTCGAATGTTGACCTTATT 50: CTCCGCAGCGCACTACCCGATCGGGTAACGTACTCTATATGATGCCTATT 100: TTCCCCGCCTTACATCGGCGATCAATGTTCTTTTACGCTAACTAGGCGCA 150: CGTCGTGCCTTACCGAGAGCCAGTTCGAAATCGTGCTGAAAATATCTGGA Total bases: 200 A 45 C 55 G 45 T 55 MUTATED DNA STRAND: 0: ACGGATAGcCCGTTCCTGCAAGCTGGTACGGTTCGAATGTTGACCTTATT 50: CTCCGCAGCGCACTACCCGATCGGGTcACtcACTCTATATGAcGCCTAaT 100: TTCCCCGCCTTACATCGGCGATCAATGTTCTTTTACGCTAACTAGGCGCA 150: CGTCGTGCCTTACCcAGAGCCAGTTCGAAATCGTGCTGAAAATATCTGGA Total bases: 200 A 44 C 60 G 43 T 53
Ring
row = 0
dnaList = []
base = ["A","C","G","T"]
long = 20
see "Initial sequence:" + nl
see " 12345678901234567890" + nl
see " " + long + ": "
for nr = 1 to 200
row = row + 1
rnd = random(3)+1
baseStr = base[rnd]
see baseStr # + " "
if (row%20) = 0 and long < 200
long = long + 20
see nl
if long < 100
see " " + long + ": "
else
see "" + long + ": "
ok
ok
add(dnaList,baseStr)
next
see nl+ " 12345678901234567890" + nl
baseCount(dnaList)
for n = 1 to 10
rnd = random(2)+1
switch rnd
on 1
baseSwap(dnaList)
on 2
baseDelete(dnaList)
on 3
baseInsert(dnaList)
off
next
showDna(dnaList)
baseCount(dnaList)
func baseInsert(dnaList)
rnd1 = random(len(dnaList)-1)+1
rnd2 = random(len(base)-1)+1
insert(dnaList,rnd1,base[rnd2])
see "Insert base " + base[rnd2] + " at position " + rnd1 + nl
return dnaList
func baseDelete(dnaList)
rnd = random(len(dnaList)-1)+1
del(dnaList,rnd)
see "Erase base " + dnaList[rnd] + " at position " + rnd + nl
return dnaList
func baseSwap(dnaList)
rnd1 = random(len(dnaList))
rnd2 = random(3)+1
see "Change base at position " + rnd1 + " from " + dnaList[rnd1] + " to " + base[rnd2] + nl
dnaList[rnd1] = base[rnd2]
func showDna(dnaList)
long = 20
see nl + "After 10 mutations:" + nl
see " 12345678901234567890" + nl
see " " + long + ": "
for nr = 1 to len(dnaList)
row = row + 1
see dnaList[nr]
if (row%20) = 0 and long < 200
long = long + 20
see nl
if long < 100
see " " + long + ": "
else
see "" + long + ": "
ok
ok
next
see nl+ " 12345678901234567890" + nl
func baseCount(dnaList)
dnaBase = [:A=0, :C=0, :G=0, :T=0]
lenDna = len(dnaList)
for n = 1 to lenDna
dnaStr = dnaList[n]
switch dnaStr
on "A"
strA = dnaBase["A"]
strA++
dnaBase["A"] = strA
on "C"
strC = dnaBase["C"]
strC++
dnaBase["C"] = strC
on "G"
strG = dnaBase["G"]
strG++
dnaBase["G"] = strG
on "T"
strT = dnaBase["T"]
strT++
dnaBase["T"] = strT
off
next
see nl
see "A: " + dnaBase["A"] + ", "
see "T: " + dnaBase["T"] + ", "
see "C: " + dnaBase["C"] + ", "
see "G: " + dnaBase["G"] + ", "
total = dnaBase["A"] + dnaBase["T"] + dnaBase["C"] + dnaBase["G"]
see "Total: " + total+ nl + nl
- Output:
Initial sequence: 12345678901234567890 20: GGAGACACTAACGAAACAAA 40: CAGGTATATAGGACATGTAG 60: AAACAATTAATACGTAGCGA 80: ACTGTGGCGCGAAAGAAGGG 100: ATGGACTCGGGTATTGCCGA 120: GATTCACGCCAACGAAAAAT 140: ATCTCAGATGACCGAAATAG 160: GGTCATCGAAATGAGTCCAA 180: ATAACTAAGTGGACAAAGGT 200: AGACCAAAAAGGACAGAAAA 12345678901234567890 A: 84, T: 32, C: 34, G: 50, Total: 200 Change base at position 176 from A to A Change base at position 178 from G to C Change base at position 180 from T to C Erase base T at position 28 Insert base A at position 17 Erase base C at position 52 Change base at position 118 from A to A Insert base T at position 192 Insert base A at position 142 Erase base A at position 18 After 10 mutations: 12345678901234567890 20: GGAGACACTAACGAAACAAA 40: CAGGTATTAGGACATGTAGA 60: AACAATTAATCGTAGCGAAC 80: TGTGGCGCGAAAGAAGGGAT 100: GGACTCGGGTATTGCCGAGA 120: TTCACGCCAACGAAAAATAT 140: CTCAGATGACCGAAATAGGG 160: TACATCGAAATGAGTCCAAA 180: TAACTAAGTGGACAAACGCA 200: GACCAAAAAGGATCAGAAAA 12345678901234567890 A: 83, T: 32, C: 36, G: 49, Total: 200
Ruby
class DNA_Seq
attr_accessor :seq
def initialize(bases: %i[A C G T] , size: 0)
@bases = bases
@seq = Array.new(size){ bases.sample }
end
def mutate(n = 10)
n.times{|n| method([:s, :d, :i].sample).call}
end
def to_s(n = 50)
just_size = @seq.size / n
(0...@seq.size).step(n).map{|from| "#{from.to_s.rjust(just_size)} " + @seq[from, n].join}.join("\n") +
"\nTotal #{seq.size}: #{@seq.tally.sort.to_h.inspect}\n\n"
end
def s = @seq[rand_index]= @bases.sample
def d = @seq.delete_at(rand_index)
def i = @seq.insert(rand_index, @bases.sample )
alias :swap :s
alias :delete :d
alias :insert :i
private
def rand_index = rand( @seq.size )
end
puts test = DNA_Seq.new(size: 200)
test.mutate
puts test
test.delete
puts test
- Output:
0 TAAGGTGAGGAGTGTGATGGAGTTCGGTGGCTAGCCACAAATACAACACA 50 CTCACCCATACTCGCCTCTGAAGCATGTTTTACTTGGATAGGGCCTACAG 100 CAGTATTCACCCATTCCTCGGCTCCTGACCTGATGTAGGTCTATGTGCGG 150 GAAAATAGGACAATACTGCCGAAGTCATATCCATTGGAGGGGCCCCAGGC Total 200: {:A=>51, :C=>50, :G=>52, :T=>47} 0 TAAGCTGAGGTGTGTGATGGAGTTCGGTGGCTAGCCACAAATACAACACA 50 CTCACCCATACTCGCCTCTGAAGCATGTTTTAATTGGATAGGGCCTACAG 100 CAGTATTCACCCTTCCTCCGCTCCTGACCTGATATAGGTCTATGTGCGGG 150 AAAATAGGACAATACTGCCGAAGTCATATCCATTGGAGGGGCCCCAAGGC Total 200: {:A=>52, :C=>51, :G=>49, :T=>48} 0 TAAGCTGAGGTGTGTGATGGAGTTCGGTGGCTAGCCACAAATACAACACA 50 CTCACCCATACTCGCTCTGAAGCATGTTTTAATTGGATAGGGCCTACAGC 100 AGTATTCACCCTTCCTCCGCTCCTGACCTGATATAGGTCTATGTGCGGGA 150 AAATAGGACAATACTGCCGAAGTCATATCCATTGGAGGGGCCCCAAGGC Total 199: {:A=>52, :C=>50, :G=>49, :T=>48}
Rust
use rand::prelude::*;
use std::collections::HashMap;
use std::fmt::{Display, Formatter, Error};
pub struct Seq<'a> {
alphabet: Vec<&'a str>,
distr: rand::distributions::Uniform<usize>,
pos_distr: rand::distributions::Uniform<usize>,
seq: Vec<&'a str>,
}
impl Display for Seq<'_> {
fn fmt(&self, f: &mut Formatter) -> Result<(), Error> {
let pretty: String = self.seq
.iter()
.enumerate()
.map(|(i, nt)| if (i + 1) % 60 == 0 { format!("{}\n", nt) } else { nt.to_string() })
.collect();
let counts_hm = self.seq
.iter()
.fold(HashMap::<&str, usize>::new(), |mut m, nt| {
*m.entry(nt).or_default() += 1;
m
});
let mut counts_vec: Vec<(&str, usize)> = counts_hm.into_iter().collect();
counts_vec.sort_by(|a, b| a.0.cmp(&b.0));
let counts_string = counts_vec
.iter()
.fold(String::new(), |mut counts_string, (nt, count)| {
counts_string += &format!("{} = {}\n", nt, count);
counts_string
});
write!(f, "Seq:\n{}\n\nLength: {}\n\nCounts:\n{}", pretty, self.seq.len(), counts_string)
}
}
impl Seq<'_> {
pub fn new(alphabet: Vec<&str>, len: usize) -> Seq {
let distr = rand::distributions::Uniform::new_inclusive(0, alphabet.len() - 1);
let pos_distr = rand::distributions::Uniform::new_inclusive(0, len - 1);
let seq: Vec<&str> = (0..len)
.map(|_| {
alphabet[thread_rng().sample(distr)]
})
.collect();
Seq { alphabet, distr, pos_distr, seq }
}
pub fn insert(&mut self) {
let pos = thread_rng().sample(self.pos_distr);
let nt = self.alphabet[thread_rng().sample(self.distr)];
println!("Inserting {} at position {}", nt, pos);
self.seq.insert(pos, nt);
}
pub fn delete(&mut self) {
let pos = thread_rng().sample(self.pos_distr);
println!("Deleting {} at position {}", self.seq[pos], pos);
self.seq.remove(pos);
}
pub fn swap(&mut self) {
let pos = thread_rng().sample(self.pos_distr);
let cur_nt = self.seq[pos];
let new_nt = self.alphabet[thread_rng().sample(self.distr)];
println!("Replacing {} at position {} with {}", cur_nt, pos, new_nt);
self.seq[pos] = new_nt;
}
}
fn main() {
let mut seq = Seq::new(vec!["A", "C", "T", "G"], 200);
println!("Initial sequnce:\n{}", seq);
let mut_distr = rand::distributions::Uniform::new_inclusive(0, 2);
for _ in 0..10 {
let mutation = thread_rng().sample(mut_distr);
if mutation == 0 {
seq.insert()
} else if mutation == 1 {
seq.delete()
} else {
seq.swap()
}
}
println!("\nMutated sequence:\n{}", seq);
}
- Output:
Initial sequnce: Seq: TAAGTTTAGTCTGTTTACGAGATCTAGAGGAGGACACCGTGTAGAGGGGATTTGTCAGGA CACATGCATGGCACCCTAGTCAAATAGTGCCGAGAACAGGCTCTCCTGAGAAAGTTAGGT CTGCCGAAGTGACGAAGTGCACGTTATAGCTCTATTAAGTATGTTCGTTAACAGGTATTA ATGCTCTTAGCCAAGACCGT Length: 200 Counts: A = 56 C = 38 G = 53 T = 53 Deleting C at position 197 Inserting T at position 157 Replacing C at position 149 with G Replacing A at position 171 with G Replacing T at position 182 with G Deleting C at position 124 Inserting T at position 128 Replacing G at position 175 with C Deleting A at position 35 Replacing A at position 193 with G Mutated sequence: Seq: TAAGTTTAGTCTGTTTACGAGATCTAGAGGAGGACCCGTGTAGAGGGGATTTGTCAGGAC ACATGCATGGCACCCTAGTCAAATAGTGCCGAGAACAGGCTCTCCTGAGAAAGTTAGGTC TGCGAAGTTGACGAAGTGCACGTTATAGGTCTATTATAGTATGTTCGTTAGCAGCTATTA AGGCTCTTAGCCAGGACGT Length: 199 Counts: A = 53 C = 36 G = 56 T = 54
Swift
let bases: [Character] = ["A", "C", "G", "T"]
enum Action: CaseIterable {
case swap, delete, insert
}
@discardableResult
func mutate(dna: inout String) -> Action {
guard let i = dna.indices.shuffled().first(where: { $0 != dna.endIndex }) else {
fatalError()
}
let action = Action.allCases.randomElement()!
switch action {
case .swap:
dna.replaceSubrange(i..<i, with: [bases.randomElement()!])
case .delete:
dna.remove(at: i)
case .insert:
dna.insert(bases.randomElement()!, at: i)
}
return action
}
var d = ""
for _ in 0..<200 {
d.append(bases.randomElement()!)
}
func printSeq(_ dna: String) {
for startI in stride(from: 0, to: dna.count, by: 50) {
print("\(startI): \(dna.dropFirst(startI).prefix(50))")
}
print()
print("Size: \(dna.count)")
print()
let counts = dna.reduce(into: [:], { $0[$1, default: 0] += 1 })
for (char, count) in counts.sorted(by: { $0.key < $1.key }) {
print("\(char): \(count)")
}
}
printSeq(d)
print()
for _ in 0..<20 {
mutate(dna: &d)
}
printSeq(d)
- Output:
0: CCCTGTTACCCTTAAGTCACAAATCATGATAAGCAGCCTTCGAGCACTTC 50: GTGTCAAGCCTGATTCGAGCGCGCCGGTCATCCTCCGATAGAGCACGGGG 100: ACGCCCGCACTACCCCACTGGCGCTTGGTCGCTGAATAGGGCGCCCTTGG 150: TGGTGGATGGTCTTAAGCTGTCGCAAATCTAGCCCCGACCAAGAGAAGGC Size: 200 A: 43 C: 62 G: 54 T: 41 0: CCCTGTTACCCTTAAGTCACAAATCATGTATAAAGCAGCCTTCGAGCACT 50: TCGTGTCAAGCCTGATTCGAGCGCGCTAGGGCATCCTCCGTATAAGAGCA 100: CCGGGGACGCCCGCACTTACCACCACTGGCGCTTGGTCGCGAATAGGGGC 150: GCCCTTTGGTGGTGGATTGGTCTTAAGTGTCGCAAATCTAGCCCCCGACC 200: AAGAGAAGGC Size: 210 A: 47 C: 62 G: 56 T: 45
V (Vlang)
import rand
import rand.seed
const bases = "ACGT"
// 'w' contains the weights out of 300 for each
// of swap, delete or insert in that order.
fn mutate(dna string, w [3]int) string {
le := dna.len
// get a random position in the dna to mutate
p := rand.intn(le) or {0}
// get a random number between 0 and 299 inclusive
r := rand.intn(300) or {0}
mut bytes := dna.bytes()
match true {
r < w[0] { // swap
base := bases[rand.intn(4) or {0}]
println(" Change @${p:3} ${[bytes[p]].bytestr()} to ${[base].bytestr()}")
bytes[p] = base
}
r < w[0]+w[1] { // delete
println(" Delete @${p:3} ${bytes[p]}")
bytes.delete(p)
//copy(bytes[p:], bytes[p+1:])
bytes = bytes[0..le-1]
}
else { // insert
base := bases[rand.intn(4) or {0}]
bytes << 0
bytes.insert(p,bytes[p])
//copy(bytes[p+1:], bytes[p:])
println(" Insert @${p:3} $base")
bytes[p] = base
}
}
return bytes.bytestr()
}
// Generate a random dna sequence of given length.
fn generate(le int) string {
mut bytes := []u8{len:le}
for i := 0; i < le; i++ {
bytes[i] = bases[rand.intn(4) or {0}]
}
return bytes.bytestr()
}
// Pretty print dna and stats.
fn pretty_print(dna string, rowLen int) {
println("SEQUENCE:")
le := dna.len
for i := 0; i < le; i += rowLen {
mut k := i + rowLen
if k > le {
k = le
}
println("${i:5}: ${dna[i..k]}")
}
mut base_map := map[byte]int{} // allows for 'any' base
for i in 0..le {
base_map[dna[i]]++
}
mut bb := base_map.keys()
bb.sort()
println("\nBASE COUNT:")
for base in bb {
println(" $base: ${base_map[base]:3}")
}
println(" ------")
println(" Σ: $le")
println(" ======\n")
}
// Express weights as a string.
fn wstring(w [3]int) string {
return " Change: ${w[0]}\n Delete: ${w[1]}\n Insert: ${w[2]}\n"
}
fn main() {
rand.seed(seed.time_seed_array(2))
mut dna := generate(250)
pretty_print(dna, 50)
muts := 10
w := [100, 100, 100]! // use e.g. {0, 300, 0} to choose only deletions
println("WEIGHTS (ex 300):\n${wstring(w)}")
println("MUTATIONS ($muts):")
for _ in 0..muts {
dna = mutate(dna, w)
}
println('')
pretty_print(dna, 50)
}
- Output:
Sample run:
SEQUENCE: 0: CATTGGATTGCTAGTCGTTCAATAGCGAACGAACAGTTTGCATGAATCAG 50: AGAGAGCCTGAAACCTTGGTTGGTATCGACACAACCTCATAATTCACATT 100: CACAAACTTATTTTCGGATCCGCGAAAACGCAAGCGCATTAAGAGACACC 150: CCCAGAGACTCAATTCCGGATTTGCGCTGCTATATACCCACATTGATGAT 200: ATAGGGCTTAGAACGGCCTTAGCCCCGTCGGCTAGTTTCTGAAGTCTCTT BASE COUNT: A: 71 C: 62 G: 52 T: 65 ------ Σ: 250 ====== WEIGHTS (ex 300): Change: 100 Delete: 100 Insert: 100 MUTATIONS (10): Delete @166 "C" Change @185 "C" to "G" Insert @230 "T" Insert @230 "G" Insert @226 "C" Change @162 "A" to "C" Change @236 "G" to "C" Insert @ 25 "C" Delete @ 75 "A" Change @104 "A" to "T" SEQUENCE: 0: CATTGGATTGCTAGTCGTTCAATAGCCGAACGAACAGTTTGCATGAATCA 50: GAGAGAGCCTGAAACCTTGGTTGGTTCGACACAACCTCATAATTCACATT 100: CACATACTTATTTTCGGATCCGCGAAAACGCAAGCGCATTAAGAGACACC 150: CCCAGAGACTCACTTCGGATTTGCGCTGCTATATAGCCACATTGATGATA 200: TAGGGCTTAGAACGGCCTTAGCCCCGCTCGGGTCTACTTTCTGAAGTCTC 250: TT BASE COUNT: A: 68 C: 64 G: 53 T: 67 ------ Σ: 252 ======
Wren
import "random" for Random
import "./fmt" for Fmt
import "./sort" for Sort
var rand = Random.new()
var bases = "ACGT"
// 'w' contains the weights out of 300 for each
// of swap, delete or insert in that order.
var mutate = Fn.new { |dna, w|
var le = dna.count
// get a random position in the dna to mutate
var p = rand.int(le)
// get a random number between 0 and 299 inclusive
var r = rand.int(300)
var chars = dna.toList
if (r < w[0]) { // swap
var base = bases[rand.int(4)]
Fmt.print(" Change @$3d $q to $q", p, chars[p], base)
chars[p] = base
} else if (r < w[0] + w[1]) { // delete
Fmt.print(" Delete @$3d $q", p, chars[p])
chars.removeAt(p)
} else { // insert
var base = bases[rand.int(4)]
Fmt.print(" Insert @$3d $q", p, base)
chars.insert(p, base)
}
return chars.join()
}
// Generate a random dna sequence of given length.
var generate = Fn.new { |le|
var chars = [""] * le
for (i in 0...le) chars[i] = bases[rand.int(4)]
return chars.join()
}
// Pretty print dna and stats.
var prettyPrint = Fn.new { |dna, rowLen|
System.print("SEQUENCE:")
var le = dna.count
var i = 0
while (i < le) {
var k = i + rowLen
if (k > le) k = le
Fmt.print("$5d: $s", i, dna[i...k])
i = i + rowLen
}
var baseMap = {}
for (i in 0...le) {
var v = baseMap[dna[i]]
baseMap[dna[i]] = (v) ? v + 1 : 1
}
var bases = []
for (k in baseMap.keys) bases.add(k)
Sort.insertion(bases) // get bases into alphabetic order
System.print("\nBASE COUNT:")
for (base in bases) Fmt.print(" $s: $3d", base, baseMap[base])
System.print(" ------")
System.print(" Σ: %(le)")
System.print(" ======\n")
}
// Express weights as a string.
var wstring = Fn.new { |w|
return Fmt.swrite(" Change: $d\n Delete: $d\n Insert: $d\n", w[0], w[1], w[2])
}
var dna = generate.call(250)
prettyPrint.call(dna, 50)
var muts = 10
var w = [100, 100, 100] // use e.g. {0, 300, 0} to choose only deletions
Fmt.print("WEIGHTS (ex 300):\n$s", wstring.call(w))
Fmt.print("MUTATIONS ($d):", muts)
for (i in 0...muts) dna = mutate.call(dna, w)
System.print()
prettyPrint.call(dna, 50)
- Output:
Sample run:
SEQUENCE: 0: CATTGGATTGCTAGTCGTTCAATAGCGAACGAACAGTTTGCATGAATCAG 50: AGAGAGCCTGAAACCTTGGTTGGTATCGACACAACCTCATAATTCACATT 100: CACAAACTTATTTTCGGATCCGCGAAAACGCAAGCGCATTAAGAGACACC 150: CCCAGAGACTCAATTCCGGATTTGCGCTGCTATATACCCACATTGATGAT 200: ATAGGGCTTAGAACGGCCTTAGCCCCGTCGGCTAGTTTCTGAAGTCTCTT BASE COUNT: A: 71 C: 62 G: 52 T: 65 ------ Σ: 250 ====== WEIGHTS (ex 300): Change: 100 Delete: 100 Insert: 100 MUTATIONS (10): Delete @166 "C" Change @185 "C" to "G" Insert @230 "T" Insert @230 "G" Insert @226 "C" Change @162 "A" to "C" Change @236 "G" to "C" Insert @ 25 "C" Delete @ 75 "A" Change @104 "A" to "T" SEQUENCE: 0: CATTGGATTGCTAGTCGTTCAATAGCCGAACGAACAGTTTGCATGAATCA 50: GAGAGAGCCTGAAACCTTGGTTGGTTCGACACAACCTCATAATTCACATT 100: CACATACTTATTTTCGGATCCGCGAAAACGCAAGCGCATTAAGAGACACC 150: CCCAGAGACTCACTTCGGATTTGCGCTGCTATATAGCCACATTGATGATA 200: TAGGGCTTAGAACGGCCTTAGCCCCGCTCGGGTCTACTTTCTGAAGTCTC 250: TT BASE COUNT: A: 68 C: 64 G: 53 T: 67 ------ Σ: 252 ======
Yabasic
// Rosetta Code problem: http://rosettacode.org/wiki/Sequence_mutation
// by Galileo, 07/2022
r = int(ran(300))
for i = 1 to 200 + r : dna$ = dna$ + mid$("ACGT", int(ran(4))+1, 1) : next
sub show()
local acgt(4), i, j, x, total
for i = 1 to len(dna$)
x = instr("ACGT", mid$(dna$, i, 1))
acgt(x) = acgt(x) + 1
next
for i = 1 to 4 : total = total + acgt(i) : next
for i = 1 to len(dna$) step 50
print i, ":\t";
for j = 0 to 49 step 10
print mid$(dna$, i+j, 10), " ";
next
print
next
print "\nBase counts: A: ", acgt(1), ", C: ", acgt(2), ", G: ", acgt(3), ", T: ", acgt(4), ", total: ", total
end sub
sub mutate()
local i, p, sdi$, rep$, was$
print
for i = 1 to 10
p = int(ran(len(dna$))) + 1
sdi$ = mid$("SDI", int(ran(3)) + 1, 1)
rep$ = mid$("ACGT", int(ran(4)) + 1, 1)
was$ = mid$(dna$, p, 1)
switch sdi$
case "S": mid$(dna$, p, 1) = rep$
print "swapped ", was$, " at ", p, " for ", rep$ : break
case "D": dna$ = left$(dna$, p - 1) + right$(dna$, len(dna$) - p)
print "deleted ", was$, " at ", p : break
case "I": dna$ = left$(dna$, p - 1) + rep$ + right$(dna$, (len(dna$) - p + 1))
print "inserted ", rep$, " at ", p, ", before ", was$ : break
end switch
next
print
end sub
show()
mutate()
show()
- Output:
1: TCCATCGTGG GATCGCTCTA GCGGTATGCT ATCATTCCTA TAGCAATTCT 51: CAGGGGGCCC GACGGCGCCG ATCACATGTG ATCCTTGTGT GATCGCTTCA 101: TGTCATGGCT TTCTAGACCT TGGATAAGCA TGTACGGTTG GACCAGTCGT 151: GCGTCGGTAA ACAACGCATC TGTGTTATAT CCGTCGAATA ACCCATATGT 201: CTCCAGTCTA ATCCCCTAAG CAACTGCTCA AGGTAAAATG CAAATACAGG 251: TGAGGAGTCC TCGAAGGGGT CGCACCGCAA TATGGGCGTC CCTTATTGGC 301: CCTCATCAGT AT Base counts: A: 72, C: 81, G: 76, T: 83, total: 312 inserted G at 89, before G swapped T at 174 for C deleted A at 31 deleted G at 89 deleted C at 275 inserted A at 278, before A inserted C at 200, before C inserted C at 232, before G deleted G at 10 swapped A at 124 for C 1: TCCATCGTGG ATCGCTCTAG CGGTATGCTT CATTCCTATA GCAATTCTCA 51: GGGGGCCCGA CGGCGCCGAT CACATGTGAT CCTTGTGTGA TCGCTTCATG 101: TCATGGCTTT CTAGACCTTG GATCAGCATG TACGGTTGGA CCAGTCGTGC 151: GTCGGTAAAC AACGCATCTG CGTTATATCC GTCGAATAAC CCATATGTCC 201: TCCAGTCTAA TCCCCTAAGC AACTGCTCAA CGGTAAAATG CAAATACAGG 251: TGAGGAGTCC TCGAAGGGGT CGCACGCAAA TATGGGCGTC CCTTATTGGC 301: CCTCATCAGT AT Base counts: A: 71, C: 84, G: 75, T: 82, total: 312 ---Program done, press RETURN---
zkl
var [const] bases="ACGT", lbases=bases.toLower();
dna:=(190).pump(Data().howza(3),(0).random.fp(0,4),bases.get); // bucket of bytes
foreach s,m in (T("Original","Mutated").zip(T(True,False))){
println("\n",s," DNA strand:"); dnaPP(dna);
println("Base Counts: ", dna.len()," : ",
dna.text.toUpper().counts() // ("A",5, "C",10, ...)
.pump(String,Void.Read,"%s(%d) ".fmt));
if(m) mutate(dna,10,True);
}
fcn mutate(dna,count=1,verbose=False){
if(verbose) println("Mutating:");
do(count){
n,rb := (0).random(dna.len()), lbases[(0).random(4)];
switch( (0).random(3) ){
case(0){ if(verbose) println("Change[%d] '%s' to '%s'".fmt(n,dna.charAt(n),rb));
dna[n]=rb;
}
case(1){ if(verbose) println("Delete[%d] '%s'".fmt(n,dna.charAt(n)));
dna.del(n);
}
else{ if(verbose) println("Insert[%d] '%s'".fmt(n,rb));
dna.insert(n,rb);
}
}
}
}
fcn dnaPP(dna,N=50){
[0..*,50].zipWith(fcn(n,bases){ println("%6d: %s".fmt(n,bases.concat())) },
dna.walker().walk.fp(50)).pump(Void); // .pump forces the iterator
}
- Output:
Original DNA strand: 0: AACGACAGGTTCTCGATGCGTGTCTTCACACATGTGGAGTCGCCAAGGAT 50: TGTTGATCAATGCGTAAACGTCTCCACGGGATACACGGGCAGCTTGCGGT 100: GACGAGTGCGGACCACCAAAAAAGGTGGGATCCACGTTGAGGAGCCTCAC 150: TACCTACGGCGTGATATGGCGGCAGGAGTCAAAAACTGCT Base Counts: 190 : A(49) C(46) G(57) T(38) Mutating: Insert[123] 'c' Insert[69] 't' Delete[5] 'C' Delete[147] 'T' Change[167] 'G' to 't' Change[153] 'C' to 'c' Insert[151] 't' Insert[156] 't' Delete[150] 'T' Change[102] 'C' to 'g' Mutated DNA strand: 0: AACGAAGGTTCTCGATGCGTGTCTTCACACATGTGGAGTCGCCAAGGATT 50: GTTGATCAATGCGTAAACtGTCTCCACGGGATACACGGGCAGCTTGCGGT 100: GAgGAGTGCGGACCACCAAAAAAcGGTGGGATCCACGTTGAGGAGCCCAC 150: tACcTtACGGCGTGATATtGCGGCAGGAGTCAAAAACTGCT Base Counts: 191 : A(49) C(45) G(57) T(40)