Bioinformatics/Sequence mutation

From Rosetta Code
Bioinformatics/Sequence mutation is a draft programming task. It is not yet considered ready to be promoted as a complete task, for reasons that should be found in its talk page.
Task

Given a string of characters A, C, G, and T representing a DNA sequence write a routine to mutate the sequence, (string) by:

  1. Choosing a random base position in the sequence.
  2. Mutate the sequence by doing one of either:
    1. Swap the base at that position by changing it to one of A, C, G, or T. (which has a chance of swapping the base for the same base)
    2. Delete the chosen base at the position.
    3. Insert another base randomly chosen from A,C, G, or T into the sequence at that position.
  3. Randomly generate a test DNA sequence of at least 200 bases
  4. "Pretty print" the sequence and a count of its size, and the count of each base in the sequence
  5. Mutate the sequence ten times.
  6. "Pretty print" the sequence after all mutations, and a count of its size, and the count of each base in the sequence.
Extra credit
  • Give more information on the individual mutations applied.
  • Allow mutations to be weighted and/or chosen.

C[edit]

Adenine ( A ) is always swapped for Thymine ( T ) and vice versa. Similarly with Cytosine ( C ) and Guanine ( G ).

 
#include<stdlib.h>
#include<stdio.h>
#include<time.h>
 
typedef struct genome{
char base;
struct genome *next;
}genome;
 
typedef struct{
char mutation;
int position;
}genomeChange;
 
typedef struct{
int adenineCount,thymineCount,cytosineCount,guanineCount;
}baseCounts;
 
genome *strand;
baseCounts baseData;
int genomeLength = 100, lineLength = 50;
 
int numDigits(int num){
int len = 1;
 
while(num>10){
num /= 10;
len++;
}
return len;
}
 
void generateStrand(){
 
int baseChoice = rand()%4, i;
genome *strandIterator, *newStrand;
 
baseData.adenineCount = 0;
baseData.thymineCount = 0;
baseData.cytosineCount = 0;
baseData.guanineCount = 0;
 
strand = (genome*)malloc(sizeof(genome));
strand->base = baseChoice==0?'A':(baseChoice==1?'T':(baseChoice==2?'C':'G'));
baseChoice==0?baseData.adenineCount++:(baseChoice==1?baseData.thymineCount++:(baseChoice==2?baseData.cytosineCount++:baseData.guanineCount++));
strand->next = NULL;
 
strandIterator = strand;
 
for(i=1;i<genomeLength;i++){
baseChoice = rand()%4;
 
newStrand = (genome*)malloc(sizeof(genome));
newStrand->base = baseChoice==0?'A':(baseChoice==1?'T':(baseChoice==2?'C':'G'));
baseChoice==0?baseData.adenineCount++:(baseChoice==1?baseData.thymineCount++:(baseChoice==2?baseData.cytosineCount++:baseData.guanineCount++));
newStrand->next = NULL;
 
strandIterator->next = newStrand;
strandIterator = newStrand;
}
}
 
genomeChange generateMutation(int swapWeight, int insertionWeight, int deletionWeight){
int mutationChoice = rand()%(swapWeight + insertionWeight + deletionWeight);
 
genomeChange mutationCommand;
 
mutationCommand.mutation = mutationChoice<swapWeight?'S':((mutationChoice>=swapWeight && mutationChoice<swapWeight+insertionWeight)?'I':'D');
mutationCommand.position = rand()%genomeLength;
 
return mutationCommand;
}
 
void printGenome(){
int rows, width = numDigits(genomeLength), len = 0,i,j;
lineLength = (genomeLength<lineLength)?genomeLength:lineLength;
 
rows = genomeLength/lineLength + (genomeLength%lineLength!=0);
 
genome* strandIterator = strand;
 
printf("\n\nGenome : \n--------\n");
 
for(i=0;i<rows;i++){
printf("\n%*d%3s",width,len,":");
 
for(j=0;j<lineLength && strandIterator!=NULL;j++){
printf("%c",strandIterator->base);
strandIterator = strandIterator->next;
}
len += lineLength;
}
 
while(strandIterator!=NULL){
printf("%c",strandIterator->base);
strandIterator = strandIterator->next;
}
 
printf("\n\nBase Counts\n-----------");
 
printf("\n%*c%3s%*d",width,'A',":",width,baseData.adenineCount);
printf("\n%*c%3s%*d",width,'T',":",width,baseData.thymineCount);
printf("\n%*c%3s%*d",width,'C',":",width,baseData.cytosineCount);
printf("\n%*c%3s%*d",width,'G',":",width,baseData.guanineCount);
 
printf("\n\nTotal:%*d",width,baseData.adenineCount + baseData.thymineCount + baseData.cytosineCount + baseData.guanineCount);
 
printf("\n");
}
 
void mutateStrand(int numMutations, int swapWeight, int insertionWeight, int deletionWeight){
int i,j,width,baseChoice;
genomeChange newMutation;
genome *strandIterator, *strandFollower, *newStrand;
 
for(i=0;i<numMutations;i++){
strandIterator = strand;
strandFollower = strand;
newMutation = generateMutation(swapWeight,insertionWeight,deletionWeight);
width = numDigits(genomeLength);
 
for(j=0;j<newMutation.position;j++){
strandFollower = strandIterator;
strandIterator = strandIterator->next;
}
 
if(newMutation.mutation=='S'){
if(strandIterator->base=='A'){
strandIterator->base='T';
printf("\nSwapping A at position : %*d with T",width,newMutation.position);
}
else if(strandIterator->base=='A'){
strandIterator->base='T';
printf("\nSwapping A at position : %*d with T",width,newMutation.position);
}
else if(strandIterator->base=='C'){
strandIterator->base='G';
printf("\nSwapping C at position : %*d with G",width,newMutation.position);
}
else{
strandIterator->base='C';
printf("\nSwapping G at position : %*d with C",width,newMutation.position);
}
}
 
else if(newMutation.mutation=='I'){
baseChoice = rand()%4;
 
newStrand = (genome*)malloc(sizeof(genome));
newStrand->base = baseChoice==0?'A':(baseChoice==1?'T':(baseChoice==2?'C':'G'));
printf("\nInserting %c at position : %*d",newStrand->base,width,newMutation.position);
baseChoice==0?baseData.adenineCount++:(baseChoice==1?baseData.thymineCount++:(baseChoice==2?baseData.cytosineCount++:baseData.guanineCount++));
newStrand->next = strandIterator;
strandFollower->next = newStrand;
genomeLength++;
}
 
else{
strandFollower->next = strandIterator->next;
strandIterator->next = NULL;
printf("\nDeleting %c at position : %*d",strandIterator->base,width,newMutation.position);
free(strandIterator);
genomeLength--;
}
}
}
 
int main(int argc,char* argv[])
{
int numMutations = 10, swapWeight = 10, insertWeight = 10, deleteWeight = 10;
 
if(argc==1||argc>6){
printf("Usage : %s <Genome Length> <Optional number of mutations> <Optional Swapping weight> <Optional Insertion weight> <Optional Deletion weight>\n",argv[0]);
return 0;
}
 
switch(argc){
case 2: genomeLength = atoi(argv[1]);
break;
case 3: genomeLength = atoi(argv[1]);
numMutations = atoi(argv[2]);
break;
case 4: genomeLength = atoi(argv[1]);
numMutations = atoi(argv[2]);
swapWeight = atoi(argv[3]);
break;
case 5: genomeLength = atoi(argv[1]);
numMutations = atoi(argv[2]);
swapWeight = atoi(argv[3]);
insertWeight = atoi(argv[4]);
break;
case 6: genomeLength = atoi(argv[1]);
numMutations = atoi(argv[2]);
swapWeight = atoi(argv[3]);
insertWeight = atoi(argv[4]);
deleteWeight = atoi(argv[5]);
break;
};
 
srand(time(NULL));
generateStrand();
 
printf("\nOriginal:");
printGenome();
mutateStrand(numMutations,swapWeight,insertWeight,deleteWeight);
 
printf("\n\nMutated:");
printGenome();
 
return 0;
}
 

Sample run :

C:\My Projects\networks>a 500 30 15 10 5

Original:

Genome :
--------

  0  :CGATGAGTTTCCTCCAAGGAGCAGGGCGTGACGGAAGGGAGGCTTAGGTC
 50  :CGCATGCTCGTCGGCAGCCGGCTGGTGCCGTCGTAACCTTCACATTATTC
100  :TAGAATTTCGATGCACCTGATGACTCATACCCAGATGTAGGGGTACGCGA
150  :TGCAGATGCGGGCACGAGGAATTGTGGGCAAGCCGGCAGGTCTTTTGTAA
200  :GTTGTCACTAACTAAATAGAGGGATGGATGTTATAGCACACTACTGTCGA
250  :TTACGGACAGCGTCCCGATTCGTCATACGACCAGGATATATACTCGACGT
300  :CCAACAGGAGATTCACGTAGTGAACGCAGTTGACAGCCTGCTCGTATCTC
350  :CAGGGGTGGACTGCACCGTTCGTTAACTGCTGCCACATTAAACAGCTTCC
400  :CACTCCTTGACGCCAGACTCGGTACCACAGACCGTCAAGCTCCTATTTCC
450  :TTTGCAGTTAAAAAACACTATGGTGAAGGTCGGAGAGATGACCTCATCTA

Base Counts
-----------
  A  :124
  T  :118
  C  :126
  G  :132

Total:500

Inserting G at position : 205
Inserting G at position : 144
Inserting C at position : 171
Swapping A at position : 335 with T
Inserting A at position : 101
Swapping C at position : 109 with G
Swapping A at position : 306 with T
Inserting G at position :  51
Swapping G at position :   1 with C
Deleting G at position :  60
Swapping G at position :  66 with C
Inserting C at position :  41
Inserting C at position : 425
Swapping C at position : 173 with G
Inserting A at position : 319
Swapping G at position : 460 with C
Deleting T at position :  61
Swapping C at position : 160 with G
Inserting C at position : 251
Swapping G at position : 337 with C
Inserting G at position :  43
Inserting T at position : 146
Inserting T at position : 181
Deleting G at position :  53
Deleting A at position : 464
Swapping G at position : 362 with C
Swapping G at position : 190 with C
Swapping C at position : 280 with G
Inserting T at position : 479
Deleting C at position : 400

Mutated:

Genome :
--------

  0  :CCATGAGTTTCCTCCAAGGAGCAGGGCGTGACGGAAGGGAGCGGCTTAGG
 50  :TCCGCATGCTCCGGCACCCGGCTGGTGCCGTCGTAACCTTCACATTATTC
100  :TAAGAATTTGGATGCACCTGATGACTCATACCCAGATGTAGGGGTTGACG
150  :CGATGCAGATGGGGGCACGAGGAGATTGTGTGGCAAGCCGCCAGGTCTTT
200  :TGTAAGTTGTGCACTAACTAAATAGAGGGATGGATGTTATAGCACACTAC
250  :TGTCCGATTACGGACAGCGTCCCGATTCGTGATACGACCAGGATATATAC
300  :TCGACGTCCTACAGGAGATTCAACGTAGTGAACGCAGTTCTCAGCCTGCT
350  :CGTATCTCCAGGCGTGGACTGCACCGTTCGTTAACTGCTGCCACATTAAA
400  :AGCTTCCCACTCCTTGACGCCAGACTCCGGTACCACAGACCGTCAAGCTC
450  :CTATTTCCTTTCCGTTAAAAAACACTATTGGTGAAGGTCGGAGAGATGAC
500  :CTCATCTA

Base Counts
-----------
  A  :126
  T  :121
  C  :130
  G  :136

Total:513

Go[edit]

package main
 
import (
"fmt"
"math/rand"
"sort"
"time"
)
 
const bases = "ACGT"
 
// 'w' contains the weights out of 300 for each
// of swap, delete or insert in that order.
func mutate(dna string, w [3]int) string {
le := len(dna)
// get a random position in the dna to mutate
p := rand.Intn(le)
// get a random number between 0 and 299 inclusive
r := rand.Intn(300)
bytes := []byte(dna)
switch {
case r < w[0]: // swap
base := bases[rand.Intn(4)]
fmt.Printf(" Change @%3d %q to %q\n", p, bytes[p], base)
bytes[p] = base
case r < w[0]+w[1]: // delete
fmt.Printf(" Delete @%3d %q\n", p, bytes[p])
copy(bytes[p:], bytes[p+1:])
bytes = bytes[0 : le-1]
default: // insert
base := bases[rand.Intn(4)]
bytes = append(bytes, 0)
copy(bytes[p+1:], bytes[p:])
fmt.Printf(" Insert @%3d %q\n", p, base)
bytes[p] = base
}
return string(bytes)
}
 
// Generate a random dna sequence of given length.
func generate(le int) string {
bytes := make([]byte, le)
for i := 0; i < le; i++ {
bytes[i] = bases[rand.Intn(4)]
}
return string(bytes)
}
 
// Pretty print dna and stats.
func prettyPrint(dna string, rowLen int) {
fmt.Println("SEQUENCE:")
le := len(dna)
for i := 0; i < le; i += rowLen {
k := i + rowLen
if k > le {
k = le
}
fmt.Printf("%5d: %s\n", i, dna[i:k])
}
baseMap := make(map[byte]int) // allows for 'any' base
for i := 0; i < le; i++ {
baseMap[dna[i]]++
}
var bases []byte
for k := range baseMap {
bases = append(bases, k)
}
sort.Slice(bases, func(i, j int) bool { // get bases into alphabetic order
return bases[i] < bases[j]
})
 
fmt.Println("\nBASE COUNT:")
for _, base := range bases {
fmt.Printf("  %c: %3d\n", base, baseMap[base])
}
fmt.Println(" ------")
fmt.Println(" Σ:", le)
fmt.Println(" ======\n")
}
 
// Express weights as a string.
func wstring(w [3]int) string {
return fmt.Sprintf(" Change: %d\n Delete: %d\n Insert: %d\n", w[0], w[1], w[2])
}
 
func main() {
rand.Seed(time.Now().UnixNano())
dna := generate(250)
prettyPrint(dna, 50)
muts := 10
w := [3]int{100, 100, 100} // use e.g. {0, 300, 0} to choose only deletions
fmt.Printf("WEIGHTS (ex 300):\n%s\n", wstring(w))
fmt.Printf("MUTATIONS (%d):\n", muts)
for i := 0; i < muts; i++ {
dna = mutate(dna, w)
}
fmt.Println()
prettyPrint(dna, 50)
}
Output:

Sample run:

SEQUENCE:
    0: AATCCAGAAGTTGCGGGAACCGTCGAATAGTGTTCATTAAGTGTCCCGCG
   50: GAGTAGCCTCGTAATATAGAATGACCGGGCTTCCCAGCTAGACTTGTCCG
  100: CCACGTTTGTGTAGGGCGCAGCGAGACTGCTCTTGATACTCGTTATGTTC
  150: CTCACCGGATTATTGAATAGAGTCGAGGGGCTGACGTGACTGAACATTGC
  200: CTCCTTTGCGACTAATCTTTCCTTCAATGAACAGGCGCTACCCGTCATCG

BASE COUNT:
    A:  56
    C:  63
    G:  64
    T:  67
    ------
    Σ: 250
    ======

WEIGHTS (ex 300):
  Change: 100
  Delete: 100
  Insert: 100

MUTATIONS (10):
  Change @195 'A' to 'C'
  Insert @ 95 'G'
  Change @137 'T' to 'C'
  Delete @207 'T'
  Insert @148 'C'
  Insert @113 'A'
  Change @ 45 'C' to 'T'
  Delete @ 93 'T'
  Insert @ 51 'C'
  Delete @248 'A'

SEQUENCE:
    0: AATCCAGAAGTTGCGGGAACCGTCGAATAGTGTTCATTAAGTGTCTCGCG
   50: GCAGTAGCCTCGTAATATAGAATGACCGGGCTTCCCAGCTAGACTGGTCC
  100: GCCACGTTTGTGTAAGGGCGCAGCGAGACTGCTCTTGACACTCGTTATGC
  150: TTCCTCACCGGATTATTGAATAGAGTCGAGGGGCTGACGTGACTGAACCT
  200: TGCCTCCTTGCGACTAATCTTTCCTTCAATGAACAGGCGCTACCCGTCTC
  250: G

BASE COUNT:
    A:  55
    C:  66
    G:  65
    T:  65
    ------
    Σ: 251
    ======

Julia[edit]

dnabases = ['A', 'C', 'G', 'T']
randpos(seq) = rand(1:length(seq)) # 1
mutateat(pos, seq) = (s = seq[:]; s[pos] = rand(dnabases); s) # 2-1
deleteat(pos, seq) = [seq[1:pos-1]; seq[pos+1:end]] # 2-2
randinsertat(pos, seq) = [seq[1:pos]; rand(dnabases); seq[pos+1:end]] # 2-3
 
function weightedmutation(seq, pos, weights=[1, 1, 1], verbose=true) # Extra credit
p, r = weights ./ sum(weights), rand()
f = (r <= p[1]) ? mutateat : (r < p[1] + p[2]) ? deleteat : randinsertat
verbose && print("Mutate by ", f == mutateat ? "swap" :
f == deleteat ? "delete" : "insert")
return f(pos, seq)
end
 
function weightedrandomsitemutation(seq, weights=[1, 1, 1], verbose=true)
position = randpos(seq)
newseq = weightedmutation(seq, position, weights, verbose)
verbose && println(" at position $position")
return newseq
end
 
randdnasequence(n) = rand(dnabases, n) # 3
 
function dnasequenceprettyprint(seq, colsize=50) # 4
println(length(seq), "nt DNA sequence:\n")
rows = [seq[i:min(length(seq), i + colsize - 1)] for i in 1:colsize:length(seq)]
for (i, r) in enumerate(rows)
println(lpad(colsize * (i - 1), 5), " ", String(r))
end
bases = [[c, 0] for c in dnabases]
for c in seq, base in bases
if c == base[1]
base[2] += 1
end
end
println("\nNucleotide counts:\n")
for base in bases
println(lpad(base[1], 10), lpad(string(base[2]), 12))
end
println(lpad("Other", 10), lpad(string(length(seq) - sum(x[2] for x in bases)), 12))
println(" _________________\n", lpad("Total", 10), lpad(string(length(seq)), 12))
end
 
function testbioseq()
sequence = randdnasequence(500)
dnasequenceprettyprint(sequence)
for _ in 1:10 # 5
sequence = weightedrandomsitemutation(sequence)
end
println("\n Mutated:"); dnasequenceprettyprint(sequence) # 6
end
 
testbioseq()
 
Output:
500nt DNA sequence:

    0   TCACGTAACAGAGGTATAGTTGATCTTGAGCGGGCTGGCTCCCGGGTTTC
   50   TAGCAAGAAAAGGGGAGGGAAGTGCGCCTGCTTTTGCCCCGGGCACCCCA
  100   ATCGAAGACAGCTCCGGGGTCGCACATTTTTATGGCCACATAATGAGGGA
  150   ATGCACGCATCACTCCTATCACTAACTGCGAACTCATGTGACTGTGCAAA
  200   ACACCTTTAACACTGCGATGCCGTGGGGACGGGCCCCCCCAGCGGTATAG
  250   CGCGCACACGCGACAGATGTTAACTCGAATGGTCGCGCCGGGGAGTGCAC
  300   CCCTTGACATATACTCCAGATGCAATGCGCTATACTTTATGAACTTGCAT
  350   AAGCTGCGCAGGGGGGATTGACTTATACTACATATTAACTACCGATATCG
  400   ACGCAAATATTCGGGCGGTCTAAAGTGTGTCAGAACGGACATGCCGCCCA
  450   GAATCACGGCTACTGAGGACAAATACGCATTCCCGGTGCTGCATTCATTC

Nucleotide counts:

         A         127
         C         133
         G         131
         T         109
     Other           0
     _________________
     Total         500
Mutate by swap at position 253
Mutate by swap at position 448
Mutate by insert at position 379
Mutate by delete at position 311
Mutate by delete at position 335
Mutate by insert at position 132
Mutate by swap at position 191
Mutate by swap at position 481
Mutate by insert at position 189
Mutate by insert at position 423

 Mutated:
502nt DNA sequence:

    0   TCACGTAACAGAGGTATAGTTGATCTTGAGCGGGCTGGCTCCCGGGTTTC
   50   TAGCAAGAAAAGGGGAGGGAAGTGCGCCTGCTTTTGCCCCGGGCACCCCA
  100   ATCGAAGACAGCTCCGGGGTCGCACATTTTTACTGGCCACATAATGAGGG
  150   AATGCACGCATCACTCCTATCACTAACTGCGAACTCATGATCACTGTGCA
  200   AAACACCTTTAACACTGCGATGCCGTGGGGACGGGCCCCCCCAGCGGTAT
  250   AGCGAGCACACGCGACAGATGTTAACTCGAATGGTCGCGCCGGGGAGTGC
  300   ACCCCTTGACATTACTCCAGATGCAATGCGCTATACTTATGAACTTGCAT
  350   AAGCTGCGCAGGGGGGATTGACTTATACTGACATATTAACTACCGATATC
  400   GACGCAAATATTCGGGCGGTCTAGAAGTGTGTCAGAACGGACATGCCGCT
  450   CAGAATCACGGCTACTGAGGACAAATACGCATTCCCGGTGCTGCATTCAT
  500   TC

Nucleotide counts:

         A         128
         C         133
         G         132
         T         109
     Other           0
     _________________
     Total         502

Perl[edit]

Translation of: Perl 6
use strict;
use warnings;
use feature 'say';
 
my @bases = <A C G T>;
 
my $dna;
$dna .= $bases[int rand 4] for 1..200;
 
my %cnt;
$cnt{$_}++ for split //, $dna;
 
sub pretty {
my($string) = @_;
my $chunk = 10;
my $wrap = 5 * ($chunk+1);
($string =~ s/(.{$chunk})/$1 /gr) =~ s/(.{$wrap})/$1\n/gr;
}
 
sub mutate {
my($dna,$count) = @_;
my $orig = $dna;
substr($dna,rand length $dna,1) = $bases[int rand 4] while $count > diff($orig, $dna) =~ tr/acgt//;
$dna
}
 
sub diff {
my($orig, $repl) = @_;
for my $i (0 .. -1+length $orig) {
substr($repl,$i,1, lc substr $repl,$i,1) if substr($orig,$i,1) ne substr($repl,$i,1);
}
$repl;
}
 
say "Original DNA strand:\n" . pretty($dna);
say "Total bases: ". length $dna;
say "$_: $cnt{$_}" for @bases;
 
my $mutate = mutate($dna, 10);
%cnt = ();
$cnt{$_}++ for split //, $mutate;
say "\nMutated DNA strand:\n" . pretty diff $dna, $mutate;
say "Total bases: ". length $mutate;
say "$_: $cnt{$_}" for @bases;
 
Output:
Original DNA strand:
TGGAACATGT CCCAACGAGT TCTTCTTGCT AGCAGATTTT TTCAGTTGAT
CGTCACATGC GGTAGACTAC CCAAGGTGTG ACTACTCGCA TGCCTGATCT
AAATGGACAG TCGGCAGGCT AGTGCTAATT ACCGGAAGTA CGAACGAGCC
ATGCTGAGCG ACTCATCATT GTGAAATCGA GCCTATCTGC ATGACCTAAT

Total bases: 200
A: 52
C: 48
G: 47
T: 53

Mutated DNA strand:
TGGAACATGT CCCAACGAGT cCTTCTTGCT AGCcGATTTT TTCAGTTGgT
gGTCACATGC aGTAGACTAC CCgAGGTGTG ACTACTCGCA TGCCTGATCT
AAATGGACAG TCGGCAGGCT AGTGCTAATT ACCGGAAGTA CGAACGAGCt
ATGCaGAGCG ACTCATCgTT GTGAAATCGA GCCTATCTGC AgGACCTAAT

Total bases: 200
A: 50
C: 48
G: 51
T: 51

Perl 6[edit]

Works with: Rakudo version 2019.07.1

Unweighted mutations at this point. The mutated DNA strand has a "diff" operation performed on it which (in this specific case) renders the mutated base in lower case so it may be picked out more easily.


my @bases = <A C G T>;
 
# The DNA strand
my $dna = @bases.roll(200).join;
 
 
# The Task
put "ORIGINAL DNA STRAND:";
put pretty $dna;
put "\nTotal bases: ", +my $bases = $dna.comb.Bag;
put $bases.sort( ~*.key ).join: "\n";
 
put "\nMUTATED DNA STRAND:";
my $mutate = $dna.&mutate(10);
put pretty diff $dna, $mutate;
put "\nTotal bases: ", +my $mutated = $mutate.comb.Bag;
put $mutated.sort( ~*.key ).join: "\n";
 
 
# Helper subs
sub pretty ($string, $wrap = 50) {
$string.comb($wrap).map( { sprintf "%8d: %s", $++ * $wrap, $_ } ).join: "\n"
}
 
sub mutate ($dna is copy, $count = 1) {
$dna.substr-rw((^$dna.chars).roll, 1) = @bases.roll for ^$count;
$dna
}
 
sub diff ($orig, $repl) {
($orig.comb Z $repl.comb).map( -> ($o, $r) { $o eq $r ?? $o !! $r.lc }).join
}
Output:
ORIGINAL DNA STRAND:
       0: ACGGATAGACCGTTCCTGCAAGCTGGTACGGTTCGAATGTTGACCTTATT
      50: CTCCGCAGCGCACTACCCGATCGGGTAACGTACTCTATATGATGCCTATT
     100: TTCCCCGCCTTACATCGGCGATCAATGTTCTTTTACGCTAACTAGGCGCA
     150: CGTCGTGCCTTACCGAGAGCCAGTTCGAAATCGTGCTGAAAATATCTGGA

Total bases: 200
A	45
C	55
G	45
T	55

MUTATED DNA STRAND:
       0: ACGGATAGcCCGTTCCTGCAAGCTGGTACGGTTCGAATGTTGACCTTATT
      50: CTCCGCAGCGCACTACCCGATCGGGTcACtcACTCTATATGAcGCCTAaT
     100: TTCCCCGCCTTACATCGGCGATCAATGTTCTTTTACGCTAACTAGGCGCA
     150: CGTCGTGCCTTACCcAGAGCCAGTTCGAAATCGTGCTGAAAATATCTGGA

Total bases: 200
A	44
C	60
G	43
T	53

Phix[edit]

string dna = repeat(' ',200+rand(300))
for i=1 to length(dna) do dna[i] = "ACGT"[rand(4)] end for
 
procedure show()
sequence acgt = repeat(0,5)
for i=1 to length(dna) do
acgt[find(dna[i],"ACGT")] += 1
end for
acgt[$] = sum(acgt)
sequence s = split(trim(join_by(split(join_by(dna,1,10,""),"\n"),1,5," ")),"\n\n")
for i=1 to length(s) do
printf(1,"%3d: %s\n",{(i-1)*50+1,s[i]})
end for
printf(1,"\nBase counts: A:%d, C:%d, G:%d, T:%d, total:%d\n",acgt)
end procedure
 
procedure mutate()
printf(1,"\n")
for i=1 to 10 do
integer p = rand(length(dna)),
sdi = "SDI"[rand(3)],
rep = "ACGT"[rand(4)],
was = dna[p]
switch sdi do
case 'S':dna[p] = rep printf(1,"swapped %c at %d for %c\n",{was,p,rep})
case 'D':dna[p..p] = "" printf(1,"deleted %c at %d\n",{was,p})
case 'I':dna[p..p-1] = ""&rep printf(1,"inserted %c at %d, before %c\n",{rep,p,was})
end switch
end for
printf(1,"\n")
end procedure
 
show()
mutate()
show()
Output:
  1: ATAGACCGAT GTGTAGGTCT CGAACATCCC TGGGGTAGCT CAGCTTGGGG
 51: GTTGACCTGT CTTGCTCCCA TGAACTGAGG GATTTGGAAA TAACGCTTAT
101: AACTGCGGGG GATTGATATG GGACATTGTT GCTGTAGGGC TTCGGCGTGC
151: TTAGAAACAA GAGAACACCA ATTTCGATAG ACCAGGTTTC GTCCCGCTAC
201: GAGTGATAGT AGCGCGTTAG GATTAATAAT CAGGGAGAGC ATTAAACATT
251: CTAAAAACTG ACATTCCCGA GGTGGAACCC GAGTTGATAA CGAGTATGCT
301: CTGAAAAATT AATTGATTGA TCCGCGACAC TATCACACCG TCTTCGCCGT
351: TGTAATGCAT GCTGGCTTAG CATCCGGATG CTCTTCTACC GATCTTAAGG
401: CGCGCACTCC CCAAGGAGCA TAGAAGCATC CCCGGCCCTC GACAGAGTCT
451: CCCAGTGTAA GTGTCTTTAT CCAAAATC

Base counts: A:125, C:111, G:120, T:122, total:478

inserted C at 58, before T
deleted T at 2
deleted C at 169
deleted G at 80
inserted A at 331, before T
swapped C at 27 for T
inserted A at 22, before A
inserted T at 190, before G
swapped C at 195 for C
inserted A at 274, before G

  1: AAGACCGATG TGTAGGTCTC GAAACATTCC TGGGGTAGCT CAGCTTGGGG
 51: GTTGACCCTG TCTTGCTCCC ATGAACTGAG GATTTGGAAA TAACGCTTAT
101: AACTGCGGGG GATTGATATG GGACATTGTT GCTGTAGGGC TTCGGCGTGC
151: TTAGAAACAA GAGAACACAA TTTCGATAGA CCAGGTTTCT GTCCCGCTAC
201: GAGTGATAGT AGCGCGTTAG GATTAATAAT CAGGGAGAGC ATTAAACATT
251: CTAAAAACTG ACATTCCCGA GGTAGGAACC CGAGTTGATA ACGAGTATGC
301: TCTGAAAAAT TAATTGATTG ATCCGCGACA CTAATCACAC CGTCTTCGCC
351: GTTGTAATGC ATGCTGGCTT AGCATCCGGA TGCTCTTCTA CCGATCTTAA
401: GGCGCGCACT CCCCAAGGAG CATAGAAGCA TCCCCGGCCC TCGACAGAGT
451: CTCCCAGTGT AAGTGTCTTT ATCCAAAATC

Base counts: A:128, C:110, G:119, T:123, total:480

Python[edit]

In function seq_mutate argument kinds selects between the three kinds of mutation. The characters I, D, and S are chosen from the string to give the kind of mutation to perform, so the more of that character, the more of that type of mutation performed.
Similarly parameter choice is chosen from to give the base for substitution or insertion - the more any base appears, the more likely it is to be chosen in any insertion/substitution.

import random
from collections import Counter
 
def basecount(dna):
return sorted(Counter(dna).items())
 
def seq_split(dna, n=50):
return [dna[i: i+n] for i in range(0, len(dna), n)]
 
def seq_pp(dna, n=50):
for i, part in enumerate(seq_split(dna, n)):
print(f"{i*n:>5}: {part}")
print("\n BASECOUNT:")
tot = 0
for base, count in basecount(dna):
print(f" {base:>3}: {count}")
tot += count
base, count = 'TOT', tot
print(f" {base:>3}= {count}")
 
def seq_mutate(dna, count=1, kinds="IDSSSS", choice="ATCG" ):
mutation = []
k2txt = dict(I='Insert', D='Delete', S='Substitute')
for _ in range(count):
kind = random.choice(kinds)
index = random.randint(0, len(dna))
if kind == 'I': # Insert
dna = dna[:index] + random.choice(choice) + dna[index:]
elif kind == 'D' and dna: # Delete
dna = dna[:index] + dna[index+1:]
elif kind == 'S' and dna: # Substitute
dna = dna[:index] + random.choice(choice) + dna[index+1:]
mutation.append((k2txt[kind], index))
return dna, mutation
 
if __name__ == '__main__':
length = 250
print("SEQUENCE:")
sequence = ''.join(random.choices('ACGT', weights=(1, 0.8, .9, 1.1), k=length))
seq_pp(sequence)
print("\n\nMUTATIONS:")
mseq, m = seq_mutate(sequence, 10)
for kind, index in m:
print(f" {kind:>10} @{index}")
print()
seq_pp(mseq)
Output:
SEQUENCE:
    0: GGAAGATTAGGTCACGGGCCTCATCTTGTGCGAGATAAATAATAACACTC
   50: AGCGATCATTAGAATGTATATTGTACGGGCATGTTTATCTACCATAGGTC
  100: CTGTCAAAAGATGGCTAGCTGCAATTTTTTCTTCTAGATCCCGATTACTG
  150: CGGTATTTTTGTATAACGTGCTAAACGGTGTGTTTTCAGGTCGGCCTGCT
  200: AATCTAACGCCAGTGGACTTGGGATGGACGCCCAACAACTGAGAGCGCGG

  BASECOUNT:
      A: 64
      C: 51
      G: 62
      T: 73
    TOT= 250


MUTATIONS:
 Substitute @138
 Substitute @72
     Insert @103
     Insert @129
     Insert @124
     Delete @52
     Delete @202
 Substitute @200
     Insert @158
     Delete @32

    0: GGAAGATTAGGTCACGGGCCTCATCTTGTGCGGATAAATAATAACACTCA
   50: GGATCATTAGAATGTATATTATACGGGCATGTTTATCTACCATAGGTCCT
  100: GCTCAAAAGATGGCTAGCTGCAGATTTTGTTCTTCTAGAGCCCGATTACT
  150: GCGGTATGTTTTGTATAACGTGCTAAACGGTGTGTTTTCAGGTCGGCCTG
  200: CTATCTAACGCCAGTGGACTTGGGATGGACGCCCAACAACTGAGAGCGCG
  250: G

  BASECOUNT:
      A: 63
      C: 51
      G: 65
      T: 72
    TOT= 251

zkl[edit]

var [const] bases="ACGT", lbases=bases.toLower();
dna:=(190).pump(Data().howza(3),(0).random.fp(0,4),bases.get); // bucket of bytes
 
foreach s,m in (T("Original","Mutated").zip(T(True,False))){
println("\n",s," DNA strand:"); dnaPP(dna);
println("Base Counts: ", dna.len()," : ",
dna.text.toUpper().counts() // ("A",5, "C",10, ...)
.pump(String,Void.Read,"%s(%d) ".fmt));
if(m) mutate(dna,10,True);
}
 
fcn mutate(dna,count=1,verbose=False){
if(verbose) println("Mutating:");
do(count){
n,rb := (0).random(dna.len()), lbases[(0).random(4)];
switch( (0).random(3) ){
case(0){ if(verbose) println("Change[%d] '%s' to '%s'".fmt(n,dna.charAt(n),rb));
dna[n]=rb;
}
case(1){ if(verbose) println("Delete[%d] '%s'".fmt(n,dna.charAt(n)));
dna.del(n);
}
else{ if(verbose) println("Insert[%d] '%s'".fmt(n,rb));
dna.insert(n,rb);
}
}
}
}
 
fcn dnaPP(dna,N=50){
[0..*,50].zipWith(fcn(n,bases){ println("%6d: %s".fmt(n,bases.concat())) },
dna.walker().walk.fp(50)).pump(Void); // .pump forces the iterator
}
Output:
Original DNA strand:
     0: AACGACAGGTTCTCGATGCGTGTCTTCACACATGTGGAGTCGCCAAGGAT
    50: TGTTGATCAATGCGTAAACGTCTCCACGGGATACACGGGCAGCTTGCGGT
   100: GACGAGTGCGGACCACCAAAAAAGGTGGGATCCACGTTGAGGAGCCTCAC
   150: TACCTACGGCGTGATATGGCGGCAGGAGTCAAAAACTGCT
Base Counts: 190 : A(49) C(46) G(57) T(38) 
Mutating:
Insert[123] 'c'
Insert[69] 't'
Delete[5] 'C'
Delete[147] 'T'
Change[167] 'G' to 't'
Change[153] 'C' to 'c'
Insert[151] 't'
Insert[156] 't'
Delete[150] 'T'
Change[102] 'C' to 'g'

Mutated DNA strand:
     0: AACGAAGGTTCTCGATGCGTGTCTTCACACATGTGGAGTCGCCAAGGATT
    50: GTTGATCAATGCGTAAACtGTCTCCACGGGATACACGGGCAGCTTGCGGT
   100: GAgGAGTGCGGACCACCAAAAAAcGGTGGGATCCACGTTGAGGAGCCCAC
   150: tACcTtACGGCGTGATATtGCGGCAGGAGTCAAAAACTGCT
Base Counts: 191 : A(49) C(45) G(57) T(40)