Bioinformatics/Sequence mutation: Difference between revisions
Line 953:
insertElement i e xs = take i xs <> [e] <> drop i xs
swapElement i a xs = take (pred i) xs <> [a] <> drop i xs
randomIndex
randomDNA = head . randoms <$> newStdGen
|
Revision as of 14:33, 16 June 2020
You are encouraged to solve this task according to the task description, using any language you may know.
- Task
Given a string of characters A, C, G, and T representing a DNA sequence write a routine to mutate the sequence, (string) by:
- Choosing a random base position in the sequence.
- Mutate the sequence by doing one of either:
- Swap the base at that position by changing it to one of A, C, G, or T. (which has a chance of swapping the base for the same base)
- Delete the chosen base at the position.
- Insert another base randomly chosen from A,C, G, or T into the sequence at that position.
- Randomly generate a test DNA sequence of at least 200 bases
- "Pretty print" the sequence and a count of its size, and the count of each base in the sequence
- Mutate the sequence ten times.
- "Pretty print" the sequence after all mutations, and a count of its size, and the count of each base in the sequence.
- Extra credit
- Give more information on the individual mutations applied.
- Allow mutations to be weighted and/or chosen.
C
Adenine ( A ) is always swapped for Thymine ( T ) and vice versa. Similarly with Cytosine ( C ) and Guanine ( G ). <lang C>
- include<stdlib.h>
- include<stdio.h>
- include<time.h>
typedef struct genome{
char base; struct genome *next;
}genome;
typedef struct{
char mutation; int position;
}genomeChange;
typedef struct{
int adenineCount,thymineCount,cytosineCount,guanineCount;
}baseCounts;
genome *strand; baseCounts baseData; int genomeLength = 100, lineLength = 50;
int numDigits(int num){
int len = 1;
while(num>10){ num /= 10; len++; } return len;
}
void generateStrand(){
int baseChoice = rand()%4, i; genome *strandIterator, *newStrand;
baseData.adenineCount = 0; baseData.thymineCount = 0; baseData.cytosineCount = 0; baseData.guanineCount = 0; strand = (genome*)malloc(sizeof(genome)); strand->base = baseChoice==0?'A':(baseChoice==1?'T':(baseChoice==2?'C':'G')); baseChoice==0?baseData.adenineCount++:(baseChoice==1?baseData.thymineCount++:(baseChoice==2?baseData.cytosineCount++:baseData.guanineCount++)); strand->next = NULL;
strandIterator = strand;
for(i=1;i<genomeLength;i++){ baseChoice = rand()%4;
newStrand = (genome*)malloc(sizeof(genome)); newStrand->base = baseChoice==0?'A':(baseChoice==1?'T':(baseChoice==2?'C':'G')); baseChoice==0?baseData.adenineCount++:(baseChoice==1?baseData.thymineCount++:(baseChoice==2?baseData.cytosineCount++:baseData.guanineCount++)); newStrand->next = NULL;
strandIterator->next = newStrand; strandIterator = newStrand; }
}
genomeChange generateMutation(int swapWeight, int insertionWeight, int deletionWeight){
int mutationChoice = rand()%(swapWeight + insertionWeight + deletionWeight); genomeChange mutationCommand;
mutationCommand.mutation = mutationChoice<swapWeight?'S':((mutationChoice>=swapWeight && mutationChoice<swapWeight+insertionWeight)?'I':'D'); mutationCommand.position = rand()%genomeLength;
return mutationCommand;
}
void printGenome(){
int rows, width = numDigits(genomeLength), len = 0,i,j;
lineLength = (genomeLength<lineLength)?genomeLength:lineLength;
rows = genomeLength/lineLength + (genomeLength%lineLength!=0);
genome* strandIterator = strand;
printf("\n\nGenome : \n--------\n");
for(i=0;i<rows;i++){ printf("\n%*d%3s",width,len,":");
for(j=0;j<lineLength && strandIterator!=NULL;j++){ printf("%c",strandIterator->base); strandIterator = strandIterator->next; } len += lineLength; }
while(strandIterator!=NULL){ printf("%c",strandIterator->base); strandIterator = strandIterator->next; }
printf("\n\nBase Counts\n-----------");
printf("\n%*c%3s%*d",width,'A',":",width,baseData.adenineCount); printf("\n%*c%3s%*d",width,'T',":",width,baseData.thymineCount); printf("\n%*c%3s%*d",width,'C',":",width,baseData.cytosineCount); printf("\n%*c%3s%*d",width,'G',":",width,baseData.guanineCount);
printf("\n\nTotal:%*d",width,baseData.adenineCount + baseData.thymineCount + baseData.cytosineCount + baseData.guanineCount);
printf("\n");
}
void mutateStrand(int numMutations, int swapWeight, int insertionWeight, int deletionWeight){
int i,j,width,baseChoice; genomeChange newMutation; genome *strandIterator, *strandFollower, *newStrand;
for(i=0;i<numMutations;i++){ strandIterator = strand; strandFollower = strand; newMutation = generateMutation(swapWeight,insertionWeight,deletionWeight); width = numDigits(genomeLength);
for(j=0;j<newMutation.position;j++){ strandFollower = strandIterator; strandIterator = strandIterator->next; } if(newMutation.mutation=='S'){ if(strandIterator->base=='A'){ strandIterator->base='T'; printf("\nSwapping A at position : %*d with T",width,newMutation.position); } else if(strandIterator->base=='A'){ strandIterator->base='T'; printf("\nSwapping A at position : %*d with T",width,newMutation.position); } else if(strandIterator->base=='C'){ strandIterator->base='G'; printf("\nSwapping C at position : %*d with G",width,newMutation.position); } else{ strandIterator->base='C'; printf("\nSwapping G at position : %*d with C",width,newMutation.position); } }
else if(newMutation.mutation=='I'){ baseChoice = rand()%4;
newStrand = (genome*)malloc(sizeof(genome)); newStrand->base = baseChoice==0?'A':(baseChoice==1?'T':(baseChoice==2?'C':'G')); printf("\nInserting %c at position : %*d",newStrand->base,width,newMutation.position); baseChoice==0?baseData.adenineCount++:(baseChoice==1?baseData.thymineCount++:(baseChoice==2?baseData.cytosineCount++:baseData.guanineCount++)); newStrand->next = strandIterator; strandFollower->next = newStrand; genomeLength++; }
else{ strandFollower->next = strandIterator->next; strandIterator->next = NULL; printf("\nDeleting %c at position : %*d",strandIterator->base,width,newMutation.position); free(strandIterator); genomeLength--; } }
}
int main(int argc,char* argv[]) {
int numMutations = 10, swapWeight = 10, insertWeight = 10, deleteWeight = 10;
if(argc==1||argc>6){ printf("Usage : %s <Genome Length> <Optional number of mutations> <Optional Swapping weight> <Optional Insertion weight> <Optional Deletion weight>\n",argv[0]); return 0; }
switch(argc){ case 2: genomeLength = atoi(argv[1]); break; case 3: genomeLength = atoi(argv[1]); numMutations = atoi(argv[2]); break; case 4: genomeLength = atoi(argv[1]); numMutations = atoi(argv[2]); swapWeight = atoi(argv[3]); break; case 5: genomeLength = atoi(argv[1]); numMutations = atoi(argv[2]); swapWeight = atoi(argv[3]); insertWeight = atoi(argv[4]); break; case 6: genomeLength = atoi(argv[1]); numMutations = atoi(argv[2]); swapWeight = atoi(argv[3]); insertWeight = atoi(argv[4]); deleteWeight = atoi(argv[5]); break; };
srand(time(NULL)); generateStrand();
printf("\nOriginal:");
printGenome(); mutateStrand(numMutations,swapWeight,insertWeight,deleteWeight);
printf("\n\nMutated:"); printGenome();
return 0;
} </lang> Sample run :
C:\My Projects\networks>a 500 30 15 10 5 Original: Genome : -------- 0 :CGATGAGTTTCCTCCAAGGAGCAGGGCGTGACGGAAGGGAGGCTTAGGTC 50 :CGCATGCTCGTCGGCAGCCGGCTGGTGCCGTCGTAACCTTCACATTATTC 100 :TAGAATTTCGATGCACCTGATGACTCATACCCAGATGTAGGGGTACGCGA 150 :TGCAGATGCGGGCACGAGGAATTGTGGGCAAGCCGGCAGGTCTTTTGTAA 200 :GTTGTCACTAACTAAATAGAGGGATGGATGTTATAGCACACTACTGTCGA 250 :TTACGGACAGCGTCCCGATTCGTCATACGACCAGGATATATACTCGACGT 300 :CCAACAGGAGATTCACGTAGTGAACGCAGTTGACAGCCTGCTCGTATCTC 350 :CAGGGGTGGACTGCACCGTTCGTTAACTGCTGCCACATTAAACAGCTTCC 400 :CACTCCTTGACGCCAGACTCGGTACCACAGACCGTCAAGCTCCTATTTCC 450 :TTTGCAGTTAAAAAACACTATGGTGAAGGTCGGAGAGATGACCTCATCTA Base Counts ----------- A :124 T :118 C :126 G :132 Total:500 Inserting G at position : 205 Inserting G at position : 144 Inserting C at position : 171 Swapping A at position : 335 with T Inserting A at position : 101 Swapping C at position : 109 with G Swapping A at position : 306 with T Inserting G at position : 51 Swapping G at position : 1 with C Deleting G at position : 60 Swapping G at position : 66 with C Inserting C at position : 41 Inserting C at position : 425 Swapping C at position : 173 with G Inserting A at position : 319 Swapping G at position : 460 with C Deleting T at position : 61 Swapping C at position : 160 with G Inserting C at position : 251 Swapping G at position : 337 with C Inserting G at position : 43 Inserting T at position : 146 Inserting T at position : 181 Deleting G at position : 53 Deleting A at position : 464 Swapping G at position : 362 with C Swapping G at position : 190 with C Swapping C at position : 280 with G Inserting T at position : 479 Deleting C at position : 400 Mutated: Genome : -------- 0 :CCATGAGTTTCCTCCAAGGAGCAGGGCGTGACGGAAGGGAGCGGCTTAGG 50 :TCCGCATGCTCCGGCACCCGGCTGGTGCCGTCGTAACCTTCACATTATTC 100 :TAAGAATTTGGATGCACCTGATGACTCATACCCAGATGTAGGGGTTGACG 150 :CGATGCAGATGGGGGCACGAGGAGATTGTGTGGCAAGCCGCCAGGTCTTT 200 :TGTAAGTTGTGCACTAACTAAATAGAGGGATGGATGTTATAGCACACTAC 250 :TGTCCGATTACGGACAGCGTCCCGATTCGTGATACGACCAGGATATATAC 300 :TCGACGTCCTACAGGAGATTCAACGTAGTGAACGCAGTTCTCAGCCTGCT 350 :CGTATCTCCAGGCGTGGACTGCACCGTTCGTTAACTGCTGCCACATTAAA 400 :AGCTTCCCACTCCTTGACGCCAGACTCCGGTACCACAGACCGTCAAGCTC 450 :CTATTTCCTTTCCGTTAAAAAACACTATTGGTGAAGGTCGGAGAGATGAC 500 :CTCATCTA Base Counts ----------- A :126 T :121 C :130 G :136 Total:513
C++
<lang cpp>#include <array>
- include <iomanip>
- include <iostream>
- include <random>
- include <string>
class sequence_generator { public:
sequence_generator(); std::string generate_sequence(size_t length); void mutate_sequence(std::string&); static void print_sequence(std::ostream&, const std::string&); enum class operation { change, erase, insert }; void set_weight(operation, unsigned int);
private:
char get_random_base() { return bases_[base_dist_(engine_)]; } operation get_random_operation(); static const std::array<char, 4> bases_; std::mt19937 engine_; std::uniform_int_distribution<size_t> base_dist_; std::array<unsigned int, 3> operation_weight_; unsigned int total_weight_;
};
const std::array<char, 4> sequence_generator::bases_{ 'A', 'C', 'G', 'T' };
sequence_generator::sequence_generator() : engine_(std::random_device()()),
base_dist_(0, bases_.size() - 1), total_weight_(operation_weight_.size()) { operation_weight_.fill(1);
}
sequence_generator::operation sequence_generator::get_random_operation() {
std::uniform_int_distribution<unsigned int> op_dist(0, total_weight_ - 1); unsigned int n = op_dist(engine_), op = 0, weight = 0; for (; op < operation_weight_.size(); ++op) { weight += operation_weight_[op]; if (n < weight) break; } return static_cast<operation>(op);
}
void sequence_generator::set_weight(operation op, unsigned int weight) {
total_weight_ -= operation_weight_[static_cast<size_t>(op)]; operation_weight_[static_cast<size_t>(op)] = weight; total_weight_ += weight;
}
std::string sequence_generator::generate_sequence(size_t length) {
std::string sequence; sequence.reserve(length); for (size_t i = 0; i < length; ++i) sequence += get_random_base(); return sequence;
}
void sequence_generator::mutate_sequence(std::string& sequence) {
std::uniform_int_distribution<size_t> dist(0, sequence.length() - 1); size_t pos = dist(engine_); char b; switch (get_random_operation()) { case operation::change: b = get_random_base(); std::cout << "Change base at position " << pos << " from " << sequence[pos] << " to " << b << '\n'; sequence[pos] = b; break; case operation::erase: std::cout << "Erase base " << sequence[pos] << " at position " << pos << '\n'; sequence.erase(pos, 1); break; case operation::insert: b = get_random_base(); std::cout << "Insert base " << b << " at position " << pos << '\n'; sequence.insert(pos, 1, b); break; }
}
void sequence_generator::print_sequence(std::ostream& out, const std::string& sequence) {
constexpr size_t base_count = bases_.size(); std::array<size_t, base_count> count = { 0 }; for (size_t i = 0, n = sequence.length(); i < n; ++i) { if (i % 50 == 0) { if (i != 0) out << '\n'; out << std::setw(3) << i << ": "; } out << sequence[i]; for (size_t j = 0; j < base_count; ++j) { if (bases_[j] == sequence[i]) { ++count[j]; break; } } } out << '\n'; out << "Base counts:\n"; size_t total = 0; for (size_t j = 0; j < base_count; ++j) { total += count[j]; out << bases_[j] << ": " << count[j] << ", "; } out << "Total: " << total << '\n';
}
int main() {
sequence_generator gen; gen.set_weight(sequence_generator::operation::change, 2); std::string sequence = gen.generate_sequence(250); std::cout << "Initial sequence:\n"; sequence_generator::print_sequence(std::cout, sequence); constexpr int count = 10; for (int i = 0; i < count; ++i) gen.mutate_sequence(sequence); std::cout << "After " << count << " mutations:\n"; sequence_generator::print_sequence(std::cout, sequence); return 0;
}</lang>
- Output:
Initial sequence: 0: CATATCTGCGTAAGGCGTCGAATCCTTAGAGAAAACTCGCCAAACGCGCT 50: AGCCAAGACTTAATTAAAGGCTGGCCACATAACAGTAGTACTGCAAGGAT 100: GACGTGACTACAACGTGGAATACTCTATCTGATGAGCCCCACGTGGGCCA 150: ACCTTCCAATGCGGCGTCTTGCAGTCTTCGGACTTTGCCTCTACTAGGAG 200: TAGCCATGACGAGTGGTGAGGCGGAGGGACCAATTCCGCACTTCGAATCG Base counts: A: 67, C: 65, G: 64, T: 54, Total: 250 Change base at position 39 from C to C Erase base T at position 194 Insert base T at position 70 Insert base C at position 190 Insert base T at position 45 Erase base A at position 111 Change base at position 96 from A to C Change base at position 113 from A to C Change base at position 5 from C to A Change base at position 44 from C to T After 10 mutations: 0: CATATATGCGTAAGGCGTCGAATCCTTAGAGAAAACTCGCCAAATTGCGC 50: TAGCCAAGACTTAATTAAAGGTCTGGCCACATAACAGTAGTACTGCCAGG 100: ATGACGTGACTCACCGTGGAATACTCTATCTGATGAGCCCCACGTGGGCC 150: AACCTTCCAATGCGGCGTCTTGCAGTCTTCGGACTTTGCCCTCTACAGGA 200: GTAGCCATGACGAGTGGTGAGGCGGAGGGACCAATTCCGCACTTCGAATC 250: G Base counts: A: 65, C: 66, G: 64, T: 56, Total: 251
Common Lisp
Usage :
- (mutate (<Genome length> <Number of mutations>
- :ins_w <Insertion weight> :swp_w <Swap weight> :del_w <Delete weight>
- :genome <Genome Sequence>)
All keys are optional. <Genome length> is discarded when :genome is set. <lang lisp> (defun random_base ()
(random 4))
(defun basechar (base)
(char "ACTG" base))
(defun generate_genome (genome_length)
(let (genome '()) (loop for i below genome_length do (push (random_base) genome)) (return-from generate_genome genome)))
(defun map_genome (genome)
(let (seq '()) (loop for n from (1- (length genome)) downto 0 do (push (position (char genome n) "ACTG") seq)) seq))
(defun output_genome_info (genome &optional (genome_name "ORIGINAL"))
(let ((ac 0) (tc 0) (cc 0) (gc 0)) (format t "~% ---- ~a ----" genome_name) (do ((n 0 (1+ n))) ((= n (length genome))) (when (= 0 (mod n 50)) (format t "~& ~4d: " (1+ n))) (case (nth n genome) (0 (incf ac)) (1 (incf tc)) (2 (incf cc)) (3 (incf gc))) (format t "~c" (basechar (nth n genome)))) (format t "~2%- Total : ~3d~%A : ~d C : ~d~%T : ~d G : ~d~2%" (length genome) ac tc cc gc)))
(defun insert_base (genome)
(let ((place (random (length genome))) (base (random_base))) (format t "Insert + ~c at ~3d~%" (basechar base) (+ 1 place)) (if (= 0 place) (push base genome) (push base (cdr (nthcdr (1- place) genome)))) (return-from insert_base genome)))
(defun swap_base (genome)
(let ((place (random (length genome))) (base (random_base))) (format t "Swap ~c -> ~c at ~3d~%" (basechar (nth place genome)) (basechar base) (+ 1 place)) (setf (nth place genome) base) (return-from swap_base genome)))
(defun delete_base (genome)
(let ((place (random (length genome)))) (format t "Delete - ~c at ~3d~%" (basechar (nth place genome)) (+ 1 place)) (if (= 0 place) (pop genome) (pop (cdr (nthcdr (1- place) genome)))) (return-from delete_base genome)))
(defun mutate (genome_length n_mutations
&key (ins_w 10) (swp_w 10) (del_w 10) (genome (generate_genome genome_length) has_genome)) (if has_genome (setf genome (map_genome genome))) (output_genome_info genome) (format t " ---- MUTATION SEQUENCE ----~%") (do ((n 0 (1+ n))) ((= n n_mutations)) (setf mutation_type (random (+ ins_w swp_w del_w))) (format t "~3d : " (1+ n)) (setf genome (cond ((< mutation_type ins_w) (insert_base genome)) ((< mutation_type (+ ins_w swp_w)) (swap_base genome)) (t (delete_base genome))))) (output_genome_info genome "MUTATED"))
</lang>
- Output:
[CLISP]> (mutate 500 30 :ins_w 5 :swp_w 10 :del_w 15) ---- ORIGINAL ---- 1: CTGCCATGCGTAAGATAGCAGAAGTGTTCGCGTATATGTTCATTTTGGCT 51: TCAACCTACGGGCGTATACACATTTCAGCTCGGAGCTCGGGCCCCAGTAC 101: CTAGTTTTTCTTTCAAGCTGGAACTACGGCGCCTTGTGCCGTAATCCGTC 151: GGGGTGATAATTCTAACTTGCTAACACGCGCAGATGGTCCGGTCGCGGGT 201: CAAACATTCGCCAAGGTCAACTTACCCTTAAAAGGCTTGCAACAGGGACC 251: AGTACTAGGAAGTAGACTTCATGGATCTTGGGCATAGTGGACTAGCTTAT 301: TTACCAGCGACGTTCTTGCACCCGAGACATTATCATAGTTCGACAGCGTT 351: GAACTGTGCTTAGGTTAATGCCGCGCTTCCTACCTTGAACTAAACACAGC 401: ACACAGTGAGAACGTAGCGGCCTCTTTTCCTGCCTGCAATTCGTAAGCCT 451: GATTTGACGGGTCTGGAGTTTGGCTCGGAGTAGGTCTGCTACTTAAATTC - Total : 500 A : 116 C : 124 T : 137 G : 123 ---- MUTATION SEQUENCE ---- 1 : Swap T -> C at 74 2 : Delete - T at 208 3 : Insert + C at 332 4 : Insert + G at 287 5 : Delete - T at 188 6 : Swap G -> A at 263 7 : Delete - G at 323 8 : Delete - A at 336 9 : Swap T -> A at 426 10 : Swap G -> G at 38 11 : Swap G -> C at 288 12 : Delete - T at 11 13 : Swap G -> A at 197 14 : Insert + T at 476 15 : Swap C -> A at 5 16 : Swap A -> T at 211 17 : Swap A -> T at 248 18 : Delete - C at 471 19 : Delete - C at 455 20 : Swap T -> T at 184 21 : Insert + T at 224 22 : Delete - T at 224 23 : Insert + T at 333 24 : Delete - C at 18 25 : Delete - G at 139 26 : Delete - T at 333 27 : Insert + T at 80 28 : Insert + T at 480 29 : Swap A -> T at 341 30 : Swap T -> C at 73 ---- MUTATED ---- 1: CTGCAATGCGAAGATAGAGAAGTGTTCGCGTATATGTTCATTTTGGCTTC 51: AACCTACGGGCGTATACACATCCCAGCTCTGGAGCTCGGGCCCCAGTACC 101: TAGTTTTTCTTTCAAGCTGGAACTACGGCGCCTTGTGCCTAATCCGTCGG 151: GGTGATAATTCTAACTTGCTAACACGCGCAGATGGCCGGTCGCGGATCAA 201: ACATCGCCATGGTCAACTTACCCTTAAAAGGCTTGCAACAGGGACCTGTA 251: CTAGGAAGTAAACTTCATGGATCTTGGGCATAGGTCGACTAGCTTATTTA 301: CCAGCGACGTTCTTGCACCCAGACATTACTCTAGTTCGACTGCGTTGAAC 351: TGTGCTTAGGTTAATGCCGCGCTTCCTACCTTGAACTAAACACAGCACAC 401: AGTGAGAACGTAGCGGCCTCTTTACCTGCCTGCAATTCGTAAGCCTGATT 451: TGAGGGTCTGGAGTTTGGTCGGTAGTAGGTTCTGCTACTTAAATTC - Total : 496 A : 116 C : 124 T : 137 G : 119
Factor
<lang factor>USING: assocs combinators.random formatting grouping io kernel macros math math.statistics namespaces prettyprint quotations random sequences sorting ; IN: sequence-mutation
SYMBOL: verbose? ! Turn on to show mutation details.
! Off by default.
! Return a random base as a character.
- rand-base ( -- n ) "ACGT" random ;
! Generate a random dna sequence of length n.
- <dna> ( n -- seq ) [ rand-base ] "" replicate-as ;
! Prettyprint a dna sequence in blocks of n.
- .dna ( seq n -- )
"SEQUENCE:" print [ group ] keep [ * swap " %3d: %s\n" printf ] curry each-index ;
! Show a histogram of bases in a dna sequence and their total.
- show-counts ( seq -- )
"BASE COUNTS:" print histogram >alist [ first ] sort-with [ [ " %c: %3d\n" printf ] assoc-each ] [ "TOTAL: " write [ second ] [ + ] map-reduce . ] bi ;
! Prettyprint the overall state of a dna sequence.
- show-dna ( seq -- ) [ 50 .dna nl ] [ show-counts nl ] bi ;
! Call a quotation only if verbose? is on.
- log ( quot -- ) verbose? get [ call ] [ drop ] if ; inline
! Set index n to a random base.
- bswap ( n seq -- seq' )
[ rand-base ] 2dip 3dup [ nth ] keepd spin [ " index %3d: swapping %c with %c\n" printf ] 3curry log [ set-nth ] keep ;
! Remove the base at index n.
- bdelete ( n seq -- seq' )
2dup dupd nth [ " index %3d: deleting %c\n" printf ] 2curry log remove-nth ;
! Insert a random base at index n.
- binsert ( n seq -- seq' )
[ rand-base ] 2dip over reach [ " index %3d: inserting %c\n" printf ] 2curry log insert-nth ;
! Allow "passing" probabilities to casep. This is necessary ! because casep is a macro. MACRO: build-casep-seq ( seq -- quot )
{ [ bswap ] [ bdelete ] [ binsert ] } zip 1quotation ;
! Mutate a dna sequence according to some weights. ! For example, ! "ACGT" { 0.1 0.3 0.6 } mutate ! means swap with 0.1 probability, delete with 0.3 probability, ! and insert with 0.6 probability.
- mutate ( dna-seq weights-seq -- dna-seq' )
[ [ length random ] keep ] [ build-casep-seq ] bi* casep ; inline
! Prettyprint a sequence of weights.
- show-weights ( seq -- )
"MUTATION PROBABILITIES:" print " swap: %.2f\n delete: %.2f\n insert: %.2f\n\n" vprintf ;
- main ( -- )
verbose? on "ORIGINAL " write 200 <dna> dup show-dna 10 { 0.2 0.2 0.6 } dup show-weights "MUTATION LOG:" print [ mutate ] curry times nl "MUTATED " write show-dna ;
MAIN: main</lang>
- Output:
ORIGINAL SEQUENCE: 0: CACAGGCAAGGGTCGTATGCTACTATAGATGTTTCAGAACCGTATTTCGA 50: CTCCGACGCGGTCATGAAGCAGACACTCCGTCACCGATTGCAAGTGTGCA 100: GTTGGGAGAATGCATTAAAATTCTGGGTTATGAAACGGGCAGCCTTGATT 150: GACAGGTGGTCCAGCGACAGTTTAACATACCAAACTCTTTGAGTACGCAG BASE COUNTS: A: 55 C: 44 G: 52 T: 49 TOTAL: 200 MUTATION PROBABILITIES: swap: 0.20 delete: 0.20 insert: 0.60 MUTATION LOG: index 82: deleting A index 161: inserting C index 48: deleting G index 10: swapping G with T index 184: swapping T with C index 137: inserting T index 60: inserting T index 135: inserting C index 32: inserting T index 201: inserting A MUTATED SEQUENCE: 0: CACAGGCAAGTGTCGTATGCTACTATAGATGTTTTCAGAACCGTATTTCA 50: CTCCGACGCGGTTCATGAAGCAGACACTCCGTCCCGATTGCAAGTGTGCA 100: GTTGGGAGAATGCATTAAAATTCTGGGTTATGAAACCGGGTCAGCCTTGA 150: TTGACAGGTGGTCCCAGCGACAGTTTAACATACCAAACCCTTTGAGTACG 200: CAAG BASE COUNTS: A: 55 C: 47 G: 50 T: 52 TOTAL: 204
Go
<lang go>package main
import (
"fmt" "math/rand" "sort" "time"
)
const bases = "ACGT"
// 'w' contains the weights out of 300 for each // of swap, delete or insert in that order. func mutate(dna string, w [3]int) string {
le := len(dna) // get a random position in the dna to mutate p := rand.Intn(le) // get a random number between 0 and 299 inclusive r := rand.Intn(300) bytes := []byte(dna) switch { case r < w[0]: // swap base := bases[rand.Intn(4)] fmt.Printf(" Change @%3d %q to %q\n", p, bytes[p], base) bytes[p] = base case r < w[0]+w[1]: // delete fmt.Printf(" Delete @%3d %q\n", p, bytes[p]) copy(bytes[p:], bytes[p+1:]) bytes = bytes[0 : le-1] default: // insert base := bases[rand.Intn(4)] bytes = append(bytes, 0) copy(bytes[p+1:], bytes[p:]) fmt.Printf(" Insert @%3d %q\n", p, base) bytes[p] = base } return string(bytes)
}
// Generate a random dna sequence of given length. func generate(le int) string {
bytes := make([]byte, le) for i := 0; i < le; i++ { bytes[i] = bases[rand.Intn(4)] } return string(bytes)
}
// Pretty print dna and stats. func prettyPrint(dna string, rowLen int) {
fmt.Println("SEQUENCE:") le := len(dna) for i := 0; i < le; i += rowLen { k := i + rowLen if k > le { k = le } fmt.Printf("%5d: %s\n", i, dna[i:k]) } baseMap := make(map[byte]int) // allows for 'any' base for i := 0; i < le; i++ { baseMap[dna[i]]++ } var bases []byte for k := range baseMap { bases = append(bases, k) } sort.Slice(bases, func(i, j int) bool { // get bases into alphabetic order return bases[i] < bases[j] })
fmt.Println("\nBASE COUNT:") for _, base := range bases { fmt.Printf(" %c: %3d\n", base, baseMap[base]) } fmt.Println(" ------") fmt.Println(" Σ:", le) fmt.Println(" ======\n")
}
// Express weights as a string. func wstring(w [3]int) string {
return fmt.Sprintf(" Change: %d\n Delete: %d\n Insert: %d\n", w[0], w[1], w[2])
}
func main() {
rand.Seed(time.Now().UnixNano()) dna := generate(250) prettyPrint(dna, 50) muts := 10 w := [3]int{100, 100, 100} // use e.g. {0, 300, 0} to choose only deletions fmt.Printf("WEIGHTS (ex 300):\n%s\n", wstring(w)) fmt.Printf("MUTATIONS (%d):\n", muts) for i := 0; i < muts; i++ { dna = mutate(dna, w) } fmt.Println() prettyPrint(dna, 50)
}</lang>
- Output:
Sample run:
SEQUENCE: 0: AATCCAGAAGTTGCGGGAACCGTCGAATAGTGTTCATTAAGTGTCCCGCG 50: GAGTAGCCTCGTAATATAGAATGACCGGGCTTCCCAGCTAGACTTGTCCG 100: CCACGTTTGTGTAGGGCGCAGCGAGACTGCTCTTGATACTCGTTATGTTC 150: CTCACCGGATTATTGAATAGAGTCGAGGGGCTGACGTGACTGAACATTGC 200: CTCCTTTGCGACTAATCTTTCCTTCAATGAACAGGCGCTACCCGTCATCG BASE COUNT: A: 56 C: 63 G: 64 T: 67 ------ Σ: 250 ====== WEIGHTS (ex 300): Change: 100 Delete: 100 Insert: 100 MUTATIONS (10): Change @195 'A' to 'C' Insert @ 95 'G' Change @137 'T' to 'C' Delete @207 'T' Insert @148 'C' Insert @113 'A' Change @ 45 'C' to 'T' Delete @ 93 'T' Insert @ 51 'C' Delete @248 'A' SEQUENCE: 0: AATCCAGAAGTTGCGGGAACCGTCGAATAGTGTTCATTAAGTGTCTCGCG 50: GCAGTAGCCTCGTAATATAGAATGACCGGGCTTCCCAGCTAGACTGGTCC 100: GCCACGTTTGTGTAAGGGCGCAGCGAGACTGCTCTTGACACTCGTTATGC 150: TTCCTCACCGGATTATTGAATAGAGTCGAGGGGCTGACGTGACTGAACCT 200: TGCCTCCTTGCGACTAATCTTTCCTTCAATGAACAGGCGCTACCCGTCTC 250: G BASE COUNT: A: 55 C: 66 G: 65 T: 65 ------ Σ: 251 ======
Haskell
<lang haskell>import Data.List (group, sort) import Data.List.Split (chunksOf) import System.Random (Random, randomR, random, newStdGen, randoms, getStdRandom) import Text.Printf (PrintfArg(..), fmtChar, fmtPrecision, formatString, IsChar(..), printf)
data Mutation = Swap | Delete | Insert deriving (Show, Eq, Ord, Enum, Bounded) data DNABase = A | C | G | T deriving (Show, Read, Eq, Ord, Enum, Bounded) type DNASequence = [DNABase]
instance Random DNABase where
randomR (a, b) g = case randomR (fromEnum a, fromEnum b) g of (x, y) -> (toEnum x, y) random = randomR (minBound, maxBound)
instance Random Mutation where
randomR (a, b) g = case randomR (fromEnum a, fromEnum b) g of (x, y) -> (toEnum x, y) random = randomR (minBound, maxBound)
instance PrintfArg DNABase where
formatArg x fmt = formatString (show x) (fmt { fmtChar = 's', fmtPrecision = Nothing })
instance PrintfArg Mutation where
formatArg x fmt = formatString (show x) (fmt { fmtChar = 's', fmtPrecision = Nothing })
instance IsChar DNABase where
toChar = head . show fromChar = read . pure
chunkedDNASequence :: DNASequence -> [(Int, [DNABase])] chunkedDNASequence = zip [50,100..] . chunksOf 50
baseCounts :: DNASequence -> [(DNABase, Int)] baseCounts = fmap ((,) . head <*> length) . group . sort
newSequence :: Int -> IO DNASequence newSequence n = take n . randoms <$> newStdGen
mutateSequence :: DNASequence -> IO ((Mutation, Int), DNASequence) mutateSequence [] = fail "empty sequence" mutateSequence ds = mutate ds =<< randomMutation
where randomMutation = head . randoms <$> newStdGen mutate xs m = do i <- randomIndex (length xs) case m of Swap -> randomDNA >>= \d -> pure ((Swap, i), swapElement i d xs) Insert -> randomDNA >>= \d -> pure ((Insert, i), insertElement i d xs) Delete -> pure ((Delete, i), dropElement i xs) where dropElement i xs = take (pred i) xs <> drop i xs insertElement i e xs = take i xs <> [e] <> drop i xs swapElement i a xs = take (pred i) xs <> [a] <> drop i xs randomIndex n = getStdRandom (randomR (1, n)) randomDNA = head . randoms <$> newStdGen
mutate :: Int -> DNASequence -> IO DNASequence mutate 0 s = pure s mutate n s = do
(m, ms) <- mutateSequence s uncurry (printf "%6s @ %d\n") m mutate (pred n) ms
main :: IO () main = do
ds <- newSequence 200 putStrLn "\nInitial Sequence:" >> showSequence ds putStrLn "\nBase Counts:" >> showBaseCounts ds printf "Total: %d\n\n" $ length ds ms <- mutate 10 ds putStrLn "\nMutated Sequence:" >> showSequence ms putStrLn "\nBase Counts:" >> showBaseCounts ms printf "Total: %d\n" $ length ms where showSequence = mapM_ (uncurry (printf "%3d: %s\n")) . chunkedDNASequence showBaseCounts = mapM_ (uncurry (printf "%s: %3d\n")) . baseCounts</lang>
- Output:
Initial Sequence: 50: ACAGAGAGACCCACAATGGGGGGTCCGACATAGGCAGATACAGTAGACGA 100: AGTAGCGTTCTCACATTCGCCGTCTTCCACACGTTTGCCTCCCGGGTTGA 150: CCCCGTGTAATGGAACCCAAGCGAATGGCGGCGTAGGCAAACTTAACATG 200: GAATCGGTGGCATAAATGACGGTTCTCCGCCGACAGCGCATGGATTCTTG Base Counts: A: 51 C: 53 G: 56 T: 40 Total: 200 Swap @ 82 Insert @ 15 Swap @ 11 Swap @ 159 Swap @ 121 Swap @ 184 Swap @ 126 Delete @ 134 Swap @ 78 Insert @ 69 Mutated Sequence: 50: ACAGAGAGACGCACATATGGGGGGTCCGACATAGGCAGATACAGTAGACG 100: AAGTAGCGTTCTCACATTCGGCCGTCTTTCACATGTTTGCCTCCCGGGTT 150: GACCCCGTGTAATGGAACCCAAGCGATTGGCGGCTAGGCAAACTTAACAT 200: GGAATCGGGGGCATAAATGACGGTTCTCCGCCGCCAGCGCATGGATTCTT 250: G Base Counts: A: 49 C: 51 G: 58 T: 43 Total: 201
J
<lang J>ACGT=: 'ACGT' MUTS=: ;: 'del ins mut'
NB. generate sequence of size y of uniformly selected nucleotides. NB. represent sequences as ints in range i.4 pretty printed. nuc NB. defined separately to avoid fixing value inside mutation NB. functions. nuc=: monad : '?4' dna=: nuc"0 @ i.
NB. randomly mutate nucleotide at a random index by deletion insertion NB. or mutation of a nucleotide. del=: {.,[:}.}. ins=: {.,nuc@],}. mut=: {.,nuc@],[:}.}.
NB. pretty print nucleotides in rows of 50 with numbering seq=: [: (;~ [: (4&":"0) 50*i.@#) _50]\{&ACGT
sim=: monad define 'n k ws'=. y NB. initial size, mutations, and weights for mutations ws=. (% +/) ws NB. normalize weights A=.0$]D0=.D=. dna n NB. initial dna and history of actions
NB. k times do a random action according to weights and record it for. i.k do.
D=.". action=. (":?#D),' ',(":MUTS{::~(+/\ws)I.?0),' D' A=. action ; A
end.
echo 'actions';,. A-.a: echo ('mutation';'probability') , MUTS ,. <"0 ws ('start';'end'),.(seq D0) ,: seq D )
simulate=: (sim@(1 1 1&; &. |. ))`sim@.(3=#)</lang>
- Output:
simulate 200 ; 10 ┌─────────┐ │actions │ ├─────────┤ │60 mut D │ ├─────────┤ │156 del D│ ├─────────┤ │44 mut D │ ├─────────┤ │64 mut D │ ├─────────┤ │167 mut D│ ├─────────┤ │40 ins D │ ├─────────┤ │39 mut D │ ├─────────┤ │187 del D│ ├─────────┤ │186 del D│ ├─────────┤ │150 del D│ └─────────┘ ┌────────┬───────────┐ │mutation│probability│ ├────────┼───────────┤ │del │0.333333 │ ├────────┼───────────┤ │ins │0.333333 │ ├────────┼───────────┤ │mut │0.333333 │ └────────┴───────────┘ ┌─────┬────┬──────────────────────────────────────────────────┐ │start│ 0│GGCTGTTGGCCGCCAATCTACAATGATAGCGCGTGAGGAGGGCTAATGTA│ │ │ 50│GAGCCAATAATGGATGCTCGCGCTTCTGCTTATGCTGGTTACTGCTGCCC│ │ │ 100│AAAAACGGGGTACATTGAGCGATAAGCCCGCAAGGTTACTGCTCGTGACA│ │ │ 150│GTCCGAACACCACATTCGTGGTTACTCGACTCTGCCACCTCTTAGCGGAT│ ├─────┼────┼──────────────────────────────────────────────────┤ │end │ 0│GGCTGTTGGCCGCCAATCTACAATGATAGCGCGTGAGGACCGGCAAATGT│ │ │ 50│AGAGCCAATACTGGATGCTCGCGCTTCTGCTTATGCTGGTTACTGCTGCC│ │ │ 100│CAAAAACGGGGTACATTGAGCGATAAGCCCGCAAGGTTACTGCTCGTGAC│ │ │ 150│ATCCGACACCACATTCCTGGTTACTCGACTCTGCCACCTTAGCGGAT │ └─────┴────┴──────────────────────────────────────────────────┘ simulate 200 ; 10 ; 1 3 1 ┌─────────┐ │actions │ ├─────────┤ │120 ins D│ ├─────────┤ │199 ins D│ ├─────────┤ │138 mut D│ ├─────────┤ │15 ins D │ ├─────────┤ │8 del D │ ├─────────┤ │135 ins D│ ├─────────┤ │29 ins D │ ├─────────┤ │118 del D│ ├─────────┤ │111 ins D│ ├─────────┤ │10 del D │ └─────────┘ ┌────────┬───────────┐ │mutation│probability│ ├────────┼───────────┤ │del │0.2 │ ├────────┼───────────┤ │ins │0.6 │ ├────────┼───────────┤ │mut │0.2 │ └────────┴───────────┘ ┌─────┬────┬──────────────────────────────────────────────────┐ │start│ 0│GAACATACAATATCGTGTGGGTGGTAAGGTGCGCCGATTTGGCAGTGTAG│ │ │ 50│AGCGGCCTCTGGCCGGGCCCATACTGACATATCTTTTATCTCCGTGCTAG│ │ │ 100│CAGAAGAATCAAACGCGTCAAGATGCTGGCGCGGGCTGATATGCGCCCGG│ │ │ 150│CAGTGGAGAACTGCGTTGATACACCTCAAAGATAAGCGGACGATATTAGC│ ├─────┼────┼──────────────────────────────────────────────────┤ │end │ 0│GAACATACAATCGTGCTGGGTGGTAAGGTTGCGCCGATTTGGCAGTGTAG│ │ │ 50│AGCGGCCTCTGGCCGGGCCCATACTGACATATCTTTTATCTCCGTGCTAG│ │ │ 100│CAGAAGAATCAACACGCGTAGAGATGCTGGCGCGGGACTGATATGCGCCC│ │ │ 150│GGCAGTGGAGAACTGCGTTGATACACCTCAAAGATAAGCGGACGATATTA│ │ │ 200│GGC │ └─────┴────┴──────────────────────────────────────────────────┘
Java
<lang java>import java.util.Arrays; import java.util.Random;
public class SequenceMutation {
public static void main(String[] args) { SequenceMutation sm = new SequenceMutation(); sm.setWeight(OP_CHANGE, 3); String sequence = sm.generateSequence(250); System.out.println("Initial sequence:"); printSequence(sequence); int count = 10; for (int i = 0; i < count; ++i) sequence = sm.mutateSequence(sequence); System.out.println("After " + count + " mutations:"); printSequence(sequence); }
public SequenceMutation() { totalWeight_ = OP_COUNT; Arrays.fill(operationWeight_, 1); }
public String generateSequence(int length) { char[] ch = new char[length]; for (int i = 0; i < length; ++i) ch[i] = getRandomBase(); return new String(ch); }
public void setWeight(int operation, int weight) { totalWeight_ -= operationWeight_[operation]; operationWeight_[operation] = weight; totalWeight_ += weight; }
public String mutateSequence(String sequence) { char[] ch = sequence.toCharArray(); int pos = random_.nextInt(ch.length); int operation = getRandomOperation(); if (operation == OP_CHANGE) { char b = getRandomBase(); System.out.println("Change base at position " + pos + " from " + ch[pos] + " to " + b); ch[pos] = b; } else if (operation == OP_ERASE) { System.out.println("Erase base " + ch[pos] + " at position " + pos); char[] newCh = new char[ch.length - 1]; System.arraycopy(ch, 0, newCh, 0, pos); System.arraycopy(ch, pos + 1, newCh, pos, ch.length - pos - 1); ch = newCh; } else if (operation == OP_INSERT) { char b = getRandomBase(); System.out.println("Insert base " + b + " at position " + pos); char[] newCh = new char[ch.length + 1]; System.arraycopy(ch, 0, newCh, 0, pos); System.arraycopy(ch, pos, newCh, pos + 1, ch.length - pos); newCh[pos] = b; ch = newCh; } return new String(ch); }
public static void printSequence(String sequence) { int[] count = new int[BASES.length]; for (int i = 0, n = sequence.length(); i < n; ++i) { if (i % 50 == 0) { if (i != 0) System.out.println(); System.out.print(String.format("%3d: ", i)); } char ch = sequence.charAt(i); System.out.print(ch); for (int j = 0; j < BASES.length; ++j) { if (BASES[j] == ch) { ++count[j]; break; } } } System.out.println(); System.out.println("Base counts:"); int total = 0; for (int j = 0; j < BASES.length; ++j) { total += count[j]; System.out.print(BASES[j] + ": " + count[j] + ", "); } System.out.println("Total: " + total); }
private char getRandomBase() { return BASES[random_.nextInt(BASES.length)]; }
private int getRandomOperation() { int n = random_.nextInt(totalWeight_), op = 0; for (int weight = 0; op < OP_COUNT; ++op) { weight += operationWeight_[op]; if (n < weight) break; } return op; }
private final Random random_ = new Random(); private int[] operationWeight_ = new int[OP_COUNT]; private int totalWeight_ = 0;
private static final int OP_CHANGE = 0; private static final int OP_ERASE = 1; private static final int OP_INSERT = 2; private static final int OP_COUNT = 3; private static final char[] BASES = {'A', 'C', 'G', 'T'};
}</lang>
- Output:
Initial sequence: 0: TCCCCTCCAGTTAGCAGAAATATTAGCTAACGATACCTCGACACGGAGGG 50: GTGGGGCCAACTCTTAACACAATTACGAGAACCATCCTTCGAAAGCAAAA 100: AAGTTTATGCCTGTTGTGTCAGGAACCCCCCGCGACGGACAACACAGTAA 150: GCACCTGCGGATACTGTGGTTGCCCTGAAAGACGGAGGATGCCTCCTATG 200: TCATTTAGAACTATCGAACGTACGGTTCTTAAATGGTCGTAGTTAGATAG Base counts: A: 73, C: 61, G: 59, T: 57, Total: 250 Insert base T at position 196 Change base at position 19 from A to G Erase base T at position 204 Change base at position 223 from G to T Change base at position 183 from G to C Change base at position 21 from A to T Insert base T at position 40 Change base at position 20 from T to G Insert base T at position 69 Change base at position 19 from G to T After 10 mutations: 0: TCCCCTCCAGTTAGCAGAATGTTTAGCTAACGATACCTCGTACACGGAGG 50: GGTGGGGCCAACTCTTAACTACAATTACGAGAACCATCCTTCGAAAGCAA 100: AAAAGTTTATGCCTGTTGTGTCAGGAACCCCCCGCGACGGACAACACAGT 150: AAGCACCTGCGGATACTGTGGTTGCCCTGAAAGACCGAGGATGCCTCCTT 200: ATGTCATTAGAACTATCGAACGTACTGTTCTTAAATGGTCGTAGTTAGAT 250: AG Base counts: A: 71, C: 62, G: 58, T: 61, Total: 252
Julia
<lang julia>dnabases = ['A', 'C', 'G', 'T'] randpos(seq) = rand(1:length(seq)) # 1 mutateat(pos, seq) = (s = seq[:]; s[pos] = rand(dnabases); s) # 2-1 deleteat(pos, seq) = [seq[1:pos-1]; seq[pos+1:end]] # 2-2 randinsertat(pos, seq) = [seq[1:pos]; rand(dnabases); seq[pos+1:end]] # 2-3
function weightedmutation(seq, pos, weights=[1, 1, 1], verbose=true) # Extra credit
p, r = weights ./ sum(weights), rand() f = (r <= p[1]) ? mutateat : (r < p[1] + p[2]) ? deleteat : randinsertat verbose && print("Mutate by ", f == mutateat ? "swap" : f == deleteat ? "delete" : "insert") return f(pos, seq)
end
function weightedrandomsitemutation(seq, weights=[1, 1, 1], verbose=true)
position = randpos(seq) newseq = weightedmutation(seq, position, weights, verbose) verbose && println(" at position $position") return newseq
end
randdnasequence(n) = rand(dnabases, n) # 3
function dnasequenceprettyprint(seq, colsize=50) # 4
println(length(seq), "nt DNA sequence:\n") rows = [seq[i:min(length(seq), i + colsize - 1)] for i in 1:colsize:length(seq)] for (i, r) in enumerate(rows) println(lpad(colsize * (i - 1), 5), " ", String(r)) end bases = [[c, 0] for c in dnabases] for c in seq, base in bases if c == base[1] base[2] += 1 end end println("\nNucleotide counts:\n") for base in bases println(lpad(base[1], 10), lpad(string(base[2]), 12)) end println(lpad("Other", 10), lpad(string(length(seq) - sum(x[2] for x in bases)), 12)) println(" _________________\n", lpad("Total", 10), lpad(string(length(seq)), 12))
end
function testbioseq()
sequence = randdnasequence(500) dnasequenceprettyprint(sequence) for _ in 1:10 # 5 sequence = weightedrandomsitemutation(sequence) end println("\n Mutated:"); dnasequenceprettyprint(sequence) # 6
end
testbioseq()
</lang>
- Output:
500nt DNA sequence: 0 TCACGTAACAGAGGTATAGTTGATCTTGAGCGGGCTGGCTCCCGGGTTTC 50 TAGCAAGAAAAGGGGAGGGAAGTGCGCCTGCTTTTGCCCCGGGCACCCCA 100 ATCGAAGACAGCTCCGGGGTCGCACATTTTTATGGCCACATAATGAGGGA 150 ATGCACGCATCACTCCTATCACTAACTGCGAACTCATGTGACTGTGCAAA 200 ACACCTTTAACACTGCGATGCCGTGGGGACGGGCCCCCCCAGCGGTATAG 250 CGCGCACACGCGACAGATGTTAACTCGAATGGTCGCGCCGGGGAGTGCAC 300 CCCTTGACATATACTCCAGATGCAATGCGCTATACTTTATGAACTTGCAT 350 AAGCTGCGCAGGGGGGATTGACTTATACTACATATTAACTACCGATATCG 400 ACGCAAATATTCGGGCGGTCTAAAGTGTGTCAGAACGGACATGCCGCCCA 450 GAATCACGGCTACTGAGGACAAATACGCATTCCCGGTGCTGCATTCATTC Nucleotide counts: A 127 C 133 G 131 T 109 Other 0 _________________ Total 500 Mutate by swap at position 253 Mutate by swap at position 448 Mutate by insert at position 379 Mutate by delete at position 311 Mutate by delete at position 335 Mutate by insert at position 132 Mutate by swap at position 191 Mutate by swap at position 481 Mutate by insert at position 189 Mutate by insert at position 423 Mutated: 502nt DNA sequence: 0 TCACGTAACAGAGGTATAGTTGATCTTGAGCGGGCTGGCTCCCGGGTTTC 50 TAGCAAGAAAAGGGGAGGGAAGTGCGCCTGCTTTTGCCCCGGGCACCCCA 100 ATCGAAGACAGCTCCGGGGTCGCACATTTTTACTGGCCACATAATGAGGG 150 AATGCACGCATCACTCCTATCACTAACTGCGAACTCATGATCACTGTGCA 200 AAACACCTTTAACACTGCGATGCCGTGGGGACGGGCCCCCCCAGCGGTAT 250 AGCGAGCACACGCGACAGATGTTAACTCGAATGGTCGCGCCGGGGAGTGC 300 ACCCCTTGACATTACTCCAGATGCAATGCGCTATACTTATGAACTTGCAT 350 AAGCTGCGCAGGGGGGATTGACTTATACTGACATATTAACTACCGATATC 400 GACGCAAATATTCGGGCGGTCTAGAAGTGTGTCAGAACGGACATGCCGCT 450 CAGAATCACGGCTACTGAGGACAAATACGCATTCCCGGTGCTGCATTCAT 500 TC Nucleotide counts: A 128 C 133 G 132 T 109 Other 0 _________________ Total 502
Perl
<lang perl>use strict; use warnings; use feature 'say';
my @bases = <A C G T>;
my $dna; $dna .= $bases[int rand 4] for 1..200;
my %cnt; $cnt{$_}++ for split //, $dna;
sub pretty {
my($string) = @_; my $chunk = 10; my $wrap = 5 * ($chunk+1); ($string =~ s/(.{$chunk})/$1 /gr) =~ s/(.{$wrap})/$1\n/gr;
}
sub mutate {
my($dna,$count) = @_; my $orig = $dna; substr($dna,rand length $dna,1) = $bases[int rand 4] while $count > diff($orig, $dna) =~ tr/acgt//; $dna
}
sub diff {
my($orig, $repl) = @_; for my $i (0 .. -1+length $orig) { substr($repl,$i,1, lc substr $repl,$i,1) if substr($orig,$i,1) ne substr($repl,$i,1); } $repl;
}
say "Original DNA strand:\n" . pretty($dna); say "Total bases: ". length $dna; say "$_: $cnt{$_}" for @bases;
my $mutate = mutate($dna, 10); %cnt = (); $cnt{$_}++ for split //, $mutate; say "\nMutated DNA strand:\n" . pretty diff $dna, $mutate; say "Total bases: ". length $mutate; say "$_: $cnt{$_}" for @bases; </lang>
- Output:
Original DNA strand: TGGAACATGT CCCAACGAGT TCTTCTTGCT AGCAGATTTT TTCAGTTGAT CGTCACATGC GGTAGACTAC CCAAGGTGTG ACTACTCGCA TGCCTGATCT AAATGGACAG TCGGCAGGCT AGTGCTAATT ACCGGAAGTA CGAACGAGCC ATGCTGAGCG ACTCATCATT GTGAAATCGA GCCTATCTGC ATGACCTAAT Total bases: 200 A: 52 C: 48 G: 47 T: 53 Mutated DNA strand: TGGAACATGT CCCAACGAGT cCTTCTTGCT AGCcGATTTT TTCAGTTGgT gGTCACATGC aGTAGACTAC CCgAGGTGTG ACTACTCGCA TGCCTGATCT AAATGGACAG TCGGCAGGCT AGTGCTAATT ACCGGAAGTA CGAACGAGCt ATGCaGAGCG ACTCATCgTT GTGAAATCGA GCCTATCTGC AgGACCTAAT Total bases: 200 A: 50 C: 48 G: 51 T: 51
Phix
<lang Phix>string dna = repeat(' ',200+rand(300)) for i=1 to length(dna) do dna[i] = "ACGT"[rand(4)] end for
procedure show()
sequence acgt = repeat(0,5) for i=1 to length(dna) do acgt[find(dna[i],"ACGT")] += 1 end for acgt[$] = sum(acgt) sequence s = split(trim(join_by(split(join_by(dna,1,10,""),"\n"),1,5," ")),"\n\n") for i=1 to length(s) do printf(1,"%3d: %s\n",{(i-1)*50+1,s[i]}) end for printf(1,"\nBase counts: A:%d, C:%d, G:%d, T:%d, total:%d\n",acgt)
end procedure
procedure mutate()
printf(1,"\n") for i=1 to 10 do integer p = rand(length(dna)), sdi = "SDI"[rand(3)], rep = "ACGT"[rand(4)], was = dna[p] switch sdi do case 'S':dna[p] = rep printf(1,"swapped %c at %d for %c\n",{was,p,rep}) case 'D':dna[p..p] = "" printf(1,"deleted %c at %d\n",{was,p}) case 'I':dna[p..p-1] = ""&rep printf(1,"inserted %c at %d, before %c\n",{rep,p,was}) end switch end for printf(1,"\n")
end procedure
show() mutate() show()</lang>
- Output:
1: ATAGACCGAT GTGTAGGTCT CGAACATCCC TGGGGTAGCT CAGCTTGGGG 51: GTTGACCTGT CTTGCTCCCA TGAACTGAGG GATTTGGAAA TAACGCTTAT 101: AACTGCGGGG GATTGATATG GGACATTGTT GCTGTAGGGC TTCGGCGTGC 151: TTAGAAACAA GAGAACACCA ATTTCGATAG ACCAGGTTTC GTCCCGCTAC 201: GAGTGATAGT AGCGCGTTAG GATTAATAAT CAGGGAGAGC ATTAAACATT 251: CTAAAAACTG ACATTCCCGA GGTGGAACCC GAGTTGATAA CGAGTATGCT 301: CTGAAAAATT AATTGATTGA TCCGCGACAC TATCACACCG TCTTCGCCGT 351: TGTAATGCAT GCTGGCTTAG CATCCGGATG CTCTTCTACC GATCTTAAGG 401: CGCGCACTCC CCAAGGAGCA TAGAAGCATC CCCGGCCCTC GACAGAGTCT 451: CCCAGTGTAA GTGTCTTTAT CCAAAATC Base counts: A:125, C:111, G:120, T:122, total:478 inserted C at 58, before T deleted T at 2 deleted C at 169 deleted G at 80 inserted A at 331, before T swapped C at 27 for T inserted A at 22, before A inserted T at 190, before G swapped C at 195 for C inserted A at 274, before G 1: AAGACCGATG TGTAGGTCTC GAAACATTCC TGGGGTAGCT CAGCTTGGGG 51: GTTGACCCTG TCTTGCTCCC ATGAACTGAG GATTTGGAAA TAACGCTTAT 101: AACTGCGGGG GATTGATATG GGACATTGTT GCTGTAGGGC TTCGGCGTGC 151: TTAGAAACAA GAGAACACAA TTTCGATAGA CCAGGTTTCT GTCCCGCTAC 201: GAGTGATAGT AGCGCGTTAG GATTAATAAT CAGGGAGAGC ATTAAACATT 251: CTAAAAACTG ACATTCCCGA GGTAGGAACC CGAGTTGATA ACGAGTATGC 301: TCTGAAAAAT TAATTGATTG ATCCGCGACA CTAATCACAC CGTCTTCGCC 351: GTTGTAATGC ATGCTGGCTT AGCATCCGGA TGCTCTTCTA CCGATCTTAA 401: GGCGCGCACT CCCCAAGGAG CATAGAAGCA TCCCCGGCCC TCGACAGAGT 451: CTCCCAGTGT AAGTGTCTTT ATCCAAAATC Base counts: A:128, C:110, G:119, T:123, total:480
Python
In function seq_mutate argument kinds selects between the three kinds of mutation. The characters I, D, and S are chosen from the string to give the kind of mutation to perform, so the more of that character, the more of that type of mutation performed.
Similarly parameter choice is chosen from to give the base for substitution or insertion - the more any base appears, the more likely it is to be chosen in any insertion/substitution.
<lang python>import random from collections import Counter
def basecount(dna):
return sorted(Counter(dna).items())
def seq_split(dna, n=50):
return [dna[i: i+n] for i in range(0, len(dna), n)]
def seq_pp(dna, n=50):
for i, part in enumerate(seq_split(dna, n)): print(f"{i*n:>5}: {part}") print("\n BASECOUNT:") tot = 0 for base, count in basecount(dna): print(f" {base:>3}: {count}") tot += count base, count = 'TOT', tot print(f" {base:>3}= {count}")
def seq_mutate(dna, count=1, kinds="IDSSSS", choice="ATCG" ):
mutation = [] k2txt = dict(I='Insert', D='Delete', S='Substitute') for _ in range(count): kind = random.choice(kinds) index = random.randint(0, len(dna)) if kind == 'I': # Insert dna = dna[:index] + random.choice(choice) + dna[index:] elif kind == 'D' and dna: # Delete dna = dna[:index] + dna[index+1:] elif kind == 'S' and dna: # Substitute dna = dna[:index] + random.choice(choice) + dna[index+1:] mutation.append((k2txt[kind], index)) return dna, mutation
if __name__ == '__main__':
length = 250 print("SEQUENCE:") sequence = .join(random.choices('ACGT', weights=(1, 0.8, .9, 1.1), k=length)) seq_pp(sequence) print("\n\nMUTATIONS:") mseq, m = seq_mutate(sequence, 10) for kind, index in m: print(f" {kind:>10} @{index}") print() seq_pp(mseq)</lang>
- Output:
SEQUENCE: 0: GGAAGATTAGGTCACGGGCCTCATCTTGTGCGAGATAAATAATAACACTC 50: AGCGATCATTAGAATGTATATTGTACGGGCATGTTTATCTACCATAGGTC 100: CTGTCAAAAGATGGCTAGCTGCAATTTTTTCTTCTAGATCCCGATTACTG 150: CGGTATTTTTGTATAACGTGCTAAACGGTGTGTTTTCAGGTCGGCCTGCT 200: AATCTAACGCCAGTGGACTTGGGATGGACGCCCAACAACTGAGAGCGCGG BASECOUNT: A: 64 C: 51 G: 62 T: 73 TOT= 250 MUTATIONS: Substitute @138 Substitute @72 Insert @103 Insert @129 Insert @124 Delete @52 Delete @202 Substitute @200 Insert @158 Delete @32 0: GGAAGATTAGGTCACGGGCCTCATCTTGTGCGGATAAATAATAACACTCA 50: GGATCATTAGAATGTATATTATACGGGCATGTTTATCTACCATAGGTCCT 100: GCTCAAAAGATGGCTAGCTGCAGATTTTGTTCTTCTAGAGCCCGATTACT 150: GCGGTATGTTTTGTATAACGTGCTAAACGGTGTGTTTTCAGGTCGGCCTG 200: CTATCTAACGCCAGTGGACTTGGGATGGACGCCCAACAACTGAGAGCGCG 250: G BASECOUNT: A: 63 C: 51 G: 65 T: 72 TOT= 251
Racket
<lang racket>#lang racket
(define current-S-weight (make-parameter 1)) (define current-D-weight (make-parameter 1)) (define current-I-weight (make-parameter 1))
(define bases '(#\A #\C #\G #\T))
(define (fold-sequence seq kons #:finalise (finalise (λ x (apply values x))) . k0s)
(define (recur seq . ks) (if (null? seq) (call-with-values (λ () (apply finalise ks)) (λ vs (apply values vs))) (call-with-values (λ () (apply kons (car seq) ks)) (λ ks+ (apply recur (cdr seq) ks+))))) (apply recur (if (string? seq) (string->list (regexp-replace* #px"[^ACGT]" seq "")) seq) k0s))
(define (sequence->pretty-printed-string seq)
(define (fmt idx cs-rev) (format "~a: ~a" (~a idx #:width 3 #:align 'right) (list->string (reverse cs-rev)))) (fold-sequence seq (λ (b n start-idx lns-rev cs-rev) (if (zero? (modulo n 50)) (values (+ n 1) n (if (pair? cs-rev) (cons (fmt start-idx cs-rev) lns-rev) lns-rev) (cons b null)) (values (+ n 1) start-idx lns-rev (cons b cs-rev)))) 0 0 null null #:finalise (λ (n idx lns-rev cs-rev) (string-join (reverse (if (null? cs-rev) lns-rev (cons (fmt idx cs-rev) lns-rev))) "\n"))))
(define (count-bases b as cs gs ts n)
(values (+ as (if (eq? b #\A) 1 0)) (+ cs (if (eq? b #\C) 1 0)) (+ gs (if (eq? b #\T) 1 0)) (+ ts (if (eq? b #\G) 1 0)) (add1 n)))
(define (report-sequence s)
(define-values (as cs gs ts n) (fold-sequence s count-bases 0 0 0 0 0)) (printf "SEQUENCE:~%~a~%" (sequence->pretty-printed-string s)) (printf "BASE COUNT:~%-----------~%~a~%" (string-join (map (λ (c n) (format " ~a :~a" c (~a #:width 4 #:align 'right n))) bases (list as ts cs gs)) "\n")) (printf "TOTAL: ~a~%" n))
(define (make-random-sequence-string l)
(list->string (for/list ((_ l)) (list-ref bases (random 4)))))
(define (weighted-random-call weights-and-functions . args)
(let loop ((r (random)) (wfs weights-and-functions)) (if (<= r (car wfs)) (apply (cadr wfs) args) (loop (- r (car wfs)) (cddr wfs)))))
(define (mutate-S s)
(let ((r (random (string-length s))) (i (string (list-ref bases (random 4))))) (printf "Mutate at ~a -> ~a~%" r i) (string-append (substring s 0 r) i (substring s (add1 r)))))
(define (mutate-D s)
(let ((r (random (string-length s)))) (printf "Delete at ~a~%" r) (string-append (substring s 0 r) (substring s (add1 r)))))
(define (mutate-I s)
(let ((r (random (string-length s))) (i (string (list-ref bases (random 4))))) (printf "Insert at ~a -> ~a~%" r i) (string-append (substring s 0 r) i (substring s r))))
(define (mutate s)
(define W (+ (current-S-weight) (current-D-weight) (current-I-weight))) (weighted-random-call (list (/ (current-S-weight) W) mutate-S (/ (current-D-weight) W) mutate-D (/ (current-I-weight) W) mutate-I) s))
(module+
main (define initial-sequence (make-random-sequence-string 200)) (report-sequence initial-sequence) (newline) (define s+ (for/fold ((s initial-sequence)) ((_ 10)) (mutate s))) (newline) (report-sequence s+) (newline) (define s+d (parameterize ((current-D-weight 5)) (for/fold ((s initial-sequence)) ((_ 10)) (mutate s)))) (newline) (report-sequence s+d))</lang>
- Output:
SEQUENCE: 0: AATGCTTGTGTCACCGGCCACCCCCTATGTACATACACGGTTCAGCTGGC 50: CTGGCAGAGCGTCTGACAACGGAGCTCTAAAACTTCCTTCTGACTAGAGC 100: TCATCCTCTCCCGGCTGAACTCCGTGGCTTACAATCAGACCCTACCCGTG 150: TCCAACTACGCAGTCGCCCGATGAGGCTGGGCTCACCACGTTGGGCGAGG BASE COUNT: ----------- A : 41 C : 49 G : 67 T : 43 TOTAL: 200 Insert at 176 -> T Insert at 99 -> G Delete at 154 Mutate at 188 -> A Insert at 14 -> A Mutate at 62 -> G Delete at 110 Mutate at 79 -> T Insert at 109 -> A Insert at 46 -> C SEQUENCE: 0: AATGCTTGTGTCACACGGCCACCCCCTATGTACATACACGGTTCAGCCTG 50: GCCTGGCAGAGCGGCTGACAACGGAGCTCTTAAACTTCCTTCTGACTAGA 100: GGCTCATCCTACCCCGGCTGAACTCCGTGGCTTACAATCAGACCCTACCC 150: GTGTCCACTACGCAGTCGCCCGATGAGGTCTGGGCTCACCACGTTGGGCG 200: AGG BASE COUNT: ----------- A : 41 C : 51 G : 68 T : 43 TOTAL: 203 Delete at 79 Delete at 105 Delete at 63 Delete at 37 Delete at 166 Mutate at 42 -> C Delete at 142 Delete at 23 Delete at 91 Insert at 4 -> T SEQUENCE: 0: AATGTCTTGTGTCACCGGCCACCCCTATGTACATACAGGTTCCGCTGGCC 50: TGGCAGAGCGTCGACAACGGAGCTCTAAACTTCCTTCTGACTGAGCTCAT 100: CCCTCCCGGCTGAACTCCGTGGCTTACAATCAGACCCTACCGTGTCCAAC 150: TACGCAGTCGCCCGTGAGGCTGGGCTCACCACGTTGGGCGAGG BASE COUNT: ----------- A : 37 C : 49 G : 65 T : 42 TOTAL: 193
Raku
(formerly Perl 6)
Unweighted mutations at this point. The mutated DNA strand has a "diff" operation performed on it which (in this specific case) renders the mutated base in lower case so it may be picked out more easily.
<lang perl6>my @bases = <A C G T>;
- The DNA strand
my $dna = @bases.roll(200).join;
- The Task
put "ORIGINAL DNA STRAND:"; put pretty $dna; put "\nTotal bases: ", +my $bases = $dna.comb.Bag; put $bases.sort( ~*.key ).join: "\n";
put "\nMUTATED DNA STRAND:"; my $mutate = $dna.&mutate(10); put pretty diff $dna, $mutate; put "\nTotal bases: ", +my $mutated = $mutate.comb.Bag; put $mutated.sort( ~*.key ).join: "\n";
- Helper subs
sub pretty ($string, $wrap = 50) {
$string.comb($wrap).map( { sprintf "%8d: %s", $++ * $wrap, $_ } ).join: "\n"
}
sub mutate ($dna is copy, $count = 1) {
$dna.substr-rw((^$dna.chars).roll, 1) = @bases.roll for ^$count; $dna
}
sub diff ($orig, $repl) {
($orig.comb Z $repl.comb).map( -> ($o, $r) { $o eq $r ?? $o !! $r.lc }).join
}</lang>
- Output:
ORIGINAL DNA STRAND: 0: ACGGATAGACCGTTCCTGCAAGCTGGTACGGTTCGAATGTTGACCTTATT 50: CTCCGCAGCGCACTACCCGATCGGGTAACGTACTCTATATGATGCCTATT 100: TTCCCCGCCTTACATCGGCGATCAATGTTCTTTTACGCTAACTAGGCGCA 150: CGTCGTGCCTTACCGAGAGCCAGTTCGAAATCGTGCTGAAAATATCTGGA Total bases: 200 A 45 C 55 G 45 T 55 MUTATED DNA STRAND: 0: ACGGATAGcCCGTTCCTGCAAGCTGGTACGGTTCGAATGTTGACCTTATT 50: CTCCGCAGCGCACTACCCGATCGGGTcACtcACTCTATATGAcGCCTAaT 100: TTCCCCGCCTTACATCGGCGATCAATGTTCTTTTACGCTAACTAGGCGCA 150: CGTCGTGCCTTACCcAGAGCCAGTTCGAAATCGTGCTGAAAATATCTGGA Total bases: 200 A 44 C 60 G 43 T 53
Swift
<lang swift>let bases: [Character] = ["A", "C", "G", "T"]
enum Action: CaseIterable {
case swap, delete, insert
}
@discardableResult func mutate(dna: inout String) -> Action {
guard let i = dna.indices.shuffled().first(where: { $0 != dna.endIndex }) else { fatalError() }
let action = Action.allCases.randomElement()!
switch action { case .swap: dna.replaceSubrange(i..<i, with: [bases.randomElement()!]) case .delete: dna.remove(at: i) case .insert: dna.insert(bases.randomElement()!, at: i) }
return action
}
var d = ""
for _ in 0..<200 {
d.append(bases.randomElement()!)
}
func printSeq(_ dna: String) {
for startI in stride(from: 0, to: dna.count, by: 50) { print("\(startI): \(dna.dropFirst(startI).prefix(50))") }
print() print("Size: \(dna.count)") print()
let counts = dna.reduce(into: [:], { $0[$1, default: 0] += 1 })
for (char, count) in counts.sorted(by: { $0.key < $1.key }) { print("\(char): \(count)") }
}
printSeq(d)
print()
for _ in 0..<20 {
mutate(dna: &d)
}
printSeq(d)</lang>
- Output:
0: CCCTGTTACCCTTAAGTCACAAATCATGATAAGCAGCCTTCGAGCACTTC 50: GTGTCAAGCCTGATTCGAGCGCGCCGGTCATCCTCCGATAGAGCACGGGG 100: ACGCCCGCACTACCCCACTGGCGCTTGGTCGCTGAATAGGGCGCCCTTGG 150: TGGTGGATGGTCTTAAGCTGTCGCAAATCTAGCCCCGACCAAGAGAAGGC Size: 200 A: 43 C: 62 G: 54 T: 41 0: CCCTGTTACCCTTAAGTCACAAATCATGTATAAAGCAGCCTTCGAGCACT 50: TCGTGTCAAGCCTGATTCGAGCGCGCTAGGGCATCCTCCGTATAAGAGCA 100: CCGGGGACGCCCGCACTTACCACCACTGGCGCTTGGTCGCGAATAGGGGC 150: GCCCTTTGGTGGTGGATTGGTCTTAAGTGTCGCAAATCTAGCCCCCGACC 200: AAGAGAAGGC Size: 210 A: 47 C: 62 G: 56 T: 45
zkl
<lang zkl>var [const] bases="ACGT", lbases=bases.toLower(); dna:=(190).pump(Data().howza(3),(0).random.fp(0,4),bases.get); // bucket of bytes
foreach s,m in (T("Original","Mutated").zip(T(True,False))){
println("\n",s," DNA strand:"); dnaPP(dna); println("Base Counts: ", dna.len()," : ", dna.text.toUpper().counts() // ("A",5, "C",10, ...) .pump(String,Void.Read,"%s(%d) ".fmt)); if(m) mutate(dna,10,True);
}
fcn mutate(dna,count=1,verbose=False){
if(verbose) println("Mutating:"); do(count){ n,rb := (0).random(dna.len()), lbases[(0).random(4)]; switch( (0).random(3) ){
case(0){ if(verbose) println("Change[%d] '%s' to '%s'".fmt(n,dna.charAt(n),rb)); dna[n]=rb; } case(1){ if(verbose) println("Delete[%d] '%s'".fmt(n,dna.charAt(n))); dna.del(n); } else{ if(verbose) println("Insert[%d] '%s'".fmt(n,rb)); dna.insert(n,rb); }
} }
}
fcn dnaPP(dna,N=50){
[0..*,50].zipWith(fcn(n,bases){ println("%6d: %s".fmt(n,bases.concat())) }, dna.walker().walk.fp(50)).pump(Void); // .pump forces the iterator
}</lang>
- Output:
Original DNA strand: 0: AACGACAGGTTCTCGATGCGTGTCTTCACACATGTGGAGTCGCCAAGGAT 50: TGTTGATCAATGCGTAAACGTCTCCACGGGATACACGGGCAGCTTGCGGT 100: GACGAGTGCGGACCACCAAAAAAGGTGGGATCCACGTTGAGGAGCCTCAC 150: TACCTACGGCGTGATATGGCGGCAGGAGTCAAAAACTGCT Base Counts: 190 : A(49) C(46) G(57) T(38) Mutating: Insert[123] 'c' Insert[69] 't' Delete[5] 'C' Delete[147] 'T' Change[167] 'G' to 't' Change[153] 'C' to 'c' Insert[151] 't' Insert[156] 't' Delete[150] 'T' Change[102] 'C' to 'g' Mutated DNA strand: 0: AACGAAGGTTCTCGATGCGTGTCTTCACACATGTGGAGTCGCCAAGGATT 50: GTTGATCAATGCGTAAACtGTCTCCACGGGATACACGGGCAGCTTGCGGT 100: GAgGAGTGCGGACCACCAAAAAAcGGTGGGATCCACGTTGAGGAGCCCAC 150: tACcTtACGGCGTGATATtGCGGCAGGAGTCAAAAACTGCT Base Counts: 191 : A(49) C(45) G(57) T(40)