Bioinformatics/Sequence mutation: Difference between revisions
(Added C implementation) |
(Added call arguments) |
||
Line 233: | Line 233: | ||
Sample run : |
Sample run : |
||
<pre> |
<pre> |
||
C:\My Projects\networks>a 500 30 15 10 5 |
|||
Original: |
Original: |
||
Revision as of 20:53, 1 December 2019
- Task
Given a string of characters A, C, G, and T representing a DNA sequence write a routine to mutate the sequence, (string) by:
- Choosing a random base position in the sequence.
- Mutate the sequence by doing one of either:
- Swap the base at that position by changing it to one of A, C, G, or T. (which has a chance of swapping the base for the same base)
- Delete the chosen base at the position.
- Insert another base randomly chosen from A,C, G, or T into the sequence at that position.
- Randomly generate a test DNA sequence of at least 200 bases
- "Pretty print" the sequence and a count of its size, and the count of each base in the sequence
- Mutate the sequence ten times.
- "Pretty print" the sequence after all mutations, and a count of its size, and the count of each base in the sequence.
- Extra credit
- Give more information on the individual mutations applied.
- Allow mutations to be weighted and/or chosen.
C
Adenine ( A ) is always swapped for Thymine ( T ) and vice versa. Similarly with Cytosine ( C ) and Guanine ( G ). <lang C>
- include<stdlib.h>
- include<stdio.h>
- include<time.h>
typedef struct genome{
char base; struct genome *next;
}genome;
typedef struct{
char mutation; int position;
}genomeChange;
typedef struct{
int adenineCount,thymineCount,cytosineCount,guanineCount;
}baseCounts;
genome *strand; baseCounts baseData; int genomeLength = 100, lineLength = 50;
int numDigits(int num){
int len = 1;
while(num>10){ num /= 10; len++; } return len;
}
void generateStrand(){
int baseChoice = rand()%4, i; genome *strandIterator, *newStrand;
baseData.adenineCount = 0; baseData.thymineCount = 0; baseData.cytosineCount = 0; baseData.guanineCount = 0; strand = (genome*)malloc(sizeof(genome)); strand->base = baseChoice==0?'A':(baseChoice==1?'T':(baseChoice==2?'C':'G')); baseChoice==0?baseData.adenineCount++:(baseChoice==1?baseData.thymineCount++:(baseChoice==2?baseData.cytosineCount++:baseData.guanineCount++)); strand->next = NULL;
strandIterator = strand;
for(i=1;i<genomeLength;i++){ baseChoice = rand()%4;
newStrand = (genome*)malloc(sizeof(genome)); newStrand->base = baseChoice==0?'A':(baseChoice==1?'T':(baseChoice==2?'C':'G')); baseChoice==0?baseData.adenineCount++:(baseChoice==1?baseData.thymineCount++:(baseChoice==2?baseData.cytosineCount++:baseData.guanineCount++)); newStrand->next = NULL;
strandIterator->next = newStrand; strandIterator = newStrand; }
}
genomeChange generateMutation(int swapWeight, int insertionWeight, int deletionWeight){
int mutationChoice = rand()%(swapWeight + insertionWeight + deletionWeight); genomeChange mutationCommand;
mutationCommand.mutation = mutationChoice<swapWeight?'S':((mutationChoice>=swapWeight && mutationChoice<swapWeight+insertionWeight)?'I':'D'); mutationCommand.position = rand()%genomeLength;
return mutationCommand;
}
void printGenome(){
int rows, width = numDigits(genomeLength), len = 0,i,j;
lineLength = (genomeLength<lineLength)?genomeLength:lineLength;
rows = genomeLength/lineLength + (genomeLength%lineLength!=0);
genome* strandIterator = strand;
printf("\n\nGenome : \n--------\n");
for(i=0;i<rows;i++){ printf("\n%*d%3s",width,len,":");
for(j=0;j<lineLength && strandIterator!=NULL;j++){ printf("%c",strandIterator->base); strandIterator = strandIterator->next; } len += lineLength; }
while(strandIterator!=NULL){ printf("%c",strandIterator->base); strandIterator = strandIterator->next; }
printf("\n\nBase Counts\n-----------");
printf("\n%*c%3s%*d",width,'A',":",width,baseData.adenineCount); printf("\n%*c%3s%*d",width,'T',":",width,baseData.thymineCount); printf("\n%*c%3s%*d",width,'C',":",width,baseData.cytosineCount); printf("\n%*c%3s%*d",width,'G',":",width,baseData.guanineCount);
printf("\n\nTotal:%*d",width,baseData.adenineCount + baseData.thymineCount + baseData.cytosineCount + baseData.guanineCount);
printf("\n");
}
void mutateStrand(int numMutations, int swapWeight, int insertionWeight, int deletionWeight){
int i,j,width,baseChoice; genomeChange newMutation; genome *strandIterator, *strandFollower, *newStrand;
for(i=0;i<numMutations;i++){ strandIterator = strand; strandFollower = strand; newMutation = generateMutation(swapWeight,insertionWeight,deletionWeight); width = numDigits(genomeLength);
for(j=0;j<newMutation.position;j++){ strandFollower = strandIterator; strandIterator = strandIterator->next; } if(newMutation.mutation=='S'){ if(strandIterator->base=='A'){ strandIterator->base='T'; printf("\nSwapping A at position : %*d with T",width,newMutation.position); } else if(strandIterator->base=='A'){ strandIterator->base='T'; printf("\nSwapping A at position : %*d with T",width,newMutation.position); } else if(strandIterator->base=='C'){ strandIterator->base='G'; printf("\nSwapping C at position : %*d with G",width,newMutation.position); } else{ strandIterator->base='C'; printf("\nSwapping G at position : %*d with C",width,newMutation.position); } }
else if(newMutation.mutation=='I'){ baseChoice = rand()%4;
newStrand = (genome*)malloc(sizeof(genome)); newStrand->base = baseChoice==0?'A':(baseChoice==1?'T':(baseChoice==2?'C':'G')); printf("\nInserting %c at position : %*d",newStrand->base,width,newMutation.position); baseChoice==0?baseData.adenineCount++:(baseChoice==1?baseData.thymineCount++:(baseChoice==2?baseData.cytosineCount++:baseData.guanineCount++)); newStrand->next = strandIterator; strandFollower->next = newStrand; genomeLength++; }
else{ strandFollower->next = strandIterator->next; strandIterator->next = NULL; printf("\nDeleting %c at position : %*d",strandIterator->base,width,newMutation.position); free(strandIterator); genomeLength--; } }
}
int main(int argc,char* argv[]) {
int numMutations = 10, swapWeight = 10, insertWeight = 10, deleteWeight = 10;
if(argc==1||argc>6){ printf("Usage : %s <Genome Length> <Optional number of mutations> <Optional Swapping weight> <Optional Insertion weight> <Optional Deletion weight>\n",argv[0]); return 0; }
switch(argc){ case 2: genomeLength = atoi(argv[1]); break; case 3: genomeLength = atoi(argv[1]); numMutations = atoi(argv[2]); break; case 4: genomeLength = atoi(argv[1]); numMutations = atoi(argv[2]); swapWeight = atoi(argv[3]); break; case 5: genomeLength = atoi(argv[1]); numMutations = atoi(argv[2]); swapWeight = atoi(argv[3]); insertWeight = atoi(argv[4]); break; case 6: genomeLength = atoi(argv[1]); numMutations = atoi(argv[2]); swapWeight = atoi(argv[3]); insertWeight = atoi(argv[4]); deleteWeight = atoi(argv[5]); break; };
srand(time(NULL)); generateStrand();
printf("\nOriginal:");
printGenome(); mutateStrand(numMutations,swapWeight,insertWeight,deleteWeight);
printf("\n\nMutated:"); printGenome();
return 0;
} </lang> Sample run :
C:\My Projects\networks>a 500 30 15 10 5 Original: Genome : -------- 0 :CGATGAGTTTCCTCCAAGGAGCAGGGCGTGACGGAAGGGAGGCTTAGGTC 50 :CGCATGCTCGTCGGCAGCCGGCTGGTGCCGTCGTAACCTTCACATTATTC 100 :TAGAATTTCGATGCACCTGATGACTCATACCCAGATGTAGGGGTACGCGA 150 :TGCAGATGCGGGCACGAGGAATTGTGGGCAAGCCGGCAGGTCTTTTGTAA 200 :GTTGTCACTAACTAAATAGAGGGATGGATGTTATAGCACACTACTGTCGA 250 :TTACGGACAGCGTCCCGATTCGTCATACGACCAGGATATATACTCGACGT 300 :CCAACAGGAGATTCACGTAGTGAACGCAGTTGACAGCCTGCTCGTATCTC 350 :CAGGGGTGGACTGCACCGTTCGTTAACTGCTGCCACATTAAACAGCTTCC 400 :CACTCCTTGACGCCAGACTCGGTACCACAGACCGTCAAGCTCCTATTTCC 450 :TTTGCAGTTAAAAAACACTATGGTGAAGGTCGGAGAGATGACCTCATCTA Base Counts ----------- A :124 T :118 C :126 G :132 Total:500 Inserting G at position : 205 Inserting G at position : 144 Inserting C at position : 171 Swapping A at position : 335 with T Inserting A at position : 101 Swapping C at position : 109 with G Swapping A at position : 306 with T Inserting G at position : 51 Swapping G at position : 1 with C Deleting G at position : 60 Swapping G at position : 66 with C Inserting C at position : 41 Inserting C at position : 425 Swapping C at position : 173 with G Inserting A at position : 319 Swapping G at position : 460 with C Deleting T at position : 61 Swapping C at position : 160 with G Inserting C at position : 251 Swapping G at position : 337 with C Inserting G at position : 43 Inserting T at position : 146 Inserting T at position : 181 Deleting G at position : 53 Deleting A at position : 464 Swapping G at position : 362 with C Swapping G at position : 190 with C Swapping C at position : 280 with G Inserting T at position : 479 Deleting C at position : 400 Mutated: Genome : -------- 0 :CCATGAGTTTCCTCCAAGGAGCAGGGCGTGACGGAAGGGAGCGGCTTAGG 50 :TCCGCATGCTCCGGCACCCGGCTGGTGCCGTCGTAACCTTCACATTATTC 100 :TAAGAATTTGGATGCACCTGATGACTCATACCCAGATGTAGGGGTTGACG 150 :CGATGCAGATGGGGGCACGAGGAGATTGTGTGGCAAGCCGCCAGGTCTTT 200 :TGTAAGTTGTGCACTAACTAAATAGAGGGATGGATGTTATAGCACACTAC 250 :TGTCCGATTACGGACAGCGTCCCGATTCGTGATACGACCAGGATATATAC 300 :TCGACGTCCTACAGGAGATTCAACGTAGTGAACGCAGTTCTCAGCCTGCT 350 :CGTATCTCCAGGCGTGGACTGCACCGTTCGTTAACTGCTGCCACATTAAA 400 :AGCTTCCCACTCCTTGACGCCAGACTCCGGTACCACAGACCGTCAAGCTC 450 :CTATTTCCTTTCCGTTAAAAAACACTATTGGTGAAGGTCGGAGAGATGAC 500 :CTCATCTA Base Counts ----------- A :126 T :121 C :130 G :136 Total:513
Go
<lang go>package main
import (
"fmt" "math/rand" "sort" "time"
)
const bases = "ACGT"
// 'w' contains the weights out of 300 for each // of swap, delete or insert in that order. func mutate(dna string, w [3]int) string {
le := len(dna) // get a random position in the dna to mutate p := rand.Intn(le) // get a random number between 0 and 299 inclusive r := rand.Intn(300) bytes := []byte(dna) switch { case r < w[0]: // swap base := bases[rand.Intn(4)] fmt.Printf(" Change @%3d %q to %q\n", p, bytes[p], base) bytes[p] = base case r < w[0]+w[1]: // delete fmt.Printf(" Delete @%3d %q\n", p, bytes[p]) copy(bytes[p:], bytes[p+1:]) bytes = bytes[0 : le-1] default: // insert base := bases[rand.Intn(4)] bytes = append(bytes, 0) copy(bytes[p+1:], bytes[p:]) fmt.Printf(" Insert @%3d %q\n", p, base) bytes[p] = base } return string(bytes)
}
// Generate a random dna sequence of given length. func generate(le int) string {
bytes := make([]byte, le) for i := 0; i < le; i++ { bytes[i] = bases[rand.Intn(4)] } return string(bytes)
}
// Pretty print dna and stats. func prettyPrint(dna string, rowLen int) {
fmt.Println("SEQUENCE:") le := len(dna) for i := 0; i < le; i += rowLen { k := i + rowLen if k > le { k = le } fmt.Printf("%5d: %s\n", i, dna[i:k]) } baseMap := make(map[byte]int) // allows for 'any' base for i := 0; i < le; i++ { baseMap[dna[i]]++ } var bases []byte for k := range baseMap { bases = append(bases, k) } sort.Slice(bases, func(i, j int) bool { // get bases into alphabetic order return bases[i] < bases[j] })
fmt.Println("\nBASE COUNT:") for _, base := range bases { fmt.Printf(" %c: %3d\n", base, baseMap[base]) } fmt.Println(" ------") fmt.Println(" Σ:", le) fmt.Println(" ======\n")
}
// Express weights as a string. func wstring(w [3]int) string {
return fmt.Sprintf(" Change: %d\n Delete: %d\n Insert: %d\n", w[0], w[1], w[2])
}
func main() {
rand.Seed(time.Now().UnixNano()) dna := generate(250) prettyPrint(dna, 50) muts := 10 w := [3]int{100, 100, 100} // use e.g. {0, 300, 0} to choose only deletions fmt.Printf("WEIGHTS (ex 300):\n%s\n", wstring(w)) fmt.Printf("MUTATIONS (%d):\n", muts) for i := 0; i < muts; i++ { dna = mutate(dna, w) } fmt.Println() prettyPrint(dna, 50)
}</lang>
- Output:
Sample run:
SEQUENCE: 0: AATCCAGAAGTTGCGGGAACCGTCGAATAGTGTTCATTAAGTGTCCCGCG 50: GAGTAGCCTCGTAATATAGAATGACCGGGCTTCCCAGCTAGACTTGTCCG 100: CCACGTTTGTGTAGGGCGCAGCGAGACTGCTCTTGATACTCGTTATGTTC 150: CTCACCGGATTATTGAATAGAGTCGAGGGGCTGACGTGACTGAACATTGC 200: CTCCTTTGCGACTAATCTTTCCTTCAATGAACAGGCGCTACCCGTCATCG BASE COUNT: A: 56 C: 63 G: 64 T: 67 ------ Σ: 250 ====== WEIGHTS (ex 300): Change: 100 Delete: 100 Insert: 100 MUTATIONS (10): Change @195 'A' to 'C' Insert @ 95 'G' Change @137 'T' to 'C' Delete @207 'T' Insert @148 'C' Insert @113 'A' Change @ 45 'C' to 'T' Delete @ 93 'T' Insert @ 51 'C' Delete @248 'A' SEQUENCE: 0: AATCCAGAAGTTGCGGGAACCGTCGAATAGTGTTCATTAAGTGTCTCGCG 50: GCAGTAGCCTCGTAATATAGAATGACCGGGCTTCCCAGCTAGACTGGTCC 100: GCCACGTTTGTGTAAGGGCGCAGCGAGACTGCTCTTGACACTCGTTATGC 150: TTCCTCACCGGATTATTGAATAGAGTCGAGGGGCTGACGTGACTGAACCT 200: TGCCTCCTTGCGACTAATCTTTCCTTCAATGAACAGGCGCTACCCGTCTC 250: G BASE COUNT: A: 55 C: 66 G: 65 T: 65 ------ Σ: 251 ======
Perl 6
Unweighted mutations at this point. The mutated DNA strand has a "diff" operation performed on it which (in this specific case) renders the mutated base in lower case so it may be picked out more easily.
<lang perl6>my @bases = <A C G T>;
- The DNA strand
my $dna = @bases.roll(200).join;
- The Task
put "ORIGINAL DNA STRAND:"; put pretty $dna; put "\nTotal bases: ", +my $bases = $dna.comb.Bag; put $bases.sort( ~*.key ).join: "\n";
put "\nMUTATED DNA STRAND:"; my $mutate = $dna.&mutate(10); put pretty diff $dna, $mutate; put "\nTotal bases: ", +my $mutated = $mutate.comb.Bag; put $mutated.sort( ~*.key ).join: "\n";
- Helper subs
sub pretty ($string, $wrap = 50) {
$string.comb($wrap).map( { sprintf "%8d: %s", $++ * $wrap, $_ } ).join: "\n"
}
sub mutate ($dna is copy, $count = 1) {
$dna.substr-rw((^$dna.chars).roll, 1) = @bases.roll for ^$count; $dna
}
sub diff ($orig, $repl) {
($orig.comb Z $repl.comb).map( -> ($o, $r) { $o eq $r ?? $o !! $r.lc }).join
}</lang>
- Output:
ORIGINAL DNA STRAND: 0: ACGGATAGACCGTTCCTGCAAGCTGGTACGGTTCGAATGTTGACCTTATT 50: CTCCGCAGCGCACTACCCGATCGGGTAACGTACTCTATATGATGCCTATT 100: TTCCCCGCCTTACATCGGCGATCAATGTTCTTTTACGCTAACTAGGCGCA 150: CGTCGTGCCTTACCGAGAGCCAGTTCGAAATCGTGCTGAAAATATCTGGA Total bases: 200 A 45 C 55 G 45 T 55 MUTATED DNA STRAND: 0: ACGGATAGcCCGTTCCTGCAAGCTGGTACGGTTCGAATGTTGACCTTATT 50: CTCCGCAGCGCACTACCCGATCGGGTcACtcACTCTATATGAcGCCTAaT 100: TTCCCCGCCTTACATCGGCGATCAATGTTCTTTTACGCTAACTAGGCGCA 150: CGTCGTGCCTTACCcAGAGCCAGTTCGAAATCGTGCTGAAAATATCTGGA Total bases: 200 A 44 C 60 G 43 T 53
Python
In function seq_mutate argument kinds selects between the three kinds of mutation. The characters I, D, and S are chosen from the string to give the kind of mutation to perform, so the more of that character, the more of that type of mutation performed.
Similarly parameter choice is chosen from to give the base for substitution or insertion - the more any base appears, the more likely it is to be chosen in any insertion/substitution.
<lang python>import random from collections import Counter
def basecount(dna):
return sorted(Counter(dna).items())
def seq_split(dna, n=50):
return [dna[i: i+n] for i in range(0, len(dna), n)]
def seq_pp(dna, n=50):
for i, part in enumerate(seq_split(dna, n)): print(f"{i*n:>5}: {part}") print("\n BASECOUNT:") tot = 0 for base, count in basecount(dna): print(f" {base:>3}: {count}") tot += count base, count = 'TOT', tot print(f" {base:>3}= {count}")
def seq_mutate(dna, count=1, kinds="IDSSSS", choice="ATCG" ):
mutation = [] k2txt = dict(I='Insert', D='Delete', S='Substitute') for _ in range(count): kind = random.choice(kinds) index = random.randint(0, len(dna)) if kind == 'I': # Insert dna = dna[:index] + random.choice(choice) + dna[index:] elif kind == 'D' and dna: # Delete dna = dna[:index] + dna[index+1:] elif kind == 'S' and dna: # Substitute dna = dna[:index] + random.choice(choice) + dna[index+1:] mutation.append((k2txt[kind], index)) return dna, mutation
if __name__ == '__main__':
length = 250 print("SEQUENCE:") sequence = .join(random.choices('ACGT', weights=(1, 0.8, .9, 1.1), k=length)) seq_pp(sequence) print("\n\nMUTATIONS:") mseq, m = seq_mutate(sequence, 10) for kind, index in m: print(f" {kind:>10} @{index}") print() seq_pp(mseq)</lang>
- Output:
SEQUENCE: 0: GGAAGATTAGGTCACGGGCCTCATCTTGTGCGAGATAAATAATAACACTC 50: AGCGATCATTAGAATGTATATTGTACGGGCATGTTTATCTACCATAGGTC 100: CTGTCAAAAGATGGCTAGCTGCAATTTTTTCTTCTAGATCCCGATTACTG 150: CGGTATTTTTGTATAACGTGCTAAACGGTGTGTTTTCAGGTCGGCCTGCT 200: AATCTAACGCCAGTGGACTTGGGATGGACGCCCAACAACTGAGAGCGCGG BASECOUNT: A: 64 C: 51 G: 62 T: 73 TOT= 250 MUTATIONS: Substitute @138 Substitute @72 Insert @103 Insert @129 Insert @124 Delete @52 Delete @202 Substitute @200 Insert @158 Delete @32 0: GGAAGATTAGGTCACGGGCCTCATCTTGTGCGGATAAATAATAACACTCA 50: GGATCATTAGAATGTATATTATACGGGCATGTTTATCTACCATAGGTCCT 100: GCTCAAAAGATGGCTAGCTGCAGATTTTGTTCTTCTAGAGCCCGATTACT 150: GCGGTATGTTTTGTATAACGTGCTAAACGGTGTGTTTTCAGGTCGGCCTG 200: CTATCTAACGCCAGTGGACTTGGGATGGACGCCCAACAACTGAGAGCGCG 250: G BASECOUNT: A: 63 C: 51 G: 65 T: 72 TOT= 251
zkl
<lang zkl>var [const] bases="ACGT", lbases=bases.toLower(); dna:=(190).pump(Data().howza(3),(0).random.fp(0,4),bases.get); // bucket of bytes
foreach s,m in (T("Original","Mutated").zip(T(True,False))){
println("\n",s," DNA strand:"); dnaPP(dna); println("Base Counts: ", dna.len()," : ", dna.text.toUpper().counts() // ("A",5, "C",10, ...) .pump(String,Void.Read,"%s(%d) ".fmt)); if(m) mutate(dna,10,True);
}
fcn mutate(dna,count=1,verbose=False){
if(verbose) println("Mutating:"); do(count){ n,rb := (0).random(dna.len()), lbases[(0).random(4)]; switch( (0).random(3) ){
case(0){ if(verbose) println("Change[%d] '%s' to '%s'".fmt(n,dna.charAt(n),rb)); dna[n]=rb; } case(1){ if(verbose) println("Delete[%d] '%s'".fmt(n,dna.charAt(n))); dna.del(n); } else{ if(verbose) println("Insert[%d] '%s'".fmt(n,rb)); dna.insert(n,rb); }
} }
}
fcn dnaPP(dna,N=50){
[0..*,50].zipWith(fcn(n,bases){ println("%6d: %s".fmt(n,bases.concat())) }, dna.walker().walk.fp(50)).pump(Void); // .pump forces the iterator
}</lang>
- Output:
Original DNA strand: 0: AACGACAGGTTCTCGATGCGTGTCTTCACACATGTGGAGTCGCCAAGGAT 50: TGTTGATCAATGCGTAAACGTCTCCACGGGATACACGGGCAGCTTGCGGT 100: GACGAGTGCGGACCACCAAAAAAGGTGGGATCCACGTTGAGGAGCCTCAC 150: TACCTACGGCGTGATATGGCGGCAGGAGTCAAAAACTGCT Base Counts: 190 : A(49) C(46) G(57) T(38) Mutating: Insert[123] 'c' Insert[69] 't' Delete[5] 'C' Delete[147] 'T' Change[167] 'G' to 't' Change[153] 'C' to 'c' Insert[151] 't' Insert[156] 't' Delete[150] 'T' Change[102] 'C' to 'g' Mutated DNA strand: 0: AACGAAGGTTCTCGATGCGTGTCTTCACACATGTGGAGTCGCCAAGGATT 50: GTTGATCAATGCGTAAACtGTCTCCACGGGATACACGGGCAGCTTGCGGT 100: GAgGAGTGCGGACCACCAAAAAAcGGTGGGATCCACGTTGAGGAGCCCAC 150: tACcTtACGGCGTGATATtGCGGCAGGAGTCAAAAACTGCT Base Counts: 191 : A(49) C(45) G(57) T(40)