Bioinformatics/Sequence mutation: Difference between revisions

From Rosetta Code
Content added Content deleted
(New draft task with python example.)
 
(Added Go)
Line 15: Line 15:
* Give more information on the individual mutations applied.
* Give more information on the individual mutations applied.
* Allow mutations to be weighted and/or chosen.
* Allow mutations to be weighted and/or chosen.

=={{header|Go}}==
<lang go>package main

import (
"fmt"
"math/rand"
"sort"
"time"
)

const bases = "ACGT"

// 'w' contains the weights out of 300 for each
// of swap, delete or insert in that order.
func mutate(dna string, w [3]int) string {
le := len(dna)
// get a random position in the dna to mutate
p := rand.Intn(le)
// get a random number between 0 and 299 inclusive
r := rand.Intn(300)
bytes := []byte(dna)
switch {
case r < w[0]: // swap
base := bases[rand.Intn(4)]
fmt.Printf(" Change @%3d %q to %q\n", p, bytes[p], base)
bytes[p] = base
case r < w[0]+w[1]: // delete
fmt.Printf(" Delete @%3d %q\n", p, bytes[p])
copy(bytes[p:], bytes[p+1:])
bytes = bytes[0 : le-1]
default: // insert
base := bases[rand.Intn(4)]
bytes = append(bytes, 0)
copy(bytes[p+1:], bytes[p:])
fmt.Printf(" Insert @%3d %q\n", p, base)
bytes[p] = base
}
return string(bytes)
}

// Generate a random dna sequence of given length.
func generate(le int) string {
bytes := make([]byte, le)
for i := 0; i < le; i++ {
bytes[i] = bases[rand.Intn(4)]
}
return string(bytes)
}

// Pretty print dna and stats.
func prettyPrint(dna string, rowLen int) {
fmt.Println("SEQUENCE:")
le := len(dna)
for i := 0; i < le; i += rowLen {
k := i + rowLen
if k > le {
k = le
}
fmt.Printf("%5d: %s\n", i, dna[i:k])
}
baseMap := make(map[byte]int) // allows for 'any' base
for i := 0; i < le; i++ {
baseMap[dna[i]]++
}
var bases []byte
for k := range baseMap {
bases = append(bases, k)
}
sort.Slice(bases, func(i, j int) bool { // get bases into alphabetic order
return bases[i] < bases[j]
})

fmt.Println("\nBASE COUNT:")
for _, base := range bases {
fmt.Printf(" %c: %3d\n", base, baseMap[base])
}
fmt.Println(" ------")
fmt.Println(" Σ:", le)
fmt.Println(" ======\n")
}

// Express weights as a string.
func wstring(w [3]int) string {
return fmt.Sprintf(" Change: %d\n Delete: %d\n Insert: %d\n", w[0], w[1], w[2])
}

func main() {
rand.Seed(time.Now().UnixNano())
dna := generate(250)
prettyPrint(dna, 50)
muts := 10
w := [3]int{100, 100, 100} // use e.g. {0, 300, 0} to choose only deletions
fmt.Printf("WEIGHTS (ex 300):\n%s\n", wstring(w))
fmt.Printf("MUTATIONS (%d):\n", muts)
for i := 0; i < muts; i++ {
dna = mutate(dna, w)
}
fmt.Println()
prettyPrint(dna, 50)
}</lang>

{{out}}
Sample run:
<pre>
SEQUENCE:
0: AATCCAGAAGTTGCGGGAACCGTCGAATAGTGTTCATTAAGTGTCCCGCG
50: GAGTAGCCTCGTAATATAGAATGACCGGGCTTCCCAGCTAGACTTGTCCG
100: CCACGTTTGTGTAGGGCGCAGCGAGACTGCTCTTGATACTCGTTATGTTC
150: CTCACCGGATTATTGAATAGAGTCGAGGGGCTGACGTGACTGAACATTGC
200: CTCCTTTGCGACTAATCTTTCCTTCAATGAACAGGCGCTACCCGTCATCG

BASE COUNT:
A: 56
C: 63
G: 64
T: 67
------
Σ: 250
======

WEIGHTS (ex 300):
Change: 100
Delete: 100
Insert: 100

MUTATIONS (10):
Change @195 'A' to 'C'
Insert @ 95 'G'
Change @137 'T' to 'C'
Delete @207 'T'
Insert @148 'C'
Insert @113 'A'
Change @ 45 'C' to 'T'
Delete @ 93 'T'
Insert @ 51 'C'
Delete @248 'A'

SEQUENCE:
0: AATCCAGAAGTTGCGGGAACCGTCGAATAGTGTTCATTAAGTGTCTCGCG
50: GCAGTAGCCTCGTAATATAGAATGACCGGGCTTCCCAGCTAGACTGGTCC
100: GCCACGTTTGTGTAAGGGCGCAGCGAGACTGCTCTTGACACTCGTTATGC
150: TTCCTCACCGGATTATTGAATAGAGTCGAGGGGCTGACGTGACTGAACCT
200: TGCCTCCTTGCGACTAATCTTTCCTTCAATGAACAGGCGCTACCCGTCTC
250: G

BASE COUNT:
A: 55
C: 66
G: 65
T: 65
------
Σ: 251
======
</pre>


=={{header|Python}}==
=={{header|Python}}==

Revision as of 11:52, 26 November 2019

Bioinformatics/Sequence mutation is a draft programming task. It is not yet considered ready to be promoted as a complete task, for reasons that should be found in its talk page.
Task

Given a string of characters A, C, G, and T representing a DNA sequence write a routine to mutate the sequence, (string) by:

  1. Choosing a random base position in the sequence.
  2. Mutate the sequence by doing one of either:
    1. Swap the base at that position by changing it to one of A, C, G, or T. (which has a chance of swapping the base for the same base)
    2. Delete the chosen base at the position.
    3. Insert another base randomly chosen from A,C, G, or T into the sequence at that position.
  3. Randomly generate a test DNA sequence of at least 200 bases
  4. "Pretty print" the sequence and a count of its size, and the count of each base in the sequence
  5. Mutate the sequence ten times.
  6. "Pretty print" the sequence after all mutations, and a count of its size, and the count of each base in the sequence.
Extra credit
  • Give more information on the individual mutations applied.
  • Allow mutations to be weighted and/or chosen.

Go

<lang go>package main

import (

   "fmt"
   "math/rand"
   "sort"
   "time"

)

const bases = "ACGT"

// 'w' contains the weights out of 300 for each // of swap, delete or insert in that order. func mutate(dna string, w [3]int) string {

   le := len(dna)
   // get a random position in the dna to mutate
   p := rand.Intn(le)
   // get a random number between 0 and 299 inclusive
   r := rand.Intn(300)
   bytes := []byte(dna)
   switch {
   case r < w[0]: // swap
       base := bases[rand.Intn(4)]
       fmt.Printf("  Change @%3d %q to %q\n", p, bytes[p], base)
       bytes[p] = base
   case r < w[0]+w[1]: // delete
       fmt.Printf("  Delete @%3d %q\n", p, bytes[p])
       copy(bytes[p:], bytes[p+1:])
       bytes = bytes[0 : le-1]
   default: // insert
       base := bases[rand.Intn(4)]
       bytes = append(bytes, 0)
       copy(bytes[p+1:], bytes[p:])
       fmt.Printf("  Insert @%3d %q\n", p, base)
       bytes[p] = base
   }
   return string(bytes)

}

// Generate a random dna sequence of given length. func generate(le int) string {

   bytes := make([]byte, le)
   for i := 0; i < le; i++ {
       bytes[i] = bases[rand.Intn(4)]
   }
   return string(bytes)

}

// Pretty print dna and stats. func prettyPrint(dna string, rowLen int) {

   fmt.Println("SEQUENCE:")
   le := len(dna)
   for i := 0; i < le; i += rowLen {
       k := i + rowLen
       if k > le {
           k = le
       }
       fmt.Printf("%5d: %s\n", i, dna[i:k])
   }
   baseMap := make(map[byte]int) // allows for 'any' base
   for i := 0; i < le; i++ {
       baseMap[dna[i]]++
   }
   var bases []byte
   for k := range baseMap {
       bases = append(bases, k)
   }
   sort.Slice(bases, func(i, j int) bool { // get bases into alphabetic order
       return bases[i] < bases[j]
   })
   fmt.Println("\nBASE COUNT:")
   for _, base := range bases {
       fmt.Printf("    %c: %3d\n", base, baseMap[base])
   }
   fmt.Println("    ------")
   fmt.Println("    Σ:", le)
   fmt.Println("    ======\n")

}

// Express weights as a string. func wstring(w [3]int) string {

   return fmt.Sprintf("  Change: %d\n  Delete: %d\n  Insert: %d\n", w[0], w[1], w[2])

}

func main() {

   rand.Seed(time.Now().UnixNano())
   dna := generate(250)
   prettyPrint(dna, 50)
   muts := 10
   w := [3]int{100, 100, 100} // use e.g. {0, 300, 0} to choose only deletions
   fmt.Printf("WEIGHTS (ex 300):\n%s\n", wstring(w))
   fmt.Printf("MUTATIONS (%d):\n", muts)
   for i := 0; i < muts; i++ {
       dna = mutate(dna, w)
   }
   fmt.Println()
   prettyPrint(dna, 50)

}</lang>

Output:

Sample run:

SEQUENCE:
    0: AATCCAGAAGTTGCGGGAACCGTCGAATAGTGTTCATTAAGTGTCCCGCG
   50: GAGTAGCCTCGTAATATAGAATGACCGGGCTTCCCAGCTAGACTTGTCCG
  100: CCACGTTTGTGTAGGGCGCAGCGAGACTGCTCTTGATACTCGTTATGTTC
  150: CTCACCGGATTATTGAATAGAGTCGAGGGGCTGACGTGACTGAACATTGC
  200: CTCCTTTGCGACTAATCTTTCCTTCAATGAACAGGCGCTACCCGTCATCG

BASE COUNT:
    A:  56
    C:  63
    G:  64
    T:  67
    ------
    Σ: 250
    ======

WEIGHTS (ex 300):
  Change: 100
  Delete: 100
  Insert: 100

MUTATIONS (10):
  Change @195 'A' to 'C'
  Insert @ 95 'G'
  Change @137 'T' to 'C'
  Delete @207 'T'
  Insert @148 'C'
  Insert @113 'A'
  Change @ 45 'C' to 'T'
  Delete @ 93 'T'
  Insert @ 51 'C'
  Delete @248 'A'

SEQUENCE:
    0: AATCCAGAAGTTGCGGGAACCGTCGAATAGTGTTCATTAAGTGTCTCGCG
   50: GCAGTAGCCTCGTAATATAGAATGACCGGGCTTCCCAGCTAGACTGGTCC
  100: GCCACGTTTGTGTAAGGGCGCAGCGAGACTGCTCTTGACACTCGTTATGC
  150: TTCCTCACCGGATTATTGAATAGAGTCGAGGGGCTGACGTGACTGAACCT
  200: TGCCTCCTTGCGACTAATCTTTCCTTCAATGAACAGGCGCTACCCGTCTC
  250: G

BASE COUNT:
    A:  55
    C:  66
    G:  65
    T:  65
    ------
    Σ: 251
    ======

Python

In function seq_mutate argument kinds selects between the three kinds of mutation. The characters I, D, and S are chosen from the string to give the kind of mutation to perform, so the more of that character, the more of that type of mutation performed.
Similarly parameter choice is chosen from to give the base for substitution or insertion - the more any base appears, the more likely it is to be chosen in any insertion/substitution.

<lang python>import random from collections import Counter

def basecount(dna):

   return sorted(Counter(dna).items())

def seq_split(dna, n=50):

   return [dna[i: i+n] for i in range(0, len(dna), n)]

def seq_pp(dna, n=50):

   for i, part in enumerate(seq_split(dna, n)):
       print(f"{i*n:>5}: {part}")
   print("\n  BASECOUNT:")
   tot = 0
   for base, count in basecount(dna):
       print(f"    {base:>3}: {count}")
       tot += count
   base, count = 'TOT', tot
   print(f"    {base:>3}= {count}")

def seq_mutate(dna, count=1, kinds="IDSSSS", choice="ATCG" ):

   mutation = []
   k2txt = dict(I='Insert', D='Delete', S='Substitute')
   for _ in range(count):
       kind = random.choice(kinds)
       index = random.randint(0, len(dna))
       if kind == 'I':    # Insert
           dna = dna[:index] + random.choice(choice) + dna[index:]
       elif kind == 'D' and dna:  # Delete
           dna = dna[:index] + dna[index+1:]
       elif kind == 'S' and dna:  # Substitute
           dna = dna[:index] + random.choice(choice) + dna[index+1:]
       mutation.append((k2txt[kind], index))
   return dna, mutation

if __name__ == '__main__':

   length = 250
   print("SEQUENCE:")
   sequence = .join(random.choices('ACGT', weights=(1, 0.8, .9, 1.1), k=length))
   seq_pp(sequence)
   print("\n\nMUTATIONS:")
   mseq, m = seq_mutate(sequence, 10)
   for kind, index in m:
       print(f" {kind:>10} @{index}")
   print()
   seq_pp(mseq)</lang>
Output:
SEQUENCE:
    0: GGAAGATTAGGTCACGGGCCTCATCTTGTGCGAGATAAATAATAACACTC
   50: AGCGATCATTAGAATGTATATTGTACGGGCATGTTTATCTACCATAGGTC
  100: CTGTCAAAAGATGGCTAGCTGCAATTTTTTCTTCTAGATCCCGATTACTG
  150: CGGTATTTTTGTATAACGTGCTAAACGGTGTGTTTTCAGGTCGGCCTGCT
  200: AATCTAACGCCAGTGGACTTGGGATGGACGCCCAACAACTGAGAGCGCGG

  BASECOUNT:
      A: 64
      C: 51
      G: 62
      T: 73
    TOT= 250


MUTATIONS:
 Substitute @138
 Substitute @72
     Insert @103
     Insert @129
     Insert @124
     Delete @52
     Delete @202
 Substitute @200
     Insert @158
     Delete @32

    0: GGAAGATTAGGTCACGGGCCTCATCTTGTGCGGATAAATAATAACACTCA
   50: GGATCATTAGAATGTATATTATACGGGCATGTTTATCTACCATAGGTCCT
  100: GCTCAAAAGATGGCTAGCTGCAGATTTTGTTCTTCTAGAGCCCGATTACT
  150: GCGGTATGTTTTGTATAACGTGCTAAACGGTGTGTTTTCAGGTCGGCCTG
  200: CTATCTAACGCCAGTGGACTTGGGATGGACGCCCAACAACTGAGAGCGCG
  250: G

  BASECOUNT:
      A: 63
      C: 51
      G: 65
      T: 72
    TOT= 251