Bioinformatics/base count: Difference between revisions
m →{{header|REXX}}: (internal only) changed the DNA sequence to lowercase; output DNA sequence is still uppercase. |
m →{{header|REXX}}: added whitespace. |
||
Line 969: | Line 969: | ||
dna= space(dna, 0); upper dna /*elide blanks from DNA; uppercase it. */ |
dna= space(dna, 0); upper dna /*elide blanks from DNA; uppercase it. */ |
||
say '────────length of the DNA string: ' length(dna) |
say '────────length of the DNA string: ' length(dna) |
||
@.= |
@.= 0 /*initialize the count for all bases. */ |
||
w= 1 /*the maximum width of a base count. */ |
w= 1 /*the maximum width of a base count. */ |
||
$= /*a placeholder for the names of bases.*/ |
$= /*a placeholder for the names of bases.*/ |
||
do j=1 for length(dna) /*traipse through the DNA string. */ |
do j=1 for length(dna) /*traipse through the DNA string. */ |
||
_= substr(dna, j, 1) /*obtain a base name from the DNA str. */ |
_= substr(dna, j, 1) /*obtain a base name from the DNA str. */ |
||
if pos(_, $)==0 then $=$ || |
if pos(_, $)==0 then $= $ || _ /*if not found before, add it to list. */ |
||
@._= @._ + 1 /*bump the count of this base. */ |
@._= @._ + 1 /*bump the count of this base. */ |
||
w= max(w, length(@._) ) /*compute the maximum width number. */ |
w= max(w, length(@._) ) /*compute the maximum width number. */ |
||
Line 983: | Line 983: | ||
say ' base ' z " has a basecount of: " right(@.z, w) |
say ' base ' z " has a basecount of: " right(@.z, w) |
||
@.tot= @.tot + @.z /*add to a grand total to verify count.*/ |
@.tot= @.tot + @.z /*add to a grand total to verify count.*/ |
||
⚫ | |||
end /*k*/ |
|||
say |
|||
⚫ | |||
say '────────total for all basecounts:' right(@.tot, w+1)</lang> |
say '────────total for all basecounts:' right(@.tot, w+1)</lang> |
||
{{out|output|text= when using the default input:}} |
{{out|output|text= when using the default input:}} |
Revision as of 18:37, 6 July 2020
You are encouraged to solve this task according to the task description, using any language you may know.
Given this string representing ordered DNA bases:
CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATG CTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTG AGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGAT GGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT CGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGG TCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA TTTAATTTTTCTATATAGCGATCTGTATTTAAGCAATTCATTTAGGTTAT CGCCGCGATGCTCGGTTCGGACCGCCAAGCATCTGGCTCCACTGCTAGTG TCCTAAATTTGAATGGCAAACACAAATAAGATTTAGCAATTCGTGTAGAC GACCGGGGACTTGCATGATGGGAGCAGCTTTGTTAAACTACGAACGTAAT
- "Pretty print" the sequence followed by a summary of the counts of each of the bases, (A, C, G, and T) in the sequence as well as the total count of bases in the string.
C
Reads genome from a file, determines string length to ensure optimal formatting <lang C>
- include<string.h>
- include<stdlib.h>
- include<stdio.h>
typedef struct genome{
char* strand; int length; struct genome* next;
}genome;
genome* genomeData; int totalLength = 0, Adenine = 0, Cytosine = 0, Guanine = 0, Thymine = 0;
int numDigits(int num){
int len = 1;
while(num>10){ num = num/10; len++; }
return len;
}
void buildGenome(char str[100]){
int len = strlen(str),i; genome *genomeIterator, *newGenome;
totalLength += len;
for(i=0;i<len;i++){ switch(str[i]){ case 'A': Adenine++; break; case 'T': Thymine++; break; case 'C': Cytosine++; break; case 'G': Guanine++; break; }; }
if(genomeData==NULL){ genomeData = (genome*)malloc(sizeof(genome));
genomeData->strand = (char*)malloc(len*sizeof(char)); strcpy(genomeData->strand,str); genomeData->length = len;
genomeData->next = NULL; }
else{ genomeIterator = genomeData;
while(genomeIterator->next!=NULL) genomeIterator = genomeIterator->next;
newGenome = (genome*)malloc(sizeof(genome));
newGenome->strand = (char*)malloc(len*sizeof(char)); strcpy(newGenome->strand,str); newGenome->length = len;
newGenome->next = NULL; genomeIterator->next = newGenome; }
}
void printGenome(){
genome* genomeIterator = genomeData;
int width = numDigits(totalLength), len = 0;
printf("Sequence:\n");
while(genomeIterator!=NULL){ printf("\n%*d%3s%3s",width+1,len,":",genomeIterator->strand); len += genomeIterator->length;
genomeIterator = genomeIterator->next; }
printf("\n\nBase Count\n----------\n\n");
printf("%3c%3s%*d\n",'A',":",width+1,Adenine); printf("%3c%3s%*d\n",'T',":",width+1,Thymine); printf("%3c%3s%*d\n",'C',":",width+1,Cytosine); printf("%3c%3s%*d\n",'G',":",width+1,Guanine); printf("\n%3s%*d\n","Total:",width+1,Adenine + Thymine + Cytosine + Guanine);
free(genomeData);
}
int main(int argc,char** argv) {
char str[100]; int counter = 0, len; if(argc!=2){ printf("Usage : %s <Gene file name>\n",argv[0]); return 0; }
FILE *fp = fopen(argv[1],"r");
while(fscanf(fp,"%s",str)!=EOF) buildGenome(str); fclose(fp);
printGenome();
return 0;
} </lang> Run and output :
abhishek_ghosh@Azure:~/doodles$ ./a.out genome.txt Sequence: 0 :CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATG 50 :CTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTG 100 :AGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGAT 150 :GGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT 200 :CGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGG 250 :TCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA 300 :TTTAATTTTTCTATATAGCGATCTGTATTTAAGCAATTCATTTAGGTTAT 350 :CGCCGCGATGCTCGGTTCGGACCGCCAAGCATCTGGCTCCACTGCTAGTG 400 :TCCTAAATTTGAATGGCAAACACAAATAAGATTTAGCAATTCGTGTAGAC 450 :GACCGGGGACTTGCATGATGGGAGCAGCTTTGTTAAACTACGAACGTAAT Base Count ---------- A : 129 T : 155 C : 97 G : 119 Total: 500
Factor
<lang factor>USING: assocs formatting grouping io kernel literals math math.statistics prettyprint qw sequences sorting ;
CONSTANT: dna $[
qw{ CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATG CTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTG AGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGAT GGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT CGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGG TCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA TTTAATTTTTCTATATAGCGATCTGTATTTAAGCAATTCATTTAGGTTAT CGCCGCGATGCTCGGTTCGGACCGCCAAGCATCTGGCTCCACTGCTAGTG TCCTAAATTTGAATGGCAAACACAAATAAGATTTAGCAATTCGTGTAGAC GACCGGGGACTTGCATGATGGGAGCAGCTTTGTTAAACTACGAACGTAAT } concat
]
- .dna ( seq n -- )
"SEQUENCE:" print [ group ] keep [ * swap " %3d: %s\n" printf ] curry each-index ;
- show-counts ( seq -- )
"BASE COUNTS:" print histogram >alist [ first ] sort-with [ [ " %c: %3d\n" printf ] assoc-each ] [ "TOTAL: " write [ second ] [ + ] map-reduce . ] bi ;
dna [ 50 .dna nl ] [ show-counts ] bi</lang>
- Output:
SEQUENCE: 0: CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATG 50: CTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTG 100: AGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGAT 150: GGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT 200: CGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGG 250: TCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA 300: TTTAATTTTTCTATATAGCGATCTGTATTTAAGCAATTCATTTAGGTTAT 350: CGCCGCGATGCTCGGTTCGGACCGCCAAGCATCTGGCTCCACTGCTAGTG 400: TCCTAAATTTGAATGGCAAACACAAATAAGATTTAGCAATTCGTGTAGAC 450: GACCGGGGACTTGCATGATGGGAGCAGCTTTGTTAAACTACGAACGTAAT BASE COUNTS: A: 129 C: 97 G: 119 T: 155 TOTAL: 500
Go
<lang go>package main
import (
"fmt" "sort"
)
func main() {
dna := "" + "CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATG" + "CTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTG" + "AGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGAT" + "GGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT" + "CGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGG" + "TCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA" + "TTTAATTTTTCTATATAGCGATCTGTATTTAAGCAATTCATTTAGGTTAT" + "CGCCGCGATGCTCGGTTCGGACCGCCAAGCATCTGGCTCCACTGCTAGTG" + "TCCTAAATTTGAATGGCAAACACAAATAAGATTTAGCAATTCGTGTAGAC" + "GACCGGGGACTTGCATGATGGGAGCAGCTTTGTTAAACTACGAACGTAAT"
fmt.Println("SEQUENCE:") le := len(dna) for i := 0; i < le; i += 50 { k := i + 50 if k > le { k = le } fmt.Printf("%5d: %s\n", i, dna[i:k]) } baseMap := make(map[byte]int) // allows for 'any' base for i := 0; i < le; i++ { baseMap[dna[i]]++ } var bases []byte for k := range baseMap { bases = append(bases, k) } sort.Slice(bases, func(i, j int) bool { // get bases into alphabetic order return bases[i] < bases[j] })
fmt.Println("\nBASE COUNT:") for _, base := range bases { fmt.Printf(" %c: %3d\n", base, baseMap[base]) } fmt.Println(" ------") fmt.Println(" Σ:", le) fmt.Println(" ======")
}</lang>
- Output:
SEQUENCE: 0: CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATG 50: CTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTG 100: AGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGAT 150: GGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT 200: CGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGG 250: TCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA 300: TTTAATTTTTCTATATAGCGATCTGTATTTAAGCAATTCATTTAGGTTAT 350: CGCCGCGATGCTCGGTTCGGACCGCCAAGCATCTGGCTCCACTGCTAGTG 400: TCCTAAATTTGAATGGCAAACACAAATAAGATTTAGCAATTCGTGTAGAC 450: GACCGGGGACTTGCATGATGGGAGCAGCTTTGTTAAACTACGAACGTAAT BASE COUNT: A: 129 C: 97 G: 119 T: 155 ------ Σ: 500 ======
Haskell
<lang haskell>import Data.List (group, sort) import Data.List.Split (chunksOf) import Text.Printf (printf, IsChar(..), PrintfArg(..), fmtChar, fmtPrecision, formatString)
data DNABase = A | C | G | T deriving (Show, Read, Eq, Ord) type DNASequence = [DNABase]
instance IsChar DNABase where
toChar = head . show fromChar = read . pure
instance PrintfArg DNABase where
formatArg x fmt = formatString (show x) (fmt { fmtChar = 's', fmtPrecision = Nothing })
test :: DNASequence test = read . pure <$> concat
[ "CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATG" , "CTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTG" , "AGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGAT" , "GGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT" , "CGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGG" , "TCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA" , "TTTAATTTTTCTATATAGCGATCTGTATTTAAGCAATTCATTTAGGTTAT" , "CGCCGCGATGCTCGGTTCGGACCGCCAAGCATCTGGCTCCACTGCTAGTG" , "TCCTAAATTTGAATGGCAAACACAAATAAGATTTAGCAATTCGTGTAGAC" , "GACCGGGGACTTGCATGATGGGAGCAGCTTTGTTAAACTACGAACGTAAT" ]
chunkedDNASequence :: DNASequence -> [(Int, [DNABase])] chunkedDNASequence = zip [50,100..] . chunksOf 50
baseCounts :: DNASequence -> [(DNABase, Int)] baseCounts = fmap ((,) . head <*> length) . group . sort
main :: IO () main = do
putStrLn "Sequence:" mapM_ (uncurry (printf "%3d: %s\n")) $ chunkedDNASequence test putStrLn "\nBase Counts:" mapM_ (uncurry (printf "%2s: %2d\n")) $ baseCounts test putStrLn (replicate 8 '-') >> printf " Σ: %d\n\n" (length test)</lang>
- Output:
Sequence: 50: CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATG 100: CTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTG 150: AGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGAT 200: GGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT 250: CGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGG 300: TCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA 350: TTTAATTTTTCTATATAGCGATCTGTATTTAAGCAATTCATTTAGGTTAT 400: CGCCGCGATGCTCGGTTCGGACCGCCAAGCATCTGGCTCCACTGCTAGTG 450: TCCTAAATTTGAATGGCAAACACAAATAAGATTTAGCAATTCGTGTAGAC 500: GACCGGGGACTTGCATGATGGGAGCAGCTTTGTTAAACTACGAACGTAAT Base Counts: A: 129 C: 97 G: 119 T: 155 -------- Σ: 500
Julia
<lang julia>const sequence = "CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATG" * "CTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTG" * "AGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGAT" * "GGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT" * "CGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGG" * "TCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA" * "TTTAATTTTTCTATATAGCGATCTGTATTTAAGCAATTCATTTAGGTTAT" * "CGCCGCGATGCTCGGTTCGGACCGCCAAGCATCTGGCTCCACTGCTAGTG" * "TCCTAAATTTGAATGGCAAACACAAATAAGATTTAGCAATTCGTGTAGAC" * "GACCGGGGACTTGCATGATGGGAGCAGCTTTGTTAAACTACGAACGTAAT"
function dnasequenceprettyprint(seq, colsize=50)
println(length(seq), "nt DNA sequence:\n") rows = [seq[i:min(length(seq), i + colsize - 1)] for i in 1:colsize:length(seq)] for (i, r) in enumerate(rows) println(lpad(colsize * (i - 1), 5), " ", r) end
end
dnasequenceprettyprint(sequence)
function printcounts(seq)
bases = [['A', 0], ['C', 0], ['G', 0], ['T', 0]] for c in seq, base in bases if c == base[1] base[2] += 1 end end println("\nNucleotide counts:\n") for base in bases println(lpad(base[1], 10), lpad(string(base[2]), 12)) end println(lpad("Other", 10), lpad(string(length(seq) - sum(x[2] for x in bases)), 12)) println(" _________________\n", lpad("Total", 10), lpad(string(length(seq)), 12))
end
printcounts(sequence)
</lang>
- Output:
500nt DNA sequence: 0 CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATG 50 CTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTG 100 AGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGAT 150 GGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT 200 CGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGG 250 TCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA 300 TTTAATTTTTCTATATAGCGATCTGTATTTAAGCAATTCATTTAGGTTAT 350 CGCCGCGATGCTCGGTTCGGACCGCCAAGCATCTGGCTCCACTGCTAGTG 400 TCCTAAATTTGAATGGCAAACACAAATAAGATTTAGCAATTCGTGTAGAC 450 GACCGGGGACTTGCATGATGGGAGCAGCTTTGTTAAACTACGAACGTAAT Nucleotide counts: A 129 C 97 G 119 T 155 Other 0 _________________ Total 500
Pascal
<lang pascal>program DNA_Base_Count; {$IFDEF FPC}
{$MODE DELPHI}//String = AnsiString
{$ELSE}
{$APPTYPE CONSOLE}
{$ENDIF} const
dna = 'CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATG' + 'CTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTG' + 'AGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGAT' + 'GGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT' + 'CGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGG' + 'TCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA' + 'TTTAATTTTTCTATATAGCGATCTGTATTTAAGCAATTCATTTAGGTTAT' + 'CGCCGCGATGCTCGGTTCGGACCGCCAAGCATCTGGCTCCACTGCTAGTG' + 'TCCTAAATTTGAATGGCAAACACAAATAAGATTTAGCAATTCGTGTAGAC' + 'GACCGGGGACTTGCATGATGGGAGCAGCTTTGTTAAACTACGAACGTAAT';
var
CntIdx : array of NativeUint; DNABases : String; SumBaseTotal : NativeInt;
procedure OutFormatBase(var DNA: String;colWidth:NativeInt); var
j: NativeInt;
Begin
j := 0; Writeln(' DNA base sequence'); While j<Length(DNA) do Begin writeln(j:5,copy(DNA,j+1,colWidth):colWidth+2); inc(j,colWidth); end; writeln;
end;
procedure Cnt(const DNA: String); var
i,p :NativeInt;
Begin
SetLength(CntIdx,Length(DNABases)); i := 1; while i <= Length(DNA) do Begin p := Pos(DNA[i],DNABases); //found new base so extend list if p = 0 then Begin DNABases := DNABases+DNA[i]; p := length(DNABases); Setlength(CntIdx,p+1); end; inc(CntIdx[p]); inc(i); end;
Writeln('Base Count'); SumBaseTotal := 0; For i := 1 to Length(DNABases) do Begin p := CntIdx[i]; inc(SumBaseTotal,p); writeln(DNABases[i]:4,p:10); end; Writeln('Total base count ',SumBaseTotal); writeln;
end;
var
TestDNA: String;
Begin
DNABases :='ACGT';// predefined TestDNA := DNA; OutFormatBase(TestDNA,50); Cnt(TestDNA);
end.</lang>
- Output:
DNA base sequence 0 CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATG 50 CTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTG 100 AGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGAT 150 GGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT 200 CGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGG 250 TCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA 300 TTTAATTTTTCTATATAGCGATCTGTATTTAAGCAATTCATTTAGGTTAT 350 CGCCGCGATGCTCGGTTCGGACCGCCAAGCATCTGGCTCCACTGCTAGTG 400 TCCTAAATTTGAATGGCAAACACAAATAAGATTTAGCAATTCGTGTAGAC 450 GACCGGGGACTTGCATGATGGGAGCAGCTTTGTTAAACTACGAACGTAAT Base Count A 129 C 97 G 119 T 155 Total base count 500
Perl
<lang perl>use strict; use warnings; use feature 'say';
my %cnt; my $total = 0;
while ($_ = ) {
chomp; printf "%4d: %s\n", $total+1, s/(.{10})/$1 /gr; $total += length; $cnt{$_}++ for split //
}
say "\nTotal bases: $total"; say "$_: " . ($cnt{$_}//0) for <A C G T>;
__DATA__ CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATG CTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTG AGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGAT GGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT CGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGG TCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA TTTAATTTTTCTATATAGCGATCTGTATTTAAGCAATTCATTTAGGTTAT CGCCGCGATGCTCGGTTCGGACCGCCAAGCATCTGGCTCCACTGCTAGTG TCCTAAATTTGAATGGCAAACACAAATAAGATTTAGCAATTCGTGTAGAC GACCGGGGACTTGCATGATGGGAGCAGCTTTGTTAAACTACGAACGTAAT</lang>
- Output:
1: CGTAAAAAAT TACAACGTCC TTTGGCTATC TCTTAAACTC CTGCTAAATG 51: CTCGTGCTTT CCAATTATGT AAGCGTTCCG AGACGGGGTG GTCGATTCTG 101: AGGACAAAGG TCAAGATGGA GCGCATCGAA CGCAATAAGG ATCATTTGAT 151: GGGACGTTTC GTCGACAAAG TCTTGTTTCG AGAGTAACGG CTACCGTCTT 201: CGATTCTGCT TATAACACTA TGTTCTTATG AAATGGATGT TCTGAGTTGG 251: TCAGTCCCAA TGTGCGGGGT TTCTTTTAGT ACGTCGGGAG TGGTATTATA 301: TTTAATTTTT CTATATAGCG ATCTGTATTT AAGCAATTCA TTTAGGTTAT 351: CGCCGCGATG CTCGGTTCGG ACCGCCAAGC ATCTGGCTCC ACTGCTAGTG 401: TCCTAAATTT GAATGGCAAA CACAAATAAG ATTTAGCAAT TCGTGTAGAC 451: GACCGGGGAC TTGCATGATG GGAGCAGCTT TGTTAAACTA CGAACGTAAT Total bases: 500 A: 129 C: 97 G: 119 T: 155
Phix
<lang Phix>constant dna = substitute(""" CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATG CTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTG AGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGAT GGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT CGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGG TCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA TTTAATTTTTCTATATAGCGATCTGTATTTAAGCAATTCATTTAGGTTAT CGCCGCGATGCTCGGTTCGGACCGCCAAGCATCTGGCTCCACTGCTAGTG TCCTAAATTTGAATGGCAAACACAAATAAGATTTAGCAATTCGTGTAGAC GACCGGGGACTTGCATGATGGGAGCAGCTTTGTTAAACTACGAACGTAAT ""","\n","") sequence acgt = repeat(0,5) for i=1 to length(dna) do
acgt[find(dna[i],"ACGT")] += 1
end for acgt[$] = sum(acgt) sequence s = split(trim(join_by(split(join_by(dna,1,10,""),"\n"),1,5," ")),"\n") for i=1 to length(s) do
printf(1,"%3d: %s\n",{(i-1)*50+1,s[i]})
end for printf(1,"\nBase counts: A:%d, C:%d, G:%d, T:%d, total:%d\n",acgt)</lang>
- Output:
1: CGTAAAAAAT TACAACGTCC TTTGGCTATC TCTTAAACTC CTGCTAAATG 51: CTCGTGCTTT CCAATTATGT AAGCGTTCCG AGACGGGGTG GTCGATTCTG 101: AGGACAAAGG TCAAGATGGA GCGCATCGAA CGCAATAAGG ATCATTTGAT 151: GGGACGTTTC GTCGACAAAG TCTTGTTTCG AGAGTAACGG CTACCGTCTT 201: CGATTCTGCT TATAACACTA TGTTCTTATG AAATGGATGT TCTGAGTTGG 251: TCAGTCCCAA TGTGCGGGGT TTCTTTTAGT ACGTCGGGAG TGGTATTATA 301: TTTAATTTTT CTATATAGCG ATCTGTATTT AAGCAATTCA TTTAGGTTAT 351: CGCCGCGATG CTCGGTTCGG ACCGCCAAGC ATCTGGCTCC ACTGCTAGTG 401: TCCTAAATTT GAATGGCAAA CACAAATAAG ATTTAGCAAT TCGTGTAGAC 451: GACCGGGGAC TTGCATGATG GGAGCAGCTT TGTTAAACTA CGAACGTAAT Base counts: A:129, C:97, G:119, T:155, total:500
Python
Procedural
<lang python>from collections import Counter
def basecount(dna):
return sorted(Counter(dna).items())
def seq_split(dna, n=50):
return [dna[i: i+n] for i in range(0, len(dna), n)]
def seq_pp(dna, n=50):
for i, part in enumerate(seq_split(dna, n)): print(f"{i*n:>5}: {part}") print("\n BASECOUNT:") tot = 0 for base, count in basecount(dna): print(f" {base:>3}: {count}") tot += count base, count = 'TOT', tot print(f" {base:>3}= {count}")
if __name__ == '__main__':
print("SEQUENCE:") sequence = \
CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATG\ CTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTG\ AGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGAT\ GGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT\ CGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGG\ TCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA\ TTTAATTTTTCTATATAGCGATCTGTATTTAAGCAATTCATTTAGGTTAT\ CGCCGCGATGCTCGGTTCGGACCGCCAAGCATCTGGCTCCACTGCTAGTG\ TCCTAAATTTGAATGGCAAACACAAATAAGATTTAGCAATTCGTGTAGAC\ GACCGGGGACTTGCATGATGGGAGCAGCTTTGTTAAACTACGAACGTAAT
seq_pp(sequence)
</lang>
- Output:
SEQUENCE: 0: CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATG 50: CTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTG 100: AGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGAT 150: GGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT 200: CGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGG 250: TCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA 300: TTTAATTTTTCTATATAGCGATCTGTATTTAAGCAATTCATTTAGGTTAT 350: CGCCGCGATGCTCGGTTCGGACCGCCAAGCATCTGGCTCCACTGCTAGTG 400: TCCTAAATTTGAATGGCAAACACAAATAAGATTTAGCAATTCGTGTAGAC 450: GACCGGGGACTTGCATGATGGGAGCAGCTTTGTTAAACTACGAACGTAAT BASECOUNT: A: 129 C: 97 G: 119 T: 155 TOT= 500
Functional
Sequence and base counts displayed in GenBank format.
<lang python>Bioinformatics – base count
from itertools import count from functools import reduce
- genBankFormatWithBaseCounts :: String -> String
def genBankFormatWithBaseCounts(sequence):
DNA Sequence displayed in a subset of the GenBank format. See example at foot of: https://www.genomatix.de/online_help/help/sequence_formats.html ks, totals = zip(*baseCounts(sequence)) ns = list(map(str, totals)) w = 2 + max(map(len, ns))
return '\n'.join([ 'DEFINITION len=' + str(sum(totals)), 'BASE COUNT ' + .join( n.rjust(w) + ' ' + k.lower() for (k, n) in zip(ks, ns) ), 'ORIGIN' ] + [ str(i).rjust(9) + ' ' + k for i, k in zip( count(1, 60), [ ' '.join(row) for row in chunksOf(6)(chunksOf(10)(sequence)) ] ) ] + ['//'])
- baseCounts :: String -> Zip [(String, Int)]
def baseCounts(baseString):
Sums for each base type in the given sequence string, with a fifth sum for any characters not drawn from {A, C, G, T}. bases = { 'A': 0, 'C': 1, 'G': 2, 'T': 3 } return zip( list(bases.keys()) + ['Other'], foldl( lambda a: compose( nthArrow(succ)(a), flip(curry(bases.get))(4) ) )((0, 0, 0, 0, 0))(baseString) )
- -------------------------- TEST --------------------------
- main :: IO ()
def main():
Base counts and sequence displayed in GenBank format print( genBankFormatWithBaseCounts(\
CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATG\ CTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTG\ AGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGAT\ GGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT\ CGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGG\ TCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA\ TTTAATTTTTCTATATAGCGATCTGTATTTAAGCAATTCATTTAGGTTAT\ CGCCGCGATGCTCGGTTCGGACCGCCAAGCATCTGGCTCCACTGCTAGTG\ TCCTAAATTTGAATGGCAAACACAAATAAGATTTAGCAATTCGTGTAGAC\ GACCGGGGACTTGCATGATGGGAGCAGCTTTGTTAAACTACGAACGTAAT)
)
- ------------------------ GENERIC -------------------------
- chunksOf :: Int -> [a] -> a
def chunksOf(n):
A series of lists of length n, subdividing the contents of xs. Where the length of xs is not evenly divible, the final list will be shorter than n. return lambda xs: reduce( lambda a, i: a + [xs[i:n + i]], range(0, len(xs), n), [] ) if 0 < n else []
- compose :: ((a -> a), ...) -> (a -> a)
def compose(*fs):
Composition, from right to left, of a series of functions. def go(f, g): def fg(x): return f(g(x)) return fg return reduce(go, fs, lambda x: x)
- curry :: ((a, b) -> c) -> a -> b -> c
def curry(f):
A curried function derived from an uncurried function. return lambda x: lambda y: f(x, y)
- flip :: (a -> b -> c) -> b -> a -> c
def flip(f):
The (curried or uncurried) function f with its arguments reversed. return lambda a: lambda b: f(b)(a)
- foldl :: (a -> b -> a) -> a -> [b] -> a
def foldl(f):
Left to right reduction of a list, using the binary operator f, and starting with an initial value a. def go(acc, xs): return reduce(lambda a, x: f(a)(x), xs, acc) return lambda acc: lambda xs: go(acc, xs)
- nthArrow :: (a -> b) -> Tuple -> Int -> Tuple
def nthArrow(f):
A simple function lifted to one which applies to a tuple, transforming only its nth value. def go(v, n): return v if n > len(v) else [ x if n != i else f(x) for i, x in enumerate(v) ] return lambda tpl: lambda n: tuple(go(tpl, n))
- succ :: Enum a => a -> a
def succ(x):
The successor of a value. For numeric types, (1 +). return 1 + x
- MAIN ---
if __name__ == '__main__':
main()</lang>
- Output:
DEFINITION len=500 BASE COUNT 129 a 97 c 119 g 155 t 0 other ORIGIN 1 CGTAAAAAAT TACAACGTCC TTTGGCTATC TCTTAAACTC CTGCTAAATG CTCGTGCTTT 61 CCAATTATGT AAGCGTTCCG AGACGGGGTG GTCGATTCTG AGGACAAAGG TCAAGATGGA 121 GCGCATCGAA CGCAATAAGG ATCATTTGAT GGGACGTTTC GTCGACAAAG TCTTGTTTCG 181 AGAGTAACGG CTACCGTCTT CGATTCTGCT TATAACACTA TGTTCTTATG AAATGGATGT 241 TCTGAGTTGG TCAGTCCCAA TGTGCGGGGT TTCTTTTAGT ACGTCGGGAG TGGTATTATA 301 TTTAATTTTT CTATATAGCG ATCTGTATTT AAGCAATTCA TTTAGGTTAT CGCCGCGATG 361 CTCGGTTCGG ACCGCCAAGC ATCTGGCTCC ACTGCTAGTG TCCTAAATTT GAATGGCAAA 421 CACAAATAAG ATTTAGCAAT TCGTGTAGAC GACCGGGGAC TTGCATGATG GGAGCAGCTT 481 TGTTAAACTA CGAACGTAAT //
Racket
<lang racket>#lang racket
(define (fold-sequence seq kons #:finalise (finalise (λ x (apply values x))) . k0s)
(define (recur seq . ks) (if (null? seq) (call-with-values (λ () (apply finalise ks)) (λ vs (apply values vs))) (call-with-values (λ () (apply kons (car seq) ks)) (λ ks+ (apply recur (cdr seq) ks+))))) (apply recur (if (string? seq) (string->list (regexp-replace* #px"[^ACGT]" seq "")) seq) k0s))
(define (sequence->pretty-printed-string seq)
(define (fmt idx cs-rev) (format "~a: ~a" (~a idx #:width 3 #:align 'right) (list->string (reverse cs-rev)))) (fold-sequence seq (λ (b n start-idx lns-rev cs-rev) (if (zero? (modulo n 50))
(values (+ n 1) n (if (pair? cs-rev) (cons (fmt start-idx cs-rev) lns-rev) lns-rev) (cons b null)) (values (+ n 1) start-idx lns-rev (cons b cs-rev))))
0 0 null null #:finalise (λ (n idx lns-rev cs-rev)
(string-join (reverse (if (null? cs-rev) lns-rev (cons (fmt idx cs-rev) lns-rev))) "\n"))))
(define (count-bases b as cs gs ts n)
(values (+ as (if (eq? b #\A) 1 0))
(+ cs (if (eq? b #\C) 1 0)) (+ gs (if (eq? b #\T) 1 0)) (+ ts (if (eq? b #\G) 1 0)) (add1 n)))
(define (bioinformatics-Base_count s)
(define-values (as cs gs ts n) (fold-sequence s count-bases 0 0 0 0 0)) (printf "SEQUENCE:~%~%~a~%~%" (sequence->pretty-printed-string s)) (printf "BASE COUNT:~%-----------~%~%~a~%~%"
(string-join (map (λ (c n) (format " ~a :~a" c (~a #:width 4 #:align 'right n))) '(A T C G) (list as ts cs gs)) "\n"))
(newline) (printf "TOTAL: ~a~%" n))
(module+
main (define the-string #<<EOS
CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATG CTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTG AGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGAT GGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT CGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGG TCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA TTTAATTTTTCTATATAGCGATCTGTATTTAAGCAATTCATTTAGGTTAT CGCCGCGATGCTCGGTTCGGACCGCCAAGCATCTGGCTCCACTGCTAGTG TCCTAAATTTGAATGGCAAACACAAATAAGATTTAGCAATTCGTGTAGAC GACCGGGGACTTGCATGATGGGAGCAGCTTTGTTAAACTACGAACGTAAT EOS )
(bioinformatics-Base_count the-string))</lang>
- Output:
SEQUENCE: 0: CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATG 50: CTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTG 100: AGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGAT 150: GGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT 200: CGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGG 250: TCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA 300: TTTAATTTTTCTATATAGCGATCTGTATTTAAGCAATTCATTTAGGTTAT 350: CGCCGCGATGCTCGGTTCGGACCGCCAAGCATCTGGCTCCACTGCTAGTG 400: TCCTAAATTTGAATGGCAAACACAAATAAGATTTAGCAATTCGTGTAGAC 450: GACCGGGGACTTGCATGATGGGAGCAGCTTTGTTAAACTACGAACGTAAT BASE COUNT: ----------- A : 129 T : 119 C : 97 G : 155 TOTAL: 500
Raku
(formerly Perl 6)
It's the Letter frequency task all over again, just simpler and dressed up in different clothes.
The specs for what "pretty print" means are sadly lacking. Ah well, just makes it easily defensible if I do anything at all.
<lang perl6>my $dna = join , lines q:to/END/;
CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATG CTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTG AGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGAT GGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT CGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGG TCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA TTTAATTTTTCTATATAGCGATCTGTATTTAAGCAATTCATTTAGGTTAT CGCCGCGATGCTCGGTTCGGACCGCCAAGCATCTGGCTCCACTGCTAGTG TCCTAAATTTGAATGGCAAACACAAATAAGATTTAGCAATTCGTGTAGAC GACCGGGGACTTGCATGATGGGAGCAGCTTTGTTAAACTACGAACGTAAT END
put pretty($dna, 80);
put "\nTotal bases: ", +my $bases = $dna.comb.Bag;
put $bases.sort(~*.key).join: "\n";
sub pretty ($string, $wrap = 50) {
$string.comb($wrap).map( { sprintf "%8d: %s", $++ * $wrap, $_ } ).join: "\n"
}</lang>
- Output:
0: CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATGCTCGTGCTTTCCAATTATGTAAGCGTTCCG 80: AGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTC 160: GTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGT 240: TCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATATTTAATTTTTCTATATAGCG 320: ATCTGTATTTAAGCAATTCATTTAGGTTATCGCCGCGATGCTCGGTTCGGACCGCCAAGCATCTGGCTCCACTGCTAGTG 400: TCCTAAATTTGAATGGCAAACACAAATAAGATTTAGCAATTCGTGTAGACGACCGGGGACTTGCATGATGGGAGCAGCTT 480: TGTTAAACTACGAACGTAAT Total bases: 500 A 129 C 97 G 119 T 155
REXX
A little extra boilerplate was added to verify correct coding of the bases in a DNA string and the alignment of the (totals) numbers. <lang rexx>/*REXX program finds the number of each base in a DNA string (along with a total). */ parse arg dna . if dna== | dna=="," then dna= 'cgtaaaaaattacaacgtcctttggctatctcttaaactcctgctaaatg' ,
'ctcgtgctttccaattatgtaagcgttccgagacggggtggtcgattctg' , 'aggacaaaggtcaagatggagcgcatcgaacgcaataaggatcatttgat' , 'gggacgtttcgtcgacaaagtcttgtttcgagagtaacggctaccgtctt' , 'cgattctgcttataacactatgttcttatgaaatggatgttctgagttgg' , 'tcagtcccaatgtgcggggtttcttttagtacgtcgggagtggtattata' , 'tttaatttttctatatagcgatctgtatttaagcaattcatttaggttat' , 'cgccgcgatgctcggttcggaccgccaagcatctggctccactgctagtg' , 'tcctaaatttgaatggcaaacacaaataagatttagcaattcgtgtagac' , 'gaccggggacttgcatgatgggagcagctttgttaaactacgaacgtaat'
dna= space(dna, 0); upper dna /*elide blanks from DNA; uppercase it. */ say '────────length of the DNA string: ' length(dna) @.= 0 /*initialize the count for all bases. */ w= 1 /*the maximum width of a base count. */ $= /*a placeholder for the names of bases.*/
do j=1 for length(dna) /*traipse through the DNA string. */ _= substr(dna, j, 1) /*obtain a base name from the DNA str. */ if pos(_, $)==0 then $= $ || _ /*if not found before, add it to list. */ @._= @._ + 1 /*bump the count of this base. */ w= max(w, length(@._) ) /*compute the maximum width number. */ end /*j*/
say
do k=0 for 255; z= d2c(k) /*traipse through all possibilities. */ if pos(z, $)==0 then iterate /*Was this base found? No, then skip. */ say ' base ' z " has a basecount of: " right(@.z, w) @.tot= @.tot + @.z /*add to a grand total to verify count.*/ end /*k*/ /*stick a fork in it, we're all done. */
say say '────────total for all basecounts:' right(@.tot, w+1)</lang>
- output when using the default input:
────────length of the DNA string: 500 base A has a basecount of: 129 base C has a basecount of: 97 base G has a basecount of: 119 base T has a basecount of: 155 ────────total for all basecounts: 500
Swift
<lang swift>import Foundation
let dna = """
CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATG CTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTG AGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGAT GGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT CGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGG TCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA TTTAATTTTTCTATATAGCGATCTGTATTTAAGCAATTCATTTAGGTTAT CGCCGCGATGCTCGGTTCGGACCGCCAAGCATCTGGCTCCACTGCTAGTG TCCTAAATTTGAATGGCAAACACAAATAAGATTTAGCAATTCGTGTAGAC GACCGGGGACTTGCATGATGGGAGCAGCTTTGTTAAACTACGAACGTAAT """
print("input:\n\(dna)\n")
let counts =
dna.replacingOccurrences(of: "\n", with: "").reduce(into: [:], { $0[$1, default: 0] += 1 })
print("Counts: \(counts)") print("Total: \(counts.values.reduce(0, +))")</lang>
- Output:
input: CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATG CTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTG AGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGAT GGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT CGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGG TCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA TTTAATTTTTCTATATAGCGATCTGTATTTAAGCAATTCATTTAGGTTAT CGCCGCGATGCTCGGTTCGGACCGCCAAGCATCTGGCTCCACTGCTAGTG TCCTAAATTTGAATGGCAAACACAAATAAGATTTAGCAATTCGTGTAGAC GACCGGGGACTTGCATGATGGGAGCAGCTTTGTTAAACTACGAACGTAAT ["C": 97, "T": 155, "G": 119, "A": 129] Total: 500
Wren
<lang ecmascript>import "/fmt" for Fmt import "/sort" for Sort import "/trait" for Stepped
var dna = "CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATG" +
"CTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTG" + "AGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGAT" + "GGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT" + "CGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGG" + "TCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA" + "TTTAATTTTTCTATATAGCGATCTGTATTTAAGCAATTCATTTAGGTTAT" + "CGCCGCGATGCTCGGTTCGGACCGCCAAGCATCTGGCTCCACTGCTAGTG" + "TCCTAAATTTGAATGGCAAACACAAATAAGATTTAGCAATTCGTGTAGAC" + "GACCGGGGACTTGCATGATGGGAGCAGCTTTGTTAAACTACGAACGTAAT"
System.print("SEQUENCE:") var le = dna.count for (i in Stepped.new(0...le, 50)) {
var k = i + 50 if (k > le) k = le System.print("%(Fmt.d(5, i)): %(dna[i...k])")
} var baseMap = {} // allows for 'any' base for (i in 0...le) {
var d = dna[i] var v = baseMap[d] baseMap[d] = !v ? 1 : v + 1
} var bases = baseMap.keys.toList Sort.quick(bases)
System.print("\nBASE COUNT:") for (base in bases) {
System.print(" %(base): %(Fmt.d(3, baseMap[base]))")
} System.print(" ------") System.print(" Σ: %(le)") System.print(" ======")</lang>
- Output:
SEQUENCE: 0: CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATG 50: CTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTG 100: AGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGAT 150: GGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT 200: CGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGG 250: TCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA 300: TTTAATTTTTCTATATAGCGATCTGTATTTAAGCAATTCATTTAGGTTAT 350: CGCCGCGATGCTCGGTTCGGACCGCCAAGCATCTGGCTCCACTGCTAGTG 400: TCCTAAATTTGAATGGCAAACACAAATAAGATTTAGCAATTCGTGTAGAC 450: GACCGGGGACTTGCATGATGGGAGCAGCTTTGTTAAACTACGAACGTAAT BASE COUNT: A: 129 C: 97 G: 119 T: 155 ------ Σ: 500 ======
zkl
<lang zkl>bases:=
- <<<"
CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATG CTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTG AGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGAT GGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT CGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGG TCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA TTTAATTTTTCTATATAGCGATCTGTATTTAAGCAATTCATTTAGGTTAT CGCCGCGATGCTCGGTTCGGACCGCCAAGCATCTGGCTCCACTGCTAGTG TCCTAAATTTGAATGGCAAACACAAATAAGATTTAGCAATTCGTGTAGAC GACCGGGGACTTGCATGATGGGAGCAGCTTTGTTAAACTACGAACGTAAT" - " \n";
- <<<
[0..*,50].zipWith(fcn(n,bases){ println("%6d: %s".fmt(n,bases.concat())) },
bases.walker().walk.fp(50)).pump(Void); // .pump forces the iterator
println("\nBase Counts: ", bases.counts().pump(String,Void.Read,"%s: %d ".fmt)); println("Total: ",bases.len());</lang>
- Output:
0: CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATG 50: CTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTG 100: AGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGAT 150: GGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT 200: CGATTCTGCTTATAACACTATGTTCTTATGAAATGGATGTTCTGAGTTGG 250: TCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA 300: TTTAATTTTTCTATATAGCGATCTGTATTTAAGCAATTCATTTAGGTTAT 350: CGCCGCGATGCTCGGTTCGGACCGCCAAGCATCTGGCTCCACTGCTAGTG 400: TCCTAAATTTGAATGGCAAACACAAATAAGATTTAGCAATTCGTGTAGAC 450: GACCGGGGACTTGCATGATGGGAGCAGCTTTGTTAAACTACGAACGTAAT Base Counts: A: 129 C: 97 G: 119 T: 155 Total: 500