FASTA format
You are encouraged to solve this task according to the task description, using any language you may know.
In bioinformatics, long character strings are often encoded in a format called FASTA.
A FASTA file can contain several strings, each identified by a name marked by a >
(greater than) character at the beginning of the line.
- Task
Write a program that reads a FASTA file such as:
>Rosetta_Example_1 THERECANBENOSPACE >Rosetta_Example_2 THERECANBESEVERAL LINESBUTTHEYALLMUST BECONCATENATED
- Output:
Rosetta_Example_1: THERECANBENOSPACE Rosetta_Example_2: THERECANBESEVERALLINESBUTTHEYALLMUSTBECONCATENATED
Note that a high-quality implementation will not hold the entire file in memory at once; real FASTA files can be multiple gigabytes in size.
Ada
This solution reads the entire file (from standard input) into a map and then prints the data stored in the map.
<lang Ada>with Ada.Text_IO, Ada.Containers.Indefinite_Ordered_Maps; use Ada.Text_IO;
procedure FASTA is
package Maps is new Ada.Containers.Indefinite_Ordered_Maps (Element_Type => String, Key_Type => String); Map: Maps.Map; -- Map holds the full file (as pairs of name and value) function Get_Value(Previous: String := "") return String is Current: Character; begin if End_Of_File then
return Previous; -- file ends
else
Get(Current); -- read first character if Current = '>' then -- ah, a new name begins return Previous; -- the string read so far is the value else -- the entire line is part of the value return Get_Value(Previous & Current & Get_Line); end if;
end if; end Get_Value; procedure Print_Pair(Position: Maps.Cursor) is begin Put_Line(Maps.Key(Position) & ": " & Maps.Element(Position)); -- Maps.Key(X) is the name and Maps.Element(X) is the value at X end Print_Pair; Skip_This: String := Get_Value; -- consumes the entire file, until the first line starting with '>'. -- the string Skip_This should be empty, but we don't verify this
begin
while not End_Of_File loop -- read the file into Map declare
Name: String := Get_Line; -- reads all characters in the line, except for the first ">" Value: String := Get_Value;
begin
Map.Insert(Key => Name, New_Item => Value); -- adds the pair (Name, Value) to Map
end; end loop; Map.Iterate(Process => Print_Pair'Access); -- print Map
end FASTA;</lang>
AutoHotkey
<lang AutoHotkey>Data = ( >Rosetta_Example_1 THERECANBENOSPACE >Rosetta_Example_2 THERECANBESEVERAL LINESBUTTHEYALLMUST BECONCATENATED )
Data := RegExReplace(RegExReplace(Data, ">\V+\K\v+", ": "), "\v+(?!>)") Gui, add, Edit, w700, % Data Gui, show return</lang>
- Output:
>Rosetta_Example_1: THERECANBENOSPACE >Rosetta_Example_2: THERECANBESEVERALLINESBUTTHEYALLMUSTBECONCATENATED
AWK
<lang AWK>
- syntax: GAWK -f FASTA_FORMAT.AWK filename
- stop processing each file when an error is encountered
{ if (FNR == 1) {
header_found = 0 if ($0 !~ /^[;>]/) { error("record is not valid") nextfile } } if ($0 ~ /^;/) { next } # comment begins with a ";" if ($0 ~ /^>/) { # header if (header_found > 0) { printf("\n") # EOL for previous sequence } printf("%s: ",substr($0,2)) header_found = 1 next } if ($0 ~ /[ \t]/) { next } # ignore records with whitespace if ($0 ~ /\*$/) { # sequence may end with an "*" if (header_found > 0) { printf("%s\n",substr($0,1,length($0)-1)) header_found = 0 next } else { error("end of sequence found but header is missing") nextfile } } if (header_found > 0) { printf("%s",$0) } else { error("header not found") nextfile }
} ENDFILE {
if (header_found > 0) { printf("\n") }
} END {
exit (errors == 0) ? 0 : 1
} function error(message) {
printf("error: FILENAME=%s, FNR=%d, %s, %s\n",FILENAME,FNR,message,$0) >"con" errors++ return
} </lang>
- Output:
Rosetta_Example_1: THERECANBENOSPACE Rosetta_Example_2: THERECANBESEVERALLINESBUTTHEYALLMUSTBECONCATENATED
C
<lang c>#define _POSIX_C_SOURCE 200809L /* getline() and ssize_t are POSIX.1-2008 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>

/*
 * Read "fasta.txt" line by line and print each record as
 *     NAME: SEQUENCE
 * where SEQUENCE is the concatenation of all lines following the ">NAME"
 * header.  Only one line is held in memory at a time.
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE if the file cannot be opened.
 */
int main(void)
{
    FILE *fp = fopen("fasta.txt", "r");
    if (fp == NULL) {
        perror("fasta.txt");
        return EXIT_FAILURE;
    }

    char *line = NULL;   /* buffer owned and grown by getline() */
    size_t len = 0;
    ssize_t read;
    int state = 0;       /* becomes 1 once the first header has been printed */

    while ((read = getline(&line, &len, fp)) != -1) {
        /* Delete the trailing line terminator ("\n" or "\r\n") */
        while (read > 0 && (line[read - 1] == '\n' || line[read - 1] == '\r'))
            line[--read] = '\0';

        /* Handle header lines */
        if (line[0] == '>') {
            if (state == 1)
                printf("\n");   /* terminate the previous record */
            printf("%s: ", line + 1);
            state = 1;
        } else {
            /* Print everything else */
            printf("%s", line);
        }
    }
    printf("\n");

    free(line);   /* free(NULL) is a no-op, so no guard is needed */
    fclose(fp);
    return EXIT_SUCCESS;
}</lang>
- Output:
Rosetta_Example_1: THERECANBENOSPACE Rosetta_Example_2: THERECANBESEVERALLINESBUTTHEYALLMUSTBECONCATENATED
C++
<lang cpp>#include <iostream>
#include <fstream>
#include <string>

// Reads the FASTA file named by argv[1] and prints one "name : sequence" line
// per record.  An empty line or a new '>' header ends the current record; a
// record whose sequence contains a space is discarded as invalid.
// Returns 0 on success, -1 on a usage or file-open error.
int main( int argc, char **argv ){
    if( argc <= 1 ){
        std::cerr << "Usage: "<<argv[0]<<" [infile]" << std::endl;
        return -1;
    }

    std::ifstream input(argv[1]);
    if(!input.good()){
        std::cerr << "Error opening '"<<argv[1]<<"'. Bailing out." << std::endl;
        return -1;
    }

    std::string line, name, content;
    // Test the stream itself rather than .good(): when the last line has no
    // trailing newline, getline() reads it but also sets eofbit, so .good()
    // would be false and that final line would be silently dropped.
    while( std::getline( input, line ) ){
        if( line.empty() || line[0] == '>' ){ // Identifier marker
            if( !name.empty() ){ // Print out what we read from the last entry
                std::cout << name << " : " << content << std::endl;
                name.clear();
            }
            if( !line.empty() ){
                name = line.substr(1);
            }
            content.clear();
        } else if( !name.empty() ){
            if( line.find(' ') != std::string::npos ){ // Invalid sequence--no spaces allowed
                name.clear();
                content.clear();
            } else {
                content += line;
            }
        }
    }
    if( !name.empty() ){ // Print out what we read from the last entry
        std::cout << name << " : " << content << std::endl;
    }

    return 0;
}</lang>
- Output:
Rosetta_Example_1 : THERECANBENOSPACE Rosetta_Example_2 : THERECANBESEVERALLINESBUTTHEYALLMUSTBECONCATENATED
C#
<lang csharp>using System; using System.Collections.Generic; using System.IO; using System.Text;
class Program {
public class FastaEntry { public string Name { get; set; } public StringBuilder Sequence { get; set; } }
/// <summary>
/// Lazily parses FASTA records from <paramref name="fastaFile"/>, yielding
/// each entry as soon as the next header (or end of input) is reached.
/// Lines starting with ';' are treated as comments and skipped; sequence
/// lines that appear before the first '>' header are ignored.
/// </summary>
/// <param name="fastaFile">Open reader positioned at the start of the FASTA text.</param>
/// <returns>One <c>FastaEntry</c> per '>' header found; nothing if there are none.</returns>
static IEnumerable<FastaEntry> ParseFasta(StreamReader fastaFile)
{
    FastaEntry f = null;
    string line;
    while ((line = fastaFile.ReadLine()) != null)
    {
        // ignore comment lines
        if (line.StartsWith(";"))
            continue;

        if (line.StartsWith(">"))
        {
            // A new header completes the previous entry, if any.
            if (f != null)
                yield return f;
            f = new FastaEntry { Name = line.Substring(1), Sequence = new StringBuilder() };
        }
        else if (f != null)
            f.Sequence.Append(line);
    }

    // Emit the last entry only if one was actually started; an input with no
    // header would otherwise yield null and crash the caller on f.Name.
    if (f != null)
        yield return f;
}
static void Main(string[] args) { try { using (var fastaFile = new StreamReader("fasta.txt")) { foreach (FastaEntry f in ParseFasta(fastaFile)) Console.WriteLine("{0}: {1}", f.Name, f.Sequence); } } catch (FileNotFoundException e) { Console.WriteLine(e); } Console.ReadLine(); }
}</lang>
Common Lisp
<lang lisp>(defun fasta (pathname)
  "Print each record of the FASTA file PATHNAME to standard output as
NAME: SEQUENCE, concatenating all sequence lines of a record.
The file is processed one line at a time."
  (with-open-file (s pathname)
    (loop for line = (read-line s nil)
          while line
          ;; Check the length first: (char line 0) signals an error on an
          ;; empty line, so blank lines fall through to PRINC and print nothing.
          do (if (and (plusp (length line))
                      (char= #\> (char line 0)))
                 (format t "~&~A: " (subseq line 1))
                 (princ line))
          finally (fresh-line))))</lang>
D
<lang d>import std.stdio, std.string;

/// Reads "fasta_format_data.fasta" line by line and prints every record as
/// "name: sequence", joining all sequence lines that follow a '>' header.
void main() {
    immutable fileName = "fasta_format_data.fasta";

    bool first = true; // true until the first header has been printed

    foreach (const line; fileName.File.byLine) {
        // A blank line carries no data, and indexing line[0] on it would be
        // a range violation.
        if (line.length == 0)
            continue;

        if (line[0] == '>') {
            if (first) {
                first = false;
            } else {
                writeln; // terminate the previous record
            }

            write(line[1 .. $].strip, ": ");
        } else {
            line.strip.write;
        }
    }

    writeln;
}</lang>
- Output:
Rosetta_Example_1: THERECANBENOSPACE Rosetta_Example_2: THERECANBESEVERALLINESBUTTHEYALLMUSTBECONCATENATED
FreeBASIC
This program sticks to the task as described in the heading and doesn't allow for any of the (apparently) obsolete practices described in the Wikipedia article : <lang freebasic>' FB 1.05.0 Win64
' Returns True when s contains neither a space (ASCII 32) nor a tab (ASCII 9),
' and False as soon as one of them is found. An empty string yields True.
Function checkNoSpaces(s As String) As Boolean
  ' The index is a signed Integer on purpose: with an unsigned counter the
  ' bound Len(s) - 1 wraps around to a huge value for an empty string and the
  ' loop would read far past the end of s.
  For i As Integer = 0 To Len(s) - 1
    If s[i] = 32 OrElse s[i] = 9 Then Return False '' check for spaces or tabs
  Next
  Return True
End Function
Open "input.fasta" For Input As # 1
Dim As String ln, seq Dim first As Boolean = True
While Not Eof(1)
Line Input #1, ln If Left(ln, 1) = ">" Then If Not first Then Print Print Mid(ln, 2); ": "; If first Then first = False ElseIf first Then Print: Print "Error : File does not begin with '>'"; Exit While Else If checkNoSpaces(ln) Then Print ln; Else Print : Print "Error : Sequence contains space(s)"; Exit While End If End If
Wend
Close #1
Print : Print Print "Press any key to quit" Sleep</lang>
- Output:
Rosetta_Example_1: THERECANBENOSPACE Rosetta_Example_2: THERECANBESEVERALLINESBUTTHEYALLMUSTBECONCATENATED
Go
<lang go>package main
import (
"bufio" "fmt" "os"
)
func main() {
f, err := os.Open("rc.fasta") if err != nil { fmt.Println(err) return } defer f.Close() s := bufio.NewScanner(f) headerFound := false for s.Scan() { line := s.Text() switch { case line == "": continue case line[0] != '>': if !headerFound { fmt.Println("missing header") return } fmt.Print(line) case headerFound: fmt.Println() fallthrough default: fmt.Printf("%s: ", line[1:]) headerFound = true } } if headerFound { fmt.Println() } if err := s.Err(); err != nil { fmt.Println(err) }
}</lang>
- Output:
Rosetta_Example_1: THERECANBENOSPACE Rosetta_Example_2: THERECANBESEVERALLINESBUTTHEYALLMUSTBECONCATENATED
J
Needs chunking to handle huge files. <lang j>require 'strings' NB. not needed for J versions greater than 6. parseFasta=: ((': ' ,~ LF&taketo) , (LF -.~ LF&takeafter));._1</lang> Example Usage <lang j> Fafile=: noun define >Rosetta_Example_1 THERECANBENOSPACE >Rosetta_Example_2 THERECANBESEVERAL LINESBUTTHEYALLMUST BECONCATENATED )
parseFasta Fafile
Rosetta_Example_1: THERECANBENOSPACE Rosetta_Example_2: THERECANBESEVERALLINESBUTTHEYALLMUSTBECONCATENATED</lang>
Java
<lang java>import java.io.*; import java.util.Scanner;
public class ReadFastaFile {
/**
 * Reads "test.fasta" line by line and prints each record as
 * "name: sequence", joining all sequence lines that follow a '>' header.
 *
 * @param args unused
 * @throws FileNotFoundException if "test.fasta" cannot be opened
 */
public static void main(String[] args) throws FileNotFoundException {

    boolean first = true; // true until the first header has been printed

    try (Scanner sc = new Scanner(new File("test.fasta"))) {
        while (sc.hasNextLine()) {
            String line = sc.nextLine().trim();

            // Blank (or whitespace-only) lines carry no data; calling
            // charAt(0) on them would throw StringIndexOutOfBoundsException.
            if (line.isEmpty())
                continue;

            if (line.charAt(0) == '>') {
                if (first)
                    first = false;
                else
                    System.out.println(); // terminate the previous record
                System.out.printf("%s: ", line.substring(1));
            } else {
                System.out.print(line);
            }
        }
    }
    System.out.println();
}
}</lang>
Rosetta_Example_1: THERECANBENOSPACE Rosetta_Example_2: THERECANBESEVERALLINESBUTTHEYALLMUSTBECONCATENATED Rosetta_Example_3: THISISFASTA
jq
The following implementation uses "foreach" and "inputs"
so that very large input files can be processed with minimal space requirements:
in each cycle, only as many lines are read as are required to compose an output line.
Notice that an additional ">" must be provided to "foreach" to ensure that the final block of lines of the input file is properly assembled.
<lang jq>
def fasta:
foreach (inputs, ">") as $line # state: [accumulator, print ] ( [null, null]; if $line[0:1] == ">" then [($line[1:] + ": "), .[0]] else [ (.[0] + $line), false] end; if .[1] then .[1] else empty end ) ;
fasta</lang>
- Output:
<lang sh>$ jq -n -R -r -f FASTA_format.jq < FASTA_format.fasta Rosetta_Example_1: THERECANBENOSPACE Rosetta_Example_2: THERECANBESEVERALLINESBUTTHEYALLMUSTBECONCATENATED</lang>
OCaml
I keep it simple by sticking to the description of the FASTA format described in the task.
The program reads and processes the input one line at a time, and directly prints out the chunk of data available. The long strings are not concatenated in memory but just examined and processed as necessary: either printed out as is in the case of part of a sequence, or formatted in the case of the name (what I call the label), and managing the new lines where needed.
<lang ocaml> (* This program reads from the standard input and writes to standard output.
* Examples of use: * $ ocaml fasta.ml < fasta_file.txt * $ ocaml fasta.ml < fasta_file.txt > my_result.txt * * The FASTA file is assumed to have a specific format, where the first line * contains a label in the form of '>blablabla', i.e. with a '>' as the first * character. *)
(* Character that introduces a label (header) line. *)
let labelstart = '>'

(* [is_label s] is true when [s] starts with [labelstart].  The length check
   comes first so that an empty line is simply "not a label" instead of
   raising Invalid_argument on [s.[0]]. *)
let is_label s = String.length s > 0 && s.[0] = labelstart

(* [get_label s] is [s] without its first character; it is meant to be
   called only on strings for which [is_label] holds. *)
let get_label s = String.sub s 1 (String.length s - 1)
let read_in channel = input_line channel |> String.trim
let print_fasta chan =
let rec doloop currlabel line = if is_label line then begin if currlabel <> "" then print_newline (); let newlabel = get_label line in print_string (newlabel ^ ": "); doloop newlabel (read_in chan) end else begin print_string line; doloop currlabel (read_in chan) end in try match read_in chan with | line when is_label line -> doloop "" line | _ -> failwith "Badly formatted FASTA file?" with End_of_file -> print_newline ()
let () =
print_fasta stdin
</lang>
- Output:
Rosetta_Example_1: THERECANBENOSPACE Rosetta_Example_2: THERECANBESEVERALLINESBUTTHEYALLMUSTBECONCATENATED
Mathematica
Mathematica has built-in support for FASTA files and strings <lang Mathematica>ImportString[">Rosetta_Example_1
THERECANBENOSPACE >Rosetta_Example_2 THERECANBESEVERAL LINESBUTTHEYALLMUST BECONCATENATED ", "FASTA"]</lang>
- Output:
{"THERECANBENOSPACE", "THERECANBESEVERALLINESBUTTHEYALLMUSTBECONCATENATED"}
Perl
<lang perl>my $fasta_example = <<'END_FASTA_EXAMPLE'; >Rosetta_Example_1 THERECANBENOSPACE >Rosetta_Example_2 THERECANBESEVERAL LINESBUTTHEYALLMUST BECONCATENATED END_FASTA_EXAMPLE
my $num_newlines = 0; while ( < $fasta_example > ) { if (/\A\>(.*)/) { print "\n" x $num_newlines, $1, ': '; } else { $num_newlines = 1; print; } }</lang>
- Output:
Rosetta_Example_1: THERECANBENOSPACE Rosetta_Example_2: THERECANBESEVERALLINESBUTTHEYALLMUSTBECONCATENATED
Perl 6
<lang perl6>grammar FASTA {
rule TOP { <entry>+ } rule entry { \> <title> <sequence> } token title { <.alnum>+ } token sequence { ( <.alnum>+ )+ % \n { make $0.join } }
}
FASTA.parse: q:to /§/; >Rosetta_Example_1 THERECANBENOSPACE >Rosetta_Example_2 THERECANBESEVERAL LINESBUTTHEYALLMUST BECONCATENATED §
for $/<entry>[] {
say ~.<title>, " : ", .<sequence>.made;
}</lang>
- Output:
Rosetta_Example_1 : THERECANBENOSPACE Rosetta_Example_2 : THERECANBESEVERALLINESBUTTHEYALLMUSTBECONCATENATED
PowerShell
When working with a real file, the content of the $file variable would be: Get-Content -Path .\FASTA_file.txt -ReadCount 1000. The best -ReadCount value for large files cannot be known in advance; it is likely to lie between 1,000 and 10,000, depending on the length of the file and the length of the records in it. Experimentation is the only way to find the optimum value.
<lang PowerShell> $file = @' >Rosetta_Example_1 THERECANBENOSPACE >Rosetta_Example_2 THERECANBESEVERAL LINESBUTTHEYALLMUST BECONCATENATED '@
$lines = $file.Replace("`n","~").Split(">").ForEach({$_.TrimEnd("~").Split("`n",2,[StringSplitOptions]::RemoveEmptyEntries)})
$output = New-Object -TypeName PSObject
foreach ($line in $lines) {
$name, $value = $line.Split("~",2).ForEach({$_.Replace("~","")})
$output | Add-Member -MemberType NoteProperty -Name $name -Value $value
}
$output | Format-List </lang>
- Output:
Rosetta_Example_1 : THERECANBENOSPACE Rosetta_Example_2 : THERECANBESEVERALLINESBUTTHEYALLMUSTBECONCATENATED
Version 3.0 Or Less
<lang PowerShell> $file = @' >Rosetta_Example_1 THERECANBENOSPACE >Rosetta_Example_2 THERECANBESEVERAL LINESBUTTHEYALLMUST BECONCATENATED '@
$lines = $file.Replace("`n","~").Split(">") | ForEach-Object {$_.TrimEnd("~").Split("`n",2,[StringSplitOptions]::RemoveEmptyEntries)}
$output = New-Object -TypeName PSObject
foreach ($line in $lines) {
$name, $value = $line.Split("~",2) | ForEach-Object {$_.Replace("~","")}
$output | Add-Member -MemberType NoteProperty -Name $name -Value $value
}
$output | Format-List </lang>
- Output:
Rosetta_Example_1 : THERECANBENOSPACE Rosetta_Example_2 : THERECANBESEVERALLINESBUTTHEYALLMUSTBECONCATENATED
Python
I use a string to mimic an input file. If it was an input file, then the file is read line-by-line and I use a generator expression yielding key, value pairs as soon as they are read, keeping the minimum in memory. <lang python>import io
# In-memory stand-in for a FASTA file: two records, the second one spread
# over several lines that must be concatenated.
FASTA = '''\
>Rosetta_Example_1
THERECANBENOSPACE
>Rosetta_Example_2
THERECANBESEVERAL
LINESBUTTHEYALLMUST
BECONCATENATED'''

# File-like wrapper so the parser can iterate over the text line by line.
infile = io.StringIO(FASTA)
def fasta_parse(infile):
    """Lazily yield ``(name, sequence)`` pairs from a FASTA source.

    ``infile`` is any iterable of text lines (an open file, ``io.StringIO``,
    a list of strings).  A line starting with ``>`` begins a new record whose
    name is the first whitespace-delimited word after the ``>`` (an empty
    string if the header is bare).  All following lines, stripped of trailing
    whitespace, are concatenated into the sequence.  Lines that appear before
    the first header are ignored.  Only the current record is kept in memory.
    """
    key = None    # None means "no header seen yet"; '' is a valid (empty) name
    chunks = []   # pieces of the current sequence, joined once per record
    for line in infile:
        if line.startswith('>'):
            if key is not None:
                yield key, ''.join(chunks)
            words = line[1:].split()
            # A bare '>' has no words; fall back to an empty name instead of
            # raising IndexError.
            key = words[0] if words else ''
            chunks = []
        elif key is not None:
            chunks.append(line.rstrip())
    if key is not None:
        yield key, ''.join(chunks)
print('\n'.join('%s: %s' % keyval for keyval in fasta_parse(infile)))</lang>
- Output:
Rosetta_Example_1: THERECANBENOSPACE Rosetta_Example_2: THERECANBESEVERALLINESBUTTHEYALLMUSTBECONCATENATED
Racket
<lang racket>
- lang racket
(let loop ([m #t])
(when m (when (regexp-try-match #rx"^>" (current-input-port)) (unless (eq? #t m) (newline)) (printf "~a: " (read-line))) (loop (regexp-match #rx"\n" (current-input-port) 0 #f (current-output-port)))))
(newline) </lang>
REXX
version 1
This REXX version correctly processes the examples shown. <lang rexx>/*REXX pgm reads a (bioinformational) FASTA file and displays contents.*/ parse arg iFID _ . /*iFID = input file to be read.*/ if iFID== then iFID='FASTA.IN' /*Not specified? Use the default*/ $=; name= /*default values (so far). */
do while lines(iFID)\==0 /*process the FASTA file contents*/ x=strip(linein(iFID), 'T') /*read a line (record) from file,*/ /* and strip trailing blanks.*/ if left(x,1)=='>' then do if $\== then say name':' $ name=substr(x,2) $= end else $=$||x end /*j*/
if $\== then say name':' $
/*stick a fork in it, we're done.*/</lang>
- Output:
when using the default input file
Rosetta_Example_1: THERECANBENOSPACE Rosetta_Example_2: THERECANBESEVERALLINESBUTTHEYALLMUSTBECONCATENATED
version 2
This REXX version handles (see the talk page):
- blank lines
- sequences that end in an asterisk [*]
- sequences that contain blanks, tabs, and other whitespace
- sequence names that are identified with a semicolon [;]
<lang rexx>/*REXX pgm reads a (bioinformational) FASTA file and displays contents.*/ parse arg iFID _ . /*iFID = input file to be read.*/ if iFID== then iFID='FASTA.IN' /*Not specified? Use the default*/ $=; name= /*default values (so far). */
do while lines(iFID)\==0 /*process the FASTA file contents*/ x=strip(linein(iFID), 'T') /*read a line (record) from file,*/ /* and strip trailing blanks.*/ if x== then iterate /*ignore blank lines. */ if left(x,1)==';' then do if name== then name=substr(x,2) say x iterate end if left(x,1)=='>' then do if $\== then say name':' $ name=substr(x,2) $= end else $=space($||translate(x,,'*'),0) end /*j*/
if $\== then say name':' $
/*stick a fork in it, we're done.*/</lang>
input The FASTA2.IN file is shown below:
;LCBO - Prolactin precursor - Bovine ; a sample sequence in FASTA format MDSKGSSQKGSRLLLLLVVSNLLLCQGVVSTPVCPNGPGNCQVSLRDLFDRAVMVSHYIHDLSS EMFNEFDKRYAQGKGFITMALNSCHTSSLPTPEDKEQAQQTHHEVLMSLILGLLRSWNDPLYHL VTEVRGMKGAPDAILSRAIEIEEENKRLLEGMEMIFGQVIPGAKETEPYPVWSGLPSLQTKDED ARYSAFYNLLHCLRRDSSKIDTYLKLLNCRIIYNNNC* >MCHU - Calmodulin - Human, rabbit, bovine, rat, and chicken ADQLTEEQIAEFKEAFSLFDKDGDGTITTKELGTVMRSLGQNPTEAELQDMINEVDADGNGTID FPEFLTMMARKMKDTDSEEEIREAFRVFDKDGNGYISAAELRHVMTNLGEKLTDEEVDEMIREA DIDGDGQVNYEEFVQMMTAK* >gi|5524211|gb|AAD44166.1| cytochrome b [Elephas maximus maximus] LCLYTHIGRNIYYGSYLYSETWNTGIMLLLITMATAFMGYVLPWGQMSFWGATVITNLFSAIPYIGTNLV EWIWGGFSVDKATLNRFFAFHFILPFTMVALAGVHLTFLHETGSNNPLGLTSDSDKIPFHPYYTIKDFLG LLILILLLLLLALLSPDMLGDPDNHMPADPLNTPLHIKPEWYFLFAYAILRSVPNKLGGVLALFLSIVIL GLMPFLHTSKHRSMMLRPLSQALFWTLTMDLLTLTWIGSQPVEYPYTIIGQMASILYFSIILAFLPIAGX IENY
- Output:
when the FASTA2.IN input file is used
;LCBO - Prolactin precursor - Bovine ; a sample sequence in FASTA format LCBO - Prolactin precursor - Bovine: MDSKGSSQKGSRLLLLLVVSNLLLCQGVVSTPVCPNGPGNCQVSLRDLFDRAVMVSHYIHDLSSEMFNEFDKRYAQGKGFITMALNSCHTSSLPTPEDKEQAQQTHHEVLMSLILGLLRSWNDPLYHLVTEVRGMKGAPDAILSRAIEIEEENKRLLEGMEMIFGQVIPGAKETEPYPVWSGLPSLQTKDEDARYSAFYNLLHCLRRDSSKIDTYLKLLNCRIIYNNNC MCHU - Calmodulin - Human, rabbit, bovine, rat, and chicken: ADQLTEEQIAEFKEAFSLFDKDGDGTITTKELGTVMRSLGQNPTEAELQDMINEVDADGNGTIDFPEFLTMMARKMKDTDSEEEIREAFRVFDKDGNGYISAAELRHVMTNLGEKLTDEEVDEMIREADIDGDGQVNYEEFVQMMTAK gi|5524211|gb|AAD44166.1| cytochrome b [Elephas maximus maximus]: LCLYTHIGRNIYYGSYLYSETWNTGIMLLLITMATAFMGYVLPWGQMSFWGATVITNLFSAIPYIGTNLVEWIWGGFSVDKATLNRFFAFHFILPFTMVALAGVHLTFLHETGSNNPLGLTSDSDKIPFHPYYTIKDFLGLLILILLLLLLALLSPDMLGDPDNHMPADPLNTPLHIKPEWYFLFAYAILRSVPNKLGGVLALFLSIVILGLMPFLHTSKHRSMMLRPLSQALFWTLTMDLLTLTWIGSQPVEYPYTIIGQMASILYFSIILAFLPIAGXIENY
Ruby
<lang ruby>def fasta_format(strings)
out, text = [], "" strings.split("\n").each do |line| if line[0] == '>' out << text unless text.empty? text = line[1..-1] + ": " else text << line end end out << text unless text.empty?
end
data = <<'EOS' >Rosetta_Example_1 THERECANBENOSPACE >Rosetta_Example_2 THERECANBESEVERAL LINESBUTTHEYALLMUST BECONCATENATED EOS
puts fasta_format(data)</lang>
- Output:
Rosetta_Example_1: THERECANBENOSPACE Rosetta_Example_2: THERECANBESEVERALLINESBUTTHEYALLMUSTBECONCATENATED
Run BASIC
<lang runbasic>a$ = ">Rosetta_Example_1 THERECANBENOSPACE >Rosetta_Example_2 THERECANBESEVERAL LINESBUTTHEYALLMUST BECONCATENATED"
i = 1 while i <= len(a$)
if mid$(a$,i,17) = ">Rosetta_Example_" then print print mid$(a$,i,18);": "; i = i + 17 else if asc(mid$(a$,i,1)) > 20 then print mid$(a$,i,1); end if i = i + 1
wend</lang>
- Output:
>Rosetta_Example_1: THERECANBENOSPACE >Rosetta_Example_2: THERECANBESEVERALLINESBUTTHEYALLMUSTBECONCATENATED
Rust
This example is implemented using an iterator to reduce memory requirements and encourage code reuse.
<lang rust> use std::env; use std::io::{BufReader, Lines}; use std::io::prelude::*; use std::fs::File;
fn main() {
let args: Vec<String> = env::args().collect(); let f = File::open(&args[1]).unwrap(); for line in FastaIter::new(f) { println!("{}", line); }
}
struct FastaIter<T> {
buffer_lines: Lines<BufReader<T>>, current_name: Option<String>, current_sequence: String
}
impl<T: Read> FastaIter<T> {
fn new(file: T) -> FastaIter<T> { FastaIter { buffer_lines: BufReader::new(file).lines(), current_name: None, current_sequence: String::new() } }
}
impl<T: Read> Iterator for FastaIter<T> {
type Item = String;
fn next(&mut self) -> Option<String> { while let Some(l) = self.buffer_lines.next() { let line = l.unwrap(); if line.starts_with(">") { if self.current_name.is_some() { let mut res = String::new(); res.push_str(self.current_name.as_ref().unwrap()); res.push_str(": "); res.push_str(&self.current_sequence); self.current_name = Some(String::from(&line[1..])); self.current_sequence.clear(); return Some(res); } else { self.current_name = Some(String::from(&line[1..])); self.current_sequence.clear(); } continue; } self.current_sequence.push_str(line.trim()); } if self.current_name.is_some() { let mut res = String::new(); res.push_str(self.current_name.as_ref().unwrap()); res.push_str(": "); res.push_str(&self.current_sequence); self.current_name = None; self.current_sequence.clear(); self.current_sequence.shrink_to_fit(); return Some(res); } None }
} </lang>
- Output:
Rosetta_Example_1: THERECANBENOSPACE Rosetta_Example_2: THERECANBESEVERALLINESBUTTHEYALLMUSTBECONCATENATED
Scheme
<lang scheme> (import (scheme base)
(scheme file) (scheme write))
;; Prints every record of "fasta.txt" as "name: sequence", joining all the
;; sequence lines that follow a ">name" header line.
(with-input-from-file ; reads text from named file, one line at a time
  "fasta.txt"
  (lambda ()
    (do ((first-line? #t #f)
         (line (read-line) (read-line)))
      ((eof-object? line) (newline))
      (cond ((and (> (string-length line) 0) ; string-ref on an empty line is an error
                  (char=? #\> (string-ref line 0))) ; found a name
             (unless first-line? ; no newline on first name
               (newline))
             (display (string-copy line 1))
             (display ": "))
            (else ; display the string directly
              (display line))))))
</lang>
- Output:
Rosetta_Example_1: THERECANBENOSPACE Rosetta_Example_2: THERECANBESEVERALLINESBUTTHEYALLMUSTBECONCATENATED
Sidef
<lang ruby>func fasta_format(strings) {
var out = [] var text = for line in (strings.lines) { if (line.begins_with('>')) { text.len && (out << text) text = line.substr(1)+': ' } else { text += line } } text.len && (out << text) return out
}
fasta_format(DATA.slurp).each { .say }
__DATA__ >Rosetta_Example_1 THERECANBENOSPACE >Rosetta_Example_2 THERECANBESEVERAL LINESBUTTHEYALLMUST BECONCATENATED</lang>
- Output:
Rosetta_Example_1: THERECANBENOSPACE Rosetta_Example_2: THERECANBESEVERALLINESBUTTHEYALLMUSTBECONCATENATED
Tcl
<lang tcl>proc fastaReader {filename} {
set f [open $filename] set sep "" while {[gets $f line] >= 0} {
if {[string match >* $line]} { puts -nonewline "$sep[string range $line 1 end]: " set sep "\n" } else { puts -nonewline $line }
} puts "" close $f
}
fastaReader ./rosettacode.fas</lang>
- Output:
Rosetta_Example_1: THERECANBENOSPACE Rosetta_Example_2: THERECANBESEVERALLINESBUTTHEYALLMUSTBECONCATENATED
zkl
<lang zkl>fcn fasta(data){ // a lazy cruise through a FASTA file
fcn(w){ // one string at a time, -->False garbage at front of file line:=w.next().strip(); if(line[0]==">") w.pump(line[1,*]+": ",'wrap(l){ if(l[0]==">") { w.push(l); Void.Stop } else l.strip() }) }.fp(data.walker()) : Utils.Helpers.wap(_);
}</lang>
- This assumes that white space at front or end of string is extraneous (excepting ">" lines).
- Lazy, works for objects that support iterating over lines (ie most).
- The fasta function returns an iterator that wraps a function which itself takes an iterator. The inner iterator (a Walker) reads lines, holds state, and pushes a line back when it turns out to be the start of the next string. Each call of the function consumes exactly one string through that iterator. The wrapping iterator (wap) traps the exception raised when the function runs off the end of the data, and provides the API needed by foreach (etc).
FASTA file: <lang zkl>foreach l in (fasta(File("fasta.txt"))) { println(l) }</lang> FASTA data blob: <lang zkl>data:=Data(0,String,
">Rosetta_Example_1\nTHERECANBENOSPACE\n" ">Rosetta_Example_2\nTHERECANBESEVERAL\nLINESBUTTHEYALLMUST\n" "BECONCATENATED");
foreach l in (fasta(data)) { println(l) }</lang>
- Output:
Rosetta_Example_1: THERECANBENOSPACE Rosetta_Example_2: THERECANBESEVERALLINESBUTTHEYALLMUSTBECONCATENATED