Suffix tree

From Rosetta Code
Suffix tree is a draft programming task. It is not yet considered ready to be promoted as a complete task, for reasons that should be found in its talk page.

A suffix tree is a data structure commonly used in string algorithms.

Given a string S of length n, its suffix tree is a tree T such that:

  • T has exactly n leaves numbered from 1 to n.
  • Except for the root, every internal node has at least two children.
  • Each edge of T is labelled with a non-empty substring of S.
  • No two edges starting out of a node can have string labels beginning with the same character.
  • The string obtained by concatenating all the string labels found on the path from the root to leaf i spells out suffix S[i..n], for i from 1 to n.

Such a tree does not exist for all strings. To ensure existence, a character that is not found in S must be appended at its end. The character '$' is traditionally used for this purpose.

For this task, build and display the suffix tree of the string "banana$". Displaying the tree can be done using the code from the visualize a tree task, but any other convenient method is accepted.

There are several ways to implement the tree data structure, for instance how edges should be labelled. Latitude is given in this matter, but notice that a simple way to do it is to label each node with the label of the edge leading to it.

The computation time for an efficient algorithm should be , but such an algorithm might be difficult to implement. An easier, algorithm is accepted.

Go[edit]

Vis function from Visualize_a_tree#Unicode.

package main
 
import "fmt"
 
func main() {
vis(buildTree("banana$"))
}
 
type tree []node
 
type node struct {
sub string // a substring of the input string
ch []int // list of child nodes
}
 
func buildTree(s string) tree {
t := tree{node{}} // root node
for i := range s {
t = t.addSuffix(s[i:])
}
return t
}
 
func (t tree) addSuffix(suf string) tree {
n := 0
for i := 0; i < len(suf); {
b := suf[i]
ch := t[n].ch
var x2, n2 int
for ; ; x2++ {
if x2 == len(ch) {
// no matching child, remainder of suf becomes new node.
n2 = len(t)
t = append(t, node{sub: suf[i:]})
t[n].ch = append(t[n].ch, n2)
return t
}
n2 = ch[x2]
if t[n2].sub[0] == b {
break
}
}
// find prefix of remaining suffix in common with child
sub2 := t[n2].sub
j := 0
for ; j < len(sub2); j++ {
if suf[i+j] != sub2[j] {
// split n2
n3 := n2
// new node for the part in common
n2 = len(t)
t = append(t, node{sub2[:j], []int{n3}})
t[n3].sub = sub2[j:] // old node loses the part in common
t[n].ch[x2] = n2
break // continue down the tree
}
}
i += j // advance past part in common
n = n2 // continue down the tree
}
return t
}
 
func vis(t tree) {
if len(t) == 0 {
fmt.Println("<empty>")
return
}
var f func(int, string)
f = func(n int, pre string) {
children := t[n].ch
if len(children) == 0 {
fmt.Println("╴", t[n].sub)
return
}
fmt.Println("┐", t[n].sub)
last := len(children) - 1
for _, ch := range children[:last] {
fmt.Print(pre, "├─")
f(ch, pre+"│ ")
}
fmt.Print(pre, "└─")
f(children[last], pre+" ")
}
f(0, "")
}
Output:
┐ 
├─╴ banana$
├─┐ a
│ ├─┐ na
│ │ ├─╴ na$
│ │ └─╴ $
│ └─╴ $
├─┐ na
│ ├─╴ na$
│ └─╴ $
└─╴ $

J[edit]

Implementation:

classify=: {.@> </. ]
 
build_tree=:3 :0
tree=. ,:_;_;''
if. 0=#y do. tree return.end.
if. 1=#y do. tree,(#;y);0;y return.end.
for_box.classify y do.
char=. {.>{.>box
subtree=. }.build_tree }.each>box
ndx=.I.0=1&{::"1 subtree
n=.#tree
if. 1=#ndx do.
counts=. 1 + 0&{::"1 subtree
parents=. (n-1) (+*]&*) 1&{::"1 subtree
edges=. (ndx}~ <@(char,ndx&{::)) 2&{"1 subtree
tree=. tree, counts;"0 1 parents;"0 edges
else.
tree=. tree,(__;0;,char),(1;n;0) + ::]&.>"1 subtree
end.
end.
)
 
suffix_tree=:3 :0
assert. -.({:e.}:)y
tree=. B=:|:build_tree <\. y
((1+#y)-each {.tree),}.tree
)

Task example:

   suffix_tree 'banana$'
┌──┬───────┬─┬──┬───┬─┬─┬──┬───┬─┬─┐
__1__246_357
├──┼───────┼─┼──┼───┼─┼─┼──┼───┼─┼─┤
_0023320770
├──┼───────┼─┼──┼───┼─┼─┼──┼───┼─┼─┤
│ │banana$│a│na│na$│$│$│na│na$│$│$│
└──┴───────┴─┴──┴───┴─┴─┴──┴───┴─┴─┘

The first row is the leaf number (_ for internal nodes).

The second row is parent index (_ for root node).

The third row is the edge's substring (empty for root node).

Visualizing, using showtree and prefixing the substring leading to each leaf with the leaf number (in brackets):

fmttree=: ;@(1&{) showtree~ {: (,~ }.`('[','] ',~":)@.(_>|))each {.
 
fmttree suffix_tree 'banana$'
┌─ [1] banana$
│ ┌─ [2] na$
│ ┌─ na ────┴─ [4] $
────┼─ a ─────────┴─ [6] $
│ ┌─ [3] na$
├─ na ────────┴─ [5] $
└─ [7] $
 

Perl[edit]

Translation of: Perl 6
use strict;
use warnings;
use Data::Dumper;
 
sub classify {
my $h = {};
for (@_) { push @{$h->{substr($_,0,1)}}, $_ }
return $h;
}
sub suffixes {
my $str = shift;
map { substr $str, $_ } 0 .. length($str) - 1;
}
sub suffix_tree {
return +{} if @_ == 0;
return +{ $_[0] => +{} } if @_ == 1;
my $h = {};
my $classif = classify @_;
for my $key (keys %$classif) {
my $subtree = suffix_tree(
map { substr $_, 1 } @{$classif->{$key}}
);
my @subkeys = keys %$subtree;
if (@subkeys == 1) {
my ($subkey) = @subkeys;
$h->{"$key$subkey"} = $subtree->{$subkey};
} else { $h->{$key} = $subtree }
}
return $h;
}
print +Dumper suffix_tree suffixes 'banana$';
Output:
$VAR1 = {
          '$' => {},
          'a' => {
                   '$' => {},
                   'na' => {
                             'na$' => {},
                             '$' => {}
                           }
                 },
          'banana$' => {},
          'na' => {
                    'na$' => {},
                    '$' => {}
                  }
        };

Perl 6[edit]

Works with: rakudo version 2017-01

Here is quite a naive algorithm, probably .

multi suffix-tree(Str $str) { suffix-tree flat map &flip, [\~] $str.flip.comb }
multi suffix-tree(@a) {
hash
@a == 0 ?? () !!
@a == 1 ?? @a[0] => [] !!
gather for @a.classify(*.substr(0, 1)) {
my $subtree = suffix-tree(grep *.chars, map *.substr(1), .value[]);
if $subtree == 1 {
my $pair = $subtree.pick;
take .key ~ $pair.key => $pair.value;
} else {
take .key => $subtree;
}
}
}

Displaying the tree is done with the code from visualize a tree:

my $tree = root => suffix-tree 'banana$';
.say for visualize-tree $tree, *.key, *.value.list;
Output:
root
├─$
├─a
│ ├─$
│ └─na
│   ├─$
│   └─na$
├─na
│ ├─$
│ └─na$
└─banana$

Racket[edit]

See Suffix trees with Ukkonen’s algorithm by Danny Yoo for more information on how to use suffix trees in Racket.

#lang racket
(require (planet dyoo/suffixtree))
(define tree (make-tree))
(tree-add! tree (string->label "banana$"))
 
(define (show-node nd dpth)
(define children (node-children nd))
(printf "~a~a ~a~%" (match dpth
[(regexp #px"(.*) $" (list _ d)) (string-append d "`")]
[else else]) (if (null? children) "--" "-+") (label->string (node-up-label nd)))
(let l ((children children))
(match children
((list) (void))
((list c) (show-node c (string-append dpth " ")))
((list c ct ...) (show-node c (string-append dpth " |")) (l ct)))))
 
(show-node (tree-root tree) "")
Output:
-+ 
 |-- $
 |-+ a
 | |-- $
 | `-+ na
 |   |-- $
 |   `-- na$
 |-+ na
 | |-- $
 | `-- na$
 `-- banana$

Sidef[edit]

Translation of: Perl 6
func suffix_tree(Str t) {
suffix_tree(^t.len -> map { t.substr(_) })
}
 
func suffix_tree(a {.len == 1}) {
Hash(a[0] => nil)
}
 
func suffix_tree(Arr a) {
var h = Hash()
for k,v in (a.group_by { .char(0) }) {
var subtree = suffix_tree(v.map { .substr(1) })
var subkeys = subtree.keys
if (subkeys.len == 1) {
var subk = subkeys[0]
h{k + subk} = subtree{subk}
}
else {
h{k} = subtree
}
}
return h
}
 
say suffix_tree('banana$')
Output:
Hash(
    "$" => nil,
    "a" => Hash(
        "$" => nil,
        "na" => Hash(
            "$" => nil,
            "na$" => nil
        )
    ),
    "banana$" => nil,
    "na" => Hash(
        "$" => nil,
        "na$" => nil
    )
)