Ukkonen’s suffix tree construction: Difference between revisions

From Rosetta Code
Content added Content deleted
m (→‎{{header|Go}}: Added a note in the preamble about the C code on which this entry is based not compiling as it stands.)
Line 296: Line 296:
first 100000 d.p. of Pi is: 041021944
first 100000 d.p. of Pi is: 041021944
(this took 599.770281ms)
(this took 599.770281ms)
</pre>

=={{header|Phix}}==
{{trans|Go}}
<lang Phix>-- demo/rosetta/Ukkonens_Suffix_Tree.exw
integer maxChar = 'z'
sequence children = {},
suffixLinks = {},
starts = {},
ends = {},
suffixIndices = {}
string text
atom pSplitEnd,
pRootEnd,
pLeafEnd = allocate_word(0,true)
integer root = NULL,
lastNewNode,
activeNode,
activeEdge = -1,
activeLength = 0,
remainingSuffixCount = 0,
size = -1
function newNode(integer start, atom pFinish, bool bKids=false)
children = append(children,iff(bKids?repeat(NULL,maxChar):0))
suffixLinks = append(suffixLinks,root)
starts = append(starts,start)
ends = append(ends,pFinish)
suffixIndices = append(suffixIndices,-1)
return length(children)
end function
function edgeLength(integer n)
if n == root then
return 0
end if
return peekns(ends[n]) - starts[n] + 1
end function
function walkDown(integer currNode)
integer l = edgeLength(currNode)
if activeLength >= l then
activeEdge += l
activeLength -= l
activeNode = currNode
return true
end if
return false
end function
procedure extendSuffixTree(integer pos)
poken(pLeafEnd,pos)
remainingSuffixCount += 1
lastNewNode = NULL
while remainingSuffixCount > 0 do
if activeLength == 0 then
activeEdge = pos
end if
integer ta = text[activeEdge]
object ca = children[activeNode]
integer next = iff(ca=0?NULL:ca[ta])
if next == null then
if ca=0 then
children[activeNode] = repeat(NULL,maxChar)
end if
children[activeNode][ta] = newNode(pos, pLeafEnd)
if lastNewNode != NULL then
suffixLinks[lastNewNode] = activeNode
lastNewNode = NULL
end if
else
if walkDown(next) then
continue
end if
integer tp = text[pos]
if text[starts[next]+activeLength] == tp then
if lastNewNode != NULL and activeNode != root then
suffixLinks[lastNewNode] = activeNode
lastNewNode = NULL
end if
activeLength += 1
exit
end if
integer temp = starts[next] + activeLength - 1
pSplitEnd = allocate_word(temp,true)
integer splitnode = newNode(starts[next], pSplitEnd,true)
ta = text[activeEdge]
children[activeNode][ta] = splitnode
children[splitnode][tp] = newNode(pos, pLeafEnd)
starts[next] += activeLength
children[splitnode][text[starts[next]]] = next
if lastNewNode != NULL then
suffixLinks[lastNewNode] = splitnode
end if
lastNewNode = splitnode
end if
remainingSuffixCount -= 1
if activeNode == root and activeLength > 0 then
activeLength -= 1
activeEdge = pos - remainingSuffixCount + 1
elsif activeNode != root then
activeNode = suffixLinks[activeNode]
end if
end while
end procedure
procedure setSuffixIndexByDFS(integer n, integer labelHeight)
if n!=NULL then
-- Uncomment the 3 lines below to print suffix tree
--if starts[n]!=-1 then
-- printf(1,text[starts[n]..peekns(ends[n])])
--end if
if children[n]=0 then
suffixIndices[n] = size - labelHeight
-- Uncomment line below to print suffix index
--printf(1," [%d]\n", suffixIndices[n])
else
bool leaf = true
for i=1 to maxChar do
integer nci = children[n][i]
if nci != NULL then
-- Uncomment the 3 lines below to print suffix index
--if leaf and starts[n] != -1 then
-- printf(1," [%d]\n", suffixIndices[n])
--end if
leaf = false
setSuffixIndexByDFS(nci, labelHeight+edgeLength(nci))
end if
end for
if leaf then ?9/0 end if -- (sanity check)
end if
end if
end procedure
procedure buildSuffixTree()
size = length(text)
pRootEnd = allocate_word(-1,true)
root = newNode(-1, pRootEnd)
activeNode = root
for i=1 to size do
extendSuffixTree(i)
end for
integer labelHeight = 0
setSuffixIndexByDFS(root, labelHeight)
end procedure
procedure doTraversal(integer n, integer labelHeight, atom pMaxHeight, pSubstringStartIndex)
if n!=NULL then
integer nsi = suffixIndices[n], newHeight
if nsi == -1 then
for i=1 to maxChar do
integer nci = children[n][i]
if nci != NULL then
newHeight = labelHeight+edgeLength(nci)
doTraversal(nci, newHeight, pMaxHeight, pSubstringStartIndex)
end if
end for
elsif nsi > -1 then
newHeight = labelHeight-edgeLength(n)
if peekns(pMaxHeight)<newHeight then
poken(pMaxHeight,newHeight)
poken(pSubstringStartIndex,nsi)
end if
end if
end if
end procedure
procedure getLongestRepeatedSubstring(string s)
atom pMaxHeight = allocate_word(0,true),
pSubstringStartIndex = allocate_word(0,true)
doTraversal(root, 0, pMaxHeight, pSubstringStartIndex)
integer maxHeight = peekns(pMaxHeight),
start = peekns(pSubstringStartIndex)
-- Uncomment line below to print maxHeight and substringStartIndex
--printf(1,"maxHeight %d, substringStartIndex %d\n", {maxHeight, start})
string t = iff(maxHeight=0?"No repeated substring"
:text[start+1..start+maxHeight])
printf(1," %s is: %s\n", {iff(s=""?text:s),t})
end procedure
constant tests = {"GEEKSFORGEEKS$",
"AAAAAAAAAA$",
"ABCDEFG$",
"ABABABA$",
"ATCGATCGA$",
"banana$",
"abcpqrabpqpq$",
"pqrpqpqabab$"}
printf(1,"Longest Repeated Substring in:\n")
for i=1 to length(tests) do
text = tests[i]
buildSuffixTree()
getLongestRepeatedSubstring("")
end for
printf(1,"\n")
-- Sadly gmp crashes when I try 100,000 dp, but I suspect it would be fine with more than my paltry 4GB ram
include mpfr.e
mpfr pi = mpfr_init(0,-10001) -- (set precision to 10,000 dp, plus the "3.")
mpfr_const_pi(pi)
string piStr = mpfr_sprintf("%.10000Rf", pi), s="Pi"
piStr = piStr[3..$] -- discard leading "3."
maxChar = '9'
for i=3 to 6 do
atom t0 = time()
integer n = power(10,i)
if i>=5 then
text = repeat('0',n)&"$"
for c=1 to n do
text[c] = rand(10)+'0'-1
end for
s = "a made up number"
else
text = piStr[1..n] & "$"
end if
buildSuffixTree()
getLongestRepeatedSubstring(sprintf("first %,d d.p. of %s", {n,s}))
printf(1," (this took %gs)\n\n", {time()-t0})
end for</lang>
{{out}}
<pre>
Longest Repeated Substring in:
GEEKSFORGEEKS$ is: GEEKS
AAAAAAAAAA$ is: AAAAAAAAA
ABCDEFG$ is: No repeated substring
ABABABA$ is: ABABA
ATCGATCGA$ is: ATCGA
banana$ is: ana
abcpqrabpqpq$ is: ab
pqrpqpqabab$ is: ab

first 1,000 d.p. of Pi is: 23846
(this took 0s)

first 10,000 d.p. of Pi is: 7111369
(this took 0.047s)

first 100,000 d.p. of a made up number is: 022105155
(this took 0.422s)

first 1,000,000 d.p. of a made up number is: 09014346795
(this took 4.734s)
</pre>
</pre>

Revision as of 09:16, 2 August 2020

Ukkonen’s suffix tree construction is a draft programming task. It is not yet considered ready to be promoted as a complete task, for reasons that should be found in its talk page.

Suffix Trees are very useful in numerous string processing and computational biology problems.

The task is to create a function which implements Ukkonen’s algorithm to create a useful Suffix Tree as described:

Part 1
Part 2
Part 3
Part 4
Part 5
Part 6

Using Arithmetic-geometric mean/Calculate Pi generate the first 1000, 10000, and 100000 decimal places of pi. Using your implementation with an alphabet of 0 through 9 (plus $ say to make the tree explicit) find the longest repeated string in each list. Time your results and demonstrate that your implementation is linear (i.e. that 10000 takes approx. 10 times as long as 1000). You may vary the size of the lists of decimal places of pi to give reasonable answers.

Go

This is a translation of the C code here which is an extended form of the code in Part 6 of the task description for finding the longest repeated substring of a given string. In the interests of brevity, the extensive comments in the C version have been largely omitted. The C code doesn't compile as it stands but I have added a fix in the Talk Page.

For convenience I have included the code from the Arithmetic-geometric_mean/Calculate_Pi#Go task in the same package.

It takes around 25 seconds on my machine (Celeron @1.6GHz) to calculate the first 100,000 (or so) decimal places of Pi. Having done that, the timings for extracting the longest repeated sequence of digits are quick and fairly linear as expected.

As the task doesn't say whether overlapping sequences are to be counted, I've assumed that they are as this is what the algorithm naturally produces. <lang go>package main

import (

   "fmt"
   "math/big"
   "time"

)

var maxChar = 128

type Node struct {

   children    []*Node
   suffixLink  *Node
   start       int
   end         *int
   suffixIndex int

}

var (

   text                 string
   root                 *Node
   lastNewNode          *Node
   activeNode           *Node
   activeEdge           = -1
   activeLength         = 0
   remainingSuffixCount = 0
   leafEnd              = -1
   rootEnd              *int
   splitEnd             *int
   size                 = -1

)

func newNode(start int, end *int) *Node {

   node := new(Node)
   node.children = make([]*Node, maxChar)
   node.suffixLink = root
   node.start = start
   node.end = end
   node.suffixIndex = -1
   return node

}

func edgeLength(n *Node) int {

   if n == root {
       return 0
   }
   return *(n.end) - n.start + 1

}

func walkDown(currNode *Node) bool {

   if activeLength >= edgeLength(currNode) {
       activeEdge += edgeLength(currNode)
       activeLength -= edgeLength(currNode)
       activeNode = currNode
       return true
   }
   return false

}

func extendSuffixTree(pos int) {

   leafEnd = pos
   remainingSuffixCount++
   lastNewNode = nil
   for remainingSuffixCount > 0 {
       if activeLength == 0 {
           activeEdge = pos
       }
       if activeNode.children[text[activeEdge]] == nil {
           activeNode.children[text[activeEdge]] = newNode(pos, &leafEnd)
           if lastNewNode != nil {
               lastNewNode.suffixLink = activeNode
               lastNewNode = nil
           }
       } else {
           next := activeNode.children[text[activeEdge]]
           if walkDown(next) {
               continue
           }
           if text[next.start+activeLength] == text[pos] {
               if lastNewNode != nil && activeNode != root {
                   lastNewNode.suffixLink = activeNode
                   lastNewNode = nil
               }
               activeLength++
               break
           }
           temp := next.start + activeLength - 1
           splitEnd = &temp
           split := newNode(next.start, splitEnd)
           activeNode.children[text[activeEdge]] = split
           split.children[text[pos]] = newNode(pos, &leafEnd)
           next.start += activeLength
           split.children[text[next.start]] = next
           if lastNewNode != nil {
               lastNewNode.suffixLink = split
           }
           lastNewNode = split
       }
       remainingSuffixCount--
       if activeNode == root && activeLength > 0 {
           activeLength--
           activeEdge = pos - remainingSuffixCount + 1
       } else if activeNode != root {
           activeNode = activeNode.suffixLink
       }
   }

}

func setSuffixIndexByDFS(n *Node, labelHeight int) {

   if n == nil {
       return
   }
   if n.start != -1 {
       // Uncomment line below to print suffix tree
       // fmt.Print(text[n.start: *(n.end) +1])
   }
   leaf := 1
   for i := 0; i < maxChar; i++ {
       if n.children[i] != nil {
           // Uncomment the 3 lines below to print suffix index
           //if leaf == 1 && n.start != -1 {
           //    fmt.Printf(" [%d]\n", n.suffixIndex)
           //}
           leaf = 0
           setSuffixIndexByDFS(n.children[i], labelHeight+edgeLength(n.children[i]))
       }
   }
   if leaf == 1 {
       n.suffixIndex = size - labelHeight
       // Uncomment line below to print suffix index
       //fmt.Printf(" [%d]\n", n.suffixIndex)
   }

}

func buildSuffixTree() {

   size = len(text)
   temp := -1
   rootEnd = &temp
   root = newNode(-1, rootEnd)
   activeNode = root
   for i := 0; i < size; i++ {
       extendSuffixTree(i)
   }
   labelHeight := 0
   setSuffixIndexByDFS(root, labelHeight)

}

func doTraversal(n *Node, labelHeight int, maxHeight, substringStartIndex *int) {

   if n == nil {
       return
   }
   if n.suffixIndex == -1 {
       for i := 0; i < maxChar; i++ {
           if n.children[i] != nil {
               doTraversal(n.children[i], labelHeight+edgeLength(n.children[i]),
                   maxHeight, substringStartIndex)
           }
       }
   } else if n.suffixIndex > -1 && (*maxHeight < labelHeight-edgeLength(n)) {
       *maxHeight = labelHeight - edgeLength(n)
       *substringStartIndex = n.suffixIndex
   }

}

func getLongestRepeatedSubstring(s string) {

   maxHeight := 0
   substringStartIndex := 0
   doTraversal(root, 0, &maxHeight, &substringStartIndex)
   // Uncomment line below to print maxHeight and substringStartIndex
   // fmt.Printf("maxHeight %d, substringStartIndex %d\n", maxHeight, substringStartIndex)
   if s == "" {
       fmt.Printf("  %s is: ", text)
   } else {
       fmt.Printf("  %s is: ", s)
   }
   k := 0
   for ; k < maxHeight; k++ {
       fmt.Printf("%c", text[k+substringStartIndex])
   }
   if k == 0 {
       fmt.Print("No repeated substring")
   }
   fmt.Println()

}

func calculatePi() *big.Float {

   one := big.NewFloat(1)
   two := big.NewFloat(2)
   four := big.NewFloat(4)
   prec := uint(325 * 1024) // enough to calculate Pi to 100,182 decimal digits
   a := big.NewFloat(1).SetPrec(prec)
   g := new(big.Float).SetPrec(prec)
   // temporary variables
   t := new(big.Float).SetPrec(prec)
   u := new(big.Float).SetPrec(prec)
   g.Quo(a, t.Sqrt(two))
   sum := new(big.Float)
   pow := big.NewFloat(2)
   for a.Cmp(g) != 0 {
       t.Add(a, g)
       t.Quo(t, two)
       g.Sqrt(u.Mul(a, g))
       a.Set(t)
       pow.Mul(pow, two)
       t.Sub(t.Mul(a, a), u.Mul(g, g))
       sum.Add(sum, t.Mul(t, pow))
   }
   t.Mul(a, a)
   t.Mul(t, four)
   pi := t.Quo(t, u.Sub(one, sum))
   return pi

}

func main() {

   tests := []string{
       "GEEKSFORGEEKS$",
       "AAAAAAAAAA$",
       "ABCDEFG$",
       "ABABABA$",
       "ATCGATCGA$",
       "banana$",
       "abcpqrabpqpq$",
       "pqrpqpqabab$",
   }
   fmt.Println("Longest Repeated Substring in:\n")
   for _, test := range tests {
       text = test
       buildSuffixTree()
       getLongestRepeatedSubstring("")
   }
   fmt.Println()
   pi := calculatePi()
   piStr := fmt.Sprintf("%v", pi)
   piStr = piStr[2:] // remove initial 3.
   numbers := []int{1e3, 1e4, 1e5}
   maxChar = 58
   for _, number := range numbers {
       start := time.Now()
       text = piStr[0:number] + "$"
       buildSuffixTree()
       getLongestRepeatedSubstring(fmt.Sprintf("first %d d.p. of Pi", number))
       elapsed := time.Now().Sub(start)
       fmt.Printf("  (this took %s)\n\n", elapsed)
   }

}</lang>

Output:

Sample run:

Longest Repeated Substring in:

  GEEKSFORGEEKS$ is: GEEKS
  AAAAAAAAAA$ is: AAAAAAAAA
  ABCDEFG$ is: No repeated substring
  ABABABA$ is: ABABA
  ATCGATCGA$ is: ATCGA
  banana$ is: ana
  abcpqrabpqpq$ is: ab
  pqrpqpqabab$ is: ab

  first 1000 d.p. of Pi is: 23846
  (this took 7.728858ms)

  first 10000 d.p. of Pi is: 7111369
  (this took 57.524478ms)

  first 100000 d.p. of Pi is: 041021944
  (this took 599.770281ms)

Phix

Translation of: Go

<lang Phix>-- demo/rosetta/Ukkonens_Suffix_Tree.exw integer maxChar = 'z'

sequence children = {},

        suffixLinks = {},
        starts = {},
        ends = {},
        suffixIndices = {}

string text atom pSplitEnd,

    pRootEnd,
    pLeafEnd = allocate_word(0,true)

integer root = NULL,

       lastNewNode,
       activeNode,
       activeEdge = -1,
       activeLength = 0,
       remainingSuffixCount = 0,
       size = -1

function newNode(integer start, atom pFinish, bool bKids=false)

   children = append(children,iff(bKids?repeat(NULL,maxChar):0))
   suffixLinks = append(suffixLinks,root)
   starts = append(starts,start)
   ends = append(ends,pFinish)
   suffixIndices = append(suffixIndices,-1)
   return length(children)

end function

function edgeLength(integer n)

   if n == root then
       return 0
   end if
   return peekns(ends[n]) - starts[n] + 1

end function

function walkDown(integer currNode)

   integer l = edgeLength(currNode)
   if activeLength >= l then
       activeEdge += l
       activeLength -= l
       activeNode = currNode
       return true
   end if
   return false

end function

procedure extendSuffixTree(integer pos)

   poken(pLeafEnd,pos)
   remainingSuffixCount += 1
   lastNewNode = NULL
   while remainingSuffixCount > 0 do
       if activeLength == 0 then
           activeEdge = pos
       end if
       integer ta = text[activeEdge]
       object ca = children[activeNode]
       integer next = iff(ca=0?NULL:ca[ta])
       if next == null then
           if ca=0 then
               children[activeNode] = repeat(NULL,maxChar)
           end if
           children[activeNode][ta] = newNode(pos, pLeafEnd)
           if lastNewNode != NULL then
               suffixLinks[lastNewNode] = activeNode
               lastNewNode = NULL
           end if
       else
           if walkDown(next) then
               continue
           end if
           integer tp = text[pos]
           if text[starts[next]+activeLength] == tp then
               if lastNewNode != NULL and activeNode != root then
                   suffixLinks[lastNewNode] = activeNode
                   lastNewNode = NULL
               end if
               activeLength += 1
               exit
           end if
           integer temp = starts[next] + activeLength - 1
           pSplitEnd = allocate_word(temp,true)
           integer splitnode = newNode(starts[next], pSplitEnd,true)
           ta = text[activeEdge]
           children[activeNode][ta] = splitnode
           children[splitnode][tp] = newNode(pos, pLeafEnd)
           starts[next] += activeLength
           children[splitnode][text[starts[next]]] = next
           if lastNewNode != NULL then
               suffixLinks[lastNewNode] = splitnode
           end if
           lastNewNode = splitnode
       end if
       remainingSuffixCount -= 1
       if activeNode == root and activeLength > 0 then
           activeLength -= 1
           activeEdge = pos - remainingSuffixCount + 1
       elsif activeNode != root then
           activeNode = suffixLinks[activeNode]
       end if
   end while

end procedure

procedure setSuffixIndexByDFS(integer n, integer labelHeight)

   if n!=NULL then
       -- Uncomment the 3 lines below to print suffix tree
       --if starts[n]!=-1 then
       --  printf(1,text[starts[n]..peekns(ends[n])])
       --end if
       if children[n]=0 then
           suffixIndices[n] = size - labelHeight
           -- Uncomment line below to print suffix index
           --printf(1," [%d]\n", suffixIndices[n])
       else
           bool leaf = true
           for i=1 to maxChar do
               integer nci = children[n][i]
               if nci != NULL then
                   -- Uncomment the 3 lines below to print suffix index
                   --if leaf and starts[n] != -1 then
                   --  printf(1," [%d]\n", suffixIndices[n])
                   --end if
                   leaf = false
                   setSuffixIndexByDFS(nci, labelHeight+edgeLength(nci))
               end if
           end for
           if leaf then ?9/0 end if -- (sanity check)
       end if
   end if

end procedure

procedure buildSuffixTree()

   size = length(text)
   pRootEnd = allocate_word(-1,true)
   root = newNode(-1, pRootEnd)
   activeNode = root
   for i=1 to size do
       extendSuffixTree(i)
   end for
   integer labelHeight = 0
   setSuffixIndexByDFS(root, labelHeight)

end procedure

procedure doTraversal(integer n, integer labelHeight, atom pMaxHeight, pSubstringStartIndex)

   if n!=NULL then
       integer nsi = suffixIndices[n], newHeight
       if nsi == -1 then
           for i=1 to maxChar do
               integer nci = children[n][i]
               if nci != NULL then
                   newHeight = labelHeight+edgeLength(nci)
                   doTraversal(nci, newHeight, pMaxHeight, pSubstringStartIndex)
               end if
           end for
       elsif nsi > -1 then
           newHeight = labelHeight-edgeLength(n)
           if peekns(pMaxHeight)<newHeight then
               poken(pMaxHeight,newHeight)
               poken(pSubstringStartIndex,nsi)
           end if
       end if
   end if

end procedure

procedure getLongestRepeatedSubstring(string s)

   atom pMaxHeight = allocate_word(0,true),
        pSubstringStartIndex = allocate_word(0,true)
   doTraversal(root, 0, pMaxHeight, pSubstringStartIndex)
   integer maxHeight = peekns(pMaxHeight),
           start = peekns(pSubstringStartIndex)
   -- Uncomment line below to print maxHeight and substringStartIndex
   --printf(1,"maxHeight %d, substringStartIndex %d\n", {maxHeight, start})
   string t = iff(maxHeight=0?"No repeated substring"
                             :text[start+1..start+maxHeight])
   printf(1,"  %s is: %s\n", {iff(s=""?text:s),t})

end procedure

constant tests = {"GEEKSFORGEEKS$",

                 "AAAAAAAAAA$",
                 "ABCDEFG$",
                 "ABABABA$",
                 "ATCGATCGA$",
                 "banana$",
                 "abcpqrabpqpq$",
                 "pqrpqpqabab$"}

printf(1,"Longest Repeated Substring in:\n") for i=1 to length(tests) do

   text = tests[i]
   buildSuffixTree()
   getLongestRepeatedSubstring("")

end for printf(1,"\n")

-- Sadly gmp crashes when I try 100,000 dp, but I suspect it would be fine with more than my paltry 4GB ram include mpfr.e mpfr pi = mpfr_init(0,-10001) -- (set precision to 10,000 dp, plus the "3.") mpfr_const_pi(pi) string piStr = mpfr_sprintf("%.10000Rf", pi), s="Pi" piStr = piStr[3..$] -- discard leading "3." maxChar = '9' for i=3 to 6 do

   atom t0 = time()
   integer n = power(10,i)
   if i>=5 then
       text = repeat('0',n)&"$"
       for c=1 to n do
           text[c] = rand(10)+'0'-1
       end for
       s = "a made up number"
   else
       text = piStr[1..n] & "$"
   end if
   buildSuffixTree()
   getLongestRepeatedSubstring(sprintf("first %,d d.p. of %s", {n,s}))
   printf(1,"  (this took %gs)\n\n", {time()-t0})

end for</lang>

Output:
Longest Repeated Substring in:
  GEEKSFORGEEKS$ is: GEEKS
  AAAAAAAAAA$ is: AAAAAAAAA
  ABCDEFG$ is: No repeated substring
  ABABABA$ is: ABABA
  ATCGATCGA$ is: ATCGA
  banana$ is: ana
  abcpqrabpqpq$ is: ab
  pqrpqpqabab$ is: ab

  first 1,000 d.p. of Pi is: 23846
  (this took 0s)

  first 10,000 d.p. of Pi is: 7111369
  (this took 0.047s)

  first 100,000 d.p. of a made up number is: 022105155
  (this took 0.422s)

  first 1,000,000 d.p. of a made up number is: 09014346795
  (this took 4.734s)