Ukkonen’s suffix tree construction: Difference between revisions

Content added Content deleted
m (Thundergnat moved page Ukkonen’s Suffix Tree Construction to Ukkonen’s suffix tree construction: Follow normal task title capitalization policy)
(Added Wren)
Line 744: Line 744:
first 1,000,000 d.p. of a made up number is: 09014346795
first 1,000,000 d.p. of a made up number is: 09014346795
(this took 4.734s)
(this took 4.734s)
</pre>

=={{header|Wren}}==
{{trans|Go}}
{{libheader|Wren-big}}
{{libheader|Wren-dynamic}}
{{libheader|Wren-trait}}
As it would take a very long time to calculate the first 100,000 digits of Pi using the code from the [[Arithmetic-geometric_mean/Calculate_Pi#Wren]] task, I have instead saved the digits produced by the Go entry to a file (which only takes a few seconds) and then loaded that into the Wren script.

Having done that, the timings for extracting the longest repeated sequence of digits are reasonably quick and fairly linear as expected.
<lang ecmascript>import "/big" for BigRat
import "/dynamic" for Struct
import "/trait" for ByRef
import "io" for File

var maxChar = 128

var Node = Struct.create("Node", ["children", "suffixLink", "start", "pEnd", "suffixIndex"])

var text = ""
var root = null
var lastNewNode = null
var activeNode = null
var activeEdge = -1
var activeLength = 0
var remainingSuffixCount = 0
var pLeafEnd = ByRef.new(-1)
var pRootEnd = null
var pSplitEnd = null
var size = -1

var newNode = Fn.new { |start, pEnd|
var children = List.filled(maxChar, null)
var suffixLink = root
var suffixIndex = -1
return Node.new(children, suffixLink, start, pEnd, suffixIndex)
}

var edgeLength = Fn.new { |n|
if (n == root) return 0
return n.pEnd.value - n.start + 1
}

var walkDown = Fn.new { |currNode|
var el = edgeLength.call(currNode)
if (activeLength >= el) {
activeEdge = activeEdge + el
activeLength = activeLength - el
activeNode = currNode
return true
}
return false
}

var extendSuffixTree = Fn.new { |pos|
pLeafEnd.value = pos
remainingSuffixCount = remainingSuffixCount + 1
lastNewNode = null
while (remainingSuffixCount > 0) {
if (activeLength == 0) activeEdge = pos
if (!activeNode.children[text[activeEdge].bytes[0]]) {
activeNode.children[text[activeEdge].bytes[0]] = newNode.call(pos, pLeafEnd)
if (lastNewNode) {
lastNewNode.suffixLink = activeNode
lastNewNode = null
}
} else {
var next = activeNode.children[text[activeEdge].bytes[0]]
if (walkDown.call(next)) continue
if (text[next.start + activeLength] == text[pos]) {
if (lastNewNode && activeNode != root) {
lastNewNode.suffixLink = activeNode
lastNewNode = null
}
activeLength = activeLength + 1
break
}
var temp = next.start + activeLength - 1
pSplitEnd = ByRef.new(temp)
var split = newNode.call(next.start, pSplitEnd)
activeNode.children[text[activeEdge].bytes[0]] = split
split.children[text[pos].bytes[0]] = newNode.call(pos, pLeafEnd)
next.start = next.start + activeLength
split.children[text[next.start].bytes[0]] = next
if (lastNewNode) lastNewNode.suffixLink = split
lastNewNode = split
}
remainingSuffixCount = remainingSuffixCount - 1
if (activeNode == root && activeLength > 0) {
activeLength = activeLength - 1
activeEdge = pos - remainingSuffixCount + 1
} else if (activeNode != root) {
activeNode = activeNode.suffixLink
}
}
}

var setSuffixIndexByDFS // recursive
setSuffixIndexByDFS = Fn.new { |n, labelHeight|
if (!n) return
if (n.start != -1) {
// Uncomment line below to print suffix tree
// System.write(text[n.start..n.pEnd.value])
}
var leaf = 1
for (i in 0...maxChar) {
if (n.children[i]) {
// Uncomment the 3 lines below to print suffix index
// if (leaf == 1 && n.start != -1) {
// System.print(" [%(n.suffixIndex)]")
// }
leaf = 0
setSuffixIndexByDFS.call(n.children[i], labelHeight + edgeLength.call(n.children[i]))
}
}
if (leaf == 1) {
n.suffixIndex = size - labelHeight
// Uncomment line below to print suffix index
// System.print(" [%(n.suffixIndex)]")
}
}

var buildSuffixTree = Fn.new {
size = text.count
var temp = -1
pRootEnd = ByRef.new(temp)
root = newNode.call(-1, pRootEnd)
activeNode = root
for (i in 0...size) extendSuffixTree.call(i)
var labelHeight = 0
setSuffixIndexByDFS.call(root, labelHeight)
}

var doTraversal // recursive
doTraversal = Fn.new { |n, labelHeight, pMaxHeight, pSubstringStartIndex|
if (!n) return
if (n.suffixIndex == -1) {
for (i in 0...maxChar) {
if (n.children[i]) {
doTraversal.call(n.children[i], labelHeight + edgeLength.call(n.children[i]),
pMaxHeight, pSubstringStartIndex)
}
}
} else if (n.suffixIndex > -1 && (pMaxHeight.value < labelHeight - edgeLength.call(n))) {
pMaxHeight.value = labelHeight - edgeLength.call(n)
pSubstringStartIndex.value = n.suffixIndex
}
}

var getLongestRepeatedSubstring = Fn.new { |s|
var maxHeight = 0
var substringStartIndex = 0
var pMaxHeight = ByRef.new(maxHeight)
var pSubstringStartIndex = ByRef.new(substringStartIndex)
doTraversal.call(root, 0, pMaxHeight, pSubstringStartIndex)
maxHeight = pMaxHeight.value
substringStartIndex = pSubstringStartIndex.value
// Uncomment line below to print maxHeight and substringStartIndex
// System.print("maxHeight %(maxHeight), substringStartIndex %(substringStartIndex)")
if (s == "") {
System.write(" %(text) is: ")
} else {
System.write(" %(s) is: ")
}
var k = 0
while (k < maxHeight) {
System.write(text[k + substringStartIndex])
k = k + 1
}
if (k == 0) {
System.write("No repeated substring")
}
System.print()
}

var tests = [
"GEEKSFORGEEKS$",
"AAAAAAAAAA$",
"ABCDEFG$",
"ABABABA$",
"ATCGATCGA$",
"banana$",
"abcpqrabpqpq$",
"pqrpqpqabab$",
]
System.print("Longest Repeated Substring in:\n")
for (test in tests) {
text = test
buildSuffixTree.call()
getLongestRepeatedSubstring.call("")
}
System.print()

// load pi to 100,182 digits
var piStr = File.read("pi_100000.txt")
piStr = piStr[2..-1] // remove initial 3.
var numbers = [1e3, 1e4, 1e5]
maxChar = 58
for (number in numbers) {
var start = System.clock
text = piStr[0...number] + "$"
buildSuffixTree.call()
getLongestRepeatedSubstring.call("first %(number) d.p. of Pi")
var elapsed = (System.clock - start) * 1000
System.print(" (this took %(elapsed) ms)\n")
}</lang>

{{out}}
<pre>
Longest Repeated Substring in:

GEEKSFORGEEKS$ is: GEEKS
AAAAAAAAAA$ is: AAAAAAAAA
ABCDEFG$ is: No repeated substring
ABABABA$ is: ABABA
ATCGATCGA$ is: ATCGA
banana$ is: ana
abcpqrabpqpq$ is: ab
pqrpqpqabab$ is: ab

first 1000 d.p. of Pi is: 23846
(this took 9.987 ms)

first 10000 d.p. of Pi is: 7111369
(this took 89.12 ms)

first 100000 d.p. of Pi is: 041021944
(this took 1031.072 ms)
</pre>
</pre>