Ukkonen’s suffix tree construction

From Rosetta Code
Revision as of 09:57, 15 February 2024 by PureFox (talk | contribs) (→‎{{header|Wren}}: Minor tidy)
Task
Ukkonen’s suffix tree construction
You are encouraged to solve this task according to the task description, using any language you may know.

Suffix Trees are very useful in numerous string processing and computational biology problems.

The task is to create a function which implements Ukkonen’s algorithm to create a useful Suffix Tree as described:

Part 1
Part 2
Part 3
Part 4
Part 5
Part 6

Using Arithmetic-geometric mean/Calculate Pi generate the first 1000, 10000, and 100000 decimal places of pi. Using your implementation with an alphabet of 0 through 9 (plus $ say to make the tree explicit) find the longest repeated string in each list. Time your results and demonstrate that your implementation is linear (i.e. that 10000 takes approx. 10 times as long as 1000). You may vary the size of the lists of decimal places of pi to give reasonable answers.

Go

This is a translation of the C code here which is an extended form of the code in Part 6 of the task description for finding the longest repeated substring of a given string. In the interests of brevity, the extensive comments in the C version have been largely omitted. The C code doesn't compile as it stands but I have added a fix in the Talk Page.

For convenience I have included the code from the Arithmetic-geometric_mean/Calculate_Pi#Go task in the same package.

It takes around 25 seconds on my machine (Celeron @1.6GHz) to calculate the first 100,000 (or so) decimal places of Pi. Having done that, the timings for extracting the longest repeated sequence of digits are quick and fairly linear as expected.

As the task doesn't say whether overlapping sequences are to be counted, I've assumed that they are as this is what the algorithm naturally produces.

package main

import (
    "fmt"
    "math/big"
    "time"
)

var maxChar = 128

type Node struct {
    children    []*Node
    suffixLink  *Node
    start       int
    end         *int
    suffixIndex int
}

var (
    text                 string
    root                 *Node
    lastNewNode          *Node
    activeNode           *Node
    activeEdge           = -1
    activeLength         = 0
    remainingSuffixCount = 0
    leafEnd              = -1
    rootEnd              *int
    splitEnd             *int
    size                 = -1
)

func newNode(start int, end *int) *Node {
    node := new(Node)
    node.children = make([]*Node, maxChar)
    node.suffixLink = root
    node.start = start
    node.end = end
    node.suffixIndex = -1
    return node
}

func edgeLength(n *Node) int {
    if n == root {
        return 0
    }
    return *(n.end) - n.start + 1
}

func walkDown(currNode *Node) bool {
    if activeLength >= edgeLength(currNode) {
        activeEdge += edgeLength(currNode)
        activeLength -= edgeLength(currNode)
        activeNode = currNode
        return true
    }
    return false
}

func extendSuffixTree(pos int) {
    leafEnd = pos
    remainingSuffixCount++
    lastNewNode = nil
    for remainingSuffixCount > 0 {
        if activeLength == 0 {
            activeEdge = pos
        }
        if activeNode.children[text[activeEdge]] == nil {
            activeNode.children[text[activeEdge]] = newNode(pos, &leafEnd)
            if lastNewNode != nil {
                lastNewNode.suffixLink = activeNode
                lastNewNode = nil
            }
        } else {
            next := activeNode.children[text[activeEdge]]
            if walkDown(next) {
                continue
            }
            if text[next.start+activeLength] == text[pos] {
                if lastNewNode != nil && activeNode != root {
                    lastNewNode.suffixLink = activeNode
                    lastNewNode = nil
                }
                activeLength++
                break
            }
            temp := next.start + activeLength - 1
            splitEnd = &temp
            split := newNode(next.start, splitEnd)
            activeNode.children[text[activeEdge]] = split
            split.children[text[pos]] = newNode(pos, &leafEnd)
            next.start += activeLength
            split.children[text[next.start]] = next
            if lastNewNode != nil {
                lastNewNode.suffixLink = split
            }
            lastNewNode = split
        }
        remainingSuffixCount--
        if activeNode == root && activeLength > 0 {
            activeLength--
            activeEdge = pos - remainingSuffixCount + 1
        } else if activeNode != root {
            activeNode = activeNode.suffixLink
        }
    }
}

func setSuffixIndexByDFS(n *Node, labelHeight int) {
    if n == nil {
        return
    }
    if n.start != -1 {
        // Uncomment line below to print suffix tree
        // fmt.Print(text[n.start: *(n.end) +1])
    }
    leaf := 1
    for i := 0; i < maxChar; i++ {
        if n.children[i] != nil {
            // Uncomment the 3 lines below to print suffix index
            //if leaf == 1 && n.start != -1 {
            //    fmt.Printf(" [%d]\n", n.suffixIndex)
            //}
            leaf = 0
            setSuffixIndexByDFS(n.children[i], labelHeight+edgeLength(n.children[i]))
        }
    }
    if leaf == 1 {
        n.suffixIndex = size - labelHeight
        // Uncomment line below to print suffix index
        //fmt.Printf(" [%d]\n", n.suffixIndex)
    }
}

func buildSuffixTree() {
    size = len(text)
    temp := -1
    rootEnd = &temp
    root = newNode(-1, rootEnd)
    activeNode = root
    for i := 0; i < size; i++ {
        extendSuffixTree(i)
    }
    labelHeight := 0
    setSuffixIndexByDFS(root, labelHeight)
}

func doTraversal(n *Node, labelHeight int, maxHeight, substringStartIndex *int) {
    if n == nil {
        return
    }
    if n.suffixIndex == -1 {
        for i := 0; i < maxChar; i++ {
            if n.children[i] != nil {
                doTraversal(n.children[i], labelHeight+edgeLength(n.children[i]),
                    maxHeight, substringStartIndex)
            }
        }
    } else if n.suffixIndex > -1 && (*maxHeight < labelHeight-edgeLength(n)) {
        *maxHeight = labelHeight - edgeLength(n)
        *substringStartIndex = n.suffixIndex
    }
}

func getLongestRepeatedSubstring(s string) {
    maxHeight := 0
    substringStartIndex := 0
    doTraversal(root, 0, &maxHeight, &substringStartIndex)
    // Uncomment line below to print maxHeight and substringStartIndex
    // fmt.Printf("maxHeight %d, substringStartIndex %d\n", maxHeight, substringStartIndex)
    if s == "" {
        fmt.Printf("  %s is: ", text)
    } else {
        fmt.Printf("  %s is: ", s)
    }
    k := 0
    for ; k < maxHeight; k++ {
        fmt.Printf("%c", text[k+substringStartIndex])
    }
    if k == 0 {
        fmt.Print("No repeated substring")
    }
    fmt.Println()
}

func calculatePi() *big.Float {
    one := big.NewFloat(1)
    two := big.NewFloat(2)
    four := big.NewFloat(4)
    prec := uint(325 * 1024) // enough to calculate Pi to 100,182 decimal digits

    a := big.NewFloat(1).SetPrec(prec)
    g := new(big.Float).SetPrec(prec)

    // temporary variables
    t := new(big.Float).SetPrec(prec)
    u := new(big.Float).SetPrec(prec)

    g.Quo(a, t.Sqrt(two))
    sum := new(big.Float)
    pow := big.NewFloat(2)

    for a.Cmp(g) != 0 {
        t.Add(a, g)
        t.Quo(t, two)
        g.Sqrt(u.Mul(a, g))
        a.Set(t)
        pow.Mul(pow, two)
        t.Sub(t.Mul(a, a), u.Mul(g, g))
        sum.Add(sum, t.Mul(t, pow))
    }

    t.Mul(a, a)
    t.Mul(t, four)
    pi := t.Quo(t, u.Sub(one, sum))
    return pi
}

func main() {
    tests := []string{
        "GEEKSFORGEEKS$",
        "AAAAAAAAAA$",
        "ABCDEFG$",
        "ABABABA$",
        "ATCGATCGA$",
        "banana$",
        "abcpqrabpqpq$",
        "pqrpqpqabab$",
    }
    fmt.Println("Longest Repeated Substring in:\n")
    for _, test := range tests {
        text = test
        buildSuffixTree()
        getLongestRepeatedSubstring("")
    }
    fmt.Println()

    pi := calculatePi()
    piStr := fmt.Sprintf("%v", pi)
    piStr = piStr[2:] // remove initial 3.
    numbers := []int{1e3, 1e4, 1e5}
    maxChar = 58
    for _, number := range numbers {
        start := time.Now()
        text = piStr[0:number] + "$"
        buildSuffixTree()
        getLongestRepeatedSubstring(fmt.Sprintf("first %d d.p. of Pi", number))
        elapsed := time.Now().Sub(start)
        fmt.Printf("  (this took %s)\n\n", elapsed)
    }
}
Output:

Sample run:

Longest Repeated Substring in:

  GEEKSFORGEEKS$ is: GEEKS
  AAAAAAAAAA$ is: AAAAAAAAA
  ABCDEFG$ is: No repeated substring
  ABABABA$ is: ABABA
  ATCGATCGA$ is: ATCGA
  banana$ is: ana
  abcpqrabpqpq$ is: ab
  pqrpqpqabab$ is: ab

  first 1000 d.p. of Pi is: 23846
  (this took 7.728858ms)

  first 10000 d.p. of Pi is: 7111369
  (this took 57.524478ms)

  first 100000 d.p. of Pi is: 041021944
  (this took 599.770281ms)

Java

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;

public final class UkkonenSuffixTree {

	public static void main(String[] aArgs) throws IOException {		
		List<Integer> limits = List.of( 1_000, 10_000, 100_000 );
		
		for ( int limit : limits ) {
			String contents = Files.readString(Path.of("piDigits.txt"));
			String piDigits = contents.substring(0, limit + 1);
			
			final long start = System.currentTimeMillis();			
			SuffixTree tree = new SuffixTree(piDigits);			
			Map<String, Set<Integer>> substrings = tree.getLongestRepeatedSubstrings();			
			final long finish = System.currentTimeMillis();
			
			System.out.println("First " + limit + " digits of pi has longest repeated characters:"); 
			for ( String substring : substrings.keySet() ) {
				System.out.print("    '" + substring + "' starting at index ");
				for ( Iterator<Integer> iterator = substrings.get(substring).iterator(); iterator.hasNext(); ) {
					System.out.print(iterator.next());
					if ( iterator.hasNext() ) {
						System.out.print(" and ");
					}
				}
				System.out.println();
			}
			
			System.out.println("Time taken: " + ( finish - start ) + " milliseconds." + System.lineSeparator());
		}
		
		System.out.println("The timings show that the implementation has approximately linear performance.");
	}

	private final static class SuffixTree {
		
		public SuffixTree(String aWord) {
			text = Arrays.copyOfRange(aWord.toCharArray(), 0, aWord.length() + 1);
			text[aWord.length()] = '\uF123'; // Terminal character
			
			nodes = new Node[2 * aWord.length() + 2];
	        root = newNode(UNDEFINED, UNDEFINED);
	        activeNode = root;
	        
			for ( char character : text ) {
				extendSuffixTree(character);
			}				
		}
		
		public Map<String, Set<Integer>> getLongestRepeatedSubstrings() {
			List<Integer> indexes = doTraversal();
			String word = String.valueOf(text).substring(0, text.length - 1);
			Map<String, Set<Integer>> result = new HashMap<String, Set<Integer>>();
			
			if ( indexes.get(0) > 0 ) {
				for ( int i = 1; i < indexes.size(); i++ ) {
					String substring = word.substring(indexes.get(i), indexes.get(i) + indexes.get(0));			
					result.computeIfAbsent(substring, k -> new TreeSet<Integer>()).add(indexes.get(i));	
				}
			}
			
			return result;
		}	
	
		// PRIVATE //
		
		private void extendSuffixTree(char aCharacter) {
	        needParentLink = UNDEFINED;
	        remainder++;
	        
	        while ( remainder > 0 ) {
	            if ( activeLength == 0 ) {
	            	activeEdge = textIndex;
	            }
	            
	            if ( ! nodes[activeNode].children.containsKey(text[activeEdge]) ) {
	                final int leaf = newNode(textIndex, LEAF_NODE);
	                nodes[activeNode].children.put(text[activeEdge], leaf);
	                addSuffixLink(activeNode);
	            } else {
	                final int next = nodes[activeNode].children.get(text[activeEdge]);
	                if ( walkDown(next) ) {
	                	continue;
	                }
	                
	                if ( text[nodes[next].start + activeLength] == aCharacter ) {
	                    activeLength++;
	                    addSuffixLink(activeNode);
	                    break;
	                }
	                
	                final int split = newNode(nodes[next].start, nodes[next].start + activeLength);
	                nodes[activeNode].children.put(text[activeEdge], split);
	                final int leaf = newNode(textIndex, LEAF_NODE);
	                nodes[split].children.put(aCharacter, leaf);
	                nodes[next].start += activeLength;
	                nodes[split].children.put(text[nodes[next].start], next);
	                addSuffixLink(split);
	            }
	            
	            remainder--;
	
	            if ( activeNode == root && activeLength > 0 ) {
	                activeLength--;
	                activeEdge = textIndex - remainder + 1;
	            } else {
	                activeNode = ( nodes[activeNode].parentLink > 0 ) ? nodes[activeNode].parentLink : root;
	            }
	        }
	        
	        textIndex++;       
	    }
		
		private boolean walkDown(int aNode) {
	        if ( activeLength >= nodes[aNode].edgeLength() ) {
	            activeEdge += nodes[aNode].edgeLength();
	            activeLength -= nodes[aNode].edgeLength();
	            activeNode = aNode;
	            
	            return true;
	        }
	        
	        return false;
	    }
		
		private void addSuffixLink(int aNode) {
	        if ( needParentLink != UNDEFINED ) {
	            nodes[needParentLink].parentLink = aNode;
	        }
	        
	        needParentLink = aNode;
	    }
		
		private int newNode(int aStart, int aEnd) {
			Node node = new Node(aStart, aEnd);
			node.leafIndex = ( aEnd == LEAF_NODE ) ? leafIndexGenerator++ : UNDEFINED;
			nodes[currentNode] = node;
	
	        return currentNode++;
	    }
	
		private List<Integer> doTraversal() {
			List<Integer> indexes = new ArrayList<Integer>();
			indexes.add(UNDEFINED);		
	
			return traversal(indexes, nodes[root], 0);
		}		
		
		private List<Integer> traversal(List<Integer> aIndexes, Node aNode, int aHeight) {
			if ( aNode.leafIndex == UNDEFINED ) {
				for ( int index : aNode.children.values() ) {
			        Node child = nodes[index];
			        traversal(aIndexes, child, aHeight + child.edgeLength());
				}
			} else if ( aIndexes.get(0) < aHeight - aNode.edgeLength() ) {
				aIndexes.clear();
				aIndexes.add(aHeight - aNode.edgeLength());
				aIndexes.add(aNode.leafIndex);
			} else if ( aIndexes.get(0) == aHeight - aNode.edgeLength() ) {
			    aIndexes.add(aNode.leafIndex);
			}		
			
			return aIndexes;
		}
	    
	    private final class Node { 
	    	
	    	public Node(int aStart, int aEnd) {
	            start = aStart;
	            end = aEnd;
	        }
	    	
	    	public int edgeLength() {
	            return Math.min(end, textIndex + 1) - start;
	        }
	
	        private int start, end, parentLink, leafIndex;
	        private Map<Character, Integer> children = new HashMap<Character, Integer>();
	        
	    }    
	    
	    private Node[] nodes;
	    private char[] text;
	    private int activeNode, activeLength, activeEdge;
	    private int root, textIndex, currentNode, needParentLink, remainder, leafIndexGenerator;
	    
	    private static final int UNDEFINED = -1;
	    private static final int LEAF_NODE = Integer.MAX_VALUE;   
	
	}

}
Output:
First 1000 digits of pi has longest repeated characters:
    '82534' starting at index 87 and 902
    '33446' starting at index 214 and 830
    '60943' starting at index 396 and 550
    '23846' starting at index 15 and 578
    '93751' starting at index 44 and 940
    '99999' starting at index 761 and 762
    '42019' starting at index 700 and 993
Time taken: 13 milliseconds.

First 10000 digits of pi has longest repeated characters:
    '7111369' starting at index 3501 and 6114
    '8530614' starting at index 4166 and 4600
Time taken: 8 milliseconds.

First 100000 digits of pi has longest repeated characters:
    '041021944' starting at index 21760 and 75869
    '201890888' starting at index 30626 and 33843
    '134926158' starting at index 69596 and 86281
Time taken: 100 milliseconds.

The timings show that the implementation has approximately linear performance.

Julia

Translation of: Go

Uses array indices instead of the Go version's node pointers.

const oo = typemax(Int)

"""The suffix-tree's node."""
mutable struct Node
    children::Dict{Char, Int}
    start::Int
    ending::Int
    suffixlink::Int
    suffixindex::Int
end

Node() = Node(Dict(), 0, oo, 0, -1)
Node(start, ending) = Node(Dict(), start, ending, 0, -1)

""" Ukkonen Suffix-Tree """
mutable struct SuffixTree
    nodes::Vector{Node}
    text::Vector{Char}
    root::Int
    position::Int
    currentnode::Int
    needsuffixlink::Int
    remainder::Int
    activenode::Int
    activelength::Int
    activeedge::Int
end

edgelength(st, n::Node) = min(n.ending, st.position + 1) - n.start

function newnode(st, start, ending)
    st.currentnode += 1
    st.nodes[st.currentnode] = Node(start, ending)
    return st.currentnode
end

function SuffixTree(str::String)
    nodes = [Node() for _ in 1:length(str) * 2]
    st = SuffixTree(nodes, [c for c in str], 1, 0, 0, 0, 0, 1, 1, 1)
    st.root = newnode(st, 0, 0)
    st.activenode = st.root
    for i in 1:length(st.text)
        extendsuffixtree(st, i)
    end
    setsuffixindexbyDFS(st, st.nodes[st.root], 0)
    return st
end

function addsuffixlink(st, nodenum::Int)
    if st.needsuffixlink > 0
        st.nodes[st.needsuffixlink].suffixlink = nodenum
    end
    st.needsuffixlink = nodenum
end

activeedge(st) = st.text[st.activeedge]

function walkdown!(st, currnode::Int)
    len = edgelength(st, st.nodes[currnode])
    st.activelength < len && return false
    st.activeedge += len
    st.activelength -= len
    st.activenode = currnode
    return true
end

function extendsuffixtree(st, pos)
    st.position = pos
    st.needsuffixlink = 0
    st.remainder += 1
    while st.remainder > 0
        st.activelength == 0 && (st.activeedge = st.position)
        if !haskey(st.nodes[st.activenode].children, activeedge(st))
            nodenum = newnode(st, st.position, oo)
            st.nodes[st.activenode].children[activeedge(st)] = nodenum
            addsuffixlink(st, st.activenode)
        else
            next = st.nodes[st.activenode].children[activeedge(st)]
            walkdown!(st, next) && continue
            if st.text[st.nodes[next].start + st.activelength] == st.text[pos]
                addsuffixlink(st, st.activenode)
                st.activelength += 1
                break
            end
            splt = newnode(st, st.nodes[next].start, st.nodes[next].start + st.activelength)
            st.nodes[st.activenode].children[activeedge(st)] = splt
            nodenum = newnode(st, st.position, oo)
            st.nodes[splt].children[st.text[pos]] = nodenum
            st.nodes[next].start += st.activelength
            st.nodes[splt].children[st.text[st.nodes[next].start]] = next
            addsuffixlink(st, splt)
        end
        st.remainder -= 1
        if st.activenode == st.root && st.activelength > 0
            st.activelength -= 1
            st.activeedge = st.position - st.remainder + 1
        elseif st.activenode != st.root
            st.activenode = st.nodes[st.activenode].suffixlink
        end
    end
end

function setsuffixindexbyDFS(st, node, labelheight, verbose=false)
    verbose && node.start > 0 && print(st.text[node.start:min(node.ending, length(st.text))])
    isleaf = true
    for child in map(v -> st.nodes[v], collect(values(node.children)))
        verbose && isleaf && node.start > 0 && println(" [", node.suffixindex, "]")
        isleaf = false
        setsuffixindexbyDFS(st, child, labelheight + edgelength(st, child))
    end
    if isleaf
        idx = length(st.text) - labelheight
        node.suffixindex = idx
        verbose && println(" [$idx]")
    end
end

function dotraversal(st)
    maxheight, substringstartindices = 0, [0]
    function traversal(node::Node, labelheight)
        if node.suffixindex == -1
            for child in map(v -> st.nodes[v], collect(values(node.children)))
                traversal(child, labelheight + edgelength(st, child))
            end
        elseif maxheight < labelheight - edgelength(st, node)
            maxheight = labelheight - edgelength(st, node)
            substringstartindices = [node.suffixindex + 1]
        elseif maxheight == labelheight - edgelength(st, node)
            push!(substringstartindices, node.suffixindex + 1)
        end
    end
    traversal(st.nodes[st.root], 0)
    return maxheight, substringstartindices
end

function getlongestrepeatedsubstring(st::SuffixTree, label="", printresult=true)
    len, starts = dotraversal(st)
    substring = len == 0 ? "" :
        join(unique(map(x -> String(st.text[x:x+len-1]), starts)), " (or) ")
    if printresult
        print("  ", label == "" ? String(st.text) : label, ": ")
        println(len == 0 ? "No repeated substring." : substring)
    end
    return substring
end

function testsuffixtree()
    tests = [
        "CAAAABAAAABD\$",
        "GEEKSFORGEEKS\$",
        "AAAAAAAAAA\$",
        "ABCDEFG\$",
        "ABABABA\$",
        "ATCGATCGA\$",
        "banana\$",
        "abcpqrabpqpq\$",
        "pqrpqpqabab\$",
    ]
    println("Longest Repeated Substring in:\n")
    for test in tests
        st = SuffixTree(test)
        getlongestrepeatedsubstring(st)
    end
    println()
     = ""
    setprecision(4000000) do
         = string(BigFloat(π))[3:end]
    end
    for number in [1000, 10000, 100000, 1000000]
        text = [1:number] * "\$"
        @time begin
            st = SuffixTree(text)
            getlongestrepeatedsubstring(st, "first $number d.p. of π")
        end
    end
end

testsuffixtree()
Output:
Longest Repeated Substring in:

  CAAAABAAAABD: AAAAB
  GEEKSFORGEEKS: GEEKS
  AAAAAAAAAA: AAAAAAAAA
  ABCDEFG: No repeated substring.
  ABABABA: ABABA
  ATCGATCGA: ATCGA
  banana: ana
  abcpqrabpqpq: ab (or) pq
  pqrpqpqabab: ab (or) pq

  first 1000 d.p. of π: 60943 (or) 42019 (or) 82534 (or) 99999 (or) 93751 (or) 23846 (or) 33446
  0.003336 seconds (34.86 k allocations: 4.252 MiB)
  first 10000 d.p. of π: 7111369 (or) 8530614
  0.038749 seconds (351.60 k allocations: 42.460 MiB, 16.54% gc time)
  first 100000 d.p. of π: 134926158 (or) 041021944 (or) 201890888
  0.533892 seconds (3.52 M allocations: 425.035 MiB, 22.42% gc time)
  first 1000000 d.p. of π: 756130190263
  6.008879 seconds (35.25 M allocations: 4.152 GiB, 23.20% gc time)

Nim

Translation of: Julia
Library: Nim-Integers
import std/[sequtils, strformat, strutils, tables]

const ∞ = int.high

type

  # Suffix-tree node.
  Node = ref object
    children: Table[char, int]
    first: int
    last: int
    suffixLink: int
    suffixIndex: int

  # Ukkonen suffix-tree.
  SuffixTree = object
    nodes: seq[Node]
    text: string
    root: int
    position: int
    currentNode: int
    needSuffixLink: int
    remainder: int
    activeNode: int
    activeLength: int
    activeEdge: int


func newNode(): Node =
  Node(first: -1, last: ∞, suffixLink: -1, suffixIndex: -1)

func newNode(first, last: int): Node =
  Node(first: first, last: last, suffixLink: 0, suffixIndex: -1)

func newNode(st: var SuffixTree; first, last: int): int =
  inc st.currentNode
  st.nodes[st.currentNode] = newNode(first, last)
  result = st.currentNode


func activeEdgeChar(st: SuffixTree): char {.inline.} = st.text[st.activeedge]


func edgeLength(st: SuffixTree; n: Node): int =
  min(n.last, st.position + 1) - n.first


func addSuffixLink(st: var SuffixTree; nodenum: int) =
  if st.needSuffixLink >= 0:
    st.nodes[st.needSuffixLink].suffixLink = nodenum
  st.needSuffixLink = nodenum

func walkdown(st: var SuffixTree; currNode: int): bool =
  let length = st.edgeLength(st.nodes[currNode])
  if st.activeLength < length: return false
  inc st.activeEdge, length
  dec st.activeLength, length
  st.activeNode = currnode
  result = true


func extendSuffixTree(st: var SuffixTree; pos: int) =
  st.position = pos
  st.needSuffixLink = -1
  inc st.remainder
  while st.remainder > 0:
    if st.activeLength == 0: st.activeEdge = st.position
    if st.activeEdgeChar() notin st.nodes[st.activeNode].children:
      let nodeNum = st.newNode(st.position, ∞)
      st.nodes[st.activeNode].children[st.activeEdgeChar()] = nodeNum
      st.addSuffixLink(st.activeNode)
    else:
      let next = st.nodes[st.activeNode].children[st.activeEdgeChar()]
      if st.walkdown(next): continue
      if st.text[st.nodes[next].first + st.activeLength] == st.text[pos]:
        st.addSuffixLink(st.activeNode)
        inc st.activeLength
        break
      let split = st.newNode(st.nodes[next].first, st.nodes[next].first + st.activeLength)
      st.nodes[st.activeNode].children[st.activeEdgeChar()] = split
      let nodeNum = st.newNode(st.position, ∞)
      st.nodes[split].children[st.text[pos]] = nodeNum
      st.nodes[next].first += st.activelength
      st.nodes[split].children[st.text[st.nodes[next].first]] = next
      st.addSuffixLink(split)

    dec st.remainder
    if st.activeNode == st.root and st.activeLength > 0:
      dec st.activelength
      st.activeEdge = st.position - st.remainder + 1
    elif st.activeNode != st.root:
      st.activeNode = st.nodes[st.activeNode].suffixLink


proc setSuffixIndexByDFS(st: var SuffixTree; node: Node; labelHeight: Natural; verbose = false) =
  if verbose and node.first >= 0:
    echo st.text[node.first..<min(node.last, st.text.len)]
  var isLeaf = true
  for ichild in node.children.values:
    let child = st.nodes[ichild]
    if verbose and isLeaf and node.first >= 0:
      echo &" [{node.suffixindex}]"
    isLeaf = false
    st.setSuffixIndexbyDFS(child, labelHeight + st.edgeLength(child))
  if isleaf:
    node.suffixindex = st.text.len - labelHeight
    if verbose: echo &" [{node.suffixindex}]"

proc initSuffixTree(str: string): SuffixTree =
  var nodes = newSeqWith(str.len * 2, newNode())
  result = SuffixTree(nodes: nodes, text: str, position: -1, currentNode: -1,
                      needSuffixLink: -1, remainder: 0, activeLength: 1, activeEdge: 0)
  result.root = result.newNode(0, 0)
  result.activeNode = result.root
  for i in 0..result.text.high:
    result.extendSuffixTree(i)
  result.setSuffixIndexByDFS(result.nodes[result.root], 0)


func doTraversal(st: SuffixTree): (int, seq[int]) =
  var maxHeight = 0
  var substringStartIndices = @[-1]

  func traversal(node: Node; labelHeight: int) =
    if node.suffixIndex == -1:
      for ichild in node.children.values:
        let child = st.nodes[ichild]
        traversal(child, labelHeight + st.edgeLength(child))
    elif maxHeight < labelHeight - st.edgeLength(node):
      maxHeight = labelHeight - st.edgeLength(node)
      substringStartIndices = @[node.suffixIndex]
    elif maxHeight == labelHeight - st.edgelength(node):
      substringStartIndices.add node.suffixIndex

  traversal(st.nodes[st.root], 0)
  result = (maxHeight, move(substringStartIndices))


proc getLongestRepeatedSubstring(st: SuffixTree; label = ""; printResult = true): string =
  let (length, starts) = st.dotraversal()
  result = if length == 0: ""
           else: starts.mapIt(st.text[it..it+length-1]).deduplicate().join(" (or) ")
  if printResult:
    stdout.write "  ", if label.len == 0: join(st.text) else: label, ": "
    echo if length == 0: "No repeated substring." else: result



const Tests = ["CAAAABAAAABD$",
                "GEEKSFORGEEKS$",
                "AAAAAAAAAA$",
                "ABCDEFG$",
                "ABABABA$",
                "ATCGATCGA$",
                "banana$",
                "abcpqrabpqpq$",
                "pqrpqpqabab$"]

echo "Longest Repeated Substring in:\n"
for test in Tests:
  var st = initSuffixTree(test)
  discard st.getLongestRepeatedSubstring()
echo()


#############################################################################
# Pi calculation.

import std/[monotimes, times]
import integers

func isr(term, guess: Integer): Integer =
  var term = term
  result = guess
  let value = term * result
  while true:
    if abs(term - result) <= 1:
      break
    result = (result + term) shr 1
    term = value div result

func calcAgm(lam, gm: Integer; z: var Integer; ep: Integer): Integer =
  var am: Integer
  var lam = lam
  var gm = gm
  var n = 1
  while true:
    am = (lam + gm) shr 1
    gm = isr(lam, gm)
    let v = am - lam
    let zi = v * v * n
    if zi < ep:
      break
    z -= zi
    inc n, n
    lam = am
  result = am

func bip(exp: int; man = 1): Integer {.inline.} = man * 10^exp

func calculatePi(): string =
  const Digits = 4_000_000
  let am = bip(Digits)
  let gm = isqrt(bip(Digits + Digits - 1, 5))
  var z = bip(Digits + Digits - 2, 25)
  let agm = calcAGM(am, gm, z, bip(Digits + 1))
  let pi = agm * agm * bip(Digits - 2) div z
  result = $pi

let  = calculatePi()
for number in [1000, 10000, 100000, 1000000]:
  let text = [2..<number] & '$'
  let start = getMonoTime()
  var st = initSuffixTree(text)
  discard st.getlongestrepeatedsubstring(&"first {number} d.p. of π")
  echo &"  → Temps: {(getMonoTime() - start).inMicroseconds} µs"
Output:
Longest Repeated Substring in:

  CAAAABAAAABD$: AAAAB
  GEEKSFORGEEKS$: GEEKS
  AAAAAAAAAA$: AAAAAAAAA
  ABCDEFG$: No repeated substring.
  ABABABA$: ABABA
  ATCGATCGA$: ATCGA
  banana$: ana
  abcpqrabpqpq$: ab (or) pq
  pqrpqpqabab$: ab (or) pq

  first 1000 d.p. of π: 60943 (or) 42019 (or) 82534 (or) 33446 (or) 93751 (or) 99999 (or) 23846
  → Temps: 780 µs
  first 10000 d.p. of π: 7111369 (or) 8530614
  → Temps: 14726 µs
  first 100000 d.p. of π: 134926158 (or) 041021944 (or) 201890888
  → Temps: 178656 µs
  first 1000000 d.p. of π: 756130190263
  → Temps: 1919290 µs

Phix

Translation of: Go
-- demo/rosetta/Ukkonens_Suffix_Tree.exw
with javascript_semantics
integer maxChar = 'z'
 
sequence children = {},
         suffixLinks = {},
         starts = {},
         endIndices = {},
         suffixIndices = {},
         leaves = {}

function new_leaf(integer v=0)
    leaves = append(leaves,v)
    return length(leaves)
end function

string text
integer splitEndIdx,
        rootEndIdx,
        leafEndIdx = new_leaf(),
        root = NULL,
        lastNewNode,
        activeNode,
        activeEdge = -1,
        activeLength = 0,
        remainingSuffixCount = 0,
        size = -1
 
function newNode(integer start, finishIdx, bool bKids=false)
    children = append(children,iff(bKids?repeat(NULL,maxChar):0))
    suffixLinks = append(suffixLinks,root)
    starts = append(starts,start)
    endIndices = append(endIndices,finishIdx)
    suffixIndices = append(suffixIndices,-1)
    return length(children)
end function
 
function edgeLength(integer n)
    return iff(n==root?0:leaves[endIndices[n]] - starts[n] + 1)
end function
 
function walkDown(integer currNode)
    integer l = edgeLength(currNode)
    if activeLength >= l then
        activeEdge += l
        activeLength -= l
        activeNode = currNode
        return true
    end if
    return false
end function
 
procedure extendSuffixTree(integer pos)
    leaves[leafEndIdx] = pos
    remainingSuffixCount += 1
    lastNewNode = NULL
    while remainingSuffixCount > 0 do
        if activeLength == 0 then
            activeEdge = pos
        end if
        integer ta = text[activeEdge]
        bool ca0 = children[activeNode]=0
        integer next = iff(ca0?NULL:children[activeNode][ta])
        if next==null then
            if ca0 then
                children[activeNode] = repeat(NULL,maxChar)
            end if
            children[activeNode][ta] = newNode(pos, leafEndIdx)
            if lastNewNode!=NULL then
                suffixLinks[lastNewNode] = activeNode
                lastNewNode = NULL
            end if
        else
            if walkDown(next) then
                continue
            end if
            integer tp = text[pos]
            if text[starts[next]+activeLength] == tp then
                if lastNewNode!=NULL and activeNode!=root then
                    suffixLinks[lastNewNode] = activeNode
                    lastNewNode = NULL
                end if
                activeLength += 1
                exit
            end if
            integer temp = starts[next] + activeLength - 1
            splitEndIdx = new_leaf(temp)
            integer splitnode = newNode(starts[next], splitEndIdx, true)
            ta = text[activeEdge]
            children[activeNode][ta] = splitnode
            children[splitnode][tp] = newNode(pos, leafEndIdx)
            starts[next] += activeLength
            children[splitnode][text[starts[next]]] = next
            if lastNewNode!=NULL then
                suffixLinks[lastNewNode] = splitnode
            end if
            lastNewNode = splitnode
        end if
        remainingSuffixCount -= 1
        if activeNode==root and activeLength>0 then
            activeLength -= 1
            activeEdge = pos - remainingSuffixCount + 1
        elsif activeNode!=root then
            activeNode = suffixLinks[activeNode]
        end if
    end while
end procedure
 
procedure setSuffixIndexByDFS(integer n, labelHeight)
    if n!=NULL then
        if children[n]=0 then
            suffixIndices[n] = size - labelHeight
        else
            bool leaf = true
            for i=1 to maxChar do
                integer nci = children[n][i]
                if nci!=NULL then
                    leaf = false
                    setSuffixIndexByDFS(nci, labelHeight+edgeLength(nci))
                end if
            end for
            if leaf then ?9/0 end if -- (sanity check)
        end if
    end if
end procedure
 
procedure buildSuffixTree()
    size = length(text)
    rootEndIdx = new_leaf(-1)
    root = newNode(-1, rootEndIdx)
    activeNode = root
    for i=1 to size do
        extendSuffixTree(i)
    end for
    integer labelHeight = 0
    setSuffixIndexByDFS(root, labelHeight)
end procedure
 
procedure doTraversal(integer n, labelHeight, maxHeightIdx, substringStartIndex)
    if n!=NULL then
        integer nsi = suffixIndices[n], newHeight
        if nsi == -1 then
            for i=1 to maxChar do
                integer nci = children[n][i]
                if nci!=NULL then
                    newHeight = labelHeight+edgeLength(nci)
                    doTraversal(nci, newHeight, maxHeightIdx, substringStartIndex)
                end if
            end for
        elsif nsi > -1 then
            newHeight = labelHeight-edgeLength(n)
            if leaves[maxHeightIdx]<newHeight then
                leaves[maxHeightIdx] = newHeight
                leaves[substringStartIndex] = nsi
            end if
        end if
    end if
end procedure
 
function getLongestRepeatedSubstring()
    integer maxHeightIdx = new_leaf(),
            substringStartIndex = new_leaf()
    doTraversal(root, 0, maxHeightIdx, substringStartIndex)
    integer maxHeight = leaves[maxHeightIdx],
            start = leaves[substringStartIndex]
    string t = iff(maxHeight=0?"No repeated substring"
                              :text[start+1..start+maxHeight])
    return t
end function
 
constant tests = {"CAAAABAAAABD$",
                  "GEEKSFORGEEKS$",
                  "AAAAAAAAAA$",
                  "ABCDEFG$",
                  "ABABABA$",
                  "ATCGATCGA$",
                  "banana$",
                  "abcpqrabpqpq$",
                  "pqrpqpqabab$"}
printf(1,"Longest Repeated Substring in:\n")
for i=1 to length(tests) do
    text = tests[i]
    buildSuffixTree()
    printf(1,"  %s is: %s\n", {text,getLongestRepeatedSubstring()})
end for
printf(1,"\n")
 
include mpfr.e
string piStr
if platform()=JS then
    mpfr pi = mpfr_init(0,-1001) -- (set precision to 1,000 dp, plus the "3.")
    mpfr_const_pi(pi)
    piStr = mpfr_get_fixed(pi,1000) -- (all we can really manage under pwa/p2js)
else
    -- gmp crashes when I try 100,000 dp, so just get from file
    piStr = get_text(`E:\downloads\misc\arm\pi-10million.txt`)
end if
piStr = piStr[3..$] -- discard leading "3."
maxChar = '9'
for i=3 to iff(platform()=JS?3:6) do
    atom t0 = time()
    integer n = power(10,i)
    text = piStr[1..n] & "$"
    buildSuffixTree()
    string r = getLongestRepeatedSubstring(),
           e = elapsed(time()-t0)
    printf(1,"  first %,d d.p. of Pi is: %s (%s)\n", {n,r,e})
end for
Output:
Longest Repeated Substring in:
  CAAAABAAAABD$ is: AAAAB
  GEEKSFORGEEKS$ is: GEEKS
  AAAAAAAAAA$ is: AAAAAAAAA
  ABCDEFG$ is: No repeated substring
  ABABABA$ is: ABABA
  ATCGATCGA$ is: ATCGA
  banana$ is: ana
  abcpqrabpqpq$ is: ab
  pqrpqpqabab$ is: ab

  first 1,000 d.p. of Pi is: 23846 (0s)
  first 10,000 d.p. of Pi is: 7111369 (0.0s)
  first 100,000 d.p. of Pi is: 041021944 (0.3s)
  first 1,000,000 d.p. of Pi is: 756130190263 (3.2s)

Note that mpfr_const_pi() struggles to generate more than 1,000 digits of pi under pwa/p2js [and will continue to do so unless someone graciously donates a decent/fast Chudnovsky method in pure Phix or JavaScript...]

Wren

Translation of: Go
Library: Wren-big
Library: Wren-dynamic
Library: Wren-trait

As it would take a very long time to calculate the first 100,000 digits of Pi using the code from the Arithmetic-geometric_mean/Calculate_Pi#Wren task, I have instead saved the digits produced by the Go entry to a file (which only takes a few seconds) and then loaded that into the Wren script.

Having done that, the timings for extracting the longest repeated sequence of digits are reasonably quick and fairly linear as expected.

import "./big" for BigRat
import "./dynamic" for Struct
import "./trait" for ByRef
import "io" for File

var maxChar = 128

var Node = Struct.create("Node", ["children", "suffixLink", "start", "pEnd", "suffixIndex"])

var text                 = ""
var root                 = null
var lastNewNode          = null
var activeNode           = null
var activeEdge           = -1
var activeLength         = 0
var remainingSuffixCount = 0
var pLeafEnd             = ByRef.new(-1)
var pRootEnd             = null
var pSplitEnd            = null
var size                 = -1

var newNode = Fn.new { |start, pEnd|
    var children = List.filled(maxChar, null)
    var suffixLink = root
    var suffixIndex = -1
    return Node.new(children, suffixLink, start, pEnd, suffixIndex)
}

var edgeLength = Fn.new { |n|
    if (n == root) return 0
    return n.pEnd.value - n.start + 1
}

var walkDown = Fn.new { |currNode|
    var el = edgeLength.call(currNode)
    if (activeLength >= el) {
        activeEdge = activeEdge + el
        activeLength = activeLength - el
        activeNode = currNode
        return true
    }
    return false
}

var extendSuffixTree = Fn.new { |pos|
    pLeafEnd.value = pos
    remainingSuffixCount = remainingSuffixCount + 1
    lastNewNode = null
    while (remainingSuffixCount > 0) {
        if (activeLength == 0) activeEdge = pos
        if (!activeNode.children[text[activeEdge].bytes[0]]) {
            activeNode.children[text[activeEdge].bytes[0]] = newNode.call(pos, pLeafEnd)
            if (lastNewNode) {
                lastNewNode.suffixLink = activeNode
                lastNewNode = null
            }
        } else {
            var next = activeNode.children[text[activeEdge].bytes[0]]
            if (walkDown.call(next)) continue
            if (text[next.start + activeLength] == text[pos]) {
                if (lastNewNode && activeNode != root) {
                    lastNewNode.suffixLink = activeNode
                    lastNewNode = null
                }
                activeLength = activeLength + 1
                break
            }
            var temp = next.start + activeLength - 1
            pSplitEnd = ByRef.new(temp)
            var split = newNode.call(next.start, pSplitEnd)
            activeNode.children[text[activeEdge].bytes[0]] = split
            split.children[text[pos].bytes[0]] = newNode.call(pos, pLeafEnd)
            next.start = next.start + activeLength
            split.children[text[next.start].bytes[0]] = next
            if (lastNewNode) lastNewNode.suffixLink = split
            lastNewNode = split
        }
        remainingSuffixCount = remainingSuffixCount - 1
        if (activeNode == root && activeLength > 0) {
            activeLength = activeLength - 1
            activeEdge = pos - remainingSuffixCount + 1
        } else if (activeNode != root) {
            activeNode = activeNode.suffixLink
        }
    }
}

var setSuffixIndexByDFS  // recursive
setSuffixIndexByDFS = Fn.new { |n, labelHeight|
    if (!n) return
    if (n.start != -1) {
        // Uncomment line below to print suffix tree
        // System.write(text[n.start..n.pEnd.value])
    }
    var leaf = 1
    for (i in 0...maxChar) {
        if (n.children[i]) {
            // Uncomment the 3 lines below to print suffix index
            // if (leaf == 1 && n.start != -1) {
            //    System.print(" [%(n.suffixIndex)]")
            // }
            leaf = 0
            setSuffixIndexByDFS.call(n.children[i], labelHeight + edgeLength.call(n.children[i]))
        }
    }
    if (leaf == 1) {
        n.suffixIndex = size - labelHeight
        // Uncomment line below to print suffix index
        // System.print(" [%(n.suffixIndex)]")
    }
}

var buildSuffixTree = Fn.new {
    size = text.count
    var temp = -1
    pRootEnd = ByRef.new(temp)
    root = newNode.call(-1, pRootEnd)
    activeNode = root
    for (i in 0...size) extendSuffixTree.call(i)
    var labelHeight = 0
    setSuffixIndexByDFS.call(root, labelHeight)
}

var doTraversal  // recursive
doTraversal = Fn.new { |n, labelHeight, pMaxHeight, pSubstringStartIndex|
    if (!n) return
    if (n.suffixIndex == -1) {
        for (i in 0...maxChar) {
            if (n.children[i]) {                        
                doTraversal.call(n.children[i], labelHeight + edgeLength.call(n.children[i]),
                    pMaxHeight, pSubstringStartIndex)
             }
        }
    } else if (n.suffixIndex > -1 && (pMaxHeight.value < labelHeight - edgeLength.call(n))) {
        pMaxHeight.value = labelHeight - edgeLength.call(n)
        pSubstringStartIndex.value = n.suffixIndex
    }
}

var getLongestRepeatedSubstring = Fn.new { |s|
    var maxHeight = 0
    var substringStartIndex = 0
    var pMaxHeight = ByRef.new(maxHeight)
    var pSubstringStartIndex = ByRef.new(substringStartIndex)
    doTraversal.call(root, 0, pMaxHeight, pSubstringStartIndex)
    maxHeight = pMaxHeight.value
    substringStartIndex = pSubstringStartIndex.value
    // Uncomment line below to print maxHeight and substringStartIndex
    // System.print("maxHeight %(maxHeight), substringStartIndex %(substringStartIndex)")
    if (s == "") {
        System.write("  %(text) is: ")
    } else {
        System.write("  %(s) is: ")
    }
    var k = 0
    while (k < maxHeight) {
        System.write(text[k + substringStartIndex])
        k = k + 1
    }
    if (k == 0) {
        System.write("No repeated substring")
    }
    System.print()
}

var tests = [
    "GEEKSFORGEEKS$",
    "AAAAAAAAAA$",
    "ABCDEFG$",
    "ABABABA$",
    "ATCGATCGA$",
    "banana$",
    "abcpqrabpqpq$",
    "pqrpqpqabab$",
]
System.print("Longest Repeated Substring in:\n")
for (test in tests) {
    text = test
    buildSuffixTree.call()
    getLongestRepeatedSubstring.call("")
}
System.print()

// load pi to 100,182 digits
var piStr = File.read("pi_100000.txt")
piStr = piStr[2..-1] // remove initial 3.
var numbers = [1e3, 1e4, 1e5]
maxChar = 58
for (number in numbers) {
    var start = System.clock
    text = piStr[0...number] + "$"
    buildSuffixTree.call()
    getLongestRepeatedSubstring.call("first %(number) d.p. of Pi")
    var elapsed = (System.clock - start) * 1000
    System.print("  (this took %(elapsed) ms)\n")
}
Output:
Longest Repeated Substring in:

  GEEKSFORGEEKS$ is: GEEKS
  AAAAAAAAAA$ is: AAAAAAAAA
  ABCDEFG$ is: No repeated substring
  ABABABA$ is: ABABA
  ATCGATCGA$ is: ATCGA
  banana$ is: ana
  abcpqrabpqpq$ is: ab
  pqrpqpqabab$ is: ab

  first 1000 d.p. of Pi is: 23846
  (this took 9.987 ms)

  first 10000 d.p. of Pi is: 7111369
  (this took 89.12 ms)

  first 100000 d.p. of Pi is: 041021944
  (this took 1031.072 ms)