Knuth-Morris-Pratt string search

< About Knuth-Morris-Pratt String Search Algorithm >

Task examples:<lang J> text1=: 'InhisbookseriesTheArtofComputerProgrammingpublishedbyAddisonWesleyDKnuthusesanimaginarycomputertheMIXanditsassociatedmachinecodeandassemblylanguagestoillustratetheconceptsandalgorithmsastheyarepresented'

  text2=: 'Nearby farms grew a half acre of alfalfa on the dairys behalf, with bales of all that alfalfa exchanged for milk.'
  'put' kmp_search text1

26 90

  'and' kmp_search text1

101 128 171

  'and' kmp_search text2

  'alfalfa' kmp_search text2

33 87</lang>

(J uses index 0 for the first character in a sequence of characters.)

Julia

<lang ruby>"""

   function kmp_table(W)

input:

   an array of characters, W (the word to be analyzed)

output:

   an array of integers, T (the table to be filled)

define variables:

   an integer, i ← 2 (the current one-based position we are computing in T)
   an integer, j ← 0 (the additive to index i in W of the next character of the current candidate substring)

""" function kmp_table(W)

   len = length(W)
   T = zeros(Int, len)
   # start with the second letter of W, looking for patterns in W
   i = 2
   while i < len
       j = 0
       while i + j <= len # avoid overshooting end with index
           # compute the longest proper prefix
           if W[i + j] == W[j + 1]
               T[i + j] = T[i + j - 1] + 1
           else
               T[i + j] = 0 # back to start
               j += 1
               break
           end
           j += 1
       end       
       # entry in T found, so begin at next starting point along W
       i += j
   end
   return T

end

"""

   function kmp_search(S, W)

input:

   an array of characters, S (the text to be searched)
   an array of characters, W (the word sought)

output:

   an array of integers, P (positions in S at which W is found)

define variables (one based indexing in Julia differs from the Wikipedia example):

   an integer, i ← 1 (the position of the current character in S)
   an integer, j ← 1 (the position of the current character in W)
   an array of integers, T (the table, computed elsewhere)

""" function kmp_search(S, W)

   lenW, lenS = length(W), length(S)    
   i, P = 1, Int[]      
   T = kmp_table(W) # get pattern table
   while i <= lenS - lenW + 1
       for j in 1:lenW
           if S[i + j - 1] != W[j]
               # pattern not found, so skip unnecessary inner loops
               i += T[j] + 1
               @goto next_outer_loop
           end
       end
       # found pattern W in S, so add to output P
       push!(P, i)
       i += 1
       @label next_outer_loop
   end                
   return P

end

const text1 = "InhisbookseriesTheArtofComputerProgrammingpublishedbyAddisonWesleyDKnuthusesanimaginarycomputertheMIXanditsassociatedmachinecodeandassemblylanguagestoillustratetheconceptsandalgorithmsastheyarepresented" const text2 = "Nearby farms grew a half acre of alfalfa on the dairy's behalf, with bales of all that alfalfa exchanged for milk." const pat1, pat2, pat3 = "put", "and", "alfalfa"

println("Found <$pat1> at: (one-based indices) ", kmp_search(text1, pat1)) println("Found <$pat2> at: (one-based indices) ", kmp_search(text1, pat2)) println("Found <$pat3> at: (one-based indices) ", kmp_search(text2, pat3))

</lang>

Output:

Found <put> at: (one-based indices) [27, 91]
Found <and> at: (one-based indices) [102, 129, 172]
Found <alfalfa> at: (one-based indices) [34, 88]

Phix

--
-- demo\rosetta\KnuthMorrisPratt.exw
-- =================================
--
-- based on https://www-igm.univ-mlv.fr/~lecroq/string/node8.html#SECTION0080
--
with javascript_semantics
function preKmp(string pat)
    integer m = length(pat), i = 1, j = 0
    sequence kmpNext = repeat(-1,m+1)
    pat &= '\0'
    while i <= m do
        while j > 0 and pat[i] != pat[j] do
            j = kmpNext[j]+1
        end while
        i += 1
        j += 1
        if pat[i] == pat[j] then
            kmpNext[i] = kmpNext[j]
        else
            kmpNext[i] = j-1
        end if
    end while
    return kmpNext
end function

procedure KMP(string pat, s, bool case_insensitive = false)
    string pins = sprintf("`%s` in `%s`",{pat,shorten(s,"characters",10)})
    if case_insensitive then {pat,s} = lower({pat,s}) end if
    /* Preprocessing */
    sequence kmpNext = preKmp(pat)
    /* Searching */
    sequence res = {}
    integer i = 0, j = 0,
            n = length(s),
            m = length(pat),
            cc = 0
    while j < n do
        while i > -1 do
            cc += 1
            if pat[i+1] = s[j+1] then exit end if
            i = kmpNext[i+1]
        end while
        i += 1
        j += 1
        if i >= m then
            res &= j-i+1
            i = kmpNext[i+1]
        end if
    end while
    /* Output */
    string ccs = sprintf("(%d character comparisons)",cc)
    if length(res) then
        string many = ordinant(length(res))
        printf(1,"Found %s %s at %v %s\n",{pins,many,res,ccs})
    else
        printf(1,"No %s %s\n",{pins,ccs})
    end if
end procedure

KMP("GCAGAGAG","GCATCGCAGAGAGTATACAGTACG")
KMP("TCTA","GCTAGCTCTACGAGTCTA")
KMP("TAATAAA","GGCTATAATGCGTA")
KMP("word","there would have been a time for such a word")
KMP("needle","needle need noodle needle")
constant book = "InhisbookseriesTheArtofComputerProgrammingpublishedbyAddisonWesley"&
                "DKnuthusesanimaginarycomputertheMIXanditsassociatedmachinecodeand"&
                "assemblylanguagestoillustratetheconceptsandalgorithmsastheyarepresented"
KMP("put",book)
KMP("and",book)
constant farm = "Nearby farms grew a half acre of alfalfa on the dairy's behalf, with "&
                "bales of all that alfalfa exchanged for milk."
KMP("alfalfa",farm)
--KMP("aLfAlfa",farm)       -- none
--KMP("aLfAlfa",farm,true)  -- as -2

Output:

Significantly higher character comparison counts than Boyer-Moore_string_search#Phix.
Also higher than those on the above (lecroq) link, probably because this carries on for all matches.

Found `GCAGAGAG` in `GCATCGCAGAGAGTATACAGTACG` once at {6} (26 character comparisons)
Found `TCTA` in `GCTAGCTCTACGAGTCTA` twice at {7,15} (19 character comparisons)
No `TAATAAA` in `GGCTATAATGCGTA` (16 character comparisons)
Found `word` in `there woul...uch a word (44 characters)` once at {41} (45 character comparisons)
Found `needle` in `needle need noodle needle` twice at {1,20} (27 character comparisons)
Found `put` in `Inhisbooks...epresented (202 characters)` twice at {27,91} (205 character comparisons)
Found `and` in `Inhisbooks...epresented (202 characters)` three times at {102,129,172} (216 character comparisons)
Found `alfalfa` in `Nearby far... for milk. (114 characters)` twice at {34,88} (125 character comparisons)

Python

Based on pseudocode found in the Wikipedia article. Uses test cases from the #Wren solution. <lang python>"""Knuth-Morris-Pratt string search. Requires Python >= 3.7.""" from typing import List from typing import Tuple

def kmp_search(text: str, word: str) -> Tuple[List[int], int]:

   text_position = 0
   word_position = 0

   positions: List[int] = []
   number_of_positions = 0
   table = kmp_table(word)

   while text_position < len(text):
       if word[word_position] == text[text_position]:
           text_position += 1
           word_position += 1
           if word_position == len(word):
               positions.append(text_position - word_position)
               number_of_positions += 1
               word_position = table[word_position]
       else:
           word_position = table[word_position]
           if word_position < 0:
               text_position += 1
               word_position += 1

   return positions, number_of_positions

def kmp_table(word: str) -> List[int]:

   position = 1
   candidate = 0

   table = [0] * (len(word) + 1)
   table[0] = -1

   while position < len(word):
       if word[position] == word[candidate]:
           table[position] = table[candidate]
       else:
           table[position] = candidate
           while candidate >= 0 and word[position] != word[candidate]:
               candidate = table[candidate]
       position += 1
       candidate += 1

   table[position] = candidate
   return table

TEST_CASES = [

   ("GCTAGCTCTACGAGTCTA", "TCTA"),
   ("GGCTATAATGCGTA", "TAATAAA"),
   ("there would have been a time for such a word", "word"),
   ("needle need noodle needle", "needle"),
   (
       (
           "InhisbookseriesTheArtofComputerProgrammingpublishedbyAddisonWesley"
           "DKnuthusesanimaginarycomputertheMIXanditsassociatedmachinecodeand"
           "assemblylanguagestoillustratetheconceptsandalgorithmsastheyare"
           "presented"
       ),
       "put",
   ),
   (
       (
           "InhisbookseriesTheArtofComputerProgrammingpublishedbyAddisonWesley"
           "DKnuthusesanimaginarycomputertheMIXanditsassociatedmachinecodeand"
           "assemblylanguagestoillustratetheconceptsandalgorithmsastheyare"
           "presented"
       ),
       "and",
   ),
   (
       (
           "Nearby farms grew a half acre of alfalfa on the dairy's behalf, "
           "with bales of all that alfalfa exchanged for milk."
       ),
       "alfalfa",
   ),

]

def main():

   for text, word in TEST_CASES:
       positions, number_of_positions = kmp_search(text, word)

       if number_of_positions == 0:
           print(f"`{word}` not found in `{text[:10]}...`")
       elif number_of_positions == 1:
           print(f"Found `{word}` in `{text[:10]}...` once at {positions}.")
       else:
           print(
               f"Found `{word}` in `{text[:10]}...` {number_of_positions} times at {positions}."
           )

if __name__ == "__main__":

   main()

</lang>

Output:

Found `TCTA` in `GCTAGCTCTA...` 2 times at [6, 14].
`TAATAAA` not found in `GGCTATAATG...`
Found `word` in `there woul...` once at [40].
Found `needle` in `needle nee...` 2 times at [0, 19].
Found `put` in `Inhisbooks...` 2 times at [26, 90].
Found `and` in `Inhisbooks...` 3 times at [101, 128, 171].
Found `alfalfa` in `Nearby far...` 2 times at [33, 87].

Wren

This is based on the code here. The examples used are the same as in the Boyer-Moore_string_search#Wren task. <lang ecmascript>class KMP {

   static search(haystack, needle) {
       haystack = haystack.bytes.toList
       needle = needle.bytes.toList
       var hc = haystack.count
       var nc = needle.count
       var indices = []
       var i = 0 // index into haystack
       var j = 0 // index into needle
       var t = table_(needle)
       while (i < hc) {
           if (needle[j] == haystack[i]) {
               i = i + 1
               j = j + 1
           }
           if (j == nc) {
               indices.add(i - j)
               j = t[j-1]
           } else if (i < hc && needle[j] != haystack[i]) {
               if (j != 0) {
                   j = t[j-1]
               } else {
                   i = i + 1
               }
           }
       }
       return indices
   }

   static table_(needle) {
       var nc = needle.count
       var t = List.filled(nc, 0)
       var i = 1   // index into table
       var len = 0 // length of previous longest prefix
       while (i < nc) {
           if (needle[i] == needle[len]) {
              len = len + 1
              t[i] = len
              i = i + 1
           } else if (len != 0) {
               len = t[len-1]
           } else {
               t[i] = 0
               i = i + 1
           }
       }
       return t
   }

}

var texts = [

   "GCTAGCTCTACGAGTCTA",
   "GGCTATAATGCGTA",
   "there would have been a time for such a word",
   "needle need noodle needle",

"InhisbookseriesTheArtofComputerProgrammingpublishedbyAddisonWesleyDKnuthusesanimaginarycomputertheMIXanditsassociatedmachinecodeandassemblylanguagestoillustratetheconceptsandalgorithmsastheyarepresented",

   "Nearby farms grew a half acre of alfalfa on the dairy's behalf, with bales of all that alfalfa exchanged for milk."

] var pats = ["TCTA", "TAATAAA", "word", "needle", "put", "and", "alfalfa"] for (i in 0...texts.count) System.print("text%(i+1) = %(texts[i])") System.print() for (i in 0...pats.count) {

   var j = (i < 5) ? i : i-1
   System.print("Found '%(pats[i])' in 'text%(j+1)' at indices %(KMP.search(texts[j], pats[i]))")

}</lang>

Output:

text1 = GCTAGCTCTACGAGTCTA
text2 = GGCTATAATGCGTA
text3 = there would have been a time for such a word
text4 = needle need noodle needle
text5 = InhisbookseriesTheArtofComputerProgrammingpublishedbyAddisonWesleyDKnuthusesanimaginarycomputertheMIXanditsassociatedmachinecodeandassemblylanguagestoillustratetheconceptsandalgorithmsastheyarepresented
text6 = Nearby farms grew a half acre of alfalfa on the dairy's behalf, with bales of all that alfalfa exchanged for milk.

Found 'TCTA' in 'text1' at indices [6, 14]
Found 'TAATAAA' in 'text2' at indices []
Found 'word' in 'text3' at indices [40]
Found 'needle' in 'text4' at indices [0, 19]
Found 'put' in 'text5' at indices [26, 90]
Found 'and' in 'text5' at indices [101, 128, 171]
Found 'alfalfa' in 'text6' at indices [33, 87]

Emacs Lisp

J

Julia

Phix

Python

Wren