I'm working on modernizing Rosetta Code's infrastructure. Starting with communications. Please accept this time-limited open invite to RC's Slack.. --Michael Mol (talk) 20:59, 30 May 2020 (UTC)

Knuth-Morris-Pratt string search

From Rosetta Code
Knuth-Morris-Pratt string search is a draft programming task. It is not yet considered ready to be promoted as a complete task, for reasons that should be found in its talk page.

< About Knuth-Morris-Pratt String Search Algorithm >

Emacs Lisp[edit]

 
(defun kmp_compile_pattern (pattern)
"Compile pattern to DFA."
 
(defun create-2d-array (x y init)
(let ((arr1 (make-vector x nil)))
(dotimes (i x)
(aset arr1 i (make-vector y init)) )
arr1 ) )
 
(let* ((patLen (length pattern))
(R 256)
(restartPos 0)
(dfa (create-2d-array R patLen 0)))
 
(aset (aref dfa (elt pattern 0)) 0 1)
 
(let ((patPos 0))
(while (progn (setq patPos (1+ patPos)) (< patPos patLen))
(dotimes (c R)
(aset (aref dfa c) patPos (aref (aref dfa c) restartPos)) )
 
(aset (aref dfa (elt pattern patPos)) patPos (1+ patPos))
(setq restartPos
(aref (aref dfa (elt pattern patPos)) restartPos) )
)
)
dfa )
)
 
(defun kmp_search (pattern text)
"Pattern search with KMP algorithm."
(let ((dfa (kmp_compile_pattern pattern)))
 
(let ((textPos 0) (patPos 0) (N (length text)) (M (length pattern)))
(while (and (< textPos N) (< patPos M))
(setq patPos (aref (aref dfa (elt text textPos)) patPos))
(setq textPos (1+ textPos)) )
 
(if (= patPos M) (- textPos M) N ) ) ) )
 

J[edit]

Implementation:
kmp_table=: {{
j=. 0
T=. _1
for_ch. }.y do.
if. ch=j{y do.
T=. T,j{T
else.
T=. T,j while. (0<:j) * ch~:j{y do. j=. j{T end.
end.
j=. j+1
end.
T=. T, j
}}
 
kmp_search=: {{
b=. 0#~#y
k=. _1
f=. _1+#x
T=. kmp_table x
for_ch. y do.
if. ch=x{~k=.k+1 do.
if. f=k do.
b=. 1 (ch_index-k)} b
k=. k{T
end.
else.
whilst. _1<k do.
if. ch=x{~k=. k{T do. break. end.
end.
end.
end.
I. b
}}
Task examples:
   text1=: 'InhisbookseriesTheArtofComputerProgrammingpublishedbyAddisonWesleyDKnuthusesanimaginarycomputertheMIXanditsassociatedmachinecodeandassemblylanguagestoillustratetheconceptsandalgorithmsastheyarepresented'
text2=: 'Nearby farms grew a half acre of alfalfa on the dairy''s behalf, with bales of all that alfalfa exchanged for milk.'
'put' kmp_search text1
26 90
'and' kmp_search text1
101 128 171
'and' kmp_search text2
 
'alfalfa' kmp_search text2
33 87

(J uses index 0 for the first character in a sequence of characters.)

Julia[edit]

"""
function kmp_table(W)
 
input:
an array of characters, W (the word to be analyzed)
output:
an array of integers, T (the table to be filled)
define variables:
an integer, i ← 2 (the current one-based position we are computing in T)
an integer, j ← 0 (the additive to index i in W of the next character of the current candidate substring)
"
""
function kmp_table(W)
len = length(W)
T = zeros(Int, len)
# start with the second letter of W, looking for patterns in W
i = 2
while i < len
j = 0
while i + j <= len # avoid overshooting end with index
# compute the longest proper prefix
if W[i + j] == W[j + 1]
T[i + j] = T[i + j - 1] + 1
else
T[i + j] = 0 # back to start
j += 1
break
end
j += 1
end
# entry in T found, so begin at next starting point along W
i += j
end
return T
end
 
"""
function kmp_search(S, W)
 
input:
an array of characters, S (the text to be searched)
an array of characters, W (the word sought)
output:
an array of integers, P (positions in S at which W is found)
 
define variables (one based indexing in Julia differs from the Wikipedia example):
an integer, i ← 1 (the position of the current character in S)
an integer, j ← 1 (the position of the current character in W)
an array of integers, T (the table, computed elsewhere)
"
""
function kmp_search(S, W)
lenW, lenS = length(W), length(S)
i, P = 1, Int[]
T = kmp_table(W) # get pattern table
while i <= lenS - lenW + 1
for j in 1:lenW
if S[i + j - 1] != W[j]
# pattern not found, so skip unnecessary inner loops
i += T[j] + 1
@goto next_outer_loop
end
end
# found pattern W in S, so add to output P
push!(P, i)
i += 1
@label next_outer_loop
end
return P
end
 
const text1 = "InhisbookseriesTheArtofComputerProgrammingpublishedbyAddisonWesleyDKnuthusesanimaginarycomputertheMIXanditsassociatedmachinecodeandassemblylanguagestoillustratetheconceptsandalgorithmsastheyarepresented"
const text2 = "Nearby farms grew a half acre of alfalfa on the dairy's behalf, with bales of all that alfalfa exchanged for milk."
const pat1, pat2, pat3 = "put", "and", "alfalfa"
 
println("Found <$pat1> at: (one-based indices) ", kmp_search(text1, pat1))
println("Found <$pat2> at: (one-based indices) ", kmp_search(text1, pat2))
println("Found <$pat3> at: (one-based indices) ", kmp_search(text2, pat3))
 
Output:
Found <put> at: (one-based indices) [27, 91]
Found <and> at: (one-based indices) [102, 129, 172]
Found <alfalfa> at: (one-based indices) [34, 88]

Phix[edit]

--
-- demo\rosetta\KnuthMorrisPratt.exw
-- =================================
--
-- based on https://www-igm.univ-mlv.fr/~lecroq/string/node8.html#SECTION0080
--
with javascript_semantics
function preKmp(string pat)
    integer m = length(pat), i = 1, j = 0
    sequence kmpNext = repeat(-1,m+1)
    pat &= '\0'
    while i <= m do
        while j > 0 and pat[i] != pat[j] do
            j = kmpNext[j]+1
        end while
        i += 1
        j += 1
        if pat[i] == pat[j] then
            kmpNext[i] = kmpNext[j]
        else
            kmpNext[i] = j-1
        end if
    end while
    return kmpNext
end function

procedure KMP(string pat, s, bool case_insensitive = false)
    string pins = sprintf("`%s` in `%s`",{pat,shorten(s,"characters",10)})
    if case_insensitive then {pat,s} = lower({pat,s}) end if
    /* Preprocessing */
    sequence kmpNext = preKmp(pat)
    /* Searching */
    sequence res = {}
    integer i = 0, j = 0,
            n = length(s),
            m = length(pat),
            cc = 0
    while j < n do
        while i > -1 do
            cc += 1
            if pat[i+1] = s[j+1] then exit end if
            i = kmpNext[i+1]
        end while
        i += 1
        j += 1
        if i >= m then
            res &= j-i+1
            i = kmpNext[i+1]
        end if
    end while
    /* Output */
    string ccs = sprintf("(%d character comparisons)",cc)
    if length(res) then
        string many = ordinant(length(res))
        printf(1,"Found %s %s at %v %s\n",{pins,many,res,ccs})
    else
        printf(1,"No %s %s\n",{pins,ccs})
    end if
end procedure

KMP("GCAGAGAG","GCATCGCAGAGAGTATACAGTACG")
KMP("TCTA","GCTAGCTCTACGAGTCTA")
KMP("TAATAAA","GGCTATAATGCGTA")
KMP("word","there would have been a time for such a word")
KMP("needle","needle need noodle needle")
constant book = "InhisbookseriesTheArtofComputerProgrammingpublishedbyAddisonWesley"&
                "DKnuthusesanimaginarycomputertheMIXanditsassociatedmachinecodeand"&
                "assemblylanguagestoillustratetheconceptsandalgorithmsastheyarepresented"
KMP("put",book)
KMP("and",book)
constant farm = "Nearby farms grew a half acre of alfalfa on the dairy's behalf, with "&
                "bales of all that alfalfa exchanged for milk."
KMP("alfalfa",farm)
--KMP("aLfAlfa",farm)       -- none
--KMP("aLfAlfa",farm,true)  -- as -2
Output:

Significantly higher character comparison counts than Boyer-Moore_string_search#Phix.
Also higher than those on the above (lecroq) link, probably because this carries on for all matches.

Found `GCAGAGAG` in `GCATCGCAGAGAGTATACAGTACG` once at {6} (26 character comparisons)
Found `TCTA` in `GCTAGCTCTACGAGTCTA` twice at {7,15} (19 character comparisons)
No `TAATAAA` in `GGCTATAATGCGTA` (16 character comparisons)
Found `word` in `there woul...uch a word (44 characters)` once at {41} (45 character comparisons)
Found `needle` in `needle need noodle needle` twice at {1,20} (27 character comparisons)
Found `put` in `Inhisbooks...epresented (202 characters)` twice at {27,91} (205 character comparisons)
Found `and` in `Inhisbooks...epresented (202 characters)` three times at {102,129,172} (216 character comparisons)
Found `alfalfa` in `Nearby far... for milk. (114 characters)` twice at {34,88} (125 character comparisons)

Python[edit]

Based on pseudocode found in the Wikipedia article. Uses test cases from the #Wren solution.

"""Knuth-Morris-Pratt string search. Requires Python >= 3.7."""
from typing import List
from typing import Tuple
 
 
def kmp_search(text: str, word: str) -> Tuple[List[int], int]:
text_position = 0
word_position = 0
 
positions: List[int] = []
number_of_positions = 0
table = kmp_table(word)
 
while text_position < len(text):
if word[word_position] == text[text_position]:
text_position += 1
word_position += 1
if word_position == len(word):
positions.append(text_position - word_position)
number_of_positions += 1
word_position = table[word_position]
else:
word_position = table[word_position]
if word_position < 0:
text_position += 1
word_position += 1
 
return positions, number_of_positions
 
 
def kmp_table(word: str) -> List[int]:
position = 1
candidate = 0
 
table = [0] * (len(word) + 1)
table[0] = -1
 
while position < len(word):
if word[position] == word[candidate]:
table[position] = table[candidate]
else:
table[position] = candidate
while candidate >= 0 and word[position] != word[candidate]:
candidate = table[candidate]
position += 1
candidate += 1
 
table[position] = candidate
return table
 
 
TEST_CASES = [
("GCTAGCTCTACGAGTCTA", "TCTA"),
("GGCTATAATGCGTA", "TAATAAA"),
("there would have been a time for such a word", "word"),
("needle need noodle needle", "needle"),
(
(
"InhisbookseriesTheArtofComputerProgrammingpublishedbyAddisonWesley"
"DKnuthusesanimaginarycomputertheMIXanditsassociatedmachinecodeand"
"assemblylanguagestoillustratetheconceptsandalgorithmsastheyare"
"presented"
),
"put",
),
(
(
"InhisbookseriesTheArtofComputerProgrammingpublishedbyAddisonWesley"
"DKnuthusesanimaginarycomputertheMIXanditsassociatedmachinecodeand"
"assemblylanguagestoillustratetheconceptsandalgorithmsastheyare"
"presented"
),
"and",
),
(
(
"Nearby farms grew a half acre of alfalfa on the dairy's behalf, "
"with bales of all that alfalfa exchanged for milk."
),
"alfalfa",
),
]
 
 
def main():
for text, word in TEST_CASES:
positions, number_of_positions = kmp_search(text, word)
 
if number_of_positions == 0:
print(f"`{word}` not found in `{text[:10]}...`")
elif number_of_positions == 1:
print(f"Found `{word}` in `{text[:10]}...` once at {positions}.")
else:
print(
f"Found `{word}` in `{text[:10]}...` {number_of_positions} times at {positions}."
)
 
 
if __name__ == "__main__":
main()
 
Output:
Found `TCTA` in `GCTAGCTCTA...` 2 times at [6, 14].
`TAATAAA` not found in `GGCTATAATG...`
Found `word` in `there woul...` once at [40].
Found `needle` in `needle nee...` 2 times at [0, 19].
Found `put` in `Inhisbooks...` 2 times at [26, 90].
Found `and` in `Inhisbooks...` 3 times at [101, 128, 171].
Found `alfalfa` in `Nearby far...` 2 times at [33, 87].

Raku[edit]

Based on pseudocode found in WP (kmp_search & kmp_table). Data and output format mostly follows the Wren entry.

# 20220810 Raku programming solution
 
sub kmp_search (@S where *.Bool, @W where *.Bool) {
 
sub kmp_table (@W where *.Bool) {
loop (my ($pos,$cnd,@T) = 1,0,-1 ; $pos < @W.elems ; ($pos, $cnd)>>++) {
if @W[$pos] eq @W[$cnd] {
@T[$pos] = @T[$cnd]
} else {
@T[$pos] = $cnd;
while $cnd0 and @W[$pos] ne @W[$cnd] { $cnd = @T[$cnd] }
}
}
@T[$pos] = $cnd;
return @T
}
 
return gather loop (my ($j,$k,@T) = 0,0, |kmp_table @W; $j < @S.elems; ) {
if @W[$k] eq @S[$j] {
($j, $k)>>++;
if $k == @W.elems {
take $j - $k;
$k = @T[$k]
}
} else {
($j, $k)>>++ if ($k = @T[$k]) < 0
}
}
}
 
my @texts = [
"GCTAGCTCTACGAGTCTA",
"GGCTATAATGCGTA",
"there would have been a time for such a word",
"needle need noodle needle",
"BharôtভাৰতBharôtভারতIndiaBhāratભારતBhāratभारतBhārataಭಾರತBhāratभारतBhāratamഭാരതംBhāratभारतBhāratभारतBharôtôଭାରତBhāratਭਾਰਤBhāratamभारतम्Bārataபாரதம்BhāratamഭാരതംBhāratadēsamభారతదేశం",
"InhisbookseriesTheArtofComputerProgrammingpublishedbyAddisonWesleyDKnuthusesanimaginarycomputertheMIXanditsassociatedmachinecodeandassemblylanguagestoillustratetheconceptsandalgorithmsastheyarepresented",
"Nearby farms grew a half acre of alfalfa on the dairy's behalf, with bales of all that alfalfa exchanged for milk.",
];
 
my @pats = ["TCTA", "TAATAAA", "word", "needle", "ഭാരതം", "put", "and", "alfalfa"];
 
say "text$_ = @texts[$_]" for @texts.keys ;
say();
 
for @pats.keys {
my \j = $_ < 6 ?? $_ !! $_-1 ; # for searching text5 twice
say "Found '@pats[$_]' in 'text{j}' at indices ", kmp_search @texts[j].comb, @pats[$_].comb
}
Output:
text0 = GCTAGCTCTACGAGTCTA
text1 = GGCTATAATGCGTA
text2 = there would have been a time for such a word
text3 = needle need noodle needle
text4 = BharôtভাৰতBharôtভারতIndiaBhāratભારતBhāratभारतBhārataಭಾರತBhāratभारतBhāratamഭാരതംBhāratभारतBhāratभारतBharôtôଭାରତBhāratਭਾਰਤBhāratamभारतम्Bārataபாரதம்BhāratamഭാരതംBhāratadēsamభారతదేశం
text5 = InhisbookseriesTheArtofComputerProgrammingpublishedbyAddisonWesleyDKnuthusesanimaginarycomputertheMIXanditsassociatedmachinecodeandassemblylanguagestoillustratetheconceptsandalgorithmsastheyarepresented
text6 = Nearby farms grew a half acre of alfalfa on the dairy's behalf, with bales of all that alfalfa exchanged for milk.

Found 'TCTA' in 'text0' at indices (6 14)
Found 'TAATAAA' in 'text1' at indices ()
Found 'word' in 'text2' at indices (40)
Found 'needle' in 'text3' at indices (0 19)
Found 'ഭാരതം' in 'text4' at indices (68 138)
Found 'put' in 'text5' at indices (26 90)
Found 'and' in 'text5' at indices (101 128 171)
Found 'alfalfa' in 'text6' at indices (33 87)

Wren[edit]

This is based on the code here. The examples used are the same as in the Boyer-Moore_string_search#Wren task.

class KMP {
static search(haystack, needle) {
haystack = haystack.bytes.toList
needle = needle.bytes.toList
var hc = haystack.count
var nc = needle.count
var indices = []
var i = 0 // index into haystack
var j = 0 // index into needle
var t = table_(needle)
while (i < hc) {
if (needle[j] == haystack[i]) {
i = i + 1
j = j + 1
}
if (j == nc) {
indices.add(i - j)
j = t[j-1]
} else if (i < hc && needle[j] != haystack[i]) {
if (j != 0) {
j = t[j-1]
} else {
i = i + 1
}
}
}
return indices
}
 
static table_(needle) {
var nc = needle.count
var t = List.filled(nc, 0)
var i = 1 // index into table
var len = 0 // length of previous longest prefix
while (i < nc) {
if (needle[i] == needle[len]) {
len = len + 1
t[i] = len
i = i + 1
} else if (len != 0) {
len = t[len-1]
} else {
t[i] = 0
i = i + 1
}
}
return t
}
}
 
var texts = [
"GCTAGCTCTACGAGTCTA",
"GGCTATAATGCGTA",
"there would have been a time for such a word",
"needle need noodle needle",
"InhisbookseriesTheArtofComputerProgrammingpublishedbyAddisonWesleyDKnuthusesanimaginarycomputertheMIXanditsassociatedmachinecodeandassemblylanguagestoillustratetheconceptsandalgorithmsastheyarepresented",
"Nearby farms grew a half acre of alfalfa on the dairy's behalf, with bales of all that alfalfa exchanged for milk."
]
var pats = ["TCTA", "TAATAAA", "word", "needle", "put", "and", "alfalfa"]
for (i in 0...texts.count) System.print("text%(i+1) = %(texts[i])")
System.print()
for (i in 0...pats.count) {
var j = (i < 5) ? i : i-1
System.print("Found '%(pats[i])' in 'text%(j+1)' at indices %(KMP.search(texts[j], pats[i]))")
}
Output:
text1 = GCTAGCTCTACGAGTCTA
text2 = GGCTATAATGCGTA
text3 = there would have been a time for such a word
text4 = needle need noodle needle
text5 = InhisbookseriesTheArtofComputerProgrammingpublishedbyAddisonWesleyDKnuthusesanimaginarycomputertheMIXanditsassociatedmachinecodeandassemblylanguagestoillustratetheconceptsandalgorithmsastheyarepresented
text6 = Nearby farms grew a half acre of alfalfa on the dairy's behalf, with bales of all that alfalfa exchanged for milk.

Found 'TCTA' in 'text1' at indices [6, 14]
Found 'TAATAAA' in 'text2' at indices []
Found 'word' in 'text3' at indices [40]
Found 'needle' in 'text4' at indices [0, 19]
Found 'put' in 'text5' at indices [26, 90]
Found 'and' in 'text5' at indices [101, 128, 171]
Found 'alfalfa' in 'text6' at indices [33, 87]