Knuth-Morris-Pratt string search: Difference between revisions

From Rosetta Code
Content added Content deleted
(julia example)
(Added Wren)
Line 129: Line 129:
Found <and> at: (one-based indices) [102, 129, 172]
Found <and> at: (one-based indices) [102, 129, 172]
Found <alfalfa> at: (one-based indices) [34, 88]
Found <alfalfa> at: (one-based indices) [34, 88]
</pre>

=={{header|Wren}}==
This is based on the code [https://www.geeksforgeeks.org/kmp-algorithm-for-pattern-searching/ here]. The examples used are the same as in the [[Boyer-Moore_string_search#Wren]] task.
<lang ecmascript>class KMP {
// Finds every occurrence of [needle] in [haystack] with the
// Knuth-Morris-Pratt algorithm.
//
// Both arguments are converted to byte lists, so the returned list holds
// zero-based byte offsets into [haystack]. Overlapping occurrences are
// reported. An empty needle, or one longer than the haystack, yields [].
static search(haystack, needle) {
    haystack = haystack.bytes.toList
    needle = needle.bytes.toList
    var hc = haystack.count
    var nc = needle.count
    var indices = []
    // With an empty needle the needle[j] lookup below would abort with a
    // subscript error; a needle longer than the haystack can never match.
    if (nc == 0 || nc > hc) return indices
    var i = 0 // index into haystack
    var j = 0 // index into needle
    var t = table_(needle)
    while (i < hc) {
        if (needle[j] == haystack[i]) {
            i = i + 1
            j = j + 1
        }
        if (j == nc) {
            // Whole needle matched, ending just before i. Resume from the
            // table entry so that overlapping occurrences are still found.
            indices.add(i - j)
            j = t[j-1]
        } else if (i < hc && needle[j] != haystack[i]) {
            if (j != 0) {
                // Fall back inside the needle without moving i backwards.
                j = t[j-1]
            } else {
                // Nothing matched at this position: advance the haystack.
                i = i + 1
            }
        }
    }
    return indices
}

// Builds the prefix table for [needle] (a list of bytes): entry k is the
// length of the longest proper prefix of needle[0..k] that is also a
// suffix of it.
static table_(needle) {
    var size = needle.count
    var lps = List.filled(size, 0)
    var border = 0 // length of the prefix currently being extended
    var pos = 1    // table slot being filled
    while (pos < size) {
        if (needle[pos] != needle[border]) {
            if (border == 0) {
                // No shorter prefix left to try: this slot is zero.
                lps[pos] = 0
                pos = pos + 1
            } else {
                // Retry the same slot against the next shorter prefix.
                border = lps[border - 1]
            }
        } else {
            border = border + 1
            lps[pos] = border
            pos = pos + 1
        }
    }
    return lps
}
}

// Sample haystacks; they are printed as text1..text6.
var texts = [
    "GCTAGCTCTACGAGTCTA",
    "GGCTATAATGCGTA",
    "there would have been a time for such a word",
    "needle need noodle needle",
    "InhisbookseriesTheArtofComputerProgrammingpublishedbyAddisonWesleyDKnuthusesanimaginarycomputertheMIXanditsassociatedmachinecodeandassemblylanguagestoillustratetheconceptsandalgorithmsastheyarepresented",
    "Nearby farms grew a half acre of alfalfa on the dairy's behalf, with bales of all that alfalfa exchanged for milk."
]
// Seven patterns for six texts: "put" and "and" are both searched in text5.
var pats = ["TCTA", "TAATAAA", "word", "needle", "put", "and", "alfalfa"]

var number = 0
for (text in texts) {
    number = number + 1
    System.print("text%(number) = %(text)")
}
System.print()

for (p in 0...pats.count) {
    // Patterns from index 5 onwards use the text one slot earlier.
    var t = p
    if (p >= 5) t = p - 1
    System.print("Found '%(pats[p])' in 'text%(t+1)' at indices %(KMP.search(texts[t], pats[p]))")
}</lang>

{{out}}
<pre>
text1 = GCTAGCTCTACGAGTCTA
text2 = GGCTATAATGCGTA
text3 = there would have been a time for such a word
text4 = needle need noodle needle
text5 = InhisbookseriesTheArtofComputerProgrammingpublishedbyAddisonWesleyDKnuthusesanimaginarycomputertheMIXanditsassociatedmachinecodeandassemblylanguagestoillustratetheconceptsandalgorithmsastheyarepresented
text6 = Nearby farms grew a half acre of alfalfa on the dairy's behalf, with bales of all that alfalfa exchanged for milk.

Found 'TCTA' in 'text1' at indices [6, 14]
Found 'TAATAAA' in 'text2' at indices []
Found 'word' in 'text3' at indices [40]
Found 'needle' in 'text4' at indices [0, 19]
Found 'put' in 'text5' at indices [26, 90]
Found 'and' in 'text5' at indices [101, 128, 171]
Found 'alfalfa' in 'text6' at indices [33, 87]
</pre>
</pre>

Revision as of 16:36, 8 July 2022

Knuth-Morris-Pratt string search is a draft programming task. It is not yet considered ready to be promoted as a complete task, for reasons that should be found in its talk page.

< About Knuth-Morris-Pratt String Search Algorithm >

Emacs Lisp

<lang lisp> (defun kmp_compile_pattern (pattern)
  "Compile PATTERN into a Knuth-Morris-Pratt transition table (DFA).

The result is a vector of 256 rows, one per character code.  Row C is
a vector with one slot per pattern position; slot J holds the state
reached after reading character C in state J.  Reaching state
\(length PATTERN) means the whole pattern has been matched.

Every character of PATTERN must have a code below 256.  An empty
PATTERN yields 256 rows of length zero."
  (let* ((patLen (length pattern))
         (R 256)          ; alphabet size: one DFA row per character code
         (restartPos 0)   ; state reached by feeding pattern[1..patPos-1] to the DFA
         (dfa (create-2d-array R patLen 0)))
    ;; An empty pattern has no first character to seed the table with;
    ;; `elt' would signal an error, so leave the empty table as it is.
    (when (> patLen 0)
      ;; Reading the first pattern character in state 0 moves to state 1.
      (aset (aref dfa (elt pattern 0)) 0 1)
      (let ((patPos 1))
        (while (< patPos patLen)
          ;; Mismatch transitions: behave as the restart state would.
          (dotimes (c R)
            (aset (aref dfa c) patPos (aref (aref dfa c) restartPos)))
          ;; Match transition: advance to the next pattern position.
          (aset (aref dfa (elt pattern patPos)) patPos (1+ patPos))
          ;; Advance the restart state on the same character.
          (setq restartPos (aref (aref dfa (elt pattern patPos)) restartPos))
          (setq patPos (1+ patPos)))))
    dfa))

;; Kept as a global function under its original name: the previous version
;; defined it with a nested `defun', which also created a global definition
;; (and re-created it on every call of `kmp_compile_pattern').
(defun create-2d-array (x y init)
  "Return a vector of X rows, each a fresh vector of Y elements set to INIT."
  (let ((arr1 (make-vector x nil)))
    (dotimes (i x)
      (aset arr1 i (make-vector y init)))
    arr1))

(defun kmp_search (pattern text)
  "Search TEXT for the first occurrence of PATTERN with the KMP algorithm.

Return the zero-based index in TEXT at which the match starts, or the
length of TEXT when PATTERN does not occur."
  (let* ((dfa (kmp_compile_pattern pattern))
         (R (length dfa))   ; number of character codes the DFA has rows for
         (textPos 0)
         (patPos 0)
         (N (length text))
         (M (length pattern)))
    (while (and (< textPos N) (< patPos M))
      (let ((ch (elt text textPos)))
        ;; A character without a DFA row cannot be part of PATTERN, so it
        ;; resets the match to state 0 instead of indexing out of range.
        (setq patPos (if (< ch R)
                         (aref (aref dfa ch) patPos)
                       0)))
      (setq textPos (1+ textPos)))
    ;; patPos reaches M exactly when the whole pattern has just been read.
    (if (= patPos M) (- textPos M) N)))

</lang>

Julia

<lang ruby>"""
    kmp_table(W)

Build the skip table that `kmp_search` uses for the word `W`.

# Arguments
- `W`: the word to analyse, either an array of characters or a string.
  A string is converted to its characters first, so non-ASCII text is
  handled by character rather than by byte.

# Returns
- `T`: a `Vector{Int}` with one entry per character of `W`. A positive
  `T[k]` means the `T[k]` characters of `W` ending at position `k` equal
  the first `T[k]` characters of `W`; otherwise the entry is 0.
"""
function kmp_table(W)
    # `length` counts characters while `W[k]` indexes bytes on a String;
    # collecting to characters keeps both in the same units.
    word = W isa AbstractString ? collect(W) : W
    len = length(word)
    T = zeros(Int, len)
    # start with the second letter of the word, looking for repeats of its prefix
    i = 2
    while i < len
        j = 0
        while i + j <= len # avoid overshooting end with index
            if word[i + j] == word[j + 1]
                # the run starting at i still matches the prefix: extend it
                T[i + j] = T[i + j - 1] + 1
            else
                T[i + j] = 0 # back to start
                j += 1
                break
            end
            j += 1
        end
        # entries up to i + j - 1 are settled, so begin at the next position
        i += j
    end
    return T
end

"""
    kmp_search(S, W)

Find every position in `S` at which the word `W` occurs.

# Arguments
- `S`: the text to be searched, an array of characters or a string.
- `W`: the word sought, an array of characters or a string.

Strings are converted to their characters first, so non-ASCII text is
handled by character rather than by byte.

# Returns
- `P`: a `Vector{Int}` of one-based character positions in `S` at which
  `W` starts. Overlapping occurrences are included. For an empty `W`
  every position from 1 to `length(S) + 1` is reported.
"""
function kmp_search(S, W)
    # `length` counts characters while `S[k]` indexes bytes on a String;
    # collecting to characters keeps both in the same units.
    text = S isa AbstractString ? collect(S) : S
    word = W isa AbstractString ? collect(W) : W
    lenW, lenS = length(word), length(text)
    i, P = 1, Int[]
    T = kmp_table(word) # get pattern table
    while i <= lenS - lenW + 1
        # position in the word of the first mismatch, or 0 for a full match
        mismatch = 0
        for j in 1:lenW
            if text[i + j - 1] != word[j]
                mismatch = j
                break
            end
        end
        if mismatch == 0
            # found the word in the text, so add to output P
            push!(P, i)
            i += 1
        else
            # skip the alignments the table rules out
            i += T[mismatch] + 1
        end
    end
    return P
end

# Demonstration inputs: two texts and the three patterns searched in them.
const text1 = "InhisbookseriesTheArtofComputerProgrammingpublishedbyAddisonWesleyDKnuthusesanimaginarycomputertheMIXanditsassociatedmachinecodeandassemblylanguagestoillustratetheconceptsandalgorithmsastheyarepresented"
const text2 = "Nearby farms grew a half acre of alfalfa on the dairy's behalf, with bales of all that alfalfa exchanged for milk."
const pat1, pat2, pat3 = "put", "and", "alfalfa"

# Each statement needs its own line (or a `;`) to parse.
println("Found <$pat1> at: (one-based indices) ", kmp_search(text1, pat1))
println("Found <$pat2> at: (one-based indices) ", kmp_search(text1, pat2))
println("Found <$pat3> at: (one-based indices) ", kmp_search(text2, pat3))

</lang>

Output:
Found <put> at: (one-based indices) [27, 91]
Found <and> at: (one-based indices) [102, 129, 172]
Found <alfalfa> at: (one-based indices) [34, 88]

Wren

This is based on the code at https://www.geeksforgeeks.org/kmp-algorithm-for-pattern-searching/. The examples used are the same as in the Boyer-Moore_string_search#Wren task. <lang ecmascript>class KMP {

   // Finds every occurrence of [needle] in [haystack] with the
   // Knuth-Morris-Pratt algorithm.
   //
   // Both arguments are converted to byte lists, so the returned list holds
   // zero-based byte offsets into [haystack]. Overlapping occurrences are
   // reported. An empty needle, or one longer than the haystack, yields [].
   static search(haystack, needle) {
       haystack = haystack.bytes.toList
       needle = needle.bytes.toList
       var hc = haystack.count
       var nc = needle.count
       var indices = []
       // With an empty needle the needle[j] lookup below would abort with a
       // subscript error; a needle longer than the haystack can never match.
       if (nc == 0 || nc > hc) return indices
       var i = 0 // index into haystack
       var j = 0 // index into needle
       var t = table_(needle)
       while (i < hc) {
           if (needle[j] == haystack[i]) {
               i = i + 1
               j = j + 1
           }
           if (j == nc) {
               // Whole needle matched, ending just before i. Resume from the
               // table entry so that overlapping occurrences are still found.
               indices.add(i - j)
               j = t[j-1]
           } else if (i < hc && needle[j] != haystack[i]) {
               if (j != 0) {
                   // Fall back inside the needle without moving i backwards.
                   j = t[j-1]
               } else {
                   // Nothing matched at this position: advance the haystack.
                   i = i + 1
               }
           }
       }
       return indices
   }
   // Builds the prefix table for [needle] (a list of bytes): entry k is the
   // length of the longest proper prefix of needle[0..k] that is also a
   // suffix of it.
   static table_(needle) {
       var size = needle.count
       var lps = List.filled(size, 0)
       var border = 0 // length of the prefix currently being extended
       var pos = 1    // table slot being filled
       while (pos < size) {
           if (needle[pos] != needle[border]) {
               if (border == 0) {
                   // No shorter prefix left to try: this slot is zero.
                   lps[pos] = 0
                   pos = pos + 1
               } else {
                   // Retry the same slot against the next shorter prefix.
                   border = lps[border - 1]
               }
           } else {
               border = border + 1
               lps[pos] = border
               pos = pos + 1
           }
       }
       return lps
   }

}

// Sample haystacks; they are printed as text1..text6.
var texts = [
    "GCTAGCTCTACGAGTCTA",
    "GGCTATAATGCGTA",
    "there would have been a time for such a word",
    "needle need noodle needle",
    "InhisbookseriesTheArtofComputerProgrammingpublishedbyAddisonWesleyDKnuthusesanimaginarycomputertheMIXanditsassociatedmachinecodeandassemblylanguagestoillustratetheconceptsandalgorithmsastheyarepresented",
    "Nearby farms grew a half acre of alfalfa on the dairy's behalf, with bales of all that alfalfa exchanged for milk."
]
// Seven patterns for six texts: "put" and "and" are both searched in text5.
var pats = ["TCTA", "TAATAAA", "word", "needle", "put", "and", "alfalfa"]
// Wren ends statements at newlines, so each statement sits on its own line.
for (i in 0...texts.count) System.print("text%(i+1) = %(texts[i])")
System.print()
for (i in 0...pats.count) {
    // Patterns from index 5 onwards use the text one slot earlier.
    var j = (i < 5) ? i : i-1
    System.print("Found '%(pats[i])' in 'text%(j+1)' at indices %(KMP.search(texts[j], pats[i]))")
}</lang>

Output:
text1 = GCTAGCTCTACGAGTCTA
text2 = GGCTATAATGCGTA
text3 = there would have been a time for such a word
text4 = needle need noodle needle
text5 = InhisbookseriesTheArtofComputerProgrammingpublishedbyAddisonWesleyDKnuthusesanimaginarycomputertheMIXanditsassociatedmachinecodeandassemblylanguagestoillustratetheconceptsandalgorithmsastheyarepresented
text6 = Nearby farms grew a half acre of alfalfa on the dairy's behalf, with bales of all that alfalfa exchanged for milk.

Found 'TCTA' in 'text1' at indices [6, 14]
Found 'TAATAAA' in 'text2' at indices []
Found 'word' in 'text3' at indices [40]
Found 'needle' in 'text4' at indices [0, 19]
Found 'put' in 'text5' at indices [26, 90]
Found 'and' in 'text5' at indices [101, 128, 171]
Found 'alfalfa' in 'text6' at indices [33, 87]