Boyer-Moore string search: Difference between revisions

From Rosetta Code
Content added Content deleted
No edit summary
(Python example)
Line 1: Line 1:
{{draft task}}


[https://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string-search_algorithm < About this algorithm >]
[https://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string-search_algorithm < About this algorithm >]
Line 53: Line 54:
)
)
</lang>
</lang>


=={{header|Python}}==
Slightly modified from en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string-search_algorithm#Python_implementation.
<lang python>"""
This version uses ASCII for case-sensitive matching. For Unicode you may want to match in UTF-8
bytes instead of creating a 0x10FFFF-sized table.
"""

from typing import List

ALPHABET_SIZE = 256

def alphabet_index(c: str) -> int:
"""Return the index of the given character in the English alphabet, counting from 0."""
val = ord(c)
assert val >= 0 and val < ALPHABET_SIZE
return val

def match_length(S: str, idx1: int, idx2: int) -> int:
"""Return the length of the match of the substrings of S beginning at idx1 and idx2."""
if idx1 == idx2:
return len(S) - idx1
match_count = 0
while idx1 < len(S) and idx2 < len(S) and S[idx1] == S[idx2]:
match_count += 1
idx1 += 1
idx2 += 1
return match_count

def fundamental_preprocess(S: str) -> List[int]:
"""Return Z, the Fundamental Preprocessing of S.

Z[i] is the length of the substring beginning at i which is also a prefix of S.
This pre-processing is done in O(n) time, where n is the length of S.
"""
if len(S) == 0: # Handles case of empty string
return []
if len(S) == 1: # Handles case of single-character string
return [1]
z = [0 for x in S]
z[0] = len(S)
z[1] = match_length(S, 0, 1)
for i in range(2, 1 + z[1]): # Optimization from exercise 1-5
z[i] = z[1] - i + 1
# Defines lower and upper limits of z-box
l = 0
r = 0
for i in range(2 + z[1], len(S)):
if i <= r: # i falls within existing z-box
k = i - l
b = z[k]
a = r - i + 1
if b < a: # b ends within existing z-box
z[i] = b
else: # b ends at or after the end of the z-box, we need to do an explicit match to the right of the z-box
z[i] = a + match_length(S, a, r + 1)
l = i
r = i + z[i] - 1
else: # i does not reside within existing z-box
z[i] = match_length(S, 0, i)
if z[i] > 0:
l = i
r = i + z[i] - 1
return z

def bad_character_table(S: str) -> List[List[int]]:
"""
Generates R for S, which is an array indexed by the position of some character c in the
English alphabet. At that index in R is an array of length |S|+1, specifying for each
index i in S (plus the index after S) the next location of character c encountered when
traversing S from right to left starting at i. This is used for a constant-time lookup
for the bad character rule in the Boyer-Moore string search algorithm, although it has
a much larger size than non-constant-time solutions.
"""
if len(S) == 0:
return [[] for a in range(ALPHABET_SIZE)]
R = [[-1] for a in range(ALPHABET_SIZE)]
alpha = [-1 for a in range(ALPHABET_SIZE)]
for i, c in enumerate(S):
alpha[alphabet_index(c)] = i
for j, a in enumerate(alpha):
R[j].append(a)
return R

def good_suffix_table(S: str) -> List[int]:
"""
Generates L for S, an array used in the implementation of the strong good suffix rule.
L[i] = k, the largest position in S such that S[i:] (the suffix of S starting at i) matches
a suffix of S[:k] (a substring in S ending at k). Used in Boyer-Moore, L gives an amount to
shift P relative to T such that no instances of P in T are skipped and a suffix of P[:L[i]]
matches the substring of T matched by a suffix of P in the previous match attempt.
Specifically, if the mismatch took place at position i-1 in P, the shift magnitude is given
by the equation len(P) - L[i]. In the case that L[i] = -1, the full shift table is used.
Since only proper suffixes matter, L[0] = -1.
"""
L = [-1 for c in S]
N = fundamental_preprocess(S[::-1]) # S[::-1] reverses S
N.reverse()
for j in range(0, len(S) - 1):
i = len(S) - N[j]
if i != len(S):
L[i] = j
return L

def full_shift_table(S: str) -> List[int]:
"""
Generates F for S, an array used in a special case of the good suffix rule in the Boyer-Moore
string search algorithm. F[i] is the length of the longest suffix of S[i:] that is also a
prefix of S. In the cases it is used, the shift magnitude of the pattern P relative to the
text T is len(P) - F[i] for a mismatch occurring at i-1.
"""
F = [0 for c in S]
Z = fundamental_preprocess(S)
longest = 0
for i, zv in enumerate(reversed(Z)):
longest = max(zv, longest) if zv == i + 1 else longest
F[-i - 1] = longest
return F

def string_search(P, T) -> List[int]:
"""
Implementation of the Boyer-Moore string search algorithm. This finds all occurrences of P
in T, and incorporates numerous ways of pre-processing the pattern to determine the optimal
amount to shift the string and skip comparisons. In practice it runs in O(m) (and even
sublinear) time, where m is the length of T. This implementation performs a case-insensitive
search on ASCII alphabetic characters, spaces not included. P must be uppercase.
"""
if len(P) == 0 or len(T) == 0 or len(T) < len(P):
return []

matches = []

# Preprocessing
R = bad_character_table(P)
L = good_suffix_table(P)
F = full_shift_table(P)

k = len(P) - 1 # Represents alignment of end of P relative to T
previous_k = -1 # Represents alignment in previous phase (Galil's rule)
while k < len(T):
i = len(P) - 1 # Character to compare in P
h = k # Character to compare in T
while i >= 0 and h > previous_k and P[i] == T[h]: # Matches starting from end of P
i -= 1
h -= 1
if i == -1 or h == previous_k: # Match has been found (Galil's rule)
matches.append(k - len(P) + 1)
k += len(P) - F[1] if len(P) > 1 else 1
else: # No match, shift by max of bad character and good suffix rules
char_shift = i - R[alphabet_index(T[h])][i]
if i + 1 == len(P): # Mismatch happened on first attempt
suffix_shift = 1
elif L[i + 1] == -1: # Matched suffix does not appear anywhere in P
suffix_shift = len(P) - F[i + 1]
else: # Matched suffix appears in P
suffix_shift = len(P) - 1 - L[i + 1]
shift = max(char_shift, suffix_shift)
previous_k = k if shift >= i + 1 else previous_k # Galil's rule
k += shift
return matches

TEXT = 'InhisbookseriesTheArtofComputerProgrammingpublishedbyAddisonWesleyDKnuthusesanimaginarycomputertheMIXanditsassociatedmachinecodeandassemblylanguagestoillustratetheconceptsandalgorithmsastheyarepresented'
PAT1, PAT2 = 'put', 'and'

print("Found", PAT1, "at:", string_search(PAT1, TEXT))
print("Found", PAT2, "at:", string_search(PAT2, TEXT))
</lang>{{out}}
<pre>
Found put at: [26, 90]
Found and at: [101, 128, 171]
</pre>

Revision as of 02:28, 7 July 2022

Boyer-Moore string search is a draft programming task. It is not yet considered ready to be promoted as a complete task, for reasons that should be found in its talk page.

< About this algorithm >

Emacs Lisp

<lang lisp>

Compile the pattern to a right most position map

(defun bm_compile_pattern (pattern)

 (let* ((R 256) (patLen (length pattern)) (rightMap (make-vector R -1)))
   (let ((j -1))
     (while (progn (setq j (1+ j)) (< j patLen))

(aset rightMap (elt pattern j) j) ) )

   rightMap
   )
 )
(print (bm_compile_pattern "abcdb"))

(defun bm_substring_search (pattern text)

 "Boyer-Moore string search"
 (let ((startPos 0)

(skip 0) (result nil) (rightMap nil) (result nil))

   (setq rightMap (bm_compile_pattern pattern))
   ;; Continue this loop when no result and not exceed the text length
   (while (and (not result) (<= (+ startPos skip (length pattern)) (length text)))
     (setq startPos (+ startPos skip))
     (let ((idx (length pattern)) (skip1 nil))

(while (and (not skip1) (>= (setq idx (1- idx)) 0))  ;; skip when the character at position idx is different (when (/= (elt pattern idx) (elt text (+ startPos idx)))  ;; looking up the right most position in pattern (let ((right (aref rightMap (elt text (+ startPos idx))))) (if (>= right 0) (progn (setq skip1 (- idx right)) (when (<= skip1 0) (setq skip1 1))) (progn (setq skip1 (1+ idx))) ) ) ) ) (if (or (not skip1) (<= skip1 0)) (progn (setq result startPos)) (progn (setq skip skip1)) ) )

     )
   result
   )
 )

</lang>


Python

Slightly modified from en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string-search_algorithm#Python_implementation. <lang python>""" This version uses ASCII for case-sensitive matching. For Unicode you may want to match in UTF-8 bytes instead of creating a 0x10FFFF-sized table. """

from typing import List

ALPHABET_SIZE = 256

def alphabet_index(c: str) -> int:

   """Return the index of the given character in the English alphabet, counting from 0."""
   val = ord(c)
   assert val >= 0 and val < ALPHABET_SIZE
   return val

def match_length(S: str, idx1: int, idx2: int) -> int:

   """Return the length of the match of the substrings of S beginning at idx1 and idx2."""
   if idx1 == idx2:
       return len(S) - idx1
   match_count = 0
   while idx1 < len(S) and idx2 < len(S) and S[idx1] == S[idx2]:
       match_count += 1
       idx1 += 1
       idx2 += 1
   return match_count

def fundamental_preprocess(S: str) -> List[int]:

   """Return Z, the Fundamental Preprocessing of S.
   Z[i] is the length of the substring beginning at i which is also a prefix of S.
   This pre-processing is done in O(n) time, where n is the length of S.
   """
   if len(S) == 0:  # Handles case of empty string
       return []
   if len(S) == 1:  # Handles case of single-character string
       return [1]
   z = [0 for x in S]
   z[0] = len(S)
   z[1] = match_length(S, 0, 1)
   for i in range(2, 1 + z[1]):  # Optimization from exercise 1-5
       z[i] = z[1] - i + 1
   # Defines lower and upper limits of z-box
   l = 0
   r = 0
   for i in range(2 + z[1], len(S)):
       if i <= r:  # i falls within existing z-box
           k = i - l
           b = z[k]
           a = r - i + 1
           if b < a:  # b ends within existing z-box
               z[i] = b
           else:  # b ends at or after the end of the z-box, we need to do an explicit match to the right of the z-box
               z[i] = a + match_length(S, a, r + 1)
               l = i
               r = i + z[i] - 1
       else:  # i does not reside within existing z-box
           z[i] = match_length(S, 0, i)
           if z[i] > 0:
               l = i
               r = i + z[i] - 1
   return z

def bad_character_table(S: str) -> List[List[int]]:

   """
   Generates R for S, which is an array indexed by the position of some character c in the
   English alphabet. At that index in R is an array of length |S|+1, specifying for each
   index i in S (plus the index after S) the next location of character c encountered when
   traversing S from right to left starting at i. This is used for a constant-time lookup
   for the bad character rule in the Boyer-Moore string search algorithm, although it has
   a much larger size than non-constant-time solutions.
   """
   if len(S) == 0:
       return [[] for a in range(ALPHABET_SIZE)]
   R = [[-1] for a in range(ALPHABET_SIZE)]
   alpha = [-1 for a in range(ALPHABET_SIZE)]
   for i, c in enumerate(S):
       alpha[alphabet_index(c)] = i
       for j, a in enumerate(alpha):
           R[j].append(a)
   return R

def good_suffix_table(S: str) -> List[int]:

   """
   Generates L for S, an array used in the implementation of the strong good suffix rule.
   L[i] = k, the largest position in S such that S[i:] (the suffix of S starting at i) matches
   a suffix of S[:k] (a substring in S ending at k). Used in Boyer-Moore, L gives an amount to
   shift P relative to T such that no instances of P in T are skipped and a suffix of P[:L[i]]
   matches the substring of T matched by a suffix of P in the previous match attempt.
   Specifically, if the mismatch took place at position i-1 in P, the shift magnitude is given
   by the equation len(P) - L[i]. In the case that L[i] = -1, the full shift table is used.
   Since only proper suffixes matter, L[0] = -1.
   """
   L = [-1 for c in S]
   N = fundamental_preprocess(S[::-1])  # S[::-1] reverses S
   N.reverse()
   for j in range(0, len(S) - 1):
       i = len(S) - N[j]
       if i != len(S):
           L[i] = j
   return L

def full_shift_table(S: str) -> List[int]:

   """
   Generates F for S, an array used in a special case of the good suffix rule in the Boyer-Moore
   string search algorithm. F[i] is the length of the longest suffix of S[i:] that is also a
   prefix of S. In the cases it is used, the shift magnitude of the pattern P relative to the
   text T is len(P) - F[i] for a mismatch occurring at i-1.
   """
   F = [0 for c in S]
   Z = fundamental_preprocess(S)
   longest = 0
   for i, zv in enumerate(reversed(Z)):
       longest = max(zv, longest) if zv == i + 1 else longest
       F[-i - 1] = longest
   return F

def string_search(P, T) -> List[int]:

   """
   Implementation of the Boyer-Moore string search algorithm. This finds all occurrences of P
   in T, and incorporates numerous ways of pre-processing the pattern to determine the optimal
   amount to shift the string and skip comparisons. In practice it runs in O(m) (and even
   sublinear) time, where m is the length of T. This implementation performs a case-insensitive
   search on ASCII alphabetic characters, spaces not included. P must be uppercase.
   """
   if len(P) == 0 or len(T) == 0 or len(T) < len(P):
       return []
   matches = []
   # Preprocessing
   R = bad_character_table(P)
   L = good_suffix_table(P)
   F = full_shift_table(P)
   k = len(P) - 1      # Represents alignment of end of P relative to T
   previous_k = -1     # Represents alignment in previous phase (Galil's rule)
   while k < len(T):
       i = len(P) - 1  # Character to compare in P
       h = k           # Character to compare in T
       while i >= 0 and h > previous_k and P[i] == T[h]:  # Matches starting from end of P
           i -= 1
           h -= 1
       if i == -1 or h == previous_k:  # Match has been found (Galil's rule)
           matches.append(k - len(P) + 1)
           k += len(P) - F[1] if len(P) > 1 else 1
       else:  # No match, shift by max of bad character and good suffix rules
           char_shift = i - R[alphabet_index(T[h])][i]
           if i + 1 == len(P):  # Mismatch happened on first attempt
               suffix_shift = 1
           elif L[i + 1] == -1:  # Matched suffix does not appear anywhere in P
               suffix_shift = len(P) - F[i + 1]
           else:               # Matched suffix appears in P
               suffix_shift = len(P) - 1 - L[i + 1]
           shift = max(char_shift, suffix_shift)
           previous_k = k if shift >= i + 1 else previous_k  # Galil's rule
           k += shift
   return matches

TEXT = 'InhisbookseriesTheArtofComputerProgrammingpublishedbyAddisonWesleyDKnuthusesanimaginarycomputertheMIXanditsassociatedmachinecodeandassemblylanguagestoillustratetheconceptsandalgorithmsastheyarepresented' PAT1, PAT2 = 'put', 'and'

print("Found", PAT1, "at:", string_search(PAT1, TEXT)) print("Found", PAT2, "at:", string_search(PAT2, TEXT))

</lang>

Output:
Found put at: [26, 90]
Found and at: [101, 128, 171]