Boyer-Moore string search: Difference between revisions

Content added Content deleted

Inline

Revision as of 08:39, 7 July 2022

To properly test this algorithm, it would be good to search for a string which contains repeated subsequences, such as alfalfa and the text being searched should not only include a match but that match should be preceded by words which partially match, such as behalf.

Emacs Lisp

Compile the pattern to a right most position map

(defun bm_compile_pattern (pattern)

 (let* ((R 256) (patLen (length pattern)) (rightMap (make-vector R -1)))
   (let ((j -1))
     (while (progn (setq j (1+ j)) (< j patLen))

(aset rightMap (elt pattern j) j) ) )

   rightMap
   )
 )

(print (bm_compile_pattern "abcdb"))

(defun bm_substring_search (pattern text)

 "Boyer-Moore string search"
 (let ((startPos 0)

(skip 0) (result nil) (rightMap nil) (result nil))

   (setq rightMap (bm_compile_pattern pattern))

   ;; Continue this loop when no result and not exceed the text length
   (while (and (not result) (<= (+ startPos skip (length pattern)) (length text)))
     (setq startPos (+ startPos skip))

     (let ((idx (length pattern)) (skip1 nil))

(while (and (not skip1) (>= (setq idx (1- idx)) 0)) ;; skip when the character at position idx is different (when (/= (elt pattern idx) (elt text (+ startPos idx))) ;; looking up the right most position in pattern (let ((right (aref rightMap (elt text (+ startPos idx))))) (if (>= right 0) (progn (setq skip1 (- idx right)) (when (<= skip1 0) (setq skip1 1))) (progn (setq skip1 (1+ idx))) ) ) ) ) (if (or (not skip1) (<= skip1 0)) (progn (setq result startPos)) (progn (setq skip skip1)) ) )

     )
   result
   )
 )

</lang>

Python

Slightly modified from en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string-search_algorithm#Python_implementation. <lang python>""" This version uses ASCII for case-sensitive matching. For Unicode you may want to match in UTF-8 bytes instead of creating a 0x10FFFF-sized table. """

from typing import List

ALPHABET_SIZE = 256

def alphabet_index(c: str) -> int:

   """Return the index of the given ASCII character. """
   val = ord(c)
   assert 0 <= val < ALPHABET_SIZE
   return val

def match_length(S: str, idx1: int, idx2: int) -> int:

   """Return the length of the match of the substrings of S beginning at idx1 and idx2."""
   if idx1 == idx2:
       return len(S) - idx1
   match_count = 0
   while idx1 < len(S) and idx2 < len(S) and S[idx1] == S[idx2]:
       match_count += 1
       idx1 += 1
       idx2 += 1
   return match_count

def fundamental_preprocess(S: str) -> List[int]:

   """Return Z, the Fundamental Preprocessing of S.

   Z[i] is the length of the substring beginning at i which is also a prefix of S.
   This pre-processing is done in O(n) time, where n is the length of S.
   """
   if len(S) == 0:  # Handles case of empty string
       return []
   if len(S) == 1:  # Handles case of single-character string
       return [1]
   z = [0 for x in S]
   z[0] = len(S)
   z[1] = match_length(S, 0, 1)
   for i in range(2, 1 + z[1]):  # Optimization from exercise 1-5
       z[i] = z[1] - i + 1
   # Defines lower and upper limits of z-box
   l = 0
   r = 0
   for i in range(2 + z[1], len(S)):
       if i <= r:  # i falls within existing z-box
           k = i - l
           b = z[k]
           a = r - i + 1
           if b < a:  # b ends within existing z-box
               z[i] = b
           else:  # b ends at or after the end of the z-box, we need to do an explicit match to the right of the z-box
               z[i] = a + match_length(S, a, r + 1)
               l = i
               r = i + z[i] - 1
       else:  # i does not reside within existing z-box
           z[i] = match_length(S, 0, i)
           if z[i] > 0:
               l = i
               r = i + z[i] - 1
   return z

def bad_character_table(S: str) -> List[List[int]]:

   """
   Generates R for S, which is an array indexed by the position of some character c in the
   ASCII table. At that index in R is an array of length |S|+1, specifying for each
   index i in S (plus the index after S) the next location of character c encountered when
   traversing S from right to left starting at i. This is used for a constant-time lookup
   for the bad character rule in the Boyer-Moore string search algorithm, although it has
   a much larger size than non-constant-time solutions.
   """
   if len(S) == 0:
       return [[] for a in range(ALPHABET_SIZE)]
   R = [[-1] for a in range(ALPHABET_SIZE)]
   alpha = [-1 for a in range(ALPHABET_SIZE)]
   for i, c in enumerate(S):
       alpha[alphabet_index(c)] = i
       for j, a in enumerate(alpha):
           R[j].append(a)
   return R

def good_suffix_table(S: str) -> List[int]:

   """
   Generates L for S, an array used in the implementation of the strong good suffix rule.
   L[i] = k, the largest position in S such that S[i:] (the suffix of S starting at i) matches
   a suffix of S[:k] (a substring in S ending at k). Used in Boyer-Moore, L gives an amount to
   shift P relative to T such that no instances of P in T are skipped and a suffix of P[:L[i]]
   matches the substring of T matched by a suffix of P in the previous match attempt.
   Specifically, if the mismatch took place at position i-1 in P, the shift magnitude is given
   by the equation len(P) - L[i]. In the case that L[i] = -1, the full shift table is used.
   Since only proper suffixes matter, L[0] = -1.
   """
   L = [-1 for c in S]
   N = fundamental_preprocess(S[::-1])  # S[::-1] reverses S
   N.reverse()
   for j in range(0, len(S) - 1):
       i = len(S) - N[j]
       if i != len(S):
           L[i] = j
   return L

def full_shift_table(S: str) -> List[int]:

   """
   Generates F for S, an array used in a special case of the good suffix rule in the Boyer-Moore
   string search algorithm. F[i] is the length of the longest suffix of S[i:] that is also a
   prefix of S. In the cases it is used, the shift magnitude of the pattern P relative to the
   text T is len(P) - F[i] for a mismatch occurring at i-1.
   """
   F = [0 for c in S]
   Z = fundamental_preprocess(S)
   longest = 0
   for i, zv in enumerate(reversed(Z)):
       longest = max(zv, longest) if zv == i + 1 else longest
       F[-i - 1] = longest
   return F

def string_search(P, T) -> List[int]:

   """
   Implementation of the Boyer-Moore string search algorithm. This finds all occurrences of P
   in T, and incorporates numerous ways of pre-processing the pattern to determine the optimal
   amount to shift the string and skip comparisons. In practice it runs in O(m) (and even
   sublinear) time, where m is the length of T. This implementation performs a case-sensitive
   search on ASCII characters. P must be ASCII as well.
   """
   if len(P) == 0 or len(T) == 0 or len(T) < len(P):
       return []

   matches = []

   # Preprocessing
   R = bad_character_table(P)
   L = good_suffix_table(P)
   F = full_shift_table(P)

   k = len(P) - 1      # Represents alignment of end of P relative to T
   previous_k = -1     # Represents alignment in previous phase (Galil's rule)
   while k < len(T):
       i = len(P) - 1  # Character to compare in P
       h = k           # Character to compare in T
       while i >= 0 and h > previous_k and P[i] == T[h]:  # Matches starting from end of P
           i -= 1
           h -= 1
       if i == -1 or h == previous_k:  # Match has been found (Galil's rule)
           matches.append(k - len(P) + 1)
           k += len(P) - F[1] if len(P) > 1 else 1
       else:  # No match, shift by max of bad character and good suffix rules
           char_shift = i - R[alphabet_index(T[h])][i]
           if i + 1 == len(P):  # Mismatch happened on first attempt
               suffix_shift = 1
           elif L[i + 1] == -1:  # Matched suffix does not appear anywhere in P
               suffix_shift = len(P) - F[i + 1]
           else:               # Matched suffix appears in P
               suffix_shift = len(P) - 1 - L[i + 1]
           shift = max(char_shift, suffix_shift)
           previous_k = k if shift >= i + 1 else previous_k  # Galil's rule
           k += shift
   return matches

TEXT1 = 'InhisbookseriesTheArtofComputerProgrammingpublishedbyAddisonWesleyDKnuthusesanimaginarycomputertheMIXanditsassociatedmachinecodeandassemblylanguagestoillustratetheconceptsandalgorithmsastheyarepresented' TEXT2 = "Nearby farms grew a half acre of alfalfa on the dairy's behalf, with bales of all that alfalfa exchanged for milk." PAT1, PAT2, PAT3 = 'put', 'and', 'alfalfa'

print("Found", PAT1, "at:", string_search(PAT1, TEXT1)) print("Found", PAT2, "at:", string_search(PAT2, TEXT1)) print("Found", PAT3, "at:", string_search(PAT3, TEXT2))

</lang>

Output:

Found put at: [26, 90]
Found and at: [101, 128, 171]
Found alfalfa at: [33, 87]