Boyer-Moore string search: Difference between revisions

From Rosetta Code
Content added Content deleted
m (→‎{{header|Python}}: add alfalfa example)
m (→‎{{header|Python}}: fix docs from original)
Line 125: Line 125:
"""
"""
Generates R for S, which is an array indexed by the position of some character c in the
Generates R for S, which is an array indexed by the position of some character c in the
English alphabet. At that index in R is an array of length |S|+1, specifying for each
ASCII table. At that index in R is an array of length |S|+1, specifying for each
index i in S (plus the index after S) the next location of character c encountered when
index i in S (plus the index after S) the next location of character c encountered when
traversing S from right to left starting at i. This is used for a constant-time lookup
traversing S from right to left starting at i. This is used for a constant-time lookup
Line 181: Line 181:
in T, and incorporates numerous ways of pre-processing the pattern to determine the optimal
in T, and incorporates numerous ways of pre-processing the pattern to determine the optimal
amount to shift the string and skip comparisons. In practice it runs in O(m) (and even
amount to shift the string and skip comparisons. In practice it runs in O(m) (and even
sublinear) time, where m is the length of T. This implementation performs a case-insensitive
sublinear) time, where m is the length of T. This implementation performs a case-sensitive
search on ASCII alphabetic characters, spaces not included. P must be uppercase.
search on ASCII characters. P must be ASCII as well.
"""
"""
if len(P) == 0 or len(T) == 0 or len(T) < len(P):
if len(P) == 0 or len(T) == 0 or len(T) < len(P):

Revision as of 08:39, 7 July 2022

Boyer-Moore string search is a draft programming task. It is not yet considered ready to be promoted as a complete task, for reasons that should be found in its talk page.

< About this algorithm >

To properly test this algorithm, it would be good to search for a string which contains repeated subsequences, such as alfalfa and the text being searched should not only include a match but that match should be preceded by words which partially match, such as behalf.

Emacs Lisp

<lang lisp>

Compile the pattern to a right most position map

(defun bm_compile_pattern (pattern)

 (let* ((R 256) (patLen (length pattern)) (rightMap (make-vector R -1)))
   (let ((j -1))
     (while (progn (setq j (1+ j)) (< j patLen))

(aset rightMap (elt pattern j) j) ) )

   rightMap
   )
 )
(print (bm_compile_pattern "abcdb"))

(defun bm_substring_search (pattern text)

 "Boyer-Moore string search"
 (let ((startPos 0)

(skip 0) (result nil) (rightMap nil) (result nil))

   (setq rightMap (bm_compile_pattern pattern))
   ;; Continue this loop when no result and not exceed the text length
   (while (and (not result) (<= (+ startPos skip (length pattern)) (length text)))
     (setq startPos (+ startPos skip))
     (let ((idx (length pattern)) (skip1 nil))

(while (and (not skip1) (>= (setq idx (1- idx)) 0))  ;; skip when the character at position idx is different (when (/= (elt pattern idx) (elt text (+ startPos idx)))  ;; looking up the right most position in pattern (let ((right (aref rightMap (elt text (+ startPos idx))))) (if (>= right 0) (progn (setq skip1 (- idx right)) (when (<= skip1 0) (setq skip1 1))) (progn (setq skip1 (1+ idx))) ) ) ) ) (if (or (not skip1) (<= skip1 0)) (progn (setq result startPos)) (progn (setq skip skip1)) ) )

     )
   result
   )
 )

</lang>


Python

Slightly modified from en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string-search_algorithm#Python_implementation. <lang python>""" This version uses ASCII for case-sensitive matching. For Unicode you may want to match in UTF-8 bytes instead of creating a 0x10FFFF-sized table. """

from typing import List

ALPHABET_SIZE = 256

def alphabet_index(c: str) -> int:

   """Return the index of the given ASCII character. """
   val = ord(c)
   assert 0 <= val < ALPHABET_SIZE
   return val

def match_length(S: str, idx1: int, idx2: int) -> int:

   """Return the length of the match of the substrings of S beginning at idx1 and idx2."""
   if idx1 == idx2:
       return len(S) - idx1
   match_count = 0
   while idx1 < len(S) and idx2 < len(S) and S[idx1] == S[idx2]:
       match_count += 1
       idx1 += 1
       idx2 += 1
   return match_count

def fundamental_preprocess(S: str) -> List[int]:

   """Return Z, the Fundamental Preprocessing of S.
   Z[i] is the length of the substring beginning at i which is also a prefix of S.
   This pre-processing is done in O(n) time, where n is the length of S.
   """
   if len(S) == 0:  # Handles case of empty string
       return []
   if len(S) == 1:  # Handles case of single-character string
       return [1]
   z = [0 for x in S]
   z[0] = len(S)
   z[1] = match_length(S, 0, 1)
   for i in range(2, 1 + z[1]):  # Optimization from exercise 1-5
       z[i] = z[1] - i + 1
   # Defines lower and upper limits of z-box
   l = 0
   r = 0
   for i in range(2 + z[1], len(S)):
       if i <= r:  # i falls within existing z-box
           k = i - l
           b = z[k]
           a = r - i + 1
           if b < a:  # b ends within existing z-box
               z[i] = b
           else:  # b ends at or after the end of the z-box, we need to do an explicit match to the right of the z-box
               z[i] = a + match_length(S, a, r + 1)
               l = i
               r = i + z[i] - 1
       else:  # i does not reside within existing z-box
           z[i] = match_length(S, 0, i)
           if z[i] > 0:
               l = i
               r = i + z[i] - 1
   return z

def bad_character_table(S: str) -> List[List[int]]:

   """
   Generates R for S, which is an array indexed by the position of some character c in the
   ASCII table. At that index in R is an array of length |S|+1, specifying for each
   index i in S (plus the index after S) the next location of character c encountered when
   traversing S from right to left starting at i. This is used for a constant-time lookup
   for the bad character rule in the Boyer-Moore string search algorithm, although it has
   a much larger size than non-constant-time solutions.
   """
   if len(S) == 0:
       return [[] for a in range(ALPHABET_SIZE)]
   R = [[-1] for a in range(ALPHABET_SIZE)]
   alpha = [-1 for a in range(ALPHABET_SIZE)]
   for i, c in enumerate(S):
       alpha[alphabet_index(c)] = i
       for j, a in enumerate(alpha):
           R[j].append(a)
   return R

def good_suffix_table(S: str) -> List[int]:

   """
   Generates L for S, an array used in the implementation of the strong good suffix rule.
   L[i] = k, the largest position in S such that S[i:] (the suffix of S starting at i) matches
   a suffix of S[:k] (a substring in S ending at k). Used in Boyer-Moore, L gives an amount to
   shift P relative to T such that no instances of P in T are skipped and a suffix of P[:L[i]]
   matches the substring of T matched by a suffix of P in the previous match attempt.
   Specifically, if the mismatch took place at position i-1 in P, the shift magnitude is given
   by the equation len(P) - L[i]. In the case that L[i] = -1, the full shift table is used.
   Since only proper suffixes matter, L[0] = -1.
   """
   L = [-1 for c in S]
   N = fundamental_preprocess(S[::-1])  # S[::-1] reverses S
   N.reverse()
   for j in range(0, len(S) - 1):
       i = len(S) - N[j]
       if i != len(S):
           L[i] = j
   return L

def full_shift_table(S: str) -> List[int]:

   """
   Generates F for S, an array used in a special case of the good suffix rule in the Boyer-Moore
   string search algorithm. F[i] is the length of the longest suffix of S[i:] that is also a
   prefix of S. In the cases it is used, the shift magnitude of the pattern P relative to the
   text T is len(P) - F[i] for a mismatch occurring at i-1.
   """
   F = [0 for c in S]
   Z = fundamental_preprocess(S)
   longest = 0
   for i, zv in enumerate(reversed(Z)):
       longest = max(zv, longest) if zv == i + 1 else longest
       F[-i - 1] = longest
   return F

def string_search(P, T) -> List[int]:

   """
   Implementation of the Boyer-Moore string search algorithm. This finds all occurrences of P
   in T, and incorporates numerous ways of pre-processing the pattern to determine the optimal
   amount to shift the string and skip comparisons. In practice it runs in O(m) (and even
   sublinear) time, where m is the length of T. This implementation performs a case-sensitive
   search on ASCII characters. P must be ASCII as well.
   """
   if len(P) == 0 or len(T) == 0 or len(T) < len(P):
       return []
   matches = []
   # Preprocessing
   R = bad_character_table(P)
   L = good_suffix_table(P)
   F = full_shift_table(P)
   k = len(P) - 1      # Represents alignment of end of P relative to T
   previous_k = -1     # Represents alignment in previous phase (Galil's rule)
   while k < len(T):
       i = len(P) - 1  # Character to compare in P
       h = k           # Character to compare in T
       while i >= 0 and h > previous_k and P[i] == T[h]:  # Matches starting from end of P
           i -= 1
           h -= 1
       if i == -1 or h == previous_k:  # Match has been found (Galil's rule)
           matches.append(k - len(P) + 1)
           k += len(P) - F[1] if len(P) > 1 else 1
       else:  # No match, shift by max of bad character and good suffix rules
           char_shift = i - R[alphabet_index(T[h])][i]
           if i + 1 == len(P):  # Mismatch happened on first attempt
               suffix_shift = 1
           elif L[i + 1] == -1:  # Matched suffix does not appear anywhere in P
               suffix_shift = len(P) - F[i + 1]
           else:               # Matched suffix appears in P
               suffix_shift = len(P) - 1 - L[i + 1]
           shift = max(char_shift, suffix_shift)
           previous_k = k if shift >= i + 1 else previous_k  # Galil's rule
           k += shift
   return matches

TEXT1 = 'InhisbookseriesTheArtofComputerProgrammingpublishedbyAddisonWesleyDKnuthusesanimaginarycomputertheMIXanditsassociatedmachinecodeandassemblylanguagestoillustratetheconceptsandalgorithmsastheyarepresented' TEXT2 = "Nearby farms grew a half acre of alfalfa on the dairy's behalf, with bales of all that alfalfa exchanged for milk." PAT1, PAT2, PAT3 = 'put', 'and', 'alfalfa'

print("Found", PAT1, "at:", string_search(PAT1, TEXT1)) print("Found", PAT2, "at:", string_search(PAT2, TEXT1)) print("Found", PAT3, "at:", string_search(PAT3, TEXT2))

</lang>

Output:
Found put at: [26, 90]
Found and at: [101, 128, 171]
Found alfalfa at: [33, 87]