Boyer-Moore string search: Difference between revisions

← Older edit

Boyer-Moore string search (view source)

Revision as of 18:56, 9 November 2023

20,899 bytes added , 6 months ago

Added Yabasic

Jjuanhdez

2,130

edits

Revision as of 16:15, 23 April 2023 (view source) Lscrd (talk \| contribs) m (Added a precision in "reverse" function comment.) ← Older edit		Latest revision as of 18:56, 9 November 2023 (view source) Jjuanhdez (talk \| contribs) (Added Yabasic)
(7 intermediate revisions by 4 users not shown)
Line 176: Found and at: [101, 128, 171] Found alfalfa at: [33, 87] </pre> =={{header\|C++}}== <syntaxhighlight lang="c++"> #include <algorithm> #include <cstdint> #include <experimental/iterator> #include <functional> #include <iomanip> #include <iostream> #include <string> #include <vector> void display(const std::vector<int32_t>& numbers) { std::cout << "["; std::copy(std::begin(numbers), std::end(numbers), std::experimental::make_ostream_joiner(std::cout, ", ")); std::cout << "]" << std::endl; } int32_t string_search_single(const std::string& haystack, const std::string& needle) { auto it = std::search(haystack.begin(), haystack.end(), std::boyer_moore_searcher(needle.begin(), needle.end())); if ( it != haystack.end() ) { return std::distance(haystack.begin(), it); } return -1; } std::vector<int32_t> string_search(const std::string& haystack, const std::string& needle) { std::vector<int32_t> result = {}; uint64_t start = 0; int32_t index = 0; while ( index >= 0 && start < haystack.length() ) { std::string haystackReduced = haystack.substr(start); index = string_search_single(haystackReduced, needle); if ( index >= 0 ) { result.push_back(start + index); start += index + needle.length(); } } return result; } int main() { const std::vector<std::string> texts = { "GCTAGCTCTACGAGTCTA", "GGCTATAATGCGTA", "there would have been a time for such a word", "needle need noodle needle", "DKnuthusesandprogramsanimaginarycomputertheMIXanditsassociatedmachinecodeandassemblylanguages", "Nearby farms grew an acre of alfalfa on the dairy's behalf, with bales of that alfalfa exchanged for milk." }; const std::vector<std::string> patterns = { "TCTA", "TAATAAA", "word", "needle", "and", "alfalfa" }; for ( uint64_t i = 0; i < texts.size(); ++i ) { std::cout << "text" << ( i + 1 ) << " = " << texts[i] << std::endl; } std::cout << std::endl; for ( uint64_t i = 0; i < texts.size(); ++i ) { std::vector<int32_t> indexes = string_search(texts[i], patterns[i]); std::cout << "Found " << std::quoted(patterns[i]) << " in 'text" << ( i + 1 ) << "' at indexes "; display(string_search(texts[i], patterns[i])); } } </syntaxhighlight> {{ out }} <pre> text1 = GCTAGCTCTACGAGTCTA text2 = GGCTATAATGCGTA text3 = there would have been a time for such a word text4 = needle need noodle needle text5 = DKnuthusesandprogramsanimaginarycomputertheMIXanditsassociatedmachinecodeandassemblylanguages text6 = Nearby farms grew an acre of alfalfa on the dairy's behalf, with bales of that alfalfa exchanged for milk. Found "TCTA" in 'text1' at indexes [6, 14] Found "TAATAAA" in 'text2' at indexes [] Found "word" in 'text3' at indexes [40] Found "needle" in 'text4' at indexes [0, 19] Found "and" in 'text5' at indexes [10, 46, 73] Found "alfalfa" in 'text6' at indexes [29, 79] </pre> Line 268 ⟶ 350: <pre>33</pre> </pre> =={{header\|Fortran}}== <syntaxhighlight lang="Fortran"> module bm_str implicit none private public :: bmstr ! ! PARAMETER definitions ! integer , private , parameter :: NO_OF_CHARS = 256, SIZEX=256 contains pure subroutine badcharheuristic(Str,M,Badchar) implicit none ! Dummy arguments ! integer :: M integer , dimension(NO_OF_CHARS) :: Badchar character(1) , dimension(M) :: Str intent (in) M , Str intent (out) Badchar ! ! Local variables ! integer :: i ! Code starts here Badchar(NO_OF_CHARS) = -1 do i = 1 , M Badchar(iachar(Str(i))) = i enddo return end subroutine badcharheuristic function bmstr(Pat,M,Str,N) result(found) implicit none ! ! Dummy arguments ! integer :: M integer :: N character(len=m) :: Pat character(len=n) :: Str intent (in) M , N , Pat , Str ! ! Local variables ! integer , dimension(NO_OF_CHARS) :: badchar integer :: found integer :: i integer :: j integer :: s ! Code starts here ! found = -1 if ( (M==0) .OR. (N==0) .OR. (M>N) ) return badchar = 0 call badcharheuristic(Pat,M,badchar) i = 0 s = 0 do while ( s<=(N-M) ) j = M do j = m,1,-1 if((Pat(j:j) /= Str(s+j:s+j)) )exit ! Leave, the pattern doesn't match. enddo if ( j < 1 ) then ! Found, let's leave found = s + 1 return endif i = badchar(iachar(Str(s+j:s+j))) s = s + MAX(1,j-i) enddo found = -1 return end function bmstr end module bm_str </syntaxhighlight> </pre> =={{header\|FreeBASIC}}== <syntaxhighlight lang="vb">#define max(a, b) iif((a) > (b), (a), (b)) Dim Shared As Integer suff(), bmBc(), bmGs() Sub preBmBc(pat As String) Dim As Integer m = Len(pat), i Redim bmBc(m) For i = 1 To m-1 bmBc(Cint(pat[i])) = m - i Next End Sub Sub suffixes(pat As String) Dim As Integer m = Len(pat), g = m, i, f Redim suff(m) suff(m) = m For i = m-1 To 1 Step -1 If i > g And suff(i + m - f) < i - g Then suff(i) = suff(i + m - f) Else If i < g Then g = i f = i While g >= 1 And pat[g] = pat[g + m - f] g -= 1 Wend suff(i) = f - g End If Next End Sub Sub preBmGs(pat As String) Dim As Integer m = Len(pat), j = 1, i Redim suff(m) Redim bmGs(m) For i = m To 1 Step -1 If suff(i) = i Then While j < m - i If bmGs(j) = m Then bmGs(j) = m - i j += 1 Wend End If Next For i = 1 To m-1 bmGs(m - suff(i)) = m - i Next End Sub Sub BM(pat As String, s As String, case_insensitive As Boolean = False) Dim As String pins = "'" & pat & "' in " & "'" & s & "'" If case_insensitive Then pat = Lcase(pat) s = Lcase(s) End If ' Preprocessing preBmGs(pat) preBmBc(pat) ' Searching Dim As Integer j = 0, n = Len(s), m = Len(pat), i = m While j <= n - m i -= 1 If pat[i] <> s[i+j] Then Exit While j += Iif(i < 1, bmGs(0), max(bmGs(i), bmBc(Len(s[i+j]) - m + i))) Wend Dim As Integer many = Instr(s, pat) Dim As String tmp = "" If Not many > 0 Then Print "No "; pins Else Do While many > 0 'if not found loop will be skipped tmp &= Str(many) & "," many = Instr(many + 1, s, pat) Loop Print Using "Found & at indices [&]"; pins; tmp & Chr(8) End If End Sub BM("GCAGAGAG","GCATCGCAGAGAGTATACAGTACG") BM("TCTA","GCTAGCTCTACGAGTCTA") BM("TAATAAA","GGCTATAATGCGTA") BM("word","there would have been a time for such a word") BM("needle","needle need noodle needle") Const book = "InhisbookseriesTheArtofComputerProgrammingpublishedbyAddisonWesley" + _ "DKnuthusesanimaginarycomputertheMIXanditsassociatedmachinecodeand" + _ "assemblylanguagestoillustratetheconceptsandalgorithmsastheyarepresented" BM("put",book) BM("and",book) Const farm = "Nearby farms grew a half acre of alfalfa on the dairy's behalf, with " + _ "bales of all that alfalfa exchanged for milk." BM("alfalfa",farm) Sleep</syntaxhighlight> {{out}} <pre>Found 'GCAGAGAG' in 'GCATCGCAGAGAGTATACAGTACG' at indices [6] Found 'TCTA' in 'GCTAGCTCTACGAGTCTA' at indices [7,15] No 'TAATAAA' in 'GGCTATAATGCGTA' Found 'word' in 'there would have been a time for such a word' at indices [41] Found 'needle' in 'needle need noodle needle' at indices [1,20] Found 'put' in 'InhisbookseriesTheArtofComputerProgrammingpublishedbyAddisonWesleyDKnuthusesanimaginarycomputertheMIXanditsassociatedmachinecodeandassemblylanguagestoillustratetheconceptsandalgorithmsastheyarepresented' at indices [27,91] Found 'and' in InhisbookseriesTheArtofComputerProgrammingpublishedbyAddisonWesleyDKnuthusesanimaginarycomputertheMIXanditsassociatedmachinecodeandassemblylanguagestoillustratetheconceptsandalgorithmsastheyarepresented' at indices [102,129,172] Found 'alfalfa' in 'Nearby farms grew a half acre of alfalfa on the dairy's behalf, with bales of all that alfalfa exchanged for milk.' at indices [34,88]</pre> =={{header\|J}}== Line 352 ⟶ 619: In other words: <tt>bmsearch0</tt> takes slightly over twice as long as <tt>bmsearch1</tt> for a somewhat randomly picked example search (though the results are the same). =={{header\|Java}}== <syntaxhighlight lang="java"> import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.List; import java.util.stream.Collectors; import java.util.stream.IntStream; /** * An implementation of the Boyer-Moore string search algorithm. * It finds all occurrences of a pattern in a text, performing a case-insensitive search on ASCII characters. * * For all full description of the algorithm visit: * https://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string-search_algorithm / public final class BoyerMooreStringSearch { public static void main(String[] aArgs) { List<String> texts = List.of( "GCTAGCTCTACGAGTCTA", "GGCTATAATGCGTA", "there would have been a time for such a word", "needle need noodle needle", "DKnuthusesandshowsanimaginarycomputertheMIXanditsassociatedmachinecodeandassemblylanguagestoillustrate", "Farms grew an acre of alfalfa on the dairy's behalf, with bales of that alfalfa exchanged for milk."); List<String> patterns = List.of( "TCTA", "TAATAAA", "word", "needle", "put", "and", "alfalfa" ); for ( int i = 0; i < texts.size(); i++ ) { System.out.println("text" + ( i + 1 ) + " = " + texts.get(i)); } System.out.println(); for ( int i = 0; i < patterns.size(); i++ ) { final int j = ( i < 5 ) ? i : i - 1; System.out.println("Found \"" + patterns.get(i) + "\" in 'text" + ( j + 1 ) + "' at indexes " + stringSearch(texts.get(j), patterns.get(i))); } } /* * Return a list of indexes at which the given pattern matches the given text. / private static List<Integer> stringSearch(String aText, String aPattern) { if ( aPattern.isEmpty() \|\| aText.isEmpty() \|\| aText.length() < aPattern.length() ) { return Collections.emptyList(); } List<Integer> matches = new ArrayList<Integer>(); // Preprocessing List<List<Integer>> R = badCharacterTable(aPattern); int[] L = goodSuffixTable(aPattern); int[] F = fullShiftTable(aPattern); int k = aPattern.length() - 1; // Represents the alignment of the end of aPattern relative to aText int previousK = -1; // Represents the above alignment in the previous phase while ( k < aText.length() ) { int i = aPattern.length() - 1; // Index of the character to compare in aPattern int h = k; // Index of the character to compare in aText while ( i >= 0 && h > previousK && aPattern.charAt(i) == aText.charAt(h) ) { i -= 1; h -= 1; } if ( i == -1 \|\| h == previousK ) { // Match has been found matches.add(k - aPattern.length() + 1); k += ( aPattern.length() > 1 ) ? aPattern.length() - F[1] : 1; } else { // No match, shift by the maximum of the bad character and good suffix rules int suffixShift; int charShift = i - R.get(alphabetIndex(aText.charAt(h))).get(i); if ( i + 1 == aPattern.length() ) { // Mismatch occurred on the first character suffixShift = 1; } else if ( L[i + 1] == -1 ) { // Matched suffix does not appear anywhere in aPattern suffixShift = aPattern.length() - F[i + 1]; } else { // Matched suffix appears in aPattern suffixShift = aPattern.length() - 1 - L[i + 1]; } int shift = Math.max(charShift, suffixShift); if ( shift >= i + 1 ) { // Galil's rule previousK = k; } k += shift; } } return matches; } /* * Create the shift table, F, for the given text, which is an array such that * F[i] is the length of the longest suffix of, aText.substring(i), which is also a prefix of the given text. * * Use case: If a mismatch occurs at character index i - 1 in the pattern, * the magnitude of the shift of the pattern, P, relative to the text is: P.length() - F[i]. / private static int[] fullShiftTable(String aText) { int[] F = new int[aText.length()]; int[] Z = fundamentalPreprocess(aText); int longest = 0; Collections.reverse(Arrays.asList(Z)); for ( int i = 0; i < Z.length; i++ ) { int zv = Z[i]; if ( zv == i + 1 ) { longest = Math.max(zv, longest); } F[F.length - i - 1] = longest; } return F; } /* * Create the good suffix table, L, for the given text, which is an array such that * L[i] = k, is the largest index in the given text, S, * such that S.substring(i) matches a suffix of S.substring(0, k). * * Use case: If a mismatch of characters occurs at index i - 1 in the pattern, P, * then a shift of magnitude, P.length() - L[i], is such that no instances of the pattern in the text are omitted. * Furthermore, a suffix of P.substring(0, L[i]) matches the same substring of the text that was matched by a * suffix in the pattern on the previous matching attempt. * In the case that L[i] = -1, the full shift table must be used. / private static int[] goodSuffixTable(String aText) { int[] L = IntStream.generate( () -> -1 ).limit(aText.length()).toArray(); String reversed = new StringBuilder(aText).reverse().toString(); int[] N = fundamentalPreprocess(reversed); Collections.reverse(Arrays.asList(N)); for ( int j = 0; j < aText.length() - 1; j++ ) { int i = aText.length() - N[j]; if ( i != aText.length() ) { L[i] = j; } } return L; } /* * Create the bad character table, R, for the given text, * which is a list indexed by the ASCII value of a character, C, in the given text. * * Use case: The entry at index i of R is a list of size: 1 + length of the given text. * This inner list gives at each index j the next position at which character C will be found * while traversing the given text from right to left starting from index j. / private static List<List<Integer>> badCharacterTable(String aText) { if ( aText.isEmpty() ) { return Collections.emptyList(); } List<List<Integer>> R = IntStream.range(0, ALPHABET_SIZE).boxed() .map( i -> new ArrayList<Integer>(Collections.nCopies(1,-1)) ).collect(Collectors.toList()); List<Integer> alpha = new ArrayList<Integer>(Collections.nCopies(ALPHABET_SIZE, -1)); for ( int i = 0; i < aText.length(); i++ ) { alpha.set(alphabetIndex(aText.charAt(i)), i); for ( int j = 0; j < alpha.size(); j++ ) { R.get(j).add(alpha.get(j)); } } return R; } /* * Create the fundamental preprocess, Z, of the given text, which is an array such that * Z[i] is the length of the substring of the given text beginning at index i which is also a prefix of the text. / private static int[] fundamentalPreprocess(String aText) { if ( aText.isEmpty() ) { return new int[0]; } if ( aText.length() == 1 ) { return new int[] { 1 }; } int[] Z = new int[aText.length()]; Z[0] = aText.length(); Z[1] = matchLength(aText, 0, 1); for ( int i = 2; i <= Z[1]; i++ ) { Z[i] = Z[1] - i + 1; } // Define the left and right limits of the z-box int left = 0; int right = 0; for ( int i = 2 + Z[1]; i < aText.length(); i++ ) { if ( i <= right ) { // i falls within existing z-box final int k = i - left; final int b = Z[k]; final int a = right - i + 1; if ( b < a ) { // b ends within existing z-box Z[i] = b; } else { // b ends at or after the end of the z-box, // an explicit match to the right of the z-box is required Z[i] = a + matchLength(aText, a, right + 1); left = i; right = i + Z[i] - 1; } } else { // i does not fall within existing z-box Z[i] = matchLength(aText, 0, i); if ( Z[i] > 0 ) { left = i; right = i + Z[i] - 1; } } } return Z; } /* * Return the length of the match of the two substrings of the given text beginning at each of the given indexes. / private static int matchLength(String aText, int aIndexOne, int aIndexTwo) { if ( aIndexOne == aIndexTwo ) { return aText.length() - aIndexOne; } int matchCount = 0; while ( aIndexOne < aText.length() && aIndexTwo < aText.length() && aText.charAt(aIndexOne) == aText.charAt(aIndexTwo) ) { matchCount += 1; aIndexOne += 1; aIndexTwo += 1; } return matchCount; } /* * Return the ASCII index of the given character, if it is such, otherwise throw an illegal argument exception. / private static int alphabetIndex(char aChar) { final int result = (int) aChar; if ( result >= ALPHABET_SIZE ) { throw new IllegalArgumentException("Not an ASCII character:" + aChar); } return result; } / The range of ASCII characters is 0..255, both inclusive. / private static final int ALPHABET_SIZE = 256; } </syntaxhighlight> <pre> text1 = GCTAGCTCTACGAGTCTA text2 = GGCTATAATGCGTA text3 = there would have been a time for such a word text4 = needle need noodle needle text5 = DKnuthusesandshowsanimaginarycomputertheMIXanditsassociatedmachinecodeandassemblylanguagestoillustrate text6 = Farms grew an acre of alfalfa on the dairy's behalf, with bales of that alfalfa exchanged for milk. Found "TCTA" in 'text1' at indexes [6, 14] Found "TAATAAA" in 'text2' at indexes [] Found "word" in 'text3' at indexes [40] Found "needle" in 'text4' at indexes [0, 19] Found "put" in 'text5' at indexes [32] Found "and" in 'text5' at indexes [10, 43, 70] Found "alfalfa" in 'text6' at indexes [22, 72] </pre> =={{header\|Julia}}== Some of this code is derived from Python code by Ben Langmead. Because of that ancestry, it returns Line 1,449 ⟶ 1,976: The same examples have been used as in the Julia entry above. <syntaxhighlight lang="~~ecmascript~~wren">class BoyerMoore { /* * Returns the index within this string of the first occurrence of the Line 1,595 ⟶ 2,122: Found 'alfalfa' in 'text6' at indices [33, 87] </pre> =={{header\|Yabasic}}== {{trans\|FreeBASIC}} <syntaxhighlight lang="vb">case_insensitive = false sub preBmBc(pat$) local m, i m = len(pat$) redim bmBc(m) for i = 1 to m-1 bmBc(val(mid$(pat$, i, 1))) = m - i next end sub sub suffixes(pat$) local m, g, i, f m = len(pat$) g = m redim suff(m) suff(m) = m for i = m-1 to 1 step -1 if i > g and suff(i + m - f) < i - g then suff(i) = suff(i + m - f) else if i < g g = i f = i while g >= 1 and mid$(pat$, g, 1) = mid$(pat$, g + m - f, 1) g = q - 1 wend suff(i) = f - g fi next end sub sub preBmGs(pat$) local m, j, i m = len(pat$) j = 1 redim suff(m) redim bmGs(m) for i = m to 1 step -1 if suff(i) = i then while j < m - i if bmGs(j) = m bmGs(j) = m - i j = j + 1 wend fi next for i = 1 to m-1 bmGs(m - suff(i)) = m - i next end sub sub BM(pat$, s$, case_insensitive) local pins pins$ = "'" + pat$ + "' in " + "'" + s$ + "'" if case_insensitive then pat$ = lower$(pat$) s$ = lower$(s$) fi //* Preprocessing // preBmGs(pat$) preBmBc(pat$) // Searching *// j = 0 n = len(s$) m = len(pat$) i = m while j <= n - m i = i - 1 if mid$(pat$,i,1) <> mid$(s$,i+j,1) break if i < 1 then j = j + bmGs(0) else j = j + max(bmGs(i), bmBc(len(mid$(s$,i+j,1)) - m + i)) : fi wend many = instr(s$, pat$) tmp$ = "" if not many > 0 then print "No ", pins$ else while many > 0 //if not found loop will be skipped tmp$ = tmp$ + str$(many) + "," many = instr(s$, pat$, many + 1) wend print "Found ", pins$, " at indices [", tmp$, chr$(8), "]" fi end sub BM("GCAGAGAG","GCATCGCAGAGAGTATACAGTACG") BM("TCTA","GCTAGCTCTACGAGTCTA") BM("TAATAAA","GGCTATAATGCGTA") BM("word","there would have been a time for such a word") BM("needle","needle need noodle needle") book$ = "InhisbookseriesTheArtofComputerProgrammingpublishedbyAddisonWesleyDKnuthusesanimaginarycomputertheMIXanditsassociatedmachinecodeandassemblylanguagestoillustratetheconceptsandalgorithmsastheyarepresented" BM("put",book$) BM("and",book$) farm$ = "Nearby farms grew a half acre of alfalfa on the dairy's behalf, with bales of all that alfalfa exchanged for milk." BM("alfalfa",farm$)</syntaxhighlight> {{out}} <pre>Same as QBasic entry.</pre>