Boyer-Moore string search: Difference between revisions

Added Yabasic
m (Added a precision in "reverse" function comment.)
(Added Yabasic)
 
(7 intermediate revisions by 4 users not shown)
Line 176:
Found and at: [101, 128, 171]
Found alfalfa at: [33, 87]
</pre>
 
=={{header|C++}}==
<syntaxhighlight lang="c++">
#include <algorithm>
#include <cstdint>
#include <experimental/iterator>
#include <functional>
#include <iomanip>
#include <iostream>
#include <string>
#include <vector>
 
void display(const std::vector<int32_t>& numbers) {
std::cout << "[";
std::copy(std::begin(numbers), std::end(numbers), std::experimental::make_ostream_joiner(std::cout, ", "));
std::cout << "]" << std::endl;
}
 
int32_t string_search_single(const std::string& haystack, const std::string& needle) {
auto it = std::search(haystack.begin(), haystack.end(), std::boyer_moore_searcher(needle.begin(), needle.end()));
 
if ( it != haystack.end() ) {
return std::distance(haystack.begin(), it);
}
return -1;
}
 
std::vector<int32_t> string_search(const std::string& haystack, const std::string& needle) {
std::vector<int32_t> result = {};
uint64_t start = 0;
int32_t index = 0;
while ( index >= 0 && start < haystack.length() ) {
std::string haystackReduced = haystack.substr(start);
index = string_search_single(haystackReduced, needle);
if ( index >= 0 ) {
result.push_back(start + index);
start += index + needle.length();
}
}
return result;
}
 
int main() {
const std::vector<std::string> texts = {
"GCTAGCTCTACGAGTCTA",
"GGCTATAATGCGTA",
"there would have been a time for such a word",
"needle need noodle needle",
"DKnuthusesandprogramsanimaginarycomputertheMIXanditsassociatedmachinecodeandassemblylanguages",
"Nearby farms grew an acre of alfalfa on the dairy's behalf, with bales of that alfalfa exchanged for milk."
};
 
const std::vector<std::string> patterns = { "TCTA", "TAATAAA", "word", "needle", "and", "alfalfa" };
 
for ( uint64_t i = 0; i < texts.size(); ++i ) {
std::cout << "text" << ( i + 1 ) << " = " << texts[i] << std::endl;
}
std::cout << std::endl;
 
for ( uint64_t i = 0; i < texts.size(); ++i ) {
std::vector<int32_t> indexes = string_search(texts[i], patterns[i]);
std::cout << "Found " << std::quoted(patterns[i]) << " in 'text" << ( i + 1 ) << "' at indexes ";
display(string_search(texts[i], patterns[i]));
}
}
</syntaxhighlight>
{{ out }}
<pre>
text1 = GCTAGCTCTACGAGTCTA
text2 = GGCTATAATGCGTA
text3 = there would have been a time for such a word
text4 = needle need noodle needle
text5 = DKnuthusesandprogramsanimaginarycomputertheMIXanditsassociatedmachinecodeandassemblylanguages
text6 = Nearby farms grew an acre of alfalfa on the dairy's behalf, with bales of that alfalfa exchanged for milk.
 
Found "TCTA" in 'text1' at indexes [6, 14]
Found "TAATAAA" in 'text2' at indexes []
Found "word" in 'text3' at indexes [40]
Found "needle" in 'text4' at indexes [0, 19]
Found "and" in 'text5' at indexes [10, 46, 73]
Found "alfalfa" in 'text6' at indexes [29, 79]
</pre>
 
Line 268 ⟶ 350:
 
<pre>33</pre>
</pre>
 
=={{header|Fortran}}==
<syntaxhighlight lang="Fortran">
module bm_str
implicit none
private
public :: bmstr
!
! PARAMETER definitions
!
integer , private , parameter :: NO_OF_CHARS = 256, SIZEX=256
contains
pure subroutine badcharheuristic(Str,M,Badchar)
implicit none
! Dummy arguments
!
integer :: M
integer , dimension(NO_OF_CHARS) :: Badchar
character(1) , dimension(M) :: Str
intent (in) M , Str
intent (out) Badchar
!
! Local variables
!
integer :: i
! Code starts here
Badchar(NO_OF_CHARS) = -1
do i = 1 , M
Badchar(iachar(Str(i))) = i
enddo
return
end subroutine badcharheuristic
function bmstr(Pat,M,Str,N) result(found)
implicit none
!
! Dummy arguments
!
integer :: M
integer :: N
character(len=m) :: Pat
character(len=n) :: Str
intent (in) M , N , Pat , Str
!
! Local variables
!
integer , dimension(NO_OF_CHARS) :: badchar
integer :: found
integer :: i
integer :: j
integer :: s
! Code starts here
!
found = -1
if ( (M==0) .OR. (N==0) .OR. (M>N) ) return
badchar = 0
call badcharheuristic(Pat,M,badchar)
i = 0
s = 0
do while ( s<=(N-M) )
j = M
do j = m,1,-1
if((Pat(j:j) /= Str(s+j:s+j)) )exit ! Leave, the pattern doesn't match.
enddo
if ( j < 1 ) then ! Found, let's leave
found = s + 1
return
endif
i = badchar(iachar(Str(s+j:s+j)))
s = s + MAX(1,j-i)
enddo
found = -1
return
end function bmstr
end module bm_str
</syntaxhighlight>
</pre>
 
=={{header|FreeBASIC}}==
<syntaxhighlight lang="vb">#define max(a, b) iif((a) > (b), (a), (b))
 
Dim Shared As Integer suff(), bmBc(), bmGs()
 
Sub preBmBc(pat As String)
Dim As Integer m = Len(pat), i
Redim bmBc(m)
For i = 1 To m-1
bmBc(Cint(pat[i])) = m - i
Next
End Sub
 
Sub suffixes(pat As String)
Dim As Integer m = Len(pat), g = m, i, f
Redim suff(m)
suff(m) = m
For i = m-1 To 1 Step -1
If i > g And suff(i + m - f) < i - g Then
suff(i) = suff(i + m - f)
Else
If i < g Then g = i
f = i
While g >= 1 And pat[g] = pat[g + m - f]
g -= 1
Wend
suff(i) = f - g
End If
Next
End Sub
 
Sub preBmGs(pat As String)
Dim As Integer m = Len(pat), j = 1, i
Redim suff(m)
Redim bmGs(m)
For i = m To 1 Step -1
If suff(i) = i Then
While j < m - i
If bmGs(j) = m Then bmGs(j) = m - i
j += 1
Wend
End If
Next
For i = 1 To m-1
bmGs(m - suff(i)) = m - i
Next
End Sub
 
Sub BM(pat As String, s As String, case_insensitive As Boolean = False)
Dim As String pins = "'" & pat & "' in " & "'" & s & "'"
If case_insensitive Then
pat = Lcase(pat)
s = Lcase(s)
End If
' Preprocessing
preBmGs(pat)
preBmBc(pat)
' Searching
Dim As Integer j = 0, n = Len(s), m = Len(pat), i = m
While j <= n - m
i -= 1
If pat[i] <> s[i+j] Then Exit While
j += Iif(i < 1, bmGs(0), max(bmGs(i), bmBc(Len(s[i+j]) - m + i)))
Wend
Dim As Integer many = Instr(s, pat)
Dim As String tmp = ""
If Not many > 0 Then
Print "No "; pins
Else
Do While many > 0 'if not found loop will be skipped
tmp &= Str(many) & ","
many = Instr(many + 1, s, pat)
Loop
Print Using "Found & at indices [&]"; pins; tmp & Chr(8)
End If
End Sub
 
BM("GCAGAGAG","GCATCGCAGAGAGTATACAGTACG")
BM("TCTA","GCTAGCTCTACGAGTCTA")
BM("TAATAAA","GGCTATAATGCGTA")
BM("word","there would have been a time for such a word")
BM("needle","needle need noodle needle")
Const book = "InhisbookseriesTheArtofComputerProgrammingpublishedbyAddisonWesley" + _
"DKnuthusesanimaginarycomputertheMIXanditsassociatedmachinecodeand" + _
"assemblylanguagestoillustratetheconceptsandalgorithmsastheyarepresented"
BM("put",book)
BM("and",book)
Const farm = "Nearby farms grew a half acre of alfalfa on the dairy's behalf, with " + _
"bales of all that alfalfa exchanged for milk."
BM("alfalfa",farm)
 
Sleep</syntaxhighlight>
{{out}}
<pre>Found 'GCAGAGAG' in 'GCATCGCAGAGAGTATACAGTACG' at indices [6]
Found 'TCTA' in 'GCTAGCTCTACGAGTCTA' at indices [7,15]
No 'TAATAAA' in 'GGCTATAATGCGTA'
Found 'word' in 'there would have been a time for such a word' at indices [41]
Found 'needle' in 'needle need noodle needle' at indices [1,20]
Found 'put' in 'InhisbookseriesTheArtofComputerProgrammingpublishedbyAddisonWesleyDKnuthusesanimaginarycomputertheMIXanditsassociatedmachinecodeandassemblylanguagestoillustratetheconceptsandalgorithmsastheyarepresented' at indices [27,91]
Found 'and' in InhisbookseriesTheArtofComputerProgrammingpublishedbyAddisonWesleyDKnuthusesanimaginarycomputertheMIXanditsassociatedmachinecodeandassemblylanguagestoillustratetheconceptsandalgorithmsastheyarepresented' at indices [102,129,172]
Found 'alfalfa' in 'Nearby farms grew a half acre of alfalfa on the dairy's behalf, with bales of all that alfalfa exchanged for milk.' at indices [34,88]</pre>
 
=={{header|J}}==
Line 352 ⟶ 619:
 
In other words: <tt>bmsearch0</tt> takes slightly over twice as long as <tt>bmsearch1</tt> for a somewhat randomly picked example search (though the results are the same).
 
=={{header|Java}}==
<syntaxhighlight lang="java">
 
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
 
/**
* An implementation of the Boyer-Moore string search algorithm.
* It finds all occurrences of a pattern in a text, performing a case-insensitive search on ASCII characters.
*
* For all full description of the algorithm visit:
* https://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string-search_algorithm
*/
public final class BoyerMooreStringSearch {
public static void main(String[] aArgs) {
List<String> texts = List.of(
"GCTAGCTCTACGAGTCTA",
"GGCTATAATGCGTA",
"there would have been a time for such a word",
"needle need noodle needle",
"DKnuthusesandshowsanimaginarycomputertheMIXanditsassociatedmachinecodeandassemblylanguagestoillustrate",
"Farms grew an acre of alfalfa on the dairy's behalf, with bales of that alfalfa exchanged for milk.");
List<String> patterns = List.of( "TCTA", "TAATAAA", "word", "needle", "put", "and", "alfalfa" );
for ( int i = 0; i < texts.size(); i++ ) {
System.out.println("text" + ( i + 1 ) + " = " + texts.get(i));
}
System.out.println();
for ( int i = 0; i < patterns.size(); i++ ) {
final int j = ( i < 5 ) ? i : i - 1;
System.out.println("Found \"" + patterns.get(i) + "\" in 'text" + ( j + 1 ) + "' at indexes "
+ stringSearch(texts.get(j), patterns.get(i)));
}
}
/**
* Return a list of indexes at which the given pattern matches the given text.
*/
private static List<Integer> stringSearch(String aText, String aPattern) {
if ( aPattern.isEmpty() || aText.isEmpty() || aText.length() < aPattern.length() ) {
return Collections.emptyList();
}
 
List<Integer> matches = new ArrayList<Integer>();
 
// Preprocessing
List<List<Integer>> R = badCharacterTable(aPattern);
int[] L = goodSuffixTable(aPattern);
int[] F = fullShiftTable(aPattern);
 
int k = aPattern.length() - 1; // Represents the alignment of the end of aPattern relative to aText
int previousK = -1; // Represents the above alignment in the previous phase
while ( k < aText.length() ) {
int i = aPattern.length() - 1; // Index of the character to compare in aPattern
int h = k; // Index of the character to compare in aText
while ( i >= 0 && h > previousK && aPattern.charAt(i) == aText.charAt(h) ) {
i -= 1;
h -= 1;
}
if ( i == -1 || h == previousK ) { // Match has been found
matches.add(k - aPattern.length() + 1);
k += ( aPattern.length() > 1 ) ? aPattern.length() - F[1] : 1;
} else { // No match, shift by the maximum of the bad character and good suffix rules
int suffixShift;
int charShift = i - R.get(alphabetIndex(aText.charAt(h))).get(i);
if ( i + 1 == aPattern.length() ) { // Mismatch occurred on the first character
suffixShift = 1;
} else if ( L[i + 1] == -1 ) { // Matched suffix does not appear anywhere in aPattern
suffixShift = aPattern.length() - F[i + 1];
} else { // Matched suffix appears in aPattern
suffixShift = aPattern.length() - 1 - L[i + 1];
}
int shift = Math.max(charShift, suffixShift);
if ( shift >= i + 1 ) { // Galil's rule
previousK = k;
}
k += shift;
}
}
return matches;
}
/**
* Create the shift table, F, for the given text, which is an array such that
* F[i] is the length of the longest suffix of, aText.substring(i), which is also a prefix of the given text.
*
* Use case: If a mismatch occurs at character index i - 1 in the pattern,
* the magnitude of the shift of the pattern, P, relative to the text is: P.length() - F[i].
*/
private static int[] fullShiftTable(String aText) {
int[] F = new int[aText.length()];
int[] Z = fundamentalPreprocess(aText);
int longest = 0;
Collections.reverse(Arrays.asList(Z));
for ( int i = 0; i < Z.length; i++ ) {
int zv = Z[i];
if ( zv == i + 1 ) {
longest = Math.max(zv, longest);
}
F[F.length - i - 1] = longest;
}
return F;
}
/**
* Create the good suffix table, L, for the given text, which is an array such that
* L[i] = k, is the largest index in the given text, S,
* such that S.substring(i) matches a suffix of S.substring(0, k).
*
* Use case: If a mismatch of characters occurs at index i - 1 in the pattern, P,
* then a shift of magnitude, P.length() - L[i], is such that no instances of the pattern in the text are omitted.
* Furthermore, a suffix of P.substring(0, L[i]) matches the same substring of the text that was matched by a
* suffix in the pattern on the previous matching attempt.
* In the case that L[i] = -1, the full shift table must be used.
*/
private static int[] goodSuffixTable(String aText) {
int[] L = IntStream.generate( () -> -1 ).limit(aText.length()).toArray();
String reversed = new StringBuilder(aText).reverse().toString();
int[] N = fundamentalPreprocess(reversed);
Collections.reverse(Arrays.asList(N));
for ( int j = 0; j < aText.length() - 1; j++ ) {
int i = aText.length() - N[j];
if ( i != aText.length() ) {
L[i] = j;
}
}
return L;
}
/**
* Create the bad character table, R, for the given text,
* which is a list indexed by the ASCII value of a character, C, in the given text.
*
* Use case: The entry at index i of R is a list of size: 1 + length of the given text.
* This inner list gives at each index j the next position at which character C will be found
* while traversing the given text from right to left starting from index j.
*/
private static List<List<Integer>> badCharacterTable(String aText) {
if ( aText.isEmpty() ) {
return Collections.emptyList();
}
List<List<Integer>> R = IntStream.range(0, ALPHABET_SIZE).boxed()
.map( i -> new ArrayList<Integer>(Collections.nCopies(1,-1)) ).collect(Collectors.toList());
List<Integer> alpha = new ArrayList<Integer>(Collections.nCopies(ALPHABET_SIZE, -1));
for ( int i = 0; i < aText.length(); i++ ) {
alpha.set(alphabetIndex(aText.charAt(i)), i);
for ( int j = 0; j < alpha.size(); j++ ) {
R.get(j).add(alpha.get(j));
}
}
return R;
}
/**
* Create the fundamental preprocess, Z, of the given text, which is an array such that
* Z[i] is the length of the substring of the given text beginning at index i which is also a prefix of the text.
*/
private static int[] fundamentalPreprocess(String aText) {
if ( aText.isEmpty() ) {
return new int[0];
}
if ( aText.length() == 1 ) {
return new int[] { 1 };
}
int[] Z = new int[aText.length()];
Z[0] = aText.length();
Z[1] = matchLength(aText, 0, 1);
for ( int i = 2; i <= Z[1]; i++ ) {
Z[i] = Z[1] - i + 1;
}
// Define the left and right limits of the z-box
int left = 0;
int right = 0;
for ( int i = 2 + Z[1]; i < aText.length(); i++ ) {
if ( i <= right ) { // i falls within existing z-box
final int k = i - left;
final int b = Z[k];
final int a = right - i + 1;
if ( b < a ) { // b ends within existing z-box
Z[i] = b;
} else { // b ends at or after the end of the z-box,
// an explicit match to the right of the z-box is required
Z[i] = a + matchLength(aText, a, right + 1);
left = i;
right = i + Z[i] - 1;
}
} else { // i does not fall within existing z-box
Z[i] = matchLength(aText, 0, i);
if ( Z[i] > 0 ) {
left = i;
right = i + Z[i] - 1;
}
}
}
return Z;
}
/**
* Return the length of the match of the two substrings of the given text beginning at each of the given indexes.
*/
private static int matchLength(String aText, int aIndexOne, int aIndexTwo) {
if ( aIndexOne == aIndexTwo ) {
return aText.length() - aIndexOne;
}
int matchCount = 0;
while ( aIndexOne < aText.length() && aIndexTwo < aText.length()
&& aText.charAt(aIndexOne) == aText.charAt(aIndexTwo) ) {
matchCount += 1;
aIndexOne += 1;
aIndexTwo += 1;
}
return matchCount;
}
/**
* Return the ASCII index of the given character, if it is such, otherwise throw an illegal argument exception.
*/
private static int alphabetIndex(char aChar) {
final int result = (int) aChar;
if ( result >= ALPHABET_SIZE ) {
throw new IllegalArgumentException("Not an ASCII character:" + aChar);
}
return result;
}
 
/* The range of ASCII characters is 0..255, both inclusive. */
private static final int ALPHABET_SIZE = 256;
}
</syntaxhighlight>
<pre>
text1 = GCTAGCTCTACGAGTCTA
text2 = GGCTATAATGCGTA
text3 = there would have been a time for such a word
text4 = needle need noodle needle
text5 = DKnuthusesandshowsanimaginarycomputertheMIXanditsassociatedmachinecodeandassemblylanguagestoillustrate
text6 = Farms grew an acre of alfalfa on the dairy's behalf, with bales of that alfalfa exchanged for milk.
 
Found "TCTA" in 'text1' at indexes [6, 14]
Found "TAATAAA" in 'text2' at indexes []
Found "word" in 'text3' at indexes [40]
Found "needle" in 'text4' at indexes [0, 19]
Found "put" in 'text5' at indexes [32]
Found "and" in 'text5' at indexes [10, 43, 70]
Found "alfalfa" in 'text6' at indexes [22, 72]
</pre>
 
=={{header|Julia}}==
Some of this code is derived from Python code by Ben Langmead. Because of that ancestry, it returns
Line 1,449 ⟶ 1,976:
 
The same examples have been used as in the Julia entry above.
<syntaxhighlight lang="ecmascriptwren">class BoyerMoore {
/**
* Returns the index within this string of the first occurrence of the
Line 1,595 ⟶ 2,122:
Found 'alfalfa' in 'text6' at indices [33, 87]
</pre>
 
=={{header|Yabasic}}==
{{trans|FreeBASIC}}
<syntaxhighlight lang="vb">case_insensitive = false
 
sub preBmBc(pat$)
local m, i
 
m = len(pat$)
redim bmBc(m)
for i = 1 to m-1
bmBc(val(mid$(pat$, i, 1))) = m - i
next
end sub
 
sub suffixes(pat$)
local m, g, i, f
m = len(pat$)
g = m
redim suff(m)
suff(m) = m
for i = m-1 to 1 step -1
if i > g and suff(i + m - f) < i - g then
suff(i) = suff(i + m - f)
else
if i < g g = i
f = i
while g >= 1 and mid$(pat$, g, 1) = mid$(pat$, g + m - f, 1)
g = q - 1
wend
suff(i) = f - g
fi
next
end sub
 
sub preBmGs(pat$)
local m, j, i
m = len(pat$)
j = 1
redim suff(m)
redim bmGs(m)
for i = m to 1 step -1
if suff(i) = i then
while j < m - i
if bmGs(j) = m bmGs(j) = m - i
j = j + 1
wend
fi
next
for i = 1 to m-1
bmGs(m - suff(i)) = m - i
next
end sub
 
sub BM(pat$, s$, case_insensitive)
local pins
 
pins$ = "'" + pat$ + "' in " + "'" + s$ + "'"
if case_insensitive then
pat$ = lower$(pat$)
s$ = lower$(s$)
fi
//* Preprocessing *//
preBmGs(pat$)
preBmBc(pat$)
//* Searching *//
j = 0
n = len(s$)
m = len(pat$)
i = m
while j <= n - m
i = i - 1
if mid$(pat$,i,1) <> mid$(s$,i+j,1) break
if i < 1 then j = j + bmGs(0) else j = j + max(bmGs(i), bmBc(len(mid$(s$,i+j,1)) - m + i)) : fi
wend
many = instr(s$, pat$)
tmp$ = ""
if not many > 0 then
print "No ", pins$
else
while many > 0 //if not found loop will be skipped
tmp$ = tmp$ + str$(many) + ","
many = instr(s$, pat$, many + 1)
wend
print "Found ", pins$, " at indices [", tmp$, chr$(8), "]"
fi
end sub
 
BM("GCAGAGAG","GCATCGCAGAGAGTATACAGTACG")
BM("TCTA","GCTAGCTCTACGAGTCTA")
BM("TAATAAA","GGCTATAATGCGTA")
BM("word","there would have been a time for such a word")
BM("needle","needle need noodle needle")
book$ = "InhisbookseriesTheArtofComputerProgrammingpublishedbyAddisonWesleyDKnuthusesanimaginarycomputertheMIXanditsassociatedmachinecodeandassemblylanguagestoillustratetheconceptsandalgorithmsastheyarepresented"
BM("put",book$)
BM("and",book$)
farm$ = "Nearby farms grew a half acre of alfalfa on the dairy's behalf, with bales of all that alfalfa exchanged for milk."
BM("alfalfa",farm$)</syntaxhighlight>
{{out}}
<pre>Same as QBasic entry.</pre>
2,130

edits