Skip to content

Commit

Permalink
feat: add Boyer-Moore algorithm (#259)
Browse files Browse the repository at this point in the history
This PR adds Boyer-Moore algorithm.

## Pull Request type

<!-- Please try to limit your pull request to one type; submit multiple
pull requests if needed. -->

Please check the type of change your PR introduces:

- [ ] Bugfix
- [x] Feature
- [ ] Code style update (formatting, renaming)
- [ ] Refactoring (no functional changes, no API changes)
- [ ] Build-related changes
- [ ] Documentation content changes
- [ ] Other (please describe):

## What is the current behavior?

<!-- Please describe the current behavior that you are modifying, or
link to a relevant issue. -->

Issue Number: N/A

## What is the new behavior?

<!-- Please describe the behavior or changes that are being added by
this PR. -->

- Search for a pattern `ByteArray` in a text `ByteArray` using the
Boyer-Moore algorithm.
- All tests pass.

## Does this introduce a breaking change?

- [ ] Yes
- [x] No

<!-- If this does introduce a breaking change, please describe the
impact and migration path for existing applications below. -->

## Other information

<!-- Any other information that is important to this PR, such as
screenshots of how the component looks before and after the change. -->
  • Loading branch information
Soptq authored Jan 25, 2024
1 parent 8637d35 commit 7dea99e
Show file tree
Hide file tree
Showing 5 changed files with 251 additions and 0 deletions.
6 changes: 6 additions & 0 deletions src/searching/README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
# Searching

## [Boyer-Moore algorithm](./src/bm_search.cairo)

The Boyer-Moore algorithm is a string-searching algorithm that finds the positions of a pattern in a text. In its full form it preprocesses the pattern into two lookup tables: one for the bad character rule and one for the good suffix rule. The bad character rule shifts the pattern so that the mismatched text character lines up with its last occurrence in the pattern. The good suffix rule shifts the pattern so that the part of the text that already matched a suffix of the pattern lines up with an earlier occurrence of that suffix in the pattern. The implementation here builds only the last-occurrence table used by the bad character rule.

The Boyer-Moore algorithm has a best-case time complexity of O(n/m) and a worst-case time complexity of O(nm), where n is the length of the text and m is the length of the pattern. In practice it is one of the most efficient string-searching algorithms, especially for long patterns.

## [Binary search](./src/binary_search.cairo)

The binary search algorithm is a simple search in an ordered array-like compound. It starts by comparing the value we are looking for to the middle of the array. If it's not a match, the function calls itself recursively on the right or left half of the array until it does(n't) find the value in the array.
Expand Down
93 changes: 93 additions & 0 deletions src/searching/src/bm_search.cairo
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
// The Boyer-Moore string search algorithm (bad-character rule driven by a last-occurrence table)
use dict::Felt252DictTrait;

/// Searches `text` for every occurrence of `pattern`. Each candidate alignment is
/// compared from right to left, and a last-occurrence table of the pattern bytes
/// decides how far the alignment moves afterwards.
/// * `text` - The text to search in.
/// * `pattern` - The pattern to search for.
/// # Returns
/// * `Array<usize>` - The start index of every match, in increasing order. Empty
/// when either input is empty or `pattern` is longer than `text`.
fn bm_search(text: @ByteArray, pattern: @ByteArray) -> Array<usize> {
    let mut matches: Array<usize> = array![];
    let n = text.len();
    let m = pattern.len();

    // Nothing can match when either side is empty or the pattern does not fit.
    if n == 0 || m == 0 || m > n {
        return matches;
    }

    // last_seen[byte] holds the 1-based index of the rightmost `byte` in `pattern`.
    // Keys that were never inserted read back as 0, so 0 means "not in the pattern".
    let mut last_seen = felt252_dict_new::<usize>();
    let mut filled: usize = 0;
    loop {
        if filled == m {
            break;
        }
        let byte = pattern.at(filled).unwrap();
        filled += 1; // `filled` is now the 1-based position of `byte`
        last_seen.insert(byte.into(), filled);
    };

    let last_start = n - m; // rightmost alignment that still fits inside `text`
    let mut start: usize = 0;
    loop {
        if start > last_start {
            break;
        }

        // `unmatched` counts the leading pattern bytes not yet confirmed; it reaches 0
        // on a full match, otherwise it is the 1-based position of the mismatch.
        let mut unmatched = m;
        loop {
            if unmatched == 0 {
                break;
            }
            if pattern.at(unmatched - 1).unwrap() != text.at(start + unmatched - 1).unwrap() {
                break;
            }
            unmatched -= 1;
        };

        if unmatched != 0 {
            // Mismatch: line the offending text byte up with its rightmost occurrence
            // in the pattern, but always advance by at least one position.
            let bad_byte = text.at(start + unmatched - 1).unwrap();
            let seen_at = last_seen.get(bad_byte.into());
            if unmatched > seen_at + 1 {
                start += unmatched - seen_at;
            } else {
                start += 1;
            }
        } else {
            matches.append(start);

            // Full match: use the byte just past the window to pick the next alignment.
            let window_end = start + m;
            if window_end < n {
                let following = text.at(window_end).unwrap();
                let seen_at = last_seen.get(following.into());
                // seen_at == 0 (byte absent from the pattern) gives a jump of m + 1.
                start += m - seen_at + 1;
            } else {
                start += 1;
            }
        }
    };

    matches
}
1 change: 1 addition & 0 deletions src/searching/src/lib.cairo
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
mod binary_search;
mod bm_search;
mod dijkstra;

#[cfg(test)]
Expand Down
1 change: 1 addition & 0 deletions src/searching/src/tests.cairo
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
mod binary_search_test;
mod bm_search_test;
mod dijkstra_test;
150 changes: 150 additions & 0 deletions src/searching/src/tests/bm_search_test.cairo
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
use alexandria_searching::bm_search::bm_search;


/// Compares two spans element by element.
/// * `a` - The first span.
/// * `b` - The second span.
/// # Returns
/// * `bool` - True when both spans have the same length and the same values in
/// the same order, false otherwise.
fn is_equal(mut a: Span<u32>, mut b: Span<u32>) -> bool {
    if a.len() != b.len() {
        return false;
    }
    loop {
        // Lengths are equal, so `b` runs out exactly when `a` does.
        if a.is_empty() {
            break true;
        }
        let left = *a.pop_front().unwrap();
        let right = *b.pop_front().unwrap();
        if left != right {
            break false;
        }
    }
}


#[test]
#[available_gas(5000000)]
fn bm_search_test_1() {
    // Haystack "AABCAB12AFAABCABFFEGABCAB" as ASCII bytes.
    let mut haystack_bytes = array![
        0x41_u8,
        0x41,
        0x42,
        0x43,
        0x41,
        0x42,
        0x31,
        0x32,
        0x41,
        0x46,
        0x41,
        0x41,
        0x42,
        0x43,
        0x41,
        0x42,
        0x46,
        0x46,
        0x45,
        0x47,
        0x41,
        0x42,
        0x43,
        0x41,
        0x42
    ]
        .span();
    let mut haystack: ByteArray = Default::default();
    loop {
        match haystack_bytes.pop_front() {
            Option::Some(byte) => { haystack.append_byte(*byte); },
            Option::None => { break; },
        };
    };

    // Needle "ABCAB".
    let mut needle: ByteArray = Default::default();
    needle.append_byte(0x41_u8);
    needle.append_byte(0x42_u8);
    needle.append_byte(0x43_u8);
    needle.append_byte(0x41_u8);
    needle.append_byte(0x42_u8);

    let found = bm_search(@haystack, @needle);
    let expected: Array<usize> = array![1, 11, 20];
    assert(is_equal(found.span(), expected.span()), 'invalid result');
}

#[test]
#[available_gas(5000000)]
fn bm_search_test_2() {
    // Haystack "AABCAB12AFAABCABFFEGABCAB" as ASCII bytes.
    let mut haystack_bytes = array![
        0x41_u8,
        0x41,
        0x42,
        0x43,
        0x41,
        0x42,
        0x31,
        0x32,
        0x41,
        0x46,
        0x41,
        0x41,
        0x42,
        0x43,
        0x41,
        0x42,
        0x46,
        0x46,
        0x45,
        0x47,
        0x41,
        0x42,
        0x43,
        0x41,
        0x42
    ]
        .span();
    let mut haystack: ByteArray = Default::default();
    loop {
        match haystack_bytes.pop_front() {
            Option::Some(byte) => { haystack.append_byte(*byte); },
            Option::None => { break; },
        };
    };

    // Needle "FFF", which never occurs in the haystack.
    let mut needle: ByteArray = Default::default();
    needle.append_byte(0x46_u8);
    needle.append_byte(0x46_u8);
    needle.append_byte(0x46_u8);

    let found = bm_search(@haystack, @needle);
    let expected: Array<usize> = array![];
    assert(is_equal(found.span(), expected.span()), 'invalid result');
}

#[test]
#[available_gas(5000000)]
fn bm_search_test_3() {
    // Haystack "AABCAB12AFAABCABFFEGABCAB" as ASCII bytes.
    let mut haystack_bytes = array![
        0x41_u8,
        0x41,
        0x42,
        0x43,
        0x41,
        0x42,
        0x31,
        0x32,
        0x41,
        0x46,
        0x41,
        0x41,
        0x42,
        0x43,
        0x41,
        0x42,
        0x46,
        0x46,
        0x45,
        0x47,
        0x41,
        0x42,
        0x43,
        0x41,
        0x42
    ]
        .span();
    let mut haystack: ByteArray = Default::default();
    loop {
        match haystack_bytes.pop_front() {
            Option::Some(byte) => { haystack.append_byte(*byte); },
            Option::None => { break; },
        };
    };

    // Needle "CAB".
    let mut needle: ByteArray = Default::default();
    needle.append_byte(0x43_u8);
    needle.append_byte(0x41_u8);
    needle.append_byte(0x42_u8);

    let found = bm_search(@haystack, @needle);
    let expected: Array<usize> = array![3, 13, 22];
    assert(is_equal(found.span(), expected.span()), 'invalid result');
}

0 comments on commit 7dea99e

Please sign in to comment.