String Search and Fuzzy Matching
Recipe | Crates | Categories |
---|---|---|
aho-corasick | [![aho-corasick][c-aho-corasick-badge]][c-aho-corasick] | |
fuzzy-matcher | [![fuzzy-matcher][c-fuzzy-matcher-badge]][c-fuzzy-matcher] | |
memchr | ||
strsim |
FIXME
aho-corasick
aho-corasick
⮳ implements fast multiple substring searching.
use std::collections::HashMap; use aho_corasick::AhoCorasick; fn main() -> anyhow::Result<()> { // Search for occurrences of multiple patterns simultaneously. let patterns = &["apple", "banana", "orange"]; let ac = AhoCorasick::new(patterns)?; // For case-insensitive search, use: // let ac = AhoCorasick::builder() // .ascii_case_insensitive(true) // .build(patterns)?; let text = "I like apple and banana smoothies"; let mut matches = vec![]; for mat in ac.find_iter(text) { matches.push((mat.pattern(), mat.start(), mat.end())); } println!("Basic matches:"); for (pattern_idx, start, end) in matches { println!( "Pattern '{}' at byte range {}..{}", patterns[pattern_idx], start, end ); } // Using with replacement let ac = AhoCorasick::new(patterns)?; let result = ac.replace_all(text, &["APPLE", "BANANA", "ORANGE"]); println!("\nReplaced text: {}", result); // Finding all matches with overlapping patterns let patterns = &["abc", "bc", "c"]; // Use leftmost-first match semantics, which reports leftmost matches, like // a Perl regex. When there are multiple possible leftmost matches, the // match corresponding to the pattern that appeared earlier when // constructing the automaton is reported. let ac = AhoCorasick::builder() .match_kind(aho_corasick::MatchKind::LeftmostFirst) .build(patterns)?; let text = "abcdef"; println!("\nOverlapping matches in 'abcdef':"); for mat in ac.find_iter(text) { println!( "Pattern '{}' at byte range {}..{}", patterns[mat.pattern()], mat.start(), mat.end() ); } // Using with byte offsets let patterns = &[b"abc", b"def"]; let ac = AhoCorasick::new(patterns)?; let text = b"abcdef"; println!("\nMatches in byte sequence:"); for mat in ac.find_iter(text) { println!( "{:?} at byte range {}..{}", mat.pattern(), mat.start(), mat.end() ); } // Example: counting occurrences let patterns = &["he", "she", "his", "hers"]; let ac = AhoCorasick::new(patterns)?; let text = "he said that she was using his book, not hers"; let mut counts = HashMap::new(); for mat in ac.find_iter(text) { let pattern = patterns[mat.pattern()]; *counts.entry(pattern).or_insert(0) += 1; } println!("\nWord counts:"); for (pattern, count) in counts { println!("{}: {}", pattern, count); } Ok(()) }
memchr
memchr
⮳ provides extremely fast routines for 1, 2 or 3 byte search and single substring search (that use SIMD on x86_64, aarch64 and wasm32).
use memchr::memchr; // Use `memchr` to efficiently search for a byte in a slice. // Add the dependency to Cargo.toml: // [dependencies] // memchr = "2.7.4" # Or latest fn main() { let haystack = b"hello world"; // Find first occurrence of 'o'. match memchr(b'o', haystack) { Some(index) => println!("Found 'o' at index: {}", index), None => println!("Character not found"), } // Using `memchr` in a function. fn find_all_occurrences(needle: u8, haystack: &[u8]) -> Vec<usize> { let mut indices = Vec::new(); let mut start = 0; while let Some(pos) = memchr(needle, &haystack[start..]) { let absolute_pos = start + pos; indices.push(absolute_pos); start = absolute_pos + 1; if start >= haystack.len() { break; } } indices } // Find all occurrences of 'l' let all_l_positions = find_all_occurrences(b'l', haystack); println!("All 'l' positions: {:?}", all_l_positions); }
fuzzy-matcher
fuzzy-matcher
⮳ is a fuzzy matching library.
use fuzzy_matcher::FuzzyMatcher; use fuzzy_matcher::skim::SkimMatcherV2; fn main() { // Create a new matcher let matcher = SkimMatcherV2::default(); // Define some example strings to search through let items = [ "fuzzy", "hazy", "buzzy", "scuzzy", "cloudy", "fuzzily", "fuzzed", ]; // The pattern to search for let pattern = "fuzzy"; println!("Searching for pattern: '{}'", pattern); // Find all matches and their scores let mut matches: Vec<(i64, &str)> = items .iter() .filter_map(|item| { matcher .fuzzy_match(item, pattern) .map(|score| (score, *item)) }) .collect(); // Sort by score (highest first) matches.sort_by(|a, b| b.0.cmp(&a.0)); // Display results println!("\nResults (sorted by match score):"); for (score, item) in matches.clone() { println!("Score: {}, Item: '{}'", score, item); } // Example of highlighting matches println!("\nHighlighted matches:"); for (_, item) in matches { // fuzzy match item with pattern, and return the score & matched indices // of characters. if let Some((score, indices)) = matcher.fuzzy_indices(item, pattern) { let mut highlighted = String::new(); let mut last_idx = 0; for &idx in indices.iter() { // Add text before the match highlighted.push_str(&item[last_idx..idx]); // Add the matched character with formatting highlighted.push('['); highlighted.push(item.chars().nth(idx).unwrap()); highlighted.push(']'); last_idx = idx + 1; } // Add remaining text highlighted.push_str(&item[last_idx..]); println!("Score: {}, Highlighted: '{}'", score, highlighted); } } }
strsim
strsim
⮳ implement string similarity metrics. That includes Hamming, Levenshtein, OSA, Damerau-Levenshtein, Jaro, Jaro-Winkler, and Sørensen-Dice.
use strsim::damerau_levenshtein; use strsim::hamming; use strsim::jaro; use strsim::jaro_winkler; use strsim::levenshtein; use strsim::normalized_levenshtein; use strsim::sorensen_dice; // Add to your `Cargo.toml`: // [dependencies] // strsim = "0.11.1" # Or latest fn main() { let s1 = "aleksandr"; let s2 = "alexander"; // Levenshtein distance println!("Levenshtein: {}", levenshtein(s1, s2)); // Normalized Levenshtein distance (0.0 to 1.0) println!("Normalized Levenshtein: {}", normalized_levenshtein(s1, s2)); // Damerau-Levenshtein distance (handles transpositions) println!("Damerau-Levenshtein: {}", damerau_levenshtein(s1, s2)); // Jaro similarity (0.0 to 1.0) println!("Jaro: {}", jaro(s1, s2)); // Jaro-Winkler similarity (0.0 to 1.0) println!("Jaro-Winkler: {}", jaro_winkler(s1, s2)); // Sørensen-Dice coefficient (0.0 to 1.0) println!("Sørensen-Dice: {}", sorensen_dice(s1, s2)); // Hamming distance (only for equal length strings) let s3 = "karolin"; let s4 = "kathrin"; match hamming(s3, s4) { Ok(distance) => println!("Hamming: {}", distance), Err(e) => println!("Error calculating Hamming distance: {}", e), } // Example: Finding closest match let target = "rusty"; let candidates = ["dusty", "rust", "trust", "crusty"]; let closest = candidates .iter() .map(|&c| (c, jaro_winkler(target, c))) .max_by(|a, b| a.1.partial_cmp(&b.1).unwrap()) .unwrap(); println!( "Closest match to '{}' is '{}' with score {}", target, closest.0, closest.1 ); }
Related Topics
- Rust Search Engines.
- Search.
- Strings.