Regular Expressions
Verify and Extract Login from an Email Address
Validates that an email address is formatted correctly, and extracts everything before the @
symbol.
use lazy_static::lazy_static;
use regex::Regex;

/// Returns the login part of `input` (everything before the `@`) when the
/// whole string matches the email pattern, or `None` when it does not.
fn extract_login(input: &str) -> Option<&str> {
    lazy_static! {
        // Compiled once, on first use. The `(?x)` flag lets the pattern
        // contain insignificant whitespace.
        static ref RE: Regex = Regex::new(
            r"(?x) ^(?P<login>[^@\s]+)@ ([[:word:]]+\.)* [[:word:]]+$ "
        )
        .unwrap();
    }

    // Bail out early when the input is not an email address at all.
    let caps = RE.captures(input)?;
    caps.name("login").map(|m| m.as_str())
}

fn main() {
    // Well-formed addresses: the login is printed and checked.
    for (address, expected) in [
        (r"I❤email@example.com", r"I❤email"),
        (r"sdf+sdsfsd.as.sdsd@jhkk.d.rl", r"sdf+sdsfsd.as.sdsd"),
    ] {
        let login = extract_login(address);
        println!("{:?}", login);
        assert_eq!(login, Some(expected));
    }

    // Malformed addresses: more than one `@`, or whitespace in the login.
    for rejected in [r"More@Than@One@at.com", r"Not an email@email"] {
        assert_eq!(extract_login(rejected), None);
    }
}
Extract a list of Unique #hashtags from a Text
Extracts and deduplicates a list of hashtags from text. The hashtags are collected into a HashSet, so the result is unordered.
The hashtag regex given here only catches Latin hashtags that start with a letter. The complete Twitter hashtag regex⮳ is much more complicated.
use std::collections::HashSet;

use lazy_static::lazy_static;
use regex::Regex;

/// Collects every distinct hashtag found in `text`.
///
/// A hashtag here is `#` followed by an ASCII letter and then any number of
/// ASCII letters, digits or underscores. The returned slices borrow from
/// `text`.
fn extract_hashtags(text: &str) -> HashSet<&str> {
    lazy_static! {
        // Compiled once, on first use.
        static ref HASHTAG_REGEX: Regex =
            Regex::new(r"\#[a-zA-Z][0-9a-zA-Z_]*").unwrap();
    }

    let mut tags = HashSet::new();
    for found in HASHTAG_REGEX.find_iter(text) {
        // Inserting into the set drops repeated hashtags.
        tags.insert(found.as_str());
    }
    tags
}

fn main() {
    let tweet = "Hey #world, I just got my new #dog, say hello to Till. #dog #forever #2 #_ ";
    let tags = extract_hashtags(tweet);
    println!("{:?}", tags);

    // `#2` and `#_` do not start with a letter, and `#dog` is counted once.
    for expected in ["#dog", "#forever", "#world"] {
        assert!(tags.contains(expected));
    }
    assert_eq!(tags.len(), 3);
}
Extract Phone Numbers from Text
Processes a string of text using regex::Regex::captures_iter
⮳ to capture multiple phone numbers. The example here is for US convention phone numbers.
//! This example demonstrates how to extract and format phone numbers from a //! text using regular expressions. use std::fmt; use anyhow::Result; use regex::Regex; /// Represents a US phone number /// with its area code, exchange code, /// and subscriber number. struct PhoneNumber<'a> { area: &'a str, exchange: &'a str, subscriber: &'a str, } /// Formats the phone number as "1 (area) exchange-subscriber". impl fmt::Display for PhoneNumber<'_> { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!(f, "1 ({}) {}-{}", self.area, self.exchange, self.subscriber) } } /// Extracts and formats phone numbers from a given text. /// /// This function defines a regular expression to match phone numbers in /// various formats. It then iterates over the matches, extracts the area /// code, exchange code, and subscriber number, and formats them into a /// `PhoneNumber` struct. Finally, it asserts that the extracted phone /// numbers match the expected output. fn main() -> Result<()> { let phone_text = " +1 505 555 0192 (v) +1 505 555 0112 (c) +1 505 555 0197 (f) (202) 555 0134 Alex 3925550011 1 (800) 555-1010 1.299.555.1020"; // Regular expression to match phone numbers. let re = Regex::new( r#"(?x) (?:\+?1)? # Country Code Optional [\s\.]? (([2-9]\d{2})|\(([2-9]\d{2})\)) # Area Code [\s\.\-]? ([2-9]\d{2}) # Exchange Code [\s\.\-]? (\d{4}) # Subscriber Number"#, )?; // Extract phone numbers from the text. let phone_numbers = re.captures_iter(phone_text).filter_map(|cap| { let groups = (cap.get(2).or(cap.get(3)), cap.get(4), cap.get(5)); match groups { (Some(area), Some(ext), Some(sub)) => Some(PhoneNumber { area: area.as_str(), exchange: ext.as_str(), subscriber: sub.as_str(), }), _ => None, } }); // Assert that the extracted phone numbers match the expected output. 
assert_eq!( phone_numbers.map(|m| m.to_string()).collect::<Vec<_>>(), vec![ "1 (505) 555-0192", "1 (505) 555-0112", "1 (505) 555-0197", "1 (202) 555-0134", "1 (392) 555-0011", "1 (800) 555-1010", "1 (299) 555-1020", ] ); Ok(()) }
Filter a log File by Matching Multiple Regular Expressions
Reads a file named application.log
and only outputs the lines containing “version X.X.X”, some IP address followed by port 443 (e.g. “192.168.0.1:443”), or a specific warning.
A regex::RegexSetBuilder
⮳ composes a regex::RegexSet
⮳. Since backslashes are very common in regular expressions, using raw string literals⮳ makes them more readable.
use std::fs; use std::fs::File; use std::io::BufRead; use std::io::BufReader; use anyhow::Result; use regex::RegexSetBuilder; fn main() -> Result<()> { let log_path = "temp/application.log"; let buffered = BufReader::new(File::open(log_path)?); let set = RegexSetBuilder::new([ r#"version "\d\.\d\.\d""#, r#"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:443"#, r#"warning.*timeout expired"#, ]) .case_insensitive(true) .build()?; buffered .lines() // yield instances of io::Result<String> .map_while(Result::ok) .filter(|line| set.is_match(line.as_str())) .for_each(|x| println!("{}", x)); Ok(()) }
Replace all Occurrences of one text Pattern with Another Pattern
Replaces all occurrences of the standard ISO 8601 YYYY-MM-DD date pattern with the equivalent American English date with slashes. For example 2013-01-15
becomes 01/15/2013
.
The method regex::Regex::replace_all
⮳ replaces all occurrences of the whole regex.
&str
implements the regex::Replacer
⮳ trait which allows variables like $abcde
to refer to corresponding named capture groups (?P<abcde>REGEX)
from the search regex. See the replacement string syntax
⮳ for examples and escaping detail.
use std::borrow::Cow;

use lazy_static::lazy_static;
use regex::Regex;

/// Rewrites every `YYYY-MM-DD` date found in `before` as `MM/DD/YYYY`.
///
/// The result is whatever `Regex::replace_all` returns for the input, which
/// is a `Cow` tied to the lifetime of `before`. The lifetime is written as
/// `'_` so that the borrow is visible in the signature.
fn reformat_dates(before: &str) -> Cow<'_, str> {
    lazy_static! {
        // Compiled once, on first use. The named groups `y`, `m` and `d` are
        // referenced from the replacement string below.
        static ref ISO8601_DATE_REGEX: Regex =
            Regex::new(r"(?P<y>\d{4})-(?P<m>\d{2})-(?P<d>\d{2})").unwrap();
    }
    ISO8601_DATE_REGEX.replace_all(before, "$m/$d/$y")
}

fn main() {
    let before = "2012-03-14, 2013-01-15 and 2014-07-05";
    let after = reformat_dates(before);
    println!("{}", after);
    assert_eq!(after, "03/14/2012, 01/15/2013 and 07/05/2014");
}
Use Regular Expressions with Back-references and Lookarounds
regex
⮳ is the de facto standard regex library. It is very fast, but does not support fancier features such as backtracking, backreferences, and look-arounds. Use fancy-regex
⮳ if you need features that regex
⮳ doesn't support.
use fancy_regex::Regex;

// `fancy_regex` supports features, including backreferences and look-arounds,
// that are not available in the `regex` crate.

/// Reports whether `s` consists of the same word twice, separated by a single
/// space.
fn match_two_identical_words(s: &str) {
    // `\1` is a backreference to whatever the first capture group matched.
    let re = Regex::new(r"^(\w+) (\1)$").expect("Error parsing the regex");

    // Matching itself is fallible in `fancy_regex`, hence the `Result`.
    match re.is_match(s) {
        Err(err) => eprintln!("Regex error: {}", err),
        Ok(false) => println!("No match found."),
        Ok(true) => println!("Match found."),
    }
}

/// Prints the word that follows "dolor" in a sample sentence.
fn capture_groups() {
    let text = "Lorem ipsum dolor sit amet";

    // Look-around reference:
    //   (?=exp)   look-ahead: succeeds if exp matches to the right of the
    //             current position
    //   (?!exp)   negative look-ahead: succeeds if exp doesn't match to the
    //             right
    //   (?<=exp)  look-behind: succeeds if exp matches to the left of the
    //             current position
    //   (?<!exp)  negative look-behind: succeeds if exp doesn't match to the
    //             left
    // The look-behind below anchors the match right after "dolor".
    let pattern = r"(?<=dolor)\s*(\w+)";
    let re = Regex::new(pattern).expect("Error parsing the regex");

    // `captures` yields the capture groups of the first match in `text`.
    match re.captures(text) {
        Err(err) => eprintln!("Regex error: {}", err),
        Ok(None) => println!("No match found."),
        // Group 1 is the word itself.
        Ok(Some(caps)) => match caps.get(1) {
            Some(word) => println!("Found: {}", word.as_str()),
            None => println!("No match found."),
        },
    }
}

/// Splits a sample sentence on runs of spaces and tabs.
fn split_text() {
    let re = Regex::new(r"[ \t]+").unwrap();
    let target = "Lorem ipsum\t dolor sit amet";

    // Each piece is a `Result`; panic on the first failure.
    let fields: Vec<&str> = re.split(target).map(Result::unwrap).collect();
    assert_eq!(fields, vec!["Lorem", "ipsum", "dolor", "sit", "amet"]);
}

fn main() {
    match_two_identical_words("foo foo");
    capture_groups();
    split_text();
}
Longer Regex Example
use std::collections::BTreeMap; use once_cell::sync::Lazy; use regex::Regex; /// Store a regular expression and the names of its capture groups. struct Re(Regex, Vec<&'static str>); /// Regexes take a while to compile. /// It is reasonable to store them in a global static. static GLOBAL_REGEX: Lazy<BTreeMap<&str, Re>> = Lazy::new(|| { println!("Initializing Regexes...\n"); // A sorted map: let mut m = BTreeMap::new(); // A Markdown inline link - see https://spec.commonmark.org/ // (?<name> ) is a named capture group. // \s is a whitespace. \S is a not-whitespace. // [^!] excludes ! before [. m.insert( "[text](http...)", Re( Regex::new(r"[^!]\[(?<text>.*?)\]\s?\(\s*?(?<link>\S*?)\s*?\)") .unwrap(), vec!["text", "link"], ), ); // A Markdown autolink. m.insert( "<http...>", Re(Regex::new(r"<(?<link>http.*?)>").unwrap(), vec!["link"]), ); // A Markdown shortcut link. // [text] not preceded by ! or ], not followed by <optional spaces>[ or ( or // : m.insert( "[text] ...", Re( Regex::new(r"[^!\]]\[(?<text>[^\[\]]+?)\]\s*?[^\[\(:]").unwrap(), vec!["text"], ), ); // A Markdown reference-style link. m.insert( "[text][label]", Re( Regex::new(r"[^!\]]\[(?<text>.*?)\]\s?\[(?<label>.+?)\]").unwrap(), vec!["text", "label"], ), ); // A Markdown reference definition (with optional title): // (?: ) is a non-capturing group. // (?m) flags the multi-line mode. // ^ and $ are the beginning and end of a line, respectively. m.insert( "[label]: url \"title\"", Re(Regex::new(r#"(?m)^\s*?\[(?<label>.*?)\]:\s*?(?<url>\S+)\s*?(?:"(?<title>.*)")?\s*$"#).unwrap(), vec!["label", "url", "title"]) ); m }); /// Extract and print Markdown inline links e.g. [text](http...). fn extract_inline_links(contents: &str) { for (_, [text, link]) in GLOBAL_REGEX["[text](http...)"] .0 // `captures_iter` iterates through `Captures`, which stores the // capture groups for each match. 
.captures_iter(contents) // `extract` returns a tuple where // the first element corresponds to the full substring of the contents // that matched the regex. The second element is an array of // substrings, with each corresponding to the substring that matched // for a particular capture group. .map(|c| c.extract()) { println!("[{text}]({link})\n"); } } /// Locate markup in text. fn search_with_all_regexes(contents: &str) { // Try to match all reggular expressions for (key, re) in GLOBAL_REGEX.iter() { println!("----------------------\nLooking for {}:\n", key); for caps in re.0.captures_iter(contents) { // Print the whole match. print!("{} -> ", &caps[0]); for group in re.1.iter() { print!( "{}={}; ", group, // Retrieve each named capture group in turn... // `extract` can't be used here, since the # of capture // groups varies. caps.name(group).map_or("", |m| m.as_str()) ); } println!("\n"); } } } fn main() { // Example Markdown links to process: let md: &'static str = " <http://url0/> [text1](url1) [text2][lbl2] [lbl2]: url2 \"title2\" [lbl3][] [lbl4]  ![image6][image_lbl6] image_lbl6: image_url6 ![image_lbl7] ![image_lbl8][] "; extract_inline_links(md); search_with_all_regexes(md); }
Related Topics
- Rust Search Engines.
- Search.
- Strings.