Regular Expressions
Verify and Extract Login from an Email Address
Validates that an email address is formatted correctly, and extracts everything before the @
symbol.
use lazy_static::lazy_static;
use regex::Regex;

/// Returns the login part of `input` (everything before the `@`) when the
/// whole string matches the e-mail pattern below, and `None` otherwise.
///
/// The pattern requires a login made of one or more characters that are
/// neither `@` nor whitespace, then `@`, then one or more dot-separated
/// `[[:word:]]` labels up to the end of the string.
fn extract_login(input: &str) -> Option<&str> {
    lazy_static! {
        // Compiled once, on first use. `(?x)` makes the regex engine ignore
        // the literal whitespace inside the pattern.
        static ref RE: Regex = Regex::new(
            r"(?x) ^(?P<login>[^@\s]+)@ ([[:word:]]+\.)* [[:word:]]+$ "
        )
        .unwrap();
    }

    let captures = RE.captures(input)?;
    let login = captures.name("login")?;
    Some(login.as_str())
}

fn main() {
    // Well-formed addresses: print the extracted login and check it.
    for (address, expected) in [
        (r"I❤email@example.com", r"I❤email"),
        (r"sdf+sdsfsd.as.sdsd@jhkk.d.rl", r"sdf+sdsfsd.as.sdsd"),
    ] {
        let login = extract_login(address);
        println!("{:?}", login);
        assert_eq!(login, Some(expected));
    }

    // Malformed addresses (several `@`, or whitespace in the login) are rejected.
    for address in [r"More@Than@One@at.com", r"Not an email@email"] {
        assert_eq!(extract_login(address), None);
    }
}
Extract a list of Unique #hashtags from a Text
Extracts and deduplicates a list of hashtags from text. The result is collected into a HashSet, so each hashtag appears once and the order is unspecified.
The hashtag regex given here only catches Latin hashtags that start with a letter. The complete Twitter hashtag regex⮳ is much more complicated.
use std::collections::HashSet;

use lazy_static::lazy_static;
use regex::Regex;

/// Collects the distinct hashtags found in `text`.
///
/// A hashtag here is `#` followed by an ASCII letter and then any number of
/// ASCII letters, digits or underscores. The returned slices borrow from
/// `text`; because they are stored in a `HashSet`, duplicates collapse and the
/// iteration order is unspecified.
fn extract_hashtags(text: &str) -> HashSet<&str> {
    lazy_static! {
        // Compiled once, on first use.
        static ref HASHTAG_REGEX: Regex =
            Regex::new(r"\#[a-zA-Z][0-9a-zA-Z_]*").unwrap();
    }

    let mut unique = HashSet::new();
    for hit in HASHTAG_REGEX.find_iter(text) {
        unique.insert(hit.as_str());
    }
    unique
}

fn main() {
    let tweet = "Hey #world, I just got my new #dog, say hello to Till. #dog #forever #2 #_ ";
    let tags = extract_hashtags(tweet);
    println!("{:?}", tags);

    // `#dog` occurs twice but is stored once; `#2` and `#_` do not start with
    // a letter and therefore are not matched.
    for expected in ["#dog", "#forever", "#world"] {
        assert!(tags.contains(expected));
    }
    assert_eq!(tags.len(), 3);
}
Extract Phone Numbers from Text
Processes a string of text using regex::Regex::captures_iter
⮳ to capture multiple phone numbers. The example here is for US convention phone numbers.
use std::fmt;

use anyhow::Result;
use regex::Regex;

/// The three components of a US-convention phone number, each borrowed from
/// the text that was searched.
struct PhoneNumber<'a> {
    area: &'a str,
    exchange: &'a str,
    subscriber: &'a str,
}

impl fmt::Display for PhoneNumber<'_> {
    /// Writes the number as `1 (area) exchange-subscriber`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "1 ({}) {}-{}", self.area, self.exchange, self.subscriber)
    }
}

/// Finds every phone number in a sample text with `Regex::captures_iter` and
/// checks the normalized output.
///
/// Returns an error if the pattern fails to compile.
fn main() -> Result<()> {
    let phone_text = " +1 505 881 9292 (v) +1 505 778 2212 (c) +1 505 881 9297 (f) (202) 991 9534 Alex 5553920011 1 (800) 233-2010 1.299.339.1020";

    // In `(?x)` mode a `#` starts a comment that runs to the end of the line,
    // so every commented token must sit on its own line. If the pattern is
    // written on a single line, everything after the first `#` is ignored.
    let re = Regex::new(
        r#"(?x)
          (?:\+?1)?                       # Country Code Optional
          [\s\.]?
          (([2-9]\d{2})|\(([2-9]\d{2})\)) # Area Code
          [\s\.\-]?
          ([2-9]\d{2})                    # Exchange Code
          [\s\.\-]?
          (\d{4})                         # Subscriber Number"#,
    )?;

    let phone_numbers = re.captures_iter(phone_text).filter_map(|cap| {
        // Group 2 is the bare area code, group 3 the parenthesized form;
        // exactly one of the two alternatives participates in a match.
        let area = cap.get(2).or_else(|| cap.get(3))?;
        let exchange = cap.get(4)?;
        let subscriber = cap.get(5)?;
        Some(PhoneNumber {
            area: area.as_str(),
            exchange: exchange.as_str(),
            subscriber: subscriber.as_str(),
        })
    });

    assert_eq!(
        phone_numbers.map(|m| m.to_string()).collect::<Vec<_>>(),
        vec![
            "1 (505) 881-9292",
            "1 (505) 778-2212",
            "1 (505) 881-9297",
            "1 (202) 991-9534",
            "1 (555) 392-0011",
            "1 (800) 233-2010",
            "1 (299) 339-1020",
        ]
    );

    Ok(())
}
Filter a log File by Matching Multiple Regular Expressions
Reads a file named application.log
and only outputs the lines containing “version X.X.X”, some IP address followed by port 443 (e.g. “192.168.0.1:443”), or a specific warning.
A regex::RegexSetBuilder
⮳ composes a regex::RegexSet
⮳. Since backslashes are very common in regular expressions, using raw string literals⮳ makes them more readable.
use std::fs; use std::fs::File; use std::io::BufRead; use std::io::BufReader; use anyhow::Result; use regex::RegexSetBuilder; fn main() -> Result<()> { let log_path = "temp/application.log"; let buffered = BufReader::new(File::open(log_path)?); let set = RegexSetBuilder::new([ r#"version "\d\.\d\.\d""#, r#"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:443"#, r#"warning.*timeout expired"#, ]) .case_insensitive(true) .build()?; buffered .lines() // yield instances of io::Result<String> .map_while(Result::ok) .filter(|line| set.is_match(line.as_str())) .for_each(|x| println!("{}", x)); Ok(()) }
Replace all Occurrences of one text Pattern with Another Pattern
Replaces all occurrences of the standard ISO 8601 YYYY-MM-DD date pattern with the equivalent American English date with slashes. For example 2013-01-15
becomes 01/15/2013
.
The method regex::Regex::replace_all
⮳ replaces all occurrences of the whole regex.
&str
implements the regex::Replacer
⮳ trait which allows variables like $abcde
to refer to corresponding named capture groups (?P<abcde>REGEX)
from the search regex. See the replacement string syntax
⮳ for examples and escaping detail.
use std::borrow::Cow;

use lazy_static::lazy_static;
use regex::Regex;

/// Rewrites every `YYYY-MM-DD` date found in `before` as `MM/DD/YYYY`.
///
/// The return type is whatever `Regex::replace_all` produces: a `Cow` over
/// the input text.
fn reformat_dates(before: &str) -> Cow<str> {
    // Named groups `y`, `m` and `d` capture the year, month and day digits.
    const ISO8601_DATE: &str = r"(?P<y>\d{4})-(?P<m>\d{2})-(?P<d>\d{2})";
    // `$name` in the replacement refers to the capture group of that name.
    const MONTH_DAY_YEAR: &str = "$m/$d/$y";

    lazy_static! {
        // Compiled once, on first use.
        static ref ISO8601_DATE_REGEX: Regex = Regex::new(ISO8601_DATE).unwrap();
    }

    let reformatted = ISO8601_DATE_REGEX.replace_all(before, MONTH_DAY_YEAR);
    reformatted
}

fn main() {
    let after = reformat_dates("2012-03-14, 2013-01-15 and 2014-07-05");
    println!("{after}");
    assert_eq!(after, "03/14/2012, 01/15/2013 and 07/05/2014");
}
Use Regular Expressions with Back-references and Lookarounds
regex
⮳ is the de facto standard regex library. It is very fast, but does not support fancier features such as backtracking, backreferences, and look-arounds. Use fancy-regex
⮳ if you need features that regex
⮳ doesn't support.
use fancy_regex::Regex;

// `fancy_regex` supports features, including backreferences and lookarounds,
// that are not available in the `regex` crate.

/// Reports on stdout whether `s` consists of the same word twice, separated
/// by a single space. A matching error is reported on stderr instead.
fn match_two_identical_words(s: &str) {
    // `\1` is a backreference to the first capture group `(\w+)`.
    let re = Regex::new(r"^(\w+) (\1)$").expect("Error parsing the regex");

    match re.is_match(s) {
        Err(err) => eprintln!("Regex error: {}", err),
        Ok(found) => {
            let verdict = if found { "Match found." } else { "No match found." };
            println!("{}", verdict);
        }
    }
}

/// Prints the word that follows "dolor" in a fixed sample sentence.
fn capture_groups() {
    let text = "Lorem ipsum dolor sit amet";

    // Look-around reference:
    //   (?=exp)   look-ahead: succeeds if exp matches to the right of the
    //             current position
    //   (?!exp)   negative look-ahead: succeeds if exp doesn't match to the
    //             right
    //   (?<=exp)  look-behind: succeeds if exp matches to the left of the
    //             current position
    //   (?<!exp)  negative look-behind: succeeds if exp doesn't match to the
    //             left
    let pattern = r"(?<=dolor)\s*(\w+)";
    let re = Regex::new(pattern).expect("Error parsing the regex");

    // `captures` returns the capture groups of the first match in `text`.
    match re.captures(text) {
        Err(err) => eprintln!("Regex error: {}", err),
        // Group 1 is looked up by its index in the pattern. Both "no match"
        // and "group 1 absent" print the same message.
        Ok(found) => match found.as_ref().and_then(|caps| caps.get(1)) {
            Some(word) => println!("Found: {}", word.as_str()),
            None => println!("No match found."),
        },
    }
}

/// Splits a sample sentence on runs of spaces and tabs and checks the pieces.
fn split_text() {
    let target = "Lorem ipsum\t dolor sit amet";
    let separator = Regex::new(r"[ \t]+").unwrap();

    // Each item produced by `split` is a `Result`; a matching error panics here.
    let fields: Vec<&str> = separator.split(target).map(Result::unwrap).collect();
    assert_eq!(fields, ["Lorem", "ipsum", "dolor", "sit", "amet"]);
}

fn main() {
    match_two_identical_words("foo foo");
    capture_groups();
    split_text();
}
Longer Regex Example
use std::collections::BTreeMap; use once_cell::sync::Lazy; use regex::Regex; // Regular expression and the names of its capture groups. struct Re(Regex, Vec<&'static str>); // Regexes take a while to compile; it is reasonable to store them in // a global static static GLOBAL_REGEX: Lazy<BTreeMap<&str, Re>> = Lazy::new(|| { println!("Initializing Regexes...\n"); // A sorted map: let mut m = BTreeMap::new(); // A Markdown inline link: // (?<name> ) is a named capture group. // \s is a whitespace. \S is a not-whitespace. m.insert( "[text](http...)", Re( Regex::new(r"[^!]\[(?<text>.*?)\]\s?\(\s*?(?<link>\S*?)\s*?\)") .unwrap(), vec!["text", "link"], ), ); // A Markdown autolink m.insert( "<http...>", Re(Regex::new(r"<(?<link>http.*?)>").unwrap(), vec!["link"]), ); // A Markdown shortcut link // or <spaces>( or : m.insert( "[text] ...", Re( Regex::new(r"[^!\]]\[(?<text>[^\[\]]+?)\]\s*?[^\[\(:]").unwrap(), vec!["text"], ), ); // A Markdown reference-style link m.insert( "[text][label]", Re( Regex::new(r"[^!\]]\[(?<text>.*?)\]\s?\[(?<label>.+?)\]").unwrap(), vec!["text", "label"], ), ); // A Markdown reference definition (with optional title): // (?: ) is a non-capturing group. // (?m) flags multi-line mode. ^ and $ are the beginning and end of a // line, respectively. m.insert( "[label]: url \"title\"", Re(Regex::new(r#"(?m)^\s*?\[(?<label>.*?)\]:\s*?(?<url>\S+)\s*?(?:"(?<title>.*)")?\s*$"#).unwrap(), vec!["label", "url", "title"]) ); m }); #[allow(dead_code)] fn extract_inline_links(contents: &str) { for (_, [text, link]) in GLOBAL_REGEX["[text](http...)"] .0 // `captures_iter` iterates through `Captures`, which stores the // capture groups for each match. .captures_iter(contents) // `extract` returns a tuple where // the first element corresponds to the full substring of the contents // that matched the regex. The second element is an array of // substrings, with each corresponding to the substring that matched // for a particular capture group. 
.map(|c| c.extract()) { println!("[{text}]({link})\n"); } } // Locate markup in text fn search_with_all_regexes(contents: &str) { // Try to match all reggular expressions for (key, re) in GLOBAL_REGEX.iter() { println!("----------------------\nLooking for {}:\n", key); for caps in re.0.captures_iter(contents) { // Print the whole match print!("{} -> ", &caps[0]); for group in re.1.iter() { print!( "{}={}; ", group, // Retrieve each named capture group in turn... // `extract` can't be used here, since the # of capture // groups varies. caps.name(group).map_or("", |m| m.as_str()) ); } println!("\n"); } } } // Example Markdown to process fn get_test_markdown() -> String { let md: &'static str = " <http://url0/> [text1](url1) [text2][lbl2] [lbl2]: url2 \"title2\" [lbl3][] [lbl4]  ![image6][image_lbl6] image_lbl6: image_url6 ![image_lbl7] ![image_lbl8][] "; md.to_owned() } fn main() { search_with_all_regexes(get_test_markdown().as_str()); }
Related Topics
- Rust Search Engines.
- Search.
- Strings.