Regular Expressions
| Use regular expressions with backreferences and lookarounds | | |
Verify and extract login from an email address
Validates that an email address is formatted correctly, and extracts everything before the @
symbol.
use lazy_static::lazy_static;
use regex::Regex;

/// Returns the login part of `input` (everything before the `@`) when the
/// whole string matches the email pattern below, and `None` otherwise.
fn extract_login(input: &str) -> Option<&str> {
    lazy_static! {
        // Compiled on first use and shared by every later call.
        static ref RE: Regex = Regex::new(
            r"(?x) ^(?P<login>[^@\s]+)@ ([[:word:]]+\.)* [[:word:]]+$ "
        )
        .unwrap();
    }

    let captures = RE.captures(input)?;
    let login = captures.name("login")?;
    Some(login.as_str())
}

fn main() {
    // Addresses that must be accepted, paired with the login they contain.
    let accepted = [
        (r"I❤email@example.com", r"I❤email"),
        (r"sdf+sdsfsd.as.sdsd@jhkk.d.rl", r"sdf+sdsfsd.as.sdsd"),
    ];
    for (address, expected) in accepted {
        let login = extract_login(address);
        println!("{:?}", login);
        assert_eq!(login, Some(expected));
    }

    // Inputs that must be rejected: several `@` signs, or whitespace in the login.
    for address in [r"More@Than@One@at.com", r"Not an email@email"] {
        assert_eq!(extract_login(address), None);
    }
}
Extract a list of unique #hashtags from a text
Extracts a deduplicated (unordered) set of hashtags from text.
The hashtag regex given here only catches Latin hashtags that start with a letter. The complete Twitter hashtag regex⮳ is much more complicated.
use std::collections::HashSet;

use lazy_static::lazy_static;
use regex::Regex;

/// Collects every distinct hashtag found in `text`.
///
/// A hashtag here is `#` followed by an ASCII letter and then any number of
/// ASCII letters, digits or underscores. The returned slices borrow from `text`.
fn extract_hashtags(text: &str) -> HashSet<&str> {
    lazy_static! {
        // Compiled on first use and shared by every later call.
        static ref HASHTAG_REGEX: Regex =
            Regex::new(r"\#[a-zA-Z][0-9a-zA-Z_]*").unwrap();
    }

    let mut found = HashSet::new();
    for hit in HASHTAG_REGEX.find_iter(text) {
        // Inserting into a set silently drops repeated tags.
        found.insert(hit.as_str());
    }
    found
}

fn main() {
    let tweet = "Hey #world, I just got my new #dog, say hello to Till. #dog #forever #2 #_ ";
    let tags = extract_hashtags(tweet);
    println!("{:?}", tags);

    let expected = ["#dog", "#forever", "#world"];
    assert!(expected.iter().all(|tag| tags.contains(tag)));
    // `#2` and `#_` do not start with a letter, and `#dog` is counted once.
    assert_eq!(tags.len(), 3);
}
Extract phone numbers from text
Processes a string of text using regex::Regex::captures_iter
⮳ to capture multiple phone numbers. The example here is for US convention phone numbers.
use std::fmt;

use anyhow::Result;
use regex::Regex;

/// A US-convention phone number split into its three parts.
///
/// Each field borrows from the text the number was found in.
struct PhoneNumber<'a> {
    area: &'a str,
    exchange: &'a str,
    subscriber: &'a str,
}

impl fmt::Display for PhoneNumber<'_> {
    /// Formats the number as `1 (AAA) EEE-SSSS`.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "1 ({}) {}-{}", self.area, self.exchange, self.subscriber)
    }
}

/// Extracts every phone number from a sample text and checks the formatted results.
///
/// Returns an error if the regular expression fails to compile.
fn main() -> Result<()> {
    let phone_text = " +1 505 881 9292 (v) +1 505 778 2212 (c) +1 505 881 9297 (f) (202) 991 9534 Alex 5553920011 1 (800) 233-2010 1.299.339.1020";

    // In verbose mode (`(?x)`) a `#` starts a comment that runs to the end of
    // the line, so every commented component must sit on its own line.
    // Otherwise the first comment swallows the rest of the pattern.
    let re = Regex::new(
        r#"(?x)
          (?:\+?1)?                       # Country Code Optional
          [\s\.]?
          (([2-9]\d{2})|\(([2-9]\d{2})\)) # Area Code
          [\s\.\-]?
          ([2-9]\d{2})                    # Exchange Code
          [\s\.\-]?
          (\d{4})                         # Subscriber Number"#,
    )?;

    let phone_numbers = re.captures_iter(phone_text).filter_map(|cap| {
        // Group 2 is the bare area code, group 3 the parenthesised form;
        // exactly one of them participates in any given match.
        let area = cap.get(2).or_else(|| cap.get(3))?;
        let exchange = cap.get(4)?;
        let subscriber = cap.get(5)?;
        Some(PhoneNumber {
            area: area.as_str(),
            exchange: exchange.as_str(),
            subscriber: subscriber.as_str(),
        })
    });

    assert_eq!(
        phone_numbers.map(|m| m.to_string()).collect::<Vec<_>>(),
        vec![
            "1 (505) 881-9292",
            "1 (505) 778-2212",
            "1 (505) 881-9297",
            "1 (202) 991-9534",
            "1 (555) 392-0011",
            "1 (800) 233-2010",
            "1 (299) 339-1020",
        ]
    );

    Ok(())
}
Filter a log file by matching multiple regular expressions
Reads a file named application.log
and only outputs the lines containing “version X.X.X”, some IP address followed by port 443 (e.g. “192.168.0.1:443”), or a specific warning.
A regex::RegexSetBuilder
⮳ composes a regex::RegexSet
⮳. Since backslashes are very common in regular expressions, using raw string literals⮳ makes them more readable.
use std::fs; use std::fs::File; use std::io::BufRead; use std::io::BufReader; use anyhow::Result; use regex::RegexSetBuilder; fn main() -> Result<()> { let log_path = "temp/application.log"; let buffered = BufReader::new(File::open(log_path)?); let set = RegexSetBuilder::new([ r#"version "\d\.\d\.\d""#, r#"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:443"#, r#"warning.*timeout expired"#, ]) .case_insensitive(true) .build()?; buffered .lines() // yield instances of io::Result<String> .map_while(Result::ok) .filter(|line| set.is_match(line.as_str())) .for_each(|x| println!("{}", x)); Ok(()) }
Replace all occurrences of one text pattern with another pattern
Replaces all occurrences of the standard ISO 8601 YYYY-MM-DD date pattern with the equivalent American English date with slashes. For example 2013-01-15
becomes 01/15/2013
.
The method regex::Regex::replace_all
⮳ replaces all occurrences of the whole regex.
&str
implements the regex::Replacer
⮳ trait which allows variables like $abcde
to refer to corresponding named capture groups (?P<abcde>REGEX)
from the search regex. See the replacement string syntax
⮳ for examples and escaping detail.
use std::borrow::Cow;

use lazy_static::lazy_static;
use regex::Regex;

/// Rewrites every `YYYY-MM-DD` date in `before` as `MM/DD/YYYY`.
///
/// The result is whatever `Regex::replace_all` returns for the input.
fn reformat_dates(before: &str) -> Cow<str> {
    lazy_static! {
        // Compiled on first use and shared by every later call.
        static ref ISO8601_DATE_REGEX: Regex =
            Regex::new(r"(?P<y>\d{4})-(?P<m>\d{2})-(?P<d>\d{2})").unwrap();
    }

    // `$m`, `$d` and `$y` refer to the named capture groups of the pattern.
    let american_format = "$m/$d/$y";
    ISO8601_DATE_REGEX.replace_all(before, american_format)
}

fn main() {
    let iso_dates = "2012-03-14, 2013-01-15 and 2014-07-05";
    let us_dates = reformat_dates(iso_dates);
    println!("{}", us_dates);
    assert_eq!(us_dates, "03/14/2012, 01/15/2013 and 07/05/2014");
}
Use regular expressions with backreferences and lookarounds
regex
is the de facto standard regex library. It is very fast, but does not support fancier features such as backtracking, backreferences, and look-arounds. Use fancy-regex
if you need features that regex
doesn't support.
/// Placeholder entry point: the example for this section has not been written
/// yet, so running it panics via `todo!`.
fn main() {
    todo!()
}