Regular Expressions

Verify and Extract Login from an Email Address

regex lazy_static cat-text-processing

Validates that an email address is formatted correctly, and extracts everything before the @ symbol.

use lazy_static::lazy_static;
use regex::Regex;

fn extract_login(input: &str) -> Option<&str> {
    lazy_static! {
        static ref RE: Regex = Regex::new(
            r"(?x)
            ^(?P<login>[^@\s]+)@
            ([[:word:]]+\.)*
            [[:word:]]+$
            "
        )
        .unwrap();
    }
    RE.captures(input)
        .and_then(|cap| cap.name("login").map(|login| login.as_str()))
}

fn main() {
    let login = extract_login(r"I❤email@example.com");
    println!("{:?}", login);
    assert_eq!(login, Some(r"I❤email"));

    let login = extract_login(r"sdf+sdsfsd.as.sdsd@jhkk.d.rl");
    println!("{:?}", login);
    assert_eq!(login, Some(r"sdf+sdsfsd.as.sdsd"));

    assert_eq!(extract_login(r"More@Than@One@at.com"), None);
    assert_eq!(extract_login(r"Not an email@email"), None);
}
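
The lazy_static macro ensures the regex is compiled only once. On Rust 1.80 or later, std::sync::LazyLock from the standard library achieves the same without an external crate; a minimal sketch of the same login extraction, assuming a recent toolchain:

use std::sync::LazyLock;

use regex::Regex;

// Compiled on first access, then reused on every call.
static LOGIN_RE: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"^(?P<login>[^@\s]+)@([[:word:]]+\.)*[[:word:]]+$").unwrap()
});

fn extract_login(input: &str) -> Option<&str> {
    LOGIN_RE
        .captures(input)
        .and_then(|cap| cap.name("login").map(|login| login.as_str()))
}

fn main() {
    assert_eq!(extract_login("I❤email@example.com"), Some("I❤email"));
}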

Extract a list of Unique #hashtags from Text

regex lazy_static cat-text-processing

Extracts and deduplicates a list of hashtags from text.

The hashtag regex given here only catches Latin hashtags that start with a letter; a more Unicode-aware sketch follows the example below. The complete Twitter hashtag regex⮳ is much more complicated.

use std::collections::HashSet;

use lazy_static::lazy_static;
use regex::Regex;

fn extract_hashtags(text: &str) -> HashSet<&str> {
    lazy_static! {
        static ref HASHTAG_REGEX: Regex =
            Regex::new(r"\#[a-zA-Z][0-9a-zA-Z_]*").unwrap();
    }
    HASHTAG_REGEX
        .find_iter(text)
        .map(|mat| mat.as_str())
        .collect()
}

fn main() {
    let tweet = "Hey #world, I just got my new #dog, say hello to Till. #dog #forever #2 #_ ";
    let tags = extract_hashtags(tweet);
    println!("{:?}", tags);
    assert!(
        tags.contains("#dog")
            && tags.contains("#forever")
            && tags.contains("#world")
    );
    assert_eq!(tags.len(), 3);
}
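
The Latin-only limitation noted above can be relaxed with the regex crate's Unicode character classes. A minimal sketch (still nowhere near the full Twitter rules) that accepts hashtags starting with any letter:

use std::collections::HashSet;

use regex::Regex;

fn extract_unicode_hashtags(text: &str) -> HashSet<&str> {
    // \p{L} matches any Unicode letter; \p{N} matches any Unicode digit.
    let re = Regex::new(r"#\p{L}[\p{L}\p{N}_]*").unwrap();
    re.find_iter(text).map(|mat| mat.as_str()).collect()
}

fn main() {
    let tags =
        extract_unicode_hashtags("Hello #café, hello #世界, but not #2 or #_x");
    println!("{:?}", tags);
    assert!(tags.contains("#café") && tags.contains("#世界"));
    assert_eq!(tags.len(), 2);
}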

Extract Phone Numbers from Text

regex cat-text-processing

Processes a string of text using regex::Regex::captures_iter⮳ to capture multiple phone numbers. The example here is for US convention phone numbers.

use std::fmt;

use anyhow::Result;
use regex::Regex;

struct PhoneNumber<'a> {
    area: &'a str,
    exchange: &'a str,
    subscriber: &'a str,
}

impl fmt::Display for PhoneNumber<'_> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "1 ({}) {}-{}", self.area, self.exchange, self.subscriber)
    }
}

fn main() -> Result<()> {
    let phone_text = "
    +1 505 881 9292 (v) +1 505 778 2212 (c) +1 505 881 9297 (f)
    (202) 991 9534
    Alex 5553920011
    1 (800) 233-2010
    1.299.339.1020";

    let re = Regex::new(
        r#"(?x)
          (?:\+?1)?                       # Country Code Optional
          [\s\.]?
          (([2-9]\d{2})|\(([2-9]\d{2})\)) # Area Code
          [\s\.\-]?
          ([2-9]\d{2})                    # Exchange Code
          [\s\.\-]?
          (\d{4})                         # Subscriber Number"#,
    )?;

    let phone_numbers = re.captures_iter(phone_text).filter_map(|cap| {
        let groups = (cap.get(2).or(cap.get(3)), cap.get(4), cap.get(5));
        match groups {
            (Some(area), Some(ext), Some(sub)) => Some(PhoneNumber {
                area: area.as_str(),
                exchange: ext.as_str(),
                subscriber: sub.as_str(),
            }),
            _ => None,
        }
    });

    assert_eq!(
        phone_numbers.map(|m| m.to_string()).collect::<Vec<_>>(),
        vec![
            "1 (505) 881-9292",
            "1 (505) 778-2212",
            "1 (505) 881-9297",
            "1 (202) 991-9534",
            "1 (555) 392-0011",
            "1 (800) 233-2010",
            "1 (299) 339-1020",
        ]
    );

    Ok(())
}

Filter a log File by Matching Multiple Regular Expressions

regex cat-text-processing

Reads the file temp/application.log and outputs only the lines containing “version X.X.X”, an IP address followed by port 443 (e.g. “192.168.0.1:443”), or a specific warning.

A regex::RegexSetBuilder⮳ composes a regex::RegexSet⮳. Since backslashes are very common in regular expressions, using raw string literals⮳ makes them more readable.

use std::fs::File;
use std::io::BufRead;
use std::io::BufReader;

use anyhow::Result;
use regex::RegexSetBuilder;

fn main() -> Result<()> {
    let log_path = "temp/application.log";
    let buffered = BufReader::new(File::open(log_path)?);

    let set = RegexSetBuilder::new([
        r#"version "\d\.\d\.\d""#,
        r#"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:443"#,
        r#"warning.*timeout expired"#,
    ])
    .case_insensitive(true)
    .build()?;

    buffered
        .lines()                // yield instances of io::Result<String>
        .map_while(Result::ok)
        .filter(|line| set.is_match(line.as_str()))
        .for_each(|x| println!("{}", x));

    Ok(())
}
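
A regex::RegexSet can also report which of its patterns matched, via its matches method. A minimal sketch using a hypothetical log line instead of a file:

use regex::RegexSetBuilder;

fn main() -> anyhow::Result<()> {
    let set = RegexSetBuilder::new([
        r#"version "\d\.\d\.\d""#,
        r#"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:443"#,
        r#"warning.*timeout expired"#,
    ])
    .case_insensitive(true)
    .build()?;

    // Hypothetical log line; `matches` reports every pattern that matched,
    // as pattern indices in ascending order.
    let line = "WARNING timeout expired for 192.168.0.1:443";
    let matched: Vec<usize> = set.matches(line).into_iter().collect();
    assert_eq!(matched, vec![1, 2]);

    Ok(())
}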

Replace all Occurrences of one text Pattern with Another Pattern

regex lazy_static cat-text-processing

Replaces all occurrences of the standard ISO 8601 YYYY-MM-DD date pattern with the equivalent slash-separated American English date (MM/DD/YYYY). For example, 2013-01-15 becomes 01/15/2013.

The method regex::Regex::replace_all⮳ replaces all occurrences of the whole regex. &str implements the regex::Replacer⮳ trait which allows variables like $abcde to refer to corresponding named capture groups (?P<abcde>REGEX) from the search regex. See the replacement string syntax⮳ for examples and escaping detail.

use std::borrow::Cow;

use lazy_static::lazy_static;
use regex::Regex;

fn reformat_dates(before: &str) -> Cow<str> {
    lazy_static! {
        static ref ISO8601_DATE_REGEX: Regex =
            Regex::new(r"(?P<y>\d{4})-(?P<m>\d{2})-(?P<d>\d{2})").unwrap();
    }
    ISO8601_DATE_REGEX.replace_all(before, "$m/$d/$y")
}

fn main() {
    let before = "2012-03-14, 2013-01-15 and 2014-07-05";
    let after = reformat_dates(before);
    println!("{}", after);
    assert_eq!(after, "03/14/2012, 01/15/2013 and 07/05/2014");
}
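
Two details of the replacement string syntax mentioned above: $$ produces a literal $, and braces, as in ${name}, delimit the group name when it is immediately followed by other word characters. A minimal sketch:

use regex::Regex;

fn main() {
    let re = Regex::new(r"(?P<amount>\d+)").unwrap();
    // `$$` escapes to a literal dollar sign; `${amount}` spells out the
    // group name explicitly (braces are required when the name is
    // immediately followed by more word characters).
    let after = re.replace_all("price: 100", "$$${amount}.00");
    println!("{}", after);
    assert_eq!(after, "price: $100.00");
}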

Use Regular Expressions with Back-references and Lookarounds

fancy-regex fancy-regex-crates.io fancy-regex-github fancy-regex-lib.rs cat-text-processing

regex is the de facto standard regex library for Rust. It is very fast, but it does not support fancier features such as backreferences and look-arounds, because it guarantees linear-time matching by avoiding backtracking. Use fancy-regex if you need features that regex doesn't support.

use fancy_regex::Regex;

// `fancy_regex` supports features, including backreferences and look-arounds,
// that are not available in the `regex` crate.

fn match_two_identical_words(s: &str) {
    // Check if a text consists of two identical words.
    // Note the backreference \1 to the 1st capture group (...)
    let re = Regex::new(r"^(\w+) (\1)$").expect("Error parsing the regex");
    match re.is_match(s) {
        Ok(true) => println!("Match found."),
        Ok(false) => println!("No match found."),
        Err(err) => eprintln!("Regex error: {}", err),
    }
}

fn capture_groups() {
    let text = "Lorem ipsum dolor sit amet";

    // Capture the word after "dolor" using a look-around. For reference:
    // (?=exp)  : look-ahead, succeeds if exp matches to the right of the
    //            current position.
    // (?!exp)  : negative look-ahead, succeeds if exp doesn't match to the
    //            right.
    // (?<=exp) : look-behind, succeeds if exp matches to the left of the
    //            current position.
    // (?<!exp) : negative look-behind, succeeds if exp doesn't match to the
    //            left.
    let pattern = r"(?<=dolor)\s*(\w+)";

    let re = Regex::new(pattern).expect("Error parsing the regex");
    // Returns the capture groups for the first match in text.
    match re.captures(text) {
        Ok(Some(caps)) => {
            // Get the capture group by its index in the regex.
            if let Some(name) = caps.get(1) {
                println!("Found: {}", name.as_str());
            } else {
                println!("No match found.");
            }
        }
        Ok(None) => println!("No match found."),
        Err(err) => eprintln!("Regex error: {}", err),
    }
}

fn split_text() {
    let re = Regex::new(r"[ \t]+").unwrap();
    let target = "Lorem ipsum\t dolor sit amet";
    let fields: Vec<&str> = re.split(target).map(|x| x.unwrap()).collect();
    assert_eq!(fields, vec!["Lorem", "ipsum", "dolor", "sit", "amet"]);
}

fn main() {
    match_two_identical_words("foo foo");
    capture_groups();
    split_text();
}
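
The look-around forms listed in the comments above are all supported by fancy-regex. A minimal sketch of a negative look-ahead, using made-up input strings:

use fancy_regex::Regex;

fn main() {
    // Match "foo" only when it is not immediately followed by "bar".
    let re = Regex::new(r"foo(?!bar)").expect("Error parsing the regex");
    assert!(re.is_match("foobaz").unwrap());
    assert!(!re.is_match("foobar").unwrap());
}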

Longer Regex Example

regex regex-github cat-text-processing

use std::collections::BTreeMap;

use once_cell::sync::Lazy;
use regex::Regex;

// Regular expression and the names of its capture groups.
struct Re(Regex, Vec<&'static str>);

// Regexes take a while to compile; it is reasonable to store them in
// a global static
static GLOBAL_REGEX: Lazy<BTreeMap<&str, Re>> = Lazy::new(|| {
    println!("Initializing Regexes...\n");
    // A sorted map:
    let mut m = BTreeMap::new();
    // A Markdown inline link:
    // (?<name>  ) is a named capture group.
    // \s matches a whitespace character; \S matches a non-whitespace character.
    m.insert(
        "[text](http...)",
        Re(
            Regex::new(r"[^!]\[(?<text>.*?)\]\s?\(\s*?(?<link>\S*?)\s*?\)")
                .unwrap(),
            vec!["text", "link"],
        ),
    );
    // A Markdown autolink
    m.insert(
        "<http...>",
        Re(Regex::new(r"<(?<link>http.*?)>").unwrap(), vec!["link"]),
    );
    // A Markdown shortcut link: `[text]` not preceded by `!` or `]` and not
    // followed by `[`, `(` (possibly after spaces), or `:`.
    m.insert(
        "[text] ...",
        Re(
            Regex::new(r"[^!\]]\[(?<text>[^\[\]]+?)\]\s*?[^\[\(:]").unwrap(),
            vec!["text"],
        ),
    );
    // A Markdown reference-style link
    m.insert(
        "[text][label]",
        Re(
            Regex::new(r"[^!\]]\[(?<text>.*?)\]\s?\[(?<label>.+?)\]").unwrap(),
            vec!["text", "label"],
        ),
    );
    // A Markdown reference definition (with optional title):
    // (?:  ) is a non-capturing group.
    // (?m) flags multi-line mode. ^ and $ are the beginning and end of a
    // line, respectively.
    m.insert(
        "[label]: url \"title\"",
        Re(Regex::new(r#"(?m)^\s*?\[(?<label>.*?)\]:\s*?(?<url>\S+)\s*?(?:"(?<title>.*)")?\s*$"#).unwrap(),
        vec!["label", "url", "title"])
    );
    m
});

#[allow(dead_code)]
fn extract_inline_links(contents: &str) {
    for (_, [text, link]) in GLOBAL_REGEX["[text](http...)"]
        .0
        // `captures_iter` iterates through `Captures`, which stores the
        // capture groups for each match.
        .captures_iter(contents)
        // `extract` returns a tuple where
        // the first element corresponds to the full substring of the contents
        // that matched the regex. The second element is an array of
        // substrings, with each corresponding to the substring that matched
        // for a particular capture group.
        .map(|c| c.extract())
    {
        println!("[{text}]({link})\n");
    }
}

// Locate markup in text
fn search_with_all_regexes(contents: &str) {
    // Try to match all regular expressions.
    for (key, re) in GLOBAL_REGEX.iter() {
        println!("----------------------\nLooking for {}:\n", key);
        for caps in re.0.captures_iter(contents) {
            // Print the whole match
            print!("{} -> ", &caps[0]);
            for group in re.1.iter() {
                print!(
                    "{}={}; ",
                    group,
                    // Retrieve each named capture group in turn...
                    // `extract` can't be used here, since the # of capture
                    // groups varies.
                    caps.name(group).map_or("", |m| m.as_str())
                );
            }
            println!("\n");
        }
    }
}

// Example Markdown to process
fn get_test_markdown() -> String {
    let md: &'static str = "
<http://url0/>

[text1](url1)

[text2][lbl2]

[lbl2]: url2 \"title2\"

[lbl3][]

[lbl4]

![image5](image_url5)

![image6][image_lbl6]

image_lbl6: image_url6

![image_lbl7]

![image_lbl8][]
";
    md.to_owned()
}

fn main() {
    search_with_all_regexes(get_test_markdown().as_str());
}

Related Topics

  • Rust Search Engines.
  • Search.
  • Strings.