Regular Expressions

Verify and Extract Login from an Email Address

regex lazy_static cat-text-processing

Validates that an email address is formatted correctly, and extracts everything before the @ symbol.

use lazy_static::lazy_static;
use regex::Regex;

fn extract_login(input: &str) -> Option<&str> {
    lazy_static! {
        static ref RE: Regex = Regex::new(
            r"(?x)
            ^(?P<login>[^@\s]+)@
            ([[:word:]]+\.)*
            [[:word:]]+$
            "
        )
        .unwrap();
    }
    RE.captures(input)
        .and_then(|cap| cap.name("login").map(|login| login.as_str()))
}

fn main() {
    let login = extract_login(r"I❤email@example.com");
    println!("{:?}", login);
    assert_eq!(login, Some(r"I❤email"));

    let login = extract_login(r"sdf+sdsfsd.as.sdsd@jhkk.d.rl");
    println!("{:?}", login);
    assert_eq!(login, Some(r"sdf+sdsfsd.as.sdsd"));

    assert_eq!(extract_login(r"More@Than@One@at.com"), None);
    assert_eq!(extract_login(r"Not an email@email"), None);
}
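
The lazy_static macro ensures the regex is compiled only once. On Rust 1.80 or later, std::sync::LazyLock from the standard library achieves the same without an external crate; a minimal sketch of the same login extraction, assuming a recent toolchain:

use std::sync::LazyLock;

use regex::Regex;

// Compiled on first access, then reused on every call.
static LOGIN_RE: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"^(?P<login>[^@\s]+)@([[:word:]]+\.)*[[:word:]]+$").unwrap()
});

fn extract_login(input: &str) -> Option<&str> {
    LOGIN_RE
        .captures(input)
        .and_then(|cap| cap.name("login").map(|login| login.as_str()))
}

fn main() {
    assert_eq!(extract_login("I❤email@example.com"), Some("I❤email"));
}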

Extract a list of Unique #hashtags from Text

regex lazy_static cat-text-processing

Extracts and deduplicates a list of hashtags from text.

The hashtag regex given here only catches Latin hashtags that start with a letter; a more Unicode-aware sketch follows the example below. The complete Twitter hashtag regex⮳ is much more complicated.

use std::collections::HashSet;

use lazy_static::lazy_static;
use regex::Regex;

fn extract_hashtags(text: &str) -> HashSet<&str> {
    lazy_static! {
        static ref HASHTAG_REGEX: Regex =
            Regex::new(r"\#[a-zA-Z][0-9a-zA-Z_]*").unwrap();
    }
    HASHTAG_REGEX
        .find_iter(text)
        .map(|mat| mat.as_str())
        .collect()
}

fn main() {
    let tweet = "Hey #world, I just got my new #dog, say hello to Till. #dog #forever #2 #_ ";
    let tags = extract_hashtags(tweet);
    println!("{:?}", tags);
    assert!(
        tags.contains("#dog")
            && tags.contains("#forever")
            && tags.contains("#world")
    );
    assert_eq!(tags.len(), 3);
}
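
The Latin-only limitation noted above can be relaxed with the regex crate's Unicode character classes. A minimal sketch (still nowhere near the full Twitter rules) that accepts hashtags starting with any letter:

use std::collections::HashSet;

use regex::Regex;

fn extract_unicode_hashtags(text: &str) -> HashSet<&str> {
    // \p{L} matches any Unicode letter; \p{N} matches any Unicode digit.
    let re = Regex::new(r"#\p{L}[\p{L}\p{N}_]*").unwrap();
    re.find_iter(text).map(|mat| mat.as_str()).collect()
}

fn main() {
    let tags =
        extract_unicode_hashtags("Hello #café, hello #世界, but not #2 or #_x");
    println!("{:?}", tags);
    assert!(tags.contains("#café") && tags.contains("#世界"));
    assert_eq!(tags.len(), 2);
}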

Extract Phone Numbers from Text

regex cat-text-processing

Processes a string of text using regex::Regex::captures_iter⮳ to capture multiple phone numbers. The example here is for US convention phone numbers.

use std::fmt;

use anyhow::Result;
use regex::Regex;

struct PhoneNumber<'a> {
    area: &'a str,
    exchange: &'a str,
    subscriber: &'a str,
}

impl fmt::Display for PhoneNumber<'_> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "1 ({}) {}-{}", self.area, self.exchange, self.subscriber)
    }
}

fn main() -> Result<()> {
    let phone_text = "
    +1 505 881 9292 (v) +1 505 778 2212 (c) +1 505 881 9297 (f)
    (202) 991 9534
    Alex 5553920011
    1 (800) 233-2010
    1.299.339.1020";

    let re = Regex::new(
        r#"(?x)
          (?:\+?1)?                       # Country Code Optional
          [\s\.]?
          (([2-9]\d{2})|\(([2-9]\d{2})\)) # Area Code
          [\s\.\-]?
          ([2-9]\d{2})                    # Exchange Code
          [\s\.\-]?
          (\d{4})                         # Subscriber Number"#,
    )?;

    let phone_numbers = re.captures_iter(phone_text).filter_map(|cap| {
        let groups = (cap.get(2).or(cap.get(3)), cap.get(4), cap.get(5));
        match groups {
            (Some(area), Some(ext), Some(sub)) => Some(PhoneNumber {
                area: area.as_str(),
                exchange: ext.as_str(),
                subscriber: sub.as_str(),
            }),
            _ => None,
        }
    });

    assert_eq!(
        phone_numbers.map(|m| m.to_string()).collect::<Vec<_>>(),
        vec![
            "1 (505) 881-9292",
            "1 (505) 778-2212",
            "1 (505) 881-9297",
            "1 (202) 991-9534",
            "1 (555) 392-0011",
            "1 (800) 233-2010",
            "1 (299) 339-1020",
        ]
    );

    Ok(())
}

Filter a log File by Matching Multiple Regular Expressions

regex cat-text-processing

Reads the file temp/application.log and outputs only the lines containing “version X.X.X”, an IP address followed by port 443 (e.g. “192.168.0.1:443”), or a specific warning.

A regex::RegexSetBuilder⮳ composes a regex::RegexSet⮳. Since backslashes are very common in regular expressions, using raw string literals⮳ makes them more readable.

use std::fs::File;
use std::io::BufRead;
use std::io::BufReader;

use anyhow::Result;
use regex::RegexSetBuilder;

fn main() -> Result<()> {
    let log_path = "temp/application.log";
    let buffered = BufReader::new(File::open(log_path)?);

    let set = RegexSetBuilder::new([
        r#"version "\d\.\d\.\d""#,
        r#"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:443"#,
        r#"warning.*timeout expired"#,
    ])
    .case_insensitive(true)
    .build()?;

    buffered
        .lines()                // yield instances of io::Result<String>
        .map_while(Result::ok)
        .filter(|line| set.is_match(line.as_str()))
        .for_each(|x| println!("{}", x));

    Ok(())
}
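
A regex::RegexSet can also report which of its patterns matched, via its matches method. A minimal sketch using a hypothetical log line instead of a file:

use regex::RegexSetBuilder;

fn main() -> anyhow::Result<()> {
    let set = RegexSetBuilder::new([
        r#"version "\d\.\d\.\d""#,
        r#"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:443"#,
        r#"warning.*timeout expired"#,
    ])
    .case_insensitive(true)
    .build()?;

    // Hypothetical log line; `matches` reports every pattern that matched,
    // as pattern indices in ascending order.
    let line = "WARNING timeout expired for 192.168.0.1:443";
    let matched: Vec<usize> = set.matches(line).into_iter().collect();
    assert_eq!(matched, vec![1, 2]);

    Ok(())
}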

Replace all Occurrences of one text Pattern with Another Pattern

regex lazy_static cat-text-processing

Replaces all occurrences of the standard ISO 8601 YYYY-MM-DD date pattern with the equivalent slash-separated American English date (MM/DD/YYYY). For example, 2013-01-15 becomes 01/15/2013.

The method regex::Regex::replace_all⮳ replaces all occurrences of the whole regex. &str implements the regex::Replacer⮳ trait which allows variables like $abcde to refer to corresponding named capture groups (?P<abcde>REGEX) from the search regex. See the replacement string syntax⮳ for examples and escaping detail.

use std::borrow::Cow;

use lazy_static::lazy_static;
use regex::Regex;

fn reformat_dates(before: &str) -> Cow<str> {
    lazy_static! {
        static ref ISO8601_DATE_REGEX: Regex =
            Regex::new(r"(?P<y>\d{4})-(?P<m>\d{2})-(?P<d>\d{2})").unwrap();
    }
    ISO8601_DATE_REGEX.replace_all(before, "$m/$d/$y")
}

fn main() {
    let before = "2012-03-14, 2013-01-15 and 2014-07-05";
    let after = reformat_dates(before);
    println!("{}", after);
    assert_eq!(after, "03/14/2012, 01/15/2013 and 07/05/2014");
}
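
Two details of the replacement string syntax mentioned above: $$ produces a literal $, and braces, as in ${name}, delimit the group name when it is immediately followed by other word characters. A minimal sketch:

use regex::Regex;

fn main() {
    let re = Regex::new(r"(?P<amount>\d+)").unwrap();
    // `$$` escapes to a literal dollar sign; `${amount}` spells out the
    // group name explicitly (braces are required when the name is
    // immediately followed by more word characters).
    let after = re.replace_all("price: 100", "$$${amount}.00");
    println!("{}", after);
    assert_eq!(after, "price: $100.00");
}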

Use Regular Expressions with Back-references and Lookarounds

fancy-regex fancy-regex-crates.io fancy-regex-github fancy-regex-lib.rs cat-text-processing

regex is the de facto standard regex library for Rust. It is very fast, but it does not support fancier features such as backreferences and look-arounds, because it guarantees linear-time matching by avoiding backtracking. Use fancy-regex if you need features that regex doesn't support.

use fancy_regex::Regex;

// `fancy_regex` supports features, including backreferences and look-arounds,
// that are not available in the `regex` crate.

fn match_two_identical_words(s: &str) {
    // Check if a text consists of two identical words.
    // Note the backreference \1 to the 1st capture group (...)
    let re = Regex::new(r"^(\w+) (\1)$").expect("Error parsing the regex");
    match re.is_match(s) {
        Ok(true) => println!("Match found."),
        Ok(false) => println!("No match found."),
        Err(err) => eprintln!("Regex error: {}", err),
    }
}

fn capture_groups() {
    let text = "Lorem ipsum dolor sit amet";

    // Capture the word after "dolor" using a look-around. For reference:
    // (?=exp)  : look-ahead, succeeds if exp matches to the right of the
    //            current position.
    // (?!exp)  : negative look-ahead, succeeds if exp doesn't match to the
    //            right.
    // (?<=exp) : look-behind, succeeds if exp matches to the left of the
    //            current position.
    // (?<!exp) : negative look-behind, succeeds if exp doesn't match to the
    //            left.
    let pattern = r"(?<=dolor)\s*(\w+)";

    let re = Regex::new(pattern).expect("Error parsing the regex");
    // Returns the capture groups for the first match in text.
    match re.captures(text) {
        Ok(Some(caps)) => {
            // Get the capture group by its index in the regex.
            if let Some(name) = caps.get(1) {
                println!("Found: {}", name.as_str());
            } else {
                println!("No match found.");
            }
        }
        Ok(None) => println!("No match found."),
        Err(err) => eprintln!("Regex error: {}", err),
    }
}

fn split_text() {
    let re = Regex::new(r"[ \t]+").unwrap();
    let target = "Lorem ipsum\t dolor sit amet";
    let fields: Vec<&str> = re.split(target).map(|x| x.unwrap()).collect();
    assert_eq!(fields, vec!["Lorem", "ipsum", "dolor", "sit", "amet"]);
}

fn main() {
    match_two_identical_words("foo foo");
    capture_groups();
    split_text();
}
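
The look-around forms listed in the comments above are all supported by fancy-regex. A minimal sketch of a negative look-ahead, using made-up input strings:

use fancy_regex::Regex;

fn main() {
    // Match "foo" only when it is not immediately followed by "bar".
    let re = Regex::new(r"foo(?!bar)").expect("Error parsing the regex");
    assert!(re.is_match("foobaz").unwrap());
    assert!(!re.is_match("foobar").unwrap());
}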

Longer Regex Example

regex regex-github cat-text-processing

use std::collections::BTreeMap;

use once_cell::sync::Lazy;
use regex::Regex;

// Regular expression and the names of its capture groups.
struct Re(Regex, Vec<&'static str>);

// Regexes take a while to compile; it is reasonable to store them in
// a global static
static GLOBAL_REGEX: Lazy<BTreeMap<&str, Re>> = Lazy::new(|| {
    println!("Initializing Regexes...\n");
    // A sorted map:
    let mut m = BTreeMap::new();
    // A Markdown inline link:
    // (?<name>  ) is a named capture group.
    // \s matches a whitespace character; \S matches a non-whitespace character.
    m.insert(
        "[text](http...)",
        Re(
            Regex::new(r"[^!]\[(?<text>.*?)\]\s?\(\s*?(?<link>\S*?)\s*?\)")
                .unwrap(),
            vec!["text", "link"],
        ),
    );
    // A Markdown autolink
    m.insert(
        "<http...>",
        Re(Regex::new(r"<(?<link>http.*?)>").unwrap(), vec!["link"]),
    );
    // A Markdown shortcut link: `[text]` not preceded by `!` or `]` and not
    // followed by `[`, `(` (possibly after spaces), or `:`.
    m.insert(
        "[text] ...",
        Re(
            Regex::new(r"[^!\]]\[(?<text>[^\[\]]+?)\]\s*?[^\[\(:]").unwrap(),
            vec!["text"],
        ),
    );
    // A Markdown reference-style link
    m.insert(
        "[text][label]",
        Re(
            Regex::new(r"[^!\]]\[(?<text>.*?)\]\s?\[(?<label>.+?)\]").unwrap(),
            vec!["text", "label"],
        ),
    );
    // A Markdown reference definition (with optional title):
    // (?:  ) is a non-capturing group.
    // (?m) flags multi-line mode. ^ and $ are the beginning and end of a
    // line, respectively.
    m.insert(
        "[label]: url \"title\"",
        Re(Regex::new(r#"(?m)^\s*?\[(?<label>.*?)\]:\s*?(?<url>\S+)\s*?(?:"(?<title>.*)")?\s*$"#).unwrap(),
        vec!["label", "url", "title"])
    );
    m
});

#[allow(dead_code)]
fn extract_inline_links(contents: &str) {
    for (_, [text, link]) in GLOBAL_REGEX["[text](http...)"]
        .0
        // `captures_iter` iterates through `Captures`, which stores the
        // capture groups for each match.
        .captures_iter(contents)
        // `extract` returns a tuple where
        // the first element corresponds to the full substring of the contents
        // that matched the regex. The second element is an array of
        // substrings, with each corresponding to the substring that matched
        // for a particular capture group.
        .map(|c| c.extract())
    {
        println!("[{text}]({link})\n");
    }
}

// Locate markup in text
fn search_with_all_regexes(contents: &str) {
    // Try to match all regular expressions.
    for (key, re) in GLOBAL_REGEX.iter() {
        println!("----------------------\nLooking for {}:\n", key);
        for caps in re.0.captures_iter(contents) {
            // Print the whole match
            print!("{} -> ", &caps[0]);
            for group in re.1.iter() {
                print!(
                    "{}={}; ",
                    group,
                    // Retrieve each named capture group in turn...
                    // `extract` can't be used here, since the # of capture
                    // groups varies.
                    caps.name(group).map_or("", |m| m.as_str())
                );
            }
            println!("\n");
        }
    }
}

// Example Markdown to process
fn get_test_markdown() -> String {
    let md: &'static str = "
<http://url0/>

[text1](url1)

[text2][lbl2]

[lbl2]: url2 \"title2\"

[lbl3][]

[lbl4]

![image5](image_url5)

![image6][image_lbl6]

image_lbl6: image_url6

![image_lbl7]

![image_lbl8][]
";
    md.to_owned()
}

fn main() {
    search_with_all_regexes(get_test_markdown().as_str());
}

Related Topics

  • Rust Search Engines.
  • Search.
  • Strings.