Longer Regex Example
[regex.incl: fix (P2)](https://github.com/john-cd/rust_howto/issues/487)
| Use regular expressions with backreferences and lookarounds | | |
use std::collections::BTreeMap; use once_cell::sync::Lazy; use regex::Regex; // Regular expression and the names of its capture groups. struct Re(Regex, Vec<&'static str>); // Regexes take a while to compile; it is reasonable to store them in // a global static static GLOBAL_REGEX: Lazy<BTreeMap<&str, Re>> = Lazy::new(|| { println!("Initializing Regexes...\n"); // A sorted map: let mut m = BTreeMap::new(); // A Markdown inline link: // (?<name> ) is a named capture group. // \s is a whitespace. \S is a not-whitespace. m.insert( "[text](http...)", Re( Regex::new(r"[^!]\[(?<text>.*?)\]\s?\(\s*?(?<link>\S*?)\s*?\)") .unwrap(), vec!["text", "link"], ), ); // A Markdown autolink m.insert( "<http...>", Re(Regex::new(r"<(?<link>http.*?)>").unwrap(), vec!["link"]), ); // A Markdown shortcut link // or <spaces>( or : m.insert( "[text] ...", Re( Regex::new(r"[^!\]]\[(?<text>[^\[\]]+?)\]\s*?[^\[\(:]").unwrap(), vec!["text"], ), ); // A Markdown reference-style link m.insert( "[text][label]", Re( Regex::new(r"[^!\]]\[(?<text>.*?)\]\s?\[(?<label>.+?)\]").unwrap(), vec!["text", "label"], ), ); // A Markdown reference definition (with optional title): // (?: ) is a non-capturing group. // (?m) flags multi-line mode. ^ and $ are the beginning and end of a // line, respectively. m.insert( "[label]: url \"title\"", Re(Regex::new(r#"(?m)^\s*?\[(?<label>.*?)\]:\s*?(?<url>\S+)\s*?(?:"(?<title>.*)")?\s*$"#).unwrap(), vec!["label", "url", "title"]) ); m }); #[allow(dead_code)] fn extract_inline_links(contents: &str) { for (_, [text, link]) in GLOBAL_REGEX["[text](http...)"] .0 // `captures_iter` iterates through `Captures`, which stores the // capture groups for each match. .captures_iter(contents) // `extract` returns a tuple where // the first element corresponds to the full substring of the contents // that matched the regex. The second element is an array of // substrings, with each corresponding to the substring that matched // for a particular capture group. .map(|c| c.extract()) { println!("[{text}]({link})\n"); } } // Locate markup in text fn search_with_all_regexes(contents: &str) { // Try to match all reggular expressions for (key, re) in GLOBAL_REGEX.iter() { println!("----------------------\nLooking for {}:\n", key); for caps in re.0.captures_iter(contents) { // Print the whole match print!("{} -> ", &caps[0]); for group in re.1.iter() { print!( "{}={}; ", group, // Retrieve each named capture group in turn... // `extract` can't be used here, since the # of capture // groups varies. caps.name(group).map_or("", |m| m.as_str()) ); } println!("\n"); } } } // Example Markdown to process fn get_test_markdown() -> String { let md: &'static str = " <http://url0/> [text1](url1) [text2][lbl2] [lbl2]: url2 \"title2\" [lbl3][] [lbl4] ![image5](image_url5) ![image6][image_lbl6] image_lbl6: image_url6 ![image_lbl7] ![image_lbl8][] "; md.to_owned() } fn main() { search_with_all_regexes(get_test_markdown().as_str()); }
[regex2: better title (P1)](https://github.com/john-cd/rust_howto/issues/489)