From 8a5a08e04d16678d9f843069a5fd28197a663619 Mon Sep 17 00:00:00 2001 From: Gareth Smith Date: Thu, 1 Aug 2013 23:48:22 +0100 Subject: [PATCH 1/7] Replace os::glob with extra::glob, which is written in rust, fixing issue #6100. --- src/libextra/extra.rs | 1 + src/libextra/glob.rs | 348 ++++++++++++++++++++++++++++++++++++++++++ src/libstd/os.rs | 82 ---------- 3 files changed, 349 insertions(+), 82 deletions(-) create mode 100644 src/libextra/glob.rs diff --git a/src/libextra/extra.rs b/src/libextra/extra.rs index f4fb7bcd76c99..735c5f0136aca 100644 --- a/src/libextra/extra.rs +++ b/src/libextra/extra.rs @@ -83,6 +83,7 @@ pub mod getopts; pub mod json; pub mod md4; pub mod tempfile; +pub mod glob; pub mod term; pub mod time; pub mod arena; diff --git a/src/libextra/glob.rs b/src/libextra/glob.rs new file mode 100644 index 0000000000000..f57a857b272e1 --- /dev/null +++ b/src/libextra/glob.rs @@ -0,0 +1,348 @@ +// Copyright 2013 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + + +use std::{os, uint}; + +use sort; + + +/** + * An iterator that yields Paths from the filesystem that match a particular + * pattern - see the glob function for more details. + */ +pub struct GlobIterator { + priv root: Path, + priv dir_patterns: ~[~[PatternToken]], + priv todo: ~[Path] +} + +enum PatternToken { + Char(char), + AnyChar, + AnySequence, + AnyWithin(~[char]), + AnyExcept(~[char]), +} + +/** + * Return an iterator that produces all the paths that match the given pattern, + * which may be absolute or relative to the current working directory. + * + * This function accepts Unix shell style patterns: + * '?' matches any single character. + * '*' matches any (possibly empty) sequence of characters. + * '[...]' matches any character inside the brackets, unless the first character + * is '!' in which case it matches any character except those between + * the '!' and the ']'. + * + * The metacharacters '?', '*', '[', ']' can be matched by using brackets (e.g. '[?]') + * + * Paths are yielded in alphabetical order, as absolute paths. + */ +pub fn glob(pattern: &str) -> GlobIterator { + + // note that this relies on the glob meta characters not + // having any special meaning in actual pathnames + let path = Path(pattern); + let dir_patterns = path.components.map(|s| compile_pattern(*s)); + + let root = if path.is_absolute() { + Path {components: ~[], .. path} // preserve windows path host/device + } else { + os::getcwd() + }; + let todo = list_dir_sorted(&root); + + GlobIterator { + root: root, + dir_patterns: dir_patterns, + todo: todo, + } +} + +impl Iterator for GlobIterator { + + fn next(&mut self) -> Option { + loop { + if self.dir_patterns.is_empty() || self.todo.is_empty() { + return None; + } + + let path = self.todo.pop(); + let pattern_index = path.components.len() - self.root.components.len() - 1; + + if pattern_matches(*path.components.last(), self.dir_patterns[pattern_index]) { + + if pattern_index == self.dir_patterns.len() - 1 { + // it is not possible for a pattern to match a directory *AND* its children + // so we don't need to check the children + return Some(path); + } else { + self.todo.push_all(list_dir_sorted(&path)); + } + } + } + } + +} + +fn list_dir_sorted(path: &Path) -> ~[Path] { + let mut children = os::list_dir_path(path); + sort::quick_sort(children, |p1, p2| p2.components.last() <= p1.components.last()); + children +} + +fn compile_pattern(pattern_str: &str) -> ~[PatternToken] { + let mut pattern = ~[]; + + let mut pattern_iter = pattern_str.iter(); + loop { + let pchar = match pattern_iter.next() { + None => break, + Some(c) => c, + }; + match pchar { + '?' => { + pattern.push(AnyChar); + } + '*' => { + pattern.push(AnySequence); + } + '[' => { + let mut chars = ~[]; + let is_except = match pattern_iter.next() { + None => false, // let the following loop fail with a message + Some('!') => true, + Some(c) => { + chars.push(c); + false + } + }; + loop { + match pattern_iter.next() { + None => fail!("invalid pattern syntax due to unclosed bracket: %s", + pattern_str), + Some(']') => break, + Some(c) => chars.push(c), + } + } + pattern.push(if is_except { AnyExcept(chars) } else { AnyWithin(chars) }); + } + c => { + pattern.push(Char(c)); + } + } + } + + pattern +} + +fn pattern_matches(mut file: &str, pattern: &[PatternToken]) -> bool { + + for uint::range(0, pattern.len()) |pi| { + match pattern[pi] { + AnySequence => { + loop { + if pattern_matches(file, pattern.slice_from(pi + 1)) { + return true; + } + if file.is_empty() { + return false; + } + file = file.slice_shift_char().second(); + } + } + _ => { + if file.is_empty() { + return false; + } + let (c, next) = file.slice_shift_char(); + let matches = match pattern[pi] { + AnyChar => true, + AnyWithin(ref chars) => chars.contains(&c), + AnyExcept(ref chars) => !chars.contains(&c), + Char(c2) => c == c2, + AnySequence => fail!(), + }; + if !matches { + return false; + } + file = next; + } + } + } + + file.is_empty() +} + +#[cfg(test)] +mod test { + use std::{io, os, unstable}; + use super::*; + + #[test] + fn test_relative_pattern() { + + fn mk_file(path: &str, directory: bool) { + if directory { + os::make_dir(&Path(path), 0xFFFF); + } else { + io::mk_file_writer(&Path(path), [io::Create]); + } + } + + fn abs_path(path: &str) -> Path { + os::getcwd().push_many(Path(path).components) + } + + fn glob_vec(pattern: &str) -> ~[Path] { + glob(pattern).collect() + } + + mk_file("tmp", true); + mk_file("tmp/glob-tests", true); + + do unstable::change_dir_locked(&Path("tmp/glob-tests")) { + + mk_file("aaa", true); + mk_file("aaa/apple", true); + mk_file("aaa/orange", true); + mk_file("aaa/tomato", true); + mk_file("aaa/tomato/tomato.txt", false); + mk_file("aaa/tomato/tomoto.txt", false); + mk_file("bbb", true); + mk_file("bbb/specials", true); + mk_file("bbb/specials/!", false); + + // windows does not allow some meta characters to exist in filenames + if os::consts::FAMILY != os::consts::windows::FAMILY { + mk_file("bbb/specials/*", false); + mk_file("bbb/specials/?", false); + } + + mk_file("bbb/specials/[", false); + mk_file("bbb/specials/]", false); + mk_file("ccc", true); + mk_file("xyz", true); + mk_file("xyz/x", false); + mk_file("xyz/y", false); + mk_file("xyz/z", false); + + assert_eq!(glob_vec(""), ~[]); + assert_eq!(glob_vec("."), ~[]); + assert_eq!(glob_vec(".."), ~[]); + + assert_eq!(glob_vec("aaa"), ~[abs_path("aaa")]); + assert_eq!(glob_vec("aaa/"), ~[abs_path("aaa")]); + assert_eq!(glob_vec("a"), ~[]); + assert_eq!(glob_vec("aa"), ~[]); + assert_eq!(glob_vec("aaaa"), ~[]); + + assert_eq!(glob_vec("aaa/apple"), ~[abs_path("aaa/apple")]); + assert_eq!(glob_vec("aaa/apple/nope"), ~[]); + + // windows should support both / and \ as directory separators + if os::consts::FAMILY == os::consts::windows::FAMILY { + assert_eq!(glob_vec("aaa\\apple"), ~[abs_path("aaa/apple")]); + } + + assert_eq!(glob_vec("???/"), ~[ + abs_path("aaa"), + abs_path("bbb"), + abs_path("ccc"), + abs_path("xyz")]); + + assert_eq!(glob_vec("aaa/tomato/tom?to.txt"), ~[ + abs_path("aaa/tomato/tomato.txt"), + abs_path("aaa/tomato/tomoto.txt")]); + + assert_eq!(glob_vec("xyz/?"), ~[ + abs_path("xyz/x"), + abs_path("xyz/y"), + abs_path("xyz/z")]); + + assert_eq!(glob_vec("a*"), ~[abs_path("aaa")]); + assert_eq!(glob_vec("*a*"), ~[abs_path("aaa")]); + assert_eq!(glob_vec("a*a"), ~[abs_path("aaa")]); + assert_eq!(glob_vec("aaa*"), ~[abs_path("aaa")]); + assert_eq!(glob_vec("*aaa"), ~[abs_path("aaa")]); + assert_eq!(glob_vec("*aaa*"), ~[abs_path("aaa")]); + assert_eq!(glob_vec("*a*a*a*"), ~[abs_path("aaa")]); + assert_eq!(glob_vec("aaa*/"), ~[abs_path("aaa")]); + + assert_eq!(glob_vec("aaa/*"), ~[ + abs_path("aaa/apple"), + abs_path("aaa/orange"), + abs_path("aaa/tomato")]); + + assert_eq!(glob_vec("aaa/*a*"), ~[ + abs_path("aaa/apple"), + abs_path("aaa/orange"), + abs_path("aaa/tomato")]); + + assert_eq!(glob_vec("*/*/*.txt"), ~[ + abs_path("aaa/tomato/tomato.txt"), + abs_path("aaa/tomato/tomoto.txt")]); + + assert_eq!(glob_vec("*/*/t[aob]m?to[.]t[!y]t"), ~[ + abs_path("aaa/tomato/tomato.txt"), + abs_path("aaa/tomato/tomoto.txt")]); + + assert_eq!(glob_vec("aa[a]"), ~[abs_path("aaa")]); + assert_eq!(glob_vec("aa[abc]"), ~[abs_path("aaa")]); + assert_eq!(glob_vec("a[bca]a"), ~[abs_path("aaa")]); + assert_eq!(glob_vec("aa[b]"), ~[]); + assert_eq!(glob_vec("aa[xyz]"), ~[]); + assert_eq!(glob_vec("aa[]]"), ~[]); + + assert_eq!(glob_vec("aa[!b]"), ~[abs_path("aaa")]); + assert_eq!(glob_vec("aa[!bcd]"), ~[abs_path("aaa")]); + assert_eq!(glob_vec("a[!bcd]a"), ~[abs_path("aaa")]); + assert_eq!(glob_vec("aa[!]"), ~[abs_path("aaa")]); + assert_eq!(glob_vec("aa[!a]"), ~[]); + assert_eq!(glob_vec("aa[!abc]"), ~[]); + + assert_eq!(glob_vec("bbb/specials/[[]"), ~[abs_path("bbb/specials/[")]); + assert_eq!(glob_vec("bbb/specials/!"), ~[abs_path("bbb/specials/!")]); + assert_eq!(glob_vec("bbb/specials/[]]"), ~[abs_path("bbb/specials/]")]); + + if os::consts::FAMILY != os::consts::windows::FAMILY { + assert_eq!(glob_vec("bbb/specials/[*]"), ~[abs_path("bbb/specials/*")]); + assert_eq!(glob_vec("bbb/specials/[?]"), ~[abs_path("bbb/specials/?")]); + } + }; + } + + #[test] + fn test_absolute_pattern() { + // assume that the filesystem is not empty! + assert!(glob("/*").next().is_some()); + assert!(glob("//").next().is_none()); + + // check windows absolute paths with host/device components + let root_with_device = (Path {components: ~[], .. os::getcwd()}).to_str() + "*"; + assert!(glob(root_with_device).next().is_some()); + } + + #[test] + fn test_lots_of_files() { + // this is a good test because it touches lots of differently named files + for glob("/*/*/*/*").advance |_p| {} + } + + #[test] + #[should_fail] + #[cfg(not(windows))] + fn test_unclosed_bracket() { + glob("abc[def"); + } +} + diff --git a/src/libstd/os.rs b/src/libstd/os.rs index 3afd946ee264b..bf6db6cb1534f 100644 --- a/src/libstd/os.rs +++ b/src/libstd/os.rs @@ -1227,88 +1227,6 @@ pub fn set_args(new_args: ~[~str]) { local_data::set(overridden_arg_key, overridden_args); } -// FIXME #6100 we should really use an internal implementation of this - using -// the POSIX glob functions isn't portable to windows, probably has slight -// inconsistencies even where it is implemented, and makes extending -// functionality a lot more difficult -// FIXME #6101 also provide a non-allocating version - each_glob or so? -/// Returns a vector of Path objects that match the given glob pattern -#[cfg(target_os = "linux")] -#[cfg(target_os = "android")] -#[cfg(target_os = "freebsd")] -#[cfg(target_os = "macos")] -pub fn glob(pattern: &str) -> ~[Path] { - #[cfg(target_os = "linux")] - #[cfg(target_os = "android")] - fn default_glob_t () -> libc::glob_t { - libc::glob_t { - gl_pathc: 0, - gl_pathv: ptr::null(), - gl_offs: 0, - __unused1: ptr::null(), - __unused2: ptr::null(), - __unused3: ptr::null(), - __unused4: ptr::null(), - __unused5: ptr::null(), - } - } - - #[cfg(target_os = "freebsd")] - fn default_glob_t () -> libc::glob_t { - libc::glob_t { - gl_pathc: 0, - __unused1: 0, - gl_offs: 0, - __unused2: 0, - gl_pathv: ptr::null(), - __unused3: ptr::null(), - __unused4: ptr::null(), - __unused5: ptr::null(), - __unused6: ptr::null(), - __unused7: ptr::null(), - __unused8: ptr::null(), - } - } - - #[cfg(target_os = "macos")] - fn default_glob_t () -> libc::glob_t { - libc::glob_t { - gl_pathc: 0, - __unused1: 0, - gl_offs: 0, - __unused2: 0, - gl_pathv: ptr::null(), - __unused3: ptr::null(), - __unused4: ptr::null(), - __unused5: ptr::null(), - __unused6: ptr::null(), - __unused7: ptr::null(), - __unused8: ptr::null(), - } - } - - let mut g = default_glob_t(); - do pattern.as_c_str |c_pattern| { - unsafe { libc::glob(c_pattern, 0, ptr::null(), &mut g) } - }; - do(|| { - let paths = unsafe { - vec::raw::from_buf_raw(g.gl_pathv, g.gl_pathc as uint) - }; - do paths.map |&c_str| { - Path(unsafe { str::raw::from_c_str(c_str) }) - } - }).finally { - unsafe { libc::globfree(&mut g) }; - } -} - -/// Returns a vector of Path objects that match the given glob pattern -#[cfg(target_os = "win32")] -pub fn glob(_pattern: &str) -> ~[Path] { - fail!("glob() is unimplemented on Windows") -} - #[cfg(target_os = "macos")] extern { // These functions are in crt_externs.h. From 60bc719a8fec7e86f9dcddbd33841290ed6a4cef Mon Sep 17 00:00:00 2001 From: Gareth Smith Date: Sun, 4 Aug 2013 22:09:41 +0100 Subject: [PATCH 2/7] Allow NOT ']' to be matched with the pattern '[!]]' (prior to this there was no way to match NOT ']'). Also do some refactoring to avoid relying on the vector iterator always returning None after returning None for the first time (as requested by @kballard). --- src/libextra/glob.rs | 103 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 88 insertions(+), 15 deletions(-) diff --git a/src/libextra/glob.rs b/src/libextra/glob.rs index f57a857b272e1..4b0e865684af5 100644 --- a/src/libextra/glob.rs +++ b/src/libextra/glob.rs @@ -43,7 +43,10 @@ enum PatternToken { * is '!' in which case it matches any character except those between * the '!' and the ']'. * - * The metacharacters '?', '*', '[', ']' can be matched by using brackets (e.g. '[?]') + * The metacharacters '?', '*', '[', ']' can be matched by using brackets (e.g. '[?]'). + * When a ']' occurs immediately following '[' or '[!' then it is interpreted as + * being part of, rather then ending, the character set, so ']' and NOT ']' can be + * matched by '[]]' and '[!]]' respectively. * * Paths are yielded in alphabetical order, as absolute paths. */ @@ -117,21 +120,30 @@ fn compile_pattern(pattern_str: &str) -> ~[PatternToken] { pattern.push(AnySequence); } '[' => { - let mut chars = ~[]; - let is_except = match pattern_iter.next() { - None => false, // let the following loop fail with a message - Some('!') => true, - Some(c) => { - chars.push(c); - false - } - }; - loop { + + // get the next char, or fail with a helpful message + let next_pattern_char = || { match pattern_iter.next() { None => fail!("invalid pattern syntax due to unclosed bracket: %s", pattern_str), - Some(']') => break, - Some(c) => chars.push(c), + Some(c) => c, + } + }; + + let c = next_pattern_char(); + let is_except = (c == '!'); + + let mut chars = ~[]; + if is_except { + chars.push(next_pattern_char()); + } else { + chars.push(c); + }; + + loop { + match next_pattern_char() { + ']' => break, + c => chars.push(c), } } pattern.push(if is_except { AnyExcept(chars) } else { AnyWithin(chars) }); @@ -222,7 +234,7 @@ mod test { mk_file("bbb/specials", true); mk_file("bbb/specials/!", false); - // windows does not allow some meta characters to exist in filenames + // windows does not allow '*' or '?' characters to exist in filenames if os::consts::FAMILY != os::consts::windows::FAMILY { mk_file("bbb/specials/*", false); mk_file("bbb/specials/?", false); @@ -306,7 +318,6 @@ mod test { assert_eq!(glob_vec("aa[!b]"), ~[abs_path("aaa")]); assert_eq!(glob_vec("aa[!bcd]"), ~[abs_path("aaa")]); assert_eq!(glob_vec("a[!bcd]a"), ~[abs_path("aaa")]); - assert_eq!(glob_vec("aa[!]"), ~[abs_path("aaa")]); assert_eq!(glob_vec("aa[!a]"), ~[]); assert_eq!(glob_vec("aa[!abc]"), ~[]); @@ -318,6 +329,54 @@ mod test { assert_eq!(glob_vec("bbb/specials/[*]"), ~[abs_path("bbb/specials/*")]); assert_eq!(glob_vec("bbb/specials/[?]"), ~[abs_path("bbb/specials/?")]); } + + if os::consts::FAMILY == os::consts::windows::FAMILY { + + assert_eq!(glob_vec("bbb/specials/[![]"), ~[ + abs_path("bbb/specials/!"), + abs_path("bbb/specials/]")]); + + assert_eq!(glob_vec("bbb/specials/[!]]"), ~[ + abs_path("bbb/specials/!"), + abs_path("bbb/specials/[")]); + + assert_eq!(glob_vec("bbb/specials/[!!]"), ~[ + abs_path("bbb/specials/["), + abs_path("bbb/specials/]")]); + + } else { + + assert_eq!(glob_vec("bbb/specials/[![]"), ~[ + abs_path("bbb/specials/!"), + abs_path("bbb/specials/*"), + abs_path("bbb/specials/?"), + abs_path("bbb/specials/]")]); + + assert_eq!(glob_vec("bbb/specials/[!]]"), ~[ + abs_path("bbb/specials/!"), + abs_path("bbb/specials/*"), + abs_path("bbb/specials/?"), + abs_path("bbb/specials/[")]); + + assert_eq!(glob_vec("bbb/specials/[!!]"), ~[ + abs_path("bbb/specials/*"), + abs_path("bbb/specials/?"), + abs_path("bbb/specials/["), + abs_path("bbb/specials/]")]); + + assert_eq!(glob_vec("bbb/specials/[!*]"), ~[ + abs_path("bbb/specials/!"), + abs_path("bbb/specials/?"), + abs_path("bbb/specials/["), + abs_path("bbb/specials/]")]); + + assert_eq!(glob_vec("bbb/specials/[!?]"), ~[ + abs_path("bbb/specials/!"), + abs_path("bbb/specials/*"), + abs_path("bbb/specials/["), + abs_path("bbb/specials/]")]); + + } }; } @@ -344,5 +403,19 @@ mod test { fn test_unclosed_bracket() { glob("abc[def"); } + + #[test] + #[should_fail] + #[cfg(not(windows))] + fn test_unclosed_bracket_special() { + glob("abc[]"); // not valid syntax, '[]]' should be used to match ']' + } + + #[test] + #[should_fail] + #[cfg(not(windows))] + fn test_unclosed_bracket_special_except() { + glob("abc[!]"); // not valid syntax, '[!]]' should be used to match NOT ']' + } } From bbabc8d279015bacd9e81c6522cff6f3cd3479f9 Mon Sep 17 00:00:00 2001 From: Gareth Smith Date: Mon, 5 Aug 2013 19:20:15 +0100 Subject: [PATCH 3/7] Refactor extra::glob so that it now exposes a Pattern::matches method that acts something like the C fnmatch function (albeit without any options... yet). --- src/libextra/glob.rs | 245 ++++++++++++++++++++++++++----------------- 1 file changed, 148 insertions(+), 97 deletions(-) diff --git a/src/libextra/glob.rs b/src/libextra/glob.rs index 4b0e865684af5..9398d3b656813 100644 --- a/src/libextra/glob.rs +++ b/src/libextra/glob.rs @@ -20,33 +20,17 @@ use sort; */ pub struct GlobIterator { priv root: Path, - priv dir_patterns: ~[~[PatternToken]], + priv dir_patterns: ~[Pattern], priv todo: ~[Path] } -enum PatternToken { - Char(char), - AnyChar, - AnySequence, - AnyWithin(~[char]), - AnyExcept(~[char]), -} - /** - * Return an iterator that produces all the paths that match the given pattern, + * Return an iterator that produces all the Paths that match the given pattern, * which may be absolute or relative to the current working directory. * - * This function accepts Unix shell style patterns: - * '?' matches any single character. - * '*' matches any (possibly empty) sequence of characters. - * '[...]' matches any character inside the brackets, unless the first character - * is '!' in which case it matches any character except those between - * the '!' and the ']'. - * - * The metacharacters '?', '*', '[', ']' can be matched by using brackets (e.g. '[?]'). - * When a ']' occurs immediately following '[' or '[!' then it is interpreted as - * being part of, rather then ending, the character set, so ']' and NOT ']' can be - * matched by '[]]' and '[!]]' respectively. + * This function accepts Unix shell style patterns as described by Pattern::new(..), + * with the exception that path separators (i.e. '/' on Posix systems) must be matched + * by their literal representation - they can't be matched by '*', '?', '[...]', etc. * * Paths are yielded in alphabetical order, as absolute paths. */ @@ -55,7 +39,7 @@ pub fn glob(pattern: &str) -> GlobIterator { // note that this relies on the glob meta characters not // having any special meaning in actual pathnames let path = Path(pattern); - let dir_patterns = path.components.map(|s| compile_pattern(*s)); + let dir_patterns = path.components.map(|s| Pattern::new(*s)); let root = if path.is_absolute() { Path {components: ~[], .. path} // preserve windows path host/device @@ -82,7 +66,7 @@ impl Iterator for GlobIterator { let path = self.todo.pop(); let pattern_index = path.components.len() - self.root.components.len() - 1; - if pattern_matches(*path.components.last(), self.dir_patterns[pattern_index]) { + if self.dir_patterns[pattern_index].matches(*path.components.last()) { if pattern_index == self.dir_patterns.len() - 1 { // it is not possible for a pattern to match a directory *AND* its children @@ -103,96 +87,145 @@ fn list_dir_sorted(path: &Path) -> ~[Path] { children } -fn compile_pattern(pattern_str: &str) -> ~[PatternToken] { - let mut pattern = ~[]; - - let mut pattern_iter = pattern_str.iter(); - loop { - let pchar = match pattern_iter.next() { - None => break, - Some(c) => c, - }; - match pchar { - '?' => { - pattern.push(AnyChar); - } - '*' => { - pattern.push(AnySequence); - } - '[' => { - - // get the next char, or fail with a helpful message - let next_pattern_char = || { - match pattern_iter.next() { - None => fail!("invalid pattern syntax due to unclosed bracket: %s", - pattern_str), - Some(c) => c, - } - }; - - let c = next_pattern_char(); - let is_except = (c == '!'); +/** + * A compiled Unix shell style pattern. + */ +pub struct Pattern { + priv tokens: ~[PatternToken] +} - let mut chars = ~[]; - if is_except { - chars.push(next_pattern_char()); - } else { - chars.push(c); - }; +enum PatternToken { + Char(char), + AnyChar, + AnySequence, + AnyWithin(~[char]), + AnyExcept(~[char]) +} - loop { - match next_pattern_char() { - ']' => break, - c => chars.push(c), +impl Pattern { + + /** + * This function compiles Unix shell style patterns: + * '?' matches any single character. + * '*' matches any (possibly empty) sequence of characters. + * '[...]' matches any character inside the brackets, unless the first character + * is '!' in which case it matches any character except those between + * the '!' and the ']'. + * + * The metacharacters '?', '*', '[', ']' can be matched by using brackets (e.g. '[?]'). + * When a ']' occurs immediately following '[' or '[!' then it is interpreted as + * being part of, rather then ending, the character set, so ']' and NOT ']' can be + * matched by '[]]' and '[!]]' respectively. + * + * This function will fail if the given pattern string is malformed - e.g. if it contains + * a '[...]' sequence that is missing its closing ']'. + */ + pub fn new(pattern: &str) -> Pattern { + let mut tokens = ~[]; + + let mut pattern_iter = pattern.iter(); + loop { + let pchar = match pattern_iter.next() { + None => break, + Some(c) => c, + }; + match pchar { + '?' => { + tokens.push(AnyChar); + } + '*' => { + tokens.push(AnySequence); + } + '[' => { + + // get the next char, or fail with a helpful message + let next_pattern_char = || { + match pattern_iter.next() { + None => fail!("invalid pattern syntax due to unclosed bracket: %s", + pattern), + Some(c) => c, + } + }; + + let c = next_pattern_char(); + let is_except = (c == '!'); + + let mut chars = ~[]; + if is_except { + chars.push(next_pattern_char()); + } else { + chars.push(c); + }; + + loop { + match next_pattern_char() { + ']' => break, + c => chars.push(c), + } } + tokens.push(if is_except { AnyExcept(chars) } else { AnyWithin(chars) }); + } + c => { + tokens.push(Char(c)); } - pattern.push(if is_except { AnyExcept(chars) } else { AnyWithin(chars) }); - } - c => { - pattern.push(Char(c)); } } + + Pattern { tokens: tokens } } - pattern -} + /** + * Return if the given str matches this Pattern. + */ + pub fn matches(&self, file: &str) -> bool { + self.matches_from(file, 0) + } -fn pattern_matches(mut file: &str, pattern: &[PatternToken]) -> bool { + /** + * Return if the given Path, when converted to a str, matches this Pattern. + */ + pub fn matches_path(&self, path: &Path) -> bool { + self.matches(path.to_str()) + } - for uint::range(0, pattern.len()) |pi| { - match pattern[pi] { - AnySequence => { - loop { - if pattern_matches(file, pattern.slice_from(pi + 1)) { - return true; + fn matches_from(&self, mut file: &str, i: uint) -> bool { + + for uint::range(i, self.tokens.len()) |ti| { + match self.tokens[ti] { + AnySequence => { + loop { + if self.matches_from(file, ti + 1) { + return true; + } + if file.is_empty() { + return false; + } + file = file.slice_shift_char().second(); } + } + _ => { if file.is_empty() { return false; } - file = file.slice_shift_char().second(); - } - } - _ => { - if file.is_empty() { - return false; - } - let (c, next) = file.slice_shift_char(); - let matches = match pattern[pi] { - AnyChar => true, - AnyWithin(ref chars) => chars.contains(&c), - AnyExcept(ref chars) => !chars.contains(&c), - Char(c2) => c == c2, - AnySequence => fail!(), - }; - if !matches { - return false; + let (c, next) = file.slice_shift_char(); + let matches = match self.tokens[ti] { + AnyChar => true, + AnyWithin(ref chars) => chars.contains(&c), + AnyExcept(ref chars) => !chars.contains(&c), + Char(c2) => c == c2, + AnySequence => fail!(), + }; + if !matches { + return false; + } + file = next; } - file = next; } } + + file.is_empty() } - file.is_empty() } #[cfg(test)] @@ -401,21 +434,39 @@ mod test { #[should_fail] #[cfg(not(windows))] fn test_unclosed_bracket() { - glob("abc[def"); + Pattern::new("abc[def"); } #[test] #[should_fail] #[cfg(not(windows))] fn test_unclosed_bracket_special() { - glob("abc[]"); // not valid syntax, '[]]' should be used to match ']' + Pattern::new("abc[]"); // not valid syntax, '[]]' should be used to match ']' } #[test] #[should_fail] #[cfg(not(windows))] fn test_unclosed_bracket_special_except() { - glob("abc[!]"); // not valid syntax, '[!]]' should be used to match NOT ']' + Pattern::new("abc[!]"); // not valid syntax, '[!]]' should be used to match NOT ']' + } + + #[test] + fn test_pattern_matches() { + let txt_pat = Pattern::new("*hello.txt"); + assert!(txt_pat.matches("hello.txt")); + assert!(txt_pat.matches("gareth_says_hello.txt")); + assert!(txt_pat.matches("some/path/to/hello.txt")); + assert!(txt_pat.matches("some\\path\\to\\hello.txt")); + assert!(txt_pat.matches("/an/absolute/path/to/hello.txt")); + assert!(!txt_pat.matches("hello.txt-and-then-some")); + assert!(!txt_pat.matches("goodbye.txt")); + + let dir_pat = Pattern::new("*some/path/to/hello.txt"); + assert!(dir_pat.matches("some/path/to/hello.txt")); + assert!(dir_pat.matches("a/bigger/some/path/to/hello.txt")); + assert!(!dir_pat.matches("some/path/to/hello.txt-and-then-some")); + assert!(!dir_pat.matches("some/other/path/to/hello.txt")); } } From b0648a0af3a76fbf88277291efe877dba74c70bf Mon Sep 17 00:00:00 2001 From: Gareth Smith Date: Mon, 5 Aug 2013 21:29:26 +0100 Subject: [PATCH 4/7] Add Pattern::escape(&str) to help constructing dynamic pattern strings. --- src/libextra/glob.rs | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/src/libextra/glob.rs b/src/libextra/glob.rs index 9398d3b656813..8192a8853c0bd 100644 --- a/src/libextra/glob.rs +++ b/src/libextra/glob.rs @@ -174,6 +174,29 @@ impl Pattern { Pattern { tokens: tokens } } + /** + * Escape metacharacters within the given string by surrounding them in + * brackets. The resulting string will, when compiled into a Pattern, + * match the input string and nothing else. + */ + pub fn escape(s: &str) -> ~str { + let mut escaped = ~""; + foreach c in s.iter() { + match c { + // note that ! does not need escaping because it is only special inside brackets + '?' | '*' | '[' | ']' => { + escaped.push_char('['); + escaped.push_char(c); + escaped.push_char(']'); + } + c => { + escaped.push_char(c); + } + } + } + escaped + } + /** * Return if the given str matches this Pattern. */ @@ -468,5 +491,12 @@ mod test { assert!(!dir_pat.matches("some/path/to/hello.txt-and-then-some")); assert!(!dir_pat.matches("some/other/path/to/hello.txt")); } + + #[test] + fn test_pattern_escape() { + let s = "_[_]_?_*_!_"; + assert_eq!(Pattern::escape(s), ~"_[[]_[]]_[?]_[*]_!_"); + assert!(Pattern::new(Pattern::escape(s)).matches(s)); + } } From 149ab389ae57995ef3475e9dd978b179235a16f9 Mon Sep 17 00:00:00 2001 From: Gareth Smith Date: Wed, 14 Aug 2013 20:23:32 +0100 Subject: [PATCH 5/7] Use util::unreachable() instead of fail!() in aid of self-describing code. --- src/libextra/glob.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/libextra/glob.rs b/src/libextra/glob.rs index 8192a8853c0bd..f92a604fb65b4 100644 --- a/src/libextra/glob.rs +++ b/src/libextra/glob.rs @@ -9,7 +9,7 @@ // except according to those terms. -use std::{os, uint}; +use std::{os, uint, util}; use sort; @@ -236,7 +236,7 @@ impl Pattern { AnyWithin(ref chars) => chars.contains(&c), AnyExcept(ref chars) => !chars.contains(&c), Char(c2) => c == c2, - AnySequence => fail!(), + AnySequence => util::unreachable(), }; if !matches { return false; From b75e005324aac85b85ae962a451098d6c9d3ac7c Mon Sep 17 00:00:00 2001 From: Gareth Smith Date: Thu, 15 Aug 2013 20:36:37 +0100 Subject: [PATCH 6/7] Allow matching options to be specified when calling glob and Pattern::matches, and provide a case_sensitive option that defaults to true. --- src/libextra/glob.rs | 134 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 122 insertions(+), 12 deletions(-) diff --git a/src/libextra/glob.rs b/src/libextra/glob.rs index f92a604fb65b4..7886489d14fe6 100644 --- a/src/libextra/glob.rs +++ b/src/libextra/glob.rs @@ -21,9 +21,19 @@ use sort; pub struct GlobIterator { priv root: Path, priv dir_patterns: ~[Pattern], + priv options: MatchOptions, priv todo: ~[Path] } +/** + * Call glob_with with the default match options. + * + * This equivalent to glob_with(pattern, MatchOptions::new()) + */ +pub fn glob(pattern: &str) -> GlobIterator { + glob_with(pattern, MatchOptions::new()) +} + /** * Return an iterator that produces all the Paths that match the given pattern, * which may be absolute or relative to the current working directory. @@ -32,9 +42,11 @@ pub struct GlobIterator { * with the exception that path separators (i.e. '/' on Posix systems) must be matched * by their literal representation - they can't be matched by '*', '?', '[...]', etc. * + * The options given are passed through unchanged to Pattern::matches_with(..). + * * Paths are yielded in alphabetical order, as absolute paths. */ -pub fn glob(pattern: &str) -> GlobIterator { +pub fn glob_with(pattern: &str, options: MatchOptions) -> GlobIterator { // note that this relies on the glob meta characters not // having any special meaning in actual pathnames @@ -51,6 +63,7 @@ pub fn glob(pattern: &str) -> GlobIterator { GlobIterator { root: root, dir_patterns: dir_patterns, + options: options, todo: todo, } } @@ -65,8 +78,9 @@ impl Iterator for GlobIterator { let path = self.todo.pop(); let pattern_index = path.components.len() - self.root.components.len() - 1; + let ref pattern = self.dir_patterns[pattern_index]; - if self.dir_patterns[pattern_index].matches(*path.components.last()) { + if pattern.matches_with(*path.components.last(), self.options) { if pattern_index == self.dir_patterns.len() - 1 { // it is not possible for a pattern to match a directory *AND* its children @@ -198,26 +212,43 @@ impl Pattern { } /** - * Return if the given str matches this Pattern. + * Return if the given str matches this Pattern using the default + * match options (i.e. MatchOptions::new()). */ pub fn matches(&self, file: &str) -> bool { - self.matches_from(file, 0) + self.matches_with(file, MatchOptions::new()) } /** - * Return if the given Path, when converted to a str, matches this Pattern. + * Return if the given Path, when converted to a str, matches this Pattern + * using the default match options (i.e. MatchOptions::new()). */ pub fn matches_path(&self, path: &Path) -> bool { self.matches(path.to_str()) } - fn matches_from(&self, mut file: &str, i: uint) -> bool { + /** + * Return if the given str matches this Pattern using the specified match options. + */ + pub fn matches_with(&self, file: &str, options: MatchOptions) -> bool { + self.matches_from(file, 0, options) + } + + /** + * Return if the given Path, when converted to a str, matches this Pattern + * using the specified match options. + */ + pub fn matches_path_with(&self, path: &Path, options: MatchOptions) -> bool { + self.matches_with(path.to_str(), options) + } + + fn matches_from(&self, mut file: &str, i: uint, options: MatchOptions) -> bool { for uint::range(i, self.tokens.len()) |ti| { match self.tokens[ti] { AnySequence => { loop { - if self.matches_from(file, ti + 1) { + if self.matches_from(file, ti + 1, options) { return true; } if file.is_empty() { @@ -232,11 +263,21 @@ impl Pattern { } let (c, next) = file.slice_shift_char(); let matches = match self.tokens[ti] { - AnyChar => true, - AnyWithin(ref chars) => chars.contains(&c), - AnyExcept(ref chars) => !chars.contains(&c), - Char(c2) => c == c2, - AnySequence => util::unreachable(), + AnyChar => { + true + } + AnyWithin(ref chars) => { + chars.rposition(|&e| chars_eq(e, c, options.case_sensitive)).is_some() + } + AnyExcept(ref chars) => { + chars.rposition(|&e| chars_eq(e, c, options.case_sensitive)).is_none() + } + Char(c2) => { + chars_eq(c, c2, options.case_sensitive) + } + AnySequence => { + util::unreachable() + } }; if !matches { return false; @@ -251,6 +292,43 @@ impl Pattern { } +/// A helper function to determine if two chars are (possibly case-insensitively) equal. +fn chars_eq(a: char, b: char, case_sensitive: bool) -> bool { + // FIXME: work with non-ascii chars properly (issue #1347) + if !case_sensitive && a.is_ascii() && b.is_ascii() { + a.to_ascii().eq_ignore_case(b.to_ascii()) + } else { + a == b + } +} + +/** + * Configuration options to modify the behaviour of Pattern::matches_with(..) + */ +pub struct MatchOptions { + /// Whether or not patterns should be matched in a case-sensitive manner. + case_sensitive: bool +} + +impl MatchOptions { + + /** + * Constructs a new MatchOptions with default field values. This is used + * when calling functions that do not take an explicit MatchOptions parameter. + * + * This function always returns this value: + * MatchOptions { + * case_sensitive: true + * } + */ + pub fn new() -> MatchOptions { + MatchOptions { + case_sensitive: true + } + } + +} + #[cfg(test)] mod test { use std::{io, os, unstable}; @@ -498,5 +576,37 @@ mod test { assert_eq!(Pattern::escape(s), ~"_[[]_[]]_[?]_[*]_!_"); assert!(Pattern::new(Pattern::escape(s)).matches(s)); } + + #[test] + fn test_pattern_matches_case_insensitive() { + + let pat = Pattern::new("aBcDeFg"); + let options = MatchOptions { case_sensitive: false }; + + assert!(pat.matches_with("aBcDeFg", options)); + assert!(pat.matches_with("abcdefg", options)); + assert!(pat.matches_with("ABCDEFG", options)); + assert!(pat.matches_with("AbCdEfG", options)); + } + + #[test] + fn test_pattern_matches_case_insensitive_within() { + + let pat = Pattern::new("[a]"); + + assert!(pat.matches_with("a", MatchOptions { case_sensitive: false })); + assert!(pat.matches_with("A", MatchOptions { case_sensitive: false })); + assert!(!pat.matches_with("A", MatchOptions { case_sensitive: true })); + } + + #[test] + fn test_pattern_matches_case_insensitive_except() { + + let pat = Pattern::new("[!a]"); + + assert!(!pat.matches_with("a", MatchOptions { case_sensitive: false })); + assert!(!pat.matches_with("A", MatchOptions { case_sensitive: false })); + assert!(pat.matches_with("A", MatchOptions { case_sensitive: true })); + } } From 006b5e017ed2ab5f8d63984b86c8634c31e34905 Mon Sep 17 00:00:00 2001 From: Gareth Smith Date: Fri, 16 Aug 2013 14:57:53 +0100 Subject: [PATCH 7/7] Add a require_literal_separator match option that is equivalent to the FNM_PATHNAME flag in the libc fnmatch function --- src/libextra/glob.rs | 100 +++++++++++++++++++++++++++++++++---------- 1 file changed, 78 insertions(+), 22 deletions(-) diff --git a/src/libextra/glob.rs b/src/libextra/glob.rs index 7886489d14fe6..0608c96f1fa92 100644 --- a/src/libextra/glob.rs +++ b/src/libextra/glob.rs @@ -9,7 +9,7 @@ // except according to those terms. -use std::{os, uint, util}; +use std::{os, path, uint, util}; use sort; @@ -38,11 +38,10 @@ pub fn glob(pattern: &str) -> GlobIterator { * Return an iterator that produces all the Paths that match the given pattern, * which may be absolute or relative to the current working directory. * - * This function accepts Unix shell style patterns as described by Pattern::new(..), - * with the exception that path separators (i.e. '/' on Posix systems) must be matched - * by their literal representation - they can't be matched by '*', '?', '[...]', etc. - * - * The options given are passed through unchanged to Pattern::matches_with(..). + * This function accepts Unix shell style patterns as described by Pattern::new(..). + * The options given are passed through unchanged to Pattern::matches_with(..) with + * the exception that require_literal_separator is always set to true regardless of the + * value passed to this function. * * Paths are yielded in alphabetical order, as absolute paths. */ @@ -251,25 +250,37 @@ impl Pattern { if self.matches_from(file, ti + 1, options) { return true; } + if file.is_empty() { return false; } - file = file.slice_shift_char().second(); + + let (c, next) = file.slice_shift_char(); + if options.require_literal_separator && is_sep(c) { + return false; + } + + file = next; } } _ => { if file.is_empty() { return false; } + let (c, next) = file.slice_shift_char(); + let require_literal = options.require_literal_separator && is_sep(c); + let matches = match self.tokens[ti] { AnyChar => { - true + !require_literal } AnyWithin(ref chars) => { + !require_literal && chars.rposition(|&e| chars_eq(e, c, options.case_sensitive)).is_some() } AnyExcept(ref chars) => { + !require_literal && chars.rposition(|&e| chars_eq(e, c, options.case_sensitive)).is_none() } Char(c2) => { @@ -302,12 +313,25 @@ fn chars_eq(a: char, b: char, case_sensitive: bool) -> bool { } } +/// A helper function to determine if a char is a path separator on the current platform. +#[cfg(windows)] +fn is_sep(c: char) -> bool { + path::windows::is_sep(c) +} +#[cfg(unix)] +fn is_sep(c: char) -> bool { + path::posix::is_sep(c) +} + /** * Configuration options to modify the behaviour of Pattern::matches_with(..) */ pub struct MatchOptions { /// Whether or not patterns should be matched in a case-sensitive manner. - case_sensitive: bool + case_sensitive: bool, + /// If this is true then path-component separator characters (e.g. '/' on Posix) + /// must be matched by a literal '/', rather than by '*' or '?' or '[...]' + require_literal_separator: bool } impl MatchOptions { @@ -318,12 +342,14 @@ impl MatchOptions { * * This function always returns this value: * MatchOptions { - * case_sensitive: true + * case_sensitive: true, + * require_literal_separator: false * } */ pub fn new() -> MatchOptions { MatchOptions { - case_sensitive: true + case_sensitive: true, + require_literal_separator: false } } @@ -581,7 +607,10 @@ mod test { fn test_pattern_matches_case_insensitive() { let pat = Pattern::new("aBcDeFg"); - let options = MatchOptions { case_sensitive: false }; + let options = MatchOptions { + case_sensitive: false, + require_literal_separator: false + }; assert!(pat.matches_with("aBcDeFg", options)); assert!(pat.matches_with("abcdefg", options)); @@ -590,23 +619,50 @@ mod test { } #[test] - fn test_pattern_matches_case_insensitive_within() { + fn test_pattern_matches_case_insensitive_range() { + + let pat_within = Pattern::new("[a]"); + let pat_except = Pattern::new("[!a]"); - let pat = Pattern::new("[a]"); + let options_case_insensitive = MatchOptions { + case_sensitive: false, + require_literal_separator: false + }; + let options_case_sensitive = MatchOptions { + case_sensitive: true, + require_literal_separator: false + }; - assert!(pat.matches_with("a", MatchOptions { case_sensitive: false })); - assert!(pat.matches_with("A", MatchOptions { case_sensitive: false })); - assert!(!pat.matches_with("A", MatchOptions { case_sensitive: true })); + assert!(pat_within.matches_with("a", options_case_insensitive)); + assert!(pat_within.matches_with("A", options_case_insensitive)); + assert!(!pat_within.matches_with("A", options_case_sensitive)); + + assert!(!pat_except.matches_with("a", options_case_insensitive)); + assert!(!pat_except.matches_with("A", options_case_insensitive)); + assert!(pat_except.matches_with("A", options_case_sensitive)); } #[test] - fn test_pattern_matches_case_insensitive_except() { + fn test_pattern_matches_require_literal_separator() { + + let options_requires_literal = MatchOptions { + case_sensitive: true, + require_literal_separator: true + }; + let options_not_requires_literal = MatchOptions { + case_sensitive: true, + require_literal_separator: false + }; - let pat = Pattern::new("[!a]"); + assert!(Pattern::new("abc/def").matches_with("abc/def", options_requires_literal)); + assert!(!Pattern::new("abc?def").matches_with("abc/def", options_requires_literal)); + assert!(!Pattern::new("abc*def").matches_with("abc/def", options_requires_literal)); + assert!(!Pattern::new("abc[/]def").matches_with("abc/def", options_requires_literal)); - assert!(!pat.matches_with("a", MatchOptions { case_sensitive: false })); - assert!(!pat.matches_with("A", MatchOptions { case_sensitive: false })); - assert!(pat.matches_with("A", MatchOptions { case_sensitive: true })); + assert!(Pattern::new("abc/def").matches_with("abc/def", options_not_requires_literal)); + assert!(Pattern::new("abc?def").matches_with("abc/def", options_not_requires_literal)); + assert!(Pattern::new("abc*def").matches_with("abc/def", options_not_requires_literal)); + assert!(Pattern::new("abc[/]def").matches_with("abc/def", options_not_requires_literal)); } }