From 4ce7303fc2e7934040f44e26c05d216500559312 Mon Sep 17 00:00:00 2001 From: Ed Page Date: Sat, 13 Jul 2019 19:52:24 -0600 Subject: [PATCH 1/2] refactor(parser): Switch to bstr for line splitting --- Cargo.lock | 80 ++++++++++++++++-------------------------------------- Cargo.toml | 2 +- src/lib.rs | 5 +++- 3 files changed, 29 insertions(+), 58 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3b78dc63d..6fb22530d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -71,8 +71,18 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] -name = "bytecount" -version = "0.3.2" +name = "bstr" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "lazy_static 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)", + "memchr 2.1.3 (registry+https://github.com/rust-lang/crates.io-index)", + "regex-automata 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "byteorder" +version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] @@ -153,22 +163,6 @@ name = "either" version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -[[package]] -name = "encoding_rs" -version = "0.8.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "cfg-if 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "encoding_rs_io" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "encoding_rs 0.8.14 (registry+https://github.com/rust-lang/crates.io-index)", -] - [[package]] name = "failure" version = "0.1.5" @@ -237,28 +231,6 @@ dependencies = [ "walkdir 2.2.7 (registry+https://github.com/rust-lang/crates.io-index)", ] -[[package]] -name = "grep-matcher" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "memchr 2.1.3 
(registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "grep-searcher" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "bytecount 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)", - "encoding_rs 0.8.14 (registry+https://github.com/rust-lang/crates.io-index)", - "encoding_rs_io 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)", - "grep-matcher 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", - "log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)", - "memchr 2.1.3 (registry+https://github.com/rust-lang/crates.io-index)", - "memmap 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)", -] - [[package]] name = "heck" version = "0.3.1" @@ -333,15 +305,6 @@ dependencies = [ "libc 0.2.47 (registry+https://github.com/rust-lang/crates.io-index)", ] -[[package]] -name = "memmap" -version = "0.6.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "libc 0.2.47 (registry+https://github.com/rust-lang/crates.io-index)", - "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", -] - [[package]] name = "normalize-line-endings" version = "0.2.2" @@ -568,6 +531,14 @@ dependencies = [ "utf8-ranges 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "regex-automata" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "byteorder 1.3.2 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "regex-syntax" version = "0.6.4" @@ -766,10 +737,10 @@ name = "typos" version = "0.1.0" dependencies = [ "assert_fs 0.10.1 (registry+https://github.com/rust-lang/crates.io-index)", + "bstr 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)", "clap 2.32.0 (registry+https://github.com/rust-lang/crates.io-index)", "csv 1.0.5 (registry+https://github.com/rust-lang/crates.io-index)", 
"failure 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)", - "grep-searcher 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", "ignore 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)", "itertools 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)", "lazy_static 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)", @@ -885,7 +856,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum backtrace 0.3.13 (registry+https://github.com/rust-lang/crates.io-index)" = "b5b493b66e03090ebc4343eb02f94ff944e0cbc9ac6571491d170ba026741eb5" "checksum backtrace-sys 0.1.28 (registry+https://github.com/rust-lang/crates.io-index)" = "797c830ac25ccc92a7f8a7b9862bde440715531514594a6154e3d4a54dd769b6" "checksum bitflags 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)" = "228047a76f468627ca71776ecdebd732a3423081fcf5125585bcd7c49886ce12" -"checksum bytecount 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)" = "f861d9ce359f56dbcb6e0c2a1cb84e52ad732cadb57b806adeb3c7668caccbd8" +"checksum bstr 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "6cc0572e02f76cb335f309b19e0a0d585b4f62788f7d26de2a13a836a637385f" +"checksum byteorder 1.3.2 (registry+https://github.com/rust-lang/crates.io-index)" = "a7c3dd8985a7111efc5c80b44e23ecdd8c007de8ade3b96595387e812b957cf5" "checksum cc 1.0.28 (registry+https://github.com/rust-lang/crates.io-index)" = "bb4a8b715cb4597106ea87c7c84b2f1d452c7492033765df7f32651e66fcf749" "checksum cfg-if 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "082bb9b28e00d3c9d39cc03e64ce4cea0f1bb9b3fde493f0cbc008472d22bdf4" "checksum clap 2.32.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b957d88f4b6a63b9d70d5f454ac8011819c6efa7727858f458ab71c756ce2d3e" @@ -896,8 +868,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum csv-core 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = 
"fa5cdef62f37e6ffe7d1f07a381bc0db32b7a3ff1cac0de56cb0d81e71f53d65" "checksum difference 2.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "524cbf6897b527295dff137cec09ecf3a05f4fddffd7dfcd1585403449e74198" "checksum either 1.5.0 (registry+https://github.com/rust-lang/crates.io-index)" = "3be565ca5c557d7f59e7cfcf1844f9e3033650c929c6566f511e8005f205c1d0" -"checksum encoding_rs 0.8.14 (registry+https://github.com/rust-lang/crates.io-index)" = "a69d152eaa438a291636c1971b0a370212165ca8a75759eb66818c5ce9b538f7" -"checksum encoding_rs_io 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "098f6a0ab73a9ba256b71344dc82c6d7e252736ad9db7f4e35345f3a1f8713f5" "checksum failure 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "795bd83d3abeb9220f257e597aa0080a508b27533824adf336529648f6abf7e2" "checksum failure_derive 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "ea1063915fd7ef4309e222a5a07cf9c319fb9c7836b1f89b85458672dbb127e1" "checksum float-cmp 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "134a8fa843d80a51a5b77d36d42bc2def9edcb0262c914861d08129fd1926600" @@ -906,8 +876,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum fuchsia-zircon-sys 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "3dcaa9ae7725d12cdb85b3ad99a434db70b468c09ded17e012d86b5c1010f7a7" "checksum globset 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "4743617a7464bbda3c8aec8558ff2f9429047e025771037df561d383337ff865" "checksum globwalk 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)" = "89fa2e29859da05acd066bd45996f05c271b271d7ec4a781f909682328f65d25" -"checksum grep-matcher 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "820946e0c314510779a8d86c5cd03240e0ae0993dabcdb98733a8f6a9001b607" -"checksum grep-searcher 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = 
"6eb23805170ff0e96894a24847019500de11e9baaabe3dafed75b35a897636e1" "checksum heck 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)" = "20564e78d53d2bb135c343b3f47714a56af2061f1c928fdb541dc7b9fdd94205" "checksum ignore 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)" = "ad03ca67dc12474ecd91fdb94d758cbd20cb4e7a78ebe831df26a9b7511e1162" "checksum itertools 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)" = "5b8467d9c1cebe26feb08c640139247fac215782d35371ade9a2136ed6085358" @@ -917,7 +885,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum lock_api 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "62ebf1391f6acad60e5c8b43706dde4582df75c06698ab44511d15016bc2442c" "checksum log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)" = "c84ec4b527950aa83a329754b01dbe3f58361d1c5efacd1f6d68c494d08a17c6" "checksum memchr 2.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "e1dd4eaac298c32ce07eb6ed9242eda7d82955b9170b7d6db59b2e02cc63fcb8" -"checksum memmap 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)" = "e2ffa2c986de11a9df78620c01eeaaf27d94d3ff02bf81bfcca953102dd0c6ff" "checksum normalize-line-endings 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "2e0a1a39eab95caf4f5556da9289b9e68f0aafac901b2ce80daaf020d3b733a8" "checksum num-traits 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)" = "0b3a5d7cc97d6d30d8b9bc8fa19bf45349ffe46241e8816f50f62f6d6aaabee1" "checksum owning_ref 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "49a4b8ea2179e6a2e27411d3bca09ca6dd630821cf6894c6c7c8467a8ee7ef13" @@ -944,6 +911,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum redox_syscall 0.1.51 (registry+https://github.com/rust-lang/crates.io-index)" = "423e376fffca3dfa06c9e9790a9ccd282fafb3cc6e6397d01dbf64f9bacc6b85" "checksum redox_termios 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = 
"7e891cfe48e9100a70a3b6eb652fef28920c117d366339687bd5576160db0f76" "checksum regex 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "37e7cbbd370869ce2e8dff25c7018702d10b21a20ef7135316f8daecd6c25b7f" +"checksum regex-automata 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)" = "3ed09217220c272b29ef237a974ad58515bde75f194e3ffa7e6d0bf0f3b01f86" "checksum regex-syntax 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)" = "4e47a2ed29da7a9e1960e1639e7a982e6edc6d49be308a3b02daf511504a16d1" "checksum remove_dir_all 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)" = "3488ba1b9a2084d38645c4c08276a1752dcbf2c7130d74f1569681ad5d2799c5" "checksum rustc-demangle 0.1.13 (registry+https://github.com/rust-lang/crates.io-index)" = "adacaae16d02b6ec37fdc7acfcddf365978de76d1983d3ee22afc260e1ca9619" diff --git a/Cargo.toml b/Cargo.toml index e48565843..fb51c5c4c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -27,12 +27,12 @@ ignore = "0.4" phf = { version = "0.7", features = ["unicase"] } regex = "1.0" lazy_static = "1.2.0" -grep-searcher = "0.1" serde = "1.0" serde_derive = "1.0" serde_json = "1.0" itertools = "0.8" unicase = "1.1" +bstr = "0.2" [dev-dependencies] assert_fs = "0.10" diff --git a/src/lib.rs b/src/lib.rs index 5b425bcf1..ae33b11e1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -12,6 +12,8 @@ pub use crate::dict::*; use std::fs::File; use std::io::Read; +use bstr::ByteSlice; + pub fn process_file( path: &std::path::Path, dictionary: &Dictionary, @@ -20,7 +22,8 @@ pub fn process_file( ) -> Result<(), failure::Error> { let mut buffer = Vec::new(); File::open(path)?.read_to_end(&mut buffer)?; - for (line_idx, line) in grep_searcher::LineIter::new(b'\n', &buffer).enumerate() { + + for (line_idx, line) in buffer.lines().enumerate() { let line_num = line_idx + 1; for ident in tokens::Identifier::parse(line) { if !ignore_hex && is_hex(ident.token()) { From da156e3f23b44e62029b78690bb289b9dd75727d Mon Sep 17 00:00:00 2001 From: Ed 
Page Date: Sat, 13 Jul 2019 20:14:06 -0600 Subject: [PATCH 2/2] feat: Ignore binary files Fixes #29 --- benches/file.rs | 6 ++++++ src/lib.rs | 4 ++++ src/main.rs | 17 +++++++++++++++++ 3 files changed, 27 insertions(+) diff --git a/benches/file.rs b/benches/file.rs index 1520f1b1a..6656701b2 100644 --- a/benches/file.rs +++ b/benches/file.rs @@ -18,6 +18,7 @@ fn process_empty(b: &mut test::Bencher) { sample_path.path(), &corrections, true, + false, typos::report::print_silent, ) }); @@ -37,6 +38,7 @@ fn process_no_tokens(b: &mut test::Bencher) { sample_path.path(), &corrections, true, + false, typos::report::print_silent, ) }); @@ -56,6 +58,7 @@ fn process_single_token(b: &mut test::Bencher) { sample_path.path(), &corrections, true, + false, typos::report::print_silent, ) }); @@ -75,6 +78,7 @@ fn process_sherlock(b: &mut test::Bencher) { sample_path.path(), &corrections, true, + false, typos::report::print_silent, ) }); @@ -94,6 +98,7 @@ fn process_code(b: &mut test::Bencher) { sample_path.path(), &corrections, true, + false, typos::report::print_silent, ) }); @@ -113,6 +118,7 @@ fn process_corpus(b: &mut test::Bencher) { sample_path.path(), &corrections, true, + false, typos::report::print_silent, ) }); diff --git a/src/lib.rs b/src/lib.rs index ae33b11e1..f46af9896 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -18,10 +18,14 @@ pub fn process_file( path: &std::path::Path, dictionary: &Dictionary, ignore_hex: bool, + binary: bool, report: report::Report, ) -> Result<(), failure::Error> { let mut buffer = Vec::new(); File::open(path)?.read_to_end(&mut buffer)?; + if !binary && buffer.find_byte(b'\0').is_some() { + return Ok(()); + } for (line_idx, line) in buffer.lines().enumerate() { let line_num = line_idx + 1; diff --git a/src/main.rs b/src/main.rs index e5d3ef81a..38f025af0 100644 --- a/src/main.rs +++ b/src/main.rs @@ -55,6 +55,12 @@ struct Options { /// The approximate number of threads to use. 
threads: usize, + #[structopt(long, raw(overrides_with = r#""no-binary""#))] + /// Search binary files. + binary: bool, + #[structopt(long, raw(overrides_with = r#""binary""#), raw(hidden = "true"))] + no_binary: bool, + #[structopt(long, raw(overrides_with = r#""no-hidden""#))] /// Search hidden files and directories. hidden: bool, @@ -118,6 +124,15 @@ impl Options { } } + pub fn binary(&self) -> Option<bool> { + match (self.binary, self.no_binary) { + (true, false) => Some(true), + (false, true) => Some(false), + (false, false) => None, + (_, _) => unreachable!("StructOpt should make this impossible"), + } + } + pub fn ignore_hidden(&self) -> Option<bool> { match (self.hidden, self.no_hidden) { (true, false) => Some(false), @@ -183,6 +198,7 @@ fn run() -> Result<(), failure::Error> { let dictionary = typos::Dictionary::new(); let ignore_hex = options.ignore_hex().unwrap_or(true); + let binary = options.binary().unwrap_or(false); let first_path = &options .path @@ -207,6 +223,7 @@ fn run() -> Result<(), failure::Error> { entry.path(), &dictionary, ignore_hex, + binary, options.format.report(), )?; }