diff --git a/Cargo.lock b/Cargo.lock index b019a97f0..88da03561 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -287,6 +287,70 @@ version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cd56b59865bce947ac5958779cfa508f6c3b9497cc762b7e24a12d11ccde2c4f" +[[package]] +name = "encoding" +version = "0.2.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b0d943856b990d12d3b55b359144ff341533e516d94098b1d3fc1ac666d36ec" +dependencies = [ + "encoding-index-japanese", + "encoding-index-korean", + "encoding-index-simpchinese", + "encoding-index-singlebyte", + "encoding-index-tradchinese", +] + +[[package]] +name = "encoding-index-japanese" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04e8b2ff42e9a05335dbf8b5c6f7567e5591d0d916ccef4e0b1710d32a0d0c91" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-korean" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4dc33fb8e6bcba213fe2f14275f0963fd16f0a02c878e3095ecfdf5bee529d81" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-simpchinese" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d87a7194909b9118fc707194baa434a4e3b0fb6a5a757c73c3adb07aa25031f7" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-singlebyte" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3351d5acffb224af9ca265f435b859c7c01537c0849754d3db3fdf2bfe2ae84a" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-tradchinese" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd0e20d5688ce3cab59eb3ef3a2083a5c77bf496cb798dc6fcdb75f323890c18" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding_index_tests" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a246d82be1c9d791c5dfde9a2bd045fc3cbba3fa2b11ad558f27d01712f00569" + [[package]] name = "enumflags2" version = "0.6.4" @@ -981,6 +1045,7 @@ dependencies = [ "content_inspector", "derive_more 0.99.11", "derive_setters", + "encoding", "itertools", "lazy_static", "log", diff --git a/crates/typos/Cargo.toml b/crates/typos/Cargo.toml index e777c5798..1c2512502 100644 --- a/crates/typos/Cargo.toml +++ b/crates/typos/Cargo.toml @@ -28,3 +28,4 @@ unicode-segmentation = "1.6.0" derive_more = "0.99.11" derive_setters = "0.1" content_inspector = "0.2.4" +encoding = "0.2" diff --git a/crates/typos/src/checks.rs b/crates/typos/src/checks.rs index 984413f77..8d9486e98 100644 --- a/crates/typos/src/checks.rs +++ b/crates/typos/src/checks.rs @@ -1,4 +1,5 @@ use bstr::ByteSlice; +use encoding::Encoding; use crate::report; use crate::tokens; @@ -190,18 +191,31 @@ impl ParseWords { let buffer = std::fs::read(path) .map_err(|e| crate::ErrorKind::IoError.into_error().with_source(e))?; - if !explicit && !self.binary { - let content_type = content_inspector::inspect(&buffer); - // HACK: We only support UTF-8 at the moment - if content_type.is_binary() - || (content_type != content_inspector::ContentType::UTF_8_BOM - && content_type != content_inspector::ContentType::UTF_8) - { - let msg = report::BinaryFile { path }; - reporter.report(msg.into()); - return Ok(typos_found); - } - } + let content_type = content_inspector::inspect(&buffer); + + let buffer = match content_type { + content_inspector::ContentType::BINARY | + // HACK: We don't support UTF-32 yet + content_inspector::ContentType::UTF_32LE | + content_inspector::ContentType::UTF_32BE + => { + if !explicit && !self.binary { + let msg = report::BinaryFile { path }; + reporter.report(msg.into()); + return Ok(typos_found); + } else { + buffer + } + }, + content_inspector::ContentType::UTF_8 | + content_inspector::ContentType::UTF_8_BOM + => { buffer + }, + content_inspector::ContentType::UTF_16LE | + content_inspector::ContentType::UTF_16BE => { + buffer + }, + }; for line in buffer.lines() { let msg = report::Parse {