Skip to content

Commit

Permalink
feat: Check and replace UTF-16 files
Browse files Browse the repository at this point in the history
We don't have good detection for non-UTF encodings and don't have
encoding support for UTF-32, so this change is limited to just UTF-16.

Fixes #17
  • Loading branch information
Ed Page committed Jan 3, 2021
1 parent 1c392c2 commit 998fad4
Show file tree
Hide file tree
Showing 4 changed files with 137 additions and 25 deletions.
65 changes: 65 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ derive_more = "0.99.11"
derive_setters = "0.1"
itertools = "0.9"
serde_json = "1.0"
encoding = "0.2"

[dev-dependencies]
assert_fs = "1.0"
Expand Down
2 changes: 1 addition & 1 deletion docs/about.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ See also [benchmarks](../benchsuite/runs).
| snake_case | Yes | No | ? | No | Yes |
| Ignore Hex | Yes | No | ? | No | Yes |
| C-Escapes | No ([#20][def-3]) | No | ? | No | Yes |
| Encodings | UTF-8 ([#17][def-17]) | UTF-8 | ? | Auto | Auto |
| Encodings | UTF-8 / UTF-16 | UTF-8 | ? | Auto | Auto |
| Whole-project | Yes | Yes | Yes | Yes | No |
| Ignores hidden | Yes | Yes | ? | Yes | No |
| Respect gitignore | Yes | Yes | ? | No | No |
Expand Down
94 changes: 70 additions & 24 deletions src/checks.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
use bstr::ByteSlice;
use encoding::Encoding;

use crate::report;
use typos::tokens;
Expand Down Expand Up @@ -208,7 +209,7 @@ impl FileChecker for FixTypos {
}
if !fixes.is_empty() {
let buffer = fix_buffer(buffer, fixes.into_iter());
write_file(path, content_type, &buffer, reporter)?;
write_file(path, content_type, buffer, reporter)?;
}
}
}
Expand Down Expand Up @@ -503,45 +504,90 @@ pub fn read_file(
path: &std::path::Path,
reporter: &dyn report::Report,
) -> Result<(Vec<u8>, content_inspector::ContentType), std::io::Error> {
// Reads `path` fully into memory, classifies the bytes with
// `content_inspector::inspect`, and returns the (possibly transcoded) bytes
// together with the content type.
//
// NOTE(review): this hunk is a diff rendered without +/- markers, so removed
// and added lines are interleaved and the text does not compile as shown.
// The next six lines look like the removed side of the hunk (manual handling
// of the read error) — confirm against the real file.
let buffer = match std::fs::read(path) {
Ok(buffer) => buffer,
Err(err) => {
let msg = report::Error::new(err.to_string());
reporter.report(msg.into())?;
Vec::new()
// A read failure is handed to `reporter` via `report_error`, which substitutes
// the default value for the buffer instead of returning the read error.
let buffer = report_error(std::fs::read(path), reporter)?;

let content_type = content_inspector::inspect(&buffer);

// Normalise the buffer per detected encoding. UTF-16 input is decoded here
// while the original UTF-16 content type is still returned alongside it.
let (buffer, content_type) = match content_type {
content_inspector::ContentType::BINARY |
// HACK: We don't support UTF-32 yet
content_inspector::ContentType::UTF_32LE |
content_inspector::ContentType::UTF_32BE => {
// UTF-32 is downgraded to BINARY; the bytes are passed through untouched.
(buffer, content_inspector::ContentType::BINARY)
},
content_inspector::ContentType::UTF_8 |
content_inspector::ContentType::UTF_8_BOM => {
(buffer, content_type)
},
content_inspector::ContentType::UTF_16LE => {
// Strict trap: invalid input is an error rather than being replaced. The
// error goes through `report_error`, so on failure the default value is used.
// `into_bytes` presumably yields the UTF-8 bytes of the decoded String.
let buffer = report_error(encoding::all::UTF_16LE.decode(&buffer, encoding::DecoderTrap::Strict), reporter)?;
(buffer.into_bytes(), content_type)
}
content_inspector::ContentType::UTF_16BE => {
// Same handling as the UTF_16LE arm, with the big-endian decoder.
let buffer = report_error(encoding::all::UTF_16BE.decode(&buffer, encoding::DecoderTrap::Strict), reporter)?;
(buffer.into_bytes(), content_type)
},
};

// NOTE(review): the following block (second `inspect` call plus the
// UTF-8-only downgrade) looks like the removed side of the hunk, superseded
// by the `match` above — confirm against the real file.
let mut content_type = content_inspector::inspect(&buffer);
// HACK: We only support UTF-8 at the moment
if content_type != content_inspector::ContentType::UTF_8_BOM
&& content_type != content_inspector::ContentType::UTF_8
{
content_type = content_inspector::ContentType::BINARY;
}

Ok((buffer, content_type))
}

/// Writes `buffer` to `path`, re-encoding it first when `content_type` is UTF-16.
///
/// NOTE(review): this span is a diff rendered without +/- markers, so removed and
/// added lines are interleaved (the two `buffer` parameters, the `assert!`, and the
/// `match std::fs::write` opener next to the `match content_type` version). The
/// description below covers the `match content_type` version.
///
/// - `BINARY`, `UTF_8`, `UTF_8_BOM`: `buffer` is written unchanged.
/// - `UTF_16LE` / `UTF_16BE`: `buffer` is validated as UTF-8 and then encoded with
///   the matching UTF-16 encoder using a strict trap.
/// - `UTF_32LE` / `UTF_32BE`: hits `unreachable!`, i.e. panics.
///
/// Validation, encoding, and write failures all pass through `report_error`, which
/// hands them to `reporter`.
pub fn write_file(
path: &std::path::Path,
content_type: content_inspector::ContentType,
buffer: &[u8],
buffer: Vec<u8>,
reporter: &dyn report::Report,
) -> Result<(), std::io::Error> {
assert!(
content_type == content_inspector::ContentType::UTF_8_BOM
|| content_type == content_inspector::ContentType::UTF_8
|| content_type == content_inspector::ContentType::BINARY
);
match std::fs::write(path, buffer) {
Ok(()) => (),
// Produce the exact bytes to put on disk for the given content type.
let buffer = match content_type {
// HACK: We don't support UTF-32 yet
content_inspector::ContentType::UTF_32LE | content_inspector::ContentType::UTF_32BE => {
unreachable!("read_file should prevent these from being passed along");
}
content_inspector::ContentType::BINARY
| content_inspector::ContentType::UTF_8
| content_inspector::ContentType::UTF_8_BOM => buffer,
content_inspector::ContentType::UTF_16LE => {
// `report_error` substitutes the default (empty) String when validation
// fails, so emptiness is used as the failure signal here. A genuinely empty
// buffer also takes this early return and nothing is written.
let buffer = report_error(String::from_utf8(buffer), reporter)?;
if buffer.is_empty() {
// Error occurred, don't clear out the file
return Ok(());
}
// NOTE(review): if `encode` fails, `report_error` yields the default value
// (presumably an empty Vec), which is then written below — the guard above
// does not cover that path; confirm whether it can occur.
report_error(
encoding::all::UTF_16LE.encode(&buffer, encoding::EncoderTrap::Strict),
reporter,
)?
}
content_inspector::ContentType::UTF_16BE => {
// Same handling as the UTF_16LE arm, with the big-endian encoder.
let buffer = report_error(String::from_utf8(buffer), reporter)?;
if buffer.is_empty() {
// Error occurred, don't clear out the file
return Ok(());
}
report_error(
encoding::all::UTF_16BE.encode(&buffer, encoding::EncoderTrap::Strict),
reporter,
)?
}
};

// A failed write is reported through `reporter` rather than returned directly.
report_error(std::fs::write(path, buffer), reporter)?;

Ok(())
}

/// Unwraps `value`, routing any error to `reporter` instead of to the caller.
///
/// On `Ok`, the contained value is returned. On `Err`, the error is stringified
/// into a `report::Error`, passed to `reporter.report`, and `Default::default()`
/// is used as the result. Within this function, the only `Err` path is the `?`
/// applied to `reporter.report`.
///
/// Callers cannot distinguish "failed and reported" from "succeeded with the
/// default value" by the return value alone.
fn report_error<T: Default, E: ToString>(
value: Result<T, E>,
reporter: &dyn report::Report,
) -> Result<T, std::io::Error> {
let buffer = match value {
Ok(value) => value,
Err(err) => {
let msg = report::Error::new(err.to_string());
reporter.report(msg.into())?;
// Fall back to the type's default so the caller can continue.
Default::default()
}
};
// NOTE(review): rendered diff without +/- markers — `Ok(())` and `Ok(buffer)`
// are presumably a removed/added pair; only `Ok(buffer)` matches the declared
// return type `Result<T, _>`.
Ok(())
Ok(buffer)
}

struct AccumulateLineNum {
Expand Down

0 comments on commit 998fad4

Please sign in to comment.