Skip to content

Commit

Permalink
lib: address windows lossy conversion inconsistency in std
Browse files Browse the repository at this point in the history
On Windows, `std` lossily converts an unpaired surrogate encoding
into a single unicode replacement character, whereas on Unix, and
also via `String`'s raw byte conversion, three unicode replacement
characters are used.

This comes from the fact that different code paths are used which
take different approaches: `std::sys_common::wtf8::to_string_lossy`
for Windows `OsStr` lossy conversion, vs. `core`'s
`run_utf8_validation` function for Unix and for creaing a `String`
from raw bytes.

The inconsistency causes a problem in correct short option argument
byte consumption tracking in our `OsStr` parsing code, due to using
lossy `OsStr` conversion in combination with `std::str::from_utf8`.

A [bug report][1] and [pull request][2] have been filed against `std`.

[1]: rust-lang/rust#56786
[2]: rust-lang/rust#56787
  • Loading branch information
jnqnfe committed Dec 13, 2018
1 parent ebf972b commit d2b9517
Show file tree
Hide file tree
Showing 2 changed files with 112 additions and 12 deletions.
103 changes: 92 additions & 11 deletions lib/src/engine_os.rs
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@ use std::mem;
use std::ops::Range;
#[cfg(any(unix, target_os = "redox"))]
use std::os::unix::ffi::OsStrExt;
#[cfg(windows)]
use self::windows::OsStrExt;
use super::options::*;
use super::analysis::*;
use super::engine::{self, SINGLE_DASH_PREFIX, DOUBLE_DASH_PREFIX};
Expand All @@ -86,7 +88,7 @@ pub(crate) fn process<'o, 'r, 's, A>(args: &'s [A], options: &'o OptionSet<'r, '
};

// Temporary lossy conversion
let args_as_str: Vec<Cow<'s, str>> = args.iter().map(|s| s.as_ref().to_string_lossy()).collect();
let args_as_str: Vec<Cow<'s, str>> = args.iter().map(|s| to_string_lossy(s.as_ref())).collect();
// HACK: We must adjust the lifetime for use with `process`
let args_as_str_slice = unsafe {
mem::transmute::<&'_ [Cow<str>], &'s [Cow<str>]>(&args_as_str[..])
Expand Down Expand Up @@ -184,6 +186,16 @@ pub(crate) fn process<'o, 'r, 's, A>(args: &'s [A], options: &'o OptionSet<'r, '
converted
}

//TODO: THIS HELPER IS A NECESSARY HACK DUE TO INCONSISTENCY IN LOSSY CONVERSION ON WINDOWS
fn to_string_lossy<'a>(os_str: &'a OsStr) -> Cow<'a, str> {
#[cfg(not(windows))] {
os_str.to_string_lossy()
}
#[cfg(windows)] {
windows::to_string_lossy_hacked(os_str)
}
}

//TODO: THIS HELPER IS A NECESSARY HACK DUE TO LACK OF OSSTR SLICING IN STD
// `std` currently provides no proper clean & safe way to slice an `OSStr`. On both Unix and Windows
// `OsStr` holds the string in a `u8` slice. On Unix, this *may* or may not be valid UTF-8, but is
Expand Down Expand Up @@ -285,18 +297,87 @@ fn track_short_opt_set(arg: &OsStr, bytes_consumed: &mut usize, c: char) {
};
}

/// Temporary hacks for Windows
#[cfg(windows)]
pub trait OsStrExt {
fn from_bytes(slice: &[u8]) -> &Self;
fn as_bytes(&self) -> &[u8];
}
mod windows {
pub trait OsStrExt {
fn from_bytes(slice: &[u8]) -> &Self;
fn as_bytes(&self) -> &[u8];
}

#[cfg(windows)]
impl OsStrExt for OsStr {
fn from_bytes(slice: &[u8]) -> &OsStr {
unsafe { mem::transmute(slice) }
impl OsStrExt for OsStr {
fn from_bytes(slice: &[u8]) -> &OsStr {
unsafe { mem::transmute(slice) }
}
fn as_bytes(&self) -> &[u8] {
unsafe { mem::transmute(self) }
}
}

const SURROGATE_REPLACEMENT: &str = "\u{FFFD}\u{FFFD}\u{FFFD}";

// Copied from `std::sys_common::wtf8::to_string_lossy` and modified to return three unicode
// replacement characters for surrogates instead of one
pub fn to_string_lossy_hacked<'a>(os_str: &'a OsStr) -> Cow<'a, str> {
let surrogate_pos = match next_surrogate(os_str, 0) {
None => return Cow::Borrowed(unsafe { str::from_utf8_unchecked(os_str.as_bytes()) }),
Some((pos, _)) => pos,
};
let wtf8_bytes = os_str.as_bytes();
let mut utf8_bytes = Vec::with_capacity(os_str.len());
utf8_bytes.extend_from_slice(&wtf8_bytes[..surrogate_pos]);
utf8_bytes.extend_from_slice(SURROGATE_REPLACEMENT.as_bytes());
let mut pos = surrogate_pos + 3;
loop {
match next_surrogate(os_str, pos) {
Some((surrogate_pos, _)) => {
utf8_bytes.extend_from_slice(&wtf8_bytes[pos .. surrogate_pos]);
utf8_bytes.extend_from_slice(SURROGATE_REPLACEMENT.as_bytes());
pos = surrogate_pos + 3;
},
None => {
utf8_bytes.extend_from_slice(&wtf8_bytes[pos..]);
return Cow::Owned(unsafe { String::from_utf8_unchecked(utf8_bytes) })
}
}
}
}
fn as_bytes(&self) -> &[u8] {
unsafe { mem::transmute(self) }

// Copied from `std::sys_common::wtf8` and hacked to make work
#[inline]
fn next_surrogate(os_str: &OsStr, mut pos: usize) -> Option<(usize, u16)> {
let mut iter = os_str.get_osstr_suffix(pos).as_bytes().iter();
loop {
let b = *iter.next()?;
if b < 0x80 {
pos += 1;
} else if b < 0xE0 {
iter.next();
pos += 2;
} else if b == 0xED {
match (iter.next(), iter.next()) {
(Some(&b2), Some(&b3)) if b2 >= 0xA0 => {
return Some((pos, decode_surrogate(b2, b3)))
}
_ => pos += 3
}
} else if b < 0xF0 {
iter.next();
iter.next();
pos += 3;
} else {
iter.next();
iter.next();
iter.next();
pos += 4;
}
}
}

// Copied from `std::sys_common::wtf8`
#[inline]
fn decode_surrogate(second_byte: u8, third_byte: u8) -> u16 {
// The first byte is assumed to be 0xED
0xD800 | (second_byte as u16 & 0x3F) << 6 | third_byte as u16 & 0x3F
}
}
21 changes: 20 additions & 1 deletion lib/tests/os_str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,9 @@ mod invalid_byte_sequences {
fn from_bytes(slice: &[u8]) -> &OsStr { unsafe { std::mem::transmute(slice) } }
}

eprintln!("***REMINDER: There is inconsistency in `std` in how many unicode replacement
characters a surrogate gets lossily converted to on Windows***");

let arg_strings = [
OsStr::from_bytes(b"a\xed\xa0\x80bc"), // Non-option
OsStr::from_bytes(b"--\xed\xa0\x80xx"), // Unknown long option
Expand Down Expand Up @@ -403,10 +406,18 @@ mod invalid_byte_sequences {
expected_item!(6, LongWithNoName),
expected_item!(7, UnknownShort, 'm'),
expected_item!(7, UnknownShort, '�'),
expected_item!(7, UnknownShort, '�'),
expected_item!(7, UnknownShort, '�'),
expected_item!(7, Short, 'h'),
expected_item!(8, ShortWithData, 'o', expected_strings[5], DataLocation::SameArg),
expected_item!(9, ShortWithData, 'o', expected_strings[6], DataLocation::NextArg),
expected_item!(11, UnknownShort, '�'), // Notice three individual instances for arg 11
expected_item!(11, UnknownShort, '�'), // Notice nine individual instances for arg 11
expected_item!(11, UnknownShort, '�'),
expected_item!(11, UnknownShort, '�'),
expected_item!(11, UnknownShort, '�'),
expected_item!(11, UnknownShort, '�'),
expected_item!(11, UnknownShort, '�'),
expected_item!(11, UnknownShort, '�'),
expected_item!(11, UnknownShort, '�'),
expected_item!(11, UnknownShort, '�'),
expected_item!(12, UnknownShort, 'm'),
Expand All @@ -415,12 +426,20 @@ mod invalid_byte_sequences {
expected_item!(12, UnknownShort, 'ş'),
expected_item!(12, UnknownShort, 'j'),
expected_item!(12, UnknownShort, '�'),
expected_item!(12, UnknownShort, '�'),
expected_item!(12, UnknownShort, '�'),
expected_item!(12, UnknownShort, '�'), // This one is from the actual U+FFFD char
expected_item!(12, UnknownShort, 'k'),
expected_item!(13, UnknownShort, 'm'),
expected_item!(13, UnknownShort, '�'),
expected_item!(13, UnknownShort, '�'),
expected_item!(13, UnknownShort, '�'),
expected_item!(13, UnknownShort, '�'),
expected_item!(13, UnknownShort, '�'),
expected_item!(13, UnknownShort, '�'),
expected_item!(13, UnknownShort, '�'),
expected_item!(13, UnknownShort, '�'),
expected_item!(13, UnknownShort, '�'),
expected_item!(13, ShortWithData, 'o', expected_strings[7], DataLocation::SameArg),
expected_item!(14, UnknownShort, 'm'),
expected_item!(14, UnknownShort, '�'),
Expand Down

0 comments on commit d2b9517

Please sign in to comment.