lib: address windows lossy conversion inconsistency in std

On Windows, `std` lossily converts an unpaired surrogate encoding into a single unicode replacement character, whereas on Unix, and also via `String`'s raw byte conversion, three unicode replacement characters are used. This comes from the fact that different code paths are used which take different approaches: `std::sys_common::wtf8::to_string_lossy` for Windows `OsStr` lossy conversion, vs. `core`'s `run_utf8_validation` function for Unix and for creaing a `String` from raw bytes. The inconsistency causes a problem in correct short option argument byte consumption tracking in our `OsStr` parsing code, due to using lossy `OsStr` conversion in combination with `std::str::from_utf8`. A [bug report][1] and [pull request][2] have been filed against `std`. [1]: rust-lang/rust#56786 [2]: rust-lang/rust#56787
jnqnfe · Dec 13, 2018 · d2b9517 · d2b9517
1 parent ebf972b
commit d2b9517
Show file tree

Hide file tree

Showing 2 changed files with 112 additions and 12 deletions.
diff --git a/lib/src/engine_os.rs b/lib/src/engine_os.rs
@@ -63,6 +63,8 @@ use std::mem;
 use std::ops::Range;
 #[cfg(any(unix, target_os = "redox"))]
 use std::os::unix::ffi::OsStrExt;
+#[cfg(windows)]
+use self::windows::OsStrExt;
 use super::options::*;
 use super::analysis::*;
 use super::engine::{self, SINGLE_DASH_PREFIX, DOUBLE_DASH_PREFIX};
@@ -86,7 +88,7 @@ pub(crate) fn process<'o, 'r, 's, A>(args: &'s [A], options: &'o OptionSet<'r, '
     };
 
     // Temporary lossy conversion
-    let args_as_str: Vec<Cow<'s, str>> = args.iter().map(|s| s.as_ref().to_string_lossy()).collect();
+    let args_as_str: Vec<Cow<'s, str>> = args.iter().map(|s| to_string_lossy(s.as_ref())).collect();
     // HACK: We must adjust the lifetime for use with `process`
     let args_as_str_slice = unsafe {
         mem::transmute::<&'_ [Cow<str>], &'s [Cow<str>]>(&args_as_str[..])
@@ -184,6 +186,16 @@ pub(crate) fn process<'o, 'r, 's, A>(args: &'s [A], options: &'o OptionSet<'r, '
     converted
 }
 
+//TODO: THIS HELPER IS A NECESSARY HACK DUE TO INCONSISTENCY IN LOSSY CONVERSION ON WINDOWS
+fn to_string_lossy<'a>(os_str: &'a OsStr) -> Cow<'a, str> {
+    #[cfg(not(windows))] {
+        os_str.to_string_lossy()
+    }
+    #[cfg(windows)] {
+        windows::to_string_lossy_hacked(os_str)
+    }
+}
+
 //TODO: THIS HELPER IS A NECESSARY HACK DUE TO LACK OF OSSTR SLICING IN STD
 // `std` currently provides no proper clean & safe way to slice an `OSStr`. On both Unix and Windows
 // `OsStr` holds the string in a `u8` slice. On Unix, this *may* or may not be valid UTF-8, but is
@@ -285,18 +297,87 @@ fn track_short_opt_set(arg: &OsStr, bytes_consumed: &mut usize, c: char) {
     };
 }
 
+/// Temporary hacks for Windows
 #[cfg(windows)]
-pub trait OsStrExt {
-    fn from_bytes(slice: &[u8]) -> &Self;
-    fn as_bytes(&self) -> &[u8];
-}
+mod windows {
+    pub trait OsStrExt {
+        fn from_bytes(slice: &[u8]) -> &Self;
+        fn as_bytes(&self) -> &[u8];
+    }
 
-#[cfg(windows)]
-impl OsStrExt for OsStr {
-    fn from_bytes(slice: &[u8]) -> &OsStr {
-        unsafe { mem::transmute(slice) }
+    impl OsStrExt for OsStr {
+        fn from_bytes(slice: &[u8]) -> &OsStr {
+            unsafe { mem::transmute(slice) }
+        }
+        fn as_bytes(&self) -> &[u8] {
+            unsafe { mem::transmute(self) }
+        }
+    }
+
+    const SURROGATE_REPLACEMENT: &str = "\u{FFFD}\u{FFFD}\u{FFFD}";
+
+    // Copied from `std::sys_common::wtf8::to_string_lossy` and modified to return three unicode
+    // replacement characters for surrogates instead of one
+    pub fn to_string_lossy_hacked<'a>(os_str: &'a OsStr) -> Cow<'a, str> {
+        let surrogate_pos = match next_surrogate(os_str, 0) {
+            None => return Cow::Borrowed(unsafe { str::from_utf8_unchecked(os_str.as_bytes()) }),
+            Some((pos, _)) => pos,
+        };
+        let wtf8_bytes = os_str.as_bytes();
+        let mut utf8_bytes = Vec::with_capacity(os_str.len());
+        utf8_bytes.extend_from_slice(&wtf8_bytes[..surrogate_pos]);
+        utf8_bytes.extend_from_slice(SURROGATE_REPLACEMENT.as_bytes());
+        let mut pos = surrogate_pos + 3;
+        loop {
+            match next_surrogate(os_str, pos) {
+                Some((surrogate_pos, _)) => {
+                    utf8_bytes.extend_from_slice(&wtf8_bytes[pos .. surrogate_pos]);
+                    utf8_bytes.extend_from_slice(SURROGATE_REPLACEMENT.as_bytes());
+                    pos = surrogate_pos + 3;
+                },
+                None => {
+                    utf8_bytes.extend_from_slice(&wtf8_bytes[pos..]);
+                    return Cow::Owned(unsafe { String::from_utf8_unchecked(utf8_bytes) })
+                }
+            }
+        }
     }
-    fn as_bytes(&self) -> &[u8] {
-        unsafe { mem::transmute(self) }
+
+    // Copied from `std::sys_common::wtf8` and hacked to make work
+    #[inline]
+    fn next_surrogate(os_str: &OsStr, mut pos: usize) -> Option<(usize, u16)> {
+        let mut iter = os_str.get_osstr_suffix(pos).as_bytes().iter();
+        loop {
+            let b = *iter.next()?;
+            if b < 0x80 {
+                pos += 1;
+            } else if b < 0xE0 {
+                iter.next();
+                pos += 2;
+            } else if b == 0xED {
+                match (iter.next(), iter.next()) {
+                    (Some(&b2), Some(&b3)) if b2 >= 0xA0 => {
+                        return Some((pos, decode_surrogate(b2, b3)))
+                    }
+                    _ => pos += 3
+                }
+            } else if b < 0xF0 {
+                iter.next();
+                iter.next();
+                pos += 3;
+            } else {
+                iter.next();
+                iter.next();
+                iter.next();
+                pos += 4;
+            }
+        }
+    }
+
+    // Copied from `std::sys_common::wtf8`
+    #[inline]
+    fn decode_surrogate(second_byte: u8, third_byte: u8) -> u16 {
+        // The first byte is assumed to be 0xED
+        0xD800 | (second_byte as u16 & 0x3F) << 6 | third_byte as u16 & 0x3F
     }
 }
diff --git a/lib/tests/os_str.rs b/lib/tests/os_str.rs
@@ -348,6 +348,9 @@ mod invalid_byte_sequences {
             fn from_bytes(slice: &[u8]) -> &OsStr { unsafe { std::mem::transmute(slice) } }
         }
 
+        eprintln!("***REMINDER: There is inconsistency in `std` in how many unicode replacement
+                   characters a surrogate gets lossily converted to on Windows***");
+
         let arg_strings = [
             OsStr::from_bytes(b"a\xed\xa0\x80bc"),       // Non-option
             OsStr::from_bytes(b"--\xed\xa0\x80xx"),      // Unknown long option
@@ -403,10 +406,18 @@ mod invalid_byte_sequences {
                 expected_item!(6, LongWithNoName),
                 expected_item!(7, UnknownShort, 'm'),
                 expected_item!(7, UnknownShort, '�'),
+                expected_item!(7, UnknownShort, '�'),
+                expected_item!(7, UnknownShort, '�'),
                 expected_item!(7, Short, 'h'),
                 expected_item!(8, ShortWithData, 'o', expected_strings[5], DataLocation::SameArg),
                 expected_item!(9, ShortWithData, 'o', expected_strings[6], DataLocation::NextArg),
-                expected_item!(11, UnknownShort, '�'), // Notice three individual instances for arg 11
+                expected_item!(11, UnknownShort, '�'), // Notice nine individual instances for arg 11
+                expected_item!(11, UnknownShort, '�'),
+                expected_item!(11, UnknownShort, '�'),
+                expected_item!(11, UnknownShort, '�'),
+                expected_item!(11, UnknownShort, '�'),
+                expected_item!(11, UnknownShort, '�'),
+                expected_item!(11, UnknownShort, '�'),
                 expected_item!(11, UnknownShort, '�'),
                 expected_item!(11, UnknownShort, '�'),
                 expected_item!(12, UnknownShort, 'm'),
@@ -415,12 +426,20 @@ mod invalid_byte_sequences {
                 expected_item!(12, UnknownShort, 'ş'),
                 expected_item!(12, UnknownShort, 'j'),
                 expected_item!(12, UnknownShort, '�'),
+                expected_item!(12, UnknownShort, '�'),
+                expected_item!(12, UnknownShort, '�'),
                 expected_item!(12, UnknownShort, '�'), // This one is from the actual U+FFFD char
                 expected_item!(12, UnknownShort, 'k'),
                 expected_item!(13, UnknownShort, 'm'),
                 expected_item!(13, UnknownShort, '�'),
                 expected_item!(13, UnknownShort, '�'),
                 expected_item!(13, UnknownShort, '�'),
+                expected_item!(13, UnknownShort, '�'),
+                expected_item!(13, UnknownShort, '�'),
+                expected_item!(13, UnknownShort, '�'),
+                expected_item!(13, UnknownShort, '�'),
+                expected_item!(13, UnknownShort, '�'),
+                expected_item!(13, UnknownShort, '�'),
                 expected_item!(13, ShortWithData, 'o', expected_strings[7], DataLocation::SameArg),
                 expected_item!(14, UnknownShort, 'm'),
                 expected_item!(14, UnknownShort, '�'),