From ea2e69ded706323d8592cf05cc84882fbc5af356 Mon Sep 17 00:00:00 2001 From: Ryan Liptak Date: Sun, 14 Apr 2024 00:48:35 -0700 Subject: [PATCH] ArgIteratorWindows: Match post-2008 C runtime rather than CommandLineToArgvW On Windows, the command line arguments of a program are a single WTF-16 encoded string and it's up to the program to split it into an array of strings. In C/C++, the entry point of the C runtime takes care of splitting the command line and passing argc/argv to the main function. https://github.com/ziglang/zig/pull/18309 updated ArgIteratorWindows to match the behavior of CommandLineToArgvW, but it turns out that CommandLineToArgvW's behavior does not match the behavior of the C runtime post-2008. In 2008, the C runtime argv splitting changed how it handles consecutive double quotes within a quoted argument (it's now considered an escaped quote, e.g. `"foo""bar"` post-2008 would get parsed into `foo"bar`), and the rules around argv[0] were also changed. This commit makes ArgIteratorWindows match the behavior of the post-2008 C runtime, and adds a standalone test that verifies the behavior matches both the MSVC and MinGW argv splitting exactly in all cases (it checks that randomly generated command line strings get split the same way). The motivation here is roughly the same as when the same change was made in Rust (https://github.com/rust-lang/rust/pull/87580), that is (paraphrased): - Consistent behavior between Zig and modern C/C++ programs - Allows users to escape double quotes in a way that can be more straightforward Additionally, the suggested mitigation for BatBadBut (https://flatt.tech/research/posts/batbadbut-you-cant-securely-execute-commands-on-windows/) relies on the post-2008 argv splitting behavior for roundtripping of the arguments given to `cmd.exe`. 
Note: it's not necessary for the suggested mitigation to work, but it is necessary for the suggested escaping to be parsed back into the intended argv by ArgIteratorWindows after being run through a `.bat` file. --- lib/std/process.zig | 231 ++++++++++++++++--------- test/standalone/build.zig.zon | 3 + test/standalone/windows_argv/README.md | 19 ++ test/standalone/windows_argv/build.zig | 88 ++++++++++ test/standalone/windows_argv/fuzz.zig | 159 +++++++++++++++++ test/standalone/windows_argv/lib.h | 8 + test/standalone/windows_argv/lib.zig | 59 +++++++ test/standalone/windows_argv/verify.c | 7 + 8 files changed, 490 insertions(+), 84 deletions(-) create mode 100644 test/standalone/windows_argv/README.md create mode 100644 test/standalone/windows_argv/build.zig create mode 100644 test/standalone/windows_argv/fuzz.zig create mode 100644 test/standalone/windows_argv/lib.h create mode 100644 test/standalone/windows_argv/lib.zig create mode 100644 test/standalone/windows_argv/verify.c diff --git a/lib/std/process.zig b/lib/std/process.zig index 8f3f990890a1..f9870736f968 100644 --- a/lib/std/process.zig +++ b/lib/std/process.zig @@ -625,11 +625,22 @@ pub const ArgIteratorWasi = struct { }; /// Iterator that implements the Windows command-line parsing algorithm. +/// The implementation is intended to be compatible with the post-2008 C runtime, +/// but is *not* intended to be compatible with `CommandLineToArgvW` since +/// `CommandLineToArgvW` uses the pre-2008 parsing rules. 
/// -/// This iterator faithfully implements the parsing behavior observed in `CommandLineToArgvW` with +/// This iterator faithfully implements the parsing behavior observed from the C runtime with /// one exception: if the command-line string is empty, the iterator will immediately complete -/// without returning any arguments (whereas `CommandLineArgvW` will return a single argument +/// without returning any arguments (whereas the C runtime will return a single argument /// representing the name of the current executable). +/// +/// The essential parts of the algorithm are described in Microsoft's documentation: +/// +/// - https://learn.microsoft.com/en-us/cpp/cpp/main-function-command-line-args?view=msvc-170#parsing-c-command-line-arguments +/// +/// David Deley explains some additional undocumented quirks in great detail: +/// +/// - https://daviddeley.com/autohotkey/parameters/parameters.htm#WINCRULES pub const ArgIteratorWindows = struct { allocator: Allocator, /// Owned by the iterator. @@ -686,6 +697,51 @@ pub const ArgIteratorWindows = struct { fn emitCharacter(self: *ArgIteratorWindows, char: u8) void { self.buffer[self.end] = char; self.end += 1; + + // Because we are emitting WTF-8 byte-by-byte, we need to + // check to see if we've emitted two consecutive surrogate + // codepoints that form a valid surrogate pair in order + // to ensure that we're always emitting well-formed WTF-8 + // (https://simonsapin.github.io/wtf-8/#concatenating). + // + // If we do have a valid surrogate pair, we need to emit + // the UTF-8 sequence for the codepoint that they encode + // instead of the WTF-8 encoding for the two surrogate codepoints 
+ // + // This is relevant when dealing with a WTF-16 encoded + // command line like this: + // "<0xD801>"<0xDC37> + // which would get converted to WTF-8 in `cmd_line` as: + // "<0xED><0xA0><0x81>"<0xED><0xB0><0xB7> + // and then after parsing it'd naively get emitted as: + // <0xED><0xA0><0x81><0xED><0xB0><0xB7> + // but instead, we need to recognize the surrogate pair + // and emit the codepoint it encodes, which in this + // example is U+10437 (𐐷), which is encoded in UTF-8 as: + // <0xF0><0x90><0x90><0xB7> + concatSurrogatePair(self); + } + + fn concatSurrogatePair(self: *ArgIteratorWindows) void { + // Surrogate codepoints are always encoded as 3 bytes, so there + // must be 6 bytes for a surrogate pair to exist. + if (self.end - self.start >= 6) { + const window = self.buffer[self.end - 6 .. self.end]; + const view = std.unicode.Wtf8View.init(window) catch return; + var it = view.iterator(); + var pair: [2]u16 = undefined; + pair[0] = std.mem.nativeToLittle(u16, std.math.cast(u16, it.nextCodepoint().?) orelse return); + if (!std.unicode.utf16IsHighSurrogate(pair[0])) return; + pair[1] = std.mem.nativeToLittle(u16, std.math.cast(u16, it.nextCodepoint().?) orelse return); + if (!std.unicode.utf16IsLowSurrogate(pair[1])) return; + // We know we have a valid surrogate pair, so convert + // it to UTF-8, overwriting the surrogate pair's bytes + // and then chop off the extra bytes. 
+ const len = std.unicode.utf16LeToUtf8(window, &pair) catch unreachable; + const delta = 6 - len; + self.end -= delta; + } } fn yieldArg(self: *ArgIteratorWindows) [:0]const u8 { @@ -711,69 +767,37 @@ pub const ArgIteratorWindows = struct { } }; - // The essential parts of the algorithm are described in Microsoft's documentation: - // - // - - // - - // - // David Deley explains some additional undocumented quirks in great detail: - // - // - - // - // Code points <= U+0020 terminating an unquoted first argument was discovered independently by - // testing and observing the behavior of 'CommandLineToArgvW' on Windows 10. - fn nextWithStrategy(self: *ArgIteratorWindows, comptime strategy: type) strategy.T { // The first argument (the executable name) uses different parsing rules. if (self.index == 0) { - var char = if (self.cmd_line.len != 0) self.cmd_line[0] else 0; - switch (char) { - 0 => { - // Immediately complete the iterator. - // 'CommandLineToArgvW' would return the name of the current executable here. - return strategy.eof; - }, - '"' => { - // If the first character is a quote, read everything until the next quote (then - // skip that quote), or until the end of the string. - self.index += 1; - while (true) : (self.index += 1) { - char = if (self.index != self.cmd_line.len) self.cmd_line[self.index] else 0; - switch (char) { - 0 => { - return strategy.yieldArg(self); - }, - '"' => { - self.index += 1; - return strategy.yieldArg(self); - }, - else => { - strategy.emitCharacter(self, char); - }, - } - } - }, - else => { - // Otherwise, read everything until the next space or ASCII control character - // (not including DEL) (then skip that character), or until the end of the - // string. This means that if the command-line string starts with one of these - // characters, the first returned argument will be the empty string. 
- while (true) : (self.index += 1) { - char = if (self.index != self.cmd_line.len) self.cmd_line[self.index] else 0; - switch (char) { - 0 => { - return strategy.yieldArg(self); - }, - '\x01'...' ' => { - self.index += 1; - return strategy.yieldArg(self); - }, - else => { - strategy.emitCharacter(self, char); - }, + if (self.cmd_line.len == 0 or self.cmd_line[0] == 0) { + // Immediately complete the iterator. + // The C runtime would return the name of the current executable here. + return strategy.eof; + } + + var inside_quotes = false; + while (true) : (self.index += 1) { + const char = if (self.index != self.cmd_line.len) self.cmd_line[self.index] else 0; + switch (char) { + 0 => { + return strategy.yieldArg(self); + }, + '"' => { + inside_quotes = !inside_quotes; + }, + ' ', '\t' => { + if (inside_quotes) + strategy.emitCharacter(self, char) + else { + self.index += 1; + return strategy.yieldArg(self); } - } - }, + }, + else => { + strategy.emitCharacter(self, char); + }, + } } } @@ -791,9 +815,10 @@ pub const ArgIteratorWindows = struct { // // - The end of the string always terminates the current argument. // - When not in 'inside_quotes' mode, a space or tab terminates the current argument. - // - 2n backslashes followed by a quote emit n backslashes. If in 'inside_quotes' and the - // quote is immediately followed by a second quote, one quote is emitted and the other is - // skipped, otherwise, the quote is skipped. Finally, 'inside_quotes' is toggled. + // - 2n backslashes followed by a quote emit n backslashes (note: n can be zero). + // If in 'inside_quotes' and the quote is immediately followed by a second quote, + // one quote is emitted and the other is skipped, otherwise, the quote is skipped + // and 'inside_quotes' is toggled. // - 2n + 1 backslashes followed by a quote emit n backslashes followed by a quote. // - n backslashes not followed by a quote emit n backslashes. 
var backslash_count: usize = 0; @@ -826,8 +851,9 @@ pub const ArgIteratorWindows = struct { { strategy.emitCharacter(self, '"'); self.index += 1; + } else { + inside_quotes = !inside_quotes; } - inside_quotes = !inside_quotes; } }, '\\' => { @@ -1215,10 +1241,10 @@ test ArgIteratorWindows { // Separators try t("aa bb cc", &.{ "aa", "bb", "cc" }); try t("aa\tbb\tcc", &.{ "aa", "bb", "cc" }); - try t("aa\nbb\ncc", &.{ "aa", "bb\ncc" }); - try t("aa\r\nbb\r\ncc", &.{ "aa", "\nbb\r\ncc" }); - try t("aa\rbb\rcc", &.{ "aa", "bb\rcc" }); - try t("aa\x07bb\x07cc", &.{ "aa", "bb\x07cc" }); + try t("aa\nbb\ncc", &.{"aa\nbb\ncc"}); + try t("aa\r\nbb\r\ncc", &.{"aa\r\nbb\r\ncc"}); + try t("aa\rbb\rcc", &.{"aa\rbb\rcc"}); + try t("aa\x07bb\x07cc", &.{"aa\x07bb\x07cc"}); try t("aa\x7Fbb\x7Fcc", &.{"aa\x7Fbb\x7Fcc"}); try t("aa🦎bb🦎cc", &.{"aa🦎bb🦎cc"}); @@ -1227,22 +1253,22 @@ test ArgIteratorWindows { try t(" aa bb ", &.{ "", "aa", "bb" }); try t("\t\t", &.{""}); try t("\t\taa\t\tbb\t\t", &.{ "", "aa", "bb" }); - try t("\n\n", &.{ "", "\n" }); - try t("\n\naa\n\nbb\n\n", &.{ "", "\naa\n\nbb\n\n" }); + try t("\n\n", &.{"\n\n"}); + try t("\n\naa\n\nbb\n\n", &.{"\n\naa\n\nbb\n\n"}); // Executable name with quotes/backslashes try t("\"aa bb\tcc\ndd\"", &.{"aa bb\tcc\ndd"}); try t("\"", &.{""}); try t("\"\"", &.{""}); - try t("\"\"\"", &.{ "", "" }); - try t("\"\"\"\"", &.{ "", "" }); - try t("\"\"\"\"\"", &.{ "", "\"" }); - try t("aa\"bb\"cc\"dd", &.{"aa\"bb\"cc\"dd"}); - try t("aa\"bb cc\"dd", &.{ "aa\"bb", "ccdd" }); - try t("\"aa\\\"bb\"", &.{ "aa\\", "bb" }); + try t("\"\"\"", &.{""}); + try t("\"\"\"\"", &.{""}); + try t("\"\"\"\"\"", &.{""}); + try t("aa\"bb\"cc\"dd", &.{"aabbccdd"}); + try t("aa\"bb cc\"dd", &.{"aabb ccdd"}); + try t("\"aa\\\"bb\"", &.{"aa\\bb"}); try t("\"aa\\\\\"", &.{"aa\\\\"}); - try t("aa\\\"bb", &.{"aa\\\"bb"}); - try t("aa\\\\\"bb", &.{"aa\\\\\"bb"}); + try t("aa\\\"bb", &.{"aa\\bb"}); + try t("aa\\\\\"bb", &.{"aa\\\\bb"}); // Arguments with 
quotes/backslashes try t(". \"aa bb\tcc\ndd\"", &.{ ".", "aa bb\tcc\ndd" }); @@ -1252,29 +1278,66 @@ test ArgIteratorWindows { try t(". \"\"", &.{ ".", "" }); try t(". \"\"\"", &.{ ".", "\"" }); try t(". \"\"\"\"", &.{ ".", "\"" }); - try t(". \"\"\"\"\"", &.{ ".", "\"" }); + try t(". \"\"\"\"\"", &.{ ".", "\"\"" }); try t(". \"\"\"\"\"\"", &.{ ".", "\"\"" }); try t(". \" \"", &.{ ".", " " }); try t(". \" \"\"", &.{ ".", " \"" }); try t(". \" \"\"\"", &.{ ".", " \"" }); - try t(". \" \"\"\"\"", &.{ ".", " \"" }); + try t(". \" \"\"\"\"", &.{ ".", " \"\"" }); try t(". \" \"\"\"\"\"", &.{ ".", " \"\"" }); - try t(". \" \"\"\"\"\"\"", &.{ ".", " \"\"" }); + try t(". \" \"\"\"\"\"\"", &.{ ".", " \"\"\"" }); try t(". \\\"", &.{ ".", "\"" }); try t(". \\\"\"", &.{ ".", "\"" }); try t(". \\\"\"\"", &.{ ".", "\"" }); try t(". \\\"\"\"\"", &.{ ".", "\"\"" }); try t(". \\\"\"\"\"\"", &.{ ".", "\"\"" }); - try t(". \\\"\"\"\"\"\"", &.{ ".", "\"\"" }); + try t(". \\\"\"\"\"\"\"", &.{ ".", "\"\"\"" }); try t(". \" \\\"", &.{ ".", " \"" }); try t(". \" \\\"\"", &.{ ".", " \"" }); try t(". \" \\\"\"\"", &.{ ".", " \"\"" }); try t(". \" \\\"\"\"\"", &.{ ".", " \"\"" }); - try t(". \" \\\"\"\"\"\"", &.{ ".", " \"\"" }); + try t(". \" \\\"\"\"\"\"", &.{ ".", " \"\"\"" }); try t(". \" \\\"\"\"\"\"\"", &.{ ".", " \"\"\"" }); try t(". aa\\bb\\\\cc\\\\\\dd", &.{ ".", "aa\\bb\\\\cc\\\\\\dd" }); try t(". \\\\\\\"aa bb\"", &.{ ".", "\\\"aa", "bb" }); try t(". 
\\\\\\\\\"aa bb\"", &.{ ".", "\\\\aa bb" }); + + // From https://learn.microsoft.com/en-us/cpp/cpp/main-function-command-line-args#results-of-parsing-command-lines + try t( + \\foo.exe "abc" d e + , &.{ "foo.exe", "abc", "d", "e" }); + try t( + \\foo.exe a\\b d"e f"g h + , &.{ "foo.exe", "a\\\\b", "de fg", "h" }); + try t( + \\foo.exe a\\\"b c d + , &.{ "foo.exe", "a\\\"b", "c", "d" }); + try t( + \\foo.exe a\\\\"b c" d e + , &.{ "foo.exe", "a\\\\b c", "d", "e" }); + try t( + \\foo.exe a"b"" c d + , &.{ "foo.exe", "ab\" c d" }); + + // From https://daviddeley.com/autohotkey/parameters/parameters.htm#WINCRULESEX + try t("foo.exe CallMeIshmael", &.{ "foo.exe", "CallMeIshmael" }); + try t("foo.exe \"Call Me Ishmael\"", &.{ "foo.exe", "Call Me Ishmael" }); + try t("foo.exe Cal\"l Me I\"shmael", &.{ "foo.exe", "Call Me Ishmael" }); + try t("foo.exe CallMe\\\"Ishmael", &.{ "foo.exe", "CallMe\"Ishmael" }); + try t("foo.exe \"CallMe\\\"Ishmael\"", &.{ "foo.exe", "CallMe\"Ishmael" }); + try t("foo.exe \"Call Me Ishmael\\\\\"", &.{ "foo.exe", "Call Me Ishmael\\" }); + try t("foo.exe \"CallMe\\\\\\\"Ishmael\"", &.{ "foo.exe", "CallMe\\\"Ishmael" }); + try t("foo.exe a\\\\\\b", &.{ "foo.exe", "a\\\\\\b" }); + try t("foo.exe \"a\\\\\\b\"", &.{ "foo.exe", "a\\\\\\b" }); + + // Surrogate pair encoding of 𐐷 separated by quotes. + // Encoded as WTF-16: + // "<0xD801>"<0xDC37> + // Encoded as WTF-8: + // "<0xED><0xA0><0x81>"<0xED><0xB0><0xB7> + // During parsing, the quotes drop out and the surrogate pair + // should end up encoded as its normal UTF-8 representation. 
+ try t("foo.exe \"\xed\xa0\x81\"\xed\xb0\xb7", &.{ "foo.exe", "𐐷" }); } fn testArgIteratorWindows(cmd_line: []const u8, expected_args: []const []const u8) !void { diff --git a/test/standalone/build.zig.zon b/test/standalone/build.zig.zon index 8b59f261179e..8f5a061fe2e0 100644 --- a/test/standalone/build.zig.zon +++ b/test/standalone/build.zig.zon @@ -104,6 +104,9 @@ .windows_spawn = .{ .path = "windows_spawn", }, + .windows_argv = .{ + .path = "windows_argv", + }, .self_exe_symlink = .{ .path = "self_exe_symlink", }, diff --git a/test/standalone/windows_argv/README.md b/test/standalone/windows_argv/README.md new file mode 100644 index 000000000000..f7ce7ac7c424 --- /dev/null +++ b/test/standalone/windows_argv/README.md @@ -0,0 +1,19 @@ +Tests that Zig's `std.process.ArgIteratorWindows` is compatible with both the MSVC and MinGW C runtimes' argv splitting algorithms. + +The method of testing is: +- Compile a C file with `wmain` as its entry point +- The C `wmain` calls a Zig-implemented `verify` function that takes the `argv` from `wmain` and compares it to the argv gotten from `std.process.argsAlloc` (which takes `kernel32.GetCommandLineW()` and splits it) +- The compiled C program is spawned continuously as a child process by the implementation in `fuzz.zig` with randomly generated command lines + + On Windows, the 'application name' and the 'command line' are disjoint concepts. That is, you can spawn `foo.exe` but set the command line to `bar.exe`, and `CreateProcessW` will spawn `foo.exe` but `argv[0]` will be `bar.exe`. This quirk allows us to test arbitrary `argv[0]` values as well which otherwise wouldn't be possible. 
+ +Note: This is intentionally testing against the C runtime argv splitting and *not* [`CommandLineToArgvW`](https://learn.microsoft.com/en-us/windows/win32/api/shellapi/nf-shellapi-commandlinetoargvw), since the C runtime argv splitting was updated in 2008 but `CommandLineToArgvW` still uses the pre-2008 algorithm (which differs in both `argv[0]` rules and `""`; see [here](https://daviddeley.com/autohotkey/parameters/parameters.htm#WINCRULESDOC) for details) + +--- + +In addition to being run during `zig build test-standalone`, this test can be run on its own via `zig build test` from within this directory. + +When run on its own: +- `-Diterations=` can be used to set the max fuzzing iterations, and `-Diterations=0` can be used to fuzz indefinitely +- `-Dseed=` can be used to set the PRNG seed for fuzz testing. If not provided, then the seed is chosen at random during `build.zig` compilation. + +On failure, the number of iterations and the seed can be seen in the failing command, e.g. in `path\to\fuzz.exe path\to\verify-msvc.exe 100 2780392459403250529`, the iterations is `100` and the seed is `2780392459403250529`. 
diff --git a/test/standalone/windows_argv/build.zig b/test/standalone/windows_argv/build.zig new file mode 100644 index 000000000000..dcc2d1e4b183 --- /dev/null +++ b/test/standalone/windows_argv/build.zig @@ -0,0 +1,88 @@ +const std = @import("std"); +const builtin = @import("builtin"); + +pub fn build(b: *std.Build) !void { + const test_step = b.step("test", "Test it"); + b.default_step = test_step; + + if (builtin.os.tag != .windows) return; + + const optimize: std.builtin.OptimizeMode = .Debug; + + const lib_msvc = b.addStaticLibrary(.{ + .name = "toargv-msvc", + .root_source_file = .{ .path = "lib.zig" }, + .target = b.resolveTargetQuery(.{ + .abi = .msvc, + }), + .optimize = optimize, + }); + const verify_msvc = b.addExecutable(.{ + .name = "verify-msvc", + .target = b.resolveTargetQuery(.{ + .abi = .msvc, + }), + .optimize = optimize, + }); + verify_msvc.addCSourceFile(.{ + .file = .{ .path = "verify.c" }, + .flags = &.{ "-DUNICODE", "-D_UNICODE" }, + }); + verify_msvc.linkLibrary(lib_msvc); + verify_msvc.linkLibC(); + + const lib_gnu = b.addStaticLibrary(.{ + .name = "toargv-gnu", + .root_source_file = .{ .path = "lib.zig" }, + .target = b.resolveTargetQuery(.{ + .abi = .gnu, + }), + .optimize = optimize, + }); + const verify_gnu = b.addExecutable(.{ + .name = "verify-gnu", + .target = b.resolveTargetQuery(.{ + .abi = .gnu, + }), + .optimize = optimize, + }); + verify_gnu.addCSourceFile(.{ + .file = .{ .path = "verify.c" }, + .flags = &.{ "-DUNICODE", "-D_UNICODE" }, + }); + verify_gnu.mingw_unicode_entry_point = true; + verify_gnu.linkLibrary(lib_gnu); + verify_gnu.linkLibC(); + + const fuzz = b.addExecutable(.{ + .name = "fuzz", + .root_source_file = .{ .path = "fuzz.zig" }, + .target = b.host, + .optimize = optimize, + }); + + const fuzz_max_iterations = b.option(u64, "iterations", "The max fuzz iterations (default: 100)") orelse 100; + const fuzz_iterations_arg = std.fmt.allocPrint(b.allocator, "{}", .{fuzz_max_iterations}) catch @panic("oom"); + + 
const fuzz_seed = b.option(u64, "seed", "Seed to use for the PRNG (default: random)") orelse seed: { + var buf: [8]u8 = undefined; + try std.posix.getrandom(&buf); + break :seed std.mem.readInt(u64, &buf, builtin.cpu.arch.endian()); + }; + const fuzz_seed_arg = std.fmt.allocPrint(b.allocator, "{}", .{fuzz_seed}) catch @panic("oom"); + + const run_msvc = b.addRunArtifact(fuzz); + run_msvc.setName("fuzz-msvc"); + run_msvc.addArtifactArg(verify_msvc); + run_msvc.addArgs(&.{ fuzz_iterations_arg, fuzz_seed_arg }); + run_msvc.expectExitCode(0); + + const run_gnu = b.addRunArtifact(fuzz); + run_gnu.setName("fuzz-gnu"); + run_gnu.addArtifactArg(verify_gnu); + run_gnu.addArgs(&.{ fuzz_iterations_arg, fuzz_seed_arg }); + run_gnu.expectExitCode(0); + + test_step.dependOn(&run_msvc.step); + test_step.dependOn(&run_gnu.step); +} diff --git a/test/standalone/windows_argv/fuzz.zig b/test/standalone/windows_argv/fuzz.zig new file mode 100644 index 000000000000..b88853196fb0 --- /dev/null +++ b/test/standalone/windows_argv/fuzz.zig @@ -0,0 +1,159 @@ +const std = @import("std"); +const builtin = @import("builtin"); +const windows = std.os.windows; +const Allocator = std.mem.Allocator; + +pub fn main() !void { + var gpa = std.heap.GeneralPurposeAllocator(.{}){}; + defer std.debug.assert(gpa.deinit() == .ok); + const allocator = gpa.allocator(); + + const args = try std.process.argsAlloc(allocator); + defer std.process.argsFree(allocator, args); + + if (args.len < 2) return error.MissingArgs; + + const verify_path_wtf8 = args[1]; + const verify_path_w = try std.unicode.wtf8ToWtf16LeAllocZ(allocator, verify_path_wtf8); + defer allocator.free(verify_path_w); + + const iterations: u64 = iterations: { + if (args.len < 3) break :iterations 0; + break :iterations try std.fmt.parseUnsigned(u64, args[2], 10); + }; + + var rand_seed = false; + const seed: u64 = seed: { + if (args.len < 4) { + rand_seed = true; + var buf: [8]u8 = undefined; + try std.posix.getrandom(&buf); + break :seed 
std.mem.readInt(u64, &buf, builtin.cpu.arch.endian()); + } + break :seed try std.fmt.parseUnsigned(u64, args[3], 10); + }; + var random = std.rand.DefaultPrng.init(seed); + const rand = random.random(); + + // If the seed was not given via the CLI, then output the + // randomly chosen seed so that this run can be reproduced + if (rand_seed) { + std.debug.print("rand seed: {}\n", .{seed}); + } + + var cmd_line_w_buf = std.ArrayList(u16).init(allocator); + defer cmd_line_w_buf.deinit(); + + var i: u64 = 0; + var errors: u64 = 0; + while (iterations == 0 or i < iterations) { + const cmd_line_w = try randomCommandLineW(allocator, rand); + defer allocator.free(cmd_line_w); + + // avoid known difference for 0-length command lines + if (cmd_line_w.len == 0 or cmd_line_w[0] == '\x00') continue; + + const exit_code = try spawnVerify(verify_path_w, cmd_line_w); + if (exit_code != 0) { + std.debug.print(">>> found discrepancy <<<\n", .{}); + const cmd_line_wtf8 = try std.unicode.wtf16LeToWtf8Alloc(allocator, cmd_line_w); + defer allocator.free(cmd_line_wtf8); + std.debug.print("\"{}\"\n\n", .{std.zig.fmtEscapes(cmd_line_wtf8)}); + + errors += 1; + } + + i += 1; + } + if (errors > 0) { + // we never get here if iterations is 0 so we don't have to worry about that case + std.debug.print("found {} discrepancies in {} iterations\n", .{ errors, iterations }); + return error.FoundDiscrepancies; + } +} + +fn randomCommandLineW(allocator: Allocator, rand: std.rand.Random) ![:0]const u16 { + const Choice = enum { + backslash, + quote, + space, + tab, + control, + printable, + non_ascii, + }; + + const choices = rand.uintAtMostBiased(u16, 256); + var buf = try std.ArrayList(u16).initCapacity(allocator, choices); + errdefer buf.deinit(); + + for (0..choices) |_| { + const choice = rand.enumValue(Choice); + const code_unit = switch (choice) { + .backslash => '\\', + .quote => '"', + .space => ' ', + .tab => '\t', + .control => switch (rand.uintAtMostBiased(u8, 0x21)) { + 0x21 => '\x7F', 
+ else => |b| b, + }, + .printable => '!' + rand.uintAtMostBiased(u8, '~' - '!'), + .non_ascii => rand.intRangeAtMostBiased(u16, 0x80, 0xFFFF), + }; + try buf.append(std.mem.nativeToLittle(u16, code_unit)); + } + + return buf.toOwnedSliceSentinel(0); +} + +/// Returns the exit code of the verify process +fn spawnVerify(verify_path: [:0]const u16, cmd_line: [:0]const u16) !windows.DWORD { + const child_proc = spawn: { + var startup_info: windows.STARTUPINFOW = .{ + .cb = @sizeOf(windows.STARTUPINFOW), + .lpReserved = null, + .lpDesktop = null, + .lpTitle = null, + .dwX = 0, + .dwY = 0, + .dwXSize = 0, + .dwYSize = 0, + .dwXCountChars = 0, + .dwYCountChars = 0, + .dwFillAttribute = 0, + .dwFlags = windows.STARTF_USESTDHANDLES, + .wShowWindow = 0, + .cbReserved2 = 0, + .lpReserved2 = null, + .hStdInput = null, + .hStdOutput = null, + .hStdError = windows.GetStdHandle(windows.STD_ERROR_HANDLE) catch null, + }; + var proc_info: windows.PROCESS_INFORMATION = undefined; + + try windows.CreateProcessW( + @constCast(verify_path.ptr), + @constCast(cmd_line.ptr), + null, + null, + windows.TRUE, + 0, + null, + null, + &startup_info, + &proc_info, + ); + windows.CloseHandle(proc_info.hThread); + + break :spawn proc_info.hProcess; + }; + defer windows.CloseHandle(child_proc); + try windows.WaitForSingleObjectEx(child_proc, windows.INFINITE, false); + + var exit_code: windows.DWORD = undefined; + if (windows.kernel32.GetExitCodeProcess(child_proc, &exit_code) == 0) { + return error.UnableToGetExitCode; + } + return exit_code; +} diff --git a/test/standalone/windows_argv/lib.h b/test/standalone/windows_argv/lib.h new file mode 100644 index 000000000000..e3cb684715f9 --- /dev/null +++ b/test/standalone/windows_argv/lib.h @@ -0,0 +1,8 @@ +#ifndef _LIB_H_ +#define _LIB_H_ + +#include + +int verify(int argc, wchar_t *argv[]); + +#endif \ No newline at end of file diff --git a/test/standalone/windows_argv/lib.zig b/test/standalone/windows_argv/lib.zig new file mode 100644 index 
000000000000..074273ae21aa --- /dev/null +++ b/test/standalone/windows_argv/lib.zig @@ -0,0 +1,59 @@ +const std = @import("std"); + +/// Returns 1 on success, 0 on failure +export fn verify(argc: c_int, argv: [*]const [*:0]const u16) c_int { + const argv_slice = argv[0..@intCast(argc)]; + testArgv(argv_slice) catch |err| switch (err) { + error.OutOfMemory => @panic("oom"), + error.Overflow => @panic("bytes needed to contain args would overflow usize"), + error.ArgvMismatch => return 0, + }; + return 1; +} + +fn testArgv(expected_args: []const [*:0]const u16) !void { + var arena_state = std.heap.ArenaAllocator.init(std.heap.page_allocator); + defer arena_state.deinit(); + const allocator = arena_state.allocator(); + + const args = try std.process.argsAlloc(allocator); + var wtf8_buf = std.ArrayList(u8).init(allocator); + + var eql = true; + if (args.len != expected_args.len) eql = false; + + const min_len = @min(expected_args.len, args.len); + for (expected_args[0..min_len], args[0..min_len], 0..) |expected_arg, arg_wtf8, i| { + wtf8_buf.clearRetainingCapacity(); + try std.unicode.wtf16LeToWtf8ArrayList(&wtf8_buf, std.mem.span(expected_arg)); + if (!std.mem.eql(u8, wtf8_buf.items, arg_wtf8)) { + std.debug.print("{}: expected: \"{}\"\n", .{ i, std.zig.fmtEscapes(wtf8_buf.items) }); + std.debug.print("{}: actual: \"{}\"\n", .{ i, std.zig.fmtEscapes(arg_wtf8) }); + eql = false; + } + } + if (!eql) { + for (expected_args[min_len..], min_len..) |arg, i| { + wtf8_buf.clearRetainingCapacity(); + try std.unicode.wtf16LeToWtf8ArrayList(&wtf8_buf, std.mem.span(arg)); + std.debug.print("{}: expected: \"{}\"\n", .{ i, std.zig.fmtEscapes(wtf8_buf.items) }); + } + for (args[min_len..], min_len..) 
|arg, i| { + std.debug.print("{}: actual: \"{}\"\n", .{ i, std.zig.fmtEscapes(arg) }); + } + const peb = std.os.windows.peb(); + const lpCmdLine: [*:0]u16 = @ptrCast(peb.ProcessParameters.CommandLine.Buffer); + wtf8_buf.clearRetainingCapacity(); + try std.unicode.wtf16LeToWtf8ArrayList(&wtf8_buf, std.mem.span(lpCmdLine)); + std.debug.print("command line: \"{}\"\n", .{std.zig.fmtEscapes(wtf8_buf.items)}); + std.debug.print("expected argv:\n", .{}); + std.debug.print("&.{{\n", .{}); + for (expected_args) |arg| { + wtf8_buf.clearRetainingCapacity(); + try std.unicode.wtf16LeToWtf8ArrayList(&wtf8_buf, std.mem.span(arg)); + std.debug.print(" \"{}\",\n", .{std.zig.fmtEscapes(wtf8_buf.items)}); + } + std.debug.print("}}\n", .{}); + return error.ArgvMismatch; + } +} diff --git a/test/standalone/windows_argv/verify.c b/test/standalone/windows_argv/verify.c new file mode 100644 index 000000000000..53a40d61c9fc --- /dev/null +++ b/test/standalone/windows_argv/verify.c @@ -0,0 +1,7 @@ +#include +#include "lib.h" + +int wmain(int argc, wchar_t *argv[]) { + if (!verify(argc, argv)) return 1; + return 0; +} \ No newline at end of file