From 4a3913273f8fa1196f811bda144327229d4950d1 Mon Sep 17 00:00:00 2001 From: Grahame Bowland Date: Sun, 8 Jan 2012 03:02:28 +0800 Subject: [PATCH 1/2] add new read_chars method to std::io::reader This change also fixes a bug in the old read_char method with multi-byte utf8 characters, due to re-use of 'w' variable as a loop counter. --- src/libstd/io.rs | 76 +++++++++++++++++++++++++++++++++++------------- 1 file changed, 56 insertions(+), 20 deletions(-) diff --git a/src/libstd/io.rs b/src/libstd/io.rs index 1928cd4b3d79e..b1a25dc0988ca 100644 --- a/src/libstd/io.rs +++ b/src/libstd/io.rs @@ -50,6 +50,7 @@ type reader = fn unread_byte(int); fn read_bytes(uint) -> [u8]; fn read_char() -> char; + fn read_chars(uint) -> [char]; fn eof() -> bool; fn read_line() -> str; fn read_c_str() -> str; @@ -101,29 +102,64 @@ obj new_reader(rdr: buf_reader) { fn read_byte() -> int { ret rdr.read_byte(); } fn unread_byte(byte: int) { ret rdr.unread_byte(byte); } fn read_bytes(len: uint) -> [u8] { ret rdr.read(len); } + fn read_chars(n: uint) -> [char] { + // returns the (consumed offset, n_req) + fn chars_from_buf(buf: [u8], &chars: [char]) -> (uint, uint) { + let i = 0u; + while i < vec::len(buf) { + let b0 = buf[i]; + let w = str::utf8_char_width(b0); + let end = i + w; + i += 1u; + assert (w > 0u); + if w == 1u { + chars += [ b0 as char ]; + cont; + } + // can't satisfy this char with the existing data + if end > vec::len(buf) { + ret (i - 1u, end - vec::len(buf)); + } + let val = 0u; + while i < end { + let next = buf[i] as int; + i += 1u; + assert (next > -1); + assert (next & 192 == 128); + val <<= 6u; + val += next & 63 as uint; + } + // See str::char_at + val += (b0 << (w + 1u as u8) as uint) << (w - 1u) * 6u - w - 1u; + chars += [ val as char ]; + } + ret (i, 0u); + } + let buf: [u8] = self.read_bytes(n); // might need more, n will never over-read + let chars: [char] = []; + while vec::len(chars) < n { + let (offset, nbreq) = chars_from_buf(buf, chars); + let ncreq = n - vec::len(chars); + let ntoread = if ncreq > nbreq { ncreq } else { nbreq }; + if ntoread > 0u { + buf = vec::slice(buf, offset, vec::len(buf)); + let data = self.read_bytes(ntoread); + if vec::len(data) == 0u { + // eof - should we do something if we're split in a unicode char? + break; + } + buf += data; + } + } + ret chars; + } fn read_char() -> char { - let c0 = rdr.read_byte(); - if c0 == -1 { + let c = self.read_chars(1u); + if vec::len(c) == 0u { ret -1 as char; // FIXME will this stay valid? - - } - let b0 = c0 as u8; - let w = str::utf8_char_width(b0); - assert (w > 0u); - if w == 1u { ret b0 as char; } - let val = 0u; - while w > 1u { - w -= 1u; - let next = rdr.read_byte(); - assert (next > -1); - assert (next & 192 == 128); - val <<= 6u; - val += next & 63 as uint; } - // See str::char_at - - val += (b0 << (w + 1u as u8) as uint) << (w - 1u) * 6u - w - 1u; - ret val as char; + assert(vec::len(c) == 1u); + ret c[0]; } fn eof() -> bool { ret rdr.eof(); } fn read_line() -> str { From 4d9c6241f30200e9a9f6b780dc13ad92f5f94826 Mon Sep 17 00:00:00 2001 From: Grahame Bowland Date: Sun, 8 Jan 2012 21:31:13 +0800 Subject: [PATCH 2/2] tidy up the new read_chars() method, improve behavior on eof --- src/libstd/io.rs | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/src/libstd/io.rs b/src/libstd/io.rs index b1a25dc0988ca..f424b8c3a5e0e 100644 --- a/src/libstd/io.rs +++ b/src/libstd/io.rs @@ -103,7 +103,7 @@ obj new_reader(rdr: buf_reader) { fn unread_byte(byte: int) { ret rdr.unread_byte(byte); } fn read_bytes(len: uint) -> [u8] { ret rdr.read(len); } fn read_chars(n: uint) -> [char] { - // returns the (consumed offset, n_req) + // returns the (consumed offset, n_req), appends characters to &chars fn chars_from_buf(buf: [u8], &chars: [char]) -> (uint, uint) { let i = 0u; while i < vec::len(buf) { @@ -135,20 +135,24 @@ obj new_reader(rdr: buf_reader) { } ret (i, 0u); } - let buf: [u8] = self.read_bytes(n); // might need more, n will never over-read + let buf: [u8] = []; let chars: [char] = []; - while vec::len(chars) < n { + let nbread = n; // might need more bytes, but reading n will never over-read + while nbread > 0u { + let data = self.read_bytes(nbread); + if vec::len(data) == 0u { + // eof - FIXME should we do something if we're split in a unicode char? + break; + } + buf += data; let (offset, nbreq) = chars_from_buf(buf, chars); let ncreq = n - vec::len(chars); - let ntoread = if ncreq > nbreq { ncreq } else { nbreq }; - if ntoread > 0u { + // again we either know we need a certain number of bytes to complete a + // character, or we make sure we don't over-read by reading 1-byte per char + // needed + nbread = if ncreq > nbreq { ncreq } else { nbreq }; + if nbread > 0u { buf = vec::slice(buf, offset, vec::len(buf)); - let data = self.read_bytes(ntoread); - if vec::len(data) == 0u { - // eof - should we do something if we're split in a unicode char? - break; - } - buf += data; } } ret chars;