From 4a3913273f8fa1196f811bda144327229d4950d1 Mon Sep 17 00:00:00 2001
From: Grahame Bowland <grahame@angrygoats.net>
Date: Sun, 8 Jan 2012 03:02:28 +0800
Subject: [PATCH 1/2] add new read_chars method to std::io::reader

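This change also fixes a bug in the old read_char method when decoding
multi-byte utf8 characters: the 'w' (character width) variable was
re-used as the loop counter, so it no longer held the width when the
leading byte was shifted into place after the loop.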
---
 src/libstd/io.rs | 76 +++++++++++++++++++++++++++++++++++-------------
 1 file changed, 56 insertions(+), 20 deletions(-)

diff --git a/src/libstd/io.rs b/src/libstd/io.rs
index 1928cd4b3d79e..b1a25dc0988ca 100644
--- a/src/libstd/io.rs
+++ b/src/libstd/io.rs
@@ -50,6 +50,7 @@ type reader =
         fn unread_byte(int);
         fn read_bytes(uint) -> [u8];
         fn read_char() -> char;
+        fn read_chars(uint) -> [char];
         fn eof() -> bool;
         fn read_line() -> str;
         fn read_c_str() -> str;
@@ -101,29 +102,64 @@ obj new_reader(rdr: buf_reader) {
     fn read_byte() -> int { ret rdr.read_byte(); }
     fn unread_byte(byte: int) { ret rdr.unread_byte(byte); }
     fn read_bytes(len: uint) -> [u8] { ret rdr.read(len); }
+    fn read_chars(n: uint) -> [char] {
+        // returns the (consumed offset, n_req)
+        fn chars_from_buf(buf: [u8], &chars: [char]) -> (uint, uint) {
+            let i = 0u;
+            while i < vec::len(buf) {
+                let b0 = buf[i];
+                let w = str::utf8_char_width(b0);
+                let end = i + w;
+                i += 1u;
+                assert (w > 0u);
+                if w == 1u {
+                    chars += [ b0 as char ];
+                    cont;
+                }
+                // can't satisfy this char with the existing data
+                if end > vec::len(buf) {
+                    ret (i - 1u, end - vec::len(buf));
+                }
+                let val = 0u;
+                while i < end {
+                    let next = buf[i] as int;
+                    i += 1u;
+                    assert (next > -1);
+                    assert (next & 192 == 128);
+                    val <<= 6u;
+                    val += next & 63 as uint;
+                }
+                // See str::char_at
+                val += (b0 << (w + 1u as u8) as uint) << (w - 1u) * 6u - w - 1u;
+                chars += [ val as char ];
+            }
+            ret (i, 0u);
+        }
+        let buf: [u8] = self.read_bytes(n); // might need more, n will never over-read
+        let chars: [char] = [];
+        while vec::len(chars) < n {
+            let (offset, nbreq) = chars_from_buf(buf, chars);
+            let ncreq = n - vec::len(chars);
+            let ntoread = if ncreq > nbreq { ncreq } else { nbreq };
+            if ntoread > 0u {
+                buf = vec::slice(buf, offset, vec::len(buf));
+                let data = self.read_bytes(ntoread);
+                if vec::len(data) == 0u {
+                    // eof - should we do something if we're split in a unicode char?
+                    break;
+                }
+                buf += data;
+            }
+        }
+        ret chars;
+    }
     fn read_char() -> char {
-        let c0 = rdr.read_byte();
-        if c0 == -1 {
+        let c = self.read_chars(1u);
+        if vec::len(c) == 0u {
             ret -1 as char; // FIXME will this stay valid?
-
-        }
-        let b0 = c0 as u8;
-        let w = str::utf8_char_width(b0);
-        assert (w > 0u);
-        if w == 1u { ret b0 as char; }
-        let val = 0u;
-        while w > 1u {
-            w -= 1u;
-            let next = rdr.read_byte();
-            assert (next > -1);
-            assert (next & 192 == 128);
-            val <<= 6u;
-            val += next & 63 as uint;
         }
-        // See str::char_at
-
-        val += (b0 << (w + 1u as u8) as uint) << (w - 1u) * 6u - w - 1u;
-        ret val as char;
+        assert(vec::len(c) == 1u);
+        ret c[0];
     }
     fn eof() -> bool { ret rdr.eof(); }
     fn read_line() -> str {

From 4d9c6241f30200e9a9f6b780dc13ad92f5f94826 Mon Sep 17 00:00:00 2001
From: Grahame Bowland <grahame@angrygoats.net>
Date: Sun, 8 Jan 2012 21:31:13 +0800
Subject: [PATCH 2/2] tidy up the new read_chars() method, improve behavior on
 eof

---
 src/libstd/io.rs | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/src/libstd/io.rs b/src/libstd/io.rs
index b1a25dc0988ca..f424b8c3a5e0e 100644
--- a/src/libstd/io.rs
+++ b/src/libstd/io.rs
@@ -103,7 +103,7 @@ obj new_reader(rdr: buf_reader) {
     fn unread_byte(byte: int) { ret rdr.unread_byte(byte); }
     fn read_bytes(len: uint) -> [u8] { ret rdr.read(len); }
     fn read_chars(n: uint) -> [char] {
-        // returns the (consumed offset, n_req)
+        // returns the (consumed offset, n_req), appends characters to &chars
         fn chars_from_buf(buf: [u8], &chars: [char]) -> (uint, uint) {
             let i = 0u;
             while i < vec::len(buf) {
@@ -135,20 +135,24 @@ obj new_reader(rdr: buf_reader) {
             }
             ret (i, 0u);
         }
-        let buf: [u8] = self.read_bytes(n); // might need more, n will never over-read
+        let buf: [u8] = [];
         let chars: [char] = [];
-        while vec::len(chars) < n {
+        let nbread = n; // might need more bytes, but reading n will never over-read
+        while nbread > 0u {
+            let data = self.read_bytes(nbread); 
+            if vec::len(data) == 0u {
+                // eof - FIXME should we do something if we're split in a unicode char?
+                break;
+            }
+            buf += data;
             let (offset, nbreq) = chars_from_buf(buf, chars);
             let ncreq = n - vec::len(chars);
-            let ntoread = if ncreq > nbreq { ncreq } else { nbreq };
-            if ntoread > 0u {
+            // again we either know we need a certain number of bytes to complete a
+            // character, or we make sure we don't over-read by reading 1-byte per char
+            // needed
+            nbread = if ncreq > nbreq { ncreq } else { nbreq };
+            if nbread > 0u {
                 buf = vec::slice(buf, offset, vec::len(buf));
-                let data = self.read_bytes(ntoread);
-                if vec::len(data) == 0u {
-                    // eof - should we do something if we're split in a unicode char?
-                    break;
-                }
-                buf += data;
             }
         }
         ret chars;