for whitespace delimited files, adjoining delimiters are treated as s…

…ingle delimiter. fixed bug in handling empty columns. updated tests and docs. fixes JuliaLang#5391
tanmaykm · Jan 16, 2014 · ee61b45 · ee61b45
1 parent 7eab096
commit ee61b45
Show file tree

Hide file tree

Showing 4 changed files with 188 additions and 45 deletions.
diff --git a/base/datafmt.jl b/base/datafmt.jl
@@ -68,18 +68,20 @@ function ascii_if_possible(sbuff::String)
 end
 
 function readdlm_string(sbuff::String, dlm::Char, T::Type, eol::Char, auto::Bool, optsd::Dict)
+    ign_empty = (dlm == invalid_dlm) || isspace(dlm)
+
     nrows,ncols = try
-            dlm_dims(sbuff, eol, dlm)
+            dlm_dims(sbuff, eol, dlm, ign_empty)
         catch ex
             !get(optsd, :ignore_invalid_chars, false) && throw(ex)
             sbuff = ascii_if_possible(convert(typeof(sbuff), sbuff.data, ""))
-            dlm_dims(sbuff, eol, dlm)
+            dlm_dims(sbuff, eol, dlm, ign_empty)
         end
     offsets = zeros(Int, nrows, ncols)
     has_header = get(optsd, :has_header, false)
     cells = Array(T, has_header ? nrows-1 : nrows, ncols)
-    dlm_offsets(sbuff, dlm, eol, offsets)
-    has_header ? (dlm_fill(cells, offsets, sbuff, auto, 1, eol), dlm_fill(Array(String, 1, ncols), offsets, sbuff, auto, 0, eol)) : dlm_fill(cells, offsets, sbuff, auto, 0, eol)
+    dlm_offsets(sbuff, dlm, eol, offsets, ign_empty)
+    has_header ? (dlm_fill(cells, offsets, sbuff, auto, 1, dlm, eol, ign_empty), dlm_fill(Array(String, 1, ncols), offsets, sbuff, auto, 0, dlm, eol, ign_empty)) : dlm_fill(cells, offsets, sbuff, auto, 0, dlm, eol, ign_empty)
 end
 
 const valid_opts = [:has_header, :ignore_invalid_chars, :use_mmap]
@@ -99,10 +101,11 @@ function dlm_col_begin(ncols::Int, offsets::Array{Int,2}, row::Int, col::Int)
     pp_col = (1 == col) ? ncols : (col-1)
 
     ret = offsets[pp_row, pp_col]
-    (ret == 0) ? dlm_col_begin(ncols, offsets, pp_row, pp_col) : (ret+2)
+    (ret == 0) && (ret = dlm_col_begin(ncols, offsets, pp_row, pp_col))
+    ret+2
 end
 
-function dlm_fill{T}(cells::Array{T,2}, offsets::Array{Int,2}, sbuff::String, auto::Bool, row_offset::Int, eol::Char)
+function dlm_fill{T}(cells::Array{T,2}, offsets::Array{Int,2}, sbuff::String, auto::Bool, row_offset::Int, dlm::Char, eol::Char, ign_adj_dlm::Bool)
     maxrow,maxcol = size(cells)
     tmp64 = Array(Float64,1)
 
@@ -114,6 +117,14 @@ function dlm_fill{T}(cells::Array{T,2}, offsets::Array{Int,2}, sbuff::String, au
 
             end_idx = prevind(sbuff, nextind(sbuff,end_pos))
             (col == maxcol) && (end_idx > 0) && ('\n' == eol) && ('\r' == sbuff[end_idx]) && (end_idx = prevind(sbuff, end_idx))
+            if ign_adj_dlm
+                is_default_dlm = (dlm == invalid_dlm)
+                while start_pos <= end_idx
+                    val = sbuff[start_pos] 
+                    (is_default_dlm ? !in(val, _default_delims) : (val != dlm)) && break
+                    start_pos = nextind(sbuff, start_pos)
+                end
+            end
             sval = SubString(sbuff, start_pos, end_idx)
 
             if T <: Char
@@ -123,7 +134,7 @@ function dlm_fill{T}(cells::Array{T,2}, offsets::Array{Int,2}, sbuff::String, au
                 if float64_isvalid(sval, tmp64)
                     cells[cell_row,col] = tmp64[1]
                 elseif auto
-                    return dlm_fill(Array(Any,maxrow,maxcol), offsets, sbuff, false, row_offset, eol)
+                    return dlm_fill(Array(Any,maxrow,maxcol), offsets, sbuff, false, row_offset, dlm, eol, ign_adj_dlm)
                 else
                     cells[cell_row,col] = NaN
                 end
@@ -140,52 +151,62 @@ function dlm_fill{T}(cells::Array{T,2}, offsets::Array{Int,2}, sbuff::String, au
 end
 
 
-function dlm_offsets(sbuff::UTF8String, dlm, eol, offsets::Array{Int,2})
-    isascii(dlm) && isascii(eol) && (return dlm_offsets(sbuff.data, uint8(dlm), uint8(eol), offsets))
+function dlm_offsets(sbuff::UTF8String, dlm, eol, offsets::Array{Int,2}, ign_adj_dlm::Bool)
+    isascii(dlm) && isascii(eol) && (return dlm_offsets(sbuff.data, uint8(dlm), uint8(eol), offsets, ign_adj_dlm))
 
     col = 0
     row = 1
     maxrow,maxcol = size(offsets)
     offsets[maxrow,maxcol] = length(sbuff.data)
     idx = 1
     is_default_dlm = (dlm == invalid_dlm)
+    got_data = false
     while(idx <= length(sbuff.data))
         val,idx = next(sbuff, idx)
-        (val != eol) && (is_default_dlm ? !in(val, _default_delims) : (val != dlm)) && continue
-        col += 1
-        offsets[row,col] = idx-2
+        (val != eol) && (is_default_dlm ? !in(val, _default_delims) : (val != dlm)) && (got_data = true) && continue
+        if got_data || !ign_adj_dlm
+            col += 1
+            offsets[row,col] = idx-2
+        end
         (row >= maxrow) && (col == maxcol) && break
         (val == eol) && (row += 1; col = 0)
+        got_data = false
     end
 end
 
-dlm_offsets(sbuff::ASCIIString, dlmc, eolc, offsets::Array{Int,2}) = dlm_offsets(sbuff.data, uint8(dlmc), uint8(eolc), offsets)
-function dlm_offsets(dbuff::Vector{Uint8}, dlm::Uint8, eol::Uint8, offsets::Array{Int,2})
+dlm_offsets(sbuff::ASCIIString, dlmc, eolc, offsets::Array{Int,2}, ign_adj_dlm::Bool) = dlm_offsets(sbuff.data, uint8(dlmc), uint8(eolc), offsets, ign_adj_dlm)
+function dlm_offsets(dbuff::Vector{Uint8}, dlm::Uint8, eol::Uint8, offsets::Array{Int,2}, ign_adj_dlm::Bool)
     col = 0
     row = 1
     is_default_dlm = (dlm == uint8(invalid_dlm))
     maxrow,maxcol = size(offsets)
     offsets[maxrow,maxcol] = length(dbuff)
+    got_data = false
     for idx in 1:length(dbuff)
         val = dbuff[idx]
-        (val != eol) && (is_default_dlm ? !in(val, _default_delims) : (val != dlm)) && continue
-        col += 1
-        offsets[row,col] = idx-1
+        (val != eol) && (is_default_dlm ? !in(val, _default_delims) : (val != dlm)) && (got_data = true) && continue
+        if got_data || !ign_adj_dlm
+            col += 1
+            offsets[row,col] = idx-1
+        end
         (row >= maxrow) && (col == maxcol) && break
         (val == eol) && (row += 1; col = 0)
+        got_data = false
     end
 end
 
-dlm_dims(s::ASCIIString, eol::Char, dlm::Char) = dlm_dims(s.data, uint8(eol), uint8(dlm))
-function dlm_dims{T,D}(dbuff::T, eol::D, dlm::D)
-    isa(dbuff, UTF8String) && isascii(eol) && isascii(dlm) && (return dlm_dims(dbuff.data, uint8(eol), uint8(dlm)))
+dlm_dims(s::ASCIIString, eol::Char, dlm::Char, ign_adj_dlm::Bool) = dlm_dims(s.data, uint8(eol), uint8(dlm), ign_adj_dlm)
+function dlm_dims{T,D}(dbuff::T, eol::D, dlm::D, ign_adj_dlm::Bool)
+    isa(dbuff, UTF8String) && isascii(eol) && isascii(dlm) && (return dlm_dims(dbuff.data, uint8(eol), uint8(dlm), ign_adj_dlm))
     ncols = nrows = col = 0
     is_default_dlm = (dlm == convert(D, invalid_dlm))
     try
+        got_data = false
         for val in dbuff
-            (val != eol) && (is_default_dlm ? !in(val, _default_delims) : (val != dlm)) && continue
-            col += 1
+            (val != eol) && (is_default_dlm ? !in(val, _default_delims) : (val != dlm)) && (got_data = true) && continue
+            (got_data || !ign_adj_dlm) && (col += 1)
             (val == eol) && (nrows += 1; ncols = max(ncols, col); col = 0)
+            got_data = false
         end
     catch ex
         error("at row $nrows, column $col : $ex)")

diff --git a/doc/helpdb.jl b/doc/helpdb.jl
@@ -2536,25 +2536,34 @@
 
 "),
 
-("Text I/O","Base","readdlm","readdlm(source, delim::Char; has_header=false, use_mmap=false, ignore_invalid_chars=false)
+("Text I/O","Base","readdlm","readdlm(source, delim::Char, T::Type, eol::Char; has_header=false, use_mmap=false, ignore_invalid_chars=false)
 
-   Read a matrix from the source where each line gives one row, with
-   elements separated by the given delimeter. The source can be a text
-   file, stream or byte array. Memory mapped filed can be used by
-   passing the byte array representation of the mapped segment as
-   source.
+   Read a matrix from the source where each line (separated by \"eol\") 
+   gives one row, with elements separated by the given delimiter. The 
+   source can be a text file, stream or byte array. Memory mapped files 
+   can be used by passing the byte array representation of the mapped 
+   segment as source. 
 
-   If \"has_header\" is \"true\" the first row of data would be read
+   If \"T\" is a numeric type, the result is an array of that type, 
+   with any non-numeric elements as \"NaN\" for floating-point types, 
+   or zero. Other useful values of \"T\" include \"ASCIIString\", 
+   \"String\", and \"Any\".
+
+   If \"has_header\" is \"true\", the first row of data is read
    as headers and the tuple \"(data_cells, header_cells)\" is returned
    instead of only \"data_cells\".
 
-   If \"use_mmap\" is \"true\" the file specified by \"source\" is
+   If \"use_mmap\" is \"true\", the file specified by \"source\" is
    memory mapped for potential speedups.
 
-   If \"ignore_invalid_chars\" is \"true\" bytes in \"source\" with
+   If \"ignore_invalid_chars\" is \"true\", bytes in \"source\" with
    invalid character encoding will be ignored. Otherwise an error is
    thrown indicating the offending character position.
 
+"),
+
+("Text I/O","Base","readdlm","readdlm(source, delim::Char, eol::Char; options...)
+
    If all data is numeric, the result will be a numeric array. If some
    elements cannot be parsed as numbers, a cell array of numbers and
    strings is returned.
@@ -2563,11 +2572,33 @@
 
 ("Text I/O","Base","readdlm","readdlm(source, delim::Char, T::Type; options...)
 
-   Read a matrix from the source with a given element type. If \"T\"
-   is a numeric type, the result is an array of that type, with any
-   non-numeric elements as \"NaN\" for floating-point types, or zero.
-   Other useful values of \"T\" include \"ASCIIString\", \"String\",
-   and \"Any\".
+   The end of line delimiter is taken as \"\\n\". 
+
+"),
+
+("Text I/O","Base","readdlm","readdlm(source, delim::Char; options...)
+
+   The end of line delimiter is taken as \"\\n\". If all data is 
+   numeric, the result will be a numeric array. If some elements 
+   cannot be parsed as numbers, a cell array of numbers and 
+   strings is returned.
+
+"),
+
+("Text I/O","Base","readdlm","readdlm(source, T::Type; options...)
+
+   The columns are assumed to be separated by one or more whitespace
+   characters. The end of line delimiter is taken as \"\\n\".
+
+"),
+
+("Text I/O","Base","readdlm","readdlm(source; options...)
+
+   The columns are assumed to be separated by one or more whitespace
+   characters. The end of line delimiter is taken as \"\\n\". If all
+   data is numeric, the result will be a numeric array. If some
+   elements cannot be parsed as numbers, a cell array of numbers and
+   strings is returned.
 
 "),
 

diff --git a/doc/stdlib/base.rst b/doc/stdlib/base.rst
@@ -1664,22 +1664,37 @@ Text I/O
 
    Create an iterable object that will yield each line from a stream.
 
-.. function:: readdlm(source, delim::Char; has_header=false, use_mmap=false, ignore_invalid_chars=false)
+.. function:: readdlm(source, delim::Char, T::Type, eol::Char; has_header=false, use_mmap=false, ignore_invalid_chars=false)
 
-   Read a matrix from the source where each line gives one row, with elements separated by the given delimeter. The source can be a text file, stream or byte array. Memory mapped filed can be used by passing the byte array representation of the mapped segment as source. 
+   Read a matrix from the source where each line (separated by ``eol``) gives one row, with elements separated by the given delimiter. The source can be a text file, stream or byte array. Memory mapped files can be used by passing the byte array representation of the mapped segment as source. 
 
-   If ``has_header`` is ``true`` the first row of data would be read as headers and the tuple ``(data_cells, header_cells)`` is returned instead of only ``data_cells``.
+   If ``T`` is a numeric type, the result is an array of that type, with any non-numeric elements as ``NaN`` for floating-point types, or zero. Other useful values of ``T`` include ``ASCIIString``, ``String``, and ``Any``.
 
-   If ``use_mmap`` is ``true`` the file specified by ``source`` is memory mapped for potential speedups.
+   If ``has_header`` is ``true``, the first row of data is read as headers and the tuple ``(data_cells, header_cells)`` is returned instead of only ``data_cells``.
 
-   If ``ignore_invalid_chars`` is ``true`` bytes in ``source`` with invalid character encoding will be ignored. Otherwise an error is thrown indicating the offending character position.
+   If ``use_mmap`` is ``true``, the file specified by ``source`` is memory mapped for potential speedups.
 
-   If all data is numeric, the result will be a numeric array. If some elements cannot be parsed as numbers, a cell array of numbers and strings is returned.
+   If ``ignore_invalid_chars`` is ``true``, bytes in ``source`` with invalid character encoding will be ignored. Otherwise an error is thrown indicating the offending character position.
 
+.. function:: readdlm(source, delim::Char, eol::Char; options...)
 
+   If all data is numeric, the result will be a numeric array. If some elements cannot be parsed as numbers, a cell array of numbers and strings is returned.
+
 .. function:: readdlm(source, delim::Char, T::Type; options...)
 
-   Read a matrix from the source with a given element type. If ``T`` is a numeric type, the result is an array of that type, with any non-numeric elements as ``NaN`` for floating-point types, or zero. Other useful values of ``T`` include ``ASCIIString``, ``String``, and ``Any``.
+   The end of line delimiter is taken as ``\n``.
+
+.. function:: readdlm(source, delim::Char; options...)
+
+   The end of line delimiter is taken as ``\n``. If all data is numeric, the result will be a numeric array. If some elements cannot be parsed as numbers, a cell array of numbers and strings is returned.
+
+.. function:: readdlm(source, T::Type; options...)
+
+   The columns are assumed to be separated by one or more whitespace characters. The end of line delimiter is taken as ``\n``.
+
+.. function:: readdlm(source; options...)
+
+   The columns are assumed to be separated by one or more whitespace characters. The end of line delimiter is taken as ``\n``. If all data is numeric, the result will be a numeric array. If some elements cannot be parsed as numbers, a cell array of numbers and strings is returned.
 
 .. function:: writedlm(f, A, delim='\t')
 

diff --git a/test/readdlm.jl b/test/readdlm.jl
@@ -13,13 +13,89 @@ dlm_data = readdlm(joinpath("perf", "kernel", "imdb-1.tsv"), '\t')
 @test size(readcsv(IOBuffer("1,2,3,4\n1,2,3,"))) == (2,4)
 @test size(readcsv(IOBuffer("1,2,3,4\n1,2,3"))) == (2,4)
 
+@test size(readcsv(IOBuffer("1,2,3,4\r\n"))) == (1,4)
+@test size(readcsv(IOBuffer("1,2,3,4\r\n1,2,3\r\n"))) == (2,4)
+@test size(readcsv(IOBuffer("1,2,3,4\r\n1,2,3,4\r\n"))) == (2,4)
+
 @test size(readdlm(IOBuffer("1 2 3 4\n1 2 3"))) == (2,4)
 @test size(readdlm(IOBuffer("1\t2 3 4\n1 2 3"))) == (2,4)
-@test size(readdlm(IOBuffer("1\t 2 3 4\n1 2 3"))) == (2,5)
-@test size(readdlm(IOBuffer("1\t 2 3 4\n1 2 3\n"))) == (2,5)
+@test size(readdlm(IOBuffer("1\t 2 3 4\n1 2 3"))) == (2,4)
+@test size(readdlm(IOBuffer("1\t 2 3 4\n1 2 3\n"))) == (2,4)
+@test size(readdlm(IOBuffer("1,,2,3,4\n1,2,3\n"), ',')) == (2,5)
+
+result1 = reshape({"", "", "", "", "", "", 1.0, 1.0, "", "", "", "", "", 1.0, 2.0, "", 3.0, "", "", "", "", "", 4.0, "", "", ""}, 2, 13)
+result2 = reshape({1.0, 1.0, 2.0, 1.0, 3.0, "", 4.0, ""}, 2, 4)
+
+@test readdlm(IOBuffer(",,,1,,,,2,3,,,4,\n,,,1,,,1\n"), ',') == result1
+@test readdlm(IOBuffer("   1    2 3   4 \n   1   1\n")) == result2
+
+result1[1,4] = "भारत" 
+@test readdlm(IOBuffer(",,,भारत,,,,2,3,,,4,\n,,,1,,,1\n"), ',') == result1
 
 let x = [1,2,3], y = [4,5,6], io = IOBuffer()
     writedlm(io, zip(x,y), ",  ")
     seek(io, 0)
     @test readcsv(io) == [x y]
 end
+
+
+# source: http://www.i18nguy.com/unicode/unicode-example-utf8.zip
+i18n_data = ["Origin (English)", "Name (English)", "Origin (Native)", "Name (Native)", 
+"Australia", "Nicole Kidman", "Australia", "Nicole Kidman", 
+"Austria", "Johann Strauss", "Österreich", "Johann Strauß", 
+"Belgium (Flemish)", "Rene Magritte", "België", "René Magritte", 
+"Belgium (French)", "Rene Magritte", "Belgique", "René Magritte", 
+"Belgium (German)", "Rene Magritte", "Belgien", "René Magritte", 
+"Bhutan", "Gonpo Dorji", "འབྲུག་ཡུལ།", "མགོན་པོ་རྡོ་རྗེ།", 
+"Canada", "Celine Dion", "Canada", "Céline Dion", 
+"Canada - Nunavut (Inuktitut)", "Susan Aglukark", "ᓄᓇᕗᒻᒥᐅᑦ", "ᓱᓴᓐ ᐊᒡᓗᒃᑲᖅ", 
+"Democratic People's Rep. of Korea", "LEE Sol-Hee", "조선 민주주의 인민 공화국", "이설희", 
+"Denmark", "Soren Hauch-Fausboll", "Danmark", "Søren Hauch-Fausbøll", 
+"Denmark", "Soren Kierkegaard", "Danmark", "Søren Kierkegård", 
+"Egypt", "Abdel Halim Hafez", "ﻣﺼﺮ", "ﻋﺑﺪﺍﻠﺣﻟﻳﻢ ﺤﺎﻓﻅ", 
+"Egypt", "Om Kolthoum", "ﻣﺼﺮ", "ﺃﻡ ﻛﻟﺛﻭﻡ", 
+"Eritrea", "Berhane Zeray", "ብርሃነ ዘርኣይ", "ኤርትራ", 
+"Ethiopia", "Haile Gebreselassie", "ኃይሌ ገብረሥላሴ", "ኢትዮጵያ", 
+"France", "Gerard Depardieu", "France", "Gérard Depardieu", 
+"France", "Jean Reno", "France", "Jean Réno", 
+"France", "Camille Saint-Saens", "France", "Camille Saint-Saëns", 
+"France", "Mylene Demongeot", "France", "Mylène Demongeot", 
+"France", "Francois Truffaut", "France", "François Truffaut", 
+"France (Braille)", "Louis Braille", "⠋⠗⠁⠝⠉⠑", "⠇⠕⠥⠊⠎⠀<BR>⠃⠗⠁⠊⠇⠇⠑", 
+"Georgia", "Eduard Shevardnadze", "საქართველო", "ედუარდ შევარდნაძე", 
+"Germany", "Rudi Voeller", "Deutschland", "Rudi Völler", 
+"Germany", "Walter Schultheiss", "Deutschland", "Walter Schultheiß", 
+"Greece", "Giorgos Dalaras", "Ελλάς", "Γιώργος Νταλάρας", 
+"Iceland", "Bjork Gudmundsdottir", "Ísland", "Björk Guðmundsdóttir", 
+"India (Hindi)", "Madhuri Dixit", "भारत", "माधुरी दिछित", 
+"Ireland", "Sinead O'Connor", "Éire", "Sinéad O'Connor", 
+"Israel", "Yehoram Gaon", "ישראל", "יהורם גאון", 
+"Italy", "Fabrizio DeAndre", "Italia", "Fabrizio De André", 
+"Japan", "KUBOTA Toshinobu", "日本", "久保田    利伸", 
+"Japan", "HAYASHIBARA Megumi", "日本", "林原 めぐみ", 
+"Japan", "Mori Ogai", "日本", "森鷗外", 
+"Japan", "Tex Texin", "日本", "テクス テクサン", 
+"Norway", "Tor Age Bringsvaerd", "Noreg", "Tor Åge Bringsværd", 
+"Pakistan (Urdu)", "Nusrat Fatah Ali Khan", "پاکستان", "نصرت فتح علی خان", 
+"People's Rep. of China", "ZHANG Ziyi", "中国", "章子怡", 
+"People's Rep. of China", "WONG Faye", "中国", "王菲", 
+"Poland", "Lech Walesa", "Polska", "Lech Wałęsa", 
+"Puerto Rico", "Olga Tanon", "Puerto Rico", "Olga Tañón", 
+"Rep. of China", "Hsu Chi", "臺灣", "舒淇", 
+"Rep. of China", "Ang Lee", "臺灣", "李安", 
+"Rep. of Korea", "AHN Sung-Gi", "한민국", "안성기", 
+"Rep. of Korea", "SHIM Eun-Ha", "한민국", "심은하", 
+"Russia", "Mikhail Gorbachev", "Россия", "Михаил Горбачёв", 
+"Russia", "Boris Grebenshchikov", "Россия", "Борис Гребенщиков", 
+"Slovenia", "\"Frane \"\"Jezek\"\" Milcinski", "Slovenija", "Frane Milčinski - Ježek", 
+"Syracuse (Sicily)", "Archimedes", "Συρακούσα", "Ἀρχιμήδης", 
+"Thailand", "Thongchai McIntai", "ประเทศไทย", "ธงไชย แม็คอินไตย์", 
+"U.S.A.", "Brad Pitt", "U.S.A.", "Brad Pitt", 
+"Yugoslavia (Cyrillic)", "Djordje Balasevic", "Југославија", "Ђорђе Балашевић", 
+"Yugoslavia (Latin)", "Djordje Balasevic", "Jugoslavija", "Đorđe Balašević"]
+
+i18n_arr = transpose(reshape(i18n_data, 4, int(floor(length(i18n_data)/4))))
+i18n_buff = PipeBuffer()
+writedlm(i18n_buff, i18n_arr, ',')
+@test i18n_arr == readcsv(i18n_buff)
+