added readdlm option to ignore empty columns.

updated tests and docs. fixes JuliaLang#5391
tanmaykm · Jan 15, 2014 · 43d5f72 · 43d5f72
1 parent 714fa07
commit 43d5f72
Show file tree

Hide file tree

Showing 4 changed files with 125 additions and 30 deletions.
diff --git a/base/datafmt.jl b/base/datafmt.jl
@@ -68,21 +68,23 @@ function ascii_if_possible(sbuff::String)
 end
 
 function readdlm_string(sbuff::String, dlm::Char, T::Type, eol::Char, auto::Bool, optsd::Dict)
+    ign_empty = get(optsd, :ignore_empty_columns, false)
+
     nrows,ncols = try
-            dlm_dims(sbuff, eol, dlm)
+            dlm_dims(sbuff, eol, dlm, ign_empty)
         catch ex
             !get(optsd, :ignore_invalid_chars, false) && throw(ex)
             sbuff = ascii_if_possible(convert(typeof(sbuff), sbuff.data, ""))
-            dlm_dims(sbuff, eol, dlm)
+            dlm_dims(sbuff, eol, dlm, ign_empty)
         end
     offsets = zeros(Int, nrows, ncols)
     has_header = get(optsd, :has_header, false)
     cells = Array(T, has_header ? nrows-1 : nrows, ncols)
-    dlm_offsets(sbuff, dlm, eol, offsets)
-    has_header ? (dlm_fill(cells, offsets, sbuff, auto, 1, eol), dlm_fill(Array(String, 1, ncols), offsets, sbuff, auto, 0, eol)) : dlm_fill(cells, offsets, sbuff, auto, 0, eol)
+    dlm_offsets(sbuff, dlm, eol, offsets, ign_empty)
+    has_header ? (dlm_fill(cells, offsets, sbuff, auto, 1, dlm, eol, ign_empty), dlm_fill(Array(String, 1, ncols), offsets, sbuff, auto, 0, dlm, eol, ign_empty)) : dlm_fill(cells, offsets, sbuff, auto, 0, dlm, eol, ign_empty)
 end
 
-const valid_opts = [:has_header, :ignore_invalid_chars, :use_mmap]
+const valid_opts = [:has_header, :ignore_invalid_chars, :use_mmap, :ignore_empty_columns]
 function val_opts(opts)
     d = Dict{Symbol,Bool}()
     for opt in opts
@@ -102,7 +104,7 @@ function dlm_col_begin(ncols::Int, offsets::Array{Int,2}, row::Int, col::Int)
     (ret == 0) ? dlm_col_begin(ncols, offsets, pp_row, pp_col) : (ret+2)
 end
 
-function dlm_fill{T}(cells::Array{T,2}, offsets::Array{Int,2}, sbuff::String, auto::Bool, row_offset::Int, eol::Char)
+function dlm_fill{T}(cells::Array{T,2}, offsets::Array{Int,2}, sbuff::String, auto::Bool, row_offset::Int, dlm::Char, eol::Char, ign_adj_dlm::Bool)
     maxrow,maxcol = size(cells)
     tmp64 = Array(Float64,1)
 
@@ -114,6 +116,15 @@ function dlm_fill{T}(cells::Array{T,2}, offsets::Array{Int,2}, sbuff::String, au
 
             end_idx = prevind(sbuff, nextind(sbuff,end_pos))
             (col == maxcol) && (end_idx > 0) && ('\n' == eol) && ('\r' == sbuff[end_idx]) && (end_idx = prevind(sbuff, end_idx))
+            if ign_adj_dlm
+                is_default_dlm = (dlm == invalid_dlm)
+                l = length(sbuff)
+                while start_pos <= l
+                    val = sbuff[start_pos] 
+                    (is_default_dlm ? !in(val, _default_delims) : (val != dlm)) && break
+                    start_pos = nextind(sbuff, start_pos)
+                end
+            end
             sval = SubString(sbuff, start_pos, end_idx)
 
             if T <: Char
@@ -123,7 +134,7 @@ function dlm_fill{T}(cells::Array{T,2}, offsets::Array{Int,2}, sbuff::String, au
                 if float64_isvalid(sval, tmp64)
                     cells[cell_row,col] = tmp64[1]
                 elseif auto
-                    return dlm_fill(Array(Any,maxrow,maxcol), offsets, sbuff, false, row_offset, eol)
+                    return dlm_fill(Array(Any,maxrow,maxcol), offsets, sbuff, false, row_offset, dlm, eol, ign_adj_dlm)
                 else
                     cells[cell_row,col] = NaN
                 end
@@ -140,52 +151,62 @@ function dlm_fill{T}(cells::Array{T,2}, offsets::Array{Int,2}, sbuff::String, au
 end
 
 
-function dlm_offsets(sbuff::UTF8String, dlm, eol, offsets::Array{Int,2})
-    isascii(dlm) && isascii(eol) && (return dlm_offsets(sbuff.data, uint8(dlm), uint8(eol), offsets))
+function dlm_offsets(sbuff::UTF8String, dlm, eol, offsets::Array{Int,2}, ign_adj_dlm::Bool)
+    isascii(dlm) && isascii(eol) && (return dlm_offsets(sbuff.data, uint8(dlm), uint8(eol), offsets, ign_adj_dlm))
 
     col = 0
     row = 1
     maxrow,maxcol = size(offsets)
     offsets[maxrow,maxcol] = length(sbuff.data)
     idx = 1
     is_default_dlm = (dlm == invalid_dlm)
+    got_data = false
     while(idx <= length(sbuff.data))
         val,idx = next(sbuff, idx)
-        (val != eol) && (is_default_dlm ? !in(val, _default_delims) : (val != dlm)) && continue
-        col += 1
-        offsets[row,col] = idx-2
+        (val != eol) && (is_default_dlm ? !in(val, _default_delims) : (val != dlm)) && (got_data = true) && continue
+        if got_data || !ign_adj_dlm
+            col += 1
+            offsets[row,col] = idx-2
+        end
         (row >= maxrow) && (col == maxcol) && break
         (val == eol) && (row += 1; col = 0)
+        got_data = false
     end
 end
 
-dlm_offsets(sbuff::ASCIIString, dlmc, eolc, offsets::Array{Int,2}) = dlm_offsets(sbuff.data, uint8(dlmc), uint8(eolc), offsets)
-function dlm_offsets(dbuff::Vector{Uint8}, dlm::Uint8, eol::Uint8, offsets::Array{Int,2})
+dlm_offsets(sbuff::ASCIIString, dlmc, eolc, offsets::Array{Int,2}, ign_adj_dlm::Bool) = dlm_offsets(sbuff.data, uint8(dlmc), uint8(eolc), offsets, ign_adj_dlm)
+function dlm_offsets(dbuff::Vector{Uint8}, dlm::Uint8, eol::Uint8, offsets::Array{Int,2}, ign_adj_dlm::Bool)
     col = 0
     row = 1
     is_default_dlm = (dlm == uint8(invalid_dlm))
     maxrow,maxcol = size(offsets)
     offsets[maxrow,maxcol] = length(dbuff)
+    got_data = false
     for idx in 1:length(dbuff)
         val = dbuff[idx]
-        (val != eol) && (is_default_dlm ? !in(val, _default_delims) : (val != dlm)) && continue
-        col += 1
-        offsets[row,col] = idx-1
+        (val != eol) && (is_default_dlm ? !in(val, _default_delims) : (val != dlm)) && (got_data = true) && continue
+        if got_data || !ign_adj_dlm
+            col += 1
+            offsets[row,col] = idx-1
+        end
         (row >= maxrow) && (col == maxcol) && break
         (val == eol) && (row += 1; col = 0)
+        got_data = false
     end
 end
 
-dlm_dims(s::ASCIIString, eol::Char, dlm::Char) = dlm_dims(s.data, uint8(eol), uint8(dlm))
-function dlm_dims{T,D}(dbuff::T, eol::D, dlm::D)
-    isa(dbuff, UTF8String) && isascii(eol) && isascii(dlm) && (return dlm_dims(dbuff.data, uint8(eol), uint8(dlm)))
+dlm_dims(s::ASCIIString, eol::Char, dlm::Char, ign_adj_dlm::Bool) = dlm_dims(s.data, uint8(eol), uint8(dlm), ign_adj_dlm)
+function dlm_dims{T,D}(dbuff::T, eol::D, dlm::D, ign_adj_dlm::Bool)
+    isa(dbuff, UTF8String) && isascii(eol) && isascii(dlm) && (return dlm_dims(dbuff.data, uint8(eol), uint8(dlm), ign_adj_dlm))
     ncols = nrows = col = 0
     is_default_dlm = (dlm == convert(D, invalid_dlm))
     try
+        got_data = false
         for val in dbuff
-            (val != eol) && (is_default_dlm ? !in(val, _default_delims) : (val != dlm)) && continue
-            col += 1
+            (val != eol) && (is_default_dlm ? !in(val, _default_delims) : (val != dlm)) && (got_data = true) && continue
+            (got_data || !ign_adj_dlm) && (col += 1)
             (val == eol) && (nrows += 1; ncols = max(ncols, col); col = 0)
+            got_data = false
         end
     catch ex
         error("at row $nrows, column $col : $ex)")

diff --git a/doc/helpdb.jl b/doc/helpdb.jl
@@ -2536,25 +2536,29 @@
 
 "),
 
-("Text I/O","Base","readdlm","readdlm(source, delim::Char; has_header=false, use_mmap=false, ignore_invalid_chars=false)
+("Text I/O","Base","readdlm","readdlm(source, delim::Char; has_header=false, use_mmap=false, ignore_invalid_chars=false, ignore_empty_columns=false)
 
    Read a matrix from the source where each line gives one row, with
    elements separated by the given delimiter. The source can be a text
    file, stream or byte array. Memory mapped files can be used by
    passing the byte array representation of the mapped segment as
    source.
 
-   If \"has_header\" is \"true\" the first row of data would be read
+   If \"has_header\" is \"true\", the first row of data would be read
    as headers and the tuple \"(data_cells, header_cells)\" is returned
    instead of only \"data_cells\".
 
-   If \"use_mmap\" is \"true\" the file specified by \"source\" is
+   If \"use_mmap\" is \"true\", the file specified by \"source\" is
    memory mapped for potential speedups.
 
-   If \"ignore_invalid_chars\" is \"true\" bytes in \"source\" with
+   If \"ignore_invalid_chars\" is \"true\", bytes in \"source\" with
    invalid character encoding will be ignored. Otherwise an error is
    thrown indicating the offending character position.
 
+   If \"ignore_empty_columns\" is \"true\", adjoining column 
+   delimiters will be squashed instead of being treated as cells with 
+   empty strings.
+
    If all data is numeric, the result will be a numeric array. If some
    elements cannot be parsed as numbers, a cell array of numbers and
    strings is returned.

diff --git a/doc/stdlib/base.rst b/doc/stdlib/base.rst
@@ -1664,15 +1664,17 @@ Text I/O
 
    Create an iterable object that will yield each line from a stream.
 
-.. function:: readdlm(source, delim::Char; has_header=false, use_mmap=false, ignore_invalid_chars=false)
+.. function:: readdlm(source, delim::Char; has_header=false, use_mmap=false, ignore_invalid_chars=false, ignore_empty_columns=false)
 
    Read a matrix from the source where each line gives one row, with elements separated by the given delimiter. The source can be a text file, stream or byte array. Memory mapped files can be used by passing the byte array representation of the mapped segment as source. 
 
-   If ``has_header`` is ``true`` the first row of data would be read as headers and the tuple ``(data_cells, header_cells)`` is returned instead of only ``data_cells``.
+   If ``has_header`` is ``true``, the first row of data would be read as headers and the tuple ``(data_cells, header_cells)`` is returned instead of only ``data_cells``.
 
-   If ``use_mmap`` is ``true`` the file specified by ``source`` is memory mapped for potential speedups.
+   If ``use_mmap`` is ``true``, the file specified by ``source`` is memory mapped for potential speedups.
 
-   If ``ignore_invalid_chars`` is ``true`` bytes in ``source`` with invalid character encoding will be ignored. Otherwise an error is thrown indicating the offending character position.
+   If ``ignore_invalid_chars`` is ``true``, bytes in ``source`` with invalid character encoding will be ignored. Otherwise an error is thrown indicating the offending character position.
+
+   If ``ignore_empty_columns`` is ``true``, adjoining column delimiters will be squashed instead of being treated as cells with empty strings.
 
    If all data is numeric, the result will be a numeric array. If some elements cannot be parsed as numbers, a cell array of numbers and strings is returned.
 

diff --git a/test/readdlm.jl b/test/readdlm.jl
@@ -13,13 +13,81 @@ dlm_data = readdlm(joinpath("perf", "kernel", "imdb-1.tsv"), '\t')
 @test size(readcsv(IOBuffer("1,2,3,4\n1,2,3,"))) == (2,4)
 @test size(readcsv(IOBuffer("1,2,3,4\n1,2,3"))) == (2,4)
 
+@test size(readcsv(IOBuffer("1,2,3,4\r\n"))) == (1,4)
+@test size(readcsv(IOBuffer("1,2,3,4\r\n1,2,3\r\n"))) == (2,4)
+@test size(readcsv(IOBuffer("1,2,3,4\r\n1,2,3,4\r\n"))) == (2,4)
+
 @test size(readdlm(IOBuffer("1 2 3 4\n1 2 3"))) == (2,4)
 @test size(readdlm(IOBuffer("1\t2 3 4\n1 2 3"))) == (2,4)
 @test size(readdlm(IOBuffer("1\t 2 3 4\n1 2 3"))) == (2,5)
 @test size(readdlm(IOBuffer("1\t 2 3 4\n1 2 3\n"))) == (2,5)
 
+@test size(readdlm(IOBuffer("1\t 2 3 4\n1 2 3\n"); ignore_empty_columns=true)) == (2,4)
+
 let x = [1,2,3], y = [4,5,6], io = IOBuffer()
     writedlm(io, zip(x,y), ",  ")
     seek(io, 0)
     @test readcsv(io) == [x y]
 end
+
+
+# source: http://www.i18nguy.com/unicode/unicode-example-utf8.zip
+i18n_data = ["Origin (English)", "Name (English)", "Origin (Native)", "Name (Native)", 
+"Australia", "Nicole Kidman", "Australia", "Nicole Kidman", 
+"Austria", "Johann Strauss", "Österreich", "Johann Strauß", 
+"Belgium (Flemish)", "Rene Magritte", "België", "René Magritte", 
+"Belgium (French)", "Rene Magritte", "Belgique", "René Magritte", 
+"Belgium (German)", "Rene Magritte", "Belgien", "René Magritte", 
+"Bhutan", "Gonpo Dorji", "འབྲུག་ཡུལ།", "མགོན་པོ་རྡོ་རྗེ།", 
+"Canada", "Celine Dion", "Canada", "Céline Dion", 
+"Canada - Nunavut (Inuktitut)", "Susan Aglukark", "ᓄᓇᕗᒻᒥᐅᑦ", "ᓱᓴᓐ ᐊᒡᓗᒃᑲᖅ", 
+"Democratic People's Rep. of Korea", "LEE Sol-Hee", "조선 민주주의 인민 공화국", "이설희", 
+"Denmark", "Soren Hauch-Fausboll", "Danmark", "Søren Hauch-Fausbøll", 
+"Denmark", "Soren Kierkegaard", "Danmark", "Søren Kierkegård", 
+"Egypt", "Abdel Halim Hafez", "ﻣﺼﺮ", "ﻋﺑﺪﺍﻠﺣﻟﻳﻢ ﺤﺎﻓﻅ", 
+"Egypt", "Om Kolthoum", "ﻣﺼﺮ", "ﺃﻡ ﻛﻟﺛﻭﻡ", 
+"Eritrea", "Berhane Zeray", "ብርሃነ ዘርኣይ", "ኤርትራ", 
+"Ethiopia", "Haile Gebreselassie", "ኃይሌ ገብረሥላሴ", "ኢትዮጵያ", 
+"France", "Gerard Depardieu", "France", "Gérard Depardieu", 
+"France", "Jean Reno", "France", "Jean Réno", 
+"France", "Camille Saint-Saens", "France", "Camille Saint-Saëns", 
+"France", "Mylene Demongeot", "France", "Mylène Demongeot", 
+"France", "Francois Truffaut", "France", "François Truffaut", 
+"France (Braille)", "Louis Braille", "⠋⠗⠁⠝⠉⠑", "⠇⠕⠥⠊⠎⠀<BR>⠃⠗⠁⠊⠇⠇⠑", 
+"Georgia", "Eduard Shevardnadze", "საქართველო", "ედუარდ შევარდნაძე", 
+"Germany", "Rudi Voeller", "Deutschland", "Rudi Völler", 
+"Germany", "Walter Schultheiss", "Deutschland", "Walter Schultheiß", 
+"Greece", "Giorgos Dalaras", "Ελλάς", "Γιώργος Νταλάρας", 
+"Iceland", "Bjork Gudmundsdottir", "Ísland", "Björk Guðmundsdóttir", 
+"India (Hindi)", "Madhuri Dixit", "भारत", "माधुरी दिछित", 
+"Ireland", "Sinead O'Connor", "Éire", "Sinéad O'Connor", 
+"Israel", "Yehoram Gaon", "ישראל", "יהורם גאון", 
+"Italy", "Fabrizio DeAndre", "Italia", "Fabrizio De André", 
+"Japan", "KUBOTA Toshinobu", "日本", "久保田    利伸", 
+"Japan", "HAYASHIBARA Megumi", "日本", "林原 めぐみ", 
+"Japan", "Mori Ogai", "日本", "森鷗外", 
+"Japan", "Tex Texin", "日本", "テクス テクサン", 
+"Norway", "Tor Age Bringsvaerd", "Noreg", "Tor Åge Bringsværd", 
+"Pakistan (Urdu)", "Nusrat Fatah Ali Khan", "پاکستان", "نصرت فتح علی خان", 
+"People's Rep. of China", "ZHANG Ziyi", "中国", "章子怡", 
+"People's Rep. of China", "WONG Faye", "中国", "王菲", 
+"Poland", "Lech Walesa", "Polska", "Lech Wałęsa", 
+"Puerto Rico", "Olga Tanon", "Puerto Rico", "Olga Tañón", 
+"Rep. of China", "Hsu Chi", "臺灣", "舒淇", 
+"Rep. of China", "Ang Lee", "臺灣", "李安", 
+"Rep. of Korea", "AHN Sung-Gi", "한민국", "안성기", 
+"Rep. of Korea", "SHIM Eun-Ha", "한민국", "심은하", 
+"Russia", "Mikhail Gorbachev", "Россия", "Михаил Горбачёв", 
+"Russia", "Boris Grebenshchikov", "Россия", "Борис Гребенщиков", 
+"Slovenia", "\"Frane \"\"Jezek\"\" Milcinski", "Slovenija", "Frane Milčinski - Ježek", 
+"Syracuse (Sicily)", "Archimedes", "Συρακούσα", "Ἀρχιμήδης", 
+"Thailand", "Thongchai McIntai", "ประเทศไทย", "ธงไชย แม็คอินไตย์", 
+"U.S.A.", "Brad Pitt", "U.S.A.", "Brad Pitt", 
+"Yugoslavia (Cyrillic)", "Djordje Balasevic", "Југославија", "Ђорђе Балашевић", 
+"Yugoslavia (Latin)", "Djordje Balasevic", "Jugoslavija", "Đorđe Balašević"]
+
+i18n_arr = transpose(reshape(i18n_data, 4, int(floor(length(i18n_data)/4))))
+i18n_buff = PipeBuffer()
+writedlm(i18n_buff, i18n_arr, ',')
+@test i18n_arr == readcsv(i18n_buff)
+