Skip to content

Commit

Permalink
added readdlm option to ignore empty columns.
Browse files Browse the repository at this point in the history
updated tests and docs.
fixes JuliaLang#5391
  • Loading branch information
tanmaykm committed Jan 15, 2014
1 parent 714fa07 commit 43d5f72
Show file tree
Hide file tree
Showing 4 changed files with 125 additions and 30 deletions.
65 changes: 43 additions & 22 deletions base/datafmt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -68,21 +68,23 @@ function ascii_if_possible(sbuff::String)
end

function readdlm_string(sbuff::String, dlm::Char, T::Type, eol::Char, auto::Bool, optsd::Dict)
ign_empty = get(optsd, :ignore_empty_columns, false)

nrows,ncols = try
dlm_dims(sbuff, eol, dlm)
dlm_dims(sbuff, eol, dlm, ign_empty)
catch ex
!get(optsd, :ignore_invalid_chars, false) && throw(ex)
sbuff = ascii_if_possible(convert(typeof(sbuff), sbuff.data, ""))
dlm_dims(sbuff, eol, dlm)
dlm_dims(sbuff, eol, dlm, ign_empty)
end
offsets = zeros(Int, nrows, ncols)
has_header = get(optsd, :has_header, false)
cells = Array(T, has_header ? nrows-1 : nrows, ncols)
dlm_offsets(sbuff, dlm, eol, offsets)
has_header ? (dlm_fill(cells, offsets, sbuff, auto, 1, eol), dlm_fill(Array(String, 1, ncols), offsets, sbuff, auto, 0, eol)) : dlm_fill(cells, offsets, sbuff, auto, 0, eol)
dlm_offsets(sbuff, dlm, eol, offsets, ign_empty)
has_header ? (dlm_fill(cells, offsets, sbuff, auto, 1, dlm, eol, ign_empty), dlm_fill(Array(String, 1, ncols), offsets, sbuff, auto, 0, dlm, eol, ign_empty)) : dlm_fill(cells, offsets, sbuff, auto, 0, dlm, eol, ign_empty)
end

const valid_opts = [:has_header, :ignore_invalid_chars, :use_mmap]
const valid_opts = [:has_header, :ignore_invalid_chars, :use_mmap, :ignore_empty_columns]
function val_opts(opts)
d = Dict{Symbol,Bool}()
for opt in opts
Expand All @@ -102,7 +104,7 @@ function dlm_col_begin(ncols::Int, offsets::Array{Int,2}, row::Int, col::Int)
(ret == 0) ? dlm_col_begin(ncols, offsets, pp_row, pp_col) : (ret+2)
end

function dlm_fill{T}(cells::Array{T,2}, offsets::Array{Int,2}, sbuff::String, auto::Bool, row_offset::Int, eol::Char)
function dlm_fill{T}(cells::Array{T,2}, offsets::Array{Int,2}, sbuff::String, auto::Bool, row_offset::Int, dlm::Char, eol::Char, ign_adj_dlm::Bool)
maxrow,maxcol = size(cells)
tmp64 = Array(Float64,1)

Expand All @@ -114,6 +116,15 @@ function dlm_fill{T}(cells::Array{T,2}, offsets::Array{Int,2}, sbuff::String, au

end_idx = prevind(sbuff, nextind(sbuff,end_pos))
(col == maxcol) && (end_idx > 0) && ('\n' == eol) && ('\r' == sbuff[end_idx]) && (end_idx = prevind(sbuff, end_idx))
if ign_adj_dlm
is_default_dlm = (dlm == invalid_dlm)
l = length(sbuff)
while start_pos <= l
val = sbuff[start_pos]
(is_default_dlm ? !in(val, _default_delims) : (val != dlm)) && break
start_pos = nextind(sbuff, start_pos)
end
end
sval = SubString(sbuff, start_pos, end_idx)

if T <: Char
Expand All @@ -123,7 +134,7 @@ function dlm_fill{T}(cells::Array{T,2}, offsets::Array{Int,2}, sbuff::String, au
if float64_isvalid(sval, tmp64)
cells[cell_row,col] = tmp64[1]
elseif auto
return dlm_fill(Array(Any,maxrow,maxcol), offsets, sbuff, false, row_offset, eol)
return dlm_fill(Array(Any,maxrow,maxcol), offsets, sbuff, false, row_offset, dlm, eol, ign_adj_dlm)
else
cells[cell_row,col] = NaN
end
Expand All @@ -140,52 +151,62 @@ function dlm_fill{T}(cells::Array{T,2}, offsets::Array{Int,2}, sbuff::String, au
end


function dlm_offsets(sbuff::UTF8String, dlm, eol, offsets::Array{Int,2})
isascii(dlm) && isascii(eol) && (return dlm_offsets(sbuff.data, uint8(dlm), uint8(eol), offsets))
function dlm_offsets(sbuff::UTF8String, dlm, eol, offsets::Array{Int,2}, ign_adj_dlm::Bool)
isascii(dlm) && isascii(eol) && (return dlm_offsets(sbuff.data, uint8(dlm), uint8(eol), offsets, ign_adj_dlm))

col = 0
row = 1
maxrow,maxcol = size(offsets)
offsets[maxrow,maxcol] = length(sbuff.data)
idx = 1
is_default_dlm = (dlm == invalid_dlm)
got_data = false
while(idx <= length(sbuff.data))
val,idx = next(sbuff, idx)
(val != eol) && (is_default_dlm ? !in(val, _default_delims) : (val != dlm)) && continue
col += 1
offsets[row,col] = idx-2
(val != eol) && (is_default_dlm ? !in(val, _default_delims) : (val != dlm)) && (got_data = true) && continue
if got_data || !ign_adj_dlm
col += 1
offsets[row,col] = idx-2
end
(row >= maxrow) && (col == maxcol) && break
(val == eol) && (row += 1; col = 0)
got_data = false
end
end

dlm_offsets(sbuff::ASCIIString, dlmc, eolc, offsets::Array{Int,2}) = dlm_offsets(sbuff.data, uint8(dlmc), uint8(eolc), offsets)
function dlm_offsets(dbuff::Vector{Uint8}, dlm::Uint8, eol::Uint8, offsets::Array{Int,2})
dlm_offsets(sbuff::ASCIIString, dlmc, eolc, offsets::Array{Int,2}, ign_adj_dlm::Bool) = dlm_offsets(sbuff.data, uint8(dlmc), uint8(eolc), offsets, ign_adj_dlm)
function dlm_offsets(dbuff::Vector{Uint8}, dlm::Uint8, eol::Uint8, offsets::Array{Int,2}, ign_adj_dlm::Bool)
col = 0
row = 1
is_default_dlm = (dlm == uint8(invalid_dlm))
maxrow,maxcol = size(offsets)
offsets[maxrow,maxcol] = length(dbuff)
got_data = false
for idx in 1:length(dbuff)
val = dbuff[idx]
(val != eol) && (is_default_dlm ? !in(val, _default_delims) : (val != dlm)) && continue
col += 1
offsets[row,col] = idx-1
(val != eol) && (is_default_dlm ? !in(val, _default_delims) : (val != dlm)) && (got_data = true) && continue
if got_data || !ign_adj_dlm
col += 1
offsets[row,col] = idx-1
end
(row >= maxrow) && (col == maxcol) && break
(val == eol) && (row += 1; col = 0)
got_data = false
end
end

dlm_dims(s::ASCIIString, eol::Char, dlm::Char) = dlm_dims(s.data, uint8(eol), uint8(dlm))
function dlm_dims{T,D}(dbuff::T, eol::D, dlm::D)
isa(dbuff, UTF8String) && isascii(eol) && isascii(dlm) && (return dlm_dims(dbuff.data, uint8(eol), uint8(dlm)))
dlm_dims(s::ASCIIString, eol::Char, dlm::Char, ign_adj_dlm::Bool) = dlm_dims(s.data, uint8(eol), uint8(dlm), ign_adj_dlm)
function dlm_dims{T,D}(dbuff::T, eol::D, dlm::D, ign_adj_dlm::Bool)
isa(dbuff, UTF8String) && isascii(eol) && isascii(dlm) && (return dlm_dims(dbuff.data, uint8(eol), uint8(dlm), ign_adj_dlm))
ncols = nrows = col = 0
is_default_dlm = (dlm == convert(D, invalid_dlm))
try
got_data = false
for val in dbuff
(val != eol) && (is_default_dlm ? !in(val, _default_delims) : (val != dlm)) && continue
col += 1
(val != eol) && (is_default_dlm ? !in(val, _default_delims) : (val != dlm)) && (got_data = true) && continue
(got_data || !ign_adj_dlm) && (col += 1)
(val == eol) && (nrows += 1; ncols = max(ncols, col); col = 0)
got_data = false
end
catch ex
error("at row $nrows, column $col : $ex)")
Expand Down
12 changes: 8 additions & 4 deletions doc/helpdb.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2536,25 +2536,29 @@
"),

("Text I/O","Base","readdlm","readdlm(source, delim::Char; has_header=false, use_mmap=false, ignore_invalid_chars=false)
("Text I/O","Base","readdlm","readdlm(source, delim::Char; has_header=false, use_mmap=false, ignore_invalid_chars=false, ignore_empty_columns=false)
Read a matrix from the source where each line gives one row, with
elements separated by the given delimiter. The source can be a text
file, stream or byte array. Memory mapped files can be used by
passing the byte array representation of the mapped segment as
source.
If \"has_header\" is \"true\" the first row of data would be read
If \"has_header\" is \"true\", the first row of data would be read
as headers and the tuple \"(data_cells, header_cells)\" is returned
instead of only \"data_cells\".
If \"use_mmap\" is \"true\" the file specified by \"source\" is
If \"use_mmap\" is \"true\", the file specified by \"source\" is
memory mapped for potential speedups.
If \"ignore_invalid_chars\" is \"true\" bytes in \"source\" with
If \"ignore_invalid_chars\" is \"true\", bytes in \"source\" with
invalid character encoding will be ignored. Otherwise an error is
thrown indicating the offending character position.
If \"ignore_empty_columns\" is \"true\", runs of adjoining column
delimiters are treated as a single delimiter instead of producing
cells with empty strings.
If all data is numeric, the result will be a numeric array. If some
elements cannot be parsed as numbers, a cell array of numbers and
strings is returned.
Expand Down
10 changes: 6 additions & 4 deletions doc/stdlib/base.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1664,15 +1664,17 @@ Text I/O

Create an iterable object that will yield each line from a stream.

.. function:: readdlm(source, delim::Char; has_header=false, use_mmap=false, ignore_invalid_chars=false)
.. function:: readdlm(source, delim::Char; has_header=false, use_mmap=false, ignore_invalid_chars=false, ignore_empty_columns=false)

Read a matrix from the source where each line gives one row, with elements separated by the given delimiter. The source can be a text file, stream or byte array. Memory mapped files can be used by passing the byte array representation of the mapped segment as source.

If ``has_header`` is ``true`` the first row of data would be read as headers and the tuple ``(data_cells, header_cells)`` is returned instead of only ``data_cells``.
If ``has_header`` is ``true``, the first row of data would be read as headers and the tuple ``(data_cells, header_cells)`` is returned instead of only ``data_cells``.

If ``use_mmap`` is ``true`` the file specified by ``source`` is memory mapped for potential speedups.
If ``use_mmap`` is ``true``, the file specified by ``source`` is memory mapped for potential speedups.

If ``ignore_invalid_chars`` is ``true`` bytes in ``source`` with invalid character encoding will be ignored. Otherwise an error is thrown indicating the offending character position.
If ``ignore_invalid_chars`` is ``true``, bytes in ``source`` with invalid character encoding will be ignored. Otherwise an error is thrown indicating the offending character position.

If ``ignore_empty_columns`` is ``true``, runs of adjoining column delimiters are treated as a single delimiter instead of producing cells with empty strings.

If all data is numeric, the result will be a numeric array. If some elements cannot be parsed as numbers, a cell array of numbers and strings is returned.

Expand Down
68 changes: 68 additions & 0 deletions test/readdlm.jl
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,81 @@ dlm_data = readdlm(joinpath("perf", "kernel", "imdb-1.tsv"), '\t')
@test size(readcsv(IOBuffer("1,2,3,4\n1,2,3,"))) == (2,4)
@test size(readcsv(IOBuffer("1,2,3,4\n1,2,3"))) == (2,4)

@test size(readcsv(IOBuffer("1,2,3,4\r\n"))) == (1,4)
@test size(readcsv(IOBuffer("1,2,3,4\r\n1,2,3\r\n"))) == (2,4)
@test size(readcsv(IOBuffer("1,2,3,4\r\n1,2,3,4\r\n"))) == (2,4)

@test size(readdlm(IOBuffer("1 2 3 4\n1 2 3"))) == (2,4)
@test size(readdlm(IOBuffer("1\t2 3 4\n1 2 3"))) == (2,4)
@test size(readdlm(IOBuffer("1\t 2 3 4\n1 2 3"))) == (2,5)
@test size(readdlm(IOBuffer("1\t 2 3 4\n1 2 3\n"))) == (2,5)

@test size(readdlm(IOBuffer("1\t 2 3 4\n1 2 3\n"); ignore_empty_columns=true)) == (2,4)

let x = [1,2,3], y = [4,5,6], io = IOBuffer()
writedlm(io, zip(x,y), ", ")
seek(io, 0)
@test readcsv(io) == [x y]
end


# source: http://www.i18nguy.com/unicode/unicode-example-utf8.zip
i18n_data = ["Origin (English)", "Name (English)", "Origin (Native)", "Name (Native)",
"Australia", "Nicole Kidman", "Australia", "Nicole Kidman",
"Austria", "Johann Strauss", "Österreich", "Johann Strauß",
"Belgium (Flemish)", "Rene Magritte", "België", "René Magritte",
"Belgium (French)", "Rene Magritte", "Belgique", "René Magritte",
"Belgium (German)", "Rene Magritte", "Belgien", "René Magritte",
"Bhutan", "Gonpo Dorji", "འབྲུག་ཡུལ།", "མགོན་པོ་རྡོ་རྗེ།",
"Canada", "Celine Dion", "Canada", "Céline Dion",
"Canada - Nunavut (Inuktitut)", "Susan Aglukark", "ᓄᓇᕗᒻᒥᐅᑦ", "ᓱᓴᓐ ᐊᒡᓗᒃᑲᖅ",
"Democratic People's Rep. of Korea", "LEE Sol-Hee", "조선 민주주의 인민 공화국", "이설희",
"Denmark", "Soren Hauch-Fausboll", "Danmark", "Søren Hauch-Fausbøll",
"Denmark", "Soren Kierkegaard", "Danmark", "Søren Kierkegård",
"Egypt", "Abdel Halim Hafez", "ﻣﺼﺮ", "ﻋﺑﺪﺍﻠﺣﻟﻳﻢ ﺤﺎﻓﻅ",
"Egypt", "Om Kolthoum", "ﻣﺼﺮ", "ﺃﻡ ﻛﻟﺛﻭﻡ",
"Eritrea", "Berhane Zeray", "ብርሃነ ዘርኣይ", "ኤርትራ",
"Ethiopia", "Haile Gebreselassie", "ኃይሌ ገብረሥላሴ", "ኢትዮጵያ",
"France", "Gerard Depardieu", "France", "Gérard Depardieu",
"France", "Jean Reno", "France", "Jean Réno",
"France", "Camille Saint-Saens", "France", "Camille Saint-Saëns",
"France", "Mylene Demongeot", "France", "Mylène Demongeot",
"France", "Francois Truffaut", "France", "François Truffaut",
"France (Braille)", "Louis Braille", "⠋⠗⠁⠝⠉⠑", "⠇⠕⠥⠊⠎⠀<BR>⠃⠗⠁⠊⠇⠇⠑",
"Georgia", "Eduard Shevardnadze", "საქართველო", "ედუარდ შევარდნაძე",
"Germany", "Rudi Voeller", "Deutschland", "Rudi Völler",
"Germany", "Walter Schultheiss", "Deutschland", "Walter Schultheiß",
"Greece", "Giorgos Dalaras", "Ελλάς", "Γιώργος Νταλάρας",
"Iceland", "Bjork Gudmundsdottir", "Ísland", "Björk Guðmundsdóttir",
"India (Hindi)", "Madhuri Dixit", "भारत", "माधुरी दिछित",
"Ireland", "Sinead O'Connor", "Éire", "Sinéad O'Connor",
"Israel", "Yehoram Gaon", "ישראל", "יהורם גאון",
"Italy", "Fabrizio DeAndre", "Italia", "Fabrizio De André",
"Japan", "KUBOTA Toshinobu", "日本", "久保田    利伸",
"Japan", "HAYASHIBARA Megumi", "日本", "林原 めぐみ",
"Japan", "Mori Ogai", "日本", "森鷗外",
"Japan", "Tex Texin", "日本", "テクス テクサン",
"Norway", "Tor Age Bringsvaerd", "Noreg", "Tor Åge Bringsværd",
"Pakistan (Urdu)", "Nusrat Fatah Ali Khan", "پاکستان", "نصرت فتح علی خان",
"People's Rep. of China", "ZHANG Ziyi", "中国", "章子怡",
"People's Rep. of China", "WONG Faye", "中国", "王菲",
"Poland", "Lech Walesa", "Polska", "Lech Wałęsa",
"Puerto Rico", "Olga Tanon", "Puerto Rico", "Olga Tañón",
"Rep. of China", "Hsu Chi", "臺灣", "舒淇",
"Rep. of China", "Ang Lee", "臺灣", "李安",
"Rep. of Korea", "AHN Sung-Gi", "한민국", "안성기",
"Rep. of Korea", "SHIM Eun-Ha", "한민국", "심은하",
"Russia", "Mikhail Gorbachev", "Россия", "Михаил Горбачёв",
"Russia", "Boris Grebenshchikov", "Россия", "Борис Гребенщиков",
"Slovenia", "\"Frane \"\"Jezek\"\" Milcinski", "Slovenija", "Frane Milčinski - Ježek",
"Syracuse (Sicily)", "Archimedes", "Συρακούσα", "Ἀρχιμήδης",
"Thailand", "Thongchai McIntai", "ประเทศไทย", "ธงไชย แม็คอินไตย์",
"U.S.A.", "Brad Pitt", "U.S.A.", "Brad Pitt",
"Yugoslavia (Cyrillic)", "Djordje Balasevic", "Југославија", "Ђорђе Балашевић",
"Yugoslavia (Latin)", "Djordje Balasevic", "Jugoslavija", "Đorđe Balašević"]

i18n_arr = transpose(reshape(i18n_data, 4, int(floor(length(i18n_data)/4))))
i18n_buff = PipeBuffer()
writedlm(i18n_buff, i18n_arr, ',')
@test i18n_arr == readcsv(i18n_buff)

0 comments on commit 43d5f72

Please sign in to comment.