Skip to content

Commit

Permalink
for whitespace delimited files, adjoining delimiters are treated as s…
Browse files Browse the repository at this point in the history
…ingle delimiter.

fixed bug in handling empty columns.
updated tests and docs.
fixes JuliaLang#5391
  • Loading branch information
tanmaykm committed Jan 16, 2014
1 parent 7eab096 commit ee61b45
Show file tree
Hide file tree
Showing 4 changed files with 188 additions and 45 deletions.
65 changes: 43 additions & 22 deletions base/datafmt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -68,18 +68,20 @@ function ascii_if_possible(sbuff::String)
end

# Parse the delimited text held in `sbuff` into a 2-d array whose element type is `T`.
#   dlm   - column delimiter character
#   eol   - row terminator character
#   auto  - forwarded unchanged to `dlm_fill`
#   optsd - options dictionary; `:ignore_invalid_chars` and `:has_header` are read here
# The value of the final `has_header ? ... : ...` expression is the function's result:
# a tuple of two `dlm_fill` results (data cells, header cells) when `has_header` is set,
# otherwise the single `dlm_fill` result for all rows.
# NOTE(review): this listing is a diff without +/- markers, so several calls appear twice,
# once without and once with the extra `ign_empty` / `dlm` arguments. Presumably the shorter
# form is the removed pre-commit line; confirm against the repository before reusing this text.
function readdlm_string(sbuff::String, dlm::Char, T::Type, eol::Char, auto::Bool, optsd::Dict)
# True when the delimiter is the `invalid_dlm` sentinel or a whitespace character; passed
# on to `dlm_dims`, `dlm_offsets` and `dlm_fill` (their `ign_adj_dlm` parameter).
ign_empty = (dlm == invalid_dlm) || isspace(dlm)

# Size the table. If sizing throws, rethrow unless `:ignore_invalid_chars` was requested;
# otherwise rebuild the buffer from its raw bytes and size it again.
nrows,ncols = try
dlm_dims(sbuff, eol, dlm)
dlm_dims(sbuff, eol, dlm, ign_empty)
catch ex
!get(optsd, :ignore_invalid_chars, false) && throw(ex)
# presumably the "" argument replaces invalid byte sequences with nothing - TODO confirm
sbuff = ascii_if_possible(convert(typeof(sbuff), sbuff.data, ""))
dlm_dims(sbuff, eol, dlm)
dlm_dims(sbuff, eol, dlm, ign_empty)
end
# One offset slot per (row, column) cell, populated by `dlm_offsets` below.
offsets = zeros(Int, nrows, ncols)
has_header = get(optsd, :has_header, false)
# With a header the data array has one row fewer than the input.
cells = Array(T, has_header ? nrows-1 : nrows, ncols)
dlm_offsets(sbuff, dlm, eol, offsets)
has_header ? (dlm_fill(cells, offsets, sbuff, auto, 1, eol), dlm_fill(Array(String, 1, ncols), offsets, sbuff, auto, 0, eol)) : dlm_fill(cells, offsets, sbuff, auto, 0, eol)
dlm_offsets(sbuff, dlm, eol, offsets, ign_empty)
# Header case: data is filled with row offset 1, and the header goes into a separate
# 1 x ncols String array filled with row offset 0.
has_header ? (dlm_fill(cells, offsets, sbuff, auto, 1, dlm, eol, ign_empty), dlm_fill(Array(String, 1, ncols), offsets, sbuff, auto, 0, dlm, eol, ign_empty)) : dlm_fill(cells, offsets, sbuff, auto, 0, dlm, eol, ign_empty)
end

const valid_opts = [:has_header, :ignore_invalid_chars, :use_mmap]
Expand All @@ -99,10 +101,11 @@ function dlm_col_begin(ncols::Int, offsets::Array{Int,2}, row::Int, col::Int)
pp_col = (1 == col) ? ncols : (col-1)

ret = offsets[pp_row, pp_col]
(ret == 0) ? dlm_col_begin(ncols, offsets, pp_row, pp_col) : (ret+2)
(ret == 0) && (ret = dlm_col_begin(ncols, offsets, pp_row, pp_col))
ret+2
end

function dlm_fill{T}(cells::Array{T,2}, offsets::Array{Int,2}, sbuff::String, auto::Bool, row_offset::Int, eol::Char)
function dlm_fill{T}(cells::Array{T,2}, offsets::Array{Int,2}, sbuff::String, auto::Bool, row_offset::Int, dlm::Char, eol::Char, ign_adj_dlm::Bool)
maxrow,maxcol = size(cells)
tmp64 = Array(Float64,1)

Expand All @@ -114,6 +117,14 @@ function dlm_fill{T}(cells::Array{T,2}, offsets::Array{Int,2}, sbuff::String, au

end_idx = prevind(sbuff, nextind(sbuff,end_pos))
(col == maxcol) && (end_idx > 0) && ('\n' == eol) && ('\r' == sbuff[end_idx]) && (end_idx = prevind(sbuff, end_idx))
if ign_adj_dlm
is_default_dlm = (dlm == invalid_dlm)
while start_pos <= end_idx
val = sbuff[start_pos]
(is_default_dlm ? !in(val, _default_delims) : (val != dlm)) && break
start_pos = nextind(sbuff, start_pos)
end
end
sval = SubString(sbuff, start_pos, end_idx)

if T <: Char
Expand All @@ -123,7 +134,7 @@ function dlm_fill{T}(cells::Array{T,2}, offsets::Array{Int,2}, sbuff::String, au
if float64_isvalid(sval, tmp64)
cells[cell_row,col] = tmp64[1]
elseif auto
return dlm_fill(Array(Any,maxrow,maxcol), offsets, sbuff, false, row_offset, eol)
return dlm_fill(Array(Any,maxrow,maxcol), offsets, sbuff, false, row_offset, dlm, eol, ign_adj_dlm)
else
cells[cell_row,col] = NaN
end
Expand All @@ -140,52 +151,62 @@ function dlm_fill{T}(cells::Array{T,2}, offsets::Array{Int,2}, sbuff::String, au
end


function dlm_offsets(sbuff::UTF8String, dlm, eol, offsets::Array{Int,2})
isascii(dlm) && isascii(eol) && (return dlm_offsets(sbuff.data, uint8(dlm), uint8(eol), offsets))
# Scan a UTF8String and write one entry of `offsets[row, col]` per cell boundary found.
#   dlm, eol    - column delimiter and row terminator
#   offsets     - nrows x ncols Int matrix, mutated in place
#   ign_adj_dlm - when true, a delimiter/eol only closes a column if a non-delimiter
#                 character was seen since the previous boundary
# NOTE(review): this listing is a diff without +/- markers; the unguarded
# `continue` / `col += 1` / `offsets[row,col] = idx-2` lines directly below the `next` call
# are presumably the removed pre-commit body - confirm against the repository.
function dlm_offsets(sbuff::UTF8String, dlm, eol, offsets::Array{Int,2}, ign_adj_dlm::Bool)
# When both markers are ASCII, delegate to the method that works on the raw byte vector.
isascii(dlm) && isascii(eol) && (return dlm_offsets(sbuff.data, uint8(dlm), uint8(eol), offsets, ign_adj_dlm))

col = 0
row = 1
maxrow,maxcol = size(offsets)
# Pre-seed the last cell with the buffer length (presumably so the final cell is bounded
# when the input does not end with `eol` - TODO confirm).
offsets[maxrow,maxcol] = length(sbuff.data)
idx = 1
# `invalid_dlm` selects the `_default_delims` set instead of a single delimiter.
is_default_dlm = (dlm == invalid_dlm)
got_data = false
while(idx <= length(sbuff.data))
# `next` returns the character and the index following it, so `idx` is already advanced.
val,idx = next(sbuff, idx)
(val != eol) && (is_default_dlm ? !in(val, _default_delims) : (val != dlm)) && continue
col += 1
offsets[row,col] = idx-2
# Ordinary character: remember that the current cell has content and keep scanning.
(val != eol) && (is_default_dlm ? !in(val, _default_delims) : (val != dlm)) && (got_data = true) && continue
if got_data || !ign_adj_dlm
col += 1
offsets[row,col] = idx-2
end
# Stop once the last row has all of its columns.
(row >= maxrow) && (col == maxcol) && break
(val == eol) && (row += 1; col = 0)
got_data = false
end
end

dlm_offsets(sbuff::ASCIIString, dlmc, eolc, offsets::Array{Int,2}) = dlm_offsets(sbuff.data, uint8(dlmc), uint8(eolc), offsets)
function dlm_offsets(dbuff::Vector{Uint8}, dlm::Uint8, eol::Uint8, offsets::Array{Int,2})
dlm_offsets(sbuff::ASCIIString, dlmc, eolc, offsets::Array{Int,2}, ign_adj_dlm::Bool) = dlm_offsets(sbuff.data, uint8(dlmc), uint8(eolc), offsets, ign_adj_dlm)
# Byte-vector variant: scan `dbuff` and write one entry of `offsets[row, col]` per cell
# boundary found, storing the index immediately before the delimiter/eol byte.
#   dlm, eol    - column delimiter and row terminator as single bytes
#   offsets     - nrows x ncols Int matrix, mutated in place
#   ign_adj_dlm - when true, a delimiter/eol only closes a column if a non-delimiter
#                 byte was seen since the previous boundary
# NOTE(review): this listing is a diff without +/- markers; the unguarded
# `continue` / `col += 1` / `offsets[row,col] = idx-1` lines at the top of the loop are
# presumably the removed pre-commit body - confirm against the repository.
function dlm_offsets(dbuff::Vector{Uint8}, dlm::Uint8, eol::Uint8, offsets::Array{Int,2}, ign_adj_dlm::Bool)
col = 0
row = 1
# `invalid_dlm` (as a byte) selects the `_default_delims` set instead of a single delimiter.
is_default_dlm = (dlm == uint8(invalid_dlm))
maxrow,maxcol = size(offsets)
# Pre-seed the last cell with the buffer length (presumably so the final cell is bounded
# when the input does not end with `eol` - TODO confirm).
offsets[maxrow,maxcol] = length(dbuff)
got_data = false
for idx in 1:length(dbuff)
val = dbuff[idx]
(val != eol) && (is_default_dlm ? !in(val, _default_delims) : (val != dlm)) && continue
col += 1
offsets[row,col] = idx-1
# Ordinary byte: remember that the current cell has content and keep scanning.
(val != eol) && (is_default_dlm ? !in(val, _default_delims) : (val != dlm)) && (got_data = true) && continue
if got_data || !ign_adj_dlm
col += 1
offsets[row,col] = idx-1
end
# Stop once the last row has all of its columns.
(row >= maxrow) && (col == maxcol) && break
(val == eol) && (row += 1; col = 0)
got_data = false
end
end

dlm_dims(s::ASCIIString, eol::Char, dlm::Char) = dlm_dims(s.data, uint8(eol), uint8(dlm))
function dlm_dims{T,D}(dbuff::T, eol::D, dlm::D)
isa(dbuff, UTF8String) && isascii(eol) && isascii(dlm) && (return dlm_dims(dbuff.data, uint8(eol), uint8(dlm)))
dlm_dims(s::ASCIIString, eol::Char, dlm::Char, ign_adj_dlm::Bool) = dlm_dims(s.data, uint8(eol), uint8(dlm), ign_adj_dlm)
function dlm_dims{T,D}(dbuff::T, eol::D, dlm::D, ign_adj_dlm::Bool)
isa(dbuff, UTF8String) && isascii(eol) && isascii(dlm) && (return dlm_dims(dbuff.data, uint8(eol), uint8(dlm), ign_adj_dlm))
ncols = nrows = col = 0
is_default_dlm = (dlm == convert(D, invalid_dlm))
try
got_data = false
for val in dbuff
(val != eol) && (is_default_dlm ? !in(val, _default_delims) : (val != dlm)) && continue
col += 1
(val != eol) && (is_default_dlm ? !in(val, _default_delims) : (val != dlm)) && (got_data = true) && continue
(got_data || !ign_adj_dlm) && (col += 1)
(val == eol) && (nrows += 1; ncols = max(ncols, col); col = 0)
got_data = false
end
catch ex
error("at row $nrows, column $col : $ex)")
Expand Down
59 changes: 45 additions & 14 deletions doc/helpdb.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2536,25 +2536,34 @@
"),

("Text I/O","Base","readdlm","readdlm(source, delim::Char; has_header=false, use_mmap=false, ignore_invalid_chars=false)
("Text I/O","Base","readdlm","readdlm(source, delim::Char, T::Type, eol::Char; has_header=false, use_mmap=false, ignore_invalid_chars=false)
Read a matrix from the source where each line gives one row, with
elements separated by the given delimeter. The source can be a text
file, stream or byte array. Memory mapped filed can be used by
passing the byte array representation of the mapped segment as
source.
Read a matrix from the source where each line (separated by \"eol\")
gives one row, with elements separated by the given delimiter. The
source can be a text file, stream or byte array. Memory mapped files
can be used by passing the byte array representation of the mapped
segment as source.
If \"has_header\" is \"true\" the first row of data would be read
If \"T\" is a numeric type, the result is an array of that type,
with any non-numeric elements as \"NaN\" for floating-point types,
or zero. Other useful values of \"T\" include \"ASCIIString\",
\"String\", and \"Any\".
If \"has_header\" is \"true\", the first row of data is read
as headers and the tuple \"(data_cells, header_cells)\" is returned
instead of only \"data_cells\".
If \"use_mmap\" is \"true\" the file specified by \"source\" is
If \"use_mmap\" is \"true\", the file specified by \"source\" is
memory mapped for potential speedups.
If \"ignore_invalid_chars\" is \"true\" bytes in \"source\" with
If \"ignore_invalid_chars\" is \"true\", bytes in \"source\" with
invalid character encoding will be ignored. Otherwise an error is
thrown indicating the offending character position.
"),

("Text I/O","Base","readdlm","readdlm(source, delim::Char, eol::Char; options...)
If all data is numeric, the result will be a numeric array. If some
elements cannot be parsed as numbers, a cell array of numbers and
strings is returned.
Expand All @@ -2563,11 +2572,33 @@

("Text I/O","Base","readdlm","readdlm(source, delim::Char, T::Type; options...)
Read a matrix from the source with a given element type. If \"T\"
is a numeric type, the result is an array of that type, with any
non-numeric elements as \"NaN\" for floating-point types, or zero.
Other useful values of \"T\" include \"ASCIIString\", \"String\",
and \"Any\".
The end of line delimiter is taken as \"\\n\".
"),

("Text I/O","Base","readdlm","readdlm(source, delim::Char; options...)
The end of line delimiter is taken as \"\\n\". If all data is
numeric, the result will be a numeric array. If some elements
cannot be parsed as numbers, a cell array of numbers and
strings is returned.
"),

("Text I/O","Base","readdlm","readdlm(source, T::Type; options...)
The columns are assumed to be separated by one or more whitespace characters.
The end of line delimiter is taken as \"\\n\".
"),

("Text I/O","Base","readdlm","readdlm(source; options...)
The columns are assumed to be separated by one or more whitespace characters.
The end of line delimiter is taken as \"\\n\". If all data is
numeric, the result will be a numeric array. If some elements
cannot be parsed as numbers, a cell array of numbers and strings
is returned.
"),

Expand Down
29 changes: 22 additions & 7 deletions doc/stdlib/base.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1664,22 +1664,37 @@ Text I/O

Create an iterable object that will yield each line from a stream.

.. function:: readdlm(source, delim::Char; has_header=false, use_mmap=false, ignore_invalid_chars=false)
.. function:: readdlm(source, delim::Char, T::Type, eol::Char; has_header=false, use_mmap=false, ignore_invalid_chars=false)

Read a matrix from the source where each line gives one row, with elements separated by the given delimeter. The source can be a text file, stream or byte array. Memory mapped filed can be used by passing the byte array representation of the mapped segment as source.
Read a matrix from the source where each line (separated by ``eol``) gives one row, with elements separated by the given delimiter. The source can be a text file, stream or byte array. Memory mapped files can be used by passing the byte array representation of the mapped segment as source.

If ``has_header`` is ``true`` the first row of data would be read as headers and the tuple ``(data_cells, header_cells)`` is returned instead of only ``data_cells``.
If ``T`` is a numeric type, the result is an array of that type, with any non-numeric elements as ``NaN`` for floating-point types, or zero. Other useful values of ``T`` include ``ASCIIString``, ``String``, and ``Any``.

If ``use_mmap`` is ``true`` the file specified by ``source`` is memory mapped for potential speedups.
If ``has_header`` is ``true``, the first row of data is read as headers and the tuple ``(data_cells, header_cells)`` is returned instead of only ``data_cells``.

If ``ignore_invalid_chars`` is ``true`` bytes in ``source`` with invalid character encoding will be ignored. Otherwise an error is thrown indicating the offending character position.
If ``use_mmap`` is ``true``, the file specified by ``source`` is memory mapped for potential speedups.

If all data is numeric, the result will be a numeric array. If some elements cannot be parsed as numbers, a cell array of numbers and strings is returned.
If ``ignore_invalid_chars`` is ``true``, bytes in ``source`` with invalid character encoding will be ignored. Otherwise an error is thrown indicating the offending character position.

.. function:: readdlm(source, delim::Char, eol::Char; options...)

If all data is numeric, the result will be a numeric array. If some elements cannot be parsed as numbers, a cell array of numbers and strings is returned.

.. function:: readdlm(source, delim::Char, T::Type; options...)

Read a matrix from the source with a given element type. If ``T`` is a numeric type, the result is an array of that type, with any non-numeric elements as ``NaN`` for floating-point types, or zero. Other useful values of ``T`` include ``ASCIIString``, ``String``, and ``Any``.
The end of line delimiter is taken as ``\n``.

.. function:: readdlm(source, delim::Char; options...)

The end of line delimiter is taken as ``\n``. If all data is numeric, the result will be a numeric array. If some elements cannot be parsed as numbers, a cell array of numbers and strings is returned.

.. function:: readdlm(source, T::Type; options...)

The columns are assumed to be separated by one or more whitespace characters. The end of line delimiter is taken as ``\n``.

.. function:: readdlm(source; options...)

The columns are assumed to be separated by one or more whitespace characters. The end of line delimiter is taken as ``\n``. If all data is numeric, the result will be a numeric array. If some elements cannot be parsed as numbers, a cell array of numbers and strings is returned.

.. function:: writedlm(f, A, delim='\t')

Expand Down
80 changes: 78 additions & 2 deletions test/readdlm.jl
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,89 @@ dlm_data = readdlm(joinpath("perf", "kernel", "imdb-1.tsv"), '\t')
@test size(readcsv(IOBuffer("1,2,3,4\n1,2,3,"))) == (2,4)
@test size(readcsv(IOBuffer("1,2,3,4\n1,2,3"))) == (2,4)

@test size(readcsv(IOBuffer("1,2,3,4\r\n"))) == (1,4)
@test size(readcsv(IOBuffer("1,2,3,4\r\n1,2,3\r\n"))) == (2,4)
@test size(readcsv(IOBuffer("1,2,3,4\r\n1,2,3,4\r\n"))) == (2,4)

@test size(readdlm(IOBuffer("1 2 3 4\n1 2 3"))) == (2,4)
@test size(readdlm(IOBuffer("1\t2 3 4\n1 2 3"))) == (2,4)
@test size(readdlm(IOBuffer("1\t 2 3 4\n1 2 3"))) == (2,5)
@test size(readdlm(IOBuffer("1\t 2 3 4\n1 2 3\n"))) == (2,5)
@test size(readdlm(IOBuffer("1\t 2 3 4\n1 2 3"))) == (2,4)
@test size(readdlm(IOBuffer("1\t 2 3 4\n1 2 3\n"))) == (2,4)
@test size(readdlm(IOBuffer("1,,2,3,4\n1,2,3\n"), ',')) == (2,5)

result1 = reshape({"", "", "", "", "", "", 1.0, 1.0, "", "", "", "", "", 1.0, 2.0, "", 3.0, "", "", "", "", "", 4.0, "", "", ""}, 2, 13)
result2 = reshape({1.0, 1.0, 2.0, 1.0, 3.0, "", 4.0, ""}, 2, 4)

@test readdlm(IOBuffer(",,,1,,,,2,3,,,4,\n,,,1,,,1\n"), ',') == result1
@test readdlm(IOBuffer(" 1 2 3 4 \n 1 1\n")) == result2

result1[1,4] = "भारत"
@test readdlm(IOBuffer(",,,भारत,,,,2,3,,,4,\n,,,1,,,1\n"), ',') == result1

let x = [1,2,3], y = [4,5,6], io = IOBuffer()
writedlm(io, zip(x,y), ", ")
seek(io, 0)
@test readcsv(io) == [x y]
end


# source: http://www.i18nguy.com/unicode/unicode-example-utf8.zip
i18n_data = ["Origin (English)", "Name (English)", "Origin (Native)", "Name (Native)",
"Australia", "Nicole Kidman", "Australia", "Nicole Kidman",
"Austria", "Johann Strauss", "Österreich", "Johann Strauß",
"Belgium (Flemish)", "Rene Magritte", "België", "René Magritte",
"Belgium (French)", "Rene Magritte", "Belgique", "René Magritte",
"Belgium (German)", "Rene Magritte", "Belgien", "René Magritte",
"Bhutan", "Gonpo Dorji", "འབྲུག་ཡུལ།", "མགོན་པོ་རྡོ་རྗེ།",
"Canada", "Celine Dion", "Canada", "Céline Dion",
"Canada - Nunavut (Inuktitut)", "Susan Aglukark", "ᓄᓇᕗᒻᒥᐅᑦ", "ᓱᓴᓐ ᐊᒡᓗᒃᑲᖅ",
"Democratic People's Rep. of Korea", "LEE Sol-Hee", "조선 민주주의 인민 공화국", "이설희",
"Denmark", "Soren Hauch-Fausboll", "Danmark", "Søren Hauch-Fausbøll",
"Denmark", "Soren Kierkegaard", "Danmark", "Søren Kierkegård",
"Egypt", "Abdel Halim Hafez", "ﻣﺼﺮ", "ﻋﺑﺪﺍﻠﺣﻟﻳﻢ ﺤﺎﻓﻅ",
"Egypt", "Om Kolthoum", "ﻣﺼﺮ", "ﺃﻡ ﻛﻟﺛﻭﻡ",
"Eritrea", "Berhane Zeray", "ብርሃነ ዘርኣይ", "ኤርትራ",
"Ethiopia", "Haile Gebreselassie", "ኃይሌ ገብረሥላሴ", "ኢትዮጵያ",
"France", "Gerard Depardieu", "France", "Gérard Depardieu",
"France", "Jean Reno", "France", "Jean Réno",
"France", "Camille Saint-Saens", "France", "Camille Saint-Saëns",
"France", "Mylene Demongeot", "France", "Mylène Demongeot",
"France", "Francois Truffaut", "France", "François Truffaut",
"France (Braille)", "Louis Braille", "⠋⠗⠁⠝⠉⠑", "⠇⠕⠥⠊⠎⠀<BR>⠃⠗⠁⠊⠇⠇⠑",
"Georgia", "Eduard Shevardnadze", "საქართველო", "ედუარდ შევარდნაძე",
"Germany", "Rudi Voeller", "Deutschland", "Rudi Völler",
"Germany", "Walter Schultheiss", "Deutschland", "Walter Schultheiß",
"Greece", "Giorgos Dalaras", "Ελλάς", "Γιώργος Νταλάρας",
"Iceland", "Bjork Gudmundsdottir", "Ísland", "Björk Guðmundsdóttir",
"India (Hindi)", "Madhuri Dixit", "भारत", "माधुरी दिछित",
"Ireland", "Sinead O'Connor", "Éire", "Sinéad O'Connor",
"Israel", "Yehoram Gaon", "ישראל", "יהורם גאון",
"Italy", "Fabrizio DeAndre", "Italia", "Fabrizio De André",
"Japan", "KUBOTA Toshinobu", "日本", "久保田    利伸",
"Japan", "HAYASHIBARA Megumi", "日本", "林原 めぐみ",
"Japan", "Mori Ogai", "日本", "森鷗外",
"Japan", "Tex Texin", "日本", "テクス テクサン",
"Norway", "Tor Age Bringsvaerd", "Noreg", "Tor Åge Bringsværd",
"Pakistan (Urdu)", "Nusrat Fatah Ali Khan", "پاکستان", "نصرت فتح علی خان",
"People's Rep. of China", "ZHANG Ziyi", "中国", "章子怡",
"People's Rep. of China", "WONG Faye", "中国", "王菲",
"Poland", "Lech Walesa", "Polska", "Lech Wałęsa",
"Puerto Rico", "Olga Tanon", "Puerto Rico", "Olga Tañón",
"Rep. of China", "Hsu Chi", "臺灣", "舒淇",
"Rep. of China", "Ang Lee", "臺灣", "李安",
"Rep. of Korea", "AHN Sung-Gi", "한민국", "안성기",
"Rep. of Korea", "SHIM Eun-Ha", "한민국", "심은하",
"Russia", "Mikhail Gorbachev", "Россия", "Михаил Горбачёв",
"Russia", "Boris Grebenshchikov", "Россия", "Борис Гребенщиков",
"Slovenia", "\"Frane \"\"Jezek\"\" Milcinski", "Slovenija", "Frane Milčinski - Ježek",
"Syracuse (Sicily)", "Archimedes", "Συρακούσα", "Ἀρχιμήδης",
"Thailand", "Thongchai McIntai", "ประเทศไทย", "ธงไชย แม็คอินไตย์",
"U.S.A.", "Brad Pitt", "U.S.A.", "Brad Pitt",
"Yugoslavia (Cyrillic)", "Djordje Balasevic", "Југославија", "Ђорђе Балашевић",
"Yugoslavia (Latin)", "Djordje Balasevic", "Jugoslavija", "Đorđe Balašević"]

i18n_arr = transpose(reshape(i18n_data, 4, int(floor(length(i18n_data)/4))))
i18n_buff = PipeBuffer()
writedlm(i18n_buff, i18n_arr, ',')
@test i18n_arr == readcsv(i18n_buff)

0 comments on commit ee61b45

Please sign in to comment.