Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added readdlm option to ignore empty columns. #5403

Merged
merged 1 commit into from
Jan 18, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
118 changes: 75 additions & 43 deletions base/datafmt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -68,18 +68,21 @@ function ascii_if_possible(sbuff::String)
end

function readdlm_string(sbuff::String, dlm::Char, T::Type, eol::Char, auto::Bool, optsd::Dict)
ign_empty = (dlm == invalid_dlm)

nrows,ncols = try
dlm_dims(sbuff, eol, dlm)
dlm_dims(sbuff, eol, dlm, ign_empty)
catch ex
!get(optsd, :ignore_invalid_chars, false) && throw(ex)
sbuff = ascii_if_possible(convert(typeof(sbuff), sbuff.data, ""))
dlm_dims(sbuff, eol, dlm)
dlm_dims(sbuff, eol, dlm, ign_empty)
end
offsets = zeros(Int, nrows, ncols)
begin_offsets = zeros(Int, nrows, ncols)
end_offsets = zeros(Int, nrows, ncols)
has_header = get(optsd, :has_header, false)
cells = Array(T, has_header ? nrows-1 : nrows, ncols)
dlm_offsets(sbuff, dlm, eol, offsets)
has_header ? (dlm_fill(cells, offsets, sbuff, auto, 1, eol), dlm_fill(Array(String, 1, ncols), offsets, sbuff, auto, 0, eol)) : dlm_fill(cells, offsets, sbuff, auto, 0, eol)
dlm_offsets(sbuff, dlm, eol, begin_offsets, end_offsets, ign_empty)
has_header ? (dlm_fill(cells, begin_offsets, end_offsets, sbuff, auto, 1, dlm, eol, ign_empty), dlm_fill(Array(String, 1, ncols), begin_offsets, end_offsets, sbuff, auto, 0, dlm, eol, ign_empty)) : dlm_fill(cells, begin_offsets, end_offsets, sbuff, auto, 0, dlm, eol, ign_empty)
end

const valid_opts = [:has_header, :ignore_invalid_chars, :use_mmap]
Expand All @@ -93,28 +96,31 @@ function val_opts(opts)
d
end

function dlm_col_begin(ncols::Int, offsets::Array{Int,2}, row::Int, col::Int)
(row == 1) && (col == 1) && return 1
pp_row = (1 == col) ? (row-1) : row
pp_col = (1 == col) ? ncols : (col-1)

ret = offsets[pp_row, pp_col]
(ret == 0) ? dlm_col_begin(ncols, offsets, pp_row, pp_col) : (ret+2)
end

function dlm_fill{T}(cells::Array{T,2}, offsets::Array{Int,2}, sbuff::String, auto::Bool, row_offset::Int, eol::Char)
function dlm_fill{T}(cells::Array{T,2}, begin_offsets::Array{Int,2}, end_offsets::Array{Int,2}, sbuff::String, auto::Bool, row_offset::Int, dlm::Char, eol::Char, ign_adj_dlm::Bool)
maxrow,maxcol = size(cells)
tmp64 = Array(Float64,1)

for row in (1+row_offset):(maxrow+row_offset)
cell_row = row-row_offset
for col in 1:maxcol
start_pos = dlm_col_begin(maxcol, offsets, row, col)
end_pos = offsets[row,col]

end_idx = prevind(sbuff, nextind(sbuff,end_pos))
(col == maxcol) && (end_idx > 0) && ('\n' == eol) && ('\r' == sbuff[end_idx]) && (end_idx = prevind(sbuff, end_idx))
sval = SubString(sbuff, start_pos, end_idx)
start_pos = begin_offsets[row,col]
end_pos = end_offsets[row,col]

if start_pos > 0 && end_pos > 0
end_idx = prevind(sbuff, nextind(sbuff,end_pos))
(end_idx > 0) && ('\n' == eol) && ('\r' == sbuff[end_idx]) && (end_idx = prevind(sbuff, end_idx))
if ign_adj_dlm
is_default_dlm = (dlm == invalid_dlm)
while start_pos <= end_idx
val = sbuff[start_pos]
(is_default_dlm ? !in(val, _default_delims) : (val != dlm)) && break
start_pos = nextind(sbuff, start_pos)
end
end
sval = SubString(sbuff, start_pos, end_idx)
else
sval = SubString(sbuff, 1, 0)
end

if T <: Char
(length(sval) != 1) && error("file entry \"$(sval)\" is not a Char")
Expand All @@ -123,7 +129,7 @@ function dlm_fill{T}(cells::Array{T,2}, offsets::Array{Int,2}, sbuff::String, au
if float64_isvalid(sval, tmp64)
cells[cell_row,col] = tmp64[1]
elseif auto
return dlm_fill(Array(Any,maxrow,maxcol), offsets, sbuff, false, row_offset, eol)
return dlm_fill(Array(Any,maxrow,maxcol), begin_offsets, end_offsets, sbuff, false, row_offset, dlm, eol, ign_adj_dlm)
else
cells[cell_row,col] = NaN
end
Expand All @@ -140,52 +146,78 @@ function dlm_fill{T}(cells::Array{T,2}, offsets::Array{Int,2}, sbuff::String, au
end


function dlm_offsets(sbuff::UTF8String, dlm, eol, offsets::Array{Int,2})
isascii(dlm) && isascii(eol) && (return dlm_offsets(sbuff.data, uint8(dlm), uint8(eol), offsets))
function dlm_offsets(sbuff::UTF8String, dlm, eol, begin_offsets::Array{Int,2}, end_offsets::Array{Int,2}, ign_adj_dlm::Bool)
isascii(dlm) && isascii(eol) && (return dlm_offsets(sbuff.data, uint8(dlm), uint8(eol), begin_offsets, end_offsets, ign_adj_dlm))

col = 0
row = 1
maxrow,maxcol = size(offsets)
offsets[maxrow,maxcol] = length(sbuff.data)
maxrow,maxcol = size(begin_offsets)
idx = 1
is_default_dlm = (dlm == invalid_dlm)
while(idx <= length(sbuff.data))
got_data = false
last_offset = 0
slen = length(sbuff.data)
while idx <= slen
val,idx = next(sbuff, idx)
(val != eol) && (is_default_dlm ? !in(val, _default_delims) : (val != dlm)) && continue
col += 1
offsets[row,col] = idx-2
(val != eol) && (is_default_dlm ? !in(val, _default_delims) : (val != dlm)) && (got_data = true) && continue
if got_data || !ign_adj_dlm
col += 1
end_offsets[row,col] = idx-2
begin_offsets[row,col] = last_offset+1
end
last_offset = idx
(row >= maxrow) && (col == maxcol) && break
(val == eol) && (row += 1; col = 0)
got_data = false
end
if last_offset < slen
col += 1
begin_offsets[row,col] = last_offset+1
end_offsets[row,col] = slen
end
end

dlm_offsets(sbuff::ASCIIString, dlmc, eolc, offsets::Array{Int,2}) = dlm_offsets(sbuff.data, uint8(dlmc), uint8(eolc), offsets)
function dlm_offsets(dbuff::Vector{Uint8}, dlm::Uint8, eol::Uint8, offsets::Array{Int,2})
dlm_offsets(sbuff::ASCIIString, dlmc, eolc, begin_offsets::Array{Int,2}, end_offsets::Array{Int,2}, ign_adj_dlm::Bool) = dlm_offsets(sbuff.data, uint8(dlmc), uint8(eolc), begin_offsets, end_offsets, ign_adj_dlm)
function dlm_offsets(dbuff::Vector{Uint8}, dlm::Uint8, eol::Uint8, begin_offsets::Array{Int,2}, end_offsets::Array{Int,2}, ign_adj_dlm::Bool)
col = 0
row = 1
is_default_dlm = (dlm == uint8(invalid_dlm))
maxrow,maxcol = size(offsets)
offsets[maxrow,maxcol] = length(dbuff)
for idx in 1:length(dbuff)
maxrow,maxcol = size(begin_offsets)
got_data = false
last_offset = 0
slen = length(dbuff)
for idx in 1:slen
val = dbuff[idx]
(val != eol) && (is_default_dlm ? !in(val, _default_delims) : (val != dlm)) && continue
col += 1
offsets[row,col] = idx-1
(val != eol) && (is_default_dlm ? !in(val, _default_delims) : (val != dlm)) && (got_data = true) && continue
if got_data || !ign_adj_dlm
col += 1
end_offsets[row,col] = idx-1
begin_offsets[row,col] = last_offset+1
end
last_offset = idx
(row >= maxrow) && (col == maxcol) && break
(val == eol) && (row += 1; col = 0)
got_data = false
end
if last_offset < slen
col += 1
begin_offsets[row,col] = last_offset+1
end_offsets[row,col] = slen
end
end

dlm_dims(s::ASCIIString, eol::Char, dlm::Char) = dlm_dims(s.data, uint8(eol), uint8(dlm))
function dlm_dims{T,D}(dbuff::T, eol::D, dlm::D)
isa(dbuff, UTF8String) && isascii(eol) && isascii(dlm) && (return dlm_dims(dbuff.data, uint8(eol), uint8(dlm)))
dlm_dims(s::ASCIIString, eol::Char, dlm::Char, ign_adj_dlm::Bool) = dlm_dims(s.data, uint8(eol), uint8(dlm), ign_adj_dlm)
function dlm_dims{T,D}(dbuff::T, eol::D, dlm::D, ign_adj_dlm::Bool)
isa(dbuff, UTF8String) && isascii(eol) && isascii(dlm) && (return dlm_dims(dbuff.data, uint8(eol), uint8(dlm), ign_adj_dlm))
ncols = nrows = col = 0
is_default_dlm = (dlm == convert(D, invalid_dlm))
try
got_data = false
for val in dbuff
(val != eol) && (is_default_dlm ? !in(val, _default_delims) : (val != dlm)) && continue
col += 1
(val != eol) && (is_default_dlm ? !in(val, _default_delims) : (val != dlm)) && (got_data = true) && continue
(got_data || !ign_adj_dlm) && (col += 1)
(val == eol) && (nrows += 1; ncols = max(ncols, col); col = 0)
got_data = false
end
catch ex
error("at row $nrows, column $col : $ex)")
Expand Down
59 changes: 45 additions & 14 deletions doc/helpdb.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2536,25 +2536,34 @@

"),

("Text I/O","Base","readdlm","readdlm(source, delim::Char; has_header=false, use_mmap=false, ignore_invalid_chars=false)
("Text I/O","Base","readdlm","readdlm(source, delim::Char, T::Type, eol::Char; has_header=false, use_mmap=false, ignore_invalid_chars=false)

Read a matrix from the source where each line gives one row, with
elements separated by the given delimeter. The source can be a text
file, stream or byte array. Memory mapped filed can be used by
passing the byte array representation of the mapped segment as
source.
Read a matrix from the source where each line (separated by \"eol\")
gives one row, with elements separated by the given delimeter. The
source can be a text file, stream or byte array. Memory mapped files
can be used by passing the byte array representation of the mapped
segment as source.

If \"has_header\" is \"true\" the first row of data would be read
If \"T\" is a numeric type, the result is an array of that type,
with any non-numeric elements as \"NaN\" for floating-point types,
or zero. Other useful values of \"T\" include \"ASCIIString\",
\"String\", and \"Any\".

If \"has_header\" is \"true\", the first row of data would be read
as headers and the tuple \"(data_cells, header_cells)\" is returned
instead of only \"data_cells\".

If \"use_mmap\" is \"true\" the file specified by \"source\" is
If \"use_mmap\" is \"true\", the file specified by \"source\" is
memory mapped for potential speedups.

If \"ignore_invalid_chars\" is \"true\" bytes in \"source\" with
If \"ignore_invalid_chars\" is \"true\", bytes in \"source\" with
invalid character encoding will be ignored. Otherwise an error is
thrown indicating the offending character position.

"),

("Text I/O","Base","readdlm","readdlm(source, delim::Char, eol::Char; options...)

If all data is numeric, the result will be a numeric array. If some
elements cannot be parsed as numbers, a cell array of numbers and
strings is returned.
Expand All @@ -2563,11 +2572,33 @@

("Text I/O","Base","readdlm","readdlm(source, delim::Char, T::Type; options...)

Read a matrix from the source with a given element type. If \"T\"
is a numeric type, the result is an array of that type, with any
non-numeric elements as \"NaN\" for floating-point types, or zero.
Other useful values of \"T\" include \"ASCIIString\", \"String\",
and \"Any\".
The end of line delimiter is taken as \"\\n\".

"),

("Text I/O","Base","readdlm","readdlm(source, delim::Char; options...)

The end of line delimiter is taken as \"\\n\". If all data is
numeric, the result will be a numeric array. If some elements
cannot be parsed as numbers, a cell array of numbers and
strings is returned.

"),

("Text I/O","Base","readdlm","readdlm(source, T::Type; options...)

The columns are assumed to be separated by one or more whitespaces.
The end of line delimiter is taken as \"\\n\".

"),

("Text I/O","Base","readdlm","readdlm(source, options...)

The columns are assumed to be separated by one or more whitespaces.
The end of line delimiter is taken as \"\\n\". If all data is
numeric, the result will be a numeric array. If some elements
cannot be parsed as numbers, a cell array of numbers and strings
is returned.

"),

Expand Down
29 changes: 22 additions & 7 deletions doc/stdlib/base.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1664,22 +1664,37 @@ Text I/O

Create an iterable object that will yield each line from a stream.

.. function:: readdlm(source, delim::Char; has_header=false, use_mmap=false, ignore_invalid_chars=false)
.. function:: readdlm(source, delim::Char, T::Type, eol::Char; has_header=false, use_mmap=false, ignore_invalid_chars=false)

Read a matrix from the source where each line gives one row, with elements separated by the given delimeter. The source can be a text file, stream or byte array. Memory mapped filed can be used by passing the byte array representation of the mapped segment as source.
Read a matrix from the source where each line (separated by ``eol``) gives one row, with elements separated by the given delimeter. The source can be a text file, stream or byte array. Memory mapped files can be used by passing the byte array representation of the mapped segment as source.

If ``has_header`` is ``true`` the first row of data would be read as headers and the tuple ``(data_cells, header_cells)`` is returned instead of only ``data_cells``.
If ``T`` is a numeric type, the result is an array of that type, with any non-numeric elements as ``NaN`` for floating-point types, or zero. Other useful values of ``T`` include ``ASCIIString``, ``String``, and ``Any``.

If ``use_mmap`` is ``true`` the file specified by ``source`` is memory mapped for potential speedups.
If ``has_header`` is ``true``, the first row of data would be read as headers and the tuple ``(data_cells, header_cells)`` is returned instead of only ``data_cells``.

If ``ignore_invalid_chars`` is ``true`` bytes in ``source`` with invalid character encoding will be ignored. Otherwise an error is thrown indicating the offending character position.
If ``use_mmap`` is ``true``, the file specified by ``source`` is memory mapped for potential speedups.

If all data is numeric, the result will be a numeric array. If some elements cannot be parsed as numbers, a cell array of numbers and strings is returned.
If ``ignore_invalid_chars`` is ``true``, bytes in ``source`` with invalid character encoding will be ignored. Otherwise an error is thrown indicating the offending character position.

.. function:: readdlm(source, delim::Char, eol::Char; options...)

If all data is numeric, the result will be a numeric array. If some elements cannot be parsed as numbers, a cell array of numbers and strings is returned.

.. function:: readdlm(source, delim::Char, T::Type; options...)

Read a matrix from the source with a given element type. If ``T`` is a numeric type, the result is an array of that type, with any non-numeric elements as ``NaN`` for floating-point types, or zero. Other useful values of ``T`` include ``ASCIIString``, ``String``, and ``Any``.
The end of line delimiter is taken as ``\n``.

.. function:: readdlm(source, delim::Char; options...)

The end of line delimiter is taken as ``\n``. If all data is numeric, the result will be a numeric array. If some elements cannot be parsed as numbers, a cell array of numbers and strings is returned.

.. function:: readdlm(source, T::Type; options...)

The columns are assumed to be separated by one or more whitespaces. The end of line delimiter is taken as ``\n``.

.. function:: readdlm(source; options...)

The columns are assumed to be separated by one or more whitespaces. The end of line delimiter is taken as ``\n``. If all data is numeric, the result will be a numeric array. If some elements cannot be parsed as numbers, a cell array of numbers and strings is returned.

.. function:: writedlm(f, A, delim='\t')

Expand Down
Loading