Skip to content

Commit

Permalink
Finish support for embedded nulls
Browse files Browse the repository at this point in the history
* Add hasNull_ field to Token
* Track presence of nulls in tokenizers.
* Warn in CollectorCharacter
* Update unit test. Warnings in the header are a tricky topic, so I've dodged the issue by tweaking the example

Fixes #202
hadley committed Sep 23, 2015
1 parent f096ca8 commit da39530
Showing 15 changed files with 73 additions and 47 deletions.
4 changes: 2 additions & 2 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -19,8 +19,8 @@
* Pathological zero row inputs (due to empty input, `skip` or `n_max`) now
return zero row data frames (#119).

* readr has better support for embedded nul values in strings: any characters
after the nul are silently dropped (#202).
* readr has better support for embedded null values in strings: any characters
after the null are dropped with a warning (#202).

* Numbers with leading zeros now default to being parsed as text, rather than
as integers/doubles. Override with `col_integer()` or `col_double()` (#266).
3 changes: 3 additions & 0 deletions src/Collector.cpp
Original file line number Diff line number Diff line change
@@ -71,6 +71,9 @@ void CollectorCharacter::setValue(int i, const Token& t) {
boost::container::string buffer;
SourceIterators string = t.getString(&buffer);

if (t.hasNull())
warn(t.row(), t.col(), "", "embedded null");

SET_STRING_ELT(column_, i, pEncoder_->makeSEXP(string.first, string.second));
break;
};
10 changes: 5 additions & 5 deletions src/Iconv.cpp
Original file line number Diff line number Diff line change
@@ -56,17 +56,17 @@ size_t Iconv::convert(const char* start, const char* end) {

// To be safe, we need to check for nulls - this also needs to emit
// a warning, but this behaviour is better than crashing
SEXP safeMakeChar(const char* start, size_t n) {
int m = strnlen(start, n);
SEXP safeMakeChar(const char* start, size_t n, bool hasNull) {
int m = hasNull ? strnlen(start, n) : n;
return Rf_mkCharLenCE(start, m, CE_UTF8);
}

SEXP Iconv::makeSEXP(const char* start, const char* end) {
SEXP Iconv::makeSEXP(const char* start, const char* end, bool hasNull) {
if (cd_ == NULL)
return safeMakeChar(start, end - start);
return safeMakeChar(start, end - start, hasNull);

int n = convert(start, end);
return safeMakeChar(&buffer_[0], n);
return safeMakeChar(&buffer_[0], n, hasNull);
}

std::string Iconv::makeString(const char* start, const char* end) {
5 changes: 2 additions & 3 deletions src/Iconv.h
Original file line number Diff line number Diff line change
@@ -12,9 +12,8 @@ class Iconv {
Iconv(const std::string& from, const std::string& to = "UTF-8");
virtual ~Iconv();

SEXP makeSEXP(const char* start, const char* end);
std::string
makeString(const char* start, const char* end);
SEXP makeSEXP(const char* start, const char* end, bool hasNull = true);
std::string makeString(const char* start, const char* end);

private:

11 changes: 9 additions & 2 deletions src/Token.h
Original file line number Diff line number Diff line change
@@ -18,19 +18,22 @@ class Token {
TokenType type_;
SourceIterator begin_, end_;
size_t row_, col_;
bool hasNull_;

Tokenizer* pTokenizer_;

public:

Token(): type_(TOKEN_EMPTY), row_(0), col_(0) {}
Token(TokenType type, int row, int col): type_(type), row_(row), col_(col) {}
Token(SourceIterator begin, SourceIterator end, int row, int col, Tokenizer* pTokenizer = NULL):
Token(SourceIterator begin, SourceIterator end, int row, int col, bool hasNull,
Tokenizer* pTokenizer = NULL):
type_(TOKEN_STRING),
begin_(begin),
end_(end),
row_(row),
col_(col),
hasNull_(hasNull),
pTokenizer_(pTokenizer)
{
if (begin_ == end_)
@@ -69,7 +72,7 @@ class Token {
boost::container::string buffer;
SourceIterators string = getString(&buffer);

return pEncoder->makeSEXP(string.first, string.second);
return pEncoder->makeSEXP(string.first, string.second, hasNull_);
}
default:
return NA_STRING;
@@ -95,6 +98,10 @@ class Token {
return col_;
}

bool hasNull() const {
return hasNull_;
}

Token& trim() {
while (begin_ != end_ && *begin_ == ' ')
begin_++;
45 changes: 25 additions & 20 deletions src/TokenizerDelim.cpp
Original file line number Diff line number Diff line change
@@ -43,13 +43,16 @@ Token TokenizerDelim::nextToken() {
return Token(TOKEN_EOF, row, col);

SourceIterator token_begin = cur_;
bool hasEscapeD = false, hasEscapeB = false;
bool hasEscapeD = false, hasEscapeB = false, hasNull = false;

while (cur_ != end_) {
// Increments cur on destruct, ensuring that we always move on to the
// next character
Advance advance(&cur_);

if (*cur_ == '\0')
hasNull = true;

if ((row_ + 1) % 100000 == 0 || (col_ + 1) % 100000 == 0)
Rcpp::checkUserInterrupt();

@@ -76,15 +79,15 @@ Token TokenizerDelim::nextToken() {
case STATE_FIELD:
if (*cur_ == '\r' || *cur_ == '\n') {
newRecord();
return fieldToken(token_begin, advanceForLF(&cur_, end_), hasEscapeB, row, col);
return fieldToken(token_begin, advanceForLF(&cur_, end_), hasEscapeB, hasNull, row, col);
} else if (isComment(cur_)) {
state_ = STATE_COMMENT;
return fieldToken(token_begin, cur_, hasEscapeB, row, col);
return fieldToken(token_begin, cur_, hasEscapeB, hasNull, row, col);
} else if (escapeBackslash_ && *cur_ == '\\') {
state_ = STATE_ESCAPE_F;
} else if (*cur_ == delim_) {
newField();
return fieldToken(token_begin, cur_, hasEscapeB, row, col);
return fieldToken(token_begin, cur_, hasEscapeB, hasNull, row, col);
}
break;

@@ -100,15 +103,15 @@ Token TokenizerDelim::nextToken() {
} else if (*cur_ == '\r' || *cur_ == '\n') {
newRecord();
return stringToken(token_begin + 1, advanceForLF(&cur_, end_) - 1,
hasEscapeB, hasEscapeD, row, col);
hasEscapeB, hasEscapeD, hasNull, row, col);
} else if (isComment(cur_)) {
state_ = STATE_COMMENT;
return stringToken(token_begin + 1, cur_ - 1,
hasEscapeB, hasEscapeD, row, col);
hasEscapeB, hasEscapeD, hasNull, row, col);
} else if (*cur_ == delim_) {
newField();
return stringToken(token_begin + 1, cur_ - 1,
hasEscapeB, hasEscapeD, row, col);
hasEscapeB, hasEscapeD, hasNull, row, col);
} else {
warn(row, col, "delimiter or quote", std::string(cur_, cur_ + 1));
state_ = STATE_STRING;
@@ -131,13 +134,13 @@ Token TokenizerDelim::nextToken() {
if (*cur_ == '\r' || *cur_ == '\n') {
newRecord();
return stringToken(token_begin + 1, advanceForLF(&cur_, end_) - 1,
hasEscapeB, hasEscapeD, row, col);
hasEscapeB, hasEscapeD, hasNull, row, col);
} else if (isComment(cur_)) {
state_ = STATE_COMMENT;
return stringToken(token_begin + 1, cur_ - 1, hasEscapeB, hasEscapeD, row, col);
return stringToken(token_begin + 1, cur_ - 1, hasEscapeB, hasEscapeD, hasNull, row, col);
} else if (*cur_ == delim_) {
newField();
return stringToken(token_begin + 1, cur_ - 1, hasEscapeB, hasEscapeD, row, col);
return stringToken(token_begin + 1, cur_ - 1, hasEscapeB, hasEscapeD, hasNull, row, col);
} else {
state_ = STATE_FIELD;
}
@@ -171,19 +174,19 @@ Token TokenizerDelim::nextToken() {

case STATE_STRING_END:
case STATE_QUOTE:
return stringToken(token_begin + 1, end_ - 1, hasEscapeB, hasEscapeD, row, col);
return stringToken(token_begin + 1, end_ - 1, hasEscapeB, hasEscapeD, hasNull, row, col);

case STATE_STRING:
warn(row, col, "closing quote at end of file");
return stringToken(token_begin + 1, end_, hasEscapeB, hasEscapeD, row, col);
return stringToken(token_begin + 1, end_, hasEscapeB, hasEscapeD, hasNull, row, col);

case STATE_ESCAPE_S:
case STATE_ESCAPE_F:
warn(row, col, "closing escape at end of file");
return stringToken(token_begin, end_ - 1, hasEscapeB, hasEscapeD, row, col);
return stringToken(token_begin, end_ - 1, hasEscapeB, hasEscapeD, hasNull, row, col);

case STATE_FIELD:
return fieldToken(token_begin, end_, hasEscapeB, row, col);
return fieldToken(token_begin, end_, hasEscapeB, hasNull, row, col);

case STATE_COMMENT:
return Token(TOKEN_EOF, row, col);
@@ -215,18 +218,20 @@ Token TokenizerDelim::emptyToken(int row, int col) {
return Token(TOKEN_EMPTY, row, col).flagNA(NA_);
}

Token TokenizerDelim::fieldToken(SourceIterator begin, SourceIterator end, bool hasEscapeB,
int row, int col) {
Token t(begin, end, row, col, (hasEscapeB) ? this : NULL);
Token TokenizerDelim::fieldToken(SourceIterator begin, SourceIterator end,
bool hasEscapeB, bool hasNull,
int row, int col) {
Token t(begin, end, row, col, hasNull, (hasEscapeB) ? this : NULL);
if (trimWS_)
t.trim();
t.flagNA(NA_);
return t;
}

Token TokenizerDelim::stringToken(SourceIterator begin, SourceIterator end, bool hasEscapeB,
bool hasEscapeD, int row, int col) {
Token t(begin, end, row, col, (hasEscapeD || hasEscapeB) ? this : NULL);
Token TokenizerDelim::stringToken(SourceIterator begin, SourceIterator end,
bool hasEscapeB, bool hasEscapeD, bool hasNull,
int row, int col) {
Token t(begin, end, row, col, hasNull, (hasEscapeD || hasEscapeB) ? this : NULL);
if (trimWS_)
t.trim();
return t;
4 changes: 2 additions & 2 deletions src/TokenizerDelim.h
Original file line number Diff line number Diff line change
@@ -57,10 +57,10 @@ class TokenizerDelim : public Tokenizer {
Token emptyToken(int row, int col);

Token fieldToken(SourceIterator begin, SourceIterator end, bool hasEscapeB,
int row, int col);
bool hasNull, int row, int col);

Token stringToken(SourceIterator begin, SourceIterator end, bool hasEscapeB,
bool hasEscapeD, int row, int col);
bool hasEscapeD, bool hasNull, int row, int col);

void unescapeBackslash(SourceIterator begin, SourceIterator end,
boost::container::string* pOut);
15 changes: 10 additions & 5 deletions src/TokenizerFwf.cpp
Original file line number Diff line number Diff line change
@@ -134,7 +134,7 @@ Token TokenizerFwf::nextToken() {

SourceIterator fieldEnd = fieldBegin;
int width = endOffset_[col_] - beginOffset_[col_];
bool lastCol = (col_ == cols_ - 1), tooShort = false;
bool lastCol = (col_ == cols_ - 1), tooShort = false, hasNull = false;

// Find the end of the field, stopping for newlines
for(int i = 0; i < width; ++i) {
@@ -144,16 +144,21 @@ Token TokenizerFwf::nextToken() {
tooShort = true;
break;
}
if (*fieldEnd == '\0')
hasNull = true;

fieldEnd++;
}
// Last column is often ragged, so read until end of line (ignoring width)
if (lastCol) {
while(fieldEnd != end_ && *fieldEnd != '\r' && *fieldEnd != '\n')
while(fieldEnd != end_ && *fieldEnd != '\r' && *fieldEnd != '\n') {
if (*fieldEnd == '\0')
hasNull = true;
fieldEnd++;
}
}

Token t = fieldToken(fieldBegin, fieldEnd);
Token t = fieldToken(fieldBegin, fieldEnd, hasNull);

if (lastCol || tooShort) {
row_++;
@@ -170,11 +175,11 @@ Token TokenizerFwf::nextToken() {
return t;
}

Token TokenizerFwf::fieldToken(SourceIterator begin, SourceIterator end) {
Token TokenizerFwf::fieldToken(SourceIterator begin, SourceIterator end, bool hasNull) {
if (begin == end)
return Token(TOKEN_MISSING, row_, col_);

Token t = Token(begin, end, row_, col_);
Token t = Token(begin, end, row_, col_, hasNull);
t.trim();
t.flagNA(NA_);

2 changes: 1 addition & 1 deletion src/TokenizerFwf.h
Original file line number Diff line number Diff line change
@@ -27,7 +27,7 @@ class TokenizerFwf : public Tokenizer {

private:

Token fieldToken(SourceIterator begin, SourceIterator end);
Token fieldToken(SourceIterator begin, SourceIterator end, bool hasNull);
};

#endif
9 changes: 7 additions & 2 deletions src/TokenizerLine.h
Original file line number Diff line number Diff line change
@@ -31,19 +31,24 @@ class TokenizerLine : public Tokenizer {
Token nextToken() {
SourceIterator token_begin = cur_;

bool hasNull = false;

if (!moreTokens_)
return Token(TOKEN_EOF, line_, 0);

while (cur_ != end_) {
Advance advance(&cur_);

if (*cur_ == '\0')
hasNull = true;

if ((line_ + 1) % 500000 == 0)
Rcpp::checkUserInterrupt();

switch(*cur_) {
case '\r':
case '\n':
return Token(token_begin, advanceForLF(&cur_, end_), line_++, 0);
return Token(token_begin, advanceForLF(&cur_, end_), hasNull, line_++, 0);
default:
break;
}
@@ -54,7 +59,7 @@ class TokenizerLine : public Tokenizer {
if (token_begin == end_) {
return Token(TOKEN_EOF, line_++, 0);
} else {
return Token(token_begin, end_, line_++, 0);
return Token(token_begin, end_, hasNull, line_++, 0);
}
}

2 changes: 1 addition & 1 deletion src/TokenizerLog.h
Original file line number Diff line number Diff line change
@@ -167,7 +167,7 @@ class TokenizerLog : public Tokenizer {
}

Token fieldToken(SourceIterator begin, SourceIterator end, int row, int col) {
return Token(begin, end, row, col).flagNA(std::vector<std::string>(1, "-"));
return Token(begin, end, false, row, col).flagNA(std::vector<std::string>(1, "-"));
}

};
3 changes: 2 additions & 1 deletion src/parse.cpp
Original file line number Diff line number Diff line change
@@ -58,6 +58,7 @@ RObject read_header_(List sourceSpec, List tokenizerSpec, List locale_) {
tokenizer->setWarnings(&warnings);

CollectorCharacter out(&locale.encoder_);
out.setWarnings(&warnings);

for (Token t = tokenizer->nextToken(); t.type() != TOKEN_EOF; t = tokenizer->nextToken()) {
if (t.row() > (size_t) 0) // only read one row
@@ -122,7 +123,7 @@ SEXP parse_vector_(CharacterVector x, List collectorSpec,
t = Token(TOKEN_MISSING, i, -1);
} else {
SEXP string = x[i];
t = Token(CHAR(string), CHAR(string) + Rf_length(string), i, -1);
t = Token(CHAR(string), CHAR(string) + Rf_length(string), i, -1, false);
t.trim();
t.flagNA(na);
}
2 changes: 1 addition & 1 deletion src/type_convert.cpp
Original file line number Diff line number Diff line change
@@ -21,7 +21,7 @@ RObject type_convert_col(CharacterVector x, List spec, List locale_, int col,
t = Token(TOKEN_MISSING, i - 1, col - 1);
} else {
const char* begin = CHAR(string);
t = Token(begin, begin + Rf_length(string), i - 1, col - 1);
t = Token(begin, begin + Rf_length(string), i - 1, col - 1, false);
if (trim_ws)
t.trim();
t.flagNA(na);
Binary file modified tests/testthat/raw.csv
Binary file not shown.
5 changes: 3 additions & 2 deletions tests/testthat/test-read-csv.R
Original file line number Diff line number Diff line change
@@ -63,9 +63,10 @@ test_that("encoding affects text and headers", {
expect_identical(x[[1]], "élève")
})

test_that("nuls are silently dropped", {
test_that("nulls are dropped with a warning", {
x <- read_csv("raw.csv")
expect_equal(names(x), c("ab", "def"))
expect_equal(n_problems(x), 1)
expect_equal(x$abc, "ab")
})

# Column warnings ---------------------------------------------------------

0 comments on commit da39530

Please sign in to comment.