Skip to content

Commit

Permalink
Finish support for embedded nulls
Browse files Browse the repository at this point in the history
* Add hasNull_ field to Token
* Track presence of nulls in tokenizers.
* Warn in CollectorCharacter
* Update unit test. Warnings in the header are a tricky topic, so I've dodged the issue by tweaking the example

Fixes #202
hadley committed Sep 23, 2015
1 parent f096ca8 commit da39530
Showing 15 changed files with 73 additions and 47 deletions.
4 changes: 2 additions & 2 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -19,8 +19,8 @@
* Pathological zero row inputs (due to empty input, `skip` or `n_max`) now
return zero row data frames (#119).

* readr has better support for embedded nul values in strings: any characters
after the nul are silently dropped (#202).
* readr has better support for embedded null values in strings: any characters
after the null are dropped with a warning (#202).

* Numbers with leading zeros now default to being parsed as text, rather than
as integers/doubles. Override with `col_integer()` or `col_double()` (#266).
3 changes: 3 additions & 0 deletions src/Collector.cpp
Original file line number Diff line number Diff line change
@@ -71,6 +71,9 @@ void CollectorCharacter::setValue(int i, const Token& t) {
boost::container::string buffer;
SourceIterators string = t.getString(&buffer);

if (t.hasNull())
warn(t.row(), t.col(), "", "embedded null");

SET_STRING_ELT(column_, i, pEncoder_->makeSEXP(string.first, string.second));
break;
};
10 changes: 5 additions & 5 deletions src/Iconv.cpp
Original file line number Diff line number Diff line change
@@ -56,17 +56,17 @@ size_t Iconv::convert(const char* start, const char* end) {

// To be safe, we need to check for nulls - this also needs to emit
// a warning, but this behaviour is better than crashing
SEXP safeMakeChar(const char* start, size_t n) {
int m = strnlen(start, n);
SEXP safeMakeChar(const char* start, size_t n, bool hasNull) {
int m = hasNull ? strnlen(start, n) : n;
return Rf_mkCharLenCE(start, m, CE_UTF8);
}

SEXP Iconv::makeSEXP(const char* start, const char* end) {
SEXP Iconv::makeSEXP(const char* start, const char* end, bool hasNull) {
if (cd_ == NULL)
return safeMakeChar(start, end - start);
return safeMakeChar(start, end - start, hasNull);

int n = convert(start, end);
return safeMakeChar(&buffer_[0], n);
return safeMakeChar(&buffer_[0], n, hasNull);
}

std::string Iconv::makeString(const char* start, const char* end) {
5 changes: 2 additions & 3 deletions src/Iconv.h
Original file line number Diff line number Diff line change
@@ -12,9 +12,8 @@ class Iconv {
Iconv(const std::string& from, const std::string& to = "UTF-8");
virtual ~Iconv();

SEXP makeSEXP(const char* start, const char* end);
std::string
makeString(const char* start, const char* end);
SEXP makeSEXP(const char* start, const char* end, bool hasNull = true);
std::string makeString(const char* start, const char* end);

private:

11 changes: 9 additions & 2 deletions src/Token.h
Original file line number Diff line number Diff line change
@@ -18,19 +18,22 @@ class Token {
TokenType type_;
SourceIterator begin_, end_;
size_t row_, col_;
bool hasNull_;

Tokenizer* pTokenizer_;

public:

Token(): type_(TOKEN_EMPTY), row_(0), col_(0) {}
Token(TokenType type, int row, int col): type_(type), row_(row), col_(col) {}
Token(SourceIterator begin, SourceIterator end, int row, int col, Tokenizer* pTokenizer = NULL):
Token(SourceIterator begin, SourceIterator end, int row, int col, bool hasNull,
Tokenizer* pTokenizer = NULL):
type_(TOKEN_STRING),
begin_(begin),
end_(end),
row_(row),
col_(col),
hasNull_(hasNull),
pTokenizer_(pTokenizer)
{
if (begin_ == end_)
@@ -69,7 +72,7 @@ class Token {
boost::container::string buffer;
SourceIterators string = getString(&buffer);

return pEncoder->makeSEXP(string.first, string.second);
return pEncoder->makeSEXP(string.first, string.second, hasNull_);
}
default:
return NA_STRING;
@@ -95,6 +98,10 @@ class Token {
return col_;
}

bool hasNull() const {
return hasNull_;
}

Token& trim() {
while (begin_ != end_ && *begin_ == ' ')
begin_++;
45 changes: 25 additions & 20 deletions src/TokenizerDelim.cpp
Original file line number Diff line number Diff line change
@@ -43,13 +43,16 @@ Token TokenizerDelim::nextToken() {
return Token(TOKEN_EOF, row, col);

SourceIterator token_begin = cur_;
bool hasEscapeD = false, hasEscapeB = false;
bool hasEscapeD = false, hasEscapeB = false, hasNull = false;

while (cur_ != end_) {
// Increments cur on destruct, ensuring that we always move on to the
// next character
Advance advance(&cur_);

if (*cur_ == '\0')
hasNull = true;

if ((row_ + 1) % 100000 == 0 || (col_ + 1) % 100000 == 0)
Rcpp::checkUserInterrupt();

@@ -76,15 +79,15 @@ Token TokenizerDelim::nextToken() {
case STATE_FIELD:
if (*cur_ == '\r' || *cur_ == '\n') {
newRecord();
return fieldToken(token_begin, advanceForLF(&cur_, end_), hasEscapeB, row, col);
return fieldToken(token_begin, advanceForLF(&cur_, end_), hasEscapeB, hasNull, row, col);
} else if (isComment(cur_)) {
state_ = STATE_COMMENT;
return fieldToken(token_begin, cur_, hasEscapeB, row, col);
return fieldToken(token_begin, cur_, hasEscapeB, hasNull, row, col);
} else if (escapeBackslash_ && *cur_ == '\\') {
state_ = STATE_ESCAPE_F;
} else if (*cur_ == delim_) {
newField();
return fieldToken(token_begin, cur_, hasEscapeB, row, col);
return fieldToken(token_begin, cur_, hasEscapeB, hasNull, row, col);
}
break;

@@ -100,15 +103,15 @@ Token TokenizerDelim::nextToken() {
} else if (*cur_ == '\r' || *cur_ == '\n') {
newRecord();
return stringToken(token_begin + 1, advanceForLF(&cur_, end_) - 1,
hasEscapeB, hasEscapeD, row, col);
hasEscapeB, hasEscapeD, hasNull, row, col);
} else if (isComment(cur_)) {
state_ = STATE_COMMENT;
return stringToken(token_begin + 1, cur_ - 1,
hasEscapeB, hasEscapeD, row, col);
hasEscapeB, hasEscapeD, hasNull, row, col);
} else if (*cur_ == delim_) {
newField();
return stringToken(token_begin + 1, cur_ - 1,
hasEscapeB, hasEscapeD, row, col);
hasEscapeB, hasEscapeD, hasNull, row, col);
} else {
warn(row, col, "delimiter or quote", std::string(cur_, cur_ + 1));
state_ = STATE_STRING;
@@ -131,13 +134,13 @@ Token TokenizerDelim::nextToken() {
if (*cur_ == '\r' || *cur_ == '\n') {
newRecord();
return stringToken(token_begin + 1, advanceForLF(&cur_, end_) - 1,
hasEscapeB, hasEscapeD, row, col);
hasEscapeB, hasEscapeD, hasNull, row, col);
} else if (isComment(cur_)) {
state_ = STATE_COMMENT;
return stringToken(token_begin + 1, cur_ - 1, hasEscapeB, hasEscapeD, row, col);
return stringToken(token_begin + 1, cur_ - 1, hasEscapeB, hasEscapeD, hasNull, row, col);
} else if (*cur_ == delim_) {
newField();
return stringToken(token_begin + 1, cur_ - 1, hasEscapeB, hasEscapeD, row, col);
return stringToken(token_begin + 1, cur_ - 1, hasEscapeB, hasEscapeD, hasNull, row, col);
} else {
state_ = STATE_FIELD;
}
@@ -171,19 +174,19 @@ Token TokenizerDelim::nextToken() {

case STATE_STRING_END:
case STATE_QUOTE:
return stringToken(token_begin + 1, end_ - 1, hasEscapeB, hasEscapeD, row, col);
return stringToken(token_begin + 1, end_ - 1, hasEscapeB, hasEscapeD, hasNull, row, col);

case STATE_STRING:
warn(row, col, "closing quote at end of file");
return stringToken(token_begin + 1, end_, hasEscapeB, hasEscapeD, row, col);
return stringToken(token_begin + 1, end_, hasEscapeB, hasEscapeD, hasNull, row, col);

case STATE_ESCAPE_S:
case STATE_ESCAPE_F:
warn(row, col, "closing escape at end of file");
return stringToken(token_begin, end_ - 1, hasEscapeB, hasEscapeD, row, col);
return stringToken(token_begin, end_ - 1, hasEscapeB, hasEscapeD, hasNull, row, col);

case STATE_FIELD:
return fieldToken(token_begin, end_, hasEscapeB, row, col);
return fieldToken(token_begin, end_, hasEscapeB, hasNull, row, col);

case STATE_COMMENT:
return Token(TOKEN_EOF, row, col);
@@ -215,18 +218,20 @@ Token TokenizerDelim::emptyToken(int row, int col) {
return Token(TOKEN_EMPTY, row, col).flagNA(NA_);
}

Token TokenizerDelim::fieldToken(SourceIterator begin, SourceIterator end, bool hasEscapeB,
int row, int col) {
Token t(begin, end, row, col, (hasEscapeB) ? this : NULL);
Token TokenizerDelim::fieldToken(SourceIterator begin, SourceIterator end,
bool hasEscapeB, bool hasNull,
int row, int col) {
Token t(begin, end, row, col, hasNull, (hasEscapeB) ? this : NULL);
if (trimWS_)
t.trim();
t.flagNA(NA_);
return t;
}

Token TokenizerDelim::stringToken(SourceIterator begin, SourceIterator end, bool hasEscapeB,
bool hasEscapeD, int row, int col) {
Token t(begin, end, row, col, (hasEscapeD || hasEscapeB) ? this : NULL);
Token TokenizerDelim::stringToken(SourceIterator begin, SourceIterator end,
bool hasEscapeB, bool hasEscapeD, bool hasNull,
int row, int col) {
Token t(begin, end, row, col, hasNull, (hasEscapeD || hasEscapeB) ? this : NULL);
if (trimWS_)
t.trim();
return t;
4 changes: 2 additions & 2 deletions src/TokenizerDelim.h
Original file line number Diff line number Diff line change
@@ -57,10 +57,10 @@ class TokenizerDelim : public Tokenizer {
Token emptyToken(int row, int col);

Token fieldToken(SourceIterator begin, SourceIterator end, bool hasEscapeB,
int row, int col);
bool hasNull, int row, int col);

Token stringToken(SourceIterator begin, SourceIterator end, bool hasEscapeB,
bool hasEscapeD, int row, int col);
bool hasEscapeD, bool hasNull, int row, int col);

void unescapeBackslash(SourceIterator begin, SourceIterator end,
boost::container::string* pOut);
15 changes: 10 additions & 5 deletions src/TokenizerFwf.cpp
Original file line number Diff line number Diff line change
@@ -134,7 +134,7 @@ Token TokenizerFwf::nextToken() {

SourceIterator fieldEnd = fieldBegin;
int width = endOffset_[col_] - beginOffset_[col_];
bool lastCol = (col_ == cols_ - 1), tooShort = false;
bool lastCol = (col_ == cols_ - 1), tooShort = false, hasNull = false;

// Find the end of the field, stopping for newlines
for(int i = 0; i < width; ++i) {
@@ -144,16 +144,21 @@ Token TokenizerFwf::nextToken() {
tooShort = true;
break;
}
if (*fieldEnd == '\0')
hasNull = true;

fieldEnd++;
}
// Last column is often ragged, so read until end of line (ignoring width)
if (lastCol) {
while(fieldEnd != end_ && *fieldEnd != '\r' && *fieldEnd != '\n')
while(fieldEnd != end_ && *fieldEnd != '\r' && *fieldEnd != '\n') {
if (*fieldEnd == '\0')
hasNull = true;
fieldEnd++;
}
}

Token t = fieldToken(fieldBegin, fieldEnd);
Token t = fieldToken(fieldBegin, fieldEnd, hasNull);

if (lastCol || tooShort) {
row_++;
@@ -170,11 +175,11 @@ Token TokenizerFwf::nextToken() {
return t;
}

Token TokenizerFwf::fieldToken(SourceIterator begin, SourceIterator end) {
Token TokenizerFwf::fieldToken(SourceIterator begin, SourceIterator end, bool hasNull) {
if (begin == end)
return Token(TOKEN_MISSING, row_, col_);

Token t = Token(begin, end, row_, col_);
Token t = Token(begin, end, row_, col_, hasNull);
t.trim();
t.flagNA(NA_);

2 changes: 1 addition & 1 deletion src/TokenizerFwf.h
Original file line number Diff line number Diff line change
@@ -27,7 +27,7 @@ class TokenizerFwf : public Tokenizer {

private:

Token fieldToken(SourceIterator begin, SourceIterator end);
Token fieldToken(SourceIterator begin, SourceIterator end, bool hasNull);
};

#endif
9 changes: 7 additions & 2 deletions src/TokenizerLine.h
Original file line number Diff line number Diff line change
@@ -31,19 +31,24 @@ class TokenizerLine : public Tokenizer {
Token nextToken() {
SourceIterator token_begin = cur_;

bool hasNull = false;

if (!moreTokens_)
return Token(TOKEN_EOF, line_, 0);

while (cur_ != end_) {
Advance advance(&cur_);

if (*cur_ == '\0')
hasNull = true;

if ((line_ + 1) % 500000 == 0)
Rcpp::checkUserInterrupt();

switch(*cur_) {
case '\r':
case '\n':
return Token(token_begin, advanceForLF(&cur_, end_), line_++, 0);
return Token(token_begin, advanceForLF(&cur_, end_), hasNull, line_++, 0);
default:
break;
}
@@ -54,7 +59,7 @@ class TokenizerLine : public Tokenizer {
if (token_begin == end_) {
return Token(TOKEN_EOF, line_++, 0);
} else {
return Token(token_begin, end_, line_++, 0);
return Token(token_begin, end_, hasNull, line_++, 0);
}
}

2 changes: 1 addition & 1 deletion src/TokenizerLog.h
Original file line number Diff line number Diff line change
@@ -167,7 +167,7 @@ class TokenizerLog : public Tokenizer {
}

Token fieldToken(SourceIterator begin, SourceIterator end, int row, int col) {
return Token(begin, end, row, col).flagNA(std::vector<std::string>(1, "-"));
return Token(begin, end, false, row, col).flagNA(std::vector<std::string>(1, "-"));
}

};
3 changes: 2 additions & 1 deletion src/parse.cpp
Original file line number Diff line number Diff line change
@@ -58,6 +58,7 @@ RObject read_header_(List sourceSpec, List tokenizerSpec, List locale_) {
tokenizer->setWarnings(&warnings);

CollectorCharacter out(&locale.encoder_);
out.setWarnings(&warnings);

for (Token t = tokenizer->nextToken(); t.type() != TOKEN_EOF; t = tokenizer->nextToken()) {
if (t.row() > (size_t) 0) // only read one row
@@ -122,7 +123,7 @@ SEXP parse_vector_(CharacterVector x, List collectorSpec,
t = Token(TOKEN_MISSING, i, -1);
} else {
SEXP string = x[i];
t = Token(CHAR(string), CHAR(string) + Rf_length(string), i, -1);
t = Token(CHAR(string), CHAR(string) + Rf_length(string), i, -1, false);
t.trim();
t.flagNA(na);
}
2 changes: 1 addition & 1 deletion src/type_convert.cpp
Original file line number Diff line number Diff line change
@@ -21,7 +21,7 @@ RObject type_convert_col(CharacterVector x, List spec, List locale_, int col,
t = Token(TOKEN_MISSING, i - 1, col - 1);
} else {
const char* begin = CHAR(string);
t = Token(begin, begin + Rf_length(string), i - 1, col - 1);
t = Token(begin, begin + Rf_length(string), i - 1, col - 1, false);
if (trim_ws)
t.trim();
t.flagNA(na);
Binary file modified tests/testthat/raw.csv
Binary file not shown.
5 changes: 3 additions & 2 deletions tests/testthat/test-read-csv.R
Original file line number Diff line number Diff line change
@@ -63,9 +63,10 @@ test_that("encoding affects text and headers", {
expect_identical(x[[1]], "élève")
})

test_that("nuls are silently dropped", {
test_that("nulls are dropped with a warning", {
x <- read_csv("raw.csv")
expect_equal(names(x), c("ab", "def"))
expect_equal(n_problems(x), 1)
expect_equal(x$abc, "ab")
})

# Column warnings ---------------------------------------------------------

0 comments on commit da39530

Please sign in to comment.