Skip to content

Commit

Permalink
Excel leap day bug (#422)
Browse files Browse the repository at this point in the history
* Fix comment typo

* Warn when times are not allowed due to daylight savings time

* Correct typo in error message

* Work around Excel leap day bug (29 Feb 1900 does not exist)

* Fix test

* Work around as.POSIX* cross-platform bugs by moving to lubridate
  • Loading branch information
billdenney authored Jan 19, 2021
1 parent f9b2d27 commit 2d1a1a6
Show file tree
Hide file tree
Showing 5 changed files with 105 additions and 41 deletions.
6 changes: 6 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,12 @@ export(top_levels)
export(untabyl)
export(use_first_valid_of)
importFrom(dplyr,rename_all)
importFrom(lubridate,as_date)
importFrom(lubridate,as_datetime)
importFrom(lubridate,force_tz)
importFrom(lubridate,hour)
importFrom(lubridate,minute)
importFrom(lubridate,second)
importFrom(lubridate,ymd)
importFrom(lubridate,ymd_hms)
importFrom(magrittr,"%>%")
Expand Down
7 changes: 7 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
# janitor 2.1.0.9000 (unreleased, under development)

## Breaking changes

* Microsoft Excel incorrectly has a leap day on 29 February 1900 (see https://docs.microsoft.com/en-us/office/troubleshoot/excel/wrongly-assumes-1900-is-leap-year). `excel_numeric_to_date()` did not account for this error, and now it does (thanks **@billdenney** for fixing). Dates output from `excel_numeric_to_date()` before 1 March 1900 will now be one day later compared to previous versions (i.e. what was 1 Feb 1900 is now 2 Feb 1900), and dates that Excel presents as 29 Feb 1900 will become `as.POSIXct(NA)`. (thanks **@billdenney** for fixing)
* A minor breaking change is the time zone is always set for `excel_numeric_to_date()` and `convert_date()`. The default timezone is `Sys.timezone()` while before it was an empty string (`""`).

## Minor features

* `excel_numeric_to_date()` now warns when times are converted to `NA` due to hours that do not exist because of daylight savings time. (fix #420, thanks **@Geomorph2** for reporting and **@billdenney** for fixing)

# janitor 2.1.0 (2021-01-05)

Expand Down
38 changes: 23 additions & 15 deletions R/excel_dates.R
Original file line number Diff line number Diff line change
Expand Up @@ -42,22 +42,20 @@
#' excel_numeric_to_date(40000.521, include_time = TRUE,
#' round_seconds = FALSE) # Time with fractional seconds is included
#' @family Date-time cleaning

# Converts a numeric value like 42414 into a date "2016-02-14"

excel_numeric_to_date <- function(date_num, date_system = "modern", include_time = FALSE, round_seconds = TRUE, tz = "") {
#' @importFrom lubridate as_date as_datetime force_tz hour minute second
excel_numeric_to_date <- function(date_num, date_system = "modern", include_time = FALSE, round_seconds = TRUE, tz = Sys.timezone()) {
if (all(is.na(date_num))) {
# For NA input, return the expected type of NA output.
if (include_time) {
return(as.POSIXlt(as.character(date_num)))
return(lubridate::as_datetime(date_num, tz=tz))
} else {
return(as.Date(as.character(date_num)))
return(lubridate::as_date(date_num))
}
} else if (!is.numeric(date_num)) {
stop("argument `date_num` must be of class numeric")
}

# Manage floating point imprecision; coerce to double to avoid inteter
# Manage floating point imprecision; coerce to double to avoid integer
# overflow.
date_num_days <- (as.double(date_num) * 86400L + 0.001) %/% 86400L
date_num_days_no_floating_correction <- date_num %/% 1
Expand All @@ -71,20 +69,30 @@ excel_numeric_to_date <- function(date_num, date_system = "modern", include_time
if (any(mask_day_rollover)) {
warning(sum(mask_day_rollover), " date_num values are within 0.001 sec of a later date and were rounded up to the next day.")
}
mask_excel_leap_day_bug <- !is.na(date_num_days) & floor(date_num_days) == 60
mask_before_excel_leap_day_bug <- !is.na(date_num_days) & floor(date_num_days) < 60
date_num_days[mask_excel_leap_day_bug] <- NA_real_
date_num_days[mask_before_excel_leap_day_bug] <- date_num_days[mask_before_excel_leap_day_bug] + 1
ret <-
if (date_system == "mac pre-2011") {
as.Date(floor(date_num_days), origin = "1904-01-01")
lubridate::as_date(floor(date_num_days), origin = "1904-01-01")
} else if (date_system == "modern") {
as.Date(floor(date_num_days), origin = "1899-12-30")
lubridate::as_date(floor(date_num_days), origin = "1899-12-30")
} else {
stop("argument 'created' must be one of 'mac pre-2011' or 'modern'")
stop("argument 'date_system' must be one of 'mac pre-2011' or 'modern'")
}
if (include_time) {
ret <- as.POSIXlt(ret, tz = tz)
ret$sec <- date_num_seconds %% 60
ret$min <- floor(date_num_seconds/60) %% 60
ret$hour <- floor(date_num_seconds/3600)
ret <- as.POSIXct(ret, tz = tz)
ret <- lubridate::as_datetime(ret)
lubridate::second(ret) <- date_num_seconds %% 60
lubridate::minute(ret) <- floor(date_num_seconds/60) %% 60
lubridate::hour(ret) <- floor(date_num_seconds/3600)
ret <- lubridate::force_tz(ret, tzone=tz)
}
if (any(mask_excel_leap_day_bug)) {
warning("NAs introduced by coercion, Excel leap day bug detected in `date_num`. 29 February 1900 does not exist.")
}
if (any(is.na(ret) & !is.na(date_num) & !mask_excel_leap_day_bug)) {
warning("NAs introduced by coercion, possible daylight savings time issue with input. Consider `tz='UTC'`.")
}
ret
}
2 changes: 1 addition & 1 deletion man/excel_numeric_to_date.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

93 changes: 68 additions & 25 deletions tests/testthat/test-date-conversion.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@ test_that("Serial number dates convert correctly", {

test_that("Bad inputs handled appropriately", {
expect_error(excel_numeric_to_date("hello"), "argument `date_num` must be of class numeric")
expect_error(excel_numeric_to_date(40908, "bad string"), "argument 'created' must be one of 'mac pre-2011' or 'modern'")
expect_error(excel_numeric_to_date(40908, 4), "argument 'created' must be one of 'mac pre-2011' or 'modern'")
expect_error(excel_numeric_to_date(40908, "bad string"), "argument 'date_system' must be one of 'mac pre-2011' or 'modern'")
expect_error(excel_numeric_to_date(40908, 4), "argument 'date_system' must be one of 'mac pre-2011' or 'modern'")
})

test_that("time handling works correctly", {
Expand Down Expand Up @@ -59,19 +59,29 @@ test_that("time handling at the edge of the next date works correctly", {
})

test_that("excel_numeric_to_date handles NA", {
expect_equal(excel_numeric_to_date(NA),
as.Date(NA_character_),
info="Return NA output of the correct class (Date) for NA input.")
expect_equal(excel_numeric_to_date(NA, include_time=TRUE),
as.POSIXct(NA_character_),
info="Return NA output of the correct class (POSIXct) for NA input.")
expect_equal(excel_numeric_to_date(c(43088, NA)),
as.Date(floor(c(43088, NA)), origin = "1899-12-30"),
info="Return NA output as part of a vector of inputs correctly")
expect_equal(excel_numeric_to_date(c(43088, NA), include_time=TRUE),
structure(as.POSIXlt(as.Date(floor(c(43088, NA)), origin = "1899-12-30")),
tzone=NULL),
info="Return NA output as part of a vector of inputs correctly")
expect_equal(
excel_numeric_to_date(NA),
as.Date(NA_character_),
info="Return NA output of the correct class (Date) for NA input."
)
expect_equal(
excel_numeric_to_date(NA, include_time=TRUE),
as.POSIXct(NA_character_, tz=Sys.timezone()),
info="Return NA output of the correct class (POSIXct) for NA input."
)
expect_equal(
excel_numeric_to_date(c(43088, NA)),
as.Date(floor(c(43088, NA)), origin = "1899-12-30"),
info="Return NA output as part of a vector of inputs correctly"
)
expect_equal(
excel_numeric_to_date(c(43088, NA), include_time=TRUE),
structure(
as.POSIXlt(as.Date(floor(c(43088, NA)), origin = "1899-12-30")),
tzone=NULL
),
info="Return NA output as part of a vector of inputs correctly"
)
})

test_that("excel_numeric_to_date returns a POSIXct object when include_time is requested", {
Expand All @@ -80,19 +90,52 @@ test_that("excel_numeric_to_date returns a POSIXct object when include_time is r
})

test_that("time zone setting works", {
expect_equal(attr(excel_numeric_to_date(43001.11, include_time = TRUE), "tzone"),
"") # blank by default
expect_equal(attr(excel_numeric_to_date(43001.11, include_time = TRUE, tz = "America/New_York"), "tzone"),
"America/New_York")
# expect_warning(excel_numeric_to_date(43001.11, include_time = TRUE, tz = "nonsense"),
# "unknown timezone 'nonsense'")
# this test should be written:
# providing a bad timezone value defaults to blank tz value "" and throws warning
# Then delete prior test as it will fail

expect_equal(
attr(excel_numeric_to_date(43001.11, include_time = TRUE), "tzone"),
Sys.timezone(),
info="Defaults to the local timezone"
)
expect_equal(
attr(excel_numeric_to_date(43001.11, include_time = TRUE, tz = "America/New_York"), "tzone"),
"America/New_York"
)
expect_equal(
attr(excel_numeric_to_date(43001.11, include_time = TRUE, tz = "Europe/Zurich"), "tzone"),
"Europe/Zurich"
)
expect_error(
excel_numeric_to_date(43001.11, include_time = TRUE, tz = "nonsense"),
info="Invalid timezone gives an error"
)
})

test_that("integer Excel dates do not overflow (ref issue #241)", {
expect_equal(excel_numeric_to_date(42370L),
excel_numeric_to_date(42370))
})

test_that("daylight savings time handling (issue #420)", {
expect_equal(
expect_warning(
excel_numeric_to_date(43170.09, include_time=TRUE, tz="America/New_York"),
regexp="NAs introduced by coercion, possible daylight savings time issue with input. Consider `tz='UTC'`.",
fixed=TRUE
),
as.POSIXct(NA_real_, tz="America/New_York", origin="1900-01-01")
)
expect_equal(
excel_numeric_to_date(43170.09, include_time=TRUE, tz="UTC"),
as.POSIXct("2018-03-11 02:09:36", tz="UTC")
)
})

test_that("Nonexistent 29 Feb 1900 exists in Excel but not in accurate calendars", {
expect_equal(
expect_warning(
excel_numeric_to_date(59:61),
regexp="NAs introduced by coercion, Excel leap day bug detected in `date_num`. 29 February 1900 does not exist.",
fixed=TRUE
),
as.Date(c("1900-02-28", NA, "1900-03-01"))
)
})

0 comments on commit 2d1a1a6

Please sign in to comment.