Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Excel leap day bug #422

Merged
merged 7 commits into from
Jan 19, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,12 @@ export(top_levels)
export(untabyl)
export(use_first_valid_of)
importFrom(dplyr,rename_all)
importFrom(lubridate,as_date)
importFrom(lubridate,as_datetime)
importFrom(lubridate,force_tz)
importFrom(lubridate,hour)
importFrom(lubridate,minute)
importFrom(lubridate,second)
importFrom(lubridate,ymd)
importFrom(lubridate,ymd_hms)
importFrom(magrittr,"%>%")
Expand Down
7 changes: 7 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
# janitor 2.1.0.9000 (unreleased, under development)

## Breaking changes

* Microsoft Excel incorrectly has a leap day on 29 February 1900 (see https://docs.microsoft.com/en-us/office/troubleshoot/excel/wrongly-assumes-1900-is-leap-year). `excel_numeric_to_date()` did not account for this error, and now it does (thanks **@billdenney** for fixing). Dates output from `excel_numeric_to_date()` before 1 March 1900 will now be one day later compared to previous versions (i.e. what was 1 Feb 1900 is now 2 Feb 1900), and dates that Excel presents as 29 Feb 1900 will become `as.POSIXct(NA)`. (thanks **@billdenney** for fixing)
* A minor breaking change is the time zone is always set for `excel_numeric_to_date()` and `convert_date()`. The default timezone is `Sys.timezone()` while before it was an empty string (`""`).

## Minor features

* `excel_numeric_to_date()` now warns when times are converted to `NA` due to hours that do not exist because of daylight savings time. (fix #420, thanks **@Geomorph2** for reporting and **@billdenney** for fixing)

# janitor 2.1.0 (2021-01-05)

Expand Down
38 changes: 23 additions & 15 deletions R/excel_dates.R
Original file line number Diff line number Diff line change
Expand Up @@ -42,22 +42,20 @@
#' excel_numeric_to_date(40000.521, include_time = TRUE,
#' round_seconds = FALSE) # Time with fractional seconds is included
#' @family Date-time cleaning

# Converts a numeric value like 42414 into a date "2016-02-14"

excel_numeric_to_date <- function(date_num, date_system = "modern", include_time = FALSE, round_seconds = TRUE, tz = "") {
#' @importFrom lubridate as_date as_datetime force_tz hour minute second
excel_numeric_to_date <- function(date_num, date_system = "modern", include_time = FALSE, round_seconds = TRUE, tz = Sys.timezone()) {
if (all(is.na(date_num))) {
# For NA input, return the expected type of NA output.
if (include_time) {
return(as.POSIXlt(as.character(date_num)))
return(lubridate::as_datetime(date_num, tz=tz))
} else {
return(as.Date(as.character(date_num)))
return(lubridate::as_date(date_num))
}
} else if (!is.numeric(date_num)) {
stop("argument `date_num` must be of class numeric")
}

# Manage floating point imprecision; coerce to double to avoid inteter
# Manage floating point imprecision; coerce to double to avoid integer
# overflow.
date_num_days <- (as.double(date_num) * 86400L + 0.001) %/% 86400L
date_num_days_no_floating_correction <- date_num %/% 1
Expand All @@ -71,20 +69,30 @@ excel_numeric_to_date <- function(date_num, date_system = "modern", include_time
if (any(mask_day_rollover)) {
warning(sum(mask_day_rollover), " date_num values are within 0.001 sec of a later date and were rounded up to the next day.")
}
mask_excel_leap_day_bug <- !is.na(date_num_days) & floor(date_num_days) == 60
mask_before_excel_leap_day_bug <- !is.na(date_num_days) & floor(date_num_days) < 60
date_num_days[mask_excel_leap_day_bug] <- NA_real_
date_num_days[mask_before_excel_leap_day_bug] <- date_num_days[mask_before_excel_leap_day_bug] + 1
ret <-
if (date_system == "mac pre-2011") {
as.Date(floor(date_num_days), origin = "1904-01-01")
lubridate::as_date(floor(date_num_days), origin = "1904-01-01")
} else if (date_system == "modern") {
as.Date(floor(date_num_days), origin = "1899-12-30")
lubridate::as_date(floor(date_num_days), origin = "1899-12-30")
} else {
stop("argument 'created' must be one of 'mac pre-2011' or 'modern'")
stop("argument 'date_system' must be one of 'mac pre-2011' or 'modern'")
}
if (include_time) {
ret <- as.POSIXlt(ret, tz = tz)
ret$sec <- date_num_seconds %% 60
ret$min <- floor(date_num_seconds/60) %% 60
ret$hour <- floor(date_num_seconds/3600)
ret <- as.POSIXct(ret, tz = tz)
ret <- lubridate::as_datetime(ret)
lubridate::second(ret) <- date_num_seconds %% 60
lubridate::minute(ret) <- floor(date_num_seconds/60) %% 60
lubridate::hour(ret) <- floor(date_num_seconds/3600)
ret <- lubridate::force_tz(ret, tzone=tz)
}
if (any(mask_excel_leap_day_bug)) {
warning("NAs introduced by coercion, Excel leap day bug detected in `date_num`. 29 February 1900 does not exist.")
}
if (any(is.na(ret) & !is.na(date_num) & !mask_excel_leap_day_bug)) {
warning("NAs introduced by coercion, possible daylight savings time issue with input. Consider `tz='UTC'`.")
}
ret
}
2 changes: 1 addition & 1 deletion man/excel_numeric_to_date.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

93 changes: 68 additions & 25 deletions tests/testthat/test-date-conversion.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@ test_that("Serial number dates convert correctly", {

test_that("Bad inputs handled appropriately", {
expect_error(excel_numeric_to_date("hello"), "argument `date_num` must be of class numeric")
expect_error(excel_numeric_to_date(40908, "bad string"), "argument 'created' must be one of 'mac pre-2011' or 'modern'")
expect_error(excel_numeric_to_date(40908, 4), "argument 'created' must be one of 'mac pre-2011' or 'modern'")
expect_error(excel_numeric_to_date(40908, "bad string"), "argument 'date_system' must be one of 'mac pre-2011' or 'modern'")
expect_error(excel_numeric_to_date(40908, 4), "argument 'date_system' must be one of 'mac pre-2011' or 'modern'")
})

test_that("time handling works correctly", {
Expand Down Expand Up @@ -59,19 +59,29 @@ test_that("time handling at the edge of the next date works correctly", {
})

test_that("excel_numeric_to_date handles NA", {
expect_equal(excel_numeric_to_date(NA),
as.Date(NA_character_),
info="Return NA output of the correct class (Date) for NA input.")
expect_equal(excel_numeric_to_date(NA, include_time=TRUE),
as.POSIXct(NA_character_),
info="Return NA output of the correct class (POSIXct) for NA input.")
expect_equal(excel_numeric_to_date(c(43088, NA)),
as.Date(floor(c(43088, NA)), origin = "1899-12-30"),
info="Return NA output as part of a vector of inputs correctly")
expect_equal(excel_numeric_to_date(c(43088, NA), include_time=TRUE),
structure(as.POSIXlt(as.Date(floor(c(43088, NA)), origin = "1899-12-30")),
tzone=NULL),
info="Return NA output as part of a vector of inputs correctly")
expect_equal(
excel_numeric_to_date(NA),
as.Date(NA_character_),
info="Return NA output of the correct class (Date) for NA input."
)
expect_equal(
excel_numeric_to_date(NA, include_time=TRUE),
as.POSIXct(NA_character_, tz=Sys.timezone()),
info="Return NA output of the correct class (POSIXct) for NA input."
)
expect_equal(
excel_numeric_to_date(c(43088, NA)),
as.Date(floor(c(43088, NA)), origin = "1899-12-30"),
info="Return NA output as part of a vector of inputs correctly"
)
expect_equal(
excel_numeric_to_date(c(43088, NA), include_time=TRUE),
structure(
as.POSIXlt(as.Date(floor(c(43088, NA)), origin = "1899-12-30")),
tzone=NULL
),
info="Return NA output as part of a vector of inputs correctly"
)
})

test_that("excel_numeric_to_date returns a POSIXct object when include_time is requested", {
Expand All @@ -80,19 +90,52 @@ test_that("excel_numeric_to_date returns a POSIXct object when include_time is r
})

test_that("time zone setting works", {
expect_equal(attr(excel_numeric_to_date(43001.11, include_time = TRUE), "tzone"),
"") # blank by default
expect_equal(attr(excel_numeric_to_date(43001.11, include_time = TRUE, tz = "America/New_York"), "tzone"),
"America/New_York")
# expect_warning(excel_numeric_to_date(43001.11, include_time = TRUE, tz = "nonsense"),
# "unknown timezone 'nonsense'")
# this test should be written:
# providing a bad timezone value defaults to blank tz value "" and throws warning
# Then delete prior test as it will fail

expect_equal(
attr(excel_numeric_to_date(43001.11, include_time = TRUE), "tzone"),
Sys.timezone(),
info="Defaults to the local timezone"
)
expect_equal(
attr(excel_numeric_to_date(43001.11, include_time = TRUE, tz = "America/New_York"), "tzone"),
"America/New_York"
)
expect_equal(
attr(excel_numeric_to_date(43001.11, include_time = TRUE, tz = "Europe/Zurich"), "tzone"),
"Europe/Zurich"
)
expect_error(
excel_numeric_to_date(43001.11, include_time = TRUE, tz = "nonsense"),
info="Invalid timezone gives an error"
)
})

test_that("integer Excel dates do not overflow (ref issue #241)", {
expect_equal(excel_numeric_to_date(42370L),
excel_numeric_to_date(42370))
})

test_that("daylight savings time handling (issue #420)", {
expect_equal(
expect_warning(
excel_numeric_to_date(43170.09, include_time=TRUE, tz="America/New_York"),
regexp="NAs introduced by coercion, possible daylight savings time issue with input. Consider `tz='UTC'`.",
fixed=TRUE
),
as.POSIXct(NA_real_, tz="America/New_York", origin="1900-01-01")
)
expect_equal(
excel_numeric_to_date(43170.09, include_time=TRUE, tz="UTC"),
as.POSIXct("2018-03-11 02:09:36", tz="UTC")
)
})

test_that("Nonexistent 29 Feb 1900 exists in Excel but not in accurate calendars", {
expect_equal(
expect_warning(
excel_numeric_to_date(59:61),
regexp="NAs introduced by coercion, Excel leap day bug detected in `date_num`. 29 February 1900 does not exist.",
fixed=TRUE
),
as.Date(c("1900-02-28", NA, "1900-03-01"))
)
})