diff --git a/NAMESPACE b/NAMESPACE index 0f5511b7..f6f8acca 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -50,6 +50,12 @@ export(top_levels) export(untabyl) export(use_first_valid_of) importFrom(dplyr,rename_all) +importFrom(lubridate,as_date) +importFrom(lubridate,as_datetime) +importFrom(lubridate,force_tz) +importFrom(lubridate,hour) +importFrom(lubridate,minute) +importFrom(lubridate,second) importFrom(lubridate,ymd) importFrom(lubridate,ymd_hms) importFrom(magrittr,"%>%") diff --git a/NEWS.md b/NEWS.md index f7209e82..d03a9f9b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,6 +1,13 @@ # janitor 2.1.0.9000 (unreleased, under development) +## Breaking changes + +* Microsoft Excel incorrectly has a leap day on 29 February 1900 (see https://docs.microsoft.com/en-us/office/troubleshoot/excel/wrongly-assumes-1900-is-leap-year). `excel_numeric_to_date()` did not account for this error, and now it does (thanks **@billdenney** for fixing). Dates output from `excel_numeric_to_date()` before 1 March 1900 will now be one day later compared to previous versions (i.e. what was 1 Feb 1900 is now 2 Feb 1900), and dates that Excel presents as 29 Feb 1900 will become `as.POSIXct(NA)`. (thanks **@billdenney** for fixing) +* A minor breaking change is the time zone is always set for `excel_numeric_to_date()` and `convert_date()`. The default timezone is `Sys.timezone()` while before it was an empty string (`""`). + +## Minor features +* `excel_numeric_to_date()` now warns when times are converted to `NA` due to hours that do not exist because of daylight savings time. (fix #420, thanks **@Geomorph2** for reporting and **@billdenney** for fixing) # janitor 2.1.0 (2021-01-05) diff --git a/R/excel_dates.R b/R/excel_dates.R index c33ba06c..d32b951a 100644 --- a/R/excel_dates.R +++ b/R/excel_dates.R @@ -42,22 +42,20 @@ #' excel_numeric_to_date(40000.521, include_time = TRUE, #' round_seconds = FALSE) # Time with fractional seconds is included #' @family Date-time cleaning - -# Converts a numeric value like 42414 into a date "2016-02-14" - -excel_numeric_to_date <- function(date_num, date_system = "modern", include_time = FALSE, round_seconds = TRUE, tz = "") { +#' @importFrom lubridate as_date as_datetime force_tz hour minute second +excel_numeric_to_date <- function(date_num, date_system = "modern", include_time = FALSE, round_seconds = TRUE, tz = Sys.timezone()) { if (all(is.na(date_num))) { # For NA input, return the expected type of NA output. if (include_time) { - return(as.POSIXlt(as.character(date_num))) + return(lubridate::as_datetime(date_num, tz=tz)) } else { - return(as.Date(as.character(date_num))) + return(lubridate::as_date(date_num)) } } else if (!is.numeric(date_num)) { stop("argument `date_num` must be of class numeric") } - # Manage floating point imprecision; coerce to double to avoid inteter + # Manage floating point imprecision; coerce to double to avoid integer # overflow. date_num_days <- (as.double(date_num) * 86400L + 0.001) %/% 86400L date_num_days_no_floating_correction <- date_num %/% 1 @@ -71,20 +69,30 @@ excel_numeric_to_date <- function(date_num, date_system = "modern", include_time if (any(mask_day_rollover)) { warning(sum(mask_day_rollover), " date_num values are within 0.001 sec of a later date and were rounded up to the next day.") } + mask_excel_leap_day_bug <- !is.na(date_num_days) & floor(date_num_days) == 60 + mask_before_excel_leap_day_bug <- !is.na(date_num_days) & floor(date_num_days) < 60 + date_num_days[mask_excel_leap_day_bug] <- NA_real_ + date_num_days[mask_before_excel_leap_day_bug] <- date_num_days[mask_before_excel_leap_day_bug] + 1 ret <- if (date_system == "mac pre-2011") { - as.Date(floor(date_num_days), origin = "1904-01-01") + lubridate::as_date(floor(date_num_days), origin = "1904-01-01") } else if (date_system == "modern") { - as.Date(floor(date_num_days), origin = "1899-12-30") + lubridate::as_date(floor(date_num_days), origin = "1899-12-30") } else { - stop("argument 'created' must be one of 'mac pre-2011' or 'modern'") + stop("argument 'date_system' must be one of 'mac pre-2011' or 'modern'") } if (include_time) { - ret <- as.POSIXlt(ret, tz = tz) - ret$sec <- date_num_seconds %% 60 - ret$min <- floor(date_num_seconds/60) %% 60 - ret$hour <- floor(date_num_seconds/3600) - ret <- as.POSIXct(ret, tz = tz) + ret <- lubridate::as_datetime(ret) + lubridate::second(ret) <- date_num_seconds %% 60 + lubridate::minute(ret) <- floor(date_num_seconds/60) %% 60 + lubridate::hour(ret) <- floor(date_num_seconds/3600) + ret <- lubridate::force_tz(ret, tzone=tz) + } + if (any(mask_excel_leap_day_bug)) { + warning("NAs introduced by coercion, Excel leap day bug detected in `date_num`. 29 February 1900 does not exist.") + } + if (any(is.na(ret) & !is.na(date_num) & !mask_excel_leap_day_bug)) { + warning("NAs introduced by coercion, possible daylight savings time issue with input. Consider `tz='UTC'`.") } ret } diff --git a/man/excel_numeric_to_date.Rd b/man/excel_numeric_to_date.Rd index 9895c68e..01727bf7 100644 --- a/man/excel_numeric_to_date.Rd +++ b/man/excel_numeric_to_date.Rd @@ -9,7 +9,7 @@ excel_numeric_to_date( date_system = "modern", include_time = FALSE, round_seconds = TRUE, - tz = "" + tz = Sys.timezone() ) } \arguments{ diff --git a/tests/testthat/test-date-conversion.R b/tests/testthat/test-date-conversion.R index 9c1c630b..1cde5ee9 100644 --- a/tests/testthat/test-date-conversion.R +++ b/tests/testthat/test-date-conversion.R @@ -11,8 +11,8 @@ test_that("Serial number dates convert correctly", { test_that("Bad inputs handled appropriately", { expect_error(excel_numeric_to_date("hello"), "argument `date_num` must be of class numeric") - expect_error(excel_numeric_to_date(40908, "bad string"), "argument 'created' must be one of 'mac pre-2011' or 'modern'") - expect_error(excel_numeric_to_date(40908, 4), "argument 'created' must be one of 'mac pre-2011' or 'modern'") + expect_error(excel_numeric_to_date(40908, "bad string"), "argument 'date_system' must be one of 'mac pre-2011' or 'modern'") + expect_error(excel_numeric_to_date(40908, 4), "argument 'date_system' must be one of 'mac pre-2011' or 'modern'") }) test_that("time handling works correctly", { @@ -59,19 +59,29 @@ test_that("time handling at the edge of the next date works correctly", { }) test_that("excel_numeric_to_date handles NA", { - expect_equal(excel_numeric_to_date(NA), - as.Date(NA_character_), - info="Return NA output of the correct class (Date) for NA input.") - expect_equal(excel_numeric_to_date(NA, include_time=TRUE), - as.POSIXct(NA_character_), - info="Return NA output of the correct class (POSIXct) for NA input.") - expect_equal(excel_numeric_to_date(c(43088, NA)), - as.Date(floor(c(43088, NA)), origin = "1899-12-30"), - info="Return NA output as part of a vector of inputs correctly") - expect_equal(excel_numeric_to_date(c(43088, NA), include_time=TRUE), - structure(as.POSIXlt(as.Date(floor(c(43088, NA)), origin = "1899-12-30")), - tzone=NULL), - info="Return NA output as part of a vector of inputs correctly") + expect_equal( + excel_numeric_to_date(NA), + as.Date(NA_character_), + info="Return NA output of the correct class (Date) for NA input." + ) + expect_equal( + excel_numeric_to_date(NA, include_time=TRUE), + as.POSIXct(NA_character_, tz=Sys.timezone()), + info="Return NA output of the correct class (POSIXct) for NA input." + ) + expect_equal( + excel_numeric_to_date(c(43088, NA)), + as.Date(floor(c(43088, NA)), origin = "1899-12-30"), + info="Return NA output as part of a vector of inputs correctly" + ) + expect_equal( + excel_numeric_to_date(c(43088, NA), include_time=TRUE), + structure( + as.POSIXlt(as.Date(floor(c(43088, NA)), origin = "1899-12-30")), + tzone=NULL + ), + info="Return NA output as part of a vector of inputs correctly" + ) }) test_that("excel_numeric_to_date returns a POSIXct object when include_time is requested", { @@ -80,19 +90,52 @@ test_that("excel_numeric_to_date returns a POSIXct object when include_time is r }) test_that("time zone setting works", { - expect_equal(attr(excel_numeric_to_date(43001.11, include_time = TRUE), "tzone"), - "") # blank by default - expect_equal(attr(excel_numeric_to_date(43001.11, include_time = TRUE, tz = "America/New_York"), "tzone"), - "America/New_York") -# expect_warning(excel_numeric_to_date(43001.11, include_time = TRUE, tz = "nonsense"), -# "unknown timezone 'nonsense'") - # this test should be written: - # providing a bad timezone value defaults to blank tz value "" and throws warning - # Then delete prior test as it will fail - + expect_equal( + attr(excel_numeric_to_date(43001.11, include_time = TRUE), "tzone"), + Sys.timezone(), + info="Defaults to the local timezone" + ) + expect_equal( + attr(excel_numeric_to_date(43001.11, include_time = TRUE, tz = "America/New_York"), "tzone"), + "America/New_York" + ) + expect_equal( + attr(excel_numeric_to_date(43001.11, include_time = TRUE, tz = "Europe/Zurich"), "tzone"), + "Europe/Zurich" + ) + expect_error( + excel_numeric_to_date(43001.11, include_time = TRUE, tz = "nonsense"), + info="Invalid timezone gives an error" + ) }) test_that("integer Excel dates do not overflow (ref issue #241)", { expect_equal(excel_numeric_to_date(42370L), excel_numeric_to_date(42370)) }) + +test_that("daylight savings time handling (issue #420)", { + expect_equal( + expect_warning( + excel_numeric_to_date(43170.09, include_time=TRUE, tz="America/New_York"), + regexp="NAs introduced by coercion, possible daylight savings time issue with input. Consider `tz='UTC'`.", + fixed=TRUE + ), + as.POSIXct(NA_real_, tz="America/New_York", origin="1900-01-01") + ) + expect_equal( + excel_numeric_to_date(43170.09, include_time=TRUE, tz="UTC"), + as.POSIXct("2018-03-11 02:09:36", tz="UTC") + ) +}) + +test_that("Nonexistent 29 Feb 1900 exists in Excel but not in accurate calendars", { + expect_equal( + expect_warning( + excel_numeric_to_date(59:61), + regexp="NAs introduced by coercion, Excel leap day bug detected in `date_num`. 29 February 1900 does not exist.", + fixed=TRUE + ), + as.Date(c("1900-02-28", NA, "1900-03-01")) + ) +})