From 4b69c6cd696880a95a66ac1e72d633baeb60cc8f Mon Sep 17 00:00:00 2001 From: limnoliver Date: Tue, 25 Oct 2022 14:04:35 -0500 Subject: [PATCH 1/6] keep statistical base code to use later to find data that is already collapsed --- 1_wqp_pull/out/wqp_data.rds.ind | 2 +- 1_wqp_pull/src/get_wqp_data.R | 4 ++-- build/status/MV93cXBfcHVsbC9vdXQvd3FwX2RhdGEucmRzLmluZA.yml | 6 +++--- ...kYXRhX211bmdlL291dC93cXBfZGF0YV9zdHJlYW1zLnJkcy5pbmQ.yml | 6 +++--- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/1_wqp_pull/out/wqp_data.rds.ind b/1_wqp_pull/out/wqp_data.rds.ind index d7daaf9..cc80c30 100644 --- a/1_wqp_pull/out/wqp_data.rds.ind +++ b/1_wqp_pull/out/wqp_data.rds.ind @@ -1,2 +1,2 @@ -hash: fa556791765715562797e6c2dad855ee +hash: 80fac6326f338af72b0f1945905f8d72 diff --git a/1_wqp_pull/src/get_wqp_data.R b/1_wqp_pull/src/get_wqp_data.R index 52111be..c096b44 100644 --- a/1_wqp_pull/src/get_wqp_data.R +++ b/1_wqp_pull/src/get_wqp_data.R @@ -130,13 +130,13 @@ combine_wqp_dat <- function(ind_file, ...){ } dat_mod <- dat_mod %>% filter(!is.na(ResultMeasureValue)) %>% - select(MonitoringLocationIdentifier, ActivityMediaName, ActivityStartDate, `ActivityStartTime/Time`, `ActivityStartTime/TimeZoneCode`, + select(MonitoringLocationIdentifier, ActivityMediaName, ActivityStartDate, ActivityEndDate, `ActivityStartTime/Time`, `ActivityStartTime/TimeZoneCode`, `ActivityDepthHeightMeasure/MeasureValue`, `ActivityDepthHeightMeasure/MeasureUnitCode`, `ActivityTopDepthHeightMeasure/MeasureValue`,`ActivityTopDepthHeightMeasure/MeasureUnitCode`, `ActivityBottomDepthHeightMeasure/MeasureValue`, `ActivityBottomDepthHeightMeasure/MeasureUnitCode`, ActivityCommentText, `SampleCollectionMethod/MethodIdentifier`, `SampleCollectionMethod/MethodIdentifierContext`, `SampleCollectionMethod/MethodName`,CharacteristicName, ResultMeasureValue, - `ResultMeasure/MeasureUnitCode`, ResultValueTypeName, PrecisionValue, + `ResultMeasure/MeasureUnitCode`, ResultValueTypeName, PrecisionValue, 
StatisticalBaseCode, ResultCommentText, `ResultDepthHeightMeasure/MeasureValue`, `ResultDepthHeightMeasure/MeasureUnitCode`, ProviderName) %>% mutate(PrecisionValue = as.numeric(PrecisionValue)) %>% mutate_at(vars(contains('MeasureValue')), as.numeric) %>% diff --git a/build/status/MV93cXBfcHVsbC9vdXQvd3FwX2RhdGEucmRzLmluZA.yml b/build/status/MV93cXBfcHVsbC9vdXQvd3FwX2RhdGEucmRzLmluZA.yml index 76f7919..eae5e32 100644 --- a/build/status/MV93cXBfcHVsbC9vdXQvd3FwX2RhdGEucmRzLmluZA.yml +++ b/build/status/MV93cXBfcHVsbC9vdXQvd3FwX2RhdGEucmRzLmluZA.yml @@ -1,12 +1,12 @@ version: 0.3.0 name: 1_wqp_pull/out/wqp_data.rds.ind type: file -hash: 1a8dbb9cd4dfd070cbfc946511ca65d7 -time: 2022-03-29 18:28:28 UTC +hash: 3edeb04eb813118f8aa4a901d080b24b +time: 2022-10-19 15:31:10 UTC depends: wqp_pull_plan: 6f549ec3f2aa682ceace9c864015ff0a 1_wqp_pull_tasks.yml: dd5e68ad85ae31a47f12bd4bf52eeee7 - 1_wqp_pull/src/get_wqp_data.R: 65def7cf8815ff64e926482a9640b97f + 1_wqp_pull/src/get_wqp_data.R: b05dcdb365bbcd5a4e241f0c46076fac fixed: b103af1551106aa1ebfea5ec48fe44e0 code: functions: diff --git a/build/status/NV9kYXRhX211bmdlL291dC93cXBfZGF0YV9zdHJlYW1zLnJkcy5pbmQ.yml b/build/status/NV9kYXRhX211bmdlL291dC93cXBfZGF0YV9zdHJlYW1zLnJkcy5pbmQ.yml index 3c795a9..f0eaa20 100644 --- a/build/status/NV9kYXRhX211bmdlL291dC93cXBfZGF0YV9zdHJlYW1zLnJkcy5pbmQ.yml +++ b/build/status/NV9kYXRhX211bmdlL291dC93cXBfZGF0YV9zdHJlYW1zLnJkcy5pbmQ.yml @@ -1,10 +1,10 @@ version: 0.3.0 name: 5_data_munge/out/wqp_data_streams.rds.ind type: file -hash: 1091d4b0e14854dd8450a914273a0363 -time: 2022-03-31 20:30:18 UTC +hash: 72ff57a393533becbfb82fc278a8ff81 +time: 2022-10-24 04:21:10 UTC depends: - 1_wqp_pull/out/wqp_data.rds.ind: 1a8dbb9cd4dfd070cbfc946511ca65d7 + 1_wqp_pull/out/wqp_data.rds.ind: 3edeb04eb813118f8aa4a901d080b24b 1_wqp_pull/inout/wqp_inventory.feather.ind: 4d0042964407bcf7c2656a19e5c4bcd2 stream_types: ff830476c42cccbd885995ef9cb26b5c fixed: 987ddc70a46a2204bf0a037cfbaf2482 From 
6ecd1d3aa92eafc07ca33cd3b46239f73e1a13b1 Mon Sep 17 00:00:00 2001 From: limnoliver Date: Tue, 25 Oct 2022 15:29:51 -0500 Subject: [PATCH 2/6] a couple of changes: drops estimated and blank corrected values, keeps only min/mean/max statcodes and uses them appropriately, drops observations where activity start date does not equal end date (except where there is only one obs per site-date) or resolves date issues when there are collection dates in comments. Also does not calculate min and max when there is only a single observation. --- 5_data_munge.yml | 13 ++- 5_data_munge/out/daily_temperatures.rds.ind | 2 +- .../out/daily_temperatures_qaqc.rds.ind | 2 +- .../out/daily_temperatures_summary.csv | 2 +- .../out/flagged_temperature_summary.csv | 2 +- 5_data_munge/out/wqp_daily_depths.rds.ind | 2 +- 5_data_munge/out/wqp_daily_nodepths.rds.ind | 2 +- 5_data_munge/out/wqp_data_streams.rds.ind | 2 +- .../out/wqp_data_streams_datesres.rds.ind | 2 + 5_data_munge/src/munge_wqp_files.R | 86 +++++++++++++++++-- ...GF0YV9zdHJlYW1zX2RhdGVzcmVzLnJkcy5pbmQ.yml | 12 +++ ...291dC93cXBfZGFpbHlfZGVwdGhzLnJkcy5pbmQ.yml | 6 +- ...dC93cXBfZGFpbHlfbm9kZXB0aHMucmRzLmluZA.yml | 8 +- ...dC9kYWlseV90ZW1wZXJhdHVyZXMucmRzLmluZA.yml | 6 +- ...lseV90ZW1wZXJhdHVyZXNfcWFxYy5yZHMuaW5k.yml | 6 +- getters.yml | 3 + 16 files changed, 125 insertions(+), 31 deletions(-) create mode 100644 5_data_munge/out/wqp_data_streams_datesres.rds.ind create mode 100644 build/status/NV9kYXRhX211bmdlL291dC93cXBfZGF0YV9zdHJlYW1zX2RhdGVzcmVzLnJkcy5pbmQ.yml diff --git a/5_data_munge.yml b/5_data_munge.yml index d2f21b9..2c68a9d 100644 --- a/5_data_munge.yml +++ b/5_data_munge.yml @@ -44,15 +44,24 @@ targets: keep_types = stream_types, out_ind = target_name) + # resolve a few known issues regarding start/end dates + # drop those where start/end is different and there is more than one + # obs per site-date. Try to recover those obs where the collection + # date is in comments. 
+ 5_data_munge/out/wqp_data_streams_datesres.rds.ind: + command: resolve_statcodes( + in_ind = '5_data_munge/out/wqp_data_streams.rds.ind', + out_ind = target_name) + 5_data_munge/out/wqp_daily_depths.rds.ind: command: munge_wqp_withdepths( - in_ind = '5_data_munge/out/wqp_data_streams.rds.ind', + in_ind = '5_data_munge/out/wqp_data_streams_datesres.rds.ind', min_value, max_value, max_daily_range, out_ind = target_name) 5_data_munge/out/wqp_daily_nodepths.rds.ind: command: munge_wqp_withoutdepths( - in_ind = '5_data_munge/out/wqp_data_streams.rds.ind', + in_ind = '5_data_munge/out/wqp_data_streams_datesres.rds.ind', min_value, max_value, max_daily_range, out_ind = target_name) diff --git a/5_data_munge/out/daily_temperatures.rds.ind b/5_data_munge/out/daily_temperatures.rds.ind index 376f6c0..980834e 100644 --- a/5_data_munge/out/daily_temperatures.rds.ind +++ b/5_data_munge/out/daily_temperatures.rds.ind @@ -1,2 +1,2 @@ -hash: 747034b3bcd79a69f58df11a7b52ad07 +hash: 44f39f208344987935f33e9211634b6a diff --git a/5_data_munge/out/daily_temperatures_qaqc.rds.ind b/5_data_munge/out/daily_temperatures_qaqc.rds.ind index a5da657..a195320 100644 --- a/5_data_munge/out/daily_temperatures_qaqc.rds.ind +++ b/5_data_munge/out/daily_temperatures_qaqc.rds.ind @@ -1,2 +1,2 @@ -hash: 5f5d0a4a4eb977025698ebccba574bca +hash: 68d8e0426ceb8bc94a5ddbfbcfd24b84 diff --git a/5_data_munge/out/daily_temperatures_summary.csv b/5_data_munge/out/daily_temperatures_summary.csv index 7461c4b..53e402a 100644 --- a/5_data_munge/out/daily_temperatures_summary.csv +++ b/5_data_munge/out/daily_temperatures_summary.csv @@ -1,2 +1,2 @@ n_obs,n_sites -27368431,293513 +22706105,78846 diff --git a/5_data_munge/out/flagged_temperature_summary.csv b/5_data_munge/out/flagged_temperature_summary.csv index b1adb4a..cac4578 100644 --- a/5_data_munge/out/flagged_temperature_summary.csv +++ b/5_data_munge/out/flagged_temperature_summary.csv @@ -1,2 +1,2 @@ 
n_flagged_obs,perc_flagged_obs,n_flagged_sites,perc_flagged_sites -1061259,4.1,87062,29.7 +819889,3.9,30944,39.3 diff --git a/5_data_munge/out/wqp_daily_depths.rds.ind b/5_data_munge/out/wqp_daily_depths.rds.ind index ea61d60..5efef61 100644 --- a/5_data_munge/out/wqp_daily_depths.rds.ind +++ b/5_data_munge/out/wqp_daily_depths.rds.ind @@ -1,2 +1,2 @@ -hash: 7ff8028fb0c2115a5371684c17bd6927 +hash: bfe822528a7c57aa65e880a3bafd6c9a diff --git a/5_data_munge/out/wqp_daily_nodepths.rds.ind b/5_data_munge/out/wqp_daily_nodepths.rds.ind index 79d4fac..605f37e 100644 --- a/5_data_munge/out/wqp_daily_nodepths.rds.ind +++ b/5_data_munge/out/wqp_daily_nodepths.rds.ind @@ -1,2 +1,2 @@ -hash: db18e66f40e06e34622a764d7f3ca9c6 +hash: 76be43ab961077a911f9406ceee6df12 diff --git a/5_data_munge/out/wqp_data_streams.rds.ind b/5_data_munge/out/wqp_data_streams.rds.ind index 92ab68e..f9e4bb8 100644 --- a/5_data_munge/out/wqp_data_streams.rds.ind +++ b/5_data_munge/out/wqp_data_streams.rds.ind @@ -1,2 +1,2 @@ -hash: e1fc3f9ca8c625e78dafa54fa4736432 +hash: 9a6b397af80917e46559b5a871e8e602 diff --git a/5_data_munge/out/wqp_data_streams_datesres.rds.ind b/5_data_munge/out/wqp_data_streams_datesres.rds.ind new file mode 100644 index 0000000..6348c7d --- /dev/null +++ b/5_data_munge/out/wqp_data_streams_datesres.rds.ind @@ -0,0 +1,2 @@ +hash: 149949ca40b06eef57f929114680149e + diff --git a/5_data_munge/src/munge_wqp_files.R b/5_data_munge/src/munge_wqp_files.R index 8d2efde..e475bfe 100644 --- a/5_data_munge/src/munge_wqp_files.R +++ b/5_data_munge/src/munge_wqp_files.R @@ -73,7 +73,7 @@ munge_wqp_withdepths <- function(in_ind, min_value, max_value, max_daily_range, munge_wqp_withoutdepths <- function(in_ind, min_value, max_value, max_daily_range, out_ind) { dat <- readRDS(sc_retrieve(in_ind, remake_file = 'getters.yml')) %>% - select(MonitoringLocationIdentifier, ActivityStartDate, ResultMeasureValue, + select(MonitoringLocationIdentifier, ActivityStartDate, ResultMeasureValue, 
StatisticalBaseCode, `ResultMeasure/MeasureUnitCode`, `ActivityStartTime/Time`, ActivityMediaName, `ActivityDepthHeightMeasure/MeasureValue`, `ResultDepthHeightMeasure/MeasureValue`, `ActivityTopDepthHeightMeasure/MeasureValue`, `ActivityBottomDepthHeightMeasure/MeasureValue`) %>% @@ -93,15 +93,32 @@ munge_wqp_withoutdepths <- function(in_ind, min_value, max_value, max_daily_rang f_to_c(ResultMeasureValue), ResultMeasureValue)) %>% mutate(`ResultMeasure/MeasureUnitCode` = 'deg C') - dat_daily <- group_by(dat_reduced, MonitoringLocationIdentifier, ActivityStartDate) %>% - summarize(temperature_mean_daily = mean(ResultMeasureValue), - temperature_min_daily = min(ResultMeasureValue), - temperature_max_daily = max(ResultMeasureValue), - n_obs = n(), + dat_reduced_statcode <- ungroup(dat_reduced) %>% + filter(!is.na(StatisticalBaseCode)) %>% + group_by(MonitoringLocationIdentifier, ActivityStartDate) %>% + mutate(temperature_mean_daily = ifelse(grepl('mean', StatisticalBaseCode, ignore.case = TRUE), ResultMeasureValue, NA), + temperature_min_daily = ifelse(grepl('min', StatisticalBaseCode, ignore.case = TRUE), ResultMeasureValue, NA), + temperature_max_daily = ifelse(grepl('max', StatisticalBaseCode, ignore.case = TRUE), ResultMeasureValue, NA)) %>% + mutate(n_obs = NA) %>% + group_by(MonitoringLocationIdentifier, ActivityStartDate) %>% + summarize(across(c(temperature_mean_daily, temperature_min_daily, temperature_max_daily, n_obs) , ~ first(na.omit(.)))) + + # we don't know the number of observations here because stat codes were used + + + dat_daily <- ungroup(dat_reduced) %>% + filter(is.na(StatisticalBaseCode)) %>% + group_by(MonitoringLocationIdentifier, ActivityStartDate) %>% + summarize(n_obs = n(), + temperature_mean_daily = mean(ResultMeasureValue), + # we don't want to propagate min and max if there is only one value + temperature_min_daily = ifelse(n_obs>1, min(ResultMeasureValue), NA), + temperature_max_daily = ifelse(n_obs>1, max(ResultMeasureValue), NA), 
time = ifelse(n_obs == 1, `ActivityStartTime/Time`, NA)) %>% - filter(temperature_mean_daily > min_value & temperature_mean_daily < max_value, - temperature_min_daily > min_value & temperature_min_daily < max_value, - temperature_max_daily > min_value & temperature_max_daily < max_value) + bind_rows(dat_reduced_statcode) %>% + filter(temperature_mean_daily > min_value & temperature_mean_daily < max_value|is.na(temperature_mean_daily), + temperature_min_daily > min_value & temperature_min_daily < max_value|is.na(temperature_min_daily), + temperature_max_daily > min_value & temperature_max_daily < max_value|is.na(temperature_max_daily)) # save data_file <- scipiper::as_data_file(out_ind) @@ -109,3 +126,54 @@ munge_wqp_withoutdepths <- function(in_ind, min_value, max_value, max_daily_rang gd_put(out_ind) } + +resolve_statcodes <- function(in_ind, out_ind) { + dat <- readRDS(sc_retrieve(in_ind, remake_file = 'getters.yml')) %>% + # drop values that are estimated or blank-corrected + # drop values that are not min, mean, max + filter(!ResultValueTypeName %in% c('Estimated', 'Blank Corrected Calc')) %>% + filter(StatisticalBaseCode %in% c(NA, 'Mean', 'Minimum', 'Maximum', 'mean', + 'Geometric Mean', 'Daily Maximum', 'Daily Minimum', + 'Daily Geometric Mean')) + + + # for some data, the start and end dates are different, and data providers + # seem to be using these as a date range of the whole dataset + # sometimes, the proper collection date is in the comment field + # we're dropping data that has a StatisticalBaseCode because we don't want + # values averaged over multiple days + range_dates <- filter(dat, ActivityStartDate != ActivityEndDate) %>% + filter(is.na(StatisticalBaseCode)) %>% + filter(grepl('Collected', ResultCommentText)) %>% + mutate(newActivityStartDate = gsub('(Collected: )(.*\\d{4})(\\s*\\d+.*)', '\\2', ResultCommentText, perl = TRUE), + newActivityStartTime = gsub('(Collected: .*\\d{4}\\s*)(\\d+.*)', '\\2', ResultCommentText, perl = TRUE)) %>% + 
mutate(newActivityStartTime = format(strptime(newActivityStartTime, format = '%I:%M %p'), '%H:%M:%S'), + newActivityStartDate = as.Date(newActivityStartDate, format = "%b %d %Y")) %>% + select(-ActivityStartDate, -`ActivityStartTime/Time`) %>% + rename(ActivityStartDate = newActivityStartDate, + `ActivityStartTime/Time` = newActivityStartTime) + + # those that don't have collected in the comments + other <- filter(dat, ActivityStartDate != ActivityEndDate) %>% + filter(is.na(StatisticalBaseCode)) %>% + filter(!grepl('Collected', ResultCommentText)) + + # look at the number of obs per day per site-date to weed out bad sites + # only keep sites that have one obs per day + # site dates with > 1440 obs (which is 1obs/minute) was most of the drops here, + # so we're confident we're dropping bad data + keep_onesiteday <- other %>% + group_by(MonitoringLocationIdentifier, ActivityStartDate) %>% + mutate(n = n()) %>% + filter(n == 1) %>% + select(-n) %>% ungroup() + + out <- filter(dat, ActivityStartDate == ActivityEndDate) %>% # keep all data where start/end dates are same + bind_rows(range_dates) %>% # keep all data where we fixed the dates from the comments + bind_rows(keep_onesiteday) # keep all data where the start/end was different but there was only one value per site-date + + + data_file <- scipiper::as_data_file(out_ind) + saveRDS(out, data_file) + gd_put(out_ind) +} diff --git a/build/status/NV9kYXRhX211bmdlL291dC93cXBfZGF0YV9zdHJlYW1zX2RhdGVzcmVzLnJkcy5pbmQ.yml b/build/status/NV9kYXRhX211bmdlL291dC93cXBfZGF0YV9zdHJlYW1zX2RhdGVzcmVzLnJkcy5pbmQ.yml new file mode 100644 index 0000000..9cc6754 --- /dev/null +++ b/build/status/NV9kYXRhX211bmdlL291dC93cXBfZGF0YV9zdHJlYW1zX2RhdGVzcmVzLnJkcy5pbmQ.yml @@ -0,0 +1,12 @@ +version: 0.3.0 +name: 5_data_munge/out/wqp_data_streams_datesres.rds.ind +type: file +hash: a3e8b99d7e975a1c1812e58f6000c4fd +time: 2022-10-25 12:18:56 UTC +depends: + 5_data_munge/out/wqp_data_streams.rds.ind: 72ff57a393533becbfb82fc278a8ff81 +fixed: 
ad14d31fed2ff9f2c6960bfbae40e1d5 +code: + functions: + resolve_statcodes: a8688cec2fda35f2ab4a155c8e88bc7c + diff --git a/build/status/NV9kYXRhX211bmdlL291dC93cXBfZGFpbHlfZGVwdGhzLnJkcy5pbmQ.yml b/build/status/NV9kYXRhX211bmdlL291dC93cXBfZGFpbHlfZGVwdGhzLnJkcy5pbmQ.yml index 43d531b..ae0bdba 100644 --- a/build/status/NV9kYXRhX211bmdlL291dC93cXBfZGFpbHlfZGVwdGhzLnJkcy5pbmQ.yml +++ b/build/status/NV9kYXRhX211bmdlL291dC93cXBfZGFpbHlfZGVwdGhzLnJkcy5pbmQ.yml @@ -1,10 +1,10 @@ version: 0.3.0 name: 5_data_munge/out/wqp_daily_depths.rds.ind type: file -hash: 696fcc6e752c73008bb62bace2cdac56 -time: 2022-03-31 20:53:49 UTC +hash: 62da8cd8a92119b6236155a1b38bacb8 +time: 2022-10-25 15:28:49 UTC depends: - 5_data_munge/out/wqp_data_streams.rds.ind: 1091d4b0e14854dd8450a914273a0363 + 5_data_munge/out/wqp_data_streams_datesres.rds.ind: a3e8b99d7e975a1c1812e58f6000c4fd min_value: feee3efa1c3c99f6ae54f943171a7d14 max_value: 8550904a2a45c13015a45b21ef719fb7 max_daily_range: afe39393dd5be2c095cd37fede5c5b89 diff --git a/build/status/NV9kYXRhX211bmdlL291dC93cXBfZGFpbHlfbm9kZXB0aHMucmRzLmluZA.yml b/build/status/NV9kYXRhX211bmdlL291dC93cXBfZGFpbHlfbm9kZXB0aHMucmRzLmluZA.yml index 7e19e0d..6aae9a7 100644 --- a/build/status/NV9kYXRhX211bmdlL291dC93cXBfZGFpbHlfbm9kZXB0aHMucmRzLmluZA.yml +++ b/build/status/NV9kYXRhX211bmdlL291dC93cXBfZGFpbHlfbm9kZXB0aHMucmRzLmluZA.yml @@ -1,10 +1,10 @@ version: 0.3.0 name: 5_data_munge/out/wqp_daily_nodepths.rds.ind type: file -hash: d6adcb94d2168ceb7d1977635bdebbed -time: 2022-04-25 01:23:35 UTC +hash: 00a061b7f1c3e89deb5f9c9f4cd03b1f +time: 2022-10-25 19:29:29 UTC depends: - 5_data_munge/out/wqp_data_streams.rds.ind: 1091d4b0e14854dd8450a914273a0363 + 5_data_munge/out/wqp_data_streams_datesres.rds.ind: a3e8b99d7e975a1c1812e58f6000c4fd min_value: feee3efa1c3c99f6ae54f943171a7d14 max_value: 8550904a2a45c13015a45b21ef719fb7 max_daily_range: afe39393dd5be2c095cd37fede5c5b89 @@ -12,5 +12,5 @@ fixed: c8da49989b1a3d05cbbfd15be13cfef9 code: functions: f_to_c: 
a61909ceabd1801b610840dc86315ffe - munge_wqp_withoutdepths: 4f675338f8fd61f29d42cd02819ebaab + munge_wqp_withoutdepths: 5e779d4adf6c955c7431d9f00c7e8b2b diff --git a/build/status/NV9kYXRhX211bmdlL291dC9kYWlseV90ZW1wZXJhdHVyZXMucmRzLmluZA.yml b/build/status/NV9kYXRhX211bmdlL291dC9kYWlseV90ZW1wZXJhdHVyZXMucmRzLmluZA.yml index d79cd03..bcfa7e0 100644 --- a/build/status/NV9kYXRhX211bmdlL291dC9kYWlseV90ZW1wZXJhdHVyZXMucmRzLmluZA.yml +++ b/build/status/NV9kYXRhX211bmdlL291dC9kYWlseV90ZW1wZXJhdHVyZXMucmRzLmluZA.yml @@ -1,11 +1,11 @@ version: 0.3.0 name: 5_data_munge/out/daily_temperatures.rds.ind type: file -hash: 8391327cc8c907cbf3b58133509ce816 -time: 2022-04-25 01:34:39 UTC +hash: e8cddbd995f4c8a3fffe5d3ceca371b4 +time: 2022-10-25 19:35:00 UTC depends: 5_data_munge/out/nwis_daily.rds.ind: 3396bbdded05965d18f905f5cc24ff65 - 5_data_munge/out/wqp_daily_nodepths.rds.ind: d6adcb94d2168ceb7d1977635bdebbed + 5_data_munge/out/wqp_daily_nodepths.rds.ind: 00a061b7f1c3e89deb5f9c9f4cd03b1f 5_data_munge/out/ecosheds_munged.rds.ind: 0b44b5d6c2089809ea92a8917da8450e 5_data_munge/out/norwest_munged.rds.ind: 9d1118944011c968be2afa00add16553 wqp_pull_date: 1703d8ed940ea9516622d02d7d072ff6 diff --git a/build/status/NV9kYXRhX211bmdlL291dC9kYWlseV90ZW1wZXJhdHVyZXNfcWFxYy5yZHMuaW5k.yml b/build/status/NV9kYXRhX211bmdlL291dC9kYWlseV90ZW1wZXJhdHVyZXNfcWFxYy5yZHMuaW5k.yml index 9f722ce..be5863b 100644 --- a/build/status/NV9kYXRhX211bmdlL291dC9kYWlseV90ZW1wZXJhdHVyZXNfcWFxYy5yZHMuaW5k.yml +++ b/build/status/NV9kYXRhX211bmdlL291dC9kYWlseV90ZW1wZXJhdHVyZXNfcWFxYy5yZHMuaW5k.yml @@ -1,10 +1,10 @@ version: 0.3.0 name: 5_data_munge/out/daily_temperatures_qaqc.rds.ind type: file -hash: 26e19a7dd06b10ff39ce1311f139559e -time: 2022-04-25 14:05:20 UTC +hash: 2aecf60e102d516495e968133fde7e8a +time: 2022-10-25 19:50:37 UTC depends: - 5_data_munge/out/daily_temperatures.rds.ind: 8391327cc8c907cbf3b58133509ce816 + 5_data_munge/out/daily_temperatures.rds.ind: e8cddbd995f4c8a3fffe5d3ceca371b4 
5_data_munge/out/stream_sites_us.rds.ind: 822cd3a5e059f1e3a1e0b110e74af8cc fixed: 4eed73f624bb922908b23ee8e6e8dbc3 code: diff --git a/getters.yml b/getters.yml index 763e5b0..bbcb6f5 100644 --- a/getters.yml +++ b/getters.yml @@ -67,6 +67,9 @@ targets: 5_data_munge/out/wqp_data_streams.rds: command: gd_get('5_data_munge/out/wqp_data_streams.rds.ind') + 5_data_munge/out/wqp_data_streams_datesres.rds: + command: gd_get('5_data_munge/out/wqp_data_streams_datesres.rds.ind') + 5_data_munge/out/ecosheds_munged.rds: command: gd_get('5_data_munge/out/ecosheds_munged.rds.ind') From 2c8e5e781d04703d83400504499541d138a09ebf Mon Sep 17 00:00:00 2001 From: limnoliver Date: Wed, 26 Oct 2022 09:58:10 -0500 Subject: [PATCH 3/6] fix some missing values that were dropping due to NA end dates --- 5_data_munge/src/munge_wqp_files.R | 42 +++++++++++++++++++++++++++--- 1 file changed, 39 insertions(+), 3 deletions(-) diff --git a/5_data_munge/src/munge_wqp_files.R b/5_data_munge/src/munge_wqp_files.R index e475bfe..fdacab5 100644 --- a/5_data_munge/src/munge_wqp_files.R +++ b/5_data_munge/src/munge_wqp_files.R @@ -129,6 +129,7 @@ munge_wqp_withoutdepths <- function(in_ind, min_value, max_value, max_daily_rang resolve_statcodes <- function(in_ind, out_ind) { dat <- readRDS(sc_retrieve(in_ind, remake_file = 'getters.yml')) %>% + ungroup() %>% # drop values that are estimated or blank-corrected # drop values that are not min, mean, max filter(!ResultValueTypeName %in% c('Estimated', 'Blank Corrected Calc')) %>% @@ -137,12 +138,16 @@ resolve_statcodes <- function(in_ind, out_ind) { 'Daily Geometric Mean')) + # print message that says how many observations we lost when dropped + nrow_o <- nrow(readRDS(sc_retrieve(in_ind, remake_file = 'getters.yml'))) + message(paste(nrow_o - nrow(dat), 'observations were dropped due to estimation, blank correction, or statcode that was not mean, min, max')) # for some data, the start and end dates are different, and data providers # seem to be using these 
as a date range of the whole dataset # sometimes, the proper collection date is in the comment field # we're dropping data that has a StatisticalBaseCode because we don't want # values averaged over multiple days - range_dates <- filter(dat, ActivityStartDate != ActivityEndDate) %>% + range_dates <- filter(dat, !is.na(ActivityEndDate)) %>% + filter(ActivityStartDate != ActivityEndDate) %>% filter(is.na(StatisticalBaseCode)) %>% filter(grepl('Collected', ResultCommentText)) %>% mutate(newActivityStartDate = gsub('(Collected: )(.*\\d{4})(\\s*\\d+.*)', '\\2', ResultCommentText, perl = TRUE), @@ -153,8 +158,12 @@ resolve_statcodes <- function(in_ind, out_ind) { rename(ActivityStartDate = newActivityStartDate, `ActivityStartTime/Time` = newActivityStartTime) + # print message about date recoveries + message(paste(nrow(range_dates), 'observations with mismatching start/end dates were recovered by extracting collection dates from comments')) + # those that don't have collected in the comments - other <- filter(dat, ActivityStartDate != ActivityEndDate) %>% + other <- filter(dat, !is.na(ActivityEndDate)) %>% + filter(ActivityStartDate != ActivityEndDate) %>% filter(is.na(StatisticalBaseCode)) %>% filter(!grepl('Collected', ResultCommentText)) @@ -168,10 +177,37 @@ resolve_statcodes <- function(in_ind, out_ind) { filter(n == 1) %>% select(-n) %>% ungroup() - out <- filter(dat, ActivityStartDate == ActivityEndDate) %>% # keep all data where start/end dates are same + + + drop <- other %>% + group_by(MonitoringLocationIdentifier, ActivityStartDate) %>% + summarize(n = n()) %>% + filter(n > 1) + + widnr <- filter(drop, grepl('WIDNR', MonitoringLocationIdentifier)) + + message(paste(nrow(drop), 'site-dates and', sum(drop$n), + 'raw observations were dropped because n>1 obs per site-date and start-end dates did not match.', + 'WIDNR was responsible for', nrow(widnr), 'site-dates and', sum(widnr$n), 'raw observations.')) + + statdiffdates <- filter(dat, !is.na(ActivityEndDate)) 
%>% + filter(ActivityStartDate != ActivityEndDate) %>% + filter(!is.na(StatisticalBaseCode)) + + message(length(unique(paste(statdiffdates$MonitoringLocationIdentifier, statdiffdates$ActivityStartDate))), + ' site-dates and ', nrow(statdiffdates), ' raw observations were dropped because the observation had a stat code but different start/end dates') + + + out <- filter(dat, ActivityStartDate == ActivityEndDate | !is.na(ActivityEndDate)) %>% # keep all data where start/end dates are same bind_rows(range_dates) %>% # keep all data where we fixed the dates from the comments bind_rows(keep_onesiteday) # keep all data where the start/end was different but there was only one value per site-date + perc_keep <- round((nrow(dat) - nrow(out))/nrow(dat)*100, 1) + raw_dropped <- nrow(dat) - nrow(out) + + + message(paste(perc_keep, 'percent of observations were kept...or', + raw_dropped, 'raw observations were dropped due to mismatch start/end dates')) data_file <- scipiper::as_data_file(out_ind) saveRDS(out, data_file) From 3ed9c4cffbb929764a65d8b3bdccbc00efd7a64a Mon Sep 17 00:00:00 2001 From: limnoliver Date: Wed, 26 Oct 2022 14:03:51 -0500 Subject: [PATCH 4/6] fixed return data to keep instances where ActivityEndDate is NA. 
--- 5_data_munge/out/daily_temperatures.rds.ind | 2 +- 5_data_munge/out/daily_temperatures_qaqc.rds.ind | 2 +- 5_data_munge/out/daily_temperatures_summary.csv | 2 +- 5_data_munge/out/flagged_temperature_summary.csv | 2 +- 5_data_munge/out/wqp_daily_nodepths.rds.ind | 2 +- 5_data_munge/out/wqp_data_streams_datesres.rds.ind | 2 +- 5_data_munge/src/munge_wqp_files.R | 4 ++-- ...lL291dC93cXBfZGF0YV9zdHJlYW1zX2RhdGVzcmVzLnJkcy5pbmQ.yml | 6 +++--- ...RhX211bmdlL291dC93cXBfZGFpbHlfbm9kZXB0aHMucmRzLmluZA.yml | 6 +++--- ...RhX211bmdlL291dC9kYWlseV90ZW1wZXJhdHVyZXMucmRzLmluZA.yml | 6 +++--- ...bmdlL291dC9kYWlseV90ZW1wZXJhdHVyZXNfcWFxYy5yZHMuaW5k.yml | 6 +++--- 11 files changed, 20 insertions(+), 20 deletions(-) diff --git a/5_data_munge/out/daily_temperatures.rds.ind b/5_data_munge/out/daily_temperatures.rds.ind index 980834e..a97d617 100644 --- a/5_data_munge/out/daily_temperatures.rds.ind +++ b/5_data_munge/out/daily_temperatures.rds.ind @@ -1,2 +1,2 @@ -hash: 44f39f208344987935f33e9211634b6a +hash: 49d2079760cc47874ff0d787278185fc diff --git a/5_data_munge/out/daily_temperatures_qaqc.rds.ind b/5_data_munge/out/daily_temperatures_qaqc.rds.ind index a195320..32b226b 100644 --- a/5_data_munge/out/daily_temperatures_qaqc.rds.ind +++ b/5_data_munge/out/daily_temperatures_qaqc.rds.ind @@ -1,2 +1,2 @@ -hash: 68d8e0426ceb8bc94a5ddbfbcfd24b84 +hash: 5a33a2e9b29857a90970a75036c314db diff --git a/5_data_munge/out/daily_temperatures_summary.csv b/5_data_munge/out/daily_temperatures_summary.csv index 53e402a..c504098 100644 --- a/5_data_munge/out/daily_temperatures_summary.csv +++ b/5_data_munge/out/daily_temperatures_summary.csv @@ -1,2 +1,2 @@ n_obs,n_sites -22706105,78846 +27323646,292462 diff --git a/5_data_munge/out/flagged_temperature_summary.csv b/5_data_munge/out/flagged_temperature_summary.csv index cac4578..738687b 100644 --- a/5_data_munge/out/flagged_temperature_summary.csv +++ b/5_data_munge/out/flagged_temperature_summary.csv @@ -1,2 +1,2 @@ 
n_flagged_obs,perc_flagged_obs,n_flagged_sites,perc_flagged_sites -819889,3.9,30944,39.3 +1058664,4.1,86984,29.9 diff --git a/5_data_munge/out/wqp_daily_nodepths.rds.ind b/5_data_munge/out/wqp_daily_nodepths.rds.ind index 605f37e..96ec2a2 100644 --- a/5_data_munge/out/wqp_daily_nodepths.rds.ind +++ b/5_data_munge/out/wqp_daily_nodepths.rds.ind @@ -1,2 +1,2 @@ -hash: 76be43ab961077a911f9406ceee6df12 +hash: 64eb9b1bce06bfb7d2d21758b3dcd8d6 diff --git a/5_data_munge/out/wqp_data_streams_datesres.rds.ind b/5_data_munge/out/wqp_data_streams_datesres.rds.ind index 6348c7d..311aa27 100644 --- a/5_data_munge/out/wqp_data_streams_datesres.rds.ind +++ b/5_data_munge/out/wqp_data_streams_datesres.rds.ind @@ -1,2 +1,2 @@ -hash: 149949ca40b06eef57f929114680149e +hash: ec5947853df0c31d2eb86599d5acc337 diff --git a/5_data_munge/src/munge_wqp_files.R b/5_data_munge/src/munge_wqp_files.R index fdacab5..cd3d241 100644 --- a/5_data_munge/src/munge_wqp_files.R +++ b/5_data_munge/src/munge_wqp_files.R @@ -198,7 +198,7 @@ resolve_statcodes <- function(in_ind, out_ind) { ' site-dates and ', nrow(statdiffdates), ' raw observations were dropped because the observation had a stat code but different start/end dates') - out <- filter(dat, ActivityStartDate == ActivityEndDate | !is.na(ActivityEndDate)) %>% # keep all data where start/end dates are same + out <- filter(dat, ActivityStartDate == ActivityEndDate | is.na(ActivityEndDate)) %>% # keep all data where start/end dates are same bind_rows(range_dates) %>% # keep all data where we fixed the dates from the comments bind_rows(keep_onesiteday) # keep all data where the start/end was different but there was only one value per site-date @@ -206,7 +206,7 @@ resolve_statcodes <- function(in_ind, out_ind) { raw_dropped <- nrow(dat) - nrow(out) - message(paste(perc_keep, 'percent of observations were kept...or', + message(paste(perc_keep, 'percent of observations were dropped...or', raw_dropped, 'raw observations were dropped due to mismatch 
start/end dates')) data_file <- scipiper::as_data_file(out_ind) diff --git a/build/status/NV9kYXRhX211bmdlL291dC93cXBfZGF0YV9zdHJlYW1zX2RhdGVzcmVzLnJkcy5pbmQ.yml b/build/status/NV9kYXRhX211bmdlL291dC93cXBfZGF0YV9zdHJlYW1zX2RhdGVzcmVzLnJkcy5pbmQ.yml index 9cc6754..d6c331f 100644 --- a/build/status/NV9kYXRhX211bmdlL291dC93cXBfZGF0YV9zdHJlYW1zX2RhdGVzcmVzLnJkcy5pbmQ.yml +++ b/build/status/NV9kYXRhX211bmdlL291dC93cXBfZGF0YV9zdHJlYW1zX2RhdGVzcmVzLnJkcy5pbmQ.yml @@ -1,12 +1,12 @@ version: 0.3.0 name: 5_data_munge/out/wqp_data_streams_datesres.rds.ind type: file -hash: a3e8b99d7e975a1c1812e58f6000c4fd -time: 2022-10-25 12:18:56 UTC +hash: aaa79d1a4e7bacbc0991767599a4e996 +time: 2022-10-26 16:32:03 UTC depends: 5_data_munge/out/wqp_data_streams.rds.ind: 72ff57a393533becbfb82fc278a8ff81 fixed: ad14d31fed2ff9f2c6960bfbae40e1d5 code: functions: - resolve_statcodes: a8688cec2fda35f2ab4a155c8e88bc7c + resolve_statcodes: 934a4c614c42bcd96cac58e87b576c35 diff --git a/build/status/NV9kYXRhX211bmdlL291dC93cXBfZGFpbHlfbm9kZXB0aHMucmRzLmluZA.yml b/build/status/NV9kYXRhX211bmdlL291dC93cXBfZGFpbHlfbm9kZXB0aHMucmRzLmluZA.yml index 6aae9a7..41474fc 100644 --- a/build/status/NV9kYXRhX211bmdlL291dC93cXBfZGFpbHlfbm9kZXB0aHMucmRzLmluZA.yml +++ b/build/status/NV9kYXRhX211bmdlL291dC93cXBfZGFpbHlfbm9kZXB0aHMucmRzLmluZA.yml @@ -1,10 +1,10 @@ version: 0.3.0 name: 5_data_munge/out/wqp_daily_nodepths.rds.ind type: file -hash: 00a061b7f1c3e89deb5f9c9f4cd03b1f -time: 2022-10-25 19:29:29 UTC +hash: a1e354106c78cbf8e0b185f08137bac2 +time: 2022-10-26 16:44:32 UTC depends: - 5_data_munge/out/wqp_data_streams_datesres.rds.ind: a3e8b99d7e975a1c1812e58f6000c4fd + 5_data_munge/out/wqp_data_streams_datesres.rds.ind: aaa79d1a4e7bacbc0991767599a4e996 min_value: feee3efa1c3c99f6ae54f943171a7d14 max_value: 8550904a2a45c13015a45b21ef719fb7 max_daily_range: afe39393dd5be2c095cd37fede5c5b89 diff --git a/build/status/NV9kYXRhX211bmdlL291dC9kYWlseV90ZW1wZXJhdHVyZXMucmRzLmluZA.yml 
b/build/status/NV9kYXRhX211bmdlL291dC9kYWlseV90ZW1wZXJhdHVyZXMucmRzLmluZA.yml index bcfa7e0..4d0e7ae 100644 --- a/build/status/NV9kYXRhX211bmdlL291dC9kYWlseV90ZW1wZXJhdHVyZXMucmRzLmluZA.yml +++ b/build/status/NV9kYXRhX211bmdlL291dC9kYWlseV90ZW1wZXJhdHVyZXMucmRzLmluZA.yml @@ -1,11 +1,11 @@ version: 0.3.0 name: 5_data_munge/out/daily_temperatures.rds.ind type: file -hash: e8cddbd995f4c8a3fffe5d3ceca371b4 -time: 2022-10-25 19:35:00 UTC +hash: 390b3616eec395ff28fcbd7ce867aec2 +time: 2022-10-26 16:50:24 UTC depends: 5_data_munge/out/nwis_daily.rds.ind: 3396bbdded05965d18f905f5cc24ff65 - 5_data_munge/out/wqp_daily_nodepths.rds.ind: 00a061b7f1c3e89deb5f9c9f4cd03b1f + 5_data_munge/out/wqp_daily_nodepths.rds.ind: a1e354106c78cbf8e0b185f08137bac2 5_data_munge/out/ecosheds_munged.rds.ind: 0b44b5d6c2089809ea92a8917da8450e 5_data_munge/out/norwest_munged.rds.ind: 9d1118944011c968be2afa00add16553 wqp_pull_date: 1703d8ed940ea9516622d02d7d072ff6 diff --git a/build/status/NV9kYXRhX211bmdlL291dC9kYWlseV90ZW1wZXJhdHVyZXNfcWFxYy5yZHMuaW5k.yml b/build/status/NV9kYXRhX211bmdlL291dC9kYWlseV90ZW1wZXJhdHVyZXNfcWFxYy5yZHMuaW5k.yml index be5863b..b45fa19 100644 --- a/build/status/NV9kYXRhX211bmdlL291dC9kYWlseV90ZW1wZXJhdHVyZXNfcWFxYy5yZHMuaW5k.yml +++ b/build/status/NV9kYXRhX211bmdlL291dC9kYWlseV90ZW1wZXJhdHVyZXNfcWFxYy5yZHMuaW5k.yml @@ -1,10 +1,10 @@ version: 0.3.0 name: 5_data_munge/out/daily_temperatures_qaqc.rds.ind type: file -hash: 2aecf60e102d516495e968133fde7e8a -time: 2022-10-25 19:50:37 UTC +hash: f27d7e0d8a492b4515ed318199b79848 +time: 2022-10-26 17:11:00 UTC depends: - 5_data_munge/out/daily_temperatures.rds.ind: e8cddbd995f4c8a3fffe5d3ceca371b4 + 5_data_munge/out/daily_temperatures.rds.ind: 390b3616eec395ff28fcbd7ce867aec2 5_data_munge/out/stream_sites_us.rds.ind: 822cd3a5e059f1e3a1e0b110e74af8cc fixed: 4eed73f624bb922908b23ee8e6e8dbc3 code: From a360d90757bb6ffe4674900847a8de558e1d5b0a Mon Sep 17 00:00:00 2001 From: limnoliver Date: Thu, 16 Mar 2023 11:28:35 -0500 Subject: 
[PATCH 5/6] rebuild site to GF crosswalk --- 6_network/out/site_flowlines.rds.ind | 2 +- 6_network/out/site_stream_crosswalk.rds.ind | 2 +- ...b3JrL291dC9zaXRlX3N0cmVhbV9jcm9zc3dhbGsucmRzLmluZA.yml | 8 ++++---- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/6_network/out/site_flowlines.rds.ind b/6_network/out/site_flowlines.rds.ind index 724b504..6a0b75c 100644 --- a/6_network/out/site_flowlines.rds.ind +++ b/6_network/out/site_flowlines.rds.ind @@ -1,2 +1,2 @@ -hash: 393bbb8751a97f632580c62efd627e97 +hash: 6dc4731a52161e291ac13e3d1419ee41 diff --git a/6_network/out/site_stream_crosswalk.rds.ind b/6_network/out/site_stream_crosswalk.rds.ind index 7bfd8f0..2fca247 100644 --- a/6_network/out/site_stream_crosswalk.rds.ind +++ b/6_network/out/site_stream_crosswalk.rds.ind @@ -1,2 +1,2 @@ -hash: 53a04e06755af97258c7d7848a1ee5de +hash: 9b22cff7abde66fe97d81b8cee861a32 diff --git a/build/status/Nl9uZXR3b3JrL291dC9zaXRlX3N0cmVhbV9jcm9zc3dhbGsucmRzLmluZA.yml b/build/status/Nl9uZXR3b3JrL291dC9zaXRlX3N0cmVhbV9jcm9zc3dhbGsucmRzLmluZA.yml index 483e688..abd7643 100644 --- a/build/status/Nl9uZXR3b3JrL291dC9zaXRlX3N0cmVhbV9jcm9zc3dhbGsucmRzLmluZA.yml +++ b/build/status/Nl9uZXR3b3JrL291dC9zaXRlX3N0cmVhbV9jcm9zc3dhbGsucmRzLmluZA.yml @@ -1,11 +1,11 @@ version: 0.3.0 name: 6_network/out/site_stream_crosswalk.rds.ind type: file -hash: 63bee8bd2f89e468cf83097242d3acb5 -time: 2021-05-05 13:46:31 UTC +hash: fe62da3f0fed14520543475b94909376 +time: 2022-12-16 22:28:04 UTC depends: - 6_network/out/site_flowlines.rds.ind: d59ecb03a1739f6bb9201e6eb2b86140 - 5_data_munge/out/stream_sites_us.rds.ind: 36d4e04430142075e97c030e36c47d71 + 6_network/out/site_flowlines.rds.ind: 59981aa41f25a866c8d73ca4bd8a309a + 5_data_munge/out/stream_sites_us.rds.ind: 822cd3a5e059f1e3a1e0b110e74af8cc fixed: c4bb3d1cfd994b4eda8cf860009af078 code: functions: From 33a648a17ff9e7117fe79142e6522c6fde8b27fb Mon Sep 17 00:00:00 2001 From: limnoliver Date: Tue, 25 Apr 2023 12:42:34 -0500 Subject: [PATCH 6/6] 
put some notes in for next commit, optimized some code so it doesn't have to read in big file twice. Not sure why indicators rebuilt, but confident there weren't downstream changes because I did an scmake on the final target in 5_munge and it did not rebuild. --- 5_data_munge/out/daily_temperatures.rds.ind | 2 +- 5_data_munge/out/daily_temperatures_qaqc.rds.ind | 2 +- 5_data_munge/out/wqp_daily_nodepths.rds.ind | 2 +- .../out/wqp_data_streams_datesres.rds.ind | 2 +- 5_data_munge/src/munge_data_files.R | 13 +++++++++++++ 5_data_munge/src/munge_wqp_files.R | 15 ++++++++------- ...XBfZGF0YV9zdHJlYW1zX2RhdGVzcmVzLnJkcy5pbmQ.yml | 6 +++--- ...L291dC93cXBfZGFpbHlfbm9kZXB0aHMucmRzLmluZA.yml | 6 +++--- ...L291dC9kYWlseV90ZW1wZXJhdHVyZXMucmRzLmluZA.yml | 6 +++--- ...9kYWlseV90ZW1wZXJhdHVyZXNfcWFxYy5yZHMuaW5k.yml | 6 +++--- ...R3b3JrL291dC9zaXRlX2Zsb3dsaW5lcy5yZHMuaW5k.yml | 6 +++--- 11 files changed, 40 insertions(+), 26 deletions(-) diff --git a/5_data_munge/out/daily_temperatures.rds.ind b/5_data_munge/out/daily_temperatures.rds.ind index a97d617..c566c59 100644 --- a/5_data_munge/out/daily_temperatures.rds.ind +++ b/5_data_munge/out/daily_temperatures.rds.ind @@ -1,2 +1,2 @@ -hash: 49d2079760cc47874ff0d787278185fc +hash: d9fe8b468e09ec2e5a324201572635e4 diff --git a/5_data_munge/out/daily_temperatures_qaqc.rds.ind b/5_data_munge/out/daily_temperatures_qaqc.rds.ind index 32b226b..d6699e2 100644 --- a/5_data_munge/out/daily_temperatures_qaqc.rds.ind +++ b/5_data_munge/out/daily_temperatures_qaqc.rds.ind @@ -1,2 +1,2 @@ -hash: 5a33a2e9b29857a90970a75036c314db +hash: 74c44fedba3f51c6351dc81f558c0822 diff --git a/5_data_munge/out/wqp_daily_nodepths.rds.ind b/5_data_munge/out/wqp_daily_nodepths.rds.ind index 96ec2a2..cd52480 100644 --- a/5_data_munge/out/wqp_daily_nodepths.rds.ind +++ b/5_data_munge/out/wqp_daily_nodepths.rds.ind @@ -1,2 +1,2 @@ -hash: 64eb9b1bce06bfb7d2d21758b3dcd8d6 +hash: 46fae0d96a8451dd9ca67c3d05a4d642 diff --git 
a/5_data_munge/out/wqp_data_streams_datesres.rds.ind b/5_data_munge/out/wqp_data_streams_datesres.rds.ind index 311aa27..d503a29 100644 --- a/5_data_munge/out/wqp_data_streams_datesres.rds.ind +++ b/5_data_munge/out/wqp_data_streams_datesres.rds.ind @@ -1,2 +1,2 @@ -hash: ec5947853df0c31d2eb86599d5acc337 +hash: 0657e60e4b6243b3bd68ab1cf37a8107 diff --git a/5_data_munge/src/munge_data_files.R b/5_data_munge/src/munge_data_files.R index bff2026..6877ca7 100644 --- a/5_data_munge/src/munge_data_files.R +++ b/5_data_munge/src/munge_data_files.R @@ -167,6 +167,19 @@ combine_all_dat <- function(wqp_ind, nwis_ind, ecosheds_ind, norwest_ind, out_in select(site_id, date = SampleDate, mean_temp_degC = DailyMean, min_temp_degC = DailyMin, max_temp_degC = DailyMax, n_obs = Nobs, source) + #!! start here + # we still have really large n values, still have max and min temperatures for a day > 30 deg difference + # this should be a sign that the start times weren't coded right and need to be dropped. + # + #browser() + + # in NWIS, using data after 2000: + # 95% of obs have temp_diff of <= 5.7 + # 99% of obs have temp_diff of <= 8.9 + # 99.5% of obs have temp_diff <= 10.3 + # 99.9% of obs have temp_diff <= 13.2 + # save + all_dat <- bind_rows(nwis, wqp, ecosheds, norwest) %>% select(-unique_id) %>% distinct(site_id, date, time, mean_temp_degC, min_temp_degC, max_temp_degC, .keep_all = TRUE) diff --git a/5_data_munge/src/munge_wqp_files.R b/5_data_munge/src/munge_wqp_files.R index cd3d241..22e7df0 100644 --- a/5_data_munge/src/munge_wqp_files.R +++ b/5_data_munge/src/munge_wqp_files.R @@ -99,12 +99,11 @@ munge_wqp_withoutdepths <- function(in_ind, min_value, max_value, max_daily_rang mutate(temperature_mean_daily = ifelse(grepl('mean', StatisticalBaseCode, ignore.case = TRUE), ResultMeasureValue, NA), temperature_min_daily = ifelse(grepl('min', StatisticalBaseCode, ignore.case = TRUE), ResultMeasureValue, NA), temperature_max_daily = ifelse(grepl('max', StatisticalBaseCode, 
ignore.case = TRUE), ResultMeasureValue, NA)) %>% + # we don't know the number of observations here because stat codes were used mutate(n_obs = NA) %>% group_by(MonitoringLocationIdentifier, ActivityStartDate) %>% summarize(across(c(temperature_mean_daily, temperature_min_daily, temperature_max_daily, n_obs) , ~ first(na.omit(.)))) - # we don't know the number of observations here because stat codes were used - dat_daily <- ungroup(dat_reduced) %>% filter(is.na(StatisticalBaseCode)) %>% @@ -120,7 +119,6 @@ munge_wqp_withoutdepths <- function(in_ind, min_value, max_value, max_daily_rang temperature_min_daily > min_value & temperature_min_daily < max_value|is.na(temperature_min_daily), temperature_max_daily > min_value & temperature_max_daily < max_value|is.na(temperature_max_daily)) - # save data_file <- scipiper::as_data_file(out_ind) saveRDS(dat_daily, data_file) gd_put(out_ind) @@ -128,8 +126,13 @@ munge_wqp_withoutdepths <- function(in_ind, min_value, max_value, max_daily_rang } resolve_statcodes <- function(in_ind, out_ind) { + dat <- readRDS(sc_retrieve(in_ind, remake_file = 'getters.yml')) %>% - ungroup() %>% + ungroup() + + nrow_o <- nrow(dat) + + dat <- dat %>% # drop values that are estimated or blank-corrected # drop values that are not min, mean, max filter(!ResultValueTypeName %in% c('Estimated', 'Blank Corrected Calc')) %>% @@ -139,8 +142,8 @@ resolve_statcodes <- function(in_ind, out_ind) { # print message that says how many observations we lost when dropped - nrow_o <- nrow(readRDS(sc_retrieve(in_ind, remake_file = 'getters.yml'))) message(paste(nrow_o - nrow(dat), 'observations were dropped due to estimation, blank correction, or statcode that was not mean, min, max')) + # for some data, the start and end dates are different, and data providers # seem to be using these as a date range of the whole dataset # sometimes, the proper collection date is in the comment field @@ -177,8 +180,6 @@ resolve_statcodes <- function(in_ind, out_ind) { filter(n == 
1) %>% select(-n) %>% ungroup() - - drop <- other %>% group_by(MonitoringLocationIdentifier, ActivityStartDate) %>% summarize(n = n()) %>% diff --git a/build/status/NV9kYXRhX211bmdlL291dC93cXBfZGF0YV9zdHJlYW1zX2RhdGVzcmVzLnJkcy5pbmQ.yml b/build/status/NV9kYXRhX211bmdlL291dC93cXBfZGF0YV9zdHJlYW1zX2RhdGVzcmVzLnJkcy5pbmQ.yml index d6c331f..a3f7347 100644 --- a/build/status/NV9kYXRhX211bmdlL291dC93cXBfZGF0YV9zdHJlYW1zX2RhdGVzcmVzLnJkcy5pbmQ.yml +++ b/build/status/NV9kYXRhX211bmdlL291dC93cXBfZGF0YV9zdHJlYW1zX2RhdGVzcmVzLnJkcy5pbmQ.yml @@ -1,12 +1,12 @@ version: 0.3.0 name: 5_data_munge/out/wqp_data_streams_datesres.rds.ind type: file -hash: aaa79d1a4e7bacbc0991767599a4e996 -time: 2022-10-26 16:32:03 UTC +hash: 0a73866d89f284c8a1a3159c8171612e +time: 2023-04-24 16:50:34 UTC depends: 5_data_munge/out/wqp_data_streams.rds.ind: 72ff57a393533becbfb82fc278a8ff81 fixed: ad14d31fed2ff9f2c6960bfbae40e1d5 code: functions: - resolve_statcodes: 934a4c614c42bcd96cac58e87b576c35 + resolve_statcodes: 3d66b335beb4a0ac1723476cb740ff77 diff --git a/build/status/NV9kYXRhX211bmdlL291dC93cXBfZGFpbHlfbm9kZXB0aHMucmRzLmluZA.yml b/build/status/NV9kYXRhX211bmdlL291dC93cXBfZGFpbHlfbm9kZXB0aHMucmRzLmluZA.yml index 41474fc..a9993d4 100644 --- a/build/status/NV9kYXRhX211bmdlL291dC93cXBfZGFpbHlfbm9kZXB0aHMucmRzLmluZA.yml +++ b/build/status/NV9kYXRhX211bmdlL291dC93cXBfZGFpbHlfbm9kZXB0aHMucmRzLmluZA.yml @@ -1,10 +1,10 @@ version: 0.3.0 name: 5_data_munge/out/wqp_daily_nodepths.rds.ind type: file -hash: a1e354106c78cbf8e0b185f08137bac2 -time: 2022-10-26 16:44:32 UTC +hash: ed52117b4be67e9c68d7e28c41e3c13a +time: 2023-04-25 17:01:46 UTC depends: - 5_data_munge/out/wqp_data_streams_datesres.rds.ind: aaa79d1a4e7bacbc0991767599a4e996 + 5_data_munge/out/wqp_data_streams_datesres.rds.ind: 0a73866d89f284c8a1a3159c8171612e min_value: feee3efa1c3c99f6ae54f943171a7d14 max_value: 8550904a2a45c13015a45b21ef719fb7 max_daily_range: afe39393dd5be2c095cd37fede5c5b89 diff --git 
a/build/status/NV9kYXRhX211bmdlL291dC9kYWlseV90ZW1wZXJhdHVyZXMucmRzLmluZA.yml b/build/status/NV9kYXRhX211bmdlL291dC9kYWlseV90ZW1wZXJhdHVyZXMucmRzLmluZA.yml index 4d0e7ae..33c0a6c 100644 --- a/build/status/NV9kYXRhX211bmdlL291dC9kYWlseV90ZW1wZXJhdHVyZXMucmRzLmluZA.yml +++ b/build/status/NV9kYXRhX211bmdlL291dC9kYWlseV90ZW1wZXJhdHVyZXMucmRzLmluZA.yml @@ -1,11 +1,11 @@ version: 0.3.0 name: 5_data_munge/out/daily_temperatures.rds.ind type: file -hash: 390b3616eec395ff28fcbd7ce867aec2 -time: 2022-10-26 16:50:24 UTC +hash: 79546c309dac4cc292da934cf6c4e0ab +time: 2023-04-25 17:17:26 UTC depends: 5_data_munge/out/nwis_daily.rds.ind: 3396bbdded05965d18f905f5cc24ff65 - 5_data_munge/out/wqp_daily_nodepths.rds.ind: a1e354106c78cbf8e0b185f08137bac2 + 5_data_munge/out/wqp_daily_nodepths.rds.ind: ed52117b4be67e9c68d7e28c41e3c13a 5_data_munge/out/ecosheds_munged.rds.ind: 0b44b5d6c2089809ea92a8917da8450e 5_data_munge/out/norwest_munged.rds.ind: 9d1118944011c968be2afa00add16553 wqp_pull_date: 1703d8ed940ea9516622d02d7d072ff6 diff --git a/build/status/NV9kYXRhX211bmdlL291dC9kYWlseV90ZW1wZXJhdHVyZXNfcWFxYy5yZHMuaW5k.yml b/build/status/NV9kYXRhX211bmdlL291dC9kYWlseV90ZW1wZXJhdHVyZXNfcWFxYy5yZHMuaW5k.yml index b45fa19..bd9a9e7 100644 --- a/build/status/NV9kYXRhX211bmdlL291dC9kYWlseV90ZW1wZXJhdHVyZXNfcWFxYy5yZHMuaW5k.yml +++ b/build/status/NV9kYXRhX211bmdlL291dC9kYWlseV90ZW1wZXJhdHVyZXNfcWFxYy5yZHMuaW5k.yml @@ -1,10 +1,10 @@ version: 0.3.0 name: 5_data_munge/out/daily_temperatures_qaqc.rds.ind type: file -hash: f27d7e0d8a492b4515ed318199b79848 -time: 2022-10-26 17:11:00 UTC +hash: d1278ec25074a5a30699dc5fb24307d7 +time: 2023-04-25 17:39:52 UTC depends: - 5_data_munge/out/daily_temperatures.rds.ind: 390b3616eec395ff28fcbd7ce867aec2 + 5_data_munge/out/daily_temperatures.rds.ind: 79546c309dac4cc292da934cf6c4e0ab 5_data_munge/out/stream_sites_us.rds.ind: 822cd3a5e059f1e3a1e0b110e74af8cc fixed: 4eed73f624bb922908b23ee8e6e8dbc3 code: diff --git 
a/build/status/Nl9uZXR3b3JrL291dC9zaXRlX2Zsb3dsaW5lcy5yZHMuaW5k.yml b/build/status/Nl9uZXR3b3JrL291dC9zaXRlX2Zsb3dsaW5lcy5yZHMuaW5k.yml index 8ff433d..8a74e7d 100644 --- a/build/status/Nl9uZXR3b3JrL291dC9zaXRlX2Zsb3dsaW5lcy5yZHMuaW5k.yml +++ b/build/status/Nl9uZXR3b3JrL291dC9zaXRlX2Zsb3dsaW5lcy5yZHMuaW5k.yml @@ -1,11 +1,11 @@ version: 0.3.0 name: 6_network/out/site_flowlines.rds.ind type: file -hash: d59ecb03a1739f6bb9201e6eb2b86140 -time: 2021-04-19 16:48:17 UTC +hash: 59981aa41f25a866c8d73ca4bd8a309a +time: 2022-12-16 22:27:27 UTC depends: 6_network/out/reach_direction.rds.ind: 2ee516b500dea725d7c88715899671aa - 5_data_munge/out/stream_sites_us.rds.ind: 36d4e04430142075e97c030e36c47d71 + 5_data_munge/out/stream_sites_us.rds.ind: 822cd3a5e059f1e3a1e0b110e74af8cc fixed: 09cec64854f41c8ad6e000bc1a943355 code: functions: