Skip to content
This repository has been archived by the owner on Jun 1, 2023. It is now read-only.

Wqp fixes #58

Merged
merged 6 commits into from
Apr 25, 2023
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion 1_wqp_pull/out/wqp_data.rds.ind
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
hash: fa556791765715562797e6c2dad855ee
hash: 80fac6326f338af72b0f1945905f8d72

4 changes: 2 additions & 2 deletions 1_wqp_pull/src/get_wqp_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -130,13 +130,13 @@ combine_wqp_dat <- function(ind_file, ...){
}
dat_mod <- dat_mod %>%
filter(!is.na(ResultMeasureValue)) %>%
select(MonitoringLocationIdentifier, ActivityMediaName, ActivityStartDate, `ActivityStartTime/Time`, `ActivityStartTime/TimeZoneCode`,
select(MonitoringLocationIdentifier, ActivityMediaName, ActivityStartDate, ActivityEndDate, `ActivityStartTime/Time`, `ActivityStartTime/TimeZoneCode`,
`ActivityDepthHeightMeasure/MeasureValue`, `ActivityDepthHeightMeasure/MeasureUnitCode`,
limnoliver marked this conversation as resolved.
Show resolved Hide resolved
`ActivityTopDepthHeightMeasure/MeasureValue`,`ActivityTopDepthHeightMeasure/MeasureUnitCode`,
`ActivityBottomDepthHeightMeasure/MeasureValue`, `ActivityBottomDepthHeightMeasure/MeasureUnitCode`,
ActivityCommentText, `SampleCollectionMethod/MethodIdentifier`, `SampleCollectionMethod/MethodIdentifierContext`,
`SampleCollectionMethod/MethodName`,CharacteristicName, ResultMeasureValue,
`ResultMeasure/MeasureUnitCode`, ResultValueTypeName, PrecisionValue,
`ResultMeasure/MeasureUnitCode`, ResultValueTypeName, PrecisionValue, StatisticalBaseCode,
ResultCommentText, `ResultDepthHeightMeasure/MeasureValue`, `ResultDepthHeightMeasure/MeasureUnitCode`, ProviderName) %>%
mutate(PrecisionValue = as.numeric(PrecisionValue)) %>%
mutate_at(vars(contains('MeasureValue')), as.numeric) %>%
Expand Down
13 changes: 11 additions & 2 deletions 5_data_munge.yml
Original file line number Diff line number Diff line change
Expand Up @@ -44,15 +44,24 @@ targets:
keep_types = stream_types,
out_ind = target_name)

# Resolve a few known issues regarding start/end dates:
# drop observations where the start and end dates differ and there is
# more than one obs per site-date; try to recover observations where the
# true collection date is embedded in the comment field.
5_data_munge/out/wqp_data_streams_datesres.rds.ind:
command: resolve_statcodes(
in_ind = '5_data_munge/out/wqp_data_streams.rds.ind',
out_ind = target_name)

5_data_munge/out/wqp_daily_depths.rds.ind:
command: munge_wqp_withdepths(
in_ind = '5_data_munge/out/wqp_data_streams.rds.ind',
in_ind = '5_data_munge/out/wqp_data_streams_datesres.rds.ind',
min_value, max_value, max_daily_range,
out_ind = target_name)

5_data_munge/out/wqp_daily_nodepths.rds.ind:
command: munge_wqp_withoutdepths(
in_ind = '5_data_munge/out/wqp_data_streams.rds.ind',
in_ind = '5_data_munge/out/wqp_data_streams_datesres.rds.ind',
min_value, max_value, max_daily_range,
out_ind = target_name)

Expand Down
2 changes: 1 addition & 1 deletion 5_data_munge/out/daily_temperatures.rds.ind
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
hash: 747034b3bcd79a69f58df11a7b52ad07
hash: 49d2079760cc47874ff0d787278185fc

2 changes: 1 addition & 1 deletion 5_data_munge/out/daily_temperatures_qaqc.rds.ind
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
hash: 5f5d0a4a4eb977025698ebccba574bca
hash: 5a33a2e9b29857a90970a75036c314db

2 changes: 1 addition & 1 deletion 5_data_munge/out/daily_temperatures_summary.csv
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
n_obs,n_sites
27368431,293513
27323646,292462
2 changes: 1 addition & 1 deletion 5_data_munge/out/flagged_temperature_summary.csv
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
n_flagged_obs,perc_flagged_obs,n_flagged_sites,perc_flagged_sites
1061259,4.1,87062,29.7
1058664,4.1,86984,29.9
2 changes: 1 addition & 1 deletion 5_data_munge/out/wqp_daily_depths.rds.ind
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
hash: 7ff8028fb0c2115a5371684c17bd6927
hash: bfe822528a7c57aa65e880a3bafd6c9a

2 changes: 1 addition & 1 deletion 5_data_munge/out/wqp_daily_nodepths.rds.ind
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
hash: db18e66f40e06e34622a764d7f3ca9c6
hash: 64eb9b1bce06bfb7d2d21758b3dcd8d6

2 changes: 1 addition & 1 deletion 5_data_munge/out/wqp_data_streams.rds.ind
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
hash: e1fc3f9ca8c625e78dafa54fa4736432
hash: 9a6b397af80917e46559b5a871e8e602

2 changes: 2 additions & 0 deletions 5_data_munge/out/wqp_data_streams_datesres.rds.ind
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
hash: ec5947853df0c31d2eb86599d5acc337

122 changes: 113 additions & 9 deletions 5_data_munge/src/munge_wqp_files.R
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ munge_wqp_withdepths <- function(in_ind, min_value, max_value, max_daily_range,

munge_wqp_withoutdepths <- function(in_ind, min_value, max_value, max_daily_range, out_ind) {
dat <- readRDS(sc_retrieve(in_ind, remake_file = 'getters.yml')) %>%
select(MonitoringLocationIdentifier, ActivityStartDate, ResultMeasureValue,
select(MonitoringLocationIdentifier, ActivityStartDate, ResultMeasureValue, StatisticalBaseCode,
limnoliver marked this conversation as resolved.
Show resolved Hide resolved
`ResultMeasure/MeasureUnitCode`, `ActivityStartTime/Time`, ActivityMediaName,
`ActivityDepthHeightMeasure/MeasureValue`, `ResultDepthHeightMeasure/MeasureValue`,
`ActivityTopDepthHeightMeasure/MeasureValue`, `ActivityBottomDepthHeightMeasure/MeasureValue`) %>%
Expand All @@ -93,19 +93,123 @@ munge_wqp_withoutdepths <- function(in_ind, min_value, max_value, max_daily_rang
f_to_c(ResultMeasureValue), ResultMeasureValue)) %>%
mutate(`ResultMeasure/MeasureUnitCode` = 'deg C')

dat_daily <- group_by(dat_reduced, MonitoringLocationIdentifier, ActivityStartDate) %>%
summarize(temperature_mean_daily = mean(ResultMeasureValue),
temperature_min_daily = min(ResultMeasureValue),
temperature_max_daily = max(ResultMeasureValue),
n_obs = n(),
dat_reduced_statcode <- ungroup(dat_reduced) %>%
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is the grouping structure for dat_reduced? You could consider moving the ungroup() step to line 95 to avoid repeating that step in line 96 and then again in line 109.

filter(!is.na(StatisticalBaseCode)) %>%
group_by(MonitoringLocationIdentifier, ActivityStartDate) %>%
limnoliver marked this conversation as resolved.
Show resolved Hide resolved
mutate(temperature_mean_daily = ifelse(grepl('mean', StatisticalBaseCode, ignore.case = TRUE), ResultMeasureValue, NA),
temperature_min_daily = ifelse(grepl('min', StatisticalBaseCode, ignore.case = TRUE), ResultMeasureValue, NA),
temperature_max_daily = ifelse(grepl('max', StatisticalBaseCode, ignore.case = TRUE), ResultMeasureValue, NA)) %>%
mutate(n_obs = NA) %>%
group_by(MonitoringLocationIdentifier, ActivityStartDate) %>%
summarize(across(c(temperature_mean_daily, temperature_min_daily, temperature_max_daily, n_obs) , ~ first(na.omit(.))))

# we don't know the number of observations here because stat codes were used


dat_daily <- ungroup(dat_reduced) %>%
filter(is.na(StatisticalBaseCode)) %>%
group_by(MonitoringLocationIdentifier, ActivityStartDate) %>%
summarize(n_obs = n(),
temperature_mean_daily = mean(ResultMeasureValue),
# we don't want to propagate min and max if there is only one value
temperature_min_daily = ifelse(n_obs>1, min(ResultMeasureValue), NA),
temperature_max_daily = ifelse(n_obs>1, max(ResultMeasureValue), NA),
time = ifelse(n_obs == 1, `ActivityStartTime/Time`, NA)) %>%
filter(temperature_mean_daily > min_value & temperature_mean_daily < max_value,
temperature_min_daily > min_value & temperature_min_daily < max_value,
temperature_max_daily > min_value & temperature_max_daily < max_value)
bind_rows(dat_reduced_statcode) %>%
filter(temperature_mean_daily > min_value & temperature_mean_daily < max_value|is.na(temperature_mean_daily),
temperature_min_daily > min_value & temperature_min_daily < max_value|is.na(temperature_min_daily),
temperature_max_daily > min_value & temperature_max_daily < max_value|is.na(temperature_max_daily))

# save
data_file <- scipiper::as_data_file(out_ind)
saveRDS(dat_daily, data_file)
gd_put(out_ind)

}

# Resolve known issues with WQP start/end dates and statistic codes.
#
# in_ind  - indicator file for the input data frame (fetched via
#           sc_retrieve() using getters.yml)
# out_ind - indicator file for the output; the cleaned data frame is written
#           to the corresponding .rds data file and pushed with gd_put()
#
# Steps:
#   1) drop estimated / blank-corrected values, and any StatisticalBaseCode
#      that is not a recognized min/mean/max-type code
#   2) for rows where ActivityStartDate != ActivityEndDate (some providers
#      report the date range of the whole dataset rather than the sample date):
#        - recover the true collection date/time from ResultCommentText when
#          it contains a 'Collected: <date> <time>' string
#        - otherwise keep the row only if it is the single obs for its
#          site-date (site-dates with many obs are known-bad data)
#        - drop mismatched rows that carry a stat code, because those values
#          are averaged over multiple days
resolve_statcodes <- function(in_ind, out_ind) {
  # read once and count before filtering (previously the large RDS was read
  # from disk a second time just to get the pre-filter row count)
  raw <- readRDS(sc_retrieve(in_ind, remake_file = 'getters.yml'))
  nrow_o <- nrow(raw)

  dat <- raw %>%
    ungroup() %>%
    # drop values that are estimated or blank-corrected
    # drop values that are not min, mean, max
    filter(!ResultValueTypeName %in% c('Estimated', 'Blank Corrected Calc')) %>%
    filter(StatisticalBaseCode %in% c(NA, 'Mean', 'Minimum', 'Maximum', 'mean',
                                      'Geometric Mean', 'Daily Maximum', 'Daily Minimum',
                                      'Daily Geometric Mean'))

  # report how many observations the filters above removed
  message(paste(nrow_o - nrow(dat), 'observations were dropped due to estimation, blank correction, or statcode that was not mean, min, max'))

  # rows where the reported start and end dates disagree; this subset is
  # reused three times below, so compute it once
  mismatched <- filter(dat, !is.na(ActivityEndDate)) %>%
    filter(ActivityStartDate != ActivityEndDate)

  # for some data, the start and end dates are different, and data providers
  # seem to be using these as a date range of the whole dataset.
  # sometimes, the proper collection date is in the comment field.
  # we're dropping data that has a StatisticalBaseCode because we don't want
  # values averaged over multiple days.
  range_dates <- mismatched %>%
    filter(is.na(StatisticalBaseCode)) %>%
    filter(grepl('Collected', ResultCommentText)) %>%
    # parse e.g. 'Collected: Jan 01 2020 1:30 PM' into date and time pieces
    mutate(newActivityStartDate = gsub('(Collected: )(.*\\d{4})(\\s*\\d+.*)', '\\2', ResultCommentText, perl = TRUE),
           newActivityStartTime = gsub('(Collected: .*\\d{4}\\s*)(\\d+.*)', '\\2', ResultCommentText, perl = TRUE)) %>%
    mutate(newActivityStartTime = format(strptime(newActivityStartTime, format = '%I:%M %p'), '%H:%M:%S'),
           newActivityStartDate = as.Date(newActivityStartDate, format = "%b %d %Y")) %>%
    select(-ActivityStartDate, -`ActivityStartTime/Time`) %>%
    rename(ActivityStartDate = newActivityStartDate,
           `ActivityStartTime/Time` = newActivityStartTime)

  # print message about date recoveries
  message(paste(nrow(range_dates), 'observations with mismatching start/end dates were recovered by extracting collection dates from comments'))

  # those that don't have collected in the comments
  other <- mismatched %>%
    filter(is.na(StatisticalBaseCode)) %>%
    filter(!grepl('Collected', ResultCommentText))

  # look at the number of obs per day per site-date to weed out bad sites
  # only keep sites that have one obs per day
  # site dates with > 1440 obs (which is 1obs/minute) was most of the drops here,
  # so we're confident we're dropping bad data
  keep_onesiteday <- other %>%
    group_by(MonitoringLocationIdentifier, ActivityStartDate) %>%
    mutate(n = n()) %>%
    filter(n == 1) %>%
    select(-n) %>% ungroup()

  drop <- other %>%
    group_by(MonitoringLocationIdentifier, ActivityStartDate) %>%
    summarize(n = n()) %>%
    filter(n > 1)

  # WIDNR is called out separately because it contributes most of these
  # bad site-dates
  widnr <- filter(drop, grepl('WIDNR', MonitoringLocationIdentifier))

  message(paste(nrow(drop), 'site-dates and', sum(drop$n),
                'raw observations were dropped because n>1 obs per site-date and start-end dates did not match.',
                'WIDNR was responsible for', nrow(widnr), 'site-dates and', sum(widnr$n), 'raw observations.'))

  # mismatched-date rows that carry a stat code are dropped outright
  statdiffdates <- mismatched %>%
    filter(!is.na(StatisticalBaseCode))

  message(length(unique(paste(statdiffdates$MonitoringLocationIdentifier, statdiffdates$ActivityStartDate))),
          ' site-dates and ', nrow(statdiffdates), ' raw observations were dropped because the observation had a stat code but different start/end dates')

  out <- filter(dat, ActivityStartDate == ActivityEndDate | is.na(ActivityEndDate)) %>% # keep all data where start/end dates are same
    bind_rows(range_dates) %>% # keep all data where we fixed the dates from the comments
    bind_rows(keep_onesiteday) # keep all data where the start/end was different but there was only one value per site-date

  # this quantity is the share *dropped*, not kept — the previous local name
  # 'perc_keep' was misleading
  raw_dropped <- nrow(dat) - nrow(out)
  perc_dropped <- round(raw_dropped / nrow(dat) * 100, 1)

  message(paste(perc_dropped, 'percent of observations were dropped...or',
                raw_dropped, 'raw observations were dropped due to mismatch start/end dates'))

  data_file <- scipiper::as_data_file(out_ind)
  saveRDS(out, data_file)
  gd_put(out_ind)
}
2 changes: 1 addition & 1 deletion 6_network/out/site_flowlines.rds.ind
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
hash: 393bbb8751a97f632580c62efd627e97
hash: 6dc4731a52161e291ac13e3d1419ee41

2 changes: 1 addition & 1 deletion 6_network/out/site_stream_crosswalk.rds.ind
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
hash: 53a04e06755af97258c7d7848a1ee5de
hash: 9b22cff7abde66fe97d81b8cee861a32

6 changes: 3 additions & 3 deletions build/status/MV93cXBfcHVsbC9vdXQvd3FwX2RhdGEucmRzLmluZA.yml
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
version: 0.3.0
name: 1_wqp_pull/out/wqp_data.rds.ind
type: file
hash: 1a8dbb9cd4dfd070cbfc946511ca65d7
time: 2022-03-29 18:28:28 UTC
hash: 3edeb04eb813118f8aa4a901d080b24b
time: 2022-10-19 15:31:10 UTC
depends:
wqp_pull_plan: 6f549ec3f2aa682ceace9c864015ff0a
1_wqp_pull_tasks.yml: dd5e68ad85ae31a47f12bd4bf52eeee7
1_wqp_pull/src/get_wqp_data.R: 65def7cf8815ff64e926482a9640b97f
1_wqp_pull/src/get_wqp_data.R: b05dcdb365bbcd5a4e241f0c46076fac
fixed: b103af1551106aa1ebfea5ec48fe44e0
code:
functions:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
version: 0.3.0
name: 5_data_munge/out/wqp_data_streams.rds.ind
type: file
hash: 1091d4b0e14854dd8450a914273a0363
time: 2022-03-31 20:30:18 UTC
hash: 72ff57a393533becbfb82fc278a8ff81
time: 2022-10-24 04:21:10 UTC
depends:
1_wqp_pull/out/wqp_data.rds.ind: 1a8dbb9cd4dfd070cbfc946511ca65d7
1_wqp_pull/out/wqp_data.rds.ind: 3edeb04eb813118f8aa4a901d080b24b
1_wqp_pull/inout/wqp_inventory.feather.ind: 4d0042964407bcf7c2656a19e5c4bcd2
stream_types: ff830476c42cccbd885995ef9cb26b5c
fixed: 987ddc70a46a2204bf0a037cfbaf2482
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
version: 0.3.0
name: 5_data_munge/out/wqp_data_streams_datesres.rds.ind
type: file
hash: aaa79d1a4e7bacbc0991767599a4e996
time: 2022-10-26 16:32:03 UTC
depends:
5_data_munge/out/wqp_data_streams.rds.ind: 72ff57a393533becbfb82fc278a8ff81
fixed: ad14d31fed2ff9f2c6960bfbae40e1d5
code:
functions:
resolve_statcodes: 934a4c614c42bcd96cac58e87b576c35

Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
version: 0.3.0
name: 5_data_munge/out/wqp_daily_depths.rds.ind
type: file
hash: 696fcc6e752c73008bb62bace2cdac56
time: 2022-03-31 20:53:49 UTC
hash: 62da8cd8a92119b6236155a1b38bacb8
time: 2022-10-25 15:28:49 UTC
depends:
5_data_munge/out/wqp_data_streams.rds.ind: 1091d4b0e14854dd8450a914273a0363
5_data_munge/out/wqp_data_streams_datesres.rds.ind: a3e8b99d7e975a1c1812e58f6000c4fd
min_value: feee3efa1c3c99f6ae54f943171a7d14
max_value: 8550904a2a45c13015a45b21ef719fb7
max_daily_range: afe39393dd5be2c095cd37fede5c5b89
Expand Down
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
version: 0.3.0
name: 5_data_munge/out/wqp_daily_nodepths.rds.ind
type: file
hash: d6adcb94d2168ceb7d1977635bdebbed
time: 2022-04-25 01:23:35 UTC
hash: a1e354106c78cbf8e0b185f08137bac2
time: 2022-10-26 16:44:32 UTC
depends:
5_data_munge/out/wqp_data_streams.rds.ind: 1091d4b0e14854dd8450a914273a0363
5_data_munge/out/wqp_data_streams_datesres.rds.ind: aaa79d1a4e7bacbc0991767599a4e996
min_value: feee3efa1c3c99f6ae54f943171a7d14
max_value: 8550904a2a45c13015a45b21ef719fb7
max_daily_range: afe39393dd5be2c095cd37fede5c5b89
fixed: c8da49989b1a3d05cbbfd15be13cfef9
code:
functions:
f_to_c: a61909ceabd1801b610840dc86315ffe
munge_wqp_withoutdepths: 4f675338f8fd61f29d42cd02819ebaab
munge_wqp_withoutdepths: 5e779d4adf6c955c7431d9f00c7e8b2b

Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
version: 0.3.0
name: 5_data_munge/out/daily_temperatures.rds.ind
type: file
hash: 8391327cc8c907cbf3b58133509ce816
time: 2022-04-25 01:34:39 UTC
hash: 390b3616eec395ff28fcbd7ce867aec2
time: 2022-10-26 16:50:24 UTC
depends:
5_data_munge/out/nwis_daily.rds.ind: 3396bbdded05965d18f905f5cc24ff65
5_data_munge/out/wqp_daily_nodepths.rds.ind: d6adcb94d2168ceb7d1977635bdebbed
5_data_munge/out/wqp_daily_nodepths.rds.ind: a1e354106c78cbf8e0b185f08137bac2
5_data_munge/out/ecosheds_munged.rds.ind: 0b44b5d6c2089809ea92a8917da8450e
5_data_munge/out/norwest_munged.rds.ind: 9d1118944011c968be2afa00add16553
wqp_pull_date: 1703d8ed940ea9516622d02d7d072ff6
Expand Down
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
version: 0.3.0
name: 5_data_munge/out/daily_temperatures_qaqc.rds.ind
type: file
hash: 26e19a7dd06b10ff39ce1311f139559e
time: 2022-04-25 14:05:20 UTC
hash: f27d7e0d8a492b4515ed318199b79848
time: 2022-10-26 17:11:00 UTC
depends:
5_data_munge/out/daily_temperatures.rds.ind: 8391327cc8c907cbf3b58133509ce816
5_data_munge/out/daily_temperatures.rds.ind: 390b3616eec395ff28fcbd7ce867aec2
5_data_munge/out/stream_sites_us.rds.ind: 822cd3a5e059f1e3a1e0b110e74af8cc
fixed: 4eed73f624bb922908b23ee8e6e8dbc3
code:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
version: 0.3.0
name: 6_network/out/site_stream_crosswalk.rds.ind
type: file
hash: 63bee8bd2f89e468cf83097242d3acb5
time: 2021-05-05 13:46:31 UTC
hash: fe62da3f0fed14520543475b94909376
time: 2022-12-16 22:28:04 UTC
depends:
6_network/out/site_flowlines.rds.ind: d59ecb03a1739f6bb9201e6eb2b86140
5_data_munge/out/stream_sites_us.rds.ind: 36d4e04430142075e97c030e36c47d71
6_network/out/site_flowlines.rds.ind: 59981aa41f25a866c8d73ca4bd8a309a
5_data_munge/out/stream_sites_us.rds.ind: 822cd3a5e059f1e3a1e0b110e74af8cc
fixed: c4bb3d1cfd994b4eda8cf860009af078
code:
functions:
Expand Down
3 changes: 3 additions & 0 deletions getters.yml
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,9 @@ targets:
5_data_munge/out/wqp_data_streams.rds:
command: gd_get('5_data_munge/out/wqp_data_streams.rds.ind')

5_data_munge/out/wqp_data_streams_datesres.rds:
command: gd_get('5_data_munge/out/wqp_data_streams_datesres.rds.ind')

5_data_munge/out/ecosheds_munged.rds:
command: gd_get('5_data_munge/out/ecosheds_munged.rds.ind')

Expand Down