Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[DC-3645] Update Union QC notebook with fixes for column name and moving notes check #1830

Merged
merged 1 commit into from
Dec 15, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions data_steward/analytics/cdr_ops/combined.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,30 @@
execute(client, query)
# -

# ## Verify Note text data

# +
# Tally every distinct value found in the free-text columns of the `note`
# table (note_text, note_title, note_source_value), one result row per
# (field, value) pair, so the values present can be reviewed by eye.
#
# COUNT(*) is used rather than COUNT(<column>): COUNT(<column>) skips NULLs,
# so the group whose value is NULL would always report row_count = 0 and
# hide how many rows actually carry a NULL in that column.
query = f'''
SELECT 'note_text' AS field, note_text AS field_value, COUNT(*) AS row_count
FROM `{PROJECT_ID}.{DATASET_ID}.note`
GROUP BY note_text

UNION ALL

SELECT 'note_title' AS field, note_title AS field_value, COUNT(*) AS row_count
FROM `{PROJECT_ID}.{DATASET_ID}.note`
GROUP BY note_title

UNION ALL

SELECT 'note_source_value' AS field, note_source_value AS field_value, COUNT(*) AS row_count
FROM `{PROJECT_ID}.{DATASET_ID}.note`
GROUP BY note_source_value
'''

execute(client, query)
# -

# ## Date and datetime fields should have the same date
# The date represented by associated `_date` and `_datetime` fields of the same
# row should be the same. If there are any discrepancies, there may be a bug in the
Expand Down
40 changes: 8 additions & 32 deletions data_steward/analytics/cdr_ops/ehr_union_qc.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@
GROUP BY
src_hpo_id)
SELECT
cc.src_hpo_id,
src_hpo_id,
coalesce(previous_unioned_ct, 0) as previous_unioned_ct,
coalesce(current_unioned_ct, 0) AS current_unioned_ct,
coalesce(cast((current_unioned_ct - previous_unioned_ct)/ current_unioned_ct * 100 as INT64), 100) as percentage_change
Expand All @@ -172,77 +172,53 @@
execute(client, query)
# -

# ## Verify Note text data

# +
query = f'''
SELECT 'note_text' AS field, note_text AS field_value, COUNT(note_text) AS row_count,
FROM `{PROJECT_ID}.{CURRENT_UNIONED_EHR_DATASET_ID}.note`
GROUP BY note_text

UNION ALL

SELECT 'note_title' AS field, note_title AS field_value, COUNT(note_title) AS row_count,
FROM `{PROJECT_ID}.{CURRENT_UNIONED_EHR_DATASET_ID}.note`
GROUP BY note_title

UNION ALL

SELECT 'note_source_value' AS field, note_source_value AS field_value, COUNT(note_source_value) AS row_count,
FROM `{PROJECT_ID}.{CURRENT_UNIONED_EHR_DATASET_ID}.note`
GROUP BY note_source_value
'''

execute(client, query)
# -

# ## Verifying no data past cut-off date

# +
query = f'''
SELECT
'observation' AS TABLE,
COUNT(*) AS non_clompling_rows
COUNT(*) AS non_complying_rows
FROM
`{PROJECT_ID}.{CURRENT_UNIONED_EHR_DATASET_ID}.observation`
WHERE
observation_date > DATE('{EHR_CUTOFF_DATE}')
UNION ALL
SELECT
'measurement' AS TABLE,
COUNT(*) AS non_clompling_rows
COUNT(*) AS non_complying_rows
FROM
`{PROJECT_ID}.{CURRENT_UNIONED_EHR_DATASET_ID}.measurement`
WHERE
measurement_date > DATE('{EHR_CUTOFF_DATE}')
UNION ALL
SELECT
'visit_occurrence' AS TABLE,
COUNT(*) AS non_clompling_rows
COUNT(*) AS non_complying_rows
FROM
`{PROJECT_ID}.{CURRENT_UNIONED_EHR_DATASET_ID}.visit_occurrence`
WHERE
visit_end_date > DATE('{EHR_CUTOFF_DATE}')
UNION ALL
SELECT
'drug_exposure' AS TABLE,
COUNT(*) AS non_clompling_rows
COUNT(*) AS non_complying_rows
FROM
`{PROJECT_ID}.{CURRENT_UNIONED_EHR_DATASET_ID}.drug_exposure`
WHERE
drug_exposure_end_date > DATE('{EHR_CUTOFF_DATE}')
UNION ALL
SELECT
'procedure' AS TABLE,
COUNT(*) AS non_clompling_rows
COUNT(*) AS non_complying_rows
FROM
`{PROJECT_ID}.{CURRENT_UNIONED_EHR_DATASET_ID}.procedure_occurrence`
WHERE
procedure_date > DATE('{EHR_CUTOFF_DATE}')
UNION ALL
SELECT
'visit_detail' AS TABLE,
COUNT(*) AS non_clompling_rows
COUNT(*) AS non_complying_rows
FROM
`{PROJECT_ID}.{CURRENT_UNIONED_EHR_DATASET_ID}.visit_detail`
WHERE
Expand Down Expand Up @@ -353,7 +329,7 @@
SELECT
aou_death_id,
CASE WHEN aou_death_id IN (
SELECT aou_death_id FROM `{{project_id}}.{{dataset_id}}.aou_death`
SELECT aou_death_id FROM `{{project_id}}.{{dataset}}.aou_death`
WHERE death_date IS NOT NULL -- NULL death_date records must not become primary --
QUALIFY RANK() OVER (
PARTITION BY person_id
Expand Down