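"""Scrapers for Thailand MoPH/DMSc COVID-19 testing and variant reports.

Downloads report files (pdf/pptx/xlsx) from shared Google Drive folders with a
local-cache fallback, parses daily test numbers, tests by health area and by
province, and variant prevalence/sequencing tables, then exports them as CSV.
"""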
import datetime
import os
import re
import numpy as np
import pandas as pd
import requests
from dateutil.parser import parse as d
from utils_pandas import daterange
from utils_pandas import export
from utils_pandas import import_csv
from utils_pandas import spread_date_range
from utils_scraping import all_in
from utils_scraping import any_in
from utils_scraping import camelot_cache
from utils_scraping import get_next_numbers
from utils_scraping import local_files
from utils_scraping import logger
from utils_scraping import parse_file
from utils_scraping import pptx2chartdata
from utils_scraping import USE_CACHE_DATA
from utils_scraping import web_files
from utils_thai import file2date
from utils_thai import find_date_range
from utils_thai import find_dates
from utils_thai import join_provinces
from utils_thai import POS_COLS
from utils_thai import TEST_COLS
##########################################
# Testing data
##########################################
# def get_test_dav_files(url="http://nextcloud.dmsc.moph.go.th/public.php/webdav",
# username="wbioWZAQfManokc",
# password="null",
# ext=".pdf .pptx",
# dir="inputs/testing_moph"):
# return dav_files(url, username, password, ext, dir)
def get_drive_files(folder_id, ext="pdf", dir="inputs/testing_moph", check=True):
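    """Yield (local_path, download_fn) pairs for files with extension `ext` in a
    public Google Drive folder, falling back to previously cached files in `dir`
    when DRIVE_API_KEY is unset or the Drive API request fails."""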
key = os.environ.get('DRIVE_API_KEY', None)
if key is None or not check:
if key is None:
logger.warning("env DRIVE_API_KEY missing")
logger.warning(f"Using local cached data only for {folder_id} {dir}")
yield from local_files(ext, dir)
return
url = f"https://www.googleapis.com/drive/v3/files?q=%27{folder_id}%27+in+parents&key={key}"
res = requests.get(url).json()
if "files" not in res:
logger.warning(f"Error accessing drive files. Using local cache: {res}")
yield from local_files(ext, dir)
return
files = res["files"]
while "nextPageToken" in res:
token = res['nextPageToken']
url = f"https://www.googleapis.com/drive/v3/files?q=%27{folder_id}%27+in+parents&key={key}&pageToken={token}"
res = requests.get(url).json()
if "files" not in res:
logger.error(str(res))
break
files.extend(res["files"])
count = 0
for data in files:
id = data['id']
name = data['name']
# TODO: how to get modification date?
if not name.endswith(ext):
continue
count += 1
target = os.path.join(dir, name)
os.makedirs(os.path.dirname(target), exist_ok=True)
def get_file(id=id, name=name):
url = f"https://www.googleapis.com/drive/v2/files/{id}?alt=media&key={key}"
file, _, _ = next(iter(web_files(url, dir=dir, filenamer=lambda url, _: name)))
return file
yield target, get_file
if count == 0:
logger.warning("No files found: Using local cached testing data only")
yield from local_files(ext, dir)
return
def get_test_files(ext="pdf", dir="inputs/testing_moph", check=True):
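    """Yield the MoPH daily testing report files from Google Drive."""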
folder_id = "1yUVwstf5CmdvBVtKBs0uReV0BTbjQYlT"
yield from get_drive_files(folder_id, ext, dir, check=check)
def get_variant_files(ext=".pdf", dir="inputs/variants", check=True):
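    """Yield the DMSc variant report files from Google Drive."""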
# https://drive.google.com/drive/folders/13k14Hs61pgrK8raSMS9LFQn83PKswS-b
folder_id = "13k14Hs61pgrK8raSMS9LFQn83PKswS-b"
yield from get_drive_files(folder_id, ext=ext, dir=dir, check=check)
def get_tests_by_day():
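    """Return daily national test numbers (PCR and ATK) parsed from the MoPH
    testing xlsx reports as a single Date-indexed DataFrame."""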
def from_reports():
df = pd.DataFrame()
files = ""
missing = "Thailand_COVID-19_ATK_data-update-20220604.xlsx" # Until they bring it back, get from local cache
for file, dl in list(get_test_files(ext="xlsx")) + list(get_test_files(ext=missing)):
dl() # Cache everything just in case
if not any_in(file, "ATK", "testing_data"):
continue
tests = pd.read_excel(file, parse_dates=True)
tests = tests.drop(columns=tests.columns[4:]) # Some have some rubbish on the right
tests = tests.dropna(how="all", axis=1)
if "ATK" in file:
tests = tests.rename(columns={"approve date": "Date", "countPositive": "Pos ATK", "total": "Tests ATK"})
else:
tests = tests.rename(columns={'Pos': "Pos XLS", 'Total': "Tests XLS"})
tests["Tests ATK"] = np.nan
tests["Pos ATK"] = np.nan
tests = tests.drop(tests[tests['Date'].isna()].index) # get rid of totals row
tests = tests.drop(columns="Week") if "Week" in tests.columns else tests
tests = tests.iloc[1:] # Get rid of first row with unspecified data
tests['Date'] = pd.to_datetime(tests['Date'], dayfirst=True)
tests = tests.set_index("Date")
# tests.drop("Cannot specify date", inplace=True)
files += " " + file
df = df.combine_first(tests)
return files, df
def from_data():
url = "https://data.go.th/dataset/9f6d900f-f648-451f-8df4-89c676fce1c4/resource/0092046c-db85-4608-b519-ce8af099315e/download/thailand_covid-19_testing_data_update091064.csv" # NOQA
file, _, _ = next(iter(web_files(url, dir="inputs/testing_moph")))
tests = pd.read_csv(file, parse_dates=True, usecols=[0, 1, 2])
return file, tests.rename(columns={'positive': "Pos", 'Total Testing': "Total"})
    files, tests = from_reports()
    if not files:  # from_reports returns "" (not None) when no report files matched
        return tests
    # NOTE: currently unused. Redistributes the "Cannot specify date" totals
    # across the known dates.
    def redistribute(tests):
        pos = tests.loc[tests["Date"] == "Cannot specify date", 'Pos'].sum()
        total = tests.loc[tests["Date"] == "Cannot specify date", 'Total'].sum()
        tests = tests[tests["Date"] != "Cannot specify date"]
        # Need to redistribute the unknown values across known values
        # Documentation tells us it was 11 labs and only before 3 April
        unknown_end_date = datetime.datetime(day=3, month=4, year=2020)
        all_pos = tests["Pos"][:unknown_end_date].sum()
        all_total = tests["Total"][:unknown_end_date].sum()
        for index, row in tests.iterrows():
            if index > unknown_end_date:
                continue
            # assign back via .loc: mutating the iterrows() row copy has no effect
            tests.loc[index, 'Pos'] = float(row['Pos']) + row['Pos'] / all_pos * pos
            tests.loc[index, 'Total'] = float(row['Total']) + row['Total'] / all_total * total
        # TODO: still doesn't redistribute all missing values due to rounding. about 200 left
        # print(tests["Pos"].sum(), pos + all_pos)
        # print(tests["Total"].sum(), total + all_total)
        return tests
logger.info("{} {}", file, len(tests))
return tests
def get_tests_per_province():
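    """Return a (Date, Province)-indexed DataFrame of tests and positives per
    province parsed from the "รายจังหวัด" xlsx reports, and export it as
    tests_by_province.csv."""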
# df = pd.DataFrame(index=pd.MultiIndex.from_tuples([], names=['Date', 'Province']))
df = pd.DataFrame(columns=['Date', 'Province']).set_index(['Date', 'Province'])
for file, dl in list(get_test_files(ext="รายจังหวัด.xlsx")):
dl() # Cache everything just in case
# Now get everything inc those no longer there
for file, dl in list(get_test_files(ext="รายจังหวัด.xlsx", check=False)):
# TODO: work out how to process 2023.01.21_แยกประเภทของผล-รายจังหวัด.xlsx. Tests of different types per province
date = file2date(file)
tests = pd.read_excel(file, header=None)
tests.iloc[0] = tests.iloc[0].ffill() # Fill in the dates
tests.iloc[1] = tests.iloc[1].ffill() # fill in the type of test
tests = tests.drop(columns=tests.columns[0:1]) # get rid of province number
# TODO: unpivot the days down and then join the columns names then match the province names
cols = tests.iloc[0:3].transpose()
cols.columns = ["Date", "Tested", "Type"]
day = cols['Date'].iloc[-1]
# TODO: will need better method when it changes month?
cols['Date'] = cols['Date'].apply(
lambda d: np.nan if pd.isna(d) else datetime.datetime(date.year, date.month, int(d)) if int(d) <= date.day else datetime.datetime(date.year, date.month - 1, int(d)))
dates = cols['Date']
cols['Metric'] = cols.apply(lambda row: row['Tested'] + ' ' + ('Pos' if row['Type'] ==
'detected' else 'Tests') if not pd.isna(row['Type']) else np.nan, axis=1)
cols = cols.drop(columns=['Tested', 'Type'])
cols = pd.MultiIndex.from_frame(cols.iloc[1:])
        tests = tests.iloc[3:-2]  # Get rid of headings, empty rows and totals
tests = tests.rename(columns={1: "Province"})
tests = tests.set_index("Province")
tests = join_provinces(tests, "Province", extra=[])
tests.columns = cols
tests = tests.unstack().to_frame("Tests").reset_index(["Metric"]).pivot(columns=["Metric"])
tests.columns = tests.columns.droplevel(0)
df = df.combine_first(tests)
logger.info("Tests by Prov {} - {} {}", dates.min(), dates.max(), file)
export(df, "tests_by_province", csv_only=True)
return df
def get_tests_by_area_chart_pptx(file, title, series, data, raw):
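    """Parse a per-health-area positives/tests chart from a pptx slide, merging
    it into `data` (spread across the date range) and `raw` (one row per
    report period)."""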
start, end = find_date_range(title)
if start is None or "เริ่มเปิดบริการ" in title or not any_in(title, "เขตสุขภาพ", "เขตสุขภำพ"):
return data, raw
# the graph for X period split by health area.
# Need both pptx and pdf as one pdf is missing
if "จำนวนผลบวก" not in series:
# 2021-08-24 they added another graph with %
return data, raw
pos = list(series["จำนวนผลบวก"])
tests = list(series["จำนวนตรวจ"])
row = pos + tests + [sum(pos), sum(tests)]
results = spread_date_range(start, end, row, ["Date"] + POS_COLS + TEST_COLS + ["Pos Area", "Tests Area"])
# print(results)
data = data.combine_first(results)
raw = raw.combine_first(pd.DataFrame(
[[start, end, ] + pos + tests],
columns=["Start", "End", ] + POS_COLS + TEST_COLS
).set_index("Start"))
logger.info("Tests by Area {} - {} {}", start.date(), end.date(), file)
return data, raw
def get_tests_by_area_pdf(file, page, data, raw, page_num=None):
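    """Parse per-health-area positives/tests numbers from a pdf page, working
    around numbers that get joined together during text extraction, and merge
    them into `data` and `raw` as in the pptx version."""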
if not any_in(page, "เขตสุขภาพ", "เขตสุขภำพ"):
return data, raw
elif any_in(page, "เริ่มเปิดบริการ", "90%"):
return data, raw
start, end = find_date_range(page)
if start is None:
return data, raw
# Can't parse '35_21_12_2020_COVID19_(ถึง_18_ธันวาคม_2563)(powerpoint).pptx' because data is a graph
# no pdf available so data missing
# Also missing 14-20 Nov 2020 (no pptx or pdf)
page = page.replace(
"349585", "349 585").replace(
"4869151.1", "48691 51.1").replace(
"6993173.8", "69931 73.8").replace(
"988114.3", "9881 14.3").replace(
"2061119828", "2061 119828").replace(
"9881 14.3", "98811 4.3").replace(
"2061 119828", "20611 19828").replace(
"445270", "445 270").replace(
"237193", "237 193"
)
page = page if "2023.03.18" not in file else page.replace("114157", "114 157")
page = page if "2023.03.11" not in file else page.replace("88108", "88 108")
page = page if "2023.02.25" not in file else page.replace("983.7 6.4", "98 12661 3.7 6.4")
# Missing district 13 number in 2023.01.28. #TODO. Number is the % plot, 2 pages later
page = page if "2023.01.28" not in file else page.replace("2107.7", "210 12966 7.7")
# First line can be like จดัท ำโดย เพญ็พชิชำ ถำวงศ ์กรมวิทยำศำสตณก์ำรแพทย์ วันที่ท ำรำยงำน 15/02/2564 เวลำ 09.30 น.
first, rest = page.split("\n", 1)
page = (
rest if "เพญ็พชิชำ" in first or "/" in first else page
) # get rid of first line that sometimes as date and time in it
numbers, _ = get_next_numbers(page, "", debug=True, ints=False) # "ภาคเอกชน",
# ภาครัฐ
# ภาคเอกชน
# จดัท ำโดย เพญ็พชิชำ ถำวงศ ์กรมวิทยำศำสตณก์ำรแพทย์
# print(numbers)
# TODO: should really find and parse X axis labels which contains 'เขต' and count
tests_start = 13 if "total" not in page else 14
pos = list(map(int, numbers[0:13]))
# assert all([p < 500000 for p in pos])
tests = numbers[tests_start:tests_start + 13]
assert tests == list(map(int, tests)) # last number sometimes is joined to next %
tests = list(map(int, tests))
pos_rate = numbers[tests_start + 13: tests_start + 26]
if start > d("2020-12-05"):
assert all([r <= 100 for r in pos_rate])
calc_pos_rates = [round(p / t * 100, 1) if t else 0 for p, t in zip(pos, tests)]
assert pos_rate == calc_pos_rates # double check we got right values
row = pos + tests + [sum(pos), sum(tests)]
results = spread_date_range(start, end, row, ["Date"] + POS_COLS + TEST_COLS + ["Pos Area", "Tests Area"])
data = data.combine_first(results)
raw = raw.combine_first(pd.DataFrame(
[[start, end, ] + pos + tests],
columns=["Start", "End", ] + POS_COLS + TEST_COLS
).set_index("Start"))
logger.info("Tests by Area {} - {} {}", start.date(), end.date(), file)
return data, raw
def get_tests_private_public_pptx(file, title, series, data):
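    """Parse the public/private total-testing charts at the end of the pptx
    into daily Tests and % Detection columns, deriving Pos from them."""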
start, end = find_date_range(title)
if start is None:
return data
elif "เริ่มเปิดบริการ" not in title and any_in(title, "เขตสุขภาพ", "เขตสุขภำพ"):
# It's a by area chart
return data
elif not ("และอัตราการตรวจพบ" in title and "รายสัปดาห์" not in title and "จำนวนตรวจ" in series):
return data
# The graphs at the end with all testing numbers private vs public
private = " Private" if "ภาคเอกชน" in title else ""
# pos = series["Pos"]
tests = series["จำนวนตรวจ"]
positivity = series["% Detection"]
dates = list(daterange(start, end, 1))
df = pd.DataFrame(
{
"Date": dates,
f"Tests{private}": tests,
f"% Detection{private}": positivity,
}
).set_index("Date")
df[f"Pos{private}"] = (
df[f"Tests{private}"] * df[f"% Detection{private}"] / 100.0
)
logger.info("Tests {} {} - {} {}", private, start.date(), end.date(), file)
return data.combine_first(df)
def get_test_reports():
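    """Combine tests-by-area numbers from both pptx and pdf reports (each has
    gaps) plus public/private testing totals, export the raw CSVs and return
    the merged per-date DataFrame."""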
data = pd.DataFrame()
raw = import_csv("tests_by_area", ["Start"], not USE_CACHE_DATA, date_cols=["Start", "End"])
pubpriv = import_csv("tests_pubpriv", ["Date"], not USE_CACHE_DATA)
# Also need pdf copies because of missing pptx
raw_pdf = pd.DataFrame()
for file, dl in get_test_files(ext=".pdf"):
dl()
pages = parse_file(file, html=False, paged=True)
for page_num, page in enumerate(pages, start=1):
data, raw_pdf = get_tests_by_area_pdf(file, page, data, raw_pdf, page_num)
# pptx less prone to scraping issues so should be trusted more
raw_ppt = pd.DataFrame()
for file, dl in get_test_files(ext=".pptx"):
dl()
for chart, title, series, pagenum in pptx2chartdata(file):
data, raw_ppt = get_tests_by_area_chart_pptx(file, title, series, data, raw_ppt)
if not all_in(pubpriv.columns, 'Tests', 'Tests Private'):
                # The latest file has all the data we need
pubpriv = get_tests_private_public_pptx(file, title, series, pubpriv)
assert not data.empty
# TODO: assert for pubpriv too. but disappeared after certain date
export(raw.combine_first(raw_ppt).combine_first(raw_pdf), "tests_by_area")
pubpriv['Pos Public'] = pubpriv['Pos'] - pubpriv['Pos Private']
pubpriv['Tests Public'] = pubpriv['Tests'] - pubpriv['Tests Private']
export(pubpriv, "tests_pubpriv")
data = data.combine_first(pubpriv)
return data
def get_variants_by_area_pdf(file, page, page_num):
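    """Parse the variant frequency-distribution table (per health area, current
    week only) from a variant report page into an End-indexed DataFrame;
    returns an empty DataFrame if the page has no such table."""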
if "frequency distribution" not in page:
return pd.DataFrame()
df = camelot_cache(file, page_num + 1, process_background=False)
if len(df.columns) == 13:
totals = df[[2, 5, 8, 11]] # only want this week not whole year
elif len(df.columns) == 16: # 2022-06-24 switched to inc BA4/5
totals = df[[1, 2, 3, 5, 8, 11, 14]]
elif len(df.columns) == 19: # 2022-08-18 add BA.2.75
# TODO: inc BA.2.75/BA2.76
totals = df[[1, 2, 3, 5, 8, 11, 14, 17]]
# TODO: get totals from previous week also since report is sometimes missing
else:
assert False, "Unknown Area Variant table"
cols = [v.replace("\n", "").replace("Potentially ", "") for v in df.iloc[1] if v]
totals.columns = cols
assert len(df) == 17
totals = totals.iloc[3:16]
totals = totals.replace(r"([0-9]*) \+ \([0-9]\)", r"\1", regex=True).apply(pd.to_numeric)
totals["Health Area"] = range(1, 14)
# start, end = find_date_range(page) whole year
# start, end = find_date_range(df.iloc[2][2]) # e.g. '26 FEB – 04 \nMAR 22'
date_range = list(df.iloc[2])[-2] # Last is total, 2nd last is this date range
start_txt, end_txt = re.split("(?:–|-)", date_range)
end = pd.to_datetime(end_txt)
start = pd.to_datetime(f"{start_txt} {end.strftime('%B')} {end.year}", dayfirst=True, errors="coerce")
if pd.isnull(start):
# Start includes the month
start = pd.to_datetime(f"{start_txt} {end.year}", dayfirst=True, errors="coerce")
elif "20230106" in file:
# Someone put in the wrong dates
start, end = d("2022-12-31"), d("2023-01-06")
assert not pd.isnull(start)
totals["Start"] = start
totals["End"] = end
if "BA.2 (Omicron)" in totals.columns:
# HACK: Cols are totals not daily. Hack since they are likely 0
        # TODO: return these separately and work out diffs in case any were added
totals[["B.1.1.7 (Alpha)", "B.1.351 (Beta)", "B.1.617.2 (Delta)"]] = 0
logger.info("Variants by PCR: {} - {} {}", totals['Start'].max(), totals['End'].max(), file)
return totals.set_index("End")
def get_variants_plot_pdf(file, page, page_num):
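    """Extract the Bangkok rows of the national prevalence variant plot from
    the raw page text, since camelot can't parse these tables."""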
if "National prevalence" not in page:
# and "variants surveillance" not in page or "N =" in page:
return pd.DataFrame()
# national = camelot_cache(file, page_num + 1, process_background=False, table=2)
# none of the other tables work with camelot
def splitline(line):
v, line = line.split(")")
return [v + ")"] + get_next_numbers(line, return_rest=False)
rows = [splitline(line) for line in page.split("\n") if line.startswith("B.1.")]
bangkok = pd.DataFrame(rows[4:8]).transpose()
bangkok.columns = bangkok.iloc[0]
bangkok = bangkok.iloc[1:]
return bangkok
def get_variant_api(country="THA", other_threshold=0.03, nday_threshold=5, ndays=600):
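    """Fetch lineage prevalence counts for `country` from the outbreak.info
    genomics API, scraping a bearer token out of the site's javascript."""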
# TODO: should be able to replace with this api - https://outbreak-info.github.io/R-outbreak-info/
# need to apply for account
page = requests.get("https://outbreak.info/").text
    assets_file = re.search(r'index-[^.]*\.js', page).group(0)
assets = requests.get(f"https://outbreak.info/assets/{assets_file}").text
    js_file = re.search(r'genomics-[^.]*\.js', assets).group(0)
js = requests.get(f"https://outbreak.info/assets/{js_file}").text
bearer = re.search(r'Bearer ([^"]*)', js).group(0)
# sequences = requests.get("https://api.outbreak.info/genomics/sequence-count?location_id=THA", headers=dict(Authorization=bearer)).json()
url = f"https://api.outbreak.info/genomics/prevalence-by-location-all-lineages?location_id={country}&other_threshold={other_threshold}&nday_threshold={nday_threshold}&ndays={ndays}"
prev = requests.get(url, headers=dict(Authorization=bearer)).json()
prev = pd.DataFrame(prev['results'])
prev['date'] = pd.to_datetime(prev['date'])
# prev = prev.set_index(["date", "lineage"])
prev = prev.rename(columns=dict(lineage="Variant", date="End"))
prev = prev.pivot(columns="Variant", values="lineage_count", index="End")
logger.info("Variants GISAID {} - {} {}", prev.index.min(), prev.index.max(), url)
return prev
def get_variant_sequenced_table(file, pages):
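    """Parse the weekly "Prevalence of Pangolin lineages" tables from a variant
    report, converting week numbers to end dates (with per-file fixes for
    misnumbered weeks) and reconciling the detail tables against the first
    summary table to derive an "Other" count."""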
fileseq = pd.DataFrame()
first_seq_table = None
date = file2date(file)
if date >= d("2023-09-24"):
        return pd.DataFrame()  # switched to monthly. TODO
for page_num, page in enumerate(pages):
if "Prevalence of Pangolin lineages" not in page:
continue
df = camelot_cache(file, page_num + 1, process_background=False)
if "Total" in df[0].iloc[-1] or df[0].str.contains("Other").any() or df.iloc[0][1:].str.contains("(?:w|W)[0-9]+", regex=True).any():
# Vertical. TODO: probably need a better test
df = df.transpose()
# clean up "13 MAY 2022\nOther BA.2" , "BA.2.27\n13 MAY 2022"
df.iloc[0] = df.iloc[0].str.replace(r" \(.*\)", "", regex=True)
df.iloc[0] = df.iloc[0].str.replace(r"(.*2022\n|\n.*2022)", r"", regex=True)
# Some columns get combined. e.g.
df.iloc[0] = [c for c in sum(df.iloc[0].str.replace(
" Other ", "| Other ").str.replace(" w", "| w").str.split("|").tolist(), []) if c]
if "20220715" in file and df.iloc[0, 3] == "Other BA.2":
# Other BA.2 is there twice. One is wrong
df.iloc[0, 3] = "Other BA.2.9"
df.columns = df.iloc[0]
df = df.rename(columns=dict(Lineage="Week"))
df = df.iloc[1:]
# Convert week number to a date
# Need to handle when two weeks have ended up in one cell
weeks = df["Week"].astype(str).str.replace("w", "").str.replace(
"W", "").str.split(" ", expand=True).stack().reset_index()[0]
# 164 = 2023-03-03?, 153 = 2022-12-02
if "20220610" in file and page_num == 11:
weeks = weeks.replace("126", "127").replace("125", "126")
elif "20230303" in file:
weeks = weeks.replace("164", "165").replace("163", "164")
elif "20230421" in file:
weeks = weeks.replace("173", "172")
elif "20230512" in file:
weeks = weeks.replace("174", "175").replace("173", "174")
elif "20230602" in file:
weeks = weeks.replace("178", "177").replace("179", "178")
weeks = weeks.replace("176", "178").replace("175", "177")
elif "20230609" in file:
weeks = weeks.replace("179", "178").replace("180", "179")
elif "20230414" in file:
weeks = weeks.replace("173", "171")
df["Week"] = list(pd.to_numeric(weeks).dropna())
if date == d("2023-01-27"):
end_week_one = d("2020-01-17")
# Suspect dates are wrong so won't correct
# elif file2date(file) == d("2023-03-03"):
# end_week_one = d("2020-01-10")
elif date > d("2022-12-02"):
end_week_one = d("2020-01-03")
else:
end_week_one = d("2019-12-27")
        df['End'] = (df['Week'] * 7).apply(lambda x: pd.DateOffset(x) + end_week_one)
df = df.set_index("End")
assert df.index.max() <= date
# Double check the weeks numbers are correct
nospace = page.replace("\n", "").replace(" ", "")
if any_in(file, "20230217"):
pass
# 2023-02-10: wrong month
# 27/01/2023 03/02/2023 in 32_20230203_DMSc_Variant_V.pdf. 160, 161
else:
found = find_dates(nospace, thai=False)
            # compare the computed end date of the last week against the last date printed on the page
            deltas = [(df.index.max() - found_date).days for found_date in found[-1:]]
if not any_in(deltas, 0, -6, isstr=False) and deltas:
week_dates = [str(d.date()) for d in df.index]
found_dates = [str(d.date()) for d in found]
logger.warning(
f"Sequence table: week numbers don't match dates w{list(df['Week'])} is {week_dates} but found {found_dates}: off by {deltas} in {file}")
df = df.drop(columns=["Week"] + [c for c in df.columns if "Total" in c], errors="ignore")
# TODO: Ensure Other is always counted in rest of numbers. so far seems to
# df = df.drop(columns=[c for c in df.columns if "Other BA" in c])
df = df.apply(pd.to_numeric)
# get rid of mistake duplicate columns - 14_20220610_DMSc_Variant.pdf
df = df.loc[:, ~df.columns.duplicated()].copy()
df.columns = [c.strip("*") for c in df.columns]
# match = re.search("Thailand with (.*)sequence data", page)
# if match and match.group(1) in ["", "Omicron"]:
# # first table. Ignore others so no double counted
# # 2022-06-24 - Other=5 in first table but this is BA4/5 in next table
# # 2022-07-15 - Other=1 in first table but not represented in other tables
# # TODO: need more general method
# df = df.drop(columns=[c for c in df.columns if "Other" in c])
seq_table = df
# The first table should include all the numbers from the next more detailed tables
if first_seq_table is None and not seq_table.empty:
first_seq_table = seq_table
else:
fileseq = seq_table.combine_first(fileseq)
# HACK: Most reports have only 2 rows. BA.1 data is from older time periods. Just keep last 2 and let
# older reports include that data so we get the full row
fileseq = fileseq.iloc[-2:]
# Check sub tables add up to the initial table. If not these are our "others"
exceptions = ["Other BA.5"] if "20220715" in file else []
fileseq = fileseq.drop(columns=[c for c in fileseq.columns if "Other" in c and c not in exceptions])
if first_seq_table is None:
return fileseq
others = first_seq_table.sum(axis=1) - fileseq.sum(axis=1)
if any_in(file, "20220715", "20220610"):
# Other doesn't add up. 2 is a mistake?
# 2022-05-07 is 1 out in 20220610
pass
elif any_in(file, "20220708", "20220701", "20220627"):
# BA4/5 are the "Other" in the first table but counted later on
pass
elif any_in(file, '20220916', '20221021', '20221125'):
# 20220916: "Other B" doesn't seem to appear in any later tables?
# 20221021: TODO: seems like mix between two sets of weeks? 142/143, 143/144
# 20221125: BQ.X should be combined with Other to make 7?
pass
elif any_in(file, '20230106'):
# XBB is counted as other in first table but not in 2nd. Deal with XBB later if needed?
pass
else:
latest_week = first_seq_table.index.max()
assert others.sum() == 0 or (first_seq_table['Other'][latest_week] == others[latest_week]).all()
fileseq['Other'] = others
logger.info("Variants by Seq {} - {} {}", fileseq.index.min(), fileseq.index.max(), file)
return fileseq
def get_variant_reports():
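    """Parse national, per-health-area and sequenced variant numbers from the
    DMSc variant xlsx/pdf reports, export variants*.csv and return the
    national prevalence DataFrame."""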
# TODO: historical variant data 2021 is in https://tncn.dmsc.moph.go.th/
sequenced = pd.DataFrame()
raw = import_csv("variants", ["Start"], not USE_CACHE_DATA, date_cols=["Start", "End"])
area = import_csv("variants_by_area", ["Start"], not USE_CACHE_DATA, date_cols=["Start", "End"])
# Get national numbers. Also gives us date ranges
for file, dl in get_variant_files(ext=".xlsx"):
file = dl()
nat = pd.read_excel(file, sheet_name="National prevalence")
nat.iloc[0, 0] = "End"
nat.columns = list(nat.iloc[0])
nat = nat.iloc[1:-1] # Get rid of header and totals at the bottom
# nat = nat[[c for c in nat.columns if not pd.isna(c)]] # get rid of empty cols
dates = nat["End"].str.split("-", expand=True)
dates[0] = dates[0].str.replace("66", "23")
dates[1] = dates[1].str.replace("66", "23")
        if len(dates.columns) == 1:
            # Dealing with weeks since pandemic start.
            # NOTE: the converted week numbers are currently discarded; "End" is only set in the else branch
            pd.to_numeric(dates[0].str.replace("wk", ""), errors="coerce")
else:
# e.g. 21JAN-27JAN2023
a = pd.to_datetime(dates[1], errors="coerce", format="%d %b") + pd.offsets.DateOffset(years=121)
b = pd.to_datetime(dates[1], errors="coerce", format="%d%b") + pd.offsets.DateOffset(years=121)
c = pd.to_datetime(dates[1], errors="coerce")
ends = a.combine_first(b).combine_first(c)
nat["End"] = ends
nat = nat.set_index("End")
        # There are now 3 sets of numbers; pick the last set
nat = nat.iloc[:, 8:12]
nat = nat.rename(columns={"B.1617.2 (Delta)": "B.1.617.2 (Delta)", "B.1.1.529 (Omicron": "B.1.1.529 (Omicron)"})
break
for file, dl in get_variant_files(ext=".pdf"):
dl()
pages = parse_file(file, html=False, paged=True)
# page 1 title
# page 2 people + sample sizes
# page 3 table year + week per variant (4) per district
# page 4 pie charts national + bangkok + regional
# page 5 area chart: weekly national, bangkok, regional
# page 6 samples submitted GSAID: weekly
for page_num, page in enumerate(pages):
bangkok = get_variants_plot_pdf(file, page, page_num)
if not bangkok.empty:
                # dates from the pdf are too hard to parse, so assume they are the same as the xlsx
# TODO: date ranges don't line up so can't do this
bangkok.index = nat.index[:len(bangkok.index)]
area = area.combine_first(get_variants_by_area_pdf(file, page, page_num))
fileseq = get_variant_sequenced_table(file, pages)
if fileseq.empty:
continue
# Later files can update prev reports
sequenced = pd.concat([sequenced, fileseq]).reset_index().drop_duplicates(subset="End").set_index("End").sort_index()
    # TODO: variants_by_area is now totals but we could convert it to weekly, especially if we get the seed totals in the earliest report
# area_grouped = area.groupby(["Start", "End"]).sum()
export(nat, "variants")
export(area, "variants_by_area")
export(sequenced, "variants_sequenced")
return nat
if __name__ == '__main__':
prev = get_variant_api()
variants = get_variant_reports()
df_daily = get_tests_by_day()
df = get_test_reports()
test_prov = get_tests_per_province()
old = import_csv("combined", index=["Date"])
df = old.combine_first(df).combine_first(df_daily)
import covid_plot_tests
covid_plot_tests.save_test_area_plots(df)
covid_plot_tests.save_tests_plots(df)
covid_plot_tests.save_variant_plots(df)