-
Notifications
You must be signed in to change notification settings - Fork 0
/
app_explore_df.py
120 lines (96 loc) · 3.64 KB
/
app_explore_df.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import pandas as pd
import streamlit as st
from pandas.api.types import (
is_categorical_dtype,
is_datetime64_any_dtype,
is_numeric_dtype,
is_object_dtype,
)
import glob
import polars as pl
from utils import describe_numeric
# Page-level settings: use the wide layout and set the page title.
st.set_page_config(
    page_title="Explore Table",
    layout="wide",
)
def filter_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Render filter widgets above a dataframe and return the filtered rows.

    Nothing is filtered unless the viewer ticks the "Add filters" checkbox.
    When it is ticked, the viewer chooses columns to filter on and gets one
    widget per chosen column, picked from the column's contents:

    * categorical dtype, or fewer than 10 unique values -> multiselect
    * numeric -> range slider
    * datetime -> date-range picker
    * anything else -> substring / regex text box

    Filters are applied cumulatively, in the order the columns were chosen.

    Args:
        df (pd.DataFrame): Original dataframe. It is not modified; filtering
            works on a copy.

    Returns:
        pd.DataFrame: ``df`` itself when filtering is off, otherwise the
        filtered copy (object columns that parse as datetimes are converted,
        and datetime columns are made timezone-naive).
    """
    import re  # local import: only needed to catch malformed user regexes

    modify = st.checkbox("Add filters")
    if not modify:
        return df

    # Work on a copy so the caller's dataframe is never mutated.
    df = df.copy()

    # Try to convert datetimes into a standard format (datetime, no timezone)
    for col in df.columns:
        if is_object_dtype(df[col]):
            try:
                df[col] = pd.to_datetime(df[col])
            except Exception:
                # Deliberate best effort: a column that does not parse as
                # datetimes is simply left as it is.
                pass

        if is_datetime64_any_dtype(df[col]):
            df[col] = df[col].dt.tz_localize(None)

    modification_container = st.container()

    with modification_container:
        to_filter_columns = st.multiselect("Filter dataframe on", df.columns)
        for column in to_filter_columns:
            # Narrow left column holds an arrow marker, wide right column the widget.
            left, right = st.columns((1, 20))
            left.write("↳")
            # Treat columns with < 10 unique values as categorical
            # (isinstance check replaces the deprecated is_categorical_dtype).
            if (
                isinstance(df[column].dtype, pd.CategoricalDtype)
                or df[column].nunique() < 10
            ):
                user_cat_input = right.multiselect(
                    f"Values for {column}",
                    df[column].unique(),
                    default=list(df[column].unique()),
                )
                df = df[df[column].isin(user_cat_input)]
            elif is_numeric_dtype(df[column]):
                _min = float(df[column].min())
                _max = float(df[column].max())
                # Slider resolution: 100 steps across the column's range.
                step = (_max - _min) / 100
                user_num_input = right.slider(
                    f"Values for {column}",
                    _min,
                    _max,
                    (_min, _max),
                    step=step,
                )
                df = df[df[column].between(*user_num_input)]
            elif is_datetime64_any_dtype(df[column]):
                user_date_input = right.date_input(
                    f"Values for {column}",
                    value=(
                        df[column].min(),
                        df[column].max(),
                    ),
                )
                # Only filter once both ends of the range are available.
                if len(user_date_input) == 2:
                    start_date, end_date = map(pd.to_datetime, user_date_input)
                    df = df.loc[df[column].between(start_date, end_date)]
            else:
                user_text_input = right.text_input(
                    f"Substring or regex in {column}",
                )
                if user_text_input:
                    try:
                        # na=False: missing values count as "no match" rather
                        # than putting NaN into the mask, which boolean
                        # indexing rejects.
                        matches = df[column].str.contains(user_text_input, na=False)
                    except re.error as err:
                        # A malformed pattern should not take the page down;
                        # report it and leave this column unfiltered.
                        right.warning(f"Invalid regular expression: {err}")
                    else:
                        df = df[matches]

    return df
# Three tabs: enter the path, browse/filter the table, view the numeric summary.
tab0, tab1, tab2 = st.tabs(['File', 'Table', 'Summary'])

with tab0:
    file_path_txt = st.text_input("File path:")

# Stays None until a table has actually been loaded, so the later tabs can
# tell whether there is anything to show.
df = None

with tab1:
    if file_path_txt:
        if file_path_txt.endswith('.csv'):
            # Path names a single CSV file.
            df = pd.read_csv(file_path_txt)
        else:
            # Otherwise treat the path as a directory and stack every CSV
            # directly inside it. Sorted so the row order is deterministic
            # (glob returns files in arbitrary order).
            files_ = sorted(glob.glob(f'{file_path_txt}/*.csv'))
            if files_:
                df = pl.concat([pl.read_csv(f) for f in files_]).to_pandas()
            else:
                # pl.concat cannot concatenate an empty list; tell the viewer
                # instead of crashing the page.
                st.warning(f"No .csv files found in {file_path_txt}")

        if df is not None:
            # When both sMAPE columns are present, add their difference
            # (validation minus test) as an extra column.
            if 'smape_avg_val' in df.columns and 'smape_avg_test' in df.columns:
                df['diff_smape_val_test'] = df['smape_avg_val'] - df['smape_avg_test']
            st.dataframe(filter_dataframe(df))

with tab2:
    if df is not None:
        # describe_numeric is imported from the project's utils module.
        st.table(describe_numeric(df))
# streamlit run app_explore_df.py --server.port 8001