-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathmodule_inspect_data.py
136 lines (98 loc) · 5.11 KB
/
module_inspect_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
### MODULE_INSPECT_DATA.PY
### This is the inspect data module. In here you can find several functions:
### - Functions to inspect several options
### - A function that includes the program for inspecting the data
#%% Necessary modules and packages for this module
import module_settings_airlines_airports as setair
import matplotlib.pyplot as plt
#%% functions for inspecting several options
# function to inspect the columns in the chosen dataframe and list them
def inspect_columns(specific_df):
    """Print the column names of ``specific_df`` as a plain Python list.

    Args:
        specific_df: Dataframe whose ``columns`` attribute is listed.
    """
    heading = '\nVariables in the dataframe: \n'
    print(heading)
    # Convert the column index to a list so it prints on one compact line.
    column_names = specific_df.columns.tolist()
    print(column_names)
# function to inspect the number of unique values per column of the chosen dataframe and print the result
def inspect_unique_values(specific_df):
    """Print, for every column of ``specific_df``, how many distinct values it holds.

    One ``<column> = <count>`` line is printed per column, where the count is
    the length of that column's ``unique()`` result.

    Args:
        specific_df: Dataframe whose columns are summarised.
    """
    print('\nUnique values per variable \n')
    for name in specific_df.columns:
        distinct_values = specific_df[name].unique()
        print(f'{name} = {len(distinct_values)}')
# function to visualise the 10 source airports with the most flight routes
def inspect_most_flights_airports(df, top_n=10):
    """Show a bar chart of the most frequent values in ``df['source airport']``.

    The counting table is built with explicit column names instead of relying
    on the default output of ``value_counts().reset_index()``: those defaults
    changed in pandas 2.0 (``index``/``source airport`` became
    ``source airport``/``count``), which made the previous hard-coded
    ``x="index"`` lookup fail on newer pandas versions.

    Args:
        df: Dataframe containing a ``'source airport'`` column.
        top_n: Number of most frequent airports to plot (default 10).
    """
    # value_counts() sorts by frequency (descending), so head() keeps the top ones.
    route_counts = df['source airport'].value_counts().head(top_n)
    # Name both the index and the value column ourselves so the plot call
    # below works identically on every pandas version.
    table = route_counts.rename_axis('source airport').reset_index(name='flight routes')
    table.plot.bar(x='source airport', y='flight routes', legend=False)
    plt.ylabel("flight routes")
    plt.show()
# function to create bar plot from table
def barplot_from_df(table, x, y, ylabel=None):
    """Draw a legend-free bar chart from ``table`` and display it.

    Args:
        table: Dataframe holding the data to plot.
        x: Name of the column used for the bar labels.
        y: Name of the column used for the bar heights.
        ylabel: Text passed to ``plt.ylabel`` for the y-axis (default ``None``).
    """
    # Keyword form of the same plot call: x and y select the columns.
    table.plot.bar(x=x, y=y, legend=False)
    plt.ylabel(ylabel)
    plt.show()
#%% METAFUNCTION FOR INSPECTING DATA
# function with program to ask user to select the option they want
def inspect_data(routes, airports, merged):
inspect_option = input("""What do you want to do?
1\tInspect the variables of a used dataframe
2\tInspect unique values of a used dataframe
3\tShow biggest airports/airlines/countries
enter answer (1/2/3): """)
if inspect_option == '1':
columns_dataframe = input("""Which dataframe?
1\tThe routes dataframe
2\tThe airports dataframe
3\tThe merged dataframe
enter answer (1/2/3): """)
# depending on answer, select right dataframe and use inspect_columns function
if columns_dataframe == '1':
inspect_columns(routes)
elif columns_dataframe == '2':
inspect_columns(airports)
elif columns_dataframe == '3':
inspect_columns(merged)
else:
print('\nSorry, this is not an option, we will return to the main program')
elif inspect_option == '2':
unique_dataframe = input("""Which dataframe?
1\tThe routes dataframe
2\tThe airports dataframe
3\tThe merged dataframe
enter answer (1/2/3): """)
# depending on answer, select right dataframe and use inspect_unique_values function
if unique_dataframe == '1':
inspect_unique_values(routes)
elif unique_dataframe == '2':
inspect_unique_values(airports)
elif unique_dataframe == '3':
inspect_unique_values(merged)
else:
print('\nSorry, this is not an option, we will return to the main program')
elif inspect_option == '3':
extra_options = input("""What would you want to do?
1\tShow in which countries most airports are located
2\tShow 10 biggest airports based on number of incoming flights
3\tShow 10 biggest airports based on degree (most connected)
4\tShow 10 biggest airlines
enter answer (1/2/3/4): """)
if extra_options == '1':
print('\nThe 10 countries with most airports: \n')
# create plot pie of the 10 countries with the most airports
merged['airport country'].value_counts()[0:10].plot.pie()
plt.show()
elif extra_options == '2':
print('\nThe 10 biggest airports based on number of incoming flights: \n')
# use inspect_most_flights_airports function with merged file
inspect_most_flights_airports(merged)
elif extra_options == '3':
print('\nThe 10 biggest airports based on degree (most connected): \n')
# create table with top 10 airports with function in compare module
hub_table = setair.find_hubs_in_df(merged, 10)
# then barplot this top 10 with labels
barplot_from_df(hub_table, x="airport" , y="degree", ylabel="flight routes")
elif extra_options == '4':
print('\nThe 10 biggest airlines: \n')
# create table with top 10 airlines with function in compare module
df_table_airlines = setair.airline_table_name(merged)[:10]
# use barplot function from compare module to make bar plot
barplot_from_df(df_table_airlines, x="name airline" , y="flight_routes_nr" , ylabel="flight routes")
else:
print('\nSorry, this is not an option, we will return to the main program')
else:
print('\nSorry, this is not an option, we will return to the main program')