# 1. Data Prep.py
import pandas as pd
import geopandas as gpd
# Read CDC lung disease data
lung_disease_df = pd.read_csv('/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/CSV og data files/lung_disease_data.csv')
# Read EPA air quality data for PM2.5 and ozone
pm25_df = pd.read_csv('/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/Ozone and PM2.5 Data/combined_pm25_data.csv', low_memory=False)
ozone_df = pd.read_csv('/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/Ozone and PM2.5 Data/combined_ozone_data.csv', low_memory=False)
# Read county shapefile for spatial analysis
counties_gdf = gpd.read_file('/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/County data/county_shapefile.shp')
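# Quick sanity check on the county shapefile (a minimal sketch; actual field names
# depend on the shapefile's source, e.g. Census TIGER files use 'GEOID'/'NAME')
print(counties_gdf.columns)
print(counties_gdf.crs)
print(len(counties_gdf), "county polygons loaded")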
# Convert date column to datetime
pm25_df['Date Local'] = pd.to_datetime(pm25_df['Date Local'], format='%Y-%m-%d')
ozone_df['Date Local'] = pd.to_datetime(ozone_df['Date Local'], format='%Y-%m-%d')
# Extract year from date
pm25_df['year'] = pm25_df['Date Local'].dt.year
ozone_df['year'] = ozone_df['Date Local'].dt.year
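# Sanity check: the strict format above would have raised on malformed dates, but if the
# raw EPA exports ever mix formats, parsing with errors='coerce' (an optional variation,
# not used here) would turn them into NaT; counting missing values confirms a clean parse
print("Unparsed PM2.5 dates:", pm25_df['Date Local'].isna().sum())
print("Unparsed ozone dates:", ozone_df['Date Local'].isna().sum())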
# Rename 'Arithmetic Mean' to 'PM2.5'
pm25_df.rename(columns={'Arithmetic Mean': 'PM2.5'}, inplace=True)
# Verify the DataFrame
print(pm25_df.head())
print(pm25_df.columns)
# Rename 'Arithmetic Mean' to 'Ozone'
ozone_df.rename(columns={'Arithmetic Mean': 'Ozone'}, inplace=True)
# Verify the DataFrame
print(ozone_df.head())
print(ozone_df.columns)
# Ensure 'County Name' and 'year' exist
print(pm25_df[['County Name', 'year']].head())
print(ozone_df[['County Name', 'year']].head())
# Group by county and year to calculate annual averages
pm25_annual = pm25_df.groupby(['County Name', 'year'])['PM2.5'].mean().reset_index()
ozone_annual = ozone_df.groupby(['County Name', 'year'])['Ozone'].mean().reset_index()
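# Optional: keep a sample count alongside the mean (a sketch, not required by the merge
# below; 'PM2.5 n_obs'/'Ozone n_obs' are illustrative names) so sparse county-years can
# be flagged later
pm25_counts = pm25_df.groupby(['County Name', 'year'])['PM2.5'].count().reset_index(name='PM2.5 n_obs')
ozone_counts = ozone_df.groupby(['County Name', 'year'])['Ozone'].count().reset_index(name='Ozone n_obs')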
# Get latitude and longitude for each county-year pair
lat_lon_pm25 = pm25_df.groupby(['County Name', 'year']).agg({
'Latitude': 'first',
'Longitude': 'first'
}).reset_index()
lat_lon_ozone = ozone_df.groupby(['County Name', 'year']).agg({
'Latitude': 'first',
'Longitude': 'first'
}).reset_index()
# Merge latitude and longitude with PM2.5 and Ozone averages
pm25_annual = pd.merge(pm25_annual, lat_lon_pm25, on=['County Name', 'year'])
ozone_annual = pd.merge(ozone_annual, lat_lon_ozone, on=['County Name', 'year'])
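# Note: 'first' takes the coordinates of whichever monitor appears first in each
# county-year group. If a county has several monitors, averaging their coordinates is
# one alternative; this sketch computes it into separate variables
# (lat_lon_pm25_mean / lat_lon_ozone_mean are illustrative names, not used downstream)
lat_lon_pm25_mean = pm25_df.groupby(['County Name', 'year'])[['Latitude', 'Longitude']].mean().reset_index()
lat_lon_ozone_mean = ozone_df.groupby(['County Name', 'year'])[['Latitude', 'Longitude']].mean().reset_index()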
# This worked, but the subsequent merge produced an error (described below)
# Problems to fix: the lung disease CSV's first row is a title ('Interstitial lung...'),
# not the column names, and it labels counties 'Location' rather than 'County Name'
# Reload the lung disease DataFrame with the correct header row
lung_disease_df = pd.read_csv('/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/CSV og data files/lung_disease_data.csv', header=1)
# Problem: lung disease data does not have 'County Name'
# Rename columns to be consistent
lung_disease_df.rename(columns={'Location': 'County Name'}, inplace=True)
# Verify the columns, all should have 'County Name' and 'year'
print("PM2.5 DataFrame columns after renaming:")
print(pm25_df.columns)
print("Ozone DataFrame columns after renaming:")
print(ozone_df.columns)
print("lung_disease DataFrame columns after renaming:")
print(lung_disease_df.columns)
# Remaining problem: there is no 'year' column; the years are embedded in
# column names such as 'Mortality Rate, 1980*' and 'Mortality Rate, 1985*'
# Melt the DataFrame
# Use the melt function to transform the DataFrame from wide to long format. This will convert the year-specific columns into rows.
lung_disease_long = lung_disease_df.melt(
id_vars=['County Name', 'FIPS', '% Change in Mortality Rate, 1980-2014'],
var_name='year',
value_name='Mortality Rate'
)
# Extract the year
# Pull the four-digit year out of the 'year' column with a regex and convert it to an integer
lung_disease_long['year'] = lung_disease_long['year'].str.extract(r'(\d{4})').astype(int)
# r'(\d{4})' is a regex pattern that matches and captures a four-digit year.
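# Quick verification (a sketch): the extracted years should span the CDC study period
# referenced by the '% Change in Mortality Rate, 1980-2014' column
print("Extracted years:", sorted(lung_disease_long['year'].unique()))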
# Test again: print the long (reshaped) DataFrame to confirm the transformation
print(lung_disease_long)
# Ensure that the year column exists in all DataFrames used in the merge operation.
print("Lung Disease DataFrame columns:")
print(lung_disease_long.columns)
print("PM2.5 Annual DataFrame columns:")
print(pm25_annual.columns)
print("Ozone Annual DataFrame columns:")
print(ozone_annual.columns)
# Convert year columns to integer type
lung_disease_long['year'] = lung_disease_long['year'].astype(int)
pm25_annual['year'] = pm25_annual['year'].astype(int)
ozone_annual['year'] = ozone_annual['year'].astype(int)
# A first merge attempt produced an empty merged_df
# Problem: the lung disease data has no latitude/longitude like the PM2.5 and ozone data,
# and the county names are formatted differently across the datasets
# Check unique values in the County Name and year columns
print(lung_disease_long[['County Name', 'year']].drop_duplicates().head())
print(pm25_annual[['County Name', 'year']].drop_duplicates().head())
print(ozone_annual[['County Name', 'year']].drop_duplicates().head())
# Define a function to standardize county names
def clean_county_name(name):
return name.strip().lower()
# Apply the function to each DataFrame
lung_disease_long['County Name'] = lung_disease_long['County Name'].apply(clean_county_name)
pm25_annual['County Name'] = pm25_annual['County Name'].apply(clean_county_name)
ozone_annual['County Name'] = ozone_annual['County Name'].apply(clean_county_name)
# Remove the ' county' suffix from the County Name in the lung disease dataset
# (names are already lowercased above, so a plain string replace is enough)
lung_disease_long['County Name'] = lung_disease_long['County Name'].str.replace(' county', '', regex=False)
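# The CDC names can also end in 'parish' (Louisiana) or 'borough'/'census area' (Alaska);
# a broader cleanup is sketched here in case those counties matter for the analysis
# (assumption: the EPA 'County Name' values omit these suffixes as well)
lung_disease_long['County Name'] = lung_disease_long['County Name'].str.replace(
    r'\s+(parish|borough|census area)$', '', regex=True)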
# Assuming the state and county columns are named 'State' and 'County Name'
# Adjust column names if they are different in your datasets
# Remove any extra spaces and clean up
pm25_annual['County Name'] = pm25_annual['County Name'].str.strip()
ozone_annual['County Name'] = ozone_annual['County Name'].str.strip()
# Verify the new format
print(pm25_annual[['County Name', 'year']].drop_duplicates().sort_values(['County Name', 'year']))
print(ozone_annual[['County Name', 'year']].drop_duplicates().sort_values(['County Name', 'year']))
print(pm25_annual.head())
print(ozone_annual.head())
# Convert county names to lowercase to standardize
pm25_annual['County Name'] = pm25_annual['County Name'].str.lower()
ozone_annual['County Name'] = ozone_annual['County Name'].str.lower()
lung_disease_long['County Name'] = lung_disease_long['County Name'].str.lower()
# Find common counties
common_counties = set(pm25_annual['County Name']).intersection(set(ozone_annual['County Name']))
# Filter DataFrames by common counties
pm25_common = pm25_annual[pm25_annual['County Name'].isin(common_counties)]
ozone_common = ozone_annual[ozone_annual['County Name'].isin(common_counties)]
lung_disease_common = lung_disease_long[lung_disease_long['County Name'].isin(common_counties)]
# Print results
print("PM2.5 DataFrame with common counties:\n", pm25_common)
print("Ozone DataFrame with common counties:\n", ozone_common)
print("Lung Disease DataFrame with common counties:\n", ozone_common)
print(f"Number of common counties: {len(common_counties)}")
# Fixed county problem
# Now I need to find common years
# Find common years across the datasets
common_years = set(pm25_common['year']).intersection(set(ozone_common['year'])).intersection(set(lung_disease_common['year']))
# Filter DataFrames by common years
pm25_common = pm25_common[pm25_common['year'].isin(common_years)]
ozone_common = ozone_common[ozone_common['year'].isin(common_years)]
lung_disease_common = lung_disease_common[lung_disease_common['year'].isin(common_years)]
# Verify the common counties and years
print(f"Number of common counties: {len(common_counties)}")
print(f"Number of common years: {len(common_years)}")
# Check unique years in each DataFrame
print("Unique years in PM2.5 data:", pm25_common['year'].unique())
print("Unique years in Ozone data:", ozone_common['year'].unique())
print("Unique years in Lung Disease data:", lung_disease_common['year'].unique())
print("Number of rows in PM2.5 data before filtering:", len(pm25_annual))
print("Number of rows in Ozone data before filtering:", len(ozone_annual))
print("Number of rows in Lung Disease data before filtering:", len(lung_disease_long))
# Find common counties again and print them
common_counties = set(pm25_annual['County Name']).intersection(set(ozone_annual['County Name']))
print(f"Number of common counties: {len(common_counties)}")
print(f"Common counties: {common_counties}")
# Filter DataFrames by common counties and print their lengths
pm25_common = pm25_annual[pm25_annual['County Name'].isin(common_counties)]
ozone_common = ozone_annual[ozone_annual['County Name'].isin(common_counties)]
lung_disease_common = lung_disease_long[lung_disease_long['County Name'].isin(common_counties)]
print("Number of rows in PM2.5 data after filtering:", len(pm25_common))
print("Number of rows in Ozone data after filtering:", len(ozone_common))
print("Number of rows in Lung Disease data after filtering:", len(lung_disease_common))
print("Unique years in PM2.5 data before filtering:", pm25_annual['year'].unique())
print("Unique years in Ozone data before filtering:", ozone_annual['year'].unique())
print("Unique years in Lung Disease data before filtering:", lung_disease_long['year'].unique())
# Now county name and year columns should be fixed
# Merge lung disease data with PM2.5 and ozone data
merged_df = pd.merge(lung_disease_long, pm25_annual, on=['County Name', 'year'])
merged_df = pd.merge(merged_df, ozone_annual, on=['County Name', 'year'])
print(merged_df.head())
merged_df.to_csv('/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/merged_df.csv', index=False)
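# Post-merge sanity check (a sketch): duplicate (county, year) pairs here usually mean
# same-named counties from different states were collapsed together upstream
dupes = merged_df.duplicated(subset=['County Name', 'year']).sum()
print("Duplicate county-year rows in merged_df:", dupes)
print("Merged rows:", len(merged_df))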
# Find rows where Latitude_x and Latitude_y differ
discrepancies = merged_df[merged_df['Latitude_x'] != merged_df['Latitude_y']]
# Display the rows with discrepancies
print(discrepancies[['County Name', 'year', 'Latitude_x', 'Latitude_y']])
# Drop the incorrect columns and rename the correct ones
merged_df = merged_df.drop(columns=['Latitude_y', 'Longitude_y'])
merged_df = merged_df.rename(columns={'Latitude_x': 'Latitude', 'Longitude_x': 'Longitude'})
# Recreate the geometry column now that the latitude and longitude columns are cleaned up
from shapely.geometry import Point
# Create the geometry column using the cleaned latitude and longitude columns
merged_df['geometry'] = merged_df.apply(lambda row: Point(row['Longitude'], row['Latitude']), axis=1)
# Convert to GeoDataFrame
gdf = gpd.GeoDataFrame(merged_df, geometry='geometry')
# Optionally, set the coordinate reference system (CRS) if you know it
# Example: gdf.set_crs(epsg=4326, inplace=True) # WGS84 CRS
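# Since Latitude/Longitude come straight from EPA monitor records, WGS84 (EPSG:4326) is a
# reasonable assumption here; if the raw EPA exports include a 'Datum' column, it is worth
# checking it before relying on this
gdf = gdf.set_crs(epsg=4326)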
# Save the GeoDataFrame to a shapefile or other format if needed
gdf.to_file('/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/merged_geodataframe.shp')
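# Shapefiles truncate field names to 10 characters, so columns like
# '% Change in Mortality Rate, 1980-2014' will be mangled; a GeoPackage keeps full
# column names (the output path below is illustrative)
gdf.to_file('/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/merged_geodataframe.gpkg', driver='GPKG')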
# Save the updated DataFrame to CSV
merged_df.to_csv('/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/merged_df.csv', index=False)