# 5. COPD Regression Analysis.py
import geopandas as gpd
import pandas as pd
import numpy as np
import libpysal as lp
from spreg import ML_Lag, ML_Error
# Load the shapefile into a GeoDataFrame
gdf1 = gpd.read_file('/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/new_merged_geodataframe.shp')
# Load the merged dataset
new_merged_df = pd.read_csv('/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/new_merged_df.csv')
# Load the county shapefile
shapefile_path = '/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/County data/county_shapefile.shp'
county_gdf = gpd.read_file(shapefile_path)
# Load the fixed_geodataframe1
fixed_geodataframe1 = gpd.read_file('/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/fixed_geodataframe1.shp')
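# Quick sanity check (optional): confirm the layers share a CRS before any
# merging; mismatched coordinate systems silently break spatial work later.
for name, layer in [('gdf1', gdf1), ('county_gdf', county_gdf),
                    ('fixed_geodataframe1', fixed_geodataframe1)]:
    print(name, layer.crs)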
print(new_merged_df)
print(fixed_geodataframe1)
# Check the geometry type of gdf1
print("Geometry type of gdf:")
print(gdf1.geometry.geom_type.unique())
# Check the geometry type of fixed_geodataframe1
print("Geometry type of fixed_geodataframe1:")
print(fixed_geodataframe1.geometry.geom_type.unique())
# Check common columns
common_columns = set(gdf1.columns) & set(fixed_geodataframe1.columns)
print("Common Columns:")
print(common_columns)
# Merge the GeoDataFrames on common columns to see differences
comparison_df = gdf1.merge(fixed_geodataframe1, on=list(common_columns), how='outer', indicator=True)
print(comparison_df.head())
# Check if there are any differences
differences = comparison_df[comparison_df['_merge'] != 'both']
print("Differences between the two GeoDataFrames:")
print(differences)
# Perform the merge operation and assign the result back to gdf1
gdf1 = gdf1.merge(fixed_geodataframe1, on=list(common_columns), how='outer', indicator=True)
# Drop the '_merge' indicator column now that it's no longer needed
gdf1 = gdf1.drop(columns=['_merge'])
# gdf1 now contains the merged data
# Convert 'FIPS' in gdf to string
gdf1['FIPS'] = gdf1['FIPS'].astype(str)
# Convert 'COUNTYFP' in county_gdf to string
county_gdf['COUNTYFP'] = county_gdf['COUNTYFP'].astype(str)
print(gdf1['FIPS'].dtype)
print(county_gdf['COUNTYFP'].dtype)
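# Hedge (an assumption about the data): if these codes were ever read as
# integers, leading zeros are lost; zero-padding restores a comparable
# format before merging, e.g.
# county_gdf['COUNTYFP'] = county_gdf['COUNTYFP'].str.zfill(3)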
# Perform the merge
new_merged_gdf = gdf1.merge(county_gdf, left_on='FIPS', right_on='COUNTYFP', how='left')
print(new_merged_gdf.head())
print(new_merged_gdf.columns)
# Merge was successful, now cleaning columns
# Rename the columns to fix any typos and remove unwanted suffixes
new_merged_gdf = new_merged_gdf.rename(columns={'County Nam': 'County Name', '% Change i': '% Change in Mortality Rate, 1980-2014', 'geometry_x': 'geometry'})
# Drop unnecessary columns (if any)
columns_to_drop = ['STATEFP', 'COUNTYFP', 'COUNTYNS', 'AFFGEOID', 'GEOID', 'NAME_y', 'LSAD', 'NAME', 'geometry_y', 'ALAND', 'AWATER']
new_merged_gdf = new_merged_gdf.drop(columns=columns_to_drop, errors='ignore')
# Convert 'Mortality' to numeric
gdf1['Mortality'] = gdf1['Mortality'].str.extract(r'(\d+\.\d+)').astype(float)
# Handle NaNs
gdf1['Mortality'] = gdf1['Mortality'].fillna(gdf1['Mortality'].mean())
# Check the result
print(gdf1.head())
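# Illustration (hypothetical value format): the regex above keeps the first
# decimal number in each string, e.g. '12.34 (10.12, 14.56)' -> '12.34'.
print(pd.Series(['12.34 (10.12, 14.56)']).str.extract(r'(\d+\.\d+)'))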
# Ensure that the columns of interest are present and correctly named
print(new_merged_gdf.head())
print(new_merged_gdf.columns)
# Check the data types of your DataFrame columns
print(gdf1[['Mortality', 'PM2.5', 'Ozone']].dtypes)
# Convert to numeric, if necessary
gdf1['Mortality'] = pd.to_numeric(gdf1['Mortality'], errors='coerce')
gdf1['PM2.5'] = pd.to_numeric(gdf1['PM2.5'], errors='coerce')
gdf1['Ozone'] = pd.to_numeric(gdf1['Ozone'], errors='coerce')
# Calculate the mean of each column
means = gdf1[['Mortality', 'PM2.5', 'Ozone']].mean()
# Replace NaN values with the column mean
gdf1[['Mortality', 'PM2.5', 'Ozone']] = gdf1[['Mortality', 'PM2.5', 'Ozone']].fillna(means)
# Optionally, ensure no infinite values are present
gdf1 = gdf1[~gdf1[['Mortality', 'PM2.5', 'Ozone']].isin([np.inf, -np.inf]).any(axis=1)]
# Re-check the cleaned columns: means, data types, NaNs, and infinite values
means = gdf1[['Mortality', 'PM2.5', 'Ozone']].mean()
print(means)
print(gdf1[['Mortality', 'PM2.5', 'Ozone']].dtypes)
print(gdf1.dtypes)
# Confirm the cleaned data
print(gdf1.head())
# Check for remaining NaN values
print(gdf1[['Mortality', 'PM2.5', 'Ozone']].isna().sum())
# Confirm that there are no infinite values
print(np.isinf(gdf1[['Mortality', 'PM2.5', 'Ozone']]).sum())
# Buffer points to create polygons
buffer_distance = 1  # in CRS units (degrees if geographic); adjust for your analysis
gdf1['geometry'] = gdf1['geometry'].buffer(buffer_distance)
# Verify that the geometry type is now polygons
print("Geometry type after buffering:")
print(gdf1.geometry.geom_type.unique())
# Recreate GeoDataFrame with new polygon geometries
gdf1 = gpd.GeoDataFrame(gdf1, geometry='geometry')
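# Note (an assumption about the CRS): if gdf1 is in a geographic CRS such as
# EPSG:4326, the buffer distance above is in degrees. A common alternative is
# to buffer in a projected, metric CRS, e.g. (sketch, assuming EPSG:5070 /
# CONUS Albers suits the study area):
# gdf1 = gdf1.to_crs('EPSG:5070')
# gdf1['geometry'] = gdf1['geometry'].buffer(10_000)  # 10 km in metres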
# Check for NaN values after buffering
print("NaN values in columns after buffering:")
print(gdf1[['Mortality', 'PM2.5', 'Ozone']].isna().sum())
# Check for infinite values after buffering
print("Infinite values in columns after buffering:")
print(np.isinf(gdf1[['Mortality', 'PM2.5', 'Ozone']]).sum())
# Fill any remaining NaNs with the column means
means = gdf1[['Mortality', 'PM2.5', 'Ozone']].mean()
gdf1[['Mortality', 'PM2.5', 'Ozone']] = gdf1[['Mortality', 'PM2.5', 'Ozone']].fillna(means)
# Check the final data types and presence of NaN/infinite values
print("Data types of columns after processing:")
print(gdf1.dtypes)
print("Number of NaN values after processing:")
print(gdf1[['Mortality', 'PM2.5', 'Ozone']].isna().sum())
print("Number of infinite values after processing:")
print(np.isinf(gdf1[['Mortality', 'PM2.5', 'Ozone']]).sum())
# Define the dependent and independent variables
# (spreg expects y as an n x 1 column vector, so keep the 2-D shape)
y = gdf1[['Mortality']].values
X = gdf1[['PM2.5', 'Ozone']].values
# Ensure that the input data is valid
print("Shape of y:", y.shape)
print("Shape of X:", X.shape)
print("First 5 rows of y:", y[:5])
print("First 5 rows of X:", X[:5])
# Recalculate the weights matrix for the buffered polygons
try:
    w = lp.weights.Queen.from_dataframe(gdf1, use_index=False)
    w.transform = 'r'  # Row-standardize the weights
    print("Row-standardized weights matrix:")
    print(w.full())  # w.full() returns the dense array plus the ordered ids
    print("Type of weights matrix:", type(w))
    print("Weights matrix W:")
    print(w)
    # Number of observations with weight entries, plus a few examples
    print("Number of observations:", len(w.weights))
    print("Sample weights (first 5 entries):")
    for key in list(w.weights.keys())[:5]:
        print(f"{key}: {w.weights[key]}")
except Exception as e:
    print("Error in creating weights matrix:", e)
# Perform the regression with the explicitly set weights matrix
try:
    # Fit the ML_Lag (spatial lag) model
    model_lag = ML_Lag(y, X, w=w)
    print("ML_Lag Model Summary:")
    print(model_lag.summary)
except Exception as e:
    print("Error in ML_Lag model:", e)
try:
    # Fit the ML_Error (spatial error) model
    model_error = ML_Error(y, X, w=w)
    print("ML_Error Model Summary:")
    print(model_error.summary)
except Exception as e:
    print("Error in ML_Error model:", e)
# Check the columns after renaming
print(new_merged_gdf.columns)
# Re-attach the geometry column (the merge suffixes can detach it) and
# ensure the CRS is set before writing
new_merged_gdf = gpd.GeoDataFrame(new_merged_gdf, geometry='geometry')
if new_merged_gdf.crs is None:
    new_merged_gdf = new_merged_gdf.set_crs('EPSG:4326')  # or the appropriate EPSG code for your data
# Write the merged GeoDataFrame to a GeoPackage
new_merged_gdf.to_file('/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/new_merged_gdf.gpkg', layer='new_merged_gdf', driver='GPKG')
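# Optional verification sketch: read the layer back to confirm the write
check_gdf = gpd.read_file('/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/new_merged_gdf.gpkg', layer='new_merged_gdf')
print(check_gdf.shape)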
from mgwr.sel_bw import Sel_BW  # Sel_BW lives in mgwr.sel_bw, not mgwr.sel
# Use polygon centroids as point coordinates for GWR
coords = list(zip(gdf1.geometry.centroid.x, gdf1.geometry.centroid.y))
y = gdf1[['Mortality']].values
X = gdf1[['PM2.5', 'Ozone']].values
# Bandwidth selection (Sel_BW takes coords, y, X; it does not accept a
# spatial weights object)
sel_bw = Sel_BW(coords, y, X)
bw = sel_bw.search()
print("Selected bandwidth:", bw)
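# A minimal follow-on sketch (assumption: the bandwidth search above
# succeeded): fit a geographically weighted regression with the selected
# bandwidth and print its summary.
from mgwr.gwr import GWR
gwr_model = GWR(coords, y, X, bw)
gwr_results = gwr_model.fit()
gwr_results.summary()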