-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathprocess.json
17 lines (17 loc) · 11.1 KB
/
process.json
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
[{
"id" : "6zmcpn",
"name" : "1. Incidence COPD data prep",
"description" : null,
"code" : "# 1. Incidence COPD data prep\n#\n# Joins state/year COPD incidence with annual mean PM2.5 and ozone, writes the\n# joined table as a point shapefile and a CSV, then fits a random forest that\n# predicts COPD incidence from pollution levels and location.\nimport pandas as pd\nimport geopandas as gpd\nfrom shapely.geometry import Point\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.ensemble import RandomForestRegressor\nfrom sklearn.metrics import mean_squared_error, r2_score\n\n# Every input and output of this script lives under this workspace folder.\nBASE_DIR = '/Users/icce_icecweam7/gw-workspace/S6wTraiideDo'\nCOPD_CSV = BASE_DIR + '/COPD/IHME_2000-2021_COPD_Incidence_DATA.csv'\nPM25_CSV = BASE_DIR + '/Ozone and PM2.5 Data/combined_pm25_data.csv'\nOZONE_CSV = BASE_DIR + '/Ozone and PM2.5 Data/combined_ozone_data.csv'\nMERGED_SHP = BASE_DIR + '/COPD/COPD_incidence_gdf.shp'\nMERGED_CSV = BASE_DIR + '/COPD/COPD_incidence_merged_df.csv'\nKEYS = ['State Name', 'year']\n\n\ndef parse_local_date(dates):\n    # Accept either date layout the earlier version of this script tried:\n    # ISO (2020-01-31) first, then month/day/two-digit year (1/31/20).\n    try:\n        return pd.to_datetime(dates, format='%Y-%m-%d')\n    except ValueError:\n        return pd.to_datetime(dates, format='%m/%d/%y')\n\n\ndef annual_state_mean(readings, pollutant):\n    # Reduce monitor readings to one row per state and year: the mean of the\n    # 'Arithmetic Mean' column (returned under the pollutant name) and the\n    # first Latitude/Longitude found in that group.\n    # NOTE(review): 'first' picks one monitor location per state-year, not a\n    # state centroid - confirm this is what the spatial step should use.\n    readings = readings.rename(columns=str.strip)\n    readings = readings.rename(columns={'Arithmetic Mean': pollutant})\n    readings['year'] = parse_local_date(readings['Date Local']).dt.year\n    return readings.groupby(KEYS).agg(\n        {pollutant: 'mean', 'Latitude': 'first', 'Longitude': 'first'}\n    ).reset_index()\n\n\n# ---- Load ----\nCOPD_incidence_df = pd.read_csv(COPD_CSV)\n# Clean the headers before merging; a stray space in 'State Name' or 'year'\n# would otherwise surface as a KeyError inside the merge.\nCOPD_incidence_df.columns = COPD_incidence_df.columns.str.strip()\n\npm25_annual = annual_state_mean(pd.read_csv(PM25_CSV, low_memory=False), 'PM2.5')\nozone_annual = annual_state_mean(pd.read_csv(OZONE_CSV, low_memory=False), 'Ozone')\n\n# Log the columns, dtypes and key pairs of each table so that a failed match\n# between the tables can be diagnosed from the output.\nfor label, frame in [('PM2.5', pm25_annual), ('Ozone', ozone_annual), ('COPD incidence', COPD_incidence_df)]:\n    print(label + ' DataFrame columns:')\n    print(frame.columns)\n    print(frame.dtypes)\n    print(frame[KEYS].drop_duplicates())\n\n# ---- Merge ----\n# Inner joins: only state/year pairs present in all three tables survive.\n# Assumes the COPD table has no Latitude/Longitude columns of its own, so the\n# suffixes below apply to the PM2.5 and ozone coordinates - TODO confirm.\nCOPD_incidence_merged_df = COPD_incidence_df.merge(pm25_annual, on=KEYS).merge(\n    ozone_annual, on=KEYS, suffixes=('_pm25', '_ozone')\n)\nif COPD_incidence_merged_df.empty:\n    raise ValueError('No State Name/year pairs are shared by the COPD, PM2.5 and ozone tables')\n\n# Report state/year pairs where the two pollutant tables disagree on location.\ndiscrepancies = COPD_incidence_merged_df[\n    COPD_incidence_merged_df['Latitude_pm25'] != COPD_incidence_merged_df['Latitude_ozone']\n]\nprint(discrepancies[KEYS + ['Latitude_pm25', 'Latitude_ozone']])\n\n# Keep the ozone coordinates as the single Latitude/Longitude pair.\nCOPD_incidence_merged_df = COPD_incidence_merged_df.drop(\n    columns=['Latitude_pm25', 'Longitude_pm25']\n).rename(columns={'Latitude_ozone': 'Latitude', 'Longitude_ozone': 'Longitude'})\n\n# ---- Save ----\nCOPD_incidence_merged_df['geometry'] = COPD_incidence_merged_df.apply(\n    lambda row: Point(row['Longitude'], row['Latitude']), axis=1\n)\n# Assumes the coordinates are WGS84 longitude/latitude - adjust the CRS if the\n# monitor data uses a different datum.\nCOPD_incidence_gdf = gpd.GeoDataFrame(COPD_incidence_merged_df, geometry='geometry', crs='EPSG:4326')\n\n# Shapefile field names are limited to 10 characters, so longer column names\n# are shortened in the .shp output; the CSV keeps the full names.\nCOPD_incidence_gdf.to_file(MERGED_SHP)\nCOPD_incidence_merged_df.to_csv(MERGED_CSV, index=False)\n\n# ---- Random forest prediction model ----\n# Reload from disk so the model sees exactly what was saved (the geometry\n# column comes back as text).\nCOPD_incidence_merged_df = pd.read_csv(MERGED_CSV)\nprint(COPD_incidence_merged_df.dtypes)\n\n# Rows whose incidence value is missing or not a number.\nincidence_as_number = pd.to_numeric(COPD_incidence_merged_df['COPD Incidence'], errors='coerce')\nprint(COPD_incidence_merged_df[incidence_as_number.isna()])\n\nprint(COPD_incidence_merged_df['State Name'].unique())\nprint(COPD_incidence_merged_df['metric_name'].unique())\n\nfor column in ['COPD Incidence', 'PM2.5', 'Ozone']:\n    COPD_incidence_merged_df[column] = pd.to_numeric(COPD_incidence_merged_df[column], errors='coerce')\nprint(COPD_incidence_merged_df.head(10))\n\n# Mean-fill the numeric columns only, and assign the result back. Calling\n# fillna(inplace=True) on a column selection only changes a temporary copy,\n# and taking the mean of the text columns raises TypeError on pandas 2.x.\nnumeric_cols = COPD_incidence_merged_df.select_dtypes(include='number').columns\nCOPD_incidence_merged_df[numeric_cols] = COPD_incidence_merged_df[numeric_cols].fillna(\n    COPD_incidence_merged_df[numeric_cols].mean()\n)\nprint(COPD_incidence_merged_df.isnull().sum())\n# NOTE(review): this also mean-fills the target, and does so before the\n# train/test split; consider dropping rows with a missing target instead.\n\nfeatures = COPD_incidence_merged_df[['PM2.5', 'Ozone', 'Latitude', 'Longitude']]\ntarget = COPD_incidence_merged_df['COPD Incidence']\n\nX_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)\n\nrf_model = RandomForestRegressor(n_estimators=300, max_depth=10, min_samples_split=10, random_state=42)\nrf_model.fit(X_train, y_train)\n\ny_pred = rf_model.predict(X_test)\nmse = mean_squared_error(y_test, y_pred)\nr2 = r2_score(y_test, y_pred)\nprint(f'Mean Squared Error: {mse}')\nprint(f'R^2 Score: {r2}')\n\n# Rank the four inputs by how much the fitted forest relied on each.\nfeatures_importance = pd.DataFrame({\n    'Feature': X_train.columns,\n    'Importance': rf_model.feature_importances_\n}).sort_values(by='Importance', ascending=False)\nprint(features_importance)\n",
"lang" : "python",
"owner" : "111111",
"confidential" : "FALSE"
},{
"id" : "hjbur5",
"name" : "2. Incidence COPD Data Analysis",
"description" : null,
"code" : "# 2. Incidence COPD Data Analysis\n#\n# Describes and plots the point shapefile written by the data-prep process,\n# then adds an ID field and saves the shapefile again for use in GeoDa.\nimport pandas as pd\nimport geopandas as gpd\nimport matplotlib.pyplot as plt\nfrom esda.moran import Moran, Moran_Local\nfrom libpysal.weights import Queen\nfrom splot.esda import lisa_cluster\n\nshapefile_path = '/Users/icce_icecweam7/gw-workspace/S6wTraiideDo/COPD/COPD_incidence_gdf.shp'\nINCIDENCE = 'COPD Incidence'\nPOLLUTANTS = ('PM2.5', 'Ozone')\n\nCOPD_incidence_gdf = gpd.read_file(shapefile_path)\n\n# Shapefile field names hold at most 10 characters, so the incidence field is\n# stored either as 'val' (the name this script originally expected) or as\n# 'COPD Incid' (what 'COPD Incidence' is cut down to on save, including the\n# save at the end of this script). Accept either, so the script also works\n# the second time it is run against its own output.\nfor stored_name in ('val', 'COPD Incid'):\n    if INCIDENCE not in COPD_incidence_gdf.columns and stored_name in COPD_incidence_gdf.columns:\n        COPD_incidence_gdf = COPD_incidence_gdf.rename(columns={stored_name: INCIDENCE})\nif INCIDENCE not in COPD_incidence_gdf.columns:\n    raise KeyError('No COPD incidence field (val or COPD Incid) in ' + shapefile_path)\n\n# Verify the loaded GeoDataFrame and its coordinate reference system.\nprint(COPD_incidence_gdf.head())\nprint(COPD_incidence_gdf.columns)\nprint(COPD_incidence_gdf.crs)\n\n# Descriptive statistics of the numeric fields.\nsummary_stats = COPD_incidence_gdf.describe()\nprint(summary_stats)\n\n# Distribution of each pollutant, side by side.\nfig, axes = plt.subplots(1, 2, figsize=(12, 6))\nfor ax, pollutant in zip(axes, POLLUTANTS):\n    COPD_incidence_gdf[pollutant].hist(bins=20, ax=ax)\n    ax.set_title(f'Distribution of {pollutant} Levels')\n    ax.set_xlabel(pollutant)\n    ax.set_ylabel('Frequency')\nfig.tight_layout()\nplt.show()\n\n# Each pollutant against COPD incidence, side by side.\nfig, axes = plt.subplots(1, 2, figsize=(12, 6))\nfor ax, pollutant in zip(axes, POLLUTANTS):\n    ax.scatter(COPD_incidence_gdf[pollutant], COPD_incidence_gdf[INCIDENCE], alpha=0.5, edgecolors='w', s=80)\n    ax.set_title(f'{pollutant} vs. COPD Incidence')\n    ax.set_xlabel(pollutant)\n    ax.set_ylabel('COPD Incidence')\nfig.tight_layout()\nplt.show()\n\n# Add a 1-based ID field so the records can be referenced in GeoDa.\nCOPD_incidence_gdf['ID'] = range(1, len(COPD_incidence_gdf) + 1)\nprint(COPD_incidence_gdf.columns)\n\n# Overwrite the input shapefile with the updated table.\nCOPD_incidence_gdf.to_file(shapefile_path)\nprint(f'Updated shapefile saved to {shapefile_path}')\n",
"lang" : "python",
"owner" : "111111",
"confidential" : "FALSE"
}]