-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathprocess.json
17 lines (17 loc) · 5.63 KB
/
process.json
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
[{
"id" : "k5yduw",
"name" : "breatheright_correlation_analysis",
"description" : null,
"code" : "\nfrom breatheright_data_preparation import data_folder, merged_ozone_file_name\nimport pandas as pd\nimport seaborn as sns\nimport matplotlib.pyplot as plt\n\n\ndef do_breatheright_correlation_analysis():\n    \"\"\"Correlate yearly ozone statistics with lung disease mortality.\n\n    Reads the merged ozone / lung disease CSV from data_folder, writes the\n    correlation matrix of the mortality and ozone columns to a CSV file and\n    saves a heatmap of that matrix as a PNG, both inside data_folder.\n    \"\"\"\n    # read in the merged ozone csv file with ozone and lung disease data\n    merged_ozone_df = pd.read_csv(f\"{data_folder}/{merged_ozone_file_name}\")\n\n    print(merged_ozone_df.head())\n    print(merged_ozone_df.columns)\n\n    # drop the unnecessary columns\n    merged_ozone_df = merged_ozone_df.drop(columns=[\n        \"County Name_x\", 'State Name_x',\n        'County Name_y', 'State Name_y'])\n\n    # prefix the ozone statistics so they are easy to tell apart from the mortality columns\n    merged_ozone_df = merged_ozone_df.rename(columns={\n        'Max': 'ozone_max',\n        'Min': 'ozone_min',\n        'Mean': 'ozone_mean',\n        'Median': 'ozone_median',\n        'Std': 'ozone_std'\n    })\n\n    # Calculate correlations\n    correlation_matrix = merged_ozone_df[[\n        'mortality_average', 'mortality_min', 'mortality_max',\n        'ozone_max', 'ozone_min', 'ozone_mean', 'ozone_median',\n        'ozone_std']].corr()\n\n    # Save correlation matrix to CSV\n    correlation_matrix.to_csv(f'{data_folder}/correlation_matrix_ozone_lung_disease_5years.csv')\n\n    # Plot Correlation Heatmap\n    fig, ax = plt.subplots(figsize=(10, 8))\n    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, center=0, ax=ax)\n    ax.set_title('Correlation Heatmap')\n    fig.savefig(f'{data_folder}/correlation_heatmap_ozone_lung_disease.png')\n    # close the figure once saved so repeated calls do not keep figures open\n    plt.close(fig)\n\n\nif __name__ == \"__main__\":\n    do_breatheright_correlation_analysis()",
"lang" : "python",
"owner" : "111111",
"confidential" : "FALSE"
},{
"id" : "bf1ioh",
"name" : "breatheright_data_preparation",
"description" : null,
"code" : "import os\nimport pandas as pd\nimport re\n\nhome_directory = os.path.expanduser(\"~\")\nprint(home_directory)\n\ndata_folder = f\"{home_directory}/Documents/GitHub/health-assip-2024/data\"\nmerged_ozone_file_name = \"ozone_lung_disease_yearly_merged.csv\"\n\n\ndef split_mortality_column_into_three(lung_disease_df):\n    \"\"\"Split the 'Mortality Rate' text column into three numeric columns.\n\n    Values shaped like '12.3 (10.1, 14.5)' become mortality_average,\n    mortality_min and mortality_max. Cells that are not strings or do not\n    match that shape yield empty values in all three columns. The original\n    'Mortality Rate' column is dropped from the returned DataFrame.\n    \"\"\"\n    mortality_columns = ['mortality_average', 'mortality_min', 'mortality_max']\n\n    def split_mortality_rate(mortality_rate):\n        # pandas reads empty cells as float NaN, which re.match rejects with a TypeError\n        if not isinstance(mortality_rate, str):\n            return pd.Series([None, None, None], index=mortality_columns)\n        # Regular expression to extract the values - regex\n        match = re.match(r'(\\d+\\.\\d+) \\((\\d+\\.\\d+), (\\d+\\.\\d+)\\)', mortality_rate)\n        if match:\n            avg, min_val, max_val = match.groups()\n            return pd.Series([float(avg), float(min_val), float(max_val)], index=mortality_columns)\n        return pd.Series([None, None, None], index=mortality_columns)\n\n    # Apply the function to split the 'Mortality Rate' column\n    lung_disease_df[mortality_columns] = lung_disease_df['Mortality Rate'].apply(split_mortality_rate)\n\n    # Drop the original 'Mortality Rate' column now that it has been split\n    lung_disease_df = lung_disease_df.drop(columns=['Mortality Rate'])\n\n    return lung_disease_df\n\n\ndef parse_and_load_data():\n    \"\"\"Build the yearly ozone / lung disease table and save it as a CSV file.\n\n    Reads the ozone, lung disease and PM2.5 CSV files from data_folder,\n    aggregates the ozone readings per state, county and year, joins the\n    aggregate with the lung disease data on county, state and year, and\n    writes the result to merged_ozone_file_name inside data_folder.\n    \"\"\"\n    files = [f for f in os.listdir(data_folder) if os.path.isfile(os.path.join(data_folder, f))]\n    print(files)\n\n    ozone_data_path = f\"{data_folder}/Ozonecombined.csv\"\n    lung_disease_data_path = f\"{data_folder}/lung_disease_data.csv\"\n    pm25_data_path = f\"{data_folder}/PM2.5combined.csv\"\n\n    # Read all the csv into pandas dataframe in memory\n    ozone_df = pd.read_csv(ozone_data_path, parse_dates=['Date Local'])\n    lung_disease_df = pd.read_csv(lung_disease_data_path)\n    pm25_df = pd.read_csv(pm25_data_path)\n\n    print(\"ozone head: \", ozone_df.columns)\n    print(\"pm25 header: \", pm25_df.columns)\n    pd.set_option('display.max_columns', None)\n    print(\"ozone header: \", ozone_df.head())\n    print(\"lung disease head: \", lung_disease_df.columns)\n\n    # convert the daily ozone into yearly data\n    ozone_df['year'] = ozone_df['Date Local'].dt.year\n\n    # Group by additional columns and 'year'\n    grouped = ozone_df.groupby(['State Name', 'County Name', 'year'])['Arithmetic Mean']\n\n    # Compute statistics\n    stats_df = grouped.agg(['max', 'min', 'mean', 'median', 'std']).reset_index()\n\n    # Rename columns for clarity\n    stats_df.columns = ['State Name', 'County Name', 'year', 'Max', 'Min', 'Mean', 'Median', 'Std']\n\n    # Build the join keys the same way in both DataFrames (trimmed, lower case)\n    # so county and state names match regardless of case or padding\n    stats_df['county'] = stats_df['County Name'].str.strip().str.lower()\n    stats_df['state'] = stats_df['State Name'].str.strip().str.lower()\n    stats_df['year'] = stats_df['year'].astype(int)\n\n    lung_disease_df['county'] = lung_disease_df['County Name'].str.strip().str.lower()\n    lung_disease_df['state'] = lung_disease_df['State Name'].str.strip().str.lower()\n    lung_disease_df['year'] = lung_disease_df['year'].astype(int)\n\n    # Print the results\n    print(\"ozone aggregated yearly data: \", stats_df)\n\n    lung_disease_df = split_mortality_column_into_three(lung_disease_df)\n    print(\"split lung disease header:\", lung_disease_df.head())\n\n    # Merge the ozone statistics DataFrame with the lung disease DataFrame\n    merged_df = pd.merge(lung_disease_df, stats_df,\n                         on=['county', 'state', 'year'],\n                         how='inner')\n\n    print(\"Merged dataframe is: \", merged_df.head())\n\n    # Save to a CSV file\n    merged_df.to_csv(f'{data_folder}/{merged_ozone_file_name}', index=False)\n\n\nif __name__ == \"__main__\":\n    parse_and_load_data()\n",
"lang" : "python",
"owner" : "111111",
"confidential" : "FALSE"
}]