Skip to content

Commit

Permalink
Push changes to base TELL package
Browse files Browse the repository at this point in the history
  • Loading branch information
cdburley committed Jan 29, 2025
1 parent da5e8f7 commit 384a288
Show file tree
Hide file tree
Showing 5 changed files with 94 additions and 88 deletions.
28 changes: 15 additions & 13 deletions notebooks/tell_data_preprocessing.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -170,8 +170,9 @@
"outputs": [],
"source": [
"# Execute the function to pre-process the raw EIA-930 data using parallel processing streams:\n",
"tell.process_eia_930_data(data_input_dir = tell_data_dir,\n",
" n_jobs = -1)\n"
"tell.process_eia_930_data(eia_930_data_input_dir = '/Users/burl878/Documents/Code/code_repos/tell/tell/tell_data/tell_raw_data/EIA_930/Balancing_Authority',\n",
" data_output_dir = '/Users/burl878/Documents/Code/code_repos/tell/tell/tell_data/tell_quickstarter_data/outputs/historical_ba_load',\n",
" n_jobs = -2)\n"
]
},
{
Expand Down Expand Up @@ -202,8 +203,10 @@
"source": [
"# Execute the function to pre-process the historical observed population data for all years from 2015 to 2023:\n",
"tell.process_ba_population_data(start_year = 2015,\n",
" end_year = 2020,\n",
" data_input_dir = tell_data_dir)\n"
" end_year = 2023,\n",
" ba_mapping_input_dir = '/Users/burl878/Documents/Code/code_repos/tell/tell/tell_data/tell_quickstarter_data/outputs/ba_service_territory', \n",
" population_input_dir = '/Users/burl878/Documents/Code/code_repos/tell/tell/tell_data/tell_raw_data/Population',\n",
" data_output_dir = '/Users/burl878/Documents/Code/code_repos/tell/tell/tell_data/tell_quickstarter_data/outputs/historical_population')\n"
]
},
{
Expand Down Expand Up @@ -262,20 +265,19 @@
"outputs": [],
"source": [
"# Compile the historical load, population, and meteorology data into a single set of .csv files:\n",
"tell.compile_data(start_year = 2015,\n",
" end_year = 2019,\n",
" data_input_dir = tell_data_dir)\n"
"tell.compile_data(start_year = 2016,\n",
" end_year = 2023,\n",
" load_input_dir = '/Users/burl878/Documents/Code/code_repos/tell/tell/tell_data/tell_quickstarter_data/outputs/historical_ba_load',\n",
" ba_weather_input_dir = '/Users/burl878/Documents/Code/code_repos/tell/tell/tell_data/sample_forcing_data/historical_weather',\n",
" population_input_dir = '/Users/burl878/Documents/Code/code_repos/tell/tell/tell_data/tell_quickstarter_data/outputs/historical_population',\n",
" data_output_dir = '/Users/burl878/Documents/Code/code_repos/tell/tell/tell_data/tell_quickstarter_data/outputs/compiled_historical_data')\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4b20187e-1e0d-4a4b-b0b4-6f332fd6ec41",
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"id": "df7b4ef4-2dc7-464e-8e0e-d165bb916db4",
"metadata": {},
"outputs": [],
"source": []
}
Expand Down
40 changes: 22 additions & 18 deletions tell/data_process_compile.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@
from .package_data import get_ba_abbreviations


def compile_data(start_year: int, end_year: int, data_input_dir: str):
def compile_data(start_year: int, end_year: int, load_input_dir: str, ba_weather_input_dir: str,
population_input_dir: str, data_output_dir: str):
"""Merge the load, population, and climate data into a single .csv file for each BA
:param start_year: Year to start process; four digit year (e.g., 1990)
Expand All @@ -15,43 +16,46 @@ def compile_data(start_year: int, end_year: int, data_input_dir: str):
:param end_year: Year to end process; four digit year (e.g., 1990)
:type end_year: int
:param data_input_dir: Top-level data directory for TELL
:type data_input_dir: str
:param load_input_dir: Path to where the pre-processed BA hourly loads are located
:type load_input_dir: str
:param ba_weather_input_dir: Path to where the pre-processed BA weather data is located
:type ba_weather_input_dir: str
:param population_input_dir: Path to where the pre-processed BA population data is located
:type population_input_dir: str
:param data_output_dir: Place to store the output files
:type data_output_dir: str
"""

# Get a list of BA abbreviations to process:
ba_name = get_ba_abbreviations()

# Set the input directories for each variable:
load_dir = os.path.join(data_input_dir, r'tell_quickstarter_data', r'outputs', r'historical_ba_load')
population_dir = os.path.join(data_input_dir, r'tell_quickstarter_data', r'outputs', r'historical_population')
weather_dir = os.path.join(data_input_dir, r'sample_forcing_data', r'historical_weather')

# Set the output directory based on the "data_input_dir" variable:
output_dir = os.path.join(data_input_dir, r'tell_quickstarter_data', r'outputs', r'compiled_historical_data')
if not os.path.exists(output_dir):
os.makedirs(output_dir)
if not os.path.exists(data_output_dir):
os.makedirs(data_output_dir)

# Loop over the list of BAs to process:
for i in ba_name:

# Check to make sure all of the requisite data exist for that BA:
all_data_present = False
if os.path.isfile(os.path.join(load_dir, f"{i}_hourly_load_data.csv")) is True:
if os.path.isfile(os.path.join(population_dir, f"{i}_hourly_population_data.csv")) is True:
if os.path.isfile(os.path.join(weather_dir, f"{i}_WRF_Hourly_Mean_Meteorology_2019.csv")) is True:
if os.path.isfile(os.path.join(load_input_dir, f"{i}_hourly_load_data.csv")) is True:
if os.path.isfile(os.path.join(population_input_dir, f"{i}_hourly_population_data.csv")) is True:
if os.path.isfile(os.path.join(ba_weather_input_dir, f"{i}_WRF_Hourly_Mean_Meteorology_2019.csv")) is True:
all_data_present = True

if all_data_present is True:
# Read in the historical load and population data for that BA:
load_df = pd.read_csv(os.path.join(load_dir, f"{i}_hourly_load_data.csv"))
population_df = pd.read_csv(os.path.join(population_dir, f"{i}_hourly_population_data.csv"))
load_df = pd.read_csv(os.path.join(load_input_dir, f"{i}_hourly_load_data.csv"))
population_df = pd.read_csv(os.path.join(population_input_dir, f"{i}_hourly_population_data.csv"))

# Loop over the range of years defined by the 'start_year' and 'end_year' variables:
for year in range(start_year, end_year + 1):
# Read in the annual historical weather for that BA:
temp_weather_df = pd.read_csv(os.path.join(weather_dir, f"{i}_WRF_Hourly_Mean_Meteorology_{year}.csv"))
temp_weather_df = pd.read_csv(os.path.join(ba_weather_input_dir, f"{i}_WRF_Hourly_Mean_Meteorology_{year}.csv"))

# Convert the time stamp to a datetime variable and then extract the year, month, day, and hour variables:
temp_weather_df['Time_UTC'] = pd.to_datetime(temp_weather_df['Time_UTC'])
Expand Down Expand Up @@ -79,7 +83,7 @@ def compile_data(start_year: int, end_year: int, data_input_dir: str):
merged_second['Total_Population'] = merged_second['Total_Population'].round(2)

# Write the merged dataframe to a .csv file
merged_second.to_csv(os.path.join(output_dir, f"{i}_historical_data.csv"), index=False, header=True)
merged_second.to_csv(os.path.join(data_output_dir, f"{i}_historical_data.csv"), index=False, header=True)

# Clean up the variables and move to the next BA in the loop:
del temp_weather_df, weather_df, load_df, population_df, merged_first, merged_second, all_data_present
46 changes: 23 additions & 23 deletions tell/data_process_eia_930.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,11 @@
from .package_data import get_ba_abbreviations


def list_EIA_930_files(data_input_dir: str) -> list:
def list_EIA_930_files(eia_930_data_input_dir: str) -> list:
"""Make a list of all the file names for the EIA-930 hourly load dataset
:param data_input_dir: Top-level data directory for TELL
:type data_input_dir: str
:param eia_930_data_input_dir: Path to where the raw EIA-930 data are stored
:type eia_930_data_input_dir: str
:return: list
Expand All @@ -24,30 +24,27 @@ def list_EIA_930_files(data_input_dir: str) -> list:

# Loop over the list and find the path for each BA in the list:
for i in ba_name:
path_to_check = os.path.join(data_input_dir, r'tell_raw_data', r'EIA_930', r'Balancing_Authority', f'{i}.xlsx')
path_to_check = os.path.join(eia_930_data_input_dir, f'{i}.xlsx')
path_list.append(path_to_check)

# Return the list:
return path_list


def eia_data_subset(file_string: str, data_input_dir: str):
def eia_data_subset(file_string: str, data_output_dir: str):
"""Extract only the columns TELL needs from the EIA-930 Excel files
:param file_string: File name of EIA-930 hourly load data by BA
:type file_string: str
:param data_input_dir: Top-level data directory for TELL
:type data_input_dir: str
:param data_output_dir: Place to store the output files
:type data_output_dir: str
"""

# Set the output directory based on the "data_input_dir" variable:
output_dir = os.path.join(data_input_dir, r'tell_quickstarter_data', r'outputs', r'historical_ba_load')

# If the output directory doesn't exist then create it:
if not os.path.exists(output_dir):
os.makedirs(output_dir)
if not os.path.exists(data_output_dir):
os.makedirs(data_output_dir)

# Read in the data from the "Published Hourly Data" sheet:
df = pd.read_excel(file_string, sheet_name='Published Hourly Data')
Expand All @@ -59,27 +56,30 @@ def eia_data_subset(file_string: str, data_input_dir: str):
df['Hour'] = df['UTC time'].dt.strftime('%H')

# Only keep the columns that are needed:
col_names = ['Year', 'Month', 'Day', 'Hour', 'DF', 'Adjusted D', 'Adjusted NG', 'Adjusted TI']
col_names = ['Year', 'Month', 'Day', 'Hour', 'Demand forecast', 'Adjusted demand', 'Adjusted net generation', 'Adjusted total interchange']
df = df[col_names].copy()

# Rename the columns to add the units to each variable:
df.rename(columns={"DF": "Forecast_Demand_MWh",
"Adjusted D": "Adjusted_Demand_MWh",
"Adjusted NG": "Adjusted_Generation_MWh",
"Adjusted TI": "Adjusted_Interchange_MWh"}, inplace=True)
df.rename(columns={"Demand forecast": "Forecast_Demand_MWh",
"Adjusted demand": "Adjusted_Demand_MWh",
"Adjusted net generation": "Adjusted_Generation_MWh",
"Adjusted total interchange": "Adjusted_Interchange_MWh"}, inplace=True)

# Extract the BA name from the "file_string" variable:
BA_name = os.path.splitext(os.path.basename(file_string))[0]

# Write the output to a .csv file:
df.to_csv(os.path.join(output_dir, f'{BA_name}_hourly_load_data.csv'), index=False, header=True)
df.to_csv(os.path.join(data_output_dir, f'{BA_name}_hourly_load_data.csv'), index=False, header=True)


def process_eia_930_data(data_input_dir: str, n_jobs: int):
def process_eia_930_data(eia_930_data_input_dir: str, data_output_dir: str, n_jobs: int):
"""Read in list of EIA 930 files, subset the data, and save the output as a .csv file
:param data_input_dir: Top-level data directory for TELL
:type data_input_dir: str
:param eia_930_data_input_dir: Path to where the raw EIA-930 data are stored
:type eia_930_data_input_dir: str
:param data_output_dir: Place to store the output files
:type data_output_dir: str
:param n_jobs: The maximum number of concurrently running jobs, such as the number of Python
worker processes when backend=”multiprocessing” or the size of the thread-pool
Expand All @@ -94,12 +94,12 @@ def process_eia_930_data(data_input_dir: str, n_jobs: int):
"""

# Create the list of EIA-930 Excel files:
list_of_files = list_EIA_930_files(data_input_dir)
list_of_files = list_EIA_930_files(eia_930_data_input_dir)

# Process each file in the list in parallel:
Parallel(n_jobs=n_jobs)(
delayed(eia_data_subset)(
file_string=i,
data_input_dir=data_input_dir
data_output_dir=data_output_dir
) for i in list_of_files
)
Loading

0 comments on commit 384a288

Please sign in to comment.