From fcaf960bfb0c550b4b12119fa0f916eb538a8bab Mon Sep 17 00:00:00 2001 From: "Julian.Endres" Date: Fri, 21 May 2021 00:51:36 +0200 Subject: [PATCH 01/97] added hh-demand-profile-generator --- .../hh_demand/hh_demand_profiles.py | 207 ++++++++++++++++++ .../hh_demand/hh_demand_profiles_tools.py | 202 +++++++++++++++++ 2 files changed, 409 insertions(+) create mode 100644 src/egon/data/processing/hh_demand/hh_demand_profiles.py create mode 100644 src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py diff --git a/src/egon/data/processing/hh_demand/hh_demand_profiles.py b/src/egon/data/processing/hh_demand/hh_demand_profiles.py new file mode 100644 index 000000000..f724fa86a --- /dev/null +++ b/src/egon/data/processing/hh_demand/hh_demand_profiles.py @@ -0,0 +1,207 @@ +#!/usr/bin/env python +# coding: utf-8 + +import os +import pandas as pd +from egon.data import db + +import hh_demand_profiles_tools as hh_tools + + + +# Loadprofilesdump +# ################### +# in Wh +# TODO: > to/from SQL? +file = 'h0_profiles.h5' +file = os.path.join(os.path.realpath(file)) +df_profiles = pd.read_hdf(file) + +# set multiindex to HH_types +df_profiles.columns = pd.MultiIndex.from_arrays([df_profiles.columns.str[:2], df_profiles.columns.str[3:]]) + + +# Load Zensus data at nuts-level +# ################### +# TODO: > to/from SQL? +file = 'Zensus2011_Personen.csv' +file = os.path.join(os.path.realpath(file)) +df_zensus = pd.read_csv(file, sep=';', decimal='.', skiprows=5, skipfooter=7, + index_col=[0, 1], header=[0, 1], encoding='latin1', engine='python') + +# clean data +df_zensus = df_zensus.applymap(hh_tools.clean).applymap(int) +# preprocess nuts1 zensus data +df_zensus = hh_tools.process_nuts1_zensus_data(df_zensus) + +# ## Household distribution +# - Adults living in househould type +# - number of kids not included even if in housholdtype name +# **! 
The Eurostat data only gives the amount of adults/seniors, excluding the amount of kids <15** +# eurostat is used for demand-profile-generator @fraunhofer + +# hh shares mapping zensus to eurostat +hh_types = {'SR': [('Einpersonenhaushalte (Singlehaushalte)', 'Insgesamt', 'Seniors'), + ('Alleinerziehende Elternteile', 'Insgesamt', 'Seniors')], # Single Seniors Single Parents Seniors + 'SO': [('Einpersonenhaushalte (Singlehaushalte)', 'Insgesamt', 'Adults')], # Single Adults + 'SK': [('Alleinerziehende Elternteile', 'Insgesamt', 'Adults')], # Single Parents Adult + 'PR': [('Paare ohne Kind(er)', '2 Personen', 'Seniors'), + ('Mehrpersonenhaushalte ohne Kernfamilie', '2 Personen', 'Seniors')], + # Couples without Kids Senior & same sex couples & shared flat seniors + 'PO': [('Paare ohne Kind(er)', '2 Personen', 'Adults'), + ('Mehrpersonenhaushalte ohne Kernfamilie', '2 Personen', 'Adults')], + # Couples without Kids adults & same sex couples & shared flat adults + 'P1': [('Paare mit Kind(ern)', '3 Personen', 'Adults')], + 'P2': [('Paare mit Kind(ern)', '4 Personen', 'Adults')], + 'P3': [('Paare mit Kind(ern)', '5 Personen', 'Adults'), + ('Paare mit Kind(ern)', '6 und mehr Personen', 'Adults')], + 'OR': [('Mehrpersonenhaushalte ohne Kernfamilie', '3 Personen', 'Seniors'), + ('Mehrpersonenhaushalte ohne Kernfamilie', '4 Personen', 'Seniors'), + ('Mehrpersonenhaushalte ohne Kernfamilie', '5 Personen', 'Seniors'), + ('Mehrpersonenhaushalte ohne Kernfamilie', '6 und mehr Personen', 'Seniors'), + ('Paare mit Kind(ern)', '3 Personen', 'Seniors'), + ('Paare ohne Kind(er)', '3 Personen', 'Seniors'), + ('Paare mit Kind(ern)', '4 Personen', 'Seniors'), + ('Paare ohne Kind(er)', '4 Personen', 'Seniors'), + ('Paare mit Kind(ern)', '5 Personen', 'Seniors'), + ('Paare ohne Kind(er)', '5 Personen', 'Seniors'), + ('Paare mit Kind(ern)', '6 und mehr Personen', 'Seniors'), + ('Paare ohne Kind(er)', '6 und mehr Personen', 'Seniors')], # no info about share of kids + + # OO, O1, O2 have 
the same amount, as no information about the share of kids within zensus data set. + # if needed the total amount can be corrected in the hh_tools.get_hh_dist function + # using multi_adjust=True option + 'OO': [('Mehrpersonenhaushalte ohne Kernfamilie', '3 Personen', 'Adults'), + ('Mehrpersonenhaushalte ohne Kernfamilie', '4 Personen', 'Adults'), + ('Mehrpersonenhaushalte ohne Kernfamilie', '5 Personen', 'Adults'), + ('Mehrpersonenhaushalte ohne Kernfamilie', '6 und mehr Personen', 'Adults'), + ('Paare ohne Kind(er)', '3 Personen', 'Adults'), + ('Paare ohne Kind(er)', '4 Personen', 'Adults'), + ('Paare ohne Kind(er)', '5 Personen', 'Adults'), + ('Paare ohne Kind(er)', '6 und mehr Personen', 'Adults')], # no info about share of kids + # TODO: maybe remove following lines if not needed + # 'O1': [('Mehrpersonenhaushalte ohne Kernfamilie', '3 Personen', 'Adults'), + # ('Mehrpersonenhaushalte ohne Kernfamilie', '4 Personen', 'Adults'), + # ('Mehrpersonenhaushalte ohne Kernfamilie', '5 Personen', 'Adults'), + # ('Mehrpersonenhaushalte ohne Kernfamilie', '6 und mehr Personen', 'Adults'), + # ('Paare ohne Kind(er)', '3 Personen', 'Adults'), + # ('Paare ohne Kind(er)', '4 Personen', 'Adults'), + # ('Paare ohne Kind(er)', '5 Personen', 'Adults'), + # ('Paare ohne Kind(er)', '6 und mehr Personen', 'Adults')], # no info about share of kids + # 'O2': [('Mehrpersonenhaushalte ohne Kernfamilie', '3 Personen', 'Adults'), + # ('Mehrpersonenhaushalte ohne Kernfamilie', '4 Personen', 'Adults'), + # ('Mehrpersonenhaushalte ohne Kernfamilie', '5 Personen', 'Adults'), + # ('Mehrpersonenhaushalte ohne Kernfamilie', '6 und mehr Personen', 'Adults'), + # ('Paare ohne Kind(er)', '3 Personen', 'Adults'), + # ('Paare ohne Kind(er)', '4 Personen', 'Adults'), + # ('Paare ohne Kind(er)', '5 Personen', 'Adults'), + # ('Paare ohne Kind(er)', '6 und mehr Personen', 'Adults')] + } + +# distribution of people by household @eurostats +# df_hh_types_D = pd.Series({'SR': 0.083, 'SO': 0.158, 'SK': 
0.022, +# 'PR': 0.145, 'PO': 0.203, 'P1': 0.081, 'P2': 0.077, 'P3': 0.024, +# 'OR': 0.023, 'OO': 0.13, 'O1': 0.04, 'O2': 0.015}) + +# hh_tools.get_hh_dist without eurostat adjustment for O1-03 Groups in absolute values +df_hh_types_nad_abs = hh_tools.get_hh_dist(df_zensus, hh_types, multi_adjust=False, relative=False) + +# ######################### +# FIXME: +# mapping needs to be adjusted for OR, OO, O1, O2 +# +# ########################### + +mapping_people_in_households = {'SR': 1, + 'SO': 1, + 'SK': 1, # kids are excluded + 'PR': 2, + 'PO': 2, + 'P1': 2, # kids are excluded + 'P2': 2, # "" + 'P3': 2, # "" + 'OR': 4, # parameter needs to be re/defined + 'OO': 4, # "" + # 'O1': 4, # "" + # 'O2': 4, # "" + } +# derivate households data from inhabitants data by compound number of people per household type +df_dist_households = hh_tools.inhabitants_to_households(df_hh_types_nad_abs, mapping_people_in_households) + + +# SQL - Access Zensus household data cell-level +df_households_typ = db.select_dataframe(sql=""" + SELECT grid_id, attribute, characteristics_code, characteristics_text, quantity + FROM society.destatis_zensus_household_per_ha + WHERE attribute = 'HHTYP_FAM' """) +df_households_typ = df_households_typ.drop(columns=['attribute', 'characteristics_text']) +df_households_typ = df_households_typ.rename(columns={'quantity': 'hh_5types'}) + +mapping_zensus_hh_subgroups = {1: ['SR', 'SO'], + 2: ['PR', 'PO'], + 3: ['SK'], + 4: ['P1', 'P2', 'P3'], + 5: ['OR', 'OO'], + } + +for value in mapping_zensus_hh_subgroups.values(): + df_dist_households.loc[value] = df_dist_households.loc[value].div(df_dist_households.loc[value].sum()) + +# SQL- create table to map cells to nuts3 and nuts1 +df_grid_id = db.select_dataframe(sql=""" + SELECT pop.grid_id, pop.gid, vg250.vg250_nuts3 as nuts3, lan.nuts as nuts1, lan.gen + FROM society.destatis_zensus_population_per_ha_inside_germany as pop + LEFT JOIN boundaries.egon_map_zensus_vg250 as vg250 + ON 
(pop.gid=vg250.zensus_population_id) + LEFT JOIN boundaries.vg250_lan as lan + ON (LEFT(vg250.vg250_nuts3, 3)=lan.nuts) """) +df_grid_id = df_grid_id.drop_duplicates() +df_grid_id = df_grid_id.reset_index(drop=True) + +# merge nuts info to zensus cell level data +# how='inner' is used as ids of unpopulated areas are removed df_grid_id or earliers tables. see here: +# https://github.com/openego/eGon-data/blob/59195926e41c8bd6d1ca8426957b97f33ef27bcc/src/egon/data/importing/zensus/__init__.py#L418-L449 +df_households_typ = pd.merge(df_households_typ, df_grid_id[['grid_id', 'gen', 'nuts1', 'nuts3']], + left_on='grid_id', right_on='grid_id', how='inner') + +# Merge Zensus nuts level household data with zensus cell level by dividing hh-groups with mapping_zensus_hh_subgroups +df_zensus_cells = pd.DataFrame() +for (country, code), df_country_type in df_households_typ.groupby(['gen', 'characteristics_code']): + + # iterate over zenus_country subgroups + for typ in mapping_zensus_hh_subgroups[code]: + df_country_type['hh_type'] = typ + df_country_type['factor'] = df_dist_households.loc[typ, country] + df_country_type['hh_10types'] = df_country_type['hh_5types'] * df_dist_households.loc[typ, country] + df_zensus_cells = df_zensus_cells.append(df_country_type, ignore_index=True) + +df_zensus_cells = df_zensus_cells.sort_values(by=['grid_id', 'characteristics_code']).reset_index(drop=True) + +# change profile numbers to int +df_profiles.columns = pd.MultiIndex.from_tuples([(a, int(b)) for a, b in df_profiles.columns]) + +pool_size = df_profiles.groupby(level=0, axis=1).size() + +df_demand_regio = db.select_dataframe(sql=""" + SELECT year, nuts3, SUM (demand) as demand_mWha + FROM demand.egon_demandregio_hh as egon_d + GROUP BY nuts3, year + ORDER BY year""", index_col=['year', 'nuts3']) + +# testcase +# test_data = df_zensus_cells.groupby('nuts3').get_group('DEF03') +# test_data = pd.concat([df_zensus_cells.groupby('nuts3').get_group('DEF03'), +# 
df_zensus_cells.groupby('nuts3').get_group('DEF06')]) +# +# df_cell_demand_metadata = hh_tools.get_cell_demand_metadata(test_data, df_profiles) +# df_cell_demand_metadata = hh_tools.adjust_to_demand_regio_nuts3_annual(df_cell_demand_metadata, df_profiles, df_demand_regio) +# +# +# +# import random +# load_area_ids = random.sample(list(df_cell_demand_metadata.index), 100) +# max_value_load_area = hh_tools.get_load_area_max_load(df_profiles, df_cell_demand_metadata, load_area_ids, 2034) +# # print(df_cell_demand_metadata.shape) +# print(max_value_load_area) + + diff --git a/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py b/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py new file mode 100644 index 000000000..56613300f --- /dev/null +++ b/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py @@ -0,0 +1,202 @@ +#!/usr/bin/env python +# coding: utf-8 + +import pandas as pd +import numpy as np + +from itertools import cycle +import random + + +def clean(x): + x = str(x).replace('-', str(0)) + x = str(x).replace('.', str(0)) + x = x.strip('()') + return x + + +def get_hh_dist(df_zensus, hh_types, multi_adjust=True, relative=True): + # adjust multi with/without kids via eurostat share as not clearly derivable without infos about share of kids + if multi_adjust: + adjust = {'SR': 1, 'SO': 1, 'SK': 1, 'PR': 1, 'PO': 1, 'P1': 1, 'P2': 1, 'P3': 1, 'OR': 1, + 'OO': 0.703, 'O1': 0.216, 'O2': 0.081, } + else: + adjust = {'SR': 1, 'SO': 1, 'SK': 1, 'PR': 1, 'PO': 1, 'P1': 1, 'P2': 1, 'P3': 1, 'OR': 1, + 'OO': 1, 'O1': 0, 'O2': 0, } + + df_hh_types = pd.DataFrame( + ({hhtype: adjust[hhtype] * df_zensus.loc[countries, codes].sum() for hhtype, codes in hh_types.items()} + for countries in df_zensus.index), index=df_zensus.index) + # drop zero columns + df_hh_types = df_hh_types.loc[:, (df_hh_types != 0).any(axis=0)] + if relative: + # normalize + df_hh_types = df_hh_types.div(df_hh_types.sum(axis=1), axis=0) + return df_hh_types.T + + +def 
get_loadprofiles(df_profiles, df_hh_types, hh_total, state_dist=False): + # equal share of hh_total for every state + if not state_dist: + state_dist = pd.Series(hh_total / df_hh_types.shape[1], index=df_hh_types.columns) + # specific share of hh_total by state_dist + else: + state_dist = state_dist * hh_total + + header = pd.MultiIndex.from_tuples( + [(state, hh_type) for state in df_hh_types.columns for hh_type in df_hh_types.index], + names=['State', 'Type']) + + df_loadprofiles = pd.DataFrame(columns=header) + + for state, state_share in state_dist.items(): + + for hh_type, type_share in df_hh_types[state].items(): + samples = state_share * type_share + # if samples<1: + # raise ValueError(f"Sample size needs to be >=1. hh-share of {state}-{hh_type} is {samples:.2f} , increase hh_total") + samples = int(samples) + hh_type_ts = df_profiles[hh_type].T.sample(samples, axis=0, replace=True).sum() + + df_loadprofiles.loc[:, (state, hh_type)] = hh_type_ts + + timestamp = pd.date_range(start='01-01-2012', periods=df_loadprofiles.shape[0], freq='h') + df_loadprofiles.index = timestamp + return df_loadprofiles + + +def normalize_loadprofiles(df_lp, to_value): + # normed to 'to_value' kWh annual + df_kwh = df_lp.groupby(level='State', axis=1).sum() * to_value / df_lp.groupby(level='State', axis=1).sum().sum() + + return df_kwh + + +def aggregate_loadprofiles(df_lp, to_value): + # normed to 'to_value' kWh annual + df_kwh = df_lp.sum(axis=1) * to_value / df_lp.sum(axis=1).sum() + + return df_kwh + + +def inhabitants_to_households(df_people_by_householdtypes_abs, mapping_people_in_households): + diff = set(df_people_by_householdtypes_abs.index) ^ set(mapping_people_in_households.keys()) + + if bool(diff): + for key in diff: + mapping_people_in_households = dict(mapping_people_in_households) + del mapping_people_in_households[key] + print(f'Removed {diff} from mapping!') + + df_households_by_type = df_people_by_householdtypes_abs.div(mapping_people_in_households, axis=0) + 
df_households_by_type = df_households_by_type.apply(np.ceil) # round up households + + return df_households_by_type + + +def process_nuts1_zensus_data(df_zensus): + # Group data to fit Load Profile Generator categories + # define kids/adults/seniors + kids = ['Unter 3', '3 - 5', '6 - 14'] # < 15 + adults = ['15 - 17', '18 - 24', '25 - 29', '30 - 39', '40 - 49', '50 - 64'] # 15 < x <65 + seniors = ['65 - 74', '75 und älter'] # >65 + + # sum groups of kids, adults and seniors and concat + df_kids = df_zensus.loc[:, (slice(None), kids)].groupby(level=0, axis=1).sum() + df_adults = df_zensus.loc[:, (slice(None), adults)].groupby(level=0, axis=1).sum() + df_seniors = df_zensus.loc[:, (slice(None), seniors)].groupby(level=0, axis=1).sum() + df_zensus = pd.concat([df_kids, df_adults, df_seniors], axis=1, keys=['Kids', 'Adults', 'Seniors'], + names=['age', 'persons']) + + # reduce column names to state only + mapping_state = {i: i.split()[1] for i in df_zensus.index.get_level_values(level=0)} + + # rename index + df_zensus = df_zensus.rename(index=mapping_state, level=0) + # rename axis + df_zensus = df_zensus.rename_axis(['state', 'type']) + # unstack + df_zensus = df_zensus.unstack() + # reorder levels + df_zensus = df_zensus.reorder_levels(order=['type', 'persons', 'age'], axis=1) + + return df_zensus + + +def get_cell_demand_profile_ids(df_cell, pool_size, df_profiles): + """generates tuple of hh_type and random sample(without replacement) profile ids for cell""" + # maybe use instead + # np.random.default_rng().integers(low=0, high=pool_size[hh_type], size=sq) instead of random.sample + # use random.choice() if with replacement + # list of sample ids per hh_type in cell + cell_profile_ids = [(hh_type, random.sample(range(pool_size[hh_type]), k=sq)) \ + for hh_type, sq in zip(df_cell['hh_type'], + df_cell['hh_10types'].astype(int))] + + # format to lists of tuples (hh_type, id) + cell_profile_ids = [list(zip(cycle([hh_type]), ids)) for hh_type, ids in cell_profile_ids] 
+ # reduce to list + cell_profile_ids = [a for b in cell_profile_ids for a in b] + + return cell_profile_ids + + +# can be parallelized with grouping df_zensus_cells by grid_id/nuts3/nuts1 +def get_cell_demand_metadata(df_zensus_cells, df_profiles): + """generate table including demand profile ids for each cell using get_cell_demand_profile_ids""" + + df_cell_demand_metadata = pd.DataFrame(index=df_zensus_cells.grid_id.unique(), + columns=['cell_profile_ids', 'nuts3', 'nuts1', '2035_factor', + '2050_factor', ]) + # 'peak_loads_hh', 'peak_load_cell', + df_cell_demand_metadata = df_cell_demand_metadata.rename_axis('cell_id') + + pool_size = df_profiles.groupby(level=0, axis=1).size() + + for cell_id, df_cell in df_zensus_cells.groupby(by='grid_id'): + # FIXME + # ! runden der Haushaltszahlen auf int + # ! kein Zurücklegen innerhalb einer Zelle ?! + cell_profile_ids = get_cell_demand_profile_ids(df_cell, pool_size, df_profiles) + + df_cell_demand_metadata.at[cell_id, 'cell_profile_ids'] = cell_profile_ids + df_cell_demand_metadata.at[cell_id, 'nuts3'] = df_cell.loc[:, 'nuts3'].unique()[0] + df_cell_demand_metadata.at[cell_id, 'nuts1'] = df_cell.loc[:, 'nuts1'].unique()[0] + + return df_cell_demand_metadata + + +# can be parallelized with grouping df_zensus_cells by grid_id/nuts3/nuts1 +def adjust_to_demand_regio_nuts3_annual(df_cell_demand_metadata, df_profiles, df_demand_regio): + """computes the profile scaling factor by accumulated nuts3 cells and demand_regio data""" + for nuts3_id, df_nuts3 in df_cell_demand_metadata.groupby(by='nuts3'): + nuts3_cell_ids = df_nuts3.index + nuts3_profile_ids = df_nuts3.loc[:, 'cell_profile_ids'].sum() + + # take all profiles of one nuts3, aggregate and sum + # profiles in Wh + nuts3_profiles_sum_annual = df_profiles.loc[:, nuts3_profile_ids].sum().sum() + + # Scaling Factor + # ############## + # demand regio in MWh + # profiles in Wh + df_cell_demand_metadata.loc[nuts3_cell_ids, '2035_factor'] = df_demand_regio.loc[ + (2035, 
nuts3_id), 'demand_mwha'] * 1e3 / ( + nuts3_profiles_sum_annual / 1e3) + df_cell_demand_metadata.loc[nuts3_cell_ids, '2050_factor'] = df_demand_regio.loc[ + (2050, nuts3_id), 'demand_mwha'] * 1e3 / ( + nuts3_profiles_sum_annual / 1e3) + + return df_cell_demand_metadata + + +def get_load_area_max_load(df_profiles, df_cell_demand_metadata, load_area_ids, year): + """get max value of load area demand profile""" + timesteps = len(df_profiles) + full_load = pd.Series(data=np.zeros(timesteps), dtype=np.float64, index=range(timesteps)) + load_area_meta = df_cell_demand_metadata.loc[load_area_ids, ['cell_profile_ids', 'nuts3', f'{year}_factor']] + for (nuts3, factor), df in load_area_meta.groupby(by=['nuts3', f'{year}_factor']): + part_load = df_profiles.loc[:, df['cell_profile_ids'].sum()].sum(axis=1) * factor / 1e3 # profiles in Wh + full_load = full_load.add(part_load) + return full_load.max() #, full_load.idxmax() From 16672b3de5d4827960480fed40125293aa658a2e Mon Sep 17 00:00:00 2001 From: "Julian.Endres" Date: Fri, 21 May 2021 01:14:17 +0200 Subject: [PATCH 02/97] add note for missing files #256 --- src/egon/data/processing/hh_demand/hh_demand_profiles.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/egon/data/processing/hh_demand/hh_demand_profiles.py b/src/egon/data/processing/hh_demand/hh_demand_profiles.py index f724fa86a..9c1cae0c2 100644 --- a/src/egon/data/processing/hh_demand/hh_demand_profiles.py +++ b/src/egon/data/processing/hh_demand/hh_demand_profiles.py @@ -13,6 +13,7 @@ # ################### # in Wh # TODO: > to/from SQL? +# filed needs to be placed manually in directory file = 'h0_profiles.h5' file = os.path.join(os.path.realpath(file)) df_profiles = pd.read_hdf(file) @@ -24,6 +25,7 @@ # Load Zensus data at nuts-level # ################### # TODO: > to/from SQL? 
+# filed needs to be placed manually in directory file = 'Zensus2011_Personen.csv' file = os.path.join(os.path.realpath(file)) df_zensus = pd.read_csv(file, sep=';', decimal='.', skiprows=5, skipfooter=7, From ef1e667c674c7a9c939022be2cfe7367ef3849e6 Mon Sep 17 00:00:00 2001 From: "Julian.Endres" Date: Sat, 22 May 2021 14:56:33 +0200 Subject: [PATCH 03/97] add TODO for db.engine() #256 --- .../processing/hh_demand/hh_demand_profiles.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/egon/data/processing/hh_demand/hh_demand_profiles.py b/src/egon/data/processing/hh_demand/hh_demand_profiles.py index 9c1cae0c2..a5cb3ac55 100644 --- a/src/egon/data/processing/hh_demand/hh_demand_profiles.py +++ b/src/egon/data/processing/hh_demand/hh_demand_profiles.py @@ -82,6 +82,7 @@ ('Paare ohne Kind(er)', '5 Personen', 'Adults'), ('Paare ohne Kind(er)', '6 und mehr Personen', 'Adults')], # no info about share of kids # TODO: maybe remove following lines if not needed + # 'O1': [('Mehrpersonenhaushalte ohne Kernfamilie', '3 Personen', 'Adults'), # ('Mehrpersonenhaushalte ohne Kernfamilie', '4 Personen', 'Adults'), # ('Mehrpersonenhaushalte ohne Kernfamilie', '5 Personen', 'Adults'), @@ -130,7 +131,8 @@ # derivate households data from inhabitants data by compound number of people per household type df_dist_households = hh_tools.inhabitants_to_households(df_hh_types_nad_abs, mapping_people_in_households) - +# TODO: direct db.engine to configuration file +# engine = db.engine() # SQL - Access Zensus household data cell-level df_households_typ = db.select_dataframe(sql=""" SELECT grid_id, attribute, characteristics_code, characteristics_text, quantity @@ -190,16 +192,16 @@ GROUP BY nuts3, year ORDER BY year""", index_col=['year', 'nuts3']) -# testcase +# # testcase # test_data = df_zensus_cells.groupby('nuts3').get_group('DEF03') # test_data = pd.concat([df_zensus_cells.groupby('nuts3').get_group('DEF03'), # 
df_zensus_cells.groupby('nuts3').get_group('DEF06')]) -# +# # df_cell_demand_metadata = hh_tools.get_cell_demand_metadata(test_data, df_profiles) # df_cell_demand_metadata = hh_tools.adjust_to_demand_regio_nuts3_annual(df_cell_demand_metadata, df_profiles, df_demand_regio) -# -# -# +# +# +# # import random # load_area_ids = random.sample(list(df_cell_demand_metadata.index), 100) # max_value_load_area = hh_tools.get_load_area_max_load(df_profiles, df_cell_demand_metadata, load_area_ids, 2034) From c0f1a2f329b0be806623161498098793b8a46655 Mon Sep 17 00:00:00 2001 From: "Julian.Endres" Date: Tue, 25 May 2021 11:49:37 +0200 Subject: [PATCH 04/97] fix type in year --- src/egon/data/processing/hh_demand/hh_demand_profiles.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/egon/data/processing/hh_demand/hh_demand_profiles.py b/src/egon/data/processing/hh_demand/hh_demand_profiles.py index a5cb3ac55..b310955d5 100644 --- a/src/egon/data/processing/hh_demand/hh_demand_profiles.py +++ b/src/egon/data/processing/hh_demand/hh_demand_profiles.py @@ -203,8 +203,8 @@ # # # import random -# load_area_ids = random.sample(list(df_cell_demand_metadata.index), 100) -# max_value_load_area = hh_tools.get_load_area_max_load(df_profiles, df_cell_demand_metadata, load_area_ids, 2034) +# load_area_cell_ids = random.sample(list(df_cell_demand_metadata.index), 100) +# max_value_load_area = hh_tools.get_load_area_max_load(df_profiles, df_cell_demand_metadata, load_area_cell_ids, 2035) # # print(df_cell_demand_metadata.shape) # print(max_value_load_area) From 5710ce91869bc71a5f689e356f0ce7ff304fd1fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guido=20Ple=C3=9Fmann?= Date: Tue, 25 May 2021 15:56:53 +0200 Subject: [PATCH 05/97] Put code in if _name__ --- .../hh_demand/hh_demand_profiles.py | 404 +++++++++--------- 1 file changed, 202 insertions(+), 202 deletions(-) diff --git a/src/egon/data/processing/hh_demand/hh_demand_profiles.py 
b/src/egon/data/processing/hh_demand/hh_demand_profiles.py index b310955d5..ea74cb906 100644 --- a/src/egon/data/processing/hh_demand/hh_demand_profiles.py +++ b/src/egon/data/processing/hh_demand/hh_demand_profiles.py @@ -5,207 +5,207 @@ import pandas as pd from egon.data import db -import hh_demand_profiles_tools as hh_tools - - - -# Loadprofilesdump -# ################### -# in Wh -# TODO: > to/from SQL? -# filed needs to be placed manually in directory -file = 'h0_profiles.h5' -file = os.path.join(os.path.realpath(file)) -df_profiles = pd.read_hdf(file) - -# set multiindex to HH_types -df_profiles.columns = pd.MultiIndex.from_arrays([df_profiles.columns.str[:2], df_profiles.columns.str[3:]]) - - -# Load Zensus data at nuts-level -# ################### -# TODO: > to/from SQL? -# filed needs to be placed manually in directory -file = 'Zensus2011_Personen.csv' -file = os.path.join(os.path.realpath(file)) -df_zensus = pd.read_csv(file, sep=';', decimal='.', skiprows=5, skipfooter=7, - index_col=[0, 1], header=[0, 1], encoding='latin1', engine='python') - -# clean data -df_zensus = df_zensus.applymap(hh_tools.clean).applymap(int) -# preprocess nuts1 zensus data -df_zensus = hh_tools.process_nuts1_zensus_data(df_zensus) - -# ## Household distribution -# - Adults living in househould type -# - number of kids not included even if in housholdtype name -# **! 
The Eurostat data only gives the amount of adults/seniors, excluding the amount of kids <15** -# eurostat is used for demand-profile-generator @fraunhofer - -# hh shares mapping zensus to eurostat -hh_types = {'SR': [('Einpersonenhaushalte (Singlehaushalte)', 'Insgesamt', 'Seniors'), - ('Alleinerziehende Elternteile', 'Insgesamt', 'Seniors')], # Single Seniors Single Parents Seniors - 'SO': [('Einpersonenhaushalte (Singlehaushalte)', 'Insgesamt', 'Adults')], # Single Adults - 'SK': [('Alleinerziehende Elternteile', 'Insgesamt', 'Adults')], # Single Parents Adult - 'PR': [('Paare ohne Kind(er)', '2 Personen', 'Seniors'), - ('Mehrpersonenhaushalte ohne Kernfamilie', '2 Personen', 'Seniors')], - # Couples without Kids Senior & same sex couples & shared flat seniors - 'PO': [('Paare ohne Kind(er)', '2 Personen', 'Adults'), - ('Mehrpersonenhaushalte ohne Kernfamilie', '2 Personen', 'Adults')], - # Couples without Kids adults & same sex couples & shared flat adults - 'P1': [('Paare mit Kind(ern)', '3 Personen', 'Adults')], - 'P2': [('Paare mit Kind(ern)', '4 Personen', 'Adults')], - 'P3': [('Paare mit Kind(ern)', '5 Personen', 'Adults'), - ('Paare mit Kind(ern)', '6 und mehr Personen', 'Adults')], - 'OR': [('Mehrpersonenhaushalte ohne Kernfamilie', '3 Personen', 'Seniors'), - ('Mehrpersonenhaushalte ohne Kernfamilie', '4 Personen', 'Seniors'), - ('Mehrpersonenhaushalte ohne Kernfamilie', '5 Personen', 'Seniors'), - ('Mehrpersonenhaushalte ohne Kernfamilie', '6 und mehr Personen', 'Seniors'), - ('Paare mit Kind(ern)', '3 Personen', 'Seniors'), - ('Paare ohne Kind(er)', '3 Personen', 'Seniors'), - ('Paare mit Kind(ern)', '4 Personen', 'Seniors'), - ('Paare ohne Kind(er)', '4 Personen', 'Seniors'), - ('Paare mit Kind(ern)', '5 Personen', 'Seniors'), - ('Paare ohne Kind(er)', '5 Personen', 'Seniors'), - ('Paare mit Kind(ern)', '6 und mehr Personen', 'Seniors'), - ('Paare ohne Kind(er)', '6 und mehr Personen', 'Seniors')], # no info about share of kids - - # OO, O1, O2 have 
the same amount, as no information about the share of kids within zensus data set. - # if needed the total amount can be corrected in the hh_tools.get_hh_dist function - # using multi_adjust=True option - 'OO': [('Mehrpersonenhaushalte ohne Kernfamilie', '3 Personen', 'Adults'), - ('Mehrpersonenhaushalte ohne Kernfamilie', '4 Personen', 'Adults'), - ('Mehrpersonenhaushalte ohne Kernfamilie', '5 Personen', 'Adults'), - ('Mehrpersonenhaushalte ohne Kernfamilie', '6 und mehr Personen', 'Adults'), - ('Paare ohne Kind(er)', '3 Personen', 'Adults'), - ('Paare ohne Kind(er)', '4 Personen', 'Adults'), - ('Paare ohne Kind(er)', '5 Personen', 'Adults'), - ('Paare ohne Kind(er)', '6 und mehr Personen', 'Adults')], # no info about share of kids - # TODO: maybe remove following lines if not needed - - # 'O1': [('Mehrpersonenhaushalte ohne Kernfamilie', '3 Personen', 'Adults'), - # ('Mehrpersonenhaushalte ohne Kernfamilie', '4 Personen', 'Adults'), - # ('Mehrpersonenhaushalte ohne Kernfamilie', '5 Personen', 'Adults'), - # ('Mehrpersonenhaushalte ohne Kernfamilie', '6 und mehr Personen', 'Adults'), - # ('Paare ohne Kind(er)', '3 Personen', 'Adults'), - # ('Paare ohne Kind(er)', '4 Personen', 'Adults'), - # ('Paare ohne Kind(er)', '5 Personen', 'Adults'), - # ('Paare ohne Kind(er)', '6 und mehr Personen', 'Adults')], # no info about share of kids - # 'O2': [('Mehrpersonenhaushalte ohne Kernfamilie', '3 Personen', 'Adults'), - # ('Mehrpersonenhaushalte ohne Kernfamilie', '4 Personen', 'Adults'), - # ('Mehrpersonenhaushalte ohne Kernfamilie', '5 Personen', 'Adults'), - # ('Mehrpersonenhaushalte ohne Kernfamilie', '6 und mehr Personen', 'Adults'), - # ('Paare ohne Kind(er)', '3 Personen', 'Adults'), - # ('Paare ohne Kind(er)', '4 Personen', 'Adults'), - # ('Paare ohne Kind(er)', '5 Personen', 'Adults'), - # ('Paare ohne Kind(er)', '6 und mehr Personen', 'Adults')] - } - -# distribution of people by household @eurostats -# df_hh_types_D = pd.Series({'SR': 0.083, 'SO': 0.158, 'SK': 
0.022, -# 'PR': 0.145, 'PO': 0.203, 'P1': 0.081, 'P2': 0.077, 'P3': 0.024, -# 'OR': 0.023, 'OO': 0.13, 'O1': 0.04, 'O2': 0.015}) - -# hh_tools.get_hh_dist without eurostat adjustment for O1-03 Groups in absolute values -df_hh_types_nad_abs = hh_tools.get_hh_dist(df_zensus, hh_types, multi_adjust=False, relative=False) - -# ######################### -# FIXME: -# mapping needs to be adjusted for OR, OO, O1, O2 -# -# ########################### - -mapping_people_in_households = {'SR': 1, - 'SO': 1, - 'SK': 1, # kids are excluded - 'PR': 2, - 'PO': 2, - 'P1': 2, # kids are excluded - 'P2': 2, # "" - 'P3': 2, # "" - 'OR': 4, # parameter needs to be re/defined - 'OO': 4, # "" - # 'O1': 4, # "" - # 'O2': 4, # "" - } -# derivate households data from inhabitants data by compound number of people per household type -df_dist_households = hh_tools.inhabitants_to_households(df_hh_types_nad_abs, mapping_people_in_households) - -# TODO: direct db.engine to configuration file -# engine = db.engine() -# SQL - Access Zensus household data cell-level -df_households_typ = db.select_dataframe(sql=""" - SELECT grid_id, attribute, characteristics_code, characteristics_text, quantity - FROM society.destatis_zensus_household_per_ha - WHERE attribute = 'HHTYP_FAM' """) -df_households_typ = df_households_typ.drop(columns=['attribute', 'characteristics_text']) -df_households_typ = df_households_typ.rename(columns={'quantity': 'hh_5types'}) - -mapping_zensus_hh_subgroups = {1: ['SR', 'SO'], - 2: ['PR', 'PO'], - 3: ['SK'], - 4: ['P1', 'P2', 'P3'], - 5: ['OR', 'OO'], - } - -for value in mapping_zensus_hh_subgroups.values(): - df_dist_households.loc[value] = df_dist_households.loc[value].div(df_dist_households.loc[value].sum()) - -# SQL- create table to map cells to nuts3 and nuts1 -df_grid_id = db.select_dataframe(sql=""" - SELECT pop.grid_id, pop.gid, vg250.vg250_nuts3 as nuts3, lan.nuts as nuts1, lan.gen - FROM society.destatis_zensus_population_per_ha_inside_germany as pop - LEFT JOIN 
boundaries.egon_map_zensus_vg250 as vg250 - ON (pop.gid=vg250.zensus_population_id) - LEFT JOIN boundaries.vg250_lan as lan - ON (LEFT(vg250.vg250_nuts3, 3)=lan.nuts) """) -df_grid_id = df_grid_id.drop_duplicates() -df_grid_id = df_grid_id.reset_index(drop=True) - -# merge nuts info to zensus cell level data -# how='inner' is used as ids of unpopulated areas are removed df_grid_id or earliers tables. see here: -# https://github.com/openego/eGon-data/blob/59195926e41c8bd6d1ca8426957b97f33ef27bcc/src/egon/data/importing/zensus/__init__.py#L418-L449 -df_households_typ = pd.merge(df_households_typ, df_grid_id[['grid_id', 'gen', 'nuts1', 'nuts3']], - left_on='grid_id', right_on='grid_id', how='inner') - -# Merge Zensus nuts level household data with zensus cell level by dividing hh-groups with mapping_zensus_hh_subgroups -df_zensus_cells = pd.DataFrame() -for (country, code), df_country_type in df_households_typ.groupby(['gen', 'characteristics_code']): - - # iterate over zenus_country subgroups - for typ in mapping_zensus_hh_subgroups[code]: - df_country_type['hh_type'] = typ - df_country_type['factor'] = df_dist_households.loc[typ, country] - df_country_type['hh_10types'] = df_country_type['hh_5types'] * df_dist_households.loc[typ, country] - df_zensus_cells = df_zensus_cells.append(df_country_type, ignore_index=True) - -df_zensus_cells = df_zensus_cells.sort_values(by=['grid_id', 'characteristics_code']).reset_index(drop=True) - -# change profile numbers to int -df_profiles.columns = pd.MultiIndex.from_tuples([(a, int(b)) for a, b in df_profiles.columns]) - -pool_size = df_profiles.groupby(level=0, axis=1).size() - -df_demand_regio = db.select_dataframe(sql=""" - SELECT year, nuts3, SUM (demand) as demand_mWha - FROM demand.egon_demandregio_hh as egon_d - GROUP BY nuts3, year - ORDER BY year""", index_col=['year', 'nuts3']) - -# # testcase -# test_data = df_zensus_cells.groupby('nuts3').get_group('DEF03') -# test_data = 
pd.concat([df_zensus_cells.groupby('nuts3').get_group('DEF03'), -# df_zensus_cells.groupby('nuts3').get_group('DEF06')]) -# -# df_cell_demand_metadata = hh_tools.get_cell_demand_metadata(test_data, df_profiles) -# df_cell_demand_metadata = hh_tools.adjust_to_demand_regio_nuts3_annual(df_cell_demand_metadata, df_profiles, df_demand_regio) -# -# -# -# import random -# load_area_cell_ids = random.sample(list(df_cell_demand_metadata.index), 100) -# max_value_load_area = hh_tools.get_load_area_max_load(df_profiles, df_cell_demand_metadata, load_area_cell_ids, 2035) -# # print(df_cell_demand_metadata.shape) -# print(max_value_load_area) +from egon.data.processing.hh_demand import hh_demand_profiles_tools as hh_tools + + +if __name__ == "__main__": + # Loadprofilesdump + # ################### + # in Wh + # TODO: > to/from SQL? + # filed needs to be placed manually in directory + file = 'h0_profiles.h5' + file = os.path.join(os.path.realpath(file)) + df_profiles = pd.read_hdf(file) + + # set multiindex to HH_types + df_profiles.columns = pd.MultiIndex.from_arrays([df_profiles.columns.str[:2], df_profiles.columns.str[3:]]) + + + # Load Zensus data at nuts-level + # ################### + # TODO: > to/from SQL? + # filed needs to be placed manually in directory + file = 'Zensus2011_Personen.csv' + file = os.path.join(os.path.realpath(file)) + df_zensus = pd.read_csv(file, sep=';', decimal='.', skiprows=5, skipfooter=7, + index_col=[0, 1], header=[0, 1], encoding='latin1', engine='python') + + # clean data + df_zensus = df_zensus.applymap(hh_tools.clean).applymap(int) + # preprocess nuts1 zensus data + df_zensus = hh_tools.process_nuts1_zensus_data(df_zensus) + + # ## Household distribution + # - Adults living in househould type + # - number of kids not included even if in housholdtype name + # **! 
The Eurostat data only gives the amount of adults/seniors, excluding the amount of kids <15** + # eurostat is used for demand-profile-generator @fraunhofer + + # hh shares mapping zensus to eurostat + hh_types = {'SR': [('Einpersonenhaushalte (Singlehaushalte)', 'Insgesamt', 'Seniors'), + ('Alleinerziehende Elternteile', 'Insgesamt', 'Seniors')], # Single Seniors Single Parents Seniors + 'SO': [('Einpersonenhaushalte (Singlehaushalte)', 'Insgesamt', 'Adults')], # Single Adults + 'SK': [('Alleinerziehende Elternteile', 'Insgesamt', 'Adults')], # Single Parents Adult + 'PR': [('Paare ohne Kind(er)', '2 Personen', 'Seniors'), + ('Mehrpersonenhaushalte ohne Kernfamilie', '2 Personen', 'Seniors')], + # Couples without Kids Senior & same sex couples & shared flat seniors + 'PO': [('Paare ohne Kind(er)', '2 Personen', 'Adults'), + ('Mehrpersonenhaushalte ohne Kernfamilie', '2 Personen', 'Adults')], + # Couples without Kids adults & same sex couples & shared flat adults + 'P1': [('Paare mit Kind(ern)', '3 Personen', 'Adults')], + 'P2': [('Paare mit Kind(ern)', '4 Personen', 'Adults')], + 'P3': [('Paare mit Kind(ern)', '5 Personen', 'Adults'), + ('Paare mit Kind(ern)', '6 und mehr Personen', 'Adults')], + 'OR': [('Mehrpersonenhaushalte ohne Kernfamilie', '3 Personen', 'Seniors'), + ('Mehrpersonenhaushalte ohne Kernfamilie', '4 Personen', 'Seniors'), + ('Mehrpersonenhaushalte ohne Kernfamilie', '5 Personen', 'Seniors'), + ('Mehrpersonenhaushalte ohne Kernfamilie', '6 und mehr Personen', 'Seniors'), + ('Paare mit Kind(ern)', '3 Personen', 'Seniors'), + ('Paare ohne Kind(er)', '3 Personen', 'Seniors'), + ('Paare mit Kind(ern)', '4 Personen', 'Seniors'), + ('Paare ohne Kind(er)', '4 Personen', 'Seniors'), + ('Paare mit Kind(ern)', '5 Personen', 'Seniors'), + ('Paare ohne Kind(er)', '5 Personen', 'Seniors'), + ('Paare mit Kind(ern)', '6 und mehr Personen', 'Seniors'), + ('Paare ohne Kind(er)', '6 und mehr Personen', 'Seniors')], # no info about share of kids + + # OO, O1, O2 
have the same amount, as no information about the share of kids within zensus data set. + # if needed the total amount can be corrected in the hh_tools.get_hh_dist function + # using multi_adjust=True option + 'OO': [('Mehrpersonenhaushalte ohne Kernfamilie', '3 Personen', 'Adults'), + ('Mehrpersonenhaushalte ohne Kernfamilie', '4 Personen', 'Adults'), + ('Mehrpersonenhaushalte ohne Kernfamilie', '5 Personen', 'Adults'), + ('Mehrpersonenhaushalte ohne Kernfamilie', '6 und mehr Personen', 'Adults'), + ('Paare ohne Kind(er)', '3 Personen', 'Adults'), + ('Paare ohne Kind(er)', '4 Personen', 'Adults'), + ('Paare ohne Kind(er)', '5 Personen', 'Adults'), + ('Paare ohne Kind(er)', '6 und mehr Personen', 'Adults')], # no info about share of kids + # TODO: maybe remove following lines if not needed + + # 'O1': [('Mehrpersonenhaushalte ohne Kernfamilie', '3 Personen', 'Adults'), + # ('Mehrpersonenhaushalte ohne Kernfamilie', '4 Personen', 'Adults'), + # ('Mehrpersonenhaushalte ohne Kernfamilie', '5 Personen', 'Adults'), + # ('Mehrpersonenhaushalte ohne Kernfamilie', '6 und mehr Personen', 'Adults'), + # ('Paare ohne Kind(er)', '3 Personen', 'Adults'), + # ('Paare ohne Kind(er)', '4 Personen', 'Adults'), + # ('Paare ohne Kind(er)', '5 Personen', 'Adults'), + # ('Paare ohne Kind(er)', '6 und mehr Personen', 'Adults')], # no info about share of kids + # 'O2': [('Mehrpersonenhaushalte ohne Kernfamilie', '3 Personen', 'Adults'), + # ('Mehrpersonenhaushalte ohne Kernfamilie', '4 Personen', 'Adults'), + # ('Mehrpersonenhaushalte ohne Kernfamilie', '5 Personen', 'Adults'), + # ('Mehrpersonenhaushalte ohne Kernfamilie', '6 und mehr Personen', 'Adults'), + # ('Paare ohne Kind(er)', '3 Personen', 'Adults'), + # ('Paare ohne Kind(er)', '4 Personen', 'Adults'), + # ('Paare ohne Kind(er)', '5 Personen', 'Adults'), + # ('Paare ohne Kind(er)', '6 und mehr Personen', 'Adults')] + } + + # distribution of people by household @eurostats + # df_hh_types_D = pd.Series({'SR': 0.083, 'SO': 0.158, 
'SK': 0.022, + # 'PR': 0.145, 'PO': 0.203, 'P1': 0.081, 'P2': 0.077, 'P3': 0.024, + # 'OR': 0.023, 'OO': 0.13, 'O1': 0.04, 'O2': 0.015}) + + # hh_tools.get_hh_dist without eurostat adjustment for O1-03 Groups in absolute values + df_hh_types_nad_abs = hh_tools.get_hh_dist(df_zensus, hh_types, multi_adjust=False, relative=False) + + # ######################### + # FIXME: + # mapping needs to be adjusted for OR, OO, O1, O2 + # + # ########################### + + mapping_people_in_households = {'SR': 1, + 'SO': 1, + 'SK': 1, # kids are excluded + 'PR': 2, + 'PO': 2, + 'P1': 2, # kids are excluded + 'P2': 2, # "" + 'P3': 2, # "" + 'OR': 4, # parameter needs to be re/defined + 'OO': 4, # "" + # 'O1': 4, # "" + # 'O2': 4, # "" + } + # derivate households data from inhabitants data by compound number of people per household type + df_dist_households = hh_tools.inhabitants_to_households(df_hh_types_nad_abs, mapping_people_in_households) + + # TODO: direct db.engine to configuration file + # engine = db.engine() + # SQL - Access Zensus household data cell-level + df_households_typ = db.select_dataframe(sql=""" + SELECT grid_id, attribute, characteristics_code, characteristics_text, quantity + FROM society.destatis_zensus_household_per_ha + WHERE attribute = 'HHTYP_FAM' """) + df_households_typ = df_households_typ.drop(columns=['attribute', 'characteristics_text']) + df_households_typ = df_households_typ.rename(columns={'quantity': 'hh_5types'}) + + mapping_zensus_hh_subgroups = {1: ['SR', 'SO'], + 2: ['PR', 'PO'], + 3: ['SK'], + 4: ['P1', 'P2', 'P3'], + 5: ['OR', 'OO'], + } + + for value in mapping_zensus_hh_subgroups.values(): + df_dist_households.loc[value] = df_dist_households.loc[value].div(df_dist_households.loc[value].sum()) + + # SQL- create table to map cells to nuts3 and nuts1 + df_grid_id = db.select_dataframe(sql=""" + SELECT pop.grid_id, pop.gid, vg250.vg250_nuts3 as nuts3, lan.nuts as nuts1, lan.gen + FROM 
society.destatis_zensus_population_per_ha_inside_germany as pop + LEFT JOIN boundaries.egon_map_zensus_vg250 as vg250 + ON (pop.gid=vg250.zensus_population_id) + LEFT JOIN boundaries.vg250_lan as lan + ON (LEFT(vg250.vg250_nuts3, 3)=lan.nuts) """) + df_grid_id = df_grid_id.drop_duplicates() + df_grid_id = df_grid_id.reset_index(drop=True) + + # merge nuts info to zensus cell level data + # how='inner' is used as ids of unpopulated areas are removed df_grid_id or earliers tables. see here: + # https://github.com/openego/eGon-data/blob/59195926e41c8bd6d1ca8426957b97f33ef27bcc/src/egon/data/importing/zensus/__init__.py#L418-L449 + df_households_typ = pd.merge(df_households_typ, df_grid_id[['grid_id', 'gen', 'nuts1', 'nuts3']], + left_on='grid_id', right_on='grid_id', how='inner') + + # Merge Zensus nuts level household data with zensus cell level by dividing hh-groups with mapping_zensus_hh_subgroups + df_zensus_cells = pd.DataFrame() + for (country, code), df_country_type in df_households_typ.groupby(['gen', 'characteristics_code']): + + # iterate over zenus_country subgroups + for typ in mapping_zensus_hh_subgroups[code]: + df_country_type['hh_type'] = typ + df_country_type['factor'] = df_dist_households.loc[typ, country] + df_country_type['hh_10types'] = df_country_type['hh_5types'] * df_dist_households.loc[typ, country] + df_zensus_cells = df_zensus_cells.append(df_country_type, ignore_index=True) + + df_zensus_cells = df_zensus_cells.sort_values(by=['grid_id', 'characteristics_code']).reset_index(drop=True) + + # change profile numbers to int + df_profiles.columns = pd.MultiIndex.from_tuples([(a, int(b)) for a, b in df_profiles.columns]) + + pool_size = df_profiles.groupby(level=0, axis=1).size() + + df_demand_regio = db.select_dataframe(sql=""" + SELECT year, nuts3, SUM (demand) as demand_mWha + FROM demand.egon_demandregio_hh as egon_d + GROUP BY nuts3, year + ORDER BY year""", index_col=['year', 'nuts3']) + + # # testcase + # test_data = 
df_zensus_cells.groupby('nuts3').get_group('DEF03') + # test_data = pd.concat([df_zensus_cells.groupby('nuts3').get_group('DEF03'), + # df_zensus_cells.groupby('nuts3').get_group('DEF06')]) + # + # df_cell_demand_metadata = hh_tools.get_cell_demand_metadata(test_data, df_profiles) + # df_cell_demand_metadata = hh_tools.adjust_to_demand_regio_nuts3_annual(df_cell_demand_metadata, df_profiles, df_demand_regio) + # + # + # + # import random + # load_area_cell_ids = random.sample(list(df_cell_demand_metadata.index), 100) + # max_value_load_area = hh_tools.get_load_area_max_load(df_profiles, df_cell_demand_metadata, load_area_cell_ids, 2035) + # # print(df_cell_demand_metadata.shape) + # print(max_value_load_area) From 1ac40fedbe2cb9931990e1ec21a95c75f679bfb8 Mon Sep 17 00:00:00 2001 From: "Julian.Endres" Date: Tue, 25 May 2021 17:21:59 +0200 Subject: [PATCH 06/97] add description about data sources concerning files to be downloaded --- .../hh_demand/hh_demand_profiles_tools.py | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py b/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py index 56613300f..1e08ab5e5 100644 --- a/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py +++ b/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py @@ -200,3 +200,29 @@ def get_load_area_max_load(df_profiles, df_cell_demand_metadata, load_area_ids, part_load = df_profiles.loc[:, df['cell_profile_ids'].sum()].sum(axis=1) * factor / 1e3 # profiles in Wh full_load = full_load.add(part_load) return full_load.max() #, full_load.idxmax() + + +def download_files(): + """ + 1. 'h0_profiles.h5' + Households demand profiles generated by Fraunhofer IWES + Methodology is described in: https://www.researchgate.net/publication/273775902_Erzeugung_zeitlich_hochaufgeloster_Stromlastprofile_fur_verschiedene_Haushaltstypen + used and further describer in the thesisis by: + 1. 
Jonas Haack + "Auswirkungen verschiedener Haushaltslastprofile auf PV-Batterie-Systeme" (confidential) + 2. Simon Ruben Drauz + "Synthesis of a heat and electrical load profile for single and multi-family houses used for subsequent + performance tests of a multi-component energy system" + http://dx.doi.org/10.13140/RG.2.2.13959.14248 + + 2. 'Zensus2011_Personen.csv' (does not exist in this format anymore but in different format) + Dataset describing the amount of people living by a certain types of family-types, age-classes, + sex and size of household in Germany in state-resolution. + Data from: https://ergebnisse2011.zensus2022.de/datenbank/online + - Search for: "1000A-2029" + - or choose topic: "Bevölkerung kompakt" + - Choose table code: "1000A-2029" with title "Personen: Alter (11 Altersklassen)/Geschlecht/Größe des + privaten Haushalts - Typ des privaten Haushalts (nach Familien/Lebensform)" + - Change setting "GEOLK1" to "Bundesländer (16)" + higher resolution "Landkreise und kreisfreie Städte (412)" only accessible after registration. + """ From da582bd3b2420ce0a3648b437faa63dd1f9b7726 Mon Sep 17 00:00:00 2001 From: "Julian.Endres" Date: Tue, 25 May 2021 17:55:48 +0200 Subject: [PATCH 07/97] add docstrings and remove not used functions --- .../hh_demand/hh_demand_profiles_tools.py | 91 +++++++++---------- 1 file changed, 45 insertions(+), 46 deletions(-) diff --git a/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py b/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py index 1e08ab5e5..2a4d9f1d7 100644 --- a/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py +++ b/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py @@ -9,6 +9,7 @@ def clean(x): + """clean dataset convert '.' and '-' to str(0). remove brackets. 
table will be converted to int/floats afterwards""" x = str(x).replace('-', str(0)) x = str(x).replace('.', str(0)) x = x.strip('()') @@ -16,6 +17,24 @@ def clean(x): def get_hh_dist(df_zensus, hh_types, multi_adjust=True, relative=True): + """group zensus data to fit Demand-Profile-Generator (DPG) format. + Parameters + ---------- + df_zensus: pd.DataFrame + containing zensus data + hh_types: dict + mapping of zensus groups to DPG groups + multi-adjust: bool + if True, splits DPG-group 'OO' into 3 subgroups and uses distribution factor derived by + table II in https://www.researchgate.net/publication/273775902_Erzeugung_zeitlich_hochaufgeloster_Stromlastprofile_fur_verschiedene_Haushaltstypen + relative: bool + if True produces relative values + Returns + ---------- + df_hh_types: pd.DataFrame + distribution of people by household type and regional-resolution + !data still needs to be converted from amount of people to amount of households! + """ # adjust multi with/without kids via eurostat share as not clearly derivable without infos about share of kids if multi_adjust: adjust = {'SR': 1, 'SO': 1, 'SK': 1, 'PR': 1, 'PO': 1, 'P1': 1, 'P2': 1, 'P3': 1, 'OR': 1, @@ -35,51 +54,23 @@ def get_hh_dist(df_zensus, hh_types, multi_adjust=True, relative=True): return df_hh_types.T -def get_loadprofiles(df_profiles, df_hh_types, hh_total, state_dist=False): - # equal share of hh_total for every state - if not state_dist: - state_dist = pd.Series(hh_total / df_hh_types.shape[1], index=df_hh_types.columns) - # specific share of hh_total by state_dist - else: - state_dist = state_dist * hh_total - - header = pd.MultiIndex.from_tuples( - [(state, hh_type) for state in df_hh_types.columns for hh_type in df_hh_types.index], - names=['State', 'Type']) - - df_loadprofiles = pd.DataFrame(columns=header) - - for state, state_share in state_dist.items(): - - for hh_type, type_share in df_hh_types[state].items(): - samples = state_share * type_share - # if samples<1: - # raise 
ValueError(f"Sample size needs to be >=1. hh-share of {state}-{hh_type} is {samples:.2f} , increase hh_total") - samples = int(samples) - hh_type_ts = df_profiles[hh_type].T.sample(samples, axis=0, replace=True).sum() - - df_loadprofiles.loc[:, (state, hh_type)] = hh_type_ts - - timestamp = pd.date_range(start='01-01-2012', periods=df_loadprofiles.shape[0], freq='h') - df_loadprofiles.index = timestamp - return df_loadprofiles - - -def normalize_loadprofiles(df_lp, to_value): - # normed to 'to_value' kWh annual - df_kwh = df_lp.groupby(level='State', axis=1).sum() * to_value / df_lp.groupby(level='State', axis=1).sum().sum() - - return df_kwh - - -def aggregate_loadprofiles(df_lp, to_value): - # normed to 'to_value' kWh annual - df_kwh = df_lp.sum(axis=1) * to_value / df_lp.sum(axis=1).sum() - - return df_kwh - - def inhabitants_to_households(df_people_by_householdtypes_abs, mapping_people_in_households): + """converts distribution of peoples living in types of households to distribution of household types by using + a people-in-household mapping. results are rounded to int (ceiled) to full households. 
+ + Parameters + ---------- + df_people_by_householdtypes_abs: pd.DataFrame + distribution of people living in households + mapping_people_in_households: dict + mapping of people living in certain types of households + Returns + ---------- + df_households_by_type: pd.DataFrame + distribution of households type + + """ + # compare categories and remove form mapping if to many diff = set(df_people_by_householdtypes_abs.index) ^ set(mapping_people_in_households.keys()) if bool(diff): @@ -88,13 +79,21 @@ def inhabitants_to_households(df_people_by_householdtypes_abs, mapping_people_in del mapping_people_in_households[key] print(f'Removed {diff} from mapping!') + # divide amount of people by people in household types df_households_by_type = df_people_by_householdtypes_abs.div(mapping_people_in_households, axis=0) - df_households_by_type = df_households_by_type.apply(np.ceil) # round up households + # TODO: check @ Guido + # round up households + df_households_by_type = df_households_by_type.apply(np.ceil) return df_households_by_type def process_nuts1_zensus_data(df_zensus): + """group, remove and reorder categories wich are not needed for demand-profile-generator (DPG) + Kids (<15) are excluded as they are also excluded in DPG origin dataset. + Adults (15<65) + Seniors (<65) + """ # Group data to fit Load Profile Generator categories # define kids/adults/seniors kids = ['Unter 3', '3 - 5', '6 - 14'] # < 15 @@ -156,7 +155,7 @@ def get_cell_demand_metadata(df_zensus_cells, df_profiles): for cell_id, df_cell in df_zensus_cells.groupby(by='grid_id'): # FIXME # ! runden der Haushaltszahlen auf int - # ! kein Zurücklegen innerhalb einer Zelle ?! + # ! kein zurücklegen innerhalb einer Zelle ?! -> das is ok. 
cell_profile_ids = get_cell_demand_profile_ids(df_cell, pool_size, df_profiles) df_cell_demand_metadata.at[cell_id, 'cell_profile_ids'] = cell_profile_ids From 0e9a839958278cfcbd69d8431eb09f4b6ed5aff3 Mon Sep 17 00:00:00 2001 From: "Julian.Endres" Date: Tue, 25 May 2021 19:13:57 +0200 Subject: [PATCH 08/97] add not on hh-people-mapping --- src/egon/data/processing/hh_demand/hh_demand_profiles.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/egon/data/processing/hh_demand/hh_demand_profiles.py b/src/egon/data/processing/hh_demand/hh_demand_profiles.py index ea74cb906..27237c466 100644 --- a/src/egon/data/processing/hh_demand/hh_demand_profiles.py +++ b/src/egon/data/processing/hh_demand/hh_demand_profiles.py @@ -112,7 +112,8 @@ # ######################### # FIXME: # mapping needs to be adjusted for OR, OO, O1, O2 - # + # O1, O2 are not used anymore + # influence of OO and OR -parameter to overall household-sum rather small # ########################### mapping_people_in_households = {'SR': 1, @@ -131,6 +132,9 @@ # derivate households data from inhabitants data by compound number of people per household type df_dist_households = hh_tools.inhabitants_to_households(df_hh_types_nad_abs, mapping_people_in_households) + # FIXME: + # compare df_dist_households.sum() here with values from other source + # TODO: direct db.engine to configuration file # engine = db.engine() # SQL - Access Zensus household data cell-level From 16a57e26c61b5422132c2679925acbce2da63b99 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guido=20Ple=C3=9Fmann?= Date: Tue, 25 May 2021 19:17:05 +0200 Subject: [PATCH 09/97] Download HH load profiles from NextCloud --- src/egon/data/datasets.yml | 4 +++ .../hh_demand/hh_demand_profiles.py | 13 +++---- .../hh_demand/hh_demand_profiles_tools.py | 35 +++++++++++++++++-- 3 files changed, 43 insertions(+), 9 deletions(-) diff --git a/src/egon/data/datasets.yml b/src/egon/data/datasets.yml index bb59a1eb0..203e68ea5 100644 --- 
a/src/egon/data/datasets.yml +++ b/src/egon/data/datasets.yml @@ -435,3 +435,7 @@ electrical_load_curves_cts: schema: 'grid' table: 'egon_pf_hv_load_timeseries' +household_electricity_demand: + sources: + household_electricity_demand_profiles: + url: "https://next.rl-institut.de/s/M8o3ALXPappRM3Y/download/h0_profiles.h5" diff --git a/src/egon/data/processing/hh_demand/hh_demand_profiles.py b/src/egon/data/processing/hh_demand/hh_demand_profiles.py index 27237c466..5ec92b52a 100644 --- a/src/egon/data/processing/hh_demand/hh_demand_profiles.py +++ b/src/egon/data/processing/hh_demand/hh_demand_profiles.py @@ -14,12 +14,13 @@ # in Wh # TODO: > to/from SQL? # filed needs to be placed manually in directory - file = 'h0_profiles.h5' - file = os.path.join(os.path.realpath(file)) - df_profiles = pd.read_hdf(file) - - # set multiindex to HH_types - df_profiles.columns = pd.MultiIndex.from_arrays([df_profiles.columns.str[:2], df_profiles.columns.str[3:]]) + # file = 'h0_profiles.h5' + # file = os.path.join(os.path.realpath(file)) + # df_profiles = pd.read_hdf(file) + # + # # set multiindex to HH_types + # df_profiles.columns = pd.MultiIndex.from_arrays([df_profiles.columns.str[:2], df_profiles.columns.str[3:]]) + df_profiles = hh_tools.get_household_demand_profiles_raw() # Load Zensus data at nuts-level diff --git a/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py b/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py index 2a4d9f1d7..3db25c0b1 100644 --- a/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py +++ b/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py @@ -1,12 +1,12 @@ -#!/usr/bin/env python -# coding: utf-8 - import pandas as pd import numpy as np from itertools import cycle import random +from pathlib import Path +from urllib.request import urlretrieve +import egon.data.config def clean(x): """clean dataset convert '.' and '-' to str(0). remove brackets. 
table will be converted to int/floats afterwards""" @@ -16,6 +16,35 @@ def clean(x): return x +def get_household_demand_profiles_raw(): + """ + Downloads and returns household electricity demand profiles + + Download only happens, if file isn't already existing. + + Returns + ------- + pd.DataFrame + Table with profiles in columns and time as index. A pd.MultiIndex is + used to distinguish load profiles from different EUROSTAT household + types. + """ + data_config = egon.data.config.datasets()["household_electricity_demand"] + + hh_profiles_url = data_config["sources"]["household_electricity_demand_profiles"]["url"] + hh_profiles_file = Path(".") / Path(hh_profiles_url).name + + if not hh_profiles_file.is_file(): + urlretrieve(hh_profiles_url, hh_profiles_file) + + hh_profiles = pd.read_hdf(hh_profiles_file) + + # set multiindex to HH_types + hh_profiles.columns = pd.MultiIndex.from_arrays([hh_profiles.columns.str[:2], hh_profiles.columns.str[3:]]) + + return hh_profiles + + def get_hh_dist(df_zensus, hh_types, multi_adjust=True, relative=True): """group zensus data to fit Demand-Profile-Generator (DPG) format. 
Parameters From 243bd57a41abf9ed341dcf52455809de40eaa45e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guido=20Ple=C3=9Fmann?= Date: Tue, 25 May 2021 19:40:37 +0200 Subject: [PATCH 10/97] Download zensus households data from nextcloud --- src/egon/data/datasets.yml | 2 ++ .../hh_demand/hh_demand_profiles.py | 19 +++++----- .../hh_demand/hh_demand_profiles_tools.py | 36 +++++++++++++++++++ 3 files changed, 48 insertions(+), 9 deletions(-) diff --git a/src/egon/data/datasets.yml b/src/egon/data/datasets.yml index 203e68ea5..399b665d6 100644 --- a/src/egon/data/datasets.yml +++ b/src/egon/data/datasets.yml @@ -439,3 +439,5 @@ household_electricity_demand: sources: household_electricity_demand_profiles: url: "https://next.rl-institut.de/s/M8o3ALXPappRM3Y/download/h0_profiles.h5" + zensus_household_types: + url: "https://next.rl-institut.de/s/oQXRkYgWLXK3zND/download/Zensus2011_Personen.csv" diff --git a/src/egon/data/processing/hh_demand/hh_demand_profiles.py b/src/egon/data/processing/hh_demand/hh_demand_profiles.py index 5ec92b52a..b17d61b51 100644 --- a/src/egon/data/processing/hh_demand/hh_demand_profiles.py +++ b/src/egon/data/processing/hh_demand/hh_demand_profiles.py @@ -27,15 +27,16 @@ # ################### # TODO: > to/from SQL? 
# filed needs to be placed manually in directory - file = 'Zensus2011_Personen.csv' - file = os.path.join(os.path.realpath(file)) - df_zensus = pd.read_csv(file, sep=';', decimal='.', skiprows=5, skipfooter=7, - index_col=[0, 1], header=[0, 1], encoding='latin1', engine='python') - - # clean data - df_zensus = df_zensus.applymap(hh_tools.clean).applymap(int) - # preprocess nuts1 zensus data - df_zensus = hh_tools.process_nuts1_zensus_data(df_zensus) + # file = 'Zensus2011_Personen.csv' + # file = os.path.join(os.path.realpath(file)) + # df_zensus = pd.read_csv(file, sep=';', decimal='.', skiprows=5, skipfooter=7, + # index_col=[0, 1], header=[0, 1], encoding='latin1', engine='python') + # + # # clean data + # df_zensus = df_zensus.applymap(hh_tools.clean).applymap(int) + # # preprocess nuts1 zensus data + # df_zensus = hh_tools.process_nuts1_zensus_data(df_zensus) + df_zensus = hh_tools.download_process_zensus_households() # ## Household distribution # - Adults living in househould type diff --git a/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py b/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py index 3db25c0b1..34b8677ff 100644 --- a/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py +++ b/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py @@ -45,6 +45,42 @@ def get_household_demand_profiles_raw(): return hh_profiles +def download_process_zensus_households(): + """ + Downloads and pre-processes zensus age x household type data + + Download only happens, if file isn't already existing. 
+ TODO: Add description about data + + + Returns + ------- + pd.DataFrame + Pre-processed zensus household data + """ + data_config = egon.data.config.datasets()["household_electricity_demand"] + + households_url = data_config["sources"]["zensus_household_types"]["url"] + households_file = Path(".") / Path(households_url).name + + # Download prepared data file from nextcloud + if not households_file.is_file(): + urlretrieve(households_url, households_file) + + # Read downloaded file from disk + households_raw = pd.read_csv(households_file, sep=';', decimal='.', skiprows=5, skipfooter=7, + index_col=[0, 1], header=[0, 1], encoding='latin1', engine='python') + + # Clean data + households = households_raw.applymap(clean).applymap(int) + + # Make data compatible with household demand profile categories + # Use less age interval and aggregate data to NUTS-1 level + households_nuts1 = process_nuts1_zensus_data(households) + + return households_nuts1 + + def get_hh_dist(df_zensus, hh_types, multi_adjust=True, relative=True): """group zensus data to fit Demand-Profile-Generator (DPG) format. 
Parameters From 34a4815ce5779d7fc2f8294e0b0cd577ef8463dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guido=20Ple=C3=9Fmann?= Date: Tue, 25 May 2021 19:41:57 +0200 Subject: [PATCH 11/97] Correct table name --- src/egon/data/processing/hh_demand/hh_demand_profiles.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/egon/data/processing/hh_demand/hh_demand_profiles.py b/src/egon/data/processing/hh_demand/hh_demand_profiles.py index b17d61b51..bd3d9b02b 100644 --- a/src/egon/data/processing/hh_demand/hh_demand_profiles.py +++ b/src/egon/data/processing/hh_demand/hh_demand_profiles.py @@ -142,7 +142,7 @@ # SQL - Access Zensus household data cell-level df_households_typ = db.select_dataframe(sql=""" SELECT grid_id, attribute, characteristics_code, characteristics_text, quantity - FROM society.destatis_zensus_household_per_ha + FROM society.egon_destatis_zensus_household_per_ha WHERE attribute = 'HHTYP_FAM' """) df_households_typ = df_households_typ.drop(columns=['attribute', 'characteristics_text']) df_households_typ = df_households_typ.rename(columns={'quantity': 'hh_5types'}) From e9e15bfb99e4d6ee66d2b9dfbbe8536d03ab54c4 Mon Sep 17 00:00:00 2001 From: "Julian.Endres" Date: Tue, 25 May 2021 22:55:57 +0200 Subject: [PATCH 12/97] add calculation of OO-facotr for people-to-household-mapping --- .../processing/hh_demand/hh_demand_profiles.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/src/egon/data/processing/hh_demand/hh_demand_profiles.py b/src/egon/data/processing/hh_demand/hh_demand_profiles.py index bd3d9b02b..0b57d2829 100644 --- a/src/egon/data/processing/hh_demand/hh_demand_profiles.py +++ b/src/egon/data/processing/hh_demand/hh_demand_profiles.py @@ -117,6 +117,14 @@ # O1, O2 are not used anymore # influence of OO and OR -parameter to overall household-sum rather small # ########################### + df_hh_size = db.select_dataframe(sql=""" + SELECT characteristics_text, SUM(quantity) as summe + FROM 
society.egon_destatis_zensus_household_per_ha as egon_d + WHERE attribute = 'HHGROESS_KLASS' + GROUP BY characteristics_text """, index_col='characteristics_text') + + df_hh_size = df_hh_size.drop(index=['1 Person', '2 Personen']) + OO_factor = sum(df_hh_size['summe'] * [3, 4, 5, 6]) / df_hh_size['summe'].sum() mapping_people_in_households = {'SR': 1, 'SO': 1, @@ -126,8 +134,8 @@ 'P1': 2, # kids are excluded 'P2': 2, # "" 'P3': 2, # "" - 'OR': 4, # parameter needs to be re/defined - 'OO': 4, # "" + 'OR': OO_factor, + 'OO': OO_factor, # 'O1': 4, # "" # 'O2': 4, # "" } @@ -136,6 +144,9 @@ # FIXME: # compare df_dist_households.sum() here with values from other source + # maybe scale on state-level + + # TODO: direct db.engine to configuration file # engine = db.engine() From 9ec6a967cde25f084cc0ce2a3507063c2da2fa41 Mon Sep 17 00:00:00 2001 From: "Julian.Endres" Date: Tue, 25 May 2021 22:56:56 +0200 Subject: [PATCH 13/97] add note --- src/egon/data/processing/hh_demand/hh_demand_profiles.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/egon/data/processing/hh_demand/hh_demand_profiles.py b/src/egon/data/processing/hh_demand/hh_demand_profiles.py index 0b57d2829..5210fce7b 100644 --- a/src/egon/data/processing/hh_demand/hh_demand_profiles.py +++ b/src/egon/data/processing/hh_demand/hh_demand_profiles.py @@ -144,7 +144,7 @@ # FIXME: # compare df_dist_households.sum() here with values from other source - # maybe scale on state-level + # maybe scale on state-levelm From fd541ec8af1d4faf9cd841cfa259688342298704 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guido=20Ple=C3=9Fmann?= Date: Wed, 26 May 2021 14:00:22 +0200 Subject: [PATCH 14/97] Reformat and extend docstrings --- .../hh_demand/hh_demand_profiles_tools.py | 186 +++++++++++++++--- 1 file changed, 161 insertions(+), 25 deletions(-) diff --git a/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py b/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py index 34b8677ff..5ade14f6b 
100644 --- a/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py +++ b/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py @@ -8,8 +8,27 @@ import egon.data.config + def clean(x): - """clean dataset convert '.' and '-' to str(0). remove brackets. table will be converted to int/floats afterwards""" + """Clean zensus household data row-wise + + Clean dataset by + + * converting '.' and '-' to str(0) + * removing brackets + + Table can be converted to int/floats afterwards + + Parameters + ---------- + x: pd.Series + It is meant to be used with :code:`df.applymap()` + + Returns + ------- + pd.Series + Re-formatted data row + """ x = str(x).replace('-', str(0)) x = str(x).replace('.', str(0)) x = x.strip('()') @@ -82,23 +101,31 @@ def download_process_zensus_households(): def get_hh_dist(df_zensus, hh_types, multi_adjust=True, relative=True): - """group zensus data to fit Demand-Profile-Generator (DPG) format. + """ + Group zensus data to fit Demand-Profile-Generator (DPG) format. + Parameters ---------- df_zensus: pd.DataFrame - containing zensus data + Zensus households data hh_types: dict - mapping of zensus groups to DPG groups + Mapping of zensus groups to DPG groups multi-adjust: bool - if True, splits DPG-group 'OO' into 3 subgroups and uses distribution factor derived by - table II in https://www.researchgate.net/publication/273775902_Erzeugung_zeitlich_hochaufgeloster_Stromlastprofile_fur_verschiedene_Haushaltstypen + If True (default), splits DPG-group 'OO' into 3 subgroups and uses + distribution factor derived by table II in + https://www.researchgate.net/publication/273775902_Erzeugung_zeitlich_hochaufgeloster_Stromlastprofile_fur_verschiedene_Haushaltstypen relative: bool if True produces relative values + Returns ---------- df_hh_types: pd.DataFrame distribution of people by household type and regional-resolution - !data still needs to be converted from amount of people to amount of households! + + .. 
warning:: + + Data still needs to be converted from amount of people to amount + of households """ # adjust multi with/without kids via eurostat share as not clearly derivable without infos about share of kids if multi_adjust: @@ -120,19 +147,26 @@ def get_hh_dist(df_zensus, hh_types, multi_adjust=True, relative=True): def inhabitants_to_households(df_people_by_householdtypes_abs, mapping_people_in_households): - """converts distribution of peoples living in types of households to distribution of household types by using - a people-in-household mapping. results are rounded to int (ceiled) to full households. + """ + Convert number of inhabitant to number of household types + + Takes the distribution of peoples living in types of households to + calculate a distribution of household types by using a people-in-household + mapping. + + Results are rounded to int (ceiled) to full households. Parameters ---------- df_people_by_householdtypes_abs: pd.DataFrame - distribution of people living in households + Distribution of people living in households mapping_people_in_households: dict - mapping of people living in certain types of households + Mapping of people living in certain types of households + Returns ---------- df_households_by_type: pd.DataFrame - distribution of households type + Distribution of households type """ # compare categories and remove form mapping if to many @@ -154,10 +188,19 @@ def inhabitants_to_households(df_people_by_householdtypes_abs, mapping_people_in def process_nuts1_zensus_data(df_zensus): - """group, remove and reorder categories wich are not needed for demand-profile-generator (DPG) - Kids (<15) are excluded as they are also excluded in DPG origin dataset. 
- Adults (15<65) - Seniors (<65) + """Make data compatible with household demand profile categories + + Groups, removes and reorders categories which are not needed for + demand-profile-generator (DPG) + + * Kids (<15) are excluded as they are also excluded in DPG origin dataset + * Adults (15<65) + * Seniors (<65) + + Returns + ------- + pd.DataFrame + Aggregated zensus household data on NUTS-1 level """ # Group data to fit Load Profile Generator categories # define kids/adults/seniors @@ -187,8 +230,25 @@ def process_nuts1_zensus_data(df_zensus): return df_zensus -def get_cell_demand_profile_ids(df_cell, pool_size, df_profiles): - """generates tuple of hh_type and random sample(without replacement) profile ids for cell""" +def get_cell_demand_profile_ids(df_cell, pool_size): + """ + Generates tuple of hh_type and zensus cell ids + + Takes a random sample (without replacement) of profile ids for given cell + + Parameters + ---------- + df_cell: pd.DataFrame + Household type information for a single zensus cell + pool_size: int + Number of available profiles to select from + + Returns + ------- + list of tuple + List of (`hh_type`, `cell_id`) + + """ # maybe use instead # np.random.default_rng().integers(low=0, high=pool_size[hh_type], size=sq) instead of random.sample # use random.choice() if with replacement @@ -207,7 +267,32 @@ def get_cell_demand_profile_ids(df_cell, pool_size, df_profiles): # can be parallelized with grouping df_zensus_cells by grid_id/nuts3/nuts1 def get_cell_demand_metadata(df_zensus_cells, df_profiles): - """generate table including demand profile ids for each cell using get_cell_demand_profile_ids""" + """ + Defines information about profiles for each zensus cell + + A table including the demand profile ids for each cell is created by using + :func:`get_cell_demand_profile_ids`. + + Parameters + ---------- + df_zensus_cells: pd.DataFrame + Household type parameters. Each row representing one household. 
Hence, + multiple rows per zensus cell. + df_profiles: pd.DataFrame + Household load profile data + + * Index: Times steps as serial integers + * Columns: pd.MultiIndex with (`HH_TYPE`, `id`) + + Returns + ------- + pd.DataFrame + Tabular data with one row represents one zensus cell. + The column `cell_profile_ids` contains + a list of tuples (see :func:`get_cell_demand_profile_ids`) providing a + reference to the actual load profiles that are associated with this + cell. + """ df_cell_demand_metadata = pd.DataFrame(index=df_zensus_cells.grid_id.unique(), columns=['cell_profile_ids', 'nuts3', 'nuts1', '2035_factor', @@ -221,7 +306,8 @@ def get_cell_demand_metadata(df_zensus_cells, df_profiles): # FIXME # ! runden der Haushaltszahlen auf int # ! kein zurücklegen innerhalb einer Zelle ?! -> das is ok. - cell_profile_ids = get_cell_demand_profile_ids(df_cell, pool_size, df_profiles) + # cell_profile_ids = get_cell_demand_profile_ids(df_cell, pool_size, df_profiles) + cell_profile_ids = get_cell_demand_profile_ids(df_cell, pool_size) df_cell_demand_metadata.at[cell_id, 'cell_profile_ids'] = cell_profile_ids df_cell_demand_metadata.at[cell_id, 'nuts3'] = df_cell.loc[:, 'nuts3'].unique()[0] @@ -232,7 +318,33 @@ def get_cell_demand_metadata(df_zensus_cells, df_profiles): # can be parallelized with grouping df_zensus_cells by grid_id/nuts3/nuts1 def adjust_to_demand_regio_nuts3_annual(df_cell_demand_metadata, df_profiles, df_demand_regio): - """computes the profile scaling factor by accumulated nuts3 cells and demand_regio data""" + """ + Computes the profile scaling factor for alignment to demand regio data + + The scaling factor can be used to re-scale each load profile such that the + sum of all load profiles within one NUTS-3 area equals the annual demand + of demand regio data. + + Parameters + ---------- + df_cell_demand_metadata: pd.DataFrame + Result of :func:`get_cell_demand_metadata`. 
+ df_profiles: pd.DataFrame + Household load profile data + + * Index: Times steps as serial integers + * Columns: pd.MultiIndex with (`HH_TYPE`, `id`) + + df_demand_regio: pd.DataFrame + Annual demand by demand regio for each NUTS-3 region and scenario year. + Index is pd.MultiIndex with :code:`tuple(scenario_year, nuts3_code)`. + + Returns + ------- + pd.DataFrame + Returns the same data as :func:`get_cell_demand_metadata`, but with + filled columns `2035_factor` and `2050_factor`. + """ for nuts3_id, df_nuts3 in df_cell_demand_metadata.groupby(by='nuts3'): nuts3_cell_ids = df_nuts3.index nuts3_profile_ids = df_nuts3.loc[:, 'cell_profile_ids'].sum() @@ -255,15 +367,39 @@ def adjust_to_demand_regio_nuts3_annual(df_cell_demand_metadata, df_profiles, df return df_cell_demand_metadata -def get_load_area_max_load(df_profiles, df_cell_demand_metadata, load_area_ids, year): - """get max value of load area demand profile""" +def get_load_area_max_load(df_profiles, df_cell_demand_metadata, cell_ids, year): + """ + Get peak load for one load area + + The peak load is calculated in aggregated manner for a group of zensus + cells that belong to one load area (defined by `cell_ids`). + + Parameters + ---------- + df_profiles: pd.DataFrame + Household load profile data + + * Index: Times steps as serial integers + * Columns: pd.MultiIndex with (`HH_TYPE`, `id`) + + Used to calculate the peak load from. + df_cell_demand_metadata: pd.DataFrame + Return value of :func:`adjust_to_demand_regio_nuts3_annual`. + cell_ids: list + Zensus cell ids that define one group of zensus cells that belong to + the same load area. + year: int + Scenario year. Is used to consider the scaling factor for aligning + annual demand to NUTS-3 data. 
+ + """ timesteps = len(df_profiles) full_load = pd.Series(data=np.zeros(timesteps), dtype=np.float64, index=range(timesteps)) - load_area_meta = df_cell_demand_metadata.loc[load_area_ids, ['cell_profile_ids', 'nuts3', f'{year}_factor']] + load_area_meta = df_cell_demand_metadata.loc[cell_ids, ['cell_profile_ids', 'nuts3', f'{year}_factor']] for (nuts3, factor), df in load_area_meta.groupby(by=['nuts3', f'{year}_factor']): part_load = df_profiles.loc[:, df['cell_profile_ids'].sum()].sum(axis=1) * factor / 1e3 # profiles in Wh full_load = full_load.add(part_load) - return full_load.max() #, full_load.idxmax() + return full_load.max() def download_files(): From 311308c795311419d3c2f044fa2475f704b2af30 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guido=20Ple=C3=9Fmann?= Date: Thu, 3 Jun 2021 12:28:54 +0200 Subject: [PATCH 15/97] Describe files in download function --- .../hh_demand/hh_demand_profiles_tools.py | 67 +++++++++++-------- 1 file changed, 38 insertions(+), 29 deletions(-) diff --git a/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py b/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py index 5ade14f6b..fabacd4f6 100644 --- a/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py +++ b/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py @@ -39,7 +39,21 @@ def get_household_demand_profiles_raw(): """ Downloads and returns household electricity demand profiles - Download only happens, if file isn't already existing. + Household electricity demand profiles generated by Fraunhofer IEE. + Methodology is described in + :ref:`Erzeugung zeitlich hochaufgelöster Stromlastprofile für verschiedene + Haushaltstypen + `_. 
+ It is used and further described in the following theses by: + + * Jonas Haack: + "Auswirkungen verschiedener Haushaltslastprofile auf PV-Batterie-Systeme" (confidential) + * Simon Ruben Drauz + "Synthesis of a heat and electrical load profile for single and multi-family houses used for subsequent + performance tests of a multi-component energy system", + http://dx.doi.org/10.13140/RG.2.2.13959.14248 + + Download only happens, if file 'h0_profiles.h5' isn't already existing. Returns ------- @@ -68,9 +82,30 @@ def download_process_zensus_households(): """ Downloads and pre-processes zensus age x household type data - Download only happens, if file isn't already existing. - TODO: Add description about data + Dataset about household size with information about the categories: + + * family type + * age class + * gender + + for Germany in spatial resolution of federal states. + + Data manually selected and retrieved from: + https://ergebnisse2011.zensus2022.de/datenbank/online + For reproducing data selection, please do: + + * Search for: "1000A-2029" + * or choose topic: "Bevölkerung kompakt" + * Choose table code: "1000A-2029" with title "Personen: Alter (11 Altersklassen)/Geschlecht/Größe des + privaten Haushalts - Typ des privaten Haushalts (nach Familien/Lebensform)" + - Change setting "GEOLK1" to "Bundesländer (16)" + + Data would be available in higher resolution + ("Landkreise und kreisfreie Städte (412)"), but only after registration. + The downloaded file is called 'Zensus2011_Personen.csv'. + + Download only happens, if file isn't already existing. Returns ------- @@ -400,29 +435,3 @@ def get_load_area_max_load(df_profiles, df_cell_demand_metadata, cell_ids, year) part_load = df_profiles.loc[:, df['cell_profile_ids'].sum()].sum(axis=1) * factor / 1e3 # profiles in Wh full_load = full_load.add(part_load) return full_load.max() - - -def download_files(): - """ - 1. 
'h0_profiles.h5' - Households demand profiles generated by Fraunhofer IWES - Methodology is described in: https://www.researchgate.net/publication/273775902_Erzeugung_zeitlich_hochaufgeloster_Stromlastprofile_fur_verschiedene_Haushaltstypen - used and further describer in the thesisis by: - 1. Jonas Haack - "Auswirkungen verschiedener Haushaltslastprofile auf PV-Batterie-Systeme" (confidential) - 2. Simon Ruben Drauz - "Synthesis of a heat and electrical load profile for single and multi-family houses used for subsequent - performance tests of a multi-component energy system" - http://dx.doi.org/10.13140/RG.2.2.13959.14248 - - 2. 'Zensus2011_Personen.csv' (does not exist in this format anymore but in different format) - Dataset describing the amount of people living by a certain types of family-types, age-classes, - sex and size of household in Germany in state-resolution. - Data from: https://ergebnisse2011.zensus2022.de/datenbank/online - - Search for: "1000A-2029" - - or choose topic: "Bevölkerung kompakt" - - Choose table code: "1000A-2029" with title "Personen: Alter (11 Altersklassen)/Geschlecht/Größe des - privaten Haushalts - Typ des privaten Haushalts (nach Familien/Lebensform)" - - Change setting "GEOLK1" to "Bundesländer (16)" - higher resolution "Landkreise und kreisfreie Städte (412)" only accessible after registration. 
- """ From 25da37a1126a536dedd8a6adf3265d9213149196 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guido=20Ple=C3=9Fmann?= Date: Thu, 3 Jun 2021 12:29:23 +0200 Subject: [PATCH 16/97] Remove unused code --- .../hh_demand/hh_demand_profiles.py | 27 +------------------ 1 file changed, 1 insertion(+), 26 deletions(-) diff --git a/src/egon/data/processing/hh_demand/hh_demand_profiles.py b/src/egon/data/processing/hh_demand/hh_demand_profiles.py index 5210fce7b..fbbcaba81 100644 --- a/src/egon/data/processing/hh_demand/hh_demand_profiles.py +++ b/src/egon/data/processing/hh_demand/hh_demand_profiles.py @@ -9,33 +9,8 @@ if __name__ == "__main__": - # Loadprofilesdump - # ################### - # in Wh - # TODO: > to/from SQL? - # filed needs to be placed manually in directory - # file = 'h0_profiles.h5' - # file = os.path.join(os.path.realpath(file)) - # df_profiles = pd.read_hdf(file) - # - # # set multiindex to HH_types - # df_profiles.columns = pd.MultiIndex.from_arrays([df_profiles.columns.str[:2], df_profiles.columns.str[3:]]) + # Get demand profiles and zensus household type x age category data df_profiles = hh_tools.get_household_demand_profiles_raw() - - - # Load Zensus data at nuts-level - # ################### - # TODO: > to/from SQL? 
- # filed needs to be placed manually in directory - # file = 'Zensus2011_Personen.csv' - # file = os.path.join(os.path.realpath(file)) - # df_zensus = pd.read_csv(file, sep=';', decimal='.', skiprows=5, skipfooter=7, - # index_col=[0, 1], header=[0, 1], encoding='latin1', engine='python') - # - # # clean data - # df_zensus = df_zensus.applymap(hh_tools.clean).applymap(int) - # # preprocess nuts1 zensus data - # df_zensus = hh_tools.process_nuts1_zensus_data(df_zensus) df_zensus = hh_tools.download_process_zensus_households() # ## Household distribution From 0a70277f894d84a37379f94d4ee6d2d42cfc094f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guido=20Ple=C3=9Fmann?= Date: Fri, 4 Jun 2021 14:08:10 +0200 Subject: [PATCH 17/97] Re-work code and add comments --- .../hh_demand/hh_demand_profiles.py | 215 ++++++++---------- 1 file changed, 101 insertions(+), 114 deletions(-) diff --git a/src/egon/data/processing/hh_demand/hh_demand_profiles.py b/src/egon/data/processing/hh_demand/hh_demand_profiles.py index fbbcaba81..4af5ab0ac 100644 --- a/src/egon/data/processing/hh_demand/hh_demand_profiles.py +++ b/src/egon/data/processing/hh_demand/hh_demand_profiles.py @@ -7,100 +7,96 @@ from egon.data.processing.hh_demand import hh_demand_profiles_tools as hh_tools +# Define mapping of zensus household categories to eurostat categories +# - Adults living in househould type +# - number of kids not included even if in housholdtype name +# **! 
The Eurostat data only gives the amount of adults/seniors, excluding the amount of kids <15** +# eurostat is used for demand-profile-generator @fraunhofer +HH_TYPES = {'SR': [ + ('Einpersonenhaushalte (Singlehaushalte)', 'Insgesamt', 'Seniors'), + ('Alleinerziehende Elternteile', 'Insgesamt', 'Seniors')], + # Single Seniors Single Parents Seniors + 'SO': [('Einpersonenhaushalte (Singlehaushalte)', 'Insgesamt', + 'Adults')], # Single Adults + 'SK': [('Alleinerziehende Elternteile', 'Insgesamt', 'Adults')], + # Single Parents Adult + 'PR': [('Paare ohne Kind(er)', '2 Personen', 'Seniors'), + ('Mehrpersonenhaushalte ohne Kernfamilie', '2 Personen', + 'Seniors')], + # Couples without Kids Senior & same sex couples & shared flat seniors + 'PO': [('Paare ohne Kind(er)', '2 Personen', 'Adults'), + ('Mehrpersonenhaushalte ohne Kernfamilie', '2 Personen', + 'Adults')], + # Couples without Kids adults & same sex couples & shared flat adults + 'P1': [('Paare mit Kind(ern)', '3 Personen', 'Adults')], + 'P2': [('Paare mit Kind(ern)', '4 Personen', 'Adults')], + 'P3': [('Paare mit Kind(ern)', '5 Personen', 'Adults'), + ('Paare mit Kind(ern)', '6 und mehr Personen', 'Adults')], + 'OR': [('Mehrpersonenhaushalte ohne Kernfamilie', '3 Personen', + 'Seniors'), + ('Mehrpersonenhaushalte ohne Kernfamilie', '4 Personen', + 'Seniors'), + ('Mehrpersonenhaushalte ohne Kernfamilie', '5 Personen', + 'Seniors'), + ('Mehrpersonenhaushalte ohne Kernfamilie', + '6 und mehr Personen', 'Seniors'), + ('Paare mit Kind(ern)', '3 Personen', 'Seniors'), + ('Paare ohne Kind(er)', '3 Personen', 'Seniors'), + ('Paare mit Kind(ern)', '4 Personen', 'Seniors'), + ('Paare ohne Kind(er)', '4 Personen', 'Seniors'), + ('Paare mit Kind(ern)', '5 Personen', 'Seniors'), + ('Paare ohne Kind(er)', '5 Personen', 'Seniors'), + ('Paare mit Kind(ern)', '6 und mehr Personen', 'Seniors'), + ('Paare ohne Kind(er)', '6 und mehr Personen', 'Seniors')], + # no info about share of kids + + # OO, O1, O2 have the same amount, as 
no information about the share of kids within zensus data set. + # if needed the total amount can be corrected in the hh_tools.get_hh_dist function + # using multi_adjust=True option + 'OO': [('Mehrpersonenhaushalte ohne Kernfamilie', '3 Personen', + 'Adults'), + ('Mehrpersonenhaushalte ohne Kernfamilie', '4 Personen', + 'Adults'), + ('Mehrpersonenhaushalte ohne Kernfamilie', '5 Personen', + 'Adults'), + ('Mehrpersonenhaushalte ohne Kernfamilie', + '6 und mehr Personen', 'Adults'), + ('Paare ohne Kind(er)', '3 Personen', 'Adults'), + ('Paare ohne Kind(er)', '4 Personen', 'Adults'), + ('Paare ohne Kind(er)', '5 Personen', 'Adults'), + ('Paare ohne Kind(er)', '6 und mehr Personen', 'Adults')], + # no info about share of kids + } + +MAPPING_ZENSUS_HH_SUBGROUPS = {1: ['SR', 'SO'], + 2: ['PR', 'PO'], + 3: ['SK'], + 4: ['P1', 'P2', 'P3'], + 5: ['OR', 'OO'], + } + if __name__ == "__main__": # Get demand profiles and zensus household type x age category data df_profiles = hh_tools.get_household_demand_profiles_raw() df_zensus = hh_tools.download_process_zensus_households() - # ## Household distribution - # - Adults living in househould type - # - number of kids not included even if in housholdtype name - # **! 
The Eurostat data only gives the amount of adults/seniors, excluding the amount of kids <15** - # eurostat is used for demand-profile-generator @fraunhofer - - # hh shares mapping zensus to eurostat - hh_types = {'SR': [('Einpersonenhaushalte (Singlehaushalte)', 'Insgesamt', 'Seniors'), - ('Alleinerziehende Elternteile', 'Insgesamt', 'Seniors')], # Single Seniors Single Parents Seniors - 'SO': [('Einpersonenhaushalte (Singlehaushalte)', 'Insgesamt', 'Adults')], # Single Adults - 'SK': [('Alleinerziehende Elternteile', 'Insgesamt', 'Adults')], # Single Parents Adult - 'PR': [('Paare ohne Kind(er)', '2 Personen', 'Seniors'), - ('Mehrpersonenhaushalte ohne Kernfamilie', '2 Personen', 'Seniors')], - # Couples without Kids Senior & same sex couples & shared flat seniors - 'PO': [('Paare ohne Kind(er)', '2 Personen', 'Adults'), - ('Mehrpersonenhaushalte ohne Kernfamilie', '2 Personen', 'Adults')], - # Couples without Kids adults & same sex couples & shared flat adults - 'P1': [('Paare mit Kind(ern)', '3 Personen', 'Adults')], - 'P2': [('Paare mit Kind(ern)', '4 Personen', 'Adults')], - 'P3': [('Paare mit Kind(ern)', '5 Personen', 'Adults'), - ('Paare mit Kind(ern)', '6 und mehr Personen', 'Adults')], - 'OR': [('Mehrpersonenhaushalte ohne Kernfamilie', '3 Personen', 'Seniors'), - ('Mehrpersonenhaushalte ohne Kernfamilie', '4 Personen', 'Seniors'), - ('Mehrpersonenhaushalte ohne Kernfamilie', '5 Personen', 'Seniors'), - ('Mehrpersonenhaushalte ohne Kernfamilie', '6 und mehr Personen', 'Seniors'), - ('Paare mit Kind(ern)', '3 Personen', 'Seniors'), - ('Paare ohne Kind(er)', '3 Personen', 'Seniors'), - ('Paare mit Kind(ern)', '4 Personen', 'Seniors'), - ('Paare ohne Kind(er)', '4 Personen', 'Seniors'), - ('Paare mit Kind(ern)', '5 Personen', 'Seniors'), - ('Paare ohne Kind(er)', '5 Personen', 'Seniors'), - ('Paare mit Kind(ern)', '6 und mehr Personen', 'Seniors'), - ('Paare ohne Kind(er)', '6 und mehr Personen', 'Seniors')], # no info about share of kids - - # OO, O1, O2 
have the same amount, as no information about the share of kids within zensus data set. - # if needed the total amount can be corrected in the hh_tools.get_hh_dist function - # using multi_adjust=True option - 'OO': [('Mehrpersonenhaushalte ohne Kernfamilie', '3 Personen', 'Adults'), - ('Mehrpersonenhaushalte ohne Kernfamilie', '4 Personen', 'Adults'), - ('Mehrpersonenhaushalte ohne Kernfamilie', '5 Personen', 'Adults'), - ('Mehrpersonenhaushalte ohne Kernfamilie', '6 und mehr Personen', 'Adults'), - ('Paare ohne Kind(er)', '3 Personen', 'Adults'), - ('Paare ohne Kind(er)', '4 Personen', 'Adults'), - ('Paare ohne Kind(er)', '5 Personen', 'Adults'), - ('Paare ohne Kind(er)', '6 und mehr Personen', 'Adults')], # no info about share of kids - # TODO: maybe remove following lines if not needed - - # 'O1': [('Mehrpersonenhaushalte ohne Kernfamilie', '3 Personen', 'Adults'), - # ('Mehrpersonenhaushalte ohne Kernfamilie', '4 Personen', 'Adults'), - # ('Mehrpersonenhaushalte ohne Kernfamilie', '5 Personen', 'Adults'), - # ('Mehrpersonenhaushalte ohne Kernfamilie', '6 und mehr Personen', 'Adults'), - # ('Paare ohne Kind(er)', '3 Personen', 'Adults'), - # ('Paare ohne Kind(er)', '4 Personen', 'Adults'), - # ('Paare ohne Kind(er)', '5 Personen', 'Adults'), - # ('Paare ohne Kind(er)', '6 und mehr Personen', 'Adults')], # no info about share of kids - # 'O2': [('Mehrpersonenhaushalte ohne Kernfamilie', '3 Personen', 'Adults'), - # ('Mehrpersonenhaushalte ohne Kernfamilie', '4 Personen', 'Adults'), - # ('Mehrpersonenhaushalte ohne Kernfamilie', '5 Personen', 'Adults'), - # ('Mehrpersonenhaushalte ohne Kernfamilie', '6 und mehr Personen', 'Adults'), - # ('Paare ohne Kind(er)', '3 Personen', 'Adults'), - # ('Paare ohne Kind(er)', '4 Personen', 'Adults'), - # ('Paare ohne Kind(er)', '5 Personen', 'Adults'), - # ('Paare ohne Kind(er)', '6 und mehr Personen', 'Adults')] - } - - # distribution of people by household @eurostats - # df_hh_types_D = pd.Series({'SR': 0.083, 'SO': 0.158, 
'SK': 0.022, - # 'PR': 0.145, 'PO': 0.203, 'P1': 0.081, 'P2': 0.077, 'P3': 0.024, - # 'OR': 0.023, 'OO': 0.13, 'O1': 0.04, 'O2': 0.015}) - # hh_tools.get_hh_dist without eurostat adjustment for O1-03 Groups in absolute values - df_hh_types_nad_abs = hh_tools.get_hh_dist(df_zensus, hh_types, multi_adjust=False, relative=False) - - # ######################### - # FIXME: - # mapping needs to be adjusted for OR, OO, O1, O2 - # O1, O2 are not used anymore - # influence of OO and OR -parameter to overall household-sum rather small - # ########################### + df_hh_types_nad_abs = hh_tools.get_hh_dist(df_zensus, HH_TYPES, multi_adjust=False, relative=False) + + # Get household size for each census cell grouped by + # As this is only used to estimate size of households for OR, OO, 1 P and 2 P households are dropped df_hh_size = db.select_dataframe(sql=""" SELECT characteristics_text, SUM(quantity) as summe FROM society.egon_destatis_zensus_household_per_ha as egon_d WHERE attribute = 'HHGROESS_KLASS' GROUP BY characteristics_text """, index_col='characteristics_text') - df_hh_size = df_hh_size.drop(index=['1 Person', '2 Personen']) - OO_factor = sum(df_hh_size['summe'] * [3, 4, 5, 6]) / df_hh_size['summe'].sum() + # Define/ estimate number of persons (w/o kids) for each household category + # For categories S* and P* it's clear; for multi-person households (OO,OR) + # the number is estimated as average by taking remaining persons + OO_factor = sum(df_hh_size['summe'] * [3, 4, 5, 6]) / df_hh_size['summe'].sum() mapping_people_in_households = {'SR': 1, 'SO': 1, 'SK': 1, # kids are excluded @@ -111,21 +107,15 @@ 'P3': 2, # "" 'OR': OO_factor, 'OO': OO_factor, - # 'O1': 4, # "" - # 'O2': 4, # "" } - # derivate households data from inhabitants data by compound number of people per household type + # Determine number of persons for each household category and per federal state df_dist_households = hh_tools.inhabitants_to_households(df_hh_types_nad_abs, 
mapping_people_in_households) - # FIXME: + # TODO: # compare df_dist_households.sum() here with values from other source - # maybe scale on state-levelm - + # maybe scale on state-level - - # TODO: direct db.engine to configuration file - # engine = db.engine() - # SQL - Access Zensus household data cell-level + # Retrieve information about households for each census cell df_households_typ = db.select_dataframe(sql=""" SELECT grid_id, attribute, characteristics_code, characteristics_text, quantity FROM society.egon_destatis_zensus_household_per_ha @@ -133,17 +123,11 @@ df_households_typ = df_households_typ.drop(columns=['attribute', 'characteristics_text']) df_households_typ = df_households_typ.rename(columns={'quantity': 'hh_5types'}) - mapping_zensus_hh_subgroups = {1: ['SR', 'SO'], - 2: ['PR', 'PO'], - 3: ['SK'], - 4: ['P1', 'P2', 'P3'], - 5: ['OR', 'OO'], - } - - for value in mapping_zensus_hh_subgroups.values(): + # Calculate fraction of persons within subgroup + for value in MAPPING_ZENSUS_HH_SUBGROUPS.values(): df_dist_households.loc[value] = df_dist_households.loc[value].div(df_dist_households.loc[value].sum()) - # SQL- create table to map cells to nuts3 and nuts1 + # Census cells with nuts3 and nuts1 information df_grid_id = db.select_dataframe(sql=""" SELECT pop.grid_id, pop.gid, vg250.vg250_nuts3 as nuts3, lan.nuts as nuts1, lan.gen FROM society.destatis_zensus_population_per_ha_inside_germany as pop @@ -154,7 +138,7 @@ df_grid_id = df_grid_id.drop_duplicates() df_grid_id = df_grid_id.reset_index(drop=True) - # merge nuts info to zensus cell level data + # Merge household type and size data with considered (populated) census cells # how='inner' is used as ids of unpopulated areas are removed df_grid_id or earliers tables. 
see here: # https://github.com/openego/eGon-data/blob/59195926e41c8bd6d1ca8426957b97f33ef27bcc/src/egon/data/importing/zensus/__init__.py#L418-L449 df_households_typ = pd.merge(df_households_typ, df_grid_id[['grid_id', 'gen', 'nuts1', 'nuts3']], @@ -165,7 +149,7 @@ for (country, code), df_country_type in df_households_typ.groupby(['gen', 'characteristics_code']): # iterate over zenus_country subgroups - for typ in mapping_zensus_hh_subgroups[code]: + for typ in MAPPING_ZENSUS_HH_SUBGROUPS[code]: df_country_type['hh_type'] = typ df_country_type['factor'] = df_dist_households.loc[typ, country] df_country_type['hh_10types'] = df_country_type['hh_5types'] * df_dist_households.loc[typ, country] @@ -173,31 +157,34 @@ df_zensus_cells = df_zensus_cells.sort_values(by=['grid_id', 'characteristics_code']).reset_index(drop=True) - # change profile numbers to int + # Cast profile ids into int df_profiles.columns = pd.MultiIndex.from_tuples([(a, int(b)) for a, b in df_profiles.columns]) + # Available profiles for each category pool_size = df_profiles.groupby(level=0, axis=1).size() + # Annual household electricity demand on NUTS-3 level (demand regio) df_demand_regio = db.select_dataframe(sql=""" SELECT year, nuts3, SUM (demand) as demand_mWha FROM demand.egon_demandregio_hh as egon_d GROUP BY nuts3, year ORDER BY year""", index_col=['year', 'nuts3']) - # # testcase + # Take census cells from two NUTS-3 regions as testcase # test_data = df_zensus_cells.groupby('nuts3').get_group('DEF03') # test_data = pd.concat([df_zensus_cells.groupby('nuts3').get_group('DEF03'), # df_zensus_cells.groupby('nuts3').get_group('DEF06')]) - # - # df_cell_demand_metadata = hh_tools.get_cell_demand_metadata(test_data, df_profiles) - # df_cell_demand_metadata = hh_tools.adjust_to_demand_regio_nuts3_annual(df_cell_demand_metadata, df_profiles, df_demand_regio) - # - # - # - # import random - # load_area_cell_ids = random.sample(list(df_cell_demand_metadata.index), 100) - # max_value_load_area = 
hh_tools.get_load_area_max_load(df_profiles, df_cell_demand_metadata, load_area_cell_ids, 2035) - # # print(df_cell_demand_metadata.shape) - # print(max_value_load_area) + test_data = df_zensus_cells + + df_cell_demand_metadata = hh_tools.get_cell_demand_metadata(test_data, df_profiles) + df_cell_demand_metadata = hh_tools.adjust_to_demand_regio_nuts3_annual(df_cell_demand_metadata, df_profiles, df_demand_regio) + + + + import random + load_area_cell_ids = random.sample(list(df_cell_demand_metadata.index), 100) + max_value_load_area = hh_tools.get_load_area_max_load(df_profiles, df_cell_demand_metadata, load_area_cell_ids, 2035) + # print(df_cell_demand_metadata.shape) + print(max_value_load_area) From 2255e6472d64fdac7e87a1a6c3d331461f074524 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guido=20Ple=C3=9Fmann?= Date: Fri, 4 Jun 2021 14:39:00 +0200 Subject: [PATCH 18/97] Move households in census cells mapping into function --- .../hh_demand/hh_demand_profiles.py | 168 +-------------- .../hh_demand/hh_demand_profiles_tools.py | 194 ++++++++++++++++++ 2 files changed, 196 insertions(+), 166 deletions(-) diff --git a/src/egon/data/processing/hh_demand/hh_demand_profiles.py b/src/egon/data/processing/hh_demand/hh_demand_profiles.py index 4af5ab0ac..a6d0f685b 100644 --- a/src/egon/data/processing/hh_demand/hh_demand_profiles.py +++ b/src/egon/data/processing/hh_demand/hh_demand_profiles.py @@ -7,177 +7,13 @@ from egon.data.processing.hh_demand import hh_demand_profiles_tools as hh_tools -# Define mapping of zensus household categories to eurostat categories -# - Adults living in househould type -# - number of kids not included even if in housholdtype name -# **! 
The Eurostat data only gives the amount of adults/seniors, excluding the amount of kids <15** -# eurostat is used for demand-profile-generator @fraunhofer -HH_TYPES = {'SR': [ - ('Einpersonenhaushalte (Singlehaushalte)', 'Insgesamt', 'Seniors'), - ('Alleinerziehende Elternteile', 'Insgesamt', 'Seniors')], - # Single Seniors Single Parents Seniors - 'SO': [('Einpersonenhaushalte (Singlehaushalte)', 'Insgesamt', - 'Adults')], # Single Adults - 'SK': [('Alleinerziehende Elternteile', 'Insgesamt', 'Adults')], - # Single Parents Adult - 'PR': [('Paare ohne Kind(er)', '2 Personen', 'Seniors'), - ('Mehrpersonenhaushalte ohne Kernfamilie', '2 Personen', - 'Seniors')], - # Couples without Kids Senior & same sex couples & shared flat seniors - 'PO': [('Paare ohne Kind(er)', '2 Personen', 'Adults'), - ('Mehrpersonenhaushalte ohne Kernfamilie', '2 Personen', - 'Adults')], - # Couples without Kids adults & same sex couples & shared flat adults - 'P1': [('Paare mit Kind(ern)', '3 Personen', 'Adults')], - 'P2': [('Paare mit Kind(ern)', '4 Personen', 'Adults')], - 'P3': [('Paare mit Kind(ern)', '5 Personen', 'Adults'), - ('Paare mit Kind(ern)', '6 und mehr Personen', 'Adults')], - 'OR': [('Mehrpersonenhaushalte ohne Kernfamilie', '3 Personen', - 'Seniors'), - ('Mehrpersonenhaushalte ohne Kernfamilie', '4 Personen', - 'Seniors'), - ('Mehrpersonenhaushalte ohne Kernfamilie', '5 Personen', - 'Seniors'), - ('Mehrpersonenhaushalte ohne Kernfamilie', - '6 und mehr Personen', 'Seniors'), - ('Paare mit Kind(ern)', '3 Personen', 'Seniors'), - ('Paare ohne Kind(er)', '3 Personen', 'Seniors'), - ('Paare mit Kind(ern)', '4 Personen', 'Seniors'), - ('Paare ohne Kind(er)', '4 Personen', 'Seniors'), - ('Paare mit Kind(ern)', '5 Personen', 'Seniors'), - ('Paare ohne Kind(er)', '5 Personen', 'Seniors'), - ('Paare mit Kind(ern)', '6 und mehr Personen', 'Seniors'), - ('Paare ohne Kind(er)', '6 und mehr Personen', 'Seniors')], - # no info about share of kids - # OO, O1, O2 have the same amount, as no 
information about the share of kids within zensus data set. - # if needed the total amount can be corrected in the hh_tools.get_hh_dist function - # using multi_adjust=True option - 'OO': [('Mehrpersonenhaushalte ohne Kernfamilie', '3 Personen', - 'Adults'), - ('Mehrpersonenhaushalte ohne Kernfamilie', '4 Personen', - 'Adults'), - ('Mehrpersonenhaushalte ohne Kernfamilie', '5 Personen', - 'Adults'), - ('Mehrpersonenhaushalte ohne Kernfamilie', - '6 und mehr Personen', 'Adults'), - ('Paare ohne Kind(er)', '3 Personen', 'Adults'), - ('Paare ohne Kind(er)', '4 Personen', 'Adults'), - ('Paare ohne Kind(er)', '5 Personen', 'Adults'), - ('Paare ohne Kind(er)', '6 und mehr Personen', 'Adults')], - # no info about share of kids - } - -MAPPING_ZENSUS_HH_SUBGROUPS = {1: ['SR', 'SO'], - 2: ['PR', 'PO'], - 3: ['SK'], - 4: ['P1', 'P2', 'P3'], - 5: ['OR', 'OO'], - } if __name__ == "__main__": - # Get demand profiles and zensus household type x age category data + df_profiles = hh_tools.get_household_demand_profiles_raw() - df_zensus = hh_tools.download_process_zensus_households() - - # hh_tools.get_hh_dist without eurostat adjustment for O1-03 Groups in absolute values - df_hh_types_nad_abs = hh_tools.get_hh_dist(df_zensus, HH_TYPES, multi_adjust=False, relative=False) - - # Get household size for each census cell grouped by - # As this is only used to estimate size of households for OR, OO, 1 P and 2 P households are dropped - df_hh_size = db.select_dataframe(sql=""" - SELECT characteristics_text, SUM(quantity) as summe - FROM society.egon_destatis_zensus_household_per_ha as egon_d - WHERE attribute = 'HHGROESS_KLASS' - GROUP BY characteristics_text """, index_col='characteristics_text') - df_hh_size = df_hh_size.drop(index=['1 Person', '2 Personen']) - - # Define/ estimate number of persons (w/o kids) for each household category - # For categories S* and P* it's clear; for multi-person households (OO,OR) - # the number is estimated as average by taking remaining persons - 
OO_factor = sum(df_hh_size['summe'] * [3, 4, 5, 6]) / df_hh_size['summe'].sum() - mapping_people_in_households = {'SR': 1, - 'SO': 1, - 'SK': 1, # kids are excluded - 'PR': 2, - 'PO': 2, - 'P1': 2, # kids are excluded - 'P2': 2, # "" - 'P3': 2, # "" - 'OR': OO_factor, - 'OO': OO_factor, - } - # Determine number of persons for each household category and per federal state - df_dist_households = hh_tools.inhabitants_to_households(df_hh_types_nad_abs, mapping_people_in_households) - - # TODO: - # compare df_dist_households.sum() here with values from other source - # maybe scale on state-level - - # Retrieve information about households for each census cell - df_households_typ = db.select_dataframe(sql=""" - SELECT grid_id, attribute, characteristics_code, characteristics_text, quantity - FROM society.egon_destatis_zensus_household_per_ha - WHERE attribute = 'HHTYP_FAM' """) - df_households_typ = df_households_typ.drop(columns=['attribute', 'characteristics_text']) - df_households_typ = df_households_typ.rename(columns={'quantity': 'hh_5types'}) - - # Calculate fraction of persons within subgroup - for value in MAPPING_ZENSUS_HH_SUBGROUPS.values(): - df_dist_households.loc[value] = df_dist_households.loc[value].div(df_dist_households.loc[value].sum()) - - # Census cells with nuts3 and nuts1 information - df_grid_id = db.select_dataframe(sql=""" - SELECT pop.grid_id, pop.gid, vg250.vg250_nuts3 as nuts3, lan.nuts as nuts1, lan.gen - FROM society.destatis_zensus_population_per_ha_inside_germany as pop - LEFT JOIN boundaries.egon_map_zensus_vg250 as vg250 - ON (pop.gid=vg250.zensus_population_id) - LEFT JOIN boundaries.vg250_lan as lan - ON (LEFT(vg250.vg250_nuts3, 3)=lan.nuts) """) - df_grid_id = df_grid_id.drop_duplicates() - df_grid_id = df_grid_id.reset_index(drop=True) - - # Merge household type and size data with considered (populated) census cells - # how='inner' is used as ids of unpopulated areas are removed df_grid_id or earliers tables. 
see here: - # https://github.com/openego/eGon-data/blob/59195926e41c8bd6d1ca8426957b97f33ef27bcc/src/egon/data/importing/zensus/__init__.py#L418-L449 - df_households_typ = pd.merge(df_households_typ, df_grid_id[['grid_id', 'gen', 'nuts1', 'nuts3']], - left_on='grid_id', right_on='grid_id', how='inner') - - # Merge Zensus nuts level household data with zensus cell level by dividing hh-groups with mapping_zensus_hh_subgroups - df_zensus_cells = pd.DataFrame() - for (country, code), df_country_type in df_households_typ.groupby(['gen', 'characteristics_code']): - - # iterate over zenus_country subgroups - for typ in MAPPING_ZENSUS_HH_SUBGROUPS[code]: - df_country_type['hh_type'] = typ - df_country_type['factor'] = df_dist_households.loc[typ, country] - df_country_type['hh_10types'] = df_country_type['hh_5types'] * df_dist_households.loc[typ, country] - df_zensus_cells = df_zensus_cells.append(df_country_type, ignore_index=True) - - df_zensus_cells = df_zensus_cells.sort_values(by=['grid_id', 'characteristics_code']).reset_index(drop=True) - - # Cast profile ids into int - df_profiles.columns = pd.MultiIndex.from_tuples([(a, int(b)) for a, b in df_profiles.columns]) - - # Available profiles for each category - pool_size = df_profiles.groupby(level=0, axis=1).size() - - # Annual household electricity demand on NUTS-3 level (demand regio) - df_demand_regio = db.select_dataframe(sql=""" - SELECT year, nuts3, SUM (demand) as demand_mWha - FROM demand.egon_demandregio_hh as egon_d - GROUP BY nuts3, year - ORDER BY year""", index_col=['year', 'nuts3']) - - # Take census cells from two NUTS-3 regions as testcase - # test_data = df_zensus_cells.groupby('nuts3').get_group('DEF03') - # test_data = pd.concat([df_zensus_cells.groupby('nuts3').get_group('DEF03'), - # df_zensus_cells.groupby('nuts3').get_group('DEF06')]) - test_data = df_zensus_cells - - df_cell_demand_metadata = hh_tools.get_cell_demand_metadata(test_data, df_profiles) - df_cell_demand_metadata = 
hh_tools.adjust_to_demand_regio_nuts3_annual(df_cell_demand_metadata, df_profiles, df_demand_regio) + df_cell_demand_metadata = hh_tools.houseprofiles_in_census_cells() diff --git a/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py b/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py index fabacd4f6..0c3447fe4 100644 --- a/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py +++ b/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py @@ -1,5 +1,6 @@ import pandas as pd import numpy as np +from egon.data import db from itertools import cycle import random @@ -9,6 +10,75 @@ import egon.data.config +# Define mapping of zensus household categories to eurostat categories +# - Adults living in househould type +# - number of kids not included even if in housholdtype name +# **! The Eurostat data only gives the amount of adults/seniors, excluding the amount of kids <15** +# eurostat is used for demand-profile-generator @fraunhofer +HH_TYPES = {'SR': [ + ('Einpersonenhaushalte (Singlehaushalte)', 'Insgesamt', 'Seniors'), + ('Alleinerziehende Elternteile', 'Insgesamt', 'Seniors')], + # Single Seniors Single Parents Seniors + 'SO': [('Einpersonenhaushalte (Singlehaushalte)', 'Insgesamt', + 'Adults')], # Single Adults + 'SK': [('Alleinerziehende Elternteile', 'Insgesamt', 'Adults')], + # Single Parents Adult + 'PR': [('Paare ohne Kind(er)', '2 Personen', 'Seniors'), + ('Mehrpersonenhaushalte ohne Kernfamilie', '2 Personen', + 'Seniors')], + # Couples without Kids Senior & same sex couples & shared flat seniors + 'PO': [('Paare ohne Kind(er)', '2 Personen', 'Adults'), + ('Mehrpersonenhaushalte ohne Kernfamilie', '2 Personen', + 'Adults')], + # Couples without Kids adults & same sex couples & shared flat adults + 'P1': [('Paare mit Kind(ern)', '3 Personen', 'Adults')], + 'P2': [('Paare mit Kind(ern)', '4 Personen', 'Adults')], + 'P3': [('Paare mit Kind(ern)', '5 Personen', 'Adults'), + ('Paare mit Kind(ern)', '6 und mehr Personen', 
'Adults')], + 'OR': [('Mehrpersonenhaushalte ohne Kernfamilie', '3 Personen', + 'Seniors'), + ('Mehrpersonenhaushalte ohne Kernfamilie', '4 Personen', + 'Seniors'), + ('Mehrpersonenhaushalte ohne Kernfamilie', '5 Personen', + 'Seniors'), + ('Mehrpersonenhaushalte ohne Kernfamilie', + '6 und mehr Personen', 'Seniors'), + ('Paare mit Kind(ern)', '3 Personen', 'Seniors'), + ('Paare ohne Kind(er)', '3 Personen', 'Seniors'), + ('Paare mit Kind(ern)', '4 Personen', 'Seniors'), + ('Paare ohne Kind(er)', '4 Personen', 'Seniors'), + ('Paare mit Kind(ern)', '5 Personen', 'Seniors'), + ('Paare ohne Kind(er)', '5 Personen', 'Seniors'), + ('Paare mit Kind(ern)', '6 und mehr Personen', 'Seniors'), + ('Paare ohne Kind(er)', '6 und mehr Personen', 'Seniors')], + # no info about share of kids + + # OO, O1, O2 have the same amount, as no information about the share of kids within zensus data set. + # if needed the total amount can be corrected in the hh_tools.get_hh_dist function + # using multi_adjust=True option + 'OO': [('Mehrpersonenhaushalte ohne Kernfamilie', '3 Personen', + 'Adults'), + ('Mehrpersonenhaushalte ohne Kernfamilie', '4 Personen', + 'Adults'), + ('Mehrpersonenhaushalte ohne Kernfamilie', '5 Personen', + 'Adults'), + ('Mehrpersonenhaushalte ohne Kernfamilie', + '6 und mehr Personen', 'Adults'), + ('Paare ohne Kind(er)', '3 Personen', 'Adults'), + ('Paare ohne Kind(er)', '4 Personen', 'Adults'), + ('Paare ohne Kind(er)', '5 Personen', 'Adults'), + ('Paare ohne Kind(er)', '6 und mehr Personen', 'Adults')], + # no info about share of kids + } + +MAPPING_ZENSUS_HH_SUBGROUPS = {1: ['SR', 'SO'], + 2: ['PR', 'PO'], + 3: ['SK'], + 4: ['P1', 'P2', 'P3'], + 5: ['OR', 'OO'], + } + + def clean(x): """Clean zensus household data row-wise @@ -75,6 +145,10 @@ def get_household_demand_profiles_raw(): # set multiindex to HH_types hh_profiles.columns = pd.MultiIndex.from_arrays([hh_profiles.columns.str[:2], hh_profiles.columns.str[3:]]) + # Cast profile ids into int + 
hh_profiles.columns = pd.MultiIndex.from_tuples( + [(a, int(b)) for a, b in hh_profiles.columns]) + return hh_profiles @@ -435,3 +509,123 @@ def get_load_area_max_load(df_profiles, df_cell_demand_metadata, cell_ids, year) part_load = df_profiles.loc[:, df['cell_profile_ids'].sum()].sum(axis=1) * factor / 1e3 # profiles in Wh full_load = full_load.add(part_load) return full_load.max() + + +def houseprofiles_in_census_cells(): + """ + Identify household electricity profiles for each census cell + + + Returns + ------- + + """ + # Get demand profiles and zensus household type x age category data + df_profiles = get_household_demand_profiles_raw() + df_zensus = download_process_zensus_households() + + # hh_tools.get_hh_dist without eurostat adjustment for O1-03 Groups in absolute values + df_hh_types_nad_abs = get_hh_dist(df_zensus, HH_TYPES, + multi_adjust=False, + relative=False) + + # Get household size for each census cell grouped by + # As this is only used to estimate size of households for OR, OO, 1 P and 2 P households are dropped + df_hh_size = db.select_dataframe(sql=""" + SELECT characteristics_text, SUM(quantity) as summe + FROM society.egon_destatis_zensus_household_per_ha as egon_d + WHERE attribute = 'HHGROESS_KLASS' + GROUP BY characteristics_text """, + index_col='characteristics_text') + df_hh_size = df_hh_size.drop(index=['1 Person', '2 Personen']) + + # Define/ estimate number of persons (w/o kids) for each household category + # For categories S* and P* it's clear; for multi-person households (OO,OR) + # the number is estimated as average by taking remaining persons + OO_factor = sum(df_hh_size['summe'] * [3, 4, 5, 6]) / df_hh_size[ + 'summe'].sum() + mapping_people_in_households = {'SR': 1, + 'SO': 1, + 'SK': 1, # kids are excluded + 'PR': 2, + 'PO': 2, + 'P1': 2, # kids are excluded + 'P2': 2, # "" + 'P3': 2, # "" + 'OR': OO_factor, + 'OO': OO_factor, + } + # Determine number of persons for each household category and per federal state + 
df_dist_households = inhabitants_to_households( + df_hh_types_nad_abs, mapping_people_in_households) + + # TODO: + # compare df_dist_households.sum() here with values from other source + # maybe scale on state-level + + # Retrieve information about households for each census cell + df_households_typ = db.select_dataframe(sql=""" + SELECT grid_id, attribute, characteristics_code, characteristics_text, quantity + FROM society.egon_destatis_zensus_household_per_ha + WHERE attribute = 'HHTYP_FAM' """) + df_households_typ = df_households_typ.drop( + columns=['attribute', 'characteristics_text']) + df_households_typ = df_households_typ.rename( + columns={'quantity': 'hh_5types'}) + + # Calculate fraction of persons within subgroup + for value in MAPPING_ZENSUS_HH_SUBGROUPS.values(): + df_dist_households.loc[value] = df_dist_households.loc[value].div( + df_dist_households.loc[value].sum()) + + # Census cells with nuts3 and nuts1 information + df_grid_id = db.select_dataframe(sql=""" + SELECT pop.grid_id, pop.gid, vg250.vg250_nuts3 as nuts3, lan.nuts as nuts1, lan.gen + FROM society.destatis_zensus_population_per_ha_inside_germany as pop + LEFT JOIN boundaries.egon_map_zensus_vg250 as vg250 + ON (pop.gid=vg250.zensus_population_id) + LEFT JOIN boundaries.vg250_lan as lan + ON (LEFT(vg250.vg250_nuts3, 3)=lan.nuts) """) + df_grid_id = df_grid_id.drop_duplicates() + df_grid_id = df_grid_id.reset_index(drop=True) + + # Merge household type and size data with considered (populated) census cells + # how='inner' is used as ids of unpopulated areas are removed df_grid_id or earliers tables. 
see here: + # https://github.com/openego/eGon-data/blob/59195926e41c8bd6d1ca8426957b97f33ef27bcc/src/egon/data/importing/zensus/__init__.py#L418-L449 + df_households_typ = pd.merge(df_households_typ, df_grid_id[ + ['grid_id', 'gen', 'nuts1', 'nuts3']], + left_on='grid_id', right_on='grid_id', + how='inner') + + # Merge Zensus nuts level household data with zensus cell level by dividing hh-groups with mapping_zensus_hh_subgroups + df_zensus_cells = pd.DataFrame() + for (country, code), df_country_type in df_households_typ.groupby( + ['gen', 'characteristics_code']): + + # iterate over zenus_country subgroups + for typ in MAPPING_ZENSUS_HH_SUBGROUPS[code]: + df_country_type['hh_type'] = typ + df_country_type['factor'] = df_dist_households.loc[typ, country] + df_country_type['hh_10types'] = df_country_type['hh_5types'] * \ + df_dist_households.loc[ + typ, country] + df_zensus_cells = df_zensus_cells.append(df_country_type, + ignore_index=True) + + df_zensus_cells = df_zensus_cells.sort_values( + by=['grid_id', 'characteristics_code']).reset_index(drop=True) + + # Annual household electricity demand on NUTS-3 level (demand regio) + df_demand_regio = db.select_dataframe(sql=""" + SELECT year, nuts3, SUM (demand) as demand_mWha + FROM demand.egon_demandregio_hh as egon_d + GROUP BY nuts3, year + ORDER BY year""", index_col=['year', 'nuts3']) + + # Finally create table that stores profile ids for each cell + df_cell_demand_metadata = get_cell_demand_metadata(df_zensus_cells, + df_profiles) + df_cell_demand_metadata = adjust_to_demand_regio_nuts3_annual( + df_cell_demand_metadata, df_profiles, df_demand_regio) + + return df_cell_demand_metadata From 21b7895c937ef9b45016e8c3fee9db3aaad394ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guido=20Ple=C3=9Fmann?= Date: Fri, 4 Jun 2021 16:40:17 +0200 Subject: [PATCH 19/97] Store census cell x household electricity profile mapping in db table --- .../hh_demand/hh_demand_profiles.py | 17 ++----- 
.../hh_demand/hh_demand_profiles_tools.py | 51 ++++++++++++++++--- 2 files changed, 48 insertions(+), 20 deletions(-) diff --git a/src/egon/data/processing/hh_demand/hh_demand_profiles.py b/src/egon/data/processing/hh_demand/hh_demand_profiles.py index a6d0f685b..855dcf4f6 100644 --- a/src/egon/data/processing/hh_demand/hh_demand_profiles.py +++ b/src/egon/data/processing/hh_demand/hh_demand_profiles.py @@ -1,26 +1,19 @@ -#!/usr/bin/env python -# coding: utf-8 - -import os -import pandas as pd -from egon.data import db - from egon.data.processing.hh_demand import hh_demand_profiles_tools as hh_tools - - if __name__ == "__main__": - + df_profiles = hh_tools.get_household_demand_profiles_raw() - df_cell_demand_metadata = hh_tools.houseprofiles_in_census_cells() + hh_tools.houseprofiles_in_census_cells() + + df_cell_demand_metadata = hh_tools.get_houseprofiles_in_census_cells() import random load_area_cell_ids = random.sample(list(df_cell_demand_metadata.index), 100) max_value_load_area = hh_tools.get_load_area_max_load(df_profiles, df_cell_demand_metadata, load_area_cell_ids, 2035) - # print(df_cell_demand_metadata.shape) print(max_value_load_area) + print(df_cell_demand_metadata.shape) diff --git a/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py b/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py index 0c3447fe4..3ef6b90fd 100644 --- a/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py +++ b/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py @@ -6,6 +6,9 @@ import random from pathlib import Path from urllib.request import urlretrieve +from sqlalchemy import Column, String, Float, Integer, ARRAY +from sqlalchemy.ext.declarative import declarative_base +Base = declarative_base() import egon.data.config @@ -79,6 +82,18 @@ } +class HouseholdElectricityProfilesInCensusCells(Base): + __tablename__ = "household_electricity_profiles_in_census_cells" + __table_args__ = {"schema": "demand"} + + cell_id = Column(Integer, 
primary_key=True) + cell_profile_ids = Column(ARRAY(String, dimensions=2)) + nuts3 = Column(String) + nuts1 = Column(String) + factor_2035 = Column(Float) + factor_2050 = Column(Float) + + def clean(x): """Clean zensus household data row-wise @@ -404,8 +419,8 @@ def get_cell_demand_metadata(df_zensus_cells, df_profiles): """ df_cell_demand_metadata = pd.DataFrame(index=df_zensus_cells.grid_id.unique(), - columns=['cell_profile_ids', 'nuts3', 'nuts1', '2035_factor', - '2050_factor', ]) + columns=['cell_profile_ids', 'nuts3', 'nuts1', 'factor_2035', + 'factor_2050', ]) # 'peak_loads_hh', 'peak_load_cell', df_cell_demand_metadata = df_cell_demand_metadata.rename_axis('cell_id') @@ -452,7 +467,7 @@ def adjust_to_demand_regio_nuts3_annual(df_cell_demand_metadata, df_profiles, df ------- pd.DataFrame Returns the same data as :func:`get_cell_demand_metadata`, but with - filled columns `2035_factor` and `2050_factor`. + filled columns `factor_2035` and `factor_2050`. """ for nuts3_id, df_nuts3 in df_cell_demand_metadata.groupby(by='nuts3'): nuts3_cell_ids = df_nuts3.index @@ -466,10 +481,10 @@ def adjust_to_demand_regio_nuts3_annual(df_cell_demand_metadata, df_profiles, df # ############## # demand regio in MWh # profiles in Wh - df_cell_demand_metadata.loc[nuts3_cell_ids, '2035_factor'] = df_demand_regio.loc[ + df_cell_demand_metadata.loc[nuts3_cell_ids, 'factor_2035'] = df_demand_regio.loc[ (2035, nuts3_id), 'demand_mwha'] * 1e3 / ( nuts3_profiles_sum_annual / 1e3) - df_cell_demand_metadata.loc[nuts3_cell_ids, '2050_factor'] = df_demand_regio.loc[ + df_cell_demand_metadata.loc[nuts3_cell_ids, 'factor_2050'] = df_demand_regio.loc[ (2050, nuts3_id), 'demand_mwha'] * 1e3 / ( nuts3_profiles_sum_annual / 1e3) @@ -504,8 +519,8 @@ def get_load_area_max_load(df_profiles, df_cell_demand_metadata, cell_ids, year) """ timesteps = len(df_profiles) full_load = pd.Series(data=np.zeros(timesteps), dtype=np.float64, index=range(timesteps)) - load_area_meta = 
df_cell_demand_metadata.loc[cell_ids, ['cell_profile_ids', 'nuts3', f'{year}_factor']] - for (nuts3, factor), df in load_area_meta.groupby(by=['nuts3', f'{year}_factor']): + load_area_meta = df_cell_demand_metadata.loc[cell_ids, ['cell_profile_ids', 'nuts3', f'factor_{year}']] + for (nuts3, factor), df in load_area_meta.groupby(by=['nuts3', f'factor_{year}']): part_load = df_profiles.loc[:, df['cell_profile_ids'].sum()].sum(axis=1) * factor / 1e3 # profiles in Wh full_load = full_load.add(part_load) return full_load.max() @@ -628,4 +643,24 @@ def houseprofiles_in_census_cells(): df_cell_demand_metadata = adjust_to_demand_regio_nuts3_annual( df_cell_demand_metadata, df_profiles, df_demand_regio) - return df_cell_demand_metadata + # Insert data into respective database table + engine = db.engine() + HouseholdElectricityProfilesInCensusCells.__table__.drop(bind=engine, + checkfirst=True) + HouseholdElectricityProfilesInCensusCells.__table__.create(bind=engine, + checkfirst=True) + with db.session_scope() as session: + session.bulk_insert_mappings(HouseholdElectricityProfilesInCensusCells, + df_cell_demand_metadata.to_dict(orient="records")) + + +def get_houseprofiles_in_census_cells(): + with db.session_scope() as session: + q = session.query(HouseholdElectricityProfilesInCensusCells) + + census_profile_mapping = pd.read_sql(q.statement, q.session.bind, index_col="cell_id") + + census_profile_mapping["cell_profile_ids"] = census_profile_mapping["cell_profile_ids"].apply( + lambda x: [(cat, int(profile_id)) for cat, profile_id in x]) + + return census_profile_mapping From 2dd9e1eef8462907bf653df3ae0bc64cd2052581 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guido=20Ple=C3=9Fmann?= Date: Tue, 8 Jun 2021 10:41:01 +0200 Subject: [PATCH 20/97] Introduce option to calculate aggregated peak load --- .../processing/hh_demand/hh_demand_profiles.py | 2 +- .../hh_demand/hh_demand_profiles_tools.py | 16 ++++++++++++++-- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git 
a/src/egon/data/processing/hh_demand/hh_demand_profiles.py b/src/egon/data/processing/hh_demand/hh_demand_profiles.py index 855dcf4f6..c3b478747 100644 --- a/src/egon/data/processing/hh_demand/hh_demand_profiles.py +++ b/src/egon/data/processing/hh_demand/hh_demand_profiles.py @@ -12,7 +12,7 @@ import random load_area_cell_ids = random.sample(list(df_cell_demand_metadata.index), 100) - max_value_load_area = hh_tools.get_load_area_max_load(df_profiles, df_cell_demand_metadata, load_area_cell_ids, 2035) + max_value_load_area = hh_tools.get_load_timeseries(df_profiles, df_cell_demand_metadata, cell_ids, 2035, peak_load_only=False) print(max_value_load_area) print(df_cell_demand_metadata.shape) diff --git a/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py b/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py index 3ef6b90fd..2a9a58a22 100644 --- a/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py +++ b/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py @@ -491,7 +491,7 @@ def adjust_to_demand_regio_nuts3_annual(df_cell_demand_metadata, df_profiles, df return df_cell_demand_metadata -def get_load_area_max_load(df_profiles, df_cell_demand_metadata, cell_ids, year): +def get_load_timeseries(df_profiles, df_cell_demand_metadata, cell_ids, year, peak_load_only=False): """ Get peak load for one load area @@ -515,7 +515,16 @@ def get_load_area_max_load(df_profiles, df_cell_demand_metadata, cell_ids, year) year: int Scenario year. Is used to consider the scaling factor for aligning annual demand to NUTS-3 data. + peak_load_only: bool + If true, only the peak load value is returned (the type of the return + value is `float`). Defaults to False which returns the entire time + series as pd.Series. + Returns + ------- + pd.Series or float + Aggregated time series for given `cell_ids` or peak load of this time + series. 
""" timesteps = len(df_profiles) full_load = pd.Series(data=np.zeros(timesteps), dtype=np.float64, index=range(timesteps)) @@ -523,7 +532,10 @@ def get_load_area_max_load(df_profiles, df_cell_demand_metadata, cell_ids, year) for (nuts3, factor), df in load_area_meta.groupby(by=['nuts3', f'factor_{year}']): part_load = df_profiles.loc[:, df['cell_profile_ids'].sum()].sum(axis=1) * factor / 1e3 # profiles in Wh full_load = full_load.add(part_load) - return full_load.max() + if peak_load_only: + return full_load.max() + else: + return full_load def houseprofiles_in_census_cells(): From 5adb3628492af552c188e41c81c430d48e85da1d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guido=20Ple=C3=9Fmann?= Date: Wed, 9 Jun 2021 19:05:35 +0200 Subject: [PATCH 21/97] Create HH demand profiles for each MV grid --- .../hh_demand/hh_demand_profiles.py | 13 ++----- .../hh_demand/hh_demand_profiles_tools.py | 37 ++++++++++++++++++- 2 files changed, 39 insertions(+), 11 deletions(-) diff --git a/src/egon/data/processing/hh_demand/hh_demand_profiles.py b/src/egon/data/processing/hh_demand/hh_demand_profiles.py index c3b478747..774075d13 100644 --- a/src/egon/data/processing/hh_demand/hh_demand_profiles.py +++ b/src/egon/data/processing/hh_demand/hh_demand_profiles.py @@ -3,17 +3,10 @@ if __name__ == "__main__": - df_profiles = hh_tools.get_household_demand_profiles_raw() + # Create table with mapping of census cells and household elec. 
profiles hh_tools.houseprofiles_in_census_cells() - df_cell_demand_metadata = hh_tools.get_houseprofiles_in_census_cells() - - - - import random - load_area_cell_ids = random.sample(list(df_cell_demand_metadata.index), 100) - max_value_load_area = hh_tools.get_load_timeseries(df_profiles, df_cell_demand_metadata, cell_ids, 2035, peak_load_only=False) - print(max_value_load_area) - print(df_cell_demand_metadata.shape) + # Calculate household electricity demand time series for each MV grid + hh_tools.mv_grid_district_HH_electricity_load() diff --git a/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py b/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py index 2a9a58a22..3235a6568 100644 --- a/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py +++ b/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py @@ -1,12 +1,14 @@ import pandas as pd import numpy as np from egon.data import db +from egon.data.processing.zensus_grid_districts import MapZensusGridDistricts +from egon.data.processing.zensus_vg250.zensus_population_inside_germany import DestatisZensusPopulationPerHaInsideGermany, DestatisZensusPopulationPerHa from itertools import cycle import random from pathlib import Path from urllib.request import urlretrieve -from sqlalchemy import Column, String, Float, Integer, ARRAY +from sqlalchemy import Column, String, Float, Integer, ARRAY, ForeignKey, text from sqlalchemy.ext.declarative import declarative_base Base = declarative_base() @@ -676,3 +678,36 @@ def get_houseprofiles_in_census_cells(): lambda x: [(cat, int(profile_id)) for cat, profile_id in x]) return census_profile_mapping + + +def mv_grid_district_HH_electricity_load(): + + with db.session_scope() as session: + cells_query = session.query( + HouseholdElectricityProfilesInCensusCells, MapZensusGridDistricts.subst_id).join( + MapZensusGridDistricts, + HouseholdElectricityProfilesInCensusCells.cell_id == MapZensusGridDistricts.zensus_population_id + ) + + cells = 
pd.read_sql(cells_query.statement, cells_query.session.bind, index_col="cell_id") + cells["cell_profile_ids"] = cells[ + "cell_profile_ids"].apply( + lambda x: [(cat, int(profile_id)) for cat, profile_id in x]) + + # Create aggregated load profile for each MV grid district + df_profiles = get_household_demand_profiles_raw() + + mvgd_profiles_list = [] + for grid_district, data in cells.groupby("subst_id"): + mvgd_profile = get_load_timeseries(df_profiles, + data, + data.index, + 2035, + peak_load_only=False) + mvgd_profile.name = grid_district + mvgd_profiles_list.append(mvgd_profile) + + mvgd_profiles = pd.concat(mvgd_profiles_list, axis=1) + + return mvgd_profiles + From ff1eeaa6a35a48775353b544ac49f4fe8450015b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guido=20Ple=C3=9Fmann?= Date: Wed, 9 Jun 2021 19:06:15 +0200 Subject: [PATCH 22/97] Add functions to check created HH data --- .../hh_demand/hh_demand_profiles.py | 5 ++ .../hh_demand/hh_demand_profiles_tools.py | 66 +++++++++++++++++++ 2 files changed, 71 insertions(+) diff --git a/src/egon/data/processing/hh_demand/hh_demand_profiles.py b/src/egon/data/processing/hh_demand/hh_demand_profiles.py index 774075d13..5e11c3fe2 100644 --- a/src/egon/data/processing/hh_demand/hh_demand_profiles.py +++ b/src/egon/data/processing/hh_demand/hh_demand_profiles.py @@ -9,4 +9,9 @@ # Calculate household electricity demand time series for each MV grid hh_tools.mv_grid_district_HH_electricity_load() + # ONLY FOR CHECKING + # Create table with profiles for each census cell including geom + hh_tools.mv_grid_district_HH_electricity_load_check() + # Create table with zensus households including geom from zensus population table + hh_tools.zensus_household_with_geom_check() diff --git a/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py b/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py index 3235a6568..508b11135 100644 --- a/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py +++ 
b/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py @@ -711,3 +711,69 @@ def mv_grid_district_HH_electricity_load(): return mvgd_profiles + +def mv_grid_district_HH_electricity_load_check(): + with db.session_scope() as session: + cells_w_geom_query = session.query( + HouseholdElectricityProfilesInCensusCells, + MapZensusGridDistricts.subst_id, + DestatisZensusPopulationPerHaInsideGermany.population, + DestatisZensusPopulationPerHaInsideGermany.geom + ).join( + MapZensusGridDistricts, + HouseholdElectricityProfilesInCensusCells.cell_id == MapZensusGridDistricts.zensus_population_id + ).join(DestatisZensusPopulationPerHaInsideGermany, + HouseholdElectricityProfilesInCensusCells.cell_id == DestatisZensusPopulationPerHaInsideGermany.gid) + + # Used for visual check of data + import geopandas as gpd + cells_w_geom = gpd.read_postgis(cells_w_geom_query.statement, cells_w_geom_query.session.bind, index_col="cell_id") + cells_w_geom.to_postgis( + "household_electricity_profiles_in_census_cells_with_geom", + schema="demand", + con=db.engine(), + if_exists="replace") + + +class EgonDestatisZensusHouseholdPerHa(Base): + __tablename__ = 'egon_destatis_zensus_household_per_ha' + __table_args__ = {'schema': 'society'} + + id = Column(Integer, primary_key=True, server_default=text("nextval('society.egon_destatis_zensus_household_per_ha_id_seq'::regclass)")) + grid_id = Column(String(50)) + grid_id_new = Column(String(50)) + attribute = Column(String(50)) + characteristics_code = Column(Integer) + characteristics_text = Column(String) + quantity = Column(Integer) + quantity_q = Column(Integer) + zensus_population_id = Column(ForeignKey('society.destatis_zensus_population_per_ha.id')) + + +def zensus_household_with_geom_check(): + import geopandas as gpd + + with db.session_scope() as session: + + # Household x Zensus population x Zensus grid district mapping + cells_w_geom_query = session.query( + EgonDestatisZensusHouseholdPerHa, + MapZensusGridDistricts.subst_id, + 
DestatisZensusPopulationPerHaInsideGermany.population, + DestatisZensusPopulationPerHaInsideGermany.geom + ).join( + MapZensusGridDistricts, + EgonDestatisZensusHouseholdPerHa.zensus_population_id == MapZensusGridDistricts.zensus_population_id + ).join(DestatisZensusPopulationPerHaInsideGermany, + EgonDestatisZensusHouseholdPerHa.zensus_population_id == DestatisZensusPopulationPerHaInsideGermany.gid + ) + cells_w_geom = gpd.read_postgis(cells_w_geom_query.statement, + cells_w_geom_query.session.bind, + index_col="zensus_population_id") + + # Used for visual check of data + cells_w_geom.to_postgis( + "egon_destatis_zensus_household_per_ha_with_geom", + schema="society", + con=db.engine(), + if_exists="replace") From 9efc0a9bceabf8f8c245d3d16641d113cf36e25a Mon Sep 17 00:00:00 2001 From: "Julian.Endres" Date: Thu, 10 Jun 2021 14:35:46 +0200 Subject: [PATCH 23/97] fix false lable of grid_id and add cell_id to output-table --- .../hh_demand/hh_demand_profiles_tools.py | 34 ++++++++++--------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py b/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py index 508b11135..1c3126324 100644 --- a/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py +++ b/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py @@ -88,7 +88,8 @@ class HouseholdElectricityProfilesInCensusCells(Base): __tablename__ = "household_electricity_profiles_in_census_cells" __table_args__ = {"schema": "demand"} - cell_id = Column(Integer, primary_key=True) + cell_id = Column(String, primary_key=True) + grid_id = Column(String) cell_profile_ids = Column(ARRAY(String, dimensions=2)) nuts3 = Column(String) nuts1 = Column(String) @@ -421,23 +422,24 @@ def get_cell_demand_metadata(df_zensus_cells, df_profiles): """ df_cell_demand_metadata = pd.DataFrame(index=df_zensus_cells.grid_id.unique(), - columns=['cell_profile_ids', 'nuts3', 'nuts1', 'factor_2035', + 
columns=['cell_profile_ids', 'cell_id', 'nuts3', 'nuts1', 'factor_2035', 'factor_2050', ]) # 'peak_loads_hh', 'peak_load_cell', - df_cell_demand_metadata = df_cell_demand_metadata.rename_axis('cell_id') + df_cell_demand_metadata = df_cell_demand_metadata.rename_axis('grid_id') pool_size = df_profiles.groupby(level=0, axis=1).size() - for cell_id, df_cell in df_zensus_cells.groupby(by='grid_id'): + for grid_id, df_cell in df_zensus_cells.groupby(by='grid_id'): # FIXME - # ! runden der Haushaltszahlen auf int + # ! runden der Haushaltszahlen auf int -> zu einfach! # ! kein zurücklegen innerhalb einer Zelle ?! -> das is ok. # cell_profile_ids = get_cell_demand_profile_ids(df_cell, pool_size, df_profiles) cell_profile_ids = get_cell_demand_profile_ids(df_cell, pool_size) - df_cell_demand_metadata.at[cell_id, 'cell_profile_ids'] = cell_profile_ids - df_cell_demand_metadata.at[cell_id, 'nuts3'] = df_cell.loc[:, 'nuts3'].unique()[0] - df_cell_demand_metadata.at[cell_id, 'nuts1'] = df_cell.loc[:, 'nuts1'].unique()[0] + df_cell_demand_metadata.at[grid_id, 'cell_id'] = df_cell.loc[:, 'cell_id'].unique()[0] + df_cell_demand_metadata.at[grid_id, 'cell_profile_ids'] = cell_profile_ids + df_cell_demand_metadata.at[grid_id, 'nuts3'] = df_cell.loc[:, 'nuts3'].unique()[0] + df_cell_demand_metadata.at[grid_id, 'nuts1'] = df_cell.loc[:, 'nuts1'].unique()[0] return df_cell_demand_metadata @@ -609,7 +611,7 @@ def houseprofiles_in_census_cells(): # Census cells with nuts3 and nuts1 information df_grid_id = db.select_dataframe(sql=""" - SELECT pop.grid_id, pop.gid, vg250.vg250_nuts3 as nuts3, lan.nuts as nuts1, lan.gen + SELECT pop.grid_id, pop.gid as cell_id, vg250.vg250_nuts3 as nuts3, lan.nuts as nuts1, lan.gen FROM society.destatis_zensus_population_per_ha_inside_germany as pop LEFT JOIN boundaries.egon_map_zensus_vg250 as vg250 ON (pop.gid=vg250.zensus_population_id) @@ -621,8 +623,7 @@ def houseprofiles_in_census_cells(): # Merge household type and size data with considered 
(populated) census cells # how='inner' is used as ids of unpopulated areas are removed df_grid_id or earliers tables. see here: # https://github.com/openego/eGon-data/blob/59195926e41c8bd6d1ca8426957b97f33ef27bcc/src/egon/data/importing/zensus/__init__.py#L418-L449 - df_households_typ = pd.merge(df_households_typ, df_grid_id[ - ['grid_id', 'gen', 'nuts1', 'nuts3']], + df_households_typ = pd.merge(df_households_typ, df_grid_id, left_on='grid_id', right_on='grid_id', how='inner') @@ -656,6 +657,7 @@ def houseprofiles_in_census_cells(): df_profiles) df_cell_demand_metadata = adjust_to_demand_regio_nuts3_annual( df_cell_demand_metadata, df_profiles, df_demand_regio) + df_cell_demand_metadata = df_cell_demand_metadata.reset_index(drop=False) # Insert data into respective database table engine = db.engine() @@ -699,11 +701,11 @@ def mv_grid_district_HH_electricity_load(): mvgd_profiles_list = [] for grid_district, data in cells.groupby("subst_id"): - mvgd_profile = get_load_timeseries(df_profiles, - data, - data.index, - 2035, - peak_load_only=False) + mvgd_profile = get_load_timeseries(df_profiles=df_profiles, + df_cell_demand_metadata=data, + cell_ids=data.index, + year=2035, + peak_load_only=False) mvgd_profile.name = grid_district mvgd_profiles_list.append(mvgd_profile) From 14ea5865c3d9ed82d9db1a605f50b4c00a6a9026 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guido=20Ple=C3=9Fmann?= Date: Thu, 10 Jun 2021 16:16:12 +0200 Subject: [PATCH 24/97] Change cell_id type to int and cast data before insertation explicitly --- src/egon/data/processing/hh_demand/hh_demand_profiles.py | 3 ++- src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/egon/data/processing/hh_demand/hh_demand_profiles.py b/src/egon/data/processing/hh_demand/hh_demand_profiles.py index 5e11c3fe2..989258195 100644 --- a/src/egon/data/processing/hh_demand/hh_demand_profiles.py +++ 
b/src/egon/data/processing/hh_demand/hh_demand_profiles.py @@ -7,7 +7,8 @@ hh_tools.houseprofiles_in_census_cells() # Calculate household electricity demand time series for each MV grid - hh_tools.mv_grid_district_HH_electricity_load() + profiles = hh_tools.mv_grid_district_HH_electricity_load() + print(profiles) # ONLY FOR CHECKING # Create table with profiles for each census cell including geom diff --git a/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py b/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py index 1c3126324..bc2ec7b52 100644 --- a/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py +++ b/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py @@ -88,7 +88,7 @@ class HouseholdElectricityProfilesInCensusCells(Base): __tablename__ = "household_electricity_profiles_in_census_cells" __table_args__ = {"schema": "demand"} - cell_id = Column(String, primary_key=True) + cell_id = Column(Integer, primary_key=True) grid_id = Column(String) cell_profile_ids = Column(ARRAY(String, dimensions=2)) nuts3 = Column(String) @@ -665,6 +665,7 @@ def houseprofiles_in_census_cells(): checkfirst=True) HouseholdElectricityProfilesInCensusCells.__table__.create(bind=engine, checkfirst=True) + df_cell_demand_metadata["cell_id"] = df_cell_demand_metadata["cell_id"].astype(int) with db.session_scope() as session: session.bulk_insert_mappings(HouseholdElectricityProfilesInCensusCells, df_cell_demand_metadata.to_dict(orient="records")) From 5e0f1d3af135b180895cdc04e0a80ce1795a40c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guido=20Ple=C3=9Fmann?= Date: Thu, 10 Jun 2021 16:36:44 +0200 Subject: [PATCH 25/97] Inform about the small deviation introduced by ceiling people to int --- .../data/processing/hh_demand/hh_demand_profiles_tools.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py b/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py index 
bc2ec7b52..304460385 100644 --- a/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py +++ b/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py @@ -307,8 +307,8 @@ def inhabitants_to_households(df_people_by_householdtypes_abs, mapping_people_in # divide amount of people by people in household types df_households_by_type = df_people_by_householdtypes_abs.div(mapping_people_in_households, axis=0) - # TODO: check @ Guido - # round up households + # Number of people gets adjusted to integer values by ceiling + # This introduces a small deviation df_households_by_type = df_households_by_type.apply(np.ceil) return df_households_by_type From f4f2ef4cdd20a95c1f953e8747eb606a3e02233d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guido=20Ple=C3=9Fmann?= Date: Thu, 10 Jun 2021 17:02:16 +0200 Subject: [PATCH 26/97] Remove TODO note Discussion is moved here: https://github.com/openego/eGon-data/issues/256#issuecomment-858683346 --- .../data/processing/hh_demand/hh_demand_profiles_tools.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py b/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py index 304460385..e613b04b7 100644 --- a/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py +++ b/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py @@ -590,10 +590,6 @@ def houseprofiles_in_census_cells(): df_dist_households = inhabitants_to_households( df_hh_types_nad_abs, mapping_people_in_households) - # TODO: - # compare df_dist_households.sum() here with values from other source - # maybe scale on state-level - # Retrieve information about households for each census cell df_households_typ = db.select_dataframe(sql=""" SELECT grid_id, attribute, characteristics_code, characteristics_text, quantity From 71e496b5f29f18f72087d7272af961e749c8ea3c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guido=20Ple=C3=9Fmann?= Date: Thu, 10 Jun 2021 17:06:10 +0200 Subject: [PATCH 27/97] Black 
and isort --- .../hh_demand/hh_demand_profiles.py | 1 - .../hh_demand/hh_demand_profiles_tools.py | 653 ++++++++++++------ 2 files changed, 427 insertions(+), 227 deletions(-) diff --git a/src/egon/data/processing/hh_demand/hh_demand_profiles.py b/src/egon/data/processing/hh_demand/hh_demand_profiles.py index 989258195..0cdb513db 100644 --- a/src/egon/data/processing/hh_demand/hh_demand_profiles.py +++ b/src/egon/data/processing/hh_demand/hh_demand_profiles.py @@ -1,6 +1,5 @@ from egon.data.processing.hh_demand import hh_demand_profiles_tools as hh_tools - if __name__ == "__main__": # Create table with mapping of census cells and household elec. profiles diff --git a/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py b/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py index e613b04b7..2a8697391 100644 --- a/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py +++ b/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py @@ -1,87 +1,102 @@ -import pandas as pd -import numpy as np -from egon.data import db -from egon.data.processing.zensus_grid_districts import MapZensusGridDistricts -from egon.data.processing.zensus_vg250.zensus_population_inside_germany import DestatisZensusPopulationPerHaInsideGermany, DestatisZensusPopulationPerHa - from itertools import cycle -import random from pathlib import Path from urllib.request import urlretrieve -from sqlalchemy import Column, String, Float, Integer, ARRAY, ForeignKey, text +import random + +from sqlalchemy import ARRAY, Column, Float, ForeignKey, Integer, String, text from sqlalchemy.ext.declarative import declarative_base +import numpy as np +import pandas as pd + +from egon.data import db +from egon.data.processing.zensus_grid_districts import MapZensusGridDistricts +from egon.data.processing.zensus_vg250.zensus_population_inside_germany import ( + DestatisZensusPopulationPerHa, + DestatisZensusPopulationPerHaInsideGermany, +) + Base = declarative_base() import 
egon.data.config - # Define mapping of zensus household categories to eurostat categories # - Adults living in househould type # - number of kids not included even if in housholdtype name # **! The Eurostat data only gives the amount of adults/seniors, excluding the amount of kids <15** # eurostat is used for demand-profile-generator @fraunhofer -HH_TYPES = {'SR': [ - ('Einpersonenhaushalte (Singlehaushalte)', 'Insgesamt', 'Seniors'), - ('Alleinerziehende Elternteile', 'Insgesamt', 'Seniors')], - # Single Seniors Single Parents Seniors - 'SO': [('Einpersonenhaushalte (Singlehaushalte)', 'Insgesamt', - 'Adults')], # Single Adults - 'SK': [('Alleinerziehende Elternteile', 'Insgesamt', 'Adults')], - # Single Parents Adult - 'PR': [('Paare ohne Kind(er)', '2 Personen', 'Seniors'), - ('Mehrpersonenhaushalte ohne Kernfamilie', '2 Personen', - 'Seniors')], - # Couples without Kids Senior & same sex couples & shared flat seniors - 'PO': [('Paare ohne Kind(er)', '2 Personen', 'Adults'), - ('Mehrpersonenhaushalte ohne Kernfamilie', '2 Personen', - 'Adults')], - # Couples without Kids adults & same sex couples & shared flat adults - 'P1': [('Paare mit Kind(ern)', '3 Personen', 'Adults')], - 'P2': [('Paare mit Kind(ern)', '4 Personen', 'Adults')], - 'P3': [('Paare mit Kind(ern)', '5 Personen', 'Adults'), - ('Paare mit Kind(ern)', '6 und mehr Personen', 'Adults')], - 'OR': [('Mehrpersonenhaushalte ohne Kernfamilie', '3 Personen', - 'Seniors'), - ('Mehrpersonenhaushalte ohne Kernfamilie', '4 Personen', - 'Seniors'), - ('Mehrpersonenhaushalte ohne Kernfamilie', '5 Personen', - 'Seniors'), - ('Mehrpersonenhaushalte ohne Kernfamilie', - '6 und mehr Personen', 'Seniors'), - ('Paare mit Kind(ern)', '3 Personen', 'Seniors'), - ('Paare ohne Kind(er)', '3 Personen', 'Seniors'), - ('Paare mit Kind(ern)', '4 Personen', 'Seniors'), - ('Paare ohne Kind(er)', '4 Personen', 'Seniors'), - ('Paare mit Kind(ern)', '5 Personen', 'Seniors'), - ('Paare ohne Kind(er)', '5 Personen', 'Seniors'), - 
('Paare mit Kind(ern)', '6 und mehr Personen', 'Seniors'), - ('Paare ohne Kind(er)', '6 und mehr Personen', 'Seniors')], - # no info about share of kids - - # OO, O1, O2 have the same amount, as no information about the share of kids within zensus data set. - # if needed the total amount can be corrected in the hh_tools.get_hh_dist function - # using multi_adjust=True option - 'OO': [('Mehrpersonenhaushalte ohne Kernfamilie', '3 Personen', - 'Adults'), - ('Mehrpersonenhaushalte ohne Kernfamilie', '4 Personen', - 'Adults'), - ('Mehrpersonenhaushalte ohne Kernfamilie', '5 Personen', - 'Adults'), - ('Mehrpersonenhaushalte ohne Kernfamilie', - '6 und mehr Personen', 'Adults'), - ('Paare ohne Kind(er)', '3 Personen', 'Adults'), - ('Paare ohne Kind(er)', '4 Personen', 'Adults'), - ('Paare ohne Kind(er)', '5 Personen', 'Adults'), - ('Paare ohne Kind(er)', '6 und mehr Personen', 'Adults')], - # no info about share of kids - } - -MAPPING_ZENSUS_HH_SUBGROUPS = {1: ['SR', 'SO'], - 2: ['PR', 'PO'], - 3: ['SK'], - 4: ['P1', 'P2', 'P3'], - 5: ['OR', 'OO'], - } +HH_TYPES = { + "SR": [ + ("Einpersonenhaushalte (Singlehaushalte)", "Insgesamt", "Seniors"), + ("Alleinerziehende Elternteile", "Insgesamt", "Seniors"), + ], + # Single Seniors Single Parents Seniors + "SO": [ + ("Einpersonenhaushalte (Singlehaushalte)", "Insgesamt", "Adults") + ], # Single Adults + "SK": [("Alleinerziehende Elternteile", "Insgesamt", "Adults")], + # Single Parents Adult + "PR": [ + ("Paare ohne Kind(er)", "2 Personen", "Seniors"), + ("Mehrpersonenhaushalte ohne Kernfamilie", "2 Personen", "Seniors"), + ], + # Couples without Kids Senior & same sex couples & shared flat seniors + "PO": [ + ("Paare ohne Kind(er)", "2 Personen", "Adults"), + ("Mehrpersonenhaushalte ohne Kernfamilie", "2 Personen", "Adults"), + ], + # Couples without Kids adults & same sex couples & shared flat adults + "P1": [("Paare mit Kind(ern)", "3 Personen", "Adults")], + "P2": [("Paare mit Kind(ern)", "4 Personen", "Adults")], + "P3": 
[ + ("Paare mit Kind(ern)", "5 Personen", "Adults"), + ("Paare mit Kind(ern)", "6 und mehr Personen", "Adults"), + ], + "OR": [ + ("Mehrpersonenhaushalte ohne Kernfamilie", "3 Personen", "Seniors"), + ("Mehrpersonenhaushalte ohne Kernfamilie", "4 Personen", "Seniors"), + ("Mehrpersonenhaushalte ohne Kernfamilie", "5 Personen", "Seniors"), + ( + "Mehrpersonenhaushalte ohne Kernfamilie", + "6 und mehr Personen", + "Seniors", + ), + ("Paare mit Kind(ern)", "3 Personen", "Seniors"), + ("Paare ohne Kind(er)", "3 Personen", "Seniors"), + ("Paare mit Kind(ern)", "4 Personen", "Seniors"), + ("Paare ohne Kind(er)", "4 Personen", "Seniors"), + ("Paare mit Kind(ern)", "5 Personen", "Seniors"), + ("Paare ohne Kind(er)", "5 Personen", "Seniors"), + ("Paare mit Kind(ern)", "6 und mehr Personen", "Seniors"), + ("Paare ohne Kind(er)", "6 und mehr Personen", "Seniors"), + ], + # no info about share of kids + # OO, O1, O2 have the same amount, as no information about the share of kids within zensus data set. 
+ # if needed the total amount can be corrected in the hh_tools.get_hh_dist function + # using multi_adjust=True option + "OO": [ + ("Mehrpersonenhaushalte ohne Kernfamilie", "3 Personen", "Adults"), + ("Mehrpersonenhaushalte ohne Kernfamilie", "4 Personen", "Adults"), + ("Mehrpersonenhaushalte ohne Kernfamilie", "5 Personen", "Adults"), + ( + "Mehrpersonenhaushalte ohne Kernfamilie", + "6 und mehr Personen", + "Adults", + ), + ("Paare ohne Kind(er)", "3 Personen", "Adults"), + ("Paare ohne Kind(er)", "4 Personen", "Adults"), + ("Paare ohne Kind(er)", "5 Personen", "Adults"), + ("Paare ohne Kind(er)", "6 und mehr Personen", "Adults"), + ], + # no info about share of kids +} + +MAPPING_ZENSUS_HH_SUBGROUPS = { + 1: ["SR", "SO"], + 2: ["PR", "PO"], + 3: ["SK"], + 4: ["P1", "P2", "P3"], + 5: ["OR", "OO"], +} class HouseholdElectricityProfilesInCensusCells(Base): @@ -117,9 +132,9 @@ def clean(x): pd.Series Re-formatted data row """ - x = str(x).replace('-', str(0)) - x = str(x).replace('.', str(0)) - x = x.strip('()') + x = str(x).replace("-", str(0)) + x = str(x).replace(".", str(0)) + x = x.strip("()") return x @@ -152,7 +167,9 @@ def get_household_demand_profiles_raw(): """ data_config = egon.data.config.datasets()["household_electricity_demand"] - hh_profiles_url = data_config["sources"]["household_electricity_demand_profiles"]["url"] + hh_profiles_url = data_config["sources"][ + "household_electricity_demand_profiles" + ]["url"] hh_profiles_file = Path(".") / Path(hh_profiles_url).name if not hh_profiles_file.is_file(): @@ -161,11 +178,14 @@ def get_household_demand_profiles_raw(): hh_profiles = pd.read_hdf(hh_profiles_file) # set multiindex to HH_types - hh_profiles.columns = pd.MultiIndex.from_arrays([hh_profiles.columns.str[:2], hh_profiles.columns.str[3:]]) + hh_profiles.columns = pd.MultiIndex.from_arrays( + [hh_profiles.columns.str[:2], hh_profiles.columns.str[3:]] + ) # Cast profile ids into int hh_profiles.columns = pd.MultiIndex.from_tuples( - [(a, int(b)) 
for a, b in hh_profiles.columns]) + [(a, int(b)) for a, b in hh_profiles.columns] + ) return hh_profiles @@ -214,8 +234,17 @@ def download_process_zensus_households(): urlretrieve(households_url, households_file) # Read downloaded file from disk - households_raw = pd.read_csv(households_file, sep=';', decimal='.', skiprows=5, skipfooter=7, - index_col=[0, 1], header=[0, 1], encoding='latin1', engine='python') + households_raw = pd.read_csv( + households_file, + sep=";", + decimal=".", + skiprows=5, + skipfooter=7, + index_col=[0, 1], + header=[0, 1], + encoding="latin1", + engine="python", + ) # Clean data households = households_raw.applymap(clean).applymap(int) @@ -253,18 +282,49 @@ def get_hh_dist(df_zensus, hh_types, multi_adjust=True, relative=True): Data still needs to be converted from amount of people to amount of households - """ + """ # adjust multi with/without kids via eurostat share as not clearly derivable without infos about share of kids if multi_adjust: - adjust = {'SR': 1, 'SO': 1, 'SK': 1, 'PR': 1, 'PO': 1, 'P1': 1, 'P2': 1, 'P3': 1, 'OR': 1, - 'OO': 0.703, 'O1': 0.216, 'O2': 0.081, } + adjust = { + "SR": 1, + "SO": 1, + "SK": 1, + "PR": 1, + "PO": 1, + "P1": 1, + "P2": 1, + "P3": 1, + "OR": 1, + "OO": 0.703, + "O1": 0.216, + "O2": 0.081, + } else: - adjust = {'SR': 1, 'SO': 1, 'SK': 1, 'PR': 1, 'PO': 1, 'P1': 1, 'P2': 1, 'P3': 1, 'OR': 1, - 'OO': 1, 'O1': 0, 'O2': 0, } + adjust = { + "SR": 1, + "SO": 1, + "SK": 1, + "PR": 1, + "PO": 1, + "P1": 1, + "P2": 1, + "P3": 1, + "OR": 1, + "OO": 1, + "O1": 0, + "O2": 0, + } df_hh_types = pd.DataFrame( - ({hhtype: adjust[hhtype] * df_zensus.loc[countries, codes].sum() for hhtype, codes in hh_types.items()} - for countries in df_zensus.index), index=df_zensus.index) + ( + { + hhtype: adjust[hhtype] * df_zensus.loc[countries, codes].sum() + for hhtype, codes in hh_types.items() + } + for countries in df_zensus.index + ), + index=df_zensus.index, + ) # drop zero columns df_hh_types = df_hh_types.loc[:, 
(df_hh_types != 0).any(axis=0)] if relative: @@ -273,7 +333,9 @@ def get_hh_dist(df_zensus, hh_types, multi_adjust=True, relative=True): return df_hh_types.T -def inhabitants_to_households(df_people_by_householdtypes_abs, mapping_people_in_households): +def inhabitants_to_households( + df_people_by_householdtypes_abs, mapping_people_in_households +): """ Convert number of inhabitant to number of household types @@ -297,16 +359,20 @@ def inhabitants_to_households(df_people_by_householdtypes_abs, mapping_people_in """ # compare categories and remove form mapping if to many - diff = set(df_people_by_householdtypes_abs.index) ^ set(mapping_people_in_households.keys()) + diff = set(df_people_by_householdtypes_abs.index) ^ set( + mapping_people_in_households.keys() + ) if bool(diff): for key in diff: mapping_people_in_households = dict(mapping_people_in_households) del mapping_people_in_households[key] - print(f'Removed {diff} from mapping!') + print(f"Removed {diff} from mapping!") # divide amount of people by people in household types - df_households_by_type = df_people_by_householdtypes_abs.div(mapping_people_in_households, axis=0) + df_households_by_type = df_people_by_householdtypes_abs.div( + mapping_people_in_households, axis=0 + ) # Number of people gets adjusted to integer values by ceiling # This introduces a small deviation df_households_by_type = df_households_by_type.apply(np.ceil) @@ -331,28 +397,49 @@ def process_nuts1_zensus_data(df_zensus): """ # Group data to fit Load Profile Generator categories # define kids/adults/seniors - kids = ['Unter 3', '3 - 5', '6 - 14'] # < 15 - adults = ['15 - 17', '18 - 24', '25 - 29', '30 - 39', '40 - 49', '50 - 64'] # 15 < x <65 - seniors = ['65 - 74', '75 und älter'] # >65 + kids = ["Unter 3", "3 - 5", "6 - 14"] # < 15 + adults = [ + "15 - 17", + "18 - 24", + "25 - 29", + "30 - 39", + "40 - 49", + "50 - 64", + ] # 15 < x <65 + seniors = ["65 - 74", "75 und älter"] # >65 # sum groups of kids, adults and seniors and concat 
- df_kids = df_zensus.loc[:, (slice(None), kids)].groupby(level=0, axis=1).sum() - df_adults = df_zensus.loc[:, (slice(None), adults)].groupby(level=0, axis=1).sum() - df_seniors = df_zensus.loc[:, (slice(None), seniors)].groupby(level=0, axis=1).sum() - df_zensus = pd.concat([df_kids, df_adults, df_seniors], axis=1, keys=['Kids', 'Adults', 'Seniors'], - names=['age', 'persons']) + df_kids = ( + df_zensus.loc[:, (slice(None), kids)].groupby(level=0, axis=1).sum() + ) + df_adults = ( + df_zensus.loc[:, (slice(None), adults)].groupby(level=0, axis=1).sum() + ) + df_seniors = ( + df_zensus.loc[:, (slice(None), seniors)].groupby(level=0, axis=1).sum() + ) + df_zensus = pd.concat( + [df_kids, df_adults, df_seniors], + axis=1, + keys=["Kids", "Adults", "Seniors"], + names=["age", "persons"], + ) # reduce column names to state only - mapping_state = {i: i.split()[1] for i in df_zensus.index.get_level_values(level=0)} + mapping_state = { + i: i.split()[1] for i in df_zensus.index.get_level_values(level=0) + } # rename index df_zensus = df_zensus.rename(index=mapping_state, level=0) # rename axis - df_zensus = df_zensus.rename_axis(['state', 'type']) + df_zensus = df_zensus.rename_axis(["state", "type"]) # unstack df_zensus = df_zensus.unstack() # reorder levels - df_zensus = df_zensus.reorder_levels(order=['type', 'persons', 'age'], axis=1) + df_zensus = df_zensus.reorder_levels( + order=["type", "persons", "age"], axis=1 + ) return df_zensus @@ -380,12 +467,17 @@ def get_cell_demand_profile_ids(df_cell, pool_size): # np.random.default_rng().integers(low=0, high=pool_size[hh_type], size=sq) instead of random.sample # use random.choice() if with replacement # list of sample ids per hh_type in cell - cell_profile_ids = [(hh_type, random.sample(range(pool_size[hh_type]), k=sq)) \ - for hh_type, sq in zip(df_cell['hh_type'], - df_cell['hh_10types'].astype(int))] + cell_profile_ids = [ + (hh_type, random.sample(range(pool_size[hh_type]), k=sq)) + for hh_type, sq in zip( + 
df_cell["hh_type"], df_cell["hh_10types"].astype(int) + ) + ] # format to lists of tuples (hh_type, id) - cell_profile_ids = [list(zip(cycle([hh_type]), ids)) for hh_type, ids in cell_profile_ids] + cell_profile_ids = [ + list(zip(cycle([hh_type]), ids)) for hh_type, ids in cell_profile_ids + ] # reduce to list cell_profile_ids = [a for b in cell_profile_ids for a in b] @@ -421,31 +513,49 @@ def get_cell_demand_metadata(df_zensus_cells, df_profiles): cell. """ - df_cell_demand_metadata = pd.DataFrame(index=df_zensus_cells.grid_id.unique(), - columns=['cell_profile_ids', 'cell_id', 'nuts3', 'nuts1', 'factor_2035', - 'factor_2050', ]) + df_cell_demand_metadata = pd.DataFrame( + index=df_zensus_cells.grid_id.unique(), + columns=[ + "cell_profile_ids", + "cell_id", + "nuts3", + "nuts1", + "factor_2035", + "factor_2050", + ], + ) # 'peak_loads_hh', 'peak_load_cell', - df_cell_demand_metadata = df_cell_demand_metadata.rename_axis('grid_id') + df_cell_demand_metadata = df_cell_demand_metadata.rename_axis("grid_id") pool_size = df_profiles.groupby(level=0, axis=1).size() - for grid_id, df_cell in df_zensus_cells.groupby(by='grid_id'): + for grid_id, df_cell in df_zensus_cells.groupby(by="grid_id"): # FIXME # ! runden der Haushaltszahlen auf int -> zu einfach! # ! kein zurücklegen innerhalb einer Zelle ?! -> das is ok. 
# cell_profile_ids = get_cell_demand_profile_ids(df_cell, pool_size, df_profiles) cell_profile_ids = get_cell_demand_profile_ids(df_cell, pool_size) - df_cell_demand_metadata.at[grid_id, 'cell_id'] = df_cell.loc[:, 'cell_id'].unique()[0] - df_cell_demand_metadata.at[grid_id, 'cell_profile_ids'] = cell_profile_ids - df_cell_demand_metadata.at[grid_id, 'nuts3'] = df_cell.loc[:, 'nuts3'].unique()[0] - df_cell_demand_metadata.at[grid_id, 'nuts1'] = df_cell.loc[:, 'nuts1'].unique()[0] + df_cell_demand_metadata.at[grid_id, "cell_id"] = df_cell.loc[ + :, "cell_id" + ].unique()[0] + df_cell_demand_metadata.at[ + grid_id, "cell_profile_ids" + ] = cell_profile_ids + df_cell_demand_metadata.at[grid_id, "nuts3"] = df_cell.loc[ + :, "nuts3" + ].unique()[0] + df_cell_demand_metadata.at[grid_id, "nuts1"] = df_cell.loc[ + :, "nuts1" + ].unique()[0] return df_cell_demand_metadata # can be parallelized with grouping df_zensus_cells by grid_id/nuts3/nuts1 -def adjust_to_demand_regio_nuts3_annual(df_cell_demand_metadata, df_profiles, df_demand_regio): +def adjust_to_demand_regio_nuts3_annual( + df_cell_demand_metadata, df_profiles, df_demand_regio +): """ Computes the profile scaling factor for alignment to demand regio data @@ -473,29 +583,37 @@ def adjust_to_demand_regio_nuts3_annual(df_cell_demand_metadata, df_profiles, df Returns the same data as :func:`get_cell_demand_metadata`, but with filled columns `factor_2035` and `factor_2050`. 
""" - for nuts3_id, df_nuts3 in df_cell_demand_metadata.groupby(by='nuts3'): + for nuts3_id, df_nuts3 in df_cell_demand_metadata.groupby(by="nuts3"): nuts3_cell_ids = df_nuts3.index - nuts3_profile_ids = df_nuts3.loc[:, 'cell_profile_ids'].sum() + nuts3_profile_ids = df_nuts3.loc[:, "cell_profile_ids"].sum() # take all profiles of one nuts3, aggregate and sum # profiles in Wh - nuts3_profiles_sum_annual = df_profiles.loc[:, nuts3_profile_ids].sum().sum() + nuts3_profiles_sum_annual = ( + df_profiles.loc[:, nuts3_profile_ids].sum().sum() + ) # Scaling Factor # ############## # demand regio in MWh # profiles in Wh - df_cell_demand_metadata.loc[nuts3_cell_ids, 'factor_2035'] = df_demand_regio.loc[ - (2035, nuts3_id), 'demand_mwha'] * 1e3 / ( - nuts3_profiles_sum_annual / 1e3) - df_cell_demand_metadata.loc[nuts3_cell_ids, 'factor_2050'] = df_demand_regio.loc[ - (2050, nuts3_id), 'demand_mwha'] * 1e3 / ( - nuts3_profiles_sum_annual / 1e3) + df_cell_demand_metadata.loc[nuts3_cell_ids, "factor_2035"] = ( + df_demand_regio.loc[(2035, nuts3_id), "demand_mwha"] + * 1e3 + / (nuts3_profiles_sum_annual / 1e3) + ) + df_cell_demand_metadata.loc[nuts3_cell_ids, "factor_2050"] = ( + df_demand_regio.loc[(2050, nuts3_id), "demand_mwha"] + * 1e3 + / (nuts3_profiles_sum_annual / 1e3) + ) return df_cell_demand_metadata -def get_load_timeseries(df_profiles, df_cell_demand_metadata, cell_ids, year, peak_load_only=False): +def get_load_timeseries( + df_profiles, df_cell_demand_metadata, cell_ids, year, peak_load_only=False +): """ Get peak load for one load area @@ -531,10 +649,20 @@ def get_load_timeseries(df_profiles, df_cell_demand_metadata, cell_ids, year, pe series. 
""" timesteps = len(df_profiles) - full_load = pd.Series(data=np.zeros(timesteps), dtype=np.float64, index=range(timesteps)) - load_area_meta = df_cell_demand_metadata.loc[cell_ids, ['cell_profile_ids', 'nuts3', f'factor_{year}']] - for (nuts3, factor), df in load_area_meta.groupby(by=['nuts3', f'factor_{year}']): - part_load = df_profiles.loc[:, df['cell_profile_ids'].sum()].sum(axis=1) * factor / 1e3 # profiles in Wh + full_load = pd.Series( + data=np.zeros(timesteps), dtype=np.float64, index=range(timesteps) + ) + load_area_meta = df_cell_demand_metadata.loc[ + cell_ids, ["cell_profile_ids", "nuts3", f"factor_{year}"] + ] + for (nuts3, factor), df in load_area_meta.groupby( + by=["nuts3", f"factor_{year}"] + ): + part_load = ( + df_profiles.loc[:, df["cell_profile_ids"].sum()].sum(axis=1) + * factor + / 1e3 + ) # profiles in Wh full_load = full_load.add(part_load) if peak_load_only: return full_load.max() @@ -556,125 +684,159 @@ def houseprofiles_in_census_cells(): df_zensus = download_process_zensus_households() # hh_tools.get_hh_dist without eurostat adjustment for O1-03 Groups in absolute values - df_hh_types_nad_abs = get_hh_dist(df_zensus, HH_TYPES, - multi_adjust=False, - relative=False) + df_hh_types_nad_abs = get_hh_dist( + df_zensus, HH_TYPES, multi_adjust=False, relative=False + ) # Get household size for each census cell grouped by # As this is only used to estimate size of households for OR, OO, 1 P and 2 P households are dropped - df_hh_size = db.select_dataframe(sql=""" + df_hh_size = db.select_dataframe( + sql=""" SELECT characteristics_text, SUM(quantity) as summe FROM society.egon_destatis_zensus_household_per_ha as egon_d WHERE attribute = 'HHGROESS_KLASS' GROUP BY characteristics_text """, - index_col='characteristics_text') - df_hh_size = df_hh_size.drop(index=['1 Person', '2 Personen']) + index_col="characteristics_text", + ) + df_hh_size = df_hh_size.drop(index=["1 Person", "2 Personen"]) # Define/ estimate number of persons (w/o kids) for 
each household category # For categories S* and P* it's clear; for multi-person households (OO,OR) # the number is estimated as average by taking remaining persons - OO_factor = sum(df_hh_size['summe'] * [3, 4, 5, 6]) / df_hh_size[ - 'summe'].sum() - mapping_people_in_households = {'SR': 1, - 'SO': 1, - 'SK': 1, # kids are excluded - 'PR': 2, - 'PO': 2, - 'P1': 2, # kids are excluded - 'P2': 2, # "" - 'P3': 2, # "" - 'OR': OO_factor, - 'OO': OO_factor, - } + OO_factor = ( + sum(df_hh_size["summe"] * [3, 4, 5, 6]) / df_hh_size["summe"].sum() + ) + mapping_people_in_households = { + "SR": 1, + "SO": 1, + "SK": 1, # kids are excluded + "PR": 2, + "PO": 2, + "P1": 2, # kids are excluded + "P2": 2, # "" + "P3": 2, # "" + "OR": OO_factor, + "OO": OO_factor, + } # Determine number of persons for each household category and per federal state df_dist_households = inhabitants_to_households( - df_hh_types_nad_abs, mapping_people_in_households) + df_hh_types_nad_abs, mapping_people_in_households + ) # Retrieve information about households for each census cell - df_households_typ = db.select_dataframe(sql=""" + df_households_typ = db.select_dataframe( + sql=""" SELECT grid_id, attribute, characteristics_code, characteristics_text, quantity FROM society.egon_destatis_zensus_household_per_ha - WHERE attribute = 'HHTYP_FAM' """) + WHERE attribute = 'HHTYP_FAM' """ + ) df_households_typ = df_households_typ.drop( - columns=['attribute', 'characteristics_text']) + columns=["attribute", "characteristics_text"] + ) df_households_typ = df_households_typ.rename( - columns={'quantity': 'hh_5types'}) + columns={"quantity": "hh_5types"} + ) # Calculate fraction of persons within subgroup for value in MAPPING_ZENSUS_HH_SUBGROUPS.values(): df_dist_households.loc[value] = df_dist_households.loc[value].div( - df_dist_households.loc[value].sum()) + df_dist_households.loc[value].sum() + ) # Census cells with nuts3 and nuts1 information - df_grid_id = db.select_dataframe(sql=""" + df_grid_id = 
db.select_dataframe( + sql=""" SELECT pop.grid_id, pop.gid as cell_id, vg250.vg250_nuts3 as nuts3, lan.nuts as nuts1, lan.gen FROM society.destatis_zensus_population_per_ha_inside_germany as pop LEFT JOIN boundaries.egon_map_zensus_vg250 as vg250 ON (pop.gid=vg250.zensus_population_id) LEFT JOIN boundaries.vg250_lan as lan - ON (LEFT(vg250.vg250_nuts3, 3)=lan.nuts) """) + ON (LEFT(vg250.vg250_nuts3, 3)=lan.nuts) """ + ) df_grid_id = df_grid_id.drop_duplicates() df_grid_id = df_grid_id.reset_index(drop=True) # Merge household type and size data with considered (populated) census cells # how='inner' is used as ids of unpopulated areas are removed df_grid_id or earliers tables. see here: # https://github.com/openego/eGon-data/blob/59195926e41c8bd6d1ca8426957b97f33ef27bcc/src/egon/data/importing/zensus/__init__.py#L418-L449 - df_households_typ = pd.merge(df_households_typ, df_grid_id, - left_on='grid_id', right_on='grid_id', - how='inner') + df_households_typ = pd.merge( + df_households_typ, + df_grid_id, + left_on="grid_id", + right_on="grid_id", + how="inner", + ) # Merge Zensus nuts level household data with zensus cell level by dividing hh-groups with mapping_zensus_hh_subgroups df_zensus_cells = pd.DataFrame() for (country, code), df_country_type in df_households_typ.groupby( - ['gen', 'characteristics_code']): + ["gen", "characteristics_code"] + ): # iterate over zenus_country subgroups for typ in MAPPING_ZENSUS_HH_SUBGROUPS[code]: - df_country_type['hh_type'] = typ - df_country_type['factor'] = df_dist_households.loc[typ, country] - df_country_type['hh_10types'] = df_country_type['hh_5types'] * \ - df_dist_households.loc[ - typ, country] - df_zensus_cells = df_zensus_cells.append(df_country_type, - ignore_index=True) + df_country_type["hh_type"] = typ + df_country_type["factor"] = df_dist_households.loc[typ, country] + df_country_type["hh_10types"] = ( + df_country_type["hh_5types"] + * df_dist_households.loc[typ, country] + ) + df_zensus_cells = 
df_zensus_cells.append( + df_country_type, ignore_index=True + ) df_zensus_cells = df_zensus_cells.sort_values( - by=['grid_id', 'characteristics_code']).reset_index(drop=True) + by=["grid_id", "characteristics_code"] + ).reset_index(drop=True) # Annual household electricity demand on NUTS-3 level (demand regio) - df_demand_regio = db.select_dataframe(sql=""" + df_demand_regio = db.select_dataframe( + sql=""" SELECT year, nuts3, SUM (demand) as demand_mWha FROM demand.egon_demandregio_hh as egon_d GROUP BY nuts3, year - ORDER BY year""", index_col=['year', 'nuts3']) + ORDER BY year""", + index_col=["year", "nuts3"], + ) # Finally create table that stores profile ids for each cell - df_cell_demand_metadata = get_cell_demand_metadata(df_zensus_cells, - df_profiles) + df_cell_demand_metadata = get_cell_demand_metadata( + df_zensus_cells, df_profiles + ) df_cell_demand_metadata = adjust_to_demand_regio_nuts3_annual( - df_cell_demand_metadata, df_profiles, df_demand_regio) + df_cell_demand_metadata, df_profiles, df_demand_regio + ) df_cell_demand_metadata = df_cell_demand_metadata.reset_index(drop=False) # Insert data into respective database table engine = db.engine() - HouseholdElectricityProfilesInCensusCells.__table__.drop(bind=engine, - checkfirst=True) - HouseholdElectricityProfilesInCensusCells.__table__.create(bind=engine, - checkfirst=True) - df_cell_demand_metadata["cell_id"] = df_cell_demand_metadata["cell_id"].astype(int) + HouseholdElectricityProfilesInCensusCells.__table__.drop( + bind=engine, checkfirst=True + ) + HouseholdElectricityProfilesInCensusCells.__table__.create( + bind=engine, checkfirst=True + ) + df_cell_demand_metadata["cell_id"] = df_cell_demand_metadata[ + "cell_id" + ].astype(int) with db.session_scope() as session: - session.bulk_insert_mappings(HouseholdElectricityProfilesInCensusCells, - df_cell_demand_metadata.to_dict(orient="records")) + session.bulk_insert_mappings( + HouseholdElectricityProfilesInCensusCells, + 
df_cell_demand_metadata.to_dict(orient="records"), + ) def get_houseprofiles_in_census_cells(): with db.session_scope() as session: q = session.query(HouseholdElectricityProfilesInCensusCells) - census_profile_mapping = pd.read_sql(q.statement, q.session.bind, index_col="cell_id") + census_profile_mapping = pd.read_sql( + q.statement, q.session.bind, index_col="cell_id" + ) - census_profile_mapping["cell_profile_ids"] = census_profile_mapping["cell_profile_ids"].apply( - lambda x: [(cat, int(profile_id)) for cat, profile_id in x]) + census_profile_mapping["cell_profile_ids"] = census_profile_mapping[ + "cell_profile_ids" + ].apply(lambda x: [(cat, int(profile_id)) for cat, profile_id in x]) return census_profile_mapping @@ -683,26 +845,33 @@ def mv_grid_district_HH_electricity_load(): with db.session_scope() as session: cells_query = session.query( - HouseholdElectricityProfilesInCensusCells, MapZensusGridDistricts.subst_id).join( + HouseholdElectricityProfilesInCensusCells, + MapZensusGridDistricts.subst_id, + ).join( MapZensusGridDistricts, - HouseholdElectricityProfilesInCensusCells.cell_id == MapZensusGridDistricts.zensus_population_id + HouseholdElectricityProfilesInCensusCells.cell_id + == MapZensusGridDistricts.zensus_population_id, ) - cells = pd.read_sql(cells_query.statement, cells_query.session.bind, index_col="cell_id") - cells["cell_profile_ids"] = cells[ - "cell_profile_ids"].apply( - lambda x: [(cat, int(profile_id)) for cat, profile_id in x]) + cells = pd.read_sql( + cells_query.statement, cells_query.session.bind, index_col="cell_id" + ) + cells["cell_profile_ids"] = cells["cell_profile_ids"].apply( + lambda x: [(cat, int(profile_id)) for cat, profile_id in x] + ) # Create aggregated load profile for each MV grid district df_profiles = get_household_demand_profiles_raw() mvgd_profiles_list = [] for grid_district, data in cells.groupby("subst_id"): - mvgd_profile = get_load_timeseries(df_profiles=df_profiles, - df_cell_demand_metadata=data, - 
cell_ids=data.index, - year=2035, - peak_load_only=False) + mvgd_profile = get_load_timeseries( + df_profiles=df_profiles, + df_cell_demand_metadata=data, + cell_ids=data.index, + year=2035, + peak_load_only=False, + ) mvgd_profile.name = grid_district mvgd_profiles_list.append(mvgd_profile) @@ -713,32 +882,52 @@ def mv_grid_district_HH_electricity_load(): def mv_grid_district_HH_electricity_load_check(): with db.session_scope() as session: - cells_w_geom_query = session.query( - HouseholdElectricityProfilesInCensusCells, - MapZensusGridDistricts.subst_id, - DestatisZensusPopulationPerHaInsideGermany.population, - DestatisZensusPopulationPerHaInsideGermany.geom - ).join( - MapZensusGridDistricts, - HouseholdElectricityProfilesInCensusCells.cell_id == MapZensusGridDistricts.zensus_population_id - ).join(DestatisZensusPopulationPerHaInsideGermany, - HouseholdElectricityProfilesInCensusCells.cell_id == DestatisZensusPopulationPerHaInsideGermany.gid) + cells_w_geom_query = ( + session.query( + HouseholdElectricityProfilesInCensusCells, + MapZensusGridDistricts.subst_id, + DestatisZensusPopulationPerHaInsideGermany.population, + DestatisZensusPopulationPerHaInsideGermany.geom, + ) + .join( + MapZensusGridDistricts, + HouseholdElectricityProfilesInCensusCells.cell_id + == MapZensusGridDistricts.zensus_population_id, + ) + .join( + DestatisZensusPopulationPerHaInsideGermany, + HouseholdElectricityProfilesInCensusCells.cell_id + == DestatisZensusPopulationPerHaInsideGermany.gid, + ) + ) # Used for visual check of data import geopandas as gpd - cells_w_geom = gpd.read_postgis(cells_w_geom_query.statement, cells_w_geom_query.session.bind, index_col="cell_id") + + cells_w_geom = gpd.read_postgis( + cells_w_geom_query.statement, + cells_w_geom_query.session.bind, + index_col="cell_id", + ) cells_w_geom.to_postgis( "household_electricity_profiles_in_census_cells_with_geom", schema="demand", con=db.engine(), - if_exists="replace") + if_exists="replace", + ) class 
EgonDestatisZensusHouseholdPerHa(Base): - __tablename__ = 'egon_destatis_zensus_household_per_ha' - __table_args__ = {'schema': 'society'} - - id = Column(Integer, primary_key=True, server_default=text("nextval('society.egon_destatis_zensus_household_per_ha_id_seq'::regclass)")) + __tablename__ = "egon_destatis_zensus_household_per_ha" + __table_args__ = {"schema": "society"} + + id = Column( + Integer, + primary_key=True, + server_default=text( + "nextval('society.egon_destatis_zensus_household_per_ha_id_seq'::regclass)" + ), + ) grid_id = Column(String(50)) grid_id_new = Column(String(50)) attribute = Column(String(50)) @@ -746,7 +935,9 @@ class EgonDestatisZensusHouseholdPerHa(Base): characteristics_text = Column(String) quantity = Column(Integer) quantity_q = Column(Integer) - zensus_population_id = Column(ForeignKey('society.destatis_zensus_population_per_ha.id')) + zensus_population_id = Column( + ForeignKey("society.destatis_zensus_population_per_ha.id") + ) def zensus_household_with_geom_check(): @@ -755,24 +946,34 @@ def zensus_household_with_geom_check(): with db.session_scope() as session: # Household x Zensus population x Zensus grid district mapping - cells_w_geom_query = session.query( - EgonDestatisZensusHouseholdPerHa, - MapZensusGridDistricts.subst_id, - DestatisZensusPopulationPerHaInsideGermany.population, - DestatisZensusPopulationPerHaInsideGermany.geom - ).join( - MapZensusGridDistricts, - EgonDestatisZensusHouseholdPerHa.zensus_population_id == MapZensusGridDistricts.zensus_population_id - ).join(DestatisZensusPopulationPerHaInsideGermany, - EgonDestatisZensusHouseholdPerHa.zensus_population_id == DestatisZensusPopulationPerHaInsideGermany.gid - ) - cells_w_geom = gpd.read_postgis(cells_w_geom_query.statement, - cells_w_geom_query.session.bind, - index_col="zensus_population_id") + cells_w_geom_query = ( + session.query( + EgonDestatisZensusHouseholdPerHa, + MapZensusGridDistricts.subst_id, + 
DestatisZensusPopulationPerHaInsideGermany.population, + DestatisZensusPopulationPerHaInsideGermany.geom, + ) + .join( + MapZensusGridDistricts, + EgonDestatisZensusHouseholdPerHa.zensus_population_id + == MapZensusGridDistricts.zensus_population_id, + ) + .join( + DestatisZensusPopulationPerHaInsideGermany, + EgonDestatisZensusHouseholdPerHa.zensus_population_id + == DestatisZensusPopulationPerHaInsideGermany.gid, + ) + ) + cells_w_geom = gpd.read_postgis( + cells_w_geom_query.statement, + cells_w_geom_query.session.bind, + index_col="zensus_population_id", + ) # Used for visual check of data cells_w_geom.to_postgis( "egon_destatis_zensus_household_per_ha_with_geom", schema="society", con=db.engine(), - if_exists="replace") + if_exists="replace", + ) From 30256fce107f16fbde3664291f1c06711264b4b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guido=20Ple=C3=9Fmann?= Date: Thu, 10 Jun 2021 18:10:37 +0200 Subject: [PATCH 28/97] Add draft of module docstring used as dataset description --- .../hh_demand/hh_demand_profiles_tools.py | 38 +++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py b/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py index 2a8697391..9ea311579 100644 --- a/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py +++ b/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py @@ -1,3 +1,41 @@ +""" +Household electricity demand time series for scenarios in 2035 and 2050 + +Electricity demand data for households in Germany in 1-hourly resolution for +an entire year. Spatially, the data is resolved to 100 x 100 m cells and +provides individual and distinct time series for each household in a cell. + +The resulting data is stored in two separate tables + +* `demand.household_electricity_profiles_in_census_cells`: + Lists references to time series data for each household in a cell by + identifiers. 
This table is fundamental for creating subsequent data like + demand profiles on MV grid level or for determining the peak load at load + area level. +* `demand.TABLE_NEEDS_TO_BE_CREATED`: + Household electricity demand profiles aggregated at MV grid district level. + Primarily used to create the eTraGo data model. + +Data is created ... +# TODO: describe how census data and IEE profiles are used to determine individual HH demand profiles for each cell +- IEE electricity demand time series as basis +- Spatial information about households by zensus +- How are these datasets mapped? +- What are central assumptions during the data processing? +- Drawbacks and limitations of the data + +The table `demand.household_electricity_profiles_in_census_cells` is created +by :func:`houseprofiles_in_census_cells`. +# TODO: reference the function that create the table with HH profiles for each MV grid + +Notes +----- + +This module docstring is rather a dataset documentation. Once, a decision +is made in ... the content of this module docstring needs to be moved to +docs attribute of the respective dataset class. 
+""" + from itertools import cycle from pathlib import Path from urllib.request import urlretrieve From f5f3c8d0f36c4636fb4e3e44933e7eb06b19ceaa Mon Sep 17 00:00:00 2001 From: "Julian.Endres" Date: Fri, 11 Jun 2021 10:33:34 +0200 Subject: [PATCH 29/97] change household rounding at cell-level to np.rint --- src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py b/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py index 9ea311579..6f2f44c48 100644 --- a/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py +++ b/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py @@ -508,7 +508,7 @@ def get_cell_demand_profile_ids(df_cell, pool_size): cell_profile_ids = [ (hh_type, random.sample(range(pool_size[hh_type]), k=sq)) for hh_type, sq in zip( - df_cell["hh_type"], df_cell["hh_10types"].astype(int) + df_cell["hh_type"], df_cell["hh_10types"].apply(np.rint) ) ] From 739998e85825035933a70e5479deaa83e8bc70eb Mon Sep 17 00:00:00 2001 From: "Julian.Endres" Date: Fri, 11 Jun 2021 11:13:21 +0200 Subject: [PATCH 30/97] fix np.rint --- src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py b/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py index 6f2f44c48..8516b88ad 100644 --- a/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py +++ b/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py @@ -508,7 +508,7 @@ def get_cell_demand_profile_ids(df_cell, pool_size): cell_profile_ids = [ (hh_type, random.sample(range(pool_size[hh_type]), k=sq)) for hh_type, sq in zip( - df_cell["hh_type"], df_cell["hh_10types"].apply(np.rint) + df_cell["hh_type"], np.rint(df_cell["hh_10types"].values).astype(int) ) ] From e55169c8d9995437df4cfd680c9898d846db0b42 Mon Sep 17 
00:00:00 2001 From: "Julian.Endres" Date: Fri, 11 Jun 2021 11:13:21 +0200 Subject: [PATCH 31/97] fix np.rint --- src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py b/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py index 6f2f44c48..8516b88ad 100644 --- a/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py +++ b/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py @@ -508,7 +508,7 @@ def get_cell_demand_profile_ids(df_cell, pool_size): cell_profile_ids = [ (hh_type, random.sample(range(pool_size[hh_type]), k=sq)) for hh_type, sq in zip( - df_cell["hh_type"], df_cell["hh_10types"].apply(np.rint) + df_cell["hh_type"], np.rint(df_cell["hh_10types"].values).astype(int) ) ] From 17d1be81ee3dc12c891c8b7471b46667681a88ad Mon Sep 17 00:00:00 2001 From: "Julian.Endres" Date: Fri, 11 Jun 2021 11:43:28 +0200 Subject: [PATCH 32/97] add info about household-rounding to docstring of get_cell_demand_metadata --- .../hh_demand/hh_demand_profiles_tools.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py b/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py index 8516b88ad..c88752623 100644 --- a/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py +++ b/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py @@ -528,7 +528,9 @@ def get_cell_demand_metadata(df_zensus_cells, df_profiles): Defines information about profiles for each zensus cell A table including the demand profile ids for each cell is created by using - :func:`get_cell_demand_profile_ids`. + :func:`get_cell_demand_profile_ids`. Household profiles are randomly sampled for each cell. The profiles + are not replaced to the pool within a cell but after. The number of households are rounded to the nearest integer + if float. 
This results in a small deviation for the course of the aggregated profiles. Parameters ---------- @@ -568,10 +570,11 @@ def get_cell_demand_metadata(df_zensus_cells, df_profiles): pool_size = df_profiles.groupby(level=0, axis=1).size() for grid_id, df_cell in df_zensus_cells.groupby(by="grid_id"): - # FIXME - # ! runden der Haushaltszahlen auf int -> zu einfach! - # ! kein zurücklegen innerhalb einer Zelle ?! -> das is ok. - # cell_profile_ids = get_cell_demand_profile_ids(df_cell, pool_size, df_profiles) + + # random sampling of household profiles for each cell + # without replacement within cell but after + # number of households are rounded to the nearest integer if float + # this results in a small deviation for the course of the aggregated profiles cell_profile_ids = get_cell_demand_profile_ids(df_cell, pool_size) df_cell_demand_metadata.at[grid_id, "cell_id"] = df_cell.loc[ From deeb9eb2eae5911d93a90fb0369f7f75a6811ef6 Mon Sep 17 00:00:00 2001 From: "Julian.Endres" Date: Fri, 11 Jun 2021 11:48:09 +0200 Subject: [PATCH 33/97] add info about household-rounding to docstring of get_cell_demand_profile_ids --- src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py b/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py index c88752623..cb06df79e 100644 --- a/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py +++ b/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py @@ -486,7 +486,8 @@ def get_cell_demand_profile_ids(df_cell, pool_size): """ Generates tuple of hh_type and zensus cell ids - Takes a random sample (without replacement) of profile ids for given cell + Takes a random sample (without replacement) of profile ids for given cell. + The number of households are rounded to the nearest integer if float. 
Parameters ---------- From 4234188f3879e43a45a412f8c274998195cfe35f Mon Sep 17 00:00:00 2001 From: "Julian.Endres" Date: Fri, 11 Jun 2021 11:58:31 +0200 Subject: [PATCH 34/97] extend docstring of process_nuts1_zensus_data() --- .../data/processing/hh_demand/hh_demand_profiles_tools.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py b/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py index cb06df79e..c4d962fc4 100644 --- a/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py +++ b/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py @@ -8,7 +8,7 @@ The resulting data is stored in two separate tables * `demand.household_electricity_profiles_in_census_cells`: - Lists references to time series data for each household in a cell by + Lists references and scaling parameters to time series data for each household in a cell by identifiers. This table is fundamental for creating subsequent data like demand profiles on MV grid level or for determining the peak load at load area level. @@ -421,8 +421,8 @@ def inhabitants_to_households( def process_nuts1_zensus_data(df_zensus): """Make data compatible with household demand profile categories - Groups, removes and reorders categories which are not needed for - demand-profile-generator (DPG) + Groups, removes and reorders categories which are not needed to fit data to household types of + IEE electricity demand time series generated by demand-profile-generator (DPG). 
* Kids (<15) are excluded as they are also excluded in DPG origin dataset * Adults (15<65) From 47d753ab1f6c282d6b8312cf662da9969360f994 Mon Sep 17 00:00:00 2001 From: "Julian.Endres" Date: Fri, 11 Jun 2021 15:10:35 +0200 Subject: [PATCH 35/97] add dataset description in module-docstring --- .../hh_demand/hh_demand_profiles_tools.py | 71 +++++++++++++++---- 1 file changed, 56 insertions(+), 15 deletions(-) diff --git a/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py b/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py index c4d962fc4..451d6e510 100644 --- a/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py +++ b/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py @@ -18,11 +18,51 @@ Data is created ... # TODO: describe how census data and IEE profiles are used to determine individual HH demand profiles for each cell -- IEE electricity demand time series as basis -- Spatial information about households by zensus -- How are these datasets mapped? -- What are central assumptions during the data processing? -- Drawbacks and limitations of the data +The following datasets are used: +* IEE electricity demand time series produced by demand-profile-generator (DPG) as basis +* Spatial information about people living in households by zensus (2011) at federal state level + * type of household (family status) + * age + * size +* Spatial information about number of households per ha + * type of household (family status, 5 types) +* mapping of 100 x 100 m cells to NUTS3 and NUTS1 +* Demand-Regio annual household demand at NUTS3 level + +What is the goal? +To use the IEE electricity demand time series at spatial resolution of 100 x 100 m cells. + +What is the challenge? +The IEE electricity demand time series produced by demand-profile-generator (DPG) offer 12 different +household profile types. 
To use most of them, the spatial information about the number of households per ha (5 types) +needs to be enriched by supplementary data to better fit household profile specifications. Hence, 10 out of 12 +different household profile types can be distinguished and used. + +How are these datasets mapped? +* Spatial information about people living in households by zensus (2011) at federal state NUTS1 level +:var:`df_zensus` is aggregated to be compatible to IEE household profile specifications. + * exclude kids and reduce to adults and seniors + * group as defined in :var:`HH_TYPES` + * convert data from people living in households to number of households by :var:`mapping_people_in_households` + * calculate fraction of fine household types (10) within subgroup of rough household types (5) :var:`df_dist_households` +* Spatial information about number of households per ha :var:`df_households_typ` is mapped to NUTS1 and NUTS3 level. +Data is enriched with refined household subgroups via :var:`df_dist_households` in :var:`df_zensus_cells`. +* Enriched 100 x 100 m household dataset is used to sample and aggregate household profiles. A table including +individual profile id's for each cell and scaling factor to match Demand-Regio annual sum projections for 2035 and 2050 +at NUTS3 level is created in the database as `demand.household_electricity_profiles_in_census_cells`. + +What are central assumptions during the data processing? +* the mapping of zensus data to IEE household types is not trivial. In conversion from persons in household to number of +households, number of inhabitants for multi-persons households is estimated as weighted average in :var:`OO_factor` +* the distribution to refine household types at cell level are the same for each federal state +* refining of household types lead to float number of profiles drew at cell level and need to be rounded to nearest int. 
+* 100 x 100 m cells are matched to NUTS via centroid location +* cells with households in unpopulated areas are removed + +Drawbacks and limitations of the data +* the distribution to refine household types at cell level are the same for each federal state +* + The table `demand.household_electricity_profiles_in_census_cells` is created by :func:`houseprofiles_in_census_cells`. @@ -236,7 +276,7 @@ def download_process_zensus_households(): * family type * age class - * gender + * household size for Germany in spatial resolution of federal states. @@ -244,9 +284,9 @@ def download_process_zensus_households(): https://ergebnisse2011.zensus2022.de/datenbank/online For reproducing data selection, please do: - * Search for: "1000A-2029" + * Search for: "1000A-3016" * or choose topic: "Bevölkerung kompakt" - * Choose table code: "1000A-2029" with title "Personen: Alter (11 Altersklassen)/Geschlecht/Größe des + * Choose table code: "1000A-3016" with title "Personen: Alter (11 Altersklassen) - Größe des privaten Haushalts - Typ des privaten Haushalts (nach Familien/Lebensform)" - Change setting "GEOLK1" to "Bundesländer (16)" @@ -765,6 +805,12 @@ def houseprofiles_in_census_cells(): df_hh_types_nad_abs, mapping_people_in_households ) + # Calculate fraction of fine household types within subgroup of rough household types + for value in MAPPING_ZENSUS_HH_SUBGROUPS.values(): + df_dist_households.loc[value] = df_dist_households.loc[value].div( + df_dist_households.loc[value].sum() + ) + # Retrieve information about households for each census cell df_households_typ = db.select_dataframe( sql=""" @@ -779,12 +825,6 @@ def houseprofiles_in_census_cells(): columns={"quantity": "hh_5types"} ) - # Calculate fraction of persons within subgroup - for value in MAPPING_ZENSUS_HH_SUBGROUPS.values(): - df_dist_households.loc[value] = df_dist_households.loc[value].div( - df_dist_households.loc[value].sum() - ) - # Census cells with nuts3 and nuts1 information df_grid_id = 
db.select_dataframe( sql=""" @@ -809,7 +849,8 @@ def houseprofiles_in_census_cells(): how="inner", ) - # Merge Zensus nuts level household data with zensus cell level by dividing hh-groups with mapping_zensus_hh_subgroups + # Merge Zensus nuts1 level household data with zensus cell level 100 x 100 m + # by refining hh-groups with MAPPING_ZENSUS_HH_SUBGROUPS df_zensus_cells = pd.DataFrame() for (country, code), df_country_type in df_households_typ.groupby( ["gen", "characteristics_code"] From 36d1c891c5310214994c7aad26978acdb55dc853 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guido=20Ple=C3=9Fmann?= Date: Tue, 15 Jun 2021 10:48:17 +0200 Subject: [PATCH 36/97] Write HV/MV substation HH load profile to database --- .../hh_demand/hh_demand_profiles_tools.py | 41 ++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py b/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py index 451d6e510..1f32aba40 100644 --- a/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py +++ b/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py @@ -81,7 +81,8 @@ from urllib.request import urlretrieve import random -from sqlalchemy import ARRAY, Column, Float, ForeignKey, Integer, String, text +from sqlalchemy import ARRAY, Column, Float, ForeignKey, Integer, String, \ + text, Sequence from sqlalchemy.ext.declarative import declarative_base import numpy as np import pandas as pd @@ -190,6 +191,21 @@ class HouseholdElectricityProfilesInCensusCells(Base): factor_2050 = Column(Float) +class HouseholdElectricityProfilesHvMvSubstation(Base): + __tablename__ = "household_electricity_profiles_hvmv_substation" + __table_args__ = {"schema": "demand"} + + index = Column( + Integer, + Sequence(f"{__tablename__}_id_seq", + schema=f"{ __table_args__['schema']}"), + primary_key=True, + ) + subst_id = Column(Integer) + timestep = Column(Integer) + household_electricity_load = Column(Float) + + 
def clean(x): """Clean zensus household data row-wise @@ -960,6 +976,29 @@ def mv_grid_district_HH_electricity_load(): mvgd_profiles = pd.concat(mvgd_profiles_list, axis=1) + # Add timestep index + mvgd_profiles["timestep"] = mvgd_profiles.index + 1 + + # Reshape data: put MV grid ids in columns to a single index column + mvgd_profiles = mvgd_profiles.set_index("timestep").stack() + mvgd_profiles.name = "household_electricity_load" + mvgd_profiles.index.names = ["timestep", "subst_id"] + mvgd_profiles = mvgd_profiles.reset_index() + + # Insert data into respective database table + engine = db.engine() + HouseholdElectricityProfilesHvMvSubstation.__table__.drop( + bind=engine, checkfirst=True + ) + HouseholdElectricityProfilesHvMvSubstation.__table__.create( + bind=engine, checkfirst=True + ) + + with db.session_scope() as session: + session.bulk_insert_mappings( + HouseholdElectricityProfilesHvMvSubstation, + mvgd_profiles.to_dict(orient="records"), + ) return mvgd_profiles From 8df877ec4d0200d1b58688b5ac9ddd4759225c71 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guido=20Ple=C3=9Fmann?= Date: Tue, 15 Jun 2021 10:49:59 +0200 Subject: [PATCH 37/97] Revise module docstring for documenting the dataset --- .../hh_demand/hh_demand_profiles_tools.py | 76 +++++++++++-------- 1 file changed, 45 insertions(+), 31 deletions(-) diff --git a/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py b/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py index 1f32aba40..56434fb1a 100644 --- a/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py +++ b/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py @@ -4,6 +4,7 @@ Electricity demand data for households in Germany in 1-hourly resolution for an entire year. Spatially, the data is resolved to 100 x 100 m cells and provides individual and distinct time series for each household in a cell. +The cells are defined by the dataset Zensus 2011. 
The resulting data is stored in two separate tables @@ -11,62 +12,74 @@ Lists references and scaling parameters to time series data for each household in a cell by identifiers. This table is fundamental for creating subsequent data like demand profiles on MV grid level or for determining the peak load at load - area level. -* `demand.TABLE_NEEDS_TO_BE_CREATED`: + area level. The table is created by :func:`houseprofiles_in_census_cells`. +* `demand.household_electricity_profiles_hvmv_substation`: Household electricity demand profiles aggregated at MV grid district level. Primarily used to create the eTraGo data model. + The table is created with :func:`mv_grid_district_HH_electricity_load`. -Data is created ... -# TODO: describe how census data and IEE profiles are used to determine individual HH demand profiles for each cell -The following datasets are used: -* IEE electricity demand time series produced by demand-profile-generator (DPG) as basis -* Spatial information about people living in households by zensus (2011) at federal state level +The following datasets are used for creating the data: + +* Electricity demand time series for household categories + produced by demand profile generator (DPG) from Fraunhofer IEE + (see :func:`get_household_demand_profiles_raw`) +* Spatial information about people living in households by Zensus 2011 at + federal state level * type of household (family status) * age * size -* Spatial information about number of households per ha - * type of household (family status, 5 types) -* mapping of 100 x 100 m cells to NUTS3 and NUTS1 +* Spatial information about number of households per ha, categorized by type + of household (family status) with 5 categories (also from Zensus 2011) * Demand-Regio annual household demand at NUTS3 level -What is the goal? -To use the IEE electricity demand time series at spatial resolution of 100 x 100 m cells. 
+**What is the goal?** + +To use the electricity demand time series from the `demand profile generator` +to created spatially reference household demand time series for Germany at a +resolution of 100 x 100 m cells. + +**What is the challenge?** + +The electricity demand time series produced by demand profile generator offer +12 different household profile categories. +To use most of them, the spatial information about the number of households +per cell (5 categories) needs to be enriched by supplementary data to match +the household demand profile categories specifications. Hence, 10 out of 12 +different household profile categories can be distinguished and by increasing +the number of categories of cell-level household data. -What is the challenge? -The IEE electricity demand time series produced by demand-profile-generator (DPG) offer 12 different -household profile types. To use most of them, the spatial information about the number of households per ha (5 types) -needs to be enriched by supplementary data to better fit household profile specifications. Hence, 10 out of 12 -different household profile types can be distinguished and used. +**How are these datasets combined?** -How are these datasets mapped? * Spatial information about people living in households by zensus (2011) at federal state NUTS1 level -:var:`df_zensus` is aggregated to be compatible to IEE household profile specifications. + :var:`df_zensus` is aggregated to be compatible to IEE household profile specifications. * exclude kids and reduce to adults and seniors * group as defined in :var:`HH_TYPES` * convert data from people living in households to number of households by :var:`mapping_people_in_households` * calculate fraction of fine household types (10) within subgroup of rough household types (5) :var:`df_dist_households` * Spatial information about number of households per ha :var:`df_households_typ` is mapped to NUTS1 and NUTS3 level. 
-Data is enriched with refined household subgroups via :var:`df_dist_households` in :var:`df_zensus_cells`. + Data is enriched with refined household subgroups via :var:`df_dist_households` in :var:`df_zensus_cells`. * Enriched 100 x 100 m household dataset is used to sample and aggregate household profiles. A table including -individual profile id's for each cell and scaling factor to match Demand-Regio annual sum projections for 2035 and 2050 -at NUTS3 level is created in the database as `demand.household_electricity_profiles_in_census_cells`. + individual profile id's for each cell and scaling factor to match Demand-Regio annual sum projections for 2035 and 2050 + at NUTS3 level is created in the database as `demand.household_electricity_profiles_in_census_cells`. -What are central assumptions during the data processing? -* the mapping of zensus data to IEE household types is not trivial. In conversion from persons in household to number of -households, number of inhabitants for multi-persons households is estimated as weighted average in :var:`OO_factor` +**What are central assumptions during the data processing?** + +* mapping zensus data to IEE household categories is not trivial. In + conversion from persons in household to number of + households, number of inhabitants for multi-person households is estimated + as weighted average in :var:`OO_factor` * the distribution to refine household types at cell level are the same for each federal state * refining of household types lead to float number of profiles drew at cell level and need to be rounded to nearest int. 
* 100 x 100 m cells are matched to NUTS via centroid location * cells with households in unpopulated areas are removed -Drawbacks and limitations of the data -* the distribution to refine household types at cell level are the same for each federal state -* +**Drawbacks and limitations of the data** +* the distribution to refine household types at cell level are the same for + each federal state +* Household profiles aggregated annual demand matches Demand Regio demand at + NUTS-3 level, but it is not matching the demand regio time series profile -The table `demand.household_electricity_profiles_in_census_cells` is created -by :func:`houseprofiles_in_census_cells`. -# TODO: reference the function that create the table with HH profiles for each MV grid Notes ----- @@ -999,6 +1012,7 @@ def mv_grid_district_HH_electricity_load(): HouseholdElectricityProfilesHvMvSubstation, mvgd_profiles.to_dict(orient="records"), ) + return mvgd_profiles From a68e965880165b7778976c8085cc3c9e663d3854 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guido=20Ple=C3=9Fmann?= Date: Tue, 15 Jun 2021 12:05:20 +0200 Subject: [PATCH 38/97] Add missing docstrings --- .../hh_demand/hh_demand_profiles_tools.py | 30 +++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py b/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py index 56434fb1a..1d1e5b46f 100644 --- a/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py +++ b/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py @@ -785,9 +785,11 @@ def houseprofiles_in_census_cells(): """ Identify household electricity profiles for each census cell + Creates a table that maps household electricity demand profiles to zensus + cells. Each row represents one cell and contains a list of profile IDs. 
- Returns - ------- + Use :func:`get_houseprofiles_in_census_cells` to retrieve the data from + the database as pandas """ # Get demand profiles and zensus household type x age category data @@ -939,6 +941,18 @@ def houseprofiles_in_census_cells(): def get_houseprofiles_in_census_cells(): + """ + Retrieve household demand time profile mapping + + See Also + -------- + :func:`houseprofiles_in_census_cells` + + Returns + ------- + pd.DataFrame + Mapping of household demand profiles to zensus cells + """ with db.session_scope() as session: q = session.query(HouseholdElectricityProfilesInCensusCells) @@ -954,6 +968,18 @@ def get_houseprofiles_in_census_cells(): def mv_grid_district_HH_electricity_load(): + """ + Aggregated household demand time series at HV/MV substation level + + Calculate the aggregated demand time series based on the demand profiles + of each zensus cell inside each MV grid district. + + Returns + ------- + pd.DataFrame + Multiindexed dataframe with `timestep` and `subst_id` as indexers. + Demand is given in kWh. 
+ """ with db.session_scope() as session: cells_query = session.query( From 354044c8a88440c2dddddfaa197090eb89e9086a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guido=20Ple=C3=9Fmann?= Date: Tue, 15 Jun 2021 12:19:29 +0200 Subject: [PATCH 39/97] Remove code for data checking --- .../hh_demand/hh_demand_profiles.py | 7 -- .../hh_demand/hh_demand_profiles_tools.py | 108 +----------------- 2 files changed, 3 insertions(+), 112 deletions(-) diff --git a/src/egon/data/processing/hh_demand/hh_demand_profiles.py b/src/egon/data/processing/hh_demand/hh_demand_profiles.py index 0cdb513db..f56e9eb9b 100644 --- a/src/egon/data/processing/hh_demand/hh_demand_profiles.py +++ b/src/egon/data/processing/hh_demand/hh_demand_profiles.py @@ -8,10 +8,3 @@ # Calculate household electricity demand time series for each MV grid profiles = hh_tools.mv_grid_district_HH_electricity_load() print(profiles) - - # ONLY FOR CHECKING - # Create table with profiles for each census cell including geom - hh_tools.mv_grid_district_HH_electricity_load_check() - - # Create table with zensus households including geom from zensus population table - hh_tools.zensus_household_with_geom_check() diff --git a/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py b/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py index 1d1e5b46f..923b876c0 100644 --- a/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py +++ b/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py @@ -94,18 +94,15 @@ from urllib.request import urlretrieve import random -from sqlalchemy import ARRAY, Column, Float, ForeignKey, Integer, String, \ - text, Sequence +from sqlalchemy import ARRAY, Column, Float, Integer, String, \ + Sequence from sqlalchemy.ext.declarative import declarative_base import numpy as np import pandas as pd from egon.data import db from egon.data.processing.zensus_grid_districts import MapZensusGridDistricts -from egon.data.processing.zensus_vg250.zensus_population_inside_germany 
import ( - DestatisZensusPopulationPerHa, - DestatisZensusPopulationPerHaInsideGermany, -) + Base = declarative_base() @@ -1040,102 +1037,3 @@ def mv_grid_district_HH_electricity_load(): ) return mvgd_profiles - - -def mv_grid_district_HH_electricity_load_check(): - with db.session_scope() as session: - cells_w_geom_query = ( - session.query( - HouseholdElectricityProfilesInCensusCells, - MapZensusGridDistricts.subst_id, - DestatisZensusPopulationPerHaInsideGermany.population, - DestatisZensusPopulationPerHaInsideGermany.geom, - ) - .join( - MapZensusGridDistricts, - HouseholdElectricityProfilesInCensusCells.cell_id - == MapZensusGridDistricts.zensus_population_id, - ) - .join( - DestatisZensusPopulationPerHaInsideGermany, - HouseholdElectricityProfilesInCensusCells.cell_id - == DestatisZensusPopulationPerHaInsideGermany.gid, - ) - ) - - # Used for visual check of data - import geopandas as gpd - - cells_w_geom = gpd.read_postgis( - cells_w_geom_query.statement, - cells_w_geom_query.session.bind, - index_col="cell_id", - ) - cells_w_geom.to_postgis( - "household_electricity_profiles_in_census_cells_with_geom", - schema="demand", - con=db.engine(), - if_exists="replace", - ) - - -class EgonDestatisZensusHouseholdPerHa(Base): - __tablename__ = "egon_destatis_zensus_household_per_ha" - __table_args__ = {"schema": "society"} - - id = Column( - Integer, - primary_key=True, - server_default=text( - "nextval('society.egon_destatis_zensus_household_per_ha_id_seq'::regclass)" - ), - ) - grid_id = Column(String(50)) - grid_id_new = Column(String(50)) - attribute = Column(String(50)) - characteristics_code = Column(Integer) - characteristics_text = Column(String) - quantity = Column(Integer) - quantity_q = Column(Integer) - zensus_population_id = Column( - ForeignKey("society.destatis_zensus_population_per_ha.id") - ) - - -def zensus_household_with_geom_check(): - import geopandas as gpd - - with db.session_scope() as session: - - # Household x Zensus population x Zensus grid 
district mapping - cells_w_geom_query = ( - session.query( - EgonDestatisZensusHouseholdPerHa, - MapZensusGridDistricts.subst_id, - DestatisZensusPopulationPerHaInsideGermany.population, - DestatisZensusPopulationPerHaInsideGermany.geom, - ) - .join( - MapZensusGridDistricts, - EgonDestatisZensusHouseholdPerHa.zensus_population_id - == MapZensusGridDistricts.zensus_population_id, - ) - .join( - DestatisZensusPopulationPerHaInsideGermany, - EgonDestatisZensusHouseholdPerHa.zensus_population_id - == DestatisZensusPopulationPerHaInsideGermany.gid, - ) - ) - cells_w_geom = gpd.read_postgis( - cells_w_geom_query.statement, - cells_w_geom_query.session.bind, - index_col="zensus_population_id", - ) - - # Used for visual check of data - cells_w_geom.to_postgis( - "egon_destatis_zensus_household_per_ha_with_geom", - schema="society", - con=db.engine(), - if_exists="replace", - ) From 661434104e3623b9e10cadd446f5877b818d75dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guido=20Ple=C3=9Fmann?= Date: Tue, 15 Jun 2021 19:01:28 +0200 Subject: [PATCH 40/97] Write MV grid aggregated profiles directly to eTraGo compatible table --- .../hh_demand/hh_demand_profiles.py | 15 +++- .../hh_demand/hh_demand_profiles_tools.py | 75 ++++++++++--------- 2 files changed, 53 insertions(+), 37 deletions(-) diff --git a/src/egon/data/processing/hh_demand/hh_demand_profiles.py b/src/egon/data/processing/hh_demand/hh_demand_profiles.py index f56e9eb9b..762ce4024 100644 --- a/src/egon/data/processing/hh_demand/hh_demand_profiles.py +++ b/src/egon/data/processing/hh_demand/hh_demand_profiles.py @@ -6,5 +6,16 @@ hh_tools.houseprofiles_in_census_cells() # Calculate household electricity demand time series for each MV grid - profiles = hh_tools.mv_grid_district_HH_electricity_load() - print(profiles) + profiles_2035 = hh_tools.mv_grid_district_HH_electricity_load( + "eGon2035", + 2035, + "0.0.0", + drop_table=True + ) + profiles_2050 = hh_tools.mv_grid_district_HH_electricity_load( + "eGon100RE", + 2050, + 
"0.0.0" + ) + print(profiles_2035) + print(profiles_2050) diff --git a/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py b/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py index 923b876c0..e55f1c530 100644 --- a/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py +++ b/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py @@ -94,8 +94,7 @@ from urllib.request import urlretrieve import random -from sqlalchemy import ARRAY, Column, Float, Integer, String, \ - Sequence +from sqlalchemy import ARRAY, Column, Float, Integer, String from sqlalchemy.ext.declarative import declarative_base import numpy as np import pandas as pd @@ -201,19 +200,16 @@ class HouseholdElectricityProfilesInCensusCells(Base): factor_2050 = Column(Float) -class HouseholdElectricityProfilesHvMvSubstation(Base): - __tablename__ = "household_electricity_profiles_hvmv_substation" +class EgonEtragoElectricityHouseholds(Base): + __tablename__ = "egon_etrago_electricity_households" __table_args__ = {"schema": "demand"} - index = Column( - Integer, - Sequence(f"{__tablename__}_id_seq", - schema=f"{ __table_args__['schema']}"), - primary_key=True, - ) - subst_id = Column(Integer) - timestep = Column(Integer) - household_electricity_load = Column(Float) + version = Column(String, primary_key=True) + subst_id = Column(Integer, primary_key=True) + scn_name = Column(String, primary_key=True) + p_set = Column(ARRAY(Float)) + q_set = Column(ARRAY(Float)) + def clean(x): @@ -964,19 +960,39 @@ def get_houseprofiles_in_census_cells(): return census_profile_mapping -def mv_grid_district_HH_electricity_load(): +def mv_grid_district_HH_electricity_load(scenario_name, scenario_year, version, drop_table=False): """ Aggregated household demand time series at HV/MV substation level Calculate the aggregated demand time series based on the demand profiles of each zensus cell inside each MV grid district. 
+ Parameters + ---------- + scenario_name: str + Scenario name identifier, i.e. "eGon2035" + scenario_year: int + Scenario year according to `scenario_name` + version: str + Version identifier + drop_table: bool + Toggle to True for dropping table at beginning of this function. + Be careful, delete any data. + Returns ------- pd.DataFrame Multiindexed dataframe with `timestep` and `subst_id` as indexers. Demand is given in kWh. """ + engine = db.engine() + if drop_table: + EgonEtragoElectricityHouseholds.__table__.drop( + bind=engine, checkfirst=True + ) + EgonEtragoElectricityHouseholds.__table__.create( + bind=engine, checkfirst=True + ) with db.session_scope() as session: cells_query = session.query( @@ -998,41 +1014,30 @@ def mv_grid_district_HH_electricity_load(): # Create aggregated load profile for each MV grid district df_profiles = get_household_demand_profiles_raw() - mvgd_profiles_list = [] + mvgd_profiles_dict = {} for grid_district, data in cells.groupby("subst_id"): mvgd_profile = get_load_timeseries( df_profiles=df_profiles, df_cell_demand_metadata=data, cell_ids=data.index, - year=2035, + year=scenario_year, peak_load_only=False, ) - mvgd_profile.name = grid_district - mvgd_profiles_list.append(mvgd_profile) - - mvgd_profiles = pd.concat(mvgd_profiles_list, axis=1) - - # Add timestep index - mvgd_profiles["timestep"] = mvgd_profiles.index + 1 + mvgd_profiles_dict[grid_district] = [mvgd_profile.to_list()] + mvgd_profiles = pd.DataFrame.from_dict(mvgd_profiles_dict, orient="index") # Reshape data: put MV grid ids in columns to a single index column - mvgd_profiles = mvgd_profiles.set_index("timestep").stack() - mvgd_profiles.name = "household_electricity_load" - mvgd_profiles.index.names = ["timestep", "subst_id"] mvgd_profiles = mvgd_profiles.reset_index() + mvgd_profiles.columns = ["subst_id", "p_set"] - # Insert data into respective database table - engine = db.engine() - HouseholdElectricityProfilesHvMvSubstation.__table__.drop( - bind=engine, 
checkfirst=True - ) - HouseholdElectricityProfilesHvMvSubstation.__table__.create( - bind=engine, checkfirst=True - ) + # Add remaining columns + mvgd_profiles["version"] = version + mvgd_profiles["scn_name"] = scenario_name + # Insert data into respective database table with db.session_scope() as session: session.bulk_insert_mappings( - HouseholdElectricityProfilesHvMvSubstation, + EgonEtragoElectricityHouseholds, mvgd_profiles.to_dict(orient="records"), ) From 7ca901286937072d9e8a6be0c9e4736315cfd8ab Mon Sep 17 00:00:00 2001 From: "Julian.Endres" Date: Thu, 17 Jun 2021 19:39:13 +0200 Subject: [PATCH 41/97] add hh_demand_profiles to pipeline --- src/egon/data/airflow/dags/pipeline.py | 35 ++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/src/egon/data/airflow/dags/pipeline.py b/src/egon/data/airflow/dags/pipeline.py index 8d0698d52..3fa0134b8 100755 --- a/src/egon/data/airflow/dags/pipeline.py +++ b/src/egon/data/airflow/dags/pipeline.py @@ -8,6 +8,7 @@ from egon.data.datasets import database from egon.data.datasets.data_bundle import DataBundle from egon.data.datasets.osm import OpenStreetMap +from egon.data.datasets import Dataset from egon.data.processing.zensus_vg250 import ( zensus_population_inside_germany as zensus_vg250, ) @@ -37,6 +38,7 @@ import egon.data.processing.mv_grid_districts as mvgd import egon.data.processing.zensus as process_zs import egon.data.processing.zensus_grid_districts as zensus_grid_districts +import egon.data.processing.hh_demand.hh_demand_profiles_tools as hh_tools from egon.data import db @@ -562,3 +564,36 @@ nep_insert_data >> solar_rooftop_etrago etrago_input_data >> solar_rooftop_etrago map_zensus_grid_districts >> solar_rooftop_etrago + + hh_profiles_in_census_cells = PythonOperator( + task_id="hh_profiles_in_census_cells", + python_callable=hh_tools.houseprofiles_in_census_cells, + ) + + mv_HH_electricity_load_2035 = PythonOperator( + task_id="mv_HH_electricity_load_2035", + 
python_callable=hh_tools.mv_grid_district_HH_electricity_load, + op_args=["eGon2035", 2035, "0.0.0"], + op_kwargs={'drop_table': True}, + ) + + mv_HH_electricity_load_2050 = PythonOperator( + task_id="mv_HH_electricity_load_2050", + python_callable=hh_tools.mv_grid_district_HH_electricity_load, + op_args=["eGon100RE", 2050, "0.0.0"], + op_kwargs={'drop_table': True}, + ) + + hh_demand = Dataset( + name="hh_demand", + version="0.0.0", + dependencies=[vg250_clean_and_prepare, + zensus_misc_import, + map_zensus_vg250, + # zensus_inside_ger, + demandregio_demand_households], + tasks=(hh_profiles_in_census_cells, + mv_HH_electricity_load_2035, + mv_HH_electricity_load_2050), + ) + hh_demand.insert_into(pipeline) From 837efcdc0b2bae11c639cc56b55a9268dc8cf76a Mon Sep 17 00:00:00 2001 From: "Julian.Endres" Date: Fri, 18 Jun 2021 15:41:44 +0200 Subject: [PATCH 42/97] add more dependencies --- src/egon/data/airflow/dags/pipeline.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/egon/data/airflow/dags/pipeline.py b/src/egon/data/airflow/dags/pipeline.py index 3fa0134b8..d1978e3f0 100755 --- a/src/egon/data/airflow/dags/pipeline.py +++ b/src/egon/data/airflow/dags/pipeline.py @@ -589,8 +589,8 @@ version="0.0.0", dependencies=[vg250_clean_and_prepare, zensus_misc_import, - map_zensus_vg250, - # zensus_inside_ger, + map_zensus_grid_districts, + zensus_inside_ger, demandregio_demand_households], tasks=(hh_profiles_in_census_cells, mv_HH_electricity_load_2035, From 798499d3900a526ea82eaffb9dd6c6908bdc7795 Mon Sep 17 00:00:00 2001 From: "Julian.Endres" Date: Fri, 18 Jun 2021 18:34:44 +0200 Subject: [PATCH 43/97] remove op_kwarg for mv_HH_electricity_load_2050 --- src/egon/data/airflow/dags/pipeline.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/egon/data/airflow/dags/pipeline.py b/src/egon/data/airflow/dags/pipeline.py index d1978e3f0..50663e7ad 100755 --- a/src/egon/data/airflow/dags/pipeline.py +++ b/src/egon/data/airflow/dags/pipeline.py @@ 
-581,7 +581,6 @@ task_id="mv_HH_electricity_load_2050", python_callable=hh_tools.mv_grid_district_HH_electricity_load, op_args=["eGon100RE", 2050, "0.0.0"], - op_kwargs={'drop_table': True}, ) hh_demand = Dataset( From 5889e7c790f4aaf57aaf455eb8736b8dc29545da Mon Sep 17 00:00:00 2001 From: "Julian.Endres" Date: Mon, 21 Jun 2021 12:21:10 +0200 Subject: [PATCH 44/97] remove hh_demand-profiles.py --- .../hh_demand/hh_demand_profiles.py | 21 ------------------- 1 file changed, 21 deletions(-) delete mode 100644 src/egon/data/processing/hh_demand/hh_demand_profiles.py diff --git a/src/egon/data/processing/hh_demand/hh_demand_profiles.py b/src/egon/data/processing/hh_demand/hh_demand_profiles.py deleted file mode 100644 index 762ce4024..000000000 --- a/src/egon/data/processing/hh_demand/hh_demand_profiles.py +++ /dev/null @@ -1,21 +0,0 @@ -from egon.data.processing.hh_demand import hh_demand_profiles_tools as hh_tools - -if __name__ == "__main__": - - # Create table with mapping of census cells and household elec. 
profiles - hh_tools.houseprofiles_in_census_cells() - - # Calculate household electricity demand time series for each MV grid - profiles_2035 = hh_tools.mv_grid_district_HH_electricity_load( - "eGon2035", - 2035, - "0.0.0", - drop_table=True - ) - profiles_2050 = hh_tools.mv_grid_district_HH_electricity_load( - "eGon100RE", - 2050, - "0.0.0" - ) - print(profiles_2035) - print(profiles_2050) From 3a11a3653db7f413136ce81c35ae17a2401d5420 Mon Sep 17 00:00:00 2001 From: "Julian.Endres" Date: Mon, 21 Jun 2021 12:23:31 +0200 Subject: [PATCH 45/97] rename hh_demand_profiles_tools.py to hh_demand_profiles.py --- .../{hh_demand_profiles_tools.py => hh_demand_profiles.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/egon/data/processing/hh_demand/{hh_demand_profiles_tools.py => hh_demand_profiles.py} (100%) diff --git a/src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py b/src/egon/data/processing/hh_demand/hh_demand_profiles.py similarity index 100% rename from src/egon/data/processing/hh_demand/hh_demand_profiles_tools.py rename to src/egon/data/processing/hh_demand/hh_demand_profiles.py From 680d7cdde9c40678e4e4a0a6085b496b53462d51 Mon Sep 17 00:00:00 2001 From: "Julian.Endres" Date: Mon, 21 Jun 2021 12:29:20 +0200 Subject: [PATCH 46/97] black&isort --- src/egon/data/processing/hh_demand/hh_demand_profiles.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/egon/data/processing/hh_demand/hh_demand_profiles.py b/src/egon/data/processing/hh_demand/hh_demand_profiles.py index e55f1c530..6f2f49623 100644 --- a/src/egon/data/processing/hh_demand/hh_demand_profiles.py +++ b/src/egon/data/processing/hh_demand/hh_demand_profiles.py @@ -102,7 +102,6 @@ from egon.data import db from egon.data.processing.zensus_grid_districts import MapZensusGridDistricts - Base = declarative_base() import egon.data.config @@ -211,7 +210,6 @@ class EgonEtragoElectricityHouseholds(Base): q_set = Column(ARRAY(Float)) - def clean(x): """Clean 
zensus household data row-wise @@ -571,7 +569,8 @@ def get_cell_demand_profile_ids(df_cell, pool_size): cell_profile_ids = [ (hh_type, random.sample(range(pool_size[hh_type]), k=sq)) for hh_type, sq in zip( - df_cell["hh_type"], np.rint(df_cell["hh_10types"].values).astype(int) + df_cell["hh_type"], + np.rint(df_cell["hh_10types"].values).astype(int), ) ] @@ -960,7 +959,9 @@ def get_houseprofiles_in_census_cells(): return census_profile_mapping -def mv_grid_district_HH_electricity_load(scenario_name, scenario_year, version, drop_table=False): +def mv_grid_district_HH_electricity_load( + scenario_name, scenario_year, version, drop_table=False +): """ Aggregated household demand time series at HV/MV substation level From 9326f1149033109734253783251b3733cac3558a Mon Sep 17 00:00:00 2001 From: "Julian.Endres" Date: Mon, 21 Jun 2021 12:34:52 +0200 Subject: [PATCH 47/97] add changelog --- CHANGELOG.rst | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index c8dfff746..d73452e6b 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -65,26 +65,26 @@ Added `#9 `_ * Add hydro and biomass power plants eGon2035 `#127 `_ -* Creation of the ehv/hv grid model with osmTGmod, see - `issue #4 `_ and +* Creation of the ehv/hv grid model with osmTGmod, see + `issue #4 `_ and `PR #164 `_ * Identification of medium-voltage grid districts - `#10 `_ + `#10 `_ * Distribute electrical demands of households to zensus cells `#181 `_ * Distribute electrical demands of cts to zensus cells `#210 `_ * Include industrial sites' download, import and merge `#117 `_ -* Integrate scenario table with parameters for each sector +* Integrate scenario table with parameters for each sector `#177 `_ * The volume of the docker container for the PostgreSQL database is saved in the project directory under `docker/database-data`. The current user (`$USER`) is owner of the volume. 
- Containers created prior to this change will fail when using the + Containers created prior to this change will fail when using the changed code. The container needs to be re-created. `#228 `_ -* Extract landuse areas from OSM +* Extract landuse areas from OSM `#214 `_ * Integrate weather data and renewable feedin timeseries `#19 `_ @@ -100,6 +100,8 @@ Added `#198 `_ * Integrate data bundle `#272 `_ +* Add hh demand profiles and load area aggregation + `#256 `_ .. _PR #159: https://github.com/openego/eGon-data/pull/159 @@ -135,7 +137,7 @@ Changed * Update installation of demandregio's disaggregator `#202 `_ * Update etrago tables - `#243 `_ and + `#243 `_ and `#285 `_ Bug fixes From e02047a604970ef986f391596844c4e6d36fbc17 Mon Sep 17 00:00:00 2001 From: "Julian.Endres" Date: Mon, 21 Jun 2021 12:42:55 +0200 Subject: [PATCH 48/97] change import of hh_demand --- src/egon/data/airflow/dags/pipeline.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/egon/data/airflow/dags/pipeline.py b/src/egon/data/airflow/dags/pipeline.py index 50663e7ad..7923d19f9 100755 --- a/src/egon/data/airflow/dags/pipeline.py +++ b/src/egon/data/airflow/dags/pipeline.py @@ -38,7 +38,7 @@ import egon.data.processing.mv_grid_districts as mvgd import egon.data.processing.zensus as process_zs import egon.data.processing.zensus_grid_districts as zensus_grid_districts -import egon.data.processing.hh_demand.hh_demand_profiles_tools as hh_tools +import egon.data.processing.hh_demand.hh_demand_profiles as hh_demand_profiles from egon.data import db @@ -567,19 +567,19 @@ hh_profiles_in_census_cells = PythonOperator( task_id="hh_profiles_in_census_cells", - python_callable=hh_tools.houseprofiles_in_census_cells, + python_callable=hh_demand_profiles.houseprofiles_in_census_cells, ) mv_HH_electricity_load_2035 = PythonOperator( task_id="mv_HH_electricity_load_2035", - python_callable=hh_tools.mv_grid_district_HH_electricity_load, + 
python_callable=hh_demand_profiles.mv_grid_district_HH_electricity_load, op_args=["eGon2035", 2035, "0.0.0"], op_kwargs={'drop_table': True}, ) mv_HH_electricity_load_2050 = PythonOperator( task_id="mv_HH_electricity_load_2050", - python_callable=hh_tools.mv_grid_district_HH_electricity_load, + python_callable=hh_demand_profiles.mv_grid_district_HH_electricity_load, op_args=["eGon100RE", 2050, "0.0.0"], ) From b2d8b33b4d13e5eda3d6687e2f085a69cb299615 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guido=20Ple=C3=9Fmann?= Date: Mon, 21 Jun 2021 13:42:32 +0200 Subject: [PATCH 49/97] Extent CHANGELOG note --- CHANGELOG.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index d73452e6b..89ca0ad11 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -100,7 +100,9 @@ Added `#198 `_ * Integrate data bundle `#272 `_ -* Add hh demand profiles and load area aggregation +* Add household electricity demand time series, mapping of + demand profiles to census cells and aggregated household + electricity demand time series at MV grid district level `#256 `_ .. 
_PR #159: https://github.com/openego/eGon-data/pull/159 From b57afdb279f22395c8132a7359a21c858fc23ddd Mon Sep 17 00:00:00 2001 From: "Julian.Endres" Date: Thu, 24 Jun 2021 14:59:17 +0200 Subject: [PATCH 50/97] change implementation of dataset versioning to partial --- src/egon/data/airflow/dags/pipeline.py | 48 +++++-------------- .../hh_demand/hh_demand_profiles.py | 29 +++++++++++ 2 files changed, 42 insertions(+), 35 deletions(-) diff --git a/src/egon/data/airflow/dags/pipeline.py b/src/egon/data/airflow/dags/pipeline.py index 7923d19f9..4dd54ac98 100755 --- a/src/egon/data/airflow/dags/pipeline.py +++ b/src/egon/data/airflow/dags/pipeline.py @@ -8,10 +8,7 @@ from egon.data.datasets import database from egon.data.datasets.data_bundle import DataBundle from egon.data.datasets.osm import OpenStreetMap -from egon.data.datasets import Dataset -from egon.data.processing.zensus_vg250 import ( - zensus_population_inside_germany as zensus_vg250, -) + import airflow import egon.data.importing.demandregio as import_dr import egon.data.importing.demandregio.install_disaggregator as install_dr @@ -38,7 +35,7 @@ import egon.data.processing.mv_grid_districts as mvgd import egon.data.processing.zensus as process_zs import egon.data.processing.zensus_grid_districts as zensus_grid_districts -import egon.data.processing.hh_demand.hh_demand_profiles as hh_demand_profiles +from egon.data.processing.hh_demand.hh_demand_profiles import hh_demand_setup from egon.data import db @@ -565,34 +562,15 @@ etrago_input_data >> solar_rooftop_etrago map_zensus_grid_districts >> solar_rooftop_etrago - hh_profiles_in_census_cells = PythonOperator( - task_id="hh_profiles_in_census_cells", - python_callable=hh_demand_profiles.houseprofiles_in_census_cells, - ) - - mv_HH_electricity_load_2035 = PythonOperator( - task_id="mv_HH_electricity_load_2035", - python_callable=hh_demand_profiles.mv_grid_district_HH_electricity_load, - op_args=["eGon2035", 2035, "0.0.0"], - op_kwargs={'drop_table': True}, - ) 
- - mv_HH_electricity_load_2050 = PythonOperator( - task_id="mv_HH_electricity_load_2050", - python_callable=hh_demand_profiles.mv_grid_district_HH_electricity_load, - op_args=["eGon100RE", 2050, "0.0.0"], - ) - - hh_demand = Dataset( - name="hh_demand", - version="0.0.0", - dependencies=[vg250_clean_and_prepare, - zensus_misc_import, - map_zensus_grid_districts, - zensus_inside_ger, - demandregio_demand_households], - tasks=(hh_profiles_in_census_cells, - mv_HH_electricity_load_2035, - mv_HH_electricity_load_2050), - ) + # initiate household demand profile dataset and medium voltage load area profiles + hh_demand = hh_demand_setup(dependencies=[ + vg250_clean_and_prepare, + zensus_misc_import, + map_zensus_grid_districts, + zensus_inside_ger, + demandregio_demand_households, + ]) hh_demand.insert_into(pipeline) + householdprofiles_in_cencus_cells = tasks["houseprofiles-in-census-cells"] + mv_hh_electricity_load_2035 = tasks["MV-hh-electricity-load-2035"] + mv_hh_electricity_load_2050 = tasks["MV-hh-electricity-load-2050"] diff --git a/src/egon/data/processing/hh_demand/hh_demand_profiles.py b/src/egon/data/processing/hh_demand/hh_demand_profiles.py index 6f2f49623..10a2bdc95 100644 --- a/src/egon/data/processing/hh_demand/hh_demand_profiles.py +++ b/src/egon/data/processing/hh_demand/hh_demand_profiles.py @@ -88,6 +88,9 @@ is made in ... the content of this module docstring needs to be moved to docs attribute of the respective dataset class. 
""" +from functools import partial +from egon.data.datasets import Dataset +from airflow.operators.python_operator import PythonOperator from itertools import cycle from pathlib import Path @@ -1043,3 +1046,29 @@ def mv_grid_district_HH_electricity_load( ) return mvgd_profiles + + +mv_HH_electricity_load_2035 = PythonOperator( + task_id="MV-hh-electricity-load-2035", + python_callable=mv_grid_district_HH_electricity_load, + op_args=["eGon2035", 2035, "0.0.0"], + op_kwargs={'drop_table': True}, +) + + +mv_HH_electricity_load_2050 = PythonOperator( + task_id="MV-hh-electricity-load-2050", + python_callable=mv_grid_district_HH_electricity_load, + op_args=["eGon100RE", 2050, "0.0.0"], +) + + +hh_demand_setup = partial( + Dataset, + name="HH Demand", + version="0.0.0", + dependencies=[], + tasks=(houseprofiles_in_census_cells, + mv_HH_electricity_load_2035, + mv_HH_electricity_load_2050), +) From c8b2402e2d191246e47cd5ab0288bb70d0136346 Mon Sep 17 00:00:00 2001 From: "Julian.Endres" Date: Thu, 24 Jun 2021 15:00:52 +0200 Subject: [PATCH 51/97] black&isort --- .../processing/hh_demand/hh_demand_profiles.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/egon/data/processing/hh_demand/hh_demand_profiles.py b/src/egon/data/processing/hh_demand/hh_demand_profiles.py index 10a2bdc95..9597cca50 100644 --- a/src/egon/data/processing/hh_demand/hh_demand_profiles.py +++ b/src/egon/data/processing/hh_demand/hh_demand_profiles.py @@ -89,20 +89,19 @@ docs attribute of the respective dataset class. 
""" from functools import partial -from egon.data.datasets import Dataset -from airflow.operators.python_operator import PythonOperator - from itertools import cycle from pathlib import Path from urllib.request import urlretrieve import random +from airflow.operators.python_operator import PythonOperator from sqlalchemy import ARRAY, Column, Float, Integer, String from sqlalchemy.ext.declarative import declarative_base import numpy as np import pandas as pd from egon.data import db +from egon.data.datasets import Dataset from egon.data.processing.zensus_grid_districts import MapZensusGridDistricts Base = declarative_base() @@ -1052,7 +1051,7 @@ def mv_grid_district_HH_electricity_load( task_id="MV-hh-electricity-load-2035", python_callable=mv_grid_district_HH_electricity_load, op_args=["eGon2035", 2035, "0.0.0"], - op_kwargs={'drop_table': True}, + op_kwargs={"drop_table": True}, ) @@ -1068,7 +1067,9 @@ def mv_grid_district_HH_electricity_load( name="HH Demand", version="0.0.0", dependencies=[], - tasks=(houseprofiles_in_census_cells, - mv_HH_electricity_load_2035, - mv_HH_electricity_load_2050), + tasks=( + houseprofiles_in_census_cells, + mv_HH_electricity_load_2035, + mv_HH_electricity_load_2050, + ), ) From b7c52f624099aa56fb5197d6c55eee3e029cbe2b Mon Sep 17 00:00:00 2001 From: "Julian.Endres" Date: Thu, 24 Jun 2021 15:52:43 +0200 Subject: [PATCH 52/97] move PythonOperator into pipeline --- src/egon/data/airflow/dags/pipeline.py | 23 +++++++++++++++++- .../hh_demand/hh_demand_profiles.py | 24 ++++--------------- 2 files changed, 26 insertions(+), 21 deletions(-) diff --git a/src/egon/data/airflow/dags/pipeline.py b/src/egon/data/airflow/dags/pipeline.py index 4dd54ac98..b90f33dfa 100755 --- a/src/egon/data/airflow/dags/pipeline.py +++ b/src/egon/data/airflow/dags/pipeline.py @@ -8,6 +8,7 @@ from egon.data.datasets import database from egon.data.datasets.data_bundle import DataBundle from egon.data.datasets.osm import OpenStreetMap +# from egon.data.processing 
import hh_demand import airflow import egon.data.importing.demandregio as import_dr @@ -37,6 +38,9 @@ import egon.data.processing.zensus_grid_districts as zensus_grid_districts from egon.data.processing.hh_demand.hh_demand_profiles import hh_demand_setup +from egon.data.processing.hh_demand.hh_demand_profiles import mv_grid_district_HH_electricity_load +from egon.data.processing.hh_demand.hh_demand_profiles import houseprofiles_in_census_cells + from egon.data import db @@ -563,13 +567,30 @@ map_zensus_grid_districts >> solar_rooftop_etrago # initiate household demand profile dataset and medium voltage load area profiles + mv_HH_electricity_load_2035 = PythonOperator( + task_id="MV-hh-electricity-load-2035", + python_callable=mv_grid_district_HH_electricity_load, + op_args=["eGon2035", 2035, "0.0.0"], + op_kwargs={"drop_table": True}, + ) + + mv_HH_electricity_load_2050 = PythonOperator( + task_id="MV-hh-electricity-load-2050", + python_callable=mv_grid_district_HH_electricity_load, + op_args=["eGon100RE", 2050, "0.0.0"], + ) + hh_demand = hh_demand_setup(dependencies=[ vg250_clean_and_prepare, zensus_misc_import, map_zensus_grid_districts, zensus_inside_ger, demandregio_demand_households, - ]) + ], + tasks=(houseprofiles_in_census_cells, + mv_HH_electricity_load_2035, + mv_HH_electricity_load_2050,) + ) hh_demand.insert_into(pipeline) householdprofiles_in_cencus_cells = tasks["houseprofiles-in-census-cells"] mv_hh_electricity_load_2035 = tasks["MV-hh-electricity-load-2035"] diff --git a/src/egon/data/processing/hh_demand/hh_demand_profiles.py b/src/egon/data/processing/hh_demand/hh_demand_profiles.py index 9597cca50..f016527c2 100644 --- a/src/egon/data/processing/hh_demand/hh_demand_profiles.py +++ b/src/egon/data/processing/hh_demand/hh_demand_profiles.py @@ -1047,29 +1047,13 @@ def mv_grid_district_HH_electricity_load( return mvgd_profiles -mv_HH_electricity_load_2035 = PythonOperator( - task_id="MV-hh-electricity-load-2035", - 
python_callable=mv_grid_district_HH_electricity_load, - op_args=["eGon2035", 2035, "0.0.0"], - op_kwargs={"drop_table": True}, -) - - -mv_HH_electricity_load_2050 = PythonOperator( - task_id="MV-hh-electricity-load-2050", - python_callable=mv_grid_district_HH_electricity_load, - op_args=["eGon100RE", 2050, "0.0.0"], -) - - hh_demand_setup = partial( Dataset, name="HH Demand", version="0.0.0", dependencies=[], - tasks=( - houseprofiles_in_census_cells, - mv_HH_electricity_load_2035, - mv_HH_electricity_load_2050, - ), + # Tasks are declared in pipeline as function is used multiple time with different args + # to differentiate these tasks PythonOperator with specific id-names are used + # PythonOperator needs to be declared in pipeline to be mapped to DAG + # tasks=[], ) From 08a2fa94b05ed124a4720a88d3c1754e410ec5b8 Mon Sep 17 00:00:00 2001 From: "Julian.Endres" Date: Tue, 13 Jul 2021 11:25:19 +0200 Subject: [PATCH 53/97] Move hh_demand_profiles to datasets directory --- .../data/{processing/hh_demand => datasets}/hh_demand_profiles.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/egon/data/{processing/hh_demand => datasets}/hh_demand_profiles.py (100%) diff --git a/src/egon/data/processing/hh_demand/hh_demand_profiles.py b/src/egon/data/datasets/hh_demand_profiles.py similarity index 100% rename from src/egon/data/processing/hh_demand/hh_demand_profiles.py rename to src/egon/data/datasets/hh_demand_profiles.py From 53679cc3eadfb481992ac8adda1afc2d59aae19a Mon Sep 17 00:00:00 2001 From: "Julian.Endres" Date: Tue, 13 Jul 2021 11:27:28 +0200 Subject: [PATCH 54/97] Clean imports --- src/egon/data/airflow/dags/pipeline.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/egon/data/airflow/dags/pipeline.py b/src/egon/data/airflow/dags/pipeline.py index 29b9e6730..eef533037 100755 --- a/src/egon/data/airflow/dags/pipeline.py +++ b/src/egon/data/airflow/dags/pipeline.py @@ -8,7 +8,8 @@ from egon.data.datasets import database from 
egon.data.datasets.data_bundle import DataBundle from egon.data.datasets.osm import OpenStreetMap -# from egon.data.processing import hh_demand +from egon.data.datasets.hh_demand_profiles import hh_demand_setup, mv_grid_district_HH_electricity_load, \ + houseprofiles_in_census_cells from egon.data.datasets.vg250 import Vg250 from egon.data.processing.zensus_vg250 import ( @@ -39,10 +40,7 @@ import egon.data.processing.mv_grid_districts as mvgd import egon.data.processing.zensus as process_zs import egon.data.processing.zensus_grid_districts as zensus_grid_districts -from egon.data.processing.hh_demand.hh_demand_profiles import hh_demand_setup -from egon.data.processing.hh_demand.hh_demand_profiles import mv_grid_district_HH_electricity_load -from egon.data.processing.hh_demand.hh_demand_profiles import houseprofiles_in_census_cells from egon.data import db From 49811f7d67b699bfc3ddfa0a096f8dde74c8ee25 Mon Sep 17 00:00:00 2001 From: "Julian.Endres" Date: Tue, 13 Jul 2021 11:27:53 +0200 Subject: [PATCH 55/97] Fix typo --- src/egon/data/datasets/hh_demand_profiles.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/egon/data/datasets/hh_demand_profiles.py b/src/egon/data/datasets/hh_demand_profiles.py index f016527c2..3ef6ba2c0 100644 --- a/src/egon/data/datasets/hh_demand_profiles.py +++ b/src/egon/data/datasets/hh_demand_profiles.py @@ -1052,8 +1052,8 @@ def mv_grid_district_HH_electricity_load( name="HH Demand", version="0.0.0", dependencies=[], - # Tasks are declared in pipeline as function is used multiple time with different args - # to differentiate these tasks PythonOperator with specific id-names are used + # Tasks are declared in pipeline as function is used multiple times with different args + # To differentiate these tasks PythonOperator with specific id-names are used # PythonOperator needs to be declared in pipeline to be mapped to DAG # tasks=[], ) From 1c04d0e865d95d01a16cc9314a14094478f31406 Mon Sep 17 00:00:00 2001 From: 
"Julian.Endres" Date: Wed, 21 Jul 2021 10:59:38 +0200 Subject: [PATCH 56/97] Fix Task-string --- src/egon/data/airflow/dags/pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/egon/data/airflow/dags/pipeline.py b/src/egon/data/airflow/dags/pipeline.py index eef533037..0a126405f 100755 --- a/src/egon/data/airflow/dags/pipeline.py +++ b/src/egon/data/airflow/dags/pipeline.py @@ -569,6 +569,6 @@ mv_HH_electricity_load_2050,) ) hh_demand.insert_into(pipeline) - householdprofiles_in_cencus_cells = tasks["houseprofiles-in-census-cells"] + householdprofiles_in_cencus_cells = tasks["hh_demand_profiles.houseprofiles-in-census-cells"] mv_hh_electricity_load_2035 = tasks["MV-hh-electricity-load-2035"] mv_hh_electricity_load_2050 = tasks["MV-hh-electricity-load-2050"] From e1294331433f025c3b140a096f368a5cfe4d62b8 Mon Sep 17 00:00:00 2001 From: "Julian.Endres" Date: Mon, 26 Jul 2021 10:49:24 +0200 Subject: [PATCH 57/97] Merge branch 'dev' into branch features/#256-hh-load-area-profile-generator --- AUTHORS.rst | 2 +- CHANGELOG.rst | 34 + README.rst | 13 +- docs/troubleshooting.rst | 15 + setup.py | 2 + src/egon/data/airflow/airflow.cfg | 2 +- src/egon/data/airflow/dags/pipeline.py | 138 +- src/egon/data/cli.py | 23 +- src/egon/data/datasets.yml | 82 ++ src/egon/data/datasets/__init__.py | 5 +- .../data/datasets/heat_etrago/__init__.py | 247 ++++ .../datasets/heat_etrago/power_to_heat.py | 404 ++++++ .../data/datasets/heat_supply/__init__.py | 118 ++ .../datasets/heat_supply/district_heating.py | 326 +++++ .../data/datasets/heat_supply/geothermal.py | 228 ++++ .../heat_supply/individual_heating.py | 226 ++++ .../data/{importing => datasets}/mastr.py | 11 + .../mv_grid_districts.py | 24 +- .../re_potential_areas/__init__.py | 97 +- src/egon/data/datasets/vg250/__init__.py | 2 +- .../vg250/cleaning_and_preparation.sql | 3 +- src/egon/data/importing/gas_grid/__init__.py | 6 +- src/egon/data/processing/calculate_dlr.py | 289 ++++ 
.../data/processing/gas_areas/__init__.py | 98 ++ src/egon/data/processing/osmtgmod/__init__.py | 27 +- .../data/processing/power_plants/__init__.py | 22 +- src/egon/data/processing/pv_ground_mounted.py | 1200 +++++++++++++++++ src/egon/data/processing/wind_farms.py | 477 +++++++ .../data/processing/zensus_grid_districts.py | 2 +- 29 files changed, 4010 insertions(+), 113 deletions(-) create mode 100644 src/egon/data/datasets/heat_etrago/__init__.py create mode 100644 src/egon/data/datasets/heat_etrago/power_to_heat.py create mode 100644 src/egon/data/datasets/heat_supply/__init__.py create mode 100644 src/egon/data/datasets/heat_supply/district_heating.py create mode 100644 src/egon/data/datasets/heat_supply/geothermal.py create mode 100644 src/egon/data/datasets/heat_supply/individual_heating.py rename src/egon/data/{importing => datasets}/mastr.py (86%) rename src/egon/data/{processing => datasets}/mv_grid_districts.py (98%) rename src/egon/data/{importing => datasets}/re_potential_areas/__init__.py (50%) create mode 100644 src/egon/data/processing/calculate_dlr.py create mode 100755 src/egon/data/processing/gas_areas/__init__.py create mode 100644 src/egon/data/processing/pv_ground_mounted.py create mode 100755 src/egon/data/processing/wind_farms.py diff --git a/AUTHORS.rst b/AUTHORS.rst index 5685c1427..0d7e707a0 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -2,4 +2,4 @@ Authors ======= -* Guido Pleßmann, Ilka Cußman, Stephan Günther - https://github.com/openego/eGon-data +* Guido Pleßmann, Ilka Cußman, Stephan Günther, Jonathan Amme - https://github.com/openego/eGon-data diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 48437a61d..647468339 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -33,6 +33,14 @@ Added returns the current configuration settings. See `PR #159`_ for more details. +* You can now use tasks which are not part of a ``Dataset``, i.e. which are + unversioned, as dependencies of a dataset. See `PR #318`_ for more + details. 
+ +* You can now force the tasks of a ``Dataset`` to be always executed by + giving the version of the ``Dataset`` a ``".dev"`` suffix. See `PR + #318`_ for more details. + * OSM data import as done in open_ego `#1 `_ which was updated to the latest long-term data set of the 2021-01-01 in @@ -104,6 +112,16 @@ Added demand profiles to census cells and aggregated household electricity demand time series at MV grid district level `#256 `_ +* Integrate distribution of wind onshore and pv ground mounted generation + `#146 `_ +* Integrate dynamic line rating potentials + `#72 `_ +* Integrate gas voronoi polygons + `#308 `_ +* Integrate supply strategies for individual and district heating + `#232 `_ + +>>>>>>> dev .. _PR #159: https://github.com/openego/eGon-data/pull/159 @@ -143,6 +161,10 @@ Changed `#285 `_ * Migrate VG250 to datasets `#283 `_ +* Allow configuring the airflow port + `#281 `_ +* Migrate mastr, mv_grid_districts and re_potential_areas to datasets + `#297 `_ Bug fixes --------- @@ -167,3 +189,15 @@ Bug fixes `#248 `_ * Change order of osmtgmod tasks `#253 `_ +* Fix missing municipalities + `#279 `_ +* Fix import of hydro power plants + `#270 `_ +* Fix path to osm-file for osmtgmod_osm_import + `#258 `_ +* Fix conflicting docker containers by setting a project name + `#289 `_ +* Fix versioning conflict with mv_grid_districts + `#340 `_ +* Set current working directory as java's temp dir when executing osmosis + `#344 `_ diff --git a/README.rst b/README.rst index 12a58a1be..645e52f07 100644 --- a/README.rst +++ b/README.rst @@ -120,11 +120,14 @@ packages are required too. Right now these are: :code:`sudo apt install gdal-bin`. 
* To download ERA5 weather data you need to register at the CDS - registration page and install the CDS API key as descibed + registration page and install the CDS API key as described `here `_ - You also have to agree on the `terms of use + You also have to agree on the `terms of use `_ +* Make sure you have enough free disk space (~350 GB) in your working + directory. + Installation ============ @@ -188,6 +191,10 @@ solution. can't be run on laptop. Use the :ref:`test mode ` for experimenting. +.. warning:: + + A complete run of the workflow needs loads of free disk space (~350 GB) to + store (temporary) files. Test mode --------- @@ -206,4 +213,4 @@ Data is reduced during execution of the workflow to represent only this area. Further Reading =============== -You can find more in depth documentation at https://eGon-data.readthedocs.io. +You can find more in-depth documentation at https://eGon-data.readthedocs.io. diff --git a/docs/troubleshooting.rst b/docs/troubleshooting.rst index 553e750fc..fed67fd62 100644 --- a/docs/troubleshooting.rst +++ b/docs/troubleshooting.rst @@ -99,6 +99,21 @@ The container and its data can be kept by renaming the docker container. docker rename egon-data-local-database NEW_CONTAINER_NAME + +``Working with multiple instances of egon-data`` +---------------------------------------------------------------------------- + +To make sure parallel installations of egon-data are not conflicting each other +users have to set different values for the following options in the configuration: + +.. 
code-block:: none + + --airflow-port + --compose-project-name + --database-port + --docker-container-name + + Other import or incompatible package version errors =================================================== diff --git a/setup.py b/setup.py index f3cded9aa..a8e9975c7 100755 --- a/setup.py +++ b/setup.py @@ -97,6 +97,8 @@ def read(*names, **kwargs): "rasterio", "rtree", "sqlalchemy<1.4", + "xarray", + "rioxarray", ], extras_require={ "dev": ["black", "flake8", "isort>=5", "pre-commit", "pytest", "tox"] diff --git a/src/egon/data/airflow/airflow.cfg b/src/egon/data/airflow/airflow.cfg index 47451bdc6..80b7247f7 100644 --- a/src/egon/data/airflow/airflow.cfg +++ b/src/egon/data/airflow/airflow.cfg @@ -302,7 +302,7 @@ default_ui_timezone = UTC web_server_host = 0.0.0.0 # The port on which to run the web server -web_server_port = 8080 +web_server_port = {--airflow-port} # Paths to the SSL certificate and key for the web server. When both are # provided SSL will be enabled. This does not change the web server port. 
diff --git a/src/egon/data/airflow/dags/pipeline.py b/src/egon/data/airflow/dags/pipeline.py index 0a126405f..2ec3dd2e3 100755 --- a/src/egon/data/airflow/dags/pipeline.py +++ b/src/egon/data/airflow/dags/pipeline.py @@ -7,10 +7,16 @@ from egon.data.datasets import database from egon.data.datasets.data_bundle import DataBundle +from egon.data.datasets.heat_etrago import HeatEtrago +from egon.data.datasets.heat_supply import HeatSupply from egon.data.datasets.osm import OpenStreetMap from egon.data.datasets.hh_demand_profiles import hh_demand_setup, mv_grid_district_HH_electricity_load, \ houseprofiles_in_census_cells +from egon.data.datasets.mastr import mastr_data_setup +from egon.data.datasets.re_potential_areas import re_potential_area_setup +from egon.data.datasets.mv_grid_districts import mv_grid_districts_setup + from egon.data.datasets.vg250 import Vg250 from egon.data.processing.zensus_vg250 import ( zensus_population_inside_germany as zensus_vg250, @@ -22,22 +28,28 @@ import egon.data.importing.etrago as etrago import egon.data.importing.heat_demand_data as import_hd import egon.data.importing.industrial_sites as industrial_sites -import egon.data.importing.mastr as mastr + import egon.data.importing.nep_input_data as nep_input -import egon.data.importing.re_potential_areas as re_potential_areas import egon.data.importing.scenarios as import_scenarios import egon.data.importing.zensus as import_zs +import egon.data.importing.gas_grid as gas_grid + import egon.data.processing.boundaries_grid_districts as boundaries_grid_districts import egon.data.processing.demandregio as process_dr import egon.data.processing.district_heating_areas as district_heating_areas -import egon.data.processing.loadarea as loadarea import egon.data.processing.osmtgmod as osmtgmod import egon.data.processing.power_plants as power_plants import egon.data.processing.renewable_feedin as import_feedin import egon.data.processing.substation as substation import 
egon.data.processing.zensus_vg250.zensus_population_inside_germany as zensus_vg250 -import egon.data.importing.gas_grid as gas_grid -import egon.data.processing.mv_grid_districts as mvgd +import egon.data.processing.gas_areas as gas_areas +import egon.data.processing.wind_farms as wf +import egon.data.processing.pv_ground_mounted as pv_gm +import egon.data.importing.scenarios as import_scenarios +import egon.data.importing.industrial_sites as industrial_sites +import egon.data.processing.loadarea as loadarea +import egon.data.processing.calculate_dlr as dlr + import egon.data.processing.zensus as process_zs import egon.data.processing.zensus_grid_districts as zensus_grid_districts @@ -268,11 +280,9 @@ setup >> etrago_input_data # Retrieve MaStR data - retrieve_mastr_data = PythonOperator( - task_id="retrieve_mastr_data", - python_callable=mastr.download_mastr_data, - ) - setup >> retrieve_mastr_data + mastr_data = mastr_data_setup(dependencies=[setup]) + mastr_data.insert_into(pipeline) + retrieve_mastr_data = tasks["mastr.download-mastr-data"] # Substation extraction substation_tables = PythonOperator( @@ -299,6 +309,7 @@ autocommit=True, ) + osm_add_metadata >> substation_tables >> substation_functions substation_functions >> hvmv_substation_extraction substation_functions >> ehv_substation_extraction @@ -335,35 +346,22 @@ etrago_input_data >> osmtgmod_pypsa run_osmtgmod >> osmtgmod_substation - # MV grid districts - create_voronoi = PythonOperator( - task_id="create_voronoi", - python_callable=substation.create_voronoi + # create Voronoi for MV grid districts + create_voronoi_substation = PythonOperator( + task_id="create-voronoi-substations", + python_callable=substation.create_voronoi, ) - osmtgmod_substation >> create_voronoi - + osmtgmod_substation >> create_voronoi_substation - define_mv_grid_districts = PythonOperator( - task_id="define_mv_grid_districts", - python_callable=mvgd.define_mv_grid_districts - ) - create_voronoi >> define_mv_grid_districts + # 
MV grid districts + mv_grid_districts = mv_grid_districts_setup(dependencies=[create_voronoi_substation]) + mv_grid_districts.insert_into(pipeline) + define_mv_grid_districts = tasks["mv_grid_districts.define-mv-grid-districts"] # Import potential areas for wind onshore and ground-mounted PV - download_re_potential_areas = PythonOperator( - task_id="download_re_potential_area_data", - python_callable=re_potential_areas.download_datasets, - ) - create_re_potential_areas_tables = PythonOperator( - task_id="create_re_potential_areas_tables", - python_callable=re_potential_areas.create_tables, - ) - insert_re_potential_areas = PythonOperator( - task_id="insert_re_potential_areas", - python_callable=re_potential_areas.insert_data, - ) - setup >> download_re_potential_areas >> create_re_potential_areas_tables - create_re_potential_areas_tables >> insert_re_potential_areas + re_potential_areas = re_potential_area_setup(dependencies=[setup]) + re_potential_areas.insert_into(pipeline) + insert_re_potential_areas = tasks["re_potential_areas.insert-data"] # Future heat demand calculation based on Peta5_0_1 data heat_demand_import = PythonOperator( @@ -431,6 +429,15 @@ etrago_input_data >> gas_grid_insert_data download_data_bundle >> gas_grid_insert_data + # Create gas voronoi + create_gas_polygons = PythonOperator( + task_id="create-gas-voronoi", + python_callable=gas_areas.create_voronoi, + ) + + gas_grid_insert_data >> create_gas_polygons + vg250_clean_and_prepare >> create_gas_polygons + # Extract landuse areas from osm data set create_landuse_table = PythonOperator( task_id="create-landuse-table", @@ -448,7 +455,38 @@ osm_add_metadata >> landuse_extraction vg250_clean_and_prepare >> landuse_extraction - # Import weather data + # Generate wind power farms + generate_wind_farms = PythonOperator( + task_id="generate_wind_farms", + python_callable=wf.wind_power_parks, + ) + retrieve_mastr_data >> generate_wind_farms + insert_re_potential_areas >> generate_wind_farms + 
scenario_input_import >> generate_wind_farms + hvmv_substation_extraction >> generate_wind_farms + define_mv_grid_districts >> generate_wind_farms + + # Regionalization of PV ground mounted + generate_pv_ground_mounted = PythonOperator( + task_id="generate_pv_ground_mounted", + python_callable=pv_gm.regio_of_pv_ground_mounted, + ) + retrieve_mastr_data >> generate_pv_ground_mounted + insert_re_potential_areas >> generate_pv_ground_mounted + scenario_input_import >> generate_pv_ground_mounted + hvmv_substation_extraction >> generate_pv_ground_mounted + define_mv_grid_districts >> generate_pv_ground_mounted + + # Calculate dynamic line rating for HV trans lines + + calculate_dlr = PythonOperator( + task_id="calculate_dlr", + python_callable=dlr.Calculate_DLR, + ) + osmtgmod_pypsa >> calculate_dlr + download_data_bundle >> calculate_dlr + + # Import weather data download_era5 = PythonOperator( task_id="download-weather-data", python_callable=import_era5.download_era5, @@ -572,3 +610,31 @@ householdprofiles_in_cencus_cells = tasks["hh_demand_profiles.houseprofiles-in-census-cells"] mv_hh_electricity_load_2035 = tasks["MV-hh-electricity-load-2035"] mv_hh_electricity_load_2050 = tasks["MV-hh-electricity-load-2050"] + + # Heat supply + heat_supply = HeatSupply( + dependencies=[data_bundle]) + + import_district_heating_supply = tasks["heat_supply.district-heating"] + import_individual_heating_supply = tasks["heat_supply.individual-heating"] + heat_supply_tables = tasks["heat_supply.create-tables"] + geothermal_potential = tasks["heat_supply.geothermal.potential-germany"] + + create_district_heating_areas_table >> heat_supply_tables + import_district_heating_areas >> import_district_heating_supply + map_zensus_grid_districts >> import_district_heating_supply + import_district_heating_areas >> geothermal_potential + import_district_heating_areas >> import_individual_heating_supply + map_zensus_grid_districts >> import_individual_heating_supply + power_plant_import >> 
import_individual_heating_supply + + # Heat to eTraGo + heat_etrago = HeatEtrago( + dependencies=[heat_supply]) + + heat_etrago_buses = tasks["heat_etrago.buses"] + heat_etrago_supply = tasks["heat_etrago.supply"] + + etrago_input_data >> heat_etrago_buses + define_mv_grid_districts >> heat_etrago_buses + import_district_heating_supply >> heat_etrago_supply diff --git a/src/egon/data/cli.py b/src/egon/data/cli.py index 5056e6e8e..c7545f830 100644 --- a/src/egon/data/cli.py +++ b/src/egon/data/cli.py @@ -121,6 +121,26 @@ ), show_default=True, ) + +@click.option( + "--compose-project-name", + default="egon-data", + metavar="PROJECT", + help=( + "The name of the Docker project." + " Different compose_project_names are needed to run multiple instances" + " of egon-data on the same machine." + ), + show_default=True, +) + +@click.option( + "--airflow-port", + default=8080, + metavar="AIRFLOW_PORT", + help=("Specify the port on which airflow runs."), + show_default=True, +) @click.version_option(version=egon.data.__version__) @click.pass_context def egon_data(context, **kwargs): @@ -279,7 +299,8 @@ def render(template, target, update=True, inserts={}, **more_inserts): ) if code != 0: subprocess.run( - ["docker-compose", "up", "-d", "--build"], + ["docker-compose", "-p", options["--compose-project-name"], + "up", "-d", "--build"], cwd=str((Path(".") / "docker").absolute()), ) time.sleep(1.5) # Give the container time to boot. 
diff --git a/src/egon/data/datasets.yml b/src/egon/data/datasets.yml index 08696fc8f..69d908e01 100644 --- a/src/egon/data/datasets.yml +++ b/src/egon/data/datasets.yml @@ -497,3 +497,85 @@ data-bundle: file: 'data_bundle_egon_data.zip' +heat_supply: + sources: + scenario_capacities: + schema: 'supply' + table: 'egon_scenario_capacities' + district_heating_areas: + schema: 'demand' + table: 'district_heating_areas' + power_plants: + schema: 'supply' + table: 'egon_power_plants' + federal_states: + schema: 'boundaries' + table: 'vg250_lan' + heat_demand: + schema: 'demand' + table: 'egon_peta_heat' + map_zensus_grid: + schema: 'boundaries' + table: 'egon_map_zensus_grid_districts' + map_vg250_grid: + schema: 'boundaries' + table: 'egon_map_mvgriddistrict_vg250' + mv_grids: + schema: 'grid' + table: 'mv_grid_districts' + map_dh: + schema: 'demand' + table: 'map_zensus_district_heating_areas' + targets: + district_heating_supply: + schema: 'supply' + table: 'egon_district_heating' + individual_heating_supply: + schema: 'supply' + table: 'egon_individual_heating' + +etrago_heat: + sources: + scenario_capacities: + schema: 'supply' + table: 'egon_scenario_capacities' + district_heating_areas: + schema: 'demand' + table: 'district_heating_areas' + map_district_heating_areas: + schema: 'demand' + table: 'map_zensus_district_heating_areas' + mv_grids: + schema: 'grid' + table: 'mv_grid_districts' + district_heating_supply: + schema: 'supply' + table: 'egon_district_heating' + individual_heating_supply: + schema: 'supply' + table: 'egon_individual_heating' + weather_cells: + schema: 'supply' + table: 'egon_era5_weather_cells' + solar_thermal_feedin: + schema: 'supply' + table: 'egon_era5_renewable_feedin' + mv_grid_districts: + schema: 'grid' + table: 'mv_grid_districts' + heat_demand: + schema: 'demand' + table: 'egon_peta_heat' + targets: + heat_buses: + schema: 'grid' + table: 'egon_pf_hv_bus' + heat_generators: + schema: 'grid' + table: 'egon_pf_hv_generator' + 
heat_generator_timeseries: + schema: 'grid' + table: 'egon_pf_hv_generator_timeseries' + heat_links: + schema: 'grid' + table: 'egon_pf_hv_link' diff --git a/src/egon/data/datasets/__init__.py b/src/egon/data/datasets/__init__.py index b3cc2ba6d..f8502bb57 100644 --- a/src/egon/data/datasets/__init__.py +++ b/src/egon/data/datasets/__init__.py @@ -6,6 +6,7 @@ from dataclasses import dataclass from functools import reduce from typing import Callable, Iterable, Set, Tuple, Union +import re from airflow import DAG from airflow.operators import BaseOperator as Operator @@ -178,7 +179,9 @@ def check_version(self, after_execution=()): def skip_task(task, *xs, **ks): with db.session_scope() as session: datasets = session.query(Model).filter_by(name=self.name).all() - if self.version in [ds.version for ds in datasets]: + if self.version in [ + ds.version for ds in datasets + ] and not re.search(r"\.dev$", self.version): logger.info( f"Dataset '{self.name}' version '{self.version}'" f" already executed. Skipping." diff --git a/src/egon/data/datasets/heat_etrago/__init__.py b/src/egon/data/datasets/heat_etrago/__init__.py new file mode 100644 index 000000000..d6c83a6ea --- /dev/null +++ b/src/egon/data/datasets/heat_etrago/__init__.py @@ -0,0 +1,247 @@ +"""The central module containing all code dealing with heat sector in etrago +""" +import pandas as pd +import geopandas as gpd +from egon.data import db, config +from egon.data.datasets.heat_etrago.power_to_heat import ( + insert_central_power_to_heat,insert_individual_power_to_heat, next_id) +from egon.data.datasets import Dataset + +def insert_buses(carrier, version='0.0.0', scenario='eGon2035'): + """ Insert heat buses to etrago table + + Heat buses are divided into central and individual heating + + Parameters + ---------- + carrier : str + Name of the carrier, either 'central_heat' or 'rural_heat' + version : str, optional + Version number. The default is '0.0.0'. 
+ scenario : str, optional + Name of the scenario The default is 'eGon2035'. + + """ + sources = config.datasets()['etrago_heat']['sources'] + target = config.datasets()['etrago_heat']['targets']['heat_buses'] + # Delete existing heat buses (central or rural) + db.execute_sql( + f""" + DELETE FROM {target['schema']}.{target['table']} + WHERE scn_name = '{scenario}' + AND carrier = '{carrier}' + AND version = '{version}' + """) + + # Select unused index of buses + next_bus_id = next_id('bus') + + # initalize dataframe for heat buses + heat_buses = gpd.GeoDataFrame(columns = [ + 'version', 'scn_name', 'bus_id', 'carrier', + 'x', 'y', 'geom']).set_geometry('geom').set_crs(epsg=4326) + + # If central heat, create one bus per district heating area + if carrier == 'central_heat': + areas = db.select_geodataframe( + f""" + SELECT area_id, geom_polygon as geom + FROM {sources['district_heating_areas']['schema']}. + {sources['district_heating_areas']['table']} + WHERE scenario = '{scenario}' + """, + index_col='area_id' + ) + heat_buses.geom = areas.centroid.to_crs(epsg=4326) + # otherwise create one heat bus per hvmv substation + # which represents aggregated individual heating for etrago + else: + mv_grids = db.select_geodataframe( + f""" + SELECT ST_Centroid(geom) AS geom + FROM {sources['mv_grids']['schema']}. 
+ {sources['mv_grids']['table']} + """) + heat_buses.geom = mv_grids.geom.to_crs(epsg=4326) + + # Insert values into dataframe + heat_buses.version = '0.0.0' + heat_buses.scn_name = scenario + heat_buses.carrier = carrier + heat_buses.x = heat_buses.geom.x + heat_buses.y = heat_buses.geom.y + heat_buses.bus_id = range(next_bus_id, next_bus_id+len(heat_buses)) + + # Insert data into database + heat_buses.to_postgis(target['table'], + schema=target['schema'], + if_exists='append', + con=db.engine()) + +def insert_central_direct_heat(version = '0.0.0', scenario='eGon2035'): + """ Insert renewable heating technologies (solar and geo thermal) + + Parameters + ---------- + version : str, optional + Version number. The default is '0.0.0'. + scenario : str, optional + Name of the scenario The default is 'eGon2035'. + + Returns + ------- + None. + + """ + sources = config.datasets()['etrago_heat']['sources'] + targets = config.datasets()['etrago_heat']['targets'] + + db.execute_sql( + f""" + DELETE FROM {targets['heat_generators']['schema']}. + {targets['heat_generators']['table']} + WHERE carrier IN ('solar_thermal_collector', 'geo_thermal') + AND scn_name = '{scenario}' + AND version = '{version}' + """) + + db.execute_sql( + f""" + DELETE FROM {targets['heat_generator_timeseries']['schema']}. + {targets['heat_generator_timeseries']['table']} + WHERE scn_name = '{scenario}' + AND generator_id NOT IN ( + SELECT generator_id FROM + grid.egon_pf_hv_generator + WHERE version = '{version}' + AND scn_name = '{scenario}') + """) + + central_thermal = db.select_geodataframe( + f""" + SELECT district_heating_id, capacity, geometry, carrier + FROM {sources['district_heating_supply']['schema']}. 
+ {sources['district_heating_supply']['table']} + WHERE scenario = '{scenario}' + AND carrier IN ( + 'solar_thermal_collector', 'geo_thermal') + """, + geom_col='geometry', + index_col='district_heating_id') + + map_dh_id_bus_id = db.select_dataframe( + f""" + SELECT bus_id, area_id, id FROM + {targets['heat_buses']['schema']}. + {targets['heat_buses']['table']} + JOIN {sources['district_heating_areas']['schema']}. + {sources['district_heating_areas']['table']} + ON ST_Transform(ST_Centroid(geom_polygon), 4326) = geom + WHERE carrier = 'central_heat' + AND scenario = '{scenario}' + """, + index_col='id') + + new_id = next_id('generator') + + generator = pd.DataFrame( + data = {'version': version, + 'scn_name': scenario, + 'carrier': central_thermal.carrier, + 'bus': map_dh_id_bus_id.bus_id[central_thermal.index], + 'p_nom': central_thermal.capacity, + 'generator_id': range( + new_id, new_id+len(central_thermal))}) + + solar_thermal = central_thermal[ + central_thermal.carrier=='solar_thermal_collector'] + + weather_cells = db.select_geodataframe( + f""" + SELECT w_id, geom + FROM {sources['weather_cells']['schema']}. + {sources['weather_cells']['table']} + """, + index_col='w_id' + ) + + # Map solar thermal collectors to weather cells + join = gpd.sjoin(weather_cells, solar_thermal)[['index_right']] + + feedin = db.select_dataframe( + f""" + SELECT w_id, feedin + FROM {sources['solar_thermal_feedin']['schema']}. 
+ {sources['solar_thermal_feedin']['table']} + WHERE carrier = 'solar_thermal' + AND weather_year = 2011 + """, + index_col='w_id') + + timeseries = pd.DataFrame( + data = {'version': version, + 'scn_name': scenario, + 'temp_id': 1, + 'p_max_pu': feedin.feedin[join.index].values, + 'generator_id': generator.generator_id[ + generator.carrier=='solar_thermal_collector'].values + } + ).set_index('generator_id') + + generator = generator.set_index('generator_id') + + generator.to_sql( + targets['heat_generators']['table'], + schema=targets['heat_generators']['schema'], + if_exists='append', + con=db.engine()) + + timeseries.to_sql( + targets['heat_generator_timeseries']['table'], + schema=targets['heat_generator_timeseries']['schema'], + if_exists='append', + con=db.engine()) + +def buses(version='0.0.0'): + """ Insert individual and district heat buses into eTraGo-tables + + Parameters + ---------- + version : str, optional + Version number. The default is '0.0.0'. + + Returns + ------- + None. + + """ + + insert_buses('central_heat', version=version, scenario='eGon2035') + insert_buses('rural_heat', version=version, scenario='eGon2035') + +def supply(version='0.0.0'): + """ Insert individual and district heat supply into eTraGo-tables + + Parameters + ---------- + version : str, optional + Version number. The default is '0.0.0'. + + Returns + ------- + None. 
+ + """ + + insert_central_direct_heat(version = '0.0.0', scenario='eGon2035') + insert_central_power_to_heat(version, scenario='eGon2035') + insert_individual_power_to_heat(version, scenario='eGon2035') + +class HeatEtrago(Dataset): + def __init__(self, dependencies): + super().__init__( + name="HeatEtrago", + version="0.0.0", + dependencies=dependencies, + tasks=(buses, supply), + ) diff --git a/src/egon/data/datasets/heat_etrago/power_to_heat.py b/src/egon/data/datasets/heat_etrago/power_to_heat.py new file mode 100644 index 000000000..6db96973c --- /dev/null +++ b/src/egon/data/datasets/heat_etrago/power_to_heat.py @@ -0,0 +1,404 @@ +"""The central module containing all code dealing with power to heat +""" +import pandas as pd +import geopandas as gpd +from egon.data import db, config +from shapely.geometry import LineString + +def next_id(component): + """ Select next id value for components in pf-tables + + Parameters + ---------- + component : str + Name of componenet + + Returns + ------- + next_id : int + Next index value + + """ + max_id = db.select_dataframe( + f""" + SELECT MAX({component}_id) FROM grid.egon_pf_hv_{component} + """)['max'][0] + + if max_id: + next_id = max_id + 1 + else: + next_id = 1 + + return next_id + +def insert_individual_power_to_heat(version = '0.0.0', scenario='eGon2035'): + """ Insert power to heat into database + + Parameters + ---------- + version : str, optional + Version number. The default is '0.0.0'. + scenario : str, optional + Name of the scenario The default is 'eGon2035'. + + Returns + ------- + None. + + """ + + sources = config.datasets()['etrago_heat']['sources'] + targets = config.datasets()['etrago_heat']['targets'] + + # Delete existing entries + db.execute_sql( + f""" + DELETE FROM {targets['heat_links']['schema']}. 
+ {targets['heat_links']['table']} + WHERE carrier = 'individual_heat_pump' + """) + + # Select heat pumps for individual heating + heat_pumps = db.select_dataframe( + f""" + SELECT mv_grid_id as power_bus, + a.carrier, capacity, b.bus_id as heat_bus + FROM {sources['individual_heating_supply']['schema']}. + {sources['individual_heating_supply']['table']} a + JOIN {targets['heat_buses']['schema']}. + {targets['heat_buses']['table']} b + ON ST_Intersects( + ST_Buffer(ST_Transform(ST_Centroid(a.geometry), 4326), 0.00000001), + geom) + WHERE scenario = '{scenario}' + AND scn_name = '{scenario}' + AND a.carrier = 'heat_pump' + AND b.carrier = 'rural_heat' + """) + + # Assign voltage level + heat_pumps['voltage_level'] = 7 + + # Insert heatpumps + insert_power_to_heat_per_level( + heat_pumps, + carrier = 'individual_heat_pump', + multiple_per_mv_grid=False, + version = '0.0.0', scenario='eGon2035') + + +def insert_central_power_to_heat(version = '0.0.0', scenario='eGon2035'): + """ Insert power to heat in district heating areas into database + + Parameters + ---------- + version : str, optional + Version number. The default is '0.0.0'. + scenario : str, optional + Name of the scenario The default is 'eGon2035'. + + Returns + ------- + None. + + """ + + sources = config.datasets()['etrago_heat']['sources'] + targets = config.datasets()['etrago_heat']['targets'] + + # Delete existing entries + db.execute_sql( + f""" + DELETE FROM {targets['heat_links']['schema']}. + {targets['heat_links']['table']} + WHERE carrier = 'central_heat_pump' + """) + # Select heat pumps in district heating + central_heat_pumps = db.select_geodataframe( + f""" + SELECT * FROM {sources['district_heating_supply']['schema']}. 
+ {sources['district_heating_supply']['table']} + WHERE scenario = '{scenario}' + AND carrier = 'heat_pump' + """, + geom_col='geometry') + + # Assign voltage level + central_heat_pumps = assign_voltage_level(central_heat_pumps) + + # Insert heatpumps in mv and below + # (one hvmv substation per district heating grid) + insert_power_to_heat_per_level( + central_heat_pumps[central_heat_pumps.voltage_level>3], + multiple_per_mv_grid=False, + version = '0.0.0', scenario='eGon2035') + # Insert heat pumps in hv grid + # (as many hvmv substations as intersect with district heating grid) + insert_power_to_heat_per_level( + central_heat_pumps[central_heat_pumps.voltage_level<3], + multiple_per_mv_grid=True, + version = '0.0.0', scenario='eGon2035') + + +def insert_power_to_heat_per_level(heat_pumps, multiple_per_mv_grid, + carrier = 'central_heat_pump', + version = '0.0.0', scenario='eGon2035'): + """ Insert power to heat plants per grid level + + Parameters + ---------- + heat_pumps : pandas.DataFrame + Heat pumps in selected grid level + multiple_per_mv_grid : boolean + Choose if one district heating areas is supplied by one hvmv substation + version : str, optional + Version number. The default is '0.0.0'. + scenario : str, optional + Name of the scenario The default is 'eGon2035'. + + Returns + ------- + None. + + """ + sources = config.datasets()['etrago_heat']['sources'] + targets = config.datasets()['etrago_heat']['targets'] + + if 'central' in carrier: + # Calculate heat pumps per electrical bus + gdf = assign_electrical_bus(heat_pumps, multiple_per_mv_grid) + + else: + gdf = heat_pumps.copy() + + # Select geometry of buses + geom_buses = db.select_geodataframe( + f""" + SELECT bus_id, geom FROM {targets['heat_buses']['schema']}. 
+ {targets['heat_buses']['table']} + WHERE scn_name = '{scenario}' + """, + index_col='bus_id', + epsg=4326) + + # Create topology of heat pumps + gdf['geom_power'] = geom_buses.geom[gdf.power_bus].values + gdf['geom_heat'] = geom_buses.loc[gdf.heat_bus, 'geom'].reset_index().geom + gdf['geometry']=gdf.apply( + lambda x: LineString([x['geom_power'], x['geom_heat']]),axis=1) + + # Choose next unused link id + next_link_id = next_id('link') + + # Initilize dataframe of links + links = gpd.GeoDataFrame( + index = range(len(gdf)), + columns = [ + 'version', 'scn_name', 'bus0', 'bus1', + 'carrier', 'link_id', 'p_nom', 'topo'], + data = {'version': version, 'scn_name': scenario, + 'carrier': carrier} + ).set_geometry('topo').set_crs(epsg=4326) + + # Insert values into dataframe + links.bus0 = gdf.power_bus.values + links.bus1 = gdf.heat_bus.values + links.p_nom = gdf.capacity.values + links.topo = gdf.geometry.values + links.link_id = range(next_link_id, next_link_id+len(links)) + + # Insert data into database + links.to_postgis(targets['heat_links']['table'], + schema=targets['heat_links']['schema'], + if_exists = 'append', + con=db.engine()) + +def assign_voltage_level(heat_pumps): + """ Assign voltage level to heat pumps + + Parameters + ---------- + heat_pumps : pandas.DataFrame + Heat pumps without voltage level + + Returns + ------- + heat_pumps : pandas.DataFrame + Heat pumps including voltage level + + """ + + # set voltage level for heat pumps according to category + heat_pumps['voltage_level'] = 0 + + heat_pumps.loc[ + heat_pumps[(heat_pumps.carrier=='heat_pump') + & (heat_pumps.category=='small')].index + , 'voltage_level'] = 7 + + heat_pumps.loc[ + heat_pumps[(heat_pumps.carrier=='heat_pump') + & (heat_pumps.category=='medium')].index + , 'voltage_level'] = 5 + + heat_pumps.loc[ + heat_pumps[(heat_pumps.carrier=='heat_pump') + & (heat_pumps.category=='large')].index + , 'voltage_level'] = 1 + + # if capacity > 5.5 MW, heatpump is installed in HV + 
heat_pumps.loc[ + heat_pumps[(heat_pumps.carrier=='heat_pump') + & (heat_pumps.capacity>5.5)].index + , 'voltage_level'] = 1 + + return heat_pumps + +def assign_electrical_bus(heat_pumps, multiple_per_mv_grid=False): + """ Calculates heat pumps per electrical bus + + Parameters + ---------- + heat_pumps : pandas.DataFrame + Heat pumps including voltage level + multiple_per_mv_grid : boolean, optional + Choose if a district heating area can by supplied by multiple + hvmv substaions/mv grids. The default is False. + + Returns + ------- + gdf : pandas.DataFrame + Heat pumps per electrical bus + + """ + + sources = config.datasets()['etrago_heat']['sources'] + targets = config.datasets()['etrago_heat']['targets'] + + # Map heat buses to district heating id and area_id + heat_buses = db.select_dataframe( + f""" + SELECT bus_id, area_id, id FROM + {targets['heat_buses']['schema']}. + {targets['heat_buses']['table']} + JOIN {sources['district_heating_areas']['schema']}. + {sources['district_heating_areas']['table']} + ON ST_Transform(ST_Centroid(geom_polygon), 4326) = geom + WHERE carrier = 'central_heat' + AND scenario='eGon2035' + """, + index_col='id') + + heat_pumps['power_bus'] = '' + + # Select mv grid distrcits + mv_grid_district = db.select_geodataframe( + f""" + SELECT subst_id, geom FROM + {sources['mv_grid_districts']['schema']}. + {sources['mv_grid_districts']['table']} + """) + + # Map zensus cells to district heating areas + map_zensus_dh = db.select_geodataframe( + f""" + SELECT area_id, a.zensus_population_id, + geom_point as geom, sum(a.demand) as demand + FROM {sources['map_district_heating_areas']['schema']}. + {sources['map_district_heating_areas']['table']} b + JOIN {sources['heat_demand']['schema']}. 
+ {sources['heat_demand']['table']} a + ON b.zensus_population_id = a.zensus_population_id + JOIN society.destatis_zensus_population_per_ha + ON society.destatis_zensus_population_per_ha.id = + a.zensus_population_id + WHERE a.scenario = 'eGon2035' + AND b.scenario = 'eGon2035' + GROUP BY (area_id, a.zensus_population_id, geom_point) + """) + + # Select area_id per heat pump + heat_pumps['area_id'] = heat_buses.area_id[ + heat_pumps.district_heating_id.values].values + + heat_buses.set_index('area_id', inplace=True) + + # Select only cells in choosen district heating areas + cells = map_zensus_dh[map_zensus_dh.area_id.isin(heat_pumps.area_id)] + + # Assign power bus per zensus cell + cells['power_bus'] = gpd.sjoin(cells, mv_grid_district, + how='inner', op='intersects').subst_id + + # Calclate district heating demand per substaion + demand_per_substation = pd.DataFrame( + cells.groupby(['area_id', 'power_bus']).demand.sum()) + + heat_pumps.set_index('area_id', inplace=True) + + # If district heating areas are supplied by multiple hvmv-substations, + # create one heatpup per electrical bus. + # The installed capacity is assigned regarding the share of heat demand. 
+ if multiple_per_mv_grid: + + power_to_heat = demand_per_substation.reset_index() + + power_to_heat.loc[:, 'carrier'] = 'urban_central_heat_pump' + + power_to_heat.loc[:, 'voltage_level'] = heat_pumps.voltage_level[ + power_to_heat.area_id].values + + power_to_heat['share_demand'] = power_to_heat.groupby( + 'area_id').demand.apply(lambda grp: grp/grp.sum()) + + power_to_heat['capacity'] = power_to_heat['share_demand'].mul( + heat_pumps.capacity[power_to_heat.area_id].values) + + power_to_heat = power_to_heat[power_to_heat.voltage_level.notnull()] + + + gdf = gpd.GeoDataFrame(power_to_heat, index = power_to_heat.index, + geometry = heat_pumps.geometry[ + power_to_heat.area_id].values) + + # If district heating areas are supplied by one hvmv-substations, + # the hvmv substation which has the most heat demand is choosen. + else: + + substation_max_demand = demand_per_substation.reset_index( + ).set_index('power_bus').groupby('area_id').demand.max() + + selected_substations = demand_per_substation[ + demand_per_substation.demand.isin( + substation_max_demand)].reset_index().set_index('area_id') + + selected_substations.rename({'demand': 'demand_selected_substation'}, + axis=1, inplace=True) + + selected_substations['share_demand'] = cells.groupby( + ['area_id', 'power_bus']).demand.sum().reset_index().groupby( + 'area_id').demand.max()/cells.groupby( + ['area_id', 'power_bus']).demand.sum( + ).reset_index().groupby('area_id').demand.sum() + + power_to_heat = selected_substations + + power_to_heat.loc[:, 'carrier'] = 'urban_central_heat_pump' + + power_to_heat.loc[:, 'voltage_level'] = heat_pumps.voltage_level + + power_to_heat['capacity'] = heat_pumps.capacity[ + power_to_heat.index].values + + power_to_heat = power_to_heat[power_to_heat.voltage_level.notnull()] + + gdf = gpd.GeoDataFrame(power_to_heat, index = power_to_heat.index, + geometry = heat_pumps.geometry) + + gdf.reset_index(inplace=True) + + gdf['heat_bus'] = heat_buses.loc[ + gdf.area_id, 
'bus_id'].reset_index().bus_id + + return gdf + diff --git a/src/egon/data/datasets/heat_supply/__init__.py b/src/egon/data/datasets/heat_supply/__init__.py new file mode 100644 index 000000000..bddfc904d --- /dev/null +++ b/src/egon/data/datasets/heat_supply/__init__.py @@ -0,0 +1,118 @@ +"""The central module containing all code dealing with heat supply data + +""" + +from egon.data import db, config + +from egon.data.datasets.heat_supply.district_heating import ( + cascade_heat_supply) +from egon.data.datasets.heat_supply.individual_heating import ( + cascade_heat_supply_indiv) +from egon.data.datasets.heat_supply.geothermal import ( + potential_germany) +from egon.data.processing.district_heating_areas import DistrictHeatingAreas +from sqlalchemy import Column, String, Float, Integer, ForeignKey +from sqlalchemy.ext.declarative import declarative_base +from geoalchemy2.types import Geometry +from egon.data.datasets import Dataset +### will be later imported from another file ### +Base = declarative_base() + +# TODO: set district_heating_id as ForeignKey +class EgonDistrictHeatingSupply(Base): + __tablename__ = 'egon_district_heating' + __table_args__ = {'schema': 'supply'} + index = Column(Integer, primary_key=True) + district_heating_id = Column(Integer) + carrier = Column(String(25)) + category = Column(String(25)) + capacity = Column(Float) + geometry = Column(Geometry('POINT', 3035)) + scenario = Column(String(50)) + +class EgonIndividualHeatingSupply(Base): + __tablename__ = 'egon_individual_heating' + __table_args__ = {'schema': 'supply'} + index = Column(Integer, primary_key=True) + mv_grid_id = Column(Integer) + carrier = Column(String(25)) + category = Column(String(25)) + capacity = Column(Float) + geometry = Column(Geometry('POINT', 3035)) + scenario = Column(String(50)) + +def create_tables(): + """Create tables for district heating areas + + Returns + ------- + None + """ + + engine = db.engine() + 
EgonDistrictHeatingSupply.__table__.drop(bind=engine, checkfirst=True) + EgonDistrictHeatingSupply.__table__.create(bind=engine, checkfirst=True) + EgonIndividualHeatingSupply.__table__.drop(bind=engine, checkfirst=True) + EgonIndividualHeatingSupply.__table__.create(bind=engine, checkfirst=True) + + +def district_heating(): + """ Insert supply for district heating areas + + Returns + ------- + None. + + """ + targets = config.datasets()['heat_supply']['targets'] + + db.execute_sql( + f""" + DELETE FROM {targets['district_heating_supply']['schema']}. + {targets['district_heating_supply']['table']} + """) + + supply_2035 = cascade_heat_supply('eGon2035', plotting=False) + + supply_2035['scenario'] = 'eGon2035' + + supply_2035.to_postgis( + targets['district_heating_supply']['table'], + schema=targets['district_heating_supply']['schema'], + con=db.engine(), if_exists='append') + +def individual_heating(): + """ Insert supply for individual heating + + Returns + ------- + None. + + """ + targets = config.datasets()['heat_supply']['targets'] + + db.execute_sql( + f""" + DELETE FROM {targets['individual_heating_supply']['schema']}. 
+ {targets['individual_heating_supply']['table']} + """) + + supply_2035 = cascade_heat_supply_indiv( + 'eGon2035', distribution_level='federal_states', plotting=False) + + supply_2035['scenario'] = 'eGon2035' + + supply_2035.to_postgis( + targets['individual_heating_supply']['table'], + schema=targets['individual_heating_supply']['schema'], + con=db.engine(), if_exists='append') + +class HeatSupply(Dataset): + def __init__(self, dependencies): + super().__init__( + name="HeatSupply", + version="0.0.0", + dependencies=dependencies, + tasks=(create_tables, { + district_heating, individual_heating, potential_germany}), + ) diff --git a/src/egon/data/datasets/heat_supply/district_heating.py b/src/egon/data/datasets/heat_supply/district_heating.py new file mode 100644 index 000000000..a2bf3cd14 --- /dev/null +++ b/src/egon/data/datasets/heat_supply/district_heating.py @@ -0,0 +1,326 @@ +"""The central module containing all code dealing with heat supply +for district heating areas. + +""" +import pandas as pd +import geopandas as gpd +from egon.data import db, config + +from egon.data.datasets.heat_supply.geothermal import calc_geothermal_costs + +def capacity_per_district_heating_category(district_heating_areas, scenario): + """ Calculates target values per district heating category and technology + + Parameters + ---------- + district_heating_areas : geopandas.geodataframe.GeoDataFrame + District heating areas per scenario + scenario : str + Name of the scenario + + Returns + ------- + capacity_per_category : pandas.DataFrame + Installed capacities per technology and size category + + """ + sources = config.datasets()['heat_supply']['sources'] + + target_values = db.select_dataframe( + f""" + SELECT capacity, split_part(carrier, 'urban_central_', 2) as technology + FROM {sources['scenario_capacities']['schema']}. 
+ {sources['scenario_capacities']['table']} + WHERE carrier IN ( + 'urban_central_heat_pump', + 'urban_central_resistive_heater', + 'urban_central_geo_thermal', + 'urban_central_solar_thermal_collector') + """, + index_col='technology') + + + capacity_per_category = pd.DataFrame( + index=['small', 'medium', 'large'], + columns=['solar_thermal_collector', + 'heat_pump', 'geo_thermal', 'demand']) + + capacity_per_category.demand = district_heating_areas.groupby( + district_heating_areas.category).demand.sum() + + capacity_per_category.loc[ + ['small', 'medium'],'solar_thermal_collector'] = ( + target_values.capacity['solar_thermal_collector'] + *capacity_per_category.demand + /capacity_per_category.demand[['small', 'medium']].sum()) + + capacity_per_category.loc[:, 'heat_pump'] = ( + target_values.capacity['heat_pump'] + *capacity_per_category.demand + /capacity_per_category.demand.sum()) + + capacity_per_category.loc['large', 'geo_thermal'] = ( + target_values.capacity['geo_thermal']) + + return capacity_per_category + + +def set_technology_data(): + """Set data per technology according to Kurzstudie KWK + + Returns + ------- + pandas.DataFrame + List of parameters per technology + + """ + return pd.DataFrame( + index = ['CHP', 'solar_thermal_collector', + 'heat_pump', 'geo_thermal'], + columns = ['estimated_flh', 'priority'], + data = { + 'estimated_flh': [8760, 1330, 7000, 3000], + 'priority': [4, 2 ,1 ,3]}) + + +def select_district_heating_areas(scenario): + """ Selects district heating areas per scenario and assigns size-category + + Parameters + ---------- + scenario : str + Name of the scenario + + Returns + ------- + district_heating_areas : geopandas.geodataframe.GeoDataFrame + District heating areas per scenario + + """ + + sources = config.datasets()['heat_supply']['sources'] + + max_demand_medium_district_heating = 96000 + + max_demand_small_district_heating = 2400 + + district_heating_areas = db.select_geodataframe( + f""" + SELECT id as 
district_heating_id, + residential_and_service_demand as demand, + geom_polygon as geom + FROM {sources['district_heating_areas']['schema']}. + {sources['district_heating_areas']['table']} + WHERE scenario = '{scenario}' + """, + index_col='district_heating_id') + + district_heating_areas['category'] = 'large' + + district_heating_areas.loc[ + district_heating_areas[ + district_heating_areas.demand + < max_demand_medium_district_heating].index, + 'category'] = 'medium' + + district_heating_areas.loc[ + district_heating_areas[ + district_heating_areas.demand + < max_demand_small_district_heating].index, + 'category'] = 'small' + + return district_heating_areas + + +def cascade_per_technology( + areas, technologies, capacity_per_category, size_dh, + max_geothermal_costs = 2): + + """ Add plants of one technology suppliing district heating + + Parameters + ---------- + areas : geopandas.geodataframe.GeoDataFrame + District heating areas which need to be supplied + technologies : pandas.DataFrame + List of supply technologies and their parameters + capacity_per_category : pandas.DataFrame + Target installed capacities per size-category + size_dh : str + Category of the district heating areas + max_geothermal_costs : float, optional + Maxiumal costs of MW geothermal in EUR/MW. The default is 2. + + Returns + ------- + areas : geopandas.geodataframe.GeoDataFrame + District heating areas which need additional supply technologies + technologies : pandas.DataFrame + List of supply technologies and their parameters + append_df : pandas.DataFrame + List of plants per district heating grid for the selected technology + + """ + sources = config.datasets()['heat_supply']['sources'] + + tech = technologies[technologies.priority==technologies.priority.max()] + + # Assign CHP plants inside district heating area + # TODO: This has to be updaten when all chp plants are available! 
+ if tech.index == 'CHP': + + # Select chp plants from database + gdf_chp = db.select_geodataframe( + f"""SELECT id, geom, th_capacity as capacity + FROM {sources['power_plants']['schema']}. + {sources['power_plants']['table']} + WHERE chp = True""") + + # Choose chp plants that intersect with district heating areas + join = gpd.sjoin(gdf_chp.to_crs(4326), areas, rsuffix='area') + + append_df = pd.DataFrame( + join.groupby('index_area').capacity.sum()).reset_index().rename( + {'index_area': 'district_heating_id'}, axis=1) + + # Distribute solar thermal and heatpumps linear to remaining demand. + # Geothermal plants are distributed to areas with geothermal potential. + if tech.index in ['solar_thermal_collector', 'heat_pump', 'geo_thermal']: + + if tech.index == 'geo_thermal': + # Select areas with geothermal potential considering costs + gdf_geothermal = calc_geothermal_costs(max_geothermal_costs) + # Select areas which intersect with district heating areas + join = gpd.sjoin( + gdf_geothermal.to_crs(4326), areas, rsuffix='area') + # Calculate share of installed capacity + share_per_area = ( + join.groupby('index_area')['remaining_demand'].sum()/ + join['remaining_demand'].sum().sum()) + + else: + share_per_area = ( + areas['remaining_demand']/areas['remaining_demand'].sum()) + # Prepare list of heat supply technologies + append_df = pd.DataFrame( + (share_per_area).mul( + capacity_per_category.loc[size_dh, tech.index].values[0] + )).reset_index() + # Rename columns + append_df.rename({ + 'index_area':'district_heating_id', + 'remaining_demand':'capacity'}, axis = 1, inplace=True) + # Add heat supply to overall list + if append_df.size > 0: + append_df['carrier'] = tech.index[0] + append_df['category'] = size_dh + areas.loc[append_df.district_heating_id, + 'remaining_demand'] -= append_df.set_index( + 'district_heating_id').capacity.mul( + tech.estimated_flh.values[0]) + # Select district heating areas which need an additional supply technology + areas = 
areas[areas.remaining_demand>=0] + + # Delete inserted technology from list + technologies = technologies.drop(tech.index) + + return areas, technologies, append_df + + +def cascade_heat_supply(scenario, plotting=True): + """Assigns supply strategy for ditsrict heating areas. + + Different technologies are selected for three categories of district + heating areas (small, medium and large annual demand). + The technologies are priorized according to + Flexibilisierung der Kraft-Wärme-Kopplung; 2017; + Forschungsstelle für Energiewirtschaft e.V. (FfE) + + Parameters + ---------- + scenario : str + Name of scenario + plotting : bool, optional + Choose if district heating supply is plotted. The default is True. + + Returns + ------- + resulting_capacities : pandas.DataFrame + List of plants per district heating grid + + """ + + # Select district heating areas from database + district_heating_areas = select_district_heating_areas(scenario) + + # Select technolgies per district heating size + map_dh_technologies = { + 'small': ['CHP', 'solar_thermal_collector', 'heat_pump'], + 'medium': ['CHP', 'solar_thermal_collector', 'heat_pump'], + 'large': ['CHP', 'geo_thermal', 'heat_pump'], + } + + # Assign capacities per district heating category + capacity_per_category = capacity_per_district_heating_category( + district_heating_areas, scenario) + + # Initalize Dataframe for results + resulting_capacities = pd.DataFrame( + columns=['district_heating_id', 'carrier', 'capacity', 'category']) + + # Set technology data according to Kurzstudie KWK, NEP 2021 + technology_data = set_technology_data() + + for size_dh in ['small', 'medium', 'large']: + + # Select areas in size-category + areas = district_heating_areas[ + district_heating_areas.category==size_dh].to_crs(4326) + + # Set remaining_demand to demand for first iteration + areas['remaining_demand'] = areas['demand'] + + # Select technologies which can be use in this size-category + technologies = 
technology_data.loc[map_dh_technologies[size_dh], :] + + # Assign new supply technologies to district heating areas + # as long as the demand is not covered and there are technologies left + while (len(technologies) > 0) and (len(areas) > 0): + + areas, technologies, append_df = cascade_per_technology( + areas, technologies, capacity_per_category, size_dh) + + resulting_capacities = resulting_capacities.append( + append_df, ignore_index=True) + + # Plot results per district heating area + if plotting: + plot_heat_supply(resulting_capacities) + + return gpd.GeoDataFrame( + resulting_capacities, + geometry = district_heating_areas.geom[ + resulting_capacities.district_heating_id].centroid.values) + + +def plot_heat_supply(resulting_capacities): + + from matplotlib import pyplot as plt + + district_heating_areas = select_district_heating_areas('eGon2035') + + for c in ['CHP', 'solar_thermal_collector', 'geo_thermal', 'heat_pump']: + district_heating_areas[c] = resulting_capacities[ + resulting_capacities.carrier==c].set_index( + 'district_heating_id').capacity + + fig, ax = plt.subplots(1, 1) + district_heating_areas.boundary.plot(linewidth=0.2,ax=ax, color='black') + district_heating_areas.plot( + ax=ax, + column=c, + cmap='magma_r', + legend=True, + legend_kwds={'label': f"Installed {c} in MW", + 'orientation': "vertical"}) + plt.savefig(f'plots/heat_supply_{c}.png', dpi=300) diff --git a/src/egon/data/datasets/heat_supply/geothermal.py b/src/egon/data/datasets/heat_supply/geothermal.py new file mode 100644 index 000000000..6266e6527 --- /dev/null +++ b/src/egon/data/datasets/heat_supply/geothermal.py @@ -0,0 +1,228 @@ +"""The module containing all code dealing with geothermal potentials and costs + +Main source: Ableitung eines Korridors für den Ausbau +der erneuerbaren Wärme im Gebäudebereich, Beuth Hochschule für Technik +Berlin ifeu – Institut für Energie- und Umweltforschung Heidelberg GmbH +Februar 2017 + +""" +import geopandas as gpd +import pandas as pd 
+import numpy as np + +from egon.data import db + + +def calc_geothermal_potentials(): + # Set parameters + ## specific thermal capacity of water in kJ/kg*K (p. 95) + c_p = 4 + ## full load hours per year in h (p. 95) + flh = 3000 + ## mass flow per reservoir in kg/s (p. 95) + m_flow = pd.Series( + data={'NDB': 35, 'ORG': 90, 'SMB': 125}, name = 'm_flow') + + ## geothermal potentials per temperature (p. 94) + potentials = gpd.read_file( + 'data_bundle_egon_data/geothermal_potential/geothermal_potential_germany.shp') + ## temperature heating system in °C (p. 95) + sys_temp = 60 + ## temeprature losses heat recuperator in °C (p. 95) + loss_temp = 5 + + # calc mean temperatures per region (p. 93/94): + potentials['mean_temperature'] = potentials['min_temper'] + 15 + + # exclude regions with mean_temp < 60°C (p. 93): + potentials = potentials[potentials.mean_temperature>=60] + + # exclude regions outside of NDB, ORG or SMB because of missing mass flow + potentials = potentials[~potentials.reservoir.isnull()] + + ## set mass flows per region + potentials['m_flow'] = potentials.join(m_flow, on = 'reservoir').m_flow + + # calculate flow in kW + potentials['Q_flow'] = potentials.m_flow * c_p * ( + potentials.mean_temperature - loss_temp - sys_temp) + + potentials['Q'] = potentials.Q_flow * flh + + return potentials + +def calc_geothermal_costs(max_costs=np.inf, min_costs=0): + # Set parameters + ## drilling depth per reservoir in m (p. 99) + depth = pd.Series( + data={'NDB': 2500, 'ORG': 3400, 'SMB': 2800}, name = 'depth') + ## drillings costs in EUR/m (p. 99) + depth_costs = 1500 + ## ratio of investment costs to drilling costs (p. 
99) + ratio = 1.4 + ## annulazaion factors + p = 0.045 + T = 30 + PVA = 1/p - 1/(p*(1+p)**T) + + # calculate overnight investment costs per drilling and region + overnight_investment = depth*depth_costs*ratio + investment_per_year = overnight_investment/PVA + + # investment costs per well according to p.99 + costs = pd.Series( + data={'NDB': 12.5e6, 'ORG': 17e6, 'SMB': 14e6}, name = 'costs') + + potentials = calc_geothermal_potentials() + + + potentials['cost_per_well'] = potentials.join( + costs, on = 'reservoir').costs + + potentials['cost_per_well_mw'] = potentials.cost_per_well/1000/potentials.Q_flow + + potentials = potentials.to_crs(3035) + + # area weighted mean costs per well and mw + np.average(potentials.cost_per_well_mw, weights=potentials.area) + + return potentials[(potentials['cost_per_well_mw']<=max_costs) + &(potentials['cost_per_well_mw']>min_costs)] + +def calc_usable_geothermal_potential(max_costs=2, min_costs=0): + """ Calculate geothermal potentials close to district heating demands + + Parameters + ---------- + max_costs : float, optional + Maximum accepted costs for geo thermal in EUR/MW_th. The default is 2. + min_costs : float, optional + Minimum accepted costs for geo thermal in EUR/MW_th. The default is 0. 
+ + Returns + ------- + float + Geothermal potential close to district heating areas in MW + + """ + + # Select 1km buffer arround large district heating areas as possible areas + district_heating = db.select_geodataframe( + """ + SELECT area_id, + residential_and_service_demand as demand, + ST_Difference( + ST_Buffer(geom_polygon, 1000), geom_polygon) as geom + FROM demand.district_heating_areas + WHERE scenario = 'eGon100RE' + AND residential_and_service_demand > 96000 + """, + index_col="area_id", + ) + + # Select geothermal potential areas where investments costs per MW + # are in given range + geothermal_potential = calc_geothermal_costs( + max_costs=max_costs, min_costs=min_costs) + + # Intersect geothermal potential areas with district heating areas: + # geothermal will be build only if demand of a large district heating + # grid is close + overlay = gpd.overlay(district_heating.reset_index(), geothermal_potential) + + if len(overlay) > 0: + + # Calculate available area for geothermal power plants + overlay["area_sqkm"] = overlay.area * 1e-6 + + # Assmue needed area per well + pw_km = 0.25 + + # Calculate number of possible wells per intersecting area + overlay["number_wells"] = overlay["area_sqkm"].mul(pw_km) + + # Calculate share of overlaying areas per district heating grid + overlay["area_share"] = ( + overlay.groupby("area_id") + .apply(lambda grp: grp.area_sqkm / grp.area_sqkm.sum()) + .values + ) + + # Possible installable capacity per intersecting area + overlay["Q_per_area"] = overlay.Q_flow.mul(overlay.number_wells) + + # Prepare geothermal potenital per district heating area + gt_potential_dh = pd.DataFrame(index=district_heating.index) + gt_potential_dh["demand"] = district_heating.demand + + # Group intersecting areas by district heating area + grouped = overlay[ + overlay.area_id.isin( + gt_potential_dh[gt_potential_dh.index.isin( + overlay.area_id)].index) + ].groupby(overlay.area_id) + + # Calculate geo thermal capacity per district heating 
area + gt_potential_dh["Q_flow"] = grouped.Q_per_area.sum() / 1000 + gt_potential_dh["installed_MW"] = gt_potential_dh["Q_flow"] + + # Demand resitriction: If technical potential exceeds demand of + # district heating area, reduce potential according to demand + idx_demand_restriction = ( + gt_potential_dh["Q_flow"] * 3000 > gt_potential_dh["demand"]) + gt_potential_dh.loc[ + idx_demand_restriction, "installed_MW"] = ( + gt_potential_dh.loc[idx_demand_restriction, "demand"] + / 3000) + + print(f"""Geothermal potential in Germany: + {round(gt_potential_dh["Q_flow"].sum()/1000, 3)} GW_th""") + print( + f""" + Geothermal potential in Germany close to large district heating: + {round(gt_potential_dh['installed_MW'].sum()/1000, 3)} GW_th + """ + ) + + + return gt_potential_dh["installed_MW"].sum() + else: + return 0 + +def potential_germany(): + """Calculates geothermal potentials for different investment costs. + + The investment costs for geothermal district heating highly depend on + the location because of different mass flows and drilling depths. + Thsi functions calcultaes the geothermal potentials close to germany + for five different costs ranges. + This data can be used in pypsa-eur-sec to optimise the share of + geothermal district heating by considering different investment costs. + + Returns + ------- + None. 
+ + """ + geothermal_costs_and_potentials = pd.Series( + index=[0.5, 1, 2, 5, 10]) + + geothermal_costs_and_potentials[0.5] = calc_usable_geothermal_potential( + max_costs=0.5, min_costs=0) + + geothermal_costs_and_potentials[1] = calc_usable_geothermal_potential( + max_costs=1, min_costs=0.5) + + geothermal_costs_and_potentials[2] = calc_usable_geothermal_potential( + max_costs=2, min_costs=1) + + geothermal_costs_and_potentials[5] = calc_usable_geothermal_potential( + max_costs=5, min_costs=2) + + geothermal_costs_and_potentials[10] = calc_usable_geothermal_potential( + max_costs=10, min_costs=5) + + pd.DataFrame( + geothermal_costs_and_potentials).reset_index().rename( + {'index':'cost [EUR/kW]', 0: 'potential [MW]'}, axis=1).to_csv( + 'geothermal_potential_germany.csv') diff --git a/src/egon/data/datasets/heat_supply/individual_heating.py b/src/egon/data/datasets/heat_supply/individual_heating.py new file mode 100644 index 000000000..12dc85a6a --- /dev/null +++ b/src/egon/data/datasets/heat_supply/individual_heating.py @@ -0,0 +1,226 @@ +"""The central module containing all code dealing with +individual heat supply. + +""" +import pandas as pd +import geopandas as gpd +from egon.data import db, config + +def cascade_per_technology( + heat_per_mv, technologies, scenario, + distribution_level, max_size_individual_chp=0.05): + + """ Add plants for individual heat. + Currently only on mv grid district level. 
+ + Parameters + ---------- + mv_grid_districts : geopandas.geodataframe.GeoDataFrame + MV grid districts including the heat demand + technologies : pandas.DataFrame + List of supply technologies and their parameters + scenario : str + Name of the scenario + max_size_individual_chp : float + Maximum capacity of an individual chp in MW + Returns + ------- + mv_grid_districts : geopandas.geodataframe.GeoDataFrame + MV grid district which need additional individual heat supply + technologies : pandas.DataFrame + List of supply technologies and their parameters + append_df : pandas.DataFrame + List of plants per mv grid for the selected technology + + """ + sources = config.datasets()['heat_supply']['sources'] + + tech = technologies[technologies.priority==technologies.priority.max()] + + if tech.index == 'CHP': + + gdf_chp = db.select_geodataframe( + f"""SELECT id, geom, th_capacity as capacity + FROM {sources['power_plants']['schema']}. + {sources['power_plants']['table']} + WHERE chp = True + AND el_capacity < {max_size_individual_chp} + """) + + join = gpd.sjoin(gdf_chp.to_crs(3035), heat_per_mv, + rsuffix='mv') + + append_df = pd.DataFrame( + join.groupby('index_mv').capacity.sum()).reset_index().rename( + {'index_mv': 'mv_grid_id'}, axis=1) + + # Distribute heat pumps linear to remaining demand. + if tech.index == 'heat_pump': + + if distribution_level == 'federal_state': + # Select target values per federal state + target = db.select_dataframe( + f""" + SELECT DISTINCT ON (gen) gen as state, capacity + FROM {sources['scenario_capacities']['schema']}. + {sources['scenario_capacities']['table']} a + JOIN {sources['federal_states']['schema']}. 
+ {sources['federal_states']['table']} b + ON a.nuts = b.nuts + WHERE scenario_name = '{scenario}' + AND carrier = 'residential_rural_heat_pump' + """, + index_col='state') + + heat_per_mv['share'] = heat_per_mv.groupby( + 'state').remaining_demand.apply(lambda grp: grp/grp.sum()) + + append_df = heat_per_mv['share'].mul( + target.capacity[heat_per_mv['state']].values).reset_index() + else: + # Select target value for Germany + target = db.select_dataframe( + f""" + SELECT SUM(capacity) AS capacity + FROM {sources['scenario_capacities']['schema']}. + {sources['scenario_capacities']['table']} a + WHERE scenario_name = '{scenario}' + AND carrier = 'residential_rural_heat_pump' + """) + + heat_per_mv['share'] = (heat_per_mv.remaining_demand/ + heat_per_mv.remaining_demand.sum()) + + append_df = heat_per_mv['share'].mul( + target.capacity[0]).reset_index() + + append_df.rename({ + 'bus_id':'mv_grid_id', + 'share':'capacity'}, axis = 1, inplace=True) + + if append_df.size > 0: + append_df['carrier'] = tech.index[0] + heat_per_mv.loc[append_df.mv_grid_id, + 'remaining_demand'] -= append_df.set_index( + 'mv_grid_id').capacity.mul( + tech.estimated_flh.values[0]) + + heat_per_mv = heat_per_mv[heat_per_mv.remaining_demand>=0] + + technologies = technologies.drop(tech.index) + + return heat_per_mv, technologies, append_df + + +def cascade_heat_supply_indiv(scenario, distribution_level, plotting=True): + """Assigns supply strategy for individual heating in four steps. + + 1.) all small scale CHP are connected. + 2.) If the supply can not meet the heat demand, solar thermal collectors + are attached. This is not implemented yet, since individual + solar thermal plants are not considered in eGon2035 scenario. + 3.) If this is not suitable, the mv grid is also supplied by heat pumps. + 4.) The last option are individual gas boilers. + + Parameters + ---------- + scenario : str + Name of scenario + plotting : bool, optional + Choose if individual heating supply is plotted. 
The default is True. + + Returns + ------- + resulting_capacities : pandas.DataFrame + List of plants per mv grid + + """ + + sources = config.datasets()['heat_supply']['sources'] + + # Select residential heat demand per mv grid district and federal state + heat_per_mv = db.select_geodataframe( + f""" + SELECT d.subst_id as bus_id, SUM(demand) as demand, + c.vg250_lan as state, d.geom + FROM {sources['heat_demand']['schema']}. + {sources['heat_demand']['table']} a + JOIN {sources['map_zensus_grid']['schema']}. + {sources['map_zensus_grid']['table']} b + ON a.zensus_population_id = b.zensus_population_id + JOIN {sources['map_vg250_grid']['schema']}. + {sources['map_vg250_grid']['table']} c + ON b.subst_id = c.bus_id + JOIN {sources['mv_grids']['schema']}. + {sources['mv_grids']['table']} d + ON d.subst_id = c.bus_id + WHERE scenario = '{scenario}' + AND sector = 'residential' + AND a.zensus_population_id NOT IN ( + SELECT zensus_population_id + FROM {sources['map_dh']['schema']}.{sources['map_dh']['table']} + WHERE scenario = '{scenario}') + GROUP BY d.subst_id, vg250_lan, geom + """, + index_col = 'bus_id') + + # Store geometry of mv grid + geom_mv = heat_per_mv.geom.centroid.copy() + + # Initalize Dataframe for results + resulting_capacities = pd.DataFrame( + columns=['mv_grid_id', 'carrier', 'capacity']) + + # Set technology data according to + # http://www.wbzu.de/seminare/infopool/infopool-bhkw + # TODO: Add gas boilers and solar themal (eGon100RE) + technologies = pd.DataFrame( + index = ['CHP', 'heat_pump'], + columns = ['estimated_flh', 'priority'], + data = { + 'estimated_flh': [4000, 4000], + 'priority': [3, 1]}) + + # In the beginning, the remaining demand equals demand + heat_per_mv['remaining_demand'] = heat_per_mv['demand'] + + # Connect new technologies, if there is still heat demand left + while (len(technologies) > 0) and (len(heat_per_mv) > 0): + # Attach new supply technology + heat_per_mv, technologies, append_df = cascade_per_technology( + 
heat_per_mv, technologies, scenario, distribution_level) + # Collect resulting capacities + resulting_capacities = resulting_capacities.append( + append_df, ignore_index=True) + + if plotting: + plot_heat_supply(resulting_capacities) + + return gpd.GeoDataFrame( + resulting_capacities, + geometry = geom_mv[resulting_capacities.mv_grid_id].values) + +def plot_heat_supply(resulting_capacities): + + from matplotlib import pyplot as plt + + mv_grids = db.select_geodataframe( + """ + SELECT * FROM grid.mv_grid_districts + """, index_col='subst_id') + + for c in ['CHP', 'heat_pump']: + mv_grids[c] = resulting_capacities[ + resulting_capacities.carrier==c].set_index( + 'mv_grid_id').capacity + + fig, ax = plt.subplots(1, 1) + mv_grids.boundary.plot(linewidth=0.2,ax=ax, color='black') + mv_grids.plot( + ax=ax, + column=c, + cmap='magma_r', + legend=True, + legend_kwds={'label': f"Installed {c} in MW", + 'orientation': "vertical"}) + plt.savefig(f'plots/individual_heat_supply_{c}.png', dpi=300) \ No newline at end of file diff --git a/src/egon/data/importing/mastr.py b/src/egon/data/datasets/mastr.py similarity index 86% rename from src/egon/data/importing/mastr.py rename to src/egon/data/datasets/mastr.py index da84d70df..3c8ff4a49 100644 --- a/src/egon/data/importing/mastr.py +++ b/src/egon/data/datasets/mastr.py @@ -1,6 +1,8 @@ +from functools import partial from urllib.request import urlretrieve import os +from egon.data.datasets import Dataset import egon.data.config @@ -43,3 +45,12 @@ def download_mastr_data(data_stages=None): for filename in files: if not os.path.isfile(filename): urlretrieve(zenodo_files_url + filename, filename) + + +mastr_data_setup = partial( + Dataset, + name="MastrData", + version="0.0.0", + dependencies=[], + tasks=(download_mastr_data,), +) diff --git a/src/egon/data/processing/mv_grid_districts.py b/src/egon/data/datasets/mv_grid_districts.py similarity index 98% rename from src/egon/data/processing/mv_grid_districts.py rename to 
src/egon/data/datasets/mv_grid_districts.py index 45c0ccf90..45340b02b 100644 --- a/src/egon/data/processing/mv_grid_districts.py +++ b/src/egon/data/datasets/mv_grid_districts.py @@ -13,6 +13,8 @@ See :func:`define_mv_grid_districts` for more details. """ +from functools import partial + from geoalchemy2.types import Geometry from sqlalchemy import ( ARRAY, @@ -28,9 +30,12 @@ from sqlalchemy.ext.declarative import declarative_base from egon.data import db +from egon.data.datasets import Dataset from egon.data.db import session_scope -from egon.data.processing.substation import (EgonHvmvSubstationVoronoi, - EgonHvmvSubstation) +from egon.data.processing.substation import ( + EgonHvmvSubstation, + EgonHvmvSubstationVoronoi, +) Base = declarative_base() metadata = Base.metadata @@ -694,11 +699,11 @@ def nearest_polygon_with_substation( func.ST_Distance( without_substation.c.geom, with_substation.c.geom ), - #with_substation.c.id + # with_substation.c.id func.ST_Distance( func.ST_Centroid(without_substation.c.geom), - func.ST_Centroid(with_substation.c.geom) - ) + func.ST_Centroid(with_substation.c.geom), + ), ) .subquery() ) @@ -762,3 +767,12 @@ def define_mv_grid_districts(): bind=engine, checkfirst=True ) MvGridDistrictsDissolved.__table__.drop(bind=engine, checkfirst=True) + + +mv_grid_districts_setup = partial( + Dataset, + name="MvGridDistricts", + version="0.0.0", + dependencies=[], + tasks=(define_mv_grid_districts), +) diff --git a/src/egon/data/importing/re_potential_areas/__init__.py b/src/egon/data/datasets/re_potential_areas/__init__.py similarity index 50% rename from src/egon/data/importing/re_potential_areas/__init__.py rename to src/egon/data/datasets/re_potential_areas/__init__.py index b1e8ddd19..3d321935d 100644 --- a/src/egon/data/importing/re_potential_areas/__init__.py +++ b/src/egon/data/datasets/re_potential_areas/__init__.py @@ -2,63 +2,69 @@ potential areas for wind onshore and ground-mounted PV. 
""" -import os +from functools import partial from urllib.request import urlretrieve -import geopandas as gpd -from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy import Column, Integer +import os + from geoalchemy2 import Geometry +from sqlalchemy import Column, Integer +from sqlalchemy.ext.declarative import declarative_base +import geopandas as gpd -import egon.data.config from egon.data import db +from egon.data.datasets import Dataset +import egon.data.config Base = declarative_base() class EgonRePotentialAreaPvAgriculture(Base): - __tablename__ = 'egon_re_potential_area_pv_agriculture' - __table_args__ = {'schema': 'supply'} + __tablename__ = "egon_re_potential_area_pv_agriculture" + __table_args__ = {"schema": "supply"} id = Column(Integer, primary_key=True) - geom = Column(Geometry('MULTIPOLYGON', 3035)) + geom = Column(Geometry("MULTIPOLYGON", 3035)) class EgonRePotentialAreaPvRoadRailway(Base): - __tablename__ = 'egon_re_potential_area_pv_road_railway' - __table_args__ = {'schema': 'supply'} + __tablename__ = "egon_re_potential_area_pv_road_railway" + __table_args__ = {"schema": "supply"} id = Column(Integer, primary_key=True) - geom = Column(Geometry('MULTIPOLYGON', 3035)) + geom = Column(Geometry("MULTIPOLYGON", 3035)) class EgonRePotentialAreaWind(Base): - __tablename__ = 'egon_re_potential_area_wind' - __table_args__ = {'schema': 'supply'} + __tablename__ = "egon_re_potential_area_wind" + __table_args__ = {"schema": "supply"} id = Column(Integer, primary_key=True) - geom = Column(Geometry('MULTIPOLYGON', 3035)) + geom = Column(Geometry("MULTIPOLYGON", 3035)) def download_datasets(): """Download geopackages from Zenodo.""" data_config = egon.data.config.datasets() - pa_config = data_config['re_potential_areas'] + pa_config = data_config["re_potential_areas"] def ve(s): raise (ValueError(s)) - dataset = egon.data.config.settings()['egon-data']['--dataset-boundary'] + dataset = 
egon.data.config.settings()["egon-data"]["--dataset-boundary"] url_section = ( - 'url' - if dataset == 'Everything' - else 'url_testmode' - if dataset == 'Schleswig-Holstein' + "url" + if dataset == "Everything" + else "url_testmode" + if dataset == "Schleswig-Holstein" else ve(f"'{dataset}' is not a valid dataset boundary.") ) url_target_file_map = zip( - pa_config['original_data']['source'][url_section], - [os.path.join(os.path.dirname(__file__), file) - for file in pa_config['original_data']['target'][ - 'path_table_map'].keys()] + pa_config["original_data"]["source"][url_section], + [ + os.path.join(os.path.dirname(__file__), file) + for file in pa_config["original_data"]["target"][ + "path_table_map" + ].keys() + ], ) for url, file in url_target_file_map: @@ -71,10 +77,11 @@ def create_tables(): data_config = egon.data.config.datasets() - schema = data_config['re_potential_areas']['original_data'][ - 'target'].get('schema', 'supply') + schema = data_config["re_potential_areas"]["original_data"]["target"].get( + "schema", "supply" + ) - db.execute_sql(f'CREATE SCHEMA IF NOT EXISTS {schema};') + db.execute_sql(f"CREATE SCHEMA IF NOT EXISTS {schema};") engine = db.engine() # drop tables @@ -83,34 +90,36 @@ def create_tables(): EgonRePotentialAreaWind.__table__.drop(engine, checkfirst=True) # create tables - EgonRePotentialAreaPvAgriculture.__table__.create(bind=engine, - checkfirst=True) - EgonRePotentialAreaPvRoadRailway.__table__.create(bind=engine, - checkfirst=True) - EgonRePotentialAreaWind.__table__.create(bind=engine, - checkfirst=True) + EgonRePotentialAreaPvAgriculture.__table__.create( + bind=engine, checkfirst=True + ) + EgonRePotentialAreaPvRoadRailway.__table__.create( + bind=engine, checkfirst=True + ) + EgonRePotentialAreaWind.__table__.create(bind=engine, checkfirst=True) def insert_data(): """Insert data into DB""" data_config = egon.data.config.datasets() - pa_config = data_config['re_potential_areas'] + pa_config = 
data_config["re_potential_areas"] file_table_map = { os.path.join(os.path.dirname(__file__), file): table - for file, table in pa_config['original_data']['target'][ - 'path_table_map'].items() + for file, table in pa_config["original_data"]["target"][ + "path_table_map" + ].items() } engine_local_db = db.engine() for file, table in file_table_map.items(): data = gpd.read_file(file).to_crs("EPSG:3035") - data.rename(columns={'geometry': 'geom'}, inplace=True) - data.set_geometry('geom', inplace=True) + data.rename(columns={"geometry": "geom"}, inplace=True) + data.set_geometry("geom", inplace=True) - schema = pa_config['original_data']['target'].get('schema', 'supply') + schema = pa_config["original_data"]["target"].get("schema", "supply") # create database table from geopandas dataframe data.to_postgis( @@ -121,3 +130,13 @@ def insert_data(): if_exists="append", dtype={"geom": Geometry()}, ) + + +# create re_potential_areas dataset partial object +re_potential_area_setup = partial( + Dataset, + name="RePotentialAreas", + version="0.0.0", + dependencies=[], + tasks=(download_datasets, create_tables, insert_data), +) diff --git a/src/egon/data/datasets/vg250/__init__.py b/src/egon/data/datasets/vg250/__init__.py index 34bf35500..14bf94146 100644 --- a/src/egon/data/datasets/vg250/__init__.py +++ b/src/egon/data/datasets/vg250/__init__.py @@ -318,7 +318,7 @@ class Vg250(Dataset): def __init__(self, dependencies): super().__init__( name="VG250", - version=self.filename + "-0.0.0", + version=self.filename + "-0.0.1", dependencies=dependencies, tasks=( download_files, diff --git a/src/egon/data/datasets/vg250/cleaning_and_preparation.sql b/src/egon/data/datasets/vg250/cleaning_and_preparation.sql index ee206b3b0..06bfc497a 100644 --- a/src/egon/data/datasets/vg250/cleaning_and_preparation.sql +++ b/src/egon/data/datasets/vg250/cleaning_and_preparation.sql @@ -562,8 +562,7 @@ UPDATE boundaries.vg250_gem_clean AS t1 -- remove holes DELETE FROM boundaries.vg250_gem_clean 
-WHERE is_hole IS TRUE OR - id = '9251' OR id = '8362'; -- Two special cases deleted manualy +WHERE is_hole IS TRUE; /* -- metadata diff --git a/src/egon/data/importing/gas_grid/__init__.py b/src/egon/data/importing/gas_grid/__init__.py index 80b11e149..a3a69c7e7 100755 --- a/src/egon/data/importing/gas_grid/__init__.py +++ b/src/egon/data/importing/gas_grid/__init__.py @@ -29,7 +29,7 @@ def next_id(component): Next index value """ max_id = db.select_dataframe( - """ + f""" SELECT MAX({component}_id) FROM grid.egon_pf_hv_{component} """)['max'][0] @@ -308,9 +308,9 @@ def insert_gas_pipeline_list(gas_nodes_list): INSERT INTO grid.egon_pf_hv_link (version, scn_name, link_id, bus0, bus1, p_nom, length, - geom, topo) + geom, topo, carrier) SELECT - version, scn_name, link_id, bus0, bus1, p_nom, length, geom, topo + version, scn_name, link_id, bus0, bus1, p_nom, length, geom, topo, carrier FROM grid.egon_pf_hv_gas_link; DROP TABLE grid.egon_pf_hv_gas_link; diff --git a/src/egon/data/processing/calculate_dlr.py b/src/egon/data/processing/calculate_dlr.py new file mode 100644 index 000000000..0ca985bd1 --- /dev/null +++ b/src/egon/data/processing/calculate_dlr.py @@ -0,0 +1,289 @@ +""" +Use the concept of dynamic line rating(DLR) to calculate temporal +depending capacity for HV transmission lines. 
+Inspired mainly on Planungsgrundsaetze-2020 +Available at: + +""" +import geopandas as gpd +import pandas as pd +import numpy as np +from egon.data import db +import xarray as xr +import rioxarray +from shapely.geometry import Point +import psycopg2 + + +def Calculate_DLR(): + """Calculate DLR and assign values to each line in the db + + Parameters + ---------- + *No parameters required + + """ + + weather_info_path = "cutouts/europe-2011-era5/201101.nc" + regions_shape_path = ( + "data_bundle_egon_data/regions_dynamic_line_rating/Germany_regions.shp" + ) + + # Calculate hourly DLR per region + dlr_hourly_dic, dlr_hourly = DLR_Regions(weather_info_path, regions_shape_path) + + regions = gpd.read_file(regions_shape_path) + regions = regions.sort_values(by=["Region"]) + + # Connect to the data base + con = db.engine() + + sql = "SELECT version, scn_name, line_id, geom, s_nom FROM grid.egon_pf_hv_line" + df = gpd.GeoDataFrame.from_postgis(sql, con, crs="EPSG:4326") + + trans_lines_R = {} + for i in regions.Region: + shape_area = regions[regions["Region"] == i] + trans_lines_R[i] = gpd.clip(df, shape_area) + trans_lines = df[["s_nom"]] + trans_lines["in_regions"] = [[] for i in range(len(df))] + + trans_lines[["line_id", "geometry", "version", "scn_name"]] = df[ + ["line_id", "geom", "version", "scn_name"] + ] + + # Assign to each transmission line the region to which it belongs + for i in trans_lines_R: + for j in trans_lines_R[i].index: + trans_lines.loc[j][1] = trans_lines.loc[j][1].append(i) + + DLR = [] + # Assign to each transmision line the final values of DLR based on location + # and type of line (overhead or underground) + for i in trans_lines.index: + # lines completely out of the Germany border have DLR = 1 + if len(trans_lines.loc[i][1]) == 0: + DLR.append([1] * 8760) + continue + # Underground lines have DLR = 1 + if ( + trans_lines.loc[i][0] % 280 == 0 + or trans_lines.loc[i][0] % 550 == 0 + or trans_lines.loc[i][0] % 925 == 0 + ): + DLR.append([1] * 
8760) + continue + # Lines completely in one of the regions, have the DLR of the region + if len(trans_lines.loc[i][1]) == 1: + region = int(trans_lines.loc[i][1][0]) + DLR.append(dlr_hourly_dic["R" + str(region) + "-DLR"]) + continue + # For lines crossing 2 or more regions, the lowest DLR between the + # different regions per hour is assigned. + if len(trans_lines.loc[i][1]) > 1: + reg = [] + for j in trans_lines.loc[i][1]: + reg.append("Reg_" + str(j)) + min_DLR_reg = dlr_hourly[reg].min(axis=1) + DLR.append(list(min_DLR_reg)) + + trans_lines["s_max_pu"] = DLR + + # delete unnecessary columns + trans_lines.drop(columns=["in_regions", "s_nom", "geometry"], inplace=True) + + # Modify column "s_max_pu" to fit the requirement of the table + trans_lines["s_max_pu"] = trans_lines.apply(lambda x: list(x["s_max_pu"]), axis=1) + trans_lines["temp_id"] = 1 + # Insert into database + trans_lines.to_sql( + "egon_pf_hv_line_timeseries", + schema="grid", + con=db.engine(), + if_exists="append", + index=False, + ) + return 0 + + +def DLR_Regions(weather_info_path, regions_shape_path): + """Calculate DLR values for the given regions + + Parameters + ---------- + weather_info_path: str, mandatory + path of the weather data downloaded from ERA5 + regions_shape_path: str, mandatory + path to the shape file with the shape of the regions to analyze + + """ + + # load, index and sort shapefile with the 9 regions defined by NEP 2020 + regions = gpd.read_file(regions_shape_path) + regions = regions.set_index(["Region"]) + regions = regions.sort_values(by=["Region"]) + + # The data downloaded using Atlite is divided by months. Paths_weather stores + # the paths of the 12 files to be loaded together in 'weather_data_raw'. 
+ paths_weather = [] + for i in range(1, 13): + paths_weather.append("cutouts/europe-2011-era5/2011" + str(i).zfill(2) + ".nc") + + weather_data_raw = xr.open_mfdataset(paths_weather) + weather_data_raw = weather_data_raw.rio.write_crs(4326) + weather_data_raw = weather_data_raw.rio.clip_box( + minx=5.5, + miny=47, + maxx=15.5, + maxy=55.5, + ) + + wind_speed_raw = weather_data_raw.wnd100m.values + temperature_raw = weather_data_raw.temperature.values + roughness_raw = weather_data_raw.roughness.values + index = weather_data_raw.indexes._indexes + # The info in 'weather_data_raw' has 3 dimensions. In 'weather_data' will be + # stored all the relevant data in a 2 dimensions array. + weather_data = np.zeros(shape=(wind_speed_raw.size, 5)) + count = 0 + for hour in range(index["time"].size): + for row in range(index["y"].size): + for column in range(index["x"].size): + rough = roughness_raw[hour, row, column] + ws_100m = wind_speed_raw[hour, row, column] + # Use Log Law to calculate wind speed at 50m height + ws_50m = ws_100m * (np.log(50 / rough) / np.log(100 / rough)) + weather_data[count, 0] = hour + weather_data[count, 1] = index["y"][row] + weather_data[count, 2] = index["x"][column] + weather_data[count, 3] = ws_50m + weather_data[count, 4] = temperature_raw[hour, row, column] - 273.15 + count += 1 + + weather_data = pd.DataFrame( + weather_data, columns=["hour", "lat", "lon", "wind_s", "temp"] + ) + + region_selec = weather_data[0 : index["x"].size * index["y"].size].copy() + region_selec["geom"] = region_selec.apply( + lambda x: Point(x["lon"], x["lat"]), axis=1 + ) + region_selec = gpd.GeoDataFrame(region_selec) + region_selec = region_selec.set_geometry("geom") + region_selec["region"] = np.zeros(index["x"].size * index["y"].size) + + # Mask weather information for each region defined by NEP 2020 + for reg in regions.index: + weather_region = gpd.clip(region_selec, regions.loc[reg][0]) + region_selec["region"][region_selec.isin(weather_region).any(axis=1)] = 
reg + + weather_data["region"] = region_selec["region"].tolist() * index["time"].size + weather_data = weather_data[weather_data["region"] != 0] + + # Create data frame to save results(Min wind speed, max temperature and %DLR per region along 8760h in a year) + time = pd.date_range("2011-01-01", "2011-12-31 23:00:00", freq="H") + # time = time.transpose() + dlr = pd.DataFrame( + 0, + columns=[ + "R1-Wind_min", + "R1-Temp_max", + "R1-DLR", + "R2-Wind_min", + "R2-Temp_max", + "R2-DLR", + "R3-Wind_min", + "R3-Temp_max", + "R3-DLR", + "R4-Wind_min", + "R4-Temp_max", + "R4-DLR", + "R5-Wind_min", + "R5-Temp_max", + "R5-DLR", + "R6-Wind_min", + "R6-Temp_max", + "R6-DLR", + "R7-Wind_min", + "R7-Temp_max", + "R7-DLR", + "R8-Wind_min", + "R8-Temp_max", + "R8-DLR", + "R9-Wind_min", + "R9-Temp_max", + "R9-DLR", + ], + index=time, + ) + + # Calculate and save min wind speed and max temperature in a dataframe. + # Since the dataframe generated by the function era5.weather_df_from_era5() is sorted by date, + # it is faster to calculate the hourly results using blocks of data defined by "step", instead of + # using a filter or a search function. 
+ for reg, df in weather_data.groupby(["region"]): + for t in range(0, len(time)): + step = df.shape[0] / len(time) + low_limit = int(t * step) + up_limit = int(step * (t + 1)) + dlr.iloc[t, 0 + int(reg - 1) * 3] = min(df.iloc[low_limit:up_limit, 3]) + dlr.iloc[t, 1 + int(reg - 1) * 3] = max(df.iloc[low_limit:up_limit, 4]) + + # The next loop use the min wind speed and max temperature calculated previously to + # define the hourly DLR in for each region based on the table given by NEP 2020 pag 31 + for i in range(0, len(regions)): + for j in range(0, len(time)): + if dlr.iloc[j, 1 + i * 3] <= 5: + if dlr.iloc[j, 0 + i * 3] < 3: + dlr.iloc[j, 2 + i * 3] = 1.30 + elif dlr.iloc[j, 0 + i * 3] < 4: + dlr.iloc[j, 2 + i * 3] = 1.35 + elif dlr.iloc[j, 0 + i * 3] < 5: + dlr.iloc[j, 2 + i * 3] = 1.45 + else: + dlr.iloc[j, 2 + i * 3] = 1.50 + elif dlr.iloc[j, 1 + i * 3] <= 15: + if dlr.iloc[j, 0 + i * 3] < 3: + dlr.iloc[j, 2 + i * 3] = 1.20 + elif dlr.iloc[j, 0 + i * 3] < 4: + dlr.iloc[j, 2 + i * 3] = 1.25 + elif dlr.iloc[j, 0 + i * 3] < 5: + dlr.iloc[j, 2 + i * 3] = 1.35 + elif dlr.iloc[j, 0 + i * 3] < 6: + dlr.iloc[j, 2 + i * 3] = 1.45 + else: + dlr.iloc[j, 2 + i * 3] = 1.50 + elif dlr.iloc[j, 1 + i * 3] <= 25: + if dlr.iloc[j, 0 + i * 3] < 3: + dlr.iloc[j, 2 + i * 3] = 1.10 + elif dlr.iloc[j, 0 + i * 3] < 4: + dlr.iloc[j, 2 + i * 3] = 1.15 + elif dlr.iloc[j, 0 + i * 3] < 5: + dlr.iloc[j, 2 + i * 3] = 1.20 + elif dlr.iloc[j, 0 + i * 3] < 6: + dlr.iloc[j, 2 + i * 3] = 1.30 + else: + dlr.iloc[j, 2 + i * 3] = 1.40 + elif dlr.iloc[j, 1 + i * 3] <= 35: + if dlr.iloc[j, 0 + i * 3] < 3: + dlr.iloc[j, 2 + i * 3] = 1.00 + elif dlr.iloc[j, 0 + i * 3] < 4: + dlr.iloc[j, 2 + i * 3] = 1.05 + elif dlr.iloc[j, 0 + i * 3] < 5: + dlr.iloc[j, 2 + i * 3] = 1.10 + elif dlr.iloc[j, 0 + i * 3] < 6: + dlr.iloc[j, 2 + i * 3] = 1.15 + else: + dlr.iloc[j, 2 + i * 3] = 1.25 + else: + dlr.iloc[j, 2 + i * 3] = 1.00 + + DLR_hourly_df_dic = {} + for i in dlr.columns[range(2, 29, 3)]: # columns with DLR 
values + DLR_hourly_df_dic[i] = dlr[i].values + + dlr_hourly = pd.DataFrame(index=time) + for i in range(len(regions)): + dlr_hourly["Reg_" + str(i + 1)] = dlr.iloc[:, 3 * i + 2] + + return DLR_hourly_df_dic, dlr_hourly diff --git a/src/egon/data/processing/gas_areas/__init__.py b/src/egon/data/processing/gas_areas/__init__.py new file mode 100755 index 000000000..679892299 --- /dev/null +++ b/src/egon/data/processing/gas_areas/__init__.py @@ -0,0 +1,98 @@ +"""The central module containing code to create gas voronoi polygones + +""" +from egon.data import db +from sqlalchemy.ext.declarative import declarative_base + +Base = declarative_base() + +def create_voronoi(): + ''' + Creates voronoi polygons for gas buses + + Returns + ------- + None. + + ''' + + db.execute_sql( + """ + DROP TABLE IF EXISTS grid.egon_gas_voronoi; + + SELECT bus_id, bus_id as id, geom as point + INTO grid.egon_gas_voronoi + FROM grid.egon_pf_hv_bus + WHERE carrier = 'gas'; + + ALTER TABLE grid.egon_gas_voronoi ADD geom Geometry('Multipolygon', 4326); + + DROP TABLE IF EXISTS grid.egon_gas_bus CASCADE; + + SELECT bus_id, bus_id as id, geom as point + INTO grid.egon_gas_bus + FROM grid.egon_pf_hv_bus + WHERE carrier = 'gas'; + """ + ) + + schema = 'grid' + substation_table = 'egon_gas_bus' + voronoi_table = 'egon_gas_voronoi' + view = 'grid.egon_voronoi_no_borders' + boundary = 'boundaries.vg250_sta_union' + + # Create view for Voronoi polygons without taking borders into account + db.execute_sql( + f"DROP VIEW IF EXISTS {schema}.egon_voronoi_no_borders CASCADE;" + ) + + db.execute_sql( + f""" + CREATE VIEW {view} AS + SELECT (ST_Dump(ST_VoronoiPolygons(ST_collect(a.point)))).geom + FROM {schema}.{substation_table} a; + """ + ) + + # Clip Voronoi with boundaries + db.execute_sql( + f""" + INSERT INTO {schema}.{voronoi_table} (geom) + (SELECT ST_Multi(ST_Intersection( + ST_Transform(a.geometry, 4326), b.geom)) AS geom + FROM {boundary} a + CROSS JOIN {view} b); + """ + ) + + # Assign substation 
id as foreign key + db.execute_sql( + f""" + UPDATE {schema}.{voronoi_table} AS t1 + SET bus_id = t2.bus_id + FROM (SELECT voi.id AS id, + sub.bus_id ::integer AS bus_id + FROM {schema}.{voronoi_table} AS voi, + {schema}.{substation_table} AS sub + WHERE voi.geom && sub.point AND + ST_CONTAINS(voi.geom,sub.point) + GROUP BY voi.id,sub.bus_id + )AS t2 + WHERE t1.id = t2.id; + """ + ) + + db.execute_sql( + f""" + CREATE INDEX {voronoi_table}_idx + ON {schema}.{voronoi_table} USING gist (geom); + """ + ) + + db.execute_sql( + f""" + DROP VIEW IF EXISTS {view} CASCADE; + DROP TABLE IF EXISTS grid.egon_gas_bus; + """ + ) \ No newline at end of file diff --git a/src/egon/data/processing/osmtgmod/__init__.py b/src/egon/data/processing/osmtgmod/__init__.py index 9f20ed39b..6d17d3514 100644 --- a/src/egon/data/processing/osmtgmod/__init__.py +++ b/src/egon/data/processing/osmtgmod/__init__.py @@ -6,7 +6,7 @@ import datetime import logging import codecs -import subprocess +from pathlib import Path import egon.data.config from egon.data.config import settings import egon.data.subprocess as subproc @@ -26,8 +26,7 @@ def run_osmtgmod(): target_path = osm_config["target"]["path_testmode"] filtered_osm_pbf_path_to_file = os.path.join( - egon.data.__path__[0] + "/importing" + "/openstreetmap/" - + target_path + egon.data.__path__[0], "datasets", "osm", target_path ) docker_db_config = db.credentials() @@ -54,7 +53,7 @@ def import_osm_data(): ) else: - subproc.run( + subproc.run( [ "git", "clone", @@ -75,8 +74,7 @@ def import_osm_data(): target_path = osm_config["target"]["path_testmode"] filtered_osm_pbf_path_to_file = os.path.join( - egon.data.__path__[0] + "/importing" + "/openstreetmap/" - + target_path + egon.data.__path__[0], "datasets", "osm", target_path ) docker_db_config=db.credentials() @@ -142,15 +140,19 @@ def import_osm_data(): {config['osm_data']['osmosis_path_to_binary']}""" ) - # BUG: Python continues (and sets osm_metadata) - # even in case osmosis fails!!! 
- proc = subprocess.Popen( - "%s --read-pbf %s --write-pgsql \ + # create directory to store osmosis' temp files + osmosis_temp_dir = Path('.') / "osmosis_temp/" + if not os.path.exists(osmosis_temp_dir): + os.mkdir(osmosis_temp_dir) + + subproc.run( + "JAVACMD_OPTIONS='%s' %s --read-pbf %s --write-pgsql \ database=%s host=%s user=%s password=%s" % ( + f"-Djava.io.tmpdir={osmosis_temp_dir}", os.path.join(egon.data.__path__[0], - "processing/osmtgmod/osmTGmod/", - config["osm_data"]["osmosis_path_to_binary"]), + "processing/osmtgmod/osmTGmod/", + config["osm_data"]["osmosis_path_to_binary"]), filtered_osm_pbf_path_to_file, config_database, config["postgres_server"]["host"] @@ -162,7 +164,6 @@ def import_osm_data(): shell=True, ) logging.info("Importing OSM-Data...") - proc.wait() # After updating OSM-Data, power_tables (for editing) # have to be updated as well diff --git a/src/egon/data/processing/power_plants/__init__.py b/src/egon/data/processing/power_plants/__init__.py index 5ec4dfbe6..a4a207ec7 100644 --- a/src/egon/data/processing/power_plants/__init__.py +++ b/src/egon/data/processing/power_plants/__init__.py @@ -1,7 +1,7 @@ """The central module containing all code dealing with power plant data. 
""" from egon.data import db -from sqlalchemy import Column, String, Float, Integer, Sequence, Boolean +from sqlalchemy import Column, String, Float, Integer, Sequence, Boolean, BigInteger from sqlalchemy.ext.declarative import declarative_base from sqlalchemy.dialects.postgresql import JSONB from sqlalchemy.orm import sessionmaker @@ -17,7 +17,7 @@ class EgonPowerPlants(Base): __tablename__ = "egon_power_plants" __table_args__ = {"schema": "supply"} - id = Column(Integer, Sequence("pp_seq"), primary_key=True) + id = Column(BigInteger, Sequence("pp_seq"), primary_key=True) sources = Column(JSONB) source_id = Column(JSONB) carrier = Column(String) @@ -45,6 +45,10 @@ def create_tables(): f"""DROP TABLE IF EXISTS {cfg['target']['schema']}.{cfg['target']['table']}""" ) + + db.execute_sql( + """DROP SEQUENCE IF EXISTS pp_seq""" + ) EgonPowerPlants.__table__.create(bind=engine, checkfirst=True) @@ -304,7 +308,7 @@ def insert_hydro_plants(scenario): mastr = scale_prox2now(mastr, target, level=level) # Choose only entries with valid geometries inside DE/test mode - mastr_loc = filter_mastr_geometry(mastr) + mastr_loc = filter_mastr_geometry(mastr).set_geometry('geometry') # TODO: Deal with power plants without geometry # Assign bus_id and voltage level @@ -442,9 +446,12 @@ def assign_bus_id(power_plants, cfg): if len(power_plants_ehv) > 0: power_plants.loc[power_plants_ehv, 'bus_id'] = gpd.sjoin( - power_plants[power_plants.index.isin(power_plants_hv) - ], ehv_grid_districts).bus_id + power_plants[power_plants.index.isin(power_plants_ehv) + ], ehv_grid_districts).bus_id_right + # Assert that all power plants have a bus_id + assert power_plants.bus_id.notnull().all(), """Some power plants are + not attached to a bus.""" return power_plants @@ -458,7 +465,10 @@ def insert_power_plants(): """ cfg = egon.data.config.datasets()["power_plants"] db.execute_sql( - f"DELETE FROM {cfg['target']['schema']}.{cfg['target']['table']}" + f""" + DELETE FROM 
{cfg['target']['schema']}.{cfg['target']['table']} + WHERE carrier IN ('biomass', 'reservoir', 'run_of_river') + """ ) for scenario in ["eGon2035"]: diff --git a/src/egon/data/processing/pv_ground_mounted.py b/src/egon/data/processing/pv_ground_mounted.py new file mode 100644 index 000000000..0026b8c9e --- /dev/null +++ b/src/egon/data/processing/pv_ground_mounted.py @@ -0,0 +1,1200 @@ +from egon.data import db +import psycopg2 +import geopandas as gpd +from shapely import wkb +import pandas as pd +import numpy as np + + +def regio_of_pv_ground_mounted(): + def mastr_existing_pv(path, pow_per_area): + + """Import MaStR data from csv-files. + + Parameters + ---------- + path : string + Path to location of MaStR-file + pow_per_area: int + Assumption for areas of existing pv farms and power of new built pv farms depending on area in kW/m² + + """ + + # import MaStR data: locations, grid levels and installed capacities + + # get relevant pv plants: ground mounted + df = pd.read_csv( + path + "bnetza_mastr_solar_cleaned.csv", + usecols=[ + "Lage", + "Laengengrad", + "Breitengrad", + "Nettonennleistung", + "EinheitMastrNummer", + ], + ) + df = df[df["Lage"] == "Freiflaeche"] + + ### examine data concerning geographical locations and drop NaNs + x1 = df["Laengengrad"].isnull().sum() + x2 = df["Breitengrad"].isnull().sum() + print(" ") + print("Untersuchung des MaStR-Datensatzes:") + print("originale Anzahl der Zeilen im Datensatz: " + str(len(df))) + print("NaNs für Längen- und Breitengrad: " + str(x1) + " & " + str(x2)) + df.dropna(inplace=True) + print("Anzahl der Zeilen im Datensatz nach Dropping der NaNs: " + str(len(df))) + print(" ") + + # derive dataframe for locations + mastr = gpd.GeoDataFrame( + index=df.index, + geometry=gpd.points_from_xy(df["Laengengrad"], df["Breitengrad"]), + crs={"init": "epsg:4326"}, + ) + mastr = mastr.to_crs(3035) + + # derive installed capacities + mastr["installed capacity in kW"] = df["Nettonennleistung"] + + # create buffer around 
locations + + # calculate bufferarea and -radius considering installed capacity + df_radius = ( + mastr["installed capacity in kW"].div(pow_per_area * np.pi) ** 0.5 + ) # in m + + # create buffer + mastr["buffer"] = mastr["geometry"].buffer(df_radius) + mastr["buffer"].crs = 3035 + + # derive MaStR-Nummer + mastr["mastr_nummer"] = df["EinheitMastrNummer"] + + # derive voltage level + + mastr["voltage_level"] = pd.Series(dtype=int) + lvl = pd.read_csv( + path + "location_elec_generation_raw.csv", + usecols=["Spannungsebene", "MaStRNummer"], + ) + + # assign voltage_level to MaStR-unit: + v_l = pd.Series() + for index, row in mastr.iterrows(): + nr = row["mastr_nummer"] + l = lvl[lvl["MaStRNummer"] == "['" + nr + "']"]["Spannungsebene"] + if len(l) > 0: + if l.iloc[0] == "Mittelspannung": + v_l.loc[index] = 5 + if l.iloc[0] == "UmspannungZurMittelspannung": + v_l.loc[index] = 4 + elif l.iloc[0] == "Hochspannung": + v_l.loc[index] = 3 + elif l.iloc[0] == "UmspannungZurHochspannung": + v_l.loc[index] = 1 + elif l.iloc[0] == "Höchstspannung": + v_l.loc[index] = 1 + elif l.iloc[0] == "UmspannungZurNiederspannung": + v_l.loc[index] = l.iloc[0] + elif l.iloc[0] == "Niederspannung": + v_l.loc[index] = l.iloc[0] + else: + v_l.loc[index] = np.NaN + mastr["voltage_level"] = v_l + + ### examine data concerning voltage level + x1 = mastr["voltage_level"].isnull().sum() + print(" ") + print("Untersuchung des MaStR-Datensatzes für Spannungsebenen:") + print("Anzahl der Zeilen im MaStR-Datensatz vorher: " + str(len(mastr))) + print( + "NaNs in Spannungsebene aufgrund a) keine Zuordnung zur Nummer oder b) fehlender Daten: " + + str(x1) + ) + # drop PVs with missing values due to a) no assignemtn of MaStR-numbers or b) missing data in row + mastr.dropna(inplace=True) + print( + "Anzahl der Zeilen im Datensatz nach Dropping der NaNs: " + str(len(mastr)) + ) + + # drop PVs in low voltage level + index_names = mastr[mastr["voltage_level"] == "Niederspannung"].index + x2 = 
len(index_names) + mastr.drop(index_names, inplace=True) + index_names = mastr[ + mastr["voltage_level"] == "UmspannungZurNiederspannung" + ].index + x3 = len(index_names) + mastr.drop(index_names, inplace=True) + + ### further examination + print("Anzahl der PVs in der Niederspannungsebene: " + str(x2)) + print("Anzahl der PVs in der NSMS-Ebene: " + str(x3)) + print( + "Anzahl der Zeilen im Datensatz nach Dropping dieser Ebenen: " + + str(len(mastr)) + ) + print(" ") + + return mastr + + def potential_areas(con, join_buffer): + + """Import potential areas and choose and prepare areas suitable for PV ground mounted. + + Parameters + ---------- + con: + Connection to database + join_buffer: int + Maximum distance for joining of potential areas (only small ones to big ones) in m + + """ + + # import potential areas: railways and roads & agriculture + + # roads and railway + sql = "SELECT id, geom FROM supply.egon_re_potential_area_pv_road_railway" + potentials_rora = gpd.GeoDataFrame.from_postgis(sql, con) + potentials_rora = potentials_rora.set_index("id") + + # agriculture + sql = "SELECT id, geom FROM supply.egon_re_potential_area_pv_agriculture" + potentials_agri = gpd.GeoDataFrame.from_postgis(sql, con) + potentials_agri = potentials_agri.set_index("id") + + # add areas < 1 ha to bigger areas if they are very close, otherwise exclude areas < 1 ha + + # calculate area + potentials_rora["area"] = potentials_rora.area + potentials_agri["area"] = potentials_agri.area + + # roads and railways + + ### counting variable for examination + before = len(potentials_rora) + + # get small areas and create buffer for joining around them + small_areas = potentials_rora[potentials_rora["area"] < 10000] + small_buffers = small_areas.copy() + small_buffers["geom"] = small_areas["geom"].buffer(join_buffer) + + # drop small areas in potential areas + index_names = potentials_rora[potentials_rora["area"] < 10000].index + potentials_rora.drop(index_names, inplace=True) + + # check 
intersection of small areas with other potential areas + overlay = gpd.sjoin(potentials_rora, small_buffers) + o = overlay["index_right"] + o.drop_duplicates(inplace=True) + + # add small areas to big ones if buffer intersects + for i in range(0, len(o)): + index_potentials = o.index[i] + index_small = o.iloc[i] + x = potentials_rora["geom"].loc[index_potentials] + y = small_areas["geom"].loc[index_small] + join = gpd.GeoSeries(data=[x, y]) + potentials_rora["geom"].loc[index_potentials] = join.unary_union + + ### examination of joining of areas + count_small = len(small_buffers) + count_join = len(o) + count_delete = count_small - count_join + print(" ") + print( + "Untersuchung der Zusammenfassung von Potentialflächen im Bereich Roads and Railways" + ) + print("Länge des Dataframes der Flächen vorher: " + str(before)) + print("Anzahl kleiner Flächen: " + str(count_small)) + print( + "Anzahl der durchgeführten Prozedur des Zusammenfassens: " + str(count_join) + ) + print("gelöschte Flächen (not joined): " + str(count_delete)) + print("Länge des Dataframes der Flächen danach: " + str(len(potentials_rora))) + print(" ") + + # agriculture + + ### counting variable for examination + before = len(potentials_agri) + + # get small areas and create buffer for joining around them + small_areas = potentials_agri[potentials_agri["area"] < 10000] + small_buffers = small_areas.copy() + small_buffers["geom"] = small_areas["geom"].buffer(join_buffer) + + # drop small areas in potential areas + index_names = potentials_agri[potentials_agri["area"] < 10000].index + potentials_agri.drop(index_names, inplace=True) + + # check intersection of small areas with other potential areas + overlay = gpd.sjoin(potentials_agri, small_buffers) + o = overlay["index_right"] + o.drop_duplicates(inplace=True) + + # add small areas to big ones if buffer intersects + for i in range(0, len(o)): + index_potentials = o.index[i] + index_small = o.iloc[i] + x = 
potentials_agri["geom"].loc[index_potentials] + y = small_areas["geom"].loc[index_small] + join = gpd.GeoSeries(data=[x, y]) + potentials_agri["geom"].loc[index_potentials] = join.unary_union + + ### examination of joining of areas + count_small = len(small_buffers) + count_join = len(o) + count_delete = count_small - count_join + print(" ") + print( + "Untersuchung der Zusammenfassung von Potentialflächen im Bereich Agriculture" + ) + print("Länge des Dataframes der Flächen vorher: " + str(before)) + print("Anzahl kleiner Flächen: " + str(count_small)) + print( + "Anzahl der durchgeführten Prozedur des Zusammenfassens: " + str(count_join) + ) + print("gelöschte Flächen (not joined): " + str(count_delete)) + print("Länge des Dataframes der Flächen danach: " + str(len(potentials_agri))) + print(" ") + + # calculate new areas + potentials_rora["area"] = potentials_rora.area + potentials_agri["area"] = potentials_agri.area + + # check intersection of potential areas + + ### counting variable + agri_vorher = len(potentials_agri) + + # if areas intersect, keep road & railway potential areas and drop agricultural ones + overlay = gpd.sjoin(potentials_rora, potentials_agri) + o = overlay["index_right"] + o.drop_duplicates(inplace=True) + for i in range(0, len(o)): + index = o.iloc[i] + potentials_agri.drop([index], inplace=True) + + ### examination of intersection of areas + print(" ") + print( + "Überprüfung der Funktion zur Meidung der Intersection von Potentialflächen:" + ) + print("Länge potentials_agri vorher: " + str(agri_vorher)) + print("Anzahl der auftretenden Fälle: " + str(len(o))) + print("Länge potentials_agri nachher: " + str(len(potentials_agri))) + print(" ") + + return potentials_rora, potentials_agri + + def select_pot_areas(mastr, potentials_pot): + + """Select potential areas where there are existing pv parks (MaStR-data). 
+ + Parameters + ---------- + mastr: gpd.GeoDataFrame() + MaStR-DataFrame with existing pv parks + potentials_pot: gpd.GeoDataFrame() + Suitable potential areas + + """ + + # select potential areas with existing pv parks + # (potential areas intersect buffer around existing plants) + + # prepare dataframes to check intersection + pvs = gpd.GeoDataFrame() + pvs["geom"] = mastr["buffer"].copy() + pvs.crs = 3035 + pvs = pvs.set_geometry("geom") + potentials = gpd.GeoDataFrame() + potentials["geom"] = potentials_pot["geom"].copy() + potentials.crs = 3035 + potentials = potentials.set_geometry("geom") + + # check intersection of potential areas with exisiting PVs (MaStR) + overlay = gpd.sjoin(pvs, potentials) + o = overlay["index_right"] + o.drop_duplicates(inplace=True) + + # define selected potentials areas + pot_sel = potentials_pot.copy() + pot_sel["selected"] = pd.Series() + pot_sel["voltage_level"] = pd.Series(dtype=int) + for i in range(0, len(o)): + index_pot = o.iloc[i] + pot_sel["selected"].loc[index_pot] = True + # get voltage level of existing PVs + index_pv = o.index[i] + pot_sel["voltage_level"] = mastr["voltage_level"].loc[index_pv] + pot_sel = pot_sel[pot_sel["selected"] == True] + pot_sel.drop("selected", axis=1, inplace=True) + + return pot_sel + + def build_pv(pv_pot, pow_per_area): + + """Build new pv parks in selected potential areas. 
+ + Parameters + ---------- + pv_pot: gpd.GeoDataFrame() + Selected potential areas + pow_per_area: int + Assumption for areas of existing pv farms and power of new built pv farms depending on area in kW/m² + + """ + + # build pv farms in selected areas + + # calculation of centroids + pv_pot["centroid"] = pv_pot["geom"].centroid + + # calculation of power in kW + pv_pot["installed capacity in kW"] = pd.Series() + pv_pot["installed capacity in kW"] = pv_pot["area"] * pow_per_area + + # check for maximal capacity for PV ground mounted + limit_cap = 120000 # in kW + pv_pot["installed capacity in kW"] = pv_pot["installed capacity in kW"].apply( + lambda x: x if x < limit_cap else limit_cap + ) + + return pv_pot + + def adapt_grid_level(pv_pot, max_dist_hv, con): + + """Check and if needed adapt grid level of newly built pv parks. + + Parameters + ---------- + pv_pot: gpd.GeoDataFrame() + Newly built pv parks on selected potential areas + max_dist_hv: int + Assumption for maximum distance of park with hv-power to next substation in m + con: + Connection to database + + """ + + # divide dataframe in MV and HV + pv_pot_mv = pv_pot[pv_pot["voltage_level"] == 5] + pv_pot_hv = pv_pot[pv_pot["voltage_level"] == 4] + + # check installed capacity in MV + + max_cap_mv = 5500 # in kW + + # find PVs which need to be HV or to have reduced capacity + pv_pot_mv_to_hv = pv_pot_mv[pv_pot_mv["installed capacity in kW"] > max_cap_mv] + + if len(pv_pot_mv_to_hv) > 0: + + # import data for HV substations + + sql = "SELECT point, voltage FROM grid.egon_hvmv_substation" + hvmv_substation = gpd.GeoDataFrame.from_postgis(sql, con, geom_col="point") + hvmv_substation = hvmv_substation.to_crs(3035) + hvmv_substation["voltage"] = hvmv_substation["voltage"].apply( + lambda x: int(x.split(";")[0]) + ) + hv_substations = hvmv_substation[hvmv_substation["voltage"] >= 110000] + hv_substations = hv_substations.unary_union # join all the hv_substations + + # check distance to HV substations of PVs with 
too high installed capacity for MV + + # calculate distance to substations + pv_pot_mv_to_hv["dist_to_HV"] = ( + pv_pot_mv_to_hv["geom"].to_crs(3035).distance(hv_substations) + ) + + # adjust grid level and keep capacity if transmission lines are close + pv_pot_mv_to_hv = pv_pot_mv_to_hv[ + pv_pot_mv_to_hv["dist_to_HV"] <= max_dist_hv + ] + pv_pot_mv_to_hv = pv_pot_mv_to_hv.drop(columns=["dist_to_HV"]) + pv_pot_hv = pv_pot_hv.append(pv_pot_mv_to_hv) + + # delete PVs which are now HV from MV dataframe + for index, pot in pv_pot_mv_to_hv.iterrows(): + pv_pot_mv = pv_pot_mv.drop([index]) + pv_pot_hv["voltage_level"] = 4 + + # keep grid level adjust capacity if transmission lines are too far + pv_pot_mv["installed capacity in kW"] = pv_pot_mv[ + "installed capacity in kW" + ].apply(lambda x: x if x < max_cap_mv else max_cap_mv) + pv_pot_mv["voltage_level"] = 5 + + pv_pot = pv_pot_mv.append(pv_pot_hv) + + return pv_pot + + def build_additional_pv(potentials, pv, pow_per_area, con): + + """Build additional pv parks if pv parks on selected potential areas do not hit the target value. 
+ + Parameters + ---------- + potenatials: gpd.GeoDataFrame() + All suitable potential areas + pv: gpd.GeoDataFrame() + Newly built pv parks on selected potential areas + pow_per_area: int + Assumption for areas of existing pv farms and power of new built pv farms depending on area in kW/m² + con: + Connection to database + + """ + + # get MV grid districts + sql = "SELECT subst_id, geom FROM grid.mv_grid_districts" + distr = gpd.GeoDataFrame.from_postgis(sql, con) + distr = distr.set_index("subst_id") + + # identify potential areas where there are no PV parks yet + for index, pv in pv.iterrows(): + potentials = potentials.drop([index]) + + # aggregate potential area per MV grid district + pv_per_distr = gpd.GeoDataFrame() + pv_per_distr["geom"] = distr["geom"].copy() + centroids = potentials.copy() + centroids["geom"] = centroids["geom"].centroid + + overlay = gpd.sjoin(centroids, distr) + + ### examine potential area per grid district + anz = len(overlay) + anz_distr = len(overlay["index_right"].unique()) + size = 137500 # m2 Fläche für > 5,5 MW: (5500 kW / (0,04 kW/m2)) + anz_big = len(overlay[overlay["area"] >= size]) + anz_small = len(overlay[overlay["area"] < size]) + + print(" ") + print("Untersuchung der (übrigen) Potentialflächen in den MV Grid Districts: ") + print("Anzahl der Potentialflächen: " + str(anz)) + print(" -> verteilt über " + str(anz_distr) + " Districts") + print("Anzahl der Flächen mit einem Potential >= 5,5 MW: " + str(anz_big)) + print("Anzahl der Flächen mit einem Potential < 5,5 MW: " + str(anz_small)) + print(" ") + + for index, dist in distr.iterrows(): + pots = overlay[overlay["index_right"] == index]["geom"].index + p = gpd.GeoSeries(index=pots) + for i in pots: + p.loc[i] = potentials["geom"].loc[i] + pv_per_distr["geom"].loc[index] = p.unary_union + + # calculate area per MV grid district and linearly distribute needed capacity considering pow_per_area + pv_per_distr["area"] = pv_per_distr["geom"].area + pv_per_distr["installed 
capacity in kW"] = pv_per_distr["area"] * pow_per_area + + # calculate centroid + pv_per_distr["centroid"] = pv_per_distr["geom"].centroid + + return pv_per_distr + + def check_target( + pv_rora_i, + pv_agri_i, + potentials_rora_i, + potentials_agri_i, + target_power, + pow_per_area, + con, + ): + + """Check target value per scenario and per state. + + Parameters + ---------- + pv_rora_i: gpd.GeoDataFrame() + Newly built pv parks on selected potential areas of road and railways p + pv_agri_i: gpd.GeoDataFrame() + Newly built pv parks on selected potential areas of agriculture + potenatials_rora_i: gpd.GeoDataFrame() + All suitable potential areas of road and railway + potenatials_rora_i: gpd.GeoDataFrame() + All suitable potential areas of agriculture + target_power: int + Target for installed capacity of pv ground mounted in referenced state + pow_per_area: int + Assumption for areas of existing pv farms and power of new built pv farms depending on area in kW/m² + con: + Connection to database + + """ + + # sum overall installed capacity for MV and HV + + total_pv_power = ( + pv_rora_i["installed capacity in kW"].sum() + + pv_agri_i["installed capacity in kW"].sum() + ) + + pv_per_distr_i = gpd.GeoDataFrame() + + # check target value + + ### + print(" ") + print( + "Installierte Kapazität auf Flächen existierender PV-Parks (Bestandsflächen): " + + str(total_pv_power / 1000) + + " MW" + ) + + # linear scale farms to meet target if sum of installed capacity is too high + if total_pv_power > target_power: + + scale_factor = target_power / total_pv_power + pv_rora_i["installed capacity in kW"] = ( + pv_rora_i["installed capacity in kW"] * scale_factor + ) + pv_agri_i["installed capacity in kW"] = ( + pv_agri_i["installed capacity in kW"] * scale_factor + ) + + pv_per_distr_i["grid_district"] = pd.Series() + pv_per_distr_i["installed capacity in kW"] = pd.Series(0) + + ### + print( + "Ausweitung existierender PV-Parks auf Potentialflächen zur Erreichung der 
Zielkapazität ist ausreichend." + ) + print( + "Installierte Leistung ist größer als der Zielwert, es wird eine Skalierung vorgenommen:" + ) + print("Saklierungsfaktor: " + str(scale_factor)) + + # build new pv parks if sum of installed capacity is below target value + elif total_pv_power < target_power: + + rest_cap = target_power - total_pv_power + + ### + print( + "Ausweitung existierender PV-Parks auf Potentialflächen zur Erreichung der Zielkapazität NICHT ausreichend:" + ) + print("Restkapazität: " + str(rest_cap / 1000) + " MW") + print( + "Restkapazität wird zunächst über übrige Potentialflächen Road & Railway verteilt." + ) + + # build pv parks in potential areas road & railway + pv_per_distr_i = build_additional_pv( + potentials_rora_i, pv_rora_i, pow_per_area, con + ) + # change index to add different Dataframes in the end + pv_per_distr_i["grid_district"] = pv_per_distr_i.index.copy() + pv_per_distr_i.index = range(0, len(pv_per_distr_i)) + # delete empty grid districts + index_names = pv_per_distr_i[ + pv_per_distr_i["installed capacity in kW"] == 0.0 + ].index + pv_per_distr_i.drop(index_names, inplace=True) + + if pv_per_distr_i["installed capacity in kW"].sum() > rest_cap: + scale_factor = ( + rest_cap / pv_per_distr_i["installed capacity in kW"].sum() + ) + pv_per_distr_i["installed capacity in kW"] = ( + pv_per_distr_i["installed capacity in kW"] * scale_factor + ) + + ### + print( + "Restkapazität ist mit dem Skalierungsfaktor " + + str(scale_factor) + + " über übrige Potentialflächen Road & Railway verteilt." 
+ ) + + # build pv parks on potential areas ariculture if still necessary + elif pv_per_distr_i["installed capacity in kW"].sum() < rest_cap: + + rest_cap = target_power - total_pv_power + + ### + print( + "Verteilung über Potentialflächen Road & Railway zur Erreichung der Zielkapazität NICHT ausreichend:" + ) + print("Restkapazität: " + str(rest_cap / 1000) + " MW") + print( + "Restkapazität wird über übrige Potentialflächen Agriculture verteilt." + ) + + pv_per_distr_i_2 = build_additional_pv( + potentials_agri_i, pv_agri_i, pow_per_area, con + ) + # change index to add different Dataframes in the end + pv_per_distr_i_2["grid_district"] = pv_per_distr_i_2.index + pv_per_distr_i_2.index = range( + len(pv_per_distr_i), 2 * len(pv_per_distr_i) + ) + # delete empty grid districts + index_names = pv_per_distr_i_2[ + pv_per_distr_i_2["installed capacity in kW"] == 0.0 + ].index + pv_per_distr_i_2.drop(index_names, inplace=True) + pv_per_distr_i.append(pv_per_distr_i_2) + + if pv_per_distr_i["installed capacity in kW"].sum() > rest_cap: + scale_factor = ( + rest_cap / pv_per_distr_i["installed capacity in kW"].sum() + ) + pv_per_distr_i["installed capacity in kW"] = ( + pv_per_distr_i["installed capacity in kW"] * scale_factor + ) + + ### + print( + "Restkapazität ist mit dem Skalierungsfaktor " + + str(scale_factor) + + " über übrige Potentialflächen Road & Railway und Agriculture verteilt." 
+ ) + + # assign grid level to pv_per_distr + v_lvl = pd.Series(dtype=int, index=pv_per_distr_i.index) + for index, distr in pv_per_distr_i.iterrows(): + if distr["installed capacity in kW"] > 5500: # > 5 MW + v_lvl[index] = 4 + else: + v_lvl[index] = 5 + pv_per_distr_i["voltage_level"] = v_lvl + + # new overall installed capacity + total_pv_power = ( + pv_rora_i["installed capacity in kW"].sum() + + pv_agri_i["installed capacity in kW"].sum() + + pv_per_distr_i["installed capacity in kW"].sum() + ) + + ### + print( + "Installierte Leistung der PV-Parks insgesamt: " + + str(total_pv_power / 1000) + + " MW" + ) + print(" ") + + return pv_rora_i, pv_agri_i, pv_per_distr_i + + def run_methodology( + con=db.engine(), + path="", + pow_per_area=0.04, + join_buffer=10, + max_dist_hv=20000, + show_map=False, + ): + + """Execute methodology to distribute pv ground mounted. + + Parameters + ---------- + con: + Connection to database + path : string + Path to location of MaStR-file + pow_per_area: int, default 0.4 + Assumption for areas of existing pv farms and power of new built pv farms depending on area in kW/m² + join_buffer : int, default 10 + Maximum distance for joining of potential areas (only small ones to big ones) in m + max_dist_hv : int, default 20000 + Assumption for maximum distance of park with hv-power to next substation in m + show_map: boolean + Optional creation of map to show distribution of installed capacity + + """ + + ### + print(" ") + print("MaStR-Data") + print(" ") + + # MaStR-data: existing PV farms + mastr = mastr_existing_pv(path, pow_per_area) + + ### + print(" ") + print("potential area") + print(" ") + + # database-data: potential areas for new PV farms + potentials_rora, potentials_agri = potential_areas(con, join_buffer) + + ### + print(" ") + print("select potentials area") + print(" ") + + # select potential areas with existing PV farms to build new PV farms + pv_rora = select_pot_areas(mastr, potentials_rora) + pv_agri = 
select_pot_areas(mastr, potentials_agri) + + ### + print(" ") + print( + "build PV parks where there is PV ground mounted already (-> MaStR) on potential area" + ) + print(" ") + + # build new PV farms + pv_rora = build_pv(pv_rora, pow_per_area) + pv_agri = build_pv(pv_agri, pow_per_area) + + ### + print(" ") + print("adapt grid level of PV parks") + print(" ") + + # adapt grid level to new farms + rora = adapt_grid_level(pv_rora, max_dist_hv, con) + agri = adapt_grid_level(pv_agri, max_dist_hv, con) + + ### + print(" ") + print( + "check target value and build more PV parks on potential area if necessary" + ) + print(" ") + + # 1) scenario: eGon2035 + + ### + print(" ") + print("scenario: eGon2035") + print(" ") + + # German states + sql = "SELECT geometry as geom, nuts FROM boundaries.vg250_lan" + states = gpd.GeoDataFrame.from_postgis(sql, con) + + # assumption for target value of installed capacity + sql = "SELECT capacity,scenario_name,nuts FROM supply.egon_scenario_capacities WHERE carrier='solar'" + target = pd.read_sql(sql, con) + target = target[target["scenario_name"] == "eGon2035"] + nuts = np.unique(target["nuts"]) + + # initialize final dataframe + pv_rora = gpd.GeoDataFrame() + pv_agri = gpd.GeoDataFrame() + pv_per_distr = gpd.GeoDataFrame() + + # prepare selection per state + rora = rora.set_geometry("centroid") + agri = agri.set_geometry("centroid") + potentials_rora = potentials_rora.set_geometry("geom") + potentials_agri = potentials_agri.set_geometry("geom") + + # check target value per state + for i in nuts: + target_power = target[target["nuts"] == i]["capacity"].iloc[0] * 1000 + + ### + land = target[target["nuts"] == i]["nuts"].iloc[0] + print(" ") + print("Bundesland (NUTS): " + land) + print("target power: " + str(target_power / 1000) + " MW") + + # select state + state = states[states["nuts"] == i] + state = state.to_crs(3035) + + # select PVs in state + rora_i = gpd.sjoin(rora, state) + agri_i = gpd.sjoin(agri, state) + 
rora_i.drop("index_right", axis=1, inplace=True) + agri_i.drop("index_right", axis=1, inplace=True) + rora_i.drop_duplicates(inplace=True) + agri_i.drop_duplicates(inplace=True) + + # select potential area in state + potentials_rora_i = gpd.sjoin(potentials_rora, state) + potentials_agri_i = gpd.sjoin(potentials_agri, state) + potentials_rora_i.drop("index_right", axis=1, inplace=True) + potentials_agri_i.drop("index_right", axis=1, inplace=True) + potentials_rora_i.drop_duplicates(inplace=True) + potentials_agri_i.drop_duplicates(inplace=True) + + # check target value and adapt installed capacity if necessary + rora_i, agri_i, distr_i = check_target( + rora_i, + agri_i, + potentials_rora_i, + potentials_agri_i, + target_power, + pow_per_area, + con, + ) + if len(distr_i) > 0: + distr_i["nuts"] = target[target["nuts"] == i]["nuts"].iloc[0] + + ### examination of built PV parks per state + rora_i_mv = rora_i[rora_i["voltage_level"] == 5] + rora_i_hv = rora_i[rora_i["voltage_level"] == 4] + agri_i_mv = agri_i[agri_i["voltage_level"] == 5] + agri_i_hv = agri_i[agri_i["voltage_level"] == 4] + print("Untersuchung der Spannungslevel pro Bundesland:") + print("a) PVs auf Potentialflächen Road & Railway: ") + print( + "Insgesamt installierte Leistung: " + + str(rora_i["installed capacity in kW"].sum() / 1000) + + " MW" + ) + print("Anzahl der PV-Parks: " + str(len(rora_i))) + print(" - davon Mittelspannung: " + str(len(rora_i_mv))) + print(" - davon Hochspannung: " + str(len(rora_i_hv))) + print("b) PVs auf Potentialflächen Agriculture: ") + print( + "Insgesamt installierte Leistung: " + + str(agri_i["installed capacity in kW"].sum() / 1000) + + " MW" + ) + print("Anzahl der PV-Parks: " + str(len(agri_i))) + print(" - davon Mittelspannung: " + str(len(agri_i_mv))) + print(" - davon Hochspannung: " + str(len(agri_i_hv))) + print("c) PVs auf zusätzlichen Potentialflächen pro MV-District: ") + if len(distr_i) > 0: + distr_i_mv = distr_i[distr_i["voltage_level"] == 5] + 
distr_i_hv = distr_i[distr_i["voltage_level"] == 4] + print( + "Insgesamt installierte Leistung: " + + str(distr_i["installed capacity in kW"].sum() / 1000) + + " MW" + ) + print("Anzahl der PV-Parks: " + str(len(distr_i))) + print(" - davon Mittelspannung: " + str(len(distr_i_mv))) + print(" - davon Hochspannung: " + str(len(distr_i_hv))) + else: + print(" -> zusätzlicher Ausbau nicht notwendig") + print(" ") + + pv_rora = pv_rora.append(rora_i) + pv_agri = pv_agri.append(agri_i) + if len(distr_i) > 0: + pv_per_distr = pv_per_distr.append(distr_i) + + # 2) scenario: eGon100RE + + # assumption for target value of installed capacity in Germany per scenario + sql = "SELECT capacity,scenario_name FROM supply.egon_scenario_capacities WHERE carrier='solar'" + target_power = pd.read_sql(sql, con) + target_power = target_power[target_power["scenario_name"] == "eGon100RE"] + target_power = target_power["capacity"].sum() * 1000 + + ### + print(" ") + print("scenario: eGon100RE") + print("target power: " + str(target_power) + " kW") + print(" ") + + # check target value and adapt installed capacity if necessary + pv_rora_100RE, pv_agri_100RE, pv_per_distr_100RE = check_target( + rora, + agri, + potentials_rora, + potentials_agri, + target_power, + pow_per_area, + con, + ) + + ### create map to show distribution of installed capacity + if show_map == True: + + # 1) eGon2035 + + # get MV grid districts + sql = "SELECT subst_id, geom FROM grid.mv_grid_districts" + distr = gpd.GeoDataFrame.from_postgis(sql, con) + distr = distr.set_index("subst_id") + + # assign pv_per_distr-power to districts + distr["capacity"] = pd.Series() + for index, row in distr.iterrows(): + if index in np.unique(pv_per_distr["grid_district"]): + pv = pv_per_distr[pv_per_distr["grid_district"] == index] + x = pv["installed capacity in kW"].iloc[0] + distr["capacity"].loc[index] = x + else: + distr["capacity"].loc[index] = 0 + distr["capacity"] = distr["capacity"] / 1000 + + # add pv_rora- and 
pv_agri-power to district + pv_rora = pv_rora.set_geometry("centroid") + pv_agri = pv_agri.set_geometry("centroid") + overlay_rora = gpd.sjoin(pv_rora, distr) + overlay_agri = gpd.sjoin(pv_agri, distr) + + for index, row in distr.iterrows(): + o_rora = overlay_rora[overlay_rora["index_right"] == index] + o_agri = overlay_agri[overlay_agri["index_right"] == index] + cap_rora = o_rora["installed capacity in kW"].sum() / 1000 + cap_agri = o_agri["installed capacity in kW"].sum() / 1000 + distr["capacity"].loc[index] = ( + distr["capacity"].loc[index] + cap_rora + cap_agri + ) + + from matplotlib import pyplot as plt + + fig, ax = plt.subplots(1, 1) + distr.boundary.plot(linewidth=0.2, ax=ax, color="black") + distr.plot( + ax=ax, + column="capacity", + cmap="magma_r", + legend=True, + legend_kwds={ + "label": f"Installed capacity in MW", + "orientation": "vertical", + }, + ) + plt.savefig("pv_per_distr_map_eGon2035.png", dpi=300) + + # 2) eGon100RE + + # get MV grid districts + sql = "SELECT subst_id, geom FROM grid.mv_grid_districts" + distr = gpd.GeoDataFrame.from_postgis(sql, con) + distr = distr.set_index("subst_id") + + # assign pv_per_distr-power to districts + distr["capacity"] = pd.Series() + for index, row in distr.iterrows(): + if index in np.unique(pv_per_distr_100RE["grid_district"]): + pv = pv_per_distr_100RE[ + pv_per_distr_100RE["grid_district"] == index + ] + x = pv["installed capacity in kW"].iloc[0] + distr["capacity"].loc[index] = x + else: + distr["capacity"].loc[index] = 0 + distr["capacity"] = distr["capacity"] / 1000 + + # add pv_rora- and pv_agri-power to district + pv_rora_100RE = pv_rora_100RE.set_geometry("centroid") + pv_agri_100RE = pv_agri_100RE.set_geometry("centroid") + overlay_rora = gpd.sjoin(pv_rora_100RE, distr) + overlay_agri = gpd.sjoin(pv_agri_100RE, distr) + + for index, row in distr.iterrows(): + o_rora = overlay_rora[overlay_rora["index_right"] == index] + o_agri = overlay_agri[overlay_agri["index_right"] == index] + cap_rora = 
o_rora["installed capacity in kW"].sum() / 1000 + cap_agri = o_agri["installed capacity in kW"].sum() / 1000 + distr["capacity"].loc[index] = ( + distr["capacity"].loc[index] + cap_rora + cap_agri + ) + + from matplotlib import pyplot as plt + + fig, ax = plt.subplots(1, 1) + distr.boundary.plot(linewidth=0.2, ax=ax, color="black") + distr.plot( + ax=ax, + column="capacity", + cmap="magma_r", + legend=True, + legend_kwds={ + "label": f"Installed capacity in MW", + "orientation": "vertical", + }, + ) + plt.savefig("pv_per_distr_map_eGon100RE.png", dpi=300) + + return ( + pv_rora, + pv_agri, + pv_per_distr, + pv_rora_100RE, + pv_agri_100RE, + pv_per_distr_100RE, + ) + + def pv_parks(pv_rora, pv_agri, pv_per_distr, scenario_name): + + """Write to database. + + Parameters + ---------- + pv_rora : gpd.GeoDataFrame() + Pv parks on selected potential areas of raod and railway + pv_agri : gpd.GeoDataFrame() + Pv parks on selected potential areas of raod and railway + pv_per_distr: gpd.GeoDataFrame() + Additionally built pv parks on potential areas per mv grid district + scenario_name: + Scenario name of calculation + + """ + + # prepare dataframe for integration in supply.egon_power_plants + + # change indices to sum up Dataframes in the end + pv_rora["pot_idx"] = pv_rora.index + pv_rora.index = range(0, len(pv_rora)) + pv_agri["pot_idx"] = pv_agri.index + l1 = len(pv_rora) + len(pv_agri) + pv_agri.index = range(len(pv_rora), l1) + l2 = l1 + len(pv_per_distr) + pv_per_distr.index = range(l1, l2) + + pv_parks = gpd.GeoDataFrame(index=range(0, l2)) + + # electrical capacity in MW + cap = pv_rora["installed capacity in kW"].append( + pv_agri["installed capacity in kW"] + ) + cap = cap.append(pv_per_distr["installed capacity in kW"]) + cap = cap / 1000 + pv_parks["el_capacity"] = cap + + # voltage level + lvl = pv_rora["voltage_level"].append(pv_agri["voltage_level"]) + lvl = lvl.append(pv_per_distr["voltage_level"]) + pv_parks["voltage_level"] = lvl + + # centroids + cen = 
pv_rora["centroid"].append(pv_agri["centroid"]) + cen = cen.append(pv_per_distr["centroid"]) + pv_parks = pv_parks.set_geometry(cen) + + # integration in supply.egon_power_plants + + con = db.engine() + + # maximum ID in egon_power_plants + sql = "SELECT MAX(id) FROM supply.egon_power_plants" + max_id = pd.read_sql(sql, con) + max_id = max_id["max"].iat[0] + if max_id == None: + max_id = 1 + + pv_park_id = max_id + 1 + + # copy relevant columns from pv_parks + insert_pv_parks = pv_parks[["el_capacity", "voltage_level", "geometry"]] + + # set static column values + insert_pv_parks["carrier"] = "solar" + insert_pv_parks["chp"] = False + insert_pv_parks["th_capacity"] = 0 + insert_pv_parks["scenario"] = scenario_name + + # change name and crs of geometry column + insert_pv_parks = ( + insert_pv_parks.rename({"geometry": "geom"}, axis=1) + .set_geometry("geom") + .to_crs(4326) + ) + + # reset index + insert_pv_parks.index = pd.RangeIndex( + start=pv_park_id, stop=pv_park_id + len(insert_pv_parks), name="id" + ) + + # insert into database + insert_pv_parks.reset_index().to_postgis( + "egon_power_plants", schema="supply", con=db.engine(), if_exists="append" + ) + + return pv_parks + + ######################################################################### + + # execute methodology + + ( + pv_rora, + pv_agri, + pv_per_distr, + pv_rora_100RE, + pv_agri_100RE, + pv_per_distr_100RE, + ) = run_methodology( + con=db.engine(), + path="", + pow_per_area=0.04, + join_buffer=10, + max_dist_hv=20000, + show_map=False, + ) + + ### examination of results + if len(pv_per_distr) > 0: + pv_per_distr_mv = pv_per_distr[pv_per_distr["voltage_level"] == 5] + pv_per_distr_hv = pv_per_distr[pv_per_distr["voltage_level"] == 4] + pv_rora_mv = pv_rora[pv_rora["voltage_level"] == 5] + pv_rora_hv = pv_rora[pv_rora["voltage_level"] == 4] + pv_agri_mv = pv_agri[pv_agri["voltage_level"] == 5] + pv_agri_hv = pv_agri[pv_agri["voltage_level"] == 4] + + print(" ") + print("Untersuchung der 
Spannungslevel (gesamt):") + print("a) PVs auf Potentialflächen Road & Railway: ") + print( + "Insgesamt installierte Leistung: " + + str(pv_rora["installed capacity in kW"].sum() / 1000) + + " MW" + ) + print("Anzahl der PV-Parks: " + str(len(pv_rora))) + print(" - davon Mittelspannung: " + str(len(pv_rora_mv))) + print(" - davon Hochspannung: " + str(len(pv_rora_hv))) + print("b) PVs auf Potentialflächen Agriculture: ") + print( + "Insgesamt installierte Leistung: " + + str(pv_agri["installed capacity in kW"].sum() / 1000) + + " MW" + ) + print("Anzahl der PV-Parks: " + str(len(pv_agri))) + print(" - davon Mittelspannung: " + str(len(pv_agri_mv))) + print(" - davon Hochspannung: " + str(len(pv_agri_hv))) + print("c) PVs auf zusätzlichen Potentialflächen pro MV-District: ") + if len(pv_per_distr) > 0: + print( + "Insgesamt installierte Leistung: " + + str(pv_per_distr["installed capacity in kW"].sum() / 1000) + + " MW" + ) + print("Anzahl der PV-Parks: " + str(len(pv_per_distr))) + print(" - davon Mittelspannung: " + str(len(pv_per_distr_mv))) + print(" - davon Hochspannung: " + str(len(pv_per_distr_hv))) + else: + print(" -> zusätzlicher Ausbau nicht notwendig") + print(" ") + ### + + # save to DB + + if ( + pv_rora["installed capacity in kW"].sum() > 0 + or pv_agri["installed capacity in kW"].sum() > 0 + or pv_per_distr["installed capacity in kW"].sum() + ): + + pv_parks = pv_parks(pv_rora, pv_agri, pv_per_distr, "eGon2035") + + else: + + pv_parks = gpd.GeoDataFrame() + + if ( + pv_rora_100RE["installed capacity in kW"].sum() > 0 + or pv_agri_100RE["installed capacity in kW"].sum() > 0 + or pv_per_distr_100RE["installed capacity in kW"].sum() + ): + + pv_parks_100RE = pv_parks( + pv_rora_100RE, pv_agri_100RE, pv_per_distr_100RE, "eGon100RE" + ) + + else: + + pv_parks_100RE = gpd.GeoDataFrame() + + return pv_parks, pv_parks_100RE diff --git a/src/egon/data/processing/wind_farms.py b/src/egon/data/processing/wind_farms.py new file mode 100755 index 
000000000..74c829ed5 --- /dev/null +++ b/src/egon/data/processing/wind_farms.py @@ -0,0 +1,477 @@ +from egon.data import db +import geopandas as gpd +import pandas as pd +import numpy as np +from matplotlib import pyplot as plt +from shapely.geometry import Polygon, LineString, Point, MultiPoint + + +def wind_power_parks(): + """ Main function. Import power objectives generate results calling the + functions "generate_wind_farms" and "wind_power_states". + + Parameters + ---------- + *No parameters required + + """ + + con = db.engine() + + # federal_std has the shapes of the German states + sql = "SELECT gen, gf, nuts, geometry FROM boundaries.vg250_lan" + federal_std = gpd.GeoDataFrame.from_postgis(sql, con, geom_col="geometry", crs=4326) + + # target_power_df has the expected capacity of each federal state + sql = "SELECT carrier, capacity, nuts, scenario_name FROM supply.egon_scenario_capacities" + target_power_df = pd.read_sql(sql, con) + + # mv_districts has geographic info of medium voltage districts in Germany + sql = "SELECT geom FROM grid.mv_grid_districts" + mv_districts = gpd.GeoDataFrame.from_postgis(sql, con) + + # Delete all the water bodies from the federal states shapes + federal_std = federal_std[federal_std["gf"] != 1] + federal_std.drop(columns=["gf"], inplace=True) + # Filter the potential expected from wind_onshore + target_power_df = target_power_df[target_power_df["carrier"] == "wind_onshore"] + target_power_df.set_index("nuts", inplace=True) + target_power_df["geom"] = Point(0, 0) + + # Join the geometries which belong to the same states + for std in target_power_df.index: + df = federal_std[federal_std["nuts"] == std] + if df.size > 0: + target_power_df.at[std, "name"] = df["gen"].iat[0] + else: + target_power_df.at[std, "name"] = np.nan + target_power_df.at[std, "geom"] = df.unary_union + target_power_df = gpd.GeoDataFrame(target_power_df, geometry="geom", crs=4326) + target_power_df = target_power_df[target_power_df["capacity"] > 0] + 
target_power_df = target_power_df.to_crs(3035) + + # Generate WFs for Germany based on potential areas and existing WFs + wf_areas, wf_areas_ni = generate_wind_farms() + + # Change the columns "geometry" of this GeoDataFrames + wf_areas.set_geometry("centroid", inplace=True) + wf_areas_ni.set_geometry("centroid", inplace=True) + + # Create centroids of mv_districts to apply the clip function + mv_districts["centroid"] = mv_districts.centroid + mv_districts.set_geometry("centroid", inplace=True) + + summary_t = pd.DataFrame() + farms = pd.DataFrame() + + # Fit wind farms scenarions for each one of the states + for scenario in target_power_df.index: + state_wf = gpd.clip(wf_areas, target_power_df.at[scenario, "geom"]) + state_wf_ni = gpd.clip(wf_areas_ni, target_power_df.at[scenario, "geom"]) + state_mv_districts = gpd.clip( + mv_districts, target_power_df.at[scenario, "geom"] + ) + target_power = target_power_df.at[scenario, "capacity"] + scenario_year = target_power_df.at[scenario, "scenario_name"] + source = target_power_df.at[scenario, "carrier"] + fed_state = target_power_df.at[scenario, "name"] + wind_farms_state, summary_state = wind_power_states( + state_wf, + state_wf_ni, + state_mv_districts, + target_power, + scenario_year, + source, + fed_state, + ) + summary_t = summary_t.append(summary_state) + farms = farms.append(wind_farms_state) + + generate_map() + + return (summary_t, farms) + + +def generate_wind_farms(): + """Generate wind farms based on existing wind farms. 
+ + Parameters + ---------- + *No parameters required + + """ + # Due to typos in some inputs, some areas of existing wind farms + # should be discarded using perimeter and area filters + def filter_current_wf(wf_geometry): + if wf_geometry.geom_type == "Point": + return True + if wf_geometry.geom_type == "Polygon": + area = wf_geometry.area + length = wf_geometry.length + # Filter based on the biggest (# of WT) wind farm + return (area < 40000000) & (length < 40000) + if wf_geometry.geom_type == "LineString": + length = wf_geometry.length + return length < 1008 # 8 * rotor diameter (8*126m) + + # The function 'wind_farm' returns the connection point of a wind turbine + def wind_farm(x): + try: + return map_ap_wea_farm[x] + except: + return np.nan + + # The function 'voltage' returns the voltage level a wind turbine operates + def voltage(x): + try: + return map_ap_wea_voltage[x] + except: + return np.nan + + # Connect to the data base + con = db.engine() + sql = "SELECT geom FROM supply.egon_re_potential_area_wind" + # wf_areas has all the potential areas geometries for wind farms + wf_areas = gpd.GeoDataFrame.from_postgis(sql, con) + # bus has the connection points of the wind farms + bus = pd.read_csv("location_elec_generation_raw.csv") + # Drop all the rows without connection point + bus.dropna(subset=["NetzanschlusspunktMastrNummer"], inplace=True) + # wea has info of each wind turbine in Germany. + wea = pd.read_csv("bnetza_mastr_wind_cleaned.csv") + + # Delete all the rows without information about geographical location + wea = wea[(pd.notna(wea["Laengengrad"])) & (pd.notna(wea["Breitengrad"]))] + # Delete all the offshore wind turbines + wea = wea[wea["Kuestenentfernung"] == 0] + # the variable map_ap_wea_farm have the connection point of all the available wt + # in the dataframe bus. 
+ map_ap_wea_farm = {} + map_ap_wea_voltage = {} + for i in bus.index: + for unit in bus["MaStRNummer"][i][1:-1].split(", "): + map_ap_wea_farm[unit[1:-1]] = bus["NetzanschlusspunktMastrNummer"][i] + map_ap_wea_voltage[unit[1:-1]] = bus["Spannungsebene"][i] + wea["connection point"] = wea["EinheitMastrNummer"].apply(wind_farm) + wea["voltage"] = wea["EinheitMastrNummer"].apply(voltage) + + # Create the columns 'geometry' which will have location of each WT in a point type + wea = gpd.GeoDataFrame( + wea, + geometry=gpd.points_from_xy(wea["Laengengrad"], wea["Breitengrad"], crs=4326), + ) + + # wf_size storages the number of WT connected to each connection point + wf_size = wea["connection point"].value_counts() + # Delete all the connection points with less than 3 WT + wf_size = wf_size[wf_size >= 3] + # Filter all the WT which are not part of a wind farm of at least 3 WT + wea = wea[wea["connection point"].isin(wf_size.index)] + # current_wfs has all the geometries that represent the existing wind farms + current_wfs = gpd.GeoDataFrame( + index=wf_size.index, crs=4326, columns=["geometry", "voltage"] + ) + for conn_point, wt_location in wea.groupby("connection point"): + current_wfs.at[conn_point, "geometry"] = MultiPoint( + wt_location["geometry"].values + ).convex_hull + current_wfs.at[conn_point, "voltage"] = wt_location["voltage"].iat[0] + current_wfs["geometry2"] = current_wfs["geometry"].to_crs(3035) + current_wfs["area"] = current_wfs["geometry2"].apply(lambda x: x.area) + current_wfs["length"] = current_wfs["geometry2"].apply(lambda x: x.length) + # The 'filter_wts' is used to discard atypical values for the current wind farms + current_wfs["filter2"] = current_wfs["geometry2"].apply( + lambda x: filter_current_wf(x) + ) + + # Apply the filter based on area and perimeter + current_wfs = current_wfs[current_wfs["filter2"]] + current_wfs = current_wfs.drop(columns=["geometry2", "filter2"]) + + wf_areas["area [km²]"] = wf_areas.area / 1000000 + + # Exclude 
areas smaller than X km². X was calculated as the area of + # 3 WT in the corners of an equilateral triangle with l = 4*rotor_diameter + min_area = 4 * (0.126 ** 2) * np.sqrt(3) + wf_areas = wf_areas[wf_areas["area [km²]"] > min_area] + + # Find the centroid of all the suitable potential areas + wf_areas["centroid"] = wf_areas.centroid + + # find the potential areas that intersects the convex hulls of current wind farms + # and assign voltage levels + wf_areas = wf_areas.to_crs(4326) + for i in wf_areas.index: + intersection = current_wfs.intersects(wf_areas.at[i, "geom"]) + if intersection.any() == False: + wf_areas.at[i, "voltage"] = "No Intersection" + else: + wf_areas.at[i, "voltage"] = current_wfs[intersection].voltage[0] + + # wf_areas_ni has the potential areas which don't intersect any current wind farm + wf_areas_ni = wf_areas[wf_areas["voltage"] == "No Intersection"] + wf_areas = wf_areas[wf_areas["voltage"] != "No Intersection"] + return wf_areas, wf_areas_ni + + +def wind_power_states( + state_wf, + state_wf_ni, + state_mv_districts, + target_power, + scenario_year, + source, + fed_state, +): + """Allocate wind farms in a federal state to match its target capacity. + + Parameters + ---------- + state_wf: geodataframe, mandatory + gdf containing all the wf in the state created based on existing wf. + state_wf_ni: geodataframe, mandatory + potential areas in the state which don't intersect any existing wf + state_mv_districts: geodataframe, mandatory + gdf containing all the MV grid districts in the state + target_power: int, mandatory + Objective power for a state given in MW + scenario_year: str, mandatory + name of the scenario + source: str, mandatory + Type of energy generator. Always "Wind_onshore" for this script. 
+ fed_state: str, mandatory + Name of the state where the wind farms will be allocated + + """ + + def match_district_se(x): + for sub in hvmv_substation.index: + if x["geom"].contains(hvmv_substation.at[sub, "point"]): + return hvmv_substation.at[sub, "point"] + + con = db.engine() + sql = "SELECT point, voltage FROM grid.egon_hvmv_substation" + # hvmv_substation has the information about HV transmission lines in Germany + hvmv_substation = gpd.GeoDataFrame.from_postgis(sql, con, geom_col="point") + + # Set wind potential depending on geographical location + power_north = 21.05 # MW/km² + power_south = 16.81 # MW/km² + # Set a maximum installed capacity to limit the power of big potential areas + max_power_hv = 120 # in MW + max_power_mv = 20 # in MW + # Max distance between WF (connected to MV) and nearest HV substation that + # allows its connection to HV. + max_dist_hv = 20000 # in meters + + summary = pd.DataFrame( + columns=["state", "target", "from existin WF", "MV districts"] + ) + + north = [ + "Schleswig-Holstein", + "Mecklenburg-Vorpommern", + "Niedersachsen", + "Bremen", + "Hamburg", + ] + + if fed_state in north: + state_wf["inst capacity [MW]"] = power_north * state_wf["area [km²]"] + else: + state_wf["inst capacity [MW]"] = power_south * state_wf["area [km²]"] + + # Divide selected areas based on voltage of connection points + wf_mv = state_wf[ + (state_wf["voltage"] != "Hochspannung") + & (state_wf["voltage"] != "Hoechstspannung") + & (state_wf["voltage"] != "UmspannungZurHochspannung") + ] + + wf_hv = state_wf[ + (state_wf["voltage"] == "Hochspannung") + | (state_wf["voltage"] == "Hoechstspannung") + | (state_wf["voltage"] == "UmspannungZurHochspannung") + ] + + # Wind farms connected to MV network will be connected to HV network if the distance + # to the closest HV substation is =< max_dist_hv, and the installed capacity + # is bigger than max_power_mv + hvmv_substation = hvmv_substation.to_crs(3035) + hvmv_substation["voltage"] = 
hvmv_substation["voltage"].apply( + lambda x: int(x.split(";")[0]) + ) + hv_substations = hvmv_substation[hvmv_substation["voltage"] >= 110000] + hv_substations = hv_substations.unary_union # join all the hv_substations + wf_mv["dist_to_HV"] = state_wf["geom"].to_crs(3035).distance(hv_substations) + wf_mv_to_hv = wf_mv[ + (wf_mv["dist_to_HV"] <= max_dist_hv) + & (wf_mv["inst capacity [MW]"] >= max_power_mv) + ] + wf_mv_to_hv = wf_mv_to_hv.drop(columns=["dist_to_HV"]) + wf_mv_to_hv["voltage"] = "Hochspannung" + + wf_hv = wf_hv.append(wf_mv_to_hv) + wf_mv = wf_mv[ + (wf_mv["dist_to_HV"] > max_dist_hv) + | (wf_mv["inst capacity [MW]"] < max_power_mv) + ] + wf_mv = wf_mv.drop(columns=["dist_to_HV"]) + + wf_hv["inst capacity [MW]"] = wf_hv["inst capacity [MW]"].apply( + lambda x: x if x < max_power_hv else max_power_hv + ) + + wf_mv["inst capacity [MW]"] = wf_mv["inst capacity [MW]"].apply( + lambda x: x if x < max_power_mv else max_power_mv + ) + + wind_farms = wf_hv.append(wf_mv) + + # Adjust the total installed capacity to the scenario + total_wind_power = ( + wf_hv["inst capacity [MW]"].sum() + wf_mv["inst capacity [MW]"].sum() + ) + if total_wind_power > target_power: + scale_factor = target_power / total_wind_power + wf_mv["inst capacity [MW]"] = wf_mv["inst capacity [MW]"] * scale_factor + wf_hv["inst capacity [MW]"] = wf_hv["inst capacity [MW]"] * scale_factor + wind_farms = wf_hv.append(wf_mv) + summary = summary.append( + { + "state": fed_state, + "target": target_power, + "from existin WF": wind_farms["inst capacity [MW]"].sum(), + "MV districts": 0, + }, + ignore_index=True, + ) + else: + extra_wf = state_mv_districts.copy() + extra_wf = extra_wf.drop(columns=["centroid"]) + # the column centroid has the coordinates of the substation corresponting + # to each mv_grid_district + extra_wf["centroid"] = extra_wf.apply(match_district_se, axis=1) + extra_wf = extra_wf.set_geometry("centroid") + extra_wf["area [km²]"] = 0.0 + for district in extra_wf.index: + try: 
+ pot_area_district = gpd.clip(state_wf_ni, extra_wf.at[district, "geom"]) + extra_wf.at[district, "area [km²]"] = pot_area_district[ + "area [km²]" + ].sum() + except: + print(district) + extra_wf = extra_wf[extra_wf["area [km²]"] != 0] + total_new_area = extra_wf["area [km²]"].sum() + scale_factor = (target_power - total_wind_power) / total_new_area + extra_wf["inst capacity [MW]"] = extra_wf["area [km²]"] * scale_factor + extra_wf["voltage"] = "Hochspannung" + summary = summary.append( + { + "state": fed_state, + "target": target_power, + "from existin WF": wind_farms["inst capacity [MW]"].sum(), + "MV districts": extra_wf["inst capacity [MW]"].sum(), + }, + ignore_index=True, + ) + wind_farms = wind_farms.append(extra_wf, ignore_index=True) + + # Use Definition of thresholds for voltage level assignment + wind_farms["voltage_level"] = 0 + for i in wind_farms.index: + try: + if wind_farms.at[i, "inst capacity [MW]"] < 5.5: + wind_farms.at[i, "voltage_level"] = 5 + continue + if wind_farms.at[i, "inst capacity [MW]"] < 20: + wind_farms.at[i, "voltage_level"] = 4 + continue + if wind_farms.at[i, "inst capacity [MW]"] >= 20: + wind_farms.at[i, "voltage_level"] = 3 + continue + except: + print(i) + + # Look for the maximum id in the table egon_power_plants + sql = "SELECT MAX(id) FROM supply.egon_power_plants" + max_id = pd.read_sql(sql, con) + max_id = max_id["max"].iat[0] + if max_id == None: + wind_farm_id = 1 + else: + wind_farm_id = int(max_id + 1) + + # write_table in egon-data database: + + # Copy relevant columns from wind_farms + insert_wind_farms = wind_farms[["inst capacity [MW]", "voltage_level", "centroid"]] + + # Set static column values + insert_wind_farms["carrier"] = source + insert_wind_farms["chp"] = False + insert_wind_farms["th_capacity"] = 0 + insert_wind_farms["scenario"] = scenario_year + + # Change name and crs of geometry column + insert_wind_farms = ( + insert_wind_farms.rename( + {"centroid": "geom", "inst capacity [MW]": "el_capacity"}, 
axis=1 + ) + .set_geometry("geom") + .to_crs(4326) + ) + + # Reset index + insert_wind_farms.index = pd.RangeIndex( + start=wind_farm_id, stop=wind_farm_id + len(insert_wind_farms), name="id" + ) + + # Insert into database + insert_wind_farms.reset_index().to_postgis( + "egon_power_plants", schema="supply", con=db.engine(), if_exists="append" + ) + return wind_farms, summary + + +def generate_map(): + """ Generates a map with the position of all the wind farms + + Parameters + ---------- + *No parameters required + + """ + con = db.engine() + + # Import wind farms from egon-data + sql = "SELECT carrier, el_capacity, geom FROM supply.egon_power_plants" + wind_farms = gpd.GeoDataFrame.from_postgis(sql, con, geom_col="geom", crs=4326) + wind_farms = wind_farms.to_crs(3035) + + # mv_districts has geographic info of medium voltage districts in Germany + sql = "SELECT geom FROM grid.mv_grid_districts" + mv_districts = gpd.GeoDataFrame.from_postgis(sql, con) + mv_districts = mv_districts.to_crs(3035) + + mv_districts["power"] = 0.0 + for std in mv_districts.index: + try: + mv_districts.at[std, "power"] = gpd.clip( + wind_farms, mv_districts.at[std, "geom"] + ).el_capacity.sum() + except: + print(std) + + fig, ax = plt.subplots(1, 1) + mv_districts.geom.plot(linewidth=0.2, ax=ax, color="black") + mv_districts.plot( + ax=ax, + column="power", + cmap="magma_r", + legend=True, + legend_kwds={"label": "Installed capacity in MW", "orientation": "vertical"}, + ) + plt.savefig("wind_farms_map.png", dpi=300) + return 0 diff --git a/src/egon/data/processing/zensus_grid_districts.py b/src/egon/data/processing/zensus_grid_districts.py index e620bba92..b335d576c 100644 --- a/src/egon/data/processing/zensus_grid_districts.py +++ b/src/egon/data/processing/zensus_grid_districts.py @@ -8,7 +8,7 @@ from sqlalchemy import Column, Integer, ForeignKey from sqlalchemy.ext.declarative import declarative_base from egon.data.processing.zensus_vg250.zensus_population_inside_germany import 
DestatisZensusPopulationPerHa -from egon.data.processing.mv_grid_districts import MvGridDistricts +from egon.data.datasets.mv_grid_districts import MvGridDistricts # will be later imported from another file ### From 09b0575c1bb78d0c95837e2e26ef61fb703f217a Mon Sep 17 00:00:00 2001 From: nesnoj Date: Wed, 28 Jul 2021 09:53:23 +0200 Subject: [PATCH 58/97] filter VG250 states - use land only (GF=4) #260 --- src/egon/data/datasets/hh_demand_profiles.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/egon/data/datasets/hh_demand_profiles.py b/src/egon/data/datasets/hh_demand_profiles.py index 3ef6ba2c0..35107dd32 100644 --- a/src/egon/data/datasets/hh_demand_profiles.py +++ b/src/egon/data/datasets/hh_demand_profiles.py @@ -858,7 +858,8 @@ def houseprofiles_in_census_cells(): LEFT JOIN boundaries.egon_map_zensus_vg250 as vg250 ON (pop.gid=vg250.zensus_population_id) LEFT JOIN boundaries.vg250_lan as lan - ON (LEFT(vg250.vg250_nuts3, 3)=lan.nuts) """ + ON (LEFT(vg250.vg250_nuts3, 3)=lan.nuts) + WHERE lan.gf = 4 """ ) df_grid_id = df_grid_id.drop_duplicates() df_grid_id = df_grid_id.reset_index(drop=True) From f28f41074cb3cc9eede2dd7b8070d6f1e4c9d188 Mon Sep 17 00:00:00 2001 From: nesnoj Date: Wed, 28 Jul 2021 13:24:59 +0200 Subject: [PATCH 59/97] use drawing with replacement if pool size < sample size, #260 --- src/egon/data/datasets/hh_demand_profiles.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/egon/data/datasets/hh_demand_profiles.py b/src/egon/data/datasets/hh_demand_profiles.py index 35107dd32..ac51e6a4a 100644 --- a/src/egon/data/datasets/hh_demand_profiles.py +++ b/src/egon/data/datasets/hh_demand_profiles.py @@ -548,7 +548,9 @@ def get_cell_demand_profile_ids(df_cell, pool_size): """ Generates tuple of hh_type and zensus cell ids - Takes a random sample (without replacement) of profile ids for given cell. 
+ Takes a random sample of profile ids for given cell: + * if pool size >= sample size: without replacement + * if pool size < sample size: with replacement The number of households are rounded to the nearest integer if float. Parameters @@ -566,10 +568,12 @@ def get_cell_demand_profile_ids(df_cell, pool_size): """ # maybe use instead # np.random.default_rng().integers(low=0, high=pool_size[hh_type], size=sq) instead of random.sample - # use random.choice() if with replacement + # use random.choices() if with replacement # list of sample ids per hh_type in cell cell_profile_ids = [ (hh_type, random.sample(range(pool_size[hh_type]), k=sq)) + if pool_size[hh_type] >= sq + else (hh_type, random.choices(range(pool_size[hh_type]), k=sq)) for hh_type, sq in zip( df_cell["hh_type"], np.rint(df_cell["hh_10types"].values).astype(int), @@ -636,9 +640,10 @@ def get_cell_demand_metadata(df_zensus_cells, df_profiles): for grid_id, df_cell in df_zensus_cells.groupby(by="grid_id"): # random sampling of household profiles for each cell - # without replacement within cell but after - # number of households are rounded to the nearest integer if float - # this results in a small deviation for the course of the aggregated profiles + # with or without replacement (see :func:`get_cell_demand_profile_ids`) + # within cell but after number of households are rounded to the nearest + # integer if float this results in a small deviation for the course of + # the aggregated profiles. cell_profile_ids = get_cell_demand_profile_ids(df_cell, pool_size) df_cell_demand_metadata.at[grid_id, "cell_id"] = df_cell.loc[ From 990edd4d8f1d77c50cdceb92e22f727f9ac370ca Mon Sep 17 00:00:00 2001 From: "Julian.Endres" Date: Wed, 28 Jul 2021 16:19:12 +0200 Subject: [PATCH 60/97] Exclude zensus cell data which is unacceptably modified Destatis marks cell data which is highly modified for secrecy reasons. These cell data is not corrected but excluded. 
--- src/egon/data/datasets/hh_demand_profiles.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/egon/data/datasets/hh_demand_profiles.py b/src/egon/data/datasets/hh_demand_profiles.py index ac51e6a4a..602e66b74 100644 --- a/src/egon/data/datasets/hh_demand_profiles.py +++ b/src/egon/data/datasets/hh_demand_profiles.py @@ -45,7 +45,7 @@ To use most of them, the spatial information about the number of households per cell (5 categories) needs to be enriched by supplementary data to match the household demand profile categories specifications. Hence, 10 out of 12 -different household profile categories can be distinguished and by increasing +different household profile categories can be distinguished by increasing the number of categories of cell-level household data. **How are these datasets combined?** @@ -79,6 +79,8 @@ each federal state * Household profiles aggregated annual demand matches Demand Regio demand at NUTS-3 level, but it is not matching the demand regio time series profile +* Due to secrecy, some census data are highly modified under certain attributes + (quantity_q = 2). This cell data is not corrected, but excluded. 
Notes @@ -806,7 +808,7 @@ def houseprofiles_in_census_cells(): sql=""" SELECT characteristics_text, SUM(quantity) as summe FROM society.egon_destatis_zensus_household_per_ha as egon_d - WHERE attribute = 'HHGROESS_KLASS' + WHERE attribute = 'HHGROESS_KLASS' AND quantity_q < 2 GROUP BY characteristics_text """, index_col="characteristics_text", ) @@ -842,11 +844,12 @@ def houseprofiles_in_census_cells(): ) # Retrieve information about households for each census cell + # Only use cell-data which quality (quantity_q<2) is acceptable df_households_typ = db.select_dataframe( sql=""" SELECT grid_id, attribute, characteristics_code, characteristics_text, quantity FROM society.egon_destatis_zensus_household_per_ha - WHERE attribute = 'HHTYP_FAM' """ + WHERE attribute = 'HHTYP_FAM' AND quantity_q <2""" ) df_households_typ = df_households_typ.drop( columns=["attribute", "characteristics_text"] From 3c34d9745ec4e3a7bd520c5c66f118f52ba02554 Mon Sep 17 00:00:00 2001 From: "Julian.Endres" Date: Thu, 29 Jul 2021 23:34:30 +0200 Subject: [PATCH 61/97] Extend Module-docstring for census data correction --- src/egon/data/datasets/hh_demand_profiles.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/src/egon/data/datasets/hh_demand_profiles.py b/src/egon/data/datasets/hh_demand_profiles.py index 602e66b74..f6fe01e8b 100644 --- a/src/egon/data/datasets/hh_demand_profiles.py +++ b/src/egon/data/datasets/hh_demand_profiles.py @@ -25,9 +25,9 @@ (see :func:`get_household_demand_profiles_raw`) * Spatial information about people living in households by Zensus 2011 at federal state level - * type of household (family status) - * age - * size + * Type of household (family status) + * Age + * Size * Spatial information about number of households per ha, categorized by type of household (family status) with 5 categories (also from Zensus 2011) * Demand-Regio annual household demand at NUTS3 level @@ -64,23 +64,26 @@ **What are central assumptions during the 
data processing?** -* mapping zensus data to IEE household categories is not trivial. In +* Mapping zensus data to IEE household categories is not trivial. In conversion from persons in household to number of households, number of inhabitants for multi-person households is estimated as weighted average in :var:`OO_factor` -* the distribution to refine household types at cell level are the same for each federal state -* refining of household types lead to float number of profiles drew at cell level and need to be rounded to nearest int. +* The distribution to refine household types at cell level are the same for each federal state +* Refining of household types lead to float number of profiles drew at cell level and need to be rounded to nearest int. * 100 x 100 m cells are matched to NUTS via centroid location -* cells with households in unpopulated areas are removed +* Cells with households in unpopulated areas are removed **Drawbacks and limitations of the data** -* the distribution to refine household types at cell level are the same for +* The distribution to refine household types at cell level are the same for each federal state * Household profiles aggregated annual demand matches Demand Regio demand at NUTS-3 level, but it is not matching the demand regio time series profile * Due to secrecy, some census data are highly modified under certain attributes (quantity_q = 2). This cell data is not corrected, but excluded. +* Census data with attribute 'HHTYP_FAM' is missing for some cells with small + amount of households. This data is generated using the average share of household types + for cells with similar household number Notes From f6ab4a285b00a38b2c3223b3e75c61a345d328b8 Mon Sep 17 00:00:00 2001 From: "Julian.Endres" Date: Thu, 29 Jul 2021 23:34:52 +0200 Subject: [PATCH 62/97] Add create_missing_zensus_data function Zensus data is missing data for the attribute HHTYP_FAM. 
This data is generated for the missing cells using the average split for cells with same amount of households. --- src/egon/data/datasets/hh_demand_profiles.py | 114 ++++++++++++++++++- 1 file changed, 112 insertions(+), 2 deletions(-) diff --git a/src/egon/data/datasets/hh_demand_profiles.py b/src/egon/data/datasets/hh_demand_profiles.py index f6fe01e8b..c3468871f 100644 --- a/src/egon/data/datasets/hh_demand_profiles.py +++ b/src/egon/data/datasets/hh_demand_profiles.py @@ -94,12 +94,11 @@ docs attribute of the respective dataset class. """ from functools import partial -from itertools import cycle +from itertools import cycle, product from pathlib import Path from urllib.request import urlretrieve import random -from airflow.operators.python_operator import PythonOperator from sqlalchemy import ARRAY, Column, Float, Integer, String from sqlalchemy.ext.declarative import declarative_base import numpy as np @@ -361,6 +360,73 @@ def download_process_zensus_households(): return households_nuts1 +def create_missing_zensus_data(df_households_typ, df_missing_data, missing_cells): + """ + There is missing data for specific attributes in the zensus dataset because of secrecy reasons. + Some cells with only small amount of households are missing with the attribute HHTYP_FAM. + However the total amount of households is known with attribute INSGESAMT. + The missing data is generated as average share of the household types for cell groups + with the same amount of households. 
+ + Parameters + ---------- + df_households_typ: pd.DataFrame + Zensus households data + df_missing_data: pd.DataFrame + number of missing cells of group of amount of households + missing_cells: dict + dictionary with lists of grids of the missing cells grouped by amount of + households in cell + + Returns + ---------- + df_average_split: pd.DataFrame + generated dataset of missing cells + + """ + # grid_ids of missing cells grouped by amount of households + missing_grid_ids = {group: list(df.grid_id) for group, df in missing_cells.groupby('quantity')} + + # Grid ids for cells with low household numbers + df_households_typ = df_households_typ.set_index('grid_id', drop=True) + hh_in_cells = df_households_typ.groupby('grid_id')['quantity'].sum() + hh_index = {i: hh_in_cells.loc[hh_in_cells == i].index for i in df_missing_data.households.values} + + df_average_split = pd.DataFrame() + for hh_size, index in hh_index.items(): + # average split of household types in cells with low household numbers + split = df_households_typ.loc[index].groupby('characteristics_code').sum() / df_households_typ.loc[ + index].quantity.sum() + split = split.quantity * hh_size + + # correct rounding + difference = int(split.sum() - split.round().sum()) + if difference > 0: + # add to any row + split = split.round() + random_row = split.sample() + split[random_row.index] = random_row + difference + elif difference < 0: + # subtract only from rows > 0 + split = split.round() + random_row = split[split > 0].sample() + split[random_row.index] = random_row + difference + else: + split = split.round() + + # Dataframe with average split for each cell + temp = pd.DataFrame(product(zip(split, range(1, 6)), missing_grid_ids[hh_size]), columns=['tuple', 'grid_id']) + temp = pd.DataFrame(temp.tuple.tolist()).join(temp.grid_id) + temp = temp.rename(columns={0: 'hh_5types', 1: 'characteristics_code'}) + temp = temp.dropna() + temp = temp[(temp['hh_5types'] != 0)] + # append for each cell group of 
households + df_average_split = pd.concat([df_average_split, temp], ignore_index=True) + df_average_split['hh_5types'] = df_average_split['hh_5types'].astype(int) + + return df_average_split + + def get_hh_dist(df_zensus, hh_types, multi_adjust=True, relative=True): """ Group zensus data to fit Demand-Profile-Generator (DPG) format. @@ -857,10 +923,54 @@ def houseprofiles_in_census_cells(): df_households_typ = df_households_typ.drop( columns=["attribute", "characteristics_text"] ) + df_missing_data = db.select_dataframe( + sql=""" + SELECT count(joined.quantity_gesamt) as amount, joined.quantity_gesamt as households + FROM( + SELECT t2.grid_id, quantity_gesamt, quantity_sum_fam, + (quantity_gesamt-(case when quantity_sum_fam isnull then 0 else quantity_sum_fam end)) + as insgesamt_minus_fam + FROM ( + SELECT grid_id, SUM(quantity) as quantity_sum_fam + FROM society.egon_destatis_zensus_household_per_ha + WHERE attribute = 'HHTYP_FAM' + GROUP BY grid_id) as t1 + Full JOIN ( + SELECT grid_id, sum(quantity) as quantity_gesamt + FROM society.egon_destatis_zensus_household_per_ha + WHERE attribute = 'INSGESAMT' + GROUP BY grid_id) as t2 ON t1.grid_id = t2.grid_id + ) as joined + WHERE quantity_sum_fam isnull + Group by quantity_gesamt """ + ) + missing_cells = db.select_dataframe( + sql=""" + SELECT t12.grid_id, t12.quantity + FROM ( + SELECT t2.grid_id, (case when quantity_sum_fam isnull then quantity_gesamt end) as quantity + FROM ( + SELECT grid_id, SUM(quantity) as quantity_sum_fam + FROM society.egon_destatis_zensus_household_per_ha + WHERE attribute = 'HHTYP_FAM' + GROUP BY grid_id) as t1 + Full JOIN ( + SELECT grid_id, sum(quantity) as quantity_gesamt + FROM society.egon_destatis_zensus_household_per_ha + WHERE attribute = 'INSGESAMT' + GROUP BY grid_id) as t2 ON t1.grid_id = t2.grid_id + ) as t12 + WHERE quantity is not null""" + ) + + df_average_split = create_missing_zensus_data(df_households_typ, df_missing_data, missing_cells) + df_households_typ = 
df_households_typ.rename( columns={"quantity": "hh_5types"} ) + df_households_typ = pd.concat([df_households_typ, df_average_split], ignore_index=True) + # Census cells with nuts3 and nuts1 information df_grid_id = db.select_dataframe( sql=""" From 2ea5eb9d68a1c22536489fccbab8faa9ae24d4be Mon Sep 17 00:00:00 2001 From: "Julian.Endres" Date: Mon, 2 Aug 2021 13:45:09 +0200 Subject: [PATCH 63/97] Create directory in working directory for downloaded files --- src/egon/data/datasets/hh_demand_profiles.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/src/egon/data/datasets/hh_demand_profiles.py b/src/egon/data/datasets/hh_demand_profiles.py index c3468871f..544037bb0 100644 --- a/src/egon/data/datasets/hh_demand_profiles.py +++ b/src/egon/data/datasets/hh_demand_profiles.py @@ -98,6 +98,7 @@ from pathlib import Path from urllib.request import urlretrieve import random +import os from sqlalchemy import ARRAY, Column, Float, Integer, String from sqlalchemy.ext.declarative import declarative_base @@ -274,11 +275,18 @@ def get_household_demand_profiles_raw(): hh_profiles_url = data_config["sources"][ "household_electricity_demand_profiles" ]["url"] - hh_profiles_file = Path(".") / Path(hh_profiles_url).name + + download_directory = "hh_demand_profiles" + # Create the folder, if it does not exists already + if not os.path.exists(download_directory): + os.mkdir(download_directory) + + hh_profiles_file = Path(".") / download_directory / Path(hh_profiles_url).name if not hh_profiles_file.is_file(): urlretrieve(hh_profiles_url, hh_profiles_file) + # hh_profiles_file = Path(".") / hh_profiles = pd.read_hdf(hh_profiles_file) # set multiindex to HH_types @@ -331,7 +339,13 @@ def download_process_zensus_households(): data_config = egon.data.config.datasets()["household_electricity_demand"] households_url = data_config["sources"]["zensus_household_types"]["url"] - households_file = Path(".") / Path(households_url).name + + download_directory = 
"hh_demand_profiles" + # Create the folder, if it does not exists already + if not os.path.exists(download_directory): + os.mkdir(download_directory) + + households_file = Path(".") / download_directory / Path(households_url).name # Download prepared data file from nextcloud if not households_file.is_file(): From 1b22726f847419c503ea9ae36a23cbe667b88c46 Mon Sep 17 00:00:00 2001 From: "Julian.Endres" Date: Mon, 2 Aug 2021 13:53:03 +0200 Subject: [PATCH 64/97] Add egon_ prefix to db-table-name and change to singular form #349 --- src/egon/data/datasets/hh_demand_profiles.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/egon/data/datasets/hh_demand_profiles.py b/src/egon/data/datasets/hh_demand_profiles.py index 544037bb0..24dd96a8b 100644 --- a/src/egon/data/datasets/hh_demand_profiles.py +++ b/src/egon/data/datasets/hh_demand_profiles.py @@ -194,7 +194,7 @@ class HouseholdElectricityProfilesInCensusCells(Base): - __tablename__ = "household_electricity_profiles_in_census_cells" + __tablename__ = "egon_household_electricity_profile_in_census_cell" __table_args__ = {"schema": "demand"} cell_id = Column(Integer, primary_key=True) From 6e0f7480c98478248d711752c4f9b70fec0c3ad0 Mon Sep 17 00:00:00 2001 From: "Julian.Endres" Date: Thu, 12 Aug 2021 10:32:06 +0200 Subject: [PATCH 65/97] Change etrago table from kWh to MWh --- src/egon/data/datasets/hh_demand_profiles.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/egon/data/datasets/hh_demand_profiles.py b/src/egon/data/datasets/hh_demand_profiles.py index 24dd96a8b..7cde9094f 100644 --- a/src/egon/data/datasets/hh_demand_profiles.py +++ b/src/egon/data/datasets/hh_demand_profiles.py @@ -14,7 +14,7 @@ demand profiles on MV grid level or for determining the peak load at load area level. The table is created by :func:`houseprofiles_in_census_cells`. 
* `demand.household_electricity_profiles_hvmv_substation`: - Household electricity demand profiles aggregated at MV grid district level. + Household electricity demand profiles aggregated at MV grid district level in MWh. Primarily used to create the eTraGo data model. The table is created with :func:`mv_grid_district_HH_electricity_load`. @@ -1162,7 +1162,7 @@ def mv_grid_district_HH_electricity_load( year=scenario_year, peak_load_only=False, ) - mvgd_profiles_dict[grid_district] = [mvgd_profile.to_list()] + mvgd_profiles_dict[grid_district] = [(mvgd_profile / 1e3).round(3).to_list()] # to MWh mvgd_profiles = pd.DataFrame.from_dict(mvgd_profiles_dict, orient="index") # Reshape data: put MV grid ids in columns to a single index column From facddae5ae6d6fa96f498f77c997e0247ef7151d Mon Sep 17 00:00:00 2001 From: "Julian.Endres" Date: Fri, 13 Aug 2021 00:49:32 +0200 Subject: [PATCH 66/97] Update download url for household_electricity_demand_profiles Use new donwload url to nextcloud for zensus data and small package of iee_profiles --- src/egon/data/datasets.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/egon/data/datasets.yml b/src/egon/data/datasets.yml index 69d908e01..60246d4c9 100644 --- a/src/egon/data/datasets.yml +++ b/src/egon/data/datasets.yml @@ -438,9 +438,10 @@ electrical_load_curves_cts: household_electricity_demand: sources: household_electricity_demand_profiles: - url: "https://next.rl-institut.de/s/M8o3ALXPappRM3Y/download/h0_profiles.h5" + url: "https://next.rl-institut.de/s/BTx6cAKdDNYM9yL/download/hh_el_load_profiles_2400.hdf" zensus_household_types: - url: "https://next.rl-institut.de/s/oQXRkYgWLXK3zND/download/Zensus2011_Personen.csv" + url: "https://next.rl-institut.de/s/Eg3iGJPSiyczQeb/download/Zensus2011_Personen.csv" + map_mvgrid_vg250: sources: From cb466013277231b1a26c1841a749bf1399f94529 Mon Sep 17 00:00:00 2001 From: "Julian.Endres" Date: Fri, 13 Aug 2021 13:02:58 +0200 Subject: [PATCH 67/97] Change db 
import of egon_etrago_electricity_households to pandas.to_sql() with chunksize 1e4 --- src/egon/data/datasets/hh_demand_profiles.py | 22 +++++++++----------- 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/src/egon/data/datasets/hh_demand_profiles.py b/src/egon/data/datasets/hh_demand_profiles.py index 7cde9094f..0f998f01c 100644 --- a/src/egon/data/datasets/hh_demand_profiles.py +++ b/src/egon/data/datasets/hh_demand_profiles.py @@ -1125,13 +1125,6 @@ def mv_grid_district_HH_electricity_load( Demand is given in kWh. """ engine = db.engine() - if drop_table: - EgonEtragoElectricityHouseholds.__table__.drop( - bind=engine, checkfirst=True - ) - EgonEtragoElectricityHouseholds.__table__.create( - bind=engine, checkfirst=True - ) with db.session_scope() as session: cells_query = session.query( @@ -1173,12 +1166,17 @@ def mv_grid_district_HH_electricity_load( mvgd_profiles["version"] = version mvgd_profiles["scn_name"] = scenario_name - # Insert data into respective database table - with db.session_scope() as session: - session.bulk_insert_mappings( - EgonEtragoElectricityHouseholds, - mvgd_profiles.to_dict(orient="records"), + if drop_table: + EgonEtragoElectricityHouseholds.__table__.drop( + bind=engine, checkfirst=True ) + EgonEtragoElectricityHouseholds.__table__.create( + bind=engine, checkfirst=True + ) + # Insert data into respective database table + mvgd_profiles.to_sql(name=EgonEtragoElectricityHouseholds.__table__.name, + schema=EgonEtragoElectricityHouseholds.__table__.schema, + con=engine, if_exists='append', method='multi', chunksize=10000, index=False) return mvgd_profiles From ea0289f6467dd4d2a7d5dd52b74fb6b989be2f02 Mon Sep 17 00:00:00 2001 From: "Julian.Endres" Date: Sun, 15 Aug 2021 19:58:42 +0200 Subject: [PATCH 68/97] Add function to write iee-hh-demand-profiles to db --- src/egon/data/datasets/hh_demand_profiles.py | 35 ++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/src/egon/data/datasets/hh_demand_profiles.py 
b/src/egon/data/datasets/hh_demand_profiles.py index 0f998f01c..ea072ecf1 100644 --- a/src/egon/data/datasets/hh_demand_profiles.py +++ b/src/egon/data/datasets/hh_demand_profiles.py @@ -101,6 +101,7 @@ import os from sqlalchemy import ARRAY, Column, Float, Integer, String +from sqlalchemy.dialects.postgresql import INTEGER, CHAR, REAL from sqlalchemy.ext.declarative import declarative_base import numpy as np import pandas as pd @@ -193,6 +194,15 @@ } +class IeeHouseholdLoadProfiles(Base): + __tablename__ = "iee_household_load_profiles" + __table_args__ = {"schema": "demand"} + + id = Column(INTEGER, primary_key=True) + type = Column(CHAR(7)) + load = Column(ARRAY(REAL))#, dimensions=2)) + + class HouseholdElectricityProfilesInCensusCells(Base): __tablename__ = "egon_household_electricity_profile_in_census_cell" __table_args__ = {"schema": "demand"} @@ -243,6 +253,31 @@ def clean(x): return x +def write_hh_profiles_to_db(hh_profiles): + + engine = db.engine() + + hh_profiles = hh_profiles.rename_axis('type', axis=1) + hh_profiles = hh_profiles.rename_axis('timestep', axis=0) + hh_profiles = hh_profiles.stack().rename('load') + hh_profiles = hh_profiles.to_frame().reset_index() + hh_profiles = hh_profiles.groupby('type').load.apply(tuple) + # hh_profiles = hh_profiles.groupby('type').load.apply(list) + hh_profiles = hh_profiles.reset_index() + + IeeHouseholdLoadProfiles.__table__.drop(bind=engine, checkfirst=True) + IeeHouseholdLoadProfiles.__table__.create(bind=engine) + + hh_profiles.to_sql(name=IeeHouseholdLoadProfiles.__table__.name, + schema=IeeHouseholdLoadProfiles.__table__.schema, + con=engine, if_exists='append', + method='multi', chunksize=100, index=False, + dtype={'load': IeeHouseholdLoadProfiles.load.type, + 'type': IeeHouseholdLoadProfiles.type.type, + 'id': IeeHouseholdLoadProfiles.id.type} + ) + + def get_household_demand_profiles_raw(): """ Downloads and returns household electricity demand profiles From 8ef8a2b9905393300eea2c62b4d201f008432f97 Mon 
Sep 17 00:00:00 2001 From: "Julian.Endres" Date: Sun, 15 Aug 2021 20:05:28 +0200 Subject: [PATCH 69/97] Separate hh-demand-profile processing to implement write_hh_profiles_to_db() --- src/egon/data/datasets/hh_demand_profiles.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/src/egon/data/datasets/hh_demand_profiles.py b/src/egon/data/datasets/hh_demand_profiles.py index ea072ecf1..201605049 100644 --- a/src/egon/data/datasets/hh_demand_profiles.py +++ b/src/egon/data/datasets/hh_demand_profiles.py @@ -321,9 +321,13 @@ def get_household_demand_profiles_raw(): if not hh_profiles_file.is_file(): urlretrieve(hh_profiles_url, hh_profiles_file) - # hh_profiles_file = Path(".") / hh_profiles = pd.read_hdf(hh_profiles_file) + return hh_profiles + + +def process_household_demand_profiles(hh_profiles): + # set multiindex to HH_types hh_profiles.columns = pd.MultiIndex.from_arrays( [hh_profiles.columns.str[:2], hh_profiles.columns.str[3:]] @@ -913,6 +917,13 @@ def houseprofiles_in_census_cells(): """ # Get demand profiles and zensus household type x age category data df_profiles = get_household_demand_profiles_raw() + + # Write raw profiles into db + write_hh_profiles_to_db(df_profiles) + + # process profiles for further use + df_profiles = process_household_demand_profiles(df_profiles) + df_zensus = download_process_zensus_households() # hh_tools.get_hh_dist without eurostat adjustment for O1-03 Groups in absolute values @@ -1087,7 +1098,7 @@ def houseprofiles_in_census_cells(): ) df_cell_demand_metadata = df_cell_demand_metadata.reset_index(drop=False) - # Insert data into respective database table + # Insert Zensus-cell-profile metadata-table into respective database table engine = db.engine() HouseholdElectricityProfilesInCensusCells.__table__.drop( bind=engine, checkfirst=True From 85ac5b913c29a1b45666ab80455e9d8987e82639 Mon Sep 17 00:00:00 2001 From: "Julian.Endres" Date: Sun, 15 Aug 2021 20:53:49 +0200 Subject: [PATCH 70/97] 
Rename download functions --- src/egon/data/datasets/hh_demand_profiles.py | 61 +++----------------- 1 file changed, 9 insertions(+), 52 deletions(-) diff --git a/src/egon/data/datasets/hh_demand_profiles.py b/src/egon/data/datasets/hh_demand_profiles.py index 201605049..b0cc70cfc 100644 --- a/src/egon/data/datasets/hh_demand_profiles.py +++ b/src/egon/data/datasets/hh_demand_profiles.py @@ -278,7 +278,7 @@ def write_hh_profiles_to_db(hh_profiles): ) -def get_household_demand_profiles_raw(): +def download_process_household_demand_profiles_raw(): """ Downloads and returns household electricity demand profiles @@ -341,7 +341,7 @@ def process_household_demand_profiles(hh_profiles): return hh_profiles -def download_process_zensus_households(): +def download_process_zensus_households_raw(): """ Downloads and pre-processes zensus age x household type data @@ -915,13 +915,13 @@ def houseprofiles_in_census_cells(): the database as pandas """ - # Get demand profiles and zensus household type x age category data - df_profiles = get_household_demand_profiles_raw() + # Download demand profiles + df_profiles = download_process_household_demand_profiles_raw() # Write raw profiles into db write_hh_profiles_to_db(df_profiles) - # process profiles for further use + # Process profiles for further use df_profiles = process_household_demand_profiles(df_profiles) df_zensus = download_process_zensus_households() @@ -1030,54 +1030,11 @@ def houseprofiles_in_census_cells(): ) df_households_typ = pd.concat([df_households_typ, df_average_split], ignore_index=True) + # Download zensus household type x age category data + df_zensus = download_process_zensus_households_raw() - # Census cells with nuts3 and nuts1 information - df_grid_id = db.select_dataframe( - sql=""" - SELECT pop.grid_id, pop.gid as cell_id, vg250.vg250_nuts3 as nuts3, lan.nuts as nuts1, lan.gen - FROM society.destatis_zensus_population_per_ha_inside_germany as pop - LEFT JOIN boundaries.egon_map_zensus_vg250 as vg250 - ON 
(pop.gid=vg250.zensus_population_id) - LEFT JOIN boundaries.vg250_lan as lan - ON (LEFT(vg250.vg250_nuts3, 3)=lan.nuts) - WHERE lan.gf = 4 """ - ) - df_grid_id = df_grid_id.drop_duplicates() - df_grid_id = df_grid_id.reset_index(drop=True) - - # Merge household type and size data with considered (populated) census cells - # how='inner' is used as ids of unpopulated areas are removed df_grid_id or earliers tables. see here: - # https://github.com/openego/eGon-data/blob/59195926e41c8bd6d1ca8426957b97f33ef27bcc/src/egon/data/importing/zensus/__init__.py#L418-L449 - df_households_typ = pd.merge( - df_households_typ, - df_grid_id, - left_on="grid_id", - right_on="grid_id", - how="inner", - ) - - # Merge Zensus nuts1 level household data with zensus cell level 100 x 100 m - # by refining hh-groups with MAPPING_ZENSUS_HH_SUBGROUPS - df_zensus_cells = pd.DataFrame() - for (country, code), df_country_type in df_households_typ.groupby( - ["gen", "characteristics_code"] - ): - - # iterate over zenus_country subgroups - for typ in MAPPING_ZENSUS_HH_SUBGROUPS[code]: - df_country_type["hh_type"] = typ - df_country_type["factor"] = df_dist_households.loc[typ, country] - df_country_type["hh_10types"] = ( - df_country_type["hh_5types"] - * df_dist_households.loc[typ, country] - ) - df_zensus_cells = df_zensus_cells.append( - df_country_type, ignore_index=True - ) - - df_zensus_cells = df_zensus_cells.sort_values( - by=["grid_id", "characteristics_code"] - ).reset_index(drop=True) + # Process zensus data for further use + df_zensus_cells = process_zensus_data(df_zensus) # Annual household electricity demand on NUTS-3 level (demand regio) df_demand_regio = db.select_dataframe( From 8d62604bbf881f2aa84d4e8cd1f7a0f8d7711119 Mon Sep 17 00:00:00 2001 From: "Julian.Endres" Date: Sun, 15 Aug 2021 20:54:29 +0200 Subject: [PATCH 71/97] Separate zensus data processing into function --- src/egon/data/datasets/hh_demand_profiles.py | 263 +++++++++++-------- 1 file changed, 157 insertions(+), 
106 deletions(-) diff --git a/src/egon/data/datasets/hh_demand_profiles.py b/src/egon/data/datasets/hh_demand_profiles.py index b0cc70cfc..d4b2757b7 100644 --- a/src/egon/data/datasets/hh_demand_profiles.py +++ b/src/egon/data/datasets/hh_demand_profiles.py @@ -668,6 +668,163 @@ def process_nuts1_zensus_data(df_zensus): return df_zensus +def process_zensus_data(df_zensus): + + # hh_tools.get_hh_dist without eurostat adjustment for O1-03 Groups in absolute values + df_hh_types_nad_abs = get_hh_dist( + df_zensus, HH_TYPES, multi_adjust=False, relative=False + ) + + # Get household size for each census cell grouped by + # As this is only used to estimate size of households for OR, OO, 1 P and 2 P households are dropped + df_hh_size = db.select_dataframe( + sql=""" + SELECT characteristics_text, SUM(quantity) as summe + FROM society.egon_destatis_zensus_household_per_ha as egon_d + WHERE attribute = 'HHGROESS_KLASS' AND quantity_q < 2 + GROUP BY characteristics_text """, + index_col="characteristics_text", + ) + df_hh_size = df_hh_size.drop(index=["1 Person", "2 Personen"]) + + # Define/ estimate number of persons (w/o kids) for each household category + # For categories S* and P* it's clear; for multi-person households (OO,OR) + # the number is estimated as average by taking remaining persons + OO_factor = ( + sum(df_hh_size["summe"] * [3, 4, 5, 6]) / df_hh_size["summe"].sum() + ) + mapping_people_in_households = { + "SR": 1, + "SO": 1, + "SK": 1, # kids are excluded + "PR": 2, + "PO": 2, + "P1": 2, # kids are excluded + "P2": 2, # "" + "P3": 2, # "" + "OR": OO_factor, + "OO": OO_factor, + } + # Determine number of persons for each household category and per federal state + df_dist_households = inhabitants_to_households( + df_hh_types_nad_abs, mapping_people_in_households + ) + + # Calculate fraction of fine household types within subgroup of rough household types + for value in MAPPING_ZENSUS_HH_SUBGROUPS.values(): + df_dist_households.loc[value] = 
df_dist_households.loc[value].div( + df_dist_households.loc[value].sum() + ) + + # Retrieve information about households for each census cell + # Only use cell-data which quality (quantity_q<2) is acceptable + df_households_typ = db.select_dataframe( + sql=""" + SELECT grid_id, attribute, characteristics_code, characteristics_text, quantity + FROM society.egon_destatis_zensus_household_per_ha + WHERE attribute = 'HHTYP_FAM' AND quantity_q <2""" + ) + df_households_typ = df_households_typ.drop( + columns=["attribute", "characteristics_text"] + ) + df_missing_data = db.select_dataframe( + sql=""" + SELECT count(joined.quantity_gesamt) as amount, joined.quantity_gesamt as households + FROM( + SELECT t2.grid_id, quantity_gesamt, quantity_sum_fam, + (quantity_gesamt-(case when quantity_sum_fam isnull then 0 else quantity_sum_fam end)) + as insgesamt_minus_fam + FROM ( + SELECT grid_id, SUM(quantity) as quantity_sum_fam + FROM society.egon_destatis_zensus_household_per_ha + WHERE attribute = 'HHTYP_FAM' + GROUP BY grid_id) as t1 + Full JOIN ( + SELECT grid_id, sum(quantity) as quantity_gesamt + FROM society.egon_destatis_zensus_household_per_ha + WHERE attribute = 'INSGESAMT' + GROUP BY grid_id) as t2 ON t1.grid_id = t2.grid_id + ) as joined + WHERE quantity_sum_fam isnull + Group by quantity_gesamt """ + ) + missing_cells = db.select_dataframe( + sql=""" + SELECT t12.grid_id, t12.quantity + FROM ( + SELECT t2.grid_id, (case when quantity_sum_fam isnull then quantity_gesamt end) as quantity + FROM ( + SELECT grid_id, SUM(quantity) as quantity_sum_fam + FROM society.egon_destatis_zensus_household_per_ha + WHERE attribute = 'HHTYP_FAM' + GROUP BY grid_id) as t1 + Full JOIN ( + SELECT grid_id, sum(quantity) as quantity_gesamt + FROM society.egon_destatis_zensus_household_per_ha + WHERE attribute = 'INSGESAMT' + GROUP BY grid_id) as t2 ON t1.grid_id = t2.grid_id + ) as t12 + WHERE quantity is not null""" + ) + + df_average_split = 
create_missing_zensus_data(df_households_typ, df_missing_data, missing_cells) + + df_households_typ = df_households_typ.rename( + columns={"quantity": "hh_5types"} + ) + + df_households_typ = pd.concat([df_households_typ, df_average_split], ignore_index=True) + + # Census cells with nuts3 and nuts1 information + df_grid_id = db.select_dataframe( + sql=""" + SELECT pop.grid_id, pop.gid as cell_id, vg250.vg250_nuts3 as nuts3, lan.nuts as nuts1, lan.gen + FROM society.destatis_zensus_population_per_ha_inside_germany as pop + LEFT JOIN boundaries.egon_map_zensus_vg250 as vg250 + ON (pop.gid=vg250.zensus_population_id) + LEFT JOIN boundaries.vg250_lan as lan + ON (LEFT(vg250.vg250_nuts3, 3)=lan.nuts) + WHERE lan.gf = 4 """ + ) + df_grid_id = df_grid_id.drop_duplicates() + df_grid_id = df_grid_id.reset_index(drop=True) + + # Merge household type and size data with considered (populated) census cells + # how='inner' is used as ids of unpopulated areas are removed df_grid_id or earliers tables. 
see here: + # https://github.com/openego/eGon-data/blob/59195926e41c8bd6d1ca8426957b97f33ef27bcc/src/egon/data/importing/zensus/__init__.py#L418-L449 + df_households_typ = pd.merge( + df_households_typ, + df_grid_id, + left_on="grid_id", + right_on="grid_id", + how="inner", + ) + + # Merge Zensus nuts1 level household data with zensus cell level 100 x 100 m + # by refining hh-groups with MAPPING_ZENSUS_HH_SUBGROUPS + df_zensus_cells = pd.DataFrame() + for (country, code), df_country_type in df_households_typ.groupby( + ["gen", "characteristics_code"] + ): + + # iterate over zenus_country subgroups + for typ in MAPPING_ZENSUS_HH_SUBGROUPS[code]: + df_country_type["hh_type"] = typ + df_country_type["factor"] = df_dist_households.loc[typ, country] + df_country_type["hh_10types"] = ( + df_country_type["hh_5types"] + * df_dist_households.loc[typ, country] + ) + df_zensus_cells = df_zensus_cells.append( + df_country_type, ignore_index=True + ) + + df_zensus_cells = df_zensus_cells.sort_values( + by=["grid_id", "characteristics_code"] + ).reset_index(drop=True) + + return df_zensus_cells + def get_cell_demand_profile_ids(df_cell, pool_size): """ Generates tuple of hh_type and zensus cell ids @@ -924,112 +1081,6 @@ def houseprofiles_in_census_cells(): # Process profiles for further use df_profiles = process_household_demand_profiles(df_profiles) - df_zensus = download_process_zensus_households() - - # hh_tools.get_hh_dist without eurostat adjustment for O1-03 Groups in absolute values - df_hh_types_nad_abs = get_hh_dist( - df_zensus, HH_TYPES, multi_adjust=False, relative=False - ) - - # Get household size for each census cell grouped by - # As this is only used to estimate size of households for OR, OO, 1 P and 2 P households are dropped - df_hh_size = db.select_dataframe( - sql=""" - SELECT characteristics_text, SUM(quantity) as summe - FROM society.egon_destatis_zensus_household_per_ha as egon_d - WHERE attribute = 'HHGROESS_KLASS' AND quantity_q < 2 - GROUP BY 
characteristics_text """, - index_col="characteristics_text", - ) - df_hh_size = df_hh_size.drop(index=["1 Person", "2 Personen"]) - - # Define/ estimate number of persons (w/o kids) for each household category - # For categories S* and P* it's clear; for multi-person households (OO,OR) - # the number is estimated as average by taking remaining persons - OO_factor = ( - sum(df_hh_size["summe"] * [3, 4, 5, 6]) / df_hh_size["summe"].sum() - ) - mapping_people_in_households = { - "SR": 1, - "SO": 1, - "SK": 1, # kids are excluded - "PR": 2, - "PO": 2, - "P1": 2, # kids are excluded - "P2": 2, # "" - "P3": 2, # "" - "OR": OO_factor, - "OO": OO_factor, - } - # Determine number of persons for each household category and per federal state - df_dist_households = inhabitants_to_households( - df_hh_types_nad_abs, mapping_people_in_households - ) - - # Calculate fraction of fine household types within subgroup of rough household types - for value in MAPPING_ZENSUS_HH_SUBGROUPS.values(): - df_dist_households.loc[value] = df_dist_households.loc[value].div( - df_dist_households.loc[value].sum() - ) - - # Retrieve information about households for each census cell - # Only use cell-data which quality (quantity_q<2) is acceptable - df_households_typ = db.select_dataframe( - sql=""" - SELECT grid_id, attribute, characteristics_code, characteristics_text, quantity - FROM society.egon_destatis_zensus_household_per_ha - WHERE attribute = 'HHTYP_FAM' AND quantity_q <2""" - ) - df_households_typ = df_households_typ.drop( - columns=["attribute", "characteristics_text"] - ) - df_missing_data = db.select_dataframe( - sql=""" - SELECT count(joined.quantity_gesamt) as amount, joined.quantity_gesamt as households - FROM( - SELECT t2.grid_id, quantity_gesamt, quantity_sum_fam, - (quantity_gesamt-(case when quantity_sum_fam isnull then 0 else quantity_sum_fam end)) - as insgesamt_minus_fam - FROM ( - SELECT grid_id, SUM(quantity) as quantity_sum_fam - FROM 
society.egon_destatis_zensus_household_per_ha - WHERE attribute = 'HHTYP_FAM' - GROUP BY grid_id) as t1 - Full JOIN ( - SELECT grid_id, sum(quantity) as quantity_gesamt - FROM society.egon_destatis_zensus_household_per_ha - WHERE attribute = 'INSGESAMT' - GROUP BY grid_id) as t2 ON t1.grid_id = t2.grid_id - ) as joined - WHERE quantity_sum_fam isnull - Group by quantity_gesamt """ - ) - missing_cells = db.select_dataframe( - sql=""" - SELECT t12.grid_id, t12.quantity - FROM ( - SELECT t2.grid_id, (case when quantity_sum_fam isnull then quantity_gesamt end) as quantity - FROM ( - SELECT grid_id, SUM(quantity) as quantity_sum_fam - FROM society.egon_destatis_zensus_household_per_ha - WHERE attribute = 'HHTYP_FAM' - GROUP BY grid_id) as t1 - Full JOIN ( - SELECT grid_id, sum(quantity) as quantity_gesamt - FROM society.egon_destatis_zensus_household_per_ha - WHERE attribute = 'INSGESAMT' - GROUP BY grid_id) as t2 ON t1.grid_id = t2.grid_id - ) as t12 - WHERE quantity is not null""" - ) - - df_average_split = create_missing_zensus_data(df_households_typ, df_missing_data, missing_cells) - - df_households_typ = df_households_typ.rename( - columns={"quantity": "hh_5types"} - ) - - df_households_typ = pd.concat([df_households_typ, df_average_split], ignore_index=True) # Download zensus household type x age category data df_zensus = download_process_zensus_households_raw() From f1e90547e9feca26af8833e7b446133a6e5e7a3d Mon Sep 17 00:00:00 2001 From: "Julian.Endres" Date: Sun, 15 Aug 2021 21:07:12 +0200 Subject: [PATCH 72/97] Rename download function and add processing to mv_grid_district_HH_electricity_load --- src/egon/data/datasets/hh_demand_profiles.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/egon/data/datasets/hh_demand_profiles.py b/src/egon/data/datasets/hh_demand_profiles.py index d4b2757b7..769c92304 100644 --- a/src/egon/data/datasets/hh_demand_profiles.py +++ b/src/egon/data/datasets/hh_demand_profiles.py @@ -1197,9 +1197,13 @@ 
def mv_grid_district_HH_electricity_load( lambda x: [(cat, int(profile_id)) for cat, profile_id in x] ) - # Create aggregated load profile for each MV grid district - df_profiles = get_household_demand_profiles_raw() + # Download demand profiles + df_profiles = download_process_household_demand_profiles_raw() + # Process profiles for further use + df_profiles = process_household_demand_profiles(df_profiles) + + # Create aggregated load profile for each MV grid district mvgd_profiles_dict = {} for grid_district, data in cells.groupby("subst_id"): mvgd_profile = get_load_timeseries( From df9fae687c1e92a798a2c37e7ab2fc428ba39424 Mon Sep 17 00:00:00 2001 From: "Julian.Endres" Date: Sun, 15 Aug 2021 21:32:54 +0200 Subject: [PATCH 73/97] Update module docstring --- src/egon/data/datasets/hh_demand_profiles.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/egon/data/datasets/hh_demand_profiles.py b/src/egon/data/datasets/hh_demand_profiles.py index 769c92304..ed5d3f9e3 100644 --- a/src/egon/data/datasets/hh_demand_profiles.py +++ b/src/egon/data/datasets/hh_demand_profiles.py @@ -22,7 +22,7 @@ * Electricity demand time series for household categories produced by demand profile generator (DPG) from Fraunhofer IEE - (see :func:`get_household_demand_profiles_raw`) + (see :func:`download_process_household_demand_profiles_raw`) * Spatial information about people living in households by Zensus 2011 at federal state level * Type of household (family status) From be4a91af946ef2dcea26b73234389f1afe17ae56 Mon Sep 17 00:00:00 2001 From: "Julian.Endres" Date: Sun, 15 Aug 2021 23:22:50 +0200 Subject: [PATCH 74/97] Add docstrings --- src/egon/data/datasets/hh_demand_profiles.py | 42 +++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/src/egon/data/datasets/hh_demand_profiles.py b/src/egon/data/datasets/hh_demand_profiles.py index ed5d3f9e3..b0551009e 100644 --- a/src/egon/data/datasets/hh_demand_profiles.py +++ 
b/src/egon/data/datasets/hh_demand_profiles.py @@ -254,6 +254,22 @@ def clean(x): def write_hh_profiles_to_db(hh_profiles): + """Write HH demand profiles of IEE into db. One row per profile type. + The annual load profile timeseries is an array. + + schema: demand + tablename: iee_household_load_profiles + + + + Parameters + ---------- + hh_profiles: pd.DataFrame + It is meant to be used with :code:`df.applymap()` + + Returns + ------- + """ engine = db.engine() @@ -262,7 +278,6 @@ def write_hh_profiles_to_db(hh_profiles): hh_profiles = hh_profiles.stack().rename('load') hh_profiles = hh_profiles.to_frame().reset_index() hh_profiles = hh_profiles.groupby('type').load.apply(tuple) - # hh_profiles = hh_profiles.groupby('type').load.apply(list) hh_profiles = hh_profiles.reset_index() IeeHouseholdLoadProfiles.__table__.drop(bind=engine, checkfirst=True) @@ -327,6 +342,19 @@ def download_process_household_demand_profiles_raw(): def process_household_demand_profiles(hh_profiles): + """Process household demand profiles in a more easy to use format. + The profile type is splitted into type and number and set as multiindex. + + Parameters + ---------- + hh_profiles: pd.DataFrame + Profiles + Returns + ------- + hh_profiles: pd.DataFrame + Profiles with Multiindex + """ + # set multiindex to HH_types hh_profiles.columns = pd.MultiIndex.from_arrays( @@ -669,6 +697,17 @@ def process_nuts1_zensus_data(df_zensus): def process_zensus_data(df_zensus): + """The zensus data is processed to define the number and type of households per zensus cell. + Two subsets of the zensus data are merged to fit the IEE profiles specifications. + For this, the dataset 'HHGROESS_KLASS' is converted from people living in households to number of households + of specific size. Missing data in 'HHTYP_FAM' is substituted in :func:`create_missing_zensus_data`. 
+ + + Returns + ------- + pd.DataFrame + Number of hh types per census cell and scaling factors + """ # hh_tools.get_hh_dist without eurostat adjustment for O1-03 Groups in absolute values df_hh_types_nad_abs = get_hh_dist( @@ -825,6 +864,7 @@ def process_zensus_data(df_zensus): return df_zensus_cells + def get_cell_demand_profile_ids(df_cell, pool_size): """ Generates tuple of hh_type and zensus cell ids From 7c0a68854e13b78a31c29b8a9ebee376ff88ceb3 Mon Sep 17 00:00:00 2001 From: "Julian.Endres" Date: Sun, 15 Aug 2021 23:28:09 +0200 Subject: [PATCH 75/97] Black&Isort --- src/egon/data/datasets/hh_demand_profiles.py | 108 ++++++++++++------- 1 file changed, 72 insertions(+), 36 deletions(-) diff --git a/src/egon/data/datasets/hh_demand_profiles.py b/src/egon/data/datasets/hh_demand_profiles.py index b0551009e..b93454ad4 100644 --- a/src/egon/data/datasets/hh_demand_profiles.py +++ b/src/egon/data/datasets/hh_demand_profiles.py @@ -97,11 +97,11 @@ from itertools import cycle, product from pathlib import Path from urllib.request import urlretrieve -import random import os +import random from sqlalchemy import ARRAY, Column, Float, Integer, String -from sqlalchemy.dialects.postgresql import INTEGER, CHAR, REAL +from sqlalchemy.dialects.postgresql import CHAR, INTEGER, REAL from sqlalchemy.ext.declarative import declarative_base import numpy as np import pandas as pd @@ -200,7 +200,7 @@ class IeeHouseholdLoadProfiles(Base): id = Column(INTEGER, primary_key=True) type = Column(CHAR(7)) - load = Column(ARRAY(REAL))#, dimensions=2)) + load = Column(ARRAY(REAL)) # , dimensions=2)) class HouseholdElectricityProfilesInCensusCells(Base): @@ -273,24 +273,30 @@ def write_hh_profiles_to_db(hh_profiles): engine = db.engine() - hh_profiles = hh_profiles.rename_axis('type', axis=1) - hh_profiles = hh_profiles.rename_axis('timestep', axis=0) - hh_profiles = hh_profiles.stack().rename('load') + hh_profiles = hh_profiles.rename_axis("type", axis=1) + hh_profiles = 
hh_profiles.rename_axis("timestep", axis=0) + hh_profiles = hh_profiles.stack().rename("load") hh_profiles = hh_profiles.to_frame().reset_index() - hh_profiles = hh_profiles.groupby('type').load.apply(tuple) + hh_profiles = hh_profiles.groupby("type").load.apply(tuple) hh_profiles = hh_profiles.reset_index() IeeHouseholdLoadProfiles.__table__.drop(bind=engine, checkfirst=True) IeeHouseholdLoadProfiles.__table__.create(bind=engine) - hh_profiles.to_sql(name=IeeHouseholdLoadProfiles.__table__.name, - schema=IeeHouseholdLoadProfiles.__table__.schema, - con=engine, if_exists='append', - method='multi', chunksize=100, index=False, - dtype={'load': IeeHouseholdLoadProfiles.load.type, - 'type': IeeHouseholdLoadProfiles.type.type, - 'id': IeeHouseholdLoadProfiles.id.type} - ) + hh_profiles.to_sql( + name=IeeHouseholdLoadProfiles.__table__.name, + schema=IeeHouseholdLoadProfiles.__table__.schema, + con=engine, + if_exists="append", + method="multi", + chunksize=100, + index=False, + dtype={ + "load": IeeHouseholdLoadProfiles.load.type, + "type": IeeHouseholdLoadProfiles.type.type, + "id": IeeHouseholdLoadProfiles.id.type, + }, + ) def download_process_household_demand_profiles_raw(): @@ -331,7 +337,9 @@ def download_process_household_demand_profiles_raw(): if not os.path.exists(download_directory): os.mkdir(download_directory) - hh_profiles_file = Path(".") / download_directory / Path(hh_profiles_url).name + hh_profiles_file = ( + Path(".") / download_directory / Path(hh_profiles_url).name + ) if not hh_profiles_file.is_file(): urlretrieve(hh_profiles_url, hh_profiles_file) @@ -355,7 +363,6 @@ def process_household_demand_profiles(hh_profiles): Profiles with Multiindex """ - # set multiindex to HH_types hh_profiles.columns = pd.MultiIndex.from_arrays( [hh_profiles.columns.str[:2], hh_profiles.columns.str[3:]] @@ -412,7 +419,9 @@ def download_process_zensus_households_raw(): if not os.path.exists(download_directory): os.mkdir(download_directory) - households_file = Path(".") 
/ download_directory / Path(households_url).name + households_file = ( + Path(".") / download_directory / Path(households_url).name + ) # Download prepared data file from nextcloud if not households_file.is_file(): @@ -441,7 +450,9 @@ def download_process_zensus_households_raw(): return households_nuts1 -def create_missing_zensus_data(df_households_typ, df_missing_data, missing_cells): +def create_missing_zensus_data( + df_households_typ, df_missing_data, missing_cells +): """ There is missing data for specific attributes in the zensus dataset because of secrecy reasons. Some cells with only small amount of households are missing with the attribute HHTYP_FAM. @@ -466,18 +477,26 @@ def create_missing_zensus_data(df_households_typ, df_missing_data, missing_cells """ # grid_ids of missing cells grouped by amount of households - missing_grid_ids = {group: list(df.grid_id) for group, df in missing_cells.groupby('quantity')} + missing_grid_ids = { + group: list(df.grid_id) + for group, df in missing_cells.groupby("quantity") + } # Grid ids for cells with low household numbers - df_households_typ = df_households_typ.set_index('grid_id', drop=True) - hh_in_cells = df_households_typ.groupby('grid_id')['quantity'].sum() - hh_index = {i: hh_in_cells.loc[hh_in_cells == i].index for i in df_missing_data.households.values} + df_households_typ = df_households_typ.set_index("grid_id", drop=True) + hh_in_cells = df_households_typ.groupby("grid_id")["quantity"].sum() + hh_index = { + i: hh_in_cells.loc[hh_in_cells == i].index + for i in df_missing_data.households.values + } df_average_split = pd.DataFrame() for hh_size, index in hh_index.items(): # average split of household types in cells with low household numbers - split = df_households_typ.loc[index].groupby('characteristics_code').sum() / df_households_typ.loc[ - index].quantity.sum() + split = ( + df_households_typ.loc[index].groupby("characteristics_code").sum() + / df_households_typ.loc[index].quantity.sum() + ) split = 
split.quantity * hh_size # correct rounding @@ -496,14 +515,19 @@ def create_missing_zensus_data(df_households_typ, df_missing_data, missing_cells split = split.round() # Dataframe with average split for each cell - temp = pd.DataFrame(product(zip(split, range(1, 6)), missing_grid_ids[hh_size]), columns=['tuple', 'grid_id']) + temp = pd.DataFrame( + product(zip(split, range(1, 6)), missing_grid_ids[hh_size]), + columns=["tuple", "grid_id"], + ) temp = pd.DataFrame(temp.tuple.tolist()).join(temp.grid_id) - temp = temp.rename(columns={0: 'hh_5types', 1: 'characteristics_code'}) + temp = temp.rename(columns={0: "hh_5types", 1: "characteristics_code"}) temp = temp.dropna() - temp = temp[(temp['hh_5types'] != 0)] + temp = temp[(temp["hh_5types"] != 0)] # append for each cell group of households - df_average_split = pd.concat([df_average_split, temp], ignore_index=True) - df_average_split['hh_5types'] = df_average_split['hh_5types'].astype(int) + df_average_split = pd.concat( + [df_average_split, temp], ignore_index=True + ) + df_average_split["hh_5types"] = df_average_split["hh_5types"].astype(int) return df_average_split @@ -806,13 +830,17 @@ def process_zensus_data(df_zensus): WHERE quantity is not null""" ) - df_average_split = create_missing_zensus_data(df_households_typ, df_missing_data, missing_cells) + df_average_split = create_missing_zensus_data( + df_households_typ, df_missing_data, missing_cells + ) df_households_typ = df_households_typ.rename( columns={"quantity": "hh_5types"} ) - df_households_typ = pd.concat([df_households_typ, df_average_split], ignore_index=True) + df_households_typ = pd.concat( + [df_households_typ, df_average_split], ignore_index=True + ) # Census cells with nuts3 and nuts1 information df_grid_id = db.select_dataframe( @@ -1253,7 +1281,9 @@ def mv_grid_district_HH_electricity_load( year=scenario_year, peak_load_only=False, ) - mvgd_profiles_dict[grid_district] = [(mvgd_profile / 1e3).round(3).to_list()] # to MWh + 
mvgd_profiles_dict[grid_district] = [ + (mvgd_profile / 1e3).round(3).to_list() + ] # to MWh mvgd_profiles = pd.DataFrame.from_dict(mvgd_profiles_dict, orient="index") # Reshape data: put MV grid ids in columns to a single index column @@ -1272,9 +1302,15 @@ def mv_grid_district_HH_electricity_load( bind=engine, checkfirst=True ) # Insert data into respective database table - mvgd_profiles.to_sql(name=EgonEtragoElectricityHouseholds.__table__.name, - schema=EgonEtragoElectricityHouseholds.__table__.schema, - con=engine, if_exists='append', method='multi', chunksize=10000, index=False) + mvgd_profiles.to_sql( + name=EgonEtragoElectricityHouseholds.__table__.name, + schema=EgonEtragoElectricityHouseholds.__table__.schema, + con=engine, + if_exists="append", + method="multi", + chunksize=10000, + index=False, + ) return mvgd_profiles From b25016d08153bad91aae6aed8314fb3f777116db Mon Sep 17 00:00:00 2001 From: "Julian.Endres" Date: Sun, 15 Aug 2021 23:59:43 +0200 Subject: [PATCH 76/97] Update module import names and dependencies --- src/egon/data/airflow/dags/pipeline.py | 4 ++-- src/egon/data/datasets/hh_demand_profiles.py | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/egon/data/airflow/dags/pipeline.py b/src/egon/data/airflow/dags/pipeline.py index 369ab1130..a3f5ba660 100755 --- a/src/egon/data/airflow/dags/pipeline.py +++ b/src/egon/data/airflow/dags/pipeline.py @@ -277,7 +277,7 @@ gas_grid_insert_data >> create_gas_polygons vg250_clean_and_prepare >> create_gas_polygons - + # Gas prod import gas_production_insert_data = GasProduction( dependencies=[create_gas_polygons]) @@ -385,7 +385,7 @@ zensus_misc_import, map_zensus_grid_districts, zensus_inside_ger, - demandregio_demand_households, + demandregio, ], tasks=(houseprofiles_in_census_cells, mv_HH_electricity_load_2035, diff --git a/src/egon/data/datasets/hh_demand_profiles.py b/src/egon/data/datasets/hh_demand_profiles.py index b93454ad4..7bb8192cd 100644 --- 
a/src/egon/data/datasets/hh_demand_profiles.py +++ b/src/egon/data/datasets/hh_demand_profiles.py @@ -108,8 +108,7 @@ from egon.data import db from egon.data.datasets import Dataset -from egon.data.processing.zensus_grid_districts import MapZensusGridDistricts - +from egon.data.datasets.zensus_mv_grid_districts import MapZensusGridDistricts Base = declarative_base() import egon.data.config From 70986d46fc77b5bf359edd45b48b2cd48b847395 Mon Sep 17 00:00:00 2001 From: "Julian.Endres" Date: Mon, 16 Aug 2021 17:47:06 +0200 Subject: [PATCH 77/97] Create task of zensus_inside_ger for dependecies of hh_demand --- src/egon/data/airflow/dags/pipeline.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/egon/data/airflow/dags/pipeline.py b/src/egon/data/airflow/dags/pipeline.py index a3f5ba660..1f23f25c9 100755 --- a/src/egon/data/airflow/dags/pipeline.py +++ b/src/egon/data/airflow/dags/pipeline.py @@ -125,6 +125,7 @@ # Combine Zensus and VG250 data zensus_vg250 = ZensusVg250( dependencies=[vg250, population_import]) + zensus_inside_ger = tasks['zensus_vg250.inside-germany'] # DemandRegio data import demandregio = DemandRegio(dependencies=[ From 59e0675d499b15bdbf285cf3ffea00b96b39a56b Mon Sep 17 00:00:00 2001 From: "Julian.Endres" Date: Mon, 16 Aug 2021 18:02:50 +0200 Subject: [PATCH 78/97] Restructure code and split functions --- src/egon/data/datasets/hh_demand_profiles.py | 24 ++++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/egon/data/datasets/hh_demand_profiles.py b/src/egon/data/datasets/hh_demand_profiles.py index 7bb8192cd..d202d2250 100644 --- a/src/egon/data/datasets/hh_demand_profiles.py +++ b/src/egon/data/datasets/hh_demand_profiles.py @@ -439,14 +439,7 @@ def download_process_zensus_households_raw(): engine="python", ) - # Clean data - households = households_raw.applymap(clean).applymap(int) - - # Make data compatible with household demand profile categories - # Use less age interval and aggregate data to NUTS-1 level 
- households_nuts1 = process_nuts1_zensus_data(households) - - return households_nuts1 + return households_raw def create_missing_zensus_data( @@ -719,7 +712,7 @@ def process_nuts1_zensus_data(df_zensus): return df_zensus -def process_zensus_data(df_zensus): +def enrich_zensus_data_at_cell_level(df_zensus): """The zensus data is processed to define the number and type of households per zensus cell. Two subsets of the zensus data are merged to fit the IEE profiles specifications. For this, the dataset 'HHGROESS_KLASS' is converted from people living in households to number of households @@ -1149,10 +1142,17 @@ def houseprofiles_in_census_cells(): df_profiles = process_household_demand_profiles(df_profiles) # Download zensus household type x age category data - df_zensus = download_process_zensus_households_raw() + df_households_raw = download_process_zensus_households_raw() + + # Clean data + df_households = df_households_raw.applymap(clean).applymap(int) + + # Make data compatible with household demand profile categories + # Use less age interval and aggregate data to NUTS-1 level + df_zensus_nuts1 = process_nuts1_zensus_data(df_households) - # Process zensus data for further use - df_zensus_cells = process_zensus_data(df_zensus) + # Enrich census cell data with nuts1 level attributes + df_zensus_cells = enrich_zensus_data_at_cell_level(df_zensus_nuts1) # Annual household electricity demand on NUTS-3 level (demand regio) df_demand_regio = db.select_dataframe( From 92e31d7fcea26840b11d8847f12b9a6b9186b818 Mon Sep 17 00:00:00 2001 From: "Julian.Endres" Date: Mon, 16 Aug 2021 18:03:19 +0200 Subject: [PATCH 79/97] Move Dataset definition to beginning of code --- src/egon/data/datasets/hh_demand_profiles.py | 23 ++++++++++---------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/src/egon/data/datasets/hh_demand_profiles.py b/src/egon/data/datasets/hh_demand_profiles.py index d202d2250..b698bab3b 100644 --- a/src/egon/data/datasets/hh_demand_profiles.py 
+++ b/src/egon/data/datasets/hh_demand_profiles.py @@ -226,6 +226,18 @@ class EgonEtragoElectricityHouseholds(Base): q_set = Column(ARRAY(Float)) +hh_demand_setup = partial( + Dataset, + name="HH Demand", + version="0.0.0", + dependencies=[], + # Tasks are declared in pipeline as function is used multiple times with different args + # To differentiate these tasks PythonOperator with specific id-names are used + # PythonOperator needs to be declared in pipeline to be mapped to DAG + # tasks=[], +) + + def clean(x): """Clean zensus household data row-wise @@ -1313,14 +1325,3 @@ def mv_grid_district_HH_electricity_load( return mvgd_profiles - -hh_demand_setup = partial( - Dataset, - name="HH Demand", - version="0.0.0", - dependencies=[], - # Tasks are declared in pipeline as function is used multiple times with different args - # To differentiate these tasks PythonOperator with specific id-names are used - # PythonOperator needs to be declared in pipeline to be mapped to DAG - # tasks=[], -) From 4e256d5480a80824df830d3b7bcef353c92c89f2 Mon Sep 17 00:00:00 2001 From: "Julian.Endres" Date: Mon, 16 Aug 2021 22:57:30 +0200 Subject: [PATCH 80/97] Change file import to egon-data-bundle for household_electricity demand --- src/egon/data/datasets.yml | 6 +- src/egon/data/datasets/hh_demand_profiles.py | 110 +++++++++++-------- 2 files changed, 68 insertions(+), 48 deletions(-) diff --git a/src/egon/data/datasets.yml b/src/egon/data/datasets.yml index 77b8dfa68..0443f87e5 100644 --- a/src/egon/data/datasets.yml +++ b/src/egon/data/datasets.yml @@ -399,9 +399,13 @@ electrical_load_curves_cts: household_electricity_demand: sources: household_electricity_demand_profiles: - url: "https://next.rl-institut.de/s/BTx6cAKdDNYM9yL/download/hh_el_load_profiles_2400.hdf" + url_testmode: "https://next.rl-institut.de/s/BTx6cAKdDNYM9yL/download/hh_el_load_profiles_2400.hdf" + path_testmode: "household_electricity_demand_profiles_2400.hdf5" + path: 
"household_electricity_demand_profiles_100k.hdf5" + zensus_household_types: url: "https://next.rl-institut.de/s/Eg3iGJPSiyczQeb/download/Zensus2011_Personen.csv" + path: "Zensus2011_Personen.csv" map_mvgrid_vg250: diff --git a/src/egon/data/datasets/hh_demand_profiles.py b/src/egon/data/datasets/hh_demand_profiles.py index b698bab3b..679b2f1c1 100644 --- a/src/egon/data/datasets/hh_demand_profiles.py +++ b/src/egon/data/datasets/hh_demand_profiles.py @@ -310,54 +310,68 @@ def write_hh_profiles_to_db(hh_profiles): ) -def download_process_household_demand_profiles_raw(): +def get_household_demand_profiles_raw(): """ - Downloads and returns household electricity demand profiles - - Household electricity demand profiles generated by Fraunhofer IEE. - Methodology is described in - :ref:`Erzeugung zeitlich hochaufgelöster Stromlastprofile für verschiedene - Haushaltstypen - `_. - It is used and further described in the following theses by: - - * Jonas Haack: - "Auswirkungen verschiedener Haushaltslastprofile auf PV-Batterie-Systeme" (confidential) - * Simon Ruben Drauz - "Synthesis of a heat and electrical load profile for single and multi-family houses used for subsequent - performance tests of a multi-component energy system", - http://dx.doi.org/10.13140/RG.2.2.13959.14248 - - Download only happens, if file 'h0_profiles.h5' isn't already existing. - - Returns - ------- - pd.DataFrame - Table with profiles in columns and time as index. A pd.MultiIndex is - used to distinguish load profiles from different EUROSTAT household - types. - """ - data_config = egon.data.config.datasets()["household_electricity_demand"] + Gets and returns household electricity demand profiles from the egon-data-bundle. + + Household electricity demand profiles generated by Fraunhofer IEE. + Methodology is described in + :ref:`Erzeugung zeitlich hochaufgelöster Stromlastprofile für verschiedene + Haushaltstypen + `_. 
+ It is used and further described in the following theses by: + + * Jonas Haack: + "Auswirkungen verschiedener Haushaltslastprofile auf PV-Batterie-Systeme" (confidential) + * Simon Ruben Drauz + "Synthesis of a heat and electrical load profile for single and multi-family houses used for subsequent + performance tests of a multi-component energy system", + http://dx.doi.org/10.13140/RG.2.2.13959.14248 + + Download only happens, if file isn't already existing. + + Returns + ------- + pd.DataFrame + Table with profiles in columns and time as index. A pd.MultiIndex is + used to distinguish load profiles from different EUROSTAT household + types. + """ + data_config = egon.data.config.datasets() + pa_config = data_config["household_electricity_demand"] + + def ve(s): + raise (ValueError(s)) + + dataset = egon.data.config.settings()["egon-data"]["--dataset-boundary"] + + file_section = ( + "path" if dataset == "Everything" else "path_testmode" + if dataset == "Schleswig-Holstein" + else ve(f"'{dataset}' is not a valid dataset boundary.") + ) - hh_profiles_url = data_config["sources"][ - "household_electricity_demand_profiles" - ]["url"] + file_path = pa_config["sources"][ + "household_electricity_demand_profiles"][file_section] - download_directory = "hh_demand_profiles" + download_directory = os.path.join("data_bundle_egon_data", "hh_demand_profiles") # Create the folder, if it does not exists already if not os.path.exists(download_directory): os.mkdir(download_directory) hh_profiles_file = ( - Path(".") / download_directory / Path(hh_profiles_url).name + Path(".") / Path(download_directory) / Path(file_path).name ) + # Download file, if it does not exists already if not hh_profiles_file.is_file(): + hh_profiles_url = pa_config["sources"][ + "household_electricity_demand_profiles"]["url_testmode"] urlretrieve(hh_profiles_url, hh_profiles_file) - hh_profiles = pd.read_hdf(hh_profiles_file) + df_hh_profiles = pd.read_hdf(hh_profiles_file) - return hh_profiles + return 
df_hh_profiles def process_household_demand_profiles(hh_profiles): @@ -387,9 +401,9 @@ def process_household_demand_profiles(hh_profiles): return hh_profiles -def download_process_zensus_households_raw(): +def get_zensus_households_raw(): """ - Downloads and pre-processes zensus age x household type data + Get zensus age x household type data from egon-data-bundle Dataset about household size with information about the categories: @@ -421,21 +435,23 @@ def download_process_zensus_households_raw(): pd.DataFrame Pre-processed zensus household data """ - data_config = egon.data.config.datasets()["household_electricity_demand"] - - households_url = data_config["sources"]["zensus_household_types"]["url"] + data_config = egon.data.config.datasets() + pa_config = data_config["household_electricity_demand"] + file_path = pa_config["sources"]["zensus_household_types"]["path"] - download_directory = "hh_demand_profiles" + download_directory = os.path.join("data_bundle_egon_data", "zensus_households") # Create the folder, if it does not exists already if not os.path.exists(download_directory): os.mkdir(download_directory) households_file = ( - Path(".") / download_directory / Path(households_url).name + Path(".") / Path(download_directory) / Path(file_path).name ) - # Download prepared data file from nextcloud + # Download file, if it does not exists already if not households_file.is_file(): + households_url = pa_config["sources"][ + "zensus_households"]["url"] urlretrieve(households_url, households_file) # Read downloaded file from disk @@ -1144,8 +1160,8 @@ def houseprofiles_in_census_cells(): the database as pandas """ - # Download demand profiles - df_profiles = download_process_household_demand_profiles_raw() + # Read demand profiles from egon-data-bundle + df_profiles = get_household_demand_profiles_raw() # Write raw profiles into db write_hh_profiles_to_db(df_profiles) @@ -1154,7 +1170,7 @@ def houseprofiles_in_census_cells(): df_profiles = 
process_household_demand_profiles(df_profiles) # Download zensus household type x age category data - df_households_raw = download_process_zensus_households_raw() + df_households_raw = get_zensus_households_raw() # Clean data df_households = df_households_raw.applymap(clean).applymap(int) @@ -1276,8 +1292,8 @@ def mv_grid_district_HH_electricity_load( lambda x: [(cat, int(profile_id)) for cat, profile_id in x] ) - # Download demand profiles - df_profiles = download_process_household_demand_profiles_raw() + # Read demand profiles from egon-data-bundle + df_profiles = get_household_demand_profiles_raw() # Process profiles for further use df_profiles = process_household_demand_profiles(df_profiles) From 6e0d78ad3262c8ec59d723b004b5335798a9f270 Mon Sep 17 00:00:00 2001 From: "Julian.Endres" Date: Tue, 17 Aug 2021 15:09:38 +0200 Subject: [PATCH 81/97] Add unit descriptions to hh-profiles --- src/egon/data/datasets/hh_demand_profiles.py | 22 ++++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/egon/data/datasets/hh_demand_profiles.py b/src/egon/data/datasets/hh_demand_profiles.py index 679b2f1c1..1593ba6fb 100644 --- a/src/egon/data/datasets/hh_demand_profiles.py +++ b/src/egon/data/datasets/hh_demand_profiles.py @@ -199,7 +199,7 @@ class IeeHouseholdLoadProfiles(Base): id = Column(INTEGER, primary_key=True) type = Column(CHAR(7)) - load = Column(ARRAY(REAL)) # , dimensions=2)) + load_in_wh = Column(ARRAY(REAL)) # , dimensions=2)) class HouseholdElectricityProfilesInCensusCells(Base): @@ -286,9 +286,9 @@ def write_hh_profiles_to_db(hh_profiles): hh_profiles = hh_profiles.rename_axis("type", axis=1) hh_profiles = hh_profiles.rename_axis("timestep", axis=0) - hh_profiles = hh_profiles.stack().rename("load") + hh_profiles = hh_profiles.stack().rename("load_in_wh") hh_profiles = hh_profiles.to_frame().reset_index() - hh_profiles = hh_profiles.groupby("type").load.apply(tuple) + hh_profiles = 
hh_profiles.groupby("type").load_in_wh.apply(tuple) hh_profiles = hh_profiles.reset_index() IeeHouseholdLoadProfiles.__table__.drop(bind=engine, checkfirst=True) @@ -303,7 +303,7 @@ def write_hh_profiles_to_db(hh_profiles): chunksize=100, index=False, dtype={ - "load": IeeHouseholdLoadProfiles.load.type, + "load_in_wh": IeeHouseholdLoadProfiles.load_in_wh.type, "type": IeeHouseholdLoadProfiles.type.type, "id": IeeHouseholdLoadProfiles.id.type, }, @@ -1094,7 +1094,7 @@ def get_load_timeseries( df_profiles, df_cell_demand_metadata, cell_ids, year, peak_load_only=False ): """ - Get peak load for one load area + Get peak load for one load area in MWh The peak load is calculated in aggregated manner for a group of zensus cells that belong to one load area (defined by `cell_ids`). @@ -1102,7 +1102,7 @@ def get_load_timeseries( Parameters ---------- df_profiles: pd.DataFrame - Household load profile data + Household load profile data in Wh * Index: Times steps as serial integers * Columns: pd.MultiIndex with (`HH_TYPE`, `id`) @@ -1125,7 +1125,7 @@ def get_load_timeseries( ------- pd.Series or float Aggregated time series for given `cell_ids` or peak load of this time - series. + series in MWh. 
""" timesteps = len(df_profiles) full_load = pd.Series( @@ -1140,8 +1140,8 @@ def get_load_timeseries( part_load = ( df_profiles.loc[:, df["cell_profile_ids"].sum()].sum(axis=1) * factor - / 1e3 - ) # profiles in Wh + / 1e6 + ) # from Wh to MWh full_load = full_load.add(part_load) if peak_load_only: return full_load.max() @@ -1309,8 +1309,8 @@ def mv_grid_district_HH_electricity_load( peak_load_only=False, ) mvgd_profiles_dict[grid_district] = [ - (mvgd_profile / 1e3).round(3).to_list() - ] # to MWh + mvgd_profile.round(3).to_list() + ] mvgd_profiles = pd.DataFrame.from_dict(mvgd_profiles_dict, orient="index") # Reshape data: put MV grid ids in columns to a single index column From fb0ddaca665b85b6375c6d4445ca9a5eef87b34a Mon Sep 17 00:00:00 2001 From: "Julian.Endres" Date: Thu, 19 Aug 2021 12:15:51 +0200 Subject: [PATCH 82/97] Add code comment for get_load_timeseries() --- src/egon/data/datasets/hh_demand_profiles.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/egon/data/datasets/hh_demand_profiles.py b/src/egon/data/datasets/hh_demand_profiles.py index 1593ba6fb..df2cdcd56 100644 --- a/src/egon/data/datasets/hh_demand_profiles.py +++ b/src/egon/data/datasets/hh_demand_profiles.py @@ -1134,6 +1134,7 @@ def get_load_timeseries( load_area_meta = df_cell_demand_metadata.loc[ cell_ids, ["cell_profile_ids", "nuts3", f"factor_{year}"] ] + # loop over nuts3 (part_load) and sum (full_load) as the scaling factor applies at nuts3 level for (nuts3, factor), df in load_area_meta.groupby( by=["nuts3", f"factor_{year}"] ): From 7a419035931833d663555ed8baa89e2abdef1ea5 Mon Sep 17 00:00:00 2001 From: "Julian.Endres" Date: Thu, 19 Aug 2021 13:14:35 +0200 Subject: [PATCH 83/97] Update file sources to egon-data-bundle and remove download backup --- src/egon/data/datasets.yml | 5 +-- src/egon/data/datasets/hh_demand_profiles.py | 36 ++++---------------- 2 files changed, 7 insertions(+), 34 deletions(-) diff --git a/src/egon/data/datasets.yml b/src/egon/data/datasets.yml 
index 5257c5bd3..b6f7a357b 100644 --- a/src/egon/data/datasets.yml +++ b/src/egon/data/datasets.yml @@ -396,18 +396,15 @@ electrical_load_curves_cts: schema: 'demand' table: 'egon_etrago_electricity_cts' -household_electricity_demand: +hh_demand_profiles: sources: household_electricity_demand_profiles: - url_testmode: "https://next.rl-institut.de/s/BTx6cAKdDNYM9yL/download/hh_el_load_profiles_2400.hdf" path_testmode: "household_electricity_demand_profiles_2400.hdf5" path: "household_electricity_demand_profiles_100k.hdf5" zensus_household_types: - url: "https://next.rl-institut.de/s/Eg3iGJPSiyczQeb/download/Zensus2011_Personen.csv" path: "Zensus2011_Personen.csv" - map_mvgrid_vg250: sources: mv_grid_districts: diff --git a/src/egon/data/datasets/hh_demand_profiles.py b/src/egon/data/datasets/hh_demand_profiles.py index df2cdcd56..ffff61b44 100644 --- a/src/egon/data/datasets/hh_demand_profiles.py +++ b/src/egon/data/datasets/hh_demand_profiles.py @@ -311,8 +311,7 @@ def write_hh_profiles_to_db(hh_profiles): def get_household_demand_profiles_raw(): - """ - Gets and returns household electricity demand profiles from the egon-data-bundle. + """Gets and returns household electricity demand profiles from the egon-data-bundle. Household electricity demand profiles generated by Fraunhofer IEE. Methodology is described in @@ -328,7 +327,6 @@ def get_household_demand_profiles_raw(): performance tests of a multi-component energy system", http://dx.doi.org/10.13140/RG.2.2.13959.14248 - Download only happens, if file isn't already existing. Returns ------- @@ -338,7 +336,7 @@ def get_household_demand_profiles_raw(): types. 
""" data_config = egon.data.config.datasets() - pa_config = data_config["household_electricity_demand"] + pa_config = data_config['hh_demand_profiles'] def ve(s): raise (ValueError(s)) @@ -351,24 +349,14 @@ def ve(s): else ve(f"'{dataset}' is not a valid dataset boundary.") ) - file_path = pa_config["sources"][ - "household_electricity_demand_profiles"][file_section] + file_path = pa_config["sources"]["household_electricity_demand_profiles"][file_section] - download_directory = os.path.join("data_bundle_egon_data", "hh_demand_profiles") - # Create the folder, if it does not exists already - if not os.path.exists(download_directory): - os.mkdir(download_directory) + download_directory = os.path.join("data_bundle_egon_data", "household_electricity_demand_profiles") hh_profiles_file = ( Path(".") / Path(download_directory) / Path(file_path).name ) - # Download file, if it does not exists already - if not hh_profiles_file.is_file(): - hh_profiles_url = pa_config["sources"][ - "household_electricity_demand_profiles"]["url_testmode"] - urlretrieve(hh_profiles_url, hh_profiles_file) - df_hh_profiles = pd.read_hdf(hh_profiles_file) return df_hh_profiles @@ -402,8 +390,7 @@ def process_household_demand_profiles(hh_profiles): def get_zensus_households_raw(): - """ - Get zensus age x household type data from egon-data-bundle + """Get zensus age x household type data from egon-data-bundle Dataset about household size with information about the categories: @@ -428,7 +415,6 @@ def get_zensus_households_raw(): The downloaded file is called 'Zensus2011_Personen.csv'. - Download only happens, if file isn't already existing. 
Returns ------- @@ -436,25 +422,15 @@ def get_zensus_households_raw(): Pre-processed zensus household data """ data_config = egon.data.config.datasets() - pa_config = data_config["household_electricity_demand"] + pa_config = data_config['hh_demand_profiles'] file_path = pa_config["sources"]["zensus_household_types"]["path"] download_directory = os.path.join("data_bundle_egon_data", "zensus_households") - # Create the folder, if it does not exists already - if not os.path.exists(download_directory): - os.mkdir(download_directory) households_file = ( Path(".") / Path(download_directory) / Path(file_path).name ) - # Download file, if it does not exists already - if not households_file.is_file(): - households_url = pa_config["sources"][ - "zensus_households"]["url"] - urlretrieve(households_url, households_file) - - # Read downloaded file from disk households_raw = pd.read_csv( households_file, sep=";", From ec24a5cf1d056075bcf5b98cade7a6c4fdf1b1f8 Mon Sep 17 00:00:00 2001 From: "Julian.Endres" Date: Thu, 19 Aug 2021 14:11:34 +0200 Subject: [PATCH 84/97] Add functions to retrieve scaled profiles from db --- src/egon/data/datasets/hh_demand_profiles.py | 158 ++++++++++++++++++- 1 file changed, 156 insertions(+), 2 deletions(-) diff --git a/src/egon/data/datasets/hh_demand_profiles.py b/src/egon/data/datasets/hh_demand_profiles.py index ffff61b44..4ba839e29 100644 --- a/src/egon/data/datasets/hh_demand_profiles.py +++ b/src/egon/data/datasets/hh_demand_profiles.py @@ -1198,7 +1198,7 @@ def houseprofiles_in_census_cells(): def get_houseprofiles_in_census_cells(): """ - Retrieve household demand time profile mapping + Retrieve household electricity demand profile mapping See Also -------- @@ -1223,6 +1223,159 @@ def get_houseprofiles_in_census_cells(): return census_profile_mapping +def get_cell_demand_metadata_from_db(attribute, list_of_identifiers): + """ + Retrieve selection of household electricity demand profile mapping + + Parameters + ---------- + attribute: str + 
attribute to filter the table + + * nuts3 + * nuts1 + * cell_id + + list_of_identifiers: list of str/int + nuts3/nuts1 need to be str + cell_id need to be int + + See Also + -------- + :func:`houseprofiles_in_census_cells` + + Returns + ------- + pd.DataFrame + Selection of mapping of household demand profiles to zensus cells + """ + attribute_options = ['nuts3', 'nuts1', 'cell_id'] + if attribute not in attribute_options: + raise ValueError(f'attribute has to be one of: {attribute_options}') + + # Query profile ids and scaling factors for specific attributes + with db.session_scope() as session: + if attribute == 'nuts3': + cells_query = session.query( + HouseholdElectricityProfilesInCensusCells.cell_id, + HouseholdElectricityProfilesInCensusCells.cell_profile_ids, + HouseholdElectricityProfilesInCensusCells.nuts3, + HouseholdElectricityProfilesInCensusCells.nuts1, + HouseholdElectricityProfilesInCensusCells.factor_2035, + HouseholdElectricityProfilesInCensusCells.factor_2050). \ + filter(HouseholdElectricityProfilesInCensusCells.nuts3.in_(list_of_identifiers)) + elif attribute == 'nuts1': + cells_query = session.query( + HouseholdElectricityProfilesInCensusCells.cell_id, + HouseholdElectricityProfilesInCensusCells.cell_profile_ids, + HouseholdElectricityProfilesInCensusCells.nuts3, + HouseholdElectricityProfilesInCensusCells.nuts1, + HouseholdElectricityProfilesInCensusCells.factor_2035, + HouseholdElectricityProfilesInCensusCells.factor_2050). \ + filter(HouseholdElectricityProfilesInCensusCells.nuts1.in_(list_of_identifiers)) + elif attribute == 'cell_id': + cells_query = session.query( + HouseholdElectricityProfilesInCensusCells.cell_id, + HouseholdElectricityProfilesInCensusCells.cell_profile_ids, + HouseholdElectricityProfilesInCensusCells.nuts3, + HouseholdElectricityProfilesInCensusCells.nuts1, + HouseholdElectricityProfilesInCensusCells.factor_2035, + HouseholdElectricityProfilesInCensusCells.factor_2050). 
\ + filter(HouseholdElectricityProfilesInCensusCells.cell_id.in_(list_of_identifiers)) + + cell_demand_metadata = pd.read_sql( + cells_query.statement, cells_query.session.bind, index_col='cell_id') + cell_demand_metadata["cell_profile_ids"] = cell_demand_metadata["cell_profile_ids"].apply( + lambda x: [(cat, int(profile_id)) for cat, profile_id in x] + ) + return cell_demand_metadata + + +def get_hh_profiles_from_db(profile_ids): + """ + Retrieve selection of household electricity demand profiles + + Parameters + ---------- + profile_ids: list of tuple (str, int) + tuple consists of (category, profile number) + + See Also + -------- + :func:`houseprofiles_in_census_cells` + + Returns + ------- + pd.DataFrame + Selection of household demand profiles + """ + def gen_profile_names(n): + """Join from Format (str),(int) to (str)a000(int)""" + a = f"{n[0]}a{int(n[1]):04d}" + return a + + # Format profile ids to query + profile_ids = list(map(gen_profile_names, profile_ids)) + + # Query load profiles + with db.session_scope() as session: + cells_query = session.query( + IeeHouseholdLoadProfiles.load_in_wh, + IeeHouseholdLoadProfiles.type). 
\ + filter(IeeHouseholdLoadProfiles.type.in_(profile_ids)) + + df_profile_loads = pd.read_sql( + cells_query.statement, cells_query.session.bind, index_col="type") + + df_profile_loads = pd.DataFrame.from_records(df_profile_loads['load_in_wh'], + index=df_profile_loads.index).T + + return df_profile_loads + + +def get_scaled_profiles_from_db(attribute, list_of_identifiers, year): + """Retrieve selection of scaled household electricity demand profiles + + Parameters + ---------- + attribute: str + attribute to filter the table + + * nuts3 + * nuts1 + * cell_id + + list_of_identifiers: list of str/int + nuts3/nuts1 need to be str + cell_id need to be int + + year: int + * 2035 + * 2050 + + See Also + -------- + :func:`houseprofiles_in_census_cells` + + Returns + ------- + pd.DataFrame + Selection of scaled household electricity demand profiles + """ + cell_demand_metadata = get_cell_demand_metadata_from_db(attribute=attribute, + list_of_identifiers=list_of_identifiers) + profile_ids = cell_demand_metadata.cell_profile_ids.sum() + + df_profiles = get_hh_profiles_from_db(profile_ids) + df_profiles = process_household_demand_profiles(df_profiles) + + df_scaled_profiles = get_load_timeseries(df_profiles=df_profiles, + df_cell_demand_metadata=cell_demand_metadata, + cell_ids=cell_demand_metadata.index.to_list(), + year=year) + return df_scaled_profiles + + def mv_grid_district_HH_electricity_load( scenario_name, scenario_year, version, drop_table=False ): @@ -1230,7 +1383,8 @@ def mv_grid_district_HH_electricity_load( Aggregated household demand time series at HV/MV substation level Calculate the aggregated demand time series based on the demand profiles - of each zensus cell inside each MV grid district. + of each zensus cell inside each MV grid district. Profiles are read from + local hdf5-file. 
Parameters ---------- From bfade72e6cd1b3dc2d87f56821daae0adfcc108d Mon Sep 17 00:00:00 2001 From: "Julian.Endres" Date: Thu, 19 Aug 2021 18:00:07 +0200 Subject: [PATCH 85/97] Add minor docstring infos --- src/egon/data/datasets/hh_demand_profiles.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/egon/data/datasets/hh_demand_profiles.py b/src/egon/data/datasets/hh_demand_profiles.py index 4ba839e29..6255e3034 100644 --- a/src/egon/data/datasets/hh_demand_profiles.py +++ b/src/egon/data/datasets/hh_demand_profiles.py @@ -662,6 +662,11 @@ def process_nuts1_zensus_data(df_zensus): * Adults (15<65) * Seniors (<65) + Parameters + ---------- + df_zensus: pd.DataFrame + cleaned zensus household type x age category data + Returns ------- pd.DataFrame @@ -722,6 +727,10 @@ def enrich_zensus_data_at_cell_level(df_zensus): For this, the dataset 'HHGROESS_KLASS' is converted from people living in households to number of households of specific size. Missing data in 'HHTYP_FAM' is substituted in :func:`create_missing_zensus_data`. 
+ Parameters + ---------- + df_zensus: pd.DataFrame + Aggregated zensus household data on NUTS-1 level Returns ------- From e09b7697c8d7eaa4c2a90f79eb1b4a8af4d94b2a Mon Sep 17 00:00:00 2001 From: "Julian.Endres" Date: Thu, 19 Aug 2021 19:35:50 +0200 Subject: [PATCH 86/97] Fix some pylint issues --- src/egon/data/airflow/dags/pipeline.py | 8 ++++---- src/egon/data/datasets/hh_demand_profiles.py | 9 +++------ 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/src/egon/data/airflow/dags/pipeline.py b/src/egon/data/airflow/dags/pipeline.py index 1f23f25c9..10377aeef 100755 --- a/src/egon/data/airflow/dags/pipeline.py +++ b/src/egon/data/airflow/dags/pipeline.py @@ -368,14 +368,14 @@ etrago_input_data >> solar_rooftop_etrago map_zensus_grid_districts >> solar_rooftop_etrago - mv_HH_electricity_load_2035 = PythonOperator( + mv_hh_electricity_load_2035 = PythonOperator( task_id="MV-hh-electricity-load-2035", python_callable=mv_grid_district_HH_electricity_load, op_args=["eGon2035", 2035, "0.0.0"], op_kwargs={"drop_table": True}, ) - mv_HH_electricity_load_2050 = PythonOperator( + mv_hh_electricity_load_2050 = PythonOperator( task_id="MV-hh-electricity-load-2050", python_callable=mv_grid_district_HH_electricity_load, op_args=["eGon100RE", 2050, "0.0.0"], @@ -389,8 +389,8 @@ demandregio, ], tasks=(houseprofiles_in_census_cells, - mv_HH_electricity_load_2035, - mv_HH_electricity_load_2050,) + mv_hh_electricity_load_2035, + mv_hh_electricity_load_2050,) ) hh_demand.insert_into(pipeline) householdprofiles_in_cencus_cells = tasks["hh_demand_profiles.houseprofiles-in-census-cells"] diff --git a/src/egon/data/datasets/hh_demand_profiles.py b/src/egon/data/datasets/hh_demand_profiles.py index 6255e3034..867c4ac29 100644 --- a/src/egon/data/datasets/hh_demand_profiles.py +++ b/src/egon/data/datasets/hh_demand_profiles.py @@ -96,7 +96,6 @@ from functools import partial from itertools import cycle, product from pathlib import Path -from urllib.request import urlretrieve 
import os import random @@ -106,12 +105,12 @@ import numpy as np import pandas as pd +import egon.data.config from egon.data import db from egon.data.datasets import Dataset from egon.data.datasets.zensus_mv_grid_districts import MapZensusGridDistricts Base = declarative_base() -import egon.data.config # Define mapping of zensus household categories to eurostat categories # - Adults living in househould type @@ -1130,9 +1129,8 @@ def get_load_timeseries( ) # from Wh to MWh full_load = full_load.add(part_load) if peak_load_only: - return full_load.max() - else: - return full_load + full_load = full_load.max() + return full_load def houseprofiles_in_census_cells(): @@ -1480,4 +1478,3 @@ def mv_grid_district_HH_electricity_load( ) return mvgd_profiles - From 74a73ce846b5663933cb7581611038d8dbbb777d Mon Sep 17 00:00:00 2001 From: nesnoj Date: Wed, 25 Aug 2021 13:57:09 +0200 Subject: [PATCH 87/97] rm return val from HH task --- src/egon/data/datasets/hh_demand_profiles.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/egon/data/datasets/hh_demand_profiles.py b/src/egon/data/datasets/hh_demand_profiles.py index 867c4ac29..2cd8e3bc1 100644 --- a/src/egon/data/datasets/hh_demand_profiles.py +++ b/src/egon/data/datasets/hh_demand_profiles.py @@ -1476,5 +1476,3 @@ def mv_grid_district_HH_electricity_load( chunksize=10000, index=False, ) - - return mvgd_profiles From 7d5d294b2d0aabd9616f6dd6124d36f136502191 Mon Sep 17 00:00:00 2001 From: nesnoj Date: Wed, 25 Aug 2021 13:57:19 +0200 Subject: [PATCH 88/97] bump version no --- src/egon/data/datasets/hh_demand_profiles.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/egon/data/datasets/hh_demand_profiles.py b/src/egon/data/datasets/hh_demand_profiles.py index 2cd8e3bc1..95ab82f44 100644 --- a/src/egon/data/datasets/hh_demand_profiles.py +++ b/src/egon/data/datasets/hh_demand_profiles.py @@ -228,7 +228,7 @@ class EgonEtragoElectricityHouseholds(Base): hh_demand_setup = partial( Dataset, name="HH 
Demand", - version="0.0.0", + version="0.0.1", dependencies=[], # Tasks are declared in pipeline as function is used multiple times with different args # To differentiate these tasks PythonOperator with specific id-names are used From aa3e2d20f5c47c5029543af04b7ff18cc13c7a79 Mon Sep 17 00:00:00 2001 From: "Julian.Endres" Date: Wed, 25 Aug 2021 14:38:11 +0200 Subject: [PATCH 89/97] Add fixed random_seed --- src/egon/data/datasets/hh_demand_profiles.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/egon/data/datasets/hh_demand_profiles.py b/src/egon/data/datasets/hh_demand_profiles.py index 95ab82f44..54b5cd124 100644 --- a/src/egon/data/datasets/hh_demand_profiles.py +++ b/src/egon/data/datasets/hh_demand_profiles.py @@ -112,6 +112,9 @@ Base = declarative_base() +# Set random_seed as long as no global_seed exists (see #351). +RANDOM_SEED = 42 + # Define mapping of zensus household categories to eurostat categories # - Adults living in househould type # - number of kids not included even if in housholdtype name @@ -499,12 +502,12 @@ def create_missing_zensus_data( if difference > 0: # add to any row split = split.round() - random_row = split.sample() + random_row = split.sample(random_state=RANDOM_SEED) split[random_row.index] = random_row + difference elif difference < 0: # subtract only from rows > 0 split = split.round() - random_row = split[split > 0].sample() + random_row = split[split > 0].sample(random_state=RANDOM_SEED) split[random_row.index] = random_row + difference else: split = split.round() @@ -923,6 +926,7 @@ def get_cell_demand_profile_ids(df_cell, pool_size): # np.random.default_rng().integers(low=0, high=pool_size[hh_type], size=sq) instead of random.sample # use random.choices() if with replacement # list of sample ids per hh_type in cell + random.seed(RANDOM_SEED) cell_profile_ids = [ (hh_type, random.sample(range(pool_size[hh_type]), k=sq)) if pool_size[hh_type] >= sq From 7a1cdf28ccd3ac5f05a3b8b94698280dc704a04a 
Mon Sep 17 00:00:00 2001 From: "Julian.Endres" Date: Wed, 25 Aug 2021 14:43:49 +0200 Subject: [PATCH 90/97] Add authorship --- AUTHORS.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/AUTHORS.rst b/AUTHORS.rst index 0d7e707a0..a17b7fd23 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -2,4 +2,4 @@ Authors ======= -* Guido Pleßmann, Ilka Cußman, Stephan Günther, Jonathan Amme - https://github.com/openego/eGon-data +* Guido Pleßmann, Ilka Cußman, Stephan Günther, Jonathan Amme, Julian Endres - https://github.com/openego/eGon-data From 2cad50150a3d08e1ddfa46a31dca323e2342dcc9 Mon Sep 17 00:00:00 2001 From: nesnoj Date: Fri, 27 Aug 2021 11:01:17 +0200 Subject: [PATCH 91/97] Obtain global random seed (from CLI param) --- src/egon/data/datasets/hh_demand_profiles.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/egon/data/datasets/hh_demand_profiles.py b/src/egon/data/datasets/hh_demand_profiles.py index 54b5cd124..4623f71cc 100644 --- a/src/egon/data/datasets/hh_demand_profiles.py +++ b/src/egon/data/datasets/hh_demand_profiles.py @@ -112,8 +112,8 @@ Base = declarative_base() -# Set random_seed as long as no global_seed exists (see #351). 
-RANDOM_SEED = 42 +# Get random seed from config +RANDOM_SEED = egon.data.config.settings()['egon-data']['--random-seed'] # Define mapping of zensus household categories to eurostat categories # - Adults living in househould type From c2df97b66360e2079b34a15aa8575ea04ada48e2 Mon Sep 17 00:00:00 2001 From: nesnoj Date: Fri, 27 Aug 2021 11:02:22 +0200 Subject: [PATCH 92/97] Remove re-seeding the random generator on every iteration --- src/egon/data/datasets/hh_demand_profiles.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/egon/data/datasets/hh_demand_profiles.py b/src/egon/data/datasets/hh_demand_profiles.py index 4623f71cc..38872cd9f 100644 --- a/src/egon/data/datasets/hh_demand_profiles.py +++ b/src/egon/data/datasets/hh_demand_profiles.py @@ -502,12 +502,12 @@ def create_missing_zensus_data( if difference > 0: # add to any row split = split.round() - random_row = split.sample(random_state=RANDOM_SEED) + random_row = split.sample() split[random_row.index] = random_row + difference elif difference < 0: # subtract only from rows > 0 split = split.round() - random_row = split[split > 0].sample(random_state=RANDOM_SEED) + random_row = split[split > 0].sample() split[random_row.index] = random_row + difference else: split = split.round() From 63867b5db48ece85feabd728e3cb23f44cee3052 Mon Sep 17 00:00:00 2001 From: nesnoj Date: Fri, 27 Aug 2021 11:02:43 +0200 Subject: [PATCH 93/97] Remove re-seeding the random generator --- src/egon/data/datasets/hh_demand_profiles.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/egon/data/datasets/hh_demand_profiles.py b/src/egon/data/datasets/hh_demand_profiles.py index 38872cd9f..fef871a30 100644 --- a/src/egon/data/datasets/hh_demand_profiles.py +++ b/src/egon/data/datasets/hh_demand_profiles.py @@ -926,7 +926,6 @@ def get_cell_demand_profile_ids(df_cell, pool_size): # np.random.default_rng().integers(low=0, high=pool_size[hh_type], size=sq) instead of random.sample # use random.choices() if with 
replacement # list of sample ids per hh_type in cell - random.seed(RANDOM_SEED) cell_profile_ids = [ (hh_type, random.sample(range(pool_size[hh_type]), k=sq)) if pool_size[hh_type] >= sq From 08d230b4d661560486bbc1fc94dc854109729371 Mon Sep 17 00:00:00 2001 From: nesnoj Date: Fri, 27 Aug 2021 11:03:36 +0200 Subject: [PATCH 94/97] Seed python's and numpy's random generator on init --- src/egon/data/datasets/hh_demand_profiles.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/egon/data/datasets/hh_demand_profiles.py b/src/egon/data/datasets/hh_demand_profiles.py index fef871a30..d977cac49 100644 --- a/src/egon/data/datasets/hh_demand_profiles.py +++ b/src/egon/data/datasets/hh_demand_profiles.py @@ -1147,6 +1147,10 @@ def houseprofiles_in_census_cells(): the database as pandas """ + # Init random generators using global seed + random.seed(RANDOM_SEED) + np.random.seed(RANDOM_SEED) + # Read demand profiles from egon-data-bundle df_profiles = get_household_demand_profiles_raw() From 53bf6c38da3ce09588e51a42635c73f790e8841c Mon Sep 17 00:00:00 2001 From: Julian Endres <51374526+nailend@users.noreply.github.com> Date: Mon, 30 Aug 2021 16:26:06 +0200 Subject: [PATCH 95/97] add peak_load_only option to get_scaled_profiles_from_db() #275 --- src/egon/data/datasets/hh_demand_profiles.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/src/egon/data/datasets/hh_demand_profiles.py b/src/egon/data/datasets/hh_demand_profiles.py index d977cac49..a94db918a 100644 --- a/src/egon/data/datasets/hh_demand_profiles.py +++ b/src/egon/data/datasets/hh_demand_profiles.py @@ -1347,7 +1347,7 @@ def gen_profile_names(n): return df_profile_loads -def get_scaled_profiles_from_db(attribute, list_of_identifiers, year): +def get_scaled_profiles_from_db(attribute, list_of_identifiers, year, peak_load_only=False): """Retrieve selection of scaled household electricity demand profiles Parameters @@ -1366,15 +1366,18 @@ def 
get_scaled_profiles_from_db(attribute, list_of_identifiers, year): year: int * 2035 * 2050 - + + peak_load_only: bool + See Also -------- :func:`houseprofiles_in_census_cells` Returns ------- - pd.DataFrame - Selection of scaled household electricity demand profiles + pd.Series or float + Aggregated time series for given `cell_ids` or peak load of this time + series in MWh. """ cell_demand_metadata = get_cell_demand_metadata_from_db(attribute=attribute, list_of_identifiers=list_of_identifiers) @@ -1383,11 +1386,12 @@ def get_scaled_profiles_from_db(attribute, list_of_identifiers, year): df_profiles = get_hh_profiles_from_db(profile_ids) df_profiles = process_household_demand_profiles(df_profiles) - df_scaled_profiles = get_load_timeseries(df_profiles=df_profiles, + scaled_profiles = get_load_timeseries(df_profiles=df_profiles, df_cell_demand_metadata=cell_demand_metadata, cell_ids=cell_demand_metadata.index.to_list(), - year=year) - return df_scaled_profiles + year=year, + peak_load_only=peak_load_only) + return scaled_profiles def mv_grid_district_HH_electricity_load( From 6769675e24fa07f5e958076138482ae92a3227ca Mon Sep 17 00:00:00 2001 From: "Julian.Endres" Date: Wed, 1 Sep 2021 12:56:45 +0200 Subject: [PATCH 96/97] Set column 'type' as index for table iee_household_load_profiles --- src/egon/data/datasets/hh_demand_profiles.py | 209 +++++++++++-------- 1 file changed, 118 insertions(+), 91 deletions(-) diff --git a/src/egon/data/datasets/hh_demand_profiles.py b/src/egon/data/datasets/hh_demand_profiles.py index a94db918a..0906a6bf6 100644 --- a/src/egon/data/datasets/hh_demand_profiles.py +++ b/src/egon/data/datasets/hh_demand_profiles.py @@ -105,15 +105,16 @@ import numpy as np import pandas as pd -import egon.data.config from egon.data import db from egon.data.datasets import Dataset from egon.data.datasets.zensus_mv_grid_districts import MapZensusGridDistricts +import egon.data.config + Base = declarative_base() # Get random seed from config -RANDOM_SEED = 
egon.data.config.settings()['egon-data']['--random-seed'] +RANDOM_SEED = egon.data.config.settings()["egon-data"]["--random-seed"] # Define mapping of zensus household categories to eurostat categories # - Adults living in househould type @@ -200,7 +201,7 @@ class IeeHouseholdLoadProfiles(Base): __table_args__ = {"schema": "demand"} id = Column(INTEGER, primary_key=True) - type = Column(CHAR(7)) + type = Column(CHAR(7), index=True) load_in_wh = Column(ARRAY(REAL)) # , dimensions=2)) @@ -315,30 +316,30 @@ def write_hh_profiles_to_db(hh_profiles): def get_household_demand_profiles_raw(): """Gets and returns household electricity demand profiles from the egon-data-bundle. - Household electricity demand profiles generated by Fraunhofer IEE. - Methodology is described in - :ref:`Erzeugung zeitlich hochaufgelöster Stromlastprofile für verschiedene - Haushaltstypen - `_. - It is used and further described in the following theses by: - - * Jonas Haack: - "Auswirkungen verschiedener Haushaltslastprofile auf PV-Batterie-Systeme" (confidential) - * Simon Ruben Drauz - "Synthesis of a heat and electrical load profile for single and multi-family houses used for subsequent - performance tests of a multi-component energy system", - http://dx.doi.org/10.13140/RG.2.2.13959.14248 - - - Returns - ------- - pd.DataFrame - Table with profiles in columns and time as index. A pd.MultiIndex is - used to distinguish load profiles from different EUROSTAT household - types. - """ + Household electricity demand profiles generated by Fraunhofer IEE. + Methodology is described in + :ref:`Erzeugung zeitlich hochaufgelöster Stromlastprofile für verschiedene + Haushaltstypen + `_. 
+ It is used and further described in the following theses by: + + * Jonas Haack: + "Auswirkungen verschiedener Haushaltslastprofile auf PV-Batterie-Systeme" (confidential) + * Simon Ruben Drauz + "Synthesis of a heat and electrical load profile for single and multi-family houses used for subsequent + performance tests of a multi-component energy system", + http://dx.doi.org/10.13140/RG.2.2.13959.14248 + + + Returns + ------- + pd.DataFrame + Table with profiles in columns and time as index. A pd.MultiIndex is + used to distinguish load profiles from different EUROSTAT household + types. + """ data_config = egon.data.config.datasets() - pa_config = data_config['hh_demand_profiles'] + pa_config = data_config["hh_demand_profiles"] def ve(s): raise (ValueError(s)) @@ -346,14 +347,20 @@ def ve(s): dataset = egon.data.config.settings()["egon-data"]["--dataset-boundary"] file_section = ( - "path" if dataset == "Everything" else "path_testmode" + "path" + if dataset == "Everything" + else "path_testmode" if dataset == "Schleswig-Holstein" else ve(f"'{dataset}' is not a valid dataset boundary.") ) - file_path = pa_config["sources"]["household_electricity_demand_profiles"][file_section] + file_path = pa_config["sources"]["household_electricity_demand_profiles"][ + file_section + ] - download_directory = os.path.join("data_bundle_egon_data", "household_electricity_demand_profiles") + download_directory = os.path.join( + "data_bundle_egon_data", "household_electricity_demand_profiles" + ) hh_profiles_file = ( Path(".") / Path(download_directory) / Path(file_path).name @@ -424,10 +431,12 @@ def get_zensus_households_raw(): Pre-processed zensus household data """ data_config = egon.data.config.datasets() - pa_config = data_config['hh_demand_profiles'] + pa_config = data_config["hh_demand_profiles"] file_path = pa_config["sources"]["zensus_household_types"]["path"] - download_directory = os.path.join("data_bundle_egon_data", "zensus_households") + download_directory = 
os.path.join( + "data_bundle_egon_data", "zensus_households" + ) households_file = ( Path(".") / Path(download_directory) / Path(file_path).name @@ -1263,45 +1272,58 @@ def get_cell_demand_metadata_from_db(attribute, list_of_identifiers): pd.DataFrame Selection of mapping of household demand profiles to zensus cells """ - attribute_options = ['nuts3', 'nuts1', 'cell_id'] + attribute_options = ["nuts3", "nuts1", "cell_id"] if attribute not in attribute_options: - raise ValueError(f'attribute has to be one of: {attribute_options}') + raise ValueError(f"attribute has to be one of: {attribute_options}") # Query profile ids and scaling factors for specific attributes with db.session_scope() as session: - if attribute == 'nuts3': + if attribute == "nuts3": cells_query = session.query( HouseholdElectricityProfilesInCensusCells.cell_id, HouseholdElectricityProfilesInCensusCells.cell_profile_ids, HouseholdElectricityProfilesInCensusCells.nuts3, HouseholdElectricityProfilesInCensusCells.nuts1, HouseholdElectricityProfilesInCensusCells.factor_2035, - HouseholdElectricityProfilesInCensusCells.factor_2050). \ - filter(HouseholdElectricityProfilesInCensusCells.nuts3.in_(list_of_identifiers)) - elif attribute == 'nuts1': + HouseholdElectricityProfilesInCensusCells.factor_2050, + ).filter( + HouseholdElectricityProfilesInCensusCells.nuts3.in_( + list_of_identifiers + ) + ) + elif attribute == "nuts1": cells_query = session.query( HouseholdElectricityProfilesInCensusCells.cell_id, HouseholdElectricityProfilesInCensusCells.cell_profile_ids, HouseholdElectricityProfilesInCensusCells.nuts3, HouseholdElectricityProfilesInCensusCells.nuts1, HouseholdElectricityProfilesInCensusCells.factor_2035, - HouseholdElectricityProfilesInCensusCells.factor_2050). 
\ - filter(HouseholdElectricityProfilesInCensusCells.nuts1.in_(list_of_identifiers)) - elif attribute == 'cell_id': + HouseholdElectricityProfilesInCensusCells.factor_2050, + ).filter( + HouseholdElectricityProfilesInCensusCells.nuts1.in_( + list_of_identifiers + ) + ) + elif attribute == "cell_id": cells_query = session.query( HouseholdElectricityProfilesInCensusCells.cell_id, HouseholdElectricityProfilesInCensusCells.cell_profile_ids, HouseholdElectricityProfilesInCensusCells.nuts3, HouseholdElectricityProfilesInCensusCells.nuts1, HouseholdElectricityProfilesInCensusCells.factor_2035, - HouseholdElectricityProfilesInCensusCells.factor_2050). \ - filter(HouseholdElectricityProfilesInCensusCells.cell_id.in_(list_of_identifiers)) + HouseholdElectricityProfilesInCensusCells.factor_2050, + ).filter( + HouseholdElectricityProfilesInCensusCells.cell_id.in_( + list_of_identifiers + ) + ) cell_demand_metadata = pd.read_sql( - cells_query.statement, cells_query.session.bind, index_col='cell_id') - cell_demand_metadata["cell_profile_ids"] = cell_demand_metadata["cell_profile_ids"].apply( - lambda x: [(cat, int(profile_id)) for cat, profile_id in x] + cells_query.statement, cells_query.session.bind, index_col="cell_id" ) + cell_demand_metadata["cell_profile_ids"] = cell_demand_metadata[ + "cell_profile_ids" + ].apply(lambda x: [(cat, int(profile_id)) for cat, profile_id in x]) return cell_demand_metadata @@ -1323,6 +1345,7 @@ def get_hh_profiles_from_db(profile_ids): pd.DataFrame Selection of household demand profiles """ + def gen_profile_names(n): """Join from Format (str),(int) to (str)a000(int)""" a = f"{n[0]}a{int(n[1]):04d}" @@ -1334,63 +1357,69 @@ def gen_profile_names(n): # Query load profiles with db.session_scope() as session: cells_query = session.query( - IeeHouseholdLoadProfiles.load_in_wh, - IeeHouseholdLoadProfiles.type). 
\ - filter(IeeHouseholdLoadProfiles.type.in_(profile_ids)) + IeeHouseholdLoadProfiles.load_in_wh, IeeHouseholdLoadProfiles.type + ).filter(IeeHouseholdLoadProfiles.type.in_(profile_ids)) df_profile_loads = pd.read_sql( - cells_query.statement, cells_query.session.bind, index_col="type") + cells_query.statement, cells_query.session.bind, index_col="type" + ) - df_profile_loads = pd.DataFrame.from_records(df_profile_loads['load_in_wh'], - index=df_profile_loads.index).T + df_profile_loads = pd.DataFrame.from_records( + df_profile_loads["load_in_wh"], index=df_profile_loads.index + ).T return df_profile_loads -def get_scaled_profiles_from_db(attribute, list_of_identifiers, year, peak_load_only=False): +def get_scaled_profiles_from_db( + attribute, list_of_identifiers, year, peak_load_only=False +): """Retrieve selection of scaled household electricity demand profiles - Parameters - ---------- - attribute: str - attribute to filter the table - - * nuts3 - * nuts1 - * cell_id - - list_of_identifiers: list of str/int - nuts3/nuts1 need to be str - cell_id need to be int - - year: int - * 2035 - * 2050 - - peak_load_only: bool - - See Also - -------- - :func:`houseprofiles_in_census_cells` - - Returns - ------- - pd.Series or float - Aggregated time series for given `cell_ids` or peak load of this time - series in MWh. - """ - cell_demand_metadata = get_cell_demand_metadata_from_db(attribute=attribute, - list_of_identifiers=list_of_identifiers) + Parameters + ---------- + attribute: str + attribute to filter the table + + * nuts3 + * nuts1 + * cell_id + + list_of_identifiers: list of str/int + nuts3/nuts1 need to be str + cell_id need to be int + + year: int + * 2035 + * 2050 + + peak_load_only: bool + + See Also + -------- + :func:`houseprofiles_in_census_cells` + + Returns + ------- + pd.Series or float + Aggregated time series for given `cell_ids` or peak load of this time + series in MWh. 
+ """ + cell_demand_metadata = get_cell_demand_metadata_from_db( + attribute=attribute, list_of_identifiers=list_of_identifiers + ) profile_ids = cell_demand_metadata.cell_profile_ids.sum() df_profiles = get_hh_profiles_from_db(profile_ids) df_profiles = process_household_demand_profiles(df_profiles) - scaled_profiles = get_load_timeseries(df_profiles=df_profiles, - df_cell_demand_metadata=cell_demand_metadata, - cell_ids=cell_demand_metadata.index.to_list(), - year=year, - peak_load_only=peak_load_only) + scaled_profiles = get_load_timeseries( + df_profiles=df_profiles, + df_cell_demand_metadata=cell_demand_metadata, + cell_ids=cell_demand_metadata.index.to_list(), + year=year, + peak_load_only=peak_load_only, + ) return scaled_profiles @@ -1457,9 +1486,7 @@ def mv_grid_district_HH_electricity_load( year=scenario_year, peak_load_only=False, ) - mvgd_profiles_dict[grid_district] = [ - mvgd_profile.round(3).to_list() - ] + mvgd_profiles_dict[grid_district] = [mvgd_profile.round(3).to_list()] mvgd_profiles = pd.DataFrame.from_dict(mvgd_profiles_dict, orient="index") # Reshape data: put MV grid ids in columns to a single index column From 1fbd656dbeb7d6cdb0c641dbf86cbdf4746a120b Mon Sep 17 00:00:00 2001 From: nesnoj Date: Mon, 6 Sep 2021 14:46:27 +0200 Subject: [PATCH 97/97] Fix table column name --- src/egon/data/datasets/hh_demand_profiles.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/egon/data/datasets/hh_demand_profiles.py b/src/egon/data/datasets/hh_demand_profiles.py index 0906a6bf6..9d3df501b 100644 --- a/src/egon/data/datasets/hh_demand_profiles.py +++ b/src/egon/data/datasets/hh_demand_profiles.py @@ -861,10 +861,10 @@ def enrich_zensus_data_at_cell_level(df_zensus): # Census cells with nuts3 and nuts1 information df_grid_id = db.select_dataframe( sql=""" - SELECT pop.grid_id, pop.gid as cell_id, vg250.vg250_nuts3 as nuts3, lan.nuts as nuts1, lan.gen + SELECT pop.grid_id, pop.id as cell_id, vg250.vg250_nuts3 as nuts3, lan.nuts 
as nuts1, lan.gen FROM society.destatis_zensus_population_per_ha_inside_germany as pop LEFT JOIN boundaries.egon_map_zensus_vg250 as vg250 - ON (pop.gid=vg250.zensus_population_id) + ON (pop.id=vg250.zensus_population_id) LEFT JOIN boundaries.vg250_lan as lan ON (LEFT(vg250.vg250_nuts3, 3)=lan.nuts) WHERE lan.gf = 4 """