Reformat by black.
interpret-ml committed Dec 12, 2019
1 parent 3aaf4fd commit 44fed0c
Showing 9 changed files with 323 additions and 220 deletions.
243 changes: 148 additions & 95 deletions python/interpret-core/interpret/glassbox/ebm/ebm.py

Large diffs are not rendered by default.

191 changes: 114 additions & 77 deletions python/interpret-core/interpret/glassbox/ebm/internal.py

Large diffs are not rendered by default.

@@ -63,7 +63,10 @@ def multiclass_postprocess(
             updated_feature_graphs[i], change.reshape((num_bins, -1))
         )
         for k in range(K):
-            mean = np.sum(np.multiply(updated_feature_graphs[i][:, k], bincount)) / X_binned.shape[1]
+            mean = (
+                np.sum(np.multiply(updated_feature_graphs[i][:, k], bincount))
+                / X_binned.shape[1]
+            )
             updated_feature_graphs[i][:, k] = np.subtract(
                 updated_feature_graphs[i][:, k], mean
             )
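An illustrative aside on the hunk above (toy data of my own, not from this commit): the reformatted statement computes, for each class k, the bin-count-weighted mean of that class's per-bin scores and then subtracts it, so every class column of the feature graph ends up centered around zero.

import numpy as np

# Toy setup: 3 bins, 2 classes, and the number of training instances per bin.
feature_graph = np.array([[0.5, -0.5],
                          [1.0,  0.0],
                          [2.0,  1.0]])
bincount = np.array([10, 30, 60])
n_instances = bincount.sum()  # plays the role of X_binned.shape[1]

for k in range(feature_graph.shape[1]):
    # Count-weighted mean of class k's per-bin scores.
    mean = np.sum(np.multiply(feature_graph[:, k], bincount)) / n_instances
    feature_graph[:, k] = np.subtract(feature_graph[:, k], mean)

# Each class column now has a count-weighted mean of zero.
print(feature_graph)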
@@ -226,4 +226,3 @@ def test_zero_validation():
 
     clf = ExplainableBoostingClassifier(n_jobs=1, interactions=2, holdout_split=0)
     clf.fit(X, y)
-
69 changes: 30 additions & 39 deletions python/interpret-core/interpret/glassbox/ebm/utils.py
@@ -23,52 +23,54 @@ def get_count_scores_c(n_classes):
         return 1 if n_classes <= 2 else n_classes
 
     @staticmethod
-    def ebm_train_test_split(X, y, test_size, random_state, is_classification, is_train=True):
+    def ebm_train_test_split(
+        X, y, test_size, random_state, is_classification, is_train=True
+    ):
         # TODO PK Implement the following for memory efficiency and speed of initialization:
         # - NOTE: FOR RawArray -> import multiprocessing ++ from multiprocessing import RawArray ++ RawArray(ct.c_ubyte, memory_size) ++ ct.POINTER(ct.c_ubyte)
         # - OBSERVATION: We want sparse feature support in our booster since we don't need to access
         #   memory if there are long segments with a single value
         # - OBSERVATION: Sorting a dataset with sparse features will lead to unpredictably sized final memory sizes,
         #   since more clumped data will be more compressed
         # - OBSERVATION: for interactions, from a CPU access point of view, we want all of our features to have the
         #   same # of bits so that we can have one loop compare any tuple of features.
         #   We therefore do NOT want sparse feature support when looking for interactions
         # - OBSERVATION: sorting will be easier for non-sparse data, and we'll want non-sparse data for interactions anyways,
         #   so we should only do sparseness for our boosting dataset allocation
         # - OBSERVATION: without sparse memory in the initial shared memory object, we can calculate the size without seeing the data.
         #   Even if we had sorted sparse features, we'd only find out the memory size after the sort,
         #   so we'd want dynamically allocated memory during the sort
         # - OBSERVATION: for boosting, we can compress memory to the right size per feature_combination,
         #   but for interactions, we want to compress all features by the same amount
         #   (all features use the same number of bits) so that we can compare any two/three/etc
         #   features and loop at the same points for each
         # STEPS:
         # - We receive the data from the user in the cache inefficient format X[instances, features]
         # - Do preprocessing so that we know how many bins each feature has
         #   (we might want to process X[instances, features] in chunks, like below to do this)
         # - call into C to get back the exact size of the memory object that we need in order to store all the data.
         #   We can do this because we won't store any of the data at this point as sparse
         # - Allocate the buffer in python using RawArray (RawArray will be shared with other processes as read only data)
         # - Divide the features into M chunks of N features. Let's choose M to be 32, so that we don't increase memory usage by more than 3%
         # - Loop over M:
         #   - Take N features and all the instances from the original X and transpose them into X_partial[features_N, instances]
         #   - Loop over N:
         #     - take 1 feature and pass it into C for bit compression (don't use sparse coding here) into the RawArray
         #       - NOTE: this transposes the matrix twice (once for preprocessing and once for adding to C),
         #         but this is expected to be a small amount of time compared to training, and we care more about memory size at this point
         # - Call a C function which will finalize the dataset (this function will accept the target array).
         #   - The C function will create an index array and add this index to the dataset (it will be shared)
         #   - sort the index array by the target first, then the features with the highest counts of the mode value
         #   - sort the underlying data by the index array
         # - Now the memory is read only from now on, and shareable. Include a reverse index in the data for reconstructing the
         #   original order inside the data structure.
         # - No pointers in the data structure, just offsets (for sharing cross process)!
         # - Start each child processes, and pass them our shared memory structure
         #   (it will be mapped into each process address space, but not copied)
         # - each child calls a train/validation splitter provided by our C that fills a numpy array of bools
         #   We do this in C instead of using the sklearn train_test_split because sklearn would require us to first split sequential indexes,
         #   possibly sort them (if order in not guaranteed), then convert to bools in a caching inefficient way,
         #   whereas in C we can do a single pass without any memory array inputs (using just a random number generator)
         #   and we can make the outputs consistent across languages.
         # - with the RawArray complete data PLUS the train/validation bool list we can generate either interaction datasets OR boosting dataset as needed.
         #   We can reduce our memory footprint, by never having both an interaction AND boosting dataset in memory at the same time.
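A side note on the RawArray idea referenced in the TODO above. The sketch below is my own illustration of two building blocks the comment names, not code from this commit: allocating a shared byte buffer with multiprocessing's RawArray and viewing it through numpy without a copy, plus a single-pass train/validation boolean mask driven only by a seeded random number generator (the comment describes that part as future work to be done in C).

import ctypes as ct

import numpy as np
from multiprocessing import RawArray

# Hypothetical size; in the plan above it would come from a C sizing call.
memory_size = 1024

# Shared, zero-initialized bytes; child processes can map this buffer
# without copying when it is handed to them at process start.
shared_buf = RawArray(ct.c_ubyte, memory_size)

# View the same memory as a numpy array (no copy is made).
data = np.frombuffer(shared_buf, dtype=np.uint8)

# Single-pass train/validation split expressed as a boolean mask, using
# nothing but a seeded RNG -- no index arrays to build, sort, or convert.
n_instances = 200
test_size = 0.15
rng = np.random.default_rng(42)
is_validation = rng.random(n_instances) < test_size  # ~test_size of the rows
is_train = ~is_validation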
@@ -160,9 +162,7 @@ def gen_feature_combinations(feature_indices):
         return feature_combinations
 
     @staticmethod
-    def scores_by_feature_combination(
-        X, feature_combinations, model
-    ):
+    def scores_by_feature_combination(X, feature_combinations, model):
         for set_idx, feature_combination in enumerate(feature_combinations):
             tensor = model[set_idx]
 
@@ -175,9 +175,7 @@ def scores_by_feature_combination(X, feature_combinations, model):
             yield set_idx, feature_combination, scores
 
     @staticmethod
-    def decision_function(
-        X, feature_combinations, model, intercept
-    ):
+    def decision_function(X, feature_combinations, model, intercept):
         if X.ndim == 1:
             X = X.reshape(X.shape[0], 1)
 
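For orientation while reading these hunks: decision_function is the additive core of the model; for each instance it looks up one score per feature combination in that combination's tensor and sums them together with the intercept. The sketch below is a simplified, single-feature-per-combination illustration with made-up numbers, not the module's actual implementation:

import numpy as np

# Made-up binned data: 4 instances x 2 features, values are bin indices.
X_binned = np.array([[0, 2],
                     [1, 0],
                     [2, 1],
                     [0, 0]])

# One additive score tensor per feature combination (here: single features
# with 3 bins each), plus an intercept.
model = [np.array([0.1, -0.2, 0.4]),
         np.array([-0.3, 0.0, 0.25])]
feature_combinations = [[0], [1]]
intercept = 0.05

scores = np.full(X_binned.shape[0], intercept, dtype=float)
for tensor, combination in zip(model, feature_combinations):
    feature = combination[0]                 # single-feature combination
    scores += tensor[X_binned[:, feature]]   # per-instance bin lookup

print(scores)  # log odds for classification, raw predictions for regression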
@@ -205,10 +203,7 @@ def decision_function(X, feature_combinations, model, intercept):
     @staticmethod
     def classifier_predict_proba(X, feature_combinations, model, intercept):
         log_odds_vector = EBMUtils.decision_function(
-            X,
-            feature_combinations,
-            model,
-            intercept
+            X, feature_combinations, model, intercept
         )
 
         # Handle binary classification case -- softmax only works with 0s appended
@@ -220,10 +215,7 @@ def classifier_predict_proba(X, feature_combinations, model, intercept):
     @staticmethod
     def classifier_predict(X, feature_combinations, model, intercept, classes):
         log_odds_vector = EBMUtils.decision_function(
-            X,
-            feature_combinations,
-            model,
-            intercept
+            X, feature_combinations, model, intercept
         )
         if log_odds_vector.ndim == 1:
             log_odds_vector = np.c_[np.zeros(log_odds_vector.shape), log_odds_vector]
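On the "softmax only works with 0s appended" comment above: with two classes the decision function yields a single log-odds column, and a zero column is stacked next to it (as in the np.c_ line in this hunk) so that one softmax/argmax path covers both binary and multiclass output. A small standalone illustration with toy numbers, using plain numpy rather than whichever softmax helper the module calls:

import numpy as np

log_odds_vector = np.array([-1.2, 0.3, 2.0])  # toy binary log odds, shape (n,)

# Add a zero column for the reference class, as the hunk above does.
scores = np.c_[np.zeros(log_odds_vector.shape), log_odds_vector]

# Row-wise, numerically stabilized softmax -> predict_proba-style output.
exp = np.exp(scores - scores.max(axis=1, keepdims=True))
probabilities = exp / exp.sum(axis=1, keepdims=True)

# Argmax picks the class index, which classifier_predict would then map
# onto the fitted classes array.
predicted = probabilities.argmax(axis=1)
print(probabilities)
print(predicted)  # [0 1 1]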
@@ -232,20 +224,19 @@ def classifier_predict(X, feature_combinations, model, intercept, classes):
 
     @staticmethod
     def regressor_predict(X, feature_combinations, model, intercept):
-        scores = EBMUtils.decision_function(
-            X,
-            feature_combinations,
-            model,
-            intercept
-        )
+        scores = EBMUtils.decision_function(X, feature_combinations, model, intercept)
         return scores
 
     @staticmethod
     def gen_feature_name(feature_idxs, col_names):
         feature_name = []
         for feature_index in feature_idxs:
             col_name = col_names[feature_index]
-            feature_name.append("feature_" + str(col_name) if isinstance(col_name, int) else str(col_name))
+            feature_name.append(
+                "feature_" + str(col_name)
+                if isinstance(col_name, int)
+                else str(col_name)
+            )
         feature_name = " x ".join(feature_name)
         return feature_name
 
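The behavior of gen_feature_name is unchanged by this reformat: integer column names get a "feature_" prefix, string names pass through, and the names of a multi-feature combination are joined with " x ". Illustrative calls (values are made up; the import assumes the packaged module path):

from interpret.glassbox.ebm.utils import EBMUtils

# String column names pass through unchanged.
EBMUtils.gen_feature_name([0, 2], ["Age", "Income", "CapitalGain"])
# -> "Age x CapitalGain"

# Integer column names (e.g. when no feature names were supplied) get prefixed.
EBMUtils.gen_feature_name([1, 3], [0, 1, 2, 3])
# -> "feature_1 x feature_3"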
10 changes: 7 additions & 3 deletions python/interpret-core/interpret/glassbox/skoperules.py
@@ -121,9 +121,13 @@ def fit(self, X, y):
         self.pos_ratio_ = np.mean(y)
 
         # Extract rules
-        self.internal_rules_, self.rules_, self.prec_, self.recall_, self.feat_rule_map_ = self._extract_rules(
-            self.sk_model_.rules_
-        )
+        (
+            self.internal_rules_,
+            self.rules_,
+            self.prec_,
+            self.recall_,
+            self.feat_rule_map_,
+        ) = self._extract_rules(self.sk_model_.rules_)
 
         self.global_selector = gen_global_selector(
             X, self.feature_names, self.feature_types, None
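For context on the fit() method reformatted above: this module wraps skope-rules as one of interpret's glassbox models, and fit() extracts the learned rules plus their precision, recall, and feature-to-rule map from the underlying skope-rules model. A hedged usage sketch on toy data, assuming the wrapper in this file is the DecisionListClassifier exported from interpret.glassbox:

import numpy as np
from interpret.glassbox import DecisionListClassifier  # assumed export of this wrapper

# Toy binary classification data.
rng = np.random.default_rng(0)
X = rng.normal(size=(200, 4))
y = (X[:, 0] + 0.5 * X[:, 2] > 0).astype(int)

clf = DecisionListClassifier(feature_names=["f0", "f1", "f2", "f3"])
clf.fit(X, y)  # populates rules_, prec_, recall_, feat_rule_map_, ...
predictions = clf.predict(X)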
@@ -47,6 +47,7 @@ def test_azureml_provider():
 def test_auto_visualize_provider(example_explanation):
     # NOTE: We know this environment is going to use Dash.
     from ...visual.dashboard import AppRunner
+
     ip = "127.0.0.1"
     port = "7200"
     app_runner = AppRunner(addr=(ip, port))
1 change: 1 addition & 0 deletions python/interpret-core/interpret/utils/all.py
@@ -90,6 +90,7 @@ def gen_global_selector(X, feature_names, feature_types, importance_scores, roun
     else: # pragma: no cover
         return df
 
+
 def gen_local_selector(y, y_hat, round=3):
     records = []
 
22 changes: 18 additions & 4 deletions python/interpret-core/interpret/visual/interactive.py
@@ -47,7 +47,10 @@ def get_show_addr():
         Address tuple (ip, port).
     """
     if isinstance(this.visualize_provider, DashProvider):
-        addr = (this.visualize_provider.app_runner.ip, this.visualize_provider.app_runner.port)
+        addr = (
+            this.visualize_provider.app_runner.ip,
+            this.visualize_provider.app_runner.port,
+        )
         return addr
     else:
         return None
@@ -99,11 +102,22 @@ def init_show_server(addr=None, base_url=None, use_relative_links=False):
         log.info("Stopping previous dash provider")
         shutdown_show_server()
 
-    log.info("Replacing visualize provider: {} with {}".format(type(this.visualize_provider), type(DashProvider)))
-    set_visualize_provider(DashProvider.from_address(addr=addr, base_url=base_url, use_relative_links=use_relative_links))
+    log.info(
+        "Replacing visualize provider: {} with {}".format(
+            type(this.visualize_provider), type(DashProvider)
+        )
+    )
+    set_visualize_provider(
+        DashProvider.from_address(
+            addr=addr, base_url=base_url, use_relative_links=use_relative_links
+        )
+    )
     this.visualize_provider.idempotent_start()
 
-    addr = (this.visualize_provider.app_runner.ip, this.visualize_provider.app_runner.port)
+    addr = (
+        this.visualize_provider.app_runner.ip,
+        this.visualize_provider.app_runner.port,
+    )
     log.info("Running dash provider at {}".format(addr))
 
     return None
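Taken together, the two functions touched in this file let a caller pin the Dash dashboard to a known address and query it later. A short usage sketch (address values are examples; imports assume the packaged module path interpret.visual.interactive):

from interpret.visual.interactive import (
    get_show_addr,
    init_show_server,
    shutdown_show_server,
)

# Start (or replace) the Dash-based visualize provider on a fixed address.
init_show_server(addr=("127.0.0.1", 7001))

# Other code can then ask where the dashboard is being served.
print(get_show_addr())  # address tuple (ip, port) once the Dash provider is running

# Tear the server down when finished.
shutdown_show_server()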

0 comments on commit 44fed0c
