Reformat by black.
interpret-ml committed Dec 12, 2019
1 parent 3aaf4fd commit 44fed0c
Showing 9 changed files with 323 additions and 220 deletions.
243 changes: 148 additions & 95 deletions python/interpret-core/interpret/glassbox/ebm/ebm.py

Large diffs are not rendered by default.

191 changes: 114 additions & 77 deletions python/interpret-core/interpret/glassbox/ebm/internal.py

Large diffs are not rendered by default.

@@ -63,7 +63,10 @@ def multiclass_postprocess(
             updated_feature_graphs[i], change.reshape((num_bins, -1))
         )
         for k in range(K):
-            mean = np.sum(np.multiply(updated_feature_graphs[i][:, k], bincount)) / X_binned.shape[1]
+            mean = (
+                np.sum(np.multiply(updated_feature_graphs[i][:, k], bincount))
+                / X_binned.shape[1]
+            )
             updated_feature_graphs[i][:, k] = np.subtract(
                 updated_feature_graphs[i][:, k], mean
             )
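An illustrative aside on the hunk above (toy data of my own, not from this commit): the reformatted statement computes, for each class k, the bin-count-weighted mean of that class's per-bin scores and then subtracts it, so every class column of the feature graph ends up centered around zero.

import numpy as np

# Toy setup: 3 bins, 2 classes, and the number of training instances per bin.
feature_graph = np.array([[0.5, -0.5],
                          [1.0,  0.0],
                          [2.0,  1.0]])
bincount = np.array([10, 30, 60])
n_instances = bincount.sum()  # plays the role of X_binned.shape[1]

for k in range(feature_graph.shape[1]):
    # Count-weighted mean of class k's per-bin scores.
    mean = np.sum(np.multiply(feature_graph[:, k], bincount)) / n_instances
    feature_graph[:, k] = np.subtract(feature_graph[:, k], mean)

# Each class column now has a count-weighted mean of zero.
print(feature_graph)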
@@ -226,4 +226,3 @@ def test_zero_validation():
 
     clf = ExplainableBoostingClassifier(n_jobs=1, interactions=2, holdout_split=0)
     clf.fit(X, y)
-
69 changes: 30 additions & 39 deletions python/interpret-core/interpret/glassbox/ebm/utils.py
@@ -23,52 +23,54 @@ def get_count_scores_c(n_classes):
         return 1 if n_classes <= 2 else n_classes
 
     @staticmethod
-    def ebm_train_test_split(X, y, test_size, random_state, is_classification, is_train=True):
+    def ebm_train_test_split(
+        X, y, test_size, random_state, is_classification, is_train=True
+    ):
         # TODO PK Implement the following for memory efficiency and speed of initialization:
         # - NOTE: FOR RawArray -> import multiprocessing ++ from multiprocessing import RawArray ++ RawArray(ct.c_ubyte, memory_size) ++ ct.POINTER(ct.c_ubyte)
         # - OBSERVATION: We want sparse feature support in our booster since we don't need to access
         #   memory if there are long segments with a single value
         # - OBSERVATION: Sorting a dataset with sparse features will lead to unpredictably sized final memory sizes,
         #   since more clumped data will be more compressed
         # - OBSERVATION: for interactions, from a CPU access point of view, we want all of our features to have the
         #   same # of bits so that we can have one loop compare any tuple of features.
         #   We therefore do NOT want sparse feature support when looking for interactions
         # - OBSERVATION: sorting will be easier for non-sparse data, and we'll want non-sparse data for interactions anyways,
         #   so we should only do sparseness for our boosting dataset allocation
         # - OBSERVATION: without sparse memory in the initial shared memory object, we can calculate the size without seeing the data.
         #   Even if we had sorted sparse features, we'd only find out the memory size after the sort,
         #   so we'd want dynamically allocated memory during the sort
         # - OBSERVATION: for boosting, we can compress memory to the right size per feature_combination,
         #   but for interactions, we want to compress all features by the same amount
         #   (all features use the same number of bits) so that we can compare any two/three/etc
         #   features and loop at the same points for each
         # STEPS:
         # - We receive the data from the user in the cache inefficient format X[instances, features]
         # - Do preprocessing so that we know how many bins each feature has
         #   (we might want to process X[instances, features] in chunks, like below to do this)
         # - call into C to get back the exact size of the memory object that we need in order to store all the data.
         #   We can do this because we won't store any of the data at this point as sparse
         # - Allocate the buffer in python using RawArray (RawArray will be shared with other processes as read only data)
         # - Divide the features into M chunks of N features. Let's choose M to be 32, so that we don't increase memory usage by more than 3%
         # - Loop over M:
         #   - Take N features and all the instances from the original X and transpose them into X_partial[features_N, instances]
         #   - Loop over N:
         #     - take 1 feature and pass it into C for bit compression (don't use sparse coding here) into the RawArray
         #       - NOTE: this transposes the matrix twice (once for preprocessing and once for adding to C),
         #         but this is expected to be a small amount of time compared to training, and we care more about memory size at this point
         # - Call a C function which will finalize the dataset (this function will accept the target array).
         #   - The C function will create an index array and add this index to the dataset (it will be shared)
         #   - sort the index array by the target first, then the features with the highest counts of the mode value
         #   - sort the underlying data by the index array
         # - Now the memory is read only from now on, and shareable. Include a reverse index in the data for reconstructing the
         #   original order inside the data structure.
         # - No pointers in the data structure, just offsets (for sharing cross process)!
         # - Start each child processes, and pass them our shared memory structure
         #   (it will be mapped into each process address space, but not copied)
         # - each child calls a train/validation splitter provided by our C that fills a numpy array of bools
         #   We do this in C instead of using the sklearn train_test_split because sklearn would require us to first split sequential indexes,
         #   possibly sort them (if order in not guaranteed), then convert to bools in a caching inefficient way,
         #   whereas in C we can do a single pass without any memory array inputs (using just a random number generator)
         #   and we can make the outputs consistent across languages.
         # - with the RawArray complete data PLUS the train/validation bool list we can generate either interaction datasets OR boosting dataset as needed.
         #   We can reduce our memory footprint, by never having both an interaction AND boosting dataset in memory at the same time.
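A side note on the RawArray idea referenced in the TODO above. The sketch below is my own illustration of two building blocks the comment names, not code from this commit: allocating a shared byte buffer with multiprocessing's RawArray and viewing it through numpy without a copy, plus a single-pass train/validation boolean mask driven only by a seeded random number generator (the comment describes that part as future work to be done in C).

import ctypes as ct

import numpy as np
from multiprocessing import RawArray

# Hypothetical size; in the plan above it would come from a C sizing call.
memory_size = 1024

# Shared, zero-initialized bytes; child processes can map this buffer
# without copying when it is handed to them at process start.
shared_buf = RawArray(ct.c_ubyte, memory_size)

# View the same memory as a numpy array (no copy is made).
data = np.frombuffer(shared_buf, dtype=np.uint8)

# Single-pass train/validation split expressed as a boolean mask, using
# nothing but a seeded RNG -- no index arrays to build, sort, or convert.
n_instances = 200
test_size = 0.15
rng = np.random.default_rng(42)
is_validation = rng.random(n_instances) < test_size  # ~test_size of the rows
is_train = ~is_validation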
@@ -160,9 +162,7 @@ def gen_feature_combinations(feature_indices):
         return feature_combinations
 
     @staticmethod
-    def scores_by_feature_combination(
-        X, feature_combinations, model
-    ):
+    def scores_by_feature_combination(X, feature_combinations, model):
         for set_idx, feature_combination in enumerate(feature_combinations):
             tensor = model[set_idx]
 
@@ -175,9 +175,7 @@ def scores_by_feature_combination(X, feature_combinations, model):
             yield set_idx, feature_combination, scores
 
     @staticmethod
-    def decision_function(
-        X, feature_combinations, model, intercept
-    ):
+    def decision_function(X, feature_combinations, model, intercept):
         if X.ndim == 1:
             X = X.reshape(X.shape[0], 1)
 
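For orientation while reading these hunks: decision_function is the additive core of the model; for each instance it looks up one score per feature combination in that combination's tensor and sums them together with the intercept. The sketch below is a simplified, single-feature-per-combination illustration with made-up numbers, not the module's actual implementation:

import numpy as np

# Made-up binned data: 4 instances x 2 features, values are bin indices.
X_binned = np.array([[0, 2],
                     [1, 0],
                     [2, 1],
                     [0, 0]])

# One additive score tensor per feature combination (here: single features
# with 3 bins each), plus an intercept.
model = [np.array([0.1, -0.2, 0.4]),
         np.array([-0.3, 0.0, 0.25])]
feature_combinations = [[0], [1]]
intercept = 0.05

scores = np.full(X_binned.shape[0], intercept, dtype=float)
for tensor, combination in zip(model, feature_combinations):
    feature = combination[0]                 # single-feature combination
    scores += tensor[X_binned[:, feature]]   # per-instance bin lookup

print(scores)  # log odds for classification, raw predictions for regression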
@@ -205,10 +203,7 @@ def decision_function(X, feature_combinations, model, intercept):
     @staticmethod
     def classifier_predict_proba(X, feature_combinations, model, intercept):
         log_odds_vector = EBMUtils.decision_function(
-            X,
-            feature_combinations,
-            model,
-            intercept
+            X, feature_combinations, model, intercept
         )
 
         # Handle binary classification case -- softmax only works with 0s appended
@@ -220,10 +215,7 @@ def classifier_predict_proba(X, feature_combinations, model, intercept):
     @staticmethod
     def classifier_predict(X, feature_combinations, model, intercept, classes):
         log_odds_vector = EBMUtils.decision_function(
-            X,
-            feature_combinations,
-            model,
-            intercept
+            X, feature_combinations, model, intercept
         )
         if log_odds_vector.ndim == 1:
             log_odds_vector = np.c_[np.zeros(log_odds_vector.shape), log_odds_vector]
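On the "softmax only works with 0s appended" comment above: with two classes the decision function yields a single log-odds column, and a zero column is stacked next to it (as in the np.c_ line in this hunk) so that one softmax/argmax path covers both binary and multiclass output. A small standalone illustration with toy numbers, using plain numpy rather than whichever softmax helper the module calls:

import numpy as np

log_odds_vector = np.array([-1.2, 0.3, 2.0])  # toy binary log odds, shape (n,)

# Add a zero column for the reference class, as the hunk above does.
scores = np.c_[np.zeros(log_odds_vector.shape), log_odds_vector]

# Row-wise, numerically stabilized softmax -> predict_proba-style output.
exp = np.exp(scores - scores.max(axis=1, keepdims=True))
probabilities = exp / exp.sum(axis=1, keepdims=True)

# Argmax picks the class index, which classifier_predict would then map
# onto the fitted classes array.
predicted = probabilities.argmax(axis=1)
print(probabilities)
print(predicted)  # [0 1 1]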
@@ -232,20 +224,19 @@ def classifier_predict(X, feature_combinations, model, intercept, classes):
 
     @staticmethod
     def regressor_predict(X, feature_combinations, model, intercept):
-        scores = EBMUtils.decision_function(
-            X,
-            feature_combinations,
-            model,
-            intercept
-        )
+        scores = EBMUtils.decision_function(X, feature_combinations, model, intercept)
         return scores
 
     @staticmethod
     def gen_feature_name(feature_idxs, col_names):
         feature_name = []
         for feature_index in feature_idxs:
             col_name = col_names[feature_index]
-            feature_name.append("feature_" + str(col_name) if isinstance(col_name, int) else str(col_name))
+            feature_name.append(
+                "feature_" + str(col_name)
+                if isinstance(col_name, int)
+                else str(col_name)
+            )
         feature_name = " x ".join(feature_name)
         return feature_name
 
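The behavior of gen_feature_name is unchanged by this reformat: integer column names get a "feature_" prefix, string names pass through, and the names of a multi-feature combination are joined with " x ". Illustrative calls (values are made up; the import assumes the packaged module path):

from interpret.glassbox.ebm.utils import EBMUtils

# String column names pass through unchanged.
EBMUtils.gen_feature_name([0, 2], ["Age", "Income", "CapitalGain"])
# -> "Age x CapitalGain"

# Integer column names (e.g. when no feature names were supplied) get prefixed.
EBMUtils.gen_feature_name([1, 3], [0, 1, 2, 3])
# -> "feature_1 x feature_3"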
10 changes: 7 additions & 3 deletions python/interpret-core/interpret/glassbox/skoperules.py
@@ -121,9 +121,13 @@ def fit(self, X, y):
         self.pos_ratio_ = np.mean(y)
 
         # Extract rules
-        self.internal_rules_, self.rules_, self.prec_, self.recall_, self.feat_rule_map_ = self._extract_rules(
-            self.sk_model_.rules_
-        )
+        (
+            self.internal_rules_,
+            self.rules_,
+            self.prec_,
+            self.recall_,
+            self.feat_rule_map_,
+        ) = self._extract_rules(self.sk_model_.rules_)
 
         self.global_selector = gen_global_selector(
             X, self.feature_names, self.feature_types, None
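For context on the fit() method reformatted above: this module wraps skope-rules as one of interpret's glassbox models, and fit() extracts the learned rules plus their precision, recall, and feature-to-rule map from the underlying skope-rules model. A hedged usage sketch on toy data, assuming the wrapper in this file is the DecisionListClassifier exported from interpret.glassbox:

import numpy as np
from interpret.glassbox import DecisionListClassifier  # assumed export of this wrapper

# Toy binary classification data.
rng = np.random.default_rng(0)
X = rng.normal(size=(200, 4))
y = (X[:, 0] + 0.5 * X[:, 2] > 0).astype(int)

clf = DecisionListClassifier(feature_names=["f0", "f1", "f2", "f3"])
clf.fit(X, y)  # populates rules_, prec_, recall_, feat_rule_map_, ...
predictions = clf.predict(X)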
@@ -47,6 +47,7 @@ def test_azureml_provider():
 def test_auto_visualize_provider(example_explanation):
     # NOTE: We know this environment is going to use Dash.
     from ...visual.dashboard import AppRunner
+
     ip = "127.0.0.1"
     port = "7200"
     app_runner = AppRunner(addr=(ip, port))
1 change: 1 addition & 0 deletions python/interpret-core/interpret/utils/all.py
@@ -90,6 +90,7 @@ def gen_global_selector(X, feature_names, feature_types, importance_scores, roun
     else: # pragma: no cover
         return df
 
+
 def gen_local_selector(y, y_hat, round=3):
     records = []
 
22 changes: 18 additions & 4 deletions python/interpret-core/interpret/visual/interactive.py
@@ -47,7 +47,10 @@ def get_show_addr():
         Address tuple (ip, port).
     """
     if isinstance(this.visualize_provider, DashProvider):
-        addr = (this.visualize_provider.app_runner.ip, this.visualize_provider.app_runner.port)
+        addr = (
+            this.visualize_provider.app_runner.ip,
+            this.visualize_provider.app_runner.port,
+        )
         return addr
     else:
         return None
@@ -99,11 +102,22 @@ def init_show_server(addr=None, base_url=None, use_relative_links=False):
         log.info("Stopping previous dash provider")
         shutdown_show_server()
 
-    log.info("Replacing visualize provider: {} with {}".format(type(this.visualize_provider), type(DashProvider)))
-    set_visualize_provider(DashProvider.from_address(addr=addr, base_url=base_url, use_relative_links=use_relative_links))
+    log.info(
+        "Replacing visualize provider: {} with {}".format(
+            type(this.visualize_provider), type(DashProvider)
+        )
+    )
+    set_visualize_provider(
+        DashProvider.from_address(
+            addr=addr, base_url=base_url, use_relative_links=use_relative_links
+        )
+    )
     this.visualize_provider.idempotent_start()
 
-    addr = (this.visualize_provider.app_runner.ip, this.visualize_provider.app_runner.port)
+    addr = (
+        this.visualize_provider.app_runner.ip,
+        this.visualize_provider.app_runner.port,
+    )
     log.info("Running dash provider at {}".format(addr))
 
     return None
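Taken together, the two functions touched in this file let a caller pin the Dash dashboard to a known address and query it later. A short usage sketch (address values are examples; imports assume the packaged module path interpret.visual.interactive):

from interpret.visual.interactive import (
    get_show_addr,
    init_show_server,
    shutdown_show_server,
)

# Start (or replace) the Dash-based visualize provider on a fixed address.
init_show_server(addr=("127.0.0.1", 7001))

# Other code can then ask where the dashboard is being served.
print(get_show_addr())  # address tuple (ip, port) once the Dash provider is running

# Tear the server down when finished.
shutdown_show_server()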

0 comments on commit 44fed0c
