From a12b61a12d48bf3760278f272bec30ac68548a5c Mon Sep 17 00:00:00 2001 From: Paul Koch Date: Thu, 26 Dec 2024 00:25:12 -0800 Subject: [PATCH] separate flags for nominal and continuous missing behavior --- .../interpret/glassbox/_ebm/_boost.py | 17 +++++--- .../interpret/glassbox/_ebm/_ebm.py | 4 -- .../interpret-core/interpret/utils/_native.py | 2 +- shared/libebm/GenerateTermUpdate.cpp | 11 ++--- .../PartitionOneDimensionalBoosting.cpp | 41 ++++++++++++------- shared/libebm/inc/libebm.h | 2 +- .../libebm/tests/boosting_unusual_inputs.cpp | 12 +++--- 7 files changed, 49 insertions(+), 40 deletions(-) diff --git a/python/interpret-core/interpret/glassbox/_ebm/_boost.py b/python/interpret-core/interpret/glassbox/_ebm/_boost.py index 72c212ca3..49f9ef596 100644 --- a/python/interpret-core/interpret/glassbox/_ebm/_boost.py +++ b/python/interpret-core/interpret/glassbox/_ebm/_boost.py @@ -152,13 +152,20 @@ def boost( ) if missing == "low": - term_boost_flags_local |= Native.TermBoostFlags_MissingLow + term_boost_flags_local |= ( + Native.TermBoostFlags_MissingLow + | Native.TermBoostFlags_MissingCategory + ) elif missing == "high": - term_boost_flags_local |= Native.TermBoostFlags_MissingHigh + term_boost_flags_local |= ( + Native.TermBoostFlags_MissingHigh + | Native.TermBoostFlags_MissingCategory + ) elif missing == "separate": - term_boost_flags_local |= Native.TermBoostFlags_MissingSeparate - elif missing == "drop": - term_boost_flags_local |= Native.TermBoostFlags_MissingDrop + term_boost_flags_local |= ( + Native.TermBoostFlags_MissingSeparate + | Native.TermBoostFlags_MissingCategory + ) elif missing != "gain": msg = f"Unrecognized missing option {missing}." raise Exception(msg) diff --git a/python/interpret-core/interpret/glassbox/_ebm/_ebm.py b/python/interpret-core/interpret/glassbox/_ebm/_ebm.py index 509ca4d24..7888459e8 100644 --- a/python/interpret-core/interpret/glassbox/_ebm/_ebm.py +++ b/python/interpret-core/interpret/glassbox/_ebm/_ebm.py @@ -2784,8 +2784,6 @@ class ExplainableBoostingClassifier(ClassifierMixin, EBMModel): - `'separate'`: Place the missing bin in its own leaf during each boosting step, effectively making it location-agnostic. This can lead to overfitting, especially when the proportion of missing values is small. - - `'drop'`: Ignore the contribution of the missing bin, or split the feature into two leaves based on gain: - one for missing values and one for non-missing values. - `'gain'`: Choose the best leaf for the missing value contribution at each boosting step, based on gain. max_leaves : int, default=3 Maximum number of leaves allowed in each tree. @@ -3158,8 +3156,6 @@ class ExplainableBoostingRegressor(RegressorMixin, EBMModel): - `'separate'`: Place the missing bin in its own leaf during each boosting step, effectively making it location-agnostic. This can lead to overfitting, especially when the proportion of missing values is small. - - `'drop'`: Ignore the contribution of the missing bin, or split the feature into two leaves based on gain: - one for missing values and one for non-missing values. - `'gain'`: Choose the best leaf for the missing value contribution at each boosting step, based on gain. max_leaves : int, default=2 Maximum number of leaves allowed in each tree. diff --git a/python/interpret-core/interpret/utils/_native.py b/python/interpret-core/interpret/utils/_native.py index d343dcf34..56cb08f1c 100644 --- a/python/interpret-core/interpret/utils/_native.py +++ b/python/interpret-core/interpret/utils/_native.py @@ -40,7 +40,7 @@ class Native: TermBoostFlags_MissingLow = 0x00000080 TermBoostFlags_MissingHigh = 0x00000100 TermBoostFlags_MissingSeparate = 0x00000200 - TermBoostFlags_MissingDrop = 0x00000400 + TermBoostFlags_MissingCategory = 0x00000400 # CreateInteractionFlags CreateInteractionFlags_Default = 0x00000000 diff --git a/shared/libebm/GenerateTermUpdate.cpp b/shared/libebm/GenerateTermUpdate.cpp index 8f32b4108..3dd8e4513 100644 --- a/shared/libebm/GenerateTermUpdate.cpp +++ b/shared/libebm/GenerateTermUpdate.cpp @@ -749,22 +749,17 @@ EBM_API_BODY ErrorEbm EBM_CALLING_CONVENTION GenerateTermUpdate(void* rng, ~(TermBoostFlags_PurifyGain | TermBoostFlags_DisableNewtonGain | TermBoostFlags_DisableCategorical | TermBoostFlags_PurifyUpdate | TermBoostFlags_DisableNewtonUpdate | TermBoostFlags_GradientSums | TermBoostFlags_RandomSplits | TermBoostFlags_MissingLow | TermBoostFlags_MissingHigh | - TermBoostFlags_MissingSeparate | TermBoostFlags_MissingDrop)) { + TermBoostFlags_MissingSeparate | TermBoostFlags_MissingCategory)) { LOG_0(Trace_Error, "ERROR GenerateTermUpdate flags contains unknown flags. Ignoring extras."); } if(TermBoostFlags_MissingLow & flags) { - if((TermBoostFlags_MissingHigh | TermBoostFlags_MissingSeparate | TermBoostFlags_MissingDrop) & flags) { + if((TermBoostFlags_MissingHigh | TermBoostFlags_MissingSeparate) & flags) { LOG_0(Trace_Error, "ERROR GenerateTermUpdate flags contains multiple Missing value flags."); return Error_IllegalParamVal; } } else if(TermBoostFlags_MissingHigh & flags) { - if((TermBoostFlags_MissingSeparate | TermBoostFlags_MissingDrop) & flags) { - LOG_0(Trace_Error, "ERROR GenerateTermUpdate flags contains multiple Missing value flags."); - return Error_IllegalParamVal; - } - } else if(TermBoostFlags_MissingSeparate & flags) { - if(TermBoostFlags_MissingDrop & flags) { + if(TermBoostFlags_MissingSeparate & flags) { LOG_0(Trace_Error, "ERROR GenerateTermUpdate flags contains multiple Missing value flags."); return Error_IllegalParamVal; } diff --git a/shared/libebm/PartitionOneDimensionalBoosting.cpp b/shared/libebm/PartitionOneDimensionalBoosting.cpp index fdda51dcd..9fa9f7a45 100644 --- a/shared/libebm/PartitionOneDimensionalBoosting.cpp +++ b/shared/libebm/PartitionOneDimensionalBoosting.cpp @@ -895,22 +895,35 @@ template class PartitionOneDimensionalBoo bool bMissingIsolated = false; const TreeNode* pMissingValueTreeNode = nullptr; - if(TermBoostFlags_MissingLow & flags) { - if(bMissing && !bNominal) { - pMissingBin = pBin; - } - } else if(TermBoostFlags_MissingHigh & flags) { - if(bMissing && !bNominal) { - pMissingBin = pBin; - // the concept of TermBoostFlags_MissingHigh does not exist for nominals - pBin = IndexBin(pBin, cBytesPerBin); + + if(bNominal) { + if(TermBoostFlags_MissingCategory & flags) { + // nothing to do + } else { + if(bMissing) { + pMissingValueTreeNode = pRootTreeNode; + // Skip the missing bin in the pointer to pointer mapping since it will not be part of the continuous + // region. + pBin = IndexBin(pBin, cBytesPerBin); + } } } else { - if(bMissing) { - pMissingValueTreeNode = pRootTreeNode; - // Skip the missing bin in the pointer to pointer mapping since it will not be part of the continuous - // region. - pBin = IndexBin(pBin, cBytesPerBin); + if(TermBoostFlags_MissingLow & flags) { + if(bMissing) { + pMissingBin = pBin; + } + } else if(TermBoostFlags_MissingHigh & flags) { + if(bMissing) { + pMissingBin = pBin; + pBin = IndexBin(pBin, cBytesPerBin); + } + } else { + if(bMissing) { + pMissingValueTreeNode = pRootTreeNode; + // Skip the missing bin in the pointer to pointer mapping since it will not be part of the continuous + // region. + pBin = IndexBin(pBin, cBytesPerBin); + } } } diff --git a/shared/libebm/inc/libebm.h b/shared/libebm/inc/libebm.h index b25b1ae6d..dc5514763 100644 --- a/shared/libebm/inc/libebm.h +++ b/shared/libebm/inc/libebm.h @@ -228,7 +228,7 @@ typedef struct _InteractionHandle { #define TermBoostFlags_MissingLow (TERM_BOOST_FLAGS_CAST(0x00000080)) #define TermBoostFlags_MissingHigh (TERM_BOOST_FLAGS_CAST(0x00000100)) #define TermBoostFlags_MissingSeparate (TERM_BOOST_FLAGS_CAST(0x00000200)) -#define TermBoostFlags_MissingDrop (TERM_BOOST_FLAGS_CAST(0x00000400)) +#define TermBoostFlags_MissingCategory (TERM_BOOST_FLAGS_CAST(0x00000400)) #define CreateInteractionFlags_Default (CREATE_INTERACTION_FLAGS_CAST(0x00000000)) #define CreateInteractionFlags_DifferentialPrivacy (CREATE_INTERACTION_FLAGS_CAST(0x00000001)) diff --git a/shared/libebm/tests/boosting_unusual_inputs.cpp b/shared/libebm/tests/boosting_unusual_inputs.cpp index 1026487ff..a63c8877c 100644 --- a/shared/libebm/tests/boosting_unusual_inputs.cpp +++ b/shared/libebm/tests/boosting_unusual_inputs.cpp @@ -2096,12 +2096,10 @@ static double RandomizedTesting(const AccelerationFlags acceleration) { TermBoostFlags_PurifyUpdate, // TermBoostFlags_GradientSums, // does not return a metric TermBoostFlags_DisableNewtonUpdate, - TermBoostFlags_RandomSplits}; - std::vector boostFlagsChoose{TermBoostFlags_Default, - TermBoostFlags_MissingLow, - TermBoostFlags_MissingHigh, - TermBoostFlags_MissingSeparate, - TermBoostFlags_MissingDrop}; + TermBoostFlags_RandomSplits, + TermBoostFlags_MissingCategory}; + std::vector boostFlagsChoose{ + TermBoostFlags_Default, TermBoostFlags_MissingLow, TermBoostFlags_MissingHigh, TermBoostFlags_MissingSeparate}; double validationMetric = 1.0; for(IntEbm classesCount = Task_Regression; classesCount < 5; ++classesCount) { @@ -2175,7 +2173,7 @@ static double RandomizedTesting(const AccelerationFlags acceleration) { } TEST_CASE("stress test, boosting") { - const double expected = 26746562197367.172; + const double expected = 14939439873840.908; double validationMetricExact = RandomizedTesting(AccelerationFlags_NONE); CHECK(validationMetricExact == expected);