From 8581df33972df8f16b989e686f478474a68a0ca0 Mon Sep 17 00:00:00 2001
From: Paul Koch
Date: Thu, 26 Dec 2024 20:57:20 -0800
Subject: [PATCH] change default missing handling to "separate"

---
 .../interpret/glassbox/_ebm/_ebm.py           |  8 +-
 .../PartitionOneDimensionalBoosting.cpp       | 87 +++++++++----------
 2 files changed, 47 insertions(+), 48 deletions(-)

diff --git a/python/interpret-core/interpret/glassbox/_ebm/_ebm.py b/python/interpret-core/interpret/glassbox/_ebm/_ebm.py
index 7888459e8..9d1bca5a1 100644
--- a/python/interpret-core/interpret/glassbox/_ebm/_ebm.py
+++ b/python/interpret-core/interpret/glassbox/_ebm/_ebm.py
@@ -2771,7 +2771,7 @@ class ExplainableBoostingClassifier(ClassifierMixin, EBMModel):
         L2 regularization.
     max_delta_step : float, default=0.0
         Used to limit the max output of tree leaves. <=0.0 means no constraint.
-    missing: str, default="low"
+    missing: str, default="separate"
         Method for handling missing values during boosting. The placement of the
         missing value bin can influence the resulting model graphs. For example,
         placing the bin on the "low" side may cause missing values to
@@ -2944,7 +2944,7 @@ def __init__(
         reg_alpha: Optional[float] = 0.0,
         reg_lambda: Optional[float] = 0.0,
         max_delta_step: Optional[float] = 0.0,
-        missing: str = "low",
+        missing: str = "separate",
         max_leaves: int = 3,
         monotone_constraints: Optional[Sequence[int]] = None,
         objective: str = "log_loss",
@@ -3143,7 +3143,7 @@ class ExplainableBoostingRegressor(RegressorMixin, EBMModel):
         L2 regularization.
     max_delta_step : float, default=0.0
         Used to limit the max output of tree leaves. <=0.0 means no constraint.
-    missing: str, default="low"
+    missing: str, default="separate"
         Method for handling missing values during boosting. The placement of the
         missing value bin can influence the resulting model graphs. For example,
         placing the bin on the "low" side may cause missing values to
@@ -3316,7 +3316,7 @@ def __init__(
         reg_alpha: Optional[float] = 0.0,
         reg_lambda: Optional[float] = 0.0,
         max_delta_step: Optional[float] = 0.0,
-        missing: str = "low",
+        missing: str = "separate",
         max_leaves: int = 2,
         monotone_constraints: Optional[Sequence[int]] = None,
         objective: str = "rmse",
diff --git a/shared/libebm/PartitionOneDimensionalBoosting.cpp b/shared/libebm/PartitionOneDimensionalBoosting.cpp
index 98485e34e..211eab06d 100644
--- a/shared/libebm/PartitionOneDimensionalBoosting.cpp
+++ b/shared/libebm/PartitionOneDimensionalBoosting.cpp
@@ -326,50 +326,7 @@ static ErrorEbm Flatten(BoosterShell* const pBoosterShell,
 
    while(true) {
      if(nullptr == pTreeNode) {
-         done:;
-         EBM_ASSERT(cSamplesTotalDebug == cSamplesExpectedDebug);
-
-         EBM_ASSERT(bNominal || pUpdateScore == aUpdateScore + cScores * cSlices);
-
-         EBM_ASSERT(bNominal || pSplit == cSlices - 1 + pInnerTermUpdate->GetSplitPointer(iDimension));
-
-#ifndef NDEBUG
-         UIntSplit prevDebug = 0;
-         for(size_t iDebug = 0; iDebug < cSlices - 1; ++iDebug) {
-            UIntSplit curDebug = pInnerTermUpdate->GetSplitPointer(iDimension)[iDebug];
-            EBM_ASSERT(prevDebug < curDebug);
-            prevDebug = curDebug;
-         }
-         EBM_ASSERT(prevDebug < cBins);
-#endif
-
-         EBM_ASSERT(nullptr == pMissingValueTreeNode || nullptr != pMissingBin);
-         if(nullptr != pMissingBin) {
-            EBM_ASSERT(bMissing);
-
-            FloatScore hess = static_cast<FloatScore>(pMissingBin->GetWeight());
-            const auto* pGradientPair = pMissingBin->GetGradientPairs();
-            const auto* const pGradientPairEnd = pGradientPair + cScores;
-            FloatScore* pMissingUpdateScore = aUpdateScore;
-            do {
-               if(bUpdateWithHessian) {
-                  hess = static_cast<FloatScore>(pGradientPair->GetHess());
-               }
-               FloatCalc updateScore = -CalcNegUpdate(static_cast<FloatCalc>(pGradientPair->m_sumGradients),
-                     hess,
-                     regAlpha,
-                     regLambda,
-                     deltaStepMax);
-
-               *pMissingUpdateScore = updateScore;
-               ++pMissingUpdateScore;
-
-               ++pGradientPair;
-            } while(pGradientPairEnd != pGradientPair);
-         }
-
-         LOG_0(Trace_Verbose, "Exited Flatten");
-         return Error_None;
+         goto done;
       }
       if(!pTreeNode->DECONSTRUCT_IsRightChildTraversal()) {
          // we checked earlier that countBins could be converted to a UIntSplit
@@ -411,6 +368,48 @@ static ErrorEbm Flatten(BoosterShell* const pBoosterShell,
          }
       }
    }
+
+done:;
+   EBM_ASSERT(cSamplesTotalDebug == cSamplesExpectedDebug);
+
+   EBM_ASSERT(bNominal || pUpdateScore == aUpdateScore + cScores * cSlices);
+
+   EBM_ASSERT(bNominal || pSplit == cSlices - 1 + pInnerTermUpdate->GetSplitPointer(iDimension));
+
+#ifndef NDEBUG
+   UIntSplit prevDebug = 0;
+   for(size_t iDebug = 0; iDebug < cSlices - 1; ++iDebug) {
+      UIntSplit curDebug = pInnerTermUpdate->GetSplitPointer(iDimension)[iDebug];
+      EBM_ASSERT(prevDebug < curDebug);
+      prevDebug = curDebug;
+   }
+   EBM_ASSERT(prevDebug < cBins);
+#endif
+
+   EBM_ASSERT(nullptr == pMissingValueTreeNode || nullptr != pMissingBin);
+   if(nullptr != pMissingBin) {
+      EBM_ASSERT(bMissing);
+
+      FloatScore hess = static_cast<FloatScore>(pMissingBin->GetWeight());
+      const auto* pGradientPair = pMissingBin->GetGradientPairs();
+      const auto* const pGradientPairEnd = pGradientPair + cScores;
+      FloatScore* pMissingUpdateScore = aUpdateScore;
+      do {
+         if(bUpdateWithHessian) {
+            hess = static_cast<FloatScore>(pGradientPair->GetHess());
+         }
+         FloatCalc updateScore = -CalcNegUpdate(
+               static_cast<FloatCalc>(pGradientPair->m_sumGradients), hess, regAlpha, regLambda, deltaStepMax);
+
+         *pMissingUpdateScore = updateScore;
+         ++pMissingUpdateScore;
+
+         ++pGradientPair;
+      } while(pGradientPairEnd != pGradientPair);
+   }
+
+   LOG_0(Trace_Verbose, "Exited Flatten");
+   return Error_None;
 }
 
 WARNING_POP
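
Usage note (illustrative sketch, not part of the diff): this patch only flips
the default of the "missing" parameter, so callers that never pass "missing"
now get the "separate" behavior, where the missing-value bin receives its own
update rather than being placed on the "low" side of the graph. A minimal
sketch of the new default against the public Python API; the synthetic data
below is made up for illustration:

    import numpy as np
    from interpret.glassbox import ExplainableBoostingClassifier

    rng = np.random.default_rng(0)
    X = rng.normal(size=(200, 2))
    y = (X[:, 0] + X[:, 1] > 0.0).astype(int)
    X[rng.random(200) < 0.1, 0] = np.nan  # inject ~10% missing values into feature 0

    # After this patch, omitting `missing` is equivalent to missing="separate".
    # Pass missing="low" explicitly to reproduce the previous default.
    ebm = ExplainableBoostingClassifier(missing="separate")
    ebm.fit(X, y)

Callers that need graphs identical to models trained before this patch should
pin missing="low" explicitly.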