From 8581df33972df8f16b989e686f478474a68a0ca0 Mon Sep 17 00:00:00 2001
From: Paul Koch
Date: Thu, 26 Dec 2024 20:57:20 -0800
Subject: [PATCH] change default missing handling to "separate"

---
 .../interpret/glassbox/_ebm/_ebm.py           |  8 +-
 .../PartitionOneDimensionalBoosting.cpp       | 87 +++++++++----------
 2 files changed, 47 insertions(+), 48 deletions(-)

diff --git a/python/interpret-core/interpret/glassbox/_ebm/_ebm.py b/python/interpret-core/interpret/glassbox/_ebm/_ebm.py
index 7888459e8..9d1bca5a1 100644
--- a/python/interpret-core/interpret/glassbox/_ebm/_ebm.py
+++ b/python/interpret-core/interpret/glassbox/_ebm/_ebm.py
@@ -2771,7 +2771,7 @@ class ExplainableBoostingClassifier(ClassifierMixin, EBMModel):
         L2 regularization.
     max_delta_step : float, default=0.0
         Used to limit the max output of tree leaves. <=0.0 means no constraint.
-    missing: str, default="low"
+    missing: str, default="separate"
         Method for handling missing values during boosting. The placement of the
         missing value bin can influence the resulting model graphs. For example,
         placing the bin on the "low" side may cause missing values to
@@ -2944,7 +2944,7 @@ def __init__(
         reg_alpha: Optional[float] = 0.0,
         reg_lambda: Optional[float] = 0.0,
         max_delta_step: Optional[float] = 0.0,
-        missing: str = "low",
+        missing: str = "separate",
         max_leaves: int = 3,
         monotone_constraints: Optional[Sequence[int]] = None,
         objective: str = "log_loss",
@@ -3143,7 +3143,7 @@ class ExplainableBoostingRegressor(RegressorMixin, EBMModel):
         L2 regularization.
     max_delta_step : float, default=0.0
         Used to limit the max output of tree leaves. <=0.0 means no constraint.
-    missing: str, default="low"
+    missing: str, default="separate"
         Method for handling missing values during boosting. The placement of the
         missing value bin can influence the resulting model graphs. For example,
         placing the bin on the "low" side may cause missing values to
@@ -3316,7 +3316,7 @@ def __init__(
         reg_alpha: Optional[float] = 0.0,
         reg_lambda: Optional[float] = 0.0,
         max_delta_step: Optional[float] = 0.0,
-        missing: str = "low",
+        missing: str = "separate",
         max_leaves: int = 2,
         monotone_constraints: Optional[Sequence[int]] = None,
         objective: str = "rmse",
diff --git a/shared/libebm/PartitionOneDimensionalBoosting.cpp b/shared/libebm/PartitionOneDimensionalBoosting.cpp
index 98485e34e..211eab06d 100644
--- a/shared/libebm/PartitionOneDimensionalBoosting.cpp
+++ b/shared/libebm/PartitionOneDimensionalBoosting.cpp
@@ -326,50 +326,7 @@ static ErrorEbm Flatten(BoosterShell* const pBoosterShell,
 
    while(true) {
      if(nullptr == pTreeNode) {
-         done:;
-         EBM_ASSERT(cSamplesTotalDebug == cSamplesExpectedDebug);
-
-         EBM_ASSERT(bNominal || pUpdateScore == aUpdateScore + cScores * cSlices);
-
-         EBM_ASSERT(bNominal || pSplit == cSlices - 1 + pInnerTermUpdate->GetSplitPointer(iDimension));
-
-#ifndef NDEBUG
-         UIntSplit prevDebug = 0;
-         for(size_t iDebug = 0; iDebug < cSlices - 1; ++iDebug) {
-            UIntSplit curDebug = pInnerTermUpdate->GetSplitPointer(iDimension)[iDebug];
-            EBM_ASSERT(prevDebug < curDebug);
-            prevDebug = curDebug;
-         }
-         EBM_ASSERT(prevDebug < cBins);
-#endif
-
-         EBM_ASSERT(nullptr == pMissingValueTreeNode || nullptr != pMissingBin);
-         if(nullptr != pMissingBin) {
-            EBM_ASSERT(bMissing);
-
-            FloatScore hess = static_cast<FloatScore>(pMissingBin->GetWeight());
-            const auto* pGradientPair = pMissingBin->GetGradientPairs();
-            const auto* const pGradientPairEnd = pGradientPair + cScores;
-            FloatScore* pMissingUpdateScore = aUpdateScore;
-            do {
-               if(bUpdateWithHessian) {
-                  hess = static_cast<FloatScore>(pGradientPair->GetHess());
-               }
-               FloatCalc updateScore = -CalcNegUpdate(static_cast<FloatCalc>(pGradientPair->m_sumGradients),
-                     hess,
-                     regAlpha,
-                     regLambda,
-                     deltaStepMax);
-
-               *pMissingUpdateScore = updateScore;
-               ++pMissingUpdateScore;
-
-               ++pGradientPair;
-            } while(pGradientPairEnd != pGradientPair);
-         }
-
-         LOG_0(Trace_Verbose, "Exited Flatten");
-         return Error_None;
+         goto done;
       }
       if(!pTreeNode->DECONSTRUCT_IsRightChildTraversal()) {
          // we checked earlier that countBins could be converted to a UIntSplit
@@ -411,6 +368,48 @@ static ErrorEbm Flatten(BoosterShell* const pBoosterShell,
          }
       }
    }
+
+done:;
+   EBM_ASSERT(cSamplesTotalDebug == cSamplesExpectedDebug);
+
+   EBM_ASSERT(bNominal || pUpdateScore == aUpdateScore + cScores * cSlices);
+
+   EBM_ASSERT(bNominal || pSplit == cSlices - 1 + pInnerTermUpdate->GetSplitPointer(iDimension));
+
+#ifndef NDEBUG
+   UIntSplit prevDebug = 0;
+   for(size_t iDebug = 0; iDebug < cSlices - 1; ++iDebug) {
+      UIntSplit curDebug = pInnerTermUpdate->GetSplitPointer(iDimension)[iDebug];
+      EBM_ASSERT(prevDebug < curDebug);
+      prevDebug = curDebug;
+   }
+   EBM_ASSERT(prevDebug < cBins);
+#endif
+
+   EBM_ASSERT(nullptr == pMissingValueTreeNode || nullptr != pMissingBin);
+   if(nullptr != pMissingBin) {
+      EBM_ASSERT(bMissing);
+
+      FloatScore hess = static_cast<FloatScore>(pMissingBin->GetWeight());
+      const auto* pGradientPair = pMissingBin->GetGradientPairs();
+      const auto* const pGradientPairEnd = pGradientPair + cScores;
+      FloatScore* pMissingUpdateScore = aUpdateScore;
+      do {
+         if(bUpdateWithHessian) {
+            hess = static_cast<FloatScore>(pGradientPair->GetHess());
+         }
+         FloatCalc updateScore = -CalcNegUpdate(
+               static_cast<FloatCalc>(pGradientPair->m_sumGradients), hess, regAlpha, regLambda, deltaStepMax);
+
+         *pMissingUpdateScore = updateScore;
+         ++pMissingUpdateScore;
+
+         ++pGradientPair;
+      } while(pGradientPairEnd != pGradientPair);
+   }
+
+   LOG_0(Trace_Verbose, "Exited Flatten");
+   return Error_None;
 }
 
 WARNING_POP
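
Usage note (illustrative sketch, not part of the diff): this patch only flips
the default of the "missing" parameter, so callers that never pass "missing"
now get the "separate" behavior, where the missing-value bin receives its own
update rather than being placed on the "low" side of the graph. A minimal
sketch of the new default against the public Python API; the synthetic data
below is made up for illustration:

    import numpy as np
    from interpret.glassbox import ExplainableBoostingClassifier

    rng = np.random.default_rng(0)
    X = rng.normal(size=(200, 2))
    y = (X[:, 0] + X[:, 1] > 0.0).astype(int)
    X[rng.random(200) < 0.1, 0] = np.nan  # inject ~10% missing values into feature 0

    # After this patch, omitting `missing` is equivalent to missing="separate".
    # Pass missing="low" explicitly to reproduce the previous default.
    ebm = ExplainableBoostingClassifier(missing="separate")
    ebm.fit(X, y)

Callers that need graphs identical to models trained before this patch should
pin missing="low" explicitly.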