From 2a3376a099d888df27e7e181d2098128851b7942 Mon Sep 17 00:00:00 2001 From: Paul Koch Date: Tue, 24 Dec 2024 16:35:34 -0800 Subject: [PATCH] allow caller to place the missing bin at the high end of the feature values --- .../PartitionOneDimensionalBoosting.cpp | 72 ++++++++++++++----- .../libebm/tests/boosting_unusual_inputs.cpp | 2 +- 2 files changed, 54 insertions(+), 20 deletions(-) diff --git a/shared/libebm/PartitionOneDimensionalBoosting.cpp b/shared/libebm/PartitionOneDimensionalBoosting.cpp index 5f815b354..790fc0f36 100644 --- a/shared/libebm/PartitionOneDimensionalBoosting.cpp +++ b/shared/libebm/PartitionOneDimensionalBoosting.cpp @@ -117,12 +117,8 @@ static ErrorEbm Flatten(BoosterShell* const pBoosterShell, const size_t iDimension, const Bin* const* const apBins, const TreeNode* pMissingValueTreeNode, - const size_t cSlices -#ifndef NDEBUG - , - const size_t cBins -#endif // NDEBUG -) { + const size_t cSlices, + const size_t cBins) { LOG_0(Trace_Verbose, "Entered Flatten"); EBM_ASSERT(nullptr != pBoosterShell); @@ -178,6 +174,8 @@ static ErrorEbm Flatten(BoosterShell* const pBoosterShell, pUpdateScore = aUpdateScore; if(bMissing) { + EBM_ASSERT(2 <= cSlices); // no cuts if there was only missing bin + // always put a split on the missing bin *pSplit = 1; ++pSplit; @@ -199,6 +197,7 @@ static ErrorEbm Flatten(BoosterShell* const pBoosterShell, const bool bUpdateWithHessian = bHessian && !(TermBoostFlags_DisableNewtonUpdate & flags); TreeNode* pParent = nullptr; + bool bDone = false; while(true) { if(UNPREDICTABLE(pTreeNode->AFTER_IsSplit())) { @@ -253,11 +252,6 @@ static ErrorEbm Flatten(BoosterShell* const pBoosterShell, } EBM_ASSERT(!bNominal); - // if !bNominal, check the bin above and below for order - EBM_ASSERT(apBins == ppBinLast || *(ppBinLast - 1) < *ppBinLast); - EBM_ASSERT(ppBinLast == apBins + (cBins - (nullptr != pMissingValueTreeNode ? size_t{2} : size_t{1})) || - *ppBinLast < *(ppBinLast + 1)); - iEdge = ppBinLast - apBins + 1 + (nullptr != pMissingValueTreeNode ? 1 : 0); while(true) { // not a real loop @@ -267,8 +261,17 @@ static ErrorEbm Flatten(BoosterShell* const pBoosterShell, pMissingBin = pTreeNode->GetBin(); } if(1 == iEdge) { + // this cut would isolate the missing bin, but we handle those scores separately break; } + } else if(TermBoostFlags_MissingHigh & flags) { + ++iEdge; // missing is at index 0 in the model, so we are offset by one + pMissingBin = pTreeNode->GetBin(); + EBM_ASSERT(iEdge <= cBins + 1); + if(bDone) { + // this cut would isolate the missing bin, but we handle those scores separately + goto done; + } } } @@ -316,10 +319,13 @@ static ErrorEbm Flatten(BoosterShell* const pBoosterShell, while(true) { if(nullptr == pTreeNode) { + done:; EBM_ASSERT(cSamplesTotalDebug == cSamplesExpectedDebug); EBM_ASSERT(nullptr == pMissingValueTreeNode || nullptr != pMissingBin); if(nullptr != pMissingBin) { + EBM_ASSERT(bMissing); + FloatScore hess = static_cast(pMissingBin->GetWeight()); const auto* pGradientPair = pMissingBin->GetGradientPairs(); const auto* const pGradientPairEnd = pGradientPair + cScores; @@ -341,6 +347,18 @@ static ErrorEbm Flatten(BoosterShell* const pBoosterShell, } while(pGradientPairEnd != pGradientPair); } + EBM_ASSERT(bNominal || pSplit == cSlices - 1 + pInnerTermUpdate->GetSplitPointer(iDimension)); + +#ifndef NDEBUG + UIntSplit prevDebug = 0; + for(size_t iDebug = 0; iDebug < cSlices - 1; ++iDebug) { + UIntSplit curDebug = pInnerTermUpdate->GetSplitPointer(iDimension)[iDebug]; + EBM_ASSERT(prevDebug < curDebug); + prevDebug = curDebug; + } + EBM_ASSERT(prevDebug < cBins); +#endif + LOG_0(Trace_Verbose, "Exited Flatten"); return Error_None; } @@ -353,12 +371,21 @@ static ErrorEbm Flatten(BoosterShell* const pBoosterShell, if(bMissing) { if(TermBoostFlags_MissingLow & flags) { if(1 == iEdge) { + // this cut would isolate the missing bin, but missing already has a cut + break; + } + } else if(TermBoostFlags_MissingHigh & flags) { + EBM_ASSERT(iEdge <= cBins); + if(cBins == iEdge) { + // This cut would isolate the missing bin, but missing already has a cut. + // We still need to find the missing bin though in the tree, so continue. break; } } } EBM_ASSERT(!IsConvertError(iEdge)); + EBM_ASSERT(pSplit < cSlices - 1 + pInnerTermUpdate->GetSplitPointer(iDimension)); *pSplit = static_cast(iEdge); ++pSplit; @@ -869,9 +896,13 @@ template class PartitionOneDimensionalBoo if(!bNominal) { pMissingBin = pBin; } - *ppBin = pBin; + } + } else if(TermBoostFlags_MissingHigh & flags) { + if(bMissing) { + if(!bNominal) { + pMissingBin = pBin; + } pBin = IndexBin(pBin, cBytesPerBin); - ++ppBin; } } else { if(bMissing) { @@ -888,6 +919,13 @@ template class PartitionOneDimensionalBoo ++ppBin; } while(pBinsEnd != pBin); + if(TermBoostFlags_MissingHigh & flags) { + if(bMissing) { + *ppBin = aBins; + ++ppBin; + } + } + if(bNominal) { std::sort(apBins, ppBin, @@ -1072,12 +1110,8 @@ template class PartitionOneDimensionalBoo iDimension, reinterpret_cast* const*>(apBins), nullptr != pMissingValueTreeNode ? pMissingValueTreeNode->Downgrade() : nullptr, - cSlices -#ifndef NDEBUG - , - cBins -#endif // NDEBUG - ); + cSlices, + cBins); EBM_ASSERT(!bMissing || 2 <= pBoosterShell->GetInnerTermUpdate()->GetCountSlices(iDimension)); EBM_ASSERT(!bMissing || *pBoosterShell->GetInnerTermUpdate()->GetSplitPointer(iDimension) == 1); diff --git a/shared/libebm/tests/boosting_unusual_inputs.cpp b/shared/libebm/tests/boosting_unusual_inputs.cpp index 8aa683e96..1026487ff 100644 --- a/shared/libebm/tests/boosting_unusual_inputs.cpp +++ b/shared/libebm/tests/boosting_unusual_inputs.cpp @@ -2175,7 +2175,7 @@ static double RandomizedTesting(const AccelerationFlags acceleration) { } TEST_CASE("stress test, boosting") { - const double expected = 26758407585917.129; + const double expected = 26746562197367.172; double validationMetricExact = RandomizedTesting(AccelerationFlags_NONE); CHECK(validationMetricExact == expected);