From d7d7d1b5e0f2952e63c68a4eb8d650cca74219e2 Mon Sep 17 00:00:00 2001 From: Paul Koch Date: Fri, 27 Dec 2024 00:59:54 -0800 Subject: [PATCH] fix bug in missing="separate" that was computing the histogram totals incorrectly --- .../PartitionOneDimensionalBoosting.cpp | 27 ++++++++++++------- .../libebm/tests/boosting_unusual_inputs.cpp | 22 ++++++++++++++- 2 files changed, 38 insertions(+), 11 deletions(-) diff --git a/shared/libebm/PartitionOneDimensionalBoosting.cpp b/shared/libebm/PartitionOneDimensionalBoosting.cpp index 211eab06d..5e48d1d40 100644 --- a/shared/libebm/PartitionOneDimensionalBoosting.cpp +++ b/shared/libebm/PartitionOneDimensionalBoosting.cpp @@ -36,6 +36,7 @@ namespace DEFINED_ZONE_NAME { template INLINE_RELEASE_TEMPLATED static void SumAllBins(BoosterShell* const pBoosterShell, + const Bin* const aBins, const Bin* const pBinsEnd, const size_t cSamplesTotal, const FloatMain weightTotal, @@ -54,10 +55,6 @@ INLINE_RELEASE_TEMPLATED static void SumAllBins(BoosterShell* const pBoosterShel ZeroGradientPairs(aSumGradHess, cScores); - const auto* const aBins = - pBoosterShell->GetBoostingMainBins() - ->Specialize(); - #ifndef NDEBUG UIntMain cSamplesTotalDebug = 0; FloatMain weightTotalDebug = 0; @@ -242,7 +239,12 @@ static ErrorEbm Flatten(BoosterShell* const pBoosterShell, } EBM_ASSERT(apBins <= ppBinLast); - EBM_ASSERT(ppBinLast < apBins + (cBins - (nullptr != pMissingValueTreeNode ? size_t{1} : size_t{0}))); + EBM_ASSERT(ppBinLast < apBins + + (cBins - + (nullptr != pMissingValueTreeNode || + bMissing && !bNominal && (TermBoostFlags_MissingSeparate & flags) ? + size_t{1} : + size_t{0}))); #ifndef NDEBUG cSamplesTotalDebug += static_cast(pTreeNode->GetBin()->GetCountSamples()); @@ -842,8 +844,8 @@ template class PartitionOneDimensionalBoo const FloatCalc categoricalInclusionPercent, const size_t cSplitsMax, const MonotoneDirection monotoneDirection, - const size_t cSamplesTotal, - const FloatMain weightTotal, + size_t cSamplesTotal, + FloatMain weightTotal, double* const pTotalGain) { EBM_ASSERT(2 <= cBins); // filter these out at the start where we can handle this case easily EBM_ASSERT(1 <= cSplitsMax); // filter these out at the start where we can handle this case easily @@ -886,9 +888,6 @@ template class PartitionOneDimensionalBoo ->Specialize(); auto* pBinsEnd = IndexBin(aBins, cBytesPerBin * cBins); - SumAllBins( - pBoosterShell, pBinsEnd, cSamplesTotal, weightTotal, pRootTreeNode->GetBin()); - const Bin** const apBins = reinterpret_cast**>( pBinsEnd); @@ -901,6 +900,7 @@ template class PartitionOneDimensionalBoo const TreeNode* pMissingValueTreeNode = nullptr; + const auto* aSumBins = aBins; if(bNominal) { if(TermBoostFlags_MissingCategory & flags) { // nothing to do @@ -923,6 +923,10 @@ template class PartitionOneDimensionalBoo pBin = IndexBin(pBin, cBytesPerBin); } } else if(TermBoostFlags_MissingSeparate & flags) { + cSamplesTotal -= aSumBins->GetCountSamples(); + weightTotal -= aSumBins->GetWeight(); + aSumBins = IndexBin(aSumBins, cBytesPerBin); + if(bMissing) { pBin = IndexBin(pBin, cBytesPerBin); } @@ -936,6 +940,9 @@ template class PartitionOneDimensionalBoo } } + SumAllBins( + pBoosterShell, aSumBins, pBinsEnd, cSamplesTotal, weightTotal, pRootTreeNode->GetBin()); + do { *ppBin = pBin; pBin = IndexBin(pBin, cBytesPerBin); diff --git a/shared/libebm/tests/boosting_unusual_inputs.cpp b/shared/libebm/tests/boosting_unusual_inputs.cpp index 93bd0afb6..db2d43574 100644 --- a/shared/libebm/tests/boosting_unusual_inputs.cpp +++ b/shared/libebm/tests/boosting_unusual_inputs.cpp @@ -2061,6 +2061,26 @@ TEST_CASE("lossguide, boosting, regression") { CHECK_APPROX(termScore, 0.40592050000000002); } +TEST_CASE("missing separate continuous, boosting, regression") { + TestBoost test = TestBoost(Task_Regression, + {FeatureTest(4, true, false, false)}, + {{0}}, + { + TestSample({0}, 10.0), + TestSample({1}, 20.0), + TestSample({2}, 30.0), + }, + {TestSample({1}, 20.0)}); + + // boost continuous missing separate + double validationMetric = test.Boost(0, TermBoostFlags_MissingSeparate).validationMetric; + CHECK_APPROX(validationMetric, 392.04000000000002); + + double termScore; + termScore = test.GetCurrentTermScore(0, {0}, 0); + CHECK_APPROX(termScore, 0.10000000000000001); +} + static double RandomizedTesting(const AccelerationFlags acceleration) { const IntEbm cTrainSamples = 211; // have some non-SIMD residuals const IntEbm cValidationSamples = 101; // have some non-SIMD residuals @@ -2173,7 +2193,7 @@ static double RandomizedTesting(const AccelerationFlags acceleration) { } TEST_CASE("stress test, boosting") { - const double expected = 14909739735305.580; + const double expected = 16554721767106.137; double validationMetricExact = RandomizedTesting(AccelerationFlags_NONE); CHECK(validationMetricExact == expected);