Skip to content

Commit

Permalink
fix bug in missing="separate" that was computing the histogram totals…
Browse files Browse the repository at this point in the history
… incorrectly
  • Loading branch information
paulbkoch committed Dec 27, 2024
1 parent 8581df3 commit d7d7d1b
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 11 deletions.
27 changes: 17 additions & 10 deletions shared/libebm/PartitionOneDimensionalBoosting.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ namespace DEFINED_ZONE_NAME {

template<bool bHessian, size_t cCompilerScores>
INLINE_RELEASE_TEMPLATED static void SumAllBins(BoosterShell* const pBoosterShell,
const Bin<FloatMain, UIntMain, true, true, bHessian, GetArrayScores(cCompilerScores)>* const aBins,
const Bin<FloatMain, UIntMain, true, true, bHessian, GetArrayScores(cCompilerScores)>* const pBinsEnd,
const size_t cSamplesTotal,
const FloatMain weightTotal,
Expand All @@ -54,10 +55,6 @@ INLINE_RELEASE_TEMPLATED static void SumAllBins(BoosterShell* const pBoosterShel

ZeroGradientPairs(aSumGradHess, cScores);

const auto* const aBins =
pBoosterShell->GetBoostingMainBins()
->Specialize<FloatMain, UIntMain, true, true, bHessian, GetArrayScores(cCompilerScores)>();

#ifndef NDEBUG
UIntMain cSamplesTotalDebug = 0;
FloatMain weightTotalDebug = 0;
Expand Down Expand Up @@ -242,7 +239,12 @@ static ErrorEbm Flatten(BoosterShell* const pBoosterShell,
}

EBM_ASSERT(apBins <= ppBinLast);
EBM_ASSERT(ppBinLast < apBins + (cBins - (nullptr != pMissingValueTreeNode ? size_t{1} : size_t{0})));
EBM_ASSERT(ppBinLast < apBins +
(cBins -
(nullptr != pMissingValueTreeNode ||
bMissing && !bNominal && (TermBoostFlags_MissingSeparate & flags) ?
size_t{1} :
size_t{0})));

#ifndef NDEBUG
cSamplesTotalDebug += static_cast<size_t>(pTreeNode->GetBin()->GetCountSamples());
Expand Down Expand Up @@ -842,8 +844,8 @@ template<bool bHessian, size_t cCompilerScores> class PartitionOneDimensionalBoo
const FloatCalc categoricalInclusionPercent,
const size_t cSplitsMax,
const MonotoneDirection monotoneDirection,
const size_t cSamplesTotal,
const FloatMain weightTotal,
size_t cSamplesTotal,
FloatMain weightTotal,
double* const pTotalGain) {
EBM_ASSERT(2 <= cBins); // filter these out at the start where we can handle this case easily
EBM_ASSERT(1 <= cSplitsMax); // filter these out at the start where we can handle this case easily
Expand Down Expand Up @@ -886,9 +888,6 @@ template<bool bHessian, size_t cCompilerScores> class PartitionOneDimensionalBoo
->Specialize<FloatMain, UIntMain, true, true, bHessian, GetArrayScores(cCompilerScores)>();
auto* pBinsEnd = IndexBin(aBins, cBytesPerBin * cBins);

SumAllBins<bHessian, cCompilerScores>(
pBoosterShell, pBinsEnd, cSamplesTotal, weightTotal, pRootTreeNode->GetBin());

const Bin<FloatMain, UIntMain, true, true, bHessian, GetArrayScores(cCompilerScores)>** const apBins =
reinterpret_cast<const Bin<FloatMain, UIntMain, true, true, bHessian, GetArrayScores(cCompilerScores)>**>(
pBinsEnd);
Expand All @@ -901,6 +900,7 @@ template<bool bHessian, size_t cCompilerScores> class PartitionOneDimensionalBoo

const TreeNode<bHessian, GetArrayScores(cCompilerScores)>* pMissingValueTreeNode = nullptr;

const auto* aSumBins = aBins;
if(bNominal) {
if(TermBoostFlags_MissingCategory & flags) {
// nothing to do
Expand All @@ -923,6 +923,10 @@ template<bool bHessian, size_t cCompilerScores> class PartitionOneDimensionalBoo
pBin = IndexBin(pBin, cBytesPerBin);
}
} else if(TermBoostFlags_MissingSeparate & flags) {
cSamplesTotal -= aSumBins->GetCountSamples();
weightTotal -= aSumBins->GetWeight();
aSumBins = IndexBin(aSumBins, cBytesPerBin);

if(bMissing) {
pBin = IndexBin(pBin, cBytesPerBin);
}
Expand All @@ -936,6 +940,9 @@ template<bool bHessian, size_t cCompilerScores> class PartitionOneDimensionalBoo
}
}

SumAllBins<bHessian, cCompilerScores>(
pBoosterShell, aSumBins, pBinsEnd, cSamplesTotal, weightTotal, pRootTreeNode->GetBin());

do {
*ppBin = pBin;
pBin = IndexBin(pBin, cBytesPerBin);
Expand Down
22 changes: 21 additions & 1 deletion shared/libebm/tests/boosting_unusual_inputs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2061,6 +2061,26 @@ TEST_CASE("lossguide, boosting, regression") {
CHECK_APPROX(termScore, 0.40592050000000002);
}

TEST_CASE("missing separate continuous, boosting, regression") {
TestBoost test = TestBoost(Task_Regression,
{FeatureTest(4, true, false, false)},
{{0}},
{
TestSample({0}, 10.0),
TestSample({1}, 20.0),
TestSample({2}, 30.0),
},
{TestSample({1}, 20.0)});

// boost continuous missing separate
double validationMetric = test.Boost(0, TermBoostFlags_MissingSeparate).validationMetric;
CHECK_APPROX(validationMetric, 392.04000000000002);

double termScore;
termScore = test.GetCurrentTermScore(0, {0}, 0);
CHECK_APPROX(termScore, 0.10000000000000001);
}

static double RandomizedTesting(const AccelerationFlags acceleration) {
const IntEbm cTrainSamples = 211; // have some non-SIMD residuals
const IntEbm cValidationSamples = 101; // have some non-SIMD residuals
Expand Down Expand Up @@ -2173,7 +2193,7 @@ static double RandomizedTesting(const AccelerationFlags acceleration) {
}

TEST_CASE("stress test, boosting") {
const double expected = 14909739735305.580;
const double expected = 16554721767106.137;

double validationMetricExact = RandomizedTesting(AccelerationFlags_NONE);
CHECK(validationMetricExact == expected);
Expand Down

0 comments on commit d7d7d1b

Please sign in to comment.