Skip to content

Commit

Permalink
allow caller to place the missing bin at the high end of the feature values
Browse files Browse the repository at this point in the history
  • Loading branch information
paulbkoch committed Dec 25, 2024
1 parent 42875bc commit 2a3376a
Show file tree
Hide file tree
Showing 2 changed files with 54 additions and 20 deletions.
72 changes: 53 additions & 19 deletions shared/libebm/PartitionOneDimensionalBoosting.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -117,12 +117,8 @@ static ErrorEbm Flatten(BoosterShell* const pBoosterShell,
const size_t iDimension,
const Bin<FloatMain, UIntMain, true, true, bHessian>* const* const apBins,
const TreeNode<bHessian>* pMissingValueTreeNode,
const size_t cSlices
#ifndef NDEBUG
,
const size_t cBins
#endif // NDEBUG
) {
const size_t cSlices,
const size_t cBins) {
LOG_0(Trace_Verbose, "Entered Flatten");

EBM_ASSERT(nullptr != pBoosterShell);
Expand Down Expand Up @@ -178,6 +174,8 @@ static ErrorEbm Flatten(BoosterShell* const pBoosterShell,
pUpdateScore = aUpdateScore;

if(bMissing) {
EBM_ASSERT(2 <= cSlices); // no cuts if there was only missing bin

// always put a split on the missing bin
*pSplit = 1;
++pSplit;
Expand All @@ -199,6 +197,7 @@ static ErrorEbm Flatten(BoosterShell* const pBoosterShell,
const bool bUpdateWithHessian = bHessian && !(TermBoostFlags_DisableNewtonUpdate & flags);

TreeNode<bHessian>* pParent = nullptr;
bool bDone = false;

while(true) {
if(UNPREDICTABLE(pTreeNode->AFTER_IsSplit())) {
Expand Down Expand Up @@ -253,11 +252,6 @@ static ErrorEbm Flatten(BoosterShell* const pBoosterShell,
}
EBM_ASSERT(!bNominal);

// if !bNominal, check the bin above and below for order
EBM_ASSERT(apBins == ppBinLast || *(ppBinLast - 1) < *ppBinLast);
EBM_ASSERT(ppBinLast == apBins + (cBins - (nullptr != pMissingValueTreeNode ? size_t{2} : size_t{1})) ||
*ppBinLast < *(ppBinLast + 1));

iEdge = ppBinLast - apBins + 1 + (nullptr != pMissingValueTreeNode ? 1 : 0);

while(true) { // not a real loop
Expand All @@ -267,8 +261,17 @@ static ErrorEbm Flatten(BoosterShell* const pBoosterShell,
pMissingBin = pTreeNode->GetBin();
}
if(1 == iEdge) {
// this cut would isolate the missing bin, but we handle those scores separately
break;
}
} else if(TermBoostFlags_MissingHigh & flags) {
++iEdge; // missing is at index 0 in the model, so we are offset by one
pMissingBin = pTreeNode->GetBin();
EBM_ASSERT(iEdge <= cBins + 1);
if(bDone) {
// this cut would isolate the missing bin, but we handle those scores separately
goto done;
}
}
}

Expand Down Expand Up @@ -316,10 +319,13 @@ static ErrorEbm Flatten(BoosterShell* const pBoosterShell,

while(true) {
if(nullptr == pTreeNode) {
done:;
EBM_ASSERT(cSamplesTotalDebug == cSamplesExpectedDebug);

EBM_ASSERT(nullptr == pMissingValueTreeNode || nullptr != pMissingBin);
if(nullptr != pMissingBin) {
EBM_ASSERT(bMissing);

FloatScore hess = static_cast<FloatCalc>(pMissingBin->GetWeight());
const auto* pGradientPair = pMissingBin->GetGradientPairs();
const auto* const pGradientPairEnd = pGradientPair + cScores;
Expand All @@ -341,6 +347,18 @@ static ErrorEbm Flatten(BoosterShell* const pBoosterShell,
} while(pGradientPairEnd != pGradientPair);
}

EBM_ASSERT(bNominal || pSplit == cSlices - 1 + pInnerTermUpdate->GetSplitPointer(iDimension));

#ifndef NDEBUG
UIntSplit prevDebug = 0;
for(size_t iDebug = 0; iDebug < cSlices - 1; ++iDebug) {
UIntSplit curDebug = pInnerTermUpdate->GetSplitPointer(iDimension)[iDebug];
EBM_ASSERT(prevDebug < curDebug);
prevDebug = curDebug;
}
EBM_ASSERT(prevDebug < cBins);
#endif

LOG_0(Trace_Verbose, "Exited Flatten");
return Error_None;
}
Expand All @@ -353,12 +371,21 @@ static ErrorEbm Flatten(BoosterShell* const pBoosterShell,
if(bMissing) {
if(TermBoostFlags_MissingLow & flags) {
if(1 == iEdge) {
// this cut would isolate the missing bin, but missing already has a cut
break;
}
} else if(TermBoostFlags_MissingHigh & flags) {
EBM_ASSERT(iEdge <= cBins);
if(cBins == iEdge) {
// This cut would isolate the missing bin, but missing already has a cut.
// We still need to find the missing bin in the tree, though, so continue.
break;
}
}
}

EBM_ASSERT(!IsConvertError<UIntSplit>(iEdge));
EBM_ASSERT(pSplit < cSlices - 1 + pInnerTermUpdate->GetSplitPointer(iDimension));
*pSplit = static_cast<UIntSplit>(iEdge);
++pSplit;

Expand Down Expand Up @@ -869,9 +896,13 @@ template<bool bHessian, size_t cCompilerScores> class PartitionOneDimensionalBoo
if(!bNominal) {
pMissingBin = pBin;
}
*ppBin = pBin;
}
} else if(TermBoostFlags_MissingHigh & flags) {
if(bMissing) {
if(!bNominal) {
pMissingBin = pBin;
}
pBin = IndexBin(pBin, cBytesPerBin);
++ppBin;
}
} else {
if(bMissing) {
Expand All @@ -888,6 +919,13 @@ template<bool bHessian, size_t cCompilerScores> class PartitionOneDimensionalBoo
++ppBin;
} while(pBinsEnd != pBin);

if(TermBoostFlags_MissingHigh & flags) {
if(bMissing) {
*ppBin = aBins;
++ppBin;
}
}

if(bNominal) {
std::sort(apBins,
ppBin,
Expand Down Expand Up @@ -1072,12 +1110,8 @@ template<bool bHessian, size_t cCompilerScores> class PartitionOneDimensionalBoo
iDimension,
reinterpret_cast<const Bin<FloatMain, UIntMain, true, true, bHessian>* const*>(apBins),
nullptr != pMissingValueTreeNode ? pMissingValueTreeNode->Downgrade() : nullptr,
cSlices
#ifndef NDEBUG
,
cBins
#endif // NDEBUG
);
cSlices,
cBins);

EBM_ASSERT(!bMissing || 2 <= pBoosterShell->GetInnerTermUpdate()->GetCountSlices(iDimension));
EBM_ASSERT(!bMissing || *pBoosterShell->GetInnerTermUpdate()->GetSplitPointer(iDimension) == 1);
Expand Down
2 changes: 1 addition & 1 deletion shared/libebm/tests/boosting_unusual_inputs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2175,7 +2175,7 @@ static double RandomizedTesting(const AccelerationFlags acceleration) {
}

TEST_CASE("stress test, boosting") {
const double expected = 26758407585917.129;
const double expected = 26746562197367.172;

double validationMetricExact = RandomizedTesting(AccelerationFlags_NONE);
CHECK(validationMetricExact == expected);
Expand Down

0 comments on commit 2a3376a

Please sign in to comment.