Skip to content

Commit

Permalink
[fix](orc) Fix has null statistics for forward compatibility (#259)
Browse files Browse the repository at this point in the history
  • Loading branch information
suxiaogang223 authored Dec 10, 2024
1 parent 6324006 commit 86a5c44
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 22 deletions.
20 changes: 10 additions & 10 deletions c++/src/Statistics.cc
Original file line number Diff line number Diff line change
Expand Up @@ -181,13 +181,13 @@ namespace orc {

ColumnStatisticsImpl::ColumnStatisticsImpl(const proto::ColumnStatistics& pb) {
_stats.setNumberOfValues(pb.numberofvalues());
_stats.setHasNull(pb.hasnull());
_stats.setHasNull(pb.has_hasnull() ? pb.hasnull() : true);
}

BinaryColumnStatisticsImpl::BinaryColumnStatisticsImpl(const proto::ColumnStatistics& pb,
const StatContext& statContext) {
_stats.setNumberOfValues(pb.numberofvalues());
_stats.setHasNull(pb.hasnull());
_stats.setHasNull(pb.has_hasnull() ? pb.hasnull() : true);
if (pb.has_binarystatistics() && statContext.correctStats) {
_stats.setHasTotalLength(pb.binarystatistics().has_sum());
_stats.setTotalLength(static_cast<uint64_t>(pb.binarystatistics().sum()));
Expand All @@ -197,7 +197,7 @@ namespace orc {
BooleanColumnStatisticsImpl::BooleanColumnStatisticsImpl(const proto::ColumnStatistics& pb,
const StatContext& statContext) {
_stats.setNumberOfValues(pb.numberofvalues());
_stats.setHasNull(pb.hasnull());
_stats.setHasNull(pb.has_hasnull() ? pb.hasnull() : true);
if (pb.has_bucketstatistics() && statContext.correctStats) {
_hasCount = true;
_trueCount = pb.bucketstatistics().count(0);
Expand All @@ -210,7 +210,7 @@ namespace orc {
DateColumnStatisticsImpl::DateColumnStatisticsImpl(const proto::ColumnStatistics& pb,
const StatContext& statContext) {
_stats.setNumberOfValues(pb.numberofvalues());
_stats.setHasNull(pb.hasnull());
_stats.setHasNull(pb.has_hasnull() ? pb.hasnull() : true);
if (!pb.has_datestatistics() || !statContext.correctStats) {
// hasMinimum_ is false by default;
// hasMaximum_ is false by default;
Expand All @@ -227,7 +227,7 @@ namespace orc {
DecimalColumnStatisticsImpl::DecimalColumnStatisticsImpl(const proto::ColumnStatistics& pb,
const StatContext& statContext) {
_stats.setNumberOfValues(pb.numberofvalues());
_stats.setHasNull(pb.hasnull());
_stats.setHasNull(pb.has_hasnull() ? pb.hasnull() : true);
if (pb.has_decimalstatistics() && statContext.correctStats) {
const proto::DecimalStatistics& stats = pb.decimalstatistics();
_stats.setHasMinimum(stats.has_minimum());
Expand All @@ -242,7 +242,7 @@ namespace orc {

DoubleColumnStatisticsImpl::DoubleColumnStatisticsImpl(const proto::ColumnStatistics& pb) {
_stats.setNumberOfValues(pb.numberofvalues());
_stats.setHasNull(pb.hasnull());
_stats.setHasNull(pb.has_hasnull() ? pb.hasnull() : true);
if (!pb.has_doublestatistics()) {
_stats.setMinimum(0);
_stats.setMaximum(0);
Expand All @@ -261,7 +261,7 @@ namespace orc {

IntegerColumnStatisticsImpl::IntegerColumnStatisticsImpl(const proto::ColumnStatistics& pb) {
_stats.setNumberOfValues(pb.numberofvalues());
_stats.setHasNull(pb.hasnull());
_stats.setHasNull(pb.has_hasnull() ? pb.hasnull() : true);
if (!pb.has_intstatistics()) {
_stats.setMinimum(0);
_stats.setMaximum(0);
Expand All @@ -281,7 +281,7 @@ namespace orc {
StringColumnStatisticsImpl::StringColumnStatisticsImpl(const proto::ColumnStatistics& pb,
const StatContext& statContext) {
_stats.setNumberOfValues(pb.numberofvalues());
_stats.setHasNull(pb.hasnull());
_stats.setHasNull(pb.has_hasnull() ? pb.hasnull() : true);
if (!pb.has_stringstatistics() || !statContext.correctStats) {
_stats.setTotalLength(0);
} else {
Expand All @@ -299,7 +299,7 @@ namespace orc {
TimestampColumnStatisticsImpl::TimestampColumnStatisticsImpl(const proto::ColumnStatistics& pb,
const StatContext& statContext) {
_stats.setNumberOfValues(pb.numberofvalues());
_stats.setHasNull(pb.hasnull());
_stats.setHasNull(pb.has_hasnull() ? pb.hasnull() : true);
if (!pb.has_timestampstatistics() || !statContext.correctStats) {
_stats.setMinimum(0);
_stats.setMaximum(0);
Expand Down Expand Up @@ -365,7 +365,7 @@ namespace orc {
CollectionColumnStatisticsImpl::CollectionColumnStatisticsImpl(
const proto::ColumnStatistics& pb) {
_stats.setNumberOfValues(pb.numberofvalues());
_stats.setHasNull(pb.hasnull());
_stats.setHasNull(pb.has_hasnull() ? pb.hasnull() : true);
if (!pb.has_collectionstatistics()) {
_stats.setMinimum(0);
_stats.setMaximum(0);
Expand Down
30 changes: 18 additions & 12 deletions c++/src/sargs/PredicateLeaf.cc
Original file line number Diff line number Diff line change
Expand Up @@ -390,10 +390,15 @@ namespace orc {

DIAGNOSTIC_POP

static bool col_stats_hasnull(const proto::ColumnStatistics& stats) {
// for foward compatibility, if hasnull is not set, assume that the column has nulls
return stats.has_hasnull() ? stats.hasnull() : true;
}

static TruthValue evaluateBoolPredicate(const PredicateLeaf::Operator op,
const std::vector<Literal>& literals,
const proto::ColumnStatistics& stats) {
bool hasNull = stats.hasnull();
bool hasNull = col_stats_hasnull(stats);
if (!stats.has_bucketstatistics() || stats.bucketstatistics().count_size() == 0) {
// does not have bool stats
return hasNull ? TruthValue::YES_NO_NULL : TruthValue::YES_NO;
Expand Down Expand Up @@ -513,7 +518,7 @@ namespace orc {
colStats.intstatistics().has_maximum()) {
const auto& stats = colStats.intstatistics();
result = evaluatePredicateRange(mOperator, literal2Long(mLiterals), stats.minimum(),
stats.maximum(), colStats.hasnull());
stats.maximum(), col_stats_hasnull(colStats));
}
break;
}
Expand All @@ -522,10 +527,10 @@ namespace orc {
colStats.doublestatistics().has_maximum()) {
const auto& stats = colStats.doublestatistics();
if (!std::isfinite(stats.sum())) {
result = colStats.hasnull() ? TruthValue::YES_NO_NULL : TruthValue::YES_NO;
result = col_stats_hasnull(colStats) ? TruthValue::YES_NO_NULL : TruthValue::YES_NO;
} else {
result = evaluatePredicateRange(mOperator, literal2Double(mLiterals), stats.minimum(),
stats.maximum(), colStats.hasnull());
stats.maximum(), col_stats_hasnull(colStats));
}
}
break;
Expand All @@ -536,7 +541,7 @@ namespace orc {
colStats.stringstatistics().has_maximum()) {
const auto& stats = colStats.stringstatistics();
result = evaluatePredicateRange(mOperator, literal2String(mLiterals), stats.minimum(),
stats.maximum(), colStats.hasnull());
stats.maximum(), col_stats_hasnull(colStats));
}
break;
}
Expand All @@ -545,7 +550,7 @@ namespace orc {
colStats.datestatistics().has_maximum()) {
const auto& stats = colStats.datestatistics();
result = evaluatePredicateRange(mOperator, literal2Date(mLiterals), stats.minimum(),
stats.maximum(), colStats.hasnull());
stats.maximum(), col_stats_hasnull(colStats));
}
break;
}
Expand All @@ -564,7 +569,7 @@ namespace orc {
stats.maximumutc() / 1000,
static_cast<int32_t>((stats.maximumutc() % 1000) * 1000000) + maxNano);
result = evaluatePredicateRange(mOperator, literal2Timestamp(mLiterals), minTimestamp,
maxTimestamp, colStats.hasnull());
maxTimestamp, col_stats_hasnull(colStats));
}
break;
}
Expand All @@ -574,7 +579,7 @@ namespace orc {
const auto& stats = colStats.decimalstatistics();
result = evaluatePredicateRange(mOperator, literal2Decimal(mLiterals),
Decimal(stats.minimum()), Decimal(stats.maximum()),
colStats.hasnull());
col_stats_hasnull(colStats));
}
break;
}
Expand All @@ -589,7 +594,7 @@ namespace orc {
}

// make sure null literal is respected for IN operator
if (mOperator == Operator::IN && colStats.hasnull()) {
if (mOperator == Operator::IN && col_stats_hasnull(colStats)) {
for (const auto& literal : mLiterals) {
if (literal.isNull()) {
result = TruthValue::YES_NO_NULL;
Expand Down Expand Up @@ -698,20 +703,21 @@ namespace orc {
}
}

bool allNull = colStats.hasnull() && colStats.numberofvalues() == 0;
bool allNull = col_stats_hasnull(colStats) && colStats.numberofvalues() == 0;
if (mOperator == Operator::IS_NULL ||
((mOperator == Operator::EQUALS || mOperator == Operator::NULL_SAFE_EQUALS) &&
mLiterals.at(0).isNull())) {
// IS_NULL operator does not need to check min/max stats and bloom filter
return allNull ? TruthValue::YES : (colStats.hasnull() ? TruthValue::YES_NO : TruthValue::NO);
return allNull ? TruthValue::YES
: (col_stats_hasnull(colStats) ? TruthValue::YES_NO : TruthValue::NO);
} else if (allNull) {
// if we don't have any value, everything must have been null
return TruthValue::IS_NULL;
}

TruthValue result = evaluatePredicateMinMax(colStats);
if (shouldEvaluateBloomFilter(mOperator, result, bloomFilter)) {
return evaluatePredicateBloomFiter(bloomFilter, colStats.hasnull());
return evaluatePredicateBloomFiter(bloomFilter, col_stats_hasnull(colStats));
} else {
return result;
}
Expand Down

0 comments on commit 86a5c44

Please sign in to comment.