Skip to content

Commit

Permalink
GH-26648: [C++] Optimize union equality comparison (#45384)
Browse files Browse the repository at this point in the history
### Rationale for this change

#26648 proposes an optimization in checking sparse array equality by detecting contiguous runs, this PR implements that change

### What changes are included in this PR?

previously, sparse array comparison was checked one by one, in this change, contiguous runs are detected and compared by checking equality of current and previous child_ids

### Are these changes tested?

already covered by existing unit tests

### Are there any user-facing changes?

no

* GitHub Issue: #26648

Lead-authored-by: shawn <[email protected]>
Co-authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
  • Loading branch information
mcshawn10 and pitrou authored Feb 4, 2025
1 parent 31747f0 commit 1567be0
Show file tree
Hide file tree
Showing 2 changed files with 66 additions and 8 deletions.
30 changes: 30 additions & 0 deletions cpp/src/arrow/array/array_union_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,36 @@ TEST(TestSparseUnionArray, Validate) {
ASSERT_RAISES(Invalid, arr->ValidateFull());
}

TEST(TestSparseUnionArray, Comparison) {
auto ints1 = ArrayFromJSON(int32(), "[1, 2, 3, 4, 5, 6]");
auto ints2 = ArrayFromJSON(int32(), "[1, 2, -3, 4, -5, 6]");
auto strs1 = ArrayFromJSON(utf8(), R"(["a", "b", "c", "d", "e", "f"])");
auto strs2 = ArrayFromJSON(utf8(), R"(["a", "*", "c", "d", "e", "*"])");
std::vector<int8_t> type_codes{8, 42};

auto check_equality = [&](const std::string& type_ids_json1,
const std::string& type_ids_json2, bool expected_equals) {
auto type_ids1 = ArrayFromJSON(int8(), type_ids_json1);
auto type_ids2 = ArrayFromJSON(int8(), type_ids_json2);
ASSERT_OK_AND_ASSIGN(auto arr1,
SparseUnionArray::Make(*type_ids1, {ints1, strs1}, type_codes));
ASSERT_OK_AND_ASSIGN(auto arr2,
SparseUnionArray::Make(*type_ids2, {ints2, strs2}, type_codes));
ASSERT_EQ(arr1->Equals(arr2), expected_equals);
ASSERT_EQ(arr2->Equals(arr1), expected_equals);
};

// Same type ids
check_equality("[8, 8, 42, 42, 42, 8]", "[8, 8, 42, 42, 42, 8]", true);
check_equality("[8, 8, 42, 42, 42, 42]", "[8, 8, 42, 42, 42, 42]", false);
check_equality("[8, 8, 8, 42, 42, 8]", "[8, 8, 8, 42, 42, 8]", false);
check_equality("[8, 42, 42, 42, 42, 8]", "[8, 42, 42, 42, 42, 8]", false);

// Different type ids
check_equality("[42, 8, 42, 42, 42, 8]", "[8, 8, 42, 42, 42, 8]", false);
check_equality("[8, 8, 42, 42, 42, 8]", "[8, 8, 42, 42, 42, 42]", false);
}

// -------------------------------------------------------------------------
// Tests for MakeDense and MakeSparse

Expand Down
44 changes: 36 additions & 8 deletions cpp/src/arrow/compare.cc
Original file line number Diff line number Diff line change
Expand Up @@ -381,21 +381,49 @@ class RangeDataEqualsImpl {
const int8_t* right_codes = right_.GetValues<int8_t>(1);

// Unions don't have a null bitmap
int64_t run_start = 0; // Start index of the current run

for (int64_t i = 0; i < range_length_; ++i) {
const auto type_id = left_codes[left_start_idx_ + i];
if (type_id != right_codes[right_start_idx_ + i]) {
const auto current_type_id = left_codes[left_start_idx_ + i];

if (current_type_id != right_codes[right_start_idx_ + i]) {
result_ = false;
break;
}
const auto child_num = child_ids[type_id];
// XXX can we instead detect runs of same-child union values?
// Check if the current element breaks the run
if (i > 0 && current_type_id != left_codes[left_start_idx_ + i - 1]) {
// Compare the previous run
const auto previous_child_num = child_ids[left_codes[left_start_idx_ + i - 1]];
int64_t run_length = i - run_start;

RangeDataEqualsImpl impl(
options_, floating_approximate_, *left_.child_data[previous_child_num],
*right_.child_data[previous_child_num],
left_start_idx_ + left_.offset + run_start,
right_start_idx_ + right_.offset + run_start, run_length);

if (!impl.Compare()) {
result_ = false;
break;
}

// Start a new run
run_start = i;
}
}

// Handle the final run
if (result_) {
const auto final_child_num = child_ids[left_codes[left_start_idx_ + run_start]];
int64_t final_run_length = range_length_ - run_start;

RangeDataEqualsImpl impl(
options_, floating_approximate_, *left_.child_data[child_num],
*right_.child_data[child_num], left_start_idx_ + left_.offset + i,
right_start_idx_ + right_.offset + i, 1);
options_, floating_approximate_, *left_.child_data[final_child_num],
*right_.child_data[final_child_num], left_start_idx_ + left_.offset + run_start,
right_start_idx_ + right_.offset + run_start, final_run_length);

if (!impl.Compare()) {
result_ = false;
break;
}
}
return Status::OK();
Expand Down

0 comments on commit 1567be0

Please sign in to comment.