Skip to content

Commit

Permalink
Nits
Browse files Browse the repository at this point in the history
  • Loading branch information
pitrou committed Dec 12, 2022
1 parent 3e0aeed commit 3fa5d62
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 25 deletions.
33 changes: 18 additions & 15 deletions cpp/src/parquet/page_index.cc
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
#include "arrow/util/unreachable.h"

#include <limits>
#include <map>
#include <numeric>

namespace parquet {
Expand All @@ -42,7 +41,10 @@ void Decode(std::unique_ptr<typename EncodingTraits<DType>::Decoder>& decoder,

decoder->SetData(/*num_values=*/1, reinterpret_cast<const uint8_t*>(input.c_str()),
static_cast<int>(input.size()));
decoder->Decode(&output->at(output_index), /*max_values=*/1);
const auto num_values = decoder->Decode(&output->at(output_index), /*max_values=*/1);
if (ARROW_PREDICT_FALSE(num_values != 1)) {
throw ParquetException("Could not decode statistics value");
}
}

template <>
Expand All @@ -56,7 +58,10 @@ void Decode<BooleanType>(std::unique_ptr<BooleanDecoder>& decoder,
bool value;
decoder->SetData(/*num_values=*/1, reinterpret_cast<const uint8_t*>(input.c_str()),
static_cast<int>(input.size()));
decoder->Decode(&value, /*max_values=*/1);
const auto num_values = decoder->Decode(&value, /*max_values=*/1);
if (ARROW_PREDICT_FALSE(num_values != 1)) {
throw ParquetException("Could not decode statistics value");
}
output->at(output_index) = value;
}

Expand All @@ -72,9 +77,8 @@ void Decode<ByteArrayType>(std::unique_ptr<ByteArrayDecoder>&, const std::string
throw ParquetException("Invalid encoded byte array length");
}

auto& decoded = output->at(output_index);
decoded.len = static_cast<uint32_t>(input.size());
decoded.ptr = reinterpret_cast<const uint8_t*>(input.data());
output->at(output_index) = {/*len=*/static_cast<uint32_t>(input.size()),
/*ptr=*/reinterpret_cast<const uint8_t*>(input.data())};
}

template <typename DType>
Expand All @@ -86,26 +90,25 @@ class TypedColumnIndexImpl : public TypedColumnIndex<DType> {
const format::ColumnIndex& column_index)
: column_index_(column_index) {
// Make sure the number of pages is valid and it does not overflow to int32_t.
if (ARROW_PREDICT_FALSE(column_index_.null_pages.size() >=
static_cast<size_t>(std::numeric_limits<int32_t>::max())) ||
column_index_.null_pages.size() != column_index_.min_values.size() ||
column_index_.min_values.size() != column_index_.max_values.size() ||
const size_t num_pages = column_index_.null_pages.size();
if (num_pages >= static_cast<size_t>(std::numeric_limits<int32_t>::max()) ||
column_index_.min_values.size() != num_pages ||
column_index_.max_values.size() != num_pages ||
(column_index_.__isset.null_counts &&
column_index_.null_counts.size() != column_index_.null_pages.size())) {
column_index_.null_counts.size() != num_pages)) {
throw ParquetException("Invalid column index");
}

size_t num_pages = column_index_.null_pages.size();
size_t num_non_null_pages = static_cast<size_t>(std::accumulate(
const size_t num_non_null_pages = static_cast<size_t>(std::accumulate(
column_index_.null_pages.cbegin(), column_index_.null_pages.cend(), 0,
[](int32_t num_non_null_pages, bool null_page) {
return num_non_null_pages + (null_page ? 0 : 1);
}));
DCHECK_LE(num_non_null_pages, num_pages);

// Allocate slots for decoded values.
min_values_.resize(num_pages);
max_values_.resize(num_pages);
min_values_.resize(num_non_null_pages);
max_values_.resize(num_non_null_pages);
non_null_page_indices_.reserve(num_non_null_pages);

// Decode min and max values according to the physical type.
Expand Down
20 changes: 10 additions & 10 deletions cpp/src/parquet/page_index_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ TEST(PageIndex, ReadDoubleColumnIndex) {
null_pages, min_values, max_values, has_null_counts, null_counts);
}

TEST(PageIndex, ByteArrayColumnIndex) {
TEST(PageIndex, ReadByteArrayColumnIndex) {
const int column_id = 9;
const size_t num_pages = 352;
const BoundaryOrder::type boundary_order = BoundaryOrder::Ascending;
Expand Down Expand Up @@ -214,11 +214,11 @@ TEST(PageIndex, ReadBoolColumnIndex) {
null_pages, min_values, max_values, has_null_counts, null_counts);
}

namespace {
FLBA toFLBA(const char* ptr) { return FLBA{reinterpret_cast<const uint8_t*>(ptr)}; }
} // namespace

TEST(PageIndex, FixedLengthByteArrayColumnIndex) {
auto to_flba = [](const char* ptr) {
return FLBA{reinterpret_cast<const uint8_t*>(ptr)};
};

const int column_id = 0;
const size_t num_pages = 10;
const BoundaryOrder::type boundary_order = BoundaryOrder::Descending;
Expand All @@ -230,10 +230,10 @@ TEST(PageIndex, FixedLengthByteArrayColumnIndex) {
"\x00\x00\x00\x65"};
const std::vector<const char*> max_literals = {"\x00\x00\x03\xE8", "\x00\x00\x02\x58",
"\x00\x00\x00\xC8"};
const std::vector<FLBA> min_values = {toFLBA(min_literals[0]), toFLBA(min_literals[1]),
toFLBA(min_literals[2])};
const std::vector<FLBA> max_values = {toFLBA(max_literals[0]), toFLBA(max_literals[1]),
toFLBA(max_literals[2])};
const std::vector<FLBA> min_values = {
to_flba(min_literals[0]), to_flba(min_literals[1]), to_flba(min_literals[2])};
const std::vector<FLBA> max_values = {
to_flba(max_literals[0]), to_flba(max_literals[1]), to_flba(max_literals[2])};

TestReadTypedColumnIndex<FLBAType>(
"fixed_length_byte_array.parquet", column_id, num_pages, boundary_order,
Expand All @@ -253,7 +253,7 @@ TEST(PageIndex, ReadColumnIndexWithNullPage) {
const std::vector<int32_t> max_values = {};

TestReadTypedColumnIndex<Int32Type>(
"datapage_v1-corrupt-checksum.parquet", column_id, num_pages, boundary_order,
"datapage_v1-uncompressed-checksum.parquet", column_id, num_pages, boundary_order,
page_indices, null_pages, min_values, max_values, has_null_counts, null_counts);
}

Expand Down

0 comments on commit 3fa5d62

Please sign in to comment.