Skip to content

Commit

Permalink
Optimize add_items with out requiring fast path registrations for pri…
Browse files Browse the repository at this point in the history
…mitive types. (#8194)

Summary:
Pull Request resolved: #8194

add_items append elements from an array view to array writer.
when the input array is ArrayView<Generic> and output is ArrayWriter<Generic> its is expensive
and If a function is using add_items() then to avoid the cost authors register fast paths for primitives
see (#7393)

we can optimize add_items() and avoid that authoring overhead, right now its slow because
it does a type check per element when copying it. We can optimize the cast cost and avoid to do it for
each element in the array (since they are all of the same type) and instead do it before we start the
copy and have a fast path for when the elements are of a pritmive type.

when the elements are not primitive the cost of checking the type s amortized by the cost of the
copying the complex elements.

with this diff, the function array_concat performance with generic implementation is very close
to the one with registration for primitive fast paths.  up to 5X faster than before

## Array concat benchmark.

generic before
```
============================================================================
[...]hmarks/ExpressionBenchmarkBuilder.cpp     relative  time/iter   iters/s
============================================================================
array_concat_BOOLEAN_10##2arg                             567.29ms      1.76
array_concat_BOOLEAN_10##3arg                             848.30ms      1.18
array_concat_BOOLEAN_10##4arg                                1.20s   835.32m
array_concat_BOOLEAN_20##2arg                                1.24s   804.59m
array_concat_BOOLEAN_20##3arg                                1.83s   545.78m
array_concat_BOOLEAN_20##4arg                                2.43s   411.28m
array_concat_BOOLEAN_40##2arg                                2.42s   413.40m
array_concat_BOOLEAN_40##3arg                                3.45s   290.10m
array_concat_BOOLEAN_40##4arg                                4.72s   211.95m
array_concat_BOOLEAN_5##2arg                              326.58ms      3.06
array_concat_BOOLEAN_5##3arg                              500.23ms      2.00
array_concat_BOOLEAN_5##4arg                              647.58ms      1.54
array_concat_INTEGER_10##2arg                             451.38ms      2.22
array_concat_INTEGER_10##3arg                             676.54ms      1.48
array_concat_INTEGER_10##4arg                             907.98ms      1.10
array_concat_INTEGER_20##2arg                             903.66ms      1.11
array_concat_INTEGER_20##3arg                                1.46s   685.90m
array_concat_INTEGER_20##4arg                                1.90s   525.07m
array_concat_INTEGER_40##2arg                                1.83s   547.40m
array_concat_INTEGER_40##3arg                                2.63s   379.91m
array_concat_INTEGER_40##4arg                                3.65s   274.16m
array_concat_INTEGER_5##2arg                              243.12ms      4.11
array_concat_INTEGER_5##3arg                              381.92ms      2.62
array_concat_INTEGER_5##4arg                              502.78ms      1.99
array_concat_VARCHAR_10##2arg                                1.26s   792.79m
array_concat_VARCHAR_10##3arg                                1.73s   579.50m
array_concat_VARCHAR_10##4arg                                2.21s   452.26m
array_concat_VARCHAR_20##2arg                                3.23s   309.67m
array_concat_VARCHAR_20##3arg                                4.08s   244.99m
array_concat_VARCHAR_20##4arg                                5.09s   196.40m
array_concat_VARCHAR_40##2arg                                5.49s   182.17m
array_concat_VARCHAR_40##3arg                                9.23s   108.36m
```

generic after
```
BUILD SUCCEEDED
============================================================================
[...]hmarks/ExpressionBenchmarkBuilder.cpp     relative  time/iter   iters/s
============================================================================
array_concat_BOOLEAN_10##2arg                             195.54ms      5.11
array_concat_BOOLEAN_10##3arg                             265.57ms      3.77
array_concat_BOOLEAN_10##4arg                             397.59ms      2.52
array_concat_BOOLEAN_20##2arg                             487.38ms      2.05
array_concat_BOOLEAN_20##3arg                             758.45ms      1.32
array_concat_BOOLEAN_20##4arg                                1.07s   930.67m
array_concat_BOOLEAN_40##2arg                             914.62ms      1.09
array_concat_BOOLEAN_40##3arg                                1.36s   737.16m
array_concat_BOOLEAN_40##4arg                                1.72s   580.03m
array_concat_BOOLEAN_5##2arg                              149.76ms      6.68
array_concat_BOOLEAN_5##3arg                              234.81ms      4.26
array_concat_BOOLEAN_5##4arg                              300.58ms      3.33
array_concat_INTEGER_10##2arg                              70.89ms     14.11
array_concat_INTEGER_10##3arg                              95.07ms     10.52
array_concat_INTEGER_10##4arg                             124.94ms      8.00
array_concat_INTEGER_20##2arg                             102.19ms      9.79
array_concat_INTEGER_20##3arg                             155.30ms      6.44
array_concat_INTEGER_20##4arg                             187.59ms      5.33
array_concat_INTEGER_40##2arg                             122.93ms      8.13
array_concat_INTEGER_40##3arg                             153.85ms      6.50
array_concat_INTEGER_40##4arg                             322.33ms      3.10
array_concat_INTEGER_5##2arg                               70.71ms     14.14
array_concat_INTEGER_5##3arg                              100.96ms      9.90
array_concat_INTEGER_5##4arg                              124.78ms      8.01
array_concat_VARCHAR_10##2arg                             239.86ms      4.17
array_concat_VARCHAR_10##3arg                             313.51ms      3.19
array_concat_VARCHAR_10##4arg                             418.63ms      2.39
array_concat_VARCHAR_20##2arg                             492.72ms      2.03
array_concat_VARCHAR_20##3arg                             645.26ms      1.55
array_concat_VARCHAR_20##4arg                             872.10ms      1.15
array_concat_VARCHAR_40##2arg                             737.43ms      1.36
array_concat_VARCHAR_40##3arg                                1.19s   843.70m
array_concat_VARCHAR_40##4arg                                1.52s   658.16m
array_concat_VARCHAR_5##2arg                              111.10ms      9.00
array_concat_VARCHAR_5##3arg                              148.33ms      6.74
array_concat_VARCHAR_5##4arg                              193.35ms      5.17

```

primitive fast path
```
============================================================================
[...]hmarks/ExpressionBenchmarkBuilder.cpp     relative  time/iter   iters/s
============================================================================
array_concat_BOOLEAN_10##2arg                             178.21ms      5.61
array_concat_BOOLEAN_10##3arg                             233.11ms      4.29
array_concat_BOOLEAN_10##4arg                             363.77ms      2.75
array_concat_BOOLEAN_20##2arg                             456.42ms      2.19
array_concat_BOOLEAN_20##3arg                             712.48ms      1.40
array_concat_BOOLEAN_20##4arg                             927.58ms      1.08
array_concat_BOOLEAN_40##2arg                             873.87ms      1.14
array_concat_BOOLEAN_40##3arg                                1.35s   742.65m
array_concat_BOOLEAN_40##4arg                                1.66s   602.28m
array_concat_BOOLEAN_5##2arg                              141.29ms      7.08
array_concat_BOOLEAN_5##3arg                              224.04ms      4.46
array_concat_BOOLEAN_5##4arg                              290.93ms      3.44
array_concat_INTEGER_10##2arg                              58.67ms     17.05
array_concat_INTEGER_10##3arg                              80.23ms     12.46
array_concat_INTEGER_10##4arg                             107.38ms      9.31
array_concat_INTEGER_20##2arg                              90.53ms     11.05
array_concat_INTEGER_20##3arg                             146.84ms      6.81
array_concat_INTEGER_20##4arg                             174.97ms      5.72
array_concat_INTEGER_40##2arg                             113.06ms      8.85
array_concat_INTEGER_40##3arg                             144.51ms      6.92
array_concat_INTEGER_40##4arg                             317.69ms      3.15
array_concat_INTEGER_5##2arg                               60.72ms     16.47
array_concat_INTEGER_5##3arg                               86.76ms     11.53
array_concat_INTEGER_5##4arg                              104.10ms      9.61
array_concat_VARCHAR_10##2arg                             226.63ms      4.41
array_concat_VARCHAR_10##3arg                             304.74ms      3.28
array_concat_VARCHAR_10##4arg                             393.14ms      2.54
array_concat_VARCHAR_20##2arg                             467.90ms      2.14
array_concat_VARCHAR_20##3arg                             624.86ms      1.60
array_concat_VARCHAR_20##4arg                             833.13ms      1.20
array_concat_VARCHAR_40##2arg                             703.85ms      1.42
array_concat_VARCHAR_40##3arg                                1.20s   834.57m
array_concat_VARCHAR_40##4arg                                1.58s   634.88m
array_concat_VARCHAR_5##2arg                              104.95ms      9.53
array_concat_VARCHAR_5##3arg                              138.85ms      7.20
array_concat_VARCHAR_5##4arg                              178.57ms      5.60
```

Reviewed By: kevinwilfong

Differential Revision: D52380460

fbshipit-source-id: b92bae384de643ad5c6cd614c050cdd78637a5e6
  • Loading branch information
laithsakka authored and facebook-github-bot committed Jan 9, 2024
1 parent 322f309 commit cb1dd43
Show file tree
Hide file tree
Showing 2 changed files with 142 additions and 41 deletions.
2 changes: 1 addition & 1 deletion velox/expression/ComplexViewTypes.h
Original file line number Diff line number Diff line change
Expand Up @@ -1126,7 +1126,7 @@ class GenericView {
// If its a primitive type, then the casted reader always exists at
// castReaders_[0], and is set in Vector readers.
if constexpr (SimpleTypeTrait<ToType>::isPrimitiveType) {
return static_cast<VectorReader<ToType>*>(castReaders_[0].get())
return reinterpret_cast<VectorReader<ToType>*>(castReaders_[0].get())
->
operator[](index_);
} else {
Expand Down
181 changes: 141 additions & 40 deletions velox/expression/ComplexWriterTypes.h
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,23 @@ class ArrayWriter {
return valuesOffset_;
}

// Copy from nullable ArrayView.
void add_items(
const typename VectorExec::template resolver<Array<V>>::in_type&
arrayView) {
if constexpr (isGenericType<V>::value) {
addItemsGeneric(arrayView);
} else if constexpr (hasStringValue) {
addItemsStringFastPath<SimpleTypeTrait<V>::typeKind>(arrayView);
} else if constexpr (std::is_same_v<V, bool>) {
addItemsBoolFastPath(arrayView);
} else if constexpr (provide_std_interface<V>) {
addItemsOrimitiveFastPath<V>(arrayView);
} else {
addItemsGeneralSlowPath(arrayView);
}
}

// Any vector type with std-like optional-free interface.
template <typename VectorType>
void add_items(const VectorType& data) {
Expand All @@ -251,60 +268,141 @@ class ArrayWriter {
}
}

// Copy from nullable ArrayView.
void add_items(
const typename VectorExec::template resolver<Array<V>>::in_type&
arrayView) {
if constexpr (provide_std_interface<V>) {
// TODO: accelerate this with memcpy.
auto start = length_;
resize(length_ + arrayView.size());
for (auto i = 0; i < arrayView.size(); i++) {
if (arrayView[i].has_value()) {
this->operator[](i + start) = arrayView[i].value();
private:
template <TypeKind kind, typename VectorType>
void addItemsGenericPrimitive(const VectorType& data) {
if constexpr (kind == TypeKind::BOOLEAN) {
addItemsBoolFastPath(data);
} else if constexpr (
kind == TypeKind::VARCHAR || kind == TypeKind::VARBINARY) {
addItemsStringFastPath<kind>(data);
} else if constexpr (TypeTraits<kind>::isPrimitiveType) {
addItemsOrimitiveFastPath<typename KindToSimpleType<kind>::type>(data);
} else {
VELOX_UNREACHABLE("non primitives handled in addItemsGeneric");
}
}

template <typename ElementSimpleType, typename Input>
void addItemsOrimitiveFastPath(const Input& sourceArray) {
VELOX_DCHECK_NE(sourceArray.elementKind(), TypeKind::BOOLEAN);
auto start = length_ + valuesOffset_;
resize(length_ + sourceArray.size());
auto* flat = elementsVector_->template asUnchecked<FlatVector<
typename VectorExec::resolver<ElementSimpleType>::in_type>>();
auto* rawValues = flat->mutableRawValues();

flat->clearNulls(start, start + sourceArray.size());

int i = 0;
if (!sourceArray.mayHaveNulls()) {
for (auto item : sourceArray) {
if constexpr (isGenericType<V>::value) {
rawValues[i + start] = item->template castTo<ElementSimpleType>();
} else {
this->operator[](i + start) = std::nullopt;
rawValues[i + start] = item.value();
}
i++;
}
} else if constexpr (hasStringValue) {
auto* vector = arrayView.elementsVector();
bool found = false;
// Caching at this layer is much faster.
for (auto* item : vectorsWithAcquiredBuffers_) {
if (item == vector) {
found = true;
break;
} else {
for (auto item : sourceArray) {
if (item.has_value()) {
if constexpr (isGenericType<V>::value) {
rawValues[i + start] = item->template castTo<ElementSimpleType>();
} else {
rawValues[i + start] = item.value();
}
} else {
flat->setNull(i + start, true);
}
i++;
}
}
}

if (!found) {
vectorsWithAcquiredBuffers_.push_back(vector);
this->elementsVector_->acquireSharedStringBuffers(vector);
template <TypeKind kind, typename Input>
void addItemsStringFastPath(const Input& sourceArray) {
auto* vector = sourceArray.elementsVector();
auto* flatOutput =
this->elementsVector_->template asUnchecked<FlatVector<StringView>>();
bool found = false;
// Caching at this layer is much faster.
for (auto* item : vectorsWithAcquiredBuffers_) {
if (item == vector) {
found = true;
break;
}
auto start = length_ + valuesOffset_;
resize(length_ + arrayView.size());
for (const auto& element : arrayView) {
if (element.has_value()) {
elementsVector_->setNoCopy(start, element.value());
}

if (!found) {
vectorsWithAcquiredBuffers_.push_back(vector);
flatOutput->acquireSharedStringBuffers(vector);
}
auto start = length_ + valuesOffset_;
resize(length_ + sourceArray.size());
for (const auto& element : sourceArray) {
if (element.has_value()) {
if constexpr (isGenericType<V>::value) {
flatOutput->setNoCopy(
start,
element
->template castTo<typename KindToSimpleType<kind>::type>());
} else {
elementsVector_->setNull(start, true);
flatOutput->setNoCopy(start, element.value());
}
start++;
} else {
flatOutput->setNull(start, true);
}
} else {
reserve(size() + arrayView.size());
for (const auto& item : arrayView) {
if (item.has_value()) {
auto& writer = add_item();
writer.copy_from(item.value());
start++;
}
}

template <typename Input>
void addItemsBoolFastPath(const Input& sourceArray) {
auto* vector = sourceArray.elementsVector();
auto* flatOutput =
this->elementsVector_->template asUnchecked<FlatVector<bool>>();

auto start = length_ + valuesOffset_;
resize(length_ + sourceArray.size());
for (const auto& element : sourceArray) {
if (element.has_value()) {
if constexpr (isGenericType<V>::value) {
flatOutput->set(start, element->template castTo<bool>());
} else {
add_null();
flatOutput->set(start, element.value());
}
} else {
flatOutput->setNull(start, true);
}
start++;
}
}

template <typename Input>
void addItemsGeneralSlowPath(const Input& sourceArray) {
reserve(size() + sourceArray.size());
for (const auto& item : sourceArray) {
if (item.has_value()) {
auto& writer = add_item();
writer.copy_from(item.value());
} else {
add_null();
}
}
}

template <typename VectorType>
void addItemsGeneric(const VectorType& arrayView) {
if (elementIsPrimitive) {
auto kind = arrayView.elementKind();
VELOX_DYNAMIC_TYPE_DISPATCH_ALL(
addItemsGenericPrimitive, kind, arrayView);
} else {
addItemsGeneralSlowPath(arrayView);
}
}

private:
// Make sure user do not use those.
ArrayWriter<V>() = default;
ArrayWriter<V>(const ArrayWriter<V>&) = default;
Expand All @@ -325,6 +423,7 @@ class ArrayWriter {
valuesOffset_ = elementsVector_->size();
childWriter_->ensureSize(1);
elementsVectorCapacity_ = elementsVector_->size();
elementIsPrimitive = elementsVector_->type()->isPrimitiveType();
}

typename child_writer_t::vector_t* elementsVector_ = nullptr;
Expand All @@ -346,10 +445,12 @@ class ArrayWriter {
vector_size_t elementsVectorCapacity_ = 0;

typename std::conditional<
hasStringValue,
hasStringValue || isGenericType<V>::value,
std::vector<const BaseVector*>,
std::byte>::type vectorsWithAcquiredBuffers_;

bool elementIsPrimitive;

template <typename A, typename B>
friend struct VectorWriter;

Expand Down Expand Up @@ -919,8 +1020,8 @@ class GenericWriter {
VectorWriter<B, void>* retrieveCastedWriter() {
DCHECK(castType_);
DCHECK(castWriter_);

// TODO: optimize this check using disjoint sets of types (see GenericView).

return dynamic_cast<VectorWriter<B, void>*>(castWriter_.get());
}

Expand Down

0 comments on commit cb1dd43

Please sign in to comment.