Skip to content

Commit

Permalink
PyVelox implementation for Array Vector (#6100)
Browse files Browse the repository at this point in the history
Summary:
This PR implements support for Array vectors in PyVelox, building on the previous work done by save-buffer in PR #4113 and by richtia in PR #4602.

Pull Request resolved: #6100

Reviewed By: Yuhta

Differential Revision: D52255845

Pulled By: kgpai

fbshipit-source-id: a15028913685163fef44e75d0ca0d2707b014313
  • Loading branch information
sanjibansg authored and facebook-github-bot committed Jan 31, 2024
1 parent 280fc86 commit fbbeb56
Show file tree
Hide file tree
Showing 6 changed files with 279 additions and 6 deletions.
10 changes: 8 additions & 2 deletions pyvelox/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,14 @@ if(VELOX_BUILD_PYTHON_PACKAGE)
include_directories(SYSTEM ${CMAKE_SOURCE_DIR})
add_definitions(-DCREATE_PYVELOX_MODULE -DVELOX_DISABLE_GOOGLETEST)
# Define our Python module:
pybind11_add_module(pyvelox MODULE pyvelox.cpp serde.cpp signatures.cpp
conversion.cpp)
pybind11_add_module(
pyvelox
MODULE
pyvelox.cpp
serde.cpp
signatures.cpp
complex.cpp
conversion.cpp)
# Link with Velox:
target_link_libraries(
pyvelox
Expand Down
181 changes: 181 additions & 0 deletions pyvelox/complex.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "complex.h"
#include "velox/vector/ComplexVector.h"

#include <functional>

namespace facebook::velox::py {

using namespace velox;
namespace py = pybind11;

namespace {
// Bookkeeping for one nesting level of a vector that is being built.
//  - insertedElements: how many slots of this level have been written so far;
//    it is also used as the index of the next slot to write.
//  - totalElements: how many slots this level needs in total, as counted
//    while the type is being inferred.
//  - children: counters for the nested levels. The array code paths use
//    children[0] for the elements vector of an array.
struct ElementCounter {
  vector_size_t insertedElements{0};
  vector_size_t totalElements{0};
  std::vector<ElementCounter> children;
};
} // namespace

// Reconciles the type inferred so far (`type`) with the type of the value
// currently being inspected (`expected_type`).
//  - While `type` is still UNKNOWN it simply adopts `expected_type`.
//  - Otherwise both must agree according to Type::kindEquals; a mismatch is
//    reported as py::type_error.
void checkOrAssignType(TypePtr& type, const TypePtr& expected_type) {
  const bool notYetInferred = type->kind() == TypeKind::UNKNOWN;
  if (notYetInferred) {
    type = expected_type;
    return;
  }

  if (type->kindEquals(expected_type)) {
    return;
  }

  throw py::type_error(
      "Cannot construct type tree, invalid variant for complex type");
}

// Stores the scalar held by `v` into slot `idx` of `vector`, treating `vector`
// as a flat vector of the native C++ type that corresponds to `Kind`.
// NOTE(review): the result of asFlatVector is used unchecked, so the caller is
// presumably expected to pass a flat vector of the matching kind.
template <TypeKind Kind>
void setElementInFlatVector(
    vector_size_t idx,
    const variant& v,
    VectorPtr& vector) {
  using T = typename TypeTraits<Kind>::NativeType;
  auto* flat = vector->asFlatVector<T>();
  const T nativeValue{v.value<T>()};
  flat->set(idx, nativeValue);
}

// This function determines the type and the number of elements for a variant.
// Takes reference to Type and ElementCounter which will be set after the run.
// It is supposed to run a recursive call with a pre-instantiated TypePtr,
// the target variant and the counter. The passed variant is checked for its
// data type, and for any complex type involved, the function is called again.
// The counter here is used to keep in track of the number of elements inserted
// and the number of types of elements allowed if a complex vector is involved
// in the variant.
void constructType(const variant& v, TypePtr& type, ElementCounter& counter) {
++counter.totalElements;

if (v.isNull()) {
// since the variant is NULL, we can't infer the data type
// thus it maybe UNKNOWN or INVALID at this stage
// which implies further investigation is required
if (v.kind() != TypeKind::UNKNOWN && v.kind() != TypeKind::INVALID &&
v.kind() != type->kind()) {
throw std::invalid_argument("Variant was of an unexpected kind");
}
return;
} else {
// if a Non-Null variant's type is unknown or not one of the valid
// types which are supported then the Type tree cannot be constructed
if (v.kind() == TypeKind::UNKNOWN || v.kind() == TypeKind::INVALID) {
throw std::invalid_argument(
"Non-null variant has unknown or invalid kind");
}

switch (v.kind()) {
case TypeKind::ARRAY: {
counter.children.resize(1);
auto asArray = v.array();
TypePtr childType = createType(TypeKind::UNKNOWN, {});
for (const auto& element : asArray) {
constructType(element, childType, counter.children[0]);
}

// if child's type still remains Unknown, implies all the
// elements in the array are actually NULL
if (childType->kind() == TypeKind::UNKNOWN) {
throw py::value_error("Cannot construct array with all None values");
}
checkOrAssignType(type, createType<TypeKind::ARRAY>({childType}));
break;
}

default: {
checkOrAssignType(type, createScalarType(v.kind()));
break;
}
}
}
}

// Function is called with the variant to be added,
// the target vector and the element counter. The element counter
// is used to track the number of elements already inserted, so as
// to get the index for the next element to insert. For an array
// vector, the required offset and size is first set into the vector
// then the function is called recursively for the contained elements.
// In the default case where the variant is a scalar type, the
// setElementInFlatVector is called without any further recursion.
static void insertVariantIntoVector(
const variant& v,
VectorPtr& vector,
ElementCounter& counter,
vector_size_t previous_size,
vector_size_t previous_offset) {
if (v.isNull()) {
vector->setNull(counter.insertedElements, true);
} else {
switch (v.kind()) {
case TypeKind::ARRAY: {
auto asArray = vector->as<ArrayVector>();
asArray->elements()->resize(counter.children[0].totalElements);
const std::vector<variant>& elements = v.array();
vector_size_t offset = previous_offset + previous_size;
vector_size_t size = elements.size();
asArray->setOffsetAndSize(counter.insertedElements, offset, size);
for (const variant& elt : elements) {
insertVariantIntoVector(
elt, asArray->elements(), counter.children[0], offset, size);
}

break;
}
default: {
VELOX_DYNAMIC_SCALAR_TYPE_DISPATCH(
setElementInFlatVector,
v.kind(),
counter.insertedElements,
v,
vector);
break;
}
}
}
counter.insertedElements += 1;
}

// Converts `variants` into a single vector allocated from `pool`, in two
// passes over the input:
//  1. constructType infers the common type and counts the slots needed at
//     every nesting level;
//  2. a vector of that type with one row per variant is created and
//     insertVariantIntoVector fills it row by row.
// Errors raised by either pass propagate to the caller.
VectorPtr variantsToVector(
    const std::vector<variant>& variants,
    velox::memory::MemoryPool* pool) {
  ElementCounter rootCounter;
  TypePtr inferredType = createType(TypeKind::UNKNOWN, {});

  // Pass 1: type inference and element counting.
  for (const auto& item : variants) {
    constructType(item, inferredType, rootCounter);
  }

  // Pass 2: allocate the result and populate it.
  VectorPtr resultVector =
      BaseVector::create(std::move(inferredType), variants.size(), pool);
  for (const auto& item : variants) {
    insertVariantIntoVector(
        item,
        resultVector,
        rootCounter,
        /*previous_size*/ 0,
        /*previous_offset*/ 0);
  }

  return resultVector;
}

} // namespace facebook::velox::py
29 changes: 29 additions & 0 deletions pyvelox/complex.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <pybind11/pybind11.h>

#include "velox/vector/FlatVector.h"

namespace facebook::velox::py {

// Builds a Velox vector with one row per entry of `variants`, allocated from
// `pool`. The vector's type is inferred from the variants themselves; the
// implementation in complex.cpp handles scalar values and (nested) arrays and
// raises an error when the entries do not agree on a single type or when an
// array provides no element type (all of its elements are null).
VectorPtr variantsToVector(
const std::vector<variant>& variants,
velox::memory::MemoryPool* pool);

}
5 changes: 4 additions & 1 deletion pyvelox/pyvelox.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
*/

#include "pyvelox.h"
#include "complex.h"
#include "conversion.h"
#include "serde.h"
#include "signatures.h"
Expand Down Expand Up @@ -143,6 +144,8 @@ static VectorPtr pyListToVector(
if (first_kind == velox::TypeKind::INVALID) {
throw py::value_error(
"Can't create a Velox vector consisting of only None");
} else if (first_kind == velox::TypeKind::ARRAY) {
return variantsToVector(variants, pool);
}

return VELOX_DYNAMIC_SCALAR_TYPE_DISPATCH(
Expand All @@ -164,7 +167,7 @@ static VectorPtr pyListToVector(
}

template <typename NativeType>
static py::object getItemFromSimpleVector(
inline py::object getItemFromSimpleVector(
SimpleVectorPtr<NativeType>& vector,
vector_size_t idx) {
checkBounds(vector, idx);
Expand Down
20 changes: 17 additions & 3 deletions pyvelox/pyvelox.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
#include <velox/vector/DictionaryVector.h>
#include <velox/vector/FlatVector.h>
#include "folly/json.h"
#include "velox/vector/VariantToVector.h"

#include "context.h"

Expand Down Expand Up @@ -68,6 +69,13 @@ inline velox::variant pyToVariant(const py::handle& obj) {
return pyToVariant<velox::TypeKind::DOUBLE>(obj);
} else if (py::isinstance<py::str>(obj)) {
return pyToVariant<velox::TypeKind::VARCHAR>(obj);
} else if (py::isinstance<py::list>(obj)) {
py::list objAsList = py::cast<py::list>(obj);
std::vector<velox::variant> result;
for (auto& item : objAsList) {
result.push_back(pyToVariant(item));
}
return velox::variant::array(std::move(result));
} else {
throw py::type_error("Invalid type of object");
}
Expand Down Expand Up @@ -175,13 +183,13 @@ static VectorPtr createDictionaryVector(
}

template <typename NativeType>
static py::object getItemFromSimpleVector(
SimpleVectorPtr<NativeType>& vector,
inline py::object getItemFromSimpleVector(
SimpleVectorPtr<NativeType>& v,
vector_size_t idx);

template <typename NativeType>
inline void setItemInFlatVector(
FlatVectorPtr<NativeType>& vector,
FlatVectorPtr<NativeType>& v,
vector_size_t idx,
py::handle& obj);

Expand Down Expand Up @@ -464,6 +472,12 @@ static void addVectorBindings(
py::arg("stop"),
py::arg("step") = 1);

py::class_<ArrayVector, ArrayVectorPtr, BaseVector>(
m, "ArrayVector", py::module_local(asModuleLocalDefinitions))
.def("elements", [](ArrayVectorPtr vec) -> VectorPtr {
return vec->elements();
});

constexpr TypeKind supportedTypes[] = {
TypeKind::BOOLEAN,
TypeKind::TINYINT,
Expand Down
40 changes: 40 additions & 0 deletions pyvelox/test/test_vector.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,46 @@ def test_dictionary_encoding(self):
pv.dictionary_vector(pv.from_list([1, 2, 3]), [1, 2, 1000000])
pv.dictionary_vector(pv.from_list([1, 2, 3]), [0, -1, -2])

def test_array_vector(self):
    # Flat list of lists: the child vector holds every element back to back.
    v1 = pv.from_list([[1, 2, 3], [1, 2, 3]])
    self.assertIsInstance(v1, pv.ArrayVector)
    v1_elements = v1.elements()
    self.assertIsInstance(v1_elements, pv.FlatVector_BIGINT)
    self.assertEqual(len(v1), 2)
    expected_flat = [1, 2, 3, 1, 2, 3]
    self.assertEqual(len(expected_flat), len(v1_elements))
    for i, expected in enumerate(expected_flat):
        self.assertEqual(expected, v1_elements[i])

    # Rows of different lengths, with a None among the elements.
    v2 = pv.from_list([[1], [1, 2, None]])
    self.assertIsInstance(v2, pv.ArrayVector)
    v2_elements = v2.elements()
    self.assertIsInstance(v2_elements, pv.FlatVector_BIGINT)
    self.assertEqual(len(v2), 2)
    expected_flat = [1, 1, 2, None]
    self.assertEqual(len(v2_elements), len(expected_flat))
    for i, expected in enumerate(expected_flat):
        self.assertEqual(expected, v2_elements[i])

    # Two levels of nesting: the outer elements are themselves an ArrayVector.
    doubleNested = pv.from_list([[[1, 2], [3, None]], [[1], [2]]])
    self.assertIsInstance(doubleNested, pv.ArrayVector)
    self.assertIsInstance(doubleNested.elements(), pv.ArrayVector)
    self.assertEqual(len(doubleNested), 2)
    elements = doubleNested.elements().elements()
    self.assertIsInstance(elements, pv.FlatVector_BIGINT)
    self.assertEqual(len(elements), 6)
    expected_firstElements = [1, 2, 3, None, 1, 2]
    self.assertEqual(len(elements), len(expected_firstElements))
    for i, expected in enumerate(expected_firstElements):
        self.assertEqual(expected, elements[i])

    # Sibling arrays must agree on their element type.
    with self.assertRaises(TypeError):
        pv.from_list([[[1, 2], [3, 4]], [[1.1], [2.3]]])

    # Arrays made only of None give no element type to infer.
    with self.assertRaises(ValueError):
        pv.from_list([[None], [None, None, None]])

    with self.assertRaises(TypeError):
        pv.from_list([[[1, 2], [3, 4]], [["hello"], ["world"]]])

def test_to_string(self):
self.assertEqual(
str(pv.from_list([1, 2, 3])),
Expand Down

0 comments on commit fbbeb56

Please sign in to comment.