Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adds load balanced kernel for point-point, point-linestring and linestring-linestring #1144

Draft
wants to merge 33 commits into
base: branch-23.08
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 31 commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
ac872e7
[skip ci] initial segment_counter working
isVoid May 11, 2023
84f317c
segment iterator works with empty linestrings
isVoid May 16, 2023
ce28b3c
enable polygon segment count test
isVoid May 16, 2023
cd2d229
add polygon segment methods
isVoid May 16, 2023
fcbe105
remove incomplete validations and add implicit assumptions in documen…
isVoid May 17, 2023
ee3576e
enable multipolygon range tests
isVoid May 17, 2023
7a53d05
improve doc of validators
isVoid May 18, 2023
2181e96
replace all multilinestring range tests with segment method test
isVoid May 18, 2023
5e3a45a
updates multipolygon tests to make use the new segment methods view
isVoid May 18, 2023
53e3ce0
remove segment methods from ranges and update all usage to `segment_m…
isVoid May 18, 2023
789f7a2
Cleanup: Remove all debug prints
isVoid May 18, 2023
2390acc
Cleanup: Remove segment functions in multilinestring_range and dead c…
isVoid May 18, 2023
728caa4
remove num_segments
isVoid May 18, 2023
5df7ea0
rename segment_methods -> multilinestring_segment
isVoid May 18, 2023
e030449
Updates segment_range and tests to make sure APIs exposed are all mul…
isVoid May 19, 2023
771f340
Refactors `linestring_polygon_distance` and add tests to include empt…
isVoid May 19, 2023
81e05bd
Move all segment range only functors to local file
isVoid May 19, 2023
9d00b41
Documentation for segment_range
isVoid May 19, 2023
f6255b8
typo
isVoid May 19, 2023
9ec9882
Merge branch 'branch-23.06' into fix/segment_iterator
isVoid May 19, 2023
0f151a5
Cleanups & doc improvement
isVoid May 19, 2023
57a2327
Merge branch 'fix/segment_iterator' of github.com:isVoid/cuspatial in…
isVoid May 19, 2023
a45f2c7
[skip ci] initial addition of the load balanced kernel
isVoid May 19, 2023
e6cd5d8
Merge branch 'fix/segment_iterator' into improvement/load_balanced_di…
isVoid May 22, 2023
8475fe8
Move detail iterator factory into detail folder
isVoid May 22, 2023
c463978
Remove functors.cuh, make iterator factory for it.
isVoid May 22, 2023
b442f7c
Remove every functors usage
isVoid May 22, 2023
8d76d0a
Impelement load balanced linestring distance
isVoid May 22, 2023
242987d
Merge branch 'branch-23.06' into improvement/load_balanced_distance_k…
isVoid May 26, 2023
64b638f
Merge branch 'branch-23.06' of https://github.com/rapidsai/cuspatial …
isVoid May 30, 2023
3621efd
optimize load_balanced_kernel
isVoid May 30, 2023
4633224
minor updates to balanced kernel
isVoid May 30, 2023
b8099d7
Merge branch 'improvement/load_balanced_distance_kernel' of github.co…
isVoid May 30, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 70 additions & 11 deletions cpp/include/cuspatial/detail/algorithm/linestring_distance.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,13 @@

#include <rmm/device_uvector.hpp>

#include <thrust/binary_search.h>
#include <thrust/optional.h>

#include <cub/cub.cuh>

#include <cooperative_groups.h>

namespace cuspatial {
namespace detail {

Expand All @@ -42,10 +47,10 @@ namespace detail {
* set to nullopt, no distance computation will be bypassed.
*/
template <class MultiLinestringRange1, class MultiLinestringRange2, class OutputIt>
__global__ void linestring_distance_multilinestring_loop(MultiLinestringRange1 multilinestrings1,
MultiLinestringRange2 multilinestrings2,
thrust::optional<uint8_t*> intersects,
OutputIt distances_first)
__global__ void linestring_distance(MultiLinestringRange1 multilinestrings1,
MultiLinestringRange2 multilinestrings2,
thrust::optional<uint8_t*> intersects,
OutputIt distances_first)
{
using T = typename MultiLinestringRange1::element_t;

Expand All @@ -72,6 +77,15 @@ __global__ void linestring_distance_multilinestring_loop(MultiLinestringRange1 m
}
}

template<typename BoundsIterator, typename IndexType>
auto __device__ compute_geometry_id(BoundsIterator bounds_begin, BoundsIterator bounds_end, IndexType idx)
{
auto it = thrust::prev(
thrust::upper_bound(thrust::seq, bounds_begin, bounds_end, idx));
auto const geometry_id = thrust::distance(bounds_begin, it);
return thrust::make_pair(it, geometry_id);
}

/**
* @internal
* @brief The kernel to compute (multi)linestring to (multi)linestring distance
Expand All @@ -83,17 +97,62 @@ __global__ void linestring_distance_multilinestring_loop(MultiLinestringRange1 m
* `i`th output should be set to 0 and bypass distance computation. This argument is optional, if
* set to nullopt, no distance computation will be bypassed.
*/
template <class MultiLinestringRange1, class MultiLinestringRange2, class OutputIt>
__global__ void linestring_distance_load_balanced(MultiLinestringRange1 multilinestrings1,
MultiLinestringRange2 multilinestrings2,
template <typename T, int block_size, class SegmentRange1, class SegmentRange2, class IndexRange, class OutputIt>
__global__ void linestring_distance_load_balanced(SegmentRange1 multilinestrings1,
SegmentRange2 multilinestrings2,
IndexRange thread_bounds,
thrust::optional<uint8_t*> intersects,
OutputIt distances_first)
OutputIt distances)
{
using T = typename MultiLinestringRange1::element_t;
using index_t = typename IndexRange::value_type;

for (auto idx = threadIdx.x + blockIdx.x * blockDim.x; idx < multilinestrings1.num_points();
auto num_segment_pairs = thread_bounds[thread_bounds.size() - 1];
for (auto idx = threadIdx.x + blockIdx.x * blockDim.x; idx < num_segment_pairs;
idx += gridDim.x * blockDim.x) {
atomicMin(&distances_first[geometry_idx], static_cast<T>(sqrt(min_distance_squared)));

auto [it, geometry_id] = compute_geometry_id(thread_bounds.begin(), thread_bounds.end(), idx);

auto first_thread_of_this_block = blockDim.x * blockIdx.x;
auto last_thread_of_block = blockDim.x * (blockIdx.x + 1) - 1;
auto first_thread_of_next_geometry = thread_bounds[geometry_id + 1];

bool split_block = first_thread_of_this_block < first_thread_of_next_geometry && first_thread_of_next_geometry <= last_thread_of_block;

if (intersects.has_value() && intersects.value()[geometry_id]) {
distances[geometry_id] = 0.0f;
continue;
}

auto const local_idx = idx - *it;
// Retrieve the number of segments in multilinestrings[geometry_id]
auto num_segment_this_multilinestring =
multilinestrings1.multigeometry_count_begin()[geometry_id];
// The segment id from multilinestring1 this thread is computing (local_id + global_offset)
auto multilinestring1_segment_id = local_idx % num_segment_this_multilinestring +
multilinestrings1.multigeometry_offset_begin()[geometry_id];

// The segment id from multilinestring2 this thread is computing (local_id + global_offset)
auto multilinestring2_segment_id = local_idx / num_segment_this_multilinestring +
multilinestrings2.multigeometry_offset_begin()[geometry_id];

auto [a, b] = multilinestrings1.begin()[multilinestring1_segment_id];
auto [c, d] = multilinestrings2.begin()[multilinestring2_segment_id];

auto partial = sqrt(squared_segment_distance(a, b, c, d));

if (split_block)
atomicMin(&distances[geometry_id], partial);
else
{
// block reduce
typedef cub::BlockReduce<T, block_size> BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storage;
auto aggregate = BlockReduce(temp_storage).Reduce(partial, cub::Min());

// atmomic with leading thread
if (cooperative_groups::this_thread_block().thread_rank() == 0)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there any advantage to this over if (threadIdx.x == 0)? What code does this produce? It's impossible for threadIdx.x == 0 to have exited before any other threads in its block in this code.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Cannot think of any advantage comparing to threadIdx.x == 0. I originally had hopped writing modularized kernel with cg but but later regressed. This is probably a remnant from prototyping.

atomicMin(&distances[geometry_id], aggregate);
}
}
}

Expand Down
41 changes: 41 additions & 0 deletions cpp/include/cuspatial/detail/distance/distance_utils.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
#include <thrust/iterator/discard_iterator.h>
#include <thrust/logical.h>
#include <thrust/tabulate.h>
#include <thrust/zip_function.h>

namespace cuspatial {
namespace detail {
Expand Down Expand Up @@ -115,5 +116,45 @@ rmm::device_uvector<uint8_t> point_polygon_intersects(MultiPointRange multipoint
return multipoint_intersects;
}

/**
* @brief Compute the thread bound between two ranges of partitions
*
* @tparam CountIterator1
* @tparam CountIterator2
* @param lhs
* @param rhs
* @param stream
* @return rmm::device_uvector<IndexType>
*/
template <typename CountIterator1,
typename CountIterator2,
typename index_t = iterator_value_type<CountIterator1>>
rmm::device_uvector<index_t> compute_segment_thread_bounds(CountIterator1 lhs_begin,
CountIterator1 lhs_end,
CountIterator2 rhs_begin,
rmm::cuda_stream_view stream)
{
auto size = thrust::distance(lhs_begin, lhs_end) + 1;

// Compute the "boundary" of threads. Threads are partitioned based on the number of linestrings
// times the number of polygons in a multilinestring-multipolygon pair.
auto segment_count_product_it =
thrust::make_transform_iterator(thrust::make_zip_iterator(lhs_begin, rhs_begin),
thrust::make_zip_function(thrust::multiplies<index_t>{}));

// Computes the "thread boundary" of each pair. This array partitions the thread range by
// geometries. E.g. threadIdx within [thread_bounds[i], thread_bounds[i+1]) computes distances of
// the ith pair.
auto thread_bounds = rmm::device_uvector<index_t>(size, stream);
detail::zero_data_async(thread_bounds.begin(), thread_bounds.end(), stream);

thrust::inclusive_scan(rmm::exec_policy(stream),
segment_count_product_it,
segment_count_product_it + thread_bounds.size() - 1,
thrust::next(thread_bounds.begin()));

return thread_bounds;
}

} // namespace detail
} // namespace cuspatial
42 changes: 31 additions & 11 deletions cpp/include/cuspatial/detail/distance/linestring_distance.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,11 @@
#pragma once

#include <cuspatial/detail/algorithm/linestring_distance.cuh>
#include <cuspatial/detail/distance/distance_utils.cuh>
#include <cuspatial/error.hpp>
#include <cuspatial/geometry/vec_2d.hpp>
#include <cuspatial/traits.hpp>
#include <cuspatial/range/range.cuh>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/exec_policy.hpp>
Expand All @@ -33,8 +35,8 @@
namespace cuspatial {

template <class MultiLinestringRange1, class MultiLinestringRange2, class OutputIt>
OutputIt pairwise_linestring_distance(MultiLinestringRange1 multilinestrings1,
MultiLinestringRange2 multilinestrings2,
OutputIt pairwise_linestring_distance(MultiLinestringRange1 lhs,
MultiLinestringRange2 rhs,
OutputIt distances_first,
rmm::cuda_stream_view stream)
{
Expand All @@ -48,25 +50,43 @@ OutputIt pairwise_linestring_distance(MultiLinestringRange1 multilinestrings1,
typename MultiLinestringRange2::point_t>(),
"All input types must be cuspatial::vec_2d with the same value type");

CUSPATIAL_EXPECTS(multilinestrings1.size() == multilinestrings2.size(),
"Inputs must have the same number of rows.");
CUSPATIAL_EXPECTS(lhs.size() == rhs.size(), "Inputs must have the same number of rows.");

if (multilinestrings1.size() == 0) return distances_first;
if (lhs.size() == 0) return distances_first;

// Make views to the segments in the multilinestring
auto lhs_segments = lhs._segments(stream);
auto lhs_segments_range = lhs_segments.view();

// Make views to the segments in the multilinestring
auto rhs_segments = rhs._segments(stream);
auto rhs_segments_range = rhs_segments.view();

auto thread_bounds =
detail::compute_segment_thread_bounds(lhs_segments_range.multigeometry_count_begin(),
lhs_segments_range.multigeometry_count_end(),
rhs_segments_range.multigeometry_count_begin(),
stream);
// Initialize the output range
thrust::fill(rmm::exec_policy(stream),
distances_first,
distances_first + multilinestrings1.size(),
distances_first + lhs.size(),
std::numeric_limits<T>::max());

std::size_t constexpr threads_per_block = 256;
std::size_t const num_blocks =
(multilinestrings1.num_points() + threads_per_block - 1) / threads_per_block;
std::size_t num_threads = 1e8;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Rather than hard coding, this may be a job for the occupancy API...
https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__OCCUPANCY.html

std::size_t const num_blocks = (num_threads + threads_per_block - 1) / threads_per_block;

detail::linestring_distance<<<num_blocks, threads_per_block, 0, stream.value()>>>(
multilinestrings1, multilinestrings2, thrust::nullopt, distances_first);
detail::linestring_distance_load_balanced<T, threads_per_block>
<<<num_blocks, threads_per_block, 0, stream.value()>>>(
lhs_segments_range,
rhs_segments_range,
range(thread_bounds.begin(), thread_bounds.end()),
thrust::nullopt,
distances_first);

CUSPATIAL_CUDA_TRY(cudaGetLastError());
return distances_first + multilinestrings1.size();
return distances_first + lhs.size();
}

} // namespace cuspatial
Loading