From 9e97c6708fdc3c7f52420992184b1bfdbd2c515f Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Thu, 6 Feb 2025 18:39:04 -0800 Subject: [PATCH 1/6] Eliminate IO from bloom_filter::add benchmark --- benchmarks/bloom_filter/add_bench.cu | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/benchmarks/bloom_filter/add_bench.cu b/benchmarks/bloom_filter/add_bench.cu index 8b502d0d5..5d3d54b05 100644 --- a/benchmarks/bloom_filter/add_bench.cu +++ b/benchmarks/bloom_filter/add_bench.cu @@ -26,7 +26,7 @@ #include #include -#include +#include #include #include @@ -61,10 +61,7 @@ void bloom_filter_add(nvbench::state& state, (filter_size_mb * 1024 * 1024) / (sizeof(typename filter_type::word_type) * filter_type::words_per_block); - thrust::device_vector keys(num_keys); - - key_generator gen; - gen.generate(dist_from_state(state), keys.begin(), keys.end()); + thrust::counting_iterator keys(0); state.add_element_count(num_keys); @@ -79,7 +76,7 @@ void bloom_filter_add(nvbench::state& state, add_fpr_summary(state, filter); state.exec([&](nvbench::launch& launch) { - filter.add_async(keys.begin(), keys.end(), {launch.get_stream()}); + filter.add_async(keys, keys + num_keys, {launch.get_stream()}); }); } @@ -106,10 +103,7 @@ void arrow_bloom_filter_add(nvbench::state& state, nvbench::type_list // configurations } - thrust::device_vector keys(num_keys); - - key_generator gen; - gen.generate(dist_from_state(state), keys.begin(), keys.end()); + thrust::counting_iterator keys(0); state.add_element_count(num_keys); @@ -124,7 +118,7 @@ void arrow_bloom_filter_add(nvbench::state& state, nvbench::type_list add_fpr_summary(state, filter); state.exec([&](nvbench::launch& launch) { - filter.add_async(keys.begin(), keys.end(), {launch.get_stream()}); + filter.add_async(keys, keys + num_keys, {launch.get_stream()}); }); } From e2bb17923a220bcf93c892c52d50ea34747b52b7 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Fri, 7 Feb 2025 15:43:28 -0800 Subject: [PATCH 2/6] Don't read benchmark input data from gmem --- benchmarks/bloom_filter/add_bench.cu | 9 +------- benchmarks/bloom_filter/contains_bench.cu | 28 +++++++---------------- 2 files changed, 9 insertions(+), 28 deletions(-) diff --git a/benchmarks/bloom_filter/add_bench.cu b/benchmarks/bloom_filter/add_bench.cu index 5d3d54b05..72322bc21 100644 --- a/benchmarks/bloom_filter/add_bench.cu +++ b/benchmarks/bloom_filter/add_bench.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,7 +21,6 @@ #include #include -#include #include @@ -68,10 +67,7 @@ void bloom_filter_add(nvbench::state& state, filter_type filter{num_sub_filters, {}, {static_cast(pattern_bits)}}; state.collect_dram_throughput(); - state.collect_l1_hit_rates(); state.collect_l2_hit_rates(); - state.collect_loads_efficiency(); - state.collect_stores_efficiency(); add_fpr_summary(state, filter); @@ -110,10 +106,7 @@ void arrow_bloom_filter_add(nvbench::state& state, nvbench::type_list filter_type filter{num_sub_filters}; state.collect_dram_throughput(); - state.collect_l1_hit_rates(); state.collect_l2_hit_rates(); - state.collect_loads_efficiency(); - state.collect_stores_efficiency(); add_fpr_summary(state, filter); diff --git a/benchmarks/bloom_filter/contains_bench.cu b/benchmarks/bloom_filter/contains_bench.cu index 3d2ed1e54..1eae4e13f 100644 --- a/benchmarks/bloom_filter/contains_bench.cu +++ b/benchmarks/bloom_filter/contains_bench.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,12 +21,12 @@ #include #include -#include #include #include #include +#include #include @@ -63,28 +63,22 @@ void bloom_filter_contains( (filter_size_mb * 1024 * 1024) / (sizeof(typename filter_type::word_type) * filter_type::words_per_block); - thrust::device_vector keys(num_keys); + thrust::counting_iterator keys(0); thrust::device_vector result(num_keys, false); - key_generator gen; - gen.generate(dist_from_state(state), keys.begin(), keys.end()); - state.add_element_count(num_keys); filter_type filter{num_sub_filters, {}, {static_cast(pattern_bits)}}; state.collect_dram_throughput(); - state.collect_l1_hit_rates(); state.collect_l2_hit_rates(); - state.collect_loads_efficiency(); - state.collect_stores_efficiency(); add_fpr_summary(state, filter); - filter.add(keys.begin(), keys.end()); + filter.add(keys, keys + num_keys); state.exec([&](nvbench::launch& launch) { - filter.contains_async(keys.begin(), keys.end(), result.begin(), {launch.get_stream()}); + filter.contains_async(keys, keys + num_keys, result.begin(), {launch.get_stream()}); }); } @@ -113,28 +107,22 @@ void arrow_bloom_filter_contains(nvbench::state& state, nvbench::type_list keys(num_keys); + thrust::counting_iterator keys(0); thrust::device_vector result(num_keys, false); - key_generator gen; - gen.generate(dist_from_state(state), keys.begin(), keys.end()); - state.add_element_count(num_keys); filter_type filter{num_sub_filters}; state.collect_dram_throughput(); - state.collect_l1_hit_rates(); state.collect_l2_hit_rates(); - state.collect_loads_efficiency(); - state.collect_stores_efficiency(); add_fpr_summary(state, filter); - filter.add(keys.begin(), keys.end()); + filter.add(keys, keys + num_keys); state.exec([&](nvbench::launch& launch) { - filter.contains_async(keys.begin(), keys.end(), result.begin(), {launch.get_stream()}); + filter.contains_async(keys, keys + num_keys, result.begin(), {launch.get_stream()}); }); } From 15ecc9363d3cb9c30c7e531871db939d6d47c909 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Tue, 11 Feb 2025 16:47:35 -0800 Subject: [PATCH 3/6] Increase benchmark input to reduce noise in measurements --- benchmarks/bloom_filter/defaults.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/bloom_filter/defaults.hpp b/benchmarks/bloom_filter/defaults.hpp index 67f3cf6ff..8ca9d711c 100644 --- a/benchmarks/bloom_filter/defaults.hpp +++ b/benchmarks/bloom_filter/defaults.hpp @@ -30,7 +30,7 @@ using BF_KEY = nvbench::int64_t; using BF_HASH = cuco::xxhash_64; using BF_WORD = nvbench::uint32_t; -static constexpr auto BF_N = 400'000'000; +static constexpr auto BF_N = 1'000'000'000; static constexpr auto BF_SIZE_MB = 2'000; static constexpr auto BF_WORDS_PER_BLOCK = 8; From 8e3caf9d99adf207ece96dd703a9a179a2ab3a00 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Wed, 12 Feb 2025 07:18:21 -0800 Subject: [PATCH 4/6] Rename hash_value_type -> hash_result_type --- .../detail/bloom_filter/arrow_filter_policy.cuh | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/include/cuco/detail/bloom_filter/arrow_filter_policy.cuh b/include/cuco/detail/bloom_filter/arrow_filter_policy.cuh index bfe97cfaf..c7b7b2fb0 100644 --- a/include/cuco/detail/bloom_filter/arrow_filter_policy.cuh +++ b/include/cuco/detail/bloom_filter/arrow_filter_policy.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -83,10 +83,10 @@ namespace cuco::detail { template class XXHash64> class arrow_filter_policy { public: - using hasher = XXHash64; ///< 64-bit XXHash hasher for Arrow bloom filter policy - using word_type = std::uint32_t; ///< uint32_t for Arrow bloom filter policy - using key_type = Key; ///< Hash function input type - using hash_value_type = std::uint64_t; ///< hash function output type + using hasher = XXHash64; ///< 64-bit XXHash hasher for Arrow bloom filter policy + using word_type = std::uint32_t; ///< uint32_t for Arrow bloom filter policy + using key_type = Key; ///< Hash function input type + using hash_result_type = std::uint64_t; ///< hash function output type static constexpr uint32_t bits_set_per_block = 8; ///< hardcoded bits set per Arrow filter block static constexpr uint32_t words_per_block = 8; ///< hardcoded words per Arrow filter block @@ -133,7 +133,7 @@ class arrow_filter_policy { * * @return The hash value of the key */ - __device__ constexpr hash_value_type hash(key_type const& key) const { return hash_(key); } + __device__ constexpr hash_result_type hash(key_type const& key) const { return hash_(key); } /** * @brief Determines the filter block a key is added into. @@ -150,7 +150,7 @@ class arrow_filter_policy { * @return The block index for the given key's hash value */ template - __device__ constexpr auto block_index(hash_value_type hash, Extent num_blocks) const + __device__ constexpr auto block_index(hash_result_type hash, Extent num_blocks) const { constexpr auto hash_bits = cuda::std::numeric_limits::digits; // TODO: assert if num_blocks > max_filter_blocks @@ -168,7 +168,7 @@ class arrow_filter_policy { * * @return The bit pattern for the word/segment in the filter block */ - __device__ constexpr word_type word_pattern(hash_value_type hash, std::uint32_t word_index) const + __device__ constexpr word_type word_pattern(hash_result_type hash, std::uint32_t word_index) const { // SALT array to calculate bit indexes for the current word auto constexpr salt = SALT(); From a4b91cb4d5524adab4465a2bec34ed583454d37e Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Wed, 12 Feb 2025 07:22:43 -0800 Subject: [PATCH 5/6] Eliminate lmem access during salt lookup --- .../bloom_filter/arrow_filter_policy.cuh | 42 +++++++++++-------- 1 file changed, 24 insertions(+), 18 deletions(-) diff --git a/include/cuco/detail/bloom_filter/arrow_filter_policy.cuh b/include/cuco/detail/bloom_filter/arrow_filter_policy.cuh index c7b7b2fb0..2f17fa726 100644 --- a/include/cuco/detail/bloom_filter/arrow_filter_policy.cuh +++ b/include/cuco/detail/bloom_filter/arrow_filter_policy.cuh @@ -99,21 +99,6 @@ class arrow_filter_policy { (max_arrow_filter_bytes / bytes_per_filter_block); ///< Max sub-filter blocks allowed in Arrow bloom filter - private: - // Arrow's block-based bloom filter algorithm needs these eight odd SALT values to calculate - // eight indexes of bit to set, one bit in each 32-bit (uint32_t) word. - __device__ static constexpr cuda::std::array SALT() - { - return {0x47b6137bU, - 0x44974d91U, - 0x8824ad5bU, - 0xa2b7289dU, - 0x705495c7U, - 0x2df1424bU, - 0x9efc4947U, - 0x5c6bfb31U}; - } - public: /** * @brief Constructs the `arrow_filter_policy` object. @@ -170,10 +155,31 @@ class arrow_filter_policy { */ __device__ constexpr word_type word_pattern(hash_result_type hash, std::uint32_t word_index) const { - // SALT array to calculate bit indexes for the current word - auto constexpr salt = SALT(); word_type const key = static_cast(hash); - return word_type{1} << ((key * salt[word_index]) >> 27); + std::uint32_t salt; + + // Basically a switch (word_index) { case 0-7 ... } + // First split: 0..3 versus 4..7. + if (word_index < 4) { + // For indices 0..3, further split into 0..1 and 2..3. + if (word_index < 2) { + // word_index is 0 or 1. + salt = (word_index == 0) ? 0x47b6137bU : 0x44974d91U; + } else { + // word_index is 2 or 3. + salt = (word_index == 2) ? 0x8824ad5bU : 0xa2b7289dU; + } + } else { + // For indices 4..7, further split into 4..5 and 6..7. + if (word_index < 6) { + // word_index is 4 or 5. + salt = (word_index == 4) ? 0x705495c7U : 0x2df1424bU; + } else { + // word_index is 6 or 7. + salt = (word_index == 6) ? 0x9efc4947U : 0x5c6bfb31U; + } + } + return word_type{1} << ((key * salt) >> 27); } private: From a51e7e6911f3ace88049bd0db7824c371734dfd4 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Wed, 12 Feb 2025 15:45:01 -0800 Subject: [PATCH 6/6] Update copyright year --- benchmarks/bloom_filter/defaults.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/bloom_filter/defaults.hpp b/benchmarks/bloom_filter/defaults.hpp index 8ca9d711c..f1b192aa3 100644 --- a/benchmarks/bloom_filter/defaults.hpp +++ b/benchmarks/bloom_filter/defaults.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License.