From 9e97c6708fdc3c7f52420992184b1bfdbd2c515f Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Thu, 6 Feb 2025 18:39:04 -0800
Subject: [PATCH 1/6] Eliminate IO from bloom_filter::add benchmark

---
 benchmarks/bloom_filter/add_bench.cu | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)
diff --git a/benchmarks/bloom_filter/add_bench.cu b/benchmarks/bloom_filter/add_bench.cu
index 8b502d0d5..5d3d54b05 100644
--- a/benchmarks/bloom_filter/add_bench.cu
+++ b/benchmarks/bloom_filter/add_bench.cu
@@ -26,7 +26,7 @@
 #include <nvbench/nvbench.cuh>
 
 #include <cuda/std/limits>
-#include <thrust/device_vector.h>
+#include <thrust/iterator/counting_iterator.h>
 
 #include <cstdint>
 #include <exception>
@@ -61,10 +61,7 @@ void bloom_filter_add(nvbench::state& state,
     (filter_size_mb * 1024 * 1024) /
     (sizeof(typename filter_type::word_type) * filter_type::words_per_block);
 
-  thrust::device_vector<Key> keys(num_keys);
-
-  key_generator gen;
-  gen.generate(dist_from_state<Dist>(state), keys.begin(), keys.end());
+  thrust::counting_iterator<Key> keys(0);
 
   state.add_element_count(num_keys);
 
@@ -79,7 +76,7 @@ void bloom_filter_add(nvbench::state& state,
   add_fpr_summary(state, filter);
 
   state.exec([&](nvbench::launch& launch) {
-    filter.add_async(keys.begin(), keys.end(), {launch.get_stream()});
+    filter.add_async(keys, keys + num_keys, {launch.get_stream()});
   });
 }
 
@@ -106,10 +103,7 @@ void arrow_bloom_filter_add(nvbench::state& state, nvbench::type_list<Key, Dist>
                                                                                  // configurations
   }
 
-  thrust::device_vector<Key> keys(num_keys);
-
-  key_generator gen;
-  gen.generate(dist_from_state<Dist>(state), keys.begin(), keys.end());
+  thrust::counting_iterator<Key> keys(0);
 
   state.add_element_count(num_keys);
 
@@ -124,7 +118,7 @@ void arrow_bloom_filter_add(nvbench::state& state, nvbench::type_list<Key, Dist>
   add_fpr_summary(state, filter);
 
   state.exec([&](nvbench::launch& launch) {
-    filter.add_async(keys.begin(), keys.end(), {launch.get_stream()});
+    filter.add_async(keys, keys + num_keys, {launch.get_stream()});
   });
 }
 

From e2bb17923a220bcf93c892c52d50ea34747b52b7 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Fri, 7 Feb 2025 15:43:28 -0800
Subject: [PATCH 2/6] Don't read benchmark input data from gmem

---
 benchmarks/bloom_filter/add_bench.cu      |  9 +-------
 benchmarks/bloom_filter/contains_bench.cu | 28 +++++++----------------
 2 files changed, 9 insertions(+), 28 deletions(-)

diff --git a/benchmarks/bloom_filter/add_bench.cu b/benchmarks/bloom_filter/add_bench.cu
index 5d3d54b05..72322bc21 100644
--- a/benchmarks/bloom_filter/add_bench.cu
+++ b/benchmarks/bloom_filter/add_bench.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,7 +21,6 @@
 #include <benchmark_utils.hpp>
 
 #include <cuco/bloom_filter.cuh>
-#include <cuco/utility/key_generator.cuh>
 
 #include <nvbench/nvbench.cuh>
 
@@ -68,10 +67,7 @@ void bloom_filter_add(nvbench::state& state,
   filter_type filter{num_sub_filters, {}, {static_cast<uint32_t>(pattern_bits)}};
 
   state.collect_dram_throughput();
-  state.collect_l1_hit_rates();
   state.collect_l2_hit_rates();
-  state.collect_loads_efficiency();
-  state.collect_stores_efficiency();
 
   add_fpr_summary(state, filter);
 
@@ -110,10 +106,7 @@ void arrow_bloom_filter_add(nvbench::state& state, nvbench::type_list<Key, Dist>
   filter_type filter{num_sub_filters};
 
   state.collect_dram_throughput();
-  state.collect_l1_hit_rates();
   state.collect_l2_hit_rates();
-  state.collect_loads_efficiency();
-  state.collect_stores_efficiency();
 
   add_fpr_summary(state, filter);
 
diff --git a/benchmarks/bloom_filter/contains_bench.cu b/benchmarks/bloom_filter/contains_bench.cu
index 3d2ed1e54..1eae4e13f 100644
--- a/benchmarks/bloom_filter/contains_bench.cu
+++ b/benchmarks/bloom_filter/contains_bench.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,12 +21,12 @@
 #include <benchmark_utils.hpp>
 
 #include <cuco/bloom_filter.cuh>
-#include <cuco/utility/key_generator.cuh>
 
 #include <nvbench/nvbench.cuh>
 
 #include <cuda/std/limits>
 #include <thrust/device_vector.h>
+#include <thrust/iterator/counting_iterator.h>
 
 #include <exception>
 
@@ -63,28 +63,22 @@ void bloom_filter_contains(
     (filter_size_mb * 1024 * 1024) /
     (sizeof(typename filter_type::word_type) * filter_type::words_per_block);
 
-  thrust::device_vector<Key> keys(num_keys);
+  thrust::counting_iterator<Key> keys(0);
   thrust::device_vector<bool> result(num_keys, false);
 
-  key_generator gen;
-  gen.generate(dist_from_state<Dist>(state), keys.begin(), keys.end());
-
   state.add_element_count(num_keys);
 
   filter_type filter{num_sub_filters, {}, {static_cast<uint32_t>(pattern_bits)}};
 
   state.collect_dram_throughput();
-  state.collect_l1_hit_rates();
   state.collect_l2_hit_rates();
-  state.collect_loads_efficiency();
-  state.collect_stores_efficiency();
 
   add_fpr_summary(state, filter);
 
-  filter.add(keys.begin(), keys.end());
+  filter.add(keys, keys + num_keys);
 
   state.exec([&](nvbench::launch& launch) {
-    filter.contains_async(keys.begin(), keys.end(), result.begin(), {launch.get_stream()});
+    filter.contains_async(keys, keys + num_keys, result.begin(), {launch.get_stream()});
   });
 }
 
@@ -113,28 +107,22 @@ void arrow_bloom_filter_contains(nvbench::state& state, nvbench::type_list<Key,
                                                                                  // configurations
   }
 
-  thrust::device_vector<Key> keys(num_keys);
+  thrust::counting_iterator<Key> keys(0);
   thrust::device_vector<bool> result(num_keys, false);
 
-  key_generator gen;
-  gen.generate(dist_from_state<Dist>(state), keys.begin(), keys.end());
-
   state.add_element_count(num_keys);
 
   filter_type filter{num_sub_filters};
 
   state.collect_dram_throughput();
-  state.collect_l1_hit_rates();
   state.collect_l2_hit_rates();
-  state.collect_loads_efficiency();
-  state.collect_stores_efficiency();
 
   add_fpr_summary(state, filter);
 
-  filter.add(keys.begin(), keys.end());
+  filter.add(keys, keys + num_keys);
 
   state.exec([&](nvbench::launch& launch) {
-    filter.contains_async(keys.begin(), keys.end(), result.begin(), {launch.get_stream()});
+    filter.contains_async(keys, keys + num_keys, result.begin(), {launch.get_stream()});
   });
 }
 

From 15ecc9363d3cb9c30c7e531871db939d6d47c909 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Tue, 11 Feb 2025 16:47:35 -0800
Subject: [PATCH 3/6] Increase benchmark input to reduce noise in measurements

---
 benchmarks/bloom_filter/defaults.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/bloom_filter/defaults.hpp b/benchmarks/bloom_filter/defaults.hpp
index 67f3cf6ff..8ca9d711c 100644
--- a/benchmarks/bloom_filter/defaults.hpp
+++ b/benchmarks/bloom_filter/defaults.hpp
@@ -30,7 +30,7 @@ using BF_KEY  = nvbench::int64_t;
 using BF_HASH = cuco::xxhash_64<char>;
 using BF_WORD = nvbench::uint32_t;
 
-static constexpr auto BF_N               = 400'000'000;
+static constexpr auto BF_N               = 1'000'000'000;
 static constexpr auto BF_SIZE_MB         = 2'000;
 static constexpr auto BF_WORDS_PER_BLOCK = 8;
 

From 8e3caf9d99adf207ece96dd703a9a179a2ab3a00 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Wed, 12 Feb 2025 07:18:21 -0800
Subject: [PATCH 4/6] Rename hash_value_type -> hash_result_type

---
 .../detail/bloom_filter/arrow_filter_policy.cuh  | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/include/cuco/detail/bloom_filter/arrow_filter_policy.cuh b/include/cuco/detail/bloom_filter/arrow_filter_policy.cuh
index bfe97cfaf..c7b7b2fb0 100644
--- a/include/cuco/detail/bloom_filter/arrow_filter_policy.cuh
+++ b/include/cuco/detail/bloom_filter/arrow_filter_policy.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -83,10 +83,10 @@ namespace cuco::detail {
 template <class Key, template <typename> class XXHash64>
 class arrow_filter_policy {
  public:
-  using hasher          = XXHash64<Key>;  ///< 64-bit XXHash hasher for Arrow bloom filter policy
-  using word_type       = std::uint32_t;  ///< uint32_t for Arrow bloom filter policy
-  using key_type        = Key;            ///< Hash function input type
-  using hash_value_type = std::uint64_t;  ///< hash function output type
+  using hasher           = XXHash64<Key>;  ///< 64-bit XXHash hasher for Arrow bloom filter policy
+  using word_type        = std::uint32_t;  ///< uint32_t for Arrow bloom filter policy
+  using key_type         = Key;            ///< Hash function input type
+  using hash_result_type = std::uint64_t;  ///< hash function output type
 
   static constexpr uint32_t bits_set_per_block = 8;  ///< hardcoded bits set per Arrow filter block
   static constexpr uint32_t words_per_block    = 8;  ///< hardcoded words per Arrow filter block
@@ -133,7 +133,7 @@ class arrow_filter_policy {
    *
    * @return The hash value of the key
    */
-  __device__ constexpr hash_value_type hash(key_type const& key) const { return hash_(key); }
+  __device__ constexpr hash_result_type hash(key_type const& key) const { return hash_(key); }
 
   /**
    * @brief Determines the filter block a key is added into.
@@ -150,7 +150,7 @@ class arrow_filter_policy {
    * @return The block index for the given key's hash value
    */
   template <class Extent>
-  __device__ constexpr auto block_index(hash_value_type hash, Extent num_blocks) const
+  __device__ constexpr auto block_index(hash_result_type hash, Extent num_blocks) const
   {
     constexpr auto hash_bits = cuda::std::numeric_limits<word_type>::digits;
     // TODO: assert if num_blocks > max_filter_blocks
@@ -168,7 +168,7 @@ class arrow_filter_policy {
    *
    * @return The bit pattern for the word/segment in the filter block
    */
-  __device__ constexpr word_type word_pattern(hash_value_type hash, std::uint32_t word_index) const
+  __device__ constexpr word_type word_pattern(hash_result_type hash, std::uint32_t word_index) const
   {
     // SALT array to calculate bit indexes for the current word
     auto constexpr salt = SALT();

From a4b91cb4d5524adab4465a2bec34ed583454d37e Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Wed, 12 Feb 2025 07:22:43 -0800
Subject: [PATCH 5/6] Eliminate lmem access during salt lookup

---
 .../bloom_filter/arrow_filter_policy.cuh      | 42 +++++++++++--------
 1 file changed, 24 insertions(+), 18 deletions(-)

diff --git a/include/cuco/detail/bloom_filter/arrow_filter_policy.cuh b/include/cuco/detail/bloom_filter/arrow_filter_policy.cuh
index c7b7b2fb0..2f17fa726 100644
--- a/include/cuco/detail/bloom_filter/arrow_filter_policy.cuh
+++ b/include/cuco/detail/bloom_filter/arrow_filter_policy.cuh
@@ -99,21 +99,6 @@ class arrow_filter_policy {
     (max_arrow_filter_bytes /
      bytes_per_filter_block);  ///< Max sub-filter blocks allowed in Arrow bloom filter
 
- private:
-  // Arrow's block-based bloom filter algorithm needs these eight odd SALT values to calculate
-  // eight indexes of bit to set, one bit in each 32-bit (uint32_t) word.
-  __device__ static constexpr cuda::std::array<std::uint32_t, 8> SALT()
-  {
-    return {0x47b6137bU,
-            0x44974d91U,
-            0x8824ad5bU,
-            0xa2b7289dU,
-            0x705495c7U,
-            0x2df1424bU,
-            0x9efc4947U,
-            0x5c6bfb31U};
-  }
-
  public:
   /**
    * @brief Constructs the `arrow_filter_policy` object.
@@ -170,10 +155,31 @@ class arrow_filter_policy {
    */
   __device__ constexpr word_type word_pattern(hash_result_type hash, std::uint32_t word_index) const
   {
-    // SALT array to calculate bit indexes for the current word
-    auto constexpr salt = SALT();
     word_type const key = static_cast<word_type>(hash);
-    return word_type{1} << ((key * salt[word_index]) >> 27);
+    std::uint32_t salt;
+
+    // Basically a switch (word_index) { case 0-7 ... }
+    // First split: 0..3 versus 4..7.
+    if (word_index < 4) {
+      // For indices 0..3, further split into 0..1 and 2..3.
+      if (word_index < 2) {
+        // word_index is 0 or 1.
+        salt = (word_index == 0) ? 0x47b6137bU : 0x44974d91U;
+      } else {
+        // word_index is 2 or 3.
+        salt = (word_index == 2) ? 0x8824ad5bU : 0xa2b7289dU;
+      }
+    } else {
+      // For indices 4..7, further split into 4..5 and 6..7.
+      if (word_index < 6) {
+        // word_index is 4 or 5.
+        salt = (word_index == 4) ? 0x705495c7U : 0x2df1424bU;
+      } else {
+        // word_index is 6 or 7.
+        salt = (word_index == 6) ? 0x9efc4947U : 0x5c6bfb31U;
+      }
+    }
+    return word_type{1} << ((key * salt) >> 27);
   }
 
  private:

From a51e7e6911f3ace88049bd0db7824c371734dfd4 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Wed, 12 Feb 2025 15:45:01 -0800
Subject: [PATCH 6/6] Update copyright year

---
 benchmarks/bloom_filter/defaults.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/bloom_filter/defaults.hpp b/benchmarks/bloom_filter/defaults.hpp
index 8ca9d711c..f1b192aa3 100644
--- a/benchmarks/bloom_filter/defaults.hpp
+++ b/benchmarks/bloom_filter/defaults.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.