
Commit 81fab9c

Authored by arkrishn94, Mark Hildebrand (hildebrandmw), and Copilot
[quantization] 8bit distance kernels and ZipUnzip (#798)
This PR introduces heterogeneous inner-product kernels for 8-bit bitslices paired with 4-bit, 2-bit, and 1-bit bitslices. The goal is to enable fast kernels for full-precision-like queries against quantized vectors (spherical, minmax, etc.). In the benchmark, the `u8xu4` kernel is ~2x faster than its `f32xu4` counterpart.

For AVX2-capable architectures, the 4-bit and 2-bit kernels are implemented using the [`_mm256_maddubs_epi16`](https://doc.rust-lang.org/beta/core/arch/x86_64/fn._mm256_maddubs_epi16.html) intrinsic, acting on blocks of 32 byte-sized dimensions for the `u8xu4` kernel and 64 dimensions for the `u8xu2` kernel. Some care was needed to ensure that, for these specific kernels, the intrinsic does not saturate when performing the multiply-adds. For the 1-bit kernel, we implement a simple masked horizontal-add strategy on blocks of size 32. A `Scalar` fallback is implemented for `Neon`, and for now the `V4` architecture is retargeted to `V3` for these kernels. Support for computing `u8xu4`, `u8xu2`, and `u8xu1` distances with minmax-quantized vectors is available mostly out of the box.

## ZipUnzip

A new trait `ZipUnzip` has been added to diskann-wide to implement vectorized zipping and unzipping logic: zipping merges two halved vectors into a full vector by interleaving elements from each half, and unzipping performs the inverse transformation on the full vector.

- It is currently implemented for `i8x32`, `i16x16`, `i32x8`, `u8x32`, `u32x8`, and `f16x16`.
- It is implemented for the `Scalar`, `V3`, `V4`, and `Neon` architectures.

## Benchmark

We ran the benchmark as a flat scan of vectors, clearing the cache at every run and using a vector count that exceeds the machine's L3 cache size.
```
Total latency in ms, COUNT=150K, AMD EPYC 7763

Kernel          dim=256   dim=384   dim=896
─────────────────────────────────────────────
u8×u4 (new)        8.92      9.93     17.71
u8×u2 (new)        9.87     13.10     21.53
u8×u1 (new)        6.09      7.52     14.62
f32×u4            15.96     20.99     40.67
f32×u2            13.88     18.46     35.64
f32×u1            13.53     17.93     34.65
u8×u8              8.40     10.00     19.72
u4×u4              7.33     11.58     15.49
f32×f32           16.56     23.16     47.76
```

---------

Co-authored-by: Mark Hildebrand <hildebrandmw@gmail.com>
Co-authored-by: Mark Hildebrand <mhildebrand@microsoft.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
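The saturation caveat around `_mm256_maddubs_epi16` can be checked with plain integer arithmetic: the intrinsic multiplies unsigned bytes by signed bytes and sums adjacent pairs into a *saturating* signed 16-bit lane. The sketch below is a scalar model of one such pairwise lane (an illustration for this page, not the PR's actual kernel) showing why 4-bit operands can never saturate while full 8-bit operands can:

```rust
/// Scalar model of one `_mm256_maddubs_epi16` pair lane: multiply unsigned
/// bytes by signed bytes, add the adjacent pair, saturating to i16 range.
fn maddubs_pair(a0: u8, b0: i8, a1: u8, b1: i8) -> i16 {
    let full = (a0 as i32) * (b0 as i32) + (a1 as i32) * (b1 as i32);
    full.clamp(i16::MIN as i32, i16::MAX as i32) as i16
}

fn main() {
    // Worst case for u8 x u4 codes: 255 * 15 + 255 * 15 = 7650, well below
    // i16::MAX, so the pairwise madd cannot saturate for 4-bit (or 2-bit) codes.
    assert_eq!(maddubs_pair(255, 15, 255, 15), 7650);

    // By contrast, a full u8 x i8 pairing can saturate:
    // 255 * 127 + 255 * 127 = 64770 > 32767.
    assert_eq!(maddubs_pair(255, 127, 255, 127), i16::MAX);

    println!("u8 x u4 worst-case pair sum = {}", maddubs_pair(255, 15, 255, 15));
}
```

This is the headroom argument behind the blocked `u8xu4`/`u8xu2` kernels described above.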
1 parent 3cd8ac2 commit 81fab9c

26 files changed

Lines changed: 1514 additions & 107 deletions

diskann-quantization/src/bits/distances.rs

Lines changed: 814 additions & 28 deletions
Large diffs are not rendered by default.

diskann-quantization/src/minmax/vectors.rs

Lines changed: 108 additions & 69 deletions
```diff
@@ -203,16 +203,16 @@ pub type FullQueryMut<'a> = slice::SliceMut<'a, f32, FullQueryMeta>;
 // Compensated Distances //
 ///////////////////////////
 #[inline(always)]
-fn kernel<const NBITS: usize, F>(
-    x: DataRef<'_, NBITS>,
-    y: DataRef<'_, NBITS>,
+fn kernel<const N: usize, const M: usize, F>(
+    x: DataRef<'_, N>,
+    y: DataRef<'_, M>,
     f: F,
 ) -> distances::MathematicalResult<f32>
 where
-    Unsigned: Representation<NBITS>,
+    Unsigned: Representation<N> + Representation<M>,
     InnerProduct: for<'a, 'b> PureDistanceFunction<
-        BitSlice<'a, NBITS, Unsigned>,
-        BitSlice<'b, NBITS, Unsigned>,
+        BitSlice<'a, N, Unsigned>,
+        BitSlice<'b, M, Unsigned>,
         distances::MathematicalResult<u32>,
     >,
     F: Fn(f32, &MinMaxCompensation, &MinMaxCompensation) -> f32,
@@ -477,6 +477,50 @@ mod minmax_vector_tests {
     use super::*;
     use crate::{alloc::GlobalAllocator, scalar::bit_scale};

+    /// Builds a random MinMax quantized vector and its full-precision reconstruction.
+    ///
+    /// Returns `(compressed, original)` where `compressed` has its `MinMaxCompensation`
+    /// metadata fully populated and `original` is the dequantized f32 vector.
+    fn random_minmax_vector<const NBITS: usize>(
+        dim: usize,
+        rng: &mut impl Rng,
+    ) -> (Data<NBITS>, Vec<f32>)
+    where
+        Unsigned: Representation<NBITS>,
+    {
+        let mut v = Data::<NBITS>::new_boxed(dim);
+
+        let domain = Unsigned::domain_const::<NBITS>();
+        let code_dist = Uniform::new_inclusive(*domain.start(), *domain.end()).unwrap();
+
+        {
+            let mut bs = v.vector_mut();
+            for i in 0..dim {
+                bs.set(i, code_dist.sample(rng)).unwrap();
+            }
+        }
+
+        let a: f32 = Uniform::new_inclusive(0.0, 2.0).unwrap().sample(rng);
+        let b: f32 = Uniform::new_inclusive(0.0, 2.0).unwrap().sample(rng);
+
+        let original: Vec<f32> = (0..dim)
+            .map(|i| a * v.vector().get(i).unwrap() as f32 + b)
+            .collect();
+
+        let code_sum: f32 = (0..dim).map(|i| v.vector().get(i).unwrap() as f32).sum();
+        let norm_squared: f32 = original.iter().map(|x| x * x).sum();
+
+        v.set_meta(MinMaxCompensation {
+            a,
+            b,
+            n: a * code_sum,
+            norm_squared,
+            dim: dim as u32,
+        });
+
+        (v, original)
+    }
+
     fn test_minmax_compensated_vectors<const NBITS: usize, R>(dim: usize, rng: &mut R)
     where
         Unsigned: Representation<NBITS>,
@@ -494,70 +538,11 @@
     {
         assert!(dim <= bit_scale::<NBITS>() as usize);

-        // Create two vectors with known compensation values
-        let mut v1 = Data::<NBITS>::new_boxed(dim);
-        let mut v2 = Data::<NBITS>::new_boxed(dim);
-
-        let domain = Unsigned::domain_const::<NBITS>();
-        let code_distribution = Uniform::new_inclusive(*domain.start(), *domain.end()).unwrap();
-
-        // Set bit values
-        {
-            let mut bitslice1 = v1.vector_mut();
-            let mut bitslice2 = v2.vector_mut();
-
-            for i in 0..dim {
-                bitslice1.set(i, code_distribution.sample(rng)).unwrap();
-                bitslice2.set(i, code_distribution.sample(rng)).unwrap();
-            }
-        }
-        let a_rnd = Uniform::new_inclusive(0.0, 2.0).unwrap();
-        let b_rnd = Uniform::new_inclusive(0.0, 2.0).unwrap();
-
-        // Set compensation coefficients
-        // v1: X = a1 * X' + b1
-        // v2: Y = a2 * Y' + b2
-        let a1 = a_rnd.sample(rng);
-        let b1 = b_rnd.sample(rng);
-        let a2 = a_rnd.sample(rng);
-        let b2 = b_rnd.sample(rng);
-
-        // Calculate sum of vector elements for n values
-        let sum1: f32 = (0..dim).map(|i| v1.vector().get(i).unwrap() as f32).sum();
-        let sum2: f32 = (0..dim).map(|i| v2.vector().get(i).unwrap() as f32).sum();
-
-        // Create original full-precision vectors for reference calculations
-        let mut original1 = Vec::with_capacity(dim);
-        let mut original2 = Vec::with_capacity(dim);
-
-        // Calculate the reconstructed original vectors and their norms
-        for i in 0..dim {
-            let val1 = a1 * v1.vector().get(i).unwrap() as f32 + b1;
-            let val2 = a2 * v2.vector().get(i).unwrap() as f32 + b2;
-            original1.push(val1);
-            original2.push(val2);
-        }
-
-        // Calculate squared norms
-        let norm1_squared: f32 = original1.iter().map(|x| x * x).sum();
-        let norm2_squared: f32 = original2.iter().map(|x| x * x).sum();
-
-        // Set compensation coefficients
-        v1.set_meta(MinMaxCompensation {
-            a: a1,
-            b: b1,
-            n: a1 * sum1,
-            norm_squared: norm1_squared,
-            dim: dim as u32,
-        });
+        let (v1, original1) = random_minmax_vector::<NBITS>(dim, rng);
+        let (v2, original2) = random_minmax_vector::<NBITS>(dim, rng);

-        v2.set_meta(MinMaxCompensation {
-            a: a2,
-            b: b2,
-            n: a2 * sum2,
-            norm_squared: norm2_squared,
-            dim: dim as u32,
-        });
+        let norm1_squared = v1.meta().norm_squared;
+        let norm2_squared = v2.meta().norm_squared;

         // Calculate raw integer dot product
         let expected_ip = (0..dim).map(|i| original1[i] * original2[i]).sum::<f32>();
@@ -741,4 +726,58 @@
     test_minmax_compensated!(unsigned_minmax_compensated_test_u2, 2, 0xaedf3d2a223b7b77);
     test_minmax_compensated!(unsigned_minmax_compensated_test_u4, 4, 0xf60c0c8d1aadc126);
     test_minmax_compensated!(unsigned_minmax_compensated_test_u8, 8, 0x09fa14c42a9d7d98);
+
+    /// Test the heterogeneous MinMax kernel for N-bit queries × M-bit database vectors.
+    ///
+    /// Verifies that `kernel::<N, M, _>` produces inner-product and squared-L2
+    /// results matching the full-precision reference, for random codes and
+    /// random compensation coefficients.
+    fn test_minmax_heterogeneous_kernel<const N: usize, const M: usize, R>(dim: usize, rng: &mut R)
+    where
+        Unsigned: Representation<N> + Representation<M>,
+        InnerProduct: for<'a, 'b> PureDistanceFunction<
+            BitSlice<'a, N, Unsigned>,
+            BitSlice<'b, M, Unsigned>,
+            distances::MathematicalResult<u32>,
+        >,
+        R: Rng,
+    {
+        let (v_query, original1) = random_minmax_vector::<N>(dim, rng);
+        let (v_data, original2) = random_minmax_vector::<M>(dim, rng);
+
+        // ── Inner Product ──
+        let expected_ip: f32 = original1.iter().zip(&original2).map(|(x, y)| x * y).sum();
+        let computed_ip = kernel(v_query.reborrow(), v_data.reborrow(), |v, _, _| v)
+            .unwrap()
+            .into_inner();
+        assert!(
+            (expected_ip - computed_ip).abs() / expected_ip.abs().max(1e-10) < 1e-6,
+            "Heterogeneous IP ({},{}) failed: expected {}, got {} on dim: {}",
+            N,
+            M,
+            expected_ip,
+            computed_ip,
+            dim,
+        );
+    }
+
+    macro_rules! test_minmax_heterogeneous {
+        ($name:ident, $N:literal, $M:literal, $seed:literal) => {
+            #[test]
+            fn $name() {
+                let mut rng = StdRng::seed_from_u64($seed);
+                // Use the smaller bit width's scale as max dimension.
+                const MAX_DIM: usize = bit_scale::<$M>() as usize;
+                for dim in 1..=MAX_DIM {
+                    for _ in 0..TRIALS {
+                        test_minmax_heterogeneous_kernel::<$N, $M, _>(dim, &mut rng);
+                    }
+                }
+            }
+        };
+    }
+
+    test_minmax_heterogeneous!(minmax_heterogeneous_8x4, 8, 4, 0xb7c3d9e5f1a20864);
+    test_minmax_heterogeneous!(minmax_heterogeneous_8x2, 8, 2, 0x4e8f2c6a1d3b5079);
+    test_minmax_heterogeneous!(minmax_heterogeneous_8x1, 8, 1, 0x1b0f2c614d2a7141);
 }
```
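The `MinMaxCompensation` metadata built above makes the compensated kernel an affine correction of the raw integer inner product: with `x[i] = a1*c[i] + b1` and `y[i] = a2*d[i] + b2`, the full-precision inner product expands to `a1*a2*<c,d> + (a1*sum(c))*b2 + (a2*sum(d))*b1 + dim*b1*b2`, where `a * sum(codes)` is exactly the `n` field stored in the diff. A standalone scalar sketch of the identity the test validates (hypothetical example values, not the crate's API):

```rust
/// Returns (full-precision reference IP, integer IP with affine compensation).
/// Codes `c`, `d` model quantized values; (a1, b1), (a2, b2) the MinMax affine maps.
fn compensated_ip(c: &[u32], d: &[u32], a1: f32, b1: f32, a2: f32, b2: f32) -> (f32, f32) {
    let dim = c.len() as f32;

    // Full-precision reference: dequantize both sides, then take the dot product.
    let expected: f32 = c
        .iter()
        .zip(d.iter())
        .map(|(&ci, &di)| (a1 * ci as f32 + b1) * (a2 * di as f32 + b2))
        .sum();

    // Raw integer inner product plus compensation terms (n = a * sum(codes)).
    let int_ip: u32 = c.iter().zip(d.iter()).map(|(&ci, &di)| ci * di).sum();
    let n1 = a1 * c.iter().sum::<u32>() as f32;
    let n2 = a2 * d.iter().sum::<u32>() as f32;
    let compensated = a1 * a2 * int_ip as f32 + n1 * b2 + n2 * b1 + dim * b1 * b2;

    (expected, compensated)
}

fn main() {
    let (expected, compensated) =
        compensated_ip(&[3, 0, 7, 15, 9, 1], &[2, 14, 5, 6, 0, 11], 0.5, 1.25, 1.5, 0.75);
    // The two paths agree up to floating-point rounding.
    assert!((expected - compensated).abs() < 1e-3);
    println!("expected = {expected}, compensated = {compensated}");
}
```

Nothing in the identity requires the two code vectors to share a bit width, which is why the heterogeneous `u8xu4`/`u8xu2`/`u8xu1` distances work "mostly out of the box" once the integer kernel exists.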

diskann-wide/src/arch/aarch64/double.rs

Lines changed: 33 additions & 0 deletions
```diff
@@ -3,6 +3,8 @@
  * Licensed under the MIT license.
  */

+use std::arch::aarch64::*;
+
 use half::f16;

 use crate::{
@@ -75,6 +77,17 @@ doubled::double_scalar_shift!(Doubled<Doubled<i8x16>>);
 doubled::double_scalar_shift!(Doubled<Doubled<i16x8>>);
 doubled::double_scalar_shift!(Doubled<Doubled<i32x4>>);

+//////////////
+// ZipUnzip //
+//////////////
+
+super::macros::aarch64_zipunzip!(i8x16, vzip1q_s8, vzip2q_s8, vuzp1q_s8, vuzp2q_s8);
+super::macros::aarch64_zipunzip!(i16x8, vzip1q_s16, vzip2q_s16, vuzp1q_s16, vuzp2q_s16);
+super::macros::aarch64_zipunzip!(i32x4, vzip1q_s32, vzip2q_s32, vuzp1q_s32, vuzp2q_s32);
+super::macros::aarch64_zipunzip!(u8x16, vzip1q_u8, vzip2q_u8, vuzp1q_u8, vuzp2q_u8);
+super::macros::aarch64_zipunzip!(u32x4, vzip1q_u32, vzip2q_u32, vuzp1q_u32, vuzp2q_u32);
+super::macros::aarch64_zipunzip!(f16x8, vzip1q_u16, vzip2q_u16, vuzp1q_u16, vuzp2q_u16);
+
 //-------------//
 // Conversions //
 //-------------//
@@ -230,6 +243,8 @@ mod tests {

     // Bit ops
     test_utils::ops::test_bitops!(u8x32, 0xd62d8de09f82ed4e, test_neon());
+    test_utils::ops::test_splitjoin!(u8x32 => u8x16, 0x2e301b7e12090d5c, test_neon());
+    test_utils::ops::test_zipunzip!(u8x32 => u8x16, 0xa1b2c3d4e5f67890, test_neon());
 }

 mod test_u8x64 {
@@ -238,6 +253,7 @@ mod tests {

     // Bit ops
     test_utils::ops::test_bitops!(u8x64, 0xd62d8de09f82ed4e, test_neon());
+    test_utils::ops::test_splitjoin!(u8x64 => u8x32, 0x2e301b7e12090d5c, test_neon());
 }

 // u32s
@@ -250,6 +266,8 @@ mod tests {

     // Reductions
     test_utils::ops::test_sumtree!(u32x8, 0x90a59e23ad545de1, test_neon());
+    test_utils::ops::test_splitjoin!(u32x8 => u32x4, 0x2e301b7e12090d5c, test_neon());
+    test_utils::ops::test_zipunzip!(u32x8 => u32x4, 0x4e7c0a3d5b9f2816, test_neon());
 }

 mod test_u32x16 {
@@ -261,6 +279,7 @@ mod tests {

     // Reductions
     test_utils::ops::test_sumtree!(u32x16, 0x90a59e23ad545de1, test_neon());
+    test_utils::ops::test_splitjoin!(u32x16 => u32x8, 0x2e301b7e12090d5c, test_neon());
 }

 // u64s
@@ -270,6 +289,7 @@ mod tests {

     // Bit ops
     test_utils::ops::test_bitops!(u64x4, 0xc4491a44af4aa58e, test_neon());
+    test_utils::ops::test_splitjoin!(u64x4 => u64x2, 0x2e301b7e12090d5c, test_neon());
 }

 // i8s
@@ -280,6 +300,8 @@ mod tests {
     // Bit ops
     test_utils::ops::test_bitops!(i8x32, 0xd62d8de09f82ed4e, test_neon());
     test_utils::ops::test_abs!(i8x32, 0xd62d8de09f82ed4e, test_neon());
+    test_utils::ops::test_splitjoin!(i8x32 => i8x16, 0x2e301b7e12090d5c, test_neon());
+    test_utils::ops::test_zipunzip!(i8x32 => i8x16, 0xc7e3a92f1d8b5604, test_neon());
 }

 mod test_i8x64 {
@@ -289,6 +311,7 @@ mod tests {
     // Bit ops
     test_utils::ops::test_bitops!(i8x64, 0xd62d8de09f82ed4e, test_neon());
     test_utils::ops::test_abs!(i8x64, 0xd62d8de09f82ed4e, test_neon());
+    test_utils::ops::test_splitjoin!(i8x64 => i8x32, 0x2e301b7e12090d5c, test_neon());
 }

 // i16s
@@ -299,6 +322,8 @@ mod tests {
     // Bit ops
     test_utils::ops::test_bitops!(i16x16, 0x9167644fc4ad5cfa, test_neon());
     test_utils::ops::test_abs!(i16x16, 0x9167644fc4ad5cfa, test_neon());
+    test_utils::ops::test_splitjoin!(i16x16 => i16x8, 0x2e301b7e12090d5c, test_neon());
+    test_utils::ops::test_zipunzip!(i16x16 => i16x8, 0x3f84d1b6e7a20c59, test_neon());
 }

 mod test_i16x32 {
@@ -308,6 +333,7 @@ mod tests {
     // Bit ops
     test_utils::ops::test_bitops!(i16x32, 0x9167644fc4ad5cfa, test_neon());
     test_utils::ops::test_abs!(i16x32, 0x9167644fc4ad5cfa, test_neon());
+    test_utils::ops::test_splitjoin!(i16x32 => i16x16, 0x2e301b7e12090d5c, test_neon());
 }

 // i32s
@@ -340,6 +366,8 @@ mod tests {

     // Reductions
     test_utils::ops::test_sumtree!(i32x8, 0x90a59e23ad545de1, test_neon());
+    test_utils::ops::test_splitjoin!(i32x8 => i32x4, 0x2e301b7e12090d5c, test_neon());
+    test_utils::ops::test_zipunzip!(i32x8 => i32x4, 0x92d5f4a83e1b07c6, test_neon());
 }

 mod test_i32x16 {
@@ -371,6 +399,7 @@ mod tests {

     // Reductions
     test_utils::ops::test_sumtree!(i32x16, 0x90a59e23ad545de1, test_neon());
+    test_utils::ops::test_splitjoin!(i32x16 => i32x8, 0x2e301b7e12090d5c, test_neon());
 }

 // Conversions
@@ -388,4 +417,8 @@ mod tests {
     test_utils::ops::test_cast!(f32x16 => f16x16, 0xba8fe343fc9dbeff, test_neon());

     test_utils::ops::test_cast!(i32x8 => f32x8, 0xba8fe343fc9dbeff, test_neon());
+
+    // SplitJoin + ZipUnzip for f16x16
+    test_utils::ops::test_splitjoin!(f16x16 => f16x8, 0x2e301b7e12090d5c, test_neon());
+    test_utils::ops::test_zipunzip!(f16x16 => f16x8, 0x6b2e0f9d8a41c573, test_neon());
 }
```
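The `ZipUnzip` semantics these tests exercise can be modeled in plain scalar code: zip interleaves two half-vectors into one full vector, and unzip recovers the even- and odd-indexed streams. A minimal sketch with hypothetical free functions (not the diskann-wide trait itself, which operates on SIMD registers):

```rust
/// Interleave two half-vectors into one full vector: [a0, b0, a1, b1, ...].
fn zip(lo: &[i32], hi: &[i32]) -> Vec<i32> {
    lo.iter().zip(hi.iter()).flat_map(|(&a, &b)| [a, b]).collect()
}

/// Inverse of `zip`: split a full vector back into even- and odd-indexed streams.
fn unzip(full: &[i32]) -> (Vec<i32>, Vec<i32>) {
    let lo: Vec<i32> = full.iter().step_by(2).copied().collect();
    let hi: Vec<i32> = full.iter().skip(1).step_by(2).copied().collect();
    (lo, hi)
}

fn main() {
    let lo = vec![0, 1, 2, 3];
    let hi = vec![4, 5, 6, 7];
    let zipped = zip(&lo, &hi);
    assert_eq!(zipped, vec![0, 4, 1, 5, 2, 6, 3, 7]);
    // Round-trip: unzip undoes zip.
    assert_eq!(unzip(&zipped), (lo, hi));
    println!("zip/unzip round-trip ok");
}
```

On Neon the same effect comes from the `vzip1q`/`vzip2q` and `vuzp1q`/`vuzp2q` intrinsic pairs, which is exactly what the `aarch64_zipunzip!` macro in the next file wires up.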

diskann-wide/src/arch/aarch64/macros.rs

Lines changed: 48 additions & 0 deletions
```diff
@@ -569,3 +569,51 @@ pub(crate) use aarch64_define_loadstore;
 pub(crate) use aarch64_define_register;
 pub(crate) use aarch64_define_splat;
 pub(crate) use aarch64_splitjoin;
+
+/// Implement [`ZipUnzip`] for a [`Doubled`] type using Neon zip/unzip intrinsics.
+///
+/// ## Parameters
+///
+/// * `$half` — the native 128-bit Neon type (e.g. `i8x16`)
+/// * `$zip1` — `vzip1q_*` intrinsic (interleave lower halves)
+/// * `$zip2` — `vzip2q_*` intrinsic (interleave upper halves)
+/// * `$uzp1` — `vuzp1q_*` intrinsic (collect even-indexed elements)
+/// * `$uzp2` — `vuzp2q_*` intrinsic (collect odd-indexed elements)
+///
+/// ## Safety
+///
+/// The caller must ensure the provided intrinsics match the element type of `$half`.
+macro_rules! aarch64_zipunzip {
+    ($half:path, $zip1:ident, $zip2:ident, $uzp1:ident, $uzp2:ident) => {
+        impl $crate::ZipUnzip for $crate::doubled::Doubled<$half> {
+            #[inline(always)]
+            fn zip(halves: $crate::LoHi<<Self as $crate::SplitJoin>::Halved>) -> Self {
+                use $crate::SIMDVector;
+                // SAFETY: Caller asserts that these intrinsics match the element type.
+                unsafe {
+                    let lo_raw = halves.lo.to_underlying();
+                    let hi_raw = halves.hi.to_underlying();
+                    $crate::doubled::Doubled(
+                        <$half>::from_underlying(halves.lo.arch(), $zip1(lo_raw, hi_raw)),
+                        <$half>::from_underlying(halves.lo.arch(), $zip2(lo_raw, hi_raw)),
+                    )
+                }
+            }
+
+            #[inline(always)]
+            fn unzip(self) -> $crate::LoHi<<Self as $crate::SplitJoin>::Halved> {
+                use $crate::SIMDVector;
+                // SAFETY: Caller asserts that these intrinsics match the element type.
+                unsafe {
+                    let lo_raw = self.0.to_underlying();
+                    let hi_raw = self.1.to_underlying();
+                    $crate::LoHi::new(
+                        <$half>::from_underlying(self.0.arch(), $uzp1(lo_raw, hi_raw)),
+                        <$half>::from_underlying(self.0.arch(), $uzp2(lo_raw, hi_raw)),
+                    )
+                }
+            }
+        }
+    };
+}
+pub(crate) use aarch64_zipunzip;
```
