1515use std:: sync:: Arc ;
1616
1717use arrow:: array:: * ;
18- use datafusion:: { common:: Result , physical_plan:: ColumnarValue } ;
19- use datafusion_ext_commons:: spark_hash:: create_xxhash64_hashes;
18+ use datafusion:: {
19+ common:: { Result , ScalarValue } ,
20+ physical_plan:: ColumnarValue ,
21+ } ;
22+ use datafusion_ext_commons:: spark_hash:: { create_murmur3_hashes, create_xxhash64_hashes} ;
23+
24+ /// implements org.apache.spark.sql.catalyst.expressions.Murmur3Hash
25+ pub fn spark_murmur3_hash ( args : & [ ColumnarValue ] ) -> Result < ColumnarValue > {
26+ spark_hash ( args, |len, is_scalar, arrays| {
27+ // use identical seed as spark hash partition
28+ let spark_murmur3_default_seed = 42i32 ;
29+ let hash_buffer = create_murmur3_hashes ( len, & arrays, spark_murmur3_default_seed) ;
30+ if is_scalar {
31+ ColumnarValue :: Scalar ( ScalarValue :: from ( hash_buffer[ 0 ] ) )
32+ } else {
33+ ColumnarValue :: Array ( Arc :: new ( Int32Array :: from ( hash_buffer) ) )
34+ }
35+ } )
36+ }
2037
2138/// implements org.apache.spark.sql.catalyst.expressions.XxHash64
2239pub fn spark_xxhash64 ( args : & [ ColumnarValue ] ) -> Result < ColumnarValue > {
40+ spark_hash ( args, |len, is_scalar, arrays| {
41+ // use identical seed as spark hash partition
42+ let spark_xxhash64_default_seed = 42i64 ;
43+ let hash_buffer = create_xxhash64_hashes ( len, arrays, spark_xxhash64_default_seed) ;
44+ if is_scalar {
45+ ColumnarValue :: Scalar ( ScalarValue :: from ( hash_buffer[ 0 ] ) )
46+ } else {
47+ ColumnarValue :: Array ( Arc :: new ( Int64Array :: from ( hash_buffer) ) )
48+ }
49+ } )
50+ }
51+
52+ pub fn spark_hash (
53+ args : & [ ColumnarValue ] ,
54+ hash_impl : impl Fn ( usize , bool , & [ ArrayRef ] ) -> ColumnarValue ,
55+ ) -> Result < ColumnarValue > {
56+ let is_scalar = args
57+ . iter ( )
58+ . all ( |arg| matches ! ( arg, ColumnarValue :: Scalar ( _) ) ) ;
2359 let len = args
2460 . iter ( )
2561 . map ( |arg| match arg {
@@ -38,14 +74,7 @@ pub fn spark_xxhash64(args: &[ColumnarValue]) -> Result<ColumnarValue> {
3874 } )
3975 } )
4076 . collect :: < Result < Vec < _ > > > ( ) ?;
41-
42- // use identical seed as spark hash partition
43- let spark_xxhash64_default_seed = 42i64 ;
44- let hash_buffer = create_xxhash64_hashes ( len, & arrays, spark_xxhash64_default_seed) ;
45-
46- Ok ( ColumnarValue :: Array ( Arc :: new ( Int64Array :: from (
47- hash_buffer,
48- ) ) ) )
77+ Ok ( hash_impl ( len, is_scalar, & arrays) )
4978}
5079
5180#[ cfg( test) ]
@@ -57,6 +86,45 @@ mod test {
5786
5887 use super :: * ;
5988
#[test]
fn test_murmur3_hash_int64() -> Result<(), Box<dyn Error>> {
    // Int64 inputs covering 1, 0, -1 and both i64 extremes.
    let input: ArrayRef = Arc::new(Int64Array::from(vec![
        Some(1),
        Some(0),
        Some(-1),
        Some(i64::MAX),
        Some(i64::MIN),
    ]));
    let args = vec![ColumnarValue::Array(input)];

    let actual = spark_murmur3_hash(&args)?.into_array(5)?;

    // Reference hashes, one per input row.
    // NOTE(review): presumably taken from Spark's Murmur3Hash with seed 42 — confirm.
    let expected: ArrayRef = Arc::new(Int32Array::from(vec![
        Some(-1712319331),
        Some(-1670924195),
        Some(-939490007),
        Some(-1604625029),
        Some(-853646085),
    ]));

    assert_eq!(&actual, &expected);
    Ok(())
}
108+
#[test]
fn test_murmur3_hash_string() -> Result<(), Box<dyn Error>> {
    // String inputs: ASCII, the empty string, an emoji and CJK text.
    let input: ArrayRef = Arc::new(StringArray::from_iter_values([
        "hello", "bar", "", "😁", "天地",
    ]));
    let args = vec![ColumnarValue::Array(input)];

    let actual = spark_murmur3_hash(&args)?.into_array(5)?;

    // Reference hashes, one per input row.
    // NOTE(review): presumably taken from Spark's Murmur3Hash with seed 42 — confirm.
    let expected: ArrayRef = Arc::new(Int32Array::from(vec![
        Some(-1008564952),
        Some(-1808790533),
        Some(142593372),
        Some(885025535),
        Some(-1899966402),
    ]));

    assert_eq!(&actual, &expected);
    Ok(())
}
60128 #[ test]
61129 fn test_xxhash64_int64 ( ) -> Result < ( ) , Box < dyn Error > > {
62130 let result = spark_xxhash64 ( & vec ! [ ColumnarValue :: Array ( Arc :: new( Int64Array :: from(
0 commit comments