4141#include " bolt/shuffle/sparksql/BoltArrowMemoryPool.h"
4242#include " bolt/shuffle/sparksql/Payload.h"
4343#include " bolt/shuffle/sparksql/Utils.h"
44+ #include " bolt/shuffle/sparksql/compression/Codec.h"
4445#include " bolt/shuffle/sparksql/compression/Compression.h"
4546#include " bolt/vector/ComplexVector.h"
4647#include " bolt/vector/FlatVector.h"
@@ -373,7 +374,7 @@ RowVectorPtr makeColumnarBatch(
373374BoltColumnarBatchDeserializer::BoltColumnarBatchDeserializer (
374375 std::shared_ptr<arrow::io::InputStream> in,
375376 const std::shared_ptr<arrow::Schema>& schema,
376- const std::shared_ptr<arrow::util:: Codec>& codec,
377+ const std::shared_ptr<Codec>& codec,
377378 const bytedance::bolt::RowTypePtr& rowType,
378379 int32_t batchSize,
379380 int32_t shuffleBatchByteSize,
@@ -384,7 +385,7 @@ BoltColumnarBatchDeserializer::BoltColumnarBatchDeserializer(
384385 uint64_t & deserializeTime,
385386 uint64_t & decompressTime,
386387 bool isRowFormat,
387- ZstdStreamCodec * zstdCodec,
388+ AdaptiveParallelZstdCodec * zstdCodec,
388389 RowBufferPool* rowBufferPool,
389390 ShuffleRowToColumnarConverter* row2ColConverter)
390391 : schema_(schema),
@@ -701,19 +702,21 @@ bool BoltColumnarBatchDeserializer::isCompositeRowVectorLayout(int64_t& bytes) {
701702
702703BoltColumnarBatchDeserializerFactory::BoltColumnarBatchDeserializerFactory (
703704 const std::shared_ptr<arrow::Schema>& schema,
704- const std::shared_ptr<arrow::util:: Codec>& codec,
705+ const std::shared_ptr<Codec>& codec,
705706 const RowTypePtr& rowType,
706707 int32_t batchSize,
707708 int32_t shuffleBatchByteSize,
708709 arrow::MemoryPool* memoryPool,
709- bytedance::bolt::memory::MemoryPool* boltPool)
710+ bytedance::bolt::memory::MemoryPool* boltPool,
711+ bool checksumEnabled)
710712 : schema_(schema),
711713 codec_(codec),
712714 rowType_(rowType),
713715 batchSize_(batchSize),
714716 shuffleBatchByteSize_(shuffleBatchByteSize),
715717 memoryPool_(memoryPool),
716- boltPool_(boltPool) {
718+ boltPool_(boltPool),
719+ checksumEnabled_(checksumEnabled) {
717720 initFromSchema ();
718721}
719722
@@ -729,8 +732,8 @@ BoltColumnarBatchDeserializerFactory::createDeserializer(
729732 schema_->num_fields () >= rowBaseColumnNumThreshold) ||
730733 (shuffleWriterType_ == ShuffleWriterType::RowBased));
731734 if (!zstdCodec_) {
732- zstdCodec_ =
733- std::make_shared<ZstdStreamCodec>( 1 /* not used*/ , false , memoryPool_);
735+ zstdCodec_ = std::make_shared<AdaptiveParallelZstdCodec>(
736+ 1 /* not used*/ , false , memoryPool_, checksumEnabled_ );
734737 rowBufferPool_ = std::make_shared<RowBufferPool>(memoryPool_);
735738 row2ColConverter_ =
736739 std::make_shared<ShuffleRowToColumnarConverter>(rowType_, boltPool_);
@@ -821,14 +824,18 @@ BoltShuffleReader::BoltShuffleReader(
821824 bytedance::bolt::memory::MemoryPool* boltPool)
822825 : factory_(std::make_unique<BoltColumnarBatchDeserializerFactory>(
823826 schema,
824- createArrowIpcCodec (
827+ createCodec (
825828 options.compressionType,
826- getCodecBackend (options.codecBackend)),
829+ CodecOptions{
830+ getCodecBackend (options.codecBackend ),
831+ kDefaultCompressionLevel ,
832+ options.checksumEnabled }),
827833 bytedance::bolt::asRowType(fromBoltTypeToArrowSchema(schema)),
828834 options.batchSize,
829835 options.shuffleBatchByteSize,
830836 pool,
831- boltPool)) {
837+ boltPool,
838+ options.checksumEnabled)) {
832839 factory_->setNumPartitions (options.numPartitions );
833840 factory_->setShuffleWriterType (options.forceShuffleWriterType );
834841 factory_->setpartitioningShortName (options.partitionShortName );
0 commit comments