@@ -486,7 +486,7 @@ service ContentAddressableStorage {
486486 //
487487 // When blob splitting and splicing is used at the same time, the clients and
488488 // the server SHOULD agree out-of-band upon a chunking algorithm used by both
489- // parties to benefit from each others chunk data and avoid unnecessary data
489+ // parties to benefit from each other's chunk data and avoid unnecessary data
490490 // duplication.
491491 //
492492 // Errors:
@@ -1986,6 +1986,11 @@ message SplitBlobRequest {
19861986 // length of the blob digest hashes and the digest functions announced
19871987 // in the server's capabilities.
19881988 DigestFunction.Value digest_function = 3 ;
1989+
1990+ // The chunking function that the client prefers to use.
1991+ // The server MAY use a different one; the function actually used is
1992+ // reported in SplitBlobResponse.chunking_function.
1993+ ChunkingFunction.Value chunking_function = 4 ;
19891994}
19901995
19911996// A response message for
@@ -1998,6 +2003,9 @@ message SplitBlobResponse {
19982003 // The server MUST use the same digest function as the one explicitly or
19992004 // implicitly (through hash length) specified in the split request.
20002005 repeated Digest chunk_digests = 1 ;
2006+
2007+ // The chunking function used to split the blob.
2008+ ChunkingFunction.Value chunking_function = 2 ;
20012009}
20022010
20032011// A request message for
@@ -2036,6 +2044,9 @@ message SpliceBlobRequest {
20362044 // server SHOULD infer the digest function using the length of the blob digest
20372045 // hashes and the digest functions announced in the server's capabilities.
20382046 DigestFunction.Value digest_function = 4 ;
2047+
2048+ // The chunking function that the client used to split the blob.
2049+ ChunkingFunction.Value chunking_function = 5 ;
20392050}
20402051
20412052// A response message for
@@ -2178,6 +2189,34 @@ message DigestFunction {
21782189 }
21792190}
21802191
2192+ // The chunking function is used to split a blob into chunks.
2193+ //
2194+ // The server advertises support for a chunking function by setting the
2195+ // corresponding params field in
2196+ // [CacheCapabilities][build.bazel.remote.execution.v2.CacheCapabilities].
2197+ // For example, if fast_cdc_2020_params is set, the server supports FAST_CDC_2020.
2198+ //
2199+ // For optimal deduplication, clients SHOULD use an advertised chunking function.
2200+ // When clients use UNKNOWN, the server chooses an algorithm for SplitBlob and
2201+ // simply verifies chunk concatenation for SpliceBlob.
2202+ message ChunkingFunction {
2203+ enum Value {
2204+ // No specific algorithm; also the value of an unset field. Servers MUST
2205+ // always accept it. For SplitBlob, the server chooses the algorithm. For
2206+ // SpliceBlob, it only checks that chunks concatenate to the expected blob.
2207+ UNKNOWN = 0 ;
2208+
2209+ // The FastCDC chunking algorithm as described in the 2020 paper by
2210+ // Wen Xia, et al. (https://ieeexplore.ieee.org/document/9055082).
2211+ // Supported when CacheCapabilities.fast_cdc_2020_params is set.
2212+ FAST_CDC_2020 = 1 ;
2213+
2214+ // RepMaxCDC as implemented by https://github.com/buildbarn/go-cdc.
2215+ // Supported when CacheCapabilities.rep_max_cdc_params is set.
2216+ REP_MAX_CDC = 2 ;
2217+ }
2218+ }
2219+
21812220// Describes the server/instance capabilities for updating the action cache.
21822221message ActionCacheUpdateCapabilities {
21832222 bool update_enabled = 1 ;
@@ -2299,6 +2338,91 @@ message CacheCapabilities {
22992338 // [ContentAddressableStorage.SpliceBlob][build.bazel.remote.execution.v2.ContentAddressableStorage.SpliceBlob]
23002339 // operation.
23012340 bool splice_blob_support = 10 ;
2341+
2342+ // The parameters for the FastCDC 2020 chunking algorithm.
2343+ // If set, the server supports the FAST_CDC_2020 chunking function.
2344+ FastCdc2020Params fast_cdc_2020_params = 11 ;
2345+
2346+ // The parameters for the RepMaxCDC chunking algorithm.
2347+ // If set, the server supports the REP_MAX_CDC chunking function.
2348+ RepMaxCdcParams rep_max_cdc_params = 12 ;
2349+ }
2350+
2351+ // Parameters for the FastCDC content-defined chunking algorithm.
2352+ //
2353+ // Implementations MUST follow the FastCDC 2020 paper by Wen Xia, et al.:
2354+ // https://ieeexplore.ieee.org/document/9055082
2355+ //
2356+ // Supported implementations:
2357+ // - Rust: https://docs.rs/fastcdc/3.2.1/fastcdc/v2020/index.html
2358+ // - Go: https://github.com/buildbuddy-io/fastcdc2020
2359+ //
2360+ // Test vectors can be found in the accompanying fastcdc2020_test_vectors.txt file.
2361+ //
2362+ // Implementations MUST use normalization level 2, which has been found
2363+ // to work well for build artifacts with an average chunk size of 512 KiB.
2364+ //
2365+ // Key algorithm components from the paper:
2366+ //
2367+ // GEAR table: 256 64-bit integers for the rolling hash, computed as:
2368+ // GEAR[i] = high_64_bits(MD5(byte(i))) for i in 0..255
2369+ //
2370+ // MASKS table: Bit patterns for chunk boundary detection, derived from
2371+ // the C reference implementation. The mask selection based on average
2372+ // chunk size SHOULD match the paper.
2373+ //
2374+ // The minimum and maximum chunk sizes MUST be derived from the average:
2375+ // - min_chunk_size = avg_chunk_size_bytes / 4
2376+ // - max_chunk_size = avg_chunk_size_bytes * 4
2377+ //
2378+ // Blobs smaller than max_chunk_size (avg_chunk_size_bytes * 4) SHOULD be
2379+ // uploaded without chunking.
2380+ //
2381+ // If any advertised parameter is outside its expected range, the client
2382+ // SHOULD treat FastCDC 2020 chunking as unsupported by the server.
2383+ message FastCdc2020Params {
2384+ // The average (expected) chunk size, in bytes, for FastCDC chunking.
2385+ // The value MUST be between 1024 (1 KiB) and 1048576 (1 MiB). The
2386+ // recommended value is 524288 (512 KiB).
2387+ uint64 avg_chunk_size_bytes = 1 ;
2388+
2389+ // The seed for the FastCDC mask generation.
2390+ // The recommended value is 0, which is also the value of an unset field.
2391+ //
2392+ // All clients sharing a cache SHOULD use the same seed to maximize
2393+ // chunk reuse.
2394+ uint32 seed = 2 ;
2395+ }
2396+
2397+ // Parameters for the RepMaxCDC content-defined chunking algorithm.
2398+ //
2399+ // Supported implementations:
2400+ // - Go: https://github.com/buildbarn/go-cdc
2401+ //
2402+ // Key algorithm components:
2403+ //
2404+ // GEAR table: 256 64-bit integers for the rolling hash, computed as:
2405+ // GEAR[i] = high_64_bits(MD5(byte(i))) for i in 0..255
2406+ //
2407+ // The algorithm repeatedly applies chunking until all chunks are in the
2408+ // range [min_chunk_size_bytes, 2*min_chunk_size_bytes). Cutting points are
2409+ // selected where the Gear rolling hash is maximized within a lookahead
2410+ // window of horizon_size_bytes.
2411+ //
2412+ // If any advertised parameter is outside its expected range, the client
2413+ // SHOULD treat RepMaxCDC chunking as unsupported by the server.
2414+ message RepMaxCdcParams {
2415+ // The minimum chunk size, in bytes, for the RepMaxCDC chunking algorithm.
2416+ // The value MUST be at least 64 bytes (the Gear hash window size).
2417+ // All chunks will be in [min_chunk_size_bytes, 2 * min_chunk_size_bytes).
2418+ // The recommended value is 262144 (256 KiB).
2419+ uint64 min_chunk_size_bytes = 1 ;
2420+
2421+ // The lookahead window, in bytes, for finding optimal cutting points.
2422+ // Larger values improve deduplication quality with diminishing returns.
2423+ // Setting this to 0 produces uniform chunks of min_chunk_size_bytes.
2424+ // The recommended value is 8 * min_chunk_size_bytes.
2425+ uint64 horizon_size_bytes = 2 ;
23022426}
23032427
23042428// Capabilities of the remote execution system.
0 commit comments