diff --git a/.gitignore b/.gitignore
index 8b8b79960..6fb9b523e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -24,3 +24,5 @@ darwin.bazelrc
 nativelink.bazelrc
 *.log
 buck-out/
+.cargo/config.toml
+.claude/worktrees/
diff --git a/Cargo.lock b/Cargo.lock
index 6daa6a21d..d0f3bf339 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -23,9 +23,9 @@ dependencies = [

 [[package]]
 name = "aho-corasick"
-version = "1.1.3"
+version = "1.1.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916"
+checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301"
 dependencies = [
  "memchr",
 ]
@@ -36,15 +36,6 @@ version = "0.2.21"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923"

-[[package]]
-name = "android_system_properties"
-version = "0.1.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311"
-dependencies = [
- "libc",
-]
-
 [[package]]
 name = "anstream"
 version = "0.6.21"
@@ -77,35 +68,38 @@ dependencies = [

 [[package]]
 name = "anstyle-query"
-version = "1.1.4"
+version = "1.1.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9e231f6134f61b71076a3eab506c379d4f36122f2af15a9ff04415ea4c3339e2"
+checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc"
 dependencies = [
- "windows-sys 0.60.2",
+ "windows-sys 0.61.2",
 ]

 [[package]]
 name = "anstyle-wincon"
-version = "3.0.10"
+version = "3.0.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3e0633414522a32ffaac8ac6cc8f748e090c5717661fddeea04219e2344f5f2a"
+checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d"
 dependencies = [
  "anstyle",
  "once_cell_polyfill",
- "windows-sys 0.60.2",
+ "windows-sys 0.61.2",
 ]

 [[package]]
 name = "anyhow"
-version = "1.0.100"
+version = "1.0.102"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a23eb6b1614318a8071c9b2521f36b424b2c83db5eb3a0fead4a6c0809af6e61"
+checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c"

 [[package]]
 name = "arc-swap"
-version = "1.7.1"
+version = "1.8.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "69f7f8c3906b62b754cd5326047894316021dcfe5a194c8ea52bdd94934a3457"
+checksum = "f9f3647c145568cec02c42054e07bdf9a5a698e15b466fb2341bfc393cd24aa5"
+dependencies = [
+ "rustversion",
+]

 [[package]]
 name = "arcstr"
@@ -137,9 +131,9 @@ dependencies = [

 [[package]]
 name = "async-lock"
-version = "3.4.1"
+version = "3.4.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5fd03604047cee9b6ce9de9f70c6cd540a0520c813cbd49bae61f33ab80ed1dc"
+checksum = "290f7f2596bd5b78a9fec8088ccd89180d7f9f55b94b0576823bbbdc72ee8311"
 dependencies = [
  "event-listener",
  "event-listener-strategy",
@@ -180,9 +174,9 @@ checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8"

 [[package]]
 name = "aws-config"
-version = "1.8.8"
+version = "1.8.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "37cf2b6af2a95a20e266782b4f76f1a5e12bf412a9db2de9c1e9123b9d8c0ad8"
+checksum = "8a8fc176d53d6fe85017f230405e3255cedb4a02221cb55ed6d76dccbbb099b2"
 dependencies = [
  "aws-credential-types",
  "aws-runtime",
@@ -199,7 +193,7 @@ dependencies = [
  "bytes",
  "fastrand",
  "hex",
- "http 1.3.1",
+ "http 1.4.0",
  "ring",
  "time",
  "tokio",
@@ -210,9 +204,9 @@ dependencies = [

 [[package]]
 name = "aws-credential-types"
-version = "1.2.8"
+version = "1.2.13"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "faf26925f4a5b59eb76722b63c2892b1d70d06fa053c72e4a100ec308c1d47bc"
+checksum = "6d203b0bf2626dcba8665f5cd0871d7c2c0930223d6b6be9097592fea21242d0"
 dependencies = [
  "aws-smithy-async",
  "aws-smithy-runtime-api",
@@ -220,11 +214,33 @@ dependencies = [
  "zeroize",
 ]

+[[package]]
+name = "aws-lc-rs"
+version = "1.16.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d9a7b350e3bb1767102698302bc37256cbd48422809984b98d292c40e2579aa9"
+dependencies = [
+ "aws-lc-sys",
+ "zeroize",
+]
+
+[[package]]
+name = "aws-lc-sys"
+version = "0.37.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b092fe214090261288111db7a2b2c2118e5a7f30dc2569f1732c4069a6840549"
+dependencies = [
+ "cc",
+ "cmake",
+ "dunce",
+ "fs_extra",
+]
+
 [[package]]
 name = "aws-runtime"
-version = "1.5.12"
+version = "1.7.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bfa006bb32360ed90ac51203feafb9d02e3d21046e1fd3a450a404b90ea73e5d"
+checksum = "ede2ddc593e6c8acc6ce3358c28d6677a6dc49b65ba4b37a2befe14a11297e75"
 dependencies = [
  "aws-credential-types",
  "aws-sigv4",
@@ -236,9 +252,12 @@ dependencies = [
  "aws-smithy-types",
  "aws-types",
  "bytes",
+ "bytes-utils",
  "fastrand",
  "http 0.2.12",
+ "http 1.4.0",
  "http-body 0.4.6",
+ "http-body 1.0.1",
  "percent-encoding",
  "pin-project-lite",
  "tracing",
@@ -247,9 +266,9 @@

 [[package]]
 name = "aws-sdk-s3"
-version = "1.109.0"
+version = "1.124.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3c6d81b75f8ff78882e70c5909804b44553d56136899fb4015a0a68ecc870e0e"
+checksum = "744c09d75dfec039a05cf8e117c995ded3b0baffa6eb83f3ed7075a01d8d8947"
 dependencies = [
  "aws-credential-types",
  "aws-runtime",
@@ -259,6 +278,7 @@ dependencies = [
  "aws-smithy-eventstream",
  "aws-smithy-http",
  "aws-smithy-json",
+ "aws-smithy-observability",
  "aws-smithy-runtime",
  "aws-smithy-runtime-api",
  "aws-smithy-types",
@@ -269,10 +289,9 @@ dependencies = [
  "hex",
  "hmac",
  "http 0.2.12",
- "http 1.3.1",
- "http-body 0.4.6",
+ "http 1.4.0",
  "http-body 1.0.1",
- "lru 0.12.5",
+ "lru",
  "percent-encoding",
  "regex-lite",
  "sha2",
@@ -282,15 +301,16 @@

 [[package]]
 name = "aws-sdk-sso"
-version = "1.86.0"
+version = "1.95.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4a0abbfab841446cce6e87af853a3ba2cc1bc9afcd3f3550dd556c43d434c86d"
+checksum = "00c5ff27c6ba2cbd95e6e26e2e736676fdf6bcf96495b187733f521cfe4ce448"
 dependencies = [
  "aws-credential-types",
  "aws-runtime",
  "aws-smithy-async",
  "aws-smithy-http",
  "aws-smithy-json",
+ "aws-smithy-observability",
  "aws-smithy-runtime",
  "aws-smithy-runtime-api",
  "aws-smithy-types",
@@ -298,21 +318,23 @@ dependencies = [
  "bytes",
  "fastrand",
  "http 0.2.12",
+ "http 1.4.0",
  "regex-lite",
  "tracing",
 ]

 [[package]]
 name = "aws-sdk-ssooidc"
-version = "1.88.0"
+version = "1.97.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9a68d675582afea0e94d38b6ca9c5aaae4ca14f1d36faa6edb19b42e687e70d7"
+checksum = "4d186f1e5a3694a188e5a0640b3115ccc6e084d104e16fd6ba968dca072ffef8"
 dependencies = [
  "aws-credential-types",
  "aws-runtime",
  "aws-smithy-async",
  "aws-smithy-http",
  "aws-smithy-json",
+ "aws-smithy-observability",
  "aws-smithy-runtime",
  "aws-smithy-runtime-api",
  "aws-smithy-types",
@@ -320,21 +342,23 @@ dependencies = [
  "bytes",
  "fastrand",
  "http 0.2.12",
+ "http 1.4.0",
  "regex-lite",
  "tracing",
 ]

 [[package]]
 name = "aws-sdk-sts"
-version = "1.88.0"
+version = "1.99.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d30990923f4f675523c51eb1c0dec9b752fb267b36a61e83cbc219c9d86da715"
+checksum = "9acba7c62f3d4e2408fa998a3a8caacd8b9a5b5549cf36e2372fbdae329d5449"
 dependencies = [
  "aws-credential-types",
  "aws-runtime",
  "aws-smithy-async",
  "aws-smithy-http",
  "aws-smithy-json",
+ "aws-smithy-observability",
  "aws-smithy-query",
  "aws-smithy-runtime",
  "aws-smithy-runtime-api",
@@ -343,15 +367,16 @@ dependencies = [
  "aws-types",
  "fastrand",
  "http 0.2.12",
+ "http 1.4.0",
  "regex-lite",
  "tracing",
 ]

 [[package]]
 name = "aws-sigv4"
-version = "1.3.5"
+version = "1.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bffc03068fbb9c8dd5ce1c6fb240678a5cffb86fb2b7b1985c999c4b83c8df68"
+checksum = "37411f8e0f4bea0c3ca0958ce7f18f6439db24d555dbd809787262cd00926aa9"
 dependencies = [
  "aws-credential-types",
  "aws-smithy-eventstream",
@@ -363,7 +388,7 @@ dependencies = [
  "hex",
  "hmac",
  "http 0.2.12",
- "http 1.3.1",
+ "http 1.4.0",
  "percent-encoding",
  "sha2",
  "time",
@@ -372,9 +397,9 @@

 [[package]]
 name = "aws-smithy-async"
-version = "1.2.6"
+version = "1.2.13"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "127fcfad33b7dfc531141fda7e1c402ac65f88aca5511a4d31e2e3d2cd01ce9c"
+checksum = "5cc50d0f63e714784b84223abd7abbc8577de8c35d699e0edd19f0a88a08ae13"
 dependencies = [
  "futures-util",
  "pin-project-lite",
@@ -383,17 +408,18 @@

 [[package]]
 name = "aws-smithy-checksums"
-version = "0.63.9"
+version = "0.64.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "165d8583d8d906e2fb5511d29201d447cc710864f075debcdd9c31c265412806"
+checksum = "180dddf5ef0f52a2f99e2fada10e16ea610e507ef6148a42bdc4d5867596aa00"
 dependencies = [
  "aws-smithy-http",
  "aws-smithy-types",
  "bytes",
  "crc-fast",
  "hex",
- "http 0.2.12",
- "http-body 0.4.6",
+ "http 1.4.0",
+ "http-body 1.0.1",
+ "http-body-util",
  "md-5",
  "pin-project-lite",
  "sha1",
@@ -403,9 +429,9 @@

 [[package]]
 name = "aws-smithy-eventstream"
-version = "0.60.12"
+version = "0.60.19"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9656b85088f8d9dc7ad40f9a6c7228e1e8447cdf4b046c87e152e0805dea02fa"
+checksum = "1c0b3e587fbaa5d7f7e870544508af8ce82ea47cd30376e69e1e37c4ac746f79"
 dependencies = [
  "aws-smithy-types",
  "bytes",
@@ -414,9 +440,9 @@

 [[package]]
 name = "aws-smithy-http"
-version = "0.62.4"
+version = "0.63.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3feafd437c763db26aa04e0cc7591185d0961e64c61885bece0fb9d50ceac671"
+checksum = "d619373d490ad70966994801bc126846afaa0d1ee920697a031f0cf63f2568e7"
 dependencies = [
  "aws-smithy-eventstream",
  "aws-smithy-runtime-api",
@@ -424,9 +450,10 @@ dependencies = [
  "bytes",
  "bytes-utils",
  "futures-core",
- "http 0.2.12",
- "http 1.3.1",
- "http-body 0.4.6",
+ "futures-util",
+ "http 1.4.0",
+ "http-body 1.0.1",
+ "http-body-util",
  "percent-encoding",
  "pin-project-lite",
  "pin-utils",
@@ -435,9 +462,9 @@

 [[package]]
 name = "aws-smithy-http-client"
-version = "1.1.3"
+version = "1.1.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1053b5e587e6fa40ce5a79ea27957b04ba660baa02b28b7436f64850152234f1"
+checksum = "00ccbb08c10f6bcf912f398188e42ee2eab5f1767ce215a02a73bc5df1bbdd95"
 dependencies = [
  "aws-smithy-async",
  "aws-smithy-protocol-test",
@@ -445,13 +472,13 @@
  "aws-smithy-types",
  "bytes",
  "h2 0.3.27",
- "h2 0.4.12",
+ "h2 0.4.13",
  "http 0.2.12",
- "http 1.3.1",
+ "http 1.4.0",
  "http-body 0.4.6",
  "http-body 1.0.1",
  "hyper 0.14.32",
- "indexmap 2.12.0",
+ "indexmap",
  "pin-project-lite",
  "serde",
  "serde_json",
@@ -461,27 +488,27 @@ dependencies = [

 [[package]]
 name = "aws-smithy-json"
-version = "0.61.6"
+version = "0.62.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cff418fc8ec5cadf8173b10125f05c2e7e1d46771406187b2c878557d4503390"
+checksum = "27b3a779093e18cad88bbae08dc4261e1d95018c4c5b9356a52bcae7c0b6e9bb"
 dependencies = [
  "aws-smithy-types",
 ]

 [[package]]
 name = "aws-smithy-observability"
-version = "0.1.4"
+version = "0.2.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2d1881b1ea6d313f9890710d65c158bdab6fb08c91ea825f74c1c8c357baf4cc"
+checksum = "4d3f39d5bb871aaf461d59144557f16d5927a5248a983a40654d9cf3b9ba183b"
 dependencies = [
  "aws-smithy-runtime-api",
 ]

 [[package]]
 name = "aws-smithy-protocol-test"
-version = "0.63.5"
+version = "0.63.13"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "09e4a766a447bf2aca69100278a6777cffcef2f97199f2443d481c698dd2887c"
+checksum = "dbd2bae1fe1f465dc0e1f8865c3b36867a34848178707a31f74f92279266c78d"
 dependencies = [
  "assert-json-diff",
  "aws-smithy-runtime-api",
@@ -493,14 +520,14 @@ dependencies = [
  "regex-lite",
  "roxmltree",
  "serde_json",
- "thiserror 2.0.17",
+ "thiserror 2.0.18",
 ]

 [[package]]
 name = "aws-smithy-query"
-version = "0.60.8"
+version = "0.60.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d28a63441360c477465f80c7abac3b9c4d075ca638f982e605b7dc2a2c7156c9"
+checksum = "05f76a580e3d8f8961e5d48763214025a2af65c2fa4cd1fb7f270a0e107a71b0"
 dependencies = [
  "aws-smithy-types",
  "urlencoding",
@@ -508,9 +535,9 @@ dependencies = [

 [[package]]
 name = "aws-smithy-runtime"
-version = "1.9.3"
+version = "1.10.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "40ab99739082da5347660c556689256438defae3bcefd66c52b095905730e404"
+checksum = "22ccf7f6eba8b2dcf8ce9b74806c6c185659c311665c4bf8d6e71ebd454db6bf"
 dependencies = [
  "aws-smithy-async",
  "aws-smithy-http",
@@ -521,9 +548,10 @@ dependencies = [
  "bytes",
  "fastrand",
  "http 0.2.12",
- "http 1.3.1",
+ "http 1.4.0",
  "http-body 0.4.6",
  "http-body 1.0.1",
+ "http-body-util",
  "pin-project-lite",
  "pin-utils",
  "tokio",
@@ -533,15 +561,15 @@ dependencies = [

 [[package]]
 name = "aws-smithy-runtime-api"
-version = "1.9.1"
+version = "1.11.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3683c5b152d2ad753607179ed71988e8cfd52964443b4f74fd8e552d0bbfeb46"
+checksum = "b4af6e5def28be846479bbeac55aa4603d6f7986fc5da4601ba324dd5d377516"
 dependencies = [
  "aws-smithy-async",
  "aws-smithy-types",
  "bytes",
  "http 0.2.12",
- "http 1.3.1",
+ "http 1.4.0",
  "pin-project-lite",
  "tokio",
  "tracing",
@@ -550,16 +578,16 @@ dependencies = [

 [[package]]
 name = "aws-smithy-types"
-version = "1.3.3"
+version = "1.4.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9f5b3a7486f6690ba25952cabf1e7d75e34d69eaff5081904a47bc79074d6457"
+checksum = "8ca2734c16913a45343b37313605d84e7d8b34a4611598ce1d25b35860a2bed3"
 dependencies = [
  "base64-simd",
  "bytes",
  "bytes-utils",
  "futures-core",
  "http 0.2.12",
- "http 1.3.1",
+ "http 1.4.0",
  "http-body 0.4.6",
  "http-body 1.0.1",
  "http-body-util",
@@ -576,18 +604,18 @@ dependencies = [

 [[package]]
 name = "aws-smithy-xml"
-version = "0.60.11"
+version = "0.60.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e9c34127e8c624bc2999f3b657e749c1393bedc9cd97b92a804db8ced4d2e163"
+checksum = "b53543b4b86ed43f051644f704a98c7291b3618b67adf057ee77a366fa52fcaa"
 dependencies = [
  "xmlparser",
 ]

 [[package]]
 name = "aws-types"
-version = "1.3.9"
+version = "1.3.13"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e2fd329bf0e901ff3f60425691410c69094dc2a1f34b331f37bfc4e9ac1565a1"
+checksum = "0470cc047657c6e286346bdf10a8719d26efd6a91626992e0e64481e44323e96"
 dependencies = [
  "aws-credential-types",
  "aws-smithy-async",
@@ -599,14 +627,14 @@ dependencies = [

 [[package]]
 name = "axum"
-version = "0.8.6"
+version = "0.8.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8a18ed336352031311f4e0b4dd2ff392d4fbb370777c9d18d7fc9d7359f73871"
+checksum = "8b52af3cb4058c895d37317bb27508dccc8e5f2d39454016b297bf4a400597b8"
 dependencies = [
  "axum-core",
  "bytes",
  "futures-util",
- "http 1.3.1",
+ "http 1.4.0",
  "http-body 1.0.1",
  "http-body-util",
  "itoa",
@@ -617,20 +645,20 @@ dependencies = [
  "pin-project-lite",
  "serde_core",
  "sync_wrapper",
- "tower 0.5.2",
+ "tower",
  "tower-layer",
  "tower-service",
 ]

 [[package]]
 name = "axum-core"
-version = "0.5.5"
+version = "0.5.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "59446ce19cd142f8833f856eb31f3eb097812d1479ab224f54d72428ca21ea22"
+checksum = "08c78f31d7b1291f7ee735c1c6780ccde7785daae9a9206026862dab7d8792d1"
 dependencies = [
  "bytes",
  "futures-core",
- "http 1.3.1",
+ "http 1.4.0",
  "http-body 1.0.1",
  "http-body-util",
  "mime",
@@ -655,12 +683,6 @@ version = "0.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4c7f02d4ea65f2c1853089ffd8d2787bdbc63de2f0d29dedbcf8ccdfa0ccd4cf"

-[[package]]
-name = "base64"
-version = "0.13.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8"
-
 [[package]]
 name = "base64"
 version = "0.22.1"
@@ -679,9 +701,9 @@ dependencies = [

 [[package]]
 name = "base64ct"
-version = "1.8.0"
+version = "1.8.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "55248b47b0caf0546f7988906588779981c43bb1bc9d0c44087278f80cdb44ba"
+checksum = "2af50177e190e07a26ab74f8b1efbfe2ef87da2116221318cb1c2e82baf7de06"

 [[package]]
 name = "bincode"
@@ -695,15 +717,9 @@ dependencies = [

 [[package]]
 name = "bitflags"
-version = "1.3.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
-
-[[package]]
-name = "bitflags"
-version = "2.10.0"
+version = "2.11.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "812e12b5285cc515a9c72a5c1d3b6d46a19dac5acfef5265968c166106e31dd3"
+checksum = "843867be96c8daad0d758b57df9392b6d8d271134fce549de6ce169ff98a92af"

 [[package]]
 name = "bitvec"
@@ -719,16 +735,18 @@ dependencies = [

 [[package]]
 name = "blake3"
-version = "1.8.2"
+version = "1.8.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3888aaa89e4b2a40fca9848e400f6a658a5a3978de7be858e209cafa8be9a4a0"
+checksum = "2468ef7d57b3fb7e16b576e8377cdbde2320c60e1491e961d11da40fc4f02a2d"
 dependencies = [
  "arrayref",
  "arrayvec",
  "cc",
  "cfg-if",
  "constant_time_eq",
+ "cpufeatures",
  "memmap2",
+ "rayon-core",
 ]

 [[package]]
@@ -756,12 +774,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7969a9ba84b0ff843813e7249eed1678d9b6607ce5a3b8f0a47af3fcf7978e6e"
 dependencies = [
  "ahash",
- "base64 0.22.1",
+ "base64",
  "bitvec",
- "getrandom 0.2.16",
+ "getrandom 0.2.17",
  "getrandom 0.3.4",
  "hex",
- "indexmap 2.12.0",
+ "indexmap",
  "js-sys",
  "once_cell",
  "rand 0.9.2",
@@ -774,15 +792,15 @@ dependencies = [

 [[package]]
 name = "bumpalo"
-version = "3.19.0"
+version = "3.20.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "46c5e41b57b8bba42a04676d81cb89e9ee8e859a1a66f80a5a72e1cb76b34d43"
+checksum = "5d20789868f4b01b2f2caec9f5c4e0213b41e3e5702a50157d699ae31ced2fcb"

 [[package]]
 name = "byte-unit"
-version = "5.1.6"
+version = "5.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e1cd29c3c585209b0cbc7309bfe3ed7efd8c84c21b7af29c8bfae908f8777174"
+checksum = "8c6d47a4e2961fb8721bcfc54feae6455f2f64e7054f9bc67e875f0e77f4c58d"
 dependencies = [
  "rust_decimal",
  "utf8-width",
@@ -790,9 +808,9 @@ dependencies = [

 [[package]]
 name = "bytemuck"
-version = "1.24.0"
+version = "1.25.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1fbdf580320f38b612e485521afda1ee26d10cc9884efaaa750d383e13e3c5f4"
+checksum = "c8efb64bd706a16a1bdde310ae86b351e4d21550d98d056f22f8a7f7a2183fec"

 [[package]]
 name = "byteorder"
@@ -837,9 +855,9 @@ dependencies = [

 [[package]]
 name = "cc"
-version = "1.2.41"
+version = "1.2.56"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ac9fe6cdbb24b6ade63616c0a0688e45bb56732262c158df3c0c4bea4ca47cb7"
+checksum = "aebf35691d1bfb0ac386a69bac2fde4dd276fb618cf8bf4f5318fe285e821bb2"
 dependencies = [
  "find-msvc-tools",
  "jobserver",
@@ -867,14 +885,11 @@ checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724"

 [[package]]
 name = "chrono"
-version = "0.4.42"
+version = "0.4.44"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "145052bdd345b87320e369255277e3fb5152762ad123a901ef5c262dd38fe8d2"
+checksum = "c673075a2e0e5f4a1dde27ce9dee1ea4558c7ffe648f576438a20ca1d2acc4b0"
 dependencies = [
- "iana-time-zone",
  "num-traits",
- "serde",
- "windows-link",
 ]

 [[package]]
@@ -906,9 +921,9 @@ dependencies = [

 [[package]]
 name = "clap"
-version = "4.5.50"
+version = "4.5.60"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0c2cfd7bf8a6017ddaa4e32ffe7403d547790db06bd171c1c53926faab501623"
+checksum = "2797f34da339ce31042b27d23607e051786132987f595b02ba4f6a6dffb7030a"
 dependencies = [
  "clap_builder",
  "clap_derive",
@@ -916,9 +931,9 @@ dependencies = [

 [[package]]
 name = "clap_builder"
-version = "4.5.50"
+version = "4.5.60"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0a4c05b9e80c5ccd3a7ef080ad7b6ba7d6fc00a985b8b157197075677c82c7a0"
+checksum = "24a241312cea5059b13574bb9b3861cabf758b879c15190b37b6d6fd63ab6876"
 dependencies = [
  "anstream",
  "anstyle",
@@ -928,9 +943,9 @@ dependencies = [

 [[package]]
 name = "clap_derive"
-version = "4.5.49"
+version = "4.5.55"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2a0b5487afeab2deb2ff4e03a807ad1a03ac532ff5a2cee5d86884440c7f7671"
+checksum = "a92793da1a46a5f2a02a6f4c46c6496b28c43638adea8306fcb0caa1634f24e5"
 dependencies = [
  "heck",
  "proc-macro2",
@@ -940,9 +955,18 @@ dependencies = [

 [[package]]
 name = "clap_lex"
-version = "0.7.6"
+version = "1.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a1d728cc89cf3aee9ff92b05e62b19ee65a02b5702cff7d5a377e32c6ae29d8d"
+checksum = "3a822ea5bc7590f9d40f1ba12c0dc3c2760f3482c6984db1573ad11031420831"
+
+[[package]]
+name = "cmake"
+version = "0.1.57"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "75443c44cd6b379beb8c5b45d85d0773baf31cce901fe7bb252f4eff3008ef7d"
+dependencies = [
+ "cc",
+]

 [[package]]
 name = "colorchoice"
@@ -994,7 +1018,7 @@ version = "0.1.16"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f9d839f2a20b0aee515dc581a6172f2321f96cab76c1a38a4c584a194955390e"
 dependencies = [
- "getrandom 0.2.16",
+ "getrandom 0.2.17",
  "once_cell",
  "tiny-keccak",
 ]
@@ -1021,15 +1045,18 @@ dependencies = [

 [[package]]
 name = "constant_time_eq"
-version = "0.3.1"
+version = "0.4.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7c74b8349d32d297c9134b8c88677813a227df8f779daa29bfc29c183fe3dca6"
+checksum = "3d52eff69cd5e647efe296129160853a42795992097e8af39800e1060caeea9b"

 [[package]]
 name = "convert_case"
-version = "0.4.0"
+version = "0.10.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6245d59a3e82a7fc217c5828a6692dbc6dfb63a0c8c90495621f7b9d79704a0e"
+checksum = "633458d4ef8c78b72454de2d54fd6ab2e60f9e02be22f3c6104cdc8a4e0fceb9"
+dependencies = [
+ "unicode-segmentation",
+]

 [[package]]
 name = "cookie-factory"
@@ -1079,15 +1106,14 @@ checksum = "19d374276b40fb8bbdee95aef7c7fa6b5316ec764510eb64b8dd0e2ed0d7e7f5"

 [[package]]
 name = "crc-fast"
-version = "1.3.0"
+version = "1.9.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6bf62af4cc77d8fe1c22dde4e721d87f2f54056139d8c412e1366b740305f56f"
+checksum = "2fd92aca2c6001b1bf5ba0ff84ee74ec8501b52bbef0cac80bf25a6c1d87a83d"
 dependencies = [
  "crc",
  "digest",
- "libc",
- "rand 0.9.2",
- "regex",
+ "rustversion",
+ "spin 0.10.0",
 ]

 [[package]]
@@ -1105,6 +1131,25 @@ dependencies = [
  "cfg-if",
 ]

+[[package]]
+name = "crossbeam-deque"
+version = "0.8.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51"
+dependencies = [
+ "crossbeam-epoch",
+ "crossbeam-utils",
+]
+
+[[package]]
+name = "crossbeam-epoch"
+version = "0.9.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
+dependencies = [
+ "crossbeam-utils",
+]
+
 [[package]]
 name = "crossbeam-utils"
 version = "0.8.21"
@@ -1203,9 +1248,9 @@ dependencies = [

 [[package]]
 name = "data-encoding"
-version = "2.9.0"
+version = "2.10.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2a2330da5de22e8a3cb63252ce2abb30116bf5265e89c0e01bc17015ce30a476"
+checksum = "d7a1e2f27636f116493b8b860f5546edb47c8d8f8ea73e1d2a20be88e28d1fea"

 [[package]]
 name = "der"
@@ -1220,9 +1265,9 @@ dependencies = [
 [[package]]
 name = "deranged"
-version = "0.5.4"
+version = "0.5.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a41953f86f8a05768a6cda24def994fd2f424b04ec5c719cf89989779f199071"
+checksum = "7cd812cc2bc1d69d4764bd80df88b4317eaef9e773c75226407d9bc0876b211c"
 dependencies = [
  "powerfmt",
  "serde_core",
 ]
@@ -1252,32 +1297,20 @@ dependencies = [

 [[package]]
 name = "derive_more"
-version = "0.99.20"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6edb4b64a43d977b8e99788fe3a04d483834fba1215a7e02caa415b626497f7f"
-dependencies = [
- "convert_case",
- "proc-macro2",
- "quote",
- "rustc_version",
- "syn",
-]
-
-[[package]]
-name = "derive_more"
-version = "2.1.0"
+version = "2.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "10b768e943bed7bf2cab53df09f4bc34bfd217cdb57d971e769874c9a6710618"
+checksum = "d751e9e49156b02b44f9c1815bcb94b984cdcc4396ecc32521c739452808b134"
 dependencies = [
  "derive_more-impl",
 ]

 [[package]]
 name = "derive_more-impl"
-version = "2.1.0"
+version = "2.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6d286bfdaf75e988b4a78e013ecd79c581e06399ab53fbacd2d916c2f904f30b"
+checksum = "799a97264921d8623a957f6c3b9011f3b5492f557bbb7a5a19b7fa6d06ba8dcb"
 dependencies = [
+ "convert_case",
  "proc-macro2",
  "quote",
  "rustc_version",
@@ -1315,10 +1348,10 @@ dependencies = [
 ]

 [[package]]
-name = "dyn-clone"
-version = "1.0.20"
+name = "dunce"
+version = "1.0.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d0881ea181b1df73ff77ffaaf9c7544ecc11e82fba9b5f27b262a3c73a332555"
+checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813"

 [[package]]
 name = "ecdsa"
@@ -1455,21 +1488,20 @@ checksum = "28dea519a9695b9977216879a3ebfddf92f1c08c05d984f8996aecd6ecdc811d"

 [[package]]
 name = "filetime"
-version = "0.2.26"
+version = "0.2.27"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bc0505cd1b6fa6580283f6bdf70a73fcf4aba1184038c90902b92b3dd0df63ed"
+checksum = "f98844151eee8917efc50bd9e8318cb963ae8b297431495d3f758616ea5c57db"
 dependencies = [
  "cfg-if",
  "libc",
  "libredox",
- "windows-sys 0.60.2",
 ]

 [[package]]
 name = "find-msvc-tools"
-version = "0.1.4"
+version = "0.1.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "52051878f80a721bb68ebfbc930e07b65ba72f2da88968ea5c06fd6ca3d3a127"
+checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582"

 [[package]]
 name = "fixedbitset"
@@ -1479,9 +1511,9 @@ checksum = "1d674e81391d1e1ab681a28d99df07927c6d4aa5b027d7da16ba32d1d21ecd99"

 [[package]]
 name = "flate2"
-version = "1.1.4"
+version = "1.1.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dc5a4e564e38c699f2880d3fda590bedc2e69f3f84cd48b457bd892ce61d0aa9"
+checksum = "843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c"
 dependencies = [
  "crc32fast",
  "miniz_oxide",
@@ -1499,6 +1531,12 @@ version = "0.1.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2"

+[[package]]
+name = "foldhash"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb"
+
 [[package]]
 name = "form_urlencoded"
 version = "1.2.2"
@@ -1514,6 +1552,12 @@ version = "0.2.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d8866fac38f53fc87fa3ae1b09ddd723e0482f8fa74323518b4c59df2c55a00a"

+[[package]]
+name = "fs_extra"
+version = "1.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c"
+
 [[package]]
 name = "funty"
 version = "2.0.0"
@@ -1522,9 +1566,9 @@ checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c"

 [[package]]
 name = "futures"
-version = "0.3.31"
+version = "0.3.32"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "65bc07b1a8bc7c85c5f2e110c476c7389b4554ba72af57d8445ea63a576b0876"
+checksum = "8b147ee9d1f6d097cef9ce628cd2ee62288d963e16fb287bd9286455b241382d"
 dependencies = [
  "futures-channel",
  "futures-core",
@@ -1537,9 +1581,9 @@ dependencies = [

 [[package]]
 name = "futures-channel"
-version = "0.3.31"
+version = "0.3.32"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10"
+checksum = "07bbe89c50d7a535e539b8c17bc0b49bdb77747034daa8087407d655f3f7cc1d"
 dependencies = [
  "futures-core",
  "futures-sink",
@@ -1547,15 +1591,15 @@ dependencies = [

 [[package]]
 name = "futures-core"
-version = "0.3.31"
+version = "0.3.32"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e"
+checksum = "7e3450815272ef58cec6d564423f6e755e25379b217b0bc688e295ba24df6b1d"

 [[package]]
 name = "futures-executor"
-version = "0.3.31"
+version = "0.3.32"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1e28d1d997f585e54aebc3f97d39e72338912123a67330d723fdbb564d646c9f"
+checksum = "baf29c38818342a3b26b5b923639e7b1f4a61fc5e76102d4b1981c6dc7a7579d"
 dependencies = [
  "futures-core",
  "futures-task",
@@ -1564,15 +1608,15 @@ dependencies = [

 [[package]]
 name = "futures-io"
-version = "0.3.31"
+version = "0.3.32"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6"
+checksum = "cecba35d7ad927e23624b22ad55235f2239cfa44fd10428eecbeba6d6a717718"

 [[package]]
 name = "futures-macro"
-version = "0.3.31"
+version = "0.3.32"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650"
+checksum = "e835b70203e41293343137df5c0664546da5745f82ec9b84d40be8336958447b"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -1581,21 +1625,21 @@ dependencies = [

 [[package]]
 name = "futures-sink"
-version = "0.3.31"
+version = "0.3.32"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e575fab7d1e0dcb8d0c7bcf9a63ee213816ab51902e6d244a95819acacf1d4f7"
+checksum = "c39754e157331b013978ec91992bde1ac089843443c49cbc7f46150b0fad0893"

 [[package]]
 name = "futures-task"
-version = "0.3.31"
+version = "0.3.32"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988"
+checksum = "037711b3d59c33004d3856fbdc83b99d4ff37a24768fa1be9ce3538a1cde4393"

 [[package]]
 name = "futures-util"
-version = "0.3.31"
+version = "0.3.32"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81"
+checksum = "389ca41296e6190b48053de0321d02a77f32f8a5d2461dd38762c0593805c6d6"
 dependencies = [
  "futures-channel",
  "futures-core",
@@ -1605,25 +1649,23 @@ dependencies = [
  "futures-task",
  "memchr",
  "pin-project-lite",
- "pin-utils",
  "slab",
 ]

 [[package]]
 name = "gcloud-auth"
-version = "1.2.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5bdedbc36e6b9d8d79558fbf2ebc098745bc721e9d37d3e369558e420038e360"
+version = "1.3.0"
+source = "git+https://github.com/yoshidan/google-cloud-rust?rev=e0e790b9d4de1fbd7085dc98fde21eaf9573899a#e0e790b9d4de1fbd7085dc98fde21eaf9573899a"
 dependencies = [
  "async-trait",
- "base64 0.22.1",
+ "base64",
  "gcloud-metadata",
  "home",
  "jsonwebtoken",
  "reqwest",
  "serde",
  "serde_json",
- "thiserror 2.0.17",
+ "thiserror 2.0.18",
  "time",
  "token-source",
  "tokio",
@@ -1634,22 +1676,20 @@ dependencies = [
 [[package]]
 name = "gcloud-metadata"
 version = "1.0.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "61f706788c1b58712c513e4d403234707fd255f49caa89d1c930197418b5fb2c"
+source = "git+https://github.com/yoshidan/google-cloud-rust?rev=e0e790b9d4de1fbd7085dc98fde21eaf9573899a#e0e790b9d4de1fbd7085dc98fde21eaf9573899a"
 dependencies = [
  "reqwest",
- "thiserror 2.0.17",
+ "thiserror 2.0.18",
  "tokio",
 ]

 [[package]]
 name = "gcloud-storage"
-version = "1.1.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e3515c85ca8d12aaf1104c9765f46d91a9ddd2a62b853fe12db109a40cde06e1"
+version = "1.3.0"
+source = "git+https://github.com/yoshidan/google-cloud-rust?rev=e0e790b9d4de1fbd7085dc98fde21eaf9573899a#e0e790b9d4de1fbd7085dc98fde21eaf9573899a"
 dependencies = [
  "anyhow",
- "base64 0.22.1",
+ "base64",
  "bytes",
  "futures-util",
  "gcloud-auth",
@@ -1665,7 +1705,7 @@ dependencies = [
  "serde",
  "serde_json",
  "sha2",
- "thiserror 2.0.17",
+ "thiserror 2.0.18",
  "time",
  "token-source",
  "tokio",
@@ -1686,9 +1726,9 @@ dependencies = [

 [[package]]
 name = "getrandom"
-version = "0.2.16"
+version = "0.2.17"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "335ff9f135e4384c8150d6f27c6daed433577f86b4750418338c01a1a2528592"
+checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0"
 dependencies = [
  "cfg-if",
  "js-sys",
@@ -1712,10 +1752,17 @@ dependencies = [
 ]

 [[package]]
-name = "glob"
-version = "0.3.3"
+name = "getrandom"
+version = "0.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280"
+checksum = "139ef39800118c7683f2fd3c98c1b23c09ae076556b435f8e9064ae108aaeeec"
+dependencies = [
+ "cfg-if",
+ "libc",
+ "r-efi",
+ "wasip2",
+ "wasip3",
+]

 [[package]]
 name = "group"
@@ -1740,7 +1787,7 @@ dependencies = [
  "futures-sink",
  "futures-util",
  "http 0.2.12",
- "indexmap 2.12.0",
+ "indexmap",
  "slab",
  "tokio",
  "tokio-util",
@@ -1749,17 +1796,17 @@ dependencies = [

 [[package]]
 name = "h2"
-version = "0.4.12"
+version = "0.4.13"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f3c0b69cfcb4e1b9f1bf2f53f95f766e4661169728ec61cd3fe5a0166f2d1386"
+checksum = "2f44da3a8150a6703ed5d34e164b875fd14c2cdab9af1252a9a1020bde2bdc54"
 dependencies = [
  "atomic-waker",
  "bytes",
  "fnv",
  "futures-core",
  "futures-sink",
- "http 1.3.1",
- "indexmap 2.12.0",
+ "http 1.4.0",
+ "indexmap",
  "slab",
  "tokio",
  "tokio-util",
@@ -1777,28 +1824,25 @@ dependencies = [
  "zerocopy",
 ]

-[[package]]
-name = "hashbrown"
-version = "0.12.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888"
-
 [[package]]
 name = "hashbrown"
 version = "0.15.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1"
 dependencies = [
- "allocator-api2",
- "equivalent",
- "foldhash",
+ "foldhash 0.1.5",
 ]

 [[package]]
 name = "hashbrown"
-version = "0.16.0"
+version = "0.16.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5419bdc4f6a9207fbeba6d11b604d481addf78ecd10c11ad51e76c2f6482748d"
+checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100"
+dependencies = [
+ "allocator-api2",
+ "equivalent",
+ "foldhash 0.2.0",
+]

 [[package]]
 name = "heck"
@@ -1832,11 +1876,22 @@ dependencies = [

 [[package]]
 name = "home"
-version = "0.5.11"
+version = "0.5.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cc627f471c528ff0c4a49e1d5e60450c8f6461dd6d10ba9dcd3a61d3dff7728d"
+dependencies = [
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "hostname"
+version = "0.4.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "589533453244b0995c858700322199b2becb13b627df2851f64a2775d024abcf"
+checksum = "617aaa3557aef3810a6369d0a99fac8a080891b68bd9f9812a1eeda0c0730cbd"
 dependencies = [
- "windows-sys 0.59.0",
+ "cfg-if",
+ "libc",
+ "windows-link",
 ]

 [[package]]
@@ -1852,12 +1907,11 @@ dependencies = [

 [[package]]
 name = "http"
-version = "1.3.1"
+version = "1.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f4a85d31aea989eead29a3aaf9e1115a180df8282431156e533de47660892565"
+checksum = "e3ba2a386d7f85a81f119ad7498ebe444d2e22c2af0b86b069416ace48b3311a"
 dependencies = [
  "bytes",
- "fnv",
  "itoa",
 ]

@@ -1879,7 +1933,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184"
 dependencies = [
  "bytes",
- "http 1.3.1",
+ "http 1.4.0",
 ]

 [[package]]
@@ -1890,7 +1944,7 @@ checksum = "b021d93e26becf5dc7e1b75b1bed1fd93124b374ceb73f43d4d4eafec896a64a"
 dependencies = [
  "bytes",
  "futures-core",
- "http 1.3.1",
+ "http 1.4.0",
  "http-body 1.0.1",
  "pin-project-lite",
 ]
@@ -1939,16 +1993,16 @@ dependencies = [

 [[package]]
 name = "hyper"
-version = "1.7.0"
+version = "1.8.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "eb3aa54a13a0dfe7fbe3a59e0c76093041720fdc77b110cc0fc260fafb4dc51e"
+checksum = "2ab2d4f250c3d7b1c9fcdff1cece94ea4e2dfbec68614f7b87cb205f24ca9d11"
 dependencies = [
  "atomic-waker",
  "bytes",
  "futures-channel",
  "futures-core",
- "h2 0.4.12",
- "http 1.3.1",
+ "h2 0.4.13",
+ "http 1.4.0",
  "http-body 1.0.1",
  "httparse",
  "httpdate",
@@ -1966,8 +2020,8 @@ version = "0.27.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e3c93eb611681b207e1fe55d5a71ecf91572ec8a6705cdb6857f7d8d5242cf58"
 dependencies = [
- "http 1.3.1",
- "hyper 1.7.0",
+ "http 1.4.0",
+ "hyper 1.8.1",
  "hyper-util",
  "rustls",
  "rustls-native-certs",
@@ -1976,7 +2030,6 @@ dependencies = [
  "tokio",
  "tokio-rustls",
  "tower-service",
- "webpki-roots 1.0.3",
 ]

 [[package]]
@@ -1985,7 +2038,7 @@ version = "0.5.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2b90d566bffbce6a75bd8b09a05aa8c2cb1fabb6cb348f8840c9e4c90a0d83b0"
 dependencies = [
- "hyper 1.7.0",
+ "hyper 1.8.1",
  "hyper-util",
  "pin-project-lite",
  "tokio",
@@ -1994,57 +2047,32 @@ dependencies = [

 [[package]]
 name = "hyper-util"
-version = "0.1.17"
+version = "0.1.20"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3c6995591a8f1380fcb4ba966a252a4b29188d51d2b89e3a252f5305be65aea8"
+checksum = "96547c2556ec9d12fb1578c4eaf448b04993e7fb79cbaad930a656880a6bdfa0"
 dependencies = [
- "base64 0.22.1",
+ "base64",
  "bytes",
  "futures-channel",
- "futures-core",
  "futures-util",
- "http 1.3.1",
+ "http 1.4.0",
  "http-body 1.0.1",
- "hyper 1.7.0",
+ "hyper 1.8.1",
  "ipnet",
  "libc",
  "percent-encoding",
  "pin-project-lite",
- "socket2 0.6.1",
+ "socket2 0.6.2",
  "tokio",
  "tower-service",
  "tracing",
 ]

-[[package]]
-name = "iana-time-zone"
-version = "0.1.64"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "33e57f83510bb73707521ebaffa789ec8caf86f9657cad665b092b581d40e9fb"
-dependencies = [
- "android_system_properties",
- "core-foundation-sys",
- "iana-time-zone-haiku",
- "js-sys",
- "log",
- "wasm-bindgen",
- "windows-core",
-]
-
-[[package]]
-name = "iana-time-zone-haiku"
-version = "0.1.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f"
-dependencies = [
- "cc",
-]
-
 [[package]]
 name = "icu_collections"
-version = "2.0.0"
+version = "2.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "200072f5d0e3614556f94a9930d5dc3e0662a652823904c3a75dc3b0af7fee47"
+checksum = "4c6b649701667bbe825c3b7e6388cb521c23d88644678e83c0c4d0a621a34b43"
 dependencies = [
  "displaydoc",
  "potential_utf",
@@ -2055,9 +2083,9 @@ dependencies = [

 [[package]]
 name = "icu_locale_core"
-version = "2.0.0"
+version = "2.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0cde2700ccaed3872079a65fb1a78f6c0a36c91570f28755dda67bc8f7d9f00a"
+checksum = "edba7861004dd3714265b4db54a3c390e880ab658fec5f7db895fae2046b5bb6"
 dependencies = [
  "displaydoc",
  "litemap",
@@ -2068,11 +2096,10 @@ dependencies = [

 [[package]]
 name = "icu_normalizer"
-version = "2.0.0"
+version = "2.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "436880e8e18df4d7bbc06d58432329d6458cc84531f7ac5f024e93deadb37979"
+checksum = "5f6c8828b67bf8908d82127b2054ea1b4427ff0230ee9141c54251934ab1b599"
 dependencies = [
- "displaydoc",
  "icu_collections",
  "icu_normalizer_data",
  "icu_properties",
@@ -2083,42 +2110,38 @@ dependencies = [

 [[package]]
 name = "icu_normalizer_data"
-version = "2.0.0"
+version = "2.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "00210d6893afc98edb752b664b8890f0ef174c8adbb8d0be9710fa66fbbf72d3"
+checksum = "7aedcccd01fc5fe81e6b489c15b247b8b0690feb23304303a9e560f37efc560a"

 [[package]]
 name = "icu_properties"
-version = "2.0.1"
+version = "2.1.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "016c619c1eeb94efb86809b015c58f479963de65bdb6253345c1a1276f22e32b"
+checksum = "020bfc02fe870ec3a66d93e677ccca0562506e5872c650f893269e08615d74ec"
 dependencies = [
- "displaydoc",
  "icu_collections",
  "icu_locale_core",
  "icu_properties_data",
  "icu_provider",
- "potential_utf",
  "zerotrie",
  "zerovec",
 ]

 [[package]]
 name = "icu_properties_data"
-version = "2.0.1"
+version = "2.1.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "298459143998310acd25ffe6810ed544932242d3f07083eee1084d83a71bd632"
+checksum = "616c294cf8d725c6afcd8f55abc17c56464ef6211f9ed59cccffe534129c77af"

 [[package]]
 name = "icu_provider"
-version = "2.0.0"
+version = "2.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "03c80da27b5f4187909049ee2d72f276f0d9f99a42c306bd0131ecfe04d8e5af"
+checksum = "85962cf0ce02e1e0a629cc34e7ca3e373ce20dda4c4d7294bbd0bf1fdb59e614"
 dependencies = [
  "displaydoc",
  "icu_locale_core",
- "stable_deref_trait",
- "tinystr",
  "writeable",
  "yoke",
  "zerofrom",
@@ -2126,6 +2149,12 @@ dependencies = [
  "zerovec",
 ]

+[[package]]
+name = "id-arena"
+version = "2.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954"
+
 [[package]]
 name = "ident_case"
 version = "1.0.1"
@@ -2155,23 +2184,12 @@ dependencies = [

 [[package]]
 name = "indexmap"
-version = "1.9.3"
+version = "2.13.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99"
-dependencies = [
- "autocfg",
- "hashbrown 0.12.3",
- "serde",
-]
-
-[[package]]
-name = "indexmap"
-version = "2.12.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6717a8d2a5a929a1a2eb43a12812498ed141a0bcfb7e8f7844fbdbe4303bba9f"
+checksum = "7714e70437a7dc3ac8eb7e6f8df75fd8eb422675fc7678aff7364301092b1017"
 dependencies = [
  "equivalent",
- "hashbrown 0.16.0",
+ "hashbrown 0.16.1",
  "serde",
  "serde_core",
 ]
@@ -2184,9 +2202,9 @@ checksum = "469fb0b9cefa57e3ef31275ee7cacb78f2fdca44e4765491884a2b119d4eb130"

 [[package]]
 name = "iri-string"
-version = "0.7.8"
+version = "0.7.10"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dbc5ebe9c3a1a7a5127f920a418f7585e9e758e911d0466ed004f393b0e380b2"
+checksum = "c91338f0783edbd6195decb37bae672fd3b165faffb89bf7b9e6942f8b1a731a"
 dependencies = [
  "memchr",
  "serde",
@@ -2209,9 +2227,9 @@ dependencies = [

 [[package]]
 name = "itoa"
-version = "1.0.15"
+version = "1.0.17"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c"
+checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2"

 [[package]]
 name = "jni"
@@ -2247,9 +2265,9 @@ dependencies = [

 [[package]]
 name = "js-sys"
-version = "0.3.81"
+version = "0.3.90"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ec48937a97411dcb524a265206ccd4c90bb711fca92b2792c407f268825b9305"
+checksum = "14dc6f6450b3f6d4ed5b16327f38fed626d375a886159ca555bd7822c0c3a5a6"
 dependencies = [
  "once_cell",
  "wasm-bindgen",
@@ -2261,9 +2279,9 @@ version = "10.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0529410abe238729a60b108898784df8984c87f6054c9c4fcacc47e4803c1ce1"
 dependencies = [
- "base64 0.22.1",
+ "base64",
  "ed25519-dalek",
- "getrandom 0.2.16",
+ "getrandom 0.2.17",
  "hmac",
  "js-sys",
  "p256",
@@ -2284,20 +2302,26 @@ version = "1.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
 dependencies = [
- "spin",
+ "spin 0.9.8",
 ]

+[[package]]
+name = "leb128fmt"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2"
+
 [[package]]
 name = "libc"
-version = "0.2.177"
+version = "0.2.182"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2874a2af47a2325c2001a6e6fad9b16a53b802102b528163885171cf92b15976"
+checksum = "6800badb6cb2082ffd7b6a67e6125bb39f18782f793520caee8cb8846be06112"

 [[package]]
 name = "libm"
-version = "0.2.15"
+version = "0.2.16"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f9fbbcab51052fe104eb5e5d351cf728d30a5be1fe14d9be8a3b097481fb97de"
+checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981"

 [[package]]
 name = "libmimalloc-sys"
@@ -2311,26 +2335,26 @@ dependencies = [

 [[package]]
 name = "libredox"
-version = "0.1.10"
+version = "0.1.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "416f7e718bdb06000964960ffa43b4335ad4012ae8b99060261aa4a8088d5ccb"
+checksum = "3d0b95e02c851351f877147b7deea7b1afb1df71b63aa5f8270716e0c5720616"
 dependencies = [
- "bitflags 2.10.0",
+ "bitflags",
  "libc",
- "redox_syscall",
+ "redox_syscall 0.7.2",
 ]

 [[package]]
 name = "linux-raw-sys"
-version = "0.11.0"
+version = "0.12.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "df1d3c3b53da64cf5760482273a98e575c651a67eec7f77df96b5b642de8f039"
+checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53"

 [[package]]
 name = "litemap"
-version = "0.8.0"
+version = "0.8.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "241eaef5fd12c88705a01fc1066c48c4b36e0dd4377dcdc7ec3942cea7a69956"
+checksum = "6373607a59f0be73a39b6fe456b8192fcc3585f602af20751600e974dd455e77"

 [[package]]
 name = "lock_api"
@@ -2343,24 +2367,18 @@ dependencies = [

 [[package]]
 name = "log"
-version = "0.4.28"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "34080505efa8e45a4b816c349525ebe327ceaa8559756f0356cba97ef3bf7432"
-
-[[package]]
-name = "lru"
-version = "0.12.5"
+version = "0.4.29"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "234cf4f4a04dc1f57e24b96cc0cd600cf2af460d4161ac5ecdd0af8e1f3b2a38"
-dependencies = [
- "hashbrown 0.15.5",
-]
+checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897"

 [[package]]
 name = "lru"
 version = "0.16.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a1dc47f592c06f33f8e3aea9591776ec7c9f9e4124778ff8a3c3b87159f7e593"
+dependencies = [
+ "hashbrown 0.16.1",
+]

 [[package]]
 name = "lru-slab"
@@ -2449,15 +2467,15 @@ dependencies = [

 [[package]]
 name = "memchr"
-version = "2.7.6"
+version = "2.8.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273"
+checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79"

 [[package]]
 name = "memmap2"
-version = "0.9.9"
+version = "0.9.10"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "744133e4a0e0a658e1374cf3bf8e415c4052a15a111acd372764c55b4177d490"
+checksum = "714098028fe011992e1c3962653c96b2d578c4b4bce9036e15ff220319b1e0e3"
 dependencies = [
  "libc",
 ]
@@ -2515,9 +2533,9 @@ dependencies = [

 [[package]]
 name = "mio"
-version = "1.1.0"
+version = "1.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "69d83b0086dc8ecf3ce9ae2874b2d1290252e2a30720bea58a5c6639b0092873"
+checksum = "a69bcab0ad47271a0234d9422b131806bf3968021e5dc9328caf2d4cd58557fc"
 dependencies = [
  "libc",
  "wasi",
@@ -2532,9 +2550,9 @@ checksum = "4e1d4c44418358edcac6e1d9ce59cea7fb38052429c7704033f1196f0c179e6a"

 [[package]]
 name = "mongocrypt"
-version = "0.3.1"
+version = "0.3.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "22426d6318d19c5c0773f783f85375265d6a8f0fa76a733da8dc4355516ec63d"
+checksum = "8da0cd419a51a5fb44819e290fbdb0665a54f21dead8923446a799c7f4d26ad9"
 dependencies = [
  "bson",
  "mongocrypt-sys",
@@ -2544,25 +2562,22 @@ dependencies = [

 [[package]]
 name = "mongocrypt-sys"
-version = "0.1.4+1.12.0"
+version = "0.1.5+1.15.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dda42df21d035f88030aad8e877492fac814680e1d7336a57b2a091b989ae388"
+checksum = "224484c5d09285a7b8cb0a0c117e847ebd14cb6e4470ecf68cdb89c503b0edb9"

 [[package]]
 name = "mongodb"
-version = "3.3.0"
+version = "3.5.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "622f272c59e54a3c85f5902c6b8e7b1653a6b6681f45e4c42d6581301119a4b8"
+checksum = "803dd859e8afa084c255a8effd8000ff86f7c8076a50cd6d8c99e8f3496f75c2"
 dependencies = [
- "async-trait",
- "base64 0.13.1",
- "bitflags 1.3.2",
+ "base64",
+ "bitflags",
  "bson",
- "chrono",
  "derive-where",
- "derive_more 0.99.20",
+ "derive_more",
  "futures-core",
- "futures-executor",
  "futures-io",
  "futures-util",
  "hex",
@@ -2571,10 +2586,9 @@ dependencies = [
  "md-5",
  "mongocrypt",
  "mongodb-internal-macros",
- "once_cell",
  "pbkdf2",
  "percent-encoding",
- "rand 0.8.5",
+ "rand 0.9.2",
  "rustc_version_runtime",
  "rustls",
  "rustversion",
@@ -2583,24 +2597,24 @@ dependencies = [
  "serde_with",
  "sha1",
  "sha2",
- "socket2 0.5.10",
+ "socket2 0.6.2",
  "stringprep",
  "strsim",
  "take_mut",
- "thiserror 1.0.69",
+ "thiserror 2.0.18",
  "tokio",
  "tokio-rustls",
  "tokio-util",
  "typed-builder",
  "uuid",
- "webpki-roots 0.26.11",
+ "webpki-roots",
 ]

 [[package]]
 name = "mongodb-internal-macros"
-version = "3.3.0"
+version = "3.5.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "63981427a0f26b89632fd2574280e069d09fb2912a3138da15de0174d11dd077"
+checksum = "a973ef3dd3dbc6f6e65bbdecfd9ec5e781b9e7493b0f369a7c62e35d8e5ae2c8"
 dependencies = [
  "macro_magic",
  "proc-macro2",
@@ -2623,22 +2637,28 @@ dependencies = [
  "bytes",
  "clap",
  "futures",
- "hyper 1.7.0",
+ "hyper 1.8.1",
  "hyper-util",
  "mimalloc",
  "nativelink-config",
  "nativelink-error",
+ "nativelink-proto",
  "nativelink-scheduler",
  "nativelink-service",
  "nativelink-store",
  "nativelink-util",
  "nativelink-worker",
+ "prost",
+ "prost-types",
  "rand 0.9.2",
  "rustls-pki-types",
+ "sha2",
+ "socket2 0.5.10",
+ "tempfile",
  "tokio",
  "tokio-rustls",
- "tonic 0.13.1",
- "tower 0.5.2",
+ "tonic",
+ "tower",
  "tracing",
 ]

@@ -2672,7 +2692,7 @@ dependencies = [
  "serde",
  "serde_json5",
  "tokio",
- "tonic 0.13.1",
+ "tonic",
  "url",
  "uuid",
  "walkdir",
@@ -2711,12 +2731,14 @@
 name = "nativelink-proto"
 version = "1.0.0-rc2"
 dependencies = [
- "derive_more 2.1.0",
+ "derive_more",
  "prost",
  "prost-build",
  "prost-types",
- "tonic 0.13.1",
+ "tonic",
  "tonic-build",
+ "tonic-prost",
+ "tonic-prost-build",
 ]

 [[package]]
@@ -2739,7 +2761,7 @@ dependencies = [
  "async-trait",
  "bytes",
  "futures",
- "lru 0.16.3",
+ "lru",
  "mock_instant",
  "nativelink-config",
  "nativelink-error",
@@ -2761,7 +2783,7 @@ dependencies = [
  "static_assertions",
  "tokio",
  "tokio-stream",
- "tonic 0.13.1",
+ "tonic",
  "tracing",
  "tracing-test",
  "uuid",
@@ -2778,7 +2800,7 @@ dependencies = [
  "futures",
  "hex",
  "http-body-util",
- "hyper 1.7.0",
+ "hyper 1.8.1",
  "hyper-util",
  "nativelink-config",
  "nativelink-error",
@@ -2800,8 +2822,9 @@ dependencies = [
  "sha2",
  "tokio",
  "tokio-stream",
- "tonic 0.13.1",
- "tower 0.5.2",
+ "tonic",
+ "tonic-prost",
+ "tower",
  "tracing",
  "tracing-test",
  "uuid",
@@ -2818,7 +2841,7 @@ dependencies = [
  "aws-smithy-runtime",
  "aws-smithy-runtime-api",
  "aws-smithy-types",
- "base64 0.22.1",
+ "base64",
  "bincode",
  "blake3",
  "byteorder",
@@ -2828,10 +2851,10 @@ dependencies = [
  "gcloud-auth",
  "gcloud-storage",
  "hex",
- "http 1.3.1",
+ "http 1.4.0",
  "http-body 1.0.1",
  "http-body-util",
- "hyper 1.7.0",
+ "hyper 1.8.1",
  "hyper-rustls",
  "hyper-util",
  "itertools",
@@ -2866,7 +2889,7 @@ dependencies = [
  "tokio",
  "tokio-stream",
  "tokio-util",
- "tonic 0.13.1",
+ "tonic",
  "tracing",
  "tracing-test",
  "url",
@@ -2878,18 +2901,18 @@ name = "nativelink-util"
 version = "1.0.0-rc2"
 dependencies = [
  "async-trait",
- "base64 0.22.1",
- "bitflags 2.10.0",
+ "base64",
+ "bitflags",
  "blake3",
  "bytes",
  "futures",
  "hex",
  "http-body-util",
  "humantime",
- "hyper 1.7.0",
+ "hyper 1.8.1",
  "hyper-util",
  "libc",
- "lru 0.16.3",
+ "lru",
  "mock_instant",
  "nativelink-config",
  "nativelink-error",
@@ -2909,6 +2932,7 @@ dependencies = [
  "prost",
  "prost-types",
  "rand 0.9.2",
+ "rayon",
  "rlimit",
  "serde",
  "serde_json",
@@ -2917,8 +2941,8 @@ dependencies = [
  "tokio",
  "tokio-stream",
  "tokio-util",
- "tonic 0.13.1",
- "tower 0.5.2",
+ "tonic",
+ "tower",
  "tracing",
  "tracing-opentelemetry",
  "tracing-subscriber",
@@ -2936,12 +2960,15 @@ dependencies = [
  "filetime",
  "formatx",
  "futures",
- "hyper 1.7.0",
+ "hostname",
+ "hyper 1.8.1",
+ "libc",
  "nativelink-config",
  "nativelink-error",
  "nativelink-macro",
  "nativelink-metric",
  "nativelink-proto",
+ "nativelink-service",
  "nativelink-store",
  "nativelink-util",
  "opentelemetry",
@@ -2959,7 +2986,8 @@ dependencies = [
  "tempfile",
  "tokio",
  "tokio-stream",
- "tonic 0.13.1",
+ "tonic",
+ "tonic-prost",
  "tracing",
  "tracing-test",
  "uuid",
@@ -3012,9 +3040,9 @@ dependencies = [

 [[package]]
 name = "num-conv"
-version = "0.1.0"
+version = "0.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9"
+checksum = "cf97ec579c3c42f953ef76dbf8d55ac91fb219dde70e49aa4a6b7d74e9919050"

 [[package]]
 name = "num-integer"
@@ -3071,29 +3099,28 @@ checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe"

 [[package]]
 name = "openssl-probe"
-version = "0.1.6"
+version = "0.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d05e27ee213611ffe7d6348b942e8f942b37114c00cc03cec254295a4a17852e"
+checksum = "7c87def4c32ab89d880effc9e097653c8da5d6ef28e6b539d313baaacfbafcbe"

 [[package]]
 name = "opentelemetry"
-version = "0.29.1"
+version = "0.31.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9e87237e2775f74896f9ad219d26a2081751187eb7c9f5c58dde20a23b95d16c"
+checksum = "b84bcd6ae87133e903af7ef497404dda70c60d0ea14895fc8a5e6722754fc2a0"
 dependencies = [
  "futures-core",
  "futures-sink",
  "js-sys",
  "pin-project-lite",
- "thiserror 2.0.17",
- "tracing",
+ "thiserror 2.0.18",
 ]

 [[package]]
 name = "opentelemetry-appender-tracing"
-version = "0.29.1"
+version = "0.31.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e716f864eb23007bdd9dc4aec381e188a1cee28eecf22066772b5fd822b9727d"
+checksum = "ef6a1ac5ca3accf562b8c306fa8483c85f4390f768185ab775f242f7fe8fdcc2"
 dependencies = [
  "opentelemetry",
  "tracing",
@@ -3103,66 +3130,64 @@ dependencies = [

 [[package]]
 name = "opentelemetry-http"
-version = "0.29.0"
+version = "0.31.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "46d7ab32b827b5b495bd90fa95a6cb65ccc293555dcc3199ae2937d2d237c8ed"
+checksum = "d7a6d09a73194e6b66df7c8f1b680f156d916a1a942abf2de06823dd02b7855d"
 dependencies = [
  "async-trait",
  "bytes",
- "http 1.3.1",
+ "http 1.4.0",
  "opentelemetry",
 ]

 [[package]]
 name = "opentelemetry-otlp"
-version = "0.29.0"
+version = "0.31.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d899720fe06916ccba71c01d04ecd77312734e2de3467fd30d9d580c8ce85656"
+checksum = "7a2366db2dca4d2ad033cad11e6ee42844fd727007af5ad04a1730f4cb8163bf"
 dependencies = [
- "futures-core",
- "http 1.3.1",
+ "http 1.4.0",
  "opentelemetry",
  "opentelemetry-proto",
  "opentelemetry_sdk",
  "prost",
- "thiserror 2.0.17",
+ "thiserror 2.0.18",
  "tokio",
- "tonic 0.12.3",
+ "tonic",
 ]

 [[package]]
 name = "opentelemetry-proto"
-version = "0.29.0"
+version = "0.31.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8c40da242381435e18570d5b9d50aca2a4f4f4d8e146231adb4e7768023309b3"
+checksum = "a7175df06de5eaee9909d4805a3d07e28bb752c34cab57fa9cff549da596b30f"
 dependencies = [
  "opentelemetry",
  "opentelemetry_sdk",
  "prost",
- "tonic 0.12.3",
+ "tonic",
+ "tonic-prost",
 ]

 [[package]]
 name = "opentelemetry-semantic-conventions"
-version = "0.29.0"
+version = "0.31.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "84b29a9f89f1a954936d5aa92f19b2feec3c8f3971d3e96206640db7f9706ae3"
+checksum = "e62e29dfe041afb8ed2a6c9737ab57db4907285d999ef8ad3a59092a36bdc846"

 [[package]]
 name = "opentelemetry_sdk"
-version = "0.29.0"
+version = "0.31.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "afdefb21d1d47394abc1ba6c57363ab141be19e27cc70d0e422b7f303e4d290b"
+checksum = "e14ae4f5991976fd48df6d843de219ca6d31b01daaab2dad5af2badeded372bd"
 dependencies = [
  "futures-channel",
  "futures-executor",
  "futures-util",
- "glob",
  "opentelemetry",
  "percent-encoding",
  "rand 0.9.2",
- "serde_json",
- "thiserror 2.0.17",
+ "thiserror 2.0.18",
 ]

 [[package]]
@@ -3219,7 +3244,7 @@ checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1"
 dependencies = [
  "cfg-if",
  "libc",
- "redox_syscall",
+ "redox_syscall 0.5.18",
  "smallvec",
  "windows-link",
 ]
@@ -3230,14 +3255,14 @@ version = "0.9.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "edb45b6331bbdbb54c9a29413703e892ab94f83a31e4a546c778495a91e7fbca"
 dependencies = [
- "bitflags 2.10.0",
+ "bitflags",
 ]

 [[package]]
 name = "pbkdf2"
-version = "0.11.0"
+version = "0.12.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "83a0692ec44e4cf1ef28ca317f14f8f07da2d95ec3fa01f86e4467b725e60917"
+checksum = "f8ed6a7761f76e3b9f92dfb0a60a6a6477c61024b775147ff0973a02653abaf2"
 dependencies = [
  "digest",
 ]
@@ -3248,7 +3273,7 @@ version = "3.0.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1d30c53c26bc5b31a98cd02d20f25a7c8567146caf63ed593a9d87b2775291be"
 dependencies = [
- "base64 0.22.1",
+ "base64",
  "serde_core",
 ]

@@ -3269,9 +3294,9 @@ checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220"

 [[package]]
 name = "pest"
-version = "2.8.3"
+version = "2.8.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "989e7521a040efde50c3ab6bbadafbe15ab6dc042686926be59ac35d74607df4"
+checksum = "e0848c601009d37dfa3430c4666e147e49cdcf1b92ecd3e63657d8a5f19da662"
 dependencies = [
  "memchr",
  "ucd-trie",
@@ -3279,9 +3304,9 @@ dependencies = [

 [[package]]
 name = "pest_derive"
-version = "2.8.3"
+version = "2.8.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "187da9a3030dbafabbbfb20cb323b976dc7b7ce91fcd84f2f74d6e31d378e2de"
+checksum = "11f486f1ea21e6c10ed15d5a7c77165d0ee443402f0780849d1768e7d9d6fe77"
 dependencies = [
  "pest",
  "pest_generator",
@@ -3289,9 +3314,9 @@ dependencies = [

 [[package]]
 name = "pest_generator"
-version = "2.8.3"
+version = "2.8.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "49b401d98f5757ebe97a26085998d6c0eecec4995cad6ab7fc30ffdf4b052843"
+checksum = "8040c4647b13b210a963c1ed407c1ff4fdfa01c31d6d2a098218702e6664f94f"
 dependencies = [
  "pest",
  "pest_meta",
@@ -3302,9 +3327,9 @@ dependencies = [

 [[package]]
 name = "pest_meta"
-version = "2.8.3"
+version = "2.8.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "72f27a2cfee9f9039c4d86faa5af122a0ac3851441a34865b8a043b46be0065a"
+checksum = "89815c69d36021a140146f26659a81d6c2afa33d216d736dd4be5381a7362220"
 dependencies = [
  "pest",
  "sha2",
@@ -3312,12 +3337,13 @@ dependencies = [

 [[package]]
 name = "petgraph"
-version = "0.7.1"
+version = "0.8.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3672b37090dbd86368a4145bc067582552b29c27377cad4e0a306c97f9bd7772"
+checksum = "8701b58ea97060d5e5b155d383a69952a60943f0e6dfe30b04c287beb0b27455"
 dependencies = [
  "fixedbitset",
- "indexmap 2.12.0",
+ "hashbrown 0.15.5",
+ "indexmap",
 ]

 [[package]]
@@ -3381,9 +3407,9 @@ checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c1"

 [[package]]
 name = "potential_utf"
-version = "0.1.3"
+version = "0.1.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "84df19adbe5b5a0782edcab45899906947ab039ccf4573713735ee7de1e6b08a"
+checksum = "b73949432f5e2a09657003c25bca5e19a0e9c84f8058ca374f49e0ebe605af77"
 dependencies = [
  "zerovec",
 ]
@@ -3434,18 +3460,18 @@ dependencies = [

 [[package]]
 name = "proc-macro2"
-version = "1.0.101"
+version = "1.0.106"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "89ae43fd86e4158d6db51ad8e2b80f313af9cc74f5c0e03ccb87de09998732de"
+checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934"
 dependencies = [
  "unicode-ident",
 ]

 [[package]]
 name = "prost"
-version = "0.13.5"
+version = "0.14.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2796faa41db3ec313a31f7624d9286acf277b52de526150b7e69f3debf891ee5"
+checksum = "d2ea70524a2f82d518bce41317d0fae74151505651af45faf1ffbd6fd33f0568"
 dependencies = [
  "bytes",
  "prost-derive",
 ]

 [[package]]
 name = "prost-build"
-version = "0.13.5"
+version = "0.14.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "be769465445e8c1474e9c5dac2018218498557af32d9ed057325ec9a41ae81bf"
+checksum = "343d3bd7056eda839b03204e68deff7d1b13aba7af2b2fd16890697274262ee7"
 dependencies = [
  "heck",
  "itertools",
  "log",
  "multimap",
- "once_cell",
  "petgraph",
  "prettyplease",
  "prost",
@@ -3473,9 +3498,9 @@ dependencies = [

 [[package]]
 name = "prost-derive"
-version = "0.13.5"
+version = "0.14.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8a56d757972c98b346a9b766e3f02746cde6dd1cd1d1d563472929fdd74bec4d"
+checksum = "27c6023962132f4b30eb4c172c91ce92d933da334c59c23cddee82358ddafb0b"
 dependencies = [
  "anyhow",
  "itertools",
@@ -3486,9 +3511,9 @@ dependencies = [

 [[package]]
 name = "prost-types"
-version = "0.13.5"
+version = "0.14.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "52c2c1bf36ddb1a1c396b3601a3cec27c2462e45f07c386894ec3ccf5332bd16"
+checksum =
"8991c4cbdb8bc5b11f0b074ffe286c30e523de90fee5ba8132f1399f23cb3dd7" dependencies = [ "prost", ] @@ -3506,8 +3531,8 @@ dependencies = [ "quinn-udp", "rustc-hash", "rustls", - "socket2 0.6.1", - "thiserror 2.0.17", + "socket2 0.6.2", + "thiserror 2.0.18", "tokio", "tracing", "web-time", @@ -3519,6 +3544,7 @@ version = "0.11.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f1906b49b0c3bc04b5fe5d86a77925ae6524a19b816ae38ce1e426255f1d8a31" dependencies = [ + "aws-lc-rs", "bytes", "getrandom 0.3.4", "lru-slab", @@ -3528,7 +3554,7 @@ dependencies = [ "rustls", "rustls-pki-types", "slab", - "thiserror 2.0.17", + "thiserror 2.0.18", "tinyvec", "tracing", "web-time", @@ -3543,16 +3569,16 @@ dependencies = [ "cfg_aliases", "libc", "once_cell", - "socket2 0.6.1", + "socket2 0.6.2", "tracing", "windows-sys 0.60.2", ] [[package]] name = "quote" -version = "1.0.41" +version = "1.0.44" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce25767e7b499d1b604768e7cde645d14cc8584231ea6b295e9c9eb22c02e1d1" +checksum = "21b2ebcf727b7760c461f091f9f0f539b77b8e87f2fd88131e7f1b433b3cece4" dependencies = [ "proc-macro2", ] @@ -3587,7 +3613,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1" dependencies = [ "rand_chacha 0.9.0", - "rand_core 0.9.3", + "rand_core 0.9.5", ] [[package]] @@ -3607,7 +3633,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" dependencies = [ "ppv-lite86", - "rand_core 0.9.3", + "rand_core 0.9.5", ] [[package]] @@ -3616,23 +3642,43 @@ version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" dependencies = [ - "getrandom 0.2.16", + "getrandom 0.2.17", ] [[package]] name = "rand_core" -version = "0.9.3" +version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38" +checksum = "76afc826de14238e6e8c374ddcc1fa19e374fd8dd986b0d2af0d02377261d83c" dependencies = [ "getrandom 0.3.4", ] +[[package]] +name = "rayon" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "368f01d005bf8fd9b1206fb6fa653e6c4a81ceb1466406b81792d87c5677a58f" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + [[package]] name = "redis" -version = "1.0.0" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "47ba378d39b8053bffbfc2750220f5a24a06189b5129523d5db01618774e0239" +checksum = "dbe7f6e08ce1c6a9b21684e643926f6fc3b683bc006cb89afd72a5e0eb16e3a2" dependencies = [ "ahash", "arc-swap", @@ -3651,7 +3697,7 @@ dependencies = [ "rand 0.9.2", "ryu", "sha1_smol", - "socket2 0.6.1", + "socket2 0.6.2", "tokio", "tokio-util", "url", @@ -3674,14 +3720,14 @@ dependencies = [ [[package]] name = "redis-test" -version = "1.0.0" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7a5cadf877f090eebfef0f4e8646c56531ab416b388410fe1c974f4e6e9cb20" +checksum = 
"5143ae9e73f2ff0f3509af5e3a056b48bac2d1e1caa093257f20a9e68ef7534f" dependencies = [ "futures", "rand 0.9.2", "redis", - "socket2 0.6.1", + "socket2 0.6.2", "tempfile", ] @@ -3691,34 +3737,23 @@ version = "0.5.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" dependencies = [ - "bitflags 2.10.0", + "bitflags", ] [[package]] -name = "ref-cast" -version = "1.0.25" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f354300ae66f76f1c85c5f84693f0ce81d747e2c3f21a45fef496d89c960bf7d" -dependencies = [ - "ref-cast-impl", -] - -[[package]] -name = "ref-cast-impl" -version = "1.0.25" +name = "redox_syscall" +version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b7186006dcb21920990093f30e3dea63b7d6e977bf1256be20c3563a5db070da" +checksum = "6d94dd2f7cd932d4dc02cc8b2b50dfd38bd079a4e5d79198b99743d7fcf9a4b4" dependencies = [ - "proc-macro2", - "quote", - "syn", + "bitflags", ] [[package]] name = "regex" -version = "1.12.2" +version = "1.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "843bc0191f75f3e22651ae5f1e72939ab2f72a4bc30fa80a066bd66edefc24d4" +checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276" dependencies = [ "aho-corasick", "memchr", @@ -3728,9 +3763,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.13" +version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5276caf25ac86c8d810222b3dbb938e512c55c6831a10f3e6ed1c93b84041f1c" +checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f" dependencies = [ "aho-corasick", "memchr", @@ -3739,15 +3774,15 @@ dependencies = [ [[package]] name = "regex-lite" -version = "0.1.8" +version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8d942b98df5e658f56f20d592c7f868833fe38115e65c33003d8cd224b0155da" +checksum = "cab834c73d247e67f4fae452806d17d3c7501756d98c8808d7c9c7aa7d18f973" [[package]] name = "regex-syntax" -version = "0.8.8" +version = "0.8.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58" +checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" [[package]] name = "relative-path" @@ -3760,19 +3795,19 @@ dependencies = [ [[package]] name = "reqwest" -version = "0.12.24" +version = "0.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d0946410b9f7b082a427e4ef5c8ff541a88b357bc6c637c40db3a68ac70a36f" +checksum = "ab3f43e3283ab1488b624b44b0e988d0acea0b3214e694730a055cb6b2efa801" dependencies = [ - "base64 0.22.1", + "base64", "bytes", "encoding_rs", "futures-core", "futures-util", - "http 1.3.1", + "http 1.4.0", "http-body 1.0.1", "http-body-util", - "hyper 1.7.0", + "hyper 1.8.1", "hyper-rustls", "hyper-util", "js-sys", @@ -3784,6 +3819,7 @@ dependencies = [ "quinn", "rustls", "rustls-pki-types", + "rustls-platform-verifier", "serde", "serde_json", "serde_urlencoded", @@ -3791,7 +3827,7 @@ dependencies = [ "tokio", "tokio-rustls", "tokio-util", - "tower 0.5.2", + "tower", "tower-http", "tower-service", "url", @@ -3799,21 +3835,20 @@ dependencies = [ "wasm-bindgen-futures", "wasm-streams", "web-sys", - "webpki-roots 1.0.3", ] [[package]] name = "reqwest-middleware" -version = "0.4.2" +version = "0.5.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "57f17d28a6e6acfe1733fe24bcd30774d13bffa4b8a22535b4c8c98423088d4e" +checksum = "199dda04a536b532d0cc04d7979e39b1c763ea749bf91507017069c00b96056f" dependencies = [ "anyhow", "async-trait", - "http 1.3.1", + "http 1.4.0", "reqwest", "serde", - "thiserror 1.0.69", + "thiserror 2.0.18", "tower-service", ] @@ -3835,7 +3870,7 @@ checksum = "a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7" dependencies = [ "cc", "cfg-if", - "getrandom 0.2.16", + "getrandom 0.2.17", "libc", "untrusted", "windows-sys 0.52.0", @@ -3881,9 +3916,9 @@ dependencies = [ [[package]] name = "rust_decimal" -version = "1.39.0" +version = "1.40.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "35affe401787a9bd846712274d97654355d21b2a2c092a3139aabe31e9022282" +checksum = "61f703d19852dbf87cbc513643fa81428361eb6940f1ac14fd58155d295a3eb0" dependencies = [ "arrayvec", "num-traits", @@ -3916,11 +3951,11 @@ dependencies = [ [[package]] name = "rustix" -version = "1.1.2" +version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd15f8a2c5551a84d56efdc1cd049089e409ac19a3072d5037a17fd70719ff3e" +checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190" dependencies = [ - "bitflags 2.10.0", + "bitflags", "errno", "libc", "linux-raw-sys", @@ -3929,10 +3964,11 @@ dependencies = [ [[package]] name = "rustls" -version = "0.23.34" +version = "0.23.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a9586e9ee2b4f8fab52a0048ca7334d7024eef48e2cb9407e3497bb7cab7fa7" +checksum = "758025cb5fccfd3bc2fd74708fd4682be41d99e5dff73c377c0646c6012c73a4" dependencies = [ + "aws-lc-rs", "log", "once_cell", "ring", @@ -3944,9 +3980,9 @@ dependencies = [ [[package]] name = "rustls-native-certs" -version = "0.8.2" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9980d917ebb0c0536119ba501e90834767bffc3d60641457fd84a1f3fd337923" +checksum = "612460d5f7bea540c490b2b6395d8e34a953e52b491accd6c86c8164c5932a63" dependencies = [ "openssl-probe", "rustls-pki-types", @@ -3956,9 +3992,9 @@ dependencies = [ [[package]] name = "rustls-pki-types" -version = "1.13.1" +version = "1.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "708c0f9d5f54ba0272468c1d306a52c495b31fa155e91bc25371e6df7996908c" +checksum = "be040f8b0a225e40375822a563fa9524378b9d63112f53e19ffff34df5d33fdd" dependencies = [ "web-time", "zeroize", @@ -3993,10 +4029,11 @@ checksum = "f87165f0995f63a9fbeea62b64d10b4d9d8e78ec6d7d51fb2125fda7bb36788f" [[package]] name = "rustls-webpki" -version = "0.103.7" +version = "0.103.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e10b3f4191e8a80e6b43eebabfac91e5dcecebb27a71f04e820c47ec41d314bf" +checksum = "d7df23109aa6c1567d1c575b9952556388da57401e4ace1d15f79eedad0d8f53" dependencies = [ + "aws-lc-rs", "ring", "rustls-pki-types", "untrusted", @@ -4010,9 +4047,9 @@ checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" [[package]] name = "ryu" -version = "1.0.20" +version = "1.0.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" +checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f" [[package]] name = "same-file" @@ -4041,30 +4078,6 @@ dependencies = [ "windows-sys 0.61.2", ] 
-[[package]] -name = "schemars" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4cd191f9397d57d581cddd31014772520aa448f65ef991055d7f61582c65165f" -dependencies = [ - "dyn-clone", - "ref-cast", - "serde", - "serde_json", -] - -[[package]] -name = "schemars" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "82d20c4491bc164fa2f6c5d44565947a52ad80b9505d8e36f8d54c27c739fcd0" -dependencies = [ - "dyn-clone", - "ref-cast", - "serde", - "serde_json", -] - [[package]] name = "scopeguard" version = "1.2.0" @@ -4093,11 +4106,11 @@ dependencies = [ [[package]] name = "security-framework" -version = "3.5.1" +version = "3.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3297343eaf830f66ede390ea39da1d462b6b0c1b000f420d0a83f898bbbe6ef" +checksum = "b7f4bc775c73d9a02cde8bf7b2ec4c9d12743edf609006c7facc23998404cd1d" dependencies = [ - "bitflags 2.10.0", + "bitflags", "core-foundation", "core-foundation-sys", "libc", @@ -4106,9 +4119,9 @@ dependencies = [ [[package]] name = "security-framework-sys" -version = "2.15.0" +version = "2.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc1f0cbffaac4852523ce30d8bd3c5cdc873501d96ff467ca09b6767bb8cd5c0" +checksum = "6ce2691df843ecc5d231c0b14ece2acc3efb62c0a398c7e1d875f3983ce020e3" dependencies = [ "core-foundation-sys", "libc", @@ -4168,16 +4181,16 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.145" +version = "1.0.149" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "402a6f66d8c709116cf22f558eab210f5a50187f702eb4d7e5ef38d9a7f1c79c" +checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86" dependencies = [ - "indexmap 2.12.0", + "indexmap", "itoa", "memchr", - "ryu", "serde", "serde_core", + "zmij", ] [[package]] @@ -4205,28 +4218,19 @@ dependencies = [ [[package]] name = "serde_with" -version = "3.15.1" +version = "3.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa66c845eee442168b2c8134fec70ac50dc20e760769c8ba0ad1319ca1959b04" +checksum = "381b283ce7bc6b476d903296fb59d0d36633652b633b27f64db4fb46dcbfc3b9" dependencies = [ - "base64 0.22.1", - "chrono", - "hex", - "indexmap 1.9.3", - "indexmap 2.12.0", - "schemars 0.9.0", - "schemars 1.0.4", "serde_core", - "serde_json", "serde_with_macros", - "time", ] [[package]] name = "serde_with_macros" -version = "3.15.1" +version = "3.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b91a903660542fced4e99881aa481bdbaec1634568ee02e0b8bd57c64cb38955" +checksum = "a6d4e30573c8cb306ed6ab1dca8423eec9a463ea0e155f45399455e0368b27e0" dependencies = [ "darling", "proc-macro2", @@ -4236,11 +4240,12 @@ dependencies = [ [[package]] name = "serial_test" -version = "3.2.0" +version = "3.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b258109f244e1d6891bf1053a55d63a5cd4f8f4c30cf9a1280989f80e7a1fa9" +checksum = "911bd979bf1070a3f3aa7b691a3b3e9968f339ceeec89e08c280a8a22207a32f" dependencies = [ - "futures", + "futures-executor", + "futures-util", "once_cell", "parking_lot", "scc", @@ -4249,9 +4254,9 @@ dependencies = [ [[package]] name = "serial_test_derive" -version = "3.2.0" +version = "3.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d69265a08751de7844521fd15003ae0a888e035773ba05695c5c759a6f89eef" +checksum = 
"0a7d91949b85b0d2fb687445e448b40d322b6b3e4af6b44a29b21d9a5f33e6d9" dependencies = [ "proc-macro2", "quote", @@ -4284,6 +4289,16 @@ dependencies = [ "cfg-if", "cpufeatures", "digest", + "sha2-asm", +] + +[[package]] +name = "sha2-asm" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b845214d6175804686b2bd482bcffe96651bb2d1200742b712003504a2dac1ab" +dependencies = [ + "cc", ] [[package]] @@ -4297,9 +4312,9 @@ dependencies = [ [[package]] name = "shellexpand" -version = "3.1.1" +version = "3.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b1fdf65dd6331831494dd616b30351c38e96e45921a27745cf98490458b90bb" +checksum = "32824fab5e16e6c4d86dc1ba84489390419a39f97699852b66480bb87d297ed8" [[package]] name = "shlex" @@ -4309,10 +4324,11 @@ checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" [[package]] name = "signal-hook-registry" -version = "1.4.6" +version = "1.4.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2a4719bff48cee6b39d12c020eeb490953ad2443b7055bd0b21fca26bd8c28b" +checksum = "c4db69cba1110affc0e9f7bcd48bbf87b3f4fc7c61fc9155afd4c469eb3d6c1b" dependencies = [ + "errno", "libc", ] @@ -4328,27 +4344,27 @@ dependencies = [ [[package]] name = "simd-adler32" -version = "0.3.7" +version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d66dc143e6b11c1eddc06d5c423cfc97062865baf299914ab64caa38182078fe" +checksum = "e320a6c5ad31d271ad523dcf3ad13e2767ad8b1cb8f047f75a8aeaf8da139da2" [[package]] name = "simple_asn1" -version = "0.6.3" +version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "297f631f50729c8c99b84667867963997ec0b50f32b2a7dbcab828ef0541e8bb" +checksum = "0d585997b0ac10be3c5ee635f1bab02d512760d14b7c468801ac8a01d9ae5f1d" dependencies = [ "num-bigint", "num-traits", - "thiserror 2.0.17", + "thiserror 2.0.18", "time", ] [[package]] name = "slab" -version = "0.4.11" +version = "0.4.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a2ae44ef20feb57a68b23d846850f861394c2e02dc425a50098ae8c90267589" +checksum = "0c790de23124f9ab44544d7ac05d60440adc586479ce501c1d6d7da3cd8c9cf5" [[package]] name = "smallvec" @@ -4368,9 +4384,9 @@ dependencies = [ [[package]] name = "socket2" -version = "0.6.1" +version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "17129e116933cf371d018bb80ae557e889637989d8638274fb25622827b03881" +checksum = "86f4aa3ad99f2088c990dfa82d367e19cb29268ed67c574d10d0a4bfe71f07e0" dependencies = [ "libc", "windows-sys 0.60.2", @@ -4382,6 +4398,12 @@ version = "0.9.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" +[[package]] +name = "spin" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d5fe4ccb98d9c292d56fec89a5e07da7fc4cf0dc11e156b41793132775d3e591" + [[package]] name = "spki" version = "0.7.3" @@ -4429,9 +4451,9 @@ checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" [[package]] name = "syn" -version = "2.0.107" +version = "2.0.117" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a26dbd934e5451d21ef060c018dae56fc073894c5a7896f882928a76e6d081b" +checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99" dependencies = [ "proc-macro2", 
"quote", @@ -4472,12 +4494,12 @@ checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" [[package]] name = "tempfile" -version = "3.23.0" +version = "3.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2d31c77bdf42a745371d260a26ca7163f1e0924b64afa0b688e61b5a9fa02f16" +checksum = "82a72c767771b47409d2345987fda8628641887d5466101319899796367354a0" dependencies = [ "fastrand", - "getrandom 0.3.4", + "getrandom 0.4.1", "once_cell", "rustix", "windows-sys 0.61.2", @@ -4494,11 +4516,11 @@ dependencies = [ [[package]] name = "thiserror" -version = "2.0.17" +version = "2.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f63587ca0f12b72a0600bcba1d40081f830876000bb46dd2337a3051618f4fc8" +checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4" dependencies = [ - "thiserror-impl 2.0.17", + "thiserror-impl 2.0.18", ] [[package]] @@ -4514,9 +4536,9 @@ dependencies = [ [[package]] name = "thiserror-impl" -version = "2.0.17" +version = "2.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ff15c8ecd7de3849db632e14d18d2571fa09dfc5ed93479bc4485c7a517c913" +checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5" dependencies = [ "proc-macro2", "quote", @@ -4534,30 +4556,30 @@ dependencies = [ [[package]] name = "time" -version = "0.3.44" +version = "0.3.47" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91e7d9e3bb61134e77bde20dd4825b97c010155709965fedf0f49bb138e52a9d" +checksum = "743bd48c283afc0388f9b8827b976905fb217ad9e647fae3a379a9283c4def2c" dependencies = [ "deranged", "itoa", "num-conv", "powerfmt", - "serde", + "serde_core", "time-core", "time-macros", ] [[package]] name = "time-core" -version = "0.1.6" +version = "0.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "40868e7c1d2f0b8d73e4a8c7f0ff63af4f6d19be117e90bd73eb1d62cf831c6b" +checksum = "7694e1cfe791f8d31026952abf09c69ca6f6fa4e1a1229e18988f06a04a12dca" [[package]] name = "time-macros" -version = "0.2.24" +version = "0.2.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30cfb0125f12d9c277f35663a0a33f8c30190f4e4574868a330595412d34ebf3" +checksum = "2e70e4c5a0e0a8a4823ad65dfe1a6930e4f4d756dcd9dd7939022b5e8c501215" dependencies = [ "num-conv", "time-core", @@ -4574,9 +4596,9 @@ dependencies = [ [[package]] name = "tinystr" -version = "0.8.1" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d4f6d1145dcb577acf783d4e601bc1d76a13337bb54e6233add580b07344c8b" +checksum = "42d3e9c45c09de15d06dd8acf5f4e0e399e85927b7f00711024eb7ae10fa4869" dependencies = [ "displaydoc", "zerovec", @@ -4608,9 +4630,9 @@ dependencies = [ [[package]] name = "tokio" -version = "1.48.0" +version = "1.49.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff360e02eab121e0bc37a2d3b4d4dc622e6eda3a8e5253d5435ecf5bd4c68408" +checksum = "72a2903cd7736441aac9df9d7688bd0ce48edccaadf181c3b90be801e81d3d86" dependencies = [ "bytes", "libc", @@ -4618,7 +4640,7 @@ dependencies = [ "parking_lot", "pin-project-lite", "signal-hook-registry", - "socket2 0.6.1", + "socket2 0.6.2", "tokio-macros", "windows-sys 0.61.2", ] @@ -4646,9 +4668,9 @@ dependencies = [ [[package]] name = "tokio-stream" -version = "0.1.17" +version = "0.1.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"eca58d7bba4a75707817a2c44174253f9236b2d5fbd055602e9d5c07c139a047" +checksum = "32da49809aab5c3bc678af03902d4ccddea2a87d028d86392a4b1560c6906c70" dependencies = [ "futures-core", "pin-project-lite", @@ -4657,9 +4679,9 @@ dependencies = [ [[package]] name = "tokio-util" -version = "0.7.16" +version = "0.7.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14307c986784f72ef81c89db7d9e28d6ac26d16213b109ea501696195e6e3ce5" +checksum = "9ae9cec805b01e8fc3fd2fe289f89149a9b66dd16786abd8b19cfa7b48cb0098" dependencies = [ "bytes", "futures-core", @@ -4671,106 +4693,85 @@ dependencies = [ [[package]] name = "tonic" -version = "0.12.3" +version = "0.14.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "877c5b330756d856ffcc4553ab34a5684481ade925ecc54bcd1bf02b1d0d4d52" -dependencies = [ - "async-trait", - "base64 0.22.1", - "bytes", - "http 1.3.1", - "http-body 1.0.1", - "http-body-util", - "hyper 1.7.0", - "hyper-timeout", - "hyper-util", - "percent-encoding", - "pin-project", - "prost", - "tokio", - "tokio-stream", - "tower 0.4.13", - "tower-layer", - "tower-service", - "tracing", - "zstd", -] - -[[package]] -name = "tonic" -version = "0.13.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e581ba15a835f4d9ea06c55ab1bd4dce26fc53752c69a04aac00703bfb49ba9" +checksum = "fec7c61a0695dc1887c1b53952990f3ad2e3a31453e1f49f10e75424943a93ec" dependencies = [ "async-trait", "axum", - "base64 0.22.1", + "base64", "bytes", "flate2", - "h2 0.4.12", - "http 1.3.1", + "h2 0.4.13", + "http 1.4.0", "http-body 1.0.1", "http-body-util", - "hyper 1.7.0", + "hyper 1.8.1", "hyper-timeout", "hyper-util", "percent-encoding", "pin-project", - "prost", "rustls-native-certs", - "socket2 0.5.10", + "socket2 0.6.2", + "sync_wrapper", "tokio", "tokio-rustls", "tokio-stream", - "tower 0.5.2", + "tower", "tower-layer", "tower-service", "tracing", + "zstd", ] [[package]] name = "tonic-build" -version = "0.13.1" +version = "0.14.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eac6f67be712d12f0b41328db3137e0d0757645d8904b4cb7d51cd9c2279e847" +checksum = "1882ac3bf5ef12877d7ed57aad87e75154c11931c2ba7e6cde5e22d63522c734" dependencies = [ "prettyplease", "proc-macro2", - "prost-build", - "prost-types", "quote", "syn", ] [[package]] -name = "tower" -version = "0.4.13" +name = "tonic-prost" +version = "0.14.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c" +checksum = "a55376a0bbaa4975a3f10d009ad763d8f4108f067c7c2e74f3001fb49778d309" dependencies = [ - "futures-core", - "futures-util", - "indexmap 1.9.3", - "pin-project", - "pin-project-lite", - "rand 0.8.5", - "slab", - "tokio", - "tokio-util", - "tower-layer", - "tower-service", - "tracing", + "bytes", + "prost", + "tonic", +] + +[[package]] +name = "tonic-prost-build" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3144df636917574672e93d0f56d7edec49f90305749c668df5101751bb8f95a" +dependencies = [ + "prettyplease", + "proc-macro2", + "prost-build", + "prost-types", + "quote", + "syn", + "tempfile", + "tonic-build", ] [[package]] name = "tower" -version = "0.5.2" +version = "0.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d039ad9159c98b70ecfd540b2573b97f7f52c3e8d9f8ad57a24b916a536975f9" +checksum = 
"ebe5ef63511595f1344e2d5cfa636d973292adc0eec1f0ad45fae9f0851ab1d4" dependencies = [ "futures-core", "futures-util", - "indexmap 2.12.0", + "indexmap", "pin-project-lite", "slab", "sync_wrapper", @@ -4783,18 +4784,18 @@ dependencies = [ [[package]] name = "tower-http" -version = "0.6.6" +version = "0.6.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "adc82fd73de2a9722ac5da747f12383d2bfdb93591ee6c58486e0097890f05f2" +checksum = "d4e6559d53cc268e5031cd8429d05415bc4cb4aefc4aa5d6cc35fbf5b924a1f8" dependencies = [ - "bitflags 2.10.0", + "bitflags", "bytes", "futures-util", - "http 1.3.1", + "http 1.4.0", "http-body 1.0.1", "iri-string", "pin-project-lite", - "tower 0.5.2", + "tower", "tower-layer", "tower-service", ] @@ -4813,9 +4814,9 @@ checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3" [[package]] name = "tracing" -version = "0.1.41" +version = "0.1.44" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0" +checksum = "63e71662fa4b2a2c3a26f570f037eb95bb1f85397f3cd8076caed2f026a6d100" dependencies = [ "pin-project-lite", "tracing-attributes", @@ -4824,9 +4825,9 @@ dependencies = [ [[package]] name = "tracing-attributes" -version = "0.1.30" +version = "0.1.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81383ab64e72a7a8b8e13130c49e3dab29def6d0c7d76a03087b3cf71c5c6903" +checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da" dependencies = [ "proc-macro2", "quote", @@ -4835,9 +4836,9 @@ dependencies = [ [[package]] name = "tracing-core" -version = "0.1.34" +version = "0.1.36" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b9d12581f227e93f094d3af2ae690a574abb8a2b9b7a96e7cfe9647b2b617678" +checksum = "db97caf9d906fbde555dd62fa95ddba9eecfd14cb388e4f491a66d74cd5fb79a" dependencies = [ "once_cell", "valuable", @@ -4856,14 +4857,12 @@ dependencies = [ [[package]] name = "tracing-opentelemetry" -version = "0.30.0" +version = "0.32.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd8e764bd6f5813fd8bebc3117875190c5b0415be8f7f8059bffb6ecd979c444" +checksum = "1ac28f2d093c6c477eaa76b23525478f38de514fa9aeb1285738d4b97a9552fc" dependencies = [ "js-sys", - "once_cell", "opentelemetry", - "opentelemetry_sdk", "smallvec", "tracing", "tracing-core", @@ -4883,9 +4882,9 @@ dependencies = [ [[package]] name = "tracing-subscriber" -version = "0.3.20" +version = "0.3.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2054a14f5307d601f88daf0553e1cbf472acc4f2c51afab632431cdcd72124d5" +checksum = "2f30143827ddab0d256fd843b7a66d164e9f271cfa0dde49142c5ca0ca291f1e" dependencies = [ "matchers", "nu-ansi-term", @@ -4904,9 +4903,9 @@ dependencies = [ [[package]] name = "tracing-test" -version = "0.2.5" +version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "557b891436fe0d5e0e363427fc7f217abf9ccd510d5136549847bdcbcd011d68" +checksum = "19a4c448db514d4f24c5ddb9f73f2ee71bfb24c526cf0c570ba142d1119e0051" dependencies = [ "tracing-core", "tracing-subscriber", @@ -4915,9 +4914,9 @@ dependencies = [ [[package]] name = "tracing-test-macro" -version = "0.2.5" +version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04659ddb06c87d233c566112c1c9c5b9e98256d9af50ec3bc9c8327f873a7568" +checksum = 
"ad06847b7afb65c7866a36664b75c40b895e318cea4f71299f013fb22965329d" dependencies = [ "quote", "syn", @@ -4931,18 +4930,18 @@ checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" [[package]] name = "typed-builder" -version = "0.20.1" +version = "0.22.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd9d30e3a08026c78f246b173243cf07b3696d274debd26680773b6773c2afc7" +checksum = "398a3a3c918c96de527dc11e6e846cd549d4508030b8a33e1da12789c856b81a" dependencies = [ "typed-builder-macro", ] [[package]] name = "typed-builder-macro" -version = "0.20.1" +version = "0.22.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c36781cc0e46a83726d9879608e4cf6c2505237e263a8eb8c24502989cfdb28" +checksum = "0e48cea23f68d1f78eb7bc092881b6bb88d3d6b5b7e6234f6f9c911da1ffb221" dependencies = [ "proc-macro2", "quote", @@ -4963,9 +4962,9 @@ checksum = "2896d95c02a80c6d6a5d6e953d479f5ddf2dfdb6a244441010e373ac0fb88971" [[package]] name = "unicase" -version = "2.8.1" +version = "2.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75b844d17643ee918803943289730bec8aac480150456169e647ed0b576ba539" +checksum = "dbc4bc3a9f746d862c45cb89d705aa10f187bb96c76001afab07a0d35ce60142" [[package]] name = "unicode-bidi" @@ -4975,24 +4974,30 @@ checksum = "5c1cb5db39152898a79168971543b1cb5020dff7fe43c8dc468b0885f5e29df5" [[package]] name = "unicode-ident" -version = "1.0.20" +version = "1.0.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "462eeb75aeb73aea900253ce739c8e18a67423fadf006037cd3ff27e82748a06" +checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" [[package]] name = "unicode-normalization" -version = "0.1.24" +version = "0.1.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5033c97c4262335cded6d6fc3e5c18ab755e1a3dc96376350f3d8e9f009ad956" +checksum = "5fd4f6878c9cb28d874b009da9e8d183b5abc80117c40bbd187a1fde336be6e8" dependencies = [ "tinyvec", ] [[package]] name = "unicode-properties" -version = "0.1.3" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7df058c713841ad818f1dc5d3fd88063241cc61f49f5fbea4b951e8cf5a8d71d" + +[[package]] +name = "unicode-segmentation" +version = "1.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e70f2a8b45122e719eb623c01822704c4e0907e7e426a05927e1a1cfff5b75d0" +checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493" [[package]] name = "unicode-xid" @@ -5014,9 +5019,9 @@ checksum = "6d49784317cd0d1ee7ec5c716dd598ec5b4483ea832a2dced265471cc0f690ae" [[package]] name = "url" -version = "2.5.7" +version = "2.5.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08bc136a29a3d1758e07a9cca267be308aeebf5cfd5a10f3f67ab2097683ef5b" +checksum = "ff67a8a4397373c3ef660812acab3268222035010ab8680ec4215f38ba3d0eed" dependencies = [ "form_urlencoded", "idna", @@ -5032,9 +5037,9 @@ checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da" [[package]] name = "utf8-width" -version = "0.1.7" +version = "0.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86bd8d4e895da8537e5315b8254664e6b769c4ff3db18321b297a1e7004392e3" +checksum = "1292c0d970b54115d14f2492fe0170adf21d68a1de108eebc51c1df4f346a091" [[package]] name = "utf8_iter" @@ -5050,14 +5055,14 @@ checksum = 
"06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" [[package]] name = "uuid" -version = "1.18.1" +version = "1.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f87b8aa10b915a06587d0dec516c282ff295b475d94abf425d62b57710070a2" +checksum = "b672338555252d43fd2240c714dc444b8c6fb0a5c5335e65a07bba7742735ddb" dependencies = [ "atomic", - "getrandom 0.3.4", + "getrandom 0.4.1", "js-sys", - "serde", + "serde_core", "wasm-bindgen", ] @@ -5106,47 +5111,43 @@ checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" [[package]] name = "wasip2" -version = "1.0.1+wasi-0.2.4" +version = "1.0.2+wasi-0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0562428422c63773dad2c345a1882263bbf4d65cf3f42e90921f787ef5ad58e7" +checksum = "9517f9239f02c069db75e65f174b3da828fe5f5b945c4dd26bd25d89c03ebcf5" dependencies = [ "wit-bindgen", ] [[package]] -name = "wasm-bindgen" -version = "0.2.104" +name = "wasip3" +version = "0.4.0+wasi-0.3.0-rc-2026-01-06" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1da10c01ae9f1ae40cbfac0bac3b1e724b320abfcf52229f80b547c0d250e2d" +checksum = "5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5" dependencies = [ - "cfg-if", - "once_cell", - "rustversion", - "wasm-bindgen-macro", - "wasm-bindgen-shared", + "wit-bindgen", ] [[package]] -name = "wasm-bindgen-backend" -version = "0.2.104" +name = "wasm-bindgen" +version = "0.2.113" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "671c9a5a66f49d8a47345ab942e2cb93c7d1d0339065d4f8139c486121b43b19" +checksum = "60722a937f594b7fde9adb894d7c092fc1bb6612897c46368d18e7a20208eff2" dependencies = [ - "bumpalo", - "log", - "proc-macro2", - "quote", - "syn", + "cfg-if", + "once_cell", + "rustversion", + "wasm-bindgen-macro", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-futures" -version = "0.4.54" +version = "0.4.63" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e038d41e478cc73bae0ff9b36c60cff1c98b8f38f8d7e8061e79ee63608ac5c" +checksum = "8a89f4650b770e4521aa6573724e2aed4704372151bd0de9d16a3bbabb87441a" dependencies = [ "cfg-if", + "futures-util", "js-sys", "once_cell", "wasm-bindgen", @@ -5155,9 +5156,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.104" +version = "0.2.113" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ca60477e4c59f5f2986c50191cd972e3a50d8a95603bc9434501cf156a9a119" +checksum = "0fac8c6395094b6b91c4af293f4c79371c163f9a6f56184d2c9a85f5a95f3950" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -5165,31 +5166,53 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.104" +version = "0.2.113" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9f07d2f20d4da7b26400c9f4a0511e6e0345b040694e8a75bd41d578fa4421d7" +checksum = "ab3fabce6159dc20728033842636887e4877688ae94382766e00b180abac9d60" dependencies = [ + "bumpalo", "proc-macro2", "quote", "syn", - "wasm-bindgen-backend", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" -version = "0.2.104" +version = "0.2.113" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bad67dc8b2a1a6e5448428adec4c3e84c43e561d8c9ee8a9e5aabeb193ec41d1" +checksum = "de0e091bdb824da87dc01d967388880d017a0a9bc4f3bdc0d86ee9f9336e3bb5" dependencies = [ "unicode-ident", ] +[[package]] +name = 
"wasm-encoder" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "990065f2fe63003fe337b932cfb5e3b80e0b4d0f5ff650e6985b1048f62c8319" +dependencies = [ + "leb128fmt", + "wasmparser", +] + +[[package]] +name = "wasm-metadata" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909" +dependencies = [ + "anyhow", + "indexmap", + "wasm-encoder", + "wasmparser", +] + [[package]] name = "wasm-streams" -version = "0.4.2" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "15053d8d85c7eccdbefef60f06769760a563c7f0a9d6902a13d35c7800b0ad65" +checksum = "9d1ec4f6517c9e11ae630e200b2b65d193279042e28edd4a2cda233e46670bbb" dependencies = [ "futures-util", "js-sys", @@ -5198,11 +5221,23 @@ dependencies = [ "web-sys", ] +[[package]] +name = "wasmparser" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe" +dependencies = [ + "bitflags", + "hashbrown 0.15.5", + "indexmap", + "semver", +] + [[package]] name = "web-sys" -version = "0.3.81" +version = "0.3.90" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9367c417a924a74cae129e6a2ae3b47fabb1f8995595ab474029da749a8be120" +checksum = "705eceb4ce901230f8625bd1d665128056ccbe4b7408faa625eec1ba80f59a97" dependencies = [ "js-sys", "wasm-bindgen", @@ -5220,27 +5255,18 @@ dependencies = [ [[package]] name = "webpki-root-certs" -version = "1.0.3" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05d651ec480de84b762e7be71e6efa7461699c19d9e2c272c8d93455f567786e" +checksum = "804f18a4ac2676ffb4e8b5b5fa9ae38af06df08162314f96a68d2a363e21a8ca" dependencies = [ "rustls-pki-types", ] [[package]] name = "webpki-roots" -version = "0.26.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "521bc38abb08001b01866da9f51eb7c5d647a19260e00054a8c7fd5f9e57f7a9" -dependencies = [ - "webpki-roots 1.0.3", -] - -[[package]] -name = "webpki-roots" -version = "1.0.3" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32b130c0d2d49f8b6889abc456e795e82525204f27c42cf767cf0d7734e089b8" +checksum = "22cfaf3c063993ff62e73cb4311efde4db1efb31ab78a3e5c457939ad5cc0bed" dependencies = [ "rustls-pki-types", ] @@ -5254,65 +5280,12 @@ dependencies = [ "windows-sys 0.61.2", ] -[[package]] -name = "windows-core" -version = "0.62.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8e83a14d34d0623b51dce9581199302a221863196a1dde71a7663a4c2be9deb" -dependencies = [ - "windows-implement", - "windows-interface", - "windows-link", - "windows-result", - "windows-strings", -] - -[[package]] -name = "windows-implement" -version = "0.60.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "windows-interface" -version = "0.59.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - [[package]] name = "windows-link" version = "0.2.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" -[[package]] -name = "windows-result" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7781fa89eaf60850ac3d2da7af8e5242a5ea78d1a11c49bf2910bb5a73853eb5" -dependencies = [ - "windows-link", -] - -[[package]] -name = "windows-strings" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7837d08f69c77cf6b07689544538e017c1bfcf57e34b4c0ff58e6c2cd3b37091" -dependencies = [ - "windows-link", -] - [[package]] name = "windows-sys" version = "0.45.0" @@ -5331,15 +5304,6 @@ dependencies = [ "windows-targets 0.52.6", ] -[[package]] -name = "windows-sys" -version = "0.59.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" -dependencies = [ - "windows-targets 0.52.6", -] - [[package]] name = "windows-sys" version = "0.60.2" @@ -5546,15 +5510,97 @@ checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650" [[package]] name = "wit-bindgen" -version = "0.46.0" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" +dependencies = [ + "wit-bindgen-rust-macro", +] + +[[package]] +name = "wit-bindgen-core" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc" +dependencies = [ + "anyhow", + "heck", + "wit-parser", +] + +[[package]] +name = "wit-bindgen-rust" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21" +dependencies = [ + "anyhow", + "heck", + "indexmap", + "prettyplease", + "syn", + "wasm-metadata", + "wit-bindgen-core", + "wit-component", +] + +[[package]] +name = "wit-bindgen-rust-macro" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c0f9bfd77e6a48eccf51359e3ae77140a7f50b1e2ebfe62422d8afdaffab17a" +dependencies = [ + "anyhow", + "prettyplease", + "proc-macro2", + "quote", + "syn", + "wit-bindgen-core", + "wit-bindgen-rust", +] + +[[package]] +name = "wit-component" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2" +dependencies = [ + "anyhow", + "bitflags", + "indexmap", + "log", + "serde", + "serde_derive", + "serde_json", + "wasm-encoder", + "wasm-metadata", + "wasmparser", + "wit-parser", +] + +[[package]] +name = "wit-parser" +version = "0.244.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f17a85883d4e6d00e8a97c586de764dabcc06133f7f1d55dce5cdc070ad7fe59" +checksum = "ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736" +dependencies = [ + "anyhow", + "id-arena", + "indexmap", + "log", + "semver", + "serde", + "serde_derive", + "serde_json", + "unicode-xid", + "wasmparser", +] [[package]] name = "writeable" -version = "0.6.1" +version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea2f10b9bb0928dfb1b42b65e1f9e36f7f54dbdf08457afefb38afcdec4fa2bb" +checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9" [[package]] 
name = "wyz" @@ -5585,11 +5631,10 @@ checksum = "cfe53a6657fd280eaa890a3bc59152892ffa3e30101319d168b781ed6529b049" [[package]] name = "yoke" -version = "0.8.0" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f41bb01b8226ef4bfd589436a297c53d118f65921786300e427be8d487695cc" +checksum = "72d6e5c6afb84d73944e5cedb052c4680d5657337201555f9f2a16b7406d4954" dependencies = [ - "serde", "stable_deref_trait", "yoke-derive", "zerofrom", @@ -5597,9 +5642,9 @@ dependencies = [ [[package]] name = "yoke-derive" -version = "0.8.0" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38da3c9736e16c5d3c8c597a9aaa5d1fa565d0532ae05e27c24aa62fb32c0ab6" +checksum = "b659052874eb698efe5b9e8cf382204678a0086ebf46982b79d6ca3182927e5d" dependencies = [ "proc-macro2", "quote", @@ -5609,18 +5654,18 @@ dependencies = [ [[package]] name = "zerocopy" -version = "0.8.27" +version = "0.8.39" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0894878a5fa3edfd6da3f88c4805f4c8558e2b996227a3d864f47fe11e38282c" +checksum = "db6d35d663eadb6c932438e763b262fe1a70987f9ae936e60158176d710cae4a" dependencies = [ "zerocopy-derive", ] [[package]] name = "zerocopy-derive" -version = "0.8.27" +version = "0.8.39" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "88d2b8d9c68ad2b9e4340d7832716a4d21a22a1154777ad56ea55c51a9cf3831" +checksum = "4122cd3169e94605190e77839c9a40d40ed048d305bfdc146e7df40ab0f3e517" dependencies = [ "proc-macro2", "quote", @@ -5656,9 +5701,9 @@ checksum = "b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0" [[package]] name = "zerotrie" -version = "0.2.2" +version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "36f0bbd478583f79edad978b407914f61b2972f5af6fa089686016be8f9af595" +checksum = "2a59c17a5562d507e4b54960e8569ebee33bee890c70aa3fe7b97e85a9fd7851" dependencies = [ "displaydoc", "yoke", @@ -5667,9 +5712,9 @@ dependencies = [ [[package]] name = "zerovec" -version = "0.11.4" +version = "0.11.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7aa2bd55086f1ab526693ecbe444205da57e25f4489879da80635a46d90e73b" +checksum = "6c28719294829477f525be0186d13efa9a3c602f7ec202ca9e353d310fb9a002" dependencies = [ "yoke", "zerofrom", @@ -5678,15 +5723,21 @@ dependencies = [ [[package]] name = "zerovec-derive" -version = "0.11.1" +version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b96237efa0c878c64bd89c436f661be4e46b2f3eff1ebb976f7ef2321d2f58f" +checksum = "eadce39539ca5cb3985590102671f2567e659fca9666581ad3411d59207951f3" dependencies = [ "proc-macro2", "quote", "syn", ] +[[package]] +name = "zmij" +version = "1.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" + [[package]] name = "zstd" version = "0.13.3" diff --git a/Cargo.toml b/Cargo.toml index a94f54aee..e61093753 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,7 +13,8 @@ rust-version = "1.87.0" version = "1.0.0-rc2" [profile.release] -lto = true +lto = "thin" +codegen-units = 16 # Prefer this profile in CI, for instance via `cargo test --all --profile=smol`. # It reduces the size of the `target` directory from ~12GB to ~1GB. 
@@ -55,42 +56,61 @@ hyper = { version = "1.6.0", default-features = false } hyper-util = { version = "0.1.11", default-features = false, features = [ "tracing", ] } -mimalloc = { version = "0.1.44", default-features = false } +mimalloc = { version = "0.1.44", default-features = false, features = ["override", "v3"] } rand = { version = "0.9.0", default-features = false, features = [ "thread_rng", ] } rustls-pki-types = { version = "1.13.1", features = [ "std", ], default-features = false } +socket2 = { version = "0.5.10", default-features = false } tokio = { version = "1.44.1", features = [ "fs", "io-util", + "parking_lot", "rt-multi-thread", "signal", ], default-features = false } tokio-rustls = { version = "0.26.2", default-features = false, features = [ - "ring", + "aws_lc_rs", ] } -tonic = { version = "0.13.0", features = [ - "tls-ring", +tonic = { version = "0.14.5", features = [ + "gzip", + "tls-aws-lc", "transport", + "zstd", ], default-features = false } tower = { version = "0.5.2", default-features = false } tracing = { version = "0.1.41", default-features = false } -[workspace.cargo-features-manager.keep] +[dev-dependencies] +nativelink-proto = { path = "nativelink-proto" } +prost = { version = "0.14.3", default-features = false } +prost-types = { version = "0.14.3", default-features = false } +sha2 = { version = "0.10.8", default-features = false, features = ["asm"] } +tempfile = { version = "3.15.0", default-features = false } +tokio = { version = "1.44.1", features = [ + "macros", + "rt-multi-thread", + "time", +], default-features = false } +tonic = { version = "0.14.5", features = [ + "transport", +], default-features = false } + +[workspace.metadata.cargo-features-manager.keep] async-lock = ["std"] aws-sdk-s3 = ["rt-tokio"] aws-smithy-runtime = ["test-util"] # This causes blake3 to detect SIMD capabilities at runtime. -blake3 = ["std"] +blake3 = ["std", "rayon"] pretty_assertions = ["std"] redis-test = ["aio"] serial_test = ["async"] -tokio = ["fs", "io-util", "rt-multi-thread", "signal"] +tokio = ["fs", "io-util", "parking_lot", "rt-multi-thread", "signal"] tokio-stream = ["fs"] -tonic = ["tls", "transport"] -tonic-build = ["prost"] +tonic = ["gzip", "tls", "transport", "zstd"] +tonic-build = [] uuid = ["serde", "v4"] [workspace.lints.rust] @@ -193,3 +213,10 @@ ref_option = { level = "allow", priority = 1 } too_many_lines = { level = "allow", priority = 1 } unused_async = { level = "allow", priority = 1 } unused_self = { level = "allow", priority = 1 } + +# Pin gcloud crates to unreleased main branch for reqwest 0.13 support. +# Remove once gcloud-storage 1.3+ is published to crates.io. 
+[patch.crates-io] +gcloud-storage = { git = "https://github.com/yoshidan/google-cloud-rust", rev = "e0e790b9d4de1fbd7085dc98fde21eaf9573899a" } +gcloud-auth = { git = "https://github.com/yoshidan/google-cloud-rust", rev = "e0e790b9d4de1fbd7085dc98fde21eaf9573899a" } +gcloud-metadata = { git = "https://github.com/yoshidan/google-cloud-rust", rev = "e0e790b9d4de1fbd7085dc98fde21eaf9573899a" } diff --git a/deployment-examples/docker-compose/docker-compose-multi-worker.yml b/deployment-examples/docker-compose/docker-compose-multi-worker.yml index 80f13baa2..7ad1ed558 100644 --- a/deployment-examples/docker-compose/docker-compose-multi-worker.yml +++ b/deployment-examples/docker-compose/docker-compose-multi-worker.yml @@ -53,6 +53,8 @@ services: - cas-data:/data/cas # Shared CAS volume - worker1-data:/data/worker1 - ./worker-shared-cas.json5:/nativelink-config.json5 + ports: + - "50181:50081" # Peer CAS endpoint for blob sharing environment: - RUST_LOG=info - SCHEDULER_ENDPOINT=scheduler @@ -78,6 +80,8 @@ services: - cas-data:/data/cas # Shared CAS volume - worker2-data:/data/worker2 - ./worker-shared-cas.json5:/nativelink-config.json5 + ports: + - "50182:50081" # Peer CAS endpoint for blob sharing environment: - RUST_LOG=info - SCHEDULER_ENDPOINT=scheduler @@ -103,6 +107,8 @@ services: - cas-data:/data/cas # Shared CAS volume - worker3-data:/data/worker3 - ./worker-shared-cas.json5:/nativelink-config.json5 + ports: + - "50183:50081" # Peer CAS endpoint for blob sharing environment: - RUST_LOG=info - SCHEDULER_ENDPOINT=scheduler diff --git a/deployment-examples/docker-compose/docker-compose.yml b/deployment-examples/docker-compose/docker-compose.yml index f2cc124fb..b2b33da2f 100644 --- a/deployment-examples/docker-compose/docker-compose.yml +++ b/deployment-examples/docker-compose/docker-compose.yml @@ -70,6 +70,7 @@ services: RUST_LOG: ${RUST_LOG:-warn} CAS_ENDPOINT: nativelink_local_cas SCHEDULER_ENDPOINT: nativelink_scheduler + ports: [ "50081:50081/tcp" ] command: | nativelink /root/worker.json5 depends_on: diff --git a/deployment-examples/docker-compose/scheduler-multi-worker.json5 b/deployment-examples/docker-compose/scheduler-multi-worker.json5 index 18a28333f..a47deccc8 100644 --- a/deployment-examples/docker-compose/scheduler-multi-worker.json5 +++ b/deployment-examples/docker-compose/scheduler-multi-worker.json5 @@ -40,6 +40,8 @@ "lre-rs": "priority", ISA: "exact", }, + // Enable locality-aware scheduling by pointing at the CAS store. + cas_store: "GRPC_LOCAL_STORE", }, }, ], diff --git a/deployment-examples/docker-compose/scheduler.json5 b/deployment-examples/docker-compose/scheduler.json5 index 18a28333f..11e1f2588 100644 --- a/deployment-examples/docker-compose/scheduler.json5 +++ b/deployment-examples/docker-compose/scheduler.json5 @@ -40,6 +40,10 @@ "lre-rs": "priority", ISA: "exact", }, + // Enable locality-aware scheduling by pointing at the CAS store. + // The scheduler will resolve input trees and score workers by + // how many input bytes they already have cached. 
+ cas_store: "GRPC_LOCAL_STORE", }, }, ], diff --git a/deployment-examples/docker-compose/test-multi-worker-simple.json5 b/deployment-examples/docker-compose/test-multi-worker-simple.json5 index 407a520eb..53e876209 100644 --- a/deployment-examples/docker-compose/test-multi-worker-simple.json5 +++ b/deployment-examples/docker-compose/test-multi-worker-simple.json5 @@ -52,6 +52,8 @@ supported_platform_properties: { cpu_count: "minimum", }, + // Enable locality-aware scheduling by pointing at the CAS store. + cas_store: "CAS", }, }, ], @@ -63,6 +65,8 @@ uri: "grpc://127.0.0.1:50061", }, cas_fast_slow_store: "CAS", + // Expose a CAS server for peer-to-peer blob sharing. + cas_server_port: 50081, upload_action_result: { ac_store: "AC", }, @@ -83,6 +87,7 @@ uri: "grpc://127.0.0.1:50061", }, cas_fast_slow_store: "CAS", + cas_server_port: 50082, upload_action_result: { ac_store: "AC", }, @@ -103,6 +108,7 @@ uri: "grpc://127.0.0.1:50061", }, cas_fast_slow_store: "CAS", + cas_server_port: 50083, upload_action_result: { ac_store: "AC", }, diff --git a/deployment-examples/docker-compose/worker-shared-cas.json5 b/deployment-examples/docker-compose/worker-shared-cas.json5 index 1198cde34..5c5a590b8 100644 --- a/deployment-examples/docker-compose/worker-shared-cas.json5 +++ b/deployment-examples/docker-compose/worker-shared-cas.json5 @@ -56,6 +56,9 @@ uri: "grpc://${SCHEDULER_ENDPOINT:-127.0.0.1}:50061", }, cas_fast_slow_store: "WORKER_FAST_SLOW_STORE", + // Expose a CAS server endpoint so other workers can fetch blobs + // directly from this worker (peer-to-peer blob sharing). + cas_server_port: 50081, upload_action_result: { ac_store: "GRPC_LOCAL_AC_STORE", }, diff --git a/deployment-examples/docker-compose/worker.json5 b/deployment-examples/docker-compose/worker.json5 index fd2aac594..414bc75a8 100644 --- a/deployment-examples/docker-compose/worker.json5 +++ b/deployment-examples/docker-compose/worker.json5 @@ -57,6 +57,9 @@ uri: "grpc://${SCHEDULER_ENDPOINT:-127.0.0.1}:50061", }, cas_fast_slow_store: "WORKER_FAST_SLOW_STORE", + // Expose a CAS server endpoint so other workers can fetch blobs + // directly from this worker (peer-to-peer blob sharing). + cas_server_port: 50081, upload_action_result: { ac_store: "GRPC_LOCAL_AC_STORE", }, diff --git a/integration_tests/buck2/buck2_cas.json5 b/integration_tests/buck2/buck2_cas.json5 index 963c6107e..5e27e510e 100644 --- a/integration_tests/buck2/buck2_cas.json5 +++ b/integration_tests/buck2/buck2_cas.json5 @@ -59,6 +59,8 @@ "lre-rs": "priority", ISA: "exact", }, + // Enable locality-aware scheduling. + cas_store: "WORKER_FAST_SLOW_STORE", }, }, ], @@ -69,6 +71,8 @@ uri: "grpc://127.0.0.1:50061", }, cas_fast_slow_store: "WORKER_FAST_SLOW_STORE", + // Expose a CAS server for peer-to-peer blob sharing. + cas_server_port: 50081, upload_action_result: { ac_store: "AC_MAIN_STORE", }, diff --git a/integration_tests/buildstream/buildstream_cas.json5 b/integration_tests/buildstream/buildstream_cas.json5 index 591d4df43..6c52482fc 100644 --- a/integration_tests/buildstream/buildstream_cas.json5 +++ b/integration_tests/buildstream/buildstream_cas.json5 @@ -61,6 +61,8 @@ "lre-rs": "priority", ISA: "exact", }, + // Enable locality-aware scheduling. + cas_store: "WORKER_FAST_SLOW_STORE", }, }, ], @@ -71,6 +73,8 @@ uri: "grpc://127.0.0.1:50061", }, cas_fast_slow_store: "WORKER_FAST_SLOW_STORE", + // Expose a CAS server for peer-to-peer blob sharing. 
+ cas_server_port: 50081, upload_action_result: { ac_store: "AC_MAIN_STORE", }, diff --git a/integration_tests/mongo/mongo.json5 b/integration_tests/mongo/mongo.json5 index 80e11d494..13e96880a 100644 --- a/integration_tests/mongo/mongo.json5 +++ b/integration_tests/mongo/mongo.json5 @@ -74,6 +74,8 @@ }, max_job_retries: 3, worker_timeout_s: 300, + // Enable locality-aware scheduling. + cas_store: "PRODUCTION_CAS", }, }, ], diff --git a/kubernetes/components/worker/worker.json5 b/kubernetes/components/worker/worker.json5 index d68c57d55..ca12bfefb 100644 --- a/kubernetes/components/worker/worker.json5 +++ b/kubernetes/components/worker/worker.json5 @@ -56,6 +56,8 @@ uri: "grpc://${NATIVELINK_ENDPOINT:-127.0.0.1}:50061", }, cas_fast_slow_store: "WORKER_FAST_SLOW_STORE", + // Expose a CAS server for peer-to-peer blob sharing. + cas_server_port: 50081, upload_action_result: { ac_store: "GRPC_LOCAL_AC_STORE", }, diff --git a/kubernetes/nativelink/nativelink-config.json5 b/kubernetes/nativelink/nativelink-config.json5 index 630d1505f..d95892291 100644 --- a/kubernetes/nativelink/nativelink-config.json5 +++ b/kubernetes/nativelink/nativelink-config.json5 @@ -117,6 +117,8 @@ "lre-rs": "priority", ISA: "exact", }, + // Enable locality-aware scheduling. + cas_store: "CAS_MAIN_STORE", }, }, ], diff --git a/local-remote-execution/rust/aarch64-darwin.BUILD.bazel b/local-remote-execution/rust/aarch64-darwin.BUILD.bazel index ac97014eb..a4098069c 100644 --- a/local-remote-execution/rust/aarch64-darwin.BUILD.bazel +++ b/local-remote-execution/rust/aarch64-darwin.BUILD.bazel @@ -43,42 +43,42 @@ filegroup( "bin/*.so", "lib/*.so", "lib/rustlib/aarch64-apple-darwin/codegen-backends/*.so", - "lib/rustlib/aarch64-apple-darwin/bin/rust-lld", + "lib/rustlib/aarch64-apple-darwin/bin/**", "lib/rustlib/aarch64-apple-darwin/lib/*.so", ], allow_empty = True), "@local-remote-execution//rust/triple:aarch64-unknown-linux-gnu": glob([ "bin/*.so", "lib/*.so", "lib/rustlib/aarch64-unknown-linux-gnu/codegen-backends/*.so", - "lib/rustlib/aarch64-unknown-linux-gnu/bin/rust-lld", + "lib/rustlib/aarch64-unknown-linux-gnu/bin/**", "lib/rustlib/aarch64-unknown-linux-gnu/lib/*.so", ], allow_empty = True), "@local-remote-execution//rust/triple:aarch64-unknown-linux-musl": glob([ "bin/*.so", "lib/*.so", "lib/rustlib/aarch64-unknown-linux-musl/codegen-backends/*.so", - "lib/rustlib/aarch64-unknown-linux-musl/bin/rust-lld", + "lib/rustlib/aarch64-unknown-linux-musl/bin/**", "lib/rustlib/aarch64-unknown-linux-musl/lib/*.so", ], allow_empty = True), "@local-remote-execution//rust/triple:x86_64-apple-darwin": glob([ "bin/*.so", "lib/*.so", "lib/rustlib/x86_64-apple-darwin/codegen-backends/*.so", - "lib/rustlib/x86_64-apple-darwin/bin/rust-lld", + "lib/rustlib/x86_64-apple-darwin/bin/**", "lib/rustlib/x86_64-apple-darwin/lib/*.so", ], allow_empty = True), "@local-remote-execution//rust/triple:x86_64-unknown-linux-gnu": glob([ "bin/*.so", "lib/*.so", "lib/rustlib/x86_64-unknown-linux-gnu/codegen-backends/*.so", - "lib/rustlib/x86_64-unknown-linux-gnu/bin/rust-lld", + "lib/rustlib/x86_64-unknown-linux-gnu/bin/**", "lib/rustlib/x86_64-unknown-linux-gnu/lib/*.so", ], allow_empty = True), "@local-remote-execution//rust/triple:x86_64-unknown-linux-musl": glob([ "bin/*.so", "lib/*.so", "lib/rustlib/x86_64-unknown-linux-musl/codegen-backends/*.so", - "lib/rustlib/x86_64-unknown-linux-musl/bin/rust-lld", + "lib/rustlib/x86_64-unknown-linux-musl/bin/**", "lib/rustlib/x86_64-unknown-linux-musl/lib/*.so", ], allow_empty = True), }), diff --git 
a/local-remote-execution/rust/aarch64-linux.BUILD.bazel b/local-remote-execution/rust/aarch64-linux.BUILD.bazel index 54f9171d7..a69b7264b 100644 --- a/local-remote-execution/rust/aarch64-linux.BUILD.bazel +++ b/local-remote-execution/rust/aarch64-linux.BUILD.bazel @@ -43,28 +43,28 @@ filegroup( "bin/*.so", "lib/*.so", "lib/rustlib/aarch64-unknown-linux-gnu/codegen-backends/*.so", - "lib/rustlib/aarch64-unknown-linux-gnu/bin/rust-lld", + "lib/rustlib/aarch64-unknown-linux-gnu/bin/**", "lib/rustlib/aarch64-unknown-linux-gnu/lib/*.so", ], allow_empty = True), "@local-remote-execution//rust/triple:aarch64-unknown-linux-musl": glob([ "bin/*.so", "lib/*.so", "lib/rustlib/aarch64-unknown-linux-musl/codegen-backends/*.so", - "lib/rustlib/aarch64-unknown-linux-musl/bin/rust-lld", + "lib/rustlib/aarch64-unknown-linux-musl/bin/**", "lib/rustlib/aarch64-unknown-linux-musl/lib/*.so", ], allow_empty = True), "@local-remote-execution//rust/triple:x86_64-unknown-linux-gnu": glob([ "bin/*.so", "lib/*.so", "lib/rustlib/x86_64-unknown-linux-gnu/codegen-backends/*.so", - "lib/rustlib/x86_64-unknown-linux-gnu/bin/rust-lld", + "lib/rustlib/x86_64-unknown-linux-gnu/bin/**", "lib/rustlib/x86_64-unknown-linux-gnu/lib/*.so", ], allow_empty = True), "@local-remote-execution//rust/triple:x86_64-unknown-linux-musl": glob([ "bin/*.so", "lib/*.so", "lib/rustlib/x86_64-unknown-linux-musl/codegen-backends/*.so", - "lib/rustlib/x86_64-unknown-linux-musl/bin/rust-lld", + "lib/rustlib/x86_64-unknown-linux-musl/bin/**", "lib/rustlib/x86_64-unknown-linux-musl/lib/*.so", ], allow_empty = True), }), diff --git a/local-remote-execution/rust/x86_64-darwin.BUILD.bazel b/local-remote-execution/rust/x86_64-darwin.BUILD.bazel index fcff515c0..27c2130b4 100644 --- a/local-remote-execution/rust/x86_64-darwin.BUILD.bazel +++ b/local-remote-execution/rust/x86_64-darwin.BUILD.bazel @@ -43,42 +43,42 @@ filegroup( "bin/*.so", "lib/*.so", "lib/rustlib/aarch64-apple-darwin/codegen-backends/*.so", - "lib/rustlib/aarch64-apple-darwin/bin/rust-lld", + "lib/rustlib/aarch64-apple-darwin/bin/**", "lib/rustlib/aarch64-apple-darwin/lib/*.so", ], allow_empty = True), "@local-remote-execution//rust/triple:aarch64-unknown-linux-gnu": glob([ "bin/*.so", "lib/*.so", "lib/rustlib/aarch64-unknown-linux-gnu/codegen-backends/*.so", - "lib/rustlib/aarch64-unknown-linux-gnu/bin/rust-lld", + "lib/rustlib/aarch64-unknown-linux-gnu/bin/**", "lib/rustlib/aarch64-unknown-linux-gnu/lib/*.so", ], allow_empty = True), "@local-remote-execution//rust/triple:aarch64-unknown-linux-musl": glob([ "bin/*.so", "lib/*.so", "lib/rustlib/aarch64-unknown-linux-musl/codegen-backends/*.so", - "lib/rustlib/aarch64-unknown-linux-musl/bin/rust-lld", + "lib/rustlib/aarch64-unknown-linux-musl/bin/**", "lib/rustlib/aarch64-unknown-linux-musl/lib/*.so", ], allow_empty = True), "@local-remote-execution//rust/triple:x86_64-apple-darwin": glob([ "bin/*.so", "lib/*.so", "lib/rustlib/x86_64-apple-darwin/codegen-backends/*.so", - "lib/rustlib/x86_64-apple-darwin/bin/rust-lld", + "lib/rustlib/x86_64-apple-darwin/bin/**", "lib/rustlib/x86_64-apple-darwin/lib/*.so", ], allow_empty = True), "@local-remote-execution//rust/triple:x86_64-unknown-linux-gnu": glob([ "bin/*.so", "lib/*.so", "lib/rustlib/x86_64-unknown-linux-gnu/codegen-backends/*.so", - "lib/rustlib/x86_64-unknown-linux-gnu/bin/rust-lld", + "lib/rustlib/x86_64-unknown-linux-gnu/bin/**", "lib/rustlib/x86_64-unknown-linux-gnu/lib/*.so", ], allow_empty = True), "@local-remote-execution//rust/triple:x86_64-unknown-linux-musl": glob([ 
"bin/*.so", "lib/*.so", "lib/rustlib/x86_64-unknown-linux-musl/codegen-backends/*.so", - "lib/rustlib/x86_64-unknown-linux-musl/bin/rust-lld", + "lib/rustlib/x86_64-unknown-linux-musl/bin/**", "lib/rustlib/x86_64-unknown-linux-musl/lib/*.so", ], allow_empty = True), }), diff --git a/local-remote-execution/rust/x86_64-linux.BUILD.bazel b/local-remote-execution/rust/x86_64-linux.BUILD.bazel index 9fdc08f2f..32909a27a 100644 --- a/local-remote-execution/rust/x86_64-linux.BUILD.bazel +++ b/local-remote-execution/rust/x86_64-linux.BUILD.bazel @@ -43,28 +43,28 @@ filegroup( "bin/*.so", "lib/*.so", "lib/rustlib/aarch64-unknown-linux-gnu/codegen-backends/*.so", - "lib/rustlib/aarch64-unknown-linux-gnu/bin/rust-lld", + "lib/rustlib/aarch64-unknown-linux-gnu/bin/**", "lib/rustlib/aarch64-unknown-linux-gnu/lib/*.so", ], allow_empty = True), "@local-remote-execution//rust/triple:aarch64-unknown-linux-musl": glob([ "bin/*.so", "lib/*.so", "lib/rustlib/aarch64-unknown-linux-musl/codegen-backends/*.so", - "lib/rustlib/aarch64-unknown-linux-musl/bin/rust-lld", + "lib/rustlib/aarch64-unknown-linux-musl/bin/**", "lib/rustlib/aarch64-unknown-linux-musl/lib/*.so", ], allow_empty = True), "@local-remote-execution//rust/triple:x86_64-unknown-linux-gnu": glob([ "bin/*.so", "lib/*.so", "lib/rustlib/x86_64-unknown-linux-gnu/codegen-backends/*.so", - "lib/rustlib/x86_64-unknown-linux-gnu/bin/rust-lld", + "lib/rustlib/x86_64-unknown-linux-gnu/bin/**", "lib/rustlib/x86_64-unknown-linux-gnu/lib/*.so", ], allow_empty = True), "@local-remote-execution//rust/triple:x86_64-unknown-linux-musl": glob([ "bin/*.so", "lib/*.so", "lib/rustlib/x86_64-unknown-linux-musl/codegen-backends/*.so", - "lib/rustlib/x86_64-unknown-linux-musl/bin/rust-lld", + "lib/rustlib/x86_64-unknown-linux-musl/bin/**", "lib/rustlib/x86_64-unknown-linux-musl/lib/*.so", ], allow_empty = True), }), diff --git a/nativelink-config/examples/basic_cas.json5 b/nativelink-config/examples/basic_cas.json5 index 4d7278204..c7d52d4ab 100644 --- a/nativelink-config/examples/basic_cas.json5 +++ b/nativelink-config/examples/basic_cas.json5 @@ -62,6 +62,10 @@ ISA: "exact", InputRootAbsolutePath: "ignore", // used by chromium builds, but we can drop it }, + // Enable locality-aware scheduling. The scheduler resolves input + // trees and scores workers by how many input bytes they already + // have cached. + cas_store: "WORKER_FAST_SLOW_STORE", }, }, ], @@ -72,6 +76,8 @@ uri: "grpc://127.0.0.1:50061", }, cas_fast_slow_store: "WORKER_FAST_SLOW_STORE", + // Expose a CAS server for peer-to-peer blob sharing. + cas_server_port: 50081, upload_action_result: { ac_store: "AC_MAIN_STORE", }, diff --git a/nativelink-config/examples/filesystem_cas.json5 b/nativelink-config/examples/filesystem_cas.json5 index 29e8f92e7..f4617c754 100644 --- a/nativelink-config/examples/filesystem_cas.json5 +++ b/nativelink-config/examples/filesystem_cas.json5 @@ -116,6 +116,8 @@ "lre-rs": "priority", ISA: "exact", }, + // Enable locality-aware scheduling. + cas_store: "CAS_MAIN_STORE", }, }, ], diff --git a/nativelink-config/examples/gcs_backend.json5 b/nativelink-config/examples/gcs_backend.json5 index 2fcd8cc6f..1ec07cce0 100644 --- a/nativelink-config/examples/gcs_backend.json5 +++ b/nativelink-config/examples/gcs_backend.json5 @@ -119,6 +119,8 @@ docker_image: "priority", "lre-rs": "priority", }, + // Enable locality-aware scheduling. 
+ cas_store: "CAS_MAIN_STORE", }, }, ], diff --git a/nativelink-config/examples/mongo.json5 b/nativelink-config/examples/mongo.json5 index 74d2168f1..28ed275b9 100644 --- a/nativelink-config/examples/mongo.json5 +++ b/nativelink-config/examples/mongo.json5 @@ -91,6 +91,8 @@ }, max_job_retries: 3, worker_timeout_s: 300, + // Enable locality-aware scheduling. + cas_store: "PRODUCTION_CAS", }, }, ], diff --git a/nativelink-config/examples/ontap_backend.json5 b/nativelink-config/examples/ontap_backend.json5 index d54bfc27b..40b4f8c49 100644 --- a/nativelink-config/examples/ontap_backend.json5 +++ b/nativelink-config/examples/ontap_backend.json5 @@ -138,6 +138,8 @@ "lre-rs": "priority", ISA: "exact", }, + // Enable locality-aware scheduling. + cas_store: "CAS_MAIN_STORE", }, }, ], diff --git a/nativelink-config/examples/s3_backend_with_local_fast_cas.json5 b/nativelink-config/examples/s3_backend_with_local_fast_cas.json5 index 4d9abf276..2c6f6b26a 100644 --- a/nativelink-config/examples/s3_backend_with_local_fast_cas.json5 +++ b/nativelink-config/examples/s3_backend_with_local_fast_cas.json5 @@ -140,6 +140,8 @@ "lre-rs": "priority", ISA: "exact", }, + // Enable locality-aware scheduling. + cas_store: "CAS_MAIN_STORE", }, }, ], diff --git a/nativelink-config/examples/worker_with_redis_scheduler.json5 b/nativelink-config/examples/worker_with_redis_scheduler.json5 index 85d845850..207fddc23 100644 --- a/nativelink-config/examples/worker_with_redis_scheduler.json5 +++ b/nativelink-config/examples/worker_with_redis_scheduler.json5 @@ -69,6 +69,8 @@ redis_store: "SCHEDULER_REDIS_STORE", }, }, + // Enable locality-aware scheduling. + cas_store: "WORKER_FAST_SLOW_STORE", }, }, ], @@ -80,6 +82,8 @@ }, max_inflight_tasks: 5, cas_fast_slow_store: "WORKER_FAST_SLOW_STORE", + // Expose a CAS server for peer-to-peer blob sharing. + cas_server_port: 50081, upload_action_result: { ac_store: "AC_MAIN_STORE", }, diff --git a/nativelink-config/src/cas_server.rs b/nativelink-config/src/cas_server.rs index 70616694d..0a5eb8edc 100644 --- a/nativelink-config/src/cas_server.rs +++ b/nativelink-config/src/cas_server.rs @@ -63,8 +63,11 @@ pub enum HttpCompressionAlgorithm { #[default] None, - /// Zlib compression. + /// Gzip compression. Gzip, + + /// Zstandard compression. + Zstd, } /// Note: Compressing data in the cloud rarely has a benefit, since most @@ -192,7 +195,7 @@ pub struct ByteStreamConfig { /// 16KiB - 64KiB is optimal. /// /// - /// Default: 64KiB + /// Default: 64MiB #[serde( default, deserialize_with = "convert_data_size_with_shellexpand", @@ -518,11 +521,18 @@ pub struct HttpListener { #[serde(default)] pub advanced_http: HttpServerConfig, - /// Maximum number of bytes to decode on each grpc stream chunk. + /// Maximum number of bytes to decode on each inbound gRPC message. /// Default: 4 MiB #[serde(default, deserialize_with = "convert_data_size_with_shellexpand")] pub max_decoding_message_size: usize, + /// Maximum number of bytes to encode on each outbound gRPC message. + /// Default: 4 MiB (matches Bazel's Java gRPC client inbound limit). + /// Workers with a higher `max_decoding_message_size` should use a + /// separate listener with this value raised accordingly. + #[serde(default, deserialize_with = "convert_data_size_with_shellexpand")] + pub max_encoding_message_size: usize, + /// Tls Configuration for this server. /// If not set, the server will not use TLS. /// @@ -820,6 +830,25 @@ pub struct LocalWorkerConfig { /// them from CAS for every action. 
/// Default: None (directory cache disabled) pub directory_cache: Option, + + /// If set, the worker will start a CAS + ByteStream gRPC server on + /// 0.0.0.0:<port> and advertise grpc://<hostname>:<port> to the + /// scheduler and other workers for peer-to-peer blob sharing. + /// The hostname is resolved at runtime via gethostname(). + /// Example: 50081 + /// Default: None (no peer CAS server) + #[serde(default)] + pub cas_server_port: Option<u16>, + + /// How often (in milliseconds) the worker should send a periodic + /// BlobsAvailable snapshot to the scheduler, reporting which blobs + /// are in the local CAS cache and their LRU timestamps. + /// + /// Default: 0 (uses the built-in default of 500ms) + #[serde(default, deserialize_with = "convert_numeric_with_shellexpand")] + pub blobs_available_interval_ms: u64, } #[derive(Deserialize, Serialize, Debug, Clone)] diff --git a/nativelink-config/src/schedulers.rs b/nativelink-config/src/schedulers.rs index 36b267c47..8ce90fcdd 100644 --- a/nativelink-config/src/schedulers.rs +++ b/nativelink-config/src/schedulers.rs @@ -160,6 +160,28 @@ pub struct SimpleSpec { deserialize_with = "convert_duration_with_shellexpand_and_negative" )] pub worker_match_logging_interval_s: i64, + + /// Maximum number of actions that can be matched to workers for a single + /// client (identified by `instance_name`) in one matching cycle. When + /// multiple clients are competing for workers, this prevents one client + /// from monopolizing all available workers by round-robin interleaving + /// actions from different clients. + /// + /// Set to 0 to disable fair scheduling (unlimited matches per client + /// per cycle). Default: 0 (disabled). + #[serde(default, deserialize_with = "convert_numeric_with_shellexpand")] + pub max_matches_per_client_per_cycle: usize, + + /// Name of the CAS store used for resolving input trees during + /// locality-aware scheduling. When set, the scheduler resolves the + /// full input tree for each action and scores workers by how many + /// input bytes they already have cached. + /// + /// This should reference a CAS store in the `stores` section. + /// If not set, locality-aware tree scoring is disabled (only the + /// action affinity tier is used). + #[serde(default)] + pub cas_store: Option<String>, } #[derive(Deserialize, Serialize, Debug)] diff --git a/nativelink-config/src/stores.rs b/nativelink-config/src/stores.rs index 59ecb7afa..8e38cfa9d 100644 --- a/nativelink-config/src/stores.rs +++ b/nativelink-config/src/stores.rs @@ -578,7 +578,7 @@ pub struct RefSpec { pub name: String, } -#[derive(Serialize, Deserialize, Debug, Default, Clone)] +#[derive(Serialize, Deserialize, Debug, Clone)] #[serde(deny_unknown_fields)] pub struct FilesystemSpec { /// Path on the system where to store the actual content. This is where @@ -599,7 +599,7 @@ /// Buffer size to use when reading files. Generally this should be left /// to the default value except for testing. - /// Default: 32k. + /// Default: 256k. #[serde(default, deserialize_with = "convert_data_size_with_shellexpand")] pub read_buffer_size: u32, @@ -624,6 +624,41 @@ /// Default: 0 #[serde(default, deserialize_with = "convert_numeric_with_shellexpand")] pub max_concurrent_writes: usize, + + /// If true, use sync_data() instead of sync_all() when flushing writes + /// to disk. sync_data() only syncs the file data without metadata + /// (timestamps, permissions), which is faster.
For content-addressed + /// storage where the content is verified by hash, metadata sync is + /// unnecessary and this significantly reduces write latency. + /// Default: true + #[serde(default = "default_sync_data_only")] + pub sync_data_only: bool, + + /// If true, skip writes when a blob with the same key already exists + /// in the store. This is safe for content-addressed storage (CAS) where + /// identical keys guarantee identical content. Do NOT enable this for + /// stores where the same key can hold different content (e.g. action + /// cache). + /// When a duplicate write is skipped, the existing entry's access time + /// is updated in the LRU to prevent premature eviction. + /// Default: false + #[serde(default)] + pub content_is_immutable: bool, +} + +impl Default for FilesystemSpec { + fn default() -> Self { + Self { + content_path: String::new(), + temp_path: String::new(), + read_buffer_size: 0, + eviction_policy: None, + block_size: 0, + max_concurrent_writes: 0, + sync_data_only: true, + content_is_immutable: false, + } + } } // NetApp ONTAP S3 Spec @@ -1095,6 +1130,32 @@ pub struct GrpcEndpoint { /// If not set or 0, defaults to 20 seconds. #[serde(default, deserialize_with = "convert_duration_with_shellexpand")] pub http2_keepalive_timeout_s: u64, + + /// Whether to set TCP_NODELAY on the connection socket. + /// Disables Nagle's algorithm, reducing latency for small writes. + /// Default: true + #[serde(default = "default_tcp_nodelay")] + pub tcp_nodelay: bool, +} + +fn default_sync_data_only() -> bool { + true +} + +fn default_tcp_nodelay() -> bool { + true +} + +fn default_batch_update_threshold_bytes() -> u64 { + 1_048_576 +} + +fn default_batch_coalesce_delay_ms() -> u64 { + 10 +} + +const fn default_connections_per_endpoint() -> usize { + 32 } #[derive(Serialize, Deserialize, Debug, Clone)] @@ -1121,8 +1182,8 @@ pub struct GrpcSpec { pub max_concurrent_requests: usize, /// The number of connections to make to each specified endpoint to balance - /// the load over multiple TCP connections. Default 1. - #[serde(default, deserialize_with = "convert_numeric_with_shellexpand")] + /// the load over multiple TCP connections. Default 32. + #[serde(default = "default_connections_per_endpoint", deserialize_with = "convert_numeric_with_shellexpand")] pub connections_per_endpoint: usize, /// Maximum time (seconds) allowed for a single RPC request (e.g. a @@ -1132,6 +1193,35 @@ /// Default: 120 (seconds) #[serde(default, deserialize_with = "convert_duration_with_shellexpand")] pub rpc_timeout_s: u64, + + /// Maximum blob size (in bytes) for using BatchUpdateBlobs instead of + /// ByteStream.Write. Blobs at or below this size skip per-blob streaming + /// overhead (UUID generation, resource_name, streaming setup). Only + /// applies to CAS stores, not AC. + /// + /// Set to 0 to disable (all uploads use ByteStream.Write). + /// + /// Default: 1048576 (1 MiB) + #[serde( + default = "default_batch_update_threshold_bytes", + deserialize_with = "convert_numeric_with_shellexpand" + )] + pub batch_update_threshold_bytes: u64, + + /// Time window (in milliseconds) to coalesce multiple small blob uploads + /// into a single BatchUpdateBlobs RPC. Requires + /// `batch_update_threshold_bytes > 0`. + /// + /// When > 0, incoming small uploads are buffered for up to this duration + /// before being sent as one batch. When 0, each small upload is sent + /// immediately as a single-element BatchUpdateBlobs RPC.
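The coalescing window described above is essentially a bounded buffer with a flush timer. A simplified sketch of the idea, not NativeLink's implementation (`send_batch` stands in for the real BatchUpdateBlobs call, and this version restarts the window on every received blob):

```rust
use std::time::Duration;
use tokio::sync::mpsc::Receiver;
use tokio::time::timeout;

// Buffer small uploads for up to `delay`, then flush them as one
// BatchUpdateBlobs-style call; flush early once `max_batch` is reached.
async fn coalesce_uploads(mut rx: Receiver<Vec<u8>>, delay: Duration, max_batch: usize) {
    let mut pending: Vec<Vec<u8>> = Vec::new();
    loop {
        match timeout(delay, rx.recv()).await {
            // A new small blob arrived within the window.
            Ok(Some(blob)) => {
                pending.push(blob);
                if pending.len() >= max_batch {
                    send_batch(std::mem::take(&mut pending)).await;
                }
            }
            // The window elapsed with nothing new: flush what we have.
            Err(_) => {
                if !pending.is_empty() {
                    send_batch(std::mem::take(&mut pending)).await;
                }
            }
            // Channel closed: flush any remainder and stop.
            Ok(None) => {
                if !pending.is_empty() {
                    send_batch(std::mem::take(&mut pending)).await;
                }
                break;
            }
        }
    }
}

// Stand-in for one BatchUpdateBlobs RPC carrying all buffered blobs.
async fn send_batch(batch: Vec<Vec<u8>>) {
    let _ = batch;
}
```

One constraint the real store must respect: the combined batch has to fit under the receiving server's `max_decoding_message_size` (4 MiB by default per the listener config earlier in this diff), which is presumably why the per-blob threshold defaults well below that.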
+ /// + /// Default: 10 (milliseconds) + #[serde( + default = "default_batch_coalesce_delay_ms", + deserialize_with = "convert_numeric_with_shellexpand" + )] + pub batch_coalesce_delay_ms: u64, } /// The possible error codes that might occur on an upstream request. diff --git a/nativelink-error/Cargo.toml b/nativelink-error/Cargo.toml index 13581368b..3b8b2a976 100644 --- a/nativelink-error/Cargo.toml +++ b/nativelink-error/Cargo.toml @@ -14,8 +14,8 @@ version = "1.0.0-rc2" nativelink-metric = { path = "../nativelink-metric" } nativelink-proto = { path = "../nativelink-proto" } -prost = { version = "0.13.5", default-features = false } -prost-types = { version = "0.13.5", default-features = false } +prost = { version = "0.14.3", default-features = false } +prost-types = { version = "0.14.3", default-features = false } redis = { version = "1.0.0", default-features = false } rustls-pki-types = { version = "1.13.1", default-features = false } serde = { version = "1.0.219", default-features = false } @@ -26,8 +26,8 @@ tokio = { version = "1.44.1", features = [ "rt-multi-thread", "signal", ], default-features = false } -tonic = { version = "0.13.0", features = [ - "tls-ring", +tonic = { version = "0.14.5", features = [ + "tls-aws-lc", "transport", ], default-features = false } url = { version = "2.5.7", default-features = false } diff --git a/nativelink-error/src/lib.rs b/nativelink-error/src/lib.rs index 04df9e64a..a6bcddbbd 100644 --- a/nativelink-error/src/lib.rs +++ b/nativelink-error/src/lib.rs @@ -56,6 +56,8 @@ pub struct Error { #[serde(with = "CodeDef")] pub code: Code, pub messages: Vec<String>, + #[serde(skip)] + pub details: Vec<prost_types::Any>, } impl MetricsComponent for Error { @@ -71,7 +73,11 @@ impl Error { #[must_use] pub const fn new_with_messages(code: Code, messages: Vec<String>) -> Self { - Self { code, messages } + Self { + code, + messages, + details: Vec::new(), + } } #[must_use] @@ -131,7 +137,7 @@ impl From<Error> for nativelink_proto::google::rpc::Status { Self { code: val.code as i32, message: val.message_string(), - details: vec![], + details: val.details, } } } @@ -141,6 +147,7 @@ impl From<nativelink_proto::google::rpc::Status> for Error { Self { code: val.code.into(), messages: vec![val.message], + details: val.details, } } } @@ -156,6 +163,10 @@ impl core::fmt::Display for Error { builder.field("messages", &self.messages); } + if !self.details.is_empty() { + builder.field("details", &self.details); + } + builder.finish() } } @@ -252,6 +263,7 @@ impl From<std::io::Error> for Error { Self { code: err.kind().into_code(), messages: vec![err.to_string()], + details: Vec::new(), } } } @@ -405,6 +417,7 @@ impl<T> ResultExt<T> for Option<T> { let mut error = Error { code: Code::Internal, messages: vec![], + details: Vec::new(), }; let (code, message) = tip_fn(&error); error.code = code; @@ -486,3 +499,69 @@ pub enum CodeDef { // NOTE: Additional codes must be added to stores.rs in ErrorCodes and also // in both match statements in retry.rs.
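The practical payoff of threading `details` through `Error` is that typed `google.rpc.Status` payloads, such as the `PreconditionFailure` the tests below exercise, survive the trip back to clients. A sketch of attaching a detail, using only the fields this diff introduces (encoding a real `PreconditionFailure` message is elided; any prost message can be packed the same way):

```rust
use nativelink_error::{Code, Error};
use prost::Message;

// Pack any prost message into a google.protobuf.Any, the type carried
// by `Error::details` and `google.rpc.Status::details`.
fn pack_detail<M: Message>(type_url: &str, msg: &M) -> prost_types::Any {
    prost_types::Any {
        type_url: type_url.to_string(),
        value: msg.encode_to_vec(),
    }
}

// Build a FailedPrecondition error that tells the client which blob is
// missing; `detail` would be an encoded google.rpc.PreconditionFailure.
fn missing_blob_error(detail: prost_types::Any) -> Error {
    let mut err =
        Error::new_with_messages(Code::FailedPrecondition, vec!["missing blob".into()]);
    err.details.push(detail);
    err
}
```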
} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn error_to_rpc_status_preserves_details() { + let detail = prost_types::Any { + type_url: "type.googleapis.com/google.rpc.PreconditionFailure".into(), + value: vec![1, 2, 3], // Dummy bytes + }; + let err = Error { + code: Code::FailedPrecondition, + messages: vec!["missing blob".into()], + details: vec![detail.clone()], + }; + let status: nativelink_proto::google::rpc::Status = err.into(); + assert_eq!(status.code, Code::FailedPrecondition as i32); + assert_eq!(status.details.len(), 1); + assert_eq!(status.details[0].type_url, detail.type_url); + assert_eq!(status.details[0].value, detail.value); + } + + #[test] + fn rpc_status_to_error_preserves_details() { + let detail = prost_types::Any { + type_url: "type.googleapis.com/google.rpc.PreconditionFailure".into(), + value: vec![4, 5, 6], + }; + let status = nativelink_proto::google::rpc::Status { + code: Code::FailedPrecondition as i32, + message: "test".into(), + details: vec![detail.clone()], + }; + let err: Error = status.into(); + assert_eq!(err.code, Code::FailedPrecondition); + assert_eq!(err.details.len(), 1); + assert_eq!(err.details[0].type_url, detail.type_url); + assert_eq!(err.details[0].value, detail.value); + } + + #[test] + fn error_details_roundtrip_through_rpc_status() { + let detail = prost_types::Any { + type_url: "type.googleapis.com/google.rpc.PreconditionFailure".into(), + value: vec![10, 20, 30], + }; + let original = Error { + code: Code::FailedPrecondition, + messages: vec!["missing".into()], + details: vec![detail], + }; + let status: nativelink_proto::google::rpc::Status = original.clone().into(); + let roundtripped: Error = status.into(); + assert_eq!(roundtripped.code, original.code); + assert_eq!(roundtripped.details.len(), original.details.len()); + assert_eq!(roundtripped.details[0].type_url, original.details[0].type_url); + assert_eq!(roundtripped.details[0].value, original.details[0].value); + } + + #[test] + fn make_err_macro_has_empty_details() { + let err = make_err!(Code::Internal, "something failed"); + assert!(err.details.is_empty()); + } +} diff --git a/nativelink-metric/src/lib.rs b/nativelink-metric/src/lib.rs index 5661f14b0..b885262dd 100644 --- a/nativelink-metric/src/lib.rs +++ b/nativelink-metric/src/lib.rs @@ -458,6 +458,18 @@ impl<T: MetricsComponent> MetricsComponent for async_lock::Mutex<T> { } } +impl<T: MetricsComponent> MetricsComponent for async_lock::RwLock<T> { + fn publish( + &self, + kind: MetricKind, + field_metadata: MetricFieldData, + ) -> Result<MetricPublishKnownKindData, Error> { + // It is safe to block in the publishing thread.
+ let lock = self.read_blocking(); + lock.publish(kind, field_metadata) + } +} + impl MetricsComponent for parking_lot::Mutex { fn publish( &self, diff --git a/nativelink-proto/Cargo.toml b/nativelink-proto/Cargo.toml index fb9a08ad3..4174b44f3 100644 --- a/nativelink-proto/Cargo.toml +++ b/nativelink-proto/Cargo.toml @@ -12,20 +12,19 @@ path = "genproto/lib.rs" derive_more = { version = "2.0.1", default-features = false, features = [ "debug", ] } -prost = { version = "0.13.5", default-features = false } -prost-types = { version = "0.13.5", default-features = false } -tonic = { version = "0.13.0", features = [ +prost = { version = "0.14.3", default-features = false } +prost-types = { version = "0.14.3", default-features = false } +tonic = { version = "0.14.5", features = [ "codegen", - "prost", - "tls-ring", + "tls-aws-lc", "transport", ], default-features = false } +tonic-prost = { version = "0.14.5", default-features = false } [dev-dependencies] -prost-build = { version = "0.13.5", default-features = false } -tonic-build = { version = "0.13.0", features = [ - "prost", -], default-features = false } +prost-build = { version = "0.14.3", default-features = false } +tonic-build = { version = "0.14.5", default-features = false } +tonic-prost-build = { version = "0.14.5", default-features = false } [package.metadata.cargo-machete] # Used by gen_protos_tool.rs diff --git a/nativelink-proto/com/github/trace_machina/nativelink/remote_execution/worker_api.proto b/nativelink-proto/com/github/trace_machina/nativelink/remote_execution/worker_api.proto index d736d1624..d472505b2 100644 --- a/nativelink-proto/com/github/trace_machina/nativelink/remote_execution/worker_api.proto +++ b/nativelink-proto/com/github/trace_machina/nativelink/remote_execution/worker_api.proto @@ -44,6 +44,9 @@ service WorkerApi { /// Request object for keep alive requests. message KeepAliveRequest { reserved 1; // NextId. + /// CPU load percentage: load_avg_1m / num_cpus * 100. + /// 0 means unknown (old workers that don't report load). + uint32 cpu_load_pct = 2; } /// Request object for going away requests. @@ -75,7 +78,86 @@ message ConnectWorkerRequest { /// The default (0) means unlimited. uint64 max_inflight_tasks = 3; - reserved 4; // NextId. + /// This worker's CAS gRPC endpoint for peer blob serving. + /// If set, other workers can fetch blobs directly from this worker. + /// Example: "grpc://192.168.191.5:50081" + string cas_endpoint = 5; + + reserved 4; + reserved 6; +} + +/// Per-digest info including LRU access time for cache eviction heuristics. +message BlobDigestInfo { + /// The digest of the blob. + build.bazel.remote.execution.v2.Digest digest = 1; + /// The last time this blob was accessed in the worker's local cache. + /// Seconds since UNIX epoch. The scheduler can use this to estimate + /// how close a blob is to eviction (lower = more likely to be evicted). + int64 last_access_timestamp = 2; +} + +/// Notification that blobs are available on a worker for peer serving. +message BlobsAvailableNotification { + /// The worker's CAS endpoint where these blobs can be fetched. + string worker_cas_endpoint = 1; + /// The digests of newly available blobs (kept for backward compat / simple notifications). + repeated build.bazel.remote.execution.v2.Digest digests = 2; + /// If true, this is a full snapshot of all blobs in the worker's cache. + /// The server should replace its entire view for this endpoint with the + /// contents of this message (digest_infos + digests). 
If false, this is + /// an incremental update (new blobs only). + bool is_full_snapshot = 3; + /// Digests that have been evicted from the worker since the last update. + /// Only meaningful when is_full_snapshot == false. + repeated build.bazel.remote.execution.v2.Digest evicted_digests = 4; + /// Per-digest info with LRU timestamps. When present, the server should + /// prefer this over the plain `digests` field. + repeated BlobDigestInfo digest_infos = 5; + /// CPU load percentage: load_avg_1m / num_cpus * 100. + /// 0 means unknown (old workers that don't report load). + uint32 cpu_load_pct = 6; + /// Digests of input root directories that are cached in this worker's + /// directory cache. The scheduler can give routing preference to workers + /// that already have the action's input_root_digest cached. + /// Also used for the full subtree snapshot (when is_full_subtree_snapshot=true, + /// this contains ALL directory digests including subtrees). + repeated build.bazel.remote.execution.v2.Digest cached_directory_digests = 7; + + /// Delta-encoded subtree updates since last notification. + /// When a cache entry is added, send ALL directory digests in its merkle tree. + /// When a cache entry is evicted, send ALL directory digests that were removed + /// (only those no longer present in ANY cached entry's merkle tree). + repeated build.bazel.remote.execution.v2.Digest added_subtree_digests = 8; + repeated build.bazel.remote.execution.v2.Digest removed_subtree_digests = 9; + + /// True on the first notification after (re)connect — scheduler should + /// replace its cached_subtree_digests state rather than applying a delta. + /// In this case, cached_directory_digests (field 7) contains the full set + /// of all subtree digests. + bool is_full_subtree_snapshot = 10; +} + +/// Notification that blobs have been evicted from a worker. +message BlobsEvictedNotification { + /// The worker's CAS endpoint from which these blobs were evicted. + string worker_cas_endpoint = 1; + /// The digests of evicted blobs. + repeated build.bazel.remote.execution.v2.Digest digests = 2; +} + +/// Request to touch (update access time) blobs on a worker to prevent eviction. +message TouchBlobsRequest { + /// The digests of blobs to touch. + repeated build.bazel.remote.execution.v2.Digest digests = 1; +} + +/// A hint that a specific digest is available on one or more peer workers. +message PeerHint { + /// The digest available on peers. + build.bazel.remote.execution.v2.Digest digest = 1; + /// gRPC endpoints of workers that have this blob. + repeated string peer_endpoints = 2; } /// The result of an ExecutionRequest. @@ -106,6 +188,9 @@ message ExecuteResult { message ExecuteComplete { /// The operation ID that was executed. string operation_id = 1; + /// CPU load percentage: load_avg_1m / num_cpus * 100. + /// 0 means unknown (old workers that don't report load). + uint32 cpu_load_pct = 2; } /// Result sent back from the server when a node connects. @@ -146,8 +231,12 @@ message UpdateForWorker { /// Instructs the worker to kill a specific running operation. KillOperationRequest kill_operation_request = 5; + + /// Instructs the worker to touch (update access time) on blobs + /// to prevent premature eviction. + TouchBlobsRequest touch_blobs = 7; } - reserved 6; // NextId. + reserved 6; // Previously NextId, now reserved. } /// Communication from the worker to the scheduler. @@ -182,8 +271,14 @@ message UpdateForScheduler { /// Notify that the execution has completed, but result is uploading. 
ExecuteComplete execute_complete = 5; + + /// Notifies the scheduler that new blobs are available on this worker. + BlobsAvailableNotification blobs_available = 7; + + /// Notifies the scheduler that blobs have been evicted from this worker. + BlobsEvictedNotification blobs_evicted = 8; } - reserved 6; // NextId. + reserved 6; // Previously NextId, now reserved. } message StartExecute { @@ -204,7 +299,11 @@ message StartExecute { /// The ID of the worker that is executing the action. string worker_id = 6; - reserved 7; // NextId. + /// Hints about input blobs available on peer workers. + /// Workers should try these peers first before falling back to server CAS. + repeated PeerHint peer_hints = 8; + + reserved 9; // NextId. } /// This is a special message used to save actions into the CAS that can be used diff --git a/nativelink-proto/genproto/build.bazel.remote.asset.v1.pb.rs b/nativelink-proto/genproto/build.bazel.remote.asset.v1.pb.rs index c2a863a12..b88f92115 100644 --- a/nativelink-proto/genproto/build.bazel.remote.asset.v1.pb.rs +++ b/nativelink-proto/genproto/build.bazel.remote.asset.v1.pb.rs @@ -531,7 +531,7 @@ pub mod fetch_client { format!("Service was not ready: {}", e.into()), ) })?; - let codec = tonic::codec::ProstCodec::default(); + let codec = tonic_prost::ProstCodec::default(); let path = http::uri::PathAndQuery::from_static( "/build.bazel.remote.asset.v1.Fetch/FetchBlob", ); @@ -557,7 +557,7 @@ pub mod fetch_client { format!("Service was not ready: {}", e.into()), ) })?; - let codec = tonic::codec::ProstCodec::default(); + let codec = tonic_prost::ProstCodec::default(); let path = http::uri::PathAndQuery::from_static( "/build.bazel.remote.asset.v1.Fetch/FetchDirectory", ); @@ -709,7 +709,7 @@ pub mod push_client { format!("Service was not ready: {}", e.into()), ) })?; - let codec = tonic::codec::ProstCodec::default(); + let codec = tonic_prost::ProstCodec::default(); let path = http::uri::PathAndQuery::from_static( "/build.bazel.remote.asset.v1.Push/PushBlob", ); @@ -733,7 +733,7 @@ pub mod push_client { format!("Service was not ready: {}", e.into()), ) })?; - let codec = tonic::codec::ProstCodec::default(); + let codec = tonic_prost::ProstCodec::default(); let path = http::uri::PathAndQuery::from_static( "/build.bazel.remote.asset.v1.Push/PushDirectory", ); @@ -943,7 +943,7 @@ pub mod fetch_server { let inner = self.inner.clone(); let fut = async move { let method = FetchBlobSvc(inner); - let codec = tonic::codec::ProstCodec::default(); + let codec = tonic_prost::ProstCodec::default(); let mut grpc = tonic::server::Grpc::new(codec) .apply_compression_config( accept_compression_encodings, @@ -988,7 +988,7 @@ pub mod fetch_server { let inner = self.inner.clone(); let fut = async move { let method = FetchDirectorySvc(inner); - let codec = tonic::codec::ProstCodec::default(); + let codec = tonic_prost::ProstCodec::default(); let mut grpc = tonic::server::Grpc::new(codec) .apply_compression_config( accept_compression_encodings, @@ -1216,7 +1216,7 @@ pub mod push_server { let inner = self.inner.clone(); let fut = async move { let method = PushBlobSvc(inner); - let codec = tonic::codec::ProstCodec::default(); + let codec = tonic_prost::ProstCodec::default(); let mut grpc = tonic::server::Grpc::new(codec) .apply_compression_config( accept_compression_encodings, @@ -1261,7 +1261,7 @@ pub mod push_server { let inner = self.inner.clone(); let fut = async move { let method = PushDirectorySvc(inner); - let codec = tonic::codec::ProstCodec::default(); + let codec = 
tonic_prost::ProstCodec::default(); let mut grpc = tonic::server::Grpc::new(codec) .apply_compression_config( accept_compression_encodings, diff --git a/nativelink-proto/genproto/build.bazel.remote.execution.v2.pb.rs b/nativelink-proto/genproto/build.bazel.remote.execution.v2.pb.rs index f6e831311..c033f959e 100644 --- a/nativelink-proto/genproto/build.bazel.remote.execution.v2.pb.rs +++ b/nativelink-proto/genproto/build.bazel.remote.execution.v2.pb.rs @@ -2052,7 +2052,7 @@ pub mod execution_client { format!("Service was not ready: {}", e.into()), ) })?; - let codec = tonic::codec::ProstCodec::default(); + let codec = tonic_prost::ProstCodec::default(); let path = http::uri::PathAndQuery::from_static( "/build.bazel.remote.execution.v2.Execution/Execute", ); @@ -2099,7 +2099,7 @@ pub mod execution_client { format!("Service was not ready: {}", e.into()), ) })?; - let codec = tonic::codec::ProstCodec::default(); + let codec = tonic_prost::ProstCodec::default(); let path = http::uri::PathAndQuery::from_static( "/build.bazel.remote.execution.v2.Execution/WaitExecution", ); @@ -2235,7 +2235,7 @@ pub mod action_cache_client { format!("Service was not ready: {}", e.into()), ) })?; - let codec = tonic::codec::ProstCodec::default(); + let codec = tonic_prost::ProstCodec::default(); let path = http::uri::PathAndQuery::from_static( "/build.bazel.remote.execution.v2.ActionCache/GetActionResult", ); @@ -2280,7 +2280,7 @@ pub mod action_cache_client { format!("Service was not ready: {}", e.into()), ) })?; - let codec = tonic::codec::ProstCodec::default(); + let codec = tonic_prost::ProstCodec::default(); let path = http::uri::PathAndQuery::from_static( "/build.bazel.remote.execution.v2.ActionCache/UpdateActionResult", ); @@ -2545,7 +2545,7 @@ pub mod content_addressable_storage_client { format!("Service was not ready: {}", e.into()), ) })?; - let codec = tonic::codec::ProstCodec::default(); + let codec = tonic_prost::ProstCodec::default(); let path = http::uri::PathAndQuery::from_static( "/build.bazel.remote.execution.v2.ContentAddressableStorage/FindMissingBlobs", ); @@ -2597,7 +2597,7 @@ pub mod content_addressable_storage_client { format!("Service was not ready: {}", e.into()), ) })?; - let codec = tonic::codec::ProstCodec::default(); + let codec = tonic_prost::ProstCodec::default(); let path = http::uri::PathAndQuery::from_static( "/build.bazel.remote.execution.v2.ContentAddressableStorage/BatchUpdateBlobs", ); @@ -2646,7 +2646,7 @@ pub mod content_addressable_storage_client { format!("Service was not ready: {}", e.into()), ) })?; - let codec = tonic::codec::ProstCodec::default(); + let codec = tonic_prost::ProstCodec::default(); let path = http::uri::PathAndQuery::from_static( "/build.bazel.remote.execution.v2.ContentAddressableStorage/BatchReadBlobs", ); @@ -2698,7 +2698,7 @@ pub mod content_addressable_storage_client { format!("Service was not ready: {}", e.into()), ) })?; - let codec = tonic::codec::ProstCodec::default(); + let codec = tonic_prost::ProstCodec::default(); let path = http::uri::PathAndQuery::from_static( "/build.bazel.remote.execution.v2.ContentAddressableStorage/GetTree", ); @@ -2825,7 +2825,7 @@ pub mod capabilities_client { format!("Service was not ready: {}", e.into()), ) })?; - let codec = tonic::codec::ProstCodec::default(); + let codec = tonic_prost::ProstCodec::default(); let path = http::uri::PathAndQuery::from_static( "/build.bazel.remote.execution.v2.Capabilities/GetCapabilities", ); @@ -3086,7 +3086,7 @@ pub mod execution_server { let inner = self.inner.clone(); let fut 
= async move { let method = ExecuteSvc(inner); - let codec = tonic::codec::ProstCodec::default(); + let codec = tonic_prost::ProstCodec::default(); let mut grpc = tonic::server::Grpc::new(codec) .apply_compression_config( accept_compression_encodings, @@ -3132,7 +3132,7 @@ pub mod execution_server { let inner = self.inner.clone(); let fut = async move { let method = WaitExecutionSvc(inner); - let codec = tonic::codec::ProstCodec::default(); + let codec = tonic_prost::ProstCodec::default(); let mut grpc = tonic::server::Grpc::new(codec) .apply_compression_config( accept_compression_encodings, @@ -3362,7 +3362,7 @@ pub mod action_cache_server { let inner = self.inner.clone(); let fut = async move { let method = GetActionResultSvc(inner); - let codec = tonic::codec::ProstCodec::default(); + let codec = tonic_prost::ProstCodec::default(); let mut grpc = tonic::server::Grpc::new(codec) .apply_compression_config( accept_compression_encodings, @@ -3408,7 +3408,7 @@ pub mod action_cache_server { let inner = self.inner.clone(); let fut = async move { let method = UpdateActionResultSvc(inner); - let codec = tonic::codec::ProstCodec::default(); + let codec = tonic_prost::ProstCodec::default(); let mut grpc = tonic::server::Grpc::new(codec) .apply_compression_config( accept_compression_encodings, @@ -3837,7 +3837,7 @@ pub mod content_addressable_storage_server { let inner = self.inner.clone(); let fut = async move { let method = FindMissingBlobsSvc(inner); - let codec = tonic::codec::ProstCodec::default(); + let codec = tonic_prost::ProstCodec::default(); let mut grpc = tonic::server::Grpc::new(codec) .apply_compression_config( accept_compression_encodings, @@ -3886,7 +3886,7 @@ pub mod content_addressable_storage_server { let inner = self.inner.clone(); let fut = async move { let method = BatchUpdateBlobsSvc(inner); - let codec = tonic::codec::ProstCodec::default(); + let codec = tonic_prost::ProstCodec::default(); let mut grpc = tonic::server::Grpc::new(codec) .apply_compression_config( accept_compression_encodings, @@ -3935,7 +3935,7 @@ pub mod content_addressable_storage_server { let inner = self.inner.clone(); let fut = async move { let method = BatchReadBlobsSvc(inner); - let codec = tonic::codec::ProstCodec::default(); + let codec = tonic_prost::ProstCodec::default(); let mut grpc = tonic::server::Grpc::new(codec) .apply_compression_config( accept_compression_encodings, @@ -3982,7 +3982,7 @@ pub mod content_addressable_storage_server { let inner = self.inner.clone(); let fut = async move { let method = GetTreeSvc(inner); - let codec = tonic::codec::ProstCodec::default(); + let codec = tonic_prost::ProstCodec::default(); let mut grpc = tonic::server::Grpc::new(codec) .apply_compression_config( accept_compression_encodings, @@ -4180,7 +4180,7 @@ pub mod capabilities_server { let inner = self.inner.clone(); let fut = async move { let method = GetCapabilitiesSvc(inner); - let codec = tonic::codec::ProstCodec::default(); + let codec = tonic_prost::ProstCodec::default(); let mut grpc = tonic::server::Grpc::new(codec) .apply_compression_config( accept_compression_encodings, diff --git a/nativelink-proto/genproto/com.github.trace_machina.nativelink.remote_execution.pb.rs b/nativelink-proto/genproto/com.github.trace_machina.nativelink.remote_execution.pb.rs index c4a53f73f..6e60964f4 100644 --- a/nativelink-proto/genproto/com.github.trace_machina.nativelink.remote_execution.pb.rs +++ b/nativelink-proto/genproto/com.github.trace_machina.nativelink.remote_execution.pb.rs @@ -15,7 +15,12 @@ // This file is 
@generated by prost-build. /// / Request object for keep alive requests. #[derive(Clone, Copy, PartialEq, ::prost::Message)] -pub struct KeepAliveRequest {} +pub struct KeepAliveRequest { + /// / CPU load percentage: load_avg_1m / num_cpus * 100. + /// / 0 means unknown (old workers that don't report load). + #[prost(uint32, tag = "2")] + pub cpu_load_pct: u32, +} /// / Request object for going away requests. #[derive(Clone, Copy, PartialEq, ::prost::Message)] pub struct GoingAwayRequest {} @@ -46,6 +51,117 @@ pub struct ConnectWorkerRequest { /// / The default (0) means unlimited. #[prost(uint64, tag = "3")] pub max_inflight_tasks: u64, + /// / This worker's CAS gRPC endpoint for peer blob serving. + /// / If set, other workers can fetch blobs directly from this worker. + /// / Example: "grpc://192.168.191.5:50081" + #[prost(string, tag = "5")] + pub cas_endpoint: ::prost::alloc::string::String, +} +/// / Per-digest info including LRU access time for cache eviction heuristics. +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct BlobDigestInfo { + /// / The digest of the blob. + #[prost(message, optional, tag = "1")] + pub digest: ::core::option::Option< + super::super::super::super::super::build::bazel::remote::execution::v2::Digest, + >, + /// / The last time this blob was accessed in the worker's local cache. + /// / Seconds since UNIX epoch. The scheduler can use this to estimate + /// / how close a blob is to eviction (lower = more likely to be evicted). + #[prost(int64, tag = "2")] + pub last_access_timestamp: i64, +} +/// / Notification that blobs are available on a worker for peer serving. +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct BlobsAvailableNotification { + /// / The worker's CAS endpoint where these blobs can be fetched. + #[prost(string, tag = "1")] + pub worker_cas_endpoint: ::prost::alloc::string::String, + /// / The digests of newly available blobs (kept for backward compat / simple notifications). + #[prost(message, repeated, tag = "2")] + pub digests: ::prost::alloc::vec::Vec< + super::super::super::super::super::build::bazel::remote::execution::v2::Digest, + >, + /// / If true, this is a full snapshot of all blobs in the worker's cache. + /// / The server should replace its entire view for this endpoint with the + /// / contents of this message (digest_infos + digests). If false, this is + /// / an incremental update (new blobs only). + #[prost(bool, tag = "3")] + pub is_full_snapshot: bool, + /// / Digests that have been evicted from the worker since the last update. + /// / Only meaningful when is_full_snapshot == false. + #[prost(message, repeated, tag = "4")] + pub evicted_digests: ::prost::alloc::vec::Vec< + super::super::super::super::super::build::bazel::remote::execution::v2::Digest, + >, + /// / Per-digest info with LRU timestamps. When present, the server should + /// / prefer this over the plain `digests` field. + #[prost(message, repeated, tag = "5")] + pub digest_infos: ::prost::alloc::vec::Vec, + /// / CPU load percentage: load_avg_1m / num_cpus * 100. + /// / 0 means unknown (old workers that don't report load). + #[prost(uint32, tag = "6")] + pub cpu_load_pct: u32, + /// / Digests of input root directories that are cached in this worker's + /// / directory cache. The scheduler can give routing preference to workers + /// / that already have the action's input_root_digest cached. + /// / Also used for the full subtree snapshot (when is_full_subtree_snapshot=true, + /// / this contains ALL directory digests including subtrees). 
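Taken together, `is_full_snapshot`, `evicted_digests`, and the subtree delta fields define a small replace-or-patch protocol for the scheduler's per-endpoint view of worker caches. A sketch of the bookkeeping those semantics imply on the scheduler side (digests simplified to strings; these are not NativeLink's actual types):

```rust
use std::collections::{HashMap, HashSet};

// Per-endpoint view of which blobs each worker currently holds.
#[derive(Default)]
struct PeerBlobView {
    by_endpoint: HashMap<String, HashSet<String>>,
}

impl PeerBlobView {
    // Apply one BlobsAvailableNotification-shaped update: a full
    // snapshot replaces the endpoint's whole set, while a delta adds
    // new digests and drops evicted ones.
    fn apply(
        &mut self,
        endpoint: &str,
        is_full_snapshot: bool,
        digests: Vec<String>,
        evicted_digests: Vec<String>,
    ) {
        let view = self.by_endpoint.entry(endpoint.to_string()).or_default();
        if is_full_snapshot {
            *view = digests.into_iter().collect();
        } else {
            view.extend(digests);
            for digest in &evicted_digests {
                view.remove(digest);
            }
        }
    }
}
```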
+ #[prost(message, repeated, tag = "7")] + pub cached_directory_digests: ::prost::alloc::vec::Vec< + super::super::super::super::super::build::bazel::remote::execution::v2::Digest, + >, + /// / Delta-encoded subtree updates since last notification. + /// / When a cache entry is added, send ALL directory digests in its merkle tree. + /// / When a cache entry is evicted, send ALL directory digests that were removed + /// / (only those no longer present in ANY cached entry's merkle tree). + #[prost(message, repeated, tag = "8")] + pub added_subtree_digests: ::prost::alloc::vec::Vec< + super::super::super::super::super::build::bazel::remote::execution::v2::Digest, + >, + #[prost(message, repeated, tag = "9")] + pub removed_subtree_digests: ::prost::alloc::vec::Vec< + super::super::super::super::super::build::bazel::remote::execution::v2::Digest, + >, + /// / True on the first notification after (re)connect — scheduler should + /// / replace its cached_subtree_digests state rather than applying a delta. + /// / In this case, cached_directory_digests (field 7) contains the full set + /// / of all subtree digests. + #[prost(bool, tag = "10")] + pub is_full_subtree_snapshot: bool, +} +/// / Notification that blobs have been evicted from a worker. +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct BlobsEvictedNotification { + /// / The worker's CAS endpoint from which these blobs were evicted. + #[prost(string, tag = "1")] + pub worker_cas_endpoint: ::prost::alloc::string::String, + /// / The digests of evicted blobs. + #[prost(message, repeated, tag = "2")] + pub digests: ::prost::alloc::vec::Vec< + super::super::super::super::super::build::bazel::remote::execution::v2::Digest, + >, +} +/// / Request to touch (update access time) blobs on a worker to prevent eviction. +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct TouchBlobsRequest { + /// / The digests of blobs to touch. + #[prost(message, repeated, tag = "1")] + pub digests: ::prost::alloc::vec::Vec< + super::super::super::super::super::build::bazel::remote::execution::v2::Digest, + >, +} +/// / A hint that a specific digest is available on one or more peer workers. +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct PeerHint { + /// / The digest available on peers. + #[prost(message, optional, tag = "1")] + pub digest: ::core::option::Option< + super::super::super::super::super::build::bazel::remote::execution::v2::Digest, + >, + /// / gRPC endpoints of workers that have this blob. + #[prost(string, repeated, tag = "2")] + pub peer_endpoints: ::prost::alloc::vec::Vec<::prost::alloc::string::String>, } /// / The result of an ExecutionRequest. #[derive(Clone, PartialEq, ::prost::Message)] @@ -85,6 +201,10 @@ pub struct ExecuteComplete { /// / The operation ID that was executed. #[prost(string, tag = "1")] pub operation_id: ::prost::alloc::string::String, + /// / CPU load percentage: load_avg_1m / num_cpus * 100. + /// / 0 means unknown (old workers that don't report load). + #[prost(uint32, tag = "2")] + pub cpu_load_pct: u32, } /// / Result sent back from the server when a node connects. #[derive(Clone, PartialEq, ::prost::Message)] @@ -103,7 +223,7 @@ pub struct KillOperationRequest { /// / Communication from the scheduler to the worker. 
#[derive(Clone, PartialEq, ::prost::Message)] pub struct UpdateForWorker { - #[prost(oneof = "update_for_worker::Update", tags = "1, 2, 3, 4, 5")] + #[prost(oneof = "update_for_worker::Update", tags = "1, 2, 3, 4, 5, 7")] pub update: ::core::option::Option, } /// Nested message and enum types in `UpdateForWorker`. @@ -132,12 +252,16 @@ pub mod update_for_worker { /// / Instructs the worker to kill a specific running operation. #[prost(message, tag = "5")] KillOperationRequest(super::KillOperationRequest), + /// / Instructs the worker to touch (update access time) on blobs + /// / to prevent premature eviction. + #[prost(message, tag = "7")] + TouchBlobs(super::TouchBlobsRequest), } } /// / Communication from the worker to the scheduler. #[derive(Clone, PartialEq, ::prost::Message)] pub struct UpdateForScheduler { - #[prost(oneof = "update_for_scheduler::Update", tags = "1, 2, 3, 4, 5")] + #[prost(oneof = "update_for_scheduler::Update", tags = "1, 2, 3, 4, 5, 7, 8")] pub update: ::core::option::Option, } /// Nested message and enum types in `UpdateForScheduler`. @@ -174,6 +298,12 @@ pub mod update_for_scheduler { /// / Notify that the execution has completed, but result is uploading. #[prost(message, tag = "5")] ExecuteComplete(super::ExecuteComplete), + /// / Notifies the scheduler that new blobs are available on this worker. + #[prost(message, tag = "7")] + BlobsAvailable(super::BlobsAvailableNotification), + /// / Notifies the scheduler that blobs have been evicted from this worker. + #[prost(message, tag = "8")] + BlobsEvicted(super::BlobsEvictedNotification), } } #[derive(Clone, PartialEq, ::prost::Message)] @@ -199,6 +329,10 @@ pub struct StartExecute { /// / The ID of the worker that is executing the action. #[prost(string, tag = "6")] pub worker_id: ::prost::alloc::string::String, + /// / Hints about input blobs available on peer workers. + /// / Workers should try these peers first before falling back to server CAS. + #[prost(message, repeated, tag = "8")] + pub peer_hints: ::prost::alloc::vec::Vec, } /// / This is a special message used to save actions into the CAS that can be used /// / by programs like bb_browswer to inspect the history of a build. 
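All of the message changes above lean on proto3's compatibility rules: new fields get fresh tags, retired tags stay `reserved`, and decoders silently skip tags they do not know. A minimal demonstration with prost, using stand-in structs for the old and new `KeepAliveRequest` shapes (not the generated types themselves):

```rust
use prost::Message;

// New schema: carries the load figure at tag 2.
#[derive(Clone, PartialEq, Message)]
struct KeepAliveNew {
    #[prost(uint32, tag = "2")]
    cpu_load_pct: u32,
}

// Old schema: no fields at all, as before this change.
#[derive(Clone, PartialEq, Message)]
struct KeepAliveOld {}

fn main() {
    let encoded = KeepAliveNew { cpu_load_pct: 42 }.encode_to_vec();
    // An old reader simply skips the unknown tag...
    let _old = KeepAliveOld::decode(encoded.as_slice()).unwrap();
    // ...while a new reader sees the value, and sees 0 ("unknown")
    // when decoding bytes from an old sender.
    let new = KeepAliveNew::decode(encoded.as_slice()).unwrap();
    assert_eq!(new.cpu_load_pct, 42);
    let from_old = KeepAliveNew::decode(b"".as_slice()).unwrap();
    assert_eq!(from_old.cpu_load_pct, 0);
}
```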
@@ -328,7 +462,7 @@ pub mod worker_api_client { format!("Service was not ready: {}", e.into()), ) })?; - let codec = tonic::codec::ProstCodec::default(); + let codec = tonic_prost::ProstCodec::default(); let path = http::uri::PathAndQuery::from_static( "/com.github.trace_machina.nativelink.remote_execution.WorkerApi/ConnectWorker", ); @@ -496,7 +630,7 @@ pub mod worker_api_server { let inner = self.inner.clone(); let fut = async move { let method = ConnectWorkerSvc(inner); - let codec = tonic::codec::ProstCodec::default(); + let codec = tonic_prost::ProstCodec::default(); let mut grpc = tonic::server::Grpc::new(codec) .apply_compression_config( accept_compression_encodings, diff --git a/nativelink-proto/genproto/google.bytestream.pb.rs b/nativelink-proto/genproto/google.bytestream.pb.rs index d0229a041..fe14f6bb4 100644 --- a/nativelink-proto/genproto/google.bytestream.pb.rs +++ b/nativelink-proto/genproto/google.bytestream.pb.rs @@ -232,7 +232,7 @@ pub mod byte_stream_client { format!("Service was not ready: {}", e.into()), ) })?; - let codec = tonic::codec::ProstCodec::default(); + let codec = tonic_prost::ProstCodec::default(); let path = http::uri::PathAndQuery::from_static( "/google.bytestream.ByteStream/Read", ); @@ -275,7 +275,7 @@ pub mod byte_stream_client { format!("Service was not ready: {}", e.into()), ) })?; - let codec = tonic::codec::ProstCodec::default(); + let codec = tonic_prost::ProstCodec::default(); let path = http::uri::PathAndQuery::from_static( "/google.bytestream.ByteStream/Write", ); @@ -313,7 +313,7 @@ pub mod byte_stream_client { format!("Service was not ready: {}", e.into()), ) })?; - let codec = tonic::codec::ProstCodec::default(); + let codec = tonic_prost::ProstCodec::default(); let path = http::uri::PathAndQuery::from_static( "/google.bytestream.ByteStream/QueryWriteStatus", ); @@ -530,7 +530,7 @@ pub mod byte_stream_server { let inner = self.inner.clone(); let fut = async move { let method = ReadSvc(inner); - let codec = tonic::codec::ProstCodec::default(); + let codec = tonic_prost::ProstCodec::default(); let mut grpc = tonic::server::Grpc::new(codec) .apply_compression_config( accept_compression_encodings, @@ -577,7 +577,7 @@ pub mod byte_stream_server { let inner = self.inner.clone(); let fut = async move { let method = WriteSvc(inner); - let codec = tonic::codec::ProstCodec::default(); + let codec = tonic_prost::ProstCodec::default(); let mut grpc = tonic::server::Grpc::new(codec) .apply_compression_config( accept_compression_encodings, @@ -622,7 +622,7 @@ pub mod byte_stream_server { let inner = self.inner.clone(); let fut = async move { let method = QueryWriteStatusSvc(inner); - let codec = tonic::codec::ProstCodec::default(); + let codec = tonic_prost::ProstCodec::default(); let mut grpc = tonic::server::Grpc::new(codec) .apply_compression_config( accept_compression_encodings, diff --git a/nativelink-proto/genproto/google.devtools.build.v1.pb.rs b/nativelink-proto/genproto/google.devtools.build.v1.pb.rs index 94d70d8f6..a0f46a41a 100644 --- a/nativelink-proto/genproto/google.devtools.build.v1.pb.rs +++ b/nativelink-proto/genproto/google.devtools.build.v1.pb.rs @@ -633,7 +633,7 @@ pub mod publish_build_event_client { format!("Service was not ready: {}", e.into()), ) })?; - let codec = tonic::codec::ProstCodec::default(); + let codec = tonic_prost::ProstCodec::default(); let path = http::uri::PathAndQuery::from_static( "/google.devtools.build.v1.PublishBuildEvent/PublishLifecycleEvent", ); @@ -668,7 +668,7 @@ pub mod publish_build_event_client { 
format!("Service was not ready: {}", e.into()), ) })?; - let codec = tonic::codec::ProstCodec::default(); + let codec = tonic_prost::ProstCodec::default(); let path = http::uri::PathAndQuery::from_static( "/google.devtools.build.v1.PublishBuildEvent/PublishBuildToolEventStream", ); @@ -857,7 +857,7 @@ pub mod publish_build_event_server { let inner = self.inner.clone(); let fut = async move { let method = PublishLifecycleEventSvc(inner); - let codec = tonic::codec::ProstCodec::default(); + let codec = tonic_prost::ProstCodec::default(); let mut grpc = tonic::server::Grpc::new(codec) .apply_compression_config( accept_compression_encodings, @@ -912,7 +912,7 @@ pub mod publish_build_event_server { let inner = self.inner.clone(); let fut = async move { let method = PublishBuildToolEventStreamSvc(inner); - let codec = tonic::codec::ProstCodec::default(); + let codec = tonic_prost::ProstCodec::default(); let mut grpc = tonic::server::Grpc::new(codec) .apply_compression_config( accept_compression_encodings, diff --git a/nativelink-proto/genproto/google.longrunning.pb.rs b/nativelink-proto/genproto/google.longrunning.pb.rs index fec578107..aafbbb9b2 100644 --- a/nativelink-proto/genproto/google.longrunning.pb.rs +++ b/nativelink-proto/genproto/google.longrunning.pb.rs @@ -267,7 +267,7 @@ pub mod operations_client { format!("Service was not ready: {}", e.into()), ) })?; - let codec = tonic::codec::ProstCodec::default(); + let codec = tonic_prost::ProstCodec::default(); let path = http::uri::PathAndQuery::from_static( "/google.longrunning.Operations/ListOperations", ); @@ -293,7 +293,7 @@ pub mod operations_client { format!("Service was not ready: {}", e.into()), ) })?; - let codec = tonic::codec::ProstCodec::default(); + let codec = tonic_prost::ProstCodec::default(); let path = http::uri::PathAndQuery::from_static( "/google.longrunning.Operations/GetOperation", ); @@ -320,7 +320,7 @@ pub mod operations_client { format!("Service was not ready: {}", e.into()), ) })?; - let codec = tonic::codec::ProstCodec::default(); + let codec = tonic_prost::ProstCodec::default(); let path = http::uri::PathAndQuery::from_static( "/google.longrunning.Operations/DeleteOperation", ); @@ -353,7 +353,7 @@ pub mod operations_client { format!("Service was not ready: {}", e.into()), ) })?; - let codec = tonic::codec::ProstCodec::default(); + let codec = tonic_prost::ProstCodec::default(); let path = http::uri::PathAndQuery::from_static( "/google.longrunning.Operations/CancelOperation", ); @@ -385,7 +385,7 @@ pub mod operations_client { format!("Service was not ready: {}", e.into()), ) })?; - let codec = tonic::codec::ProstCodec::default(); + let codec = tonic_prost::ProstCodec::default(); let path = http::uri::PathAndQuery::from_static( "/google.longrunning.Operations/WaitOperation", ); @@ -586,7 +586,7 @@ pub mod operations_server { let inner = self.inner.clone(); let fut = async move { let method = ListOperationsSvc(inner); - let codec = tonic::codec::ProstCodec::default(); + let codec = tonic_prost::ProstCodec::default(); let mut grpc = tonic::server::Grpc::new(codec) .apply_compression_config( accept_compression_encodings, @@ -631,7 +631,7 @@ pub mod operations_server { let inner = self.inner.clone(); let fut = async move { let method = GetOperationSvc(inner); - let codec = tonic::codec::ProstCodec::default(); + let codec = tonic_prost::ProstCodec::default(); let mut grpc = tonic::server::Grpc::new(codec) .apply_compression_config( accept_compression_encodings, @@ -676,7 +676,7 @@ pub mod operations_server { let inner 
= self.inner.clone(); let fut = async move { let method = DeleteOperationSvc(inner); - let codec = tonic::codec::ProstCodec::default(); + let codec = tonic_prost::ProstCodec::default(); let mut grpc = tonic::server::Grpc::new(codec) .apply_compression_config( accept_compression_encodings, @@ -721,7 +721,7 @@ pub mod operations_server { let inner = self.inner.clone(); let fut = async move { let method = CancelOperationSvc(inner); - let codec = tonic::codec::ProstCodec::default(); + let codec = tonic_prost::ProstCodec::default(); let mut grpc = tonic::server::Grpc::new(codec) .apply_compression_config( accept_compression_encodings, @@ -766,7 +766,7 @@ pub mod operations_server { let inner = self.inner.clone(); let fut = async move { let method = WaitOperationSvc(inner); - let codec = tonic::codec::ProstCodec::default(); + let codec = tonic_prost::ProstCodec::default(); let mut grpc = tonic::server::Grpc::new(codec) .apply_compression_config( accept_compression_encodings, diff --git a/nativelink-scheduler/Cargo.toml b/nativelink-scheduler/Cargo.toml index 5f98f9fd8..920e28972 100644 --- a/nativelink-scheduler/Cargo.toml +++ b/nativelink-scheduler/Cargo.toml @@ -20,13 +20,13 @@ bytes = { version = "1.10.1", default-features = false } futures = { version = "0.3.31", default-features = false } lru = { version = "0.16.0", default-features = false } mock_instant = { version = "0.5.3", default-features = false } -opentelemetry = { version = "0.29.1", default-features = false } -opentelemetry-semantic-conventions = { version = "0.29.0", default-features = false, features = [ +opentelemetry = { version = "0.31.0", default-features = false } +opentelemetry-semantic-conventions = { version = "0.31.0", default-features = false, features = [ "default", "semconv_experimental", ] } parking_lot = { version = "0.12.3", default-features = false } -prost = { version = "0.13.5", default-features = false } +prost = { version = "0.14.3", default-features = false } redis = { version = "1.0.0", default-features = false } scopeguard = { version = "1.2.0", default-features = false } serde = { version = "1.0.219", features = ["rc"], default-features = false } @@ -41,8 +41,8 @@ tokio = { version = "1.44.1", features = [ tokio-stream = { version = "0.1.17", features = [ "fs", ], default-features = false } -tonic = { version = "0.13.0", features = [ - "tls-ring", +tonic = { version = "0.14.5", features = [ + "tls-aws-lc", "transport", ], default-features = false } tracing = { version = "0.1.41", default-features = false } diff --git a/nativelink-scheduler/src/api_worker_scheduler.rs b/nativelink-scheduler/src/api_worker_scheduler.rs index edfe56c67..435a8c404 100644 --- a/nativelink-scheduler/src/api_worker_scheduler.rs +++ b/nativelink-scheduler/src/api_worker_scheduler.rs @@ -12,13 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+use core::num::NonZeroUsize;
 use core::ops::{Deref, DerefMut};
 use core::sync::atomic::{AtomicU64, Ordering};
 use core::time::Duration;
+use std::collections::{HashMap, HashSet};
 use std::sync::Arc;
-use std::time::{Instant, UNIX_EPOCH};
+use std::time::{Instant, SystemTime, UNIX_EPOCH};

-use async_lock::Mutex;
+use async_lock::RwLock;
 use lru::LruCache;
 use nativelink_config::schedulers::WorkerAllocationStrategy;
 use nativelink_error::{Code, Error, ResultExt, error_if, make_err, make_input_err};
@@ -26,13 +28,22 @@ use nativelink_metric::{
     MetricFieldData, MetricKind, MetricPublishKnownKindData, MetricsComponent,
     RootMetricsComponent, group,
 };
+use nativelink_proto::build::bazel::remote::execution::v2::Directory;
+use nativelink_proto::com::github::trace_machina::nativelink::remote_execution::{
+    PeerHint, StartExecute, UpdateForWorker, update_for_worker,
+};
+use nativelink_util::blob_locality_map::SharedBlobLocalityMap;
 use nativelink_util::action_messages::{OperationId, WorkerId};
+use nativelink_util::common::DigestInfo;
 use nativelink_util::operation_state_manager::{UpdateOperationType, WorkerStateManager};
 use nativelink_util::platform_properties::PlatformProperties;
 use nativelink_util::shutdown_guard::ShutdownGuard;
+use nativelink_util::store_trait::{Store, StoreKey, StoreLike};
+use prost::Message;
 use tokio::sync::Notify;
+use tokio::sync::mpsc::UnboundedSender;
 use tonic::async_trait;
-use tracing::{error, info, trace, warn};
+use tracing::{debug, error, info, trace, warn};

 /// Metrics for tracking scheduler performance.
 #[derive(Debug, Default)]
@@ -60,7 +71,10 @@ pub struct SchedulerMetrics {
 }

 use crate::platform_property_manager::PlatformPropertyManager;
-use crate::worker::{ActionInfoWithProps, Worker, WorkerTimestamp, WorkerUpdate};
+use crate::worker::{
+    ActionInfoWithProps, PendingActionInfoData, Worker, WorkerTimestamp, WorkerUpdate,
+    reduce_platform_properties,
+};
 use crate::worker_capability_index::WorkerCapabilityIndex;
 use crate::worker_registry::SharedWorkerRegistry;
 use crate::worker_scheduler::WorkerScheduler;
@@ -123,6 +137,10 @@ struct ApiWorkerSchedulerImpl {
     /// Used to accelerate `find_worker_for_action` by filtering candidates
     /// based on properties before doing linear scan.
     capability_index: WorkerCapabilityIndex,
+
+    /// Reverse map: CAS endpoint → WorkerId.
+    /// Updated when workers are added/removed.
+    endpoint_to_worker: HashMap<String, WorkerId>,
 }

 impl core::fmt::Debug for ApiWorkerSchedulerImpl {
@@ -136,6 +154,7 @@ impl core::fmt::Debug for ApiWorkerSchedulerImpl {
             &self.capability_index.worker_count(),
         )
         .field("worker_registry", &self.worker_registry)
+        .field("endpoint_to_worker_len", &self.endpoint_to_worker.len())
         .finish_non_exhaustive()
     }
 }
@@ -168,6 +187,14 @@ impl ApiWorkerSchedulerImpl {
         );
         worker.last_update_timestamp = timestamp;

+        // If the worker was in quarantine, clear it now that it has checked in.
+        if worker.quarantined_at.take().is_some() {
+            info!(
+                ?worker_id,
+                "Worker exited quarantine after sending keepalive"
+            );
+        }
+
         trace!(
             ?worker_id,
             running_operations = worker.running_action_infos.len(),
@@ -182,6 +209,13 @@ impl ApiWorkerSchedulerImpl {
     fn add_worker(&mut self, worker: Worker) -> Result<(), Error> {
         let worker_id = worker.id.clone();
         let platform_properties = worker.platform_properties.clone();
+
+        // Update endpoint → worker reverse map for locality scoring.
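+        // (Illustrative assumption: each worker advertises a unique,
+        // non-empty `cas_endpoint`. If two workers ever shared an endpoint,
+        // this insert would silently overwrite the earlier mapping and
+        // locality scores would credit only the later worker.)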
+ if !worker.cas_endpoint.is_empty() { + self.endpoint_to_worker + .insert(worker.cas_endpoint.clone(), worker_id.clone()); + } + self.workers.put(worker_id.clone(), worker); // Add to capability index for fast matching @@ -214,6 +248,14 @@ impl ApiWorkerSchedulerImpl { self.capability_index.remove_worker(worker_id); let result = self.workers.pop(worker_id); + + // Remove from endpoint → worker reverse map. + if let Some(ref worker) = result { + if !worker.cas_endpoint.is_empty() { + self.endpoint_to_worker.remove(&worker.cas_endpoint); + } + } + self.worker_change_notify.notify_one(); result } @@ -234,7 +276,7 @@ impl ApiWorkerSchedulerImpl { } fn inner_find_worker_for_action( - &self, + &mut self, platform_properties: &PlatformProperties, full_worker_logging: bool, ) -> Option { @@ -247,18 +289,45 @@ impl ApiWorkerSchedulerImpl { if candidates.is_empty() { if full_worker_logging { - info!("No workers in capability index match required properties"); + debug!("No workers in capability index match required properties"); } return None; } + // Clear is_paused for candidate workers that now have capacity, + // but only if they were paused due to a capacity check (not explicit + // worker backpressure like ResourceExhausted). Workers that reported + // ResourceExhausted should remain paused until they complete an action. + for wid in &candidates { + if let Some(worker) = self.workers.0.peek_mut(wid) { + if worker.is_paused && !worker.is_draining && !worker.paused_due_to_backpressure { + let has_capacity = worker.max_inflight_tasks == 0 + || u64::try_from(worker.running_action_infos.len()).unwrap_or(u64::MAX) + < worker.max_inflight_tasks; + if has_capacity { + worker.is_paused = false; + } + } + } + } + // Check function for availability AND dynamic Minimum property verification. // The index only does presence checks for Minimum properties since their // values change dynamically as jobs are assigned to workers. let worker_matches = |(worker_id, w): &(&WorkerId, &Worker)| -> bool { + // Quarantined workers must not receive new actions. + if w.quarantined_at.is_some() { + if full_worker_logging { + debug!( + "Worker {worker_id} is quarantined, skipping for new work" + ); + } + return false; + } + if !w.can_accept_work() { if full_worker_logging { - info!( + debug!( "Worker {worker_id} cannot accept work: is_paused={}, is_draining={}, inflight={}/{}", w.is_paused, w.is_draining, @@ -279,28 +348,348 @@ impl ApiWorkerSchedulerImpl { // Now check constraints on filtered candidates. // Iterate in LRU order based on allocation strategy. + // Note: iter() does not promote entries in the LRU. We find the worker + // first via iter(), then promote it via get_mut() below to avoid + // multiple consecutive actions all matching the same "least recently used" worker. let workers_iter = self.workers.iter(); - let worker_id = match self.allocation_strategy { - // Use rfind to get the least recently used that satisfies the properties. + // Collect viable candidates with their load info for load-aware selection. + let viable: Vec<_> = match self.allocation_strategy { WorkerAllocationStrategy::LeastRecentlyUsed => workers_iter .rev() .filter(|(worker_id, _)| candidates.contains(worker_id)) - .find(&worker_matches) - .map(|(_, w)| w.id.clone()), - - // Use find to get the most recently used that satisfies the properties. 
+                .filter(|pair| worker_matches(pair))
+                .map(|(_, w)| (w.id.clone(), w.cpu_load_pct))
+                .collect(),
             WorkerAllocationStrategy::MostRecentlyUsed => workers_iter
                 .filter(|(worker_id, _)| candidates.contains(worker_id))
-                .find(&worker_matches)
-                .map(|(_, w)| w.id.clone()),
+                .filter(|pair| worker_matches(pair))
+                .map(|(_, w)| (w.id.clone(), w.cpu_load_pct))
+                .collect(),
+        };
+
+        // Pick the lightest-loaded worker among viable candidates.
+        // Workers with cpu_load_pct == 0 (unknown) are sorted last among
+        // workers that have reported load. Falls back to LRU/MRU order
+        // (first in the vec) when no workers have reported load.
+        let worker_id = if viable.iter().any(|(_, load)| *load > 0) {
+            // At least one worker has reported load — pick lightest.
+            viable
+                .iter()
+                .min_by_key(|(_, load)| if *load == 0 { u32::MAX } else { *load })
+                .map(|(id, _)| id.clone())
+        } else {
+            // No load data — use first viable (LRU/MRU order).
+            viable.first().map(|(id, _)| id.clone())
         };
+
+        // Log load-aware selection decision.
+        if let Some(ref wid) = worker_id {
+            let viable_loads: Vec<_> = viable
+                .iter()
+                .map(|(id, load)| {
+                    let short_id = id.0.chars().take(12).collect::<String>();
+                    (short_id, *load)
+                })
+                .collect();
+            let winner_load = viable
+                .iter()
+                .find(|(id, _)| id == wid)
+                .map(|(_, l)| *l)
+                .unwrap_or(0);
+            debug!(
+                candidates = viable.len(),
+                worker_id = %wid,
+                winner_load_pct = winner_load,
+                ?viable_loads,
+                "Load-aware worker selection"
+            );
+        }
+
+        // Promote the found worker in the LRU so the next find_worker_for_action
+        // call won't pick the same worker again (prevents work bunching).
+        if let Some(ref wid) = worker_id {
+            self.workers.get_mut(wid);
+        }
+
         if full_worker_logging && worker_id.is_none() {
-            warn!("No workers matched!");
+            debug!("No workers matched!");
         }
         worker_id
     }

+    /// Atomically finds a suitable worker AND reserves it for the given
+    /// operation by mutating the worker's state (reducing platform properties,
+    /// inserting into `running_action_infos`). Returns the worker ID, the
+    /// channel sender, and pre-built protobuf message so the caller can
+    /// send the notification after releasing the lock.
+    ///
+    /// Uses locality-aware scheduling:
+    /// - Primary: score candidates by total bytes of cached input blobs
+    ///   using pre-computed endpoint scores (computed outside the lock).
+    /// - Fallback: existing LRU/MRU strategy.
+    ///
+    /// This prevents two concurrent match operations from selecting the
+    /// same worker, which is the key enabler for `MATCH_CONCURRENCY > 1`.
+    ///
+    /// `endpoint_scores` and `peer_hints` are pre-computed outside the write
+    /// lock to avoid holding it during O(files) iterations over the locality
+    /// map.
+    fn inner_find_and_reserve_worker(
+        &mut self,
+        platform_properties: &PlatformProperties,
+        operation_id: &OperationId,
+        action_info: &ActionInfoWithProps,
+        full_worker_logging: bool,
+        endpoint_scores: Option<&HashMap<String, (u64, SystemTime)>>,
+        peer_hints: Vec<PeerHint>,
+        resolved_tree: Option<&ResolvedTree>,
+    ) -> Option<(WorkerId, UnboundedSender<UpdateForWorker>, UpdateForWorker)> {
+        let input_root_digest = action_info.inner.input_root_digest;
+
+        // Build the set of capability-matching candidates that can accept work.
+        let candidates = self
+            .capability_index
+            .find_matching_workers(platform_properties, full_worker_logging);
+
+        if candidates.is_empty() {
+            if full_worker_logging {
+                debug!("No workers in capability index match required properties");
+            }
+            return None;
+        }
+
+        // Helper: check if a specific worker is a valid candidate.
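+        // Viability here means: capability match, still present in the
+        // worker map, not quarantined, can_accept_work(), and the dynamic
+        // platform-property check. All three scoring tiers below share this
+        // predicate, so none of them can pick a worker that the fallback
+        // LRU/MRU path would reject.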
+        let worker_is_viable = |worker_id: &WorkerId| -> bool {
+            if !candidates.contains(worker_id) {
+                return false;
+            }
+            let Some(w) = self.workers.0.peek(worker_id) else {
+                return false;
+            };
+            if w.quarantined_at.is_some() || !w.can_accept_work() {
+                return false;
+            }
+            platform_properties.is_satisfied_by(&w.platform_properties, false)
+        };
+
+        // ── Tier 1: Exact root match ──
+        // If a viable worker has the action's input_root_digest in its directory
+        // cache (either as a root or as a subtree of a previously cached tree),
+        // it can hardlink the entire input tree in milliseconds instead of
+        // reconstructing it from CAS.
+        let dir_cache_winner: Option<WorkerId> = {
+            let mut best: Option<(WorkerId, u32)> = None; // (id, cpu_load)
+            for wid in &candidates {
+                if let Some(w) = self.workers.0.peek(wid) {
+                    let has_root_match = w.cached_directory_digests.contains(&input_root_digest);
+                    let has_subtree_match = w.cached_subtree_digests.contains(&input_root_digest);
+                    if (has_root_match || has_subtree_match)
+                        && worker_is_viable(wid)
+                    {
+                        let load = w.cpu_load_pct;
+                        let dominated = best.as_ref().is_some_and(|(_, best_load)| {
+                            let effective_best = if *best_load == 0 { u32::MAX } else { *best_load };
+                            let effective_this = if load == 0 { u32::MAX } else { load };
+                            effective_this >= effective_best
+                        });
+                        if !dominated {
+                            best = Some((wid.clone(), load));
+                        }
+                    }
+                }
+            }
+            if let Some((ref wid, load)) = best {
+                debug!(
+                    ?wid,
+                    cpu_load_pct = load,
+                    %input_root_digest,
+                    "Directory cache hit -- worker has input_root_digest cached (root or subtree), giving scheduling priority"
+                );
+            }
+            best.map(|(wid, _)| wid)
+        };
+
+        // ── Tier 1.5: Partial subtree coverage scoring ──
+        // When no worker has the exact root cached, score workers by the total
+        // file bytes under their cached subtrees. A worker caching a subtree with
+        // 10GB of files scores higher than one caching a subtree with 100 bytes.
+        // We sum the subtree_bytes for each matching directory, taking only the
+        // top-level match (avoid double-counting nested matches).
+        let subtree_coverage_winner: Option<WorkerId> = if dir_cache_winner.is_some() {
+            None // exact match found, skip coverage scoring
+        } else if let Some(tree) = resolved_tree {
+            let total_bytes: u64 = tree.subtree_bytes.get(&input_root_digest).copied().unwrap_or(0);
+            if tree.dir_digests.len() <= 1 || total_bytes == 0 {
+                None // only root (or empty), no subtrees to match
+            } else {
+                let mut best: Option<(WorkerId, u64, u32)> = None; // (id, cached_bytes, cpu_load)
+                for wid in &candidates {
+                    if let Some(w) = self.workers.0.peek(wid) {
+                        if !worker_is_viable(wid) {
+                            continue;
+                        }
+                        // Sum the subtree_bytes for each of the action's directory
+                        // digests that this worker has cached.
+                        let cached_bytes: u64 = tree.dir_digests.iter()
+                            .filter(|d| w.cached_subtree_digests.contains(d))
+                            .map(|d| tree.subtree_bytes.get(d).copied().unwrap_or(0))
+                            .sum();
+                        if cached_bytes == 0 {
+                            continue;
+                        }
+                        let load = w.cpu_load_pct;
+                        let dominated = best.as_ref().is_some_and(|(_, best_bytes, best_load)| {
+                            if cached_bytes != *best_bytes {
+                                return cached_bytes < *best_bytes;
+                            }
+                            // Same coverage — prefer lower CPU load.
+ let effective_best = if *best_load == 0 { u32::MAX } else { *best_load }; + let effective_this = if load == 0 { u32::MAX } else { load }; + effective_this >= effective_best + }); + if !dominated { + best = Some((wid.clone(), cached_bytes, load)); + } + } + } + if let Some((ref wid, cached_bytes, load)) = best { + let pct = if total_bytes > 0 { cached_bytes * 100 / total_bytes } else { 0 }; + debug!( + ?wid, + cached_bytes, + total_bytes, + cpu_load_pct = load, + coverage_pct = pct, + %input_root_digest, + "Subtree coverage winner -- worker has {}% of input tree bytes cached in subtrees", + pct, + ); + } + best.map(|(wid, _, _)| wid) + } + } else { + None + }; + + // ── Locality scoring ── + // Convert pre-computed endpoint scores to worker scores, filtering + // to the candidate set. This is O(endpoints) not O(files). + let locality_winner = if let Some(ep_scores) = endpoint_scores { + let scores = endpoint_scores_to_worker_scores( + ep_scores, + &self.endpoint_to_worker, + &candidates, + ); + if !scores.is_empty() { + // Sort workers by score descending, then by timestamp + // descending as a tiebreaker. Workers within 10% of the + // top score are considered tied and the most recently + // refreshed one wins. + let mut sorted: Vec<_> = scores.into_iter().collect(); + // Look up cpu_load_pct for tiebreaking within 10% score range. + let load_for_worker = |wid: &WorkerId| -> u32 { + self.workers.0.peek(wid) + .map(|w| w.cpu_load_pct) + .unwrap_or(0) + }; + sorted.sort_by(|a, b| { + let (score_a, ts_a) = a.1; + let (score_b, ts_b) = b.1; + let max_score = score_a.max(score_b); + // Within 10% of each other? Use CPU load, then timestamp. + let threshold = max_score / 10; // 10% of the larger score + if score_a.abs_diff(score_b) <= threshold { + // Scores are similar — prefer lower CPU load. + let load_a = load_for_worker(&a.0); + let load_b = load_for_worker(&b.0); + if load_a != load_b && (load_a > 0 || load_b > 0) { + // Sort unknown (0) after known loads. + let effective_a = if load_a == 0 { u32::MAX } else { load_a }; + let effective_b = if load_b == 0 { u32::MAX } else { load_b }; + effective_a.cmp(&effective_b) + } else { + // Same load or both unknown — prefer more recent timestamp. + ts_b.cmp(&ts_a) + } + } else { + // Scores differ significantly, prefer higher score. + score_b.cmp(&score_a) + } + }); + + let best = sorted.first().map(|(_, (s, _))| *s).unwrap_or(0); + if best > 0 { + sorted.into_iter() + .find(|(wid, (score, _))| *score > 0 && worker_is_viable(wid)) + .map(|(wid, (score, _))| { + debug!( + ?wid, + score, + %input_root_digest, + "Locality scoring -- worker has {} cached input bytes", + score + ); + wid + }) + } else { + None + } + } else { + None + } + } else { + None + }; + + let worker_id = if let Some(wid) = dir_cache_winner { + // Exact root match trumps all other scoring. + self.workers.get_mut(&wid); + wid + } else if let Some(wid) = subtree_coverage_winner { + // Partial subtree coverage beats blob-level locality. + self.workers.get_mut(&wid); + wid + } else if let Some(wid) = locality_winner { + // Blob-level locality scoring. + self.workers.get_mut(&wid); + wid + } else { + // ── Fallback: existing LRU/MRU strategy ── + let wid = self.inner_find_worker_for_action(platform_properties, full_worker_logging)?; + wid + }; + + // Atomically reserve the worker by mutating its state under the same lock. 
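+        // (Reserving under the same write lock is what makes
+        // MATCH_CONCURRENCY > 1 safe: a concurrent matcher observes the
+        // reduced platform properties and the new running_action_infos
+        // entry, so it cannot double-book this worker.)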
+ let (tx, msg) = self.prepare_worker_run_action( + &worker_id, + operation_id, + action_info, + peer_hints, + )?; + + Some((worker_id, tx, msg)) + } + + /// Undoes a reservation made by `inner_find_and_reserve_worker`. + /// This removes the operation from the worker's `running_action_infos` + /// and restores the reduced platform properties. + fn inner_unreserve_worker( + &mut self, + worker_id: &WorkerId, + operation_id: &OperationId, + ) { + if let Some(worker) = self.workers.get_mut(worker_id) { + if let Some(pending) = worker.running_action_infos.remove(operation_id) { + if !worker.restored_platform_properties.remove(operation_id) { + worker.restore_platform_properties( + &pending.action_info.platform_properties, + ); + } + } + } + } + async fn update_action( &mut self, worker_id: &WorkerId, @@ -367,6 +756,7 @@ impl ApiWorkerSchedulerImpl { if (due_to_backpressure || !worker.can_accept_work()) && worker.has_actions() { worker.is_paused = true; + worker.paused_due_to_backpressure = due_to_backpressure; } complete_action_res }; @@ -376,61 +766,63 @@ impl ApiWorkerSchedulerImpl { complete_action_res } - /// Notifies the specified worker to run the given action and handles errors by evicting - /// the worker if the notification fails. - async fn worker_notify_run_action( + /// Prepares a worker to run an action by mutating its state (reducing platform + /// properties, recording the running action), then returns the cloned `tx` sender + /// and pre-built message so the caller can send the notification *after* releasing + /// the write lock. + /// + /// `peer_hints` are pre-computed outside the write lock from the resolved + /// input tree. When no resolved tree is available the hints will be empty + /// -- the old fallback that generated a single hint for `input_root_digest` + /// never worked because workers register individual file digests, not + /// directory digests. + /// + /// Returns `None` if the worker was not found. + fn prepare_worker_run_action( &mut self, - worker_id: WorkerId, - operation_id: OperationId, - action_info: ActionInfoWithProps, - ) -> Result<(), Error> { - if let Some(worker) = self.workers.get_mut(&worker_id) { - let notify_worker_result = worker - .notify_update(WorkerUpdate::RunAction((operation_id, action_info.clone()))) - .await; - - if let Err(notify_worker_result) = notify_worker_result { - warn!( - ?worker_id, - ?action_info, - ?notify_worker_result, - "Worker command failed, removing worker", - ); - - // A slightly nasty way of figuring out that the worker disconnected - // from send_msg_to_worker without introducing complexity to the - // code path from here to there. - let is_disconnect = notify_worker_result.code == Code::Internal - && notify_worker_result.messages.len() == 1 - && notify_worker_result.messages[0] == "Worker Disconnected"; - - let err = make_err!( - Code::Internal, - "Worker command failed, removing worker {worker_id} -- {notify_worker_result:?}", - ); - - return Result::<(), _>::Err(err.clone()).merge( - self.immediate_evict_worker(&worker_id, err, is_disconnect) - .await, - ); - } - Ok(()) - } else { - warn!( + worker_id: &WorkerId, + operation_id: &OperationId, + action_info: &ActionInfoWithProps, + peer_hints: Vec, + ) -> Option<(UnboundedSender, UpdateForWorker)> { + let worker = self.workers.get_mut(worker_id)?; + // Clone the tx so we can send outside the lock. 
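+        // (The actual send happens in phase 2, after the caller drops the
+        // write lock, so a slow or disconnected worker cannot stall other
+        // scheduler operations while the lock is held.)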
+        let tx = worker.tx.clone();
+
+        if !peer_hints.is_empty() {
+            debug!(
                 ?worker_id,
-                %operation_id,
-                ?action_info,
-                "Worker not found in worker map in worker_notify_run_action"
+                hints = peer_hints.len(),
+                "Generated peer hints for StartExecute"
             );
-            // Ensure the operation is put back to queued state.
-            self.worker_state_manager
-                .update_operation(
-                    &operation_id,
-                    &worker_id,
-                    UpdateOperationType::UpdateWithDisconnect,
-                )
-                .await
         }
+
+        // Build the protobuf message while we still have access to worker state.
+        let start_execute = StartExecute {
+            execute_request: Some(action_info.inner.as_ref().into()),
+            operation_id: operation_id.to_string(),
+            queued_timestamp: Some(action_info.inner.insert_timestamp.into()),
+            platform: Some((&action_info.platform_properties).into()),
+            worker_id: worker.id.clone().into(),
+            peer_hints,
+        };
+        let msg = UpdateForWorker {
+            update: Some(update_for_worker::Update::StartAction(start_execute)),
+        };
+
+        // Perform the state mutation that run_action would do:
+        // reduce platform properties and record the running action.
+        reduce_platform_properties(
+            &mut worker.platform_properties,
+            &action_info.platform_properties,
+        );
+        worker.running_action_infos.insert(
+            operation_id.clone(),
+            PendingActionInfoData {
+                action_info: action_info.clone(),
+            },
+        );
+        Some((tx, msg))
     }

     /// Evicts the worker from the pool and puts items back into the queue if anything was being executed on it.
@@ -467,7 +859,7 @@
 #[derive(Debug, MetricsComponent)]
 pub struct ApiWorkerScheduler {
     #[metric]
-    inner: Mutex<ApiWorkerSchedulerImpl>,
+    inner: RwLock<ApiWorkerSchedulerImpl>,

     #[metric(group = "platform_property_manager")]
     platform_property_manager: Arc<PlatformPropertyManager>,
@@ -480,8 +872,23 @@ pub struct ApiWorkerScheduler {

     /// Performance metrics for observability.
     metrics: Arc<SchedulerMetrics>,
+
+    /// Blob locality map for peer-to-peer blob sharing.
+    /// Used to generate peer hints in StartExecute messages.
+    locality_map: Option<SharedBlobLocalityMap>,
+
+    /// CAS store for resolving input trees (reading Directory protos).
+    /// When set, enables tier-2 locality scoring.
+    cas_store: Option<Store>,
+
+    /// Cached resolved input trees: input_root_digest → ResolvedTree.
+    /// Held under a tokio::Mutex briefly for get/put, not during I/O.
+    tree_cache: Arc<tokio::sync::Mutex<LruCache<DigestInfo, Arc<ResolvedTree>>>>,
 }

+/// Capacity for the resolved input tree LRU cache.
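+/// (1024 entries is a bound on memory, not a tuned value: retries and
+/// actions sharing an input root hit the cache and skip the CAS reads
+/// entirely, while cold roots simply evict the least recently used entry.)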
+const TREE_CACHE_CAPACITY: usize = 1024; + impl ApiWorkerScheduler { pub fn new( worker_state_manager: Arc, @@ -490,9 +897,31 @@ impl ApiWorkerScheduler { worker_change_notify: Arc, worker_timeout_s: u64, worker_registry: SharedWorkerRegistry, + ) -> Arc { + Self::new_with_locality_map( + worker_state_manager, + platform_property_manager, + allocation_strategy, + worker_change_notify, + worker_timeout_s, + worker_registry, + None, + None, + ) + } + + pub fn new_with_locality_map( + worker_state_manager: Arc, + platform_property_manager: Arc, + allocation_strategy: WorkerAllocationStrategy, + worker_change_notify: Arc, + worker_timeout_s: u64, + worker_registry: SharedWorkerRegistry, + locality_map: Option, + cas_store: Option, ) -> Arc { Arc::new(Self { - inner: Mutex::new(ApiWorkerSchedulerImpl { + inner: RwLock::new(ApiWorkerSchedulerImpl { workers: Workers(LruCache::unbounded()), worker_state_manager: worker_state_manager.clone(), allocation_strategy, @@ -500,11 +929,17 @@ impl ApiWorkerScheduler { worker_registry: worker_registry.clone(), shutting_down: false, capability_index: WorkerCapabilityIndex::new(), + endpoint_to_worker: HashMap::new(), }), platform_property_manager, worker_timeout_s, worker_registry, metrics: Arc::new(SchedulerMetrics::default()), + locality_map, + cas_store, + tree_cache: Arc::new(tokio::sync::Mutex::new(LruCache::new( + NonZeroUsize::new(TREE_CACHE_CAPACITY).unwrap(), + ))), }) } @@ -522,10 +957,94 @@ impl ApiWorkerScheduler { self.metrics .actions_dispatched .fetch_add(1, Ordering::Relaxed); - let mut inner = self.inner.lock().await; - inner - .worker_notify_run_action(worker_id, operation_id, action_info) - .await + + // Phase 1: Acquire write lock, mutate worker state, extract tx + message, + // then drop the lock BEFORE sending on the channel. + let prepare_result = { + let mut inner = self.inner.write().await; + let result = + inner.prepare_worker_run_action(&worker_id, &operation_id, &action_info, Vec::new()); + if result.is_none() { + // Worker not found - handle under the lock since we need worker_state_manager. + warn!( + ?worker_id, + %operation_id, + ?action_info, + "Worker not found in worker map in worker_notify_run_action" + ); + return inner + .worker_state_manager + .update_operation( + &operation_id, + &worker_id, + UpdateOperationType::UpdateWithDisconnect, + ) + .await; + } + result + // inner (write lock) is dropped here + }; + + // Phase 2: Send notification outside the lock to avoid blocking other + // scheduler operations if the channel has backpressure. + if let Some((tx, msg)) = prepare_result { + if let Err(_send_err) = tx.send(msg) { + // Worker disconnected. Re-acquire lock to evict. + warn!( + ?worker_id, + ?action_info, + "Worker command failed (disconnected), removing worker", + ); + let err = make_err!( + Code::Internal, + "Worker command failed, removing worker {worker_id} -- Worker Disconnected", + ); + let mut inner = self.inner.write().await; + return Result::<(), _>::Err(err.clone()).merge( + inner + .immediate_evict_worker(&worker_id, err, true) + .await, + ); + } + } + + Ok(()) + } + + /// Sends the start-execution notification for a worker that was already + /// reserved by `find_and_reserve_worker`. The worker's state has already + /// been mutated (platform properties reduced, action recorded in + /// `running_action_infos`), so this method only sends the pre-built + /// message over the channel and handles disconnection errors. 
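+    /// (On send failure the worker is evicted just like the non-reserved
+    /// path; the caller should not need to unreserve, since eviction
+    /// requeues everything left in `running_action_infos`.)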
+ pub async fn send_reserved_worker_notification( + &self, + worker_id: &WorkerId, + tx: UnboundedSender, + msg: UpdateForWorker, + ) -> Result<(), Error> { + self.metrics + .actions_dispatched + .fetch_add(1, Ordering::Relaxed); + + if let Err(_send_err) = tx.send(msg) { + // Worker disconnected. Re-acquire lock to evict. + warn!( + ?worker_id, + "Worker command failed (disconnected) after reservation, removing worker", + ); + let err = make_err!( + Code::Internal, + "Worker command failed, removing worker {worker_id} -- Worker Disconnected", + ); + let mut inner = self.inner.write().await; + return Result::<(), _>::Err(err.clone()).merge( + inner + .immediate_evict_worker(worker_id, err, true) + .await, + ); + } + + Ok(()) } /// Returns the scheduler metrics for observability. @@ -548,7 +1067,7 @@ impl ApiWorkerScheduler { .find_worker_calls .fetch_add(1, Ordering::Relaxed); - let inner = self.inner.lock().await; + let mut inner = self.inner.write().await; let worker_count = inner.workers.len() as u64; let result = inner.inner_find_worker_for_action(platform_properties, full_worker_logging); @@ -574,10 +1093,111 @@ impl ApiWorkerScheduler { result } + /// Atomically finds a suitable worker AND reserves it for the given + /// operation. This combines the find and reservation into a single lock + /// acquisition, preventing two concurrent match operations from selecting + /// the same worker. + /// + /// Returns `(worker_id, tx, msg)` where `tx` and `msg` can be used to + /// send the start-execution notification to the worker outside the lock. + /// Returns `None` if no suitable worker was found. + /// + /// If the caller later decides not to use this reservation (e.g., because + /// `assign_operation` fails), it MUST call `unreserve_worker` to undo + /// the reservation. + pub async fn find_and_reserve_worker( + &self, + platform_properties: &PlatformProperties, + operation_id: &OperationId, + action_info: &ActionInfoWithProps, + full_worker_logging: bool, + ) -> Option<(WorkerId, UnboundedSender, UpdateForWorker)> { + let start = Instant::now(); + self.metrics + .find_worker_calls + .fetch_add(1, Ordering::Relaxed); + + // ── Phase 1: async tree resolution (BEFORE write lock) ── + let resolved_tree = self + .resolve_input_tree(action_info.inner.input_root_digest) + .await; + + // ── Phase 2: pre-compute locality scores and peer hints (BEFORE write lock) ── + // These are O(files × endpoints_per_blob) operations that previously + // ran inside the write lock, blocking all scheduler operations for + // 2-5ms on large actions (50K+ inputs). + let (endpoint_scores, peer_hints) = match (&resolved_tree, &self.locality_map) { + (Some(tree), Some(loc_map)) => { + let (scores, hints) = score_and_generate_hints(&tree.file_digests, loc_map); + (Some(scores), hints) + } + _ => (None, Vec::new()), + }; + + // ── Phase 3: acquire write lock, do selection + reservation ── + // Inside the lock we only do O(workers) work: candidate filtering, + // endpoint→WorkerId mapping, and state mutation. 
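+        // (For a 50K-input action the O(files) scoring above now runs with
+        // no lock held; the locked region below stays O(workers), so other
+        // scheduler operations are no longer blocked for milliseconds.)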
+ let mut inner = self.inner.write().await; + let worker_count = inner.workers.len() as u64; + let result = inner.inner_find_and_reserve_worker( + platform_properties, + operation_id, + action_info, + full_worker_logging, + endpoint_scores.as_ref(), + peer_hints, + resolved_tree.as_deref(), + ); + + // Track workers iterated (worst case is all workers) + self.metrics + .workers_iterated + .fetch_add(worker_count, Ordering::Relaxed); + + if result.is_some() { + self.metrics + .find_worker_hits + .fetch_add(1, Ordering::Relaxed); + } else { + self.metrics + .find_worker_misses + .fetch_add(1, Ordering::Relaxed); + } + + #[allow(clippy::cast_possible_truncation)] + self.metrics + .find_worker_time_ns + .fetch_add(start.elapsed().as_nanos() as u64, Ordering::Relaxed); + result + } + + /// Undoes a reservation made by `find_and_reserve_worker`. This must + /// be called if the match is abandoned after reservation (e.g., if + /// `assign_operation` returns an error). + pub async fn unreserve_worker( + &self, + worker_id: &WorkerId, + operation_id: &OperationId, + ) { + let mut inner = self.inner.write().await; + inner.inner_unreserve_worker(worker_id, operation_id); + } + + /// Returns true if any registered worker could match the given platform + /// properties (static check only — does not consider dynamic resource + /// availability like current cpu_count). + pub async fn has_matching_workers(&self, platform_properties: &PlatformProperties) -> bool { + let inner = self.inner.read().await; + !inner + .capability_index + .find_matching_workers(platform_properties, false) + .is_empty() + } + /// Checks to see if the worker exists in the worker pool. Should only be used in unit tests. #[must_use] pub async fn contains_worker_for_test(&self, worker_id: &WorkerId) -> bool { - let inner = self.inner.lock().await; + let inner = self.inner.read().await; inner.workers.contains(worker_id) } @@ -586,12 +1206,296 @@ impl ApiWorkerScheduler { &self, worker_id: &WorkerId, ) -> Result<(), Error> { - let mut inner = self.inner.lock().await; + let mut inner = self.inner.write().await; let worker = inner.workers.get_mut(worker_id).ok_or_else(|| { make_input_err!("WorkerId '{}' does not exist in workers map", worker_id) })?; worker.keep_alive() } + + /// Resolves the full input tree for the given `input_root_digest` by + /// reading Directory protos from the CAS store and collecting all file + /// digests and sizes. Results are cached in `tree_cache`. + /// + /// Returns `None` if no CAS store is configured or on any error (errors + /// are logged but do not fail scheduling — we just skip locality scoring). + /// + /// Runs *outside* the scheduler write lock, so multiple actions can + /// resolve concurrently. The `tokio::Mutex` on `tree_cache` is held + /// only briefly for get/put, not during store I/O. + async fn resolve_input_tree( + &self, + input_root_digest: DigestInfo, + ) -> Option> { + let cas_store = self.cas_store.as_ref()?; + + // Check cache first (brief lock). + { + let mut cache = self.tree_cache.lock().await; + if let Some(cached) = cache.get(&input_root_digest) { + debug!( + %input_root_digest, + file_count = cached.file_digests.len(), + dir_count = cached.dir_digests.len(), + "Tree resolution cache hit" + ); + return Some(cached.clone()); + } + } + + // Cache miss — resolve the tree by reading Directory protos from CAS. 
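+        // (Two concurrent misses on the same root may both resolve the
+        // tree and race on the final put; that duplicated work is the
+        // accepted cost of never holding the cache lock across store I/O.)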
+        let result = resolve_tree_from_cas(cas_store, input_root_digest).await;
+        match result {
+            Ok(resolved) => {
+                debug!(
+                    %input_root_digest,
+                    file_count = resolved.file_digests.len(),
+                    dir_count = resolved.dir_digests.len(),
+                    "Resolved input tree from CAS (cache miss)"
+                );
+                let arc = Arc::new(resolved);
+                // Store in cache (brief lock).
+                {
+                    let mut cache = self.tree_cache.lock().await;
+                    cache.put(input_root_digest, arc.clone());
+                }
+                Some(arc)
+            }
+            Err(err) => {
+                warn!(
+                    %input_root_digest,
+                    ?err,
+                    "Failed to resolve input tree for locality scoring, skipping"
+                );
+                None
+            }
+        }
+    }
+}
+
+/// Resolved input tree containing file digests, directory digests,
+/// and per-subtree file byte totals for coverage scoring.
+struct ResolvedTree {
+    /// (file_digest, file_size) pairs, deduplicated.
+    file_digests: Vec<(DigestInfo, u64)>,
+    /// All directory digests in the tree (including root), deduplicated.
+    dir_digests: HashSet<DigestInfo>,
+    /// Total file bytes under each directory subtree (recursive).
+    /// Used to weight subtree coverage scoring — a subtree with 10GB
+    /// of files is worth more than one with 100 bytes.
+    subtree_bytes: HashMap<DigestInfo, u64>,
+}
+
+/// Resolves a directory tree from the CAS store by recursively reading
+/// Directory protos and collecting file digests (for locality scoring),
+/// directory digests (for subtree coverage scoring), and per-subtree
+/// file byte totals (for weighted coverage scoring). Deduplicates both
+/// file and directory digests.
+async fn resolve_tree_from_cas(
+    cas_store: &Store,
+    root_digest: DigestInfo,
+) -> Result<ResolvedTree, Error> {
+    use futures::stream::FuturesUnordered;
+    use futures::StreamExt;
+
+    let mut file_digests: Vec<(DigestInfo, u64)> = Vec::new();
+    let mut seen_files: HashSet<DigestInfo> = HashSet::new();
+    let mut dirs_to_visit: Vec<DigestInfo> = vec![root_digest];
+    let mut seen_dirs: HashSet<DigestInfo> = HashSet::new();
+    seen_dirs.insert(root_digest);
+
+    // Track tree structure for bottom-up subtree size computation.
+    let mut dir_direct_bytes: HashMap<DigestInfo, u64> = HashMap::new();
+    let mut dir_children: HashMap<DigestInfo, Vec<DigestInfo>> = HashMap::new();
+    // BFS order — used for bottom-up traversal (reverse of BFS = leaves first).
+    let mut bfs_order: Vec<DigestInfo> = vec![root_digest];
+
+    while !dirs_to_visit.is_empty() {
+        let fetches: FuturesUnordered<_> = dirs_to_visit
+            .drain(..)
+            .map(|dir_digest| {
+                let cas_store = cas_store.clone();
+                async move {
+                    let key: StoreKey<'_> = dir_digest.into();
+                    let bytes = cas_store
+                        .get_part_unchunked(key, 0, None)
+                        .await
+                        .err_tip(|| {
+                            format!(
+                                "Reading directory {dir_digest} from CAS for tree resolution"
+                            )
+                        })?;
+                    let directory = Directory::decode(bytes).map_err(|e| {
+                        make_err!(Code::Internal, "Failed to decode Directory proto: {e}")
+                    })?;
+                    Ok::<_, Error>((dir_digest, directory))
+                }
+            })
+            .collect();
+
+        let results: Vec<Result<(DigestInfo, Directory), Error>> = fetches.collect().await;
+        for result in results {
+            let (parent_digest, directory) = result?;
+
+            // Sum direct file bytes for this directory.
+            let mut direct_bytes: u64 = 0;
+            for file_node in &directory.files {
+                if let Some(ref digest) = file_node.digest {
+                    if let Ok(digest_info) = DigestInfo::try_from(digest) {
+                        let size = digest_info.size_bytes();
+                        direct_bytes += size;
+                        if seen_files.insert(digest_info) {
+                            file_digests.push((digest_info, size));
+                        }
+                    }
+                }
+            }
+            dir_direct_bytes.insert(parent_digest, direct_bytes);
+
+            // Queue subdirectories for visiting (dedup via seen_dirs).
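+            // (Note: `children` intentionally keeps digests that were
+            // already seen -- subtree_bytes must credit a shared subtree to
+            // every parent that references it, while `seen_dirs` still
+            // guarantees each directory is fetched and decoded only once.)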
+            let mut children = Vec::new();
+            for dir_node in &directory.directories {
+                if let Some(ref digest) = dir_node.digest {
+                    if let Ok(digest_info) = DigestInfo::try_from(digest) {
+                        children.push(digest_info);
+                        if seen_dirs.insert(digest_info) {
+                            dirs_to_visit.push(digest_info);
+                            bfs_order.push(digest_info);
+                        }
+                    }
+                }
+            }
+            dir_children.insert(parent_digest, children);
+        }
+    }
+
+    // Bottom-up pass: compute total file bytes under each subtree.
+    // Reverse BFS order gives us leaves-first, so children are always
+    // computed before parents.
+    let mut subtree_bytes: HashMap<DigestInfo, u64> = HashMap::new();
+    for &dir_digest in bfs_order.iter().rev() {
+        let direct = dir_direct_bytes.get(&dir_digest).copied().unwrap_or(0);
+        let children_total: u64 = dir_children
+            .get(&dir_digest)
+            .map(|children| {
+                children.iter()
+                    .map(|c| subtree_bytes.get(c).copied().unwrap_or(0))
+                    .sum()
+            })
+            .unwrap_or(0);
+        subtree_bytes.insert(dir_digest, direct + children_total);
+    }
+
+    Ok(ResolvedTree {
+        file_digests,
+        dir_digests: seen_dirs,
+        subtree_bytes,
+    })
+}
+
+/// Scores endpoints by the total bytes of input blobs they have cached
+/// AND generates peer hints in a single pass over the file digests,
+/// acquiring the locality map read lock only once.
+///
+/// Returns:
+/// - `HashMap<String, (u64, SystemTime)>`: endpoint scores (total cached
+///   bytes, most recent blob timestamp)
+/// - `Vec<PeerHint>`: peer hints sorted by file size descending, truncated
+///   to MAX_PEER_HINTS
+///
+/// This is called OUTSIDE the scheduler write lock, so it does not need
+/// access to `endpoint_to_worker` or the candidate set. The caller maps
+/// endpoints to WorkerIds and filters to candidates inside the lock.
+fn score_and_generate_hints(
+    file_digests: &[(DigestInfo, u64)],
+    locality_map: &SharedBlobLocalityMap,
+) -> (HashMap<String, (u64, SystemTime)>, Vec<PeerHint>) {
+    /// Maximum number of peer hints to include in a StartExecute message
+    /// to avoid oversized messages.
+    const MAX_PEER_HINTS: usize = 16384;
+
+    let map = locality_map.read();
+    let blobs = map.blobs_map();
+    let mut scores: HashMap<String, (u64, SystemTime)> = HashMap::new();
+    let mut hint_candidates: Vec<(DigestInfo, u64, Vec<String>)> = Vec::new();
+
+    for &(digest, size) in file_digests {
+        if let Some(endpoints) = blobs.get(&digest) {
+            // Accumulate endpoint scores.
+            for (endpoint, ts) in endpoints {
+                let entry = scores
+                    .entry(endpoint.to_string())
+                    .or_insert((0, UNIX_EPOCH));
+                entry.0 += size;
+                if *ts > entry.1 {
+                    entry.1 = *ts;
+                }
+            }
+            // Collect hint candidate if this digest has peer locations.
+            if !endpoints.is_empty() {
+                let peer_eps: Vec<String> =
+                    endpoints.keys().map(|e| e.to_string()).collect();
+                hint_candidates.push((digest, size, peer_eps));
+            }
+        }
+    }
+
+    // Sort by size descending to prioritize large files.
+    hint_candidates.sort_by(|a, b| b.1.cmp(&a.1));
+    hint_candidates.truncate(MAX_PEER_HINTS);
+
+    let peer_hints: Vec<PeerHint> = hint_candidates
+        .into_iter()
+        .map(|(digest, _size, peer_endpoints)| PeerHint {
+            digest: Some(digest.into()),
+            peer_endpoints,
+        })
+        .collect();
+
+    (scores, peer_hints)
+}
+
+/// Converts endpoint scores to worker scores using the endpoint-to-worker
+/// mapping, filtering to the given candidate set.
+///
+/// Returns `HashMap<WorkerId, (u64, SystemTime)>` where the tuple is
+/// (total cached bytes, most recent blob timestamp across all endpoints
+/// belonging to this worker).
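+/// A worker that exposes several CAS endpoints therefore accumulates the
+/// sum of its endpoints' byte scores and the freshest of their timestamps.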
+fn endpoint_scores_to_worker_scores(
+    endpoint_scores: &HashMap<String, (u64, SystemTime)>,
+    endpoint_to_worker: &HashMap<String, WorkerId>,
+    candidates: &HashSet<WorkerId>,
+) -> HashMap<WorkerId, (u64, SystemTime)> {
+    let mut worker_scores: HashMap<WorkerId, (u64, SystemTime)> = HashMap::new();
+    for (endpoint, &(score, ts)) in endpoint_scores {
+        if let Some(worker_id) = endpoint_to_worker.get(endpoint) {
+            if candidates.contains(worker_id) {
+                let entry = worker_scores
+                    .entry(worker_id.clone())
+                    .or_insert((0, UNIX_EPOCH));
+                entry.0 += score;
+                if ts > entry.1 {
+                    entry.1 = ts;
+                }
+            }
+        }
+    }
+    worker_scores
+}
+
+/// Backward-compatible wrapper used by existing tests. Scores candidate
+/// workers by the total bytes of input blobs they have cached.
+/// Returns only the byte score (drops the timestamp) for simpler assertions.
+#[cfg(test)]
+fn score_workers(
+    candidates: &HashSet<WorkerId>,
+    file_digests: &[(DigestInfo, u64)],
+    locality_map: &SharedBlobLocalityMap,
+    endpoint_to_worker: &HashMap<String, WorkerId>,
+) -> HashMap<WorkerId, u64> {
+    let (endpoint_scores, _hints) = score_and_generate_hints(file_digests, locality_map);
+    let full_scores = endpoint_scores_to_worker_scores(&endpoint_scores, endpoint_to_worker, candidates);
+    full_scores.into_iter().map(|(wid, (score, _))| (wid, score)).collect()
+}

 #[async_trait]
@@ -603,7 +1507,7 @@ impl WorkerScheduler for ApiWorkerScheduler {
     async fn add_worker(&self, worker: Worker) -> Result<(), Error> {
         let worker_id = worker.id.clone();
         let worker_timestamp = worker.last_update_timestamp;
-        let mut inner = self.inner.lock().await;
+        let mut inner = self.inner.write().await;
         if inner.shutting_down {
             warn!("Rejected worker add during shutdown: {}", worker_id);
             return Err(make_err!(
@@ -632,7 +1536,7 @@
         operation_id: &OperationId,
         update: UpdateOperationType,
     ) -> Result<(), Error> {
-        let mut inner = self.inner.lock().await;
+        let mut inner = self.inner.write().await;
         inner.update_action(worker_id, operation_id, update).await
     }

@@ -642,7 +1546,7 @@
         timestamp: WorkerTimestamp,
     ) -> Result<(), Error> {
         {
-            let mut inner = self.inner.lock().await;
+            let mut inner = self.inner.write().await;
             inner
                 .refresh_lifetime(worker_id, timestamp)
                 .err_tip(|| "Error refreshing lifetime in worker_keep_alive_received()")?;
@@ -657,7 +1561,7 @@
     async fn remove_worker(&self, worker_id: &WorkerId) -> Result<(), Error> {
         self.worker_registry.remove_worker(worker_id).await;

-        let mut inner = self.inner.lock().await;
+        let mut inner = self.inner.write().await;
         inner
             .immediate_evict_worker(
                 worker_id,
@@ -668,7 +1572,7 @@
     }

     async fn shutdown(&self, shutdown_guard: ShutdownGuard) {
-        let mut inner = self.inner.lock().await;
+        let mut inner = self.inner.write().await;
         inner.shutting_down = true; // should reject further worker registration
         while let Some(worker_id) = inner
             .workers
@@ -692,54 +1596,100 @@
     async fn remove_timedout_workers(&self, now_timestamp: WorkerTimestamp) -> Result<(), Error> {
         // Check worker liveness using both the local timestamp (from LRU)
         // and the worker registry. A worker is alive if either source says it's alive.
+        //
+        // Quarantine phase: workers that miss keepalive for > worker_timeout but
+        // < 2*worker_timeout are quarantined (stop receiving new work) rather than
+        // immediately evicted. Workers that miss keepalive for >= 2*worker_timeout
+        // are fully evicted.
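+        //
+        // Worked example: with worker_timeout_s = 30 and now = T, a worker
+        // last seen at T-31 enters quarantine (keeps its running actions
+        // but gets no new ones, assuming the registry also reports it dead),
+        // while a worker last seen at T-61 is evicted. A keepalive at any
+        // point before eviction clears the quarantine.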
         let timeout = Duration::from_secs(self.worker_timeout_s);
         let now = UNIX_EPOCH + Duration::from_secs(now_timestamp);
         let timeout_threshold = now_timestamp.saturating_sub(self.worker_timeout_s);
+        let evict_threshold = now_timestamp.saturating_sub(self.worker_timeout_s * 2);

-        let workers_to_check: Vec<(WorkerId, bool)> = {
-            let inner = self.inner.lock().await;
+        // Collect (worker_id, past_evict_threshold, already_quarantined) for
+        // workers that have not responded within the base timeout window.
+        let workers_to_check: Vec<(WorkerId, bool, bool)> = {
+            let inner = self.inner.read().await;
             inner
                 .workers
                 .iter()
-                .map(|(worker_id, worker)| {
+                .filter_map(|(worker_id, worker)| {
                     let local_alive = worker.last_update_timestamp > timeout_threshold;
-                    (worker_id.clone(), local_alive)
+                    if local_alive {
+                        None
+                    } else {
+                        let already_quarantined = worker.quarantined_at.is_some();
+                        // Check if past the eviction threshold (2x timeout)
+                        let past_evict_threshold =
+                            worker.last_update_timestamp <= evict_threshold;
+                        Some((worker_id.clone(), past_evict_threshold, already_quarantined))
+                    }
                 })
                 .collect()
         };

-        let mut worker_ids_to_remove = Vec::new();
-        for (worker_id, local_alive) in workers_to_check {
-            if local_alive {
-                continue;
-            }
+        if workers_to_check.is_empty() {
+            return Ok(());
+        }

+        // For each candidate, consult the registry to determine actual liveness.
+        let mut workers_to_quarantine = Vec::new();
+        let mut worker_ids_to_remove = Vec::new();
+        for (worker_id, past_evict_threshold, already_quarantined) in workers_to_check {
             let registry_alive = self
                 .worker_registry
                 .is_worker_alive(&worker_id, timeout, now)
                 .await;

-            if !registry_alive {
+            if registry_alive {
+                // Registry says alive — no action needed.
+                continue;
+            }
+
+            if past_evict_threshold {
+                // Has been unresponsive for >= 2x the timeout — evict.
                 trace!(
                     ?worker_id,
-                    local_alive,
-                    registry_alive,
-                    timeout_threshold,
-                    "Worker timed out - neither local nor registry shows alive"
+                    past_evict_threshold,
+                    "Worker exceeded double-timeout, evicting from pool"
                 );
                 worker_ids_to_remove.push(worker_id);
+            } else if !already_quarantined {
+                // Has been unresponsive for > timeout but < 2x timeout — quarantine.
+                trace!(
+                    ?worker_id,
+                    "Worker missed keepalive, entering quarantine (stops receiving work)"
+                );
+                workers_to_quarantine.push(worker_id);
             }
+            // If already_quarantined && !past_evict_threshold: still waiting, no action.
         }

-        if worker_ids_to_remove.is_empty() {
+        if workers_to_quarantine.is_empty() && worker_ids_to_remove.is_empty() {
             return Ok(());
         }

-        let mut inner = self.inner.lock().await;
-        let mut result = Ok(());
+        let mut inner = self.inner.write().await;
+        // Apply quarantine to workers that just crossed the first timeout.
+        let quarantine_time = SystemTime::now();
+        for worker_id in &workers_to_quarantine {
+            if let Some(worker) = inner.workers.peek_mut(worker_id) {
+                warn!(
+                    ?worker_id,
+                    "Worker missed keepalive, quarantining (will not receive new work)"
+                );
+                worker.quarantined_at = Some(quarantine_time);
+            }
+        }
+        // Notify the matching engine so it skips quarantined workers on next cycle.
+ if !workers_to_quarantine.is_empty() { + inner.worker_change_notify.notify_one(); + } + + let mut result = Ok(()); for worker_id in &worker_ids_to_remove { - warn!(?worker_id, "Worker timed out, removing from pool"); + warn!(?worker_id, "Worker timed out (2x timeout), removing from pool"); result = result.merge( inner .immediate_evict_worker( @@ -758,9 +1708,560 @@ impl WorkerScheduler for ApiWorkerScheduler { } async fn set_drain_worker(&self, worker_id: &WorkerId, is_draining: bool) -> Result<(), Error> { - let mut inner = self.inner.lock().await; + let mut inner = self.inner.write().await; inner.set_drain_worker(worker_id, is_draining).await } + + async fn update_worker_load(&self, worker_id: &WorkerId, cpu_load_pct: u32) -> Result<(), Error> { + // Use peek_mut to avoid promoting the worker in the LRU cache — + // load updates should not affect scheduling order. + let mut inner = self.inner.write().await; + let worker = inner.workers.0.peek_mut(worker_id).ok_or_else(|| { + make_input_err!( + "Worker not found in worker map in update_worker_load() {}", + worker_id + ) + })?; + worker.cpu_load_pct = cpu_load_pct; + debug!(%worker_id, cpu_load_pct, "Worker load updated"); + Ok(()) + } + + async fn update_cached_directories( + &self, + worker_id: &WorkerId, + digests: HashSet, + ) -> Result<(), Error> { + let mut inner = self.inner.write().await; + let worker = inner.workers.0.peek_mut(worker_id).ok_or_else(|| { + make_input_err!( + "Worker not found in worker map in update_cached_directories() {}", + worker_id + ) + })?; + let count = digests.len(); + worker.cached_directory_digests = digests; + debug!(%worker_id, count, "Worker cached directory digests updated"); + Ok(()) + } + + async fn update_cached_subtrees( + &self, + worker_id: &WorkerId, + is_full_snapshot: bool, + full_set: Vec, + added: Vec, + removed: Vec, + ) -> Result<(), Error> { + let mut inner = self.inner.write().await; + let worker = inner.workers.0.peek_mut(worker_id).ok_or_else(|| { + make_input_err!( + "Worker not found in worker map in update_cached_subtrees() {}", + worker_id + ) + })?; + if is_full_snapshot { + let count = full_set.len(); + worker.cached_subtree_digests = full_set.into_iter().collect(); + debug!(%worker_id, count, "Worker cached subtree digests replaced (full snapshot)"); + } else { + let added_count = added.len(); + let removed_count = removed.len(); + for digest in added { + worker.cached_subtree_digests.insert(digest); + } + for digest in &removed { + worker.cached_subtree_digests.remove(digest); + } + let total = worker.cached_subtree_digests.len(); + debug!( + %worker_id, + added_count, + removed_count, + total, + "Worker cached subtree digests updated (delta)" + ); + } + Ok(()) + } } impl RootMetricsComponent for ApiWorkerScheduler {} + +#[cfg(test)] +mod tests { + use super::*; + use std::collections::HashSet; + use bytes::Bytes; + use nativelink_config::stores::MemorySpec; + use nativelink_proto::build::bazel::remote::execution::v2::{ + Digest as ProtoDigest, DirectoryNode, FileNode, + }; + use nativelink_store::memory_store::MemoryStore; + use nativelink_util::blob_locality_map::new_shared_blob_locality_map; + use nativelink_util::digest_hasher::{DigestHasher, DigestHasherFunc}; + + /// Helper: encode a Directory proto and compute its DigestInfo (SHA256). 
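+    /// (This mirrors how the Remote Execution API addresses directories:
+    /// the digest is the SHA-256 of the serialized proto and the size is
+    /// the serialized byte length, so the helper produces CAS-compatible
+    /// keys for the in-memory store used by these tests.)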
+ fn encode_directory(dir: &Directory) -> (Vec, DigestInfo) { + let dir_bytes = dir.encode_to_vec(); + let mut hasher = DigestHasherFunc::Sha256.hasher(); + hasher.update(&dir_bytes); + let digest_info = hasher.finalize_digest(); + (dir_bytes, digest_info) + } + + /// Helper: create a FileNode with a deterministic fake digest. + fn make_file_node(name: &str, hash_byte: u8, size: i64) -> FileNode { + FileNode { + name: name.to_string(), + digest: Some(ProtoDigest { + hash: format!("{:02x}", hash_byte).repeat(32), // 64-char hex + size_bytes: size, + ..Default::default() + }), + ..Default::default() + } + } + + #[test] + fn test_score_workers_basic() { + let locality_map = new_shared_blob_locality_map(); + let d1 = DigestInfo::new([1u8; 32], 1000); + let d2 = DigestInfo::new([2u8; 32], 2000); + let d3 = DigestInfo::new([3u8; 32], 3000); + + // worker-a has d1 and d2 (3000 bytes total) + // worker-b has d2 and d3 (5000 bytes total) + { + let mut map = locality_map.write(); + map.register_blobs("grpc://worker-a:50081", &[d1, d2]); + map.register_blobs("grpc://worker-b:50081", &[d2, d3]); + } + + let worker_a = WorkerId::from("worker-a-id".to_string()); + let worker_b = WorkerId::from("worker-b-id".to_string()); + + let mut endpoint_to_worker = HashMap::new(); + endpoint_to_worker.insert("grpc://worker-a:50081".to_string(), worker_a.clone()); + endpoint_to_worker.insert("grpc://worker-b:50081".to_string(), worker_b.clone()); + + let mut candidates = HashSet::new(); + candidates.insert(worker_a.clone()); + candidates.insert(worker_b.clone()); + + let file_digests = vec![(d1, 1000), (d2, 2000), (d3, 3000)]; + + let scores = score_workers(&candidates, &file_digests, &locality_map, &endpoint_to_worker); + + assert_eq!(scores.get(&worker_a), Some(&3000)); // d1(1000) + d2(2000) + assert_eq!(scores.get(&worker_b), Some(&5000)); // d2(2000) + d3(3000) + } + + #[test] + fn test_score_workers_non_candidate_excluded() { + let locality_map = new_shared_blob_locality_map(); + let d1 = DigestInfo::new([1u8; 32], 1000); + + { + let mut map = locality_map.write(); + map.register_blobs("grpc://worker-a:50081", &[d1]); + } + + let worker_a = WorkerId::from("worker-a-id".to_string()); + let mut endpoint_to_worker = HashMap::new(); + endpoint_to_worker.insert("grpc://worker-a:50081".to_string(), worker_a.clone()); + + // worker_a is NOT in candidates + let candidates = HashSet::new(); + let file_digests = vec![(d1, 1000)]; + + let scores = score_workers(&candidates, &file_digests, &locality_map, &endpoint_to_worker); + assert!(scores.is_empty()); + } + + #[test] + fn test_score_workers_empty_locality_map() { + let locality_map = new_shared_blob_locality_map(); + let d1 = DigestInfo::new([1u8; 32], 1000); + + let worker_a = WorkerId::from("worker-a-id".to_string()); + let mut candidates = HashSet::new(); + candidates.insert(worker_a.clone()); + + let endpoint_to_worker = HashMap::new(); + let file_digests = vec![(d1, 1000)]; + + let scores = score_workers(&candidates, &file_digests, &locality_map, &endpoint_to_worker); + assert!(scores.is_empty()); + } + + // --------------------------------------------------------------- + // resolve_tree_from_cas tests + // --------------------------------------------------------------- + + #[tokio::test] + async fn test_resolve_tree_single_directory() { + // A single directory with 3 files, no subdirectories. 
+ let dir = Directory { + files: vec![ + make_file_node("file1.txt", 0xaa, 1000), + make_file_node("file2.txt", 0xbb, 2000), + make_file_node("file3.txt", 0xcc, 3000), + ], + directories: vec![], + ..Default::default() + }; + + let (dir_bytes, dir_digest) = encode_directory(&dir); + let store = Store::new(MemoryStore::new(&MemorySpec::default())); + let key: StoreKey<'_> = dir_digest.into(); + store + .update_oneshot(key, Bytes::from(dir_bytes)) + .await + .expect("store update_oneshot failed"); + + let result = resolve_tree_from_cas(&store, dir_digest) + .await + .expect("resolve_tree_from_cas failed"); + + assert_eq!(result.file_digests.len(), 3, "Expected 3 file digests"); + assert_eq!(result.dir_digests.len(), 1, "Expected 1 directory digest (root)"); + assert!(result.dir_digests.contains(&dir_digest)); + + // Root subtree contains all files: 1000+2000+3000 = 6000 + assert_eq!(result.subtree_bytes.get(&dir_digest), Some(&6000)); + + // Verify all three sizes are present (order may vary). + let mut sizes: Vec = result.file_digests.iter().map(|&(_, s)| s).collect(); + sizes.sort(); + assert_eq!(sizes, vec![1000, 2000, 3000]); + } + + #[tokio::test] + async fn test_resolve_tree_nested_directories() { + // Subdirectory with 2 files. + let sub_dir = Directory { + files: vec![ + make_file_node("sub_file1.txt", 0x11, 500), + make_file_node("sub_file2.txt", 0x22, 700), + ], + directories: vec![], + ..Default::default() + }; + let (sub_dir_bytes, sub_dir_digest) = encode_directory(&sub_dir); + + // Root directory with 1 file and a reference to the subdirectory. + let root_dir = Directory { + files: vec![make_file_node("root_file.txt", 0x33, 1200)], + directories: vec![DirectoryNode { + name: "subdir".to_string(), + digest: Some(sub_dir_digest.into()), + }], + ..Default::default() + }; + let (root_dir_bytes, root_dir_digest) = encode_directory(&root_dir); + + let store = Store::new(MemoryStore::new(&MemorySpec::default())); + let root_key: StoreKey<'_> = root_dir_digest.into(); + store + .update_oneshot(root_key, Bytes::from(root_dir_bytes)) + .await + .expect("store root dir"); + let sub_key: StoreKey<'_> = sub_dir_digest.into(); + store + .update_oneshot(sub_key, Bytes::from(sub_dir_bytes)) + .await + .expect("store sub dir"); + + let result = resolve_tree_from_cas(&store, root_dir_digest) + .await + .expect("resolve_tree_from_cas failed"); + + assert_eq!(result.file_digests.len(), 3, "Expected 3 files (1 root + 2 subdir)"); + assert_eq!(result.dir_digests.len(), 2, "Expected 2 directory digests (root + subdir)"); + assert!(result.dir_digests.contains(&root_dir_digest)); + assert!(result.dir_digests.contains(&sub_dir_digest)); + + // subdir has 500+700=1200 bytes of files + assert_eq!(result.subtree_bytes.get(&sub_dir_digest), Some(&1200)); + // root has 1200 (own file) + 1200 (subdir subtree) = 2400 + assert_eq!(result.subtree_bytes.get(&root_dir_digest), Some(&2400)); + + let mut sizes: Vec = result.file_digests.iter().map(|&(_, s)| s).collect(); + sizes.sort(); + assert_eq!(sizes, vec![500, 700, 1200]); + } + + #[tokio::test] + async fn test_resolve_tree_deduplicates_files() { + // Two directories both referencing the same file digest. 
+        let shared_file = make_file_node("shared.txt", 0xdd, 999);
+
+        let sub_dir = Directory {
+            files: vec![shared_file.clone()],
+            directories: vec![],
+            ..Default::default()
+        };
+        let (sub_dir_bytes, sub_dir_digest) = encode_directory(&sub_dir);
+
+        let root_dir = Directory {
+            files: vec![
+                // Same digest as the file in sub_dir (same hash_byte 0xdd, same size).
+                make_file_node("also_shared.txt", 0xdd, 999),
+            ],
+            directories: vec![DirectoryNode {
+                name: "subdir".to_string(),
+                digest: Some(sub_dir_digest.into()),
+            }],
+            ..Default::default()
+        };
+        let (root_dir_bytes, root_dir_digest) = encode_directory(&root_dir);
+
+        let store = Store::new(MemoryStore::new(&MemorySpec::default()));
+        let root_key: StoreKey<'_> = root_dir_digest.into();
+        store
+            .update_oneshot(root_key, Bytes::from(root_dir_bytes))
+            .await
+            .expect("store root dir");
+        let sub_key: StoreKey<'_> = sub_dir_digest.into();
+        store
+            .update_oneshot(sub_key, Bytes::from(sub_dir_bytes))
+            .await
+            .expect("store sub dir");
+
+        let result = resolve_tree_from_cas(&store, root_dir_digest)
+            .await
+            .expect("resolve_tree_from_cas failed");
+
+        // The same digest should appear only once.
+        assert_eq!(
+            result.file_digests.len(),
+            1,
+            "Duplicate file digest should be deduplicated"
+        );
+        assert_eq!(result.file_digests[0].1, 999);
+        assert_eq!(result.dir_digests.len(), 2, "Expected root + subdir");
+        assert!(result.dir_digests.contains(&root_dir_digest));
+        assert!(result.dir_digests.contains(&sub_dir_digest));
+
+        // Both dirs have the same file (999 bytes) — subtree_bytes counts
+        // each occurrence (not deduplicated, since it's per-directory).
+        assert_eq!(result.subtree_bytes.get(&sub_dir_digest), Some(&999));
+        assert_eq!(result.subtree_bytes.get(&root_dir_digest), Some(&1998)); // 999 + 999
+    }
+
+    #[tokio::test]
+    async fn test_resolve_tree_circular_directory() {
+        // A true hash cycle (A->B->A) is impossible with content-addressed
+        // hashes: the digest of A depends on B's digest and vice versa.
+        // Instead, we test the seen_dirs guard with a diamond structure:
+        //   root -> {dir_left, dir_right}, both -> dir_shared
+        // Without the seen_dirs set, dir_shared would be visited twice.
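+        //
+        //             root
+        //            /    \
+        //       dir_left  dir_right
+        //            \    /
+        //          dir_shared
+        //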
+        let dir_shared = Directory {
+            files: vec![make_file_node("shared.txt", 0x11, 100)],
+            directories: vec![],
+            ..Default::default()
+        };
+        let (shared_bytes, shared_digest) = encode_directory(&dir_shared);
+
+        let dir_left = Directory {
+            files: vec![make_file_node("left.txt", 0x22, 200)],
+            directories: vec![DirectoryNode {
+                name: "shared".to_string(),
+                digest: Some(shared_digest.into()),
+            }],
+            ..Default::default()
+        };
+        let (left_bytes, left_digest) = encode_directory(&dir_left);
+
+        let dir_right = Directory {
+            files: vec![make_file_node("right.txt", 0x33, 300)],
+            directories: vec![DirectoryNode {
+                name: "shared".to_string(),
+                digest: Some(shared_digest.into()),
+            }],
+            ..Default::default()
+        };
+        let (right_bytes, right_digest) = encode_directory(&dir_right);
+
+        let root = Directory {
+            files: vec![],
+            directories: vec![
+                DirectoryNode {
+                    name: "left".to_string(),
+                    digest: Some(left_digest.into()),
+                },
+                DirectoryNode {
+                    name: "right".to_string(),
+                    digest: Some(right_digest.into()),
+                },
+            ],
+            ..Default::default()
+        };
+        let (root_bytes, root_digest) = encode_directory(&root);
+
+        let store = Store::new(MemoryStore::new(&MemorySpec::default()));
+        for (bytes, digest) in [
+            (root_bytes, root_digest),
+            (left_bytes, left_digest),
+            (right_bytes, right_digest),
+            (shared_bytes, shared_digest),
+        ] {
+            let key: StoreKey<'_> = digest.into();
+            store
+                .update_oneshot(key, Bytes::from(bytes))
+                .await
+                .expect("store update");
+        }
+
+        let result = resolve_tree_from_cas(&store, root_digest)
+            .await
+            .expect("resolve_tree_from_cas failed");
+
+        // dir_shared is referenced by both dir_left and dir_right, but
+        // seen_dirs ensures it's only visited once. Files: shared(0x11),
+        // left(0x22), right(0x33) — all unique digests, so 3 total.
+        assert_eq!(
+            result.file_digests.len(),
+            3,
+            "Diamond structure: shared dir visited once, 3 unique files"
+        );
+        // 4 directories: root, left, right, shared
+        assert_eq!(result.dir_digests.len(), 4, "Expected 4 directory digests");
+        assert!(result.dir_digests.contains(&root_digest));
+        assert!(result.dir_digests.contains(&left_digest));
+        assert!(result.dir_digests.contains(&right_digest));
+        assert!(result.dir_digests.contains(&shared_digest));
+
+        // shared: 100 bytes (its own file)
+        assert_eq!(result.subtree_bytes.get(&shared_digest), Some(&100));
+        // left: 200 (own) + 100 (shared) = 300
+        assert_eq!(result.subtree_bytes.get(&left_digest), Some(&300));
+        // right: 300 (own) + 100 (shared) = 400
+        assert_eq!(result.subtree_bytes.get(&right_digest), Some(&400));
+        // root: 0 (no own files) + 300 (left) + 400 (right) = 700
+        assert_eq!(result.subtree_bytes.get(&root_digest), Some(&700));
+
+        let mut sizes: Vec<u64> = result.file_digests.iter().map(|&(_, s)| s).collect();
+        sizes.sort();
+        assert_eq!(sizes, vec![100, 200, 300]);
+    }
+
+    #[tokio::test]
+    async fn test_resolve_tree_missing_directory() {
+        // Attempt to resolve a digest that doesn't exist in the store.
+        let store = Store::new(MemoryStore::new(&MemorySpec::default()));
+
+        let missing_digest = DigestInfo::new([0xff; 32], 42);
+        let result = resolve_tree_from_cas(&store, missing_digest).await;
+
+        assert!(
+            result.is_err(),
+            "Should return an error for a missing directory"
+        );
+    }
+
+    #[test]
+    fn test_score_workers_empty_file_list() {
+        let locality_map = new_shared_blob_locality_map();
+
+        // Even with data in the locality map, empty file_digests => empty scores.
+        {
+            let mut map = locality_map.write();
+            let d1 = DigestInfo::new([1u8; 32], 1000);
+            map.register_blobs("grpc://worker-a:50081", &[d1]);
+        }
+
+        let worker_a = WorkerId::from("worker-a-id".to_string());
+        let mut endpoint_to_worker = HashMap::new();
+        endpoint_to_worker.insert("grpc://worker-a:50081".to_string(), worker_a.clone());
+
+        let mut candidates = HashSet::new();
+        candidates.insert(worker_a);
+
+        let file_digests: Vec<(DigestInfo, u64)> = vec![];
+
+        let scores = score_workers(&candidates, &file_digests, &locality_map, &endpoint_to_worker);
+        assert!(
+            scores.is_empty(),
+            "Expected empty scores for empty file_digests, got {scores:?}"
+        );
+    }
+
+    #[tokio::test]
+    async fn test_resolve_input_tree_cache_hit_returns_same_arc() {
+        use nativelink_config::schedulers::WorkerAllocationStrategy;
+        use nativelink_metric::MetricsComponent;
+        use nativelink_util::operation_state_manager::{UpdateOperationType, WorkerStateManager};
+
+        use crate::platform_property_manager::PlatformPropertyManager;
+        use crate::worker_registry::WorkerRegistry;
+
+        // Minimal mock WorkerStateManager for constructing ApiWorkerScheduler.
+        #[derive(Debug)]
+        struct NoopWorkerStateManager;
+
+        impl MetricsComponent for NoopWorkerStateManager {
+            fn publish(
+                &self,
+                _kind: MetricKind,
+                _field_metadata: MetricFieldData,
+            ) -> Result<MetricPublishKnownKindData, nativelink_metric::Error> {
+                Ok(MetricPublishKnownKindData::Component)
+            }
+        }
+
+        #[tonic::async_trait]
+        impl WorkerStateManager for NoopWorkerStateManager {
+            async fn update_operation(
+                &self,
+                _operation_id: &OperationId,
+                _worker_id: &WorkerId,
+                _update: UpdateOperationType,
+            ) -> Result<(), Error> {
+                Ok(())
+            }
+        }
+
+        // Create a store with a single-directory tree (one file).
+        let store = Store::new(MemoryStore::new(&MemorySpec::default()));
+
+        let dir = Directory {
+            files: vec![make_file_node("test.txt", 0xaa, 1000)],
+            directories: vec![],
+            ..Default::default()
+        };
+        let (dir_bytes, dir_digest) = encode_directory(&dir);
+        let key: StoreKey<'_> = dir_digest.into();
+        store
+            .update_oneshot(key, Bytes::from(dir_bytes))
+            .await
+            .expect("store update");
+
+        // Build scheduler with CAS store.
+        let scheduler = ApiWorkerScheduler::new_with_locality_map(
+            Arc::new(NoopWorkerStateManager),
+            Arc::new(PlatformPropertyManager::new(HashMap::new())),
+            WorkerAllocationStrategy::default(),
+            Arc::new(Notify::new()),
+            100,
+            Arc::new(WorkerRegistry::new()),
+            None,
+            Some(store),
+        );
+
+        // First call: cache miss, resolves from CAS.
+        let result1 = scheduler.resolve_input_tree(dir_digest).await;
+        assert!(result1.is_some(), "Expected Some from first resolve");
+
+        // Second call: cache hit, should return the same Arc.
+        let result2 = scheduler.resolve_input_tree(dir_digest).await;
+        assert!(result2.is_some(), "Expected Some from second resolve");
+
+        let arc1 = result1.unwrap();
+        let arc2 = result2.unwrap();
+        assert!(
+            Arc::ptr_eq(&arc1, &arc2),
+            "Expected resolve_input_tree to return the same Arc on cache hit (pointer equality)"
+        );
+    }
+}
diff --git a/nativelink-scheduler/src/awaited_action_db/awaited_action.rs b/nativelink-scheduler/src/awaited_action_db/awaited_action.rs
index 337c354e0..ab8abc14d 100644
--- a/nativelink-scheduler/src/awaited_action_db/awaited_action.rs
+++ b/nativelink-scheduler/src/awaited_action_db/awaited_action.rs
@@ -163,6 +163,12 @@ impl AwaitedAction {
         self.sort_key
     }
 
+    /// Boost this action to maximum priority so it is scheduled next.
+    /// Used for retrying infrastructure failures (e.g. OOM/SIGKILL).
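+    /// Sketch of the intended effect (assuming the key compares by priority
+    /// first, then enqueue time):
+    ///
+    ///   AwaitedActionSortKey::new(i32::MAX, 0)   // boosted: sorts ahead of
+    ///   AwaitedActionSortKey::new(priority, ts)  // any client-set priority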
+    pub(crate) fn boost_priority(&mut self) {
+        self.sort_key = AwaitedActionSortKey::new(i32::MAX, 0);
+    }
+
     pub const fn state(&self) -> &Arc<ActionState> {
         &self.state
     }
diff --git a/nativelink-scheduler/src/default_scheduler_factory.rs b/nativelink-scheduler/src/default_scheduler_factory.rs
index 58e27605b..fc9fd3c7e 100644
--- a/nativelink-scheduler/src/default_scheduler_factory.rs
+++ b/nativelink-scheduler/src/default_scheduler_factory.rs
@@ -23,6 +23,7 @@ use nativelink_error::{Error, ResultExt, make_input_err};
 use nativelink_proto::com::github::trace_machina::nativelink::events::OriginEvent;
 use nativelink_store::redis_store::RedisStore;
 use nativelink_store::store_manager::StoreManager;
+use nativelink_util::blob_locality_map::SharedBlobLocalityMap;
 use nativelink_util::instant_wrapper::InstantWrapper;
 use nativelink_util::operation_state_manager::ClientStateManager;
 use redis::aio::{ConnectionManager, PubSub};
@@ -49,18 +50,20 @@ pub fn scheduler_factory(
     spec: &SchedulerSpec,
     store_manager: &StoreManager,
     maybe_origin_event_tx: Option<&mpsc::Sender<OriginEvent>>,
+    locality_map: Option<SharedBlobLocalityMap>,
 ) -> Result<SchedulerFactoryResults, Error> {
-    inner_scheduler_factory(spec, store_manager, maybe_origin_event_tx)
+    inner_scheduler_factory(spec, store_manager, maybe_origin_event_tx, locality_map)
 }
 
 fn inner_scheduler_factory(
     spec: &SchedulerSpec,
     store_manager: &StoreManager,
     maybe_origin_event_tx: Option<&mpsc::Sender<OriginEvent>>,
+    locality_map: Option<SharedBlobLocalityMap>,
 ) -> Result<SchedulerFactoryResults, Error> {
     let scheduler: SchedulerFactoryResults = match spec {
         SchedulerSpec::Simple(spec) => {
-            simple_scheduler_factory(spec, store_manager, SystemTime::now, maybe_origin_event_tx)?
+            simple_scheduler_factory(spec, store_manager, SystemTime::now, maybe_origin_event_tx, locality_map)?
         }
         SchedulerSpec::Grpc(spec) => (Some(Arc::new(GrpcScheduler::new(spec)?)), None),
         SchedulerSpec::CacheLookup(spec) => {
@@ -68,7 +71,7 @@ fn inner_scheduler_factory(
                 .get_store(&spec.ac_store)
                 .err_tip(|| format!("'ac_store': '{}' does not exist", spec.ac_store))?;
             let (action_scheduler, worker_scheduler) =
-                inner_scheduler_factory(&spec.scheduler, store_manager, maybe_origin_event_tx)
+                inner_scheduler_factory(&spec.scheduler, store_manager, maybe_origin_event_tx, locality_map.clone())
                     .err_tip(|| "In nested CacheLookupScheduler construction")?;
             let cache_lookup_scheduler = Arc::new(CacheLookupScheduler::new(
                 ac_store,
@@ -78,7 +81,7 @@ fn inner_scheduler_factory(
         }
         SchedulerSpec::PropertyModifier(spec) => {
             let (action_scheduler, worker_scheduler) =
-                inner_scheduler_factory(&spec.scheduler, store_manager, maybe_origin_event_tx)
+                inner_scheduler_factory(&spec.scheduler, store_manager, maybe_origin_event_tx, locality_map.clone())
                     .err_tip(|| "In nested PropertyModifierScheduler construction")?;
             let property_modifier_scheduler = Arc::new(PropertyModifierScheduler::new(
                 spec,
@@ -96,7 +99,19 @@ fn simple_scheduler_factory(
     store_manager: &StoreManager,
     now_fn: fn() -> SystemTime,
     maybe_origin_event_tx: Option<&mpsc::Sender<OriginEvent>>,
+    locality_map: Option<SharedBlobLocalityMap>,
 ) -> Result<SchedulerFactoryResults, Error> {
+    // Resolve the CAS store for locality-aware scheduling if configured.
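+    // Illustrative only: the JSON scheduler spec is assumed to carry the
+    // matching field, along the lines of
+    //   "simple": { "cas_store": "CAS_MAIN_STORE", ... }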
+    let cas_store = if let Some(ref cas_store_name) = spec.cas_store {
+        Some(
+            store_manager
+                .get_store(cas_store_name)
+                .err_tip(|| format!("'cas_store': '{cas_store_name}' does not exist"))?,
+        )
+    } else {
+        None
+    };
+
     match spec
         .experimental_backend
         .as_ref()
@@ -109,11 +124,13 @@ fn simple_scheduler_factory(
                 &task_change_notify,
                 SystemTime::now,
             );
-            let (action_scheduler, worker_scheduler) = SimpleScheduler::new(
+            let (action_scheduler, worker_scheduler) = SimpleScheduler::new_with_cas_store(
                 spec,
                 awaited_action_db,
                 task_change_notify,
                 maybe_origin_event_tx.cloned(),
+                cas_store,
+                locality_map,
             );
             Ok((Some(action_scheduler), Some(worker_scheduler)))
         }
@@ -143,11 +160,13 @@ fn simple_scheduler_factory(
                 Default::default,
             )
             .err_tip(|| "In state_manager_factory::redis_state_manager")?;
-            let (action_scheduler, worker_scheduler) = SimpleScheduler::new(
+            let (action_scheduler, worker_scheduler) = SimpleScheduler::new_with_cas_store(
                 spec,
                 awaited_action_db,
                 task_change_notify,
                 maybe_origin_event_tx.cloned(),
+                cas_store,
+                locality_map,
             );
             Ok((Some(action_scheduler), Some(worker_scheduler)))
         }
diff --git a/nativelink-scheduler/src/memory_awaited_action_db.rs b/nativelink-scheduler/src/memory_awaited_action_db.rs
index 6154bd17e..ac62b7dce 100644
--- a/nativelink-scheduler/src/memory_awaited_action_db.rs
+++ b/nativelink-scheduler/src/memory_awaited_action_db.rs
@@ -286,7 +286,7 @@ impl SortedAwaitedActions {
             operation_id: new_awaited_action.operation_id().clone(),
         });
 
-        let Some(sorted_awaited_action) = maybe_sorted_awaited_action else {
+        let Some(mut sorted_awaited_action) = maybe_sorted_awaited_action else {
             return Err(make_err!(
                 Code::Internal,
                 "sorted_action_info_hash_keys and action_info_hash_key_to_awaited_action are out of sync - {} - {:?}",
@@ -295,6 +295,13 @@ impl SortedAwaitedActions {
             ));
         };
 
+        // Update sort_key to match the new awaited action. Without this,
+        // boost_priority() (used during SIGKILL retry) changes the sort_key
+        // on the AwaitedAction stored in the watch channel, but the BTree
+        // entry retains the old sort_key, causing all subsequent lookups to
+        // fail with "out of sync".
+        sorted_awaited_action.sort_key = new_awaited_action.sort_key();
+
         self.insert_sort_map_for_stage(&new_awaited_action.state().stage, &sorted_awaited_action)
             .err_tip(|| "In AwaitedActionDb::update_awaited_action")?;
         Ok(())
@@ -417,14 +424,19 @@ impl<I: InstantWrapper, NowFn: Fn() -> I + Clone + Send + Sync> AwaitedActionDbImpl<I, NowFn> {
         debug!(%operation_id, "Clearing operation from state manager");
         let awaited_action = tx.borrow().clone();
         // Cleanup action_info_hash_key_to_awaited_action if it was marked cached.
+        // Only remove the entry if it still points to THIS operation.
+        // A newer operation may have claimed this key slot if the
+        // action completed and was re-requested before this cleanup ran.
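+        // Timeline of the race this guards against (illustrative):
+        //   1. op-A (cacheable) completes; its cleanup is scheduled.
+        //   2. The same action is requested again; op-B claims the key slot.
+        //   3. op-A's cleanup runs; blindly removing the key would orphan op-B.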
         match &awaited_action.action_info().unique_qualifier {
             ActionUniqueQualifier::Cacheable(action_key) => {
-                let maybe_awaited_action = self
+                let dominated_by_self = self
                     .action_info_hash_key_to_awaited_action
-                    .remove(action_key);
-                if !awaited_action.state().stage.is_finished()
-                    && maybe_awaited_action.is_none()
-                {
+                    .get(action_key)
+                    .map_or(false, |mapped_op_id| *mapped_op_id == operation_id);
+                if dominated_by_self {
+                    self.action_info_hash_key_to_awaited_action
+                        .remove(action_key);
+                } else if !awaited_action.state().stage.is_finished() {
                     error!(
                         %operation_id,
                         ?awaited_action,
@@ -552,18 +564,22 @@ impl<I: InstantWrapper, NowFn: Fn() -> I + Clone + Send + Sync> AwaitedActionDbImpl<I, NowFn> {
         }
         match &new_awaited_action.action_info().unique_qualifier {
             ActionUniqueQualifier::Cacheable(action_key) => {
-                let maybe_awaited_action =
-                    action_info_hash_key_to_awaited_action.remove(action_key);
-                match maybe_awaited_action {
-                    Some(removed_operation_id) => {
-                        if &removed_operation_id != new_awaited_action.operation_id() {
-                            error!(
-                                ?removed_operation_id,
-                                ?new_awaited_action,
-                                ?action_key,
-                                "action_info_hash_key_to_awaited_action and operation_id_to_awaited_action are out of sync",
-                            );
-                        }
+                // Only remove the entry if it belongs to this operation.
+                // A newer operation may have claimed this key slot if the
+                // original was cleaned up and re-requested.
+                match action_info_hash_key_to_awaited_action.get(action_key) {
+                    Some(mapped_operation_id)
+                        if mapped_operation_id == new_awaited_action.operation_id() =>
+                    {
+                        action_info_hash_key_to_awaited_action.remove(action_key);
+                    }
+                    Some(mapped_operation_id) => {
+                        error!(
+                            ?mapped_operation_id,
+                            ?new_awaited_action,
+                            ?action_key,
+                            "action_info_hash_key_to_awaited_action points to a different operation_id",
+                        );
+                    }
                     None => {
                         error!(
@@ -702,6 +718,20 @@ impl<I: InstantWrapper, NowFn: Fn() -> I + Clone + Send + Sync> AwaitedActionDbImpl<I, NowFn> {
             }
         }
 
+        // Log orphaned completed actions (no active WaitExecution subscriber).
+        // These are typically from Bazel dynamic execution where the local leg
+        // won and the client dropped the remote stream.
+        if matches!(
+            new_awaited_action.state().stage,
+            ActionStage::Completed(_) | ActionStage::CompletedFromCache(_)
+        ) && tx.receiver_count() == 0
+        {
+            debug!(
+                operation_id = ?new_awaited_action.operation_id(),
+                "Completed action has no subscribers (likely orphaned dynamic execution)",
+            );
+        }
+
         // Notify all listeners of the new state and ignore if no one is listening.
         // Note: Do not use `.send()` as it will not update the state if all listeners
         // are dropped.
diff --git a/nativelink-scheduler/src/simple_scheduler.rs b/nativelink-scheduler/src/simple_scheduler.rs
index d977fceea..dfdb27059 100644
--- a/nativelink-scheduler/src/simple_scheduler.rs
+++ b/nativelink-scheduler/src/simple_scheduler.rs
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-use std::collections::{BTreeSet, HashMap};
+use std::collections::{BTreeSet, HashMap, HashSet};
 use std::sync::Arc;
 use std::time::{Instant, SystemTime};
 
@@ -23,6 +23,7 @@ use nativelink_error::{Code, Error, ResultExt};
 use nativelink_metric::{MetricsComponent, RootMetricsComponent};
 use nativelink_proto::com::github::trace_machina::nativelink::events::OriginEvent;
 use nativelink_util::action_messages::{ActionInfo, ActionState, OperationId, WorkerId};
+use nativelink_util::common::DigestInfo;
 use nativelink_util::instant_wrapper::InstantWrapper;
 use nativelink_util::known_platform_property_provider::KnownPlatformPropertyProvider;
 use nativelink_util::operation_state_manager::{
@@ -30,6 +31,7 @@ use nativelink_util::operation_state_manager::{
     OperationFilter, OperationStageFlags, OrderDirection, UpdateOperationType,
 };
 use nativelink_util::origin_event::OriginMetadata;
+use nativelink_util::platform_properties::PlatformProperties;
 use nativelink_util::shutdown_guard::ShutdownGuard;
 use nativelink_util::spawn;
 use nativelink_util::task::JoinHandleDropGuard;
@@ -39,7 +41,7 @@ use opentelemetry::context::{Context, FutureExt as OtelFutureExt};
 use opentelemetry_semantic_conventions::attribute::ENDUSER_ID;
 use tokio::sync::{Notify, mpsc};
 use tokio::time::Duration;
-use tracing::{debug, error, info, info_span, warn};
+use tracing::{debug, error, info_span, warn};
 
 use crate::api_worker_scheduler::ApiWorkerScheduler;
 use crate::awaited_action_db::{AwaitedActionDb, CLIENT_KEEPALIVE_DURATION};
@@ -51,7 +53,9 @@ use crate::worker_scheduler::WorkerScheduler;
 
 /// Default timeout for workers in seconds.
 /// If this changes, remember to change the documentation in the config.
-const DEFAULT_WORKER_TIMEOUT_S: u64 = 5;
+/// A 5-second timeout causes unnecessary worker churn on any brief network
+/// hiccup or GC pause, so we use a more generous default.
+const DEFAULT_WORKER_TIMEOUT_S: u64 = 30;
 
 /// Mark operations as completed with error if no client has updated them
 /// within this duration.
@@ -146,6 +150,11 @@ pub struct SimpleScheduler {
     /// e.g. "worker busy", "can't find any worker"
     /// Set to None to disable. This is quite noisy, so we limit it
     worker_match_logging_interval: Option<Duration>,
+
+    /// Maximum number of actions that can be matched per client
+    /// (identified by `instance_name`) in one matching cycle.
+    /// 0 means unlimited (fair scheduling disabled).
+    max_matches_per_client_per_cycle: usize,
 }
 
 impl core::fmt::Debug for SimpleScheduler {
@@ -216,98 +225,31 @@ impl SimpleScheduler {
     // can create a map of capabilities of each worker and then try and match
     // the actions to the worker using the map lookup (ie. map reduce).
     async fn do_try_match(&self, full_worker_logging: bool) -> Result<(), Error> {
-        async fn match_action_to_worker(
-            action_state_result: &dyn ActionStateResult,
-            workers: &ApiWorkerScheduler,
-            matching_engine_state_manager: &dyn MatchingEngineStateManager,
-            platform_property_manager: &PlatformPropertyManager,
-            full_worker_logging: bool,
-        ) -> Result<(), Error> {
-            let (action_info, maybe_origin_metadata) =
-                action_state_result
-                    .as_action_info()
-                    .await
-                    .err_tip(|| "Failed to get action_info from as_action_info_result stream")?;
-
-            // TODO(palfrey) We should not compute this every time and instead store
-            // it with the ActionInfo when we receive it.
-            let platform_properties = platform_property_manager
-                .make_platform_properties(action_info.platform_properties.clone())
-                .err_tip(
-                    || "Failed to make platform properties in SimpleScheduler::do_try_match",
-                )?;
-
-            let action_info = ActionInfoWithProps {
-                inner: action_info,
-                platform_properties,
-            };
-
-            // Try to find a worker for the action.
-            let worker_id = {
-                match workers
-                    .find_worker_for_action(&action_info.platform_properties, full_worker_logging)
-                    .await
-                {
-                    Some(worker_id) => worker_id,
-                    // If we could not find a worker for the action,
-                    // we have nothing to do.
-                    None => return Ok(()),
-                }
-            };
-
-            let attach_operation_fut = async move {
-                // Extract the operation_id from the action_state.
-                let operation_id = {
-                    let (action_state, _origin_metadata) = action_state_result
-                        .as_state()
-                        .await
-                        .err_tip(|| "Failed to get action_info from as_state_result stream")?;
-                    action_state.client_operation_id.clone()
-                };
-
-                // Tell the matching engine that the operation is being assigned to a worker.
-                let assign_result = matching_engine_state_manager
-                    .assign_operation(&operation_id, Ok(&worker_id))
-                    .await
-                    .err_tip(|| "Failed to assign operation in do_try_match");
-                if let Err(err) = assign_result {
-                    if err.code == Code::Aborted {
-                        // If the operation was aborted, it means that the operation was
-                        // cancelled due to another operation being assigned to the worker.
-                        return Ok(());
-                    }
-                    // Any other error is a real error.
-                    return Err(err);
-                }
-
-                debug!(%worker_id, %operation_id, ?action_info, "Notifying worker of operation");
-                workers
-                    .worker_notify_run_action(worker_id, operation_id, action_info)
-                    .await
-                    .err_tip(|| {
-                        "Failed to run worker_notify_run_action in SimpleScheduler::do_try_match"
-                    })
-            };
-            tokio::pin!(attach_operation_fut);
-
-            let origin_metadata = maybe_origin_metadata.unwrap_or_default();
-
-            let ctx = Context::current_with_baggage(vec![KeyValue::new(
-                ENDUSER_ID,
-                origin_metadata.identity,
-            )]);
-
-            info_span!("do_try_match")
-                .in_scope(|| attach_operation_fut)
-                .with_context(ctx)
-                .await
-        }
-
-        let mut result = Ok(());
+        /// Maximum number of actions to process concurrently during matching.
+        /// find_and_reserve_worker atomically finds AND reserves the worker
+        /// (reducing platform properties and inserting into running_action_infos)
+        /// under a single lock acquisition, so concurrent matches cannot
+        /// select the same worker.
+        const MATCH_CONCURRENCY: usize = 8;
+
+        // Cache for computed platform properties, keyed by sorted key-value
+        // pairs. This avoids recomputing the same PlatformProperties for
+        // actions that share identical platform requirements (the common case).
+        let props_cache: std::sync::Mutex<
+            HashMap<Vec<(String, String)>, Arc<PlatformProperties>>,
+        > = std::sync::Mutex::new(HashMap::new());
+
+        // Per-client match counter for fair scheduling. When
+        // max_matches_per_client_per_cycle > 0, limits how many actions
+        // from the same instance_name can be matched in one cycle,
+        // preventing a single client from monopolizing all workers.
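+        // Worked example (illustrative): with max_matches_per_client_per_cycle
+        // set to 4, clients A (100 actions queued) and B (2 queued) get at most
+        // 4 and 2 matches respectively in one cycle, instead of A's backlog
+        // consuming every free worker before B is considered.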
+        let per_client_matches: std::sync::Mutex<HashMap<String, usize>> =
+            std::sync::Mutex::new(HashMap::new());
+        let max_per_client = self.max_matches_per_client_per_cycle;
 
         let start = Instant::now();
 
-        let mut stream = self
+        let stream = self
            .get_queued_operations()
            .await
            .err_tip(|| "Failed to get queued operations in do_try_match")?;
@@ -320,17 +262,49 @@ impl SimpleScheduler {
             );
         }
 
-        while let Some(action_state_result) = stream.next().await {
-            result = result.merge(
-                match_action_to_worker(
-                    action_state_result.as_ref(),
+        // Collect all queued actions so we own them, then process up to
+        // MATCH_CONCURRENCY concurrently using FuturesUnordered. Each action
+        // independently finds a worker and assigns itself; conflicts are
+        // resolved by the existing error handling (Aborted codes, None from
+        // find_worker, etc.).
+        let queued_actions: Vec<Box<dyn ActionStateResult>> = stream.collect().await;
+
+        let mut futures_set = futures::stream::FuturesUnordered::<
+            std::pin::Pin<Box<dyn Future<Output = Result<(), Error>> + Send + '_>>,
+        >::new();
+        let mut action_iter = queued_actions.into_iter();
+        let mut result = Ok(());
+
+        // Seed the initial batch.
+        for action_state_result in action_iter.by_ref().take(MATCH_CONCURRENCY) {
+            futures_set.push(Box::pin(Self::match_action_to_worker_cached(
+                action_state_result,
+                self.worker_scheduler.as_ref(),
+                self.matching_engine_state_manager.as_ref(),
+                self.platform_property_manager.as_ref(),
+                &props_cache,
+                &per_client_matches,
+                max_per_client,
+                full_worker_logging,
+            )));
+        }
+
+        // Process futures as they complete, adding new ones to maintain concurrency.
+        while let Some(match_result) = futures_set.next().await {
+            result = result.merge(match_result);
+
+            if let Some(action_state_result) = action_iter.next() {
+                futures_set.push(Box::pin(Self::match_action_to_worker_cached(
+                    action_state_result,
                     self.worker_scheduler.as_ref(),
                     self.matching_engine_state_manager.as_ref(),
                     self.platform_property_manager.as_ref(),
+                    &props_cache,
+                    &per_client_matches,
+                    max_per_client,
                     full_worker_logging,
-                )
-                .await,
-            );
+                )));
+            }
         }
 
         let total_elapsed = start.elapsed();
@@ -344,6 +318,165 @@ impl SimpleScheduler {
 
         result
     }
+
+    /// Matches a single action to a worker, using a shared cache for computed
+    /// platform properties to avoid redundant recomputation across actions
+    /// with identical platform requirements.
+    ///
+    /// When `max_per_client > 0`, enforces fair scheduling by limiting how
+    /// many actions from the same `instance_name` can be matched per cycle.
+    /// Actions that exceed the limit are skipped (left in queue for next cycle).
+    async fn match_action_to_worker_cached(
+        action_state_result: Box<dyn ActionStateResult>,
+        workers: &ApiWorkerScheduler,
+        matching_engine_state_manager: &dyn MatchingEngineStateManager,
+        platform_property_manager: &PlatformPropertyManager,
+        props_cache: &std::sync::Mutex<
+            HashMap<Vec<(String, String)>, Arc<PlatformProperties>>,
+        >,
+        per_client_matches: &std::sync::Mutex<HashMap<String, usize>>,
+        max_per_client: usize,
+        full_worker_logging: bool,
+    ) -> Result<(), Error> {
+        let (action_info, maybe_origin_metadata) = action_state_result
+            .as_action_info()
+            .await
+            .err_tip(|| "Failed to get action_info from as_action_info_result stream")?;
+
+        // Fair scheduling: atomically check and optimistically increment the
+        // per-client counter. If the client has hit its limit, skip the action.
+        // If the match later fails, we decrement to undo the reservation.
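+        // Claim/undo protocol in brief (sketch of the code below):
+        //   claim: count < max ? { count += 1; proceed } : skip (stay queued)
+        //   undo:  no worker found, or assignment fails => count -= 1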
+        let client_name = action_info.instance_name().clone();
+        let claimed_slot = if max_per_client > 0 {
+            let mut map = per_client_matches.lock().unwrap_or_else(|e| e.into_inner());
+            let count = map.entry(client_name.clone()).or_insert(0);
+            if *count >= max_per_client {
+                // Skip — action stays queued for next cycle.
+                return Ok(());
+            }
+            *count += 1;
+            true
+        } else {
+            false
+        };
+
+        // Helper to undo the optimistic increment on failure paths.
+        let undo_claim = |per_client_matches: &std::sync::Mutex<HashMap<String, usize>>,
+                          client_name: &str| {
+            let mut map = per_client_matches.lock().unwrap_or_else(|e| e.into_inner());
+            if let Some(count) = map.get_mut(client_name) {
+                *count = count.saturating_sub(1);
+            }
+        };
+
+        // Build a deterministic cache key from the raw platform
+        // properties (sorted key-value pairs).
+        let mut cache_key: Vec<(String, String)> =
+            action_info.platform_properties.clone().into_iter().collect();
+        cache_key.sort();
+
+        // Look up or compute and cache the platform properties.
+        let platform_properties = {
+            let mut cache = props_cache.lock().unwrap_or_else(|e| e.into_inner());
+            if let Some(cached) = cache.get(&cache_key) {
+                cached.clone()
+            } else {
+                let computed = platform_property_manager
+                    .make_platform_properties(action_info.platform_properties.clone())
+                    .err_tip(|| {
+                        "Failed to make platform properties in SimpleScheduler::do_try_match"
+                    })?;
+                let arc = Arc::new(computed);
+                cache.insert(cache_key, arc.clone());
+                arc
+            }
+        };
+
+        let action_info_with_props = ActionInfoWithProps {
+            inner: action_info,
+            platform_properties: (*platform_properties).clone(),
+        };
+
+        // Extract the operation_id from the action_state BEFORE finding a
+        // worker, so we can pass it to find_and_reserve_worker for atomic
+        // reservation.
+        let operation_id = {
+            let (action_state, _origin_metadata) = action_state_result
+                .as_state()
+                .await
+                .err_tip(|| "Failed to get action_info from as_state_result stream")?;
+            action_state.client_operation_id.clone()
+        };
+
+        // Atomically find a worker AND reserve it for this operation.
+        // The worker's platform properties are reduced and the action is
+        // recorded in running_action_infos under a single lock acquisition,
+        // preventing concurrent matches from selecting the same worker.
+        let (worker_id, tx, msg) = match workers
+            .find_and_reserve_worker(
+                &action_info_with_props.platform_properties,
+                &operation_id,
+                &action_info_with_props,
+                full_worker_logging,
+            )
+            .await
+        {
+            Some(result) => result,
+            // No worker found — undo the optimistic increment.
+            None => {
+                if claimed_slot {
+                    undo_claim(per_client_matches, &client_name);
+                }
+                return Ok(());
+            }
+        };
+
+        // Tell the matching engine that the operation is being assigned to a worker.
+        let assign_result = matching_engine_state_manager
+            .assign_operation(&operation_id, Ok(&worker_id))
+            .await
+            .err_tip(|| "Failed to assign operation in do_try_match");
+        if let Err(err) = assign_result {
+            // Undo the worker reservation since the assignment failed.
+            workers.unreserve_worker(&worker_id, &operation_id).await;
+            if claimed_slot {
+                undo_claim(per_client_matches, &client_name);
+            }
+            if err.code == Code::Aborted {
+                // The operation was cancelled due to another operation
+                // being assigned to the worker.
+                return Ok(());
+            }
+            // Any other error is a real error.
+            return Err(err);
+        }
+
+        let origin_metadata = maybe_origin_metadata.unwrap_or_default();
+        let ctx = Context::current_with_baggage(vec![KeyValue::new(
+            ENDUSER_ID,
+            origin_metadata.identity,
+        )]);
+
+        let notify_fut = async {
+            debug!(
+                %worker_id,
+                %operation_id,
+                ?action_info_with_props,
+                "Notifying worker of operation"
+            );
+            workers
+                .send_reserved_worker_notification(&worker_id, tx, msg)
+                .await
+                .err_tip(|| {
+                    "Failed to send_reserved_worker_notification in SimpleScheduler::do_try_match"
+                })
+        };
+
+        info_span!("do_try_match")
+            .in_scope(|| notify_fut)
+            .with_context(ctx)
+            .await
+    }
 }
 
 impl SimpleScheduler {
@@ -352,24 +485,41 @@ impl SimpleScheduler {
         awaited_action_db: A,
         task_change_notify: Arc<Notify>,
         maybe_origin_event_tx: Option<mpsc::Sender<OriginEvent>>,
+    ) -> (Arc<Self>, Arc<ApiWorkerScheduler>) {
+        Self::new_with_cas_store(
+            spec,
+            awaited_action_db,
+            task_change_notify,
+            maybe_origin_event_tx,
+            None,
+            None,
+        )
+    }
+
+    pub fn new_with_cas_store(
+        spec: &SimpleSpec,
+        awaited_action_db: A,
+        task_change_notify: Arc<Notify>,
+        maybe_origin_event_tx: Option<mpsc::Sender<OriginEvent>>,
+        cas_store: Option<Store>,
+        locality_map: Option<SharedBlobLocalityMap>,
     ) -> (Arc<Self>, Arc<ApiWorkerScheduler>) {
         Self::new_with_callback(
             spec,
             awaited_action_db,
             || {
-                // The cost of running `do_try_match()` is very high, but constant
-                // in relation to the number of changes that have happened. This
-                // means that grabbing this lock to process `do_try_match()` should
-                // always yield to any other tasks that might want the lock. The
-                // easiest and most fair way to do this is to sleep for a small
-                // amount of time. Using something like tokio::task::yield_now()
-                // does not yield as aggressively as we'd like if new futures are
-                // scheduled within a future.
-                tokio::time::sleep(Duration::from_millis(1))
+                // Yield to allow other tasks to make progress between match
+                // cycles. A full 1ms sleep is too aggressive and caps matching
+                // to ~1000 cycles/sec. sleep(ZERO) defers to the next timer
+                // tick, preventing busy-spinning when no other tasks are
+                // runnable (unlike yield_now which returns immediately).
                tokio::time::sleep(Duration::ZERO)
             },
             task_change_notify,
             SystemTime::now,
             maybe_origin_event_tx,
+            cas_store,
+            locality_map,
         )
     }
 
@@ -386,6 +536,8 @@ impl SimpleScheduler {
         task_change_notify: Arc<Notify>,
         now_fn: NowFn,
         maybe_origin_event_tx: Option<mpsc::Sender<OriginEvent>>,
+        cas_store: Option<Store>,
+        locality_map: Option<SharedBlobLocalityMap>,
     ) -> (Arc<Self>, Arc<ApiWorkerScheduler>) {
         let platform_property_manager = Arc::new(PlatformPropertyManager::new(
             spec.supported_platform_properties
@@ -433,13 +585,15 @@ impl SimpleScheduler {
             Some(worker_registry.clone()),
         );
 
-        let worker_scheduler = ApiWorkerScheduler::new(
+        let worker_scheduler = ApiWorkerScheduler::new_with_locality_map(
             state_manager.clone(),
             platform_property_manager.clone(),
             spec.allocation_strategy,
             worker_change_notify.clone(),
             worker_timeout_s,
             worker_registry,
+            locality_map,
+            cas_store,
         );
 
         let worker_scheduler_clone = worker_scheduler.clone();
@@ -450,6 +604,8 @@ impl SimpleScheduler {
         spawn!("simple_scheduler_task_worker_matching", async move {
             let mut last_match_successful = true;
             let mut worker_match_logging_last: Option<Instant> = None;
+            let mut last_stall_check: Option<Instant> = None;
+            let mut consecutive_match_errors: u32 = 0;
             // Break out of the loop only when the inner is dropped.
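+            // Each iteration below (sketch): (1) wait for a task or worker
+            // change notification, (2) run do_try_match(), (3) every 30s scan
+            // Queued operations for >60s stalls and classify them as capacity
+            // pressure vs. a likely matching deadlock.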
             loop {
                 let task_change_fut = task_change_notify.notified();
@@ -542,11 +698,129 @@
                         for item in value {
                             items.push(item.to_string());
                         }
-                        info!(?items, "Oldest actions in state");
+                        debug!(?items, "Oldest actions in state");
                     }
                     worker_match_logging_last.replace(now);
                 }
+
+                // Stall detection: every 30s, check for actions stuck
+                // in Queued state for >60s. Only fires as an error when
+                // no actions are executing (true deadlock). If workers are
+                // busy executing, queued stalls are just capacity limits.
+                let should_check_stalls = match last_stall_check {
+                    None => true,
+                    Some(when) => now.duration_since(when) >= Duration::from_secs(30),
+                };
+                if should_check_stalls {
+                    last_stall_check = Some(now);
+                    let stall_threshold = Duration::from_secs(60);
+                    match scheduler
+                        .matching_engine_state_manager
+                        .filter_operations(OperationFilter {
+                            stages: OperationStageFlags::Queued,
+                            order_by_priority_direction: Some(OrderDirection::Desc),
+                            ..Default::default()
+                        })
+                        .await
+                    {
+                        Ok(queued_stream) => {
+                            let queued_actions: Vec<_> = queued_stream.collect().await;
+                            let mut stalled_count: usize = 0;
+                            let mut unmatchable_count: usize = 0;
+                            let prop_manager = scheduler.worker_scheduler.get_platform_property_manager();
+                            for action_state_result in &queued_actions {
+                                if let Ok((state, _)) = action_state_result.as_state().await {
+                                    if let Ok(elapsed) = state.last_transition_timestamp.elapsed() {
+                                        if elapsed > stall_threshold {
+                                            stalled_count += 1;
+                                            // Check if any worker could ever match this action.
+                                            match action_state_result.as_action_info().await {
+                                                Ok((action_info, _)) => {
+                                                    match prop_manager.make_platform_properties(
+                                                        action_info.platform_properties.clone(),
+                                                    ) {
+                                                        Ok(props) => {
+                                                            if !scheduler.worker_scheduler.has_matching_workers(&props).await {
+                                                                error!(
+                                                                    operation_id = %state.client_operation_id,
+                                                                    action_digest = %state.action_digest,
+                                                                    properties = ?action_info.platform_properties,
+                                                                    "Action queued >60s with NO matching workers — \
+                                                                     no registered worker can satisfy its platform requirements"
+                                                                );
+                                                                unmatchable_count += 1;
+                                                            }
+                                                        }
+                                                        Err(e) => {
+                                                            warn!(
+                                                                operation_id = %state.client_operation_id,
+                                                                ?e,
+                                                                "Failed to parse platform properties for stalled action — cannot check matchability"
+                                                            );
+                                                        }
+                                                    }
+                                                }
+                                                Err(e) => {
+                                                    warn!(
+                                                        operation_id = %state.client_operation_id,
+                                                        ?e,
+                                                        "Failed to get action_info for stalled action — cannot check matchability"
+                                                    );
+                                                }
+                                            }
+                                        }
+                                    }
+                                }
+                            }
+                            let matchable_stalled = stalled_count - unmatchable_count;
+                            if matchable_stalled > 0 {
+                                // Check if workers are actively executing. If so,
+                                // the queue backlog is just capacity pressure.
+                                let executing_count = match scheduler
+                                    .matching_engine_state_manager
+                                    .filter_operations(OperationFilter {
+                                        stages: OperationStageFlags::Executing,
+                                        ..Default::default()
+                                    })
+                                    .await
+                                {
+                                    Ok(s) => s.count().await,
+                                    Err(e) => {
+                                        // Query failed — assume workers are busy
+                                        // rather than raising a false deadlock alarm.
+                                        warn!(?e, "Failed to query executing actions for stall check");
+                                        usize::MAX
+                                    }
+                                };
+
+                                if executing_count > 0 {
+                                    warn!(
+                                        stalled_count = matchable_stalled,
+                                        total_queued = queued_actions.len(),
+                                        executing_count,
+                                        unmatchable_count,
+                                        "Actions waiting in queue >60s (workers at capacity)"
+                                    );
+                                } else {
+                                    error!(
+                                        stalled_count = matchable_stalled,
+                                        total_queued = queued_actions.len(),
+                                        unmatchable_count,
+                                        "Actions stalled in Queued state >60s with NO executing actions (possible scheduling deadlock)"
+                                    );
+                                }
+                            }
+                        }
+                        Err(e) => {
+                            error!(
+                                ?e,
+                                "Failed to query queued actions for stall check — scheduler state may be corrupted"
+                            );
+                        }
+                    }
+                }
+
                 res
             }
             // If the inner went away it means the scheduler is shutting
             None => return,
         };
         last_match_successful = result.is_ok();
-        if let Err(err) = result {
-            error!(?err, "Error while running do_try_match");
+        if let Err(err) = &result {
+            consecutive_match_errors += 1;
+            if consecutive_match_errors >= 10 {
+                error!(
+                    consecutive_match_errors,
+                    ?err,
+                    "do_try_match failing consecutively — \
+                     possible scheduler data structure corruption. \
+                     A server restart may be required to recover.",
+                );
+            } else {
+                error!(?err, "Error while running do_try_match");
+            }
+        } else {
+            consecutive_match_errors = 0;
         }
 
         on_matching_engine_run().await;
@@ -586,6 +873,7 @@ impl SimpleScheduler {
             maybe_origin_event_tx,
             task_worker_matching_spawn,
             worker_match_logging_interval,
+            max_matches_per_client_per_cycle: spec.max_matches_per_client_per_cycle,
         }
     });
     (action_scheduler, worker_scheduler_clone)
@@ -678,6 +966,35 @@ impl WorkerScheduler for SimpleScheduler {
             .set_drain_worker(worker_id, is_draining)
             .await
     }
+
+    async fn update_worker_load(&self, worker_id: &WorkerId, cpu_load_pct: u32) -> Result<(), Error> {
+        self.worker_scheduler
+            .update_worker_load(worker_id, cpu_load_pct)
+            .await
+    }
+
+    async fn update_cached_directories(
+        &self,
+        worker_id: &WorkerId,
+        digests: HashSet<DigestInfo>,
+    ) -> Result<(), Error> {
+        self.worker_scheduler
+            .update_cached_directories(worker_id, digests)
+            .await
+    }
+
+    async fn update_cached_subtrees(
+        &self,
+        worker_id: &WorkerId,
+        is_full_snapshot: bool,
+        full_set: Vec<DigestInfo>,
+        added: Vec<DigestInfo>,
+        removed: Vec<DigestInfo>,
+    ) -> Result<(), Error> {
+        self.worker_scheduler
+            .update_cached_subtrees(worker_id, is_full_snapshot, full_set, added, removed)
+            .await
+    }
 }
 
 impl RootMetricsComponent for SimpleScheduler {}
diff --git a/nativelink-scheduler/src/simple_scheduler_state_manager.rs b/nativelink-scheduler/src/simple_scheduler_state_manager.rs
index 040290ce3..090ed597a 100644
--- a/nativelink-scheduler/src/simple_scheduler_state_manager.rs
+++ b/nativelink-scheduler/src/simple_scheduler_state_manager.rs
@@ -676,7 +676,7 @@ where
             // No action found. It is ok if the action was not found. It
             // probably means that the action was dropped, but worker was
             // still processing it.
-            warn!(
+            debug!(
                %operation_id,
                "Unable to update action due to it being missing, probably dropped"
            );
@@ -716,21 +716,16 @@ where
 
         // Make sure we don't update an action that is already completed.
         if awaited_action.state().stage.is_finished() {
-            match &update {
-                UpdateOperationType::UpdateWithDisconnect | UpdateOperationType::KeepAlive => {
-                    // No need to error a keep-alive when it's completed, it's just
-                    // unnecessary log noise.
-                    return Ok(());
-                }
-                _ => {
-                    return Err(make_err!(
-                        Code::Internal,
-                        "Action {operation_id} is already completed with state {:?} - maybe_worker_id: {:?}",
-                        awaited_action.state().stage,
-                        maybe_worker_id,
-                    ));
-                }
-            }
+            // This is a benign race: the worker finished after the scheduler
+            // already timed out the operation (e.g. client stopped listening).
+            // No client is waiting for the result, so just log and move on.
+            debug!(
+                %operation_id,
+                ?maybe_worker_id,
+                stage = ?awaited_action.state().stage,
+                "Ignoring late update for already-completed action"
+            );
+            return Ok(());
         }
 
         let stage = match &update {
@@ -756,16 +751,46 @@ where
                     warn!(state = ?awaited_action.state(), "Action already assigned");
                     return Err(make_err!(Code::Aborted, "Action already assigned"));
                 }
-                stage.clone()
+                // Exit code 9 = SIGKILL, typically from the OOM killer.
+                // Treat as a retryable infrastructure error rather than
+                // a permanent action failure.
+                if let ActionStage::Completed(result) = stage {
+                    if result.exit_code == 9 {
+                        awaited_action.attempts += 1;
+                        if awaited_action.attempts <= self.max_job_retries {
+                            warn!(
+                                %operation_id,
+                                attempts = awaited_action.attempts,
+                                max_retries = self.max_job_retries,
+                                "Action killed by SIGKILL (OOM?), re-queuing with max priority"
+                            );
+                            awaited_action.boost_priority();
+                            ActionStage::Queued
+                        } else {
+                            warn!(
+                                %operation_id,
+                                attempts = awaited_action.attempts,
+                                "Action killed by SIGKILL (OOM?) and exceeded max retries"
+                            );
+                            stage.clone()
+                        }
+                    } else {
+                        stage.clone()
+                    }
+                } else {
+                    stage.clone()
+                }
             }
             UpdateOperationType::UpdateWithError(err) => {
                 // Don't count a backpressure failure as an attempt for an action.
                 let due_to_backpressure = err.code == Code::ResourceExhausted;
+                // Missing inputs can only be fixed by the client re-uploading.
+                let missing_inputs = err.code == Code::FailedPrecondition;
                 if !due_to_backpressure {
                     awaited_action.attempts += 1;
                 }
 
-                if awaited_action.attempts > self.max_job_retries {
+                if missing_inputs || awaited_action.attempts > self.max_job_retries {
                     ActionStage::Completed(ActionResult {
                         execution_metadata: ExecutionMetadata {
                             worker: maybe_worker_id.map_or_else(String::default, ToString::to_string),
diff --git a/nativelink-scheduler/src/worker.rs b/nativelink-scheduler/src/worker.rs
index 0d6e68b6a..eb346fcb0 100644
--- a/nativelink-scheduler/src/worker.rs
+++ b/nativelink-scheduler/src/worker.rs
@@ -24,6 +24,7 @@ use nativelink_proto::com::github::trace_machina::nativelink::remote_execution::{
     ConnectionResult, StartExecute, UpdateForWorker, update_for_worker,
 };
 use nativelink_util::action_messages::{ActionInfo, OperationId, WorkerId};
+use nativelink_util::common::DigestInfo;
 use nativelink_util::metrics_utils::{AsyncCounterWrapper, CounterWithTime, FuncCounterWrapper};
 use nativelink_util::platform_properties::{PlatformProperties, PlatformPropertyValue};
 use tokio::sync::mpsc::UnboundedSender;
@@ -92,6 +93,12 @@ pub struct Worker {
     #[metric(help = "If the worker is paused.")]
     pub is_paused: bool,
 
+    /// Whether the pause was caused by explicit worker backpressure
+    /// (ResourceExhausted) as opposed to a capacity check. When true,
+    /// the scheduler should not auto-clear is_paused based on capacity
+    /// alone — it should wait for the worker to complete an action.
+    pub paused_due_to_backpressure: bool,
+
     /// Whether the worker is draining.
#[metric(help = "If the worker is draining.")] pub is_draining: bool, @@ -100,6 +107,33 @@ pub struct Worker { #[metric(help = "Maximum inflight tasks for this worker (or 0 for unlimited)")] pub max_inflight_tasks: u64, + /// When this worker entered quarantine (i.e. missed keepalive for + /// > worker_timeout but < 2*worker_timeout). While quarantined the + /// worker will not receive new actions but is not yet evicted. + /// Reset to `None` when a keepalive is received. + pub quarantined_at: Option, + + /// The worker's CAS gRPC endpoint for peer blob serving. + /// Empty if the worker does not support peer serving. + #[metric(help = "The worker's CAS endpoint for peer blob sharing.")] + pub cas_endpoint: String, + + /// CPU load percentage reported by the worker (load_avg_1m / num_cpus * 100). + /// 0 means unknown (worker hasn't reported load yet). + #[metric(help = "CPU load percentage reported by the worker.")] + pub cpu_load_pct: u32, + + /// Digests of input root directories cached in the worker's directory cache. + /// The scheduler gives routing preference to workers that already have the + /// action's input_root_digest cached. + pub cached_directory_digests: HashSet, + + /// All subtree digests (roots + subtrees) from the worker's directory cache. + /// Updated via delta encoding from BlobsAvailableNotification. + /// The scheduler uses this for subtree-aware scheduling: checking whether + /// the action's input_root_digest appears as ANY subtree in any cached entry. + pub cached_subtree_digests: HashSet, + /// Stats about the worker. #[metric] metrics: Arc, @@ -116,7 +150,7 @@ fn send_msg_to_worker( /// Reduces the platform properties available on the worker based on the platform properties provided. /// This is used because we allow more than 1 job to run on a worker at a time, and this is how the /// scheduler knows if more jobs can run on a given worker. 
-fn reduce_platform_properties(
+pub(crate) fn reduce_platform_properties(
     parent_props: &mut PlatformProperties,
     reduction_props: &PlatformProperties,
 ) {
@@ -140,6 +174,17 @@ impl Worker {
         tx: UnboundedSender<UpdateForWorker>,
         timestamp: WorkerTimestamp,
         max_inflight_tasks: u64,
+    ) -> Self {
+        Self::new_with_cas_endpoint(id, platform_properties, tx, timestamp, max_inflight_tasks, String::new())
+    }
+
+    pub fn new_with_cas_endpoint(
+        id: WorkerId,
+        platform_properties: PlatformProperties,
+        tx: UnboundedSender<UpdateForWorker>,
+        timestamp: WorkerTimestamp,
+        max_inflight_tasks: u64,
+        cas_endpoint: String,
     ) -> Self {
         Self {
             id,
@@ -149,8 +194,14 @@ impl Worker {
             restored_platform_properties: HashSet::new(),
             last_update_timestamp: timestamp,
             is_paused: false,
+            paused_due_to_backpressure: false,
             is_draining: false,
             max_inflight_tasks,
+            quarantined_at: None,
+            cas_endpoint,
+            cpu_load_pct: 0,
+            cached_directory_digests: HashSet::new(),
+            cached_subtree_digests: HashSet::new(),
             metrics: Arc::new(Metrics {
                 connected_timestamp: SystemTime::now()
                     .duration_since(UNIX_EPOCH)
@@ -218,6 +269,7 @@ impl Worker {
             queued_timestamp: Some(action_info.inner.insert_timestamp.into()),
             platform: Some((&action_info.platform_properties).into()),
             worker_id,
+            peer_hints: Vec::new(),
         };
         reduce_platform_properties(
             worker_platform_properties,
@@ -256,6 +308,7 @@ impl Worker {
             self.restore_platform_properties(&pending_action_info.action_info.platform_properties);
         }
         self.is_paused = false;
+        self.paused_due_to_backpressure = false;
         self.metrics.actions_completed.inc();
         Ok(())
     }
@@ -264,7 +317,7 @@ impl Worker {
         !self.running_action_infos.is_empty()
     }
 
-    fn restore_platform_properties(&mut self, props: &PlatformProperties) {
+    pub(crate) fn restore_platform_properties(&mut self, props: &PlatformProperties) {
         for (property, prop_value) in &props.properties {
             if let PlatformPropertyValue::Minimum(value) = prop_value {
                 let worker_props = &mut self.platform_properties.properties;
diff --git a/nativelink-scheduler/src/worker_capability_index.rs b/nativelink-scheduler/src/worker_capability_index.rs
index b0e45b76b..b7a15d923 100644
--- a/nativelink-scheduler/src/worker_capability_index.rs
+++ b/nativelink-scheduler/src/worker_capability_index.rs
@@ -31,7 +31,7 @@ use std::collections::{HashMap, HashSet};
 
 use nativelink_util::action_messages::WorkerId;
 use nativelink_util::platform_properties::{PlatformProperties, PlatformPropertyValue};
-use tracing::info;
+use tracing::debug;
 
 /// A property key-value pair used for indexing.
 #[derive(Clone, Hash, Eq, PartialEq, Debug)]
@@ -136,7 +136,7 @@ impl WorkerCapabilityIndex {
     ) -> HashSet<WorkerId> {
         if self.all_workers.is_empty() {
             if full_worker_logging {
-                info!("No workers available to match!");
+                debug!("No workers available to match!");
             }
             return HashSet::new();
         }
@@ -173,7 +173,7 @@ impl WorkerCapabilityIndex {
                         .filter(|pk| &pk.0.name == name)
                         .map(|pk| pk.0.value.clone())
                         .collect();
-                    info!(
+                    debug!(
                         "No candidate workers due to a lack of matching '{name}' = {value:?}. Workers have: {values:?}"
                     );
                 }
@@ -202,7 +202,7 @@ impl WorkerCapabilityIndex {
 
             if internal_candidates.is_empty() {
                 if full_worker_logging {
-                    info!(
+                    debug!(
Job asked for {value:?}" ); } diff --git a/nativelink-scheduler/src/worker_scheduler.rs b/nativelink-scheduler/src/worker_scheduler.rs index fe9bcb0f4..b13289140 100644 --- a/nativelink-scheduler/src/worker_scheduler.rs +++ b/nativelink-scheduler/src/worker_scheduler.rs @@ -12,10 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::collections::HashSet; + use async_trait::async_trait; use nativelink_error::Error; use nativelink_metric::RootMetricsComponent; use nativelink_util::action_messages::{OperationId, WorkerId}; +use nativelink_util::common::DigestInfo; use nativelink_util::operation_state_manager::UpdateOperationType; use nativelink_util::shutdown_guard::ShutdownGuard; @@ -59,4 +62,31 @@ pub trait WorkerScheduler: Sync + Send + Unpin + RootMetricsComponent + 'static /// Sets if the worker is draining or not. async fn set_drain_worker(&self, worker_id: &WorkerId, is_draining: bool) -> Result<(), Error>; + + /// Updates the CPU load reported by a worker. + /// `cpu_load_pct` is load_avg_1m / num_cpus * 100. 0 means unknown. + async fn update_worker_load(&self, worker_id: &WorkerId, cpu_load_pct: u32) -> Result<(), Error>; + + /// Updates the set of cached directory digests for a worker. + /// The scheduler uses this to give routing preference to workers that + /// already have the action's input_root_digest cached in their directory cache. + async fn update_cached_directories( + &self, + worker_id: &WorkerId, + digests: HashSet, + ) -> Result<(), Error>; + + /// Updates the set of cached subtree digests for a worker using delta encoding. + /// + /// When `is_full_snapshot` is true, `full_set` replaces the entire set. + /// When `is_full_snapshot` is false, `added` digests are inserted and + /// `removed` digests are deleted from the existing set. 
+    async fn update_cached_subtrees(
+        &self,
+        worker_id: &WorkerId,
+        is_full_snapshot: bool,
+        full_set: Vec<DigestInfo>,
+        added: Vec<DigestInfo>,
+        removed: Vec<DigestInfo>,
+    ) -> Result<(), Error>;
 }
diff --git a/nativelink-scheduler/tests/redis_store_awaited_action_db_test.rs b/nativelink-scheduler/tests/redis_store_awaited_action_db_test.rs
index 906d511ac..0e2070c76 100644
--- a/nativelink-scheduler/tests/redis_store_awaited_action_db_test.rs
+++ b/nativelink-scheduler/tests/redis_store_awaited_action_db_test.rs
@@ -270,6 +270,8 @@ async fn test_multiple_clients_subscribe_to_same_action() -> Result<(), Error> {
         task_change_notify,
         MockInstantWrapped::default,
         None,
+        None, // cas_store
+        None, // locality_map
     );
 
     // First client adds the action
@@ -324,6 +326,7 @@ async fn test_multiple_clients_subscribe_to_same_action() -> Result<(), Error> {
             queued_timestamp: Some(SystemTime::UNIX_EPOCH.into()),
             platform: Some(Platform::default()),
             worker_id: worker_id.clone().into(),
+            peer_hints: Vec::new(),
         })),
     };
     let msg_for_worker = rx_from_worker.recv().await.unwrap();
diff --git a/nativelink-scheduler/tests/simple_scheduler_test.rs b/nativelink-scheduler/tests/simple_scheduler_test.rs
index 59364bf28..b2ae67644 100644
--- a/nativelink-scheduler/tests/simple_scheduler_test.rs
+++ b/nativelink-scheduler/tests/simple_scheduler_test.rs
@@ -22,15 +22,17 @@ use std::sync::Arc;
 use std::time::{SystemTime, UNIX_EPOCH};
 
 use async_lock::Mutex;
+use bytes::Bytes;
 use futures::task::Poll;
 use futures::{Stream, StreamExt, poll};
 use mock_instant::thread_local::{MockClock, SystemTime as MockSystemTime};
 use nativelink_config::schedulers::{PropertyType, SimpleSpec};
+use nativelink_config::stores::MemorySpec;
 use nativelink_error::{Code, Error, ResultExt, make_err};
 use nativelink_macro::nativelink_test;
 use nativelink_metric::MetricsComponent;
 use nativelink_proto::build::bazel::remote::execution::v2::{
-    ExecuteRequest, Platform, digest_function,
+    Directory, ExecuteRequest, FileNode, Platform, digest_function,
 };
 use nativelink_proto::com::github::trace_machina::nativelink::remote_execution::{
     ConnectionResult, StartExecute, UpdateForWorker, update_for_worker,
@@ -43,10 +45,12 @@ use nativelink_scheduler::default_scheduler_factory::memory_awaited_action_db_fa
 use nativelink_scheduler::simple_scheduler::SimpleScheduler;
 use nativelink_scheduler::worker::Worker;
 use nativelink_scheduler::worker_scheduler::WorkerScheduler;
+use nativelink_store::memory_store::MemoryStore;
 use nativelink_util::action_messages::{
     ActionInfo, ActionResult, ActionStage, ActionState, DirectoryInfo, ExecutionMetadata,
     FileInfo, INTERNAL_ERROR_EXIT_CODE, NameOrPath, OperationId, SymlinkInfo, WorkerId,
 };
+use nativelink_util::blob_locality_map::new_shared_blob_locality_map;
 use nativelink_util::common::DigestInfo;
 use nativelink_util::instant_wrapper::MockInstantWrapped;
 use nativelink_util::operation_state_manager::{
@@ -54,6 +58,8 @@ use nativelink_util::operation_state_manager::{
     UpdateOperationType,
 };
 use nativelink_util::platform_properties::{PlatformProperties, PlatformPropertyValue};
+use nativelink_util::store_trait::{Store, StoreLike};
+use prost::Message;
 use pretty_assertions::assert_eq;
 use tokio::sync::{Notify, mpsc};
 use utils::scheduler_utils::{INSTANCE_NAME, make_base_action_info, update_eq};
@@ -134,6 +140,8 @@ async fn basic_add_action_with_one_worker_test() -> Result<(), Error> {
         task_change_notify,
         MockInstantWrapped::default,
         None,
+        None, // cas_store
+        None, // locality_map
     );
 
     let action_digest = DigestInfo::new([99u8; 32], 512);
@@ -159,6 +167,7 @@ async fn basic_add_action_with_one_worker_test() -> Result<(), Error> {
             queued_timestamp: Some(insert_timestamp.into()),
             platform: Some(Platform::default()),
             worker_id: worker_id.into(),
+            peer_hints: Vec::new(),
         })),
     };
     let msg_for_worker = rx_from_worker.recv().await.unwrap();
@@ -234,6 +243,8 @@ async fn client_does_not_receive_update_timeout() -> Result<(), Error> {
         task_change_notify.clone(),
         MockInstantWrapped::default,
         None,
+        None, // cas_store
+        None, // locality_map
     );
     let action_digest = DigestInfo::new([99u8; 32], 512);
 
@@ -295,6 +306,8 @@ async fn find_executing_action() -> Result<(), Error> {
         task_change_notify,
         MockInstantWrapped::default,
         None,
+        None, // cas_store
+        None, // locality_map
     );
     let action_digest = DigestInfo::new([99u8; 32], 512);
 
@@ -339,6 +352,7 @@ async fn find_executing_action() -> Result<(), Error> {
             queued_timestamp: Some(insert_timestamp.into()),
             platform: Some(Platform::default()),
             worker_id: worker_id.into(),
+            peer_hints: Vec::new(),
         })),
     };
     let msg_for_worker = rx_from_worker.recv().await.unwrap();
@@ -380,6 +394,8 @@ async fn remove_worker_reschedules_multiple_running_job_test() -> Result<(), Err
         task_change_notify,
         MockInstantWrapped::default,
         None,
+        None, // cas_store
+        None, // locality_map
     );
     let action_digest1 = DigestInfo::new([99u8; 32], 512);
     let action_digest2 = DigestInfo::new([88u8; 32], 512);
@@ -418,6 +434,7 @@ async fn remove_worker_reschedules_multiple_running_job_test() -> Result<(), Err
         queued_timestamp: Some(insert_timestamp1.into()),
         platform: Some(Platform::default()),
         worker_id: worker_id1.to_string(),
+        peer_hints: Vec::new(),
     };
 
     let mut expected_start_execute_for_worker2 = StartExecute {
@@ -431,6 +448,7 @@ async fn remove_worker_reschedules_multiple_running_job_test() -> Result<(), Err
         queued_timestamp: Some(insert_timestamp2.into()),
         platform: Some(Platform::default()),
         worker_id: worker_id1.to_string(),
+        peer_hints: Vec::new(),
     };
     let operation_id1 = {
         // Worker1 should now see first execution request.
@@ -574,6 +592,8 @@ async fn set_drain_worker_pauses_and_resumes_worker_test() -> Result<(), Error>
         task_change_notify,
         MockInstantWrapped::default,
         None,
+        None, // cas_store
+        None, // locality_map
     );
     let action_digest = DigestInfo::new([99u8; 32], 512);
 
@@ -664,6 +684,8 @@ async fn worker_should_not_queue_if_properties_dont_match_test() -> Result<(), E
         task_change_notify,
         MockInstantWrapped::default,
         None,
+        None, // cas_store
+        None, // locality_map
     );
     let action_digest = DigestInfo::new([99u8; 32], 512);
     let mut platform_properties = HashMap::new();
@@ -718,6 +740,7 @@ async fn worker_should_not_queue_if_properties_dont_match_test() -> Result<(), E
             queued_timestamp: Some(insert_timestamp.into()),
             platform: Some((&worker2_properties).into()),
             worker_id: worker_id2.to_string(),
+            peer_hints: Vec::new(),
         })),
     };
     let msg_for_worker = rx_from_worker2.recv().await.unwrap();
@@ -761,6 +784,8 @@ async fn cacheable_items_join_same_action_queued_test() -> Result<(), Error> {
         task_change_notify,
         MockInstantWrapped::default,
         None,
+        None, // cas_store
+        None, // locality_map
     );
     let action_digest = DigestInfo::new([99u8; 32], 512);
 
@@ -817,6 +842,7 @@ async fn cacheable_items_join_same_action_queued_test() -> Result<(), Error> {
             queued_timestamp: Some(insert_timestamp1.into()),
             platform: Some(Platform::default()),
             worker_id: worker_id.into(),
+            peer_hints: Vec::new(),
         })),
     };
     let msg_for_worker = rx_from_worker.recv().await.unwrap();
@@ -870,6 +896,8 @@ async fn worker_disconnects_does_not_schedule_for_execution_test() -> Result<(),
         task_change_notify,
         MockInstantWrapped::default,
         None,
+        None, // cas_store
+        None, // locality_map
     );
     let worker_id = WorkerId("worker_id".to_string());
     let action_digest = DigestInfo::new([99u8; 32], 512);
@@ -1028,6 +1056,8 @@ async fn matching_engine_fails_sends_abort() -> Result<(), Error> {
         task_change_notify,
         MockInstantWrapped::default,
         None,
+        None, // cas_store
+        None, // locality_map
     );
     // Initial worker calls do_try_match, so send it no items.
     senders.get_range_of_actions.send(vec![]).unwrap();
@@ -1074,6 +1104,8 @@ async fn matching_engine_fails_sends_abort() -> Result<(), Error> {
         task_change_notify,
         MockInstantWrapped::default,
         None,
+        None, // cas_store
+        None, // locality_map
     );
     // senders.tx_get_awaited_action_by_id.send(Ok(None)).unwrap();
     senders.get_range_of_actions.send(vec![]).unwrap();
@@ -1135,6 +1167,8 @@ async fn worker_timesout_reschedules_running_job_test() -> Result<(), Error> {
         task_change_notify,
         MockInstantWrapped::default,
         None,
+        None, // cas_store
+        None, // locality_map
     );
     let action_digest = DigestInfo::new([99u8; 32], 512);
 
@@ -1168,6 +1202,7 @@ async fn worker_timesout_reschedules_running_job_test() -> Result<(), Error> {
         queued_timestamp: Some(insert_timestamp.into()),
         platform: Some(Platform::default()),
         worker_id: worker_id1.to_string(),
+        peer_hints: Vec::new(),
     };
 
     {
@@ -1205,14 +1240,19 @@ async fn worker_timesout_reschedules_running_job_test() -> Result<(), Error> {
         );
     }
 
-    // Keep worker 2 alive.
+    // Keep worker 2 alive at 2x timeout so it survives both phases.
     scheduler
-        .worker_keep_alive_received(&worker_id2, NOW_TIME + WORKER_TIMEOUT_S)
+        .worker_keep_alive_received(&worker_id2, NOW_TIME + 2 * WORKER_TIMEOUT_S)
         .await?;
-    // This should remove worker 1 (the one executing our job).
+    // Phase 1: quarantine worker 1 at 1x timeout (stops receiving new work).
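+    // Two-phase eviction timeline exercised below (per this patch):
+    //   t + 1*timeout : worker quarantined (keeps running jobs, gets no new ones)
+    //   t + 2*timeout : worker evicted (fully removed, running jobs rescheduled)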
scheduler .remove_timedout_workers(NOW_TIME + WORKER_TIMEOUT_S) .await?; + tokio::task::yield_now().await; + // Phase 2: evict worker 1 at 2x timeout (fully removed, job rescheduled). + scheduler + .remove_timedout_workers(NOW_TIME + 2 * WORKER_TIMEOUT_S) + .await?; tokio::task::yield_now().await; // Allow task<->worker matcher to run. { @@ -1269,6 +1309,8 @@ async fn update_action_sends_completed_result_to_client_test() -> Result<(), Err task_change_notify, MockInstantWrapped::default, None, + None, // cas_store + None, // locality_map ); let action_digest = DigestInfo::new([99u8; 32], 512); @@ -1372,6 +1414,8 @@ async fn update_action_sends_completed_result_after_disconnect() -> Result<(), E task_change_notify, MockInstantWrapped::default, None, + None, // cas_store + None, // locality_map ); let action_digest = DigestInfo::new([99u8; 32], 512); @@ -1493,6 +1537,8 @@ async fn update_action_with_wrong_worker_id_errors_test() -> Result<(), Error> { task_change_notify, MockInstantWrapped::default, None, + None, // cas_store + None, // locality_map ); let action_digest = DigestInfo::new([99u8; 32], 512); @@ -1603,6 +1649,8 @@ async fn does_not_crash_if_operation_joined_then_relaunched() -> Result<(), Erro task_change_notify, MockInstantWrapped::default, None, + None, // cas_store + None, // locality_map ); let action_digest = DigestInfo::new([99u8; 32], 512); @@ -1638,6 +1686,7 @@ async fn does_not_crash_if_operation_joined_then_relaunched() -> Result<(), Erro queued_timestamp: Some(insert_timestamp.into()), platform: Some(Platform::default()), worker_id: worker_id.clone().into(), + peer_hints: Vec::new(), })), }; let msg_for_worker = rx_from_worker.recv().await.unwrap(); @@ -1753,6 +1802,8 @@ async fn run_two_jobs_on_same_worker_with_platform_properties_restrictions() -> task_change_notify, MockInstantWrapped::default, None, + None, // cas_store + None, // locality_map ); let action_digest1 = DigestInfo::new([11u8; 32], 512); let action_digest2 = DigestInfo::new([99u8; 32], 512); @@ -1921,6 +1972,8 @@ async fn run_jobs_in_the_order_they_were_queued() -> Result<(), Error> { task_change_notify, MockInstantWrapped::default, None, + None, // cas_store + None, // locality_map ); let action_digest1 = DigestInfo::new([11u8; 32], 512); let action_digest2 = DigestInfo::new([99u8; 32], 512); @@ -1989,6 +2042,8 @@ async fn worker_retries_on_internal_error_and_fails_test() -> Result<(), Error> task_change_notify, MockInstantWrapped::default, None, + None, // cas_store + None, // locality_map ); let action_digest = DigestInfo::new([99u8; 32], 512); @@ -2151,6 +2206,8 @@ async fn ensure_scheduler_drops_inner_spawn() -> Result<(), Error> { task_change_notify, MockInstantWrapped::default, None, + None, // cas_store + None, // locality_map ); assert_eq!(dropped.load(Ordering::Relaxed), false); @@ -2181,6 +2238,8 @@ async fn ensure_task_or_worker_change_notification_received_test() -> Result<(), task_change_notify, MockInstantWrapped::default, None, + None, // cas_store + None, // locality_map ); let action_digest = DigestInfo::new([99u8; 32], 512); @@ -2267,6 +2326,8 @@ async fn client_reconnect_keeps_action_alive() -> Result<(), Error> { task_change_notify, MockInstantWrapped::default, None, + None, // cas_store + None, // locality_map ); let action_digest = DigestInfo::new([99u8; 32], 512); @@ -2346,6 +2407,8 @@ async fn client_timesout_job_then_same_action_requested() -> Result<(), Error> { task_change_notify, MockInstantWrapped::default, None, + None, // cas_store + None, // locality_map ); let 
action_digest = DigestInfo::new([99u8; 32], 512); @@ -2419,6 +2482,8 @@ async fn logs_when_no_workers_match() -> Result<(), Error> { task_change_notify, MockInstantWrapped::default, None, + None, // cas_store + None, // locality_map ); let action_digest = DigestInfo::new([99u8; 32], 512); @@ -2450,3 +2515,1150 @@ async fn logs_when_no_workers_match() -> Result<(), Error> { Ok(()) } + +#[nativelink_test] +async fn worker_fails_precondition_completes_immediately_test() -> Result<(), Error> { + let worker_id = WorkerId("worker_id".to_string()); + + let task_change_notify = Arc::new(Notify::new()); + let (scheduler, _worker_scheduler) = SimpleScheduler::new_with_callback( + &SimpleSpec { + max_job_retries: 5, + ..Default::default() + }, + memory_awaited_action_db_factory( + 0, + &task_change_notify.clone(), + MockInstantWrapped::default, + ), + || async move {}, + task_change_notify, + MockInstantWrapped::default, + None, + None, // cas_store + None, // locality_map + ); + let action_digest = DigestInfo::new([99u8; 32], 512); + + let mut rx_from_worker = + setup_new_worker(&scheduler, worker_id.clone(), PlatformProperties::default()).await?; + let insert_timestamp = make_system_time(1); + let mut action_listener = + setup_action(&scheduler, action_digest, HashMap::new(), insert_timestamp).await?; + + let operation_id = { + // Other tests check full data. We only care if we got StartAction. + let operation_id = match rx_from_worker.recv().await.unwrap().update { + Some(update_for_worker::Update::StartAction(exec)) => exec.operation_id, + v => panic!("Expected StartAction, got : {v:?}"), + }; + // Other tests check full data. We only care if client thinks we are Executing. + assert_eq!( + action_listener.changed().await.unwrap().0.stage, + ActionStage::Executing + ); + OperationId::from(operation_id.as_str()) + }; + + let err = make_err!(Code::FailedPrecondition, "Missing input blobs"); + // Send FailedPrecondition error from worker. This should NOT be retried + // even though max_job_retries is 5. + drop( + scheduler + .update_action( + &worker_id, + &operation_id, + UpdateOperationType::UpdateWithError(err.clone()), + ) + .await, + ); + + { + // Client should get notification saying the action completed (not re-queued). + let (action_state, _maybe_origin_metadata) = action_listener.changed().await.unwrap(); + let expected_action_state = ActionState { + // Name is a random string, so we ignore it and just make it the same. 
+            client_operation_id: action_state.client_operation_id.clone(),
+            stage: ActionStage::Completed(ActionResult {
+                output_files: Vec::default(),
+                output_folders: Vec::default(),
+                output_file_symlinks: Vec::default(),
+                output_directory_symlinks: Vec::default(),
+                exit_code: INTERNAL_ERROR_EXIT_CODE,
+                stdout_digest: DigestInfo::zero_digest(),
+                stderr_digest: DigestInfo::zero_digest(),
+                execution_metadata: ExecutionMetadata {
+                    worker: worker_id.to_string(),
+                    queued_timestamp: SystemTime::UNIX_EPOCH,
+                    worker_start_timestamp: SystemTime::UNIX_EPOCH,
+                    worker_completed_timestamp: SystemTime::UNIX_EPOCH,
+                    input_fetch_start_timestamp: SystemTime::UNIX_EPOCH,
+                    input_fetch_completed_timestamp: SystemTime::UNIX_EPOCH,
+                    execution_start_timestamp: SystemTime::UNIX_EPOCH,
+                    execution_completed_timestamp: SystemTime::UNIX_EPOCH,
+                    output_upload_start_timestamp: SystemTime::UNIX_EPOCH,
+                    output_upload_completed_timestamp: SystemTime::UNIX_EPOCH,
+                },
+                server_logs: HashMap::default(),
+                error: Some(err.clone()),
+                message: String::new(),
+            }),
+            action_digest: action_state.action_digest,
+            last_transition_timestamp: SystemTime::now(),
+        };
+        let mut received_state = action_state.as_ref().clone();
+        if let ActionStage::Completed(stage) = &mut received_state.stage {
+            if let Some(real_err) = &mut stage.error {
+                // Verify the error contains the FailedPrecondition message.
+                assert!(
+                    real_err.to_string().contains("Missing input blobs"),
+                    "{real_err} did not contain 'Missing input blobs'",
+                );
+                assert!(
+                    real_err
+                        .to_string()
+                        .contains("Job cancelled because it attempted to execute too many times"),
+                    "{real_err} did not contain 'Job cancelled because it attempted to execute too many times'",
+                );
+                *real_err = err;
+            }
+        } else {
+            panic!(
+                "Expected Completed (not re-queued), got : {:?}",
+                action_state.stage
+            );
+        }
+        assert_eq!(received_state, expected_action_state);
+    }
+
+    Ok(())
+}
+
+// ============================================================================
+// Locality-aware scheduling tests
+// ============================================================================
+
+/// Helper: adds a worker with a specific CAS endpoint (for locality mapping).
+async fn setup_new_worker_with_cas_endpoint(
+    scheduler: &SimpleScheduler,
+    worker_id: WorkerId,
+    props: PlatformProperties,
+    cas_endpoint: &str,
+) -> Result<mpsc::UnboundedReceiver<UpdateForWorker>, Error> {
+    let (tx, mut rx) = mpsc::unbounded_channel();
+    let worker = Worker::new_with_cas_endpoint(
+        worker_id.clone(),
+        props,
+        tx,
+        NOW_TIME,
+        0,
+        cas_endpoint.to_string(),
+    );
+    scheduler
+        .add_worker(worker)
+        .await
+        .err_tip(|| "Failed to add worker")?;
+    tokio::task::yield_now().await;
+    verify_initial_connection_message(worker_id, &mut rx).await;
+    Ok(rx)
+}
+
+/// Helper: schedules an action with a custom `input_root_digest`.
+async fn setup_action_with_input_root(
+    scheduler: &SimpleScheduler,
+    action_digest: DigestInfo,
+    input_root_digest: DigestInfo,
+    platform_properties: HashMap<String, String>,
+    insert_timestamp: SystemTime,
+) -> Result<Box<dyn ActionStateResult>, Error> {
+    let mut action_info = make_base_action_info(insert_timestamp, action_digest);
+    Arc::make_mut(&mut action_info).platform_properties = platform_properties;
+    Arc::make_mut(&mut action_info).input_root_digest = input_root_digest;
+    let client_id = OperationId::default();
+    let result = scheduler.add_action(client_id, action_info).await;
+    tokio::task::yield_now().await;
+    result
+}
+
+/// Helper: extracts the StartExecute from a worker receiver, returning
+/// (operation_id, start_execute).
+async fn recv_start_execute(
+    rx: &mut mpsc::UnboundedReceiver<UpdateForWorker>,
+) -> (String, StartExecute) {
+    match rx.recv().await.unwrap().update {
+        Some(update_for_worker::Update::StartAction(se)) => (se.operation_id.clone(), se),
+        v => panic!("Expected StartAction, got: {v:?}"),
+    }
+}
+
+#[nativelink_test]
+async fn locality_scoring_selects_best_worker_test() -> Result<(), Error> {
+    // Test: When a locality map is populated and CAS store has Directory protos,
+    // the worker with the most cached input bytes should be preferred.
+    let worker_id_a = WorkerId("worker_a".to_string());
+    let worker_id_b = WorkerId("worker_b".to_string());
+    let cas_endpoint_a = "worker-a:50081";
+    let cas_endpoint_b = "worker-b:50081";
+
+    // Create file digests that will be in the input tree.
+    let file_digest1 = DigestInfo::new([1u8; 32], 5000); // 5000 bytes
+    let file_digest2 = DigestInfo::new([2u8; 32], 3000); // 3000 bytes
+    let file_digest3 = DigestInfo::new([3u8; 32], 2000); // 2000 bytes
+
+    // Build a Directory proto with these files as the input root.
+    let input_root_dir = Directory {
+        files: vec![
+            FileNode {
+                name: "file1.txt".to_string(),
+                digest: Some(file_digest1.into()),
+                is_executable: false,
+                ..Default::default()
+            },
+            FileNode {
+                name: "file2.txt".to_string(),
+                digest: Some(file_digest2.into()),
+                is_executable: false,
+                ..Default::default()
+            },
+            FileNode {
+                name: "file3.txt".to_string(),
+                digest: Some(file_digest3.into()),
+                is_executable: false,
+                ..Default::default()
+            },
+        ],
+        ..Default::default()
+    };
+    let dir_bytes = input_root_dir.encode_to_vec();
+    let input_root_digest = DigestInfo::new(
+        {
+            use nativelink_util::digest_hasher::{DigestHasher, DigestHasherFunc};
+            let mut hasher = DigestHasherFunc::Sha256.hasher();
+            hasher.update(&dir_bytes);
+            let digest_info = hasher.finalize_digest();
+            **digest_info.packed_hash()
+        },
+        dir_bytes.len() as u64,
+    );
+
+    // Create a CAS store and populate it with the directory proto.
+    let cas_store_inner = MemoryStore::new(&MemorySpec::default());
+    let cas_store = Store::new(cas_store_inner.clone());
+    let key: nativelink_util::store_trait::StoreKey<'_> = input_root_digest.into();
+    cas_store
+        .update_oneshot(key, Bytes::from(dir_bytes))
+        .await?;
+
+    // Create and populate the locality map.
+    // Worker A has file1 (5000) and file3 (2000) = 7000 total.
+    // Worker B has file2 (3000) = 3000 total.
+    // Worker A should win.
+    let locality_map = new_shared_blob_locality_map();
+    {
+        let mut map = locality_map.write();
+        map.register_blobs(cas_endpoint_a, &[file_digest1, file_digest3]);
+        map.register_blobs(cas_endpoint_b, &[file_digest2]);
+    }
+
+    let task_change_notify = Arc::new(Notify::new());
+    let (scheduler, _worker_scheduler) = SimpleScheduler::new_with_callback(
+        &SimpleSpec::default(),
+        memory_awaited_action_db_factory(
+            0,
+            &task_change_notify.clone(),
+            MockInstantWrapped::default,
+        ),
+        || async move {},
+        task_change_notify,
+        MockInstantWrapped::default,
+        None,
+        Some(cas_store),
+        Some(locality_map),
+    );
+
+    let action_digest = DigestInfo::new([99u8; 32], 512);
+
+    // Add workers WITH cas_endpoints so the endpoint_to_worker map is populated.
+    let mut rx_a = setup_new_worker_with_cas_endpoint(
+        &scheduler,
+        worker_id_a.clone(),
+        PlatformProperties::default(),
+        cas_endpoint_a,
+    )
+    .await?;
+    let mut rx_b = setup_new_worker_with_cas_endpoint(
+        &scheduler,
+        worker_id_b.clone(),
+        PlatformProperties::default(),
+        cas_endpoint_b,
+    )
+    .await?;
+
+    // Schedule the action.
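// [Editor's aside — a minimal sketch of the scoring this test exercises;
// `locality_score` is hypothetical, not patch code, and it assumes
// DigestInfo's size_bytes() accessor.] If the scheduler sums the sizes of
// the input-tree files each worker's CAS endpoint has registered, worker A
// scores 5000 + 2000 = 7000 bytes and worker B scores 3000, so A should win
// the dispatch below:
fn locality_score(
    cached: &std::collections::HashSet<DigestInfo>,
    input_files: &[DigestInfo],
) -> u64 {
    input_files
        .iter()
        .filter(|digest| cached.contains(digest))
        .map(|digest| digest.size_bytes())
        .sum()
}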
+ let insert_timestamp = make_system_time(1); + let mut action_listener = setup_action_with_input_root( + &scheduler, + action_digest, + input_root_digest, + HashMap::new(), + insert_timestamp, + ) + .await?; + + // Worker A should get the action because it has the highest locality score (7000 > 3000). + let (selected_worker_id, _se) = tokio::select! { + msg = rx_a.recv() => { + let se = match msg.unwrap().update { + Some(update_for_worker::Update::StartAction(se)) => se, + v => panic!("Expected StartAction on worker_a, got: {v:?}"), + }; + (worker_id_a.clone(), se) + } + msg = rx_b.recv() => { + let se = match msg.unwrap().update { + Some(update_for_worker::Update::StartAction(se)) => se, + v => panic!("Expected StartAction on worker_b, got: {v:?}"), + }; + (worker_id_b.clone(), se) + } + }; + + assert_eq!( + selected_worker_id, worker_id_a, + "Locality scoring should select worker_a (7000 cached bytes > worker_b's 3000)" + ); + + assert_eq!( + action_listener.changed().await.unwrap().0.stage, + ActionStage::Executing + ); + + Ok(()) +} + +#[nativelink_test] +async fn no_peer_hints_without_resolved_tree_test() -> Result<(), Error> { + // Test: When a locality map has entries for the input_root_digest itself + // but there is no CAS store / no resolved tree, peer hints should be + // empty. The old fallback that generated a single hint for + // input_root_digest never worked because workers register individual + // file digests, not directory digests. + let worker_id = WorkerId("worker_recv".to_string()); + let peer_endpoint = "peer-worker:50081"; + + let input_root = DigestInfo::new([77u8; 32], 4096); + + // Create locality map and register the input_root_digest on a peer endpoint. + let locality_map = new_shared_blob_locality_map(); + { + let mut map = locality_map.write(); + map.register_blobs(peer_endpoint, &[input_root]); + } + + let task_change_notify = Arc::new(Notify::new()); + let (scheduler, _worker_scheduler) = SimpleScheduler::new_with_callback( + &SimpleSpec::default(), + memory_awaited_action_db_factory( + 0, + &task_change_notify.clone(), + MockInstantWrapped::default, + ), + || async move {}, + task_change_notify, + MockInstantWrapped::default, + None, + None, // no CAS store -- no resolved tree available + Some(locality_map), + ); + + let action_digest = DigestInfo::new([88u8; 32], 256); + + let mut rx_from_worker = + setup_new_worker(&scheduler, worker_id.clone(), PlatformProperties::default()).await?; + + // Schedule action with a specific input_root. + let insert_timestamp = make_system_time(1); + let _action_listener = setup_action_with_input_root( + &scheduler, + action_digest, + input_root, + HashMap::new(), + insert_timestamp, + ) + .await?; + + // Worker should receive StartAction with empty peer_hints (no resolved tree). + let (_, start_execute) = recv_start_execute(&mut rx_from_worker).await; + + assert!( + start_execute.peer_hints.is_empty(), + "peer_hints should be empty without a resolved tree (directory digests are not useful)" + ); + + Ok(()) +} + +#[nativelink_test] +async fn peer_hints_from_resolved_tree_test() -> Result<(), Error> { + // Test: When a CAS store has a Directory proto for the input root, and + // the locality map has entries for individual file digests, the + // StartExecute message should contain per-file peer hints sorted by + // size descending. + let worker_id = WorkerId("worker_recv".to_string()); + let peer_endpoint = "peer-worker:50081"; + + // Create file digests. 
+ let file_large = DigestInfo::new([10u8; 32], 10000); + let file_small = DigestInfo::new([11u8; 32], 500); + + // Build Directory proto. + let input_root_dir = Directory { + files: vec![ + FileNode { + name: "large.bin".to_string(), + digest: Some(file_large.into()), + is_executable: false, + ..Default::default() + }, + FileNode { + name: "small.txt".to_string(), + digest: Some(file_small.into()), + is_executable: false, + ..Default::default() + }, + ], + ..Default::default() + }; + let dir_bytes = input_root_dir.encode_to_vec(); + let input_root_digest = DigestInfo::new( + { + use nativelink_util::digest_hasher::{DigestHasher, DigestHasherFunc}; + let mut hasher = DigestHasherFunc::Sha256.hasher(); + hasher.update(&dir_bytes); + let digest_info = hasher.finalize_digest(); + **digest_info.packed_hash() + }, + dir_bytes.len() as u64, + ); + + // Create and populate CAS store. + let cas_store_inner = MemoryStore::new(&MemorySpec::default()); + let cas_store = Store::new(cas_store_inner); + let key: nativelink_util::store_trait::StoreKey<'_> = input_root_digest.into(); + cas_store + .update_oneshot(key, Bytes::from(dir_bytes)) + .await?; + + // Create locality map with file blobs registered on a peer. + let locality_map = new_shared_blob_locality_map(); + { + let mut map = locality_map.write(); + map.register_blobs(peer_endpoint, &[file_large, file_small]); + } + + let task_change_notify = Arc::new(Notify::new()); + let (scheduler, _worker_scheduler) = SimpleScheduler::new_with_callback( + &SimpleSpec::default(), + memory_awaited_action_db_factory( + 0, + &task_change_notify.clone(), + MockInstantWrapped::default, + ), + || async move {}, + task_change_notify, + MockInstantWrapped::default, + None, + Some(cas_store), + Some(locality_map), + ); + + let action_digest = DigestInfo::new([99u8; 32], 512); + + let mut rx_from_worker = + setup_new_worker(&scheduler, worker_id.clone(), PlatformProperties::default()).await?; + + let insert_timestamp = make_system_time(1); + let _action_listener = setup_action_with_input_root( + &scheduler, + action_digest, + input_root_digest, + HashMap::new(), + insert_timestamp, + ) + .await?; + + let (_, start_execute) = recv_start_execute(&mut rx_from_worker).await; + + // Should have per-file peer hints (one per file in the tree). + assert_eq!( + start_execute.peer_hints.len(), + 2, + "Should have 2 peer hints (one per file in the input tree)" + ); + + // Hints should be sorted by size descending (large first). + let first_hint_digest = DigestInfo::try_from( + start_execute.peer_hints[0] + .digest + .as_ref() + .expect("hint should have digest"), + ) + .unwrap(); + let second_hint_digest = DigestInfo::try_from( + start_execute.peer_hints[1] + .digest + .as_ref() + .expect("hint should have digest"), + ) + .unwrap(); + + assert_eq!( + first_hint_digest, file_large, + "First hint should be the largest file" + ); + assert_eq!( + second_hint_digest, file_small, + "Second hint should be the smaller file" + ); + + // Both hints should reference the peer endpoint. + for hint in &start_execute.peer_hints { + assert!( + hint.peer_endpoints.contains(&peer_endpoint.to_string()), + "Each hint should reference the peer endpoint" + ); + } + + Ok(()) +} + +#[nativelink_test] +async fn fallback_to_lru_when_no_locality_data_test() -> Result<(), Error> { + // Test: When a locality map and CAS store are configured but contain NO + // blob data for the action's input tree, the scheduler should fall back + // to the normal LRU worker selection without errors. 
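// [Editor's aside on the preceding peer_hints_from_resolved_tree_test —
// hypothetical sketch, not patch code.] That test asserts hints arrive
// largest-first, so a plausible implementation sorts the resolved file
// digests by size descending before emitting hints:
fn order_hint_digests(mut files: Vec<DigestInfo>) -> Vec<DigestInfo> {
    files.sort_by(|a, b| b.size_bytes().cmp(&a.size_bytes()));
    files // e.g. large.bin (10000 bytes) before small.txt (500 bytes)
}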
+ let worker_id_a = WorkerId("worker_a".to_string()); + let worker_id_b = WorkerId("worker_b".to_string()); + let cas_endpoint_a = "worker-a:50081"; + let cas_endpoint_b = "worker-b:50081"; + + // Build a Directory proto with files, but do NOT register those files + // in the locality map -- simulating a fresh deployment or cold start. + let file_digest1 = DigestInfo::new([30u8; 32], 4000); + let file_digest2 = DigestInfo::new([31u8; 32], 2000); + + let input_root_dir = Directory { + files: vec![ + FileNode { + name: "cold_file1.bin".to_string(), + digest: Some(file_digest1.into()), + is_executable: false, + ..Default::default() + }, + FileNode { + name: "cold_file2.bin".to_string(), + digest: Some(file_digest2.into()), + is_executable: false, + ..Default::default() + }, + ], + ..Default::default() + }; + let dir_bytes = input_root_dir.encode_to_vec(); + let input_root_digest = DigestInfo::new( + { + use nativelink_util::digest_hasher::{DigestHasher, DigestHasherFunc}; + let mut hasher = DigestHasherFunc::Sha256.hasher(); + hasher.update(&dir_bytes); + let digest_info = hasher.finalize_digest(); + **digest_info.packed_hash() + }, + dir_bytes.len() as u64, + ); + + // Create CAS store with the directory proto so tree resolution succeeds. + let cas_store_inner = MemoryStore::new(&MemorySpec::default()); + let cas_store = Store::new(cas_store_inner); + let key: nativelink_util::store_trait::StoreKey<'_> = input_root_digest.into(); + cas_store + .update_oneshot(key, Bytes::from(dir_bytes)) + .await?; + + // Create an EMPTY locality map -- no blobs registered on any endpoint. + let locality_map = new_shared_blob_locality_map(); + + let task_change_notify = Arc::new(Notify::new()); + let (scheduler, _worker_scheduler) = SimpleScheduler::new_with_callback( + &SimpleSpec::default(), + memory_awaited_action_db_factory( + 0, + &task_change_notify.clone(), + MockInstantWrapped::default, + ), + || async move {}, + task_change_notify, + MockInstantWrapped::default, + None, + Some(cas_store), + Some(locality_map), + ); + + let action_digest = DigestInfo::new([99u8; 32], 512); + + // Add two workers with CAS endpoints. + let mut rx_a = setup_new_worker_with_cas_endpoint( + &scheduler, + worker_id_a.clone(), + PlatformProperties::default(), + cas_endpoint_a, + ) + .await?; + let mut rx_b = setup_new_worker_with_cas_endpoint( + &scheduler, + worker_id_b.clone(), + PlatformProperties::default(), + cas_endpoint_b, + ) + .await?; + + // Schedule action with the input root. + let insert_timestamp = make_system_time(1); + let mut action_listener = setup_action_with_input_root( + &scheduler, + action_digest, + input_root_digest, + HashMap::new(), + insert_timestamp, + ) + .await?; + + // One of the workers should receive the action (LRU fallback). + // We don't care which worker gets it -- just that it succeeds. + let (selected_worker_id, start_execute) = tokio::select! { + msg = rx_a.recv() => { + let se = match msg.unwrap().update { + Some(update_for_worker::Update::StartAction(se)) => se, + v => panic!("Expected StartAction on worker_a, got: {v:?}"), + }; + (worker_id_a.clone(), se) + } + msg = rx_b.recv() => { + let se = match msg.unwrap().update { + Some(update_for_worker::Update::StartAction(se)) => se, + v => panic!("Expected StartAction on worker_b, got: {v:?}"), + }; + (worker_id_b.clone(), se) + } + }; + + // Verify the action was dispatched to one of the two workers. 
+ assert!( + selected_worker_id == worker_id_a || selected_worker_id == worker_id_b, + "Action should be dispatched to one of the available workers via LRU fallback" + ); + + // With no locality data, there should be no peer hints (no blobs are registered). + assert!( + start_execute.peer_hints.is_empty(), + "peer_hints should be empty when locality map has no data for input files, got {} hints", + start_execute.peer_hints.len() + ); + + // Client should see the Executing state. + assert_eq!( + action_listener.changed().await.unwrap().0.stage, + ActionStage::Executing + ); + + Ok(()) +} + +#[nativelink_test] +async fn locality_scoring_with_empty_map_and_no_cas_store_test() -> Result<(), Error> { + // Test: When locality_map is provided but cas_store is None (tree + // resolution impossible), scheduling should still work via LRU fallback. + // This covers the path where resolve_input_tree returns None. + let worker_id = WorkerId("worker_solo".to_string()); + + // Create locality map but don't populate it. + let locality_map = new_shared_blob_locality_map(); + + let task_change_notify = Arc::new(Notify::new()); + let (scheduler, _worker_scheduler) = SimpleScheduler::new_with_callback( + &SimpleSpec::default(), + memory_awaited_action_db_factory( + 0, + &task_change_notify.clone(), + MockInstantWrapped::default, + ), + || async move {}, + task_change_notify, + MockInstantWrapped::default, + None, + None, // No CAS store -- tree resolution returns None + Some(locality_map), + ); + + let action_digest = DigestInfo::new([55u8; 32], 256); + + let mut rx_from_worker = + setup_new_worker(&scheduler, worker_id.clone(), PlatformProperties::default()).await?; + + let insert_timestamp = make_system_time(1); + let mut action_listener = + setup_action(&scheduler, action_digest, HashMap::new(), insert_timestamp).await?; + + // Worker should receive the action via normal LRU selection. + let (_, start_execute) = recv_start_execute(&mut rx_from_worker).await; + + // No peer hints should be generated (no tree, no locality data). + assert!( + start_execute.peer_hints.is_empty(), + "peer_hints should be empty when no CAS store is configured" + ); + + assert_eq!( + action_listener.changed().await.unwrap().0.stage, + ActionStage::Executing + ); + + Ok(()) +} + +#[nativelink_test] +async fn locality_scoring_partial_data_still_selects_best_worker_test() -> Result<(), Error> { + // Test: When only SOME workers have locality data, the scoring should + // still pick the one with the most cached bytes, and the worker with + // no cached data should get a score of 0 (falling behind). + let worker_id_a = WorkerId("worker_a".to_string()); + let worker_id_b = WorkerId("worker_b".to_string()); + let cas_endpoint_a = "worker-a:50081"; + let cas_endpoint_b = "worker-b:50081"; + + // Files in the input tree. 
+ let file_digest1 = DigestInfo::new([40u8; 32], 8000); + let file_digest2 = DigestInfo::new([41u8; 32], 1000); + + let input_root_dir = Directory { + files: vec![ + FileNode { + name: "big.dat".to_string(), + digest: Some(file_digest1.into()), + is_executable: false, + ..Default::default() + }, + FileNode { + name: "small.dat".to_string(), + digest: Some(file_digest2.into()), + is_executable: false, + ..Default::default() + }, + ], + ..Default::default() + }; + let dir_bytes = input_root_dir.encode_to_vec(); + let input_root_digest = DigestInfo::new( + { + use nativelink_util::digest_hasher::{DigestHasher, DigestHasherFunc}; + let mut hasher = DigestHasherFunc::Sha256.hasher(); + hasher.update(&dir_bytes); + let digest_info = hasher.finalize_digest(); + **digest_info.packed_hash() + }, + dir_bytes.len() as u64, + ); + + // Create CAS store with directory proto. + let cas_store_inner = MemoryStore::new(&MemorySpec::default()); + let cas_store = Store::new(cas_store_inner); + let key: nativelink_util::store_trait::StoreKey<'_> = input_root_digest.into(); + cas_store + .update_oneshot(key, Bytes::from(dir_bytes)) + .await?; + + // Only worker B has file_digest1 (8000 bytes). Worker A has nothing. + let locality_map = new_shared_blob_locality_map(); + { + let mut map = locality_map.write(); + map.register_blobs(cas_endpoint_b, &[file_digest1]); + } + + let task_change_notify = Arc::new(Notify::new()); + let (scheduler, _worker_scheduler) = SimpleScheduler::new_with_callback( + &SimpleSpec::default(), + memory_awaited_action_db_factory( + 0, + &task_change_notify.clone(), + MockInstantWrapped::default, + ), + || async move {}, + task_change_notify, + MockInstantWrapped::default, + None, + Some(cas_store), + Some(locality_map), + ); + + let action_digest = DigestInfo::new([99u8; 32], 512); + + let mut rx_a = setup_new_worker_with_cas_endpoint( + &scheduler, + worker_id_a.clone(), + PlatformProperties::default(), + cas_endpoint_a, + ) + .await?; + let mut rx_b = setup_new_worker_with_cas_endpoint( + &scheduler, + worker_id_b.clone(), + PlatformProperties::default(), + cas_endpoint_b, + ) + .await?; + + let insert_timestamp = make_system_time(1); + let mut action_listener = setup_action_with_input_root( + &scheduler, + action_digest, + input_root_digest, + HashMap::new(), + insert_timestamp, + ) + .await?; + + // Worker B should be selected (8000 cached bytes vs. 0 for worker A). + let (selected_worker_id, _se) = tokio::select! { + msg = rx_a.recv() => { + let se = match msg.unwrap().update { + Some(update_for_worker::Update::StartAction(se)) => se, + v => panic!("Expected StartAction on worker_a, got: {v:?}"), + }; + (worker_id_a.clone(), se) + } + msg = rx_b.recv() => { + let se = match msg.unwrap().update { + Some(update_for_worker::Update::StartAction(se)) => se, + v => panic!("Expected StartAction on worker_b, got: {v:?}"), + }; + (worker_id_b.clone(), se) + } + }; + + assert_eq!( + selected_worker_id, worker_id_b, + "Locality scoring should select worker_b (8000 cached bytes vs. worker_a's 0)" + ); + + assert_eq!( + action_listener.changed().await.unwrap().0.stage, + ActionStage::Executing + ); + + Ok(()) +} + +// --------------------------------------------------------------- +// CPU-load-aware scheduling tests +// --------------------------------------------------------------- + +#[nativelink_test] +async fn cpu_load_update_worker_load_stores_correctly() -> Result<(), Error> { + // Verify that update_worker_load stores the load on the worker and + // influences scheduling. 
We set load on a single worker, submit an + // action, and confirm the worker still receives it (proving the + // update didn't break anything and the worker is still viable). + let worker_id = WorkerId("worker_load_test".to_string()); + + let task_change_notify = Arc::new(Notify::new()); + let (scheduler, _worker_scheduler) = SimpleScheduler::new_with_callback( + &SimpleSpec::default(), + memory_awaited_action_db_factory( + 0, + &task_change_notify.clone(), + MockInstantWrapped::default, + ), + || async move {}, + task_change_notify, + MockInstantWrapped::default, + None, + None, // cas_store + None, // locality_map + ); + + let mut rx = setup_new_worker( + &scheduler, + worker_id.clone(), + PlatformProperties::default(), + ) + .await?; + + // Update the worker's CPU load. + scheduler.update_worker_load(&worker_id, 42).await?; + + // Submit an action — the single worker should still be selected. + let action_digest = DigestInfo::new([10u8; 32], 256); + let insert_timestamp = make_system_time(1); + let mut action_listener = + setup_action(&scheduler, action_digest, HashMap::new(), insert_timestamp).await?; + + // Worker should receive the action. + let (_op_id, _se) = recv_start_execute(&mut rx).await; + + assert_eq!( + action_listener.changed().await.unwrap().0.stage, + ActionStage::Executing + ); + + Ok(()) +} + +#[nativelink_test] +async fn cpu_load_lightest_loaded_worker_gets_picked() -> Result<(), Error> { + // Create 3 workers with different cpu_load_pct values. + // Worker A=80, Worker B=20, Worker C=50. + // Worker B (lightest load) should be selected for the action. + let worker_id_a = WorkerId("worker_a".to_string()); + let worker_id_b = WorkerId("worker_b".to_string()); + let worker_id_c = WorkerId("worker_c".to_string()); + + let task_change_notify = Arc::new(Notify::new()); + let (scheduler, _worker_scheduler) = SimpleScheduler::new_with_callback( + &SimpleSpec::default(), + memory_awaited_action_db_factory( + 0, + &task_change_notify.clone(), + MockInstantWrapped::default, + ), + || async move {}, + task_change_notify, + MockInstantWrapped::default, + None, + None, // cas_store + None, // locality_map + ); + + // Add all 3 workers (no queued actions yet, so no matching happens). + let mut rx_a = setup_new_worker( + &scheduler, + worker_id_a.clone(), + PlatformProperties::default(), + ) + .await?; + let mut rx_b = setup_new_worker( + &scheduler, + worker_id_b.clone(), + PlatformProperties::default(), + ) + .await?; + let mut rx_c = setup_new_worker( + &scheduler, + worker_id_c.clone(), + PlatformProperties::default(), + ) + .await?; + + // Set CPU loads: A=80, B=20, C=50. + scheduler.update_worker_load(&worker_id_a, 80).await?; + scheduler.update_worker_load(&worker_id_b, 20).await?; + scheduler.update_worker_load(&worker_id_c, 50).await?; + + // Submit an action. + let action_digest = DigestInfo::new([20u8; 32], 512); + let insert_timestamp = make_system_time(1); + let mut action_listener = + setup_action(&scheduler, action_digest, HashMap::new(), insert_timestamp).await?; + + // Determine which worker received the action. + let (selected_worker_id, _se) = tokio::select! 
{ + msg = rx_a.recv() => { + let se = match msg.unwrap().update { + Some(update_for_worker::Update::StartAction(se)) => se, + v => panic!("Expected StartAction on worker_a, got: {v:?}"), + }; + (worker_id_a.clone(), se) + } + msg = rx_b.recv() => { + let se = match msg.unwrap().update { + Some(update_for_worker::Update::StartAction(se)) => se, + v => panic!("Expected StartAction on worker_b, got: {v:?}"), + }; + (worker_id_b.clone(), se) + } + msg = rx_c.recv() => { + let se = match msg.unwrap().update { + Some(update_for_worker::Update::StartAction(se)) => se, + v => panic!("Expected StartAction on worker_c, got: {v:?}"), + }; + (worker_id_c.clone(), se) + } + }; + + assert_eq!( + selected_worker_id, worker_id_b, + "Worker B (cpu_load_pct=20) should be selected as lightest-loaded" + ); + + assert_eq!( + action_listener.changed().await.unwrap().0.stage, + ActionStage::Executing + ); + + Ok(()) +} + +#[nativelink_test] +async fn cpu_load_unknown_zero_sorted_last() -> Result<(), Error> { + // Create 2 workers: one with cpu_load_pct=60 (known) and one with + // cpu_load_pct=0 (unknown). The worker with known load should be + // selected over the unknown one, even though 0 < 60 numerically. + let worker_id_known = WorkerId("worker_known".to_string()); + let worker_id_unknown = WorkerId("worker_unknown".to_string()); + + let task_change_notify = Arc::new(Notify::new()); + let (scheduler, _worker_scheduler) = SimpleScheduler::new_with_callback( + &SimpleSpec::default(), + memory_awaited_action_db_factory( + 0, + &task_change_notify.clone(), + MockInstantWrapped::default, + ), + || async move {}, + task_change_notify, + MockInstantWrapped::default, + None, + None, // cas_store + None, // locality_map + ); + + let mut rx_known = setup_new_worker( + &scheduler, + worker_id_known.clone(), + PlatformProperties::default(), + ) + .await?; + let mut rx_unknown = setup_new_worker( + &scheduler, + worker_id_unknown.clone(), + PlatformProperties::default(), + ) + .await?; + + // Set only one worker's load; the other stays at default 0 (unknown). + scheduler.update_worker_load(&worker_id_known, 60).await?; + // worker_unknown stays at cpu_load_pct=0. + + // Submit an action. + let action_digest = DigestInfo::new([30u8; 32], 512); + let insert_timestamp = make_system_time(1); + let mut action_listener = + setup_action(&scheduler, action_digest, HashMap::new(), insert_timestamp).await?; + + // Determine which worker received the action. + let (selected_worker_id, _se) = tokio::select! { + msg = rx_known.recv() => { + let se = match msg.unwrap().update { + Some(update_for_worker::Update::StartAction(se)) => se, + v => panic!("Expected StartAction on worker_known, got: {v:?}"), + }; + (worker_id_known.clone(), se) + } + msg = rx_unknown.recv() => { + let se = match msg.unwrap().update { + Some(update_for_worker::Update::StartAction(se)) => se, + v => panic!("Expected StartAction on worker_unknown, got: {v:?}"), + }; + (worker_id_unknown.clone(), se) + } + }; + + assert_eq!( + selected_worker_id, worker_id_known, + "Worker with known load (60) should be preferred over unknown (0)" + ); + + assert_eq!( + action_listener.changed().await.unwrap().0.stage, + ActionStage::Executing + ); + + Ok(()) +} + +#[nativelink_test] +async fn cpu_load_falls_back_to_lru_when_no_load_data() -> Result<(), Error> { + // Create 2 workers with cpu_load_pct=0 on both (no load data). + // Scheduling should still work via LRU/MRU fallback. 
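// [Editor's aside on the preceding test — hypothetical sketch, not patch
// code.] A cpu_load_pct of 0 means "unknown", and
// cpu_load_unknown_zero_sorted_last expects unknown to lose to any known
// load even though 0 < 60 numerically. One way to get that ordering is to
// map 0 to the maximum before comparing:
fn load_sort_key(cpu_load_pct: u32) -> u32 {
    if cpu_load_pct == 0 { u32::MAX } else { cpu_load_pct }
}
// Selecting with min_by_key(|w| load_sort_key(w.cpu_load_pct)) then prefers
// the worker reporting 60 over the worker still at the default 0.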
+ let worker_id_1 = WorkerId("worker_1".to_string()); + let worker_id_2 = WorkerId("worker_2".to_string()); + + let task_change_notify = Arc::new(Notify::new()); + let (scheduler, _worker_scheduler) = SimpleScheduler::new_with_callback( + &SimpleSpec::default(), + memory_awaited_action_db_factory( + 0, + &task_change_notify.clone(), + MockInstantWrapped::default, + ), + || async move {}, + task_change_notify, + MockInstantWrapped::default, + None, + None, // cas_store + None, // locality_map + ); + + // Add both workers (both have cpu_load_pct=0 by default). + let mut rx_1 = setup_new_worker( + &scheduler, + worker_id_1.clone(), + PlatformProperties::default(), + ) + .await?; + let mut rx_2 = setup_new_worker( + &scheduler, + worker_id_2.clone(), + PlatformProperties::default(), + ) + .await?; + + // Neither worker has load data — cpu_load_pct stays at 0. + + // Submit an action. It should be assigned to one of the workers + // via LRU fallback (the first in LRU order). + let action_digest = DigestInfo::new([40u8; 32], 512); + let insert_timestamp = make_system_time(1); + let mut action_listener = + setup_action(&scheduler, action_digest, HashMap::new(), insert_timestamp).await?; + + // Either worker is acceptable — just verify one was selected. + let (selected_worker_id, _se) = tokio::select! { + msg = rx_1.recv() => { + let se = match msg.unwrap().update { + Some(update_for_worker::Update::StartAction(se)) => se, + v => panic!("Expected StartAction on worker_1, got: {v:?}"), + }; + (worker_id_1.clone(), se) + } + msg = rx_2.recv() => { + let se = match msg.unwrap().update { + Some(update_for_worker::Update::StartAction(se)) => se, + v => panic!("Expected StartAction on worker_2, got: {v:?}"), + }; + (worker_id_2.clone(), se) + } + }; + + // Verify a worker was actually selected (the assert_eq on stage below + // also proves this, but let's be explicit). 
+ assert!( + selected_worker_id == worker_id_1 || selected_worker_id == worker_id_2, + "One of the workers should have been selected via LRU fallback" + ); + + assert_eq!( + action_listener.changed().await.unwrap().0.stage, + ActionStage::Executing + ); + + Ok(()) +} diff --git a/nativelink-scheduler/tests/utils/scheduler_utils.rs b/nativelink-scheduler/tests/utils/scheduler_utils.rs index 7492efe6e..f7986f985 100644 --- a/nativelink-scheduler/tests/utils/scheduler_utils.rs +++ b/nativelink-scheduler/tests/utils/scheduler_utils.rs @@ -143,5 +143,11 @@ pub(crate) fn update_eq( } _ => false, }, + update_for_worker::Update::TouchBlobs(actual_update) => match expected_update { + update_for_worker::Update::TouchBlobs(expected_update) => { + expected_update == actual_update + } + _ => false, + }, } } diff --git a/nativelink-service/Cargo.toml b/nativelink-service/Cargo.toml index 3f14715d1..0352fad23 100644 --- a/nativelink-service/Cargo.toml +++ b/nativelink-service/Cargo.toml @@ -20,14 +20,14 @@ bytes = { version = "1.10.1", default-features = false } futures = { version = "0.3.31", default-features = false } http-body-util = { version = "0.1.3", default-features = false } hyper = { version = "1.6.0", default-features = false } -opentelemetry = { version = "0.29.1", default-features = false } -opentelemetry-semantic-conventions = { version = "0.29.0", default-features = false, features = [ +opentelemetry = { version = "0.31.0", default-features = false } +opentelemetry-semantic-conventions = { version = "0.31.0", default-features = false, features = [ "default", "semconv_experimental", ] } parking_lot = { version = "0.12.3", default-features = false } -prost = { version = "0.13.5", default-features = false } -prost-types = { version = "0.13.5", default-features = false, features = [ +prost = { version = "0.14.3", default-features = false } +prost-types = { version = "0.14.3", default-features = false, features = [ "std", ] } rand = { version = "0.9.0", default-features = false, features = [ @@ -43,11 +43,12 @@ tokio = { version = "1.44.1", features = [ tokio-stream = { version = "0.1.17", features = [ "fs", ], default-features = false } -tonic = { version = "0.13.0", features = [ +tonic = { version = "0.14.5", features = [ "gzip", "router", - "tls-ring", + "tls-aws-lc", "transport", + "zstd", ], default-features = false } tower = { version = "0.5.2", default-features = false } tracing = { version = "0.1.41", default-features = false } @@ -67,11 +68,12 @@ hyper-util = { version = "0.1.11", default-features = false } pretty_assertions = { version = "1.4.1", features = [ "std", ], default-features = false } -prost-types = { version = "0.13.5", default-features = false } +prost-types = { version = "0.14.3", default-features = false } +tonic-prost = { version = "0.14.5", default-features = false } serde_json = { version = "1.0.140", default-features = false, features = [ "std", ] } -sha2 = { version = "0.10.8", default-features = false } +sha2 = { version = "0.10.8", default-features = false, features = ["asm"] } tracing-test = { version = "0.2.5", default-features = false, features = [ "no-env-filter", ] } diff --git a/nativelink-service/src/ac_server.rs b/nativelink-service/src/ac_server.rs index 29db64d14..b9e190aef 100644 --- a/nativelink-service/src/ac_server.rs +++ b/nativelink-service/src/ac_server.rs @@ -30,11 +30,13 @@ use nativelink_store::grpc_store::GrpcStore; use nativelink_store::store_manager::StoreManager; use nativelink_util::common::DigestInfo; use 
nativelink_util::digest_hasher::make_ctx_for_hash_func;
+use nativelink_util::log_utils::throughput_mbps;
+use nativelink_util::stall_detector::StallGuard;
 use nativelink_util::store_trait::{Store, StoreLike};
 use opentelemetry::context::FutureExt;
 use prost::Message;
 use tonic::{Request, Response, Status};
-use tracing::{Instrument, Level, error, error_span, instrument};
+use tracing::{Instrument, Level, error, error_span, info, instrument};

 #[derive(Debug, Clone)]
 pub struct AcStoreInfo {
@@ -104,9 +106,21 @@ impl AcServer {
             return grpc_store.get_action_result(Request::new(request)).await;
         }

+        let get_start = std::time::Instant::now();
         let res = get_and_decode_digest::<ActionResult>(&store_info.store, digest.into()).await;
         match res {
-            Ok(action_result) => Ok(Response::new(action_result)),
+            Ok(action_result) => {
+                let elapsed = get_start.elapsed();
+                let size_bytes = action_result.encoded_len() as u64;
+                info!(
+                    ?digest,
+                    size_bytes,
+                    elapsed_ms = elapsed.as_millis() as u64,
+                    throughput_mbps = format!("{:.1}", throughput_mbps(size_bytes, elapsed)),
+                    "AC read completed",
+                );
+                Ok(Response::new(action_result))
+            }
             Err(mut e) => {
                 if e.code == Code::NotFound {
                     // `get_action_result` is frequent to get NotFound errors, so remove all
@@ -158,11 +172,35 @@ impl AcServer {
             .encode(&mut store_data)
             .err_tip(|| "Provided ActionResult could not be serialized")?;

-        store_info
+        let size_bytes = store_data.len() as u64;
+        let start = std::time::Instant::now();
+        let result = store_info
             .store
             .update_oneshot(digest, store_data.freeze())
             .await
-            .err_tip(|| "Failed to update in action cache")?;
+            .err_tip(|| "Failed to update in action cache");
+        let elapsed = start.elapsed();
+        match &result {
+            Ok(()) => {
+                info!(
+                    ?digest,
+                    size_bytes,
+                    elapsed_ms = elapsed.as_millis() as u64,
+                    throughput_mbps = format!("{:.1}", throughput_mbps(size_bytes, elapsed)),
+                    "AC write completed",
+                );
+            }
+            Err(e) => {
+                error!(
+                    ?digest,
+                    size_bytes,
+                    elapsed_ms = elapsed.as_millis() as u64,
+                    ?e,
+                    "AC write failed",
+                );
+            }
+        }
+        result?;
         Ok(Response::new(action_result))
     }
 }
@@ -181,6 +219,10 @@ impl ActionCache for AcServer {
     ) -> Result<Response<ActionResult>, Status> {
         let request = grpc_request.into_inner();
         let digest_function = request.digest_function;
+        let _stall_guard = StallGuard::new(
+            nativelink_util::stall_detector::DEFAULT_STALL_THRESHOLD,
+            "AC::get_action_result",
+        );
         let result = self
             .inner_get_action_result(request)
             .instrument(error_span!("ac_server_get_action_result"))
             .await
@@ -201,7 +243,7 @@
     #[instrument(
         err,
-        ret(level = Level::INFO),
+        ret(level = Level::DEBUG),
         level = Level::ERROR,
         skip_all,
         fields(request = ?grpc_request.get_ref())
     )]
@@ -212,6 +254,10 @@
     ) -> Result<Response<ActionResult>, Status> {
         let request = grpc_request.into_inner();
         let digest_function = request.digest_function;
+        let _stall_guard = StallGuard::new(
+            nativelink_util::stall_detector::DEFAULT_STALL_THRESHOLD,
+            "AC::update_action_result",
+        );
         self.inner_update_action_result(request)
             .instrument(error_span!("ac_server_update_action_result"))
             .with_context(
diff --git a/nativelink-service/src/bytestream_server.rs b/nativelink-service/src/bytestream_server.rs
index d47b3cd9e..403ce7e98 100644
--- a/nativelink-service/src/bytestream_server.rs
+++ b/nativelink-service/src/bytestream_server.rs
@@ -16,6 +16,7 @@ use core::convert::Into;
 use core::fmt::{Debug, Formatter};
 use core::pin::Pin;
 use core::sync::atomic::{AtomicU64, Ordering};
+use core::task::{Context, Poll};
 use core::time::Duration;
 use
std::collections::HashMap;
 use std::collections::hash_map::Entry;

@@ -41,16 +42,18 @@ use nativelink_proto::google::bytestream::{
 use nativelink_store::grpc_store::GrpcStore;
 use nativelink_store::store_manager::StoreManager;
 use nativelink_util::buf_channel::{
-    DropCloserReadHalf, DropCloserWriteHalf, make_buf_channel_pair,
+    DropCloserReadHalf, DropCloserWriteHalf, make_buf_channel_pair_with_size,
 };
 use nativelink_util::common::DigestInfo;
+use nativelink_util::log_utils::throughput_mbps;
+use nativelink_util::stall_detector::StallGuard;
 use nativelink_util::digest_hasher::{
     DigestHasherFunc, default_digest_hasher_func, make_ctx_for_hash_func,
 };
 use nativelink_util::proto_stream_utils::WriteRequestStreamWrapper;
 use nativelink_util::resource_info::ResourceInfo;
 use nativelink_util::spawn;
-use nativelink_util::store_trait::{Store, StoreLike, StoreOptimizations, UploadSizeInfo};
+use nativelink_util::store_trait::{IS_WORKER_REQUEST, Store, StoreLike, StoreOptimizations, UploadSizeInfo};
 use nativelink_util::task::JoinHandleDropGuard;
 use opentelemetry::context::FutureExt;
 use parking_lot::Mutex;
@@ -62,7 +65,7 @@ use tracing::{Instrument, Level, debug, error, error_span, info, instrument, tra

 const DEFAULT_PERSIST_STREAM_ON_DISCONNECT_TIMEOUT: Duration = Duration::from_secs(60);

 /// If this value changes update the documentation in the config definition.
-const DEFAULT_MAX_BYTES_PER_STREAM: usize = 64 * 1024;
+const DEFAULT_MAX_BYTES_PER_STREAM: usize = 3 * 1024 * 1024;

 /// Metrics for `ByteStream` server operations.
 /// Tracks upload/download activity, throughput, and latency.
@@ -271,6 +274,75 @@ impl Debug for InstanceInfo {

 type ReadStream = Pin<Box<dyn Stream<Item = Result<ReadResponse, Status>> + Send + 'static>>;
 type StoreUpdateFuture = Pin<Box<dyn Future<Output = Result<(), Error>> + Send + 'static>>;

+/// Wrapper around a `ReadStream` that logs total bytes and elapsed time when
+/// the stream completes (yields `None`) or is dropped before completion.
+struct LoggingReadStream {
+    inner: ReadStream,
+    start_time: Instant,
+    digest: DigestInfo,
+    expected_size: u64,
+    bytes_sent: u64,
+    completed: bool,
+}
+
+impl LoggingReadStream {
+    fn new(inner: ReadStream, start_time: Instant, digest: DigestInfo, expected_size: u64) -> Self {
+        Self {
+            inner,
+            start_time,
+            digest,
+            expected_size,
+            bytes_sent: 0,
+            completed: false,
+        }
+    }
+
+    fn log_completion(&self, status: &str) {
+        let elapsed = self.start_time.elapsed();
+        let elapsed_ms = elapsed.as_millis() as u64;
+        debug!(
+            digest = %self.digest,
+            expected_size = self.expected_size,
+            bytes_sent = self.bytes_sent,
+            elapsed_ms,
+            throughput_mbps = %throughput_mbps(self.bytes_sent, elapsed),
+            status,
+            "ByteStream::read: CAS read completed",
+        );
+    }
+}
+
+impl Stream for LoggingReadStream {
+    type Item = Result<ReadResponse, Status>;
+
+    fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
+        let result = self.inner.as_mut().poll_next(cx);
+        match &result {
+            Poll::Ready(Some(Ok(response))) => {
+                self.bytes_sent += response.data.len() as u64;
+            }
+            Poll::Ready(None) => {
+                self.completed = true;
+                self.log_completion("ok");
+            }
+            Poll::Ready(Some(Err(_))) => {
+                self.completed = true;
+                self.log_completion("error");
+            }
+            Poll::Pending => {}
+        }
+        result
+    }
+}
+
+impl Drop for LoggingReadStream {
+    fn drop(&mut self) {
+        if !self.completed {
+            self.log_completion("dropped");
+        }
+    }
+}
+
 struct StreamState {
     uuid: UuidKey,
     tx: DropCloserWriteHalf,
@@ -402,6 +474,15 @@ impl ByteStreamServer {
         let max_bytes_per_stream = if config.max_bytes_per_stream == 0 {
             DEFAULT_MAX_BYTES_PER_STREAM
         } else {
+            if config.max_bytes_per_stream > 4 * 1024 * 1024 {
+                warn!(
+                    configured = config.max_bytes_per_stream,
+                    default = DEFAULT_MAX_BYTES_PER_STREAM,
+                    "max_bytes_per_stream exceeds 4 MiB; Bazel and other REAPI clients \
+                     typically have a 4 MiB gRPC inbound message limit and will reject \
+                     oversized ByteStream.Read chunks with RESOURCE_EXHAUSTED"
+                );
+            }
             config.max_bytes_per_stream
         };
@@ -494,8 +575,18 @@
         // Parse UUID string to u128 key for efficient HashMap operations
         let uuid_key = parse_uuid_to_key(uuid_str);

-        let (uuid, bytes_received, is_collision) =
-            match instance.active_uploads.lock().entry(uuid_key) {
+        // We handle the three cases in two phases to avoid holding the
+        // mutex guard across a second .lock() call (which would deadlock
+        // on parking_lot::Mutex since it is not reentrant).
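// [Editor's aside — why the two-phase restructure matters; sketch only, not
// patch code.] parking_lot::Mutex is not reentrant: a second lock() on the
// same thread deadlocks rather than panicking. The standard fix, applied in
// the code below, is to decide under the lock, drop the guard, then act,
// re-locking only if the chosen path needs a second insert:
fn decide_then_act(counter: &parking_lot::Mutex<u32>) {
    let should_bump = {
        let guard = counter.lock();
        *guard > 0
    }; // guard dropped here, before any further lock() call
    if should_bump {
        *counter.lock() += 1; // safe: the first guard is no longer held
    }
}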
+        enum UploadAction<'a> {
+            Resume(Box<ActiveStreamGuard<'a>>),
+            New(u128, Arc<AtomicU64>),
+            Collision(u128),
+        }
+
+        let action = {
+            let mut active_uploads = instance.active_uploads.lock();
+            match active_uploads.entry(uuid_key) {
                 Entry::Occupied(mut entry) => {
                     let maybe_idle_stream = entry.get_mut();
                     if let Some(idle_stream) = maybe_idle_stream.1.take() {
@@ -510,34 +601,41 @@
                             .metrics
                             .resumed_uploads
                             .fetch_add(1, Ordering::Relaxed);
-                        return idle_stream.into_active_stream(bytes_received, instance);
+                        UploadAction::Resume(Box::new(
+                            idle_stream.into_active_stream(bytes_received, instance),
+                        ))
+                    } else {
+                        // Case 3: Stream is active - generate a unique UUID to avoid collision
+                        let original_key = *entry.key();
+                        let unique_key = Self::generate_unique_uuid_key(original_key);
+                        warn!(
+                            msg = "UUID collision detected, generating unique UUID to prevent conflict",
+                            original_uuid = format!("{:032x}", original_key),
+                            unique_uuid = format!("{:032x}", unique_key)
+                        );
+                        UploadAction::Collision(unique_key)
                     }
-                    // Case 3: Stream is active - generate a unique UUID to avoid collision
-                    // Using nanosecond timestamp makes collision probability essentially zero
-                    let original_key = *entry.key();
-                    let unique_key = Self::generate_unique_uuid_key(original_key);
-                    warn!(
-                        msg = "UUID collision detected, generating unique UUID to prevent conflict",
-                        original_uuid = format!("{:032x}", original_key),
-                        unique_uuid = format!("{:032x}", unique_key)
-                    );
-                    // Entry goes out of scope here, releasing the lock
-
-                    let bytes_received = Arc::new(AtomicU64::new(0));
-                    let mut active_uploads = instance.active_uploads.lock();
-                    // Insert with the unique UUID - this should never collide due to nanosecond precision
-                    active_uploads.insert(unique_key, (bytes_received.clone(), None));
-                    (unique_key, bytes_received, true)
                 }
                 Entry::Vacant(entry) => {
                     // Case 1: UUID doesn't exist, create new stream
                     let bytes_received = Arc::new(AtomicU64::new(0));
                     let uuid = *entry.key();
-                    // Our stream is "in use" if the key is in the map, but the value is None.
                     entry.insert((bytes_received.clone(), None));
-                    (uuid, bytes_received, false)
+                    UploadAction::New(uuid, bytes_received)
                 }
-            };
+            }
+        }; // First lock guard dropped here.
+
+        let (uuid, bytes_received, is_collision) = match action {
+            UploadAction::Resume(guard) => return *guard,
+            UploadAction::New(uuid, bytes_received) => (uuid, bytes_received, false),
+            UploadAction::Collision(unique_key) => {
+                let bytes_received = Arc::new(AtomicU64::new(0));
+                let mut active_uploads = instance.active_uploads.lock();
+                active_uploads.insert(unique_key, (bytes_received.clone(), None));
+                (unique_key, bytes_received, true)
+            }
+        };

         // Track metrics for new upload instance
@@ -555,7 +653,9 @@
         // removing the entry from the map, otherwise that UUID becomes
         // unusable.

-        let (tx, rx) = make_buf_channel_pair();
+        // Use a larger buffer (256 slots = ~64MiB at 256KiB chunks) to sustain
+        // high-throughput streaming at 10Gbps+ without backpressure stalls.
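// [Editor's aside — the sizing arithmetic behind the comment above; the
// 256 KiB chunk size is the patch comment's assumption, not verified here.]
const SLOTS: usize = 256;
const ASSUMED_CHUNK_BYTES: usize = 256 * 1024;
const BUFFER_BYTES: usize = SLOTS * ASSUMED_CHUNK_BYTES; // 64 MiB
// At 10 Gb/s (~1.25 GB/s) a 64 MiB buffer absorbs roughly 50 ms of
// line-rate traffic before the producer sees backpressure.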
+        let (tx, rx) = make_buf_channel_pair_with_size(256);
         let store = instance.store.clone();
         let store_update_fut = Box::pin(async move {
             // We need to wrap `Store::update()` in a another future because we need to capture
@@ -582,6 +682,7 @@
         instance: &InstanceInfo,
         digest: DigestInfo,
         read_request: ReadRequest,
+        is_worker: bool,
     ) -> Result<impl Stream<Item = Result<ReadResponse, Status>> + Send + use<>, Error> {
         struct ReaderState {
             max_bytes_per_stream: usize,
@@ -593,7 +694,9 @@
         let read_limit = u64::try_from(read_request.read_limit)
             .err_tip(|| "Could not convert read_limit to u64")?;

-        let (tx, rx) = make_buf_channel_pair();
+        // Use a larger buffer (256 slots = ~64MiB at 256KiB chunks) to sustain
+        // high-throughput streaming at 10Gbps+ without backpressure stalls.
+        let (tx, rx) = make_buf_channel_pair_with_size(256);

         let read_limit = if read_limit != 0 {
             Some(read_limit)
@@ -608,14 +711,21 @@
             max_bytes_per_stream: instance.max_bytes_per_stream,
             maybe_get_part_result: None,
             get_part_fut: Box::pin(async move {
-                store
-                    .get_part(
-                        digest,
-                        tx,
-                        u64::try_from(read_request.read_offset)
-                            .err_tip(|| "Could not convert read_offset to u64")?,
-                        read_limit,
-                    )
+                // Propagate the worker/non-worker distinction into the store
+                // layer so WorkerProxyStore can decide whether to proxy or
+                // redirect.
+                IS_WORKER_REQUEST
+                    .scope(is_worker, async {
+                        store
+                            .get_part(
+                                digest,
+                                tx,
+                                u64::try_from(read_request.read_offset)
+                                    .err_tip(|| "Could not convert read_offset to u64")?,
+                                read_limit,
+                            )
+                            .await
+                    })
+                    .await
             }),
         });
@@ -762,8 +872,14 @@
                     )
                 } else {
                     if write_offset != tx.get_bytes_written() {
-                        return Err(make_input_err!(
-                            "Received out of order data. Got {}, expected {}",
+                        // The client is trying to resume at an offset we
+                        // don't have (e.g. the idle stream was swept).
+                        // Return UNAVAILABLE so the client retries with
+                        // QueryWriteStatus → committed_size=0 → restart.
+                        return Err(make_err!(
+                            Code::Unavailable,
+                            "Received out of order data (write_offset {} but server has {}). \
+                             Partial upload state was lost; retry from committed offset.",
                             write_offset,
                             tx.get_bytes_written()
                         ));
@@ -785,6 +901,17 @@
                     return Err(make_input_err!("Received more bytes than expected"));
                 }
                 if write_request.finish_write {
+                    // Validate that we received the expected number of bytes
+                    // before accepting the upload. The stream wrapper only
+                    // validates on a *subsequent* poll_next after finish_write,
+                    // which we never perform, so check here explicitly.
+                    if tx.get_bytes_written() != expected_size {
+                        return Err(make_input_err!(
+                            "Client declared size {} but only sent {} bytes",
+                            expected_size,
+                            tx.get_bytes_written()
+                        ));
+                    }
                     // Gracefully close our stream.
                     tx.send_eof()
                         .err_tip(|| "Failed to send EOF in ByteStream::write")?;
@@ -879,8 +1006,10 @@
                 .slice(usize::try_from(bytes_received - write_offset).unwrap_or(usize::MAX)..)
             } else {
                 if write_offset != bytes_received {
-                    return Err(make_input_err!(
-                        "Received out of order data. Got {}, expected {}",
+                    return Err(make_err!(
+                        Code::Unavailable,
+                        "Received out of order data (write_offset {} but server has {}). \
+                         Partial upload state was lost; retry from committed offset.",
                        write_offset,
                        bytes_received
                    ));
@@ -898,6 +1027,15 @@
            }

            if write_request.finish_write {
+                // Validate that we received the expected number of bytes
+                // before accepting the upload.
+                if bytes_received != expected_size {
+                    return Err(make_input_err!(
+                        "Client declared size {} but only sent {} bytes",
+                        expected_size,
+                        bytes_received
+                    ));
+                }
                break;
            }
        }
@@ -994,6 +1132,9 @@ impl ByteStream for ByteStreamServer {
    ) -> Result<Response<Self::ReadStream>, Status> {
        let start_time = Instant::now();

+        let is_worker = grpc_request
+            .metadata()
+            .contains_key("x-nativelink-worker");
        let read_request = grpc_request.into_inner();
        let resource_info = ResourceInfo::new(&read_request.resource_name, false)?;
        let instance_name = resource_info.instance_name.as_ref();
@@ -1024,15 +1165,31 @@
            DigestHasherFunc::try_from,
        )?;

+        // Covers stream setup only (inner_read returns a Stream).
+        // Actual data transfer stalls are not covered by this guard.
+        let _stall_guard = StallGuard::new(
+            nativelink_util::stall_detector::DEFAULT_STALL_THRESHOLD,
+            "ByteStream::read",
+        );
        let resp = self
-            .inner_read(instance, digest, read_request)
+            .inner_read(instance, digest, read_request, is_worker)
            .instrument(error_span!("bytestream_read"))
            .with_context(
                make_ctx_for_hash_func(digest_function).err_tip(|| "In BytestreamServer::read")?,
            )
            .await
            .err_tip(|| "In ByteStreamServer::read")
-            .map(|stream| -> Response<Self::ReadStream> { Response::new(Box::pin(stream)) });
+            .map(|stream| -> Response<Self::ReadStream> {
+                // Wrap in LoggingReadStream to log when the client finishes
+                // consuming all data (or drops the stream early).
+                let logging = LoggingReadStream::new(
+                    Box::pin(stream),
+                    start_time,
+                    digest,
+                    expected_size,
+                );
+                Response::new(Box::pin(logging))
+            });

        // Track metrics based on result
        #[allow(clippy::cast_possible_truncation)]
@@ -1044,6 +1201,12 @@

        match &resp {
            Ok(_) => {
+                debug!(
+                    %digest,
+                    size_bytes = expected_size,
+                    elapsed_ms = start_time.elapsed().as_millis() as u64,
+                    "ByteStream::read: CAS read stream created",
+                );
                instance
                    .metrics
                    .read_requests_success
@@ -1052,9 +1215,15 @@
                    .metrics
                    .bytes_read_total
                    .fetch_add(expected_size, Ordering::Relaxed);
-                debug!(return = "Ok()");
            }
-            Err(_) => {
+            Err(e) => {
+                error!(
+                    %digest,
+                    size_bytes = expected_size,
+                    elapsed_ms = start_time.elapsed().as_millis() as u64,
+                    ?e,
+                    "ByteStream::read: failed",
+                );
                instance
                    .metrics
                    .read_requests_failure
@@ -1149,6 +1318,18 @@
            false
        };

+        let oneshot = use_oneshot;
+        debug!(
+            %digest,
+            expected_size,
+            oneshot,
+            "ByteStream::write: starting upload",
+        );
+
+        let _stall_guard = StallGuard::new(
+            nativelink_util::stall_detector::DEFAULT_STALL_THRESHOLD,
+            "ByteStream::write",
+        );
        let result = if use_oneshot {
            self.inner_write_oneshot(instance, digest, stream)
                .instrument(error_span!("bytestream_write_oneshot"))
@@ -1179,6 +1360,15 @@

        match &result {
            Ok(_) => {
+                let elapsed = start_time.elapsed();
+                debug!(
+                    %digest,
+                    size_bytes = expected_size,
+                    elapsed_ms = elapsed.as_millis() as u64,
+                    throughput_mbps = format!("{:.1}", throughput_mbps(expected_size, elapsed)),
+                    oneshot,
+                    "ByteStream::write: CAS write completed",
+                );
                instance
                    .metrics
                    .write_requests_success
@@ -1188,7 +1378,15 @@
                    .bytes_written_total
                    .fetch_add(expected_size, Ordering::Relaxed);
            }
-            Err(_) => {
+            Err(e) => {
+                error!(
+                    %digest,
+                    expected_size,
+                    elapsed_ms = start_time.elapsed().as_millis() as u64,
+                    oneshot,
+                    ?e,
+                    "ByteStream::write: upload failed",
+                );
                instance
                    .metrics
                    .write_requests_failure
a/nativelink-service/src/capabilities_server.rs b/nativelink-service/src/capabilities_server.rs index e7058baec..11accd4e3 100644 --- a/nativelink-service/src/capabilities_server.rs +++ b/nativelink-service/src/capabilities_server.rs @@ -33,7 +33,9 @@ use nativelink_util::operation_state_manager::ClientStateManager; use tonic::{Request, Response, Status}; use tracing::{Level, instrument, warn}; -const MAX_BATCH_TOTAL_SIZE: i64 = 64 * 1024; +// Must leave headroom below Bazel's 4 MiB client-side gRPC inbound limit +// so that BatchReadBlobs responses (blob data + protobuf framing) fit. +const MAX_BATCH_TOTAL_SIZE: i64 = 3 * 1024 * 1024 + 512 * 1024; // 3.5 MiB #[derive(Debug, Default)] pub struct CapabilitiesServer { diff --git a/nativelink-service/src/cas_server.rs b/nativelink-service/src/cas_server.rs index 7e0f5f437..fa2f4afb6 100644 --- a/nativelink-service/src/cas_server.rs +++ b/nativelink-service/src/cas_server.rs @@ -14,7 +14,7 @@ use core::convert::Into; use core::pin::Pin; -use std::collections::{HashMap, VecDeque}; +use std::collections::{HashMap, HashSet, VecDeque}; use bytes::Bytes; use futures::stream::{FuturesUnordered, Stream}; @@ -36,10 +36,13 @@ use nativelink_store::grpc_store::GrpcStore; use nativelink_store::store_manager::StoreManager; use nativelink_util::common::DigestInfo; use nativelink_util::digest_hasher::make_ctx_for_hash_func; -use nativelink_util::store_trait::{Store, StoreLike}; +use nativelink_util::log_utils::throughput_mbps; +use nativelink_util::stall_detector::StallGuard; +use nativelink_util::store_trait::{IS_WORKER_REQUEST, Store, StoreLike}; use opentelemetry::context::FutureExt; +use prost::Message; use tonic::{Request, Response, Status}; -use tracing::{Instrument, Level, debug, error_span, instrument}; +use tracing::{Instrument, Level, debug, error, error_span, instrument, warn}; #[derive(Debug)] pub struct CasServer { @@ -86,12 +89,24 @@ impl CasServer { .has_many(&requested_blobs) .await .err_tip(|| "In find_missing_blobs")?; - let missing_blob_digests = sizes + let missing_blob_digests: Vec<_> = sizes .into_iter() .zip(request.blob_digests) .filter_map(|(maybe_size, digest)| maybe_size.map_or_else(|| Some(digest), |_| None)) .collect(); + debug!( + requested = requested_blobs.len(), + missing = missing_blob_digests.len(), + "FindMissingBlobs", + ); + if !missing_blob_digests.is_empty() { + debug!( + digests = ?missing_blob_digests.iter().map(|d| format!("{}-{}", d.hash, d.size_bytes)).collect::>(), + "FindMissingBlobs: missing digests", + ); + } + Ok(Response::new(FindMissingBlobsResponse { missing_blob_digests, })) @@ -135,10 +150,38 @@ impl CasServer { size_bytes, request_data.len() ); + debug!( + %digest_info, + size_bytes, + "BatchUpdateBlobs: starting upload", + ); + let upload_start = std::time::Instant::now(); let result = store_ref .update_oneshot(digest_info, request_data) .await .err_tip(|| "Error writing to store"); + match &result { + Ok(()) => { + let elapsed = upload_start.elapsed(); + debug!( + %digest_info, + size_bytes, + elapsed_ms = elapsed.as_millis() as u64, + throughput_mbps = format!("{:.1}", throughput_mbps(size_bytes as u64, elapsed)), + "BatchUpdateBlobs: CAS write completed", + ); + } + Err(e) => { + let elapsed = upload_start.elapsed(); + error!( + %digest_info, + size_bytes, + elapsed_ms = elapsed.as_millis() as u64, + ?e, + "BatchUpdateBlobs: upload failed", + ); + } + } Ok::<_, Error>(batch_update_blobs_response::Response { digest: Some(digest), status: Some(result.map_or_else(Into::into, |()| 
GrpcStatus::default())), @@ -178,12 +221,22 @@ impl CasServer { .map(|digest| async move { let digest_copy = DigestInfo::try_from(digest.clone())?; // TODO(palfrey) There is a security risk here of someone taking all the memory on the instance. + let read_start = std::time::Instant::now(); let result = store_ref .get_part_unchunked(digest_copy, 0, None) .await .err_tip(|| "Error reading from store"); let (status, data) = result.map_or_else( |mut e| { + let elapsed = read_start.elapsed(); + if e.code != Code::NotFound { + error!( + %digest_copy, + elapsed_ms = elapsed.as_millis() as u64, + ?e, + "BatchReadBlobs: CAS read failed", + ); + } if e.code == Code::NotFound { // Trim the error code. Not Found is quite common and we don't want to send a large // error (debug) message for something that is common. We resize to just the last @@ -192,7 +245,18 @@ impl CasServer { } (e.into(), Bytes::new()) }, - |v| (GrpcStatus::default(), v), + |v| { + let elapsed = read_start.elapsed(); + let size_bytes = v.len() as u64; + debug!( + %digest_copy, + size_bytes, + elapsed_ms = elapsed.as_millis() as u64, + throughput_mbps = format!("{:.1}", throughput_mbps(size_bytes, elapsed)), + "BatchReadBlobs: CAS read completed", + ); + (GrpcStatus::default(), v) + }, ); Ok::<_, Error>(batch_read_blobs_response::Response { status: Some(status), @@ -231,6 +295,7 @@ impl CasServer { .into_inner(); return Ok(stream.left_stream()); } + let tree_start = std::time::Instant::now(); let root_digest: DigestInfo = request .root_digest .err_tip(|| "Expected root_digest to exist in GetTreeRequest")? @@ -238,6 +303,15 @@ impl CasServer { .err_tip(|| "In GetTreeRequest::root_digest")?; let mut deque: VecDeque<DigestInfo> = VecDeque::new(); + // Track all digests we have ever enqueued to avoid fetching/processing + // the same directory twice. In a Merkle tree, identical subdirectory + // structures share the same digest, so multiple parents at the same BFS + // level can reference the same child digest. Without deduplication: + // 1. We fetch the same blob N times concurrently (wasteful). + // 2. `level_results.remove()` succeeds for the first occurrence but + // returns None for duplicates, causing a spurious + // "Directory missing from level results" error. + let mut seen: HashSet<DigestInfo> = HashSet::new(); let mut directories: Vec<Directory> = Vec::new(); // `page_token` will return the `{hash_str}-{size_bytes}` of the current request's first directory digest. let page_token_digest = if request.page_token.is_empty() { @@ -257,43 +331,169 @@ impl CasServer { .err_tip(|| "Failed to parse `page_token` as `Digest` in `GetTreeRequest`")? }; let page_size = request.page_size; - // If `page_size` is 0, paging is not necessary. + // If `page_size` is 0, paging is not necessary; return all directories. + let page_size_limit = if page_size == 0 { + usize::MAX + } else { + usize::try_from(page_size).unwrap_or(usize::MAX) + }; let mut page_token_matched = page_size == 0; + seen.insert(root_digest); deque.push_back(root_digest); - - while !deque.is_empty() { - let digest: DigestInfo = deque.pop_front().err_tip(|| "In VecDeque::pop_front")?; - let directory = get_and_decode_digest::<Directory>(&store, digest.into()) - .await - .err_tip(|| "Converting digest to Directory")?; - if digest == page_token_digest { - page_token_matched = true; + let mut page_filled = false; + + // Per-level timing and dedup tracking for diagnostics.
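The deduplication rationale above is easy to demonstrate in isolation. A standalone sketch of digest-level BFS over a Merkle-style DAG, with `u64` standing in for `DigestInfo` and a closure standing in for the CAS fetch:

```rust
use std::collections::{HashSet, VecDeque};

/// BFS over a DAG where distinct parents may share identical children
/// (content-addressed digests). `children` stands in for the CAS fetch.
fn bfs_unique(root: u64, children: impl Fn(u64) -> Vec<u64>) -> Vec<u64> {
    let mut seen: HashSet<u64> = HashSet::new();
    let mut deque: VecDeque<u64> = VecDeque::new();
    let mut order = Vec::new();
    seen.insert(root);
    deque.push_back(root);
    while let Some(digest) = deque.pop_front() {
        order.push(digest);
        for child in children(digest) {
            // `insert` returns false for digests already enqueued, so a
            // shared subtree is fetched and processed exactly once.
            if seen.insert(child) {
                deque.push_back(child);
            }
        }
    }
    order
}
```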
+ let mut bfs_level: u32 = 0; + let mut total_duplicates_skipped: u64 = 0; + let mut level_timings: Vec<(u32, usize, u64, u64)> = Vec::new(); // (level, dirs_fetched, children_discovered, elapsed_ms) + + while !deque.is_empty() && !page_filled { + let level_start = std::time::Instant::now(); + let level: Vec<DigestInfo> = deque.drain(..).collect(); + // Fetch all directories in this BFS level concurrently. + let mut futs = FuturesUnordered::new(); + for digest in &level { + let store = store.clone(); + let digest = *digest; + futs.push(async move { + let dir = get_and_decode_digest::<Directory>(&store, digest.into()) + .await + .err_tip(|| { + format!( + "Converting digest to Directory (digest: {})", + digest, + ) + })?; + Ok::<_, Error>((digest, dir)) + }); } - for directory in &directory.directories { - let digest: DigestInfo = directory - .digest - .clone() - .err_tip(|| "Expected Digest to exist in Directory::directories::digest")? - .try_into() - .err_tip(|| "In Directory::file::digest")?; - deque.push_back(digest); + // Collect results into a map so we can iterate in deterministic (discovery) order. + let mut level_results: HashMap<DigestInfo, Directory> = + HashMap::with_capacity(level.len()); + while let Some(result) = futs.next().await { + let (digest, directory) = result?; + level_results.insert(digest, directory); + } + // Process directories in the order they appeared in the deque (BFS discovery order). + let mut level_new_children: u64 = 0; + let mut level_duplicates: u64 = 0; + for (i, digest) in level.iter().enumerate() { + let directory = level_results + .get(digest) + .cloned() + .err_tip(|| { + format!( + "Directory missing from level results (digest: {}, level_size: {}, results_size: {})", + digest, + level.len(), + level_results.len(), + ) + })?; + if *digest == page_token_digest { + page_token_matched = true; + } + // Always enqueue children so BFS traversal finds the page token + // even when it's deeper in the tree. + for child in &directory.directories { + let child_digest: DigestInfo = child + .digest + .clone() + .err_tip(|| { + "Expected Digest to exist in Directory::directories::digest" + })? + .try_into() + .err_tip(|| "In Directory::file::digest")?; + // Only enqueue children we haven't seen before to avoid + // duplicate fetches and processing. + if seen.insert(child_digest) { + deque.push_back(child_digest); + level_new_children += 1; + } else { + level_duplicates += 1; + } + } + if page_token_matched { + directories.push(directory); + if directories.len() >= page_size_limit { + // Put remaining unprocessed items from this level back + // into the front of the deque for the next page token. + let remaining: Vec<DigestInfo> = + level[i + 1..].iter().copied().collect(); + // Prepend remaining items before any children already in deque.
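One detail worth making explicit before the loop that follows: inserting at successive indices 0, 1, 2, ... prepends a slice while preserving its internal order ahead of whatever is already in the deque. A quick standalone check of that property:

```rust
use std::collections::VecDeque;

fn prepend_in_order<T>(deque: &mut VecDeque<T>, items: Vec<T>) {
    // insert(0, a) then insert(1, b) yields [a, b, <old items>...],
    // i.e. the prepended slice keeps its order ahead of existing entries.
    for (j, item) in items.into_iter().enumerate() {
        deque.insert(j, item);
    }
}

fn main() {
    let mut d: VecDeque<i32> = VecDeque::from(vec![10, 11]);
    prepend_in_order(&mut d, vec![1, 2, 3]);
    assert_eq!(d, VecDeque::from(vec![1, 2, 3, 10, 11]));
}
```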
+ for (j, rem) in remaining.into_iter().enumerate() { + deque.insert(j, rem); + } + page_filled = true; + break; + } + } } - let page_size_usize = usize::try_from(page_size).unwrap_or(usize::MAX); + let level_elapsed_ms = level_start.elapsed().as_millis() as u64; + total_duplicates_skipped += level_duplicates; - if page_token_matched { - directories.push(directory); - if directories.len() == page_size_usize { - break; - } + if level_duplicates > 0 { + debug!( + ?root_digest, + bfs_level, + duplicates_skipped = level_duplicates, + "GetTree: deduplication skipped children at this level", + ); + } + + debug!( + ?root_digest, + bfs_level, + dirs_fetched = level.len(), + new_children = level_new_children, + duplicates_skipped = level_duplicates, + elapsed_ms = level_elapsed_ms, + "GetTree: BFS level completed", + ); + + if level_elapsed_ms > 100 { + warn!( + ?root_digest, + bfs_level, + dirs_fetched = level.len(), + new_children = level_new_children, + elapsed_ms = level_elapsed_ms, + "GetTree: slow BFS level (>100ms)", + ); } + + level_timings.push((bfs_level, level.len(), level_new_children, level_elapsed_ms)); + bfs_level += 1; } - // `next_page_token` will return the `{hash_str}:{size_bytes}` of the next request's first directory digest. + // `next_page_token` will return the `{hash_str}-{size_bytes}` of the next request's first directory digest. // It will be an empty string when it reached the end of the directory tree. let next_page_token: String = deque .front() .map_or_else(String::new, |value| format!("{value}")); + let elapsed = tree_start.elapsed(); + let total_bytes: u64 = directories.iter().map(|d| d.encoded_len() as u64).sum(); + + // Build per-level timing breakdown string for the summary log. + let level_breakdown: String = level_timings + .iter() + .map(|(lvl, dirs, children, ms)| { + format!("L{lvl}:{dirs}dirs/{children}children/{ms}ms") + }) + .collect::>() + .join(", "); + + debug!( + ?root_digest, + dir_count = directories.len(), + total_bytes, + total_duplicates_skipped, + bfs_levels = bfs_level, + elapsed_ms = elapsed.as_millis() as u64, + level_breakdown = %level_breakdown, + "GetTree: resolved directory tree", + ); + Ok(futures::stream::once(async { Ok(GetTreeResponse { directories, @@ -350,6 +550,10 @@ impl ContentAddressableStorage for CasServer { let request = grpc_request.into_inner(); let digest_function = request.digest_function; + let _stall_guard = StallGuard::new( + nativelink_util::stall_detector::DEFAULT_STALL_THRESHOLD, + "BatchUpdateBlobs", + ); self.inner_batch_update_blobs(request) .instrument(error_span!("cas_server_batch_update_blobs")) .with_context( @@ -372,14 +576,25 @@ impl ContentAddressableStorage for CasServer { &self, grpc_request: Request, ) -> Result, Status> { + let is_worker = grpc_request + .metadata() + .contains_key("x-nativelink-worker"); let request = grpc_request.into_inner(); let digest_function = request.digest_function; - self.inner_batch_read_blobs(request) - .instrument(error_span!("cas_server_batch_read_blobs")) - .with_context( - make_ctx_for_hash_func(digest_function) - .err_tip(|| "In CasServer::batch_read_blobs")?, + let _stall_guard = StallGuard::new( + nativelink_util::stall_detector::DEFAULT_STALL_THRESHOLD, + "BatchReadBlobs", + ); + IS_WORKER_REQUEST + .scope( + is_worker, + self.inner_batch_read_blobs(request) + .instrument(error_span!("cas_server_batch_read_blobs")) + .with_context( + make_ctx_for_hash_func(digest_function) + .err_tip(|| "In CasServer::batch_read_blobs")?, + ), ) .await .err_tip(|| "Failed on 
batch_read_blobs() command") diff --git a/nativelink-service/src/worker_api_server.rs b/nativelink-service/src/worker_api_server.rs index 9b6918155..733c9a09e 100644 --- a/nativelink-service/src/worker_api_server.rs +++ b/nativelink-service/src/worker_api_server.rs @@ -28,8 +28,11 @@ use nativelink_proto::com::github::trace_machina::nativelink::remote_execution:: WorkerApi, WorkerApiServer as Server, }; use nativelink_proto::com::github::trace_machina::nativelink::remote_execution::{ - execute_result, ExecuteComplete, ExecuteResult, GoingAwayRequest, KeepAliveRequest, UpdateForScheduler, UpdateForWorker + execute_result, ExecuteComplete, ExecuteResult, GoingAwayRequest, KeepAliveRequest, + UpdateForScheduler, UpdateForWorker, }; +use nativelink_util::blob_locality_map::SharedBlobLocalityMap; +use nativelink_util::common::DigestInfo; use nativelink_scheduler::worker::Worker; use nativelink_scheduler::worker_scheduler::WorkerScheduler; use nativelink_util::background_spawn; @@ -40,7 +43,7 @@ use rand::RngCore; use tokio::sync::mpsc; use tokio::time::interval; use tonic::{Response, Status}; -use tracing::{debug, error, warn, instrument, Level}; +use tracing::{debug, error, info, warn, instrument, Level}; use uuid::Uuid; pub type ConnectWorkerStream = @@ -52,6 +55,7 @@ pub struct WorkerApiServer { scheduler: Arc, now_fn: Arc, node_id: [u8; 6], + locality_map: Option, } impl core::fmt::Debug for WorkerApiServer { @@ -66,6 +70,7 @@ impl WorkerApiServer { pub fn new( config: &WorkerApiConfig, schedulers: &HashMap>, + locality_map: Option, ) -> Result { let node_id = { let mut out = [0; 6]; @@ -108,6 +113,7 @@ impl WorkerApiServer { .map_err(|_| make_err!(Code::Internal, "System time is now behind unix epoch")) }), node_id, + locality_map, ) } @@ -118,6 +124,7 @@ impl WorkerApiServer { schedulers: &HashMap>, now_fn: NowFn, node_id: [u8; 6], + locality_map: Option, ) -> Result { let scheduler = schedulers .get(&config.scheduler) @@ -132,6 +139,7 @@ impl WorkerApiServer { scheduler, now_fn: Arc::new(now_fn), node_id, + locality_map, }) } @@ -159,6 +167,8 @@ impl WorkerApiServer { )); }; + let worker_cas_endpoint = connect_worker_request.cas_endpoint.clone(); + let (tx, rx) = mpsc::unbounded_channel(); // First convert our proto platform properties into one our scheduler understands. 
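For context on the `x-nativelink-worker` metadata checks added to the CAS and ByteStream handlers earlier in this diff: a worker can mark its outgoing requests with a tonic interceptor. This is a sketch under the assumption that workers attach the header this way; the diff itself does not show the worker-side wiring.

```rust
use tonic::metadata::MetadataValue;
use tonic::service::Interceptor;
use tonic::{Request, Status};

/// Marks outgoing requests as originating from a worker so the server can
/// scope IS_WORKER_REQUEST and let WorkerProxyStore proxy instead of redirect.
#[derive(Clone)]
struct WorkerMarker;

impl Interceptor for WorkerMarker {
    fn call(&mut self, mut req: Request<()>) -> Result<Request<()>, Status> {
        req.metadata_mut()
            .insert("x-nativelink-worker", MetadataValue::from_static("1"));
        Ok(req)
    }
}
```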
@@ -184,12 +194,13 @@ impl WorkerApiServer { connect_worker_request.worker_id_prefix, Uuid::now_v6(&self.node_id).hyphenated() )); - let worker = Worker::new( + let worker = Worker::new_with_cas_endpoint( worker_id.clone(), platform_properties, tx, (self.now_fn)()?.as_secs(), connect_worker_request.max_inflight_tasks, + worker_cas_endpoint.clone(), ); self.scheduler .add_worker(worker) @@ -202,6 +213,8 @@ impl WorkerApiServer { self.scheduler.clone(), self.now_fn.clone(), worker_id.clone(), + self.locality_map.clone(), + worker_cas_endpoint, update_stream, ); @@ -259,6 +272,8 @@ struct WorkerConnection { scheduler: Arc, now_fn: Arc, worker_id: WorkerId, + locality_map: Option, + cas_endpoint: String, } impl WorkerConnection { @@ -266,12 +281,16 @@ impl WorkerConnection { scheduler: Arc, now_fn: Arc, worker_id: WorkerId, + locality_map: Option, + cas_endpoint: String, mut connection: impl Stream> + Unpin + Send + 'static, ) { let instance = Self { scheduler, now_fn, worker_id, + locality_map, + cas_endpoint, }; background_spawn!("worker_api", async move { @@ -307,23 +326,52 @@ impl WorkerConnection { Update::ExecuteComplete(execute_complete) => { instance.execution_complete(execute_complete).await } + Update::BlobsAvailable(notification) => { + instance.handle_blobs_available(notification).await + } + Update::BlobsEvicted(_notification) => { + // Dead code path: evictions now go through + // BlobsAvailableNotification.evicted_digests. + // Kept for wire compatibility with older workers. + Ok(()) + } }; if let Err(err) = result { tracing::warn!(worker_id=?instance.worker_id, ?err, "Error processing worker message"); } } tracing::debug!(worker_id=?instance.worker_id, "Update for scheduler dropped"); + + // Clean up locality map on disconnect. + if !instance.cas_endpoint.is_empty() { + if let Some(ref locality_map) = instance.locality_map { + locality_map.write().remove_endpoint(&instance.cas_endpoint); + info!( + worker_id=?instance.worker_id, + endpoint=%instance.cas_endpoint, + "Removed worker from blob locality map on disconnect" + ); + } + } + if !had_going_away { drop(instance.scheduler.remove_worker(&instance.worker_id).await); } }); } - async fn inner_keep_alive(&self, _keep_alive_request: KeepAliveRequest) -> Result<(), Error> { + async fn inner_keep_alive(&self, keep_alive_request: KeepAliveRequest) -> Result<(), Error> { self.scheduler .worker_keep_alive_received(&self.worker_id, (self.now_fn)()?.as_secs()) .await .err_tip(|| "Could not process keep_alive from worker in inner_keep_alive()")?; + let cpu_load_pct = keep_alive_request.cpu_load_pct; + if cpu_load_pct > 0 { + debug!(worker_id=?self.worker_id, cpu_load_pct, "KeepAlive received with CPU load"); + if let Err(err) = self.scheduler.update_worker_load(&self.worker_id, cpu_load_pct).await { + warn!(worker_id=?self.worker_id, ?err, cpu_load_pct, "Failed to update worker load"); + } + } Ok(()) } @@ -335,6 +383,51 @@ impl WorkerConnection { Ok(()) } + fn register_action_result_digests( + locality_map: &SharedBlobLocalityMap, + endpoint: &str, + execute_response: &nativelink_proto::build::bazel::remote::execution::v2::ExecuteResponse, + ) { + let Some(ref action_result) = execute_response.result else { + return; + }; + let now = SystemTime::now(); + let mut digests = Vec::new(); + for file in &action_result.output_files { + if let Some(ref d) = file.digest { + if let Ok(di) = DigestInfo::try_from(d.clone()) { + digests.push((di, now)); + } + } + } + for dir in &action_result.output_directories { + if let Some(ref d) = 
dir.tree_digest { + if let Ok(di) = DigestInfo::try_from(d.clone()) { + digests.push((di, now)); + } + } + } + if let Some(ref d) = action_result.stdout_digest { + if d.size_bytes > 0 { + if let Ok(di) = DigestInfo::try_from(d.clone()) { + digests.push((di, now)); + } + } + } + if let Some(ref d) = action_result.stderr_digest { + if d.size_bytes > 0 { + if let Ok(di) = DigestInfo::try_from(d.clone()) { + digests.push((di, now)); + } + } + } + if !digests.is_empty() { + locality_map + .write() + .register_blobs_with_timestamps(endpoint, &digests); + } + } + async fn inner_execution_response(&self, execute_result: ExecuteResult) -> Result<(), Error> { let operation_id = OperationId::from(execute_result.operation_id); @@ -343,6 +436,18 @@ impl WorkerConnection { .err_tip(|| "Expected result to exist in ExecuteResult")? { execute_result::Result::ExecuteResponse(finished_result) => { + // Register output digests in the locality map so the server + // can proxy blob reads back to the worker immediately, even + // before the BlobsAvailableNotification arrives. + if let Some(ref locality_map) = self.locality_map { + if !self.cas_endpoint.is_empty() { + Self::register_action_result_digests( + locality_map, + &self.cas_endpoint, + &finished_result, + ); + } + } let action_stage = finished_result .try_into() .err_tip(|| "Failed to convert ExecuteResponse into an ActionStage")?; @@ -369,7 +474,176 @@ impl WorkerConnection { Ok(()) } + async fn handle_blobs_available( + &self, + notification: nativelink_proto::com::github::trace_machina::nativelink::remote_execution::BlobsAvailableNotification, + ) -> Result<(), Error> { + let cpu_load_pct = notification.cpu_load_pct; + if cpu_load_pct > 0 { + debug!(worker_id=?self.worker_id, cpu_load_pct, "BlobsAvailable received with CPU load"); + if let Err(err) = self.scheduler.update_worker_load(&self.worker_id, cpu_load_pct).await { + warn!(worker_id=?self.worker_id, ?err, cpu_load_pct, "Failed to update worker load"); + } + } + + // Update the worker's cached directory digests if any were reported (legacy path). + if !notification.cached_directory_digests.is_empty() && !notification.is_full_subtree_snapshot { + let cached_dirs: std::collections::HashSet = notification + .cached_directory_digests + .iter() + .filter_map(|d| DigestInfo::try_from(d.clone()).ok()) + .collect(); + let count = cached_dirs.len(); + debug!(worker_id=?self.worker_id, count, "BlobsAvailable received with cached directory digests"); + if let Err(err) = self.scheduler.update_cached_directories(&self.worker_id, cached_dirs).await { + warn!(worker_id=?self.worker_id, ?err, count, "Failed to update cached directory digests"); + } + } + + // Handle delta-encoded subtree digest updates. 
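The `SharedBlobLocalityMap` calls used throughout this handler (`register_blobs_with_timestamps`, `evict_blobs`, `remove_endpoint`, `lookup_workers`) imply a two-sided index from digests to worker endpoints and back. The real type lives in nativelink-util and is not shown in this diff; below is a minimal sketch of the shape those operations suggest, with `[u8; 32]` standing in for `DigestInfo` and timestamps ignored.

```rust
use std::collections::{HashMap, HashSet};
use std::sync::Arc;
use std::time::SystemTime;

type Digest = [u8; 32]; // stand-in for DigestInfo

#[derive(Default)]
struct BlobLocalityMap {
    // digest -> endpoints that currently hold it
    by_digest: HashMap<Digest, HashSet<Arc<str>>>,
    // endpoint -> digests it holds (for O(holdings) disconnect cleanup)
    by_endpoint: HashMap<Arc<str>, HashSet<Digest>>,
}

impl BlobLocalityMap {
    fn register_blobs_with_timestamps(&mut self, endpoint: &str, blobs: &[(Digest, SystemTime)]) {
        let ep: Arc<str> = Arc::from(endpoint);
        for (digest, _ts) in blobs {
            self.by_digest.entry(*digest).or_default().insert(ep.clone());
            self.by_endpoint.entry(ep.clone()).or_default().insert(*digest);
        }
    }

    fn evict_blobs(&mut self, endpoint: &str, digests: &[Digest]) {
        for d in digests {
            if let Some(eps) = self.by_digest.get_mut(d) {
                eps.retain(|e| &**e != endpoint);
                if eps.is_empty() {
                    self.by_digest.remove(d);
                }
            }
            if let Some(ds) = self.by_endpoint.get_mut(endpoint) {
                ds.remove(d);
            }
        }
    }

    fn remove_endpoint(&mut self, endpoint: &str) {
        if let Some(ds) = self.by_endpoint.remove(endpoint) {
            for d in ds {
                if let Some(eps) = self.by_digest.get_mut(&d) {
                    eps.retain(|e| &**e != endpoint);
                    if eps.is_empty() {
                        self.by_digest.remove(&d);
                    }
                }
            }
        }
    }
}
```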
+ let has_subtree_update = notification.is_full_subtree_snapshot + || !notification.added_subtree_digests.is_empty() + || !notification.removed_subtree_digests.is_empty(); + if has_subtree_update { + let is_full = notification.is_full_subtree_snapshot; + let full_set: Vec<DigestInfo> = if is_full { + notification + .cached_directory_digests + .iter() + .filter_map(|d| DigestInfo::try_from(d.clone()).ok()) + .collect() + } else { + Vec::new() + }; + let added: Vec<DigestInfo> = notification + .added_subtree_digests + .iter() + .filter_map(|d| DigestInfo::try_from(d.clone()).ok()) + .collect(); + let removed: Vec<DigestInfo> = notification + .removed_subtree_digests + .iter() + .filter_map(|d| DigestInfo::try_from(d.clone()).ok()) + .collect(); + let full_count = full_set.len(); + let added_count = added.len(); + let removed_count = removed.len(); + debug!( + worker_id=?self.worker_id, + is_full, + full_count, + added_count, + removed_count, + "BlobsAvailable received with subtree digest updates" + ); + if let Err(err) = self + .scheduler + .update_cached_subtrees( + &self.worker_id, + is_full, + full_set, + added, + removed, + ) + .await + { + warn!( + worker_id=?self.worker_id, + ?err, + is_full, + full_count, + added_count, + removed_count, + "Failed to update cached subtree digests" + ); + } + } + + let Some(ref locality_map) = self.locality_map else { + return Ok(()); + }; + let endpoint = if notification.worker_cas_endpoint.is_empty() { + &self.cas_endpoint + } else { + &notification.worker_cas_endpoint + }; + if endpoint.is_empty() { + return Ok(()); + } + + let is_full_snapshot = notification.is_full_snapshot; + + // Process evicted digests (incremental updates report evictions here). + let evicted: Vec<DigestInfo> = notification + .evicted_digests + .into_iter() + .filter_map(|d| d.try_into().ok()) + .collect(); + + // Collect digests with timestamps from digest_infos (preferred). + let mut digests_with_ts: Vec<(DigestInfo, SystemTime)> = notification + .digest_infos + .into_iter() + .filter_map(|info| { + let digest = info.digest.and_then(|d| DigestInfo::try_from(d).ok())?; + let ts = if info.last_access_timestamp > 0 { + UNIX_EPOCH + Duration::from_secs(info.last_access_timestamp as u64) + } else { + SystemTime::now() + }; + Some((digest, ts)) + }) + .collect(); + // Also include plain digests for backward compatibility / simple notifications. + let now = SystemTime::now(); + digests_with_ts.extend( + notification + .digests + .into_iter() + .filter_map(|d| DigestInfo::try_from(d).ok()) + .map(|d| (d, now)), + ); + + // Acquire the write lock once for all mutations to avoid repeated + // lock acquisition and eliminate inconsistency windows. + let mut map = locality_map.write(); + + if is_full_snapshot { + // Remove all existing entries for this endpoint first.
+ map.remove_endpoint(endpoint); + } + + if !evicted.is_empty() { + debug!( + worker_id=?self.worker_id, + endpoint, + count=evicted.len(), + "Processing evicted digests from BlobsAvailable" + ); + map.evict_blobs(endpoint, &evicted); + } + + if !digests_with_ts.is_empty() { + debug!( + worker_id=?self.worker_id, + endpoint, + count=digests_with_ts.len(), + is_full_snapshot, + "Registering blobs available from worker" + ); + map.register_blobs_with_timestamps(endpoint, &digests_with_ts); + } + Ok(()) + } + async fn execution_complete(&self, execute_complete: ExecuteComplete) -> Result<(), Error> { + let cpu_load_pct = execute_complete.cpu_load_pct; + if cpu_load_pct > 0 { + debug!(worker_id=?self.worker_id, cpu_load_pct, "ExecuteComplete received with CPU load"); + if let Err(err) = self.scheduler.update_worker_load(&self.worker_id, cpu_load_pct).await { + warn!(worker_id=?self.worker_id, ?err, cpu_load_pct, "Failed to update worker load"); + } + } let operation_id = OperationId::from(execute_complete.operation_id); self.scheduler .update_action( diff --git a/nativelink-service/tests/bep_server_test.rs b/nativelink-service/tests/bep_server_test.rs index d6461875d..ee8baf51c 100644 --- a/nativelink-service/tests/bep_server_test.rs +++ b/nativelink-service/tests/bep_server_test.rs @@ -44,7 +44,8 @@ use nativelink_util::store_trait::{Store, StoreKey, StoreLike}; use pretty_assertions::assert_eq; use prost::Message; use prost_types::Timestamp; -use tonic::codec::{Codec, ProstCodec}; +use tonic::codec::Codec; +use tonic_prost::ProstCodec; use tonic::{Request, Streaming}; const BEP_STORE_NAME: &str = "main_bep"; diff --git a/nativelink-service/tests/bytestream_server_test.rs b/nativelink-service/tests/bytestream_server_test.rs index 7089e1613..2c35d50a4 100644 --- a/nativelink-service/tests/bytestream_server_test.rs +++ b/nativelink-service/tests/bytestream_server_test.rs @@ -25,7 +25,7 @@ use hyper_util::server::conn::auto; use hyper_util::service::TowerToHyperService; use nativelink_config::cas_server::{ByteStreamConfig, HttpListener, WithInstanceName}; use nativelink_config::stores::{MemorySpec, StoreSpec}; -use nativelink_error::{Code, Error, ResultExt, make_err}; +use nativelink_error::{Code, Error, ResultExt}; use nativelink_macro::nativelink_test; use nativelink_proto::google::bytestream::byte_stream_client::ByteStreamClient; use nativelink_proto::google::bytestream::byte_stream_server::ByteStream; @@ -47,7 +47,8 @@ use tokio::sync::mpsc::unbounded_channel; use tokio::task::yield_now; use tokio_stream::StreamExt; use tokio_stream::wrappers::UnboundedReceiverStream; -use tonic::codec::{Codec, CompressionEncoding, ProstCodec}; +use tonic::codec::{Codec, CompressionEncoding}; +use tonic_prost::ProstCodec; use tonic::transport::{Channel, Endpoint}; use tonic::{Request, Response, Streaming}; use tower::service_fn; @@ -855,13 +856,12 @@ pub async fn read_with_not_found_does_not_deadlock() -> Result<(), Error> { let result_fut = read_stream.next(); let result = result_fut.await.err_tip(|| "Expected result to be ready")?; - let expected_err_str = concat!( - "status: NotFound, message: \"Key Digest(DigestInfo(\\\"0123456789abcdef000000000000000000000000000000000123456789abcdef-55\\\")) not found\", details: [], metadata: MetadataMap { headers: {} }", - ); - assert_eq!( - Error::from(result.unwrap_err()), - make_err!(Code::NotFound, "{expected_err_str}"), - "Expected error data to match" + let err = Error::from(result.unwrap_err()); + assert_eq!(err.code, Code::NotFound, "Expected NotFound error 
code"); + let msg = err.messages.join(" "); + assert!( + msg.contains("0123456789abcdef000000000000000000000000000000000123456789abcdef-55"), + "Expected error message to contain the digest, got: {msg}" ); } Ok(()) @@ -991,7 +991,7 @@ pub async fn max_decoding_message_size_test() -> Result<(), Box Result<(), Box Result<(), Box Result<(), Box Result<(), Box, + _worker_api_server: WorkerApiServer, + connection_worker_stream: ConnectWorkerStream, + _worker_id: WorkerId, + worker_stream: mpsc::Sender, + locality_map: SharedBlobLocalityMap, +} + +/// Sets up a WorkerApiServer with a real SharedBlobLocalityMap and a worker +/// that has a CAS endpoint set. Returns the context needed to send updates +/// and verify the locality map. +async fn setup_api_server_with_locality( + cas_endpoint: &str, +) -> Result { + const SCHEDULER_NAME: &str = "DUMMY_SCHEDULE_NAME"; + const UUID_SIZE: usize = 36; + + let platform_property_manager = Arc::new(PlatformPropertyManager::new(HashMap::new())); + let tasks_or_worker_change_notify = Arc::new(Notify::new()); + let state_manager = Arc::new(MockWorkerStateManager::new()); + let worker_registry = Arc::new(WorkerRegistry::new()); + let scheduler = ApiWorkerScheduler::new( + state_manager.clone(), + platform_property_manager, + WorkerAllocationStrategy::default(), + tasks_or_worker_change_notify, + BASE_WORKER_TIMEOUT_S, + worker_registry, + ); + + let locality_map = new_shared_blob_locality_map(); + + let mut schedulers: HashMap> = HashMap::new(); + schedulers.insert(SCHEDULER_NAME.to_string(), scheduler.clone()); + let worker_api_server = WorkerApiServer::new_with_now_fn( + &WorkerApiConfig { + scheduler: SCHEDULER_NAME.to_string(), + }, + &schedulers, + Box::new(static_now_fn), + [1u8; 6], + Some(locality_map.clone()), + ) + .err_tip(|| "Error creating WorkerApiServer")?; + + let connect_worker_request = ConnectWorkerRequest { + cas_endpoint: cas_endpoint.to_string(), + ..Default::default() + }; + let (tx, rx) = mpsc::channel(1); + tx.send(Update::ConnectWorkerRequest(connect_worker_request)) + .await + .unwrap(); + let update_stream = Box::pin(futures::stream::unfold(rx, |mut rx| async move { + rx.recv().await.map(|update| { + let update = Ok(UpdateForScheduler { + update: Some(update), + }); + (update, rx) + }) + })); + let mut connection_worker_stream = worker_api_server + .inner_connect_worker_for_testing(update_stream) + .await? + .into_inner(); + + let maybe_first_message = connection_worker_stream.next().await; + assert!( + maybe_first_message.is_some(), + "Expected first message from stream" + ); + let first_update = maybe_first_message + .unwrap() + .err_tip(|| "Expected success result")? 
+ .update + .err_tip(|| "Expected update field to be populated")?; + let worker_id = match first_update { + update_for_worker::Update::ConnectionResult(connection_result) => { + connection_result.worker_id + } + other => unreachable!("Expected ConnectionResult, got {:?}", other), + }; + + assert_eq!( + worker_id.len(), + UUID_SIZE, + "Worker ID should be 36 characters" + ); + + Ok(LocalityTestContext { + _scheduler: scheduler, + _worker_api_server: worker_api_server, + connection_worker_stream, + _worker_id: worker_id.into(), + worker_stream: tx, + locality_map, + }) +} + +#[nativelink_test] +pub async fn handle_blobs_available_populates_locality_map_test() +-> Result<(), Box> { + let cas_endpoint = "grpc://192.168.1.10:50081"; + let test_context = setup_api_server_with_locality(cas_endpoint).await?; + + let d1 = DigestInfo::new([1u8; 32], 100); + let d2 = DigestInfo::new([2u8; 32], 200); + + // Send a BlobsAvailable notification with two digests. + test_context + .worker_stream + .send(Update::BlobsAvailable(BlobsAvailableNotification { + worker_cas_endpoint: String::new(), // Empty means use the worker's registered endpoint. + digests: vec![d1.into(), d2.into()], + is_full_snapshot: false, + evicted_digests: vec![], + digest_infos: vec![], + cpu_load_pct: 0, + cached_directory_digests: vec![], + added_subtree_digests: vec![], + removed_subtree_digests: vec![], + is_full_subtree_snapshot: false, + })) + .await + .map_err(|e| make_err!(tonic::Code::Internal, "Error sending blobs available: {e}"))?; + + // Allow background task to process the update. + tokio::time::sleep(Duration::from_millis(50)).await; + + // Verify the locality map has both digests registered to the endpoint. + let map = test_context.locality_map.read(); + let workers_d1 = map.lookup_workers(&d1); + assert_eq!( + workers_d1.len(), + 1, + "Expected d1 to have 1 endpoint, got {workers_d1:?}" + ); + assert_eq!(&*workers_d1[0], cas_endpoint); + + let workers_d2 = map.lookup_workers(&d2); + assert_eq!( + workers_d2.len(), + 1, + "Expected d2 to have 1 endpoint, got {workers_d2:?}" + ); + assert_eq!(&*workers_d2[0], cas_endpoint); + + assert_eq!(map.digest_count(), 2); + assert_eq!(map.endpoint_count(), 1); + + Ok(()) +} + +#[nativelink_test] +pub async fn full_snapshot_replaces_endpoint_view_test() +-> Result<(), Box> { + let cas_endpoint = "grpc://192.168.1.10:50081"; + let test_context = setup_api_server_with_locality(cas_endpoint).await?; + + let d1 = DigestInfo::new([1u8; 32], 100); + let d2 = DigestInfo::new([2u8; 32], 200); + let d3 = DigestInfo::new([3u8; 32], 300); + + // First, register d1 and d2 with an incremental update. + test_context + .worker_stream + .send(Update::BlobsAvailable(BlobsAvailableNotification { + worker_cas_endpoint: String::new(), + digests: vec![d1.into(), d2.into()], + is_full_snapshot: false, + evicted_digests: vec![], + digest_infos: vec![], + cpu_load_pct: 0, + cached_directory_digests: vec![], + added_subtree_digests: vec![], + removed_subtree_digests: vec![], + is_full_subtree_snapshot: false, + })) + .await + .map_err(|e| make_err!(tonic::Code::Internal, "Error sending: {e}"))?; + tokio::time::sleep(Duration::from_millis(50)).await; + + // Confirm d1 and d2 are present. + { + let map = test_context.locality_map.read(); + assert_eq!(map.digest_count(), 2); + assert!(!map.lookup_workers(&d1).is_empty()); + assert!(!map.lookup_workers(&d2).is_empty()); + } + + // Now send a full snapshot containing only d3. + // This should clear d1 and d2 and only have d3. 
+ test_context + .worker_stream + .send(Update::BlobsAvailable(BlobsAvailableNotification { + worker_cas_endpoint: String::new(), + digests: vec![d3.into()], + is_full_snapshot: true, + evicted_digests: vec![], + digest_infos: vec![], + cpu_load_pct: 0, + cached_directory_digests: vec![], + added_subtree_digests: vec![], + removed_subtree_digests: vec![], + is_full_subtree_snapshot: false, + })) + .await + .map_err(|e| make_err!(tonic::Code::Internal, "Error sending: {e}"))?; + tokio::time::sleep(Duration::from_millis(50)).await; + + // Verify: d1 and d2 should be gone, only d3 remains. + let map = test_context.locality_map.read(); + assert!( + map.lookup_workers(&d1).is_empty(), + "d1 should have been cleared by full snapshot" + ); + assert!( + map.lookup_workers(&d2).is_empty(), + "d2 should have been cleared by full snapshot" + ); + let workers_d3 = map.lookup_workers(&d3); + assert_eq!( + workers_d3.len(), + 1, + "d3 should be registered after full snapshot" + ); + assert_eq!(&*workers_d3[0], cas_endpoint); + assert_eq!(map.digest_count(), 1); + + Ok(()) +} + +#[nativelink_test] +pub async fn incremental_update_preserves_existing_blobs_test() +-> Result<(), Box> { + let cas_endpoint = "grpc://192.168.1.10:50081"; + let test_context = setup_api_server_with_locality(cas_endpoint).await?; + + let d1 = DigestInfo::new([1u8; 32], 100); + let d2 = DigestInfo::new([2u8; 32], 200); + let d3 = DigestInfo::new([3u8; 32], 300); + + // First update: register d1 and d2. + test_context + .worker_stream + .send(Update::BlobsAvailable(BlobsAvailableNotification { + worker_cas_endpoint: String::new(), + digests: vec![d1.into(), d2.into()], + is_full_snapshot: false, + evicted_digests: vec![], + digest_infos: vec![], + cpu_load_pct: 0, + cached_directory_digests: vec![], + added_subtree_digests: vec![], + removed_subtree_digests: vec![], + is_full_subtree_snapshot: false, + })) + .await + .map_err(|e| make_err!(tonic::Code::Internal, "Error sending: {e}"))?; + tokio::time::sleep(Duration::from_millis(50)).await; + + // Second update (incremental): register d3 only. + test_context + .worker_stream + .send(Update::BlobsAvailable(BlobsAvailableNotification { + worker_cas_endpoint: String::new(), + digests: vec![d3.into()], + is_full_snapshot: false, + evicted_digests: vec![], + digest_infos: vec![], + cpu_load_pct: 0, + cached_directory_digests: vec![], + added_subtree_digests: vec![], + removed_subtree_digests: vec![], + is_full_subtree_snapshot: false, + })) + .await + .map_err(|e| make_err!(tonic::Code::Internal, "Error sending: {e}"))?; + tokio::time::sleep(Duration::from_millis(50)).await; + + // All three digests should be present. + let map = test_context.locality_map.read(); + assert_eq!( + map.digest_count(), + 3, + "All three digests should be present after incremental update" + ); + assert!(!map.lookup_workers(&d1).is_empty(), "d1 should still exist"); + assert!(!map.lookup_workers(&d2).is_empty(), "d2 should still exist"); + assert!(!map.lookup_workers(&d3).is_empty(), "d3 should be added"); + + Ok(()) +} + +#[nativelink_test] +pub async fn eviction_removes_digests_from_locality_map_test() +-> Result<(), Box> { + let cas_endpoint = "grpc://192.168.1.10:50081"; + let test_context = setup_api_server_with_locality(cas_endpoint).await?; + + let d1 = DigestInfo::new([1u8; 32], 100); + let d2 = DigestInfo::new([2u8; 32], 200); + let d3 = DigestInfo::new([3u8; 32], 300); + + // Register d1, d2, d3. 
+ test_context + .worker_stream + .send(Update::BlobsAvailable(BlobsAvailableNotification { + worker_cas_endpoint: String::new(), + digests: vec![d1.into(), d2.into(), d3.into()], + is_full_snapshot: false, + evicted_digests: vec![], + digest_infos: vec![], + cpu_load_pct: 0, + cached_directory_digests: vec![], + added_subtree_digests: vec![], + removed_subtree_digests: vec![], + is_full_subtree_snapshot: false, + })) + .await + .map_err(|e| make_err!(tonic::Code::Internal, "Error sending: {e}"))?; + tokio::time::sleep(Duration::from_millis(50)).await; + + // Now send an incremental update with evicted_digests containing d1 and d2. + test_context + .worker_stream + .send(Update::BlobsAvailable(BlobsAvailableNotification { + worker_cas_endpoint: String::new(), + digests: vec![], + is_full_snapshot: false, + evicted_digests: vec![d1.into(), d2.into()], + digest_infos: vec![], + cpu_load_pct: 0, + cached_directory_digests: vec![], + added_subtree_digests: vec![], + removed_subtree_digests: vec![], + is_full_subtree_snapshot: false, + })) + .await + .map_err(|e| make_err!(tonic::Code::Internal, "Error sending: {e}"))?; + tokio::time::sleep(Duration::from_millis(50)).await; + + // d1 and d2 should be evicted, d3 remains. + let map = test_context.locality_map.read(); + assert!( + map.lookup_workers(&d1).is_empty(), + "d1 should have been evicted" + ); + assert!( + map.lookup_workers(&d2).is_empty(), + "d2 should have been evicted" + ); + assert_eq!( + map.lookup_workers(&d3).len(), + 1, + "d3 should still be present" + ); + assert_eq!(map.digest_count(), 1); + + Ok(()) +} + +#[nativelink_test] +pub async fn worker_disconnect_cleans_up_locality_map_test() +-> Result<(), Box> { + let cas_endpoint = "grpc://192.168.1.10:50081"; + let test_context = setup_api_server_with_locality(cas_endpoint).await?; + + let d1 = DigestInfo::new([1u8; 32], 100); + let d2 = DigestInfo::new([2u8; 32], 200); + + // Register d1 and d2. + test_context + .worker_stream + .send(Update::BlobsAvailable(BlobsAvailableNotification { + worker_cas_endpoint: String::new(), + digests: vec![d1.into(), d2.into()], + is_full_snapshot: false, + evicted_digests: vec![], + digest_infos: vec![], + cpu_load_pct: 0, + cached_directory_digests: vec![], + added_subtree_digests: vec![], + removed_subtree_digests: vec![], + is_full_subtree_snapshot: false, + })) + .await + .map_err(|e| make_err!(tonic::Code::Internal, "Error sending: {e}"))?; + tokio::time::sleep(Duration::from_millis(50)).await; + + // Confirm blobs are present. + { + let map = test_context.locality_map.read(); + assert_eq!(map.digest_count(), 2); + assert_eq!(map.endpoint_count(), 1); + } + + // Drop the worker stream sender to simulate disconnect. + // The background task in WorkerConnection will see the stream end + // and call remove_endpoint on the locality map. + drop(test_context.worker_stream); + drop(test_context.connection_worker_stream); + + // Allow the background cleanup task to run. + tokio::time::sleep(Duration::from_millis(100)).await; + + // All entries for this endpoint should be removed. 
+ let map = test_context.locality_map.read(); + assert!( + map.lookup_workers(&d1).is_empty(), + "d1 should be removed after worker disconnect" + ); + assert!( + map.lookup_workers(&d2).is_empty(), + "d2 should be removed after worker disconnect" + ); + assert_eq!( + map.endpoint_count(), + 0, + "No endpoints should remain after disconnect" + ); + assert_eq!( + map.digest_count(), + 0, + "No digests should remain after disconnect" + ); + + Ok(()) +} + +#[nativelink_test] +pub async fn blobs_available_with_malformed_digests_test() +-> Result<(), Box> { + use nativelink_proto::build::bazel::remote::execution::v2::Digest as ProtoDigest; + + let cas_endpoint = "grpc://192.168.1.10:50081"; + let test_context = setup_api_server_with_locality(cas_endpoint).await?; + + let d1 = DigestInfo::new([1u8; 32], 100); + let d2 = DigestInfo::new([2u8; 32], 200); + + // Build the digests list: 2 valid + 1 malformed (hash too short). + let valid1: ProtoDigest = d1.into(); + let valid2: ProtoDigest = d2.into(); + let malformed = ProtoDigest { + hash: "deadbeef".to_string(), // Only 8 hex chars, not 64. + size_bytes: 999, + ..Default::default() + }; + + test_context + .worker_stream + .send(Update::BlobsAvailable(BlobsAvailableNotification { + worker_cas_endpoint: String::new(), + digests: vec![valid1, malformed, valid2], + is_full_snapshot: false, + evicted_digests: vec![], + digest_infos: vec![], + cpu_load_pct: 0, + cached_directory_digests: vec![], + added_subtree_digests: vec![], + removed_subtree_digests: vec![], + is_full_subtree_snapshot: false, + })) + .await + .map_err(|e| make_err!(tonic::Code::Internal, "Error sending: {e}"))?; + tokio::time::sleep(Duration::from_millis(50)).await; + + // Only the 2 valid digests should appear in the locality map. + let map = test_context.locality_map.read(); + assert_eq!( + map.digest_count(), + 2, + "Expected exactly 2 valid digests in locality map, got {}", + map.digest_count() + ); + assert!( + !map.lookup_workers(&d1).is_empty(), + "Expected d1 to be registered" + ); + assert!( + !map.lookup_workers(&d2).is_empty(), + "Expected d2 to be registered" + ); + + Ok(()) +} + +#[nativelink_test] +pub async fn blobs_evicted_is_noop_for_wire_compat_test() +-> Result<(), Box> { + let cas_endpoint = "grpc://192.168.1.10:50081"; + let test_context = setup_api_server_with_locality(cas_endpoint).await?; + + let d1 = DigestInfo::new([1u8; 32], 100); + + // Register d1. + test_context + .worker_stream + .send(Update::BlobsAvailable(BlobsAvailableNotification { + worker_cas_endpoint: String::new(), + digests: vec![d1.into()], + is_full_snapshot: false, + evicted_digests: vec![], + digest_infos: vec![], + cpu_load_pct: 0, + cached_directory_digests: vec![], + added_subtree_digests: vec![], + removed_subtree_digests: vec![], + is_full_subtree_snapshot: false, + })) + .await + .map_err(|e| make_err!(tonic::Code::Internal, "Error sending: {e}"))?; + tokio::time::sleep(Duration::from_millis(50)).await; + + // Send BlobsEvicted -- should be a no-op (handler returns Ok(())). + // The old BlobsEvicted RPC is kept for wire compatibility but ignored. + test_context + .worker_stream + .send(Update::BlobsEvicted(BlobsEvictedNotification { + worker_cas_endpoint: String::new(), + digests: vec![d1.into()], + })) + .await + .map_err(|e| make_err!(tonic::Code::Internal, "Error sending: {e}"))?; + tokio::time::sleep(Duration::from_millis(50)).await; + + // d1 should STILL be present because BlobsEvicted is now a no-op. 
+ let map = test_context.locality_map.read(); + assert_eq!( + map.lookup_workers(&d1).len(), + 1, + "d1 should still be present -- BlobsEvicted is a no-op for wire compat" + ); + + Ok(()) +} diff --git a/nativelink-store/Cargo.toml b/nativelink-store/Cargo.toml index 5a0a62928..df450c440 100644 --- a/nativelink-store/Cargo.toml +++ b/nativelink-store/Cargo.toml @@ -33,7 +33,7 @@ bincode = { version = "2.0.1", default-features = false, features = [ "alloc", "serde", ] } -blake3 = { version = "1.8.0", default-features = false } +blake3 = { version = "1.8.0", default-features = false, features = ["std", "rayon"] } byteorder = { version = "1.5.0", default-features = false } bytes = { version = "1.10.1", default-features = false } const_format = { version = "0.2.34", default-features = false } @@ -53,7 +53,7 @@ hyper = { version = "1.6.0", default-features = false } hyper-rustls = { version = "0.27.5", default-features = false, features = [ "http1", "http2", - "ring", + "aws-lc-rs", "rustls-native-certs", "rustls-platform-verifier", ] } @@ -64,13 +64,13 @@ mongodb = { version = "3", features = [ "compat-3-0-0", "rustls-tls", ], default-features = false } -opentelemetry = { version = "0.29.1", default-features = false } +opentelemetry = { version = "0.31.0", default-features = false } parking_lot = { version = "0.12.3", features = [ "arc_lock", "send_guard", ], default-features = false } patricia_tree = { version = "0.9.0", default-features = false } -prost = { version = "0.13.5", default-features = false } +prost = { version = "0.14.3", default-features = false } rand = { version = "0.9.0", default-features = false, features = [ "thread_rng", ] } @@ -83,13 +83,13 @@ redis = { version = "1.0.0", default-features = false, features = [ "tokio-comp", ] } regex = { version = "1.11.1", default-features = false } -reqwest = { version = "0.12", default-features = false } -reqwest-middleware = { version = "0.4.2", default-features = false } +reqwest = { version = "0.13.2", default-features = false } +reqwest-middleware = { version = "0.5.1", default-features = false } rustls = { version = "0.23.27", default-features = false, features = [] } rustls-pki-types = { version = "1.13.1", default-features = false } serde = { version = "1.0.219", default-features = false } serde_json = { version = "1.0.140", default-features = false } -sha2 = { version = "0.10.8", default-features = false } +sha2 = { version = "0.10.8", default-features = false, features = ["asm"] } tokio = { version = "1.44.1", features = [ "fs", "io-util", @@ -100,8 +100,8 @@ tokio-stream = { version = "0.1.17", features = [ "fs", ], default-features = false } tokio-util = { version = "0.7.14", default-features = false } -tonic = { version = "0.13.0", features = [ - "tls-ring", +tonic = { version = "0.14.5", features = [ + "tls-aws-lc", "transport", ], default-features = false } tracing = { version = "0.1.41", default-features = false } diff --git a/nativelink-store/src/ac_utils.rs b/nativelink-store/src/ac_utils.rs index 7e24270cb..1a72ca134 100644 --- a/nativelink-store/src/ac_utils.rs +++ b/nativelink-store/src/ac_utils.rs @@ -24,8 +24,10 @@ use futures::TryFutureExt; use nativelink_error::{Code, Error, ResultExt}; use nativelink_util::common::DigestInfo; use nativelink_util::digest_hasher::DigestHasher; +use nativelink_util::log_utils::throughput_mbps; use nativelink_util::store_trait::{StoreKey, StoreLike}; use prost::Message; +use tracing::debug; // NOTE(aaronmondal) From some local testing it looks like action cache items are rarely 
greater than // 1.2k. Giving a bit more just in case to reduce allocs. @@ -104,15 +106,25 @@ pub async fn serialize_and_upload_message<'a, T: Message>( let mut buffer = BytesMut::with_capacity(message.encoded_len()); let digest = message_to_digest(message, &mut buffer, hasher) .err_tip(|| "In serialize_and_upload_message")?; + let size_bytes = buffer.len() as u64; // Note: For unknown reasons we appear to be hitting: // https://github.com/rust-lang/rust/issues/92096 // or a similar issue if we try to use the non-store driver function, so we // are using the store driver function here. + let start = std::time::Instant::now(); cas_store .as_store_driver_pin() .update_oneshot(digest.into(), buffer.freeze()) .await .err_tip(|| "In serialize_and_upload_message")?; + let elapsed = start.elapsed(); + debug!( + ?digest, + size_bytes, + elapsed_ms = elapsed.as_millis() as u64, + throughput_mbps = format!("{:.1}", throughput_mbps(size_bytes, elapsed)), + "serialize_and_upload_message: CAS write completed", + ); Ok(digest) } diff --git a/nativelink-store/src/callback_utils.rs b/nativelink-store/src/callback_utils.rs index a18f20c52..4cc3ed405 100644 --- a/nativelink-store/src/callback_utils.rs +++ b/nativelink-store/src/callback_utils.rs @@ -17,22 +17,21 @@ use core::pin::Pin; use std::sync::Arc; use nativelink_util::evicting_map; -use nativelink_util::store_trait::{RemoveItemCallback, StoreKey}; +use nativelink_util::store_trait::{ItemCallback, StoreKey}; -// Generic struct to hold a RemoveItemCallback ref for the purposes -// of a RemoveStateCallback call +// Generic struct to hold an ItemCallback ref for the purposes of an item callback call #[derive(Debug)] -pub struct RemoveItemCallbackHolder { - callback: Arc<dyn RemoveItemCallback>, +pub struct ItemCallbackHolder { + callback: Arc<dyn ItemCallback>, } -impl RemoveItemCallbackHolder { - pub fn new(callback: Arc<dyn RemoveItemCallback>) -> Self { +impl ItemCallbackHolder { + pub fn new(callback: Arc<dyn ItemCallback>) -> Self { Self { callback } } } -impl<'a, Q> evicting_map::RemoveItemCallback<Q> for RemoveItemCallbackHolder +impl<'a, Q> evicting_map::ItemCallback<Q> for ItemCallbackHolder where Q: Borrow<StoreKey<'a>>, @@ -42,4 +41,9 @@ where let store_key = store_key.borrow().into_owned(); Box::pin(async move { callback.callback(store_key).await }) } + + fn on_insert(&self, store_key: &Q, size: u64) { + let store_key: &StoreKey<'_> = Borrow::<StoreKey<'_>>::borrow(store_key); + self.callback.on_insert(store_key.borrow().into_owned(), size); + } } diff --git a/nativelink-store/src/completeness_checking_store.rs b/nativelink-store/src/completeness_checking_store.rs index bbdbde8d9..6eb90f548 100644 --- a/nativelink-store/src/completeness_checking_store.rs +++ b/nativelink-store/src/completeness_checking_store.rs @@ -29,7 +29,7 @@ use nativelink_util::common::DigestInfo; use nativelink_util::health_utils::{HealthStatusIndicator, default_health_status_indicator}; use nativelink_util::metrics_utils::CounterWithTime; use nativelink_util::store_trait::{ - RemoveItemCallback, Store, StoreDriver, StoreKey, StoreLike, UploadSizeInfo, + ItemCallback, Store, StoreDriver, StoreKey, StoreLike, UploadSizeInfo, }; use parking_lot::Mutex; use tokio::sync::Notify; @@ -390,12 +390,12 @@ impl StoreDriver for CompletenessCheckingStore { self } - fn register_remove_callback( + fn register_item_callback( self: Arc<Self>, - callback: Arc<dyn RemoveItemCallback>, + callback: Arc<dyn ItemCallback>, ) -> Result<(), Error> { - self.ac_store.register_remove_callback(callback.clone())?; - self.cas_store.register_remove_callback(callback)?; + self.ac_store.register_item_callback(callback.clone())?; +
self.cas_store.register_item_callback(callback)?; Ok(()) } } diff --git a/nativelink-store/src/compression_store.rs b/nativelink-store/src/compression_store.rs index 345e06703..71655170e 100644 --- a/nativelink-store/src/compression_store.rs +++ b/nativelink-store/src/compression_store.rs @@ -31,7 +31,7 @@ use nativelink_util::buf_channel::{ use nativelink_util::health_utils::{HealthStatusIndicator, default_health_status_indicator}; use nativelink_util::spawn; use nativelink_util::store_trait::{ - RemoveItemCallback, Store, StoreDriver, StoreKey, StoreLike, UploadSizeInfo, + ItemCallback, Store, StoreDriver, StoreKey, StoreLike, UploadSizeInfo, }; use serde::{Deserialize, Serialize}; @@ -44,7 +44,7 @@ pub const CURRENT_STREAM_FORMAT_VERSION: u8 = 1; // Default block size that will be used to slice stream into. pub const DEFAULT_BLOCK_SIZE: u32 = 64 * 1024; -const U32_SZ: u64 = size_of::() as u64; +const U32_SZ: u64 = size_of::() as u64; // We use a custom frame format here because I wanted the ability in the future to: // * Read a random part of the data without needing to parse entire file. @@ -630,14 +630,16 @@ impl StoreDriver for CompressionStore { }; let (read_result, get_part_fut_result) = tokio::join!(read_fut, get_part_fut); - if let Err(mut e) = read_result { - // We may need to propagate the error from reading the data through first. - if let Err(err) = get_part_fut_result { - e = err.merge(e); - } - return Err(e); + // Propagate errors from both futures. Previously, if read_fut + // succeeded but get_part_fut failed (e.g., inner store returned + // NotFound), the error was silently swallowed — masking real + // data-loss errors from the caller. + match (read_result, get_part_fut_result) { + (Ok(()), Ok(())) => Ok(()), + (Err(e), Ok(())) => Err(e), + (Ok(()), Err(e)) => Err(e), + (Err(read_err), Err(get_err)) => Err(get_err.merge(read_err)), } - Ok(()) } fn inner_store(&self, _digest: Option) -> &dyn StoreDriver { @@ -652,11 +654,11 @@ impl StoreDriver for CompressionStore { self } - fn register_remove_callback( + fn register_item_callback( self: Arc, - callback: Arc, + callback: Arc, ) -> Result<(), Error> { - self.inner_store.register_remove_callback(callback) + self.inner_store.register_item_callback(callback) } } diff --git a/nativelink-store/src/dedup_store.rs b/nativelink-store/src/dedup_store.rs index 252411a45..c10edd893 100644 --- a/nativelink-store/src/dedup_store.rs +++ b/nativelink-store/src/dedup_store.rs @@ -27,7 +27,7 @@ use nativelink_util::common::DigestInfo; use nativelink_util::fastcdc::FastCDC; use nativelink_util::health_utils::{HealthStatusIndicator, default_health_status_indicator}; use nativelink_util::store_trait::{ - RemoveItemCallback, Store, StoreDriver, StoreKey, StoreLike, UploadSizeInfo, + ItemCallback, Store, StoreDriver, StoreKey, StoreLike, UploadSizeInfo, }; use serde::{Deserialize, Serialize}; use tokio_util::codec::FramedRead; @@ -209,16 +209,13 @@ impl StoreDriver for DedupStore { .map_ok(|frame| async move { let hash = blake3::hash(&frame[..]).into(); let index_entry = DigestInfo::new(hash, frame.len() as u64); - if self - .content_store - .has(index_entry) - .await - .err_tip(|| "Failed to call .has() in DedupStore::update()")? - .is_some() - { - // If our store has this digest, we don't need to upload it. - return Result::<_, Error>::Ok(index_entry); - } + // Always upload the chunk unconditionally. 
A previous has() + // check here skipped the upload when the chunk appeared to + // exist, but the chunk could be evicted between that check + // and the index commit — leaving the index pointing to a + // missing chunk and causing "Lost inputs" errors. + // Content-addressed upload is idempotent, so re-uploading + // an existing chunk is safe and cheap. self.content_store .update_oneshot(index_entry, frame) .await @@ -379,13 +376,13 @@ impl StoreDriver for DedupStore { self } - fn register_remove_callback( + fn register_item_callback( self: Arc, - callback: Arc, + callback: Arc, ) -> Result<(), Error> { self.index_store - .register_remove_callback(callback.clone())?; - self.content_store.register_remove_callback(callback)?; + .register_item_callback(callback.clone())?; + self.content_store.register_item_callback(callback)?; Ok(()) } } diff --git a/nativelink-store/src/existence_cache_store.rs b/nativelink-store/src/existence_cache_store.rs index a59d48e70..fb9370b50 100644 --- a/nativelink-store/src/existence_cache_store.rs +++ b/nativelink-store/src/existence_cache_store.rs @@ -29,7 +29,7 @@ use nativelink_util::evicting_map::{EvictingMap, LenEntry}; use nativelink_util::health_utils::{HealthStatus, HealthStatusIndicator}; use nativelink_util::instant_wrapper::InstantWrapper; use nativelink_util::store_trait::{ - RemoveItemCallback, Store, StoreDriver, StoreKey, StoreLike, UploadSizeInfo, + ItemCallback, Store, StoreDriver, StoreKey, StoreLike, UploadSizeInfo, }; use parking_lot::Mutex; use tracing::{debug, info, trace}; @@ -59,7 +59,7 @@ pub struct ExistenceCacheStore { // as if it immediately expires them, we should only apply the remove callbacks // afterwards. If this is None, we're not pausing; if it's Some it's the location to // store them in temporarily - pause_remove_callbacks: Mutex>>>, + pause_item_callbacks: Mutex>>>, } impl ExistenceCacheStore { @@ -68,7 +68,7 @@ impl ExistenceCacheStore { } } -impl RemoveItemCallback for ExistenceCacheStore { +impl ItemCallback for ExistenceCacheStore { fn callback<'a>( &'a self, store_key: StoreKey<'a>, @@ -89,14 +89,14 @@ struct ExistenceCacheCallback { cache: Weak>, } -impl RemoveItemCallback for ExistenceCacheCallback { +impl ItemCallback for ExistenceCacheCallback { fn callback<'a>( &'a self, store_key: StoreKey<'a>, ) -> Pin + Send + 'a>> { let cache = self.cache.upgrade(); if let Some(local_cache) = cache { - if let Some(callbacks) = local_cache.pause_remove_callbacks.lock().as_mut() { + if let Some(callbacks) = local_cache.pause_item_callbacks.lock().as_mut() { callbacks.push(store_key.into_owned()); } else { let store_key = store_key.into_owned(); @@ -109,6 +109,7 @@ impl RemoveItemCallback for ExistenceCacheCallback { } Box::pin(async {}) } + } impl ExistenceCacheStore { @@ -122,13 +123,13 @@ impl ExistenceCacheStore { let existence_cache_store = Arc::new(Self { inner_store, existence_cache: EvictingMap::new(eviction_policy, anchor_time), - pause_remove_callbacks: Mutex::new(None), + pause_item_callbacks: Mutex::new(None), }); let other_ref = Arc::downgrade(&existence_cache_store); existence_cache_store .inner_store - .register_remove_callback(Arc::new(ExistenceCacheCallback { cache: other_ref })) - .expect("Register remove callback should work"); + .register_item_callback(Arc::new(ExistenceCacheCallback { cache: other_ref })) + .expect("Register item callback should work"); existence_cache_store } @@ -233,21 +234,33 @@ impl StoreDriver for ExistenceCacheStore { size_info: UploadSizeInfo, ) -> Result<(), Error> { let digest = 
key.into_digest(); + // Check the inner store directly, bypassing the existence cache. + // The existence cache may have a stale positive for a blob that was + // evicted from the inner store (the async eviction callback may not + // have fired yet). Trusting the cache here would skip the upload, + // causing Bazel's "Lost inputs no longer available remotely" error. let mut exists = [None]; - self.inner_has_with_results(&[digest], &mut exists) + self.inner_store + .has_with_results(&[digest.into()], &mut exists) .await .err_tip(|| "In ExistenceCacheStore::update")?; if exists[0].is_some() { - // We need to drain the reader to avoid the writer complaining that we dropped - // the connection prematurely. + // Blob genuinely exists in the inner store — safe to skip. reader .drain() .await .err_tip(|| "In ExistenceCacheStore::update")?; + // Refresh the existence cache since we verified it exists. + let _ = self + .existence_cache + .insert(digest, ExistenceItem(exists[0].unwrap())) + .await; return Ok(()); } + // If the existence cache had a stale entry, remove it now. + self.existence_cache.remove(&digest).await; { - let mut locked_callbacks = self.pause_remove_callbacks.lock(); + let mut locked_callbacks = self.pause_item_callbacks.lock(); if locked_callbacks.is_none() { locked_callbacks.replace(vec![]); } @@ -256,15 +269,37 @@ impl StoreDriver for ExistenceCacheStore { let result = self.inner_store.update(digest, reader, size_info).await; if result.is_ok() { trace!(?digest, "Inserting into existence cache"); - if let UploadSizeInfo::ExactSize(size) = size_info { - let _ = self - .existence_cache - .insert(digest, ExistenceItem(size)) - .await; + // Cache on both ExactSize and MaxSize — the digest carries the + // authoritative size for content-addressed blobs. + let size = match size_info { + UploadSizeInfo::ExactSize(size) => size, + UploadSizeInfo::MaxSize(_) => digest.size_bytes(), + }; + let _ = self + .existence_cache + .insert(digest, ExistenceItem(size)) + .await; + + // Diagnostic: verify the blob actually persisted in the inner store. + // If this fires, it means the inner store reported success but the + // blob is not findable immediately after write. + let mut verify = [None]; + if let Ok(()) = self + .inner_store + .has_with_results(&[digest.into()], &mut verify) + .await + { + if verify[0].is_none() { + tracing::error!( + ?digest, + "CRITICAL: inner store update() succeeded but has() returns \ + None immediately after! Blob was NOT persisted to slow store.", + ); + } } } { - let maybe_keys = self.pause_remove_callbacks.lock().take(); + let maybe_keys = self.pause_item_callbacks.lock().take(); if let Some(keys) = maybe_keys { let mut callbacks: FuturesUnordered<_> = keys .into_iter() @@ -288,11 +323,20 @@ impl StoreDriver for ExistenceCacheStore { .inner_store .get_part(digest, writer, offset, length) .await; - if result.is_ok() { - let _ = self - .existence_cache - .insert(digest, ExistenceItem(digest.size_bytes())) - .await; + match &result { + Ok(()) => { + let _ = self + .existence_cache + .insert(digest, ExistenceItem(digest.size_bytes())) + .await; + } + Err(err) if err.code == nativelink_error::Code::NotFound => { + // Blob was evicted from the inner store — remove the stale + // existence cache entry so subsequent has() calls get an + // accurate result. 
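// Aside, not part of the patch: the stale-positive invalidation pattern
// above in miniature. A plain HashMap stands in for the EvictingMap and
// the hypothetical `inner_get` stands in for the inner store; on NotFound
// the cached "exists" entry must be dropped, otherwise later has() calls
// keep reporting a blob that is already gone.
use std::collections::HashMap;

#[derive(Debug, PartialEq)]
enum GetError {
    NotFound,
    Other(String),
}

fn get_with_existence_cache(
    existence_cache: &mut HashMap<String, u64>,
    key: &str,
    inner_get: impl Fn(&str) -> Result<Vec<u8>, GetError>,
) -> Result<Vec<u8>, GetError> {
    match inner_get(key) {
        Ok(data) => {
            // A successful read proves existence; refresh the cache entry.
            existence_cache.insert(key.to_string(), data.len() as u64);
            Ok(data)
        }
        Err(GetError::NotFound) => {
            // Evicted underneath us: drop the stale entry so later has()
            // calls are accurate again.
            existence_cache.remove(key);
            Err(GetError::NotFound)
        }
        // Transient errors say nothing about existence; keep the cache.
        Err(other) => Err(other),
    }
}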
+ self.existence_cache.remove(&digest).await; + } + Err(_) => {} } result } @@ -309,11 +353,11 @@ impl StoreDriver for ExistenceCacheStore { self } - fn register_remove_callback( + fn register_item_callback( self: Arc, - callback: Arc, + callback: Arc, ) -> Result<(), Error> { - self.inner_store.register_remove_callback(callback) + self.inner_store.register_item_callback(callback) } } diff --git a/nativelink-store/src/fast_slow_store.rs b/nativelink-store/src/fast_slow_store.rs index 1a52d7577..da0c55a4a 100644 --- a/nativelink-store/src/fast_slow_store.rs +++ b/nativelink-store/src/fast_slow_store.rs @@ -22,17 +22,18 @@ use std::ffi::OsString; use std::sync::{Arc, Weak}; use async_trait::async_trait; +use bytes::Bytes; use futures::{FutureExt, join}; use nativelink_config::stores::{FastSlowSpec, StoreDirection}; use nativelink_error::{Code, Error, ResultExt, make_err}; use nativelink_metric::MetricsComponent; use nativelink_util::buf_channel::{ - DropCloserReadHalf, DropCloserWriteHalf, make_buf_channel_pair, + DropCloserReadHalf, DropCloserWriteHalf, make_buf_channel_pair_with_size, }; use nativelink_util::fs; use nativelink_util::health_utils::{HealthStatusIndicator, default_health_status_indicator}; use nativelink_util::store_trait::{ - RemoveItemCallback, Store, StoreDriver, StoreKey, StoreLike, StoreOptimizations, + ItemCallback, Store, StoreDriver, StoreKey, StoreLike, StoreOptimizations, UploadSizeInfo, slow_update_store_with_file, }; use parking_lot::Mutex; @@ -100,9 +101,11 @@ impl Drop for LoaderGuard<'_> { return; }; + // Pre-compute the owned key outside the lock to minimize lock hold time. + let owned_key = self.key.borrow().into_owned(); let mut guard = store.populating_digests.lock(); if let std::collections::hash_map::Entry::Occupied(occupied_entry) = - guard.entry(self.key.borrow().into_owned()) + guard.entry(owned_key) { if Arc::ptr_eq(occupied_entry.get(), &loader) { drop(loader); @@ -136,6 +139,14 @@ impl FastSlowStore { &self.slow_store } + pub const fn fast_direction(&self) -> StoreDirection { + self.fast_direction + } + + pub const fn slow_direction(&self) -> StoreDirection { + self.slow_direction + } + pub fn get_arc(&self) -> Option> { self.weak_self.upgrade() } @@ -143,10 +154,12 @@ impl FastSlowStore { fn get_loader<'a>(&self, key: StoreKey<'a>) -> LoaderGuard<'a> { // Get a single loader instance that's used to populate the fast store // for this digest. If another request comes in then it's de-duplicated. + // Pre-compute the owned key outside the lock to minimize lock hold time. + let owned_key = key.borrow().into_owned(); let loader = match self .populating_digests .lock() - .entry(key.borrow().into_owned()) + .entry(owned_key) { std::collections::hash_map::Entry::Occupied(occupied_entry) => { occupied_entry.get().clone() @@ -187,6 +200,11 @@ impl FastSlowStore { .await .err_tip(|| "Failed to run has() on slow store")? .ok_or_else(|| { + debug!( + %key, + slow_store = %self.slow_store.inner_store(Some(key.borrow())).get_name(), + "CAS read miss: blob not found in slow store" + ); make_err!( Code::NotFound, "Object {} not found in either fast or slow store. \ @@ -201,8 +219,10 @@ impl FastSlowStore { let mut bytes_received: u64 = 0; let mut counted_hit = false; - let (mut fast_tx, fast_rx) = make_buf_channel_pair(); - let (slow_tx, mut slow_rx) = make_buf_channel_pair(); + // Use 128 slots (~32MiB at 256KiB chunks) for dual-store + // read-through to reduce backpressure between fast and slow stores. 
+ let (mut fast_tx, fast_rx) = make_buf_channel_pair_with_size(128); + let (slow_tx, mut slow_rx) = make_buf_channel_pair_with_size(128); let data_stream_fut = async move { let mut maybe_writer_pin = maybe_writer.map(Pin::new); loop { @@ -276,20 +296,10 @@ impl FastSlowStore { } } - /// Ensure our fast store is populated. This should be kept as a low - /// cost function. Since the data itself is shared and not copied it should be fairly - /// low cost to just discard the data, but does cost a few mutex locks while - /// streaming. - pub async fn populate_fast_store(&self, key: StoreKey<'_>) -> Result<(), Error> { - let maybe_size_info = self - .fast_store - .has(key.borrow()) - .await - .err_tip(|| "While querying in populate_fast_store")?; - if maybe_size_info.is_some() { - return Ok(()); - } - + /// Internal helper: copy a blob from the slow store into the fast store, + /// using the de-duplicating loader. Assumes the caller has already verified + /// the blob is not in the fast store (or does not care). + async fn copy_slow_to_fast(&self, key: StoreKey<'_>) -> Result<(), Error> { // If the fast store is noop or read only or update only then this is an error. if self .fast_store @@ -312,6 +322,31 @@ impl FastSlowStore { .err_tip(|| "Failed to populate()") } + /// Ensure our fast store is populated. This should be kept as a low + /// cost function. Since the data itself is shared and not copied it should be fairly + /// low cost to just discard the data, but does cost a few mutex locks while + /// streaming. + pub async fn populate_fast_store(&self, key: StoreKey<'_>) -> Result<(), Error> { + let maybe_size_info = self + .fast_store + .has(key.borrow()) + .await + .err_tip(|| "While querying in populate_fast_store")?; + if maybe_size_info.is_some() { + return Ok(()); + } + + self.copy_slow_to_fast(key).await + } + + /// Like [`populate_fast_store`](Self::populate_fast_store) but skips the + /// `has()` check on the fast store. Use this when the caller has already + /// verified that the blob is missing from the fast store (e.g. via a prior + /// batch `has_with_results` call) to avoid a redundant existence check. + pub async fn populate_fast_store_unchecked(&self, key: StoreKey<'_>) -> Result<(), Error> { + self.copy_slow_to_fast(key).await + } + /// Returns the range of bytes that should be sent given a slice bounds /// offset so the output range maps the `received_range.start` to 0. // TODO(palfrey) This should be put into utils, as this logic is used @@ -396,8 +431,10 @@ impl StoreDriver for FastSlowStore { return self.slow_store.update(key, reader, size_info).await; } - let (mut fast_tx, fast_rx) = make_buf_channel_pair(); - let (mut slow_tx, slow_rx) = make_buf_channel_pair(); + // Use 128 slots (~32MiB at 256KiB chunks) for dual-store + // update to reduce backpressure between fast and slow stores. 
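// Aside, not part of the patch: why the slot count matters, sketched with
// plain tokio mpsc channels under the assumption that
// make_buf_channel_pair_with_size behaves like a bounded queue of chunks.
// With capacity 1 the producer runs in lock-step with the slowest leg;
// with 128 slots a fast leg can drain ahead while the slow leg catches up.
use std::time::Duration;
use tokio::sync::mpsc;

async fn tee_to_two_sinks(chunks: Vec<Vec<u8>>) {
    // One bounded queue per downstream leg, mirroring fast_tx/slow_tx.
    let (fast_tx, mut fast_rx) = mpsc::channel::<Vec<u8>>(128);
    let (slow_tx, mut slow_rx) = mpsc::channel::<Vec<u8>>(128);

    let producer = async move {
        for chunk in chunks {
            // send() only blocks when a leg is a full 128 chunks behind.
            let _ = fast_tx.send(chunk.clone()).await;
            let _ = slow_tx.send(chunk).await;
        }
        // Dropping the senders signals EOF to both consumers.
    };
    let fast_leg = async move { while fast_rx.recv().await.is_some() {} };
    let slow_leg = async move {
        while slow_rx.recv().await.is_some() {
            // Simulate a slower sink, e.g. a remote object store.
            tokio::time::sleep(Duration::from_millis(1)).await;
        }
    };
    tokio::join!(producer, fast_leg, slow_leg);
}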
+ let (mut fast_tx, fast_rx) = make_buf_channel_pair_with_size(128); + let (mut slow_tx, slow_rx) = make_buf_channel_pair_with_size(128); let key_debug = format!("{key:?}"); trace!( @@ -460,26 +497,45 @@ impl StoreDriver for FastSlowStore { } }; - let fast_store_fut = self.fast_store.update(key.borrow(), fast_rx, size_info); - let slow_store_fut = self.slow_store.update(key.borrow(), slow_rx, size_info); + let fast_start = std::time::Instant::now(); + let fast_store_fut = async { + let res = self.fast_store.update(key.borrow(), fast_rx, size_info).await; + (res, fast_start.elapsed()) + }; + let slow_start = std::time::Instant::now(); + let slow_store_fut = async { + let res = self.slow_store.update(key.borrow(), slow_rx, size_info).await; + (res, slow_start.elapsed()) + }; - let (data_stream_res, fast_res, slow_res) = + let (data_stream_res, (fast_res, fast_elapsed), (slow_res, slow_elapsed)) = join!(data_stream_fut, fast_store_fut, slow_store_fut); let total_elapsed = update_start.elapsed(); + let fast_ms = fast_elapsed.as_millis(); + let slow_ms = slow_elapsed.as_millis(); + let slower_leg = if fast_ms >= slow_ms { "fast" } else { "slow" }; if data_stream_res.is_err() || fast_res.is_err() || slow_res.is_err() { warn!( key = %key_debug, elapsed_ms = total_elapsed.as_millis(), + fast_ms, + slow_ms, + slower_leg, + total_bytes = bytes_sent, data_stream_ok = data_stream_res.is_ok(), fast_store_ok = fast_res.is_ok(), slow_store_ok = slow_res.is_ok(), "FastSlowStore::update: completed with error(s)", ); } else { - trace!( + debug!( key = %key_debug, elapsed_ms = total_elapsed.as_millis(), + fast_ms, + slow_ms, + slower_leg, + total_bytes = bytes_sent, "FastSlowStore::update: completed successfully", ); } @@ -487,6 +543,80 @@ impl StoreDriver for FastSlowStore { Ok(()) } + async fn update_oneshot( + self: Pin<&Self>, + key: StoreKey<'_>, + data: Bytes, + ) -> Result<(), Error> { + let ignore_slow = self + .slow_store + .inner_store(Some(key.borrow())) + .optimized_for(StoreOptimizations::NoopUpdates) + || self.slow_direction == StoreDirection::ReadOnly + || self.slow_direction == StoreDirection::Get; + let ignore_fast = self + .fast_store + .inner_store(Some(key.borrow())) + .optimized_for(StoreOptimizations::NoopUpdates) + || self.fast_direction == StoreDirection::ReadOnly + || self.fast_direction == StoreDirection::Get; + + if ignore_slow && ignore_fast { + return Ok(()); + } + if ignore_slow { + return self.fast_store.update_oneshot(key, data).await; + } + if ignore_fast { + return self.slow_store.update_oneshot(key, data).await; + } + + let oneshot_start = std::time::Instant::now(); + let key_debug = format!("{key:?}"); + let data_len = data.len(); + let fast_oneshot_start = std::time::Instant::now(); + let data_for_slow = data.clone(); + let fast_fut = async { + let res = self.fast_store.update_oneshot(key.borrow(), data).await; + (res, fast_oneshot_start.elapsed()) + }; + let slow_oneshot_start = std::time::Instant::now(); + let slow_fut = async { + let res = self.slow_store.update_oneshot(key.borrow(), data_for_slow).await; + (res, slow_oneshot_start.elapsed()) + }; + let ((fast_res, fast_elapsed), (slow_res, slow_elapsed)) = join!(fast_fut, slow_fut); + let total_elapsed = oneshot_start.elapsed(); + let fast_ms = fast_elapsed.as_millis(); + let slow_ms = slow_elapsed.as_millis(); + let slower_leg = if fast_ms >= slow_ms { "fast" } else { "slow" }; + if fast_res.is_err() || slow_res.is_err() { + warn!( + key = %key_debug, + elapsed_ms = total_elapsed.as_millis(), + fast_ms, + slow_ms, + 
slower_leg, + data_len, + fast_store_ok = fast_res.is_ok(), + slow_store_ok = slow_res.is_ok(), + "FastSlowStore::update_oneshot: completed with error(s)", + ); + } else { + debug!( + key = %key_debug, + elapsed_ms = total_elapsed.as_millis(), + fast_ms, + slow_ms, + slower_leg, + data_len, + "FastSlowStore::update_oneshot: completed", + ); + } + fast_res.merge(slow_res)?; + Ok(()) + } + /// `FastSlowStore` has optimizations for dealing with files. fn optimized_for(&self, optimization: StoreOptimizations) -> bool { optimization == StoreOptimizations::FileUpdates @@ -520,10 +650,10 @@ impl StoreDriver for FastSlowStore { { trace!("FastSlowStore::update_with_whole_file: uploading to slow_store"); let slow_start = std::time::Instant::now(); - slow_update_store_with_file( + file = slow_update_store_with_file( self.slow_store.as_store_driver_pin(), key.borrow(), - &mut file, + file, upload_size, ) .await @@ -555,10 +685,10 @@ impl StoreDriver for FastSlowStore { || self.fast_direction == StoreDirection::ReadOnly || self.fast_direction == StoreDirection::Get; if !ignore_fast { - slow_update_store_with_file( + file = slow_update_store_with_file( self.fast_store.as_store_driver_pin(), key.borrow(), - &mut file, + file, upload_size, ) .await @@ -575,7 +705,7 @@ impl StoreDriver for FastSlowStore { .await; } - slow_update_store_with_file(self, key, &mut file, upload_size) + let file = slow_update_store_with_file(self, key, file, upload_size) .await .err_tip(|| "In FastSlowStore::update_with_whole_file")?; Ok(Some(file)) @@ -588,19 +718,34 @@ impl StoreDriver for FastSlowStore { offset: u64, length: Option, ) -> Result<(), Error> { - // TODO(palfrey) Investigate if we should maybe ignore errors here instead of - // forwarding them up. if self.fast_store.has(key.borrow()).await?.is_some() { - self.metrics - .fast_store_hit_count - .fetch_add(1, Ordering::Acquire); - self.fast_store - .get_part(key, writer.borrow_mut(), offset, length) - .await?; - self.metrics - .fast_store_downloaded_bytes - .fetch_add(writer.get_bytes_written(), Ordering::Acquire); - return Ok(()); + // Try the fast store first. If the item was evicted between the + // has() check and this get_part() call (TOCTOU race), fall through + // to the slow-store path instead of propagating NotFound. + match self + .fast_store + .get_part(key.borrow(), writer.borrow_mut(), offset, length) + .await + { + Ok(()) => { + self.metrics + .fast_store_hit_count + .fetch_add(1, Ordering::Acquire); + self.metrics + .fast_store_downloaded_bytes + .fetch_add(writer.get_bytes_written(), Ordering::Acquire); + return Ok(()); + } + Err(err) if err.code == Code::NotFound && writer.get_bytes_written() == 0 => { + // Item was evicted between has() and get_part(). + // Only safe to fall through if no bytes were written yet. + debug!( + ?key, + "Fast store item evicted between has() and get_part(), falling through to slow store" + ); + } + Err(err) => return Err(err), + } } // If the fast store is noop or read only or update only then bypass it. @@ -630,15 +775,36 @@ impl StoreDriver for FastSlowStore { }) .await?; - // If we didn't stream then re-enter which will stream from the fast - // store, or retry the download. We should not get in a loop here - // because OnceCell has the good sense to retry for all callers so in - // order to get here the fast store will have been populated. There's - // an outside chance it was evicted, but that's slim. + // If we were a waiter (not the streaming thread), read from the + // fast store which was just populated. 
If the blob was evicted + // between populate and this read, fall back directly to the slow + // store instead of recursing (which could loop indefinitely under + // heavy eviction pressure). if let Some(writer) = writer.take() { - self.get_part(key, writer, offset, length).await + let bytes_before = writer.get_bytes_written(); + match self + .fast_store + .get_part(key.borrow(), &mut *writer, offset, length) + .await + { + Ok(()) => Ok(()), + Err(err) + if err.code == Code::NotFound + && writer.get_bytes_written() == bytes_before => + { + warn!( + ?key, + "Fast store item evicted immediately after population, \ + reading directly from slow store" + ); + self.slow_store + .get_part(key, &mut *writer, offset, length) + .await + } + Err(err) => Err(err), + } } else { - // This was the thread that did the streaming already, lucky duck. + // This was the thread that did the streaming already. Ok(()) } } @@ -655,12 +821,12 @@ impl StoreDriver for FastSlowStore { self } - fn register_remove_callback( + fn register_item_callback( self: Arc, - callback: Arc, + callback: Arc, ) -> Result<(), Error> { - self.fast_store.register_remove_callback(callback.clone())?; - self.slow_store.register_remove_callback(callback)?; + self.fast_store.register_item_callback(callback.clone())?; + self.slow_store.register_item_callback(callback)?; Ok(()) } } diff --git a/nativelink-store/src/filesystem_store.rs b/nativelink-store/src/filesystem_store.rs index 97f531043..81509569e 100644 --- a/nativelink-store/src/filesystem_store.rs +++ b/nativelink-store/src/filesystem_store.rs @@ -22,11 +22,11 @@ use std::time::SystemTime; use async_lock::RwLock; use async_trait::async_trait; -use bytes::{Bytes, BytesMut}; +use bytes::Bytes; use futures::stream::{StreamExt, TryStreamExt}; use futures::{Future, TryFutureExt}; use nativelink_config::stores::FilesystemSpec; -use nativelink_error::{Code, Error, ResultExt, make_err, make_input_err}; +use nativelink_error::{Code, Error, ResultExt, make_err}; use nativelink_metric::MetricsComponent; use nativelink_util::background_spawn; use nativelink_util::buf_channel::{ @@ -36,18 +36,20 @@ use nativelink_util::common::{DigestInfo, fs}; use nativelink_util::evicting_map::{EvictingMap, LenEntry}; use nativelink_util::health_utils::{HealthRegistryBuilder, HealthStatus, HealthStatusIndicator}; use nativelink_util::store_trait::{ - RemoveItemCallback, StoreDriver, StoreKey, StoreKeyBorrow, StoreOptimizations, UploadSizeInfo, + ItemCallback, StoreDriver, StoreKey, StoreKeyBorrow, StoreOptimizations, UploadSizeInfo, }; -use tokio::io::{AsyncReadExt, AsyncWriteExt, Take}; use tokio::sync::Semaphore; use tokio_stream::wrappers::ReadDirStream; -use tracing::{debug, error, info, trace, warn}; +use tracing::{debug, error, trace, warn}; -use crate::callback_utils::RemoveItemCallbackHolder; +use crate::callback_utils::ItemCallbackHolder; use crate::cas_utils::is_zero_digest; // Default size to allocate memory of the buffer when reading files. -const DEFAULT_BUFF_SIZE: usize = 32 * 1024; +// 256 KiB reduces syscalls by 4x compared to 64 KiB. At 10Gbps, 64 KiB reads +// cause ~19,500 syscalls/sec/stream; 256 KiB brings this down to ~4,900. +// Modern NVMe SSDs perform significantly better with larger read sizes. 
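// Aside, not part of the patch: the syscall arithmetic behind the quoted
// numbers. At a sustained 10 Gbps (1.25 GB/s) per stream, the read-call
// rate is just throughput divided by buffer size.
fn reads_per_second(throughput_bytes_per_sec: u64, buf_size: u64) -> u64 {
    throughput_bytes_per_sec / buf_size
}

#[test]
fn buffer_size_cuts_syscall_rate_4x() {
    let ten_gbps = 10_000_000_000u64 / 8; // 1.25 GB/s in bytes
    // ~19,073 reads/sec at 64 KiB vs ~4,768 at 256 KiB, the same
    // ballpark as the ~19,500 and ~4,900 figures quoted above.
    assert_eq!(reads_per_second(ten_gbps, 64 * 1024), 19_073);
    assert_eq!(reads_per_second(ten_gbps, 256 * 1024), 4_768);
}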
+const DEFAULT_BUFF_SIZE: usize = 256 * 1024;

 // Default block size of all major filesystems is 4KB
 const DEFAULT_BLOCK_SIZE: u64 = 4 * 1024;
@@ -139,7 +141,12 @@ impl Drop for EncodedFilePath {
                 .await
                 .err_tip(|| format!("Failed to remove file {}", file_path.display()));
             if let Err(err) = result {
-                error!(?file_path, ?err, "Failed to delete file",);
+                if err.code == Code::NotFound {
+                    // File already deleted (e.g. race between eviction paths).
+                    debug!(?file_path, "File already deleted, ignoring");
+                } else {
+                    error!(?file_path, ?err, "Failed to delete file");
+                }
             } else {
                 debug!(?file_path, "File deleted",);
             }
@@ -201,8 +208,7 @@ pub trait FileEntry: LenEntry + Send + Sync + Debug + 'static {
     fn read_file_part(
         &self,
         offset: u64,
-        length: u64,
-    ) -> impl Future<Output = Result<Take<fs::FileSlot>, Error>> + Send;
+    ) -> impl Future<Output = Result<fs::FileSlot, Error>> + Send;

     /// This function is a safe way to extract the file name of the underlying file. To protect users from
     /// accidentally creating undefined behavior we encourage users to do the logic they need to do with
@@ -297,10 +303,9 @@ impl FileEntry for FileEntryImpl {
     fn read_file_part(
         &self,
         offset: u64,
-        length: u64,
-    ) -> impl Future<Output = Result<Take<fs::FileSlot>, Error>> + Send {
+    ) -> impl Future<Output = Result<fs::FileSlot, Error>> + Send {
         self.get_file_path_locked(move |full_content_path| async move {
-            let file = fs::open_file(&full_content_path, offset, length)
+            let file = fs::open_file(&full_content_path, offset)
                 .await
                 .err_tip(|| {
                     format!(
@@ -368,9 +373,10 @@ impl LenEntry for FileEntryImpl {
     async fn unref(&self) {
         let mut encoded_file_path = self.encoded_file_path.write().await;
         if encoded_file_path.path_type == PathType::Temp {
-            // We are already a temp file that is now marked for deletion on drop.
-            // This is very rare, but most likely the rename into the content path failed.
-            warn!(
+            // Already a temp file marked for deletion on drop. This happens
+            // when the entry is evicted from the map before emplace_file
+            // renames it into the content path — expected under cache pressure.
+            debug!(
                 key = ?encoded_file_path.key,
                 "File is already a temp file",
             );
@@ -394,7 +400,7 @@
             key = ?encoded_file_path.key,
             ?from_path,
             ?to_path,
-            "Renamed file (unref)",
+            "Evicted blob from filesystem cache (unref)",
         );
         encoded_file_path.path_type = PathType::Temp;
         encoded_file_path.key = new_key;
@@ -421,7 +427,7 @@ pub fn key_from_file(file_name: &str, file_type: FileType) -> Result<StoreKey<'static>, Error>
 type FsEvictingMap<'a, Fe> =
-    EvictingMap<StoreKey<'a>, Arc<Fe>, SystemTime, RemoveItemCallbackHolder>;
+    EvictingMap<StoreKey<'a>, Arc<Fe>, SystemTime, ItemCallbackHolder>;

 async fn add_files_to_cache(
     evicting_map: &FsEvictingMap<'_, Fe>,
@@ -452,14 +458,28 @@
                 key: key.borrow().into_owned(),
             }),
         );
-        let time_since_anchor = anchor_time
-            .duration_since(atime)
-            .map_err(|_| make_input_err!("File access time newer than now"))?;
+        // Use a negative seconds_since_anchor for files that existed before
+        // the anchor time (startup). This correctly represents them as "older
+        // than anything inserted during runtime" in the EvictingMap timeline.
+        // Files with atime closer to startup get values closer to 0 (newer),
+        // while files not accessed for days get large negative values (older).
+        let seconds_since_anchor = if let Ok(before) = anchor_time.duration_since(atime) {
+            let secs = before.as_secs();
+            if secs > i32::MAX as u64 {
+                i32::MIN
+            } else {
+                -(secs as i32)
+            }
+        } else {
+            // atime is after anchor_time (file touched between capturing
+            // `now` and reading metadata) — treat as most-recently-used.
+ 0 + }; evicting_map .insert_with_time( key.into_owned().into(), Arc::new(file_entry), - i32::try_from(time_since_anchor.as_secs()).unwrap_or(i32::MAX), + seconds_since_anchor, ) .await; Ok(()) @@ -549,13 +569,19 @@ async fn add_files_to_cache( block_size: u64, folder: &str, ) -> Result<(), Error> { - let file_infos = read_files(Some(folder), shared_context).await?; + let mut file_infos = read_files(Some(folder), shared_context).await?; let file_type = match folder { STR_FOLDER => FileType::String, DIGEST_FOLDER => FileType::Digest, _ => panic!("Invalid folder type"), }; + // Sort by atime oldest-first so that the LRU cache ordering matches + // actual file access recency. Without this, items are inserted in + // directory-iteration order (random), causing recently-used files to + // be evicted while cold files survive. + file_infos.sort_by(|a, b| a.1.cmp(&b.1)); + let path_root = format!("{}/{folder}", shared_context.content_path); for (file_name, atime, data_size, _) in file_infos.into_iter().filter(|x| x.3) { @@ -639,6 +665,8 @@ pub struct FilesystemStore { rename_fn: fn(&OsStr, &OsStr) -> Result<(), std::io::Error>, /// Limits concurrent write operations to prevent disk I/O saturation. write_semaphore: Option, + /// Skip writes when a blob with the same key already exists (CAS dedup). + content_is_immutable: bool, } impl FilesystemStore { @@ -709,6 +737,7 @@ impl FilesystemStore { weak_self: weak_self.clone(), rename_fn, write_semaphore, + content_is_immutable: spec.content_is_immutable, })) } @@ -716,6 +745,28 @@ impl FilesystemStore { self.weak_self.upgrade() } + /// Returns all digest entries in the cache with their absolute last-access + /// timestamps (seconds since UNIX epoch). String-keyed entries are skipped. + /// This is a peek-only operation and does NOT promote entries in the LRU. + pub fn get_all_digests_with_timestamps(&self) -> Vec<(DigestInfo, i64)> { + self.evicting_map + .get_all_entries_with_timestamps() + .into_iter() + .filter_map(|(key_borrow, abs_timestamp)| { + match StoreKey::from(key_borrow) { + StoreKey::Digest(digest) => Some((digest, abs_timestamp)), + _ => None, + } + }) + .collect() + } + + /// Remove a digest's entry from the evicting map so the next + /// `populate_fast_store` is forced to re-download from the slow store. + pub async fn remove_entry_for_digest(&self, digest: &DigestInfo) { + self.evicting_map.remove(&digest.into()).await; + } + pub async fn get_file_entry_for_digest(&self, digest: &DigestInfo) -> Result, Error> { if is_zero_digest(digest) { return Ok(Arc::new(Fe::create( @@ -734,29 +785,56 @@ impl FilesystemStore { .ok_or_else(|| make_err!(Code::NotFound, "{digest} not found in filesystem store. This may indicate the file was evicted due to cache pressure. Consider increasing 'max_bytes' in your filesystem store's eviction_policy configuration.")) } + /// Batch-retrieves file entries for multiple digests in a single lock + /// acquisition on the EvictingMap, reducing contention compared to + /// calling `get_file_entry_for_digest()` individually for each digest. + pub async fn get_file_entries_batch( + &self, + digests: &[DigestInfo], + ) -> Vec>> { + // Separate zero digests (which don't go through evicting_map). + let store_keys: Vec> = digests + .iter() + .filter(|d| !is_zero_digest(**d)) + .map(|d| (*d).into()) + .collect(); + + let batch_results = self.evicting_map.get_many(store_keys.iter()).await; + + // Reassemble results, inserting zero-digest entries where needed. 
+ let mut batch_iter = batch_results.into_iter(); + digests + .iter() + .map(|digest| { + if is_zero_digest(*digest) { + Some(Arc::new(Fe::create( + 0, + 0, + RwLock::new(EncodedFilePath { + shared_context: self.shared_context.clone(), + path_type: PathType::Content, + key: (*digest).into(), + }), + ))) + } else { + batch_iter.next().flatten() + } + }) + .collect() + } + async fn update_file( self: Pin<&Self>, mut entry: Fe, - mut temp_file: fs::FileSlot, + temp_file: fs::FileSlot, final_key: StoreKey<'static>, mut reader: DropCloserReadHalf, ) -> Result<(), Error> { - let mut data_size = 0; - loop { - let mut data = reader - .recv() - .await - .err_tip(|| "Failed to receive data in filesystem store")?; - let data_len = data.len(); - if data_len == 0 { - break; // EOF. - } - temp_file - .write_all_buf(&mut data) - .await - .err_tip(|| "Failed to write data into filesystem store")?; - data_size += data_len as u64; - } + let write_start = std::time::Instant::now(); + let (data_size, temp_file) = fs::write_file_from_channel(temp_file, &mut reader) + .await + .err_tip(|| "Failed to write data into filesystem store")?; + let write_ms = write_start.elapsed().as_millis(); let _permit = if let Some(sem) = &self.write_semaphore { Some( @@ -768,20 +846,28 @@ impl FilesystemStore { None }; - temp_file - .as_ref() - .sync_all() - .await - .err_tip(|| "Failed to sync_data in filesystem store")?; - drop(_permit); - temp_file.advise_dontneed(); trace!(?temp_file, "Dropping file to update_file"); drop(temp_file); *entry.data_size_mut() = data_size; - self.emplace_file(final_key, Arc::new(entry)).await + let emplace_start = std::time::Instant::now(); + let result = self.emplace_file(final_key.borrow().into_owned(), Arc::new(entry)).await; + let emplace_ms = emplace_start.elapsed().as_millis(); + + let total_ms = write_ms + emplace_ms; + if total_ms > 50 { + debug!( + key = %final_key.as_str(), + total_ms, + write_ms, + emplace_ms, + data_size, + "FilesystemStore::update_file: slow phases", + ); + } + result } async fn emplace_file(&self, key: StoreKey<'static>, entry: Arc) -> Result<(), Error> { @@ -817,7 +903,7 @@ impl FilesystemStore { let mut encoded_file_path = entry.get_encoded_file_path().write().await; // Then check it's still in there... if evicting_map.get(&key).await.is_none() { - info!(%key, "Got eviction while emplacing, dropping"); + debug!(%key, "Got eviction while emplacing, dropping"); return Ok(()); } @@ -827,23 +913,37 @@ impl FilesystemStore { &key, ); - let from_path = encoded_file_path.get_file_path(); - // Internally tokio spawns fs commands onto a blocking thread anyways. - // Since we are already on a blocking thread, we just need the `fs` wrapper to manage - // an open-file permit (ensure we don't open too many files at once). - let result = (rename_fn)(&from_path, &final_path).err_tip(|| { - format!( - "Failed to rename temp file to final path {}", - final_path.display() - ) - }); + let from_path: OsString = encoded_file_path.get_file_path().into_owned(); + let final_path_owned: OsString = final_path.into_owned(); + // Run rename + set_permissions on a blocking thread to avoid + // stalling the async runtime with syscalls. 
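// Aside, not part of the patch: the spawn_blocking shape used above,
// reduced to std + tokio with illustrative paths. The blocking syscalls
// (rename, chmod) run on tokio's blocking pool so executor threads never
// stall on disk latency.
use std::path::PathBuf;

async fn rename_and_fix_perms(from: PathBuf, to: PathBuf) -> std::io::Result<()> {
    tokio::task::spawn_blocking(move || {
        std::fs::rename(&from, &to)?;
        #[cfg(target_family = "unix")]
        {
            use std::os::unix::fs::PermissionsExt;
            // 0o555: read+execute for everyone, never writable; this
            // matches the immutable-CAS-file assumption in the patch.
            std::fs::set_permissions(&to, std::fs::Permissions::from_mode(0o555))?;
        }
        Ok(())
    })
    .await
    // A JoinError (panic or cancellation) surfaces as an io::Error here.
    .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?
}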
+ let from_clone = from_path.clone(); + let to_clone = final_path_owned.clone(); + let result = tokio::task::spawn_blocking(move || -> Result<(), Error> { + (rename_fn)(&from_clone, &to_clone)?; + // Pre-set CAS file permissions to read+execute (0o555) so that + // hardlinked copies already have correct permissions without + // needing a per-file chmod during input materialization. + #[cfg(target_family = "unix")] + { + use std::os::unix::fs::PermissionsExt; + let perms = std::fs::Permissions::from_mode(0o555); + if let Err(err) = std::fs::set_permissions(&to_clone, perms) { + tracing::warn!(?err, path = ?to_clone, "Failed to set CAS file permissions to 0o555"); + } + } + Ok(()) + }) + .await + .map_err(|e| make_err!(Code::Internal, "Rename task join error: {e:?}")) + .and_then(|r| r.err_tip(|| "Failed to rename temp file to final path")); // In the event our move from temp file to final file fails we need to ensure we remove // the entry from our map. // Remember: At this point it is possible for another thread to have a reference to // `entry`, so we can't delete the file, only drop() should ever delete files. if let Err(err) = result { - error!(?err, ?from_path, ?final_path, "Failed to rename file",); + error!(?err, ?from_path, ?final_path_owned, "Failed to rename file",); // Warning: To prevent deadlock we need to release our lock or during `remove_if()` // it will call `unref()`, which triggers a write-lock on `encoded_file_path`. drop(encoded_file_path); @@ -911,7 +1011,26 @@ impl StoreDriver for FilesystemStore { return Ok(()); } + // CAS dedup: skip write if blob already exists (same digest = same content). + // sizes_for_keys with peek=false promotes the key in the LRU, updating + // its access time so it won't be evicted prematurely. + if self.content_is_immutable { + let owned_key = key.borrow().into_owned(); + let mut exists = [None]; + self.evicting_map + .sizes_for_keys(core::iter::once(&owned_key), &mut exists, false) + .await; + if exists[0].is_some() { + reader + .drain() + .await + .err_tip(|| "Failed to drain reader for existing blob")?; + return Ok(()); + } + } + let temp_key = make_temp_key(&key); + let update_total_start = std::time::Instant::now(); // There's a possibility of deadlock here where we take all of the // file semaphores with make_and_open_file and the semaphores for @@ -921,6 +1040,7 @@ impl StoreDriver for FilesystemStore { // reader available to know that the populator is active. reader.peek().await?; + let temp_create_start = std::time::Instant::now(); let (entry, temp_file, temp_full_path) = Fe::make_and_open_file( self.block_size, EncodedFilePath { @@ -930,15 +1050,28 @@ impl StoreDriver for FilesystemStore { }, ) .await?; + let temp_create_ms = temp_create_start.elapsed().as_millis(); - self.update_file(entry, temp_file, key.into_owned(), reader) + let result = self.update_file(entry, temp_file, key.borrow().into_owned(), reader) .await .err_tip(|| { format!( "While processing with temp file {}", temp_full_path.display() ) - }) + }); + + let total_ms = update_total_start.elapsed().as_millis(); + if total_ms > 50 { + debug!( + key = %key.as_str(), + total_ms, + temp_create_ms, + write_and_emplace_ms = total_ms.saturating_sub(temp_create_ms), + "FilesystemStore::update: slow write", + ); + } + result } fn optimized_for(&self, optimization: StoreOptimizations) -> bool { @@ -953,7 +1086,21 @@ impl StoreDriver for FilesystemStore { return Ok(()); } + // CAS dedup: skip write if blob already exists (same digest = same content). 
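// Aside, not part of the patch: content-addressed dedup in miniature.
// Because a digest uniquely identifies its bytes, a write for an
// already-present key can be skipped outright; the one subtlety is that
// the hit must still refresh the entry's LRU position, which is why the
// code below passes peek=false to sizes_for_keys.
use std::collections::hash_map::{Entry, HashMap};

/// Returns true if the write was skipped because the digest already existed.
fn put_immutable(cas: &mut HashMap<[u8; 32], Vec<u8>>, digest: [u8; 32], data: Vec<u8>) -> bool {
    match cas.entry(digest) {
        Entry::Occupied(_) => true, // dedup hit: same digest, same content
        Entry::Vacant(v) => {
            v.insert(data);
            false
        }
    }
}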
+        if self.content_is_immutable {
+            let owned_key = key.borrow().into_owned();
+            let mut exists = [None];
+            self.evicting_map
+                .sizes_for_keys(core::iter::once(&owned_key), &mut exists, false)
+                .await;
+            if exists[0].is_some() {
+                return Ok(());
+            }
+        }
+
+        let oneshot_total_start = std::time::Instant::now();
         let temp_key = make_temp_key(&key);
+        let temp_create_start = std::time::Instant::now();
         let (mut entry, mut temp_file, temp_full_path) = Fe::make_and_open_file(
             self.block_size,
             EncodedFilePath {
@@ -964,13 +1111,30 @@
         )
         .await
         .err_tip(|| "Failed to create temp file in filesystem store update_oneshot")?;
+        let temp_create_ms = temp_create_start.elapsed().as_millis();

         // Write directly without channel overhead
+        let data_len = data.len() as u64;
+        let write_ms;
         if !data.is_empty() {
-            temp_file
-                .write_all(&data)
-                .await
-                .err_tip(|| format!("Failed to write data to {}", temp_full_path.display()))?;
+            let write_start = std::time::Instant::now();
+            let temp_full_path_clone = temp_full_path.clone();
+            temp_file = nativelink_util::spawn_blocking!("fs_write_oneshot", move || {
+                use std::io::Write;
+                temp_file
+                    .as_std_mut()
+                    .write_all(&data)
+                    .map_err(|e| Into::<Error>::into(e))
+                    .err_tip(|| {
+                        format!("Failed to write data to {}", temp_full_path_clone.display())
+                    })?;
+                Ok::<_, Error>(temp_file)
+            })
+            .await
+            .map_err(|e| make_err!(Code::Internal, "write oneshot join failed: {e:?}"))??;
+            write_ms = write_start.elapsed().as_millis();
+        } else {
+            write_ms = 0;
         }

         let _permit = if let Some(sem) = &self.write_semaphore {
@@ -983,19 +1147,28 @@
             None
         };

-        temp_file
-            .as_ref()
-            .sync_all()
-            .await
-            .err_tip(|| "Failed to sync_data in filesystem store update_oneshot")?;
-        drop(_permit);
-        temp_file.advise_dontneed();
         drop(temp_file);

-        *entry.data_size_mut() = data.len() as u64;
-        self.emplace_file(key.into_owned(), Arc::new(entry)).await
+        *entry.data_size_mut() = data_len;
+        let emplace_start = std::time::Instant::now();
+        let result = self.emplace_file(key.borrow().into_owned(), Arc::new(entry)).await;
+        let emplace_ms = emplace_start.elapsed().as_millis();
+
+        let total_ms = oneshot_total_start.elapsed().as_millis();
+        if total_ms > 50 {
+            debug!(
+                key = %key.as_str(),
+                total_ms,
+                temp_create_ms,
+                write_ms,
+                emplace_ms,
+                data_len,
+                "FilesystemStore::update_oneshot: slow write",
+            );
+        }
+        result
     }

     async fn update_with_whole_file(
@@ -1008,9 +1181,8 @@
         let file_size = match upload_size {
             UploadSizeInfo::ExactSize(size) => size,
             UploadSizeInfo::MaxSize(_) => file
-                .as_ref()
+                .as_std()
                 .metadata()
-                .await
                 .err_tip(|| format!("While reading metadata for {}", path.display()))?
                 .len(),
         };
@@ -1030,7 +1202,6 @@
         // We are done with the file, if we hold a reference to the file here, it could
         // result in a deadlock if `emplace_file()` also needs file descriptors.
         trace!(?file, "Dropping file to update_with_whole_file");
-        file.advise_dontneed();
         drop(file);
         self.emplace_file(key.into_owned(), Arc::new(entry))
             .await
@@ -1063,34 +1234,31 @@
             )
         })?;
         let read_limit = length.unwrap_or(u64::MAX);
-        let mut temp_file = entry.read_file_part(offset, read_limit).or_else(|err| async move {
+        let temp_file = entry.read_file_part(offset).or_else(|err| async move {
             // If the file is not found, we need to remove it from the eviction map.
if err.code == Code::NotFound { - error!( + warn!( ?err, key = ?owned_key, - "Entry was in our map, but not found on disk. Removing from map as a precaution, but process probably need restarted." + "Stale filesystem cache entry: file not found on disk. \ + Removed from map; upper store layer will re-fetch from remote." ); self.evicting_map.remove(&owned_key).await; } Err(err) }).await?; - loop { - let mut buf = BytesMut::with_capacity(self.read_buffer_size); - temp_file - .read_buf(&mut buf) - .await - .err_tip(|| "Failed to read data in filesystem store")?; - if buf.is_empty() { - break; // EOF. - } - writer - .send(buf.freeze()) - .await - .err_tip(|| "Failed to send chunk in filesystem store get_part")?; - } - temp_file.get_ref().advise_dontneed(); + // Hint to the kernel that we'll read sequentially — enables more + // aggressive readahead (typically 2-4x the default 128 KiB). + temp_file.advise_sequential(); + + // NOTE: We intentionally do NOT call advise_dontneed() after reading. + // The same blobs are frequently read by multiple workers within + // seconds of each other — keeping them in page cache avoids + // redundant disk I/O (measured: 76% of read I/O is re-reads). + fs::read_file_to_channel(temp_file, writer, read_limit, self.read_buffer_size) + .await + .err_tip(|| "Failed to read data in filesystem store")?; writer .send_eof() .err_tip(|| "Filed to send EOF in filesystem store get_part")?; @@ -1114,12 +1282,12 @@ impl StoreDriver for FilesystemStore { registry.register_indicator(self); } - fn register_remove_callback( + fn register_item_callback( self: Arc, - callback: Arc, + callback: Arc, ) -> Result<(), Error> { self.evicting_map - .add_remove_callback(RemoveItemCallbackHolder::new(callback)); + .add_item_callback(ItemCallbackHolder::new(callback)); Ok(()) } } diff --git a/nativelink-store/src/gcs_store.rs b/nativelink-store/src/gcs_store.rs index 4334bbdd2..dcf281d36 100644 --- a/nativelink-store/src/gcs_store.rs +++ b/nativelink-store/src/gcs_store.rs @@ -29,7 +29,7 @@ use nativelink_util::health_utils::{HealthRegistryBuilder, HealthStatus, HealthS use nativelink_util::instant_wrapper::InstantWrapper; use nativelink_util::retry::{Retrier, RetryResult}; use nativelink_util::store_trait::{ - RemoveItemCallback, StoreDriver, StoreKey, StoreOptimizations, UploadSizeInfo, + ItemCallback, StoreDriver, StoreKey, StoreOptimizations, UploadSizeInfo, }; use rand::Rng; use tokio::time::sleep; @@ -465,9 +465,9 @@ where registry.register_indicator(self); } - fn register_remove_callback( + fn register_item_callback( self: Arc, - _callback: Arc, + _callback: Arc, ) -> Result<(), Error> { // As we're backed by GCS, this store doesn't actually drop stuff // so we can actually just ignore this diff --git a/nativelink-store/src/grpc_store.rs b/nativelink-store/src/grpc_store.rs index 0d399284f..f7d4f3439 100644 --- a/nativelink-store/src/grpc_store.rs +++ b/nativelink-store/src/grpc_store.rs @@ -15,10 +15,11 @@ use core::pin::Pin; use core::time::Duration; use std::borrow::Cow; -use std::sync::Arc; +use std::collections::HashMap; +use std::sync::{Arc, Weak}; use async_trait::async_trait; -use bytes::BytesMut; +use bytes::{Bytes, BytesMut}; use futures::stream::{FuturesUnordered, unfold}; use futures::{Future, Stream, StreamExt, TryFutureExt, TryStreamExt, future}; use nativelink_config::stores::GrpcSpec; @@ -30,13 +31,14 @@ use nativelink_proto::build::bazel::remote::execution::v2::{ ActionResult, BatchReadBlobsRequest, BatchReadBlobsResponse, BatchUpdateBlobsRequest, 
BatchUpdateBlobsResponse, FindMissingBlobsRequest, FindMissingBlobsResponse,
     GetActionResultRequest, GetTreeRequest, GetTreeResponse, UpdateActionResultRequest,
+    batch_update_blobs_request, compressor,
 };
 use nativelink_proto::google::bytestream::byte_stream_client::ByteStreamClient;
 use nativelink_proto::google::bytestream::{
     QueryWriteStatusRequest, QueryWriteStatusResponse, ReadRequest, ReadResponse, WriteRequest,
     WriteResponse,
 };
-use nativelink_util::buf_channel::{DropCloserReadHalf, DropCloserWriteHalf};
+use nativelink_util::buf_channel::{DropCloserReadHalf, DropCloserWriteHalf, make_buf_channel_pair};
 use nativelink_util::common::DigestInfo;
 use nativelink_util::connection_manager::ConnectionManager;
 use nativelink_util::digest_hasher::{DigestHasherFunc, default_digest_hasher_func};
@@ -46,19 +48,33 @@ use nativelink_util::proto_stream_utils::{
 };
 use nativelink_util::resource_info::ResourceInfo;
 use nativelink_util::retry::{Retrier, RetryResult};
-use nativelink_util::store_trait::{RemoveItemCallback, StoreDriver, StoreKey, UploadSizeInfo};
+use nativelink_util::store_trait::{
+    IS_WORKER_REQUEST, ItemCallback, StoreDriver, StoreKey, StoreOptimizations, UploadSizeInfo,
+};
 use nativelink_util::{default_health_status_indicator, tls_utils};
 use opentelemetry::context::Context;
 use parking_lot::Mutex;
 use prost::Message;
 use tokio::time::sleep;
 use tonic::{Code, IntoRequest, Request, Response, Status, Streaming};
-use tracing::{error, trace, warn};
+use tracing::{error, info, trace, warn};
 use uuid::Uuid;

 // This store is usually a pass-through store, but can also be used as a CAS store. Using it as an
 // AC store has one major side-effect... The has() function may not give the proper size of the
 // underlying data. This might cause issues if embedded in certain stores.

+/// Maximum gRPC message decoding size. Must be larger than the biggest
+/// possible response (e.g. batch_read_blobs, get_tree, or a single
+/// ByteStream ReadResponse chunk). 256 MiB is generous while still
+/// providing an OOM safety net.
+const MAX_GRPC_DECODING_SIZE: usize = 256 * 1024 * 1024;
+
+struct PendingBatchEntry {
+    digest: DigestInfo,
+    data: Bytes,
+    result_tx: tokio::sync::oneshot::Sender<Result<(), Error>>,
+}
+
 #[derive(Debug, MetricsComponent)]
 pub struct GrpcStore {
     #[metric(help = "Instance name for the store")]
@@ -68,6 +84,12 @@ pub struct GrpcStore {
     connection_manager: ConnectionManager,
     /// Per-RPC timeout. Duration::ZERO means disabled.
     rpc_timeout: Duration,
+    /// Blobs at or below this size use BatchUpdateBlobs instead of
+    /// ByteStream.Write. 0 means disabled.
+    batch_update_threshold: u64,
+    /// Sender for coalescing batch entries. None when coalescing is
+    /// disabled (delay_ms == 0 or threshold == 0).
+    batch_tx: Option<tokio::sync::mpsc::UnboundedSender<PendingBatchEntry>>,
 }

 impl GrpcStore {
@@ -96,7 +118,18 @@
             Duration::from_secs(120)
         };

-        Ok(Arc::new(Self {
+        let batch_update_threshold = spec.batch_update_threshold_bytes;
+        let coalesce_delay_ms = spec.batch_coalesce_delay_ms;
+
+        let (batch_tx, batch_rx) =
+            if batch_update_threshold > 0 && coalesce_delay_ms > 0 {
+                let (tx, rx) = tokio::sync::mpsc::unbounded_channel();
+                (Some(tx), Some(rx))
+            } else {
+                (None, None)
+            };
+
+        let store = Arc::new(Self {
             instance_name: spec.instance_name.clone(),
             store_type: spec.store_type,
             retrier: Retrier::new(
@@ -112,7 +145,183 @@
                 jitter_fn,
             ),
             rpc_timeout,
-        }))
+            batch_update_threshold,
+            batch_tx,
+        });
+
+        if let Some(rx) = batch_rx {
+            let weak = Arc::downgrade(&store);
+            let delay = Duration::from_millis(coalesce_delay_ms);
+            tokio::spawn(Self::batch_flush_loop(weak, rx, delay));
+            info!(
+                batch_update_threshold,
+                coalesce_delay_ms,
+                "GrpcStore: BatchUpdateBlobs coalescing enabled",
+            );
+        } else if batch_update_threshold > 0 {
+            info!(
+                batch_update_threshold,
+                "GrpcStore: BatchUpdateBlobs enabled (no coalescing)",
+            );
+        }
+
+        Ok(store)
+    }
+
+    /// Maximum total payload size for a single BatchUpdateBlobs RPC.
+    /// The RE API spec recommends servers support at least 4 MiB.
+    const MAX_BATCH_TOTAL_SIZE: usize = 4 * 1024 * 1024;
+
+    /// Send one or more blobs via a single BatchUpdateBlobs RPC.
+    /// Returns per-entry results keyed by digest. The RE API does not
+    /// guarantee response ordering, so we match by digest, not index.
+    async fn do_batch_update(
+        &self,
+        digests: &[DigestInfo],
+        entries: Vec<(DigestInfo, Bytes)>,
+    ) -> HashMap<DigestInfo, Result<(), Error>> {
+        let digest_function = Context::current()
+            .get::<DigestHasherFunc>()
+            .map_or_else(default_digest_hasher_func, |v| *v)
+            .proto_digest_func()
+            .into();
+
+        // Deduplicate entries by digest — multiple callers may submit the
+        // same blob in the same batch (e.g., identical stdout/stderr).
+        let deduped: HashMap<DigestInfo, Bytes> = entries.into_iter().collect();
+        let requests: Vec<_> = deduped
+            .into_iter()
+            .map(|(digest, data)| batch_update_blobs_request::Request {
+                digest: Some(digest.into()),
+                data,
+                compressor: compressor::Value::Identity.into(),
+            })
+            .collect();
+
+        let response = match self
+            .batch_update_blobs(Request::new(BatchUpdateBlobsRequest {
+                instance_name: String::new(), // Overwritten by batch_update_blobs()
+                requests,
+                digest_function,
+            }))
+            .await
+        {
+            Ok(resp) => resp,
+            Err(e) => {
+                let err = e.append("In GrpcStore::do_batch_update");
+                return digests
+                    .iter()
+                    .map(|d| (*d, Err(err.clone())))
+                    .collect();
+            }
+        };
+
+        // Build result map keyed by digest (RE API does not guarantee ordering).
+        let mut results: HashMap<DigestInfo, Result<(), Error>> = response
+            .into_inner()
+            .responses
+            .into_iter()
+            .filter_map(|resp| {
+                let digest = DigestInfo::try_from(resp.digest?).ok()?;
+                let result = match &resp.status {
+                    Some(status) if status.code != 0 => Err(make_input_err!(
+                        "BatchUpdateBlobs failed: code={}, message={}",
+                        status.code,
+                        status.message
+                    )),
+                    _ => Ok(()),
+                };
+                Some((digest, result))
+            })
+            .collect();
+
+        // Fill in missing responses as errors.
+        for d in digests {
+            results
+                .entry(*d)
+                .or_insert_with(|| Err(make_input_err!("BatchUpdateBlobs: no response for digest")));
+        }
+        results
+    }
+
+    /// Background task that accumulates small blob uploads and flushes
+    /// them as batched RPCs.
+    async fn batch_flush_loop(
+        weak: Weak<GrpcStore>,
+        mut rx: tokio::sync::mpsc::UnboundedReceiver<PendingBatchEntry>,
+        delay: Duration,
+    ) {
+        // An entry that didn't fit in the previous batch, carried forward.
+        let mut held_entry: Option<PendingBatchEntry> = None;
+
+        loop {
+            // Use held entry from previous iteration, or wait for a new one.
+            let first = if let Some(entry) = held_entry.take() {
+                entry
+            } else {
+                match rx.recv().await {
+                    Some(entry) => entry,
+                    None => return, // Channel closed
+                }
+            };
+
+            let mut batch = vec![first];
+            let mut total_size = batch[0].data.len();
+
+            // Collect more entries within the delay window, up to size limit.
+            let deadline = tokio::time::Instant::now() + delay;
+            loop {
+                let remaining =
+                    deadline.saturating_duration_since(tokio::time::Instant::now());
+                if remaining.is_zero() {
+                    break;
+                }
+                match tokio::time::timeout(remaining, rx.recv()).await {
+                    Ok(Some(entry)) => {
+                        let new_total = total_size + entry.data.len();
+                        if new_total > Self::MAX_BATCH_TOTAL_SIZE && !batch.is_empty()
+                        {
+                            // Would exceed limit — hold for next batch.
+                            held_entry = Some(entry);
+                            break;
+                        }
+                        total_size = new_total;
+                        batch.push(entry);
+                    }
+                    _ => break, // Timeout or channel closed
+                }
+            }
+
+            let store = match weak.upgrade() {
+                Some(s) => s,
+                None => return, // GrpcStore dropped
+            };
+
+            let num = batch.len();
+            trace!(
+                count = num,
+                total_size,
+                "GrpcStore: flushing coalesced batch",
+            );
+
+            let digests: Vec<_> = batch.iter().map(|e| e.digest).collect();
+            let (senders_with_digests, entries): (Vec<_>, Vec<_>) = batch
+                .into_iter()
+                .map(|e| ((e.digest, e.result_tx), (e.digest, e.data)))
+                .unzip();
+
+            let results = store.do_batch_update(&digests, entries).await;
+
+            for (digest, sender) in senders_with_digests {
+                // Use .get().cloned() instead of .remove() because multiple
+                // senders may reference the same digest (e.g., stdout and stderr
+                // with identical content in the same batch).
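// Aside, not part of the patch: why the lookup below clones instead of
// removing. Two queued entries can share one digest (identical bytes), so
// the result map holds one entry while two senders wait on it; a .remove()
// on the first hit would leave the second sender without a result.
use std::collections::HashMap;
use std::sync::mpsc::Sender;

fn distribute_results(
    senders: Vec<([u8; 32], Sender<Result<(), String>>)>,
    results: &HashMap<[u8; 32], Result<(), String>>,
) {
    for (digest, sender) in senders {
        // get().cloned() tolerates the same digest being looked up twice.
        let result = results
            .get(&digest)
            .cloned()
            .unwrap_or_else(|| Err("missing result for digest".to_string()));
        let _ = sender.send(result);
    }
}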
+ let result = results.get(&digest).cloned().unwrap_or_else(|| { + Err(make_input_err!("BatchUpdateBlobs: missing result for {digest:?}")) + }); + drop(sender.send(result)); + } + } } async fn perform_request(&self, input: I, mut request: F) -> Result @@ -153,6 +362,7 @@ impl GrpcStore { .await .err_tip(|| "in find_missing_blobs")?; ContentAddressableStorageClient::new(channel) + .max_decoding_message_size(MAX_GRPC_DECODING_SIZE) .find_missing_blobs(Request::new(request)) .await .err_tip(|| "in GrpcStore::find_missing_blobs") @@ -178,6 +388,7 @@ impl GrpcStore { .await .err_tip(|| "in batch_update_blobs")?; ContentAddressableStorageClient::new(channel) + .max_decoding_message_size(MAX_GRPC_DECODING_SIZE) .batch_update_blobs(Request::new(request)) .await .err_tip(|| "in GrpcStore::batch_update_blobs") @@ -196,14 +407,23 @@ impl GrpcStore { let mut request = grpc_request.into_inner(); request.instance_name.clone_from(&self.instance_name); + let is_worker = IS_WORKER_REQUEST.try_with(|v| *v).unwrap_or(false); self.perform_request(request, |request| async move { let channel = self .connection_manager .connection() .await .err_tip(|| "in batch_read_blobs")?; + let mut grpc_request = Request::new(request); + if is_worker { + grpc_request.metadata_mut().insert( + "x-nativelink-worker", + tonic::metadata::MetadataValue::from_static("true"), + ); + } ContentAddressableStorageClient::new(channel) - .batch_read_blobs(Request::new(request)) + .max_decoding_message_size(MAX_GRPC_DECODING_SIZE) + .batch_read_blobs(grpc_request) .await .err_tip(|| "in GrpcStore::batch_read_blobs") }) @@ -228,6 +448,7 @@ impl GrpcStore { .await .err_tip(|| "in get_tree")?; ContentAddressableStorageClient::new(channel) + .max_decoding_message_size(MAX_GRPC_DECODING_SIZE) .get_tree(Request::new(request)) .await .err_tip(|| "in GrpcStore::get_tree") @@ -254,8 +475,16 @@ impl GrpcStore { .connection() .await .err_tip(|| "in read_internal")?; + let mut grpc_request = Request::new(request); + if IS_WORKER_REQUEST.try_with(|v| *v).unwrap_or(false) { + grpc_request.metadata_mut().insert( + "x-nativelink-worker", + tonic::metadata::MetadataValue::from_static("true"), + ); + } let mut response = ByteStreamClient::new(channel) - .read(Request::new(request)) + .max_decoding_message_size(MAX_GRPC_DECODING_SIZE) + .read(grpc_request) .await .err_tip(|| "in GrpcStore::read")? 
.into_inner(); @@ -343,6 +572,7 @@ impl GrpcStore { let local_state_for_rpc = local_state.clone(); async move { let res = ByteStreamClient::new(channel) + .max_decoding_message_size(MAX_GRPC_DECODING_SIZE) .write(WriteStateWrapper::new(local_state_for_rpc)) .await .err_tip(|| "in GrpcStore::write"); @@ -452,6 +682,7 @@ impl GrpcStore { .await .err_tip(|| "in query_write_status")?; ByteStreamClient::new(channel) + .max_decoding_message_size(MAX_GRPC_DECODING_SIZE) .query_write_status(Request::new(request)) .await .err_tip(|| "in GrpcStore::query_write_status") @@ -472,6 +703,7 @@ impl GrpcStore { .await .err_tip(|| "in get_action_result")?; ActionCacheClient::new(channel) + .max_decoding_message_size(MAX_GRPC_DECODING_SIZE) .get_action_result(Request::new(request)) .await .err_tip(|| "in GrpcStore::get_action_result") @@ -492,6 +724,7 @@ impl GrpcStore { .await .err_tip(|| "in update_action_result")?; ActionCacheClient::new(channel) + .max_decoding_message_size(MAX_GRPC_DECODING_SIZE) .update_action_result(Request::new(request)) .await .err_tip(|| "in GrpcStore::update_action_result") @@ -736,6 +969,74 @@ impl StoreDriver for GrpcStore { Ok(()) } + async fn update_oneshot( + self: Pin<&Self>, + key: StoreKey<'_>, + data: Bytes, + ) -> Result<(), Error> { + // Route small CAS blobs through BatchUpdateBlobs. + if !matches!(self.store_type, nativelink_config::stores::StoreType::Ac) + && self.batch_update_threshold > 0 + && (data.len() as u64) <= self.batch_update_threshold + { + let digest = key.into_digest(); + + if let Some(tx) = &self.batch_tx { + // Approach B: coalescing — queue for the background flush loop. + let (result_tx, result_rx) = tokio::sync::oneshot::channel(); + tx.send(PendingBatchEntry { + digest, + data, + result_tx, + }) + .map_err(|_| make_input_err!("Batch coalescer channel closed"))?; + return result_rx + .await + .map_err(|_| make_input_err!("Batch coalescer dropped"))?; + } + + // Approach A: immediate single-element BatchUpdateBlobs. + let digests = [digest]; + let mut results = + self.do_batch_update(&digests, vec![(digest, data)]).await; + return results.remove(&digest).unwrap_or_else(|| { + Err(make_input_err!("BatchUpdateBlobs: no response for digest")) + }); + } + + // Fallback: standard ByteStream.Write via channel pair. + let (mut tx, rx) = make_buf_channel_pair(); + let data_len = + u64::try_from(data.len()).err_tip(|| "Could not convert data.len() to u64")?; + let send_fut = async move { + if !data.is_empty() { + tx.send(data) + .await + .err_tip(|| "Failed to write data in update_oneshot")?; + } + tx.send_eof() + .err_tip(|| "Failed to write EOF in update_oneshot")?; + Ok(()) + }; + future::try_join( + send_fut, + self.update(key, rx, UploadSizeInfo::ExactSize(data_len)), + ) + .await?; + Ok(()) + } + + fn optimized_for(&self, optimization: StoreOptimizations) -> bool { + if optimization == StoreOptimizations::LazyExistenceOnSync + && !matches!(self.store_type, nativelink_config::stores::StoreType::Ac) + { + return true; + } + optimization == StoreOptimizations::SubscribesToUpdateOneshot + && self.batch_update_threshold > 0 + && !matches!(self.store_type, nativelink_config::stores::StoreType::Ac) + } + async fn get_part( self: Pin<&Self>, key: StoreKey<'_>, @@ -809,7 +1110,7 @@ impl StoreDriver for GrpcStore { loop { let data = match stream.next().await { // Create an empty response to represent EOF. 
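// Aside, not part of the patch: the small-blob routing decision from
// update_oneshot above as a pure function; the threshold and the AC/CAS
// distinction are its only inputs.
enum UploadRoute {
    BatchUpdateBlobs, // one (possibly coalesced) unary RPC
    ByteStreamWrite,  // streaming upload
}

fn pick_route(is_ac_store: bool, batch_threshold: u64, blob_len: u64) -> UploadRoute {
    // AC entries never batch (BatchUpdateBlobs is a CAS API), and a
    // threshold of 0 disables batching entirely.
    if !is_ac_store && batch_threshold > 0 && blob_len <= batch_threshold {
        UploadRoute::BatchUpdateBlobs
    } else {
        UploadRoute::ByteStreamWrite
    }
}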
- None => bytes::Bytes::new(), + None => Bytes::new(), Some(Ok(message)) => message.data, Some(Err(status)) => { return Some(( @@ -858,9 +1159,9 @@ impl StoreDriver for GrpcStore { self } - fn register_remove_callback( + fn register_item_callback( self: Arc, - _callback: Arc, + _callback: Arc, ) -> Result<(), Error> { Err(Error::new( Code::Internal, diff --git a/nativelink-store/src/lib.rs b/nativelink-store/src/lib.rs index 72b7f46d6..21d531a6f 100644 --- a/nativelink-store/src/lib.rs +++ b/nativelink-store/src/lib.rs @@ -39,3 +39,4 @@ pub mod shard_store; pub mod size_partitioning_store; pub mod store_manager; pub mod verify_store; +pub mod worker_proxy_store; diff --git a/nativelink-store/src/memory_store.rs b/nativelink-store/src/memory_store.rs index 22391596f..fb5f30725 100644 --- a/nativelink-store/src/memory_store.rs +++ b/nativelink-store/src/memory_store.rs @@ -31,10 +31,10 @@ use nativelink_util::health_utils::{ HealthRegistryBuilder, HealthStatusIndicator, default_health_status_indicator, }; use nativelink_util::store_trait::{ - RemoveItemCallback, StoreDriver, StoreKey, StoreKeyBorrow, StoreOptimizations, UploadSizeInfo, + ItemCallback, StoreDriver, StoreKey, StoreKeyBorrow, StoreOptimizations, UploadSizeInfo, }; -use crate::callback_utils::RemoveItemCallbackHolder; +use crate::callback_utils::ItemCallbackHolder; use crate::cas_utils::is_zero_digest; #[derive(Clone)] @@ -66,7 +66,7 @@ pub struct MemoryStore { StoreKey<'static>, BytesWrapper, SystemTime, - RemoveItemCallbackHolder, + ItemCallbackHolder, >, } @@ -81,8 +81,8 @@ impl MemoryStore { /// Returns the number of key-value pairs that are currently in the the cache. /// Function is not for production code paths. - pub fn len_for_test(&self) -> usize { - self.evicting_map.len_for_test() + pub async fn len_for_test(&self) -> usize { + self.evicting_map.len_for_test().await } pub async fn remove_entry(&self, key: StoreKey<'_>) -> bool { @@ -126,7 +126,8 @@ impl StoreDriver for MemoryStore { ); let iterations = self .evicting_map - .range(range, move |key, _value| handler(key.borrow())); + .range(range, move |key, _value| handler(key.borrow())) + .await; Ok(iterations) } @@ -136,17 +137,12 @@ impl StoreDriver for MemoryStore { mut reader: DropCloserReadHalf, _size_info: UploadSizeInfo, ) -> Result<(), Error> { - // Internally Bytes might hold a reference to more data than just our data. To prevent - // this potential case, we make a full copy of our data for long-term storage. - let final_buffer = { - let buffer = reader - .consume(None) - .await - .err_tip(|| "Failed to collect all bytes from reader in memory_store::update")?; - let mut new_buffer = BytesMut::with_capacity(buffer.len()); - new_buffer.extend_from_slice(&buffer[..]); - new_buffer.freeze() - }; + // consume() returns a standalone Bytes from a frozen BytesMut inside + // buf_channel — no shared parent buffer, so no need to copy. 
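// Aside, not part of the patch: the Bytes ownership point being made
// here. Slicing a Bytes shares (and keeps alive) the entire parent
// allocation, while freezing a BytesMut you filled yourself yields a
// buffer that owns exactly its own bytes, which is why consume()'s output
// needs no defensive copy.
use bytes::{Bytes, BytesMut};

fn shared_vs_standalone() {
    let parent = Bytes::from(vec![0u8; 1024 * 1024]);
    // Zero-copy slice: 4 bytes visible, ~1 MiB pinned until `small` drops.
    let small = parent.slice(0..4);
    assert_eq!(small.len(), 4);

    // A frozen BytesMut owns only what was written into it.
    let mut own = BytesMut::with_capacity(4);
    own.extend_from_slice(&[1, 2, 3, 4]);
    let standalone: Bytes = own.freeze();
    assert_eq!(standalone.len(), 4);
}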
+ let final_buffer = reader + .consume(None) + .await + .err_tip(|| "Failed to collect all bytes from reader in memory_store::update")?; self.evicting_map .insert(key.into_owned().into(), BytesWrapper(final_buffer)) @@ -232,12 +228,12 @@ impl StoreDriver for MemoryStore { registry.register_indicator(self); } - fn register_remove_callback( + fn register_item_callback( self: Arc, - callback: Arc, + callback: Arc, ) -> Result<(), Error> { self.evicting_map - .add_remove_callback(RemoveItemCallbackHolder::new(callback)); + .add_item_callback(ItemCallbackHolder::new(callback)); Ok(()) } } diff --git a/nativelink-store/src/mongo_store.rs b/nativelink-store/src/mongo_store.rs index b85e1ec3b..76ea69e7e 100644 --- a/nativelink-store/src/mongo_store.rs +++ b/nativelink-store/src/mongo_store.rs @@ -32,7 +32,7 @@ use nativelink_util::buf_channel::{DropCloserReadHalf, DropCloserWriteHalf}; use nativelink_util::health_utils::{HealthRegistryBuilder, HealthStatus, HealthStatusIndicator}; use nativelink_util::spawn; use nativelink_util::store_trait::{ - BoolValue, RemoveItemCallback, SchedulerCurrentVersionProvider, SchedulerIndexProvider, + BoolValue, ItemCallback, SchedulerCurrentVersionProvider, SchedulerIndexProvider, SchedulerStore, SchedulerStoreDataProvider, SchedulerStoreDecodeTo, SchedulerStoreKeyProvider, SchedulerSubscription, SchedulerSubscriptionManager, StoreDriver, StoreKey, UploadSizeInfo, }; @@ -577,9 +577,9 @@ impl StoreDriver for ExperimentalMongoStore { registry.register_indicator(self); } - fn register_remove_callback( + fn register_item_callback( self: Arc, - _callback: Arc, + _callback: Arc, ) -> Result<(), Error> { // drop because we don't remove anything from Mongo Ok(()) diff --git a/nativelink-store/src/noop_store.rs b/nativelink-store/src/noop_store.rs index 9c749750b..c283eee52 100644 --- a/nativelink-store/src/noop_store.rs +++ b/nativelink-store/src/noop_store.rs @@ -23,7 +23,7 @@ use nativelink_metric::{ use nativelink_util::buf_channel::{DropCloserReadHalf, DropCloserWriteHalf}; use nativelink_util::health_utils::{HealthStatusIndicator, default_health_status_indicator}; use nativelink_util::store_trait::{ - RemoveItemCallback, StoreDriver, StoreKey, StoreOptimizations, UploadSizeInfo, + ItemCallback, StoreDriver, StoreKey, StoreOptimizations, UploadSizeInfo, }; #[derive(Debug, Default, Clone, Copy)] @@ -97,9 +97,9 @@ impl StoreDriver for NoopStore { self } - fn register_remove_callback( + fn register_item_callback( self: Arc, - _callback: Arc, + _callback: Arc, ) -> Result<(), Error> { // does nothing, so drop Ok(()) diff --git a/nativelink-store/src/ontap_s3_existence_cache_store.rs b/nativelink-store/src/ontap_s3_existence_cache_store.rs index a78d2d35a..59c88ad65 100644 --- a/nativelink-store/src/ontap_s3_existence_cache_store.rs +++ b/nativelink-store/src/ontap_s3_existence_cache_store.rs @@ -36,7 +36,7 @@ use nativelink_util::instant_wrapper::InstantWrapper; use nativelink_util::metrics_utils::CounterWithTime; use nativelink_util::spawn; use nativelink_util::store_trait::{ - RemoveItemCallback, Store, StoreDriver, StoreKey, StoreLike, UploadSizeInfo, + ItemCallback, Store, StoreDriver, StoreKey, StoreLike, UploadSizeInfo, }; use serde::{Deserialize, Serialize}; use tokio::fs; @@ -97,7 +97,7 @@ where } } -impl RemoveItemCallback for OntapS3CacheCallback +impl ItemCallback for OntapS3CacheCallback where I: InstantWrapper, NowFn: Fn() -> I + Send + Sync + Unpin + Clone + 'static, @@ -368,7 +368,7 @@ where let other_ref = Arc::downgrade(&cache); cache .inner_store - 
.register_remove_callback(Arc::new(OntapS3CacheCallback { cache: other_ref }))?; + .register_item_callback(Arc::new(OntapS3CacheCallback { cache: other_ref }))?; // Try to load existing cache file if let Ok(contents) = fs::read_to_string(&spec.index_path).await { @@ -429,7 +429,7 @@ async fn create_s3_client(spec: &ExperimentalOntapS3Spec) -> Result, - callback: Arc, + callback: Arc, ) -> Result<(), Error> { - self.inner_store.register_remove_callback(callback) + self.inner_store.register_item_callback(callback) } } -impl RemoveItemCallback for OntapS3ExistenceCache +impl ItemCallback for OntapS3ExistenceCache where I: InstantWrapper, NowFn: Fn() -> I + Send + Sync + Unpin + Clone + 'static, diff --git a/nativelink-store/src/ontap_s3_store.rs b/nativelink-store/src/ontap_s3_store.rs index ecec6bd55..e39769bf9 100644 --- a/nativelink-store/src/ontap_s3_store.rs +++ b/nativelink-store/src/ontap_s3_store.rs @@ -47,7 +47,7 @@ use nativelink_util::buf_channel::{ use nativelink_util::health_utils::{HealthStatus, HealthStatusIndicator}; use nativelink_util::instant_wrapper::InstantWrapper; use nativelink_util::retry::{Retrier, RetryResult}; -use nativelink_util::store_trait::{RemoveItemCallback, StoreDriver, StoreKey, UploadSizeInfo}; +use nativelink_util::store_trait::{ItemCallback, StoreDriver, StoreKey, UploadSizeInfo}; use parking_lot::Mutex; use rustls::{ClientConfig, RootCertStore}; use rustls_pki_types::CertificateDer; @@ -74,7 +74,7 @@ const DEFAULT_MAX_RETRY_BUFFER_PER_REQUEST: usize = 20 * 1024 * 1024; // 20MB // Default limit for concurrent part uploads per multipart upload const DEFAULT_MULTIPART_MAX_CONCURRENT_UPLOADS: usize = 10; -type RemoveCallback = Arc; +type ItemCb = Arc; #[derive(Debug, MetricsComponent)] pub struct OntapS3Store { @@ -92,7 +92,7 @@ pub struct OntapS3Store { #[metric(help = "The number of concurrent uploads allowed for multipart uploads")] multipart_max_concurrent_uploads: usize, - remove_callbacks: Mutex>, + item_callbacks: Mutex>, } pub fn load_custom_certs(cert_path: &str) -> Result, Error> { @@ -167,7 +167,7 @@ where .app_name(aws_config::AppName::new("nativelink").expect("valid app name")) .http_client(http_client) .force_path_style(true) - .behavior_version(BehaviorVersion::v2025_08_07()) + .behavior_version(BehaviorVersion::v2026_01_12()) .timeout_config( aws_config::timeout::TimeoutConfig::builder() .connect_timeout(Duration::from_secs(30)) @@ -216,7 +216,7 @@ where .common .multipart_max_concurrent_uploads .unwrap_or(DEFAULT_MULTIPART_MAX_CONCURRENT_UPLOADS), - remove_callbacks: Mutex::new(vec![]), + item_callbacks: Mutex::new(vec![]), })) } @@ -245,8 +245,8 @@ where let now_s = (self.now_fn)().unix_timestamp() as i64; if last_modified.secs() + self.consider_expired_after_s <= now_s { - let remove_callbacks = self.remove_callbacks.lock().clone(); - let mut callbacks: FuturesUnordered<_> = remove_callbacks + let item_callbacks = self.item_callbacks.lock().clone(); + let mut callbacks: FuturesUnordered<_> = item_callbacks .into_iter() .map(|callback| { let store_key = local_digest.borrow(); @@ -767,11 +767,11 @@ where self } - fn register_remove_callback( + fn register_item_callback( self: Arc, - callback: Arc, + callback: Arc, ) -> Result<(), Error> { - self.remove_callbacks.lock().push(callback); + self.item_callbacks.lock().push(callback); Ok(()) } } diff --git a/nativelink-store/src/redis_store.rs b/nativelink-store/src/redis_store.rs index 590605429..a82183a3a 100644 --- a/nativelink-store/src/redis_store.rs +++ b/nativelink-store/src/redis_store.rs 
@@ -36,7 +36,7 @@ use nativelink_util::buf_channel::{DropCloserReadHalf, DropCloserWriteHalf}; use nativelink_util::health_utils::{HealthRegistryBuilder, HealthStatus, HealthStatusIndicator}; use nativelink_util::spawn; use nativelink_util::store_trait::{ - BoolValue, RemoveItemCallback, SchedulerCurrentVersionProvider, SchedulerIndexProvider, + BoolValue, ItemCallback, SchedulerCurrentVersionProvider, SchedulerIndexProvider, SchedulerStore, SchedulerStoreDataProvider, SchedulerStoreDecodeTo, SchedulerStoreKeyProvider, SchedulerSubscription, SchedulerSubscriptionManager, StoreDriver, StoreKey, UploadSizeInfo, }; @@ -855,9 +855,9 @@ impl, - _callback: Arc, + _callback: Arc, ) -> Result<(), Error> { // As redis doesn't drop stuff, we can just ignore this Ok(()) diff --git a/nativelink-store/src/ref_store.rs b/nativelink-store/src/ref_store.rs index d432553f0..2f89380fa 100644 --- a/nativelink-store/src/ref_store.rs +++ b/nativelink-store/src/ref_store.rs @@ -23,7 +23,7 @@ use nativelink_metric::MetricsComponent; use nativelink_util::buf_channel::{DropCloserReadHalf, DropCloserWriteHalf}; use nativelink_util::health_utils::{HealthStatusIndicator, default_health_status_indicator}; use nativelink_util::store_trait::{ - RemoveItemCallback, Store, StoreDriver, StoreKey, StoreLike, UploadSizeInfo, + ItemCallback, Store, StoreDriver, StoreKey, StoreLike, UploadSizeInfo, }; use parking_lot::Mutex; use tracing::error; @@ -48,7 +48,7 @@ pub struct RefStore { name: String, store_manager: Weak, inner: StoreReference, - remove_callbacks: Mutex>>, + item_callbacks: Mutex>>, } impl RefStore { @@ -60,7 +60,7 @@ impl RefStore { mux: Mutex::new(()), cell: AlignedStoreCell(UnsafeCell::new(None)), }, - remove_callbacks: Mutex::new(vec![]), + item_callbacks: Mutex::new(vec![]), }) } @@ -87,9 +87,9 @@ impl RefStore { .upgrade() .err_tip(|| "Store manager is gone")?; if let Some(store) = store_manager.get_store(&self.name) { - let remove_callbacks = self.remove_callbacks.lock().clone(); - for callback in remove_callbacks { - store.register_remove_callback(callback)?; + let item_callbacks = self.item_callbacks.lock().clone(); + for callback in item_callbacks { + store.register_item_callback(callback)?; } unsafe { *ref_store = Some(store); @@ -152,15 +152,15 @@ impl StoreDriver for RefStore { self } - fn register_remove_callback( + fn register_item_callback( self: Arc, - callback: Arc, + callback: Arc, ) -> Result<(), Error> { - self.remove_callbacks.lock().push(callback.clone()); + self.item_callbacks.lock().push(callback.clone()); let ref_store = self.inner.cell.0.get(); unsafe { if let Some(ref store) = *ref_store { - store.register_remove_callback(callback)?; + store.register_item_callback(callback)?; } } Ok(()) diff --git a/nativelink-store/src/s3_store.rs b/nativelink-store/src/s3_store.rs index a175a0b54..0a2f5420d 100644 --- a/nativelink-store/src/s3_store.rs +++ b/nativelink-store/src/s3_store.rs @@ -47,7 +47,7 @@ use nativelink_util::health_utils::{HealthRegistryBuilder, HealthStatus, HealthS use nativelink_util::instant_wrapper::InstantWrapper; use nativelink_util::retry::{Retrier, RetryResult}; use nativelink_util::store_trait::{ - RemoveItemCallback, StoreDriver, StoreKey, StoreOptimizations, UploadSizeInfo, + ItemCallback, StoreDriver, StoreKey, StoreOptimizations, UploadSizeInfo, }; use parking_lot::Mutex; use tokio::sync::mpsc; @@ -93,7 +93,7 @@ pub struct S3Store { #[metric(help = "The number of concurrent uploads allowed for multipart uploads")] multipart_max_concurrent_uploads: usize, - 
remove_callbacks: Mutex>>, + item_callbacks: Mutex>>, } impl S3Store @@ -115,7 +115,7 @@ where .build() .await; - let config = aws_config::defaults(BehaviorVersion::v2025_08_07()) + let config = aws_config::defaults(BehaviorVersion::v2026_01_12()) .credentials_provider(credential_provider) .app_name(AppName::new("nativelink").expect("valid app name")) .timeout_config( @@ -163,7 +163,7 @@ where .common .multipart_max_concurrent_uploads .map_or(DEFAULT_MULTIPART_MAX_CONCURRENT_UPLOADS, |v| v), - remove_callbacks: Mutex::new(Vec::new()), + item_callbacks: Mutex::new(Vec::new()), })) } @@ -192,8 +192,8 @@ where let now_s = (self.now_fn)().unix_timestamp() as i64; if last_modified.secs() + self.consider_expired_after_s <= now_s { - let remove_callbacks = self.remove_callbacks.lock().clone(); - let mut callbacks: FuturesUnordered<_> = remove_callbacks + let item_callbacks = self.item_callbacks.lock().clone(); + let mut callbacks: FuturesUnordered<_> = item_callbacks .iter() .map(|callback| { callback.callback(local_digest.borrow()) @@ -653,11 +653,11 @@ where registry.register_indicator(self); } - fn register_remove_callback( + fn register_item_callback( self: Arc, - callback: Arc, + callback: Arc, ) -> Result<(), Error> { - self.remove_callbacks.lock().push(callback); + self.item_callbacks.lock().push(callback); Ok(()) } } diff --git a/nativelink-store/src/shard_store.rs b/nativelink-store/src/shard_store.rs index e59a05845..1ba722666 100644 --- a/nativelink-store/src/shard_store.rs +++ b/nativelink-store/src/shard_store.rs @@ -12,10 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -use core::hash::Hasher; use core::ops::BitXor; use core::pin::Pin; -use std::hash::DefaultHasher; use std::sync::Arc; use async_trait::async_trait; @@ -26,7 +24,7 @@ use nativelink_metric::MetricsComponent; use nativelink_util::buf_channel::{DropCloserReadHalf, DropCloserWriteHalf}; use nativelink_util::health_utils::{HealthStatusIndicator, default_health_status_indicator}; use nativelink_util::store_trait::{ - RemoveItemCallback, Store, StoreDriver, StoreKey, StoreLike, UploadSizeInfo, + ItemCallback, Store, StoreDriver, StoreKey, StoreLike, UploadSizeInfo, }; #[derive(Debug, MetricsComponent)] @@ -127,10 +125,9 @@ impl ShardStore { .bitxor(u32::from_le_bytes(size_bytes[4..8].try_into().unwrap())) } StoreKey::Str(s) => { - let mut hasher = DefaultHasher::new(); - hasher.write(s.as_bytes()); - let key_u64 = hasher.finish(); - (key_u64 >> 32) as u32 // We only need the top 32 bits. 
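+                // blake3 produces a stable, well-distributed key here: std's
+                // DefaultHasher explicitly reserves the right to change its
+                // algorithm across releases, which would silently reshuffle
+                // shard assignment between binaries.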
+ let hash = blake3::hash(s.as_bytes()); + let hash_bytes = hash.as_bytes(); + u32::from_le_bytes([hash_bytes[0], hash_bytes[1], hash_bytes[2], hash_bytes[3]]) } }; self.weights_and_stores @@ -244,12 +241,12 @@ impl StoreDriver for ShardStore { self } - fn register_remove_callback( + fn register_item_callback( self: Arc, - callback: Arc, + callback: Arc, ) -> Result<(), Error> { for store in &self.weights_and_stores { - store.store.register_remove_callback(callback.clone())?; + store.store.register_item_callback(callback.clone())?; } Ok(()) } diff --git a/nativelink-store/src/size_partitioning_store.rs b/nativelink-store/src/size_partitioning_store.rs index a959244b5..399785b7b 100644 --- a/nativelink-store/src/size_partitioning_store.rs +++ b/nativelink-store/src/size_partitioning_store.rs @@ -22,7 +22,7 @@ use nativelink_metric::MetricsComponent; use nativelink_util::buf_channel::{DropCloserReadHalf, DropCloserWriteHalf}; use nativelink_util::health_utils::{HealthStatusIndicator, default_health_status_indicator}; use nativelink_util::store_trait::{ - RemoveItemCallback, Store, StoreDriver, StoreKey, StoreLike, UploadSizeInfo, + ItemCallback, Store, StoreDriver, StoreKey, StoreLike, UploadSizeInfo, }; use tokio::join; @@ -162,13 +162,13 @@ impl StoreDriver for SizePartitioningStore { self } - fn register_remove_callback( + fn register_item_callback( self: Arc, - callback: Arc, + callback: Arc, ) -> Result<(), Error> { self.lower_store - .register_remove_callback(callback.clone())?; - self.upper_store.register_remove_callback(callback)?; + .register_item_callback(callback.clone())?; + self.upper_store.register_item_callback(callback)?; Ok(()) } } diff --git a/nativelink-store/src/verify_store.rs b/nativelink-store/src/verify_store.rs index 04ba3a02f..bc71df2ae 100644 --- a/nativelink-store/src/verify_store.rs +++ b/nativelink-store/src/verify_store.rs @@ -27,7 +27,7 @@ use nativelink_util::digest_hasher::{DigestHasher, DigestHasherFunc, default_dig use nativelink_util::health_utils::{HealthStatusIndicator, default_health_status_indicator}; use nativelink_util::metrics_utils::CounterWithTime; use nativelink_util::store_trait::{ - RemoveItemCallback, Store, StoreDriver, StoreKey, StoreLike, UploadSizeInfo, + ItemCallback, Store, StoreDriver, StoreKey, StoreLike, UploadSizeInfo, }; use opentelemetry::context::Context; @@ -231,11 +231,11 @@ impl StoreDriver for VerifyStore { self } - fn register_remove_callback( + fn register_item_callback( self: Arc, - callback: Arc, + callback: Arc, ) -> Result<(), Error> { - self.inner_store.register_remove_callback(callback) + self.inner_store.register_item_callback(callback) } } diff --git a/nativelink-store/src/worker_proxy_store.rs b/nativelink-store/src/worker_proxy_store.rs new file mode 100644 index 000000000..38e333d27 --- /dev/null +++ b/nativelink-store/src/worker_proxy_store.rs @@ -0,0 +1,1176 @@ +// Copyright 2024 The NativeLink Authors. All rights reserved. +// +// Licensed under the Functional Source License, Version 1.1, Apache 2.0 Future License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// See LICENSE file for details +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
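+
+//! Worker-to-worker CAS proxying (see `WorkerProxyStore` below).
+//!
+//! A minimal wiring sketch; the call site is hypothetical and lives in
+//! worker/server startup code, not in this module:
+//!
+//! ```ignore
+//! let locality_map = new_shared_blob_locality_map();
+//! let mut proxy = WorkerProxyStore::new(inner_store, locality_map.clone());
+//! // Workers only: race peer fetches against the server.
+//! Arc::get_mut(&mut proxy).unwrap().enable_race_peers();
+//! proxy.add_worker_endpoint("grpc://peer:50071").await;
+//! ```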
+
+use core::pin::Pin;
+use std::borrow::Cow;
+use std::collections::HashMap;
+use std::sync::Arc;
+
+use async_trait::async_trait;
+use nativelink_config::stores::{GrpcEndpoint, GrpcSpec, Retry, StoreType};
+use nativelink_error::{Code, Error, ResultExt, make_err};
+use nativelink_metric::MetricsComponent;
+use nativelink_util::blob_locality_map::SharedBlobLocalityMap;
+use nativelink_util::buf_channel::{
+    DropCloserReadHalf, DropCloserWriteHalf, make_buf_channel_pair,
+};
+use nativelink_util::health_utils::{HealthStatus, HealthStatusIndicator};
+use nativelink_util::store_trait::{
+    IS_WORKER_REQUEST, ItemCallback, REDIRECT_PREFIX, Store, StoreDriver, StoreKey, StoreLike,
+    StoreOptimizations, UploadSizeInfo,
+};
+use parking_lot::RwLock;
+use tokio::task::JoinHandle;
+use tracing::{debug, trace, warn};
+
+use crate::grpc_store::GrpcStore;
+
+/// A store wrapper that transparently proxies CAS reads from workers when
+/// the inner store returns NotFound. This enables worker-to-worker blob sharing.
+///
+/// Behavior:
+/// - `get_part()`: Try the inner store first. If NotFound, consult the locality
+///   map for workers that have the digest and try reading from one of them.
+/// - `has()` / `has_with_results()`: ONLY check the inner store. Never consult
+///   the locality map. (Prevents stale-positive issues with FindMissingBlobs.)
+/// - `update()`: Pass through to the inner store.
+#[derive(MetricsComponent)]
+pub struct WorkerProxyStore {
+    #[metric(group = "inner_store")]
+    inner: Store,
+    /// Blob locality map — digest → worker endpoints.
+    locality_map: SharedBlobLocalityMap,
+    /// Cached GrpcStore connections to worker endpoints.
+    worker_connections: RwLock<HashMap<Arc<str>, Store>>,
+    /// When true, race peer fetches against server fetches in get_part.
+    /// Only workers should enable this — servers should use the sequential
+    /// path, which generates redirects for workers.
+    race_peers: bool,
+}
+
+impl core::fmt::Debug for WorkerProxyStore {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        f.debug_struct("WorkerProxyStore")
+            .field("inner", &self.inner)
+            .field("worker_connections", &self.worker_connections.read().len())
+            .finish()
+    }
+}
+
+/// Returns true if the error code indicates a connection-level failure,
+/// meaning the cached connection should be removed.
+fn is_connection_error(e: &Error) -> bool {
+    matches!(e.code, Code::Unavailable | Code::Unknown)
+}
+
+impl WorkerProxyStore {
+    pub fn new(inner: Store, locality_map: SharedBlobLocalityMap) -> Arc<Self> {
+        Arc::new(Self {
+            inner,
+            locality_map,
+            worker_connections: RwLock::new(HashMap::new()),
+            race_peers: false,
+        })
+    }
+
+    /// Enable racing peer fetches against server fetches.
+    /// Only workers should call this — servers should leave it disabled.
+    pub fn enable_race_peers(&mut self) {
+        self.race_peers = true;
+    }
+
+    /// Add a worker endpoint to the connection pool.
+    pub async fn add_worker_endpoint(&self, endpoint: &str) {
+        if self.get_worker_connection(endpoint).is_some() {
+            return;
+        }
+        self.get_or_create_connection(endpoint).await;
+    }
+
+    /// Returns the inner (server) store.
+    pub fn inner_store(&self) -> &Store {
+        &self.inner
+    }
+
+    /// Returns the locality map for looking up which peers have which digests.
+    pub fn locality_map(&self) -> &SharedBlobLocalityMap {
+        &self.locality_map
+    }
+
+    /// Returns all currently-connected peer stores.
+    pub fn peer_stores(&self) -> HashMap<Arc<str>, Store> {
+        self.worker_connections.read().clone()
+    }
+
+    /// Remove a worker endpoint from the connection pool.
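+    /// The read paths below call this when `is_connection_error()` reports a
+    /// connection-level failure, so a dead cached connection is dropped and
+    /// re-established on the next lookup.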
+    pub fn remove_worker_endpoint(&self, endpoint: &str) {
+        let mut conns = self.worker_connections.write();
+        if conns.remove(endpoint).is_some() {
+            debug!(endpoint, "WorkerProxyStore: removed worker connection");
+        }
+    }
+
+    /// Inject a pre-built Store as a worker connection for the given endpoint.
+    /// This is primarily useful for testing, where you want to use a MemoryStore
+    /// instead of a real GrpcStore.
+    pub fn inject_worker_connection(&self, endpoint: &str, store: Store) {
+        self.worker_connections
+            .write()
+            .insert(Arc::from(endpoint), store);
+    }
+
+    /// Get a cached connection to a worker endpoint, or None.
+    fn get_worker_connection(&self, endpoint: &str) -> Option<Store> {
+        self.worker_connections.read().get(endpoint).cloned()
+    }
+
+    /// Get or create a connection to a worker endpoint.
+    /// Returns None if the connection could not be created.
+    async fn get_or_create_connection(&self, endpoint: &str) -> Option<Store> {
+        if let Some(store) = self.get_worker_connection(endpoint) {
+            return Some(store);
+        }
+        match Self::create_worker_connection(endpoint).await {
+            Ok(store) => {
+                self.worker_connections
+                    .write()
+                    .entry(Arc::from(endpoint))
+                    .or_insert_with(|| store.clone());
+                Some(store)
+            }
+            Err(e) => {
+                trace!(endpoint, ?e, "WorkerProxyStore: failed to connect to peer");
+                None
+            }
+        }
+    }
+
+    /// Create a minimal GrpcStore connection to a worker endpoint.
+    async fn create_worker_connection(endpoint: &str) -> Result<Store, Error> {
+        let spec = GrpcSpec {
+            instance_name: String::new(),
+            endpoints: vec![GrpcEndpoint {
+                address: endpoint.to_string(),
+                tls_config: None,
+                concurrency_limit: None,
+                connect_timeout_s: 5,
+                tcp_keepalive_s: 30,
+                http2_keepalive_interval_s: 30,
+                http2_keepalive_timeout_s: 20,
+                tcp_nodelay: true,
+            }],
+            store_type: StoreType::Cas,
+            retry: Retry::default(),
+            max_concurrent_requests: 0,
+            connections_per_endpoint: 64,
+            rpc_timeout_s: 120,
+            batch_update_threshold_bytes: 0, // Not uploading via this store.
+            batch_coalesce_delay_ms: 0,
+        };
+        let store = GrpcStore::new(&spec)
+            .await
+            .err_tip(|| format!("Creating worker proxy connection to {endpoint}"))?;
+        Ok(Store::new(store))
+    }
+
+    /// Try to read a blob from a specific list of peer endpoints (e.g. from
+    /// a redirect response). Same logic as `try_read_from_worker` but uses
+    /// the caller-provided endpoints instead of consulting the locality map.
+    async fn try_read_from_endpoints(
+        &self,
+        key: StoreKey<'_>,
+        writer: &mut DropCloserWriteHalf,
+        offset: u64,
+        length: Option<u64>,
+        endpoints: &[String],
+    ) -> Result<bool, Error> {
+        let digest = key.borrow().into_digest();
+        debug!(
+            ?digest,
+            endpoint_count = endpoints.len(),
+            "WorkerProxyStore: following redirect to peer endpoints"
+        );
+
+        for endpoint in endpoints {
+            let Some(store) = self.get_or_create_connection(endpoint).await else {
+                continue;
+            };
+
+            match store
+                .get_part(key.borrow(), &mut *writer, offset, length)
+                .await
+            {
+                Ok(()) => {
+                    debug!(
+                        ?digest,
+                        endpoint = endpoint.as_str(),
+                        "WorkerProxyStore: successfully read blob from redirected peer"
+                    );
+                    return Ok(true);
+                }
+                Err(e) => {
+                    if is_connection_error(&e) {
+                        self.remove_worker_endpoint(endpoint);
+                    }
+                    warn!(
+                        ?digest,
+                        endpoint = endpoint.as_str(),
+                        ?e,
+                        "WorkerProxyStore: read from redirected peer failed, trying next"
+                    );
+                    continue;
+                }
+            }
+        }
+
+        Ok(false)
+    }
+
+    /// Try to read a blob from a worker that has it, according to the locality map.
+    ///
+    /// Streams directly from the peer to the caller's writer via `get_part()` —
+    /// no buffering.
+    /// If a peer fails mid-stream, we resume from the next peer
+    /// at the byte offset where the previous one left off (content-addressed
+    /// blobs are identical across peers).
+    async fn try_read_from_worker(
+        &self,
+        key: StoreKey<'_>,
+        writer: &mut DropCloserWriteHalf,
+        offset: u64,
+        length: Option<u64>,
+    ) -> Result<bool, Error> {
+        let digest = key.borrow().into_digest();
+        let workers = self.locality_map.read().lookup_workers(&digest);
+
+        if workers.is_empty() {
+            return Ok(false);
+        }
+
+        debug!(
+            ?digest,
+            worker_count = workers.len(),
+            "WorkerProxyStore: attempting to proxy blob from workers"
+        );
+
+        // Track how many bytes have been written so we can resume from the
+        // correct offset if a streaming peer fails mid-transfer.
+        let bytes_before_proxy = writer.get_bytes_written();
+        let mut current_offset = offset;
+        let mut remaining_length = length;
+
+        for endpoint in &workers {
+            let Some(store) = self.get_or_create_connection(endpoint).await else {
+                continue;
+            };
+
+            // Stream directly from the peer — no buffering.
+            // On failure, compute how many bytes were written and resume
+            // from the next peer at the correct offset.
+            match store
+                .get_part(key.borrow(), &mut *writer, current_offset, remaining_length)
+                .await
+            {
+                Ok(()) => {
+                    debug!(
+                        ?digest,
+                        endpoint = %endpoint,
+                        "WorkerProxyStore: successfully proxied blob from worker"
+                    );
+                    return Ok(true);
+                }
+                Err(e) => {
+                    if is_connection_error(&e) {
+                        self.remove_worker_endpoint(endpoint);
+                    }
+                    let bytes_written_total =
+                        writer.get_bytes_written() - bytes_before_proxy;
+                    warn!(
+                        ?digest,
+                        endpoint = %endpoint,
+                        bytes_written_total,
+                        ?e,
+                        "WorkerProxyStore: streaming get_part from peer failed, \
+                         will resume from next peer at offset {}",
+                        offset + bytes_written_total,
+                    );
+                    // Advance the offset so the next peer picks up where this one left off.
+                    current_offset = offset + bytes_written_total;
+                    // Recompute from the originally requested length so that
+                    // repeated failures don't subtract the cumulative byte
+                    // count from an already-reduced remainder.
+                    remaining_length =
+                        length.map(|len| len.saturating_sub(bytes_written_total));
+                    continue;
+                }
+            }
+        }
+
+        Ok(false)
+    }
+
+    /// The original sequential get_part logic: try the inner store, then parse
+    /// redirects, then fall back to locality map / peer proxying.
+    /// This is used as the fallback when no peers are known for racing.
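+    ///
+    /// Redirect wire format, as parsed below: a `FailedPrecondition` error
+    /// whose message embeds `"{REDIRECT_PREFIX}endpoint1,endpoint2|"`; text
+    /// after the first `|` is ignored and empty comma segments are dropped.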
+    async fn get_part_sequential(
+        &self,
+        key: StoreKey<'_>,
+        writer: &mut DropCloserWriteHalf,
+        offset: u64,
+        length: Option<u64>,
+    ) -> Result<(), Error> {
+        let mut redirect_endpoints: Option<Vec<String>> = None;
+        match IS_WORKER_REQUEST
+            .scope(
+                true,
+                self.inner.get_part(key.borrow(), &mut *writer, offset, length),
+            )
+            .await
+        {
+            Ok(()) => return Ok(()),
+            Err(e) if e.code == Code::NotFound => {
+                trace!(
+                    key = ?key.borrow().into_digest(),
+                    "WorkerProxyStore: inner store miss (NotFound), consulting locality map"
+                );
+            }
+            Err(e) if e.code == Code::FailedPrecondition => {
+                let msg = e.message_string();
+                if let Some(start) = msg.find(REDIRECT_PREFIX) {
+                    let endpoints_str = &msg[start + REDIRECT_PREFIX.len()..];
+                    let endpoints_str = endpoints_str
+                        .split('|')
+                        .next()
+                        .unwrap_or(endpoints_str);
+                    let endpoints: Vec<String> = endpoints_str
+                        .split(',')
+                        .filter(|s| !s.is_empty())
+                        .map(String::from)
+                        .collect();
+                    if !endpoints.is_empty() {
+                        debug!(
+                            key = ?key.borrow().into_digest(),
+                            ?endpoints,
+                            "WorkerProxyStore: received redirect from inner store"
+                        );
+                        redirect_endpoints = Some(endpoints);
+                    }
+                }
+                if redirect_endpoints.is_none() {
+                    return Err(e);
+                }
+            }
+            Err(e) => return Err(e),
+        }
+
+        if let Some(endpoints) = redirect_endpoints {
+            if self
+                .try_read_from_endpoints(key.borrow(), writer, offset, length, &endpoints)
+                .await?
+            {
+                return Ok(());
+            }
+        }
+
+        let is_worker = IS_WORKER_REQUEST.try_with(|v| *v).unwrap_or(false);
+
+        if is_worker {
+            let digest = key.borrow().into_digest();
+            let workers = self.locality_map.read().lookup_workers(&digest);
+            if workers.is_empty() {
+                return Err(make_err!(
+                    Code::NotFound,
+                    "Blob {digest:?} not found in inner store or locality map"
+                ));
+            }
+            let endpoints = workers.join(",");
+            debug!(
+                ?digest,
+                endpoints,
+                "WorkerProxyStore: redirecting worker to peer endpoints"
+            );
+            return Err(make_err!(
+                Code::FailedPrecondition,
+                "{REDIRECT_PREFIX}{endpoints}|"
+            ));
+        }
+
+        if self
+            .try_read_from_worker(key.borrow(), writer, offset, length)
+            .await?
+        {
+            return Ok(());
+        }
+
+        Err(make_err!(
+            Code::NotFound,
+            "Blob {:?} not found in inner store or any worker",
+            key.borrow().into_digest()
+        ))
+    }
+
+    /// Forward remaining data from a racer's read half to the caller's writer,
+    /// then wait for the spawned task to complete.
+    async fn forward_racer(
+        winner_name: &str,
+        writer: &mut DropCloserWriteHalf,
+        rx: &mut DropCloserReadHalf,
+        handle: JoinHandle<Result<(), Error>>,
+    ) -> Result<(), Error> {
+        // Forward all remaining chunks from the racer's channel to the
+        // caller's writer. bind_buffered handles EOF propagation.
+        writer
+            .bind_buffered(rx)
+            .await
+            .err_tip(|| format!("WorkerProxyStore: {winner_name} racer bind_buffered"))?;
+
+        // Wait for the spawned get_part to confirm it finished successfully.
+        // If the task was already done (sent EOF), this returns immediately.
+        handle
+            .await
+            .map_err(|e| make_err!(Code::Internal, "WorkerProxyStore: {winner_name} task join error: {e}"))?
+            .err_tip(|| format!("WorkerProxyStore: {winner_name} get_part failed after winning race"))
+    }
+}
+
+#[async_trait]
+impl StoreDriver for WorkerProxyStore {
+    async fn has_with_results(
+        self: Pin<&Self>,
+        digests: &[StoreKey<'_>],
+        results: &mut [Option<u64>],
+    ) -> Result<(), Error> {
+        // ONLY check inner store. Never consult the locality map for has().
+        // This prevents stale-positive issues with FindMissingBlobs.
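+        // A stale locality-map entry would otherwise let FindMissingBlobs
+        // claim a blob exists when no live peer can actually serve it.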
+        self.inner.has_with_results(digests, results).await
+    }
+
+    async fn update(
+        self: Pin<&Self>,
+        key: StoreKey<'_>,
+        reader: DropCloserReadHalf,
+        upload_size: UploadSizeInfo,
+    ) -> Result<(), Error> {
+        // Pass through to inner store.
+        self.inner.update(key, reader, upload_size).await
+    }
+
+    fn optimized_for(&self, optimization: StoreOptimizations) -> bool {
+        // Report LazyExistenceOnSync so that FastSlowStore skips the has()
+        // check before get_part(). Our has() only checks the inner store
+        // (to avoid stale-positive FindMissingBlobs), but get_part() also
+        // consults the locality map and peer workers. Without this, blobs
+        // that exist only on peer workers would never be found by
+        // FastSlowStore because has() returns None.
+        if optimization == StoreOptimizations::LazyExistenceOnSync {
+            return true;
+        }
+        self.inner
+            .inner_store(None::<StoreKey<'_>>)
+            .optimized_for(optimization)
+    }
+
+    async fn get_part(
+        self: Pin<&Self>,
+        key: StoreKey<'_>,
+        writer: &mut DropCloserWriteHalf,
+        offset: u64,
+        length: Option<u64>,
+    ) -> Result<(), Error> {
+        // Only race when explicitly enabled (worker side). Server-side
+        // WorkerProxyStore uses the sequential path, which generates
+        // redirects for workers and proxies for non-worker callers.
+        let digest = key.borrow().into_digest();
+        let peers = if self.race_peers {
+            self.locality_map.read().lookup_workers(&digest)
+        } else {
+            Vec::new()
+        };
+
+        if peers.is_empty() {
+            // No peers known (or server side) — use the sequential path.
+            return self
+                .get_part_sequential(key, writer, offset, length)
+                .await;
+        }
+
+        // Try to get a connection to the first peer.
+        let peer_store = match self.get_or_create_connection(&peers[0]).await {
+            Some(store) => store,
+            None => {
+                return self
+                    .get_part_sequential(key, writer, offset, length)
+                    .await;
+            }
+        };
+        let peer_endpoint: Arc<str> = peers[0].clone();
+
+        // Create buf_channel pairs for each racer. Each spawned task writes
+        // into its own tx; we read from the rx to see who produces data first.
+        let (mut server_tx, mut server_rx) = make_buf_channel_pair();
+        let (mut peer_tx, mut peer_rx) = make_buf_channel_pair();
+
+        // We need owned keys for the spawned tasks.
+        let server_key = key.borrow().into_owned();
+        let peer_key = key.borrow().into_owned();
+
+        // Clone inner store for the server task.
+        let inner = self.inner.clone();
+
+        // Spawn server fetch. Do NOT set IS_WORKER_REQUEST — we want the
+        // server to actually serve the blob data, not return a redirect.
+        let server_handle: JoinHandle<Result<(), Error>> = tokio::spawn(async move {
+            inner
+                .get_part(server_key.borrow(), &mut server_tx, offset, length)
+                .await
+        });
+
+        // Spawn peer fetch.
+        let peer_handle: JoinHandle<Result<(), Error>> = tokio::spawn(async move {
+            peer_store
+                .get_part(peer_key.borrow(), &mut peer_tx, offset, length)
+                .await
+        });
+
+        // Race: wait for the first racer to produce a data chunk (or error).
+        tokio::select! {
+            server_result = server_rx.recv() => {
+                match server_result {
+                    Ok(chunk) if !chunk.is_empty() => {
+                        // Server produced data first — it wins.
+                        peer_handle.abort();
+                        debug!(
+                            ?digest,
+                            "WorkerProxyStore: server won race against peer"
+                        );
+                        writer.send(chunk).await
+                            .err_tip(|| "WorkerProxyStore: sending server winner chunk")?;
+                        Self::forward_racer("server", writer, &mut server_rx, server_handle).await
+                    }
+                    Ok(_empty) => {
+                        // Server returned EOF immediately (zero-length blob).
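+                        // (By buf_channel convention, an empty chunk from
+                        // recv() signals EOF with no data.)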
+ peer_handle.abort(); + debug!( + ?digest, + "WorkerProxyStore: server won race (empty blob)" + ); + writer.send_eof() + .err_tip(|| "WorkerProxyStore: sending EOF for empty blob")?; + server_handle.await + .map_err(|e| make_err!(Code::Internal, "server task join: {e}"))? + } + Err(_server_err) => { + // Server racer failed — wait for peer. + warn!( + ?digest, + "WorkerProxyStore: server racer failed, waiting for peer" + ); + let peer_chunk = peer_rx.recv().await + .err_tip(|| "WorkerProxyStore: peer recv after server failure")?; + if peer_chunk.is_empty() { + writer.send_eof() + .err_tip(|| "WorkerProxyStore: peer EOF after server failure")?; + return peer_handle.await + .map_err(|e| make_err!(Code::Internal, "peer task join: {e}"))?; + } + debug!( + ?digest, + endpoint = %peer_endpoint, + "WorkerProxyStore: peer won race (server failed)" + ); + writer.send(peer_chunk).await + .err_tip(|| "WorkerProxyStore: sending peer fallback chunk")?; + Self::forward_racer("peer", writer, &mut peer_rx, peer_handle).await + } + } + } + peer_result = peer_rx.recv() => { + match peer_result { + Ok(chunk) if !chunk.is_empty() => { + // Peer produced data first — it wins. + server_handle.abort(); + debug!( + ?digest, + endpoint = %peer_endpoint, + "WorkerProxyStore: peer won race against server" + ); + writer.send(chunk).await + .err_tip(|| "WorkerProxyStore: sending peer winner chunk")?; + Self::forward_racer("peer", writer, &mut peer_rx, peer_handle).await + } + Ok(_empty) => { + // Peer returned EOF immediately (zero-length blob). + server_handle.abort(); + debug!( + ?digest, + endpoint = %peer_endpoint, + "WorkerProxyStore: peer won race (empty blob)" + ); + writer.send_eof() + .err_tip(|| "WorkerProxyStore: sending EOF for empty blob from peer")?; + peer_handle.await + .map_err(|e| make_err!(Code::Internal, "peer task join: {e}"))? + } + Err(_peer_err) => { + // Peer racer failed — wait for server. + warn!( + ?digest, + endpoint = %peer_endpoint, + "WorkerProxyStore: peer racer failed, waiting for server" + ); + let server_chunk = server_rx.recv().await + .err_tip(|| "WorkerProxyStore: server recv after peer failure")?; + if server_chunk.is_empty() { + writer.send_eof() + .err_tip(|| "WorkerProxyStore: server EOF after peer failure")?; + return server_handle.await + .map_err(|e| make_err!(Code::Internal, "server task join: {e}"))?; + } + debug!( + ?digest, + "WorkerProxyStore: server won race (peer failed)" + ); + writer.send(server_chunk).await + .err_tip(|| "WorkerProxyStore: sending server fallback chunk")?; + Self::forward_racer("server", writer, &mut server_rx, server_handle).await + } + } + } + } + } + + fn inner_store(&self, key: Option) -> &dyn StoreDriver { + // Delegate to inner store so that callers can downcast through + // the chain (e.g. worker finding FastSlowStore via downcast_ref). + // WorkerProxyStore's optimized_for override is independent of this. 
+        self.inner.inner_store(key)
+    }
+
+    fn as_any<'a>(&'a self) -> &'a (dyn core::any::Any + Sync + Send + 'static) {
+        self
+    }
+
+    fn as_any_arc(self: Arc<Self>) -> Arc<dyn core::any::Any + Sync + Send + 'static> {
+        self
+    }
+
+    fn register_item_callback(
+        self: Arc<Self>,
+        callback: Arc<dyn ItemCallback>,
+    ) -> Result<(), Error> {
+        self.inner.register_item_callback(callback)
+    }
+}
+
+#[async_trait]
+impl HealthStatusIndicator for WorkerProxyStore {
+    fn get_name(&self) -> &'static str {
+        "WorkerProxyStore"
+    }
+
+    async fn check_health(
+        &self,
+        namespace: Cow<'static, str>,
+    ) -> HealthStatus {
+        self.inner.check_health(namespace).await
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use bytes::Bytes;
+    use nativelink_config::stores::MemorySpec;
+    use nativelink_error::{Code, Error, make_err};
+    use nativelink_macro::nativelink_test;
+    use nativelink_util::blob_locality_map::new_shared_blob_locality_map;
+    use nativelink_util::common::DigestInfo;
+    use nativelink_util::store_trait::{
+        IS_WORKER_REQUEST, REDIRECT_PREFIX, StoreKey, StoreLike, StoreOptimizations,
+    };
+    use pretty_assertions::assert_eq;
+
+    use super::*;
+    use crate::memory_store::MemoryStore;
+
+    const VALID_HASH1: &str =
+        "0123456789abcdef000000000000000000010000000000000123456789abcdef";
+    const VALID_HASH2: &str =
+        "0123456789abcdef000000000000000000020000000000000123456789abcdef";
+
+    /// Helper: create a WorkerProxyStore backed by a fresh MemoryStore.
+    fn make_proxy_store() -> (Store, SharedBlobLocalityMap) {
+        let inner = Store::new(MemoryStore::new(&MemorySpec::default()));
+        let locality_map = new_shared_blob_locality_map();
+        let proxy = WorkerProxyStore::new(inner, locality_map.clone());
+        (Store::new(proxy), locality_map)
+    }
+
+    // ---------------------------------------------------------------
+    // 1. Inner store hit returns data without consulting locality map.
+    // ---------------------------------------------------------------
+    #[nativelink_test]
+    async fn test_inner_store_hit_skips_locality() -> Result<(), Error> {
+        let (store, locality_map) = make_proxy_store();
+
+        let value = b"hello world";
+        let digest = DigestInfo::try_new(VALID_HASH1, value.len() as u64)?;
+
+        // Write the blob into the inner store via the proxy.
+        store
+            .update_oneshot(digest, Bytes::from_static(value))
+            .await?;
+
+        // Register a fake worker in the locality map so we can verify
+        // it is NOT contacted when the inner store already has the blob.
+        locality_map
+            .write()
+            .register_blobs("fake-worker:50081", &[digest]);
+
+        // Read the blob back — should succeed from the inner store.
+        let result = store
+            .get_part_unchunked(digest, 0, None)
+            .await?;
+        assert_eq!(result.as_ref(), value);
+
+        Ok(())
+    }
+
+    // ---------------------------------------------------------------
+    // 2. Inner store miss + empty locality map => NotFound.
+    // ---------------------------------------------------------------
+    #[nativelink_test]
+    async fn test_inner_store_miss_no_peers_returns_not_found() -> Result<(), Error> {
+        let (store, _locality_map) = make_proxy_store();
+
+        let digest = DigestInfo::try_new(VALID_HASH1, 100)?;
+
+        // The inner store is empty and the locality map has no entries.
+        let result = store.get_part_unchunked(digest, 0, None).await;
+
+        assert!(result.is_err(), "Expected NotFound error");
+        let err = result.unwrap_err();
+        assert_eq!(
+            err.code,
+            Code::NotFound,
+            "Expected NotFound code, got: {err:?}"
+        );
+
+        Ok(())
+    }
+
+    // ---------------------------------------------------------------
+    // 3. Inner store miss + locality has peers but no gRPC connections
+    //    => falls through gracefully and returns NotFound.
+    // ---------------------------------------------------------------
+    #[nativelink_test]
+    async fn test_inner_store_miss_locality_has_peers_but_no_connections()
+    -> Result<(), Error> {
+        let (store, locality_map) = make_proxy_store();
+
+        let digest = DigestInfo::try_new(VALID_HASH1, 100)?;
+
+        // Use an invalid URI that fails during GrpcStore::new(). The
+        // space character is illegal in URIs, so Uri::try_from() fails
+        // and create_worker_connection returns Err. try_read_from_worker
+        // will `continue` past this endpoint and return Ok(false),
+        // resulting in the final NotFound error.
+        locality_map
+            .write()
+            .register_blobs("not a valid uri", &[digest]);
+
+        let result = store.get_part_unchunked(digest, 0, None).await;
+
+        assert!(result.is_err(), "Expected NotFound error");
+        let err = result.unwrap_err();
+        assert_eq!(
+            err.code,
+            Code::NotFound,
+            "Expected NotFound, got: {err:?}"
+        );
+
+        Ok(())
+    }
+
+    // ---------------------------------------------------------------
+    // 4. has_with_results passes through to inner store (no proxy).
+    // ---------------------------------------------------------------
+    #[nativelink_test]
+    async fn test_has_with_results_passes_through() -> Result<(), Error> {
+        let (store, locality_map) = make_proxy_store();
+
+        let value = b"test data";
+        let d1 = DigestInfo::try_new(VALID_HASH1, value.len() as u64)?;
+        let d2 = DigestInfo::try_new(VALID_HASH2, 999)?;
+
+        // Only d1 is in the inner store.
+        store
+            .update_oneshot(d1, Bytes::from_static(value))
+            .await?;
+
+        // Register d2 on a worker so we can prove has() does NOT
+        // consult the locality map.
+        locality_map
+            .write()
+            .register_blobs("worker-a:50081", &[d2]);
+
+        let keys: Vec<StoreKey<'static>> = vec![d1.into(), d2.into()];
+        let mut results = vec![None; 2];
+        store.has_with_results(&keys, &mut results).await?;
+
+        // d1 should be found with correct size.
+        assert_eq!(
+            results[0],
+            Some(value.len() as u64),
+            "d1 should be present in inner store"
+        );
+        // d2 should NOT be found (locality map is never consulted for has).
+        assert_eq!(
+            results[1], None,
+            "d2 should NOT be found — has() must not consult locality map"
+        );
+
+        Ok(())
+    }
+
+    // ---------------------------------------------------------------
+    // 5. update() passes through to inner store.
+    // ---------------------------------------------------------------
+    #[nativelink_test]
+    async fn test_update_passes_through() -> Result<(), Error> {
+        let (store, _locality_map) = make_proxy_store();
+
+        let value = b"upload me";
+        let digest = DigestInfo::try_new(VALID_HASH1, value.len() as u64)?;
+
+        // Upload via the proxy store.
+        store
+            .update_oneshot(digest, Bytes::from_static(value))
+            .await?;
+
+        // Verify the blob is retrievable (proving it went into the inner store).
+        let data = store.get_part_unchunked(digest, 0, None).await?;
+        assert_eq!(data.as_ref(), value);
+
+        // Also verify via has().
+        let size = store.has(digest).await?;
+        assert_eq!(size, Some(value.len() as u64));
+
+        Ok(())
+    }
+
+    // ---------------------------------------------------------------
+    // 6. get_part with offset and length returns correct subset.
+ // --------------------------------------------------------------- + #[nativelink_test] + async fn test_get_part_with_offset_and_length() -> Result<(), Error> { + let (store, _locality_map) = make_proxy_store(); + + let value = b"0123456789abcdefghij"; // 20 bytes + let digest = DigestInfo::try_new(VALID_HASH1, value.len() as u64)?; + + store + .update_oneshot(digest, Bytes::from_static(value)) + .await?; + + // Read bytes [5..15) — 10 bytes starting at offset 5. + let data = store + .get_part_unchunked(digest, 5, Some(10)) + .await?; + assert_eq!( + data.as_ref(), + b"56789abcde", + "Expected subset at offset=5, length=10" + ); + + // Read from offset 15 to end (no length limit). + let data = store.get_part_unchunked(digest, 15, None).await?; + assert_eq!( + data.as_ref(), + b"fghij", + "Expected tail from offset=15" + ); + + // Read 0 bytes from offset 0 with length 0. + let data = store + .get_part_unchunked(digest, 0, Some(0)) + .await?; + assert_eq!(data.as_ref(), b"", "Expected empty result for length=0"); + + Ok(()) + } + + // --------------------------------------------------------------- + // 7. Redirect parsing: well-formed redirect error. + // --------------------------------------------------------------- + #[nativelink_test] + async fn test_redirect_well_formed() -> Result<(), Error> { + let err = make_err!( + Code::FailedPrecondition, + "{REDIRECT_PREFIX}grpc://w1:50071,grpc://w2:50071|" + ); + let msg = err.message_string(); + let start = msg.find(REDIRECT_PREFIX).expect("prefix missing"); + let endpoints_str = &msg[start + REDIRECT_PREFIX.len()..]; + let endpoints_str = endpoints_str.split('|').next().unwrap_or(endpoints_str); + let endpoints: Vec = endpoints_str + .split(',') + .filter(|s| !s.is_empty()) + .map(String::from) + .collect(); + assert_eq!(endpoints.len(), 2); + assert_eq!(endpoints[0], "grpc://w1:50071"); + assert_eq!(endpoints[1], "grpc://w2:50071"); + Ok(()) + } + + // --------------------------------------------------------------- + // 8. Redirect parsing: trailing noise after pipe is ignored. + // --------------------------------------------------------------- + #[nativelink_test] + async fn test_redirect_trailing_noise_after_pipe() -> Result<(), Error> { + let err = make_err!( + Code::FailedPrecondition, + "{REDIRECT_PREFIX}grpc://w1:50071|some extra noise" + ); + let msg = err.message_string(); + let start = msg.find(REDIRECT_PREFIX).expect("prefix missing"); + let endpoints_str = &msg[start + REDIRECT_PREFIX.len()..]; + let endpoints_str = endpoints_str.split('|').next().unwrap_or(endpoints_str); + let endpoints: Vec = endpoints_str + .split(',') + .filter(|s| !s.is_empty()) + .map(String::from) + .collect(); + assert_eq!(endpoints.len(), 1); + assert_eq!(endpoints[0], "grpc://w1:50071"); + Ok(()) + } + + // --------------------------------------------------------------- + // 9. Redirect parsing: empty segments filtered out. 
+ // --------------------------------------------------------------- + #[nativelink_test] + async fn test_redirect_empty_segments_filtered() -> Result<(), Error> { + let err = make_err!( + Code::FailedPrecondition, + "{REDIRECT_PREFIX}a,,b,|" + ); + let msg = err.message_string(); + let start = msg.find(REDIRECT_PREFIX).expect("prefix missing"); + let endpoints_str = &msg[start + REDIRECT_PREFIX.len()..]; + let endpoints_str = endpoints_str.split('|').next().unwrap_or(endpoints_str); + let endpoints: Vec = endpoints_str + .split(',') + .filter(|s| !s.is_empty()) + .map(String::from) + .collect(); + assert_eq!(endpoints, vec!["a", "b"]); + Ok(()) + } + + // --------------------------------------------------------------- + // 10. IS_WORKER_REQUEST=true gets redirect with peer endpoints. + // --------------------------------------------------------------- + #[nativelink_test] + async fn test_worker_request_gets_redirect() -> Result<(), Error> { + let (store, locality_map) = make_proxy_store(); + + let digest = DigestInfo::try_new(VALID_HASH1, 100)?; + let peer_endpoint = "grpc://peer-worker:50071"; + + locality_map + .write() + .register_blobs(peer_endpoint, &[digest]); + + let result = IS_WORKER_REQUEST + .scope(true, store.get_part_unchunked(digest, 0, None)) + .await; + + assert!(result.is_err(), "Expected redirect error"); + let err = result.unwrap_err(); + assert_eq!( + err.code, + Code::FailedPrecondition, + "Redirect should use FailedPrecondition, got: {err:?}" + ); + let msg = err.message_string(); + assert!( + msg.contains(REDIRECT_PREFIX), + "Error should contain redirect prefix: {msg}" + ); + assert!( + msg.contains(peer_endpoint), + "Error should contain peer endpoint: {msg}" + ); + + Ok(()) + } + + // --------------------------------------------------------------- + // 11. IS_WORKER_REQUEST=false gets NotFound (no proxy to invalid peer). + // --------------------------------------------------------------- + #[nativelink_test] + async fn test_non_worker_request_gets_not_found() -> Result<(), Error> { + let (store, locality_map) = make_proxy_store(); + + let digest = DigestInfo::try_new(VALID_HASH1, 100)?; + + // Use an invalid URI so the proxy attempt fails gracefully. + locality_map + .write() + .register_blobs("not a valid uri", &[digest]); + + let result = IS_WORKER_REQUEST + .scope(false, store.get_part_unchunked(digest, 0, None)) + .await; + + assert!(result.is_err(), "Expected NotFound error"); + let err = result.unwrap_err(); + assert_eq!( + err.code, + Code::NotFound, + "Non-worker should get NotFound, got: {err:?}" + ); + + Ok(()) + } + + // --------------------------------------------------------------- + // 12. optimized_for(LazyExistenceOnSync) returns true. + // --------------------------------------------------------------- + #[nativelink_test] + async fn test_optimized_for_lazy_existence() -> Result<(), Error> { + let inner = Store::new(MemoryStore::new(&MemorySpec::default())); + let locality_map = new_shared_blob_locality_map(); + let proxy = WorkerProxyStore::new(inner, locality_map); + + assert!( + StoreDriver::optimized_for(&*proxy, StoreOptimizations::LazyExistenceOnSync), + "WorkerProxyStore should report LazyExistenceOnSync" + ); + + Ok(()) + } + + // --------------------------------------------------------------- + // 13. optimized_for(other) delegates to inner store. 
+ // --------------------------------------------------------------- + #[nativelink_test] + async fn test_optimized_for_other_delegates_to_inner() -> Result<(), Error> { + let inner = Store::new(MemoryStore::new(&MemorySpec::default())); + let locality_map = new_shared_blob_locality_map(); + let proxy = WorkerProxyStore::new(inner, locality_map); + + assert!( + !StoreDriver::optimized_for(&*proxy, StoreOptimizations::NoopUpdates), + "Should delegate non-LazyExistence optimizations to inner store" + ); + + Ok(()) + } + + // --------------------------------------------------------------- + // 14. Race: inner store has blob, peer registered — server wins race. + // --------------------------------------------------------------- + #[nativelink_test] + async fn test_race_server_wins_when_inner_has_blob() -> Result<(), Error> { + let inner = Store::new(MemoryStore::new(&MemorySpec::default())); + let locality_map = new_shared_blob_locality_map(); + let mut proxy = WorkerProxyStore::new(inner.clone(), locality_map.clone()); + Arc::get_mut(&mut proxy).unwrap().enable_race_peers(); + let store = Store::new(proxy.clone()); + + let value = b"race test data"; + let digest = DigestInfo::try_new(VALID_HASH1, value.len() as u64)?; + + // Put blob in inner store. + inner + .update_oneshot(digest, Bytes::from_static(value)) + .await?; + + // Inject a peer that also has the blob (MemoryStore with same data). + let peer_store = Store::new(MemoryStore::new(&MemorySpec::default())); + peer_store + .update_oneshot(digest, Bytes::from_static(value)) + .await?; + proxy.inject_worker_connection("grpc://peer:50071", peer_store); + + locality_map + .write() + .register_blobs("grpc://peer:50071", &[digest]); + + // NOT in IS_WORKER_REQUEST scope, so racing path is taken. + let result = store.get_part_unchunked(digest, 0, None).await?; + assert_eq!(result.as_ref(), value); + + Ok(()) + } + + // --------------------------------------------------------------- + // 15. Race: inner store miss, peer has blob — peer wins race. + // --------------------------------------------------------------- + #[nativelink_test] + async fn test_race_peer_wins_when_inner_misses() -> Result<(), Error> { + let inner = Store::new(MemoryStore::new(&MemorySpec::default())); + let locality_map = new_shared_blob_locality_map(); + let mut proxy = WorkerProxyStore::new(inner, locality_map.clone()); + Arc::get_mut(&mut proxy).unwrap().enable_race_peers(); + let store = Store::new(proxy.clone()); + + let value = b"peer only data"; + let digest = DigestInfo::try_new(VALID_HASH1, value.len() as u64)?; + + // Inner store is empty. Peer has the blob. + let peer_store = Store::new(MemoryStore::new(&MemorySpec::default())); + peer_store + .update_oneshot(digest, Bytes::from_static(value)) + .await?; + proxy.inject_worker_connection("grpc://peer:50071", peer_store); + + locality_map + .write() + .register_blobs("grpc://peer:50071", &[digest]); + + let result = store.get_part_unchunked(digest, 0, None).await?; + assert_eq!(result.as_ref(), value); + + Ok(()) + } + + // --------------------------------------------------------------- + // 16. Race: both inner and peer miss — returns error. 
+ // --------------------------------------------------------------- + #[nativelink_test] + async fn test_race_both_miss_returns_error() -> Result<(), Error> { + let inner = Store::new(MemoryStore::new(&MemorySpec::default())); + let locality_map = new_shared_blob_locality_map(); + let mut proxy = WorkerProxyStore::new(inner, locality_map.clone()); + Arc::get_mut(&mut proxy).unwrap().enable_race_peers(); + let store = Store::new(proxy.clone()); + + let digest = DigestInfo::try_new(VALID_HASH1, 100)?; + + // Both inner and peer are empty. + let peer_store = Store::new(MemoryStore::new(&MemorySpec::default())); + proxy.inject_worker_connection("grpc://peer:50071", peer_store); + + locality_map + .write() + .register_blobs("grpc://peer:50071", &[digest]); + + let result = store.get_part_unchunked(digest, 0, None).await; + assert!(result.is_err(), "Expected error when both miss"); + + Ok(()) + } +} diff --git a/nativelink-store/tests/ac_utils_test.rs b/nativelink-store/tests/ac_utils_test.rs index f9cd4ac9f..d1270483b 100644 --- a/nativelink-store/tests/ac_utils_test.rs +++ b/nativelink-store/tests/ac_utils_test.rs @@ -62,10 +62,9 @@ async fn upload_file_to_store_with_large_file() -> Result<(), Error> { } { // Upload our file. - let file = fs::open_file(&filepath, 0, u64::MAX) + let file = fs::open_file(&filepath, 0) .await - .unwrap() - .into_inner(); + .unwrap(); store .update_with_whole_file( digest, diff --git a/nativelink-store/tests/existence_store_test.rs b/nativelink-store/tests/existence_store_test.rs index 5bba22256..9560140b8 100644 --- a/nativelink-store/tests/existence_store_test.rs +++ b/nativelink-store/tests/existence_store_test.rs @@ -26,6 +26,7 @@ use nativelink_util::common::DigestInfo; use nativelink_util::instant_wrapper::MockInstantWrapped; use nativelink_util::store_trait::{Store, StoreLike}; use pretty_assertions::assert_eq; +use tokio::time::sleep; const VALID_HASH1: &str = "0123456789abcdef000000000000000000010000000000000123456789abcdef"; @@ -144,11 +145,12 @@ async fn ensure_has_requests_do_let_evictions_happen() -> Result<(), Error> { assert_eq!(store.has(digest).await, Ok(Some(VALUE.len() as u64))); MockClock::advance(Duration::from_secs(3)); - // Now that our existence cache has been populated, remove - // it from the inner store. + // Remove from the inner store. inner_store.remove_entry(digest.into()).await; - // It should be immediately evicted from the existence cache. + // Allow background eviction callbacks to propagate to the existence cache. + sleep(Duration::from_millis(10)).await; + // has() reflects the removal once the background callback clears the cache. assert_eq!(store.has(digest).await, Ok(None)); Ok(()) @@ -175,6 +177,8 @@ async fn copes_with_dropped_items() -> Result<(), Error> { .await .err_tip(|| "Failed to update store")?; + // Allow background eviction callbacks to propagate to the existence cache. 
+ sleep(Duration::from_millis(10)).await; let inner_store_item = inner_store.has(digest).await; assert!( inner_store_item.is_ok(), diff --git a/nativelink-store/tests/fast_slow_store_test.rs b/nativelink-store/tests/fast_slow_store_test.rs index 53dd12387..04a82d870 100644 --- a/nativelink-store/tests/fast_slow_store_test.rs +++ b/nativelink-store/tests/fast_slow_store_test.rs @@ -28,7 +28,7 @@ use nativelink_store::noop_store::NoopStore; use nativelink_util::buf_channel::make_buf_channel_pair; use nativelink_util::common::DigestInfo; use nativelink_util::health_utils::{HealthStatusIndicator, default_health_status_indicator}; -use nativelink_util::store_trait::{RemoveItemCallback, Store, StoreDriver, StoreKey, StoreLike}; +use nativelink_util::store_trait::{ItemCallback, Store, StoreDriver, StoreKey, StoreLike}; use pretty_assertions::assert_eq; use rand::rngs::SmallRng; use rand::{Rng, SeedableRng}; @@ -310,9 +310,9 @@ async fn drop_on_eof_completes_store_futures() -> Result<(), Error> { self } - fn register_remove_callback( + fn register_item_callback( self: Arc, - _callback: Arc, + _callback: Arc, ) -> Result<(), Error> { Ok(()) } @@ -634,9 +634,9 @@ fn make_stores_with_lazy_slow() -> (Store, Store, Store) { self } - fn register_remove_callback( + fn register_item_callback( self: Arc, - _callback: Arc, + _callback: Arc, ) -> Result<(), Error> { Ok(()) } diff --git a/nativelink-store/tests/filesystem_store_test.rs b/nativelink-store/tests/filesystem_store_test.rs index 7655de0c1..cc441a80e 100644 --- a/nativelink-store/tests/filesystem_store_test.rs +++ b/nativelink-store/tests/filesystem_store_test.rs @@ -44,7 +44,6 @@ use pretty_assertions::assert_eq; use rand::rngs::SmallRng; use rand::{Rng, SeedableRng}; use sha2::{Digest, Sha256}; -use tokio::io::{AsyncReadExt, AsyncSeekExt, AsyncWriteExt, Take}; use tokio::sync::{Barrier, Semaphore}; use tokio::time::sleep; use tokio_stream::StreamExt; @@ -124,11 +123,11 @@ impl FileEntry for TestFileEntry< self.inner.as_ref().unwrap().get_encoded_file_path() } - async fn read_file_part(&self, offset: u64, length: u64) -> Result, Error> { + async fn read_file_part(&self, offset: u64) -> Result { self.inner .as_ref() .unwrap() - .read_file_part(offset, length) + .read_file_part(offset) .await } @@ -211,14 +210,7 @@ fn make_temp_path(data: &str) -> String { } async fn read_file_contents(file_name: &OsStr) -> Result, Error> { - let mut file = fs::open_file(file_name, 0, u64::MAX) - .await - .err_tip(|| format!("Failed to open file: {}", file_name.display()))?; - let mut data = vec![]; - file.read_to_end(&mut data) - .await - .err_tip(|| "Error reading file to end")?; - Ok(data) + fs::read(Path::new(file_name)).await } async fn wait_for_no_open_files() -> Result<(), Error> { @@ -406,7 +398,13 @@ async fn file_continues_to_stream_on_content_replace_test() -> Result<(), Error> } } - let digest1 = DigestInfo::try_new(HASH1, VALUE1.len())?; + // Use a large value so the producer is still blocked mid-stream when we + // check the temp directory. With read_buffer_size=1 and channel capacity 64, + // the producer sends 1-byte chunks. It needs well over 64 bytes to ensure + // it can't finish before the test inspects temp_path. 
+ let large_value1: String = "abcdefghij".repeat(10); // 100 bytes + let large_value2: String = "ABCDEFGHIJ".repeat(10); // 100 bytes + let digest1 = DigestInfo::try_new(HASH1, large_value1.len())?; let content_path = make_temp_path("content_path"); let temp_path = make_temp_path("temp_path"); @@ -426,7 +424,9 @@ async fn file_continues_to_stream_on_content_replace_test() -> Result<(), Error> ); // Insert data into store. - store.update_oneshot(digest1, VALUE1.into()).await?; + store + .update_oneshot(digest1, large_value1.clone().into()) + .await?; let (writer, mut reader) = make_buf_channel_pair(); let store_clone = store.clone(); @@ -444,13 +444,15 @@ async fn file_continues_to_stream_on_content_replace_test() -> Result<(), Error> .err_tip(|| "Error reading first byte")?; assert_eq!( first_byte[0], - VALUE1.as_bytes()[0], + large_value1.as_bytes()[0], "Expected first byte to match" ); } // Replace content. - store.update_oneshot(digest1, VALUE2.into()).await?; + store + .update_oneshot(digest1, large_value2.into()) + .await?; // Ensure we let any background tasks finish. tokio::task::yield_now().await; @@ -469,7 +471,7 @@ async fn file_continues_to_stream_on_content_replace_test() -> Result<(), Error> let data = read_file_contents(path.as_os_str()).await?; assert_eq!( &data[..], - VALUE1.as_bytes(), + large_value1.as_bytes(), "Expected file content to match" ); } @@ -486,7 +488,7 @@ async fn file_continues_to_stream_on_content_replace_test() -> Result<(), Error> assert_eq!( &remaining_file_data, - &VALUE1.as_bytes()[1..], + &large_value1.as_bytes()[1..], "Expected file content to match" ); @@ -514,8 +516,17 @@ async fn file_gets_cleans_up_on_cache_eviction() -> Result<(), Error> { } } - let digest1 = DigestInfo::try_new(HASH1, VALUE1.len())?; - let digest2 = DigestInfo::try_new(HASH2, VALUE2.len())?; + // Use a large value so the producer is still blocked mid-stream when we + // check the temp directory. With read_buffer_size=1 and channel capacity 64, + // the producer sends 1-byte chunks. It needs well over 64 bytes to ensure + // it can't finish before the test inspects temp_path. With a small value + // (e.g. 10 bytes), all chunks fit in the channel buffer, the get task + // completes immediately, and the background delete can race ahead of the + // temp directory inspection. + let large_value1: String = "abcdefghij".repeat(10); // 100 bytes + let large_value2: String = "ABCDEFGHIJ".repeat(10); // 100 bytes + let digest1 = DigestInfo::try_new(HASH1, large_value1.len())?; + let digest2 = DigestInfo::try_new(HASH2, large_value2.len())?; let content_path = make_temp_path("content_path"); let temp_path = make_temp_path("temp_path"); @@ -535,23 +546,36 @@ async fn file_gets_cleans_up_on_cache_eviction() -> Result<(), Error> { ); // Insert data into store. - store.update_oneshot(digest1, VALUE1.into()).await.unwrap(); - - let mut reader = { - let (writer, reader) = make_buf_channel_pair(); - let store_clone = store.clone(); - background_spawn!( - "file_gets_cleans_up_on_cache_eviction_store_get", - async move { store_clone.get(digest1, writer).await.unwrap() }, + store + .update_oneshot(digest1, large_value1.clone().into()) + .await + .unwrap(); + + let (writer, mut reader) = make_buf_channel_pair(); + let store_clone = store.clone(); + background_spawn!( + "file_gets_cleans_up_on_cache_eviction_store_get", + async move { store_clone.get(digest1, writer).await.unwrap() }, + ); + + { + // Check to ensure our first byte has been received. 
The future should be stalled + // here because the large value exceeds the channel capacity with read_buffer_size=1. + let first_byte = reader + .consume(Some(1)) + .await + .err_tip(|| "Error reading first byte")?; + assert_eq!( + first_byte[0], + large_value1.as_bytes()[0], + "Expected first byte to match" ); - reader - }; - // Ensure we have received 1 byte in our buffer. This will ensure we have a reference to - // our file open. - assert!(reader.peek().await.is_ok(), "Could not peek into reader"); + } // Insert new content. This will evict the old item. - store.update_oneshot(digest2, VALUE2.into()).await?; + store + .update_oneshot(digest2, large_value2.into()) + .await?; // Ensure we let any background tasks finish. tokio::task::yield_now().await; @@ -570,7 +594,7 @@ async fn file_gets_cleans_up_on_cache_eviction() -> Result<(), Error> { let data = read_file_contents(path.as_os_str()).await?; assert_eq!( &data[..], - VALUE1.as_bytes(), + large_value1.as_bytes(), "Expected file content to match" ); } @@ -580,12 +604,16 @@ async fn file_gets_cleans_up_on_cache_eviction() -> Result<(), Error> { ); } - let reader_data = reader + let remaining_file_data = reader .consume(Some(1024)) .await .err_tip(|| "Error reading remaining bytes")?; - assert_eq!(&reader_data, VALUE1, "Expected file content to match"); + assert_eq!( + &remaining_file_data, + &large_value1.as_bytes()[1..], + "Expected file content to match" + ); loop { if DELETES_FINISHED.load(Ordering::Relaxed) == 1 { @@ -619,9 +647,9 @@ async fn digest_contents_replaced_continues_using_old_data() -> Result<(), Error let file_entry = store.get_file_entry_for_digest(&digest).await?; { // The file contents should equal our initial data. - let mut reader = file_entry.read_file_part(0, u64::MAX).await?; + let mut reader = file_entry.read_file_part(0).await?; let mut file_contents = String::new(); - reader.read_to_string(&mut file_contents).await?; + std::io::Read::read_to_string(reader.as_std_mut(), &mut file_contents)?; assert_eq!(file_contents, VALUE1); } @@ -630,9 +658,9 @@ async fn digest_contents_replaced_continues_using_old_data() -> Result<(), Error { // The file contents still equal our old data. - let mut reader = file_entry.read_file_part(0, u64::MAX).await?; + let mut reader = file_entry.read_file_part(0).await?; let mut file_contents = String::new(); - reader.read_to_string(&mut file_contents).await?; + std::io::Read::read_to_string(reader.as_std_mut(), &mut file_contents)?; assert_eq!(file_contents, VALUE1); } @@ -723,11 +751,11 @@ async fn rename_on_insert_fails_due_to_filesystem_error_proper_cleanup_happens() let dir_entry = dir_entry?; { // Some filesystems won't sync automatically, so force it. - let file_handle = fs::open_file(dir_entry.path().into_os_string(), 0, u64::MAX) + let file_handle = fs::open_file(dir_entry.path().into_os_string(), 0) .await .err_tip(|| "Failed to open temp file")?; // We don't care if it fails, this is only best attempt. - drop(file_handle.get_ref().as_ref().sync_all().await); + drop(file_handle.as_std().sync_all()); } // Ensure we have written to the file too. This ensures we have an open file handle. 
// Failing to do this may result in the file existing, but the `update_fut` not actually @@ -983,7 +1011,7 @@ async fn update_whole_file_with_zero_digest() -> Result<(), Error> { let temp_file_path = Path::new(&temp_file_dir).join("zero-length-file"); std::fs::write(&temp_file_path, b"") .err_tip(|| format!("Writing to {temp_file_path:?}"))?; - let file_slot = fs::open_file(&temp_file_path, 0, 0).await?.into_inner(); + let file_slot = fs::open_file(&temp_file_path, 0).await?; store .update_with_whole_file( digest, @@ -1244,9 +1272,13 @@ async fn update_with_whole_file_closes_file() -> Result<(), Error> { let file_path = OsString::from(format!("{temp_path}/dummy_file")); let mut file = fs::create_file(&file_path).await?; { - file.write_all(value.as_bytes()).await?; - file.as_mut().sync_all().await?; - file.seek(tokio::io::SeekFrom::Start(0)).await?; + use std::io::{Seek, Write}; + file.as_std_mut().write_all(value.as_bytes()) + .err_tip(|| "Could not write to file")?; + file.as_std().sync_all() + .err_tip(|| "Could not sync file")?; + file.as_std_mut().seek(std::io::SeekFrom::Start(0)) + .err_tip(|| "Could not seek file")?; } store @@ -1288,7 +1320,8 @@ async fn update_with_whole_file_uses_same_inode() -> Result<(), Error> { let file_path = OsString::from(format!("{temp_path}/dummy_file")); let original_inode = { let file = fs::create_file(&file_path).await?; - let original_inode = file.as_ref().metadata().await?.ino(); + let original_inode = file.as_std().metadata() + .err_tip(|| "Could not get metadata")?.ino(); let result = store .update_with_whole_file( @@ -1305,14 +1338,8 @@ async fn update_with_whole_file_uses_same_inode() -> Result<(), Error> { original_inode }; - let expected_file_name = OsString::from(format!("{content_path}/{DIGEST_FOLDER}/{digest}")); - let new_inode = fs::create_file(expected_file_name) - .await - .unwrap() - .as_ref() - .metadata() - .await? 
- .ino(); + let expected_file_name = format!("{content_path}/{DIGEST_FOLDER}/{digest}"); + let new_inode = tokio::fs::metadata(&expected_file_name).await?.ino(); assert_eq!( original_inode, new_inode, "Expected the same inode for the file" @@ -1457,6 +1484,7 @@ async fn safe_small_safe_eviction() -> Result<(), Error> { messages: vec![format!( "{VALID_HASH}-{bytes} not found in filesystem store here" )], + details: vec![], }), "Expected data to not exist in store, because eviction" ); diff --git a/nativelink-store/tests/redis_store_test.rs b/nativelink-store/tests/redis_store_test.rs index 4d558b416..12cf8cb1c 100644 --- a/nativelink-store/tests/redis_store_test.rs +++ b/nativelink-store/tests/redis_store_test.rs @@ -639,7 +639,8 @@ fn test_connection_errors() { messages: vec![ "deadline has elapsed".into(), format!("While connecting to redis with url: redis://nativelink.com:6379/") - ] + ], + details: vec![], }, err ); @@ -738,7 +739,8 @@ async fn test_sentinel_connect_with_bad_master() { messages: vec![ "MasterNameNotFoundBySentinel: Master with given name not found in sentinel - MasterNameNotFoundBySentinel".into(), format!("While connecting to redis with url: redis+sentinel://127.0.0.1:{port}/") - ] + ], + details: vec![], }, RedisStore::new_standard(spec).await.unwrap_err() ); @@ -778,7 +780,8 @@ async fn test_redis_connect_timeout() { messages: vec![ "deadline has elapsed".into(), format!("While connecting to redis with url: redis://127.0.0.1:{port}/") - ] + ], + details: vec![], }, RedisStore::new_standard(spec).await.unwrap_err() ); diff --git a/nativelink-store/tests/shard_store_test.rs b/nativelink-store/tests/shard_store_test.rs index f8753849a..ac6b22988 100644 --- a/nativelink-store/tests/shard_store_test.rs +++ b/nativelink-store/tests/shard_store_test.rs @@ -81,7 +81,7 @@ async fn verify_weights( } for (index, (store, expected_hit)) in stores.iter().zip(expected_hits.iter()).enumerate() { - let total_hits = store.len_for_test(); + let total_hits = store.len_for_test().await; #[expect(clippy::print_stdout, reason = "improves debugging")] if print_results { println!("expected_hit: {expected_hit} - total_hits: {total_hits}"); diff --git a/nativelink-store/tests/worker_proxy_store_test.rs b/nativelink-store/tests/worker_proxy_store_test.rs new file mode 100644 index 000000000..641b335f0 --- /dev/null +++ b/nativelink-store/tests/worker_proxy_store_test.rs @@ -0,0 +1,839 @@ +// Copyright 2024 The NativeLink Authors. All rights reserved. +// +// Licensed under the Functional Source License, Version 1.1, Apache 2.0 Future License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// See LICENSE file for details +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use core::pin::Pin; +use std::sync::Arc; + +use async_trait::async_trait; +use bytes::Bytes; +use nativelink_config::stores::MemorySpec; +use nativelink_error::{Code, Error, make_err}; +use nativelink_macro::nativelink_test; +use nativelink_metric::MetricsComponent; +use nativelink_store::memory_store::MemoryStore; +use nativelink_store::worker_proxy_store::WorkerProxyStore; +use nativelink_util::blob_locality_map::{SharedBlobLocalityMap, new_shared_blob_locality_map}; +use nativelink_util::buf_channel::{DropCloserReadHalf, DropCloserWriteHalf}; +use nativelink_util::common::DigestInfo; +use nativelink_util::health_utils::{HealthStatusIndicator, default_health_status_indicator}; +use nativelink_util::store_trait::{ + IS_WORKER_REQUEST, ItemCallback, REDIRECT_PREFIX, Store, StoreDriver, StoreKey, StoreLike, + StoreOptimizations, UploadSizeInfo, +}; +use pretty_assertions::assert_eq; + +const VALID_HASH1: &str = "0123456789abcdef000000000000000000010000000000000123456789abcdef"; +const VALID_HASH2: &str = "0123456789abcdef000000000000000000020000000000000123456789abcdef"; +const VALID_HASH3: &str = "0123456789abcdef000000000000000000030000000000000123456789abcdef"; + +/// Helper: create a WorkerProxyStore backed by a fresh MemoryStore. +/// Returns (proxy_store_as_Store, inner_memory_store, locality_map). +fn make_proxy_store() -> (Store, Store, SharedBlobLocalityMap) { + let inner = Store::new(MemoryStore::new(&MemorySpec::default())); + let locality_map = new_shared_blob_locality_map(); + let proxy = WorkerProxyStore::new(inner.clone(), locality_map.clone()); + (Store::new(proxy), inner, locality_map) +} + +// ------------------------------------------------------------------- +// 1. get_part delegates to inner store on hit +// ------------------------------------------------------------------- +#[nativelink_test] +async fn get_part_returns_data_from_inner_store_on_hit() -> Result<(), Error> { + let (proxy, _inner, locality_map) = make_proxy_store(); + + let value = b"hello from inner store"; + let digest = DigestInfo::try_new(VALID_HASH1, value.len() as u64)?; + + // Write directly through the proxy (which delegates update to inner). + proxy + .update_oneshot(digest, Bytes::from_static(value)) + .await?; + + // Register a fake worker in the locality map. If get_part were to + // consult it, it would try to connect and potentially fail or return + // different data. We verify the inner store data is returned instead. + locality_map + .write() + .register_blobs("fake-worker:9999", &[digest]); + + let result = proxy.get_part_unchunked(digest, 0, None).await?; + assert_eq!( + result.as_ref(), + value, + "Expected data from inner store, not from worker" + ); + + Ok(()) +} + +// ------------------------------------------------------------------- +// 2. get_part returns NotFound when inner misses and no peers +// ------------------------------------------------------------------- +#[nativelink_test] +async fn get_part_returns_not_found_when_inner_misses_and_no_peers() -> Result<(), Error> { + let (proxy, _inner, _locality_map) = make_proxy_store(); + + let digest = DigestInfo::try_new(VALID_HASH1, 42)?; + + let result = proxy.get_part_unchunked(digest, 0, None).await; + assert!(result.is_err(), "Expected an error for missing blob"); + + let err = result.unwrap_err(); + assert_eq!( + err.code, + Code::NotFound, + "Expected NotFound error code, got: {err:?}" + ); + + Ok(()) +} + +// ------------------------------------------------------------------- +// 3. 
has delegates to inner store (returns Some on hit)
+// -------------------------------------------------------------------
+#[nativelink_test]
+async fn has_returns_size_when_inner_has_blob() -> Result<(), Error> {
+    let (proxy, _inner, _locality_map) = make_proxy_store();
+
+    let value = b"test data for has";
+    let digest = DigestInfo::try_new(VALID_HASH1, value.len() as u64)?;
+
+    proxy
+        .update_oneshot(digest, Bytes::from_static(value))
+        .await?;
+
+    let size = proxy.has(digest).await?;
+    assert_eq!(
+        size,
+        Some(value.len() as u64),
+        "has() should return the blob size from inner store"
+    );
+
+    Ok(())
+}
+
+// -------------------------------------------------------------------
+// 4. has returns None when inner does not have blob
+//    (locality map is never consulted for has)
+// -------------------------------------------------------------------
+#[nativelink_test]
+async fn has_returns_none_when_inner_missing_even_if_locality_has_peers() -> Result<(), Error> {
+    let (proxy, _inner, locality_map) = make_proxy_store();
+
+    let digest = DigestInfo::try_new(VALID_HASH1, 100)?;
+
+    // Register the digest on a worker endpoint.
+    locality_map
+        .write()
+        .register_blobs("worker-a:50081", &[digest]);
+
+    // has() must NOT consult the locality map.
+    let size = proxy.has(digest).await?;
+    assert_eq!(
+        size, None,
+        "has() should return None even though locality map has the digest"
+    );
+
+    Ok(())
+}
+
+// -------------------------------------------------------------------
+// 5. has_with_results delegates to inner store (pass-through)
+// -------------------------------------------------------------------
+#[nativelink_test]
+async fn has_with_results_delegates_to_inner_store() -> Result<(), Error> {
+    let (proxy, _inner, locality_map) = make_proxy_store();
+
+    let value = b"test data";
+    let d1 = DigestInfo::try_new(VALID_HASH1, value.len() as u64)?;
+    let d2 = DigestInfo::try_new(VALID_HASH2, 999)?;
+    let d3 = DigestInfo::try_new(VALID_HASH3, 50)?;
+
+    // Only d1 is in the inner store.
+    proxy
+        .update_oneshot(d1, Bytes::from_static(value))
+        .await?;
+
+    // Register d2 and d3 on workers — should NOT affect has_with_results.
+    {
+        let mut map = locality_map.write();
+        map.register_blobs("worker-a:50081", &[d2]);
+        map.register_blobs("worker-b:50081", &[d3]);
+    }
+
+    let keys: Vec<StoreKey> = vec![d1.into(), d2.into(), d3.into()];
+    let mut results = vec![None; 3];
+    proxy.has_with_results(&keys, &mut results).await?;
+
+    assert_eq!(
+        results[0],
+        Some(value.len() as u64),
+        "d1 should be found in inner store"
+    );
+    assert_eq!(
+        results[1], None,
+        "d2 should NOT be found — has_with_results must not consult locality map"
+    );
+    assert_eq!(
+        results[2], None,
+        "d3 should NOT be found — has_with_results must not consult locality map"
+    );
+
+    Ok(())
+}
+
+// -------------------------------------------------------------------
+// 6. has_with_results on empty digest list succeeds
+// -------------------------------------------------------------------
+#[nativelink_test]
+async fn has_with_results_empty_digests_succeeds() -> Result<(), Error> {
+    let (proxy, _inner, _locality_map) = make_proxy_store();
+
+    let keys: Vec<StoreKey> = vec![];
+    let mut results: Vec<Option<u64>> = vec![];
+    proxy.has_with_results(&keys, &mut results).await?;
+
+    // No assertions needed beyond not panicking.
+    Ok(())
+}
+
+// -------------------------------------------------------------------
+// 7.
update_oneshot delegates to inner store +// ------------------------------------------------------------------- +#[nativelink_test] +async fn update_oneshot_stores_in_inner() -> Result<(), Error> { + let (proxy, inner, _locality_map) = make_proxy_store(); + + let value = b"upload via proxy"; + let digest = DigestInfo::try_new(VALID_HASH1, value.len() as u64)?; + + proxy + .update_oneshot(digest, Bytes::from_static(value)) + .await?; + + // Verify the blob landed in the inner store directly. + let inner_data = inner.get_part_unchunked(digest, 0, None).await?; + assert_eq!( + inner_data.as_ref(), + value, + "Data should be present in the inner store after update_oneshot" + ); + + Ok(()) +} + +// ------------------------------------------------------------------- +// 8. get_part with offset and length on inner hit +// ------------------------------------------------------------------- +#[nativelink_test] +async fn get_part_with_offset_and_length_from_inner() -> Result<(), Error> { + let (proxy, _inner, _locality_map) = make_proxy_store(); + + let value = b"0123456789abcdefghij"; // 20 bytes + let digest = DigestInfo::try_new(VALID_HASH1, value.len() as u64)?; + + proxy + .update_oneshot(digest, Bytes::from_static(value)) + .await?; + + // Read bytes [5..15) — 10 bytes at offset 5. + let data = proxy.get_part_unchunked(digest, 5, Some(10)).await?; + assert_eq!( + data.as_ref(), + b"56789abcde", + "Expected subset at offset=5, length=10" + ); + + // Read from offset 15 to end. + let data = proxy.get_part_unchunked(digest, 15, None).await?; + assert_eq!(data.as_ref(), b"fghij", "Expected tail from offset=15"); + + // Read 0 bytes. + let data = proxy.get_part_unchunked(digest, 0, Some(0)).await?; + assert_eq!(data.as_ref(), b"", "Expected empty result for length=0"); + + Ok(()) +} + +// ------------------------------------------------------------------- +// 9. Inner miss + locality has peers for a DIFFERENT digest +// => the queried digest is still NotFound (locality map miss) +// ------------------------------------------------------------------- +#[nativelink_test] +async fn get_part_inner_miss_locality_has_different_digest_returns_not_found() -> Result<(), Error> { + let (proxy, _inner, locality_map) = make_proxy_store(); + + let d1 = DigestInfo::try_new(VALID_HASH1, 100)?; + let d2 = DigestInfo::try_new(VALID_HASH2, 200)?; + + // Register d2 on a worker, but NOT d1. + locality_map + .write() + .register_blobs("worker-a:50081", &[d2]); + + // Query d1 — not in inner store, not in locality map. + let result = proxy.get_part_unchunked(d1, 0, None).await; + assert!(result.is_err(), "Expected NotFound for d1"); + + let err = result.unwrap_err(); + assert_eq!( + err.code, + Code::NotFound, + "Expected NotFound since d1 has no locality entries, got: {err:?}" + ); + + Ok(()) +} + +// ------------------------------------------------------------------- +// 10. Locality map returns empty workers list after eviction +// => NotFound (no peers to try) +// ------------------------------------------------------------------- +#[nativelink_test] +async fn get_part_inner_miss_locality_evicted_returns_not_found() -> Result<(), Error> { + let (proxy, _inner, locality_map) = make_proxy_store(); + + let digest = DigestInfo::try_new(VALID_HASH1, 100)?; + + // Register then evict the digest. + { + let mut map = locality_map.write(); + map.register_blobs("worker-a:50081", &[digest]); + map.evict_blobs("worker-a:50081", &[digest]); + } + + // Now there are no workers for this digest. 
+    let result = proxy.get_part_unchunked(digest, 0, None).await;
+    assert!(result.is_err(), "Expected NotFound after eviction");
+
+    let err = result.unwrap_err();
+    assert_eq!(
+        err.code,
+        Code::NotFound,
+        "Expected NotFound since locality was evicted, got: {err:?}"
+    );
+
+    Ok(())
+}
+
+// -------------------------------------------------------------------
+// 11. update followed by get_part roundtrip
+// -------------------------------------------------------------------
+#[nativelink_test]
+async fn update_then_get_roundtrip() -> Result<(), Error> {
+    let (proxy, _inner, _locality_map) = make_proxy_store();
+
+    let value = b"roundtrip data payload";
+    let digest = DigestInfo::try_new(VALID_HASH1, value.len() as u64)?;
+
+    // Upload via proxy.
+    proxy
+        .update_oneshot(digest, Bytes::from_static(value))
+        .await?;
+
+    // Verify has() works.
+    let size = proxy.has(digest).await?;
+    assert_eq!(size, Some(value.len() as u64));
+
+    // Verify get_part returns the correct data.
+    let data = proxy.get_part_unchunked(digest, 0, None).await?;
+    assert_eq!(data.as_ref(), value);
+
+    Ok(())
+}
+
+// -------------------------------------------------------------------
+// 12. Multiple blobs: has_with_results shows correct presence
+// -------------------------------------------------------------------
+#[nativelink_test]
+async fn has_with_results_multiple_blobs_mixed() -> Result<(), Error> {
+    let (proxy, _inner, _locality_map) = make_proxy_store();
+
+    let v1 = b"first blob";
+    let v3 = b"third blob";
+    let d1 = DigestInfo::try_new(VALID_HASH1, v1.len() as u64)?;
+    let d2 = DigestInfo::try_new(VALID_HASH2, 999)?; // not stored
+    let d3 = DigestInfo::try_new(VALID_HASH3, v3.len() as u64)?;
+
+    proxy
+        .update_oneshot(d1, Bytes::from_static(v1))
+        .await?;
+    proxy
+        .update_oneshot(d3, Bytes::from_static(v3))
+        .await?;
+
+    let keys: Vec<StoreKey> = vec![d1.into(), d2.into(), d3.into()];
+    let mut results = vec![None; 3];
+    proxy.has_with_results(&keys, &mut results).await?;
+
+    assert_eq!(results[0], Some(v1.len() as u64), "d1 should be found");
+    assert_eq!(results[1], None, "d2 should not be found");
+    assert_eq!(results[2], Some(v3.len() as u64), "d3 should be found");
+
+    Ok(())
+}
+
+// -------------------------------------------------------------------
+// 13. get_part for a blob that was never stored and has no locality
+//     entries returns NotFound (different digest, not in map at all)
+// -------------------------------------------------------------------
+#[nativelink_test]
+async fn get_part_completely_unknown_digest_returns_not_found() -> Result<(), Error> {
+    let (proxy, _inner, locality_map) = make_proxy_store();
+
+    // Register a DIFFERENT digest on a worker (not the one we query).
+    let other_digest = DigestInfo::try_new(VALID_HASH2, 50)?;
+    locality_map
+        .write()
+        .register_blobs("worker-x:50081", &[other_digest]);
+
+    // Query a digest that is not in the inner store and not in the
+    // locality map at all.
+    let query_digest = DigestInfo::try_new(VALID_HASH1, 100)?;
+    let result = proxy.get_part_unchunked(query_digest, 0, None).await;
+
+    assert!(result.is_err());
+    assert_eq!(result.unwrap_err().code, Code::NotFound);
+
+    Ok(())
+}
+
+// -------------------------------------------------------------------
+// 14. Overwrite a blob via update and verify new data is returned
+// -------------------------------------------------------------------
+#[nativelink_test]
+async fn update_overwrites_existing_blob() -> Result<(), Error> {
+    let (proxy, _inner, _locality_map) = make_proxy_store();
+
+    let digest = DigestInfo::try_new(VALID_HASH1, 5)?;
+
+    proxy
+        .update_oneshot(digest, Bytes::from_static(b"first"))
+        .await?;
+
+    let data = proxy.get_part_unchunked(digest, 0, None).await?;
+    assert_eq!(data.as_ref(), b"first");
+
+    // Overwrite with new data (same digest key, different content for
+    // MemoryStore which doesn't validate content hash).
+    proxy
+        .update_oneshot(digest, Bytes::from_static(b"secnd"))
+        .await?;
+
+    let data = proxy.get_part_unchunked(digest, 0, None).await?;
+    assert_eq!(data.as_ref(), b"secnd");
+
+    Ok(())
+}
+
+// -------------------------------------------------------------------
+// 15. Non-NotFound errors from inner store propagate directly
+//     (no locality map fallback)
+// -------------------------------------------------------------------
+// Note: This is difficult to test without a custom mock store that
+// returns a non-NotFound error. The inline tests cover this via the
+// match arm in get_part(). We verify the pattern indirectly: a
+// successful inner read never consults the locality map (test 1),
+// and NotFound triggers the locality path (tests 2, 9, 10).
+
+// -------------------------------------------------------------------
+// 16. Large blob roundtrip through the proxy
+// -------------------------------------------------------------------
+#[nativelink_test]
+async fn large_blob_roundtrip() -> Result<(), Error> {
+    let (proxy, _inner, _locality_map) = make_proxy_store();
+
+    // 1 MiB of repeated bytes
+    let size: usize = 1024 * 1024;
+    let value: Vec<u8> = (0..size).map(|i| (i % 256) as u8).collect();
+    let digest = DigestInfo::try_new(VALID_HASH1, size as u64)?;
+
+    proxy
+        .update_oneshot(digest, Bytes::from(value.clone()))
+        .await?;
+
+    let data = proxy.get_part_unchunked(digest, 0, None).await?;
+    assert_eq!(data.len(), size, "Returned blob size should match");
+    assert_eq!(data.as_ref(), value.as_slice());
+
+    Ok(())
+}
+
+// ===================================================================
+// Gap 1: Successful peer proxy read — inject a MemoryStore as a peer
+// ===================================================================
+
+/// Helper: create a WorkerProxyStore and return the underlying Arc so we
+/// can call inject_worker_connection().
+fn make_proxy_store_with_arc() -> (Arc<WorkerProxyStore>, Store, SharedBlobLocalityMap) {
+    let inner = Store::new(MemoryStore::new(&MemorySpec::default()));
+    let locality_map = new_shared_blob_locality_map();
+    let proxy_arc = WorkerProxyStore::new(inner.clone(), locality_map.clone());
+    (proxy_arc, inner, locality_map)
+}
+
+// -------------------------------------------------------------------
+// 17. Successful peer proxy read: inner miss, peer has the blob
+// -------------------------------------------------------------------
+#[nativelink_test]
+async fn get_part_proxies_from_injected_peer() -> Result<(), Error> {
+    let (proxy_arc, _inner, locality_map) = make_proxy_store_with_arc();
+    let proxy = Store::new(proxy_arc.clone());
+
+    let value = b"data from the peer worker";
+    let digest = DigestInfo::try_new(VALID_HASH1, value.len() as u64)?;
+
+    // Create a "peer" MemoryStore and populate it with the blob.
+ let peer_store = Store::new(MemoryStore::new(&MemorySpec::default())); + peer_store + .update_oneshot(digest, Bytes::from_static(value)) + .await?; + + // Inject the peer store as a worker connection. + let peer_endpoint = "grpc://peer-worker:50081"; + proxy_arc.inject_worker_connection(peer_endpoint, peer_store); + + // Register the digest on the peer in the locality map. + locality_map + .write() + .register_blobs(peer_endpoint, &[digest]); + + // The inner store is empty, so get_part should proxy from the peer. + let result = proxy.get_part_unchunked(digest, 0, None).await?; + assert_eq!( + result.as_ref(), + value, + "Expected blob data from the injected peer store" + ); + + Ok(()) +} + +// ------------------------------------------------------------------- +// 18. Peer proxy read with offset and length +// ------------------------------------------------------------------- +#[nativelink_test] +async fn get_part_proxies_from_peer_with_offset() -> Result<(), Error> { + let (proxy_arc, _inner, locality_map) = make_proxy_store_with_arc(); + let proxy = Store::new(proxy_arc.clone()); + + let value = b"0123456789abcdef"; // 16 bytes + let digest = DigestInfo::try_new(VALID_HASH1, value.len() as u64)?; + + let peer_store = Store::new(MemoryStore::new(&MemorySpec::default())); + peer_store + .update_oneshot(digest, Bytes::from_static(value)) + .await?; + + let peer_endpoint = "grpc://peer-worker:50081"; + proxy_arc.inject_worker_connection(peer_endpoint, peer_store); + locality_map + .write() + .register_blobs(peer_endpoint, &[digest]); + + // Read bytes [4..12) from the peer. + let result = proxy.get_part_unchunked(digest, 4, Some(8)).await?; + assert_eq!( + result.as_ref(), + b"456789ab", + "Expected subset from peer at offset=4, length=8" + ); + + Ok(()) +} + +// ------------------------------------------------------------------- +// 19. Peer proxy: first peer doesn't have blob, second peer does +// ------------------------------------------------------------------- +#[nativelink_test] +async fn get_part_skips_peer_without_blob_and_reads_from_next() -> Result<(), Error> { + let (proxy_arc, _inner, locality_map) = make_proxy_store_with_arc(); + let proxy = Store::new(proxy_arc.clone()); + + let value = b"only on peer-b"; + let digest = DigestInfo::try_new(VALID_HASH1, value.len() as u64)?; + + // Peer A: empty store (has() returns None). + let peer_a_store = Store::new(MemoryStore::new(&MemorySpec::default())); + let peer_a_endpoint = "grpc://peer-a:50081"; + proxy_arc.inject_worker_connection(peer_a_endpoint, peer_a_store); + + // Peer B: has the blob. + let peer_b_store = Store::new(MemoryStore::new(&MemorySpec::default())); + peer_b_store + .update_oneshot(digest, Bytes::from_static(value)) + .await?; + let peer_b_endpoint = "grpc://peer-b:50081"; + proxy_arc.inject_worker_connection(peer_b_endpoint, peer_b_store); + + // Register the digest on both peers. 
+    {
+        let mut map = locality_map.write();
+        map.register_blobs(peer_a_endpoint, &[digest]);
+        map.register_blobs(peer_b_endpoint, &[digest]);
+    }
+
+    let result = proxy.get_part_unchunked(digest, 0, None).await?;
+    assert_eq!(
+        result.as_ref(),
+        value,
+        "Expected data from peer-b after peer-a returned None for has()"
+    );
+
+    Ok(())
+}
+
+// ===================================================================
+// Gap 2: Resume-from-offset — PartialFailStore + next peer
+// ===================================================================
+
+/// A store wrapper that delegates to an inner store but fails `get_part`
+/// after writing a configured number of bytes. Used to test streaming
+/// resume logic in WorkerProxyStore.
+#[derive(Debug, MetricsComponent)]
+struct PartialFailStore {
+    inner: Store,
+    /// Number of bytes to successfully write before returning an error.
+    fail_after_bytes: u64,
+}
+
+default_health_status_indicator!(PartialFailStore);
+
+#[async_trait]
+impl StoreDriver for PartialFailStore {
+    async fn has_with_results(
+        self: Pin<&Self>,
+        digests: &[StoreKey<'_>],
+        results: &mut [Option<u64>],
+    ) -> Result<(), Error> {
+        self.inner.has_with_results(digests, results).await
+    }
+
+    async fn update(
+        self: Pin<&Self>,
+        key: StoreKey<'_>,
+        reader: DropCloserReadHalf,
+        upload_size: UploadSizeInfo,
+    ) -> Result<(), Error> {
+        self.inner.update(key, reader, upload_size).await
+    }
+
+    async fn get_part(
+        self: Pin<&Self>,
+        key: StoreKey<'_>,
+        writer: &mut DropCloserWriteHalf,
+        offset: u64,
+        length: Option<u64>,
+    ) -> Result<(), Error> {
+        // Read the full blob from the inner store.
+        let data = self.inner.get_part_unchunked(key.borrow(), offset, length).await?;
+
+        // Write up to `fail_after_bytes` bytes, then return an error.
+        let write_len = core::cmp::min(data.len() as u64, self.fail_after_bytes) as usize;
+        if write_len > 0 {
+            writer
+                .send(data.slice(..write_len))
+                .await
+                .map_err(|e| make_err!(Code::Internal, "PartialFailStore write error: {e:?}"))?;
+        }
+
+        Err(make_err!(
+            Code::Internal,
+            "PartialFailStore: simulated failure after {} bytes",
+            write_len
+        ))
+    }
+
+    fn inner_store(&self, _key: Option<StoreKey>) -> &dyn StoreDriver {
+        self
+    }
+
+    fn as_any<'a>(&'a self) -> &'a (dyn core::any::Any + Sync + Send + 'static) {
+        self
+    }
+
+    fn as_any_arc(self: Arc<Self>) -> Arc<dyn core::any::Any + Sync + Send + 'static> {
+        self
+    }
+
+    fn register_item_callback(
+        self: Arc<Self>,
+        _callback: Arc<dyn ItemCallback>,
+    ) -> Result<(), Error> {
+        Ok(())
+    }
+}
+
+// -------------------------------------------------------------------
+// 20. Resume from offset: first peer fails mid-stream, second succeeds
+// -------------------------------------------------------------------
+#[nativelink_test]
+async fn get_part_resumes_from_next_peer_after_mid_stream_failure() -> Result<(), Error> {
+    let (proxy_arc, _inner, locality_map) = make_proxy_store_with_arc();
+    let proxy = Store::new(proxy_arc.clone());
+
+    let value = b"0123456789abcdef"; // 16 bytes
+    let digest = DigestInfo::try_new(VALID_HASH1, value.len() as u64)?;
+
+    // Peer A: a PartialFailStore that writes 5 bytes then fails.
+    let peer_a_inner = Store::new(MemoryStore::new(&MemorySpec::default()));
+    peer_a_inner
+        .update_oneshot(digest, Bytes::from_static(value))
+        .await?;
+    let peer_a_store = Store::new(Arc::new(PartialFailStore {
+        inner: peer_a_inner,
+        fail_after_bytes: 5,
+    }));
+    let peer_a_endpoint = "grpc://peer-a:50081";
+    proxy_arc.inject_worker_connection(peer_a_endpoint, peer_a_store);
+
+    // Peer B: has the full blob (normal MemoryStore).
+ let peer_b_store = Store::new(MemoryStore::new(&MemorySpec::default())); + peer_b_store + .update_oneshot(digest, Bytes::from_static(value)) + .await?; + let peer_b_endpoint = "grpc://peer-b:50081"; + proxy_arc.inject_worker_connection(peer_b_endpoint, peer_b_store); + + // Register the digest on both peers. The order in the locality map + // determines which peer is tried first. We register A first. + { + let mut map = locality_map.write(); + map.register_blobs(peer_a_endpoint, &[digest]); + map.register_blobs(peer_b_endpoint, &[digest]); + } + + // The proxy should: try peer A, get 5 bytes, fail, then resume from + // peer B at offset 5. The final result should be the complete blob. + let result = proxy.get_part_unchunked(digest, 0, None).await?; + assert_eq!( + result.as_ref(), + value, + "Expected complete blob after resume from second peer" + ); + + Ok(()) +} + +// =================================================================== +// Gap 3: IS_WORKER_REQUEST branching tests +// =================================================================== + +// ------------------------------------------------------------------- +// 21. IS_WORKER_REQUEST=true: inner miss + locality has peer +// => FailedPrecondition redirect with peer endpoint +// ------------------------------------------------------------------- +#[nativelink_test] +async fn worker_request_returns_redirect_with_peer_endpoints() -> Result<(), Error> { + let (proxy, _inner, locality_map) = make_proxy_store(); + + let digest = DigestInfo::try_new(VALID_HASH1, 100)?; + let peer_endpoint = "grpc://peer-worker:50071"; + + locality_map + .write() + .register_blobs(peer_endpoint, &[digest]); + + let result = IS_WORKER_REQUEST + .scope(true, proxy.get_part_unchunked(digest, 0, None)) + .await; + + assert!(result.is_err(), "Expected redirect error for worker request"); + let err = result.unwrap_err(); + assert_eq!( + err.code, + Code::FailedPrecondition, + "Redirect should use FailedPrecondition, got: {err:?}" + ); + let msg = err.message_string(); + assert!( + msg.contains(REDIRECT_PREFIX), + "Error message should contain redirect prefix: {msg}" + ); + assert!( + msg.contains(peer_endpoint), + "Error message should contain peer endpoint: {msg}" + ); + + Ok(()) +} + +// ------------------------------------------------------------------- +// 22. IS_WORKER_REQUEST=false: inner miss + locality has peer with +// invalid URI => NotFound (proxy attempt fails gracefully) +// ------------------------------------------------------------------- +#[nativelink_test] +async fn non_worker_request_returns_not_found_when_peer_unreachable() -> Result<(), Error> { + let (proxy, _inner, locality_map) = make_proxy_store(); + + let digest = DigestInfo::try_new(VALID_HASH1, 100)?; + + // Invalid URI fails during create_worker_connection. + locality_map + .write() + .register_blobs("not a valid uri", &[digest]); + + let result = IS_WORKER_REQUEST + .scope(false, proxy.get_part_unchunked(digest, 0, None)) + .await; + + assert!(result.is_err(), "Expected NotFound error"); + let err = result.unwrap_err(); + assert_eq!( + err.code, + Code::NotFound, + "Non-worker request should get NotFound, got: {err:?}" + ); + + Ok(()) +} + +// =================================================================== +// Gap 4: optimized_for tests +// =================================================================== + +// ------------------------------------------------------------------- +// 23. 
optimized_for(LazyExistenceOnSync) returns true +// ------------------------------------------------------------------- +#[nativelink_test] +async fn optimized_for_lazy_existence_returns_true() -> Result<(), Error> { + let inner = Store::new(MemoryStore::new(&MemorySpec::default())); + let locality_map = new_shared_blob_locality_map(); + let proxy = WorkerProxyStore::new(inner, locality_map); + + assert!( + StoreDriver::optimized_for(&*proxy, StoreOptimizations::LazyExistenceOnSync), + "WorkerProxyStore should report LazyExistenceOnSync" + ); + + Ok(()) +} + +// ------------------------------------------------------------------- +// 24. optimized_for(other) delegates to inner store +// ------------------------------------------------------------------- +#[nativelink_test] +async fn optimized_for_other_delegates_to_inner() -> Result<(), Error> { + let inner = Store::new(MemoryStore::new(&MemorySpec::default())); + let locality_map = new_shared_blob_locality_map(); + let proxy = WorkerProxyStore::new(inner, locality_map); + + assert!( + !StoreDriver::optimized_for(&*proxy, StoreOptimizations::NoopUpdates), + "Should delegate non-LazyExistence optimizations to inner store" + ); + + Ok(()) +} diff --git a/nativelink-util/Cargo.toml b/nativelink-util/Cargo.toml index 7001cd075..12566b090 100644 --- a/nativelink-util/Cargo.toml +++ b/nativelink-util/Cargo.toml @@ -15,7 +15,7 @@ nativelink-proto = { path = "../nativelink-proto" } async-trait = { version = "0.1.88", default-features = false } base64 = { version = "0.22.1", default-features = false, features = ["std"] } bitflags = { version = "2.9.0", default-features = false } -blake3 = { version = "1.8.0", features = ["mmap"], default-features = false } +blake3 = { version = "1.8.0", features = ["mmap", "rayon"], default-features = false } bytes = { version = "1.10.1", default-features = false } futures = { version = "0.3.31", features = [ "async-await", @@ -27,37 +27,38 @@ hyper-util = { version = "0.1.11", default-features = false } libc = { version = "0.2.177", default-features = false } lru = { version = "0.16.0", default-features = false } mock_instant = { version = "0.5.3", default-features = false } -opentelemetry = { version = "0.29.0", default-features = false } -opentelemetry-appender-tracing = { version = "0.29.1", default-features = false } -opentelemetry-http = { version = "0.29.0", default-features = false } -opentelemetry-otlp = { version = "0.29.0", default-features = false, features = [ +opentelemetry = { version = "0.31.0", default-features = false } +opentelemetry-appender-tracing = { version = "0.31.1", default-features = false } +opentelemetry-http = { version = "0.31.0", default-features = false } +opentelemetry-otlp = { version = "0.31.0", default-features = false, features = [ "grpc-tonic", "logs", "metrics", "trace", "zstd-tonic", ] } -opentelemetry-semantic-conventions = { version = "0.29.0", default-features = false, features = [ +opentelemetry-semantic-conventions = { version = "0.31.0", default-features = false, features = [ "default", "semconv_experimental", ] } -opentelemetry_sdk = { version = "0.29.0", default-features = false } +opentelemetry_sdk = { version = "0.31.0", default-features = false } parking_lot = { version = "0.12.3", features = [ "arc_lock", "send_guard", ], default-features = false } pin-project = { version = "1.1.10", default-features = false } pin-project-lite = { version = "0.2.16", default-features = false } -prost = { version = "0.13.5", default-features = false } -prost-types = { version = 
"0.13.5", default-features = false, features = [ +prost = { version = "0.14.3", default-features = false } +prost-types = { version = "0.14.3", default-features = false, features = [ "std", ] } rand = { version = "0.9.0", default-features = false, features = [ "thread_rng", ] } +rayon = { version = "1.10.0", default-features = false } rlimit = { version = "0.10.2", default-features = false } serde = { version = "1.0.219", default-features = false } -sha2 = { version = "0.10.8", default-features = false } +sha2 = { version = "0.10.8", default-features = false, features = ["asm"] } tempfile = { version = "3.20.0", default-features = false } tokio = { version = "1.44.1", features = [ "fs", @@ -69,14 +70,14 @@ tokio-stream = { version = "0.1.17", features = [ "fs", ], default-features = false } tokio-util = { version = "0.7.14", default-features = false } -tonic = { version = "0.13.0", features = [ +tonic = { version = "0.14.5", features = [ "tls-native-roots", - "tls-ring", + "tls-aws-lc", "transport", ], default-features = false } tower = { version = "0.5.2", default-features = false } tracing = { version = "0.1.41", default-features = false } -tracing-opentelemetry = { version = "0.30.0", default-features = false, features = [ +tracing-opentelemetry = { version = "0.32.1", default-features = false, features = [ "metrics", ] } tracing-subscriber = { version = "0.3.19", features = [ diff --git a/nativelink-util/src/blob_locality_map.rs b/nativelink-util/src/blob_locality_map.rs new file mode 100644 index 000000000..16a28a454 --- /dev/null +++ b/nativelink-util/src/blob_locality_map.rs @@ -0,0 +1,483 @@ +// Copyright 2024 The NativeLink Authors. All rights reserved. +// +// Licensed under the Functional Source License, Version 1.1, Apache 2.0 Future License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// See LICENSE file for details +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::{HashMap, HashSet}; +use std::sync::Arc; +use std::time::SystemTime; + +use crate::common::DigestInfo; +use parking_lot::RwLock; + +/// Tracks which worker endpoints have which blobs, enabling peer-to-peer +/// blob fetching between workers. +/// +/// The map is bidirectional: +/// - `blobs`: digest → { endpoint → last_registered_timestamp } +/// - `endpoint_blobs`: endpoint → set of digests (for fast cleanup on disconnect) +/// +/// Cleanup relies entirely on explicit eviction notifications and worker +/// disconnect (no TTL — EvictingMap's `max_seconds_since_last_access` defaults +/// to unlimited). +#[derive(Debug)] +pub struct BlobLocalityMap { + /// digest → { endpoint → timestamp } + blobs: HashMap, SystemTime>>, + /// endpoint → set of digests (for fast cleanup on disconnect) + endpoint_blobs: HashMap, HashSet>, +} + +impl BlobLocalityMap { + pub fn new() -> Self { + Self { + blobs: HashMap::new(), + endpoint_blobs: HashMap::new(), + } + } + + /// Register that the given digests are available on the given endpoint. 
+    /// Register that the given digests are available on the given endpoint.
+    pub fn register_blobs(&mut self, endpoint: &str, digests: &[DigestInfo]) {
+        let now = SystemTime::now();
+        self.register_blobs_with_timestamps(
+            endpoint,
+            &digests.iter().map(|d| (*d, now)).collect::<Vec<_>>(),
+        );
+    }
+
+    /// Register digests with explicit timestamps (e.g. from BlobDigestInfo).
+    pub fn register_blobs_with_timestamps(
+        &mut self,
+        endpoint: &str,
+        digests_with_ts: &[(DigestInfo, SystemTime)],
+    ) {
+        // Allocate the endpoint Arc once; clones are O(1) atomic increments
+        // instead of O(N) String allocations per digest.
+        let ep: Arc<str> = endpoint.into();
+        let digest_set = self
+            .endpoint_blobs
+            .entry(ep.clone())
+            .or_default();
+
+        for (digest, ts) in digests_with_ts {
+            digest_set.insert(*digest);
+            self.blobs
+                .entry(*digest)
+                .or_default()
+                .insert(ep.clone(), *ts);
+        }
+    }
+
+    /// Remove specific digests from the given endpoint (eviction notification).
+    pub fn evict_blobs(&mut self, endpoint: &str, digests: &[DigestInfo]) {
+        if let Some(digest_set) = self.endpoint_blobs.get_mut(endpoint) {
+            for digest in digests {
+                digest_set.remove(digest);
+                if let Some(endpoints) = self.blobs.get_mut(digest) {
+                    endpoints.remove(endpoint);
+                    if endpoints.is_empty() {
+                        self.blobs.remove(digest);
+                    }
+                }
+            }
+            if digest_set.is_empty() {
+                self.endpoint_blobs.remove(endpoint);
+            }
+        }
+    }
+
+    /// Remove ALL entries for an endpoint (worker disconnect).
+    pub fn remove_endpoint(&mut self, endpoint: &str) {
+        if let Some(digests) = self.endpoint_blobs.remove(endpoint) {
+            for digest in &digests {
+                if let Some(endpoints) = self.blobs.get_mut(digest) {
+                    endpoints.remove(endpoint);
+                    if endpoints.is_empty() {
+                        self.blobs.remove(digest);
+                    }
+                }
+            }
+        }
+    }
+
+    /// Look up which worker endpoints have the given digest.
+    /// Returns all endpoints that have registered this digest.
+    ///
+    /// Workers refresh their timestamps on every BlobsAvailable update
+    /// (typically every ~500ms), so stale entries are only possible if
+    /// a worker disconnects without cleanup. Disconnects are handled
+    /// via `remove_endpoint`, so we can simply return all endpoints.
+    pub fn lookup_workers(&self, digest: &DigestInfo) -> Vec<Arc<str>> {
+        let Some(endpoints) = self.blobs.get(digest) else {
+            return Vec::new();
+        };
+
+        endpoints.keys().cloned().collect()
+    }
+
+    /// Look up which worker endpoints have the given digest, including the
+    /// timestamp of when the blob was last registered/refreshed on each endpoint.
+    /// Useful for preferring workers with more recently-refreshed locality data.
+    pub fn lookup_workers_with_timestamps(&self, digest: &DigestInfo) -> Vec<(Arc<str>, SystemTime)> {
+        let Some(endpoints) = self.blobs.get(digest) else {
+            return Vec::new();
+        };
+
+        endpoints
+            .iter()
+            .map(|(endpoint, ts)| (endpoint.clone(), *ts))
+            .collect()
+    }
+
+    /// Returns the set of all known endpoints.
+    pub fn all_endpoints(&self) -> Vec<Arc<str>> {
+        self.endpoint_blobs.keys().cloned().collect()
+    }
+
+    /// Returns the number of tracked digests.
+    pub fn digest_count(&self) -> usize {
+        self.blobs.len()
+    }
+
+    /// Returns the number of tracked endpoints.
+    pub fn endpoint_count(&self) -> usize {
+        self.endpoint_blobs.len()
+    }
+
+    /// Raw access to the blobs map for bulk scoring.
+    /// Caller must hold the read lock.
+    pub fn blobs_map(&self) -> &HashMap<DigestInfo, HashMap<Arc<str>, SystemTime>> {
+        &self.blobs
+    }
+}
+
+impl Default for BlobLocalityMap {
+    fn default() -> Self {
+        Self::new()
+    }
+}
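+/// Illustrative usage sketch (not part of the patch; the endpoint string is
+/// made up): register, look up, then clean up on worker disconnect.
+#[allow(dead_code)]
+fn example_locality_flow() {
+    let mut map = BlobLocalityMap::new();
+    let digest = DigestInfo::new([0u8; 32], 64);
+
+    map.register_blobs("grpc://worker-a:50081", &[digest]);
+    assert_eq!(map.lookup_workers(&digest).len(), 1);
+
+    // A disconnect drops every entry for that endpoint in one call.
+    map.remove_endpoint("grpc://worker-a:50081");
+    assert!(map.lookup_workers(&digest).is_empty());
+}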
+/// Thread-safe shared handle to a `BlobLocalityMap`.
+pub type SharedBlobLocalityMap = Arc<RwLock<BlobLocalityMap>>;
+
+/// Create a new shared blob locality map.
+pub fn new_shared_blob_locality_map() -> SharedBlobLocalityMap {
+    Arc::new(RwLock::new(BlobLocalityMap::new()))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_register_and_lookup() {
+        let mut map = BlobLocalityMap::new();
+        let d1 = DigestInfo::new([1u8; 32], 100);
+        let d2 = DigestInfo::new([2u8; 32], 200);
+
+        map.register_blobs("worker-a:50081", &[d1, d2]);
+        map.register_blobs("worker-b:50081", &[d1]);
+
+        let workers = map.lookup_workers(&d1);
+        assert_eq!(workers.len(), 2);
+        assert!(workers.contains(&Arc::from("worker-a:50081")));
+        assert!(workers.contains(&Arc::from("worker-b:50081")));
+
+        let workers = map.lookup_workers(&d2);
+        assert_eq!(workers.len(), 1);
+        assert!(workers.contains(&Arc::from("worker-a:50081")));
+    }
+
+    #[test]
+    fn test_evict_blobs() {
+        let mut map = BlobLocalityMap::new();
+        let d1 = DigestInfo::new([1u8; 32], 100);
+        let d2 = DigestInfo::new([2u8; 32], 200);
+
+        map.register_blobs("worker-a:50081", &[d1, d2]);
+        map.evict_blobs("worker-a:50081", &[d1]);
+
+        assert!(map.lookup_workers(&d1).is_empty());
+        assert_eq!(map.lookup_workers(&d2).len(), 1);
+    }
+
+    #[test]
+    fn test_remove_endpoint() {
+        let mut map = BlobLocalityMap::new();
+        let d1 = DigestInfo::new([1u8; 32], 100);
+        let d2 = DigestInfo::new([2u8; 32], 200);
+
+        map.register_blobs("worker-a:50081", &[d1, d2]);
+        map.register_blobs("worker-b:50081", &[d1]);
+
+        map.remove_endpoint("worker-a:50081");
+
+        // d1 still available on worker-b
+        let workers = map.lookup_workers(&d1);
+        assert_eq!(workers.len(), 1);
+        assert!(workers.contains(&Arc::from("worker-b:50081")));
+
+        // d2 no longer available anywhere
+        assert!(map.lookup_workers(&d2).is_empty());
+    }
+
+    #[test]
+    fn test_lookup_unknown_digest() {
+        let map = BlobLocalityMap::new();
+        let d1 = DigestInfo::new([1u8; 32], 100);
+        assert!(map.lookup_workers(&d1).is_empty());
+    }
+
+    #[test]
+    fn test_blobs_map_accessor() {
+        let mut map = BlobLocalityMap::new();
+        let d1 = DigestInfo::new([1u8; 32], 100);
+        let d2 = DigestInfo::new([2u8; 32], 200);
+
+        map.register_blobs("worker-a:50081", &[d1, d2]);
+        map.register_blobs("worker-b:50081", &[d1]);
+
+        let blobs = map.blobs_map();
+        assert_eq!(blobs.len(), 2);
+
+        // d1 has two endpoints
+        let d1_endpoints = blobs.get(&d1).unwrap();
+        assert_eq!(d1_endpoints.len(), 2);
+        assert!(d1_endpoints.contains_key("worker-a:50081"));
+        assert!(d1_endpoints.contains_key("worker-b:50081"));
+
+        // d2 has one endpoint
+        let d2_endpoints = blobs.get(&d2).unwrap();
+        assert_eq!(d2_endpoints.len(), 1);
+        assert!(d2_endpoints.contains_key("worker-a:50081"));
+    }
+
+    #[test]
+    fn test_re_registration_updates_timestamp() {
+        let mut map = BlobLocalityMap::new();
+        let d1 = DigestInfo::new([1u8; 32], 100);
+
+        map.register_blobs("worker-a", &[d1]);
+        let ts1 = *map
+            .blobs_map()
+            .get(&d1)
+            .unwrap()
+            .get("worker-a")
+            .unwrap();
+
+        // Spin until the clock advances (SystemTime resolution varies by OS).
+ loop { + if SystemTime::now() > ts1 { + break; + } + } + + map.register_blobs("worker-a", &[d1]); + let ts2 = *map + .blobs_map() + .get(&d1) + .unwrap() + .get("worker-a") + .unwrap(); + + assert!( + ts2 > ts1, + "Expected re-registration to update timestamp: ts1={ts1:?}, ts2={ts2:?}" + ); + } + + #[test] + fn test_evict_all_blobs_removes_endpoint() { + let mut map = BlobLocalityMap::new(); + let d1 = DigestInfo::new([1u8; 32], 100); + let d2 = DigestInfo::new([2u8; 32], 200); + + map.register_blobs("worker-a", &[d1, d2]); + assert_eq!(map.endpoint_count(), 1); + + map.evict_blobs("worker-a", &[d1, d2]); + + assert_eq!(map.endpoint_count(), 0); + assert_eq!(map.digest_count(), 0); + assert!(map.lookup_workers(&d1).is_empty()); + assert!(map.lookup_workers(&d2).is_empty()); + // endpoint_blobs should be fully cleaned up + assert!(map.all_endpoints().is_empty()); + } + + #[test] + fn test_partial_eviction_preserves_remaining() { + let mut map = BlobLocalityMap::new(); + let d1 = DigestInfo::new([1u8; 32], 100); + let d2 = DigestInfo::new([2u8; 32], 200); + let d3 = DigestInfo::new([3u8; 32], 300); + + map.register_blobs("worker-a", &[d1, d2, d3]); + assert_eq!(map.digest_count(), 3); + assert_eq!(map.endpoint_count(), 1); + + map.evict_blobs("worker-a", &[d1]); + + assert!(map.lookup_workers(&d1).is_empty()); + assert_eq!(map.lookup_workers(&d2), vec![Arc::from("worker-a")]); + assert_eq!(map.lookup_workers(&d3), vec![Arc::from("worker-a")]); + assert_eq!(map.digest_count(), 2); + assert_eq!(map.endpoint_count(), 1); + } + + #[test] + fn test_evict_unknown_digest_is_noop() { + let mut map = BlobLocalityMap::new(); + let d1 = DigestInfo::new([1u8; 32], 100); + let d2 = DigestInfo::new([2u8; 32], 200); + + map.register_blobs("worker-a", &[d1]); + + // Evict a digest that was never registered — should not panic. 
+ map.evict_blobs("worker-a", &[d2]); + + assert_eq!(map.lookup_workers(&d1), vec![Arc::from("worker-a")]); + assert_eq!(map.endpoint_count(), 1); + assert_eq!(map.digest_count(), 1); + } + + #[test] + fn test_complex_multi_endpoint_topology() { + let mut map = BlobLocalityMap::new(); + let d1 = DigestInfo::new([1u8; 32], 100); + let d2 = DigestInfo::new([2u8; 32], 200); + let d3 = DigestInfo::new([3u8; 32], 300); + let d4 = DigestInfo::new([4u8; 32], 400); + let d5 = DigestInfo::new([5u8; 32], 500); + + map.register_blobs("worker-a", &[d1, d2, d3]); + map.register_blobs("worker-b", &[d2, d3, d4]); + map.register_blobs("worker-c", &[d4, d5]); + + assert_eq!(map.digest_count(), 5); + assert_eq!(map.endpoint_count(), 3); + + // D2 on both worker-a and worker-b + let d2_workers = map.lookup_workers(&d2); + assert_eq!(d2_workers.len(), 2); + assert!(d2_workers.contains(&Arc::from("worker-a"))); + assert!(d2_workers.contains(&Arc::from("worker-b"))); + + // Remove worker-b + map.remove_endpoint("worker-b"); + + assert_eq!(map.endpoint_count(), 2); + + // D2 still on worker-a + let d2_workers = map.lookup_workers(&d2); + assert_eq!(d2_workers.len(), 1); + assert!(d2_workers.contains(&Arc::from("worker-a"))); + + // D4 still on worker-c + let d4_workers = map.lookup_workers(&d4); + assert_eq!(d4_workers.len(), 1); + assert!(d4_workers.contains(&Arc::from("worker-c"))); + + // D3 only on worker-a now + let d3_workers = map.lookup_workers(&d3); + assert_eq!(d3_workers.len(), 1); + assert!(d3_workers.contains(&Arc::from("worker-a"))); + + // D1 still on worker-a, D5 still on worker-c + assert_eq!(map.lookup_workers(&d1).len(), 1); + assert_eq!(map.lookup_workers(&d5).len(), 1); + assert_eq!(map.digest_count(), 5); + } + + #[test] + fn test_digest_count_and_endpoint_count_consistency() { + let mut map = BlobLocalityMap::new(); + let d1 = DigestInfo::new([1u8; 32], 100); + let d2 = DigestInfo::new([2u8; 32], 200); + let d3 = DigestInfo::new([3u8; 32], 300); + + // Step 1: Empty map. + assert_eq!(map.digest_count(), 0); + assert_eq!(map.endpoint_count(), 0); + + // Step 2: Register d1, d2 on worker-a. + map.register_blobs("worker-a", &[d1, d2]); + assert_eq!(map.digest_count(), 2); + assert_eq!(map.endpoint_count(), 1); + + // Step 3: Register d2, d3 on worker-b (d2 shared). + map.register_blobs("worker-b", &[d2, d3]); + assert_eq!(map.digest_count(), 3); + assert_eq!(map.endpoint_count(), 2); + + // Step 4: Evict d1 from worker-a (d1 disappears entirely). + map.evict_blobs("worker-a", &[d1]); + assert_eq!(map.digest_count(), 2); + assert_eq!(map.endpoint_count(), 2); + + // Step 5: Evict d2 from worker-a (d2 still on worker-b). + map.evict_blobs("worker-a", &[d2]); + assert_eq!(map.digest_count(), 2); // d2 and d3 remain + assert_eq!(map.endpoint_count(), 1); // worker-a removed (empty) + + // Step 6: Remove worker-b entirely. + map.remove_endpoint("worker-b"); + assert_eq!(map.digest_count(), 0); + assert_eq!(map.endpoint_count(), 0); + } + + #[test] + fn test_lookup_workers_with_timestamps() { + let mut map = BlobLocalityMap::new(); + let d1 = DigestInfo::new([1u8; 32], 100); + + map.register_blobs("worker-a:50081", &[d1]); + map.register_blobs("worker-b:50081", &[d1]); + + let workers_with_ts = map.lookup_workers_with_timestamps(&d1); + assert_eq!( + workers_with_ts.len(), + 2, + "Expected 2 endpoints with timestamps" + ); + + // Both timestamps should be non-UNIX_EPOCH (i.e., set to SystemTime::now()). 
+ for (endpoint, ts) in &workers_with_ts { + assert!( + *ts > std::time::UNIX_EPOCH, + "Expected valid timestamp for {endpoint}, got {ts:?}" + ); + } + + // Verify endpoint names match. + let endpoints: Vec<&str> = workers_with_ts.iter().map(|(e, _)| &**e).collect(); + assert!( + endpoints.contains(&"worker-a:50081"), + "Expected worker-a:50081 in results" + ); + assert!( + endpoints.contains(&"worker-b:50081"), + "Expected worker-b:50081 in results" + ); + } + + #[test] + fn test_lookup_workers_with_timestamps_unknown_digest() { + let map = BlobLocalityMap::new(); + let d1 = DigestInfo::new([1u8; 32], 100); + let result = map.lookup_workers_with_timestamps(&d1); + assert!( + result.is_empty(), + "Expected empty result for unknown digest" + ); + } +} diff --git a/nativelink-util/src/buf_channel.rs b/nativelink-util/src/buf_channel.rs index ad3b8c288..e26a0ffdd 100644 --- a/nativelink-util/src/buf_channel.rs +++ b/nativelink-util/src/buf_channel.rs @@ -27,18 +27,37 @@ use tracing::warn; const ZERO_DATA: Bytes = Bytes::new(); +/// Default channel capacity: 64 slots. At 256KiB chunks this gives 16MiB of +/// buffered data, which is sufficient for most workloads. +const DEFAULT_BUF_CHANNEL_CAPACITY: usize = 64; + /// Create a channel pair that can be used to transport buffer objects around to /// different components. This wrapper is used because the streams give some /// utility like managing EOF in a more friendly way, ensure if no EOF is received /// it will send an error to the receiver channel before shutting down and count /// the number of bytes sent. +/// +/// Uses the default capacity of 64 slots. For high-throughput or +/// latency-sensitive paths, use [`make_buf_channel_pair_with_size`] instead. #[must_use] pub fn make_buf_channel_pair() -> (DropCloserWriteHalf, DropCloserReadHalf) { - // We allow up to 2 items in the buffer at any given time. There is no major - // reason behind this magic number other than thinking it will be nice to give - // a little time for another thread to wake up and consume data if another - // thread is pumping large amounts of data into the channel. - let (tx, rx) = mpsc::channel(2); + make_buf_channel_pair_with_size(DEFAULT_BUF_CHANNEL_CAPACITY) +} + +/// Like [`make_buf_channel_pair`], but with a caller-specified channel capacity. +/// +/// The `capacity` parameter controls how many chunks can be buffered before the +/// producer is forced to wait. At 256KiB chunks (the default `read_buffer_size`), +/// each slot represents ~256KiB of buffered data, so: +/// +/// - 64 slots = ~16MiB (default, good for most workloads) +/// - 128 slots = ~32MiB (suitable for dual-store writes in FastSlowStore) +/// - 256 slots = ~64MiB (suitable for high-throughput streaming at 10Gbps+) +#[must_use] +pub fn make_buf_channel_pair_with_size( + capacity: usize, +) -> (DropCloserWriteHalf, DropCloserReadHalf) { + let (tx, rx) = mpsc::channel(capacity); let eof_sent = Arc::new(AtomicBool::new(false)); ( DropCloserWriteHalf { @@ -368,7 +387,9 @@ impl DropCloserReadHalf { } chunk }; - let mut output = BytesMut::new(); + // If we get here, first_chunk was not enough and there is more data. + // Fall back to concatenation for multiple chunks. + let mut output = BytesMut::with_capacity(size.min(first_chunk.len() * 2)); output.extend_from_slice(&first_chunk); loop { @@ -396,20 +417,41 @@ impl DropCloserReadHalf { impl Stream for DropCloserReadHalf { type Item = Result; - // TODO(palfrey) This is not very efficient as we are creating a new future on every - // poll() call. 
It might be better to use a waker.
    fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
-        Box::pin(self.recv())
-            .as_mut()
-            .poll(cx)
-            .map(|result| match result {
+        // First drain any queued data (e.g., from try_reset_stream or peek).
+        if let Some(chunk) = self.queued_data.pop_front() {
+            // queued_data may contain empty bytes representing EOF.
+            if chunk.is_empty() {
+                return Poll::Ready(None);
+            }
+            return Poll::Ready(Some(Ok(chunk)));
+        }
+
+        // Check for previous errors.
+        if let Some(err) = &self.last_err {
+            return Poll::Ready(Some(Err(err.clone().to_std_err())));
+        }
+
+        // Poll the underlying mpsc channel directly to avoid heap allocation.
+        match self.rx.poll_recv(cx) {
+            Poll::Ready(Some(bytes)) => match self.recv_inner(bytes) {
                 Ok(bytes) => {
                     if bytes.is_empty() {
-                        return None;
+                        Poll::Ready(None) // EOF
+                    } else {
+                        Poll::Ready(Some(Ok(bytes)))
                     }
-                    Some(Ok(bytes))
                 }
-                Err(e) => Some(Err(e.to_std_err())),
-            })
+                Err(e) => Poll::Ready(Some(Err(e.to_std_err()))),
+            },
+            Poll::Ready(None) => {
+                // Channel closed — treat as EOF or error depending on the eof_sent flag.
+                match self.recv_inner(ZERO_DATA) {
+                    Ok(_) => Poll::Ready(None),
+                    Err(e) => Poll::Ready(Some(Err(e.to_std_err()))),
+                }
+            }
+            Poll::Pending => Poll::Pending,
+        }
     }
 }
diff --git a/nativelink-util/src/connection_manager.rs b/nativelink-util/src/connection_manager.rs
index 26d9f9553..762f63c63 100644
--- a/nativelink-util/src/connection_manager.rs
+++ b/nativelink-util/src/connection_manager.rs
@@ -111,7 +111,7 @@ struct ConnectionManagerWorker {
 /// The maximum number of queued requests to obtain a connection from the
 /// worker before applying back pressure to the requestor. It makes sense to
-/// keep this small since it has to wait for a response anyway.
+/// keep this bounded, since each queued request still waits for a response.
-const WORKER_BACKLOG: usize = 8; +const WORKER_BACKLOG: usize = 64; impl ConnectionManager { /// Create a connection manager that creates a balance list between a given diff --git a/nativelink-util/src/digest_hasher.rs b/nativelink-util/src/digest_hasher.rs index 61d1269c2..ed695c70a 100644 --- a/nativelink-util/src/digest_hasher.rs +++ b/nativelink-util/src/digest_hasher.rs @@ -26,10 +26,10 @@ use nativelink_proto::build::bazel::remote::execution::v2::digest_function::Valu use opentelemetry::context::Context; use serde::{Deserialize, Serialize}; use sha2::{Digest, Sha256}; -use tokio::io::{AsyncRead, AsyncReadExt, AsyncSeekExt}; +use tokio::io::{AsyncRead, AsyncReadExt}; use crate::common::DigestInfo; -use crate::{fs, spawn_blocking}; +use crate::fs; static DEFAULT_DIGEST_HASHER_FUNC: OnceLock = OnceLock::new(); @@ -229,15 +229,27 @@ pub struct DigestHasherImpl { } impl DigestHasherImpl { - #[inline] async fn hash_file( - &mut self, - mut file: fs::FileSlot, + self, + file: fs::FileSlot, ) -> Result<(DigestInfo, fs::FileSlot), Error> { - let digest = self - .compute_from_reader(&mut file) - .await - .err_tip(|| "In digest_for_file")?; + let (mut hasher, file) = crate::spawn_blocking!("hash_file", move || { + let mut f = file; + let mut hasher = self; + let mut buf = vec![0u8; fs::DEFAULT_READ_BUFF_SIZE]; + loop { + let n = std::io::Read::read(f.as_std_mut(), &mut buf) + .err_tip(|| "Read error in hash_file")?; + if n == 0 { + break; + } + DigestHasher::update(&mut hasher, &buf[..n]); + } + Ok::<_, Error>((hasher, f)) + }) + .await + .map_err(|e| make_err!(Code::Internal, "hash_file spawn failed: {e:?}"))??; + let digest = hasher.finalize_digest(); Ok((digest, file)) } } @@ -264,14 +276,12 @@ impl DigestHasher for DigestHasherImpl { } async fn digest_for_file( - mut self, + self, file_path: impl AsRef, mut file: fs::FileSlot, size_hint: Option, ) -> Result<(DigestInfo, fs::FileSlot), Error> { - let file_position = file - .stream_position() - .await + let file_position = std::io::Seek::stream_position(file.as_std_mut()) .err_tip(|| "Couldn't get stream position in digest_for_file")?; if file_position != 0 { return self.hash_file(file).await; @@ -287,17 +297,26 @@ impl DigestHasher for DigestHasherImpl { match self.hash_func_impl { DigestHasherFuncImpl::Sha256(_) => self.hash_file(file).await, DigestHasherFuncImpl::Blake3(mut hasher) => { - spawn_blocking!("digest_for_file", move || { - hasher.update_mmap(file_path).map_err(|e| { - make_err!(Code::Internal, "Error in blake3's update_mmap: {e:?}") - })?; - Result::<_, Error>::Ok(( - DigestInfo::new(hasher.finalize().into(), hasher.count()), - file, - )) - }) - .await - .err_tip(|| "Could not spawn blocking task in digest_for_file")? + // Use rayon::spawn + oneshot instead of spawn_blocking so we + // don't hold a tokio blocking thread while rayon's thread pool + // does the parallel hashing work. + let (tx, rx) = tokio::sync::oneshot::channel(); + rayon::spawn(move || { + let result = match hasher.update_mmap_rayon(file_path) { + Ok(_) => Ok(( + DigestInfo::new(hasher.finalize().into(), hasher.count()), + file, + )), + Err(e) => Err(make_err!( + Code::Internal, + "Error in blake3's update_mmap_rayon: {e:?}" + )), + }; + drop(tx.send(result)); + }); + rx.await.map_err(|_| { + make_err!(Code::Internal, "Rayon task dropped in digest_for_file") + })? 
            }
        }
    }
}
diff --git a/nativelink-util/src/evicting_map.rs b/nativelink-util/src/evicting_map.rs
index e779f38b6..5e5c5aa23 100644
--- a/nativelink-util/src/evicting_map.rs
+++ b/nativelink-util/src/evicting_map.rs
@@ -23,15 +23,16 @@ use core::pin::Pin;
 use std::collections::BTreeSet;
 use std::sync::Arc;

+use parking_lot::Mutex;
 use futures::StreamExt;
 use futures::stream::FuturesUnordered;
 use lru::LruCache;
 use nativelink_config::stores::EvictionPolicy;
 use nativelink_metric::MetricsComponent;
-use parking_lot::Mutex;
 use serde::{Deserialize, Serialize};
-use tracing::{debug, info};
+use tracing::{debug, warn};

+use crate::background_spawn;
 use crate::instant_wrapper::InstantWrapper;
 use crate::metrics_utils::{Counter, CounterWithTime};

@@ -89,11 +90,13 @@ impl<T: LenEntry + Send + Sync> LenEntry for Arc<T> {
     }
 }

-// Callback to be called when the EvictingMap removes an item
-// either via eviction or direct deletion. This will be called with
-// whatever key type the EvictingMap uses.
-pub trait RemoveItemCallback<Q>: Debug + Send + Sync {
+// Callback invoked when the EvictingMap inserts or removes an item.
+pub trait ItemCallback<Q>: Debug + Send + Sync {
     fn callback(&self, store_key: &Q) -> Pin<Box<dyn Future<Output = ()> + Send>>;
+
+    /// Called synchronously when a new item is inserted.
+    /// Default is a no-op.
+    fn on_insert(&self, _store_key: &Q, _size: u64) {}
 }

 #[derive(Debug, MetricsComponent)]
@@ -101,7 +104,7 @@ struct State<
     K: Ord + Hash + Eq + Clone + Debug + Send + Borrow<Q>,
     Q: Ord + Hash + Eq + Debug,
     T: LenEntry + Debug + Send,
-    C: RemoveItemCallback<Q>,
+    C: ItemCallback<Q>,
 > {
     lru: LruCache<K, EvictionItem<T>>,
     btree: Option<BTreeSet<K>>,
@@ -120,7 +123,7 @@ struct State<
     lifetime_inserted_bytes: Counter,

     _key_type: PhantomData<Q>,
-    remove_callbacks: Vec<C>,
+    item_callbacks: Vec<C>,
 }

 type RemoveFuture = Pin<Box<dyn Future<Output = ()> + Send>>;

@@ -129,7 +132,7 @@ impl<
     K: Ord + Hash + Eq + Clone + Debug + Send + Sync + Borrow<Q>,
     Q: Ord + Hash + Eq + Debug + Sync,
     T: LenEntry + Debug + Sync + Send,
-    C: RemoveItemCallback<Q>,
+    C: ItemCallback<Q>,
 > State<K, Q, T, C>
 {
     /// Removes an item from the cache and returns the data for deferred cleanup.
@@ -157,7 +160,7 @@ impl<
         }

         let callbacks = self
-            .remove_callbacks
+            .item_callbacks
             .iter()
             .map(|callback| callback.callback(key))
             .collect();
@@ -168,6 +171,10 @@
     /// Inserts a new item into the cache. If the key already exists, the old item is returned
     /// for deferred cleanup.
+    ///
+    /// Note: This method does NOT fire `on_insert` callbacks. The caller is
+    /// responsible for collecting the key+size pairs and firing callbacks
+    /// after releasing the State mutex to avoid nested locking.
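+    ///
+    /// A sketch of that caller pattern, mirroring what `insert_with_time`
+    /// below actually does (illustrative only; the locals are assumed):
+    ///
+    /// ```ignore
+    /// let notifications = { /* lock state, put(), collect (key, size) pairs */ };
+    /// // State mutex released; re-lock briefly only to reach the callbacks.
+    /// let state = self.state.lock();
+    /// for (key, size) in &notifications {
+    ///     for cb in &state.item_callbacks {
+    ///         cb.on_insert(key.borrow(), *size);
+    ///     }
+    /// }
+    /// ```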
 #[must_use]
    fn put(&mut self, key: &K, eviction_item: EvictionItem<T>) -> Option<(T, Vec<RemoveFuture>)>
    where
        K: Borrow<Q>,
    {
@@ -183,18 +190,20 @@
             .map(|old_item| self.remove(key.borrow(), &old_item, true))
     }

-    fn add_remove_callback(&mut self, callback: C) {
-        self.remove_callbacks.push(callback);
+    fn add_item_callback(&mut self, callback: C) {
+        self.item_callbacks.push(callback);
     }
 }

 #[derive(Debug, Clone, Copy)]
-pub struct NoopRemove;
+pub struct NoopCallback;

-impl<Q> RemoveItemCallback<Q> for NoopRemove {
+impl<Q> ItemCallback<Q> for NoopCallback {
     fn callback(&self, _store_key: &Q) -> Pin<Box<dyn Future<Output = ()> + Send>> {
         Box::pin(async {})
     }
+
+    fn on_insert(&self, _store_key: &Q, _size: u64) {}
 }

 #[derive(Debug, MetricsComponent)]
@@ -203,7 +212,7 @@ pub struct EvictingMap<
     Q: Ord + Hash + Eq + Debug,
     T: LenEntry + Debug + Send,
     I: InstantWrapper,
-    C: RemoveItemCallback<Q> = NoopRemove,
+    C: ItemCallback<Q> = NoopCallback,
 > {
     #[metric]
     state: Mutex<State<K, Q, T, C>>,
@@ -224,7 +233,7 @@ where
     Q: Ord + Hash + Eq + Debug + Sync,
     T: LenEntry + Debug + Clone + Send + Sync,
     I: InstantWrapper,
-    C: RemoveItemCallback<Q>,
+    C: ItemCallback<Q>,
 {
     pub fn new(config: &EvictionPolicy, anchor_time: I) -> Self {
         Self {
@@ -240,7 +249,7 @@
                 replaced_items: CounterWithTime::default(),
                 lifetime_inserted_bytes: Counter::default(),
                 _key_type: PhantomData,
-                remove_callbacks: Vec::new(),
+                item_callbacks: Vec::new(),
             }),
             anchor_time,
             max_bytes: config.max_bytes as u64,
@@ -265,7 +274,7 @@
     /// and return the number of items that were processed.
     /// The `handler` function should return `true` to continue processing the next item
     /// or `false` to stop processing.
-    pub fn range<F>(&self, prefix_range: impl RangeBounds<K> + Send, mut handler: F) -> u64
+    pub async fn range<F>(&self, prefix_range: impl RangeBounds<K> + Send, mut handler: F) -> u64
     where
         F: FnMut(&K, &T) -> bool + Send,
         K: Ord,
@@ -291,7 +300,7 @@
     /// Returns the number of key-value pairs that are currently in the cache.
     /// Function is not for production code paths.
- pub fn len_for_test(&self) -> usize { + pub async fn len_for_test(&self) -> usize { self.state.lock().lru.len() } @@ -335,6 +344,9 @@ where self.max_bytes }; + let elapsed_seconds = + i32::try_from(self.anchor_time.elapsed().as_secs()).unwrap_or(i32::MAX); + let mut items_to_unref = Vec::new(); let mut removal_futures = Vec::new(); @@ -343,7 +355,13 @@ where .lru .pop_lru() .expect("Tried to peek() then pop() but failed"); - debug!(?key, "Evicting",); + let age_secs = elapsed_seconds.saturating_sub(eviction_item.seconds_since_anchor); + let size = eviction_item.data.len(); + if age_secs < 120 { + warn!(?key, age_secs, size, "Evicting recently-inserted item"); + } else { + debug!(?key, age_secs, size, "Evicting"); + } let (data, futures) = state.remove(key.borrow(), &eviction_item, false); items_to_unref.push(data); removal_futures.extend(futures.into_iter()); @@ -385,7 +403,16 @@ where R: Borrow + Send, { let (removal_futures, data_to_unref) = { + let lock_start = std::time::Instant::now(); let mut state = self.state.lock(); + let lock_wait = lock_start.elapsed(); + if lock_wait.as_millis() > 1 { + warn!( + lock_wait_ms = lock_wait.as_millis(), + op = "sizes_for_keys", + "EvictingMap: lock contention", + ); + } let lru_len = state.lru.len(); let mut data_to_unref = Vec::new(); @@ -404,7 +431,15 @@ where if self.should_evict(lru_len, entry, 0, u64::MAX) { *result = None; if let Some((key, eviction_item)) = state.lru.pop_entry(key.borrow()) { - info!(?key, "Item expired, evicting"); + let elapsed_seconds = + i32::try_from(self.anchor_time.elapsed().as_secs()).unwrap_or(i32::MAX); + let age_secs = elapsed_seconds.saturating_sub(eviction_item.seconds_since_anchor); + let size = eviction_item.data.len(); + if age_secs < 120 { + warn!(?key, age_secs, size, "Expired recently-inserted item"); + } else { + debug!(?key, age_secs, size, "Item expired, evicting"); + } let (data, futures) = state.remove(key.borrow(), &eviction_item, false); // Store data for later unref - we can't drop state here as we're still iterating @@ -426,50 +461,142 @@ where (removal_futures, data_to_unref) }; - // Perform the async callbacks outside of the lock - let mut callbacks: FuturesUnordered<_> = removal_futures.into_iter().collect(); - while callbacks.next().await.is_some() {} - let mut callbacks: FuturesUnordered<_> = - data_to_unref.iter().map(LenEntry::unref).collect(); - while callbacks.next().await.is_some() {} + // Fire-and-forget eviction cleanup in background. + if !removal_futures.is_empty() || !data_to_unref.is_empty() { + drop(background_spawn!("evicting_map_sizes_cleanup", async move { + let mut callbacks: FuturesUnordered<_> = removal_futures.into_iter().collect(); + while callbacks.next().await.is_some() {} + let mut callbacks: FuturesUnordered<_> = + data_to_unref.iter().map(LenEntry::unref).collect(); + while callbacks.next().await.is_some() {} + })); + } } pub async fn get(&self, key: &Q) -> Option { - // Fast path: Check if we need eviction before acquiring lock for eviction - let needs_eviction = { - let state = self.state.lock(); + let lock_start = std::time::Instant::now(); + let mut state = self.state.lock(); + let lock_wait = lock_start.elapsed(); + if lock_wait.as_millis() > 1 { + warn!( + lock_wait_ms = lock_wait.as_millis(), + op = "get", + "EvictingMap: lock contention", + ); + } + + // Perform eviction if needed, collecting items for background cleanup. 
+ let eviction_cleanup = { if let Some((_, peek_entry)) = state.lru.peek_lru() { - self.should_evict( + if self.should_evict( state.lru.len(), peek_entry, state.sum_store_size, self.max_bytes, - ) + ) { + let (items_to_unref, removal_futures) = self.evict_items(&mut *state); + if !removal_futures.is_empty() || !items_to_unref.is_empty() { + Some((items_to_unref, removal_futures)) + } else { + None + } + } else { + None + } } else { - false + None } }; - // Perform eviction if needed - if needs_eviction { - let (items_to_unref, removal_futures) = { - let mut state = self.state.lock(); - self.evict_items(&mut *state) - }; - // Unref items outside of lock - let mut callbacks: FuturesUnordered<_> = removal_futures.into_iter().collect(); - while callbacks.next().await.is_some() {} - let mut callbacks: FuturesUnordered<_> = - items_to_unref.iter().map(LenEntry::unref).collect(); - while callbacks.next().await.is_some() {} + // Get the item while still holding the lock. + let result = state.lru.get_mut(key.borrow()).map(|entry| { + entry.seconds_since_anchor = + i32::try_from(self.anchor_time.elapsed().as_secs()).unwrap_or(i32::MAX); + entry.data.clone() + }); + + drop(state); + + // Fire-and-forget eviction cleanup in background. + if let Some((items_to_unref, removal_futures)) = eviction_cleanup { + drop(background_spawn!("evicting_map_get_cleanup", async move { + let mut futures: FuturesUnordered<_> = removal_futures.into_iter().collect(); + while futures.next().await.is_some() {} + let mut callbacks: FuturesUnordered<_> = + items_to_unref.iter().map(LenEntry::unref).collect(); + while callbacks.next().await.is_some() {} + })); } - // Now get the item + result + } + + /// Retrieves multiple entries in a single lock acquisition, reducing + /// contention compared to calling `get()` in a loop. + pub async fn get_many<'b, Iter>(&self, keys: Iter) -> Vec> + where + Iter: IntoIterator, + Q: 'b, + { + let lock_start = std::time::Instant::now(); let mut state = self.state.lock(); - let entry = state.lru.get_mut(key.borrow())?; - entry.seconds_since_anchor = - i32::try_from(self.anchor_time.elapsed().as_secs()).unwrap_or(i32::MAX); - Some(entry.data.clone()) + let lock_wait = lock_start.elapsed(); + if lock_wait.as_millis() > 1 { + warn!( + lock_wait_ms = lock_wait.as_millis(), + op = "get_many", + "EvictingMap: lock contention", + ); + } + + // Perform eviction if needed, collecting items for background cleanup. + let eviction_cleanup = { + if let Some((_, peek_entry)) = state.lru.peek_lru() { + if self.should_evict( + state.lru.len(), + peek_entry, + state.sum_store_size, + self.max_bytes, + ) { + let (items_to_unref, removal_futures) = self.evict_items(&mut *state); + if !removal_futures.is_empty() || !items_to_unref.is_empty() { + Some((items_to_unref, removal_futures)) + } else { + None + } + } else { + None + } + } else { + None + } + }; + + let now = i32::try_from(self.anchor_time.elapsed().as_secs()).unwrap_or(i32::MAX); + let results: Vec> = keys + .into_iter() + .map(|key: &'b Q| { + state.lru.get_mut(key.borrow()).map(|entry| { + entry.seconds_since_anchor = now; + entry.data.clone() + }) + }) + .collect(); + + drop(state); + + // Fire-and-forget eviction cleanup in background. 
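+        // Deferring this is safe: evicted entries are LRU-cold keys distinct
+        // from any entries just returned, so their unref cannot race the results.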
+ if let Some((items_to_unref, removal_futures)) = eviction_cleanup { + drop(background_spawn!("evicting_map_get_many_cleanup", async move { + let mut futures: FuturesUnordered<_> = removal_futures.into_iter().collect(); + while futures.next().await.is_some() {} + let mut callbacks: FuturesUnordered<_> = + items_to_unref.iter().map(LenEntry::unref).collect(); + while callbacks.next().await.is_some() {} + })); + } + + results } /// Returns the replaced item if any. @@ -487,23 +614,58 @@ where /// Returns the replaced item if any. pub async fn insert_with_time(&self, key: K, data: T, seconds_since_anchor: i32) -> Option { - let (items_to_unref, removal_futures) = { + let (replaced_items, evicted_items, removal_futures, insert_notifications) = { + let lock_start = std::time::Instant::now(); let mut state = self.state.lock(); + let lock_wait = lock_start.elapsed(); + if lock_wait.as_millis() > 1 { + warn!( + lock_wait_ms = lock_wait.as_millis(), + op = "insert", + "EvictingMap: lock contention", + ); + } self.inner_insert_many(&mut state, [(key, data)], seconds_since_anchor) }; + // State lock released. Fire insert callbacks outside the critical section. + if !insert_notifications.is_empty() { + let state = self.state.lock(); + for (key, size) in &insert_notifications { + for cb in &state.item_callbacks { + cb.on_insert(key.borrow(), *size); + } + } + } - let mut futures: FuturesUnordered<_> = removal_futures.into_iter().collect(); - while futures.next().await.is_some() {} + // Replaced items share the same key (and thus content path) as the + // new insert. Their unrefs MUST complete before the caller continues + // to rename the new file into the same path. + let result = if !replaced_items.is_empty() { + let futures: FuturesUnordered<_> = replaced_items + .into_iter() + .map(|item| async move { + item.unref().await; + item + }) + .collect(); + futures.collect::>().await.into_iter().next() + } else { + None + }; - // Unref items outside of lock - let futures: FuturesUnordered<_> = items_to_unref - .into_iter() - .map(|item| async move { - item.unref().await; - item - }) - .collect(); - futures.collect::>().await.into_iter().next() + // Fire-and-forget eviction cleanup (different keys, no path conflict) + // and removal callbacks (cache invalidation, protected by stale-positive handling). + if !removal_futures.is_empty() || !evicted_items.is_empty() { + drop(background_spawn!("evicting_map_insert_cleanup", async move { + let mut futures: FuturesUnordered<_> = removal_futures.into_iter().collect(); + while futures.next().await.is_some() {} + let mut callbacks: FuturesUnordered<_> = + evicted_items.iter().map(LenEntry::unref).collect(); + while callbacks.next().await.is_some() {} + })); + } + + result } /// Same as `insert()`, but optimized for multiple inserts. @@ -522,20 +684,35 @@ where return Vec::new(); } - let (items_to_unref, removal_futures) = { + let (replaced_items, evicted_items, removal_futures, insert_notifications) = { + let lock_start = std::time::Instant::now(); let mut state = self.state.lock(); + let lock_wait = lock_start.elapsed(); + if lock_wait.as_millis() > 1 { + warn!( + lock_wait_ms = lock_wait.as_millis(), + op = "insert_many", + "EvictingMap: lock contention", + ); + } self.inner_insert_many( &mut state, inserts, i32::try_from(self.anchor_time.elapsed().as_secs()).unwrap_or(i32::MAX), ) }; + // State lock released. Fire insert callbacks outside the critical section. 
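+        // (The mutex is re-acquired briefly below, but only to reach the
+        // registered callbacks — the insert itself has already committed.)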
+ if !insert_notifications.is_empty() { + let state = self.state.lock(); + for (key, size) in &insert_notifications { + for cb in &state.item_callbacks { + cb.on_insert(key.borrow(), *size); + } + } + } - let mut futures: FuturesUnordered<_> = removal_futures.into_iter().collect(); - while futures.next().await.is_some() {} - - // Unref items outside of lock - items_to_unref + // Replaced items share the same key/path — must await their unrefs. + let result: Vec = replaced_items .into_iter() .map(|item| async move { item.unref().await; @@ -543,15 +720,39 @@ where }) .collect::>() .collect::>() - .await + .await; + + // Fire-and-forget eviction cleanup (different keys, no path conflict). + if !removal_futures.is_empty() || !evicted_items.is_empty() { + drop(background_spawn!("evicting_map_insert_many_cleanup", async move { + let mut futures: FuturesUnordered<_> = removal_futures.into_iter().collect(); + while futures.next().await.is_some() {} + let mut callbacks: FuturesUnordered<_> = + evicted_items.iter().map(LenEntry::unref).collect(); + while callbacks.next().await.is_some() {} + })); + } + + result } + /// Returns `(replaced_items, evicted_items, removal_futures, insert_notifications)`. + /// - `replaced_items`: items that were replaced by new inserts (same key). + /// - `evicted_items`: items evicted due to size/age/count limits. + /// - `removal_futures`: callbacks from item_callbacks for all removed items. + /// - `insert_notifications`: (key, size) pairs for firing on_insert callbacks + /// outside the State mutex critical section. + /// + /// Callers should fire-and-forget the eviction cleanup (evicted_items unrefs + /// + removal_futures) via `background_spawn!` to avoid blocking the caller. + /// Callers MUST fire on_insert callbacks for each insert_notification after + /// releasing the State mutex to avoid nested locking. fn inner_insert_many( &self, state: &mut State, inserts: It, seconds_since_anchor: i32, - ) -> (Vec, Vec) + ) -> (Vec, Vec, Vec, Vec<(K, u64)>) where It: IntoIterator + Send, // Note: It's not enough to have the inserts themselves be Send. 
The @@ -560,6 +761,7 @@ where { let mut replaced_items = Vec::new(); let mut removal_futures = Vec::new(); + let mut insert_notifications = Vec::new(); for (key, data) in inserts { let new_item_size = data.len(); let eviction_item = EvictionItem { @@ -573,22 +775,28 @@ where } state.sum_store_size += new_item_size; state.lifetime_inserted_bytes.add(new_item_size); + insert_notifications.push((key, new_item_size)); } // Perform eviction after all insertions - let (items_to_unref, futures) = self.evict_items(state); + let (evicted_items, futures) = self.evict_items(state); removal_futures.extend(futures); - // Note: We cannot drop the state lock here since we're borrowing it, - // but the caller will handle unreffing these items after releasing the lock - replaced_items.extend(items_to_unref); - - (replaced_items, removal_futures) + (replaced_items, evicted_items, removal_futures, insert_notifications) } pub async fn remove(&self, key: &Q) -> bool { - let (items_to_unref, removed_item, removal_futures) = { + let (evicted_items, removed_item, removal_futures) = { + let lock_start = std::time::Instant::now(); let mut state = self.state.lock(); + let lock_wait = lock_start.elapsed(); + if lock_wait.as_millis() > 1 { + warn!( + lock_wait_ms = lock_wait.as_millis(), + op = "remove", + "EvictingMap: lock contention", + ); + } // First perform eviction let (evicted_items, mut removal_futures) = self.evict_items(&mut *state); @@ -605,21 +813,25 @@ where (evicted_items, removed, removal_futures) }; - let mut callbacks: FuturesUnordered<_> = removal_futures.into_iter().collect(); - while callbacks.next().await.is_some() {} - - // Unref evicted items outside of lock - let mut callbacks: FuturesUnordered<_> = - items_to_unref.iter().map(LenEntry::unref).collect(); - while callbacks.next().await.is_some() {} - - // Unref removed item if any - if let Some(item) = removed_item { - item.unref().await; - return true; + let was_removed = removed_item.is_some(); + + // Fire-and-forget all cleanup (evicted + removed + callbacks) in background. + let has_cleanup = + !removal_futures.is_empty() || !evicted_items.is_empty() || removed_item.is_some(); + if has_cleanup { + drop(background_spawn!("evicting_map_remove_cleanup", async move { + let mut futures: FuturesUnordered<_> = removal_futures.into_iter().collect(); + while futures.next().await.is_some() {} + let mut callbacks: FuturesUnordered<_> = evicted_items + .iter() + .chain(removed_item.iter()) + .map(LenEntry::unref) + .collect(); + while callbacks.next().await.is_some() {} + })); } - false + was_removed } /// Same as `remove()`, but allows for a conditional to be applied to the @@ -648,29 +860,46 @@ where (evicted_items, removal_futures, removed_item) } else { - (vec![], vec![].into_iter().collect(), None) + return false; } }; - // Perform the async callbacks outside of the lock - let mut removal_futures: FuturesUnordered<_> = removal_futures.into_iter().collect(); - while removal_futures.next().await.is_some() {} + let was_removed = removed_item.is_some(); + + // Fire-and-forget all cleanup in background. 
+ let has_cleanup = + !removal_futures.is_empty() || !evicted_items.is_empty() || removed_item.is_some(); + if has_cleanup { + drop(background_spawn!("evicting_map_remove_if_cleanup", async move { + let mut futures: FuturesUnordered<_> = removal_futures.into_iter().collect(); + while futures.next().await.is_some() {} + let mut callbacks: FuturesUnordered<_> = evicted_items + .iter() + .chain(removed_item.iter()) + .map(LenEntry::unref) + .collect(); + while callbacks.next().await.is_some() {} + })); + } - // Unref evicted items - let mut callbacks: FuturesUnordered<_> = - evicted_items.iter().map(LenEntry::unref).collect(); - while callbacks.next().await.is_some() {} + was_removed + } - // Unref removed item if any - if let Some(item) = removed_item { - item.unref().await; - true - } else { - false - } + pub fn add_item_callback(&self, callback: C) { + self.state.lock().add_item_callback(callback); } - pub fn add_remove_callback(&self, callback: C) { - self.state.lock().add_remove_callback(callback); + /// Returns all entries in the cache with their LRU timestamps as absolute + /// seconds since UNIX epoch. Each entry is (key, unix_timestamp_secs). + /// + /// This is a peek-only operation: it does NOT promote entries in the LRU. + pub fn get_all_entries_with_timestamps(&self) -> Vec<(K, i64)> { + let anchor_epoch = self.anchor_time.unix_timestamp() as i64; + let state = self.state.lock(); + let mut result = Vec::with_capacity(state.lru.len()); + result.extend(state.lru.iter().map(|(k, v)| { + (k.clone(), anchor_epoch + v.seconds_since_anchor as i64) + })); + result } } diff --git a/nativelink-util/src/fs.rs b/nativelink-util/src/fs.rs index 284d2ca58..015a5228a 100644 --- a/nativelink-util/src/fs.rs +++ b/nativelink-util/src/fs.rs @@ -12,36 +12,48 @@ // See the License for the specific language governing permissions and // limitations under the License. -use core::pin::Pin; use core::sync::atomic::{AtomicUsize, Ordering}; -use core::task::{Context, Poll}; use std::fs::{Metadata, Permissions}; -use std::io::{IoSlice, Seek}; +use std::io::{Read, Seek, Write}; use std::path::{Path, PathBuf}; +use bytes::{Bytes, BytesMut}; use nativelink_error::{Code, Error, ResultExt, make_err}; use rlimit::increase_nofile_limit; /// We wrap all `tokio::fs` items in our own wrapper so we can limit the number of outstanding /// open files at any given time. This will greatly reduce the chance we'll hit open file limit /// issues. pub use tokio::fs::DirEntry; -use tokio::io::{AsyncRead, AsyncReadExt, AsyncSeek, AsyncWrite, ReadBuf, SeekFrom, Take}; +use tokio::io::SeekFrom; use tokio::sync::{Semaphore, SemaphorePermit}; use tracing::{error, info, trace, warn}; +use crate::buf_channel::{DropCloserReadHalf, DropCloserWriteHalf}; use crate::spawn_blocking; /// Default read buffer size when reading to/from disk. -pub const DEFAULT_READ_BUFF_SIZE: usize = 0x4000; +pub const DEFAULT_READ_BUFF_SIZE: usize = 64 * 1024; #[derive(Debug)] pub struct FileSlot { // We hold the permit because once it is dropped it goes back into the queue. _permit: SemaphorePermit<'static>, - inner: tokio::fs::File, + inner: std::fs::File, } impl FileSlot { + /// Returns a reference to the underlying `std::fs::File`. + #[inline] + pub fn as_std(&self) -> &std::fs::File { + &self.inner + } + + /// Returns a mutable reference to the underlying `std::fs::File`. + #[inline] + pub fn as_std_mut(&mut self) -> &mut std::fs::File { + &mut self.inner + } + /// Advise the kernel to drop page cache for this file's contents. 
/// Only available on Linux; #[cfg(target_os = "linux")] @@ -62,77 +74,25 @@ impl FileSlot { pub const fn advise_dontneed(&self) { // No-op: posix_fadvise is not available on Mac or Windows. } -} - -impl AsRef for FileSlot { - fn as_ref(&self) -> &tokio::fs::File { - &self.inner - } -} - -impl AsMut for FileSlot { - fn as_mut(&mut self) -> &mut tokio::fs::File { - &mut self.inner - } -} -impl AsyncRead for FileSlot { - fn poll_read( - mut self: Pin<&mut Self>, - cx: &mut Context<'_>, - buf: &mut ReadBuf<'_>, - ) -> Poll> { - Pin::new(&mut self.inner).poll_read(cx, buf) - } -} - -impl AsyncSeek for FileSlot { - fn start_seek(mut self: Pin<&mut Self>, position: SeekFrom) -> Result<(), tokio::io::Error> { - Pin::new(&mut self.inner).start_seek(position) - } - - fn poll_complete( - mut self: Pin<&mut Self>, - cx: &mut Context<'_>, - ) -> Poll> { - Pin::new(&mut self.inner).poll_complete(cx) - } -} - -impl AsyncWrite for FileSlot { - fn poll_write( - mut self: Pin<&mut Self>, - cx: &mut Context<'_>, - buf: &[u8], - ) -> Poll> { - Pin::new(&mut self.inner).poll_write(cx, buf) - } - - fn poll_flush( - mut self: Pin<&mut Self>, - cx: &mut Context<'_>, - ) -> Poll> { - Pin::new(&mut self.inner).poll_flush(cx) - } - - fn poll_shutdown( - mut self: Pin<&mut Self>, - cx: &mut Context<'_>, - ) -> Poll> { - Pin::new(&mut self.inner).poll_shutdown(cx) - } - - fn poll_write_vectored( - mut self: Pin<&mut Self>, - cx: &mut Context<'_>, - bufs: &[IoSlice<'_>], - ) -> Poll> { - Pin::new(&mut self.inner).poll_write_vectored(cx, bufs) + /// Advise the kernel that this file will be read sequentially, + /// enabling more aggressive readahead (typically 2-4x default). + #[cfg(target_os = "linux")] + pub fn advise_sequential(&self) { + use std::os::unix::io::AsRawFd; + let fd = self.inner.as_raw_fd(); + let ret = unsafe { libc::posix_fadvise(fd, 0, 0, libc::POSIX_FADV_SEQUENTIAL) }; + if ret != 0 { + tracing::debug!( + fd, + ret, + "posix_fadvise(SEQUENTIAL) returned non-zero (best-effort, ignoring)", + ); + } } - fn is_write_vectored(&self) -> bool { - self.inner.is_write_vectored() - } + #[cfg(not(target_os = "linux"))] + pub const fn advise_sequential(&self) {} } // Note: If the default changes make sure you update the documentation in @@ -231,11 +191,7 @@ pub fn get_open_files_for_test() -> usize { OPEN_FILE_LIMIT.load(Ordering::Acquire) - OPEN_FILE_SEMAPHORE.available_permits() } -pub async fn open_file( - path: impl AsRef, - start: u64, - limit: u64, -) -> Result, Error> { +pub async fn open_file(path: impl AsRef, start: u64) -> Result { let path = path.as_ref().to_owned(); let (permit, os_file) = call_with_permit(move |permit| { let mut os_file = @@ -250,9 +206,8 @@ pub async fn open_file( .await?; Ok(FileSlot { _permit: permit, - inner: tokio::fs::File::from_std(os_file), - } - .take(limit)) + inner: os_file, + }) } pub async fn create_file(path: impl AsRef) -> Result { @@ -272,10 +227,111 @@ pub async fn create_file(path: impl AsRef) -> Result { .await?; Ok(FileSlot { _permit: permit, - inner: tokio::fs::File::from_std(os_file), + inner: os_file, }) } +/// Read from `file` in a blocking thread, sending chunks to `writer`. +/// Reads up to `limit` bytes starting from the current file position. +/// `read_buffer_size` controls the chunk size (typically 256 KiB). +/// Returns the `FileSlot` so the caller can reuse or drop it. 
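+///
+/// A minimal usage sketch (illustrative only; the path is hypothetical, and a
+/// consumer of `rx` must run concurrently or the bounded channel fills up):
+///
+/// ```ignore
+/// let (mut tx, rx) = make_buf_channel_pair();
+/// let file = open_file("/tmp/some_blob", 0).await?;
+/// let file = read_file_to_channel(file, &mut tx, u64::MAX, DEFAULT_READ_BUFF_SIZE).await?;
+/// tx.send_eof()?;
+/// ```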
+pub async fn read_file_to_channel(
+    file: FileSlot,
+    writer: &mut DropCloserWriteHalf,
+    limit: u64,
+    read_buffer_size: usize,
+) -> Result<FileSlot, Error> {
+    let (sync_tx, mut async_rx) = tokio::sync::mpsc::channel::<Result<Bytes, Error>>(4);
+
+    let read_task = spawn_blocking!("fs_read_file", move || {
+        let mut f = file;
+        let mut remaining = limit;
+        loop {
+            let to_read = read_buffer_size.min(remaining as usize);
+            if to_read == 0 {
+                break;
+            }
+            let mut buf = BytesMut::zeroed(to_read);
+            match f.as_std_mut().read(&mut buf[..]) {
+                Ok(0) => break,
+                Ok(n) => {
+                    buf.truncate(n);
+                    remaining -= n as u64;
+                    if sync_tx.blocking_send(Ok(buf.freeze())).is_err() {
+                        break; // reader dropped
+                    }
+                }
+                Err(e) => {
+                    drop(sync_tx.blocking_send(Err(e.into())));
+                    break;
+                }
+            }
+        }
+        f
+    });
+
+    // Receive chunks and forward to the async writer.
+    while let Some(result) = async_rx.recv().await {
+        let chunk = result?;
+        writer
+            .send(chunk)
+            .await
+            .err_tip(|| "Failed to send chunk from file reader")?;
+    }
+    // Ensure the blocking task completed successfully.
+    read_task
+        .await
+        .map_err(|e| make_err!(Code::Internal, "read task join failed: {e:?}"))
+}
+
+/// Write to `file` from a blocking thread, receiving chunks from `reader`.
+/// Returns total bytes written and the `FileSlot`.
+pub async fn write_file_from_channel(
+    file: FileSlot,
+    reader: &mut DropCloserReadHalf,
+) -> Result<(u64, FileSlot), Error> {
+    let (async_tx, mut sync_rx) = tokio::sync::mpsc::channel::<Bytes>(4);
+
+    let write_task = spawn_blocking!("fs_write_file", move || {
+        let mut f = file;
+        let mut total: u64 = 0;
+        while let Some(data) = sync_rx.blocking_recv() {
+            f.as_std_mut()
+                .write_all(&data)
+                .map_err(|e| Into::<Error>::into(e))?;
+            total += data.len() as u64;
+        }
+        Ok::<_, Error>((total, f))
+    });
+
+    // Async side: recv from channel, send to blocking writer.
+    let send_result: Result<(), Error> = async {
+        loop {
+            let data = reader
+                .recv()
+                .await
+                .err_tip(|| "Failed to recv in write_file_from_channel")?;
+            if data.is_empty() {
+                break; // EOF
+            }
+            if async_tx.send(data).await.is_err() {
+                // Writer task died — we'll get the error from write_task.
+                break;
+            }
+        }
+        Ok(())
+    }
+    .await;
+    drop(async_tx); // Signal EOF to writer.
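+
+    // Join the blocking writer first: its own write errors surface via `??`,
+    // and only then is any recv-side error from `send_result` propagated.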
+ + let (total, file) = write_task + .await + .map_err(|e| make_err!(Code::Internal, "write task join failed: {e:?}"))??; + + send_result?; + Ok((total, file)) +} + pub async fn hard_link(src: impl AsRef, dst: impl AsRef) -> Result<(), Error> { let src = src.as_ref().to_owned(); let dst = dst.as_ref().to_owned(); diff --git a/nativelink-util/src/fs_util.rs b/nativelink-util/src/fs_util.rs index c84215448..a785ec1eb 100644 --- a/nativelink-util/src/fs_util.rs +++ b/nativelink-util/src/fs_util.rs @@ -24,7 +24,7 @@ use tokio::fs; /// /// # Arguments /// * `src_dir` - Source directory path (must exist) -/// * `dst_dir` - Destination directory path (will be created) +/// * `dst_dir` - Destination directory path (will be created if it doesn't exist) /// /// # Returns /// * `Ok(())` on success @@ -37,7 +37,6 @@ use tokio::fs; /// /// # Errors /// - Source directory doesn't exist -/// - Destination already exists /// - Cross-filesystem hardlinking attempted /// - Filesystem doesn't support hardlinks /// - Permission denied @@ -48,13 +47,7 @@ pub async fn hardlink_directory_tree(src_dir: &Path, dst_dir: &Path) -> Result<( src_dir.display() ); - error_if!( - dst_dir.exists(), - "Destination directory already exists: {}", - dst_dir.display() - ); - - // Create the root destination directory + // Create the root destination directory (idempotent — ok if it already exists) fs::create_dir_all(dst_dir).await.err_tip(|| { format!( "Failed to create destination directory: {}", @@ -163,10 +156,17 @@ fn set_readonly_recursive_impl<'a>( path: &'a Path, ) -> Pin> + Send + 'a>> { Box::pin(async move { - let metadata = fs::metadata(path) + // Use symlink_metadata to avoid following symlinks (security: prevents + // changing permissions on external paths via crafted symlinks). + let metadata = fs::symlink_metadata(path) .await .err_tip(|| format!("Failed to get metadata for: {}", path.display()))?; + // Skip symlinks — do not follow them or change their target's permissions. + if metadata.is_symlink() { + return Ok(()); + } + if metadata.is_dir() { let mut entries = fs::read_dir(path) .await @@ -187,9 +187,11 @@ fn set_readonly_recursive_impl<'a>( use std::os::unix::fs::PermissionsExt; let mut perms = metadata.permissions(); - // If it's a directory, set to r-xr-xr-x (555) - // If it's a file, set to r--r--r-- (444) - let mode = if metadata.is_dir() { 0o555 } else { 0o444 }; + // Strip write bits but preserve execute bits. + // Files marked is_executable (e.g., shell scripts) are 0o555; + // stripping write keeps them at 0o555. Non-executable files + // at 0o644 become 0o444. Directories at 0o755 become 0o555. + let mode = perms.mode() & !0o222; perms.set_mode(mode); fs::set_permissions(path, perms) @@ -229,10 +231,17 @@ fn calculate_directory_size_impl<'a>( path: &'a Path, ) -> Pin> + Send + 'a>> { Box::pin(async move { - let metadata = fs::metadata(path) + // Use symlink_metadata to avoid following symlinks (security: prevents + // counting external files reachable via crafted symlinks). + let metadata = fs::symlink_metadata(path) .await .err_tip(|| format!("Failed to get metadata for: {}", path.display()))?; + // Symlinks count as 0 bytes — do not follow them. 
+ if metadata.is_symlink() { + return Ok(0); + } + if metadata.is_file() { return Ok(metadata.len()); } @@ -370,14 +379,24 @@ mod tests { } #[tokio::test] - async fn test_hardlink_existing_destination() -> Result<(), Error> { - let (_temp_dir, src_dir) = create_test_directory().await?; - let dst_dir = _temp_dir.path().join("existing"); + async fn test_hardlink_into_existing_destination() -> Result<(), Error> { + let (temp_dir, src_dir) = create_test_directory().await?; + let dst_dir = temp_dir.path().join("existing"); + // Pre-create the destination directory (simulates work_directory already existing) fs::create_dir(&dst_dir).await?; - let result = hardlink_directory_tree(&src_dir, &dst_dir).await; - assert!(result.is_err()); + // Should succeed — hardlink contents into existing directory + hardlink_directory_tree(&src_dir, &dst_dir).await?; + + // Verify structure + assert!(dst_dir.join("file1.txt").exists()); + assert!(dst_dir.join("subdir").is_dir()); + assert!(dst_dir.join("subdir/file2.txt").exists()); + + // Verify contents + let content1 = fs::read_to_string(dst_dir.join("file1.txt")).await?; + assert_eq!(content1, "Hello, World!"); Ok(()) } diff --git a/nativelink-util/src/lib.rs b/nativelink-util/src/lib.rs index 8ab85754e..5949f7f77 100644 --- a/nativelink-util/src/lib.rs +++ b/nativelink-util/src/lib.rs @@ -13,6 +13,7 @@ // limitations under the License. pub mod action_messages; +pub mod blob_locality_map; pub mod buf_channel; pub mod channel_body_for_tests; pub mod chunked_stream; @@ -24,6 +25,7 @@ pub mod fastcdc; pub mod fs; pub mod fs_util; pub mod health_utils; +pub mod log_utils; pub mod instant_wrapper; pub mod known_platform_property_provider; pub mod metrics; @@ -36,6 +38,7 @@ pub mod proto_stream_utils; pub mod resource_info; pub mod retry; pub mod shutdown_guard; +pub mod stall_detector; pub mod store_trait; pub mod task; pub mod telemetry; diff --git a/nativelink-util/src/log_utils.rs b/nativelink-util/src/log_utils.rs new file mode 100644 index 000000000..3de473391 --- /dev/null +++ b/nativelink-util/src/log_utils.rs @@ -0,0 +1,25 @@ +// Copyright 2024 The NativeLink Authors. All rights reserved. +// +// Licensed under the Functional Source License, Version 1.1, Apache 2.0 Future License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// See LICENSE file for details +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use core::time::Duration; + +/// Computes throughput in megabits per second. 
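+///
+/// For example, 32 MiB (33_554_432 bytes) transferred in 250 ms works out to
+/// `33_554_432 * 8 / (0.25 * 1_000_000)` ≈ 1073.7 Mbps.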
+#[inline] +pub fn throughput_mbps(size_bytes: u64, elapsed: Duration) -> f64 { + let secs = elapsed.as_secs_f64(); + if secs == 0.0 { + return 0.0; + } + (size_bytes as f64 * 8.0) / (secs * 1_000_000.0) +} diff --git a/nativelink-util/src/platform_properties.rs b/nativelink-util/src/platform_properties.rs index 37d19b2e3..1b6e5a5f0 100644 --- a/nativelink-util/src/platform_properties.rs +++ b/nativelink-util/src/platform_properties.rs @@ -21,7 +21,7 @@ use nativelink_metric::{ use nativelink_proto::build::bazel::remote::execution::v2::Platform as ProtoPlatform; use nativelink_proto::build::bazel::remote::execution::v2::platform::Property as ProtoProperty; use serde::{Deserialize, Serialize}; -use tracing::info; +use tracing::debug; /// `PlatformProperties` helps manage the configuration of platform properties to /// keys and types. The scheduler uses these properties to decide what jobs @@ -54,12 +54,12 @@ impl PlatformProperties { if full_worker_logging { match check_value { PlatformPropertyValue::Minimum(_) => { - info!( + debug!( "Property mismatch on worker property {property}. {worker_value:?} < {check_value:?}" ); } _ => { - info!( + debug!( "Property mismatch on worker property {property}. {worker_value:?} != {check_value:?}" ); } @@ -69,7 +69,7 @@ impl PlatformProperties { } } else { if full_worker_logging { - info!("Property missing on worker property {property}"); + debug!("Property missing on worker property {property}"); } return false; } diff --git a/nativelink-util/src/stall_detector.rs b/nativelink-util/src/stall_detector.rs new file mode 100644 index 000000000..d6128bb4a --- /dev/null +++ b/nativelink-util/src/stall_detector.rs @@ -0,0 +1,217 @@ +// Copyright 2024 The NativeLink Authors. All rights reserved. +// +// Licensed under the Functional Source License, Version 1.1, Apache 2.0 Future License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// See LICENSE file for details +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Stall detection and thread dump utilities. +//! +//! When an async operation takes longer than a configured threshold, +//! [`StallGuard`] dumps all thread stacks to a file for post-mortem analysis. + +use core::time::Duration; +use std::sync::atomic::{AtomicU64, Ordering}; + +use tracing::error; + +/// Minimum interval between consecutive stack dumps (seconds). +/// Prevents flooding /tmp with dumps during a sustained stall. +const MIN_DUMP_INTERVAL_SECS: u64 = 30; + +/// Unix epoch seconds of the last dump. Used for rate-limiting. +static LAST_DUMP_EPOCH: AtomicU64 = AtomicU64::new(0); + +/// Default stall threshold for store operations. +pub const DEFAULT_STALL_THRESHOLD: Duration = Duration::from_secs(30); + +/// A guard that spawns a background task to detect stalls. When the +/// guarded operation completes (i.e., the guard is dropped), the +/// background task is cancelled. If the operation exceeds `threshold`, +/// a thread dump is written to `/tmp/nativelink-stall-.txt`. +/// +/// This relies on tokio's timer infrastructure, so it cannot detect +/// stalls caused by the tokio runtime itself being blocked. The +/// runtime-watchdog OS thread in nativelink.rs covers that case. 
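+///
+/// A typical usage sketch (illustrative; the label and guarded call are
+/// assumptions, not part of this change):
+///
+/// ```ignore
+/// let _guard = StallGuard::new(DEFAULT_STALL_THRESHOLD, "filesystem_get");
+/// let data = store.get_part_unchunked(key, 0, None).await?;
+/// // `_guard` drops here, aborting the watchdog task.
+/// ```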
+#[must_use = "StallGuard is immediately cancelled if not held in a variable"]
+#[derive(Debug)]
+pub struct StallGuard {
+    handle: tokio::task::JoinHandle<()>,
+}
+
+impl StallGuard {
+    /// Create a stall guard for an operation with the given label.
+    /// If the guard is not dropped within `threshold`, a stack dump fires.
+    pub fn new(threshold: Duration, label: &'static str) -> Self {
+        Self::new_inner(threshold, label, None)
+    }
+
+    /// Create a stall guard with additional dynamic context (e.g. digest
+    /// hash, size, operation details). The context string is included in
+    /// the stall message and thread dump header when the threshold fires.
+    pub fn with_context(threshold: Duration, label: &'static str, context: String) -> Self {
+        Self::new_inner(threshold, label, Some(context))
+    }
+
+    fn new_inner(threshold: Duration, label: &'static str, context: Option<String>) -> Self {
+        let handle = tokio::spawn(async move {
+            tokio::time::sleep(threshold).await;
+            let ctx_suffix = context
+                .as_deref()
+                .map_or_else(String::new, |c| format!(" [{c}]"));
+            let now = std::time::SystemTime::now()
+                .duration_since(std::time::UNIX_EPOCH)
+                .unwrap_or_default()
+                .as_secs();
+            let prev = LAST_DUMP_EPOCH.load(Ordering::Relaxed);
+            if now.saturating_sub(prev) >= MIN_DUMP_INTERVAL_SECS
+                && LAST_DUMP_EPOCH
+                    .compare_exchange(prev, now, Ordering::SeqCst, Ordering::Relaxed)
+                    .is_ok()
+            {
+                error!(
+                    "STORE OPERATION STALL: {label}{ctx_suffix} has been running for >{threshold:.0?} — dumping thread stacks",
+                );
+                let dump_label = if ctx_suffix.is_empty() {
+                    label.to_string()
+                } else {
+                    format!("{label}{ctx_suffix}")
+                };
+                dump_thread_stacks(&dump_label);
+            } else {
+                error!(
+                    "STORE OPERATION STALL: {label}{ctx_suffix} has been running for >{threshold:.0?} (dump rate-limited)",
+                );
+            }
+        });
+        Self { handle }
+    }
+}
+
+impl Drop for StallGuard {
+    fn drop(&mut self) {
+        self.handle.abort();
+    }
+}
+
+/// Dump all thread stacks to `/tmp/nativelink-stall-<timestamp_ms>.txt`.
+///
+/// On Linux, reads `/proc/self/task/` to enumerate threads and collects
+/// thread name, wait channel, state, context switches, and kernel stack.
+///
+/// On non-Linux platforms, this is a no-op (logs a message).
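+///
+/// Besides the automatic `StallGuard` path, it can be invoked directly,
+/// e.g. from a hypothetical debug hook:
+///
+/// ```ignore
+/// dump_thread_stacks("manual-trigger");
+/// ```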
+pub fn dump_thread_stacks(label: &str) { + #[cfg(target_os = "linux")] + dump_thread_stacks_linux(label); + + #[cfg(not(target_os = "linux"))] + { + let timestamp = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_default() + .as_secs(); + error!( + "Thread dump not available on this platform (trigger: {label}, ts: {timestamp})" + ); + } +} + +#[cfg(target_os = "linux")] +fn dump_thread_stacks_linux(label: &str) { + use std::fmt::Write as _; + + let timestamp_ms = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_default() + .as_millis(); + let path = format!("/tmp/nativelink-stall-{timestamp_ms}.txt"); + let mut output = String::new(); + + let _ = writeln!(output, "=== STORE OPERATION STALL THREAD DUMP ==="); + let _ = writeln!(output, "Trigger: {label}"); + let _ = writeln!(output, "Timestamp: {timestamp_ms}"); + let _ = writeln!(output, "PID: {}", std::process::id()); + let _ = writeln!(output); + + let task_dir = "/proc/self/task"; + let entries = match std::fs::read_dir(task_dir) { + Ok(e) => e, + Err(err) => { + error!("Failed to read {task_dir}: {err}"); + return; + } + }; + + let mut tids: Vec<_> = entries + .filter_map(|e| e.ok()) + .filter_map(|e| e.file_name().to_str().map(String::from)) + .collect(); + tids.sort(); + + let _ = writeln!(output, "Thread count: {}", tids.len()); + let _ = writeln!(output); + + for tid in &tids { + let _ = writeln!(output, "--- TID {tid} ---"); + let base = format!("{task_dir}/{tid}"); + + // Thread name + if let Ok(comm) = std::fs::read_to_string(format!("{base}/comm")) { + let _ = write!(output, " comm: {comm}"); + } + // Wait channel (kernel function the thread is sleeping in) + if let Ok(wchan) = std::fs::read_to_string(format!("{base}/wchan")) { + let _ = writeln!(output, " wchan: {wchan}"); + } + // Status (state, voluntary/involuntary context switches) + if let Ok(status) = std::fs::read_to_string(format!("{base}/status")) { + for line in status.lines() { + if line.starts_with("State:") + || line.starts_with("voluntary_ctxt_switches:") + || line.starts_with("nonvoluntary_ctxt_switches:") + { + let _ = writeln!(output, " {line}"); + } + } + } + // Kernel stack (requires CAP_SYS_PTRACE or permissive ptrace_scope) + if let Ok(stack) = std::fs::read_to_string(format!("{base}/stack")) { + if !stack.trim().is_empty() { + let _ = writeln!(output, " kernel stack:"); + for line in stack.lines() { + let _ = writeln!(output, " {line}"); + } + } + } + let _ = writeln!(output); + } + + match std::fs::write(&path, &output) { + Ok(()) => error!("Thread dump written to {path}"), + Err(err) => error!("Failed to write thread dump to {path}: {err}"), + } + + // Capture userspace backtraces via eu-stack for full Rust call stacks. 
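+    // eu-stack ships with elfutils and may be absent; the Err arm below only
+    // logs, so the kernel-stack dump above is still produced either way.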
+ let bt_path = format!("/tmp/nativelink-stall-{timestamp_ms}-bt.txt"); + let pid = std::process::id(); + match std::process::Command::new("eu-stack") + .args(["-p", &pid.to_string(), "-l"]) + .output() + { + Ok(out) => { + let combined = [&out.stdout[..], b"\n--- stderr ---\n", &out.stderr[..]].concat(); + match std::fs::write(&bt_path, &combined) { + Ok(()) => error!("Userspace backtrace written to {bt_path}"), + Err(err) => error!("Failed to write backtrace to {bt_path}: {err}"), + } + } + Err(err) => error!("Failed to run eu-stack: {err}"), + } +} diff --git a/nativelink-util/src/store_trait.rs b/nativelink-util/src/store_trait.rs index b7be933da..b838aa794 100644 --- a/nativelink-util/src/store_trait.rs +++ b/nativelink-util/src/store_trait.rs @@ -25,14 +25,26 @@ use std::ffi::OsString; use std::sync::{Arc, OnceLock}; use async_trait::async_trait; -use bytes::{Bytes, BytesMut}; +use bytes::Bytes; use futures::{Future, FutureExt, Stream, join, try_join}; use nativelink_error::{Code, Error, ResultExt, error_if, make_err}; + +tokio::task_local! { + /// Set to `true` when the current CAS request originates from a worker + /// (not a client like Bazel). `WorkerProxyStore` checks this to decide + /// between proxying blob data (for clients) and returning a redirect + /// with peer endpoints (for workers). + pub static IS_WORKER_REQUEST: bool; +} + +/// Prefix for redirect errors returned by `WorkerProxyStore` to worker callers. +/// The remainder of the message is a comma-separated list of peer gRPC endpoints +/// that have the requested blob. Example: `"NL_REDIRECT:grpc://w1:50081,grpc://w2:50081"` +pub const REDIRECT_PREFIX: &str = "NL_REDIRECT:"; use nativelink_metric::MetricsComponent; use rand::rngs::StdRng; use rand::{RngCore, SeedableRng}; use serde::{Deserialize, Serialize}; -use tokio::io::{AsyncReadExt, AsyncSeekExt}; use tracing::warn; use crate::buf_channel::{DropCloserReadHalf, DropCloserWriteHalf, make_buf_channel_pair}; @@ -81,11 +93,12 @@ pub enum UploadSizeInfo { pub async fn slow_update_store_with_file( store: Pin<&S>, digest: impl Into>, - file: &mut fs::FileSlot, + mut file: fs::FileSlot, upload_size: UploadSizeInfo, -) -> Result<(), Error> { - file.rewind() - .await +) -> Result { + use std::io::Seek; + file.as_std_mut() + .seek(std::io::SeekFrom::Start(0)) .err_tip(|| "Failed to rewind in upload_file_to_store")?; let (mut tx, rx) = make_buf_channel_pair(); @@ -93,25 +106,17 @@ pub async fn slow_update_store_with_file( .update(digest.into(), rx, upload_size) .map(|r| r.err_tip(|| "Could not upload data to store in upload_file_to_store")); let read_data_fut = async move { - loop { - let mut buf = BytesMut::with_capacity(fs::DEFAULT_READ_BUFF_SIZE); - let read = file - .read_buf(&mut buf) - .await - .err_tip(|| "Failed to read in upload_file_to_store")?; - if read == 0 { - break; - } - tx.send(buf.freeze()) - .await - .err_tip(|| "Failed to send in upload_file_to_store")?; - } + let file = fs::read_file_to_channel(file, &mut tx, u64::MAX, fs::DEFAULT_READ_BUFF_SIZE) + .await + .err_tip(|| "Failed to read in upload_file_to_store")?; tx.send_eof() - .err_tip(|| "Could not send EOF to store in upload_file_to_store") + .err_tip(|| "Could not send EOF to store in upload_file_to_store")?; + Ok::<_, Error>(file) }; - tokio::pin!(read_data_fut); let (update_res, read_res) = tokio::join!(update_fut, read_data_fut); - update_res.merge(read_res) + update_res?; + let file = read_res?; + Ok(file) } /// Optimizations that stores may want to expose to the callers. 
@@ -389,11 +394,11 @@ impl Store { } #[inline] - pub fn register_remove_callback( + pub fn register_item_callback( &self, - callback: Arc, + callback: Arc, ) -> Result<(), Error> { - self.inner.clone().register_remove_callback(callback) + self.inner.clone().register_item_callback(callback) } } @@ -661,7 +666,7 @@ pub trait StoreDriver: self: Pin<&Self>, key: StoreKey<'_>, path: OsString, - mut file: fs::FileSlot, + file: fs::FileSlot, upload_size: UploadSizeInfo, ) -> Result, Error> { let inner_store = self.inner_store(Some(key.borrow())); @@ -674,7 +679,7 @@ pub trait StoreDriver: .update_with_whole_file(key, path, file, upload_size) .await; } - slow_update_store_with_file(self, key, &mut file, upload_size).await?; + let file = slow_update_store_with_file(self, key, file, upload_size).await?; Ok(Some(file)) } @@ -843,20 +848,21 @@ pub trait StoreDriver: // Register health checks used to monitor the store. fn register_health(self: Arc, _registry: &mut HealthRegistryBuilder) {} - fn register_remove_callback( + fn register_item_callback( self: Arc, - callback: Arc, + callback: Arc, ) -> Result<(), Error>; } -// Callback to be called when a store deletes an item. This is used so -// compound stores can remove items from their internal state when their -// underlying stores remove items e.g. caches -pub trait RemoveItemCallback: Debug + Send + Sync { +// Callback invoked when a store inserts or deletes an item. +pub trait ItemCallback: Debug + Send + Sync { fn callback<'a>( &'a self, store_key: StoreKey<'a>, ) -> Pin + Send + 'a>>; + + /// Called synchronously when a new item is inserted. + fn on_insert(&self, _store_key: StoreKey<'_>, _size: u64) {} } /// The instructions on how to decode a value from a Bytes & version into diff --git a/nativelink-util/src/tls_utils.rs b/nativelink-util/src/tls_utils.rs index 15f685861..71f198be0 100644 --- a/nativelink-util/src/tls_utils.rs +++ b/nativelink-util/src/tls_utils.rs @@ -120,6 +120,19 @@ pub fn endpoint_from( tonic::transport::Endpoint::from(endpoint) }; + // Always enable TCP_NODELAY to reduce latency on gRPC connections. + // Nagle's algorithm delays small writes (up to 40ms), which is + // harmful for gRPC's many small HTTP/2 frames. + let endpoint_transport = endpoint_transport.tcp_nodelay(true); + + // Set HTTP/2 flow-control windows to match the server defaults (16 MiB + // stream, 32 MiB connection). Tonic/h2 defaults to 64 KiB for both, + // which caps aggregate throughput per connection to ~128 MB/s at 0.5 ms + // RTT — far below 10 GbE capacity when many streams share a connection. 
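+    // (Derivation of the figure above: 64 KiB / 0.5 ms = 65_536 B / 0.0005 s
+    // ≈ 131 MB/s of payload per flow-control window, i.e. roughly 128 MB/s.)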
+    // Set HTTP/2 flow-control windows to match the server defaults (16 MiB
+    // stream, 32 MiB connection). Tonic/h2 defaults to 64 KiB for both,
+    // which caps aggregate throughput per connection to ~128 MB/s at 0.5 ms
+    // RTT — far below 10 GbE capacity when many streams share a connection.
+    let endpoint_transport = endpoint_transport
+        .initial_stream_window_size(16 * 1024 * 1024)
+        .initial_connection_window_size(32 * 1024 * 1024);
+
     Ok(endpoint_transport)
 }
 
@@ -162,10 +175,16 @@ pub fn endpoint(endpoint_config: &GrpcEndpoint) -> Result<tonic::transport::Endpoint, Error>
diff --git a/nativelink-util/tests/evicting_map_test.rs b/nativelink-util/tests/evicting_map_test.rs
@@ ... @@ ) -> Result<(), Error> {
     evicting_map.range(range, |k, v: &BytesWrapper| {
         found_values.push((k.clone(), v.0.clone()));
         true
-    });
+    }).await;
     found_values
 }
diff --git a/nativelink-worker/BUILD.bazel b/nativelink-worker/BUILD.bazel
index 5fcffff20..18166d1d3 100644
--- a/nativelink-worker/BUILD.bazel
+++ b/nativelink-worker/BUILD.bazel
@@ -26,12 +26,14 @@ rust_library(
         "//nativelink-error",
         "//nativelink-metric",
         "//nativelink-proto",
+        "//nativelink-service",
         "//nativelink-store",
         "//nativelink-util",
         "@crates//:bytes",
         "@crates//:filetime",
         "@crates//:formatx",
         "@crates//:futures",
+        "@crates//:hostname",
         "@crates//:opentelemetry",
         "@crates//:parking_lot",
         "@crates//:prost",
diff --git a/nativelink-worker/Cargo.toml b/nativelink-worker/Cargo.toml
index 500ab104e..c72dfe4e9 100644
--- a/nativelink-worker/Cargo.toml
+++ b/nativelink-worker/Cargo.toml
@@ -14,6 +14,7 @@ nativelink-config = { path = "../nativelink-config" }
 nativelink-error = { path = "../nativelink-error" }
 nativelink-metric = { path = "../nativelink-metric" }
 nativelink-proto = { path = "../nativelink-proto" }
+nativelink-service = { path = "../nativelink-service" }
 nativelink-store = { path = "../nativelink-store" }
 nativelink-util = { path = "../nativelink-util" }
 
@@ -22,9 +23,12 @@ bytes = { version = "1.10.1", default-features = false }
 filetime = { version = "0.2.25", default-features = false }
 formatx = { version = "0.2.3", default-features = false }
 futures = { version = "0.3.31", default-features = false }
-opentelemetry = { version = "0.29.1", default-features = false }
+hostname = { version = "0.4.0", default-features = false }
+libc = { version = "0.2", default-features = false }
+opentelemetry = { version = "0.31.0", default-features = false }
 parking_lot = { version = "0.12.3", default-features = false }
-prost = { version = "0.13.5", default-features = false }
+prost = { version = "0.14.3", default-features = false }
+prost-types = { version = "0.14.3", default-features = false }
 relative-path = { version = "2.0.0", default-features = false, features = [
     "alloc",
     "std",
@@ -43,9 +47,9 @@ tokio = { version = "1.44.1", features = [
 tokio-stream = { version = "0.1.17", default-features = false, features = [
     "fs",
 ] }
-tonic = { version = "0.13.0", features = [
+tonic = { version = "0.14.5", features = [
     "gzip",
-    "tls-ring",
+    "tls-aws-lc",
     "transport",
 ], default-features = false }
 tracing = { version = "0.1.41", default-features = false }
@@ -61,7 +65,6 @@ hyper = { version = "1.6.0", default-features = false }
 pretty_assertions = { version = "1.4.1", features = [
     "std",
 ], default-features = false }
-prost-types = { version = "0.13.5", default-features = false }
 rand = { version = "0.9.0", default-features = false, features = [
     "thread_rng",
 ] }
@@ -69,6 +72,7 @@ serial_test = { version = "3.2.0", features = [
     "async",
 ], default-features = false }
 tempfile = { version = "3.15.0", default-features = false }
+tonic-prost = { version = "0.14.5", default-features = false }
 tracing-test = { version = "0.2.5", default-features = false, features = [
     "no-env-filter",
 ] }
diff --git a/nativelink-worker/src/directory_cache.rs b/nativelink-worker/src/directory_cache.rs
index 8a016593c..64e90e68d 100644
--- a/nativelink-worker/src/directory_cache.rs
+++ b/nativelink-worker/src/directory_cache.rs
@@ -14,22 +14,134 @@
 use core::future::Future;
 use core::pin::Pin;
-use std::collections::HashMap;
+use std::collections::{HashMap, HashSet, VecDeque};
 use std::path::{Path, PathBuf};
 use std::sync::Arc;
-use std::time::SystemTime;
+use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering};
+use std::time::{Instant, SystemTime};
 
 use nativelink_error::{Code, Error, ResultExt, make_err};
 use nativelink_proto::build::bazel::remote::execution::v2::{
     Directory as ProtoDirectory, DirectoryNode, FileNode, SymlinkNode,
 };
 use nativelink_store::ac_utils::get_and_decode_digest;
+use nativelink_store::fast_slow_store::FastSlowStore;
+use nativelink_store::filesystem_store::{FileEntry, FilesystemStore};
 use nativelink_util::common::DigestInfo;
-use nativelink_util::fs_util::{hardlink_directory_tree, set_readonly_recursive};
+use nativelink_util::fs_util::hardlink_directory_tree;
 use nativelink_util::store_trait::{Store, StoreKey, StoreLike};
 use tokio::fs;
 use tokio::sync::{Mutex, RwLock};
-use tracing::{debug, trace, warn};
+use tracing::{debug, info, trace, warn};
+
+/// Name of the merkle tree metadata file stored alongside each cached directory.
+const MERKLE_METADATA_FILENAME: &str = ".merkle_tree_meta";
+
+/// Cache format version file. Bump when the on-disk format changes in a way
+/// that makes old entries invalid (e.g., permission semantics). On startup,
+/// if the version file is missing or stale, the entire cache is wiped.
+const CACHE_VERSION_FILENAME: &str = ".cache_version";
+/// Bump this when the cache format changes.
+const CACHE_FORMAT_VERSION: u32 = 5;
+
+/// Merkle tree metadata for a cached directory entry.
+///
+/// Stores the mapping from each directory digest in the tree to its relative
+/// path within the cached directory on disk. This allows us to index subtrees
+/// so that future cache misses can reuse already-cached subtrees via
+/// hardlinked copies.
+#[derive(Debug, Clone)]
+pub struct MerkleTreeMetadata {
+    /// Map from directory digest -> relative path within the cache entry.
+    /// For the root directory, the relative path is "" (empty string).
+    pub digest_to_relpath: HashMap<DigestInfo, String>,
+}
+
+impl MerkleTreeMetadata {
+    /// Serialize to a simple line-based text format:
+    /// `hash:size_bytes:relative_path\n`
+    fn serialize(&self) -> String {
+        let mut lines = Vec::with_capacity(self.digest_to_relpath.len());
+        for (digest, relpath) in &self.digest_to_relpath {
+            lines.push(format!("{}:{}:{}", digest.packed_hash(), digest.size_bytes(), relpath));
+        }
+        // Sort for deterministic output
+        lines.sort();
+        lines.join("\n")
+    }
+
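To make the wire format concrete, a hedged sample of what a serialized metadata file for a root with two subdirectories would contain, per the rules above (hashes shortened for display; real entries carry full hex digests, and the root's relative path is the empty string):

    0a1b…ff:4096:
    3c4d…aa:1024:src
    9e8f…bb:2048:src/gen

Sorting the lines, as serialize() does, keeps the file byte-stable for identical trees, which is what deserialize() below relies on being able to round-trip.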
+    /// Deserialize from the line-based text format.
+    fn deserialize(data: &str) -> Result<Self, Error> {
+        let mut digest_to_relpath = HashMap::new();
+        for line in data.lines() {
+            let line = line.trim();
+            if line.is_empty() {
+                continue;
+            }
+            // Format: hash:size_bytes:relative_path
+            // The relative path may contain colons, so split at most 3 parts.
+            let mut parts = line.splitn(3, ':');
+            let hash = parts.next().ok_or_else(|| {
+                make_err!(Code::Internal, "Missing hash in merkle metadata line: {line}")
+            })?;
+            let size_str = parts.next().ok_or_else(|| {
+                make_err!(Code::Internal, "Missing size in merkle metadata line: {line}")
+            })?;
+            let relpath = parts.next().unwrap_or("");
+
+            let size: i64 = size_str.parse().map_err(|e| {
+                make_err!(Code::Internal, "Invalid size in merkle metadata line: {line}: {e}")
+            })?;
+
+            let digest = DigestInfo::try_new(hash, size)
+                .err_tip(|| format!("Invalid digest in merkle metadata line: {line}"))?;
+
+            digest_to_relpath.insert(digest, relpath.to_string());
+        }
+        Ok(Self { digest_to_relpath })
+    }
+
+    /// Build merkle tree metadata by walking a resolved directory tree.
+    ///
+    /// `tree` is the map from digest -> Directory proto (as returned by
+    /// `resolve_directory_tree`). `root_digest` is the root of the tree.
+    ///
+    /// Returns a mapping from each directory digest to its relative path
+    /// within the cache entry (root = "").
+    fn from_directory_tree(
+        tree: &HashMap<DigestInfo, ProtoDirectory>,
+        root_digest: &DigestInfo,
+    ) -> Self {
+        let mut digest_to_relpath = HashMap::with_capacity(tree.len());
+        let mut queue = VecDeque::new();
+        queue.push_back((*root_digest, String::new()));
+
+        while let Some((digest, relpath)) = queue.pop_front() {
+            if digest_to_relpath.contains_key(&digest) {
+                continue; // Already visited (handles diamond dependencies)
+            }
+            digest_to_relpath.insert(digest, relpath.clone());
+
+            if let Some(dir) = tree.get(&digest) {
+                for subdir_node in &dir.directories {
+                    if let Some(child_digest) = subdir_node
+                        .digest
+                        .as_ref()
+                        .and_then(|d| DigestInfo::try_from(d).ok())
+                    {
+                        let child_relpath = if relpath.is_empty() {
+                            subdir_node.name.clone()
+                        } else {
+                            format!("{}/{}", relpath, subdir_node.name)
+                        };
+                        queue.push_back((child_digest, child_relpath));
+                    }
+                }
+            }
+        }
+
+        Self { digest_to_relpath }
+    }
+}
 
 /// Configuration for the directory cache
 #[derive(Debug, Clone)]
@@ -52,17 +164,30 @@ impl Default for DirectoryCacheConfig {
     }
 }
 
-/// Metadata for a cached directory
-#[derive(Debug, Clone)]
+/// Metadata for a cached directory.
+///
+/// `ref_count` and `last_access` use atomics so that the cache hit fast path
+/// only needs a *read* lock on the cache HashMap (no write lock contention).
+#[derive(Debug)]
 struct CachedDirectoryMetadata {
     /// Path to the cached directory
     path: PathBuf,
     /// Size in bytes
     size: u64,
-    /// Last access time for LRU eviction
-    last_access: SystemTime,
-    /// Reference count (number of active users)
-    ref_count: usize,
+    /// Last access time as duration-since-EPOCH in millis (atomic for read-lock access)
+    last_access_millis: AtomicU64,
+    /// Reference count (number of active hardlink operations in flight)
+    ref_count: AtomicUsize,
+}
+
+impl CachedDirectoryMetadata {
+    fn touch(&self) {
+        let millis = SystemTime::now()
+            .duration_since(SystemTime::UNIX_EPOCH)
+            .unwrap_or_default()
+            .as_millis() as u64;
+        self.last_access_millis.store(millis, Ordering::Relaxed);
+    }
 }
 
 /// High-performance directory cache that uses hardlinks to avoid repeated
@@ -75,21 +200,83 @@ struct CachedDirectoryMetadata {
 /// 3. If no, construct it once and cache for future use
 ///
 /// This dramatically reduces I/O and improves action startup time.
+///
+/// ## Security Note
+///
+/// Hardlinked files share inodes. If an action process has elevated privileges
+/// (e.g. root, `CAP_DAC_OVERRIDE`), it can bypass read-only permissions and
+/// modify cached files through the workspace hardlink, poisoning the cache for
+/// subsequent actions. For multi-tenant clusters, consider running actions in
+/// user namespaces or using copy-on-write (reflink) instead of hardlinks.
 #[derive(Debug)]
 pub struct DirectoryCache {
     /// Configuration
     config: DirectoryCacheConfig,
     /// Cache mapping digest -> metadata
     cache: Arc<RwLock<HashMap<DigestInfo, CachedDirectoryMetadata>>>,
-    /// Lock for cache construction to prevent stampedes
+    /// Per-digest construction locks to prevent stampedes.
+    ///
+    /// Protocol:
+    /// 1. A task entering construction clones the `Arc<Mutex<()>>`, incrementing
+    ///    strong_count to >= 2 (HashMap entry + task clone).
+    /// 2. On completion, if strong_count == 2 and the entry is still *our* Arc
+    ///    (checked via `Arc::ptr_eq`), no other task is waiting, so we remove it.
+    /// 3. If another task is waiting (strong_count > 2), we leave cleanup to the
+    ///    last finisher. The worst case of a missed cleanup is a stale empty Mutex
+    ///    in the HashMap, which is harmless.
     construction_locks: Arc<Mutex<HashMap<DigestInfo, Arc<Mutex<()>>>>>,
-    /// CAS store for fetching directories
+    /// CAS store for fetching directories (used as fallback in construct_directory_impl)
     cas_store: Store,
+    /// Concrete FastSlowStore for the fast `download_to_directory` path.
+    /// When available, cache-miss construction uses batch RPCs instead of
+    /// serial per-file fetches.
+    fast_slow_store: Option<Arc<FastSlowStore>>,
+    /// Concrete FilesystemStore (the fast store inside FastSlowStore).
+    /// Required for hardlinking files from the CAS to the cache directory.
+    filesystem_store: Option<Arc<FilesystemStore>>,
+    /// Subtree index: maps each directory digest to its absolute path on disk
+    /// within a cached entry. This allows partial reuse of cached subtrees
+    /// when a new root digest is requested that shares subtrees with an
+    /// already-cached root.
+    ///
+    /// Updated when cache entries are inserted or evicted.
+    subtree_index: RwLock<HashMap<DigestInfo, PathBuf>>,
+    /// Reference count for each subtree digest across all cached entries.
+    /// When a digest's count drops to zero, it is truly removed and should
+    /// be reported in the "removed" delta.
+    subtree_refcount: RwLock<HashMap<DigestInfo, usize>>,
+    /// Pending subtree digest changes since the last `take_pending_subtree_changes()` call.
+    /// Protected by a Mutex for interior mutability from insertion/eviction paths.
+    pending_subtree_changes: Mutex<PendingSubtreeChanges>,
+    /// Cumulative hit count for stats logging
+    hit_count: AtomicU64,
+    /// Cumulative miss count for stats logging
+    miss_count: AtomicU64,
+    /// Cumulative subtree hit count for stats logging
+    subtree_hit_count: AtomicU64,
+}
+
+/// Accumulated subtree digest changes between periodic reports.
+#[derive(Debug, Default)]
+pub struct PendingSubtreeChanges {
+    /// Subtree digests added since last report.
+    pub added: HashSet<DigestInfo>,
+    /// Subtree digests removed since last report (only those no longer in ANY cached entry).
+    pub removed: HashSet<DigestInfo>,
 }
 
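The strong-count protocol documented on construction_locks is compact but subtle, so here is the same idea in isolation, with a String key standing in for DigestInfo and the construction step reduced to a comment. This is a sketch of the pattern, not the patch's code:

    use std::collections::HashMap;
    use std::sync::Arc;
    use tokio::sync::Mutex;

    async fn with_key_lock(locks: &Mutex<HashMap<String, Arc<Mutex<()>>>>, key: &str) {
        // Step 1: clone the Arc out of the map (strong_count becomes >= 2).
        let lock = locks
            .lock()
            .await
            .entry(key.to_string())
            .or_insert_with(|| Arc::new(Mutex::new(())))
            .clone();
        let guard = lock.lock().await;
        // ... construct the cache entry while holding the per-key guard ...
        drop(guard);
        // Steps 2-3: only the last finisher (map entry + our clone == 2),
        // and only if the entry is still ours, removes the map entry.
        let mut map = locks.lock().await;
        if let Some(existing) = map.get(key) {
            if Arc::ptr_eq(existing, &lock) && Arc::strong_count(&lock) <= 2 {
                map.remove(key);
            }
        }
    }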
 impl DirectoryCache {
-    /// Creates a new `DirectoryCache`
-    pub async fn new(config: DirectoryCacheConfig, cas_store: Store) -> Result<Self, Error> {
+    /// Creates a new `DirectoryCache`.
+    ///
+    /// If `fast_slow_store` is provided, cache-miss construction will use the
+    /// fast batch `download_to_directory` path (GetTree + BatchReadBlobs +
+    /// parallel hardlinks). Otherwise falls back to the serial
+    /// `construct_directory_impl` method.
+    pub async fn new(
+        config: DirectoryCacheConfig,
+        cas_store: Store,
+        fast_slow_store: Option<Arc<FastSlowStore>>,
+    ) -> Result<Self, Error> {
         // Ensure cache root exists
         fs::create_dir_all(&config.cache_root).await.err_tip(|| {
             format!(
@@ -98,59 +285,291 @@ impl DirectoryCache {
             )
         })?;
 
+        // Try to extract the FilesystemStore from the FastSlowStore if provided.
+        let filesystem_store = fast_slow_store.as_ref().and_then(|fss| {
+            fss.fast_store()
+                .downcast_ref::<FilesystemStore>(None)
+                .and_then(|fs| fs.get_arc())
+        });
+
+        let has_fast_path = fast_slow_store.is_some() && filesystem_store.is_some();
+
+        if has_fast_path {
+            info!(
+                cache_root = %config.cache_root.display(),
+                max_entries = config.max_entries,
+                max_size_bytes = config.max_size_bytes,
+                fast_path = true,
+                "DirectoryCache initialized: using fast download_to_directory path for cache misses",
+            );
+        } else if fast_slow_store.is_some() {
+            warn!(
+                cache_root = %config.cache_root.display(),
+                max_entries = config.max_entries,
+                max_size_bytes = config.max_size_bytes,
+                "DirectoryCache initialized: FastSlowStore provided but could not extract FilesystemStore; falling back to serial construction",
+            );
+        } else {
+            info!(
+                cache_root = %config.cache_root.display(),
+                max_entries = config.max_entries,
+                max_size_bytes = config.max_size_bytes,
+                fast_path = false,
+                "DirectoryCache initialized: no FastSlowStore, using serial construction",
+            );
+        }
+
+        let mut initial_cache = HashMap::new();
+        let mut initial_subtree_index = HashMap::new();
+        let mut initial_subtree_refcount: HashMap<DigestInfo, usize> = HashMap::new();
+
+        // Check cache format version. If stale or missing, wipe the cache.
+        let version_path = config.cache_root.join(CACHE_VERSION_FILENAME);
+        let version_ok = match fs::read_to_string(&version_path).await {
+            Ok(v) => v.trim().parse::<u32>().ok() == Some(CACHE_FORMAT_VERSION),
+            Err(_) => false,
+        };
+        if !version_ok {
+            info!(
+                expected = CACHE_FORMAT_VERSION,
+                "DirectoryCache: format version mismatch, clearing stale entries",
+            );
+            if let Ok(mut entries) = fs::read_dir(&config.cache_root).await {
+                while let Ok(Some(entry)) = entries.next_entry().await {
+                    let p = entry.path();
+                    if let Ok(meta) = fs::symlink_metadata(&p).await {
+                        if meta.is_dir() {
+                            // Only chmod directories writable, not files (which
+                            // are hardlinked to CAS). On unix, directory write
+                            // permission is sufficient to unlink files.
+                            Self::remove_readonly_dir(&p).await;
+                        } else {
+                            drop(fs::remove_file(&p).await);
+                        }
+                    }
+                }
+            }
+            fs::write(&version_path, format!("{CACHE_FORMAT_VERSION}\n"))
+                .await
+                .err_tip(|| "Failed to write cache version file")?;
+        }
+
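For orientation, the on-disk layout that the startup scan below expects looks roughly like this. Entry names follow DigestInfo's `{hash}-{size_bytes}` form; the hashes shown here are shortened for display:

    <cache_root>/
    ├── .cache_version            # "5\n" after the check above
    ├── .tmp-<digest>-<pid>-<n>/  # leftover construction attempt, skipped
    └── 3f9a…c21-4096/            # one cached input root
        ├── .merkle_tree_meta     # digest -> relpath index for subtree reuse
        └── src/…                 # hardlinked, read-only tree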
+        // Load existing cache entries from disk on startup.
+        let load_start = Instant::now();
+        let mut loaded_count = 0u64;
+        let mut loaded_subtrees = 0u64;
+        let mut loaded_errors = 0u64;
+        if let Ok(mut entries) = fs::read_dir(&config.cache_root).await {
+            while let Ok(Some(entry)) = entries.next_entry().await {
+                let entry_name = entry.file_name().to_string_lossy().to_string();
+                // Skip temp directories and the merkle metadata files
+                if entry_name.starts_with(".tmp-") || entry_name == MERKLE_METADATA_FILENAME {
+                    continue;
+                }
+                let entry_path = entry.path();
+                let Ok(metadata) = fs::symlink_metadata(&entry_path).await else {
+                    continue;
+                };
+                if !metadata.is_dir() {
+                    continue;
+                }
+
+                // Try to parse the entry name as a DigestInfo
+                let Some(digest) = Self::parse_digest_from_dirname(&entry_name) else {
+                    debug!(name = %entry_name, "Skipping non-digest cache directory entry");
+                    continue;
+                };
+
+                // Calculate the directory size
+                let size = match Self::set_readonly_and_calculate_size(&entry_path).await {
+                    Ok(s) => s,
+                    Err(e) => {
+                        warn!(
+                            name = %entry_name,
+                            ?e,
+                            "Failed to calculate size for existing cache entry, skipping",
+                        );
+                        loaded_errors += 1;
+                        continue;
+                    }
+                };
+
+                // Load merkle tree metadata if available
+                let merkle_path = entry_path.join(MERKLE_METADATA_FILENAME);
+                if let Ok(data) = fs::read_to_string(&merkle_path).await {
+                    match MerkleTreeMetadata::deserialize(&data) {
+                        Ok(merkle) => {
+                            for (sub_digest, relpath) in &merkle.digest_to_relpath {
+                                let abs_path = if relpath.is_empty() {
+                                    entry_path.clone()
+                                } else {
+                                    entry_path.join(relpath)
+                                };
+                                initial_subtree_index.insert(*sub_digest, abs_path);
+                                *initial_subtree_refcount.entry(*sub_digest).or_insert(0) += 1;
+                                loaded_subtrees += 1;
+                            }
+                        }
+                        Err(e) => {
+                            debug!(
+                                name = %entry_name,
+                                ?e,
+                                "Failed to parse merkle metadata, subtrees won't be indexed",
+                            );
+                        }
+                    }
+                }
+
+                let now_millis = SystemTime::now()
+                    .duration_since(SystemTime::UNIX_EPOCH)
+                    .unwrap_or_default()
+                    .as_millis() as u64;
+
+                initial_cache.insert(
+                    digest,
+                    CachedDirectoryMetadata {
+                        path: entry_path,
+                        size,
+                        last_access_millis: AtomicU64::new(now_millis),
+                        ref_count: AtomicUsize::new(0),
+                    },
+                );
+                loaded_count += 1;
+            }
+        }
+
+        let load_elapsed = load_start.elapsed();
+        if loaded_count > 0 || loaded_errors > 0 {
+            info!(
+                loaded_entries = loaded_count,
+                loaded_subtrees,
+                load_errors = loaded_errors,
+                elapsed_ms = load_elapsed.as_millis() as u64,
+                "DirectoryCache: loaded existing entries from disk on startup",
+            );
+        }
+
         Ok(Self {
             config,
-            cache: Arc::new(RwLock::new(HashMap::new())),
+            cache: Arc::new(RwLock::new(initial_cache)),
             construction_locks: Arc::new(Mutex::new(HashMap::new())),
             cas_store,
+            fast_slow_store,
+            filesystem_store,
+            subtree_index: RwLock::new(initial_subtree_index),
+            subtree_refcount: RwLock::new(initial_subtree_refcount),
+            pending_subtree_changes: Mutex::new(PendingSubtreeChanges::default()),
+            hit_count: AtomicU64::new(0),
+            miss_count: AtomicU64::new(0),
+            subtree_hit_count: AtomicU64::new(0),
        })
     }
 
-    /// Gets or creates a directory in the cache, then hardlinks it to the destination
+    /// Returns the digests of all currently cached input root directories.
+    /// The scheduler uses this to give routing preference to workers that
+    /// already have an action's input_root_digest cached.
+    pub async fn cached_digests(&self) -> Vec<DigestInfo> {
+        let cache = self.cache.read().await;
+        cache.keys().copied().collect()
+    }
+
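The snapshot and delta accessors that follow are shaped for a periodic reporting loop. A hedged sketch of such a consumer; the Update enum, the send stub, and the 5-second period are ours, and only all_subtree_digests and take_pending_subtree_changes come from the patch:

    enum Update {
        Snapshot(Vec<DigestInfo>),
        Delta { added: Vec<DigestInfo>, removed: Vec<DigestInfo> },
    }

    fn send(_update: Update) { /* stand-in for the worker->scheduler channel */ }

    async fn report_loop(cache: &DirectoryCache) {
        // On (re)connect: one full snapshot so the scheduler can rebuild state.
        send(Update::Snapshot(cache.all_subtree_digests().await));
        loop {
            tokio::time::sleep(std::time::Duration::from_secs(5)).await;
            // Deltas are drained atomically, so nothing is reported twice.
            let (added, removed) = cache.take_pending_subtree_changes().await;
            if !added.is_empty() || !removed.is_empty() {
                send(Update::Delta { added, removed });
            }
        }
    }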
+    /// Returns ALL subtree digests currently tracked across all cached entries.
+    /// Used for the initial full snapshot on (re)connect.
+    pub async fn all_subtree_digests(&self) -> Vec<DigestInfo> {
+        let refcount = self.subtree_refcount.read().await;
+        refcount.keys().copied().collect()
+    }
+
+    /// Atomically takes the pending subtree changes since the last call,
+    /// returning (added, removed) digest lists and clearing the internal state.
+    pub async fn take_pending_subtree_changes(&self) -> (Vec<DigestInfo>, Vec<DigestInfo>) {
+        let mut pending = self.pending_subtree_changes.lock().await;
+        let added: Vec<DigestInfo> = pending.added.drain().collect();
+        let removed: Vec<DigestInfo> = pending.removed.drain().collect();
+        (added, removed)
+    }
+
+    /// Records that subtree digests from a merkle tree were added (new cache entry).
+    /// Increments refcounts and records newly-appearing digests in pending added.
+    async fn record_subtree_insertion(&self, merkle: &MerkleTreeMetadata) {
+        let mut refcount = self.subtree_refcount.write().await;
+        let mut pending = self.pending_subtree_changes.lock().await;
+        for sub_digest in merkle.digest_to_relpath.keys() {
+            let count = refcount.entry(*sub_digest).or_insert(0);
+            if *count == 0 {
+                // This digest is newly appearing across all cached entries.
+                pending.added.insert(*sub_digest);
+                // If it was in the removed set (evicted then re-added before
+                // the delta was taken), cancel it out.
+                pending.removed.remove(sub_digest);
+            }
+            *count += 1;
+        }
+    }
+
+    /// Records that subtree digests from a merkle tree were removed (evicted cache entry).
+    /// Decrements refcounts and records fully-removed digests in pending removed.
+    async fn record_subtree_removal(&self, merkle_digests: &[DigestInfo]) {
+        let mut refcount = self.subtree_refcount.write().await;
+        let mut pending = self.pending_subtree_changes.lock().await;
+        for sub_digest in merkle_digests {
+            if let Some(count) = refcount.get_mut(sub_digest) {
+                *count = count.saturating_sub(1);
+                if *count == 0 {
+                    refcount.remove(sub_digest);
+                    // This digest is no longer in ANY cached entry.
+                    pending.removed.insert(*sub_digest);
+                    // If it was in the added set (added then evicted before
+                    // the delta was taken), cancel it out.
+                    pending.added.remove(sub_digest);
+                }
+            }
+        }
+    }
+
+    /// Gets or creates a directory in the cache, then hardlinks it to the destination.
     ///
     /// # Arguments
     /// * `digest` - Digest of the root Directory proto
-    /// * `dest_path` - Where to hardlink/create the directory
+    /// * `dest_path` - Where to hardlink/create the directory (may already exist)
     ///
     /// # Returns
     /// * `Ok(true)` - Cache hit (directory was hardlinked)
-    /// * `Ok(false)` - Cache miss (directory was constructed)
+    /// * `Ok(false)` - Cache miss (directory was constructed and cached)
     /// * `Err` - Error during construction or hardlinking
     pub async fn get_or_create(&self, digest: DigestInfo, dest_path: &Path) -> Result<bool, Error> {
-        // Fast path: check if already in cache
-        {
-            let mut cache = self.cache.write().await;
-            if let Some(metadata) = cache.get_mut(&digest) {
-                // Update access time and ref count
-                metadata.last_access = SystemTime::now();
-                metadata.ref_count += 1;
-
-                debug!(
-                    ?digest,
-                    path = ?metadata.path,
-                    "Directory cache HIT"
-                );
-
-                // Try to hardlink from cache
-                match hardlink_directory_tree(&metadata.path, dest_path).await {
-                    Ok(()) => {
-                        metadata.ref_count -= 1;
-                        return Ok(true);
-                    }
-                    Err(e) => {
-                        warn!(
-                            ?digest,
-                            error = ?e,
-                            "Failed to hardlink from cache, will reconstruct"
-                        );
-                        metadata.ref_count -= 1;
-                        // Fall through to reconstruction
-                    }
-                }
-            }
+        let overall_start = Instant::now();
+
+        // Fast path: check if already in cache (read lock only for the lookup)
+        if self.try_hardlink_cached(&digest, dest_path).await? {
+            let hits = self.hit_count.fetch_add(1, Ordering::Relaxed) + 1;
+            let misses = self.miss_count.load(Ordering::Relaxed);
+            let total = hits + misses;
+            let hit_rate = if total > 0 { (hits as f64 / total as f64) * 100.0 } else { 0.0 };
+            debug!(
+                hash = %&digest.packed_hash().to_string()[..12],
+                elapsed_ms = overall_start.elapsed().as_millis() as u64,
+                hits,
+                misses,
+                hit_rate = format!("{hit_rate:.1}%"),
+                "DirectoryCache HIT (hardlinked from cache)",
+            );
+            return Ok(true);
         }
 
-        debug!(?digest, "Directory cache MISS");
+        let misses = self.miss_count.fetch_add(1, Ordering::Relaxed) + 1;
+        let hits = self.hit_count.load(Ordering::Relaxed);
+        let total = hits + misses;
+        let hit_rate = if total > 0 { (hits as f64 / total as f64) * 100.0 } else { 0.0 };
+        debug!(
+            hash = %&digest.packed_hash().to_string()[..12],
+            size_bytes = digest.size_bytes(),
+            hits,
+            misses,
+            hit_rate = format!("{hit_rate:.1}%"),
+            has_fast_path = self.fast_slow_store.is_some() && self.filesystem_store.is_some(),
+            "DirectoryCache MISS, starting construction",
+        );
 
         // Get or create construction lock to prevent stampede
         let construction_lock = {
@@ -164,163 +583,1077 @@ impl DirectoryCache {
 
         // Only one task constructs at a time for this digest
         let _guard = construction_lock.lock().await;
 
-        // Check again in case another task just constructed it
-        {
-            let cache = self.cache.read().await;
-            if let Some(metadata) = cache.get(&digest) {
-                return match hardlink_directory_tree(&metadata.path, dest_path).await {
-                    Ok(()) => Ok(true),
+        // Double-check after acquiring lock — another task may have just constructed it
+        if self.try_hardlink_cached(&digest, dest_path).await? {
+            self.cleanup_construction_lock(&digest, &construction_lock);
+            return Ok(true);
+        }
+
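A quick caller's-eye view of the contract documented above, before the construction machinery takes over. The names here (dir_cache, input_root_digest, the workspace path) are illustrative stand-ins:

    // Inside some action-setup function returning Result<(), Error>:
    let cache_hit = dir_cache
        .get_or_create(input_root_digest, Path::new("/work/ws/input_root"))
        .await?;
    if cache_hit {
        // The whole input tree arrived as hardlinks from the local cache:
        // no CAS round trips, no data copied.
    } else {
        // First sighting: the tree was built once under the cache root and
        // hardlinked out; the next identical action takes the branch above.
    }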
+        // Construct in a temp path, rename to final path on success.
+        // This prevents orphaned partial directories on failure.
+        let cache_path = self.get_cache_path(&digest);
+        let temp_path = self.config.cache_root.join(format!(
+            ".tmp-{digest}-{}-{}",
+            std::process::id(),
+            self.next_temp_id(),
+        ));
+
+        // Clean up any stale temp path from a previous crashed attempt
+        drop(fs::remove_dir_all(&temp_path).await);
+
+        let construction_result: Result<u64, Error> = async {
+            fs::create_dir_all(&temp_path).await.err_tip(|| {
+                format!("Failed to create temp dir: {}", temp_path.display())
+            })?;
+
+            // Step 1: Resolve the merkle tree if we have a FastSlowStore.
+            // This gives us the full directory tree structure, which we use for:
+            //   (a) subtree matching against the subtree_index
+            //   (b) storing merkle metadata alongside the cache entry
+            let resolved_tree = if let Some(fss) = &self.fast_slow_store {
+                match crate::running_actions_manager::resolve_directory_tree(fss, &digest).await {
+                    Ok(tree) => Some(tree),
                     Err(e) => {
                         warn!(
-                            ?digest,
-                            error = ?e,
-                            "Failed to hardlink after construction"
+                            hash = %&digest.packed_hash().to_string()[..12],
+                            ?e,
+                            "DirectoryCache: failed to resolve directory tree, skipping subtree matching",
                         );
-                        // Construct directly at dest_path
-                        self.construct_directory(digest, dest_path).await?;
-                        Ok(false)
+                        None
                     }
-                };
+                }
+            } else {
+                None
+            };
+
+            // Step 2: Check for cached subtrees and construct a partial build plan.
+            // A "subtree hit" means a directory node in the requested tree is
+            // already materialized on disk from a different cached root. We can
+            // hardlink its files instead of downloading them.
+            let subtree_hits: HashMap<DigestInfo, PathBuf> = if let Some(tree) = &resolved_tree {
+                let index = self.subtree_index.read().await;
+                let mut hits = HashMap::new();
+                for dir_digest in tree.keys() {
+                    // Don't count the root itself (that's a full cache hit, handled above)
+                    if *dir_digest == digest {
+                        continue;
+                    }
+                    if let Some(cached_path) = index.get(dir_digest) {
+                        // Verify the cached path still exists on disk
+                        if cached_path.exists() {
+                            hits.insert(*dir_digest, cached_path.clone());
+                        }
+                    }
+                }
+                hits
+            } else {
+                HashMap::new()
+            };
+
+            if !subtree_hits.is_empty() {
+                let subtree_count = subtree_hits.len();
+                let total_dirs = resolved_tree.as_ref().map_or(0, |t| t.len());
+                self.subtree_hit_count.fetch_add(subtree_count as u64, Ordering::Relaxed);
+                debug!(
+                    hash = %&digest.packed_hash().to_string()[..12],
+                    subtree_hits = subtree_count,
+                    total_dirs,
+                    "DirectoryCache: found cached subtrees, will hardlink instead of downloading",
+                );
             }
-        }
 
-        // Construct the directory in cache
-        let cache_path = self.get_cache_path(&digest);
-        self.construct_directory(digest, &cache_path).await?;
+            // Step 3: Build the directory tree.
+            // If we have subtree hits and a resolved tree, use subtree-aware
+            // construction. Otherwise, fall back to full construction.
+            if let Some(tree) = &resolved_tree {
+                if !subtree_hits.is_empty() {
+                    // Subtree-aware construction: walk the tree, hardlink cached
+                    // subtrees, and only download uncached portions.
+                    self.construct_with_subtrees(
+                        &digest,
+                        tree,
+                        &subtree_hits,
+                        &temp_path,
+                    )
+                    .await
+                    .err_tip(|| "Failed subtree-aware construction")?;
+                } else {
+                    // No subtree hits -- use fast download_to_directory if available.
+                    self.construct_full(&digest, &temp_path).await
+                        .err_tip(|| "Failed full construction")?;
+                }
+            } else {
+                // No resolved tree -- use full construction.
+ self.construct_full(&digest, &temp_path).await + .err_tip(|| "Failed full construction (no resolved tree)")?; + } - // Make it read-only to prevent modifications - set_readonly_recursive(&cache_path) - .await - .err_tip(|| "Failed to set cache directory to readonly")?; + // Step 4: Store merkle tree metadata alongside the cache entry. + if let Some(tree) = &resolved_tree { + let merkle_meta = MerkleTreeMetadata::from_directory_tree(tree, &digest); + let merkle_path = temp_path.join(MERKLE_METADATA_FILENAME); + let serialized = merkle_meta.serialize(); + if let Err(e) = fs::write(&merkle_path, serialized.as_bytes()).await { + warn!( + hash = %&digest.packed_hash().to_string()[..12], + ?e, + "DirectoryCache: failed to write merkle metadata, subtrees won't be indexed", + ); + } + } - // Calculate size - let size = nativelink_util::fs_util::calculate_directory_size(&cache_path) - .await - .err_tip(|| "Failed to calculate directory size")?; + // Combined walk: set read-only permissions and calculate size in one pass. + let readonly_start = Instant::now(); + let size = Self::set_readonly_and_calculate_size(&temp_path).await + .err_tip(|| "Failed to set readonly and calculate size for cache directory")?; + debug!( + hash = %&digest.packed_hash().to_string()[..12], + size_bytes = size, + size_mb = format!("{:.2}", size as f64 / (1024.0 * 1024.0)), + elapsed_ms = readonly_start.elapsed().as_millis() as u64, + "DirectoryCache: set_readonly_and_calculate_size completed", + ); + // macOS requires the source directory to be writable for rename(2). + // Temporarily restore write permission on the root, rename, then + // lock it down again. + #[cfg(unix)] + { + use std::os::unix::fs::PermissionsExt; + let mut perms = fs::metadata(&temp_path).await + .err_tip(|| "Failed to get temp dir metadata before rename")? + .permissions(); + perms.set_mode(0o755); + fs::set_permissions(&temp_path, perms).await + .err_tip(|| "Failed to make temp dir writable before rename")?; + } + fs::rename(&temp_path, &cache_path).await.err_tip(|| { + format!( + "Failed to rename temp dir {} to cache path {}", + temp_path.display(), + cache_path.display() + ) + })?; + #[cfg(unix)] + { + use std::os::unix::fs::PermissionsExt; + let mut perms = fs::metadata(&cache_path).await + .err_tip(|| "Failed to get cache dir metadata after rename")? + .permissions(); + perms.set_mode(0o555); + fs::set_permissions(&cache_path, perms).await + .err_tip(|| "Failed to lock down cache dir after rename")?; + } - // Add to cache - { - let mut cache = self.cache.write().await; + // Step 5: Update the subtree index with all directories from this entry, + // and record the insertion for delta reporting. 
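An aside on the chmod/rename/chmod sequence above: it is the classic build-then-publish idiom, made atomic by rename(2) within one filesystem. Reduced to its core, with illustrative paths and payload (error handling kept minimal):

    use std::path::Path;
    use tokio::fs;

    async fn build_and_publish(root: &Path, name: &str) -> std::io::Result<()> {
        let temp = root.join(format!(".tmp-{name}"));
        let fin = root.join(name);
        fs::create_dir_all(&temp).await?;
        fs::write(temp.join("payload"), b"...").await?; // construct the entry
        // rename(2) is atomic on one filesystem: concurrent readers see either
        // no entry at all or a complete one, never a half-built directory.
        fs::rename(&temp, &fin).await
    }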
+ if let Some(tree) = &resolved_tree { + let merkle_meta = MerkleTreeMetadata::from_directory_tree(tree, &digest); + let mut index = self.subtree_index.write().await; + for (sub_digest, relpath) in &merkle_meta.digest_to_relpath { + let abs_path = if relpath.is_empty() { + cache_path.clone() + } else { + cache_path.join(relpath) + }; + index.insert(*sub_digest, abs_path); + } + drop(index); + self.record_subtree_insertion(&merkle_meta).await; + } - // Evict if necessary - self.evict_if_needed(size, &mut cache).await?; + Ok(size) + } + .await; + + let size = match construction_result { + Ok(s) => s, + Err(e) => { + warn!( + hash = %&digest.packed_hash().to_string()[..12], + ?e, + elapsed_ms = overall_start.elapsed().as_millis() as u64, + "DirectoryCache MISS construction FAILED", + ); + Self::remove_readonly_dir(&temp_path).await; + self.cleanup_construction_lock(&digest, &construction_lock); + return Err(e); + } + }; + // Insert with ref_count=1 to prevent eviction during hardlink. + // Collect eviction candidates while holding the lock, then delete outside. + let (evicted_paths, cache_entries, cache_total_size) = { + let mut cache = self.cache.write().await; + let evicted = self.collect_evictions(size, &mut cache); cache.insert( digest, CachedDirectoryMetadata { path: cache_path.clone(), size, - last_access: SystemTime::now(), - ref_count: 0, + last_access_millis: AtomicU64::new( + SystemTime::now() + .duration_since(SystemTime::UNIX_EPOCH) + .unwrap_or_default() + .as_millis() as u64, + ), + ref_count: AtomicUsize::new(1), }, ); + let total_size: u64 = cache.values().map(|m| m.size).sum(); + (evicted, cache.len(), total_size) + }; + + debug!( + hash = %&digest.packed_hash().to_string()[..12], + size_bytes = size, + size_mb = format!("{:.2}", size as f64 / (1024.0 * 1024.0)), + cache_entries, + cache_total_size_mb = format!("{:.2}", cache_total_size as f64 / (1024.0 * 1024.0)), + evicted_count = evicted_paths.len(), + elapsed_ms = overall_start.elapsed().as_millis() as u64, + "DirectoryCache MISS construction complete, inserted into cache", + ); + + // Delete evicted directories outside the lock. + // Cached directories are read-only (0o555/0o444), so we must make them + // writable before removal. Also clean up the subtree index. 
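Two invariants in the insertion path above deserve emphasis, sketched here with stand-in names (cache_lock, collect_evictions, remove_readonly_dir mirror the patch's items): a fresh entry enters the map with ref_count = 1 so it cannot be chosen as a victim before its first hardlink finishes, and eviction only does bookkeeping under the write lock, deferring disk I/O until after the lock is dropped:

    let victims = {
        let mut cache = cache_lock.write().await; // short critical section
        let victims = collect_evictions(incoming_size, &mut cache);
        cache.insert(digest, new_entry_with_refcount_one());
        victims
    }; // write lock released here
    for path in victims {
        remove_readonly_dir(&path).await; // slow disk I/O, no lock held
    }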
+        if !evicted_paths.is_empty() {
+            let mut index = self.subtree_index.write().await;
+            for path in &evicted_paths {
+                self.remove_subtree_index_for_path(path, &mut index).await;
+            }
+            drop(index);
+            for path in evicted_paths {
+                Self::remove_readonly_dir(&path).await;
+            }
         }
 
-        // Hardlink to destination
-        hardlink_directory_tree(&cache_path, dest_path)
-            .await
-            .err_tip(|| "Failed to hardlink newly cached directory")?;
+        // Hardlink to destination (safe — ref_count=1 prevents eviction)
+        let hardlink_start = Instant::now();
+        let hardlink_result = hardlink_directory_tree(&cache_path, dest_path).await;
+        let hardlink_elapsed = hardlink_start.elapsed();
+
+        // Decrement ref_count regardless of hardlink result
+        {
+            let cache = self.cache.read().await;
+            if let Some(metadata) = cache.get(&digest) {
+                metadata.ref_count.fetch_sub(1, Ordering::Relaxed);
+            }
+        }
+
+        // Drop the construction lock guard before cleanup
+        drop(_guard);
+        self.cleanup_construction_lock(&digest, &construction_lock);
+
+        match &hardlink_result {
+            Ok(()) => {
+                debug!(
+                    hash = %&digest.packed_hash().to_string()[..12],
+                    hardlink_ms = hardlink_elapsed.as_millis() as u64,
+                    total_ms = overall_start.elapsed().as_millis() as u64,
+                    "DirectoryCache: hardlinked newly constructed directory to dest",
+                );
+            }
+            Err(e) => {
+                warn!(
+                    hash = %&digest.packed_hash().to_string()[..12],
+                    ?e,
+                    hardlink_ms = hardlink_elapsed.as_millis() as u64,
+                    "DirectoryCache: failed to hardlink newly constructed directory to dest",
+                );
+            }
+        }
+
+        hardlink_result.err_tip(|| "Failed to hardlink newly cached directory")?;
 
         Ok(false)
     }
 
-    /// Constructs a directory from the CAS at the given path
-    fn construct_directory<'a>(
-        &'a self,
-        digest: DigestInfo,
-        dest_path: &'a Path,
-    ) -> Pin<Box<dyn Future<Output = Result<(), Error>> + Send + 'a>> {
-        Box::pin(async move {
-            debug!(?digest, ?dest_path, "Constructing directory");
+    /// Attempts to hardlink a cached directory to dest, guarding eviction with ref_count.
+    /// Returns `Ok(true)` on cache hit + successful hardlink, `Ok(false)` on cache miss
+    /// or failed hardlink (caller should fall through to reconstruction).
+    async fn try_hardlink_cached(
+        &self,
+        digest: &DigestInfo,
+        dest_path: &Path,
+    ) -> Result<bool, Error> {
+        let (src_path, cached_size) = {
+            // Read lock is sufficient — ref_count and last_access are atomic.
+            let cache = self.cache.read().await;
+            let Some(metadata) = cache.get(digest) else {
+                debug!(
+                    hash = %&digest.packed_hash().to_string()[..12],
+                    "DirectoryCache: not in cache (miss)",
+                );
+                return Ok(false);
+            };
+            metadata.touch();
+            metadata.ref_count.fetch_add(1, Ordering::Relaxed);
+            (metadata.path.clone(), metadata.size)
+        };
 
-            // Fetch the Directory proto
-            let directory: ProtoDirectory = get_and_decode_digest(&self.cas_store, digest.into())
-                .await
-                .err_tip(|| format!("Failed to fetch directory digest: {digest:?}"))?;
+        debug!(
+            hash = %&digest.packed_hash().to_string()[..12],
+            cached_size_bytes = cached_size,
+            "DirectoryCache: found in cache, hardlinking",
+        );
 
-            // Create the destination directory
-            fs::create_dir_all(dest_path)
-                .await
-                .err_tip(|| format!("Failed to create directory: {}", dest_path.display()))?;
+        let hardlink_start = Instant::now();
+        let result = hardlink_directory_tree(&src_path, dest_path).await;
+        let hardlink_elapsed = hardlink_start.elapsed();
 
-            // Process files
-            for file in &directory.files {
-                self.create_file(dest_path, file).await?;
+        // Always decrement ref_count
+        {
+            let cache = self.cache.read().await;
+            if let Some(metadata) = cache.get(digest) {
+                metadata.ref_count.fetch_sub(1, Ordering::Relaxed);
             }
+        }
 
-            // Process subdirectories recursively
-            for dir_node in &directory.directories {
-                self.create_subdirectory(dest_path, dir_node).await?;
+        match result {
+            Ok(()) => {
+                debug!(
+                    hash = %&digest.packed_hash().to_string()[..12],
+                    cached_size_bytes = cached_size,
+                    hardlink_ms = hardlink_elapsed.as_millis() as u64,
+                    "DirectoryCache: hardlink from cache succeeded",
+                );
+                Ok(true)
+            }
+            Err(e) => {
+                warn!(
+                    hash = %&digest.packed_hash().to_string()[..12],
+                    error = ?e,
+                    hardlink_ms = hardlink_elapsed.as_millis() as u64,
+                    "DirectoryCache: hardlink from cache FAILED, will reconstruct",
+                );
+                Ok(false)
+            }
+        }
+    }
 
-            // Process symlinks
-            for symlink in &directory.symlinks {
-                self.create_symlink(dest_path, symlink).await?;
+    /// Removes the construction lock entry if no other task is waiting on it.
+    fn cleanup_construction_lock(&self, digest: &DigestInfo, lock: &Arc<Mutex<()>>) {
+        // Acquire the outer mutex to make the check+remove atomic with respect
+        // to new tasks cloning from the HashMap.
+        if let Ok(mut locks) = self.construction_locks.try_lock() {
+            // Only remove if the entry is still *our* lock (not a replacement)
+            // and no other task is holding a clone.
+            if let Some(existing) = locks.get(digest) {
+                if Arc::ptr_eq(existing, lock) && Arc::strong_count(lock) <= 2 {
+                    locks.remove(digest);
+                }
             }
+        }
+    }
 
-            Ok(())
-        })
-    }
-
+    /// Recursively removes a read-only directory by first restoring write
+    /// permissions on directories. Files are NOT chmoded because they are
+    /// hardlinked to CAS entries — changing their mode would corrupt the
+    /// shared inode's permissions for all concurrent actions.
+    /// On unix, only the parent directory needs write permission to unlink files.
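The doc comment above is load-bearing, so here is the failure it prevents, demonstrated in isolation: mode bits live on the inode, and every hardlink shares that inode. A Unix-only sketch with illustrative /tmp paths (it assumes those paths are free):

    use std::fs;
    use std::os::unix::fs::PermissionsExt;

    fn main() -> std::io::Result<()> {
        fs::write("/tmp/cas_blob", b"data")?;
        fs::hard_link("/tmp/cas_blob", "/tmp/workspace_copy")?;
        // chmod through one name...
        fs::set_permissions("/tmp/workspace_copy", fs::Permissions::from_mode(0o400))?;
        // ...is observed through the other name: same inode, same mode bits.
        let mode = fs::metadata("/tmp/cas_blob")?.permissions().mode() & 0o777;
        assert_eq!(mode, 0o400);
        Ok(())
    }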
+    async fn remove_readonly_dir(path: &Path) {
+        #[cfg(unix)]
+        {
+            use std::os::unix::fs::PermissionsExt;
+            if let Ok(metadata) = fs::symlink_metadata(path).await {
+                if metadata.is_dir() {
+                    drop(fs::set_permissions(path, std::fs::Permissions::from_mode(0o755)).await);
+                    if let Ok(mut entries) = fs::read_dir(path).await {
+                        while let Ok(Some(entry)) = entries.next_entry().await {
+                            if let Ok(meta) = fs::symlink_metadata(entry.path()).await {
+                                if meta.is_dir() {
+                                    Box::pin(Self::remove_readonly_dir(&entry.path())).await;
+                                }
+                                // Do NOT chmod files — they are hardlinked to CAS.
+                            }
+                        }
+                    }
+                }
+            }
+        }
 
+        if let Err(e) = fs::remove_dir_all(path).await {
+            warn!(path = ?path, error = ?e, "Failed to remove evicted directory from disk");
+        }
+    }
 
-    /// Creates a file from a `FileNode`
-    async fn create_file(&self, parent: &Path, file_node: &FileNode) -> Result<(), Error> {
-        let file_path = parent.join(&file_node.name);
-        let digest = DigestInfo::try_from(
-            file_node
-                .digest
-                .clone()
-                .ok_or_else(|| make_err!(Code::InvalidArgument, "File node missing digest"))?,
-        )
-        .err_tip(|| "Invalid file digest")?;
+    /// Monotonically increasing counter for unique temp paths.
+    fn next_temp_id(&self) -> u64 {
+        use std::sync::atomic::AtomicU64 as StaticAtomicU64;
+        static COUNTER: StaticAtomicU64 = StaticAtomicU64::new(0);
+        COUNTER.fetch_add(1, Ordering::Relaxed)
+    }
 
-        trace!(?file_path, ?digest, "Creating file");
+    /// Validates that a node name is a single safe path component.
+    /// Rejects path separators, traversal components, empty names, and null bytes.
+    fn validate_node_name(name: &str) -> Result<(), Error> {
+        if name.is_empty()
+            || name == "."
+            || name == ".."
+            || name.contains('/')
+            || name.contains('\\')
+            || name.contains('\0')
+        {
+            return Err(make_err!(
+                Code::InvalidArgument,
+                "Invalid node name in Directory proto: {:?}",
+                name
+            ));
+        }
+        Ok(())
+    }
 
-        // Fetch file content from CAS
-        let data = self
-            .cas_store
-            .get_part_unchunked(StoreKey::Digest(digest), 0, None)
-            .await
-            .err_tip(|| format!("Failed to fetch file: {}", file_path.display()))?;
+    /// Validates that a symlink target does not escape the workspace root.
+    /// Rejects absolute paths. For relative paths, verifies the resolved path
+    /// stays within the workspace by counting `..` components.
+    fn validate_symlink_target(target: &str, depth: usize) -> Result<(), Error> {
+        if target.is_empty() || target.contains('\0') {
+            return Err(make_err!(
+                Code::InvalidArgument,
+                "Invalid symlink target: {:?}",
+                target
+            ));
+        }
 
-        // Write to disk
-        fs::write(&file_path, data.as_ref())
-            .await
-            .err_tip(|| format!("Failed to write file: {}", file_path.display()))?;
+        // Reject absolute symlink targets
+        if target.starts_with('/') || target.starts_with('\\') {
+            return Err(make_err!(
+                Code::InvalidArgument,
+                "Absolute symlink target not allowed: {:?}",
+                target
+            ));
+        }
 
-        // Set permissions
-        #[cfg(unix)]
-        if file_node.is_executable {
-            use std::os::unix::fs::PermissionsExt;
-            let mut perms = fs::metadata(&file_path)
-                .await
-                .err_tip(|| "Failed to get file metadata")?
-                .permissions();
-            perms.set_mode(0o755);
-            fs::set_permissions(&file_path, perms)
-                .await
-                .err_tip(|| "Failed to set file permissions")?;
+        // Count net upward traversals. `depth` is how deep we are in the tree.
+        let mut net_up: usize = 0;
+        for component in target.split('/') {
+            match component {
+                ".." => {
+                    net_up += 1;
+                    if net_up > depth {
+                        return Err(make_err!(
+                            Code::InvalidArgument,
+                            "Symlink target escapes workspace root: {:?}",
+                            target
+                        ));
+                    }
+                }
+                "" | "." => {}
+                _ => {
+                    net_up = net_up.saturating_sub(1);
+                }
+            }
+        }
 
         Ok(())
     }
 
-    /// Creates a subdirectory from a `DirectoryNode`
-    async fn create_subdirectory(
-        &self,
-        parent: &Path,
-        dir_node: &DirectoryNode,
-    ) -> Result<(), Error> {
+    /// Walks a directory tree, setting all entries to read-only and computing
+    /// the total file size in a single traversal (avoiding two separate walks).
+    /// Directories are set to 0o555, files have write bits stripped.
+    fn set_readonly_and_calculate_size<'a>(
+        path: &'a Path,
+    ) -> Pin<Box<dyn Future<Output = Result<u64, Error>> + Send + 'a>> {
+        Box::pin(async move {
+            let metadata = fs::symlink_metadata(path)
+                .await
+                .err_tip(|| format!("Failed to get metadata for: {}", path.display()))?;
+
+            // Skip symlinks -- do not follow them or change permissions.
+            if metadata.is_symlink() {
+                return Ok(0);
+            }
+
+            if metadata.is_dir() {
+                let mut entries = fs::read_dir(path)
+                    .await
+                    .err_tip(|| format!("Failed to read directory: {}", path.display()))?;
+
+                let mut total_size = 0u64;
+                while let Some(entry) = entries
+                    .next_entry()
+                    .await
+                    .err_tip(|| format!("Failed to get next entry in: {}", path.display()))?
+                {
+                    total_size += Self::set_readonly_and_calculate_size(&entry.path()).await?;
+                }
+
+                // Set directory to read-only (0o555) to protect cache integrity.
+                // Since we use hardlinks (not symlinks), actions never access
+                // cached directories directly — they get fresh writable copies.
+                #[cfg(unix)]
+                {
+                    use std::os::unix::fs::PermissionsExt;
+                    let mut perms = metadata.permissions();
+                    perms.set_mode(0o555);
+                    fs::set_permissions(path, perms)
+                        .await
+                        .err_tip(|| format!("Failed to set permissions for: {}", path.display()))?;
+                }
+                #[cfg(windows)]
+                {
+                    let mut perms = metadata.permissions();
+                    perms.set_readonly(true);
+                    fs::set_permissions(path, perms)
+                        .await
+                        .err_tip(|| format!("Failed to set permissions for: {}", path.display()))?;
+                }
+
+                Ok(total_size)
+            } else if metadata.is_file() {
+                let size = metadata.len();
+
+                // Ensure all cached files are 0o555 (read+execute, no write).
+                // This both protects cache integrity and ensures shell scripts
+                // remain executable. Old CAS files with 0o644 become 0o555.
+                #[cfg(unix)]
+                {
+                    use std::os::unix::fs::PermissionsExt;
+                    let current_mode = metadata.permissions().mode() & 0o777;
+                    if current_mode != 0o555 {
+                        let mut perms = metadata.permissions();
+                        perms.set_mode(0o555);
+                        fs::set_permissions(path, perms)
+                            .await
+                            .err_tip(|| format!("Failed to set permissions for: {}", path.display()))?;
+                    }
+                }
+                #[cfg(windows)]
+                {
+                    let mut perms = metadata.permissions();
+                    perms.set_readonly(true);
+                    fs::set_permissions(path, perms)
+                        .await
+                        .err_tip(|| format!("Failed to set permissions for: {}", path.display()))?;
+                }
+
+                Ok(size)
+            } else {
+                Ok(0)
+            }
+        })
+    }
+
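Looping back to the symlink validation above: the depth rule says that at nesting depth d, a relative target may climb at most d levels before it would escape the input root. Mirrored as a free function so the examples stand alone (same traversal logic as the method above):

    fn escapes(target: &str, depth: usize) -> bool {
        if target.starts_with('/') {
            return true; // absolute targets are always rejected
        }
        let mut net_up = 0usize;
        for component in target.split('/') {
            match component {
                ".." => {
                    net_up += 1;
                    if net_up > depth {
                        return true;
                    }
                }
                "" | "." => {}
                _ => net_up = net_up.saturating_sub(1),
            }
        }
        false
    }

    fn main() {
        assert!(!escapes("../sibling/lib.so", 1)); // stays inside the root
        assert!(escapes("../../etc/passwd", 1));   // climbs past the root
        assert!(escapes("/etc/passwd", 3));        // absolute: rejected
    }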
+    /// Full construction path: tries fast download_to_directory, falls back to serial.
+    /// Used when there are no subtree hits.
+    async fn construct_full(&self, digest: &DigestInfo, temp_path: &Path) -> Result<(), Error> {
+        // Try the fast batch path first if concrete stores are available.
+        let fast_path_result = if let (Some(fss), Some(_fs_store)) =
+            (&self.fast_slow_store, &self.filesystem_store)
+        {
+            let fs_pin = Pin::new(
+                fss.fast_store()
+                    .downcast_ref::<FilesystemStore>(None)
+                    .err_tip(|| "Could not downcast fast store to FilesystemStore")?,
+            );
+            let temp_str = temp_path.to_string_lossy().to_string();
+            debug!(
+                hash = %&digest.packed_hash().to_string()[..12],
+                "DirectoryCache: fast download_to_directory starting",
+            );
+            let construction_start = Instant::now();
+            let result = crate::running_actions_manager::download_to_directory(
+                fss, fs_pin, digest, &temp_str,
+            )
+            .await;
+            let elapsed = construction_start.elapsed();
+            match &result {
+                Ok(()) => {
+                    debug!(
+                        hash = %&digest.packed_hash().to_string()[..12],
+                        elapsed_ms = elapsed.as_millis() as u64,
+                        "DirectoryCache: fast download_to_directory completed",
+                    );
+                    Some(Ok(()))
+                }
+                Err(e) => {
+                    warn!(
+                        hash = %&digest.packed_hash().to_string()[..12],
+                        ?e,
+                        elapsed_ms = elapsed.as_millis() as u64,
+                        "DirectoryCache: fast download_to_directory failed, trying serial fallback",
+                    );
+                    // Clean up the partial temp directory before fallback
+                    drop(fs::remove_dir_all(temp_path).await);
+                    drop(fs::create_dir_all(temp_path).await);
+                    Some(Err(e.clone()))
+                }
+            }
+        } else {
+            None
+        };
+
+        // Use the fast path result, or fall back to serial construction.
+        match fast_path_result {
+            Some(Ok(())) => Ok(()),
+            Some(Err(_)) | None => {
+                if fast_path_result.is_none() {
+                    debug!(
+                        hash = %&digest.packed_hash().to_string()[..12],
+                        "DirectoryCache: using serial construct_directory_impl (no fast path available)",
+                    );
+                }
+                let serial_start = Instant::now();
+                self.construct_directory(*digest, temp_path).await
+                    .err_tip(|| "Failed to construct directory for cache")?;
+                debug!(
+                    hash = %&digest.packed_hash().to_string()[..12],
+                    elapsed_ms = serial_start.elapsed().as_millis() as u64,
+                    "DirectoryCache: serial construct_directory_impl completed",
+                );
+                Ok(())
+            }
+        }
+    }
+
+    /// Subtree-aware construction: walks the resolved directory tree, creates
+    /// hardlinked subtrees for cached portions, and only downloads uncached
+    /// portions via `download_to_directory` or serial fallback.
+    ///
+    /// Uses file hardlinks (creating fresh directories) rather than directory
+    /// symlinks because Bazel actions create output directories inside the
+    /// input tree — symlinks would mutate the cache.
+    async fn construct_with_subtrees(
+        &self,
+        root_digest: &DigestInfo,
+        tree: &HashMap<DigestInfo, ProtoDirectory>,
+        subtree_hits: &HashMap<DigestInfo, PathBuf>,
+        dest_path: &Path,
+    ) -> Result<(), Error> {
+        let construction_start = Instant::now();
+
+        // BFS walk of the tree, creating directories and hardlinked copies.
+        // When we encounter a subtree hit, we hardlink the cached copy and
+        // skip its entire subtree (no need to traverse children).
+ let mut queue = VecDeque::new(); + queue.push_back((*root_digest, dest_path.to_path_buf())); + + let mut dirs_created = 0usize; + let mut subtrees_linked = 0usize; + let mut files_to_download = Vec::new(); + let mut symlinks_to_create: Vec<(String, PathBuf)> = Vec::new(); + + while let Some((dir_digest, dir_path)) = queue.pop_front() { + let directory = tree.get(&dir_digest).ok_or_else(|| { + make_err!( + Code::Internal, + "Directory {:?} not found in resolved tree during subtree construction", + dir_digest + ) + })?; + + // Process subdirectories + for subdir_node in &directory.directories { + Self::validate_node_name(&subdir_node.name)?; + let child_digest: DigestInfo = subdir_node + .digest + .as_ref() + .ok_or_else(|| { + make_err!(Code::InvalidArgument, "Directory node missing digest") + })? + .try_into() + .err_tip(|| "Invalid directory digest in subtree construction")?; + + let child_path = dir_path.join(&subdir_node.name); + + if let Some(cached_path) = subtree_hits.get(&child_digest) { + // Subtree hit: hardlink files from cached subtree into + // fresh writable directories. We can't use directory symlinks + // because Bazel creates output directories inside the input + // tree, which would mutate the cache. + match hardlink_directory_tree(cached_path, &child_path).await { + Ok(()) => { + subtrees_linked += 1; + debug!( + child_hash = %&child_digest.packed_hash().to_string()[..12], + src = %cached_path.display(), + dst = %child_path.display(), + "DirectoryCache: hardlinked cached subtree", + ); + // Do NOT enqueue children -- the hardlink covers the entire subtree. + continue; + } + Err(e) => { + // The cached subtree was evicted between our + // exists() check and now. Fall back to creating + // the directory and downloading its contents. + warn!( + child_hash = %&child_digest.packed_hash().to_string()[..12], + src = %cached_path.display(), + ?e, + "DirectoryCache: subtree evicted during construction, falling back to download", + ); + } + } + } + + // No subtree hit (or subtree evicted) -- create the directory and recurse. + fs::create_dir_all(&child_path).await.err_tip(|| { + format!("Failed to create directory: {}", child_path.display()) + })?; + dirs_created += 1; + queue.push_back((child_digest, child_path)); + } + + // Collect files that need to be downloaded for this (non-symlinked) directory. + for file_node in &directory.files { + Self::validate_node_name(&file_node.name)?; + let file_digest: DigestInfo = file_node + .digest + .as_ref() + .ok_or_else(|| { + make_err!(Code::InvalidArgument, "File node missing digest") + })? 
+                    .try_into()
+                    .err_tip(|| "Invalid file digest in subtree construction")?;
+
+                let file_path = dir_path.join(&file_node.name);
+                files_to_download.push((file_digest, file_path, file_node.is_executable));
+            }
+
+            // Collect symlinks from the proto
+            for symlink_node in &directory.symlinks {
+                Self::validate_node_name(&symlink_node.name)?;
+                let link_path = dir_path.join(&symlink_node.name);
+                symlinks_to_create.push((symlink_node.target.clone(), link_path));
+            }
+        }
+
+        debug!(
+            hash = %&root_digest.packed_hash().to_string()[..12],
+            dirs_created,
+            subtrees_linked,
+            files_to_download = files_to_download.len(),
+            symlinks = symlinks_to_create.len(),
+            "DirectoryCache: subtree-aware construction plan",
+        );
+
+        // Create symlinks from the proto
+        #[cfg(target_family = "unix")]
+        for (target, link_path) in &symlinks_to_create {
+            fs::symlink(target, link_path)
+                .await
+                .err_tip(|| format!("Failed to create symlink: {} -> {}", link_path.display(), target))?;
+        }
+
+        // Download uncached files.
+        // If we have a FastSlowStore + FilesystemStore, use hardlinks from CAS.
+        // Otherwise fall back to serial CAS fetch.
+        if !files_to_download.is_empty() {
+            if let (Some(fss), Some(_fs_store)) = (&self.fast_slow_store, &self.filesystem_store) {
+                let fs_store_pin = Pin::new(
+                    fss.fast_store()
+                        .downcast_ref::<FilesystemStore>(None)
+                        .err_tip(|| "Could not downcast fast store to FilesystemStore")?,
+                );
+
+                // Check which blobs are already in the fast store.
+                // Skip zero-byte digests — they aren't stored in FilesystemStore.
+                let unique_digests: Vec<DigestInfo> = {
+                    let mut seen = HashSet::new();
+                    files_to_download
+                        .iter()
+                        .filter_map(|(d, _, _)| {
+                            if d.size_bytes() > 0 && seen.insert(*d) { Some(*d) } else { None }
+                        })
+                        .collect()
+                };
+                let store_keys: Vec<StoreKey<'_>> =
+                    unique_digests.iter().map(|d| (*d).into()).collect();
+                let mut has_results = vec![None; store_keys.len()];
+                Pin::new(fss.fast_store())
+                    .has_with_results(&store_keys, &mut has_results)
+                    .await
+                    .err_tip(|| "Batch has_with_results in subtree construction")?;
+
+                // Populate missing blobs into the fast store.
+                let missing: Vec<&DigestInfo> = unique_digests
+                    .iter()
+                    .zip(has_results.iter())
+                    .filter_map(|(d, r)| if r.is_none() { Some(d) } else { None })
+                    .collect();
+
+                if !missing.is_empty() {
+                    debug!(
+                        hash = %&root_digest.packed_hash().to_string()[..12],
+                        missing = missing.len(),
+                        "DirectoryCache: fetching missing blobs for uncached files",
+                    );
+                    for d in &missing {
+                        let key: StoreKey<'_> = (**d).into();
+                        fss.populate_fast_store(key).await
+                            .err_tip(|| format!("Failed to populate fast store for {:?}", d))?;
+                    }
+                }
+
+                // Hardlink files from the fast store to their destination paths.
+                for (file_digest, file_path, is_executable) in &files_to_download {
+                    if file_digest.size_bytes() == 0 {
+                        // Zero-byte files aren't stored in FilesystemStore.
+                        // Create them directly.
+                        fs::write(&file_path, b"")
+                            .await
+                            .err_tip(|| format!("Failed to create empty file: {}", file_path.display()))?;
+                    } else {
+                        let file_entry = fs_store_pin
+                            .get_file_entry_for_digest(file_digest)
+                            .await
+                            .err_tip(|| format!("Getting file entry for {:?}", file_digest))?;
+                        let dest = file_path.clone();
+                        file_entry
+                            .get_file_path_locked(|src_path| async move {
+                                fs::hard_link(&src_path, &dest)
+                                    .await
+                                    .err_tip(|| format!(
+                                        "Failed to hardlink {:?} to {}",
+                                        src_path,
+                                        dest.display(),
+                                    ))
+                            })
+                            .await?;
+                    }
+
+                    // Ensure all files have 0o555. CAS files ingested before the
+                    // 0o555 default may still be 0o644; we must fix them here since
+                    // hardlinks share the inode and set_readonly_and_calculate_size
+                    // would turn 0o644 into 0o444 (no execute), breaking shell scripts.
+                    #[cfg(unix)]
+                    {
+                        use std::os::unix::fs::PermissionsExt;
+                        let meta = fs::metadata(&file_path).await
+                            .err_tip(|| "Failed to get file metadata for permission fix")?;
+                        let current_mode = meta.permissions().mode() & 0o777;
+                        let new_mode = if *is_executable {
+                            current_mode | 0o111
+                        } else {
+                            0o555
+                        };
+                        if new_mode != current_mode {
+                            let mut perms = meta.permissions();
+                            perms.set_mode(new_mode);
+                            fs::set_permissions(&file_path, perms).await
+                                .err_tip(|| "Failed to set file permission")?;
+                        }
+                    }
+                }
+            } else {
+                // Serial fallback: fetch each file from CAS individually.
+                for (file_digest, file_path, _is_executable) in &files_to_download {
+                    let data = self
+                        .cas_store
+                        .get_part_unchunked(StoreKey::Digest(*file_digest), 0, None)
+                        .await
+                        .err_tip(|| format!("Failed to fetch file: {}", file_path.display()))?;
+                    fs::write(&file_path, data.as_ref())
+                        .await
+                        .err_tip(|| format!("Failed to write file: {}", file_path.display()))?;
+
+                    // Always set 0o555 to match CAS defaults (see create_file).
+                    #[cfg(unix)]
+                    {
+                        use std::os::unix::fs::PermissionsExt;
+                        let mut perms = fs::metadata(&file_path).await
+                            .err_tip(|| "Failed to get file metadata")?
+                            .permissions();
+                        perms.set_mode(0o555);
+                        fs::set_permissions(&file_path, perms).await
+                            .err_tip(|| "Failed to set file permissions")?;
+                    }
+                }
+            }
+        }
+
+        let elapsed = construction_start.elapsed();
+        debug!(
+            hash = %&root_digest.packed_hash().to_string()[..12],
+            dirs_created,
+            subtrees_linked,
+            files_downloaded = files_to_download.len(),
+            elapsed_ms = elapsed.as_millis() as u64,
+            "DirectoryCache: subtree-aware construction completed",
+        );
+
+        Ok(())
+    }
+
+    /// Removes subtree index entries that belong to a given cache entry path.
+    /// Loads the merkle metadata file from the cache entry to determine which
+    /// digests to remove. Also decrements subtree refcounts and records
+    /// fully-removed digests for delta reporting.
+    async fn remove_subtree_index_for_path(
+        &self,
+        cache_entry_path: &Path,
+        index: &mut HashMap<DigestInfo, PathBuf>,
+    ) {
+        let merkle_path = cache_entry_path.join(MERKLE_METADATA_FILENAME);
+        if let Ok(data) = fs::read_to_string(&merkle_path).await {
+            if let Ok(merkle) = MerkleTreeMetadata::deserialize(&data) {
+                let mut removed = 0usize;
+                let merkle_digests: Vec<DigestInfo> =
+                    merkle.digest_to_relpath.keys().copied().collect();
+                for (sub_digest, relpath) in &merkle.digest_to_relpath {
+                    // Only remove if the index entry points to this specific cache entry.
+                    let abs_path = if relpath.is_empty() {
+                        cache_entry_path.to_path_buf()
+                    } else {
+                        cache_entry_path.join(relpath)
+                    };
+                    if let Some(existing) = index.get(sub_digest) {
+                        if *existing == abs_path {
+                            index.remove(sub_digest);
+                            removed += 1;
+                        }
+                    }
+                }
+                // Record subtree removals for delta reporting.
+                // This decrements refcounts and only marks digests as removed
+                // when they are no longer present in ANY cached entry.
+                self.record_subtree_removal(&merkle_digests).await;
+                debug!(
+                    path = %cache_entry_path.display(),
+                    removed_subtrees = removed,
+                    "DirectoryCache: cleaned up subtree index for evicted entry",
+                );
+            }
+        }
+    }
+
+    /// Try to parse a directory entry name as a DigestInfo.
+    /// Expected format is the same as `DigestInfo::to_string()`,
+    /// i.e., `{hash}-{size_bytes}`.
+ fn parse_digest_from_dirname(name: &str) -> Option<DigestInfo> {
+ // DigestInfo::to_string() produces "{hash}-{size}", so split on the last '-'
+ let last_dash = name.rfind('-')?;
+ let hash = &name[..last_dash];
+ let size_str = &name[last_dash + 1..];
+ let size: i64 = size_str.parse().ok()?;
+ DigestInfo::try_new(hash, size).ok()
+ }
+
+ /// Constructs a directory from the CAS at the given path.
+ /// `depth` tracks nesting depth for symlink target validation.
+ fn construct_directory_impl<'a>(
+ &'a self,
+ digest: DigestInfo,
+ dest_path: &'a Path,
+ depth: usize,
+ ) -> Pin<Box<dyn Future<Output = Result<(), Error>> + Send + 'a>> {
+ Box::pin(async move {
+ debug!(?digest, ?dest_path, "Constructing directory");
+
+ // Fetch the Directory proto
+ let directory: ProtoDirectory = get_and_decode_digest(&self.cas_store, digest.into())
+ .await
+ .err_tip(|| format!("Failed to fetch directory digest: {digest:?}"))?;
+
+ // Create the destination directory
+ fs::create_dir_all(dest_path)
+ .await
+ .err_tip(|| format!("Failed to create directory: {}", dest_path.display()))?;
+
+ // Process files
+ for file in &directory.files {
+ Self::validate_node_name(&file.name)?;
+ self.create_file(dest_path, file).await?;
+ }
+
+ // Process subdirectories recursively
+ for dir_node in &directory.directories {
+ Self::validate_node_name(&dir_node.name)?;
+ self.create_subdirectory(dest_path, dir_node, depth + 1)
+ .await?;
+ }
+
+ // Process symlinks
+ for symlink in &directory.symlinks {
+ Self::validate_node_name(&symlink.name)?;
+ Self::validate_symlink_target(&symlink.target, depth)?;
+ self.create_symlink(dest_path, symlink).await?;
+ }
+
+ Ok(())
+ })
+ }
+
+ /// Constructs a directory from the CAS at the given path
+ fn construct_directory<'a>(
+ &'a self,
+ digest: DigestInfo,
+ dest_path: &'a Path,
+ ) -> Pin<Box<dyn Future<Output = Result<(), Error>> + Send + 'a>> {
+ self.construct_directory_impl(digest, dest_path, 0)
+ }
+
+ /// Creates a file from a `FileNode`
+ async fn create_file(&self, parent: &Path, file_node: &FileNode) -> Result<(), Error> {
+ let file_path = parent.join(&file_node.name);
+ let digest = DigestInfo::try_from(
+ file_node
+ .digest
+ .as_ref()
+ .ok_or_else(|| make_err!(Code::InvalidArgument, "File node missing digest"))?
+ .clone(),
+ )
+ .err_tip(|| "Invalid file digest")?;
+
+ trace!(?file_path, ?digest, "Creating file");
+
+ // Fetch file content from CAS
+ let data = self
+ .cas_store
+ .get_part_unchunked(StoreKey::Digest(digest), 0, None)
+ .await
+ .err_tip(|| format!("Failed to fetch file: {}", file_path.display()))?;
+
+ // Write to disk
+ fs::write(&file_path, data.as_ref())
+ .await
+ .err_tip(|| format!("Failed to write file: {}", file_path.display()))?;
+
+ // Always set 0o555 to match CAS store defaults. Some build tools
+ // (rules_cc, rules_rust) set is_executable=false on shell scripts
+ // that must be executable; 0o555 as the base avoids EPERM.
+ #[cfg(unix)]
+ {
+ use std::os::unix::fs::PermissionsExt;
+ let mut perms = fs::metadata(&file_path)
+ .await
+ .err_tip(|| "Failed to get file metadata")?
+ .permissions();
+ perms.set_mode(0o555);
+ fs::set_permissions(&file_path, perms)
+ .await
+ .err_tip(|| "Failed to set file permissions")?;
+ }
+
+ Ok(())
+ }
+
+ /// Creates a subdirectory from a `DirectoryNode`
+ async fn create_subdirectory(
+ &self,
+ parent: &Path,
+ dir_node: &DirectoryNode,
+ depth: usize,
+ ) -> Result<(), Error> {
 let dir_path = parent.join(&dir_node.name);
- let digest =
- DigestInfo::try_from(dir_node.digest.clone().ok_or_else(|| {
- make_err!(Code::InvalidArgument, "Directory node missing digest")
- })?)
- .err_tip(|| "Invalid directory digest")?; + let digest = DigestInfo::try_from( + dir_node + .digest + .as_ref() + .ok_or_else(|| { + make_err!(Code::InvalidArgument, "Directory node missing digest") + })? + .clone(), + ) + .err_tip(|| "Invalid directory digest")?; trace!(?dir_path, ?digest, "Creating subdirectory"); // Recursively construct subdirectory - self.construct_directory(digest, &dir_path).await + self.construct_directory_impl(digest, &dir_path, depth) + .await } /// Creates a symlink from a `SymlinkNode` @@ -347,74 +1680,107 @@ impl DirectoryCache { Ok(()) } - /// Evicts entries if cache is too full - async fn evict_if_needed( + /// Collects entries to evict to make room for `incoming_size` bytes. + /// Removes them from the HashMap and returns their paths for disk cleanup. + /// This is called while holding the write lock; actual disk I/O happens after + /// the lock is released. + fn collect_evictions( &self, incoming_size: u64, cache: &mut HashMap, - ) -> Result<(), Error> { - // Check entry count + ) -> Vec { + let mut evicted_paths = Vec::new(); + + // Evict by entry count while cache.len() >= self.config.max_entries { - self.evict_lru(cache).await?; + if let Some((path, digest, size)) = self.evict_lru_entry(cache) { + debug!( + hash = %&digest.packed_hash().to_string()[..12], + size_bytes = size, + reason = "count_limit", + entries_remaining = cache.len(), + max_entries = self.config.max_entries, + "DirectoryCache: evicting entry", + ); + evicted_paths.push(path); + } else { + warn!( + entries = cache.len(), + max = self.config.max_entries, + "DirectoryCache: over entry limit but all entries are in use" + ); + break; + } } - // Check total size + // Evict by size if self.config.max_size_bytes > 0 { - let current_size: u64 = cache.values().map(|m| m.size).sum(); - let mut size_after = current_size + incoming_size; - - while size_after > self.config.max_size_bytes { - let evicted_size = self.evict_lru(cache).await?; - size_after -= evicted_size; + loop { + let current_size: u64 = cache.values().map(|m| m.size).sum(); + if current_size + incoming_size <= self.config.max_size_bytes { + break; + } + if let Some((path, digest, size)) = self.evict_lru_entry(cache) { + debug!( + hash = %&digest.packed_hash().to_string()[..12], + size_bytes = size, + size_freed_mb = format!("{:.2}", size as f64 / (1024.0 * 1024.0)), + reason = "size_limit", + entries_remaining = cache.len(), + current_total_mb = format!("{:.2}", cache.values().map(|m| m.size).sum::() as f64 / (1024.0 * 1024.0)), + max_size_mb = format!("{:.2}", self.config.max_size_bytes as f64 / (1024.0 * 1024.0)), + "DirectoryCache: evicting entry", + ); + evicted_paths.push(path); + } else { + warn!( + current_size = current_size + incoming_size, + max = self.config.max_size_bytes, + "DirectoryCache: over size limit but all entries are in use" + ); + break; + } } } - Ok(()) + evicted_paths } - /// Evicts the least recently used entry - async fn evict_lru( + /// Removes the LRU entry with ref_count == 0 from the cache HashMap. + /// Returns the evicted entry's (path, digest, size) for logging and disk + /// cleanup, or `None` if no evictable entry exists. 
+ fn evict_lru_entry(
 &self,
 cache: &mut HashMap<DigestInfo, CacheEntryMetadata>,
- ) -> Result<u64, Error> {
- // Find LRU entry that isn't currently in use
+ ) -> Option<(PathBuf, DigestInfo, u64)> {
 let to_evict = cache
 .iter()
- .filter(|(_, m)| m.ref_count == 0)
- .min_by_key(|(_, m)| m.last_access)
+ .filter(|(_, m)| m.ref_count.load(Ordering::Relaxed) == 0)
+ .min_by_key(|(_, m)| m.last_access_millis.load(Ordering::Relaxed))
 .map(|(digest, _)| *digest);

 if let Some(digest) = to_evict {
 if let Some(metadata) = cache.remove(&digest) {
- debug!(?digest, size = metadata.size, "Evicting cached directory");
-
- // Remove from disk
- if let Err(e) = fs::remove_dir_all(&metadata.path).await {
- warn!(
- ?digest,
- path = ?metadata.path,
- error = ?e,
- "Failed to remove evicted directory from disk"
- );
- }
-
- return Ok(metadata.size);
+ return Some((metadata.path, digest, metadata.size));
 }
 }

- Ok(0)
+ None
 }

 /// Gets the cache path for a digest
 fn get_cache_path(&self, digest: &DigestInfo) -> PathBuf {
- self.config.cache_root.join(format!("{digest}"))
+ self.config.cache_root.join(digest.to_string())
 }

 /// Returns cache statistics
 pub async fn stats(&self) -> CacheStats {
 let cache = self.cache.read().await;
 let total_size: u64 = cache.values().map(|m| m.size).sum();
- let in_use = cache.values().filter(|m| m.ref_count > 0).count();
+ let in_use = cache
+ .values()
+ .filter(|m| m.ref_count.load(Ordering::Relaxed) > 0)
+ .count();

 CacheStats {
 entries: cache.len(),
@@ -493,6 +1859,83 @@ mod tests {
 (store, dir_digest)
 }

+ /// Creates a store with two different directory digests for eviction testing.
+ async fn setup_two_digest_store() -> (Store, DigestInfo, DigestInfo) {
+ let store = Store::new(MemoryStore::new(&Default::default()));
+
+ // File A
+ let content_a = b"File A content";
+ let digest_a = DigestInfo::try_new(
+ "a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4e5f6a1b2",
+ content_a.len() as i64,
+ )
+ .unwrap();
+ store
+ .as_store_driver_pin()
+ .update_oneshot(digest_a.into(), content_a.to_vec().into())
+ .await
+ .unwrap();
+
+ // Directory A
+ let dir_a = ProtoDirectory {
+ files: vec![FileNode {
+ name: "a.txt".to_string(),
+ digest: Some(digest_a.into()),
+ ..Default::default()
+ }],
+ ..Default::default()
+ };
+ let mut dir_a_data = Vec::new();
+ dir_a.encode(&mut dir_a_data).unwrap();
+ let dir_digest_a = DigestInfo::try_new(
+ "aaaa567890abcdef1234567890abcdef1234567890abcdef1234567890abcdef",
+ dir_a_data.len() as i64,
+ )
+ .unwrap();
+ store
+ .as_store_driver_pin()
+ .update_oneshot(dir_digest_a.into(), dir_a_data.into())
+ .await
+ .unwrap();
+
+ // File B
+ let content_b = b"File B content!!";
+ let digest_b = DigestInfo::try_new(
+ "b1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4e5f6b1b2",
+ content_b.len() as i64,
+ )
+ .unwrap();
+ store
+ .as_store_driver_pin()
+ .update_oneshot(digest_b.into(), content_b.to_vec().into())
+ .await
+ .unwrap();
+
+ // Directory B
+ let dir_b = ProtoDirectory {
+ files: vec![FileNode {
+ name: "b.txt".to_string(),
+ digest: Some(digest_b.into()),
+ ..Default::default()
+ }],
+ ..Default::default()
+ };
+ let mut dir_b_data = Vec::new();
+ dir_b.encode(&mut dir_b_data).unwrap();
+ let dir_digest_b = DigestInfo::try_new(
+ "bbbb567890abcdef1234567890abcdef1234567890abcdef1234567890abcdef",
+ dir_b_data.len() as i64,
+ )
+ .unwrap();
+ store
+ .as_store_driver_pin()
+ .update_oneshot(dir_digest_b.into(), dir_b_data.into())
+ .await
+ .unwrap();
+
+ (store, dir_digest_a, dir_digest_b)
+ }
+
 #[tokio::test]
 async fn test_directory_cache_basic() ->
Result<(), Error> { let temp_dir = TempDir::new().unwrap(); @@ -505,7 +1948,7 @@ mod tests { cache_root, }; - let cache = DirectoryCache::new(config, store).await?; + let cache = DirectoryCache::new(config, store, None).await?; // First access - cache miss let dest1 = temp_dir.path().join("dest1"); @@ -525,4 +1968,692 @@ mod tests { Ok(()) } + + #[tokio::test] + async fn test_hardlink_into_existing_directory() -> Result<(), Error> { + let temp_dir = TempDir::new().unwrap(); + let cache_root = temp_dir.path().join("cache"); + let (store, dir_digest) = setup_test_store().await; + + let config = DirectoryCacheConfig { + max_entries: 10, + max_size_bytes: 1024 * 1024, + cache_root, + }; + + let cache = DirectoryCache::new(config, store, None).await?; + + // Pre-create destination directory (simulates work_directory already existing) + let dest = temp_dir.path().join("existing_dest"); + fs::create_dir(&dest).await.unwrap(); + + // Should succeed even though dest already exists (Bug 1 fix) + let hit = cache.get_or_create(dir_digest, &dest).await?; + assert!(!hit, "First access should be cache miss"); + assert!(dest.join("test.txt").exists()); + + // Cache hit into another pre-existing directory + let dest2 = temp_dir.path().join("existing_dest2"); + fs::create_dir(&dest2).await.unwrap(); + let hit = cache.get_or_create(dir_digest, &dest2).await?; + assert!(hit, "Second access should be cache hit"); + assert!(dest2.join("test.txt").exists()); + + Ok(()) + } + + #[tokio::test] + async fn test_construction_failure_cleanup() -> Result<(), Error> { + let temp_dir = TempDir::new().unwrap(); + let cache_root = temp_dir.path().join("cache"); + + // Create a store with no data — construction will fail when fetching the digest + let store = Store::new(MemoryStore::new(&Default::default())); + + let bogus_digest = DigestInfo::try_new( + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + 42, + ) + .unwrap(); + + let config = DirectoryCacheConfig { + max_entries: 10, + max_size_bytes: 1024 * 1024, + cache_root: cache_root.clone(), + }; + + let cache = DirectoryCache::new(config, store, None).await?; + + let dest = temp_dir.path().join("dest"); + let result = cache.get_or_create(bogus_digest, &dest).await; + assert!(result.is_err(), "Should fail when digest not in store"); + + // Bug 2 fix: No orphaned temp directories should remain + let mut entries = fs::read_dir(&cache_root).await.unwrap(); + let mut leftover = Vec::new(); + while let Some(entry) = entries.next_entry().await.unwrap() { + leftover.push(entry.file_name().to_string_lossy().to_string()); + } + assert!( + leftover.is_empty(), + "No orphaned temp dirs should remain in cache_root, found: {leftover:?}" + ); + + // Verify construction lock was cleaned up (Bug 3 fix) + let locks = cache.construction_locks.lock().await; + assert!( + locks.is_empty(), + "Construction lock should be cleaned up after failure" + ); + + Ok(()) + } + + #[tokio::test] + async fn test_eviction_all_in_use() -> Result<(), Error> { + let temp_dir = TempDir::new().unwrap(); + let cache_root = temp_dir.path().join("cache"); + let (store, dir_digest) = setup_test_store().await; + + let config = DirectoryCacheConfig { + max_entries: 1, + max_size_bytes: 0, + cache_root, + }; + + let cache = DirectoryCache::new(config, store, None).await?; + + // Fill the cache + let dest1 = temp_dir.path().join("dest1"); + cache.get_or_create(dir_digest, &dest1).await?; + + // Simulate all entries being in-use + { + let cache_map = cache.cache.read().await; + if let 
Some(metadata) = cache_map.get(&dir_digest) { + metadata.ref_count.store(1, Ordering::Relaxed); + } + } + + // Bug 4 fix: collect_evictions should not loop infinitely. + { + let mut cache_map = cache.cache.write().await; + let evicted = cache.collect_evictions(100, &mut cache_map); + assert!(evicted.is_empty(), "Nothing should be evictable"); + assert_eq!(cache_map.len(), 1, "Entry should still be present"); + } + + // Clean up ref_count + { + let cache_map = cache.cache.read().await; + if let Some(metadata) = cache_map.get(&dir_digest) { + metadata.ref_count.store(0, Ordering::Relaxed); + } + } + + Ok(()) + } + + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn test_concurrent_same_digest() -> Result<(), Error> { + let temp_dir = TempDir::new().unwrap(); + let cache_root = temp_dir.path().join("cache"); + let (store, dir_digest) = setup_test_store().await; + + let config = DirectoryCacheConfig { + max_entries: 10, + max_size_bytes: 1024 * 1024, + cache_root, + }; + + let cache = Arc::new(DirectoryCache::new(config, store, None).await?); + + // Spawn multiple concurrent requests for the same digest + let mut handles = Vec::new(); + for i in 0..5 { + let cache = Arc::clone(&cache); + let dest = temp_dir.path().join(format!("concurrent_dest_{i}")); + handles.push(tokio::spawn(async move { + cache.get_or_create(dir_digest, &dest).await + })); + } + + let mut hits = 0; + let mut misses = 0; + for handle in handles { + let result = handle.await.unwrap()?; + if result { + hits += 1; + } else { + misses += 1; + } + } + + // Exactly one task should construct (miss), the rest should hit cache + assert_eq!(misses, 1, "Exactly one task should construct the directory"); + assert_eq!(hits, 4, "Other tasks should get cache hits"); + + // Verify only one cache entry exists + let stats = cache.stats().await; + assert_eq!(stats.entries, 1); + assert_eq!(stats.in_use_entries, 0, "All ref_counts should be back to 0"); + + // Verify construction locks are cleaned up (Bug 3) + let locks = cache.construction_locks.lock().await; + assert!( + locks.is_empty(), + "Construction locks should be cleaned up, found: {}", + locks.len() + ); + + Ok(()) + } + + #[tokio::test] + async fn test_construction_lock_cleanup() -> Result<(), Error> { + let temp_dir = TempDir::new().unwrap(); + let cache_root = temp_dir.path().join("cache"); + let (store, dir_digest) = setup_test_store().await; + + let config = DirectoryCacheConfig { + max_entries: 10, + max_size_bytes: 1024 * 1024, + cache_root, + }; + + let cache = DirectoryCache::new(config, store, None).await?; + + let dest = temp_dir.path().join("dest"); + cache.get_or_create(dir_digest, &dest).await?; + + let locks = cache.construction_locks.lock().await; + assert!( + locks.is_empty(), + "Construction lock should be removed after get_or_create completes" + ); + + Ok(()) + } + + #[tokio::test] + async fn test_eviction_removes_oldest_entry() -> Result<(), Error> { + let temp_dir = TempDir::new().unwrap(); + let cache_root = temp_dir.path().join("cache"); + let (store, digest_a, digest_b) = setup_two_digest_store().await; + + let config = DirectoryCacheConfig { + max_entries: 1, // Only 1 entry allowed + max_size_bytes: 0, + cache_root: cache_root.clone(), + }; + + let cache = DirectoryCache::new(config, store, None).await?; + + // Insert entry A + let dest_a = temp_dir.path().join("dest_a"); + cache.get_or_create(digest_a, &dest_a).await?; + assert_eq!(cache.stats().await.entries, 1); + + // Insert entry B — should evict A + let dest_b = 
temp_dir.path().join("dest_b"); + cache.get_or_create(digest_b, &dest_b).await?; + assert_eq!(cache.stats().await.entries, 1); + + // A's cache directory should be gone from disk + let cache_path_a = cache_root.join(digest_a.to_string()); + assert!( + !cache_path_a.exists(), + "Evicted entry A should be removed from disk" + ); + + // B should be in cache + let cache_path_b = cache_root.join(digest_b.to_string()); + assert!(cache_path_b.exists(), "Entry B should be on disk"); + + // Requesting A again should be a miss (reconstruct) + let dest_a2 = temp_dir.path().join("dest_a2"); + let hit = cache.get_or_create(digest_a, &dest_a2).await?; + assert!(!hit, "A should be a cache miss after eviction"); + assert!(dest_a2.join("a.txt").exists()); + + Ok(()) + } + + #[tokio::test] + async fn test_path_traversal_rejected() -> Result<(), Error> { + // Test validate_node_name directly + assert!(DirectoryCache::validate_node_name("good_file.txt").is_ok()); + assert!(DirectoryCache::validate_node_name("subdir").is_ok()); + + // These should all be rejected + assert!(DirectoryCache::validate_node_name("").is_err()); + assert!(DirectoryCache::validate_node_name(".").is_err()); + assert!(DirectoryCache::validate_node_name("..").is_err()); + assert!(DirectoryCache::validate_node_name("../etc/passwd").is_err()); + assert!(DirectoryCache::validate_node_name("/etc/passwd").is_err()); + assert!(DirectoryCache::validate_node_name("foo/bar").is_err()); + assert!(DirectoryCache::validate_node_name("foo\\bar").is_err()); + assert!(DirectoryCache::validate_node_name("foo\0bar").is_err()); + + Ok(()) + } + + #[tokio::test] + async fn test_symlink_target_validation() -> Result<(), Error> { + // Valid relative targets + assert!(DirectoryCache::validate_symlink_target("file.txt", 0).is_ok()); + assert!(DirectoryCache::validate_symlink_target("subdir/file.txt", 0).is_ok()); + assert!(DirectoryCache::validate_symlink_target("../sibling", 1).is_ok()); + + // Absolute targets rejected + assert!(DirectoryCache::validate_symlink_target("/etc/shadow", 0).is_err()); + assert!(DirectoryCache::validate_symlink_target("\\windows\\system32", 0).is_err()); + + // Traversal beyond root rejected + assert!(DirectoryCache::validate_symlink_target("..", 0).is_err()); + assert!(DirectoryCache::validate_symlink_target("../..", 1).is_err()); + assert!(DirectoryCache::validate_symlink_target("../../escape", 1).is_err()); + + // Deep enough to allow traversal + assert!(DirectoryCache::validate_symlink_target("../..", 2).is_ok()); + + // Empty and null rejected + assert!(DirectoryCache::validate_symlink_target("", 0).is_err()); + assert!(DirectoryCache::validate_symlink_target("foo\0bar", 0).is_err()); + + Ok(()) + } + + #[tokio::test] + async fn test_path_traversal_in_directory_proto() -> Result<(), Error> { + let temp_dir = TempDir::new().unwrap(); + let cache_root = temp_dir.path().join("cache"); + let store = Store::new(MemoryStore::new(&Default::default())); + + // Create a malicious directory proto with a path-traversal file name + let file_content = b"malicious"; + let file_digest = DigestInfo::try_new( + "c0535e4be2b79ffd93291305436bf889314e4a3faec05ecffcbb7df31ad9e51a", + 9, + ) + .unwrap(); + store + .as_store_driver_pin() + .update_oneshot(file_digest.into(), file_content.to_vec().into()) + .await + .unwrap(); + + let malicious_dir = ProtoDirectory { + files: vec![FileNode { + name: "../escape.txt".to_string(), + digest: Some(file_digest.into()), + ..Default::default() + }], + ..Default::default() + }; + let mut dir_data = 
Vec::new(); + malicious_dir.encode(&mut dir_data).unwrap(); + let dir_digest = DigestInfo::try_new( + "cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc", + dir_data.len() as i64, + ) + .unwrap(); + store + .as_store_driver_pin() + .update_oneshot(dir_digest.into(), dir_data.into()) + .await + .unwrap(); + + let config = DirectoryCacheConfig { + max_entries: 10, + max_size_bytes: 1024 * 1024, + cache_root, + }; + let cache = DirectoryCache::new(config, store, None).await?; + + let dest = temp_dir.path().join("dest"); + let result = cache.get_or_create(dir_digest, &dest).await; + assert!(result.is_err(), "Path traversal should be rejected"); + + // The escape file should NOT exist in the parent directory + assert!( + !temp_dir.path().join("escape.txt").exists(), + "Path traversal should not create files outside dest" + ); + + Ok(()) + } + + #[tokio::test] + async fn test_absolute_symlink_rejected() -> Result<(), Error> { + let temp_dir = TempDir::new().unwrap(); + let cache_root = temp_dir.path().join("cache"); + let store = Store::new(MemoryStore::new(&Default::default())); + + let malicious_dir = ProtoDirectory { + symlinks: vec![SymlinkNode { + name: "evil_link".to_string(), + target: "/etc/shadow".to_string(), + ..Default::default() + }], + ..Default::default() + }; + let mut dir_data = Vec::new(); + malicious_dir.encode(&mut dir_data).unwrap(); + let dir_digest = DigestInfo::try_new( + "dddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd", + dir_data.len() as i64, + ) + .unwrap(); + store + .as_store_driver_pin() + .update_oneshot(dir_digest.into(), dir_data.into()) + .await + .unwrap(); + + let config = DirectoryCacheConfig { + max_entries: 10, + max_size_bytes: 1024 * 1024, + cache_root, + }; + let cache = DirectoryCache::new(config, store, None).await?; + + let dest = temp_dir.path().join("dest"); + let result = cache.get_or_create(dir_digest, &dest).await; + assert!(result.is_err(), "Absolute symlink target should be rejected"); + + Ok(()) + } + + #[tokio::test] + async fn test_ref_count_returns_to_zero_after_operations() -> Result<(), Error> { + let temp_dir = TempDir::new().unwrap(); + let cache_root = temp_dir.path().join("cache"); + let (store, dir_digest) = setup_test_store().await; + + let config = DirectoryCacheConfig { + max_entries: 10, + max_size_bytes: 1024 * 1024, + cache_root, + }; + + let cache = DirectoryCache::new(config, store, None).await?; + + // Cache miss + let dest1 = temp_dir.path().join("dest1"); + cache.get_or_create(dir_digest, &dest1).await?; + + // Cache hit + let dest2 = temp_dir.path().join("dest2"); + cache.get_or_create(dir_digest, &dest2).await?; + + // ref_count should be 0 after both operations + let stats = cache.stats().await; + assert_eq!(stats.in_use_entries, 0, "ref_count should be 0 after all operations"); + + Ok(()) + } + + #[tokio::test] + async fn test_size_based_eviction() -> Result<(), Error> { + let temp_dir = TempDir::new().unwrap(); + let cache_root = temp_dir.path().join("cache"); + let (store, digest_a, digest_b) = setup_two_digest_store().await; + + let config = DirectoryCacheConfig { + max_entries: 100, // High entry limit + max_size_bytes: 20, // Very small — forces size-based eviction + cache_root: cache_root.clone(), + }; + + let cache = DirectoryCache::new(config, store, None).await?; + + // Insert entry A (14 bytes for "File A content") + let dest_a = temp_dir.path().join("dest_a"); + cache.get_or_create(digest_a, &dest_a).await?; + assert_eq!(cache.stats().await.entries, 1); + + // Insert entry B 
(16 bytes for "File B content!!") — total would be 30 > 20, + // so A should be evicted + let dest_b = temp_dir.path().join("dest_b"); + cache.get_or_create(digest_b, &dest_b).await?; + assert_eq!(cache.stats().await.entries, 1); + + // A should have been evicted + let cache_map = cache.cache.read().await; + assert!( + !cache_map.contains_key(&digest_a), + "Digest A should have been evicted due to size limit" + ); + assert!( + cache_map.contains_key(&digest_b), + "Digest B should be present" + ); + + Ok(()) + } + + #[tokio::test] + async fn test_merkle_tree_metadata_roundtrip() -> Result<(), Error> { + // Test serialization/deserialization of MerkleTreeMetadata + let mut digest_to_relpath = HashMap::new(); + let d1 = DigestInfo::try_new( + "aaaa567890abcdef1234567890abcdef1234567890abcdef1234567890abcdef", + 100, + ) + .unwrap(); + let d2 = DigestInfo::try_new( + "bbbb567890abcdef1234567890abcdef1234567890abcdef1234567890abcdef", + 200, + ) + .unwrap(); + + digest_to_relpath.insert(d1, String::new()); // root + digest_to_relpath.insert(d2, "subdir/nested".to_string()); + + let meta = MerkleTreeMetadata { digest_to_relpath }; + let serialized = meta.serialize(); + let deserialized = MerkleTreeMetadata::deserialize(&serialized)?; + + assert_eq!(deserialized.digest_to_relpath.len(), 2); + assert_eq!(deserialized.digest_to_relpath.get(&d1).unwrap(), ""); + assert_eq!( + deserialized.digest_to_relpath.get(&d2).unwrap(), + "subdir/nested" + ); + + Ok(()) + } + + #[tokio::test] + async fn test_merkle_tree_metadata_from_directory_tree() -> Result<(), Error> { + // Build a small directory tree and verify MerkleTreeMetadata generation + let file_digest = DigestInfo::try_new( + "dffd6021bb2bd5b0af676290809ec3a53191dd81c7f70a4b28688a362182986f", + 13, + ) + .unwrap(); + + // Child directory + let child_dir = ProtoDirectory { + files: vec![FileNode { + name: "child_file.txt".to_string(), + digest: Some(file_digest.into()), + ..Default::default() + }], + ..Default::default() + }; + let mut child_data = Vec::new(); + child_dir.encode(&mut child_data).unwrap(); + let child_digest = DigestInfo::try_new( + "cccc567890abcdef1234567890abcdef1234567890abcdef1234567890abcdef", + child_data.len() as i64, + ) + .unwrap(); + + // Root directory referencing the child + let root_dir = ProtoDirectory { + files: vec![FileNode { + name: "root_file.txt".to_string(), + digest: Some(file_digest.into()), + ..Default::default() + }], + directories: vec![DirectoryNode { + name: "child".to_string(), + digest: Some(child_digest.into()), + }], + ..Default::default() + }; + let mut root_data = Vec::new(); + root_dir.encode(&mut root_data).unwrap(); + let root_digest = DigestInfo::try_new( + "1234567890abcdef1234567890abcdef1234567890abcdef1234567890abcdef", + root_data.len() as i64, + ) + .unwrap(); + + let mut tree = HashMap::new(); + tree.insert(root_digest, root_dir); + tree.insert(child_digest, child_dir); + + let meta = MerkleTreeMetadata::from_directory_tree(&tree, &root_digest); + assert_eq!(meta.digest_to_relpath.len(), 2); + assert_eq!(meta.digest_to_relpath.get(&root_digest).unwrap(), ""); + assert_eq!(meta.digest_to_relpath.get(&child_digest).unwrap(), "child"); + + Ok(()) + } + + #[tokio::test] + async fn test_parse_digest_from_dirname() -> Result<(), Error> { + // Valid format: hash-size + let name = "aaaa567890abcdef1234567890abcdef1234567890abcdef1234567890abcdef-100"; + let parsed = DirectoryCache::parse_digest_from_dirname(name); + assert!(parsed.is_some()); + let d = parsed.unwrap(); + 
assert_eq!(d.size_bytes(), 100); + + // Invalid: no dash + assert!(DirectoryCache::parse_digest_from_dirname("nodashhere").is_none()); + + // Invalid: not a number after dash + assert!(DirectoryCache::parse_digest_from_dirname("hash-notanumber").is_none()); + + // Invalid: empty + assert!(DirectoryCache::parse_digest_from_dirname("").is_none()); + + Ok(()) + } + + #[tokio::test] + async fn test_merkle_metadata_stored_on_construction() -> Result<(), Error> { + let temp_dir = TempDir::new().unwrap(); + let cache_root = temp_dir.path().join("cache"); + let (store, dir_digest) = setup_test_store().await; + + let config = DirectoryCacheConfig { + max_entries: 10, + max_size_bytes: 1024 * 1024, + cache_root: cache_root.clone(), + }; + + let cache = DirectoryCache::new(config, store, None).await?; + + // Construct a directory (serial path, no FastSlowStore) + let dest = temp_dir.path().join("dest"); + cache.get_or_create(dir_digest, &dest).await?; + + // Merkle metadata file should NOT exist because we don't have + // FastSlowStore (resolve_directory_tree requires it). + // This is expected -- subtree indexing is only available with + // the fast path. + let cache_path = cache.get_cache_path(&dir_digest); + let merkle_path = cache_path.join(MERKLE_METADATA_FILENAME); + // Without FastSlowStore, no merkle metadata is generated + assert!( + !merkle_path.exists(), + "Merkle metadata should not exist without FastSlowStore" + ); + + Ok(()) + } + + #[tokio::test] + async fn test_subtree_index_populated_and_cleaned_on_eviction() -> Result<(), Error> { + let temp_dir = TempDir::new().unwrap(); + let cache_root = temp_dir.path().join("cache"); + let (store, digest_a, digest_b) = setup_two_digest_store().await; + + let config = DirectoryCacheConfig { + max_entries: 1, + max_size_bytes: 0, + cache_root: cache_root.clone(), + }; + + let cache = DirectoryCache::new(config, store, None).await?; + + // Insert entry A + let dest_a = temp_dir.path().join("dest_a"); + cache.get_or_create(digest_a, &dest_a).await?; + + // Without FastSlowStore, subtree index should be empty (no merkle tree resolved) + { + let index = cache.subtree_index.read().await; + assert!( + index.is_empty(), + "Subtree index should be empty without FastSlowStore" + ); + } + + // Insert entry B (evicts A) + let dest_b = temp_dir.path().join("dest_b"); + cache.get_or_create(digest_b, &dest_b).await?; + assert_eq!(cache.stats().await.entries, 1); + + Ok(()) + } + + #[tokio::test] + async fn test_cache_reload_from_disk() -> Result<(), Error> { + let temp_dir = TempDir::new().unwrap(); + let cache_root = temp_dir.path().join("cache"); + let (store, dir_digest) = setup_test_store().await; + + // Create a cache and populate it + { + let config = DirectoryCacheConfig { + max_entries: 10, + max_size_bytes: 1024 * 1024, + cache_root: cache_root.clone(), + }; + let cache = DirectoryCache::new(config, store.clone(), None).await?; + let dest = temp_dir.path().join("dest1"); + cache.get_or_create(dir_digest, &dest).await?; + assert_eq!(cache.stats().await.entries, 1); + } + + // Create a NEW cache pointing to the same cache_root -- it should + // reload the existing entry from disk. 
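+ // (How reload is expected to work, stated as an assumption from this
+ // module's API: DirectoryCache::new scans cache_root and re-derives each
+ // digest from its "{hash}-{size}" directory name via parse_digest_from_dirname.)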
+ { + let config = DirectoryCacheConfig { + max_entries: 10, + max_size_bytes: 1024 * 1024, + cache_root: cache_root.clone(), + }; + let cache = DirectoryCache::new(config, store, None).await?; + assert_eq!( + cache.stats().await.entries, + 1, + "Cache should have reloaded the entry from disk" + ); + + // The reloaded entry should be usable (cache hit) + let dest2 = temp_dir.path().join("dest2"); + let hit = cache.get_or_create(dir_digest, &dest2).await?; + assert!(hit, "Reloaded entry should produce a cache hit"); + assert!(dest2.join("test.txt").exists()); + } + + Ok(()) + } } diff --git a/nativelink-worker/src/local_worker.rs b/nativelink-worker/src/local_worker.rs index c8e5f76f6..04d88ff7e 100644 --- a/nativelink-worker/src/local_worker.rs +++ b/nativelink-worker/src/local_worker.rs @@ -17,7 +17,7 @@ use core::str; use core::sync::atomic::{AtomicU64, Ordering}; use core::time::Duration; use std::borrow::Cow; -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::env; use std::process::Stdio; use std::sync::{Arc, Weak}; @@ -31,18 +31,21 @@ use nativelink_metric::{MetricsComponent, RootMetricsComponent}; use nativelink_proto::com::github::trace_machina::nativelink::remote_execution::update_for_worker::Update; use nativelink_proto::com::github::trace_machina::nativelink::remote_execution::worker_api_client::WorkerApiClient; use nativelink_proto::com::github::trace_machina::nativelink::remote_execution::{ - ExecuteComplete, ExecuteResult, GoingAwayRequest, KeepAliveRequest, UpdateForWorker, - execute_result, + BlobDigestInfo, BlobsAvailableNotification, ExecuteComplete, ExecuteResult, GoingAwayRequest, + KeepAliveRequest, UpdateForWorker, execute_result, }; use nativelink_store::fast_slow_store::FastSlowStore; +use nativelink_store::filesystem_store::FilesystemStore; use nativelink_util::action_messages::{ActionResult, ActionStage, OperationId}; -use nativelink_util::common::fs; +use nativelink_util::common::{DigestInfo, fs}; use nativelink_util::digest_hasher::DigestHasherFunc; use nativelink_util::metrics_utils::{AsyncCounterWrapper, CounterWithTime}; use nativelink_util::shutdown_guard::ShutdownGuard; -use nativelink_util::store_trait::Store; +use nativelink_util::store_trait::{ItemCallback, Store, StoreDriver, StoreKey}; +use nativelink_util::task::JoinHandleDropGuard; use nativelink_util::{spawn, tls_utils}; use opentelemetry::context::Context; +use parking_lot::Mutex; use tokio::process; use tokio::sync::{broadcast, mpsc}; use tokio::time::sleep; @@ -57,6 +60,114 @@ use crate::running_actions_manager::{ use crate::worker_api_client_wrapper::{WorkerApiClientTrait, WorkerApiClientWrapper}; use crate::worker_utils::make_connect_worker_request; +/// Default interval for periodic BlobsAvailable reports (milliseconds). +const DEFAULT_BLOBS_AVAILABLE_INTERVAL_MS: u64 = 500; + +/// Returns the current CPU load as a percentage (load_avg_1m / num_cpus * 100). +/// Returns 0 if the load cannot be determined. +fn get_cpu_load_pct() -> u32 { + let num_cpus = std::thread::available_parallelism() + .map(|n| n.get() as f64) + .unwrap_or(1.0); + let mut loadavg: [f64; 1] = [0.0]; + // SAFETY: getloadavg writes at most `nelem` doubles into the array. + let ret = unsafe { libc::getloadavg(loadavg.as_mut_ptr(), 1) }; + if ret < 1 { + return 0; + } + let pct = (loadavg[0] / num_cpus * 100.0).round() as u32; + // Clamp to a reasonable maximum (can exceed 100 on overloaded systems). + pct.min(1000) +} + +/// Build the advertised gRPC endpoint for peer blob sharing. 
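+/// A resulting endpoint looks like `grpc://worker-01.local:50081` (hostname
+/// and port here are illustrative only).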
+/// Uses the machine's hostname so a single config works across all workers.
+/// The hostname is resolved once and cached for the lifetime of the process.
+fn cas_advertised_endpoint(port: u16) -> String {
+ use std::sync::OnceLock;
+ static HOSTNAME: OnceLock<String> = OnceLock::new();
+ let hostname = HOSTNAME.get_or_init(|| {
+ match hostname::get() {
+ Ok(h) => {
+ let name = h.to_string_lossy().into_owned();
+ // Append .local for mDNS resolution if the hostname is bare
+ // (no dots), so the server can resolve it via multicast DNS.
+ if name.contains('.') {
+ name
+ } else {
+ format!("{name}.local")
+ }
+ }
+ Err(err) => {
+ error!(
+ ?err,
+ "hostname::get() failed, using 'localhost' — peer blob sharing will not work across machines"
+ );
+ "localhost".to_string()
+ }
+ }
+ });
+ format!("grpc://{hostname}:{port}")
+}
+
+/// Accumulated blob changes between BlobsAvailable ticks.
+#[derive(Debug, Default)]
+pub struct BlobChanges {
+ /// digest → last_access_timestamp (unix seconds).
+ pub added: HashMap<DigestInfo, i64>,
+ pub evicted: HashSet<DigestInfo>,
+}
+
+/// Tracks inserts and evictions from the FilesystemStore between ticks.
+/// Registered as a callback on the FilesystemStore's evicting map.
+#[derive(Debug)]
+pub struct BlobChangeTracker {
+ pending: Mutex<BlobChanges>,
+}
+
+impl BlobChangeTracker {
+ pub fn new() -> Arc<Self> {
+ Arc::new(Self {
+ pending: Mutex::new(BlobChanges::default()),
+ })
+ }
+
+ /// Atomically swap out accumulated changes, returning them.
+ /// The internal state is replaced with an empty BlobChanges.
+ pub fn swap(&self) -> BlobChanges {
+ let mut pending = self.pending.lock();
+ std::mem::take(&mut *pending)
+ }
+}
+
+impl ItemCallback for BlobChangeTracker {
+ // On evict: add to evicted, remove from added (cancel out insert+evict).
+ fn callback<'a>(
+ &'a self,
+ store_key: StoreKey<'a>,
+ ) -> Pin<Box<dyn Future<Output = ()> + Send + 'a>> {
+ if let StoreKey::Digest(digest) = store_key {
+ let mut pending = self.pending.lock();
+ pending.added.remove(&digest);
+ pending.evicted.insert(digest);
+ }
+ Box::pin(core::future::ready(()))
+ }
+
+ // On insert: add to added, remove from evicted (cancel out evict+reinsert).
+ fn on_insert(&self, store_key: StoreKey<'_>, _size: u64) {
+ if let StoreKey::Digest(digest) = store_key {
+ let ts = std::time::SystemTime::now()
+ .duration_since(std::time::UNIX_EPOCH)
+ .map(|d| d.as_secs() as i64)
+ .unwrap_or(0);
+ let mut pending = self.pending.lock();
+ pending.evicted.remove(&digest);
+ pending.added.insert(digest, ts);
+ }
+ }
+}
+
 /// Amount of time to wait if we have actions in transit before we try to
 /// consider an error to have occurred.
 const ACTIONS_IN_TRANSIT_TIMEOUT_S: f32 = 10.;
@@ -74,6 +185,20 @@ const DEFAULT_ENDPOINT_TIMEOUT_S: f32 = 5.;
 const DEFAULT_MAX_ACTION_TIMEOUT: Duration = Duration::from_secs(1200); // 20 mins.
 const DEFAULT_MAX_UPLOAD_TIMEOUT: Duration = Duration::from_secs(600); // 10 mins.

+/// Holds the FilesystemStore reference and change tracker needed for
+/// periodic BlobsAvailable reporting.
+#[derive(Clone, Debug)]
+pub struct BlobsAvailableState {
+ /// Reference to the worker's local FilesystemStore (the fast store in FastSlowStore).
+ fs_store: Arc<FilesystemStore>,
+ /// Tracks inserted and evicted digests between periodic ticks.
+ tracker: Arc<BlobChangeTracker>,
+ /// The worker's CAS endpoint for peer serving (e.g. "grpc://192.168.191.5:50081").
+ cas_endpoint: String,
+ /// How often to send periodic BlobsAvailable (0 = disabled).
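+ /// A value of 0 in the worker config falls back to
+ /// DEFAULT_BLOBS_AVAILABLE_INTERVAL_MS (500 ms); see new_local_worker.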
+ interval: Duration,
+}
+
 struct LocalWorkerImpl<'a, T: WorkerApiClientTrait + 'static, U: RunningActionsManager> {
 config: &'a LocalWorkerConfig,
 // According to the tonic documentation it is a cheap operation to clone this.
@@ -86,6 +211,8 @@ struct LocalWorkerImpl<'a, T: WorkerApiClientTrait + 'static, U: RunningActionsM
 // on by the scheduler.
 actions_in_transit: Arc<AtomicU64>,
 metrics: Arc<Metrics>,
+ /// State for periodic BlobsAvailable reporting. None if disabled (no CAS endpoint).
+ blobs_available_state: Option<BlobsAvailableState>,
 }

 pub async fn preconditions_met(
@@ -146,6 +273,7 @@ impl<'a, T: WorkerApiClientTrait + 'static, U: RunningActionsManager> LocalWorke
 worker_id: String,
 running_actions_manager: Arc<U>,
 metrics: Arc<Metrics>,
+ blobs_available_state: Option<BlobsAvailableState>,
 ) -> Self {
 Self {
 config,
@@ -158,6 +286,7 @@ impl<'a, T: WorkerApiClientTrait + 'static, U: RunningActionsManager> LocalWorke
 // on by the scheduler.
 actions_in_transit: Arc::new(AtomicU64::new(0)),
 metrics,
+ blobs_available_state,
 }
 }
@@ -175,7 +304,11 @@ impl<'a, T: WorkerApiClientTrait + 'static, U: RunningActionsManager> LocalWorke
 // We always send 2 keep alive requests per timeout. Http2 should manage most of our
 // timeout issues, this is a secondary check to ensure we can still send data.
 sleep(Duration::from_secs_f32(timeout / 2.)).await;
- if let Err(e) = grpc_client.keep_alive(KeepAliveRequest {}).await {
+ let load = get_cpu_load_pct();
+ debug!("KeepAlive cpu_load_pct={load}");
+ if let Err(e) = grpc_client.keep_alive(KeepAliveRequest {
+ cpu_load_pct: load,
+ }).await {
 return Err(make_err!(
 Code::Internal,
 "Failed to send KeepAlive in LocalWorker : {:?}",
@@ -185,6 +318,125 @@ impl<'a, T: WorkerApiClientTrait + 'static, U: RunningActionsManager> LocalWorke
 }
 }

+ /// Sends a periodic BlobsAvailable notification.
+ /// - First tick: full snapshot of all digests with timestamps (scans store once).
+ /// Also sends a full subtree snapshot with ALL subtree digests.
+ /// - Subsequent ticks: delta from callback-accumulated changes (no scan).
+ /// Sends delta-encoded subtree changes (added/removed).
+ async fn send_periodic_blobs_available(
+ grpc_client: &mut T,
+ state: &BlobsAvailableState,
+ running_actions_manager: &Arc<U>,
+ is_first: bool,
+ ) {
+ let (digest_infos, evicted_digests) = if is_first {
+ // Full snapshot: scan everything once.
+ let all = state.fs_store.get_all_digests_with_timestamps();
+ // Drain any changes that accumulated during startup.
+ drop(state.tracker.swap());
+
+ let infos: Vec<BlobDigestInfo> = all
+ .iter()
+ .map(|(digest, ts)| BlobDigestInfo {
+ digest: Some((*digest).into()),
+ last_access_timestamp: *ts,
+ })
+ .collect();
+
+ (infos, Vec::new())
+ } else {
+ // Delta: swap out accumulated changes.
+ let changes = state.tracker.swap();
+ // Even if there are no blob changes, we may still have subtree
+ // changes to report; the combined "skip if nothing changed" check
+ // happens below.
+
+ let infos: Vec<BlobDigestInfo> = changes
+ .added
+ .iter()
+ .map(|(digest, &ts)| BlobDigestInfo {
+ digest: Some((*digest).into()),
+ last_access_timestamp: ts,
+ })
+ .collect();
+ let evicted_protos = changes.evicted.iter().map(|d| (*d).into()).collect();
+
+ (infos, evicted_protos)
+ };
+
+ // Collect subtree delta or full snapshot.
+ let (cached_directory_digests, added_subtree_digests, removed_subtree_digests, is_full_subtree_snapshot) = if is_first {
+ // Full subtree snapshot: send ALL subtree digests in cached_directory_digests.
+ // Also drain any pending changes accumulated during startup.
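+ // (drop() below only discards the drained delta: after a full snapshot
+ // the pre-startup changes are redundant.)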
+ drop(running_actions_manager.take_pending_subtree_changes().await);
+ let all_subtrees = running_actions_manager.all_subtree_digests().await;
+ let all_subtree_protos = all_subtrees.into_iter().map(|d| d.into()).collect();
+ (all_subtree_protos, Vec::new(), Vec::new(), true)
+ } else {
+ // Delta: take pending subtree changes.
+ let (added, removed) = running_actions_manager.take_pending_subtree_changes().await;
+ let added_protos = added.into_iter().map(|d| d.into()).collect();
+ let removed_protos = removed.into_iter().map(|d| d.into()).collect();
+ (Vec::new(), added_protos, removed_protos, false)
+ };
+
+ let new_or_touched_count = digest_infos.len();
+ let evicted_count = evicted_digests.len();
+ let cached_dir_count = cached_directory_digests.len();
+ let added_subtree_count = added_subtree_digests.len();
+ let removed_subtree_count = removed_subtree_digests.len();
+
+ // Skip sending if there are truly no changes at all.
+ if !is_first
+ && new_or_touched_count == 0
+ && evicted_count == 0
+ && added_subtree_count == 0
+ && removed_subtree_count == 0
+ {
+ trace!("BlobsAvailable: no changes since last tick, skipping");
+ return;
+ }
+
+ let load = get_cpu_load_pct();
+ debug!("BlobsAvailable cpu_load_pct={load}");
+ let notification = BlobsAvailableNotification {
+ worker_cas_endpoint: state.cas_endpoint.clone(),
+ digests: Vec::new(),
+ is_full_snapshot: is_first,
+ evicted_digests,
+ digest_infos,
+ cpu_load_pct: load,
+ cached_directory_digests,
+ added_subtree_digests,
+ removed_subtree_digests,
+ is_full_subtree_snapshot,
+ };
+
+ if let Err(err) = grpc_client.blobs_available(notification).await {
+ warn!(
+ ?err,
+ new_or_touched_count,
+ evicted_count,
+ cached_dir_count,
+ added_subtree_count,
+ removed_subtree_count,
+ is_first,
+ "Failed to send periodic BlobsAvailable"
+ );
+ } else {
+ debug!(
+ new_or_touched_count,
+ evicted_count,
+ cached_dir_count,
+ added_subtree_count,
+ removed_subtree_count,
+ is_first,
+ "Sent periodic BlobsAvailable"
+ );
+ }
+ }

 async fn run(
 &self,
 update_for_worker_stream: Streaming<UpdateForWorker>,
@@ -205,6 +457,32 @@ impl<'a, T: WorkerApiClientTrait + 'static, U: RunningActionsManager> LocalWorke
 let mut futures = FuturesUnordered::new();
 futures.push(self.start_keep_alive().boxed());

+ // Start periodic BlobsAvailable reporting if configured.
+ if let Some(ref state) = self.blobs_available_state {
+ if !state.interval.is_zero() {
+ let mut grpc_client = self.grpc_client.clone();
+ let state = state.clone();
+ let ram = self.running_actions_manager.clone();
+ futures.push(
+ async move {
+ let mut is_first = true;
+ loop {
+ sleep(state.interval).await;
+ Self::send_periodic_blobs_available(
+ &mut grpc_client,
+ &state,
+ &ram,
+ is_first,
+ )
+ .await;
+ is_first = false;
+ }
+ }
+ .boxed(),
+ );
+ }
+ }
+
 let (add_future_channel, add_future_rx) = mpsc::unbounded_channel();
 let mut add_future_rx = UnboundedReceiverStream::new(add_future_rx).fuse();

@@ -248,6 +526,44 @@ impl<'a, T: WorkerApiClientTrait + 'static, U: RunningActionsManager> LocalWorke
 );
 }
 }
+ Update::TouchBlobs(touch_request) => {
+ // Touch blobs in the local store to update access times
+ // and prevent premature eviction of referenced blobs.
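+ // Sketch of the assumed mechanism: EvictingMap orders entries by
+ // last-access time, and a has()/has_with_results() lookup refreshes
+ // that timestamp without reading blob data, so "touching" is just a
+ // batched existence check.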
+ let digest_count = touch_request.digests.len();
+ trace!(digest_count, "Received TouchBlobs request");
+ if let Some(ref state) = self.blobs_available_state {
+ let fs_store = state.fs_store.clone();
+ let digests: Vec<DigestInfo> = touch_request
+ .digests
+ .into_iter()
+ .filter_map(|d| DigestInfo::try_from(d).ok())
+ .collect();
+ // Best-effort: call has() on each digest to update
+ // the EvictingMap's LRU access time.
+ let keys: Vec<StoreKey<'static>> = digests
+ .iter()
+ .map(|d| StoreKey::from(*d))
+ .collect();
+ let mut results = vec![None; keys.len()];
+ if let Err(err) = Pin::new(fs_store.as_ref())
+ .has_with_results(&keys, &mut results)
+ .await
+ {
+ warn!(
+ ?err,
+ digest_count,
+ "TouchBlobs: failed to touch digests in FilesystemStore"
+ );
+ } else {
+ let found = results.iter().filter(|r| r.is_some()).count();
+ trace!(
+ digest_count,
+ found,
+ "TouchBlobs: touched digests in FilesystemStore"
+ );
+ }
+ }
+ }
 Update::StartAction(start_execute) => {
 // Don't accept any new requests if we're shutting down.
 if shutting_down {
@@ -297,10 +613,6 @@ impl<'a, T: WorkerApiClientTrait + 'static, U: RunningActionsManager> LocalWorke
 let actions_in_transit = self.actions_in_transit.clone();
 let worker_id = self.worker_id.clone();
 let running_actions_manager = self.running_actions_manager.clone();
- let mut grpc_client = self.grpc_client.clone();
- let complete = ExecuteComplete {
- operation_id: operation_id.clone(),
- };
 self.metrics.clone().wrap(move |metrics| async move {
 metrics.preconditions.wrap(preconditions_met(precondition_script_cfg, &extra_envs))
 .and_then(|()| running_actions_manager.create_and_add_action(worker_id, start_execute))
@@ -319,18 +631,21 @@ impl<'a, T: WorkerApiClientTrait + 'static, U: RunningActionsManager> LocalWorke
 .clone()
 .prepare_action()
 .and_then(RunningAction::execute)
- .and_then(|result| async move {
- // Notify that execution has completed so it can schedule a new action.
- drop(grpc_client.execution_complete(complete).await);
- Ok(result)
- })
+ // upload_results now only uploads to the local fast store
+ // (FilesystemStore). The remote CAS upload is deferred to
+ // the background after the result is reported.
 .and_then(RunningAction::upload_results)
 .and_then(RunningAction::get_finished_result)
- // Note: We need ensure we run cleanup even if one of the other steps fail.
 .then(|result| async move {
- if let Err(e) = action.cleanup().await {
- return Result::<ActionResult, Error>::Err(e).merge(result);
- }
+ // Spawn cleanup in the background — it only removes
+ // the work directory (files already renamed into CAS).
+ // The cleaning_up_operations + wait_for_cleanup mechanism
+ // handles the race if the same action is retried.
+ tokio::spawn(async move { + if let Err(e) = action.cleanup().await { + error!(?e, "Background cleanup failed"); + } + }); result }) }).await @@ -339,24 +654,87 @@ impl<'a, T: WorkerApiClientTrait + 'static, U: RunningActionsManager> LocalWorke let make_publish_future = { let mut grpc_client = self.grpc_client.clone(); + let cas_endpoint_for_notify = self.config.cas_server_port + .map(|port| cas_advertised_endpoint(port)) + .unwrap_or_default(); let running_actions_manager = self.running_actions_manager.clone(); + let exec_load = get_cpu_load_pct(); + debug!("ExecuteComplete cpu_load_pct={exec_load}"); + let complete = ExecuteComplete { + operation_id: operation_id.clone(), + cpu_load_pct: exec_load, + }; move |res: Result| async move { let instance_name = maybe_instance_name .err_tip(|| "`instance_name` could not be resolved; this is likely an internal error in local_worker.")?; match res { Ok(mut action_result) => { - // Save in the action cache before notifying the scheduler that we've completed. - if let Some(digest_info) = action_digest.clone().and_then(|action_digest| action_digest.try_into().ok()) { - if let Err(err) = running_actions_manager.cache_action_result(digest_info, &mut action_result, digest_hasher).await { - error!( - ?err, - ?action_digest, - "Error saving action in store", - ); + // Collect output digests upfront so both futures + // can proceed without borrowing action_result. + let output_digests: Vec<_> = { + let mut v = Vec::new(); + if !cas_endpoint_for_notify.is_empty() { + for file in &action_result.output_files { + v.push(file.digest.into()); + } + for folder in &action_result.output_folders { + v.push(folder.tree_digest.into()); + } + if action_result.stdout_digest.size_bytes() > 0 { + v.push(action_result.stdout_digest.into()); + } + if action_result.stderr_digest.size_bytes() > 0 { + v.push(action_result.stderr_digest.into()); + } } - } - let action_stage = ActionStage::Completed(action_result); + v + }; + + // 1. BlobsAvailableNotif and cache_action_result run + // concurrently — they use independent connections + // (worker API stream vs AC/historical stores). + let blobs_fut = async { + if !output_digests.is_empty() { + let load = get_cpu_load_pct(); + debug!("BlobsAvailable cpu_load_pct={load}"); + if let Err(err) = grpc_client.blobs_available( + BlobsAvailableNotification { + worker_cas_endpoint: cas_endpoint_for_notify.clone(), + digests: output_digests, + is_full_snapshot: false, + evicted_digests: Vec::new(), + digest_infos: Vec::new(), + cpu_load_pct: load, + cached_directory_digests: Vec::new(), + added_subtree_digests: Vec::new(), + removed_subtree_digests: Vec::new(), + is_full_subtree_snapshot: false, + } + ).await { + warn!(?err, "Failed to send blobs_available notification"); + } + } + }; + let cache_fut = async { + if let Some(digest_info) = action_digest.clone().and_then(|action_digest| action_digest.try_into().ok()) { + if let Err(err) = running_actions_manager.cache_action_result(digest_info, &mut action_result, digest_hasher).await { + error!( + ?err, + ?action_digest, + "Error saving action in store", + ); + } + } + }; + tokio::join!(blobs_fut, cache_fut); + + // 2. Notify scheduler that execution is complete + // so it can schedule new work on this worker. + drop(grpc_client.execution_complete(complete).await); + + // 3. Send execution response with the action result. 
+ let action_stage = ActionStage::Completed(action_result.clone());
 grpc_client.execution_response(
 ExecuteResult{
 instance_name,
@@ -366,8 +744,30 @@ impl<'a, T: WorkerApiClientTrait + 'static, U: RunningActionsManager> LocalWorke
 )
 .await
 .err_tip(|| "Error while calling execution_response")?;
+
+ // 4. Upload output blobs from local CAS to remote
+ // CAS in the background. This is fire-and-forget;
+ // peers can already serve the blobs directly.
+ running_actions_manager.spawn_upload_to_remote(&action_result);
 },
 Err(e) => {
+ // Still notify completion on error so the worker
+ // is freed for new work.
+ drop(grpc_client.execution_complete(complete).await);
+
+ let e = if e.code == Code::NotFound {
+ // Per REAPI spec, missing inputs should return
+ // FAILED_PRECONDITION so the client re-uploads.
+ let mut err = make_err!(
+ Code::FailedPrecondition,
+ "One or more input blobs missing: {}",
+ e.message_string()
+ );
+ err.details = e.details;
+ err
+ } else {
+ e
+ };
 grpc_client.execution_response(ExecuteResult{
 instance_name,
 operation_id,
@@ -467,6 +867,11 @@ pub struct LocalWorker<T: WorkerApiClientTrait + 'static, U: RunningActionsManag
 sleep_fn: Option<Box<dyn Fn(Duration) -> BoxFuture<'static, ()> + Send + Sync>>,
 metrics: Arc<Metrics>,
+ /// State for periodic BlobsAvailable reporting.
+ blobs_available_state: Option<BlobsAvailableState>,
+ /// Guard for the worker CAS server task. Keeps the task alive as long as
+ /// the `LocalWorker` is alive. When dropped, the CAS server is aborted.
+ _cas_server_guard: Option<JoinHandleDropGuard<Result<(), Error>>>,
 }

 impl<
@@ -534,7 +939,48 @@ pub async fn new_local_worker(
 Duration::from_secs(config.max_upload_timeout as u64)
 };

- // Initialize directory cache if configured
+ // If peer blob sharing is configured (cas_server_port is set), create a
+ // worker-local locality map and wrap the slow store with WorkerProxyStore.
+ // This enables workers to fetch blobs from peers instead of the central CAS.
+ let (effective_cas_store, peer_locality_map) = if config.cas_server_port.is_some() {
+ let locality_map = nativelink_util::blob_locality_map::new_shared_blob_locality_map();
+
+ // Wrap the slow store (central CAS) with WorkerProxyStore.
+ // Enable racing so the worker races peer fetches against server fetches.
+ let slow_store = fast_slow_store.slow_store().clone();
+ let mut proxy_arc =
+ nativelink_store::worker_proxy_store::WorkerProxyStore::new(
+ slow_store,
+ locality_map.clone(),
+ );
+ Arc::get_mut(&mut proxy_arc)
+ .expect("WorkerProxyStore just created, no other refs")
+ .enable_race_peers();
+ let proxy_store = Store::new(proxy_arc);
+
+ // Build a new FastSlowStore: fast=local disk, slow=WorkerProxyStore(central CAS).
+ // Preserve the original store's direction config so that e.g.
+ // slow_direction=get prevents uploads from propagating to the server.
+ let fast_store = fast_slow_store.fast_store().clone();
+ let fss_spec = nativelink_config::stores::FastSlowSpec {
+ fast: nativelink_config::stores::StoreSpec::Noop(Default::default()),
+ slow: nativelink_config::stores::StoreSpec::Noop(Default::default()),
+ fast_direction: fast_slow_store.fast_direction(),
+ slow_direction: fast_slow_store.slow_direction(),
+ };
+ let new_fss = FastSlowStore::new(&fss_spec, fast_store, proxy_store);
+ info!(
+ "Peer blob sharing enabled: wrapping slow store with WorkerProxyStore"
+ );
+
+ (new_fss, Some(locality_map))
+ } else {
+ (fast_slow_store.clone(), None)
+ };
+
+ // Initialize directory cache if configured.
+ // This is done after effective_cas_store is created so the cache can use
+ // the same FastSlowStore (with WorkerProxyStore) for batch downloads.
 let directory_cache = if let Some(cache_config) = &config.directory_cache {
 use std::path::PathBuf;
@@ -557,7 +1003,11 @@ pub async fn new_local_worker(
 cache_root,
 };

- match DirectoryCache::new(worker_cache_config, Store::new(fast_slow_store.clone())).await {
+ match DirectoryCache::new(
+ worker_cache_config,
+ Store::new(effective_cas_store.clone()),
+ Some(effective_cas_store.clone()),
+ ).await {
 Ok(cache) => {
 tracing::info!("Directory cache initialized successfully");
 Some(Arc::new(cache))
@@ -571,6 +1021,8 @@ pub async fn new_local_worker(
 None
 };

+ let effective_cas_store_for_cas_server = effective_cas_store.clone();
+
 let running_actions_manager =
 Arc::new(RunningActionsManagerImpl::new(RunningActionsManagerArgs {
 root_action_directory: config.work_directory.clone(),
@@ -578,7 +1030,7 @@ pub async fn new_local_worker(
 entrypoint,
 additional_environment: config.additional_environment.clone(),
 },
- cas_store: fast_slow_store,
+ cas_store: effective_cas_store,
 ac_store,
 historical_store,
 upload_action_result_config: &config.upload_action_result,
@@ -586,7 +1038,110 @@ pub async fn new_local_worker(
 max_upload_timeout,
 timeout_handled_externally: config.timeout_handled_externally,
 directory_cache,
+ peer_locality_map: peer_locality_map.clone(),
 })?);
+
+ // Set up periodic BlobsAvailable reporting if we have a CAS port.
+ let blobs_available_state = if config.cas_server_port.is_some() {
+ // Try to get a reference to the FilesystemStore (the fast store in FastSlowStore).
+ let fs_store_opt: Option<Arc<FilesystemStore>> = fast_slow_store
+ .fast_store()
+ .downcast_ref::<FilesystemStore>(None)
+ .and_then(|fs| fs.get_arc());
+
+ if let Some(fs_store) = fs_store_opt {
+ let interval_ms = if config.blobs_available_interval_ms == 0 {
+ DEFAULT_BLOBS_AVAILABLE_INTERVAL_MS
+ } else {
+ config.blobs_available_interval_ms
+ };
+ let cas_endpoint = config
+ .cas_server_port
+ .map(|port| cas_advertised_endpoint(port))
+ .unwrap_or_default();
+
+ // Create change tracker and register it on the FilesystemStore.
+ let tracker = BlobChangeTracker::new();
+ if let Err(err) = fs_store
+ .clone()
+ .register_item_callback(tracker.clone())
+ {
+ warn!(?err, "Failed to register blob change tracker on FilesystemStore");
+ } else {
+ info!(
+ interval_ms,
+ "Registered periodic BlobsAvailable reporting with callback-based change tracking"
+ );
+ }
+
+ Some(BlobsAvailableState {
+ fs_store,
+ tracker,
+ cas_endpoint,
+ interval: Duration::from_millis(interval_ms),
+ })
+ } else {
+ warn!("FastSlowStore's fast store is not a FilesystemStore; periodic BlobsAvailable reporting disabled");
+ None
+ }
+ } else {
+ None
+ };
+
+ // Start a CAS + ByteStream gRPC server for peer blob sharing if configured.
+ // Serves the effective_cas_store (which includes WorkerProxyStore) so that
+ // reads can be proxied to peers when the local store doesn't have the blob.
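+ // Assumed request flow, for orientation: a peer worker dials this
+ // endpoint's ByteStream/CAS services; a hit is served from the local
+ // fast store, while a miss falls through WorkerProxyStore to peers or
+ // the central CAS.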
+ let cas_server_guard = if let Some(cas_port) = config.cas_server_port { + let cas_store = Store::new(effective_cas_store_for_cas_server); + let store_manager = Arc::new(nativelink_store::store_manager::StoreManager::new()); + store_manager.add_store("worker_cas", cas_store); + + let cas_configs = vec![nativelink_config::cas_server::WithInstanceName { + instance_name: String::new(), + config: nativelink_config::cas_server::CasStoreConfig { + cas_store: "worker_cas".to_string(), + }, + }]; + let bytestream_configs = vec![nativelink_config::cas_server::WithInstanceName { + instance_name: String::new(), + config: nativelink_config::cas_server::ByteStreamConfig { + cas_store: "worker_cas".to_string(), + ..Default::default() + }, + }]; + + let cas_server = nativelink_service::cas_server::CasServer::new(&cas_configs, &store_manager) + .err_tip(|| "Failed to create worker CAS server")?; + let bytestream_server = + nativelink_service::bytestream_server::ByteStreamServer::new(&bytestream_configs, &store_manager) + .err_tip(|| "Failed to create worker ByteStream server")?; + + let addr: std::net::SocketAddr = ([0, 0, 0, 0], cas_port).into(); + let advertised = cas_advertised_endpoint(cas_port); + + let worker_name = config.name.clone(); + Some(spawn!("worker_cas_server", async move { + info!( + worker_name = %worker_name, + %addr, + %advertised, + "Starting worker CAS server for peer blob sharing" + ); + let result = tonic::transport::Server::builder() + .add_service(cas_server.into_service()) + .add_service(bytestream_server.into_service()) + .serve(addr) + .await + .map_err(|e| make_err!(Code::Internal, "Worker CAS server failed: {e:?}")); + if let Err(ref e) = result { + error!(%addr, ?e, "Worker CAS server exited with error"); + } + result + })) + } else { + None + }; + let local_worker = LocalWorker::new_with_connection_factory_and_actions_manager( config.clone(), running_actions_manager, @@ -618,6 +1173,8 @@ pub async fn new_local_worker( }) }), Box::new(move |d| Box::pin(sleep(d))), + blobs_available_state, + cas_server_guard, ); Ok(local_worker) } @@ -628,6 +1185,8 @@ impl LocalWorker, connection_factory: ConnectionFactory, sleep_fn: Box BoxFuture<'static, ()> + Send + Sync>, + blobs_available_state: Option, + cas_server_guard: Option>>, ) -> Self { let metrics = Arc::new(Metrics::new(Arc::downgrade( running_actions_manager.metrics(), @@ -638,6 +1197,8 @@ impl LocalWorker LocalWorker LocalWorker EvictingMap integration test + // --------------------------------------------------------------- + // Wires: EvictingMap -> ItemCallbackHolder -> BlobChangeTracker + // and verifies that inserts and evictions flow through correctly. + #[test] + fn test_blob_change_tracker_evicting_map_integration() { + use std::time::SystemTime; + + use nativelink_config::stores::EvictionPolicy; + use nativelink_store::callback_utils::ItemCallbackHolder; + use nativelink_util::evicting_map::{EvictingMap, LenEntry}; + use nativelink_util::store_trait::StoreKeyBorrow; + + // Simple value type for the EvictingMap. + #[derive(Clone, Debug)] + struct TestValue(u64); + + impl LenEntry for TestValue { + fn len(&self) -> u64 { + self.0 + } + fn is_empty(&self) -> bool { + self.0 == 0 + } + } + + let rt = tokio::runtime::Builder::new_current_thread() + .build() + .unwrap(); + + rt.block_on(async { + // Create an EvictingMap with max_bytes = 100. 
+ let evicting_map = EvictingMap::< + StoreKeyBorrow, + StoreKey<'static>, + TestValue, + SystemTime, + ItemCallbackHolder, + >::new( + &EvictionPolicy { + max_count: 0, + max_seconds: 0, + max_bytes: 100, + evict_bytes: 0, + }, + SystemTime::now(), + ); + + // Create a BlobChangeTracker and register it. + let tracker = BlobChangeTracker::new(); + let holder = ItemCallbackHolder::new(tracker.clone()); + evicting_map.add_item_callback(holder); + + let d1 = DigestInfo::new([1u8; 32], 30); + let d2 = DigestInfo::new([2u8; 32], 40); + + // Insert two items (total 70 bytes, under 100 limit). + let key1: StoreKeyBorrow = StoreKey::Digest(d1).into(); + let key2: StoreKeyBorrow = StoreKey::Digest(d2).into(); + evicting_map.insert(key1, TestValue(30)).await; + evicting_map.insert(key2, TestValue(40)).await; + + // Swap and verify both digests appear in `added`. + let changes = tracker.swap(); + assert_eq!( + changes.added.len(), + 2, + "Expected 2 added digests after initial inserts" + ); + assert!( + changes.added.contains_key(&d1), + "Expected d1 in added set" + ); + assert!( + changes.added.contains_key(&d2), + "Expected d2 in added set" + ); + assert!( + changes.evicted.is_empty(), + "Expected no evictions yet" + ); + + // Now insert a third item (50 bytes) — total would be 120 bytes, + // which exceeds max_bytes=100. This should trigger eviction of + // the least recently used item (d1, 30 bytes). + let d3 = DigestInfo::new([3u8; 32], 50); + let key3: StoreKeyBorrow = StoreKey::Digest(d3).into(); + evicting_map.insert(key3, TestValue(50)).await; + + // Allow background tasks to run (eviction callbacks are fire-and-forget). + tokio::task::yield_now().await; + + let changes = tracker.swap(); + assert!( + changes.added.contains_key(&d3), + "Expected d3 in added set after third insert" + ); + assert!( + changes.evicted.contains(&d1), + "Expected d1 in evicted set (LRU eviction)" + ); + // d2 should NOT have been evicted (total after eviction: 40 + 50 = 90 <= 100). + assert!( + !changes.evicted.contains(&d2), + "Expected d2 to NOT be evicted" + ); + }); + } + + #[test] + fn test_cas_advertised_endpoint_format() { + let endpoint = cas_advertised_endpoint(50081); + assert!( + endpoint.starts_with("grpc://"), + "Expected endpoint to start with 'grpc://', got: {endpoint}" + ); + assert!( + endpoint.ends_with(":50081"), + "Expected endpoint to end with ':50081', got: {endpoint}" + ); + + // Extract hostname and verify it's non-empty. 
+ let without_prefix = endpoint.strip_prefix("grpc://").unwrap(); + let hostname = without_prefix.strip_suffix(":50081").unwrap(); + assert!( + !hostname.is_empty(), + "Expected non-empty hostname in endpoint: {endpoint}" + ); + } +} diff --git a/nativelink-worker/src/running_actions_manager.rs b/nativelink-worker/src/running_actions_manager.rs index 993be3dab..aad81b594 100644 --- a/nativelink-worker/src/running_actions_manager.rs +++ b/nativelink-worker/src/running_actions_manager.rs @@ -42,12 +42,13 @@ use futures::stream::{FuturesUnordered, StreamExt, TryStreamExt}; use nativelink_config::cas_server::{ EnvironmentSource, UploadActionResultConfig, UploadCacheResultsStrategy, }; +use nativelink_config::stores::StoreDirection; use nativelink_error::{Code, Error, ResultExt, make_err, make_input_err}; use nativelink_metric::MetricsComponent; use nativelink_proto::build::bazel::remote::execution::v2::{ - Action, ActionResult as ProtoActionResult, Command as ProtoCommand, - Directory as ProtoDirectory, Directory, DirectoryNode, ExecuteResponse, FileNode, SymlinkNode, - Tree as ProtoTree, UpdateActionResultRequest, + Action, ActionResult as ProtoActionResult, BatchReadBlobsRequest, Command as ProtoCommand, + Directory as ProtoDirectory, Directory, DirectoryNode, ExecuteResponse, FileNode, + GetTreeRequest, SymlinkNode, Tree as ProtoTree, UpdateActionResultRequest, }; use nativelink_proto::com::github::trace_machina::nativelink::remote_execution::{ HistoricalExecuteResponse, StartExecute, @@ -59,27 +60,30 @@ use nativelink_store::cas_utils::is_zero_digest; use nativelink_store::fast_slow_store::FastSlowStore; use nativelink_store::filesystem_store::{FileEntry, FilesystemStore}; use nativelink_store::grpc_store::GrpcStore; +use nativelink_store::worker_proxy_store::WorkerProxyStore; use nativelink_util::action_messages::{ ActionInfo, ActionResult, DirectoryInfo, ExecutionMetadata, FileInfo, NameOrPath, OperationId, SymlinkInfo, to_execute_response, }; use nativelink_util::common::{DigestInfo, fs}; -use nativelink_util::digest_hasher::{DigestHasher, DigestHasherFunc}; +use nativelink_util::digest_hasher::{DigestHasher, DigestHasherFunc, default_digest_hasher_func}; use nativelink_util::metrics_utils::{AsyncCounterWrapper, CounterWithTime}; -use nativelink_util::store_trait::{Store, StoreLike, UploadSizeInfo}; +use nativelink_util::buf_channel::make_buf_channel_pair; +use nativelink_util::store_trait::{Store, StoreKey, StoreLike, StoreOptimizations, UploadSizeInfo}; +use nativelink_util::log_utils::throughput_mbps; use nativelink_util::{background_spawn, spawn, spawn_blocking}; use parking_lot::Mutex; use prost::Message; -use relative_path::RelativePath; use scopeguard::{ScopeGuard, guard}; use serde::Deserialize; -use tokio::io::{AsyncReadExt, AsyncSeekExt, AsyncWriteExt}; +use tokio::io::AsyncReadExt; use tokio::process; -use tokio::sync::{Notify, oneshot, watch}; +use tokio::sync::{Notify, mpsc, oneshot, watch}; use tokio::time::Instant; use tokio_stream::wrappers::ReadDirStream; +use opentelemetry::context::Context; use tonic::Request; -use tracing::{debug, error, info, trace, warn}; +use tracing::{debug, error, event, trace, warn, Level}; use uuid::Uuid; /// For simplicity we use a fixed exit code for cases when our program is terminated @@ -111,157 +115,1355 @@ struct SideChannelInfo { failure: Option, } -/// Aggressively download the digests of files and make a local folder from it. This function -/// will spawn unbounded number of futures to try and get these downloaded. 
The store itself -/// should be rate limited if spawning too many requests at once is an issue. -/// We require the `FilesystemStore` to be the `fast` store of `FastSlowStore`. This is for -/// efficiency reasons. We will request the `FastSlowStore` to populate the entry then we will -/// assume the `FilesystemStore` has the file available immediately after and hardlink the file -/// to a new location. -// Sadly we cannot use `async fn` here because the rust compiler cannot determine the auto traits -// of the future. So we need to force this function to return a dynamic future instead. -// see: https://github.com/rust-lang/rust/issues/78649 -pub fn download_to_directory<'a>( +#[derive(prost::Message)] +struct PreconditionFailure { + #[prost(message, repeated, tag = "1")] + violations: Vec, +} + +#[derive(prost::Message)] +struct Violation { + #[prost(string, tag = "1")] + r#type: String, + #[prost(string, tag = "2")] + subject: String, + #[prost(string, tag = "3")] + description: String, +} + +fn make_precondition_failure_any(digest: DigestInfo) -> prost_types::Any { + let failure = PreconditionFailure { + violations: vec![Violation { + r#type: "MISSING".into(), + subject: format!("blobs/{}/{}", digest.packed_hash(), digest.size_bytes()), + description: String::new(), + }], + }; + prost_types::Any { + type_url: "type.googleapis.com/google.rpc.PreconditionFailure".into(), + value: failure.encode_to_vec(), + } +} + +/// Metadata about a file to be materialized from CAS to disk. +struct FileToMaterialize { + digest: DigestInfo, + dest: String, + #[cfg(target_family = "unix")] + unix_mode: Option, + mtime: Option, +} + +/// Maximum size for a blob to be eligible for BatchReadBlobs (1 MiB). +/// Blobs larger than this use the existing ByteStream path. +const BATCH_READ_MAX_BLOB_SIZE: u64 = 1024 * 1024; + +/// Maximum total payload per BatchReadBlobs request (4 MiB), per REAPI recommendation. +const BATCH_READ_MAX_REQUEST_SIZE: u64 = 4 * 1024 * 1024; + +/// Resolve the full directory tree starting from `root_digest`. +/// +/// Tries the `GetTree` RPC (single streaming call) if the slow store is a `GrpcStore`. +/// Falls back to recursive `get_and_decode_digest` calls otherwise. +/// +/// Returns a map from digest to Directory proto for every directory in the tree. +pub async fn resolve_directory_tree( + cas_store: &FastSlowStore, + root_digest: &DigestInfo, +) -> Result, Error> { + let tree_start = std::time::Instant::now(); + debug!( + root = ?root_digest, + "resolve_directory_tree: starting tree resolution", + ); + // Try the fast path: GetTree RPC via the underlying GrpcStore. + if let Some(grpc_store) = cas_store.slow_store().downcast_ref::(None) { + debug!( + root = ?root_digest, + method = "GetTree RPC", + "resolve_directory_tree: using GetTree RPC fast path", + ); + let request = GetTreeRequest { + instance_name: String::new(), // GrpcStore fills this in + root_digest: Some((*root_digest).into()), + page_size: 0, // server decides + page_token: String::new(), + digest_function: Context::current() + .get::() + .map_or_else(default_digest_hasher_func, |v| *v) + .proto_digest_func() + .into(), + }; + + match grpc_store.get_tree(Request::new(request)).await { + Ok(response) => { + let rpc_elapsed = tree_start.elapsed(); + let mut stream = response.into_inner(); + // Collect all directories from the stream into a flat list. + let mut all_dirs: Vec = Vec::new(); + while let Some(resp) = stream.message().await.err_tip(|| "In GetTree stream")? 
+                {
+                    all_dirs.extend(resp.directories);
+                }
+                let stream_elapsed = tree_start.elapsed();
+
+                debug!(
+                    root = ?root_digest,
+                    raw_dir_count = all_dirs.len(),
+                    rpc_connect_ms = rpc_elapsed.as_millis() as u64,
+                    stream_complete_ms = stream_elapsed.as_millis() as u64,
+                    "resolve_directory_tree: GetTree stream received",
+                );
+
+                if !all_dirs.is_empty() {
+                    // Build the tree using BFS assignment from the root.
+                    // The GetTree response returns directories in BFS order
+                    // (root first). Rather than re-encoding each directory
+                    // and hoping the digest matches (which fails when the
+                    // original bytes were serialized by a different protobuf
+                    // implementation, e.g. Java), we assign digests by
+                    // walking the tree structure: the root gets `root_digest`,
+                    // and each child gets the digest its parent references.
+                    //
+                    // The server deduplicates: if two parents reference the
+                    // same child digest, the child appears only once in the
+                    // response. We mirror this by tracking `seen` digests
+                    // and only consuming a new position for unseen children.
+                    let mut tree = HashMap::with_capacity(all_dirs.len());
+                    let mut dir_by_pos: Vec<Directory> = all_dirs;
+                    // BFS queue: (position_in_dir_by_pos, assigned_digest).
+                    let mut queue: VecDeque<(usize, DigestInfo)> = VecDeque::new();
+                    queue.push_back((0, *root_digest));
+                    let mut next_child_pos: usize = 1;
+                    // Track digests we've already assigned a position to,
+                    // mirroring the server's deduplication.
+                    let mut seen: HashSet<DigestInfo> = HashSet::new();
+                    seen.insert(*root_digest);
+
+                    while let Some((pos, digest)) = queue.pop_front() {
+                        if pos >= dir_by_pos.len() {
+                            break;
+                        }
+                        let dir = std::mem::take(&mut dir_by_pos[pos]);
+                        for child_node in &dir.directories {
+                            if let Some(child_digest) = child_node
+                                .digest
+                                .as_ref()
+                                .and_then(|d| DigestInfo::try_from(d).ok())
+                            {
+                                // Only assign a new position for previously
+                                // unseen digests (matching server dedup).
+                                if seen.insert(child_digest) {
+                                    if next_child_pos < dir_by_pos.len() {
+                                        queue.push_back((next_child_pos, child_digest));
+                                        next_child_pos += 1;
+                                    }
+                                }
+                            }
+                        }
+                        tree.insert(digest, dir);
+                    }
+
+                    // Validate structural completeness: every child reference
+                    // should point to a digest in the tree.
+                    let tree_valid = tree.contains_key(root_digest) && {
+                        tree.values().all(|dir| {
+                            dir.directories.iter().all(|node| {
+                                node.digest
+                                    .as_ref()
+                                    .and_then(|d| DigestInfo::try_from(d).ok())
+                                    .is_some_and(|d| tree.contains_key(&d))
+                            })
+                        })
+                    };
+
+                    if tree_valid {
+                        let elapsed = tree_start.elapsed();
+                        let total_bytes: u64 = tree.keys().map(|d| d.size_bytes()).sum();
+                        let total_files: usize = tree.values().map(|d| d.files.len()).sum();
+                        let total_symlinks: usize = tree.values().map(|d| d.symlinks.len()).sum();
+                        debug!(
+                            root = ?root_digest,
+                            dir_count = tree.len(),
+                            total_files,
+                            total_symlinks,
+                            total_bytes,
+                            elapsed_ms = elapsed.as_millis() as u64,
+                            "resolve_directory_tree: completed via GetTree RPC"
+                        );
+                        return Ok(tree);
+                    }
+                    // Tree structure didn't match BFS ordering; fall through.
+                    // Count how many child references are missing from the tree
+                    // so the warning includes actionable diagnostic info.
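+                    // (A child reference counts as "missing" here when the
+                    // BFS walk above never assigned its digest a position,
+                    // e.g. because the stream was truncated or the server
+                    // did not return strict BFS order.)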
+ let missing_children: usize = tree.values().map(|dir| { + dir.directories.iter().filter(|node| { + node.digest + .as_ref() + .and_then(|d| DigestInfo::try_from(d).ok()) + .map_or(true, |d| !tree.contains_key(&d)) + }).count() + }).sum(); + warn!( + root = ?root_digest, + tree_has_root = tree.contains_key(root_digest), + tree_size = tree.len(), + expected_size = dir_by_pos.len(), + missing_children, + validation_elapsed_ms = tree_start.elapsed().as_millis() as u64, + "resolve_directory_tree: GetTree BFS validation failed, falling back to recursive fetch" + ); + } + } + Err(e) => { + warn!( + root = ?root_digest, + err = ?e, + elapsed_ms = tree_start.elapsed().as_millis() as u64, + "resolve_directory_tree: GetTree RPC failed, falling back to recursive fetch" + ); + } + } + } else { + debug!( + root = ?root_digest, + method = "recursive fetch", + "resolve_directory_tree: no GrpcStore available, using recursive fetch", + ); + } + + // Fallback: recursive fetch (original behavior). + let recursive_start = std::time::Instant::now(); + let mut tree = HashMap::new(); + resolve_directory_tree_recursive(cas_store, root_digest, &mut tree).await?; + let recursive_elapsed = recursive_start.elapsed(); + let total_elapsed = tree_start.elapsed(); + let total_bytes: u64 = tree.keys().map(|d| d.size_bytes()).sum(); + let total_files: usize = tree.values().map(|d| d.files.len()).sum(); + let total_symlinks: usize = tree.values().map(|d| d.symlinks.len()).sum(); + debug!( + root = ?root_digest, + dir_count = tree.len(), + total_files, + total_symlinks, + total_bytes, + individual_fetches = tree.len(), + recursive_ms = recursive_elapsed.as_millis() as u64, + total_elapsed_ms = total_elapsed.as_millis() as u64, + "resolve_directory_tree: completed via recursive fetch" + ); + Ok(tree) +} + +/// Recursively fetch directories via individual `get_and_decode_digest` calls. +fn resolve_directory_tree_recursive<'a>( cas_store: &'a FastSlowStore, - filesystem_store: Pin<&'a FilesystemStore>, digest: &'a DigestInfo, - current_directory: &'a str, + tree: &'a mut HashMap, ) -> BoxFuture<'a, Result<(), Error>> { async move { + if tree.contains_key(digest) { + return Ok(()); + } let directory = get_and_decode_digest::(cas_store, digest.into()) .await - .err_tip(|| "Converting digest to Directory")?; - let mut futures = FuturesUnordered::new(); + .err_tip(|| "Converting digest to Directory in recursive tree fetch")?; + let child_digests: Vec = directory + .directories + .iter() + .map(|d| { + d.digest + .as_ref() + .err_tip(|| "Expected Digest in DirectoryNode")? + .try_into() + .err_tip(|| "Parsing child directory digest in recursive tree fetch") + }) + .collect::, _>>()?; + tree.insert(*digest, directory); + for child in &child_digests { + resolve_directory_tree_recursive(cas_store, child, tree).await?; + } + Ok(()) + } + .boxed() +} + +/// Walk the resolved directory tree, creating all directories and collecting +/// all files that need to be materialized. Returns the flat list of files. +fn collect_files_from_tree( + tree: &HashMap, + root_digest: &DigestInfo, + root_path: &str, +) -> Result<(Vec, Vec<(String, String)>), Error> { + let mut files = Vec::new(); + // (symlink_target, dest_path) + let mut symlinks: Vec<(String, String)> = Vec::new(); + // BFS to create directories in order and collect files. 
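+    // For example, a root containing `src/main.rs` plus a symlink `ln -> src`
+    // yields dirs [root_path, "{root_path}/src"], one FileToMaterialize with
+    // dest "{root_path}/src/main.rs", and one ("src", "{root_path}/ln")
+    // symlink pair. (Names here are illustrative only.)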
+ let mut queue = VecDeque::new(); + queue.push_back((*root_digest, root_path.to_string())); + + while let Some((dir_digest, dir_path)) = queue.pop_front() { + let directory = tree.get(&dir_digest).ok_or_else(|| { + make_err!( + Code::Internal, + "Directory {dir_digest:?} not found in resolved tree" + ) + })?; - for file in directory.files { + for file in &directory.files { let digest: DigestInfo = file .digest - .err_tip(|| "Expected Digest to exist in Directory::file::digest")? + .as_ref() + .err_tip(|| "Expected Digest in Directory::file::digest")? .try_into() .err_tip(|| "In Directory::file::digest")?; - let dest = format!("{}/{}", current_directory, file.name); - let (mtime, mut unix_mode) = match file.node_properties { - Some(properties) => (properties.mtime, properties.unix_mode), - None => (None, None), + let dest = format!("{}/{}", dir_path, file.name); + + #[cfg(target_family = "unix")] + let unix_mode = { + let (_, mut mode) = match &file.node_properties { + Some(properties) => (properties.mtime.clone(), properties.unix_mode), + None => (None, None), + }; + if file.is_executable { + mode = Some(mode.unwrap_or(0o555) | 0o111); + } + // Default to 0o555 (read+execute, no write) to match CAS store + // defaults. Some build tools (rules_cc, rules_rust) set + // is_executable=false on shell scripts that must be executable; + // using 0o555 as the base avoids breaking those actions. + Some(mode.unwrap_or(0o555)) }; - #[cfg_attr(target_family = "windows", allow(unused_assignments))] - if file.is_executable { - unix_mode = Some(unix_mode.unwrap_or(0o444) | 0o111); + + let mtime = file.node_properties.as_ref().and_then(|p| p.mtime.clone()); + + files.push(FileToMaterialize { + digest, + dest, + #[cfg(target_family = "unix")] + unix_mode, + mtime, + }); + } + + for subdir in &directory.directories { + let child_digest: DigestInfo = subdir + .digest + .as_ref() + .err_tip(|| "Expected Digest in Directory::directories::digest")? + .try_into() + .err_tip(|| "In Directory::directories::digest")?; + let child_path = format!("{}/{}", dir_path, subdir.name); + queue.push_back((child_digest, child_path)); + } + + #[cfg(target_family = "unix")] + for symlink_node in &directory.symlinks { + let dest = format!("{}/{}", dir_path, symlink_node.name); + symlinks.push((symlink_node.target.clone(), dest)); + } + } + + Ok((files, symlinks)) +} + +/// Maximum number of concurrent BatchReadBlobs RPCs in flight. +const BATCH_READ_CONCURRENCY: usize = 16; + +/// Maximum number of concurrent ByteStream fetches in flight. + +/// Batch-download small blobs via `BatchReadBlobs` and write them into the fast store. +/// Returns the set of digests that were successfully fetched. +/// +/// If WorkerProxyStore is available, uses the locality map to route digests +/// to peers that have them. Digests without a known peer go to the server. +/// Any misses from peers or server are retried via `populate_fast_store_unchecked`. +async fn batch_read_small_blobs( + cas_store: &FastSlowStore, + small_digests: &[DigestInfo], +) -> Result, Error> { + let slow_store = cas_store.slow_store(); + + // Try locality-aware routing through WorkerProxyStore. + if let Some(proxy) = slow_store.downcast_ref::(None) { + let peer_stores = proxy.peer_stores(); + if !peer_stores.is_empty() { + // Assign digests to endpoints using the locality map. 
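+            // For example, if the locality map says worker-a and worker-b
+            // both hold digest D and both are connected, D alternates
+            // between them round-robin; a digest with no connected peer
+            // falls into `server_digests` and is fetched from the central
+            // CAS instead. (Endpoint names are illustrative.)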
+            let mut endpoint_digests: HashMap<Arc<str>, Vec<DigestInfo>> = HashMap::new();
+            let mut server_digests: Vec<DigestInfo> = Vec::new();
+
+            {
+                let locality = proxy.locality_map().read();
+                let mut round_robin_idx: usize = 0;
+                for &digest in small_digests {
+                    let peers = locality.lookup_workers(&digest);
+                    // Filter to connected peers only.
+                    let connected: Vec<&Arc<str>> = peers
+                        .iter()
+                        .filter(|ep| peer_stores.contains_key(ep.as_ref()))
+                        .collect();
+                    if connected.is_empty() {
+                        server_digests.push(digest);
+                    } else {
+                        // Round-robin among connected peers that have this blob.
+                        let endpoint = connected[round_robin_idx % connected.len()].clone();
+                        round_robin_idx = round_robin_idx.wrapping_add(1);
+                        endpoint_digests
+                            .entry(endpoint)
+                            .or_default()
+                            .push(digest);
+                    }
+                }
+            }
+
+            let peer_blob_count: usize = endpoint_digests.values().map(|v| v.len()).sum();
+            debug!(
+                total = small_digests.len(),
+                to_peers = peer_blob_count,
+                to_server = server_digests.len(),
+                peer_endpoints = endpoint_digests.len(),
+                "BatchReadBlobs: locality-based routing"
+            );
+
+            // Collect ALL batch work items (peer + server) for parallel execution.
+            let mut all_batches: Vec<(&str, &GrpcStore, Vec<DigestInfo>)> = Vec::new();
+
+            for (endpoint, digests) in &endpoint_digests {
+                if let Some(store) = peer_stores.get(endpoint.as_ref()) {
+                    if let Some(grpc) = store.downcast_ref::<GrpcStore>(None) {
+                        for batch in partition_into_batches(digests) {
+                            all_batches.push((endpoint.as_ref(), grpc, batch));
+                        }
+                    }
+                }
+            }
+
+            if let Some(grpc) = proxy.inner_store().downcast_ref::<GrpcStore>(None) {
+                for batch in partition_into_batches(&server_digests) {
+                    all_batches.push(("server", grpc, batch));
+                }
+            }
+
+            // Execute ALL batches in parallel across all endpoints.
+            let results = futures::future::join_all(
+                all_batches.into_iter().map(|(ep, grpc, batch)| async move {
+                    let result = execute_batch_read(grpc, cas_store, &batch).await;
+                    (ep, result)
+                }),
+            )
+            .await;
+
+            let mut fetched = HashSet::new();
+            for (ep, result) in results {
+                match result {
+                    Ok(completed) => fetched.extend(completed),
+                    Err(e) => debug!(endpoint = ep, ?e, "BatchReadBlobs: batch failed"),
+                }
+            }
+
+            // Retry misses via populate_fast_store_unchecked (full store chain).
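+            // (The `_unchecked` variant is used because these digests were
+            // just reported missing, so presumably re-probing the fast
+            // store before populating would only add latency.)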
+            let misses: Vec<DigestInfo> = small_digests
+                .iter()
+                .filter(|d| !fetched.contains(d))
+                .copied()
+                .collect();
+
+            if !misses.is_empty() {
+                debug!(count = misses.len(), "BatchReadBlobs: fetching misses via store chain");
+                let retry_results = futures::future::join_all(
+                    misses.iter().map(|&digest| async move {
+                        let result = cas_store
+                            .populate_fast_store_unchecked(digest.into())
+                            .await;
+                        (digest, result)
+                    }),
+                )
+                .await;
+                let mut retry_failures = 0u32;
+                for (digest, result) in retry_results {
+                    match result {
+                        Ok(()) => { fetched.insert(digest); }
+                        Err(e) => {
+                            retry_failures += 1;
+                            debug!(?digest, ?e, "BatchReadBlobs: retry fetch failed");
+                        }
+                    }
+                }
+                if retry_failures > 0 {
+                    debug!(retry_failures, "BatchReadBlobs: some retries failed");
+                }
+            }
+
+            return Ok(fetched);
+        }
+    }
+
+    // No peers available — server-only batch read.
+    let grpc_store = match slow_store.downcast_ref::<GrpcStore>(None) {
+        Some(store) => store,
+        None => return Ok(HashSet::new()),
+    };
+
+    let batches = partition_into_batches(small_digests);
+    let fetched: HashSet<DigestInfo> = futures::stream::iter(batches.into_iter())
+        .map(|batch| async move { execute_batch_read(grpc_store, cas_store, &batch).await })
+        .buffer_unordered(BATCH_READ_CONCURRENCY)
+        .try_fold(HashSet::new(), |mut acc, completed| async move {
+            acc.extend(completed);
+            Ok(acc)
+        })
+        .await?;
+
+    Ok(fetched)
+}
+
+/// Partition digests into 4 MiB batches for BatchReadBlobs.
+fn partition_into_batches(digests: &[DigestInfo]) -> Vec<Vec<DigestInfo>> {
+    let mut batches: Vec<Vec<DigestInfo>> = Vec::new();
+    let mut current_batch: Vec<DigestInfo> = Vec::new();
+    let mut current_size: u64 = 0;
+
+    for &digest in digests {
+        let blob_size = digest.size_bytes();
+        if !current_batch.is_empty() && current_size + blob_size > BATCH_READ_MAX_REQUEST_SIZE {
+            batches.push(std::mem::take(&mut current_batch));
+            current_size = 0;
+        }
+        current_batch.push(digest);
+        current_size += blob_size;
+    }
+    if !current_batch.is_empty() {
+        batches.push(current_batch);
+    }
+    batches
+}
+
+/// Execute a single BatchReadBlobs request and write results to fast store.
+async fn execute_batch_read(
+    grpc_store: &GrpcStore,
+    cas_store: &FastSlowStore,
+    digests: &[DigestInfo],
+) -> Result<Vec<DigestInfo>, Error> {
+    let request = BatchReadBlobsRequest {
+        instance_name: String::new(), // GrpcStore fills this in
+        digests: digests.iter().map(|d| (*d).into()).collect(),
+        acceptable_compressors: vec![],
+        digest_function: Context::current()
+            .get::<DigestHasherFunc>()
+            .map_or_else(default_digest_hasher_func, |v| *v)
+            .proto_digest_func()
+            .into(),
+    };
+
+    let response = grpc_store
+        .batch_read_blobs(Request::new(request))
+        .await
+        .err_tip(|| "In execute_batch_read")?
+        .into_inner();
+
+    let fast_store = cas_store.fast_store();
+
+    // Parse all valid responses first, then write to fast store concurrently.
+    let valid_blobs: Vec<(DigestInfo, Bytes)> = response
+        .responses
+        .into_iter()
+        .filter_map(|blob_resp| {
+            let status_code = blob_resp.status.as_ref().map_or(0, |s| s.code);
+            if status_code != 0 {
+                return None;
+            }
+            let proto_digest = blob_resp.digest?;
+            let digest = DigestInfo::try_from(proto_digest).ok()?;
+            Some((digest, Bytes::from(blob_resp.data)))
+        })
+        .collect();
+
+    // Write all blobs to fast store concurrently.
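+    // Each write drives `Store::update` with a buffered channel pair: one
+    // future feeds the bytes plus EOF into the sender half while the store
+    // consumes the receiver half, and the two are joined so a failure on
+    // either side is surfaced. The pattern, in miniature (a sketch of the
+    // same buf_channel API used below):
+    //
+    //   let (mut tx, rx) = make_buf_channel_pair();
+    //   let update_fut = store.update(key, rx, UploadSizeInfo::ExactSize(n));
+    //   let send_fut = async { tx.send(data).await?; tx.send_eof() };
+    //   let (u, s) = futures::join!(update_fut, send_fut);
+    //   u.merge(s)?;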
+ let write_futures: FuturesUnordered<_> = valid_blobs + .into_iter() + .map(|(digest, data)| { + let data_len = data.len() as u64; + async move { + let (mut tx, rx) = make_buf_channel_pair(); + let store_key: StoreKey<'_> = digest.into(); + let update_fut = fast_store.update( + store_key, + rx, + UploadSizeInfo::ExactSize(data_len), + ); + let send_fut = async { + tx.send(data) + .await + .err_tip(|| "Sending batch blob to fast store")?; + tx.send_eof().err_tip(|| "Sending EOF for batch blob")?; + Ok::<_, Error>(()) + }; + let (update_res, send_res) = futures::join!(update_fut, send_fut); + update_res + .merge(send_res) + .err_tip(|| format!("Writing batch-read blob {digest:?} to fast store"))?; + Ok::(digest) + } + }) + .collect(); + + let completed: Vec = write_futures.try_collect().await?; + + Ok(completed) +} + +/// Populate the fast store for a single digest and hardlink it to `dest`. +/// Contains the retry loop for cache eviction races. +async fn populate_and_hardlink( + cas_store: &FastSlowStore, + filesystem_store: Pin<&FilesystemStore>, + digest: DigestInfo, + dest: &str, +) -> Result<(), Error> { + if is_zero_digest(digest) { + cas_store.populate_fast_store(digest.into()).await?; + let mut file_slot = fs::create_file(dest).await?; + std::io::Write::write_all(file_slot.as_std_mut(), &[]) + .err_tip(|| "Could not write to file")?; + return Ok(()); + } + + const MAX_RETRIES: u32 = 3; + let mut last_err = None; + for attempt in 0..MAX_RETRIES { + if attempt > 0 { + filesystem_store.remove_entry_for_digest(&digest).await; + } + cas_store.populate_fast_store(digest.into()).await?; + + let result = async { + let file_entry = filesystem_store + .get_file_entry_for_digest(&digest) + .await + .err_tip(|| "Getting file entry for hardlink")?; + let dest_clone = dest.to_string(); + file_entry + .get_file_path_locked(move |src| async move { + let src_exists = Path::new(&src).exists(); + let result = fs::hard_link(&src, &dest_clone).await; + if result.is_err() { + warn!( + src = %src.to_string_lossy(), + src_exists = src_exists, + dest = %dest_clone, + "hard_link failed while holding read lock" + ); + } + result + }) + .await + } + .await; + + match result { + Ok(()) => { + last_err = None; + break; + } + Err(e) if e.code == Code::NotFound => { + warn!( + attempt = attempt + 1, + max_retries = MAX_RETRIES, + ?digest, + dest = %dest, + err = ?e, + "File evicted from cache during hardlink. Retrying." + ); + last_err = Some(e); + } + Err(e) => { + return Err(make_err!( + Code::Internal, + "Could not make hardlink, {e:?} : {dest}" + )); + } + } + } + if let Some(e) = last_err { + return Err(make_err!( + Code::Internal, + "Could not make hardlink after {MAX_RETRIES} attempts, \ + file was repeatedly evicted from cache. {e:?} : {dest}\n\ + This error often occurs when the filesystem store's max_bytes is too small for your workload.\n\ + To fix this issue:\n\ + 1. Increase the 'max_bytes' value in your filesystem store configuration\n\ + 2. Example: Change 'max_bytes: 10000000000' to 'max_bytes: 50000000000' (or higher)\n\ + 3. The setting is typically found in your nativelink.json config under:\n\ + stores -> [your_filesystem_store] -> filesystem -> eviction_policy -> max_bytes\n\ + 4. Restart NativeLink after making the change\n\n\ + If this error persists after increasing max_bytes several times, please report at:\n\ + https://github.com/TraceMachina/nativelink/issues\n\ + Include your config file and both server and client logs to help us assist you." 
+ )); + } + Ok(()) +} + +/// Like `hardlink_and_set_metadata` but uses a pre-fetched file entry +/// (from batch `get_file_entries_batch`) to avoid per-file EvictingMap lock +/// contention. Falls back to the regular path on cache miss. +async fn hardlink_and_set_metadata_prefetched( + cas_store: &FastSlowStore, + filesystem_store: Pin<&FilesystemStore>, + file: FileToMaterialize, + prefetched_entry: Option>, +) -> Result<(), Error> { + let digest = file.digest; + let dest = file.dest.clone(); + + if let Some(file_entry) = prefetched_entry { + // We have a pre-fetched entry — try hardlink directly. + let dest_clone = dest.clone(); + let result = file_entry + .get_file_path_locked(move |src| async move { + fs::hard_link(&src, &dest_clone).await + }) + .await; + + match result { + Ok(()) => { + // Success — apply permissions and mtime, then return. + } + Err(e) if e.code == Code::NotFound => { + // File was evicted between pre-fetch and hardlink. + // Fall back to full populate+hardlink. + populate_and_hardlink(cas_store, filesystem_store, digest, &dest).await?; + } + Err(e) => { + return Err(make_err!( + Code::Internal, + "Could not make hardlink (prefetched), {e:?} : {dest}" + )); + } + } + } else { + // No pre-fetched entry (cache miss or zero digest). + populate_and_hardlink(cas_store, filesystem_store, digest, &dest).await?; + } + + // Always set permissions — CAS files default to 0o555 but concurrent + // hardlinks from other actions can change the shared inode's mode. + // We must unconditionally chmod to ensure correctness. + #[cfg(target_family = "unix")] + if let Some(unix_mode) = file.unix_mode { + fs::set_permissions(&dest, Permissions::from_mode(unix_mode)) + .await + .err_tip(|| format!("Could not set unix mode in download_to_directory {dest}"))?; + } + + // Apply mtime. + if let Some(mtime) = file.mtime { + let dest_owned = dest.clone(); + spawn_blocking!("download_to_directory_set_mtime", move || { + set_file_mtime( + &dest_owned, + FileTime::from_unix_time(mtime.seconds, mtime.nanos as u32), + ) + .err_tip(|| format!("Failed to set mtime in download_to_directory {dest_owned}")) + }) + .await + .err_tip(|| "Failed to launch spawn_blocking in download_to_directory")??; + } + + Ok(()) +} + +/// Aggressively download the digests of files and make a local folder from it. +/// +/// This optimized version: +/// 1. Resolves the full directory tree via `GetTree` RPC (single streaming call) +/// instead of issuing recursive individual `get_and_decode_digest` calls. +/// 2. Batch-checks which blobs are already in the fast store via `has_with_results` +/// (maps to `FindMissingBlobs` on GrpcStore), avoiding per-file existence RPCs. +/// 3. Fetches small missing blobs (<1 MiB) via `BatchReadBlobs` in 4 MiB batches, +/// with large blobs using the existing ByteStream path. +/// +/// We require the `FilesystemStore` to be the `fast` store of `FastSlowStore`. +/// We will request the `FastSlowStore` to populate the entry then we will +/// assume the `FilesystemStore` has the file available immediately after and hardlink the file +/// to a new location. +pub fn download_to_directory<'a>( + cas_store: &'a FastSlowStore, + filesystem_store: Pin<&'a FilesystemStore>, + digest: &'a DigestInfo, + current_directory: &'a str, +) -> BoxFuture<'a, Result<(), Error>> { + async move { + let phase_start = std::time::Instant::now(); + + // Step 1: Resolve the full directory tree. 
+        let tree = resolve_directory_tree(cas_store, digest).await?;
+        let tree_resolve_ms = phase_start.elapsed().as_millis();
+
+        // Step 2: Walk the tree, creating all directories and collecting files.
+        let (files, symlinks) = collect_files_from_tree(&tree, digest, current_directory)?;
+
+        debug!(
+            root = ?digest,
+            total_dirs = tree.len(),
+            total_files = files.len(),
+            total_symlinks = symlinks.len(),
+            "download_to_directory: starting materialization",
+        );
+
+        // Create all subdirectories using level-parallel BFS — siblings at
+        // the same depth are created concurrently while parent-before-child
+        // ordering is maintained (each level completes before the next starts).
+        let mkdir_start = std::time::Instant::now();
+        let mut dirs_created: usize = 0;
+        let mut mkdir_depth: u32 = 0;
+        {
+            let mut current_level = vec![(*digest, current_directory.to_string())];
+            while !current_level.is_empty() {
+                let mut next_level = Vec::new();
+                for (dir_digest, dir_path) in &current_level {
+                    if let Some(directory) = tree.get(dir_digest) {
+                        debug!(
+                            depth = mkdir_depth,
+                            path = %dir_path,
+                            files = directory.files.len(),
+                            subdirs = directory.directories.len(),
+                            "download_to_directory: processing directory",
+                        );
+                        for subdir in &directory.directories {
+                            let child_digest: DigestInfo = subdir
+                                .digest
+                                .as_ref()
+                                .err_tip(|| "Expected Digest")?
+                                .try_into()
+                                .err_tip(|| "In Directory::directories::digest")?;
+                            let child_path = format!("{}/{}", dir_path, subdir.name);
+                            next_level.push((child_digest, child_path));
+                        }
+                    }
+                }
+                if !next_level.is_empty() {
+                    dirs_created += next_level.len();
+                    try_join_all(next_level.iter().map(|(_, path)| {
+                        let path = path.clone();
+                        async move {
+                            fs::create_dir(&path)
                                 .await
-                                .map_err(|e| {
-                                    if e.code == Code::NotFound {
-                                        make_err!(
-                                            Code::Internal,
-                                            "Could not make hardlink, file was likely evicted from cache. {e:?} : {dest}\n\
-                                            This error often occurs when the filesystem store's max_bytes is too small for your workload.\n\
-                                            To fix this issue:\n\
-                                            1. Increase the 'max_bytes' value in your filesystem store configuration\n\
-                                            2. Example: Change 'max_bytes: 10000000000' to 'max_bytes: 50000000000' (or higher)\n\
-                                            3. The setting is typically found in your nativelink.json config under:\n\
-                                            stores -> [your_filesystem_store] -> filesystem -> eviction_policy -> max_bytes\n\
-                                            4. Restart NativeLink after making the change\n\n\
-                                            If this error persists after increasing max_bytes several times, please report at:\n\
-                                            https://github.com/TraceMachina/nativelink/issues\n\
-                                            Include your config file and both server and client logs to help us assist you."
-                                        )
-                                    } else {
-                                        make_err!(Code::Internal, "Could not make hardlink, {e:?} : {dest}")
-                                    }
-                                })?;
-                            }
-                            #[cfg(target_family = "unix")]
-                            if let Some(unix_mode) = unix_mode {
-                                fs::set_permissions(&dest, Permissions::from_mode(unix_mode))
+                                .err_tip(|| format!("Could not create directory {path}"))
+                        }
+                    }))
+                    .await?;
+                }
+                mkdir_depth += 1;
+                current_level = next_level;
+            }
+        }
+        let mkdir_elapsed = mkdir_start.elapsed();
+        debug!(
+            dirs_created,
+            mkdir_depth_levels = mkdir_depth,
+            mkdir_ms = mkdir_elapsed.as_millis() as u64,
+            "download_to_directory: directories created",
+        );
+
+        // Create symlinks concurrently.
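+        // (Every parent directory exists by the time we get here, so the
+        // symlinks for all depths can be created in a single unordered pass.)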
+        #[cfg(target_family = "unix")]
+        {
+            let symlink_futures: FuturesUnordered<_> = symlinks
+                .iter()
+                .map(|(target, dest)| async move {
+                    fs::symlink(target, dest)
+                        .await
+                        .err_tip(|| format!("Could not create symlink {target} -> {dest}"))
+                })
+                .collect();
+            symlink_futures
+                .try_for_each(|()| futures::future::ready(Ok(())))
+                .await?;
+        }
+
+        if files.is_empty() {
+            debug!(
+                root = ?digest,
+                "download_to_directory: no files to materialize (directory-only tree)",
+            );
+            return Ok(());
+        }
+
+        // Step 3: Batch-check which blobs are already in the fast store.
+        // Deduplicate digests first to avoid redundant checks.
+        let unique_digests: Vec<DigestInfo> = {
+            let mut seen = HashSet::with_capacity(files.len());
+            files
+                .iter()
+                .filter_map(|f| {
+                    if seen.insert(f.digest) {
+                        Some(f.digest)
+                    } else {
+                        None
+                    }
+                })
+                .collect()
+        };
+
+        let has_check_start = std::time::Instant::now();
+        let store_keys: Vec<StoreKey<'_>> =
+            unique_digests.iter().map(|d| (*d).into()).collect();
+        let mut has_results = vec![None; store_keys.len()];
+        // Check in chunks to reduce Mutex hold time in the fast store,
+        // allowing concurrent operations from other actions to interleave.
+        const HAS_CHECK_CHUNK: usize = 500;
+        for start in (0..store_keys.len()).step_by(HAS_CHECK_CHUNK) {
+            let end = (start + HAS_CHECK_CHUNK).min(store_keys.len());
+            Pin::new(cas_store.fast_store())
+                .has_with_results(&store_keys[start..end], &mut has_results[start..end])
+                .await
+                .err_tip(|| "Batch has_with_results on fast store")?;
+        }
+
+        let cached_set: HashSet<DigestInfo> = unique_digests
+            .iter()
+            .zip(has_results.iter())
+            .filter_map(|(digest, result)| result.map(|_| *digest))
+            .collect();
+
+        let missing_digests: Vec<DigestInfo> = unique_digests
+            .iter()
+            .zip(has_results.iter())
+            .filter_map(|(digest, result)| if result.is_none() { Some(*digest) } else { None })
+            .collect();
+
+        let has_check_elapsed = has_check_start.elapsed();
+        let has_check_ms = phase_start.elapsed().as_millis();
+
+        let cached_bytes: u64 = cached_set.iter().map(|d| d.size_bytes()).sum();
+        let missing_bytes: u64 = missing_digests.iter().map(|d| d.size_bytes()).sum();
+        debug!(
+            total_files = files.len(),
+            unique_digests = unique_digests.len(),
+            cached = cached_set.len(),
+            cached_bytes,
+            missing = missing_digests.len(),
+            missing_bytes,
+            elapsed_ms = has_check_elapsed.as_millis() as u64,
+            "download_to_directory: batch existence check complete"
+        );
+
+        // Steps 4+5 (pipelined): three concurrent futures:
+        //
+        // Fetcher: launches ALL missing blob fetches at once with bounded
+        //     concurrency. As each blob arrives it is inserted into a
+        //     `fetched_set` so the producer knows it is ready.
+        //
+        // Producer: iterates files in batches. Files whose blobs are already
+        //     cached go to the channel immediately. Files whose blobs are
+        //     still being fetched are deferred and retried once the fetcher
+        //     signals new arrivals. This means hardlinking starts right away
+        //     for cached files while fetches proceed in parallel.
+        //
+        // Consumer: reads from the channel, hardlinks with bounded
+        //     concurrency (unchanged from before).
+        //
+        const HARDLINK_CONCURRENCY: usize = 64;
+        const FETCH_CONCURRENCY: usize = 128;
+        const HARDLINK_BATCH: usize = 64;
+        // Channel capacity: buffer ahead of the consumer.
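+        // The bounded channel doubles as back-pressure: if hardlinking falls
+        // behind, the producer blocks on `tx.send` instead of buffering
+        // unboundedly. Pipeline shape, in short:
+        //
+        //   fetcher --(fetched_set + Notify)--> producer --(mpsc)--> consumer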
+ const CHANNEL_CAPACITY: usize = HARDLINK_BATCH * 2; + + type PipelineItem = ( + FileToMaterialize, + Option>, + ); + + let total_files_to_link = files.len(); + let (tx, rx) = mpsc::channel::(CHANNEL_CAPACITY); + + let fetch_start = std::time::Instant::now(); + + let missing_set: HashSet = missing_digests.iter().copied().collect(); + + debug!( + total_files = total_files_to_link, + cached = cached_set.len(), + missing = missing_digests.len(), + missing_bytes, + fetch_concurrency = FETCH_CONCURRENCY, + hardlink_concurrency = HARDLINK_CONCURRENCY, + "download_to_directory: starting pipelined fetch+hardlink", + ); + + // --- Shared state: tracks which missing digests have arrived --- + let fetched_set: Arc>> = + Arc::new(std::sync::Mutex::new(HashSet::with_capacity(missing_digests.len()))); + let fetch_error: Arc>> = + Arc::new(std::sync::Mutex::new(None)); + let fetched_notify = Arc::new(Notify::new()); + + // --- Fetcher future --- + // Launches all missing blob fetches concurrently (bounded). + let fetcher_start = std::time::Instant::now(); + let fetched_set_ref = &fetched_set; + let fetch_error_ref = &fetch_error; + let fetched_notify_ref = &fetched_notify; + let fetcher_fut = async { + // Partition into small (BatchReadBlobs) and large (ByteStream). + let mut small: Vec = Vec::new(); + let mut large: Vec = Vec::new(); + for &d in &missing_digests { + if is_zero_digest(d) { + // Zero digests don't need fetching; mark as ready. + fetched_set_ref.lock().unwrap().insert(d); + continue; + } + if d.size_bytes() <= BATCH_READ_MAX_BLOB_SIZE { + small.push(d); + } else { + large.push(d); + } + } + + debug!( + small = small.len(), + large = large.len(), + missing_bytes, + "fetcher: starting all blob fetches", + ); + + // Fetch small blobs via BatchReadBlobs (already batches internally). + let batch_read_fut = async { + if small.is_empty() { + return Ok::<(), Error>(()); + } + let fetched = batch_read_small_blobs(cas_store, &small).await?; + // Mark all successfully fetched small blobs as ready. + { + let mut set = fetched_set_ref.lock().unwrap(); + for &d in &small { + // batch_read_small_blobs returns the set of blobs it + // actually got; unfetched ones need ByteStream fallback. + if fetched.contains(&d) { + set.insert(d); + } + } + } + fetched_notify_ref.notify_one(); + + // Fallback for small blobs not returned by BatchReadBlobs. + let fallback: Vec = small + .iter() + .filter(|d| !fetched.contains(d)) + .copied() + .collect(); + if !fallback.is_empty() { + debug!( + count = fallback.len(), + "fetcher: BatchReadBlobs fallback via ByteStream", + ); + futures::stream::iter(fallback.into_iter().map(Ok::<_, Error>)) + .try_for_each_concurrent(FETCH_CONCURRENCY, |d| async move { + cas_store + .populate_fast_store_unchecked(d.into()) .await - .err_tip(|| { - format!( - "Could not set unix mode in download_to_directory {dest}" - ) - })?; + .err_tip(|| format!("Populating fast store (fallback) for {d:?}"))?; + fetched_set_ref.lock().unwrap().insert(d); + fetched_notify_ref.notify_one(); + Ok(()) + }) + .await?; + } + Ok(()) + }; + + // Fetch large blobs via ByteStream with bounded concurrency. 
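+            // (Blobs above BATCH_READ_MAX_BLOB_SIZE would dominate the
+            // 4 MiB per-request budget, so they stream individually via
+            // the regular store chain instead of BatchReadBlobs.)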
+ let bytestream_fut = async { + if large.is_empty() { + return Ok::<(), Error>(()); + } + futures::stream::iter(large.into_iter().map(Ok::<_, Error>)) + .try_for_each_concurrent(FETCH_CONCURRENCY, |d| async move { + let blob_start = std::time::Instant::now(); + cas_store + .populate_fast_store_unchecked(d.into()) + .await + .err_tip(|| format!("Populating fast store for {d:?}"))?; + let blob_elapsed = blob_start.elapsed(); + if blob_elapsed.as_secs() >= 2 { + warn!( + digest = ?d, + size_bytes = d.size_bytes(), + elapsed_ms = blob_elapsed.as_millis() as u64, + "fetcher: slow blob fetch (>2s)", + ); + } + fetched_set_ref.lock().unwrap().insert(d); + fetched_notify_ref.notify_one(); + Ok(()) + }) + .await + }; + + // Run small and large fetches concurrently. + let (batch_result, bs_result) = + futures::future::join(batch_read_fut, bytestream_fut).await; + + let fetcher_elapsed = fetcher_start.elapsed(); + + // If either failed, record the error so the producer can see it. + if let Err(e) = batch_result { + *fetch_error_ref.lock().unwrap() = Some(e); + fetched_notify_ref.notify_one(); + } + if let Err(e) = bs_result { + let mut guard = fetch_error_ref.lock().unwrap(); + if guard.is_none() { + *guard = Some(e); + } + fetched_notify_ref.notify_one(); + } + + debug!( + elapsed_ms = fetcher_elapsed.as_millis() as u64, + fetched = fetched_set_ref.lock().unwrap().len(), + missing_total = missing_digests.len(), + throughput_mbps = format!("{:.1}", throughput_mbps(missing_bytes, fetcher_elapsed)), + "fetcher: all blob fetches complete", + ); + }; + + // --- Producer future --- + // Iterates files, sends cached ones immediately, waits for missing + // ones as they arrive from the fetcher. + let producer_start = std::time::Instant::now(); + let producer_fut = async { + let mut files_sent: usize = 0; + let mut deferred_count: usize = 0; + + // Process files in batches for entry pre-fetching efficiency. + for batch_files in files.chunks(HARDLINK_BATCH) { + // Separate into ready (cached or already fetched) and pending. + let mut ready_files: Vec<&FileToMaterialize> = Vec::new(); + let mut pending_files: Vec<&FileToMaterialize> = Vec::new(); + + { + let fetched = fetched_set_ref.lock().unwrap(); + for f in batch_files { + if !missing_set.contains(&f.digest) || fetched.contains(&f.digest) { + ready_files.push(f); + } else { + pending_files.push(f); + } + } + } + + // Send ready files immediately. + if !ready_files.is_empty() { + let ready_digests: Vec = + ready_files.iter().map(|f| f.digest).collect(); + let entries = + filesystem_store.get_file_entries_batch(&ready_digests).await; + + for (file, entry) in ready_files.iter().zip(entries) { + let item: PipelineItem = ( + FileToMaterialize { + digest: file.digest, + dest: file.dest.clone(), + #[cfg(target_family = "unix")] + unix_mode: file.unix_mode, + mtime: file.mtime.clone(), + }, + entry, + ); + if tx.send(item).await.is_err() { + return Ok::<_, Error>(producer_start.elapsed()); + } + files_sent += 1; + } + } + + // Wait for pending files as their blobs arrive. + if !pending_files.is_empty() { + deferred_count += pending_files.len(); + let mut remaining = pending_files; + + loop { + if remaining.is_empty() { + break; + } + + // Check for fetcher errors. 
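+                        // (Without this check the producer could wait on
+                        // `fetched_notify` forever for blobs that will never
+                        // arrive once a fetch future has failed.)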
+ if let Some(e) = fetch_error_ref.lock().unwrap().take() { + return Err(e); } - if let Some(mtime) = mtime { - spawn_blocking!("download_to_directory_set_mtime", move || { - set_file_mtime( - &dest, - FileTime::from_unix_time(mtime.seconds, mtime.nanos as u32), - ) - .err_tip(|| { - format!("Failed to set mtime in download_to_directory {dest}") - }) - }) - .await - .err_tip( - || "Failed to launch spawn_blocking in download_to_directory", - )??; + + // Partition remaining into newly ready and still pending. + let mut newly_ready: Vec<&FileToMaterialize> = Vec::new(); + let mut still_pending: Vec<&FileToMaterialize> = Vec::new(); + { + let fetched = fetched_set_ref.lock().unwrap(); + for f in remaining { + if fetched.contains(&f.digest) { + newly_ready.push(f); + } else { + still_pending.push(f); + } + } } - Ok(()) - }) - .map_err(move |e| e.append(format!("for digest {digest}"))) - .boxed(), - ); - } - for directory in directory.directories { - let digest: DigestInfo = directory - .digest - .err_tip(|| "Expected Digest to exist in Directory::directories::digest")? - .try_into() - .err_tip(|| "In Directory::file::digest")?; - let new_directory_path = format!("{}/{}", current_directory, directory.name); - futures.push( - async move { - fs::create_dir(&new_directory_path) - .await - .err_tip(|| format!("Could not create directory {new_directory_path}"))?; - download_to_directory( - cas_store, - filesystem_store, - &digest, - &new_directory_path, - ) - .await - .err_tip(|| format!("in download_to_directory : {new_directory_path}"))?; - Ok(()) + if !newly_ready.is_empty() { + let ready_digests: Vec = + newly_ready.iter().map(|f| f.digest).collect(); + let entries = + filesystem_store.get_file_entries_batch(&ready_digests).await; + + for (file, entry) in newly_ready.iter().zip(entries) { + let item: PipelineItem = ( + FileToMaterialize { + digest: file.digest, + dest: file.dest.clone(), + #[cfg(target_family = "unix")] + unix_mode: file.unix_mode, + mtime: file.mtime.clone(), + }, + entry, + ); + if tx.send(item).await.is_err() { + return Ok(producer_start.elapsed()); + } + files_sent += 1; + } + } + + remaining = still_pending; + if !remaining.is_empty() { + // Wait until the fetcher signals new arrivals. + fetched_notify_ref.notified().await; + } + } } - .boxed(), + } + + let producer_elapsed = producer_start.elapsed(); + debug!( + files_sent, + deferred = deferred_count, + elapsed_ms = producer_elapsed.as_millis() as u64, + "producer: finished sending all files", ); - } - #[cfg(target_family = "unix")] - for symlink_node in directory.symlinks { - let dest = format!("{}/{}", current_directory, symlink_node.name); - futures.push( - async move { - fs::symlink(&symlink_node.target, &dest).await.err_tip(|| { - format!( - "Could not create symlink {} -> {}", - symlink_node.target, dest + // Explicitly drop the sender so the consumer's rx.recv() + // returns None and the stream ends. join3 keeps all futures + // alive until all complete, so without this the consumer + // would wait forever. + drop(tx); + + Ok(producer_start.elapsed()) + }; + + // --- Consumer future --- + // Reads from the channel and hardlinks with bounded concurrency. 
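+        // The mpsc receiver is adapted into a `Stream` via `unfold` so that
+        // `try_for_each_concurrent` can keep up to HARDLINK_CONCURRENCY links
+        // in flight; the stream ends once the producer drops its `tx`.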
+ let hardlink_start = std::time::Instant::now(); + let slow_hardlinks = std::sync::atomic::AtomicU32::new(0); + let max_hardlink_ms = std::sync::atomic::AtomicU64::new(0); + let links_completed = std::sync::atomic::AtomicUsize::new(0); + + let consumer_fut = async { + let stream = futures::stream::unfold(rx, |mut rx| async { + rx.recv().await.map(|item| (Ok::(item), rx)) + }); + + stream + .try_for_each_concurrent(HARDLINK_CONCURRENCY, |(file, prefetched)| { + let slow_hardlinks = &slow_hardlinks; + let max_hardlink_ms = &max_hardlink_ms; + let links_completed = &links_completed; + async move { + let digest = file.digest; + let dest = file.dest.clone(); + let link_start = std::time::Instant::now(); + hardlink_and_set_metadata_prefetched( + cas_store, filesystem_store, file, prefetched, ) - })?; - Ok(()) - } - .boxed(), - ); - } + .await + .map_err(move |e| { + let mut e = e.append(format!("for digest {digest}")); + if e.code == Code::NotFound { + e.details.push(make_precondition_failure_any(digest)); + } + e + })?; + let link_elapsed = link_start.elapsed(); + let link_ms = link_elapsed.as_millis() as u64; + + links_completed.fetch_add(1, Ordering::Relaxed); + max_hardlink_ms.fetch_max(link_ms, Ordering::Relaxed); + + if link_ms > 50 { + slow_hardlinks.fetch_add(1, Ordering::Relaxed); + warn!( + dest = %dest, + digest = ?digest, + elapsed_ms = link_ms, + "pipeline: slow hardlink (>50ms)", + ); + } + Ok(()) + } + }) + .await + }; + + // Run all three concurrently. The fetcher and producer share state + // via fetched_set + Notify. The producer and consumer share the + // mpsc channel. The consumer drops when the producer's tx drops. + let (_, producer_result, consumer_result) = + futures::future::join3(fetcher_fut, producer_fut, consumer_fut).await; + + // Check consumer first (it's the critical path). + consumer_result?; + // Then check producer. 
+ let producer_elapsed = producer_result?; + + let hardlink_elapsed = hardlink_start.elapsed(); + let fetch_elapsed = fetch_start.elapsed(); + let slow_count = slow_hardlinks.load(Ordering::Relaxed); + let max_link_ms = max_hardlink_ms.load(Ordering::Relaxed); + let total_linked = links_completed.load(Ordering::Relaxed); + let fetcher_elapsed = fetcher_start.elapsed(); + + debug!( + total_missing = missing_digests.len(), + total_missing_bytes = missing_bytes, + fetch_elapsed_ms = fetcher_elapsed.as_millis() as u64, + throughput_mbps = format!("{:.1}", throughput_mbps(missing_bytes, fetcher_elapsed)), + "download_to_directory: fetch phase completed", + ); + + debug!( + total_links = total_linked, + elapsed_ms = hardlink_elapsed.as_millis() as u64, + slow_links_over_50ms = slow_count, + max_link_ms, + avg_link_us = if total_linked > 0 { + hardlink_elapsed.as_micros() as u64 / total_linked as u64 + } else { 0 }, + producer_ms = producer_elapsed.as_millis() as u64, + total_elapsed_ms = fetch_elapsed.as_millis() as u64, + "download_to_directory: hardlink phase completed", + ); + + let total_bytes: u64 = unique_digests.iter().map(|d| d.size_bytes()).sum(); + let total_ms = phase_start.elapsed().as_millis(); + debug!( + tree_resolve_ms, + has_check_ms = has_check_ms - tree_resolve_ms, + fetch_ms = fetcher_elapsed.as_millis() as u64, + hardlink_ms = hardlink_elapsed.as_millis() as u64, + total_ms, + num_files = unique_digests.len(), + total_bytes, + throughput_mbps = format!("{:.1}", throughput_mbps(total_bytes, phase_start.elapsed())), + "download_to_directory completed", + ); - while futures.try_next().await?.is_some() {} Ok(()) } .boxed() @@ -331,13 +1533,13 @@ async fn upload_file( ) -> Result { let is_executable = is_executable(&metadata, &full_path); let file_size = metadata.len(); - let file = fs::open_file(&full_path, 0, u64::MAX) + let file = fs::open_file(&full_path, 0) .await .err_tip(|| format!("Could not open file {full_path:?}"))?; let (digest, mut file) = hasher .hasher() - .digest_for_file(&full_path, file.into_inner(), Some(file_size)) + .digest_for_file(&full_path, file, Some(file_size)) .await .err_tip(|| format!("Failed to hash file in digest_for_file failed for {full_path:?}"))?; @@ -355,7 +1557,7 @@ async fn upload_file( // Only upload if the digest doesn't already exist, this should be // a much cheaper operation than an upload. 
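     // (A `has` probe is a small metadata lookup, while an upload streams the
     // entire file body, so probing first wins whenever the blob is already
     // present in the CAS.)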
let cas_store = cas_store.as_store_driver_pin(); - let store_key: nativelink_util::store_trait::StoreKey<'_> = digest.into(); + let store_key: StoreKey<'_> = digest.into(); let has_start = std::time::Instant::now(); if cas_store .has(store_key.borrow()) @@ -376,7 +1578,8 @@ async fn upload_file( "upload_file: digest not in CAS, starting upload", ); - file.rewind().await.err_tip(|| "Could not rewind file")?; + std::io::Seek::seek(file.as_std_mut(), std::io::SeekFrom::Start(0)) + .err_tip(|| "Could not rewind file")?; // Note: For unknown reasons we appear to be hitting: // https://github.com/rust-lang/rust/issues/92096 @@ -393,12 +1596,28 @@ async fn upload_file( ) .await .map(|_slot| ()); - trace!( - ?digest, - upload_elapsed_ms = file_upload_start.elapsed().as_millis(), - success = upload_result.is_ok(), - "upload_file: update_with_whole_file completed", - ); + let upload_elapsed = file_upload_start.elapsed(); + + match &upload_result { + Ok(()) => { + debug!( + ?digest, + size_bytes = digest.size_bytes(), + elapsed_ms = upload_elapsed.as_millis() as u64, + throughput_mbps = format!("{:.1}", throughput_mbps(digest.size_bytes(), upload_elapsed)), + "upload_file: CAS write completed", + ); + } + Err(e) => { + error!( + ?digest, + size_bytes = digest.size_bytes(), + elapsed_ms = upload_elapsed.as_millis() as u64, + ?e, + "upload_file: CAS write failed", + ); + } + } match upload_result { Ok(()) => Ok(()), @@ -456,13 +1675,18 @@ async fn upload_symlink( // Detect if our symlink is inside our work directory, if it is find the // relative path otherwise use the absolute path. let target = if full_target_path.starts_with(full_work_directory_path.as_ref()) { - let full_target_path = RelativePath::from_path(&full_target_path) - .map_err(|v| make_err!(Code::Internal, "Could not convert {} to RelativePath", v))?; - RelativePath::from_path(full_work_directory_path.as_ref()) - .map_err(|v| make_err!(Code::Internal, "Could not convert {} to RelativePath", v))? - .relative(full_target_path) - .normalize() - .into_string() + full_target_path + .strip_prefix(full_work_directory_path.as_ref()) + .map_err(|e| make_err!(Code::Internal, "Could not strip work dir prefix: {}", e))? + .to_str() + .err_tip(|| { + make_err!( + Code::Internal, + "Could not convert '{:?}' to string", + full_target_path + ) + })? + .to_string() } else { full_target_path .to_str() @@ -627,7 +1851,7 @@ async fn process_side_channel_file( let mut json_contents = String::new(); { // Note: Scoping `file_slot` allows the file_slot semaphore to be released faster. - let mut file_slot = match fs::open_file(side_channel_file, 0, u64::MAX).await { + let mut file_slot = match fs::open_file(side_channel_file, 0).await { Ok(file_slot) => file_slot, Err(e) => { if e.code != Code::NotFound { @@ -637,9 +1861,7 @@ async fn process_side_channel_file( return Ok(None); } }; - file_slot - .read_to_string(&mut json_contents) - .await + std::io::Read::read_to_string(file_slot.as_std_mut(), &mut json_contents) .err_tip(|| "Error reading side channel file")?; } @@ -675,9 +1897,17 @@ async fn do_cleanup( debug!("Worker cleaning up"); // Note: We need to be careful to keep trying to cleanup even if one of the steps fails. - let remove_dir_result = fs::remove_dir_all(action_directory) - .await - .err_tip(|| format!("Could not remove working directory {action_directory}")); + let remove_dir_result = match fs::remove_dir_all(action_directory).await { + Ok(()) => Ok(()), + Err(_) => { + // On macOS, Spotlight/Finder can momentarily recreate files + // (e.g. 
.DS_Store) during deletion, causing ENOTEMPTY. A + // short delay and single retry is sufficient. + tokio::time::sleep(Duration::from_millis(100)).await; + fs::remove_dir_all(action_directory).await + } + } + .err_tip(|| format!("Could not remove working directory {action_directory}")); if let Err(err) = running_actions_manager.cleanup_action(operation_id) { error!(%operation_id, ?err, "Error cleaning up action"); @@ -846,25 +2076,161 @@ impl RunningActionImpl { }; { // Create all directories needed for our output paths. This is required by the bazel spec. + let work_dir_for_output = self.work_directory.clone(); + // Mutex serializes the slow-path symlink replacement to avoid + // concurrent tasks racing on the same symlink (EEXIST / ENOENT). + let symlink_fix_lock = Arc::new(tokio::sync::Mutex::new(())); let prepare_output_directories = |output_file| { + let work_dir = work_dir_for_output.clone(); + let lock = symlink_fix_lock.clone(); let full_output_path = if command.working_directory.is_empty() { - format!("{}/{}", self.work_directory, output_file) + format!("{}/{}", work_dir, output_file) } else { format!( "{}/{}/{}", - self.work_directory, command.working_directory, output_file + work_dir, command.working_directory, output_file ) }; async move { let full_parent_path = Path::new(&full_output_path) .parent() .err_tip(|| format!("Parent path for {full_output_path} has no parent"))?; - fs::create_dir_all(full_parent_path).await.err_tip(|| { - format!( - "Error creating output directory {} (file)", + + // Fast path: create_dir_all and verify the directory is writable. + // create_dir_all succeeds even if the directory is read-only + // (it already exists), but rustc needs write access for outputs. + if fs::create_dir_all(full_parent_path).await.is_ok() { + let mut dir_writable = true; + #[cfg(target_family = "unix")] + if let Ok(m) = fs::metadata(full_parent_path).await { + dir_writable = m.mode() & 0o200 != 0; + } + if dir_writable { + return Result::<(), Error>::Ok(()); + } + // Directory exists but is not writable (likely through + // a symlink to the read-only cache). Fall through to fix. + } + + // Slow path: serialize to avoid concurrent symlink replacement races. + let _guard = lock.lock().await; + + // Re-check under lock — another task may have already fixed it. + if fs::create_dir_all(full_parent_path).await.is_ok() { + let mut dir_writable = true; + #[cfg(target_family = "unix")] + if let Ok(m) = fs::metadata(full_parent_path).await { + dir_writable = m.mode() & 0o200 != 0; + } + if dir_writable { + return Result::<(), Error>::Ok(()); + } + } + + // Walk the path and replace blocking symlinks with writable + // shallow-copy directories that preserve access to all + // original entries via absolute symlinks. 
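+                    //
+                    // Illustration with hypothetical paths: given
+                    //     <work>/bazel-out -> /cache/inputs/bazel-out        (read-only target)
+                    // the symlink is replaced by
+                    //     <work>/bazel-out/                                  (writable directory)
+                    //     <work>/bazel-out/k8-fastbuild -> /cache/inputs/bazel-out/k8-fastbuild
+                    //     ...one absolute symlink per original entry...
+                    // so every existing entry stays reachable while new
+                    // output subdirectories can be created alongside them.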
+ let work_root = Path::new(&work_dir); + let relative = full_parent_path.strip_prefix(work_root) + .map_err(|_| make_err!( + Code::Internal, + "Output path {} not under work dir {}", + full_parent_path.display(), + work_root.display() + ))?; + + let mut current = work_root.to_path_buf(); + for component in relative.components() { + let component_name = component.as_os_str(); + let next = current.join(component_name); + + match fs::symlink_metadata(&next).await { + Ok(meta) => { + #[cfg(target_family = "unix")] + if meta.is_symlink() { + // Check if resolved target is a read-only directory + let needs_replace = match fs::canonicalize(&next).await { + Ok(resolved) => { + match fs::metadata(&resolved).await { + Ok(m) => m.is_dir() && (m.mode() & 0o200 == 0), + Err(_) => false, + } + } + Err(_) => false, + }; + + if needs_replace { + let resolved = fs::canonicalize(&next).await + .err_tip(|| format!("Failed to resolve: {}", next.display()))?; + + // Replace symlink with a writable shallow-copy directory. + // Each entry in the original target gets an absolute symlink, + // except for self-referential entries (e.g., bazel-out -> .). + fs::remove_file(&next).await + .err_tip(|| format!("Failed to remove symlink: {}", next.display()))?; + fs::create_dir(&next).await + .err_tip(|| format!("Failed to create dir: {}", next.display()))?; + + let rd = fs::read_dir(&resolved).await + .err_tip(|| format!("Failed to read dir: {}", resolved.display()))?; + let (_permit, mut inner_rd) = rd.into_inner(); + while let Some(entry) = inner_rd.next_entry().await + .err_tip(|| format!("Failed to iterate: {}", resolved.display()))? + { + let entry_name = entry.file_name(); + // Skip self-referential entries (bazel-out -> . creates + // an entry pointing back to the replaced dir itself). + if entry_name == component_name { + continue; + } + let abs_target = resolved.join(&entry_name); + let link = next.join(&entry_name); + if let Err(e) = fs::symlink(&abs_target, &link).await { + warn!( + link = %link.display(), + target = %abs_target.display(), + ?e, + "prepare_output_dirs: failed to create shallow-copy symlink", + ); + } + } + + // Retry — the fix at this level may be sufficient. + if fs::create_dir_all(full_parent_path).await.is_ok() { + return Ok(()); + } + } + } + + #[cfg(target_family = "unix")] + if meta.is_dir() && (meta.mode() & 0o200 == 0) { + // Read-only directory in the work tree (not through symlink). + // Safe to make writable since work dirs are independent copies. + let mut perms = meta.permissions(); + perms.set_mode(meta.mode() | 0o200); + drop(fs::set_permissions(&next, perms).await); + } + } + Err(_) => { + // Path doesn't exist — create remaining dirs. + fs::create_dir_all(full_parent_path).await + .err_tip(|| format!( + "Error creating output directory {}", + full_parent_path.display() + ))?; + return Ok(()); + } + } + + current = next; + } + + // Final attempt after all fixes applied. + fs::create_dir_all(full_parent_path).await + .err_tip(|| format!( + "Error creating output directory {} (after symlink fixes)", full_parent_path.display() - ) - })?; + ))?; Result::<(), Error>::Ok(()) } }; @@ -928,7 +2294,70 @@ impl RunningActionImpl { // figure out toolchain misconfiguration issues. // De-bloat the `debug` level by using the `trace` // level more effectively and adjust this. - info!(?args, "Executing command",); + debug!(?args, "Executing command",); + + // Diagnostic: log permissions of .sh files in the work directory tree + // to debug EACCES errors on remote workers. 
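+        // The scan walks the full work tree once per executed action, so it
+        // logs under the dedicated `nativelink::diag` target (see the
+        // `event!` calls below); deployments can filter that target out of
+        // production logging without losing other debug output.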
+ #[cfg(target_family = "unix")] + { + use std::os::unix::fs::{MetadataExt, PermissionsExt}; + let work_dir = format!( + "{}/{}", + self.work_directory, command_proto.working_directory + ); + let mut check_dirs = vec![work_dir.clone()]; + let mut sh_count = 0u32; + let mut bad_count = 0u32; + while let Some(dir) = check_dirs.pop() { + if let Ok(mut entries) = tokio::fs::read_dir(&dir).await { + while let Ok(Some(entry)) = entries.next_entry().await { + let path = entry.path(); + if let Ok(meta) = tokio::fs::symlink_metadata(&path).await { + if meta.is_dir() { + check_dirs.push(path.to_string_lossy().to_string()); + } else if path.extension().is_some_and(|e| e == "sh") { + sh_count += 1; + let mode = meta.permissions().mode(); + let nlink = meta.nlink(); + let is_symlink = meta.file_type().is_symlink(); + if mode & 0o111 == 0 { + bad_count += 1; + event!( + target: "nativelink::diag", + Level::WARN, + path = %path.display(), + mode = format!("{mode:04o}"), + nlink, + is_symlink, + "NON-EXEC .sh file in work dir" + ); + } else { + event!( + target: "nativelink::diag", + Level::INFO, + path = %path.display(), + mode = format!("{mode:04o}"), + nlink, + is_symlink, + "OK .sh file in work dir" + ); + } + } + } + } + } + } + if sh_count > 0 { + event!( + target: "nativelink::diag", + Level::INFO, + sh_count, + bad_count, + "sh file permission scan complete" + ); + } + } + let mut command_builder = process::Command::new(args[0]); command_builder .args(&args[1..]) @@ -1086,7 +2515,7 @@ impl RunningActionImpl { { let joined_command = args.join(OsStr::new(" ")); let command = joined_command.to_string_lossy(); - info!( + debug!( seconds = self.action_info.timeout.as_secs_f32(), %command, "Command timed out" @@ -1131,7 +2560,7 @@ impl RunningActionImpl { exit_code }); - info!(?args, "Command complete"); + debug!(?args, "Command complete"); let maybe_error_override = if let Some(side_channel_file) = maybe_side_channel_file { process_side_channel_file(side_channel_file.clone(), &args, requested_timeout).await @@ -1208,7 +2637,10 @@ impl RunningActionImpl { state.execution_metadata.clone(), ) }; - let cas_store = self.running_actions_manager.cas_store.as_ref(); + // Upload outputs to the fast store (local FilesystemStore) only. + // The slow store (remote CAS) upload is deferred to the background + // after the execution result is reported, reducing latency. 
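+        // See `spawn_upload_to_remote` below, which performs the deferred
+        // slow-store upload in a background task. Until that task finishes,
+        // the outputs are only guaranteed to be present in the local fast
+        // store.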
+ let cas_store = self.running_actions_manager.cas_store.fast_store(); let hasher = self.action_info.unique_qualifier.digest_function(); let mut output_path_futures = FuturesUnordered::new(); @@ -1359,10 +2791,12 @@ impl RunningActionImpl { .update_oneshot(digest, data) .await .err_tip(|| "Uploading stdout")?; + let elapsed = start.elapsed(); debug!( ?digest, - data_len, - elapsed_ms = start.elapsed().as_millis(), + size_bytes = data_len, + elapsed_ms = elapsed.as_millis() as u64, + throughput_mbps = format!("{:.1}", throughput_mbps(data_len as u64, elapsed)), "upload_results: stdout upload completed", ); Result::::Ok(digest) @@ -1376,10 +2810,12 @@ impl RunningActionImpl { .update_oneshot(digest, data) .await .err_tip(|| "Uploading stderr")?; + let elapsed = start.elapsed(); debug!( ?digest, - data_len, - elapsed_ms = start.elapsed().as_millis(), + size_bytes = data_len, + elapsed_ms = elapsed.as_millis() as u64, + throughput_mbps = format!("{:.1}", throughput_mbps(data_len as u64, elapsed)), "upload_results: stderr upload completed", ); Result::::Ok(digest) @@ -1431,6 +2867,25 @@ impl RunningActionImpl { let mut state = self.state.lock(); execution_metadata.worker_completed_timestamp = (self.running_actions_manager.callbacks.now_fn)(); + + // Log phase durations for every action so we can diagnose latency. + let duration_ms = |start: SystemTime, end: SystemTime| -> i64 { + end.duration_since(start) + .map(|d| d.as_millis() as i64) + .unwrap_or_else(|e| -(e.duration().as_millis() as i64)) + }; + let em = &execution_metadata; + debug!( + operation_id = ?self.operation_id, + queue_ms = duration_ms(em.queued_timestamp, em.worker_start_timestamp), + input_fetch_ms = duration_ms(em.input_fetch_start_timestamp, em.input_fetch_completed_timestamp), + execution_ms = duration_ms(em.execution_start_timestamp, em.execution_completed_timestamp), + output_upload_ms = duration_ms(em.output_upload_start_timestamp, em.output_upload_completed_timestamp), + worker_overhead_ms = duration_ms(em.worker_start_timestamp, em.input_fetch_start_timestamp), + total_worker_ms = duration_ms(em.worker_start_timestamp, em.worker_completed_timestamp), + "Action phase timing", + ); + state.action_result = Some(ActionResult { output_files, output_folders, @@ -1532,7 +2987,7 @@ impl RunningAction for RunningActionImpl { async fn upload_results(self: Arc) -> Result, Error> { let upload_timeout = self.running_actions_manager.max_upload_timeout; let operation_id = self.operation_id.clone(); - info!( + debug!( ?operation_id, upload_timeout_s = upload_timeout.as_secs(), "upload_results: starting with timeout", @@ -1542,11 +2997,13 @@ impl RunningAction for RunningActionImpl { .upload_results .wrap(Self::inner_upload_results(self)); + let stall_warned = AtomicBool::new(false); let stall_warn_fut = async { let mut elapsed_secs = 0u64; loop { tokio::time::sleep(Duration::from_secs(60)).await; elapsed_secs += 60; + stall_warned.store(true, Ordering::Relaxed); warn!( ?operation_id, elapsed_s = elapsed_secs, @@ -1556,6 +3013,7 @@ impl RunningAction for RunningActionImpl { } }; + let upload_start = Instant::now(); let res = tokio::time::timeout(upload_timeout, async { tokio::pin!(upload_fut); tokio::pin!(stall_warn_fut); @@ -1573,8 +3031,18 @@ impl RunningAction for RunningActionImpl { operation_id, ) })?; - if let Err(ref e) = res { - warn!(?operation_id, ?e, "Error during upload_results"); + match &res { + Ok(_) if stall_warned.load(Ordering::Relaxed) => { + debug!( + ?operation_id, + elapsed_s = upload_start.elapsed().as_secs(), + 
"upload_results: completed after stall", + ); + } + Err(e) => { + warn!(?operation_id, ?e, "Error during upload_results"); + } + _ => {} } res } @@ -1638,7 +3106,25 @@ pub trait RunningActionsManager: Sync + Send + Sized + Unpin + 'static { operation_id: &OperationId, ) -> impl Future> + Send; + /// Spawn a background task to upload action output blobs from the local + /// fast store to the remote slow store. No-op by default. + fn spawn_upload_to_remote(self: &Arc, _action_result: &ActionResult) {} + fn metrics(&self) -> &Arc; + + /// Returns the digests of input root directories cached in the worker's + /// directory cache. Returns an empty Vec if no directory cache is configured. + fn cached_directory_digests(&self) -> impl Future> + Send; + + /// Returns ALL subtree digests across all cached directory entries. + /// Used for the initial full snapshot on (re)connect. + fn all_subtree_digests(&self) -> impl Future> + Send; + + /// Atomically takes the pending subtree digest changes since the last call. + /// Returns (added, removed) digest lists and clears the internal state. + fn take_pending_subtree_changes( + &self, + ) -> impl Future, Vec)> + Send; } /// A function to get the current system time, used to allow mocking for tests @@ -1803,11 +3289,22 @@ impl UploadActionResults { results_cache_policy: None, digest_function: hasher.proto_digest_func().into(), }; - return grpc_store + let size_bytes = update_action_request.encoded_len() as u64; + let start = std::time::Instant::now(); + grpc_store .update_action_result(Request::new(update_action_request)) .await .map(|_| ()) - .err_tip(|| "Caching ActionResult"); + .err_tip(|| "Caching ActionResult")?; + let elapsed = start.elapsed(); + debug!( + ?action_digest, + size_bytes, + elapsed_ms = elapsed.as_millis() as u64, + throughput_mbps = format!("{:.1}", throughput_mbps(size_bytes, elapsed)), + "AC write completed (grpc)", + ); + return Ok(()); } let mut store_data = BytesMut::with_capacity(ESTIMATED_DIGEST_SIZE); @@ -1815,10 +3312,21 @@ impl UploadActionResults { .encode(&mut store_data) .err_tip(|| "Encoding ActionResult for caching")?; + let size_bytes = store_data.len() as u64; + let start = std::time::Instant::now(); ac_store .update_oneshot(action_digest, store_data.split().freeze()) .await - .err_tip(|| "Caching ActionResult") + .err_tip(|| "Caching ActionResult")?; + let elapsed = start.elapsed(); + debug!( + ?action_digest, + size_bytes, + elapsed_ms = elapsed.as_millis() as u64, + throughput_mbps = format!("{:.1}", throughput_mbps(size_bytes, elapsed)), + "AC write completed", + ); + Ok(()) } async fn upload_historical_results_with_message( @@ -1863,7 +3371,7 @@ impl UploadActionResults { return Ok(()); } - let mut execute_response = to_execute_response(action_result.clone()); + let execute_response = to_execute_response(action_result.clone()); // In theory exit code should always be != 0 if there's an error, but for safety we // catch both. @@ -1873,51 +3381,66 @@ impl UploadActionResults { self.failure_message_template.clone() }; - let upload_historical_results_with_message_result = if should_upload_historical_results { - let maybe_message = self - .upload_historical_results_with_message( - action_info, - execute_response.clone(), + // Extract AC result proto before concurrent uploads (independent of message). 
+ let ac_result_proto = if should_upload_ac_results { + Some( + execute_response + .result + .clone() + .err_tip(|| "No result set in cache_action_result")?, + ) + } else { + None + }; + + // Run historical + AC uploads concurrently — they are independent. + let historical_fut = async { + if should_upload_historical_results { + match self + .upload_historical_results_with_message( + action_info, + execute_response, + message_template, + hasher, + ) + .await + { + Ok(message) => Ok(Some(message)), + Err(e) => Err(e), + } + } else { + match Self::format_execute_response_message( message_template, + action_info, + None, hasher, - ) - .await; - match maybe_message { - Ok(message) => { - action_result.message.clone_from(&message); - execute_response.message = message; - Ok(()) - } - Err(e) => Result::<(), Error>::Err(e), - } - } else { - match Self::format_execute_response_message(message_template, action_info, None, hasher) - { - Ok(message) => { - action_result.message.clone_from(&message); - execute_response.message = message; - Ok(()) + ) { + Ok(message) => Ok(Some(message)), + Err(e) => { + Err(e).err_tip(|| "Could not format message in cache_action_result") + } } - Err(e) => Err(e).err_tip(|| "Could not format message in cache_action_result"), } }; - // Note: Done in this order because we assume most results will succeed and most configs will - // either always upload upload historical results or only upload on filure. In which case - // we can avoid an extra clone of the protos by doing this last with the above assumption. - let ac_upload_results = if should_upload_ac_results { - self.upload_ac_results( - action_info, - execute_response - .result - .err_tip(|| "No result set in cache_action_result")?, - hasher, - ) - .await - } else { - Ok(()) + let ac_fut = async { + if let Some(proto) = ac_result_proto { + self.upload_ac_results(action_info, proto, hasher).await + } else { + Ok(()) + } }; - upload_historical_results_with_message_result.merge(ac_upload_results) + + let (historical_result, ac_result) = futures::future::join(historical_fut, ac_fut).await; + + // Apply message from historical upload. + if let Ok(Some(message)) = &historical_result { + action_result.message.clone_from(message); + } + + historical_result + .map(|_| ()) + .merge(ac_result) } } @@ -1933,6 +3456,10 @@ pub struct RunningActionsManagerArgs<'a> { pub max_upload_timeout: Duration, pub timeout_handled_externally: bool, pub directory_cache: Option>, + /// Worker-local locality map for registering peer hints from StartExecute. + /// When present, peer_hints from the scheduler are registered here so that + /// WorkerProxyStore can fetch blobs from peer workers. + pub peer_locality_map: Option, } struct CleanupGuard { @@ -1980,6 +3507,8 @@ pub struct RunningActionsManagerImpl { /// Optional directory cache for improving performance by caching reconstructed /// input directories and using hardlinks. directory_cache: Option>, + /// Worker-local locality map for registering peer hints from StartExecute. + peer_locality_map: Option, } impl RunningActionsManagerImpl { @@ -2024,6 +3553,7 @@ impl RunningActionsManagerImpl { cleaning_up_operations: Mutex::new(HashSet::new()), cleanup_complete_notify: Arc::new(Notify::new()), directory_cache: args.directory_cache, + peer_locality_map: args.peer_locality_map, }) } @@ -2037,6 +3567,243 @@ impl RunningActionsManagerImpl { ) } + /// Spawn a background task that uploads all action output blobs from the + /// fast store (local FilesystemStore) to the slow store (remote CAS). 
+ /// This is called after the execution result has been reported to the + /// scheduler, so it does not block action completion latency. + /// + /// To prevent a race condition where the EvictingMap evicts small blobs + /// before the background task can read them, we pre-read all small blobs + /// (<=1 MiB) from the fast store *before* spawning the background task. + /// The pre-read data is passed into the spawned task via a HashMap, so + /// the background upload never needs to re-read small blobs from the + /// store. Large blobs are streamed directly from the store as before + /// (they are much less likely to be evicted quickly due to their size). + pub fn spawn_upload_to_remote(self: &Arc, action_result: &ActionResult) { + let slow_store = self.cas_store.slow_store(); + if slow_store + .inner_store(None::>) + .optimized_for(StoreOptimizations::NoopUpdates) + { + return; + } + // Respect slow_direction config — when set to Get or ReadOnly, + // the slow store should not receive writes (same check as + // FastSlowStore::update). + let dir = self.cas_store.slow_direction(); + if dir == StoreDirection::Get || dir == StoreDirection::ReadOnly { + return; + } + + let mut digests = Vec::new(); + let mut tree_digests = Vec::new(); + for file in &action_result.output_files { + if file.digest.size_bytes() > 0 { + digests.push(file.digest); + } + } + for folder in &action_result.output_folders { + if folder.tree_digest.size_bytes() > 0 { + digests.push(folder.tree_digest); + tree_digests.push(folder.tree_digest); + } + } + if action_result.stdout_digest.size_bytes() > 0 { + digests.push(action_result.stdout_digest); + } + if action_result.stderr_digest.size_bytes() > 0 { + digests.push(action_result.stderr_digest); + } + if digests.is_empty() { + return; + } + + let cas_store = self.cas_store.clone(); + tokio::spawn(async move { + let fast_store = cas_store.fast_store(); + let slow_store = cas_store.slow_store(); + let start = std::time::Instant::now(); + + // Small blobs use update_oneshot which routes through + // BatchUpdateBlobs for efficient coalescing. Large blobs + // stream through a channel to avoid loading into memory. + const BATCH_THRESHOLD: u64 = 1024 * 1024; // 1 MiB + + // Phase 1: Pre-read all known small blobs into memory to + // prevent the eviction race condition. The EvictingMap can + // evict tiny blobs (e.g. 4-byte tree blobs, stdout, stderr) + // before the background task gets a chance to read them. + // By reading them eagerly at the start of the spawned task + // (which runs immediately), we capture the data before any + // subsequent action's uploads can trigger eviction. + let mut preread_data: HashMap = + HashMap::with_capacity(digests.len()); + + // Pre-read initial small digests (stdout, stderr, tree blobs, + // small output files). + let preread_futures: FuturesUnordered<_> = digests + .iter() + .filter(|d| d.size_bytes() <= BATCH_THRESHOLD) + .copied() + .map(|digest| async move { + let result = fast_store.get_part_unchunked(digest, 0, None).await; + (digest, result) + }) + .collect(); + let preread_results: Vec<_> = preread_futures.collect().await; + for (digest, result) in preread_results { + match result { + Ok(data) => { + preread_data.insert(digest, data); + } + Err(e) => { + warn!( + ?digest, + ?e, + "upload_to_remote: failed to pre-read small blob from fast store", + ); + } + } + } + + // Extract file digests from output directory trees. Use + // pre-read data if available (avoids re-reading from store). 
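+            // A REAPI `Tree` proto bundles the root `Directory` with every
+            // transitive child directory, so chaining
+            //     tree.children.into_iter().chain(tree.root)
+            // and flat-mapping over each directory's `files` (as done below)
+            // visits every file node of the output directory exactly once.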
+ for tree_digest in &tree_digests { + let tree_result = if let Some(data) = preread_data.get(tree_digest) { + ProtoTree::decode(data.clone()) + .map_err(|e| make_err!(Code::Internal, "Failed to decode Tree proto: {e}")) + } else { + get_and_decode_digest::(fast_store, (*tree_digest).into()).await + }; + match tree_result { + Ok(tree) => { + let file_digests: Vec = tree + .children + .into_iter() + .chain(tree.root) + .flat_map(|dir| dir.files) + .filter_map(|f| f.digest.and_then(|d| DigestInfo::try_from(d).ok())) + .filter(|d| d.size_bytes() > 0) + .collect(); + debug!( + ?tree_digest, + file_count = file_digests.len(), + "upload_to_remote: extracted file digests from output directory tree", + ); + // Pre-read any newly-discovered small file digests. + let new_preread_futures: FuturesUnordered<_> = file_digests + .iter() + .filter(|d| { + d.size_bytes() <= BATCH_THRESHOLD + && !preread_data.contains_key(d) + }) + .copied() + .map(|digest| async move { + let result = + fast_store.get_part_unchunked(digest, 0, None).await; + (digest, result) + }) + .collect(); + let new_results: Vec<_> = new_preread_futures.collect().await; + for (digest, result) in new_results { + match result { + Ok(data) => { + preread_data.insert(digest, data); + } + Err(e) => { + warn!( + ?digest, + ?e, + "upload_to_remote: failed to pre-read tree file blob", + ); + } + } + } + digests.extend(file_digests); + } + Err(e) => { + warn!( + ?tree_digest, + ?e, + "upload_to_remote: failed to decode tree for file digest extraction", + ); + } + } + } + + let total = digests.len(); + let preread_count = preread_data.len(); + debug!( + total_digests = total, + preread_count, + tree_count = tree_digests.len(), + "upload_to_remote: starting background CAS upload", + ); + + // Phase 2: Upload all digests to the slow store. Small blobs + // use pre-read data; large blobs stream from the fast store. + let mut success_count = 0u64; + let mut fail_count = 0u64; + let mut uploads = FuturesUnordered::new(); + for digest in digests { + // Use pre-read data for small blobs that were captured + // eagerly. This avoids the eviction race where EvictingMap + // removes the blob before we can read it. + let cached_data = preread_data.remove(&digest); + uploads.push(async move { + let result = if let Some(data) = cached_data { + // Data was pre-read -- upload directly without + // touching the fast store. + slow_store.update_oneshot(digest, data).await + } else if digest.size_bytes() <= BATCH_THRESHOLD { + // Small blob that wasn't pre-read (e.g. pre-read + // failed). Try reading from the store as fallback. 
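+                        // (This fallback can still lose the race against
+                        // eviction; the eager pre-read phase above exists
+                        // precisely so that such misses stay rare.)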
+ match fast_store.get_part_unchunked(digest, 0, None).await { + Ok(data) => slow_store.update_oneshot(digest, data).await, + Err(e) => Err(e), + } + } else { + let (tx, rx) = make_buf_channel_pair(); + let read_fut = fast_store.get(digest, tx); + let write_fut = slow_store.update( + digest, + rx, + UploadSizeInfo::ExactSize(digest.size_bytes()), + ); + let (read_res, write_res) = tokio::join!(read_fut, write_fut); + read_res.merge(write_res) + }; + match result { + Ok(()) => true, + Err(e) => { + warn!( + ?digest, + ?e, + "upload_to_remote: failed to upload digest", + ); + false + } + } + }); + } + while let Some(ok) = uploads.next().await { + if ok { + success_count += 1; + } else { + fail_count += 1; + } + } + + debug!( + total_digests = total, + success_count, + fail_count, + elapsed_ms = start.elapsed().as_millis() as u64, + "upload_to_remote: background CAS upload completed", + ); + }); + } + /// Fixes a race condition that occurs when an action fails to execute on a worker, and the same worker /// attempts to re-execute the same action before the physical cleanup (file is removed) completes. /// See this issue for additional details: @@ -2232,6 +3999,30 @@ impl RunningActionsManager for RunningActionsManagerImpl { self.metrics .create_and_add_action .wrap(async move { + // Extract peer hints BEFORE consuming start_execute. + let peer_hints = start_execute.peer_hints.clone(); + if !peer_hints.is_empty() { + if let Some(ref locality_map) = self.peer_locality_map { + let mut map = locality_map.write(); + let mut total_registered = 0usize; + for hint in &peer_hints { + if let Some(ref digest_proto) = hint.digest { + if let Ok(digest) = DigestInfo::try_from(digest_proto) { + for endpoint in &hint.peer_endpoints { + map.register_blobs(endpoint, &[digest]); + total_registered += 1; + } + } + } + } + debug!( + hints = peer_hints.len(), + registrations = total_registered, + "Registered peer hints from scheduler into worker locality map" + ); + } + } + let queued_timestamp = start_execute .queued_timestamp .and_then(|time| time.try_into().ok()) @@ -2356,10 +4147,35 @@ impl RunningActionsManager for RunningActionsManagerImpl { ); } + fn spawn_upload_to_remote(self: &Arc, action_result: &ActionResult) { + RunningActionsManagerImpl::spawn_upload_to_remote(self, action_result); + } + #[inline] fn metrics(&self) -> &Arc { &self.metrics } + + async fn cached_directory_digests(&self) -> Vec { + match &self.directory_cache { + Some(cache) => cache.cached_digests().await, + None => Vec::new(), + } + } + + async fn all_subtree_digests(&self) -> Vec { + match &self.directory_cache { + Some(cache) => cache.all_subtree_digests().await, + None => Vec::new(), + } + } + + async fn take_pending_subtree_changes(&self) -> (Vec, Vec) { + match &self.directory_cache { + Some(cache) => cache.take_pending_subtree_changes().await, + None => (Vec::new(), Vec::new()), + } + } } #[derive(Debug, Default, MetricsComponent)] diff --git a/nativelink-worker/src/worker_api_client_wrapper.rs b/nativelink-worker/src/worker_api_client_wrapper.rs index 1e2791fc0..364c60275 100644 --- a/nativelink-worker/src/worker_api_client_wrapper.rs +++ b/nativelink-worker/src/worker_api_client_wrapper.rs @@ -19,7 +19,8 @@ use nativelink_error::{make_err, Error, ResultExt}; use nativelink_proto::com::github::trace_machina::nativelink::remote_execution::update_for_scheduler::Update; use nativelink_proto::com::github::trace_machina::nativelink::remote_execution::worker_api_client::WorkerApiClient; use 
nativelink_proto::com::github::trace_machina::nativelink::remote_execution::{ - ConnectWorkerRequest, ExecuteComplete, ExecuteResult, GoingAwayRequest, KeepAliveRequest, UpdateForScheduler, UpdateForWorker + BlobsAvailableNotification, ConnectWorkerRequest, ExecuteComplete, + ExecuteResult, GoingAwayRequest, KeepAliveRequest, UpdateForScheduler, UpdateForWorker, }; use tokio::sync::mpsc::Sender; use tonic::codec::Streaming; @@ -53,6 +54,11 @@ pub trait WorkerApiClientTrait: Clone + Sync + Send + Sized + Unpin { &mut self, request: ExecuteComplete, ) -> impl Future> + Send; + + fn blobs_available( + &mut self, + request: BlobsAvailableNotification, + ) -> impl Future> + Send; } #[derive(Debug, Clone)] @@ -133,4 +139,11 @@ impl WorkerApiClientTrait for WorkerApiClientWrapper { async fn execution_complete(&mut self, request: ExecuteComplete) -> Result<(), Error> { self.send_update(Update::ExecuteComplete(request)).await } + + async fn blobs_available( + &mut self, + request: BlobsAvailableNotification, + ) -> Result<(), Error> { + self.send_update(Update::BlobsAvailable(request)).await + } } diff --git a/nativelink-worker/src/worker_utils.rs b/nativelink-worker/src/worker_utils.rs index 3135e0be3..b07a91abb 100644 --- a/nativelink-worker/src/worker_utils.rs +++ b/nativelink-worker/src/worker_utils.rs @@ -32,6 +32,7 @@ pub async fn make_connect_worker_request( worker_properties: &HashMap, extra_envs: &HashMap, max_inflight_tasks: u64, + cas_endpoint: String, ) -> Result { let mut futures = vec![]; for (property_name, worker_property) in worker_properties { @@ -106,5 +107,6 @@ pub async fn make_connect_worker_request( worker_id_prefix, properties: try_join_all(futures).await?.into_iter().flatten().collect(), max_inflight_tasks, + cas_endpoint, }) } diff --git a/nativelink-worker/tests/local_worker_test.rs b/nativelink-worker/tests/local_worker_test.rs index efc3a61fa..49af0b124 100644 --- a/nativelink-worker/tests/local_worker_test.rs +++ b/nativelink-worker/tests/local_worker_test.rs @@ -35,12 +35,12 @@ use nativelink_config::stores::{ }; use nativelink_error::{Code, Error, make_err, make_input_err}; use nativelink_macro::nativelink_test; -use nativelink_proto::build::bazel::remote::execution::v2::Platform; +use nativelink_proto::build::bazel::remote::execution::v2::{Digest, Platform}; use nativelink_proto::build::bazel::remote::execution::v2::platform::Property; use nativelink_proto::com::github::trace_machina::nativelink::remote_execution::update_for_worker::Update; use nativelink_proto::com::github::trace_machina::nativelink::remote_execution::{ - ConnectWorkerRequest, ConnectionResult, ExecuteResult, KillOperationRequest, StartExecute, - UpdateForWorker, execute_result, + ConnectWorkerRequest, ConnectionResult, ExecuteResult, KillOperationRequest, PeerHint, + StartExecute, UpdateForWorker, execute_result, }; use nativelink_store::fast_slow_store::FastSlowStore; use nativelink_store::filesystem_store::FilesystemStore; @@ -58,7 +58,6 @@ use nativelink_worker::local_worker::preconditions_met; use pretty_assertions::assert_eq; use prost::Message; use rand::Rng; -use tokio::io::AsyncWriteExt; use utils::local_worker_test_utils::{ setup_grpc_stream, setup_local_worker, setup_local_worker_with_config, }; @@ -128,6 +127,7 @@ async fn platform_properties_smoke_test() -> Result<(), Error> { } ], max_inflight_tasks: 0, + cas_endpoint: String::new(), } ); @@ -262,6 +262,7 @@ async fn blake3_digest_function_registered_properly() -> Result<(), Error> { queued_timestamp: None, platform: 
Some(Platform::default()), worker_id: expected_worker_id.clone(), + peer_hints: Vec::new(), })), }) .unwrap(), @@ -352,6 +353,7 @@ async fn simple_worker_start_action_test() -> Result<(), Error> { queued_timestamp: None, platform: Some(Platform::default()), worker_id: expected_worker_id.clone(), + peer_hints: Vec::new(), })), }) .unwrap(), @@ -490,8 +492,10 @@ async fn new_local_worker_removes_work_directory_before_start_test() -> Result<( fs::create_dir_all(format!("{}/{}", work_directory, "another_dir")).await?; let mut file = fs::create_file(OsString::from(format!("{}/{}", work_directory, "foo.txt"))).await?; - file.write_all(b"Hello, world!").await?; - file.as_mut().sync_all().await?; + Write::write_all(file.as_std_mut(), b"Hello, world!") + .map_err(|e| Into::::into(e))?; + file.as_std().sync_all() + .map_err(|e| Into::::into(e))?; drop(file); new_local_worker( Arc::new(LocalWorkerConfig { @@ -627,6 +631,7 @@ async fn experimental_precondition_script_fails() -> Result<(), Error> { queued_timestamp: None, platform: Some(Platform::default()), worker_id: expected_worker_id.clone(), + peer_hints: Vec::new(), })), }) .unwrap(), @@ -714,6 +719,7 @@ async fn kill_action_request_kills_action() -> Result<(), Error> { queued_timestamp: None, platform: Some(Platform::default()), worker_id: expected_worker_id.clone(), + peer_hints: Vec::new(), })), }) .unwrap(), @@ -765,3 +771,494 @@ async fn preconditions_met_extra_envs() -> Result<(), Error> { assert!(logs_contain("test_value_for_demo_env")); Ok(()) } + +#[nativelink_test] +async fn worker_translates_not_found_to_failed_precondition_test() -> Result<(), Error> { + let mut test_context = setup_local_worker(HashMap::new()).await; + let streaming_response = test_context.maybe_streaming_response.take().unwrap(); + + { + // Ensure our worker connects and properties were sent. + let props = test_context + .client + .expect_connect_worker(Ok(streaming_response)) + .await; + assert_eq!(props, ConnectWorkerRequest::default()); + } + + let expected_worker_id = "foobar".to_string(); + + let tx_stream = test_context.maybe_tx_stream.take().unwrap(); + { + // First initialize our worker by sending the response to the connection request. + tx_stream + .send(Frame::data( + encode_stream_proto(&UpdateForWorker { + update: Some(Update::ConnectionResult(ConnectionResult { + worker_id: expected_worker_id.clone(), + })), + }) + .unwrap(), + )) + .await + .map_err(|e| make_input_err!("Could not send : {:?}", e))?; + } + + let action_digest = DigestInfo::new([3u8; 32], 10); + let action_info = ActionInfo { + command_digest: DigestInfo::new([1u8; 32], 10), + input_root_digest: DigestInfo::new([2u8; 32], 10), + timeout: Duration::from_secs(1), + platform_properties: HashMap::new(), + priority: 0, + load_timestamp: SystemTime::UNIX_EPOCH, + insert_timestamp: SystemTime::UNIX_EPOCH, + unique_qualifier: ActionUniqueQualifier::Uncacheable(ActionUniqueKey { + instance_name: INSTANCE_NAME.to_string(), + digest_function: DigestHasherFunc::Sha256, + digest: action_digest, + }), + }; + + { + // Send execution request. 
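+        // (The mock action below then fails with a synthetic NotFound; per
+        // the REAPI, missing input blobs must surface to the client as
+        // FAILED_PRECONDITION, which is what this test asserts at the end.)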
+ tx_stream + .send(Frame::data( + encode_stream_proto(&UpdateForWorker { + update: Some(Update::StartAction(StartExecute { + execute_request: Some((&action_info).into()), + operation_id: String::new(), + queued_timestamp: None, + platform: Some(Platform::default()), + worker_id: expected_worker_id.clone(), + peer_hints: Vec::new(), + })), + }) + .unwrap(), + )) + .await + .map_err(|e| make_input_err!("Could not send : {:?}", e))?; + } + + let running_action = Arc::new(MockRunningAction::new()); + + // Send and wait for response from create_and_add_action to RunningActionsManager. + test_context + .actions_manager + .expect_create_and_add_action(Ok(running_action.clone())) + .await; + + // Make the action fail with a NotFound error during get_finished_result. + // This simulates a missing input blob scenario. + running_action + .simple_expect_get_finished_result(Err(make_err!(Code::NotFound, "Object not found"))) + .await?; + + // Now our client should be notified that our runner finished. + let execution_response = test_context.client.expect_execution_response(Ok(())).await; + + // The worker should have translated NotFound into FailedPrecondition per the REAPI spec. + let error_status = match execution_response.result { + Some(execute_result::Result::InternalError(status)) => status, + other => panic!( + "Expected InternalError result, got: {:?}", + other + ), + }; + + assert_eq!( + error_status.code, + Code::FailedPrecondition as i32, + "Expected NotFound to be translated to FailedPrecondition" + ); + assert!( + error_status.message.contains("One or more input blobs missing"), + "Expected error message to contain 'One or more input blobs missing', got: {}", + error_status.message + ); + + Ok(()) +} + +#[nativelink_test] +async fn peer_hints_passed_to_action_manager_test() -> Result<(), Error> { + let mut test_context = setup_local_worker(HashMap::new()).await; + let streaming_response = test_context.maybe_streaming_response.take().unwrap(); + + { + // Ensure our worker connects and properties were sent. + let props = test_context + .client + .expect_connect_worker(Ok(streaming_response)) + .await; + assert_eq!(props, ConnectWorkerRequest::default()); + } + + let expected_worker_id = "foobar".to_string(); + + let tx_stream = test_context.maybe_tx_stream.take().unwrap(); + { + // First initialize our worker by sending the response to the connection request. + tx_stream + .send(Frame::data( + encode_stream_proto(&UpdateForWorker { + update: Some(Update::ConnectionResult(ConnectionResult { + worker_id: expected_worker_id.clone(), + })), + }) + .unwrap(), + )) + .await + .map_err(|e| make_input_err!("Could not send : {:?}", e))?; + } + + let action_digest = DigestInfo::new([3u8; 32], 10); + let action_info = ActionInfo { + command_digest: DigestInfo::new([1u8; 32], 10), + input_root_digest: DigestInfo::new([2u8; 32], 10), + timeout: Duration::from_secs(1), + platform_properties: HashMap::new(), + priority: 0, + load_timestamp: SystemTime::UNIX_EPOCH, + insert_timestamp: SystemTime::UNIX_EPOCH, + unique_qualifier: ActionUniqueQualifier::Uncacheable(ActionUniqueKey { + instance_name: INSTANCE_NAME.to_string(), + digest_function: DigestHasherFunc::Sha256, + digest: action_digest, + }), + }; + + // Create peer hints: digest D1 is available on "worker-a:50081". + let d1 = DigestInfo::new([10u8; 32], 500); + let peer_hints = vec![PeerHint { + digest: Some(Digest::from(d1)), + peer_endpoints: vec!["worker-a:50081".to_string()], + }]; + + { + // Send execution request with peer_hints populated. 
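+        // (Each PeerHint pairs one digest with the worker endpoints believed
+        // to hold it; create_and_add_action registers these into the
+        // worker-local locality map so a WorkerProxyStore can attempt peer
+        // fetches instead of always going to the remote CAS.)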
+ tx_stream + .send(Frame::data( + encode_stream_proto(&UpdateForWorker { + update: Some(Update::StartAction(StartExecute { + execute_request: Some((&action_info).into()), + operation_id: String::new(), + queued_timestamp: None, + platform: Some(Platform::default()), + worker_id: expected_worker_id.clone(), + peer_hints: peer_hints.clone(), + })), + }) + .unwrap(), + )) + .await + .map_err(|e| make_input_err!("Could not send : {:?}", e))?; + } + + let running_action = Arc::new(MockRunningAction::new()); + + // Send and wait for response from create_and_add_action to RunningActionsManager. + // This returns the (worker_id, StartExecute) that was passed to the mock. + let (received_worker_id, received_start_execute) = test_context + .actions_manager + .expect_create_and_add_action(Ok(running_action.clone())) + .await; + + // Verify worker_id is passed correctly. + assert_eq!(received_worker_id, expected_worker_id); + + // Verify peer_hints arrived intact at the mock RunningActionsManager. + assert_eq!( + received_start_execute.peer_hints.len(), + 1, + "Expected exactly one peer hint" + ); + assert_eq!( + received_start_execute.peer_hints[0].digest, + Some(Digest::from(d1)), + "Peer hint digest should match the one we sent" + ); + assert_eq!( + received_start_execute.peer_hints[0].peer_endpoints, + vec!["worker-a:50081".to_string()], + "Peer hint endpoint should match the one we sent" + ); + + // Complete the action normally so the test can clean up. + running_action + .simple_expect_get_finished_result(Ok(ActionResult::default())) + .await?; + + // Expect the action result to be cached. + let _cached = test_context + .actions_manager + .expect_cache_action_result() + .await; + + Ok(()) +} + +#[nativelink_test] +async fn empty_peer_hints_action_starts_normally_test() -> Result<(), Error> { + let mut test_context = setup_local_worker(HashMap::new()).await; + let streaming_response = test_context.maybe_streaming_response.take().unwrap(); + + { + let props = test_context + .client + .expect_connect_worker(Ok(streaming_response)) + .await; + assert_eq!(props, ConnectWorkerRequest::default()); + } + + let expected_worker_id = "foobar".to_string(); + + let tx_stream = test_context.maybe_tx_stream.take().unwrap(); + { + tx_stream + .send(Frame::data( + encode_stream_proto(&UpdateForWorker { + update: Some(Update::ConnectionResult(ConnectionResult { + worker_id: expected_worker_id.clone(), + })), + }) + .unwrap(), + )) + .await + .map_err(|e| make_input_err!("Could not send : {:?}", e))?; + } + + let action_digest = DigestInfo::new([3u8; 32], 10); + let action_info = ActionInfo { + command_digest: DigestInfo::new([1u8; 32], 10), + input_root_digest: DigestInfo::new([2u8; 32], 10), + timeout: Duration::from_secs(1), + platform_properties: HashMap::new(), + priority: 0, + load_timestamp: SystemTime::UNIX_EPOCH, + insert_timestamp: SystemTime::UNIX_EPOCH, + unique_qualifier: ActionUniqueQualifier::Uncacheable(ActionUniqueKey { + instance_name: INSTANCE_NAME.to_string(), + digest_function: DigestHasherFunc::Sha256, + digest: action_digest, + }), + }; + + { + // Send execution request with empty peer_hints. 
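+        // (With no hints, the `!peer_hints.is_empty()` guard in
+        // create_and_add_action skips locality-map registration entirely, so
+        // this exercises the plain execution path.)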
+ tx_stream + .send(Frame::data( + encode_stream_proto(&UpdateForWorker { + update: Some(Update::StartAction(StartExecute { + execute_request: Some((&action_info).into()), + operation_id: String::new(), + queued_timestamp: None, + platform: Some(Platform::default()), + worker_id: expected_worker_id.clone(), + peer_hints: Vec::new(), + })), + }) + .unwrap(), + )) + .await + .map_err(|e| make_input_err!("Could not send : {:?}", e))?; + } + + let running_action = Arc::new(MockRunningAction::new()); + + let (received_worker_id, received_start_execute) = test_context + .actions_manager + .expect_create_and_add_action(Ok(running_action.clone())) + .await; + + // Verify worker_id is passed correctly. + assert_eq!(received_worker_id, expected_worker_id); + + // Verify empty peer_hints doesn't cause any issues. + assert!( + received_start_execute.peer_hints.is_empty(), + "Expected peer_hints to be empty" + ); + + let action_result = ActionResult { + output_files: vec![], + output_folders: vec![], + output_file_symlinks: vec![], + output_directory_symlinks: vec![], + exit_code: 0, + stdout_digest: DigestInfo::new([21u8; 32], 10), + stderr_digest: DigestInfo::new([22u8; 32], 10), + execution_metadata: ExecutionMetadata { + worker: expected_worker_id.clone(), + queued_timestamp: SystemTime::UNIX_EPOCH, + worker_start_timestamp: SystemTime::UNIX_EPOCH, + worker_completed_timestamp: SystemTime::UNIX_EPOCH, + input_fetch_start_timestamp: SystemTime::UNIX_EPOCH, + input_fetch_completed_timestamp: SystemTime::UNIX_EPOCH, + execution_start_timestamp: SystemTime::UNIX_EPOCH, + execution_completed_timestamp: SystemTime::UNIX_EPOCH, + output_upload_start_timestamp: SystemTime::UNIX_EPOCH, + output_upload_completed_timestamp: SystemTime::UNIX_EPOCH, + }, + server_logs: HashMap::new(), + error: None, + message: String::new(), + }; + + // Complete the action normally. + running_action + .simple_expect_get_finished_result(Ok(action_result.clone())) + .await?; + + // Expect the action result to be cached. + let (stored_digest, stored_result, _digest_hasher) = test_context + .actions_manager + .expect_cache_action_result() + .await; + assert_eq!(stored_digest, action_digest); + assert_eq!(stored_result, action_result); + + // Verify we get the execution response back. 
+ let execution_response = test_context.client.expect_execution_response(Ok(())).await; + assert_eq!( + execution_response, + ExecuteResult { + instance_name: INSTANCE_NAME.to_string(), + operation_id: String::new(), + result: Some(execute_result::Result::ExecuteResponse( + ActionStage::Completed(action_result).into() + )), + } + ); + + Ok(()) +} + +#[nativelink_test] +async fn multiple_peer_hints_with_multiple_endpoints_test() -> Result<(), Error> { + let mut test_context = setup_local_worker(HashMap::new()).await; + let streaming_response = test_context.maybe_streaming_response.take().unwrap(); + + { + let props = test_context + .client + .expect_connect_worker(Ok(streaming_response)) + .await; + assert_eq!(props, ConnectWorkerRequest::default()); + } + + let expected_worker_id = "foobar".to_string(); + + let tx_stream = test_context.maybe_tx_stream.take().unwrap(); + { + tx_stream + .send(Frame::data( + encode_stream_proto(&UpdateForWorker { + update: Some(Update::ConnectionResult(ConnectionResult { + worker_id: expected_worker_id.clone(), + })), + }) + .unwrap(), + )) + .await + .map_err(|e| make_input_err!("Could not send : {:?}", e))?; + } + + let action_digest = DigestInfo::new([3u8; 32], 10); + let action_info = ActionInfo { + command_digest: DigestInfo::new([1u8; 32], 10), + input_root_digest: DigestInfo::new([2u8; 32], 10), + timeout: Duration::from_secs(1), + platform_properties: HashMap::new(), + priority: 0, + load_timestamp: SystemTime::UNIX_EPOCH, + insert_timestamp: SystemTime::UNIX_EPOCH, + unique_qualifier: ActionUniqueQualifier::Uncacheable(ActionUniqueKey { + instance_name: INSTANCE_NAME.to_string(), + digest_function: DigestHasherFunc::Sha256, + digest: action_digest, + }), + }; + + // Create multiple peer hints with multiple endpoints. + let d1 = DigestInfo::new([10u8; 32], 500); + let d2 = DigestInfo::new([11u8; 32], 1000); + let peer_hints = vec![ + PeerHint { + digest: Some(Digest::from(d1)), + peer_endpoints: vec![ + "worker-a:50081".to_string(), + "worker-b:50081".to_string(), + ], + }, + PeerHint { + digest: Some(Digest::from(d2)), + peer_endpoints: vec!["worker-c:50081".to_string()], + }, + ]; + + { + tx_stream + .send(Frame::data( + encode_stream_proto(&UpdateForWorker { + update: Some(Update::StartAction(StartExecute { + execute_request: Some((&action_info).into()), + operation_id: String::new(), + queued_timestamp: None, + platform: Some(Platform::default()), + worker_id: expected_worker_id.clone(), + peer_hints: peer_hints.clone(), + })), + }) + .unwrap(), + )) + .await + .map_err(|e| make_input_err!("Could not send : {:?}", e))?; + } + + let running_action = Arc::new(MockRunningAction::new()); + + let (_received_worker_id, received_start_execute) = test_context + .actions_manager + .expect_create_and_add_action(Ok(running_action.clone())) + .await; + + // Verify all peer_hints arrived intact. + assert_eq!( + received_start_execute.peer_hints.len(), + 2, + "Expected exactly two peer hints" + ); + + // Verify first hint: d1 available on worker-a and worker-b. + assert_eq!( + received_start_execute.peer_hints[0].digest, + Some(Digest::from(d1)), + ); + assert_eq!( + received_start_execute.peer_hints[0].peer_endpoints, + vec!["worker-a:50081".to_string(), "worker-b:50081".to_string()], + ); + + // Verify second hint: d2 available on worker-c. 
+ assert_eq!( + received_start_execute.peer_hints[1].digest, + Some(Digest::from(d2)), + ); + assert_eq!( + received_start_execute.peer_hints[1].peer_endpoints, + vec!["worker-c:50081".to_string()], + ); + + // Complete the action normally. + running_action + .simple_expect_get_finished_result(Ok(ActionResult::default())) + .await?; + + let _cached = test_context + .actions_manager + .expect_cache_action_result() + .await; + + Ok(()) +} diff --git a/nativelink-worker/tests/running_actions_manager_test.rs b/nativelink-worker/tests/running_actions_manager_test.rs index 0c630bc41..5d1b56a31 100644 --- a/nativelink-worker/tests/running_actions_manager_test.rs +++ b/nativelink-worker/tests/running_actions_manager_test.rs @@ -41,12 +41,12 @@ mod tests { use nativelink_proto::build::bazel::remote::execution::v2::command::EnvironmentVariable; #[cfg_attr(target_family = "windows", allow(unused_imports))] use nativelink_proto::build::bazel::remote::execution::v2::{ - Action, ActionResult as ProtoActionResult, Command, Directory, DirectoryNode, + Action, ActionResult as ProtoActionResult, Command, Digest, Directory, DirectoryNode, ExecuteRequest, ExecuteResponse, FileNode, NodeProperties, Platform, SymlinkNode, Tree, digest_function::Value as ProtoDigestFunction, platform::Property, }; use nativelink_proto::com::github::trace_machina::nativelink::remote_execution::{ - HistoricalExecuteResponse, StartExecute, + HistoricalExecuteResponse, PeerHint, StartExecute, }; use nativelink_proto::google::rpc::Status; use nativelink_store::ac_utils::{get_and_decode_digest, serialize_and_upload_message}; @@ -60,6 +60,7 @@ mod tests { use nativelink_util::action_messages::{ ActionResult, ExecutionMetadata, FileInfo, NameOrPath, OperationId, }; + use nativelink_util::blob_locality_map::new_shared_blob_locality_map; use nativelink_util::common::{DigestInfo, fs}; use nativelink_util::digest_hasher::{DigestHasher, DigestHasherFunc}; use nativelink_util::store_trait::{Store, StoreLike}; @@ -429,6 +430,506 @@ mod tests { Ok(()) } + #[nativelink_test] + async fn download_to_directory_batch_existence_check_test( + ) -> Result<(), Box> { + // Verifies that files already in the fast store are hardlinked + // without being re-fetched from the slow store. + const FILE1_NAME: &str = "cached_file.txt"; + const FILE1_CONTENT: &str = "ALREADY_IN_FAST"; + const FILE2_NAME: &str = "uncached_file.txt"; + const FILE2_CONTENT: &str = "ONLY_IN_SLOW"; + + let (fast_store, slow_store, cas_store, _ac_store) = setup_stores().await?; + + let root_directory_digest = { + let file1_content_digest = DigestInfo::new([10u8; 32], FILE1_CONTENT.len() as u64); + let file2_content_digest = DigestInfo::new([11u8; 32], FILE2_CONTENT.len() as u64); + + // Put file1 in BOTH slow and fast store (simulates a cached blob). + slow_store + .as_ref() + .update_oneshot(file1_content_digest, FILE1_CONTENT.into()) + .await?; + fast_store + .as_ref() + .update_oneshot(file1_content_digest, FILE1_CONTENT.into()) + .await?; + + // Put file2 ONLY in slow store (simulates a cache miss). 
+ slow_store + .as_ref() + .update_oneshot(file2_content_digest, FILE2_CONTENT.into()) + .await?; + + let root_directory_digest = DigestInfo::new([12u8; 32], 32); + let root_directory = Directory { + files: vec![ + FileNode { + name: FILE1_NAME.to_string(), + digest: Some(file1_content_digest.into()), + ..Default::default() + }, + FileNode { + name: FILE2_NAME.to_string(), + digest: Some(file2_content_digest.into()), + ..Default::default() + }, + ], + ..Default::default() + }; + + slow_store + .as_ref() + .update_oneshot(root_directory_digest, root_directory.encode_to_vec().into()) + .await?; + root_directory_digest + }; + + let download_dir = make_temp_path("download_dir_batch_check"); + fs::create_dir_all(&download_dir).await?; + download_to_directory( + cas_store.as_ref(), + fast_store.as_pin(), + &root_directory_digest, + &download_dir, + ) + .await?; + + // Both files should be present with correct content. + let file1_content = fs::read(format!("{download_dir}/{FILE1_NAME}")).await?; + assert_eq!(from_utf8(&file1_content)?, FILE1_CONTENT); + + let file2_content = fs::read(format!("{download_dir}/{FILE2_NAME}")).await?; + assert_eq!(from_utf8(&file2_content)?, FILE2_CONTENT); + + Ok(()) + } + + #[nativelink_test] + async fn download_to_directory_dedup_digests_test( + ) -> Result<(), Box> { + // Verifies that multiple files sharing the same digest content + // are all materialized correctly (the digest is only downloaded once + // but hardlinked to multiple destinations). + const SHARED_CONTENT: &str = "SHARED_CONTENT_DATA"; + const FILE_A_NAME: &str = "file_a.txt"; + const FILE_B_NAME: &str = "file_b.txt"; + const FILE_C_NAME: &str = "file_c.txt"; + + let (fast_store, slow_store, cas_store, _ac_store) = setup_stores().await?; + + let root_directory_digest = { + let shared_digest = DigestInfo::new([20u8; 32], SHARED_CONTENT.len() as u64); + slow_store + .as_ref() + .update_oneshot(shared_digest, SHARED_CONTENT.into()) + .await?; + + let root_directory_digest = DigestInfo::new([21u8; 32], 32); + let root_directory = Directory { + files: vec![ + FileNode { + name: FILE_A_NAME.to_string(), + digest: Some(shared_digest.into()), + ..Default::default() + }, + FileNode { + name: FILE_B_NAME.to_string(), + digest: Some(shared_digest.into()), + ..Default::default() + }, + FileNode { + name: FILE_C_NAME.to_string(), + digest: Some(shared_digest.into()), + ..Default::default() + }, + ], + ..Default::default() + }; + + slow_store + .as_ref() + .update_oneshot(root_directory_digest, root_directory.encode_to_vec().into()) + .await?; + root_directory_digest + }; + + let download_dir = make_temp_path("download_dir_dedup"); + fs::create_dir_all(&download_dir).await?; + download_to_directory( + cas_store.as_ref(), + fast_store.as_pin(), + &root_directory_digest, + &download_dir, + ) + .await?; + + // All three files should exist with the same content. + for name in &[FILE_A_NAME, FILE_B_NAME, FILE_C_NAME] { + let content = fs::read(format!("{download_dir}/{name}")).await?; + assert_eq!(from_utf8(&content)?, SHARED_CONTENT, "Mismatch for {name}"); + } + + Ok(()) + } + + #[nativelink_test] + async fn download_to_directory_deep_nested_tree_test( + ) -> Result<(), Box> { + // Verifies that deeply nested directory trees (3 levels) are resolved + // correctly via the recursive fallback path (MemoryStore). 
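+    // (Nested Directory protos can only be resolved level by level: a
+    // parent must be fetched and decoded before its children's digests are
+    // known, so a three-level tree forces repeated store round-trips.)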
+ const LEAF_FILE_NAME: &str = "leaf.txt"; + const LEAF_CONTENT: &str = "DEEP_LEAF_DATA"; + + let (fast_store, slow_store, cas_store, _ac_store) = setup_stores().await?; + + let root_directory_digest = { + let leaf_content_digest = DigestInfo::new([30u8; 32], LEAF_CONTENT.len() as u64); + slow_store + .as_ref() + .update_oneshot(leaf_content_digest, LEAF_CONTENT.into()) + .await?; + + // Level 3 (deepest): directory containing a file + let level3_digest = DigestInfo::new([31u8; 32], 32); + let level3_dir = Directory { + files: vec![FileNode { + name: LEAF_FILE_NAME.to_string(), + digest: Some(leaf_content_digest.into()), + ..Default::default() + }], + ..Default::default() + }; + slow_store + .as_ref() + .update_oneshot(level3_digest, level3_dir.encode_to_vec().into()) + .await?; + + // Level 2: directory containing level3 + let level2_digest = DigestInfo::new([32u8; 32], 32); + let level2_dir = Directory { + directories: vec![DirectoryNode { + name: "level3".to_string(), + digest: Some(level3_digest.into()), + }], + ..Default::default() + }; + slow_store + .as_ref() + .update_oneshot(level2_digest, level2_dir.encode_to_vec().into()) + .await?; + + // Level 1 (root): directory containing level2 + let root_digest = DigestInfo::new([33u8; 32], 32); + let root_dir = Directory { + directories: vec![DirectoryNode { + name: "level2".to_string(), + digest: Some(level2_digest.into()), + }], + ..Default::default() + }; + slow_store + .as_ref() + .update_oneshot(root_digest, root_dir.encode_to_vec().into()) + .await?; + root_digest + }; + + let download_dir = make_temp_path("download_dir_deep"); + fs::create_dir_all(&download_dir).await?; + download_to_directory( + cas_store.as_ref(), + fast_store.as_pin(), + &root_directory_digest, + &download_dir, + ) + .await?; + + // Verify the deeply nested file exists with correct content. + let leaf_path = format!("{download_dir}/level2/level3/{LEAF_FILE_NAME}"); + let leaf_content = fs::read(&leaf_path).await?; + assert_eq!(from_utf8(&leaf_content)?, LEAF_CONTENT); + + // Verify intermediate directories exist. + let level2_meta = fs::metadata(format!("{download_dir}/level2")).await?; + assert!(level2_meta.is_dir()); + let level3_meta = fs::metadata(format!("{download_dir}/level2/level3")).await?; + assert!(level3_meta.is_dir()); + + Ok(()) + } + + #[nativelink_test] + async fn download_to_directory_empty_directory_test( + ) -> Result<(), Box> { + // Verifies that an empty root directory is handled correctly. + let (fast_store, slow_store, cas_store, _ac_store) = setup_stores().await?; + + let root_directory_digest = { + let root_digest = DigestInfo::new([40u8; 32], 32); + let root_dir = Directory::default(); + slow_store + .as_ref() + .update_oneshot(root_digest, root_dir.encode_to_vec().into()) + .await?; + root_digest + }; + + let download_dir = make_temp_path("download_dir_empty"); + fs::create_dir_all(&download_dir).await?; + download_to_directory( + cas_store.as_ref(), + fast_store.as_pin(), + &root_directory_digest, + &download_dir, + ) + .await?; + + // Directory should exist and be empty. + let meta = fs::metadata(&download_dir).await?; + assert!(meta.is_dir()); + + Ok(()) + } + + #[nativelink_test] + async fn download_to_directory_many_files_test( + ) -> Result<(), Box> { + // Verifies that a directory with many files (simulating a real build + // with many inputs) is handled correctly by the batch existence check + // and parallel download paths. 
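+    // (Pre-populating every third file in the fast store below splits the
+    // inputs into already-present digests, which only need a hardlink, and
+    // missing digests, which must be fetched first, mirroring a worker with
+    // a warm cache.)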
+
+    #[nativelink_test]
+    async fn download_to_directory_many_files_test(
+    ) -> Result<(), Box<dyn std::error::Error>> {
+        // Verifies that a directory with many files (simulating a real build
+        // with many inputs) is handled correctly by the batch existence check
+        // and parallel download paths.
+        const FILE_COUNT: usize = 50;
+
+        let (fast_store, slow_store, cas_store, _ac_store) = setup_stores().await?;
+
+        let root_directory_digest = {
+            let mut file_nodes = Vec::with_capacity(FILE_COUNT);
+            for i in 0..FILE_COUNT {
+                let content = format!("content_of_file_{i}");
+                // Create unique digests using the index.
+                let mut hash = [0u8; 32];
+                hash[0] = 50;
+                hash[1] = (i >> 8) as u8;
+                hash[2] = (i & 0xff) as u8;
+                let digest = DigestInfo::new(hash, content.len() as u64);
+
+                slow_store
+                    .as_ref()
+                    .update_oneshot(digest, content.into())
+                    .await?;
+
+                // Pre-populate every 3rd file in the fast store to test
+                // the mixed cached/uncached path.
+                if i % 3 == 0 {
+                    let content_again = format!("content_of_file_{i}");
+                    fast_store
+                        .as_ref()
+                        .update_oneshot(digest, content_again.into())
+                        .await?;
+                }
+
+                file_nodes.push(FileNode {
+                    name: format!("file_{i:04}.txt"),
+                    digest: Some(digest.into()),
+                    ..Default::default()
+                });
+            }
+
+            let root_digest = DigestInfo::new([51u8; 32], 32);
+            let root_dir = Directory {
+                files: file_nodes,
+                ..Default::default()
+            };
+            slow_store
+                .as_ref()
+                .update_oneshot(root_digest, root_dir.encode_to_vec().into())
+                .await?;
+            root_digest
+        };
+
+        let download_dir = make_temp_path("download_dir_many");
+        fs::create_dir_all(&download_dir).await?;
+        download_to_directory(
+            cas_store.as_ref(),
+            fast_store.as_pin(),
+            &root_directory_digest,
+            &download_dir,
+        )
+        .await?;
+
+        // Verify all files.
+        for i in 0..FILE_COUNT {
+            let expected = format!("content_of_file_{i}");
+            let path = format!("{download_dir}/file_{i:04}.txt");
+            let content = fs::read(&path).await?;
+            assert_eq!(
+                from_utf8(&content)?,
+                expected,
+                "Content mismatch for file {i}"
+            );
+        }
+
+        Ok(())
+    }
+
+    #[nativelink_test]
+    async fn download_to_directory_missing_blob_returns_error_test(
+    ) -> Result<(), Box<dyn std::error::Error>> {
+        // Verifies that a reference to a missing blob in the slow store
+        // propagates an error (not silently ignored).
+        const FILE_NAME: &str = "missing.txt";
+
+        let (fast_store, slow_store, cas_store, _ac_store) = setup_stores().await?;
+
+        let root_directory_digest = {
+            // Reference a file content digest that does NOT exist in any store.
+            let missing_content_digest = DigestInfo::new([60u8; 32], 100);
+
+            let root_digest = DigestInfo::new([61u8; 32], 32);
+            let root_directory = Directory {
+                files: vec![FileNode {
+                    name: FILE_NAME.to_string(),
+                    digest: Some(missing_content_digest.into()),
+                    ..Default::default()
+                }],
+                ..Default::default()
+            };
+
+            slow_store
+                .as_ref()
+                .update_oneshot(root_digest, root_directory.encode_to_vec().into())
+                .await?;
+            root_digest
+        };
+
+        let download_dir = make_temp_path("download_dir_missing_blob");
+        fs::create_dir_all(&download_dir).await?;
+        let result = download_to_directory(
+            cas_store.as_ref(),
+            fast_store.as_pin(),
+            &root_directory_digest,
+            &download_dir,
+        )
+        .await;
+
+        assert!(result.is_err(), "Expected error for missing blob");
+        Ok(())
+    }
+
+    #[nativelink_test]
+    async fn download_to_directory_missing_directory_digest_returns_error_test(
+    ) -> Result<(), Box<dyn std::error::Error>> {
+        // Verifies that a DirectoryNode referencing a non-existent directory
+        // digest propagates an error during tree resolution.
+        let (fast_store, slow_store, cas_store, _ac_store) = setup_stores().await?;
+
+        let root_directory_digest = {
+            // Reference a child directory digest that does NOT exist.
+            let missing_child_digest = DigestInfo::new([70u8; 32], 32);
+
+            let root_digest = DigestInfo::new([71u8; 32], 32);
+            let root_directory = Directory {
+                directories: vec![DirectoryNode {
+                    name: "missing_dir".to_string(),
+                    digest: Some(missing_child_digest.into()),
+                }],
+                ..Default::default()
+            };
+
+            slow_store
+                .as_ref()
+                .update_oneshot(root_digest, root_directory.encode_to_vec().into())
+                .await?;
+            root_digest
+        };
+
+        let download_dir = make_temp_path("download_dir_missing_dir");
+        fs::create_dir_all(&download_dir).await?;
+        let result = download_to_directory(
+            cas_store.as_ref(),
+            fast_store.as_pin(),
+            &root_directory_digest,
+            &download_dir,
+        )
+        .await;
+
+        assert!(result.is_err(), "Expected error for missing directory digest");
+        Ok(())
+    }
+
+    #[nativelink_test]
+    async fn download_to_directory_zero_digest_file_test(
+    ) -> Result<(), Box<dyn std::error::Error>> {
+        // Verifies that zero-digest (empty) files are created correctly.
+        // Zero-digest files have special handling and skip batch existence checks.
+        const EMPTY_FILE_NAME: &str = "empty.txt";
+        const NORMAL_FILE_NAME: &str = "normal.txt";
+        const NORMAL_CONTENT: &str = "NORMAL_DATA";
+
+        // SHA-256 of zero bytes.
+        const ZERO_HASH: [u8; 32] = [
+            0xe3, 0xb0, 0xc4, 0x42, 0x98, 0xfc, 0x1c, 0x14, 0x9a, 0xfb, 0xf4, 0xc8, 0x99, 0x6f,
+            0xb9, 0x24, 0x27, 0xae, 0x41, 0xe4, 0x64, 0x9b, 0x93, 0x4c, 0xa4, 0x95, 0x99, 0x1b,
+            0x78, 0x52, 0xb8, 0x55,
+        ];
+
+        let (fast_store, slow_store, cas_store, _ac_store) = setup_stores().await?;
+
+        let root_directory_digest = {
+            let zero_digest = DigestInfo::new(ZERO_HASH, 0);
+            let normal_digest = DigestInfo::new([80u8; 32], NORMAL_CONTENT.len() as u64);
+            slow_store
+                .as_ref()
+                .update_oneshot(normal_digest, NORMAL_CONTENT.into())
+                .await?;
+
+            let root_digest = DigestInfo::new([81u8; 32], 32);
+            let root_directory = Directory {
+                files: vec![
+                    FileNode {
+                        name: EMPTY_FILE_NAME.to_string(),
+                        digest: Some(zero_digest.into()),
+                        ..Default::default()
+                    },
+                    FileNode {
+                        name: NORMAL_FILE_NAME.to_string(),
+                        digest: Some(normal_digest.into()),
+                        ..Default::default()
+                    },
+                ],
+                ..Default::default()
+            };
+
+            slow_store
+                .as_ref()
+                .update_oneshot(root_digest, root_directory.encode_to_vec().into())
+                .await?;
+            root_digest
+        };
+
+        let download_dir = make_temp_path("download_dir_zero");
+        fs::create_dir_all(&download_dir).await?;
+        download_to_directory(
+            cas_store.as_ref(),
+            fast_store.as_pin(),
+            &root_directory_digest,
+            &download_dir,
+        )
+        .await?;
+
+        // Zero-digest file should exist and be empty.
+        let empty_path = format!("{download_dir}/{EMPTY_FILE_NAME}");
+        let empty_content = fs::read(&empty_path).await?;
+        assert_eq!(empty_content.len(), 0, "Zero-digest file should be empty");
+
+        // Normal file should also exist.
+        let normal_content = fs::read(format!("{download_dir}/{NORMAL_FILE_NAME}")).await?;
+        assert_eq!(from_utf8(&normal_content)?, NORMAL_CONTENT);
+
+        Ok(())
+    }
+
     #[nativelink_test]
     async fn ensure_output_files_full_directories_are_created_no_working_directory_test()
     -> Result<(), Box<dyn std::error::Error>> {
@@ -460,6 +961,7 @@ mod tests {
             max_upload_timeout: Duration::from_secs(DEFAULT_MAX_UPLOAD_TIMEOUT),
             timeout_handled_externally: false,
             directory_cache: None,
+            peer_locality_map: None,
         },
         Callbacks {
             now_fn: test_monotonic_clock,
@@ -529,6 +1031,7 @@ mod tests {
                     queued_timestamp: None,
                     platform: action.platform.clone(),
                     worker_id: WORKER_ID.to_string(),
+                    peer_hints: Vec::new(),
                 },
             )
             .await?;
@@ -584,6 +1087,7 @@ mod tests {
             max_upload_timeout: Duration::from_secs(DEFAULT_MAX_UPLOAD_TIMEOUT),
             timeout_handled_externally: false,
             directory_cache: None,
+            peer_locality_map: None,
         },
         Callbacks {
            now_fn: test_monotonic_clock,
@@ -655,6 +1159,7 @@ mod tests {
                    queued_timestamp: None,
                    platform: action.platform.clone(),
                    worker_id: WORKER_ID.to_string(),
+                    peer_hints: Vec::new(),
                },
            )
            .await?;
@@ -689,7 +1194,7 @@ mod tests {
            monotonic_clock(&CLOCK)
        }
 
-        let (_, slow_store, cas_store, ac_store) = setup_stores().await?;
+        let (_, _slow_store, cas_store, ac_store) = setup_stores().await?;
 
        let root_action_directory = make_temp_path("root_action_directory");
        fs::create_dir_all(&root_action_directory).await?;
@@ -710,6 +1215,7 @@ mod tests {
            max_upload_timeout: Duration::from_secs(DEFAULT_MAX_UPLOAD_TIMEOUT),
            timeout_handled_externally: false,
            directory_cache: None,
+            peer_locality_map: None,
        },
        Callbacks {
            now_fn: test_monotonic_clock,
@@ -797,23 +1303,24 @@ mod tests {
                    queued_timestamp: None,
                    platform: action.platform.clone(),
                    worker_id: WORKER_ID.to_string(),
+                    peer_hints: Vec::new(),
                },
            )
            .await?;
            run_action(running_action_impl.clone()).await?
        };
-        let file_content = slow_store
+        let file_content = cas_store
            .as_ref()
            .get_part_unchunked(action_result.output_files[0].digest, 0, None)
            .await?;
        assert_eq!(from_utf8(&file_content)?, "123
 ");
-        let stdout_content = slow_store
+        let stdout_content = cas_store
            .as_ref()
            .get_part_unchunked(action_result.stdout_digest, 0, None)
            .await?;
        assert_eq!(from_utf8(&stdout_content)?, "foo-stdout
 ");
-        let stderr_content = slow_store
+        let stderr_content = cas_store
            .as_ref()
            .get_part_unchunked(action_result.stderr_digest, 0, None)
            .await?;
@@ -871,7 +1378,7 @@ mod tests {
            monotonic_clock(&CLOCK)
        }
 
-        let (_, slow_store, cas_store, ac_store) = setup_stores().await?;
+        let (_, _slow_store, cas_store, ac_store) = setup_stores().await?;
 
        let root_action_directory = make_temp_path("root_action_directory");
        fs::create_dir_all(&root_action_directory).await?;
@@ -892,6 +1399,7 @@ mod tests {
            max_upload_timeout: Duration::from_secs(DEFAULT_MAX_UPLOAD_TIMEOUT),
            timeout_handled_externally: false,
            directory_cache: None,
+            peer_locality_map: None,
        },
        Callbacks {
            now_fn: test_monotonic_clock,
@@ -978,23 +1486,24 @@ mod tests {
                    queued_timestamp: None,
                    platform: action.platform.clone(),
                    worker_id: WORKER_ID.to_string(),
+                    peer_hints: Vec::new(),
                },
            )
            .await?;
            run_action(running_action_impl.clone()).await?
        };
-        let file_content = slow_store
+        let file_content = cas_store
            .as_ref()
            .get_part_unchunked(action_result.output_files[0].digest, 0, None)
            .await?;
        assert_eq!(from_utf8(&file_content)?, "123
 ");
-        let stdout_content = slow_store
+        let stdout_content = cas_store
            .as_ref()
            .get_part_unchunked(action_result.stdout_digest, 0, None)
            .await?;
        assert_eq!(from_utf8(&stdout_content)?, "foo-stdout
 ");
-        let stderr_content = slow_store
+        let stderr_content = cas_store
            .as_ref()
            .get_part_unchunked(action_result.stderr_digest, 0, None)
            .await?;
@@ -1054,7 +1563,7 @@ mod tests {
            monotonic_clock(&CLOCK)
        }
 
-        let (_, slow_store, cas_store, ac_store) = setup_stores().await?;
+        let (_, _slow_store, cas_store, ac_store) = setup_stores().await?;
 
        let root_action_directory = make_temp_path("root_action_directory");
        fs::create_dir_all(&root_action_directory).await?;
@@ -1075,6 +1584,7 @@ mod tests {
            max_upload_timeout: Duration::from_secs(DEFAULT_MAX_UPLOAD_TIMEOUT),
            timeout_handled_externally: false,
            directory_cache: None,
+            peer_locality_map: None,
        },
        Callbacks {
            now_fn: test_monotonic_clock,
@@ -1143,6 +1653,7 @@
                    queued_timestamp: Some(queued_timestamp.into()),
                    platform: action.platform.clone(),
                    worker_id: WORKER_ID.to_string(),
+                    peer_hints: Vec::new(),
                },
            )
            .await?;
@@ -1150,7 +1661,7 @@
            run_action(running_action_impl.clone()).await?
        };
        let tree = get_and_decode_digest::<Tree>(
-            slow_store.as_ref(),
+            cas_store.as_ref(),
            action_result.output_folders[0].tree_digest.into(),
        )
        .await?;
@@ -1284,6 +1795,7 @@
            max_upload_timeout: Duration::from_secs(DEFAULT_MAX_UPLOAD_TIMEOUT),
            timeout_handled_externally: false,
            directory_cache: None,
+            peer_locality_map: None,
        },
        Callbacks {
            now_fn: test_monotonic_clock,
@@ -1347,6 +1859,7 @@
                    queued_timestamp: Some(queued_timestamp.into()),
                    platform: action.platform.clone(),
                    worker_id: WORKER_ID.to_string(),
+                    peer_hints: Vec::new(),
                },
            )
            .await?;
@@ -1420,6 +1933,7 @@
            max_upload_timeout: Duration::from_secs(DEFAULT_MAX_UPLOAD_TIMEOUT),
            timeout_handled_externally: false,
            directory_cache: None,
+            peer_locality_map: None,
        })?);
 
        #[cfg(target_family = "unix")]
@@ -1497,6 +2011,7 @@
                    queued_timestamp: Some(make_system_time(1000).into()),
                    platform: action.platform.clone(),
                    worker_id: WORKER_ID.to_string(),
+                    peer_hints: Vec::new(),
                },
            )
            .await?;
@@ -1624,6 +2139,7 @@ exit 0
            max_upload_timeout: Duration::from_secs(DEFAULT_MAX_UPLOAD_TIMEOUT),
            timeout_handled_externally: false,
            directory_cache: None,
+            peer_locality_map: None,
        })?);
        #[cfg(target_family = "unix")]
        let arguments = vec!["printf".to_string(), EXPECTED_STDOUT.to_string()];
@@ -1678,6 +2194,7 @@ exit 0
                    queued_timestamp: Some(make_system_time(1000).into()),
                    platform: action.platform.clone(),
                    worker_id: WORKER_ID.to_string(),
+                    peer_hints: Vec::new(),
                },
            )
            .await?;
@@ -1801,6 +2318,7 @@ exit 0
            max_upload_timeout: Duration::from_secs(DEFAULT_MAX_UPLOAD_TIMEOUT),
            timeout_handled_externally: false,
            directory_cache: None,
+            peer_locality_map: None,
        })?);
        #[cfg(target_family = "unix")]
        let arguments = vec!["printf".to_string(), EXPECTED_STDOUT.to_string()];
@@ -1865,6 +2383,7 @@ exit 0
                    queued_timestamp: Some(make_system_time(1000).into()),
                    platform: action.platform.clone(),
                    worker_id: WORKER_ID.to_string(),
+                    peer_hints: Vec::new(),
                },
            )
            .await?;
@@ -1972,6 +2491,7 @@ exit 1
            max_upload_timeout: Duration::from_secs(DEFAULT_MAX_UPLOAD_TIMEOUT),
            timeout_handled_externally: false,
            directory_cache: None,
+            peer_locality_map: None,
        })?);
 
        let arguments = vec!["true".to_string()];
 
        let command = Command {
@@ -2023,6 +2543,7 @@ exit 1
                    queued_timestamp: Some(make_system_time(1000).into()),
                    platform: action.platform.clone(),
                    worker_id: WORKER_ID.to_string(),
+                    peer_hints: Vec::new(),
                },
            )
            .await?;
@@ -2057,6 +2578,7 @@ exit 1
            max_upload_timeout: Duration::from_secs(DEFAULT_MAX_UPLOAD_TIMEOUT),
            timeout_handled_externally: false,
            directory_cache: None,
+            peer_locality_map: None,
        })?);
 
        let action_digest = DigestInfo::new([2u8; 32], 32);
@@ -2133,6 +2655,7 @@ exit 1
            max_upload_timeout: Duration::from_secs(DEFAULT_MAX_UPLOAD_TIMEOUT),
            timeout_handled_externally: false,
            directory_cache: None,
+            peer_locality_map: None,
        })?);
 
        let action_digest = DigestInfo::new([2u8; 32], 32);
@@ -2215,6 +2738,7 @@ exit 1
            max_upload_timeout: Duration::from_secs(DEFAULT_MAX_UPLOAD_TIMEOUT),
            timeout_handled_externally: false,
            directory_cache: None,
+            peer_locality_map: None,
        })?);
 
        let action_digest = DigestInfo::new([2u8; 32], 32);
@@ -2318,6 +2842,7 @@ exit 1
            max_upload_timeout: Duration::from_secs(DEFAULT_MAX_UPLOAD_TIMEOUT),
            timeout_handled_externally: false,
            directory_cache: None,
+            peer_locality_map: None,
        })?);
 
        let action_digest = DigestInfo::new([2u8; 32], 32);
@@ -2365,6 +2890,7 @@ exit 1
            max_upload_timeout: Duration::from_secs(DEFAULT_MAX_UPLOAD_TIMEOUT),
            timeout_handled_externally: false,
            directory_cache: None,
+            peer_locality_map: None,
        })?);
 
        let action_digest = DigestInfo::new([2u8; 32], 32);
@@ -2434,6 +2960,7 @@ exit 1
            max_upload_timeout: Duration::from_secs(DEFAULT_MAX_UPLOAD_TIMEOUT),
            timeout_handled_externally: false,
            directory_cache: None,
+            peer_locality_map: None,
        })?);
 
        let action_digest = DigestInfo::new([2u8; 32], 32);
@@ -2554,6 +3081,7 @@ exit 1
            max_upload_timeout: Duration::from_secs(DEFAULT_MAX_UPLOAD_TIMEOUT),
            timeout_handled_externally: false,
            directory_cache: None,
+            peer_locality_map: None,
        },
        Callbacks {
            now_fn: test_monotonic_clock,
@@ -2582,6 +3110,7 @@ exit 1
                    queued_timestamp: Some(make_system_time(1000).into()),
                    platform: action.platform.clone(),
                    worker_id: WORKER_ID.to_string(),
+                    peer_hints: Vec::new(),
                },
            )
            .and_then(|action| {
@@ -2642,6 +3171,7 @@ exit 1
            max_upload_timeout: Duration::from_secs(DEFAULT_MAX_UPLOAD_TIMEOUT),
            timeout_handled_externally: false,
            directory_cache: None,
+            peer_locality_map: None,
        },
        Callbacks {
            now_fn: test_monotonic_clock,
@@ -2670,6 +3200,7 @@ exit 1
                    queued_timestamp: Some(make_system_time(1000).into()),
                    platform: action.platform.clone(),
                    worker_id: WORKER_ID.to_string(),
+                    peer_hints: Vec::new(),
                },
            )
            .and_then(|action| {
@@ -2730,6 +3261,7 @@ exit 1
            max_upload_timeout: Duration::from_secs(DEFAULT_MAX_UPLOAD_TIMEOUT),
            timeout_handled_externally: false,
            directory_cache: None,
+            peer_locality_map: None,
        },
        Callbacks {
            now_fn: test_monotonic_clock,
@@ -2758,6 +3290,7 @@ exit 1
                    queued_timestamp: Some(make_system_time(1000).into()),
                    platform: action.platform.clone(),
                    worker_id: WORKER_ID.to_string(),
+                    peer_hints: Vec::new(),
                },
            )
            .and_then(|action| {
@@ -2815,6 +3348,7 @@ exit 1
            max_upload_timeout: Duration::from_secs(DEFAULT_MAX_UPLOAD_TIMEOUT),
            timeout_handled_externally: false,
            directory_cache: None,
+            peer_locality_map: None,
        },
        Callbacks {
            now_fn: test_monotonic_clock,
@@ -2891,6 +3425,7 @@ exit 1
                    queued_timestamp: Some(make_system_time(1000).into()),
                    platform: action.platform.clone(),
                    worker_id: WORKER_ID.to_string(),
+                    peer_hints: Vec::new(),
                },
            )
            .and_then(|action| {
@@ -2968,6 +3503,7 @@ exit 1
            max_upload_timeout: Duration::from_secs(DEFAULT_MAX_UPLOAD_TIMEOUT),
            timeout_handled_externally: false,
            directory_cache: None,
+            peer_locality_map: None,
        },
        Callbacks {
            now_fn: test_monotonic_clock,
@@ -3041,6 +3577,7 @@ exit 1
                    queued_timestamp: Some(make_system_time(1000).into()),
                    platform: action.platform.clone(),
                    worker_id: WORKER_ID.to_string(),
+                    peer_hints: Vec::new(),
                },
            )
            .await?;
@@ -3138,6 +3675,7 @@ exit 1
            max_upload_timeout: Duration::from_secs(DEFAULT_MAX_UPLOAD_TIMEOUT),
            timeout_handled_externally: false,
            directory_cache: None,
+            peer_locality_map: None,
        },
        Callbacks {
            now_fn: test_monotonic_clock,
@@ -3239,6 +3777,7 @@ exit 1
            max_upload_timeout: Duration::from_secs(DEFAULT_MAX_UPLOAD_TIMEOUT),
            timeout_handled_externally: false,
            directory_cache: None,
+            peer_locality_map: None,
        })?);
 
        let queued_timestamp = make_system_time(1000);
@@ -3296,6 +3835,7 @@ exit 1
                    queued_timestamp: Some(queued_timestamp.into()),
                    platform: action.platform.clone(),
                    worker_id: WORKER_ID.to_string(),
+                    peer_hints: Vec::new(),
                },
            )
            .await?;
@@ -3326,7 +3866,7 @@ exit 1
            monotonic_clock(&CLOCK)
        }
 
-        let (_, slow_store, cas_store, ac_store) = setup_stores().await?;
+        let (_, _slow_store, cas_store, ac_store) = setup_stores().await?;
 
        let root_action_directory = make_temp_path("root_action_directory");
        fs::create_dir_all(&root_action_directory).await?;
@@ -3354,6 +3894,7 @@ exit 1
            max_upload_timeout: Duration::from_secs(DEFAULT_MAX_UPLOAD_TIMEOUT),
            timeout_handled_externally: false,
            directory_cache: None,
+            peer_locality_map: None,
        },
        Callbacks {
            now_fn: test_monotonic_clock,
@@ -3431,23 +3972,24 @@ exit 1
                    queued_timestamp: None,
                    platform: action.platform.clone(),
                    worker_id: WORKER_ID.to_string(),
+                    peer_hints: Vec::new(),
                },
            )
            .await?;
            run_action(running_action_impl.clone()).await?
        };
-        let file_content = slow_store
+        let file_content = cas_store
            .as_ref()
            .get_part_unchunked(action_result.output_files[0].digest, 0, None)
            .await?;
        assert_eq!(from_utf8(&file_content)?, "123
 ");
-        let stdout_content = slow_store
+        let stdout_content = cas_store
            .as_ref()
            .get_part_unchunked(action_result.stdout_digest, 0, None)
            .await?;
        assert_eq!(from_utf8(&stdout_content)?, "foo-stdout
 ");
-        let stderr_content = slow_store
+        let stderr_content = cas_store
            .as_ref()
            .get_part_unchunked(action_result.stderr_digest, 0, None)
            .await?;
@@ -3535,6 +4077,7 @@ exit 1
            max_upload_timeout: Duration::from_secs(DEFAULT_MAX_UPLOAD_TIMEOUT),
            timeout_handled_externally: false,
            directory_cache: None,
+            peer_locality_map: None,
        },
        Callbacks {
            now_fn: test_monotonic_clock,
@@ -3614,6 +4157,7 @@ exit 1
                    queued_timestamp: Some(make_system_time(1000).into()),
                    platform: action.platform.clone(),
                    worker_id: WORKER_ID.to_string(),
+                    peer_hints: Vec::new(),
                },
            )
            .await?;
@@ -3656,6 +4200,7 @@ exit 1
            max_upload_timeout: Duration::from_secs(DEFAULT_MAX_UPLOAD_TIMEOUT),
            timeout_handled_externally: false,
            directory_cache: None,
+            peer_locality_map: None,
        })?);
 
        // Create a simple action
@@ -3734,6 +4279,7 @@ exit 1
                    queued_timestamp: Some(SystemTime::now().into()),
                    platform: None,
                    worker_id: WORKER_ID.to_string(),
+                    peer_hints: Vec::new(),
                },
            )
            .await;
@@ -3798,6 +4344,7 @@ exit 1
            max_upload_timeout: Duration::from_secs(DEFAULT_MAX_UPLOAD_TIMEOUT),
            timeout_handled_externally: false,
            directory_cache: None,
+            peer_locality_map: None,
        })?);
 
        // Create a simple action
@@ -3846,6 +4393,7 @@ exit 1
                    queued_timestamp: Some(SystemTime::now().into()),
                    platform: None,
                    worker_id: WORKER_ID.to_string(),
+                    peer_hints: Vec::new(),
                },
            )
            .await?;
@@ -3867,6 +4415,7 @@ exit 1
                    queued_timestamp: Some(SystemTime::now().into()),
                    platform: None,
                    worker_id: WORKER_ID.to_string(),
+                    peer_hints: Vec::new(),
                },
            )
            .await;
@@ -3884,4 +4433,273 @@ exit 1
        fs::remove_dir_all(&root_action_directory).await?;
        Ok(())
    }
+
+    /// Helper: set up a RunningActionsManagerImpl with stores, a root directory,
+    /// and a minimal action (empty command + empty input root) uploaded to the CAS.
+    /// Returns (manager, execute_request, action, root_action_directory) for use
+    /// in peer hint tests.
+    async fn setup_peer_hint_test(
+        peer_locality_map: Option<SharedBlobLocalityMap>,
+    ) -> Result<
+        (
+            Arc<RunningActionsManagerImpl>,
+            ExecuteRequest,
+            Action,
+            String,
+        ),
+        Box<dyn std::error::Error>,
+    > {
+        let (_, _, cas_store, ac_store) = setup_stores().await?;
+        let root_action_directory = make_temp_path("root_action_directory");
+        fs::create_dir_all(&root_action_directory).await?;
+
+        let running_actions_manager =
+            Arc::new(RunningActionsManagerImpl::new(RunningActionsManagerArgs {
+                root_action_directory: root_action_directory.clone(),
+                execution_configuration: ExecutionConfiguration::default(),
+                cas_store: cas_store.clone(),
+                ac_store: Some(Store::new(ac_store.clone())),
+                historical_store: Store::new(cas_store.clone()),
+                upload_action_result_config:
+                    &nativelink_config::cas_server::UploadActionResultConfig {
+                        upload_ac_results_strategy:
+                            nativelink_config::cas_server::UploadCacheResultsStrategy::Never,
+                        ..Default::default()
+                    },
+                max_action_timeout: Duration::MAX,
+                max_upload_timeout: Duration::from_secs(DEFAULT_MAX_UPLOAD_TIMEOUT),
+                timeout_handled_externally: false,
+                directory_cache: None,
+                peer_locality_map,
+            })?);
+
+        // Upload a minimal command + empty input root + action to CAS.
+        #[cfg(target_family = "unix")]
+        let arguments = vec![
+            "sh".to_string(),
+            "-c".to_string(),
+            "true".to_string(),
+        ];
+        #[cfg(target_family = "windows")]
+        let arguments = vec![
+            "cmd".to_string(),
+            "/C".to_string(),
+            "echo ok".to_string(),
+        ];
+
+        let command = Command {
+            arguments,
+            output_paths: vec![],
+            working_directory: ".".to_string(),
+            environment_variables: vec![EnvironmentVariable {
+                name: "PATH".to_string(),
+                value: env::var("PATH").unwrap(),
+            }],
+            ..Default::default()
+        };
+        let command_digest = serialize_and_upload_message(
+            &command,
+            cas_store.as_pin(),
+            &mut DigestHasherFunc::Sha256.hasher(),
+        )
+        .await?;
+        let input_root_digest = serialize_and_upload_message(
+            &Directory::default(),
+            cas_store.as_pin(),
+            &mut DigestHasherFunc::Sha256.hasher(),
+        )
+        .await?;
+        let action = Action {
+            command_digest: Some(command_digest.into()),
+            input_root_digest: Some(input_root_digest.into()),
+            ..Default::default()
+        };
+        let action_digest = serialize_and_upload_message(
+            &action,
+            cas_store.as_pin(),
+            &mut DigestHasherFunc::Sha256.hasher(),
+        )
+        .await?;
+
+        let execute_request = ExecuteRequest {
+            action_digest: Some(action_digest.into()),
+            ..Default::default()
+        };
+
+        Ok((
+            running_actions_manager,
+            execute_request,
+            action,
+            root_action_directory,
+        ))
+    }
+
+    #[nativelink_test]
+    async fn test_peer_hints_registered_in_locality_map(
+    ) -> Result<(), Box<dyn std::error::Error>> {
+        const WORKER_ID: &str = "peer_hint_worker";
+
+        let locality_map = new_shared_blob_locality_map();
+        let (running_actions_manager, execute_request, action, root_action_directory) =
+            setup_peer_hint_test(Some(locality_map.clone())).await?;
+
+        let d1 = DigestInfo::new([0xAA; 32], 1000);
+        let d1_proto: Digest = d1.into();
+
+        let running_action = running_actions_manager
+            .clone()
+            .create_and_add_action(
+                WORKER_ID.to_string(),
+                StartExecute {
+                    execute_request: Some(execute_request),
+                    operation_id: OperationId::default().to_string(),
+                    queued_timestamp: Some(make_system_time(1000).into()),
+                    platform: action.platform.clone(),
+                    worker_id: WORKER_ID.to_string(),
+                    peer_hints: vec![PeerHint {
+                        digest: Some(d1_proto),
+                        peer_endpoints: vec!["worker-a:50081".to_string()],
+                    }],
+                },
+            )
+            .await?;
+
+        // Verify the locality map was populated.
+        {
+            let map = locality_map.read();
+            let workers = map.lookup_workers(&d1);
+            assert_eq!(workers.len(), 1, "Expected 1 endpoint for d1");
+            assert_eq!(&*workers[0], "worker-a:50081");
+        }
+
+        // Clean up.
+        running_action.cleanup().await?;
+        fs::remove_dir_all(&root_action_directory).await?;
+        Ok(())
+    }
+
+    #[nativelink_test]
+    async fn test_empty_peer_hints_no_error() -> Result<(), Box<dyn std::error::Error>> {
+        const WORKER_ID: &str = "empty_hints_worker";
+
+        let locality_map = new_shared_blob_locality_map();
+        let (running_actions_manager, execute_request, action, root_action_directory) =
+            setup_peer_hint_test(Some(locality_map.clone())).await?;
+
+        let running_action = running_actions_manager
+            .clone()
+            .create_and_add_action(
+                WORKER_ID.to_string(),
+                StartExecute {
+                    execute_request: Some(execute_request),
+                    operation_id: OperationId::default().to_string(),
+                    queued_timestamp: Some(make_system_time(1000).into()),
+                    platform: action.platform.clone(),
+                    worker_id: WORKER_ID.to_string(),
+                    peer_hints: Vec::new(),
+                },
+            )
+            .await?;
+
+        // Locality map should be empty.
+        {
+            let map = locality_map.read();
+            assert_eq!(map.digest_count(), 0, "Expected no digests in locality map");
+            assert_eq!(
+                map.endpoint_count(),
+                0,
+                "Expected no endpoints in locality map"
+            );
+        }
+
+        running_action.cleanup().await?;
+        fs::remove_dir_all(&root_action_directory).await?;
+        Ok(())
+    }
+
+    #[nativelink_test]
+    async fn test_peer_hints_without_locality_map() -> Result<(), Box<dyn std::error::Error>> {
+        const WORKER_ID: &str = "no_map_worker";
+
+        // Pass None for peer_locality_map.
+        let (running_actions_manager, execute_request, action, root_action_directory) =
+            setup_peer_hint_test(None).await?;
+
+        let d1 = DigestInfo::new([0xBB; 32], 500);
+        let d1_proto: Digest = d1.into();
+
+        // Should not panic or error even though peer_hints are provided.
+        let running_action = running_actions_manager
+            .clone()
+            .create_and_add_action(
+                WORKER_ID.to_string(),
+                StartExecute {
+                    execute_request: Some(execute_request),
+                    operation_id: OperationId::default().to_string(),
+                    queued_timestamp: Some(make_system_time(1000).into()),
+                    platform: action.platform.clone(),
+                    worker_id: WORKER_ID.to_string(),
+                    peer_hints: vec![PeerHint {
+                        digest: Some(d1_proto),
+                        peer_endpoints: vec!["worker-x:50081".to_string()],
+                    }],
+                },
+            )
+            .await?;
+
+        running_action.cleanup().await?;
+        fs::remove_dir_all(&root_action_directory).await?;
+        Ok(())
+    }
+
+    #[nativelink_test]
+    async fn test_multiple_endpoints_per_hint() -> Result<(), Box<dyn std::error::Error>> {
+        const WORKER_ID: &str = "multi_endpoint_worker";
+
+        let locality_map = new_shared_blob_locality_map();
+        let (running_actions_manager, execute_request, action, root_action_directory) =
+            setup_peer_hint_test(Some(locality_map.clone())).await?;
+
+        let d1 = DigestInfo::new([0xCC; 32], 2000);
+        let d1_proto: Digest = d1.into();
+
+        let running_action = running_actions_manager
+            .clone()
+            .create_and_add_action(
+                WORKER_ID.to_string(),
+                StartExecute {
+                    execute_request: Some(execute_request),
+                    operation_id: OperationId::default().to_string(),
+                    queued_timestamp: Some(make_system_time(1000).into()),
+                    platform: action.platform.clone(),
+                    worker_id: WORKER_ID.to_string(),
+                    peer_hints: vec![PeerHint {
+                        digest: Some(d1_proto),
+                        peer_endpoints: vec![
+                            "worker-a:50081".to_string(),
+                            "worker-b:50081".to_string(),
+                        ],
+                    }],
+                },
+            )
+            .await?;
+
+        // Both endpoints should be registered for d1.
+        {
+            let map = locality_map.read();
+            let workers = map.lookup_workers(&d1);
+            assert_eq!(workers.len(), 2, "Expected 2 endpoints for d1");
+            assert!(
+                workers.iter().any(|w| &**w == "worker-a:50081"),
+                "Expected worker-a:50081 in endpoints"
+            );
+            assert!(
+                workers.iter().any(|w| &**w == "worker-b:50081"),
+                "Expected worker-b:50081 in endpoints"
+            );
+        }
+
+        running_action.cleanup().await?;
+        fs::remove_dir_all(&root_action_directory).await?;
+        Ok(())
+    }
 }
diff --git a/nativelink-worker/tests/utils/local_worker_test_utils.rs b/nativelink-worker/tests/utils/local_worker_test_utils.rs
index a655fe613..3f79a09b1 100644
--- a/nativelink-worker/tests/utils/local_worker_test_utils.rs
+++ b/nativelink-worker/tests/utils/local_worker_test_utils.rs
@@ -32,13 +32,10 @@ use nativelink_worker::local_worker::LocalWorker;
 use nativelink_worker::worker_api_client_wrapper::WorkerApiClientTrait;
 use tokio::sync::{broadcast, mpsc};
 use tonic::Status;
-use tonic::{
-    Response,
-    Streaming,
-    codec::Codec, // Needed for .decoder().
-    codec::CompressionEncoding,
-    codec::ProstCodec,
-};
+use tonic::{Response, Streaming, codec::CompressionEncoding};
+use tonic_prost::ProstCodec;
+// Needed for .decoder().
+use tonic::codec::Codec;
 
 use super::mock_running_actions_manager::MockRunningActionsManager;
@@ -186,6 +183,13 @@ impl WorkerApiClientTrait for MockWorkerApiClient {
     async fn execution_complete(&mut self, _request: ExecuteComplete) -> Result<(), Error> {
         Ok(())
     }
+
+    async fn blobs_available(
+        &mut self,
+        _request: nativelink_proto::com::github::trace_machina::nativelink::remote_execution::BlobsAvailableNotification,
+    ) -> Result<(), Error> {
+        Ok(())
+    }
 }
 
 pub(crate) fn setup_grpc_stream() -> (
@@ -213,6 +217,8 @@ pub(crate) async fn setup_local_worker_with_config(
             Box::pin(async move { Ok(mock_worker_api_client) })
         }),
         Box::new(move |_| Box::pin(async move { /* No sleep */ })),
+        None, // No periodic BlobsAvailable in tests
+        None, // No CAS server guard in tests
     );
 
     let (shutdown_tx_test, _) = broadcast::channel::<ShutdownGuard>(BROADCAST_CAPACITY);
diff --git a/nativelink-worker/tests/utils/mock_running_actions_manager.rs b/nativelink-worker/tests/utils/mock_running_actions_manager.rs
index 4efe50132..254aa0850 100644
--- a/nativelink-worker/tests/utils/mock_running_actions_manager.rs
+++ b/nativelink-worker/tests/utils/mock_running_actions_manager.rs
@@ -183,6 +183,18 @@ impl RunningActionsManager for MockRunningActionsManager {
     fn metrics(&self) -> &Arc<Metrics> {
         &self.metrics
     }
+
+    async fn cached_directory_digests(&self) -> Vec<DigestInfo> {
+        Vec::new()
+    }
+
+    async fn all_subtree_digests(&self) -> Vec<DigestInfo> {
+        Vec::new()
+    }
+
+    async fn take_pending_subtree_changes(&self) -> (Vec<DigestInfo>, Vec<DigestInfo>) {
+        (Vec::new(), Vec::new())
+    }
 }
 
 #[derive(Debug)]
diff --git a/nativelink-worker/tests/worker_utils_test.rs b/nativelink-worker/tests/worker_utils_test.rs
index 62e16b574..a1cb01cc8 100644
--- a/nativelink-worker/tests/worker_utils_test.rs
+++ b/nativelink-worker/tests/worker_utils_test.rs
@@ -22,7 +22,7 @@ async fn make_connect_worker_request_with_extra_envs() -> Result<(), Error> {
     extra_envs.insert("PATH".into(), env::var("PATH").unwrap());
 
     let res =
-        make_connect_worker_request("1234".to_string(), &worker_properties, &extra_envs, 1).await?;
+        make_connect_worker_request("1234".to_string(), &worker_properties, &extra_envs, 1, String::new()).await?;
     assert_eq!(
         res.properties.first(),
         Some(&Property {
diff --git a/src/bin/nativelink.rs b/src/bin/nativelink.rs
index cfad2a0e4..9f8ee8b81 100644
--- a/src/bin/nativelink.rs
+++ b/src/bin/nativelink.rs
@@ -16,6 +16,7 @@ use core::net::SocketAddr;
 use core::time::Duration;
 use std::collections::{HashMap, HashSet};
 use std::sync::Arc;
+use std::sync::atomic::{AtomicU64, Ordering};
 
 use async_lock::Mutex as AsyncMutex;
 use axum::Router;
@@ -45,6 +46,7 @@ use nativelink_service::fetch_server::FetchServer;
 use nativelink_service::health_server::HealthServer;
 use nativelink_service::push_server::PushServer;
 use nativelink_service::worker_api_server::WorkerApiServer;
+use nativelink_util::blob_locality_map;
 use nativelink_store::default_store_factory::store_factory;
 use nativelink_store::store_manager::StoreManager;
 use nativelink_util::common::fs::set_open_file_limit;
@@ -63,6 +65,7 @@ use nativelink_util::{background_spawn, fs, spawn};
 use nativelink_worker::local_worker::new_local_worker;
 use rustls_pki_types::pem::PemObject;
 use rustls_pki_types::{CertificateRevocationListDer, PrivateKeyDer};
+use socket2::SockRef;
 use tokio::net::TcpListener;
 use tokio::select;
 #[cfg(target_family = "unix")]
@@ -145,7 +148,13 @@ impl RoutesExt for Routes {
 }
 
 /// If this value changes update the documentation in the config definition.
-const DEFAULT_MAX_DECODING_MESSAGE_SIZE: usize = 4 * 1024 * 1024;
+const DEFAULT_MAX_DECODING_MESSAGE_SIZE: usize = 64 * 1024 * 1024;
+
+/// Server-side encoding (response) limit. Bazel's Java gRPC client defaults
+/// to 4 MiB max inbound message size, so we default to 4 MiB. Workers that
+/// need larger responses should use a separate listener with a higher
+/// `max_encoding_message_size` in the config.
+const DEFAULT_MAX_ENCODING_MESSAGE_SIZE: usize = 4 * 1024 * 1024;
 
 macro_rules! service_setup {
     ($v: tt, $http_config: tt) => {{
@@ -156,6 +165,12 @@ macro_rules! service_setup {
            $http_config.max_decoding_message_size
        };
        service = service.max_decoding_message_size(max_decoding_message_size);
+        let max_encoding_message_size = if $http_config.max_encoding_message_size == 0 {
+            DEFAULT_MAX_ENCODING_MESSAGE_SIZE
+        } else {
+            $http_config.max_encoding_message_size
+        };
+        service = service.max_encoding_message_size(max_encoding_message_size);
        let send_algo = &$http_config.compression.send_compression_algorithm;
        if let Some(encoding) = into_encoding(send_algo.unwrap_or(HttpCompressionAlgorithm::None)) {
            service = service.send_compressed(encoding);
@@ -181,6 +196,7 @@ async fn inner_main(
    const fn into_encoding(from: HttpCompressionAlgorithm) -> Option<CompressionEncoding> {
        match from {
            HttpCompressionAlgorithm::Gzip => Some(CompressionEncoding::Gzip),
+            HttpCompressionAlgorithm::Zstd => Some(CompressionEncoding::Zstd),
            HttpCompressionAlgorithm::None => None,
        }
    }
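+
+// Editor's note (illustrative sketch, not part of the original change): a
+// worker-facing listener can opt back into large responses per listener,
+// since `service_setup!` above reads `$http_config.max_encoding_message_size`
+// and treats 0 as "use the 4 MiB default". Assuming the JSON5 field sits
+// alongside the other per-listener HTTP options, that would look roughly
+// like:
+//
+//   { name: "worker_facing",
+//     listener: { http: { socket_address: "0.0.0.0:50061" } },
+//     max_encoding_message_size: 67108864, // 64 MiB; hypothetical placement,
+//     ... }                                // authoritative name is whatever
+//                                          // http_config deserializes from.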
@@ -229,11 +245,17 @@ async fn inner_main(
        })
        .transpose()?;
 
+    // Create a shared blob locality map for peer-to-peer blob sharing.
+    // This map is shared between the scheduler (for locality scoring and
+    // peer hint generation) and WorkerApiServer (for receiving
+    // BlobsAvailable updates from workers).
+    let locality_map = blob_locality_map::new_shared_blob_locality_map();
+
    let mut action_schedulers = HashMap::new();
    let mut worker_schedulers = HashMap::new();
    for SchedulerConfig { name, spec } in cfg.schedulers.iter().flatten() {
        let (maybe_action_scheduler, maybe_worker_scheduler) =
-            scheduler_factory(spec, &store_manager, maybe_origin_event_tx.as_ref())
+            scheduler_factory(spec, &store_manager, maybe_origin_event_tx.as_ref(), Some(locality_map.clone()))
                .err_tip(|| format!("Failed to create scheduler '{name}'"))?;
        if let Some(action_scheduler) = maybe_action_scheduler {
            action_schedulers.insert(name.clone(), action_scheduler.clone());
@@ -245,6 +267,41 @@ async fn inner_main(
 
    let server_cfgs: Vec<ServerConfig> = cfg.servers.into_iter().collect();
 
+    // Wrap CAS stores with WorkerProxyStore so the server can proxy reads
+    // to workers that have the blob (discovered via BlobsAvailable reports).
+    {
+        let mut cas_store_names: HashSet<String> = HashSet::new();
+        for server_cfg in &server_cfgs {
+            if let Some(ref services) = server_cfg.services {
+                if let Some(ref cas_cfgs) = services.cas {
+                    for c in cas_cfgs {
+                        cas_store_names.insert(c.config.cas_store.clone());
+                    }
+                }
+                if let Some(ref bs_cfgs) = services.bytestream {
+                    for c in bs_cfgs {
+                        cas_store_names.insert(c.config.cas_store.clone());
+                    }
+                }
+            }
+        }
+        for store_name in &cas_store_names {
+            if let Some(original_store) = store_manager.get_store(store_name) {
+                let proxy_store = nativelink_util::store_trait::Store::new(
+                    nativelink_store::worker_proxy_store::WorkerProxyStore::new(
+                        original_store,
+                        locality_map.clone(),
+                    ),
+                );
+                store_manager.add_store(store_name, proxy_store);
+                info!(
+                    store_name,
+                    "Wrapped CAS store with WorkerProxyStore for peer blob sharing"
+                );
+            }
+        }
+    }
+
    for server_cfg in server_cfgs {
        let services = server_cfg
            .services
@@ -327,7 +384,7 @@ async fn inner_main(
            services
                .worker_api
                .map_or(Ok(None), |cfg| {
-                    WorkerApiServer::new(&cfg, &worker_schedulers)
+                    WorkerApiServer::new(&cfg, &worker_schedulers, Some(locality_map.clone()))
                        .map(|v| Some(service_setup!(v, http_config)))
                })
                .err_tip(|| "Could not create WorkerApi service")?,
@@ -518,18 +575,27 @@ async fn inner_main(
                    || "Could not convert experimental_http2_max_pending_accept_reset_streams",
                )?);
            }
-            if let Some(value) = http_config.experimental_http2_initial_stream_window_size {
-                http.http2().initial_stream_window_size(value);
-            }
-            if let Some(value) = http_config.experimental_http2_initial_connection_window_size {
-                http.http2().initial_connection_window_size(value);
-            }
+            // Default to 16 MiB stream window and 32 MiB connection window
+            // to avoid capping per-stream throughput at ~64 MB/s with 1ms RTT
+            // (hyper's default of 64 KiB is too small for high-bandwidth links).
+            http.http2().initial_stream_window_size(
+                http_config
+                    .experimental_http2_initial_stream_window_size
+                    .unwrap_or(16 * 1024 * 1024),
+            );
+            http.http2().initial_connection_window_size(
+                http_config
+                    .experimental_http2_initial_connection_window_size
+                    .unwrap_or(32 * 1024 * 1024),
+            );
            if let Some(value) = http_config.experimental_http2_adaptive_window {
                http.http2().adaptive_window(value);
            }
-            if let Some(value) = http_config.experimental_http2_max_frame_size {
-                http.http2().max_frame_size(value);
-            }
+            http.http2().max_frame_size(
+                http_config
+                    .experimental_http2_max_frame_size
+                    .unwrap_or(64 * 1024),
+            );
            if let Some(value) = http_config.experimental_http2_max_concurrent_streams {
                http.http2().max_concurrent_streams(value);
            }
@@ -537,11 +603,14 @@
                http.http2()
                    .keep_alive_timeout(Duration::from_secs(u64::from(value)));
            }
-            if let Some(value) = http_config.experimental_http2_max_send_buf_size {
-                http.http2().max_send_buf_size(
-                    usize::try_from(value).err_tip(|| "Could not convert http2_max_send_buf_size")?,
-                );
-            }
+            http.http2().max_send_buf_size(
+                usize::try_from(
+                    http_config
+                        .experimental_http2_max_send_buf_size
+                        .unwrap_or(2 * 1024 * 1024),
+                )
+                .err_tip(|| "Could not convert http2_max_send_buf_size")?,
+            );
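+            // Editor's note (illustrative math, not part of the original
+            // change): with a bandwidth-delay-product model, a single HTTP/2
+            // stream's throughput is bounded by window_size / RTT, e.g.
+            //   64 KiB / 1 ms ~= 64 MiB/s  (hyper's old default, the cap
+            //                               mentioned above)
+            //   16 MiB / 1 ms ~= 16 GiB/s  (headroom under the new default)
+            // so the larger windows stop a single low-RTT stream from being
+            // flow-control limited on fast links.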
            if http_config.experimental_http2_enable_connect_protocol == Some(true) {
                http.http2().enable_connect_protocol();
            }
@@ -555,6 +624,25 @@ async fn inner_main(
                accept_result = tcp_listener.accept() => {
                    match accept_result {
                        Ok((tcp_stream, remote_addr)) => {
+                            // Disable Nagle's algorithm to reduce latency
+                            // on small writes (e.g., gRPC frames).
+                            if let Err(err) = tcp_stream.set_nodelay(true) {
+                                error!(
+                                    target: "nativelink::services",
+                                    ?err,
+                                    "Failed to set TCP_NODELAY"
+                                );
+                            }
+                            // Enable TCP keepalive to detect dead connections.
+                            // Uses system defaults (tcp_keepalive_time/intvl/probes).
+                            let sock_ref = SockRef::from(&tcp_stream);
+                            if let Err(err) = sock_ref.set_keepalive(true) {
+                                error!(
+                                    target: "nativelink::services",
+                                    ?err,
+                                    "Failed to set SO_KEEPALIVE"
+                                );
+                            }
                            info!(
                                target: "nativelink::services",
                                ?remote_addr,
@@ -710,6 +798,12 @@ fn get_config() -> Result<CasConfig, Box<dyn std::error::Error>> {
    CasConfig::try_from_json5_file(&args.config_file)
 }
 
+/// Dump all thread stacks to a timestamped file for post-mortem analysis.
+/// Reads /proc/self/task/*/comm, status, wchan, and stack (if permitted).
+fn dump_thread_stacks() {
+    nativelink_util::stall_detector::dump_thread_stacks("runtime-watchdog");
+}
+
 fn main() -> Result<(), Box<dyn std::error::Error>> {
    #[expect(clippy::disallowed_methods, reason = "starting main runtime")]
    let runtime = tokio::runtime::Builder::new_multi_thread()
@@ -761,7 +855,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
        tokio::signal::ctrl_c()
            .await
            .expect("Failed to listen to SIGINT");
-        eprintln!("User terminated process via SIGINT");
+        error!("User terminated process via SIGINT");
        std::process::exit(130);
    });
 
@@ -785,6 +879,57 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
        std::process::exit(143);
    });
 
+    // Spawn a heartbeat task inside the tokio runtime and an external
+    // watchdog OS thread that detects when the runtime stalls.
+    let heartbeat_counter = Arc::new(AtomicU64::new(0));
+    let heartbeat_counter_task = heartbeat_counter.clone();
+    #[expect(clippy::disallowed_methods, reason = "runtime watchdog heartbeat")]
+    runtime.spawn(async move {
+        let mut ticker = tokio::time::interval(Duration::from_millis(500));
+        loop {
+            ticker.tick().await;
+            heartbeat_counter_task.fetch_add(1, Ordering::Relaxed);
+        }
+    });
+    std::thread::Builder::new()
+        .name("runtime-watchdog".to_string())
+        .spawn(move || {
+            let stall_threshold = Duration::from_secs(2);
+            let check_interval = Duration::from_secs(1);
+            loop {
+                let before = heartbeat_counter.load(Ordering::Relaxed);
+                std::thread::sleep(check_interval);
+                let after = heartbeat_counter.load(Ordering::Relaxed);
+                if before == after {
+                    let stall_start = std::time::Instant::now();
+                    let mut stall_logged = false;
+                    // Confirmed stall — wait until it resolves to measure duration.
+                    loop {
+                        std::thread::sleep(Duration::from_millis(100));
+                        let now = heartbeat_counter.load(Ordering::Relaxed);
+                        if now != after {
+                            let stall_duration = stall_start.elapsed();
+                            error!(
+                                "RUNTIME STALL RESOLVED: tokio runtime was unresponsive for {:.1}s (heartbeat stuck at {after})",
+                                stall_duration.as_secs_f64() + check_interval.as_secs_f64(),
+                            );
+                            break;
+                        }
+                        if !stall_logged && stall_start.elapsed() > stall_threshold {
+                            stall_logged = true;
+                            let total = stall_threshold.as_secs_f64()
+                                + check_interval.as_secs_f64();
+                            error!(
+                                "RUNTIME STALL IN PROGRESS: tokio runtime unresponsive for >{total:.1}s (heartbeat stuck at {after})",
+                            );
+                            dump_thread_stacks();
+                        }
+                    }
+                }
+            }
+        })
+        .expect("Failed to spawn runtime watchdog thread");
+
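+    // Editor's note (illustrative, not part of the original change): with a
+    // 500 ms heartbeat and a 1 s sampling interval, the watchdog suspects a
+    // stall after at most ~1 s of silence, and logs "IN PROGRESS" plus a
+    // stack dump only once the stall outlives stall_threshold (2 s). A brief
+    // pause therefore never triggers a dump, and detection latency is bounded
+    // by roughly check_interval + stall_threshold.
+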
    #[expect(clippy::disallowed_methods, reason = "waiting on everything to finish")]
    runtime
        .block_on(async {
diff --git a/tests/blobs_available_integration_test.rs b/tests/blobs_available_integration_test.rs
new file mode 100644
index 000000000..2fc83d48f
--- /dev/null
+++ b/tests/blobs_available_integration_test.rs
@@ -0,0 +1,879 @@
+// Copyright 2025 The NativeLink Authors. All rights reserved.
+//
+// Licensed under the Functional Source License, Version 1.1, Apache 2.0 Future License
+// (the "License"); you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// See LICENSE file for details
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Integration test: 1 nativelink server + 3 workers exercising BlobsAvailable.
+//!
+//! Verifies the callback-based BlobsAvailable reporting pipeline:
+//! 1. Workers connect and register with the scheduler
+//! 2. Each worker sends an initial full-snapshot BlobsAvailable
+//! 3. Blobs uploaded to a worker's CAS trigger the on_insert callback
+//! 4. The next periodic tick sends a delta with just the new blobs
+//! 5. The server processes notifications and populates the locality map
+//! 6. When a worker disconnects, the server cleans up the locality map
+
+use std::io::{BufRead, BufReader};
+use std::path::{Path, PathBuf};
+use std::process::{Child, Command, Stdio};
+use std::sync::atomic::{AtomicBool, Ordering};
+use std::sync::{Arc, Mutex};
+use std::time::Duration;
+
+use tracing::error;
+
+use nativelink_proto::build::bazel::remote::execution::v2::{
+    batch_update_blobs_request,
+    content_addressable_storage_client::ContentAddressableStorageClient, BatchReadBlobsRequest,
+    BatchUpdateBlobsRequest, Digest,
+};
+use sha2::{Digest as Sha2Digest, Sha256};
+use tempfile::TempDir;
+use tonic::metadata::MetadataValue;
+use tonic::transport::Channel;
+
+// ---------------------------------------------------------------------------
+// Helpers
+// ---------------------------------------------------------------------------
+
+/// Allocate a free TCP port by binding to port 0 and extracting the OS-assigned port.
+fn get_free_port() -> u16 {
+    let listener = std::net::TcpListener::bind("127.0.0.1:0").unwrap();
+    listener.local_addr().unwrap().port()
+}
+
+struct Ports {
+    public: u16,
+    worker_api: u16,
+    cas: [u16; 3],
+}
+
+fn allocate_ports() -> Ports {
+    Ports {
+        public: get_free_port(),
+        worker_api: get_free_port(),
+        cas: [get_free_port(), get_free_port(), get_free_port()],
+    }
+}
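+
+// Editor's note (illustrative, not part of the original change): bind-then-
+// drop port allocation is inherently racy -- another process can claim a
+// port between `allocate_ports()` and the server binding it. The retrying
+// upload loop in the test below tolerates slow startup, which also papers
+// over a lost race; a flaky failure here usually means the race was lost,
+// not that the feature is broken.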
+
+/// Write a JSON5 config with 1 server (2 listeners) + 3 workers.
+fn write_config(temp_dir: &Path, ports: &Ports) -> PathBuf {
+    let d = temp_dir.to_string_lossy().replace('\\', "/");
+    let config = format!(
+        r#"{{
+  stores: [
+    {{ name: "AC_STORE", memory: {{ eviction_policy: {{ max_bytes: 100000000 }} }} }},
+    {{ name: "SERVER_CAS", memory: {{ eviction_policy: {{ max_bytes: 100000000 }} }} }},
+    {{
+      name: "W1_STORE",
+      fast_slow: {{
+        fast: {{ filesystem: {{
+          content_path: "{d}/w1/cas",
+          temp_path: "{d}/w1/tmp",
+          eviction_policy: {{ max_bytes: 100000000 }},
+        }} }},
+        slow: {{ grpc: {{
+          instance_name: "main",
+          endpoints: [{{ address: "grpc://127.0.0.1:{public}" }}],
+          store_type: "cas",
+        }} }},
+        slow_direction: "get",
+      }},
+    }},
+    {{
+      name: "W2_STORE",
+      fast_slow: {{
+        fast: {{ filesystem: {{
+          content_path: "{d}/w2/cas",
+          temp_path: "{d}/w2/tmp",
+          eviction_policy: {{ max_bytes: 100000000 }},
+        }} }},
+        slow: {{ grpc: {{
+          instance_name: "main",
+          endpoints: [{{ address: "grpc://127.0.0.1:{public}" }}],
+          store_type: "cas",
+        }} }},
+        slow_direction: "get",
+      }},
+    }},
+    {{
+      name: "W3_STORE",
+      fast_slow: {{
+        fast: {{ filesystem: {{
+          content_path: "{d}/w3/cas",
+          temp_path: "{d}/w3/tmp",
+          eviction_policy: {{ max_bytes: 100000000 }},
+        }} }},
+        slow: {{ grpc: {{
+          instance_name: "main",
+          endpoints: [{{ address: "grpc://127.0.0.1:{public}" }}],
+          store_type: "cas",
+        }} }},
+        slow_direction: "get",
+      }},
+    }},
+  ],
+  schedulers: [
+    {{
+      name: "MAIN",
+      simple: {{
+        supported_platform_properties: {{ cpu_count: "minimum" }},
+      }},
+    }},
+  ],
+  workers: [
+    {{ local: {{
+      name: "worker-1",
+      worker_api_endpoint: {{ uri: "grpc://127.0.0.1:{wapi}" }},
+      cas_fast_slow_store: "W1_STORE",
+      cas_server_port: {c1},
+      blobs_available_interval_ms: 200,
+      work_directory: "{d}/w1/work",
+      upload_action_result: {{ upload_ac_results_strategy: "never" }},
+      platform_properties: {{ cpu_count: {{ values: ["1"] }} }},
+    }} }},
+    {{ local: {{
+      name: "worker-2",
+      worker_api_endpoint: {{ uri: "grpc://127.0.0.1:{wapi}" }},
+      cas_fast_slow_store: "W2_STORE",
+      cas_server_port: {c2},
+      blobs_available_interval_ms: 200,
+      work_directory: "{d}/w2/work",
+      upload_action_result: {{ upload_ac_results_strategy: "never" }},
+      platform_properties: {{ cpu_count: {{ values: ["1"] }} }},
+    }} }},
+    {{ local: {{
+      name: "worker-3",
+      worker_api_endpoint: {{ uri: "grpc://127.0.0.1:{wapi}" }},
+      cas_fast_slow_store: "W3_STORE",
+      cas_server_port: {c3},
+      blobs_available_interval_ms: 200,
+      work_directory: "{d}/w3/work",
+      upload_action_result: {{ upload_ac_results_strategy: "never" }},
+      platform_properties: {{ cpu_count: {{ values: ["1"] }} }},
+    }} }},
+  ],
+  servers: [
+    {{
+      name: "public",
+      listener: {{ http: {{ socket_address: "127.0.0.1:{public}" }} }},
+      services: {{
+        cas: [{{ instance_name: "main", cas_store: "SERVER_CAS" }}],
+        bytestream: [{{ instance_name: "main", cas_store: "SERVER_CAS" }}],
+        capabilities: [{{ instance_name: "main", remote_execution: {{ scheduler: "MAIN" }} }}],
+      }},
+    }},
+    {{
+      name: "worker_api",
+      listener: {{ http: {{ socket_address: "127.0.0.1:{wapi}" }} }},
+      services: {{
+        worker_api: {{ scheduler: "MAIN" }},
+      }},
+    }},
+  ],
+}}"#,
+        d = d,
+        wapi = ports.worker_api,
+        c1 = ports.cas[0],
+        c2 = ports.cas[1],
+        c3 = ports.cas[2],
+        public = ports.public,
+    );
+    let config_path = temp_dir.join("config.json5");
+    std::fs::write(&config_path, config).unwrap();
+    config_path
+}
+
+/// Compute SHA-256 digest of data, returning (hex_hash, size).
+fn sha256_digest(data: &[u8]) -> (String, i64) {
+    let mut hasher = Sha256::new();
+    hasher.update(data);
+    let hash = format!("{:x}", hasher.finalize());
+    (hash, data.len() as i64)
+}
+
+/// Holds a spawned nativelink process and its collected log lines.
+struct NativeLinkProcess {
+    child: Child,
+    log_lines: Arc<Mutex<Vec<String>>>,
+    /// Set to false when stderr reader thread finishes (child exited).
+    child_alive: Arc<AtomicBool>,
+}
+
+impl NativeLinkProcess {
+    /// Spawn the nativelink binary with the given config file.
+    fn spawn(config_path: &Path) -> Self {
+        let binary = env!("CARGO_BIN_EXE_nativelink");
+
+        let mut child = Command::new(binary)
+            .arg(config_path.to_str().unwrap())
+            .env(
+                "RUST_LOG",
+                "nativelink=trace,nativelink_worker=trace,nativelink_service=trace",
+            )
+            // Disable ANSI color codes for easier log parsing.
+            .env("NO_COLOR", "1")
+            .stdout(Stdio::piped())
+            .stderr(Stdio::piped())
+            .spawn()
+            .expect("Failed to spawn nativelink binary");
+
+        let log_lines: Arc<Mutex<Vec<String>>> = Arc::new(Mutex::new(Vec::new()));
+        let child_alive = Arc::new(AtomicBool::new(true));
+
+        // Collect stderr lines in a background thread.
+        let stderr = child.stderr.take().expect("Failed to capture stderr");
+        let log_lines_stderr = log_lines.clone();
+        let child_alive_stderr = child_alive.clone();
+        std::thread::spawn(move || {
+            let reader = BufReader::new(stderr);
+            for line in reader.lines() {
+                match line {
+                    Ok(line) => {
+                        log_lines_stderr.lock().unwrap().push(line);
+                    }
+                    Err(_) => break,
+                }
+            }
+            child_alive_stderr.store(false, Ordering::Relaxed);
+        });
+
+        // Also collect stdout in case tracing writes there.
+        let stdout = child.stdout.take().expect("Failed to capture stdout");
+        let log_lines_stdout = log_lines.clone();
+        std::thread::spawn(move || {
+            let reader = BufReader::new(stdout);
+            for line in reader.lines() {
+                match line {
+                    Ok(line) => {
+                        log_lines_stdout.lock().unwrap().push(line);
+                    }
+                    Err(_) => break,
+                }
+            }
+        });
+
+        Self { child, log_lines, child_alive }
+    }
+
+    /// Wait until at least `count` log lines matching `pattern` appear.
+    /// Returns false if the deadline expires or the child process exits.
+    async fn wait_for_log_count(&self, pattern: &str, count: usize, timeout: Duration) -> bool {
+        let deadline = tokio::time::Instant::now() + timeout;
+        loop {
+            {
+                let lines = self.log_lines.lock().unwrap();
+                let found = lines.iter().filter(|l| l.contains(pattern)).count();
+                if found >= count {
+                    return true;
+                }
+            }
+            if tokio::time::Instant::now() > deadline {
+                return false;
+            }
+            // Fail fast if the child process has exited.
+            if !self.child_alive.load(Ordering::Relaxed) {
+                // Give a brief moment for final log lines to flush.
+                tokio::time::sleep(Duration::from_millis(200)).await;
+                let lines = self.log_lines.lock().unwrap();
+                let found = lines.iter().filter(|l| l.contains(pattern)).count();
+                if found < count {
+                    error!(
+                        "!!! Child process exited while waiting for pattern={:?} count={} (found {}). Last 30 lines:",
+                        pattern, count, found,
+                    );
+                    for line in lines.iter().rev().take(30).collect::<Vec<_>>().into_iter().rev() {
+                        error!("  {line}");
+                    }
+                }
+                return found >= count;
+            }
+            tokio::time::sleep(Duration::from_millis(100)).await;
+        }
+    }
+
+    /// Count how many log lines match `pattern`.
+    fn count_logs(&self, pattern: &str) -> usize {
+        let lines = self.log_lines.lock().unwrap();
+        lines.iter().filter(|l| l.contains(pattern)).count()
+    }
+
+    /// Get all log lines matching `pattern`.
+    fn grep_logs(&self, pattern: &str) -> Vec<String> {
+        let lines = self.log_lines.lock().unwrap();
+        lines
+            .iter()
+            .filter(|l| l.contains(pattern))
+            .cloned()
+            .collect()
+    }
+}
+
+impl Drop for NativeLinkProcess {
+    fn drop(&mut self) {
+        // Send SIGKILL to stop the process.
+        let _ = self.child.kill();
+        let _ = self.child.wait();
+    }
+}
+
+/// Upload a blob to a worker's CAS endpoint via BatchUpdateBlobs.
+async fn upload_blob_to_worker_cas(
+    port: u16,
+    data: &[u8],
+) -> Result<(), Box<dyn std::error::Error>> {
+    let channel = Channel::from_shared(format!("http://127.0.0.1:{port}"))
+        .unwrap()
+        .connect_timeout(Duration::from_secs(5))
+        .timeout(Duration::from_secs(10))
+        .connect()
+        .await?;
+
+    let mut client = ContentAddressableStorageClient::new(channel);
+
+    let (hash, size) = sha256_digest(data);
+
+    let request = BatchUpdateBlobsRequest {
+        instance_name: String::new(),
+        requests: vec![batch_update_blobs_request::Request {
+            digest: Some(Digest {
+                hash,
+                size_bytes: size,
+            }),
+            data: data.to_vec().into(),
+            compressor: 0,
+        }],
+        digest_function: 0, // SHA256
+    };
+
+    client.batch_update_blobs(request).await?;
+    Ok(())
+}
+
+/// Read a blob from a CAS endpoint via BatchReadBlobs.
+/// Returns Ok(data) on success, or Err on gRPC/transport error.
+/// A gRPC OK with a non-OK status in the response means the blob was not found.
+async fn read_blob_from_cas(
+    port: u16,
+    instance_name: &str,
+    hash: &str,
+    size: i64,
+) -> Result<Option<Vec<u8>>, Box<dyn std::error::Error>> {
+    let channel = Channel::from_shared(format!("http://127.0.0.1:{port}"))
+        .unwrap()
+        .connect_timeout(Duration::from_secs(5))
+        .timeout(Duration::from_secs(10))
+        .connect()
+        .await?;
+
+    let mut client = ContentAddressableStorageClient::new(channel);
+
+    let request = BatchReadBlobsRequest {
+        instance_name: instance_name.to_string(),
+        digests: vec![Digest {
+            hash: hash.to_string(),
+            size_bytes: size,
+        }],
+        acceptable_compressors: vec![],
+        digest_function: 0,
+    };
+
+    let response = client.batch_read_blobs(request).await?;
+    let inner = response.into_inner();
+
+    if let Some(resp) = inner.responses.first() {
+        // status code 0 = OK
+        if resp.status.as_ref().is_some_and(|s| s.code == 0) {
+            return Ok(Some(resp.data.to_vec()));
+        }
+    }
+    Ok(None)
+}
+
+/// Represents a per-digest result from BatchReadBlobs.
+#[allow(dead_code)]
+struct CasReadResult {
+    /// gRPC status code (0 = OK, 14 = Unavailable, 5 = NotFound, etc.)
+    code: i32,
+    /// Status message (may contain redirect prefix for worker requests).
+    message: String,
+    /// Blob data (empty if not OK).
+    data: Vec<u8>,
+}
+
+/// Read a blob from a CAS endpoint with the `x-nativelink-worker` header set,
+/// simulating a worker-to-server request. Returns the raw per-digest result.
+async fn read_blob_from_cas_as_worker(
+    port: u16,
+    instance_name: &str,
+    hash: &str,
+    size: i64,
+) -> Result<CasReadResult, Box<dyn std::error::Error>> {
+    let channel = Channel::from_shared(format!("http://127.0.0.1:{port}"))
+        .unwrap()
+        .connect_timeout(Duration::from_secs(5))
+        .timeout(Duration::from_secs(10))
+        .connect()
+        .await?;
+
+    let mut client = ContentAddressableStorageClient::new(channel);
+
+    let mut request = tonic::Request::new(BatchReadBlobsRequest {
+        instance_name: instance_name.to_string(),
+        digests: vec![Digest {
+            hash: hash.to_string(),
+            size_bytes: size,
+        }],
+        acceptable_compressors: vec![],
+        digest_function: 0,
+    });
+    // Mark this as a worker request so the server returns a redirect
+    // instead of proxying the blob data.
+ request + .metadata_mut() + .insert("x-nativelink-worker", MetadataValue::from_static("true")); + + let response = client.batch_read_blobs(request).await?; + let inner = response.into_inner(); + + let resp = inner + .responses + .into_iter() + .next() + .expect("Expected at least one response"); + let status = resp.status.unwrap_or_default(); + Ok(CasReadResult { + code: status.code, + message: status.message, + data: resp.data.to_vec(), + }) +} + +// --------------------------------------------------------------------------- +// Test +// --------------------------------------------------------------------------- + +/// Verify the full BlobsAvailable pipeline with 3 workers. +/// +/// Steps: +/// 1. Start a nativelink server with 3 workers, each with a CAS port +/// 2. Wait for all workers to register and start BlobsAvailable reporting +/// 3. Verify that each worker sends an initial full-snapshot BlobsAvailable +/// 4. Upload unique blobs to each worker's CAS endpoint +/// 5. Wait for the next periodic tick to send a delta BlobsAvailable +/// 6. Verify the server logs show the blobs being registered in the locality map +/// 7. Shutdown and verify cleanup +#[tokio::test(flavor = "multi_thread")] +async fn test_blobs_available_three_workers() { + let temp_dir = TempDir::new().expect("Failed to create temp dir"); + let ports = allocate_ports(); + let config_path = write_config(temp_dir.path(), &ports); + + // --- Phase 1: Start the server --- + + let process = NativeLinkProcess::spawn(&config_path); + + // Wait for both server listeners to be ready. + let startup_timeout = Duration::from_secs(30); + assert!( + process + .wait_for_log_count("Ready, listening on", 2, startup_timeout) + .await, + "Server did not start both listeners within timeout. \ + Lines captured: {}. Last 20 lines:\n{}", + process.log_lines.lock().unwrap().len(), + { + let lines = process.log_lines.lock().unwrap(); + lines.iter().rev().take(20).rev().cloned().collect::>().join("\n") + }, + ); + + + // --- Phase 2: Wait for all 3 workers to connect --- + assert!( + process + .wait_for_log_count("Worker registered with scheduler", 3, Duration::from_secs(15)) + .await, + "Not all 3 workers registered. Found {} registrations. Logs:\n{}", + process.count_logs("Worker registered with scheduler"), + process.grep_logs("Worker registered").join("\n"), + ); + + // --- Phase 3: Verify BlobsAvailable reporting was registered --- + assert!( + process + .wait_for_log_count( + "Registered periodic BlobsAvailable reporting", + 3, + Duration::from_secs(5), + ) + .await, + "Not all 3 workers registered BlobsAvailable callbacks. Found {}.", + process.count_logs("Registered periodic BlobsAvailable reporting"), + ); + + // --- Phase 4: Wait for initial full-snapshot BlobsAvailable --- + // Each worker sends a full snapshot (is_first=true) on the first periodic tick. + // blobs_available_interval_ms=200, so this should happen within ~1 second. + assert!( + process + .wait_for_log_count("Sent periodic BlobsAvailable", 3, Duration::from_secs(5)) + .await, + "Not all 3 workers sent initial BlobsAvailable. Found {}.", + process.count_logs("Sent periodic BlobsAvailable"), + ); + + // Verify that the initial snapshots had is_first=true. + let initial_logs = process.grep_logs("Sent periodic BlobsAvailable"); + let is_first_count = initial_logs.iter().filter(|l| l.contains("is_first=true") || l.contains("is_first: true")).count(); + assert!( + is_first_count >= 3, + "Expected at least 3 is_first=true BlobsAvailable, found {is_first_count}. 
+
+    // --- Phase 5: Upload blobs to each worker's CAS ---
+    // Capture the send count BEFORE uploads so we can detect new delta sends.
+    let before_upload_send_count = process.count_logs("Sent periodic BlobsAvailable");
+    let blob_data: Vec<Vec<u8>> = vec![
+        b"Hello from worker-1! This is test blob data.".to_vec(),
+        b"Hello from worker-2! Different test blob data.".to_vec(),
+        b"Hello from worker-3! Yet another test blob.".to_vec(),
+    ];
+
+    for (i, data) in blob_data.iter().enumerate() {
+        let port = ports.cas[i];
+        // Retry a few times in case the worker CAS server isn't ready yet.
+        let mut uploaded = false;
+        for _ in 0..10 {
+            match upload_blob_to_worker_cas(port, data).await {
+                Ok(()) => {
+                    uploaded = true;
+                    break;
+                }
+                Err(_) => {
+                    tokio::time::sleep(Duration::from_millis(500)).await;
+                }
+            }
+        }
+        assert!(uploaded, "Failed to upload blob to worker-{}", i + 1);
+    }
+
+    // --- Phase 6: Wait for delta BlobsAvailable with the new blobs ---
+    // After uploading, the BlobChangeTracker's on_insert callback fires.
+    // The next periodic tick (within 200ms) will send a delta.
+    // We captured before_upload_send_count before uploads started.
+    assert!(
+        process
+            .wait_for_log_count(
+                "Sent periodic BlobsAvailable",
+                before_upload_send_count + 3,
+                Duration::from_secs(5),
+            )
+            .await,
+        "Workers did not send delta BlobsAvailable after blob upload. \
+         Had {before_upload_send_count} sends before upload, now have {}.",
+        process.count_logs("Sent periodic BlobsAvailable"),
+    );
+
+    // --- Phase 7: Verify server-side logging ---
+    // The WorkerApiServer should log "Registering blobs available from worker"
+    // for both the initial snapshot and the delta.
+    let server_register_count = process.count_logs("Registering blobs available from worker");
+    assert!(
+        server_register_count >= 3,
+        "Expected at least 3 'Registering blobs available from worker' logs, found {server_register_count}.",
+    );
+
+    // --- Phase 8: Verify delta-specific behavior ---
+    // After the initial full snapshot, subsequent sends should be deltas.
+    let all_sends = process.grep_logs("Sent periodic BlobsAvailable");
+    let delta_sends = all_sends
+        .iter()
+        .filter(|l| l.contains("is_first=false") || l.contains("is_first: false"))
+        .count();
+    assert!(
+        delta_sends >= 3,
+        "Expected at least 3 delta BlobsAvailable sends (is_first=false), found {delta_sends}.",
+    );
+
+    // --- Phase 9: Verify no-change ticks are skipped (trace level) ---
+    // Workers that have no changes since last tick should log
+    // "BlobsAvailable: no changes since last tick, skipping" at trace level.
+    // Give a little extra time for ticks with no changes.
+    tokio::time::sleep(Duration::from_millis(500)).await;
+    let skip_count = process.count_logs("no changes since last tick, skipping");
+    // We expect at least some skips once the delta has been sent and there
+    // are no further changes.
+    assert!(
+        skip_count > 0,
+        "Expected at least some 'no changes since last tick, skipping' trace logs \
+         (workers should skip sending when there are no new changes).",
+    );
+
+    // --- Phase 10: Verify the starting CAS server logs ---
+    let cas_server_logs = process.grep_logs("Starting worker CAS server for peer blob sharing");
+    assert_eq!(
+        cas_server_logs.len(),
+        3,
+        "Expected 3 worker CAS server start logs, found {}. Logs:\n{}",
+        cas_server_logs.len(),
+        cas_server_logs.join("\n"),
+    );
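+
+    // Conceptually (a hedged sketch, not the actual type), the server-side
+    // locality map that these BlobsAvailable messages populate is roughly:
+    //     locality_map: HashMap<Digest, HashSet<WorkerEndpoint>>
+    // and WorkerProxyStore consults it to decide which peer holds a blob.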
+
+    // --- Phase 11: Worker-2 reads blob from Worker-1 via peer sharing ---
+    // Upload a unique blob to Worker-1's CAS only. After BlobsAvailable
+    // propagates to the server's locality map, Worker-2 can fetch the blob
+    // through the chain: Worker-2 CAS → slow store (GrpcStore → server) →
+    // server WorkerProxyStore → locality map → Worker-1 CAS.
+    let cross_worker_blob = b"cross-worker test blob for peer sharing";
+    let (cw_hash, cw_size) = sha256_digest(cross_worker_blob);
+
+    // Capture count BEFORE the upload so the delta is not missed.
+    let before_register = process.count_logs("Registering blobs available from worker");
+
+    // Upload to Worker-1's CAS.
+    upload_blob_to_worker_cas(ports.cas[0], cross_worker_blob)
+        .await
+        .expect("Failed to upload cross-worker blob to worker-1");
+
+    // Read the blob back from Worker-1's CAS — should succeed directly.
+    let data = read_blob_from_cas(ports.cas[0], "", &cw_hash, cw_size)
+        .await
+        .expect("gRPC read from worker-1 failed");
+    assert_eq!(
+        data.as_deref(),
+        Some(cross_worker_blob.as_slice()),
+        "Blob read from worker-1's CAS should match uploaded data",
+    );
+    assert!(
+        process
+            .wait_for_log_count(
+                "Registering blobs available from worker",
+                before_register + 1,
+                Duration::from_secs(5),
+            )
+            .await,
+        "Server did not register BlobsAvailable after cross-worker blob upload.",
+    );
+
+    // Now read from Worker-2's CAS — Worker-2 doesn't have the blob locally,
+    // so its effective_cas_store chain kicks in:
+    // fast (FilesystemStore) miss → slow (WorkerProxyStore(GrpcStore → server))
+    // → server redirects → WorkerProxyStore follows redirect → Worker-1 → success
+    let data = read_blob_from_cas(ports.cas[1], "", &cw_hash, cw_size)
+        .await
+        .expect("gRPC read from worker-2 failed");
+
+    assert_eq!(
+        data.as_deref(),
+        Some(cross_worker_blob.as_slice()),
+        "Worker-2 should fetch the blob from Worker-1 via peer sharing",
+    );
+
+    // --- Phase 12: Server proxies CAS read to a worker ---
+    // The server's CAS (SERVER_CAS) is an empty MemoryStore wrapped with
+    // WorkerProxyStore. When a blob is not found locally, WorkerProxyStore
+    // consults the server-side locality map (populated by BlobsAvailable)
+    // and proxies the read to the worker that has it.
+
+    // Upload a unique blob to Worker-3's CAS.
+    let proxy_blob = b"proxy test blob - only on worker-3";
+    let (px_hash, px_size) = sha256_digest(proxy_blob);
+
+    // Capture count BEFORE the upload so the delta is not missed.
+    let before_register = process.count_logs("Registering blobs available from worker");
+
+    upload_blob_to_worker_cas(ports.cas[2], proxy_blob)
+        .await
+        .expect("Failed to upload proxy blob to worker-3");
+    assert!(
+        process
+            .wait_for_log_count(
+                "Registering blobs available from worker",
+                before_register + 1,
+                Duration::from_secs(5),
+            )
+            .await,
+        "Server did not register new BlobsAvailable after proxy blob upload.",
+    );
+
+    // Now read the blob via the server's public CAS endpoint.
+    // The server's MemoryStore doesn't have it, so WorkerProxyStore should
+    // proxy the read to Worker-3's CAS.
+    let data = read_blob_from_cas(ports.public, "main", &px_hash, px_size)
+        .await
+        .expect("gRPC read from server failed");
+
+    assert_eq!(
+        data.as_deref(),
+        Some(proxy_blob.as_slice()),
+        "Server should proxy the CAS read to worker-3 and return the blob",
+    );
+
+    // Verify the WorkerProxyStore logged the proxy operation.
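+    // (Two distinct log lines exist in this pipeline: the server logs
+    // "WorkerProxyStore: successfully proxied blob from worker" when proxying
+    // for a plain client, while a worker that follows a redirect itself logs
+    // "successfully read blob from redirected peer", which the execute test
+    // further below counts.)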
+    assert!(
+        process
+            .wait_for_log_count(
+                "WorkerProxyStore: successfully proxied blob from worker",
+                1,
+                Duration::from_secs(3),
+            )
+            .await,
+        "Expected WorkerProxyStore to log successful proxy read. Logs:\n{}",
+        process.grep_logs("WorkerProxyStore").join("\n"),
+    );
+
+    // --- Phase 13: Verify proxy vs redirect behavior ---
+    // Non-worker requests to the server's CAS should get proxied data.
+    // Worker requests (with x-nativelink-worker header) should get a redirect.
+
+    // Upload a fresh blob to Worker-1 for this test.
+    let redirect_blob = b"redirect vs proxy test blob - only on worker-1";
+    let (rd_hash, rd_size) = sha256_digest(redirect_blob);
+
+    // Capture count BEFORE the upload so the delta is not missed.
+    let before_register = process.count_logs("Registering blobs available from worker");
+
+    upload_blob_to_worker_cas(ports.cas[0], redirect_blob)
+        .await
+        .expect("Failed to upload redirect test blob to worker-1");
+    assert!(
+        process
+            .wait_for_log_count(
+                "Registering blobs available from worker",
+                before_register + 1,
+                Duration::from_secs(5),
+            )
+            .await,
+        "Server did not register BlobsAvailable for redirect test blob.",
+    );
+
+    // 13a: Non-worker request → server proxies data back.
+    let data = read_blob_from_cas(ports.public, "main", &rd_hash, rd_size)
+        .await
+        .expect("Non-worker read from server failed");
+    assert_eq!(
+        data.as_deref(),
+        Some(redirect_blob.as_slice()),
+        "Non-worker request should get proxied blob data from the server",
+    );
+
+    // 13b: Worker request → server returns redirect with peer endpoints.
+    let result = read_blob_from_cas_as_worker(ports.public, "main", &rd_hash, rd_size)
+        .await
+        .expect("Worker read from server failed at transport level");
+    // The server should return FailedPrecondition (code 9) with an NL_REDIRECT:
+    // prefix containing the worker endpoint(s) that have the blob.
+    // FailedPrecondition is used instead of Unavailable so the GrpcStore
+    // retrier does not waste time retrying what is actually a redirect.
+    assert_eq!(
+        result.code, 9, // Code::FailedPrecondition
+        "Worker request should get FailedPrecondition redirect, got code={} message={:?}",
+        result.code, result.message,
+    );
+    assert!(
+        result.message.contains("NL_REDIRECT:"),
+        "Worker redirect message should contain NL_REDIRECT: prefix, got: {:?}",
+        result.message,
+    );
+    // The redirect should contain Worker-1's CAS endpoint.
+    // Workers advertise as grpc://<host>:<port>, so check for the port.
+    let expected_port_suffix = format!(":{}", ports.cas[0]);
+    assert!(
+        result.message.contains(&expected_port_suffix),
+        "Redirect should contain worker-1's CAS port ({}), got: {:?}",
+        expected_port_suffix, result.message,
+    );
+
+    // --- Phase 14: Multi-worker redirect lists all endpoints ---
+    // Upload a blob to Worker-1, then read it from Worker-2 (which populates
+    // Worker-2's CAS via the peer fetch). After Worker-2's BlobsAvailable
+    // propagates, a worker request to the server should get a redirect
+    // listing BOTH Worker-1 and Worker-2 as endpoints.
+    let multi_blob = b"multi-redirect test blob for phase 14";
+    let (multi_hash, multi_size) = sha256_digest(multi_blob);
+
+    let before_register = process.count_logs("Registering blobs available from worker");
+
+    // Upload to Worker-1.
+    upload_blob_to_worker_cas(ports.cas[0], multi_blob)
+        .await
+        .expect("Failed to upload multi-redirect blob to worker-1");
+
+    // Wait for the server to register the blob from Worker-1.
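+    // (Skipping this wait could race the locality-map update: the worker
+    // request below might otherwise see no endpoints for the digest yet.)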
+ assert!( + process + .wait_for_log_count( + "Registering blobs available from worker", + before_register + 1, + Duration::from_secs(5), + ) + .await, + "Server did not register BlobsAvailable for multi-redirect blob.", + ); + + let before_register = process.count_logs("Registering blobs available from worker"); + + // Read from Worker-2's CAS — this triggers peer fetch from Worker-1, + // populating Worker-2's local CAS. + let data = read_blob_from_cas(ports.cas[1], "", &multi_hash, multi_size) + .await + .expect("Worker-2 peer fetch failed for multi-redirect blob"); + assert_eq!( + data.as_deref(), + Some(multi_blob.as_slice()), + "Worker-2 should fetch multi-redirect blob from Worker-1", + ); + + // Wait for Worker-2's BlobsAvailable to propagate the newly cached blob. + assert!( + process + .wait_for_log_count( + "Registering blobs available from worker", + before_register + 1, + Duration::from_secs(5), + ) + .await, + "Server did not register Worker-2's BlobsAvailable after peer fetch.", + ); + + // Now a worker request should get a redirect listing BOTH workers. + let result = read_blob_from_cas_as_worker(ports.public, "main", &multi_hash, multi_size) + .await + .expect("Worker read for multi-redirect failed"); + assert_eq!( + result.code, 9, + "Multi-redirect should use FailedPrecondition, got code={} message={:?}", + result.code, result.message, + ); + assert!( + result.message.contains("NL_REDIRECT:"), + "Multi-redirect should contain NL_REDIRECT: prefix, got: {:?}", + result.message, + ); + // Both Worker-1 and Worker-2 CAS ports should appear in the redirect. + let w1_port = format!(":{}", ports.cas[0]); + let w2_port = format!(":{}", ports.cas[1]); + assert!( + result.message.contains(&w1_port) && result.message.contains(&w2_port), + "Redirect should list both worker-1 ({}) and worker-2 ({}), got: {:?}", + w1_port, w2_port, result.message, + ); + + // Process is killed on drop. +} diff --git a/tests/execute_peer_sharing_test.rs b/tests/execute_peer_sharing_test.rs new file mode 100644 index 000000000..d7a01b688 --- /dev/null +++ b/tests/execute_peer_sharing_test.rs @@ -0,0 +1,734 @@ +// Copyright 2025 The NativeLink Authors. All rights reserved. +// +// Licensed under the Functional Source License, Version 1.1, Apache 2.0 Future License +// (the "License"); you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// See LICENSE file for details +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Integration test: Execute dependent actions where the second action's +//! inputs are fetched from the first action's worker via peer-to-peer blob +//! sharing (WorkerProxyStore redirects). +//! +//! Topology: +//! - 1 nativelink server (CAS + Execution + WorkerApi) +//! - 2 workers with peer CAS servers and distinct `worker_id` properties +//! +//! Flow: +//! 1. Action A targets worker-1, produces output blob +//! 2. BlobsAvailable propagates output digests to the server's locality map +//! 3. Action B targets worker-2, depends on A's output — fetched via peer +//! sharing (WorkerProxyStore proxy → Worker-1 CAS) +//! 4. Action C targets worker-1, depends on B's output — fetched from +//! 
worker-2, verifying bi-directional peer sharing
+
+use std::io::{BufRead, BufReader};
+use std::path::{Path, PathBuf};
+use std::process::{Child, Command as ProcessCommand, Stdio};
+use std::sync::atomic::{AtomicBool, Ordering};
+use std::sync::{Arc, Mutex};
+use std::time::Duration;
+
+use tracing::error;
+
+use nativelink_proto::build::bazel::remote::execution::v2::{
+    batch_update_blobs_request, content_addressable_storage_client::ContentAddressableStorageClient,
+    digest_function, execution_client::ExecutionClient, platform, Action, BatchUpdateBlobsRequest,
+    Command, Digest, Directory, ExecuteRequest, ExecuteResponse, FileNode, Platform,
+};
+use nativelink_proto::google::longrunning::operation;
+use prost::Message;
+use sha2::{Digest as Sha2Digest, Sha256};
+use tempfile::TempDir;
+use tonic::transport::Channel;
+
+// ---------------------------------------------------------------------------
+// Helpers
+// ---------------------------------------------------------------------------
+
+fn get_free_port() -> u16 {
+    let listener = std::net::TcpListener::bind("127.0.0.1:0").unwrap();
+    listener.local_addr().unwrap().port()
+}
+
+struct Ports {
+    public: u16,
+    worker_api: u16,
+    cas: [u16; 2],
+}
+
+fn allocate_ports() -> Ports {
+    Ports {
+        public: get_free_port(),
+        worker_api: get_free_port(),
+        cas: [get_free_port(), get_free_port()],
+    }
+}
+
+/// Compute the SHA-256 digest of data, returning a proto Digest.
+fn sha256_digest_proto(data: &[u8]) -> Digest {
+    let mut hasher = Sha256::new();
+    hasher.update(data);
+    Digest {
+        hash: format!("{:x}", hasher.finalize()),
+        size_bytes: data.len() as i64,
+    }
+}
+
+/// Serialize a prost Message and compute its digest.
+fn digest_of_message<M: Message>(msg: &M) -> (Vec<u8>, Digest) {
+    let data = msg.encode_to_vec();
+    let digest = sha256_digest_proto(&data);
+    (data, digest)
+}
+
+/// Write a JSON5 config with an execution service and 2 workers with distinct
+/// `worker_id` platform properties for deterministic action routing.
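+/// The scheduler below matches `worker_id` with the "exact" policy and each
+/// worker advertises a unique value ("w1"/"w2"), so an action's `worker_id`
+/// platform property pins it to one worker. `blobs_available_interval_ms: 200`
+/// keeps locality-map propagation fast enough for the short waits in the test.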
+fn write_config(temp_dir: &Path, ports: &Ports) -> PathBuf { + let d = temp_dir.to_string_lossy().replace('\\', "/"); + let config = format!( + r#"{{ + stores: [ + {{ name: "AC_STORE", memory: {{ eviction_policy: {{ max_bytes: 100000000 }} }} }}, + {{ name: "SERVER_CAS", memory: {{ eviction_policy: {{ max_bytes: 100000000 }} }} }}, + {{ + name: "W1_STORE", + fast_slow: {{ + fast: {{ filesystem: {{ + content_path: "{d}/w1/cas", + temp_path: "{d}/w1/tmp", + eviction_policy: {{ max_bytes: 100000000 }}, + }} }}, + slow: {{ grpc: {{ + instance_name: "main", + endpoints: [{{ address: "grpc://127.0.0.1:{public}" }}], + store_type: "cas", + }} }}, + slow_direction: "get", + }}, + }}, + {{ + name: "W2_STORE", + fast_slow: {{ + fast: {{ filesystem: {{ + content_path: "{d}/w2/cas", + temp_path: "{d}/w2/tmp", + eviction_policy: {{ max_bytes: 100000000 }}, + }} }}, + slow: {{ grpc: {{ + instance_name: "main", + endpoints: [{{ address: "grpc://127.0.0.1:{public}" }}], + store_type: "cas", + }} }}, + slow_direction: "get", + }}, + }}, + ], + schedulers: [ + {{ + name: "MAIN", + simple: {{ + supported_platform_properties: {{ + cpu_count: "minimum", + worker_id: "exact", + }}, + }}, + }}, + ], + workers: [ + {{ local: {{ + name: "worker-1", + worker_api_endpoint: {{ uri: "grpc://127.0.0.1:{wapi}" }}, + cas_fast_slow_store: "W1_STORE", + cas_server_port: {c1}, + blobs_available_interval_ms: 200, + work_directory: "{d}/w1/work", + upload_action_result: {{ + ac_store: "AC_STORE", + upload_ac_results_strategy: "success_only", + }}, + platform_properties: {{ + cpu_count: {{ values: ["1"] }}, + worker_id: {{ values: ["w1"] }}, + }}, + }} }}, + {{ local: {{ + name: "worker-2", + worker_api_endpoint: {{ uri: "grpc://127.0.0.1:{wapi}" }}, + cas_fast_slow_store: "W2_STORE", + cas_server_port: {c2}, + blobs_available_interval_ms: 200, + work_directory: "{d}/w2/work", + upload_action_result: {{ + ac_store: "AC_STORE", + upload_ac_results_strategy: "success_only", + }}, + platform_properties: {{ + cpu_count: {{ values: ["1"] }}, + worker_id: {{ values: ["w2"] }}, + }}, + }} }}, + ], + servers: [ + {{ + name: "public", + listener: {{ http: {{ socket_address: "127.0.0.1:{public}" }} }}, + services: {{ + cas: [{{ instance_name: "main", cas_store: "SERVER_CAS" }}], + ac: [{{ instance_name: "main", ac_store: "AC_STORE" }}], + bytestream: [{{ instance_name: "main", cas_store: "SERVER_CAS" }}], + capabilities: [{{ instance_name: "main", remote_execution: {{ scheduler: "MAIN" }} }}], + execution: [{{ instance_name: "main", cas_store: "SERVER_CAS", scheduler: "MAIN" }}], + }}, + }}, + {{ + name: "worker_api", + listener: {{ http: {{ socket_address: "127.0.0.1:{wapi}" }} }}, + services: {{ + worker_api: {{ scheduler: "MAIN" }}, + }}, + }}, + ], +}}"#, + d = d, + wapi = ports.worker_api, + c1 = ports.cas[0], + c2 = ports.cas[1], + public = ports.public, + ); + let config_path = temp_dir.join("config.json5"); + std::fs::write(&config_path, config).unwrap(); + config_path +} + +struct NativeLinkProcess { + child: Child, + log_lines: Arc>>, + child_alive: Arc, +} + +impl NativeLinkProcess { + fn spawn(config_path: &Path) -> Self { + let binary = env!("CARGO_BIN_EXE_nativelink"); + + let mut child = ProcessCommand::new(binary) + .arg(config_path.to_str().unwrap()) + .env( + "RUST_LOG", + "nativelink=trace,nativelink_worker=trace,nativelink_service=trace,nativelink_store=trace", + ) + .env("NO_COLOR", "1") + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .spawn() + .expect("Failed to spawn nativelink binary"); + + let log_lines: 
Arc<Mutex<Vec<String>>> = Arc::new(Mutex::new(Vec::new()));
+        let child_alive = Arc::new(AtomicBool::new(true));
+
+        let stderr = child.stderr.take().unwrap();
+        let log_lines_stderr = log_lines.clone();
+        let child_alive_stderr = child_alive.clone();
+        std::thread::spawn(move || {
+            for line in BufReader::new(stderr).lines() {
+                match line {
+                    Ok(line) => log_lines_stderr.lock().unwrap().push(line),
+                    Err(_) => break,
+                }
+            }
+            child_alive_stderr.store(false, Ordering::Relaxed);
+        });
+
+        let stdout = child.stdout.take().unwrap();
+        let log_lines_stdout = log_lines.clone();
+        std::thread::spawn(move || {
+            for line in BufReader::new(stdout).lines() {
+                match line {
+                    Ok(line) => log_lines_stdout.lock().unwrap().push(line),
+                    Err(_) => break,
+                }
+            }
+        });
+
+        Self {
+            child,
+            log_lines,
+            child_alive,
+        }
+    }
+
+    async fn wait_for_log_count(&self, pattern: &str, count: usize, timeout: Duration) -> bool {
+        let deadline = tokio::time::Instant::now() + timeout;
+        loop {
+            {
+                let lines = self.log_lines.lock().unwrap();
+                if lines.iter().filter(|l| l.contains(pattern)).count() >= count {
+                    return true;
+                }
+            }
+            if tokio::time::Instant::now() > deadline {
+                return false;
+            }
+            if !self.child_alive.load(Ordering::Relaxed) {
+                tokio::time::sleep(Duration::from_millis(200)).await;
+                let lines = self.log_lines.lock().unwrap();
+                let found = lines.iter().filter(|l| l.contains(pattern)).count();
+                if found < count {
+                    error!(
+                        "!!! Child exited waiting for pattern={pattern:?} count={count} (found {found}). Last 40 lines:",
+                    );
+                    for line in lines.iter().rev().take(40).rev() {
+                        error!("  {line}");
+                    }
+                }
+                return found >= count;
+            }
+            tokio::time::sleep(Duration::from_millis(100)).await;
+        }
+    }
+
+    fn count_logs(&self, pattern: &str) -> usize {
+        self.log_lines
+            .lock()
+            .unwrap()
+            .iter()
+            .filter(|l| l.contains(pattern))
+            .count()
+    }
+
+    fn grep_logs(&self, pattern: &str) -> Vec<String> {
+        self.log_lines
+            .lock()
+            .unwrap()
+            .iter()
+            .filter(|l| l.contains(pattern))
+            .cloned()
+            .collect()
+    }
+
+    /// Print all logs for debugging.
+    fn dump_logs(&self, label: &str) {
+        let lines = self.log_lines.lock().unwrap();
+        error!("=== {label} ({} lines) ===", lines.len());
+        for line in lines.iter() {
+            error!("  {line}");
+        }
+        error!("=== end {label} ===");
+    }
+}
+
+impl Drop for NativeLinkProcess {
+    fn drop(&mut self) {
+        let _ = self.child.kill();
+        let _ = self.child.wait();
+    }
+}
+
+/// Upload multiple blobs to the server's CAS via BatchUpdateBlobs.
+async fn upload_blobs_to_cas(
+    channel: &Channel,
+    blobs: &[(Vec<u8>, Digest)],
+) -> Result<(), Box<dyn std::error::Error>> {
+    let mut client = ContentAddressableStorageClient::new(channel.clone());
+    let requests: Vec<_> = blobs
+        .iter()
+        .map(|(data, digest)| batch_update_blobs_request::Request {
+            digest: Some(digest.clone()),
+            data: data.clone().into(),
+            compressor: 0,
+        })
+        .collect();
+    client
+        .batch_update_blobs(BatchUpdateBlobsRequest {
+            instance_name: "main".to_string(),
+            requests,
+            digest_function: digest_function::Value::Sha256.into(),
+        })
+        .await?;
+    Ok(())
+}
+
+/// Execute an action and wait for it to complete, returning the ExecuteResponse.
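+/// The Execute RPC streams google.longrunning.Operation updates; the terminal
+/// message has `done = true` and packs an ExecuteResponse into its `response`
+/// Any payload, which is decoded below with prost.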
+async fn execute_and_wait(
+    channel: &Channel,
+    action_digest: Digest,
+) -> Result<ExecuteResponse, Box<dyn std::error::Error>> {
+    let mut client = ExecutionClient::new(channel.clone());
+    let request = ExecuteRequest {
+        instance_name: "main".to_string(),
+        action_digest: Some(action_digest),
+        skip_cache_lookup: true,
+        digest_function: digest_function::Value::Sha256.into(),
+        execution_policy: None,
+        results_cache_policy: None,
+    };
+
+    let response = client.execute(request).await?;
+    let mut stream = response.into_inner();
+
+    let mut last_response: Option<ExecuteResponse> = None;
+    while let Some(op) = stream.message().await? {
+        if op.done {
+            if let Some(operation::Result::Response(any)) = op.result {
+                let exec_response = ExecuteResponse::decode(any.value.as_ref())?;
+                last_response = Some(exec_response);
+            }
+            break;
+        }
+    }
+
+    last_response.ok_or_else(|| "Execute stream ended without done=true".into())
+}
+
+/// Build a Platform proto targeting a specific worker.
+fn make_platform(worker_id: &str) -> Platform {
+    Platform {
+        properties: vec![
+            platform::Property {
+                name: "cpu_count".to_string(),
+                value: "1".to_string(),
+            },
+            platform::Property {
+                name: "worker_id".to_string(),
+                value: worker_id.to_string(),
+            },
+        ],
+    }
+}
+
+/// Build and upload an action targeted at a specific worker.
+async fn create_action(
+    channel: &Channel,
+    arguments: Vec<String>,
+    output_files: Vec<String>,
+    input_root: &Directory,
+    target_worker: &str,
+) -> Result<Digest, Box<dyn std::error::Error>> {
+    let command = Command {
+        arguments,
+        output_files,
+        ..Default::default()
+    };
+    let (cmd_data, cmd_digest) = digest_of_message(&command);
+
+    let (root_data, root_digest) = digest_of_message(input_root);
+
+    let action = Action {
+        command_digest: Some(cmd_digest.clone()),
+        input_root_digest: Some(root_digest.clone()),
+        do_not_cache: true,
+        platform: Some(make_platform(target_worker)),
+        ..Default::default()
+    };
+    let (action_data, action_digest) = digest_of_message(&action);
+
+    upload_blobs_to_cas(
+        channel,
+        &[
+            (cmd_data, cmd_digest),
+            (root_data, root_digest),
+            (action_data, action_digest.clone()),
+        ],
+    )
+    .await?;
+
+    Ok(action_digest)
+}
+
+// ---------------------------------------------------------------------------
+// Test
+// ---------------------------------------------------------------------------
+
+/// Execute a chain of 3 dependent actions on alternating workers, exercising
+/// peer-to-peer blob sharing in both directions.
+///
+/// Action A → worker-1: `echo -n "HELLO_FROM_ACTION_A" > output.txt`
+/// Action B → worker-2: `cat input.txt > output.txt && echo -n "_PLUS_B" >> output.txt`
+///   (input = A's output, fetched from worker-1 via peer sharing)
+/// Action C → worker-1: `echo -n "_PLUS_C" > output.txt && cat input.txt >> output.txt`
+///   (input = B's output, fetched from worker-2 via peer sharing)
+#[tokio::test(flavor = "multi_thread")]
+async fn test_execute_dependent_actions_with_peer_sharing() {
+    let temp_dir = TempDir::new().expect("Failed to create temp dir");
+    let ports = allocate_ports();
+    let config_path = write_config(temp_dir.path(), &ports);
+
+    let process = NativeLinkProcess::spawn(&config_path);
+
+    // Wait for server listeners.
+    assert!(
+        process
+            .wait_for_log_count("Ready, listening on", 2, Duration::from_secs(30))
+            .await,
+        "Server did not start. Last 20 lines:\n{}",
+        {
+            // grep_logs returns an owned snapshot, so no lock is held here.
+            let lines = process.grep_logs("");
+            lines.iter().rev().take(20).rev().cloned().collect::<Vec<_>>().join("\n")
+        },
+    );
+
+    // Wait for both workers to register.
+ assert!( + process + .wait_for_log_count("Worker registered with scheduler", 2, Duration::from_secs(15)) + .await, + "Not all workers registered. Found {}.", + process.count_logs("Worker registered with scheduler"), + ); + + // Wait for initial BlobsAvailable snapshots. + assert!( + process + .wait_for_log_count("Sent periodic BlobsAvailable", 2, Duration::from_secs(5)) + .await, + "Workers did not send initial BlobsAvailable.", + ); + + let channel = Channel::from_shared(format!("http://127.0.0.1:{}", ports.public)) + .unwrap() + .connect_timeout(Duration::from_secs(5)) + .timeout(Duration::from_secs(60)) + .connect() + .await + .expect("Failed to connect to server"); + + // ===================================================================== + // ACTION A → worker-1: Produce a known output blob + // ===================================================================== + let action_a_digest = create_action( + &channel, + vec![ + "/bin/sh".to_string(), + "-c".to_string(), + "echo -n 'HELLO_FROM_ACTION_A' > output.txt".to_string(), + ], + vec!["output.txt".to_string()], + &Directory::default(), + "w1", + ) + .await + .expect("Failed to create Action A"); + + let before_register = process.count_logs("Registering blobs available from worker"); + + let response_a = execute_and_wait(&channel, action_a_digest) + .await + .expect("Action A execution failed"); + + let result_a = response_a + .result + .as_ref() + .expect("Action A missing ActionResult"); + assert_eq!( + result_a.exit_code, 0, + "Action A exit_code={}", + result_a.exit_code, + ); + assert_eq!(result_a.output_files.len(), 1, "Action A output count"); + + let output_a_digest = result_a.output_files[0] + .digest + .as_ref() + .expect("Action A output missing digest"); + let expected_a = b"HELLO_FROM_ACTION_A"; + let expected_a_digest = sha256_digest_proto(expected_a); + assert_eq!( + output_a_digest.hash, expected_a_digest.hash, + "Action A output digest mismatch", + ); + + // Wait for BlobsAvailable to propagate A's outputs to the locality map. + assert!( + process + .wait_for_log_count( + "Registering blobs available from worker", + before_register + 1, + Duration::from_secs(5), + ) + .await, + "BlobsAvailable not registered after Action A.", + ); + + // ===================================================================== + // ACTION B → worker-2: Depends on A's output (peer sharing: w1 → w2) + // ===================================================================== + // Worker-2 does not have A's output locally. 
The fetch chain: + // Worker-2 FastStore (miss) → GrpcStore → server CAS → + // WorkerProxyStore → locality map (w1 has it) → proxy from w1's CAS + let input_root_b = Directory { + files: vec![FileNode { + name: "input.txt".to_string(), + digest: Some(output_a_digest.clone()), + is_executable: false, + node_properties: None, + }], + ..Default::default() + }; + + let action_b_digest = create_action( + &channel, + vec![ + "/bin/sh".to_string(), + "-c".to_string(), + "cat input.txt > output.txt && echo -n '_PLUS_B' >> output.txt".to_string(), + ], + vec!["output.txt".to_string()], + &input_root_b, + "w2", + ) + .await + .expect("Failed to create Action B"); + + let proxy_before_b = process.count_logs("WorkerProxyStore: successfully read blob from redirected peer"); + + let before_register = process.count_logs("Registering blobs available from worker"); + + let response_b = execute_and_wait(&channel, action_b_digest) + .await + .expect("Action B execution failed"); + + let result_b = response_b + .result + .as_ref() + .expect("Action B missing ActionResult"); + assert_eq!( + result_b.exit_code, 0, + "Action B exit_code={}\nAll logs:\n{}", + result_b.exit_code, + process.grep_logs("").join("\n"), + ); + assert_eq!(result_b.output_files.len(), 1, "Action B output count"); + + let output_b_digest = result_b.output_files[0] + .digest + .as_ref() + .expect("Action B output missing digest"); + let expected_b = b"HELLO_FROM_ACTION_A_PLUS_B"; + let expected_b_digest = sha256_digest_proto(expected_b); + assert_eq!( + output_b_digest.hash, expected_b_digest.hash, + "Action B output digest mismatch. Expected {:?}, got hash {}", + String::from_utf8_lossy(expected_b), + output_b_digest.hash, + ); + + // Verify peer sharing: worker-2 received a redirect from the server's + // WorkerProxyStore and fetched A's output directly from worker-1's CAS. + let proxy_after_b = process.count_logs("WorkerProxyStore: successfully read blob from redirected peer"); + if proxy_after_b <= proxy_before_b { + process.dump_logs("Action B peer sharing failure"); + } + assert!( + proxy_after_b > proxy_before_b, + "Expected peer redirect from worker-1 for Action A's output. \ + Redirect count before={proxy_before_b} after={proxy_after_b}.", + ); + + // Wait for BlobsAvailable after Action B. + assert!( + process + .wait_for_log_count( + "Registering blobs available from worker", + before_register + 1, + Duration::from_secs(5), + ) + .await, + "BlobsAvailable not registered after Action B.", + ); + + // ===================================================================== + // ACTION C → worker-1: Depends on B's output (peer sharing: w2 → w1) + // ===================================================================== + // B's output is only on worker-2. Worker-1 must peer-fetch it. + // This verifies bi-directional peer sharing. 
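+    // The reverse chain mirrors Action B's (assumed symmetric): Worker-1
+    // fast store (miss) → GrpcStore → server CAS → WorkerProxyStore →
+    // locality map (w2 has it) → redirect → Worker-1 reads from Worker-2's CAS.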
+ let input_root_c = Directory { + files: vec![FileNode { + name: "input.txt".to_string(), + digest: Some(output_b_digest.clone()), + is_executable: false, + node_properties: None, + }], + ..Default::default() + }; + + let action_c_digest = create_action( + &channel, + vec![ + "/bin/sh".to_string(), + "-c".to_string(), + "echo -n '_PLUS_C' > output.txt && cat input.txt >> output.txt".to_string(), + ], + vec!["output.txt".to_string()], + &input_root_c, + "w1", + ) + .await + .expect("Failed to create Action C"); + + let proxy_before_c = process.count_logs("WorkerProxyStore: successfully read blob from redirected peer"); + + let response_c = execute_and_wait(&channel, action_c_digest) + .await + .expect("Action C execution failed"); + + let result_c = response_c + .result + .as_ref() + .expect("Action C missing ActionResult"); + assert_eq!( + result_c.exit_code, 0, + "Action C exit_code={}", + result_c.exit_code, + ); + assert_eq!(result_c.output_files.len(), 1, "Action C output count"); + + let output_c_digest = result_c.output_files[0] + .digest + .as_ref() + .expect("Action C output missing digest"); + let expected_c = b"_PLUS_CHELLO_FROM_ACTION_A_PLUS_B"; + let expected_c_digest = sha256_digest_proto(expected_c); + assert_eq!( + output_c_digest.hash, expected_c_digest.hash, + "Action C output digest mismatch. Expected {:?}, got hash {}", + String::from_utf8_lossy(expected_c), + output_c_digest.hash, + ); + + // Verify peer redirect for Action C (w2 → w1 direction). + let proxy_after_c = process.count_logs("WorkerProxyStore: successfully read blob from redirected peer"); + assert!( + proxy_after_c > proxy_before_c, + "Expected peer redirect from worker-2 for Action B's output. \ + Redirect count before={proxy_before_c} after={proxy_after_c}. \ + WorkerProxyStore logs:\n{}", + process.grep_logs("WorkerProxyStore").join("\n"), + ); + + // ===================================================================== + // Summary assertions + // ===================================================================== + + // At least 2 proxy operations (one per cross-worker fetch). + let total_proxies = process.count_logs("WorkerProxyStore: successfully read blob from redirected peer"); + assert!( + total_proxies >= 2, + "Expected at least 2 peer redirect reads (A→w2, B→w1), got {total_proxies}", + ); + + // BlobsAvailable should have been registered multiple times. + let total_registrations = process.count_logs("Registering blobs available from worker"); + assert!( + total_registrations >= 4, + "Expected at least 4 BlobsAvailable registrations, got {total_registrations}", + ); + + // Process is killed on drop. +} diff --git a/toolchain-examples/nativelink-config.json5 b/toolchain-examples/nativelink-config.json5 index 7e40a65e4..8e66c47e0 100644 --- a/toolchain-examples/nativelink-config.json5 +++ b/toolchain-examples/nativelink-config.json5 @@ -47,6 +47,8 @@ OSFamily: "priority", "container-image": "priority", }, + // Enable locality-aware scheduling. + cas_store: "WORKER_FAST_SLOW_STORE", }, }, ], @@ -57,6 +59,8 @@ uri: "grpc://127.0.0.1:50061", }, cas_fast_slow_store: "WORKER_FAST_SLOW_STORE", + // Expose a CAS server for peer-to-peer blob sharing. + cas_server_port: 50081, upload_action_result: { ac_store: "AC_MAIN_STORE", },