Retries for fetching snapshots (microsoft#7317)

cjen1-msft · achamayou · cjen1-msft · commit 528765e964e4 · 2025-09-30T10:32:18.000+01:00
Co-authored-by: Amaury Chamayou <amaury@xargs.fr>
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 ### Added
 
 - Improved logging of snapshot digests (#7300)
+- Joining nodes will now retry when fetching a snapshot from their target node. This is controlled with `command.join.fetch_snapshot_max_attempts` (default `3`) and `command.join.fetch_snapshot_retry_interval` (default `1000ms`). (#7317)
 
 [6.0.14]: https://github.com/microsoft/CCF/releases/tag/ccf-6.0.14
 
diff --git a/doc/host_config_schema/cchost_config.json b/doc/host_config_schema/cchost_config.json
@@ -406,6 +406,17 @@
                     "type": "boolean",
                     "default": true,
                     "description": "Whether to ask the target for a newer snapshot before joining. The node will ask the target what their latest snapshot is, and if that is later than what the node has locally, will fetch it via RPC before launching. Should generally only be turned off for specific test cases"
+                  },
+                  "fetch_snapshot_max_attempts": {
+                    "type": "integer",
+                    "default": 3,
+                    "description": "Maximum number of attempts to fetch a recent snapshot from the target node",
+                    "minimum": 1
+                  },
+                  "fetch_snapshot_retry_interval": {
+                    "type": "string",
+                    "default": "1000ms",
+                    "description": "Interval (time string) between retries to fetch a recent snapshot from the target node"
                   }
                 },
                 "required": ["target_rpc_address"],
diff --git a/src/host/configuration.h b/src/host/configuration.h
@@ -150,6 +150,8 @@ namespace host
         ccf::ds::TimeString retry_timeout = {"1000ms"};
         bool follow_redirect = true;
         bool fetch_recent_snapshot = true;
+        size_t fetch_snapshot_max_attempts = 3;
+        ccf::ds::TimeString fetch_snapshot_retry_interval = {"1000ms"};
 
         bool operator==(const Join&) const = default;
       };
@@ -212,7 +214,9 @@ namespace host
     CCHostConfig::Command::Join,
     retry_timeout,
     follow_redirect,
-    fetch_recent_snapshot);
+    fetch_recent_snapshot,
+    fetch_snapshot_max_attempts,
+    fetch_snapshot_retry_interval);
 
   DECLARE_JSON_TYPE_WITH_OPTIONAL_FIELDS(CCHostConfig::Command::Recover);
   DECLARE_JSON_REQUIRED_FIELDS(CCHostConfig::Command::Recover);
diff --git a/src/host/main.cpp b/src/host/main.cpp
@@ -878,7 +878,9 @@ int main(int argc, char** argv) // NOLINT(bugprone-exception-escape)
         auto latest_peer_snapshot = snapshots::fetch_from_peer(
           config.command.join.target_rpc_address,
           config.command.service_certificate_file,
-          latest_local_idx);
+            latest_local_idx,
+            config.command.join.fetch_snapshot_max_attempts,
+            config.command.join.fetch_snapshot_retry_interval.count_ms());
 
         if (latest_peer_snapshot.has_value())
         {
diff --git a/src/snapshots/fetch.h b/src/snapshots/fetch.h
@@ -40,7 +40,7 @@ namespace snapshots
     std::vector<uint8_t> snapshot_data;
   };
 
-  static std::optional<SnapshotResponse> fetch_from_peer(
+  static std::optional<SnapshotResponse> try_fetch_from_peer(
     const std::string& peer_address,
     const std::string& path_to_peer_cert,
     size_t latest_local_snapshot)
@@ -289,4 +289,46 @@ namespace snapshots
       return std::nullopt;
     }
   }
+
+  // Fetch a snapshot from a peer, retrying when an attempt returns nothing.
+  //
+  // Calls try_fetch_from_peer up to max_attempts times, waiting retry_delay_ms
+  // milliseconds before every attempt after the first, and returns the first
+  // response that has a value. Returns std::nullopt once all attempts have
+  // come back empty (including when max_attempts is 0, in which case no
+  // attempt is made). peer_address, path_to_peer_cert and
+  // latest_local_snapshot are forwarded unchanged to each attempt.
+  static std::optional<SnapshotResponse> fetch_from_peer(
+    const std::string& peer_address,
+    const std::string& path_to_peer_cert,
+    size_t latest_local_snapshot,
+    size_t max_attempts,
+    size_t retry_delay_ms)
+  {
+    for (size_t attempt = 0; attempt < max_attempts; ++attempt)
+    {
+      if (attempt > 0)
+      {
+        // Back off before logging, so that the log line below is emitted when
+        // the attempt actually starts rather than one retry interval earlier
+        std::this_thread::sleep_for(std::chrono::milliseconds(retry_delay_ms));
+      }
+
+      LOG_INFO_FMT(
+        "Fetching snapshot from {} (attempt {}/{})",
+        peer_address,
+        attempt + 1,
+        max_attempts);
+
+      auto response = try_fetch_from_peer(
+        peer_address, path_to_peer_cert, latest_local_snapshot);
+      if (response.has_value())
+      {
+        return response;
+      }
+    }
+    LOG_INFO_FMT(
+      "Exceeded maximum snapshot fetch retries ({}), giving up", max_attempts);
+    return std::nullopt;
+  }
 }
diff --git a/tests/e2e_operations.py b/tests/e2e_operations.py
@@ -31,6 +31,8 @@
 import sys
 import pathlib
 import infra.concurrency
+import ccf.read_ledger
+import re
 
 from loguru import logger as LOG
 
@@ -1553,6 +1555,79 @@ def test_error_message_on_failure_to_read_aci_sec_context(args):
         ), f"Did not find expected log messages: {expected_log_messages}"
 
 
+def test_error_message_on_failure_to_fetch_snapshot(const_args):
+    """Check that a joiner retries fetching a snapshot from a stopped node.
+
+    The target node is stopped before the join, so the join must fail and the
+    joiner's log must show all three fetch attempts followed by the message
+    reporting that it gave up.
+    """
+    args = copy.deepcopy(const_args)
+    args.nodes = infra.e2e_args.min_nodes(args, 0)
+    with infra.network.network(
+        args.nodes,
+        args.binary_dir,
+        args.debug_nodes,
+        pdb=args.pdb,
+    ) as network:
+        network.start_and_open(args)
+
+        primary, _ = network.find_primary()
+
+        new_node = network.create_node("local://localhost")
+
+        # Shut down primary to cause snapshot fetch to fail
+        primary.remote.stop()
+
+        failed = False
+        try:
+            LOG.info("Starting join")
+            network.join_node(
+                new_node,
+                args.package,
+                args,
+                target_node=primary,
+                timeout=10,
+                from_snapshot=False,
+                wait_for_node_in_store=False,
+            )
+            new_node.wait_for_node_to_join(timeout=5)
+        except Exception as e:
+            LOG.info(f"Joining node could not join as expected {e}")
+            failed = True
+
+        assert failed, "Joining node unexpectedly joined via a stopped target node"
+
+        expected_log_messages = [
+            re.compile(r"Fetching snapshot from .* \(attempt 1/3\)"),
+            re.compile(r"Fetching snapshot from .* \(attempt 2/3\)"),
+            re.compile(r"Fetching snapshot from .* \(attempt 3/3\)"),
+            re.compile(
+                r"Exceeded maximum snapshot fetch retries \([0-9]+\), giving up"
+            ),
+        ]
+
+        out_path, _ = new_node.get_logs()
+        with open(out_path, "r", encoding="utf-8") as log_file:
+            for line in log_file:
+                # Rebuild the list instead of removing entries from it while
+                # iterating over it, which would skip the following pattern
+                still_missing = [
+                    expected
+                    for expected in expected_log_messages
+                    if not expected.search(line)
+                ]
+                if len(still_missing) < len(expected_log_messages):
+                    LOG.info(f"Found expected log message: {line}")
+                expected_log_messages = still_missing
+                if not expected_log_messages:
+                    break
+
+        assert (
+            not expected_log_messages
+        ), f"Did not find expected log messages: {expected_log_messages}"
+
+
 def run(args):
     run_max_uncommitted_tx_count(args)
     run_file_operations(args)
diff --git a/tests/infra/network.py b/tests/infra/network.py
@@ -354,10 +354,11 @@ def _setup_node(
         **kwargs,
     ):
         # Contact primary if no target node is set
-        primary, _ = self.find_primary(
-            timeout=args.ledger_recovery_timeout if recovery else 10
-        )
-        target_node = target_node or primary
+        if target_node is None:
+            primary, _ = self.find_primary(
+                timeout=args.ledger_recovery_timeout if recovery else 10
+            )
+            target_node = primary
         LOG.info(f"Joining from target node {target_node.local_node_id}")
 
         committed_ledger_dirs = read_only_ledger_dirs or []
@@ -369,6 +370,9 @@ def _setup_node(
         if from_snapshot:
             # Only retrieve snapshot from primary if the snapshot directory is not specified
             if snapshots_dir is None:
+                primary, _ = self.find_primary(
+                    timeout=args.ledger_recovery_timeout if recovery else 10
+                )
                 read_only_snapshots_dir = self.get_committed_snapshots(primary)
             if os.listdir(snapshots_dir) or os.listdir(read_only_snapshots_dir):
                 LOG.info(
@@ -1040,10 +1044,11 @@ def join_node(
         target_node=None,
         timeout=JOIN_TIMEOUT,
         stop_on_error=False,
+        wait_for_node_in_store=True,
         **kwargs,
     ):
         self.setup_join_node(node, lib_name, args, target_node, **kwargs)
-        self.run_join_node(node, timeout, stop_on_error)
+        self.run_join_node(node, timeout, stop_on_error, wait_for_node_in_store)
 
     def trust_node(
         self,