# Review notes (commentary only; everything after this comment block is the
# original patch text, unchanged).
#
# Content: a two-message `git format-patch` series ([PATCH 1/2], [PATCH 2/2])
# against collection-scripts/gather_core_dumps. The patch's original line
# breaks appear to have been collapsed into single spaces.
# NOTE(review): the hunks presumably cannot be applied in this form; restore
# the newlines before feeding this to `git am` / `patch` - confirm.
#
# PATCH 1/2 (get_dump_off_node, gather_core_dump_data):
#   - Drops the separate `oc debug ... -o jsonpath='{.metadata.name}'` lookup
#     and the fixed `sleep 2`.
#   - Starts `oc debug ... -- /bin/bash -c 'sleep 300'` in the background with
#     stdout and stderr redirected into a mktemp file, sleeps 0.5, then polls
#     that file (at most max_attempts=10 times) and takes the first line that
#     matches "pod/<name>", extracted with sed.
#   - The delay between polls is base_delay * 2^attempt capped at max_delay
#     (0.1 and 2.0), computed with awk.
#   - Removes the temp file after the loop, runs
#     `oc wait --for=condition=Ready` (30s timeout) only when a pod name was
#     found, and prints a warning when `oc cp` fails.
#   - `PIDS+=($!)` moves from the end of the `oc cp` command to directly after
#     the backgrounded `get_dump_off_node "${NODE}" &` call in
#     gather_core_dump_data.
#
# PATCH 2/2 (get_dump_off_node):
#   - Adds the locals `node` (copy of $1) and `oc_debug_pid` (the $! of the
#     background `oc debug`), and declares `tmpfile` separately from its
#     mktemp assignment.
#   - Installs `trap 'rm -f "$tmpfile"' RETURN`.
#   - Returns early, after `kill` + `wait` on oc_debug_pid, when no pod name
#     was found; when `oc wait` fails it also deletes the pod with
#     `--wait=false` before returning.
#   - Final cleanup is `oc delete pod ... --wait=false` with output discarded,
#     followed by `kill` + `wait` on oc_debug_pid.
#
# NOTE(review):
#   - `rm -f "$tmpfile"` after the polling loop is an unchanged context line in
#     PATCH 2/2, so the file is removed both there and by the RETURN trap.
#   - The sed expression starts with a greedy `.*`, so a line containing more
#     than one "pod/" yields the last one; presumably `oc debug` prints the
#     name once per line - confirm against the oc version in use.
#   - The exit status of mktemp is not checked in either patch.
#   - The debug pod's command is `sleep 300`; presumably the copy has to
#     finish within that window - TODO confirm.
From 868ab33017e7199ec36ff2b47da8104547a21dee Mon Sep 17 00:00:00 2001 From: Shivprakash Muley Date: Tue, 10 Mar 2026 19:34:44 +0530 Subject: [PATCH 1/2] OCPBUGS-66983: Fix race condition in gather_core_dumps pod name retrieval --- collection-scripts/gather_core_dumps | 39 ++++++++++++++++++++++------ 1 file changed, 31 insertions(+), 8 deletions(-) diff --git a/collection-scripts/gather_core_dumps b/collection-scripts/gather_core_dumps index c25d6996..cd045f7f 100755 --- a/collection-scripts/gather_core_dumps +++ b/collection-scripts/gather_core_dumps @@ -7,22 +7,44 @@ mkdir -p "${CORE_DUMP_PATH}"/ function get_dump_off_node { local debugPod="" - #Get debug pod's name - debugPod=$(oc debug --to-namespace="default" node/"$1" -o jsonpath='{.metadata.name}') + #Start Debug pod in background and capture output to get pod name + local tmpfile=$(mktemp) + oc debug --to-namespace="default" node/"$1" -- /bin/bash -c 'sleep 300' > "$tmpfile" 2>&1 & - #Start Debug pod force it to stay up until removed in "default" namespace - oc debug --to-namespace="default" node/"$1" -- /bin/bash -c 'sleep 300' >/dev/null 2>&1 & + #Wait for the debug pod to be created and extract its name with exponential backoff + local max_attempts=10 # Fewer attempts needed with exponential backoff + local attempt=0 + local base_delay=0.1 # Starting delay in seconds + local max_delay=2.0 # Cap the maximum delay - #Mimic a normal oc call, i.e pause between two successive calls to allow pod to register - sleep 2 - oc wait -n "default" --for=condition=Ready pod/"$debugPod" --timeout=30s + # Initial delay to allow pod creation to start + sleep 0.5 + + while [ -z "$debugPod" ] && [ $attempt -lt $max_attempts ]; do + debugPod=$(sed -n 's/.*pod\/\([^ ]*\).*/\1/p' "$tmpfile" 2>/dev/null | head -1) + if [ -z "$debugPod" ]; then + # Calculate exponential backoff: base_delay * 2^attempt + local delay=$(awk -v base="$base_delay" -v exponent="$attempt" -v max="$max_delay" \ + 'BEGIN {d = base * (2 ^ exponent); 
print (d > max) ? max : d}') + sleep "$delay" + attempt=$((attempt + 1)) + fi + done + rm -f "$tmpfile" + + #Wait for pod to be ready + if [ -n "$debugPod" ]; then + oc wait -n "default" --for=condition=Ready pod/"$debugPod" --timeout=30s > /dev/null 2>&1 + fi if [ -z "$debugPod" ]; then echo "Debug pod for node ""$1"" never activated" else #Copy Core Dumps out of Nodes suppress Stdout echo "Copying core dumps on node ""$1""" - oc cp --loglevel 1 -n "default" "$debugPod":/host/var/lib/systemd/coredump "${CORE_DUMP_PATH}"/"$1"_core_dump >/dev/null 2>&1 && PIDS+=($!) + if ! oc cp --loglevel 1 -n "default" "$debugPod":/host/var/lib/systemd/coredump "${CORE_DUMP_PATH}"/"$1"_core_dump > /dev/null 2>&1; then + echo "Warning: Failed to copy core dumps from node $1" + fi #clean up debug pod after we are done using them oc delete pod "$debugPod" -n "default" @@ -33,6 +55,7 @@ function gather_core_dump_data { #Run coredump pull function on all nodes in parallel for NODE in ${NODES}; do get_dump_off_node "${NODE}" & + PIDS+=($!) 
done } From 8eaf47099c9d7d1248c5a041400d522cd4683f91 Mon Sep 17 00:00:00 2001 From: Shivprakash Muley Date: Fri, 20 Mar 2026 12:25:46 +0530 Subject: [PATCH 2/2] OCPBUGS-66983: Handled failures around wait and tmpfile --- collection-scripts/gather_core_dumps | 46 ++++++++++++++++++---------- 1 file changed, 29 insertions(+), 17 deletions(-) diff --git a/collection-scripts/gather_core_dumps b/collection-scripts/gather_core_dumps index cd045f7f..83849c26 100755 --- a/collection-scripts/gather_core_dumps +++ b/collection-scripts/gather_core_dumps @@ -5,11 +5,17 @@ CORE_DUMP_PATH=${OUT:-"${BASE_COLLECTION_PATH}/node_core_dumps"} mkdir -p "${CORE_DUMP_PATH}"/ function get_dump_off_node { + local node="$1" local debugPod="" + local oc_debug_pid="" + local tmpfile - #Start Debug pod in background and capture output to get pod name - local tmpfile=$(mktemp) - oc debug --to-namespace="default" node/"$1" -- /bin/bash -c 'sleep 300' > "$tmpfile" 2>&1 & + tmpfile=$(mktemp) + trap 'rm -f "$tmpfile"' RETURN + + # Start Debug pod in background and capture output to get pod name + oc debug --to-namespace="default" node/"$node" -- /bin/bash -c 'sleep 300' > "$tmpfile" 2>&1 & + oc_debug_pid=$! #Wait for the debug pod to be created and extract its name with exponential backoff local max_attempts=10 # Fewer attempts needed with exponential backoff @@ -32,27 +38,33 @@ function get_dump_off_node { done rm -f "$tmpfile" - #Wait for pod to be ready - if [ -n "$debugPod" ]; then - oc wait -n "default" --for=condition=Ready pod/"$debugPod" --timeout=30s > /dev/null 2>&1 + if [ -z "$debugPod" ]; then + kill "${oc_debug_pid}" 2>/dev/null + wait "${oc_debug_pid}" 2>/dev/null + echo "Debug pod for node $node never activated" + return fi - if [ -z "$debugPod" ]; then - echo "Debug pod for node ""$1"" never activated" - else - #Copy Core Dumps out of Nodes suppress Stdout - echo "Copying core dumps on node ""$1""" - if ! 
oc cp --loglevel 1 -n "default" "$debugPod":/host/var/lib/systemd/coredump "${CORE_DUMP_PATH}"/"$1"_core_dump > /dev/null 2>&1; then - echo "Warning: Failed to copy core dumps from node $1" - fi + if ! oc wait -n "default" --for=condition=Ready pod/"$debugPod" --timeout=30s > /dev/null 2>&1; then + echo "Warning: Debug pod $debugPod on node $node did not become Ready in time" + oc delete pod "$debugPod" -n "default" --wait=false > /dev/null 2>&1 + kill "${oc_debug_pid}" 2>/dev/null + wait "${oc_debug_pid}" 2>/dev/null + return + fi - #clean up debug pod after we are done using them - oc delete pod "$debugPod" -n "default" + echo "Copying core dumps on node $node" + if ! oc cp --loglevel 1 -n "default" "$debugPod":/host/var/lib/systemd/coredump "${CORE_DUMP_PATH}/${node}_core_dump" > /dev/null 2>&1; then + echo "Warning: Failed to copy core dumps from node $node" fi + + oc delete pod "$debugPod" -n "default" --wait=false > /dev/null 2>&1 + kill "${oc_debug_pid}" 2>/dev/null + wait "${oc_debug_pid}" 2>/dev/null } function gather_core_dump_data { - #Run coredump pull function on all nodes in parallel + # Run coredump pull function on all nodes in parallel for NODE in ${NODES}; do get_dump_off_node "${NODE}" & PIDS+=($!)