Skip to content

Commit adaa2c8

Browse files
committed
Enable API server's watch termination grace period by default
By default, the API server doesn't terminate long-running watches during shutdown: Active watch traffic is subject to regular HTTP server shutdown behavior and will delay the shutdown until the overall HTTP request timeout is reached. Previously, k0s relied on a fixed supervisor stop timeout of five seconds, which did not account for this behavior under realistic load. Enable the API server's shutdown watch termination grace period by default so active watch streams are drained during shutdown. Since watch traffic accounts for most long-running API activity in normal clusters, this allows for generally faster API server shutdowns, aligning with k0s's requirement to promptly respond to shutdown requests from the init system. Derive the supervisor stop timeout from the API server flags, then clamp it to the range of 5 to 20 seconds to stay within the typical init system stop time budget. Set the watch termination grace period to the stop timeout minus two seconds, if not explicitly specified by the user. Finally, add a k0s controller command line flag to explicitly set the API server's stop timeout and bypass its automatic calculation. Signed-off-by: Tom Wieczorek <twieczorek@mirantis.com>
1 parent eed9fee commit adaa2c8

5 files changed

Lines changed: 108 additions & 6 deletions

File tree

cmd/controller/controller.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -302,6 +302,7 @@ func (c *command) start(ctx context.Context, flags *config.ControllerOptions, de
302302
LogLevel: c.LogLevels.KubeAPIServer,
303303
Storage: storageBackend,
304304
EnableKonnectivity: enableKonnectivity,
305+
StopTimeout: flags.APIServerStopTimeout,
305306

306307
// If k0s reconciles the kubernetes endpoint, the API server shouldn't do it.
307308
DisableEndpointReconciler: enableK0sEndpointReconciler,

cmd/controller/controller_test.go

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ Examples:
5050
Note: Token can be passed either as a CLI argument, a flag, or an environment variable
5151
5252
Flags:
53+
--api-server-stop-timeout duration time to wait for the API server to stop
5354
-c, --config string config file, use '-' to read the config from stdin (default `+defaultConfigPath+`)
5455
--cri-socket string container runtime socket to use, default to internal containerd. Format: [remote|docker]:[path-to-socket]
5556
--data-dir string Data Directory for k0s. DO NOT CHANGE for an existing setup, things will break! (default `+defaultDataDir+`)
@@ -82,3 +83,21 @@ Flags:
8283
-v, --verbose Verbose logging (default true)
8384
`, out.String())
8485
}
86+
87+
func TestControllerCmd_Flags(t *testing.T) {
88+
if runtime.GOOS != "linux" {
89+
t.Skip("Running controllers is only supported on Linux")
90+
}
91+
92+
t.Run("api-server-stop-timeout", func(t *testing.T) {
93+
expected := `invalid argument "0s" for "--api-server-stop-timeout" flag: must be positive`
94+
var stdout, stderr strings.Builder
95+
underTest := cmd.NewRootCmd()
96+
underTest.SetArgs([]string{"controller", "--api-server-stop-timeout", "0s"})
97+
underTest.SetOut(&stdout)
98+
underTest.SetErr(&stderr)
99+
assert.ErrorContains(t, underTest.Execute(), expected)
100+
assert.Empty(t, stdout.String())
101+
assert.Equal(t, "Error: "+expected+"\n", stderr.String())
102+
})
103+
}

cmd/install/controller_test.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ With the controller subcommand you can setup a single node cluster by running:
4343
4444
4545
Flags:
46+
--api-server-stop-timeout duration time to wait for the API server to stop
4647
-c, --config string config file, use '-' to read the config from stdin (default `+defaultConfigPath+`)
4748
--cri-socket string container runtime socket to use, default to internal containerd. Format: [remote|docker]:[path-to-socket]
4849
--data-dir string Data Directory for k0s. DO NOT CHANGE for an existing setup, things will break! (default `+defaultDataDir+`)

pkg/component/controller/apiserver.go

Lines changed: 48 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ import (
1616
"path/filepath"
1717
"strconv"
1818
"strings"
19+
"time"
1920

2021
"github.com/sirupsen/logrus"
2122

@@ -38,6 +39,7 @@ type APIServer struct {
3839
Storage manager.Component
3940
EnableKonnectivity bool
4041
DisableEndpointReconciler bool
42+
StopTimeout time.Duration
4143

4244
supervisor *supervisor.Supervisor
4345
executablePath string
@@ -154,19 +156,59 @@ func (a *APIServer) Start(ctx context.Context) error {
154156
args["endpoint-reconciler-type"] = "none"
155157
}
156158

159+
stopTimeout := a.StopTimeout
160+
161+
// If the timeout hasn't been specified, do a
162+
// best guess based on the API server flags.
163+
if stopTimeout <= 0 {
164+
requestTimeout := 1 * time.Minute
165+
if value, ok := args["request-timeout"]; ok {
166+
if parsed, err := time.ParseDuration(value); err == nil {
167+
requestTimeout = parsed
168+
}
169+
}
170+
171+
watchTerminationGrace := 0 * time.Second
172+
if value, ok := args["shutdown-watch-termination-grace-period"]; ok {
173+
if parsed, err := time.ParseDuration(value); err == nil {
174+
watchTerminationGrace = parsed
175+
}
176+
}
177+
178+
stopTimeout = max(requestTimeout, watchTerminationGrace) + (2 * time.Second)
179+
180+
// Clamp the timeout between 5 and 20 seconds. We can't wait for too long
181+
// currently because the init system will likely kill the process otherwise.
182+
stopTimeout = max(5*time.Second, min(stopTimeout, 20*time.Second))
183+
}
184+
185+
// Enable the API server's watch-drain facility on shutdown, if that flag
186+
// hasn't been specified by the user. Without this flag, the API server will
187+
// almost always encounter the request timeout if anything is connected to
188+
// it via client-go watches. These have a timeout of between five and ten
189+
// minutes. Note that other types of long-running requests, such as log
190+
// streams, can still prevent a timely shutdown. However, there's not much
191+
// that can be done about them apart from setting a short request timeout.
192+
if _, ok := args["shutdown-watch-termination-grace-period"]; !ok {
193+
if gracePeriod := stopTimeout - 2*time.Second; gracePeriod > 0 {
194+
args["shutdown-watch-termination-grace-period"] = gracePeriod.String()
195+
}
196+
}
197+
157198
var apiServerArgs []string
158199
for name, value := range args {
159200
apiServerArgs = append(apiServerArgs, fmt.Sprintf("--%s=%s", name, value))
160201
}
161202
apiServerArgs = append(apiServerArgs, a.ClusterConfig.Spec.API.RawArgs...)
162203

163204
a.supervisor = &supervisor.Supervisor{
164-
Name: kubeAPIComponentName,
165-
BinPath: a.executablePath,
166-
RunDir: a.K0sVars.RunDir,
167-
DataDir: a.K0sVars.DataDir,
168-
Args: apiServerArgs,
169-
UID: a.uid,
205+
Name: kubeAPIComponentName,
206+
BinPath: a.executablePath,
207+
RunDir: a.K0sVars.RunDir,
208+
DataDir: a.K0sVars.DataDir,
209+
Args: apiServerArgs,
210+
UID: a.uid,
211+
TimeoutStop: stopTimeout,
170212
}
171213

172214
etcdArgs, err := getEtcdArgs(a.ClusterConfig.Spec.Storage, a.K0sVars)

pkg/config/cli.go

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
package config
55

66
import (
7+
"errors"
78
"fmt"
89
"runtime"
910
"slices"
@@ -57,6 +58,7 @@ type ControllerOptions struct {
5758
EnableMetricsScraper bool
5859
KubeControllerManagerExtraArgs string
5960
FeatureGates featuregate.FeatureGates
61+
APIServerStopTimeout time.Duration
6062

6163
enableWorker, singleNode bool
6264
}
@@ -302,6 +304,7 @@ func GetControllerFlags(controllerOpts *ControllerOptions) *pflag.FlagSet {
302304
flagset.StringVar(&controllerOpts.KubeControllerManagerExtraArgs, "kube-controller-manager-extra-args", "", "extra args for kube-controller-manager")
303305
flagset.BoolVar(&controllerOpts.InitOnly, "init-only", false, "only initialize controller and exit")
304306
flagset.Var(&controllerOpts.FeatureGates, "feature-gates", "feature gates to enable (comma separated list of key=value pairs)")
307+
flagset.Var((*positiveDurationFlag)(&controllerOpts.APIServerStopTimeout), "api-server-stop-timeout", "time to wait for the API server to stop")
305308
return flagset
306309
}
307310

@@ -334,3 +337,39 @@ func GetCmdOpts(cobraCmd command) (*CLIOptions, error) {
334337
K0sVars: k0sVars,
335338
}, nil
336339
}
340+
341+
// positiveDurationFlag is a command line flag value backed by a
// [time.Duration]. Set only accepts strictly positive durations or the
// empty string, and String renders non-positive values as the empty string.
type positiveDurationFlag time.Duration

// Type implements [pflag.Value].
func (f *positiveDurationFlag) Type() string { return "duration" }

// String implements [pflag.Value].
func (f *positiveDurationFlag) String() string {
	if d := time.Duration(*f); d > 0 {
		return d.String()
	}

	// Zero and negative values have no textual representation.
	return ""
}

// Set implements [pflag.Value].
func (f *positiveDurationFlag) Set(value string) error {
	// The empty string resets the flag to its zero value.
	var d time.Duration

	if value != "" {
		parsed, err := time.ParseDuration(value)
		switch {
		case err != nil:
			return err
		case parsed <= 0:
			return errors.New("must be positive")
		}
		d = parsed
	}

	// Only reached for valid input, so a rejected value never
	// overwrites the previously stored duration.
	*f = positiveDurationFlag(d)
	return nil
}

0 commit comments

Comments
 (0)