77 "encoding/json"
88 "errors"
99 "fmt"
10- "net"
1110 "net/http"
1211 "net/netip"
1312 "strings"
@@ -22,7 +21,6 @@ import (
2221 "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v7"
2322 "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/containerservice/armcontainerservice/v8"
2423 "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/network/armnetwork/v7"
25- "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/privatedns/armprivatedns"
2624 "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armresources/v3"
2725 "github.com/google/uuid"
2826 corev1 "k8s.io/api/core/v1"
@@ -44,7 +42,6 @@ type Cluster struct {
4442 SubnetID string
4543 ClusterParams * ClusterParams
4644 Bastion * Bastion
47- ProxyURL string
4845}
4946
5047// Returns true if the cluster is configured with Azure CNI
@@ -110,21 +107,7 @@ func prepareCluster(ctx context.Context, clusterModel *armcontainerservice.Manag
110107 needACR := isNetworkIsolated || attachPrivateAcr
111108 acrNonAnon := dag .Run2 (g , kube , identity , addACR (cluster , needACR , true ))
112109 acrAnon := dag .Run2 (g , kube , identity , addACR (cluster , needACR , false ))
113- debugDeps := append ([]dag.Dep {acrNonAnon , acrAnon }, networkDeps ... )
114- proxyURL := dag .Go1 (g , kube , func (ctx context.Context , k * Kubeclient ) (string , error ) {
115- if err := k .EnsureDebugDaemonsets (ctx , isNetworkIsolated , config .GetPrivateACRName (true , * cluster .Location )); err != nil {
116- return "" , err
117- }
118- if isNetworkIsolated {
119- return "" , nil
120- }
121- return k .GetProxyURL (ctx )
122- }, debugDeps ... )
123- if ! isNetworkIsolated {
124- dag .Run (g , func (ctx context.Context ) error {
125- return setupPrivateDNSForAPIServer (ctx , cluster )
126- })
127- }
110+ dag .Run1 (g , kube , ensureDebugDaemonsets (cluster , isNetworkIsolated ), append ([]dag.Dep {acrNonAnon , acrAnon }, networkDeps ... )... )
128111 extract := dag .Go1 (g , kube , extractClusterParams (cluster ))
129112
130113 if err := g .Wait (); err != nil {
@@ -137,7 +120,6 @@ func prepareCluster(ctx context.Context, clusterModel *armcontainerservice.Manag
137120 SubnetID : subnet .MustGet (),
138121 ClusterParams : extract .MustGet (),
139122 Bastion : bastion .MustGet (),
140- ProxyURL : proxyURL .MustGet (),
141123 }, nil
142124}
143125
@@ -150,6 +132,12 @@ func addACR(cluster *armcontainerservice.ManagedCluster, needACR, isNonAnonymous
150132 }
151133}
152134
135+ func ensureDebugDaemonsets (cluster * armcontainerservice.ManagedCluster , isNetworkIsolated bool ) func (context.Context , * Kubeclient ) error {
136+ return func (ctx context.Context , k * Kubeclient ) error {
137+ return k .EnsureDebugDaemonsets (ctx , isNetworkIsolated , config .GetPrivateACRName (true , * cluster .Location ))
138+ }
139+ }
140+
153141func extractClusterParams (cluster * armcontainerservice.ManagedCluster ) func (context.Context , * Kubeclient ) (* ClusterParams , error ) {
154142 return func (ctx context.Context , k * Kubeclient ) (* ClusterParams , error ) {
155143 return extractClusterParameters (ctx , cluster , k )
@@ -417,35 +405,25 @@ func createNewAKSClusterWithRetry(ctx context.Context, cluster *armcontainerserv
417405 return createdCluster , nil
418406 }
419407
420- if isRetryableClusterError (err ) {
408+ // Check if the error is a 409 Conflict
409+ var respErr * azcore.ResponseError
410+ if errors .As (err , & respErr ) && respErr .StatusCode == 409 {
421411 lastErr = err
422- toolkit .Logf (ctx , "Attempt %d failed with retryable error : %v. Retrying in %v..." , attempt + 1 , err , retryInterval )
412+ toolkit .Logf (ctx , "Attempt %d failed with 409 Conflict : %v. Retrying in %v..." , attempt + 1 , err , retryInterval )
423413
424414 select {
425415 case <- time .After (retryInterval ):
416+ // Continue to next iteration
426417 case <- ctx .Done ():
427418 return nil , fmt .Errorf ("context canceled while retrying cluster creation: %w" , ctx .Err ())
428419 }
429420 } else {
421+ // If it's not a 409 error, return immediately
430422 return nil , fmt .Errorf ("failed to create cluster: %w" , err )
431423 }
432424 }
433425
434- return nil , fmt .Errorf ("failed to create cluster after %d attempts: %w" , maxRetries , lastErr )
435- }
436-
437- // isRetryableClusterError returns true for transient cluster creation errors
438- // that can be resolved by retrying, such as 409 Conflict (concurrent operations)
439- // and NotFound during managed identity reconciliation (stale references after cluster deletion).
440- func isRetryableClusterError (err error ) bool {
441- var respErr * azcore.ResponseError
442- if ! errors .As (err , & respErr ) {
443- return false
444- }
445- if respErr .StatusCode == 409 {
446- return true
447- }
448- return respErr .ErrorCode == "NotFound" && strings .Contains (err .Error (), "Reconcile managed identity credential failed" )
426+ return nil , fmt .Errorf ("failed to create cluster after %d attempts due to persistent 409 Conflict: %w" , maxRetries , lastErr )
449427}
450428
451429func ensureMaintenanceConfiguration (ctx context.Context , cluster * armcontainerservice.ManagedCluster ) error {
@@ -827,70 +805,3 @@ func ensureResourceGroup(ctx context.Context, location string) (armresources.Res
827805 }
828806 return rg .ResourceGroup , nil
829807}
830-
831- // setupPrivateDNSForAPIServer creates a private DNS zone for the API server FQDN
832- // linked to the cluster VNet with an A record pointing to the current public IP.
833- // Simulates a customer environment with minimal private DNS entries.
834- func setupPrivateDNSForAPIServer (ctx context.Context , cluster * armcontainerservice.ManagedCluster ) error {
835- defer toolkit .LogStepCtx (ctx , "setting up private DNS for API server" )()
836-
837- fqdn := * cluster .Properties .Fqdn
838- nodeRG := * cluster .Properties .NodeResourceGroup
839-
840- ips , err := net .LookupHost (fqdn )
841- if err != nil {
842- return fmt .Errorf ("resolving API server FQDN %q: %w" , fqdn , err )
843- }
844-
845- var aRecords []* armprivatedns.ARecord
846- for _ , ip := range ips {
847- if parsed := net .ParseIP (ip ); parsed != nil && parsed .To4 () != nil {
848- aRecords = append (aRecords , & armprivatedns.ARecord {IPv4Address : to .Ptr (ip )})
849- }
850- }
851- if len (aRecords ) == 0 {
852- return fmt .Errorf ("no IPv4 addresses for %q" , fqdn )
853- }
854-
855- zoneName := fqdn
856- if err := wait .PollUntilContextTimeout (ctx , 5 * time .Second , 2 * time .Minute , true , func (ctx context.Context ) (bool , error ) {
857- _ , err := createPrivateZone (ctx , nodeRG , zoneName )
858- if err != nil {
859- var respErr * azcore.ResponseError
860- if errors .As (err , & respErr ) && respErr .StatusCode == 409 {
861- return false , nil // concurrent operation, retry
862- }
863- return false , err
864- }
865- return true , nil
866- }); err != nil {
867- return fmt .Errorf ("creating private zone %q: %w" , zoneName , err )
868- }
869-
870- vnet , err := getClusterVNet (ctx , nodeRG )
871- if err != nil {
872- return fmt .Errorf ("getting cluster VNet: %w" , err )
873- }
874- if err := wait .PollUntilContextTimeout (ctx , 5 * time .Second , 2 * time .Minute , true , func (ctx context.Context ) (bool , error ) {
875- err := createPrivateDNSLink (ctx , vnet , nodeRG , zoneName )
876- if err != nil {
877- var respErr * azcore.ResponseError
878- if errors .As (err , & respErr ) && respErr .StatusCode == 409 {
879- return false , nil
880- }
881- return false , err
882- }
883- return true , nil
884- }); err != nil {
885- return fmt .Errorf ("linking private zone to VNet: %w" , err )
886- }
887-
888- _ , err = config .Azure .RecordSetClient .CreateOrUpdate (ctx , nodeRG , zoneName , armprivatedns .RecordTypeA , "@" ,
889- armprivatedns.RecordSet {Properties : & armprivatedns.RecordSetProperties {TTL : to.Ptr [int64 ](300 ), ARecords : aRecords }}, nil )
890- if err != nil {
891- return fmt .Errorf ("creating A record in zone %q: %w" , zoneName , err )
892- }
893-
894- toolkit .Logf (ctx , "private DNS zone %q → %v" , zoneName , ips )
895- return nil
896- }
0 commit comments