-
Notifications
You must be signed in to change notification settings - Fork 9
Expand file tree
/
Copy pathvariables.tf
More file actions
192 lines (166 loc) · 4.65 KB
/
variables.tf
File metadata and controls
192 lines (166 loc) · 4.65 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
# SSH public key that is authorized to log in to the cluster.
# Required input: no default is declared, so the caller must supply a value.
# NOTE(review): the name suggests a filesystem path to the key — confirm against usage.
variable "ssh_public_key_path" {
  type        = string
  description = "The ssh public key authorized to login to the cluster."
}
# Location in which the cluster is created.
# Required input: no default is declared.
variable "location" {
  type        = string
  description = "The location in which to create the cluster."
}
# Project in which the cluster is created.
# Required input: no default is declared.
variable "project_id" {
  type        = string
  description = "The project in which to create the cluster."
}
# Optional VPC subnet id; left as null unless the caller sets it.
# How a null value is handled is decided by the consuming resources (not shown here).
variable "vpc_subnet_id" {
  type        = string
  default     = null
  description = "The vpc subnet id."
}
# Number of Slurm head nodes; defaults to 2.
variable "slurm_head_node_count" {
  description = "The number of slurm head nodes."
  type        = number
  default     = 2

  # A node count only makes sense as a non-negative whole number. Checking it
  # here surfaces a clear message at plan time instead of an error from
  # whatever consumes the value. The null guard keeps an explicit null from
  # failing inside the comparison itself.
  validation {
    condition = (
      var.slurm_head_node_count == null
      ? true
      : var.slurm_head_node_count >= 0 && floor(var.slurm_head_node_count) == var.slurm_head_node_count
    )
    error_message = "The slurm_head_node_count value must be a non-negative whole number."
  }
}
# Instance type used for the Slurm head nodes; defaults to "c1a.16x".
variable "slurm_head_node_type" {
  type        = string
  default     = "c1a.16x"
  description = "The slurm head node instance type."
}
# InfiniBand partition for the head nodes; defaults to null.
# Only needs to be set when the head nodes use an InfiniBand-enabled instance type.
variable "slurm_head_node_ib_partition_id" {
  type        = string
  default     = null
  description = "The ib partition in which to create the head node."
}
# Optional reservation id for the Slurm head nodes; defaults to null.
variable "slurm_head_node_reservation_id" {
  type        = string
  default     = null
  description = "The slurm head node reservation id"
}
# Number of Slurm login nodes; defaults to 2.
variable "slurm_login_node_count" {
  description = "The number of slurm login nodes."
  type        = number
  default     = 2

  # A node count only makes sense as a non-negative whole number. Checking it
  # here surfaces a clear message at plan time instead of an error from
  # whatever consumes the value. The null guard keeps an explicit null from
  # failing inside the comparison itself.
  validation {
    condition = (
      var.slurm_login_node_count == null
      ? true
      : var.slurm_login_node_count >= 0 && floor(var.slurm_login_node_count) == var.slurm_login_node_count
    )
    error_message = "The slurm_login_node_count value must be a non-negative whole number."
  }
}
# Instance type used for the Slurm login nodes; defaults to "c1a.16x".
variable "slurm_login_node_type" {
  type        = string
  default     = "c1a.16x"
  description = "The slurm login node instance type."
}
# Optional reservation id for the Slurm login nodes; defaults to null.
variable "slurm_login_node_reservation_id" {
  type        = string
  default     = null
  description = "The slurm login node reservation id"
}
# InfiniBand partition for the login nodes; defaults to null.
# Only needs to be set when the login nodes use an InfiniBand-enabled instance type.
variable "slurm_login_node_ib_partition_id" {
  type        = string
  default     = null
  description = "The ib partition in which to create the login node."
}
# Slurm partition definitions, one object per partition.
#
# Attributes of each entry:
#   name            - partition name
#   count           - number of nodes in the partition (the default entry uses 0)
#   type            - instance type for the partition's nodes
#   imex_support    - boolean flag (the default entry sets it to false)
#   ib_partition_id - optional; null when omitted
#   image           - image reference in name:tag form, as in the default entry
#   custom_image    - optional; null when omitted
#   reservation_id  - optional; null when omitted
#   extra_args      - map of additional string key/value settings for the partition
#
# The three id/image override attributes are declared optional() so callers can
# leave them out instead of writing `= null` for each one. Omitting them yields
# null, exactly what an explicit null produced before, so existing inputs that
# list every attribute keep working unchanged.
variable "partitions" {
  description = "Partition configuration"
  type = list(object({
    name            = string
    count           = number
    type            = string
    imex_support    = bool
    ib_partition_id = optional(string)
    image           = string
    custom_image    = optional(string)
    reservation_id  = optional(string)
    extra_args      = map(string)
  }))
  default = [
    {
      name            = "partition1"
      count           = 0
      type            = "b200-180gb-sxm-ib.8x"
      imex_support    = false
      ib_partition_id = null
      image           = "ubuntu22.04-nvidia-slurm:latest"
      custom_image    = null
      reservation_id  = null
      extra_args = {
        "Default" = "YES",
        "MaxTime" = "INFINITE",
        "State"   = "UP",
      }
    }
  ]
}
# Additional users, one object per user; defaults to an empty list.
#
# Attributes of each entry:
#   name       - user name
#   uid        - numeric user id
#   ssh_pubkey - the user's SSH public key
#   is_sudoer  - optional; false when omitted
variable "slurm_users" {
  type = list(object({
    name       = string
    uid        = number
    ssh_pubkey = string
    is_sudoer  = optional(bool, false)
  }))
  default     = []
  description = "Additional users"
}
# Toggle for the observability stack; off (false) unless the caller enables it.
variable "enable_observability" {
  type        = bool
  default     = false
  description = "Enable observability stack (Prometheus, Grafana, GPU monitoring)"
}
# Grafana admin password. Marked sensitive so Terraform redacts it in plan/apply output.
# NOTE(review): the default is the well-known value "admin"; callers that turn on
# observability should override it with a real secret.
variable "grafana_admin_password" {
  type        = string
  sensitive   = true
  default     = "admin"
  description = "Admin password for Grafana (if observability is enabled)"
}
# Size of the Slurm data disk, given as a string with a unit suffix.
# Defaults to "1024000GiB".
variable "slurm_data_disk_size" {
  type        = string
  default     = "1024000GiB"
  description = "The slurm data disk size."
}
# Size of the Slurm home directory disk, given as a string with a unit suffix.
# Defaults to "20480GiB".
variable "slurm_home_disk_size" {
  type        = string
  default     = "20480GiB"
  description = "The slurm home directory size."
}
# Size of the slurmctld disk, given as a string with a unit suffix.
# Defaults to "1024GiB". Per the description, this disk persists Slurm cluster state.
variable "slurmctld_disk_size" {
  type        = string
  default     = "1024GiB"
  description = "The slurmctld disk size. This is required to persist slurm cluster state"
}
# Id of an already-existing Slurm home disk to use; defaults to null.
# The description previously said "data disk", copied from
# pre_existing_slurm_data_disk_id; it now names the home disk this variable
# actually refers to.
variable "pre_existing_slurm_home_disk_id" {
  description = "Use a pre-existing Slurm VAST home disk"
  type        = string
  default     = null
}
# Id of an already-existing Slurm data disk to use; defaults to null.
variable "pre_existing_slurm_data_disk_id" {
  type        = string
  default     = null
  description = "Use a pre-existing Slurm VAST data disk"
}
# Mount path of the training/checkpoint data disk; defaults to "/data".
variable "slurm_data_disk_mount_path" {
  type        = string
  default     = "/data"
  description = "This is the training/checkpoint disk mount path"
}
# Version of the VAST NFS driver; defaults to "4.0.35".
variable "vastnfs_version" {
  type        = string
  default     = "4.0.35"
  description = "The VAST NFS driver version"
}
# Hostname or IP address of the VAST NFS server, used as the NFS mount source.
# Defaults to "172.27.255.2".
variable "vast_nfs_server_host" {
  type        = string
  default     = "172.27.255.2"
  description = "The VAST NFS server hostname or IP address used as the NFS mount source"
}
# Address range passed as the remoteports NFS mount option.
# Defaults to "172.27.255.2-172.27.255.17".
variable "vast_nfs_remoteports" {
  type        = string
  default     = "172.27.255.2-172.27.255.17"
  description = "The VAST NFS remoteports range used in NFS mount options"
}
# Optional custom image for the head nodes, written as name:tag; defaults to null.
variable "head_node_custom_image_name" {
  type        = string
  default     = null
  description = "name:tag of your Custom Image for Head Nodes"
}
# Optional custom image for the login nodes, written as name:tag; defaults to null.
variable "login_node_custom_image_name" {
  type        = string
  default     = null
  description = "name:tag of your Custom Image for Login Nodes"
}