Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
224 changes: 200 additions & 24 deletions crates/api/src/site_explorer/machine_creator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
use std::collections::HashMap;
use std::net::IpAddr;
use std::sync::Arc;
use std::sync::atomic::Ordering;

use carbide_uuid::machine::MachineId;
use db::{ObjectColumnFilter, Transaction};
Expand Down Expand Up @@ -114,26 +115,46 @@ impl MachineCreator {

// Zero-dpu case: If the explored host had no DPUs, we can create the machine now
if managed_host.explored_host.dpus.is_empty() {
if !self.config.allow_zero_dpu_hosts {
if self.config.allow_zero_dpu_hosts {
if let Some(machine_id) = self
.create_zero_dpu_machine(
&mut txn,
&managed_host,
report,
metadata.unwrap_or(&Metadata::default()),
)
.await?
{
managed_host.machine_id = Some(machine_id);
} else {
// Site explorer has already created a machine for this endpoint previously, skip.
return Ok(false);
}
tracing::info!("Created managed_host with zero DPUs");
} else if self.config.use_onboard_nic.load(Ordering::Relaxed) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I still don't understand what this change is supposed to do. If we've gotten to this point in site_explorer, we've seen no DPUs on the host, and if the zero-DPU configuration is allowed, we already ingest it with no DPUs.

Why do we need another config setting called use_onboard_nic, and another function create_onboard_nic_machine? What do these do that are different from the zero-dpu path?

Copy link
Author

@vinodchitraliNVIDIA vinodchitraliNVIDIA Feb 3, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@kensimon is there any known bug? A couple of months ago I tried with the zero-DPU flag and it didn't work: the GB200 machine has multiple MAC addresses, and I ran into issues there. Also, the DPU list in the managed host is not empty.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not that I know of? If there is a bug that you found, we should fix it. I don't think we need a fully separate code path for GB200's when we can just fix the current one.

If you have an issue with GB200's in zero-dpu mode, could you file an nvbug and assign it to me? I need logs/details/reproduction steps if possible.

let host_onboard_nic_mac_address = None;
// expected_machine.map(|m| m.host_nics[0].mac_address);
if let Some(machine_id) = self
.create_onboard_nic_machine(
&mut txn,
&managed_host,
report,
metadata.unwrap_or(&Metadata::default()),
host_onboard_nic_mac_address,
)
.await?
{
managed_host.machine_id = Some(machine_id);
} else {
// Site explorer has already created a machine for this endpoint previously, skip.
return Ok(false);
}
tracing::info!("Created managed_host with onboard NIC");
} else {
let error = CarbideError::NoDpusInMachine(managed_host.explored_host.host_bmc_ip);
tracing::error!(%error, "Cannot create managed host for explored endpoint with no DPUs: Zero-dpu hosts are disallowed by config");
tracing::error!(%error, "Cannot create managed host for explored endpoint with no DPUs: Zero-dpu hosts are disallowed by config and onboard NIC is not enabled");
return Err(error);
}
if let Some(machine_id) = self
.create_zero_dpu_machine(
&mut txn,
&managed_host,
report,
metadata.unwrap_or(&Metadata::default()),
)
.await?
{
managed_host.machine_id = Some(machine_id);
} else {
// Site explorer has already created a machine for this endpoint previously, skip.
return Ok(false);
}
tracing::info!("Created managed_host with zero DPUs");
}

let mut dpu_ids = vec![];
Expand Down Expand Up @@ -175,19 +196,22 @@ impl MachineCreator {
"Failed to get machine ID for host: {managed_host:#?}"
)))?;

db::machine::update_state(
&mut txn,
&host_machine_id,
&ManagedHostState::DpuDiscoveringState {
let starting_state = if self.config.use_onboard_nic.load(Ordering::Relaxed) {
ManagedHostState::HostInit {
machine_state: model::machine::MachineState::EnableIpmiOverLan,
}
} else {
ManagedHostState::DpuDiscoveringState {
dpu_states: DpuDiscoveringStates {
states: dpu_ids
.into_iter()
.map(|x| (x, DpuDiscoveringState::Initializing))
.collect::<HashMap<MachineId, DpuDiscoveringState>>(),
},
},
)
.await?;
}
};

db::machine::update_state(&mut txn, &host_machine_id, &starting_state).await?;

txn.commit().await?;

Expand Down Expand Up @@ -328,6 +352,158 @@ impl MachineCreator {
Ok(Some(*machine_id))
}

// create_onboard_nic_machine creates a host machine for an explored endpoint, keyed on a single
// onboard NIC MAC address.
//
// The MAC address used is `host_onboard_nic_mac_address` when it is given (it must then be present
// in the exploration report); otherwise the first MAC address listed in the report is used.
//
// Returns Ok(Some(machine_id)) if the machine was created. Returns Ok(None) if a machine or a
// predicted_machine_interface with this MAC address already exists, or if a machine with the
// derived machine ID already exists.
//
// Returns an error if the requested MAC address is not in the report, if the report contains no
// MAC addresses at all, if no machine ID can be derived from the report, or if any database call
// fails.
async fn create_onboard_nic_machine(
    &self,
    txn: &mut PgConnection,
    managed_host: &ManagedHost<'_>,
    report: &mut EndpointExplorationReport,
    metadata: &Metadata,
    host_onboard_nic_mac_address: Option<mac_address::MacAddress>,
) -> CarbideResult<Option<MachineId>> {
    // Collect the report's MAC addresses once; both branches below need them.
    let report_macs = report.all_mac_addresses();
    let onboard_nic_mac_address = match host_onboard_nic_mac_address {
        Some(onboard_nic_mac_address) => {
            // Check if the onboard NIC mac address is correct from redfish report
            if !report_macs
                .iter()
                .any(|mac| mac == &onboard_nic_mac_address)
            {
                return Err(CarbideError::internal(format!(
                    "Onboard NIC mac address not found in exploration report: {:#?}",
                    onboard_nic_mac_address
                )));
            }
            onboard_nic_mac_address
        }
        None => {
            // If not specified then use first mac address from redfish report. An empty report
            // is reported as an error instead of panicking.
            let Some(first_mac) = report_macs.first().copied() else {
                return Err(CarbideError::internal(format!(
                    "No mac addresses found in exploration report: {:#?}",
                    report
                )));
            };
            first_mac
        }
    };

    // A machine that already owns this MAC address means the endpoint was ingested earlier.
    if db::machine::find_by_mac_address(txn, &onboard_nic_mac_address)
        .await?
        .is_some()
    {
        return Ok(None);
    }

    // If we already minted this machine and it hasn't DHCP'd yet, there will be a
    // predicted_machine_interface with this MAC address. If so, also skip.
    if !db::predicted_machine_interface::find_by(
        txn,
        ObjectColumnFilter::One(
            db::predicted_machine_interface::MacAddressColumn,
            &onboard_nic_mac_address,
        ),
    )
    .await?
    .is_empty()
    {
        return Ok(None);
    }

    let machine_id = match managed_host.machine_id.as_ref() {
        Some(machine_id) => machine_id,
        None => {
            // Mint a predicted-host machine_id from the exploration report. A report that
            // yields no ID is surfaced as an error rather than a panic.
            report.generate_machine_id(true)?.ok_or_else(|| {
                CarbideError::internal(
                    "Could not generate a machine ID from the exploration report".to_string(),
                )
            })?
        }
    };

    tracing::info!(%machine_id, "Minted predicted host ID for onboard NIC machine");

    let existing_machine = db::machine::find_one(
        txn,
        machine_id,
        MachineSearchConfig {
            include_predicted_host: true,
            ..Default::default()
        },
    )
    .await?;

    if let Some(existing_machine) = existing_machine {
        // There's already a machine with this ID, but we already looked above for machines with
        // the same MAC address as this one, so something's weird here. Log this host's mac
        // address and the ones from the colliding host to help in diagnosis.
        let existing_macs = existing_machine
            .hardware_info
            .as_ref()
            .map(|hw| hw.all_mac_addresses())
            .unwrap_or_default();
        tracing::warn!(
            %machine_id,
            ?existing_macs,
            predicted_host_macs=?onboard_nic_mac_address,
            "Predicted host already exists, with different mac addresses from this one. Potentially multiple machines with same serial number?"
        );
        return Ok(None);
    }

    self.create_machine_from_explored_managed_host(
        txn,
        managed_host,
        machine_id,
        metadata,
        None,
        false,
    )
    .await?;

    // Attach the onboard NIC to the new host: reuse an existing machine_interface with this MAC
    // address if there is one, otherwise record a predicted_machine_interface for it.
    if let Some(machine_interface) =
        db::machine_interface::find_by_mac_address(txn, onboard_nic_mac_address)
            .await?
            .into_iter()
            .next()
    {
        // There's already a machine_interface with this MAC...
        if let Some(existing_machine_id) = machine_interface.machine_id {
            // ...If it has a MachineId, something's gone wrong. We already checked
            // db::machine::find_by_mac_address() above for this MAC address, and returned
            // Ok(None) if a machine was found. Finding an interface with this MAC with a
            // non-nil machine_id is a contradiction.
            tracing::error!(
                %onboard_nic_mac_address,
                %machine_id,
                %existing_machine_id,
                "BUG! Found existing machine_interface with this MAC address, we should not have gotten here!"
            );
            // NOTE(review): this contradiction is only logged; the function still returns
            // Ok(Some(machine_id)) without associating the interface with the new machine.
            // An early return of CarbideError::AlreadyFoundError was left disabled here;
            // confirm whether it should be restored.
        } else {
            // ...If it has no MachineId, the host must have DHCP'd before site-explorer ran. Set it to the new machine ID.
            tracing::info!(%onboard_nic_mac_address, %machine_id, "Migrating unowned machine_interface to new managed host");
            db::machine_interface::associate_interface_with_machine(
                &machine_interface.id,
                MachineInterfaceAssociation::Machine(*machine_id),
                txn,
            )
            .await?;
        }
    } else {
        db::predicted_machine_interface::create(
            NewPredictedMachineInterface {
                machine_id,
                mac_address: onboard_nic_mac_address,
                expected_network_segment_type: NetworkSegmentType::Admin,
            },
            txn,
        )
        .await?;
    }

    Ok(Some(*machine_id))
}

// create_dpu does everything needed to create a DPU as part of a newly discovered managed host.
// If the DPU does not exist in the machines table, the function creates a new DPU machine and configures it appropriately. create_dpu returns true.
// If the DPU already exists in the machines table, this is a no-op. create_dpu returns false.
Expand Down
9 changes: 9 additions & 0 deletions crates/api/src/state_controller/machine/handler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8840,6 +8840,15 @@ async fn set_host_boot_order(
SetBootOrderState::SetBootOrder => {
if mh_snapshot.dpu_snapshots.is_empty() {
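// MachineState::SetBootOrder is a NO-OP for the Zero-DPU case, unless
// force_dpu_nic_mode is set, in which case UEFI HTTP boot is moved to the front of the boot order.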
if ctx.services.site_config.force_dpu_nic_mode {
redfish_client
.boot_first(Boot::UefiHttp)
.await
.map_err(|e| StateHandlerError::RedfishError {
operation: "boot_first",
error: e,
})?;
}
Ok(SetBootOrderOutcome::Done)
} else {
let primary_interface = mh_snapshot
Expand Down
71 changes: 70 additions & 1 deletion crates/api/src/tests/site_explorer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ use mac_address::MacAddress;
use model::expected_machine::ExpectedMachineData;
use model::hardware_info::HardwareInfo;
use model::machine::machine_search_config::MachineSearchConfig;
use model::machine::{LoadSnapshotOptions, Machine, ManagedHostStateSnapshot};
use model::machine::{LoadSnapshotOptions, Machine, ManagedHostState, ManagedHostStateSnapshot};
use model::metadata::Metadata;
use model::power_shelf::PowerShelfControllerState;
use model::site_explorer::{
Expand Down Expand Up @@ -336,6 +336,75 @@ async fn test_site_explorer_default_pause_ingestion_and_poweron(
Ok(())
}

// With zero-DPU hosts disallowed but use_onboard_nic enabled, a host explored without any DPUs
// must still be ingested as a managed host, and it must start out in HostInit.
#[crate::sqlx_test]
async fn test_site_explorer_onboard_nic_creates_managed_host(
    pool: sqlx::PgPool,
) -> Result<(), Box<dyn std::error::Error>> {
    let env = common::api_fixtures::create_test_env(pool.clone()).await;

    // One fake host on the underlay segment, which DHCPs before exploration starts.
    let mut fake_hosts = vec![FakeMachine::new(
        "6a:6b:6c:6d:6e:70",
        "VendorOnboardNic",
        &env.underlay_segment,
    )];
    fake_hosts.discover_dhcp(&env).await?;

    // The mocked exploration result reports the host with an empty DPU list.
    let dpu_less_host = fake_hosts[0].as_mock_host(Vec::new());
    let endpoint_explorer = Arc::new(MockEndpointExplorer::default());
    endpoint_explorer.insert_endpoint_results(vec![(
        fake_hosts[0].ip.parse().unwrap(),
        Ok(dpu_less_host.into()),
    )]);

    let test_meter = TestMeter::default();
    let site_explorer = SiteExplorer::new(
        env.pool.clone(),
        SiteExplorerConfig {
            enabled: true,
            explorations_per_run: 1,
            concurrent_explorations: 1,
            run_interval: std::time::Duration::from_secs(1),
            create_machines: Arc::new(true.into()),
            allow_zero_dpu_hosts: false,
            use_onboard_nic: Arc::new(true.into()),
            ..Default::default()
        },
        test_meter.meter(),
        endpoint_explorer.clone(),
        Arc::new(env.config.get_firmware_config()),
        env.common_pools.clone(),
        env.api.work_lock_manager_handle.clone(),
        env.rms_sim.as_rms_client(),
    );

    // First pass: the endpoint gets explored.
    site_explorer.run_single_iteration().await.unwrap();

    // Mark the single explored endpoint as having completed preingestion.
    {
        let mut txn = env.pool.begin().await?;
        let explored_endpoints = db::explored_endpoints::find_all(&mut txn).await.unwrap();
        assert_eq!(explored_endpoints.len(), 1);
        db::explored_endpoints::set_preingestion_complete(explored_endpoints[0].address, &mut txn)
            .await
            .unwrap();
        txn.commit().await?;
    }

    // Second pass: the managed host is created from the explored endpoint.
    site_explorer.run_single_iteration().await.unwrap();

    let mut txn = env.pool.begin().await?;
    let snapshots = db::managed_host::load_all(&mut txn, LoadSnapshotOptions::default()).await?;
    assert_eq!(snapshots.len(), 1);
    // No DPUs may be attached to the managed host
    assert_eq!(snapshots[0].dpu_snapshots.len(), 0);
    assert!(matches!(
        snapshots[0].managed_state,
        ManagedHostState::HostInit { .. }
    ));

    Ok(())
}

#[crate::sqlx_test]
async fn test_site_explorer_main(pool: sqlx::PgPool) -> Result<(), Box<dyn std::error::Error>> {
let env = common::api_fixtures::create_test_env(pool.clone()).await;
Expand Down
Loading