diff --git a/Cargo.lock b/Cargo.lock index d03d25a6bf..488b201937 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3202,8 +3202,7 @@ dependencies = [ [[package]] name = "oci-spec" version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e8445a2631507cec628a15fdd6154b54a3ab3f20ed4fe9d73a3b8b7a4e1ba03a" +source = "git+https://github.com/youki-dev/oci-spec-rs?branch=main#926197a42cf9fd831aebac287fe12a5d4bf6c481" dependencies = [ "const_format", "derive_builder", diff --git a/Cargo.toml b/Cargo.toml index 515544ecc0..ec21737cb5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,6 +3,10 @@ resolver = "2" members = ["crates/*", "tests/contest/*", "tools/*"] exclude = ["experiment/seccomp", "experiment/selinux"] +# NOTE: temp for developing +[patch.crates-io] +oci-spec = { git = "https://github.com/youki-dev/oci-spec-rs", branch = "main" } + [workspace.dependencies] anyhow = "1.0.102" caps = "0.5.6" diff --git a/crates/libcontainer/src/container/builder_impl.rs b/crates/libcontainer/src/container/builder_impl.rs index 10a41bdfa7..1e3c290058 100644 --- a/crates/libcontainer/src/container/builder_impl.rs +++ b/crates/libcontainer/src/container/builder_impl.rs @@ -12,7 +12,9 @@ use super::{Container, ContainerStatus}; use crate::error::{CreateContainerError, LibcontainerError, MissingSpecError}; use crate::notify_socket::NotifyListener; use crate::process::args::{ContainerArgs, ContainerType}; -use crate::process::intel_rdt::delete_resctrl_subdirectory; +use crate::process::intel_rdt::{ + delete_resctrl_monitoring_subdirectory, delete_resctrl_subdirectory, +}; use crate::process::{self}; use crate::syscall::syscall::SyscallType; use crate::user_ns::UserNamespaceConfig; @@ -192,7 +194,7 @@ impl ContainerBuilderImpl { pid_file: self.pid_file.to_owned(), }; - let (init_pid, need_to_clean_up_intel_rdt_dir) = + let (init_pid, intel_rdt_dir, intel_rdt_monitoring_dir) = process::container_main_process::container_main_process(&container_args).map_err( |err| { tracing::error!("failed to run container process {}", err); @@ -206,7 +208,8 @@ impl ContainerBuilderImpl { .set_status(ContainerStatus::Created) .set_creator(nix::unistd::geteuid().as_raw()) .set_pid(init_pid.as_raw()) - .set_clean_up_intel_rdt_directory(need_to_clean_up_intel_rdt_dir) + .set_intel_rdt_dir(intel_rdt_dir) + .set_intel_rdt_monitoring_dir(intel_rdt_monitoring_dir) .save()?; } @@ -231,9 +234,16 @@ impl ContainerBuilderImpl { } if let Some(container) = &self.container { - if let Some(true) = container.clean_up_intel_rdt_subdirectory() { - if let Err(e) = delete_resctrl_subdirectory(container.id()) { - tracing::error!(id = ?container.id(), error = ?e, "failed to delete resctrl subdirectory"); + if let Some(path) = container.intel_rdt_monitoring_dir() { + if let Err(e) = delete_resctrl_monitoring_subdirectory(path) { + tracing::error!(path = ?path, error = ?e, "failed to delete resctrl monitoring subdirectory"); + errors.push(e.to_string()); + } + } + + if let Some(path) = container.intel_rdt_dir() { + if let Err(e) = delete_resctrl_subdirectory(path) { + tracing::error!(path = ?path, error = ?e, "failed to delete resctrl subdirectory"); errors.push(e.to_string()); } } diff --git a/crates/libcontainer/src/container/container.rs b/crates/libcontainer/src/container/container.rs index 12c2ee4eb0..3945bd2c1a 100644 --- a/crates/libcontainer/src/container/container.rs +++ b/crates/libcontainer/src/container/container.rs @@ -130,13 +130,22 @@ impl Container { self } - pub fn set_clean_up_intel_rdt_directory(&mut self, clean_up: bool) -> &mut Self { - self.state.clean_up_intel_rdt_subdirectory = Some(clean_up); + pub fn set_intel_rdt_dir(&mut self, dir: Option) -> &mut Self { + self.state.intel_rdt_dir = dir; self } - pub fn clean_up_intel_rdt_subdirectory(&self) -> Option { - self.state.clean_up_intel_rdt_subdirectory + pub fn intel_rdt_dir(&self) -> Option<&PathBuf> { + self.state.intel_rdt_dir.as_ref() + } + + pub fn set_intel_rdt_monitoring_dir(&mut self, dir: Option) -> &mut Self { + self.state.intel_rdt_monitoring_dir = dir; + self + } + + pub fn intel_rdt_monitoring_dir(&self) -> Option<&PathBuf> { + self.state.intel_rdt_monitoring_dir.as_ref() } pub fn status(&self) -> ContainerStatus { diff --git a/crates/libcontainer/src/container/container_delete.rs b/crates/libcontainer/src/container/container_delete.rs index f167e8018e..23097dbce6 100644 --- a/crates/libcontainer/src/container/container_delete.rs +++ b/crates/libcontainer/src/container/container_delete.rs @@ -8,7 +8,9 @@ use super::{Container, ContainerStatus}; use crate::config::YoukiConfig; use crate::error::LibcontainerError; use crate::hooks; -use crate::process::intel_rdt::delete_resctrl_subdirectory; +use crate::process::intel_rdt::{ + delete_resctrl_monitoring_subdirectory, delete_resctrl_subdirectory, +}; impl Container { /// Deletes the container @@ -69,41 +71,12 @@ impl Container { // Once reached here, the container is verified that it can be deleted. debug_assert!(self.status().can_delete()); - if let Some(true) = &self.clean_up_intel_rdt_subdirectory() { - if let Err(err) = delete_resctrl_subdirectory(self.id()) { - tracing::warn!( - "failed to delete resctrl subdirectory due to: {err:?}, continue to delete" - ); - } - } - + let mut config_opt = None; if self.root.exists() { match YoukiConfig::load(&self.root) { Ok(config) => { - tracing::debug!("config: {:?}", config); - - // remove the cgroup created for the container - // check https://man7.org/linux/man-pages/man7/cgroups.7.html - // creating and removing cgroups section for more information on cgroups - let cmanager = libcgroups::common::create_cgroup_manager( - libcgroups::common::CgroupConfig { - cgroup_path: config.cgroup_path.to_owned(), - systemd_cgroup: self.systemd(), - container_name: self.id().to_string(), - }, - )?; - cmanager.remove().map_err(|err| { - tracing::error!(cgroup_path = ?config.cgroup_path, "failed to remove cgroup due to: {err:?}"); - err - })?; - - if let Some(hooks) = config.hooks.as_ref() { - hooks::run_hooks(hooks.poststop().as_ref(), Some(&self.state), None, None) - .map_err(|err| { - tracing::error!(err = ?err, "failed to run post stop hooks"); - err - })?; - } + tracing::debug!("config: {config:?}"); + config_opt = Some(config); } Err(err) => { // There is a brief window where the container state is @@ -115,7 +88,49 @@ impl Container { ); } } + } + + if let Some(path) = self.intel_rdt_monitoring_dir() { + if let Err(err) = delete_resctrl_monitoring_subdirectory(path) { + tracing::warn!( + "failed to delete resctrl monitoring subdirectory due to: {err:?}, continue to delete" + ); + } + } + if let Some(path) = self.intel_rdt_dir() { + if let Err(err) = delete_resctrl_subdirectory(path) { + tracing::warn!( + "failed to delete resctrl subdirectory due to: {err:?}, continue to delete" + ); + } + } + + if let Some(config) = config_opt { + // remove the cgroup created for the container + // check https://man7.org/linux/man-pages/man7/cgroups.7.html + // creating and removing cgroups section for more information on cgroups + let cmanager = + libcgroups::common::create_cgroup_manager(libcgroups::common::CgroupConfig { + cgroup_path: config.cgroup_path.to_owned(), + systemd_cgroup: self.systemd(), + container_name: self.id().to_string(), + })?; + cmanager.remove().map_err(|err| { + tracing::error!(cgroup_path = ?config.cgroup_path, "failed to remove cgroup due to: {err:?}"); + err + })?; + + if let Some(hooks) = config.hooks.as_ref() { + hooks::run_hooks(hooks.poststop().as_ref(), Some(&self.state), None, None) + .map_err(|err| { + tracing::error!(err = ?err, "failed to run post stop hooks"); + err + })?; + } + } + + if self.root.exists() { // remove the directory storing container state tracing::debug!("remove dir {:?}", self.root); fs::remove_dir_all(&self.root).map_err(|err| { diff --git a/crates/libcontainer/src/container/state.rs b/crates/libcontainer/src/container/state.rs index a803529ca5..497bcb3f5f 100644 --- a/crates/libcontainer/src/container/state.rs +++ b/crates/libcontainer/src/container/state.rs @@ -119,8 +119,14 @@ pub struct State { pub creator: Option, // Specifies if systemd should be used to manage cgroups pub use_systemd: bool, - // Specifies if the Intel RDT subdirectory needs be cleaned up. - pub clean_up_intel_rdt_subdirectory: Option, + // Specifies the Intel RDT subdirectory path that needs to be cleaned up. + pub intel_rdt_dir: Option, + /// Indicates the dedicated monitoring group subdirectory (`mon_groups/`) + /// within the resctrl filesystem that needs to be cleaned up. + /// As per OCI runtime-spec v1.3.0, if `enable_monitoring` is true, the runtime MUST + /// create this group and MUST remove it when the container is deleted. + #[serde(skip_serializing_if = "Option::is_none")] + pub intel_rdt_monitoring_dir: Option, } impl State { @@ -142,7 +148,8 @@ impl State { created: None, creator: None, use_systemd: false, - clean_up_intel_rdt_subdirectory: None, + intel_rdt_dir: None, + intel_rdt_monitoring_dir: None, } } diff --git a/crates/libcontainer/src/process/container_main_process.rs b/crates/libcontainer/src/process/container_main_process.rs index 6414aa71f5..1fdda6e54a 100644 --- a/crates/libcontainer/src/process/container_main_process.rs +++ b/crates/libcontainer/src/process/container_main_process.rs @@ -50,7 +50,9 @@ pub enum ProcessError { type Result = std::result::Result; -pub fn container_main_process(container_args: &ContainerArgs) -> Result<(Pid, bool)> { +pub fn container_main_process( + container_args: &ContainerArgs, +) -> Result<(Pid, Option, Option)> { // We use a set of channels to communicate between parent and child process. // Each channel is uni-directional. Because we will pass these channel to // cloned process, we have to be deligent about closing any unused channel. @@ -132,7 +134,8 @@ pub fn container_main_process(container_args: &ContainerArgs) -> Result<(Pid, bo // The intermediate process will send the init pid once it forks the init // process. The intermediate process should exit after this point. let init_pid = main_receiver.wait_for_intermediate_ready()?; - let mut need_to_clean_up_intel_rdt_subdirectory = false; + let mut intel_rdt_dir = None; + let mut intel_rdt_monitoring_dir = None; if let Some(linux) = container_args.spec.linux() { if let Some(intel_rdt) = linux.intel_rdt() { @@ -140,8 +143,9 @@ pub fn container_main_process(container_args: &ContainerArgs) -> Result<(Pid, bo .container .as_ref() .map(|container| container.id()); - need_to_clean_up_intel_rdt_subdirectory = - setup_intel_rdt(container_id, &init_pid, intel_rdt)?; + let (dir, mon_dir) = setup_intel_rdt(container_id, &init_pid, intel_rdt)?; + intel_rdt_dir = dir; + intel_rdt_monitoring_dir = mon_dir; } } @@ -278,7 +282,7 @@ pub fn container_main_process(container_args: &ContainerArgs) -> Result<(Pid, bo Err(err) => return Err(ProcessError::WaitIntermediateProcess(err)), }; - Ok((init_pid, need_to_clean_up_intel_rdt_subdirectory)) + Ok((init_pid, intel_rdt_dir, intel_rdt_monitoring_dir)) } fn setup_mapping(config: &UserNamespaceConfig, pid: Pid) -> Result<()> { diff --git a/crates/libcontainer/src/process/intel_rdt.rs b/crates/libcontainer/src/process/intel_rdt.rs index f8c14794b4..be26b3a8e8 100644 --- a/crates/libcontainer/src/process/intel_rdt.rs +++ b/crates/libcontainer/src/process/intel_rdt.rs @@ -63,17 +63,25 @@ pub enum ParseLineError { L3Line, #[error("L3 token has wrong number of fields")] L3Token, + #[error("Generic line doesn't match validation")] + GenericLine, + #[error("Generic token has wrong number of fields")] + GenericToken, } type Result = std::result::Result; -pub fn delete_resctrl_subdirectory(id: &str) -> Result<()> { +/// Removes the main resource control group (CLOS) directory for the container. +/// This function includes a safety check (`parent == dir`) to ensure that only directories +/// immediately under the resctrl mount point are deleted, preventing accidental deletion +/// of nested kernel files or other sensitive directories. +pub fn delete_resctrl_subdirectory(path: &Path) -> Result<()> { let dir = find_resctrl_mount_point().map_err(|err| { - tracing::error!("failed to find resctrl mount point: {}", err); + tracing::error!("failed to find resctrl mount point: {err}"); err })?; - let container_resctrl_path = dir.join(id).canonicalize().map_err(|err| { - tracing::error!(?dir, ?id, "failed to canonicalize path: {}", err); + let container_resctrl_path = path.canonicalize().map_err(|err| { + tracing::error!(?dir, ?path, "failed to canonicalize path: {err}"); IntelRdtError::Canonicalize(err) })?; match container_resctrl_path.parent() { @@ -82,7 +90,7 @@ pub fn delete_resctrl_subdirectory(id: &str) -> Result<()> { Some(parent) => { if parent == dir && container_resctrl_path.exists() { fs::remove_dir(&container_resctrl_path).map_err(|err| { - tracing::error!(path = ?container_resctrl_path, "failed to remove resctrl subdirectory: {}", err); + tracing::error!(path = ?container_resctrl_path, "failed to remove resctrl subdirectory: {err}"); IntelRdtError::RemoveSubdirectory(err) })?; } else { @@ -94,6 +102,25 @@ pub fn delete_resctrl_subdirectory(id: &str) -> Result<()> { Ok(()) } +/// Removes the dedicated monitoring group subdirectory (`mon_groups/`). +/// Unlike `delete_resctrl_subdirectory`, this does not enforce the `parent == dir` safety check, +/// because monitoring groups are intentionally nested deeper in the filesystem tree +/// (either under a specific CLOS ID or under the root `mon_groups`). The path provided here +/// is inherently trusted as it was resolved securely during container creation. +pub fn delete_resctrl_monitoring_subdirectory(path: &Path) -> Result<()> { + if path.exists() { + fs::remove_dir(path).map_err(|err| { + tracing::error!( + ?path, + "failed to remove resctrl monitoring subdirectory: {err}" + ); + IntelRdtError::RemoveSubdirectory(err) + })?; + } + + Ok(()) +} + /// Finds the resctrl mount path by looking at the process mountinfo data. pub fn find_resctrl_mount_point() -> Result { let reader = BufReader::new(ProcfsHandle::new()?.open( @@ -118,50 +145,85 @@ pub fn find_resctrl_mount_point() -> Result { Err(IntelRdtError::ResctrlMountPointNotFound) } -/// Adds container PID to the tasks file in the correct resctrl -/// pseudo-filesystem subdirectory. Creates the directory if needed based on -/// the rules in Linux OCI runtime config spec. -fn write_container_pid_to_resctrl_tasks( - path: &Path, - id: &str, +/// Sets up the main resource control group (CLOS) for the container. +/// This involves creating the subdirectory within the resctrl filesystem if needed, +/// and adding the container's PID to the group's `tasks` file. +/// +/// Returns `true` if the runtime created the directory, or `false` if it already existed. +fn setup_resctrl_group( + resctrl_container_dir: &Path, init_pid: Pid, only_clos_id_set: bool, ) -> Result { - let tasks = path.to_owned().join(id).join("tasks"); - let dir = tasks.parent(); - match dir { - None => Err(IntelRdtError::InvalidResctrlDirectory), - Some(resctrl_container_dir) => { - let mut created_dir = false; - if !resctrl_container_dir.exists() { - if only_clos_id_set { - // Directory doesn't exist and only clos_id is set: error out. - return Err(IntelRdtError::NoClosIDDirectory); - } - fs::create_dir_all(resctrl_container_dir).map_err(|err| { - tracing::error!("failed to create resctrl subdirectory: {}", err); - IntelRdtError::CreateClosIDDirectory(err) - })?; - created_dir = true; - } - // TODO(ipuustin): File doesn't need to be created, but it's easier - // to test this way. Fix the tests so that the fake resctrl - // filesystem is pre-populated. - let mut file = OpenOptions::new() - .create(true) - .append(true) - .open(tasks) - .map_err(|err| { - tracing::error!("failed to open resctrl tasks file: {}", err); - IntelRdtError::OpenClosIDDirectory(err) - })?; - write!(file, "{init_pid}").map_err(|err| { - tracing::error!("failed to write to resctrl tasks file: {}", err); - IntelRdtError::WriteClosIDDirectory(err) - })?; - Ok(created_dir) + let mut created_dir = false; + + if !resctrl_container_dir.exists() { + if only_clos_id_set { + return Err(IntelRdtError::NoClosIDDirectory); } + fs::create_dir_all(resctrl_container_dir).map_err(|err| { + tracing::error!("failed to create resctrl subdirectory: {}", err); + IntelRdtError::CreateClosIDDirectory(err) + })?; + created_dir = true; } + + let tasks = resctrl_container_dir.join("tasks"); + // TODO(ipuustin): File doesn't need to be created, but it's easier + // to test this way. Fix the tests so that the fake resctrl + // filesystem is pre-populated. + let mut file = OpenOptions::new() + .create(true) + .append(true) + .open(tasks) + .map_err(|err| { + tracing::error!("failed to open resctrl tasks file: {}", err); + IntelRdtError::OpenClosIDDirectory(err) + })?; + + write!(file, "{init_pid}").map_err(|err| { + tracing::error!("failed to write to resctrl tasks file: {}", err); + IntelRdtError::WriteClosIDDirectory(err) + })?; + + Ok(created_dir) +} + +/// Creates a dedicated monitoring group (`mon_groups/`) inside the container's +/// Intel RDT resource control directory and adds the container's PID to its `tasks` file. +/// +/// As per OCI runtime-spec v1.3.0, if `enable_monitoring` is set to true, the runtime MUST +/// create this subdirectory. This allows users to track the resource utilization (e.g., L3 cache, +/// memory bandwidth) of the container independently from the parent allocation group (CLOS). +/// +/// Returns `true` if the runtime created the directory, or `false` if it already existed. +fn setup_monitoring_group(mon_dir: &Path, init_pid: Pid) -> Result { + let mut created_dir = false; + + if !mon_dir.exists() { + fs::create_dir_all(mon_dir).map_err(|err| { + tracing::error!("failed to create resctrl monitoring subdirectory: {}", err); + IntelRdtError::CreateClosIDDirectory(err) + })?; + created_dir = true; + } + + let tasks = mon_dir.join("tasks"); + let mut file = OpenOptions::new() + .create(true) + .append(true) + .open(tasks) + .map_err(|err| { + tracing::error!("failed to open resctrl monitoring tasks file: {}", err); + IntelRdtError::OpenClosIDDirectory(err) + })?; + + write!(file, "{init_pid}").map_err(|err| { + tracing::error!("failed to write to resctrl monitoring tasks file: {}", err); + IntelRdtError::WriteClosIDDirectory(err) + })?; + + Ok(created_dir) } /// Merges the two schemas together, removing lines starting with "MB:" from @@ -199,7 +261,7 @@ enum LineType { L3DataLine, L3CodeLine, MbLine, - Unknown, + Generic(String), } #[derive(PartialEq)] @@ -265,7 +327,48 @@ fn parse_l3_line(line: &str) -> std::result::Result, Par Ok(token_map) } -/// Get the resctrl line type. We only support L3{,CODE,DATA} and MB. +/// Parse tokens from generic resctrl lines. +/// OCI runtime-spec v1.3.0 introduced the `schemata` list, allowing users to +/// specify any hardware resource (e.g. `L2`, `SMBA`). To safely verify these +/// new resources during `is_same_schema`, we must parse them into token maps +/// so we can perform strict semantic equality checks rather than basic string matching. +/// +/// Example: +/// A line like "L2:0=00ff;1=f0" is parsed into a token map: +/// {"0": "ff", "1": "f0"} +/// Leading zeros are automatically stripped from the values to ensure +/// "00ff" correctly matches "ff" during equality comparison. +fn parse_generic_line(line: &str) -> std::result::Result, ParseLineError> { + let mut token_map = HashMap::new(); + + static OTHER_VALIDATE_RE: LazyLock = LazyLock::new(|| { + Regex::new(r"^[A-Za-z0-9]+:(?:\s|;)*(?:\w+\s*=\s*[[:xdigit:]]+)?(?:(?:\s*;+\s*)+\w+\s*=\s*[[:xdigit:]]+)*(?:\s|;)*$").unwrap() + }); + + static OTHER_CAPTURE_RE: LazyLock = + LazyLock::new(|| Regex::new(r"(\w+)\s*=\s*([[:xdigit:]]+)").unwrap()); + if !OTHER_VALIDATE_RE.is_match(line) { + return Err(ParseLineError::GenericLine); + } + + for token in OTHER_CAPTURE_RE.captures_iter(line) { + match (token.get(1), token.get(2)) { + (Some(key), Some(value)) => { + let val_str = value.as_str().trim_start_matches('0'); + let final_val = if val_str.is_empty() { "0" } else { val_str }; + token_map.insert(key.as_str().to_string(), final_val.to_string()); + } + _ => return Err(ParseLineError::GenericToken), + } + } + + Ok(token_map) +} + +/// Get the resctrl line type. +/// Supports traditional L3{,CODE,DATA} and MB resources. +/// Also supports generic hardware resources (e.g., L2, SMBA) as introduced +/// in OCI runtime-spec v1.3.0 via the `schemata` list feature. fn get_line_type(line: &str) -> LineType { if line.starts_with("L3:") { return LineType::L3Line; @@ -280,20 +383,37 @@ fn get_line_type(line: &str) -> LineType { return LineType::MbLine; } - // Empty or unknown line. - LineType::Unknown + // OCI runtime-spec v1.3.0 generic schemata list support. + // If it's not a legacy L3/MB line, we extract the prefix before the colon + // (e.g. "L2" from "L2:0=ff") and store it as `LineType::Other(prefix)`. + // This allows us to retain the prefix for strict schema verification later, + // rather than blindly dropping unknown hardware resources. + if let Some(pos) = line.find(':') { + let prefix = &line[..pos]; + if prefix.chars().all(|c| c.is_alphanumeric() || c == '_') { + return LineType::Generic(prefix.to_string()); + } + } + + LineType::Generic(String::new()) } /// Parse a resctrl line. fn parse_line(line: &str) -> Option> { let line_type = get_line_type(line); - let maybe_tokens = match line_type { + let maybe_tokens = match &line_type { LineType::L3Line => parse_l3_line(line).map(Some), LineType::L3DataLine => parse_l3_line(line).map(Some), LineType::L3CodeLine => parse_l3_line(line).map(Some), LineType::MbLine => parse_mb_line(line).map(Some), - LineType::Unknown => Ok(None), + LineType::Generic(prefix) => { + if prefix.is_empty() { + Ok(None) + } else { + parse_generic_line(line).map(Some) + } + } }; match maybe_tokens { @@ -328,6 +448,22 @@ fn is_same_schema(combined_schema: &str, existing_schema: &str) -> Result Ok(compare_lines(&combined, &existing)) } +/// Retrieves the schemata data to be written to the resctrl filesystem. +/// OCI runtime-spec v1.3.0 introduced the generic `schemata` list, which +/// supersedes `l3_cache_schema` and `mem_bw_schema`. If the list is provided, +/// we join the array elements with a newline `\n` to format them correctly +/// for the Linux kernel's `schemata` file requirements. If omitted, we +/// fall back to combining the legacy fields. +fn get_schemata_data(intel_rdt: &LinuxIntelRdt) -> Option { + if let Some(schemata) = intel_rdt.schemata() { + if !schemata.is_empty() { + return Some(schemata.join("\n")); + } + } + + combine_l3_cache_and_mem_bw_schemas(intel_rdt.l3_cache_schema(), intel_rdt.mem_bw_schema()) +} + /// Combines the l3_cache_schema and mem_bw_schema values together with the /// rules given in Linux OCI runtime config spec. If clos_id_was_set parameter /// is true and the directory wasn't created, the rules say that the schemas @@ -336,13 +472,12 @@ fn is_same_schema(combined_schema: &str, existing_schema: &str) -> Result fn write_resctrl_schemata( path: &Path, id: &str, - l3_cache_schema: &Option, - mem_bw_schema: &Option, + intel_rdt: &LinuxIntelRdt, clos_id_was_set: bool, created_dir: bool, ) -> Result<()> { let schemata = path.to_owned().join(id).join("schemata"); - let maybe_combined_schema = combine_l3_cache_and_mem_bw_schemas(l3_cache_schema, mem_bw_schema); + let maybe_combined_schema = get_schemata_data(intel_rdt); if let Some(combined_schema) = maybe_combined_schema { if clos_id_was_set && !created_dir { @@ -371,48 +506,53 @@ fn write_resctrl_schemata( Ok(()) } -/// Sets up Intel RDT configuration for the container process based on the -/// OCI config. The result bool tells whether or not we need to clean up -/// the created subdirectory. +/// Sets up Intel RDT configuration for the container process based on the OCI config. +/// This handles setting up the main resource allocation group (CLOS), applying schemata, +/// and setting up the dedicated monitoring group if `enable_monitoring` is true. +/// +/// Returns a tuple of two optional paths: `(intel_rdt_dir, intel_rdt_monitoring_dir)`. +/// These indicate whether the runtime created the main group directory and/or the +/// monitoring directory, respectively. The runtime MUST remove these created +/// directories when the container is deleted. pub fn setup_intel_rdt( maybe_container_id: Option<&str>, init_pid: &Pid, intel_rdt: &LinuxIntelRdt, -) -> Result { +) -> Result<(Option, Option)> { // Find mounted resctrl filesystem, error out if it can't be found. - let path = find_resctrl_mount_point().inspect_err(|_err| { + let mount_point = find_resctrl_mount_point().inspect_err(|_err| { tracing::error!("failed to find a mounted resctrl file system"); })?; - let clos_id_set = intel_rdt.clos_id().is_some(); - let only_clos_id_set = - clos_id_set && intel_rdt.l3_cache_schema().is_none() && intel_rdt.mem_bw_schema().is_none(); - let id = match (intel_rdt.clos_id(), maybe_container_id) { - (Some(clos_id), _) => clos_id, - (None, Some(container_id)) => container_id, - (None, None) => Err(IntelRdtError::ResctrlIdNotFound)?, - }; - let created_dir = write_container_pid_to_resctrl_tasks(&path, id, *init_pid, only_clos_id_set) - .inspect_err(|_err| { - tracing::error!("failed to write container pid to resctrl tasks file"); - })?; - write_resctrl_schemata( - &path, - id, - intel_rdt.l3_cache_schema(), - intel_rdt.mem_bw_schema(), - clos_id_set, - created_dir, - ) - .inspect_err(|_err| { - tracing::error!("failed to write schemata to resctrl schemata file"); - })?; + let container_id = maybe_container_id.ok_or(IntelRdtError::ResctrlIdNotFound)?; + let clos_id_set = intel_rdt.clos_id().is_some(); + let id = intel_rdt.clos_id().as_deref().unwrap_or(container_id); + let only_clos_id_set = clos_id_set && get_schemata_data(intel_rdt).is_none(); + let container_dir = mount_point.join(id); + let created_dir = setup_resctrl_group(&container_dir, *init_pid, only_clos_id_set)?; + + write_resctrl_schemata(&mount_point, id, intel_rdt, clos_id_set, created_dir).inspect_err( + |_err| { + tracing::error!("failed to write schemata to resctrl schemata file"); + }, + )?; + + let mut created_monitoring_dir = None; + if intel_rdt.enable_monitoring().unwrap_or(false) { + let mon_dir = container_dir.join("mon_groups").join(container_id); + if setup_monitoring_group(&mon_dir, *init_pid)? { + created_monitoring_dir = Some(mon_dir); + } + } // If closID is not set and the runtime has created the sub-directory, // the runtime MUST remove the sub-directory when the container is deleted. - let need_to_delete_directory = !clos_id_set && created_dir; + let mut need_to_delete_directory = None; + if !clos_id_set && created_dir { + need_to_delete_directory = Some(container_dir); + } - Ok(need_to_delete_directory) + Ok((need_to_delete_directory, created_monitoring_dir)) } #[cfg(test)] @@ -454,6 +594,29 @@ mod test { assert!(val.lines().any(|line| line == "L3:2=f")); assert!(!val.lines().any(|line| line == "MB:0=20;1=70")); + // Generic schemata in the legacy L3 field should be passed through + let l3_generic = "L3:0=f;1=f0\nL2:0=f\nMB:0=20;1=70"; + let res = combine_l3_cache_and_mem_bw_schemas( + &Some(l3_generic.to_owned()), + &Some(bw_1.to_owned()), + ); + assert!(res.is_some()); + let val = res.unwrap(); + assert!(val.lines().any(|line| line == "L2:0=f")); + assert!(!val.lines().any(|line| line == "MB:0=20;1=70")); + + // Messy whitespace and semicolons around MB in L3 should still be stripped + let l3_messy_mb = "L3:0=f\nMB: 0=10; 1=20 ;;"; + let res = combine_l3_cache_and_mem_bw_schemas( + &Some(l3_messy_mb.to_owned()), + &Some(bw_1.to_owned()), + ); + assert!(res.is_some()); + let val = res.unwrap(); + assert!(val.lines().any(|line| line == "L3:0=f")); + assert!(!val.lines().any(|line| line.starts_with("MB: 0=10"))); + assert!(val.lines().any(|line| line == bw_1)); + Ok(()) } @@ -466,6 +629,8 @@ mod test { assert!(is_same_schema("MB:0=bar;1=f0", "MB:0=bar;1=f0")?); assert!(is_same_schema("L3:", "L3:")?); assert!(is_same_schema("MB:", "MB:")?); + assert!(is_same_schema("L2:0=f;1=f0", "L2:0=f;1=f0")?); + assert!(is_same_schema("SMBA:0=20", "SMBA:0=20")?); // Different schemas. assert!(!is_same_schema("L3:0=f;1=f0", "L3:2=f")?); @@ -476,6 +641,10 @@ mod test { assert!(!is_same_schema("L3:0=f", "L3DATA:0=f")?); assert!(!is_same_schema("L3CODE:0=f", "L3:0=f")?); assert!(!is_same_schema("MB:0=f", "L3:0=f")?); + assert!(!is_same_schema("L2:0=f", "L3:0=f")?); + assert!(!is_same_schema("L2:0=f;1=f0", "L2:0=ff;1=f0")?); + assert!(!is_same_schema("SMBA:0=20", "SMBA:0=30")?); + assert!(!is_same_schema("SMBA:0=20", "MBA:0=20")?); // Exact same multi-line schema. assert!(is_same_schema( @@ -483,11 +652,8 @@ mod test { "L3:0=f;1=f0\nL3:2=f" )?); - // Unknown line type is ignored. - assert!(is_same_schema( - "L3:0=f;1=f0\nL3:2=f\nBAR:foo", - "L3:0=f;1=f0\nL3:2=f" - )?); + // Malformed generic line types now cause verification to fail + assert!(is_same_schema("L3:0=f;1=f0\nL3:2=f\nBAR:foo", "L3:0=f;1=f0\nL3:2=f").is_err()); // Different multi-line schema. assert!(!is_same_schema( @@ -507,15 +673,20 @@ mod test { // Same schema, different token order. assert!(is_same_schema("L3:1=f0;0=0", "L3:0=0;1=f0")?); + assert!(is_same_schema("L2:1=f0;0=f", "L2:0=f;1=f0")?); // Same schema, different whitespace and semicolons. assert!(is_same_schema("L3:;; 0 = f; ; 1=f0", "L3:0=f;1 = f0;;")?); + assert!(is_same_schema("L2:;; 0 = f; ; 1=f0", "L2:0=f;1 = f0;;")?); + assert!(is_same_schema("L2:0=f;1=f0;", "L2:0=f;1=f0")?); + assert!(is_same_schema("L2: 0 = ff ", "L2:0=ff")?); // Same schema, different leading zeros in masks. assert!(is_same_schema("L3:0=000f", "L3:0=0f")?); assert!(is_same_schema("L3:0=000f", "L3:0=0f")?); assert!(is_same_schema("L3:0=f", "L3:0=0f")?); assert!(is_same_schema("L3:0=0", "L3:0=0000")?); + assert!(is_same_schema("L2:0=00ff;1=000f0", "L2:0=ff;1=f0")?); // Invalid schemas. assert!(is_same_schema("L3:1=;0=f", "L3:1=;0=f").is_err()); @@ -525,63 +696,150 @@ mod test { assert!(is_same_schema("MB:1=;0=f", "MB:1=;0=f").is_err()); assert!(is_same_schema("MB:=0;0=f", "MB:=0;0=f").is_err()); assert!(is_same_schema("MB:1=0=3;0=f", "MB:1=0=3;0=f").is_err()); + assert!(is_same_schema("L2:0=invalid_hex_string", "L2:0=invalid_hex_string").is_err()); + assert!( + is_same_schema( + "L2:0=0123456789abcdef0123456789abcdef0123456789abcdefzz", + "L2:0=0123456789abcdef0123456789abcdef0123456789abcdefzz" + ) + .is_err() + ); + + // Generic schema handling leading zeros correctly + assert!(is_same_schema("L2:0=00ff;1=000f0", "L2:0=ff;1=f0")?); + + // Zero value edge cases for generic schemas + assert!(is_same_schema("L2:0=0;1=00", "L2:0=0;1=0")?); + + // Empty lines are ignored + assert!(is_same_schema( + "L3:0=f;1=f0\n\nL2:0=f\n", + "L3:0=f;1=f0\nL2:0=f" + )?); + + Ok(()) + } + + #[test] + fn test_get_line_type() { + assert!(matches!(get_line_type("L3:0=f"), LineType::L3Line)); + assert!(matches!(get_line_type("L3DATA:0=f"), LineType::L3DataLine)); + assert!(matches!(get_line_type("L3CODE:0=f"), LineType::L3CodeLine)); + assert!(matches!(get_line_type("MB:0=70"), LineType::MbLine)); + + let generic_l2 = get_line_type("L2:0=f"); + if let LineType::Generic(prefix) = generic_l2 { + assert_eq!(prefix, "L2"); + } else { + panic!("Expected LineType::Generic"); + } + + let generic_smba = get_line_type("SMBA:0=20"); + if let LineType::Generic(prefix) = generic_smba { + assert_eq!(prefix, "SMBA"); + } else { + panic!("Expected LineType::Generic"); + } + } + + #[test] + fn test_parse_generic_line() -> Result<()> { + let parsed = parse_generic_line("L2:0=00ff;1=f0")?; + assert_eq!(parsed.get("0").unwrap(), "ff"); + assert_eq!(parsed.get("1").unwrap(), "f0"); + + let parsed_zero = parse_generic_line("L2:0=0000;1=00")?; + assert_eq!(parsed_zero.get("0").unwrap(), "0"); + assert_eq!(parsed_zero.get("1").unwrap(), "0"); + + assert!(parse_generic_line("L2:0=;1=f0").is_err()); + assert!(parse_generic_line("L2:0=invalid_hex").is_err()); Ok(()) } #[test] - fn test_write_pid_to_resctrl_tasks() -> Result<()> { + fn test_get_schemata_data() { + use oci_spec::runtime::LinuxIntelRdtBuilder; + let rdt_modern = LinuxIntelRdtBuilder::default() + .schemata(vec!["L2:0=f;1=f0".to_owned(), "SMBA:0=20".to_owned()]) + .build() + .unwrap(); + let combined_modern = get_schemata_data(&rdt_modern).unwrap(); + assert_eq!(combined_modern, "L2:0=f;1=f0\nSMBA:0=20"); + } + + #[test] + fn test_setup_resctrl_group() -> Result<()> { let tmp = tempfile::tempdir().unwrap(); // Create the directory for id "foo". - let res = - write_container_pid_to_resctrl_tasks(tmp.path(), "foo", Pid::from_raw(1000), false); + let container_dir = tmp.path().join("foo"); + let res = setup_resctrl_group(&container_dir, Pid::from_raw(1000), false); assert!(res.unwrap()); // new directory created - let res = fs::read_to_string(tmp.path().join("foo").join("tasks")); + let res = fs::read_to_string(container_dir.join("tasks")); assert!(res.unwrap() == "1000"); // Create the same directory the second time. - let res = - write_container_pid_to_resctrl_tasks(tmp.path(), "foo", Pid::from_raw(1500), false); + let res = setup_resctrl_group(&container_dir, Pid::from_raw(1500), false); assert!(!res.unwrap()); // no new directory created - // If just clos_id then throw an error. - let res = - write_container_pid_to_resctrl_tasks(tmp.path(), "foobar", Pid::from_raw(2000), true); + // If just clos_id then throw an error if the directory doesn't exist. + let foobar_dir = tmp.path().join("foobar"); + let res = setup_resctrl_group(&foobar_dir, Pid::from_raw(2000), true); assert!(res.is_err()); // If the directory already exists then it's fine to have just clos_id. - let res = - write_container_pid_to_resctrl_tasks(tmp.path(), "foo", Pid::from_raw(2500), true); + let res = setup_resctrl_group(&container_dir, Pid::from_raw(2500), true); assert!(!res.unwrap()); // no new directory created Ok(()) } #[test] - fn test_write_resctrl_schemata() -> Result<()> { + fn test_setup_monitoring_group() -> Result<()> { let tmp = tempfile::tempdir().unwrap(); - let res = - write_container_pid_to_resctrl_tasks(tmp.path(), "foobar", Pid::from_raw(1000), false); + let mon_dir = tmp.path().join("mon_groups").join("container_id"); + let res = setup_monitoring_group(&mon_dir, Pid::from_raw(1000)); assert!(res.unwrap()); // new directory created + let res = fs::read_to_string(mon_dir.join("tasks")); + assert!(res.unwrap() == "1000"); + + let res = setup_monitoring_group(&mon_dir, Pid::from_raw(1500)); + assert!(!res.unwrap()); // no new directory created + + let res = fs::read_to_string(mon_dir.join("tasks")); + assert!(res.unwrap() == "10001500"); + + Ok(()) + } + + #[test] + fn test_write_resctrl_schemata() -> Result<()> { + use oci_spec::runtime::LinuxIntelRdtBuilder; + let tmp = tempfile::tempdir().unwrap(); + let foobar_dir = tmp.path().join("foobar"); + + let res = setup_resctrl_group(&foobar_dir, Pid::from_raw(1000), false); + assert!(res.unwrap()); + // No schemes, clos_id was not set, directory created (with container id). - let res = write_resctrl_schemata(tmp.path(), "foobar", &None, &None, false, true); + let empty_rdt = LinuxIntelRdtBuilder::default().build().unwrap(); + let res = write_resctrl_schemata(tmp.path(), "foobar", &empty_rdt, false, true); assert!(res.is_ok()); let res = fs::read_to_string(tmp.path().join("foobar").join("schemata")); assert!(res.is_err()); // File not found because no schemes. let l3_1 = "L3:0=f;1=f0\nL3:2=f\nMB:0=20;1=70"; let bw_1 = "MB:0=70;1=20"; - let res = write_resctrl_schemata( - tmp.path(), - "foobar", - &Some(l3_1.to_owned()), - &Some(bw_1.to_owned()), - false, - true, - ); + let rdt_combined = LinuxIntelRdtBuilder::default() + .l3_cache_schema(l3_1.to_owned()) + .mem_bw_schema(bw_1.to_owned()) + .build() + .unwrap(); + let res = write_resctrl_schemata(tmp.path(), "foobar", &rdt_combined, false, true); assert!(res.is_ok()); let res = fs::read_to_string(tmp.path().join("foobar").join("schemata")); @@ -594,29 +852,37 @@ mod test { // Try the verification case. If the directory existed (was not created // by us) and the clos_id was set, it needs to contain the same data as // we are trying to set. This is the same data: - let res = write_resctrl_schemata( - tmp.path(), - "foobar", - &Some(l3_1.to_owned()), - &Some(bw_1.to_owned()), - true, - false, - ); + let res = write_resctrl_schemata(tmp.path(), "foobar", &rdt_combined, true, false); assert!(res.is_ok()); // And this different data: let l3_2 = "L3:0=f;1=f0\nMB:0=20;1=70"; let bw_2 = "MB:0=70;1=20"; - let res = write_resctrl_schemata( - tmp.path(), - "foobar", - &Some(l3_2.to_owned()), - &Some(bw_2.to_owned()), - true, - false, - ); + let rdt_different = LinuxIntelRdtBuilder::default() + .l3_cache_schema(l3_2.to_owned()) + .mem_bw_schema(bw_2.to_owned()) + .build() + .unwrap(); + let res = write_resctrl_schemata(tmp.path(), "foobar", &rdt_different, true, false); assert!(res.is_err()); + // Test modern schemata field overriding everything + let rdt_schemata = LinuxIntelRdtBuilder::default() + .l3_cache_schema(l3_1.to_owned()) + .mem_bw_schema(bw_1.to_owned()) + .schemata(vec!["L2:0=f;1=f0".to_owned()]) + .build() + .unwrap(); + + let foobar_modern_dir = tmp.path().join("foobar_modern"); + let _ = setup_resctrl_group(&foobar_modern_dir, Pid::from_raw(1001), false); + let res = write_resctrl_schemata(tmp.path(), "foobar_modern", &rdt_schemata, false, true); + + assert!(res.is_ok()); + let written_data = + fs::read_to_string(tmp.path().join("foobar_modern").join("schemata")).unwrap(); + assert_eq!(written_data, "L2:0=f;1=f0\n"); + Ok(()) } } diff --git a/crates/libcontainer/src/seccomp/mod.rs b/crates/libcontainer/src/seccomp/mod.rs index 4c94e66f71..55d3adb9ef 100644 --- a/crates/libcontainer/src/seccomp/mod.rs +++ b/crates/libcontainer/src/seccomp/mod.rs @@ -48,12 +48,14 @@ pub enum SeccompError { SetCtlNnp { source: libseccomp::error::SeccompError, }, + #[error("unsupported architecture: {0}")] + UnsupportedArch(String), } type Result = std::result::Result; -fn translate_arch(arch: Arch) -> ScmpArch { - match arch { +fn translate_arch(arch: Arch) -> Result { + Ok(match arch { Arch::ScmpArchNative => ScmpArch::Native, Arch::ScmpArchX86 => ScmpArch::X86, Arch::ScmpArchX86_64 => ScmpArch::X8664, @@ -72,7 +74,12 @@ fn translate_arch(arch: Arch) -> ScmpArch { Arch::ScmpArchS390 => ScmpArch::S390, Arch::ScmpArchS390x => ScmpArch::S390X, Arch::ScmpArchRiscv64 => ScmpArch::Riscv64, - } + // The following architectures were added in OCI runtime-spec v1.3.0 + // (Parisc, Parisc64, Loongarch64, M68k, Sh), but are not yet + // supported by the libseccomp rust crate (v0.4.0). + // Returning an error for now until libseccomp updates its `ScmpArch` enum. + _ => return Err(SeccompError::UnsupportedArch(format!("{arch:?}"))), + }) } fn translate_action(action: LinuxSeccompAction, errno: Option) -> Result { @@ -172,7 +179,7 @@ pub fn initialize_seccomp(seccomp: &LinuxSeccomp) -> Result> { if let Some(architectures) = seccomp.architectures() { for &arch in architectures { tracing::trace!(?arch, "adding architecture"); - ctx.add_arch(translate_arch(arch)) + ctx.add_arch(translate_arch(arch)?) .map_err(|err| SeccompError::AddArch { source: err, arch })?; } } diff --git a/tests/contest/contest/src/tests/intel_rdt/intel_rdt_test.rs b/tests/contest/contest/src/tests/intel_rdt/intel_rdt_test.rs index ef3dbc55e4..8bb4ee2609 100644 --- a/tests/contest/contest/src/tests/intel_rdt/intel_rdt_test.rs +++ b/tests/contest/contest/src/tests/intel_rdt/intel_rdt_test.rs @@ -10,11 +10,13 @@ fn create_spec( maybe_l3_cache: Option<&str>, maybe_mem_bw: Option<&str>, maybe_clos_id: Option<&str>, + enable_monitoring: Option, ) -> Result { let mut intel_rdt = LinuxIntelRdt::default(); intel_rdt.set_l3_cache_schema(maybe_l3_cache.map(|x| x.to_owned())); intel_rdt.set_mem_bw_schema(maybe_mem_bw.map(|x| x.to_owned())); intel_rdt.set_clos_id(maybe_clos_id.map(|x| x.to_owned())); + intel_rdt.set_enable_monitoring(enable_monitoring); // Create the Linux Spec let linux_spec = LinuxBuilder::default() @@ -33,16 +35,46 @@ fn create_spec( pub fn test_intel_rdt() -> TestResult { let cases = vec![ - test_result!(create_spec(Some("L3:0=fff"), Some("MB:0=70"), None)), - test_result!(create_spec(Some("L3:0=fff"), None, None)), - test_result!(create_spec(None, Some("MB:0=70"), None)), - test_result!(create_spec(None, None, None)), + test_result!(create_spec(Some("L3:0=fff"), Some("MB:0=70"), None, None)), + test_result!(create_spec(Some("L3:0=fff"), None, None, None)), + test_result!(create_spec(None, Some("MB:0=70"), None, None)), + test_result!(create_spec(None, None, None, None)), + test_result!(create_spec(None, None, None, Some(true))), + test_result!(create_spec( + Some("L3:0=fff"), + Some("MB:0=70"), + None, + Some(true) + )), + test_result!(create_spec(Some("L3:0=fff"), None, None, Some(true))), + test_result!(create_spec(None, Some("MB:0=70"), None, Some(true))), ]; for spec in cases.into_iter() { + let spec_clone = spec.clone(); let test_result = test_outside_container(&spec, &|data| { test_result!(check_container_created(&data)); + // If enable_monitoring is true, verify the mon_groups directory exists + if let Some(linux) = spec_clone.linux() + && let Some(intel_rdt) = linux.intel_rdt() + && let Some(true) = intel_rdt.enable_monitoring() + { + let mount_point = find_resctrl_mount_point().unwrap(); + let container_id = &data.id; + let clos_id = intel_rdt.clos_id().as_deref().unwrap_or(container_id); + let mon_dir = mount_point + .join(clos_id) + .join("mon_groups") + .join(container_id); + if !mon_dir.exists() { + return TestResult::Failed(anyhow::anyhow!( + "Monitoring directory does not exist: {:?}", + mon_dir + )); + } + } + TestResult::Passed }); if let TestResult::Failed(_) = test_result {