From 5c8369621ee7415a8cb272dfd2762973b65da864 Mon Sep 17 00:00:00 2001 From: Eric Traut Date: Thu, 14 May 2026 17:55:50 -0700 Subject: [PATCH 1/3] tui: recover local state db startup failures --- codex-rs/cli/src/main.rs | 234 ++++++++++++++++++++++- codex-rs/tui/src/cli.rs | 4 +- codex-rs/tui/src/lib.rs | 58 +++++- codex-rs/tui/src/startup_error.rs | 29 +++ codex-rs/utils/cli/src/shared_options.rs | 2 +- 5 files changed, 311 insertions(+), 16 deletions(-) create mode 100644 codex-rs/tui/src/startup_error.rs diff --git a/codex-rs/cli/src/main.rs b/codex-rs/cli/src/main.rs index 6e1ccac8673f..fb76a28ddd85 100644 --- a/codex-rs/cli/src/main.rs +++ b/codex-rs/cli/src/main.rs @@ -34,6 +34,7 @@ use codex_state::state_db_path; use codex_tui::AppExitInfo; use codex_tui::Cli as TuiCli; use codex_tui::ExitReason; +use codex_tui::LocalStateDbStartupError; use codex_tui::UpdateAction; use codex_utils_absolute_path::AbsolutePathBuf; use codex_utils_cli::CliConfigOverrides; @@ -1981,13 +1982,55 @@ async fn run_interactive_tui( }; *slot = Some(auth_token); } - codex_tui::run_main( - interactive, - arg0_paths, - codex_config::LoaderOverrides::default(), - remote_endpoint, - ) - .await + let start_tui = || { + codex_tui::run_main( + interactive.clone(), + arg0_paths.clone(), + codex_config::LoaderOverrides::default(), + remote_endpoint.clone(), + ) + }; + match start_tui().await { + Ok(exit_info) => Ok(exit_info), + Err(err) => { + let Some(startup_error) = local_state_db_startup_error(&err) else { + return Err(err); + }; + if local_state_db_is_locked(startup_error.detail()) { + print_local_state_db_locked_guidance(startup_error); + return Ok(AppExitInfo::fatal(startup_error.to_string())); + } + if !confirm_local_state_db_repair(startup_error)? { + print_local_state_db_diagnostic_guidance(startup_error); + return Ok(AppExitInfo::fatal(startup_error.to_string())); + } + + match repair_local_state_db_files(startup_error).await { + Ok(backups) => print_local_state_db_repair_backups(&backups), + Err(repair_err) => { + print_local_state_db_diagnostic_guidance(startup_error); + return Ok(AppExitInfo::fatal(format!( + "failed to repair Codex local data automatically: {repair_err}" + ))); + } + } + + match start_tui().await { + Ok(exit_info) => Ok(exit_info), + Err(retry_err) => { + let Some(retry_startup_error) = local_state_db_startup_error(&retry_err) else { + return Err(retry_err); + }; + if local_state_db_is_locked(retry_startup_error.detail()) { + print_local_state_db_locked_guidance(retry_startup_error); + } else { + print_local_state_db_diagnostic_guidance(retry_startup_error); + } + Ok(AppExitInfo::fatal(retry_startup_error.to_string())) + } + } + } + } } fn confirm(prompt: &str) -> std::io::Result { @@ -1999,6 +2042,131 @@ fn confirm(prompt: &str) -> std::io::Result { Ok(answer.eq_ignore_ascii_case("y") || answer.eq_ignore_ascii_case("yes")) } +fn local_state_db_startup_error(err: &std::io::Error) -> Option<&LocalStateDbStartupError> { + err.get_ref() + .and_then(|err| err.downcast_ref::()) +} + +fn local_state_db_is_locked(detail: &str) -> bool { + let detail = detail.to_ascii_lowercase(); + detail.contains("database is locked") || detail.contains("database is busy") +} + +fn confirm_local_state_db_repair( + startup_error: &LocalStateDbStartupError, +) -> std::io::Result { + eprintln!("Codex couldn't start because its local database appears to be damaged."); + eprintln!("Codex can try a safe repair by backing up those files and rebuilding them."); + print_local_state_db_technical_details(startup_error); + confirm("Repair Codex local data now? [y/N]: ") +} + +async fn repair_local_state_db_files( + startup_error: &LocalStateDbStartupError, +) -> std::io::Result> { + let state_db_path = startup_error.state_db_path(); + let sqlite_home = state_db_path.parent().ok_or_else(|| { + std::io::Error::other("state database path does not have a parent directory") + })?; + let timestamp = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .map_or(0, |duration| duration.as_secs()); + let repair_suffix = format!("codex-repair-{timestamp}"); + let mut backups = Vec::new(); + + match tokio::fs::metadata(sqlite_home).await { + Ok(metadata) if metadata.is_dir() => {} + Ok(_) => { + backups.push(backup_local_state_path(sqlite_home, &repair_suffix).await?); + tokio::fs::create_dir_all(sqlite_home).await?; + } + Err(err) if err.kind() == std::io::ErrorKind::NotFound => { + tokio::fs::create_dir_all(sqlite_home).await?; + } + Err(err) => return Err(err), + } + + let logs_db_path = codex_state::logs_db_path(sqlite_home); + for path in sqlite_repair_paths(state_db_path) + .into_iter() + .chain(sqlite_repair_paths(logs_db_path.as_path())) + { + if tokio::fs::try_exists(path.as_path()).await? { + backups.push(backup_local_state_path(path.as_path(), &repair_suffix).await?); + } + } + + if backups.is_empty() { + return Err(std::io::Error::other( + "no repairable Codex local data files were found", + )); + } + + Ok(backups) +} + +fn sqlite_repair_paths(db_path: &std::path::Path) -> Vec { + let mut wal_path = db_path.as_os_str().to_os_string(); + wal_path.push("-wal"); + let mut shm_path = db_path.as_os_str().to_os_string(); + shm_path.push("-shm"); + vec![ + db_path.to_path_buf(), + PathBuf::from(wal_path), + PathBuf::from(shm_path), + ] +} + +async fn backup_local_state_path( + path: &std::path::Path, + repair_suffix: &str, +) -> std::io::Result { + let file_name = path.file_name().ok_or_else(|| { + std::io::Error::other(format!( + "cannot create a repair backup name for {}", + path.display() + )) + })?; + let mut sequence = 0; + loop { + let mut backup_name = file_name.to_os_string(); + backup_name.push(format!(".{repair_suffix}.{sequence}.bak")); + let backup_path = path.with_file_name(backup_name); + if !tokio::fs::try_exists(backup_path.as_path()).await? { + tokio::fs::rename(path, backup_path.as_path()).await?; + return Ok(backup_path); + } + sequence += 1; + } +} + +fn print_local_state_db_repair_backups(backups: &[PathBuf]) { + eprintln!("Backed up Codex local data before repair:"); + for backup in backups { + eprintln!(" {}", backup.display()); + } + eprintln!("Retrying startup with rebuilt local data..."); +} + +fn print_local_state_db_diagnostic_guidance(startup_error: &LocalStateDbStartupError) { + eprintln!("Codex couldn't start because its local database appears to be damaged."); + eprintln!("Run `codex doctor` to check your setup and get next-step guidance."); + eprintln!("If this keeps happening, share the technical details below when asking for help."); + print_local_state_db_technical_details(startup_error); +} + +fn print_local_state_db_technical_details(startup_error: &LocalStateDbStartupError) { + eprintln!("Technical details:"); + eprintln!(" Location: {}", startup_error.state_db_path().display()); + eprintln!(" Cause: {}", startup_error.detail()); +} + +fn print_local_state_db_locked_guidance(startup_error: &LocalStateDbStartupError) { + eprintln!("Codex couldn't start because another Codex process is using its local data."); + eprintln!("Quit any other copies of Codex that may still be running, then try again."); + print_local_state_db_technical_details(startup_error); +} + /// Build the final `TuiCli` for a `codex resume` invocation. fn finalize_resume_interactive( mut interactive: TuiCli, @@ -2102,6 +2270,7 @@ mod tests { use codex_protocol::ThreadId; use codex_tui::TokenUsage; use pretty_assertions::assert_eq; + use tempfile::TempDir; fn finalize_resume_from_args(args: &[&str]) -> TuiCli { let cli = MultitoolCli::try_parse_from(args).expect("parse"); @@ -3325,4 +3494,55 @@ mod tests { .to_overrides() .expect_err("feature should be rejected") } + + #[tokio::test] + async fn local_state_db_repair_backs_up_owned_database_files() -> std::io::Result<()> { + let temp_dir = TempDir::new()?; + let state_path = codex_state::state_db_path(temp_dir.path()); + let logs_path = codex_state::logs_db_path(temp_dir.path()); + let state_sidecars = sqlite_repair_paths(state_path.as_path()); + tokio::fs::write(state_path.as_path(), b"state").await?; + tokio::fs::write(state_sidecars[1].as_path(), b"state-wal").await?; + tokio::fs::write(logs_path.as_path(), b"logs").await?; + + let startup_error = + LocalStateDbStartupError::new(state_path.clone(), "corrupt".to_string()); + let backups = repair_local_state_db_files(&startup_error).await?; + + assert_eq!(backups.len(), 3); + assert!(!tokio::fs::try_exists(state_path.as_path()).await?); + assert!(!tokio::fs::try_exists(state_sidecars[1].as_path()).await?); + assert!(!tokio::fs::try_exists(logs_path.as_path()).await?); + for backup in backups { + assert!(tokio::fs::try_exists(backup.as_path()).await?); + } + Ok(()) + } + + #[tokio::test] + async fn local_state_db_repair_replaces_blocking_sqlite_home_file() -> std::io::Result<()> { + let temp_dir = TempDir::new()?; + let sqlite_home = temp_dir.path().join("sqlite-home"); + tokio::fs::write(sqlite_home.as_path(), b"not-a-directory").await?; + let startup_error = LocalStateDbStartupError::new( + codex_state::state_db_path(sqlite_home.as_path()), + "File exists".to_string(), + ); + + let backups = repair_local_state_db_files(&startup_error).await?; + + assert_eq!(backups.len(), 1); + assert!(tokio::fs::metadata(sqlite_home.as_path()).await?.is_dir()); + assert!(tokio::fs::try_exists(backups[0].as_path()).await?); + Ok(()) + } + + #[test] + fn local_state_db_lock_failures_skip_repair() { + assert!(local_state_db_is_locked("database is locked")); + assert!(local_state_db_is_locked("database is busy")); + assert!(!local_state_db_is_locked( + "database disk image is malformed" + )); + } } diff --git a/codex-rs/tui/src/cli.rs b/codex-rs/tui/src/cli.rs index 4ec69bf26f33..13ddffb14a54 100644 --- a/codex-rs/tui/src/cli.rs +++ b/codex-rs/tui/src/cli.rs @@ -5,7 +5,7 @@ use codex_utils_cli::ApprovalModeCliArg; use codex_utils_cli::CliConfigOverrides; use codex_utils_cli::SharedCliOptions; -#[derive(Parser, Debug)] +#[derive(Parser, Clone, Debug)] #[command(version)] pub struct Cli { /// Optional user prompt to start the session. @@ -89,7 +89,7 @@ impl std::ops::DerefMut for Cli { } } -#[derive(Debug, Default)] +#[derive(Clone, Debug, Default)] pub struct TuiSharedCliOptions(SharedCliOptions); impl TuiSharedCliOptions { diff --git a/codex-rs/tui/src/lib.rs b/codex-rs/tui/src/lib.rs index 32ca6617f74f..8558987e9fa2 100644 --- a/codex-rs/tui/src/lib.rs +++ b/codex-rs/tui/src/lib.rs @@ -15,6 +15,7 @@ use crate::legacy_core::format_exec_policy_error_with_source; use crate::legacy_core::windows_sandbox::WindowsSandboxLevelExt; use crate::session_resume::ResolveCwdOutcome; use crate::session_resume::resolve_cwd_for_resume_or_fork; +pub use crate::startup_error::LocalStateDbStartupError; use additional_dirs::add_dir_warning_message; use app::App; pub use app::AppExitInfo; @@ -167,6 +168,7 @@ mod session_state; mod shimmer; mod skills_helpers; mod slash_command; +mod startup_error; mod startup_hooks_review; mod status; mod status_indicator_widget; @@ -312,6 +314,21 @@ pub(crate) enum AppServerTarget { Remote { endpoint: RemoteAppServerEndpoint }, } +async fn init_state_db_for_app_server_target( + config: &Config, + app_server_target: &AppServerTarget, +) -> std::io::Result> { + match app_server_target { + AppServerTarget::Embedded => state_db::try_init(config).await.map(Some).map_err(|err| { + std::io::Error::other(LocalStateDbStartupError::new( + codex_state::state_db_path(config.sqlite_home.as_path()), + err.to_string(), + )) + }), + AppServerTarget::Remote { .. } => Ok(state_db::get_state_db(config).await), + } +} + fn remote_addr_has_explicit_port(addr: &str, parsed: &Url) -> bool { let Some(host) = parsed.host_str() else { return false; @@ -509,7 +526,7 @@ pub(crate) async fn start_app_server_for_picker( pub(crate) async fn start_embedded_app_server_for_picker( config: &Config, ) -> color_eyre::Result { - let state_db = state_db::init(config).await; + let state_db = init_state_db_for_app_server_target(config, &AppServerTarget::Embedded).await?; start_app_server_for_picker( config, &AppServerTarget::Embedded, @@ -989,10 +1006,7 @@ pub async fn run_main( otel.as_ref(), otel_originator.as_str(), ); - let state_db = match &app_server_target { - AppServerTarget::Embedded => state_db::init(&config).await, - AppServerTarget::Remote { .. } => state_db::get_state_db(&config).await, - }; + let state_db = init_state_db_for_app_server_target(&config, &app_server_target).await?; let effective_toml = config.config_layer_stack.effective_config(); match effective_toml.try_into() { @@ -1823,7 +1837,8 @@ mod tests { async fn start_test_embedded_app_server( config: Config, ) -> color_eyre::Result { - let state_db = state_db::init(&config).await; + let state_db = + init_state_db_for_app_server_target(&config, &AppServerTarget::Embedded).await?; start_embedded_app_server( Arg0DispatchPaths::default(), config, @@ -2416,6 +2431,37 @@ mod tests { ); Ok(()) } + + #[tokio::test] + async fn embedded_state_db_failure_is_typed_for_cli_recovery() -> color_eyre::Result<()> { + let temp_dir = TempDir::new()?; + let mut config = build_config(&temp_dir).await?; + let occupied_sqlite_home = temp_dir.path().join("sqlite-home"); + std::fs::write(&occupied_sqlite_home, "occupied")?; + config.sqlite_home = occupied_sqlite_home.clone(); + + let err = + match init_state_db_for_app_server_target(&config, &AppServerTarget::Embedded).await { + Ok(_) => panic!("embedded startup should surface state db init failures"), + Err(err) => err, + }; + let startup_error = err + .get_ref() + .and_then(|err| err.downcast_ref::()) + .expect("state db startup failure should retain its typed context"); + + assert_eq!( + startup_error.state_db_path(), + codex_state::state_db_path(occupied_sqlite_home.as_path()).as_path() + ); + assert!( + startup_error + .detail() + .contains("failed to initialize state runtime"), + "startup error should preserve the underlying state db failure" + ); + Ok(()) + } #[tokio::test] #[serial] async fn windows_shows_trust_prompt_with_sandbox() -> std::io::Result<()> { diff --git a/codex-rs/tui/src/startup_error.rs b/codex-rs/tui/src/startup_error.rs new file mode 100644 index 000000000000..55b691d836b1 --- /dev/null +++ b/codex-rs/tui/src/startup_error.rs @@ -0,0 +1,29 @@ +use std::path::Path; +use std::path::PathBuf; + +#[derive(Debug, thiserror::Error)] +#[error( + "failed to initialize sqlite state db at {}: {detail}", + state_db_path.display() +)] +pub struct LocalStateDbStartupError { + state_db_path: PathBuf, + detail: String, +} + +impl LocalStateDbStartupError { + pub fn new(state_db_path: PathBuf, detail: String) -> Self { + Self { + state_db_path, + detail, + } + } + + pub fn state_db_path(&self) -> &Path { + self.state_db_path.as_path() + } + + pub fn detail(&self) -> &str { + self.detail.as_str() + } +} diff --git a/codex-rs/utils/cli/src/shared_options.rs b/codex-rs/utils/cli/src/shared_options.rs index 8735cb6db8ca..b59a3b5c8535 100644 --- a/codex-rs/utils/cli/src/shared_options.rs +++ b/codex-rs/utils/cli/src/shared_options.rs @@ -5,7 +5,7 @@ use clap::Args; use codex_protocol::config_types::ProfileV2Name; use std::path::PathBuf; -#[derive(Args, Debug, Default)] +#[derive(Args, Clone, Debug, Default)] pub struct SharedCliOptions { /// Optional image(s) to attach to the initial prompt. #[arg( From c79d7da70192db177b6d7f3068b506e566e68eb6 Mon Sep 17 00:00:00 2001 From: Eric Traut Date: Thu, 14 May 2026 18:19:59 -0700 Subject: [PATCH 2/3] cli: isolate state db recovery flow --- codex-rs/cli/src/main.rs | 205 ++------------------------ codex-rs/cli/src/state_db_recovery.rs | 178 ++++++++++++++++++++++ 2 files changed, 193 insertions(+), 190 deletions(-) create mode 100644 codex-rs/cli/src/state_db_recovery.rs diff --git a/codex-rs/cli/src/main.rs b/codex-rs/cli/src/main.rs index fb76a28ddd85..dce9891e61f9 100644 --- a/codex-rs/cli/src/main.rs +++ b/codex-rs/cli/src/main.rs @@ -34,7 +34,6 @@ use codex_state::state_db_path; use codex_tui::AppExitInfo; use codex_tui::Cli as TuiCli; use codex_tui::ExitReason; -use codex_tui::LocalStateDbStartupError; use codex_tui::UpdateAction; use codex_utils_absolute_path::AbsolutePathBuf; use codex_utils_cli::CliConfigOverrides; @@ -52,6 +51,7 @@ mod doctor; mod marketplace_cmd; mod mcp_cmd; mod plugin_cmd; +mod state_db_recovery; #[cfg(not(windows))] mod wsl_paths; @@ -59,6 +59,7 @@ use crate::mcp_cmd::McpCli; use crate::plugin_cmd::PluginCli; use crate::plugin_cmd::PluginSubcommand; use doctor::DoctorCommand; +use state_db_recovery as local_state_db; use codex_config::LoaderOverrides; use codex_core::build_models_manager; @@ -1993,22 +1994,22 @@ async fn run_interactive_tui( match start_tui().await { Ok(exit_info) => Ok(exit_info), Err(err) => { - let Some(startup_error) = local_state_db_startup_error(&err) else { + let Some(startup_error) = local_state_db::startup_error(&err) else { return Err(err); }; - if local_state_db_is_locked(startup_error.detail()) { - print_local_state_db_locked_guidance(startup_error); + if local_state_db::is_locked(startup_error.detail()) { + local_state_db::print_locked_guidance(startup_error); return Ok(AppExitInfo::fatal(startup_error.to_string())); } - if !confirm_local_state_db_repair(startup_error)? { - print_local_state_db_diagnostic_guidance(startup_error); + if !local_state_db::confirm_repair(startup_error)? { + local_state_db::print_diagnostic_guidance(startup_error); return Ok(AppExitInfo::fatal(startup_error.to_string())); } - match repair_local_state_db_files(startup_error).await { - Ok(backups) => print_local_state_db_repair_backups(&backups), + match local_state_db::repair_files(startup_error).await { + Ok(backups) => local_state_db::print_repair_backups(&backups), Err(repair_err) => { - print_local_state_db_diagnostic_guidance(startup_error); + local_state_db::print_diagnostic_guidance(startup_error); return Ok(AppExitInfo::fatal(format!( "failed to repair Codex local data automatically: {repair_err}" ))); @@ -2018,13 +2019,14 @@ async fn run_interactive_tui( match start_tui().await { Ok(exit_info) => Ok(exit_info), Err(retry_err) => { - let Some(retry_startup_error) = local_state_db_startup_error(&retry_err) else { + let Some(retry_startup_error) = local_state_db::startup_error(&retry_err) + else { return Err(retry_err); }; - if local_state_db_is_locked(retry_startup_error.detail()) { - print_local_state_db_locked_guidance(retry_startup_error); + if local_state_db::is_locked(retry_startup_error.detail()) { + local_state_db::print_locked_guidance(retry_startup_error); } else { - print_local_state_db_diagnostic_guidance(retry_startup_error); + local_state_db::print_diagnostic_guidance(retry_startup_error); } Ok(AppExitInfo::fatal(retry_startup_error.to_string())) } @@ -2042,131 +2044,6 @@ fn confirm(prompt: &str) -> std::io::Result { Ok(answer.eq_ignore_ascii_case("y") || answer.eq_ignore_ascii_case("yes")) } -fn local_state_db_startup_error(err: &std::io::Error) -> Option<&LocalStateDbStartupError> { - err.get_ref() - .and_then(|err| err.downcast_ref::()) -} - -fn local_state_db_is_locked(detail: &str) -> bool { - let detail = detail.to_ascii_lowercase(); - detail.contains("database is locked") || detail.contains("database is busy") -} - -fn confirm_local_state_db_repair( - startup_error: &LocalStateDbStartupError, -) -> std::io::Result { - eprintln!("Codex couldn't start because its local database appears to be damaged."); - eprintln!("Codex can try a safe repair by backing up those files and rebuilding them."); - print_local_state_db_technical_details(startup_error); - confirm("Repair Codex local data now? [y/N]: ") -} - -async fn repair_local_state_db_files( - startup_error: &LocalStateDbStartupError, -) -> std::io::Result> { - let state_db_path = startup_error.state_db_path(); - let sqlite_home = state_db_path.parent().ok_or_else(|| { - std::io::Error::other("state database path does not have a parent directory") - })?; - let timestamp = std::time::SystemTime::now() - .duration_since(std::time::UNIX_EPOCH) - .map_or(0, |duration| duration.as_secs()); - let repair_suffix = format!("codex-repair-{timestamp}"); - let mut backups = Vec::new(); - - match tokio::fs::metadata(sqlite_home).await { - Ok(metadata) if metadata.is_dir() => {} - Ok(_) => { - backups.push(backup_local_state_path(sqlite_home, &repair_suffix).await?); - tokio::fs::create_dir_all(sqlite_home).await?; - } - Err(err) if err.kind() == std::io::ErrorKind::NotFound => { - tokio::fs::create_dir_all(sqlite_home).await?; - } - Err(err) => return Err(err), - } - - let logs_db_path = codex_state::logs_db_path(sqlite_home); - for path in sqlite_repair_paths(state_db_path) - .into_iter() - .chain(sqlite_repair_paths(logs_db_path.as_path())) - { - if tokio::fs::try_exists(path.as_path()).await? { - backups.push(backup_local_state_path(path.as_path(), &repair_suffix).await?); - } - } - - if backups.is_empty() { - return Err(std::io::Error::other( - "no repairable Codex local data files were found", - )); - } - - Ok(backups) -} - -fn sqlite_repair_paths(db_path: &std::path::Path) -> Vec { - let mut wal_path = db_path.as_os_str().to_os_string(); - wal_path.push("-wal"); - let mut shm_path = db_path.as_os_str().to_os_string(); - shm_path.push("-shm"); - vec![ - db_path.to_path_buf(), - PathBuf::from(wal_path), - PathBuf::from(shm_path), - ] -} - -async fn backup_local_state_path( - path: &std::path::Path, - repair_suffix: &str, -) -> std::io::Result { - let file_name = path.file_name().ok_or_else(|| { - std::io::Error::other(format!( - "cannot create a repair backup name for {}", - path.display() - )) - })?; - let mut sequence = 0; - loop { - let mut backup_name = file_name.to_os_string(); - backup_name.push(format!(".{repair_suffix}.{sequence}.bak")); - let backup_path = path.with_file_name(backup_name); - if !tokio::fs::try_exists(backup_path.as_path()).await? { - tokio::fs::rename(path, backup_path.as_path()).await?; - return Ok(backup_path); - } - sequence += 1; - } -} - -fn print_local_state_db_repair_backups(backups: &[PathBuf]) { - eprintln!("Backed up Codex local data before repair:"); - for backup in backups { - eprintln!(" {}", backup.display()); - } - eprintln!("Retrying startup with rebuilt local data..."); -} - -fn print_local_state_db_diagnostic_guidance(startup_error: &LocalStateDbStartupError) { - eprintln!("Codex couldn't start because its local database appears to be damaged."); - eprintln!("Run `codex doctor` to check your setup and get next-step guidance."); - eprintln!("If this keeps happening, share the technical details below when asking for help."); - print_local_state_db_technical_details(startup_error); -} - -fn print_local_state_db_technical_details(startup_error: &LocalStateDbStartupError) { - eprintln!("Technical details:"); - eprintln!(" Location: {}", startup_error.state_db_path().display()); - eprintln!(" Cause: {}", startup_error.detail()); -} - -fn print_local_state_db_locked_guidance(startup_error: &LocalStateDbStartupError) { - eprintln!("Codex couldn't start because another Codex process is using its local data."); - eprintln!("Quit any other copies of Codex that may still be running, then try again."); - print_local_state_db_technical_details(startup_error); -} - /// Build the final `TuiCli` for a `codex resume` invocation. fn finalize_resume_interactive( mut interactive: TuiCli, @@ -2270,7 +2147,6 @@ mod tests { use codex_protocol::ThreadId; use codex_tui::TokenUsage; use pretty_assertions::assert_eq; - use tempfile::TempDir; fn finalize_resume_from_args(args: &[&str]) -> TuiCli { let cli = MultitoolCli::try_parse_from(args).expect("parse"); @@ -3494,55 +3370,4 @@ mod tests { .to_overrides() .expect_err("feature should be rejected") } - - #[tokio::test] - async fn local_state_db_repair_backs_up_owned_database_files() -> std::io::Result<()> { - let temp_dir = TempDir::new()?; - let state_path = codex_state::state_db_path(temp_dir.path()); - let logs_path = codex_state::logs_db_path(temp_dir.path()); - let state_sidecars = sqlite_repair_paths(state_path.as_path()); - tokio::fs::write(state_path.as_path(), b"state").await?; - tokio::fs::write(state_sidecars[1].as_path(), b"state-wal").await?; - tokio::fs::write(logs_path.as_path(), b"logs").await?; - - let startup_error = - LocalStateDbStartupError::new(state_path.clone(), "corrupt".to_string()); - let backups = repair_local_state_db_files(&startup_error).await?; - - assert_eq!(backups.len(), 3); - assert!(!tokio::fs::try_exists(state_path.as_path()).await?); - assert!(!tokio::fs::try_exists(state_sidecars[1].as_path()).await?); - assert!(!tokio::fs::try_exists(logs_path.as_path()).await?); - for backup in backups { - assert!(tokio::fs::try_exists(backup.as_path()).await?); - } - Ok(()) - } - - #[tokio::test] - async fn local_state_db_repair_replaces_blocking_sqlite_home_file() -> std::io::Result<()> { - let temp_dir = TempDir::new()?; - let sqlite_home = temp_dir.path().join("sqlite-home"); - tokio::fs::write(sqlite_home.as_path(), b"not-a-directory").await?; - let startup_error = LocalStateDbStartupError::new( - codex_state::state_db_path(sqlite_home.as_path()), - "File exists".to_string(), - ); - - let backups = repair_local_state_db_files(&startup_error).await?; - - assert_eq!(backups.len(), 1); - assert!(tokio::fs::metadata(sqlite_home.as_path()).await?.is_dir()); - assert!(tokio::fs::try_exists(backups[0].as_path()).await?); - Ok(()) - } - - #[test] - fn local_state_db_lock_failures_skip_repair() { - assert!(local_state_db_is_locked("database is locked")); - assert!(local_state_db_is_locked("database is busy")); - assert!(!local_state_db_is_locked( - "database disk image is malformed" - )); - } } diff --git a/codex-rs/cli/src/state_db_recovery.rs b/codex-rs/cli/src/state_db_recovery.rs new file mode 100644 index 000000000000..cc5f9e29166c --- /dev/null +++ b/codex-rs/cli/src/state_db_recovery.rs @@ -0,0 +1,178 @@ +use codex_tui::LocalStateDbStartupError; +use std::path::PathBuf; + +pub(crate) fn startup_error(err: &std::io::Error) -> Option<&LocalStateDbStartupError> { + err.get_ref() + .and_then(|err| err.downcast_ref::()) +} + +pub(crate) fn is_locked(detail: &str) -> bool { + let detail = detail.to_ascii_lowercase(); + detail.contains("database is locked") || detail.contains("database is busy") +} + +pub(crate) fn confirm_repair(startup_error: &LocalStateDbStartupError) -> std::io::Result { + eprintln!("Codex couldn't start because its local database appears to be damaged."); + eprintln!("Codex can try a safe repair by backing up those files and rebuilding them."); + print_technical_details(startup_error); + crate::confirm("Repair Codex local data now? [y/N]: ") +} + +pub(crate) async fn repair_files( + startup_error: &LocalStateDbStartupError, +) -> std::io::Result> { + let state_db_path = startup_error.state_db_path(); + let sqlite_home = state_db_path.parent().ok_or_else(|| { + std::io::Error::other("state database path does not have a parent directory") + })?; + let timestamp = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .map_or(0, |duration| duration.as_secs()); + let repair_suffix = format!("codex-repair-{timestamp}"); + let mut backups = Vec::new(); + + match tokio::fs::metadata(sqlite_home).await { + Ok(metadata) if metadata.is_dir() => {} + Ok(_) => { + backups.push(backup_path(sqlite_home, &repair_suffix).await?); + tokio::fs::create_dir_all(sqlite_home).await?; + } + Err(err) if err.kind() == std::io::ErrorKind::NotFound => { + tokio::fs::create_dir_all(sqlite_home).await?; + } + Err(err) => return Err(err), + } + + let logs_db_path = codex_state::logs_db_path(sqlite_home); + for path in sqlite_paths(state_db_path) + .into_iter() + .chain(sqlite_paths(logs_db_path.as_path())) + { + if tokio::fs::try_exists(path.as_path()).await? { + backups.push(backup_path(path.as_path(), &repair_suffix).await?); + } + } + + if backups.is_empty() { + return Err(std::io::Error::other( + "no repairable Codex local data files were found", + )); + } + + Ok(backups) +} + +pub(crate) fn print_repair_backups(backups: &[PathBuf]) { + eprintln!("Backed up Codex local data before repair:"); + for backup in backups { + eprintln!(" {}", backup.display()); + } + eprintln!("Retrying startup with rebuilt local data..."); +} + +pub(crate) fn print_diagnostic_guidance(startup_error: &LocalStateDbStartupError) { + eprintln!("Codex couldn't start because its local database appears to be damaged."); + eprintln!("Run `codex doctor` to check your setup and get next-step guidance."); + eprintln!("If this keeps happening, share the technical details below when asking for help."); + print_technical_details(startup_error); +} + +pub(crate) fn print_locked_guidance(startup_error: &LocalStateDbStartupError) { + eprintln!("Codex couldn't start because another Codex process is using its local data."); + eprintln!("Quit any other copies of Codex that may still be running, then try again."); + print_technical_details(startup_error); +} + +fn sqlite_paths(db_path: &std::path::Path) -> Vec { + let mut wal_path = db_path.as_os_str().to_os_string(); + wal_path.push("-wal"); + let mut shm_path = db_path.as_os_str().to_os_string(); + shm_path.push("-shm"); + vec![ + db_path.to_path_buf(), + PathBuf::from(wal_path), + PathBuf::from(shm_path), + ] +} + +async fn backup_path(path: &std::path::Path, repair_suffix: &str) -> std::io::Result { + let file_name = path.file_name().ok_or_else(|| { + std::io::Error::other(format!( + "cannot create a repair backup name for {}", + path.display() + )) + })?; + let mut sequence = 0; + loop { + let mut backup_name = file_name.to_os_string(); + backup_name.push(format!(".{repair_suffix}.{sequence}.bak")); + let backup_path = path.with_file_name(backup_name); + if !tokio::fs::try_exists(backup_path.as_path()).await? { + tokio::fs::rename(path, backup_path.as_path()).await?; + return Ok(backup_path); + } + sequence += 1; + } +} + +fn print_technical_details(startup_error: &LocalStateDbStartupError) { + eprintln!("Technical details:"); + eprintln!(" Location: {}", startup_error.state_db_path().display()); + eprintln!(" Cause: {}", startup_error.detail()); +} + +#[cfg(test)] +mod tests { + use super::*; + use pretty_assertions::assert_eq; + use tempfile::TempDir; + + #[tokio::test] + async fn repair_backs_up_owned_database_files() -> std::io::Result<()> { + let temp_dir = TempDir::new()?; + let state_path = codex_state::state_db_path(temp_dir.path()); + let logs_path = codex_state::logs_db_path(temp_dir.path()); + let state_sidecars = sqlite_paths(state_path.as_path()); + tokio::fs::write(state_path.as_path(), b"state").await?; + tokio::fs::write(state_sidecars[1].as_path(), b"state-wal").await?; + tokio::fs::write(logs_path.as_path(), b"logs").await?; + + let startup_error = + LocalStateDbStartupError::new(state_path.clone(), "corrupt".to_string()); + let backups = repair_files(&startup_error).await?; + + assert_eq!(backups.len(), 3); + assert!(!tokio::fs::try_exists(state_path.as_path()).await?); + assert!(!tokio::fs::try_exists(state_sidecars[1].as_path()).await?); + assert!(!tokio::fs::try_exists(logs_path.as_path()).await?); + for backup in backups { + assert!(tokio::fs::try_exists(backup.as_path()).await?); + } + Ok(()) + } + + #[tokio::test] + async fn repair_replaces_blocking_sqlite_home_file() -> std::io::Result<()> { + let temp_dir = TempDir::new()?; + let sqlite_home = temp_dir.path().join("sqlite-home"); + tokio::fs::write(sqlite_home.as_path(), b"not-a-directory").await?; + let startup_error = LocalStateDbStartupError::new( + codex_state::state_db_path(sqlite_home.as_path()), + "File exists".to_string(), + ); + + let backups = repair_files(&startup_error).await?; + + assert_eq!(backups.len(), 1); + assert!(tokio::fs::metadata(sqlite_home.as_path()).await?.is_dir()); + assert!(tokio::fs::try_exists(backups[0].as_path()).await?); + Ok(()) + } + + #[test] + fn lock_failures_skip_repair() { + assert!(is_locked("database is locked")); + assert!(is_locked("database is busy")); + assert!(!is_locked("database disk image is malformed")); + } +} From df55f493301b4c9c15c6a107c8c909a0498ed758 Mon Sep 17 00:00:00 2001 From: Eric Traut Date: Thu, 14 May 2026 18:23:29 -0700 Subject: [PATCH 3/3] cli: simplify state db startup retry handling --- codex-rs/cli/src/main.rs | 67 ++++++++++++--------------- codex-rs/cli/src/state_db_recovery.rs | 5 ++ 2 files changed, 34 insertions(+), 38 deletions(-) diff --git a/codex-rs/cli/src/main.rs b/codex-rs/cli/src/main.rs index dce9891e61f9..fd79bd96bda6 100644 --- a/codex-rs/cli/src/main.rs +++ b/codex-rs/cli/src/main.rs @@ -1991,47 +1991,38 @@ async fn run_interactive_tui( remote_endpoint.clone(), ) }; - match start_tui().await { - Ok(exit_info) => Ok(exit_info), - Err(err) => { - let Some(startup_error) = local_state_db::startup_error(&err) else { - return Err(err); - }; - if local_state_db::is_locked(startup_error.detail()) { - local_state_db::print_locked_guidance(startup_error); - return Ok(AppExitInfo::fatal(startup_error.to_string())); - } - if !local_state_db::confirm_repair(startup_error)? { - local_state_db::print_diagnostic_guidance(startup_error); - return Ok(AppExitInfo::fatal(startup_error.to_string())); - } - - match local_state_db::repair_files(startup_error).await { - Ok(backups) => local_state_db::print_repair_backups(&backups), - Err(repair_err) => { - local_state_db::print_diagnostic_guidance(startup_error); - return Ok(AppExitInfo::fatal(format!( - "failed to repair Codex local data automatically: {repair_err}" - ))); - } - } + let mut attempted_repair = false; + loop { + let err = match start_tui().await { + Ok(exit_info) => return Ok(exit_info), + Err(err) => err, + }; + let Some(startup_error) = local_state_db::startup_error(&err) else { + return Err(err); + }; + if local_state_db::is_locked(startup_error.detail()) { + local_state_db::print_locked_guidance(startup_error); + return Ok(AppExitInfo::fatal(startup_error.to_string())); + } + if attempted_repair { + local_state_db::print_diagnostic_guidance(startup_error); + return Ok(AppExitInfo::fatal(startup_error.to_string())); + } + if !local_state_db::confirm_repair(startup_error)? { + local_state_db::print_diagnostic_guidance(startup_error); + return Ok(AppExitInfo::fatal(startup_error.to_string())); + } - match start_tui().await { - Ok(exit_info) => Ok(exit_info), - Err(retry_err) => { - let Some(retry_startup_error) = local_state_db::startup_error(&retry_err) - else { - return Err(retry_err); - }; - if local_state_db::is_locked(retry_startup_error.detail()) { - local_state_db::print_locked_guidance(retry_startup_error); - } else { - local_state_db::print_diagnostic_guidance(retry_startup_error); - } - Ok(AppExitInfo::fatal(retry_startup_error.to_string())) - } + match local_state_db::repair_files(startup_error).await { + Ok(backups) => local_state_db::print_repair_backups(&backups), + Err(repair_err) => { + local_state_db::print_diagnostic_guidance(startup_error); + return Ok(AppExitInfo::fatal(format!( + "failed to repair Codex local data automatically: {repair_err}" + ))); } } + attempted_repair = true; } } diff --git a/codex-rs/cli/src/state_db_recovery.rs b/codex-rs/cli/src/state_db_recovery.rs index cc5f9e29166c..8db134540ab7 100644 --- a/codex-rs/cli/src/state_db_recovery.rs +++ b/codex-rs/cli/src/state_db_recovery.rs @@ -1,3 +1,8 @@ +//! CLI recovery for local state database startup failures. +//! +//! This keeps user-facing repair and lock-contention handling out of the main +//! CLI dispatch path while preserving the TUI startup error as the boundary type. + use codex_tui::LocalStateDbStartupError; use std::path::PathBuf;