diff --git a/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.schemas.json b/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.schemas.json index fdaaf3e857d0..c7f963df36d7 100644 --- a/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.schemas.json +++ b/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.schemas.json @@ -6314,6 +6314,25 @@ } ] }, + "AutoCompactTokenLimitScope": { + "description": "Selects which part of the active context is charged against `model_auto_compact_token_limit`.", + "oneOf": [ + { + "description": "Count the full active context against the limit.", + "enum": [ + "total" + ], + "type": "string" + }, + { + "description": "Count sampled output and later growth after the carried window prefix.", + "enum": [ + "body_after_prefix" + ], + "type": "string" + } + ] + }, "AutoReviewDecisionSource": { "description": "[UNSTABLE] Source that produced a terminal approval auto-review decision.", "enum": [ @@ -7162,6 +7181,16 @@ "null" ] }, + "model_auto_compact_token_limit_scope": { + "anyOf": [ + { + "$ref": "#/definitions/v2/AutoCompactTokenLimitScope" + }, + { + "type": "null" + } + ] + }, "model_context_window": { "format": "int64", "type": [ diff --git a/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.v2.schemas.json b/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.v2.schemas.json index e413d792349c..0fee6fda8c43 100644 --- a/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.v2.schemas.json +++ b/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.v2.schemas.json @@ -826,6 +826,25 @@ } ] }, + "AutoCompactTokenLimitScope": { + "description": "Selects which part of the active context is charged against `model_auto_compact_token_limit`.", + "oneOf": [ + { + "description": "Count the full active context against the limit.", + "enum": [ + "total" + ], + "type": "string" + }, + { + "description": "Count sampled output and later growth after the carried window prefix.", + "enum": [ + "body_after_prefix" + ], + "type": "string" + } + ] + }, "AutoReviewDecisionSource": { "description": "[UNSTABLE] Source that produced a terminal approval auto-review decision.", "enum": [ @@ -3551,6 +3570,16 @@ "null" ] }, + "model_auto_compact_token_limit_scope": { + "anyOf": [ + { + "$ref": "#/definitions/AutoCompactTokenLimitScope" + }, + { + "type": "null" + } + ] + }, "model_context_window": { "format": "int64", "type": [ diff --git a/codex-rs/app-server-protocol/schema/json/v2/ConfigReadResponse.json b/codex-rs/app-server-protocol/schema/json/v2/ConfigReadResponse.json index 81364a6f4030..7595f7fd0093 100644 --- a/codex-rs/app-server-protocol/schema/json/v2/ConfigReadResponse.json +++ b/codex-rs/app-server-protocol/schema/json/v2/ConfigReadResponse.json @@ -188,6 +188,25 @@ } ] }, + "AutoCompactTokenLimitScope": { + "description": "Selects which part of the active context is charged against `model_auto_compact_token_limit`.", + "oneOf": [ + { + "description": "Count the full active context against the limit.", + "enum": [ + "total" + ], + "type": "string" + }, + { + "description": "Count sampled output and later growth after the carried window prefix.", + "enum": [ + "body_after_prefix" + ], + "type": "string" + } + ] + }, "Config": { "additionalProperties": true, "properties": { @@ -280,6 +299,16 @@ "null" ] }, + "model_auto_compact_token_limit_scope": { + "anyOf": [ + { + "$ref": "#/definitions/AutoCompactTokenLimitScope" + }, + { + "type": "null" + } + ] + }, "model_context_window": { "format": "int64", "type": [ diff --git a/codex-rs/app-server-protocol/schema/typescript/AutoCompactTokenLimitScope.ts b/codex-rs/app-server-protocol/schema/typescript/AutoCompactTokenLimitScope.ts new file mode 100644 index 000000000000..3d9a56c82a67 --- /dev/null +++ b/codex-rs/app-server-protocol/schema/typescript/AutoCompactTokenLimitScope.ts @@ -0,0 +1,9 @@ +// GENERATED CODE! DO NOT MODIFY BY HAND! + +// This file was generated by [ts-rs](https://github.com/Aleph-Alpha/ts-rs). Do not edit this file manually. + +/** + * Selects which part of the active context is charged against + * `model_auto_compact_token_limit`. + */ +export type AutoCompactTokenLimitScope = "total" | "body_after_prefix"; diff --git a/codex-rs/app-server-protocol/schema/typescript/index.ts b/codex-rs/app-server-protocol/schema/typescript/index.ts index 97ea43560192..8be75af546fd 100644 --- a/codex-rs/app-server-protocol/schema/typescript/index.ts +++ b/codex-rs/app-server-protocol/schema/typescript/index.ts @@ -5,6 +5,7 @@ export type { AgentPath } from "./AgentPath"; export type { ApplyPatchApprovalParams } from "./ApplyPatchApprovalParams"; export type { ApplyPatchApprovalResponse } from "./ApplyPatchApprovalResponse"; export type { AuthMode } from "./AuthMode"; +export type { AutoCompactTokenLimitScope } from "./AutoCompactTokenLimitScope"; export type { ClientInfo } from "./ClientInfo"; export type { ClientNotification } from "./ClientNotification"; export type { ClientRequest } from "./ClientRequest"; diff --git a/codex-rs/app-server-protocol/schema/typescript/v2/Config.ts b/codex-rs/app-server-protocol/schema/typescript/v2/Config.ts index ba24663e8798..29eae9877419 100644 --- a/codex-rs/app-server-protocol/schema/typescript/v2/Config.ts +++ b/codex-rs/app-server-protocol/schema/typescript/v2/Config.ts @@ -1,6 +1,7 @@ // GENERATED CODE! DO NOT MODIFY BY HAND! // This file was generated by [ts-rs](https://github.com/Aleph-Alpha/ts-rs). Do not edit this file manually. +import type { AutoCompactTokenLimitScope } from "../AutoCompactTokenLimitScope"; import type { ForcedLoginMethod } from "../ForcedLoginMethod"; import type { ReasoningEffort } from "../ReasoningEffort"; import type { ReasoningSummary } from "../ReasoningSummary"; @@ -16,7 +17,7 @@ import type { SandboxMode } from "./SandboxMode"; import type { SandboxWorkspaceWrite } from "./SandboxWorkspaceWrite"; import type { ToolsV2 } from "./ToolsV2"; -export type Config = {model: string | null, review_model: string | null, model_context_window: bigint | null, model_auto_compact_token_limit: bigint | null, model_provider: string | null, approval_policy: AskForApproval | null, /** +export type Config = {model: string | null, review_model: string | null, model_context_window: bigint | null, model_auto_compact_token_limit: bigint | null, model_auto_compact_token_limit_scope: AutoCompactTokenLimitScope | null, model_provider: string | null, approval_policy: AskForApproval | null, /** * [UNSTABLE] Optional default for where approval requests are routed for * review. */ diff --git a/codex-rs/app-server-protocol/src/protocol/v2/config.rs b/codex-rs/app-server-protocol/src/protocol/v2/config.rs index b46515d81142..a34e6c530f9a 100644 --- a/codex-rs/app-server-protocol/src/protocol/v2/config.rs +++ b/codex-rs/app-server-protocol/src/protocol/v2/config.rs @@ -3,6 +3,7 @@ use super::AskForApproval; use super::SandboxMode; use super::shared::default_enabled; use codex_experimental_api_macros::ExperimentalApi; +use codex_protocol::config_types::AutoCompactTokenLimitScope; use codex_protocol::config_types::ForcedLoginMethod; use codex_protocol::config_types::ReasoningSummary; use codex_protocol::config_types::Verbosity; @@ -251,6 +252,7 @@ pub struct Config { pub review_model: Option, pub model_context_window: Option, pub model_auto_compact_token_limit: Option, + pub model_auto_compact_token_limit_scope: Option, pub model_provider: Option, #[experimental(nested)] pub approval_policy: Option, diff --git a/codex-rs/app-server-protocol/src/protocol/v2/tests.rs b/codex-rs/app-server-protocol/src/protocol/v2/tests.rs index b445fc4d01af..fbc692c20161 100644 --- a/codex-rs/app-server-protocol/src/protocol/v2/tests.rs +++ b/codex-rs/app-server-protocol/src/protocol/v2/tests.rs @@ -1511,6 +1511,7 @@ fn config_granular_approval_policy_is_marked_experimental() { review_model: None, model_context_window: None, model_auto_compact_token_limit: None, + model_auto_compact_token_limit_scope: None, model_provider: None, approval_policy: Some(AskForApproval::Granular { sandbox_approval: false, @@ -1551,6 +1552,7 @@ fn config_approvals_reviewer_is_marked_experimental() { review_model: None, model_context_window: None, model_auto_compact_token_limit: None, + model_auto_compact_token_limit_scope: None, model_provider: None, approval_policy: None, approvals_reviewer: Some(ApprovalsReviewer::AutoReview), @@ -1585,6 +1587,7 @@ fn config_nested_profile_granular_approval_policy_is_marked_experimental() { review_model: None, model_context_window: None, model_auto_compact_token_limit: None, + model_auto_compact_token_limit_scope: None, model_provider: None, approval_policy: None, approvals_reviewer: None, @@ -1641,6 +1644,7 @@ fn config_nested_profile_approvals_reviewer_is_marked_experimental() { review_model: None, model_context_window: None, model_auto_compact_token_limit: None, + model_auto_compact_token_limit_scope: None, model_provider: None, approval_policy: None, approvals_reviewer: None, diff --git a/codex-rs/config/src/config_toml.rs b/codex-rs/config/src/config_toml.rs index 76ba338fc0e7..43f75ffc289e 100644 --- a/codex-rs/config/src/config_toml.rs +++ b/codex-rs/config/src/config_toml.rs @@ -38,6 +38,7 @@ use codex_model_provider_info::ModelProviderInfo; use codex_model_provider_info::OLLAMA_CHAT_PROVIDER_REMOVED_ERROR; use codex_model_provider_info::OLLAMA_OSS_PROVIDER_ID; use codex_model_provider_info::OPENAI_PROVIDER_ID; +use codex_protocol::config_types::AutoCompactTokenLimitScope; use codex_protocol::config_types::ForcedLoginMethod; use codex_protocol::config_types::Personality; use codex_protocol::config_types::ReasoningSummary; @@ -156,6 +157,10 @@ pub struct ConfigToml { /// Token usage threshold triggering auto-compaction of conversation history. pub model_auto_compact_token_limit: Option, + /// Controls whether the auto-compaction limit applies to the full context or + /// only to tokens after the carried prefix in the current compaction window. + pub model_auto_compact_token_limit_scope: Option, + /// Default approval policy for executing commands. pub approval_policy: Option, diff --git a/codex-rs/core-api/src/lib.rs b/codex-rs/core-api/src/lib.rs index 04ebaf8e7e6f..e87ee82f3090 100644 --- a/codex-rs/core-api/src/lib.rs +++ b/codex-rs/core-api/src/lib.rs @@ -58,6 +58,7 @@ pub use codex_models_manager::manager::SharedModelsManager; pub use codex_protocol::ThreadId; pub use codex_protocol::config_types::AltScreenMode; pub use codex_protocol::config_types::ApprovalsReviewer; +pub use codex_protocol::config_types::AutoCompactTokenLimitScope; pub use codex_protocol::config_types::CollaborationModeMask; pub use codex_protocol::config_types::ShellEnvironmentPolicy; pub use codex_protocol::config_types::WebSearchMode; diff --git a/codex-rs/core/config.schema.json b/codex-rs/core/config.schema.json index 07f49a3cd630..f0eb76d62018 100644 --- a/codex-rs/core/config.schema.json +++ b/codex-rs/core/config.schema.json @@ -309,6 +309,25 @@ } ] }, + "AutoCompactTokenLimitScope": { + "description": "Selects which part of the active context is charged against `model_auto_compact_token_limit`.", + "oneOf": [ + { + "description": "Count the full active context against the limit.", + "enum": [ + "total" + ], + "type": "string" + }, + { + "description": "Count sampled output and later growth after the carried window prefix.", + "enum": [ + "body_after_prefix" + ], + "type": "string" + } + ] + }, "AutoReviewToml": { "properties": { "policy": { @@ -4544,6 +4563,14 @@ "format": "int64", "type": "integer" }, + "model_auto_compact_token_limit_scope": { + "allOf": [ + { + "$ref": "#/definitions/AutoCompactTokenLimitScope" + } + ], + "description": "Controls whether the auto-compaction limit applies to the full context or only to tokens after the carried prefix in the current compaction window." + }, "model_catalog_json": { "allOf": [ { diff --git a/codex-rs/core/src/client.rs b/codex-rs/core/src/client.rs index cd1fcb6696d8..f604a634581e 100644 --- a/codex-rs/core/src/client.rs +++ b/codex-rs/core/src/client.rs @@ -931,7 +931,7 @@ impl Drop for ModelClientSession { } impl ModelClientSession { - fn reset_websocket_session(&mut self) { + pub(crate) fn reset_websocket_session(&mut self) { self.websocket_session.connection = None; self.websocket_session.last_request = None; self.websocket_session.last_response_rx = None; diff --git a/codex-rs/core/src/config/config_tests.rs b/codex-rs/core/src/config/config_tests.rs index 06ee405495de..850d9ab571b5 100644 --- a/codex-rs/core/src/config/config_tests.rs +++ b/codex-rs/core/src/config/config_tests.rs @@ -7711,6 +7711,7 @@ async fn test_precedence_fixture_with_o3_profile() -> std::io::Result<()> { review_model: None, model_context_window: None, model_auto_compact_token_limit: None, + model_auto_compact_token_limit_scope: AutoCompactTokenLimitScope::Total, service_tier: None, model_provider_id: "openai".to_string(), model_provider: fixture.openai_provider.clone(), @@ -8164,6 +8165,7 @@ async fn test_precedence_fixture_with_gpt3_profile() -> std::io::Result<()> { review_model: None, model_context_window: None, model_auto_compact_token_limit: None, + model_auto_compact_token_limit_scope: AutoCompactTokenLimitScope::Total, service_tier: None, model_provider_id: "openai-custom".to_string(), model_provider: fixture.openai_custom_provider.clone(), @@ -8331,6 +8333,7 @@ async fn test_precedence_fixture_with_zdr_profile() -> std::io::Result<()> { review_model: None, model_context_window: None, model_auto_compact_token_limit: None, + model_auto_compact_token_limit_scope: AutoCompactTokenLimitScope::Total, service_tier: None, model_provider_id: "openai".to_string(), model_provider: fixture.openai_provider.clone(), @@ -8483,6 +8486,7 @@ async fn test_precedence_fixture_with_gpt5_profile() -> std::io::Result<()> { review_model: None, model_context_window: None, model_auto_compact_token_limit: None, + model_auto_compact_token_limit_scope: AutoCompactTokenLimitScope::Total, service_tier: None, model_provider_id: "openai".to_string(), model_provider: fixture.openai_provider.clone(), diff --git a/codex-rs/core/src/config/mod.rs b/codex-rs/core/src/config/mod.rs index 74fdb8383c36..dbd6cad470b5 100644 --- a/codex-rs/core/src/config/mod.rs +++ b/codex-rs/core/src/config/mod.rs @@ -77,6 +77,7 @@ use codex_model_provider_info::built_in_model_providers; use codex_model_provider_info::merge_configured_model_providers; use codex_models_manager::ModelsManagerConfig; use codex_protocol::config_types::AltScreenMode; +use codex_protocol::config_types::AutoCompactTokenLimitScope; use codex_protocol::config_types::ForcedLoginMethod; use codex_protocol::config_types::Personality; use codex_protocol::config_types::ReasoningSummary; @@ -527,6 +528,10 @@ pub struct Config { /// Token usage threshold triggering auto-compaction of conversation history. pub model_auto_compact_token_limit: Option, + /// Controls whether `model_auto_compact_token_limit` applies to the full + /// active context or only tokens after the carried compaction-window prefix. + pub model_auto_compact_token_limit_scope: AutoCompactTokenLimitScope, + /// Key into the model_providers map that specifies which provider to use. pub model_provider_id: String, @@ -3373,6 +3378,9 @@ impl Config { review_model, model_context_window: cfg.model_context_window, model_auto_compact_token_limit: cfg.model_auto_compact_token_limit, + model_auto_compact_token_limit_scope: cfg + .model_auto_compact_token_limit_scope + .unwrap_or_default(), model_provider_id, model_provider, cwd: resolved_cwd, diff --git a/codex-rs/core/src/guardian/review_session.rs b/codex-rs/core/src/guardian/review_session.rs index 389373c8deb8..15d6c142caec 100644 --- a/codex-rs/core/src/guardian/review_session.rs +++ b/codex-rs/core/src/guardian/review_session.rs @@ -7,6 +7,7 @@ use std::time::Duration; use anyhow::anyhow; use codex_analytics::GuardianReviewAnalyticsResult; use codex_analytics::GuardianReviewSessionKind; +use codex_protocol::config_types::AutoCompactTokenLimitScope; use codex_protocol::config_types::Personality; use codex_protocol::config_types::ReasoningSummary as ReasoningSummaryConfig; use codex_protocol::models::PermissionProfile; @@ -138,6 +139,7 @@ struct GuardianReviewSessionReuseKey { model_provider: ModelProviderInfo, model_context_window: Option, model_auto_compact_token_limit: Option, + model_auto_compact_token_limit_scope: AutoCompactTokenLimitScope, model_reasoning_effort: Option, model_reasoning_summary: Option, permissions: Permissions, @@ -162,6 +164,7 @@ impl GuardianReviewSessionReuseKey { model_provider: spawn_config.model_provider.clone(), model_context_window: spawn_config.model_context_window, model_auto_compact_token_limit: spawn_config.model_auto_compact_token_limit, + model_auto_compact_token_limit_scope: spawn_config.model_auto_compact_token_limit_scope, model_reasoning_effort: spawn_config.model_reasoning_effort, model_reasoning_summary: spawn_config.model_reasoning_summary, permissions: spawn_config.permissions.clone(), @@ -1155,6 +1158,34 @@ mod tests { ); } + #[tokio::test] + async fn guardian_review_session_compact_scope_change_invalidates_cached_session() { + let parent_config = crate::config::test_config().await; + let cached_spawn_config = build_guardian_review_session_config( + &parent_config, + /*live_network_config*/ None, + "active-model", + /*reasoning_effort*/ None, + ) + .expect("cached guardian config"); + let cached_reuse_key = + GuardianReviewSessionReuseKey::from_spawn_config(&cached_spawn_config); + + let mut changed_parent_config = parent_config; + changed_parent_config.model_auto_compact_token_limit_scope = + AutoCompactTokenLimitScope::BodyAfterPrefix; + let next_spawn_config = build_guardian_review_session_config( + &changed_parent_config, + /*live_network_config*/ None, + "active-model", + /*reasoning_effort*/ None, + ) + .expect("next guardian config"); + let next_reuse_key = GuardianReviewSessionReuseKey::from_spawn_config(&next_spawn_config); + + assert_ne!(cached_reuse_key, next_reuse_key); + } + #[tokio::test] async fn guardian_review_session_config_disables_hooks() { let mut parent_config = crate::config::test_config().await; diff --git a/codex-rs/core/src/session/mod.rs b/codex-rs/core/src/session/mod.rs index 3f1eab893644..ddab7752f717 100644 --- a/codex-rs/core/src/session/mod.rs +++ b/codex-rs/core/src/session/mod.rs @@ -78,6 +78,7 @@ use codex_protocol::approvals::ExecPolicyAmendment; use codex_protocol::approvals::NetworkPolicyAmendment; use codex_protocol::approvals::NetworkPolicyRuleAction; use codex_protocol::config_types::ApprovalsReviewer; +use codex_protocol::config_types::AutoCompactTokenLimitScope; use codex_protocol::config_types::ModeKind; use codex_protocol::config_types::Settings; use codex_protocol::config_types::WebSearchMode; @@ -288,6 +289,7 @@ use crate::rollout::map_session_init_error; use crate::session_startup_prewarm::SessionStartupPrewarmHandle; use crate::shell; use crate::shell_snapshot::ShellSnapshot; +use crate::state::AutoCompactWindowSnapshot; use crate::state::PendingRequestPermissions; use crate::state::SessionServices; use crate::state::SessionState; @@ -1089,6 +1091,11 @@ impl Session { state.get_total_token_usage(state.server_reasoning_included()) } + pub(crate) async fn auto_compact_window_snapshot(&self) -> AutoCompactWindowSnapshot { + let state = self.state.lock().await; + state.auto_compact_window_snapshot() + } + pub(crate) async fn get_total_token_usage_breakdown(&self) -> TotalTokenUsageBreakdown { let state = self.state.lock().await; state.history.get_total_token_usage_breakdown() @@ -1247,11 +1254,41 @@ impl Session { reconstructed_rollout.reference_context_item, ) .await; + let prefix_tokens = if matches!( + turn_context.config.model_auto_compact_token_limit_scope, + AutoCompactTokenLimitScope::BodyAfterPrefix + ) { + let history = self.clone_history().await; + let base_instructions = self.get_base_instructions().await; + history.estimate_token_count_with_base_instructions(&base_instructions) + } else { + None + }; + if let Some(prefix_tokens) = prefix_tokens { + self.set_auto_compact_window_estimated_prefill_for_scope(turn_context, prefix_tokens) + .await; + } self.set_previous_turn_settings(previous_turn_settings.clone()) .await; previous_turn_settings } + async fn set_auto_compact_window_estimated_prefill_for_scope( + &self, + turn_context: &TurnContext, + tokens: i64, + ) { + if !matches!( + turn_context.config.model_auto_compact_token_limit_scope, + AutoCompactTokenLimitScope::BodyAfterPrefix + ) { + return; + } + + let mut state = self.state.lock().await; + state.set_auto_compact_window_estimated_prefill(tokens); + } + fn last_token_info_from_rollout(rollout_items: &[RolloutItem]) -> Option { rollout_items.iter().rev().find_map(|item| match item { RolloutItem::EventMsg(EventMsg::TokenCount(ev)) => ev.info.clone(), @@ -2568,8 +2605,11 @@ impl Session { reference_context_item: Option, compacted_item: CompactedItem, ) { - self.replace_history(items, reference_context_item.clone()) - .await; + { + let mut state = self.state.lock().await; + state.replace_history(items, reference_context_item.clone()); + state.start_next_auto_compact_window(); + } self.persist_rollout_items(&[RolloutItem::Compacted(compacted_item)]) .await; @@ -2927,6 +2967,12 @@ impl Session { let mut state = self.state.lock().await; state .update_token_info_from_usage(token_usage, turn_context.model_context_window()); + if matches!( + turn_context.config.model_auto_compact_token_limit_scope, + AutoCompactTokenLimitScope::BodyAfterPrefix + ) { + state.ensure_auto_compact_window_server_prefill_from_usage(token_usage); + } state.token_info() }; if let Some(token_info) = token_info.as_ref() { @@ -2974,6 +3020,11 @@ impl Session { state.set_token_info(Some(info)); } + self.set_auto_compact_window_estimated_prefill_for_scope( + turn_context, + estimated_total_tokens, + ) + .await; self.send_token_count_event(turn_context).await; } diff --git a/codex-rs/core/src/session/turn.rs b/codex-rs/core/src/session/turn.rs index 5c4567b9fca5..7b1a59f610bd 100644 --- a/codex-rs/core/src/session/turn.rs +++ b/codex-rs/core/src/session/turn.rs @@ -75,6 +75,7 @@ use codex_hooks::HookEvent; use codex_hooks::HookEventAfterAgent; use codex_hooks::HookPayload; use codex_hooks::HookResult; +use codex_protocol::config_types::AutoCompactTokenLimitScope; use codex_protocol::config_types::ModeKind; use codex_protocol::config_types::ServiceTier; use codex_protocol::error::CodexErr; @@ -146,26 +147,31 @@ pub(crate) async fn run_turn( return None; } - let model_info = turn_context.model_info.clone(); - let auto_compact_limit = model_info.auto_compact_token_limit().unwrap_or(i64::MAX); let mut client_session = prewarmed_client_session.unwrap_or_else(|| sess.services.model_client.new_session()); // TODO(ccunningham): Pre-turn compaction runs before context updates and the // new user message are recorded. Estimate pending incoming items (context // diffs/full reinjection + user input) and trigger compaction preemptively // when they would push the thread over the compaction threshold. - if let Err(err) = run_pre_sampling_compact(&sess, &turn_context, &mut client_session).await { - if err.to_codex_protocol_error() == CodexErrorInfo::UsageLimitExceeded - && let Err(err) = sess - .goal_runtime_apply(GoalRuntimeEvent::UsageLimitReached { - turn_context: turn_context.as_ref(), - }) - .await - { - warn!("failed to usage-limit active goal after usage-limit error: {err}"); - } - error!("Failed to run pre-sampling compact"); - return None; + let pre_sampling_compact = + match run_pre_sampling_compact(&sess, &turn_context, &mut client_session).await { + Ok(pre_sampling_compact) => pre_sampling_compact, + Err(err) => { + if err.to_codex_protocol_error() == CodexErrorInfo::UsageLimitExceeded + && let Err(err) = sess + .goal_runtime_apply(GoalRuntimeEvent::UsageLimitReached { + turn_context: turn_context.as_ref(), + }) + .await + { + warn!("failed to usage-limit active goal after usage-limit error: {err}"); + } + error!("Failed to run pre-sampling compact"); + return None; + } + }; + if pre_sampling_compact.reset_client_session { + client_session.reset_websocket_session(); } sess.record_context_updates_and_set_reference_context_item(turn_context.as_ref()) @@ -342,17 +348,24 @@ pub(crate) async fn run_turn( can_drain_pending_input = true; let has_pending_input = sess.input_queue.has_pending_input(&sess.active_turn).await; let needs_follow_up = model_needs_follow_up || has_pending_input; - let total_usage_tokens = sess.get_total_token_usage().await; - let token_limit_reached = total_usage_tokens >= auto_compact_limit; + let token_status = + auto_compact_token_status(sess.as_ref(), turn_context.as_ref()).await; + let token_limit_reached = token_status.token_limit_reached; let estimated_token_count = sess.get_estimated_token_count(turn_context.as_ref()).await; trace!( turn_id = %turn_context.sub_id, - total_usage_tokens, + total_usage_tokens = token_status.active_context_tokens, + auto_compact_scope_tokens = token_status.auto_compact_scope_tokens, estimated_token_count = ?estimated_token_count, - auto_compact_limit, + auto_compact_scope_limit = token_status.auto_compact_scope_limit, + auto_compact_limit_scope = ?turn_context.config.model_auto_compact_token_limit_scope, + auto_compact_window_ordinal = ?token_status.auto_compact_window_ordinal, + auto_compact_window_prefill_tokens = ?token_status.auto_compact_window_prefill_tokens, + full_context_window_limit = ?token_status.full_context_window_limit, + full_context_window_limit_reached = token_status.full_context_window_limit_reached, token_limit_reached, model_needs_follow_up, has_pending_input, @@ -362,7 +375,7 @@ pub(crate) async fn run_turn( // as long as compaction works well in getting us way below the token limit, we shouldn't worry about being in an infinite loop. if token_limit_reached && needs_follow_up { - match run_auto_compact( + let reset_client_session = match run_auto_compact( &sess, &turn_context, &mut client_session, @@ -372,7 +385,7 @@ pub(crate) async fn run_turn( ) .await { - Ok(()) => {} + Ok(reset_client_session) => reset_client_session, Err(err) => { if err.to_codex_protocol_error() == CodexErrorInfo::UsageLimitExceeded && let Err(err) = sess @@ -388,6 +401,9 @@ pub(crate) async fn run_turn( return None; } }; + if reset_client_session { + client_session.reset_websocket_session(); + } can_drain_pending_input = !model_needs_follow_up; continue; } @@ -743,27 +759,88 @@ async fn track_turn_resolved_config_analytics( }); } +struct PreSamplingCompactResult { + reset_client_session: bool, +} + +#[derive(Debug)] +struct AutoCompactTokenStatus { + // Full active context usage, independent of the configured auto-compact scope. + active_context_tokens: i64, + // Usage counted against `model_auto_compact_token_limit` for the current scope. + auto_compact_scope_tokens: i64, + auto_compact_scope_limit: i64, + full_context_window_limit: Option, + auto_compact_window_ordinal: Option, + auto_compact_window_prefill_tokens: Option, + full_context_window_limit_reached: bool, + token_limit_reached: bool, +} + +async fn auto_compact_token_status( + sess: &Session, + turn_context: &TurnContext, +) -> AutoCompactTokenStatus { + let active_context_tokens = sess.get_total_token_usage().await; + let mut auto_compact_window_ordinal = None; + let mut auto_compact_window_prefill_tokens = None; + let (auto_compact_scope_tokens, auto_compact_scope_limit, full_context_window_limit) = + match turn_context.config.model_auto_compact_token_limit_scope { + AutoCompactTokenLimitScope::Total => ( + active_context_tokens, + turn_context + .model_info + .auto_compact_token_limit() + .unwrap_or(i64::MAX), + None, + ), + AutoCompactTokenLimitScope::BodyAfterPrefix => { + let window = sess.auto_compact_window_snapshot().await; + auto_compact_window_ordinal = Some(window.ordinal); + auto_compact_window_prefill_tokens = window.prefill_input_tokens; + let baseline = window.prefill_input_tokens.unwrap_or(active_context_tokens); + ( + active_context_tokens.saturating_sub(baseline), + turn_context + .config + .model_auto_compact_token_limit + .or_else(|| turn_context.model_info.auto_compact_token_limit()) + .unwrap_or(i64::MAX), + turn_context.model_context_window(), + ) + } + }; + let full_context_window_limit_reached = + full_context_window_limit.is_some_and(|full_context_window_limit| { + active_context_tokens >= full_context_window_limit + }); + let token_limit_reached = + auto_compact_scope_tokens >= auto_compact_scope_limit || full_context_window_limit_reached; + + AutoCompactTokenStatus { + active_context_tokens, + auto_compact_scope_tokens, + auto_compact_scope_limit, + full_context_window_limit, + auto_compact_window_ordinal, + auto_compact_window_prefill_tokens, + full_context_window_limit_reached, + token_limit_reached, + } +} + async fn run_pre_sampling_compact( sess: &Arc, turn_context: &Arc, client_session: &mut ModelClientSession, -) -> CodexResult<()> { - let total_usage_tokens_before_compaction = sess.get_total_token_usage().await; - maybe_run_previous_model_inline_compact( - sess, - turn_context, - client_session, - total_usage_tokens_before_compaction, - ) - .await?; - let total_usage_tokens = sess.get_total_token_usage().await; - let auto_compact_limit = turn_context - .model_info - .auto_compact_token_limit() - .unwrap_or(i64::MAX); - // Compact if the total usage tokens are greater than the auto compact limit - if total_usage_tokens >= auto_compact_limit { - run_auto_compact( +) -> CodexResult { + let mut pre_sampling_compacted = + maybe_run_previous_model_inline_compact(sess, turn_context, client_session).await?; + let mut reset_client_session = pre_sampling_compacted; + let token_status = auto_compact_token_status(sess.as_ref(), turn_context.as_ref()).await; + // Compact if the configured auto-compaction budget or usable context window is exhausted. + if token_status.token_limit_reached { + reset_client_session |= run_auto_compact( sess, turn_context, client_session, @@ -772,22 +849,26 @@ async fn run_pre_sampling_compact( CompactionPhase::PreTurn, ) .await?; + pre_sampling_compacted = true; } - Ok(()) + Ok(PreSamplingCompactResult { + reset_client_session: pre_sampling_compacted && reset_client_session, + }) } /// Runs pre-sampling compaction against the previous model when switching to a smaller /// context-window model. /// -/// Returns `Err(_)` only when compaction was attempted and failed. +/// Returns `Ok(true)` when compaction ran successfully, `Ok(false)` when compaction was skipped +/// because the model/context-window preconditions were not met, and `Err(_)` only when compaction +/// was attempted and failed. async fn maybe_run_previous_model_inline_compact( sess: &Arc, turn_context: &Arc, client_session: &mut ModelClientSession, - total_usage_tokens: i64, -) -> CodexResult<()> { +) -> CodexResult { let Some(previous_turn_settings) = sess.previous_turn_settings().await else { - return Ok(()); + return Ok(false); }; let previous_model_turn_context = Arc::new( turn_context @@ -796,20 +877,31 @@ async fn maybe_run_previous_model_inline_compact( ); let Some(old_context_window) = previous_model_turn_context.model_context_window() else { - return Ok(()); + return Ok(false); }; let Some(new_context_window) = turn_context.model_context_window() else { - return Ok(()); + return Ok(false); }; - let new_auto_compact_limit = turn_context - .model_info - .auto_compact_token_limit() - .unwrap_or(i64::MAX); - let should_run = total_usage_tokens > new_auto_compact_limit + let active_context_tokens = sess.get_total_token_usage().await; + let previous_model_limit_reached = match turn_context + .config + .model_auto_compact_token_limit_scope + { + AutoCompactTokenLimitScope::Total => { + let new_auto_compact_limit = turn_context + .model_info + .auto_compact_token_limit() + .unwrap_or(i64::MAX); + active_context_tokens > new_auto_compact_limit + || active_context_tokens >= new_context_window + } + AutoCompactTokenLimitScope::BodyAfterPrefix => active_context_tokens >= new_context_window, + }; + let should_run = previous_model_limit_reached && previous_model_turn_context.model_info.slug != turn_context.model_info.slug && old_context_window > new_context_window; if should_run { - run_auto_compact( + let _ = run_auto_compact( sess, &previous_model_turn_context, client_session, @@ -818,8 +910,9 @@ async fn maybe_run_previous_model_inline_compact( CompactionPhase::PreTurn, ) .await?; + return Ok(true); } - Ok(()) + Ok(false) } async fn run_auto_compact( @@ -829,7 +922,7 @@ async fn run_auto_compact( initial_context_injection: InitialContextInjection, reason: CompactionReason, phase: CompactionPhase, -) -> CodexResult<()> { +) -> CodexResult { if should_use_remote_compact_task(turn_context.provider.info()) { if turn_context.features.enabled(Feature::RemoteCompactionV2) { run_inline_remote_auto_compact_task_v2( @@ -841,7 +934,7 @@ async fn run_auto_compact( phase, ) .await?; - return Ok(()); + return Ok(false); } run_inline_remote_auto_compact_task( Arc::clone(sess), @@ -861,7 +954,7 @@ async fn run_auto_compact( ) .await?; } - Ok(()) + Ok(true) } pub(super) fn collect_explicit_app_ids_from_skill_items( diff --git a/codex-rs/core/src/state/auto_compact_window.rs b/codex-rs/core/src/state/auto_compact_window.rs new file mode 100644 index 000000000000..d1deb3c2882b --- /dev/null +++ b/codex-rs/core/src/state/auto_compact_window.rs @@ -0,0 +1,145 @@ +use codex_protocol::protocol::TokenUsage; + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub(crate) struct AutoCompactWindowSnapshot { + pub(crate) ordinal: u64, + pub(crate) prefill_input_tokens: Option, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum AutoCompactWindowPrefill { + ServerObserved(i64), + Estimated(i64), +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub(super) struct AutoCompactWindow { + ordinal: u64, + /// Absolute input-token baseline for the current compaction window. + /// + /// `body_after_prefix` subtracts this from later active-context usage. It is + /// not the growth itself; server-observed usage replaces estimated + /// resume/recompute baselines when available. + prefill_input_tokens: Option, +} + +impl AutoCompactWindow { + pub(super) fn new() -> Self { + Self { + ordinal: 1, + prefill_input_tokens: None, + } + } + + pub(super) fn clear_prefill(&mut self) { + self.prefill_input_tokens = None; + } + + pub(super) fn start_next(&mut self) { + self.ordinal = self.ordinal.saturating_add(1); + self.clear_prefill(); + } + + /// Records the request-input side of the first server usage sample. The + /// sampled output from that response is body growth and should remain + /// counted against the scoped auto-compact budget. + pub(super) fn ensure_server_observed_prefill_from_usage(&mut self, usage: &TokenUsage) { + if matches!( + self.prefill_input_tokens, + Some(AutoCompactWindowPrefill::ServerObserved(_)) + ) { + return; + } + + self.prefill_input_tokens = Some(AutoCompactWindowPrefill::ServerObserved( + usage.input_tokens.max(0), + )); + } + + pub(super) fn set_estimated_prefill(&mut self, tokens: i64) { + if matches!( + self.prefill_input_tokens, + Some(AutoCompactWindowPrefill::ServerObserved(_)) + ) { + return; + } + + self.prefill_input_tokens = Some(AutoCompactWindowPrefill::Estimated(tokens.max(0))); + } + + pub(super) fn snapshot(&self) -> AutoCompactWindowSnapshot { + let prefill_input_tokens = match self.prefill_input_tokens { + Some(AutoCompactWindowPrefill::ServerObserved(tokens)) + | Some(AutoCompactWindowPrefill::Estimated(tokens)) => Some(tokens), + None => None, + }; + AutoCompactWindowSnapshot { + ordinal: self.ordinal, + prefill_input_tokens, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use pretty_assertions::assert_eq; + + #[test] + fn tracks_prefill_and_window_boundaries() { + let mut window = AutoCompactWindow::new(); + + assert_eq!( + window.snapshot(), + AutoCompactWindowSnapshot { + ordinal: 1, + prefill_input_tokens: None, + } + ); + + window.set_estimated_prefill(/*tokens*/ 150); + assert_eq!( + window.snapshot(), + AutoCompactWindowSnapshot { + ordinal: 1, + prefill_input_tokens: Some(150), + } + ); + + window.ensure_server_observed_prefill_from_usage(&TokenUsage { + input_tokens: 120, + total_tokens: 170, + ..Default::default() + }); + assert_eq!( + window.snapshot(), + AutoCompactWindowSnapshot { + ordinal: 1, + prefill_input_tokens: Some(120), + } + ); + + window.ensure_server_observed_prefill_from_usage(&TokenUsage { + input_tokens: 130, + total_tokens: 180, + ..Default::default() + }); + window.set_estimated_prefill(/*tokens*/ 90); + assert_eq!( + window.snapshot(), + AutoCompactWindowSnapshot { + ordinal: 1, + prefill_input_tokens: Some(120), + } + ); + + window.start_next(); + assert_eq!( + window.snapshot(), + AutoCompactWindowSnapshot { + ordinal: 2, + prefill_input_tokens: None, + } + ); + } +} diff --git a/codex-rs/core/src/state/mod.rs b/codex-rs/core/src/state/mod.rs index 13f3bf6c86f2..3122ec5f2599 100644 --- a/codex-rs/core/src/state/mod.rs +++ b/codex-rs/core/src/state/mod.rs @@ -1,7 +1,9 @@ +mod auto_compact_window; mod service; mod session; mod turn; +pub(crate) use auto_compact_window::AutoCompactWindowSnapshot; pub(crate) use service::SessionServices; pub(crate) use session::SessionState; pub(crate) use turn::ActiveTurn; diff --git a/codex-rs/core/src/state/session.rs b/codex-rs/core/src/state/session.rs index 7121a775245c..0903d0d07326 100644 --- a/codex-rs/core/src/state/session.rs +++ b/codex-rs/core/src/state/session.rs @@ -5,6 +5,8 @@ use codex_protocol::models::ResponseItem; use codex_sandboxing::policy_transforms::merge_permission_profiles; use std::collections::HashSet; +use super::auto_compact_window::AutoCompactWindow; +use super::auto_compact_window::AutoCompactWindowSnapshot; use crate::context_manager::ContextManager; use crate::session::PreviousTurnSettings; use crate::session::session::SessionConfiguration; @@ -26,6 +28,8 @@ pub(crate) struct SessionState { /// model/realtime handling on subsequent regular turns (including full-context /// reinjection after resume or `/compact`). previous_turn_settings: Option, + /// Runtime accounting state for the active auto-compaction window. + auto_compact_window: AutoCompactWindow, /// Startup prewarmed session prepared during session initialization. pub(crate) startup_prewarm: Option, pub(crate) active_connector_selection: HashSet, @@ -45,6 +49,7 @@ impl SessionState { server_reasoning_included: false, mcp_dependency_prompted: HashSet::new(), previous_turn_settings: None, + auto_compact_window: AutoCompactWindow::new(), startup_prewarm: None, active_connector_selection: HashSet::new(), pending_session_start_source: None, @@ -94,6 +99,7 @@ impl SessionState { self.history.replace(items); self.history .set_reference_context_item(reference_context_item); + self.auto_compact_window.clear_prefill(); } pub(crate) fn set_token_info(&mut self, info: Option) { @@ -117,6 +123,26 @@ impl SessionState { self.history.update_token_info(usage, model_context_window); } + pub(crate) fn ensure_auto_compact_window_server_prefill_from_usage( + &mut self, + usage: &TokenUsage, + ) { + self.auto_compact_window + .ensure_server_observed_prefill_from_usage(usage); + } + + pub(crate) fn set_auto_compact_window_estimated_prefill(&mut self, tokens: i64) { + self.auto_compact_window.set_estimated_prefill(tokens); + } + + pub(crate) fn start_next_auto_compact_window(&mut self) { + self.auto_compact_window.start_next(); + } + + pub(crate) fn auto_compact_window_snapshot(&self) -> AutoCompactWindowSnapshot { + self.auto_compact_window.snapshot() + } + pub(crate) fn token_info(&self) -> Option { self.history.token_info() } diff --git a/codex-rs/core/src/state/session_tests.rs b/codex-rs/core/src/state/session_tests.rs index 5e90cc881dd2..b6b5a057a019 100644 --- a/codex-rs/core/src/state/session_tests.rs +++ b/codex-rs/core/src/state/session_tests.rs @@ -1,5 +1,6 @@ use super::*; use crate::session::tests::make_session_configuration_for_tests; +use crate::state::AutoCompactWindowSnapshot; use codex_protocol::protocol::CreditsSnapshot; use codex_protocol::protocol::RateLimitWindow; use pretty_assertions::assert_eq; @@ -61,6 +62,24 @@ async fn set_rate_limits_defaults_limit_id_to_codex_when_missing() { ); } +#[tokio::test] +async fn replace_history_clears_auto_compact_window_prefill_without_advancing() { + let session_configuration = make_session_configuration_for_tests().await; + let mut state = SessionState::new(session_configuration); + + state.start_next_auto_compact_window(); + state.set_auto_compact_window_estimated_prefill(/*tokens*/ 100); + state.replace_history(Vec::new(), /*reference_context_item*/ None); + + assert_eq!( + state.auto_compact_window_snapshot(), + AutoCompactWindowSnapshot { + ordinal: 2, + prefill_input_tokens: None, + } + ); +} + #[tokio::test] async fn set_rate_limits_defaults_to_codex_when_limit_id_missing_after_other_bucket() { let session_configuration = make_session_configuration_for_tests().await; diff --git a/codex-rs/core/tests/suite/compact.rs b/codex-rs/core/tests/suite/compact.rs index b07adfc61c3a..140b4ec2b11c 100644 --- a/codex-rs/core/tests/suite/compact.rs +++ b/codex-rs/core/tests/suite/compact.rs @@ -7,6 +7,7 @@ use codex_login::CodexAuth; use codex_model_provider_info::ModelProviderInfo; use codex_model_provider_info::built_in_model_providers; use codex_models_manager::bundled_models_response; +use codex_protocol::config_types::AutoCompactTokenLimitScope; use codex_protocol::items::TurnItem; use codex_protocol::models::PermissionProfile; use codex_protocol::openai_models::ModelInfo; @@ -126,6 +127,22 @@ fn set_test_compact_prompt(config: &mut Config) { config.compact_prompt = Some(SUMMARIZATION_PROMPT.to_string()); } +fn ev_completed_with_usage(id: &str, input_tokens: i64, output_tokens: i64) -> Value { + json!({ + "type": "response.completed", + "response": { + "id": id, + "usage": { + "input_tokens": input_tokens, + "input_tokens_details": null, + "output_tokens": output_tokens, + "output_tokens_details": null, + "total_tokens": input_tokens + output_tokens + } + } + }) +} + fn body_contains_text(body: &str, text: &str) -> bool { body.contains(&json_fragment(text)) } @@ -2074,6 +2091,100 @@ async fn pre_sampling_compact_runs_on_switch_to_smaller_context_model() { ); } +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn body_after_prefix_model_switch_budget_compacts_with_next_model() { + skip_if_no_network!(); + + let server = MockServer::start().await; + let previous_model = "gpt-5.3-codex"; + let next_model = "gpt-5.2"; + + let models_mock = mount_models_once( + &server, + ModelsResponse { + models: vec![ + model_info_with_context_window(previous_model, /*context_window*/ 273_000), + model_info_with_context_window(next_model, /*context_window*/ 125_000), + ], + }, + ) + .await; + + let request_log = mount_sse_sequence( + &server, + vec![ + sse(vec![ + ev_assistant_message("m1", "before switch"), + ev_completed_with_usage("r1", /*input_tokens*/ 100, /*output_tokens*/ 50), + ]), + sse(vec![ + ev_assistant_message("m2", "BODY_BUDGET_SUMMARY"), + ev_completed_with_tokens("r2", /*total_tokens*/ 10), + ]), + sse(vec![ + ev_assistant_message("m3", "after switch"), + ev_completed_with_tokens("r3", /*total_tokens*/ 100), + ]), + ], + ) + .await; + + let model_provider = non_openai_model_provider(&server); + let mut builder = test_codex() + .with_auth(CodexAuth::create_dummy_chatgpt_auth_for_testing()) + .with_model(previous_model) + .with_config(move |config| { + config.model_provider = model_provider; + set_test_compact_prompt(config); + let _ = config.features.enable(Feature::RemoteModels); + config.model_auto_compact_token_limit = Some(20); + config.model_auto_compact_token_limit_scope = + AutoCompactTokenLimitScope::BodyAfterPrefix; + }); + let test = builder.build(&server).await.expect("build test codex"); + + test.codex + .submit(disabled_permission_user_turn( + "before switch", + test.cwd.path().to_path_buf(), + previous_model.to_string(), + )) + .await + .expect("submit first user turn"); + wait_for_event(&test.codex, |event| { + matches!(event, EventMsg::TurnComplete(_)) + }) + .await; + + test.codex + .submit(disabled_permission_user_turn( + "after switch", + test.cwd.path().to_path_buf(), + next_model.to_string(), + )) + .await + .expect("submit second user turn"); + assert_compaction_uses_turn_lifecycle_id(&test.codex).await; + + let requests = request_log.requests(); + assert_eq!(models_mock.requests().len(), 1); + assert_eq!( + requests.len(), + 3, + "expected user, compact, and follow-up requests" + ); + assert_eq!( + requests[0].body_json()["model"].as_str(), + Some(previous_model) + ); + assert_eq!(requests[1].body_json()["model"].as_str(), Some(next_model)); + assert_eq!(requests[2].body_json()["model"].as_str(), Some(next_model)); + assert!( + body_contains_text(&requests[1].body_json().to_string(), SUMMARIZATION_PROMPT), + "body-budget compaction request should include summarization prompt" + ); +} + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn pre_sampling_compact_runs_after_resume_and_switch_to_smaller_model() { skip_if_no_network!(); @@ -3011,6 +3122,237 @@ async fn auto_compact_clamps_config_limit_to_context_window() { ); } +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn auto_compact_body_after_prefix_ignores_starting_window_prefix() { + skip_if_no_network!(); + + let server = start_mock_server().await; + + let first_turn = sse(vec![ + ev_assistant_message("m1", FIRST_REPLY), + ev_completed_with_usage("r1", /*input_tokens*/ 600, /*output_tokens*/ 50), + ]); + let second_turn = sse(vec![ + ev_assistant_message("m2", SECOND_LARGE_REPLY), + ev_completed_with_usage("r2", /*input_tokens*/ 700, /*output_tokens*/ 50), + ]); + let auto_compact_turn = sse(vec![ + ev_assistant_message("m3", AUTO_SUMMARY_TEXT), + ev_completed_with_tokens("r3", /*total_tokens*/ 20), + ]); + let third_turn = sse(vec![ + ev_assistant_message("m4", FINAL_REPLY), + ev_completed_with_usage("r4", /*input_tokens*/ 750, /*output_tokens*/ 20), + ]); + let request_log = mount_sse_sequence( + &server, + vec![first_turn, second_turn, auto_compact_turn, third_turn], + ) + .await; + + let model_provider = non_openai_model_provider(&server); + let test = test_codex() + .with_config(move |config| { + config.model_provider = model_provider; + set_test_compact_prompt(config); + config.model_context_window = Some(1_000); + config.model_auto_compact_token_limit = Some(100); + config.model_auto_compact_token_limit_scope = + AutoCompactTokenLimitScope::BodyAfterPrefix; + }) + .build(&server) + .await + .expect("build codex"); + + for user in ["PREFIX_FREE_ONE", "PREFIX_FREE_TWO"] { + test.submit_turn(user).await.expect("submit turn"); + } + + assert_eq!( + request_log.requests().len(), + 2, + "the first two turns should not compact just because the prefix exceeds the body budget" + ); + + test.submit_turn("PREFIX_FREE_THREE") + .await + .expect("submit third turn"); + + let requests = request_log.requests(); + assert_eq!( + requests.len(), + 4, + "third turn should include pre-turn compaction plus the post-compaction request" + ); + let compact_body = requests[2].body_json().to_string(); + assert!( + body_contains_text(&compact_body, SUMMARIZATION_PROMPT), + "body-after-prefix mode should compact once tokens after the first assistant sample exceed the configured budget" + ); +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn auto_compact_body_after_prefix_counts_growth_after_compaction() { + skip_if_no_network!(); + + let server = start_mock_server().await; + + let first_turn = sse(vec![ + ev_assistant_message("m1", FIRST_REPLY), + ev_completed_with_usage("r1", /*input_tokens*/ 100, /*output_tokens*/ 50), + ]); + let first_auto_compact_turn = sse(vec![ + ev_assistant_message("m2", AUTO_SUMMARY_TEXT), + ev_completed_with_tokens("r2", /*total_tokens*/ 20), + ]); + let second_turn = sse(vec![ + ev_assistant_message("m3", SECOND_LARGE_REPLY), + ev_completed_with_usage( + "r3", /*input_tokens*/ 100_000, /*output_tokens*/ 10, + ), + ]); + let third_turn = sse(vec![ + ev_assistant_message("m4", FINAL_REPLY), + ev_completed_with_usage( + "r4", /*input_tokens*/ 100_100, /*output_tokens*/ 5, + ), + ]); + let second_auto_compact_turn = sse(vec![ + ev_assistant_message("m5", AUTO_SUMMARY_TEXT), + ev_completed_with_tokens("r5", /*total_tokens*/ 20), + ]); + let fourth_turn = sse(vec![ + ev_assistant_message("m6", FINAL_REPLY), + ev_completed_with_usage("r6", /*input_tokens*/ 80, /*output_tokens*/ 5), + ]); + let request_log = mount_sse_sequence( + &server, + vec![ + first_turn, + first_auto_compact_turn, + second_turn, + third_turn, + second_auto_compact_turn, + fourth_turn, + ], + ) + .await; + + let model_provider = non_openai_model_provider(&server); + let test = test_codex() + .with_config(move |config| { + config.model_provider = model_provider; + set_test_compact_prompt(config); + config.model_context_window = Some(200_000); + config.model_auto_compact_token_limit = Some(40); + config.model_auto_compact_token_limit_scope = + AutoCompactTokenLimitScope::BodyAfterPrefix; + }) + .build(&server) + .await + .expect("build codex"); + + test.submit_turn("WINDOW_PREFIX") + .await + .expect("submit first turn"); + test.submit_turn("GROWTH_AFTER_COMPACT") + .await + .expect("submit second turn"); + + let requests = request_log.requests(); + assert_eq!( + requests.len(), + 3, + "second turn should compact first and then sample the new growth" + ); + + test.submit_turn("AFTER_GROWTH") + .await + .expect("submit third turn"); + + let requests = request_log.requests(); + assert_eq!( + requests.len(), + 4, + "the first server-observed input in the new window should become the prefill baseline" + ); + + test.submit_turn("AFTER_GROWTH_TRIGGER") + .await + .expect("submit fourth turn"); + + let requests = request_log.requests(); + assert_eq!( + requests.len(), + 6, + "fourth turn should compact because later post-compaction growth counted against the body budget" + ); + let compact_body = requests[4].body_json().to_string(); + assert!( + body_contains_text(&compact_body, SUMMARIZATION_PROMPT), + "post-compaction growth should trigger a second body-after-prefix compaction" + ); +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn auto_compact_body_after_prefix_still_caps_at_context_window() { + skip_if_no_network!(); + + let server = start_mock_server().await; + + let first_turn = sse(vec![ + ev_assistant_message("m1", FIRST_REPLY), + ev_completed_with_usage("r1", /*input_tokens*/ 80, /*output_tokens*/ 5), + ]); + let second_turn = sse(vec![ + ev_assistant_message("m2", SECOND_LARGE_REPLY), + ev_completed_with_usage("r2", /*input_tokens*/ 98, /*output_tokens*/ 1), + ]); + let auto_compact_turn = sse(vec![ + ev_assistant_message("m3", AUTO_SUMMARY_TEXT), + ev_completed_with_tokens("r3", /*total_tokens*/ 20), + ]); + let third_turn = sse(vec![ + ev_assistant_message("m4", FINAL_REPLY), + ev_completed_with_usage("r4", /*input_tokens*/ 80, /*output_tokens*/ 5), + ]); + let request_log = mount_sse_sequence( + &server, + vec![first_turn, second_turn, auto_compact_turn, third_turn], + ) + .await; + + let model_provider = non_openai_model_provider(&server); + let test = test_codex() + .with_config(move |config| { + config.model_provider = model_provider; + set_test_compact_prompt(config); + config.model_context_window = Some(100); + config.model_auto_compact_token_limit = Some(200); + config.model_auto_compact_token_limit_scope = + AutoCompactTokenLimitScope::BodyAfterPrefix; + }) + .build(&server) + .await + .expect("build codex"); + + for user in ["CONTEXT_CAP_ONE", "CONTEXT_CAP_TWO", "CONTEXT_CAP_THREE"] { + test.submit_turn(user).await.expect("submit turn"); + } + + let requests = request_log.requests(); + assert_eq!( + requests.len(), + 4, + "third turn should compact before sampling because total context hit the usable window" + ); + let compact_body = requests[2].body_json().to_string(); + assert!( + body_contains_text(&compact_body, SUMMARIZATION_PROMPT), + "body-after-prefix mode should still clamp the total threshold to the usable context window" + ); +} + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn auto_compact_counts_encrypted_reasoning_before_last_user() { skip_if_no_network!(); diff --git a/codex-rs/protocol/src/config_types.rs b/codex-rs/protocol/src/config_types.rs index 22c4f5154800..b4b4759c9b71 100644 --- a/codex-rs/protocol/src/config_types.rs +++ b/codex-rs/protocol/src/config_types.rs @@ -21,6 +21,21 @@ use wildmatch::WildMatchPattern; use crate::openai_models::ReasoningEffort; +/// Selects which part of the active context is charged against +/// `model_auto_compact_token_limit`. +#[derive( + Debug, Serialize, Deserialize, Default, Clone, Copy, PartialEq, Eq, Display, JsonSchema, TS, +)] +#[serde(rename_all = "snake_case")] +#[strum(serialize_all = "snake_case")] +pub enum AutoCompactTokenLimitScope { + /// Count the full active context against the limit. + #[default] + Total, + /// Count sampled output and later growth after the carried window prefix. + BodyAfterPrefix, +} + /// A summary of the reasoning performed by the model. This can be useful for /// debugging and understanding the model's reasoning process. /// See https://platform.openai.com/docs/guides/reasoning?api-mode=responses#reasoning-summaries diff --git a/codex-rs/thread-manager-sample/src/main.rs b/codex-rs/thread-manager-sample/src/main.rs index 3410711e3f35..cf499c7703cd 100644 --- a/codex-rs/thread-manager-sample/src/main.rs +++ b/codex-rs/thread-manager-sample/src/main.rs @@ -15,6 +15,7 @@ use codex_core_api::Arg0DispatchPaths; use codex_core_api::AskForApproval; use codex_core_api::AuthCredentialsStoreMode; use codex_core_api::AuthManager; +use codex_core_api::AutoCompactTokenLimitScope; use codex_core_api::CodexThread; use codex_core_api::Config; use codex_core_api::ConfigLayerStack; @@ -168,6 +169,7 @@ fn new_config(model: Option, arg0_paths: Arg0DispatchPaths) -> anyhow::R review_model: None, model_context_window: None, model_auto_compact_token_limit: None, + model_auto_compact_token_limit_scope: AutoCompactTokenLimitScope::Total, model_provider_id, model_provider, personality: None,