Skip to content

Commit ab719a3

Browse files
hhoikoo and claude committed
feat(BA-4330): Add configurable BatchSpanProcessor queue and batch size
Add max_queue_size and max_export_batch_size to OTELConfig and OpenTelemetrySpec, defaulting to 65536 and 4096 respectively. The SDK defaults (2048/512) are insufficient for production GraphQL workloads and cause span drops during burst traffic.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 9dcb2a1 commit ab719a3

8 files changed

Lines changed: 55 additions & 3 deletions

File tree

src/ai/backend/agent/server.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1521,6 +1521,8 @@ async def service_discovery_ctx(
15211521
endpoint=local_config.otel.endpoint,
15221522
service_instance_id=meta.id,
15231523
service_instance_name=meta.display_name,
1524+
max_queue_size=local_config.otel.max_queue_size,
1525+
max_export_batch_size=local_config.otel.max_export_batch_size,
15241526
)
15251527
BraceStyleAdapter.apply_otel(otel_spec)
15261528
try:

src/ai/backend/appproxy/coordinator/server.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -767,6 +767,8 @@ async def service_discovery_ctx(root_ctx: RootContext) -> AsyncIterator[None]:
767767
endpoint=root_ctx.local_config.otel.endpoint,
768768
service_instance_id=meta.id,
769769
service_instance_name=meta.display_name,
770+
max_queue_size=root_ctx.local_config.otel.max_queue_size,
771+
max_export_batch_size=root_ctx.local_config.otel.max_export_batch_size,
770772
)
771773
BraceStyleAdapter.apply_otel(otel_spec)
772774
try:

src/ai/backend/appproxy/worker/server.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -560,6 +560,8 @@ async def service_discovery_ctx(root_ctx: RootContext) -> AsyncIterator[None]:
560560
endpoint=root_ctx.local_config.otel.endpoint,
561561
service_instance_id=meta.id,
562562
service_instance_name=meta.display_name,
563+
max_queue_size=root_ctx.local_config.otel.max_queue_size,
564+
max_export_batch_size=root_ctx.local_config.otel.max_export_batch_size,
563565
)
564566
BraceStyleAdapter.apply_otel(otel_spec)
565567
try:

src/ai/backend/common/configs/otel.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,3 +58,36 @@ class OTELConfig(BaseConfigSchema):
5858
example=ConfigExample(local="http://localhost:4317", prod="http://otel-collector:4317"),
5959
),
6060
]
61+
max_queue_size: Annotated[
62+
int,
63+
Field(
64+
default=65536,
65+
validation_alias=AliasChoices("max-queue-size", "max_queue_size"),
66+
serialization_alias="max-queue-size",
67+
),
68+
BackendAIConfigMeta(
69+
description=(
70+
"Maximum number of spans queued for export. "
71+
"Spans are dropped when the queue is full. "
72+
"The default (65536) accommodates burst traffic from GraphQL workloads."
73+
),
74+
added_version="26.2.0",
75+
example=ConfigExample(local="2048", prod="65536"),
76+
),
77+
]
78+
max_export_batch_size: Annotated[
79+
int,
80+
Field(
81+
default=4096,
82+
validation_alias=AliasChoices("max-export-batch-size", "max_export_batch_size"),
83+
serialization_alias="max-export-batch-size",
84+
),
85+
BackendAIConfigMeta(
86+
description=(
87+
"Maximum number of spans exported in a single batch. "
88+
"Larger batches reduce export overhead but increase memory usage."
89+
),
90+
added_version="26.2.0",
91+
example=ConfigExample(local="512", prod="4096"),
92+
),
93+
]

src/ai/backend/logging/otel.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@ class OpenTelemetrySpec:
2525
endpoint: str
2626
service_instance_id: uuid.UUID
2727
service_instance_name: str
28+
max_queue_size: int
29+
max_export_batch_size: int
2830

2931
def to_resource(self) -> Resource:
3032
attributes = {
@@ -62,7 +64,11 @@ def apply_otel_loggers(loggers: Iterable[logging.Logger], spec: OpenTelemetrySpe
6264
def apply_otel_tracer(spec: OpenTelemetrySpec) -> None:
6365
tracer_provider = TracerProvider(resource=spec.to_resource())
6466
span_exporter = OTLPSpanExporter(endpoint=spec.endpoint)
65-
span_processor = BatchSpanProcessor(span_exporter)
67+
span_processor = BatchSpanProcessor(
68+
span_exporter,
69+
max_queue_size=spec.max_queue_size,
70+
max_export_batch_size=spec.max_export_batch_size,
71+
)
6672
tracer_provider.add_span_processor(span_processor)
6773
trace.set_tracer_provider(tracer_provider)
6874
logging.info("OpenTelemetry tracing initialized successfully.")

src/ai/backend/manager/server.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -844,13 +844,16 @@ async def service_discovery_ctx(root_ctx: RootContext) -> AsyncIterator[None]:
844844

845845
if root_ctx.config_provider.config.otel.enabled:
846846
meta = root_ctx.sd_loop.metadata
847+
otel_config = root_ctx.config_provider.config.otel
847848
otel_spec = OpenTelemetrySpec(
848849
service_name=meta.service_group,
849850
service_version=meta.version,
850-
log_level=root_ctx.config_provider.config.otel.log_level,
851-
endpoint=root_ctx.config_provider.config.otel.endpoint,
851+
log_level=otel_config.log_level,
852+
endpoint=otel_config.endpoint,
852853
service_instance_id=meta.id,
853854
service_instance_name=meta.display_name,
855+
max_queue_size=otel_config.max_queue_size,
856+
max_export_batch_size=otel_config.max_export_batch_size,
854857
)
855858
BraceStyleAdapter.apply_otel(otel_spec)
856859
try:

src/ai/backend/storage/server.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -543,6 +543,8 @@ async def service_discovery_ctx(
543543
endpoint=local_config.otel.endpoint,
544544
service_instance_id=meta.id,
545545
service_instance_name=meta.display_name,
546+
max_queue_size=local_config.otel.max_queue_size,
547+
max_export_batch_size=local_config.otel.max_export_batch_size,
546548
)
547549
BraceStyleAdapter.apply_otel(otel_spec)
548550
try:

src/ai/backend/web/server.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -917,6 +917,8 @@ async def service_discovery_ctx(config: WebServerUnifiedConfig) -> AsyncGenerato
917917
endpoint=config.otel.endpoint,
918918
service_instance_id=uuid.uuid4(),
919919
service_instance_name=instance_name,
920+
max_queue_size=config.otel.max_queue_size,
921+
max_export_batch_size=config.otel.max_export_batch_size,
920922
)
921923
BraceStyleAdapter.apply_otel(otel_spec)
922924
yield

0 commit comments

Comments
 (0)