
Add session-level support for custom @scorer decorator #21106

Merged
smoorjani merged 8 commits into mlflow:master from smoorjani:custom-code-session-scorer on Feb 26, 2026

Conversation


smoorjani (Collaborator) commented Feb 24, 2026

Related Issues/PRs

#xxx

What changes are proposed in this pull request?

Auto-detect session-level scorers by inspecting the function signature for a `session` parameter. Validate at decoration time that `session` is not combined with the single-turn parameters (`inputs`, `outputs`, `trace`).
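
A minimal sketch of the rule (the helper name and error message below are illustrative, not the actual implementation in mlflow/genai/scorers/base.py):

import inspect

_SINGLE_TURN_PARAMS = {"inputs", "outputs", "trace"}


def _is_session_level(func) -> bool:
    # A scorer is session-level iff it declares a `session` parameter;
    # mixing `session` with single-turn parameters fails at decoration time.
    params = set(inspect.signature(func).parameters)
    overlap = params & _SINGLE_TURN_PARAMS
    if "session" in params and overlap:
        raise ValueError(f"@scorer cannot combine `session` with {sorted(overlap)}")
    return "session" in params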

How is this PR tested?

  • Existing unit/integration tests
  • New unit/integration tests
  • Manual tests

OSS

import time
import uuid

import mlflow
from mlflow.entities import Feedback, SpanType
from mlflow.genai.scorers import scorer
from mlflow.genai.scorers.registry import get_scorer, list_scorers
from mlflow.tracing.constant import TraceMetadataKey

mlflow.set_tracking_uri("http://localhost:5000")

# ── Define scorers ───────────────────────────────────────────────────────────


@scorer
def response_length(outputs) -> Feedback:
    length = len(str(outputs))
    return Feedback(value=length > 20, rationale=f"Response length: {length}")


@scorer
def conversation_completeness(session) -> Feedback:
    total_turns = len(session)
    error_turns = sum(1 for t in session if t.info.status == "ERROR")
    return Feedback(value=error_turns == 0, rationale=f"{total_turns} turns, {error_turns} errors")


@scorer
def session_goal_met(session, expectations) -> Feedback:
    last_trace = max(session, key=lambda t: t.info.request_time)
    goal = expectations.get("goal", "")
    response = str(last_trace.data.response) if last_trace.data else ""
    met = goal.lower() in response.lower() if goal else True
    return Feedback(value=met, rationale=f"Goal '{goal}' in final response: {met}")


# ── Auto-detection ───────────────────────────────────────────────────────────

assert response_length.is_session_level_scorer is False
assert conversation_completeness.is_session_level_scorer is True
assert session_goal_met.is_session_level_scorer is True
print("Auto-detection OK")

# ── Register scorers ────────────────────────────────────────────────────────

experiment_name = f"demo-session-scorer-{uuid.uuid4().hex[:8]}"
mlflow.set_experiment(experiment_name)
experiment_id = mlflow.get_experiment_by_name(experiment_name).experiment_id

reg_single = response_length.register(name="response_length", experiment_id=experiment_id)
reg_session = conversation_completeness.register(
    name="conversation_completeness", experiment_id=experiment_id
)
reg_goal = session_goal_met.register(name="session_goal_met", experiment_id=experiment_id)
print(f"Registered: {reg_single.name}, {reg_session.name}, {reg_goal.name}")

scorers = list_scorers(experiment_id=experiment_id)
assert len(scorers) == 3
print(f"Listed {len(scorers)} scorers")

retrieved = get_scorer(name="response_length", experiment_id=experiment_id)
assert retrieved.name == "response_length"
print(f"Retrieved: {retrieved.name}")

# ── Create session traces ────────────────────────────────────────────────────

session_id = f"session-{uuid.uuid4().hex[:8]}"


def create_trace(name, user_input, bot_response, sid=None):
    # Tag the trace with the session ID via trace metadata when sid is given.
    metadata = {TraceMetadataKey.TRACE_SESSION: sid} if sid else None
    span = mlflow.start_span_no_context(
        name, span_type=SpanType.CHAIN, inputs={"user_input": user_input}, metadata=metadata
    )
    time.sleep(0.05)
    span.set_outputs({"bot_response": bot_response})
    span.end()


create_trace("turn_1", "What is MLflow?", "MLflow is an open-source ML platform.", session_id)
create_trace("turn_2", "How do I track experiments?", "Use mlflow.log_param().", session_id)
create_trace("turn_3", "Thanks!", "You're welcome!", session_id)
create_trace("standalone", "Quick question", "Short answer")

# Give trace export a moment to complete before searching.
time.sleep(3)
all_traces = mlflow.search_traces(locations=[experiment_id], return_type="list")
session_traces = [
    t for t in all_traces if t.info.trace_metadata.get(TraceMetadataKey.TRACE_SESSION) == session_id
]
assert len(all_traces) == 4
assert len(session_traces) == 3
print(f"Created {len(all_traces)} traces ({len(session_traces)} in session)")

# ── Run scorers ──────────────────────────────────────────────────────────────

r = response_length.run(outputs="MLflow is an open-source platform for ML lifecycle management.")
print(f"response_length: value={r.value}, rationale='{r.rationale}'")
assert r.value is True

r = conversation_completeness.run(session=session_traces)
print(f"conversation_completeness: value={r.value}, rationale='{r.rationale}'")
assert r.value is True

r = session_goal_met.run(session=session_traces, expectations={"goal": "welcome"})
print(f"session_goal_met: value={r.value}, rationale='{r.rationale}'")
assert r.value is True

Databricks

import os
import time
import uuid

import mlflow
from mlflow.entities import Feedback, SpanType
from mlflow.genai.scorers import ScorerSamplingConfig, scorer
from mlflow.tracing.constant import TraceMetadataKey

EXPERIMENT_ID = "98459650931566"

mlflow.set_tracking_uri("databricks")
mlflow.set_experiment(experiment_id=EXPERIMENT_ID)

# ── Define scorers

@scorer
def response_length(outputs) -> bool:
    return len(str(outputs)) > 20


@scorer
def conversation_completeness(session) -> Feedback:
    total_turns = len(session)
    error_turns = sum(1 for t in session if t.info.status == "ERROR")
    return Feedback(value=error_turns == 0, rationale=f"{total_turns} turns, {error_turns} errors")


assert response_length.is_session_level_scorer is False
assert conversation_completeness.is_session_level_scorer is True
print("Auto-detection OK")

# ── Register & start scorers

run_id = uuid.uuid4().hex[:8]

reg_single = response_length.register(name=f"response_length_{run_id}", experiment_id=EXPERIMENT_ID)
reg_session = conversation_completeness.register(
    name=f"conversation_completeness_{run_id}", experiment_id=EXPERIMENT_ID
)
print(f"Registered {reg_single.name} -> {reg_single.status}")
print(f"Registered {reg_session.name} -> {reg_session.status}")

started_single = reg_single.start(
    experiment_id=EXPERIMENT_ID,
    sampling_config=ScorerSamplingConfig(sample_rate=1.0),
)
print(f"Started {started_single.name} -> {started_single.status}")

# Session-level scorers can be registered but not yet scheduled on the Databricks backend
try:
    reg_session.start(
        experiment_id=EXPERIMENT_ID,
        sampling_config=ScorerSamplingConfig(sample_rate=1.0),
    )
except Exception as e:
    print(f"Session scorer start() not yet supported server-side: {type(e).__name__}")

# ── Create session traces

session_id = f"session-{uuid.uuid4().hex[:8]}"


def create_trace(name, user_input, bot_response, sid=None):
    # Tag the trace with the session ID via trace metadata when sid is given.
    metadata = {TraceMetadataKey.TRACE_SESSION: sid} if sid else None
    span = mlflow.start_span_no_context(
        name, span_type=SpanType.CHAIN, inputs={"user_input": user_input}, metadata=metadata
    )
    time.sleep(0.05)
    span.set_outputs({"bot_response": bot_response})
    span.end()


create_trace("turn_1", "What is MLflow?", "MLflow is an open-source ML platform.", session_id)
create_trace("turn_2", "How do I track experiments?", "Use mlflow.log_param().", session_id)
create_trace("turn_3", "Thanks!", "You're welcome!", session_id)
create_trace("standalone", "Quick question", "Short answer")

# Give trace export a moment to complete before searching.
time.sleep(5)
all_traces = mlflow.search_traces(locations=[EXPERIMENT_ID], return_type="list")
session_traces = [
    t for t in all_traces if t.info.trace_metadata.get(TraceMetadataKey.TRACE_SESSION) == session_id
]
print(f"Created traces ({len(session_traces)} in session)")

# ── Run scorers directly

r = response_length.run(outputs="MLflow is an open-source platform for ML lifecycle management.")
print(f"response_length: value={r}")

r = conversation_completeness.run(session=session_traces)
print(f"conversation_completeness: value={r.value}, rationale='{r.rationale}'")

# ── Clean up: stop scorers

started_single.stop(experiment_id=EXPERIMENT_ID)
print("Stopped scorer")

print("\nAll checks passed!")

Does this PR require documentation update?

  • No. You can skip the rest of this section.
  • Yes. I've updated:
    • Examples
    • API references
    • Instructions

Does this PR require updating the MLflow Skills repository?

  • No. You can skip the rest of this section.
  • Yes. Please link the corresponding PR or explain how you plan to update it.

Release Notes

Is this a user-facing change?

  • No. You can skip the rest of this section.
  • Yes. Give a description of this change to be included in the release notes for MLflow users.

Support custom-code session-level scorers.

What component(s), interfaces, languages, and integrations does this PR affect?

Components

  • area/tracking: Tracking Service, tracking client APIs, autologging
  • area/models: MLmodel format, model serialization/deserialization, flavors
  • area/model-registry: Model Registry service, APIs, and the fluent client calls for Model Registry
  • area/scoring: MLflow Model server, model deployment tools, Spark UDFs
  • area/evaluation: MLflow model evaluation features, evaluation metrics, and evaluation workflows
  • area/gateway: MLflow AI Gateway client APIs, server, and third-party integrations
  • area/prompts: MLflow prompt engineering features, prompt templates, and prompt management
  • area/tracing: MLflow Tracing features, tracing APIs, and LLM tracing functionality
  • area/projects: MLproject format, project running backends
  • area/uiux: Front-end, user experience, plotting, JavaScript, JavaScript dev server
  • area/build: Build and test infrastructure for MLflow
  • area/docs: MLflow documentation pages

How should the PR be classified in the release notes? Choose one:

  • rn/none - No description will be included. The PR will be mentioned only by the PR number in the "Small Bugfixes and Documentation Updates" section
  • rn/breaking-change - The PR will be mentioned in the "Breaking Changes" section
  • rn/feature - A new user-facing feature worth mentioning in the release notes
  • rn/bug-fix - A user-facing bug fix worth mentioning in the release notes
  • rn/documentation - A user-facing documentation change worth mentioning in the release notes

Should this PR be included in the next patch release?

Yes should be selected for bug fixes, documentation updates, and other small changes. No should be selected for new features and larger changes. If you're unsure about the release classification of this PR, leave this unchecked to let the maintainers decide.

What is a minor/patch release?
  • Minor release: a release that increments the second part of the version number (e.g., 1.2.0 -> 1.3.0).
    Bug fixes, doc updates and new features usually go into minor releases.
  • Patch release: a release that increments the third part of the version number (e.g., 1.2.0 -> 1.2.1).
    Bug fixes and doc updates usually go into patch releases.
  • Yes (this PR will be cherry-picked and included in the next patch release)
  • No (this PR will be included in the next minor release)

github-actions Bot added the size/M and area/evaluation (MLflow Evaluation) labels on Feb 24, 2026
github-actions Bot (Contributor) commented:

🛠 DevTools 🛠

Install mlflow from this PR

# mlflow
pip install git+https://github.com/mlflow/mlflow.git@refs/pull/21106/merge
# mlflow-skinny
pip install git+https://github.com/mlflow/mlflow.git@refs/pull/21106/merge#subdirectory=libs/skinny

For Databricks, use the following command:

%sh curl -LsSf https://raw.githubusercontent.com/mlflow/mlflow/HEAD/dev/install-skinny.sh | sh -s pull/21106/merge

github-actions Bot added the rn/none (List under Small Changes in Changelogs) and v3.10.1 labels on Feb 24, 2026

github-actions Bot commented Feb 24, 2026

Documentation preview for e712774 is available at:

More info
  • Ignore this comment if this PR does not change the documentation.
  • The preview is updated when a new commit is pushed to this PR.
  • This comment was created by this workflow run.
  • The documentation was built by this workflow run.

Comment thread: mlflow/genai/scorers/base.py (Outdated)
Comment thread: tests/genai/scorers/test_scorer.py

@smoorjani smoorjani enabled auto-merge February 26, 2026 19:15
@smoorjani smoorjani disabled auto-merge February 26, 2026 19:16
@smoorjani smoorjani force-pushed the custom-code-session-scorer branch from e976b3a to 7cbec7a on February 26, 2026 21:28
smoorjani and others added 8 commits February 26, 2026 14:02
Auto-detect session-level scorers by inspecting the function signature
for a `session` parameter. Validate at decoration time that `session`
is not combined with single-turn parameters (inputs, outputs, trace).

Co-Authored-By: Claude <noreply@anthropic.com>
Signed-off-by: Samraj Moorjani <samraj.moorjani@databricks.com>
Co-Authored-By: Claude <noreply@anthropic.com>
Signed-off-by: Samraj Moorjani <samraj.moorjani@databricks.com>
The security check in _reconstruct_decorator_scorer was blocking
deserialization for remote Databricks access. This prevented
registering custom @scorer scorers from a local machine. Relax the
check to allow exec when connected to a Databricks workspace, since
the serialized code originates from the user's local function.

Co-Authored-By: Claude <noreply@anthropic.com>
Signed-off-by: Samraj Moorjani <samraj.moorjani@databricks.com>
- Use ClassVar instead of closure for is_session_level_scorer (see the sketch after this commit message)
- Test full serialization roundtrip (dump + load)
- Add session-level scorer invocation test with mock traces
- Move imports back to top level

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Signed-off-by: Samraj Moorjani <samraj.moorjani@databricks.com>
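
A rough sketch of the ClassVar approach from the first bullet above (class shape assumed for illustration; the real definition lives in mlflow/genai/scorers/base.py):

from typing import ClassVar


class Scorer:
    # Default for single-turn scorers. The @scorer decorator generates a
    # subclass that overrides this, so the flag lives on the class itself
    # rather than in a captured closure and survives serialization.
    is_session_level_scorer: ClassVar[bool] = False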
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Signed-off-by: Samraj Moorjani <samraj.moorjani@databricks.com>
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Signed-off-by: Samraj Moorjani <samraj.moorjani@databricks.com>
Remove is_in_databricks_runtime() from the deserialization guard since
it doesn't effectively prevent code execution risks (scorers in
Databricks runtime can still access critical info and make network
requests). The real security wall is the registration gate (Databricks
URI check), which ensures only authenticated users can register scorers.

Keeping is_in_databricks_runtime() would also theoretically allow
loading a scorer from an OSS server in Databricks runtime, so removing
it is strictly safer.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Signed-off-by: Samraj Moorjani <samraj.moorjani@databricks.com>
Remove is_in_databricks_runtime patches from tests since the import was
removed from base.py. Tests now only mock is_databricks_uri.

Co-Authored-By: Claude <noreply@anthropic.com>
Signed-off-by: Samraj Moorjani <samraj.moorjani@databricks.com>
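
Taken together, the guard changes above reduce the deserialization gate to a Databricks-URI check. A minimal sketch under that reading (the helper name is hypothetical; is_databricks_uri is the predicate the updated tests mock):

from mlflow.utils.uri import is_databricks_uri


def _can_exec_serialized_scorer(tracking_uri: str) -> bool:
    # exec() of serialized @scorer code is allowed only against a Databricks
    # workspace, where the registration gate (authenticated users only)
    # vouches for the code's origin.
    return is_databricks_uri(tracking_uri)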
@smoorjani smoorjani force-pushed the custom-code-session-scorer branch from 7cbec7a to e712774 on February 26, 2026 22:03
@smoorjani smoorjani enabled auto-merge February 26, 2026 23:25
@smoorjani smoorjani added this pull request to the merge queue Feb 26, 2026
Merged via the queue into mlflow:master with commit 9087cd3 Feb 26, 2026
69 of 71 checks passed
@smoorjani smoorjani deleted the custom-code-session-scorer branch February 26, 2026 23:41
@github-actions github-actions Bot added size/L Large PR (200-499 LoC) and removed size/M labels Feb 26, 2026
daniellok-db pushed a commit to daniellok-db/mlflow that referenced this pull request Mar 5, 2026
Signed-off-by: Samraj Moorjani <samraj.moorjani@databricks.com>
Co-authored-by: Claude <noreply@anthropic.com>
daniellok-db pushed a commit to daniellok-db/mlflow that referenced this pull request Mar 5, 2026
Signed-off-by: Samraj Moorjani <samraj.moorjani@databricks.com>
Co-authored-by: Claude <noreply@anthropic.com>
daniellok-db pushed a commit that referenced this pull request Mar 5, 2026
Signed-off-by: Samraj Moorjani <samraj.moorjani@databricks.com>
Co-authored-by: Claude <noreply@anthropic.com>

Labels

area/evaluation (MLflow Evaluation), rn/none (List under Small Changes in Changelogs), size/L (Large PR, 200-499 LoC), v3.10.1


3 participants