
Add session-level support for custom @scorer decorator #21106

Merged
smoorjani merged 8 commits into mlflow:master from smoorjani:custom-code-session-scorer on Feb 26, 2026

Conversation


smoorjani (Collaborator) commented Feb 24, 2026

Related Issues/PRs

#xxx

What changes are proposed in this pull request?

Auto-detect session-level scorers by inspecting the function signature for a `session` parameter. Validate at decoration time that `session` is not combined with the single-turn parameters (`inputs`, `outputs`, `trace`).
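
A minimal sketch of the rule (the helper name and error message below are illustrative, not the actual implementation in mlflow/genai/scorers/base.py):

import inspect

_SINGLE_TURN_PARAMS = {"inputs", "outputs", "trace"}


def _is_session_level(func) -> bool:
    # A scorer is session-level iff it declares a `session` parameter;
    # mixing `session` with single-turn parameters fails at decoration time.
    params = set(inspect.signature(func).parameters)
    overlap = params & _SINGLE_TURN_PARAMS
    if "session" in params and overlap:
        raise ValueError(f"@scorer cannot combine `session` with {sorted(overlap)}")
    return "session" in params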

How is this PR tested?

  • Existing unit/integration tests
  • New unit/integration tests
  • Manual tests

OSS

import time
import uuid

import mlflow
from mlflow.entities import Feedback, SpanType
from mlflow.genai.scorers import scorer
from mlflow.genai.scorers.registry import get_scorer, list_scorers
from mlflow.tracing.constant import TraceMetadataKey

mlflow.set_tracking_uri("http://localhost:5000")

# ── Define scorers ───────────────────────────────────────────────────────────


@scorer
def response_length(outputs) -> Feedback:
    length = len(str(outputs))
    return Feedback(value=length > 20, rationale=f"Response length: {length}")


@scorer
def conversation_completeness(session) -> Feedback:
    total_turns = len(session)
    error_turns = sum(1 for t in session if t.info.status == "ERROR")
    return Feedback(value=error_turns == 0, rationale=f"{total_turns} turns, {error_turns} errors")


@scorer
def session_goal_met(session, expectations) -> Feedback:
    last_trace = max(session, key=lambda t: t.info.request_time)
    goal = expectations.get("goal", "")
    response = str(last_trace.data.response) if last_trace.data else ""
    met = goal.lower() in response.lower() if goal else True
    return Feedback(value=met, rationale=f"Goal '{goal}' in final response: {met}")


# ── Auto-detection ───────────────────────────────────────────────────────────

assert response_length.is_session_level_scorer is False
assert conversation_completeness.is_session_level_scorer is True
assert session_goal_met.is_session_level_scorer is True
print("Auto-detection OK")

# ── Register scorers ────────────────────────────────────────────────────────

experiment_name = f"demo-session-scorer-{uuid.uuid4().hex[:8]}"
mlflow.set_experiment(experiment_name)
experiment_id = mlflow.get_experiment_by_name(experiment_name).experiment_id

reg_single = response_length.register(name="response_length", experiment_id=experiment_id)
reg_session = conversation_completeness.register(
    name="conversation_completeness", experiment_id=experiment_id
)
reg_goal = session_goal_met.register(name="session_goal_met", experiment_id=experiment_id)
print(f"Registered: {reg_single.name}, {reg_session.name}, {reg_goal.name}")

scorers = list_scorers(experiment_id=experiment_id)
assert len(scorers) == 3
print(f"Listed {len(scorers)} scorers")

retrieved = get_scorer(name="response_length", experiment_id=experiment_id)
assert retrieved.name == "response_length"
print(f"Retrieved: {retrieved.name}")

# ── Create session traces ────────────────────────────────────────────────────

session_id = f"session-{uuid.uuid4().hex[:8]}"


def create_trace(name, user_input, bot_response, sid=None):
    # Tag the trace with the session ID via trace metadata when sid is given.
    metadata = {TraceMetadataKey.TRACE_SESSION: sid} if sid else None
    span = mlflow.start_span_no_context(
        name, span_type=SpanType.CHAIN, inputs={"user_input": user_input}, metadata=metadata
    )
    time.sleep(0.05)
    span.set_outputs({"bot_response": bot_response})
    span.end()


create_trace("turn_1", "What is MLflow?", "MLflow is an open-source ML platform.", session_id)
create_trace("turn_2", "How do I track experiments?", "Use mlflow.log_param().", session_id)
create_trace("turn_3", "Thanks!", "You're welcome!", session_id)
create_trace("standalone", "Quick question", "Short answer")

# Give trace export a moment to complete before searching.
time.sleep(3)
all_traces = mlflow.search_traces(locations=[experiment_id], return_type="list")
session_traces = [
    t for t in all_traces if t.info.trace_metadata.get(TraceMetadataKey.TRACE_SESSION) == session_id
]
assert len(all_traces) == 4
assert len(session_traces) == 3
print(f"Created {len(all_traces)} traces ({len(session_traces)} in session)")

# ── Run scorers ──────────────────────────────────────────────────────────────

r = response_length.run(outputs="MLflow is an open-source platform for ML lifecycle management.")
print(f"response_length: value={r.value}, rationale='{r.rationale}'")
assert r.value is True

r = conversation_completeness.run(session=session_traces)
print(f"conversation_completeness: value={r.value}, rationale='{r.rationale}'")
assert r.value is True

r = session_goal_met.run(session=session_traces, expectations={"goal": "welcome"})
print(f"session_goal_met: value={r.value}, rationale='{r.rationale}'")
assert r.value is True

Databricks

import os
import time
import uuid

import mlflow
from mlflow.entities import Feedback, SpanType
from mlflow.genai.scorers import ScorerSamplingConfig, scorer
from mlflow.tracing.constant import TraceMetadataKey

EXPERIMENT_ID = "98459650931566"

mlflow.set_tracking_uri("databricks")
mlflow.set_experiment(experiment_id=EXPERIMENT_ID)

# ── Define scorers

@scorer
def response_length(outputs) -> bool:
    return len(str(outputs)) > 20


@scorer
def conversation_completeness(session) -> Feedback:
    total_turns = len(session)
    error_turns = sum(1 for t in session if t.info.status == "ERROR")
    return Feedback(value=error_turns == 0, rationale=f"{total_turns} turns, {error_turns} errors")


assert response_length.is_session_level_scorer is False
assert conversation_completeness.is_session_level_scorer is True
print("Auto-detection OK")

# ── Register & start scorers

run_id = uuid.uuid4().hex[:8]

reg_single = response_length.register(name=f"response_length_{run_id}", experiment_id=EXPERIMENT_ID)
reg_session = conversation_completeness.register(
    name=f"conversation_completeness_{run_id}", experiment_id=EXPERIMENT_ID
)
print(f"Registered {reg_single.name} -> {reg_single.status}")
print(f"Registered {reg_session.name} -> {reg_session.status}")

started_single = reg_single.start(
    experiment_id=EXPERIMENT_ID,
    sampling_config=ScorerSamplingConfig(sample_rate=1.0),
)
print(f"Started {started_single.name} -> {started_single.status}")

# Session-level scorers can be registered but not yet scheduled on the Databricks backend
try:
    reg_session.start(
        experiment_id=EXPERIMENT_ID,
        sampling_config=ScorerSamplingConfig(sample_rate=1.0),
    )
except Exception as e:
    print(f"Session scorer start() not yet supported server-side: {type(e).__name__}")

# ── Create session traces

session_id = f"session-{uuid.uuid4().hex[:8]}"


def create_trace(name, user_input, bot_response, sid=None):
    # Tag the trace with the session ID via trace metadata when sid is given.
    metadata = {TraceMetadataKey.TRACE_SESSION: sid} if sid else None
    span = mlflow.start_span_no_context(
        name, span_type=SpanType.CHAIN, inputs={"user_input": user_input}, metadata=metadata
    )
    time.sleep(0.05)
    span.set_outputs({"bot_response": bot_response})
    span.end()


create_trace("turn_1", "What is MLflow?", "MLflow is an open-source ML platform.", session_id)
create_trace("turn_2", "How do I track experiments?", "Use mlflow.log_param().", session_id)
create_trace("turn_3", "Thanks!", "You're welcome!", session_id)
create_trace("standalone", "Quick question", "Short answer")

# Give trace export a moment to complete before searching.
time.sleep(5)
all_traces = mlflow.search_traces(locations=[EXPERIMENT_ID], return_type="list")
session_traces = [
    t for t in all_traces if t.info.trace_metadata.get(TraceMetadataKey.TRACE_SESSION) == session_id
]
print(f"Created traces ({len(session_traces)} in session)")

# ── Run scorers directly

r = response_length.run(outputs="MLflow is an open-source platform for ML lifecycle management.")
print(f"response_length: value={r}")

r = conversation_completeness.run(session=session_traces)
print(f"conversation_completeness: value={r.value}, rationale='{r.rationale}'")

# ── Clean up: stop scorers

started_single.stop(experiment_id=EXPERIMENT_ID)
print("Stopped scorer")

print("\nAll checks passed!")

Does this PR require documentation update?

  • No. You can skip the rest of this section.
  • Yes. I've updated:
    • Examples
    • API references
    • Instructions

Does this PR require updating the MLflow Skills repository?

  • No. You can skip the rest of this section.
  • Yes. Please link the corresponding PR or explain how you plan to update it.

Release Notes

Is this a user-facing change?

  • No. You can skip the rest of this section.
  • Yes. Give a description of this change to be included in the release notes for MLflow users.

Support custom-code session-level scorers.

What component(s), interfaces, languages, and integrations does this PR affect?

Components

  • area/tracking: Tracking Service, tracking client APIs, autologging
  • area/models: MLmodel format, model serialization/deserialization, flavors
  • area/model-registry: Model Registry service, APIs, and the fluent client calls for Model Registry
  • area/scoring: MLflow Model server, model deployment tools, Spark UDFs
  • area/evaluation: MLflow model evaluation features, evaluation metrics, and evaluation workflows
  • area/gateway: MLflow AI Gateway client APIs, server, and third-party integrations
  • area/prompts: MLflow prompt engineering features, prompt templates, and prompt management
  • area/tracing: MLflow Tracing features, tracing APIs, and LLM tracing functionality
  • area/projects: MLproject format, project running backends
  • area/uiux: Front-end, user experience, plotting, JavaScript, JavaScript dev server
  • area/build: Build and test infrastructure for MLflow
  • area/docs: MLflow documentation pages

How should the PR be classified in the release notes? Choose one:

  • rn/none - No description will be included. The PR will be mentioned only by the PR number in the "Small Bugfixes and Documentation Updates" section
  • rn/breaking-change - The PR will be mentioned in the "Breaking Changes" section
  • rn/feature - A new user-facing feature worth mentioning in the release notes
  • rn/bug-fix - A user-facing bug fix worth mentioning in the release notes
  • rn/documentation - A user-facing documentation change worth mentioning in the release notes

Should this PR be included in the next patch release?

Yes should be selected for bug fixes, documentation updates, and other small changes. No should be selected for new features and larger changes. If you're unsure about the release classification of this PR, leave this unchecked to let the maintainers decide.

What is a minor/patch release?
  • Minor release: a release that increments the second part of the version number (e.g., 1.2.0 -> 1.3.0).
    Bug fixes, doc updates and new features usually go into minor releases.
  • Patch release: a release that increments the third part of the version number (e.g., 1.2.0 -> 1.2.1).
    Bug fixes and doc updates usually go into patch releases.
  • Yes (this PR will be cherry-picked and included in the next patch release)
  • No (this PR will be included in the next minor release)

github-actions Bot added the size/M and area/evaluation (MLflow Evaluation) labels on Feb 24, 2026
github-actions Bot (Contributor) commented:

🛠 DevTools 🛠

Install mlflow from this PR

# mlflow
pip install git+https://github.com/mlflow/mlflow.git@refs/pull/21106/merge
# mlflow-skinny
pip install git+https://github.com/mlflow/mlflow.git@refs/pull/21106/merge#subdirectory=libs/skinny

For Databricks, use the following command:

%sh curl -LsSf https://raw.githubusercontent.com/mlflow/mlflow/HEAD/dev/install-skinny.sh | sh -s pull/21106/merge

github-actions Bot added the rn/none (List under Small Changes in Changelogs) and v3.10.1 labels on Feb 24, 2026

github-actions Bot commented Feb 24, 2026

Documentation preview for e712774 is available at:

More info
  • Ignore this comment if this PR does not change the documentation.
  • The preview is updated when a new commit is pushed to this PR.
  • This comment was created by this workflow run.
  • The documentation was built by this workflow run.

Comment thread: mlflow/genai/scorers/base.py (Outdated)
Comment thread: tests/genai/scorers/test_scorer.py

@smoorjani smoorjani enabled auto-merge February 26, 2026 19:15
@smoorjani smoorjani disabled auto-merge February 26, 2026 19:16
@smoorjani smoorjani force-pushed the custom-code-session-scorer branch from e976b3a to 7cbec7a on February 26, 2026 21:28
smoorjani and others added 8 commits February 26, 2026 14:02
Auto-detect session-level scorers by inspecting the function signature
for a `session` parameter. Validate at decoration time that `session`
is not combined with single-turn parameters (inputs, outputs, trace).

Co-Authored-By: Claude <noreply@anthropic.com>
Signed-off-by: Samraj Moorjani <samraj.moorjani@databricks.com>
Co-Authored-By: Claude <noreply@anthropic.com>
Signed-off-by: Samraj Moorjani <samraj.moorjani@databricks.com>
The security check in _reconstruct_decorator_scorer was blocking
deserialization for remote Databricks access. This prevented
registering custom @scorer scorers from a local machine. Relax the
check to allow exec when connected to a Databricks workspace, since
the serialized code originates from the user's local function.

Co-Authored-By: Claude <noreply@anthropic.com>
Signed-off-by: Samraj Moorjani <samraj.moorjani@databricks.com>
- Use ClassVar instead of closure for is_session_level_scorer (see the sketch after this commit message)
- Test full serialization roundtrip (dump + load)
- Add session-level scorer invocation test with mock traces
- Move imports back to top level

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Signed-off-by: Samraj Moorjani <samraj.moorjani@databricks.com>
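
A rough sketch of the ClassVar approach from the first bullet above (class shape assumed for illustration; the real definition lives in mlflow/genai/scorers/base.py):

from typing import ClassVar


class Scorer:
    # Default for single-turn scorers. The @scorer decorator generates a
    # subclass that overrides this, so the flag lives on the class itself
    # rather than in a captured closure and survives serialization.
    is_session_level_scorer: ClassVar[bool] = False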
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Signed-off-by: Samraj Moorjani <samraj.moorjani@databricks.com>
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Signed-off-by: Samraj Moorjani <samraj.moorjani@databricks.com>
Remove is_in_databricks_runtime() from the deserialization guard since
it doesn't effectively prevent code execution risks (scorers in
Databricks runtime can still access critical info and make network
requests). The real security wall is the registration gate (Databricks
URI check), which ensures only authenticated users can register scorers.

Keeping is_in_databricks_runtime() would also theoretically allow
loading a scorer from an OSS server in Databricks runtime, so removing
it is strictly safer.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Signed-off-by: Samraj Moorjani <samraj.moorjani@databricks.com>
Remove is_in_databricks_runtime patches from tests since the import was
removed from base.py. Tests now only mock is_databricks_uri.

Co-Authored-By: Claude <noreply@anthropic.com>
Signed-off-by: Samraj Moorjani <samraj.moorjani@databricks.com>
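
Taken together, the guard changes above reduce the deserialization gate to a Databricks-URI check. A minimal sketch under that reading (the helper name is hypothetical; is_databricks_uri is the predicate the updated tests mock):

from mlflow.utils.uri import is_databricks_uri


def _can_exec_serialized_scorer(tracking_uri: str) -> bool:
    # exec() of serialized @scorer code is allowed only against a Databricks
    # workspace, where the registration gate (authenticated users only)
    # vouches for the code's origin.
    return is_databricks_uri(tracking_uri)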
@smoorjani smoorjani force-pushed the custom-code-session-scorer branch from 7cbec7a to e712774 on February 26, 2026 22:03
@smoorjani smoorjani enabled auto-merge February 26, 2026 23:25
@smoorjani smoorjani added this pull request to the merge queue Feb 26, 2026
Merged via the queue into mlflow:master with commit 9087cd3 Feb 26, 2026
69 of 71 checks passed
@smoorjani smoorjani deleted the custom-code-session-scorer branch February 26, 2026 23:41
@github-actions github-actions Bot added size/L Large PR (200-499 LoC) and removed size/M labels Feb 26, 2026
daniellok-db pushed a commit to daniellok-db/mlflow that referenced this pull request Mar 5, 2026
Signed-off-by: Samraj Moorjani <samraj.moorjani@databricks.com>
Co-authored-by: Claude <noreply@anthropic.com>
daniellok-db pushed a commit to daniellok-db/mlflow that referenced this pull request Mar 5, 2026
Signed-off-by: Samraj Moorjani <samraj.moorjani@databricks.com>
Co-authored-by: Claude <noreply@anthropic.com>
daniellok-db pushed a commit that referenced this pull request Mar 5, 2026
Signed-off-by: Samraj Moorjani <samraj.moorjani@databricks.com>
Co-authored-by: Claude <noreply@anthropic.com>

Labels

area/evaluation (MLflow Evaluation), rn/none (List under Small Changes in Changelogs), size/L (Large PR, 200-499 LoC), v3.10.1


3 participants