Skip to content

parallelize lute lint#1088

Open
wmccrthy wants to merge 20 commits into
luau-lang:primaryfrom
wmccrthy:parallelize-lint
Open

parallelize lute lint#1088
wmccrthy wants to merge 20 commits into
luau-lang:primaryfrom
wmccrthy:parallelize-lint

Conversation

@wmccrthy
Copy link
Copy Markdown
Contributor

@wmccrthy wmccrthy commented May 9, 2026

Distribute walking input paths and linting files across workers. Number of workers capped by the minimum of available system threads and # of lintee files.

side note: it'd be reallyyyy sweet if we had documentation on the vm lib and how to use it, as it took me a while to figure out what it's constraints were and how to work around them. If i'm missing something, pls lmk 😄

Testing

All tests passing.

Also manually sanity checked on some local repos with the script in the collapsable below. Verified that every linted file had same the # of violations when linted with old / new build. Seeing pretty big improvements.
Comparing 5/08 nightly release (default) against release build from this branch (parallel)

Repo Old New Diff Diff % Old Violations New Violations Violation Diff
lua-apps 46.12s 7.75s 38.38s 83.2% 22943 22943 0
foundation 2.47s 0.56s 1.90s 77.2% 855 855 0
lute 0.65s 0.26s 0.39s 60.0% 23 23 0
Test script
local process = require("@std/process")
local path = require("@std/path")
local json = require("@std/json")
local types = require("./lute/cli/commands/lint/types")

local SEVERITY_MAP: { [number]: types.severity } = {
	[1] = "error",
	[2] = "warning",
	[3] = "info",
	[4] = "hint",
}

local function convertRange(range: any): { beginLine: number, beginColumn: number, endLine: number, endColumn: number }
	return {
		beginLine = range.start.line + 1,
		beginColumn = range.start.character + 1,
		endLine = range["end"].line + 1,
		endColumn = range["end"].character,
	}
end

local function convertDiagnostic(item: any, sourcepath: string): types.LintViolation
	local violation: types.LintViolation = {
		lintname = item.code,
		message = item.message,
		severity = (SEVERITY_MAP[item.severity] or "warning") :: types.severity,
		location = convertRange(item.range) :: any,
		sourcepath = path.parse(sourcepath),
		target = item.codeDescription,
	}

	if item.suggestedfix then
		violation.suggestedfix = {
			fix = item.suggestedfix.fix,
			location = if item.suggestedfix.range then convertRange(item.suggestedfix.range) :: any else nil,
		}
	end

	return violation
end

local function parseLintJsonOutput(jsonString: string): { [string]: { types.LintViolation } }
	local jsonStart = string.find(jsonString, "{")
	if not jsonStart then
		return {}
	end

	local parsed = json.deserialize(jsonString:sub(jsonStart, #jsonString)) :: any
	if not parsed or not parsed.items then
		return {}
	end

	local violationsByFile: { [string]: { types.LintViolation } } = {}

	for _, fileEntry in parsed.items do
		local filePath = fileEntry.uri :: string
		local violations: { types.LintViolation } = {}

		for _, item in fileEntry.items do
			table.insert(violations, convertDiagnostic(item, filePath))
		end

		violationsByFile[filePath] = violations
	end

	return violationsByFile
end

local function countViolations(results: { [string]: { types.LintViolation } }): number
	local count = 0
	for _, violations in results do
		count += #violations
	end
	return count
end

local function formatSeconds(value: number): string
	return string.format("%.2fs", value)
end

local function formatPercent(value: number): string
	return string.format("%.1f%%", value)
end

-- local build = path.join(".", "build", "xcode", "debug", "lute", "cli", "lute")

local releaseBuild = path.join(".", "build", "xcode", "release", "lute", "cli", "lute")

local matrix: { [string]: string } = {
	lute = "./",
	["lua-apps"] = "../roblox/lua-apps",
	["foundation"] = "../roblox/foundation",
}

type Result = {
	repo: string,
	old: number,
	new: number,
	diffSeconds: number,
	diffPercent: number,
	oldViolations: number,
	newViolations: number,
	violationDiff: number,
}

local results: { Result } = {}

for repo, repoPath in matrix do
	local start = os.clock()

	local normalLute = process.run({ "lute", "lint", "-j", repoPath })
	local normalTime = os.clock() - start

	start = os.clock()
	local parallelLint = process.run({ path.format(releaseBuild), "lint", "-j", repoPath })
	local parallelTime = os.clock() - start

	local specialResults = parseLintJsonOutput(parallelLint.stdout)

	local normalResults = parseLintJsonOutput(normalLute.stdout)

	for file, violations in specialResults do
		local normalViolations = normalResults[file]
		assert(normalViolations ~= nil, `Missing normal lint results for {file}`)
		assert(#violations == #normalViolations, `Violation count mismatch for {file}`)
	end

	for file, violations in normalResults do
		local specialViolations = specialResults[file]
		assert(specialViolations ~= nil, `Missing parallel lint results for {file}`)
		assert(#violations == #specialViolations, `Violation count mismatch for {file}`)
	end

	local normalViolationCount = countViolations(normalResults)
	local parallelViolationCount = countViolations(specialResults)
	local diffSeconds = normalTime - parallelTime
	local diffPercent = if normalTime > 0 then (diffSeconds / normalTime) * 100 else 0

	table.insert(results, {
		repo = repo,
		old = normalTime,
		new = parallelTime,
		diffSeconds = diffSeconds,
		diffPercent = diffPercent,
		oldViolations = normalViolationCount,
		newViolations = parallelViolationCount,
		violationDiff = parallelViolationCount - normalViolationCount,
	})
end

-- pretty print results in markdown table
print("| Repo | Old | New | Diff | Diff % | Old Violations | New Violations | Violation Diff |")
print("| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: |")

for _, result in results do
	print(
		`| {result.repo} | {formatSeconds(result.old)} | {formatSeconds(result.new)} | {formatSeconds(
			result.diffSeconds
		)} | {formatPercent(result.diffPercent)} | {result.oldViolations} | {result.newViolations} | {result.violationDiff} |`
	)
end

@wmccrthy wmccrthy changed the title parallelizing lute lint parallelize lute lint May 9, 2026
@wmccrthy wmccrthy marked this pull request as ready for review May 9, 2026 23:45
Comment thread lute/cli/commands/lint/init.luau Outdated
@skberkeley
Copy link
Copy Markdown
Contributor

This is a pretty large change and difficult to review at the moment. It seems like this is currently a refactor combined with the functional changes to parallelize linting. If that's the case, could I ask that you split out the refactor part into a separate PR to make it easier to review?

@wmccrthy
Copy link
Copy Markdown
Contributor Author

This is a pretty large change and difficult to review at the moment. It seems like this is currently a refactor combined with the functional changes to parallelize linting. If that's the case, could I ask that you split out the refactor part into a separate PR to make it easier to review?

Sure! Some of the refactor is tightly coupled to the constraints of using vm, so even with an additional PR I will have to move a couple things around from the refactor PR -> parallel PR, but it should be more digestible regardless

@wmccrthy
Copy link
Copy Markdown
Contributor Author

wmccrthy commented May 11, 2026

This is a pretty large change and difficult to review at the moment. It seems like this is currently a refactor combined with the functional changes to parallelize linting. If that's the case, could I ask that you split out the refactor part into a separate PR to make it easier to review?

@skberkeley refactor up here, i'll rebase and clean up this branch once that is merged

skberkeley pushed a commit that referenced this pull request May 14, 2026
Separating refactor from parallelization change, as per [this
request](#1088 (comment)).
This PR moves business logic from `lute lint` into a separate `lintCore`
module, providing a clear pathway to re-use logic without duplication
when we add parallelization to `lute lint`.

Changes are entirely non-functional.
Copy link
Copy Markdown
Contributor

@skberkeley skberkeley left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this change feels quite risky to me; can you gate it behind a command line flag?

Comment thread lute/cli/commands/lint/init.luau Outdated
Comment thread lute/cli/commands/lint/types.luau Outdated
Comment thread lute/cli/commands/lint/init.luau Outdated
Comment on lines +100 to +103
rulePath = rulePath,
noDefaults = noDefaults,
configPath = passedConfigVal,
verbose = VERBOSE,
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why are we passing these things around in a table?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

also why does every worker need to do initialize?

Copy link
Copy Markdown
Contributor Author

@wmccrthy wmccrthy May 15, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can't pass functions as args to the separate VM instance, so the best workaround I know of is re-initializing based on the location of relevant files (rules and config). If there's a better alternative, I'm all ears

Passing the args as a table is just a stylistic choice

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i see, it sounds like we should tease apart the parts of initialize which load the rules and the parts that load the config, and do the config loading before spawning the workers then

creating and reading from tables is more costly, and this isn't an option table, so we should pass the arguments around unwrapped

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Re passing as tables:
Sounds good, will change.

Re only initializing rules:
I think we still need to initialize both, since rule configs have arbitrary typing; it's totally possible, for instance, that a custom rule has an option that expects a function.

It looks like there's an issue up regarding support for passing all primitive types through the VM barrier, so we likely won't need initialize in the long-run. I can add a comment calling this out

Comment thread lute/cli/commands/lint/init.luau Outdated
Comment thread lute/cli/commands/lint/init.luau Outdated
Comment on lines +113 to +144
while curr ~= nil do
if hasIgnores then
local inputFilePathString = pathLib.format(curr)

if
parseIgnores.isIgnored(
ignoreData,
inputFilePathString,
fs.type(inputFilePathString) == "dir"
)
then
curr = walker()
continue
end
end

local success, res = pcall(function()
return worker.lintFile({
file = pathLib.format(curr :: pathLib.Path),
autofix = job.autoFixEnabled,
verbose = VERBOSE,
})
end)
if success then
allViolations[curr] = res :: { types.LintViolation }
else
print(`(worker) Error linting file '{curr}': {res}`)
end

curr = walker()
end
curJob = lintNext()
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this feels like a strange way to balance tasks across workers; if we give one worker a folder with many files and subfolders, we'll just end up waiting for that one. can you add a comment explaining that we don't have thread safe data structures so this is kind of the best we can do?

Copy link
Copy Markdown
Contributor Author

@wmccrthy wmccrthy May 15, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yea that's fair; I also explored keeping the inputFilePaths walking sequential, queueing jobs as files were walked, and having workers pull from the queue, but found it was empirically slower. The scenario you call out definitely makes sense in theory but also seems impossible given that getSourceFiles returns exclusively files and no directories. I kept the fs walk logic because that is how lintPaths worked previously, but maybe it's worth dropping 🤷

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd recommend scanning files and then dispatching them in groups afterwards. see what lute test/ the format PR does.

we should probably abstract this worker pool soon so we can share it between test, lint, transform, format, etc. it's all very similar

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi! I'll put up my draft for this sometime this week. When that happens I can refactor this code during my PR :)

@wmccrthy
Copy link
Copy Markdown
Contributor Author

this change feels quite risky to me; can you gate it behind a command line flag?

Yes, my only concern is duplicating test logic as I like the safety of having every test case exercise the parallel codepath

Comment thread lute/cli/commands/lint/init.luau Outdated
Comment thread lute/cli/commands/lint/init.luau Outdated
Comment thread lute/cli/commands/lint/init.luau Outdated
local inputFilePathString = pathLib.format(curr)
if not PARALLEL then
for _, inputPath in inputFiles do
local walker = fs.walk(inputPath, { recursive = true })
Copy link
Copy Markdown
Contributor Author

@wmccrthy wmccrthy May 15, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I removed fs walking in the parallel path since we identified it's redundant given how getSourceFiles works. Kept it in non-parallel to keep changes as minimal as possible, but maybe worth removing too? Will defer to @skberkeley

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah probably worth refactoring in a separate PR?

Comment thread tests/cli/lint.test.luau Outdated
@wmccrthy wmccrthy requested a review from skberkeley May 15, 2026 20:31
Copy link
Copy Markdown
Contributor

@skberkeley skberkeley left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks good, thanks for taking my feedback into consideration! I'm going to ask @~Vighnesh-V to take a look too since he understands the VM/task library the best

Comment thread lute/cli/commands/lint/init.luau Outdated
local inputFilePathString = pathLib.format(curr)
if not PARALLEL then
for _, inputPath in inputFiles do
local walker = fs.walk(inputPath, { recursive = true })
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah probably worth refactoring in a separate PR?

Comment thread lute/cli/commands/lint/init.luau Outdated

curr = walker()
end
end
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if we early return here we don't need to wrap the parallel part in an else block

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm going to be a bit annoying and ask that we extract the code for 'running' lints into its own file. Probably something that exposes runSequential and runParallel as part of its export surface.

@skberkeley
Copy link
Copy Markdown
Contributor

Oh, also maybe worth updating our CI to run lute lint in parallel?

Comment thread docs/cli/lint/index.md
Comment on lines +74 to +76
### `-p, --parallel`

Lint files in parallel instead of sequentially.
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why isn't this just the default? why wouldn't you want this? tests already run them in parallel by default

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree with Nick here. This should be parallel by default. Can you add the inverse (-s, --sequentially) instead, so users can opt in to determinism?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure

@Nicell
Copy link
Copy Markdown
Collaborator

Nicell commented May 19, 2026

this change feels quite risky to me; can you gate it behind a command line flag?

I think a flag means that no one will use it. lute should be fast by default, if we're not confident enough to have it on by default, why are we confident enough to add it behind a CLI flag that people are much less likely to use?

Comment thread docs/cli/lint/index.md
Comment on lines +74 to +76
### `-p, --parallel`

Lint files in parallel instead of sequentially.
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree with Nick here. This should be parallel by default. Can you add the inverse (-s, --sequentially) instead, so users can opt in to determinism?

Comment thread lute/cli/commands/lint/init.luau Outdated

curr = walker()
end
end
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm going to be a bit annoying and ask that we extract the code for 'running' lints into its own file. Probably something that exposes runSequential and runParallel as part of its export surface.

Comment thread lute/cli/commands/lint/init.luau Outdated
Comment on lines +89 to +90
if
parseIgnores.isIgnored(ignoreData, inputFilePathString, fs.type(inputFilePathString) == "dir")
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is this actually what stylua does if the condition is too long?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

seems that way

Comment thread lute/cli/commands/lint/init.luau Outdated
task.spawn(function()
xpcall(function()
local worker = vm.create("@self/worker")
-- Initialize will be UNNECESSARY once we can pass functions to separate VM
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what does this mean?

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

oh, serializing functions across vm bodies. Could I suggest instead that we add support for adding arguments to vm.create calls, and the module can initialize state based on that?

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

e.g. vm.create(.., rulesPath, noDefaults, passedConfigVal, VERBOSE)?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That makes sense. Probably out of the scope of this PR though? Happy to handle in a separate PR and update this accordingly

Comment thread lute/cli/commands/lint/init.luau Outdated
Comment on lines +203 to +204
VERBOSE = args:has("verbose")
PARALLEL = args:has("parallel")
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Capital case is usually used for constants. Can we stick these options in a table and then read them in here please?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As in have a module-scoped table like { verbose: boolean, parallel: boolean, }? If so, why is that better than just changing the casing?

Comment thread lute/cli/commands/lint/init.luau Outdated
passedConfigVal,
autofixEnabled,
ignoreData
) -- LUAUFIX: type refinement on line 305 should mean that cast on inputFiles isn't needed
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isn't this file only 282 lines? Where does this point to now?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good question, from a quick look at commit history it looks like this comment's been bouncing around for a while. Maybe we just remove it

Comment thread tests/cli/lint.test.luau Outdated
fs.removeDirectory(modulePath, { recursive = true })
end)

suite:case("luteLintParallelMultipleInputFiles", function(assert)
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can these be expressed as a snapshot test?

Copy link
Copy Markdown
Contributor Author

@wmccrthy wmccrthy May 19, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we're going to default to parallel, I'll swap these cases for --sequential. We can use snapshots for those

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

On second thought, it doesn't really make sense to have dedicated snapshot tests (or explicit tests) for sequential linting. For the sake of safety I'd prefer if all test cases exercised both sequential and parallel code paths. This may be a bit redundant, but casts the widest net for catching regressions

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

4 participants