Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 21 additions & 13 deletions pkg/xsysinfo/gguf.go
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
package xsysinfo

import (
"errors"

gguf "github.com/gpustack/gguf-parser-go"
)

Expand All @@ -18,35 +16,45 @@
func EstimateGGUFVRAMUsage(f *gguf.GGUFFile, availableVRAM uint64) (*VRAMEstimate, error) {
// Get model metadata
m := f.Metadata()
a := f.Architecture()

estimate := f.EstimateLLaMACppRun()

lmes := estimate.SummarizeItem(true, 0, 0)
estimatedVRAM := uint64(0)
availableLayers := lmes.OffloadLayers // TODO: check if we can just use OffloadLayers here

for _, vram := range lmes.VRAMs {
estimatedVRAM += uint64(vram.NonUMA)
}

// Calculate base model size
modelSize := uint64(m.Size)

if a.BlockCount == 0 {
return nil, errors.New("block count is 0")
if availableLayers == 0 {
availableLayers = 1
}

if estimatedVRAM == 0 {
estimatedVRAM = 1
}

// Estimate number of layers that can fit in VRAM
// Each layer typically requires about 1/32 of the model size
layerSize := modelSize / uint64(a.BlockCount)
estimatedLayers := int(availableVRAM / layerSize)
layerSize := estimatedVRAM / availableLayers

// If we can't fit even one layer, we need to do full offload
isFullOffload := estimatedLayers <= 0
if isFullOffload {
estimatedLayers = 0
estimatedLayers := int(availableVRAM / layerSize)

Check failure

Code scanning / gosec

integer overflow conversion uint64 -> int Error

integer overflow conversion uint64 -> int
if availableVRAM > estimatedVRAM {
estimatedLayers = int(availableLayers)

Check failure

Code scanning / gosec

integer overflow conversion uint64 -> int Error

integer overflow conversion uint64 -> int
}

// Calculate estimated VRAM usage
estimatedVRAM := uint64(estimatedLayers) * layerSize

return &VRAMEstimate{
TotalVRAM: availableVRAM,
AvailableVRAM: availableVRAM,
ModelSize: modelSize,
EstimatedLayers: estimatedLayers,
EstimatedVRAM: estimatedVRAM,
IsFullOffload: isFullOffload,
IsFullOffload: availableVRAM > estimatedVRAM,
}, nil
}
Loading