104 lines
3.2 KiB
Go
104 lines
3.2 KiB
Go
package gemma4
|
|
|
|
import (
|
|
"image"
|
|
"math"
|
|
|
|
"golang.org/x/image/draw"
|
|
|
|
"github.com/ollama/ollama/fs"
|
|
)
|
|
|
|
// ImageProcessor holds the geometry and pixel-budget settings used to turn an
// image into a normalized float32 tensor for the vision encoder.
type ImageProcessor struct {
	// patchSize is the patch edge length in pixels; nMerge is the projector
	// scale factor. Their product is the alignment that ProcessImage passes
	// to smartResize.
	patchSize, nMerge int

	// numChannels is the configured channel count. It is stored by
	// newImageProcessor but not read by the methods in this file.
	numChannels int

	// minPixels and maxPixels are pixel budgets that newImageProcessor
	// derives from its token limits. Only maxPixels is read by smartResize.
	minPixels, maxPixels int
}
|
|
|
|
func newImageProcessor(c fs.Config) ImageProcessor {
|
|
patchSize := int(c.Uint("vision.patch_size", 16))
|
|
nMerge := int(c.Uint("vision.projector.scale_factor", 3))
|
|
numChannels := int(c.Uint("vision.num_channels", 3))
|
|
|
|
// Token limits from reference: min=40, max=280 output tokens after pooling.
|
|
// Convert to pixel counts: tokens * nMerge^2 * patchSize^2
|
|
minTokens := 40
|
|
maxTokens := 280
|
|
patchArea := patchSize * patchSize * nMerge * nMerge
|
|
minPixels := minTokens * patchArea
|
|
maxPixels := maxTokens * patchArea
|
|
|
|
return ImageProcessor{
|
|
patchSize: patchSize,
|
|
numChannels: numChannels,
|
|
nMerge: nMerge,
|
|
minPixels: minPixels,
|
|
maxPixels: maxPixels,
|
|
}
|
|
}
|
|
|
|
// ProcessImage resizes an image preserving aspect ratio, aligning dimensions
|
|
// to (patchSize * nMerge) boundaries, and normalizes pixels to [-1, 1].
|
|
// Returns the float32 pixel data and the actual output dimensions.
|
|
func (p *ImageProcessor) ProcessImage(img image.Image) ([]float32, int, int, error) {
|
|
// Compute target size preserving aspect ratio
|
|
alignSize := p.patchSize * p.nMerge
|
|
targetW, targetH := p.smartResize(img.Bounds().Dx(), img.Bounds().Dy(), alignSize)
|
|
|
|
// Resize directly without alpha compositing, matching MLX reference.
|
|
dst := image.NewRGBA(image.Rect(0, 0, targetW, targetH))
|
|
draw.BiLinear.Scale(dst, dst.Bounds(), img, img.Bounds(), draw.Over, nil)
|
|
|
|
// Normalize to [-1, 1] using mean=0.5, std=0.5: (pixel/255 - 0.5) / 0.5 = 2*pixel/255 - 1
|
|
data := p.pack(dst)
|
|
return data, targetW, targetH, nil
|
|
}
|
|
|
|
// smartResize computes target dimensions that preserve aspect ratio and
|
|
// align to alignSize boundaries. It scales the image to fill the maximum
|
|
// patch budget (maxPixels), matching the MLX reference.
|
|
func (p *ImageProcessor) smartResize(origW, origH, alignSize int) (int, int) {
|
|
totalPx := origW * origH
|
|
|
|
var targetW, targetH int
|
|
if p.maxPixels > 0 && totalPx > 0 {
|
|
factor := math.Sqrt(float64(p.maxPixels) / float64(totalPx))
|
|
targetH = max(alignSize, int(math.Floor(factor*float64(origH)/float64(alignSize)))*alignSize)
|
|
targetW = max(alignSize, int(math.Floor(factor*float64(origW)/float64(alignSize)))*alignSize)
|
|
} else {
|
|
targetH = max(alignSize, (origH/alignSize)*alignSize)
|
|
targetW = max(alignSize, (origW/alignSize)*alignSize)
|
|
}
|
|
|
|
return targetW, targetH
|
|
}
|
|
|
|
// pack extracts RGB values from an image and normalizes to [-1, 1].
|
|
// Returns channel-first layout: [R..., G..., B...].
|
|
func (p *ImageProcessor) pack(img image.Image) []float32 {
|
|
bounds := img.Bounds()
|
|
w := bounds.Dx()
|
|
h := bounds.Dy()
|
|
size := w * h
|
|
|
|
pixelVals := make([]float32, 3*size)
|
|
rOff, gOff, bOff := 0, size, 2*size
|
|
|
|
for y := bounds.Min.Y; y < bounds.Max.Y; y++ {
|
|
for x := bounds.Min.X; x < bounds.Max.X; x++ {
|
|
c := img.At(x, y)
|
|
r, g, b, _ := c.RGBA()
|
|
idx := (y-bounds.Min.Y)*w + (x - bounds.Min.X)
|
|
|
|
// Normalize [0, 255] -> [-1, 1]: 2 * (val/255) - 1
|
|
pixelVals[rOff+idx] = float32(r>>8)/255.0*2.0 - 1.0
|
|
pixelVals[gOff+idx] = float32(g>>8)/255.0*2.0 - 1.0
|
|
pixelVals[bOff+idx] = float32(b>>8)/255.0*2.0 - 1.0
|
|
}
|
|
}
|
|
|
|
return pixelVals
|
|
}
|