ollama source for Momentry Core verification
This commit is contained in:
174
model/models/glmocr/imageprocessor.go
Normal file
174
model/models/glmocr/imageprocessor.go
Normal file
@@ -0,0 +1,174 @@
|
||||
package glmocr
|
||||
|
||||
import (
|
||||
"image"
|
||||
"log/slog"
|
||||
"math"
|
||||
|
||||
"github.com/ollama/ollama/fs"
|
||||
"github.com/ollama/ollama/model/imageproc"
|
||||
)
|
||||
|
||||
type ImageProcessor struct {
|
||||
imageSize int
|
||||
patchSize int
|
||||
temporalPatchSize int
|
||||
spatialMergeSize int
|
||||
minPixels int
|
||||
maxPixels int
|
||||
factor int
|
||||
imageMean [3]float32
|
||||
imageStd [3]float32
|
||||
}
|
||||
|
||||
func newImageProcessor(c fs.Config) ImageProcessor {
|
||||
patchSize := int(c.Uint("vision.patch_size", 14))
|
||||
spatialMergeSize := int(c.Uint("vision.spatial_merge_size", 2))
|
||||
temporalPatchSize := int(c.Uint("vision.temporal_patch_size", 2))
|
||||
|
||||
// Read normalization values from config if available, otherwise use CLIP defaults
|
||||
imageMean := c.Floats("vision.image_mean", imageproc.ClipDefaultMean[:])
|
||||
imageStd := c.Floats("vision.image_std", imageproc.ClipDefaultSTD[:])
|
||||
|
||||
// Default max_pixels: 2048 * patchSize^2 * mergeSize^2 * temporal = ~3.2M pixels
|
||||
// This limits to ~16k patches (4k output tokens) to keep memory stable without flash attention
|
||||
defaultMaxPixels := 2048 * patchSize * patchSize * spatialMergeSize * spatialMergeSize * temporalPatchSize
|
||||
|
||||
return ImageProcessor{
|
||||
imageSize: int(c.Uint("vision.image_size", 336)),
|
||||
patchSize: patchSize,
|
||||
temporalPatchSize: temporalPatchSize,
|
||||
spatialMergeSize: spatialMergeSize,
|
||||
minPixels: int(c.Uint("vision.min_pixels", uint32(8*patchSize*patchSize*spatialMergeSize*spatialMergeSize*temporalPatchSize))),
|
||||
maxPixels: int(c.Uint("vision.max_pixels", uint32(defaultMaxPixels))),
|
||||
factor: patchSize * spatialMergeSize,
|
||||
imageMean: [3]float32{imageMean[0], imageMean[1], imageMean[2]},
|
||||
imageStd: [3]float32{imageStd[0], imageStd[1], imageStd[2]},
|
||||
}
|
||||
}
|
||||
|
||||
func (p *ImageProcessor) SmartResize(height, width int) (int, int) {
|
||||
factor := p.factor
|
||||
temporalFactor := p.temporalPatchSize
|
||||
numFrames := temporalFactor // single image
|
||||
|
||||
if height < factor || width < factor {
|
||||
// Scale up small images
|
||||
scale := float64(factor) / float64(min(height, width))
|
||||
height = int(math.Ceil(float64(height) * scale))
|
||||
width = int(math.Ceil(float64(width) * scale))
|
||||
}
|
||||
|
||||
if temporalFactor <= 0 {
|
||||
slog.Warn("temporal_patch_size must be > 0, defaulting to 1")
|
||||
temporalFactor = 1
|
||||
}
|
||||
if numFrames < temporalFactor {
|
||||
slog.Warn("num_frames must be >= temporal_patch_size, adjusting num_frames", "num_frames", numFrames, "temporal_patch_size", temporalFactor)
|
||||
numFrames = temporalFactor
|
||||
}
|
||||
if aspectRatio := float64(max(height, width)) / float64(min(height, width)); aspectRatio > 200 {
|
||||
slog.Warn("aspect ratio exceeds 200, image quality may be affected", "aspect_ratio", aspectRatio)
|
||||
}
|
||||
|
||||
round := func(x float64) int { return int(math.RoundToEven(x)) }
|
||||
|
||||
hBar := round(float64(height)/float64(factor)) * factor
|
||||
wBar := round(float64(width)/float64(factor)) * factor
|
||||
tBar := round(float64(numFrames)/float64(temporalFactor)) * temporalFactor
|
||||
|
||||
if tBar*hBar*wBar > p.maxPixels {
|
||||
beta := math.Sqrt(float64(numFrames*height*width) / float64(p.maxPixels))
|
||||
hBar = int(math.Floor(float64(height)/beta/float64(factor))) * factor
|
||||
wBar = int(math.Floor(float64(width)/beta/float64(factor))) * factor
|
||||
} else if tBar*hBar*wBar < p.minPixels {
|
||||
beta := math.Sqrt(float64(p.minPixels) / float64(numFrames*height*width))
|
||||
hBar = int(math.Ceil(float64(height)*beta/float64(factor))) * factor
|
||||
wBar = int(math.Ceil(float64(width)*beta/float64(factor))) * factor
|
||||
}
|
||||
|
||||
return hBar, wBar
|
||||
}
|
||||
|
||||
func (p *ImageProcessor) ProcessImage(img image.Image) ([]float32, *Grid, error) {
|
||||
img = imageproc.Composite(img)
|
||||
|
||||
origWidth := img.Bounds().Dx()
|
||||
origHeight := img.Bounds().Dy()
|
||||
|
||||
// Calculate smart resize dimensions
|
||||
resizedHeight, resizedWidth := p.SmartResize(origHeight, origWidth)
|
||||
|
||||
// Resize image
|
||||
resizedImg := imageproc.Resize(img, image.Point{X: resizedWidth, Y: resizedHeight}, imageproc.ResizeCatmullrom)
|
||||
|
||||
// Normalize pixels - output format is [C, H, W] with rescale and channelFirst
|
||||
// We keep [C, H, W] for patch extraction
|
||||
normalizedPixels := imageproc.Normalize(resizedImg, p.imageMean, p.imageStd, true, true)
|
||||
|
||||
// Calculate grid dimensions (after Conv2D patching)
|
||||
grid := &Grid{
|
||||
Height: resizedHeight / p.patchSize,
|
||||
Width: resizedWidth / p.patchSize,
|
||||
Temporal: 1, // Single image
|
||||
ImageHeight: resizedHeight,
|
||||
ImageWidth: resizedWidth,
|
||||
}
|
||||
|
||||
patches, err := p.createPatches(normalizedPixels, resizedHeight, resizedWidth, grid)
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
|
||||
return patches, grid, nil
|
||||
}
|
||||
|
||||
func (p *ImageProcessor) createPatches(pixels []float32, height, width int, grid *Grid) ([]float32, error) {
|
||||
channels := 3
|
||||
patchSize := p.patchSize
|
||||
mergeSize := p.spatialMergeSize
|
||||
temporalPatchSize := p.temporalPatchSize
|
||||
|
||||
numPatches := grid.Temporal * grid.Height * grid.Width
|
||||
patchDim := channels * temporalPatchSize * patchSize * patchSize
|
||||
result := make([]float32, numPatches*patchDim)
|
||||
patchIndex := 0
|
||||
|
||||
// Single temporal frame handling (copies to all frames)
|
||||
for range grid.Temporal {
|
||||
for h := 0; h < grid.Height; h += mergeSize {
|
||||
for w := 0; w < grid.Width; w += mergeSize {
|
||||
for mh := range mergeSize {
|
||||
for mw := range mergeSize {
|
||||
baseOffset := patchIndex * patchDim
|
||||
for c := range channels {
|
||||
channelOffset := baseOffset + (c * temporalPatchSize * patchSize * patchSize)
|
||||
for py := range patchSize {
|
||||
for px := range patchSize {
|
||||
y := (h+mh)*patchSize + py
|
||||
x := (w+mw)*patchSize + px
|
||||
srcIdx := c*height*width + y*width + x
|
||||
dstIdx := channelOffset + (py * patchSize) + px
|
||||
result[dstIdx] = pixels[srcIdx]
|
||||
}
|
||||
}
|
||||
|
||||
if temporalPatchSize > 1 {
|
||||
frameSize := patchSize * patchSize
|
||||
for tp := 1; tp < temporalPatchSize; tp++ {
|
||||
currentFrameOffset := channelOffset + (tp * frameSize)
|
||||
copy(result[currentFrameOffset:currentFrameOffset+frameSize],
|
||||
result[channelOffset:channelOffset+frameSize])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
patchIndex++
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return result, nil
|
||||
}
|
||||
Reference in New Issue
Block a user