package nemotronh import ( "errors" "image" "math" "slices" "github.com/ollama/ollama/fs" "github.com/ollama/ollama/model/imageproc" ) type ImageProcessor struct { imageSize int patchSize int numChannels int maxTiles int minNumPatches int maxNumPatches int useThumbnail bool projectorScale int imageMean [3]float32 imageStd [3]float32 } type processedVisionTile struct { data []float32 size image.Point } func newImageProcessor(c fs.Config) ImageProcessor { mean := c.Floats("vision.image_mean") std := c.Floats("vision.image_std") processor := ImageProcessor{ imageSize: int(c.Uint("vision.image_size", 512)), patchSize: int(c.Uint("vision.patch_size", 16)), numChannels: int(c.Uint("vision.num_channels", 3)), maxTiles: int(c.Uint("vision.max_tiles", 12)), minNumPatches: int(c.Uint("vision.min_num_patches")), maxNumPatches: int(c.Uint("vision.max_num_patches")), useThumbnail: c.Bool("vision.use_thumbnail", true), projectorScale: int(c.Uint("vision.projector.scale_factor", 2)), imageMean: imageproc.ClipDefaultMean, imageStd: imageproc.ClipDefaultSTD, } if len(mean) >= 3 { processor.imageMean = [3]float32{mean[0], mean[1], mean[2]} } if len(std) >= 3 { processor.imageStd = [3]float32{std[0], std[1], std[2]} } if processor.imageSize <= 0 { processor.imageSize = 512 } if processor.patchSize <= 0 { processor.patchSize = 16 } if processor.numChannels <= 0 { processor.numChannels = 3 } if processor.maxTiles <= 0 { processor.maxTiles = 12 } if processor.projectorScale <= 0 { processor.projectorScale = 2 } return processor } func (p ImageProcessor) ProcessImage(img image.Image) ([]processedVisionTile, error) { img = imageproc.Composite(img) if p.useDynamicResolution() { return p.processDynamicImage(img) } return p.processTiledImage(img), nil } func (p ImageProcessor) useDynamicResolution() bool { return p.minNumPatches > 0 || p.maxNumPatches > 0 } func (p ImageProcessor) processTiledImage(img image.Image) []processedVisionTile { bounds := img.Bounds() origWidth := bounds.Dx() origHeight := bounds.Dy() targetRatios := nemotronTargetRatios(p.maxTiles) gridWidth, gridHeight := findClosestAspectRatio(float64(origWidth)/float64(origHeight), targetRatios, origWidth, origHeight, p.imageSize) targetWidth := p.imageSize * gridWidth targetHeight := p.imageSize * gridHeight resized := resizeImageBicubicCHW(img, targetWidth, targetHeight) tiles := make([]processedVisionTile, 0, gridWidth*gridHeight+1) for row := range gridHeight { for col := range gridWidth { tile := cropCHWRegion( resized, targetWidth, targetHeight, p.numChannels, col*p.imageSize, row*p.imageSize, p.imageSize, p.imageSize, ) tiles = append(tiles, processedVisionTile{ data: normalizeVisionCHW(tile, p.imageMean, p.imageStd), size: image.Point{X: p.imageSize, Y: p.imageSize}, }) } } if p.useThumbnail && len(tiles) > 1 { thumbnail := resizeImageBicubicCHW(img, p.imageSize, p.imageSize) tiles = append(tiles, processedVisionTile{ data: normalizeVisionCHW(thumbnail, p.imageMean, p.imageStd), size: image.Point{X: p.imageSize, Y: p.imageSize}, }) } return tiles } func (p ImageProcessor) processDynamicImage(img image.Image) ([]processedVisionTile, error) { bounds := img.Bounds() origWidth := bounds.Dx() origHeight := bounds.Dy() patchesWidth, patchesHeight := p.dynamicPatchGrid(origWidth, origHeight) if patchesWidth <= 0 || patchesHeight <= 0 { return nil, errors.New("nemotron_h_omni: invalid dynamic image patch grid") } targetWidth := patchesWidth * p.patchSize targetHeight := patchesHeight * p.patchSize resized := resizeImageBicubicCHW(img, targetWidth, targetHeight) return []processedVisionTile{{ data: normalizeVisionCHW(resized, p.imageMean, p.imageStd), size: image.Point{X: targetWidth, Y: targetHeight}, }}, nil } func (p ImageProcessor) dynamicPatchGrid(origWidth, origHeight int) (int, int) { patchesHeight := max(1, int(math.Round(float64(origHeight)/float64(p.patchSize)+0.5))) patchesWidth := max(1, int(math.Round(float64(origWidth)/float64(p.patchSize)+0.5))) patches := patchesHeight * patchesWidth currentNumPatchesAvailable := p.maxNumPatches if currentNumPatchesAvailable <= 0 { currentNumPatchesAvailable = max(patches, p.minNumPatches) } factor := math.Min(math.Sqrt(float64(currentNumPatchesAvailable)/float64(patches)), 1.0) targetPatchesHeight := max(1, int(math.Floor(factor*float64(patchesHeight)))) targetPatchesWidth := max(1, int(math.Floor(factor*float64(patchesWidth)))) if currentNumPatchesAvailable > p.minNumPatches && targetPatchesHeight*targetPatchesWidth < p.minNumPatches { upFactor := math.Sqrt(float64(p.minNumPatches) / float64(targetPatchesHeight*targetPatchesWidth)) targetPatchesHeight = int(math.Ceil(upFactor * float64(targetPatchesHeight))) targetPatchesWidth = int(math.Ceil(upFactor * float64(targetPatchesWidth))) } targetPatchesHeight = roundPatchGridForPixelShuffle(targetPatchesHeight, targetPatchesWidth, currentNumPatchesAvailable, p.projectorScale) targetPatchesWidth = roundPatchGridForPixelShuffle(targetPatchesWidth, targetPatchesHeight, currentNumPatchesAvailable, p.projectorScale) return targetPatchesWidth, targetPatchesHeight } func roundPatchGridForPixelShuffle(v, other, maxPatches, divisor int) int { if divisor <= 1 { return v } rem := v % divisor if rem == 0 { return v } inc := divisor - rem if (v+inc)*other <= maxPatches { return v + inc } return max(divisor, v-rem) } type nemotronImageRatio struct { width int height int } func nemotronTargetRatios(maxTiles int) []nemotronImageRatio { targetRatios := make([]nemotronImageRatio, 0, maxTiles*maxTiles) for n := 1; n <= maxTiles; n++ { for w := 1; w <= n; w++ { for h := 1; h <= n; h++ { if w*h > maxTiles { continue } targetRatios = append(targetRatios, nemotronImageRatio{width: w, height: h}) } } } unique := targetRatios[:0] for _, ratio := range targetRatios { if slices.Contains(unique, ratio) { continue } unique = append(unique, ratio) } slices.SortFunc(unique, func(a, b nemotronImageRatio) int { return a.width*a.height - b.width*b.height }) return unique } func findClosestAspectRatio(aspectRatio float64, targetRatios []nemotronImageRatio, width, height, imageSize int) (int, int) { bestRatio := nemotronImageRatio{width: 1, height: 1} bestRatioDiff := math.MaxFloat64 area := width * height for _, ratio := range targetRatios { targetAspectRatio := float64(ratio.width) / float64(ratio.height) ratioDiff := math.Abs(aspectRatio - targetAspectRatio) if ratioDiff < bestRatioDiff { bestRatioDiff = ratioDiff bestRatio = ratio continue } if ratioDiff == bestRatioDiff && area > int(0.5*float64(imageSize*imageSize*ratio.width*ratio.height)) { bestRatio = ratio } } return bestRatio.width, bestRatio.height } func resizeImageBicubicCHW(img image.Image, outW, outH int) []float32 { bounds := img.Bounds() inW := bounds.Dx() inH := bounds.Dy() src := make([]float32, 3*inW*inH) for y := range inH { for x := range inW { r, g, b, _ := img.At(bounds.Min.X+x, bounds.Min.Y+y).RGBA() src[y*inW+x] = float32(r>>8) / 255.0 src[inW*inH+y*inW+x] = float32(g>>8) / 255.0 src[2*inW*inH+y*inW+x] = float32(b>>8) / 255.0 } } dst := make([]float32, 3*outW*outH) scaleX := float64(inW) / float64(outW) scaleY := float64(inH) / float64(outH) for oy := range outH { srcY := scaleY*(float64(oy)+0.5) - 0.5 yBase := int(math.Floor(srcY)) yFrac := clampUnit(srcY - float64(yBase)) wy := torchBicubicWeights(yFrac) for ox := range outW { srcX := scaleX*(float64(ox)+0.5) - 0.5 xBase := int(math.Floor(srcX)) xFrac := clampUnit(srcX - float64(xBase)) wx := torchBicubicWeights(xFrac) for c := range 3 { var sum float64 channelOffset := c * inW * inH for ky := range 4 { iy := clampIndex(yBase-1+ky, 0, inH-1) for kx := range 4 { ix := clampIndex(xBase-1+kx, 0, inW-1) sum += float64(src[channelOffset+iy*inW+ix]) * wy[ky] * wx[kx] } } dst[c*outW*outH+oy*outW+ox] = float32(sum) } } } return dst } func cropCHWRegion(values []float32, width, height, channels, left, top, cropW, cropH int) []float32 { out := make([]float32, channels*cropW*cropH) channelSize := width * height cropSize := cropW * cropH for c := range channels { srcBase := c * channelSize dstBase := c * cropSize for y := range cropH { copy(out[dstBase+y*cropW:dstBase+(y+1)*cropW], values[srcBase+(top+y)*width+left:srcBase+(top+y)*width+left+cropW]) } } return out } func normalizeVisionCHW(values []float32, mean, std [3]float32) []float32 { out := make([]float32, len(values)) channelSize := len(values) / 3 for c := range 3 { base := c * channelSize for i := range channelSize { out[base+i] = (values[base+i] - mean[c]) / std[c] } } return out } func torchBicubicWeights(t float64) [4]float64 { const a = -0.75 return [4]float64{ bicubicConvolution2(t+1.0, a), bicubicConvolution1(t, a), bicubicConvolution1(1.0-t, a), bicubicConvolution2(2.0-t, a), } } func bicubicConvolution1(x, a float64) float64 { return ((a+2)*x-(a+3))*x*x + 1 } func bicubicConvolution2(x, a float64) float64 { return ((a*x-5*a)*x+8*a)*x - 4*a } func clampUnit(v float64) float64 { if v < 0 { return 0 } if v > 1 { return 1 } return v } func clampIndex(v, lo, hi int) int { if v < lo { return lo } if v > hi { return hi } return v }