356 lines
9.4 KiB
Go
356 lines
9.4 KiB
Go
package nemotronh
|
|
|
|
import (
|
|
"errors"
|
|
"image"
|
|
"math"
|
|
"slices"
|
|
|
|
"github.com/ollama/ollama/fs"
|
|
"github.com/ollama/ollama/model/imageproc"
|
|
)
|
|
|
|
// ImageProcessor holds the preprocessing settings used to turn an image into
// normalized channel-major float tiles.
type ImageProcessor struct {
	// imageSize is the edge length, in pixels, of each square tile produced by
	// the tiled path.
	imageSize int

	// patchSize is the edge length, in pixels, of one patch in the dynamic
	// resolution path.
	patchSize int

	// numChannels is the channel count handed to the tile cropper.
	numChannels int

	// maxTiles caps how many grid tiles the tiled path may use.
	maxTiles int

	// minNumPatches and maxNumPatches bound the patch budget of the dynamic
	// resolution path; that path is selected when either one is positive.
	minNumPatches int
	maxNumPatches int

	// useThumbnail adds a whole-image tile after the grid tiles when the grid
	// has more than one tile.
	useThumbnail bool

	// projectorScale is the divisor that dynamic patch grid dimensions are
	// rounded to.
	projectorScale int

	// imageMean and imageStd are the per-channel normalization constants.
	imageMean [3]float32
	imageStd  [3]float32
}
|
|
|
|
// processedVisionTile is one preprocessed image tile.
type processedVisionTile struct {
	// data holds the normalized pixel values, one full channel plane after
	// another.
	data []float32

	// size is the tile's width (X) and height (Y) in pixels.
	size image.Point
}
|
|
|
|
func newImageProcessor(c fs.Config) ImageProcessor {
|
|
mean := c.Floats("vision.image_mean")
|
|
std := c.Floats("vision.image_std")
|
|
|
|
processor := ImageProcessor{
|
|
imageSize: int(c.Uint("vision.image_size", 512)),
|
|
patchSize: int(c.Uint("vision.patch_size", 16)),
|
|
numChannels: int(c.Uint("vision.num_channels", 3)),
|
|
maxTiles: int(c.Uint("vision.max_tiles", 12)),
|
|
minNumPatches: int(c.Uint("vision.min_num_patches")),
|
|
maxNumPatches: int(c.Uint("vision.max_num_patches")),
|
|
useThumbnail: c.Bool("vision.use_thumbnail", true),
|
|
projectorScale: int(c.Uint("vision.projector.scale_factor", 2)),
|
|
imageMean: imageproc.ClipDefaultMean,
|
|
imageStd: imageproc.ClipDefaultSTD,
|
|
}
|
|
|
|
if len(mean) >= 3 {
|
|
processor.imageMean = [3]float32{mean[0], mean[1], mean[2]}
|
|
}
|
|
if len(std) >= 3 {
|
|
processor.imageStd = [3]float32{std[0], std[1], std[2]}
|
|
}
|
|
if processor.imageSize <= 0 {
|
|
processor.imageSize = 512
|
|
}
|
|
if processor.patchSize <= 0 {
|
|
processor.patchSize = 16
|
|
}
|
|
if processor.numChannels <= 0 {
|
|
processor.numChannels = 3
|
|
}
|
|
if processor.maxTiles <= 0 {
|
|
processor.maxTiles = 12
|
|
}
|
|
if processor.projectorScale <= 0 {
|
|
processor.projectorScale = 2
|
|
}
|
|
|
|
return processor
|
|
}
|
|
|
|
func (p ImageProcessor) ProcessImage(img image.Image) ([]processedVisionTile, error) {
|
|
img = imageproc.Composite(img)
|
|
if p.useDynamicResolution() {
|
|
return p.processDynamicImage(img)
|
|
}
|
|
|
|
return p.processTiledImage(img), nil
|
|
}
|
|
|
|
func (p ImageProcessor) useDynamicResolution() bool {
|
|
return p.minNumPatches > 0 || p.maxNumPatches > 0
|
|
}
|
|
|
|
func (p ImageProcessor) processTiledImage(img image.Image) []processedVisionTile {
|
|
bounds := img.Bounds()
|
|
origWidth := bounds.Dx()
|
|
origHeight := bounds.Dy()
|
|
targetRatios := nemotronTargetRatios(p.maxTiles)
|
|
gridWidth, gridHeight := findClosestAspectRatio(float64(origWidth)/float64(origHeight), targetRatios, origWidth, origHeight, p.imageSize)
|
|
|
|
targetWidth := p.imageSize * gridWidth
|
|
targetHeight := p.imageSize * gridHeight
|
|
resized := resizeImageBicubicCHW(img, targetWidth, targetHeight)
|
|
|
|
tiles := make([]processedVisionTile, 0, gridWidth*gridHeight+1)
|
|
for row := range gridHeight {
|
|
for col := range gridWidth {
|
|
tile := cropCHWRegion(
|
|
resized,
|
|
targetWidth,
|
|
targetHeight,
|
|
p.numChannels,
|
|
col*p.imageSize,
|
|
row*p.imageSize,
|
|
p.imageSize,
|
|
p.imageSize,
|
|
)
|
|
tiles = append(tiles, processedVisionTile{
|
|
data: normalizeVisionCHW(tile, p.imageMean, p.imageStd),
|
|
size: image.Point{X: p.imageSize, Y: p.imageSize},
|
|
})
|
|
}
|
|
}
|
|
|
|
if p.useThumbnail && len(tiles) > 1 {
|
|
thumbnail := resizeImageBicubicCHW(img, p.imageSize, p.imageSize)
|
|
tiles = append(tiles, processedVisionTile{
|
|
data: normalizeVisionCHW(thumbnail, p.imageMean, p.imageStd),
|
|
size: image.Point{X: p.imageSize, Y: p.imageSize},
|
|
})
|
|
}
|
|
|
|
return tiles
|
|
}
|
|
|
|
func (p ImageProcessor) processDynamicImage(img image.Image) ([]processedVisionTile, error) {
|
|
bounds := img.Bounds()
|
|
origWidth := bounds.Dx()
|
|
origHeight := bounds.Dy()
|
|
patchesWidth, patchesHeight := p.dynamicPatchGrid(origWidth, origHeight)
|
|
if patchesWidth <= 0 || patchesHeight <= 0 {
|
|
return nil, errors.New("nemotron_h_omni: invalid dynamic image patch grid")
|
|
}
|
|
|
|
targetWidth := patchesWidth * p.patchSize
|
|
targetHeight := patchesHeight * p.patchSize
|
|
resized := resizeImageBicubicCHW(img, targetWidth, targetHeight)
|
|
|
|
return []processedVisionTile{{
|
|
data: normalizeVisionCHW(resized, p.imageMean, p.imageStd),
|
|
size: image.Point{X: targetWidth, Y: targetHeight},
|
|
}}, nil
|
|
}
|
|
|
|
func (p ImageProcessor) dynamicPatchGrid(origWidth, origHeight int) (int, int) {
|
|
patchesHeight := max(1, int(math.Round(float64(origHeight)/float64(p.patchSize)+0.5)))
|
|
patchesWidth := max(1, int(math.Round(float64(origWidth)/float64(p.patchSize)+0.5)))
|
|
|
|
patches := patchesHeight * patchesWidth
|
|
currentNumPatchesAvailable := p.maxNumPatches
|
|
if currentNumPatchesAvailable <= 0 {
|
|
currentNumPatchesAvailable = max(patches, p.minNumPatches)
|
|
}
|
|
|
|
factor := math.Min(math.Sqrt(float64(currentNumPatchesAvailable)/float64(patches)), 1.0)
|
|
targetPatchesHeight := max(1, int(math.Floor(factor*float64(patchesHeight))))
|
|
targetPatchesWidth := max(1, int(math.Floor(factor*float64(patchesWidth))))
|
|
|
|
if currentNumPatchesAvailable > p.minNumPatches && targetPatchesHeight*targetPatchesWidth < p.minNumPatches {
|
|
upFactor := math.Sqrt(float64(p.minNumPatches) / float64(targetPatchesHeight*targetPatchesWidth))
|
|
targetPatchesHeight = int(math.Ceil(upFactor * float64(targetPatchesHeight)))
|
|
targetPatchesWidth = int(math.Ceil(upFactor * float64(targetPatchesWidth)))
|
|
}
|
|
|
|
targetPatchesHeight = roundPatchGridForPixelShuffle(targetPatchesHeight, targetPatchesWidth, currentNumPatchesAvailable, p.projectorScale)
|
|
targetPatchesWidth = roundPatchGridForPixelShuffle(targetPatchesWidth, targetPatchesHeight, currentNumPatchesAvailable, p.projectorScale)
|
|
|
|
return targetPatchesWidth, targetPatchesHeight
|
|
}
|
|
|
|
// roundPatchGridForPixelShuffle rounds the grid dimension v to a multiple of
// divisor.
//
// v is returned unchanged when divisor is at most 1 or v is already a
// multiple. Otherwise v is rounded up when the rounded value times other still
// fits within maxPatches; if it does not fit, v is rounded down, but never
// below divisor.
func roundPatchGridForPixelShuffle(v, other, maxPatches, divisor int) int {
	if divisor <= 1 || v%divisor == 0 {
		return v
	}

	down := v - v%divisor
	up := down + divisor
	if up*other > maxPatches {
		return max(divisor, down)
	}
	return up
}
|
|
|
|
// nemotronImageRatio is a candidate tile grid shape: width tiles across and
// height tiles down.
type nemotronImageRatio struct {
	width, height int
}
|
|
|
|
func nemotronTargetRatios(maxTiles int) []nemotronImageRatio {
|
|
targetRatios := make([]nemotronImageRatio, 0, maxTiles*maxTiles)
|
|
for n := 1; n <= maxTiles; n++ {
|
|
for w := 1; w <= n; w++ {
|
|
for h := 1; h <= n; h++ {
|
|
if w*h > maxTiles {
|
|
continue
|
|
}
|
|
targetRatios = append(targetRatios, nemotronImageRatio{width: w, height: h})
|
|
}
|
|
}
|
|
}
|
|
|
|
unique := targetRatios[:0]
|
|
for _, ratio := range targetRatios {
|
|
if slices.Contains(unique, ratio) {
|
|
continue
|
|
}
|
|
unique = append(unique, ratio)
|
|
}
|
|
|
|
slices.SortFunc(unique, func(a, b nemotronImageRatio) int {
|
|
return a.width*a.height - b.width*b.height
|
|
})
|
|
|
|
return unique
|
|
}
|
|
|
|
func findClosestAspectRatio(aspectRatio float64, targetRatios []nemotronImageRatio, width, height, imageSize int) (int, int) {
|
|
bestRatio := nemotronImageRatio{width: 1, height: 1}
|
|
bestRatioDiff := math.MaxFloat64
|
|
area := width * height
|
|
|
|
for _, ratio := range targetRatios {
|
|
targetAspectRatio := float64(ratio.width) / float64(ratio.height)
|
|
ratioDiff := math.Abs(aspectRatio - targetAspectRatio)
|
|
if ratioDiff < bestRatioDiff {
|
|
bestRatioDiff = ratioDiff
|
|
bestRatio = ratio
|
|
continue
|
|
}
|
|
|
|
if ratioDiff == bestRatioDiff && area > int(0.5*float64(imageSize*imageSize*ratio.width*ratio.height)) {
|
|
bestRatio = ratio
|
|
}
|
|
}
|
|
|
|
return bestRatio.width, bestRatio.height
|
|
}
|
|
|
|
func resizeImageBicubicCHW(img image.Image, outW, outH int) []float32 {
|
|
bounds := img.Bounds()
|
|
inW := bounds.Dx()
|
|
inH := bounds.Dy()
|
|
src := make([]float32, 3*inW*inH)
|
|
|
|
for y := range inH {
|
|
for x := range inW {
|
|
r, g, b, _ := img.At(bounds.Min.X+x, bounds.Min.Y+y).RGBA()
|
|
src[y*inW+x] = float32(r>>8) / 255.0
|
|
src[inW*inH+y*inW+x] = float32(g>>8) / 255.0
|
|
src[2*inW*inH+y*inW+x] = float32(b>>8) / 255.0
|
|
}
|
|
}
|
|
|
|
dst := make([]float32, 3*outW*outH)
|
|
scaleX := float64(inW) / float64(outW)
|
|
scaleY := float64(inH) / float64(outH)
|
|
|
|
for oy := range outH {
|
|
srcY := scaleY*(float64(oy)+0.5) - 0.5
|
|
yBase := int(math.Floor(srcY))
|
|
yFrac := clampUnit(srcY - float64(yBase))
|
|
wy := torchBicubicWeights(yFrac)
|
|
|
|
for ox := range outW {
|
|
srcX := scaleX*(float64(ox)+0.5) - 0.5
|
|
xBase := int(math.Floor(srcX))
|
|
xFrac := clampUnit(srcX - float64(xBase))
|
|
wx := torchBicubicWeights(xFrac)
|
|
|
|
for c := range 3 {
|
|
var sum float64
|
|
channelOffset := c * inW * inH
|
|
for ky := range 4 {
|
|
iy := clampIndex(yBase-1+ky, 0, inH-1)
|
|
for kx := range 4 {
|
|
ix := clampIndex(xBase-1+kx, 0, inW-1)
|
|
sum += float64(src[channelOffset+iy*inW+ix]) * wy[ky] * wx[kx]
|
|
}
|
|
}
|
|
dst[c*outW*outH+oy*outW+ox] = float32(sum)
|
|
}
|
|
}
|
|
}
|
|
|
|
return dst
|
|
}
|
|
|
|
// cropCHWRegion copies a cropW x cropH window whose top-left corner is
// (left, top) out of a channel-major buffer.
//
// values holds channels planes of width*height values each, in row-major
// order. The result has the same plane layout with cropW*cropH values per
// channel. The window is not bounds-checked against width and height.
func cropCHWRegion(values []float32, width, height, channels, left, top, cropW, cropH int) []float32 {
	planeIn, planeOut := width*height, cropW*cropH
	out := make([]float32, channels*planeOut)

	// dst walks the output sequentially: rows within a channel, then channels.
	dst := 0
	for c := 0; c < channels; c++ {
		for y := 0; y < cropH; y++ {
			src := c*planeIn + (top+y)*width + left
			copy(out[dst:dst+cropW], values[src:src+cropW])
			dst += cropW
		}
	}
	return out
}
|
|
|
|
// normalizeVisionCHW returns a copy of values with (v - mean[c]) / std[c]
// applied to every value of channel c.
//
// values is treated as three consecutive planes of len(values)/3 values each;
// any leftover values past the third plane are left as zero in the result.
func normalizeVisionCHW(values []float32, mean, std [3]float32) []float32 {
	normalized := make([]float32, len(values))
	plane := len(values) / 3

	for c, m := range mean {
		s := std[c]
		in := values[c*plane : (c+1)*plane]
		out := normalized[c*plane : (c+1)*plane]
		for i, v := range in {
			out[i] = (v - m) / s
		}
	}
	return normalized
}
|
|
|
|
func torchBicubicWeights(t float64) [4]float64 {
|
|
const a = -0.75
|
|
return [4]float64{
|
|
bicubicConvolution2(t+1.0, a),
|
|
bicubicConvolution1(t, a),
|
|
bicubicConvolution1(1.0-t, a),
|
|
bicubicConvolution2(2.0-t, a),
|
|
}
|
|
}
|
|
|
|
// bicubicConvolution1 evaluates (a+2)x^3 - (a+3)x^2 + 1 in nested form. It is
// the cubic kernel segment applied to the two inner neighbours of a sample.
func bicubicConvolution1(x, a float64) float64 {
	inner := (a+2)*x - (a + 3)
	return inner*x*x + 1
}
|
|
|
|
// bicubicConvolution2 evaluates a*x^3 - 5a*x^2 + 8a*x - 4a in nested form. It
// is the cubic kernel segment applied to the two outer neighbours of a sample.
func bicubicConvolution2(x, a float64) float64 {
	acc := a*x - 5*a
	acc = acc*x + 8*a
	return acc*x - 4*a
}
|
|
|
|
// clampUnit limits v to the closed interval [0, 1]. Values that compare as
// neither below 0 nor above 1 are returned unchanged.
func clampUnit(v float64) float64 {
	switch {
	case v < 0:
		return 0
	case v > 1:
		return 1
	default:
		return v
	}
}
|
|
|
|
// clampIndex limits v to the range [lo, hi]. The lower bound is checked first,
// so when hi < lo a value below lo yields lo and any other value yields hi.
func clampIndex(v, lo, hi int) int {
	switch {
	case v < lo:
		return lo
	case v > hi:
		return hi
	default:
		return v
	}
}
|