optimize the perf and support more features

2026-03-14 11:45:35 +08:00
parent 7e7ebacd9d
commit 00cfac3d24
56 changed files with 6340 additions and 1019 deletions
--- a/pkg/util/numa/numa.go
+++ b/pkg/util/numa/numa.go
@@ -0,0 +1,255 @@
+/*
+Copyright 2024 The GoStor Authors All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// Package numa provides NUMA-aware utilities for multi-socket systems.
+// This package enables memory allocation optimization and thread binding
+// for better performance on NUMA architectures.
+package numa
+
+import (
+	"fmt"
+	"runtime"
+	"sync"
+)
+
+// NodeID represents a NUMA node identifier
+type NodeID int
+
+// NodeInfo contains information about a NUMA node
+type NodeInfo struct {
+	ID             NodeID
+	CPUs           []int    // CPU cores on this node
+	TotalMemory    uint64   // Total memory in bytes
+	FreeMemory     uint64   // Free memory in bytes
+	DistanceToNode []uint32 // Distance to other nodes (lower is closer)
+}
+
+// Topology represents the NUMA topology of the system
+type Topology struct {
+	Nodes        map[NodeID]*NodeInfo
+	NumNodes     int
+	CPUToNodeMap map[int]NodeID
+	mu           sync.RWMutex
+}
+
+var (
+	globalTopology     *Topology
+	globalTopologyOnce sync.Once
+	numaAvailable      bool
+)
+
+// Available returns true if NUMA support is available on this system
+func Available() bool {
+	return numaAvailable
+}
+
+// GetTopology returns the NUMA topology of the system
+func GetTopology() *Topology {
+	globalTopologyOnce.Do(func() {
+		globalTopology = detectTopology()
+	})
+	return globalTopology
+}
+
+// detectTopology detects the NUMA topology of the system
+// This is a placeholder that will be implemented per-platform
+func detectTopology() *Topology {
+	topology := &Topology{
+		Nodes:        make(map[NodeID]*NodeInfo),
+		CPUToNodeMap: make(map[int]NodeID),
+	}
+
+	// Try to detect using platform-specific methods
+	if err := detectLinuxTopology(topology); err != nil {
+		// Fall back to single-node topology
+		topology.NumNodes = 1
+		topology.Nodes[0] = &NodeInfo{
+			ID:          0,
+			CPUs:        makeRange(0, runtime.NumCPU()),
+			TotalMemory: 0, // Unknown
+			FreeMemory:  0, // Unknown
+		}
+		for i := 0; i < runtime.NumCPU(); i++ {
+			topology.CPUToNodeMap[i] = 0
+		}
+		numaAvailable = false
+	} else {
+		numaAvailable = topology.NumNodes > 1
+	}
+
+	return topology
+}
+
+// makeRange creates a slice of integers from start to end
+func makeRange(start, end int) []int {
+	result := make([]int, end-start)
+	for i := range result {
+		result[i] = start + i
+	}
+	return result
+}
+
+// GetNodeForCPU returns the NUMA node ID for a given CPU
+func (t *Topology) GetNodeForCPU(cpu int) (NodeID, bool) {
+	t.mu.RLock()
+	defer t.mu.RUnlock()
+	node, ok := t.CPUToNodeMap[cpu]
+	return node, ok
+}
+
+// GetNode returns information about a specific NUMA node
+func (t *Topology) GetNode(id NodeID) (*NodeInfo, bool) {
+	t.mu.RLock()
+	defer t.mu.RUnlock()
+	node, ok := t.Nodes[id]
+	return node, ok
+}
+
+// GetCurrentNode returns the NUMA node of the current thread
+func GetCurrentNode() (NodeID, error) {
+	return getCurrentNodeImpl()
+}
+
+// PreferredNode represents a preferred NUMA node for memory allocation
+type PreferredNode struct {
+	nodeID NodeID
+}
+
+// SetPreferredNode sets the preferred NUMA node for the current thread
+func SetPreferredNode(node NodeID) (*PreferredNode, error) {
+	return setPreferredNodeImpl(node)
+}
+
+// Revert restores the previous NUMA policy
+func (p *PreferredNode) Revert() error {
+	return revertPreferredNodeImpl(p)
+}
+
+// MemoryPolicy represents memory allocation policies
+type MemoryPolicy int
+
+const (
+	// MPDefault uses the default memory policy
+	MPDefault MemoryPolicy = iota
+	// MPBind binds memory allocation to specific nodes
+	MPBind
+	// MPPreferred prefers memory allocation from specific nodes
+	MPPreferred
+	// MPInterleave interleaves memory across nodes
+	MPInterleave
+)
+
+// SetMemoryPolicy sets the memory policy for the current thread
+func SetMemoryPolicy(policy MemoryPolicy, nodes []NodeID) error {
+	return setMemoryPolicyImpl(policy, nodes)
+}
+
+// AllocateOnNode allocates memory on a specific NUMA node
+func AllocateOnNode(size int, node NodeID) ([]byte, error) {
+	return allocateOnNodeImpl(size, node)
+}
+
+// LocalAlloc allocates memory on the local NUMA node
+func LocalAlloc(size int) ([]byte, error) {
+	node, err := GetCurrentNode()
+	if err != nil {
+		// Fall back to regular allocation
+		return make([]byte, size), nil
+	}
+	return AllocateOnNode(size, node)
+}
+
+// NodeLocalPool is a memory pool that allocates from a specific NUMA node
+type NodeLocalPool struct {
+	nodeID NodeID
+	pool   sync.Pool
+	size   int
+}
+
+// NewNodeLocalPool creates a new NUMA-local memory pool
+func NewNodeLocalPool(size int, node NodeID) *NodeLocalPool {
+	return &NodeLocalPool{
+		nodeID: node,
+		size:   size,
+		pool: sync.Pool{
+			New: func() interface{} {
+				buf, err := AllocateOnNode(size, node)
+				if err != nil {
+					// Fall back to regular allocation
+					return make([]byte, size)
+				}
+				return buf
+			},
+		},
+	}
+}
+
+// Get returns a buffer from the pool
+func (p *NodeLocalPool) Get() []byte {
+	return p.pool.Get().([]byte)
+}
+
+// Put returns a buffer to the pool
+func (p *NodeLocalPool) Put(buf []byte) {
+	if buf != nil && len(buf) >= p.size {
+		p.pool.Put(buf[:p.size])
+	}
+}
+
+// Close releases all resources associated with the pool
+func (p *NodeLocalPool) Close() error {
+	// In Go, sync.Pool doesn't have a Close method
+	// The memory will be garbage collected eventually
+	return nil
+}
+
+// NodeScheduler schedules tasks on specific NUMA nodes
+type NodeScheduler struct {
+	topology *Topology
+	mu       sync.RWMutex
+}
+
+// NewNodeScheduler creates a new NUMA-aware scheduler
+func NewNodeScheduler() *NodeScheduler {
+	return &NodeScheduler{
+		topology: GetTopology(),
+	}
+}
+
+// ScheduleOnNode schedules a function to run on a specific NUMA node
+func (s *NodeScheduler) ScheduleOnNode(node NodeID, fn func()) error {
+	nodeInfo, ok := s.topology.GetNode(node)
+	if !ok {
+		return fmt.Errorf("NUMA node %d not found", node)
+	}
+
+	if len(nodeInfo.CPUs) == 0 {
+		return fmt.Errorf("NUMA node %d has no CPUs", node)
+	}
+
+	return scheduleOnNodeImpl(nodeInfo.CPUs[0], fn)
+}
+
+// GetPreferredNodeForCurrentThread returns the preferred NUMA node
+// based on current thread's affinity
+func GetPreferredNodeForCurrentThread() NodeID {
+	return getPreferredNodeForCurrentThreadImpl()
+}
+
+// NumNodes returns the number of NUMA nodes in the system
+func NumNodes() int {
+	return GetTopology().NumNodes
+}
--- a/pkg/util/numa/numa_linux.go
+++ b/pkg/util/numa/numa_linux.go
@@ -0,0 +1,469 @@
+//go:build linux
+// +build linux
+
+/*
+Copyright 2024 The GoStor Authors All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package numa
+
+import (
+	"bufio"
+	"fmt"
+	"os"
+	"runtime"
+	"strconv"
+	"strings"
+	"sync"
+	"unsafe"
+)
+
+// #include <stdlib.h>
+// #include <unistd.h>
+// #include <sys/syscall.h>
+// #include <linux/mempolicy.h>
+// #include <numa.h>
+// #include <numaif.h>
+//
+// #cgo LDFLAGS: -lnuma
+import "C"
+
+const (
+	// NUMA memory policies (from linux/mempolicy.h)
+	MPOL_DEFAULT    = 0
+	MPOL_PREFERRED  = 1
+	MPOL_BIND       = 2
+	MPOL_INTERLEAVE = 3
+	MPOL_LOCAL      = 4
+	MPOL_MAX        = 5
+
+	// Flags for mbind
+	MPOL_MF_STRICT      = 1 << 0
+	MPOL_MF_MOVE        = 1 << 1
+	MPOL_MF_MOVE_ALL    = 1 << 2
+	MPOL_MF_LAZY        = 1 << 3
+	MPOL_MF_INTERNAL    = 1 << 4
+	MPOL_MF_VALID       = 1 << 5
+	MPOL_MF_WAKE        = 1 << 6
+	MPOL_MF_REMOVE      = 1 << 7
+	MPOL_MF_HONOR_VMFOL = 1 << 8
+
+	// Flags for get_mempolicy
+	MPOL_F_NODE         = 1 << 0
+	MPOL_F_ADDR         = 1 << 1
+	MPOL_F_MEMS_ALLOWED = 1 << 2
+)
+
+var (
+	numaInitOnce sync.Once
+	numaInitErr  error
+)
+
+func initNuma() {
+	numaInitOnce.Do(func() {
+		if C.numa_available() < 0 {
+			numaInitErr = fmt.Errorf("NUMA is not available")
+		} else {
+			// numa_init is not available in newer libnuma versions
+			// The library is automatically initialized on first use
+		}
+	})
+}
+
+func detectLinuxTopology(topology *Topology) error {
+	initNuma()
+
+	// First, try to use /sys filesystem for detection
+	nodes, err := detectNodesFromSys()
+	if err != nil {
+		// Fall back to libnuma
+		return detectFromLibNuma(topology)
+	}
+
+	topology.NumNodes = len(nodes)
+
+	for _, nodeID := range nodes {
+		nodeInfo := &NodeInfo{
+			ID: NodeID(nodeID),
+		}
+
+		// Get CPUs for this node
+		cpus, err := getCPUsForNode(nodeID)
+		if err == nil {
+			nodeInfo.CPUs = cpus
+			for _, cpu := range cpus {
+				topology.CPUToNodeMap[cpu] = NodeID(nodeID)
+			}
+		}
+
+		// Get memory info for this node
+		memInfo, err := getMemoryInfoForNode(nodeID)
+		if err == nil {
+			nodeInfo.TotalMemory = memInfo.total
+			nodeInfo.FreeMemory = memInfo.free
+		}
+
+		// Get distance matrix
+		distances, err := getDistancesForNode(nodeID, len(nodes))
+		if err == nil {
+			nodeInfo.DistanceToNode = distances
+		}
+
+		topology.Nodes[NodeID(nodeID)] = nodeInfo
+	}
+
+	return nil
+}
+
+func detectNodesFromSys() ([]int, error) {
+	entries, err := os.ReadDir("/sys/devices/system/node")
+	if err != nil {
+		return nil, err
+	}
+
+	var nodes []int
+	for _, entry := range entries {
+		if entry.IsDir() && strings.HasPrefix(entry.Name(), "node") {
+			nodeID, err := strconv.Atoi(entry.Name()[4:])
+			if err == nil {
+				nodes = append(nodes, nodeID)
+			}
+		}
+	}
+
+	if len(nodes) == 0 {
+		return nil, fmt.Errorf("no NUMA nodes found")
+	}
+
+	return nodes, nil
+}
+
+type memoryInfo struct {
+	total uint64
+	free  uint64
+}
+
+func getMemoryInfoForNode(nodeID int) (*memoryInfo, error) {
+	file, err := os.Open(fmt.Sprintf("/sys/devices/system/node/node%d/meminfo", nodeID))
+	if err != nil {
+		return nil, err
+	}
+	defer file.Close()
+
+	info := &memoryInfo{}
+	scanner := bufio.NewScanner(file)
+
+	for scanner.Scan() {
+		line := scanner.Text()
+		if strings.Contains(line, "MemTotal:") {
+			fields := strings.Fields(line)
+			if len(fields) >= 2 {
+				val, _ := strconv.ParseUint(fields[1], 10, 64)
+				info.total = val * 1024 // Convert from KB to bytes
+			}
+		} else if strings.Contains(line, "MemFree:") {
+			fields := strings.Fields(line)
+			if len(fields) >= 2 {
+				val, _ := strconv.ParseUint(fields[1], 10, 64)
+				info.free = val * 1024 // Convert from KB to bytes
+			}
+		}
+	}
+
+	return info, scanner.Err()
+}
+
+func getCPUsForNode(nodeID int) ([]int, error) {
+	data, err := os.ReadFile(fmt.Sprintf("/sys/devices/system/node/node%d/cpulist", nodeID))
+	if err != nil {
+		return nil, err
+	}
+
+	return parseCPUList(strings.TrimSpace(string(data)))
+}
+
+func parseCPUList(list string) ([]int, error) {
+	var cpus []int
+
+	// Handle empty list
+	if list == "" {
+		return cpus, nil
+	}
+
+	parts := strings.Split(list, ",")
+	for _, part := range parts {
+		if strings.Contains(part, "-") {
+			// Range like "0-7"
+			rangeParts := strings.Split(part, "-")
+			if len(rangeParts) == 2 {
+				start, _ := strconv.Atoi(rangeParts[0])
+				end, _ := strconv.Atoi(rangeParts[1])
+				for i := start; i <= end; i++ {
+					cpus = append(cpus, i)
+				}
+			}
+		} else {
+			// Single CPU
+			cpu, _ := strconv.Atoi(part)
+			cpus = append(cpus, cpu)
+		}
+	}
+
+	return cpus, nil
+}
+
+func getDistancesForNode(nodeID int, numNodes int) ([]uint32, error) {
+	file, err := os.Open(fmt.Sprintf("/sys/devices/system/node/node%d/distance", nodeID))
+	if err != nil {
+		return nil, err
+	}
+	defer file.Close()
+
+	data, err := os.ReadFile(fmt.Sprintf("/sys/devices/system/node/node%d/distance", nodeID))
+	if err != nil {
+		return nil, err
+	}
+
+	fields := strings.Fields(string(data))
+	distances := make([]uint32, len(fields))
+	for i, field := range fields {
+		val, _ := strconv.ParseUint(field, 10, 32)
+		distances[i] = uint32(val)
+	}
+
+	return distances, nil
+}
+
+func detectFromLibNuma(topology *Topology) error {
+	initNuma()
+	if numaInitErr != nil {
+		return numaInitErr
+	}
+
+	numNodes := int(C.numa_num_configured_nodes())
+	if numNodes <= 0 {
+		return fmt.Errorf("no NUMA nodes configured")
+	}
+
+	topology.NumNodes = numNodes
+
+	maxNode := int(C.numa_max_node())
+
+	for nodeID := 0; nodeID <= maxNode; nodeID++ {
+		if C.numa_bitmask_isbitset(C.numa_all_nodes_ptr, C.uint(nodeID)) == 0 {
+			continue
+		}
+
+		nodeInfo := &NodeInfo{
+			ID: NodeID(nodeID),
+		}
+
+		// Get memory size
+		totalMem := uint64(C.numa_node_size(C.int(nodeID), nil))
+		nodeInfo.TotalMemory = totalMem
+
+		// Get CPUs (this is approximate with libnuma)
+		cpuMask := C.numa_allocate_cpumask()
+		defer C.numa_free_cpumask(cpuMask)
+
+		if C.numa_node_to_cpus(C.int(nodeID), cpuMask) == 0 {
+			// Parse CPU mask
+			maxCPU := int(C.numa_num_configured_cpus())
+			for cpu := 0; cpu < maxCPU; cpu++ {
+				if C.numa_bitmask_isbitset(cpuMask, C.uint(cpu)) != 0 {
+					nodeInfo.CPUs = append(nodeInfo.CPUs, cpu)
+					topology.CPUToNodeMap[cpu] = NodeID(nodeID)
+				}
+			}
+		}
+
+		topology.Nodes[NodeID(nodeID)] = nodeInfo
+	}
+
+	return nil
+}
+
+func getCurrentNodeImpl() (NodeID, error) {
+	// Use /proc/self/stat to get current CPU
+	data, err := os.ReadFile("/proc/self/stat")
+	if err != nil {
+		return 0, fmt.Errorf("failed to read /proc/self/stat: %v", err)
+	}
+
+	fields := strings.Fields(string(data))
+	if len(fields) < 39 {
+		return 0, fmt.Errorf("unexpected /proc/self/stat format")
+	}
+
+	cpu, err := strconv.Atoi(fields[38])
+	if err != nil {
+		return 0, fmt.Errorf("failed to parse CPU: %v", err)
+	}
+
+	topology := GetTopology()
+	node, ok := topology.GetNodeForCPU(cpu)
+	if !ok {
+		return 0, fmt.Errorf("CPU %d not found in topology", cpu)
+	}
+	return node, nil
+}
+
+func setPreferredNodeImpl(node NodeID) (*PreferredNode, error) {
+	initNuma()
+	if numaInitErr != nil {
+		return nil, numaInitErr
+	}
+
+	// Save current nodemask
+	var oldMode C.int
+	var oldMask C.ulong
+	maxNode := C.ulong(2) // We only need 2 bits for now
+
+	if ret := C.get_mempolicy(&oldMode, &oldMask, maxNode, nil, 0); ret < 0 {
+		return nil, fmt.Errorf("get_mempolicy failed: %v", ret)
+	}
+
+	// Set preferred node
+	var newMask C.ulong = 1 << C.ulong(node)
+	if ret := C.set_mempolicy(MPOL_PREFERRED, &newMask, maxNode); ret < 0 {
+		return nil, fmt.Errorf("set_mempolicy failed: %v", ret)
+	}
+
+	return &PreferredNode{nodeID: node}, nil
+}
+
+func revertPreferredNodeImpl(p *PreferredNode) error {
+	// Reset to default policy
+	if ret := C.set_mempolicy(MPOL_DEFAULT, nil, 0); ret < 0 {
+		return fmt.Errorf("set_mempolicy failed: %v", ret)
+	}
+	return nil
+}
+
+func setMemoryPolicyImpl(policy MemoryPolicy, nodes []NodeID) error {
+	var mode int
+	switch policy {
+	case MPDefault:
+		mode = MPOL_DEFAULT
+	case MPBind:
+		mode = MPOL_BIND
+	case MPPreferred:
+		mode = MPOL_PREFERRED
+	case MPInterleave:
+		mode = MPOL_INTERLEAVE
+	default:
+		return fmt.Errorf("unknown memory policy: %d", policy)
+	}
+
+	// Build nodemask
+	var mask C.ulong
+	for _, node := range nodes {
+		mask |= 1 << C.ulong(node)
+	}
+
+	maxNode := C.ulong(2)
+	for _, node := range nodes {
+		if C.ulong(node) >= maxNode {
+			maxNode = C.ulong(node) + 1
+		}
+	}
+
+	if ret := C.set_mempolicy(C.int(mode), &mask, maxNode); ret < 0 {
+		return fmt.Errorf("set_mempolicy failed: %v", ret)
+	}
+
+	return nil
+}
+
+func allocateOnNodeImpl(size int, node NodeID) ([]byte, error) {
+	// Use mmap with MAP_PRIVATE and bind to specific node
+	buf := make([]byte, size)
+
+	// Set the memory policy for the allocated region
+	var mask C.ulong = 1 << C.ulong(node)
+	ptr := unsafe.Pointer(&buf[0])
+
+	if ret := C.mbind(ptr, C.ulong(size), MPOL_BIND, &mask, C.ulong(node)+1, MPOL_MF_STRICT); ret < 0 {
+		// Fall back to regular allocation
+		return buf, nil
+	}
+
+	return buf, nil
+}
+
+func scheduleOnNodeImpl(cpu int, fn func()) error {
+	// Simplified implementation - just run the function
+	// CPU affinity setting requires CGO or unix package
+	runtime.LockOSThread()
+	defer runtime.UnlockOSThread()
+	fn()
+	return nil
+}
+
+func getPreferredNodeForCurrentThreadImpl() NodeID {
+	var mode C.int
+	var node C.int
+
+	if ret := C.get_mempolicy(&mode, nil, 0, unsafe.Pointer(&node), MPOL_F_NODE); ret < 0 {
+		return NodeID(0)
+	}
+
+	if mode == MPOL_DEFAULT {
+		// Get current CPU's node
+		currentNode, _ := getCurrentNodeImpl()
+		return currentNode
+	}
+
+	return NodeID(node)
+}
+
+// PinThreadToNode pins the current goroutine's OS thread to a specific NUMA node
+func PinThreadToNode(node NodeID) error {
+	initNuma()
+	if numaInitErr != nil {
+		return numaInitErr
+	}
+
+	topology := GetTopology()
+	nodeInfo, ok := topology.GetNode(node)
+	if !ok {
+		return fmt.Errorf("NUMA node %d not found", node)
+	}
+
+	if len(nodeInfo.CPUs) == 0 {
+		return fmt.Errorf("NUMA node %d has no CPUs", node)
+	}
+
+	runtime.LockOSThread()
+	// Note: CPU affinity setting is simplified for portability
+	// Full implementation would use sched_setaffinity syscall
+	return nil
+}
+
+// UnpinThread releases the current goroutine's OS thread from NUMA binding
+func UnpinThread() {
+	runtime.UnlockOSThread()
+}
+
+// RunOnNode runs a function with the current goroutine pinned to a specific NUMA node
+func RunOnNode(node NodeID, fn func()) error {
+	if err := PinThreadToNode(node); err != nil {
+		return err
+	}
+	defer UnpinThread()
+
+	fn()
+	return nil
+}
--- a/pkg/util/numa/numa_linux_nocgo.go
+++ b/pkg/util/numa/numa_linux_nocgo.go
@@ -0,0 +1,415 @@
+//go:build linux && !cgo
+// +build linux,!cgo
+
+/*
+Copyright 2024 The GoStor Authors All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package numa
+
+import (
+	"bufio"
+	"fmt"
+	"os"
+	"runtime"
+	"strconv"
+	"strings"
+	"syscall"
+	"unsafe"
+)
+
+// Syscall numbers for x86_64 Linux
+const (
+	SYS_GETCPU        = 309
+	SYS_SET_MEMPOLICY = 238
+	SYS_GET_MEMPOLICY = 239
+	SYS_MBIND         = 237
+	SYS_MIGRATE_PAGES = 238
+)
+
+const (
+	// NUMA memory policies
+	MPOL_DEFAULT    = 0
+	MPOL_PREFERRED  = 1
+	MPOL_BIND       = 2
+	MPOL_INTERLEAVE = 3
+	MPOL_LOCAL      = 4
+
+	// Flags for get_mempolicy
+	MPOL_F_NODE = 1 << 0
+	MPOL_F_ADDR = 1 << 1
+
+	// Flags for mbind
+	MPOL_MF_STRICT = 1 << 0
+)
+
+//go:noescape
+//go:linkname runtime_GetCPU runtime.getcpu
+func runtime_GetCPU() uint32
+
+func detectLinuxTopology(topology *Topology) error {
+	nodes, err := detectNodesFromSys()
+	if err != nil {
+		return err
+	}
+
+	topology.NumNodes = len(nodes)
+
+	for _, nodeID := range nodes {
+		nodeInfo := &NodeInfo{
+			ID: NodeID(nodeID),
+		}
+
+		// Get CPUs for this node
+		cpus, err := getCPUsForNodeNoCGO(nodeID)
+		if err == nil {
+			nodeInfo.CPUs = cpus
+			for _, cpu := range cpus {
+				topology.CPUToNodeMap[cpu] = NodeID(nodeID)
+			}
+		}
+
+		// Get memory info for this node
+		memInfo, err := getMemoryInfoForNodeNoCGO(nodeID)
+		if err == nil {
+			nodeInfo.TotalMemory = memInfo.total
+			nodeInfo.FreeMemory = memInfo.free
+		}
+
+		// Get distance matrix
+		distances, err := getDistancesForNodeNoCGO(nodeID, len(nodes))
+		if err == nil {
+			nodeInfo.DistanceToNode = distances
+		}
+
+		topology.Nodes[NodeID(nodeID)] = nodeInfo
+	}
+
+	return nil
+}
+
+func detectNodesFromSys() ([]int, error) {
+	entries, err := os.ReadDir("/sys/devices/system/node")
+	if err != nil {
+		return nil, err
+	}
+
+	var nodes []int
+	for _, entry := range entries {
+		if entry.IsDir() && strings.HasPrefix(entry.Name(), "node") {
+			nodeID, err := strconv.Atoi(entry.Name()[4:])
+			if err == nil {
+				nodes = append(nodes, nodeID)
+			}
+		}
+	}
+
+	if len(nodes) == 0 {
+		return nil, fmt.Errorf("no NUMA nodes found")
+	}
+
+	return nodes, nil
+}
+
+type memoryInfo struct {
+	total uint64
+	free  uint64
+}
+
+func getMemoryInfoForNodeNoCGO(nodeID int) (*memoryInfo, error) {
+	file, err := os.Open(fmt.Sprintf("/sys/devices/system/node/node%d/meminfo", nodeID))
+	if err != nil {
+		return nil, err
+	}
+	defer file.Close()
+
+	info := &memoryInfo{}
+	scanner := bufio.NewScanner(file)
+
+	for scanner.Scan() {
+		line := scanner.Text()
+		if strings.Contains(line, "MemTotal:") {
+			fields := strings.Fields(line)
+			if len(fields) >= 2 {
+				val, _ := strconv.ParseUint(fields[1], 10, 64)
+				info.total = val * 1024
+			}
+		} else if strings.Contains(line, "MemFree:") {
+			fields := strings.Fields(line)
+			if len(fields) >= 2 {
+				val, _ := strconv.ParseUint(fields[1], 10, 64)
+				info.free = val * 1024
+			}
+		}
+	}
+
+	return info, scanner.Err()
+}
+
+func getCPUsForNodeNoCGO(nodeID int) ([]int, error) {
+	data, err := os.ReadFile(fmt.Sprintf("/sys/devices/system/node/node%d/cpulist", nodeID))
+	if err != nil {
+		return nil, err
+	}
+
+	return parseCPUListNoCGO(strings.TrimSpace(string(data)))
+}
+
+func parseCPUListNoCGO(list string) ([]int, error) {
+	var cpus []int
+
+	if list == "" {
+		return cpus, nil
+	}
+
+	parts := strings.Split(list, ",")
+	for _, part := range parts {
+		if strings.Contains(part, "-") {
+			rangeParts := strings.Split(part, "-")
+			if len(rangeParts) == 2 {
+				start, _ := strconv.Atoi(rangeParts[0])
+				end, _ := strconv.Atoi(rangeParts[1])
+				for i := start; i <= end; i++ {
+					cpus = append(cpus, i)
+				}
+			}
+		} else {
+			cpu, _ := strconv.Atoi(part)
+			cpus = append(cpus, cpu)
+		}
+	}
+
+	return cpus, nil
+}
+
+func getDistancesForNodeNoCGO(nodeID int, numNodes int) ([]uint32, error) {
+	data, err := os.ReadFile(fmt.Sprintf("/sys/devices/system/node/node%d/distance", nodeID))
+	if err != nil {
+		return nil, err
+	}
+
+	fields := strings.Fields(string(data))
+	distances := make([]uint32, len(fields))
+	for i, field := range fields {
+		val, _ := strconv.ParseUint(field, 10, 32)
+		distances[i] = uint32(val)
+	}
+
+	return distances, nil
+}
+
+func getCurrentNodeImpl() (NodeID, error) {
+	var cpu, node uint32
+
+	// Use getcpu syscall
+	r1, _, errno := syscall.Syscall(SYS_GETCPU,
+		uintptr(unsafe.Pointer(&cpu)),
+		uintptr(unsafe.Pointer(&node)),
+		0)
+
+	if errno != 0 {
+		// Fallback: try to determine from CPU
+		return getNodeFromSchedGetCPU()
+	}
+
+	_ = r1 // suppress unused warning
+	return NodeID(node), nil
+}
+
+func getNodeFromSchedGetCPU() (NodeID, error) {
+	// Get current CPU
+	cpu := runtime.GOMAXPROCS(0)
+
+	// Look up in topology
+	topology := GetTopology()
+	node, ok := topology.GetNodeForCPU(cpu)
+	if !ok {
+		return 0, fmt.Errorf("cannot determine NUMA node for CPU %d", cpu)
+	}
+	return node, nil
+}
+
+func setPreferredNodeImpl(node NodeID) (*PreferredNode, error) {
+	mask := uint64(1) << uint64(node)
+	maxNode := uint64(node) + 1
+
+	_, _, errno := syscall.Syscall6(SYS_SET_MEMPOLICY,
+		uintptr(MPOL_PREFERRED),
+		uintptr(unsafe.Pointer(&mask)),
+		uintptr(maxNode),
+		0, 0, 0)
+
+	if errno != 0 {
+		return nil, fmt.Errorf("set_mempolicy failed: %v", errno)
+	}
+
+	return &PreferredNode{nodeID: node}, nil
+}
+
+func revertPreferredNodeImpl(p *PreferredNode) error {
+	_, _, errno := syscall.Syscall(SYS_SET_MEMPOLICY,
+		uintptr(MPOL_DEFAULT),
+		0, 0)
+
+	if errno != 0 {
+		return fmt.Errorf("set_mempolicy failed: %v", errno)
+	}
+	return nil
+}
+
+func setMemoryPolicyImpl(policy MemoryPolicy, nodes []NodeID) error {
+	var mode int
+	switch policy {
+	case MPDefault:
+		mode = MPOL_DEFAULT
+	case MPBind:
+		mode = MPOL_BIND
+	case MPPreferred:
+		mode = MPOL_PREFERRED
+	case MPInterleave:
+		mode = MPOL_INTERLEAVE
+	default:
+		return fmt.Errorf("unknown memory policy: %d", policy)
+	}
+
+	var mask uint64
+	for _, node := range nodes {
+		mask |= 1 << uint64(node)
+	}
+
+	maxNode := uint64(0)
+	for _, node := range nodes {
+		if uint64(node) >= maxNode {
+			maxNode = uint64(node) + 1
+		}
+	}
+
+	_, _, errno := syscall.Syscall6(SYS_SET_MEMPOLICY,
+		uintptr(mode),
+		uintptr(unsafe.Pointer(&mask)),
+		uintptr(maxNode),
+		0, 0, 0)
+
+	if errno != 0 {
+		return fmt.Errorf("set_mempolicy failed: %v", errno)
+	}
+
+	return nil
+}
+
+func allocateOnNodeImpl(size int, node NodeID) ([]byte, error) {
+	buf := make([]byte, size)
+
+	// Try to use mbind to bind memory to node
+	mask := uint64(1) << uint64(node)
+	maxNode := uint64(node) + 1
+
+	_, _, errno := syscall.Syscall6(SYS_MBIND,
+		uintptr(unsafe.Pointer(&buf[0])),
+		uintptr(size),
+		uintptr(MPOL_BIND),
+		uintptr(unsafe.Pointer(&mask)),
+		uintptr(maxNode),
+		uintptr(MPOL_MF_STRICT))
+
+	if errno != 0 {
+		// Fall back to regular allocation
+		return buf, nil
+	}
+
+	return buf, nil
+}
+
+func scheduleOnNodeImpl(cpu int, fn func()) error {
+	var mask syscall.CPUSet
+	mask.Set(cpu)
+
+	runtime.LockOSThread()
+	defer runtime.UnlockOSThread()
+
+	if err := syscall.SchedSetaffinity(0, &mask); err != nil {
+		return fmt.Errorf("sched_setaffinity failed: %v", err)
+	}
+
+	fn()
+	return nil
+}
+
+func getPreferredNodeForCurrentThreadImpl() NodeID {
+	var mode int
+	var node uint32
+
+	_, _, errno := syscall.Syscall6(SYS_GET_MEMPOLICY,
+		uintptr(unsafe.Pointer(&mode)),
+		0, 0,
+		uintptr(unsafe.Pointer(&node)),
+		uintptr(MPOL_F_NODE),
+		0)
+
+	if errno != 0 {
+		node, _ := getCurrentNodeImpl()
+		return node
+	}
+
+	if mode == MPOL_DEFAULT {
+		node, _ := getCurrentNodeImpl()
+		return node
+	}
+
+	return NodeID(node)
+}
+
+// PinThreadToNode pins the current goroutine's OS thread to a specific NUMA node
+func PinThreadToNode(node NodeID) error {
+	topology := GetTopology()
+	nodeInfo, ok := topology.GetNode(node)
+	if !ok {
+		return fmt.Errorf("NUMA node %d not found", node)
+	}
+
+	if len(nodeInfo.CPUs) == 0 {
+		return fmt.Errorf("NUMA node %d has no CPUs", node)
+	}
+
+	runtime.LockOSThread()
+
+	var mask syscall.CPUSet
+	for _, cpu := range nodeInfo.CPUs {
+		mask.Set(cpu)
+	}
+
+	if err := syscall.SchedSetaffinity(0, &mask); err != nil {
+		runtime.UnlockOSThread()
+		return fmt.Errorf("sched_setaffinity failed: %v", err)
+	}
+
+	return nil
+}
+
+// UnpinThread releases the current goroutine's OS thread from NUMA binding
+func UnpinThread() {
+	runtime.UnlockOSThread()
+}
+
+// RunOnNode runs a function with the current goroutine pinned to a specific NUMA node
+func RunOnNode(node NodeID, fn func()) error {
+	if err := PinThreadToNode(node); err != nil {
+		return err
+	}
+	defer UnpinThread()
+
+	fn()
+	return nil
+}
--- a/pkg/util/numa/numa_stub.go
+++ b/pkg/util/numa/numa_stub.go
@@ -0,0 +1,94 @@
+//go:build !linux
+// +build !linux
+
+/*
+Copyright 2024 The GoStor Authors All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package numa
+
+import (
+	"fmt"
+	"runtime"
+)
+
+func detectLinuxTopology(topology *Topology) error {
+	return fmt.Errorf("NUMA not supported on this platform")
+}
+
+func getCurrentNodeImpl() (NodeID, error) {
+	return 0, fmt.Errorf("NUMA not supported on this platform")
+}
+
+func setPreferredNodeImpl(node NodeID) (*PreferredNode, error) {
+	return nil, fmt.Errorf("NUMA not supported on this platform")
+}
+
+func revertPreferredNodeImpl(p *PreferredNode) error {
+	return fmt.Errorf("NUMA not supported on this platform")
+}
+
+func setMemoryPolicyImpl(policy MemoryPolicy, nodes []NodeID) error {
+	return fmt.Errorf("NUMA not supported on this platform")
+}
+
+func allocateOnNodeImpl(size int, node NodeID) ([]byte, error) {
+	return make([]byte, size), nil
+}
+
+func scheduleOnNodeImpl(cpu int, fn func()) error {
+	fn()
+	return nil
+}
+
+func getPreferredNodeForCurrentThreadImpl() NodeID {
+	return 0
+}
+
+// PinThreadToNode pins the current goroutine's OS thread to a specific NUMA node
+// Stub implementation - does nothing on non-Linux platforms
+func PinThreadToNode(node NodeID) error {
+	return nil
+}
+
+// UnpinThread releases the current goroutine's OS thread from NUMA binding
+// Stub implementation - does nothing on non-Linux platforms
+func UnpinThread() {}
+
+// RunOnNode runs a function with the current goroutine pinned to a specific NUMA node
+// Stub implementation - just runs the function on non-Linux platforms
+func RunOnNode(node NodeID, fn func()) error {
+	fn()
+	return nil
+}
+
+// createSingleNodeTopology creates a single-node topology for non-NUMA systems
+func createSingleNodeTopology(topology *Topology) {
+	numCPU := runtime.NumCPU()
+	cpus := make([]int, numCPU)
+	for i := 0; i < numCPU; i++ {
+		cpus[i] = i
+		topology.CPUToNodeMap[i] = 0
+	}
+
+	topology.NumNodes = 1
+	topology.Nodes[0] = &NodeInfo{
+		ID:             0,
+		CPUs:           cpus,
+		TotalMemory:    0,
+		FreeMemory:     0,
+		DistanceToNode: []uint32{10},
+	}
+}
--- a/pkg/util/numa/numa_test.go
+++ b/pkg/util/numa/numa_test.go
@@ -0,0 +1,105 @@
+/*
+Copyright 2024 The GoStor Authors All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package numa
+
+import (
+	"testing"
+)
+
+func TestTopologyDetection(t *testing.T) {
+	topology := GetTopology()
+	if topology == nil {
+		t.Fatal("GetTopology returned nil")
+	}
+
+	if topology.NumNodes < 1 {
+		t.Errorf("Expected at least 1 NUMA node, got %d", topology.NumNodes)
+	}
+
+	if len(topology.Nodes) == 0 {
+		t.Error("No NUMA nodes found in topology")
+	}
+}
+
+func TestBufferPool(t *testing.T) {
+	pool := NewNUMABufferPool(&BufferPoolConfig{
+		BufferSize:      4096,
+		PerNodePoolSize: 10,
+		EnableNUMA:      false, // Disable NUMA for test
+	})
+
+	if pool == nil {
+		t.Fatal("NewNUMABufferPool returned nil")
+	}
+
+	// Test Get/Put
+	buf := pool.Get()
+	if len(buf) != 4096 {
+		t.Errorf("Expected buffer size 4096, got %d", len(buf))
+	}
+
+	pool.Put(buf)
+
+	// Test stats
+	stats := pool.Stats()
+	if stats.Gets == 0 {
+		t.Error("Expected Gets > 0")
+	}
+	if stats.Puts == 0 {
+		t.Error("Expected Puts > 0")
+	}
+}
+
+func TestBufferPoolMultipleSizes(t *testing.T) {
+	pool := NewNUMABufferPool(&BufferPoolConfig{
+		BufferSize:      8192,
+		PerNodePoolSize: 5,
+		EnableNUMA:      false,
+	})
+
+	// Get multiple buffers
+	var buffers [][]byte
+	for i := 0; i < 10; i++ {
+		buf := pool.Get()
+		buffers = append(buffers, buf)
+	}
+
+	// Put all back
+	for _, buf := range buffers {
+		pool.Put(buf)
+	}
+
+	stats := pool.Stats()
+	if stats.Gets != 10 {
+		t.Errorf("Expected 10 gets, got %d", stats.Gets)
+	}
+	if stats.Puts != 10 {
+		t.Errorf("Expected 10 puts, got %d", stats.Puts)
+	}
+}
+
+func TestAvailable(t *testing.T) {
+	// Just verify the function doesn't panic
+	_ = Available()
+}
+
+func TestNumNodes(t *testing.T) {
+	n := NumNodes()
+	if n < 1 {
+		t.Errorf("Expected at least 1 node, got %d", n)
+	}
+}
--- a/pkg/util/numa/pool.go
+++ b/pkg/util/numa/pool.go
@@ -0,0 +1,424 @@
+/*
+Copyright 2024 The GoStor Authors All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package numa
+
+import (
+	"context"
+	"sync"
+	"sync/atomic"
+)
+
+// BufferPoolConfig configures NUMA-aware buffer pools
+type BufferPoolConfig struct {
+	// BufferSize is the size of each buffer
+	BufferSize int
+	// PerNodePoolSize is the number of buffers to preallocate per node
+	PerNodePoolSize int
+	// EnableNUMA enables NUMA-aware allocation
+	EnableNUMA bool
+}
+
+// DefaultBufferPoolConfig returns a default configuration
+func DefaultBufferPoolConfig() *BufferPoolConfig {
+	return &BufferPoolConfig{
+		BufferSize:      256 * 1024, // 256KB buffers for I/O
+		PerNodePoolSize: 1024,       // 1024 buffers per node
+		EnableNUMA:      true,
+	}
+}
+
+// NUMABufferPool provides NUMA-aware buffer pooling
+type NUMABufferPool struct {
+	config    *BufferPoolConfig
+	topology  *Topology
+	nodePools map[NodeID]*sync.Pool
+	stats     *PoolStats
+
+	// Fallback pool for when NUMA is not available or disabled
+	fallbackPool *sync.Pool
+
+	mu sync.RWMutex
+}
+
+// PoolStats tracks buffer pool statistics
+type PoolStats struct {
+	Gets         uint64
+	Puts         uint64
+	Misses       uint64
+	NodeLocalHit uint64
+	NUMAHit      uint64
+}
+
+// NewNUMABufferPool creates a new NUMA-aware buffer pool
+func NewNUMABufferPool(config *BufferPoolConfig) *NUMABufferPool {
+	if config == nil {
+		config = DefaultBufferPoolConfig()
+	}
+
+	pool := &NUMABufferPool{
+		config:    config,
+		topology:  GetTopology(),
+		nodePools: make(map[NodeID]*sync.Pool),
+		stats:     &PoolStats{},
+	}
+
+	// Initialize fallback pool
+	pool.fallbackPool = &sync.Pool{
+		New: func() interface{} {
+			atomic.AddUint64(&pool.stats.Misses, 1)
+			return make([]byte, config.BufferSize)
+		},
+	}
+
+	// Initialize NUMA pools if enabled and available
+	if config.EnableNUMA && Available() && pool.topology.NumNodes > 1 {
+		for nodeID := range pool.topology.Nodes {
+			pool.nodePools[nodeID] = pool.createNodePool(nodeID)
+		}
+	}
+
+	return pool
+}
+
+// createNodePool creates a buffer pool for a specific NUMA node
+func (p *NUMABufferPool) createNodePool(node NodeID) *sync.Pool {
+	return &sync.Pool{
+		New: func() interface{} {
+			atomic.AddUint64(&p.stats.Misses, 1)
+
+			// Try NUMA-local allocation first
+			if p.config.EnableNUMA && Available() {
+				buf, err := AllocateOnNode(p.config.BufferSize, node)
+				if err == nil {
+					atomic.AddUint64(&p.stats.NUMAHit, 1)
+					return buf
+				}
+			}
+
+			// Fall back to regular allocation
+			return make([]byte, p.config.BufferSize)
+		},
+	}
+}
+
+// Get returns a buffer from the pool, preferably from the local NUMA node
+func (p *NUMABufferPool) Get() []byte {
+	atomic.AddUint64(&p.stats.Gets, 1)
+
+	// Try to get from the local NUMA node first
+	if p.config.EnableNUMA && Available() && len(p.nodePools) > 0 {
+		if node, err := GetCurrentNode(); err == nil {
+			if nodePool, ok := p.nodePools[node]; ok {
+				buf := nodePool.Get().([]byte)
+				atomic.AddUint64(&p.stats.NodeLocalHit, 1)
+				return buf[:p.config.BufferSize]
+			}
+		}
+	}
+
+	// Fall back to the fallback pool
+	return p.fallbackPool.Get().([]byte)[:p.config.BufferSize]
+}
+
+// Put returns a buffer to the pool, preferably to its local NUMA node
+func (p *NUMABufferPool) Put(buf []byte) {
+	if buf == nil {
+		return
+	}
+
+	atomic.AddUint64(&p.stats.Puts, 1)
+
+	// Resize buffer to full size before returning to pool
+	if cap(buf) < p.config.BufferSize {
+		// Buffer is too small, discard it
+		return
+	}
+	buf = buf[:p.config.BufferSize]
+
+	// Try to return to the local NUMA node pool
+	if p.config.EnableNUMA && Available() && len(p.nodePools) > 0 {
+		if node, err := GetCurrentNode(); err == nil {
+			if nodePool, ok := p.nodePools[node]; ok {
+				nodePool.Put(buf)
+				return
+			}
+		}
+	}
+
+	// Fall back to the fallback pool
+	p.fallbackPool.Put(buf)
+}
+
+// GetForNode returns a buffer from a specific NUMA node's pool
+func (p *NUMABufferPool) GetForNode(node NodeID) []byte {
+	atomic.AddUint64(&p.stats.Gets, 1)
+
+	if nodePool, ok := p.nodePools[node]; ok {
+		return nodePool.Get().([]byte)[:p.config.BufferSize]
+	}
+
+	return p.fallbackPool.Get().([]byte)[:p.config.BufferSize]
+}
+
+// PutForNode returns a buffer to a specific NUMA node's pool
+func (p *NUMABufferPool) PutForNode(node NodeID, buf []byte) {
+	if buf == nil {
+		return
+	}
+
+	atomic.AddUint64(&p.stats.Puts, 1)
+
+	if cap(buf) < p.config.BufferSize {
+		return
+	}
+	buf = buf[:p.config.BufferSize]
+
+	if nodePool, ok := p.nodePools[node]; ok {
+		nodePool.Put(buf)
+		return
+	}
+
+	p.fallbackPool.Put(buf)
+}
+
+// Stats returns current pool statistics
+func (p *NUMABufferPool) Stats() PoolStats {
+	return PoolStats{
+		Gets:         atomic.LoadUint64(&p.stats.Gets),
+		Puts:         atomic.LoadUint64(&p.stats.Puts),
+		Misses:       atomic.LoadUint64(&p.stats.Misses),
+		NodeLocalHit: atomic.LoadUint64(&p.stats.NodeLocalHit),
+		NUMAHit:      atomic.LoadUint64(&p.stats.NUMAHit),
+	}
+}
+
+// GetConfig returns the pool configuration
+func (p *NUMABufferPool) GetConfig() *BufferPoolConfig {
+	return p.config
+}
+
+// Close releases all resources associated with the pool
+func (p *NUMABufferPool) Close() error {
+	p.mu.Lock()
+	defer p.mu.Unlock()
+
+	// Clear all pools
+	p.nodePools = make(map[NodeID]*sync.Pool)
+	p.fallbackPool = nil
+
+	return nil
+}
+
+// SizeAwarePool is a buffer pool that can handle multiple buffer sizes
+type SizeAwarePool struct {
+	pools map[int]*NUMABufferPool
+	mu    sync.RWMutex
+}
+
+// NewSizeAwarePool creates a new size-aware buffer pool
+func NewSizeAwarePool(sizes []int, enableNUMA bool) *SizeAwarePool {
+	sap := &SizeAwarePool{
+		pools: make(map[int]*NUMABufferPool),
+	}
+
+	for _, size := range sizes {
+		sap.pools[size] = NewNUMABufferPool(&BufferPoolConfig{
+			BufferSize:      size,
+			PerNodePoolSize: 1024,
+			EnableNUMA:      enableNUMA,
+		})
+	}
+
+	return sap
+}
+
+// Get returns a buffer of the specified size
+func (sap *SizeAwarePool) Get(size int) []byte {
+	sap.mu.RLock()
+	pool, ok := sap.pools[size]
+	sap.mu.RUnlock()
+
+	if ok {
+		return pool.Get()
+	}
+
+	// No pool for this size, allocate directly
+	return make([]byte, size)
+}
+
+// Put returns a buffer to the appropriate pool
+func (sap *SizeAwarePool) Put(buf []byte) {
+	if buf == nil {
+		return
+	}
+
+	size := cap(buf)
+
+	sap.mu.RLock()
+	pool, ok := sap.pools[size]
+	sap.mu.RUnlock()
+
+	if ok {
+		pool.Put(buf)
+	}
+	// If no pool for this size, let GC handle it
+}
+
+// PinningAllocator allocates buffers while the goroutine is pinned to a NUMA node
+type PinningAllocator struct {
+	pool *NUMABufferPool
+}
+
+// NewPinningAllocator creates a new pinning allocator
+func NewPinningAllocator(pool *NUMABufferPool) *PinningAllocator {
+	return &PinningAllocator{pool: pool}
+}
+
+// Allocate allocates a buffer while pinned to the current NUMA node
+func (pa *PinningAllocator) Allocate() []byte {
+	return pa.pool.Get()
+}
+
+// AllocateOnNode allocates a buffer while pinned to a specific NUMA node
+func (pa *PinningAllocator) AllocateOnNode(node NodeID) ([]byte, error) {
+	var buf []byte
+	err := RunOnNode(node, func() {
+		buf = pa.pool.GetForNode(node)
+	})
+	return buf, err
+}
+
+// Global pools for common buffer sizes
+var (
+	globalPools     map[int]*NUMABufferPool
+	globalPoolsOnce sync.Once
+	globalPoolsMu   sync.RWMutex
+)
+
+// InitGlobalPools initializes global buffer pools
+func InitGlobalPools(sizes []int, enableNUMA bool) {
+	globalPoolsOnce.Do(func() {
+		globalPools = make(map[int]*NUMABufferPool)
+		for _, size := range sizes {
+			globalPools[size] = NewNUMABufferPool(&BufferPoolConfig{
+				BufferSize:      size,
+				PerNodePoolSize: 1024,
+				EnableNUMA:      enableNUMA,
+			})
+		}
+	})
+}
+
+// GetBuffer gets a buffer from the global pool
+func GetBuffer(size int) []byte {
+	globalPoolsMu.RLock()
+	pool, ok := globalPools[size]
+	globalPoolsMu.RUnlock()
+
+	if ok {
+		return pool.Get()
+	}
+	return make([]byte, size)
+}
+
+// PutBuffer returns a buffer to the global pool
+func PutBuffer(buf []byte) {
+	if buf == nil {
+		return
+	}
+
+	size := cap(buf)
+
+	globalPoolsMu.RLock()
+	pool, ok := globalPools[size]
+	globalPoolsMu.RUnlock()
+
+	if ok {
+		pool.Put(buf)
+	}
+}
+
+// WorkerPool is a pool of workers that are pinned to specific NUMA nodes
+type WorkerPool struct {
+	size      int
+	numaNode  NodeID
+	workQueue chan func()
+	ctx       context.Context
+	cancel    context.CancelFunc
+	wg        sync.WaitGroup
+}
+
+// NewWorkerPool creates a new NUMA-aware worker pool
+func NewWorkerPool(size int, node NodeID) *WorkerPool {
+	ctx, cancel := context.WithCancel(context.Background())
+
+	wp := &WorkerPool{
+		size:      size,
+		numaNode:  node,
+		workQueue: make(chan func(), size*2),
+		ctx:       ctx,
+		cancel:    cancel,
+	}
+
+	// Start workers
+	for i := 0; i < size; i++ {
+		wp.wg.Add(1)
+		go wp.worker()
+	}
+
+	return wp
+}
+
+func (wp *WorkerPool) worker() {
+	defer wp.wg.Done()
+
+	// Pin to NUMA node
+	if Available() {
+		PinThreadToNode(wp.numaNode)
+		defer UnpinThread()
+	}
+
+	for {
+		select {
+		case work := <-wp.workQueue:
+			if work != nil {
+				work()
+			}
+		case <-wp.ctx.Done():
+			return
+		}
+	}
+}
+
+// Submit submits work to the worker pool
+func (wp *WorkerPool) Submit(work func()) bool {
+	select {
+	case wp.workQueue <- work:
+		return true
+	case <-wp.ctx.Done():
+		return false
+	default:
+		return false
+	}
+}
+
+// Stop stops the worker pool
+func (wp *WorkerPool) Stop() {
+	wp.cancel()
+	wp.wg.Wait()
+	close(wp.workQueue)
+}
--- a/pkg/util/util.go
+++ b/pkg/util/util.go
@@ -73,32 +73,38 @@ func MarshalKVText(kv []KeyValue) []byte {
 	return data
 }

+// MarshalUint16 returns big-endian encoding of i as a new 2-byte slice.
+// Deprecated: Use MarshalUint16To or binary.BigEndian.PutUint16 for zero-allocation.
 func MarshalUint16(i uint16) []byte {
-	var data []byte
-	for j := 8; j >= 0; j -= 8 {
-		b := byte(i >> uint16(j))
-		data = append(data, b)
-	}
-	return data
+	var data [2]byte
+	binary.BigEndian.PutUint16(data[:], i)
+	return data[:]
 }

+// MarshalUint32 returns big-endian encoding of i as a new 4-byte slice.
+// Deprecated: Use MarshalUint32To or binary.BigEndian.PutUint32 for zero-allocation.
 func MarshalUint32(i uint32) []byte {
-	var data []byte
-	for j := 24; j >= 0; j -= 8 {
-		b := byte(i >> uint32(j))
-		data = append(data, b)
-	}
-	return data
+	var data [4]byte
+	binary.BigEndian.PutUint32(data[:], i)
+	return data[:]
+}
+
+// MarshalUint32To writes big-endian encoding of i into buf, which must be at least 4 bytes.
+// This is a zero-allocation alternative to MarshalUint32.
+func MarshalUint32To(buf []byte, i uint32) {
+	binary.BigEndian.PutUint32(buf, i)
 }

 func MarshalUint64(v uint64) []byte {
-	var data = [8]byte{}
-	var i = 0
-	for j := 56; j >= 0; j -= 8 {
-		data[i] = byte(v >> uint32(j))
-		i++
-	}
-	return data[0:8]
+	var data [8]byte
+	binary.BigEndian.PutUint64(data[:], v)
+	return data[:]
+}
+
+// MarshalUint64To writes big-endian encoding of v into buf, which must be at least 8 bytes.
+// This is a zero-allocation alternative for partial writes.
+func MarshalUint64To(buf []byte, v uint64) {
+	binary.BigEndian.PutUint64(buf, v)
 }

 func StringToByte(str string, align int, maxlength int) []byte {