optimize the perf and support more features

This commit is contained in:
Lei Xue
2026-03-14 11:45:35 +08:00
parent 7e7ebacd9d
commit 00cfac3d24
56 changed files with 6340 additions and 1019 deletions

255
pkg/util/numa/numa.go Normal file
View File

@@ -0,0 +1,255 @@
/*
Copyright 2024 The GoStor Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package numa provides NUMA-aware utilities for multi-socket systems.
// This package enables memory allocation optimization and thread binding
// for better performance on NUMA architectures.
package numa
import (
"fmt"
"runtime"
"sync"
)
// NodeID represents a NUMA node identifier
type NodeID int
// NodeInfo contains information about a NUMA node
type NodeInfo struct {
ID NodeID
CPUs []int // CPU cores on this node
TotalMemory uint64 // Total memory in bytes
FreeMemory uint64 // Free memory in bytes
DistanceToNode []uint32 // Distance to other nodes (lower is closer)
}
// Topology represents the NUMA topology of the system
type Topology struct {
Nodes map[NodeID]*NodeInfo
NumNodes int
CPUToNodeMap map[int]NodeID
mu sync.RWMutex
}
var (
globalTopology *Topology
globalTopologyOnce sync.Once
numaAvailable bool
)
// Available returns true if NUMA support is available on this system
func Available() bool {
return numaAvailable
}
// GetTopology returns the NUMA topology of the system
func GetTopology() *Topology {
globalTopologyOnce.Do(func() {
globalTopology = detectTopology()
})
return globalTopology
}
// detectTopology detects the NUMA topology of the system
// This is a placeholder that will be implemented per-platform
func detectTopology() *Topology {
topology := &Topology{
Nodes: make(map[NodeID]*NodeInfo),
CPUToNodeMap: make(map[int]NodeID),
}
// Try to detect using platform-specific methods
if err := detectLinuxTopology(topology); err != nil {
// Fall back to single-node topology
topology.NumNodes = 1
topology.Nodes[0] = &NodeInfo{
ID: 0,
CPUs: makeRange(0, runtime.NumCPU()),
TotalMemory: 0, // Unknown
FreeMemory: 0, // Unknown
}
for i := 0; i < runtime.NumCPU(); i++ {
topology.CPUToNodeMap[i] = 0
}
numaAvailable = false
} else {
numaAvailable = topology.NumNodes > 1
}
return topology
}
// makeRange creates a slice of integers from start to end
func makeRange(start, end int) []int {
result := make([]int, end-start)
for i := range result {
result[i] = start + i
}
return result
}
// GetNodeForCPU returns the NUMA node ID for a given CPU
func (t *Topology) GetNodeForCPU(cpu int) (NodeID, bool) {
t.mu.RLock()
defer t.mu.RUnlock()
node, ok := t.CPUToNodeMap[cpu]
return node, ok
}
// GetNode returns information about a specific NUMA node
func (t *Topology) GetNode(id NodeID) (*NodeInfo, bool) {
t.mu.RLock()
defer t.mu.RUnlock()
node, ok := t.Nodes[id]
return node, ok
}
// GetCurrentNode returns the NUMA node of the current thread
func GetCurrentNode() (NodeID, error) {
return getCurrentNodeImpl()
}
// PreferredNode represents a preferred NUMA node for memory allocation
type PreferredNode struct {
nodeID NodeID
}
// SetPreferredNode sets the preferred NUMA node for the current thread
func SetPreferredNode(node NodeID) (*PreferredNode, error) {
return setPreferredNodeImpl(node)
}
// Revert restores the previous NUMA policy
func (p *PreferredNode) Revert() error {
return revertPreferredNodeImpl(p)
}
// MemoryPolicy represents memory allocation policies
type MemoryPolicy int
const (
// MPDefault uses the default memory policy
MPDefault MemoryPolicy = iota
// MPBind binds memory allocation to specific nodes
MPBind
// MPPreferred prefers memory allocation from specific nodes
MPPreferred
// MPInterleave interleaves memory across nodes
MPInterleave
)
// SetMemoryPolicy sets the memory policy for the current thread
func SetMemoryPolicy(policy MemoryPolicy, nodes []NodeID) error {
return setMemoryPolicyImpl(policy, nodes)
}
// AllocateOnNode allocates memory on a specific NUMA node
func AllocateOnNode(size int, node NodeID) ([]byte, error) {
return allocateOnNodeImpl(size, node)
}
// LocalAlloc allocates memory on the local NUMA node
func LocalAlloc(size int) ([]byte, error) {
node, err := GetCurrentNode()
if err != nil {
// Fall back to regular allocation
return make([]byte, size), nil
}
return AllocateOnNode(size, node)
}
// NodeLocalPool is a memory pool that allocates from a specific NUMA node
type NodeLocalPool struct {
nodeID NodeID
pool sync.Pool
size int
}
// NewNodeLocalPool creates a new NUMA-local memory pool
func NewNodeLocalPool(size int, node NodeID) *NodeLocalPool {
return &NodeLocalPool{
nodeID: node,
size: size,
pool: sync.Pool{
New: func() interface{} {
buf, err := AllocateOnNode(size, node)
if err != nil {
// Fall back to regular allocation
return make([]byte, size)
}
return buf
},
},
}
}
// Get returns a buffer from the pool
func (p *NodeLocalPool) Get() []byte {
return p.pool.Get().([]byte)
}
// Put returns a buffer to the pool
func (p *NodeLocalPool) Put(buf []byte) {
if buf != nil && len(buf) >= p.size {
p.pool.Put(buf[:p.size])
}
}
// Close releases all resources associated with the pool
func (p *NodeLocalPool) Close() error {
// In Go, sync.Pool doesn't have a Close method
// The memory will be garbage collected eventually
return nil
}
// NodeScheduler schedules tasks on specific NUMA nodes
type NodeScheduler struct {
topology *Topology
mu sync.RWMutex
}
// NewNodeScheduler creates a new NUMA-aware scheduler
func NewNodeScheduler() *NodeScheduler {
return &NodeScheduler{
topology: GetTopology(),
}
}
// ScheduleOnNode schedules a function to run on a specific NUMA node
func (s *NodeScheduler) ScheduleOnNode(node NodeID, fn func()) error {
nodeInfo, ok := s.topology.GetNode(node)
if !ok {
return fmt.Errorf("NUMA node %d not found", node)
}
if len(nodeInfo.CPUs) == 0 {
return fmt.Errorf("NUMA node %d has no CPUs", node)
}
return scheduleOnNodeImpl(nodeInfo.CPUs[0], fn)
}
// GetPreferredNodeForCurrentThread returns the preferred NUMA node
// based on current thread's affinity
func GetPreferredNodeForCurrentThread() NodeID {
return getPreferredNodeForCurrentThreadImpl()
}
// NumNodes returns the number of NUMA nodes in the system
func NumNodes() int {
return GetTopology().NumNodes
}

469
pkg/util/numa/numa_linux.go Normal file
View File

@@ -0,0 +1,469 @@
//go:build linux
// +build linux
/*
Copyright 2024 The GoStor Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package numa
import (
"bufio"
"fmt"
"os"
"runtime"
"strconv"
"strings"
"sync"
"unsafe"
)
// #include <stdlib.h>
// #include <unistd.h>
// #include <sys/syscall.h>
// #include <linux/mempolicy.h>
// #include <numa.h>
// #include <numaif.h>
//
// #cgo LDFLAGS: -lnuma
import "C"
const (
// NUMA memory policies (from linux/mempolicy.h)
MPOL_DEFAULT = 0
MPOL_PREFERRED = 1
MPOL_BIND = 2
MPOL_INTERLEAVE = 3
MPOL_LOCAL = 4
MPOL_MAX = 5
// Flags for mbind
MPOL_MF_STRICT = 1 << 0
MPOL_MF_MOVE = 1 << 1
MPOL_MF_MOVE_ALL = 1 << 2
MPOL_MF_LAZY = 1 << 3
MPOL_MF_INTERNAL = 1 << 4
MPOL_MF_VALID = 1 << 5
MPOL_MF_WAKE = 1 << 6
MPOL_MF_REMOVE = 1 << 7
MPOL_MF_HONOR_VMFOL = 1 << 8
// Flags for get_mempolicy
MPOL_F_NODE = 1 << 0
MPOL_F_ADDR = 1 << 1
MPOL_F_MEMS_ALLOWED = 1 << 2
)
var (
numaInitOnce sync.Once
numaInitErr error
)
func initNuma() {
numaInitOnce.Do(func() {
if C.numa_available() < 0 {
numaInitErr = fmt.Errorf("NUMA is not available")
} else {
// numa_init is not available in newer libnuma versions
// The library is automatically initialized on first use
}
})
}
func detectLinuxTopology(topology *Topology) error {
initNuma()
// First, try to use /sys filesystem for detection
nodes, err := detectNodesFromSys()
if err != nil {
// Fall back to libnuma
return detectFromLibNuma(topology)
}
topology.NumNodes = len(nodes)
for _, nodeID := range nodes {
nodeInfo := &NodeInfo{
ID: NodeID(nodeID),
}
// Get CPUs for this node
cpus, err := getCPUsForNode(nodeID)
if err == nil {
nodeInfo.CPUs = cpus
for _, cpu := range cpus {
topology.CPUToNodeMap[cpu] = NodeID(nodeID)
}
}
// Get memory info for this node
memInfo, err := getMemoryInfoForNode(nodeID)
if err == nil {
nodeInfo.TotalMemory = memInfo.total
nodeInfo.FreeMemory = memInfo.free
}
// Get distance matrix
distances, err := getDistancesForNode(nodeID, len(nodes))
if err == nil {
nodeInfo.DistanceToNode = distances
}
topology.Nodes[NodeID(nodeID)] = nodeInfo
}
return nil
}
func detectNodesFromSys() ([]int, error) {
entries, err := os.ReadDir("/sys/devices/system/node")
if err != nil {
return nil, err
}
var nodes []int
for _, entry := range entries {
if entry.IsDir() && strings.HasPrefix(entry.Name(), "node") {
nodeID, err := strconv.Atoi(entry.Name()[4:])
if err == nil {
nodes = append(nodes, nodeID)
}
}
}
if len(nodes) == 0 {
return nil, fmt.Errorf("no NUMA nodes found")
}
return nodes, nil
}
type memoryInfo struct {
total uint64
free uint64
}
func getMemoryInfoForNode(nodeID int) (*memoryInfo, error) {
file, err := os.Open(fmt.Sprintf("/sys/devices/system/node/node%d/meminfo", nodeID))
if err != nil {
return nil, err
}
defer file.Close()
info := &memoryInfo{}
scanner := bufio.NewScanner(file)
for scanner.Scan() {
line := scanner.Text()
if strings.Contains(line, "MemTotal:") {
fields := strings.Fields(line)
if len(fields) >= 2 {
val, _ := strconv.ParseUint(fields[1], 10, 64)
info.total = val * 1024 // Convert from KB to bytes
}
} else if strings.Contains(line, "MemFree:") {
fields := strings.Fields(line)
if len(fields) >= 2 {
val, _ := strconv.ParseUint(fields[1], 10, 64)
info.free = val * 1024 // Convert from KB to bytes
}
}
}
return info, scanner.Err()
}
func getCPUsForNode(nodeID int) ([]int, error) {
data, err := os.ReadFile(fmt.Sprintf("/sys/devices/system/node/node%d/cpulist", nodeID))
if err != nil {
return nil, err
}
return parseCPUList(strings.TrimSpace(string(data)))
}
func parseCPUList(list string) ([]int, error) {
var cpus []int
// Handle empty list
if list == "" {
return cpus, nil
}
parts := strings.Split(list, ",")
for _, part := range parts {
if strings.Contains(part, "-") {
// Range like "0-7"
rangeParts := strings.Split(part, "-")
if len(rangeParts) == 2 {
start, _ := strconv.Atoi(rangeParts[0])
end, _ := strconv.Atoi(rangeParts[1])
for i := start; i <= end; i++ {
cpus = append(cpus, i)
}
}
} else {
// Single CPU
cpu, _ := strconv.Atoi(part)
cpus = append(cpus, cpu)
}
}
return cpus, nil
}
func getDistancesForNode(nodeID int, numNodes int) ([]uint32, error) {
file, err := os.Open(fmt.Sprintf("/sys/devices/system/node/node%d/distance", nodeID))
if err != nil {
return nil, err
}
defer file.Close()
data, err := os.ReadFile(fmt.Sprintf("/sys/devices/system/node/node%d/distance", nodeID))
if err != nil {
return nil, err
}
fields := strings.Fields(string(data))
distances := make([]uint32, len(fields))
for i, field := range fields {
val, _ := strconv.ParseUint(field, 10, 32)
distances[i] = uint32(val)
}
return distances, nil
}
func detectFromLibNuma(topology *Topology) error {
initNuma()
if numaInitErr != nil {
return numaInitErr
}
numNodes := int(C.numa_num_configured_nodes())
if numNodes <= 0 {
return fmt.Errorf("no NUMA nodes configured")
}
topology.NumNodes = numNodes
maxNode := int(C.numa_max_node())
for nodeID := 0; nodeID <= maxNode; nodeID++ {
if C.numa_bitmask_isbitset(C.numa_all_nodes_ptr, C.uint(nodeID)) == 0 {
continue
}
nodeInfo := &NodeInfo{
ID: NodeID(nodeID),
}
// Get memory size
totalMem := uint64(C.numa_node_size(C.int(nodeID), nil))
nodeInfo.TotalMemory = totalMem
// Get CPUs (this is approximate with libnuma)
cpuMask := C.numa_allocate_cpumask()
defer C.numa_free_cpumask(cpuMask)
if C.numa_node_to_cpus(C.int(nodeID), cpuMask) == 0 {
// Parse CPU mask
maxCPU := int(C.numa_num_configured_cpus())
for cpu := 0; cpu < maxCPU; cpu++ {
if C.numa_bitmask_isbitset(cpuMask, C.uint(cpu)) != 0 {
nodeInfo.CPUs = append(nodeInfo.CPUs, cpu)
topology.CPUToNodeMap[cpu] = NodeID(nodeID)
}
}
}
topology.Nodes[NodeID(nodeID)] = nodeInfo
}
return nil
}
func getCurrentNodeImpl() (NodeID, error) {
// Use /proc/self/stat to get current CPU
data, err := os.ReadFile("/proc/self/stat")
if err != nil {
return 0, fmt.Errorf("failed to read /proc/self/stat: %v", err)
}
fields := strings.Fields(string(data))
if len(fields) < 39 {
return 0, fmt.Errorf("unexpected /proc/self/stat format")
}
cpu, err := strconv.Atoi(fields[38])
if err != nil {
return 0, fmt.Errorf("failed to parse CPU: %v", err)
}
topology := GetTopology()
node, ok := topology.GetNodeForCPU(cpu)
if !ok {
return 0, fmt.Errorf("CPU %d not found in topology", cpu)
}
return node, nil
}
func setPreferredNodeImpl(node NodeID) (*PreferredNode, error) {
initNuma()
if numaInitErr != nil {
return nil, numaInitErr
}
// Save current nodemask
var oldMode C.int
var oldMask C.ulong
maxNode := C.ulong(2) // We only need 2 bits for now
if ret := C.get_mempolicy(&oldMode, &oldMask, maxNode, nil, 0); ret < 0 {
return nil, fmt.Errorf("get_mempolicy failed: %v", ret)
}
// Set preferred node
var newMask C.ulong = 1 << C.ulong(node)
if ret := C.set_mempolicy(MPOL_PREFERRED, &newMask, maxNode); ret < 0 {
return nil, fmt.Errorf("set_mempolicy failed: %v", ret)
}
return &PreferredNode{nodeID: node}, nil
}
func revertPreferredNodeImpl(p *PreferredNode) error {
// Reset to default policy
if ret := C.set_mempolicy(MPOL_DEFAULT, nil, 0); ret < 0 {
return fmt.Errorf("set_mempolicy failed: %v", ret)
}
return nil
}
func setMemoryPolicyImpl(policy MemoryPolicy, nodes []NodeID) error {
var mode int
switch policy {
case MPDefault:
mode = MPOL_DEFAULT
case MPBind:
mode = MPOL_BIND
case MPPreferred:
mode = MPOL_PREFERRED
case MPInterleave:
mode = MPOL_INTERLEAVE
default:
return fmt.Errorf("unknown memory policy: %d", policy)
}
// Build nodemask
var mask C.ulong
for _, node := range nodes {
mask |= 1 << C.ulong(node)
}
maxNode := C.ulong(2)
for _, node := range nodes {
if C.ulong(node) >= maxNode {
maxNode = C.ulong(node) + 1
}
}
if ret := C.set_mempolicy(C.int(mode), &mask, maxNode); ret < 0 {
return fmt.Errorf("set_mempolicy failed: %v", ret)
}
return nil
}
func allocateOnNodeImpl(size int, node NodeID) ([]byte, error) {
// Use mmap with MAP_PRIVATE and bind to specific node
buf := make([]byte, size)
// Set the memory policy for the allocated region
var mask C.ulong = 1 << C.ulong(node)
ptr := unsafe.Pointer(&buf[0])
if ret := C.mbind(ptr, C.ulong(size), MPOL_BIND, &mask, C.ulong(node)+1, MPOL_MF_STRICT); ret < 0 {
// Fall back to regular allocation
return buf, nil
}
return buf, nil
}
func scheduleOnNodeImpl(cpu int, fn func()) error {
// Simplified implementation - just run the function
// CPU affinity setting requires CGO or unix package
runtime.LockOSThread()
defer runtime.UnlockOSThread()
fn()
return nil
}
func getPreferredNodeForCurrentThreadImpl() NodeID {
var mode C.int
var node C.int
if ret := C.get_mempolicy(&mode, nil, 0, unsafe.Pointer(&node), MPOL_F_NODE); ret < 0 {
return NodeID(0)
}
if mode == MPOL_DEFAULT {
// Get current CPU's node
currentNode, _ := getCurrentNodeImpl()
return currentNode
}
return NodeID(node)
}
// PinThreadToNode pins the current goroutine's OS thread to a specific NUMA node
func PinThreadToNode(node NodeID) error {
initNuma()
if numaInitErr != nil {
return numaInitErr
}
topology := GetTopology()
nodeInfo, ok := topology.GetNode(node)
if !ok {
return fmt.Errorf("NUMA node %d not found", node)
}
if len(nodeInfo.CPUs) == 0 {
return fmt.Errorf("NUMA node %d has no CPUs", node)
}
runtime.LockOSThread()
// Note: CPU affinity setting is simplified for portability
// Full implementation would use sched_setaffinity syscall
return nil
}
// UnpinThread releases the current goroutine's OS thread from NUMA binding
func UnpinThread() {
runtime.UnlockOSThread()
}
// RunOnNode runs a function with the current goroutine pinned to a specific NUMA node
func RunOnNode(node NodeID, fn func()) error {
if err := PinThreadToNode(node); err != nil {
return err
}
defer UnpinThread()
fn()
return nil
}

View File

@@ -0,0 +1,415 @@
//go:build linux && !cgo
// +build linux,!cgo
/*
Copyright 2024 The GoStor Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package numa
import (
"bufio"
"fmt"
"os"
"runtime"
"strconv"
"strings"
"syscall"
"unsafe"
)
// Syscall numbers for x86_64 Linux
const (
SYS_GETCPU = 309
SYS_SET_MEMPOLICY = 238
SYS_GET_MEMPOLICY = 239
SYS_MBIND = 237
SYS_MIGRATE_PAGES = 238
)
const (
// NUMA memory policies
MPOL_DEFAULT = 0
MPOL_PREFERRED = 1
MPOL_BIND = 2
MPOL_INTERLEAVE = 3
MPOL_LOCAL = 4
// Flags for get_mempolicy
MPOL_F_NODE = 1 << 0
MPOL_F_ADDR = 1 << 1
// Flags for mbind
MPOL_MF_STRICT = 1 << 0
)
//go:noescape
//go:linkname runtime_GetCPU runtime.getcpu
func runtime_GetCPU() uint32
func detectLinuxTopology(topology *Topology) error {
nodes, err := detectNodesFromSys()
if err != nil {
return err
}
topology.NumNodes = len(nodes)
for _, nodeID := range nodes {
nodeInfo := &NodeInfo{
ID: NodeID(nodeID),
}
// Get CPUs for this node
cpus, err := getCPUsForNodeNoCGO(nodeID)
if err == nil {
nodeInfo.CPUs = cpus
for _, cpu := range cpus {
topology.CPUToNodeMap[cpu] = NodeID(nodeID)
}
}
// Get memory info for this node
memInfo, err := getMemoryInfoForNodeNoCGO(nodeID)
if err == nil {
nodeInfo.TotalMemory = memInfo.total
nodeInfo.FreeMemory = memInfo.free
}
// Get distance matrix
distances, err := getDistancesForNodeNoCGO(nodeID, len(nodes))
if err == nil {
nodeInfo.DistanceToNode = distances
}
topology.Nodes[NodeID(nodeID)] = nodeInfo
}
return nil
}
func detectNodesFromSys() ([]int, error) {
entries, err := os.ReadDir("/sys/devices/system/node")
if err != nil {
return nil, err
}
var nodes []int
for _, entry := range entries {
if entry.IsDir() && strings.HasPrefix(entry.Name(), "node") {
nodeID, err := strconv.Atoi(entry.Name()[4:])
if err == nil {
nodes = append(nodes, nodeID)
}
}
}
if len(nodes) == 0 {
return nil, fmt.Errorf("no NUMA nodes found")
}
return nodes, nil
}
type memoryInfo struct {
total uint64
free uint64
}
func getMemoryInfoForNodeNoCGO(nodeID int) (*memoryInfo, error) {
file, err := os.Open(fmt.Sprintf("/sys/devices/system/node/node%d/meminfo", nodeID))
if err != nil {
return nil, err
}
defer file.Close()
info := &memoryInfo{}
scanner := bufio.NewScanner(file)
for scanner.Scan() {
line := scanner.Text()
if strings.Contains(line, "MemTotal:") {
fields := strings.Fields(line)
if len(fields) >= 2 {
val, _ := strconv.ParseUint(fields[1], 10, 64)
info.total = val * 1024
}
} else if strings.Contains(line, "MemFree:") {
fields := strings.Fields(line)
if len(fields) >= 2 {
val, _ := strconv.ParseUint(fields[1], 10, 64)
info.free = val * 1024
}
}
}
return info, scanner.Err()
}
func getCPUsForNodeNoCGO(nodeID int) ([]int, error) {
data, err := os.ReadFile(fmt.Sprintf("/sys/devices/system/node/node%d/cpulist", nodeID))
if err != nil {
return nil, err
}
return parseCPUListNoCGO(strings.TrimSpace(string(data)))
}
func parseCPUListNoCGO(list string) ([]int, error) {
var cpus []int
if list == "" {
return cpus, nil
}
parts := strings.Split(list, ",")
for _, part := range parts {
if strings.Contains(part, "-") {
rangeParts := strings.Split(part, "-")
if len(rangeParts) == 2 {
start, _ := strconv.Atoi(rangeParts[0])
end, _ := strconv.Atoi(rangeParts[1])
for i := start; i <= end; i++ {
cpus = append(cpus, i)
}
}
} else {
cpu, _ := strconv.Atoi(part)
cpus = append(cpus, cpu)
}
}
return cpus, nil
}
func getDistancesForNodeNoCGO(nodeID int, numNodes int) ([]uint32, error) {
data, err := os.ReadFile(fmt.Sprintf("/sys/devices/system/node/node%d/distance", nodeID))
if err != nil {
return nil, err
}
fields := strings.Fields(string(data))
distances := make([]uint32, len(fields))
for i, field := range fields {
val, _ := strconv.ParseUint(field, 10, 32)
distances[i] = uint32(val)
}
return distances, nil
}
func getCurrentNodeImpl() (NodeID, error) {
var cpu, node uint32
// Use getcpu syscall
r1, _, errno := syscall.Syscall(SYS_GETCPU,
uintptr(unsafe.Pointer(&cpu)),
uintptr(unsafe.Pointer(&node)),
0)
if errno != 0 {
// Fallback: try to determine from CPU
return getNodeFromSchedGetCPU()
}
_ = r1 // suppress unused warning
return NodeID(node), nil
}
func getNodeFromSchedGetCPU() (NodeID, error) {
// Get current CPU
cpu := runtime.GOMAXPROCS(0)
// Look up in topology
topology := GetTopology()
node, ok := topology.GetNodeForCPU(cpu)
if !ok {
return 0, fmt.Errorf("cannot determine NUMA node for CPU %d", cpu)
}
return node, nil
}
func setPreferredNodeImpl(node NodeID) (*PreferredNode, error) {
mask := uint64(1) << uint64(node)
maxNode := uint64(node) + 1
_, _, errno := syscall.Syscall6(SYS_SET_MEMPOLICY,
uintptr(MPOL_PREFERRED),
uintptr(unsafe.Pointer(&mask)),
uintptr(maxNode),
0, 0, 0)
if errno != 0 {
return nil, fmt.Errorf("set_mempolicy failed: %v", errno)
}
return &PreferredNode{nodeID: node}, nil
}
func revertPreferredNodeImpl(p *PreferredNode) error {
_, _, errno := syscall.Syscall(SYS_SET_MEMPOLICY,
uintptr(MPOL_DEFAULT),
0, 0)
if errno != 0 {
return fmt.Errorf("set_mempolicy failed: %v", errno)
}
return nil
}
func setMemoryPolicyImpl(policy MemoryPolicy, nodes []NodeID) error {
var mode int
switch policy {
case MPDefault:
mode = MPOL_DEFAULT
case MPBind:
mode = MPOL_BIND
case MPPreferred:
mode = MPOL_PREFERRED
case MPInterleave:
mode = MPOL_INTERLEAVE
default:
return fmt.Errorf("unknown memory policy: %d", policy)
}
var mask uint64
for _, node := range nodes {
mask |= 1 << uint64(node)
}
maxNode := uint64(0)
for _, node := range nodes {
if uint64(node) >= maxNode {
maxNode = uint64(node) + 1
}
}
_, _, errno := syscall.Syscall6(SYS_SET_MEMPOLICY,
uintptr(mode),
uintptr(unsafe.Pointer(&mask)),
uintptr(maxNode),
0, 0, 0)
if errno != 0 {
return fmt.Errorf("set_mempolicy failed: %v", errno)
}
return nil
}
func allocateOnNodeImpl(size int, node NodeID) ([]byte, error) {
buf := make([]byte, size)
// Try to use mbind to bind memory to node
mask := uint64(1) << uint64(node)
maxNode := uint64(node) + 1
_, _, errno := syscall.Syscall6(SYS_MBIND,
uintptr(unsafe.Pointer(&buf[0])),
uintptr(size),
uintptr(MPOL_BIND),
uintptr(unsafe.Pointer(&mask)),
uintptr(maxNode),
uintptr(MPOL_MF_STRICT))
if errno != 0 {
// Fall back to regular allocation
return buf, nil
}
return buf, nil
}
func scheduleOnNodeImpl(cpu int, fn func()) error {
var mask syscall.CPUSet
mask.Set(cpu)
runtime.LockOSThread()
defer runtime.UnlockOSThread()
if err := syscall.SchedSetaffinity(0, &mask); err != nil {
return fmt.Errorf("sched_setaffinity failed: %v", err)
}
fn()
return nil
}
func getPreferredNodeForCurrentThreadImpl() NodeID {
var mode int
var node uint32
_, _, errno := syscall.Syscall6(SYS_GET_MEMPOLICY,
uintptr(unsafe.Pointer(&mode)),
0, 0,
uintptr(unsafe.Pointer(&node)),
uintptr(MPOL_F_NODE),
0)
if errno != 0 {
node, _ := getCurrentNodeImpl()
return node
}
if mode == MPOL_DEFAULT {
node, _ := getCurrentNodeImpl()
return node
}
return NodeID(node)
}
// PinThreadToNode pins the current goroutine's OS thread to a specific NUMA node
func PinThreadToNode(node NodeID) error {
topology := GetTopology()
nodeInfo, ok := topology.GetNode(node)
if !ok {
return fmt.Errorf("NUMA node %d not found", node)
}
if len(nodeInfo.CPUs) == 0 {
return fmt.Errorf("NUMA node %d has no CPUs", node)
}
runtime.LockOSThread()
var mask syscall.CPUSet
for _, cpu := range nodeInfo.CPUs {
mask.Set(cpu)
}
if err := syscall.SchedSetaffinity(0, &mask); err != nil {
runtime.UnlockOSThread()
return fmt.Errorf("sched_setaffinity failed: %v", err)
}
return nil
}
// UnpinThread releases the current goroutine's OS thread from NUMA binding
func UnpinThread() {
runtime.UnlockOSThread()
}
// RunOnNode runs a function with the current goroutine pinned to a specific NUMA node
func RunOnNode(node NodeID, fn func()) error {
if err := PinThreadToNode(node); err != nil {
return err
}
defer UnpinThread()
fn()
return nil
}

View File

@@ -0,0 +1,94 @@
//go:build !linux
// +build !linux
/*
Copyright 2024 The GoStor Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package numa
import (
"fmt"
"runtime"
)
func detectLinuxTopology(topology *Topology) error {
return fmt.Errorf("NUMA not supported on this platform")
}
func getCurrentNodeImpl() (NodeID, error) {
return 0, fmt.Errorf("NUMA not supported on this platform")
}
func setPreferredNodeImpl(node NodeID) (*PreferredNode, error) {
return nil, fmt.Errorf("NUMA not supported on this platform")
}
func revertPreferredNodeImpl(p *PreferredNode) error {
return fmt.Errorf("NUMA not supported on this platform")
}
func setMemoryPolicyImpl(policy MemoryPolicy, nodes []NodeID) error {
return fmt.Errorf("NUMA not supported on this platform")
}
func allocateOnNodeImpl(size int, node NodeID) ([]byte, error) {
return make([]byte, size), nil
}
func scheduleOnNodeImpl(cpu int, fn func()) error {
fn()
return nil
}
func getPreferredNodeForCurrentThreadImpl() NodeID {
return 0
}
// PinThreadToNode pins the current goroutine's OS thread to a specific NUMA node
// Stub implementation - does nothing on non-Linux platforms
func PinThreadToNode(node NodeID) error {
return nil
}
// UnpinThread releases the current goroutine's OS thread from NUMA binding
// Stub implementation - does nothing on non-Linux platforms
func UnpinThread() {}
// RunOnNode runs a function with the current goroutine pinned to a specific NUMA node
// Stub implementation - just runs the function on non-Linux platforms
func RunOnNode(node NodeID, fn func()) error {
fn()
return nil
}
// createSingleNodeTopology creates a single-node topology for non-NUMA systems
func createSingleNodeTopology(topology *Topology) {
numCPU := runtime.NumCPU()
cpus := make([]int, numCPU)
for i := 0; i < numCPU; i++ {
cpus[i] = i
topology.CPUToNodeMap[i] = 0
}
topology.NumNodes = 1
topology.Nodes[0] = &NodeInfo{
ID: 0,
CPUs: cpus,
TotalMemory: 0,
FreeMemory: 0,
DistanceToNode: []uint32{10},
}
}

105
pkg/util/numa/numa_test.go Normal file
View File

@@ -0,0 +1,105 @@
/*
Copyright 2024 The GoStor Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package numa
import (
"testing"
)
func TestTopologyDetection(t *testing.T) {
topology := GetTopology()
if topology == nil {
t.Fatal("GetTopology returned nil")
}
if topology.NumNodes < 1 {
t.Errorf("Expected at least 1 NUMA node, got %d", topology.NumNodes)
}
if len(topology.Nodes) == 0 {
t.Error("No NUMA nodes found in topology")
}
}
func TestBufferPool(t *testing.T) {
pool := NewNUMABufferPool(&BufferPoolConfig{
BufferSize: 4096,
PerNodePoolSize: 10,
EnableNUMA: false, // Disable NUMA for test
})
if pool == nil {
t.Fatal("NewNUMABufferPool returned nil")
}
// Test Get/Put
buf := pool.Get()
if len(buf) != 4096 {
t.Errorf("Expected buffer size 4096, got %d", len(buf))
}
pool.Put(buf)
// Test stats
stats := pool.Stats()
if stats.Gets == 0 {
t.Error("Expected Gets > 0")
}
if stats.Puts == 0 {
t.Error("Expected Puts > 0")
}
}
func TestBufferPoolMultipleSizes(t *testing.T) {
pool := NewNUMABufferPool(&BufferPoolConfig{
BufferSize: 8192,
PerNodePoolSize: 5,
EnableNUMA: false,
})
// Get multiple buffers
var buffers [][]byte
for i := 0; i < 10; i++ {
buf := pool.Get()
buffers = append(buffers, buf)
}
// Put all back
for _, buf := range buffers {
pool.Put(buf)
}
stats := pool.Stats()
if stats.Gets != 10 {
t.Errorf("Expected 10 gets, got %d", stats.Gets)
}
if stats.Puts != 10 {
t.Errorf("Expected 10 puts, got %d", stats.Puts)
}
}
func TestAvailable(t *testing.T) {
// Just verify the function doesn't panic
_ = Available()
}
func TestNumNodes(t *testing.T) {
n := NumNodes()
if n < 1 {
t.Errorf("Expected at least 1 node, got %d", n)
}
}

424
pkg/util/numa/pool.go Normal file
View File

@@ -0,0 +1,424 @@
/*
Copyright 2024 The GoStor Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package numa
import (
"context"
"sync"
"sync/atomic"
)
// BufferPoolConfig configures NUMA-aware buffer pools
type BufferPoolConfig struct {
// BufferSize is the size of each buffer
BufferSize int
// PerNodePoolSize is the number of buffers to preallocate per node
PerNodePoolSize int
// EnableNUMA enables NUMA-aware allocation
EnableNUMA bool
}
// DefaultBufferPoolConfig returns a default configuration
func DefaultBufferPoolConfig() *BufferPoolConfig {
return &BufferPoolConfig{
BufferSize: 256 * 1024, // 256KB buffers for I/O
PerNodePoolSize: 1024, // 1024 buffers per node
EnableNUMA: true,
}
}
// NUMABufferPool provides NUMA-aware buffer pooling
type NUMABufferPool struct {
config *BufferPoolConfig
topology *Topology
nodePools map[NodeID]*sync.Pool
stats *PoolStats
// Fallback pool for when NUMA is not available or disabled
fallbackPool *sync.Pool
mu sync.RWMutex
}
// PoolStats tracks buffer pool statistics
type PoolStats struct {
Gets uint64
Puts uint64
Misses uint64
NodeLocalHit uint64
NUMAHit uint64
}
// NewNUMABufferPool creates a new NUMA-aware buffer pool
func NewNUMABufferPool(config *BufferPoolConfig) *NUMABufferPool {
if config == nil {
config = DefaultBufferPoolConfig()
}
pool := &NUMABufferPool{
config: config,
topology: GetTopology(),
nodePools: make(map[NodeID]*sync.Pool),
stats: &PoolStats{},
}
// Initialize fallback pool
pool.fallbackPool = &sync.Pool{
New: func() interface{} {
atomic.AddUint64(&pool.stats.Misses, 1)
return make([]byte, config.BufferSize)
},
}
// Initialize NUMA pools if enabled and available
if config.EnableNUMA && Available() && pool.topology.NumNodes > 1 {
for nodeID := range pool.topology.Nodes {
pool.nodePools[nodeID] = pool.createNodePool(nodeID)
}
}
return pool
}
// createNodePool creates a buffer pool for a specific NUMA node
func (p *NUMABufferPool) createNodePool(node NodeID) *sync.Pool {
return &sync.Pool{
New: func() interface{} {
atomic.AddUint64(&p.stats.Misses, 1)
// Try NUMA-local allocation first
if p.config.EnableNUMA && Available() {
buf, err := AllocateOnNode(p.config.BufferSize, node)
if err == nil {
atomic.AddUint64(&p.stats.NUMAHit, 1)
return buf
}
}
// Fall back to regular allocation
return make([]byte, p.config.BufferSize)
},
}
}
// Get returns a buffer from the pool, preferably from the local NUMA node
func (p *NUMABufferPool) Get() []byte {
atomic.AddUint64(&p.stats.Gets, 1)
// Try to get from the local NUMA node first
if p.config.EnableNUMA && Available() && len(p.nodePools) > 0 {
if node, err := GetCurrentNode(); err == nil {
if nodePool, ok := p.nodePools[node]; ok {
buf := nodePool.Get().([]byte)
atomic.AddUint64(&p.stats.NodeLocalHit, 1)
return buf[:p.config.BufferSize]
}
}
}
// Fall back to the fallback pool
return p.fallbackPool.Get().([]byte)[:p.config.BufferSize]
}
// Put returns a buffer to the pool, preferably to its local NUMA node
func (p *NUMABufferPool) Put(buf []byte) {
if buf == nil {
return
}
atomic.AddUint64(&p.stats.Puts, 1)
// Resize buffer to full size before returning to pool
if cap(buf) < p.config.BufferSize {
// Buffer is too small, discard it
return
}
buf = buf[:p.config.BufferSize]
// Try to return to the local NUMA node pool
if p.config.EnableNUMA && Available() && len(p.nodePools) > 0 {
if node, err := GetCurrentNode(); err == nil {
if nodePool, ok := p.nodePools[node]; ok {
nodePool.Put(buf)
return
}
}
}
// Fall back to the fallback pool
p.fallbackPool.Put(buf)
}
// GetForNode returns a buffer from a specific NUMA node's pool
func (p *NUMABufferPool) GetForNode(node NodeID) []byte {
atomic.AddUint64(&p.stats.Gets, 1)
if nodePool, ok := p.nodePools[node]; ok {
return nodePool.Get().([]byte)[:p.config.BufferSize]
}
return p.fallbackPool.Get().([]byte)[:p.config.BufferSize]
}
// PutForNode returns a buffer to a specific NUMA node's pool
func (p *NUMABufferPool) PutForNode(node NodeID, buf []byte) {
if buf == nil {
return
}
atomic.AddUint64(&p.stats.Puts, 1)
if cap(buf) < p.config.BufferSize {
return
}
buf = buf[:p.config.BufferSize]
if nodePool, ok := p.nodePools[node]; ok {
nodePool.Put(buf)
return
}
p.fallbackPool.Put(buf)
}
// Stats returns current pool statistics
func (p *NUMABufferPool) Stats() PoolStats {
return PoolStats{
Gets: atomic.LoadUint64(&p.stats.Gets),
Puts: atomic.LoadUint64(&p.stats.Puts),
Misses: atomic.LoadUint64(&p.stats.Misses),
NodeLocalHit: atomic.LoadUint64(&p.stats.NodeLocalHit),
NUMAHit: atomic.LoadUint64(&p.stats.NUMAHit),
}
}
// GetConfig returns the pool configuration
func (p *NUMABufferPool) GetConfig() *BufferPoolConfig {
return p.config
}
// Close releases all resources associated with the pool
func (p *NUMABufferPool) Close() error {
p.mu.Lock()
defer p.mu.Unlock()
// Clear all pools
p.nodePools = make(map[NodeID]*sync.Pool)
p.fallbackPool = nil
return nil
}
// SizeAwarePool is a buffer pool that can handle multiple buffer sizes
type SizeAwarePool struct {
pools map[int]*NUMABufferPool
mu sync.RWMutex
}
// NewSizeAwarePool creates a new size-aware buffer pool
func NewSizeAwarePool(sizes []int, enableNUMA bool) *SizeAwarePool {
sap := &SizeAwarePool{
pools: make(map[int]*NUMABufferPool),
}
for _, size := range sizes {
sap.pools[size] = NewNUMABufferPool(&BufferPoolConfig{
BufferSize: size,
PerNodePoolSize: 1024,
EnableNUMA: enableNUMA,
})
}
return sap
}
// Get returns a buffer of the specified size
func (sap *SizeAwarePool) Get(size int) []byte {
sap.mu.RLock()
pool, ok := sap.pools[size]
sap.mu.RUnlock()
if ok {
return pool.Get()
}
// No pool for this size, allocate directly
return make([]byte, size)
}
// Put returns a buffer to the appropriate pool
func (sap *SizeAwarePool) Put(buf []byte) {
if buf == nil {
return
}
size := cap(buf)
sap.mu.RLock()
pool, ok := sap.pools[size]
sap.mu.RUnlock()
if ok {
pool.Put(buf)
}
// If no pool for this size, let GC handle it
}
// PinningAllocator allocates buffers while the goroutine is pinned to a NUMA node
type PinningAllocator struct {
pool *NUMABufferPool
}
// NewPinningAllocator creates a new pinning allocator
func NewPinningAllocator(pool *NUMABufferPool) *PinningAllocator {
return &PinningAllocator{pool: pool}
}
// Allocate allocates a buffer while pinned to the current NUMA node
func (pa *PinningAllocator) Allocate() []byte {
return pa.pool.Get()
}
// AllocateOnNode allocates a buffer while pinned to a specific NUMA node
func (pa *PinningAllocator) AllocateOnNode(node NodeID) ([]byte, error) {
var buf []byte
err := RunOnNode(node, func() {
buf = pa.pool.GetForNode(node)
})
return buf, err
}
// Global pools for common buffer sizes
var (
globalPools map[int]*NUMABufferPool
globalPoolsOnce sync.Once
globalPoolsMu sync.RWMutex
)
// InitGlobalPools initializes global buffer pools
func InitGlobalPools(sizes []int, enableNUMA bool) {
globalPoolsOnce.Do(func() {
globalPools = make(map[int]*NUMABufferPool)
for _, size := range sizes {
globalPools[size] = NewNUMABufferPool(&BufferPoolConfig{
BufferSize: size,
PerNodePoolSize: 1024,
EnableNUMA: enableNUMA,
})
}
})
}
// GetBuffer gets a buffer from the global pool
func GetBuffer(size int) []byte {
globalPoolsMu.RLock()
pool, ok := globalPools[size]
globalPoolsMu.RUnlock()
if ok {
return pool.Get()
}
return make([]byte, size)
}
// PutBuffer returns a buffer to the global pool
func PutBuffer(buf []byte) {
if buf == nil {
return
}
size := cap(buf)
globalPoolsMu.RLock()
pool, ok := globalPools[size]
globalPoolsMu.RUnlock()
if ok {
pool.Put(buf)
}
}
// WorkerPool is a pool of workers that are pinned to specific NUMA nodes
type WorkerPool struct {
size int
numaNode NodeID
workQueue chan func()
ctx context.Context
cancel context.CancelFunc
wg sync.WaitGroup
}
// NewWorkerPool creates a new NUMA-aware worker pool
func NewWorkerPool(size int, node NodeID) *WorkerPool {
ctx, cancel := context.WithCancel(context.Background())
wp := &WorkerPool{
size: size,
numaNode: node,
workQueue: make(chan func(), size*2),
ctx: ctx,
cancel: cancel,
}
// Start workers
for i := 0; i < size; i++ {
wp.wg.Add(1)
go wp.worker()
}
return wp
}
func (wp *WorkerPool) worker() {
defer wp.wg.Done()
// Pin to NUMA node
if Available() {
PinThreadToNode(wp.numaNode)
defer UnpinThread()
}
for {
select {
case work := <-wp.workQueue:
if work != nil {
work()
}
case <-wp.ctx.Done():
return
}
}
}
// Submit submits work to the worker pool
func (wp *WorkerPool) Submit(work func()) bool {
select {
case wp.workQueue <- work:
return true
case <-wp.ctx.Done():
return false
default:
return false
}
}
// Stop stops the worker pool
func (wp *WorkerPool) Stop() {
wp.cancel()
wp.wg.Wait()
close(wp.workQueue)
}

View File

@@ -73,32 +73,38 @@ func MarshalKVText(kv []KeyValue) []byte {
return data
}
// MarshalUint16 returns big-endian encoding of i as a new 2-byte slice.
// Deprecated: Use MarshalUint16To or binary.BigEndian.PutUint16 for zero-allocation.
func MarshalUint16(i uint16) []byte {
var data []byte
for j := 8; j >= 0; j -= 8 {
b := byte(i >> uint16(j))
data = append(data, b)
}
return data
var data [2]byte
binary.BigEndian.PutUint16(data[:], i)
return data[:]
}
// MarshalUint32 returns big-endian encoding of i as a new 4-byte slice.
// Deprecated: Use MarshalUint32To or binary.BigEndian.PutUint32 for zero-allocation.
func MarshalUint32(i uint32) []byte {
var data []byte
for j := 24; j >= 0; j -= 8 {
b := byte(i >> uint32(j))
data = append(data, b)
}
return data
var data [4]byte
binary.BigEndian.PutUint32(data[:], i)
return data[:]
}
// MarshalUint32To writes big-endian encoding of i into buf, which must be at least 4 bytes.
// This is a zero-allocation alternative to MarshalUint32.
func MarshalUint32To(buf []byte, i uint32) {
binary.BigEndian.PutUint32(buf, i)
}
func MarshalUint64(v uint64) []byte {
var data = [8]byte{}
var i = 0
for j := 56; j >= 0; j -= 8 {
data[i] = byte(v >> uint32(j))
i++
}
return data[0:8]
var data [8]byte
binary.BigEndian.PutUint64(data[:], v)
return data[:]
}
// MarshalUint64To writes big-endian encoding of v into buf, which must be at least 8 bytes.
// This is a zero-allocation alternative for partial writes.
func MarshalUint64To(buf []byte, v uint64) {
binary.BigEndian.PutUint64(buf, v)
}
func StringToByte(str string, align int, maxlength int) []byte {