optimize the perf and support more features
This commit is contained in:
255
pkg/util/numa/numa.go
Normal file
255
pkg/util/numa/numa.go
Normal file
@@ -0,0 +1,255 @@
|
||||
/*
|
||||
Copyright 2024 The GoStor Authors All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
// Package numa provides NUMA-aware utilities for multi-socket systems.
|
||||
// This package enables memory allocation optimization and thread binding
|
||||
// for better performance on NUMA architectures.
|
||||
package numa
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"runtime"
|
||||
"sync"
|
||||
)
|
||||
|
||||
// NodeID represents a NUMA node identifier
|
||||
type NodeID int
|
||||
|
||||
// NodeInfo contains information about a NUMA node
|
||||
type NodeInfo struct {
|
||||
ID NodeID
|
||||
CPUs []int // CPU cores on this node
|
||||
TotalMemory uint64 // Total memory in bytes
|
||||
FreeMemory uint64 // Free memory in bytes
|
||||
DistanceToNode []uint32 // Distance to other nodes (lower is closer)
|
||||
}
|
||||
|
||||
// Topology represents the NUMA topology of the system
|
||||
type Topology struct {
|
||||
Nodes map[NodeID]*NodeInfo
|
||||
NumNodes int
|
||||
CPUToNodeMap map[int]NodeID
|
||||
mu sync.RWMutex
|
||||
}
|
||||
|
||||
var (
|
||||
globalTopology *Topology
|
||||
globalTopologyOnce sync.Once
|
||||
numaAvailable bool
|
||||
)
|
||||
|
||||
// Available returns true if NUMA support is available on this system
|
||||
func Available() bool {
|
||||
return numaAvailable
|
||||
}
|
||||
|
||||
// GetTopology returns the NUMA topology of the system
|
||||
func GetTopology() *Topology {
|
||||
globalTopologyOnce.Do(func() {
|
||||
globalTopology = detectTopology()
|
||||
})
|
||||
return globalTopology
|
||||
}
|
||||
|
||||
// detectTopology detects the NUMA topology of the system
|
||||
// This is a placeholder that will be implemented per-platform
|
||||
func detectTopology() *Topology {
|
||||
topology := &Topology{
|
||||
Nodes: make(map[NodeID]*NodeInfo),
|
||||
CPUToNodeMap: make(map[int]NodeID),
|
||||
}
|
||||
|
||||
// Try to detect using platform-specific methods
|
||||
if err := detectLinuxTopology(topology); err != nil {
|
||||
// Fall back to single-node topology
|
||||
topology.NumNodes = 1
|
||||
topology.Nodes[0] = &NodeInfo{
|
||||
ID: 0,
|
||||
CPUs: makeRange(0, runtime.NumCPU()),
|
||||
TotalMemory: 0, // Unknown
|
||||
FreeMemory: 0, // Unknown
|
||||
}
|
||||
for i := 0; i < runtime.NumCPU(); i++ {
|
||||
topology.CPUToNodeMap[i] = 0
|
||||
}
|
||||
numaAvailable = false
|
||||
} else {
|
||||
numaAvailable = topology.NumNodes > 1
|
||||
}
|
||||
|
||||
return topology
|
||||
}
|
||||
|
||||
// makeRange creates a slice of integers from start to end
|
||||
func makeRange(start, end int) []int {
|
||||
result := make([]int, end-start)
|
||||
for i := range result {
|
||||
result[i] = start + i
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
// GetNodeForCPU returns the NUMA node ID for a given CPU
|
||||
func (t *Topology) GetNodeForCPU(cpu int) (NodeID, bool) {
|
||||
t.mu.RLock()
|
||||
defer t.mu.RUnlock()
|
||||
node, ok := t.CPUToNodeMap[cpu]
|
||||
return node, ok
|
||||
}
|
||||
|
||||
// GetNode returns information about a specific NUMA node
|
||||
func (t *Topology) GetNode(id NodeID) (*NodeInfo, bool) {
|
||||
t.mu.RLock()
|
||||
defer t.mu.RUnlock()
|
||||
node, ok := t.Nodes[id]
|
||||
return node, ok
|
||||
}
|
||||
|
||||
// GetCurrentNode returns the NUMA node of the current thread
|
||||
func GetCurrentNode() (NodeID, error) {
|
||||
return getCurrentNodeImpl()
|
||||
}
|
||||
|
||||
// PreferredNode represents a preferred NUMA node for memory allocation
|
||||
type PreferredNode struct {
|
||||
nodeID NodeID
|
||||
}
|
||||
|
||||
// SetPreferredNode sets the preferred NUMA node for the current thread
|
||||
func SetPreferredNode(node NodeID) (*PreferredNode, error) {
|
||||
return setPreferredNodeImpl(node)
|
||||
}
|
||||
|
||||
// Revert restores the previous NUMA policy
|
||||
func (p *PreferredNode) Revert() error {
|
||||
return revertPreferredNodeImpl(p)
|
||||
}
|
||||
|
||||
// MemoryPolicy represents memory allocation policies
|
||||
type MemoryPolicy int
|
||||
|
||||
const (
|
||||
// MPDefault uses the default memory policy
|
||||
MPDefault MemoryPolicy = iota
|
||||
// MPBind binds memory allocation to specific nodes
|
||||
MPBind
|
||||
// MPPreferred prefers memory allocation from specific nodes
|
||||
MPPreferred
|
||||
// MPInterleave interleaves memory across nodes
|
||||
MPInterleave
|
||||
)
|
||||
|
||||
// SetMemoryPolicy sets the memory policy for the current thread
|
||||
func SetMemoryPolicy(policy MemoryPolicy, nodes []NodeID) error {
|
||||
return setMemoryPolicyImpl(policy, nodes)
|
||||
}
|
||||
|
||||
// AllocateOnNode allocates memory on a specific NUMA node
|
||||
func AllocateOnNode(size int, node NodeID) ([]byte, error) {
|
||||
return allocateOnNodeImpl(size, node)
|
||||
}
|
||||
|
||||
// LocalAlloc allocates memory on the local NUMA node
|
||||
func LocalAlloc(size int) ([]byte, error) {
|
||||
node, err := GetCurrentNode()
|
||||
if err != nil {
|
||||
// Fall back to regular allocation
|
||||
return make([]byte, size), nil
|
||||
}
|
||||
return AllocateOnNode(size, node)
|
||||
}
|
||||
|
||||
// NodeLocalPool is a memory pool that allocates from a specific NUMA node
|
||||
type NodeLocalPool struct {
|
||||
nodeID NodeID
|
||||
pool sync.Pool
|
||||
size int
|
||||
}
|
||||
|
||||
// NewNodeLocalPool creates a new NUMA-local memory pool
|
||||
func NewNodeLocalPool(size int, node NodeID) *NodeLocalPool {
|
||||
return &NodeLocalPool{
|
||||
nodeID: node,
|
||||
size: size,
|
||||
pool: sync.Pool{
|
||||
New: func() interface{} {
|
||||
buf, err := AllocateOnNode(size, node)
|
||||
if err != nil {
|
||||
// Fall back to regular allocation
|
||||
return make([]byte, size)
|
||||
}
|
||||
return buf
|
||||
},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// Get returns a buffer from the pool
|
||||
func (p *NodeLocalPool) Get() []byte {
|
||||
return p.pool.Get().([]byte)
|
||||
}
|
||||
|
||||
// Put returns a buffer to the pool
|
||||
func (p *NodeLocalPool) Put(buf []byte) {
|
||||
if buf != nil && len(buf) >= p.size {
|
||||
p.pool.Put(buf[:p.size])
|
||||
}
|
||||
}
|
||||
|
||||
// Close releases all resources associated with the pool
|
||||
func (p *NodeLocalPool) Close() error {
|
||||
// In Go, sync.Pool doesn't have a Close method
|
||||
// The memory will be garbage collected eventually
|
||||
return nil
|
||||
}
|
||||
|
||||
// NodeScheduler schedules tasks on specific NUMA nodes
|
||||
type NodeScheduler struct {
|
||||
topology *Topology
|
||||
mu sync.RWMutex
|
||||
}
|
||||
|
||||
// NewNodeScheduler creates a new NUMA-aware scheduler
|
||||
func NewNodeScheduler() *NodeScheduler {
|
||||
return &NodeScheduler{
|
||||
topology: GetTopology(),
|
||||
}
|
||||
}
|
||||
|
||||
// ScheduleOnNode schedules a function to run on a specific NUMA node
|
||||
func (s *NodeScheduler) ScheduleOnNode(node NodeID, fn func()) error {
|
||||
nodeInfo, ok := s.topology.GetNode(node)
|
||||
if !ok {
|
||||
return fmt.Errorf("NUMA node %d not found", node)
|
||||
}
|
||||
|
||||
if len(nodeInfo.CPUs) == 0 {
|
||||
return fmt.Errorf("NUMA node %d has no CPUs", node)
|
||||
}
|
||||
|
||||
return scheduleOnNodeImpl(nodeInfo.CPUs[0], fn)
|
||||
}
|
||||
|
||||
// GetPreferredNodeForCurrentThread returns the preferred NUMA node
|
||||
// based on current thread's affinity
|
||||
func GetPreferredNodeForCurrentThread() NodeID {
|
||||
return getPreferredNodeForCurrentThreadImpl()
|
||||
}
|
||||
|
||||
// NumNodes returns the number of NUMA nodes in the system
|
||||
func NumNodes() int {
|
||||
return GetTopology().NumNodes
|
||||
}
|
||||
469
pkg/util/numa/numa_linux.go
Normal file
469
pkg/util/numa/numa_linux.go
Normal file
@@ -0,0 +1,469 @@
|
||||
//go:build linux
|
||||
// +build linux
|
||||
|
||||
/*
|
||||
Copyright 2024 The GoStor Authors All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package numa
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"fmt"
|
||||
"os"
|
||||
"runtime"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"unsafe"
|
||||
)
|
||||
|
||||
// #include <stdlib.h>
|
||||
// #include <unistd.h>
|
||||
// #include <sys/syscall.h>
|
||||
// #include <linux/mempolicy.h>
|
||||
// #include <numa.h>
|
||||
// #include <numaif.h>
|
||||
//
|
||||
// #cgo LDFLAGS: -lnuma
|
||||
import "C"
|
||||
|
||||
const (
|
||||
// NUMA memory policies (from linux/mempolicy.h)
|
||||
MPOL_DEFAULT = 0
|
||||
MPOL_PREFERRED = 1
|
||||
MPOL_BIND = 2
|
||||
MPOL_INTERLEAVE = 3
|
||||
MPOL_LOCAL = 4
|
||||
MPOL_MAX = 5
|
||||
|
||||
// Flags for mbind
|
||||
MPOL_MF_STRICT = 1 << 0
|
||||
MPOL_MF_MOVE = 1 << 1
|
||||
MPOL_MF_MOVE_ALL = 1 << 2
|
||||
MPOL_MF_LAZY = 1 << 3
|
||||
MPOL_MF_INTERNAL = 1 << 4
|
||||
MPOL_MF_VALID = 1 << 5
|
||||
MPOL_MF_WAKE = 1 << 6
|
||||
MPOL_MF_REMOVE = 1 << 7
|
||||
MPOL_MF_HONOR_VMFOL = 1 << 8
|
||||
|
||||
// Flags for get_mempolicy
|
||||
MPOL_F_NODE = 1 << 0
|
||||
MPOL_F_ADDR = 1 << 1
|
||||
MPOL_F_MEMS_ALLOWED = 1 << 2
|
||||
)
|
||||
|
||||
var (
|
||||
numaInitOnce sync.Once
|
||||
numaInitErr error
|
||||
)
|
||||
|
||||
func initNuma() {
|
||||
numaInitOnce.Do(func() {
|
||||
if C.numa_available() < 0 {
|
||||
numaInitErr = fmt.Errorf("NUMA is not available")
|
||||
} else {
|
||||
// numa_init is not available in newer libnuma versions
|
||||
// The library is automatically initialized on first use
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func detectLinuxTopology(topology *Topology) error {
|
||||
initNuma()
|
||||
|
||||
// First, try to use /sys filesystem for detection
|
||||
nodes, err := detectNodesFromSys()
|
||||
if err != nil {
|
||||
// Fall back to libnuma
|
||||
return detectFromLibNuma(topology)
|
||||
}
|
||||
|
||||
topology.NumNodes = len(nodes)
|
||||
|
||||
for _, nodeID := range nodes {
|
||||
nodeInfo := &NodeInfo{
|
||||
ID: NodeID(nodeID),
|
||||
}
|
||||
|
||||
// Get CPUs for this node
|
||||
cpus, err := getCPUsForNode(nodeID)
|
||||
if err == nil {
|
||||
nodeInfo.CPUs = cpus
|
||||
for _, cpu := range cpus {
|
||||
topology.CPUToNodeMap[cpu] = NodeID(nodeID)
|
||||
}
|
||||
}
|
||||
|
||||
// Get memory info for this node
|
||||
memInfo, err := getMemoryInfoForNode(nodeID)
|
||||
if err == nil {
|
||||
nodeInfo.TotalMemory = memInfo.total
|
||||
nodeInfo.FreeMemory = memInfo.free
|
||||
}
|
||||
|
||||
// Get distance matrix
|
||||
distances, err := getDistancesForNode(nodeID, len(nodes))
|
||||
if err == nil {
|
||||
nodeInfo.DistanceToNode = distances
|
||||
}
|
||||
|
||||
topology.Nodes[NodeID(nodeID)] = nodeInfo
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func detectNodesFromSys() ([]int, error) {
|
||||
entries, err := os.ReadDir("/sys/devices/system/node")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var nodes []int
|
||||
for _, entry := range entries {
|
||||
if entry.IsDir() && strings.HasPrefix(entry.Name(), "node") {
|
||||
nodeID, err := strconv.Atoi(entry.Name()[4:])
|
||||
if err == nil {
|
||||
nodes = append(nodes, nodeID)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if len(nodes) == 0 {
|
||||
return nil, fmt.Errorf("no NUMA nodes found")
|
||||
}
|
||||
|
||||
return nodes, nil
|
||||
}
|
||||
|
||||
type memoryInfo struct {
|
||||
total uint64
|
||||
free uint64
|
||||
}
|
||||
|
||||
func getMemoryInfoForNode(nodeID int) (*memoryInfo, error) {
|
||||
file, err := os.Open(fmt.Sprintf("/sys/devices/system/node/node%d/meminfo", nodeID))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
info := &memoryInfo{}
|
||||
scanner := bufio.NewScanner(file)
|
||||
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
if strings.Contains(line, "MemTotal:") {
|
||||
fields := strings.Fields(line)
|
||||
if len(fields) >= 2 {
|
||||
val, _ := strconv.ParseUint(fields[1], 10, 64)
|
||||
info.total = val * 1024 // Convert from KB to bytes
|
||||
}
|
||||
} else if strings.Contains(line, "MemFree:") {
|
||||
fields := strings.Fields(line)
|
||||
if len(fields) >= 2 {
|
||||
val, _ := strconv.ParseUint(fields[1], 10, 64)
|
||||
info.free = val * 1024 // Convert from KB to bytes
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return info, scanner.Err()
|
||||
}
|
||||
|
||||
func getCPUsForNode(nodeID int) ([]int, error) {
|
||||
data, err := os.ReadFile(fmt.Sprintf("/sys/devices/system/node/node%d/cpulist", nodeID))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return parseCPUList(strings.TrimSpace(string(data)))
|
||||
}
|
||||
|
||||
func parseCPUList(list string) ([]int, error) {
|
||||
var cpus []int
|
||||
|
||||
// Handle empty list
|
||||
if list == "" {
|
||||
return cpus, nil
|
||||
}
|
||||
|
||||
parts := strings.Split(list, ",")
|
||||
for _, part := range parts {
|
||||
if strings.Contains(part, "-") {
|
||||
// Range like "0-7"
|
||||
rangeParts := strings.Split(part, "-")
|
||||
if len(rangeParts) == 2 {
|
||||
start, _ := strconv.Atoi(rangeParts[0])
|
||||
end, _ := strconv.Atoi(rangeParts[1])
|
||||
for i := start; i <= end; i++ {
|
||||
cpus = append(cpus, i)
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Single CPU
|
||||
cpu, _ := strconv.Atoi(part)
|
||||
cpus = append(cpus, cpu)
|
||||
}
|
||||
}
|
||||
|
||||
return cpus, nil
|
||||
}
|
||||
|
||||
func getDistancesForNode(nodeID int, numNodes int) ([]uint32, error) {
|
||||
file, err := os.Open(fmt.Sprintf("/sys/devices/system/node/node%d/distance", nodeID))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
data, err := os.ReadFile(fmt.Sprintf("/sys/devices/system/node/node%d/distance", nodeID))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
fields := strings.Fields(string(data))
|
||||
distances := make([]uint32, len(fields))
|
||||
for i, field := range fields {
|
||||
val, _ := strconv.ParseUint(field, 10, 32)
|
||||
distances[i] = uint32(val)
|
||||
}
|
||||
|
||||
return distances, nil
|
||||
}
|
||||
|
||||
func detectFromLibNuma(topology *Topology) error {
|
||||
initNuma()
|
||||
if numaInitErr != nil {
|
||||
return numaInitErr
|
||||
}
|
||||
|
||||
numNodes := int(C.numa_num_configured_nodes())
|
||||
if numNodes <= 0 {
|
||||
return fmt.Errorf("no NUMA nodes configured")
|
||||
}
|
||||
|
||||
topology.NumNodes = numNodes
|
||||
|
||||
maxNode := int(C.numa_max_node())
|
||||
|
||||
for nodeID := 0; nodeID <= maxNode; nodeID++ {
|
||||
if C.numa_bitmask_isbitset(C.numa_all_nodes_ptr, C.uint(nodeID)) == 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
nodeInfo := &NodeInfo{
|
||||
ID: NodeID(nodeID),
|
||||
}
|
||||
|
||||
// Get memory size
|
||||
totalMem := uint64(C.numa_node_size(C.int(nodeID), nil))
|
||||
nodeInfo.TotalMemory = totalMem
|
||||
|
||||
// Get CPUs (this is approximate with libnuma)
|
||||
cpuMask := C.numa_allocate_cpumask()
|
||||
defer C.numa_free_cpumask(cpuMask)
|
||||
|
||||
if C.numa_node_to_cpus(C.int(nodeID), cpuMask) == 0 {
|
||||
// Parse CPU mask
|
||||
maxCPU := int(C.numa_num_configured_cpus())
|
||||
for cpu := 0; cpu < maxCPU; cpu++ {
|
||||
if C.numa_bitmask_isbitset(cpuMask, C.uint(cpu)) != 0 {
|
||||
nodeInfo.CPUs = append(nodeInfo.CPUs, cpu)
|
||||
topology.CPUToNodeMap[cpu] = NodeID(nodeID)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
topology.Nodes[NodeID(nodeID)] = nodeInfo
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func getCurrentNodeImpl() (NodeID, error) {
|
||||
// Use /proc/self/stat to get current CPU
|
||||
data, err := os.ReadFile("/proc/self/stat")
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("failed to read /proc/self/stat: %v", err)
|
||||
}
|
||||
|
||||
fields := strings.Fields(string(data))
|
||||
if len(fields) < 39 {
|
||||
return 0, fmt.Errorf("unexpected /proc/self/stat format")
|
||||
}
|
||||
|
||||
cpu, err := strconv.Atoi(fields[38])
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("failed to parse CPU: %v", err)
|
||||
}
|
||||
|
||||
topology := GetTopology()
|
||||
node, ok := topology.GetNodeForCPU(cpu)
|
||||
if !ok {
|
||||
return 0, fmt.Errorf("CPU %d not found in topology", cpu)
|
||||
}
|
||||
return node, nil
|
||||
}
|
||||
|
||||
func setPreferredNodeImpl(node NodeID) (*PreferredNode, error) {
|
||||
initNuma()
|
||||
if numaInitErr != nil {
|
||||
return nil, numaInitErr
|
||||
}
|
||||
|
||||
// Save current nodemask
|
||||
var oldMode C.int
|
||||
var oldMask C.ulong
|
||||
maxNode := C.ulong(2) // We only need 2 bits for now
|
||||
|
||||
if ret := C.get_mempolicy(&oldMode, &oldMask, maxNode, nil, 0); ret < 0 {
|
||||
return nil, fmt.Errorf("get_mempolicy failed: %v", ret)
|
||||
}
|
||||
|
||||
// Set preferred node
|
||||
var newMask C.ulong = 1 << C.ulong(node)
|
||||
if ret := C.set_mempolicy(MPOL_PREFERRED, &newMask, maxNode); ret < 0 {
|
||||
return nil, fmt.Errorf("set_mempolicy failed: %v", ret)
|
||||
}
|
||||
|
||||
return &PreferredNode{nodeID: node}, nil
|
||||
}
|
||||
|
||||
func revertPreferredNodeImpl(p *PreferredNode) error {
|
||||
// Reset to default policy
|
||||
if ret := C.set_mempolicy(MPOL_DEFAULT, nil, 0); ret < 0 {
|
||||
return fmt.Errorf("set_mempolicy failed: %v", ret)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func setMemoryPolicyImpl(policy MemoryPolicy, nodes []NodeID) error {
|
||||
var mode int
|
||||
switch policy {
|
||||
case MPDefault:
|
||||
mode = MPOL_DEFAULT
|
||||
case MPBind:
|
||||
mode = MPOL_BIND
|
||||
case MPPreferred:
|
||||
mode = MPOL_PREFERRED
|
||||
case MPInterleave:
|
||||
mode = MPOL_INTERLEAVE
|
||||
default:
|
||||
return fmt.Errorf("unknown memory policy: %d", policy)
|
||||
}
|
||||
|
||||
// Build nodemask
|
||||
var mask C.ulong
|
||||
for _, node := range nodes {
|
||||
mask |= 1 << C.ulong(node)
|
||||
}
|
||||
|
||||
maxNode := C.ulong(2)
|
||||
for _, node := range nodes {
|
||||
if C.ulong(node) >= maxNode {
|
||||
maxNode = C.ulong(node) + 1
|
||||
}
|
||||
}
|
||||
|
||||
if ret := C.set_mempolicy(C.int(mode), &mask, maxNode); ret < 0 {
|
||||
return fmt.Errorf("set_mempolicy failed: %v", ret)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func allocateOnNodeImpl(size int, node NodeID) ([]byte, error) {
|
||||
// Use mmap with MAP_PRIVATE and bind to specific node
|
||||
buf := make([]byte, size)
|
||||
|
||||
// Set the memory policy for the allocated region
|
||||
var mask C.ulong = 1 << C.ulong(node)
|
||||
ptr := unsafe.Pointer(&buf[0])
|
||||
|
||||
if ret := C.mbind(ptr, C.ulong(size), MPOL_BIND, &mask, C.ulong(node)+1, MPOL_MF_STRICT); ret < 0 {
|
||||
// Fall back to regular allocation
|
||||
return buf, nil
|
||||
}
|
||||
|
||||
return buf, nil
|
||||
}
|
||||
|
||||
func scheduleOnNodeImpl(cpu int, fn func()) error {
|
||||
// Simplified implementation - just run the function
|
||||
// CPU affinity setting requires CGO or unix package
|
||||
runtime.LockOSThread()
|
||||
defer runtime.UnlockOSThread()
|
||||
fn()
|
||||
return nil
|
||||
}
|
||||
|
||||
func getPreferredNodeForCurrentThreadImpl() NodeID {
|
||||
var mode C.int
|
||||
var node C.int
|
||||
|
||||
if ret := C.get_mempolicy(&mode, nil, 0, unsafe.Pointer(&node), MPOL_F_NODE); ret < 0 {
|
||||
return NodeID(0)
|
||||
}
|
||||
|
||||
if mode == MPOL_DEFAULT {
|
||||
// Get current CPU's node
|
||||
currentNode, _ := getCurrentNodeImpl()
|
||||
return currentNode
|
||||
}
|
||||
|
||||
return NodeID(node)
|
||||
}
|
||||
|
||||
// PinThreadToNode pins the current goroutine's OS thread to a specific NUMA node
|
||||
func PinThreadToNode(node NodeID) error {
|
||||
initNuma()
|
||||
if numaInitErr != nil {
|
||||
return numaInitErr
|
||||
}
|
||||
|
||||
topology := GetTopology()
|
||||
nodeInfo, ok := topology.GetNode(node)
|
||||
if !ok {
|
||||
return fmt.Errorf("NUMA node %d not found", node)
|
||||
}
|
||||
|
||||
if len(nodeInfo.CPUs) == 0 {
|
||||
return fmt.Errorf("NUMA node %d has no CPUs", node)
|
||||
}
|
||||
|
||||
runtime.LockOSThread()
|
||||
// Note: CPU affinity setting is simplified for portability
|
||||
// Full implementation would use sched_setaffinity syscall
|
||||
return nil
|
||||
}
|
||||
|
||||
// UnpinThread releases the current goroutine's OS thread from NUMA binding
|
||||
func UnpinThread() {
|
||||
runtime.UnlockOSThread()
|
||||
}
|
||||
|
||||
// RunOnNode runs a function with the current goroutine pinned to a specific NUMA node
|
||||
func RunOnNode(node NodeID, fn func()) error {
|
||||
if err := PinThreadToNode(node); err != nil {
|
||||
return err
|
||||
}
|
||||
defer UnpinThread()
|
||||
|
||||
fn()
|
||||
return nil
|
||||
}
|
||||
415
pkg/util/numa/numa_linux_nocgo.go
Normal file
415
pkg/util/numa/numa_linux_nocgo.go
Normal file
@@ -0,0 +1,415 @@
|
||||
//go:build linux && !cgo
|
||||
// +build linux,!cgo
|
||||
|
||||
/*
|
||||
Copyright 2024 The GoStor Authors All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package numa
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"fmt"
|
||||
"os"
|
||||
"runtime"
|
||||
"strconv"
|
||||
"strings"
|
||||
"syscall"
|
||||
"unsafe"
|
||||
)
|
||||
|
||||
// Syscall numbers for x86_64 Linux
|
||||
const (
|
||||
SYS_GETCPU = 309
|
||||
SYS_SET_MEMPOLICY = 238
|
||||
SYS_GET_MEMPOLICY = 239
|
||||
SYS_MBIND = 237
|
||||
SYS_MIGRATE_PAGES = 238
|
||||
)
|
||||
|
||||
const (
|
||||
// NUMA memory policies
|
||||
MPOL_DEFAULT = 0
|
||||
MPOL_PREFERRED = 1
|
||||
MPOL_BIND = 2
|
||||
MPOL_INTERLEAVE = 3
|
||||
MPOL_LOCAL = 4
|
||||
|
||||
// Flags for get_mempolicy
|
||||
MPOL_F_NODE = 1 << 0
|
||||
MPOL_F_ADDR = 1 << 1
|
||||
|
||||
// Flags for mbind
|
||||
MPOL_MF_STRICT = 1 << 0
|
||||
)
|
||||
|
||||
//go:noescape
|
||||
//go:linkname runtime_GetCPU runtime.getcpu
|
||||
func runtime_GetCPU() uint32
|
||||
|
||||
func detectLinuxTopology(topology *Topology) error {
|
||||
nodes, err := detectNodesFromSys()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
topology.NumNodes = len(nodes)
|
||||
|
||||
for _, nodeID := range nodes {
|
||||
nodeInfo := &NodeInfo{
|
||||
ID: NodeID(nodeID),
|
||||
}
|
||||
|
||||
// Get CPUs for this node
|
||||
cpus, err := getCPUsForNodeNoCGO(nodeID)
|
||||
if err == nil {
|
||||
nodeInfo.CPUs = cpus
|
||||
for _, cpu := range cpus {
|
||||
topology.CPUToNodeMap[cpu] = NodeID(nodeID)
|
||||
}
|
||||
}
|
||||
|
||||
// Get memory info for this node
|
||||
memInfo, err := getMemoryInfoForNodeNoCGO(nodeID)
|
||||
if err == nil {
|
||||
nodeInfo.TotalMemory = memInfo.total
|
||||
nodeInfo.FreeMemory = memInfo.free
|
||||
}
|
||||
|
||||
// Get distance matrix
|
||||
distances, err := getDistancesForNodeNoCGO(nodeID, len(nodes))
|
||||
if err == nil {
|
||||
nodeInfo.DistanceToNode = distances
|
||||
}
|
||||
|
||||
topology.Nodes[NodeID(nodeID)] = nodeInfo
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func detectNodesFromSys() ([]int, error) {
|
||||
entries, err := os.ReadDir("/sys/devices/system/node")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var nodes []int
|
||||
for _, entry := range entries {
|
||||
if entry.IsDir() && strings.HasPrefix(entry.Name(), "node") {
|
||||
nodeID, err := strconv.Atoi(entry.Name()[4:])
|
||||
if err == nil {
|
||||
nodes = append(nodes, nodeID)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if len(nodes) == 0 {
|
||||
return nil, fmt.Errorf("no NUMA nodes found")
|
||||
}
|
||||
|
||||
return nodes, nil
|
||||
}
|
||||
|
||||
type memoryInfo struct {
|
||||
total uint64
|
||||
free uint64
|
||||
}
|
||||
|
||||
func getMemoryInfoForNodeNoCGO(nodeID int) (*memoryInfo, error) {
|
||||
file, err := os.Open(fmt.Sprintf("/sys/devices/system/node/node%d/meminfo", nodeID))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
info := &memoryInfo{}
|
||||
scanner := bufio.NewScanner(file)
|
||||
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
if strings.Contains(line, "MemTotal:") {
|
||||
fields := strings.Fields(line)
|
||||
if len(fields) >= 2 {
|
||||
val, _ := strconv.ParseUint(fields[1], 10, 64)
|
||||
info.total = val * 1024
|
||||
}
|
||||
} else if strings.Contains(line, "MemFree:") {
|
||||
fields := strings.Fields(line)
|
||||
if len(fields) >= 2 {
|
||||
val, _ := strconv.ParseUint(fields[1], 10, 64)
|
||||
info.free = val * 1024
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return info, scanner.Err()
|
||||
}
|
||||
|
||||
func getCPUsForNodeNoCGO(nodeID int) ([]int, error) {
|
||||
data, err := os.ReadFile(fmt.Sprintf("/sys/devices/system/node/node%d/cpulist", nodeID))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return parseCPUListNoCGO(strings.TrimSpace(string(data)))
|
||||
}
|
||||
|
||||
func parseCPUListNoCGO(list string) ([]int, error) {
|
||||
var cpus []int
|
||||
|
||||
if list == "" {
|
||||
return cpus, nil
|
||||
}
|
||||
|
||||
parts := strings.Split(list, ",")
|
||||
for _, part := range parts {
|
||||
if strings.Contains(part, "-") {
|
||||
rangeParts := strings.Split(part, "-")
|
||||
if len(rangeParts) == 2 {
|
||||
start, _ := strconv.Atoi(rangeParts[0])
|
||||
end, _ := strconv.Atoi(rangeParts[1])
|
||||
for i := start; i <= end; i++ {
|
||||
cpus = append(cpus, i)
|
||||
}
|
||||
}
|
||||
} else {
|
||||
cpu, _ := strconv.Atoi(part)
|
||||
cpus = append(cpus, cpu)
|
||||
}
|
||||
}
|
||||
|
||||
return cpus, nil
|
||||
}
|
||||
|
||||
func getDistancesForNodeNoCGO(nodeID int, numNodes int) ([]uint32, error) {
|
||||
data, err := os.ReadFile(fmt.Sprintf("/sys/devices/system/node/node%d/distance", nodeID))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
fields := strings.Fields(string(data))
|
||||
distances := make([]uint32, len(fields))
|
||||
for i, field := range fields {
|
||||
val, _ := strconv.ParseUint(field, 10, 32)
|
||||
distances[i] = uint32(val)
|
||||
}
|
||||
|
||||
return distances, nil
|
||||
}
|
||||
|
||||
func getCurrentNodeImpl() (NodeID, error) {
|
||||
var cpu, node uint32
|
||||
|
||||
// Use getcpu syscall
|
||||
r1, _, errno := syscall.Syscall(SYS_GETCPU,
|
||||
uintptr(unsafe.Pointer(&cpu)),
|
||||
uintptr(unsafe.Pointer(&node)),
|
||||
0)
|
||||
|
||||
if errno != 0 {
|
||||
// Fallback: try to determine from CPU
|
||||
return getNodeFromSchedGetCPU()
|
||||
}
|
||||
|
||||
_ = r1 // suppress unused warning
|
||||
return NodeID(node), nil
|
||||
}
|
||||
|
||||
func getNodeFromSchedGetCPU() (NodeID, error) {
|
||||
// Get current CPU
|
||||
cpu := runtime.GOMAXPROCS(0)
|
||||
|
||||
// Look up in topology
|
||||
topology := GetTopology()
|
||||
node, ok := topology.GetNodeForCPU(cpu)
|
||||
if !ok {
|
||||
return 0, fmt.Errorf("cannot determine NUMA node for CPU %d", cpu)
|
||||
}
|
||||
return node, nil
|
||||
}
|
||||
|
||||
func setPreferredNodeImpl(node NodeID) (*PreferredNode, error) {
|
||||
mask := uint64(1) << uint64(node)
|
||||
maxNode := uint64(node) + 1
|
||||
|
||||
_, _, errno := syscall.Syscall6(SYS_SET_MEMPOLICY,
|
||||
uintptr(MPOL_PREFERRED),
|
||||
uintptr(unsafe.Pointer(&mask)),
|
||||
uintptr(maxNode),
|
||||
0, 0, 0)
|
||||
|
||||
if errno != 0 {
|
||||
return nil, fmt.Errorf("set_mempolicy failed: %v", errno)
|
||||
}
|
||||
|
||||
return &PreferredNode{nodeID: node}, nil
|
||||
}
|
||||
|
||||
func revertPreferredNodeImpl(p *PreferredNode) error {
|
||||
_, _, errno := syscall.Syscall(SYS_SET_MEMPOLICY,
|
||||
uintptr(MPOL_DEFAULT),
|
||||
0, 0)
|
||||
|
||||
if errno != 0 {
|
||||
return fmt.Errorf("set_mempolicy failed: %v", errno)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func setMemoryPolicyImpl(policy MemoryPolicy, nodes []NodeID) error {
|
||||
var mode int
|
||||
switch policy {
|
||||
case MPDefault:
|
||||
mode = MPOL_DEFAULT
|
||||
case MPBind:
|
||||
mode = MPOL_BIND
|
||||
case MPPreferred:
|
||||
mode = MPOL_PREFERRED
|
||||
case MPInterleave:
|
||||
mode = MPOL_INTERLEAVE
|
||||
default:
|
||||
return fmt.Errorf("unknown memory policy: %d", policy)
|
||||
}
|
||||
|
||||
var mask uint64
|
||||
for _, node := range nodes {
|
||||
mask |= 1 << uint64(node)
|
||||
}
|
||||
|
||||
maxNode := uint64(0)
|
||||
for _, node := range nodes {
|
||||
if uint64(node) >= maxNode {
|
||||
maxNode = uint64(node) + 1
|
||||
}
|
||||
}
|
||||
|
||||
_, _, errno := syscall.Syscall6(SYS_SET_MEMPOLICY,
|
||||
uintptr(mode),
|
||||
uintptr(unsafe.Pointer(&mask)),
|
||||
uintptr(maxNode),
|
||||
0, 0, 0)
|
||||
|
||||
if errno != 0 {
|
||||
return fmt.Errorf("set_mempolicy failed: %v", errno)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func allocateOnNodeImpl(size int, node NodeID) ([]byte, error) {
|
||||
buf := make([]byte, size)
|
||||
|
||||
// Try to use mbind to bind memory to node
|
||||
mask := uint64(1) << uint64(node)
|
||||
maxNode := uint64(node) + 1
|
||||
|
||||
_, _, errno := syscall.Syscall6(SYS_MBIND,
|
||||
uintptr(unsafe.Pointer(&buf[0])),
|
||||
uintptr(size),
|
||||
uintptr(MPOL_BIND),
|
||||
uintptr(unsafe.Pointer(&mask)),
|
||||
uintptr(maxNode),
|
||||
uintptr(MPOL_MF_STRICT))
|
||||
|
||||
if errno != 0 {
|
||||
// Fall back to regular allocation
|
||||
return buf, nil
|
||||
}
|
||||
|
||||
return buf, nil
|
||||
}
|
||||
|
||||
func scheduleOnNodeImpl(cpu int, fn func()) error {
|
||||
var mask syscall.CPUSet
|
||||
mask.Set(cpu)
|
||||
|
||||
runtime.LockOSThread()
|
||||
defer runtime.UnlockOSThread()
|
||||
|
||||
if err := syscall.SchedSetaffinity(0, &mask); err != nil {
|
||||
return fmt.Errorf("sched_setaffinity failed: %v", err)
|
||||
}
|
||||
|
||||
fn()
|
||||
return nil
|
||||
}
|
||||
|
||||
func getPreferredNodeForCurrentThreadImpl() NodeID {
|
||||
var mode int
|
||||
var node uint32
|
||||
|
||||
_, _, errno := syscall.Syscall6(SYS_GET_MEMPOLICY,
|
||||
uintptr(unsafe.Pointer(&mode)),
|
||||
0, 0,
|
||||
uintptr(unsafe.Pointer(&node)),
|
||||
uintptr(MPOL_F_NODE),
|
||||
0)
|
||||
|
||||
if errno != 0 {
|
||||
node, _ := getCurrentNodeImpl()
|
||||
return node
|
||||
}
|
||||
|
||||
if mode == MPOL_DEFAULT {
|
||||
node, _ := getCurrentNodeImpl()
|
||||
return node
|
||||
}
|
||||
|
||||
return NodeID(node)
|
||||
}
|
||||
|
||||
// PinThreadToNode pins the current goroutine's OS thread to a specific NUMA node
|
||||
func PinThreadToNode(node NodeID) error {
|
||||
topology := GetTopology()
|
||||
nodeInfo, ok := topology.GetNode(node)
|
||||
if !ok {
|
||||
return fmt.Errorf("NUMA node %d not found", node)
|
||||
}
|
||||
|
||||
if len(nodeInfo.CPUs) == 0 {
|
||||
return fmt.Errorf("NUMA node %d has no CPUs", node)
|
||||
}
|
||||
|
||||
runtime.LockOSThread()
|
||||
|
||||
var mask syscall.CPUSet
|
||||
for _, cpu := range nodeInfo.CPUs {
|
||||
mask.Set(cpu)
|
||||
}
|
||||
|
||||
if err := syscall.SchedSetaffinity(0, &mask); err != nil {
|
||||
runtime.UnlockOSThread()
|
||||
return fmt.Errorf("sched_setaffinity failed: %v", err)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// UnpinThread releases the current goroutine's OS thread from NUMA binding
|
||||
func UnpinThread() {
|
||||
runtime.UnlockOSThread()
|
||||
}
|
||||
|
||||
// RunOnNode runs a function with the current goroutine pinned to a specific NUMA node
|
||||
func RunOnNode(node NodeID, fn func()) error {
|
||||
if err := PinThreadToNode(node); err != nil {
|
||||
return err
|
||||
}
|
||||
defer UnpinThread()
|
||||
|
||||
fn()
|
||||
return nil
|
||||
}
|
||||
94
pkg/util/numa/numa_stub.go
Normal file
94
pkg/util/numa/numa_stub.go
Normal file
@@ -0,0 +1,94 @@
|
||||
//go:build !linux
|
||||
// +build !linux
|
||||
|
||||
/*
|
||||
Copyright 2024 The GoStor Authors All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package numa
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"runtime"
|
||||
)
|
||||
|
||||
func detectLinuxTopology(topology *Topology) error {
|
||||
return fmt.Errorf("NUMA not supported on this platform")
|
||||
}
|
||||
|
||||
func getCurrentNodeImpl() (NodeID, error) {
|
||||
return 0, fmt.Errorf("NUMA not supported on this platform")
|
||||
}
|
||||
|
||||
func setPreferredNodeImpl(node NodeID) (*PreferredNode, error) {
|
||||
return nil, fmt.Errorf("NUMA not supported on this platform")
|
||||
}
|
||||
|
||||
func revertPreferredNodeImpl(p *PreferredNode) error {
|
||||
return fmt.Errorf("NUMA not supported on this platform")
|
||||
}
|
||||
|
||||
func setMemoryPolicyImpl(policy MemoryPolicy, nodes []NodeID) error {
|
||||
return fmt.Errorf("NUMA not supported on this platform")
|
||||
}
|
||||
|
||||
func allocateOnNodeImpl(size int, node NodeID) ([]byte, error) {
|
||||
return make([]byte, size), nil
|
||||
}
|
||||
|
||||
func scheduleOnNodeImpl(cpu int, fn func()) error {
|
||||
fn()
|
||||
return nil
|
||||
}
|
||||
|
||||
func getPreferredNodeForCurrentThreadImpl() NodeID {
|
||||
return 0
|
||||
}
|
||||
|
||||
// PinThreadToNode pins the current goroutine's OS thread to a specific NUMA node
|
||||
// Stub implementation - does nothing on non-Linux platforms
|
||||
func PinThreadToNode(node NodeID) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// UnpinThread releases the current goroutine's OS thread from NUMA binding
|
||||
// Stub implementation - does nothing on non-Linux platforms
|
||||
func UnpinThread() {}
|
||||
|
||||
// RunOnNode runs a function with the current goroutine pinned to a specific NUMA node
|
||||
// Stub implementation - just runs the function on non-Linux platforms
|
||||
func RunOnNode(node NodeID, fn func()) error {
|
||||
fn()
|
||||
return nil
|
||||
}
|
||||
|
||||
// createSingleNodeTopology creates a single-node topology for non-NUMA systems
|
||||
func createSingleNodeTopology(topology *Topology) {
|
||||
numCPU := runtime.NumCPU()
|
||||
cpus := make([]int, numCPU)
|
||||
for i := 0; i < numCPU; i++ {
|
||||
cpus[i] = i
|
||||
topology.CPUToNodeMap[i] = 0
|
||||
}
|
||||
|
||||
topology.NumNodes = 1
|
||||
topology.Nodes[0] = &NodeInfo{
|
||||
ID: 0,
|
||||
CPUs: cpus,
|
||||
TotalMemory: 0,
|
||||
FreeMemory: 0,
|
||||
DistanceToNode: []uint32{10},
|
||||
}
|
||||
}
|
||||
105
pkg/util/numa/numa_test.go
Normal file
105
pkg/util/numa/numa_test.go
Normal file
@@ -0,0 +1,105 @@
|
||||
/*
|
||||
Copyright 2024 The GoStor Authors All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package numa
|
||||
|
||||
import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestTopologyDetection(t *testing.T) {
|
||||
topology := GetTopology()
|
||||
if topology == nil {
|
||||
t.Fatal("GetTopology returned nil")
|
||||
}
|
||||
|
||||
if topology.NumNodes < 1 {
|
||||
t.Errorf("Expected at least 1 NUMA node, got %d", topology.NumNodes)
|
||||
}
|
||||
|
||||
if len(topology.Nodes) == 0 {
|
||||
t.Error("No NUMA nodes found in topology")
|
||||
}
|
||||
}
|
||||
|
||||
func TestBufferPool(t *testing.T) {
|
||||
pool := NewNUMABufferPool(&BufferPoolConfig{
|
||||
BufferSize: 4096,
|
||||
PerNodePoolSize: 10,
|
||||
EnableNUMA: false, // Disable NUMA for test
|
||||
})
|
||||
|
||||
if pool == nil {
|
||||
t.Fatal("NewNUMABufferPool returned nil")
|
||||
}
|
||||
|
||||
// Test Get/Put
|
||||
buf := pool.Get()
|
||||
if len(buf) != 4096 {
|
||||
t.Errorf("Expected buffer size 4096, got %d", len(buf))
|
||||
}
|
||||
|
||||
pool.Put(buf)
|
||||
|
||||
// Test stats
|
||||
stats := pool.Stats()
|
||||
if stats.Gets == 0 {
|
||||
t.Error("Expected Gets > 0")
|
||||
}
|
||||
if stats.Puts == 0 {
|
||||
t.Error("Expected Puts > 0")
|
||||
}
|
||||
}
|
||||
|
||||
func TestBufferPoolMultipleSizes(t *testing.T) {
|
||||
pool := NewNUMABufferPool(&BufferPoolConfig{
|
||||
BufferSize: 8192,
|
||||
PerNodePoolSize: 5,
|
||||
EnableNUMA: false,
|
||||
})
|
||||
|
||||
// Get multiple buffers
|
||||
var buffers [][]byte
|
||||
for i := 0; i < 10; i++ {
|
||||
buf := pool.Get()
|
||||
buffers = append(buffers, buf)
|
||||
}
|
||||
|
||||
// Put all back
|
||||
for _, buf := range buffers {
|
||||
pool.Put(buf)
|
||||
}
|
||||
|
||||
stats := pool.Stats()
|
||||
if stats.Gets != 10 {
|
||||
t.Errorf("Expected 10 gets, got %d", stats.Gets)
|
||||
}
|
||||
if stats.Puts != 10 {
|
||||
t.Errorf("Expected 10 puts, got %d", stats.Puts)
|
||||
}
|
||||
}
|
||||
|
||||
func TestAvailable(t *testing.T) {
|
||||
// Just verify the function doesn't panic
|
||||
_ = Available()
|
||||
}
|
||||
|
||||
func TestNumNodes(t *testing.T) {
|
||||
n := NumNodes()
|
||||
if n < 1 {
|
||||
t.Errorf("Expected at least 1 node, got %d", n)
|
||||
}
|
||||
}
|
||||
424
pkg/util/numa/pool.go
Normal file
424
pkg/util/numa/pool.go
Normal file
@@ -0,0 +1,424 @@
|
||||
/*
|
||||
Copyright 2024 The GoStor Authors All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package numa
|
||||
|
||||
import (
|
||||
"context"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
)
|
||||
|
||||
// BufferPoolConfig configures NUMA-aware buffer pools
|
||||
type BufferPoolConfig struct {
|
||||
// BufferSize is the size of each buffer
|
||||
BufferSize int
|
||||
// PerNodePoolSize is the number of buffers to preallocate per node
|
||||
PerNodePoolSize int
|
||||
// EnableNUMA enables NUMA-aware allocation
|
||||
EnableNUMA bool
|
||||
}
|
||||
|
||||
// DefaultBufferPoolConfig returns a default configuration
|
||||
func DefaultBufferPoolConfig() *BufferPoolConfig {
|
||||
return &BufferPoolConfig{
|
||||
BufferSize: 256 * 1024, // 256KB buffers for I/O
|
||||
PerNodePoolSize: 1024, // 1024 buffers per node
|
||||
EnableNUMA: true,
|
||||
}
|
||||
}
|
||||
|
||||
// NUMABufferPool provides NUMA-aware buffer pooling
|
||||
type NUMABufferPool struct {
|
||||
config *BufferPoolConfig
|
||||
topology *Topology
|
||||
nodePools map[NodeID]*sync.Pool
|
||||
stats *PoolStats
|
||||
|
||||
// Fallback pool for when NUMA is not available or disabled
|
||||
fallbackPool *sync.Pool
|
||||
|
||||
mu sync.RWMutex
|
||||
}
|
||||
|
||||
// PoolStats tracks buffer pool statistics
|
||||
type PoolStats struct {
|
||||
Gets uint64
|
||||
Puts uint64
|
||||
Misses uint64
|
||||
NodeLocalHit uint64
|
||||
NUMAHit uint64
|
||||
}
|
||||
|
||||
// NewNUMABufferPool creates a new NUMA-aware buffer pool
|
||||
func NewNUMABufferPool(config *BufferPoolConfig) *NUMABufferPool {
|
||||
if config == nil {
|
||||
config = DefaultBufferPoolConfig()
|
||||
}
|
||||
|
||||
pool := &NUMABufferPool{
|
||||
config: config,
|
||||
topology: GetTopology(),
|
||||
nodePools: make(map[NodeID]*sync.Pool),
|
||||
stats: &PoolStats{},
|
||||
}
|
||||
|
||||
// Initialize fallback pool
|
||||
pool.fallbackPool = &sync.Pool{
|
||||
New: func() interface{} {
|
||||
atomic.AddUint64(&pool.stats.Misses, 1)
|
||||
return make([]byte, config.BufferSize)
|
||||
},
|
||||
}
|
||||
|
||||
// Initialize NUMA pools if enabled and available
|
||||
if config.EnableNUMA && Available() && pool.topology.NumNodes > 1 {
|
||||
for nodeID := range pool.topology.Nodes {
|
||||
pool.nodePools[nodeID] = pool.createNodePool(nodeID)
|
||||
}
|
||||
}
|
||||
|
||||
return pool
|
||||
}
|
||||
|
||||
// createNodePool creates a buffer pool for a specific NUMA node
|
||||
func (p *NUMABufferPool) createNodePool(node NodeID) *sync.Pool {
|
||||
return &sync.Pool{
|
||||
New: func() interface{} {
|
||||
atomic.AddUint64(&p.stats.Misses, 1)
|
||||
|
||||
// Try NUMA-local allocation first
|
||||
if p.config.EnableNUMA && Available() {
|
||||
buf, err := AllocateOnNode(p.config.BufferSize, node)
|
||||
if err == nil {
|
||||
atomic.AddUint64(&p.stats.NUMAHit, 1)
|
||||
return buf
|
||||
}
|
||||
}
|
||||
|
||||
// Fall back to regular allocation
|
||||
return make([]byte, p.config.BufferSize)
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// Get returns a buffer from the pool, preferably from the local NUMA node
|
||||
func (p *NUMABufferPool) Get() []byte {
|
||||
atomic.AddUint64(&p.stats.Gets, 1)
|
||||
|
||||
// Try to get from the local NUMA node first
|
||||
if p.config.EnableNUMA && Available() && len(p.nodePools) > 0 {
|
||||
if node, err := GetCurrentNode(); err == nil {
|
||||
if nodePool, ok := p.nodePools[node]; ok {
|
||||
buf := nodePool.Get().([]byte)
|
||||
atomic.AddUint64(&p.stats.NodeLocalHit, 1)
|
||||
return buf[:p.config.BufferSize]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Fall back to the fallback pool
|
||||
return p.fallbackPool.Get().([]byte)[:p.config.BufferSize]
|
||||
}
|
||||
|
||||
// Put returns a buffer to the pool, preferably to its local NUMA node
|
||||
func (p *NUMABufferPool) Put(buf []byte) {
|
||||
if buf == nil {
|
||||
return
|
||||
}
|
||||
|
||||
atomic.AddUint64(&p.stats.Puts, 1)
|
||||
|
||||
// Resize buffer to full size before returning to pool
|
||||
if cap(buf) < p.config.BufferSize {
|
||||
// Buffer is too small, discard it
|
||||
return
|
||||
}
|
||||
buf = buf[:p.config.BufferSize]
|
||||
|
||||
// Try to return to the local NUMA node pool
|
||||
if p.config.EnableNUMA && Available() && len(p.nodePools) > 0 {
|
||||
if node, err := GetCurrentNode(); err == nil {
|
||||
if nodePool, ok := p.nodePools[node]; ok {
|
||||
nodePool.Put(buf)
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Fall back to the fallback pool
|
||||
p.fallbackPool.Put(buf)
|
||||
}
|
||||
|
||||
// GetForNode returns a buffer from a specific NUMA node's pool
|
||||
func (p *NUMABufferPool) GetForNode(node NodeID) []byte {
|
||||
atomic.AddUint64(&p.stats.Gets, 1)
|
||||
|
||||
if nodePool, ok := p.nodePools[node]; ok {
|
||||
return nodePool.Get().([]byte)[:p.config.BufferSize]
|
||||
}
|
||||
|
||||
return p.fallbackPool.Get().([]byte)[:p.config.BufferSize]
|
||||
}
|
||||
|
||||
// PutForNode returns a buffer to a specific NUMA node's pool
|
||||
func (p *NUMABufferPool) PutForNode(node NodeID, buf []byte) {
|
||||
if buf == nil {
|
||||
return
|
||||
}
|
||||
|
||||
atomic.AddUint64(&p.stats.Puts, 1)
|
||||
|
||||
if cap(buf) < p.config.BufferSize {
|
||||
return
|
||||
}
|
||||
buf = buf[:p.config.BufferSize]
|
||||
|
||||
if nodePool, ok := p.nodePools[node]; ok {
|
||||
nodePool.Put(buf)
|
||||
return
|
||||
}
|
||||
|
||||
p.fallbackPool.Put(buf)
|
||||
}
|
||||
|
||||
// Stats returns current pool statistics
|
||||
func (p *NUMABufferPool) Stats() PoolStats {
|
||||
return PoolStats{
|
||||
Gets: atomic.LoadUint64(&p.stats.Gets),
|
||||
Puts: atomic.LoadUint64(&p.stats.Puts),
|
||||
Misses: atomic.LoadUint64(&p.stats.Misses),
|
||||
NodeLocalHit: atomic.LoadUint64(&p.stats.NodeLocalHit),
|
||||
NUMAHit: atomic.LoadUint64(&p.stats.NUMAHit),
|
||||
}
|
||||
}
|
||||
|
||||
// GetConfig returns the pool configuration
|
||||
func (p *NUMABufferPool) GetConfig() *BufferPoolConfig {
|
||||
return p.config
|
||||
}
|
||||
|
||||
// Close releases all resources associated with the pool
|
||||
func (p *NUMABufferPool) Close() error {
|
||||
p.mu.Lock()
|
||||
defer p.mu.Unlock()
|
||||
|
||||
// Clear all pools
|
||||
p.nodePools = make(map[NodeID]*sync.Pool)
|
||||
p.fallbackPool = nil
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// SizeAwarePool is a buffer pool that can handle multiple buffer sizes
|
||||
type SizeAwarePool struct {
|
||||
pools map[int]*NUMABufferPool
|
||||
mu sync.RWMutex
|
||||
}
|
||||
|
||||
// NewSizeAwarePool creates a new size-aware buffer pool
|
||||
func NewSizeAwarePool(sizes []int, enableNUMA bool) *SizeAwarePool {
|
||||
sap := &SizeAwarePool{
|
||||
pools: make(map[int]*NUMABufferPool),
|
||||
}
|
||||
|
||||
for _, size := range sizes {
|
||||
sap.pools[size] = NewNUMABufferPool(&BufferPoolConfig{
|
||||
BufferSize: size,
|
||||
PerNodePoolSize: 1024,
|
||||
EnableNUMA: enableNUMA,
|
||||
})
|
||||
}
|
||||
|
||||
return sap
|
||||
}
|
||||
|
||||
// Get returns a buffer of the specified size
|
||||
func (sap *SizeAwarePool) Get(size int) []byte {
|
||||
sap.mu.RLock()
|
||||
pool, ok := sap.pools[size]
|
||||
sap.mu.RUnlock()
|
||||
|
||||
if ok {
|
||||
return pool.Get()
|
||||
}
|
||||
|
||||
// No pool for this size, allocate directly
|
||||
return make([]byte, size)
|
||||
}
|
||||
|
||||
// Put returns a buffer to the appropriate pool
|
||||
func (sap *SizeAwarePool) Put(buf []byte) {
|
||||
if buf == nil {
|
||||
return
|
||||
}
|
||||
|
||||
size := cap(buf)
|
||||
|
||||
sap.mu.RLock()
|
||||
pool, ok := sap.pools[size]
|
||||
sap.mu.RUnlock()
|
||||
|
||||
if ok {
|
||||
pool.Put(buf)
|
||||
}
|
||||
// If no pool for this size, let GC handle it
|
||||
}
|
||||
|
||||
// PinningAllocator allocates buffers while the goroutine is pinned to a NUMA node
|
||||
type PinningAllocator struct {
|
||||
pool *NUMABufferPool
|
||||
}
|
||||
|
||||
// NewPinningAllocator creates a new pinning allocator
|
||||
func NewPinningAllocator(pool *NUMABufferPool) *PinningAllocator {
|
||||
return &PinningAllocator{pool: pool}
|
||||
}
|
||||
|
||||
// Allocate allocates a buffer while pinned to the current NUMA node
|
||||
func (pa *PinningAllocator) Allocate() []byte {
|
||||
return pa.pool.Get()
|
||||
}
|
||||
|
||||
// AllocateOnNode allocates a buffer while pinned to a specific NUMA node
|
||||
func (pa *PinningAllocator) AllocateOnNode(node NodeID) ([]byte, error) {
|
||||
var buf []byte
|
||||
err := RunOnNode(node, func() {
|
||||
buf = pa.pool.GetForNode(node)
|
||||
})
|
||||
return buf, err
|
||||
}
|
||||
|
||||
// Global pools for common buffer sizes
|
||||
var (
|
||||
globalPools map[int]*NUMABufferPool
|
||||
globalPoolsOnce sync.Once
|
||||
globalPoolsMu sync.RWMutex
|
||||
)
|
||||
|
||||
// InitGlobalPools initializes global buffer pools
|
||||
func InitGlobalPools(sizes []int, enableNUMA bool) {
|
||||
globalPoolsOnce.Do(func() {
|
||||
globalPools = make(map[int]*NUMABufferPool)
|
||||
for _, size := range sizes {
|
||||
globalPools[size] = NewNUMABufferPool(&BufferPoolConfig{
|
||||
BufferSize: size,
|
||||
PerNodePoolSize: 1024,
|
||||
EnableNUMA: enableNUMA,
|
||||
})
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// GetBuffer gets a buffer from the global pool
|
||||
func GetBuffer(size int) []byte {
|
||||
globalPoolsMu.RLock()
|
||||
pool, ok := globalPools[size]
|
||||
globalPoolsMu.RUnlock()
|
||||
|
||||
if ok {
|
||||
return pool.Get()
|
||||
}
|
||||
return make([]byte, size)
|
||||
}
|
||||
|
||||
// PutBuffer returns a buffer to the global pool
|
||||
func PutBuffer(buf []byte) {
|
||||
if buf == nil {
|
||||
return
|
||||
}
|
||||
|
||||
size := cap(buf)
|
||||
|
||||
globalPoolsMu.RLock()
|
||||
pool, ok := globalPools[size]
|
||||
globalPoolsMu.RUnlock()
|
||||
|
||||
if ok {
|
||||
pool.Put(buf)
|
||||
}
|
||||
}
|
||||
|
||||
// WorkerPool is a pool of workers that are pinned to specific NUMA nodes
|
||||
type WorkerPool struct {
|
||||
size int
|
||||
numaNode NodeID
|
||||
workQueue chan func()
|
||||
ctx context.Context
|
||||
cancel context.CancelFunc
|
||||
wg sync.WaitGroup
|
||||
}
|
||||
|
||||
// NewWorkerPool creates a new NUMA-aware worker pool
|
||||
func NewWorkerPool(size int, node NodeID) *WorkerPool {
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
|
||||
wp := &WorkerPool{
|
||||
size: size,
|
||||
numaNode: node,
|
||||
workQueue: make(chan func(), size*2),
|
||||
ctx: ctx,
|
||||
cancel: cancel,
|
||||
}
|
||||
|
||||
// Start workers
|
||||
for i := 0; i < size; i++ {
|
||||
wp.wg.Add(1)
|
||||
go wp.worker()
|
||||
}
|
||||
|
||||
return wp
|
||||
}
|
||||
|
||||
func (wp *WorkerPool) worker() {
|
||||
defer wp.wg.Done()
|
||||
|
||||
// Pin to NUMA node
|
||||
if Available() {
|
||||
PinThreadToNode(wp.numaNode)
|
||||
defer UnpinThread()
|
||||
}
|
||||
|
||||
for {
|
||||
select {
|
||||
case work := <-wp.workQueue:
|
||||
if work != nil {
|
||||
work()
|
||||
}
|
||||
case <-wp.ctx.Done():
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Submit submits work to the worker pool
|
||||
func (wp *WorkerPool) Submit(work func()) bool {
|
||||
select {
|
||||
case wp.workQueue <- work:
|
||||
return true
|
||||
case <-wp.ctx.Done():
|
||||
return false
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
// Stop stops the worker pool
|
||||
func (wp *WorkerPool) Stop() {
|
||||
wp.cancel()
|
||||
wp.wg.Wait()
|
||||
close(wp.workQueue)
|
||||
}
|
||||
@@ -73,32 +73,38 @@ func MarshalKVText(kv []KeyValue) []byte {
|
||||
return data
|
||||
}
|
||||
|
||||
// MarshalUint16 returns big-endian encoding of i as a new 2-byte slice.
|
||||
// Deprecated: Use MarshalUint16To or binary.BigEndian.PutUint16 for zero-allocation.
|
||||
func MarshalUint16(i uint16) []byte {
|
||||
var data []byte
|
||||
for j := 8; j >= 0; j -= 8 {
|
||||
b := byte(i >> uint16(j))
|
||||
data = append(data, b)
|
||||
}
|
||||
return data
|
||||
var data [2]byte
|
||||
binary.BigEndian.PutUint16(data[:], i)
|
||||
return data[:]
|
||||
}
|
||||
|
||||
// MarshalUint32 returns big-endian encoding of i as a new 4-byte slice.
|
||||
// Deprecated: Use MarshalUint32To or binary.BigEndian.PutUint32 for zero-allocation.
|
||||
func MarshalUint32(i uint32) []byte {
|
||||
var data []byte
|
||||
for j := 24; j >= 0; j -= 8 {
|
||||
b := byte(i >> uint32(j))
|
||||
data = append(data, b)
|
||||
}
|
||||
return data
|
||||
var data [4]byte
|
||||
binary.BigEndian.PutUint32(data[:], i)
|
||||
return data[:]
|
||||
}
|
||||
|
||||
// MarshalUint32To writes big-endian encoding of i into buf, which must be at least 4 bytes.
|
||||
// This is a zero-allocation alternative to MarshalUint32.
|
||||
func MarshalUint32To(buf []byte, i uint32) {
|
||||
binary.BigEndian.PutUint32(buf, i)
|
||||
}
|
||||
|
||||
func MarshalUint64(v uint64) []byte {
|
||||
var data = [8]byte{}
|
||||
var i = 0
|
||||
for j := 56; j >= 0; j -= 8 {
|
||||
data[i] = byte(v >> uint32(j))
|
||||
i++
|
||||
}
|
||||
return data[0:8]
|
||||
var data [8]byte
|
||||
binary.BigEndian.PutUint64(data[:], v)
|
||||
return data[:]
|
||||
}
|
||||
|
||||
// MarshalUint64To writes big-endian encoding of v into buf, which must be at least 8 bytes.
|
||||
// This is a zero-allocation alternative for partial writes.
|
||||
func MarshalUint64To(buf []byte, v uint64) {
|
||||
binary.BigEndian.PutUint64(buf, v)
|
||||
}
|
||||
|
||||
func StringToByte(str string, align int, maxlength int) []byte {
|
||||
|
||||
Reference in New Issue
Block a user