Files
gotgt/pkg/util/numa/numa_linux.go
2026-03-14 11:45:35 +08:00

470 lines
10 KiB
Go

//go:build linux
// +build linux
/*
Copyright 2024 The GoStor Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package numa
import (
"bufio"
"fmt"
"os"
"runtime"
"strconv"
"strings"
"sync"
"unsafe"
)
// #include <stdlib.h>
// #include <unistd.h>
// #include <sys/syscall.h>
// #include <linux/mempolicy.h>
// #include <numa.h>
// #include <numaif.h>
//
// #cgo LDFLAGS: -lnuma
import "C"
// NUMA memory policy modes, numbered to match linux/mempolicy.h.
const (
	MPOL_DEFAULT = iota
	MPOL_PREFERRED
	MPOL_BIND
	MPOL_INTERLEAVE
	MPOL_LOCAL
	MPOL_MAX
)

// Flag bits for mbind; each constant occupies the next bit, starting at bit 0.
//
// NOTE(review): the names from MPOL_MF_VALID onwards do not look like
// single-bit flags in the kernel header — confirm before relying on them.
const (
	MPOL_MF_STRICT = 1 << iota
	MPOL_MF_MOVE
	MPOL_MF_MOVE_ALL
	MPOL_MF_LAZY
	MPOL_MF_INTERNAL
	MPOL_MF_VALID
	MPOL_MF_WAKE
	MPOL_MF_REMOVE
	MPOL_MF_HONOR_VMFOL
)

// Flag bits for get_mempolicy.
const (
	MPOL_F_NODE = 1 << iota
	MPOL_F_ADDR
	MPOL_F_MEMS_ALLOWED
)
// numaInitOnce ensures the libnuma availability probe in initNuma runs at
// most once per process.
var numaInitOnce sync.Once

// numaInitErr records the result of that probe; it stays nil unless the
// probe reports that NUMA is unavailable.
var numaInitErr error
func initNuma() {
numaInitOnce.Do(func() {
if C.numa_available() < 0 {
numaInitErr = fmt.Errorf("NUMA is not available")
} else {
// numa_init is not available in newer libnuma versions
// The library is automatically initialized on first use
}
})
}
// detectLinuxTopology fills topology with the NUMA layout of the machine.
//
// Node IDs are discovered through detectNodesFromSys; if that fails the whole
// detection is delegated to detectFromLibNuma. For each discovered node the
// CPU list, memory counters and distance vector are looked up independently,
// and a failure of any one lookup simply leaves that part of the NodeInfo
// unset. On the sysfs path the function always returns nil.
func detectLinuxTopology(topology *Topology) error {
	initNuma()

	nodeIDs, err := detectNodesFromSys()
	if err != nil {
		// sysfs gave us nothing usable; let libnuma try instead.
		return detectFromLibNuma(topology)
	}

	nodeCount := len(nodeIDs)
	topology.NumNodes = nodeCount

	for _, raw := range nodeIDs {
		id := NodeID(raw)
		info := &NodeInfo{ID: id}

		// CPUs that belong to this node, also recorded in the reverse map.
		if cpuList, cpuErr := getCPUsForNode(raw); cpuErr == nil {
			info.CPUs = cpuList
			for _, c := range cpuList {
				topology.CPUToNodeMap[c] = id
			}
		}

		// Total and free memory of this node.
		if mem, memErr := getMemoryInfoForNode(raw); memErr == nil {
			info.TotalMemory = mem.total
			info.FreeMemory = mem.free
		}

		// Distances from this node to the other nodes.
		if dist, distErr := getDistancesForNode(raw, nodeCount); distErr == nil {
			info.DistanceToNode = dist
		}

		topology.Nodes[id] = info
	}
	return nil
}
// detectNodesFromSys lists the NUMA node IDs exposed under
// /sys/devices/system/node. Only directories named "node<integer>" are
// counted; anything else is skipped. It returns an error when the directory
// cannot be read or when no node directory is found.
func detectNodesFromSys() ([]int, error) {
	dirEntries, err := os.ReadDir("/sys/devices/system/node")
	if err != nil {
		return nil, err
	}

	var ids []int
	for _, e := range dirEntries {
		name := e.Name()
		if !e.IsDir() || !strings.HasPrefix(name, "node") {
			continue
		}
		id, convErr := strconv.Atoi(strings.TrimPrefix(name, "node"))
		if convErr != nil {
			// Not of the form node<N>; ignore it.
			continue
		}
		ids = append(ids, id)
	}

	if len(ids) == 0 {
		return nil, fmt.Errorf("no NUMA nodes found")
	}
	return ids, nil
}
// memoryInfo holds the memory counters of a single NUMA node, in bytes.
type memoryInfo struct {
	total uint64
	free  uint64
}

// getMemoryInfoForNode reads /sys/devices/system/node/node<nodeID>/meminfo
// and returns the node's MemTotal and MemFree values converted to bytes.
//
// Lines in the per-node file are prefixed with the node identity
// ("Node 0 MemTotal:  65843828 kB"), so the value is taken from the field
// that follows the "MemTotal:" / "MemFree:" label rather than from a fixed
// column. Counters that are absent from the file are left at zero.
//
// An error is returned if the file cannot be opened or read, or if a
// recognised counter carries a value that is not an unsigned integer.
func getMemoryInfoForNode(nodeID int) (*memoryInfo, error) {
	file, err := os.Open(fmt.Sprintf("/sys/devices/system/node/node%d/meminfo", nodeID))
	if err != nil {
		return nil, err
	}
	defer file.Close()

	info := &memoryInfo{}
	scanner := bufio.NewScanner(file)
	for scanner.Scan() {
		fields := strings.Fields(scanner.Text())
		// Locate the label wherever it sits on the line; the value is the
		// field immediately after it.
		for i := 0; i+1 < len(fields); i++ {
			var dst *uint64
			switch fields[i] {
			case "MemTotal:":
				dst = &info.total
			case "MemFree:":
				dst = &info.free
			default:
				continue
			}
			kb, err := strconv.ParseUint(fields[i+1], 10, 64)
			if err != nil {
				return nil, fmt.Errorf("parsing %s value %q for node %d: %w", fields[i], fields[i+1], nodeID, err)
			}
			*dst = kb * 1024 // Convert from KB to bytes
			break
		}
	}
	if err := scanner.Err(); err != nil {
		return nil, err
	}
	return info, nil
}
// getCPUsForNode returns the CPU IDs listed in
// /sys/devices/system/node/node<nodeID>/cpulist. The file content is trimmed
// of surrounding whitespace and handed to parseCPUList; a read failure is
// returned unchanged.
func getCPUsForNode(nodeID int) ([]int, error) {
	path := fmt.Sprintf("/sys/devices/system/node/node%d/cpulist", nodeID)
	raw, err := os.ReadFile(path)
	if err != nil {
		return nil, err
	}
	cpuList := strings.TrimSpace(string(raw))
	return parseCPUList(cpuList)
}
// parseCPUList expands a CPU list such as "0-3,8,10-11" into the individual
// CPU IDs it denotes, in the order they appear.
//
// The list is a comma-separated sequence of single IDs and inclusive
// "start-end" ranges; whitespace around an element is ignored. An empty (or
// all-whitespace) list yields an empty result and no error.
//
// An error is returned, and no CPUs are reported, when an element is empty,
// is not a non-negative integer, or is a range whose end is below its start.
// Malformed input is rejected rather than guessed at so that garbage never
// turns into a spurious "CPU 0" entry.
func parseCPUList(list string) ([]int, error) {
	var cpus []int
	if strings.TrimSpace(list) == "" {
		return cpus, nil
	}

	for _, part := range strings.Split(list, ",") {
		part = strings.TrimSpace(part)

		// A single CPU is treated as a range of length one.
		lo, hi := part, part
		if dash := strings.Index(part, "-"); dash >= 0 {
			lo, hi = part[:dash], part[dash+1:]
		}

		start, err := strconv.Atoi(lo)
		if err != nil {
			return nil, fmt.Errorf("invalid CPU list entry %q: %w", part, err)
		}
		end, err := strconv.Atoi(hi)
		if err != nil {
			return nil, fmt.Errorf("invalid CPU list entry %q: %w", part, err)
		}
		if start < 0 || end < start {
			return nil, fmt.Errorf("invalid CPU range %q", part)
		}

		for cpu := start; cpu <= end; cpu++ {
			cpus = append(cpus, cpu)
		}
	}
	return cpus, nil
}
// getDistancesForNode reads /sys/devices/system/node/node<nodeID>/distance
// and returns its whitespace-separated values in file order.
//
// numNodes is accepted for the benefit of existing callers but is not
// consulted: the length of the result is whatever the file contains.
//
// An error is returned if the file cannot be read or if any entry is not an
// unsigned 32-bit integer.
func getDistancesForNode(nodeID int, numNodes int) ([]uint32, error) {
	// os.ReadFile opens and closes the file itself, so no separate handle
	// is needed.
	data, err := os.ReadFile(fmt.Sprintf("/sys/devices/system/node/node%d/distance", nodeID))
	if err != nil {
		return nil, err
	}

	fields := strings.Fields(string(data))
	distances := make([]uint32, len(fields))
	for i, field := range fields {
		val, err := strconv.ParseUint(field, 10, 32)
		if err != nil {
			return nil, fmt.Errorf("parsing distance %q for node %d: %w", field, nodeID, err)
		}
		distances[i] = uint32(val)
	}
	return distances, nil
}
// detectFromLibNuma fills topology using libnuma instead of sysfs.
//
// It fails if libnuma reported NUMA as unavailable or if no nodes are
// configured. Otherwise every node up to numa_max_node that is set in
// numa_all_nodes_ptr gets a NodeInfo carrying its memory size, free memory
// and CPU list, and topology.CPUToNodeMap is updated for each CPU found.
func detectFromLibNuma(topology *Topology) error {
	initNuma()
	if numaInitErr != nil {
		return numaInitErr
	}

	numNodes := int(C.numa_num_configured_nodes())
	if numNodes <= 0 {
		return fmt.Errorf("no NUMA nodes configured")
	}
	topology.NumNodes = numNodes

	maxNode := int(C.numa_max_node())
	for nodeID := 0; nodeID <= maxNode; nodeID++ {
		if C.numa_bitmask_isbitset(C.numa_all_nodes_ptr, C.uint(nodeID)) == 0 {
			continue
		}
		nodeInfo := &NodeInfo{
			ID: NodeID(nodeID),
		}

		// numa_node_size returns a negative value on failure; converting
		// that straight to uint64 would report an enormous size, so the
		// fields are only filled in on success.
		var freeMem C.long
		if totalMem := C.numa_node_size(C.int(nodeID), &freeMem); totalMem >= 0 {
			nodeInfo.TotalMemory = uint64(totalMem)
			if freeMem >= 0 {
				nodeInfo.FreeMemory = uint64(freeMem)
			}
		}

		nodeInfo.CPUs = libNumaCPUsForNode(nodeID)
		for _, cpu := range nodeInfo.CPUs {
			topology.CPUToNodeMap[cpu] = NodeID(nodeID)
		}

		topology.Nodes[NodeID(nodeID)] = nodeInfo
	}
	return nil
}

// libNumaCPUsForNode returns the CPUs libnuma associates with nodeID, or nil
// if the lookup fails. It lives in its own function so the temporary cpumask
// is released after each node instead of piling up until the caller returns.
func libNumaCPUsForNode(nodeID int) []int {
	cpuMask := C.numa_allocate_cpumask()
	if cpuMask == nil {
		return nil
	}
	defer C.numa_free_cpumask(cpuMask)

	if C.numa_node_to_cpus(C.int(nodeID), cpuMask) != 0 {
		return nil
	}

	var cpus []int
	maxCPU := int(C.numa_num_configured_cpus())
	for cpu := 0; cpu < maxCPU; cpu++ {
		if C.numa_bitmask_isbitset(cpuMask, C.uint(cpu)) != 0 {
			cpus = append(cpus, cpu)
		}
	}
	return cpus
}
// getCurrentNodeImpl returns the NUMA node of the CPU reported in the
// "processor" field (field 39) of /proc/self/stat.
//
// The second field of that file is the command name in parentheses and may
// itself contain spaces or parentheses, so the remaining fields are located
// relative to the last ')' rather than by splitting the whole line.
//
// NOTE(review): /proc/self/stat appears to describe the process as a whole
// rather than the OS thread running the calling goroutine — confirm whether
// a per-thread source is wanted here.
//
// An error is returned if the file cannot be read or parsed, or if the CPU
// is not present in the topology returned by GetTopology.
func getCurrentNodeImpl() (NodeID, error) {
	data, err := os.ReadFile("/proc/self/stat")
	if err != nil {
		return 0, fmt.Errorf("failed to read /proc/self/stat: %w", err)
	}

	stat := string(data)
	commEnd := strings.LastIndex(stat, ")")
	if commEnd < 0 {
		return 0, fmt.Errorf("unexpected /proc/self/stat format")
	}

	// fields[0] is field 3 (state), so field 39 sits at index 39-3.
	const processorIdx = 39 - 3
	fields := strings.Fields(stat[commEnd+1:])
	if len(fields) <= processorIdx {
		return 0, fmt.Errorf("unexpected /proc/self/stat format")
	}

	cpu, err := strconv.Atoi(fields[processorIdx])
	if err != nil {
		return 0, fmt.Errorf("failed to parse CPU: %w", err)
	}

	topology := GetTopology()
	node, ok := topology.GetNodeForCPU(cpu)
	if !ok {
		return 0, fmt.Errorf("CPU %d not found in topology", cpu)
	}
	return node, nil
}
// setPreferredNodeImpl switches the calling thread's memory policy to
// MPOL_PREFERRED for the given node and returns a handle recording that node.
//
// The nodemask is a single unsigned long, so only node IDs that fit in one
// machine word are accepted; anything else is rejected up front instead of
// silently producing an empty mask. The maxnode argument is the mask width
// plus one because the kernel examines maxnode-1 bits.
//
// NOTE(review): the policy applies to the OS thread making the call; callers
// presumably need runtime.LockOSThread for it to follow the goroutine —
// confirm against the call sites.
//
// Errors from get_mempolicy / set_mempolicy carry the C errno.
func setPreferredNodeImpl(node NodeID) (*PreferredNode, error) {
	initNuma()
	if numaInitErr != nil {
		return nil, numaInitErr
	}

	const maskBits = 8 * unsafe.Sizeof(C.ulong(0))
	if n := int64(node); n < 0 || n >= int64(maskBits) {
		return nil, fmt.Errorf("NUMA node %d out of range [0, %d)", node, maskBits)
	}
	maxNode := C.ulong(maskBits + 1)

	// Query the current policy first. The result is not stored (reverting
	// resets to MPOL_DEFAULT), but a failure here still aborts the change.
	var oldMode C.int
	var oldMask C.ulong
	if ret, errno := C.get_mempolicy(&oldMode, &oldMask, maxNode, nil, 0); ret < 0 {
		return nil, fmt.Errorf("get_mempolicy failed: %v", errno)
	}

	// Set preferred node
	var newMask C.ulong = 1 << C.ulong(node)
	if ret, errno := C.set_mempolicy(MPOL_PREFERRED, &newMask, maxNode); ret < 0 {
		return nil, fmt.Errorf("set_mempolicy failed: %v", errno)
	}
	return &PreferredNode{nodeID: node}, nil
}
// revertPreferredNodeImpl resets the calling thread's memory policy to
// MPOL_DEFAULT. The handle p is not consulted: whatever policy was active
// before the preferred node was set is not restored.
//
// On failure the returned error carries the C errno rather than the bare -1
// return value.
func revertPreferredNodeImpl(p *PreferredNode) error {
	if ret, errno := C.set_mempolicy(MPOL_DEFAULT, nil, 0); ret < 0 {
		return fmt.Errorf("set_mempolicy failed: %v", errno)
	}
	return nil
}
// setMemoryPolicyImpl applies the given memory policy, restricted to nodes,
// to the calling thread via set_mempolicy.
//
// policy is translated to the matching MPOL_* mode; an unrecognised value is
// an error. nodes is folded into a single-word nodemask, so every node ID
// must fit in one unsigned long; out-of-range IDs are rejected rather than
// dropped. maxnode is passed as the mask width plus one because the kernel
// examines maxnode-1 bits — passing highest-node+1 would lose the highest
// requested node.
//
// On failure the returned error carries the C errno.
func setMemoryPolicyImpl(policy MemoryPolicy, nodes []NodeID) error {
	var mode int
	switch policy {
	case MPDefault:
		mode = MPOL_DEFAULT
	case MPBind:
		mode = MPOL_BIND
	case MPPreferred:
		mode = MPOL_PREFERRED
	case MPInterleave:
		mode = MPOL_INTERLEAVE
	default:
		return fmt.Errorf("unknown memory policy: %d", policy)
	}

	// Build nodemask
	const maskBits = 8 * unsafe.Sizeof(C.ulong(0))
	var mask C.ulong
	for _, node := range nodes {
		n := int64(node)
		if n < 0 || n >= int64(maskBits) {
			return fmt.Errorf("NUMA node %d out of range [0, %d)", node, maskBits)
		}
		mask |= 1 << C.ulong(n)
	}
	maxNode := C.ulong(maskBits + 1)

	if ret, errno := C.set_mempolicy(C.int(mode), &mask, maxNode); ret < 0 {
		return fmt.Errorf("set_mempolicy failed: %v", errno)
	}
	return nil
}
// allocateOnNodeImpl returns a size-byte buffer and makes a best-effort
// attempt to bind its memory to the given NUMA node with mbind(MPOL_BIND).
//
// The buffer comes from the Go heap (make), not from mmap. Binding is
// advisory: if mbind fails, or the node does not fit in a single-word
// nodemask, the plain buffer is returned without error. A zero size returns
// an empty buffer without touching mbind, and a negative size is an error.
//
// NOTE(review): mbind requires a page-aligned address, which a Go heap
// allocation does not guarantee, so the bind may be rejected for small
// buffers — confirm whether a page-aligned allocation is wanted.
func allocateOnNodeImpl(size int, node NodeID) ([]byte, error) {
	if size < 0 {
		return nil, fmt.Errorf("invalid allocation size %d", size)
	}
	buf := make([]byte, size)

	const maskBits = 8 * unsafe.Sizeof(C.ulong(0))
	n := int64(node)
	if size == 0 || n < 0 || n >= int64(maskBits) {
		// Nothing to bind, or the node cannot be expressed in the mask:
		// fall back to the regular allocation.
		return buf, nil
	}

	// maxnode is the mask width plus one because the kernel examines
	// maxnode-1 bits; node+1 would exclude the very node being requested.
	mask := C.ulong(1) << C.ulong(n)
	// Binding is best-effort, so the result is deliberately ignored.
	_ = C.mbind(unsafe.Pointer(&buf[0]), C.ulong(size), MPOL_BIND, &mask, C.ulong(maskBits+1), MPOL_MF_STRICT)
	return buf, nil
}
// scheduleOnNodeImpl runs fn on the calling goroutine while that goroutine is
// locked to its OS thread, and always returns nil.
//
// The cpu argument is currently not used: no CPU affinity is applied, the
// thread is only locked for the duration of fn and unlocked afterwards.
func scheduleOnNodeImpl(cpu int, fn func()) error {
	runtime.LockOSThread()
	// Release the thread again once fn has finished, including when it
	// panics.
	defer runtime.UnlockOSThread()

	fn()
	return nil
}
// getPreferredNodeForCurrentThreadImpl reports the node the calling thread's
// memory policy points at.
//
// The policy mode and nodemask are fetched with get_mempolicy(flags=0, addr
// NULL). The node has to be read from the nodemask: passing an address
// without MPOL_F_ADDR is rejected by the kernel, and MPOL_F_NODE on its own
// does not return the preferred node.
//
// Result:
//   - NodeID(0) if get_mempolicy fails;
//   - the node of the current CPU (via getCurrentNodeImpl, zero value if that
//     fails) when the policy is MPOL_DEFAULT or the nodemask is empty;
//   - otherwise the lowest-numbered node set in the nodemask.
func getPreferredNodeForCurrentThreadImpl() NodeID {
	const maskBits = 8 * unsafe.Sizeof(C.ulong(0))

	var mode C.int
	var mask C.ulong
	// maxnode is the mask width plus one because the kernel examines
	// maxnode-1 bits.
	if ret := C.get_mempolicy(&mode, &mask, C.ulong(maskBits+1), nil, 0); ret < 0 {
		return NodeID(0)
	}

	if mode == MPOL_DEFAULT || mask == 0 {
		// No explicit node: fall back to the node of the current CPU. The
		// error is dropped on purpose because this function has no error
		// return; the zero NodeID is reported in that case.
		currentNode, _ := getCurrentNodeImpl()
		return currentNode
	}

	// mask is non-zero here, so the loop terminates at the lowest set bit.
	node := 0
	for mask&1 == 0 {
		mask >>= 1
		node++
	}
	return NodeID(node)
}
// PinThreadToNode validates the given NUMA node and locks the calling
// goroutine to its current OS thread.
//
// It returns an error if NUMA is unavailable, if the node is not in the
// topology, or if the node has no CPUs. No CPU affinity is applied: the
// implementation is deliberately simplified for portability, and a full
// version would use the sched_setaffinity syscall. Pair a successful call
// with UnpinThread.
func PinThreadToNode(node NodeID) error {
	if initNuma(); numaInitErr != nil {
		return numaInitErr
	}

	info, found := GetTopology().GetNode(node)
	switch {
	case !found:
		return fmt.Errorf("NUMA node %d not found", node)
	case len(info.CPUs) == 0:
		return fmt.Errorf("NUMA node %d has no CPUs", node)
	}

	runtime.LockOSThread()
	return nil
}
// UnpinThread undoes the OS-thread lock taken by PinThreadToNode by calling
// runtime.UnlockOSThread for the current goroutine.
func UnpinThread() {
	runtime.UnlockOSThread()
}
// RunOnNode calls fn after PinThreadToNode has succeeded for node, and calls
// UnpinThread once fn has finished. If pinning fails, fn is not run and the
// pinning error is returned; otherwise the result is nil.
func RunOnNode(node NodeID, fn func()) error {
	err := PinThreadToNode(node)
	if err == nil {
		defer UnpinThread()
		fn()
	}
	return err
}