//go:build linux // +build linux /* Copyright 2024 The GoStor Authors All rights reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ // Package iouring provides an io_uring-based backing store for high-performance // asynchronous I/O operations on Linux 5.1+ systems. package iouring import ( "fmt" "os" "runtime" "sync" "sync/atomic" "syscall" "unsafe" log "github.com/sirupsen/logrus" "github.com/gostor/gotgt/pkg/api" "github.com/gostor/gotgt/pkg/scsi" ) const ( IoUringBackingStorage = "iouring" // Default queue depth for io_uring DefaultQueueDepth = 4096 // Minimum kernel version required (5.1) MinKernelMajor = 5 MinKernelMinor = 1 ) // io_uring constants (from linux/io_uring.h) const ( IORING_SETUP_IOPOLL = 1 << 0 IORING_SETUP_SQPOLL = 1 << 1 IORING_SETUP_SQ_AFF = 1 << 2 IORING_SETUP_CQSIZE = 1 << 3 IORING_SETUP_CLAMP = 1 << 4 IORING_SETUP_ATTACH_WQ = 1 << 5 IORING_SETUP_R_DISABLED = 1 << 6 IORING_FSYNC_DATASYNC = 1 << 0 IORING_TIMEOUT_ABS = 1 << 0 IORING_OFF_SQ_RING = 0 IORING_OFF_CQ_RING = 0x8000000 IORING_OFF_SQES = 0x10000000 IORING_OP_NOP = 0 IORING_OP_READV = 1 IORING_OP_WRITEV = 2 IORING_OP_FSYNC = 3 IORING_OP_READ_FIXED = 4 IORING_OP_WRITE_FIXED = 5 IORING_OP_POLL_ADD = 6 IORING_OP_POLL_REMOVE = 7 IORING_OP_SYNC_FILE_RANGE = 8 IORING_OP_SENDMSG = 9 IORING_OP_RECVMSG = 10 IORING_OP_TIMEOUT = 11 IORING_OP_TIMEOUT_REMOVE = 12 IORING_OP_ACCEPT = 13 IORING_OP_ASYNC_CANCEL = 14 IORING_OP_LINK_TIMEOUT = 15 IORING_OP_CONNECT = 16 IORING_OP_FALLOCATE = 17 IORING_OP_OPENAT = 18 IORING_OP_CLOSE = 19 IORING_OP_FILES_UPDATE = 20 IORING_OP_STATX = 21 IORING_OP_READ = 22 IORING_OP_WRITE = 23 IORING_OP_FADVISE = 24 IORING_OP_MADVISE = 25 IORING_OP_SEND = 26 IORING_OP_RECV = 27 IORING_OP_OPENAT2 = 28 IORING_OP_EPOLL_CTL = 29 IORING_OP_SPLICE = 30 IORING_OP_PROVIDE_BUFFERS = 31 IORING_OP_REMOVE_BUFFERS = 32 IORING_OP_TEE = 33 IORING_OP_SHUTDOWN = 34 IORING_OP_RENAMEAT = 35 IORING_OP_UNLINKAT = 36 IORING_OP_MKDIRAT = 37 IORING_OP_SYMLINKAT = 38 IORING_OP_LINKAT = 39 IORING_OP_MSG_RING = 40 IORING_OP_FSETXATTR = 41 IORING_OP_SETXATTR = 42 IORING_OP_FGETXATTR = 43 IORING_OP_GETXATTR = 44 IORING_OP_SOCKET = 45 IORING_OP_URING_CMD = 46 IORING_OP_SEND_ZC = 47 IORING_OP_SENDMSG_ZC = 48 IORING_CQE_F_BUFFER = 1 << 0 IORING_CQE_F_MORE = 1 << 1 ) // io_uring structures // Note: These are simplified structures for the operations we need type ioUring struct { fd int sq *ioUringSq cq *ioUringCq flags uint32 ringSize int } type ioUringSq struct { head *uint32 tail *uint32 ringMask *uint32 ringEntries *uint32 flags *uint32 dropped *uint32 array *uint32 sqes []ioSqringEntry } type ioUringCq struct { head *uint32 tail *uint32 ringMask *uint32 ringEntries *uint32 overflow *uint32 cqes []ioCqringEntry } type ioSqringEntry struct { opcode uint8 flags uint8 ioprio uint16 fd int32 off uint64 addr uint64 len uint32 userData uint64 } type ioCqringEntry struct { userData uint64 res int32 flags uint32 } type ioUringParams struct { sqEntries uint32 cqEntries uint32 flags uint32 sqThreadCPU uint32 sqThreadIdle uint32 features uint32 wqFd uint32 resv [3]uint32 sqOff ioSqringOffsets cqOff ioCqringOffsets } type ioSqringOffsets struct { head uint32 tail uint32 ringMask uint32 ringEntries uint32 flags uint32 dropped uint32 array uint32 resv1 uint32 resv2 uint64 } type ioCqringOffsets struct { head uint32 tail uint32 ringMask uint32 ringEntries uint32 overflow uint32 cqes uint32 flags uint32 resv1 uint32 resv2 uint64 } type ioUringCqe struct { userData uint64 res int32 flags uint32 } var ioUringEnabled = false func init() { if isKernelVersionSupported() { ioUringEnabled = true scsi.RegisterBackingStore(IoUringBackingStorage, newIOUringBackingStore) log.Info("io_uring backing store registered (kernel supports io_uring)") } else { log.Info("io_uring backing store not available (requires Linux 5.1+)") } } func isKernelVersionSupported() bool { var uname syscall.Utsname if err := syscall.Uname(&uname); err != nil { return false } // Parse kernel version (simplified) // Format is typically "5.15.0-generic" major := int(uname.Release[0] - '0') minor := int(uname.Release[2] - '0') if major > MinKernelMajor { return true } if major == MinKernelMajor && minor >= MinKernelMinor { return true } return false } // IOUringBackingStore implements BackingStore using io_uring type IOUringBackingStore struct { scsi.BaseBackingStore file *os.File ring *ioUring queueDepth int // Synchronization submitMu sync.Mutex // Statistics opsSubmitted uint64 opsCompleted uint64 } func newIOUringBackingStore() (api.BackingStore, error) { return &IOUringBackingStore{ BaseBackingStore: scsi.BaseBackingStore{ Name: IoUringBackingStorage, DataSize: 0, OflagsSupported: 0, }, queueDepth: DefaultQueueDepth, }, nil } // Open opens the backing file and initializes io_uring func (bs *IOUringBackingStore) Open(dev *api.SCSILu, path string) error { var mode os.FileMode finfo, err := os.Stat(path) if err != nil { return err } mode = finfo.Mode() f, err := os.OpenFile(path, os.O_RDWR|syscall.O_DIRECT, os.ModePerm) if err != nil { // Try without O_DIRECT if not supported f, err = os.OpenFile(path, os.O_RDWR, os.ModePerm) if err != nil { return err } } if (mode & os.ModeDevice) != 0 { pos, err := f.Seek(0, os.SEEK_END) if err != nil { f.Close() return err } bs.DataSize = uint64(pos) } else { bs.DataSize = uint64(finfo.Size()) } bs.file = f // Initialize io_uring ring, err := bs.initIOUring() if err != nil { f.Close() return fmt.Errorf("failed to initialize io_uring: %v", err) } bs.ring = ring log.Infof("io_uring backing store opened: %s (queue depth: %d)", path, bs.queueDepth) return nil } func (bs *IOUringBackingStore) initIOUring() (*ioUring, error) { params := &ioUringParams{} // Setup io_uring fd, _, errno := syscall.Syscall(425, // __NR_io_uring_setup uintptr(bs.queueDepth), uintptr(unsafe.Pointer(params)), 0) if errno != 0 { return nil, fmt.Errorf("io_uring_setup failed: %v", errno) } ring := &ioUring{ fd: int(fd), ringSize: int(params.sqEntries), flags: params.flags, } // Map the submission queue ring sqRingSize := params.sqOff.array + params.sqEntries*uint32(unsafe.Sizeof(uint32(0))) cqRingSize := params.cqOff.cqes + params.cqEntries*uint32(unsafe.Sizeof(ioCqringEntry{})) if params.features&1 != 0 { // IORING_FEAT_SINGLE_MMAP if cqRingSize > sqRingSize { sqRingSize = cqRingSize } cqRingSize = sqRingSize } // mmap submission queue sqPtr, _, errno := syscall.Syscall6(syscall.SYS_MMAP, 0, uintptr(sqRingSize), syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_SHARED|syscall.MAP_POPULATE, uintptr(fd), uintptr(IORING_OFF_SQ_RING)) if errno != 0 { syscall.Close(int(fd)) return nil, fmt.Errorf("mmap sq ring failed: %v", errno) } sqBase := sqPtr // mmap completion queue (if not single mmap) var cqPtr uintptr if params.features&1 != 0 { cqPtr = sqPtr } else { cqPtr, _, errno = syscall.Syscall6(syscall.SYS_MMAP, 0, uintptr(cqRingSize), syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_SHARED|syscall.MAP_POPULATE, uintptr(fd), uintptr(IORING_OFF_CQ_RING)) if errno != 0 { syscall.Syscall(syscall.SYS_MUNMAP, sqPtr, uintptr(sqRingSize), 0) syscall.Close(int(fd)) return nil, fmt.Errorf("mmap cq ring failed: %v", errno) } } cqBase := cqPtr // mmap SQEs sqeSize := uint32(unsafe.Sizeof(ioSqringEntry{})) sqePtr, _, errno := syscall.Syscall6(syscall.SYS_MMAP, 0, uintptr(uint32(bs.queueDepth)*sqeSize), syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_SHARED|syscall.MAP_POPULATE, uintptr(fd), uintptr(IORING_OFF_SQES)) if errno != 0 { syscall.Syscall(syscall.SYS_MUNMAP, sqPtr, uintptr(sqRingSize), 0) if cqPtr != sqPtr { syscall.Syscall(syscall.SYS_MUNMAP, cqPtr, uintptr(cqRingSize), 0) } syscall.Close(int(fd)) return nil, fmt.Errorf("mmap sqes failed: %v", errno) } // Setup submission queue sq := &ioUringSq{ head: (*uint32)(unsafe.Pointer(sqBase + uintptr(params.sqOff.head))), tail: (*uint32)(unsafe.Pointer(sqBase + uintptr(params.sqOff.tail))), ringMask: (*uint32)(unsafe.Pointer(sqBase + uintptr(params.sqOff.ringMask))), ringEntries: (*uint32)(unsafe.Pointer(sqBase + uintptr(params.sqOff.ringEntries))), flags: (*uint32)(unsafe.Pointer(sqBase + uintptr(params.sqOff.flags))), dropped: (*uint32)(unsafe.Pointer(sqBase + uintptr(params.sqOff.dropped))), array: (*uint32)(unsafe.Pointer(sqBase + uintptr(params.sqOff.array))), sqes: make([]ioSqringEntry, bs.queueDepth), } copy(unsafe.Slice((*ioSqringEntry)(unsafe.Pointer(sqePtr)), bs.queueDepth), sq.sqes) // Setup completion queue cq := &ioUringCq{ head: (*uint32)(unsafe.Pointer(cqBase + uintptr(params.cqOff.head))), tail: (*uint32)(unsafe.Pointer(cqBase + uintptr(params.cqOff.tail))), ringMask: (*uint32)(unsafe.Pointer(cqBase + uintptr(params.cqOff.ringMask))), ringEntries: (*uint32)(unsafe.Pointer(cqBase + uintptr(params.cqOff.ringEntries))), overflow: (*uint32)(unsafe.Pointer(cqBase + uintptr(params.cqOff.overflow))), cqes: make([]ioCqringEntry, params.cqEntries), } copy(unsafe.Slice((*ioCqringEntry)(unsafe.Pointer(cqBase+uintptr(params.cqOff.cqes))), params.cqEntries), cq.cqes) ring.sq = sq ring.cq = cq return ring, nil } // Close closes the backing file and io_uring func (bs *IOUringBackingStore) Close(dev *api.SCSILu) error { if bs.ring != nil { bs.closeIOUring() bs.ring = nil } if bs.file != nil { return bs.file.Close() } return nil } func (bs *IOUringBackingStore) closeIOUring() { if bs.ring != nil && bs.ring.fd >= 0 { syscall.Close(bs.ring.fd) } } // Init initializes the backing store func (bs *IOUringBackingStore) Init(dev *api.SCSILu, Opts string) error { return nil } // Exit exits the backing store func (bs *IOUringBackingStore) Exit(dev *api.SCSILu) error { return nil } // Size returns the size of the backing store func (bs *IOUringBackingStore) Size(dev *api.SCSILu) uint64 { return bs.DataSize } // Read reads data from the backing file using io_uring func (bs *IOUringBackingStore) Read(offset, tl int64) ([]byte, error) { if bs.file == nil { return nil, fmt.Errorf("backing store is not open") } buf := make([]byte, tl) // Prepare read operation bs.submitMu.Lock() defer bs.submitMu.Unlock() // Get next SQE sqe := bs.getSqe() if sqe == nil { // Ring is full, submit pending operations first if err := bs.submit(); err != nil { return nil, err } sqe = bs.getSqe() if sqe == nil { return nil, fmt.Errorf("io_uring queue full") } } // Setup read operation *sqe = ioSqringEntry{ opcode: IORING_OP_READ, fd: int32(bs.file.Fd()), off: uint64(offset), addr: uint64(uintptr(unsafe.Pointer(&buf[0]))), len: uint32(tl), userData: 1, // 1 = read operation } // Submit and wait for completion if err := bs.submitAndWait(1); err != nil { return nil, err } // Get completion cqe, err := bs.getCqe() if err != nil { return nil, err } if cqe.res < 0 { return nil, fmt.Errorf("read failed: %d", cqe.res) } atomic.AddUint64(&bs.opsCompleted, 1) return buf[:cqe.res], nil } // Write writes data to the backing file using io_uring func (bs *IOUringBackingStore) Write(wbuf []byte, offset int64) error { if bs.file == nil { return fmt.Errorf("backing store is not open") } bs.submitMu.Lock() defer bs.submitMu.Unlock() // Get next SQE sqe := bs.getSqe() if sqe == nil { if err := bs.submit(); err != nil { return err } sqe = bs.getSqe() if sqe == nil { return fmt.Errorf("io_uring queue full") } } // Setup write operation *sqe = ioSqringEntry{ opcode: IORING_OP_WRITE, fd: int32(bs.file.Fd()), off: uint64(offset), addr: uint64(uintptr(unsafe.Pointer(&wbuf[0]))), len: uint32(len(wbuf)), userData: 2, // 2 = write operation } // Submit and wait for completion if err := bs.submitAndWait(1); err != nil { return err } // Get completion cqe, err := bs.getCqe() if err != nil { return err } if cqe.res < 0 { return fmt.Errorf("write failed: %d", cqe.res) } if cqe.res != int32(len(wbuf)) { return fmt.Errorf("short write: %d != %d", cqe.res, len(wbuf)) } atomic.AddUint64(&bs.opsCompleted, 1) return nil } // DataSync syncs data to disk using io_uring func (bs *IOUringBackingStore) DataSync(offset, tl int64) error { if bs.file == nil { return fmt.Errorf("backing store is not open") } bs.submitMu.Lock() defer bs.submitMu.Unlock() sqe := bs.getSqe() if sqe == nil { if err := bs.submit(); err != nil { return err } sqe = bs.getSqe() if sqe == nil { return fmt.Errorf("io_uring queue full") } } *sqe = ioSqringEntry{ opcode: IORING_OP_FSYNC, fd: int32(bs.file.Fd()), len: IORING_FSYNC_DATASYNC, userData: 3, // 3 = fsync operation } if err := bs.submitAndWait(1); err != nil { return err } cqe, err := bs.getCqe() if err != nil { return err } if cqe.res < 0 { return fmt.Errorf("fsync failed: %d", cqe.res) } atomic.AddUint64(&bs.opsCompleted, 1) return nil } // DataAdvise provides advice about data access patterns func (bs *IOUringBackingStore) DataAdvise(offset, length int64, advise uint32) error { if bs.file == nil { return fmt.Errorf("backing store is not open") } // Use posix_fadvise via syscall _, _, errno := syscall.Syscall6(syscall.SYS_FADVISE64, uintptr(bs.file.Fd()), uintptr(offset), uintptr(length), uintptr(advise), 0, 0) if errno != 0 { return errno } return nil } // Unmap is a no-op for file-based storage func (bs *IOUringBackingStore) Unmap([]api.UnmapBlockDescriptor) error { return nil } // getSqe gets the next available submission queue entry func (bs *IOUringBackingStore) getSqe() *ioSqringEntry { sq := bs.ring.sq tail := atomic.LoadUint32(sq.tail) next := tail + 1 if next-atomic.LoadUint32(sq.head) > uint32(bs.ring.ringSize) { return nil // Queue is full } idx := tail & *sq.ringMask return &sq.sqes[idx] } // submit submits pending SQEs to the kernel func (bs *IOUringBackingStore) submit() error { if bs.ring == nil { return fmt.Errorf("io_uring not initialized") } // Update tail atomic.StoreUint32(bs.ring.sq.tail, atomic.LoadUint32(bs.ring.sq.tail)+1) // Submit using io_uring_enter syscall _, _, errno := syscall.Syscall6(426, // __NR_io_uring_enter uintptr(bs.ring.fd), uintptr(1), // submit 1 operation 0, // min complete 0, // flags 0, 0) if errno != 0 { return fmt.Errorf("io_uring_enter failed: %v", errno) } atomic.AddUint64(&bs.opsSubmitted, 1) return nil } // submitAndWait submits operations and waits for completions func (bs *IOUringBackingStore) submitAndWait(minComplete uint32) error { if bs.ring == nil { return fmt.Errorf("io_uring not initialized") } // Update tail atomic.StoreUint32(bs.ring.sq.tail, atomic.LoadUint32(bs.ring.sq.tail)+1) // Submit and wait _, _, errno := syscall.Syscall6(426, // __NR_io_uring_enter uintptr(bs.ring.fd), uintptr(1), // submit 1 operation uintptr(minComplete), // min complete 0, // flags 0, 0) if errno != 0 { return fmt.Errorf("io_uring_enter failed: %v", errno) } return nil } // getCqe gets a completion queue entry func (bs *IOUringBackingStore) getCqe() (*ioCqringEntry, error) { cq := bs.ring.cq // Wait for completion for atomic.LoadUint32(cq.head) == atomic.LoadUint32(cq.tail) { // Spin-wait for completion runtime.Gosched() } head := atomic.LoadUint32(cq.head) idx := head & *cq.ringMask cqe := &cq.cqes[idx] // Update head atomic.StoreUint32(cq.head, head+1) return cqe, nil } // Stats returns io_uring statistics func (bs *IOUringBackingStore) Stats() (submitted, completed uint64) { return atomic.LoadUint64(&bs.opsSubmitted), atomic.LoadUint64(&bs.opsCompleted) } // Available returns true if io_uring is available on this system func Available() bool { return ioUringEnabled }