LearnGO/go/pkg/mod/github.com/cilium/ebpf@v0.11.0/perf/ring.go
2024-09-19 21:38:24 -04:00

275 lines
6.5 KiB
Go

package perf
import (
"errors"
"fmt"
"io"
"math"
"os"
"runtime"
"sync/atomic"
"unsafe"
"github.com/cilium/ebpf/internal/unix"
)
// perfEventRing is a page of metadata followed by
// a variable number of pages which form a ring buffer.
//
// It owns the perf event fd and the mmap'd region, and embeds a
// ringReader (forward or reverse, chosen at creation time) that
// provides the Read/loadHead/writeTail operations over the data pages.
type perfEventRing struct {
	// fd is the perf event file descriptor; set to -1 by Close.
	fd int
	// cpu is the CPU this ring receives events for.
	cpu int
	// mmap covers the metadata page plus the data pages; nil after Close.
	mmap []byte
	ringReader
}
// newPerfEventRing opens a perf event for cpu and mmaps its ring buffer.
//
// watermark must be smaller than perCPUBuffer. An overwritable ring is
// mapped read-only (its consumer never publishes a tail) and is read
// backwards via reverseReader.
//
// A finalizer is installed as a safety net; callers should still call
// Close explicitly to release the fd and mapping promptly.
func newPerfEventRing(cpu, perCPUBuffer, watermark int, overwritable bool) (*perfEventRing, error) {
	if watermark >= perCPUBuffer {
		return nil, errors.New("watermark must be smaller than perCPUBuffer")
	}

	fd, err := createPerfEvent(cpu, watermark, overwritable)
	if err != nil {
		return nil, err
	}

	if err := unix.SetNonblock(fd, true); err != nil {
		unix.Close(fd)
		return nil, err
	}

	// Overwritable rings are mapped read-only; writeTail is a no-op for
	// them, so user space never needs PROT_WRITE on the mapping.
	protections := unix.PROT_READ
	if !overwritable {
		protections |= unix.PROT_WRITE
	}

	mmap, err := unix.Mmap(fd, 0, perfBufferSize(perCPUBuffer), protections, unix.MAP_SHARED)
	if err != nil {
		unix.Close(fd)
		// Wrap with %w (not %v) so callers can unwrap the underlying
		// errno; this matches the wrapping style of createPerfEvent.
		return nil, fmt.Errorf("can't mmap: %w", err)
	}

	// This relies on the fact that we allocate an extra metadata page,
	// and that the struct is smaller than an OS page.
	// This use of unsafe.Pointer isn't explicitly sanctioned by the
	// documentation, since a byte is smaller than sampledPerfEvent.
	meta := (*unix.PerfEventMmapPage)(unsafe.Pointer(&mmap[0]))

	var reader ringReader
	if overwritable {
		reader = newReverseReader(meta, mmap[meta.Data_offset:meta.Data_offset+meta.Data_size])
	} else {
		reader = newForwardReader(meta, mmap[meta.Data_offset:meta.Data_offset+meta.Data_size])
	}

	ring := &perfEventRing{
		fd:         fd,
		cpu:        cpu,
		mmap:       mmap,
		ringReader: reader,
	}
	runtime.SetFinalizer(ring, (*perfEventRing).Close)

	return ring, nil
}
// perfBufferSize returns a valid mmap buffer size for use with
// perf_event_open: one metadata page followed by 2^n data pages, where
// the data pages are the smallest power-of-two page count that can hold
// perCPUBuffer bytes.
func perfBufferSize(perCPUBuffer int) int {
	pageSize := os.Getpagesize()

	// Smallest whole number of pages that fits perCPUBuffer.
	nPages := (perCPUBuffer + pageSize - 1) / pageSize

	// Round up to the nearest power-of-two number of pages using
	// integer arithmetic. The previous math.Pow/math.Log2 formulation
	// went through float64, which can misround for very large counts.
	// nPages == 0 (perCPUBuffer <= 0) is kept as-is, matching the old
	// behaviour of returning a single metadata page.
	if nPages > 0 {
		pow := 1
		for pow < nPages && pow <= math.MaxInt/2 {
			pow *= 2
		}
		nPages = pow
	}

	// Add one page for the metadata header.
	nPages += 1

	return nPages * pageSize
}
// Close releases the perf event fd and the mmap'd region.
func (ring *perfEventRing) Close() {
	// Clear the finalizer installed by newPerfEventRing first, so the
	// runtime doesn't invoke Close again on an already-released ring.
	runtime.SetFinalizer(ring, nil)

	// Errors are deliberately ignored: there is no meaningful recovery
	// from a failed close or munmap at this point.
	_ = unix.Close(ring.fd)
	_ = unix.Munmap(ring.mmap)

	// Poison the fields so later misuse fails fast.
	ring.fd = -1
	ring.mmap = nil
}
// createPerfEvent opens a software BPF_OUTPUT perf event bound to the
// given CPU and returns its file descriptor.
//
// watermark is the wakeup threshold in bytes; zero is promoted to one.
// If overwritable is true the event is configured for backward writes
// (PerfBitWriteBackward), which is what an overwritable ring requires.
func createPerfEvent(cpu, watermark int, overwritable bool) (int, error) {
	// A zero watermark would never wake readers; use the smallest
	// meaningful threshold instead.
	if watermark == 0 {
		watermark = 1
	}

	flags := unix.PerfBitWatermark
	if overwritable {
		flags |= unix.PerfBitWriteBackward
	}

	attr := unix.PerfEventAttr{
		Type:        unix.PERF_TYPE_SOFTWARE,
		Config:      unix.PERF_COUNT_SW_BPF_OUTPUT,
		Bits:        uint64(flags),
		Sample_type: unix.PERF_SAMPLE_RAW,
		Wakeup:      uint32(watermark),
	}
	// The kernel uses Size to version-check the attr struct.
	attr.Size = uint32(unsafe.Sizeof(attr))

	fd, err := unix.PerfEventOpen(&attr, -1, cpu, -1, unix.PERF_FLAG_FD_CLOEXEC)
	if err == nil {
		return fd, nil
	}
	return -1, fmt.Errorf("can't create perf event: %w", err)
}
// ringReader abstracts consuming a perf ring buffer, with one
// implementation for regular (forward) rings and one for overwritable
// (reverse) rings.
type ringReader interface {
	// loadHead refreshes the cached producer position from the mmap'd
	// metadata page.
	loadHead()
	// size returns the length in bytes of the data portion of the ring.
	size() int
	// writeTail publishes the consumer position back to the kernel.
	// It is a no-op for overwritable rings.
	writeTail()
	// Read copies up to len(p) bytes of unread data into p. It returns
	// io.EOF once the reader has caught up with the producer.
	Read(p []byte) (int, error)
}
// forwardReader consumes a regular perf ring buffer from tail to head.
type forwardReader struct {
	// meta is the mmap'd metadata page shared with the kernel.
	meta *unix.PerfEventMmapPage
	// head is the cached producer position; tail is the consumer
	// position, committed back to the kernel via writeTail.
	head, tail uint64
	// mask maps the monotonically increasing head/tail counters to ring
	// offsets; valid because the ring length is a power of two.
	mask uint64
	// ring is the data portion of the mmap'd region.
	ring []byte
}
// newForwardReader returns a reader over ring that consumes data in
// write order, starting from the kernel's current tail position.
func newForwardReader(meta *unix.PerfEventMmapPage, ring []byte) *forwardReader {
	rr := &forwardReader{
		meta: meta,
		ring: ring,
		// The ring's capacity is always a power of two, so this mask
		// turns the free-running counters into valid ring offsets.
		mask: uint64(cap(ring) - 1),
	}
	rr.head = atomic.LoadUint64(&meta.Data_head)
	rr.tail = atomic.LoadUint64(&meta.Data_tail)
	return rr
}
// loadHead refreshes the cached producer position from the shared
// metadata page. The atomic load pairs with the kernel's store.
func (rr *forwardReader) loadHead() {
	rr.head = atomic.LoadUint64(&rr.meta.Data_head)
}
// size returns the length in bytes of the ring's data area.
func (rr *forwardReader) size() int {
	return len(rr.ring)
}
// writeTail publishes the consumer position to the kernel.
func (rr *forwardReader) writeTail() {
	// Commit the new tail. This lets the kernel know that
	// the ring buffer has been consumed.
	atomic.StoreUint64(&rr.meta.Data_tail, rr.tail)
}
// Read copies up to len(p) bytes of unread ring data into p, advancing
// the local tail. It returns io.EOF when the tail catches up with the
// cached head; a read that wraps the ring end is truncated, so callers
// may need multiple calls to drain the buffer.
func (rr *forwardReader) Read(p []byte) (int, error) {
	offset := int(rr.tail & rr.mask)

	// Clamp to whichever is smallest: the caller's buffer, the bytes
	// left before the ring wraps, and the unread bytes available.
	n := len(p)
	if untilWrap := cap(rr.ring) - offset; untilWrap < n {
		n = untilWrap
	}
	if unread := int(rr.head - rr.tail); unread < n {
		n = unread
	}

	copy(p, rr.ring[offset:offset+n])
	rr.tail += uint64(n)

	if rr.tail == rr.head {
		return n, io.EOF
	}
	return n, nil
}
// reverseReader consumes an overwritable perf ring buffer, where the
// kernel writes samples backwards from head.
type reverseReader struct {
	// meta is the mmap'd metadata page shared with the kernel.
	meta *unix.PerfEventMmapPage
	// head is the position where the kernel last wrote data.
	head uint64
	// read is the position we read the next data from. Updated as reads are made.
	read uint64
	// tail is the end of the ring buffer. No reads must be made past it.
	tail uint64
	// mask maps positions to ring offsets; the ring length is a power
	// of two.
	mask uint64
	// ring is the data portion of the mmap'd region.
	ring []byte
}
// newReverseReader returns a reader over an overwritable ring,
// positioned at the kernel's current write head.
func newReverseReader(meta *unix.PerfEventMmapPage, ring []byte) *reverseReader {
	rr := &reverseReader{
		meta: meta,
		ring: ring,
		// cap(ring) is a power of two, so the mask converts positions
		// into in-bounds ring offsets.
		mask: uint64(cap(ring) - 1),
	}
	// Snapshot head/read/tail from the current kernel write position.
	rr.loadHead()
	return rr
}
// loadHead snapshots the kernel's current write position and discards
// any samples that were not consumed since the previous snapshot.
func (rr *reverseReader) loadHead() {
	// The diagram below represents an overwritable perf ring buffer:
	//
	//    head     read                            tail
	//     |        |                               |
	//     V        V                               V
	// +---+--------+------------+---------+--------+
	// |   |H-D....D|H-C........C|H-B.....B|H-A....A|
	// +---+--------+------------+---------+--------+
	// <--Write from right to left
	//                     Read from left to right-->
	// (H means header)
	//
	// The buffer is read left to right beginning from head to tail.
	// [head, read) is the read portion of the buffer, [read, tail) the unread one.
	// read is adjusted as we progress through the buffer.

	// Avoid reading sample D multiple times by discarding unread samples C, B, A.
	// The previous head becomes the new upper bound for reads.
	rr.tail = rr.head

	// Get the new head and start reading from it. Note that with
	// PerfBitWriteBackward the kernel decrements Data_head, so head is
	// numerically <= tail and (tail - head) is the readable span.
	rr.head = atomic.LoadUint64(&rr.meta.Data_head)
	rr.read = rr.head

	if rr.tail-rr.head > uint64(cap(rr.ring)) {
		// ring has been fully written, only permit at most cap(rr.ring)
		// bytes to be read.
		rr.tail = rr.head + uint64(cap(rr.ring))
	}
}
// size returns the length in bytes of the ring's data area.
func (rr *reverseReader) size() int {
	return len(rr.ring)
}
// writeTail is a no-op for overwritable rings.
func (rr *reverseReader) writeTail() {
	// We do not care about tail for over writable perf buffer.
	// So, this function is noop.
}
// Read copies up to len(p) bytes from the [read, tail) window into p,
// advancing read. It returns io.EOF once read reaches tail; a read that
// wraps the ring end is truncated, so callers may need multiple calls.
func (rr *reverseReader) Read(p []byte) (int, error) {
	offset := int(rr.read & rr.mask)

	// Clamp to whichever is smallest: the caller's buffer, the bytes
	// left before the ring wraps, and the unread bytes in the window.
	n := len(p)
	if untilWrap := cap(rr.ring) - offset; untilWrap < n {
		n = untilWrap
	}
	if unread := int(rr.tail - rr.read); unread < n {
		n = unread
	}

	copy(p, rr.ring[offset:offset+n])
	rr.read += uint64(n)

	if rr.read == rr.tail {
		return n, io.EOF
	}
	return n, nil
}