package perf

import (
	"errors"
	"fmt"
	"io"
	"math"
	"os"
	"runtime"
	"sync/atomic"
	"unsafe"

	"github.com/cilium/ebpf/internal/unix"
)

// perfEventRing is a page of metadata followed by
// a variable number of pages which form a ring buffer.
type perfEventRing struct {
	fd   int
	cpu  int
	mmap []byte
	ringReader
}

func newPerfEventRing(cpu, perCPUBuffer, watermark int, overwritable bool) (*perfEventRing, error) {
	if watermark >= perCPUBuffer {
		return nil, errors.New("watermark must be smaller than perCPUBuffer")
	}

	fd, err := createPerfEvent(cpu, watermark, overwritable)
	if err != nil {
		return nil, err
	}

	if err := unix.SetNonblock(fd, true); err != nil {
		unix.Close(fd)
		return nil, err
	}

	protections := unix.PROT_READ
	if !overwritable {
		protections |= unix.PROT_WRITE
	}

	mmap, err := unix.Mmap(fd, 0, perfBufferSize(perCPUBuffer), protections, unix.MAP_SHARED)
	if err != nil {
		unix.Close(fd)
		return nil, fmt.Errorf("can't mmap: %w", err)
	}

	// This relies on the fact that we allocate an extra metadata page,
	// and that the struct is smaller than an OS page.
	// This use of unsafe.Pointer isn't explicitly sanctioned by the
	// documentation, since a byte is smaller than PerfEventMmapPage.
	meta := (*unix.PerfEventMmapPage)(unsafe.Pointer(&mmap[0]))

	var reader ringReader
	if overwritable {
		reader = newReverseReader(meta, mmap[meta.Data_offset:meta.Data_offset+meta.Data_size])
	} else {
		reader = newForwardReader(meta, mmap[meta.Data_offset:meta.Data_offset+meta.Data_size])
	}

	ring := &perfEventRing{
		fd:         fd,
		cpu:        cpu,
		mmap:       mmap,
		ringReader: reader,
	}
	runtime.SetFinalizer(ring, (*perfEventRing).Close)

	return ring, nil
}
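
// An illustrative construction call (hypothetical values, not taken from
// this file): a ring on CPU 0 holding at least 8 KiB of data, waking up
// readers as soon as any data arrives, in normal (non-overwritable) mode:
//
//	ring, err := newPerfEventRing(0, 8192, 1, false)
//	if err != nil {
//		// handle error
//	}
//	defer ring.Close()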

// perfBufferSize returns a valid mmap buffer size for use with perf_event_open (1+2^n pages)
func perfBufferSize(perCPUBuffer int) int {
	pageSize := os.Getpagesize()

	// Smallest whole number of pages
	nPages := (perCPUBuffer + pageSize - 1) / pageSize

	// Round up to nearest power of two number of pages
	nPages = int(math.Pow(2, math.Ceil(math.Log2(float64(nPages)))))

	// Add one for metadata
	nPages += 1

	return nPages * pageSize
}
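
// For example, assuming 4 KiB pages: perCPUBuffer = 5000 needs
// ceil(5000/4096) = 2 data pages, which is already a power of two, plus
// one page of metadata, so perfBufferSize returns 3 * 4096 = 12288.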

func (ring *perfEventRing) Close() {
	runtime.SetFinalizer(ring, nil)

	_ = unix.Close(ring.fd)
	_ = unix.Munmap(ring.mmap)

	ring.fd = -1
	ring.mmap = nil
}

func createPerfEvent(cpu, watermark int, overwritable bool) (int, error) {
	if watermark == 0 {
		watermark = 1
	}
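
	// Since PerfBitWatermark is set below, the kernel interprets Wakeup as
	// a byte watermark, so a watermark of 1 wakes up waiting readers as
	// soon as any data at all is available.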

	bits := unix.PerfBitWatermark
	if overwritable {
		bits |= unix.PerfBitWriteBackward
	}

	attr := unix.PerfEventAttr{
		Type:        unix.PERF_TYPE_SOFTWARE,
		Config:      unix.PERF_COUNT_SW_BPF_OUTPUT,
		Bits:        uint64(bits),
		Sample_type: unix.PERF_SAMPLE_RAW,
		Wakeup:      uint32(watermark),
	}

	attr.Size = uint32(unsafe.Sizeof(attr))
	fd, err := unix.PerfEventOpen(&attr, -1, cpu, -1, unix.PERF_FLAG_FD_CLOEXEC)
	if err != nil {
		return -1, fmt.Errorf("can't create perf event: %w", err)
	}
	return fd, nil
}

type ringReader interface {
	loadHead()
	size() int
	writeTail()
	Read(p []byte) (int, error)
}
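
// A sketch of how a consumer might drive a ringReader (illustrative only;
// the actual read loop lives outside this file):
//
//	rr.loadHead()              // observe the kernel's latest write position
//	for {
//		n, err := rr.Read(buf) // copy out as much as fits into buf
//		// process buf[:n] ...
//		if err == io.EOF {     // no unread data left
//			break
//		}
//	}
//	rr.writeTail()             // release consumed space back to the kernel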

type forwardReader struct {
	meta       *unix.PerfEventMmapPage
	head, tail uint64
	mask       uint64
	ring       []byte
}

func newForwardReader(meta *unix.PerfEventMmapPage, ring []byte) *forwardReader {
	return &forwardReader{
		meta: meta,
		head: atomic.LoadUint64(&meta.Data_head),
		tail: atomic.LoadUint64(&meta.Data_tail),
		// cap is always a power of two
		mask: uint64(cap(ring) - 1),
		ring: ring,
	}
}

func (rr *forwardReader) loadHead() {
	rr.head = atomic.LoadUint64(&rr.meta.Data_head)
}

func (rr *forwardReader) size() int {
	return len(rr.ring)
}

func (rr *forwardReader) writeTail() {
	// Commit the new tail. This lets the kernel know that
	// the ring buffer has been consumed.
	atomic.StoreUint64(&rr.meta.Data_tail, rr.tail)
}

func (rr *forwardReader) Read(p []byte) (int, error) {
	start := int(rr.tail & rr.mask)

	n := len(p)
	// Truncate if the read wraps in the ring buffer
	if remainder := cap(rr.ring) - start; n > remainder {
		n = remainder
	}

	// Truncate if there isn't enough data
	if remainder := int(rr.head - rr.tail); n > remainder {
		n = remainder
	}

	copy(p, rr.ring[start:start+n])
	rr.tail += uint64(n)

	if rr.tail == rr.head {
		return n, io.EOF
	}

	return n, nil
}
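
// To illustrate the two truncations above (hypothetical numbers, 4096 byte
// ring): with tail = 4090 and head = 4106, start is 4090 & 4095 = 4090 and
// 16 bytes are available. A Read with len(p) = 64 is truncated to 6 bytes
// (the space left before the ring wraps), so a second Read, now starting
// at offset 0, returns the remaining 10 bytes along with io.EOF.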

type reverseReader struct {
	meta *unix.PerfEventMmapPage
	// head is the position where the kernel last wrote data.
	head uint64
	// read is the position we read the next data from. Updated as reads are made.
	read uint64
	// tail is the end of the ring buffer. No reads must be made past it.
	tail uint64
	mask uint64
	ring []byte
}

func newReverseReader(meta *unix.PerfEventMmapPage, ring []byte) *reverseReader {
	rr := &reverseReader{
		meta: meta,
		mask: uint64(cap(ring) - 1),
		ring: ring,
	}
	rr.loadHead()
	return rr
}

func (rr *reverseReader) loadHead() {
	// The diagram below represents an overwritable perf ring buffer:
	//
	//    head     read                          tail
	//     |        |                             |
	//     V        V                             V
	// +---+--------+------------+---------+--------+
	// |   |H-D....D|H-C........C|H-B.....B|H-A....A|
	// +---+--------+------------+---------+--------+
	// <--Write from right to left
	//                     Read from left to right-->
	// (H means header)
	//
	// The buffer is read left to right beginning from head to tail.
	// [head, read) is the read portion of the buffer, [read, tail) the unread one.
	// read is adjusted as we progress through the buffer.

	// Avoid reading sample D multiple times by discarding unread samples C, B, A.
	rr.tail = rr.head

	// Get the new head and start reading from it.
	rr.head = atomic.LoadUint64(&rr.meta.Data_head)
	rr.read = rr.head

	if rr.tail-rr.head > uint64(cap(rr.ring)) {
		// ring has been fully written, only permit at most cap(rr.ring)
		// bytes to be read.
		rr.tail = rr.head + uint64(cap(rr.ring))
	}
}
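
// To illustrate the clamping above (hypothetical positions, 4096 byte
// ring): if the previous head was 9000 and the kernel has since written
// backward down to head = 4000, then tail - head = 5000 exceeds the ring
// size, so tail is clamped to 4000 + 4096 = 8096; anything past that has
// already been overwritten.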

func (rr *reverseReader) size() int {
	return len(rr.ring)
}

func (rr *reverseReader) writeTail() {
	// The kernel ignores the tail pointer for overwritable perf buffers,
	// so this function is a no-op.
}

func (rr *reverseReader) Read(p []byte) (int, error) {
	start := int(rr.read & rr.mask)

	n := len(p)
	// Truncate if the read wraps in the ring buffer
	if remainder := cap(rr.ring) - start; n > remainder {
		n = remainder
	}

	// Truncate if there isn't enough data
	if remainder := int(rr.tail - rr.read); n > remainder {
		n = remainder
	}

	copy(p, rr.ring[start:start+n])
	rr.read += uint64(n)

	if rr.read == rr.tail {
		return n, io.EOF
	}

	return n, nil
}