whatcanGOwrong
This commit is contained in:
@@ -0,0 +1,6 @@
|
||||
// Package perf allows interacting with Linux perf_events.
|
||||
//
|
||||
// BPF allows submitting custom perf_events to a ring-buffer set up
|
||||
// by userspace. This is very useful to push things like packet samples
|
||||
// from BPF to a daemon running in user space.
|
||||
package perf
|
||||
@@ -0,0 +1,458 @@
|
||||
package perf
|
||||
|
||||
import (
|
||||
"encoding/binary"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"runtime"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/cilium/ebpf"
|
||||
"github.com/cilium/ebpf/internal"
|
||||
"github.com/cilium/ebpf/internal/epoll"
|
||||
"github.com/cilium/ebpf/internal/unix"
|
||||
)
|
||||
|
||||
// ErrClosed is wrapped by the errors that Reader methods return once the
// Reader has been closed. It is the same value as os.ErrClosed.
var ErrClosed = os.ErrClosed

// errEOR is an internal sentinel reporting that a ring has no more records.
var errEOR = errors.New("end of ring")
|
||||
|
||||
// perfEventHeader must match 'struct perf_event_header' in <linux/perf_event.h>.
// Field order and widths define the decoded layout and must not be changed.
type perfEventHeader struct {
	Type uint32
	Misc uint16
	Size uint16
}

// perfEventHeaderSize is the encoded size of perfEventHeader in bytes.
var perfEventHeaderSize = binary.Size(perfEventHeader{})
|
||||
|
||||
func cpuForEvent(event *unix.EpollEvent) int {
|
||||
return int(event.Pad)
|
||||
}
|
||||
|
||||
// Record is a single entry read from a perf ring. It holds either a raw
// sample or a count of samples that were lost.
type Record struct {
	// CPU is the CPU on which this record was generated.
	CPU int

	// RawSample is the data submitted via bpf_perf_event_output.
	// Due to a kernel bug, it can contain between 0 and 7 bytes of trailing
	// garbage from the ring, depending on the length of the input sample.
	RawSample []byte

	// LostSamples is the number of samples which could not be output
	// because the ring buffer was full.
	LostSamples uint64
}
|
||||
|
||||
// Read a record from a reader and tag it as being from the given CPU.
|
||||
//
|
||||
// buf must be at least perfEventHeaderSize bytes long.
|
||||
func readRecord(rd io.Reader, rec *Record, buf []byte, overwritable bool) error {
|
||||
// Assert that the buffer is large enough.
|
||||
buf = buf[:perfEventHeaderSize]
|
||||
_, err := io.ReadFull(rd, buf)
|
||||
if errors.Is(err, io.EOF) {
|
||||
return errEOR
|
||||
} else if err != nil {
|
||||
return fmt.Errorf("read perf event header: %v", err)
|
||||
}
|
||||
|
||||
header := perfEventHeader{
|
||||
internal.NativeEndian.Uint32(buf[0:4]),
|
||||
internal.NativeEndian.Uint16(buf[4:6]),
|
||||
internal.NativeEndian.Uint16(buf[6:8]),
|
||||
}
|
||||
|
||||
switch header.Type {
|
||||
case unix.PERF_RECORD_LOST:
|
||||
rec.RawSample = rec.RawSample[:0]
|
||||
rec.LostSamples, err = readLostRecords(rd)
|
||||
return err
|
||||
|
||||
case unix.PERF_RECORD_SAMPLE:
|
||||
rec.LostSamples = 0
|
||||
// We can reuse buf here because perfEventHeaderSize > perfEventSampleSize.
|
||||
rec.RawSample, err = readRawSample(rd, buf, rec.RawSample)
|
||||
return err
|
||||
|
||||
default:
|
||||
return &unknownEventError{header.Type}
|
||||
}
|
||||
}
|
||||
|
||||
func readLostRecords(rd io.Reader) (uint64, error) {
|
||||
// lostHeader must match 'struct perf_event_lost in kernel sources.
|
||||
var lostHeader struct {
|
||||
ID uint64
|
||||
Lost uint64
|
||||
}
|
||||
|
||||
err := binary.Read(rd, internal.NativeEndian, &lostHeader)
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("can't read lost records header: %v", err)
|
||||
}
|
||||
|
||||
return lostHeader.Lost, nil
|
||||
}
|
||||
|
||||
// perfEventSample must match 'struct perf_event_sample' in kernel sources.
type perfEventSample struct {
	Size uint32
}

// perfEventSampleSize is the encoded size in bytes of the sample length
// prefix, which is a single uint32.
var perfEventSampleSize = binary.Size(uint32(0))
|
||||
|
||||
func readRawSample(rd io.Reader, buf, sampleBuf []byte) ([]byte, error) {
|
||||
buf = buf[:perfEventSampleSize]
|
||||
if _, err := io.ReadFull(rd, buf); err != nil {
|
||||
return nil, fmt.Errorf("read sample size: %w", err)
|
||||
}
|
||||
|
||||
sample := perfEventSample{
|
||||
internal.NativeEndian.Uint32(buf),
|
||||
}
|
||||
|
||||
var data []byte
|
||||
if size := int(sample.Size); cap(sampleBuf) < size {
|
||||
data = make([]byte, size)
|
||||
} else {
|
||||
data = sampleBuf[:size]
|
||||
}
|
||||
|
||||
if _, err := io.ReadFull(rd, data); err != nil {
|
||||
return nil, fmt.Errorf("read sample: %w", err)
|
||||
}
|
||||
return data, nil
|
||||
}
|
||||
|
||||
// Reader allows reading bpf_perf_event_output
|
||||
// from user space.
|
||||
type Reader struct {
|
||||
poller *epoll.Poller
|
||||
deadline time.Time
|
||||
|
||||
// mu protects read/write access to the Reader structure with the
|
||||
// exception of 'pauseFds', which is protected by 'pauseMu'.
|
||||
// If locking both 'mu' and 'pauseMu', 'mu' must be locked first.
|
||||
mu sync.Mutex
|
||||
|
||||
// Closing a PERF_EVENT_ARRAY removes all event fds
|
||||
// stored in it, so we keep a reference alive.
|
||||
array *ebpf.Map
|
||||
rings []*perfEventRing
|
||||
epollEvents []unix.EpollEvent
|
||||
epollRings []*perfEventRing
|
||||
eventHeader []byte
|
||||
|
||||
// pauseFds are a copy of the fds in 'rings', protected by 'pauseMu'.
|
||||
// These allow Pause/Resume to be executed independently of any ongoing
|
||||
// Read calls, which would otherwise need to be interrupted.
|
||||
pauseMu sync.Mutex
|
||||
pauseFds []int
|
||||
|
||||
paused bool
|
||||
overwritable bool
|
||||
}
|
||||
|
||||
// ReaderOptions control the behaviour of the user
// space reader.
type ReaderOptions struct {
	// Watermark is the number of written bytes required in any per CPU
	// buffer before Read will process data. Must be smaller than PerCPUBuffer.
	// The default is to start processing as soon as data is available.
	Watermark int
	// Overwritable marks the perf ring buffer as overwritable: once it is
	// full, the oldest event is overwritten by the newest.
	Overwritable bool
}
|
||||
|
||||
// NewReader creates a new reader with default options.
|
||||
//
|
||||
// array must be a PerfEventArray. perCPUBuffer gives the size of the
|
||||
// per CPU buffer in bytes. It is rounded up to the nearest multiple
|
||||
// of the current page size.
|
||||
func NewReader(array *ebpf.Map, perCPUBuffer int) (*Reader, error) {
|
||||
return NewReaderWithOptions(array, perCPUBuffer, ReaderOptions{})
|
||||
}
|
||||
|
||||
// NewReaderWithOptions creates a new reader with the given options.
|
||||
func NewReaderWithOptions(array *ebpf.Map, perCPUBuffer int, opts ReaderOptions) (pr *Reader, err error) {
|
||||
if perCPUBuffer < 1 {
|
||||
return nil, errors.New("perCPUBuffer must be larger than 0")
|
||||
}
|
||||
|
||||
var (
|
||||
fds []int
|
||||
nCPU = int(array.MaxEntries())
|
||||
rings = make([]*perfEventRing, 0, nCPU)
|
||||
pauseFds = make([]int, 0, nCPU)
|
||||
)
|
||||
|
||||
poller, err := epoll.New()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
defer func() {
|
||||
if err != nil {
|
||||
poller.Close()
|
||||
for _, fd := range fds {
|
||||
unix.Close(fd)
|
||||
}
|
||||
for _, ring := range rings {
|
||||
if ring != nil {
|
||||
ring.Close()
|
||||
}
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
// bpf_perf_event_output checks which CPU an event is enabled on,
|
||||
// but doesn't allow using a wildcard like -1 to specify "all CPUs".
|
||||
// Hence we have to create a ring for each CPU.
|
||||
for i := 0; i < nCPU; i++ {
|
||||
ring, err := newPerfEventRing(i, perCPUBuffer, opts.Watermark, opts.Overwritable)
|
||||
if errors.Is(err, unix.ENODEV) {
|
||||
// The requested CPU is currently offline, skip it.
|
||||
rings = append(rings, nil)
|
||||
pauseFds = append(pauseFds, -1)
|
||||
continue
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create perf ring for CPU %d: %v", i, err)
|
||||
}
|
||||
rings = append(rings, ring)
|
||||
pauseFds = append(pauseFds, ring.fd)
|
||||
|
||||
if err := poller.Add(ring.fd, i); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
array, err = array.Clone()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
pr = &Reader{
|
||||
array: array,
|
||||
rings: rings,
|
||||
poller: poller,
|
||||
deadline: time.Time{},
|
||||
epollEvents: make([]unix.EpollEvent, len(rings)),
|
||||
epollRings: make([]*perfEventRing, 0, len(rings)),
|
||||
eventHeader: make([]byte, perfEventHeaderSize),
|
||||
pauseFds: pauseFds,
|
||||
overwritable: opts.Overwritable,
|
||||
}
|
||||
if err = pr.Resume(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
runtime.SetFinalizer(pr, (*Reader).Close)
|
||||
return pr, nil
|
||||
}
|
||||
|
||||
// Close frees resources used by the reader.
|
||||
//
|
||||
// It interrupts calls to Read.
|
||||
//
|
||||
// Calls to perf_event_output from eBPF programs will return
|
||||
// ENOENT after calling this method.
|
||||
func (pr *Reader) Close() error {
|
||||
if err := pr.poller.Close(); err != nil {
|
||||
if errors.Is(err, os.ErrClosed) {
|
||||
return nil
|
||||
}
|
||||
return fmt.Errorf("close poller: %w", err)
|
||||
}
|
||||
|
||||
// Trying to poll will now fail, so Read() can't block anymore. Acquire the
|
||||
// lock so that we can clean up.
|
||||
pr.mu.Lock()
|
||||
defer pr.mu.Unlock()
|
||||
|
||||
for _, ring := range pr.rings {
|
||||
if ring != nil {
|
||||
ring.Close()
|
||||
}
|
||||
}
|
||||
pr.rings = nil
|
||||
pr.pauseFds = nil
|
||||
pr.array.Close()
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// SetDeadline controls how long Read and ReadInto will block waiting for samples.
|
||||
//
|
||||
// Passing a zero time.Time will remove the deadline. Passing a deadline in the
|
||||
// past will prevent the reader from blocking if there are no records to be read.
|
||||
func (pr *Reader) SetDeadline(t time.Time) {
|
||||
pr.mu.Lock()
|
||||
defer pr.mu.Unlock()
|
||||
|
||||
pr.deadline = t
|
||||
}
|
||||
|
||||
// Read the next record from the perf ring buffer.
|
||||
//
|
||||
// The function blocks until there are at least Watermark bytes in one
|
||||
// of the per CPU buffers. Records from buffers below the Watermark
|
||||
// are not returned.
|
||||
//
|
||||
// Records can contain between 0 and 7 bytes of trailing garbage from the ring
|
||||
// depending on the input sample's length.
|
||||
//
|
||||
// Calling Close interrupts the function.
|
||||
//
|
||||
// Returns os.ErrDeadlineExceeded if a deadline was set.
|
||||
func (pr *Reader) Read() (Record, error) {
|
||||
var r Record
|
||||
|
||||
return r, pr.ReadInto(&r)
|
||||
}
|
||||
|
||||
var errMustBePaused = fmt.Errorf("perf ringbuffer: must have been paused before reading overwritable buffer")
|
||||
|
||||
// ReadInto is like Read except that it allows reusing Record and associated buffers.
|
||||
func (pr *Reader) ReadInto(rec *Record) error {
|
||||
pr.mu.Lock()
|
||||
defer pr.mu.Unlock()
|
||||
|
||||
pr.pauseMu.Lock()
|
||||
defer pr.pauseMu.Unlock()
|
||||
|
||||
if pr.overwritable && !pr.paused {
|
||||
return errMustBePaused
|
||||
}
|
||||
|
||||
if pr.rings == nil {
|
||||
return fmt.Errorf("perf ringbuffer: %w", ErrClosed)
|
||||
}
|
||||
|
||||
for {
|
||||
if len(pr.epollRings) == 0 {
|
||||
// NB: The deferred pauseMu.Unlock will panic if Wait panics, which
|
||||
// might obscure the original panic.
|
||||
pr.pauseMu.Unlock()
|
||||
nEvents, err := pr.poller.Wait(pr.epollEvents, pr.deadline)
|
||||
pr.pauseMu.Lock()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Re-validate pr.paused since we dropped pauseMu.
|
||||
if pr.overwritable && !pr.paused {
|
||||
return errMustBePaused
|
||||
}
|
||||
|
||||
for _, event := range pr.epollEvents[:nEvents] {
|
||||
ring := pr.rings[cpuForEvent(&event)]
|
||||
pr.epollRings = append(pr.epollRings, ring)
|
||||
|
||||
// Read the current head pointer now, not every time
|
||||
// we read a record. This prevents a single fast producer
|
||||
// from keeping the reader busy.
|
||||
ring.loadHead()
|
||||
}
|
||||
}
|
||||
|
||||
// Start at the last available event. The order in which we
|
||||
// process them doesn't matter, and starting at the back allows
|
||||
// resizing epollRings to keep track of processed rings.
|
||||
err := pr.readRecordFromRing(rec, pr.epollRings[len(pr.epollRings)-1])
|
||||
if err == errEOR {
|
||||
// We've emptied the current ring buffer, process
|
||||
// the next one.
|
||||
pr.epollRings = pr.epollRings[:len(pr.epollRings)-1]
|
||||
continue
|
||||
}
|
||||
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
// Pause stops all notifications from this Reader.
|
||||
//
|
||||
// While the Reader is paused, any attempts to write to the event buffer from
|
||||
// BPF programs will return -ENOENT.
|
||||
//
|
||||
// Subsequent calls to Read will block until a call to Resume.
|
||||
func (pr *Reader) Pause() error {
|
||||
pr.pauseMu.Lock()
|
||||
defer pr.pauseMu.Unlock()
|
||||
|
||||
if pr.pauseFds == nil {
|
||||
return fmt.Errorf("%w", ErrClosed)
|
||||
}
|
||||
|
||||
for i := range pr.pauseFds {
|
||||
if err := pr.array.Delete(uint32(i)); err != nil && !errors.Is(err, ebpf.ErrKeyNotExist) {
|
||||
return fmt.Errorf("could't delete event fd for CPU %d: %w", i, err)
|
||||
}
|
||||
}
|
||||
|
||||
pr.paused = true
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Resume allows this perf reader to emit notifications.
|
||||
//
|
||||
// Subsequent calls to Read will block until the next event notification.
|
||||
func (pr *Reader) Resume() error {
|
||||
pr.pauseMu.Lock()
|
||||
defer pr.pauseMu.Unlock()
|
||||
|
||||
if pr.pauseFds == nil {
|
||||
return fmt.Errorf("%w", ErrClosed)
|
||||
}
|
||||
|
||||
for i, fd := range pr.pauseFds {
|
||||
if fd == -1 {
|
||||
continue
|
||||
}
|
||||
|
||||
if err := pr.array.Put(uint32(i), uint32(fd)); err != nil {
|
||||
return fmt.Errorf("couldn't put event fd %d for CPU %d: %w", fd, i, err)
|
||||
}
|
||||
}
|
||||
|
||||
pr.paused = false
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// NB: Has to be preceded by a call to ring.loadHead.
|
||||
func (pr *Reader) readRecordFromRing(rec *Record, ring *perfEventRing) error {
|
||||
defer ring.writeTail()
|
||||
|
||||
rec.CPU = ring.cpu
|
||||
err := readRecord(ring, rec, pr.eventHeader, pr.overwritable)
|
||||
if pr.overwritable && (errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF)) {
|
||||
return errEOR
|
||||
}
|
||||
return err
|
||||
}
|
||||
|
||||
// unknownEventError reports a perf record whose header type is not handled.
type unknownEventError struct {
	eventType uint32
}

// Error implements the error interface.
func (e *unknownEventError) Error() string {
	msg := fmt.Sprintf("unknown event type: %d", e.eventType)
	return msg
}
|
||||
|
||||
// IsUnknownEvent returns true if the error occurred
|
||||
// because an unknown event was submitted to the perf event ring.
|
||||
func IsUnknownEvent(err error) bool {
|
||||
var uee *unknownEventError
|
||||
return errors.As(err, &uee)
|
||||
}
|
||||
@@ -0,0 +1,634 @@
|
||||
package perf
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/binary"
|
||||
"errors"
|
||||
"fmt"
|
||||
"math"
|
||||
"os"
|
||||
"syscall"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/cilium/ebpf"
|
||||
"github.com/cilium/ebpf/asm"
|
||||
"github.com/cilium/ebpf/internal"
|
||||
"github.com/cilium/ebpf/internal/testutils"
|
||||
"github.com/cilium/ebpf/internal/testutils/fdtrace"
|
||||
"github.com/cilium/ebpf/internal/unix"
|
||||
|
||||
qt "github.com/frankban/quicktest"
|
||||
)
|
||||
|
||||
// readTimeout bounds how long tests wait for a Read to complete or to stay
// blocked.
var readTimeout = 250 * time.Millisecond
|
||||
|
||||
func TestMain(m *testing.M) {
|
||||
fdtrace.TestMain(m)
|
||||
}
|
||||
|
||||
func TestPerfReader(t *testing.T) {
|
||||
events := perfEventArray(t)
|
||||
|
||||
rd, err := NewReader(events, 4096)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer rd.Close()
|
||||
|
||||
outputSamples(t, events, 5)
|
||||
|
||||
checkRecord(t, rd)
|
||||
|
||||
rd.SetDeadline(time.Now().Add(4 * time.Millisecond))
|
||||
_, err = rd.Read()
|
||||
qt.Assert(t, errors.Is(err, os.ErrDeadlineExceeded), qt.IsTrue, qt.Commentf("expected os.ErrDeadlineExceeded"))
|
||||
}
|
||||
|
||||
func TestReaderSetDeadline(t *testing.T) {
|
||||
events := perfEventArray(t)
|
||||
|
||||
rd, err := NewReader(events, 4096)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer rd.Close()
|
||||
|
||||
rd.SetDeadline(time.Now().Add(-time.Second))
|
||||
if _, err := rd.Read(); !errors.Is(err, os.ErrDeadlineExceeded) {
|
||||
t.Error("Expected os.ErrDeadlineExceeded from first Read, got:", err)
|
||||
}
|
||||
if _, err := rd.Read(); !errors.Is(err, os.ErrDeadlineExceeded) {
|
||||
t.Error("Expected os.ErrDeadlineExceeded from second Read, got:", err)
|
||||
}
|
||||
}
|
||||
|
||||
func outputSamples(tb testing.TB, events *ebpf.Map, sampleSizes ...byte) {
|
||||
prog := outputSamplesProg(tb, events, sampleSizes...)
|
||||
|
||||
ret, _, err := prog.Test(internal.EmptyBPFContext)
|
||||
testutils.SkipIfNotSupported(tb, err)
|
||||
if err != nil {
|
||||
tb.Fatal(err)
|
||||
}
|
||||
|
||||
if errno := syscall.Errno(-int32(ret)); errno != 0 {
|
||||
tb.Fatal("Expected 0 as return value, got", errno)
|
||||
}
|
||||
}
|
||||
|
||||
// outputSamplesProg creates a program which submits a series of samples to a PerfEventArray.
|
||||
//
|
||||
// The format of each sample is:
|
||||
//
|
||||
// index: 0 1 2 3 ... size - 1
|
||||
// content: size id 0xff 0xff ... 0xff [padding]
|
||||
//
|
||||
// padding is an implementation detail of the perf buffer and 1-7 bytes long. The
|
||||
// contents are undefined.
|
||||
func outputSamplesProg(tb testing.TB, events *ebpf.Map, sampleSizes ...byte) *ebpf.Program {
|
||||
tb.Helper()
|
||||
|
||||
// Requires at least 4.9 (0515e5999a46 "bpf: introduce BPF_PROG_TYPE_PERF_EVENT program type")
|
||||
testutils.SkipOnOldKernel(tb, "4.9", "perf events support")
|
||||
|
||||
const bpfFCurrentCPU = 0xffffffff
|
||||
|
||||
var maxSampleSize byte
|
||||
for _, sampleSize := range sampleSizes {
|
||||
if sampleSize < 2 {
|
||||
tb.Fatalf("Sample size %d is too small to contain size and counter", sampleSize)
|
||||
}
|
||||
if sampleSize > maxSampleSize {
|
||||
maxSampleSize = sampleSize
|
||||
}
|
||||
}
|
||||
|
||||
// Fill a buffer on the stack, and stash context somewhere
|
||||
insns := asm.Instructions{
|
||||
asm.LoadImm(asm.R0, ^int64(0), asm.DWord),
|
||||
asm.Mov.Reg(asm.R9, asm.R1),
|
||||
}
|
||||
|
||||
bufDwords := int(maxSampleSize/8) + 1
|
||||
for i := 0; i < bufDwords; i++ {
|
||||
insns = append(insns,
|
||||
asm.StoreMem(asm.RFP, int16(i+1)*-8, asm.R0, asm.DWord),
|
||||
)
|
||||
}
|
||||
|
||||
for i, sampleSize := range sampleSizes {
|
||||
insns = append(insns,
|
||||
// Restore stashed context.
|
||||
asm.Mov.Reg(asm.R1, asm.R9),
|
||||
// map
|
||||
asm.LoadMapPtr(asm.R2, events.FD()),
|
||||
// flags
|
||||
asm.LoadImm(asm.R3, bpfFCurrentCPU, asm.DWord),
|
||||
// buffer
|
||||
asm.Mov.Reg(asm.R4, asm.RFP),
|
||||
asm.Add.Imm(asm.R4, int32(bufDwords*-8)),
|
||||
// buffer[0] = size
|
||||
asm.StoreImm(asm.R4, 0, int64(sampleSize), asm.Byte),
|
||||
// buffer[1] = i
|
||||
asm.StoreImm(asm.R4, 1, int64(i&math.MaxUint8), asm.Byte),
|
||||
// size
|
||||
asm.Mov.Imm(asm.R5, int32(sampleSize)),
|
||||
asm.FnPerfEventOutput.Call(),
|
||||
)
|
||||
}
|
||||
|
||||
insns = append(insns, asm.Return())
|
||||
|
||||
prog, err := ebpf.NewProgram(&ebpf.ProgramSpec{
|
||||
License: "GPL",
|
||||
Type: ebpf.XDP,
|
||||
Instructions: insns,
|
||||
})
|
||||
if err != nil {
|
||||
tb.Fatal(err)
|
||||
}
|
||||
tb.Cleanup(func() { prog.Close() })
|
||||
|
||||
return prog
|
||||
}
|
||||
|
||||
func checkRecord(tb testing.TB, rd *Reader) (id int) {
|
||||
tb.Helper()
|
||||
|
||||
rec, err := rd.Read()
|
||||
qt.Assert(tb, err, qt.IsNil)
|
||||
|
||||
qt.Assert(tb, rec.CPU >= 0, qt.IsTrue, qt.Commentf("Record has invalid CPU number"))
|
||||
|
||||
size := int(rec.RawSample[0])
|
||||
qt.Assert(tb, len(rec.RawSample) >= size, qt.IsTrue, qt.Commentf("RawSample is at least size bytes"))
|
||||
|
||||
for i, v := range rec.RawSample[2:size] {
|
||||
qt.Assert(tb, v, qt.Equals, byte(0xff), qt.Commentf("filler at position %d should match", i+2))
|
||||
}
|
||||
|
||||
// padding is ignored since it's value is undefined.
|
||||
|
||||
return int(rec.RawSample[1])
|
||||
}
|
||||
|
||||
func TestPerfReaderLostSample(t *testing.T) {
|
||||
// To generate a lost sample perf record:
|
||||
//
|
||||
// 1. Fill the perf ring buffer almost completely, with the output_large program.
|
||||
// The buffer is sized in number of pages, which are architecture dependant.
|
||||
//
|
||||
// 2. Write an extra event that doesn't fit in the space remaining.
|
||||
//
|
||||
// 3. Write a smaller event that does fit, with output_single program.
|
||||
// Lost sample records are generated opportunistically, when the kernel
|
||||
// is writing an event and realizes that there were events lost previously.
|
||||
//
|
||||
// The event size is hardcoded in the test BPF programs, there's no way
|
||||
// to parametrize it without rebuilding the programs.
|
||||
//
|
||||
// The event size needs to be selected so that, for any page size, there are at least
|
||||
// 48 bytes left in the perf ring page after filling it with a whole number of events:
|
||||
//
|
||||
// - PERF_RECORD_LOST: 8 (perf_event_header) + 16 (PERF_RECORD_LOST)
|
||||
//
|
||||
// - output_single: 8 (perf_event_header) + 4 (size) + 5 (payload) + 7 (padding to 64bits)
|
||||
//
|
||||
// By selecting an event size of the form 2^n + 2^(n+1), for any page size 2^(n+m), m >= 0,
|
||||
// the number of bytes left, x, after filling a page with a whole number of events is:
|
||||
//
|
||||
// 2^(n+m) 2^n * 2^m
|
||||
// x = 2^n * frac(---------------) <=> x = 2^n * frac(---------------)
|
||||
// 2^n + 2^(n+1) 2^n + 2^n * 2
|
||||
//
|
||||
// 2^n * 2^m
|
||||
// <=> x = 2^n * frac(---------------)
|
||||
// 2^n * (1 + 2)
|
||||
//
|
||||
// 2^m
|
||||
// <=> x = 2^n * frac(-----)
|
||||
// 3
|
||||
//
|
||||
// 1 2
|
||||
// <=> x = 2^n * - or x = 2^n * -
|
||||
// 3 3
|
||||
//
|
||||
// Selecting n = 6, we have:
|
||||
//
|
||||
// x = 64 or x = 128, no matter the page size 2^(6+m)
|
||||
//
|
||||
// event size = 2^6 + 2^7 = 192
|
||||
//
|
||||
// Accounting for perf headers, output_large uses a 180 byte payload:
|
||||
//
|
||||
// 8 (perf_event_header) + 4 (size) + 180 (payload)
|
||||
const (
|
||||
eventSize = 192
|
||||
)
|
||||
|
||||
var (
|
||||
pageSize = os.Getpagesize()
|
||||
maxEvents = (pageSize / eventSize)
|
||||
)
|
||||
if remainder := pageSize % eventSize; remainder != 64 && remainder != 128 {
|
||||
// Page size isn't 2^(6+m), m >= 0
|
||||
t.Fatal("unsupported page size:", pageSize)
|
||||
}
|
||||
|
||||
var sampleSizes []byte
|
||||
// Fill the ring with the maximum number of output_large events that will fit,
|
||||
// and generate a lost event by writing an additional event.
|
||||
for i := 0; i < maxEvents+1; i++ {
|
||||
sampleSizes = append(sampleSizes, 180)
|
||||
}
|
||||
|
||||
// Generate a small event to trigger the lost record
|
||||
sampleSizes = append(sampleSizes, 5)
|
||||
|
||||
events := perfEventArray(t)
|
||||
|
||||
rd, err := NewReader(events, pageSize)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer rd.Close()
|
||||
|
||||
outputSamples(t, events, sampleSizes...)
|
||||
|
||||
for range sampleSizes {
|
||||
record, err := rd.Read()
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if record.RawSample == nil && record.LostSamples != 1 {
|
||||
t.Fatal("Expected a record with LostSamples 1, got", record.LostSamples)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestPerfReaderOverwritable(t *testing.T) {
|
||||
// Smallest buffer size.
|
||||
pageSize := os.Getpagesize()
|
||||
|
||||
const sampleSize = math.MaxUint8
|
||||
|
||||
// Account for perf header (8) and size (4), align to 8 bytes as perf does.
|
||||
realSampleSize := internal.Align(sampleSize+8+4, 8)
|
||||
maxEvents := pageSize / realSampleSize
|
||||
|
||||
var sampleSizes []byte
|
||||
for i := 0; i < maxEvents; i++ {
|
||||
sampleSizes = append(sampleSizes, sampleSize)
|
||||
}
|
||||
// Append an extra sample that will overwrite the first sample.
|
||||
sampleSizes = append(sampleSizes, sampleSize)
|
||||
|
||||
events := perfEventArray(t)
|
||||
|
||||
rd, err := NewReaderWithOptions(events, pageSize, ReaderOptions{Overwritable: true})
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer rd.Close()
|
||||
|
||||
_, err = rd.Read()
|
||||
qt.Assert(t, err, qt.ErrorIs, errMustBePaused)
|
||||
|
||||
outputSamples(t, events, sampleSizes...)
|
||||
|
||||
qt.Assert(t, rd.Pause(), qt.IsNil)
|
||||
rd.SetDeadline(time.Now())
|
||||
|
||||
nextID := maxEvents
|
||||
for i := 0; i < maxEvents; i++ {
|
||||
id := checkRecord(t, rd)
|
||||
qt.Assert(t, id, qt.Equals, nextID)
|
||||
nextID--
|
||||
}
|
||||
}
|
||||
|
||||
func TestPerfReaderOverwritableEmpty(t *testing.T) {
|
||||
events := perfEventArray(t)
|
||||
rd, err := NewReaderWithOptions(events, os.Getpagesize(), ReaderOptions{Overwritable: true})
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer rd.Close()
|
||||
|
||||
err = rd.Pause()
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
rd.SetDeadline(time.Now().Add(4 * time.Millisecond))
|
||||
_, err = rd.Read()
|
||||
qt.Assert(t, errors.Is(err, os.ErrDeadlineExceeded), qt.IsTrue, qt.Commentf("expected os.ErrDeadlineExceeded"))
|
||||
|
||||
err = rd.Resume()
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestPerfReaderClose(t *testing.T) {
|
||||
events := perfEventArray(t)
|
||||
|
||||
rd, err := NewReader(events, 4096)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer rd.Close()
|
||||
|
||||
errs := make(chan error, 1)
|
||||
waiting := make(chan struct{})
|
||||
go func() {
|
||||
close(waiting)
|
||||
_, err := rd.Read()
|
||||
errs <- err
|
||||
}()
|
||||
|
||||
<-waiting
|
||||
|
||||
// Close should interrupt Read
|
||||
if err := rd.Close(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
select {
|
||||
case <-errs:
|
||||
case <-time.After(time.Second):
|
||||
t.Fatal("Close doesn't interrupt Read")
|
||||
}
|
||||
|
||||
// And we should be able to call it multiple times
|
||||
if err := rd.Close(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if _, err := rd.Read(); err == nil {
|
||||
t.Fatal("Read on a closed PerfReader doesn't return an error")
|
||||
}
|
||||
}
|
||||
|
||||
func TestCreatePerfEvent(t *testing.T) {
|
||||
fd, err := createPerfEvent(0, 1, false)
|
||||
if err != nil {
|
||||
t.Fatal("Can't create perf event:", err)
|
||||
}
|
||||
unix.Close(fd)
|
||||
}
|
||||
|
||||
func TestReadRecord(t *testing.T) {
|
||||
var buf bytes.Buffer
|
||||
|
||||
err := binary.Write(&buf, internal.NativeEndian, &perfEventHeader{})
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
var rec Record
|
||||
err = readRecord(&buf, &rec, make([]byte, perfEventHeaderSize), false)
|
||||
if !IsUnknownEvent(err) {
|
||||
t.Error("readRecord should return unknown event error, got", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestPause(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
events := perfEventArray(t)
|
||||
|
||||
rd, err := NewReader(events, 4096)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer rd.Close()
|
||||
|
||||
// Reader is already unpaused by default. It should be idempotent.
|
||||
if err = rd.Resume(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// Write a sample. The reader should read it.
|
||||
prog := outputSamplesProg(t, events, 5)
|
||||
ret, _, err := prog.Test(internal.EmptyBPFContext)
|
||||
testutils.SkipIfNotSupported(t, err)
|
||||
if err != nil || ret != 0 {
|
||||
t.Fatal("Can't write sample")
|
||||
}
|
||||
if _, err := rd.Read(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// Pause. No notification should trigger.
|
||||
if err = rd.Pause(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
errChan := make(chan error, 1)
|
||||
go func() {
|
||||
// Read one notification then send any errors and exit.
|
||||
_, err := rd.Read()
|
||||
errChan <- err
|
||||
}()
|
||||
ret, _, err = prog.Test(internal.EmptyBPFContext)
|
||||
if err == nil && ret == 0 {
|
||||
t.Fatal("Unexpectedly wrote sample while paused")
|
||||
} // else Success
|
||||
select {
|
||||
case err := <-errChan:
|
||||
// Failure: Pause was unsuccessful.
|
||||
t.Fatalf("received notification on paused reader: %s", err)
|
||||
case <-time.After(readTimeout):
|
||||
// Success
|
||||
}
|
||||
|
||||
// Pause should be idempotent.
|
||||
if err = rd.Pause(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// Resume. Now notifications should continue.
|
||||
if err = rd.Resume(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
ret, _, err = prog.Test(internal.EmptyBPFContext)
|
||||
if err != nil || ret != 0 {
|
||||
t.Fatal("Can't write sample")
|
||||
}
|
||||
select {
|
||||
case err := <-errChan:
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
} // else Success
|
||||
case <-time.After(readTimeout):
|
||||
t.Fatal("timed out waiting for notification after resume")
|
||||
}
|
||||
|
||||
if err = rd.Close(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// Pause/Resume after close should be no-op.
|
||||
err = rd.Pause()
|
||||
qt.Assert(t, err, qt.Not(qt.Equals), ErrClosed, qt.Commentf("returns unwrapped ErrClosed"))
|
||||
qt.Assert(t, errors.Is(err, ErrClosed), qt.IsTrue, qt.Commentf("doesn't wrap ErrClosed"))
|
||||
|
||||
err = rd.Resume()
|
||||
qt.Assert(t, err, qt.Not(qt.Equals), ErrClosed, qt.Commentf("returns unwrapped ErrClosed"))
|
||||
qt.Assert(t, errors.Is(err, ErrClosed), qt.IsTrue, qt.Commentf("doesn't wrap ErrClosed"))
|
||||
}
|
||||
|
||||
func BenchmarkReader(b *testing.B) {
|
||||
events := perfEventArray(b)
|
||||
prog := outputSamplesProg(b, events, 80)
|
||||
|
||||
rd, err := NewReader(events, 4096)
|
||||
if err != nil {
|
||||
b.Fatal(err)
|
||||
}
|
||||
defer rd.Close()
|
||||
|
||||
buf := internal.EmptyBPFContext
|
||||
|
||||
b.ResetTimer()
|
||||
b.ReportAllocs()
|
||||
for i := 0; i < b.N; i++ {
|
||||
ret, _, err := prog.Test(buf)
|
||||
if err != nil {
|
||||
b.Fatal(err)
|
||||
} else if errno := syscall.Errno(-int32(ret)); errno != 0 {
|
||||
b.Fatal("Expected 0 as return value, got", errno)
|
||||
}
|
||||
|
||||
if _, err = rd.Read(); err != nil {
|
||||
b.Fatal(err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkReadInto(b *testing.B) {
|
||||
events := perfEventArray(b)
|
||||
prog := outputSamplesProg(b, events, 80)
|
||||
|
||||
rd, err := NewReader(events, 4096)
|
||||
if err != nil {
|
||||
b.Fatal(err)
|
||||
}
|
||||
defer rd.Close()
|
||||
|
||||
buf := internal.EmptyBPFContext
|
||||
|
||||
b.ResetTimer()
|
||||
b.ReportAllocs()
|
||||
|
||||
var rec Record
|
||||
for i := 0; i < b.N; i++ {
|
||||
// NB: Submitting samples into the perf event ring dominates
|
||||
// the benchmark time unfortunately.
|
||||
ret, _, err := prog.Test(buf)
|
||||
if err != nil {
|
||||
b.Fatal(err)
|
||||
} else if errno := syscall.Errno(-int32(ret)); errno != 0 {
|
||||
b.Fatal("Expected 0 as return value, got", errno)
|
||||
}
|
||||
|
||||
if err := rd.ReadInto(&rec); err != nil {
|
||||
b.Fatal(err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// This exists just to make the example below nicer.
|
||||
func bpfPerfEventOutputProgram() (*ebpf.Program, *ebpf.Map) {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
// ExamplePerfReader submits a perf event using BPF,
|
||||
// and then reads it in user space.
|
||||
//
|
||||
// The BPF will look something like this:
|
||||
//
|
||||
// struct map events __section("maps") = {
|
||||
// .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
|
||||
// };
|
||||
//
|
||||
// __section("xdp") int output_single(void *ctx) {
|
||||
// unsigned char buf[] = {
|
||||
// 1, 2, 3, 4, 5
|
||||
// };
|
||||
//
|
||||
// return perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &buf[0], 5);
|
||||
// }
|
||||
//
|
||||
// Also see BPF_F_CTXLEN_MASK if you want to sample packet data
|
||||
// from SKB or XDP programs.
|
||||
func ExampleReader() {
|
||||
prog, events := bpfPerfEventOutputProgram()
|
||||
defer prog.Close()
|
||||
defer events.Close()
|
||||
|
||||
rd, err := NewReader(events, 4096)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
defer rd.Close()
|
||||
|
||||
// Writes out a sample with content 1,2,3,4,4
|
||||
ret, _, err := prog.Test(internal.EmptyBPFContext)
|
||||
if err != nil || ret != 0 {
|
||||
panic("Can't write sample")
|
||||
}
|
||||
|
||||
record, err := rd.Read()
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
// Data is padded with 0 for alignment
|
||||
fmt.Println("Sample:", record.RawSample)
|
||||
}
|
||||
|
||||
// ReadRecord allows reducing memory allocations.
|
||||
func ExampleReader_ReadInto() {
|
||||
prog, events := bpfPerfEventOutputProgram()
|
||||
defer prog.Close()
|
||||
defer events.Close()
|
||||
|
||||
rd, err := NewReader(events, 4096)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
defer rd.Close()
|
||||
|
||||
for i := 0; i < 2; i++ {
|
||||
// Write out two samples
|
||||
ret, _, err := prog.Test(internal.EmptyBPFContext)
|
||||
if err != nil || ret != 0 {
|
||||
panic("Can't write sample")
|
||||
}
|
||||
}
|
||||
|
||||
var rec Record
|
||||
for i := 0; i < 2; i++ {
|
||||
if err := rd.ReadInto(&rec); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
fmt.Println("Sample:", rec.RawSample[:5])
|
||||
}
|
||||
}
|
||||
|
||||
func perfEventArray(tb testing.TB) *ebpf.Map {
|
||||
events, err := ebpf.NewMap(&ebpf.MapSpec{
|
||||
Type: ebpf.PerfEventArray,
|
||||
})
|
||||
if err != nil {
|
||||
tb.Fatal(err)
|
||||
}
|
||||
tb.Cleanup(func() { events.Close() })
|
||||
return events
|
||||
}
|
||||
@@ -0,0 +1,274 @@
|
||||
package perf
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"math"
|
||||
"os"
|
||||
"runtime"
|
||||
"sync/atomic"
|
||||
"unsafe"
|
||||
|
||||
"github.com/cilium/ebpf/internal/unix"
|
||||
)
|
||||
|
||||
// perfEventRing is a page of metadata followed by
|
||||
// a variable number of pages which form a ring buffer.
|
||||
type perfEventRing struct {
|
||||
fd int
|
||||
cpu int
|
||||
mmap []byte
|
||||
ringReader
|
||||
}
|
||||
|
||||
func newPerfEventRing(cpu, perCPUBuffer, watermark int, overwritable bool) (*perfEventRing, error) {
|
||||
if watermark >= perCPUBuffer {
|
||||
return nil, errors.New("watermark must be smaller than perCPUBuffer")
|
||||
}
|
||||
|
||||
fd, err := createPerfEvent(cpu, watermark, overwritable)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if err := unix.SetNonblock(fd, true); err != nil {
|
||||
unix.Close(fd)
|
||||
return nil, err
|
||||
}
|
||||
|
||||
protections := unix.PROT_READ
|
||||
if !overwritable {
|
||||
protections |= unix.PROT_WRITE
|
||||
}
|
||||
|
||||
mmap, err := unix.Mmap(fd, 0, perfBufferSize(perCPUBuffer), protections, unix.MAP_SHARED)
|
||||
if err != nil {
|
||||
unix.Close(fd)
|
||||
return nil, fmt.Errorf("can't mmap: %v", err)
|
||||
}
|
||||
|
||||
// This relies on the fact that we allocate an extra metadata page,
|
||||
// and that the struct is smaller than an OS page.
|
||||
// This use of unsafe.Pointer isn't explicitly sanctioned by the
|
||||
// documentation, since a byte is smaller than sampledPerfEvent.
|
||||
meta := (*unix.PerfEventMmapPage)(unsafe.Pointer(&mmap[0]))
|
||||
|
||||
var reader ringReader
|
||||
if overwritable {
|
||||
reader = newReverseReader(meta, mmap[meta.Data_offset:meta.Data_offset+meta.Data_size])
|
||||
} else {
|
||||
reader = newForwardReader(meta, mmap[meta.Data_offset:meta.Data_offset+meta.Data_size])
|
||||
}
|
||||
|
||||
ring := &perfEventRing{
|
||||
fd: fd,
|
||||
cpu: cpu,
|
||||
mmap: mmap,
|
||||
ringReader: reader,
|
||||
}
|
||||
runtime.SetFinalizer(ring, (*perfEventRing).Close)
|
||||
|
||||
return ring, nil
|
||||
}
|
||||
|
||||
// perfBufferSize returns a valid mmap buffer size for use with
// perf_event_open (1+2^n pages).
//
// perCPUBuffer is the requested amount of ring space in bytes. The result
// is given in bytes and includes one extra page for metadata.
func perfBufferSize(perCPUBuffer int) int {
	page := os.Getpagesize()

	// Number of whole pages needed to hold perCPUBuffer bytes.
	wanted := (perCPUBuffer + page - 1) / page

	// Exponent of the smallest power of two which is >= wanted.
	exponent := math.Ceil(math.Log2(float64(wanted)))
	dataPages := int(math.Pow(2, exponent))

	// One additional page holds the metadata.
	return (dataPages + 1) * page
}
|
||||
|
||||
func (ring *perfEventRing) Close() {
|
||||
runtime.SetFinalizer(ring, nil)
|
||||
|
||||
_ = unix.Close(ring.fd)
|
||||
_ = unix.Munmap(ring.mmap)
|
||||
|
||||
ring.fd = -1
|
||||
ring.mmap = nil
|
||||
}
|
||||
|
||||
func createPerfEvent(cpu, watermark int, overwritable bool) (int, error) {
|
||||
if watermark == 0 {
|
||||
watermark = 1
|
||||
}
|
||||
|
||||
bits := unix.PerfBitWatermark
|
||||
if overwritable {
|
||||
bits |= unix.PerfBitWriteBackward
|
||||
}
|
||||
|
||||
attr := unix.PerfEventAttr{
|
||||
Type: unix.PERF_TYPE_SOFTWARE,
|
||||
Config: unix.PERF_COUNT_SW_BPF_OUTPUT,
|
||||
Bits: uint64(bits),
|
||||
Sample_type: unix.PERF_SAMPLE_RAW,
|
||||
Wakeup: uint32(watermark),
|
||||
}
|
||||
|
||||
attr.Size = uint32(unsafe.Sizeof(attr))
|
||||
fd, err := unix.PerfEventOpen(&attr, -1, cpu, -1, unix.PERF_FLAG_FD_CLOEXEC)
|
||||
if err != nil {
|
||||
return -1, fmt.Errorf("can't create perf event: %w", err)
|
||||
}
|
||||
return fd, nil
|
||||
}
|
||||
|
||||
// ringReader abstracts over the two ways a perf ring can be consumed,
// see forwardReader and reverseReader.
type ringReader interface {
	// loadHead refreshes the reader's view of the ring from the
	// head position published in the metadata page.
	loadHead()

	// size returns the length of the ring's data area in bytes.
	size() int

	// writeTail publishes the consumer position, if the reader
	// maintains one.
	writeTail()

	// Read copies unread bytes from the ring into p. It returns io.EOF
	// once all available data has been consumed.
	Read(p []byte) (int, error)
}
|
||||
|
||||
type forwardReader struct {
|
||||
meta *unix.PerfEventMmapPage
|
||||
head, tail uint64
|
||||
mask uint64
|
||||
ring []byte
|
||||
}
|
||||
|
||||
func newForwardReader(meta *unix.PerfEventMmapPage, ring []byte) *forwardReader {
|
||||
return &forwardReader{
|
||||
meta: meta,
|
||||
head: atomic.LoadUint64(&meta.Data_head),
|
||||
tail: atomic.LoadUint64(&meta.Data_tail),
|
||||
// cap is always a power of two
|
||||
mask: uint64(cap(ring) - 1),
|
||||
ring: ring,
|
||||
}
|
||||
}
|
||||
|
||||
func (rr *forwardReader) loadHead() {
|
||||
rr.head = atomic.LoadUint64(&rr.meta.Data_head)
|
||||
}
|
||||
|
||||
func (rr *forwardReader) size() int {
|
||||
return len(rr.ring)
|
||||
}
|
||||
|
||||
func (rr *forwardReader) writeTail() {
|
||||
// Commit the new tail. This lets the kernel know that
|
||||
// the ring buffer has been consumed.
|
||||
atomic.StoreUint64(&rr.meta.Data_tail, rr.tail)
|
||||
}
|
||||
|
||||
func (rr *forwardReader) Read(p []byte) (int, error) {
|
||||
start := int(rr.tail & rr.mask)
|
||||
|
||||
n := len(p)
|
||||
// Truncate if the read wraps in the ring buffer
|
||||
if remainder := cap(rr.ring) - start; n > remainder {
|
||||
n = remainder
|
||||
}
|
||||
|
||||
// Truncate if there isn't enough data
|
||||
if remainder := int(rr.head - rr.tail); n > remainder {
|
||||
n = remainder
|
||||
}
|
||||
|
||||
copy(p, rr.ring[start:start+n])
|
||||
rr.tail += uint64(n)
|
||||
|
||||
if rr.tail == rr.head {
|
||||
return n, io.EOF
|
||||
}
|
||||
|
||||
return n, nil
|
||||
}
|
||||
|
||||
type reverseReader struct {
|
||||
meta *unix.PerfEventMmapPage
|
||||
// head is the position where the kernel last wrote data.
|
||||
head uint64
|
||||
// read is the position we read the next data from. Updated as reads are made.
|
||||
read uint64
|
||||
// tail is the end of the ring buffer. No reads must be made past it.
|
||||
tail uint64
|
||||
mask uint64
|
||||
ring []byte
|
||||
}
|
||||
|
||||
func newReverseReader(meta *unix.PerfEventMmapPage, ring []byte) *reverseReader {
|
||||
rr := &reverseReader{
|
||||
meta: meta,
|
||||
mask: uint64(cap(ring) - 1),
|
||||
ring: ring,
|
||||
}
|
||||
rr.loadHead()
|
||||
return rr
|
||||
}
|
||||
|
||||
func (rr *reverseReader) loadHead() {
|
||||
// The diagram below represents an overwritable perf ring buffer:
|
||||
//
|
||||
// head read tail
|
||||
// | | |
|
||||
// V V V
|
||||
// +---+--------+------------+---------+--------+
|
||||
// | |H-D....D|H-C........C|H-B.....B|H-A....A|
|
||||
// +---+--------+------------+---------+--------+
|
||||
// <--Write from right to left
|
||||
// Read from left to right-->
|
||||
// (H means header)
|
||||
//
|
||||
// The buffer is read left to right beginning from head to tail.
|
||||
// [head, read) is the read portion of the buffer, [read, tail) the unread one.
|
||||
// read is adjusted as we progress through the buffer.
|
||||
|
||||
// Avoid reading sample D multiple times by discarding unread samples C, B, A.
|
||||
rr.tail = rr.head
|
||||
|
||||
// Get the new head and starting reading from it.
|
||||
rr.head = atomic.LoadUint64(&rr.meta.Data_head)
|
||||
rr.read = rr.head
|
||||
|
||||
if rr.tail-rr.head > uint64(cap(rr.ring)) {
|
||||
// ring has been fully written, only permit at most cap(rr.ring)
|
||||
// bytes to be read.
|
||||
rr.tail = rr.head + uint64(cap(rr.ring))
|
||||
}
|
||||
}
|
||||
|
||||
func (rr *reverseReader) size() int {
|
||||
return len(rr.ring)
|
||||
}
|
||||
|
||||
func (rr *reverseReader) writeTail() {
|
||||
// We do not care about tail for over writable perf buffer.
|
||||
// So, this function is noop.
|
||||
}
|
||||
|
||||
func (rr *reverseReader) Read(p []byte) (int, error) {
|
||||
start := int(rr.read & rr.mask)
|
||||
|
||||
n := len(p)
|
||||
// Truncate if the read wraps in the ring buffer
|
||||
if remainder := cap(rr.ring) - start; n > remainder {
|
||||
n = remainder
|
||||
}
|
||||
|
||||
// Truncate if there isn't enough data
|
||||
if remainder := int(rr.tail - rr.read); n > remainder {
|
||||
n = remainder
|
||||
}
|
||||
|
||||
copy(p, rr.ring[start:start+n])
|
||||
rr.read += uint64(n)
|
||||
|
||||
if rr.read == rr.tail {
|
||||
return n, io.EOF
|
||||
}
|
||||
|
||||
return n, nil
|
||||
}
|
||||
@@ -0,0 +1,182 @@
|
||||
package perf
|
||||
|
||||
import (
|
||||
"io"
|
||||
"os"
|
||||
"testing"
|
||||
|
||||
"github.com/cilium/ebpf/internal/unix"
|
||||
qt "github.com/frankban/quicktest"
|
||||
)
|
||||
|
||||
func TestRingBufferReader(t *testing.T) {
|
||||
ring := makeForwardRing(2, 0)
|
||||
checkRead(t, ring, []byte{0, 1}, io.EOF)
|
||||
checkRead(t, ring, []byte{}, io.EOF)
|
||||
|
||||
// Wrapping read
|
||||
ring = makeForwardRing(2, 1)
|
||||
checkRead(t, ring, []byte{1}, nil)
|
||||
checkRead(t, ring, []byte{0}, io.EOF)
|
||||
checkRead(t, ring, []byte{}, io.EOF)
|
||||
}
|
||||
|
||||
func TestRingBufferReverseReader(t *testing.T) {
|
||||
// First case: read 4, starting from offset 2.
|
||||
// The buffer should contain the following:
|
||||
//
|
||||
// [0 1 2 3]
|
||||
// ^
|
||||
// |
|
||||
// head
|
||||
//
|
||||
// As we read from position 2, we should get [2, 3].
|
||||
// Then, when we read it for the second time, we should get [0, 1] as we would
|
||||
// have looped around the buffer.
|
||||
ring := makeReverseRing(4, 2)
|
||||
checkRead(t, ring, []byte{2, 3}, nil)
|
||||
checkRead(t, ring, []byte{0, 1}, io.EOF)
|
||||
checkRead(t, ring, []byte{}, io.EOF)
|
||||
|
||||
// Complicated case: read bytes until previous_head.
|
||||
//
|
||||
// [0 1 2 3]
|
||||
// ^ ^
|
||||
// | |
|
||||
// | +---previous_head
|
||||
// head
|
||||
ring = makeReverseRing(4, 2)
|
||||
checkReadBuffer(t, ring, []byte{2}, nil, make([]byte, 1))
|
||||
// Next read would be {3}, but we don't consume it.
|
||||
|
||||
// Pretend the kernel wrote another 2 bytes.
|
||||
ring.meta.Data_head -= 2
|
||||
ring.loadHead()
|
||||
|
||||
// {3} is discarded.
|
||||
checkRead(t, ring, []byte{0, 1}, io.EOF)
|
||||
|
||||
// Complicated case: read the whole buffer because it was "overwritten".
|
||||
//
|
||||
// [0 1 2 3]
|
||||
// ^
|
||||
// |
|
||||
// +---previous_head
|
||||
// |
|
||||
// head
|
||||
//
|
||||
// So, we should first read [2, 3] then [0, 1].
|
||||
ring = makeReverseRing(4, 2)
|
||||
ring.meta.Data_head -= ring.meta.Data_size
|
||||
ring.loadHead()
|
||||
|
||||
checkRead(t, ring, []byte{2, 3}, nil)
|
||||
checkRead(t, ring, []byte{0, 1}, io.EOF)
|
||||
}
|
||||
|
||||
// ensure that the next call to Read() yields the correct result.
|
||||
//
|
||||
// Read is called with a buffer that is larger than want so
|
||||
// that corner cases around wrapping can be checked. Use
|
||||
// checkReadBuffer if that is not desired.
|
||||
func checkRead(t *testing.T, r io.Reader, want []byte, wantErr error) {
|
||||
checkReadBuffer(t, r, want, wantErr, make([]byte, len(want)+1))
|
||||
}
|
||||
|
||||
func checkReadBuffer(t *testing.T, r io.Reader, want []byte, wantErr error, buf []byte) {
|
||||
t.Helper()
|
||||
|
||||
n, err := r.Read(buf)
|
||||
buf = buf[:n]
|
||||
qt.Assert(t, err, qt.Equals, wantErr)
|
||||
qt.Assert(t, buf, qt.DeepEquals, want)
|
||||
}
|
||||
|
||||
// makeBuffer returns a slice of size bytes in which the element at
// index i holds byte(i).
func makeBuffer(size int) []byte {
	out := make([]byte, size)
	for idx := 0; idx < len(out); idx++ {
		out[idx] = byte(idx)
	}
	return out
}
|
||||
|
||||
func makeReverseRing(size, offset int) *reverseReader {
|
||||
if size != 0 && (size&(size-1)) != 0 {
|
||||
panic("size must be power of two")
|
||||
}
|
||||
|
||||
meta := unix.PerfEventMmapPage{
|
||||
Data_head: 0 - uint64(size) - uint64(offset),
|
||||
Data_tail: 0, // never written by the kernel
|
||||
Data_size: uint64(size),
|
||||
}
|
||||
|
||||
return newReverseReader(&meta, makeBuffer(size))
|
||||
}
|
||||
|
||||
func makeForwardRing(size, offset int) *forwardReader {
|
||||
if size != 0 && (size&(size-1)) != 0 {
|
||||
panic("size must be power of two")
|
||||
}
|
||||
|
||||
meta := unix.PerfEventMmapPage{
|
||||
Data_head: uint64(size + offset),
|
||||
Data_tail: uint64(offset),
|
||||
Data_size: uint64(size),
|
||||
}
|
||||
|
||||
return newForwardReader(&meta, makeBuffer(size))
|
||||
}
|
||||
|
||||
func TestPerfEventRing(t *testing.T) {
|
||||
check := func(buffer, watermark int, overwritable bool) {
|
||||
ring, err := newPerfEventRing(0, buffer, watermark, overwritable)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
size := ring.size()
|
||||
|
||||
// Ring size should be at least as big as buffer
|
||||
if size < buffer {
|
||||
t.Fatalf("ring size %d smaller than buffer %d", size, buffer)
|
||||
}
|
||||
|
||||
// Ring size should be of the form 2^n pages (meta page has already been removed)
|
||||
if size%os.Getpagesize() != 0 {
|
||||
t.Fatalf("ring size %d not whole number of pages (pageSize %d)", size, os.Getpagesize())
|
||||
}
|
||||
nPages := size / os.Getpagesize()
|
||||
if nPages&(nPages-1) != 0 {
|
||||
t.Fatalf("ring size %d (%d pages) not a power of two pages (pageSize %d)", size, nPages, os.Getpagesize())
|
||||
}
|
||||
}
|
||||
|
||||
// watermark > buffer
|
||||
_, err := newPerfEventRing(0, 8192, 8193, false)
|
||||
if err == nil {
|
||||
t.Fatal("watermark > buffer allowed")
|
||||
}
|
||||
_, err = newPerfEventRing(0, 8192, 8193, true)
|
||||
if err == nil {
|
||||
t.Fatal("watermark > buffer allowed")
|
||||
}
|
||||
|
||||
// watermark == buffer
|
||||
_, err = newPerfEventRing(0, 8192, 8192, false)
|
||||
if err == nil {
|
||||
t.Fatal("watermark == buffer allowed")
|
||||
}
|
||||
_, err = newPerfEventRing(0, 8192, 8192, true)
|
||||
if err == nil {
|
||||
t.Fatal("watermark == buffer allowed")
|
||||
}
|
||||
|
||||
// buffer not a power of two, watermark < buffer
|
||||
check(8193, 8192, false)
|
||||
check(8193, 8192, true)
|
||||
|
||||
// large buffer not a multiple of page size at all (prime)
|
||||
check(65537, 8192, false)
|
||||
check(65537, 8192, true)
|
||||
}
|
||||
Reference in New Issue
Block a user