whatcanGOwrong

This commit is contained in:
2024-09-19 21:38:24 -04:00
commit d0ae4d841d
17908 changed files with 4096831 additions and 0 deletions
@@ -0,0 +1,6 @@
// Package perf allows interacting with Linux perf_events.
//
// BPF allows submitting custom perf_events to a ring-buffer set up
// by userspace. This is very useful to push things like packet samples
// from BPF to a daemon running in user space.
package perf
@@ -0,0 +1,458 @@
package perf
import (
"encoding/binary"
"errors"
"fmt"
"io"
"os"
"runtime"
"sync"
"time"
"github.com/cilium/ebpf"
"github.com/cilium/ebpf/internal"
"github.com/cilium/ebpf/internal/epoll"
"github.com/cilium/ebpf/internal/unix"
)
var (
// ErrClosed is wrapped into the errors returned by Reader methods once the
// Reader has been closed. It is the same value as os.ErrClosed.
ErrClosed = os.ErrClosed
// errEOR is an internal sentinel signalling that the ring currently being
// read has no further records.
errEOR = errors.New("end of ring")
)
// perfEventHeaderSize is the encoded size of perfEventHeader in bytes.
var perfEventHeaderSize = binary.Size(perfEventHeader{})
// perfEventHeader must match 'struct perf_event_header' in <linux/perf_event.h>.
//
// It is decoded in native endianness from the first perfEventHeaderSize bytes
// of every record in a ring.
type perfEventHeader struct {
// Type selects how the remainder of the record is decoded.
Type uint32
// Misc is decoded but not otherwise used by this package.
Misc uint16
// Size is decoded but not otherwise used by this package.
// NOTE(review): presumably the total record size in bytes - confirm against the kernel header.
Size uint16
}
// cpuForEvent returns the Pad field of an epoll event as an int.
//
// The Reader registers every ring with the poller using its CPU index, and
// uses the value returned here to index back into its per-CPU ring slice.
func cpuForEvent(event *unix.EpollEvent) int {
	cpu := event.Pad
	return int(cpu)
}
// Record contains either a sample or a counter of the
// number of lost samples.
type Record struct {
// The CPU this record was generated on.
CPU int
// The data submitted via bpf_perf_event_output.
// Due to a kernel bug, this can contain between 0 and 7 bytes of trailing
// garbage from the ring depending on the input sample's length.
//
// The slice is truncated to zero length when the record reports lost
// samples. When a Record is reused via ReadInto, the backing array is
// reused if it is large enough for the next sample.
RawSample []byte
// The number of samples which could not be output, since
// the ring buffer was full.
//
// Zero when the record carries a sample.
LostSamples uint64
}
// readRecord decodes the next perf event from rd into rec.
//
// A PERF_RECORD_SAMPLE fills rec.RawSample and zeroes rec.LostSamples. A
// PERF_RECORD_LOST fills rec.LostSamples and truncates rec.RawSample. Any
// other record type yields an error for which IsUnknownEvent reports true.
// errEOR is returned when rd is exhausted before any header byte was read.
//
// buf is scratch space with a capacity of at least perfEventHeaderSize bytes.
// The overwritable argument is currently not consulted.
func readRecord(rd io.Reader, rec *Record, buf []byte, overwritable bool) error {
	// Re-slicing panics when the capacity is too small, which doubles as an
	// assertion of the size contract.
	hdr := buf[:perfEventHeaderSize]
	if _, err := io.ReadFull(rd, hdr); err != nil {
		if errors.Is(err, io.EOF) {
			return errEOR
		}
		return fmt.Errorf("read perf event header: %v", err)
	}

	header := perfEventHeader{
		Type: internal.NativeEndian.Uint32(hdr[0:4]),
		Misc: internal.NativeEndian.Uint16(hdr[4:6]),
		Size: internal.NativeEndian.Uint16(hdr[6:8]),
	}

	var err error
	if header.Type == unix.PERF_RECORD_LOST {
		rec.RawSample = rec.RawSample[:0]
		rec.LostSamples, err = readLostRecords(rd)
		return err
	}

	if header.Type == unix.PERF_RECORD_SAMPLE {
		rec.LostSamples = 0
		// The header scratch space can be reused for the sample size because
		// perfEventHeaderSize > perfEventSampleSize.
		rec.RawSample, err = readRawSample(rd, hdr, rec.RawSample)
		return err
	}

	return &unknownEventError{header.Type}
}
// readLostRecords decodes the body of a lost-samples record from rd and
// returns the number of samples that were dropped.
func readLostRecords(rd io.Reader) (uint64, error) {
	// perfEventLost must match 'struct perf_event_lost' in kernel sources.
	type perfEventLost struct {
		ID   uint64
		Lost uint64
	}

	var lost perfEventLost
	if err := binary.Read(rd, internal.NativeEndian, &lost); err != nil {
		return 0, fmt.Errorf("can't read lost records header: %v", err)
	}

	return lost.Lost, nil
}
// perfEventSampleSize is the encoded size in bytes of the length prefix that
// precedes the payload of a sample record.
var perfEventSampleSize = binary.Size(uint32(0))
// This must match 'struct perf_event_sample' in kernel sources.
type perfEventSample struct {
// Size is the number of payload bytes that follow the prefix.
Size uint32
}
// readRawSample reads a length-prefixed sample payload from rd.
//
// buf is scratch space for the length prefix and must have a capacity of at
// least perfEventSampleSize bytes. sampleBuf is reused for the payload when
// its capacity suffices; otherwise a new slice is allocated. The returned
// slice has exactly the length announced by the prefix.
func readRawSample(rd io.Reader, buf, sampleBuf []byte) ([]byte, error) {
	prefix := buf[:perfEventSampleSize]
	if _, err := io.ReadFull(rd, prefix); err != nil {
		return nil, fmt.Errorf("read sample size: %w", err)
	}

	sample := perfEventSample{Size: internal.NativeEndian.Uint32(prefix)}
	want := int(sample.Size)

	data := sampleBuf
	if cap(data) >= want {
		data = data[:want]
	} else {
		data = make([]byte, want)
	}

	if _, err := io.ReadFull(rd, data); err != nil {
		return nil, fmt.Errorf("read sample: %w", err)
	}

	return data, nil
}
// Reader allows reading bpf_perf_event_output
// from user space.
type Reader struct {
// poller waits for notifications on the per-CPU ring file descriptors.
poller *epoll.Poller
// deadline bounds how long ReadInto waits in the poller. Set via SetDeadline.
deadline time.Time
// mu protects read/write access to the Reader structure with the
// exception of 'pauseFds', which is protected by 'pauseMu'.
// If locking both 'mu' and 'pauseMu', 'mu' must be locked first.
mu sync.Mutex
// Closing a PERF_EVENT_ARRAY removes all event fds
// stored in it, so we keep a reference alive.
array *ebpf.Map
// rings is indexed by CPU. Entries for CPUs that were offline when the
// Reader was created are nil. The slice is set to nil by Close.
rings []*perfEventRing
// epollEvents is scratch space handed to the poller on every wait.
epollEvents []unix.EpollEvent
// epollRings lists the rings signalled by the last wait that have not been
// drained yet. ReadInto consumes it from the back.
epollRings []*perfEventRing
// eventHeader is a reusable buffer for decoding record headers.
eventHeader []byte
// pauseFds are a copy of the fds in 'rings', protected by 'pauseMu'.
// These allow Pause/Resume to be executed independently of any ongoing
// Read calls, which would otherwise need to be interrupted.
// Offline CPUs are represented by -1.
pauseMu sync.Mutex
pauseFds []int
// paused is set by Pause and cleared by Resume.
paused bool
// overwritable mirrors ReaderOptions.Overwritable.
overwritable bool
}
// ReaderOptions control the behaviour of the user
// space reader.
type ReaderOptions struct {
// The number of written bytes required in any per CPU buffer before
// Read will process data. Must be smaller than PerCPUBuffer.
// The default is to start processing as soon as data is available.
// A value of 0 is treated as 1.
Watermark int
// This perf ring buffer is overwritable: once it is full, the oldest
// event is overwritten by the newest one.
//
// A Reader created with this option must be paused before it can be
// read from.
Overwritable bool
}
// NewReader creates a new reader with default options.
//
// array must be a PerfEventArray. perCPUBuffer gives the size of the
// per CPU buffer in bytes. It is rounded up to the nearest multiple
// of the current page size.
//
// It is shorthand for NewReaderWithOptions with a zero ReaderOptions.
func NewReader(array *ebpf.Map, perCPUBuffer int) (*Reader, error) {
	var defaults ReaderOptions
	return NewReaderWithOptions(array, perCPUBuffer, defaults)
}
// NewReaderWithOptions creates a new reader with the given options.
//
// array must be a PerfEventArray. One ring is created per entry of array;
// CPUs that are reported as offline (ENODEV) are skipped. perCPUBuffer is the
// requested size of each ring in bytes and must be larger than 0.
//
// The returned Reader has already been resumed. If construction fails, every
// resource acquired up to that point is released again, including the
// Reader's private clone of array.
func NewReaderWithOptions(array *ebpf.Map, perCPUBuffer int, opts ReaderOptions) (pr *Reader, err error) {
	if perCPUBuffer < 1 {
		return nil, errors.New("perCPUBuffer must be larger than 0")
	}

	var (
		nCPU     = int(array.MaxEntries())
		rings    = make([]*perfEventRing, 0, nCPU)
		pauseFds = make([]int, 0, nCPU)
		// clone is the Reader's own handle on the map. It is tracked
		// separately from the caller's array so that the cleanup below never
		// closes a map we do not own.
		clone *ebpf.Map
	)

	poller, err := epoll.New()
	if err != nil {
		return nil, err
	}

	// Explicit return values are assigned to the named result before deferred
	// functions run, so err reflects the outcome even where it is shadowed.
	defer func() {
		if err == nil {
			return
		}

		poller.Close()
		for _, ring := range rings {
			if ring != nil {
				ring.Close()
			}
		}
		// Entries installed through the clone are only dropped once it is
		// closed, so do not leave that to the garbage collector.
		if clone != nil {
			clone.Close()
		}
	}()

	// bpf_perf_event_output checks which CPU an event is enabled on,
	// but doesn't allow using a wildcard like -1 to specify "all CPUs".
	// Hence we have to create a ring for each CPU.
	for i := 0; i < nCPU; i++ {
		ring, err := newPerfEventRing(i, perCPUBuffer, opts.Watermark, opts.Overwritable)
		if errors.Is(err, unix.ENODEV) {
			// The requested CPU is currently offline, skip it.
			rings = append(rings, nil)
			pauseFds = append(pauseFds, -1)
			continue
		}

		if err != nil {
			return nil, fmt.Errorf("failed to create perf ring for CPU %d: %w", i, err)
		}
		rings = append(rings, ring)
		pauseFds = append(pauseFds, ring.fd)

		if err := poller.Add(ring.fd, i); err != nil {
			return nil, err
		}
	}

	clone, err = array.Clone()
	if err != nil {
		return nil, err
	}

	pr = &Reader{
		array:        clone,
		rings:        rings,
		poller:       poller,
		deadline:     time.Time{},
		epollEvents:  make([]unix.EpollEvent, len(rings)),
		epollRings:   make([]*perfEventRing, 0, len(rings)),
		eventHeader:  make([]byte, perfEventHeaderSize),
		pauseFds:     pauseFds,
		overwritable: opts.Overwritable,
	}
	if err = pr.Resume(); err != nil {
		return nil, err
	}
	runtime.SetFinalizer(pr, (*Reader).Close)
	return pr, nil
}
// Close frees resources used by the reader.
//
// It interrupts calls to Read.
//
// Calls to perf_event_output from eBPF programs will return
// ENOENT after calling this method.
//
// If the poller reports that it was already closed, Close returns nil without
// doing any further work, so calling Close repeatedly is safe.
func (pr *Reader) Close() error {
	if err := pr.poller.Close(); err != nil {
		if errors.Is(err, os.ErrClosed) {
			return nil
		}
		return fmt.Errorf("close poller: %w", err)
	}

	// Trying to poll will now fail, so Read() can't block anymore. Acquire the
	// lock so that we can clean up.
	pr.mu.Lock()
	defer pr.mu.Unlock()

	// pauseFds and array are also used by Pause and Resume, which only hold
	// pauseMu. Take it as well ('mu' first, per the documented lock order) so
	// that tearing them down does not race with a concurrent Pause/Resume.
	pr.pauseMu.Lock()
	defer pr.pauseMu.Unlock()

	for _, ring := range pr.rings {
		if ring != nil {
			ring.Close()
		}
	}
	pr.rings = nil
	pr.pauseFds = nil
	pr.array.Close()

	return nil
}
// SetDeadline controls how long Read and ReadInto will block waiting for samples.
//
// Passing a zero time.Time will remove the deadline. Passing a deadline in the
// past will prevent the reader from blocking if there are no records to be read.
//
// The deadline is stored under the Reader's main lock, so a call made while a
// read is in progress waits for that read to return.
func (pr *Reader) SetDeadline(t time.Time) {
	pr.mu.Lock()
	pr.deadline = t
	pr.mu.Unlock()
}
// Read the next record from the perf ring buffer.
//
// The function blocks until there are at least Watermark bytes in one
// of the per CPU buffers. Records from buffers below the Watermark
// are not returned.
//
// Records can contain between 0 and 7 bytes of trailing garbage from the ring
// depending on the input sample's length.
//
// Calling Close interrupts the function.
//
// Returns os.ErrDeadlineExceeded if a deadline was set.
func (pr *Reader) Read() (Record, error) {
	var r Record

	// Fill r before it is copied into the result. Writing this as
	// "return r, pr.ReadInto(&r)" would leave the order between reading r
	// and calling ReadInto unspecified by the language.
	err := pr.ReadInto(&r)
	return r, err
}
// errMustBePaused is returned by ReadInto when the Reader is overwritable and
// has not been paused.
var errMustBePaused = fmt.Errorf("perf ringbuffer: must have been paused before reading overwritable buffer")
// ReadInto is like Read except that it allows reusing Record and associated buffers.
//
// It returns errMustBePaused if the Reader is overwritable and not paused, an
// error wrapping ErrClosed if the Reader has been closed, and otherwise
// whatever the poller or the record decoder report.
func (pr *Reader) ReadInto(rec *Record) error {
// Lock order: 'mu' first, then 'pauseMu'.
pr.mu.Lock()
defer pr.mu.Unlock()
pr.pauseMu.Lock()
defer pr.pauseMu.Unlock()
if pr.overwritable && !pr.paused {
return errMustBePaused
}
// Close sets rings to nil.
if pr.rings == nil {
return fmt.Errorf("perf ringbuffer: %w", ErrClosed)
}
for {
// Only wait for new notifications once every previously signalled ring
// has been drained.
if len(pr.epollRings) == 0 {
// NB: The deferred pauseMu.Unlock will panic if Wait panics, which
// might obscure the original panic.
// pauseMu is released while waiting so that Pause and Resume can run
// concurrently with a blocked read. 'mu' stays held.
pr.pauseMu.Unlock()
nEvents, err := pr.poller.Wait(pr.epollEvents, pr.deadline)
pr.pauseMu.Lock()
if err != nil {
return err
}
// Re-validate pr.paused since we dropped pauseMu.
if pr.overwritable && !pr.paused {
return errMustBePaused
}
for _, event := range pr.epollEvents[:nEvents] {
ring := pr.rings[cpuForEvent(&event)]
pr.epollRings = append(pr.epollRings, ring)
// Read the current head pointer now, not every time
// we read a record. This prevents a single fast producer
// from keeping the reader busy.
ring.loadHead()
}
}
// Start at the last available event. The order in which we
// process them doesn't matter, and starting at the back allows
// resizing epollRings to keep track of processed rings.
err := pr.readRecordFromRing(rec, pr.epollRings[len(pr.epollRings)-1])
if err == errEOR {
// We've emptied the current ring buffer, process
// the next one.
pr.epollRings = pr.epollRings[:len(pr.epollRings)-1]
continue
}
return err
}
}
// Pause stops all notifications from this Reader.
//
// While the Reader is paused, any attempts to write to the event buffer from
// BPF programs will return -ENOENT.
//
// Subsequent calls to Read will block until a call to Resume.
//
// Pause removes the entry of every CPU from the underlying array; entries
// that are already absent are ignored. It returns an error wrapping ErrClosed
// if the Reader has been closed.
func (pr *Reader) Pause() error {
	pr.pauseMu.Lock()
	defer pr.pauseMu.Unlock()

	if pr.pauseFds == nil {
		return fmt.Errorf("%w", ErrClosed)
	}

	for i := range pr.pauseFds {
		if err := pr.array.Delete(uint32(i)); err != nil && !errors.Is(err, ebpf.ErrKeyNotExist) {
			return fmt.Errorf("couldn't delete event fd for CPU %d: %w", i, err)
		}
	}

	pr.paused = true

	return nil
}
// Resume allows this perf reader to emit notifications.
//
// Subsequent calls to Read will block until the next event notification.
//
// Resume installs the fd of every online CPU's ring into the underlying
// array. It returns an error wrapping ErrClosed if the Reader has been closed.
func (pr *Reader) Resume() error {
	pr.pauseMu.Lock()
	defer pr.pauseMu.Unlock()

	fds := pr.pauseFds
	if fds == nil {
		return fmt.Errorf("%w", ErrClosed)
	}

	for cpu := 0; cpu < len(fds); cpu++ {
		fd := fds[cpu]
		if fd != -1 {
			// -1 marks a CPU without a ring; everything else is installed.
			if err := pr.array.Put(uint32(cpu), uint32(fd)); err != nil {
				return fmt.Errorf("couldn't put event fd %d for CPU %d: %w", fd, cpu, err)
			}
		}
	}

	pr.paused = false
	return nil
}
// readRecordFromRing decodes one record from ring into rec and tags it with
// the ring's CPU. The ring's tail is written back once decoding has finished.
//
// For overwritable readers, hitting the end of the data (io.EOF or
// io.ErrUnexpectedEOF) is reported as errEOR.
//
// NB: Has to be preceded by a call to ring.loadHead.
func (pr *Reader) readRecordFromRing(rec *Record, ring *perfEventRing) error {
	defer ring.writeTail()

	rec.CPU = ring.cpu
	err := readRecord(ring, rec, pr.eventHeader, pr.overwritable)

	switch {
	case !pr.overwritable:
		return err
	case errors.Is(err, io.EOF), errors.Is(err, io.ErrUnexpectedEOF):
		return errEOR
	default:
		return err
	}
}
// unknownEventError reports a record whose type is not understood by the
// reader. Use IsUnknownEvent to detect it.
type unknownEventError struct {
	// eventType is the type value found in the record header.
	eventType uint32
}

// Error implements the error interface.
func (uev *unknownEventError) Error() string {
	msg := fmt.Sprintf("unknown event type: %d", uev.eventType)
	return msg
}
// IsUnknownEvent returns true if the error occurred
// because an unknown event was submitted to the perf event ring.
//
// Wrapped errors are unwrapped while searching.
func IsUnknownEvent(err error) bool {
	target := new(*unknownEventError)
	return errors.As(err, target)
}
@@ -0,0 +1,634 @@
package perf
import (
"bytes"
"encoding/binary"
"errors"
"fmt"
"math"
"os"
"syscall"
"testing"
"time"
"github.com/cilium/ebpf"
"github.com/cilium/ebpf/asm"
"github.com/cilium/ebpf/internal"
"github.com/cilium/ebpf/internal/testutils"
"github.com/cilium/ebpf/internal/testutils/fdtrace"
"github.com/cilium/ebpf/internal/unix"
qt "github.com/frankban/quicktest"
)
var (
// readTimeout is how long tests wait for a read to (not) complete before
// drawing a conclusion.
readTimeout = 250 * time.Millisecond
)
// TestMain hands control of the test binary to fdtrace.TestMain.
// NOTE(review): presumably this checks for leaked file descriptors - confirm in the fdtrace package.
func TestMain(m *testing.M) {
fdtrace.TestMain(m)
}
// TestPerfReader emits a single sample, reads it back, and then checks that a
// further read times out once the ring is empty.
func TestPerfReader(t *testing.T) {
	array := perfEventArray(t)

	reader, err := NewReader(array, 4096)
	if err != nil {
		t.Fatal(err)
	}
	defer reader.Close()

	outputSamples(t, array, 5)
	checkRecord(t, reader)

	// Nothing is left in the ring, so the next read must hit the deadline.
	deadline := time.Now().Add(4 * time.Millisecond)
	reader.SetDeadline(deadline)

	_, readErr := reader.Read()
	timedOut := errors.Is(readErr, os.ErrDeadlineExceeded)
	qt.Assert(t, timedOut, qt.IsTrue, qt.Commentf("expected os.ErrDeadlineExceeded"))
}
// TestReaderSetDeadline checks that a deadline in the past makes every read
// fail immediately, not just the first one.
func TestReaderSetDeadline(t *testing.T) {
	array := perfEventArray(t)

	reader, err := NewReader(array, 4096)
	if err != nil {
		t.Fatal(err)
	}
	defer reader.Close()

	past := time.Now().Add(-time.Second)
	reader.SetDeadline(past)

	_, firstErr := reader.Read()
	if !errors.Is(firstErr, os.ErrDeadlineExceeded) {
		t.Error("Expected os.ErrDeadlineExceeded from first Read, got:", firstErr)
	}

	_, secondErr := reader.Read()
	if !errors.Is(secondErr, os.ErrDeadlineExceeded) {
		t.Error("Expected os.ErrDeadlineExceeded from second Read, got:", secondErr)
	}
}
// outputSamples builds a program that emits one sample per entry of
// sampleSizes into events and runs it once. The test is skipped if running
// the program is not supported, and fails if the program returns non-zero.
func outputSamples(tb testing.TB, events *ebpf.Map, sampleSizes ...byte) {
	ret, _, err := outputSamplesProg(tb, events, sampleSizes...).Test(internal.EmptyBPFContext)
	testutils.SkipIfNotSupported(tb, err)
	if err != nil {
		tb.Fatal(err)
	}

	// The program's return value is interpreted as a negated errno.
	errno := syscall.Errno(-int32(ret))
	if errno == 0 {
		return
	}
	tb.Fatal("Expected 0 as return value, got", errno)
}
// outputSamplesProg creates a program which submits a series of samples to a PerfEventArray.
//
// The format of each sample is:
//
//	index:   0    1   2    3    ... size - 1
//	content: size id  0xff 0xff ... 0xff     [padding]
//
// padding is an implementation detail of the perf buffer and 0-7 bytes long. The
// contents are undefined.
//
// Every sample size must be at least 2 so that size and id fit. id is the
// index of the sample within sampleSizes, truncated to one byte. The program
// is closed automatically when the test finishes.
func outputSamplesProg(tb testing.TB, events *ebpf.Map, sampleSizes ...byte) *ebpf.Program {
tb.Helper()
// Requires at least 4.9 (0515e5999a46 "bpf: introduce BPF_PROG_TYPE_PERF_EVENT program type")
testutils.SkipOnOldKernel(tb, "4.9", "perf events support")
// Flag value asking the helper to use the ring of the current CPU.
const bpfFCurrentCPU = 0xffffffff
var maxSampleSize byte
for _, sampleSize := range sampleSizes {
if sampleSize < 2 {
tb.Fatalf("Sample size %d is too small to contain size and counter", sampleSize)
}
if sampleSize > maxSampleSize {
maxSampleSize = sampleSize
}
}
// Fill a buffer on the stack, and stash context somewhere
insns := asm.Instructions{
// R0 holds all-ones, i.e. eight 0xff filler bytes per store.
asm.LoadImm(asm.R0, ^int64(0), asm.DWord),
asm.Mov.Reg(asm.R9, asm.R1),
}
// Enough 8-byte words to hold the largest sample.
bufDwords := int(maxSampleSize/8) + 1
for i := 0; i < bufDwords; i++ {
insns = append(insns,
asm.StoreMem(asm.RFP, int16(i+1)*-8, asm.R0, asm.DWord),
)
}
// Emit one perf_event_output call per requested sample, reusing the same
// stack buffer and only rewriting the size and id bytes.
for i, sampleSize := range sampleSizes {
insns = append(insns,
// Restore stashed context.
asm.Mov.Reg(asm.R1, asm.R9),
// map
asm.LoadMapPtr(asm.R2, events.FD()),
// flags
asm.LoadImm(asm.R3, bpfFCurrentCPU, asm.DWord),
// buffer
asm.Mov.Reg(asm.R4, asm.RFP),
asm.Add.Imm(asm.R4, int32(bufDwords*-8)),
// buffer[0] = size
asm.StoreImm(asm.R4, 0, int64(sampleSize), asm.Byte),
// buffer[1] = i
asm.StoreImm(asm.R4, 1, int64(i&math.MaxUint8), asm.Byte),
// size
asm.Mov.Imm(asm.R5, int32(sampleSize)),
asm.FnPerfEventOutput.Call(),
)
}
// The program returns whatever the last helper call left in R0.
insns = append(insns, asm.Return())
prog, err := ebpf.NewProgram(&ebpf.ProgramSpec{
License: "GPL",
Type: ebpf.XDP,
Instructions: insns,
})
if err != nil {
tb.Fatal(err)
}
tb.Cleanup(func() { prog.Close() })
return prog
}
// checkRecord reads one record from rd and verifies that it has the layout
// produced by outputSamplesProg: a size byte, an id byte, then 0xff filler up
// to size. It returns the id stored in the sample.
//
// The test fails, rather than panics, if the record is too short to carry the
// size and id bytes, e.g. because it reports lost samples.
func checkRecord(tb testing.TB, rd *Reader) (id int) {
	tb.Helper()

	rec, err := rd.Read()
	qt.Assert(tb, err, qt.IsNil)
	qt.Assert(tb, rec.CPU >= 0, qt.IsTrue, qt.Commentf("Record has invalid CPU number"))

	// Guard the fixed-offset accesses below.
	qt.Assert(tb, len(rec.RawSample) >= 2, qt.IsTrue, qt.Commentf("RawSample is too short to contain size and id"))

	size := int(rec.RawSample[0])
	qt.Assert(tb, size >= 2, qt.IsTrue, qt.Commentf("encoded size is too small to contain size and id"))
	qt.Assert(tb, len(rec.RawSample) >= size, qt.IsTrue, qt.Commentf("RawSample is at least size bytes"))

	for i, v := range rec.RawSample[2:size] {
		qt.Assert(tb, v, qt.Equals, byte(0xff), qt.Commentf("filler at position %d should match", i+2))
	}

	// padding is ignored since its value is undefined.
	return int(rec.RawSample[1])
}
// TestPerfReaderLostSample overfills a one-page ring and checks that any
// record without sample data reports exactly one lost sample.
func TestPerfReaderLostSample(t *testing.T) {
// To generate a lost sample perf record:
//
// 1. Fill the perf ring buffer almost completely with large events.
// The buffer is sized in number of pages, which are architecture dependent.
//
// 2. Write an extra event that doesn't fit in the space remaining.
//
// 3. Write a smaller event that does fit.
// Lost sample records are generated opportunistically, when the kernel
// is writing an event and realizes that there were events lost previously.
//
// The event size needs to be selected so that, for any page size, there are at least
// 48 bytes left in the perf ring page after filling it with a whole number of events:
//
// - PERF_RECORD_LOST: 8 (perf_event_header) + 16 (PERF_RECORD_LOST)
//
// - small event: 8 (perf_event_header) + 4 (size) + 5 (payload) + 7 (padding to 64bits)
//
// By selecting an event size of the form 2^n + 2^(n+1) = 3 * 2^n, for any page
// size 2^(n+m), m >= 0, the number of bytes left, x, after filling a page with a
// whole number of events is:
//
//	x = 2^(n+m) mod (3 * 2^n)
//	  = 2^n * (2^m mod 3)
//
// Since 2^m mod 3 is either 1 or 2:
//
//	x = 2^n  or  x = 2^(n+1)
//
// Selecting n = 6, we have:
//
//	x = 64 or x = 128, no matter the page size 2^(6+m)
//
//	event size = 2^6 + 2^7 = 192
//
// Accounting for perf headers, the large event uses a 180 byte payload:
//
//	8 (perf_event_header) + 4 (size) + 180 (payload)
const (
eventSize = 192
)
var (
pageSize = os.Getpagesize()
maxEvents = (pageSize / eventSize)
)
if remainder := pageSize % eventSize; remainder != 64 && remainder != 128 {
// Page size isn't 2^(6+m), m >= 0
t.Fatal("unsupported page size:", pageSize)
}
var sampleSizes []byte
// Fill the ring with the maximum number of large events that will fit,
// and generate a lost event by writing an additional event.
for i := 0; i < maxEvents+1; i++ {
sampleSizes = append(sampleSizes, 180)
}
// Generate a small event to trigger the lost record
sampleSizes = append(sampleSizes, 5)
events := perfEventArray(t)
rd, err := NewReader(events, pageSize)
if err != nil {
t.Fatal(err)
}
defer rd.Close()
outputSamples(t, events, sampleSizes...)
// One event was dropped and one lost record was added, so the number of
// records to read equals the number of events submitted.
for range sampleSizes {
record, err := rd.Read()
if err != nil {
t.Fatal(err)
}
// A record without sample data must be the lost-sample notification.
if record.RawSample == nil && record.LostSamples != 1 {
t.Fatal("Expected a record with LostSamples 1, got", record.LostSamples)
}
}
}
// TestPerfReaderOverwritable writes one sample more than fits into a single
// page of an overwritable ring and checks that, after pausing, the surviving
// samples are returned newest first.
func TestPerfReaderOverwritable(t *testing.T) {
	const sampleSize = math.MaxUint8

	// Smallest buffer size.
	pageSize := os.Getpagesize()

	// Account for perf header (8) and size (4), align to 8 bytes as perf does.
	realSampleSize := internal.Align(sampleSize+8+4, 8)
	maxEvents := pageSize / realSampleSize

	// maxEvents samples fill the ring; the final extra sample overwrites the
	// first one.
	sampleSizes := make([]byte, 0, maxEvents+1)
	for i := 0; i <= maxEvents; i++ {
		sampleSizes = append(sampleSizes, sampleSize)
	}

	events := perfEventArray(t)
	rd, err := NewReaderWithOptions(events, pageSize, ReaderOptions{Overwritable: true})
	if err != nil {
		t.Fatal(err)
	}
	defer rd.Close()

	// Reading before pausing is rejected.
	_, err = rd.Read()
	qt.Assert(t, err, qt.ErrorIs, errMustBePaused)

	outputSamples(t, events, sampleSizes...)

	qt.Assert(t, rd.Pause(), qt.IsNil)
	rd.SetDeadline(time.Now())

	// IDs count down from the most recently written sample.
	for want := maxEvents; want > 0; want-- {
		got := checkRecord(t, rd)
		qt.Assert(t, got, qt.Equals, want)
	}
}
// TestPerfReaderOverwritableEmpty checks that reading from a paused, empty
// overwritable ring times out, and that the reader can be resumed afterwards.
func TestPerfReaderOverwritableEmpty(t *testing.T) {
	array := perfEventArray(t)

	opts := ReaderOptions{Overwritable: true}
	rd, err := NewReaderWithOptions(array, os.Getpagesize(), opts)
	if err != nil {
		t.Fatal(err)
	}
	defer rd.Close()

	if err := rd.Pause(); err != nil {
		t.Fatal(err)
	}

	rd.SetDeadline(time.Now().Add(4 * time.Millisecond))
	_, readErr := rd.Read()
	qt.Assert(t, errors.Is(readErr, os.ErrDeadlineExceeded), qt.IsTrue, qt.Commentf("expected os.ErrDeadlineExceeded"))

	if err := rd.Resume(); err != nil {
		t.Fatal(err)
	}
}
// TestPerfReaderClose checks that Close interrupts a blocked Read, can be
// called more than once, and makes later reads fail.
func TestPerfReaderClose(t *testing.T) {
events := perfEventArray(t)
rd, err := NewReader(events, 4096)
if err != nil {
t.Fatal(err)
}
defer rd.Close()
// Buffered so the reading goroutine can finish even if nobody receives.
errs := make(chan error, 1)
waiting := make(chan struct{})
go func() {
// Signal that the goroutine is running just before it starts reading.
close(waiting)
_, err := rd.Read()
errs <- err
}()
<-waiting
// Close should interrupt Read
if err := rd.Close(); err != nil {
t.Fatal(err)
}
select {
case <-errs:
case <-time.After(time.Second):
t.Fatal("Close doesn't interrupt Read")
}
// And we should be able to call it multiple times
if err := rd.Close(); err != nil {
t.Fatal(err)
}
if _, err := rd.Read(); err == nil {
t.Fatal("Read on a closed PerfReader doesn't return an error")
}
}
// TestCreatePerfEvent checks that a non-overwritable perf event can be
// created for CPU 0 with a watermark of 1.
func TestCreatePerfEvent(t *testing.T) {
	const (
		cpu       = 0
		watermark = 1
	)

	fd, err := createPerfEvent(cpu, watermark, false)
	if err != nil {
		t.Fatal("Can't create perf event:", err)
	}

	unix.Close(fd)
}
// TestReadRecord feeds readRecord a zero-valued header and expects it to be
// rejected as an unknown event type.
func TestReadRecord(t *testing.T) {
	var raw bytes.Buffer
	if err := binary.Write(&raw, internal.NativeEndian, &perfEventHeader{}); err != nil {
		t.Fatal(err)
	}

	var rec Record
	scratch := make([]byte, perfEventHeaderSize)

	err := readRecord(&raw, &rec, scratch, false)
	if !IsUnknownEvent(err) {
		t.Error("readRecord should return unknown event error, got", err)
	}
}
// TestPause walks a Reader through resume, pause, resume and close, checking
// that samples are only delivered while the Reader is not paused and that
// Pause/Resume report ErrClosed after Close.
func TestPause(t *testing.T) {
t.Parallel()
events := perfEventArray(t)
rd, err := NewReader(events, 4096)
if err != nil {
t.Fatal(err)
}
defer rd.Close()
// Reader is already unpaused by default. It should be idempotent.
if err = rd.Resume(); err != nil {
t.Fatal(err)
}
// Write a sample. The reader should read it.
prog := outputSamplesProg(t, events, 5)
ret, _, err := prog.Test(internal.EmptyBPFContext)
testutils.SkipIfNotSupported(t, err)
if err != nil || ret != 0 {
t.Fatal("Can't write sample")
}
if _, err := rd.Read(); err != nil {
t.Fatal(err)
}
// Pause. No notification should trigger.
if err = rd.Pause(); err != nil {
t.Fatal(err)
}
// Buffered so the goroutine can deliver its result without a receiver.
errChan := make(chan error, 1)
go func() {
// Read one notification then send any errors and exit.
_, err := rd.Read()
errChan <- err
}()
// Writing while paused is expected to fail in the program.
ret, _, err = prog.Test(internal.EmptyBPFContext)
if err == nil && ret == 0 {
t.Fatal("Unexpectedly wrote sample while paused")
} // else Success
select {
case err := <-errChan:
// Failure: Pause was unsuccessful.
t.Fatalf("received notification on paused reader: %s", err)
case <-time.After(readTimeout):
// Success
}
// Pause should be idempotent.
if err = rd.Pause(); err != nil {
t.Fatal(err)
}
// Resume. Now notifications should continue.
if err = rd.Resume(); err != nil {
t.Fatal(err)
}
ret, _, err = prog.Test(internal.EmptyBPFContext)
if err != nil || ret != 0 {
t.Fatal("Can't write sample")
}
// The goroutine started above is still blocked in Read and should now
// receive the sample.
select {
case err := <-errChan:
if err != nil {
t.Fatal(err)
} // else Success
case <-time.After(readTimeout):
t.Fatal("timed out waiting for notification after resume")
}
if err = rd.Close(); err != nil {
t.Fatal(err)
}
// Pause/Resume after close must fail with an error that wraps, but is not
// identical to, ErrClosed.
err = rd.Pause()
qt.Assert(t, err, qt.Not(qt.Equals), ErrClosed, qt.Commentf("returns unwrapped ErrClosed"))
qt.Assert(t, errors.Is(err, ErrClosed), qt.IsTrue, qt.Commentf("doesn't wrap ErrClosed"))
err = rd.Resume()
qt.Assert(t, err, qt.Not(qt.Equals), ErrClosed, qt.Commentf("returns unwrapped ErrClosed"))
qt.Assert(t, errors.Is(err, ErrClosed), qt.IsTrue, qt.Commentf("doesn't wrap ErrClosed"))
}
// BenchmarkReader measures emitting one 80 byte sample and reading it back
// with Read, which returns a fresh Record on every call.
func BenchmarkReader(b *testing.B) {
	array := perfEventArray(b)
	prog := outputSamplesProg(b, array, 80)

	rd, err := NewReader(array, 4096)
	if err != nil {
		b.Fatal(err)
	}
	defer rd.Close()

	ctx := internal.EmptyBPFContext

	b.ResetTimer()
	b.ReportAllocs()
	for i := 0; i < b.N; i++ {
		ret, _, err := prog.Test(ctx)
		if err != nil {
			b.Fatal(err)
		}
		if errno := syscall.Errno(-int32(ret)); errno != 0 {
			b.Fatal("Expected 0 as return value, got", errno)
		}

		if _, err = rd.Read(); err != nil {
			b.Fatal(err)
		}
	}
}
// BenchmarkReadInto measures emitting one 80 byte sample and reading it back
// with ReadInto, reusing a single Record across iterations.
func BenchmarkReadInto(b *testing.B) {
	array := perfEventArray(b)
	prog := outputSamplesProg(b, array, 80)

	rd, err := NewReader(array, 4096)
	if err != nil {
		b.Fatal(err)
	}
	defer rd.Close()

	ctx := internal.EmptyBPFContext

	b.ResetTimer()
	b.ReportAllocs()

	var rec Record
	for i := 0; i < b.N; i++ {
		// NB: Submitting samples into the perf event ring dominates
		// the benchmark time unfortunately.
		ret, _, err := prog.Test(ctx)
		if err != nil {
			b.Fatal(err)
		}
		if errno := syscall.Errno(-int32(ret)); errno != 0 {
			b.Fatal("Expected 0 as return value, got", errno)
		}

		if err := rd.ReadInto(&rec); err != nil {
			b.Fatal(err)
		}
	}
}
// This exists just to make the examples below nicer.
//
// It is a placeholder that returns a nil program and a nil map; real code
// would load a BPF program and its PerfEventArray here.
func bpfPerfEventOutputProgram() (*ebpf.Program, *ebpf.Map) {
return nil, nil
}
// ExampleReader submits a perf event using BPF,
// and then reads it in user space.
//
// The BPF will look something like this:
//
//	struct map events __section("maps") = {
//	  .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
//	};
//
//	__section("xdp") int output_single(void *ctx) {
//	  unsigned char buf[] = {
//	    1, 2, 3, 4, 5
//	  };
//
//	   return perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &buf[0], 5);
//	 }
//
// Also see BPF_F_CTXLEN_MASK if you want to sample packet data
// from SKB or XDP programs.
func ExampleReader() {
prog, events := bpfPerfEventOutputProgram()
defer prog.Close()
defer events.Close()
rd, err := NewReader(events, 4096)
if err != nil {
panic(err)
}
defer rd.Close()
// Writes out a sample with content 1,2,3,4,5
ret, _, err := prog.Test(internal.EmptyBPFContext)
if err != nil || ret != 0 {
panic("Can't write sample")
}
record, err := rd.Read()
if err != nil {
panic(err)
}
// The sample may be followed by padding bytes added for alignment.
fmt.Println("Sample:", record.RawSample)
}
// ReadInto allows reducing memory allocations by reusing a Record.
func ExampleReader_ReadInto() {
prog, events := bpfPerfEventOutputProgram()
defer prog.Close()
defer events.Close()
rd, err := NewReader(events, 4096)
if err != nil {
panic(err)
}
defer rd.Close()
for i := 0; i < 2; i++ {
// Write out two samples
ret, _, err := prog.Test(internal.EmptyBPFContext)
if err != nil || ret != 0 {
panic("Can't write sample")
}
}
// The same Record is handed to every ReadInto call so its sample buffer
// can be reused.
var rec Record
for i := 0; i < 2; i++ {
if err := rd.ReadInto(&rec); err != nil {
panic(err)
}
// Only the first five bytes are payload; anything after is padding.
fmt.Println("Sample:", rec.RawSample[:5])
}
}
// perfEventArray creates a PerfEventArray map for use in a test. The map is
// closed automatically when the test finishes.
func perfEventArray(tb testing.TB) *ebpf.Map {
	spec := &ebpf.MapSpec{
		Type: ebpf.PerfEventArray,
	}

	events, err := ebpf.NewMap(spec)
	if err != nil {
		tb.Fatal(err)
	}

	tb.Cleanup(func() { events.Close() })
	return events
}
@@ -0,0 +1,274 @@
package perf
import (
"errors"
"fmt"
"io"
"math"
"os"
"runtime"
"sync/atomic"
"unsafe"
"github.com/cilium/ebpf/internal/unix"
)
// perfEventRing is a page of metadata followed by
// a variable number of pages which form a ring buffer.
type perfEventRing struct {
// fd is the perf event file descriptor backing the ring. Set to -1 by Close.
fd int
// cpu is the CPU the perf event was opened on.
cpu int
// mmap is the whole mapping, metadata page included. Set to nil by Close.
mmap []byte
// ringReader reads records out of the data area of mmap. It is a forward
// or a reverse reader depending on whether the ring is overwritable.
ringReader
}
// newPerfEventRing opens a perf event on cpu and maps its ring buffer.
//
// perCPUBuffer is the requested data size in bytes, see perfBufferSize for
// the rounding applied. watermark must be smaller than perCPUBuffer.
// Overwritable rings are mapped read-only and read with a reverse reader, all
// others are mapped read-write and read with a forward reader.
//
// The returned ring carries a finalizer that closes it.
func newPerfEventRing(cpu, perCPUBuffer, watermark int, overwritable bool) (*perfEventRing, error) {
	if watermark >= perCPUBuffer {
		return nil, errors.New("watermark must be smaller than perCPUBuffer")
	}

	fd, err := createPerfEvent(cpu, watermark, overwritable)
	if err != nil {
		return nil, err
	}

	if err := unix.SetNonblock(fd, true); err != nil {
		unix.Close(fd)
		return nil, err
	}

	// Only non-overwritable rings need write access to the mapping.
	prot := unix.PROT_READ | unix.PROT_WRITE
	if overwritable {
		prot = unix.PROT_READ
	}

	mmap, err := unix.Mmap(fd, 0, perfBufferSize(perCPUBuffer), prot, unix.MAP_SHARED)
	if err != nil {
		unix.Close(fd)
		return nil, fmt.Errorf("can't mmap: %v", err)
	}

	// This relies on the fact that we allocate an extra metadata page,
	// and that the struct is smaller than an OS page.
	// This use of unsafe.Pointer isn't explicitly sanctioned by the
	// documentation, since a byte is smaller than the metadata struct.
	meta := (*unix.PerfEventMmapPage)(unsafe.Pointer(&mmap[0]))
	data := mmap[meta.Data_offset : meta.Data_offset+meta.Data_size]

	var reader ringReader
	if overwritable {
		reader = newReverseReader(meta, data)
	} else {
		reader = newForwardReader(meta, data)
	}

	ring := &perfEventRing{
		fd:         fd,
		cpu:        cpu,
		mmap:       mmap,
		ringReader: reader,
	}
	runtime.SetFinalizer(ring, (*perfEventRing).Close)

	return ring, nil
}
// perfBufferSize returns a valid mmap buffer size for use with perf_event_open (1+2^n pages)
//
// perCPUBuffer is first rounded up to whole pages, then to the next power of
// two number of pages, and finally one metadata page is added. The result is
// in bytes.
func perfBufferSize(perCPUBuffer int) int {
	page := os.Getpagesize()

	// Smallest whole number of pages that holds perCPUBuffer bytes.
	dataPages := (perCPUBuffer + page - 1) / page

	// Round up to the nearest power of two number of pages.
	exponent := math.Ceil(math.Log2(float64(dataPages)))
	dataPages = int(math.Pow(2, exponent))

	// One extra page for metadata.
	return (dataPages + 1) * page
}
// Close releases the file descriptor and the mapping of the ring and clears
// its finalizer. Errors from closing and unmapping are deliberately ignored.
//
// After Close, fd is -1 and mmap is nil.
func (ring *perfEventRing) Close() {
	runtime.SetFinalizer(ring, nil)

	fd, mapping := ring.fd, ring.mmap
	ring.fd, ring.mmap = -1, nil

	_ = unix.Close(fd)
	_ = unix.Munmap(mapping)
}
// createPerfEvent opens a software perf event of type BPF output on cpu and
// returns its file descriptor.
//
// watermark is the wakeup watermark passed to the kernel; 0 is replaced by 1.
// overwritable additionally requests backward writing. The descriptor is
// opened with close-on-exec set. On failure -1 and a wrapped error are
// returned.
func createPerfEvent(cpu, watermark int, overwritable bool) (int, error) {
	wakeup := watermark
	if wakeup == 0 {
		wakeup = 1
	}

	flags := unix.PerfBitWatermark
	if overwritable {
		flags |= unix.PerfBitWriteBackward
	}

	var attr unix.PerfEventAttr
	attr.Type = unix.PERF_TYPE_SOFTWARE
	attr.Config = unix.PERF_COUNT_SW_BPF_OUTPUT
	attr.Bits = uint64(flags)
	attr.Sample_type = unix.PERF_SAMPLE_RAW
	attr.Wakeup = uint32(wakeup)
	attr.Size = uint32(unsafe.Sizeof(attr))

	fd, err := unix.PerfEventOpen(&attr, -1, cpu, -1, unix.PERF_FLAG_FD_CLOEXEC)
	if err != nil {
		return -1, fmt.Errorf("can't create perf event: %w", err)
	}

	return fd, nil
}
// ringReader abstracts how records are read out of a perf ring. It is
// implemented by forwardReader and reverseReader.
type ringReader interface {
// loadHead refreshes the reader's view of the head position published by
// the kernel.
loadHead()
// size returns the length of the ring's data area in bytes.
size() int
// writeTail publishes how far the ring has been consumed. It is a no-op
// for the reverse reader.
writeTail()
// Read copies available bytes into p. It returns io.EOF together with the
// final bytes once everything loaded by loadHead has been consumed.
Read(p []byte) (int, error)
}
// forwardReader reads a ring from tail towards head and reports its progress
// back through the metadata page.
type forwardReader struct {
// meta is the metadata page shared with the kernel.
meta *unix.PerfEventMmapPage
// head is the last loaded write position; tail is the next read position.
// Both increase monotonically and are reduced with mask to index the ring.
head, tail uint64
// mask is cap(ring) - 1.
mask uint64
// ring is the data area of the mapping.
ring []byte
}
// newForwardReader creates a forwardReader over ring, starting from the head
// and tail positions currently stored in meta.
func newForwardReader(meta *unix.PerfEventMmapPage, ring []byte) *forwardReader {
	rr := new(forwardReader)
	rr.meta = meta
	rr.ring = ring
	// cap is always a power of two
	rr.mask = uint64(cap(ring) - 1)
	rr.head = atomic.LoadUint64(&meta.Data_head)
	rr.tail = atomic.LoadUint64(&meta.Data_tail)
	return rr
}
// loadHead atomically loads the head position from the metadata page. Data up
// to this position becomes available to Read.
func (rr *forwardReader) loadHead() {
rr.head = atomic.LoadUint64(&rr.meta.Data_head)
}
// size returns the length of the ring's data area in bytes.
func (rr *forwardReader) size() int {
return len(rr.ring)
}
// writeTail atomically stores the reader's tail position into the metadata
// page.
func (rr *forwardReader) writeTail() {
// Commit the new tail. This lets the kernel know that
// the ring buffer has been consumed.
atomic.StoreUint64(&rr.meta.Data_tail, rr.tail)
}
// Read copies bytes from the ring into p and advances the tail.
//
// A single call never crosses the end of the ring, so fewer than len(p) bytes
// may be returned even if more data is available; callers are expected to
// call Read again. io.EOF is returned together with the final bytes once the
// tail has caught up with the loaded head.
func (rr *forwardReader) Read(p []byte) (int, error) {
	offset := int(rr.tail & rr.mask)

	count := len(p)
	// Stop at the physical end of the ring.
	if untilWrap := cap(rr.ring) - offset; untilWrap < count {
		count = untilWrap
	}
	// Stop at the loaded head.
	if available := int(rr.head - rr.tail); available < count {
		count = available
	}

	copy(p, rr.ring[offset:offset+count])
	rr.tail += uint64(count)

	if rr.tail != rr.head {
		return count, nil
	}
	return count, io.EOF
}
// reverseReader reads an overwritable ring, starting at the most recent head
// and moving towards older data.
type reverseReader struct {
// meta is the metadata page shared with the kernel.
meta *unix.PerfEventMmapPage
// head is the position where the kernel last wrote data.
head uint64
// read is the position we read the next data from. Updated as reads are made.
read uint64
// tail is the end of the ring buffer. No reads must be made past it.
tail uint64
// mask is cap(ring) - 1.
mask uint64
// ring is the data area of the mapping.
ring []byte
}
// newReverseReader creates a reverseReader over ring and loads the current
// head position from meta.
func newReverseReader(meta *unix.PerfEventMmapPage, ring []byte) *reverseReader {
	rr := new(reverseReader)
	rr.meta, rr.ring = meta, ring
	rr.mask = uint64(cap(ring) - 1)

	rr.loadHead()
	return rr
}
// loadHead loads the current head from the metadata page and restricts
// reading to the data written since the previous call, capped at one full
// ring.
func (rr *reverseReader) loadHead() {
// The diagram below represents an overwritable perf ring buffer:
//
//	   head     read         tail
//	    |        |            |
//	    V        V            V
//	+---+--------+------------+---------+--------+
//	|   |H-D....D|H-C........C|H-B.....B|H-A....A|
//	+---+--------+------------+---------+--------+
//	<--Write from right to left
//	                    Read from left to right-->
//	(H means header)
//
// The buffer is read left to right beginning from head to tail.
// [head, read) is the read portion of the buffer, [read, tail) the unread one.
// read is adjusted as we progress through the buffer.
// Avoid reading sample D multiple times by discarding unread samples C, B, A.
// The previous head becomes the boundary that reads must not cross.
rr.tail = rr.head
// Get the new head and start reading from it.
rr.head = atomic.LoadUint64(&rr.meta.Data_head)
rr.read = rr.head
if rr.tail-rr.head > uint64(cap(rr.ring)) {
// ring has been fully written, only permit at most cap(rr.ring)
// bytes to be read.
rr.tail = rr.head + uint64(cap(rr.ring))
}
}
// size returns the length of the ring's data area in bytes.
func (rr *reverseReader) size() int {
return len(rr.ring)
}
// writeTail does nothing; it exists to satisfy ringReader.
func (rr *reverseReader) writeTail() {
// We do not care about tail for an overwritable perf buffer.
// So, this function is a noop.
}
// Read copies unread ring data into p and advances the read position.
//
// A single call never crosses the physical end of the ring and never goes
// past tail, so fewer than len(p) bytes may be returned; callers that need
// more must call Read again. io.EOF is returned together with the final
// bytes once the read position reaches tail.
func (rr *reverseReader) Read(p []byte) (int, error) {
	offset := int(rr.read & rr.mask)
	count := len(p)

	// Stop at the physical end of the ring; the next call continues at 0.
	if untilWrap := cap(rr.ring) - offset; untilWrap < count {
		count = untilWrap
	}

	// Never hand out more than what is left before tail.
	if unread := int(rr.tail - rr.read); unread < count {
		count = unread
	}

	copy(p, rr.ring[offset:offset+count])
	rr.read += uint64(count)

	if rr.read != rr.tail {
		return count, nil
	}
	return count, io.EOF
}
@@ -0,0 +1,182 @@
package perf
import (
"io"
"os"
"testing"
"github.com/cilium/ebpf/internal/unix"
qt "github.com/frankban/quicktest"
)
// TestRingBufferReader checks forwardReader.Read on a two byte ring, both
// when the data is contiguous and when it wraps around the end of the ring.
func TestRingBufferReader(t *testing.T) {
	type read struct {
		want    []byte
		wantErr error
	}

	scenarios := []struct {
		offset int
		reads  []read
	}{
		{
			// Data starts at the beginning of the ring: one read drains it.
			offset: 0,
			reads: []read{
				{[]byte{0, 1}, io.EOF},
				{[]byte{}, io.EOF},
			},
		},
		{
			// Wrapping read: data starts at the last byte of the ring.
			offset: 1,
			reads: []read{
				{[]byte{1}, nil},
				{[]byte{0}, io.EOF},
				{[]byte{}, io.EOF},
			},
		},
	}

	for _, sc := range scenarios {
		ring := makeForwardRing(2, sc.offset)
		for _, rd := range sc.reads {
			checkRead(t, ring, rd.want, rd.wantErr)
		}
	}
}
// TestRingBufferReverseReader checks reverseReader.Read and loadHead on a
// four byte ring holding the values 0..3.
func TestRingBufferReverseReader(t *testing.T) {
	// Scenario 1: the whole ring is readable and head is at offset 2.
	//
	//    [0 1 2 3]
	//         ^
	//         |
	//         head
	//
	// Reading starts at offset 2, so the first read stops at the end of the
	// ring and yields [2, 3]. The second read has wrapped around and yields
	// the remaining [0, 1].
	simple := makeReverseRing(4, 2)
	checkRead(t, simple, []byte{2, 3}, nil)
	checkRead(t, simple, []byte{0, 1}, io.EOF)
	checkRead(t, simple, []byte{}, io.EOF)

	// Scenario 2: after a partial read the kernel writes more data, and
	// reading must stop at the previous head.
	//
	//    [0 1 2 3]
	//     ^   ^
	//     |   |
	//     |   +---previous_head
	//     head
	partial := makeReverseRing(4, 2)
	checkReadBuffer(t, partial, []byte{2}, nil, make([]byte, 1))
	// {3} would be next, but it is deliberately left unread.

	// Simulate the kernel writing two more bytes.
	partial.meta.Data_head -= 2
	partial.loadHead()

	// Only the new bytes are returned; {3} has been dropped.
	checkRead(t, partial, []byte{0, 1}, io.EOF)

	// Scenario 3: the kernel has written a full ring's worth of data since
	// the last snapshot, so head and previous head share the same offset and
	// the whole buffer is readable again.
	//
	//    [0 1 2 3]
	//         ^
	//         |
	//         +---previous_head
	//         |
	//         head
	//
	// The expected reads are [2, 3] followed by [0, 1].
	overwritten := makeReverseRing(4, 2)
	overwritten.meta.Data_head -= overwritten.meta.Data_size
	overwritten.loadHead()
	checkRead(t, overwritten, []byte{2, 3}, nil)
	checkRead(t, overwritten, []byte{0, 1}, io.EOF)
}
// checkRead asserts that the next call to r.Read yields want and wantErr.
//
// Read is called with a buffer that is one byte larger than want so
// that corner cases around wrapping can be checked. Use
// checkReadBuffer if that is not desired.
func checkRead(t *testing.T, r io.Reader, want []byte, wantErr error) {
	// Mark this function as a helper, like checkReadBuffer does, so that a
	// failed assertion is reported at the calling test line.
	t.Helper()

	checkReadBuffer(t, r, want, wantErr, make([]byte, len(want)+1))
}
// checkReadBuffer calls r.Read once with buf and asserts that the returned
// error equals wantErr and that the bytes read equal want.
func checkReadBuffer(t *testing.T, r io.Reader, want []byte, wantErr error, buf []byte) {
	t.Helper()

	read, gotErr := r.Read(buf)
	got := buf[:read]

	qt.Assert(t, gotErr, qt.Equals, wantErr)
	qt.Assert(t, got, qt.DeepEquals, want)
}
// makeBuffer returns a slice of size bytes in which every element holds its
// own index, truncated to a byte.
func makeBuffer(size int) []byte {
	data := make([]byte, size)
	for idx := 0; idx < size; idx++ {
		data[idx] = byte(idx)
	}
	return data
}
// makeReverseRing builds a reverseReader over a buffer of size bytes filled
// by makeBuffer. Data_head is set to -(size+offset) in unsigned arithmetic.
//
// It panics if size is neither zero nor a power of two.
func makeReverseRing(size, offset int) *reverseReader {
	powerOfTwo := size&(size-1) == 0
	if size != 0 && !powerOfTwo {
		panic("size must be power of two")
	}

	var meta unix.PerfEventMmapPage
	meta.Data_head = 0 - uint64(size) - uint64(offset)
	meta.Data_tail = 0 // never written by the kernel
	meta.Data_size = uint64(size)

	return newReverseReader(&meta, makeBuffer(size))
}
// makeForwardRing builds a forwardReader over a buffer of size bytes filled
// by makeBuffer, with Data_tail set to offset and Data_head set to
// size+offset.
//
// It panics if size is neither zero nor a power of two.
func makeForwardRing(size, offset int) *forwardReader {
	powerOfTwo := size&(size-1) == 0
	if size != 0 && !powerOfTwo {
		panic("size must be power of two")
	}

	var meta unix.PerfEventMmapPage
	meta.Data_head = uint64(size + offset)
	meta.Data_tail = uint64(offset)
	meta.Data_size = uint64(size)

	return newForwardReader(&meta, makeBuffer(size))
}
// TestPerfEventRing checks the sizing rules of newPerfEventRing: a watermark
// that is greater than or equal to the buffer size is rejected, and accepted
// buffer sizes result in a ring that is at least as large as requested and
// spans a power-of-two number of pages.
func TestPerfEventRing(t *testing.T) {
	pageSize := os.Getpagesize()

	// Both forward and overwritable rings are exercised for every case.
	modes := []bool{false, true}

	check := func(buffer, watermark int, overwritable bool) {
		// Report failures at the call site so that the failing
		// combination of arguments can be identified.
		t.Helper()

		ring, err := newPerfEventRing(0, buffer, watermark, overwritable)
		if err != nil {
			t.Fatal(err)
		}

		size := ring.size()

		// Ring size should be at least as big as buffer
		if size < buffer {
			t.Fatalf("ring size %d smaller than buffer %d", size, buffer)
		}

		// Ring size should be of the form 2^n pages (meta page has already been removed)
		if size%pageSize != 0 {
			t.Fatalf("ring size %d not whole number of pages (pageSize %d)", size, pageSize)
		}
		nPages := size / pageSize
		if nPages&(nPages-1) != 0 {
			t.Fatalf("ring size %d (%d pages) not a power of two pages (pageSize %d)", size, nPages, pageSize)
		}
	}

	// watermark > buffer
	for _, overwritable := range modes {
		if _, err := newPerfEventRing(0, 8192, 8193, overwritable); err == nil {
			t.Fatal("watermark > buffer allowed")
		}
	}

	// watermark == buffer
	for _, overwritable := range modes {
		if _, err := newPerfEventRing(0, 8192, 8192, overwritable); err == nil {
			t.Fatal("watermark == buffer allowed")
		}
	}

	// buffer not a power of two, watermark < buffer
	for _, overwritable := range modes {
		check(8193, 8192, overwritable)
	}

	// large buffer not a multiple of page size at all (prime)
	for _, overwritable := range modes {
		check(65537, 8192, overwritable)
	}
}