whatcanGOwrong

This commit is contained in:
2024-09-19 21:38:24 -04:00
commit d0ae4d841d
17908 changed files with 4096831 additions and 0 deletions
@@ -0,0 +1,6 @@
// Package ringbuf allows interacting with Linux BPF ring buffer.
//
// BPF allows submitting custom events to a BPF ring buffer map set up
// by userspace. This is very useful to push things like packet samples
// from BPF to a daemon running in user space.
package ringbuf
@@ -0,0 +1,226 @@
package ringbuf
import (
"encoding/binary"
"errors"
"fmt"
"io"
"os"
"sync"
"time"
"github.com/cilium/ebpf"
"github.com/cilium/ebpf/internal"
"github.com/cilium/ebpf/internal/epoll"
"github.com/cilium/ebpf/internal/unix"
)
var (
ErrClosed = os.ErrClosed
errEOR = errors.New("end of ring")
errDiscard = errors.New("sample discarded")
errBusy = errors.New("sample not committed yet")
)
var ringbufHeaderSize = binary.Size(ringbufHeader{})
// ringbufHeader from 'struct bpf_ringbuf_hdr' in kernel/bpf/ringbuf.c
type ringbufHeader struct {
Len uint32
PgOff uint32
}
func (rh *ringbufHeader) isBusy() bool {
return rh.Len&unix.BPF_RINGBUF_BUSY_BIT != 0
}
func (rh *ringbufHeader) isDiscard() bool {
return rh.Len&unix.BPF_RINGBUF_DISCARD_BIT != 0
}
func (rh *ringbufHeader) dataLen() int {
return int(rh.Len & ^uint32(unix.BPF_RINGBUF_BUSY_BIT|unix.BPF_RINGBUF_DISCARD_BIT))
}
type Record struct {
RawSample []byte
}
// Read a record from an event ring.
//
// buf must be at least ringbufHeaderSize bytes long.
func readRecord(rd *ringbufEventRing, rec *Record, buf []byte) error {
rd.loadConsumer()
buf = buf[:ringbufHeaderSize]
if _, err := io.ReadFull(rd, buf); err == io.EOF {
return errEOR
} else if err != nil {
return fmt.Errorf("read event header: %w", err)
}
header := ringbufHeader{
internal.NativeEndian.Uint32(buf[0:4]),
internal.NativeEndian.Uint32(buf[4:8]),
}
if header.isBusy() {
// the next sample in the ring is not committed yet so we
// exit without storing the reader/consumer position
// and start again from the same position.
return errBusy
}
/* read up to 8 byte alignment */
dataLenAligned := uint64(internal.Align(header.dataLen(), 8))
if header.isDiscard() {
// when the record header indicates that the data should be
// discarded, we skip it by just updating the consumer position
// to the next record instead of normal Read() to avoid allocating data
// and reading/copying from the ring (which normally keeps track of the
// consumer position).
rd.skipRead(dataLenAligned)
rd.storeConsumer()
return errDiscard
}
if cap(rec.RawSample) < int(dataLenAligned) {
rec.RawSample = make([]byte, dataLenAligned)
} else {
rec.RawSample = rec.RawSample[:dataLenAligned]
}
if _, err := io.ReadFull(rd, rec.RawSample); err != nil {
return fmt.Errorf("read sample: %w", err)
}
rd.storeConsumer()
rec.RawSample = rec.RawSample[:header.dataLen()]
return nil
}
// Reader allows reading bpf_ringbuf_output
// from user space.
type Reader struct {
poller *epoll.Poller
// mu protects read/write access to the Reader structure
mu sync.Mutex
ring *ringbufEventRing
epollEvents []unix.EpollEvent
header []byte
haveData bool
deadline time.Time
}
// NewReader creates a new BPF ringbuf reader.
func NewReader(ringbufMap *ebpf.Map) (*Reader, error) {
if ringbufMap.Type() != ebpf.RingBuf {
return nil, fmt.Errorf("invalid Map type: %s", ringbufMap.Type())
}
maxEntries := int(ringbufMap.MaxEntries())
if maxEntries == 0 || (maxEntries&(maxEntries-1)) != 0 {
return nil, fmt.Errorf("ringbuffer map size %d is zero or not a power of two", maxEntries)
}
poller, err := epoll.New()
if err != nil {
return nil, err
}
if err := poller.Add(ringbufMap.FD(), 0); err != nil {
poller.Close()
return nil, err
}
ring, err := newRingBufEventRing(ringbufMap.FD(), maxEntries)
if err != nil {
poller.Close()
return nil, fmt.Errorf("failed to create ringbuf ring: %w", err)
}
return &Reader{
poller: poller,
ring: ring,
epollEvents: make([]unix.EpollEvent, 1),
header: make([]byte, ringbufHeaderSize),
}, nil
}
// Close frees resources used by the reader.
//
// It interrupts calls to Read.
func (r *Reader) Close() error {
if err := r.poller.Close(); err != nil {
if errors.Is(err, os.ErrClosed) {
return nil
}
return err
}
// Acquire the lock. This ensures that Read isn't running.
r.mu.Lock()
defer r.mu.Unlock()
if r.ring != nil {
r.ring.Close()
r.ring = nil
}
return nil
}
// SetDeadline controls how long Read and ReadInto will block waiting for samples.
//
// Passing a zero time.Time will remove the deadline.
func (r *Reader) SetDeadline(t time.Time) {
r.mu.Lock()
defer r.mu.Unlock()
r.deadline = t
}
// Read the next record from the BPF ringbuf.
//
// Returns os.ErrClosed if Close is called on the Reader, or os.ErrDeadlineExceeded
// if a deadline was set.
func (r *Reader) Read() (Record, error) {
var rec Record
return rec, r.ReadInto(&rec)
}
// ReadInto is like Read except that it allows reusing Record and associated buffers.
func (r *Reader) ReadInto(rec *Record) error {
r.mu.Lock()
defer r.mu.Unlock()
if r.ring == nil {
return fmt.Errorf("ringbuffer: %w", ErrClosed)
}
for {
if !r.haveData {
_, err := r.poller.Wait(r.epollEvents[:cap(r.epollEvents)], r.deadline)
if err != nil {
return err
}
r.haveData = true
}
for {
err := readRecord(r.ring, rec, r.header)
if err == errBusy || err == errDiscard {
continue
}
if err == errEOR {
r.haveData = false
break
}
return err
}
}
}
@@ -0,0 +1,328 @@
package ringbuf
import (
"errors"
"os"
"syscall"
"testing"
"time"
"github.com/cilium/ebpf"
"github.com/cilium/ebpf/asm"
"github.com/cilium/ebpf/internal"
"github.com/cilium/ebpf/internal/testutils"
"github.com/cilium/ebpf/internal/testutils/fdtrace"
"github.com/google/go-cmp/cmp"
)
func TestMain(m *testing.M) {
fdtrace.TestMain(m)
}
func TestRingbufReader(t *testing.T) {
testutils.SkipOnOldKernel(t, "5.8", "BPF ring buffer")
readerTests := []struct {
name string
messages []int
want map[int][]byte
}{
{
name: "send one short sample",
messages: []int{5},
want: map[int][]byte{
5: {1, 2, 3, 4, 4},
},
},
{
name: "send three short samples, the second is discarded",
messages: []int{5, 10, 15},
want: map[int][]byte{
5: {1, 2, 3, 4, 4},
15: {1, 2, 3, 4, 4, 3, 2, 1, 1, 2, 3, 4, 4, 3, 2},
},
},
}
for _, tt := range readerTests {
t.Run(tt.name, func(t *testing.T) {
prog, events := mustOutputSamplesProg(t, 0, tt.messages...)
rd, err := NewReader(events)
if err != nil {
t.Fatal(err)
}
defer rd.Close()
ret, _, err := prog.Test(internal.EmptyBPFContext)
testutils.SkipIfNotSupported(t, err)
if err != nil {
t.Fatal(err)
}
if errno := syscall.Errno(-int32(ret)); errno != 0 {
t.Fatal("Expected 0 as return value, got", errno)
}
raw := make(map[int][]byte)
for len(raw) < len(tt.want) {
record, err := rd.Read()
if err != nil {
t.Fatal("Can't read samples:", err)
}
raw[len(record.RawSample)] = record.RawSample
}
if diff := cmp.Diff(tt.want, raw); diff != "" {
t.Errorf("Read samples mismatch (-want +got):\n%s", diff)
}
})
}
}
func outputSamplesProg(flags int32, sampleSizes ...int) (*ebpf.Program, *ebpf.Map, error) {
events, err := ebpf.NewMap(&ebpf.MapSpec{
Type: ebpf.RingBuf,
MaxEntries: 4096,
})
if err != nil {
return nil, nil, err
}
var maxSampleSize int
for _, sampleSize := range sampleSizes {
if sampleSize > maxSampleSize {
maxSampleSize = sampleSize
}
}
insns := asm.Instructions{
asm.LoadImm(asm.R0, 0x0102030404030201, asm.DWord),
asm.Mov.Reg(asm.R9, asm.R1),
}
bufDwords := (maxSampleSize / 8) + 1
for i := 0; i < bufDwords; i++ {
insns = append(insns,
asm.StoreMem(asm.RFP, int16(i+1)*-8, asm.R0, asm.DWord),
)
}
for sampleIdx, sampleSize := range sampleSizes {
insns = append(insns,
asm.LoadMapPtr(asm.R1, events.FD()),
asm.Mov.Imm(asm.R2, int32(sampleSize)),
asm.Mov.Imm(asm.R3, int32(0)),
asm.FnRingbufReserve.Call(),
asm.JEq.Imm(asm.R0, 0, "exit"),
asm.Mov.Reg(asm.R5, asm.R0),
)
for i := 0; i < sampleSize; i++ {
insns = append(insns,
asm.LoadMem(asm.R4, asm.RFP, int16(i+1)*-1, asm.Byte),
asm.StoreMem(asm.R5, int16(i), asm.R4, asm.Byte),
)
}
// discard every even sample
if sampleIdx&1 != 0 {
insns = append(insns,
asm.Mov.Reg(asm.R1, asm.R5),
asm.Mov.Imm(asm.R2, flags),
asm.FnRingbufDiscard.Call(),
)
} else {
insns = append(insns,
asm.Mov.Reg(asm.R1, asm.R5),
asm.Mov.Imm(asm.R2, flags),
asm.FnRingbufSubmit.Call(),
)
}
}
insns = append(insns,
asm.Mov.Imm(asm.R0, int32(0)).WithSymbol("exit"),
asm.Return(),
)
prog, err := ebpf.NewProgram(&ebpf.ProgramSpec{
License: "MIT",
Type: ebpf.XDP,
Instructions: insns,
})
if err != nil {
events.Close()
return nil, nil, err
}
return prog, events, nil
}
func mustOutputSamplesProg(tb testing.TB, flags int32, sampleSizes ...int) (*ebpf.Program, *ebpf.Map) {
tb.Helper()
prog, events, err := outputSamplesProg(flags, sampleSizes...)
if err != nil {
tb.Fatal(err)
}
tb.Cleanup(func() {
prog.Close()
events.Close()
})
return prog, events
}
func TestReaderBlocking(t *testing.T) {
testutils.SkipOnOldKernel(t, "5.8", "BPF ring buffer")
prog, events := mustOutputSamplesProg(t, 0, 5)
ret, _, err := prog.Test(internal.EmptyBPFContext)
testutils.SkipIfNotSupported(t, err)
if err != nil {
t.Fatal(err)
}
if errno := syscall.Errno(-int32(ret)); errno != 0 {
t.Fatal("Expected 0 as return value, got", errno)
}
rd, err := NewReader(events)
if err != nil {
t.Fatal(err)
}
defer rd.Close()
if _, err := rd.Read(); err != nil {
t.Fatal("Can't read first sample:", err)
}
errs := make(chan error, 1)
go func() {
_, err := rd.Read()
errs <- err
}()
select {
case err := <-errs:
t.Fatal("Read returns error instead of blocking:", err)
case <-time.After(100 * time.Millisecond):
}
// Close should interrupt blocking Read
if err := rd.Close(); err != nil {
t.Fatal(err)
}
select {
case err := <-errs:
if !errors.Is(err, ErrClosed) {
t.Fatal("Expected os.ErrClosed from interrupted Read, got:", err)
}
case <-time.After(time.Second):
t.Fatal("Close doesn't interrupt Read")
}
// And we should be able to call it multiple times
if err := rd.Close(); err != nil {
t.Fatal(err)
}
if _, err := rd.Read(); !errors.Is(err, ErrClosed) {
t.Fatal("Second Read on a closed RingbufReader doesn't return ErrClosed")
}
}
func TestReaderSetDeadline(t *testing.T) {
testutils.SkipOnOldKernel(t, "5.8", "BPF ring buffer")
_, events := mustOutputSamplesProg(t, 0, 5)
rd, err := NewReader(events)
if err != nil {
t.Fatal(err)
}
defer rd.Close()
rd.SetDeadline(time.Now().Add(-time.Second))
if _, err := rd.Read(); !errors.Is(err, os.ErrDeadlineExceeded) {
t.Error("Expected os.ErrDeadlineExceeded from first Read, got:", err)
}
if _, err := rd.Read(); !errors.Is(err, os.ErrDeadlineExceeded) {
t.Error("Expected os.ErrDeadlineExceeded from second Read, got:", err)
}
}
func BenchmarkReader(b *testing.B) {
testutils.SkipOnOldKernel(b, "5.8", "BPF ring buffer")
readerBenchmarks := []struct {
name string
flags int32
}{
{
name: "normal epoll with timeout -1",
},
}
for _, bm := range readerBenchmarks {
b.Run(bm.name, func(b *testing.B) {
prog, events := mustOutputSamplesProg(b, bm.flags, 80)
rd, err := NewReader(events)
if err != nil {
b.Fatal(err)
}
defer rd.Close()
buf := internal.EmptyBPFContext
b.ResetTimer()
b.ReportAllocs()
for i := 0; i < b.N; i++ {
ret, _, err := prog.Test(buf)
if err != nil {
b.Fatal(err)
} else if errno := syscall.Errno(-int32(ret)); errno != 0 {
b.Fatal("Expected 0 as return value, got", errno)
}
_, err = rd.Read()
if err != nil {
b.Fatal("Can't read samples:", err)
}
}
})
}
}
func BenchmarkReadInto(b *testing.B) {
testutils.SkipOnOldKernel(b, "5.8", "BPF ring buffer")
prog, events := mustOutputSamplesProg(b, 0, 80)
rd, err := NewReader(events)
if err != nil {
b.Fatal(err)
}
defer rd.Close()
buf := internal.EmptyBPFContext
b.ResetTimer()
b.ReportAllocs()
var rec Record
for i := 0; i < b.N; i++ {
ret, _, err := prog.Test(buf)
if err != nil {
b.Fatal(err)
} else if errno := syscall.Errno(-int32(ret)); errno != 0 {
b.Fatal("Expected 0 as return value, got", errno)
}
if err := rd.ReadInto(&rec); err != nil {
b.Fatal("Can't read samples:", err)
}
}
}
@@ -0,0 +1,109 @@
package ringbuf
import (
"fmt"
"io"
"os"
"runtime"
"sync/atomic"
"unsafe"
"github.com/cilium/ebpf/internal/unix"
)
type ringbufEventRing struct {
prod []byte
cons []byte
*ringReader
}
func newRingBufEventRing(mapFD, size int) (*ringbufEventRing, error) {
cons, err := unix.Mmap(mapFD, 0, os.Getpagesize(), unix.PROT_READ|unix.PROT_WRITE, unix.MAP_SHARED)
if err != nil {
return nil, fmt.Errorf("can't mmap consumer page: %w", err)
}
prod, err := unix.Mmap(mapFD, (int64)(os.Getpagesize()), os.Getpagesize()+2*size, unix.PROT_READ, unix.MAP_SHARED)
if err != nil {
_ = unix.Munmap(cons)
return nil, fmt.Errorf("can't mmap data pages: %w", err)
}
cons_pos := (*uint64)(unsafe.Pointer(&cons[0]))
prod_pos := (*uint64)(unsafe.Pointer(&prod[0]))
ring := &ringbufEventRing{
prod: prod,
cons: cons,
ringReader: newRingReader(cons_pos, prod_pos, prod[os.Getpagesize():]),
}
runtime.SetFinalizer(ring, (*ringbufEventRing).Close)
return ring, nil
}
func (ring *ringbufEventRing) Close() {
runtime.SetFinalizer(ring, nil)
_ = unix.Munmap(ring.prod)
_ = unix.Munmap(ring.cons)
ring.prod = nil
ring.cons = nil
}
type ringReader struct {
// These point into mmap'ed memory and must be accessed atomically.
prod_pos, cons_pos *uint64
cons uint64
mask uint64
ring []byte
}
func newRingReader(cons_ptr, prod_ptr *uint64, ring []byte) *ringReader {
return &ringReader{
prod_pos: prod_ptr,
cons_pos: cons_ptr,
cons: atomic.LoadUint64(cons_ptr),
// cap is always a power of two
mask: uint64(cap(ring)/2 - 1),
ring: ring,
}
}
func (rr *ringReader) loadConsumer() {
rr.cons = atomic.LoadUint64(rr.cons_pos)
}
func (rr *ringReader) storeConsumer() {
atomic.StoreUint64(rr.cons_pos, rr.cons)
}
// clamp delta to 'end' if 'start+delta' is beyond 'end'
func clamp(start, end, delta uint64) uint64 {
if remainder := end - start; delta > remainder {
return remainder
}
return delta
}
func (rr *ringReader) skipRead(skipBytes uint64) {
rr.cons += clamp(rr.cons, atomic.LoadUint64(rr.prod_pos), skipBytes)
}
func (rr *ringReader) Read(p []byte) (int, error) {
prod := atomic.LoadUint64(rr.prod_pos)
n := clamp(rr.cons, prod, uint64(len(p)))
start := rr.cons & rr.mask
copy(p, rr.ring[start:start+n])
rr.cons += n
if prod == rr.cons {
return int(n), io.EOF
}
return int(n), nil
}
@@ -0,0 +1,67 @@
package ringbuf
import (
"bytes"
"io"
"testing"
)
func TestRingBufferReader(t *testing.T) {
buf := make([]byte, 2)
ring := makeRing(2, 0)
n, err := ring.Read(buf)
if err != io.EOF {
t.Error("Expected io.EOF, got", err)
}
if n != 2 {
t.Errorf("Expected to read 2 bytes, got %d", n)
}
if !bytes.Equal(buf, []byte{0, 1}) {
t.Error("Expected [0, 1], got", buf)
}
n, err = ring.Read(buf)
if err != io.EOF {
t.Error("Expected io.EOF, got", err)
}
if n != 0 {
t.Error("Expected to read 0 bytes, got", n)
}
buf = make([]byte, 4)
ring = makeRing(4, 4)
n, err = io.ReadFull(ring, buf)
if err != nil {
t.Error("Expected nil, got", err)
}
if n != 4 {
t.Errorf("Expected to read 4 bytes, got %d", n)
}
if !bytes.Equal(buf, []byte{0, 1, 2, 3}) {
t.Error("Expected [0, 1, 2, 3], got", buf)
}
n, err = ring.Read(buf)
if err != io.EOF {
t.Error("Expected io.EOF, got", err)
}
if n != 0 {
t.Error("Expected to read 0 bytes, got", n)
}
}
func makeRing(size, offset int) *ringReader {
if size != 0 && (size&(size-1)) != 0 {
panic("size must be power of two")
}
ring := make([]byte, 2*size)
for i := range ring {
ring[i] = byte(i)
}
consumer := uint64(offset)
producer := uint64(len(ring)/2 + offset)
return newRingReader(&consumer, &producer, ring)
}