Benchmarking sync/atomic, a low-level package for atomic memory access in Go
sync/atomic is a low-level package for atomically reading and writing memory in Go. On arm64 it is implemented with CPU instructions that manipulate memory atomically, such as LDADDALD. When reading and writing values from multiple goroutines, the usual approach is to communicate over a channel or to lock with sync.Mutex, so I had never had occasion to call this package's functions, which require careful handling. However, it is used by libraries such as the Prometheus Go client, and I was curious how much of a difference it makes, so I ran some benchmarks.
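For reference, here is roughly what calling the package looks like. A minimal sketch using only the standard library; since Go 1.19 the package also offers typed wrappers such as atomic.Int64 that bundle the same operations as methods:

package main

import (
	"fmt"
	"sync/atomic"
)

func main() {
	// The package-level functions operate on an addressable int64.
	var n int64
	atomic.StoreInt64(&n, 10)
	atomic.AddInt64(&n, 5)
	fmt.Println(atomic.LoadInt64(&n)) // 15

	// The typed wrapper (Go 1.19+) is harder to misuse: the value
	// can no longer be accessed non-atomically by accident.
	var c atomic.Int64
	c.Store(10)
	c.Add(5)
	fmt.Println(c.Load()) // 15
}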
$ go version
go version go1.23.0 darwin/arm64
$ system_profiler SPHardwareDataType
Hardware:

    Hardware Overview:

      Model Name: MacBook Air
      Model Identifier: Mac14,2
      Model Number: Z15Y001BPJ/A
      Chip: Apple M2
      Total Number of Cores: 8 (4 performance and 4 efficiency)
      Memory: 16 GB
      ...
Load
Load a value from several goroutines using atomic.LoadInt64, sync.RWMutex, and a channel.
package main

import (
	"fmt"
	"sync"
	"sync/atomic"
	"testing"
)

func BenchmarkAtomicLoad(b *testing.B) {
	var x int64
	for _, numGoroutines := range []int{1, 2, 4, 8, 16} {
		b.Run(fmt.Sprintf("goroutines-%d", numGoroutines), func(b *testing.B) {
			var wg sync.WaitGroup
			for i := 0; i < numGoroutines; i++ {
				wg.Add(1)
				go func() {
					defer wg.Done()
					// Each goroutine performs its share of the b.N loads.
					for j := 0; j < b.N/numGoroutines; j++ {
						atomic.LoadInt64(&x)
					}
				}()
			}
			wg.Wait()
		})
	}
}

func BenchmarkMutexLoad(b *testing.B) {
	var x int64
	var mu sync.RWMutex
	for _, numGoroutines := range []int{1, 2, 4, 8, 16} {
		b.Run(fmt.Sprintf("goroutines-%d", numGoroutines), func(b *testing.B) {
			var wg sync.WaitGroup
			for i := 0; i < numGoroutines; i++ {
				wg.Add(1)
				go func() {
					defer wg.Done()
					for j := 0; j < b.N/numGoroutines; j++ {
						mu.RLock()
						_ = x
						mu.RUnlock()
					}
				}()
			}
			wg.Wait()
		})
	}
}

func BenchmarkChannelLoad(b *testing.B) {
	var x int64
	for _, numGoroutines := range []int{1, 2, 4, 8, 16} {
		b.Run(fmt.Sprintf("goroutines-%d", numGoroutines), func(b *testing.B) {
			var wg sync.WaitGroup
			// Fill the channel up front so receives never block,
			// and exclude the setup from the measurement.
			ch := make(chan int64, b.N)
			for i := 0; i < b.N; i++ {
				ch <- x
			}
			b.ResetTimer()
			for i := 0; i < numGoroutines; i++ {
				wg.Add(1)
				go func() {
					defer wg.Done()
					for j := 0; j < b.N/numGoroutines; j++ {
						<-ch
					}
				}()
			}
			wg.Wait()
		})
	}
}
atomic.LoadInt64 is several dozen times faster than the alternatives, operating on the scale of a CPU clock cycle (3.49 GHz → 0.29 ns). And while the others slow down as goroutines are added, the atomic version's per-operation time actually drops, since the b.N loads are divided among goroutines running in parallel on separate cores; whatever overhead the atomic load carries, the overall execution time stays short.
$ go test -bench=. -cpu=8
BenchmarkAtomicLoad/goroutines-1-8 1000000000 0.5997 ns/op
BenchmarkAtomicLoad/goroutines-2-8 1000000000 0.3114 ns/op
BenchmarkAtomicLoad/goroutines-4-8 1000000000 0.1607 ns/op
BenchmarkAtomicLoad/goroutines-8-8 1000000000 0.1475 ns/op
BenchmarkAtomicLoad/goroutines-16-8 1000000000 0.1439 ns/op
BenchmarkMutexLoad/goroutines-1-8 121244922 9.565 ns/op
BenchmarkMutexLoad/goroutines-2-8 48834085 25.86 ns/op
BenchmarkMutexLoad/goroutines-4-8 35727799 35.35 ns/op
BenchmarkMutexLoad/goroutines-8-8 16799722 70.55 ns/op
BenchmarkMutexLoad/goroutines-16-8 18368448 72.44 ns/op
BenchmarkChannelLoad/goroutines-1-8 78505557 15.51 ns/op
BenchmarkChannelLoad/goroutines-2-8 58123460 25.45 ns/op
BenchmarkChannelLoad/goroutines-4-8 40698946 30.50 ns/op
BenchmarkChannelLoad/goroutines-8-8 34705275 34.15 ns/op
BenchmarkChannelLoad/goroutines-16-8 38005101 35.36 ns/op
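One reason loads scale so well: an atomic 64-bit load on arm64 is a single load-acquire instruction, so concurrent readers never contend with each other. If you want to see what the calls compile to on your machine, one way (the exact output depends on the toolchain version) is to build the test binary and disassemble it:

$ go test -c -o bench.test
$ go tool objdump -s BenchmarkAtomicLoad bench.test | head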
Store
Similarly, compare atomic.StoreInt64 with Mutex and channel.
package main

import (
	"fmt"
	"sync"
	"sync/atomic"
	"testing"
)

func BenchmarkAtomicStore(b *testing.B) {
	var x int64
	for _, numGoroutines := range []int{1, 2, 4, 8, 16} {
		b.Run(fmt.Sprintf("goroutines-%d", numGoroutines), func(b *testing.B) {
			var wg sync.WaitGroup
			for i := 0; i < numGoroutines; i++ {
				wg.Add(1)
				go func() {
					defer wg.Done()
					for j := 0; j < b.N/numGoroutines; j++ {
						atomic.StoreInt64(&x, int64(j))
					}
				}()
			}
			wg.Wait()
		})
	}
}

func BenchmarkMutexStore(b *testing.B) {
	var x int64
	var mu sync.RWMutex
	for _, numGoroutines := range []int{1, 2, 4, 8, 16} {
		b.Run(fmt.Sprintf("goroutines-%d", numGoroutines), func(b *testing.B) {
			var wg sync.WaitGroup
			for i := 0; i < numGoroutines; i++ {
				wg.Add(1)
				go func() {
					defer wg.Done()
					for j := 0; j < b.N/numGoroutines; j++ {
						mu.Lock()
						x = int64(j)
						mu.Unlock()
					}
				}()
			}
			wg.Wait()
		})
	}
	_ = x
}

func BenchmarkChannelStore(b *testing.B) {
	for _, numGoroutines := range []int{1, 2, 4, 8, 16} {
		b.Run(fmt.Sprintf("goroutines-%d", numGoroutines), func(b *testing.B) {
			var wg sync.WaitGroup
			ch := make(chan int64, b.N)
			done := make(chan bool)
			b.ResetTimer()
			// A single goroutine drains everything the senders produce.
			go func() {
				for j := range ch {
					_ = j
				}
				done <- true
			}()
			for i := 0; i < numGoroutines; i++ {
				wg.Add(1)
				go func() {
					defer wg.Done()
					for j := 0; j < b.N/numGoroutines; j++ {
						ch <- int64(j)
					}
				}()
			}
			wg.Wait()
			close(ch)
			<-done
		})
	}
}
The execution time of atomic also increases with the number of goroutines, but not as sharply as Mutex, which takes an exclusive lock on every store. The channel version barely slows down as goroutines are added, but atomic is still several times faster.
$ go test -bench=. -cpu=8
BenchmarkAtomicStore/goroutines-1-8 1000000000 0.6098 ns/op
BenchmarkAtomicStore/goroutines-2-8 541424420 2.225 ns/op
BenchmarkAtomicStore/goroutines-4-8 342192758 3.520 ns/op
BenchmarkAtomicStore/goroutines-8-8 213443438 5.775 ns/op
BenchmarkAtomicStore/goroutines-16-8 218372529 5.596 ns/op
BenchmarkMutexStore/goroutines-1-8 73933736 17.38 ns/op
BenchmarkMutexStore/goroutines-2-8 23721489 61.15 ns/op
BenchmarkMutexStore/goroutines-4-8 10803847 108.3 ns/op
BenchmarkMutexStore/goroutines-8-8 11043054 110.8 ns/op
BenchmarkMutexStore/goroutines-16-8 10729257 113.7 ns/op
BenchmarkChannelStore/goroutines-1-8 30754740 39.72 ns/op
BenchmarkChannelStore/goroutines-2-8 15660642 101.8 ns/op
BenchmarkChannelStore/goroutines-4-8 25406817 52.38 ns/op
BenchmarkChannelStore/goroutines-8-8 18875343 53.60 ns/op
BenchmarkChannelStore/goroutines-16-8 22960464 52.04 ns/op
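Atomic stores are not limited to integers, either. To publish a larger snapshot (a configuration, say) without a lock, the package provides atomic.Value and, since Go 1.19, atomic.Pointer[T]. A minimal sketch with a hypothetical Config type:

package main

import (
	"fmt"
	"sync/atomic"
)

// Config is a hypothetical snapshot type: a writer swaps in a whole
// new pointer atomically, so readers never see a half-written struct.
type Config struct {
	Endpoint string
	Timeout  int
}

func main() {
	var p atomic.Pointer[Config]
	p.Store(&Config{Endpoint: "localhost:8080", Timeout: 30})

	cfg := p.Load() // lock-free read, safe from any goroutine
	fmt.Println(cfg.Endpoint, cfg.Timeout)
}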
Add
Finally, compare atomic.AddInt64 with Mutex and channel in the same way.
package main

import (
	"fmt"
	"sync"
	"sync/atomic"
	"testing"
)

func BenchmarkAtomicAdd(b *testing.B) {
	var x int64
	for _, numGoroutines := range []int{1, 2, 4, 8, 16} {
		b.Run(fmt.Sprintf("goroutines-%d", numGoroutines), func(b *testing.B) {
			var wg sync.WaitGroup
			for i := 0; i < numGoroutines; i++ {
				wg.Add(1)
				go func() {
					defer wg.Done()
					for j := 0; j < b.N/numGoroutines; j++ {
						atomic.AddInt64(&x, int64(j))
					}
				}()
			}
			wg.Wait()
		})
	}
}

func BenchmarkMutexAdd(b *testing.B) {
	var x int64
	var mu sync.RWMutex
	for _, numGoroutines := range []int{1, 2, 4, 8, 16} {
		b.Run(fmt.Sprintf("goroutines-%d", numGoroutines), func(b *testing.B) {
			var wg sync.WaitGroup
			for i := 0; i < numGoroutines; i++ {
				wg.Add(1)
				go func() {
					defer wg.Done()
					for j := 0; j < b.N/numGoroutines; j++ {
						mu.Lock()
						x += int64(j)
						mu.Unlock()
					}
				}()
			}
			wg.Wait()
		})
	}
	_ = x
}

func BenchmarkChannelAdd(b *testing.B) {
	for _, numGoroutines := range []int{1, 2, 4, 8, 16} {
		b.Run(fmt.Sprintf("goroutines-%d", numGoroutines), func(b *testing.B) {
			var wg sync.WaitGroup
			ch := make(chan int64, b.N)
			done := make(chan bool)
			b.ResetTimer()
			// A single goroutine owns x and accumulates everything sent on ch.
			go func() {
				var x int64
				for j := range ch {
					x += j
				}
				done <- true
			}()
			for i := 0; i < numGoroutines; i++ {
				wg.Add(1)
				go func() {
					defer wg.Done()
					for j := 0; j < b.N/numGoroutines; j++ {
						ch <- int64(j)
					}
				}()
			}
			wg.Wait()
			close(ch)
			<-done
		})
	}
}
With Add, the execution time of atomic grows as well, and at higher goroutine counts the gap to the channel version narrows to less than double.
$ go test -bench=. -cpu=8
BenchmarkAtomicAdd/goroutines-1-8 298405735 3.840 ns/op
BenchmarkAtomicAdd/goroutines-2-8 100000000 12.98 ns/op
BenchmarkAtomicAdd/goroutines-4-8 70431834 17.26 ns/op
BenchmarkAtomicAdd/goroutines-8-8 34193878 35.93 ns/op
BenchmarkAtomicAdd/goroutines-16-8 34694154 36.35 ns/op
BenchmarkMutexAdd/goroutines-1-8 71619702 16.14 ns/op
BenchmarkMutexAdd/goroutines-2-8 20886546 67.79 ns/op
BenchmarkMutexAdd/goroutines-4-8 10610626 113.9 ns/op
BenchmarkMutexAdd/goroutines-8-8 10662531 111.8 ns/op
BenchmarkMutexAdd/goroutines-16-8 10869400 111.1 ns/op
BenchmarkChannelAdd/goroutines-1-8 30208658 39.65 ns/op
BenchmarkChannelAdd/goroutines-2-8 10090827 144.2 ns/op
BenchmarkChannelAdd/goroutines-4-8 23074944 52.90 ns/op
BenchmarkChannelAdd/goroutines-8-8 23826226 53.23 ns/op
BenchmarkChannelAdd/goroutines-16-8 23380171 51.86 ns/op
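This contention on a single word is exactly what a metrics counter runs into, which is presumably why libraries like the Prometheus Go client reach for sync/atomic. To close, a minimal counter sketch in that spirit (an illustration, not the client's actual implementation):

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

// Counter is a hypothetical lock-free counter; Inc and Value are
// safe to call from any goroutine without further synchronization.
type Counter struct {
	n atomic.Int64
}

func (c *Counter) Inc()         { c.n.Add(1) }
func (c *Counter) Value() int64 { return c.n.Load() }

func main() {
	var c Counter
	var wg sync.WaitGroup
	for i := 0; i < 8; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			for j := 0; j < 1000; j++ {
				c.Inc()
			}
		}()
	}
	wg.Wait()
	fmt.Println(c.Value()) // always prints 8000
}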