Benchmarking sync/atomic, a low-level package for atomic memory access in Go
sync/atomic is a low-level package for atomically reading and writing memory in Go. On arm64 it is implemented with CPU instructions that manipulate memory atomically, such as LDADDALD. When reading and writing values from multiple goroutines, the usual approach is to communicate over a channel or to lock with sync.Mutex, so I had never had occasion to call this package's functions, which require careful handling. However, it is used by libraries such as the Prometheus Go client, and I was curious how much of a difference it makes, so I ran some benchmarks.
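For reference, here is roughly what calling the package looks like. A minimal sketch using only the standard library; since Go 1.19 the package also offers typed wrappers such as atomic.Int64 that bundle the same operations as methods:

package main

import (
	"fmt"
	"sync/atomic"
)

func main() {
	// The package-level functions operate on an addressable int64.
	var n int64
	atomic.StoreInt64(&n, 10)
	atomic.AddInt64(&n, 5)
	fmt.Println(atomic.LoadInt64(&n)) // 15

	// The typed wrapper (Go 1.19+) is harder to misuse: the value
	// can no longer be accessed non-atomically by accident.
	var c atomic.Int64
	c.Store(10)
	c.Add(5)
	fmt.Println(c.Load()) // 15
}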
$ go version
go version go1.23.0 darwin/arm64
$ system_profiler SPHardwareDataType
Hardware:

    Hardware Overview:

      Model Name: MacBook Air
      Model Identifier: Mac14,2
      Model Number: Z15Y001BPJ/A
      Chip: Apple M2
      Total Number of Cores: 8 (4 performance and 4 efficiency)
      Memory: 16 GB
      ...
Load
Load a value from several goroutines using atomic.LoadInt64, sync.RWMutex, and a channel.
package main

import (
	"fmt"
	"sync"
	"sync/atomic"
	"testing"
)

func BenchmarkAtomicLoad(b *testing.B) {
	var x int64
	for _, numGoroutines := range []int{1, 2, 4, 8, 16} {
		b.Run(fmt.Sprintf("goroutines-%d", numGoroutines), func(b *testing.B) {
			var wg sync.WaitGroup
			for i := 0; i < numGoroutines; i++ {
				wg.Add(1)
				go func() {
					defer wg.Done()
					// Each goroutine performs its share of the b.N loads.
					for j := 0; j < b.N/numGoroutines; j++ {
						atomic.LoadInt64(&x)
					}
				}()
			}
			wg.Wait()
		})
	}
}

func BenchmarkMutexLoad(b *testing.B) {
	var x int64
	var mu sync.RWMutex
	for _, numGoroutines := range []int{1, 2, 4, 8, 16} {
		b.Run(fmt.Sprintf("goroutines-%d", numGoroutines), func(b *testing.B) {
			var wg sync.WaitGroup
			for i := 0; i < numGoroutines; i++ {
				wg.Add(1)
				go func() {
					defer wg.Done()
					for j := 0; j < b.N/numGoroutines; j++ {
						mu.RLock()
						_ = x
						mu.RUnlock()
					}
				}()
			}
			wg.Wait()
		})
	}
}

func BenchmarkChannelLoad(b *testing.B) {
	var x int64
	for _, numGoroutines := range []int{1, 2, 4, 8, 16} {
		b.Run(fmt.Sprintf("goroutines-%d", numGoroutines), func(b *testing.B) {
			var wg sync.WaitGroup
			// Fill the channel up front so receives never block,
			// and exclude the setup from the measurement.
			ch := make(chan int64, b.N)
			for i := 0; i < b.N; i++ {
				ch <- x
			}
			b.ResetTimer()
			for i := 0; i < numGoroutines; i++ {
				wg.Add(1)
				go func() {
					defer wg.Done()
					for j := 0; j < b.N/numGoroutines; j++ {
						<-ch
					}
				}()
			}
			wg.Wait()
		})
	}
}
atomic.LoadInt64 is several dozen times faster than the alternatives, operating on the scale of a CPU clock cycle (3.49 GHz → 0.29 ns). And while the others slow down as goroutines are added, the atomic version's per-operation time actually drops, since the b.N loads are divided among goroutines running in parallel on separate cores; whatever overhead the atomic load carries, the overall execution time stays short.
$ go test -bench=. -cpu=8
BenchmarkAtomicLoad/goroutines-1-8 1000000000 0.5997 ns/op
BenchmarkAtomicLoad/goroutines-2-8 1000000000 0.3114 ns/op
BenchmarkAtomicLoad/goroutines-4-8 1000000000 0.1607 ns/op
BenchmarkAtomicLoad/goroutines-8-8 1000000000 0.1475 ns/op
BenchmarkAtomicLoad/goroutines-16-8 1000000000 0.1439 ns/op
BenchmarkMutexLoad/goroutines-1-8 121244922 9.565 ns/op
BenchmarkMutexLoad/goroutines-2-8 48834085 25.86 ns/op
BenchmarkMutexLoad/goroutines-4-8 35727799 35.35 ns/op
BenchmarkMutexLoad/goroutines-8-8 16799722 70.55 ns/op
BenchmarkMutexLoad/goroutines-16-8 18368448 72.44 ns/op
BenchmarkChannelLoad/goroutines-1-8 78505557 15.51 ns/op
BenchmarkChannelLoad/goroutines-2-8 58123460 25.45 ns/op
BenchmarkChannelLoad/goroutines-4-8 40698946 30.50 ns/op
BenchmarkChannelLoad/goroutines-8-8 34705275 34.15 ns/op
BenchmarkChannelLoad/goroutines-16-8 38005101 35.36 ns/op
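One reason loads scale so well: an atomic 64-bit load on arm64 is a single load-acquire instruction, so concurrent readers never contend with each other. If you want to see what the calls compile to on your machine, one way (the exact output depends on the toolchain version) is to build the test binary and disassemble it:

$ go test -c -o bench.test
$ go tool objdump -s BenchmarkAtomicLoad bench.test | head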
Store
Similarly, compare atomic.StoreInt64 with Mutex and channel.
package main

import (
	"fmt"
	"sync"
	"sync/atomic"
	"testing"
)

func BenchmarkAtomicStore(b *testing.B) {
	var x int64
	for _, numGoroutines := range []int{1, 2, 4, 8, 16} {
		b.Run(fmt.Sprintf("goroutines-%d", numGoroutines), func(b *testing.B) {
			var wg sync.WaitGroup
			for i := 0; i < numGoroutines; i++ {
				wg.Add(1)
				go func() {
					defer wg.Done()
					for j := 0; j < b.N/numGoroutines; j++ {
						atomic.StoreInt64(&x, int64(j))
					}
				}()
			}
			wg.Wait()
		})
	}
}

func BenchmarkMutexStore(b *testing.B) {
	var x int64
	var mu sync.RWMutex
	for _, numGoroutines := range []int{1, 2, 4, 8, 16} {
		b.Run(fmt.Sprintf("goroutines-%d", numGoroutines), func(b *testing.B) {
			var wg sync.WaitGroup
			for i := 0; i < numGoroutines; i++ {
				wg.Add(1)
				go func() {
					defer wg.Done()
					for j := 0; j < b.N/numGoroutines; j++ {
						mu.Lock()
						x = int64(j)
						mu.Unlock()
					}
				}()
			}
			wg.Wait()
		})
	}
	_ = x
}

func BenchmarkChannelStore(b *testing.B) {
	for _, numGoroutines := range []int{1, 2, 4, 8, 16} {
		b.Run(fmt.Sprintf("goroutines-%d", numGoroutines), func(b *testing.B) {
			var wg sync.WaitGroup
			ch := make(chan int64, b.N)
			done := make(chan bool)
			b.ResetTimer()
			// A single goroutine drains everything the senders produce.
			go func() {
				for j := range ch {
					_ = j
				}
				done <- true
			}()
			for i := 0; i < numGoroutines; i++ {
				wg.Add(1)
				go func() {
					defer wg.Done()
					for j := 0; j < b.N/numGoroutines; j++ {
						ch <- int64(j)
					}
				}()
			}
			wg.Wait()
			close(ch)
			<-done
		})
	}
}
The execution time of atomic also increases with the number of goroutines, but not as sharply as Mutex, which takes an exclusive lock on every store. The channel version barely slows down as goroutines are added, but atomic is still several times faster.
$ go test -bench=. -cpu=8
BenchmarkAtomicStore/goroutines-1-8 1000000000 0.6098 ns/op
BenchmarkAtomicStore/goroutines-2-8 541424420 2.225 ns/op
BenchmarkAtomicStore/goroutines-4-8 342192758 3.520 ns/op
BenchmarkAtomicStore/goroutines-8-8 213443438 5.775 ns/op
BenchmarkAtomicStore/goroutines-16-8 218372529 5.596 ns/op
BenchmarkMutexStore/goroutines-1-8 73933736 17.38 ns/op
BenchmarkMutexStore/goroutines-2-8 23721489 61.15 ns/op
BenchmarkMutexStore/goroutines-4-8 10803847 108.3 ns/op
BenchmarkMutexStore/goroutines-8-8 11043054 110.8 ns/op
BenchmarkMutexStore/goroutines-16-8 10729257 113.7 ns/op
BenchmarkChannelStore/goroutines-1-8 30754740 39.72 ns/op
BenchmarkChannelStore/goroutines-2-8 15660642 101.8 ns/op
BenchmarkChannelStore/goroutines-4-8 25406817 52.38 ns/op
BenchmarkChannelStore/goroutines-8-8 18875343 53.60 ns/op
BenchmarkChannelStore/goroutines-16-8 22960464 52.04 ns/op
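Atomic stores are not limited to integers, either. To publish a larger snapshot (a configuration, say) without a lock, the package provides atomic.Value and, since Go 1.19, atomic.Pointer[T]. A minimal sketch with a hypothetical Config type:

package main

import (
	"fmt"
	"sync/atomic"
)

// Config is a hypothetical snapshot type: a writer swaps in a whole
// new pointer atomically, so readers never see a half-written struct.
type Config struct {
	Endpoint string
	Timeout  int
}

func main() {
	var p atomic.Pointer[Config]
	p.Store(&Config{Endpoint: "localhost:8080", Timeout: 30})

	cfg := p.Load() // lock-free read, safe from any goroutine
	fmt.Println(cfg.Endpoint, cfg.Timeout)
}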
Add
Finally, compare atomic.AddInt64 with Mutex and channel in the same way.
package main

import (
	"fmt"
	"sync"
	"sync/atomic"
	"testing"
)

func BenchmarkAtomicAdd(b *testing.B) {
	var x int64
	for _, numGoroutines := range []int{1, 2, 4, 8, 16} {
		b.Run(fmt.Sprintf("goroutines-%d", numGoroutines), func(b *testing.B) {
			var wg sync.WaitGroup
			for i := 0; i < numGoroutines; i++ {
				wg.Add(1)
				go func() {
					defer wg.Done()
					for j := 0; j < b.N/numGoroutines; j++ {
						atomic.AddInt64(&x, int64(j))
					}
				}()
			}
			wg.Wait()
		})
	}
}

func BenchmarkMutexAdd(b *testing.B) {
	var x int64
	var mu sync.RWMutex
	for _, numGoroutines := range []int{1, 2, 4, 8, 16} {
		b.Run(fmt.Sprintf("goroutines-%d", numGoroutines), func(b *testing.B) {
			var wg sync.WaitGroup
			for i := 0; i < numGoroutines; i++ {
				wg.Add(1)
				go func() {
					defer wg.Done()
					for j := 0; j < b.N/numGoroutines; j++ {
						mu.Lock()
						x += int64(j)
						mu.Unlock()
					}
				}()
			}
			wg.Wait()
		})
	}
	_ = x
}

func BenchmarkChannelAdd(b *testing.B) {
	for _, numGoroutines := range []int{1, 2, 4, 8, 16} {
		b.Run(fmt.Sprintf("goroutines-%d", numGoroutines), func(b *testing.B) {
			var wg sync.WaitGroup
			ch := make(chan int64, b.N)
			done := make(chan bool)
			b.ResetTimer()
			// A single goroutine owns x and accumulates everything sent on ch.
			go func() {
				var x int64
				for j := range ch {
					x += j
				}
				done <- true
			}()
			for i := 0; i < numGoroutines; i++ {
				wg.Add(1)
				go func() {
					defer wg.Done()
					for j := 0; j < b.N/numGoroutines; j++ {
						ch <- int64(j)
					}
				}()
			}
			wg.Wait()
			close(ch)
			<-done
		})
	}
}
With Add, the execution time of atomic grows as well, and at higher goroutine counts the gap to the channel version narrows to less than double.
$ go test -bench=. -cpu=8
BenchmarkAtomicAdd/goroutines-1-8 298405735 3.840 ns/op
BenchmarkAtomicAdd/goroutines-2-8 100000000 12.98 ns/op
BenchmarkAtomicAdd/goroutines-4-8 70431834 17.26 ns/op
BenchmarkAtomicAdd/goroutines-8-8 34193878 35.93 ns/op
BenchmarkAtomicAdd/goroutines-16-8 34694154 36.35 ns/op
BenchmarkMutexAdd/goroutines-1-8 71619702 16.14 ns/op
BenchmarkMutexAdd/goroutines-2-8 20886546 67.79 ns/op
BenchmarkMutexAdd/goroutines-4-8 10610626 113.9 ns/op
BenchmarkMutexAdd/goroutines-8-8 10662531 111.8 ns/op
BenchmarkMutexAdd/goroutines-16-8 10869400 111.1 ns/op
BenchmarkChannelAdd/goroutines-1-8 30208658 39.65 ns/op
BenchmarkChannelAdd/goroutines-2-8 10090827 144.2 ns/op
BenchmarkChannelAdd/goroutines-4-8 23074944 52.90 ns/op
BenchmarkChannelAdd/goroutines-8-8 23826226 53.23 ns/op
BenchmarkChannelAdd/goroutines-16-8 23380171 51.86 ns/op
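This contention on a single word is exactly what a metrics counter runs into, which is presumably why libraries like the Prometheus Go client reach for sync/atomic. To close, a minimal counter sketch in that spirit (an illustration, not the client's actual implementation):

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

// Counter is a hypothetical lock-free counter; Inc and Value are
// safe to call from any goroutine without further synchronization.
type Counter struct {
	n atomic.Int64
}

func (c *Counter) Inc()         { c.n.Add(1) }
func (c *Counter) Value() int64 { return c.n.Load() }

func main() {
	var c Counter
	var wg sync.WaitGroup
	for i := 0; i < 8; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			for j := 0; j < 1000; j++ {
				c.Inc()
			}
		}()
	}
	wg.Wait()
	fmt.Println(c.Value()) // always prints 8000
}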