Go で atomic にメモリを読み書きするための低レベルなパッケージ sync/atomic のベンチマークを取る

2024-09-23 golang

Go の sync/atomic は atomic にメモリを読み書きするための低レベルなパッケージ。arm64 の場合 LDADDALD といった atomic にメモリを操作する CPU 命令が実行される。複数の goroutine から値を読み書きする場合、通常 channel を介して通信するか sync.Mutex でロックをかける方法を取るので、扱いに注意が必要なこのパッケージの関数を自分で呼ぶ機会がなかった。ただ Prometheus Go client などのライブラリで呼ばれていることはあって、どの程度差があるのか気になったのでベンチマークを取ってみた。

$ go version
go version go1.23.0 darwin/arm64

$ system_profiler SPHardwareDataType

Hardware:

    Hardware Overview:

      Model Name: MacBook Air
      Model Identifier: Mac14,2
      Model Number: Z15Y001BPJ/A
      Chip: Apple M2
      Total Number of Cores: 8 (4 performance and 4 efficiency)
      Memory: 16 GB
      ...

読み込み

複数の goroutine から、atomic.LoadInt64、sync.RWMutex、channel のそれぞれを用いて値を取得する。

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
	"testing"
)

// BenchmarkAtomicLoad measures atomic.LoadInt64 on one shared int64 while the
// b.N loads are spread across 1, 2, 4, 8 and 16 goroutines (one
// sub-benchmark per goroutine count).
//
// The sub-benchmark finishes when every goroutine has completed its share, so
// the reported ns/op reflects the combined throughput of all goroutines.
func BenchmarkAtomicLoad(b *testing.B) {
	var x int64
	for _, numGoroutines := range []int{1, 2, 4, 8, 16} {
		b.Run(fmt.Sprintf("goroutines-%d", numGoroutines), func(b *testing.B) {
			var wg sync.WaitGroup
			for i := 0; i < numGoroutines; i++ {
				// Give the remainder of b.N/numGoroutines to the first
				// goroutines so exactly b.N loads run in total. Plain integer
				// division would drop up to numGoroutines-1 operations, and
				// all of them when b.N < numGoroutines.
				iters := b.N / numGoroutines
				if i < b.N%numGoroutines {
					iters++
				}
				wg.Add(1)
				go func() {
					defer wg.Done()
					for j := 0; j < iters; j++ {
						atomic.LoadInt64(&x)
					}
				}()
			}
			wg.Wait()
		})
	}
}

// BenchmarkMutexLoad measures reading one shared int64 under the read lock of
// a sync.RWMutex while the b.N reads are spread across 1, 2, 4, 8 and 16
// goroutines (one sub-benchmark per goroutine count).
//
// No goroutine ever takes the write lock, so only the RLock/RUnlock pair is
// exercised.
func BenchmarkMutexLoad(b *testing.B) {
	var x int64
	var mu sync.RWMutex
	for _, numGoroutines := range []int{1, 2, 4, 8, 16} {
		b.Run(fmt.Sprintf("goroutines-%d", numGoroutines), func(b *testing.B) {
			var wg sync.WaitGroup
			for i := 0; i < numGoroutines; i++ {
				// Give the remainder of b.N/numGoroutines to the first
				// goroutines so exactly b.N reads run in total. Plain integer
				// division would drop up to numGoroutines-1 operations, and
				// all of them when b.N < numGoroutines.
				iters := b.N / numGoroutines
				if i < b.N%numGoroutines {
					iters++
				}
				wg.Add(1)
				go func() {
					defer wg.Done()
					for j := 0; j < iters; j++ {
						mu.RLock()
						_ = x
						mu.RUnlock()
					}
				}()
			}
			wg.Wait()
		})
	}
}

// BenchmarkChannelLoad measures receiving int64 values from one shared
// buffered channel while the b.N receives are spread across 1, 2, 4, 8 and 16
// goroutines (one sub-benchmark per goroutine count).
//
// The channel is filled with b.N values before the timer is reset, so only
// the receive side is timed and no receiver ever waits for a sender.
func BenchmarkChannelLoad(b *testing.B) {
	var x int64
	for _, numGoroutines := range []int{1, 2, 4, 8, 16} {
		b.Run(fmt.Sprintf("goroutines-%d", numGoroutines), func(b *testing.B) {
			// Setup: buffer every value up front so the timed section
			// contains receives only.
			ch := make(chan int64, b.N)
			for i := 0; i < b.N; i++ {
				ch <- x
			}

			b.ResetTimer()

			var wg sync.WaitGroup
			for i := 0; i < numGoroutines; i++ {
				// Give the remainder of b.N/numGoroutines to the first
				// goroutines so exactly b.N receives run in total, which is
				// precisely the number of buffered values. Plain integer
				// division would drop up to numGoroutines-1 operations, and
				// all of them when b.N < numGoroutines.
				iters := b.N / numGoroutines
				if i < b.N%numGoroutines {
					iters++
				}
				wg.Add(1)
				go func() {
					defer wg.Done()
					for j := 0; j < iters; j++ {
						<-ch
					}
				}()
			}
			wg.Wait()
		})
	}
}

atomic.LoadInt64 が他と比べて数十倍速く、CPU クロックサイクル (3.49GHz → 0.29ns) のスケールになっている。また、並列数を増やすと他が遅くなっていくのに対して atomic は多少オーバーヘッドはあるものの、全体の実行時間としては短くなる程度に収まっている。

# go test -bench=. -cpu=8
BenchmarkAtomicLoad/goroutines-1-8              1000000000               0.5997 ns/op
BenchmarkAtomicLoad/goroutines-2-8              1000000000               0.3114 ns/op
BenchmarkAtomicLoad/goroutines-4-8              1000000000               0.1607 ns/op
BenchmarkAtomicLoad/goroutines-8-8              1000000000               0.1475 ns/op
BenchmarkAtomicLoad/goroutines-16-8             1000000000               0.1439 ns/op

BenchmarkMutexLoad/goroutines-1-8               121244922                9.565 ns/op
BenchmarkMutexLoad/goroutines-2-8               48834085                25.86 ns/op
BenchmarkMutexLoad/goroutines-4-8               35727799                35.35 ns/op
BenchmarkMutexLoad/goroutines-8-8               16799722                70.55 ns/op
BenchmarkMutexLoad/goroutines-16-8              18368448                72.44 ns/op

BenchmarkChannelLoad/goroutines-1-8             78505557                15.51 ns/op
BenchmarkChannelLoad/goroutines-2-8             58123460                25.45 ns/op
BenchmarkChannelLoad/goroutines-4-8             40698946                30.50 ns/op
BenchmarkChannelLoad/goroutines-8-8             34705275                34.15 ns/op
BenchmarkChannelLoad/goroutines-16-8            38005101                35.36 ns/op

代入

同様に atomic.StoreInt64 と Mutex (sync.RWMutex の排他ロック)、channel を比較する。

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
	"testing"
)

// BenchmarkAtomicStore measures atomic.StoreInt64 on one shared int64 while
// the b.N stores are spread across 1, 2, 4, 8 and 16 goroutines (one
// sub-benchmark per goroutine count).
//
// Each goroutine stores its own loop counter, so the final value of x is
// whichever store happened last and carries no meaning.
func BenchmarkAtomicStore(b *testing.B) {
	var x int64
	for _, numGoroutines := range []int{1, 2, 4, 8, 16} {
		b.Run(fmt.Sprintf("goroutines-%d", numGoroutines), func(b *testing.B) {
			var wg sync.WaitGroup
			for i := 0; i < numGoroutines; i++ {
				// Give the remainder of b.N/numGoroutines to the first
				// goroutines so exactly b.N stores run in total. Plain integer
				// division would drop up to numGoroutines-1 operations, and
				// all of them when b.N < numGoroutines.
				iters := b.N / numGoroutines
				if i < b.N%numGoroutines {
					iters++
				}
				wg.Add(1)
				go func() {
					defer wg.Done()
					for j := 0; j < iters; j++ {
						atomic.StoreInt64(&x, int64(j))
					}
				}()
			}
			wg.Wait()
		})
	}
}

// BenchmarkMutexStore measures writing one shared int64 under the exclusive
// lock of a sync.RWMutex while the b.N writes are spread across 1, 2, 4, 8
// and 16 goroutines (one sub-benchmark per goroutine count).
func BenchmarkMutexStore(b *testing.B) {
	var x int64
	var mu sync.RWMutex
	for _, numGoroutines := range []int{1, 2, 4, 8, 16} {
		b.Run(fmt.Sprintf("goroutines-%d", numGoroutines), func(b *testing.B) {
			var wg sync.WaitGroup
			for i := 0; i < numGoroutines; i++ {
				// Give the remainder of b.N/numGoroutines to the first
				// goroutines so exactly b.N writes run in total. Plain integer
				// division would drop up to numGoroutines-1 operations, and
				// all of them when b.N < numGoroutines.
				iters := b.N / numGoroutines
				if i < b.N%numGoroutines {
					iters++
				}
				wg.Add(1)
				go func() {
					defer wg.Done()
					for j := 0; j < iters; j++ {
						mu.Lock()
						x = int64(j)
						mu.Unlock()
					}
				}()
			}
			wg.Wait()
		})
	}
	// x is only ever assigned above; this read keeps the compiler from
	// rejecting it as unused.
	_ = x
}

// BenchmarkChannelStore measures sending int64 values into one shared
// buffered channel while the b.N sends are spread across 1, 2, 4, 8 and 16
// producer goroutines (one sub-benchmark per goroutine count).
//
// A single consumer goroutine drains the channel concurrently. The timed
// section ends only after the channel has been closed and the consumer has
// seen every value.
func BenchmarkChannelStore(b *testing.B) {
	for _, numGoroutines := range []int{1, 2, 4, 8, 16} {
		b.Run(fmt.Sprintf("goroutines-%d", numGoroutines), func(b *testing.B) {
			// The buffer holds b.N values, so producers never block on a
			// full channel.
			ch := make(chan int64, b.N)
			done := make(chan struct{})

			b.ResetTimer()

			// Consumer: discard every value, then signal completion.
			go func() {
				defer close(done)
				for v := range ch {
					_ = v
				}
			}()

			var wg sync.WaitGroup
			for i := 0; i < numGoroutines; i++ {
				// Give the remainder of b.N/numGoroutines to the first
				// goroutines so exactly b.N sends run in total. Plain integer
				// division would drop up to numGoroutines-1 operations, and
				// all of them when b.N < numGoroutines.
				iters := b.N / numGoroutines
				if i < b.N%numGoroutines {
					iters++
				}
				wg.Add(1)
				go func() {
					defer wg.Done()
					for j := 0; j < iters; j++ {
						ch <- int64(j)
					}
				}()
			}
			wg.Wait()
			// All producers are finished; closing ends the consumer's range
			// loop.
			close(ch)
			<-done
		})
	}
}

atomic も並列数に伴って実行時間が増えているが、毎回排他ロックを取っている Mutex ほどの伸びはない。channel は並列数を増やしても実行時間がそれほど増えていないが、それでも atomic の方が 9 倍以上速い。

# go test -bench=. -cpu=8
BenchmarkAtomicStore/goroutines-1-8             1000000000               0.6098 ns/op
BenchmarkAtomicStore/goroutines-2-8             541424420                2.225 ns/op
BenchmarkAtomicStore/goroutines-4-8             342192758                3.520 ns/op
BenchmarkAtomicStore/goroutines-8-8             213443438                5.775 ns/op
BenchmarkAtomicStore/goroutines-16-8            218372529                5.596 ns/op

BenchmarkMutexStore/goroutines-1-8              73933736                17.38 ns/op
BenchmarkMutexStore/goroutines-2-8              23721489                61.15 ns/op
BenchmarkMutexStore/goroutines-4-8              10803847               108.3 ns/op
BenchmarkMutexStore/goroutines-8-8              11043054               110.8 ns/op
BenchmarkMutexStore/goroutines-16-8             10729257               113.7 ns/op

BenchmarkChannelStore/goroutines-1-8            30754740                39.72 ns/op
BenchmarkChannelStore/goroutines-2-8            15660642               101.8 ns/op
BenchmarkChannelStore/goroutines-4-8            25406817                52.38 ns/op
BenchmarkChannelStore/goroutines-8-8            18875343                53.60 ns/op
BenchmarkChannelStore/goroutines-16-8           22960464                52.04 ns/op

加算

最後に atomic.AddInt64() を試す。

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
	"testing"
)

// BenchmarkAtomicAdd measures atomic.AddInt64 on one shared int64 while the
// b.N additions are spread across 1, 2, 4, 8 and 16 goroutines (one
// sub-benchmark per goroutine count).
//
// Each goroutine adds its own loop counter; the resulting sum is never read.
func BenchmarkAtomicAdd(b *testing.B) {
	var x int64
	for _, numGoroutines := range []int{1, 2, 4, 8, 16} {
		b.Run(fmt.Sprintf("goroutines-%d", numGoroutines), func(b *testing.B) {
			var wg sync.WaitGroup
			for i := 0; i < numGoroutines; i++ {
				// Give the remainder of b.N/numGoroutines to the first
				// goroutines so exactly b.N additions run in total. Plain
				// integer division would drop up to numGoroutines-1
				// operations, and all of them when b.N < numGoroutines.
				iters := b.N / numGoroutines
				if i < b.N%numGoroutines {
					iters++
				}
				wg.Add(1)
				go func() {
					defer wg.Done()
					for j := 0; j < iters; j++ {
						atomic.AddInt64(&x, int64(j))
					}
				}()
			}
			wg.Wait()
		})
	}
}

// BenchmarkMutexAdd measures incrementing one shared int64 under the
// exclusive lock of a sync.RWMutex while the b.N additions are spread across
// 1, 2, 4, 8 and 16 goroutines (one sub-benchmark per goroutine count).
func BenchmarkMutexAdd(b *testing.B) {
	var x int64
	var mu sync.RWMutex
	for _, numGoroutines := range []int{1, 2, 4, 8, 16} {
		b.Run(fmt.Sprintf("goroutines-%d", numGoroutines), func(b *testing.B) {
			var wg sync.WaitGroup
			for i := 0; i < numGoroutines; i++ {
				// Give the remainder of b.N/numGoroutines to the first
				// goroutines so exactly b.N additions run in total. Plain
				// integer division would drop up to numGoroutines-1
				// operations, and all of them when b.N < numGoroutines.
				iters := b.N / numGoroutines
				if i < b.N%numGoroutines {
					iters++
				}
				wg.Add(1)
				go func() {
					defer wg.Done()
					for j := 0; j < iters; j++ {
						mu.Lock()
						x += int64(j)
						mu.Unlock()
					}
				}()
			}
			wg.Wait()
		})
	}
	// The accumulated sum is not needed; this read just marks x as used.
	_ = x
}

// BenchmarkChannelAdd measures accumulating a sum by sending int64 values
// through one shared buffered channel to a single consumer goroutine that
// owns the total. The b.N sends are spread across 1, 2, 4, 8 and 16 producer
// goroutines (one sub-benchmark per goroutine count).
//
// The timed section ends only after the channel has been closed and the
// consumer has added every value.
func BenchmarkChannelAdd(b *testing.B) {
	for _, numGoroutines := range []int{1, 2, 4, 8, 16} {
		b.Run(fmt.Sprintf("goroutines-%d", numGoroutines), func(b *testing.B) {
			// The buffer holds b.N values, so producers never block on a
			// full channel.
			ch := make(chan int64, b.N)
			done := make(chan struct{})

			b.ResetTimer()

			// Consumer: the only goroutine touching sum, so no lock is
			// needed.
			go func() {
				defer close(done)
				var sum int64
				for v := range ch {
					sum += v
				}
			}()

			var wg sync.WaitGroup
			for i := 0; i < numGoroutines; i++ {
				// Give the remainder of b.N/numGoroutines to the first
				// goroutines so exactly b.N sends run in total. Plain integer
				// division would drop up to numGoroutines-1 operations, and
				// all of them when b.N < numGoroutines.
				iters := b.N / numGoroutines
				if i < b.N%numGoroutines {
					iters++
				}
				wg.Add(1)
				go func() {
					defer wg.Done()
					for j := 0; j < iters; j++ {
						ch <- int64(j)
					}
				}()
			}
			wg.Wait()
			// All producers are finished; closing ends the consumer's range
			// loop.
			close(ch)
			<-done
		})
	}
}

atomic の実行時間が増えて channel との差が 2 倍未満まで狭まった。

# go test -bench=. -cpu=8
BenchmarkAtomicAdd/goroutines-1-8               298405735                3.840 ns/op
BenchmarkAtomicAdd/goroutines-2-8               100000000               12.98 ns/op
BenchmarkAtomicAdd/goroutines-4-8               70431834                17.26 ns/op
BenchmarkAtomicAdd/goroutines-8-8               34193878                35.93 ns/op
BenchmarkAtomicAdd/goroutines-16-8              34694154                36.35 ns/op

BenchmarkMutexAdd/goroutines-1-8                71619702                16.14 ns/op
BenchmarkMutexAdd/goroutines-2-8                20886546                67.79 ns/op
BenchmarkMutexAdd/goroutines-4-8                10610626               113.9 ns/op
BenchmarkMutexAdd/goroutines-8-8                10662531               111.8 ns/op
BenchmarkMutexAdd/goroutines-16-8               10869400               111.1 ns/op

BenchmarkChannelAdd/goroutines-1-8              30208658                39.65 ns/op
BenchmarkChannelAdd/goroutines-2-8              10090827               144.2 ns/op
BenchmarkChannelAdd/goroutines-4-8              23074944                52.90 ns/op
BenchmarkChannelAdd/goroutines-8-8              23826226                53.23 ns/op
BenchmarkChannelAdd/goroutines-16-8             23380171                51.86 ns/op