package boomer

import (
	"fmt"
	"math/rand"
	"os"
	"runtime/debug"
	"strconv"
	"sync"
	"sync/atomic"
	"time"

	"github.com/jinzhu/copier"
	"github.com/olekukonko/tablewriter"
	"github.com/pkg/errors"
	"github.com/rs/zerolog/log"

	"github.com/httprunner/httprunner/v4/hrp/internal/builtin"
	"github.com/httprunner/httprunner/v4/hrp/pkg/boomer/grpc/messager"
)

const (
	StateInit     = iota + 1 // initializing
	StateSpawning            // spawning
	StateRunning             // running
	StateStopping            // stopping
	StateStopped             // stopped
	StateQuitting            // quitting
	StateMissing             // missing
)

func getStateName(state int32) (stateName string) {
	switch state {
	case StateInit:
		stateName = "initializing"
	case StateSpawning:
		stateName = "spawning"
	case StateRunning:
		stateName = "running"
	case StateStopping:
		stateName = "stopping"
	case StateStopped:
		stateName = "stopped"
	case StateQuitting:
		stateName = "quitting"
	case StateMissing:
		stateName = "missing"
	}
	return
}

const (
	reportStatsInterval  = 3 * time.Second
	heartbeatInterval    = 1 * time.Second
	heartbeatLiveness    = 3 * time.Second
	stateMachineInterval = 1 * time.Second
)
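
// Loop tracks the progress of a load test that runs for a fixed number of
// iterations. Worker goroutines acquire one ticket per iteration and report
// back when the iteration finishes; all counters are updated atomically.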
type Loop struct {
	loopCount     int64 // must be greater than 0
	acquiredCount int64 // number of tickets acquired for load testing
	finishedCount int64 // number of finished iterations
}

func (l *Loop) isFinished() bool {
	// return true when there is no remaining loop count to test
	return atomic.LoadInt64(&l.finishedCount) == l.loopCount
}

func (l *Loop) acquire() bool {
	// get one ticket when there is still remaining loop count to test
	// return true when the ticket is acquired successfully
	if atomic.LoadInt64(&l.acquiredCount) < l.loopCount {
		atomic.AddInt64(&l.acquiredCount, 1)
		return true
	}
	return false
}

func (l *Loop) increaseFinishedCount() {
	atomic.AddInt64(&l.finishedCount, 1)
}
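
// Controller coordinates the spawning of worker goroutines: it tracks the
// current number of running clients against the spawn target, hands out
// tickets via acquire(), and exposes a rebalance channel so that spawn
// parameters can be adjusted while a test is running.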
type Controller struct {
	mutex             sync.RWMutex
	once              sync.Once
	currentClientsNum int64 // current clients count
	spawnCount        int64 // target clients to spawn
	spawnRate         float64
	rebalance         chan bool // dynamically balance boomer running parameters
	spawnDone         chan struct{}
	tasks             []*Task
}

func (c *Controller) setSpawn(spawnCount int64, spawnRate float64) {
	c.mutex.Lock()
	defer c.mutex.Unlock()
	if spawnCount > 0 {
		atomic.StoreInt64(&c.spawnCount, spawnCount)
	}
	if spawnRate > 0 {
		c.spawnRate = spawnRate
	}
}

func (c *Controller) setSpawnCount(spawnCount int64) {
	if spawnCount > 0 {
		atomic.StoreInt64(&c.spawnCount, spawnCount)
	}
}

func (c *Controller) setSpawnRate(spawnRate float64) {
	c.mutex.Lock()
	defer c.mutex.Unlock()
	if spawnRate > 0 {
		c.spawnRate = spawnRate
	}
}

func (c *Controller) getSpawnCount() int64 {
	c.mutex.RLock()
	defer c.mutex.RUnlock()
	return atomic.LoadInt64(&c.spawnCount)
}

func (c *Controller) getSpawnRate() float64 {
	c.mutex.RLock()
	defer c.mutex.RUnlock()
	return c.spawnRate
}

func (c *Controller) getSpawnDone() chan struct{} {
	c.mutex.RLock()
	defer c.mutex.RUnlock()
	return c.spawnDone
}

func (c *Controller) getCurrentClientsNum() int64 {
	c.mutex.RLock()
	defer c.mutex.RUnlock()
	return atomic.LoadInt64(&c.currentClientsNum)
}

func (c *Controller) spawnComplete() {
	close(c.spawnDone)
}

func (c *Controller) getRebalanceChan() chan bool {
	c.mutex.RLock()
	defer c.mutex.RUnlock()
	return c.rebalance
}

func (c *Controller) isFinished() bool {
	// return true when the current clients count has reached the spawn target
	return atomic.LoadInt64(&c.currentClientsNum) == atomic.LoadInt64(&c.spawnCount)
}

func (c *Controller) acquire() bool {
	// get one ticket when there is still remaining spawn count to test
	// return true when the ticket is acquired successfully
	if atomic.LoadInt64(&c.currentClientsNum) < atomic.LoadInt64(&c.spawnCount) {
		atomic.AddInt64(&c.currentClientsNum, 1)
		return true
	}
	return false
}

func (c *Controller) erase() bool {
	// return true if currentClientsNum > spawnCount
	if atomic.LoadInt64(&c.currentClientsNum) > atomic.LoadInt64(&c.spawnCount) {
		atomic.AddInt64(&c.currentClientsNum, -1)
		return true
	}
	return false
}

func (c *Controller) increaseFinishedCount() {
	atomic.AddInt64(&c.currentClientsNum, -1)
}

func (c *Controller) reset() {
	c.mutex.Lock()
	defer c.mutex.Unlock()
	atomic.StoreInt64(&c.spawnCount, 0)
	c.spawnRate = 0
	atomic.StoreInt64(&c.currentClientsNum, 0)
	c.spawnDone = make(chan struct{})
	c.rebalance = make(chan bool)
	c.tasks = []*Task{}
	c.once = sync.Once{}
}
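
// runner is the common core shared by localRunner, workerRunner and
// masterRunner. It owns the task list, the lifecycle channels and the
// waitgroup that tracks every goroutine spawned through goAttach.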
type runner struct {
	state int32

	tasks           []*Task
	totalTaskWeight int
	mutex           sync.RWMutex

	rateLimiter      RateLimiter
	rateLimitEnabled bool

	stats *requestStats

	spawnCount int64 // target clients to spawn
	spawnRate  float64
	runTime    int64

	controller *Controller
	loop       *Loop // specify loop count for testcase, count = loopCount * spawnCount

	// stopChan signals the run goroutine to shut down.
	stopChan chan bool
	// all running workers(goroutines) will select on this channel.
	// stoppingChan is closed by the run goroutine on shutdown.
	stoppingChan chan bool
	// doneChan is closed when all goroutines from start() complete.
	doneChan chan bool
	// reportedChan is closed once all statistics have been reported.
	reportedChan chan bool
	// closing this channel stops all goroutines used in the runner.
	closeChan chan bool

	// wgMu blocks concurrent waitgroup mutation while boomer is stopping
	wgMu sync.RWMutex
	// wg is used to wait for all running workers(goroutines) that depend on the boomer state
	// to exit when stopping the boomer.
	wg sync.WaitGroup

	outputs []Output
}

func (r *runner) setSpawnRate(spawnRate float64) {
	r.mutex.Lock()
	defer r.mutex.Unlock()
	if spawnRate > 0 {
		r.spawnRate = spawnRate
	}
}

func (r *runner) getSpawnRate() float64 {
	r.mutex.RLock()
	defer r.mutex.RUnlock()
	return r.spawnRate
}

func (r *runner) setRunTime(runTime int64) {
	atomic.StoreInt64(&r.runTime, runTime)
}

func (r *runner) getRunTime() int64 {
	return atomic.LoadInt64(&r.runTime)
}

func (r *runner) getSpawnCount() int64 {
	return atomic.LoadInt64(&r.spawnCount)
}

func (r *runner) setSpawnCount(spawnCount int64) {
	atomic.StoreInt64(&r.spawnCount, spawnCount)
}

// safeRun runs fn and recovers from unexpected panics.
// It prevents panics from Task.Fn crashing boomer.
func (r *runner) safeRun(fn func()) {
	defer func() {
		// don't panic
		err := recover()
		if err != nil {
			stackTrace := debug.Stack()
			errMsg := fmt.Sprintf("%v", err)
			os.Stderr.Write([]byte(errMsg))
			os.Stderr.Write([]byte("\n"))
			os.Stderr.Write(stackTrace)
		}
	}()
	fn()
}

func (r *runner) addOutput(o Output) {
	r.outputs = append(r.outputs, o)
}

func (r *runner) outputOnStart() {
	size := len(r.outputs)
	if size == 0 {
		return
	}
	wg := sync.WaitGroup{}
	wg.Add(size)
	for _, output := range r.outputs {
		go func(o Output) {
			o.OnStart()
			wg.Done()
		}(output)
	}
	wg.Wait()
}

func (r *runner) outputOnEvent(data map[string]interface{}) {
	size := len(r.outputs)
	if size == 0 {
		return
	}
	wg := sync.WaitGroup{}
	wg.Add(size)
	for _, output := range r.outputs {
		go func(o Output) {
			o.OnEvent(data)
			wg.Done()
		}(output)
	}
	wg.Wait()
}

func (r *runner) outputOnStop() {
	defer func() {
		r.outputs = make([]Output, 0)
	}()
	size := len(r.outputs)
	if size == 0 {
		return
	}
	wg := sync.WaitGroup{}
	wg.Add(size)
	for _, output := range r.outputs {
		go func(o Output) {
			o.OnStop()
			wg.Done()
		}(output)
	}
	wg.Wait()
}

func (r *runner) reportStats() {
	data := r.stats.collectReportData()
	data["user_count"] = r.controller.getCurrentClientsNum()
	data["state"] = atomic.LoadInt32(&r.state)
	r.outputOnEvent(data)
}

func (r *runner) reportTestResult() {
	// convert stats in total
	var statsTotal interface{} = r.stats.total.serialize()
	entryTotalOutput, err := deserializeStatsEntry(statsTotal)
	if err != nil {
		return
	}
	duration := time.Duration(entryTotalOutput.LastRequestTimestamp-entryTotalOutput.StartTime) * time.Millisecond
	currentTime := time.Now()
	println("=========================================== Statistics Summary ==========================================")
	println(fmt.Sprintf("Current time: %s, Users: %v, Duration: %v, Accumulated Transactions: %d Passed, %d Failed",
		currentTime.Format("2006/01/02 15:04:05"), r.controller.getCurrentClientsNum(), duration, r.stats.transactionPassed, r.stats.transactionFailed))
	table := tablewriter.NewWriter(os.Stdout)
	table.SetHeader([]string{"Name", "# requests", "# fails", "Median", "Average", "Min", "Max", "Content Size", "# reqs/sec", "# fails/sec"})

	row := make([]string, 10)
	row[0] = entryTotalOutput.Name
	row[1] = strconv.FormatInt(entryTotalOutput.NumRequests, 10)
	row[2] = strconv.FormatInt(entryTotalOutput.NumFailures, 10)
	row[3] = strconv.FormatInt(entryTotalOutput.medianResponseTime, 10)
	row[4] = strconv.FormatFloat(entryTotalOutput.avgResponseTime, 'f', 2, 64)
	row[5] = strconv.FormatInt(entryTotalOutput.MinResponseTime, 10)
	row[6] = strconv.FormatInt(entryTotalOutput.MaxResponseTime, 10)
	row[7] = strconv.FormatInt(entryTotalOutput.avgContentLength, 10)
	row[8] = strconv.FormatFloat(entryTotalOutput.currentRps, 'f', 2, 64)
	row[9] = strconv.FormatFloat(entryTotalOutput.currentFailPerSec, 'f', 2, 64)
	table.Append(row)
	table.Render()
	println()
}

func (r *runner) reset() {
	r.controller.reset()
	r.stats.clearAll()
	r.stoppingChan = make(chan bool)
	r.doneChan = make(chan bool)
	r.reportedChan = make(chan bool)
}

func (r *runner) runTimeCheck(runTime int64) {
	if runTime <= 0 {
		return
	}
	stopTime := time.Now().Unix() + runTime
	ticker := time.NewTicker(time.Second)
	defer ticker.Stop()
	for {
		select {
		case <-r.stopChan:
			return
		case <-ticker.C:
			if time.Now().Unix() > stopTime {
				r.stop()
				return
			}
		}
	}
}
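
// spawnWorkers spawns the target number of worker goroutines at the given
// rate. Each worker repeatedly picks a weighted-random task and runs it until
// the quit channel is closed, the loop count is exhausted, or the worker is
// erased after a rebalance. spawnCompleteFunc, if non-nil, is invoked once
// when the spawn target has been reached.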
func (r *runner) spawnWorkers(spawnCount int64, spawnRate float64, quit chan bool, spawnCompleteFunc func()) {
	r.updateState(StateSpawning)
	log.Info().
		Int64("spawnCount", spawnCount).
		Float64("spawnRate", spawnRate).
		Msg("Spawning workers")
	r.controller.setSpawn(spawnCount, spawnRate)
	for {
		select {
		case <-quit:
			// quit spawning goroutine
			log.Info().Msg("Quitting spawning workers")
			return
		default:
			if r.isStarting() && r.controller.acquire() {
				// spawn workers with rate limit
				sleepTime := time.Duration(1000000/r.controller.getSpawnRate()) * time.Microsecond
				time.Sleep(sleepTime)
				// loop count per worker
				var workerLoop *Loop
				if r.loop != nil {
					workerLoop = &Loop{loopCount: atomic.LoadInt64(&r.loop.loopCount) / r.controller.getSpawnCount()}
				}
				r.goAttach(func() {
					for {
						select {
						case <-quit:
							r.controller.increaseFinishedCount()
							return
						default:
							if workerLoop != nil && !workerLoop.acquire() {
								r.controller.increaseFinishedCount()
								return
							}
							if r.rateLimitEnabled {
								blocked := r.rateLimiter.Acquire()
								if !blocked {
									task := r.getTask()
									r.safeRun(task.Fn)
								}
							} else {
								task := r.getTask()
								r.safeRun(task.Fn)
							}
							if workerLoop != nil {
								// finished count in total
								r.loop.increaseFinishedCount()
								// finished count of this single worker
								workerLoop.increaseFinishedCount()
								if r.loop.isFinished() {
									go r.stop()
									r.controller.increaseFinishedCount()
									return
								}
							}
							if r.controller.erase() {
								return
							}
						}
					}
				})
				continue
			}
			r.controller.once.Do(
				func() {
					// spawning complete
					r.controller.spawnComplete()
					if spawnCompleteFunc != nil {
						spawnCompleteFunc()
					}
					r.updateState(StateRunning)
				},
			)
			<-r.controller.getRebalanceChan()
			if r.isStarting() {
				// rebalance spawn count
				r.controller.setSpawn(r.getSpawnCount(), r.getSpawnRate())
			}
		}
	}
}

// goAttach creates a goroutine on a given function and tracks it using
// the runner waitgroup.
// The passed function should exit when r.stoppingChan is closed.
func (r *runner) goAttach(f func()) {
	r.wgMu.RLock() // this blocks with an ongoing close(r.stoppingChan)
	defer r.wgMu.RUnlock()
	select {
	case <-r.stoppingChan:
		log.Warn().Msg("runner has stopped; skipping goAttach")
		return
	default:
	}
	// now safe to add since waitgroup wait has not started yet
	r.wg.Add(1)
	go func() {
		defer r.wg.Done()
		f()
	}()
}

// setTasks will set the runner's task list AND the total task weight,
// which is used to pick a random task later.
func (r *runner) setTasks(t []*Task) {
	r.mutex.Lock()
	defer r.mutex.Unlock()
	r.tasks = t

	weightSum := 0
	for _, task := range r.tasks {
		weightSum += task.Weight
	}
	r.totalTaskWeight = weightSum
}

func (r *runner) getTask() *Task {
	r.mutex.RLock()
	defer r.mutex.RUnlock()
	tasksCount := len(r.tasks)
	if tasksCount == 0 {
		log.Error().Msg("no valid testcase found")
		os.Exit(1)
	} else if tasksCount == 1 {
		// fast path
		return r.tasks[0]
	}

	rs := rand.New(rand.NewSource(time.Now().UnixNano()))

	totalWeight := r.totalTaskWeight
	if totalWeight <= 0 {
		// if none of the tasks has a weight defined, they all have the same chance to run
		randNum := rs.Intn(tasksCount)
		return r.tasks[randNum]
	}

	randNum := rs.Intn(totalWeight)
	runningSum := 0
	for _, task := range r.tasks {
		runningSum += task.Weight
		if runningSum > randNum {
			return task
		}
	}
	return nil
}
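
// statsStart is the stats event loop: it records transactions and request
// results as they arrive and reports aggregated stats every
// reportStatsInterval. It exits, closing reportedChan, after the final
// report once the runner is neither starting nor stopping.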
func (r *runner) statsStart() {
	ticker := time.NewTicker(reportStatsInterval)
	defer ticker.Stop()
	for {
		select {
		// record stats
		case t := <-r.stats.transactionChan:
			r.stats.logTransaction(t.name, t.success, t.elapsedTime, t.contentSize)
		case m := <-r.stats.requestSuccessChan:
			r.stats.logRequest(m.requestType, m.name, m.responseTime, m.responseLength)
		case n := <-r.stats.requestFailureChan:
			r.stats.logRequest(n.requestType, n.name, n.responseTime, 0)
			r.stats.logError(n.requestType, n.name, n.errMsg)
		// report stats
		case <-ticker.C:
			r.reportStats()
			// close reportedChan and return once the last stats report has been sent
			if !r.isStarting() && !r.isStopping() {
				close(r.reportedChan)
				log.Info().Msg("Quitting statsStart")
				return
			}
		}
	}
}

func (r *runner) stop() {
	// stop previous goroutines without blocking
	// those goroutines will exit when r.safeRun returns
	r.gracefulStop()
	if r.rateLimitEnabled {
		r.rateLimiter.Stop()
	}
	r.updateState(StateStopped)
}

// gracefulStop stops the boomer gracefully and shuts down the run goroutine,
// performing any necessary finalization.
// gracefulStop should be called after a start(), otherwise it will block forever.
func (r *runner) gracefulStop() {
	select {
	case r.stopChan <- true:
	case <-r.doneChan:
		return
	}
	<-r.doneChan
}

// stopNotify returns a channel that is closed when the runner is stopped.
func (r *runner) stopNotify() <-chan bool { return r.doneChan }

func (r *runner) getState() int32 {
	return atomic.LoadInt32(&r.state)
}

func (r *runner) updateState(state int32) {
	log.Debug().Int32("from", atomic.LoadInt32(&r.state)).Int32("to", state).Msg("update runner state")
	atomic.StoreInt32(&r.state, state)
}

func (r *runner) isStarting() bool {
	return r.getState() == StateRunning || r.getState() == StateSpawning
}

func (r *runner) isStopping() bool {
	return r.getState() == StateStopping
}
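
// localRunner runs the load test in standalone mode: it spawns goroutines
// and collects stats within a single process, without a master.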
type localRunner struct {
	runner

	profile *Profile
}

func newLocalRunner(spawnCount int64, spawnRate float64) *localRunner {
	return &localRunner{
		runner: runner{
			state:      StateInit,
			stats:      newRequestStats(),
			spawnCount: spawnCount,
			spawnRate:  spawnRate,
			controller: &Controller{},
			outputs:    make([]Output, 0),
			stopChan:   make(chan bool),
			closeChan:  make(chan bool),
			wg:         sync.WaitGroup{},
			wgMu:       sync.RWMutex{},
		},
	}
}
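
// start runs the load test until it is stopped, then tears everything down:
// it resets state, spawns workers and the stats loop, and on shutdown waits
// for all attached goroutines, the final stats report and the output teardown.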
func (r *localRunner) start() {
	r.updateState(StateInit)
	// init localRunner
	r.reset()
	// start rate limiter
	if r.rateLimitEnabled {
		r.rateLimiter.Start()
	}
	// output setup
	r.outputOnStart()

	go r.runTimeCheck(r.getRunTime())
	go r.spawnWorkers(r.getSpawnCount(), r.getSpawnRate(), r.stoppingChan, nil)

	defer func() {
		// block concurrent waitgroup adds in goAttach while stopping
		r.wgMu.Lock()
		r.updateState(StateStopping)
		close(r.stoppingChan)
		close(r.controller.rebalance)
		r.wgMu.Unlock()
		// wait for goroutines before closing
		r.wg.Wait()
		close(r.doneChan)
		// wait until all stats are reported successfully
		<-r.reportedChan
		// report test result
		r.reportTestResult()
		// output teardown
		r.outputOnStop()
		r.updateState(StateQuitting)
	}()

	// start stats report
	go r.statsStart()
	<-r.stopChan
}

func (r *localRunner) stop() {
	if r.runner.isStarting() {
		r.runner.stop()
	}
}

// workerRunner connects to the master, spawns goroutines and collects stats.
type workerRunner struct {
	runner

	nodeID     string
	masterHost string
	masterPort int
	client     *grpcClient

	profile        *Profile
	testCasesBytes []byte
	tasksChan      chan *task

	mutex      sync.Mutex
	ignoreQuit bool
}

func newWorkerRunner(masterHost string, masterPort int) (r *workerRunner) {
	r = &workerRunner{
		runner: runner{
			stats:      newRequestStats(),
			outputs:    make([]Output, 0),
			controller: &Controller{},
			stopChan:   make(chan bool),
			closeChan:  make(chan bool),
		},
		masterHost: masterHost,
		masterPort: masterPort,
		nodeID:     getNodeID(),
		tasksChan:  make(chan *task, 10),
		mutex:      sync.Mutex{},
		ignoreQuit: false,
	}
	return r
}
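
// spawnComplete reports to the master that this worker has finished
// spawning, together with the final spawn count.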
func (r *workerRunner) spawnComplete() {
	data := make(map[string][]byte)
	data["count"] = builtin.Int64ToBytes(r.controller.getSpawnCount())
	r.client.sendChannel() <- newGenericMessage("spawning_complete", data, r.nodeID)
}

func (r *workerRunner) onSpawnMessage(msg *genericMessage) {
	r.client.sendChannel() <- newGenericMessage("spawning", nil, r.nodeID)
	if msg.Profile == nil {
		log.Error().Msg("missing profile in spawn message")
	}
	profile := BytesToProfile(msg.Profile)
	r.setSpawnCount(profile.SpawnCount)
	r.setSpawnRate(profile.SpawnRate)
	if msg.Tasks == nil && len(r.tasks) == 0 {
		log.Error().Msg("missing tasks in spawn message")
	}
	r.tasksChan <- &task{
		Profile:        profile,
		TestCasesBytes: msg.Tasks,
	}
	log.Info().Msg("spawn message handled successfully")
}

func (r *workerRunner) onRebalanceMessage(msg *genericMessage) {
	if msg.Profile == nil {
		log.Error().Msg("missing profile in rebalance message")
	}
	profile := BytesToProfile(msg.Profile)
	r.setSpawnCount(profile.SpawnCount)
	r.setSpawnRate(profile.SpawnRate)
	r.tasksChan <- &task{
		Profile: profile,
	}
	log.Info().Msg("rebalance message handled successfully")
}

// onMessage acts as a state machine: how a message is handled depends on
// the worker's current state.
func (r *workerRunner) onMessage(msg *genericMessage) {
	switch r.getState() {
	case StateInit:
		switch msg.Type {
		case "spawn":
			r.onSpawnMessage(msg)
		case "quit":
			if r.ignoreQuit {
				log.Warn().Msg("master already quit, waiting to reconnect to master")
				break
			}
			r.close()
		}
	case StateSpawning:
		fallthrough
	case StateRunning:
		switch msg.Type {
		case "spawn":
			r.onSpawnMessage(msg)
		case "rebalance":
			r.onRebalanceMessage(msg)
		case "stop":
			r.stop()
		case "quit":
			r.stop()
			if r.ignoreQuit {
				log.Warn().Msg("master already quit, waiting to reconnect to master")
				break
			}
			r.close()
			log.Info().Msg("received quit message from master, all goroutines are stopped")
		}
	case StateStopped:
		switch msg.Type {
		case "spawn":
			r.onSpawnMessage(msg)
		case "quit":
			if r.ignoreQuit {
				log.Warn().Msg("master already quit, waiting to reconnect to master")
				break
			}
			r.close()
		}
	}
}

func (r *workerRunner) onStopped() {
	r.client.sendChannel() <- newGenericMessage("client_stopped", nil, r.nodeID)
}

func (r *workerRunner) onQuitting() {
	if r.getState() != StateQuitting {
		r.client.sendChannel() <- newQuitMessage(r.nodeID)
	}
	r.updateState(StateQuitting)
}

func (r *workerRunner) startListener() {
	for {
		select {
		case msg := <-r.client.recvChannel():
			r.onMessage(msg)
		case <-r.closeChan:
			return
		}
	}
}

// run starts the worker service: it connects and registers to the master,
// starts the send/recv/listener goroutines, and then sends a heartbeat with
// state and resource usage every heartbeatInterval until closed.
func (r *workerRunner) run() {
	println("==================== HttpRunner Worker for Distributed Load Testing ==================== ")
	r.updateState(StateInit)
	r.client = newClient(r.masterHost, r.masterPort, r.nodeID)

	println(fmt.Sprintf("ready to connect to master at %s:%d", r.masterHost, r.masterPort))
	err := r.client.start()
	if err != nil {
		log.Error().Err(err).Msg(fmt.Sprintf("failed to connect to master(%s:%d)", r.masterHost, r.masterPort))
	}

	// register worker information to master
	if err = r.client.register(r.client.config.ctx); err != nil {
		log.Error().Err(err).Msg("failed to register")
	}

	err = r.client.newBiStreamClient()
	if err != nil {
		log.Error().Err(err).Msg("failed to establish bidirectional stream, waiting for master to launch")
	}

	go r.client.recv()
	go r.client.send()

	defer func() {
		// wait for goroutines before closing
		r.wg.Wait()
		// notify master that the worker is quitting
		r.onQuitting()
		ticker := time.NewTicker(1 * time.Second)
		defer ticker.Stop()
		if r.client != nil {
			// wait until the quit message has been sent to the master
			select {
			case <-r.client.disconnectedChannel():
			case <-ticker.C:
				log.Warn().Msg("timeout waiting for the quit message to reach the master, boomer will quit anyway")
			}
			// sign out from master
			if err = r.client.signOut(r.client.config.ctx); err != nil {
				log.Info().Err(err).Msg("failed to sign out")
			}
			// close grpc client
			r.client.close()
		}
	}()

	// listen to master
	go r.startListener()

	// tell master that the worker is ready
	log.Info().Msg("send client ready signal")
	r.client.sendChannel() <- newClientReadyMessageToMaster(r.nodeID)

	// heartbeat
	// See: https://github.com/locustio/locust/commit/a8c0d7d8c588f3980303358298870f2ea394ab93
	ticker := time.NewTicker(heartbeatInterval)
	defer ticker.Stop()
	for {
		select {
		case <-ticker.C:
			if r.getState() == StateMissing {
				err = r.client.register(r.client.config.ctx)
				if err != nil {
					continue
				}
				err = r.client.newBiStreamClient()
				if err != nil {
					continue
				}
				r.updateState(StateInit)
			}
			if atomic.LoadInt32(&r.client.failCount) > 3 {
				go r.stop()
				if !r.isStarting() && !r.isStopping() {
					r.updateState(StateMissing)
				}
				continue
			}
			CPUUsage := GetCurrentCPUPercent()
			MemoryUsage := GetCurrentMemoryPercent()
			PidCPUUsage := GetCurrentPidCPUUsage()
			PidMemoryUsage := GetCurrentPidMemoryUsage()
			data := map[string][]byte{
				"state":                    builtin.Int64ToBytes(int64(r.getState())),
				"current_cpu_usage":        builtin.Float64ToByte(CPUUsage),
				"current_pid_cpu_usage":    builtin.Float64ToByte(PidCPUUsage),
				"current_memory_usage":     builtin.Float64ToByte(MemoryUsage),
				"current_pid_memory_usage": builtin.Float64ToByte(PidMemoryUsage),
				"current_users":            builtin.Int64ToBytes(r.controller.getCurrentClientsNum()),
			}
			r.client.sendChannel() <- newGenericMessage("heartbeat", data, r.nodeID)
		case <-r.closeChan:
			return
		}
	}
}
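
// start runs a single load test on the worker until it is stopped, then
// reports the final stats and notifies the master that the worker stopped.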
func (r *workerRunner) start() {
	r.mutex.Lock()
	defer r.mutex.Unlock()
	r.updateState(StateInit)
	r.reset()
	// start rate limiter
	if r.rateLimitEnabled {
		r.rateLimiter.Start()
	}
	r.outputOnStart()

	go r.runTimeCheck(r.getRunTime())
	go r.spawnWorkers(r.getSpawnCount(), r.getSpawnRate(), r.stoppingChan, r.spawnComplete)

	defer func() {
		// block concurrent waitgroup adds in goAttach while stopping
		r.wgMu.Lock()
		r.updateState(StateStopping)
		close(r.controller.rebalance)
		close(r.stoppingChan)
		r.wgMu.Unlock()
		// wait for goroutines before closing
		r.wg.Wait()
		// reset loop
		if r.loop != nil {
			r.loop = nil
		}
		close(r.doneChan)
		// wait until all stats are reported successfully
		<-r.reportedChan
		// report test result
		r.reportTestResult()
		// output teardown
		r.outputOnStop()
		// notify master that the worker is stopped
		r.onStopped()
	}()

	// start stats report
	go r.statsStart()
	<-r.stopChan
}

func (r *workerRunner) stop() {
	if r.isStarting() {
		r.runner.stop()
	}
}

func (r *workerRunner) close() {
	close(r.closeChan)
}

// masterRunner controls workers to spawn goroutines and collect stats.
type masterRunner struct {
	runner

	masterBindHost string
	masterBindPort int
	server         *grpcServer

	autoStart            bool
	expectWorkers        int
	expectWorkersMaxWait int

	profile *Profile

	parseTestCasesChan chan bool
	testCaseBytesChan  chan []byte
	testCasesBytes     []byte
}

func newMasterRunner(masterBindHost string, masterBindPort int) *masterRunner {
	return &masterRunner{
		runner: runner{
			state:        StateInit,
			stoppingChan: make(chan bool),
			doneChan:     make(chan bool),
			closeChan:    make(chan bool),
			wg:           sync.WaitGroup{},
			wgMu:         sync.RWMutex{},
		},
		masterBindHost:     masterBindHost,
		masterBindPort:     masterBindPort,
		server:             newServer(masterBindHost, masterBindPort),
		parseTestCasesChan: make(chan bool),
		testCaseBytesChan:  make(chan []byte),
	}
}

func (r *masterRunner) setExpectWorkers(expectWorkers int, expectWorkersMaxWait int) {
	r.expectWorkers = expectWorkers
	r.expectWorkersMaxWait = expectWorkersMaxWait
}
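
// heartbeatWorker decrements each worker's heartbeat counter every
// heartbeatInterval and marks a worker as missing once its counter drops
// below zero; it also prints the aggregated stats report every
// heartbeatLiveness.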
func (r *masterRunner) heartbeatWorker() {
	log.Info().Msg("heartbeatWorker, listen and record heartbeats from workers")
	heartBeatTicker := time.NewTicker(heartbeatInterval)
	defer heartBeatTicker.Stop()
	reportTicker := time.NewTicker(heartbeatLiveness)
	defer reportTicker.Stop()
	for {
		select {
		case <-r.closeChan:
			return
		case <-heartBeatTicker.C:
			r.server.clients.Range(func(key, value interface{}) bool {
				workerInfo, ok := value.(*WorkerNode)
				if !ok {
					log.Error().Msg("failed to get worker information")
				}
				go func() {
					if atomic.LoadInt32(&workerInfo.Heartbeat) < 0 {
						if workerInfo.getState() != StateMissing {
							workerInfo.setState(StateMissing)
						}
					} else {
						atomic.AddInt32(&workerInfo.Heartbeat, -1)
					}
				}()
				return true
			})
		case <-reportTicker.C:
			r.reportStats()
		}
	}
}
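
// clientListener handles messages received from workers and updates each
// worker's recorded state, heartbeat and resource usage accordingly.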
func (r *masterRunner) clientListener() {
	log.Info().Msg("clientListener, start to handle messages from workers")
	for {
		select {
		case <-r.closeChan:
			return
		case msg := <-r.server.recvChannel():
			worker, ok := r.server.getClients().Load(msg.NodeID)
			if !ok {
				continue
			}
			workerInfo, ok := worker.(*WorkerNode)
			if !ok {
				continue
			}
			go func() {
				switch msg.Type {
				case typeClientReady:
					workerInfo.setState(StateInit)
				case typeClientStopped:
					workerInfo.setState(StateStopped)
				case typeHeartbeat:
					if workerInfo.getState() == StateMissing {
						workerInfo.setState(int32(builtin.BytesToInt64(msg.Data["state"])))
					}
					workerInfo.updateHeartbeat(3)
					currentCPUUsage, ok := msg.Data["current_cpu_usage"]
					if ok {
						workerInfo.updateCPUUsage(builtin.ByteToFloat64(currentCPUUsage))
					}
					currentPidCpuUsage, ok := msg.Data["current_pid_cpu_usage"]
					if ok {
						workerInfo.updateWorkerCPUUsage(builtin.ByteToFloat64(currentPidCpuUsage))
					}
					currentMemoryUsage, ok := msg.Data["current_memory_usage"]
					if ok {
						workerInfo.updateMemoryUsage(builtin.ByteToFloat64(currentMemoryUsage))
					}
					currentPidMemoryUsage, ok := msg.Data["current_pid_memory_usage"]
					if ok {
						workerInfo.updateWorkerMemoryUsage(builtin.ByteToFloat64(currentPidMemoryUsage))
					}
					currentUsers, ok := msg.Data["current_users"]
					if ok {
						workerInfo.updateUserCount(builtin.BytesToInt64(currentUsers))
					}
				case typeSpawning:
					workerInfo.setState(StateSpawning)
				case typeSpawningComplete:
					workerInfo.setState(StateRunning)
				case typeQuit:
					if workerInfo.getState() == StateQuitting {
						break
					}
					workerInfo.setState(StateQuitting)
				case typeException:
					// TODO: handle exception messages from workers
				default:
				}
			}()
		}
	}
}
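
// stateMachine periodically reconciles the master's state with the workers:
// it marks spawning as complete once the target user count is reached,
// stops when no worker is still starting, and triggers a rebalance when a
// fresh worker joins a running test.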
func (r *masterRunner) stateMachine() {
	ticker := time.NewTicker(stateMachineInterval)
	defer ticker.Stop()
	for {
		select {
		case <-r.closeChan:
			return
		case <-ticker.C:
			switch r.getState() {
			case StateSpawning:
				if r.server.getCurrentUsers() == int(r.getSpawnCount()) {
					log.Warn().Msg("all workers have finished spawning, setting state to running")
					r.updateState(StateRunning)
				}
			case StateRunning:
				if r.server.getStartingClientsLength() == 0 {
					r.updateState(StateStopped)
					continue
				}
				if r.server.getWorkersLengthByState(StateInit) != 0 {
					err := r.rebalance()
					if err != nil {
						log.Error().Err(err).Msg("failed to rebalance")
					}
				}
			case StateStopping:
				if r.server.getReadyClientsLength() == r.server.getAvailableClientsLength() {
					r.updateState(StateStopped)
				}
			}
		}
	}
}
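
// run starts the master service: it brings up the grpc server, optionally
// auto-starts the test once the expected number of workers has joined, and
// runs the state machine, client listener and heartbeat loops.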
func (r *masterRunner) run() {
	r.updateState(StateInit)

	// start grpc server
	err := r.server.start()
	if err != nil {
		log.Error().Err(err).Msg("failed to start grpc server")
		return
	}
	defer func() {
		// close server
		r.server.close()
	}()

	if r.autoStart {
		go func() {
			log.Info().Msg("auto start, waiting for the expected workers to join")
			ticker := time.NewTicker(1 * time.Second)
			defer ticker.Stop()
			tickerMaxWait := time.NewTicker(time.Duration(r.expectWorkersMaxWait) * time.Second)
			defer tickerMaxWait.Stop()
			for {
				select {
				case <-r.closeChan:
					return
				case <-ticker.C:
					c := r.server.getAvailableClientsLength()
					log.Info().Msg(fmt.Sprintf("expected worker number: %v, current worker count: %v", r.expectWorkers, c))
					if c >= r.expectWorkers {
						err = r.start()
						if err != nil {
							log.Error().Err(err).Msg("failed to run")
							os.Exit(1)
						}
						return
					}
				case <-tickerMaxWait.C:
					log.Warn().Msg("reached max wait time, quitting")
					r.onQuitting()
					os.Exit(1)
				}
			}
		}()
	}

	// master state machine
	r.goAttach(r.stateMachine)
	// listen for and handle messages from workers
	r.goAttach(r.clientListener)
	// listen for and record heartbeats from workers
	r.heartbeatWorker()
	<-r.closeChan
}
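
// start splits the profile evenly across the available workers (spawn count,
// spawn rate and max RPS) and broadcasts a spawn message, together with the
// parsed testcases, to every worker that is not quitting or missing.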
func (r *masterRunner) start() error {
	numWorkers := r.server.getAvailableClientsLength()
	if numWorkers == 0 {
		return errors.New("current available workers: 0")
	}

	// fetch testcases
	testCasesBytes, err := r.fetchTestCases()
	if err != nil {
		return err
	}

	workerProfile := &Profile{}
	if err := copier.Copy(workerProfile, r.profile); err != nil {
		log.Error().Err(err).Msg("copy workerProfile failed")
		return err
	}

	// spawn count
	spawnCounts := builtin.SplitInteger(int(r.profile.SpawnCount), numWorkers)
	// spawn rate
	spawnRate := workerProfile.SpawnRate / float64(numWorkers)
	if spawnRate < 1 {
		spawnRate = 1
	}
	// max RPS
	maxRPSs := builtin.SplitInteger(int(workerProfile.MaxRPS), numWorkers)

	r.updateState(StateSpawning)
	log.Info().Msg("send spawn data to workers")
	cur := 0
	r.server.clients.Range(func(key, value interface{}) bool {
		if workerInfo, ok := value.(*WorkerNode); ok {
			if workerInfo.getState() == StateQuitting || workerInfo.getState() == StateMissing {
				return true
			}
			if workerProfile.SpawnCount > 0 {
				workerProfile.SpawnCount = int64(spawnCounts[cur])
			}
			workerProfile.MaxRPS = int64(maxRPSs[cur])
			workerProfile.SpawnRate = spawnRate
			workerInfo.getStream() <- &messager.StreamResponse{
				Type:    "spawn",
				Profile: ProfileToBytes(workerProfile),
				NodeID:  workerInfo.ID,
				Tasks:   testCasesBytes,
			}
			cur++
		}
		return true
	})
	log.Warn().Interface("profile", r.profile).Msg("sent spawn data to workers successfully")
	return nil
}
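
// rebalance recomputes the per-worker spawn parameters and pushes them to
// every available worker; workers still in the init state receive a full
// spawn message (including testcases), all others a rebalance message.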
func (r *masterRunner) rebalance() error {
	numWorkers := r.server.getAvailableClientsLength()
	if numWorkers == 0 {
		return errors.New("current available workers: 0")
	}

	workerProfile := &Profile{}
	if err := copier.Copy(workerProfile, r.profile); err != nil {
		log.Error().Err(err).Msg("copy workerProfile failed")
		return err
	}

	// spawn count
	spawnCounts := builtin.SplitInteger(int(r.profile.SpawnCount), numWorkers)
	// spawn rate
	spawnRate := workerProfile.SpawnRate / float64(numWorkers)
	if spawnRate < 1 {
		spawnRate = 1
	}
	// max RPS
	maxRPSs := builtin.SplitInteger(int(workerProfile.MaxRPS), numWorkers)

	cur := 0
	log.Info().Msg("send rebalance data to workers")
	r.server.clients.Range(func(key, value interface{}) bool {
		if workerInfo, ok := value.(*WorkerNode); ok {
			if workerInfo.getState() == StateQuitting || workerInfo.getState() == StateMissing {
				return true
			}
			if workerProfile.SpawnCount > 0 {
				workerProfile.SpawnCount = int64(spawnCounts[cur])
			}
			workerProfile.MaxRPS = int64(maxRPSs[cur])
			workerProfile.SpawnRate = spawnRate
			if workerInfo.getState() == StateInit {
				workerInfo.getStream() <- &messager.StreamResponse{
					Type:    "spawn",
					Profile: ProfileToBytes(workerProfile),
					NodeID:  workerInfo.ID,
					Tasks:   r.testCasesBytes,
				}
			} else {
				workerInfo.getStream() <- &messager.StreamResponse{
					Type:    "rebalance",
					Profile: ProfileToBytes(workerProfile),
					NodeID:  workerInfo.ID,
				}
			}
			cur++
		}
		return true
	})
	log.Warn().Msg("sent rebalance data to workers successfully")
	return nil
}
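
// fetchTestCases signals on parseTestCasesChan that the testcases should be
// parsed and waits up to 30 seconds for the serialized result to arrive on
// testCaseBytesChan.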
func (r *masterRunner) fetchTestCases() ([]byte, error) {
	ticker := time.NewTicker(30 * time.Second)
	defer ticker.Stop()
	// drain a stale result from a previous run, if any
	if len(r.testCaseBytesChan) > 0 {
		<-r.testCaseBytesChan
	}
	r.parseTestCasesChan <- true
	select {
	case <-ticker.C:
		return nil, errors.New("parse testcases timeout")
	case testCasesBytes := <-r.testCaseBytesChan:
		r.testCasesBytes = testCasesBytes
		return testCasesBytes, nil
	}
}

func (r *masterRunner) stop() error {
	if r.isStarting() {
		r.updateState(StateStopping)
		r.server.sendBroadcasts(&genericMessage{Type: "stop"})
		return nil
	}
	return errors.New("already stopped")
}

func (r *masterRunner) onQuitting() {
	if r.getState() != StateQuitting {
		r.server.sendBroadcasts(&genericMessage{
			Type: "quit",
		})
	}
	r.updateState(StateQuitting)
}

func (r *masterRunner) close() {
	r.onQuitting()
	close(r.closeChan)
}
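
// reportStats prints the master's live status table: one row per worker with
// its state, current users and resource usage.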
func (r *masterRunner) reportStats() {
	currentTime := time.Now()
	println()
	println("==================================== HttpRunner Master for Distributed Load Testing ==================================== ")
	println(fmt.Sprintf("Current time: %s, State: %v, Current Available Workers: %v, Target Users: %v, Current Users: %v",
		currentTime.Format("2006/01/02 15:04:05"), getStateName(r.getState()), r.server.getAvailableClientsLength(), r.getSpawnCount(), r.server.getCurrentUsers()))
	table := tablewriter.NewWriter(os.Stdout)
	table.SetColMinWidth(0, 40)
	table.SetColMinWidth(1, 10)
	table.SetColMinWidth(2, 10)
	table.SetHeader([]string{"Worker ID", "IP", "State", "Current Users", "CPU Usage (%)", "Memory Usage (%)"})
	for _, worker := range r.server.getAllWorkers() {
		row := make([]string, 6)
		row[0] = worker.ID
		row[1] = worker.IP
		row[2] = fmt.Sprintf("%v", getStateName(worker.State))
		row[3] = fmt.Sprintf("%v", worker.UserCount)
		row[4] = fmt.Sprintf("%.2f", worker.CPUUsage)
		row[5] = fmt.Sprintf("%.2f", worker.MemoryUsage)
		table.Append(row)
	}
	table.Render()
	println()
}