Go性能调优2 – 性能调优示例

性能调优过程

  1. 设定优化目标(如:qps达到多少,内存消耗低于多少)
  2. 分析系统瓶颈点
  3. 优化瓶颈点

ps: 而不是盲目的,无限制的优化我们的程序

常见分析指标

  • Wall Time 程序运行的绝对时间,其中可能会调用外部系统,可能会等待web响应,会影响绝对时间
  • CPU Time
  • Block Time
  • Memory allocation 内存分配
  • GC times/time spent GC的次数/GC的耗时

调优示例代码

optimization.go

package optimization

import (
    "encoding/json"
    "strconv"
)

func createRequest() string {
    payload := make([]int, 100, 100)
    for i := 0; i < 100; i++ {
        payload[i] = i
    }
    req := Request{"demo_transaction", payload}
    v, err := json.Marshal(&req)
    if err != nil {
        panic(err)
    }
    return string(v)
}

func processRequest(reqs []string) []string {
    reps := []string{}
    for _, req := range reqs {
        reqObj := &Request{}
        json.Unmarshal([]byte(req), reqObj)
        ret := ""
        for _, e := range reqObj.PayLoad {
            ret += strconv.Itoa(e) + ","
        }
        repObj := &Response{reqObj.TransactionID, ret}
        repJson, err := json.Marshal(&repObj)
        if err != nil {
            panic(err)
        }
        reps = append(reps, string(repJson))
    }
    return reps
}

optimization_test.go

package optimization

import "testing"

func TestCreateRequest(t *testing.T) {
    str := createRequest()
    t.Log(str)
}

func TestProcessRequest(t *testing.T) {
    reqs := []string{}
    reqs = append(reqs, createRequest())
    reps := processRequest(reqs)
    t.Log(reps[0])
}

func BenchmarkProcessRequest(b *testing.B) {
    reqs := []string{}
    reqs = append(reqs, createRequest())
    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        _ = processRequest(reqs)
    }
    b.StopTimer()
}

structs.go

package optimization

// Request is the input of the filter
type Request struct {
    TransactionID string `json:"transaction_id"`
    PayLoad       []int  `json:"payload"`
}

// Response is the output of the filter
type Response struct {
    TransactionID string `json:"transaction_id"`
    Exp           string `json:"exp"`
}

先来看下初始的数据

[root@VM_0_14_centos ch32]# go test -bench=.
goos: linux
goarch: amd64
pkg: learn/src/ch32
BenchmarkProcessRequest-2          27771             45306 ns/op
PASS
ok      learn/src/ch32  1.705s
[root@VM_0_14_centos ch32]# go test -bench=. -cpuprofile=cpu.prof
goos: linux
goarch: amd64
pkg: learn/src/ch32
BenchmarkProcessRequest-2          27256             46167 ns/op
PASS
ok      learn/src/ch32  1.845s
[root@VM_0_14_centos ch32]# go tool pprof cpu.prof
File: ch32.test
Type: cpu
Time: Nov 11, 2020 at 4:05pm (CST)
Duration: 1.84s, Total samples = 1.91s (103.86%)
Entering interactive mode (type "help" for commands, "o" for options)
(pprof) top -cum
Showing nodes accounting for 200ms, 10.47% of 1910ms total
Showing top 10 nodes out of 161
      flat  flat%   sum%        cum   cum%
         0     0%     0%     1690ms 88.48%  learn/src/ch32.BenchmarkProcessRequest
      40ms  2.09%  2.09%     1690ms 88.48%  learn/src/ch32.processRequest
         0     0%  2.09%     1690ms 88.48%  testing.(*B).launch
         0     0%  2.09%     1690ms 88.48%  testing.(*B).runN
         0     0%  2.09%     1220ms 63.87%  encoding/json.Unmarshal
         0     0%  2.09%     1050ms 54.97%  encoding/json.(*decodeState).object
         0     0%  2.09%     1050ms 54.97%  encoding/json.(*decodeState).unmarshal
      20ms  1.05%  3.14%     1050ms 54.97%  encoding/json.(*decodeState).value
     110ms  5.76%  8.90%     1010ms 52.88%  encoding/json.(*decodeState).array
      30ms  1.57% 10.47%      530ms 27.75%  encoding/json.(*decodeState).literalStore
(pprof) list processRequest
Total: 1.91s
ROUTINE ======================== learn/src/ch32.processRequest in /root/go_learn/src/ch32/optimization.go
      40ms      1.69s (flat, cum) 88.48% of Total
         .          .     20:
         .          .     21:func processRequest(reqs []string) []string {
         .          .     22:   reps := []string{}
         .          .     23:   for _, req := range reqs {
         .          .     24:           reqObj := &Request{}
         .      1.22s     25:           json.Unmarshal([]byte(req), reqObj)
         .          .     26:           ret := ""
      30ms       30ms     27:           for _, e := range reqObj.PayLoad {
      10ms      420ms     28:                   ret += strconv.Itoa(e) + ","
         .          .     29:           }
         .       10ms     30:           repObj := &Response{reqObj.TransactionID, ret}
         .       10ms     31:           repJson, err := json.Marshal(&repObj)
         .          .     32:           if err != nil {
         .          .     33:                   panic(err)
         .          .     34:           }
         .          .     35:           reps = append(reps, string(repJson))
         .          .     36:   }

可以看到时间基本消耗在 json.Unmarshal, 我们可以使用 easyJson优化

[root@VM_0_14_centos ch32]# ~/go/bin/easyjson -all structs.go

修改 optimization.go对应代码

···

// json.Unmarshal([]byte(req), reqObj)
reqObj.UnmarshalJSON([]byte(req))
ret := ""
for _, e := range reqObj.PayLoad {
    ret += strconv.Itoa(e) + ","
}

···

// repJson, err := json.Marshal(&repObj)
repJson, err := repObj.MarshalJSON()

···

修改后,再次用benck查看

[root@VM_0_14_centos ch32]# go test -bench=.
goos: linux
goarch: amd64
pkg: learn/src/ch32
BenchmarkProcessRequest-2          60118             20171 ns/op
PASS
ok      learn/src/ch32  1.422s
[root@VM_0_14_centos ch32]# go test -bench=. -cpuprofile=cpu.prof
goos: linux
goarch: amd64
pkg: learn/src/ch32
BenchmarkProcessRequest-2          53934             23482 ns/op
PASS
ok      learn/src/ch32  1.635s
[root@VM_0_14_centos ch32]# go tool pprof cpu.prof
File: ch32.test
Type: cpu
Time: Nov 11, 2020 at 4:29pm (CST)
Duration: 1.63s, Total samples = 1.82s (111.89%)
Entering interactive mode (type "help" for commands, "o" for options)
(pprof) top -cum
Showing nodes accounting for 0.33s, 18.13% of 1.82s total
Showing top 10 nodes out of 149
      flat  flat%   sum%        cum   cum%
         0     0%     0%      1.46s 80.22%  learn/src/ch32.BenchmarkProcessRequest
     0.01s  0.55%  0.55%      1.46s 80.22%  learn/src/ch32.processRequest
         0     0%  0.55%      1.46s 80.22%  testing.(*B).launch
         0     0%  0.55%      1.46s 80.22%  testing.(*B).runN
     0.04s  2.20%  2.75%      0.77s 42.31%  runtime.concatstring3
     0.14s  7.69% 10.44%      0.73s 40.11%  runtime.concatstrings
         0     0% 10.44%      0.53s 29.12%  learn/src/ch32.(*Request).UnmarshalJSON (inline)
     0.01s  0.55% 10.99%      0.53s 29.12%  learn/src/ch32.easyjson6a975c40DecodeLearnSrcCh321
     0.11s  6.04% 17.03%      0.50s 27.47%  runtime.mallocgc
     0.02s  1.10% 18.13%      0.45s 24.73%  runtime.rawstringtmp
(pprof) list processRequest
Total: 1.82s
ROUTINE ======================== learn/src/ch32.processRequest in /root/go_learn/src/ch32/optimization.go
      10ms      1.46s (flat, cum) 80.22% of Total
         .          .     20:
         .          .     21:func processRequest(reqs []string) []string {
         .          .     22:   reps := []string{}
         .          .     23:   for _, req := range reqs {
         .          .     24:           reqObj := &Request{}
         .      540ms     25:           reqObj.UnmarshalJSON([]byte(req))
         .          .     26:           // json.Unmarshal([]byte(req), reqObj)
         .          .     27:           ret := ""
         .          .     28:           for _, e := range reqObj.PayLoad {
      10ms      810ms     29:                   ret += strconv.Itoa(e) + ","
         .          .     30:           }
         .          .     31:           repObj := &Response{reqObj.TransactionID, ret}
         .       70ms     32:           repJson, err := repObj.MarshalJSON()
         .          .     33:           // repJson, err := json.Marshal(&repObj)
         .          .     34:           if err != nil {
         .          .     35:                   panic(err)
         .          .     36:           }
         .       40ms     37:           reps = append(reps, string(repJson))
         .          .     38:   }
         .          .     39:   return reps
         .          .     40:}
         .          .     41:
         .          .     42:func processRequestOld(reqs []string) []string {

可以看到json.Unmarshal改用easyjson后已经从 1.22s 优化到 540ms。

现在的最耗时的语句变为 ret += strconv.Itoa(e) + "," 字符串相加的语句,我们知道Go语言的 string 是不可变的byte slice,每次相加都会生成一个新的 string 会去给内存复制数据,原有的那片对GC也有一定的影响。

这部分代码跟内存相关,可以先做一个内存的 prof 查看下内存的消耗情况:

[root@VM_0_14_centos ch32]# go test -bench=. -memprofile=mem.prof
goos: linux
goarch: amd64
pkg: learn/src/ch32
BenchmarkProcessRequest-2          49272             20444 ns/op
PASS
ok      learn/src/ch32  1.266s
[root@VM_0_14_centos ch32]# go tool pprof mem.prof
File: ch32.test
Type: alloc_space
Time: Nov 11, 2020 at 5:01pm (CST)
Entering interactive mode (type "help" for commands, "o" for options)
(pprof) top -cum
Showing nodes accounting for 1034.76MB, 96.23% of 1075.27MB total
Dropped 1 node (cum <= 5.38MB)
Showing top 10 nodes out of 17
      flat  flat%   sum%        cum   cum%
         0     0%     0%  1075.27MB   100%  learn/src/ch32.BenchmarkProcessRequest
  902.18MB 83.90% 83.90%  1075.27MB   100%  learn/src/ch32.processRequest
         0     0% 83.90%  1075.27MB   100%  testing.(*B).launch
         0     0% 83.90%  1075.27MB   100%  testing.(*B).runN
         0     0% 83.90%   121.58MB 11.31%  learn/src/ch32.(*Request).UnmarshalJSON (inline)
  120.58MB 11.21% 95.12%   121.58MB 11.31%  learn/src/ch32.easyjson6a975c40DecodeLearnSrcCh321
         0     0% 95.12%    51.51MB  4.79%  learn/src/ch32.Response.MarshalJSON
         0     0% 95.12%    32.01MB  2.98%  github.com/mailru/easyjson/buffer.(*Buffer).EnsureSpace (inline)
      12MB  1.12% 96.23%    32.01MB  2.98%  github.com/mailru/easyjson/buffer.(*Buffer).ensureSpaceSlow
         0     0% 96.23%    32.01MB  2.98%  learn/src/ch32.easyjson6a975c40EncodeLearnSrcCh32
(pprof) list processRequest
Total: 1.05GB
ROUTINE ======================== learn/src/ch32.processRequest in /root/go_learn/src/ch32/optimization.go
  902.18MB     1.05GB (flat, cum)   100% of Total
         .          .     20:
         .          .     21:func processRequest(reqs []string) []string {
         .          .     22:   reps := []string{}
         .          .     23:   for _, req := range reqs {
         .          .     24:           reqObj := &Request{}
   23.01MB   144.58MB     25:           reqObj.UnmarshalJSON([]byte(req))
         .          .     26:           // json.Unmarshal([]byte(req), reqObj)
         .          .     27:           ret := ""
         .          .     28:           for _, e := range reqObj.PayLoad {
  852.66MB   852.66MB     29:                   ret += strconv.Itoa(e) + ","
         .          .     30:           }
         .          .     31:           repObj := &Response{reqObj.TransactionID, ret}
         .    51.51MB     32:           repJson, err := repObj.MarshalJSON()
         .          .     33:           // repJson, err := json.Marshal(&repObj)
         .          .     34:           if err != nil {
         .          .     35:                   panic(err)
         .          .     36:           }
   26.51MB    26.51MB     37:           reps = append(reps, string(repJson))
         .          .     38:   }
         .          .     39:   return reps
         .          .     40:}
         .          .     41:
         .          .     42:func processRequestOld(reqs []string) []string {

可以看出主要耗费内存的就是这句:ret += strconv.Itoa(e) + ","

优化 optimization.go

        ...
        // ret := ""
        var buf strings.Builder
        for _, e := range reqObj.PayLoad {
            // ret += strconv.Itoa(e) + ","
            buf.WriteString(strconv.Itoa(e))
            buf.WriteString(",")
        }
        repObj := &Response{reqObj.TransactionID, buf.String()}
        ...

再次运行benchmark:

[root@VM_0_14_centos ch32]# go test -bench=.
goos: linux
goarch: amd64
pkg: learn/src/ch32
BenchmarkProcessRequest-2         120964             10544 ns/op
PASS
ok      learn/src/ch32  1.383s

可以看出明显时间消耗由 20171ns/op 变为 10544ns/op

[root@VM_0_14_centos ch32]# go test -bench=. -cpuprofile=cpu.prof
goos: linux
goarch: amd64
pkg: learn/src/ch32
BenchmarkProcessRequest-2         108517             10991 ns/op
PASS
ok      learn/src/ch32  1.434s
[root@VM_0_14_centos ch32]# go tool pprof cpu.prof 
File: ch32.test
Type: cpu
Time: Nov 11, 2020 at 5:15pm (CST)
Duration: 1.42s, Total samples = 1.46s (102.47%)
Entering interactive mode (type "help" for commands, "o" for options)
(pprof) top -cum
Showing nodes accounting for 0.27s, 18.49% of 1.46s total
Showing top 10 nodes out of 111
      flat  flat%   sum%        cum   cum%
         0     0%     0%      1.36s 93.15%  learn/src/ch32.BenchmarkProcessRequest
     0.01s  0.68%  0.68%      1.36s 93.15%  learn/src/ch32.processRequest
         0     0%  0.68%      1.36s 93.15%  testing.(*B).launch
         0     0%  0.68%      1.36s 93.15%  testing.(*B).runN
         0     0%  0.68%      0.88s 60.27%  learn/src/ch32.(*Request).UnmarshalJSON (inline)
     0.04s  2.74%  3.42%      0.88s 60.27%  learn/src/ch32.easyjson6a975c40DecodeLearnSrcCh321
     0.03s  2.05%  5.48%      0.39s 26.71%  github.com/mailru/easyjson/jlexer.(*Lexer).IsDelim
     0.13s  8.90% 14.38%      0.34s 23.29%  github.com/mailru/easyjson/jlexer.(*Lexer).FetchToken
         0     0% 14.38%      0.34s 23.29%  github.com/mailru/easyjson/jlexer.(*Lexer).Int (inline)
     0.06s  4.11% 18.49%      0.34s 23.29%  github.com/mailru/easyjson/jlexer.(*Lexer).Int64
(pprof) list processRequest
Total: 1.46s
ROUTINE ======================== learn/src/ch32.processRequest in /root/go_learn/src/ch32/optimization.go
      10ms      1.36s (flat, cum) 93.15% of Total
         .          .     22:func processRequest(reqs []string) []string {
         .          .     23:   reps := []string{}
         .          .     24:   for _, req := range reqs {
         .          .     25:           reqObj := &Request{}
         .          .     26:           // json.Unmarshal([]byte(req), reqObj)
         .      900ms     27:           reqObj.UnmarshalJSON([]byte(req))
         .          .     28:           // ret := ""
         .          .     29:           var buf strings.Builder
         .          .     30:           for _, e := range reqObj.PayLoad {
         .          .     31:                   // ret += strconv.Itoa(e) + ","
         .       90ms     32:                   buf.WriteString(strconv.Itoa(e))
      10ms      120ms     33:                   buf.WriteString(",")
         .          .     34:           }
         .          .     35:           repObj := &Response{reqObj.TransactionID, buf.String()}
         .          .     36:           // repJson, err := json.Marshal(&repObj)
         .      220ms     37:           repJson, err := repObj.MarshalJSON()
         .          .     38:           if err != nil {
         .          .     39:                   panic(err)
         .          .     40:           }
         .       30ms     41:           reps = append(reps, string(repJson))
         .          .     42:   }
         .          .     43:   return reps
         .          .     44:}
         .          .     45:
         .          .     46:func processRequestOld(reqs []string) []string {

可以看出字符拼接语句已经从 810ms 降到了 90ms+120ms;

注:上面的时间仅供参考,因为是用benchmark做profile的,时间越短 benchmark就运行越多的次数

再看一下修改后内存的消耗情况:

[root@VM_0_14_centos ch32]# go test -bench=. -memprofile=mem.prof
goos: linux
goarch: amd64
pkg: learn/src/ch32
BenchmarkProcessRequest-2         121188             10223 ns/op
PASS
ok      learn/src/ch32  1.349s
[root@VM_0_14_centos ch32]# go tool pprof mem.prof
File: ch32.test
Type: alloc_space
Time: Nov 11, 2020 at 5:22pm (CST)
Entering interactive mode (type "help" for commands, "o" for options)
(pprof) top -cum
Showing nodes accounting for 471.73MB, 83.09% of 567.75MB total
Dropped 1 node (cum <= 2.84MB)
Showing top 10 nodes out of 18
      flat  flat%   sum%        cum   cum%
         0     0%     0%   567.75MB   100%  learn/src/ch32.BenchmarkProcessRequest
   93.53MB 16.47% 16.47%   567.75MB   100%  learn/src/ch32.processRequest
         0     0% 16.47%   567.75MB   100%  testing.(*B).launch
         0     0% 16.47%   567.75MB   100%  testing.(*B).runN
         0     0% 16.47%   246.66MB 43.44%  learn/src/ch32.(*Request).UnmarshalJSON (inline)
  245.66MB 43.27% 59.74%   246.66MB 43.44%  learn/src/ch32.easyjson6a975c40DecodeLearnSrcCh321
         0     0% 59.74%   118.53MB 20.88%  learn/src/ch32.Response.MarshalJSON
  109.03MB 19.20% 78.95%   109.03MB 19.20%  strings.(*Builder).WriteString (inline)
         0     0% 78.95%    72.51MB 12.77%  github.com/mailru/easyjson/buffer.(*Buffer).EnsureSpace (inline)
   23.50MB  4.14% 83.09%    72.51MB 12.77%  github.com/mailru/easyjson/buffer.(*Buffer).ensureSpaceSlow
(pprof) list processRequest
Total: 567.75MB
ROUTINE ======================== learn/src/ch32.processRequest in /root/go_learn/src/ch32/optimization.go
   93.53MB   567.75MB (flat, cum)   100% of Total
         .          .     22:func processRequest(reqs []string) []string {
         .          .     23:   reps := []string{}
         .          .     24:   for _, req := range reqs {
         .          .     25:           reqObj := &Request{}
         .          .     26:           // json.Unmarshal([]byte(req), reqObj)
   49.02MB   295.68MB     27:           reqObj.UnmarshalJSON([]byte(req))
         .          .     28:           // ret := ""
         .          .     29:           var buf strings.Builder
         .          .     30:           for _, e := range reqObj.PayLoad {
         .          .     31:                   // ret += strconv.Itoa(e) + ","
         .    47.01MB     32:                   buf.WriteString(strconv.Itoa(e))
         .    62.03MB     33:                   buf.WriteString(",")
         .          .     34:           }
         .          .     35:           repObj := &Response{reqObj.TransactionID, buf.String()}
         .          .     36:           // repJson, err := json.Marshal(&repObj)
         .   118.53MB     37:           repJson, err := repObj.MarshalJSON()
         .          .     38:           if err != nil {
         .          .     39:                   panic(err)
         .          .     40:           }
   44.51MB    44.51MB     41:           reps = append(reps, string(repJson))
         .          .     42:   }
         .          .     43:   return reps
         .          .     44:}
         .          .     45:
         .          .     46:func processRequestOld(reqs []string) []string {

可以看出内存的消耗也明显下降了许多。内存分配的次数少了整体的性能有明显的提升了。

与原有的代码对比:

[root@VM_0_14_centos ch32]# go test -bench=.
goos: linux
goarch: amd64
pkg: learn/src/ch32
BenchmarkProcessRequest-2         121729              9937 ns/op
BenchmarkProcessRequestOld-2       36087             36520 ns/op
PASS
ok      learn/src/ch32  3.969s
暂无评论

发送评论 编辑评论


				
|´・ω・)ノ
ヾ(≧∇≦*)ゝ
(☆ω☆)
(╯‵□′)╯︵┴─┴
 ̄﹃ ̄
(/ω\)
∠( ᐛ 」∠)_
(๑•̀ㅁ•́ฅ)
→_→
୧(๑•̀⌄•́๑)૭
٩(ˊᗜˋ*)و
(ノ°ο°)ノ
(´இ皿இ`)
⌇●﹏●⌇
(ฅ´ω`ฅ)
(╯°A°)╯︵○○○
φ( ̄∇ ̄o)
ヾ(´・ ・`。)ノ"
( ง ᵒ̌皿ᵒ̌)ง⁼³₌₃
(ó﹏ò。)
Σ(っ °Д °;)っ
( ,,´・ω・)ノ"(´っω・`。)
╮(╯▽╰)╭
o(*////▽////*)q
>﹏<
( ๑´•ω•) "(ㆆᴗㆆ)
😂
😀
😅
😊
🙂
🙃
😌
😍
😘
😜
😝
😏
😒
🙄
😳
😡
😔
😫
😱
😭
💩
👻
🙌
🖕
👍
👫
👬
👭
🌚
🌝
🙈
💊
😶
🙏
🍦
🍉
😣
Source: github.com/k4yt3x/flowerhd
颜文字
Emoji
小恐龙
花!
上一篇
下一篇