需求
想知道接口调用时长的增长率,调用频次很高的接口,调用时间超过500毫秒的接口,接口请求数监控(方便知道接口被调用需求增长了)
分析
这里有个陷阱,$http_x_forwarded_for
可能会包含逗号,
,用逗号做分隔符的话会有bug,省事点直接注释掉
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
| log_format main
'{"@timestamp":"$time_iso8601",'
'"host":"$hostname",'
'"server_ip":"$server_addr",'
'"client_ip":"$remote_addr",'
# '"xff":"$http_x_forwarded_for",'
'"domain":"$host",'
'"url":"$uri",'
'"referer":"$http_referer",'
'"args":"$args",'
'"upstreamtime":"$upstream_response_time",'
'"responsetime":"$request_time",'
'"request_method":"$request_method",'
'"status":"$status",'
'"size":"$body_bytes_sent",'
'"request_length":"$request_length",'
'"protocol":"$server_protocol",'
'"upstreamhost":"$upstream_addr",'
'"file_dir":"$request_filename",'
'"http_user_agent":"$http_user_agent"'
'}';
|
1
| {"@timestamp":"2025-09-10T15:26:06+08:00","host":"nginx02","server_ip":"10.129.240.85","client_ip":"10.130.75.34","domain":"ywmom-pre.sunwoda-evb.com","url":"/mes/ProductionGroupInfo","referer":"-","args":"-","upstreamtime":"0.012","responsetime":"0.013","request_method":"POST","status":"200","size":"140","request_length":"2243","protocol":"HTTP/1.1","upstreamhost":"10.129.240.123:80","file_dir":"/usr/local/nginx/html/mes/ProductionGroupInfo","http_user_agent":"-"}
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
| # awk中变量是全局作用域,同名变量是同一个变量
BEGIN {
FS = "[,{]"
records1 = 0
records2 = 0
records3 = 0
records4_twosecago_post = 0
records4_twosecago_pre = 0
records4_onwsecago_post = 0
records4_onesecago_pre = 0
currtime = getTimeSeconds()
twosecago = getTimeSeconds(-2)
onesecago = getTimeSeconds(-1)
btob = between_twominago_oneminago_begin = getTimeSeconds(-120)
btoe = between_twominago_oneminago_end = getTimeSeconds(-61)
bonb = between_oneminago_now_begin = getTimeSeconds(-60)
bone = between_oneminago_now_end = onesecago
exclude_domain["harbor-mom.example.com"]
exclude_domain["logs-mom.example.com"]
exclude_domain["oss-mom.example.com"]
exclude_domain["ossapi-mom.example.com"]
exclude_domain["gateway-mom.example.com"]
exclude_domain["idm-mom.example.com"]
exclude_domain["mom.example.com"]
exclude_domain["paas-mom.example.com"]
# 需求1平均时长增长率告警:2分钟前到1分钟前 至 前1分钟内 同个接口平均调用时长增长率超过x%
x = 50
# 需求2调用频次告警:1秒钟前同IP同接口调用超过y次(包含)
y = 6
# 需求3调用时长告警:前1分钟内接口调用时长大于z秒,支持小数
z = 0.5
# 需求4接口请求数告警:2秒前、1秒前
mompost = "mom-post.example.com"
mompre = "mom-pre.example.com"
z_post = z # 中后工序
z_pre = 5 # 前工序
post_exclude_url["/mes/LoginCheck"]
post_exclude_url["/mes/SaveInOutStockTime"]
post_exclude_url["/mes/GetTime"]
pre_exclude_url["/mes/preLoginCheck"]
pre_exclude_url["/mes/GetTime"]
}
function getTimeSeconds(offset) {
if (offset == "") offset = 0
return strftime("%Y-%m-%dT%H:%M:%S", systime() + offset)
}
function sendMsg(msg) {
hook = "https://open.feishu.cn/open-apis/bot/v2/hook/uuiduuid-uuid-uuid-uuid-uuiduuiduuid"
data = "'{\"msg_type\":\"text\",\"content\":{\"text\":\"" msg "\"}}'"
command = "curl -X POST -H \"Content-Type: application/json\" -d " data " " hook
system(command)
}
{
gsub(/"/, "", $0)
match($2, /@timestamp:(.*)\+08:00/, timestamp)
match($5, /client_ip:(.*)/, client_ip)
match($6, /domain:(.*)/, domain)
match($7, /url:(.*)/, url)
match($11, /responsetime:(.*)/, responsetime)
if (domain[1] in exclude_domain) next
ts = timestamp[1] # 请求时间
ip = client_ip[1] # 客户端IP
api = domain[1] url[1] # 接口
rt = responsetime[1] # 响应时长/调用时长
if (ts >= btob && ts <= btoe) { # 2分钟前到1分钟前
bto_count[api]++
bto_rttotal[api] += rt
}
else if (ts >= bonb && ts <= bone) { # 前1分钟内
if (domain[1] == mompost) {
if (url[1] in post_exclude_url) next
if (rt > z_post) {
records3++
log3 = log3 rt "秒 " ts " " ip " " api "\\n"
}
if (ts == twosecago) {
records4_twosecago_post++
}
else if (ts == onesecago) {
records4_onesecago_post++
}
}
else if (domain[1] == mompre) {
if (url[1] in pre_exclude_url) next
if (rt > z_pre) {
records3++
log3 = log3 rt "秒 " ts " " ip " " api "\\n"
}
if (ts == twosecago) {
records4_twosecago_pre++
}
else if (ts == onesecago) {
records4_onesecago_pre++
}
}
# 注释掉第一版不区分前后工序
#if (rt > z) {
# records3++
# log3 = log3 rt "秒 " ts " " ip " " api "\\n"
#}
bon_count[api]++
bon_rttotal[api] += rt
if (ts == onesecago) { # 1秒钟前 onesecago = bone
ts_ip_api = ts " " ip " " api
onesecago_count[ts_ip_api]++
if (onesecago_count[ts_ip_api] == y) records2++
}
}
}
END {
# 打印当前时间(脚本执行开始时间)
if (records1 > 0 || records2 > 0 || records3 > 0 || records4_twosecago_post > 0 || records4_twosecago_pre > 0 || records4_onesecago_post > 0 || records4_onesecago_pre > 0) {
sendMsg("当前时间(脚本执行开始时间):" currtime)
}
# 需求1
for (i in bto_count) {
if (i in bon_count) { # 取交集
bto_rtavg = bto_rttotal[i] / bto_count[i]
bon_rtavg = bon_rttotal[i] / bon_count[i]
if (bto_rtavg > 0 && bon_rtavg > 0.4)
growth_ratio = bon_rtavg / bto_rtavg - 1
else
growth_ratio = 0
growth_pct = growth_ratio * 100
if (growth_pct > x) {
records1++
log1 = log1 bto_rtavg"秒("bto_count[i]"次平均) " bon_rtavg"秒("bon_count[i]"次平均) " "增长" growth_pct "% " i "\\n"
}
}
}
if (records1 > 0) {
content1_1 = "平均时长增长率告警: 2分钟前到1分钟前("btob"到"btoe") 至 前1分钟内("bonb"到"bone")"
content1_2 = " 同个接口平均调用时长增长率超过" x "%,有以下" records1 "条记录:" "\\n"
msg1 = content1_1 content1_2 log1
sendMsg(msg1)
}
# 需求2
if (records2 > 0) {
PROCINFO["sorted_in"] = "@val_num_desc"
for (j in onesecago_count){
if (onesecago_count[j] >= y) {
log2 = log2 onesecago_count[j] "次 " j "\\n"
}
}
content2 = "调用频次告警:1秒钟前("onesecago")同IP同接口调用超过" y "次(包含),有以下" records2 "条记录:" "\\n"
msg2 = content2 log2
sendMsg(msg2)
}
# 需求3
if (records3 > 0) {
content3 = "调用时长告警:前1分钟内("bonb"到"bone")接口调用时长大于" z "秒,有以下" records3 "条记录:" "\\n"
msg3 = content3 log3
sendMsg(msg3)
}
# 需求4
if (records4_twosecago_post > 0 || records4_twosecago_pre > 0 || records4_onesecago_post > 0 || records4_onesecago_pre > 0 ) {
content4 = "接口请求数告警:\\n"
content4_post = "中后工序 2秒前("twosecago" "records4_twosecago_post"次) 1秒前("onesecago" "records4_onesecago_post"次)\\n"
content4_pre = "前工序 2秒前("twosecago" "records4_twosecago_pre"次) 1秒前("onesecago" "records4_onesecago_pre"次)"
msg4 = content4 content4_post content4_pre
sendMsg(msg4)
}
# 分割符
if (records1 > 0 || records2 > 0 || records3 > 0 || records4_twosecago_post > 0 || records4_twosecago_pre > 0 || records4_onesecago_post > 0 || records4_onesecago_pre > 0) {
endtime = getTimeSeconds()
sendMsg("脚本执行结束时间:"endtime"------------------------------")
}
}
|
执行脚本
1分钟执行一次,如果接口被调用越来越多,取最后3万可能还不够,需适当调整。
建议使用方法2,方法1有个已知缺陷,当前循环与下一次循环间隔不是sleep的1分钟,而是1分钟加awk消耗的时间。
1
2
3
4
5
6
7
| #!/bin/bash
echo $$ > /tmp/mom-monitor.pid
while true; do
tail -30000 /var/log/nginx/access.log | awk -f mom.awk
sleep 1m
done
|
1
| nohup ./mom-monitor.sh &> /dev/null &
|
1
| * * * * * tail -30000 /var/log/nginx/access.log | awk -f /var/log/nginx/mom.awk
|
飞书告警结果
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
| 当前时间(脚本执行开始时间):2025-09-10T15:43:01
平均时长增长率告警: 2分钟前到1分钟前(2025-09-10T15:41:01到2025-09-10T15:42:00) 至 前1分钟内(2025-09-10T15:42:01到2025-09-10T15:43:00) 同个接口平均调用时长增长率超过50%,有以下1条记录:
0.201333秒(3次平均) 1.033秒(1次平均) 增长413.079% mom-post.example.com/mes/FittingBinding
调用频次告警:1秒钟前(2025-09-10T15:43:00)同IP同接口调用超过6次(包含),有以下7条记录:
12次 2025-09-10T15:43:00 10.130.75.10 mom-pre.example.com/mes/ProductionGroupInfo
10次 2025-09-10T15:43:00 10.130.14.95 mom-post.example.com/mes/OutStationCheckData
10次 2025-09-10T15:43:00 10.130.15.81 mom-post.example.com/mes/GetSpecifications
8次 2025-09-10T15:43:00 10.130.20.58 mom-post.example.com/mes/MaterialControl
8次 2025-09-10T15:43:00 10.130.15.81 mom-post.example.com/mes/OutStationCheckData
6次 2025-09-10T15:43:00 10.130.20.199 mom-post.example.com/mes/GetTestData
6次 2025-09-10T15:43:00 10.130.14.95 mom-post.example.com/mes/FittingCheckForCell
调用时长告警:前1分钟内(2025-09-10T15:42:01到2025-09-10T15:43:00)接口调用时长大于0.5秒,有以下6条记录:
0.861秒 2025-09-10T15:42:01 10.130.36.71 mom-post.example.com/mes/OutStationCheckData
0.759秒 2025-09-10T15:42:01 10.130.15.164 mom-post.example.com/mes/InStationCheck
0.728秒 2025-09-10T15:42:01 10.130.15.164 mom-post.example.com/mes/InStationCheck
0.835秒 2025-09-10T15:42:01 10.130.14.61 mom-post.example.com/mes/InStationCheck
0.674秒 2025-09-10T15:42:01 10.130.21.71 mom-post.example.com/mes/InStationCheck
0.739秒 2025-09-10T15:42:01 10.130.36.172 mom-post.example.com/mes/InStationCheck
接口请求数告警:
中后工序 2秒前(2025-09-10T15:42:59 90次) 1秒前(2025-09-10T15:43:00 158次)
前工序 2秒前(2025-09-10T15:42:59 76次) 1秒前(2025-09-10T15:43:00 91次)
脚本执行结束时间:2025-09-10T15:43:05------------------------------
|