官方文档:https://prometheus.io/docs/prometheus/latest/installation/
下载:https://prometheus.io/download/
docker pull prom/prometheus:v2.31.1 docker run -it -d --restart=always -p 9090:9090 prom/prometheus:v2.31.1
# apt search prometheus # apt-cache madison prometheus prometheus | 2.15.2+ds-2 | http://mirrors.aliyun.com/ubuntu focal/universe amd64 Packages prometheus | 2.15.2+ds-2 | http://mirrors.aliyun.com/ubuntu focal/universe Sources #apt install prometheus
官方部署文档:https://github.com/prometheus-operator/kube-prometheus
# git clone -b release-0.9 https://github.com/prometheus-operator/kube-prometheus.git #注意版本与k8s版本的对应关系 # cd kube-prometheus/ # kubectl apply -f manifests/setup # kubectl apply -f manifests/ 可以视情况修改manifests和manifests/setup(grep -R image: *.yaml)中yaml文件中的镜像源。
kubectl port-forward --help #在哪个node上执行就访问哪个node,临时暴露端口 kubectl --namespace monitoring port-forward --address 0.0.0.0 svc/prometheus-k8s 9090:9090
# vim manifests/prometheus-service.yaml apiVersion: v1 kind: Service metadata: labels: app.kubernetes.io/component: prometheus app.kubernetes.io/name: prometheus app.kubernetes.io/part-of: kube-prometheus app.kubernetes.io/version: 2.29.1 prometheus: k8s name: prometheus-k8s namespace: monitoring spec: type: NodePort ports: - name: web port: 9090 nodePort: 39090 targetPort: web selector: app: prometheus app.kubernetes.io/component: prometheus app.kubernetes.io/name: prometheus app.kubernetes.io/part-of: kube-prometheus prometheus: k8s sessionAffinity: ClientIP # kubectl apply -f manifests/prometheus-service.yaml service/prometheus-k8s configured
# kubectl --namespace monitoring port-forward --address 0.0.0.0 svc/grafana 3000:3000
# vim manifests/grafana-service.yaml apiVersion: v1 kind: Service metadata: labels: app.kubernetes.io/component: grafana app.kubernetes.io/name: grafana app.kubernetes.io/part-of: kube-prometheus app.kubernetes.io/version: 8.1.1 name: grafana namespace: monitoring spec: type: NodePort ports: - name: http port: 3000 targetPort: http nodePort: 33000 selector: app.kubernetes.io/component: grafana app.kubernetes.io/name: grafana app.kubernetes.io/part-of: kube-prometheus
# mkdir /apps # tar xvf prometheus-2.31.1.linux-amd64.tar.gz # ln -sv /apps/prometheus-2.31.1.linux-amd64 /apps/prometheus '/apps/prometheus' -> '/apps/prometheus-2.31.1.linux-amd64' # cd /apps/prometheus # ll prometheus* #prometheus 服务可执行程序 prometheus.yml #配置文件 promtool* #测试工具,用于检测配置 prometheus 配置文件、检测 metrics 数据等 # ./promtool check config prometheus.yml Checking prometheus.yml SUCCESS: 0 rule files found
# vim /etc/systemd/system/prometheus.service [Unit] Description=Prometheus Server Documentation=https://prometheus.io/docs/introduction/overview/ After=network.target [Service] Restart=on-failure WorkingDirectory=/apps/prometheus/ ExecStart=/apps/prometheus/prometheus --config.file=/apps/prometheus/prometheus.yml [Install] WantedBy=multi-user.target
# systemctl daemon-reload && systemctl restart prometheus && systemctl enable prometheus
https://prometheus.io/download/
# wget https://github.com/prometheus/node_exporter/releases/download/v1.2.2/node_exporter-1.2.2.linux-amd64.tar.gz # mkdir /apps # tar xf node_exporter-1.2.2.linux-amd64.tar.gz -C /apps # ln -sv /apps/node_exporter-1.2.2.linux-amd64 /apps/node_exporter
# vim /etc/systemd/system/node-exporter.service [Unit] Description=Prometheus Node Exporter After=network.target [Service] ExecStart=/apps/node_exporter/node_exporter --web.listen-address=":9110" [Install] WantedBy=multi-user.target
# systemctl daemon-reload && systemctl restart node-exporter && systemctl enable node-exporter.service
配置 Prometheus 通过 node exporter 采集 node 节点的监控指标数据。
# vim /apps/prometheus/prometheus.yml # my global config global: scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.数据收集间隔时间,如果不配置默认为一分钟 evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.规 则扫描间隔时间,如果不配置默认为一分钟 # scrape_timeout is set to the global default (10s).超时时间 # Alertmanager configuration 报警通知配置 alerting: alertmanagers: - static_configs: - targets: # - alertmanager:9093 # Load rules once and periodically evaluate them according to the global 'evaluation_interval'. rule_files: #规则配置 # - "first_rules.yml" # - "second_rules.yml" # A scrape configuration containing exactly one endpoint to scrape: # Here it's Prometheus itself. scrape_configs: #数据采集目标配置 # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config. - job_name: "prometheus" # metrics_path defaults to '/metrics' # scheme defaults to 'http'. static_configs: - targets: ["localhost:9090"] - job_name: 'promethues-node' static_configs: - targets: ['172.16.244.111:9110','172.16.244.112:9110']
# systemctl restart prometheus.service
https://prometheus.io/download/#blackbox_exporter
blackbox_exporter 是 Prometheus 官方提供的一个 exporter,可以通过 HTTP, HTTPS, DNS, TCP 和 ICMP 对被监控节点进行监控和数据采集。
- HTTP/HTTPS:URL/API 可用性检测
# wget https://github.com/prometheus/blackbox_exporter/releases/download/v0.19.0/blackbox_exporter-0.19.0.linux-amd64.tar.gz # tar xf /root/blackbox_exporter-0.19.0.linux-amd64.tar.gz -C /apps # ln -sv /apps/blackbox_exporter-0.19.0.linux-amd64 /apps/blackbox_exporter
# cat blackbox.yml #一般不用动,因为blackbox是通过prometheus server端来配置对哪个服务进行什么样的监控,server端来发送监控请求,client端将监控数据收集到后推送给server端,因此client端的配置模块只是定义了一些启动的模块。 modules: http_2xx: prober: http http_post_2xx: prober: http http: method: POST tcp_connect: prober: tcp pop3s_banner: prober: tcp tcp: query_response: - expect: "^+OK" tls: true tls_config: insecure_skip_verify: false ssh_banner: prober: tcp tcp: query_response: - expect: "^SSH-2.0-" - send: "SSH-2.0-blackbox-ssh-check" irc_banner: prober: tcp tcp: query_response: - send: "NICK prober" - send: "USER prober prober prober :prober" - expect: "PING :([^ ]+)" send: "PONG ${1}" - expect: "^:[^ ]+ 001" icmp: prober: icmp
# vim /etc/systemd/system/blackbox-exporter.service [Unit] Description=Prometheus Blackbox Exporter After=network.target [Service] Type=simple User=root Group=root ExecStart=/apps/blackbox_exporter/blackbox_exporter \ --config.file=/apps/blackbox_exporter/blackbox.yml \ --web.listen-address=:9115 Restart=on-failure [Install] WantedBy=multi-user.target
systemctl daemon-reload && systemctl restart blackbox-exporter.service && systemctl enable blackbox-exporter.service
# vim /apps/prometheus/prometheus.yml # 网站监控 - job_name: 'http_status' metrics_path: /probe params: module: [http_2xx] static_configs: - targets: ['http://www.xiaomi.com', 'http://www.baidu.com'] labels: instance: http_status group: web relabel_configs: - source_labels: [__address__] #relabel 通 过 将 __address__( 当 前 目 标 地 址 ) 写 入__param_target 标签来创建一个 label。 target_label: __param_target #监控目标 www.xiaomi.com,作为__address__的 value - source_labels: [__param_target] #监控目标 target_label: url #将监控目标与 url 创建一个 label - target_label: __address__ replacement: 172.16.244.111:9115
# /apps/prometheus/promtool check config /apps/prometheus/prometheus.yml Checking /apps/prometheus/prometheus.yml SUCCESS: 0 rule files found # systemctl restart prometheus.service
# vim /apps/prometheus/prometheus.yml # icmp 检测 - job_name: 'ping_status' metrics_path: /probe params: module: [icmp] static_configs: - targets: ['172.31.0.2',"223.6.6.6"] labels: instance: 'ping_status' group: 'icmp' relabel_configs: - source_labels: [__address__] target_label: __param_target - source_labels: [__param_target] target_label: ip #将 ip 与__param_target 创建一个 label - target_label: __address__ replacement: 172.16.244.111:9115
# /apps/prometheus/promtool check config /apps/prometheus/prometheus.yml Checking /apps/prometheus/prometheus.yml SUCCESS: 0 rule files found # systemctl restart prometheus.service
# vim /apps/prometheus/prometheus.yml # 端口监控 - job_name: 'port_status' metrics_path: /probe params: module: [tcp_connect] static_configs: - targets: ['172.16.244.101:9100','172.16.244.132:80','172.16.244.202:6443'] labels: instance: 'port_status' group: 'port' relabel_configs: - source_labels: [__address__] target_label: __param_target - source_labels: [__param_target] target_label: ip - target_label: __address__ replacement: 172.16.244.111:9115
# /apps/prometheus/promtool check config /apps/prometheus/prometheus.yml Checking /apps/prometheus/prometheus.yml SUCCESS: 0 rule files found # systemctl restart prometheus.service
https://grafana.com/docs/
grafana 是一个开源的可视化工具,可以调用 prometheus、mysql 等数据源进行更绚丽的前端可视化。
# sudo apt-get install -y adduser libfontconfig1 # dpkg -i grafana-enterprise_7.5.11_amd64.deb 配置文件: # vim /etc/grafana/grafana.ini [server] # Protocol (http, https, socket) protocol = http # The ip address to bind to, empty will bind to all interfaces http_addr = 0.0.0.0 # The http port to use http_port = 3000 # systemctl restart grafana-server # systemctl enable grafana-server
node exporter 模板(ID:8919)
blackbox exporter 模板(ID:9719)
Prometheus 提供一个函数式的表达式语言 PromQL (Prometheus Query Language),可以使用户实时地查找和聚合时间序列数据,表达式计算结果可以在图表中展示,也可以在 Prometheus表达式浏览器中以表格形式展示,或者作为数据源,以HTTP API的方式提供给 外部系统使用。
node_memory_MemTotal_bytes #查询 node 节点总内存大小
node_memory_MemFree_bytes #查询 node 节点剩余可用内存
node_memory_MemTotal_bytes{instance="172.31.7.111:9100"} #查询指定节点的总内存
node_memory_MemFree_bytes{instance="172.31.7.111:9100"} #查询指定节点的可用内存
node_disk_io_time_seconds_total{device="sda"} #查询指定磁盘累计花费在 I/O 上的时间(秒,计数器类型,可配合 rate() 计算每秒磁盘 io)
node_filesystem_free_bytes{device="/dev/sda1",fstype="xfs",mountpoint="/"} #查看指定磁盘的磁盘剩余空间
node_load1 0.1 #CPU 1min 负载
- 瞬时向量 (instant vector):是一组时间序列,每个时间序列包含单个数据样本,比如node_memory_MemTotal_bytes 查询当前总内存就是一个瞬时向量,该表达式的返回值中只会包含该时间序列中的最新的一个样本值,而相应的这样的表达式称之为瞬时向量表达式。
- 范围向量(range vector):是指在任何一个时间范围内,抓取的所有度量指标数据.比如最近一天的网卡流量趋势图。
- 标量(scalar):是一个浮点数类型的数据值,使用 node_load1 获取到的是一个瞬时向量,但是可以使用内置函数 scalar()将瞬时向量转换为标量。
- 字符串(string):字符串类型的数据,目前使用较少
- = :选择与提供的字符串完全相同的标签。
- != :选择与提供的字符串不相同的标签。
- =~ :选择正则表达式与提供的字符串(或子字符串)相匹配的标签。
- !~ :选择正则表达式与提供的字符串(或子字符串)不匹配的标签。
#查询格式:metric_name{label_name="label_value", ...}
node_load1{instance="172.16.244.100:9100"}
node_load1{job="promethues-node"}
node_load1{job="promethues-node",instance="172.16.244.100:9100"}
node_load1{job="promethues-node",instance!="172.16.244.100:9100"}
s-秒
m - 分钟
h - 小时
d-天
w-周
y-年
node_memory_MemTotal_bytes{} # 瞬时向量表达式,选择当前最新的数据
node_memory_MemTotal_bytes{}[5m] # 区间向量表达式,选择以当前时间为基准,5 分钟内的数据
node_memory_MemTotal_bytes{instance="172.31.7.111:9100"}[5m]
+ 加法
- 减法
* 乘法
/ 除法
% 模(取余)
^ 幂(求幂运算)
node_memory_MemFree_bytes/1024/1024 #将内存进行单位转换
node_disk_read_bytes_total{device="sda"} + node_disk_written_bytes_total{device="sda"} #计算指定磁盘的读写数据总量(计数器类型,可配合 rate() 计算每秒读写速率)
sum (求和)
min (最小值)
max (最大值)
avg (平均值)
stddev (标准差)
stdvar (标准方差/方差)
count (计数)
count_values (对 value 进行计数)
bottomk (样本值最小的 k 个元素)
topk (样本值最大的 k 个元素)
quantile (分布统计)
max(node_memory_MemFree_bytes) #某个指标数据的最大值
sum(http_requests_total) #计算 http_requests_total 最近的请求总量
cadvisor 由谷歌开源,cadvisor 不仅可以搜集一台机器上所有运行的容器信息,还提供基础查询界面和 http 接口,方便其他组件如 Prometheus 进行数据抓取,cAdvisor 可以对节点机器上的资源及容器进行实时监控和性能数据采集,包括 CPU 使用情况、内存使用情况、网络吞吐量及文件系统使用情况。
k8s 1.12 之前 cadvisor 集成在 node 节点上的 kubelet 服务中,从 1.12 版本开始分离为两个组件,因此需要在 node 节点单独部署 cadvisor
https://github.com/google/cadvisor
# docker load -i cadvisor-v0.39.2.tar.gz # docker tag gcr.io/cadvisor/cadvisor:v0.39.2 harbor.k8s.local/k8s/cadvisor:v0.39.2 # docker push harbor.k8s.local/k8s/cadvisor:v0.39.2
docker run -it -d \ --restart=always \ --volume=/:/rootfs:ro \ --volume=/var/run:/var/run:ro \ --volume=/sys:/sys:ro \ --volume=/var/lib/docker/:/var/lib/docker:ro \ --volume=/dev/disk/:/dev/disk:ro \ --publish=8080:8080 \ --detach=true \ --name=cadvisor \ --privileged \ --device=/dev/kmsg \ harbor.k8s.local/k8s/cadvisor:v0.39.2
官方文档:https://github.com/google/cadvisor/tree/master/deploy/kubernetes
# vim /apps/prometheus/prometheus.yml - job_name: 'prometheus-containers' static_configs: - targets: ["172.16.244.111:8080","172.16.244.112:8080","172.16.244.113:8080"]
容器模板 ID:395、893
prometheus—>触发阈值—>超出持续时间—>alertmanager—>分组|抑制|静默—>媒体类型 —>邮件|钉钉|微信等。
- 分组(group): 将类似性质的警报合并为单个通知,比如网络通知、主机通知、服务通知。
- 静默(silences): 是一种简单的特定时间静音的机制,例如:服务器要升级维护可以先设置这个时间段告警静默。
- 抑制(inhibition): 当警报发出后,停止重复发送由此警报引发的其他警报即合并一个故障引起的多个报警事件,可以消除冗余告警
# tar xf alertmanager-0.23.0.linux-amd64.tar.gz -C /apps # ln -sv /apps/alertmanager-0.23.0.linux-amd64 /apps/alertmanager # vim /etc/systemd/system/alertmanager.service [Unit] Description=Prometheus Server Documentation=https://prometheus.io/docs/introduction/overview/ After=network.target [Service] Restart=on-failure WorkingDirectory=/apps/alertmanager ExecStart=/apps/alertmanager/alertmanager [Install] WantedBy=multi-user.target
官方配置文档:https://prometheus.io/docs/alerting/configuration/
global: smtp_from: #发件人邮箱地址 smtp_smarthost: #邮箱 smtp 地址。 smtp_auth_username: #发件人的登陆用户名,默认和发件人地址一致。 smtp_auth_password: #发件人的登陆密码,有时候是授权码。 smtp_require_tls: #是否需要 tls 协议。默认是 true。 wechat_api_url: #企业微信 API 地址。 wechat_api_secret: #企业微信 API secret wechat_api_corp_id: #企业微信 corp id 信息。 resolve_timeout: #在指定时间内没有产生新的事件就发送恢复通知
# pwd /apps/alertmanager # cat alertmanager.yml global: resolve_timeout: 5m #在指定时间内没有产生新的事件就发送恢复通知 smtp_smarthost: 'smtp.126.com:465' smtp_from: 'xiaoyizi@126.com' smtp_auth_username: 'xiaoyizi@126.com' smtp_auth_password: 'TJNTDNDFLAKXOFM' smtp_hello: '@126.com' smtp_require_tls: false route: #route 用来设置报警的分发策略 group_by: ['alertname'] #采用哪个标签来作为分组依据 group_wait: 10s #组告警等待时间。也就是告警产生后等待 10s,如果有同组告警一起发出 group_interval: 2s #两组告警的间隔时间 repeat_interval: 2m #重复告警的间隔时间,减少相同邮件的发送频率 receiver: 'web.hook' #设置接收人 receivers: - name: 'web.hook' #webhook_configs: #- url: 'http://127.0.0.1:5001/' email_configs: - to: 'xiaoyizi@126.com' inhibit_rules: #抑制的规则 - source_match: #源匹配级别,当匹配成功发出通知,但是其他的通知将被抑制 severity: 'critical' target_match: severity: 'warning' equal: ['alertname', 'dev', 'instance']
# systemctl daemon-reload && systemctl restart alertmanager && systemctl enable alertmanager # lsof -i:9093 COMMAND PID USER FD TYPE DEVICE SIZE/OFF NODE NAME alertmana 39441 root 8u IPv6 232407 0t0 TCP *:9093 (LISTEN) #命令行查看目前是否有告警产生 root@deploy:/apps/alertmanager# ./amtool alert --alertmanager.url=http://172.16.244.100:9093 Alertname Starts At Summary State
global: scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute. evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute. # scrape_timeout is set to the global default (10s). # Alertmanager configuration alerting: alertmanagers: - static_configs: - targets: - 172.16.244.100:9093 # Load rules once and periodically evaluate them according to the global 'evaluation_interval'. rule_files: # - "first_rules.yml" # - "second_rules.yml" - "rules.yml"
# vim /apps/prometheus/rules.yml groups: - name: alertmanager_pod.rules rules: - alert: Pod_all_cpu_usage expr: (sum by(name)(rate(container_cpu_usage_seconds_total{image!=""}[5m]))*100) > 1 for: 2m labels: severity: critical service: pods annotations: description: 容器 {{ $labels.name }} CPU 资源利用率大于 10% , (current value is {{ $value }}) summary: Dev CPU 负载告警 - alert: Pod_all_memory_usage expr: sort_desc(avg by(name)(irate(container_memory_usage_bytes{name!=""}[5m]))*100) > 10 #内存大于 10% #expr: sort_desc(avg by(name)(irate(node_memory_MemFree_bytes {name!=""}[5m]))) > 2 #内存大于 2G for: 2m labels: severity: critical annotations: description: 容器 {{ $labels.name }} Memory 资源利用率大于 2G , (current value is {{ $value }}) summary: Dev Memory 负载告警 - alert: Pod_all_network_receive_usage expr: sum by(name)(irate(container_network_receive_bytes_total{container_name="POD"}[1m])) > 1 for: 2m labels: severity: critical annotations: description: 容器 {{ $labels.name }} network_receive 资源利用率大于 50M ,(current value is {{ $value }}) - alert: node 内存可用大小 expr: node_memory_MemFree_bytes > 1 #写错,做测试 for: 2m labels: severity: critical annotations: description: 容器可用内存小于 100k
root@deploy:/apps/prometheus# ./promtool check rules rules.yml Checking rules.yml SUCCESS: 4 rules found
# systemctl restart prometheus