说明
不对节点功能以及架构进行说明
未把Remote Storage隔离出来,后续优化
采用Ansible部署,可参考之前文章
部分配置采用默认,做案例时做修改
各节点需提前做好时间同步
一. 初始环境
主机名 | IP | 部署服务 |
---|---|---|
vm1 | 192.168.0.107 | consul_1.2.4、alertmanager-0.22.2、node_exporter-1.1.2、prometheus-2.27.1、自定义metrics |
vm2 | 192.168.0.106 | consul_1.2.4、node_exporter-1.1.2、自定义metrics |
vm3 | 192.168.0.108 | consul_1.2.4、node_exporter-1.1.2、自定义metrics |
二. 实现拓扑图
三. node_exporter
解压即可启动
日志路径还需分析,默认输出到 /var/log/messages
[Service]
User=root
Group=root
ExecStart=/opt/app/node_exporter/node_exporter --web.listen-address=:9321 --log.level=debug
[Install]
WantedBy=multi-user.target
[Unit]
Description=node_exporter
After=network.target
四. Consul
4.1 consul配置
ip等参数自行修改;下文中的 // 注释仅作说明,JSON 不支持注释,实际写入配置文件前需删除
{
"bind_addr": "192.168.0.107",
"client_addr": "0.0.0.0",
"datacenter": "wtc-consul",
"data_dir": "/opt/app/consul/data",
"log_level": "INFO",
"log_file": "/opt/app/consul/logs/consul.log",
"log_rotate_duration": "24h",
"enable_syslog": false,
"enable_debug": true,
"node_name": "consul-vm1",
"server": true,
"ui": true,
"bootstrap_expect": 3, // 3节点,leader 随机选取
"leave_on_terminate": false,
"skip_leave_on_interrupt": true,
"encrypt_verify_incoming":false,
"encrypt_verify_outgoing":false,
"rejoin_after_leave": true,
"retry_join": [ // 不需要写本机IP
"192.168.0.106",
"192.168.0.108"
],
"ports": {
"http": 8500, // web listen and client register address
"dns": 8600,
"serf_lan":8301,
"serf_wan":8302,
"server":8300
}
}
4.2 consul_agent.json配置
注意:services 数组中最后一个元素的末尾不要携带逗号。services 可同时注册多个节点,新增 node_exporter 时在数组中追加对应内容即可
{
"services":[
{
"Id": "node_exporter_vm1",
"Name": "vm1",
"Tags": [
"node_exporter",
"vmware_vm1"
],
"Address": "192.168.0.107",
"Port": 9321,
"Meta": {
"service": "node_exporter",
"use": "monitor_env",
"idc": "beijing"
},
"Check": [{
"HTTP": "http://192.168.0.107:9321/metrics",
"Interval": "10s",
"timeout": "5s"
}]
}
]
}
4.3 consul 启动配置
[Unit]
Description="consul-service"
Requires=network-online.target
After=network-online.target
[Service]
User=root
Group=root
ExecStart=/usr/local/bin/consul agent -config-dir=/opt/app/consul/config
# 热加载配置
ExecReload=/usr/local/bin/consul reload
KillMode=process
Restart=on-failure
LimitNOFILE=65536
[Install]
WantedBy=multi-user.target
EOF
http://consul_ip:8500
五. Alertmanager
默认配置
数据存储目录需额外创建
[Unit]
Description=Alertmanager
After=network.target
[Service]
Type=simple
User=prometheus
ExecStart=/opt/app/alertmanager/alertmanager --config.file=/opt/app/alertmanager/alertmanager.yml --storage.path=/opt/app/alertmanager/data
Restart=on-failure
[Install]
WantedBy=multi-user.target
http://alertmanager_ip:9093
六. Prometheus
默认配置
[Unit]
Description=Prometheus Server
Documentation=https://prometheus.io/docs/introduction/overview/
After=network.target
[Service]
User=root
Group=root
Type=simple
Restart=on-failure
WorkingDirectory=/opt/app/prometheus
ExecStart=/opt/app/prometheus/prometheus --config.file=/opt/app/prometheus/prometheus.yml --log.level=info --storage.tsdb.retention=1500d
[Install]
WantedBy=multi-user.target
http://prometheus_ip:9090