This time, I will use Prometheus to track how my servers are doing. I will write alert rules for Prometheus to check that my servers are still exporting metric data.
Let's create a project
mkdir prometheus-monitor
cd prometheus-monitor
Then create a rule for prometheus
mkdir prometheus
touch prometheus/alert.rules.yml
groups:
  - name: AllInstance
    rules:
      # Fires when Prometheus has been unable to scrape a target
      # (the built-in `up` metric is 0) for 10 consecutive seconds.
      - alert: InstanceDown
        expr: up == 0
        for: 10s
        labels:
          severity: critical
        annotations:
          summary: "Server Unavailable"
          description: "Server -> {{ $labels.job }} - {{ $labels.instance }}"
The rule named InstanceDown is triggered when Prometheus can't fetch metric data from a server (condition: up == 0), and it puts the job name and the instance name into the annotation's description.
Now, write the prometheus config files
touch prometheus/prometheus.yml
global:
  scrape_interval: 1m   # how often targets are scraped
  scrape_timeout: 5s    # per-scrape timeout

# Alerting rules, path is relative to this file inside the container.
rule_files:
  - alert.rules.yml

# Where to push fired alerts (the docker-compose service name resolves
# inside the compose network).
alerting:
  alertmanagers:
    - static_configs:
        - targets: ["alertmanager:9093"]

scrape_configs:
  - job_name: myjob
    metrics_path: /metrics
    scheme: http
    follow_redirects: true
    static_configs:
      - targets:
          - "172.18.117.234:8081"
          - "172.18.117.234:8082"
Since I use Alertmanager to send the alert notifications, let's configure it next.
mkdir alertmanager
touch alertmanager/alertmanager.yml
global:
route:
  # The special value '...' (quoted) groups by all labels, i.e. it
  # effectively disables grouping so every alert is delivered.
  group_by: ['...']
  repeat_interval: 3h
  receiver: webhook_issues
receivers:
  - name: webhook_issues
    webhook_configs:
      - url: http://172.18.117.234:8080/webhook
In the two configs above, 172.18.117.234 is my machine's IP address; you will need to change it to your own server's IP address.
And I will run prometheus on Docker, then I will create a docker-compose file
touch docker-compose.yml
version: '3'
services:
  alertmanager:
    image: prom/alertmanager
    ports:
      - 9093:9093
    volumes:
      # Mount the config dir so alertmanager.yml is picked up.
      - ./alertmanager:/etc/alertmanager
  prometheus:
    image: prom/prometheus
    ports:
      - 9090:9090
    volumes:
      # Mount the config dir so prometheus.yml and alert.rules.yml are picked up.
      - ./prometheus:/etc/prometheus
That's it for the configs. Next, let's build a server to test
go mod init goprom
touch main.go
package main
import (
"context"
"fmt"
"log"
"math/rand"
"os"
"sync"
"time"
"github.com/gin-gonic/gin"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/exporters/prometheus"
"go.opentelemetry.io/otel/metric/global"
"go.opentelemetry.io/otel/metric/instrument"
"go.opentelemetry.io/otel/sdk/metric/aggregator/histogram"
controller "go.opentelemetry.io/otel/sdk/metric/controller/basic"
"go.opentelemetry.io/otel/sdk/metric/export/aggregation"
processor "go.opentelemetry.io/otel/sdk/metric/processor/basic"
selector "go.opentelemetry.io/otel/sdk/metric/selector/simple"
)
var (
// lemonsKey is declared but never referenced anywhere in this file.
// NOTE(review): consider removing it, or using it as a metric attribute.
lemonsKey = attribute.Key("ex.com/lemons")
)
// initMeterServer wires an OpenTelemetry Prometheus exporter into the
// global meter provider and serves its /metrics endpoint over HTTP on
// the port given by the PORT environment variable.
func initMeterServer() {
	config := prometheus.Config{
		// Bucket boundaries (in seconds) used for histogram instruments.
		DefaultHistogramBoundaries: []float64{0.1, 0.2, 0.5, 1, 2},
	}
	c := controller.New(
		processor.NewFactory(
			selector.NewWithHistogramDistribution(
				histogram.WithExplicitBoundaries(config.DefaultHistogramBoundaries),
			),
			// Prometheus scrapes cumulative values.
			aggregation.CumulativeTemporalitySelector(),
			// Keep state between collections so instruments that are not
			// updated every cycle are still exported.
			processor.WithMemory(true),
		),
	)
	exporter, err := prometheus.New(config, c)
	if err != nil {
		log.Panicf("failed to initialize prometheus exporter %v", err)
	}
	global.SetMeterProvider(exporter.MeterProvider())

	r := gin.Default()
	r.GET("/metrics", gin.WrapH(exporter))

	port := os.Getenv("PORT")
	go func() {
		// Log instead of silently dropping the server error.
		if err := r.Run(":" + port); err != nil {
			log.Println("metrics server stopped:", err)
		}
	}()
	// BUG FIX: the original always printed ":8080" even though the
	// server listens on $PORT (8081/8082 in this tutorial).
	fmt.Println("Prometheus metrics server running on :" + port)
}
// main starts the metrics endpoint, then continuously produces a random
// sample value that is exported both as a gauge (last value, via an
// async observer) and as a histogram (distribution of all values).
func main() {
	initMeterServer()

	meter := global.Meter("")
	observerLock := new(sync.RWMutex)
	var value float64

	// BUG FIX: the original used the key "A" twice; duplicate attribute
	// keys are deduplicated (last one wins), so the first label was
	// silently dropped. Use two distinct keys.
	commonAttrs := []attribute.KeyValue{attribute.String("A", "1"), attribute.String("B", "2")}

	gaugeObserver, err := meter.AsyncFloat64().Gauge("http_request_sample")
	if err != nil {
		log.Panicf("failed to initialize instrument: %v", err)
	}
	_ = meter.RegisterCallback([]instrument.Asynchronous{gaugeObserver}, func(ctx context.Context) {
		// Snapshot the latest value under the read lock.
		observerLock.RLock()
		v := value
		observerLock.RUnlock()
		gaugeObserver.Observe(ctx, v, commonAttrs...)
	})

	// Renamed from "histogram" to stop shadowing the imported histogram
	// aggregator package.
	durationHist, err := meter.SyncFloat64().Histogram("http_request_duration_seconds")
	if err != nil {
		log.Panicf("failed to initialize instrument: %v", err)
	}

	ctx := context.Background()
	for {
		observerLock.Lock()
		value = rand.Float64() * 3
		durationHist.Record(ctx, value, commonAttrs...)
		observerLock.Unlock()
		time.Sleep(1 * time.Second)
	}
}
We will use this to export data for prometheus
Next, write a webhook server
mkdir webhook
touch webhook/main.go
package main
import (
"github.com/gin-gonic/gin"
)
// main starts a gin HTTP server on :8080 that accepts Alertmanager
// webhook notifications on POST /webhook and relays them to Discord.
func main() {
	router := gin.Default()
	router.POST("/webhook", discordhook)
	router.Run(":8080")
}
package main
import (
"bytes"
"encoding/json"
"fmt"
"log"
"net/http"
"os"
"strings"
"github.com/gin-gonic/gin"
)
//ref: https://prometheus.io/docs/alerting/latest/configuration/#webhook_config
//ref: https://prometheus.io/docs/alerting/latest/configuration/#webhook_config
// AlertManagerWebhook mirrors the JSON payload Alertmanager POSTs to a
// webhook receiver.
type AlertManagerWebhook struct {
Version string `json:"version"`
GroupKey string `json:"groupKey"`
// Number of alerts dropped due to the receiver's max_alerts limit.
TruncatedAlerts int `json:"truncatedAlerts"`
// Either "firing" or "resolved".
Status string `json:"status"`
Receiver string `json:"receiver"`
GroupLabels map[string]string `json:"groupLabels"`
CommonLabels map[string]string `json:"commonLabels"`
CommonAnnotations map[string]string `json:"commonAnnotations"`
ExternalURL string `json:"externalURL"`
Alerts []struct {
Status string `json:"status"`
// NOTE(review): field is named Label but maps the "labels" key; the
// JSON tag makes it work, but Labels would be the clearer name.
Label map[string]string `json:"labels"`
Annotations map[string]string `json:"annotations"`
StartsAt string `json:"startsAt"`
EndsAt string `json:"endsAt"`
GeneratorURL string `json:"generatorURL"`
Fingerprint string `json:"fingerprint"`
} `json:"alerts"`
}
//ref: https://discord.com/developers/docs/resources/webhook#execute-webhook
type (
// DiscordEmbedField is one name/value field inside a Discord embed.
DiscordEmbedField struct {
Name string `json:"name"`
Value string `json:"value"`
Inline bool `json:"inline"`
}
// DiscordEmbed is a rich-content card attached to a Discord message.
DiscordEmbed struct {
Title string `json:"title"`
Description string `json:"description"`
Fields []DiscordEmbedField `json:"fields"`
}
// DiscordHook is the body of a Discord "execute webhook" request.
DiscordHook struct {
Content string `json:"content"`
Embeds []DiscordEmbed `json:"embeds"`
}
)
// discordhook handles an Alertmanager webhook notification and forwards
// a one-embed summary of it to the Discord webhook named by the
// WEBHOOK_URL environment variable.
func discordhook(ctx *gin.Context) {
	info := &AlertManagerWebhook{}
	// BUG FIX: the decode error was ignored; a malformed payload now
	// gets a 400 instead of forwarding an empty alert.
	if err := json.NewDecoder(ctx.Request.Body).Decode(info); err != nil {
		log.Println("decoding alertmanager payload:", err)
		ctx.Status(http.StatusBadRequest)
		return
	}
	fmt.Printf("%+v\n", info)

	hook := DiscordHook{
		Content: "=== Alert ===",
		Embeds: []DiscordEmbed{
			{
				Title:       fmt.Sprintf("[%s] %s", strings.ToUpper(info.Status), info.CommonAnnotations["summary"]),
				Description: info.CommonAnnotations["description"],
			},
		},
	}

	var buf bytes.Buffer
	if err := json.NewEncoder(&buf).Encode(hook); err != nil {
		log.Println("encoding discord payload:", err)
		return
	}
	resp, err := http.Post(os.Getenv("WEBHOOK_URL"), "application/json", &buf)
	if err != nil {
		log.Println(err)
		return
	}
	// BUG FIX: the response body was leaked; close it so the transport
	// can reuse the connection, and surface non-2xx replies.
	defer resp.Body.Close()
	if resp.StatusCode >= 300 {
		log.Println("discord webhook returned status", resp.Status)
	}
}
To run all of this
# start the webhook
WEBHOOK_URL=<your-webhook-url> go run webhook/main.go
# open two terminals and run these two commands separately for the metric data server
PORT=8081 go run .
PORT=8082 go run .
Now you can visit the metric data at localhost:8081/metrics and localhost:8082/metrics. Check the prometheus server at localhost:9090 to see how it is working.
Next, try to stop the servers (except the webhook server). Prometheus will fail to collect metric data and the alert rules will be triggered. Once everything meets the requirements, it will send data to your webhook server and you will see the webhook send a message in your discord channel.
If you start the servers again, Prometheus will send the alert one more time, but its status will be resolved instead of firing, which means the problem is fixed.
sources