Browse Source

devenv: grafana high availability (ha) test setup

Marcus Efraimsson 7 years ago
parent
commit
f6b8d3a1c2

+ 1 - 0
devenv/docker/ha_test/.gitignore

@@ -0,0 +1 @@
+grafana/provisioning/dashboards/alerts/alert-*

+ 137 - 0
devenv/docker/ha_test/README.md

@@ -0,0 +1,137 @@
+# Grafana High Availability (HA) test setup
+
+A set of Docker Compose services which together create a Grafana HA test setup, with the capability of
+easily scaling the number of Grafana instances up or down.
+
+Included services
+
+* Grafana
+* Mysql - Grafana configuration database and session storage
+* Prometheus - Monitoring of Grafana and used as datasource of provisioned alert rules
+* Nginx - Reverse proxy for Grafana and Prometheus. Enables browsing Grafana/Prometheus UI using a hostname
+
+## Prerequisites
+
+### Build grafana docker container
+
+Build a Grafana docker container from the current branch and commit, and tag it as `grafana/grafana:dev`.
+
+```bash
+$ cd <grafana repo>
+$ make build-docker-full
+```
+
+### Virtual host names
+
+#### Alternative 1 - Use dnsmasq
+
+```bash
+$ sudo apt-get install dnsmasq
+$ echo 'address=/loc/127.0.0.1' | sudo tee /etc/dnsmasq.d/dnsmasq-loc.conf > /dev/null
+$ sudo /etc/init.d/dnsmasq restart
+$ ping whatever.loc
+PING whatever.loc (127.0.0.1) 56(84) bytes of data.
+64 bytes from localhost (127.0.0.1): icmp_seq=1 ttl=64 time=0.076 ms
+--- whatever.loc ping statistics ---
+1 packet transmitted, 1 received, 0% packet loss, time 1998ms
+```
+
+#### Alternative 2 - Manually update /etc/hosts
+
+Update your `/etc/hosts` to be able to access Grafana and/or Prometheus UI using a hostname.
+
+```bash
+$ cat /etc/hosts
+127.0.0.1       grafana.loc
+127.0.0.1       prometheus.loc
+```
+
+## Start services
+
+```bash
+$ docker-compose up -d
+```
+
+Browse
+* http://grafana.loc/
+* http://prometheus.loc/
+
+Check for any errors
+
+```bash
+$ docker-compose logs | grep error
+```
+
+### Scale Grafana instances up/down
+
+Scale number of Grafana instances to `<instances>`
+
+```bash
+$ docker-compose up --scale grafana=<instances> -d
+# for example 3 instances
+$ docker-compose up --scale grafana=3 -d
+```
+
+## Test alerting
+
+### Create notification channels
+
+Creates the default notification channels if they do not already exist.
+
+```bash
+$ ./alerts.sh setup
+```
+
+### Slack notifications
+
+Disable
+
+```bash
+$ ./alerts.sh slack -d
+```
+
+Enable and configure url
+
+```bash
+$ ./alerts.sh slack -u https://hooks.slack.com/services/...
+```
+
+Enable, configure url and enable reminders
+
+```bash
+$ ./alerts.sh slack -u https://hooks.slack.com/services/... -r -e 10m
+```
+
+### Provision alert dashboards with alert rules
+
+Provision 1 dashboard/alert rule (default)
+
+```bash
+$ ./alerts.sh provision
+```
+
+Provision 10 dashboards/alert rules
+
+```bash
+$ ./alerts.sh provision -a 10
+```
+
+Provision 10 dashboards/alert rules and change condition to `gt > 100`
+
+```bash
+$ ./alerts.sh provision -a 10 -c 100
+```
+
+### Pause/unpause all alert rules
+
+Pause
+
+```bash
+$ ./alerts.sh pause
+```
+
+Unpause
+
+```bash
+$ ./alerts.sh unpause
+```

+ 156 - 0
devenv/docker/ha_test/alerts.sh

@@ -0,0 +1,156 @@
+#!/bin/bash
+
+# Ensures the jsonnet command is available before dashboards are generated.
+# Outputs: an install hint on stderr when jsonnet cannot be found.
+# Returns: 0 when jsonnet is found; otherwise exits the script with status 1.
+requiresJsonnet() {
+  # command -v with stderr silenced keeps the output limited to our own hint
+  # (a failing `type` prints its own "not found" error as well).
+  if ! command -v jsonnet > /dev/null 2>&1; then
+    echo "you need to install jsonnet to run this script" >&2
+    echo "follow the instructions on https://github.com/google/jsonnet" >&2
+    exit 1
+  fi
+}
+
+setup() {
+	STATUS=$(curl -s -o /dev/null -w '%{http_code}' http://admin:admin@grafana.loc/api/alert-notifications/1)
+  if [ $STATUS -eq 200 ]; then
+    echo "Email already exists, skipping..."
+  else
+		curl -H "Content-Type: application/json" \
+		-d '{
+			"name": "Email",
+			"type":  "email",
+			"isDefault": false,
+			"sendReminder": false,
+			"uploadImage": true,
+			"settings": {
+				"addresses": "user@test.com"
+			}
+		}' \
+		http://admin:admin@grafana.loc/api/alert-notifications
+  fi
+
+	STATUS=$(curl -s -o /dev/null -w '%{http_code}' http://admin:admin@grafana.loc/api/alert-notifications/2)
+  if [ $STATUS -eq 200 ]; then
+    echo "Slack already exists, skipping..."
+  else
+		curl -H "Content-Type: application/json" \
+		-d '{
+			"name": "Slack",
+			"type":  "slack",
+			"isDefault": false,
+			"sendReminder": false,
+			"uploadImage": true
+		}' \
+		http://admin:admin@grafana.loc/api/alert-notifications
+  fi
+}
+
+slack() {
+	enabled=true
+	url=''
+	remind=false
+	remindEvery='10m'
+
+	while getopts ":e:u:dr" o; do
+    case "${o}" in
+				e)
+            remindEvery=${OPTARG}
+            ;;
+				u)
+            url=${OPTARG}
+            ;;
+				d)
+            enabled=false
+            ;;
+				r)
+            remind=true
+            ;;
+    esac
+	done
+	shift $((OPTIND-1))
+
+	curl -X PUT \
+		-H "Content-Type: application/json" \
+		-d '{
+			"id": 2,
+			"name": "Slack",
+			"type":  "slack",
+			"isDefault": '$enabled',
+			"sendReminder": '$remind',
+			"frequency": "'$remindEvery'",
+			"uploadImage": true,
+			"settings": {
+				"url": "'$url'"
+			}
+		}' \
+		http://admin:admin@grafana.loc/api/alert-notifications/2
+}
+
+provision() {
+	alerts=1
+	condition=65
+	while getopts ":a:c:" o; do
+    case "${o}" in
+        a)
+            alerts=${OPTARG}
+            ;;
+				c)
+            condition=${OPTARG}
+            ;;
+    esac
+	done
+	shift $((OPTIND-1))
+
+	requiresJsonnet
+
+	rm -rf grafana/provisioning/dashboards/alerts/alert-*.json
+	jsonnet -m grafana/provisioning/dashboards/alerts grafana/provisioning/alerts.jsonnet --ext-code alerts=$alerts --ext-code condition=$condition
+}
+
+# Pauses every alert rule by POSTing {"paused":true} to the admin endpoint.
+pause() {
+  local endpoint='http://admin:admin@grafana.loc/api/admin/pause-all-alerts'
+
+  curl --header "Content-Type: application/json" --data '{"paused":true}' "$endpoint"
+}
+
+# Resumes every alert rule by POSTing {"paused":false} to the admin endpoint.
+unpause() {
+  local endpoint='http://admin:admin@grafana.loc/api/admin/pause-all-alerts'
+
+  curl --header "Content-Type: application/json" --data '{"paused":false}' "$endpoint"
+}
+
+# Prints the command overview to stdout.
+# %b expands the \t and \n escapes in each argument, and the format's own \n
+# terminates every line.
+usage() {
+  printf '%b\n' \
+    "Usage: ./alerts.sh COMMAND [OPTIONS]\n" \
+    "Commands" \
+    "  setup\t\t creates default alert notification channels" \
+    "  slack\t\t configure slack notification channel" \
+    "    [-d]\t\t\t disable notifier, default enabled" \
+    "    [-u]\t\t\t url" \
+    "    [-r]\t\t\t send reminders" \
+    "    [-e <remind every>]\t\t default 10m\n" \
+    "  provision\t provision alerts" \
+    "    [-a <alert rule count>]\t default 1" \
+    "    [-c <condition value>]\t default 65\n" \
+    "  pause\t\t pause all alerts" \
+    "  unpause\t unpause all alerts"
+}
+
+main() {
+	local cmd=$1
+
+	if [[ $cmd == "setup" ]]; then
+		setup
+	elif [[ $cmd == "slack" ]]; then
+		slack "${@:2}"
+	elif [[ $cmd == "provision" ]]; then
+		provision "${@:2}"
+	elif [[ $cmd == "pause" ]]; then
+		pause
+	elif [[ $cmd == "unpause" ]]; then
+		unpause
+	fi
+
+  if [[ -z "$cmd" ]]; then
+		usage
+	fi
+}
+
+main "$@"

+ 57 - 0
devenv/docker/ha_test/docker-compose.yaml

@@ -0,0 +1,57 @@
+# Compose definition of the HA test stack: nginx-proxy, mysql, grafana and prometheus.
+version: "2.1"
+
+services:
+  # The only service bound to a fixed host port (80). The Docker socket is
+  # mounted read-only at /tmp/docker.sock.
+  # NOTE(review): presumably nginx-proxy uses the socket to find containers
+  # carrying a VIRTUAL_HOST variable and routes by hostname — confirm against
+  # the jwilder/nginx-proxy documentation.
+  nginx-proxy:
+    image: jwilder/nginx-proxy
+    ports:
+      - "80:80"
+    volumes:
+      - /var/run/docker.sock:/tmp/docker.sock:ro
+
+  # Database used by the grafana service below for GF_DATABASE_* and for the
+  # session provider. The image tag is not pinned.
+  mysql:
+    image: mysql
+    environment:
+      MYSQL_ROOT_PASSWORD: rootpass
+      MYSQL_DATABASE: grafana
+      MYSQL_USER: grafana
+      MYSQL_PASSWORD: password
+    # The grafana service waits on this check (see its depends_on).
+    healthcheck:
+      test: ["CMD", "mysqladmin" ,"ping", "-h", "localhost"]
+      timeout: 10s
+      retries: 10
+
+  grafana:
+    image: grafana/grafana:dev
+    volumes:
+      - ./grafana/provisioning/:/etc/grafana/provisioning/
+    environment:
+      - VIRTUAL_HOST=grafana.loc
+      - GF_SERVER_ROOT_URL=http://grafana.loc
+      - GF_DATABASE_TYPE=mysql
+      - GF_DATABASE_HOST=mysql:3306
+      - GF_DATABASE_NAME=grafana
+      - GF_DATABASE_USER=grafana
+      - GF_DATABASE_PASSWORD=password
+      - GF_SESSION_PROVIDER=mysql
+      - GF_SESSION_PROVIDER_CONFIG=grafana:password@tcp(mysql:3306)/grafana?allowNativePasswords=true
+    # Only the container port is listed; no fixed host port is bound.
+    # NOTE(review): presumably so that several scaled instances cannot collide
+    # on a single host port — confirm.
+    ports:
+      - 3000
+    # Start only once the mysql healthcheck reports healthy.
+    depends_on:
+      mysql:
+        condition: service_healthy
+
+  prometheus:
+    image: prom/prometheus:v2.4.2
+    volumes:
+      - ./prometheus/:/etc/prometheus/
+    environment:
+      - VIRTUAL_HOST=prometheus.loc
+    ports:
+      - 9090
+
+  # Disabled: optional MySQL exporter; the matching scrape job in
+  # prometheus/prometheus.yml is commented out as well.
+  # mysqld-exporter:
+  #   image: prom/mysqld-exporter
+  #   environment:
+  #     - DATA_SOURCE_NAME=grafana:password@(mysql:3306)/
+  #   ports:
+  #     - 9104

+ 202 - 0
devenv/docker/ha_test/grafana/provisioning/alerts.jsonnet

@@ -0,0 +1,202 @@
+// Number of alert dashboards to generate, read from the external variable 'alerts'.
+local numAlerts = std.extVar('alerts');
+// Value placed in each generated alert's evaluator params, read from 'condition'.
+local condition = std.extVar('condition');
+// One entry per dashboard to generate, starting at 1.
+local arr = std.range(1, numAlerts);
+
+local alertDashboardTemplate = {
+  "editable": true,
+  "gnetId": null,
+  "graphTooltip": 0,
+  "id": null,
+  "links": [],
+  "panels": [
+    {
+      "alert": {
+        "conditions": [
+          {
+            "evaluator": {
+              "params": [
+                65
+              ],
+              "type": "gt"
+            },
+            "operator": {
+              "type": "and"
+            },
+            "query": {
+              "params": [
+                "A",
+                "5m",
+                "now"
+              ]
+            },
+            "reducer": {
+              "params": [],
+              "type": "avg"
+            },
+            "type": "query"
+          }
+        ],
+        "executionErrorState": "alerting",
+        "frequency": "10s",
+        "handler": 1,
+        "name": "bulk alerting",
+        "noDataState": "no_data",
+        "notifications": [
+          {
+            "id": 2
+          }
+        ]
+      },
+      "aliasColors": {},
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": "Prometheus",
+      "fill": 1,
+      "gridPos": {
+        "h": 9,
+        "w": 12,
+        "x": 0,
+        "y": 0
+      },
+      "id": 2,
+      "legend": {
+        "avg": false,
+        "current": false,
+        "max": false,
+        "min": false,
+        "show": true,
+        "total": false,
+        "values": false
+      },
+      "lines": true,
+      "linewidth": 1,
+      "nullPointMode": "null",
+      "percentage": false,
+      "pointradius": 5,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "$$hashKey": "object:117",
+          "expr": "go_goroutines",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "refId": "A"
+        }
+      ],
+      "thresholds": [
+        {
+          "colorMode": "critical",
+          "fill": true,
+          "line": true,
+          "op": "gt",
+          "value": 50
+        }
+      ],
+      "timeFrom": null,
+      "timeShift": null,
+      "title": "Panel Title",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        },
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        }
+      ]
+    }
+  ],
+  "schemaVersion": 16,
+  "style": "dark",
+  "tags": [],
+  "templating": {
+    "list": []
+  },
+  "time": {
+    "from": "now-6h",
+    "to": "now"
+  },
+  "timepicker": {
+    "refresh_intervals": [
+      "5s",
+      "10s",
+      "30s",
+      "1m",
+      "5m",
+      "15m",
+      "30m",
+      "1h",
+      "2h",
+      "1d"
+    ],
+    "time_options": [
+      "5m",
+      "15m",
+      "1h",
+      "6h",
+      "12h",
+      "24h",
+      "2d",
+      "7d",
+      "30d"
+    ]
+  },
+  "timezone": "",
+  "title": "New dashboard",
+  "uid": null,
+  "version": 0
+};
+
+
+// Output object: one field per entry of arr, keyed 'alert-<x>.json', each
+// holding a copy of alertDashboardTemplate with per-dashboard overrides.
+{
+  ['alert-' + std.toString(x) + '.json']:
+    alertDashboardTemplate + {
+      // Replace the panel list with a single panel derived from the template's first panel.
+      panels: [
+        alertDashboardTemplate.panels[0] +
+        {
+          // alert+: merges into the template's alert object instead of replacing it.
+          alert+: {
+            name: 'Alert rule ' + x,
+            // Single condition: the template's first condition with its
+            // evaluator params swapped for the external 'condition' value.
+            conditions: [
+              alertDashboardTemplate.panels[0].alert.conditions[0] +
+              {
+                evaluator+: {
+                  params: [condition]
+                }
+              },
+            ],
+          },
+        },
+      ],
+      // uid and title are derived from x, so they differ per generated dashboard.
+      uid: 'alert-' + x,
+      title: 'Alert ' + x
+    },
+      for x in arr
+}

+ 8 - 0
devenv/docker/ha_test/grafana/provisioning/dashboards/alerts.yaml

@@ -0,0 +1,8 @@
+# Dashboard provider: file-based provider named 'Alerts', placed in the
+# 'Alerts' folder and reading from the path given below.
+apiVersion: 1
+
+providers:
+ - name: 'Alerts'
+   folder: 'Alerts'
+   type: file
+   options:
+     # Path as seen by the Grafana process; docker-compose.yaml mounts
+     # ./grafana/provisioning/ at /etc/grafana/provisioning/.
+     path: /etc/grafana/provisioning/dashboards/alerts

+ 172 - 0
devenv/docker/ha_test/grafana/provisioning/dashboards/alerts/overview.json

@@ -0,0 +1,172 @@
+{
+  "annotations": {
+    "list": [
+      {
+        "builtIn": 1,
+        "datasource": "-- Grafana --",
+        "enable": true,
+        "hide": true,
+        "iconColor": "rgba(0, 211, 255, 1)",
+        "name": "Annotations & Alerts",
+        "type": "dashboard"
+      }
+    ]
+  },
+  "editable": true,
+  "gnetId": null,
+  "graphTooltip": 0,
+  "links": [],
+  "panels": [
+    {
+      "aliasColors": {
+        "Active alerts": "#bf1b00"
+      },
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": "Prometheus",
+      "fill": 1,
+      "gridPos": {
+        "h": 12,
+        "w": 24,
+        "x": 0,
+        "y": 0
+      },
+      "id": 2,
+      "interval": "",
+      "legend": {
+        "alignAsTable": true,
+        "avg": false,
+        "current": true,
+        "max": false,
+        "min": false,
+        "rightSide": true,
+        "show": true,
+        "total": false,
+        "values": true
+      },
+      "lines": true,
+      "linewidth": 2,
+      "links": [],
+      "nullPointMode": "null",
+      "percentage": false,
+      "pointradius": 5,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [
+        {
+          "alias": "Active grafana instances",
+          "dashes": true,
+          "fill": 0
+        }
+      ],
+      "spaceLength": 10,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "sum(increase(grafana_alerting_notification_sent_total[1m])) by(job)",
+          "format": "time_series",
+          "instant": false,
+          "interval": "1m",
+          "intervalFactor": 1,
+          "legendFormat": "Notifications sent",
+          "refId": "A"
+        },
+        {
+          "expr": "min(grafana_alerting_active_alerts) without(instance)",
+          "format": "time_series",
+          "interval": "1m",
+          "intervalFactor": 1,
+          "legendFormat": "Active alerts",
+          "refId": "B"
+        },
+        {
+          "expr": "count(up{job=\"grafana\"})",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "Active grafana instances",
+          "refId": "C"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeShift": null,
+      "title": "Notifications sent vs active alerts",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": "0",
+          "show": true
+        },
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        }
+      ],
+      "yaxis": {
+        "align": false,
+        "alignLevel": 3
+      }
+    }
+  ],
+  "schemaVersion": 16,
+  "style": "dark",
+  "tags": [],
+  "templating": {
+    "list": []
+  },
+  "time": {
+    "from": "now-1h",
+    "to": "now"
+  },
+  "timepicker": {
+    "refresh_intervals": [
+      "5s",
+      "10s",
+      "30s",
+      "1m",
+      "5m",
+      "15m",
+      "30m",
+      "1h",
+      "2h",
+      "1d"
+    ],
+    "time_options": [
+      "5m",
+      "15m",
+      "1h",
+      "6h",
+      "12h",
+      "24h",
+      "2d",
+      "7d",
+      "30d"
+    ]
+  },
+  "timezone": "",
+  "title": "Overview",
+  "uid": "xHy7-hAik",
+  "version": 6
+}

+ 11 - 0
devenv/docker/ha_test/grafana/provisioning/datasources/datasources.yaml

@@ -0,0 +1,11 @@
+# Provisions the datasource named "Prometheus", which the provisioned
+# dashboards reference by that name.
+apiVersion: 1
+
+datasources:
+  - name: Prometheus
+    type: prometheus
+    access: proxy
+    # 'prometheus' is the compose service name; 9090 is the port listed for it.
+    url: http://prometheus:9090
+    jsonData:
+      # Same value as the scrape_interval configured in prometheus/prometheus.yml.
+      timeInterval: 10s
+      queryTimeout: 30s
+      httpMethod: POST

+ 39 - 0
devenv/docker/ha_test/prometheus/prometheus.yml

@@ -0,0 +1,39 @@
+# my global config
+global:
+  scrape_interval:     10s # Scrape targets every 10 seconds (Prometheus default is 1m).
+  evaluation_interval: 10s # Evaluate rules every 10 seconds (Prometheus default is 1m).
+  # scrape_timeout is set to the global default (10s).
+
+# Load and evaluate rules in this file every 'evaluation_interval' seconds.
+#rule_files:
+# - "alert.rules"
+# - "first.rules"
+# - "second.rules"
+
+# alerting:
+#   alertmanagers:
+#   - scheme: http
+#     static_configs:
+#     - targets:
+#       - "127.0.0.1:9093"
+
+scrape_configs:
+  # Prometheus scraping itself.
+  - job_name: 'prometheus'
+    static_configs:
+      - targets: ['localhost:9090']
+
+  # Grafana targets are discovered through DNS: A-record lookup of the name
+  # 'grafana', re-resolved every 10s, each address scraped on port 3000.
+  # NOTE(review): presumably the compose network's DNS returns one A record
+  # per scaled grafana container, so new instances are picked up
+  # automatically — confirm.
+  - job_name: 'grafana'
+    dns_sd_configs:
+      - names:
+        - 'grafana'
+        type: 'A'
+        port: 3000
+        refresh_interval: 10s
+
+  # - job_name: 'mysql'
+  #   dns_sd_configs:
+  #     - names:
+  #       - 'mysqld-exporter'
+  #       type: 'A'
+  #       port: 9104
+  #       refresh_interval: 10s