Compare commits
19 Commits
9a4c7c9440
...
16e0d6eadb
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
16e0d6eadb | ||
|
|
298f053835 | ||
|
|
6acb2d17dd | ||
|
|
b6e379a3f2 | ||
|
|
c5a54827d4 | ||
|
|
6f1bcecf25 | ||
|
|
5bf47c73a7 | ||
|
|
681b788ac4 | ||
|
|
0eeeecb549 | ||
|
|
44665bae12 | ||
|
|
8dc0ec798f | ||
|
|
e9d1eed01b | ||
|
|
8d2999fe87 | ||
|
|
d80641623e | ||
|
|
22227d9ffc | ||
|
|
3e9ea95ad7 | ||
|
|
d76dbf6e3c | ||
|
|
ef5d83b188 | ||
|
|
74e9eb8dcb |
@@ -5,8 +5,8 @@ These variables are required by multiple roles. Example values included.
|
||||
timezone: 'Europe/Helsinki'
|
||||
admin_email: 'admin@domain.tld'
|
||||
|
||||
smtp_server: smtp.domain.tld
|
||||
smtp_from: sender@domain.tld
|
||||
smtp_server: smtp.domain.tld # Smtp server, must be reachable on port 587 with tls
|
||||
smtp_from: sender@domain.tld # Address to send mail from
|
||||
```
|
||||
|
||||
# Optional variables
|
||||
@@ -15,7 +15,7 @@ These variables are used by multiple roles and have the following default values
|
||||
```
|
||||
reverse_proxy_type: caddy # Allowed values: caddy, traefik, none
|
||||
|
||||
smtp_from: # not defined, no emails sent by default
|
||||
smtp_user: # not defined, no smtp login by default
|
||||
smtp_pw: # not defined, see above
|
||||
|
||||
```
|
||||
|
||||
1
roles/alertmanager/README.md
Normal file
1
roles/alertmanager/README.md
Normal file
@@ -0,0 +1 @@
|
||||
Sets up a prometheus alertmanager docker container.
|
||||
8
roles/alertmanager/defaults/main.yml
Normal file
8
roles/alertmanager/defaults/main.yml
Normal file
@@ -0,0 +1,8 @@
|
||||
---
|
||||
|
||||
alertmanager_storage_retention: "{{ prometheus_storage_retention | default('3650d') }}"
|
||||
|
||||
alertmanager_smtp_server: "{{ smtp_server | default('') }}"
|
||||
alertmanager_smtp_from: "{{ smtp_from | default('') }}"
|
||||
alertmanager_smtp_user: "{{ smtp_user | default('') }}"
|
||||
alertmanager_smtp_pw: "{{ smtp_pw | default('') }}"
|
||||
79
roles/alertmanager/meta/argument_specs.yml
Normal file
79
roles/alertmanager/meta/argument_specs.yml
Normal file
@@ -0,0 +1,79 @@
|
||||
---
|
||||
|
||||
argument_specs:
|
||||
main:
|
||||
short_description: Prometheus alertmanager docker container
|
||||
options:
|
||||
alertmanager_storage_retention:
|
||||
description: Period of time for which alertmanager data is stored for. A number followed by unit (s, m, h, d, w, y).
|
||||
type: str
|
||||
required: false
|
||||
default: "{{ prometheus_storage_retention | default('3650d') }}"
|
||||
|
||||
alertmanager_smtp_server:
|
||||
description: Smtp server to use for sending mail. Must be reachable on port 587. Emails not sent if not defined
|
||||
type: str
|
||||
required: false
|
||||
default: "{{ smtp_server | default('') }}"
|
||||
alertmanager_smtp_from:
|
||||
description: Address to send mail from. Required if sending emails.
|
||||
type: str
|
||||
required: false
|
||||
default: "{{ smtp_from | default('') }}"
|
||||
alertmanager_smtp_user:
|
||||
description: User to login to smtp server with. No authentication if not defined.
|
||||
type: str
|
||||
required: false
|
||||
default: "{{ smtp_user | default('') }}"
|
||||
alertmanager_smtp_pw:
|
||||
description: Password for the smtp user
|
||||
type: str
|
||||
required: false
|
||||
default: "{{ smtp_pw | default('') }}"
|
||||
smtp_server:
|
||||
description: Global smtp server value, default for alertmanager_smtp_server
|
||||
type: str
|
||||
required: false
|
||||
smtp_from:
|
||||
description: Global smtp from value, default for alertmanager_smtp_from
|
||||
type: str
|
||||
required: "{{ alertmanager_smtp_server | length > 0 and alertmanager_smtp_from | length == 0 }}"
|
||||
smtp_user:
|
||||
description: Global smtp user value, default for alertmanager_smtp_user
|
||||
type: str
|
||||
required: false
|
||||
smtp_pw:
|
||||
description: Global smtp password value, default for alertmanager_smtp_pw
|
||||
type: str
|
||||
required: "{{ alertmanager_smtp_server | length > 0 and alertmanager_smtp_user | length > 0 and alertmanager_smtp_pw | length == 0 }}"
|
||||
|
||||
# All options after this will be passed directly to the container role
|
||||
docker_service_suffix:
|
||||
description: "Passed to container role"
|
||||
required: false
|
||||
docker_host_user:
|
||||
description: "Passed to container role"
|
||||
required: false
|
||||
|
||||
database_passwords:
|
||||
description: "Passed to container role"
|
||||
required: false
|
||||
docker_additional_services:
|
||||
description: "Passed to container role"
|
||||
required: false
|
||||
|
||||
docker_volume_type:
|
||||
description: "Passed to container role"
|
||||
required: false
|
||||
reverse_proxy_type:
|
||||
description: "Passed to container role"
|
||||
required: false
|
||||
ports:
|
||||
description: "Passed to container role"
|
||||
required: false
|
||||
docker_vhost_domains:
|
||||
description: "Passed to container role"
|
||||
required: false
|
||||
docker_entrypoint:
|
||||
description: "Passed to container role"
|
||||
required: false
|
||||
19
roles/alertmanager/tasks/main.yml
Normal file
19
roles/alertmanager/tasks/main.yml
Normal file
@@ -0,0 +1,19 @@
|
||||
---
|
||||
|
||||
- name: Prometheus alertmanager container
|
||||
include_role:
|
||||
name: container
|
||||
vars:
|
||||
docker_service: alertmanager
|
||||
docker_image: prom/alertmanager
|
||||
reverse_proxy_type: none
|
||||
docker_command:
|
||||
- "--config.file=/etc/alertmanager/alertmanager.yml"
|
||||
- "--storage.path=/alertmanager"
|
||||
- "--data.retention={{ alertmanager_storage_retention }}"
|
||||
|
||||
docker_mounts:
|
||||
- name: data
|
||||
path: /alertmanager
|
||||
- template: alertmanager.yml
|
||||
path: /etc/alertmanager/alertmanager.yml
|
||||
68
roles/alertmanager/templates/alertmanager.yml.j2
Normal file
68
roles/alertmanager/templates/alertmanager.yml.j2
Normal file
@@ -0,0 +1,68 @@
|
||||
---
|
||||
|
||||
# {{ ansible_managed }}
|
||||
|
||||
global:
|
||||
# The smarthost and SMTP sender used for mail notifications.
|
||||
{% if alertmanager_smtp_server | length > 0 %}
|
||||
smtp_smarthost: '{{ alertmanager_smtp_server }}:587'
|
||||
smtp_from: '{{ alertmanager_smtp_from }}'
|
||||
{% if alertmanager_smtp_user | length > 0 %}
|
||||
smtp_auth_username: '{{ alertmanager_smtp_user }}'
|
||||
smtp_auth_password: '{{ alertmanager_smtp_pw }}'
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
|
||||
# The directory from which notification templates are read.
|
||||
templates:
|
||||
- '/etc/alertmanager/template/*.tmpl'
|
||||
|
||||
# The root route on which each incoming alert enters.
|
||||
route:
|
||||
# The labels by which incoming alerts are grouped together. For example,
|
||||
# multiple alerts coming in for cluster=A and alertname=LatencyHigh would
|
||||
# be batched into a single group.
|
||||
#
|
||||
# To aggregate by all possible labels use '...' as the sole label name.
|
||||
# This effectively disables aggregation entirely, passing through all
|
||||
# alerts as-is. This is unlikely to be what you want, unless you have
|
||||
# a very low alert volume or your upstream notification system performs
|
||||
# its own grouping. Example: group_by: [...]
|
||||
group_by: ['alertname', 'cluster', 'service']
|
||||
|
||||
# When a new group of alerts is created by an incoming alert, wait at
|
||||
# least 'group_wait' to send the initial notification.
|
||||
# This way ensures that you get multiple alerts for the same group that start
|
||||
# firing shortly after another are batched together on the first
|
||||
# notification.
|
||||
group_wait: 30s
|
||||
|
||||
# When the first notification was sent, wait 'group_interval' to send a batch
|
||||
# of new alerts that started firing for that group.
|
||||
group_interval: 5m
|
||||
|
||||
# If an alert has successfully been sent, wait 'repeat_interval' to
|
||||
# resend them.
|
||||
repeat_interval: 3h
|
||||
|
||||
# A default receiver
|
||||
receiver: uumas_email
|
||||
|
||||
# All the above attributes are inherited by all child routes and can
|
||||
# overwritten on each.
|
||||
|
||||
# The child route trees.
|
||||
routes: {{ alertmanager_routes }}
|
||||
|
||||
inhibit_rules:
|
||||
- source_matchers: [severity="critical"]
|
||||
target_matchers: [severity="warning"]
|
||||
# Apply inhibition if the alertname is the same.
|
||||
# CAUTION:
|
||||
# If all label names listed in `equal` are missing
|
||||
# from both the source and target alerts,
|
||||
# the inhibition rule will apply!
|
||||
equal: [alertname, cluster, service]
|
||||
|
||||
|
||||
receivers: {{ alertmanager_receivers }}
|
||||
@@ -1,5 +1,9 @@
|
||||
---
|
||||
|
||||
- name: Set docker service full name (required because docker_mounts uses it)
|
||||
set_fact:
|
||||
authentik_service_name: "authentik{{ '_' + docker_service_suffix if docker_service_suffix is defined else '' }}"
|
||||
|
||||
- name: Authentik container
|
||||
ansible.builtin.import_role:
|
||||
name: container
|
||||
@@ -13,6 +17,13 @@
|
||||
docker_additional_services:
|
||||
- redis
|
||||
docker_env: "{{ authentik_common_env | combine(authentik_env) }}"
|
||||
docker_mounts:
|
||||
- path: /media
|
||||
absolute_name: "{{ authentik_service_name }}_media"
|
||||
- path: /templates
|
||||
absolute_name: "{{ authentik_service_name }}_templates"
|
||||
- path: /certs
|
||||
name: "{{ authentik_service_name }}_certs"
|
||||
|
||||
- name: Authentik worker container
|
||||
ansible.builtin.import_role:
|
||||
@@ -24,3 +35,10 @@
|
||||
- worker
|
||||
reverse_proxy_type: none
|
||||
docker_env: "{{ authentik_common_env | combine(authentik_env) }}"
|
||||
docker_mounts:
|
||||
- path: /media
|
||||
absolute_name: "{{ authentik_service_name }}_media"
|
||||
- path: /templates
|
||||
absolute_name: "{{ authentik_service_name }}_templates"
|
||||
- path: /certs
|
||||
name: "{{ authentik_service_name }}_certs"
|
||||
|
||||
7
roles/container/handlers/main.yml
Normal file
7
roles/container/handlers/main.yml
Normal file
@@ -0,0 +1,7 @@
|
||||
---
|
||||
|
||||
- name: Restart container {{ docker_service_name }}
|
||||
community.docker.docker_container:
|
||||
name: "{{ docker_service_name }}"
|
||||
restart: true
|
||||
when: not container_out.changed
|
||||
@@ -118,6 +118,10 @@ argument_specs:
|
||||
description: "Name of template without .j2 extension. Will be templated at /opt/<service>[/suffix]/mounts/<template> and mounted inside the container."
|
||||
type: str
|
||||
required: false
|
||||
copypath:
|
||||
description: "Name of file or directory to copy. Will be deployed from files/<copypath> to /opt/<service>[/suffix]/mounts/<copypath> and mounted inside the container."
|
||||
type: str
|
||||
required: false
|
||||
reverse_proxy_type:
|
||||
description: "Defines which kind of reverse proxy to configure for the container. Traefik support is experimental."
|
||||
type: str
|
||||
|
||||
@@ -34,7 +34,7 @@
|
||||
source: pull
|
||||
force_source: true
|
||||
register: pulled_image
|
||||
when: dockerfile is not defined or dockerfile | length == 0
|
||||
when: not dockerfile_needed
|
||||
|
||||
- name: Set container_image variable
|
||||
set_fact:
|
||||
|
||||
@@ -39,6 +39,7 @@
|
||||
- name: Set assistive variables
|
||||
set_fact:
|
||||
template_mounts_needed: "{{ docker_mounts | selectattr('template', 'defined') | list | length > 0 }}"
|
||||
copypath_mounts_needed: "{{ docker_mounts | selectattr('copypath', 'defined') | list | length > 0 }}"
|
||||
volumes_needed: "{{ docker_mounts | selectattr('name', 'defined') | list | length > 0 or docker_database != 'none' }}"
|
||||
dockerfile_needed: "{{ dockerfile | length > 0 }}"
|
||||
db_config_mounts_needed: "{{ docker_mariadb_config | length > 0 }}"
|
||||
@@ -48,8 +49,8 @@
|
||||
named_volumes_needed: "{{ volumes_needed and docker_volume_type == 'named' }}"
|
||||
- name: Set even more assistive variables
|
||||
set_fact:
|
||||
create_opt_directory: "{{ dockerfile_needed or docker_host_user or bind_volumes_needed or template_mounts_needed or db_config_mounts_needed }}"
|
||||
create_mounts_directory: "{{ bind_volumes_needed or template_mounts_needed or db_config_mounts_needed }}"
|
||||
create_opt_directory: "{{ dockerfile_needed or docker_host_user or bind_volumes_needed or template_mounts_needed or copypath_mounts_needed or db_config_mounts_needed }}"
|
||||
create_mounts_directory: "{{ bind_volumes_needed or template_mounts_needed or copypath_mounts_needed or db_config_mounts_needed }}"
|
||||
|
||||
- name: Set docker service full name
|
||||
set_fact:
|
||||
|
||||
@@ -7,16 +7,16 @@
|
||||
when: docker_network_mode is not defined or docker_network_mode != 'host' or docker_networks | length > 0
|
||||
block:
|
||||
- name: Set networks variable to {{ docker_service_name }}
|
||||
set_fact:
|
||||
ansible.builtin.set_fact:
|
||||
container_networks:
|
||||
- name: "{{ docker_service_name }}"
|
||||
when: docker_networks | length == 0
|
||||
- name: Set networks variable to {{ docker_networks }}
|
||||
set_fact:
|
||||
ansible.builtin.set_fact:
|
||||
container_networks: "{{ docker_networks }}"
|
||||
when: docker_networks | length > 0
|
||||
- name: Create docker networks
|
||||
docker_network:
|
||||
community.docker.docker_network:
|
||||
name: "{{ item.name }}"
|
||||
loop: "{{ container_networks }}"
|
||||
|
||||
@@ -25,7 +25,7 @@
|
||||
when: reverse_proxy_type != 'none'
|
||||
|
||||
- name: Create directory /opt/{{ docker_service }}
|
||||
file:
|
||||
ansible.builtin.file:
|
||||
path: "/opt/{{ docker_service }}"
|
||||
state: directory
|
||||
mode: 0755
|
||||
@@ -42,18 +42,18 @@
|
||||
when: create_opt_directory and docker_service_suffix is defined
|
||||
block:
|
||||
- name: Create directory /opt/{{ docker_service + '/' + docker_service_suffix }}
|
||||
file:
|
||||
ansible.builtin.file:
|
||||
path: "/opt/{{ docker_service }}/{{ docker_service_suffix }}"
|
||||
state: directory
|
||||
owner: "{{ user.uid | default(omit) }}"
|
||||
group: "{{ user.group | default(omit) }}"
|
||||
mode: 0755
|
||||
- name: Set container_workdir variable
|
||||
set_fact:
|
||||
ansible.builtin.set_fact:
|
||||
container_workdir: /opt/{{ docker_service }}/{{ docker_service_suffix }}
|
||||
|
||||
- name: Set container_workdir variable
|
||||
set_fact:
|
||||
ansible.builtin.set_fact:
|
||||
container_workdir: /opt/{{ docker_service }}
|
||||
when: docker_service_suffix is not defined
|
||||
|
||||
@@ -61,11 +61,11 @@
|
||||
when: create_mounts_directory
|
||||
block:
|
||||
- name: Set docker_mounts_dir
|
||||
set_fact:
|
||||
ansible.builtin.set_fact:
|
||||
docker_mounts_dir: "{{ container_workdir }}/mounts"
|
||||
|
||||
- name: Create directory {{ docker_mounts_dir }}
|
||||
file:
|
||||
ansible.builtin.file:
|
||||
path: "{{ docker_mounts_dir }}"
|
||||
state: directory
|
||||
owner: "{{ user.uid | default(omit) }}"
|
||||
@@ -84,9 +84,9 @@
|
||||
import_tasks: volumes.yml
|
||||
|
||||
- name: "Container for {{ docker_service_name }}"
|
||||
docker_container:
|
||||
community.docker.docker_container:
|
||||
name: "{{ docker_service_name }}"
|
||||
image: "{{ container_image.Id if container_image != '' else docker_image }}"
|
||||
image: "{{ container_image.Id if (not ansible_check_mode) or (container_image | length > 0) else docker_image }}"
|
||||
user: "{{ docker_user if docker_host_user else omit }}"
|
||||
mounts: "{{ docker_volume_definition }}"
|
||||
published_ports: "{{ container_published_ports + docker_published_ports }}"
|
||||
@@ -101,6 +101,9 @@
|
||||
register: container_out
|
||||
|
||||
- name: Reset docker_mounts if converted from docker_volumes
|
||||
set_fact:
|
||||
ansible.builtin.set_fact:
|
||||
docker_mounts: []
|
||||
when: final_docker_volumes | length > 0
|
||||
|
||||
- name: Flush handlers to trigger container restart
|
||||
ansible.builtin.meta: flush_handlers
|
||||
|
||||
@@ -14,12 +14,12 @@
|
||||
state: directory
|
||||
owner: "{{ mount_owner if (item.set_owner is not defined or item.set_owner) and mount_owner | length > 0 else omit }}"
|
||||
group: "{{ mount_group if (item.set_group is not defined or item.set_group) and mount_group | length > 0 else omit }}"
|
||||
when: item.name is defined
|
||||
when: item.name is defined and docker_volume_type == 'bind'
|
||||
loop: "{{ docker_mounts }}"
|
||||
- name: Set docker_volume_definition for named binds
|
||||
set_fact:
|
||||
docker_volume_definition: "{{ docker_volume_definition + [{'source': docker_mounts_dir + '/' + item.name, 'target': item.path, 'type': 'bind'}] }}"
|
||||
when: item.name is defined
|
||||
when: item.name is defined and docker_volume_type == 'bind'
|
||||
loop: "{{ docker_mounts }}"
|
||||
|
||||
- name: Template docker template mounts for {{ docker_service_name }}
|
||||
@@ -28,12 +28,26 @@
|
||||
dest: "{{ docker_mounts_dir }}/{{ item.template }}"
|
||||
when: item.template is defined
|
||||
loop: "{{ docker_mounts }}"
|
||||
notify: Restart container {{ docker_service_name }}
|
||||
- name: Set docker_volume_definition for template mounts
|
||||
set_fact:
|
||||
docker_volume_definition: "{{ docker_volume_definition + [{'source': docker_mounts_dir + '/' + item.template, 'target': item.path, 'type': 'bind', 'read_only': true}] }}"
|
||||
when: item.template is defined
|
||||
loop: "{{ docker_mounts }}"
|
||||
|
||||
- name: Copy docker copypath mounts for {{ docker_service_name }}
|
||||
copy:
|
||||
src: "files/{{ item.copypath }}"
|
||||
dest: "{{ docker_mounts_dir }}/"
|
||||
when: item.copypath is defined
|
||||
loop: "{{ docker_mounts }}"
|
||||
notify: Restart container {{ docker_service_name }}
|
||||
- name: Set docker_volume_definition for copypath mounts
|
||||
set_fact:
|
||||
docker_volume_definition: "{{ docker_volume_definition + [{'source': docker_mounts_dir + '/' + item.copypath, 'target': item.path, 'type': 'bind', 'read_only': true}] }}"
|
||||
when: item.copypath is defined
|
||||
loop: "{{ docker_mounts }}"
|
||||
|
||||
- name: Set docker_volume_definition for named volumes
|
||||
set_fact:
|
||||
docker_volume_definition: "{{ docker_volume_definition + [{'source': docker_service_name + '_' + item.name, 'target': item.path, 'type': 'volume'}] }}"
|
||||
|
||||
9
roles/grafana/defaults/main.yml
Normal file
9
roles/grafana/defaults/main.yml
Normal file
@@ -0,0 +1,9 @@
|
||||
---
|
||||
|
||||
grafana_oauth_enabled: false
|
||||
grafana_oauth_scopes:
|
||||
- openid
|
||||
- profile
|
||||
- email
|
||||
grafana_oauth_allow_sign_up: true
|
||||
grafana_oauth_auto_login: false
|
||||
@@ -5,6 +5,59 @@ argument_specs:
|
||||
short_description: Grafana
|
||||
description: "Sets up a grafana docker container"
|
||||
options:
|
||||
grafana_oauth_enabled:
|
||||
description: Enables generic OAuth2 authentication.
|
||||
type: bool
|
||||
required: false
|
||||
default: false
|
||||
grafana_oauth_name:
|
||||
description: Name that refers to the generic OAuth2 authentication from the Grafana user interface.
|
||||
type: str
|
||||
required: false
|
||||
grafana_oauth_client_id:
|
||||
description: Client ID provided by your OAuth2 app.
|
||||
type: str
|
||||
required: "{{ grafana_oauth_enabled }}"
|
||||
grafana_oauth_client_secret:
|
||||
description: Client secret provided by your OAuth2 app.
|
||||
type: str
|
||||
required: "{{ grafana_oauth_enabled }}"
|
||||
grafana_oauth_auth_url:
|
||||
description: Authorization endpoint of your OAuth2 provider.
|
||||
type: str
|
||||
required: "{{ grafana_oauth_enabled }}"
|
||||
grafana_oauth_token_url:
|
||||
description: Endpoint used to obtain the OAuth2 access token.
|
||||
type: str
|
||||
required: "{{ grafana_oauth_enabled }}"
|
||||
grafana_oauth_api_url:
|
||||
description: Endpoint used to obtain user information compatible with OpenID UserInfo.
|
||||
type: str
|
||||
required: "{{ grafana_oauth_enabled }}"
|
||||
grafana_oauth_scopes:
|
||||
description: List of OAuth2 scopes.
|
||||
type: list
|
||||
required: false
|
||||
items: str
|
||||
default:
|
||||
- openid
|
||||
- profile
|
||||
- email
|
||||
grafana_oauth_role_attribute_path:
|
||||
description: JMESPath expression to use for Grafana role lookup. Grafana will first evaluate the expression using the OAuth2 ID token. If no role is found, the expression will be evaluated using the user information obtained from the UserInfo endpoint. The result of the evaluation should be a valid Grafana role (Viewer, Editor, Admin or GrafanaAdmin).
|
||||
type: str
|
||||
required: false
|
||||
grafana_oauth_allow_sign_up:
|
||||
description: Controls Grafana user creation through the generic OAuth2 login. Only existing Grafana users can log in with generic OAuth if set to false.
|
||||
type: bool
|
||||
required: false
|
||||
default: true
|
||||
grafana_oauth_auto_login:
|
||||
description: Set to true to enable users to bypass the login screen and automatically log in. This setting is ignored if you configure multiple auth providers to use auto-login.
|
||||
type: bool
|
||||
required: false
|
||||
default: false
|
||||
|
||||
database_passwords:
|
||||
description: "Passed to container role"
|
||||
required: true
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
docker_service: grafana
|
||||
docker_image: grafana/grafana
|
||||
docker_image_http_port: 3000
|
||||
docker_volumes:
|
||||
docker_mounts:
|
||||
- name: data
|
||||
path: /var/lib/grafana
|
||||
docker_database: postgres
|
||||
@@ -19,3 +19,15 @@
|
||||
GF_DATABASE_PASSWORD: "{{ database_passwords.grafana }}"
|
||||
GF_SERVER_DOMAIN: "{{ docker_vhost_domains.grafana[0] }}"
|
||||
GF_SERVER_ROOT_URL: "https://{{ docker_vhost_domains.grafana[0] }}"
|
||||
|
||||
GF_AUTH_GENERIC_OAUTH_ENABLED: "{{ 'true' if grafana_oauth_enabled else 'false' }}"
|
||||
GF_AUTH_GENERIC_OAUTH_NAME: "{{ grafana_oauth_name | default(omit) }}"
|
||||
GF_AUTH_GENERIC_OAUTH_CLIENT_ID: "{{ grafana_oauth_client_id }}"
|
||||
GF_AUTH_GENERIC_OAUTH_CLIENT_SECRET: "{{ grafana_oauth_client_secret }}"
|
||||
GF_AUTH_GENERIC_OAUTH_AUTH_URL: "{{ grafana_oauth_auth_url }}"
|
||||
GF_AUTH_GENERIC_OAUTH_TOKEN_URL: "{{ grafana_oauth_token_url }}"
|
||||
GF_AUTH_GENERIC_OAUTH_API_URL: "{{ grafana_oauth_api_url }}"
|
||||
GF_AUTH_GENERIC_OAUTH_SCOPES: "{{ grafana_oauth_scopes | join(' ') }}"
|
||||
GF_AUTH_GENERIC_OAUTH_ALLOW_SIGN_UP: "{{ 'true' if grafana_oauth_allow_sign_up else 'false' }}"
|
||||
GF_AUTH_GENERIC_OAUTH_AUTO_LOGIN: "{{ 'true' if grafana_oauth_auto_login else 'false' }}"
|
||||
GF_AUTH_GENERIC_OAUTH_ROLE_ATTRIBUTE_PATH: "{{ grafana_oauth_role_attribute_path | default(omit) }}"
|
||||
|
||||
@@ -3,5 +3,14 @@
|
||||
prometheus_scrape_interval: 5s
|
||||
prometheus_evaluation_interval: 15s
|
||||
prometheus_storage_retention: 3650d
|
||||
prometheus_install_grafana: false
|
||||
|
||||
prometheus_hcloud_enabled: false
|
||||
prometheus_hcloud_relabel_configs: []
|
||||
|
||||
prometheus_install_alertmanager: true
|
||||
prometheus_install_webhook: false
|
||||
prometheus_install_grafana: false
|
||||
|
||||
alertmanager_storage_retention: "{{ prometheus_storage_retention }}"
|
||||
|
||||
prometheus_ping_hosts: []
|
||||
|
||||
95
roles/prometheus/files/alerting/blackbox-exporter.yaml
Normal file
95
roles/prometheus/files/alerting/blackbox-exporter.yaml
Normal file
@@ -0,0 +1,95 @@
|
||||
---
|
||||
|
||||
groups:
|
||||
- name: BlackboxExporter
|
||||
rules:
|
||||
|
||||
- alert: BlackboxAllWanProbesFailed
|
||||
expr: 'sum by (host_type) (probe_success{host_type="wan"})==0'
|
||||
for: 5s
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Lost internet access
|
||||
description: Failed to contact any wan probes
|
||||
|
||||
- alert: BlackboxProbeFailed
|
||||
expr: 'probe_success == 0'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: error
|
||||
annotations:
|
||||
summary: Unable to reach (instance {{ $labels.instance }})
|
||||
description: "Probe failed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: BlackboxConfigurationReloadFailure
|
||||
expr: 'blackbox_exporter_config_last_reload_successful != 1'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Blackbox configuration reload failure (instance {{ $labels.instance }})
|
||||
description: "Blackbox configuration reload failure\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: BlackboxSlowProbe
|
||||
expr: 'avg_over_time(probe_duration_seconds[1m]) > 1'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Getting slow responses from (instance {{ $labels.instance }})
|
||||
description: "Blackbox probe took more than 1s to complete\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: BlackboxProbeHttpFailure
|
||||
expr: 'probe_http_status_code <= 199 or probe_http_status_code >= 400'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: error
|
||||
annotations:
|
||||
summary: HTTP failure (instance {{ $labels.instance }})
|
||||
description: "HTTP status code is not 200-399\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: BlackboxSslCertificateWillExpireSoon
|
||||
expr: '3 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 20'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: SSL certificate will expire soon (instance {{ $labels.instance }})
|
||||
description: "SSL certificate expires in less than 20 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: BlackboxSslCertificateWillExpireSoon
|
||||
expr: '0 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 3'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: error
|
||||
annotations:
|
||||
summary: SSL certificate expiry imminent (instance {{ $labels.instance }})
|
||||
description: "SSL certificate expires in less than 3 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: BlackboxSslCertificateExpired
|
||||
expr: 'round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 0'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: SSL certificate expired (instance {{ $labels.instance }})
|
||||
description: "SSL certificate has expired already\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: BlackboxProbeSlowHttp
|
||||
expr: 'avg_over_time(probe_http_duration_seconds[1m]) > 1'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Slow HTTP responses from (instance {{ $labels.instance }})
|
||||
description: "HTTP request took more than 1s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: BlackboxProbeSlowPing
|
||||
expr: 'avg_over_time(probe_icmp_duration_seconds[1m]) > 1'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Slow ping responses from (instance {{ $labels.instance }})
|
||||
description: "Blackbox ping took more than 1s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
349
roles/prometheus/files/alerting/node-exporter.yaml
Normal file
349
roles/prometheus/files/alerting/node-exporter.yaml
Normal file
@@ -0,0 +1,349 @@
|
||||
---
|
||||
|
||||
groups:
|
||||
|
||||
- name: NodeExporter
|
||||
|
||||
rules:
|
||||
|
||||
- alert: HostOutOfMemory
|
||||
expr: '(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host out of memory (instance {{ $labels.instance }})
|
||||
description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostMemoryUnderMemoryPressure
|
||||
expr: '(rate(node_vmstat_pgmajfault[1m]) > 1000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host memory under memory pressure (instance {{ $labels.instance }})
|
||||
description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostMemoryIsUnderutilized
|
||||
expr: '(100 - (rate(node_memory_MemAvailable_bytes[30m]) / node_memory_MemTotal_bytes * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 1w
|
||||
labels:
|
||||
severity: info
|
||||
annotations:
|
||||
summary: Host Memory is underutilized (instance {{ $labels.instance }})
|
||||
description: "Node memory is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostUnusualNetworkThroughputIn
|
||||
expr: '(sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual network throughput in (instance {{ $labels.instance }})
|
||||
description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostUnusualNetworkThroughputOut
|
||||
expr: '(sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual network throughput out (instance {{ $labels.instance }})
|
||||
description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostUnusualDiskReadRate
|
||||
expr: '(sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual disk read rate (instance {{ $labels.instance }})
|
||||
description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostUnusualDiskWriteRate
|
||||
expr: '(sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual disk write rate (instance {{ $labels.instance }})
|
||||
description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostOutOfDiskSpace
|
||||
expr: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host out of disk space (instance {{ $labels.instance }})
|
||||
description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostDiskWillFillIn24Hours
|
||||
expr: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
|
||||
description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostOutOfInodes
|
||||
expr: '(node_filesystem_files_free / node_filesystem_files * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host out of inodes (instance {{ $labels.instance }})
|
||||
description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostFilesystemDeviceError
|
||||
expr: 'node_filesystem_device_error == 1'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Host filesystem device error (instance {{ $labels.instance }})
|
||||
description: "{{ $labels.instance }}: Device error with the {{ $labels.mountpoint }} filesystem\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostInodesWillFillIn24Hours
|
||||
expr: '(node_filesystem_files_free / node_filesystem_files * 100 < 10 and predict_linear(node_filesystem_files_free[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }})
|
||||
description: "Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostUnusualDiskReadLatency
|
||||
expr: '(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual disk read latency (instance {{ $labels.instance }})
|
||||
description: "Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostUnusualDiskWriteLatency
|
||||
expr: '(rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual disk write latency (instance {{ $labels.instance }})
|
||||
description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostHighCpuLoad
|
||||
expr: '(sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host high CPU load (instance {{ $labels.instance }})
|
||||
description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostCpuIsUnderutilized
|
||||
expr: '(100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 1w
|
||||
labels:
|
||||
severity: info
|
||||
annotations:
|
||||
summary: Host CPU is underutilized (instance {{ $labels.instance }})
|
||||
description: "CPU load is < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostCpuStealNoisyNeighbor
|
||||
expr: '(avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
|
||||
description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostCpuHighIowait
|
||||
expr: '(avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host CPU high iowait (instance {{ $labels.instance }})
|
||||
description: "CPU iowait > 10%. A high iowait means that you are disk or network bound.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostUnusualDiskIo
|
||||
expr: '(rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual disk IO (instance {{ $labels.instance }})
|
||||
description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostContextSwitching
|
||||
expr: '((rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host context switching (instance {{ $labels.instance }})
|
||||
description: "Context switching is growing on the node (> 10000 / s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostSwapIsFillingUp
|
||||
expr: '((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host swap is filling up (instance {{ $labels.instance }})
|
||||
description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostSystemdServiceCrashed
|
||||
expr: '(node_systemd_unit_state{state="failed"} == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host systemd service crashed (instance {{ $labels.instance }})
|
||||
description: "systemd service crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostPhysicalComponentTooHot
|
||||
expr: '((node_hwmon_temp_celsius * ignoring(label) group_left(instance, job, node, sensor) node_hwmon_sensor_label{label!="tctl"} > 75)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host physical component too hot (instance {{ $labels.instance }})
|
||||
description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostNodeOvertemperatureAlarm
|
||||
expr: '(node_hwmon_temp_crit_alarm_celsius == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Host node overtemperature alarm (instance {{ $labels.instance }})
|
||||
description: "Physical node temperature alarm triggered\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostRaidArrayGotInactive
|
||||
expr: '(node_md_state{state="inactive"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Host RAID array got inactive (instance {{ $labels.instance }})
|
||||
description: "RAID array {{ $labels.device }} is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostRaidDiskFailure
|
||||
expr: '(node_md_disks{state="failed"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host RAID disk failure (instance {{ $labels.instance }})
|
||||
description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostKernelVersionDeviations
|
||||
expr: '(count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 6h
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host kernel version deviations (instance {{ $labels.instance }})
|
||||
description: "Different kernel versions are running\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostOomKillDetected
|
||||
expr: '(increase(node_vmstat_oom_kill[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host OOM kill detected (instance {{ $labels.instance }})
|
||||
description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostEdacCorrectableErrorsDetected
|
||||
expr: '(increase(node_edac_correctable_errors_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: info
|
||||
annotations:
|
||||
summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
|
||||
description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostEdacUncorrectableErrorsDetected
|
||||
expr: '(node_edac_uncorrectable_errors_total > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
|
||||
description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostNetworkReceiveErrors
|
||||
expr: '(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host Network Receive Errors (instance {{ $labels.instance }})
|
||||
description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostNetworkTransmitErrors
|
||||
expr: '(rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host Network Transmit Errors (instance {{ $labels.instance }})
|
||||
description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostNetworkInterfaceSaturated
|
||||
expr: '((rate(node_network_receive_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"} > 0.8 < 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host Network Interface Saturated (instance {{ $labels.instance }})
|
||||
description: "The network interface \"{{ $labels.device }}\" on \"{{ $labels.instance }}\" is getting overloaded.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostNetworkBondDegraded
|
||||
expr: '((node_bonding_active - node_bonding_slaves) != 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host Network Bond Degraded (instance {{ $labels.instance }})
|
||||
description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostConntrackLimit
|
||||
expr: '(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host conntrack limit (instance {{ $labels.instance }})
|
||||
description: "The number of conntrack is approaching limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostClockSkew
|
||||
expr: '((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host clock skew (instance {{ $labels.instance }})
|
||||
description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostClockNotSynchronising
|
||||
expr: '(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host clock not synchronising (instance {{ $labels.instance }})
|
||||
description: "Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostRequiresReboot
|
||||
expr: '(node_reboot_required > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 4h
|
||||
labels:
|
||||
severity: info
|
||||
annotations:
|
||||
summary: Host requires reboot (instance {{ $labels.instance }})
|
||||
description: "{{ $labels.instance }} requires a reboot.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
58
roles/prometheus/files/recording/node-exporter.yaml
Normal file
58
roles/prometheus/files/recording/node-exporter.yaml
Normal file
@@ -0,0 +1,58 @@
|
||||
"groups":
|
||||
- "name": "node-exporter.rules"
|
||||
"rules":
|
||||
- "expr": |
|
||||
count without (cpu) (
|
||||
count without (mode) (
|
||||
node_cpu_seconds_total{job="node"}
|
||||
)
|
||||
)
|
||||
"record": "instance:node_num_cpu:sum"
|
||||
- "expr": |
|
||||
1 - avg without (cpu, mode) (
|
||||
rate(node_cpu_seconds_total{job="node", mode="idle"}[1m])
|
||||
)
|
||||
"record": "instance:node_cpu_utilisation:rate1m"
|
||||
- "expr": |
|
||||
(
|
||||
node_load1{job="node"}
|
||||
/
|
||||
instance:node_num_cpu:sum{job="node"}
|
||||
)
|
||||
"record": "instance:node_load1_per_cpu:ratio"
|
||||
- "expr": |
|
||||
1 - (
|
||||
node_memory_MemAvailable_bytes{job="node"}
|
||||
/
|
||||
node_memory_MemTotal_bytes{job="node"}
|
||||
)
|
||||
"record": "instance:node_memory_utilisation:ratio"
|
||||
- "expr": |
|
||||
rate(node_vmstat_pgmajfault{job="node"}[1m])
|
||||
"record": "instance:node_vmstat_pgmajfault:rate1m"
|
||||
- "expr": |
|
||||
rate(node_disk_io_time_seconds_total{job="node", device!=""}[1m])
|
||||
"record": "instance_device:node_disk_io_time_seconds:rate1m"
|
||||
- "expr": |
|
||||
rate(node_disk_io_time_weighted_seconds_total{job="node", device!=""}[1m])
|
||||
"record": "instance_device:node_disk_io_time_weighted_seconds:rate1m"
|
||||
- "expr": |
|
||||
sum without (device) (
|
||||
rate(node_network_receive_bytes_total{job="node", device!="lo"}[1m])
|
||||
)
|
||||
"record": "instance:node_network_receive_bytes_excluding_lo:rate1m"
|
||||
- "expr": |
|
||||
sum without (device) (
|
||||
rate(node_network_transmit_bytes_total{job="node", device!="lo"}[1m])
|
||||
)
|
||||
"record": "instance:node_network_transmit_bytes_excluding_lo:rate1m"
|
||||
- "expr": |
|
||||
sum without (device) (
|
||||
rate(node_network_receive_drop_total{job="node", device!="lo"}[1m])
|
||||
)
|
||||
"record": "instance:node_network_receive_drop_excluding_lo:rate1m"
|
||||
- "expr": |
|
||||
sum without (device) (
|
||||
rate(node_network_transmit_drop_total{job="node", device!="lo"}[1m])
|
||||
)
|
||||
"record": "instance:node_network_transmit_drop_excluding_lo:rate1m"
|
||||
@@ -19,10 +19,15 @@ argument_specs:
|
||||
type: str
|
||||
required: false
|
||||
default: 3650d
|
||||
prometheus_hcloud_token:
|
||||
description: Access token for hetzner cloud service discovery. It will be enabled if this variable is defined
|
||||
type: str
|
||||
prometheus_hcloud_enabled:
|
||||
description: Whether to use hcloud discovery
|
||||
type: bool
|
||||
required: false
|
||||
default: false
|
||||
prometheus_hcloud_token:
|
||||
description: Access token for hetzner cloud service discovery.
|
||||
type: str
|
||||
required: "{{ prometheus_hcloud_enabled }}"
|
||||
prometheus_hcloud_relabel_configs:
|
||||
description: Relabel configs for hcloud
|
||||
type: list
|
||||
@@ -40,11 +45,77 @@ argument_specs:
|
||||
replacement:
|
||||
type: str
|
||||
required: false
|
||||
|
||||
prometheus_install_grafana:
|
||||
description: If true, installs grafana in the same docker network as prometheus and configures it with prometheus as data source
|
||||
description: If true, installs grafana in the same docker network as prometheus
|
||||
type: bool
|
||||
required: false
|
||||
default: false
|
||||
prometheus_install_alertmanager:
|
||||
description: If true, installs alertmanager in the same docker network as prometheus and configures it
|
||||
type: bool
|
||||
required: false
|
||||
default: true
|
||||
prometheus_install_webhook:
|
||||
description: If true, installs webhook server in the same docker network as prometheus and configures it
|
||||
type: bool
|
||||
required: false
|
||||
default: false
|
||||
|
||||
alertmanager_storage_retention:
|
||||
description: Period of time for which alertmanager data is stored for. A number followed by unit (s, m, h, d, w, y). Passed directly to alertmanager role
|
||||
type: str
|
||||
required: false
|
||||
default: "{{ prometheus_storage_retention | default('3650d') }}"
|
||||
|
||||
prometheus_webhook_handlers:
|
||||
description: List of webhook server handlers
|
||||
type: list
|
||||
required: "{{ prometheus_install_webhook }}"
|
||||
elements: dict
|
||||
options:
|
||||
id:
|
||||
description: specifies the ID of your hook. This value is used to create the HTTP endpoint
|
||||
type: str
|
||||
required: true
|
||||
execute-command:
|
||||
description: specifies the command that should be executed when the hook is triggered
|
||||
type: str
|
||||
required: true
|
||||
command-working-directory:
|
||||
description: specifies the working directory that will be used for the script when it's executed
|
||||
type: str
|
||||
required: false
|
||||
pass-arguments-to-command:
|
||||
description: >
|
||||
specifies the list of arguments that will be passed to the
|
||||
command. See for more info:
|
||||
https://github.com/adnanh/webhook/blob/master/docs/Referencing-Request-Values.md
|
||||
type: list
|
||||
required: false
|
||||
elements: dict
|
||||
options:
|
||||
source:
|
||||
description: Source of the argument. Use `string` to specify argument here.
|
||||
type: str
|
||||
required: true
|
||||
choices:
|
||||
- string
|
||||
- header
|
||||
- url
|
||||
- request
|
||||
- payload
|
||||
name:
|
||||
description: Argument if source is string, otherwise the source attribute name.
|
||||
type: str
|
||||
required: true
|
||||
|
||||
prometheus_ping_hosts:
|
||||
description: List of hosts to gather ping metrics for using prometheus blackbox exporter
|
||||
type: list
|
||||
required: false
|
||||
default: []
|
||||
elements: str
|
||||
|
||||
# All options after this will be passed directly to the container role
|
||||
docker_service_suffix:
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
---
|
||||
|
||||
- name: Prometheus container
|
||||
import_role:
|
||||
include_role:
|
||||
name: container
|
||||
vars:
|
||||
docker_service: prometheus
|
||||
@@ -13,13 +13,51 @@
|
||||
- "--storage.tsdb.retention={{ prometheus_storage_retention }}"
|
||||
- "--web.console.libraries=/usr/share/prometheus/console_libraries"
|
||||
- "--web.console.templates=/usr/share/prometheus/consoles"
|
||||
|
||||
docker_volumes:
|
||||
docker_mounts:
|
||||
- name: data
|
||||
path: /prometheus
|
||||
- template: prometheus.yml
|
||||
path: /etc/prometheus/prometheus.yml
|
||||
|
||||
- copypath: recording
|
||||
path: /etc/prometheus/recording
|
||||
- copypath: alerting
|
||||
path: /etc/prometheus/alerting
|
||||
|
||||
- name: Alertmanager container for prometheus
|
||||
include_role:
|
||||
name: alertmanager
|
||||
vars:
|
||||
docker_networks:
|
||||
- name: prometheus
|
||||
when: prometheus_install_alertmanager
|
||||
|
||||
- name: Webhook container for prometheus
|
||||
include_role:
|
||||
name: container
|
||||
vars:
|
||||
docker_service: prometheus_webhook
|
||||
docker_image: thecatlady/webhook
|
||||
reverse_proxy_type: none
|
||||
docker_mounts:
|
||||
- template: webhooks.yaml
|
||||
path: /config/hooks.yml
|
||||
docker_networks:
|
||||
- name: prometheus
|
||||
when: prometheus_install_webhook
|
||||
|
||||
- name: Blackbox exporter for prometheus
|
||||
include_role:
|
||||
name: container
|
||||
vars:
|
||||
docker_service: blackbox_exporter
|
||||
docker_image: prom/blackbox-exporter
|
||||
reverse_proxy_type: none
|
||||
docker_mounts:
|
||||
- template: blackbox_exporter.yml
|
||||
path: /etc/blackbox_exporter/config.yml
|
||||
docker_networks:
|
||||
- name: prometheus
|
||||
|
||||
- name: Grafana container for prometheus
|
||||
include_role:
|
||||
name: grafana
|
||||
|
||||
8
roles/prometheus/templates/blackbox_exporter.yml.j2
Normal file
8
roles/prometheus/templates/blackbox_exporter.yml.j2
Normal file
@@ -0,0 +1,8 @@
|
||||
---
|
||||
|
||||
modules:
|
||||
icmp:
|
||||
prober: icmp
|
||||
timeout: 5s
|
||||
icmp:
|
||||
preferred_ip_protocol: "ip4"
|
||||
@@ -1,5 +1,7 @@
|
||||
---
|
||||
|
||||
# {{ ansible_managed }}
|
||||
|
||||
# my global config
|
||||
global:
|
||||
scrape_interval: {{ prometheus_scrape_interval }}
|
||||
@@ -10,25 +12,54 @@ alerting:
|
||||
alertmanagers:
|
||||
- static_configs:
|
||||
- targets:
|
||||
# - alertmanager:9093
|
||||
{% if prometheus_install_alertmanager %}
|
||||
- alertmanager:9093
|
||||
{%- endif %}
|
||||
|
||||
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
|
||||
rule_files:
|
||||
# - "first_rules.yml"
|
||||
# - "second_rules.yml"
|
||||
- "/etc/prometheus/recording/*.yaml"
|
||||
- "/etc/prometheus/alerting/*.yaml"
|
||||
|
||||
# A scrape configuration containing exactly one endpoint to scrape:
|
||||
# Here it's Prometheus itself.
|
||||
scrape_configs:
|
||||
- job_name: "prometheus"
|
||||
static_configs:
|
||||
- targets: ["localhost:9090"]
|
||||
|
||||
{% if prometheus_hcloud_token is defined %}
|
||||
- job_name: "blackbox"
|
||||
static_configs:
|
||||
- targets: ["blackbox_exporter:9115"]
|
||||
|
||||
{% if prometheus_ping_hosts | length > 0 %}
|
||||
- job_name: "icmp"
|
||||
metrics_path: "/probe"
|
||||
params:
|
||||
module: ["icmp"]
|
||||
static_configs:
|
||||
- targets:
|
||||
{% for host in prometheus_ping_hosts %}
|
||||
- "{{ host.name }}::{{ host.type | default('monitored') }}"
|
||||
{% endfor %}
|
||||
relabel_configs:
|
||||
- source_labels: [__address__]
|
||||
regex: '(.+)::(.+)'
|
||||
target_label: __param_target
|
||||
replacement: '${1}'
|
||||
- source_labels: [__address__]
|
||||
regex: '(.+)::(.+)'
|
||||
target_label: host_type
|
||||
replacement: '${2}'
|
||||
- source_labels: [__param_target]
|
||||
target_label: instance
|
||||
- target_label: __address__
|
||||
replacement: blackbox_exporter:9115
|
||||
{%- endif %}
|
||||
|
||||
{% if prometheus_hcloud_enabled %}
|
||||
- job_name: hcloud
|
||||
hetzner_sd_configs:
|
||||
- role: hcloud
|
||||
authorization:
|
||||
credentials: {{ prometheus_hcloud_token }}
|
||||
relabel_configs: {{ prometheus_hcloud_relabel_configs }}
|
||||
{% endif %}
|
||||
{%- endif %}
|
||||
|
||||
5
roles/prometheus/templates/webhooks.yaml.j2
Normal file
5
roles/prometheus/templates/webhooks.yaml.j2
Normal file
@@ -0,0 +1,5 @@
|
||||
---
|
||||
|
||||
{% for item in prometheus_webhook_handlers %}
|
||||
- {{ item }}
|
||||
{% endfor %}
|
||||
Reference in New Issue
Block a user