Compare commits

..

19 Commits

Author SHA1 Message Date
uumas
16e0d6eadb update smtp docs 2023-12-21 01:36:43 +02:00
uumas
298f053835 authentik: add mounts 2023-12-21 01:36:13 +02:00
uumas
6acb2d17dd grafana: add oauth support 2023-12-21 01:35:34 +02:00
uumas
b6e379a3f2 add alertmanager role 2023-12-21 01:33:54 +02:00
uumas
c5a54827d4 prometheus: add blackbox exporter 2023-12-21 01:32:53 +02:00
uumas
6f1bcecf25 container: formatting 2023-12-21 01:31:34 +02:00
uumas
5bf47c73a7 container: add switch for hcloud 2023-12-21 01:31:05 +02:00
uumas
681b788ac4 prometheus: add support for installing webhook server 2023-12-21 01:22:53 +02:00
uumas
0eeeecb549 prometheus: config cleanup, actually use recording and alerting rules 2023-12-21 01:19:23 +02:00
uumas
44665bae12 prometheus: install alertmanager 2023-12-21 01:18:27 +02:00
uumas
8dc0ec798f promehteus: add alerting and recording rules 2023-12-21 01:14:31 +02:00
uumas
e9d1eed01b container: flush handler in the end 2023-12-21 01:12:24 +02:00
uumas
8d2999fe87 container: fix check mode 2023-12-21 01:12:15 +02:00
uumas
d80641623e container: switch to fqcns 2023-12-21 01:12:02 +02:00
uumas
22227d9ffc container: small fixes 2023-12-21 01:10:56 +02:00
uumas
3e9ea95ad7 container: add copypath mounts for copying whole directories to mount in container 2023-12-21 01:08:02 +02:00
uumas
d76dbf6e3c container: add restart container handler for changed template 2023-12-21 01:03:52 +02:00
uumas
ef5d83b188 prometheus: use docker_mounts instead of volumes 2023-12-21 01:00:39 +02:00
uumas
74e9eb8dcb prometheus: include instead of import 2023-12-21 01:00:04 +02:00
25 changed files with 997 additions and 37 deletions

View File

@@ -5,8 +5,8 @@ These variables are required by multiple roles. Example values included.
timezone: 'Europe/Helsinki'
admin_email: 'admin@domain.tld'
smtp_server: smtp.domain.tld
smtp_from: sender@domain.tld
smtp_server: smtp.domain.tld # SMTP server, must be reachable on port 587 with TLS
smtp_from: sender@domain.tld # Address to send mail from
```
# Optional variables
@@ -15,7 +15,7 @@ These variables are used by multiple roles and have the following default values
```
reverse_proxy_type: caddy # Allowed values: caddy, traefik, none
smtp_from: # not defined, no smtp login by default
smtp_user: # not defined, no smtp login by default
smtp_pw: # not defined, see above
```

View File

@@ -0,0 +1 @@
Sets up a prometheus alertmanager docker container.

View File

@@ -0,0 +1,8 @@
---
alertmanager_storage_retention: "{{ prometheus_storage_retention | default('3650d') }}"
alertmanager_smtp_server: "{{ smtp_server | default('') }}"
alertmanager_smtp_from: "{{ smtp_from | default('') }}"
alertmanager_smtp_user: "{{ smtp_user | default('') }}"
alertmanager_smtp_pw: "{{ smtp_pw | default('') }}"

View File

@@ -0,0 +1,79 @@
---
argument_specs:
main:
short_description: Prometheus alertmanager docker container
options:
alertmanager_storage_retention:
description: Period of time for which alertmanager data is stored. A number followed by unit (s, m, h, d, w, y).
type: str
required: false
default: "{{ prometheus_storage_retention | default('3650d') }}"
alertmanager_smtp_server:
description: SMTP server to use for sending mail. Must be reachable on port 587. Emails are not sent if not defined.
type: str
required: false
default: "{{ smtp_server | default('') }}"
alertmanager_smtp_from:
description: Address to send mail from. Required if sending emails.
type: str
required: false
default: "{{ smtp_from | default('') }}"
alertmanager_smtp_user:
description: User to login to smtp server with. No authentication if not defined.
type: str
required: false
default: "{{ smtp_user | default('') }}"
alertmanager_smtp_pw:
description: Password for the smtp user
type: str
required: false
default: "{{ smtp_pw | default('') }}"
smtp_server:
description: Global smtp server value, default for alertmanager_smtp_server
type: str
required: false
smtp_from:
description: Global smtp from value, default for alertmanager_smtp_from
type: str
required: "{{ alertmanager_smtp_server | length > 0 and alertmanager_smtp_from | length == 0 }}"
smtp_user:
description: Global smtp user value, default for alertmanager_smtp_user
type: str
required: false
smtp_pw:
description: Global smtp password value, default for alertmanager_smtp_pw
type: str
required: "{{ alertmanager_smtp_server | length > 0 and alertmanager_smtp_user | length > 0 and alertmanager_smtp_pw | length == 0 }}"
# All options after this will be passed directly to the container role
docker_service_suffix:
description: "Passed to container role"
required: false
docker_host_user:
description: "Passed to container role"
required: false
database_passwords:
description: "Passed to container role"
required: false
docker_additional_services:
description: "Passed to container role"
required: false
docker_volume_type:
description: "Passed to container role"
required: false
reverse_proxy_type:
description: "Passed to container role"
required: false
ports:
description: "Passed to container role"
required: false
docker_vhost_domains:
description: "Passed to container role"
required: false
docker_entrypoint:
description: "Passed to container role"
required: false

View File

@@ -0,0 +1,19 @@
---
- name: Prometheus alertmanager container
include_role:
name: container
vars:
docker_service: alertmanager
docker_image: prom/alertmanager
reverse_proxy_type: none
docker_command:
- "--config.file=/etc/alertmanager/alertmanager.yml"
- "--storage.path=/alertmanager"
- "--data.retention={{ alertmanager_storage_retention }}"
docker_mounts:
- name: data
path: /alertmanager
- template: alertmanager.yml
path: /etc/alertmanager/alertmanager.yml

View File

@@ -0,0 +1,68 @@
---
# {{ ansible_managed }}
global:
# The smarthost and SMTP sender used for mail notifications.
{% if alertmanager_smtp_server | length > 0 %}
smtp_smarthost: '{{ alertmanager_smtp_server }}:587'
smtp_from: '{{ alertmanager_smtp_from }}'
{% if alertmanager_smtp_user | length > 0 %}
smtp_auth_username: '{{ alertmanager_smtp_user }}'
smtp_auth_password: '{{ alertmanager_smtp_pw }}'
{% endif %}
{% endif %}
# The directory from which notification templates are read.
templates:
- '/etc/alertmanager/template/*.tmpl'
# The root route on which each incoming alert enters.
route:
# The labels by which incoming alerts are grouped together. For example,
# multiple alerts coming in for cluster=A and alertname=LatencyHigh would
# be batched into a single group.
#
# To aggregate by all possible labels use '...' as the sole label name.
# This effectively disables aggregation entirely, passing through all
# alerts as-is. This is unlikely to be what you want, unless you have
# a very low alert volume or your upstream notification system performs
# its own grouping. Example: group_by: [...]
group_by: ['alertname', 'cluster', 'service']
# When a new group of alerts is created by an incoming alert, wait at
# least 'group_wait' to send the initial notification.
# This way ensures that you get multiple alerts for the same group that start
# firing shortly after another are batched together on the first
# notification.
group_wait: 30s
# When the first notification was sent, wait 'group_interval' to send a batch
# of new alerts that started firing for that group.
group_interval: 5m
# If an alert has successfully been sent, wait 'repeat_interval' to
# resend them.
repeat_interval: 3h
# A default receiver
receiver: uumas_email
# All the above attributes are inherited by all child routes and can be
# overwritten on each.
# The child route trees.
routes: {{ alertmanager_routes }}
inhibit_rules:
- source_matchers: [severity="critical"]
target_matchers: [severity="warning"]
# Apply inhibition if the alertname is the same.
# CAUTION:
# If all label names listed in `equal` are missing
# from both the source and target alerts,
# the inhibition rule will apply!
equal: [alertname, cluster, service]
receivers: {{ alertmanager_receivers }}

View File

@@ -1,5 +1,9 @@
---
- name: Set docker service full name (required because docker_mounts uses it)
set_fact:
authentik_service_name: "authentik{{ '_' + docker_service_suffix if docker_service_suffix is defined else '' }}"
- name: Authentik container
ansible.builtin.import_role:
name: container
@@ -13,6 +17,13 @@
docker_additional_services:
- redis
docker_env: "{{ authentik_common_env | combine(authentik_env) }}"
docker_mounts:
- path: /media
absolute_name: "{{ authentik_service_name }}_media"
- path: /templates
absolute_name: "{{ authentik_service_name }}_templates"
- path: /certs
name: "{{ authentik_service_name }}_certs"
- name: Authentik worker container
ansible.builtin.import_role:
@@ -24,3 +35,10 @@
- worker
reverse_proxy_type: none
docker_env: "{{ authentik_common_env | combine(authentik_env) }}"
docker_mounts:
- path: /media
absolute_name: "{{ authentik_service_name }}_media"
- path: /templates
absolute_name: "{{ authentik_service_name }}_templates"
- path: /certs
name: "{{ authentik_service_name }}_certs"

View File

@@ -0,0 +1,7 @@
---
- name: Restart container {{ docker_service_name }}
community.docker.docker_container:
name: "{{ docker_service_name }}"
restart: true
when: not container_out.changed

View File

@@ -118,6 +118,10 @@ argument_specs:
description: "Name of template without .j2 extension. Will be templated at /opt/<service>[/suffix]/mounts/<template> and mounted inside the container."
type: str
required: false
copypath:
description: "Name of file or directory to copy. Will be deployed from files/<copypath> to /opt/<service>[/suffix]/mounts/<copypath> and mounted inside the container."
type: str
required: false
reverse_proxy_type:
description: "Defines which kind of reverse proxy to configure for the container. Traefik support is experimental."
type: str

View File

@@ -34,7 +34,7 @@
source: pull
force_source: true
register: pulled_image
when: dockerfile is not defined or dockerfile | length == 0
when: not dockerfile_needed
- name: Set container_image variable
set_fact:

View File

@@ -39,6 +39,7 @@
- name: Set assistive variables
set_fact:
template_mounts_needed: "{{ docker_mounts | selectattr('template', 'defined') | list | length > 0 }}"
copypath_mounts_needed: "{{ docker_mounts | selectattr('copypath', 'defined') | list | length > 0 }}"
volumes_needed: "{{ docker_mounts | selectattr('name', 'defined') | list | length > 0 or docker_database != 'none' }}"
dockerfile_needed: "{{ dockerfile | length > 0 }}"
db_config_mounts_needed: "{{ docker_mariadb_config | length > 0 }}"
@@ -48,8 +49,8 @@
named_volumes_needed: "{{ volumes_needed and docker_volume_type == 'named' }}"
- name: Set even more assistive variables
set_fact:
create_opt_directory: "{{ dockerfile_needed or docker_host_user or bind_volumes_needed or template_mounts_needed or db_config_mounts_needed }}"
create_mounts_directory: "{{ bind_volumes_needed or template_mounts_needed or db_config_mounts_needed }}"
create_opt_directory: "{{ dockerfile_needed or docker_host_user or bind_volumes_needed or template_mounts_needed or copypath_mounts_needed or db_config_mounts_needed }}"
create_mounts_directory: "{{ bind_volumes_needed or template_mounts_needed or copypath_mounts_needed or db_config_mounts_needed }}"
- name: Set docker service full name
set_fact:

View File

@@ -7,16 +7,16 @@
when: docker_network_mode is not defined or docker_network_mode != 'host' or docker_networks | length > 0
block:
- name: Set networks variable to {{ docker_service_name }}
set_fact:
ansible.builtin.set_fact:
container_networks:
- name: "{{ docker_service_name }}"
when: docker_networks | length == 0
- name: Set networks variable to {{ docker_networks }}
set_fact:
ansible.builtin.set_fact:
container_networks: "{{ docker_networks }}"
when: docker_networks | length > 0
- name: Create docker networks
docker_network:
community.docker.docker_network:
name: "{{ item.name }}"
loop: "{{ container_networks }}"
@@ -25,7 +25,7 @@
when: reverse_proxy_type != 'none'
- name: Create directory /opt/{{ docker_service }}
file:
ansible.builtin.file:
path: "/opt/{{ docker_service }}"
state: directory
mode: 0755
@@ -42,18 +42,18 @@
when: create_opt_directory and docker_service_suffix is defined
block:
- name: Create directory /opt/{{ docker_service + '/' + docker_service_suffix }}
file:
ansible.builtin.file:
path: "/opt/{{ docker_service }}/{{ docker_service_suffix }}"
state: directory
owner: "{{ user.uid | default(omit) }}"
group: "{{ user.group | default(omit) }}"
mode: 0755
- name: Set container_workdir variable
set_fact:
ansible.builtin.set_fact:
container_workdir: /opt/{{ docker_service }}/{{ docker_service_suffix }}
- name: Set container_workdir variable
set_fact:
ansible.builtin.set_fact:
container_workdir: /opt/{{ docker_service }}
when: docker_service_suffix is not defined
@@ -61,11 +61,11 @@
when: create_mounts_directory
block:
- name: Set docker_mounts_dir
set_fact:
ansible.builtin.set_fact:
docker_mounts_dir: "{{ container_workdir }}/mounts"
- name: Create directory {{ docker_mounts_dir }}
file:
ansible.builtin.file:
path: "{{ docker_mounts_dir }}"
state: directory
owner: "{{ user.uid | default(omit) }}"
@@ -84,9 +84,9 @@
import_tasks: volumes.yml
- name: "Container for {{ docker_service_name }}"
docker_container:
community.docker.docker_container:
name: "{{ docker_service_name }}"
image: "{{ container_image.Id if container_image != '' else docker_image }}"
image: "{{ container_image.Id if (not ansible_check_mode) or (container_image | length > 0) else docker_image }}"
user: "{{ docker_user if docker_host_user else omit }}"
mounts: "{{ docker_volume_definition }}"
published_ports: "{{ container_published_ports + docker_published_ports }}"
@@ -101,6 +101,9 @@
register: container_out
- name: Reset docker_mounts if converted from docker_volumes
set_fact:
ansible.builtin.set_fact:
docker_mounts: []
when: final_docker_volumes | length > 0
- name: Flush handlers to trigger container restart
ansible.builtin.meta: flush_handlers

View File

@@ -14,12 +14,12 @@
state: directory
owner: "{{ mount_owner if (item.set_owner is not defined or item.set_owner) and mount_owner | length > 0 else omit }}"
group: "{{ mount_group if (item.set_group is not defined or item.set_group) and mount_group | length > 0 else omit }}"
when: item.name is defined
when: item.name is defined and docker_volume_type == 'bind'
loop: "{{ docker_mounts }}"
- name: Set docker_volume_definition for named binds
set_fact:
docker_volume_definition: "{{ docker_volume_definition + [{'source': docker_mounts_dir + '/' + item.name, 'target': item.path, 'type': 'bind'}] }}"
when: item.name is defined
when: item.name is defined and docker_volume_type == 'bind'
loop: "{{ docker_mounts }}"
- name: Template docker template mounts for {{ docker_service_name }}
@@ -28,12 +28,26 @@
dest: "{{ docker_mounts_dir }}/{{ item.template }}"
when: item.template is defined
loop: "{{ docker_mounts }}"
notify: Restart container {{ docker_service_name }}
- name: Set docker_volume_definition for template mounts
set_fact:
docker_volume_definition: "{{ docker_volume_definition + [{'source': docker_mounts_dir + '/' + item.template, 'target': item.path, 'type': 'bind', 'read_only': true}] }}"
when: item.template is defined
loop: "{{ docker_mounts }}"
- name: Copy docker copypath mounts for {{ docker_service_name }}
copy:
src: "files/{{ item.copypath }}"
dest: "{{ docker_mounts_dir }}/"
when: item.copypath is defined
loop: "{{ docker_mounts }}"
notify: Restart container {{ docker_service_name }}
- name: Set docker_volume_definition for copypath mounts
set_fact:
docker_volume_definition: "{{ docker_volume_definition + [{'source': docker_mounts_dir + '/' + item.copypath, 'target': item.path, 'type': 'bind', 'read_only': true}] }}"
when: item.copypath is defined
loop: "{{ docker_mounts }}"
- name: Set docker_volume_definition for named volumes
set_fact:
docker_volume_definition: "{{ docker_volume_definition + [{'source': docker_service_name + '_' + item.name, 'target': item.path, 'type': 'volume'}] }}"

View File

@@ -0,0 +1,9 @@
---
grafana_oauth_enabled: false
grafana_oauth_scopes:
- openid
- profile
- email
grafana_oauth_allow_sign_up: true
grafana_oauth_auto_login: false

View File

@@ -5,6 +5,59 @@ argument_specs:
short_description: Grafana
description: "Sets up a grafana docker container"
options:
grafana_oauth_enabled:
description: Enables generic OAuth2 authentication.
type: bool
required: false
default: false
grafana_oauth_name:
description: Name that refers to the generic OAuth2 authentication from the Grafana user interface.
type: str
required: false
grafana_oauth_client_id:
description: Client ID provided by your OAuth2 app.
type: str
required: "{{ grafana_oauth_enabled }}"
grafana_oauth_client_secret:
description: Client secret provided by your OAuth2 app.
type: str
required: "{{ grafana_oauth_enabled }}"
grafana_oauth_auth_url:
description: Authorization endpoint of your OAuth2 provider.
type: str
required: "{{ grafana_oauth_enabled }}"
grafana_oauth_token_url:
description: Endpoint used to obtain the OAuth2 access token.
type: str
required: "{{ grafana_oauth_enabled }}"
grafana_oauth_api_url:
description: Endpoint used to obtain user information compatible with OpenID UserInfo.
type: str
required: "{{ grafana_oauth_enabled }}"
grafana_oauth_scopes:
description: List of OAuth2 scopes.
type: list
required: false
items: str
default:
- openid
- profile
- email
grafana_oauth_role_attribute_path:
description: JMESPath expression to use for Grafana role lookup. Grafana will first evaluate the expression using the OAuth2 ID token. If no role is found, the expression will be evaluated using the user information obtained from the UserInfo endpoint. The result of the evaluation should be a valid Grafana role (Viewer, Editor, Admin or GrafanaAdmin).
type: str
required: false
grafana_oauth_allow_sign_up:
description: Controls Grafana user creation through the generic OAuth2 login. Only existing Grafana users can log in with generic OAuth if set to false.
type: bool
required: false
default: true
grafana_oauth_auto_login:
description: Set to true to enable users to bypass the login screen and automatically log in. This setting is ignored if you configure multiple auth providers to use auto-login.
type: bool
required: false
default: false
database_passwords:
description: "Passed to container role"
required: true

View File

@@ -7,7 +7,7 @@
docker_service: grafana
docker_image: grafana/grafana
docker_image_http_port: 3000
docker_volumes:
docker_mounts:
- name: data
path: /var/lib/grafana
docker_database: postgres
@@ -19,3 +19,15 @@
GF_DATABASE_PASSWORD: "{{ database_passwords.grafana }}"
GF_SERVER_DOMAIN: "{{ docker_vhost_domains.grafana[0] }}"
GF_SERVER_ROOT_URL: "https://{{ docker_vhost_domains.grafana[0] }}"
GF_AUTH_GENERIC_OAUTH_ENABLED: "{{ 'true' if grafana_oauth_enabled else 'false' }}"
GF_AUTH_GENERIC_OAUTH_NAME: "{{ grafana_oauth_name | default(omit) }}"
GF_AUTH_GENERIC_OAUTH_CLIENT_ID: "{{ grafana_oauth_client_id }}"
GF_AUTH_GENERIC_OAUTH_CLIENT_SECRET: "{{ grafana_oauth_client_secret }}"
GF_AUTH_GENERIC_OAUTH_AUTH_URL: "{{ grafana_oauth_auth_url }}"
GF_AUTH_GENERIC_OAUTH_TOKEN_URL: "{{ grafana_oauth_token_url }}"
GF_AUTH_GENERIC_OAUTH_API_URL: "{{ grafana_oauth_api_url }}"
GF_AUTH_GENERIC_OAUTH_SCOPES: "{{ grafana_oauth_scopes | join(' ') }}"
GF_AUTH_GENERIC_OAUTH_ALLOW_SIGN_UP: "{{ 'true' if grafana_oauth_allow_sign_up else 'false' }}"
GF_AUTH_GENERIC_OAUTH_AUTO_LOGIN: "{{ 'true' if grafana_oauth_auto_login else 'false' }}"
GF_AUTH_GENERIC_OAUTH_ROLE_ATTRIBUTE_PATH: "{{ grafana_oauth_role_attribute_path | default(omit) }}"

View File

@@ -3,5 +3,14 @@
prometheus_scrape_interval: 5s
prometheus_evaluation_interval: 15s
prometheus_storage_retention: 3650d
prometheus_install_grafana: false
prometheus_hcloud_enabled: false
prometheus_hcloud_relabel_configs: []
prometheus_install_alertmanager: true
prometheus_install_webhook: false
prometheus_install_grafana: false
alertmanager_storage_retention: "{{ prometheus_storage_retention }}"
prometheus_ping_hosts: []

View File

@@ -0,0 +1,95 @@
---
groups:
- name: BlackboxExporter
rules:
- alert: BlackboxAllWanProbesFailed
expr: 'sum by (host_type) (probe_success{host_type="wan"})==0'
for: 5s
labels:
severity: critical
annotations:
summary: Lost internet access
description: Failed to contact any wan probes
- alert: BlackboxProbeFailed
expr: 'probe_success == 0'
for: 0m
labels:
severity: error
annotations:
summary: Unable to reach (instance {{ $labels.instance }})
description: "Probe failed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: BlackboxConfigurationReloadFailure
expr: 'blackbox_exporter_config_last_reload_successful != 1'
for: 0m
labels:
severity: warning
annotations:
summary: Blackbox configuration reload failure (instance {{ $labels.instance }})
description: "Blackbox configuration reload failure\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: BlackboxSlowProbe
expr: 'avg_over_time(probe_duration_seconds[1m]) > 1'
for: 1m
labels:
severity: warning
annotations:
summary: Getting slow responses from (instance {{ $labels.instance }})
description: "Blackbox probe took more than 1s to complete\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: BlackboxProbeHttpFailure
expr: 'probe_http_status_code <= 199 OR probe_http_status_code >= 400'
for: 0m
labels:
severity: error
annotations:
summary: HTTP failure (instance {{ $labels.instance }})
description: "HTTP status code is not 200-399\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: BlackboxSslCertificateWillExpireSoon
expr: '3 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 20'
for: 0m
labels:
severity: warning
annotations:
summary: SSL certificate will expire soon (instance {{ $labels.instance }})
description: "SSL certificate expires in less than 20 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: BlackboxSslCertificateWillExpireSoon
expr: '0 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 3'
for: 0m
labels:
severity: error
annotations:
summary: SSL certificate expiry imminent (instance {{ $labels.instance }})
description: "SSL certificate expires in less than 3 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: BlackboxSslCertificateExpired
expr: 'round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 0'
for: 0m
labels:
severity: critical
annotations:
summary: SSL certificate expired (instance {{ $labels.instance }})
description: "SSL certificate has expired already\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: BlackboxProbeSlowHttp
expr: 'avg_over_time(probe_http_duration_seconds[1m]) > 1'
for: 1m
labels:
severity: warning
annotations:
summary: Slow HTTP responses from (instance {{ $labels.instance }})
description: "HTTP request took more than 1s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: BlackboxProbeSlowPing
expr: 'avg_over_time(probe_icmp_duration_seconds[1m]) > 1'
for: 1m
labels:
severity: warning
annotations:
summary: Slow ping responses from (instance {{ $labels.instance }})
description: "Blackbox ping took more than 1s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View File

@@ -0,0 +1,349 @@
---
groups:
- name: NodeExporter
rules:
- alert: HostOutOfMemory
expr: '(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host out of memory (instance {{ $labels.instance }})
description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostMemoryUnderMemoryPressure
expr: '(rate(node_vmstat_pgmajfault[1m]) > 1000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host memory under memory pressure (instance {{ $labels.instance }})
description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostMemoryIsUnderutilized
expr: '(100 - (rate(node_memory_MemAvailable_bytes[30m]) / node_memory_MemTotal_bytes * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 1w
labels:
severity: info
annotations:
summary: Host Memory is underutilized (instance {{ $labels.instance }})
description: "Node memory is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualNetworkThroughputIn
expr: '(sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 5m
labels:
severity: warning
annotations:
summary: Host unusual network throughput in (instance {{ $labels.instance }})
description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualNetworkThroughputOut
expr: '(sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 5m
labels:
severity: warning
annotations:
summary: Host unusual network throughput out (instance {{ $labels.instance }})
description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskReadRate
expr: '(sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 5m
labels:
severity: warning
annotations:
summary: Host unusual disk read rate (instance {{ $labels.instance }})
description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskWriteRate
expr: '(sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host unusual disk write rate (instance {{ $labels.instance }})
description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostOutOfDiskSpace
expr: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host out of disk space (instance {{ $labels.instance }})
description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostDiskWillFillIn24Hours
expr: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostOutOfInodes
expr: '(node_filesystem_files_free / node_filesystem_files * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host out of inodes (instance {{ $labels.instance }})
description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostFilesystemDeviceError
expr: 'node_filesystem_device_error == 1'
for: 0m
labels:
severity: critical
annotations:
summary: Host filesystem device error (instance {{ $labels.instance }})
description: "{{ $labels.instance }}: Device error with the {{ $labels.mountpoint }} filesystem\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostInodesWillFillIn24Hours
expr: '(node_filesystem_files_free / node_filesystem_files * 100 < 10 and predict_linear(node_filesystem_files_free[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }})
description: "Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskReadLatency
expr: '(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host unusual disk read latency (instance {{ $labels.instance }})
description: "Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskWriteLatency
expr: '(rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host unusual disk write latency (instance {{ $labels.instance }})
description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostHighCpuLoad
expr: '(sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 10m
labels:
severity: warning
annotations:
summary: Host high CPU load (instance {{ $labels.instance }})
description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostCpuIsUnderutilized
expr: '(100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 1w
labels:
severity: info
annotations:
summary: Host CPU is underutilized (instance {{ $labels.instance }})
description: "CPU load is < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostCpuStealNoisyNeighbor
expr: '(avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: warning
annotations:
summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostCpuHighIowait
expr: '(avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: warning
annotations:
summary: Host CPU high iowait (instance {{ $labels.instance }})
description: "CPU iowait > 10%. A high iowait means that you are disk or network bound.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskIo
expr: '(rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 5m
labels:
severity: warning
annotations:
summary: Host unusual disk IO (instance {{ $labels.instance }})
description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostContextSwitching
expr: '((rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: warning
annotations:
summary: Host context switching (instance {{ $labels.instance }})
description: "Context switching is growing on the node (> 10000 / s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostSwapIsFillingUp
expr: '((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host swap is filling up (instance {{ $labels.instance }})
description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostSystemdServiceCrashed
expr: '(node_systemd_unit_state{state="failed"} == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: warning
annotations:
summary: Host systemd service crashed (instance {{ $labels.instance }})
description: "systemd service crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostPhysicalComponentTooHot
expr: '((node_hwmon_temp_celsius * ignoring(label) group_left(instance, job, node, sensor) node_hwmon_sensor_label{label!="tctl"} > 75)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 5m
labels:
severity: warning
annotations:
summary: Host physical component too hot (instance {{ $labels.instance }})
description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNodeOvertemperatureAlarm
expr: '(node_hwmon_temp_crit_alarm_celsius == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: critical
annotations:
summary: Host node overtemperature alarm (instance {{ $labels.instance }})
description: "Physical node temperature alarm triggered\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostRaidArrayGotInactive
expr: '(node_md_state{state="inactive"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: critical
annotations:
summary: Host RAID array got inactive (instance {{ $labels.instance }})
description: "RAID array {{ $labels.device }} is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostRaidDiskFailure
expr: '(node_md_disks{state="failed"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host RAID disk failure (instance {{ $labels.instance }})
description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostKernelVersionDeviations
  # Dots are escaped (\\. in the single-quoted YAML scalar -> \. in the PromQL
  # regex) so the capture matches a literal x.y.z kernel version instead of
  # letting '.' match any character.
  expr: '(count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+\\.[0-9]+\\.[0-9]+).*")) by (kernel)) > 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
  for: 6h
  labels:
    severity: warning
  annotations:
    summary: Host kernel version deviations (instance {{ $labels.instance }})
    description: "Different kernel versions are running\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostOomKillDetected
expr: '(increase(node_vmstat_oom_kill[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: warning
annotations:
summary: Host OOM kill detected (instance {{ $labels.instance }})
description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostEdacCorrectableErrorsDetected
  expr: '(increase(node_edac_correctable_errors_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
  for: 0m
  labels:
    severity: info
  annotations:
    summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
    # Window in the text now matches the [1m] range in the expression above
    # (the upstream text claimed 5 minutes).
    description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last minute.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostEdacUncorrectableErrorsDetected
  # Checks the cumulative counter (no range selector), so the alert fires as
  # long as any uncorrectable error has ever been recorded since boot.
  expr: '(node_edac_uncorrectable_errors_total > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
    # No time window is claimed here because the expression is a total,
    # not an increase over a range.
    description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNetworkReceiveErrors
  expr: '(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: Host Network Receive Errors (instance {{ $labels.instance }})
    # $value is the errors-per-packet ratio (> 0.01), not a count, so
    # printf "%.0f" would always render 0; humanizePercentage shows it usefully.
    description: "Host {{ $labels.instance }} interface {{ $labels.device }} has a receive error rate of {{ $value | humanizePercentage }} over the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNetworkTransmitErrors
  expr: '(rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: Host Network Transmit Errors (instance {{ $labels.instance }})
    # $value is the errors-per-packet ratio (> 0.01), not a count, so
    # printf "%.0f" would always render 0; humanizePercentage shows it usefully.
    description: "Host {{ $labels.instance }} interface {{ $labels.device }} has a transmit error rate of {{ $value | humanizePercentage }} over the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNetworkInterfaceSaturated
expr: '((rate(node_network_receive_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"} > 0.8 < 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 1m
labels:
severity: warning
annotations:
summary: Host Network Interface Saturated (instance {{ $labels.instance }})
description: "The network interface \"{{ $labels.device }}\" on \"{{ $labels.instance }}\" is getting overloaded.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNetworkBondDegraded
expr: '((node_bonding_active - node_bonding_slaves) != 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host Network Bond Degraded (instance {{ $labels.instance }})
description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostConntrackLimit
expr: '(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 5m
labels:
severity: warning
annotations:
summary: Host conntrack limit (instance {{ $labels.instance }})
description: "The number of conntrack is approaching limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostClockSkew
expr: '((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 10m
labels:
severity: warning
annotations:
summary: Host clock skew (instance {{ $labels.instance }})
description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostClockNotSynchronising
expr: '(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host clock not synchronising (instance {{ $labels.instance }})
description: "Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostRequiresReboot
expr: '(node_reboot_required > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 4h
labels:
severity: info
annotations:
summary: Host requires reboot (instance {{ $labels.instance }})
description: "{{ $labels.instance }} requires a reboot.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View File

@@ -0,0 +1,58 @@
"groups":
- "name": "node-exporter.rules"
"rules":
- "expr": |
count without (cpu) (
count without (mode) (
node_cpu_seconds_total{job="node"}
)
)
"record": "instance:node_num_cpu:sum"
- "expr": |
1 - avg without (cpu, mode) (
rate(node_cpu_seconds_total{job="node", mode="idle"}[1m])
)
"record": "instance:node_cpu_utilisation:rate1m"
- "expr": |
(
node_load1{job="node"}
/
instance:node_num_cpu:sum{job="node"}
)
"record": "instance:node_load1_per_cpu:ratio"
- "expr": |
1 - (
node_memory_MemAvailable_bytes{job="node"}
/
node_memory_MemTotal_bytes{job="node"}
)
"record": "instance:node_memory_utilisation:ratio"
- "expr": |
rate(node_vmstat_pgmajfault{job="node"}[1m])
"record": "instance:node_vmstat_pgmajfault:rate1m"
- "expr": |
rate(node_disk_io_time_seconds_total{job="node", device!=""}[1m])
"record": "instance_device:node_disk_io_time_seconds:rate1m"
- "expr": |
rate(node_disk_io_time_weighted_seconds_total{job="node", device!=""}[1m])
"record": "instance_device:node_disk_io_time_weighted_seconds:rate1m"
- "expr": |
sum without (device) (
rate(node_network_receive_bytes_total{job="node", device!="lo"}[1m])
)
"record": "instance:node_network_receive_bytes_excluding_lo:rate1m"
- "expr": |
sum without (device) (
rate(node_network_transmit_bytes_total{job="node", device!="lo"}[1m])
)
"record": "instance:node_network_transmit_bytes_excluding_lo:rate1m"
- "expr": |
sum without (device) (
rate(node_network_receive_drop_total{job="node", device!="lo"}[1m])
)
"record": "instance:node_network_receive_drop_excluding_lo:rate1m"
- "expr": |
sum without (device) (
rate(node_network_transmit_drop_total{job="node", device!="lo"}[1m])
)
"record": "instance:node_network_transmit_drop_excluding_lo:rate1m"

View File

@@ -19,10 +19,15 @@ argument_specs:
type: str
required: false
default: 3650d
prometheus_hcloud_token:
description: Access token for hetzner cloud service discovery. It will be enabled if this variable is defined
type: str
prometheus_hcloud_enabled:
description: Whether to use hcloud discovery
type: bool
required: false
default: false
prometheus_hcloud_token:
description: Access token for hetzner cloud service discovery.
type: str
required: "{{ prometheus_hcloud_enabled }}"
prometheus_hcloud_relabel_configs:
description: Relabel configs for hcloud
type: list
@@ -40,11 +45,77 @@ argument_specs:
replacement:
type: str
required: false
prometheus_install_grafana:
description: If true, installs grafana in the same docker network as prometheus and configures it with prometheus as data source
description: If true, installs grafana in the same docker network as prometheus
type: bool
required: false
default: false
prometheus_install_alertmanager:
description: If true, installs alertmanager in the same docker network as prometheus and configures it
type: bool
required: false
default: true
prometheus_install_webhook:
description: If true, installs webhook server in the same docker network as prometheus and configures it
type: bool
required: false
default: false
alertmanager_storage_retention:
description: Period of time for which alertmanager data is stored for. A number followed by unit (s, m, h, d, w, y). Passed directly to alertmanager role
type: str
required: false
default: "{{ prometheus_storage_retention | default('3650d') }}"
prometheus_webhook_handlers:
description: List of webhook server handlers
type: list
required: "{{ prometheus_install_webhook }}"
elements: dict
options:
id:
description: specifies the ID of your hook. This value is used to create the HTTP endpoint
type: str
required: true
execute-command:
description: specifies the command that should be executed when the hook is triggered
type: str
required: true
command-working-directory:
description: specifies the working directory that will be used for the script when it's executed
type: str
required: false
pass-arguments-to-command:
description: >
specifies the list of arguments that will be passed to the
command. See for more info:
https://github.com/adnanh/webhook/blob/master/docs/Referencing-Request-Values.md
type: list
required: false
elements: dict
options:
source:
description: Source of the argument. Use `string` to specify argument here.
type: str
required: true
choices:
- string
- header
- url
- request
- payload
name:
description: Argument if source is string, otherwise the source attribute name.
type: str
required: true
prometheus_ping_hosts:
  description: List of hosts to gather ping metrics for using prometheus blackbox exporter
  type: list
  required: false
  default: []
  # The prometheus.yml template reads host.name and host.type
  # ("{{ host.name }}::{{ host.type | default('monitored') }}"), so list
  # elements must be dicts, not plain strings.
  elements: dict
  options:
    name:
      description: Hostname or IP address to probe with ICMP
      type: str
      required: true
    type:
      description: Host type, exported as the host_type label on probe metrics
      type: str
      required: false
      default: monitored
# All options after this will be passed directly to the container role
docker_service_suffix:

View File

@@ -1,7 +1,7 @@
---
- name: Prometheus container
import_role:
include_role:
name: container
vars:
docker_service: prometheus
@@ -13,12 +13,50 @@
- "--storage.tsdb.retention={{ prometheus_storage_retention }}"
- "--web.console.libraries=/usr/share/prometheus/console_libraries"
- "--web.console.templates=/usr/share/prometheus/consoles"
docker_volumes:
docker_mounts:
- name: data
path: /prometheus
- template: prometheus.yml
path: /etc/prometheus/prometheus.yml
- copypath: recording
path: /etc/prometheus/recording
- copypath: alerting
path: /etc/prometheus/alerting
- name: Alertmanager container for prometheus
include_role:
name: alertmanager
vars:
docker_networks:
- name: prometheus
when: prometheus_install_alertmanager
- name: Webhook container for prometheus
include_role:
name: container
vars:
docker_service: prometheus_webhook
docker_image: thecatlady/webhook
reverse_proxy_type: none
docker_mounts:
- template: webhooks.yaml
path: /config/hooks.yml
docker_networks:
- name: prometheus
when: prometheus_install_webhook
- name: Blackbox exporter for prometheus
include_role:
name: container
vars:
docker_service: blackbox_exporter
docker_image: prom/blackbox-exporter
reverse_proxy_type: none
docker_mounts:
- template: blackbox_exporter.yml
path: /etc/blackbox_exporter/config.yml
docker_networks:
- name: prometheus
- name: Grafana container for prometheus
include_role:

View File

@@ -0,0 +1,8 @@
---
modules:
icmp:
prober: icmp
timeout: 5s
icmp:
preferred_ip_protocol: "ip4"

View File

@@ -1,5 +1,7 @@
---
# {{ ansible_managed }}
# my global config
global:
scrape_interval: {{ prometheus_scrape_interval }}
@@ -10,25 +12,54 @@ alerting:
alertmanagers:
- static_configs:
- targets:
# - alertmanager:9093
{% if prometheus_install_alertmanager %}
- alertmanager:9093
{%- endif %}
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
# - "first_rules.yml"
# - "second_rules.yml"
- "/etc/prometheus/recording/*.yaml"
- "/etc/prometheus/alerting/*.yaml"
# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
- job_name: "prometheus"
static_configs:
- targets: ["localhost:9090"]
{% if prometheus_hcloud_token is defined %}
- job_name: "blackbox"
static_configs:
- targets: ["blackbox_exporter:9115"]
{% if prometheus_ping_hosts | length > 0 %}
- job_name: "icmp"
metrics_path: "/probe"
params:
module: ["icmp"]
static_configs:
- targets:
{% for host in prometheus_ping_hosts %}
- "{{ host.name }}::{{ host.type | default('monitored') }}"
{% endfor %}
relabel_configs:
- source_labels: [__address__]
regex: '(.+)::(.+)'
target_label: __param_target
replacement: '${1}'
- source_labels: [__address__]
regex: '(.+)::(.+)'
target_label: host_type
replacement: '${2}'
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: blackbox_exporter:9115
{%- endif %}
{% if prometheus_hcloud_enabled %}
- job_name: hcloud
hetzner_sd_configs:
- role: hcloud
authorization:
credentials: {{ prometheus_hcloud_token }}
relabel_configs: {{ prometheus_hcloud_relabel_configs }}
{% endif %}
{%- endif %}

View File

@@ -0,0 +1,5 @@
---
# {{ ansible_managed }}
{# Serialize the handler dicts with a YAML filter; a bare "{{ item }}" would
   emit Python repr (True/False/None, single quotes) which is not guaranteed
   to be valid YAML for the webhook server. #}
{{ prometheus_webhook_handlers | to_nice_yaml }}