From b333bbebbdc0f487573cb78e8f4dbe50b9a19156 Mon Sep 17 00:00:00 2001 From: uumas Date: Sun, 14 Sep 2025 03:21:33 +0300 Subject: [PATCH] Add prometheus role --- roles/prometheus/README.md | 1 + roles/prometheus/defaults/main.yaml | 4 + roles/prometheus/meta/argument_specs.yml | 35 ++ roles/prometheus/tasks/main.yml | 28 ++ .../alerting/blackbox-exporter.yaml.j2 | 97 ++++++ .../templates/alerting/node-exporter.yaml.j2 | 322 ++++++++++++++++++ .../templates/blackbox_exporter.yml.j2 | 5 + roles/prometheus/templates/prometheus.yml.j2 | 55 +++ 8 files changed, 547 insertions(+) create mode 100644 roles/prometheus/README.md create mode 100644 roles/prometheus/defaults/main.yaml create mode 100644 roles/prometheus/meta/argument_specs.yml create mode 100644 roles/prometheus/tasks/main.yml create mode 100644 roles/prometheus/templates/alerting/blackbox-exporter.yaml.j2 create mode 100644 roles/prometheus/templates/alerting/node-exporter.yaml.j2 create mode 100644 roles/prometheus/templates/blackbox_exporter.yml.j2 create mode 100644 roles/prometheus/templates/prometheus.yml.j2 diff --git a/roles/prometheus/README.md b/roles/prometheus/README.md new file mode 100644 index 0000000..8a9ec6e --- /dev/null +++ b/roles/prometheus/README.md @@ -0,0 +1 @@ +Installs and configures prometheus diff --git a/roles/prometheus/defaults/main.yaml b/roles/prometheus/defaults/main.yaml new file mode 100644 index 0000000..ee8c846 --- /dev/null +++ b/roles/prometheus/defaults/main.yaml @@ -0,0 +1,4 @@ +--- +prometheus_additional_networks: [] + +prometheus_ping_hosts: [] diff --git a/roles/prometheus/meta/argument_specs.yml b/roles/prometheus/meta/argument_specs.yml new file mode 100644 index 0000000..f30de2f --- /dev/null +++ b/roles/prometheus/meta/argument_specs.yml @@ -0,0 +1,35 @@ +--- +argument_specs: + main: + description: Installs and configures prometheus + options: + prometheus_additional_networks: + description: >- + A list of additional podman networks for the prometheus container 
(in + addition to prometheus network). + type: list + required: false + default: [] + elements: str + + prometheus_ping_hosts: + description: List of hosts to ping + type: list + required: false + default: [] + elements: dict + options: + name: + description: Hostname to ping + type: str + required: true + type: + description: >- + Type of host. Monitored hosts are pinged to check if they are up. + Wan hosts are pinged to check if prometheus has internet access. + type: str + required: false + default: monitored + choices: + - monitored + - wan diff --git a/roles/prometheus/tasks/main.yml b/roles/prometheus/tasks/main.yml new file mode 100644 index 0000000..0c4592b --- /dev/null +++ b/roles/prometheus/tasks/main.yml @@ -0,0 +1,28 @@ +--- +- name: Prometheus + ansible.builtin.import_role: + name: service + vars: + service_name: prometheus + service_container_image: "docker.io/prom/prometheus:latest" + service_container_mounts: + - type: template + source: prometheus.yml.j2 + destination: /etc/prometheus/prometheus.yml + - type: volume + source: data + destination: /prometheus + - type: template + source: alerting/node-exporter.yaml.j2 + destination: /etc/prometheus/alerting/node-exporter.yaml + - type: template + source: alerting/blackbox-exporter.yaml.j2 + destination: /etc/prometheus/alerting/blackbox-exporter.yaml + service_container_additional_networks: "{{ prometheus_additional_networks }}" + service_additional_containers: + - name: blackbox-exporter + image: docker.io/prom/blackbox-exporter:latest + mounts: + - type: template + source: blackbox_exporter.yml.j2 + destination: /etc/blackbox_exporter/config.yml diff --git a/roles/prometheus/templates/alerting/blackbox-exporter.yaml.j2 b/roles/prometheus/templates/alerting/blackbox-exporter.yaml.j2 new file mode 100644 index 0000000..59d33c1 --- /dev/null +++ b/roles/prometheus/templates/alerting/blackbox-exporter.yaml.j2 @@ -0,0 +1,97 @@ +{% raw %} +groups: + +- name: BlackboxExporter + + rules: + + - alert: 
BlackboxAllWanProbesFailed + expr: 'sum by (host_type) (probe_success{host_type="wan"})==0' + for: 5s + labels: + severity: critical + annotations: + summary: Lost internet access + description: Failed to contact any wan probes + + - alert: BlackboxProbeFailed + expr: 'probe_success == 0' + for: 0m + labels: + severity: critical + annotations: + summary: Blackbox probe failed (instance {{ $labels.instance }}) + description: "Probe failed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: BlackboxConfigurationReloadFailure + expr: 'blackbox_exporter_config_last_reload_successful != 1' + for: 0m + labels: + severity: warning + annotations: + summary: Blackbox configuration reload failure (instance {{ $labels.instance }}) + description: "Blackbox configuration reload failure\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: BlackboxSlowProbe + expr: 'avg_over_time(probe_duration_seconds[1m]) > 1' + for: 1m + labels: + severity: warning + annotations: + summary: Blackbox slow probe (instance {{ $labels.instance }}) + description: "Blackbox probe took more than 1s to complete\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: BlackboxProbeHttpFailure + expr: 'probe_http_status_code <= 199 OR probe_http_status_code >= 400' + for: 0m + labels: + severity: critical + annotations: + summary: Blackbox probe HTTP failure (instance {{ $labels.instance }}) + description: "HTTP status code is not 200-399\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: BlackboxSslCertificateWillExpireSoon + expr: '3 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 20' + for: 0m + labels: + severity: warning + annotations: + summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }}) + description: "SSL certificate expires in less than 20 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: BlackboxSslCertificateWillExpireSoon + expr: '0 <= 
round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 3' + for: 0m + labels: + severity: critical + annotations: + summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }}) + description: "SSL certificate expires in less than 3 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: BlackboxSslCertificateExpired + expr: 'round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 0' + for: 0m + labels: + severity: critical + annotations: + summary: Blackbox SSL certificate expired (instance {{ $labels.instance }}) + description: "SSL certificate has expired already\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: BlackboxProbeSlowHttp + expr: 'avg_over_time(probe_http_duration_seconds[1m]) > 1' + for: 1m + labels: + severity: warning + annotations: + summary: Blackbox probe slow HTTP (instance {{ $labels.instance }}) + description: "HTTP request took more than 1s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: BlackboxProbeSlowPing + expr: 'avg_over_time(probe_icmp_duration_seconds[1m]) > 1' + for: 1m + labels: + severity: warning + annotations: + summary: Blackbox probe slow ping (instance {{ $labels.instance }}) + description: "Blackbox ping took more than 1s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" +{% endraw %} diff --git a/roles/prometheus/templates/alerting/node-exporter.yaml.j2 b/roles/prometheus/templates/alerting/node-exporter.yaml.j2 new file mode 100644 index 0000000..46552a4 --- /dev/null +++ b/roles/prometheus/templates/alerting/node-exporter.yaml.j2 @@ -0,0 +1,322 @@ +{% raw %} +groups: + +- name: NodeExporter + + rules: + + - alert: HostOutOfMemory + expr: '(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes < .10)' + for: 2m + labels: + severity: warning + annotations: + summary: Host out of memory (instance {{ $labels.instance }}) + description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n 
LABELS = {{ $labels }}" + + - alert: HostMemoryUnderMemoryPressure + expr: '(rate(node_vmstat_pgmajfault[5m]) > 1000)' + for: 0m + labels: + severity: warning + annotations: + summary: Host memory under memory pressure (instance {{ $labels.instance }}) + description: "The node is under heavy memory pressure. High rate of loading memory pages from disk.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostMemoryIsUnderutilized + expr: 'min_over_time(node_memory_MemFree_bytes[1w]) > node_memory_MemTotal_bytes * .8' + for: 0m + labels: + severity: info + annotations: + summary: Host Memory is underutilized (instance {{ $labels.instance }}) + description: "Node memory usage is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostUnusualNetworkThroughputIn + expr: '((rate(node_network_receive_bytes_total[5m]) / on(instance, device) node_network_speed_bytes) > .80)' + for: 0m + labels: + severity: warning + annotations: + summary: Host unusual network throughput in (instance {{ $labels.instance }}) + description: "Host receive bandwidth is high (>80%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostUnusualNetworkThroughputOut + expr: '((rate(node_network_transmit_bytes_total[5m]) / on(instance, device) node_network_speed_bytes) > .80)' + for: 0m + labels: + severity: warning + annotations: + summary: Host unusual network throughput out (instance {{ $labels.instance }}) + description: "Host transmit bandwidth is high (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostUnusualDiskReadRate + expr: '(rate(node_disk_io_time_seconds_total[5m]) > .80)' + for: 0m + labels: + severity: warning + annotations: + summary: Host unusual disk read rate (instance {{ $labels.instance }}) + description: "Disk is too busy (IO wait > 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostOutOfDiskSpace + expr: 
'(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} / node_filesystem_size_bytes < .10 and on (instance, device, mountpoint) node_filesystem_readonly == 0)' + for: 2m + labels: + severity: critical + annotations: + summary: Host out of disk space (instance {{ $labels.instance }}) + description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostDiskMayFillIn24Hours + expr: 'predict_linear(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[3h], 86400) <= 0 and node_filesystem_avail_bytes > 0' + for: 2m + labels: + severity: warning + annotations: + summary: Host disk may fill in 24 hours (instance {{ $labels.instance }}) + description: "Filesystem will likely run out of space within the next 24 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostOutOfInodes + expr: '(node_filesystem_files_free / node_filesystem_files < .10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0)' + for: 2m + labels: + severity: critical + annotations: + summary: Host out of inodes (instance {{ $labels.instance }}) + description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostFilesystemDeviceError + expr: 'node_filesystem_device_error{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} == 1' + for: 2m + labels: + severity: critical + annotations: + summary: Host filesystem device error (instance {{ $labels.instance }}) + description: "Error stat-ing the {{ $labels.mountpoint }} filesystem\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostInodesMayFillIn24Hours + expr: 'predict_linear(node_filesystem_files_free{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[1h], 86400) <= 0 and node_filesystem_files_free > 0' + for: 2m + labels: + severity: warning + annotations: + summary: Host inodes may fill in 24 hours (instance {{ $labels.instance }}) + description: "Filesystem will likely run out of inodes within the 
next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostUnusualDiskReadLatency + expr: '(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0)' + for: 2m + labels: + severity: warning + annotations: + summary: Host unusual disk read latency (instance {{ $labels.instance }}) + description: "Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostUnusualDiskWriteLatency + expr: '(rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0)' + for: 2m + labels: + severity: warning + annotations: + summary: Host unusual disk write latency (instance {{ $labels.instance }}) + description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostHighCpuLoad + expr: '1 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) > .80' + for: 10m + labels: + severity: warning + annotations: + summary: Host high CPU load (instance {{ $labels.instance }}) + description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostCpuIsUnderutilized + expr: '(min by (instance) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > 0.8' + for: 1w + labels: + severity: info + annotations: + summary: Host CPU is underutilized (instance {{ $labels.instance }}) + description: "CPU load has been < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostCpuStealNoisyNeighbor + expr: 'avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10' + for: 0m + labels: + severity: warning + annotations: + summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }}) + description: "CPU steal is > 10%. 
A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostCpuHighIowait + expr: 'avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > .10' + for: 0m + labels: + severity: warning + annotations: + summary: Host CPU high iowait (instance {{ $labels.instance }}) + description: "CPU iowait > 10%. Your CPU is idling waiting for storage to respond.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostUnusualDiskIo + expr: 'rate(node_disk_io_time_seconds_total[5m]) > 0.8' + for: 5m + labels: + severity: warning + annotations: + summary: Host unusual disk IO (instance {{ $labels.instance }}) + description: "Disk usage >80%. Check storage for issues or increase IOPS capabilities. Check storage for issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostContextSwitchingHigh + expr: '(rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) / (rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2' + for: 0m + labels: + severity: warning + annotations: + summary: Host context switching high (instance {{ $labels.instance }}) + description: "Context switching is growing on the node (twice the daily average during the last 15m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostSwapIsFillingUp + expr: '((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80)' + for: 2m + labels: + severity: warning + annotations: + summary: Host swap is filling up (instance {{ $labels.instance }}) + description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostSystemdServiceCrashed + expr: '(node_systemd_unit_state{state="failed"} == 1)' + for: 0m + labels: + severity: warning + annotations: + summary: Host systemd service crashed (instance {{ $labels.instance }}) + description: 
"systemd service crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostPhysicalComponentTooHot + expr: 'node_hwmon_temp_celsius > node_hwmon_temp_max_celsius' + for: 5m + labels: + severity: warning + annotations: + summary: Host physical component too hot (instance {{ $labels.instance }}) + description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostNodeOvertemperatureAlarm + expr: '((node_hwmon_temp_crit_alarm_celsius == 1) or (node_hwmon_temp_alarm == 1))' + for: 0m + labels: + severity: critical + annotations: + summary: Host node overtemperature alarm (instance {{ $labels.instance }}) + description: "Physical node temperature alarm triggered\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostSoftwareRaidInsufficientDrives + expr: '((node_md_disks_required - on(device, instance) node_md_disks{state="active"}) > 0)' + for: 0m + labels: + severity: critical + annotations: + summary: Host software RAID insufficient drives (instance {{ $labels.instance }}) + description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} has insufficient drives remaining.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostSoftwareRaidDiskFailure + expr: '(node_md_disks{state="failed"} > 0)' + for: 2m + labels: + severity: warning + annotations: + summary: Host software RAID disk failure (instance {{ $labels.instance }}) + description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} needs attention.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostKernelVersionDeviations + expr: 'changes(node_uname_info[1h]) > 0' + for: 0m + labels: + severity: info + annotations: + summary: Host kernel version deviations (instance {{ $labels.instance }}) + description: "Kernel version for {{ $labels.instance }} has changed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostOomKillDetected + expr: '(increase(node_vmstat_oom_kill[1m]) > 
0)' + for: 0m + labels: + severity: warning + annotations: + summary: Host OOM kill detected (instance {{ $labels.instance }}) + description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostEdacCorrectableErrorsDetected + expr: '(increase(node_edac_correctable_errors_total[1m]) > 0)' + for: 0m + labels: + severity: info + annotations: + summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }}) + description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostEdacUncorrectableErrorsDetected + expr: '(node_edac_uncorrectable_errors_total > 0)' + for: 0m + labels: + severity: warning + annotations: + summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}) + description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostNetworkReceiveErrors + expr: '(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01)' + for: 2m + labels: + severity: warning + annotations: + summary: Host Network Receive Errors (instance {{ $labels.instance }}) + description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostNetworkTransmitErrors + expr: '(rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01)' + for: 2m + labels: + severity: warning + annotations: + summary: Host Network Transmit Errors (instance {{ $labels.instance }}) + description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the 
last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostNetworkBondDegraded + expr: '((node_bonding_active - node_bonding_slaves) != 0)' + for: 2m + labels: + severity: warning + annotations: + summary: Host Network Bond Degraded (instance {{ $labels.instance }}) + description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostConntrackLimit + expr: '(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8)' + for: 5m + labels: + severity: warning + annotations: + summary: Host conntrack limit (instance {{ $labels.instance }}) + description: "The number of conntrack is approaching limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostClockSkew + expr: '((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0))' + for: 10m + labels: + severity: warning + annotations: + summary: Host clock skew (instance {{ $labels.instance }}) + description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostClockNotSynchronising + expr: '(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16)' + for: 2m + labels: + severity: warning + annotations: + summary: Host clock not synchronising (instance {{ $labels.instance }}) + description: "Clock not synchronising. 
Ensure NTP is configured on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" +{% endraw %} diff --git a/roles/prometheus/templates/blackbox_exporter.yml.j2 b/roles/prometheus/templates/blackbox_exporter.yml.j2 new file mode 100644 index 0000000..b387fd1 --- /dev/null +++ b/roles/prometheus/templates/blackbox_exporter.yml.j2 @@ -0,0 +1,5 @@ +--- +modules: + icmp: + prober: icmp + timeout: 5s diff --git a/roles/prometheus/templates/prometheus.yml.j2 b/roles/prometheus/templates/prometheus.yml.j2 new file mode 100644 index 0000000..4eeb3c2 --- /dev/null +++ b/roles/prometheus/templates/prometheus.yml.j2 @@ -0,0 +1,55 @@ +--- +# {{ ansible_managed }} + +global: + scrape_interval: 10s + evaluation_interval: 10s + +# Load rules once and periodically evaluate them according to the global 'evaluation_interval'. +rule_files: + - "/etc/prometheus/recording/*.yaml" + - "/etc/prometheus/alerting/*.yaml" + +scrape_configs: + - job_name: prometheus + static_configs: + - targets: + - localhost:9090 + + - job_name: blackbox + static_configs: + - targets: + - blackbox-exporter:9115 + + - job_name: node + static_configs: + - targets: + - host.containers.internal:9100 + +{% if prometheus_ping_hosts | length > 0 %} + - job_name: "icmp" + metrics_path: "/probe" + params: + module: ["icmp"] + static_configs: + - targets: +{% for host in prometheus_ping_hosts %} + - "{{ host.name }}::{{ host.type | default('monitored') }}" +{% endfor %} + relabel_configs: + - source_labels: + - __address__ + regex: '(.+)::(.+)' + target_label: __param_target + replacement: '${1}' + - source_labels: + - __address__ + regex: '(.+)::(.+)' + target_label: host_type + replacement: '${2}' + - source_labels: + - __param_target + target_label: instance + - target_label: __address__ + replacement: blackbox-exporter:9115 +{%- endif %}