Compare commits

...

11 Commits

Author SHA1 Message Date
uumas
b2540e2bd3 service: Validation, set default for mounts in additional containers 2025-11-26 22:26:12 +02:00
uumas
47088fd1a0 Allow setting container entrypoint 2025-11-26 22:25:54 +02:00
uumas
abf3859af7 container: rename task 2025-10-06 16:18:16 +03:00
uumas
bdec55ffc7 Use auth file instead of creds in quadlet files 2025-10-06 16:17:27 +03:00
uumas
2712cf2865 service: Support setting templated file mode 2025-09-16 12:37:20 +03:00
uumas
c5fb7f599c podman: Login to registries 2025-09-16 12:36:58 +03:00
uumas
597faa3fd5 service: Set static ips for other containers too 2025-09-15 12:34:57 +03:00
uumas
16babfd5ed service: Have service depend on oauth2-proxy socat socket if set 2025-09-15 12:34:21 +03:00
uumas
d3542993dd container: Set network ip range when using a static container ip 2025-09-15 12:32:22 +03:00
uumas
a93c26864d network: Support setting ip range 2025-09-15 12:31:40 +03:00
uumas
b333bbebbd Add prometheus role 2025-09-14 03:21:33 +03:00
34 changed files with 684 additions and 83 deletions

View File

@@ -1,5 +1,6 @@
--- ---
container_command: [] container_command: []
container_entrypoint: ""
container_user: "" container_user: ""
container_mounts: [] container_mounts: []
container_publish_ports: [] container_publish_ports: []
@@ -11,6 +12,3 @@ container_auto_start: true
container_auto_update: true container_auto_update: true
container_requires: [] container_requires: []
container_wants: [] container_wants: []
container_image_creds:
username: ""
password: ""

View File

@@ -13,6 +13,11 @@ argument_specs:
required: false required: false
default: [] default: []
elements: str elements: str
# Optional entrypoint override; the empty default leaves the image's own
# entrypoint in place (the container task omits the option when empty).
container_entrypoint:
  description: Entrypoint to use for the container
  type: str
  required: false
  default: ""
container_user: container_user:
description: The UID to run as inside the container description: The UID to run as inside the container
type: str type: str
@@ -23,22 +28,6 @@ argument_specs:
description: "The image to run in the container, in FQIN format (registry/imagename:tag)" description: "The image to run in the container, in FQIN format (registry/imagename:tag)"
type: str type: str
required: true required: true
container_image_creds:
description: Credentials used to authenticate with the registry
type: dict
required: false
default:
username: ""
password: ""
options:
username:
description: Username
type: str
required: true
password:
description: Password
type: str
required: true
container_mounts: container_mounts:
description: List of bind mounts or volumes to be mounted inside the container. description: List of bind mounts or volumes to be mounted inside the container.

View File

@@ -7,7 +7,6 @@
name: image name: image
vars: vars:
image_name: "{{ container_image }}" image_name: "{{ container_image }}"
image_creds: "{{ container_image_creds }}"
when: image_created_images is not defined or container_image not in image_created_images when: image_created_images is not defined or container_image not in image_created_images
- name: Create networks for container {{ container_name }} - name: Create networks for container {{ container_name }}
@@ -15,11 +14,8 @@
name: network name: network
vars: vars:
network_name: "{{ network }}" network_name: "{{ network }}"
network_subnet: >- network_subnet: "{{ _container_network_subnet if network_index == 0 else '' }}"
{{ network_range: "{{ _container_network_range if network_index == 0 else '' }}"
container_ip | ansible.utils.ipsubnet(24)
if (container_ip | length > 0 and network_index == 0) else ''
}}
when: network_created_networks is not defined or network not in network_created_networks when: network_created_networks is not defined or network not in network_created_networks
loop: "{{ container_networks }}" loop: "{{ container_networks }}"
loop_control: loop_control:
@@ -44,17 +40,19 @@
ansible.builtin.include_tasks: secrets.yaml ansible.builtin.include_tasks: secrets.yaml
when: container_secrets | length > 0 when: container_secrets | length > 0
- name: Create container service {{ container_name }} - name: Create container {{ container_name }}
containers.podman.podman_container: containers.podman.podman_container:
image: "{{ _container_image }}" image: "{{ _container_image }}"
name: "{{ container_name }}" name: "{{ container_name }}"
command: "{{ container_command or omit }}" command: "{{ container_command or omit }}"
entrypoint: "{{ container_entrypoint or omit }}"
user: "{{ container_user or omit }}" user: "{{ container_user or omit }}"
mount: "{{ _container_mounts | map('items') | map('map', 'join', '=') | map('join', ',') }}" mount: "{{ _container_mounts | map('items') | map('map', 'join', '=') | map('join', ',') }}"
network: "{{ _container_networks_with_ip }}" network: "{{ _container_networks_with_ip }}"
publish: "{{ container_publish_ports }}" publish: "{{ container_publish_ports }}"
secrets: "{{ _container_secrets }}" secrets: "{{ _container_secrets }}"
env: "{{ container_env }}" env: "{{ container_env }}"
label: "{{ _container_labels if _container_labels | length > 0 else omit }}"
state: quadlet state: quadlet
quadlet_file_mode: "0600" quadlet_file_mode: "0600"
quadlet_options: "{{ _container_quadlet_options }}" quadlet_options: "{{ _container_quadlet_options }}"

View File

@@ -11,6 +11,22 @@ _container_networks_with_ip: >-
] ]
+ _container_networks[1:] + _container_networks[1:]
}} }}
# The /24 network that contains container_ip, or an empty string when no
# static container ip is configured.
_container_network_subnet: >-
{{ container_ip | ansible.utils.ipsubnet(24) if container_ip | length > 0 else '' }}
# The two /25 halves (index 0 and 1) of the /24 above, or an empty string when
# no static container ip is configured.
_container_network_subnet_ranges: >-
{{
[
_container_network_subnet | ansible.utils.ipsubnet(25, 0),
_container_network_subnet | ansible.utils.ipsubnet(25, 1)
] if container_ip | length > 0 else ''
}}
# The /25 half that does NOT contain container_ip; handed to the network role
# as network_range for the first network.
# NOTE(review): presumably this keeps dynamically allocated addresses from
# colliding with the static ip - confirm.
_container_network_range: >-
{{
_container_network_subnet_ranges |
reject('ansible.utils.supernet_of', container_ip) |
first
if container_ip | length > 0 else ''
}}
_container_volumes: "{{ container_mounts | selectattr('type', '==', 'volume') }}" _container_volumes: "{{ container_mounts | selectattr('type', '==', 'volume') }}"
@@ -52,6 +68,15 @@ _container_secrets: >-
| map('join', ',') | map('join', ',')
}} }}
# Labels added to the container quadlet. When auto-update is enabled and the
# image's registry (the part of container_image before the first '/') has an
# entry in podman_registry_accounts, point podman auto-update at the shared
# auth file. podman_registry_accounts is a default of the podman role, so it
# falls back to an empty list here to keep this role usable when that
# variable is not defined.
_container_labels: >-
  {{
    {'io.containers.autoupdate.authfile': '/etc/containers/auth.json'}
    if container_auto_update and
      container_image.split('/')[0] in
      podman_registry_accounts | default([]) | map(attribute='registry')
    else {}
  }}
_container_quadlet_unit_options: | _container_quadlet_unit_options: |
[Unit] [Unit]
Description=Container {{ container_name }} Description=Container {{ container_name }}

View File

@@ -1,4 +0,0 @@
---
image_creds:
username: ""
password: ""

View File

@@ -9,19 +9,3 @@ argument_specs:
description: "The image FQIN (format registry/imagename:tag)" description: "The image FQIN (format registry/imagename:tag)"
type: str type: str
required: true required: true
image_creds:
description: Credentials used to authenticate with the registry
type: dict
required: false
default:
username: ""
password: ""
options:
username:
description: Username
type: str
required: true
password:
description: Password
type: str
required: true

View File

@@ -6,9 +6,14 @@
- name: Create container image service {{ image_name }} - name: Create container image service {{ image_name }}
containers.podman.podman_image: containers.podman.podman_image:
name: "{{ image_name }}" name: "{{ image_name }}"
username: "{{ image_creds.username if image_creds.username | length > 0 else omit }}"
password: "{{ image_creds.password if image_creds.password | length > 0 else omit }}"
state: quadlet state: quadlet
quadlet_filename: "{{ image_name | replace('/', '_') }}" quadlet_filename: "{{ image_name | replace('/', '_') }}"
quadlet_file_mode: "0600" quadlet_file_mode: "0600"
# Add AuthFile= to the image quadlet only when the image's registry (the part
# of image_name before the first '/') has an entry in podman_registry_accounts.
# That variable is a default of the podman role, so it falls back to an empty
# list here to avoid an undefined-variable failure when it is not defined.
quadlet_options: >-
  {{
    ['AuthFile=/etc/containers/auth.json']
    if image_name.split('/')[0] in
      podman_registry_accounts | default([]) | map(attribute='registry')
    else []
  }}
notify: Reload systemd daemon notify: Reload systemd daemon

View File

@@ -1,3 +1,4 @@
--- ---
network_driver: bridge network_driver: bridge
network_subnet: "" network_subnet: ""
network_range: ""

View File

@@ -20,3 +20,8 @@ argument_specs:
type: str type: str
required: false required: false
default: "" default: ""
network_range:
description: Range to allocate ip addresses from
type: str
required: false
default: ""

View File

@@ -6,6 +6,7 @@
quadlet_file_mode: "0644" quadlet_file_mode: "0644"
driver: "{{ network_driver }}" driver: "{{ network_driver }}"
subnet: "{{ network_subnet if network_subnet | length > 0 else omit }}" subnet: "{{ network_subnet if network_subnet | length > 0 else omit }}"
ip_range: "{{ network_range if network_range | length > 0 else omit }}"
quadlet_options: >- quadlet_options: >-
{{ {{
['Options=parent=' ~ ansible_default_ipv4.interface] ['Options=parent=' ~ ansible_default_ipv4.interface]

View File

@@ -0,0 +1,2 @@
---
# Registry accounts to log in to (list of registry/username/password dicts).
# Empty by default, so the login task loops over nothing.
podman_registry_accounts: []

View File

@@ -2,4 +2,23 @@
argument_specs: argument_specs:
main: main:
description: Installs podman description: Installs podman
options: {} options:
# One entry per registry; each entry is used for a podman login that writes
# to /etc/containers/auth.json.
podman_registry_accounts:
  description: List of accounts used to log in to container registries
  type: list
  required: false
  default: []
  elements: dict
  options:
    registry:
      description: Registry server to login to
      type: str
      required: true
    username:
      description: Username
      type: str
      required: true
    password:
      description: Password / token
      type: str
      required: true

View File

@@ -10,3 +10,12 @@
name: podman-auto-update.timer name: podman-auto-update.timer
state: started state: started
enabled: true enabled: true
# Log in once per configured account and store the credentials in the auth
# file that the image and container quadlets reference. no_log keeps the
# credentials out of the task output.
- name: Login to registries
  no_log: true
  loop: "{{ podman_registry_accounts }}"
  containers.podman.podman_login:
    authfile: /etc/containers/auth.json
    registry: "{{ item.registry }}"
    username: "{{ item.username }}"
    password: "{{ item.password }}"

View File

@@ -0,0 +1 @@
Installs and configures prometheus

View File

@@ -0,0 +1,4 @@
---
# Extra podman networks to attach the prometheus container to.
prometheus_additional_networks: []
# Hosts to ping; the icmp scrape job is only rendered into prometheus.yml when
# this list is non-empty.
prometheus_ping_hosts: []

View File

@@ -0,0 +1,35 @@
---
# Argument specification for the prometheus role's main entry point.
argument_specs:
  main:
    description: Installs and configures prometheus
    options:
      prometheus_additional_networks:
        type: list
        elements: str
        required: false
        default: []
        description: >-
          A list of additional podman networks for the prometheus container
          (in addition to prometheus network).

      prometheus_ping_hosts:
        type: list
        elements: dict
        required: false
        default: []
        description: List of hosts to ping
        options:
          name:
            type: str
            required: true
            description: Hostname to ping
          type:
            type: str
            required: false
            default: monitored
            choices: [monitored, wan]
            description: >-
              Type of host.
              Monitored hosts are pinged to check if they are up.
              Wan hosts are pinged to check if prometheus has internet access.

View File

@@ -0,0 +1,28 @@
---
# Deploys prometheus through the generic service role, with a blackbox
# exporter running as an additional container of the same service.
- name: Prometheus
  ansible.builtin.import_role:
    name: service
  vars:
    service_name: prometheus
    service_container_image: docker.io/prom/prometheus:latest
    service_container_additional_networks: "{{ prometheus_additional_networks }}"
    service_container_mounts:
      # Main prometheus configuration
      - destination: /etc/prometheus/prometheus.yml
        source: prometheus.yml.j2
        type: template
      # Volume mounted at /prometheus
      - destination: /prometheus
        source: data
        type: volume
      # Alerting rules, matched by the /etc/prometheus/alerting/*.yaml glob
      - destination: /etc/prometheus/alerting/node-exporter.yaml
        source: alerting/node-exporter.yaml.j2
        type: template
      - destination: /etc/prometheus/alerting/blackbox-exporter.yaml
        source: alerting/blackbox-exporter.yaml.j2
        type: template
    service_additional_containers:
      - name: blackbox-exporter
        image: "docker.io/prom/blackbox-exporter:latest"
        mounts:
          - destination: /etc/blackbox_exporter/config.yml
            source: blackbox_exporter.yml.j2
            type: template

View File

@@ -0,0 +1,97 @@
{% raw %}
groups:
- name: BlackboxExporter
rules:
# Fires when the sum of probe_success over all probes labelled
# host_type="wan" is 0, i.e. none of the wan probes succeeded.
- alert: BlackboxAllWanProbesFailed
  expr: 'sum by (host_type) (probe_success{host_type="wan"})==0'
  for: 5s
  labels:
    severity: critical
  annotations:
    summary: Lost internet access
    description: Failed to contact any wan probes
- alert: BlackboxProbeFailed
expr: 'probe_success == 0'
for: 0m
labels:
severity: critical
annotations:
summary: Blackbox probe failed (instance {{ $labels.instance }})
description: "Probe failed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: BlackboxConfigurationReloadFailure
expr: 'blackbox_exporter_config_last_reload_successful != 1'
for: 0m
labels:
severity: warning
annotations:
summary: Blackbox configuration reload failure (instance {{ $labels.instance }})
description: "Blackbox configuration reload failure\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: BlackboxSlowProbe
expr: 'avg_over_time(probe_duration_seconds[1m]) > 1'
for: 1m
labels:
severity: warning
annotations:
summary: Blackbox slow probe (instance {{ $labels.instance }})
description: "Blackbox probe took more than 1s to complete\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: BlackboxProbeHttpFailure
expr: 'probe_http_status_code <= 199 OR probe_http_status_code >= 400'
for: 0m
labels:
severity: critical
annotations:
summary: Blackbox probe HTTP failure (instance {{ $labels.instance }})
description: "HTTP status code is not 200-399\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: BlackboxSslCertificateWillExpireSoon
expr: '3 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 20'
for: 0m
labels:
severity: warning
annotations:
summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
description: "SSL certificate expires in less than 20 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: BlackboxSslCertificateWillExpireSoon
expr: '0 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 3'
for: 0m
labels:
severity: critical
annotations:
summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
description: "SSL certificate expires in less than 3 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: BlackboxSslCertificateExpired
expr: 'round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 0'
for: 0m
labels:
severity: critical
annotations:
summary: Blackbox SSL certificate expired (instance {{ $labels.instance }})
description: "SSL certificate has expired already\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: BlackboxProbeSlowHttp
expr: 'avg_over_time(probe_http_duration_seconds[1m]) > 1'
for: 1m
labels:
severity: warning
annotations:
summary: Blackbox probe slow HTTP (instance {{ $labels.instance }})
description: "HTTP request took more than 1s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: BlackboxProbeSlowPing
expr: 'avg_over_time(probe_icmp_duration_seconds[1m]) > 1'
for: 1m
labels:
severity: warning
annotations:
summary: Blackbox probe slow ping (instance {{ $labels.instance }})
description: "Blackbox ping took more than 1s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
{% endraw %}

View File

@@ -0,0 +1,322 @@
{% raw %}
groups:
- name: NodeExporter
rules:
- alert: HostOutOfMemory
expr: '(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes < .10)'
for: 2m
labels:
severity: warning
annotations:
summary: Host out of memory (instance {{ $labels.instance }})
description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostMemoryUnderMemoryPressure
expr: '(rate(node_vmstat_pgmajfault[5m]) > 1000)'
for: 0m
labels:
severity: warning
annotations:
summary: Host memory under memory pressure (instance {{ $labels.instance }})
description: "The node is under heavy memory pressure. High rate of loading memory pages from disk.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostMemoryIsUnderutilized
expr: 'min_over_time(node_memory_MemFree_bytes[1w]) > node_memory_MemTotal_bytes * .8'
for: 0m
labels:
severity: info
annotations:
summary: Host Memory is underutilized (instance {{ $labels.instance }})
description: "Node memory usage is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualNetworkThroughputIn
expr: '((rate(node_network_receive_bytes_total[5m]) / on(instance, device) node_network_speed_bytes) > .80)'
for: 0m
labels:
severity: warning
annotations:
summary: Host unusual network throughput in (instance {{ $labels.instance }})
description: "Host receive bandwidth is high (>80%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualNetworkThroughputOut
expr: '((rate(node_network_transmit_bytes_total[5m]) / on(instance, device) node_network_speed_bytes) > .80)'
for: 0m
labels:
severity: warning
annotations:
summary: Host unusual network throughput out (instance {{ $labels.instance }})
description: "Host transmit bandwidth is high (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskReadRate
expr: '(rate(node_disk_io_time_seconds_total[5m]) > .80)'
for: 0m
labels:
severity: warning
annotations:
summary: Host unusual disk read rate (instance {{ $labels.instance }})
description: "Disk is too busy (IO wait > 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostOutOfDiskSpace
expr: '(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} / node_filesystem_size_bytes < .10 and on (instance, device, mountpoint) node_filesystem_readonly == 0)'
for: 2m
labels:
severity: critical
annotations:
summary: Host out of disk space (instance {{ $labels.instance }})
description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostDiskMayFillIn24Hours
expr: 'predict_linear(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[3h], 86400) <= 0 and node_filesystem_avail_bytes > 0'
for: 2m
labels:
severity: warning
annotations:
summary: Host disk may fill in 24 hours (instance {{ $labels.instance }})
description: "Filesystem will likely run out of space within the next 24 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostOutOfInodes
expr: '(node_filesystem_files_free / node_filesystem_files < .10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0)'
for: 2m
labels:
severity: critical
annotations:
summary: Host out of inodes (instance {{ $labels.instance }})
description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostFilesystemDeviceError
expr: 'node_filesystem_device_error{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} == 1'
for: 2m
labels:
severity: critical
annotations:
summary: Host filesystem device error (instance {{ $labels.instance }})
description: "Error stat-ing the {{ $labels.mountpoint }} filesystem\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostInodesMayFillIn24Hours
expr: 'predict_linear(node_filesystem_files_free{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[1h], 86400) <= 0 and node_filesystem_files_free > 0'
for: 2m
labels:
severity: warning
annotations:
summary: Host inodes may fill in 24 hours (instance {{ $labels.instance }})
description: "Filesystem will likely run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskReadLatency
expr: '(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0)'
for: 2m
labels:
severity: warning
annotations:
summary: Host unusual disk read latency (instance {{ $labels.instance }})
description: "Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskWriteLatency
expr: '(rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0)'
for: 2m
labels:
severity: warning
annotations:
summary: Host unusual disk write latency (instance {{ $labels.instance }})
description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostHighCpuLoad
expr: '1 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) > .80'
for: 10m
labels:
severity: warning
annotations:
summary: Host high CPU load (instance {{ $labels.instance }})
description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostCpuIsUnderutilized
expr: '(min by (instance) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > 0.8'
for: 1w
labels:
severity: info
annotations:
summary: Host CPU is underutilized (instance {{ $labels.instance }})
description: "CPU load has been < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostCpuStealNoisyNeighbor
expr: 'avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10'
for: 0m
labels:
severity: warning
annotations:
summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostCpuHighIowait
expr: 'avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > .10'
for: 0m
labels:
severity: warning
annotations:
summary: Host CPU high iowait (instance {{ $labels.instance }})
description: "CPU iowait > 10%. Your CPU is idling waiting for storage to respond.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskIo
expr: 'rate(node_disk_io_time_seconds_total[5m]) > 0.8'
for: 5m
labels:
severity: warning
annotations:
summary: Host unusual disk IO (instance {{ $labels.instance }})
description: "Disk usage >80%. Check storage for issues or increase IOPS capabilities. Check storage for issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostContextSwitchingHigh
expr: '(rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) / (rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2'
for: 0m
labels:
severity: warning
annotations:
summary: Host context switching high (instance {{ $labels.instance }})
description: "Context switching is growing on the node (twice the daily average during the last 15m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostSwapIsFillingUp
expr: '((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80)'
for: 2m
labels:
severity: warning
annotations:
summary: Host swap is filling up (instance {{ $labels.instance }})
description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostSystemdServiceCrashed
expr: '(node_systemd_unit_state{state="failed"} == 1)'
for: 0m
labels:
severity: warning
annotations:
summary: Host systemd service crashed (instance {{ $labels.instance }})
description: "systemd service crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostPhysicalComponentTooHot
expr: 'node_hwmon_temp_celsius > node_hwmon_temp_max_celsius'
for: 5m
labels:
severity: warning
annotations:
summary: Host physical component too hot (instance {{ $labels.instance }})
description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNodeOvertemperatureAlarm
expr: '((node_hwmon_temp_crit_alarm_celsius == 1) or (node_hwmon_temp_alarm == 1))'
for: 0m
labels:
severity: critical
annotations:
summary: Host node overtemperature alarm (instance {{ $labels.instance }})
description: "Physical node temperature alarm triggered\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostSoftwareRaidInsufficientDrives
expr: '((node_md_disks_required - on(device, instance) node_md_disks{state="active"}) > 0)'
for: 0m
labels:
severity: critical
annotations:
summary: Host software RAID insufficient drives (instance {{ $labels.instance }})
description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} has insufficient drives remaining.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostSoftwareRaidDiskFailure
expr: '(node_md_disks{state="failed"} > 0)'
for: 2m
labels:
severity: warning
annotations:
summary: Host software RAID disk failure (instance {{ $labels.instance }})
description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} needs attention.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostKernelVersionDeviations
expr: 'changes(node_uname_info[1h]) > 0'
for: 0m
labels:
severity: info
annotations:
summary: Host kernel version deviations (instance {{ $labels.instance }})
description: "Kernel version for {{ $labels.instance }} has changed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostOomKillDetected
expr: '(increase(node_vmstat_oom_kill[1m]) > 0)'
for: 0m
labels:
severity: warning
annotations:
summary: Host OOM kill detected (instance {{ $labels.instance }})
description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires when the EDAC correctable error counter increased during the last
# minute; the description states the same window as the [1m] range in expr.
- alert: HostEdacCorrectableErrorsDetected
  expr: '(increase(node_edac_correctable_errors_total[1m]) > 0)'
  for: 0m
  labels:
    severity: info
  annotations:
    summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
    description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last minute.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostEdacUncorrectableErrorsDetected
expr: '(node_edac_uncorrectable_errors_total > 0)'
for: 0m
labels:
severity: warning
annotations:
summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNetworkReceiveErrors
expr: '(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01)'
for: 2m
labels:
severity: warning
annotations:
summary: Host Network Receive Errors (instance {{ $labels.instance }})
description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNetworkTransmitErrors
expr: '(rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01)'
for: 2m
labels:
severity: warning
annotations:
summary: Host Network Transmit Errors (instance {{ $labels.instance }})
description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNetworkBondDegraded
expr: '((node_bonding_active - node_bonding_slaves) != 0)'
for: 2m
labels:
severity: warning
annotations:
summary: Host Network Bond Degraded (instance {{ $labels.instance }})
description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostConntrackLimit
expr: '(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8)'
for: 5m
labels:
severity: warning
annotations:
summary: Host conntrack limit (instance {{ $labels.instance }})
description: "The number of conntrack is approaching limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostClockSkew
expr: '((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0))'
for: 10m
labels:
severity: warning
annotations:
summary: Host clock skew (instance {{ $labels.instance }})
description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostClockNotSynchronising
expr: '(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16)'
for: 2m
labels:
severity: warning
annotations:
summary: Host clock not synchronising (instance {{ $labels.instance }})
description: "Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
{% endraw %}

View File

@@ -0,0 +1,5 @@
---
# Blackbox exporter probe modules. Only an icmp module is defined; the icmp
# scrape job in prometheus.yml selects it with module=icmp.
modules:
  icmp:
    prober: icmp
    timeout: 5s

View File

@@ -0,0 +1,55 @@
---
# {{ ansible_managed }}
global:
  scrape_interval: 10s
  evaluation_interval: 10s

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  - "/etc/prometheus/recording/*.yaml"
  - "/etc/prometheus/alerting/*.yaml"

scrape_configs:
  # Prometheus scraping itself
  - job_name: prometheus
    static_configs:
      - targets:
          - localhost:9090

  # The blackbox exporter's own metrics
  - job_name: blackbox
    static_configs:
      - targets:
          - blackbox-exporter:9115

  # Node exporter, reached through host.containers.internal
  - job_name: node
    static_configs:
      - targets:
          - host.containers.internal:9100
{% if prometheus_ping_hosts | length > 0 %}

  # ICMP probes through the blackbox exporter. Each target is written as
  # "<name>::<type>" and split again by the relabel rules below.
  - job_name: "icmp"
    metrics_path: "/probe"
    params:
      module: ["icmp"]
    static_configs:
      - targets:
{% for host in prometheus_ping_hosts %}
          - "{{ host.name }}::{{ host.type | default('monitored') }}"
{% endfor %}
    relabel_configs:
      # "<name>::<type>" -> probe target parameter = <name>
      - source_labels:
          - __address__
        regex: '(.+)::(.+)'
        target_label: __param_target
        replacement: '${1}'
      # "<name>::<type>" -> host_type label = <type>
      - source_labels:
          - __address__
        regex: '(.+)::(.+)'
        target_label: host_type
        replacement: '${2}'
      # Report the probed host as the instance label
      - source_labels:
          - __param_target
        target_label: instance
      # Send the scrape itself to the blackbox exporter; the host name must
      # match the one used by the blackbox job above (hyphen, not underscore).
      - target_label: __address__
        replacement: blackbox-exporter:9115
{%- endif %}

View File

@@ -1,5 +1,6 @@
--- ---
service_container_command: [] service_container_command: []
service_container_entrypoint: ""
service_domains: [] service_domains: []
service_container_http_port: 0 service_container_http_port: 0
@@ -27,7 +28,3 @@ service_additional_containers: []
service_requires: [] service_requires: []
service_wants: [] service_wants: []
service_auto_update: true service_auto_update: true
service_container_image_creds:
username: ""
password: ""

View File

@@ -14,6 +14,11 @@ argument_specs:
required: false required: false
default: [] default: []
elements: str elements: str
service_container_entrypoint:
description: Entrypoint to use in the service container
type: str
required: false
default: ""
service_domains: service_domains:
description: A list of domains which should be proxied to the main service container description: A list of domains which should be proxied to the main service container
@@ -66,22 +71,6 @@ argument_specs:
description: "The image to run in the service container(s), in FQIN format (registry/imagename:tag)." description: "The image to run in the service container(s), in FQIN format (registry/imagename:tag)."
type: str type: str
required: true required: true
service_container_image_creds:
description: Credentials used to authenticate with the registry
type: dict
required: false
default:
username: ""
password: ""
options:
username:
description: Username
type: str
required: true
password:
description: Password
type: str
required: true
service_container_user: service_container_user:
description: The UID to run as inside the container description: The UID to run as inside the container
@@ -150,6 +139,11 @@ argument_specs:
type: str type: str
required: false required: false
default: "" default: ""
mode:
description: Templated file permissions
type: str
required: false
default: "0644"
volume_device: volume_device:
description: >- description: >-
The path of a device which is mounted for the volume. The path of a device which is mounted for the volume.
@@ -315,6 +309,11 @@ argument_specs:
required: false required: false
default: "[]" default: "[]"
elements: str elements: str
entrypoint:
description: Entrypoint to use in the container
type: str
required: false
default: ""
mounts: mounts:
description: List of bind mounts or volumes to be mounted inside the main service container. description: List of bind mounts or volumes to be mounted inside the main service container.
type: list type: list

View File

@@ -5,12 +5,13 @@
vars: vars:
container_name: "{{ _service_additional_container.name }}" container_name: "{{ _service_additional_container.name }}"
container_image: "{{ _service_additional_container.image | default(service_container_image) }}" container_image: "{{ _service_additional_container.image | default(service_container_image) }}"
container_image_creds: "{{ service_container_image_creds }}"
container_command: "{{ _service_additional_container.command | default([]) }}" container_command: "{{ _service_additional_container.command | default([]) }}"
container_entrypoint: "{{ _service_additional_container.entrypoint | default('') }}"
container_user: "{{ service_container_user }}" container_user: "{{ service_container_user }}"
container_mounts: "{{ _service_additional_container_mounts }}" container_mounts: "{{ _service_additional_container_mounts }}"
container_publish_ports: "{{ _service_additional_container.publish_ports | default([]) }}" container_publish_ports: "{{ _service_additional_container.publish_ports | default([]) }}"
container_networks: "{{ _service_container_networks }}" container_networks: "{{ _service_container_networks }}"
container_ip: "{{ _service_additional_container_ip }}"
container_secrets: "{{ _service_additional_container.secrets | default(_service_container_secrets) }}" container_secrets: "{{ _service_additional_container.secrets | default(_service_container_secrets) }}"
container_env: "{{ _service_additional_container.env | default(service_container_env) }}" container_env: "{{ _service_additional_container.env | default(service_container_env) }}"
container_requires: "{{ _service_container_requires }}" container_requires: "{{ _service_container_requires }}"
@@ -19,3 +20,4 @@
loop: "{{ _service_additional_containers }}" loop: "{{ _service_additional_containers }}"
loop_control: loop_control:
loop_var: _service_additional_container loop_var: _service_additional_container
index_var: _service_additional_container_index

View File

@@ -10,6 +10,8 @@
source: "{{ _service_database_name }}" source: "{{ _service_database_name }}"
destination: /var/lib/postgresql/data destination: /var/lib/postgresql/data
container_networks: "{{ _service_database_networks }}" container_networks: "{{ _service_database_networks }}"
container_ip: >-
{{ service_container_ip | ansible.utils.ipmath(1) if _service_static_ip else '' }}
container_secrets: container_secrets:
- name: "{{ _service_database_name }}" - name: "{{ _service_database_name }}"
target: "{{ service_database_type }}" target: "{{ service_database_type }}"

View File

@@ -28,8 +28,8 @@
vars: vars:
container_name: "{{ service_name }}" container_name: "{{ service_name }}"
container_image: "{{ service_container_image }}" container_image: "{{ service_container_image }}"
container_image_creds: "{{ service_container_image_creds }}"
container_command: "{{ service_container_command }}" container_command: "{{ service_container_command }}"
container_entrypoint: "{{ service_container_entrypoint }}"
container_user: "{{ service_container_user }}" container_user: "{{ service_container_user }}"
container_mounts: "{{ _service_container_mounts }}" container_mounts: "{{ _service_container_mounts }}"
container_publish_ports: "{{ service_container_publish_ports }}" container_publish_ports: "{{ service_container_publish_ports }}"
@@ -47,6 +47,8 @@
vars: vars:
socat_service_name: "{{ service_name }}" socat_service_name: "{{ service_name }}"
socat_target_http_port: "{{ service_container_http_port }}" socat_target_http_port: "{{ service_container_http_port }}"
socat_container_ip: >-
{{ service_container_ip | ansible.utils.ipmath(3) if _service_static_ip else '' }}
- name: Reverse proxy for {{ service_name }} - name: Reverse proxy for {{ service_name }}
ansible.builtin.include_tasks: proxy.yaml ansible.builtin.include_tasks: proxy.yaml

View File

@@ -34,3 +34,4 @@
vars: vars:
socat_service_name: "{{ service_name }}-oauth2-proxy" socat_service_name: "{{ service_name }}-oauth2-proxy"
socat_target_http_port: 4180 socat_target_http_port: 4180
socat_container_ip: ""

View File

@@ -7,4 +7,6 @@
container_image: docker.io/valkey/valkey:alpine container_image: docker.io/valkey/valkey:alpine
container_networks: container_networks:
- "{{ service_name }}" - "{{ service_name }}"
container_ip: >-
{{ service_container_ip | ansible.utils.ipmath(2) if _service_static_ip else '' }}
container_auto_update: "{{ service_auto_update }}" container_auto_update: "{{ service_auto_update }}"

View File

@@ -18,6 +18,7 @@
container_user: nobody container_user: nobody
container_networks: container_networks:
- "{{ socat_service_name }}" - "{{ socat_service_name }}"
container_ip: "{{ socat_container_ip }}"
container_requires: container_requires:
- "{{ socat_service_name }}-socat.socket" - "{{ socat_service_name }}-socat.socket"
- "{{ socat_service_name }}.service" - "{{ socat_service_name }}.service"

View File

@@ -22,6 +22,6 @@
ansible.builtin.template: ansible.builtin.template:
src: "{{ item[0].source }}" src: "{{ item[0].source }}"
dest: "{{ item[1] }}" dest: "{{ item[1] }}"
mode: "0644" mode: "{{ item[0].mode | default('0644') }}"
notify: Restart container service {{ service_name }} notify: Restart container service {{ service_name }}
loop: "{{ _service_all_template_mounts | zip(_service_all_template_mount_host_files) }}" loop: "{{ _service_all_template_mounts | zip(_service_all_template_mount_host_files) }}"

View File

@@ -1,4 +1,9 @@
--- ---
- name: Fail if service_name is empty
ansible.builtin.fail:
msg: service_name must not be empty
when: service_name | length == 0
- name: Fail if service_container_user is not string - name: Fail if service_container_user is not string
ansible.builtin.fail: ansible.builtin.fail:
msg: "service_container_user must be a string, not int." msg: "service_container_user must be a string, not int."

View File

@@ -11,6 +11,13 @@ _service_additional_containers: >-
| map('combine') | map('combine')
}} }}
_service_additional_container_ip: >-
{{
service_container_ip |
ansible.utils.ipmath(20 + _service_additional_container_index)
if _service_static_ip else ''
}}
_service_additional_volume_mounts: "{{ _service_additional_container.mounts | selectattr('type', '==', 'volume') }}" _service_additional_volume_mounts: "{{ _service_additional_container.mounts | selectattr('type', '==', 'volume') }}"
_service_additional_template_mounts: "{{ _service_additional_container.mounts | selectattr('type', '==', 'template') }}" _service_additional_template_mounts: "{{ _service_additional_container.mounts | selectattr('type', '==', 'template') }}"
@@ -32,12 +39,13 @@ _service_additional_container_template_mounts: >-
{{ {{
([{'readonly': true}] * _service_additional_template_mounts | length) | ([{'readonly': true}] * _service_additional_template_mounts | length) |
zip( zip(
_service_additional_template_mounts,
_service_additional_template_mounts | _service_additional_template_mounts |
map(attribute='source') | community.general.remove_keys(['mode']),
map('regex_replace', '\.j2$', '') | _service_additional_template_mounts |
map('regex_replace', '^', _service_host_directory ~ '/mounts/') | map(attribute='source') |
map('community.general.dict_kv', 'source'), map('regex_replace', '\.j2$', '') |
map('regex_replace', '^', _service_host_directory ~ '/mounts/') |
map('community.general.dict_kv', 'source'),
([{'type': 'bind'}] * _service_additional_template_mounts | length) ([{'type': 'bind'}] * _service_additional_template_mounts | length)
) | ) |
map('combine') map('combine')

View File

@@ -1,5 +1,6 @@
--- ---
_service_container_networks: "{{ [service_name] + service_container_additional_networks }}" _service_container_networks: "{{ [service_name] + service_container_additional_networks }}"
_service_static_ip: "{{ service_container_ip | length > 0 }}"
_service_container_requires: >- _service_container_requires: >-
{{ {{
@@ -11,6 +12,7 @@ _service_container_wants: >-
{{ {{
service_wants service_wants
+ ([service_name + '-socat.socket'] if service_container_http_port > 0 else []) + ([service_name + '-socat.socket'] if service_container_http_port > 0 else [])
+ ([service_name + '-oauth2-proxy-socat.socket'] if _service_oauth2_proxy else [])
+ _service_additional_containers + _service_additional_containers
| map(attribute='name') | map(attribute='name')
| map('regex_replace', '$', '.service') | map('regex_replace', '$', '.service')

View File

@@ -19,12 +19,13 @@ _service_container_template_mounts: >-
{{ {{
([{'readonly': true}] * _service_template_mounts | length) | ([{'readonly': true}] * _service_template_mounts | length) |
zip( zip(
_service_template_mounts,
_service_template_mounts | _service_template_mounts |
map(attribute='source') | community.general.remove_keys(['mode']),
map('regex_replace', '\.j2$', '') | _service_template_mounts |
map('regex_replace', '^', _service_host_directory ~ '/mounts/') | map(attribute='source') |
map('community.general.dict_kv', 'source'), map('regex_replace', '\.j2$', '') |
map('regex_replace', '^', _service_host_directory ~ '/mounts/') |
map('community.general.dict_kv', 'source'),
([{'type': 'bind'}] * _service_template_mounts | length) ([{'type': 'bind'}] * _service_template_mounts | length)
) | ) |
map('combine') map('combine')
@@ -44,7 +45,7 @@ _service_all_template_mounts: >-
_service_template_mounts + _service_template_mounts +
( (
_service_additional_containers | _service_additional_containers |
map(attribute='mounts') | map(attribute='mounts', default=[]) |
flatten flatten
) )
) | ) |