Compare commits

...

20 Commits

Author SHA1 Message Date
uumas b2540e2bd3 service: Validation, set default for mounts in additional containers 2025-11-26 22:26:12 +02:00
uumas 47088fd1a0 Allow setting container entrypoint 2025-11-26 22:25:54 +02:00
uumas abf3859af7 container: rename task 2025-10-06 16:18:16 +03:00
uumas bdec55ffc7 Use auth file instead of creds in quadlet files 2025-10-06 16:17:27 +03:00
uumas 2712cf2865 service: Support setting templated file mode 2025-09-16 12:37:20 +03:00
uumas c5fb7f599c podman: Login to registries 2025-09-16 12:36:58 +03:00
uumas 597faa3fd5 service: Set static ips for other containers too 2025-09-15 12:34:57 +03:00
uumas 16babfd5ed service: Have service depend on oauth2-proxy socat socket if set 2025-09-15 12:34:21 +03:00
uumas d3542993dd container: Set network ip range when using a static container ip 2025-09-15 12:32:22 +03:00
uumas a93c26864d network: Support setting ip range 2025-09-15 12:31:40 +03:00
uumas b333bbebbd Add prometheus role 2025-09-14 03:21:33 +03:00
uumas fea49be8d1 Use service-specific oauth2-proxy instances 2025-09-14 03:10:20 +03:00
uumas aaca377811 service: Support static ip for service container 2025-09-14 03:09:28 +03:00
uumas 0b73582f36 container: Support static ip for container 2025-09-14 03:08:24 +03:00
uumas ad50e05ee9 network: Support static subnet 2025-09-14 03:07:28 +03:00
uumas 12f32f5824 network: Support macvlan driver 2025-09-14 03:07:03 +03:00
uumas 586f98bc9f synapse: Use federation port 8448 2025-09-14 03:05:06 +03:00
uumas a29908b507 podman: Ensure auto update timer is enabled 2025-09-14 03:04:22 +03:00
uumas c96997a4ec lint 2025-09-13 17:36:05 +03:00
uumas 014edb08ac service: fix template mounts for additional containers 2025-08-28 11:02:35 +03:00
45 changed files with 884 additions and 153 deletions

View File

@@ -1,15 +1,14 @@
---
container_command: []
container_entrypoint: ""
container_user: ""
container_mounts: []
container_publish_ports: []
container_networks: []
container_ip: ""
container_secrets: []
container_env: {}
container_auto_start: true
container_auto_update: true
container_requires: []
container_wants: []
container_image_creds:
username: ""
password: ""

View File

@@ -13,6 +13,11 @@ argument_specs:
required: false
default: []
elements: str
container_entrypoint:
description: Entrypoint to use for the container
type: str
required: false
default: ""
container_user:
description: The UID to run as inside the container
type: str
@@ -23,22 +28,6 @@ argument_specs:
description: "The image to run in the container, in FQIN format (registry/imagename:tag)"
type: str
required: true
container_image_creds:
description: Credentials used to authenticate with the registry
type: dict
required: false
default:
username: ""
password: ""
options:
username:
description: Username
type: str
required: true
password:
description: Password
type: str
required: true
container_mounts:
description: List of bind mounts or volumes to be mounted inside the container.
@@ -115,6 +104,11 @@ argument_specs:
required: false
default: []
elements: str
container_ip:
description: IPv4 address for the container in the first network defined in container_networks
type: str
required: false
default: ""
container_secrets:
description: A list of secrets available to the container as file or environment variable
type: list

View File

@@ -7,7 +7,6 @@
name: image
vars:
image_name: "{{ container_image }}"
image_creds: "{{ container_image_creds }}"
when: image_created_images is not defined or container_image not in image_created_images
- name: Create networks for container {{ container_name }}
@@ -15,10 +14,13 @@
name: network
vars:
network_name: "{{ network }}"
network_subnet: "{{ _container_network_subnet if network_index == 0 else '' }}"
network_range: "{{ _container_network_range if network_index == 0 else '' }}"
when: network_created_networks is not defined or network not in network_created_networks
loop: "{{ container_networks }}"
loop_control:
loop_var: network
index_var: network_index
- name: Create volumes for container {{ container_name }}
ansible.builtin.include_role:
@@ -38,17 +40,19 @@
ansible.builtin.include_tasks: secrets.yaml
when: container_secrets | length > 0
- name: Create container service {{ container_name }}
- name: Create container {{ container_name }}
containers.podman.podman_container:
image: "{{ _container_image }}"
name: "{{ container_name }}"
command: "{{ container_command or omit }}"
entrypoint: "{{ container_entrypoint or omit }}"
user: "{{ container_user or omit }}"
mount: "{{ _container_mounts | map('items') | map('map', 'join', '=') | map('join', ',') }}"
network: "{{ container_networks | map('regex_replace', '$', '.network') }}"
network: "{{ _container_networks_with_ip }}"
publish: "{{ container_publish_ports }}"
secrets: "{{ _container_secrets }}"
env: "{{ container_env }}"
label: "{{ _container_labels if _container_labels | length > 0 else omit }}"
state: quadlet
quadlet_file_mode: "0600"
quadlet_options: "{{ _container_quadlet_options }}"
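
For context on the mount expression in the hunk above: it flattens each mount dict into Podman's key=value --mount syntax. A minimal sketch of how one illustrative entry resolves (the values below are assumptions for illustration, not taken from this repository):

  # assumed input
  container_mounts:
    - type: bind
      source: /srv/app/config
      destination: /config

  # map('items')            -> [[('type', 'bind'), ('source', '/srv/app/config'), ('destination', '/config')]]
  # map('map', 'join', '=') -> [['type=bind', 'source=/srv/app/config', 'destination=/config']]
  # map('join', ',')        -> ['type=bind,source=/srv/app/config,destination=/config']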

View File

@@ -1,6 +1,33 @@
---
_container_image: "{{ container_image | replace('/', '_') ~ '.image' }}"
_container_networks: "{{ container_networks | map('regex_replace', '$', '.network') }}"
_container_networks_with_ip: >-
{{
[
_container_networks[0] ~ (
':ip=' ~ container_ip if container_ip | length > 0 else ''
)
]
+ _container_networks[1:]
}}
_container_network_subnet: >-
{{ container_ip | ansible.utils.ipsubnet(24) if container_ip | length > 0 else '' }}
_container_network_subnet_ranges: >-
{{
[
_container_network_subnet | ansible.utils.ipsubnet(25, 0),
_container_network_subnet | ansible.utils.ipsubnet(25, 1)
] if container_ip | length > 0 else ''
}}
_container_network_range: >-
{{
_container_network_subnet_ranges |
reject('ansible.utils.supernet_of', container_ip) |
first
if container_ip | length > 0 else ''
}}
_container_volumes: "{{ container_mounts | selectattr('type', '==', 'volume') }}"
_container_mount_sources: "{{ container_mounts | map(attribute='source') }}"
@@ -41,6 +68,15 @@ _container_secrets: >-
| map('join', ',')
}}
_container_labels: >-
{{
{'io.containers.autoupdate.authfile': '/etc/containers/auth.json'}
if container_auto_update and
container_image.split('/')[0] in
podman_registry_accounts | map(attribute='registry')
else {}
}}
_container_quadlet_unit_options: |
[Unit]
Description=Container {{ container_name }}
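
A hedged worked example of the static-IP variables above, assuming container_networks: ['web'] and container_ip: '10.89.1.10' (illustrative values only):

  # _container_networks              -> ['web.network']
  # _container_networks_with_ip      -> ['web.network:ip=10.89.1.10']   (only the first network gets the ip= suffix)
  # _container_network_subnet        -> '10.89.1.0/24'                  (ipsubnet(24) of the container IP)
  # _container_network_subnet_ranges -> ['10.89.1.0/25', '10.89.1.128/25']
  # _container_network_range         -> '10.89.1.128/25'                (the /25 that does not contain the static IP)

As far as the logic shows, the intent is that Podman allocates dynamic addresses only from the half of the /24 that does not contain the statically assigned IP, so the two cannot collide.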

View File

@@ -1,4 +0,0 @@
---
image_creds:
username: ""
password: ""

View File

@@ -9,19 +9,3 @@ argument_specs:
description: "The image FQIN (format registry/imagename:tag)"
type: str
required: true
image_creds:
description: Credentials used to authenticate with the registry
type: dict
required: false
default:
username: ""
password: ""
options:
username:
description: Username
type: str
required: true
password:
description: Password
type: str
required: true

View File

@@ -6,9 +6,14 @@
- name: Create container image service {{ image_name }}
containers.podman.podman_image:
name: "{{ image_name }}"
username: "{{ image_creds.username if image_creds.username | length > 0 else omit }}"
password: "{{ image_creds.password if image_creds.password | length > 0 else omit }}"
state: quadlet
quadlet_filename: "{{ image_name | replace('/', '_') }}"
quadlet_file_mode: "0600"
quadlet_options: >-
{{
['AuthFile=/etc/containers/auth.json']
if image_name.split('/')[0] in
podman_registry_accounts | map(attribute='registry')
else []
}}
notify: Reload systemd daemon

View File

@@ -0,0 +1,4 @@
---
network_driver: bridge
network_subnet: ""
network_range: ""

View File

@@ -0,0 +1,7 @@
---
- name: Restart network service {{ network_name }}
ansible.builtin.systemd_service:
name: "{{ network_name }}-network.service"
state: restarted
daemon_reload: true
ignore_errors: "{{ ansible_check_mode }}"

View File

@@ -7,3 +7,21 @@ argument_specs:
description: Name of the network. Must be unique within a host.
type: str
required: true
network_driver:
description: Driver to manage the network
type: str
required: false
default: bridge
choices:
- bridge
- macvlan
network_subnet:
description: Subnet for the network
type: str
required: false
default: ""
network_range:
description: Range to allocate ip addresses from
type: str
required: false
default: ""

View File

@@ -1,10 +1,18 @@
---
- name: "Create container network service {{ network_name }}"
- name: Create container network service {{ network_name }}
containers.podman.podman_network:
name: "{{ network_name }}"
state: quadlet
quadlet_file_mode: "0644"
notify: Reload systemd daemon
driver: "{{ network_driver }}"
subnet: "{{ network_subnet if network_subnet | length > 0 else omit }}"
ip_range: "{{ network_range if network_range | length > 0 else omit }}"
quadlet_options: >-
{{
['Options=parent=' ~ ansible_default_ipv4.interface]
if network_driver == 'macvlan' else []
}}
notify: Restart network service {{ network_name }}
- name: Add network to created networks variable
ansible.builtin.set_fact:

View File

@@ -1 +0,0 @@
Sets up a oauth2-proxy container

View File

@@ -1,17 +0,0 @@
---
argument_specs:
main:
description: "Sets up a oauth2-proxy container"
options:
oauth2_proxy_oidc_issuer_url:
description: the OpenID Connect issuer URL
type: str
required: true
oauth2_proxy_client_id:
description: the OAuth client ID
type: str
required: true
oauth2_proxy_client_secret:
description: the OAuth client secret
type: str
required: true

View File

@@ -1,24 +0,0 @@
---
- name: OAuth2 Proxy
ansible.builtin.import_role:
name: service
vars:
service_name: oauth2-proxy
service_container_image: "quay.io/oauth2-proxy/oauth2-proxy:latest-alpine"
service_container_http_port: 4180
service_container_command:
- --config
- /oauth2-proxy.cfg
- --client-secret-file
- /run/secrets/client_secret
service_container_mounts:
- type: template
source: oauth2-proxy.cfg.j2
destination: /oauth2-proxy.cfg
service_container_secrets:
- name: cookie_secret
length: 32
type: env
target: OAUTH2_PROXY_COOKIE_SECRET
- name: client_secret
value: "{{ oauth2_proxy_client_secret }}"

View File

@@ -1,11 +0,0 @@
# OAuth2 Proxy Configuration
http_address = "0.0.0.0:4180"
# OIDC Provider Configuration
provider = "oidc"
oidc_issuer_url = "{{ oauth2_proxy_oidc_issuer_url }}"
client_id = "{{ oauth2_proxy_client_id }}"
code_challenge_method = "S256"
skip_provider_button = "true"
email_domains = "*"

View File

@@ -0,0 +1,2 @@
---
podman_registry_accounts: []

View File

@@ -2,4 +2,23 @@
argument_specs:
main:
description: Installs podman
options: {}
options:
podman_registry_accounts:
description: List of accounts for container registries
type: list
required: false
default: []
elements: dict
options:
registry:
description: Registry server to login to
type: str
required: true
username:
description: Username
type: str
required: true
password:
description: Password / token
type: str
required: true

View File

@@ -4,3 +4,18 @@
name:
- podman
- aardvark-dns
- name: Ensure podman auto update timer is enabled
ansible.builtin.systemd_service:
name: podman-auto-update.timer
state: started
enabled: true
- name: Login to registries
containers.podman.podman_login:
registry: "{{ item.registry }}"
username: "{{ item.username }}"
password: "{{ item.password }}"
authfile: /etc/containers/auth.json
loop: "{{ podman_registry_accounts }}"
no_log: true
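
A minimal sketch of how podman_registry_accounts might be supplied from inventory or group_vars, matching the argument spec above (the registry host and credentials are placeholders):

  podman_registry_accounts:
    - registry: registry.example.com            # assumed registry host
      username: deploy                          # assumed account name
      password: "{{ vault_registry_token }}"    # assumed vault-backed secret

The login task persists the credentials to /etc/containers/auth.json, which the image and container quadlets reference via AuthFile=/etc/containers/auth.json, and no_log: true keeps the values out of task output.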

View File

@@ -0,0 +1 @@
Installs and configures prometheus

View File

@@ -0,0 +1,4 @@
---
prometheus_additional_networks: []
prometheus_ping_hosts: []

View File

@@ -0,0 +1,35 @@
---
argument_specs:
main:
description: Installs and configures prometheus
options:
prometheus_additional_networks:
description: >-
A list of additional podman networks for the prometheus container (in
addition to the prometheus network).
type: list
required: false
default: []
elements: str
prometheus_ping_hosts:
description: List of hosts to ping
type: list
required: false
default: []
elements: dict
options:
name:
description: Hostname to ping
type: str
required: true
type:
description: >-
Type of host. Monitored hosts are pinged to check if they are up.
Wan hosts are pinged to check if prometheus has internet access.
type: str
required: false
default: monitored
choices:
- monitored
- wan
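
A minimal sketch of prometheus_ping_hosts values matching this spec (the hostnames are placeholders):

  prometheus_ping_hosts:
    - name: router.lan          # assumed internal host; type defaults to 'monitored'
    - name: one.one.one.one     # assumed external host used to verify internet access
      type: wan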

View File

@@ -0,0 +1,28 @@
---
- name: Prometheus
ansible.builtin.import_role:
name: service
vars:
service_name: prometheus
service_container_image: "docker.io/prom/prometheus:latest"
service_container_mounts:
- type: template
source: prometheus.yml.j2
destination: /etc/prometheus/prometheus.yml
- type: volume
source: data
destination: /prometheus
- type: template
source: alerting/node-exporter.yaml.j2
destination: /etc/prometheus/alerting/node-exporter.yaml
- type: template
source: alerting/blackbox-exporter.yaml.j2
destination: /etc/prometheus/alerting/blackbox-exporter.yaml
service_container_additional_networks: "{{ prometheus_additional_networks }}"
service_additional_containers:
- name: blackbox-exporter
image: docker.io/prom/blackbox-exporter:latest
mounts:
- type: template
source: blackbox_exporter.yml.j2
destination: /etc/blackbox_exporter/config.yml

View File

@@ -0,0 +1,97 @@
{% raw %}
groups:
- name: BlackboxExporter
rules:
- alert: BlackboxAllWanProbesFailed
expr: 'sum by (host_type) (probe_success{host_type="wan"})==0'
for: 5s
labels:
severity: critical
annotations:
summary: Lost internet access
description: Failed to contact any wan probes
- alert: BlackboxProbeFailed
expr: 'probe_success == 0'
for: 0m
labels:
severity: critical
annotations:
summary: Blackbox probe failed (instance {{ $labels.instance }})
description: "Probe failed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: BlackboxConfigurationReloadFailure
expr: 'blackbox_exporter_config_last_reload_successful != 1'
for: 0m
labels:
severity: warning
annotations:
summary: Blackbox configuration reload failure (instance {{ $labels.instance }})
description: "Blackbox configuration reload failure\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: BlackboxSlowProbe
expr: 'avg_over_time(probe_duration_seconds[1m]) > 1'
for: 1m
labels:
severity: warning
annotations:
summary: Blackbox slow probe (instance {{ $labels.instance }})
description: "Blackbox probe took more than 1s to complete\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: BlackboxProbeHttpFailure
expr: 'probe_http_status_code <= 199 OR probe_http_status_code >= 400'
for: 0m
labels:
severity: critical
annotations:
summary: Blackbox probe HTTP failure (instance {{ $labels.instance }})
description: "HTTP status code is not 200-399\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: BlackboxSslCertificateWillExpireSoon
expr: '3 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 20'
for: 0m
labels:
severity: warning
annotations:
summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
description: "SSL certificate expires in less than 20 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: BlackboxSslCertificateWillExpireSoon
expr: '0 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 3'
for: 0m
labels:
severity: critical
annotations:
summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
description: "SSL certificate expires in less than 3 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: BlackboxSslCertificateExpired
expr: 'round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 0'
for: 0m
labels:
severity: critical
annotations:
summary: Blackbox SSL certificate expired (instance {{ $labels.instance }})
description: "SSL certificate has expired already\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: BlackboxProbeSlowHttp
expr: 'avg_over_time(probe_http_duration_seconds[1m]) > 1'
for: 1m
labels:
severity: warning
annotations:
summary: Blackbox probe slow HTTP (instance {{ $labels.instance }})
description: "HTTP request took more than 1s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: BlackboxProbeSlowPing
expr: 'avg_over_time(probe_icmp_duration_seconds[1m]) > 1'
for: 1m
labels:
severity: warning
annotations:
summary: Blackbox probe slow ping (instance {{ $labels.instance }})
description: "Blackbox ping took more than 1s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
{% endraw %}

View File

@@ -0,0 +1,322 @@
{% raw %}
groups:
- name: NodeExporter
rules:
- alert: HostOutOfMemory
expr: '(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes < .10)'
for: 2m
labels:
severity: warning
annotations:
summary: Host out of memory (instance {{ $labels.instance }})
description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostMemoryUnderMemoryPressure
expr: '(rate(node_vmstat_pgmajfault[5m]) > 1000)'
for: 0m
labels:
severity: warning
annotations:
summary: Host memory under memory pressure (instance {{ $labels.instance }})
description: "The node is under heavy memory pressure. High rate of loading memory pages from disk.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostMemoryIsUnderutilized
expr: 'min_over_time(node_memory_MemFree_bytes[1w]) > node_memory_MemTotal_bytes * .8'
for: 0m
labels:
severity: info
annotations:
summary: Host Memory is underutilized (instance {{ $labels.instance }})
description: "Node memory usage is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualNetworkThroughputIn
expr: '((rate(node_network_receive_bytes_total[5m]) / on(instance, device) node_network_speed_bytes) > .80)'
for: 0m
labels:
severity: warning
annotations:
summary: Host unusual network throughput in (instance {{ $labels.instance }})
description: "Host receive bandwidth is high (>80%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualNetworkThroughputOut
expr: '((rate(node_network_transmit_bytes_total[5m]) / on(instance, device) node_network_speed_bytes) > .80)'
for: 0m
labels:
severity: warning
annotations:
summary: Host unusual network throughput out (instance {{ $labels.instance }})
description: "Host transmit bandwidth is high (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskReadRate
expr: '(rate(node_disk_io_time_seconds_total[5m]) > .80)'
for: 0m
labels:
severity: warning
annotations:
summary: Host unusual disk read rate (instance {{ $labels.instance }})
description: "Disk is too busy (IO wait > 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostOutOfDiskSpace
expr: '(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} / node_filesystem_size_bytes < .10 and on (instance, device, mountpoint) node_filesystem_readonly == 0)'
for: 2m
labels:
severity: critical
annotations:
summary: Host out of disk space (instance {{ $labels.instance }})
description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostDiskMayFillIn24Hours
expr: 'predict_linear(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[3h], 86400) <= 0 and node_filesystem_avail_bytes > 0'
for: 2m
labels:
severity: warning
annotations:
summary: Host disk may fill in 24 hours (instance {{ $labels.instance }})
description: "Filesystem will likely run out of space within the next 24 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostOutOfInodes
expr: '(node_filesystem_files_free / node_filesystem_files < .10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0)'
for: 2m
labels:
severity: critical
annotations:
summary: Host out of inodes (instance {{ $labels.instance }})
description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostFilesystemDeviceError
expr: 'node_filesystem_device_error{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} == 1'
for: 2m
labels:
severity: critical
annotations:
summary: Host filesystem device error (instance {{ $labels.instance }})
description: "Error stat-ing the {{ $labels.mountpoint }} filesystem\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostInodesMayFillIn24Hours
expr: 'predict_linear(node_filesystem_files_free{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[1h], 86400) <= 0 and node_filesystem_files_free > 0'
for: 2m
labels:
severity: warning
annotations:
summary: Host inodes may fill in 24 hours (instance {{ $labels.instance }})
description: "Filesystem will likely run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskReadLatency
expr: '(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0)'
for: 2m
labels:
severity: warning
annotations:
summary: Host unusual disk read latency (instance {{ $labels.instance }})
description: "Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskWriteLatency
expr: '(rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0)'
for: 2m
labels:
severity: warning
annotations:
summary: Host unusual disk write latency (instance {{ $labels.instance }})
description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostHighCpuLoad
expr: '1 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) > .80'
for: 10m
labels:
severity: warning
annotations:
summary: Host high CPU load (instance {{ $labels.instance }})
description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostCpuIsUnderutilized
expr: '(min by (instance) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > 0.8'
for: 1w
labels:
severity: info
annotations:
summary: Host CPU is underutilized (instance {{ $labels.instance }})
description: "CPU load has been < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostCpuStealNoisyNeighbor
expr: 'avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10'
for: 0m
labels:
severity: warning
annotations:
summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostCpuHighIowait
expr: 'avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > .10'
for: 0m
labels:
severity: warning
annotations:
summary: Host CPU high iowait (instance {{ $labels.instance }})
description: "CPU iowait > 10%. Your CPU is idling waiting for storage to respond.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskIo
expr: 'rate(node_disk_io_time_seconds_total[5m]) > 0.8'
for: 5m
labels:
severity: warning
annotations:
summary: Host unusual disk IO (instance {{ $labels.instance }})
description: "Disk usage >80%. Check storage for issues or increase IOPS capabilities. Check storage for issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostContextSwitchingHigh
expr: '(rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) / (rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2'
for: 0m
labels:
severity: warning
annotations:
summary: Host context switching high (instance {{ $labels.instance }})
description: "Context switching is growing on the node (twice the daily average during the last 15m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostSwapIsFillingUp
expr: '((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80)'
for: 2m
labels:
severity: warning
annotations:
summary: Host swap is filling up (instance {{ $labels.instance }})
description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostSystemdServiceCrashed
expr: '(node_systemd_unit_state{state="failed"} == 1)'
for: 0m
labels:
severity: warning
annotations:
summary: Host systemd service crashed (instance {{ $labels.instance }})
description: "systemd service crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostPhysicalComponentTooHot
expr: 'node_hwmon_temp_celsius > node_hwmon_temp_max_celsius'
for: 5m
labels:
severity: warning
annotations:
summary: Host physical component too hot (instance {{ $labels.instance }})
description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNodeOvertemperatureAlarm
expr: '((node_hwmon_temp_crit_alarm_celsius == 1) or (node_hwmon_temp_alarm == 1))'
for: 0m
labels:
severity: critical
annotations:
summary: Host node overtemperature alarm (instance {{ $labels.instance }})
description: "Physical node temperature alarm triggered\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostSoftwareRaidInsufficientDrives
expr: '((node_md_disks_required - on(device, instance) node_md_disks{state="active"}) > 0)'
for: 0m
labels:
severity: critical
annotations:
summary: Host software RAID insufficient drives (instance {{ $labels.instance }})
description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} has insufficient drives remaining.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostSoftwareRaidDiskFailure
expr: '(node_md_disks{state="failed"} > 0)'
for: 2m
labels:
severity: warning
annotations:
summary: Host software RAID disk failure (instance {{ $labels.instance }})
description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} needs attention.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostKernelVersionDeviations
expr: 'changes(node_uname_info[1h]) > 0'
for: 0m
labels:
severity: info
annotations:
summary: Host kernel version deviations (instance {{ $labels.instance }})
description: "Kernel version for {{ $labels.instance }} has changed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostOomKillDetected
expr: '(increase(node_vmstat_oom_kill[1m]) > 0)'
for: 0m
labels:
severity: warning
annotations:
summary: Host OOM kill detected (instance {{ $labels.instance }})
description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostEdacCorrectableErrorsDetected
expr: '(increase(node_edac_correctable_errors_total[1m]) > 0)'
for: 0m
labels:
severity: info
annotations:
summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostEdacUncorrectableErrorsDetected
expr: '(node_edac_uncorrectable_errors_total > 0)'
for: 0m
labels:
severity: warning
annotations:
summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNetworkReceiveErrors
expr: '(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01)'
for: 2m
labels:
severity: warning
annotations:
summary: Host Network Receive Errors (instance {{ $labels.instance }})
description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNetworkTransmitErrors
expr: '(rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01)'
for: 2m
labels:
severity: warning
annotations:
summary: Host Network Transmit Errors (instance {{ $labels.instance }})
description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNetworkBondDegraded
expr: '((node_bonding_active - node_bonding_slaves) != 0)'
for: 2m
labels:
severity: warning
annotations:
summary: Host Network Bond Degraded (instance {{ $labels.instance }})
description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostConntrackLimit
expr: '(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8)'
for: 5m
labels:
severity: warning
annotations:
summary: Host conntrack limit (instance {{ $labels.instance }})
description: "The number of conntrack is approaching limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostClockSkew
expr: '((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0))'
for: 10m
labels:
severity: warning
annotations:
summary: Host clock skew (instance {{ $labels.instance }})
description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostClockNotSynchronising
expr: '(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16)'
for: 2m
labels:
severity: warning
annotations:
summary: Host clock not synchronising (instance {{ $labels.instance }})
description: "Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
{% endraw %}

View File

@@ -0,0 +1,5 @@
---
modules:
icmp:
prober: icmp
timeout: 5s

View File

@@ -0,0 +1,55 @@
---
# {{ ansible_managed }}
global:
scrape_interval: 10s
evaluation_interval: 10s
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
- "/etc/prometheus/recording/*.yaml"
- "/etc/prometheus/alerting/*.yaml"
scrape_configs:
- job_name: prometheus
static_configs:
- targets:
- localhost:9090
- job_name: blackbox
static_configs:
- targets:
- blackbox-exporter:9115
- job_name: node
static_configs:
- targets:
- host.containers.internal:9100
{% if prometheus_ping_hosts | length > 0 %}
- job_name: "icmp"
metrics_path: "/probe"
params:
module: ["icmp"]
static_configs:
- targets:
{% for host in prometheus_ping_hosts %}
- "{{ host.name }}::{{ host.type | default('monitored') }}"
{% endfor %}
relabel_configs:
- source_labels:
- __address__
regex: '(.+)::(.+)'
target_label: __param_target
replacement: '${1}'
- source_labels:
- __address__
regex: '(.+)::(.+)'
target_label: host_type
replacement: '${2}'
- source_labels:
- __param_target
target_label: instance
- target_label: __address__
replacement: blackbox-exporter:9115
{%- endif %}
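
To trace the relabelling above with an assumed entry prometheus_ping_hosts: [{name: router.lan, type: monitored}] (placeholder host):

  # static target          -> 'router.lan::monitored'
  # rule 1: __param_target -> 'router.lan'    (capture group 1)
  # rule 2: host_type      -> 'monitored'     (capture group 2)
  # rule 3: instance       -> 'router.lan'    (copied from __param_target)
  # rule 4: __address__    -> the blackbox exporter address on port 9115

Prometheus therefore scrapes the blackbox exporter at /probe?module=icmp&target=router.lan, while the host_type label drives alerts such as BlackboxAllWanProbesFailed.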

View File

@@ -1,5 +1,6 @@
---
service_container_command: []
service_container_entrypoint: ""
service_domains: []
service_container_http_port: 0
@@ -7,6 +8,7 @@ service_vhost_locations: []
service_proxy_pass_host_header: true
service_proxy_auth_type: none
service_container_ip: ""
service_container_additional_networks: []
service_container_user: ""
service_container_publish_ports: []
@@ -26,7 +28,3 @@ service_additional_containers: []
service_requires: []
service_wants: []
service_auto_update: true
service_container_image_creds:
username: ""
password: ""

View File

@@ -1,7 +1,14 @@
---
- name: "Restart socat socket for {{ service_name }}"
- name: Restart socat socket for {{ service_name }}
ansible.builtin.systemd_service:
name: "{{ service_name }}-socat.socket"
state: restarted
daemon_reload: true
ignore_errors: '{{ ansible_check_mode }}'
- name: Restart socat socket for {{ service_name ~ '-oauth2-proxy' }}
ansible.builtin.systemd_service:
name: "{{ service_name }}-oauth2-proxy-socat.socket"
state: restarted
daemon_reload: true
ignore_errors: '{{ ansible_check_mode }}'

View File

@@ -14,6 +14,11 @@ argument_specs:
required: false
default: []
elements: str
service_container_entrypoint:
description: Entrypoint to use in the service container
type: str
required: false
default: ""
service_domains:
description: A list of domains which should be proxied to the main service container
@@ -36,7 +41,6 @@ argument_specs:
service_proxy_auth_type:
description: >-
Set to oauth2-proxy to use OAuth2 Proxy for vhost authentication.
The oauth2-proxy role must be run separately.
type: str
required: false
default: none
@@ -49,32 +53,35 @@ argument_specs:
required: false
default: []
service_oauth2_proxy_issuer_url:
description: >-
OpenID Connect issuer URL. Required if service_proxy_auth_type is oauth2-proxy.
type: str
required: false
oauth2_proxy_client_id:
description: OAuth client ID. Required if service_proxy_auth_type is oauth2-proxy.
type: str
required: false
oauth2_proxy_client_secret:
description: OAuth client secret. Required if service_proxy_auth_type is oauth2-proxy.
type: str
required: false
service_container_image:
description: "The image to run in the service container(s), in FQIN format (registry/imagename:tag)."
type: str
required: true
service_container_image_creds:
description: Credentials used to authenticate with the registry
type: dict
required: false
default:
username: ""
password: ""
options:
username:
description: Username
type: str
required: true
password:
description: Password
type: str
required: true
service_container_user:
description: The UID to run as inside the container
type: str
required: false
default: ""
service_container_ip:
description: Static IP for the container in its network
type: str
required: false
default: ""
service_container_additional_networks:
description: >-
A list of additional podman networks for the service container (in
@@ -132,6 +139,11 @@ argument_specs:
type: str
required: false
default: ""
mode:
description: Templated file permissions
type: str
required: false
default: "0644"
volume_device:
description: >-
The path of a device which is mounted for the volume.
@@ -297,6 +309,11 @@ argument_specs:
required: false
default: "[]"
elements: str
entrypoint:
description: Entrypoint to use in the container
type: str
required: false
default: ""
mounts:
description: List of bind mounts or volumes to be mounted inside the main service container.
type: list

View File

@@ -5,12 +5,13 @@
vars:
container_name: "{{ _service_additional_container.name }}"
container_image: "{{ _service_additional_container.image | default(service_container_image) }}"
container_image_creds: "{{ service_container_image_creds }}"
container_command: "{{ _service_additional_container.command | default([]) }}"
container_entrypoint: "{{ _service_additional_container.entrypoint | default('') }}"
container_user: "{{ service_container_user }}"
container_mounts: "{{ _service_additional_container_mounts }}"
container_publish_ports: "{{ _service_additional_container.publish_ports | default([]) }}"
container_networks: "{{ _service_container_networks }}"
container_ip: "{{ _service_additional_container_ip }}"
container_secrets: "{{ _service_additional_container.secrets | default(_service_container_secrets) }}"
container_env: "{{ _service_additional_container.env | default(service_container_env) }}"
container_requires: "{{ _service_container_requires }}"
@@ -19,3 +20,4 @@
loop: "{{ _service_additional_containers }}"
loop_control:
loop_var: _service_additional_container
index_var: _service_additional_container_index

View File

@@ -10,6 +10,8 @@
source: "{{ _service_database_name }}"
destination: /var/lib/postgresql/data
container_networks: "{{ _service_database_networks }}"
container_ip: >-
{{ service_container_ip | ansible.utils.ipmath(1) if _service_static_ip else '' }}
container_secrets:
- name: "{{ _service_database_name }}"
target: "{{ service_database_type }}"

View File

@@ -28,12 +28,13 @@
vars:
container_name: "{{ service_name }}"
container_image: "{{ service_container_image }}"
container_image_creds: "{{ service_container_image_creds }}"
container_command: "{{ service_container_command }}"
container_entrypoint: "{{ service_container_entrypoint }}"
container_user: "{{ service_container_user }}"
container_mounts: "{{ _service_container_mounts }}"
container_publish_ports: "{{ service_container_publish_ports }}"
container_networks: "{{ _service_container_networks }}"
container_ip: "{{ service_container_ip }}"
container_secrets: "{{ _service_container_secrets }}"
container_env: "{{ service_container_env }}"
container_requires: "{{ _service_container_requires }}"
@@ -43,6 +44,11 @@
- name: Socat for {{ service_name }}
ansible.builtin.include_tasks: socat.yaml
when: service_container_http_port > 0
vars:
socat_service_name: "{{ service_name }}"
socat_target_http_port: "{{ service_container_http_port }}"
socat_container_ip: >-
{{ service_container_ip | ansible.utils.ipmath(3) if _service_static_ip else '' }}
- name: Reverse proxy for {{ service_name }}
ansible.builtin.include_tasks: proxy.yaml

View File

@@ -0,0 +1,37 @@
---
- name: OAuth2 Proxy container for {{ service_name }}
ansible.builtin.import_role:
name: container
vars:
container_name: "{{ service_name }}-oauth2-proxy"
container_image: "quay.io/oauth2-proxy/oauth2-proxy:latest-alpine"
container_command:
- --client-secret-file
- /run/secrets/client-secret
- --cookie-secret-file
- /run/secrets/cookie-secret
container_networks:
- "{{ service_name }}-oauth2-proxy"
container_secrets:
- name: "{{ service_name }}-oauth2-proxy-cookie-secret"
length: 32
target: cookie-secret
- name: "{{ service_name }}-oauth2-proxy-client-secret"
value: "{{ service_oauth2_proxy_client_secret }}"
target: client-secret
container_env:
OAUTH2_PROXY_HTTP_ADDRESS: 0.0.0.0:4180
OAUTH2_PROXY_PROVIDER: oidc
OAUTH2_PROXY_OIDC_ISSUER_URL: "{{ service_oauth2_proxy_issuer_url }}"
OAUTH2_PROXY_CLIENT_ID: "{{ service_oauth2_proxy_client_id }}"
OAUTH2_PROXY_CODE_CHALLENGE_METHOD: S256
OAUTH2_PROXY_SKIP_PROVIDER_BUTTON: "true"
OAUTH2_PROXY_EMAIL_DOMAINS: "*"
container_auto_update: "{{ service_auto_update }}"
- name: Socat for OAuth2 Proxy for {{ service_name }}
ansible.builtin.import_tasks: socat.yaml
vars:
socat_service_name: "{{ service_name }}-oauth2-proxy"
socat_target_http_port: 4180
socat_container_ip: ""

View File

@@ -1,4 +1,8 @@
---
- name: OAuth2 proxy for {{ service_name }}
ansible.builtin.include_tasks: oauth2_proxy.yaml
when: _service_oauth2_proxy
- name: Reverse proxy for {{ service_name }}
ansible.builtin.import_role:
name: uumas.general.vhost

View File

@@ -7,4 +7,6 @@
container_image: docker.io/valkey/valkey:alpine
container_networks:
- "{{ service_name }}"
container_ip: >-
{{ service_container_ip | ansible.utils.ipmath(2) if _service_static_ip else '' }}
container_auto_update: "{{ service_auto_update }}"

View File

@@ -1,26 +1,26 @@
---
- name: Socat socket for {{ service_name }}
- name: Socat socket for {{ socat_service_name }}
ansible.builtin.template:
src: socat.socket.j2
dest: /etc/systemd/system/{{ service_name }}-socat.socket
dest: /etc/systemd/system/{{ socat_service_name }}-socat.socket
mode: "0644"
notify: Restart socat socket for {{ service_name }}
notify: Restart socat socket for {{ socat_service_name }}
- name: Socat container for {{ service_name }}
- name: Socat container for {{ socat_service_name }}
ansible.builtin.import_role:
name: container
vars:
container_name: "{{ service_name }}-socat"
container_name: "{{ socat_service_name }}-socat"
container_image: "docker.io/alpine/socat:latest"
container_command:
- "ACCEPT-FD:3,fork"
- "TCP:{{ service_name }}:{{ service_container_http_port }}"
- "TCP:{{ socat_service_name }}:{{ socat_target_http_port }}"
container_user: nobody
container_networks:
- "{{ service_name }}"
- "{{ socat_service_name }}"
container_ip: "{{ socat_container_ip }}"
container_requires:
- "{{ service_name }}-socat.socket"
- "{{ service_name }}.service"
- "{{ socat_service_name }}-socat.socket"
- "{{ socat_service_name }}.service"
container_auto_start: false
container_auto_update: "{{ service_auto_update }}"

View File

@@ -11,10 +11,17 @@
state: directory
mode: "0700"
- name: Create service template mount directories
ansible.builtin.file:
path: "{{ _service_host_directory }}/mounts/{{ item }}"
state: directory
mode: "0700"
loop: "{{ _service_all_template_mount_directories }}"
- name: Template files for template mounts
ansible.builtin.template:
src: "{{ item[0].source }}"
dest: "{{ item[1].source }}"
mode: "0644"
notify: "Restart container service {{ service_name }}"
loop: "{{ _service_template_mounts | zip(_service_container_template_mounts) }}"
dest: "{{ item[1] }}"
mode: "{{ item[0].mode | default('0644') }}"
notify: Restart container service {{ service_name }}
loop: "{{ _service_all_template_mounts | zip(_service_all_template_mount_host_files) }}"

View File

@@ -1,4 +1,9 @@
---
- name: Fail if service_name is empty
ansible.builtin.fail:
msg: service_name must not be empty
when: service_name | length == 0
- name: Fail if service_container_user is not string
ansible.builtin.fail:
msg: "service_container_user must be a string, not int."

View File

@@ -1,6 +1,6 @@
# {{ ansible_managed }}
[Unit]
Description={{ service_name }} socat socket
Description={{ socat_service_name }} socat socket
[Socket]
ListenStream=/run/{{ service_name }}-socat.sock
ListenStream=/run/{{ socat_service_name }}-socat.sock

View File

@@ -11,6 +11,13 @@ _service_additional_containers: >-
| map('combine')
}}
_service_additional_container_ip: >-
{{
service_container_ip |
ansible.utils.ipmath(20 + _service_additional_container_index)
if _service_static_ip else ''
}}
_service_additional_volume_mounts: "{{ _service_additional_container.mounts | selectattr('type', '==', 'volume') }}"
_service_additional_template_mounts: "{{ _service_additional_container.mounts | selectattr('type', '==', 'template') }}"
@@ -32,7 +39,8 @@ _service_additional_container_template_mounts: >-
{{
([{'readonly': true}] * _service_additional_template_mounts | length) |
zip(
_service_additional_template_mounts,
_service_additional_template_mounts |
community.general.remove_keys(['mode']),
_service_additional_template_mounts |
map(attribute='source') |
map('regex_replace', '\.j2$', '') |
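
Pulling together the ipmath offsets shown in these hunks, the service role derives every per-container static address from service_container_ip. A hedged illustration, assuming service_container_ip: '10.89.2.10' (placeholder):

  # service container              -> 10.89.2.10
  # database  (ipmath(1))          -> 10.89.2.11
  # valkey    (ipmath(2))          -> 10.89.2.12
  # socat     (ipmath(3))          -> 10.89.2.13
  # additional container, index 0  -> 10.89.2.30   (ipmath(20 + loop index))
  # additional container, index 1  -> 10.89.2.31

When service_container_ip is empty, _service_static_ip is false and all of these resolve to '', so Podman assigns addresses from the network's range instead.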

View File

@@ -1,5 +1,6 @@
---
_service_container_networks: "{{ [service_name] + service_container_additional_networks }}"
_service_static_ip: "{{ service_container_ip | length > 0 }}"
_service_container_requires: >-
{{
@@ -10,7 +11,8 @@ _service_container_requires: >-
_service_container_wants: >-
{{
service_wants
+ ([service_name + '-socat.socket'] if service_domains | length > 0 else [])
+ ([service_name + '-socat.socket'] if service_container_http_port > 0 else [])
+ ([service_name + '-oauth2-proxy-socat.socket'] if _service_oauth2_proxy else [])
+ _service_additional_containers
| map(attribute='name')
| map('regex_replace', '$', '.service')

View File

@@ -19,7 +19,8 @@ _service_container_template_mounts: >-
{{
([{'readonly': true}] * _service_template_mounts | length) |
zip(
_service_template_mounts,
_service_template_mounts |
community.general.remove_keys(['mode']),
_service_template_mounts |
map(attribute='source') |
map('regex_replace', '\.j2$', '') |
@@ -36,3 +37,33 @@ _service_container_mounts: >-
_service_container_bind_mounts +
_service_container_template_mounts
}}
_service_all_template_mounts: >-
{{
(
_service_template_mounts +
(
_service_additional_containers |
map(attribute='mounts', default=[]) |
flatten
)
) |
selectattr('type', '==', 'template') |
unique
}}
_service_all_template_mount_directories: >-
{{
_service_all_template_mounts |
map(attribute='source') |
map('dirname') |
unique |
select('!=', '')
}}
_service_all_template_mount_host_files: >-
{{
_service_all_template_mounts |
map(attribute='source') |
map('regex_replace', '\.j2$', '') |
map('regex_replace', '^', _service_host_directory ~ '/mounts/')
}}

View File

@@ -5,7 +5,7 @@ _service_proxy_headers: "{{ _service_replacement_host_header if not service_prox
_service_oauth2_proxy: "{{ service_proxy_auth_type == 'oauth2-proxy' }}"
_service_oauth2_socket: >-
{{ '/run/oauth2-proxy-socat.sock' if _service_oauth2_proxy else '' }}
{{ '/run/' ~ service_name ~ '-oauth2-proxy-socat.sock' if _service_oauth2_proxy else '' }}
_service_oauth2_proxy_location:
path: /oauth2/*
proxy_target_socket: "{{ _service_oauth2_socket }}"

View File

@@ -72,3 +72,21 @@
matrix_authentication_service_upstream_oauth2_scope: "{{ synapse_oidc_provider_scopes | join(' ') }}"
matrix_authentication_service_upstream_oauth2_claims_imports: "{{ synapse_oidc_provider_mas_claims_imports }}"
matrix_authentication_service_upstream_oauth2_human_name: "{{ synapse_oidc_provider_name }}"
- name: Reverse proxy synapse federation
ansible.builtin.import_role:
name: uumas.general.vhost
vars:
vhost_type: reverse_proxy
vhost_id: synapse-federation
vhost_domains:
- "{{ synapse_external_domain }}:8448"
vhost_proxy_target_netproto: unix
vhost_proxy_target_socket: "/run/synapse-socat.sock"
- name: Open port for synapse federation
ansible.posix.firewalld:
service: matrix
state: enabled
permanent: true
immediate: true

View File

@@ -4,4 +4,4 @@
name: "{{ volume_name }}-volume.service"
state: restarted
daemon_reload: true
ignore_errors: '{{ ansible_check_mode }}'
ignore_errors: "{{ ansible_check_mode }}"