Mass Commit 06/08/2021

Signed-off-by: Georg <georg@lysergic.dev>
This commit is contained in:
Georg Pfuetzenreuter 2021-08-06 17:42:45 +02:00
parent 1791f4374b
commit 4bd9e36ff4
Signed by: Georg
GPG Key ID: 1DAF57F49F8E8F22
26 changed files with 659 additions and 0 deletions

46
Cachet/.env Normal file
View File

@ -0,0 +1,46 @@
# Cachet environment file (dotenv format: one KEY=value per line).
# Values beginning with "$" (e.g. $DBHOST) look like redacted placeholders;
# presumably they must be replaced with real values before deployment - confirm.

# Application
APP_ENV=production
APP_DEBUG=false
APP_URL=http://cachet.local
APP_TIMEZONE=UTC
# NOTE(review): the placeholder is named like an API key; confirm it really
# holds the application key expected for APP_KEY.
APP_KEY=$CACHETAPIKEY
DEBUGBAR_ENABLED=false

# Database
DB_DRIVER=mysql
DB_HOST=$DBHOST
DB_UNIX_SOCKET=null
DB_DATABASE=$DB
DB_USERNAME=$DBUSER
DB_PASSWORD=$DBPASS
DB_PORT=null
DB_PREFIX=null

# Cache, session and queue backends
CACHE_DRIVER=file
SESSION_DRIVER=file
QUEUE_DRIVER=sync

# Cachet feature switches
CACHET_BEACON=false
CACHET_EMOJI=false
CACHET_AUTO_TWITTER=false

# Mail
# NOTE(review): MAIL_DRIVER=log presumably means mail is only logged, leaving
# the SMTP settings below unused - confirm. If SMTP is enabled later, verify
# that MAIL_PORT=465 and MAIL_ENCRYPTION=tls are a valid pairing.
MAIL_DRIVER=log
MAIL_HOST=$SMTPHOST
MAIL_PORT=465
MAIL_USERNAME=null
MAIL_PASSWORD=null
MAIL_ADDRESS=null
MAIL_NAME=null
MAIL_ENCRYPTION=tls

# Redis (the cache/session/queue drivers above are file/sync, not redis)
REDIS_HOST=$DBHOST_REDIS
REDIS_DATABASE=$DB_REDIS
REDIS_PORT=null

# Third-party integrations
GITHUB_TOKEN=null
NEXMO_KEY=null
NEXMO_SECRET=null
NEXMO_SMS_FROM=Cachet

# Reverse proxy and logging
TRUSTED_PROXIES=status.lib.casa
APP_LOG=file

View File

@ -0,0 +1 @@
There are multiple pieces of software going by similar names, most of which are very outdated and faulty. We ended up using https://github.com/oxyno-zeta/prometheus-cachethq with slight modifications (the listening addresses are hardcoded, because the corresponding config value did not work for reasons we did not determine).

View File

@ -0,0 +1,88 @@
# prometheus-cachethq configuration: maps Prometheus alert names to Cachet
# components.

# Logging
log:
  # Verbosity
  level: debug
  # Output format
  format: text

# Listener settings of this service
server:
  listenAddr: "127.0.0.2"
  port: 8081

# Cachet endpoint and API credentials
cachet:
  url: http://172.16.9.2:8033
  apiKey: $CACHETAPIKEY

# Targets: each entry names a Cachet component, the status to set on it, and
# the alert(s) that trigger that status.
targets:
  # Disabled targets, kept for reference:
  # - component:
  #     name: nginx
  #     status: PARTIAL_OUTAGE
  #     groupName: LibertaCasa
  #   alerts:
  #     - name: lysergic-nginx-down
  # - component:
  #     name: Node_Exporter_Mercury
  #     status: PARTIAL_OUTAGE
  #     groupName: LibertaCasa
  #   alerts:
  #     - name: lysergic-node-down
  - component:
      name: Jitsi
      status: PARTIAL_OUTAGE
    alerts:
      - name: JITSI-WEB-DOWN
  - component:
      name: Element
      status: PARTIAL_OUTAGE
      groupName: LibertaCasa
    alerts:
      - name: ELEMENT-WEB-DOWN
  - component:
      name: Gitea
      status: PARTIAL_OUTAGE
    alerts:
      - name: GITEA-WEB-DOWN
  - component:
      name: cgit
      status: PARTIAL_OUTAGE
      groupName: LibertaCasa
    alerts:
      - name: CGIT-WEB-DOWN
  - component:
      name: Etherpad
      status: PARTIAL_OUTAGE
    alerts:
      - name: ETHERPAD-WEB-DOWN
  - component:
      name: PrivateBin
      status: PARTIAL_OUTAGE
    alerts:
      - name: PASTA-WEB-DOWN
  - component:
      name: Website
      status: PARTIAL_OUTAGE
      groupName: xKek
    alerts:
      - name: XKEK-WEB-DOWN
  - component:
      name: SearX
      status: PARTIAL_OUTAGE
      groupName: xKek
    alerts:
      - name: SEARX-WEB-DOWN
  - component:
      name: Yacy
      status: PARTIAL_OUTAGE
      groupName: xKek
    alerts:
      - name: YACY-WEB-DOWN
      # Unused template for label matching and incident creation:
      # - labels:
      #     label1: value1
    # incident:
    #   name: ""
    #   content: ""
    #   status: INVESTIGATING
    #   public: true

View File

@ -0,0 +1,13 @@
# systemd unit for the Prometheus-CachetHQ service.
[Unit]
Description=Prometheus-CachetHQ
# Ordered after networking and after the Prometheus unit.
After=network.target
After=prometheus.service

[Service]
Type=simple
# Runs as the cachet user and group.
User=cachet
Group=cachet
WorkingDirectory=/opt/prometheus-cachethq
ExecStart=/opt/prometheus-cachethq/prometheus-cachethq

[Install]
WantedBy=multi-user.target

View File

@ -0,0 +1,94 @@
# Blackbox probe alerts for LYSERGIC web endpoints.
# Each rule fires when probe_success is 0 for the matched instance(s) for at
# least the rule's `for` duration. The `description` text should always state
# the same duration as `for`.
groups:
  - name: lysergic-blackbox_exporters
    rules:
      - alert: LIBERTACASA-WEB-DOWN
        expr: probe_success{instance=~"https://liberta.casa|https://www.lysergic.dev"} == 0
        for: 30s
        labels:
          severity: warning
          project: LYSERGIC
        annotations:
          title: Node {{ $labels.instance }} is down
          description: Failed to scrape {{ $labels.job }} on {{ $labels.instance}} for more than 30 seconds. Node seems down.
      - alert: XKEK-WEB-DOWN
        expr: probe_success{instance=~"https://xkek.net|https://kekx.net"} == 0
        for: 30s
        labels:
          severity: warning
          project: LYSERGIC
        annotations:
          title: Node {{ $labels.instance }} is down
          description: Failed to scrape {{ $labels.job }} on {{ $labels.instance}} for more than 30 seconds. Node seems down.
      - alert: JITSI-WEB-DOWN
        expr: probe_success{instance=~"https://meet.lysergic.dev"} == 0
        for: 30s
        labels:
          severity: warning
          project: LYSERGIC
        annotations:
          title: Node {{ $labels.instance }} is down
          description: Failed to scrape {{ $labels.job }} on {{ $labels.instance}} for more than 30 seconds. Node seems down.
      - alert: PASTA-WEB-DOWN
        expr: probe_success{instance=~"https://pasta.lysergic.dev"} == 0
        for: 30s
        labels:
          severity: warning
          project: LYSERGIC
        annotations:
          title: Node {{ $labels.instance }} is down
          description: Failed to scrape {{ $labels.job }} on {{ $labels.instance}} for more than 30 seconds. Node seems down.
      - alert: GITEA-WEB-DOWN
        expr: probe_success{instance=~"https://git.com.de"} == 0
        for: 30s
        labels:
          severity: warning
          project: LYSERGIC
        annotations:
          title: Node {{ $labels.instance }} is down
          description: Failed to scrape {{ $labels.job }} on {{ $labels.instance}} for more than 30 seconds. Node seems down.
      - alert: CGIT-WEB-DOWN
        expr: probe_success{instance=~"https://git.casa"} == 0
        for: 30s
        labels:
          severity: warning
          project: LYSERGIC
        annotations:
          title: Node {{ $labels.instance }} is down
          description: Failed to scrape {{ $labels.job }} on {{ $labels.instance}} for more than 30 seconds. Node seems down.
      - alert: ELEMENT-WEB-DOWN
        expr: probe_success{instance=~"https://element.liberta.casa"} == 0
        for: 30s
        labels:
          severity: warning
          project: LYSERGIC
        annotations:
          title: Node {{ $labels.instance }} is down
          description: Failed to scrape {{ $labels.job }} on {{ $labels.instance}} for more than 30 seconds. Node seems down.
      - alert: ETHERPAD-WEB-DOWN
        expr: probe_success{instance=~"https://pad.hugz.io"} == 0
        for: 30s
        labels:
          severity: warning
          project: LYSERGIC
        annotations:
          title: Node {{ $labels.instance }} is down
          description: Failed to scrape {{ $labels.job }} on {{ $labels.instance}} for more than 30 seconds. Node seems down.
      # SearX uses a longer grace period (120s) than the other endpoints.
      - alert: SEARX-WEB-DOWN
        expr: probe_success{instance=~"https://searx.xkek.net"} == 0
        for: 120s
        labels:
          severity: warning
          project: LYSERGIC
        annotations:
          title: Node {{ $labels.instance }} is down
          description: Failed to scrape {{ $labels.job }} on {{ $labels.instance}} for more than 2 minutes. Node seems down.
      - alert: YACY-WEB-DOWN
        expr: probe_success{instance=~"https://yacy.xkek.net"} == 0
        for: 30s
        labels:
          severity: warning
          project: LYSERGIC
        annotations:
          title: Node {{ $labels.instance }} is down
          description: Failed to scrape {{ $labels.job }} on {{ $labels.instance}} for more than 30 seconds. Node seems down.

View File

@ -0,0 +1,13 @@
# nginx availability alert for the LYSERGIC project.
groups:
  - name: lysergic-nginx_exporters
    rules:
      # Fires when nginx_up has been 0 for one minute.
      - alert: lysergic-nginx-down
        expr: nginx_up{project="LYSERGIC", job="nginx_exporters"} == 0
        for: 1m
        labels:
          job: nginx_exporters
          severity: WARNING
          project: LYSERGIC
        annotations:
          title: nginx {{ $labels.instance }} is down
          description: Failed to scrape {{ $labels.job }} on {{ $labels.instance}} for more than 1 minute. nginx seems down.

View File

@ -0,0 +1,13 @@
# Node exporter availability alert for the LYSERGIC project.
groups:
  - name: lysergic-node_exporters
    rules:
      # Fires when the node_exporters scrape target has been down (up == 0)
      # for one minute.
      - alert: lysergic-node-down
        expr: up{project="LYSERGIC", job="node_exporters"} == 0
        for: 1m
        labels:
          job: node_exporters
          severity: warning
          project: LYSERGIC
        annotations:
          title: Node {{ $labels.instance }} is down
          description: Failed to scrape {{ $labels.job }} on {{ $labels.instance}} for more than 1 minute. Node seems down.

View File

@ -0,0 +1,23 @@
# WireGuard handshake alerts for the LYSERGIC project.
# Both expressions subtract time() from the last-handshake metric; the
# trailing `<- N` reads as `< -N`. Presumably the metric is a Unix timestamp,
# so the rule matches when the last handshake is older than N seconds - confirm.
groups:
  - name: lysergic-wireguard_exporters
    rules:
      # Single peer on wg0 of the local exporter (threshold 125).
      - alert: mercury-wireguard-down
        expr: wireguard_peer_last_handshake_seconds{device="wg0", instance="localhost:9586", job="wireguard-mercury", public_key="$WG0_PUBKEY"} - time() <- 125
        for: 10s
        labels:
          job: wireguard-mercury
          severity: CRITICAL
          project: LYSERGIC
        annotations:
          title: WireGuard {{ $labels.instance }} is down
          description: 'Failed to scrape {{ $labels.job }} on {{ $labels.instance}} for more than 2 minutes. Uplink to Psyched seems down. Affected services: Global infrastructure monitoring. Other outages might no longer be tracked. Urgent investigation is advised.'
      # All LYSERGIC peers except those on device wg1 (threshold 130).
      - alert: lysergic-wireguard-down
        expr: wireguard_peer_last_handshake_seconds{project="LYSERGIC",device!="wg1"} - time() <- 130
        for: 10s
        labels:
          job: wireguard-lysergic
          severity: CRITICAL
          project: LYSERGIC
        annotations:
          title: WireGuard Incident
          description: 'No handshakes for {{ $labels.device }} in {{ $labels.job }} on {{ $labels.instance}} have been exchanged for an extended amount of time. Critical services could be affected.'

View File

@ -0,0 +1,21 @@
# Blackbox probe alerts for TRIPSIT web endpoints.
groups:
  - name: tripsit-blackbox_exporters
    rules:
      # tripsit.me sites: critical after 15 seconds of failed probes.
      - alert: TRIPSIT.ME-WEB-INCIDENT
        expr: probe_success{instance=~"https://tripsit.me|https://chat.tripsit.me|https://chat.tripsit.me/chat|https://drugs.tripsit.me|https://benzo.tripsit.me|https://dxm.tripsit.me|https://combo.tripsit.me|https://wiki.tripsit.me|https://tripbot.tripsit.me"} == 0
        for: 15s
        labels:
          severity: critical
          project: TRIPSIT
        annotations:
          title: Node {{ $labels.instance }} is down
          description: Failed to scrape {{ $labels.job }} on {{ $labels.instance}} for more than 15 seconds. Node seems down.
      # tripsit.dev hosts: warning after one minute of failed probes.
      - alert: TRIPSIT.DEV-WEB-INCIDENT
        expr: probe_success{instance=~"https://mail.tripsit.dev|https://dopamine.tripsit.dev"} == 0
        for: 1m
        labels:
          severity: warning
          project: TRIPSIT
        annotations:
          title: Node {{ $labels.instance }} is down
          description: Failed to scrape {{ $labels.job }} on {{ $labels.instance}} for more than 1 minute. Node seems down.

View File

@ -0,0 +1,13 @@
# Node exporter availability alert for the TRIPSIT project.
groups:
  - name: tripsit-node_exporters
    rules:
      # Fires when the node_exporters scrape target has been down (up == 0)
      # for 30 seconds. The description states the same duration as `for`.
      - alert: TRIPSIT-NODE-INCIDENT
        expr: up{project="TRIPSIT", job="node_exporters"} == 0
        for: 30s
        labels:
          severity: warning
          job: node_exporters
          project: TRIPSIT
        annotations:
          title: Node {{ $labels.instance }} is down
          description: Failed to scrape {{ $labels.job }} on {{ $labels.instance}} for more than 30 seconds. Node seems down.

View File

@ -0,0 +1,13 @@
# systemd unit for the Prometheus server.
[Unit]
Description=Prometheus
After=network.target

[Service]
Type=simple
# Runs as the prometheus user and group.
User=prometheus
Group=prometheus
# One flag per line; systemd joins the backslash-continued lines.
ExecStart=/opt/prometheus/prometheus \
    --config.file=/etc/prometheus/prometheus.yml \
    --storage.tsdb.path=/var/lib/prometheus \
    --web.console.templates=/opt/prometheus/consoles \
    --web.console.libraries=/opt/prometheus/console_libraries \
    --web.external-url=https://prometheus.lysergic.dev:4433
# Reload sends SIGHUP to the main process.
ExecReload=/usr/bin/kill -HUP $MAINPID

[Install]
WantedBy=multi-user.target

69
Prometheus/prometheus.yml Normal file
View File

@ -0,0 +1,69 @@
# Prometheus server configuration.

# Global defaults
global:
  scrape_interval: 15s      # Scrape every 15 seconds (the default is every 1 minute).
  evaluation_interval: 1m   # Evaluate rules every minute (same as the default).
  # scrape_timeout is left at the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - localhost:9093

# Rule files are loaded once and evaluated periodically according to the
# global 'evaluation_interval'.
rule_files:
  - "/etc/prometheus/alerts/lysergic/*.yml"
  # - "/etc/prometheus/alerts/tripsit/*.yml" - Disabled on 22/07/2021

# Scrape configurations, one job per exporter type.
scrape_configs:
  # Prometheus itself
  - job_name: 'prometheus'
    static_configs:
      - targets:
          - 'localhost:9090'

  # Node exporters, discovered from target files
  - job_name: 'node_exporters'
    file_sd_configs:
      - files:
          - '/etc/prometheus/targets/node-*.json'

  # Disabled job, kept for reference:
  # - job_name: 'node_exporters_02'
  #   file_sd_configs:
  #     - files:
  #         - '/etc/prometheus/targets/node-tripsit.json'
  #   proxy_url: 'https://dopamine.tripsit.dev:9493/'
  #   params:
  #     _scheme: [https]

  # nginx exporters, discovered from target files
  - job_name: 'nginx_exporters'
    file_sd_configs:
      - files:
          - '/etc/prometheus/targets/nginx-*.json'

  # etcd metrics, scraped over HTTPS with a client certificate
  - job_name: 'etcd_metrics'
    static_configs:
      - targets:
          - '$ETCDNODE01:2379'
          - '$ETCDNODE02:2379'
          - '$ETCDNODE03:2379'
    scheme: https
    tls_config:
      ca_file: $ETCDCACRT
      cert_file: $ETCDCLIENTCRT
      key_file: $ETCDCLIENTKEY

  # Blackbox probes: the discovered address becomes the `target` parameter
  # and the `instance` label, and the scrape itself goes to 127.0.0.1:9115.
  - job_name: 'blackbox'
    metrics_path: /probe
    params:
      module:
        - http_2xx
    file_sd_configs:
      - files:
          - '/etc/prometheus/targets/blackbox*.yml'
    relabel_configs:
      - source_labels:
          - __address__
        target_label: __param_target
      - source_labels:
          - __param_target
        target_label: instance
      - target_label: __address__
        replacement: 127.0.0.1:9115

  # WireGuard exporters, discovered from target files
  - job_name: 'wireguard_exporters'
    file_sd_configs:
      - files:
          - '/etc/prometheus/targets/wireguard-*.json'

View File

@ -0,0 +1 @@
These configuration files are currently not available publicly.

View File

@ -0,0 +1,17 @@
# systemd unit for the Prometheus Alertmanager.
[Unit]
Description=Prometheus Alertmanager
Wants=network-online.target
# Both ordering dependencies on a single line.
After=network.target network-online.target

[Service]
Type=simple
User=alertmanager
Group=prometheus
WorkingDirectory=/var/lib/alertmanager
# One flag per line; systemd joins the backslash-continued lines.
# --cluster.listen-address is deliberately passed with an empty value.
ExecStart=/opt/alertmanager/alertmanager \
    --config.file=/etc/prometheus/alertmanager.yml \
    --web.external-url=https://prometheus.lysergic.dev:4433/alertmanager \
    --web.route-prefix=/ \
    --cluster.listen-address=
# Reload sends SIGHUP to the main process.
ExecReload=/usr/bin/kill -HUP $MAINPID
Restart=always

[Install]
WantedBy=multi-user.target

View File

@ -0,0 +1,57 @@
# Alertmanager configuration: routing tree and notification receivers.
global:
  resolve_timeout: 5m

# Alerts are grouped by alert name. Anything not matched by a child route
# goes to the default receiver 'smtp-local'.
route:
  group_by:
    - 'alertname'
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'smtp-local'
  routes:
    - receiver: 'irc-tripsit'
      match:
        project: TRIPSIT
    - receiver: 'mayday-tripsit'
      match:
        project: tripsit-critical
    - receiver: 'lysergic'
      continue: true
      match:
        project: LYSERGIC

receivers:
  # Default: e-mail only
  - name: 'smtp-local'
    email_configs:
      - to: '$MAIL_SYSTEM'
        from: '$MAIL_ALERTMANAGER'
        require_tls: false
        smarthost: '$SMTPHOST'
        send_resolved: true

  # TripSit: webhook on port 2420
  - name: 'irc-tripsit'
    webhook_configs:
      - url: 'http://127.0.0.1:2420/_systems'
        send_resolved: true

  # TripSit critical: webhook on port 2420 plus e-mail
  - name: 'mayday-tripsit'
    webhook_configs:
      - url: 'http://127.0.0.1:2420/tripsit-dev'
    email_configs:
      - to: '$MAIL_SYSTEM'
        from: '$MAIL_ALERTMANAGER'
        require_tls: false
        smarthost: '127.0.0.1:25'
        send_resolved: true

  # LibertaCasa webhook only (no route above references this receiver)
  - name: 'irc-libertacasa'
    webhook_configs:
      - url: 'http://127.0.0.1:2410/universe'
        send_resolved: true

  # LYSERGIC: webhook on port 2410, webhook on 127.0.0.2:8081, and e-mail
  - name: 'lysergic'
    webhook_configs:
      - url: 'http://127.0.0.1:2410/universe'
        send_resolved: true
      - url: 'http://127.0.0.2:8081/prometheus/webhook'
        send_resolved: true
    email_configs:
      - to: '$MAIL_SYSTEM'
        from: '$MAIL_ALERTMANAGER'
        require_tls: false
        smarthost: '127.0.0.1:25'
        send_resolved: true

View File

@ -0,0 +1,14 @@
# systemd unit for the Alertmanager IRC relay serving LibertaCasa.
[Unit]
Description=Alertmanager IRC Relay for LibertaCasa
After=network.target

[Service]
Type=simple
User=alertmanager-irc
Group=alertmanager-irc
WorkingDirectory=/opt/alertmanager-irc
# The relay reads the LibertaCasa-specific configuration file.
ExecStart=/opt/alertmanager-irc/alertmanager-irc-relay \
    --config /etc/prometheus/alertmanager-irc-libertacasa.yaml
Restart=always

[Install]
WantedBy=multi-user.target

View File

@ -0,0 +1,32 @@
# Alertmanager IRC relay configuration for LibertaCasa.

# HTTP listener settings
http_host: 127.0.0.1
http_port: 2410

# IRC server connection
irc_host: irc.liberta.casa
irc_port: 6697
irc_host_password: watchdog:$PASS
irc_nickname: watchdog
# NickServ identification password (disabled here)
# irc_nickname_password: $NSPASS
# IRC real name
irc_realname: Alertmanager on mercury.lysergic.dev

# Channels to join
irc_channels:
  - name: "#universe"
    password: $JTOKEN
  # - name: "#admins"

# One message per alert, sent as PRIVMSG
msg_once_per_alert_group: false
use_privmsg: true
msg_template: "Severity: {{ .Labels.severity }} - Job: {{ .Labels.job }} - Title: {{ .Annotations.title}} - Body: {{ .Annotations.description }} - Summary: {{ .Labels.alertname }} on {{ .Labels.instance }} is {{ .Status }}"
# Note: when sending only one message per alert group, the default
# msg_template is
# "Alert {{ .GroupLabels.alertname }} for {{ .GroupLabels.job }} is {{ .Status }}"

# Internal buffer size for alerts received but not yet sent to IRC
alert_buffer_size: 2048

nickserv_identify_patterns:
  - "identify via /msg NickServ identify <password>"
  - "type /msg NickServ IDENTIFY password"
  - "authenticate yourself to services with the IDENTIFY command"

View File

@ -0,0 +1,15 @@
# systemd unit for the Alertmanager IRC relay serving TripSit.
[Unit]
Description=Alertmanager IRC Relay for TripSit
After=network.target

[Service]
Type=simple
User=alertmanager-irc
Group=alertmanager-irc
WorkingDirectory=/opt/alertmanager-irc
# The relay reads the TripSit-specific configuration file.
ExecStart=/opt/alertmanager-irc/alertmanager-irc-relay \
    --config /etc/prometheus/alertmanager-irc-tripsit.yaml
Restart=always

[Install]
WantedBy=multi-user.target

View File

@ -0,0 +1,30 @@
# Alertmanager IRC relay configuration for TripSit.

# HTTP listener settings
http_host: 127.0.0.1
http_port: 2420

# IRC server connection
irc_host: newirc.tripsit.me
irc_port: 6697
irc_nickname: watchdog
# NickServ identification password
irc_nickname_password: $NSPASS
# IRC real name
irc_realname: Infrastructure Monitoring

# Channels to join
irc_channels:
  - name: "#_systems"
  - name: "#tripsit-dev"

# One message per alert, sent as PRIVMSG
msg_once_per_alert_group: false
use_privmsg: true
msg_template: "ALERT: {{ .Labels.alertname }} on {{ .Labels.instance }} is {{ .Status }}"
# Note: when sending only one message per alert group, the default
# msg_template is
# "Alert {{ .GroupLabels.alertname }} for {{ .GroupLabels.job }} is {{ .Status }}"

# Internal buffer size for alerts received but not yet sent to IRC
alert_buffer_size: 2048

nickserv_identify_patterns:
  - "identify via /msg NickServ identify <password>"
  - "type /msg NickServ IDENTIFY password"
  - "authenticate yourself to services with the IDENTIFY command"

View File

@ -0,0 +1,36 @@
# Blackbox exporter probe modules.
modules:
  # HTTP GET over IPv4 with a 10s timeout
  http_2xx:
    prober: http
    timeout: 10s
    http:
      preferred_ip_protocol: "ipv4"

  # HTTP POST
  http_post_2xx:
    prober: http
    http:
      method: POST

  # Plain TCP connect
  tcp_connect:
    prober: tcp

  # POP3 over TLS: expects the server greeting, certificate verified
  pop3s_banner:
    prober: tcp
    tcp:
      query_response:
        - expect: "^+OK"
      tls: true
      tls_config:
        insecure_skip_verify: false

  # SSH: expects the protocol banner
  ssh_banner:
    prober: tcp
    tcp:
      query_response:
        - expect: "^SSH-2.0-"

  # IRC: sends NICK/USER, answers the PING, expects numeric 001
  irc_banner:
    prober: tcp
    tcp:
      query_response:
        - send: "NICK prober"
        - send: "USER prober prober prober :prober"
        - expect: "PING :([^ ]+)"
          send: "PONG ${1}"
        - expect: "^:[^ ]+ 001"

  # ICMP echo
  icmp:
    prober: icmp

View File

@ -0,0 +1,13 @@
# systemd unit for the Prometheus Blackbox Exporter.
[Unit]
Description=Prometheus Blackbox Exporter
After=network-online.target
Wants=network-online.target

[Service]
Type=simple
User=blackbox_exporter
Group=prometheus
# One flag per line; systemd joins the backslash-continued lines.
ExecStart=/opt/blackbox_exporter/blackbox_exporter \
    --config.file=/etc/prometheus/blackbox.yml \
    --web.external-url=https://prometheus.lysergic.dev:4433/blackbox \
    --web.route-prefix=/

[Install]
WantedBy=multi-user.target

1
nginx_exporter/README.md Normal file
View File

@ -0,0 +1 @@
We are using this one: https://github.com/nginxinc/nginx-prometheus-exporter

View File

@ -0,0 +1,12 @@
# systemd unit for the nginx Prometheus exporter.
[Unit]
Description=Nginx Exporter
After=network.target

[Service]
Type=simple
User=nginx_exporter
Group=prometheus
# The exporter reads nginx's stub_status page at the given URI.
ExecStart=/opt/nginx_exporter/nginx-prometheus-exporter \
    -nginx.scrape-uri=http://127.0.0.2:8080/stub_status

[Install]
WantedBy=multi-user.target

View File

@ -0,0 +1,12 @@
# systemd unit for the Prometheus Node Exporter.
[Unit]
Description=Prometheus Node Exporter
After=network.target

[Service]
Type=simple
# Runs as node_exporter with the shared prometheus group.
User=node_exporter
Group=prometheus
ExecStart=/opt/node_exporter/node_exporter

[Install]
WantedBy=multi-user.target

View File

@ -0,0 +1 @@
We are using this one: https://github.com/mdlayher/wireguard_exporter

View File

@ -0,0 +1,11 @@
# systemd unit for the Prometheus WireGuard exporter.
# No User=/Group= is set, so systemd's default account applies.
[Unit]
Description=Prometheus WireGuard Exporter
After=network.target

[Service]
Type=simple
# The peer file is passed via -wireguard.peer-file.
ExecStart=/usr/local/bin/wireguard_exporter \
    -wireguard.peer-file /etc/wireguard/peers.toml
Restart=always

[Install]
WantedBy=multi-user.target