---
# Prometheus server configuration: global timing, Alertmanager wiring, rule
# files, and all scrape jobs collected under a single scrape_configs key
# (a second scrape_configs key in the same document would be a duplicate).
global:
  scrape_interval: 15s
  evaluation_interval: 15s

alerting:
  alertmanagers:
  - static_configs:
    - targets:
      - "alertmanager:9093"

rule_files:
- "rules/*.yml"

scrape_configs:
# Statically configured targets
- job_name: "prometheus"
  static_configs:
  - targets: ["localhost:9090"]
- job_name: "node"
  static_configs:
  - targets: ["localhost:9100"]

# Kubernetes pods
- job_name: "kubernetes-pods"
  kubernetes_sd_configs:
  - role: pod
  relabel_configs:
  # Keep only targets whose prometheus.io/scrape annotation matches "true".
  # Quoted so the regex is a string, not a YAML boolean.
  - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
    action: keep
    regex: "true"

# File-based discovery
- job_name: "file-sd"
  file_sd_configs:
  - files:
    - "targets/*.json"
    refresh_interval: 5m

# EC2 discovery
- job_name: "ec2"
  ec2_sd_configs:
  - region: us-east-1
    # KEY / SECRET are placeholders. Do not commit real credentials; per the
    # Prometheus docs, leaving these blank falls back to the
    # AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY environment variables.
    access_key: "KEY"
    secret_key: "SECRET"

# Instant vector
# A bare metric name selects every series with that name.
http_requests_total
# Label matching (exact equality on one or more labels)
http_requests_total{job="api"}
http_requests_total{job="api", status="200"}
# Regex matching
http_requests_total{job=~"api|web"} # job matches the regex
http_requests_total{job!~"test.*"} # job does not match the regex
# Range vector (last 5 minutes)
http_requests_total[5m]
# Offset (1 hour ago)
http_requests_total offset 1h

# Sum by label
# The by clause is shown in both positions: after and before the expression.
sum(http_requests_total) by (job)
sum by (job) (http_requests_total)
# Other aggregations
avg(http_requests_total)
min(http_requests_total)
max(http_requests_total)
count(http_requests_total)
# Group by multiple labels
sum(rate(http_requests_total[5m])) by (job, method)
# Without specific labels (aggregate away only the listed labels)
sum without (instance) (http_requests_total)
# Topk/Bottomk (here: 5 series each)
topk(5, http_requests_total)
bottomk(5, http_requests_total)

# Rate (per-second average)
# Counter functions take a range vector; the window is given in brackets.
rate(http_requests_total[5m])
# irate (instant rate)
irate(http_requests_total[5m])
# Increase (total increase over the window)
increase(http_requests_total[1h])
# Delta (for gauges)
delta(temperature[1h])
# idelta (instant delta, for gauges)
idelta(temperature[5m])

# Absolute value
# Element-wise math functions applied to each sample of `metric`.
abs(metric)
# Ceiling/Floor
ceil(metric)
floor(metric)
# Round
round(metric, 0.1) # round to the nearest 0.1
# Clamp
clamp(metric, 0, 100) # limit values to the range 0-100
clamp_min(metric, 0)
clamp_max(metric, 100)
# Logarithm (natural, base 2, base 10)
ln(metric)
log2(metric)
log10(metric)
# Exponential
exp(metric)

# Current time
# Time-related functions.
time()
# Timestamp of metric
timestamp(metric)
# Time since metric was updated
time() - timestamp(metric)
# Calendar components: day of month, day of week, hour, minute, month, year
day_of_month()
day_of_week()
hour()
minute()
month()
year()

# Average over time
# *_over_time functions aggregate each series across the bracketed window.
avg_over_time(metric[5m])
# Sum over time
sum_over_time(metric[1h])
# Min/Max over time
min_over_time(metric[5m])
max_over_time(metric[5m])
# Count over time
count_over_time(metric[5m])
# Standard deviation
stddev_over_time(metric[5m])
# Quantile over time (here: the 0.95 quantile)
quantile_over_time(0.95, metric[5m])

# Replace label
# Arguments: input, destination label, replacement, source label, regex.
label_replace(metric, "new_label", "$1", "old_label", "(.*)")
# Join labels (destination "combined", separator "-", sources label1 and label2)
label_join(metric, "combined", "-", "label1", "label2")
# Drop labels
# NOTE(review): this expression is a vector match using on() / group_left(),
# not a label drop - confirm that the heading matches the intended example.
metric * on() group_left() other_metric

# Requests per second
# Per-job request rate over a 5-minute window.
sum(rate(http_requests_total[5m])) by (job)
# Error rate (5xx rate divided by total rate)
sum(rate(http_requests_total{status=~"5.."}[5m])) /
sum(rate(http_requests_total[5m]))
# Error percentage (the same ratio scaled by 100)
100 * sum(rate(http_requests_total{status=~"5.."}[5m])) /
sum(rate(http_requests_total[5m]))

# Average latency
# Rate of the _sum series divided by rate of the _count series.
rate(http_request_duration_seconds_sum[5m]) /
rate(http_request_duration_seconds_count[5m])
# 95th percentile (histogram); bucket rates are summed by the le label
histogram_quantile(0.95,
sum(rate(http_request_duration_seconds_bucket[5m])) by (le)
)
# 99th percentile by job (job is kept alongside le)
histogram_quantile(0.99,
sum(rate(http_request_duration_seconds_bucket[5m])) by (le, job)
)

# CPU usage percentage
# 100 minus the per-instance average idle-CPU rate (as a percentage).
100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
# Memory usage percentage
100 * (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)
# Disk usage percentage (root filesystem)
100 - (node_filesystem_avail_bytes{mountpoint="/"} /
  node_filesystem_size_bytes{mountpoint="/"} * 100)

# Alerting rules (YAML rule file); the groups key was fused onto the
# expression above and now starts its own document.
---
groups:
# Alerting rule group "example" (entries of the enclosing groups list).
- name: example
  rules:
  # Fires when the 5xx share of requests stays above 0.05 (5%) for 5 minutes.
  - alert: HighErrorRate
    expr: |
      sum(rate(http_requests_total{status=~"5.."}[5m])) /
      sum(rate(http_requests_total[5m])) > 0.05
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "High error rate detected"
      # $value is a 0-1 ratio, so it is rendered with humanizePercentage
      # rather than printed raw with a "%" suffix. Single quotes avoid the
      # nested double quotes that made the original scalar invalid YAML.
      description: 'Error rate is {{ $value | humanizePercentage }}'
  # Fires when a target's up series has been 0 for 1 minute.
  - alert: InstanceDown
    expr: up == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "Instance {{ $labels.instance }} down"

# Recording rules (a separate rule file, hence a new document).
---
groups:
# Recording rule group "http_rules" (entries of the enclosing groups list).
- name: http_rules
  rules:
  # Pre-computed rate
  - record: job:http_requests:rate5m
    expr: sum(rate(http_requests_total[5m])) by (job)
  # Pre-computed error rate
  - record: job:http_errors:rate5m
    expr: |
      sum(rate(http_requests_total{status=~"5.."}[5m])) by (job) /
      sum(rate(http_requests_total[5m])) by (job)

# Alertmanager configuration: SMTP defaults, routing tree, and receivers.
---
global:
  smtp_smarthost: "smtp.example.com:587"
  smtp_from: "alerts@example.com"

route:
  # Fallback receiver for alerts that match none of the child routes.
  receiver: "default"
  group_by: [alertname, job]
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 4h
  # NOTE(review): newer Alertmanager releases document `match` as deprecated
  # in favor of `matchers`; kept as-is so older versions keep working.
  routes:
  - match:
      severity: critical
    receiver: "pagerduty"
  - match:
      severity: warning
    receiver: "slack"

receivers:
- name: "default"
  email_configs:
  - to: "team@example.com"
- name: "slack"
  slack_configs:
  # The webhook URL and service key below are placeholders ("xxx").
  - api_url: "https://hooks.slack.com/services/xxx"
    channel: "#alerts"
- name: "pagerduty"
  pagerduty_configs:
  - service_key: "xxx"

# Counter - only increases
# Example: total requests, errors, bytes
# HELP http_requests_total Total HTTP requests
# TYPE http_requests_total counter
http_requests_total{method="GET", status="200"} 1234
# Query: rate of increase (5-minute window)
rate(http_requests_total[5m])

# Gauge - can go up or down
# Example: temperature, memory usage, queue size
# HELP node_memory_free_bytes Free memory
# TYPE node_memory_free_bytes gauge
node_memory_free_bytes 1073741824
# Query: current value or average (here: average over 5 minutes)
avg_over_time(node_memory_free_bytes[5m])

# Histogram - samples in buckets
# Example: request duration, response size
# HELP http_request_duration_seconds Request duration
# TYPE http_request_duration_seconds histogram
http_request_duration_seconds_bucket{le="0.1"} 500
http_request_duration_seconds_bucket{le="0.5"} 800
http_request_duration_seconds_bucket{le="1"} 950
http_request_duration_seconds_bucket{le="+Inf"} 1000
http_request_duration_seconds_sum 450
http_request_duration_seconds_count 1000
# Query: percentile (here: 0.95 quantile from 5-minute bucket rates)
histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))

# Summary - pre-calculated quantiles
# Example: request duration with quantiles
# (one sample per quantile label value, followed by the _sum series)
# HELP rpc_duration_seconds RPC duration
# TYPE rpc_duration_seconds summary
rpc_duration_seconds{quantile="0.5"} 0.05
rpc_duration_seconds{quantile="0.9"} 0.08
rpc_duration_seconds{quantile="0.99"} 0.12
rpc_duration_seconds_sum 1000
rpc_duration_seconds_count 5000

| Command | 설명 (Description) |
| --- | --- |
| `curl "http://localhost:9090/api/v1/query?query=up"` | 즉시 쿼리 (instant query) |
| `curl "http://localhost:9090/api/v1/query_range?query=up&start=2024-01-01T00:00:00Z&end=2024-01-02T00:00:00Z&step=15s"` | 범위 쿼리 (range query) |
| `curl -g "http://localhost:9090/api/v1/series?match[]=up"` | 시리즈 찾기 (find series) |
| `curl http://localhost:9090/api/v1/labels` | 레이블 목록 (label names) |
| `curl http://localhost:9090/api/v1/label/job/values` | 레이블 값 (label values) |
| `curl http://localhost:9090/api/v1/targets` | 타겟 목록 (targets) |
| `curl http://localhost:9090/api/v1/rules` | 규칙 목록 (rules) |
| `curl http://localhost:9090/api/v1/alerts` | 알림 목록 (alerts) |