Build a Production Monitoring Stack with Prometheus and Grafana

Monitoring is the backbone of reliable infrastructure. In this tutorial, you'll build a complete observability stack using Prometheus for metrics collection and Grafana for visualization — all orchestrated with Docker Compose.

What You'll Build

  • Prometheus server collecting metrics from multiple targets
  • Grafana dashboards with real-time visualizations
  • Node Exporter for system metrics (CPU, memory, disk)
  • A sample Python app with custom metrics
  • Alertmanager with Slack notifications

Prerequisites

  • Docker and Docker Compose installed
  • Basic understanding of YAML configuration
  • A Slack webhook URL (optional, for alerts)
---

    Step 1: Project Structure

    Create the project directory:

    mkdir monitoring-stack && cd monitoring-stack
    

    mkdir -p prometheus grafana/provisioning/datasources grafana/provisioning/dashboards alertmanager app

    Final structure:

    monitoring-stack/
    ├── docker-compose.yml
    ├── prometheus/
    │   ├── prometheus.yml
    │   └── alert.rules.yml
    ├── grafana/
    │   └── provisioning/
    │       ├── datasources/
    │       │   └── prometheus.yml
    │       └── dashboards/
    │           ├── dashboard.yml
    │           └── node-exporter.json
    ├── alertmanager/
    │   └── alertmanager.yml
    └── app/
        ├── app.py
        ├── requirements.txt
        └── Dockerfile

    ---

    Step 2: Docker Compose Configuration

    Create docker-compose.yml:

    version: '3.8'

    services:
      prometheus:
        image: prom/prometheus:v2.51.0
        container_name: prometheus
        ports:
          - "9090:9090"
        volumes:
          - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
          - ./prometheus/alert.rules.yml:/etc/prometheus/alert.rules.yml
          - prometheus_data:/prometheus
        command:
          - '--config.file=/etc/prometheus/prometheus.yml'
          - '--storage.tsdb.path=/prometheus'
          - '--storage.tsdb.retention.time=30d'
          # Allows config reload via POST /-/reload (no restart needed)
          - '--web.enable-lifecycle'
        restart: unless-stopped
        networks:
          - monitoring

      grafana:
        image: grafana/grafana:10.3.1
        container_name: grafana
        ports:
          - "3000:3000"
        environment:
          - GF_SECURITY_ADMIN_USER=admin
          # Change this before deploying anywhere shared
          - GF_SECURITY_ADMIN_PASSWORD=changeme
          - GF_USERS_ALLOW_SIGN_UP=false
        volumes:
          - grafana_data:/var/lib/grafana
          - ./grafana/provisioning:/etc/grafana/provisioning
        restart: unless-stopped
        networks:
          - monitoring

      node-exporter:
        image: prom/node-exporter:v1.7.0
        container_name: node-exporter
        ports:
          - "9100:9100"
        volumes:
          # Host filesystems mounted read-only so the exporter sees host metrics
          - /proc:/host/proc:ro
          - /sys:/host/sys:ro
          - /:/rootfs:ro
        command:
          - '--path.procfs=/host/proc'
          - '--path.rootfs=/rootfs'
          - '--path.sysfs=/host/sys'
          # $$ escapes $ for Compose variable interpolation
          - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
        restart: unless-stopped
        networks:
          - monitoring

      alertmanager:
        image: prom/alertmanager:v0.27.0
        container_name: alertmanager
        ports:
          - "9093:9093"
        volumes:
          - ./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml
        restart: unless-stopped
        networks:
          - monitoring

      sample-app:
        build: ./app
        container_name: sample-app
        ports:
          - "8000:8000"
        restart: unless-stopped
        networks:
          - monitoring

    volumes:
      prometheus_data:
      grafana_data:

    networks:
      monitoring:
        driver: bridge

    ---

    Step 3: Prometheus Configuration

    Create prometheus/prometheus.yml:

    global:
      scrape_interval: 15s
      evaluation_interval: 15s

    alerting:
      alertmanagers:
        - static_configs:
            - targets:
                - alertmanager:9093

    rule_files:
      - "alert.rules.yml"

    scrape_configs:
      # Prometheus scrapes its own metrics
      - job_name: 'prometheus'
        static_configs:
          - targets: ['localhost:9090']

      # Host-level metrics (CPU, memory, disk)
      - job_name: 'node-exporter'
        static_configs:
          - targets: ['node-exporter:9100']

      # Custom application metrics exposed at /metrics
      - job_name: 'sample-app'
        static_configs:
          - targets: ['sample-app:8000']

    Create prometheus/alert.rules.yml:

    groups:
      - name: system_alerts
        rules:
          - alert: HighCPUUsage
            expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "High CPU usage on {{ $labels.instance }}"
              description: "CPU usage is above 80% for more than 5 minutes (current: {{ $value }}%)"

          - alert: HighMemoryUsage
            expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "High memory usage on {{ $labels.instance }}"
              description: "Memory usage is above 85% (current: {{ $value }}%)"

          - alert: DiskSpaceRunningLow
            expr: (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100 < 15
            for: 10m
            labels:
              severity: critical
            annotations:
              summary: "Disk space running low on {{ $labels.instance }}"
              description: "Available disk space is below 15% (current: {{ $value }}%)"

          - alert: TargetDown
            expr: up == 0
            for: 1m
            labels:
              severity: critical
            annotations:
              summary: "Target {{ $labels.instance }} is down"
              description: "{{ $labels.job }} target {{ $labels.instance }} has been down for more than 1 minute."

          - alert: HighRequestLatency
            expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 1
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "High request latency on {{ $labels.instance }}"
              description: "95th percentile latency is above 1s (current: {{ $value }}s)"

    ---

    Step 4: Sample Python Application with Custom Metrics

    Create app/requirements.txt:

    prometheus-client==0.20.0
    

    flask==3.0.0

    Create app/app.py:

    import time
    import random
    from functools import wraps

    from flask import Flask, Response
    from prometheus_client import (
        Counter, Histogram, Gauge,
        generate_latest, CONTENT_TYPE_LATEST
    )

    app = Flask(__name__)

    # Define custom metrics
    REQUEST_COUNT = Counter(
        'http_requests_total',
        'Total HTTP requests',
        ['method', 'endpoint', 'status']
    )

    REQUEST_DURATION = Histogram(
        'http_request_duration_seconds',
        'HTTP request duration in seconds',
        ['method', 'endpoint'],
        buckets=[0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0]
    )

    ACTIVE_REQUESTS = Gauge(
        'http_active_requests',
        'Number of active HTTP requests'
    )

    ITEMS_IN_QUEUE = Gauge(
        'app_items_in_queue',
        'Number of items waiting in the processing queue'
    )


    def track_request(method, endpoint):
        """Decorator that records count, duration, and in-flight gauge
        for the wrapped request handler."""
        def decorator(f):
            @wraps(f)  # preserve f's name so Flask route endpoints stay unique
            def wrapper(*args, **kwargs):
                ACTIVE_REQUESTS.inc()
                start_time = time.time()
                status = 500  # assume failure; overwritten on success
                try:
                    result = f(*args, **kwargs)
                    status = 200
                    return result
                finally:
                    # Runs on both success and exception paths
                    duration = time.time() - start_time
                    REQUEST_COUNT.labels(method, endpoint, status).inc()
                    REQUEST_DURATION.labels(method, endpoint).observe(duration)
                    ACTIVE_REQUESTS.dec()
            return wrapper
        return decorator


    @app.route('/')
    @track_request('GET', '/')
    def home():
        # Simulate variable processing time
        time.sleep(random.uniform(0.01, 0.1))
        return {'status': 'ok', 'message': 'Hello from the monitored app!'}


    @app.route('/api/process')
    @track_request('GET', '/api/process')
    def process():
        # Simulate heavier processing
        time.sleep(random.uniform(0.05, 0.5))
        # Single draw so the gauge and the response body agree
        queue_size = random.randint(0, 100)
        ITEMS_IN_QUEUE.set(queue_size)
        return {'status': 'processed', 'queue_size': queue_size}


    @app.route('/api/slow')
    @track_request('GET', '/api/slow')
    def slow_endpoint():
        # Intentionally slow for testing alerts
        time.sleep(random.uniform(0.5, 2.0))
        return {'status': 'done', 'note': 'This was intentionally slow'}


    @app.route('/metrics')
    def metrics():
        # Prometheus scrape endpoint in the text exposition format
        return Response(generate_latest(), mimetype=CONTENT_TYPE_LATEST)


    if __name__ == '__main__':
        app.run(host='0.0.0.0', port=8000)

    Create app/Dockerfile:

    # Slim base image keeps the final image small
    FROM python:3.12-slim

    WORKDIR /app

    # Copy requirements first so the dependency layer is cached
    # across application-code changes
    COPY requirements.txt .

    RUN pip install --no-cache-dir -r requirements.txt

    COPY app.py .

    # Port served by the Flask app (including /metrics)
    EXPOSE 8000

    CMD ["python", "app.py"]

    ---

    Step 5: Alertmanager Configuration

    Create alertmanager/alertmanager.yml:

    global:
      resolve_timeout: 5m

    route:
      group_by: ['alertname', 'severity']
      group_wait: 10s
      group_interval: 10s
      repeat_interval: 1h
      receiver: 'slack-notifications'
      routes:
        # Critical alerts are escalated to a dedicated channel, more often
        - match:
            severity: critical
          receiver: 'slack-critical'
          repeat_interval: 15m

    receivers:
      - name: 'slack-notifications'
        slack_configs:
          - api_url: 'https://hooks.slack.com/services/YOUR/WEBHOOK/URL'
            channel: '#monitoring'
            title: '{{ .GroupLabels.alertname }}'
            # Double quotes so \n is a real newline (single quotes would
            # send a literal backslash-n to Slack)
            text: "{{ range .Alerts }}{{ .Annotations.description }}\n{{ end }}"
            send_resolved: true

      - name: 'slack-critical'
        slack_configs:
          - api_url: 'https://hooks.slack.com/services/YOUR/WEBHOOK/URL'
            channel: '#incidents'
            title: '🚨 CRITICAL: {{ .GroupLabels.alertname }}'
            text: "{{ range .Alerts }}{{ .Annotations.description }}\n{{ end }}"
            send_resolved: true

    ---

    Step 6: Grafana Auto-Provisioning

    Create grafana/provisioning/datasources/prometheus.yml:

    apiVersion: 1

    datasources:
      - name: Prometheus
        type: prometheus
        # proxy: Grafana's backend reaches Prometheus over the Docker network
        access: proxy
        url: http://prometheus:9090
        isDefault: true
        editable: false

    Create grafana/provisioning/dashboards/dashboard.yml:

    apiVersion: 1

    providers:
      - name: 'default'
        orgId: 1
        folder: ''
        type: file
        disableDeletion: false
        editable: true
        options:
          # Grafana loads every *.json dashboard found here at startup
          path: /etc/grafana/provisioning/dashboards
          foldersFromFilesStructure: false

    ---

    Step 7: Launch and Test

    # Start the entire stack in the background
    docker compose up -d

    # Verify all services are running
    docker compose ps

    # Check Prometheus targets
    curl http://localhost:9090/api/v1/targets | python -m json.tool

    # Generate some traffic to the sample app
    for i in $(seq 1 50); do
      curl -s http://localhost:8000/ > /dev/null
      curl -s http://localhost:8000/api/process > /dev/null
      curl -s http://localhost:8000/api/slow > /dev/null
    done

    # Check metrics
    curl http://localhost:8000/metrics

    Access the services:

  • Prometheus: http://localhost:9090
  • Grafana: http://localhost:3000 (admin/changeme)
  • Alertmanager: http://localhost:9093
  • Sample App: http://localhost:8000
---

    Step 8: Useful PromQL Queries

    Try these queries in the Prometheus UI:

    # Request rate per second
    

    rate(http_requests_total[5m])

    # 95th percentile latency

    histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))

    # CPU usage percentage

    100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)

    # Memory usage percentage

    (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100

    # Disk usage percentage

    (1 - (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"})) * 100

    # Active HTTP requests

    http_active_requests

    ---

    Step 9: Production Hardening Tips

    1. Secure Grafana: Change default passwords, enable HTTPS, configure OAuth

    2. Retention policy: Adjust --storage.tsdb.retention.time based on storage capacity

    3. Remote storage: Use Thanos or Cortex for long-term metric storage

    4. Service discovery: Replace static configs with Consul, Kubernetes, or EC2 SD

    5. Recording rules: Pre-compute expensive queries for dashboard performance

    # Example recording rule — place it in a rules file (e.g. alert.rules.yml
    # or a separate recording.rules.yml) referenced by rule_files in
    # prometheus.yml; rules cannot go directly in the main config.
    groups:
      - name: recording_rules
        rules:
          - record: job:http_request_duration_seconds:p95
            expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))

    ---

    Cleanup

    # Stop and remove everything
    

    docker compose down -v

    Next Steps

  • Add Loki for log aggregation alongside metrics
  • Integrate Jaeger or Tempo for distributed tracing
  • Set up Grafana OnCall for incident management
  • Explore OpenTelemetry for unified telemetry collection

You now have a production-grade monitoring stack! Start adding your own services as scrape targets and build custom dashboards for your specific needs.