services:
  llama-vulkan:
    image: ghcr.io/ggml-org/llama.cpp:server-vulkan
    container_name: llama-server-vulkan
    ports:
      - "8878:8080"
    volumes:
      # Host model directory mounted into llama.cpp's download cache so models persist
      - /home/andy/.lmstudio/models/lmstudio-community:/root/.cache/llama.cpp/
      # Optionally mount a local .gguf file directly:
      # - /absolute/path/to/your-model.gguf:/model.gguf:ro
    devices:
      # GPU device nodes: /dev/dri for Vulkan/DRM, /dev/kfd for AMD ROCm compute
      - /dev/dri:/dev/dri
      - /dev/kfd:/dev/kfd
    environment:
      # Uncomment to select specific GPU(s) if you have multiple
      # GGML_VK_VISIBLE_DEVICES: "0"  # Use first Vulkan device

      # Model loading: Option A - Download from Hugging Face (recommended)
      LLAMA_ARG_HF_REPO: "${HF_REPO:-ggml-org/Qwen3-1.7B-GGUF}"

      # Model loading: Option B - Use local file (uncomment, and comment out HF_REPO)
      # LLAMA_ARG_MODEL: "/model.gguf"

      # Server settings
      LLAMA_ARG_HOST: "0.0.0.0"
      # Container-internal port; must match the container side of the ports mapping above
      LLAMA_ARG_PORT: "8080"
      LLAMA_ARG_N_GPU_LAYERS: "99"
      LLAMA_ARG_CTX_SIZE: "${CTX_SIZE:-8000}"

      # Optional performance settings
      LLAMA_ARG_BATCH_SIZE: "${BATCH_SIZE:-512}"
      LLAMA_ARG_THREADS: "${THREADS:-8}"
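
      # The ${VAR:-default} values above are resolved by Docker Compose from the shell
      # environment or a .env file next to this compose file. A minimal sketch of such
      # a .env (values are illustrative, not part of the original config):
      #
      #   HF_REPO=ggml-org/Qwen3-1.7B-GGUF
      #   CTX_SIZE=16384
      #   BATCH_SIZE=1024
      #   THREADS=12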

    restart: unless-stopped
    # Flags appended to llama-server; where they overlap with the LLAMA_ARG_*
    # settings above, the command-line flags take precedence (so --ctx-size 10000
    # is the effective context size here)
    command: >
      --n-gpu-layers all
      --parallel 2
      --flash-attn on
      --ctx-size 10000
      --no-mmap
      --cache-type-k q8_0
      --cache-type-v q8_0
      --jinja
      --reasoning-budget 0
      --temp 0.8
      --top-p 0.95
      --top-k 20
      --min-p 0.0
      --presence-penalty 1.0
      --repeat-penalty 1.0
      --port 8080
      --host 0.0.0.0
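
# Quick-start sketch (assumptions: this file is saved as docker-compose.yml and the
# host actually exposes /dev/dri and /dev/kfd; the curl calls hit llama.cpp server's
# built-in /health and OpenAI-compatible /v1/chat/completions endpoints on the
# mapped host port 8878):
#
#   docker compose up -d
#   curl http://localhost:8878/health
#   curl http://localhost:8878/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"messages": [{"role": "user", "content": "Hello"}]}'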