# thedocks/llama.cpp/compose.yml
# Last modified: 2026-04-23 22:58:27 +01:00
services:
  llama-vulkan:
    image: ghcr.io/ggml-org/llama.cpp:server-vulkan
    container_name: llama-server-vulkan
    ports:
      # host 8878 -> container 8080; quoted so YAML never misreads the
      # colon-separated value as a sexagesimal number
      - "8878:8080"
    volumes:
      - /home/andy/.lmstudio/models/lmstudio-community:/models
      # Optionally mount a local .gguf file directly:
      # - /absolute/path/to/your-model.gguf:/model.gguf:ro
    devices:
      # GPU device nodes: DRI for Vulkan, KFD for AMD compute
      - /dev/dri:/dev/dri
      - /dev/kfd:/dev/kfd
    environment:
      # Uncomment to select specific GPU(s) if you have multiple
      # GGML_VK_VISIBLE_DEVICES: "0"  # Use first Vulkan device
      # Model loading: Option A - Download from Hugging Face (recommended)
      LLAMA_ARG_HF_REPO: "${HF_REPO:-ggml-org/Qwen3-1.7B-GGUF}"
      # Model loading: Option B - Use local file (uncomment, comment HF_REPO)
      # LLAMA_ARG_MODEL: "/model.gguf"
      # Server settings. The server listens on 8080 INSIDE the container
      # (matching the ports mapping above); 8878 is only the published
      # host-side port.
      LLAMA_ARG_HOST: "0.0.0.0"
      LLAMA_ARG_PORT: "8080"
      LLAMA_ARG_N_GPU_LAYERS: "99"
      LLAMA_ARG_CTX_SIZE: "${CTX_SIZE:-10000}"
      # Optional performance settings
      LLAMA_ARG_BATCH_SIZE: "${BATCH_SIZE:-512}"
      LLAMA_ARG_THREADS: "${THREADS:-8}"
    restart: unless-stopped
    # NOTE: CLI flags take precedence over LLAMA_ARG_* env vars, so only
    # options NOT already set via environment above belong here — otherwise
    # the parameterized env values (CTX_SIZE, etc.) are silently ignored.
    command: >
      --models-dir /models
      --parallel 2
      --flash-attn on
      --no-mmap
      --cache-type-k q8_0
      --cache-type-v q8_0
      --jinja
      --reasoning-budget 0
      --temp 0.8
      --top-p 0.95
      --top-k 20
      --min-p 0.0
      --presence-penalty 1.0
      --repeat-penalty 1.0