services:
  llama-vulkan:
    image: ghcr.io/ggml-org/llama.cpp:server-vulkan
    container_name: llama-server-vulkan

    ports:
      # Quoted: unquoted host:container pairs can hit YAML 1.1 sexagesimal
      # integer parsing. Host 8878 -> container 8080.
      - "8878:8080"

    volumes:
      - /home/andy/.lmstudio/models/lmstudio-community:/models
      # Optionally mount a local .gguf file directly:
      # - /absolute/path/to/your-model.gguf:/model.gguf:ro

    devices:
      - /dev/dri:/dev/dri  # DRM render nodes, needed for Vulkan
      - /dev/kfd:/dev/kfd  # NOTE(review): /dev/kfd is the AMD ROCm compute
                           # node; presumably present for AMD hosts — confirm
                           # it exists, or Compose fails to start the container

    environment:
      # Uncomment to select specific GPU(s) if you have multiple
      # GGML_VK_VISIBLE_DEVICES: "0"  # Use first Vulkan device

      # Model loading: Option A - Download from Hugging Face (recommended)
      LLAMA_ARG_HF_REPO: "${HF_REPO:-ggml-org/Qwen3-1.7B-GGUF}"
      # Model loading: Option B - Use local file (uncomment, comment HF_REPO)
      # LLAMA_ARG_MODEL: "/model.gguf"

      # Server settings.
      # NOTE: explicit flags in `command:` below take precedence over these
      # LLAMA_ARG_* env vars, so ctx-size/gpu-layers here act as fallbacks
      # that are currently overridden by --ctx-size 10000 / --n-gpu-layers all.
      LLAMA_ARG_HOST: "0.0.0.0"
      # Fixed: was "8878" (the published *host* port). The server must listen
      # on the *container* port, 8080, to match the ports mapping above and
      # the --port 8080 flag in command.
      LLAMA_ARG_PORT: "8080"
      LLAMA_ARG_N_GPU_LAYERS: "99"
      LLAMA_ARG_CTX_SIZE: "${CTX_SIZE:-8000}"

      # Optional performance settings
      LLAMA_ARG_BATCH_SIZE: "${BATCH_SIZE:-512}"
      LLAMA_ARG_THREADS: "${THREADS:-8}"

    restart: unless-stopped

    # Folded block scalar: rendered as a single command line at runtime.
    # These flags override the LLAMA_ARG_* env vars above.
    command: >
      --models-dir /models
      --n-gpu-layers all
      --parallel 2
      --flash-attn on
      --ctx-size 10000
      --no-mmap
      --cache-type-k q8_0
      --cache-type-v q8_0
      --jinja
      --reasoning-budget 0
      --temp 0.8
      --top-p 0.95
      --top-k 20
      --min-p 0.0
      --presence-penalty 1.0
      --repeat-penalty 1.0
      --port 8080
      --host 0.0.0.0