[mod] add llama
This commit is contained in:
parent
6a5410a926
commit
b273987872
1 changed file with 38 additions and 0 deletions
|
|
@ -0,0 +1,38 @@
|
|||
# llama.cpp inference server (Vulkan backend) — serves an OpenAI-compatible
# HTTP API on the published port. Model comes from Hugging Face by default
# (Option A) or a locally mounted .gguf file (Option B).
services:

  llama-vulkan:
    image: ghcr.io/ggml-org/llama.cpp:server-vulkan
    container_name: llama-server-vulkan

    ports:
      # Container side (8080) must match LLAMA_ARG_PORT below.
      - "${HOST_PORT:-8080}:8080"

    volumes:
      # Model cache directory. Override MODELS_DIR to point at your own
      # model folder (default preserves the original hard-coded path).
      - "${MODELS_DIR:-/home/andy/.lmstudio/models/lmstudio-community}:/root/.cache/llama.cpp/"
      # Optionally mount a local .gguf file directly:
      # - /absolute/path/to/your-model.gguf:/model.gguf:ro

    devices:
      # GPU render/device nodes required for Vulkan.
      - /dev/dri:/dev/dri
      # NOTE(review): /dev/kfd is the AMD KFD compute interface — presumably
      # only needed on AMD hardware; confirm it exists on the target host.
      - /dev/kfd:/dev/kfd

    environment:
      # Uncomment to select specific GPU(s) if you have multiple
      # GGML_VK_VISIBLE_DEVICES: "0" # Use first Vulkan device

      # Model loading: Option A - Download from Hugging Face (recommended)
      LLAMA_ARG_HF_REPO: "${HF_REPO:-ggml-org/Qwen3-1.7B-GGUF}"

      # Model loading: Option B - Use local file (uncomment, comment HF_REPO)
      # LLAMA_ARG_MODEL: "/model.gguf"

      # Server settings
      LLAMA_ARG_HOST: "0.0.0.0"
      # BUGFIX: was "8878", but the port mapping above publishes container
      # port 8080 — the server was unreachable through the published port.
      LLAMA_ARG_PORT: "8080"
      LLAMA_ARG_N_GPU_LAYERS: "99"
      LLAMA_ARG_CTX_SIZE: "${CTX_SIZE:-8000}"

      # Optional performance settings
      LLAMA_ARG_BATCH_SIZE: "${BATCH_SIZE:-512}"
      LLAMA_ARG_THREADS: "${THREADS:-8}"

    restart: unless-stopped

    # Extra CLI flags appended to the server invocation.
    command:
      - "--jinja"       # Enable Jinja templates for chat formatting
      - "--flash-attn"  # Enable flash attention if supported
Loading…
Add table
Add a link
Reference in a new issue