diff --git a/llama.cpp/compose.yml b/llama.cpp/compose.yml
index e69de29..db0ba86 100644
--- a/llama.cpp/compose.yml
+++ b/llama.cpp/compose.yml
@@ -0,0 +1,38 @@
+services:
+  llama-vulkan:
+    image: ghcr.io/ggml-org/llama.cpp:server-vulkan
+    container_name: llama-server-vulkan
+    ports:
+      - "${HOST_PORT:-8080}:8080"
+    volumes:
+      - /home/andy/.lmstudio/models/lmstudio-community:/root/.cache/llama.cpp/
+      # Optionally mount a local .gguf file directly:
+      # - /absolute/path/to/your-model.gguf:/model.gguf:ro
+    devices:
+      - /dev/dri:/dev/dri
+      - /dev/kfd:/dev/kfd  # NOTE(review): /dev/kfd is ROCm-specific; Vulkan only needs /dev/dri — confirm it is required here
+    environment:
+      # Uncomment to select specific GPU(s) if you have multiple
+      # GGML_VK_VISIBLE_DEVICES: "0"  # Use first Vulkan device
+
+      # Model loading: Option A - Download from Hugging Face (recommended)
+      LLAMA_ARG_HF_REPO: "${HF_REPO:-ggml-org/Qwen3-1.7B-GGUF}"
+
+      # Model loading: Option B - Use local file (uncomment, comment HF_REPO)
+      # LLAMA_ARG_MODEL: "/model.gguf"
+
+      # Server settings
+      LLAMA_ARG_HOST: "0.0.0.0"
+      # Must match the container-side port in the "ports" mapping above,
+      # otherwise the published port connects to nothing.
+      LLAMA_ARG_PORT: "8080"
+      LLAMA_ARG_N_GPU_LAYERS: "99"
+      LLAMA_ARG_CTX_SIZE: "${CTX_SIZE:-8000}"
+
+      # Optional performance settings
+      LLAMA_ARG_BATCH_SIZE: "${BATCH_SIZE:-512}"
+      LLAMA_ARG_THREADS: "${THREADS:-8}"
+
+    restart: unless-stopped
+    command:
+      - "--jinja"       # Enable Jinja templates for chat formatting
+      - "--flash-attn"  # Enable flash attention if supported