[mod] add llama
This commit is contained in:
parent
6a5410a926
commit
b273987872
1 changed file with 38 additions and 0 deletions
|
|
@ -0,0 +1,38 @@
|
|||
# llama.cpp inference server (Vulkan backend) — serves an OpenAI-compatible
# HTTP API on the published port. Model comes from Hugging Face by default
# (Option A) or a locally mounted .gguf file (Option B).
services:

  llama-vulkan:
    image: ghcr.io/ggml-org/llama.cpp:server-vulkan
    container_name: llama-server-vulkan

    ports:
      # Container side (8080) must match LLAMA_ARG_PORT below.
      - "${HOST_PORT:-8080}:8080"

    volumes:
      # Model cache directory. Override MODELS_DIR to point at your own
      # model folder (default preserves the original hard-coded path).
      - "${MODELS_DIR:-/home/andy/.lmstudio/models/lmstudio-community}:/root/.cache/llama.cpp/"
      # Optionally mount a local .gguf file directly:
      # - /absolute/path/to/your-model.gguf:/model.gguf:ro

    devices:
      # GPU render/device nodes required for Vulkan.
      - /dev/dri:/dev/dri
      # NOTE(review): /dev/kfd is the AMD KFD compute interface — presumably
      # only needed on AMD hardware; confirm it exists on the target host.
      - /dev/kfd:/dev/kfd

    environment:
      # Uncomment to select specific GPU(s) if you have multiple
      # GGML_VK_VISIBLE_DEVICES: "0" # Use first Vulkan device

      # Model loading: Option A - Download from Hugging Face (recommended)
      LLAMA_ARG_HF_REPO: "${HF_REPO:-ggml-org/Qwen3-1.7B-GGUF}"

      # Model loading: Option B - Use local file (uncomment, comment HF_REPO)
      # LLAMA_ARG_MODEL: "/model.gguf"

      # Server settings
      LLAMA_ARG_HOST: "0.0.0.0"
      # BUGFIX: was "8878", but the port mapping above publishes container
      # port 8080 — the server was unreachable through the published port.
      LLAMA_ARG_PORT: "8080"
      LLAMA_ARG_N_GPU_LAYERS: "99"
      LLAMA_ARG_CTX_SIZE: "${CTX_SIZE:-8000}"

      # Optional performance settings
      LLAMA_ARG_BATCH_SIZE: "${BATCH_SIZE:-512}"
      LLAMA_ARG_THREADS: "${THREADS:-8}"

    restart: unless-stopped

    # Extra CLI flags appended to the server invocation.
    command:
      - "--jinja"       # Enable Jinja templates for chat formatting
      - "--flash-attn"  # Enable flash attention if supported
Loading…
Add table
Add a link
Reference in a new issue