version: "3.9"

services:
  sqlcoder-tgi:
    image: ghcr.io/huggingface/text-generation-inference:2.1.0
    container_name: sqlcoder-tgi
    restart: unless-stopped
    ports:
      - "8080:80"
    environment:
      # Set your Hugging Face token to pull gated models
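      # The token is substituted from the host environment or a .env file next
      # to this compose file (for example, a line like HF_TOKEN=hf_xxxxxxxx).
      # HF_TOKEN is only an example variable name; never commit the raw token.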
      HUGGING_FACE_HUB_TOKEN: ${HF_TOKEN}
      # Pin to the 3090 (adjust if your GPU order differs)
      CUDA_VISIBLE_DEVICES: "0"
      DISABLE_FLASH_ATTENTION: "1"
      CUDA_LAUNCH_BLOCKING: "1"
    command:
      - "--model-id=defog/sqlcoder-7b-2"
      - "--dtype=float16"
      - "--max-total-tokens=6144"
      - "--max-input-length=4096"
      - "--max-batch-prefill-tokens=4560"
      - "--cuda-memory-fraction=0.9"
      - "--trust-remote-code"
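    # Rough token budget implied by the flags above: with --max-total-tokens=6144
    # and --max-input-length=4096, a single request can generate at most about
    # 6144 - 4096 = 2048 new tokens (TGI counts prompt and completion together).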
    volumes:
      - model-cache:/data
    deploy:
      resources:
        reservations:
          devices:
            - capabilities: [gpu]
              count: 1
              driver: nvidia

volumes:
  model-cache:
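
# Prerequisites and usage sketch (assumes the NVIDIA Container Toolkit is
# installed so the GPU reservation above works, and that nothing else is
# bound to host port 8080):
#
#   docker compose up -d
#   curl http://localhost:8080/generate \
#     -X POST \
#     -H "Content-Type: application/json" \
#     -d '{"inputs": "Example prompt", "parameters": {"max_new_tokens": 256}}'
#
# /generate is TGI's standard text-generation endpoint; the prompt shown is
# purely illustrative.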