version: "3.9" services: sqlcoder-tgi: image: ghcr.io/huggingface/text-generation-inference:2.1.0 container_name: sqlcoder-tgi restart: unless-stopped ports: - "8080:80" environment: # Set your Hugging Face token to pull gated models HUGGING_FACE_HUB_TOKEN: ${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} # Pin to the 3090 (adjust if your GPU order differs) CUDA_VISIBLE_DEVICES: "0" DISABLE_FLASH_ATTENTION: "1" CUDA_LAUNCH_BLOCKING: "1" command: - "--model-id=defog/sqlcoder-7b-2" - "--dtype=float16" - "--max-total-tokens=6144" - "--max-input-length=4096" - "--max-batch-prefill-tokens=4560" - "--cuda-memory-fraction=0.9" - "--trust-remote-code" volumes: - model-cache:/data deploy: resources: reservations: devices: - capabilities: [gpu] count: 1 driver: nvidia volumes: model-cache: