version: "3.9"

services:
  sqlcoder-tgi:
    image: ghcr.io/huggingface/text-generation-inference:2.1.0
    container_name: sqlcoder-tgi
    restart: unless-stopped
    ports:
      - "8080:80"
    environment:
      # Set your Hugging Face token to pull gated models
      HUGGING_FACE_HUB_TOKEN: ${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}
      # Pin to the 3090 (adjust if your GPU order differs)
      CUDA_VISIBLE_DEVICES: "0"
      DISABLE_FLASH_ATTENTION: "1"
      CUDA_LAUNCH_BLOCKING: "1"
    command:
      - "--model-id=defog/sqlcoder-7b-2"
      - "--dtype=float16"
      - "--max-total-tokens=6144"
      - "--max-input-length=4096"
      - "--max-batch-prefill-tokens=4560"
      - "--cuda-memory-fraction=0.9"
      - "--trust-remote-code"
    volumes:
      - model-cache:/data
    deploy:
      resources:
        reservations:
          devices:
            - capabilities: [gpu]
              count: 1
              driver: nvidia

volumes:
  model-cache: