add all files

This commit is contained in:
Rucus
2026-02-17 09:29:34 -06:00
parent b8c8d67c67
commit 782d203799
21925 changed files with 2433086 additions and 0 deletions

@@ -0,0 +1,33 @@
# Local TGI Deployment for SQLCoder
This Docker Compose file runs Hugging Face Text Generation Inference (TGI) to serve the `defog/sqlcoder-7b-2` model.
## Prerequisites
- NVIDIA GPU with recent drivers and the `nvidia-container-toolkit` installed, so containers can access the CUDA runtime.
- Docker and `docker compose` v2.
- Hugging Face access token with model download permissions (`HUGGING_FACE_HUB_TOKEN`).
## Usage
1. Export your Hugging Face token in the shell where you run compose:
```powershell
$env:HUGGING_FACE_HUB_TOKEN = "hf_..."
```
2. Launch the stack:
```powershell
docker compose -f db_agent/deployment/docker-compose.yml up -d
```
3. Check logs:
```powershell
docker compose -f db_agent/deployment/docker-compose.yml logs -f
```
4. The TGI OpenAI-compatible endpoint will be available at `http://localhost:8080/v1`. Use it with `openai`-compatible SDKs or direct HTTP calls.
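As a sketch, the endpoint can also be called with plain HTTP from the Python standard library, no SDK required. The prompt format below is an illustrative assumption, not SQLCoder's official template (see the model card for that), and `build_chat_request`/`ask` are hypothetical helper names:

```python
import json
import urllib.request

# OpenAI-compatible base URL exposed by the compose file
TGI_BASE_URL = "http://localhost:8080/v1"

def build_chat_request(question: str, schema: str) -> dict:
    """Build an OpenAI-style chat-completions payload for SQLCoder.

    The prompt layout here is a placeholder; adapt it to the
    prompt template documented on the defog/sqlcoder-7b-2 model card.
    """
    return {
        "model": "defog/sqlcoder-7b-2",
        "messages": [
            {
                "role": "user",
                "content": f"### Task\nGenerate a SQL query for: {question}\n\n### Schema\n{schema}",
            },
        ],
        "max_tokens": 512,   # well under --max-total-tokens=6144
        "temperature": 0.0,  # deterministic SQL generation
    }

def ask(question: str, schema: str) -> str:
    """POST the payload to TGI and return the generated SQL text."""
    payload = json.dumps(build_chat_request(question, schema)).encode()
    req = urllib.request.Request(
        f"{TGI_BASE_URL}/chat/completions",
        data=payload,
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(req) as resp:
        body = json.load(resp)
    return body["choices"][0]["message"]["content"]

# Example usage (requires the stack to be up):
# sql = ask("How many users signed up last week?",
#           "CREATE TABLE users (id INT, created_at TIMESTAMP);")
```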
## Notes
- The compose file pins `CUDA_VISIBLE_DEVICES=0` to target the 24 GB RTX 3090; update if your GPU indices differ.
- Token limits are capped (`--max-total-tokens=6144`, `--max-input-length=4096`) to stay within 16–24 GB cards.
- Models are cached on the `model-cache` volume to avoid re-downloading.
- To shut down:
```powershell
docker compose -f db_agent/deployment/docker-compose.yml down
```
- For CPU-only testing, remove the `deploy.resources` block and expect very slow inference.
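Instead of editing the main file in place, a standalone CPU-only variant can be kept alongside it. This is a minimal sketch under an assumed filename (`docker-compose.cpu.yml`); flags mirror the GPU file but with reduced limits:

```yaml
# docker-compose.cpu.yml — hypothetical CPU-only variant for smoke tests.
# No deploy.resources block, so no GPU is reserved; inference will be very slow.
services:
  sqlcoder-tgi:
    image: ghcr.io/huggingface/text-generation-inference:2.1.0
    ports:
      - "8080:80"
    environment:
      HUGGING_FACE_HUB_TOKEN: ${HUGGING_FACE_HUB_TOKEN}
    command:
      - "--model-id=defog/sqlcoder-7b-2"
      - "--max-total-tokens=1024"   # small limits keep CPU latency tolerable
      - "--max-input-length=512"
    volumes:
      - model-cache:/data
volumes:
  model-cache:
```

Run it with `docker compose -f db_agent/deployment/docker-compose.cpu.yml up -d`; the model cache volume is shared, so weights downloaded once are reused.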

@@ -0,0 +1,36 @@
version: "3.9"
services:
sqlcoder-tgi:
image: ghcr.io/huggingface/text-generation-inference:2.1.0
container_name: sqlcoder-tgi
restart: unless-stopped
ports:
- "8080:80"
environment:
      # Set your Hugging Face token to pull gated models
      HUGGING_FACE_HUB_TOKEN: ${HUGGING_FACE_HUB_TOKEN}
# Pin to the 3090 (adjust if your GPU order differs)
CUDA_VISIBLE_DEVICES: "0"
DISABLE_FLASH_ATTENTION: "1"
CUDA_LAUNCH_BLOCKING: "1"
command:
- "--model-id=defog/sqlcoder-7b-2"
- "--dtype=float16"
- "--max-total-tokens=6144"
- "--max-input-length=4096"
- "--max-batch-prefill-tokens=4560"
- "--cuda-memory-fraction=0.9"
- "--trust-remote-code"
volumes:
- model-cache:/data
deploy:
resources:
reservations:
devices:
- capabilities: [gpu]
count: 1
driver: nvidia
volumes:
model-cache:
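Because the model download and load can take several minutes on first start, a compose healthcheck against TGI's `/health` route keeps dependents from connecting too early. A sketch of keys that could be added under the `sqlcoder-tgi` service (assumes `curl` is present in the TGI image; verify before relying on it):

```yaml
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:80/health"]
      interval: 30s
      timeout: 5s
      retries: 5
      start_period: 300s   # allow time for first-run model download + load
```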