mirror of https://github.com/foomo/gsamservice.git

initial commit

commit 7b063e8357 (parent 27ffdf2668)
1 .gitignore vendored Normal file
@@ -0,0 +1 @@
example/out
57 Dockerfile Normal file
@@ -0,0 +1,57 @@
FROM docker.io/pytorch/pytorch:2.3.1-cuda12.1-cudnn8-devel

# arguments to build the Docker image using CUDA
ARG USE_CUDA=0
ARG TORCH_ARCH="7.0;7.5;8.0;8.6"

ENV AM_I_DOCKER=True
ENV BUILD_WITH_CUDA="${USE_CUDA}"
ENV TORCH_CUDA_ARCH_LIST="${TORCH_ARCH}"
ENV CUDA_HOME=/usr/local/cuda-12.1/
# ensure CUDA is correctly set up
ENV PATH=/usr/local/cuda-12.1/bin:${PATH}
ENV LD_LIBRARY_PATH=/usr/local/cuda-12.1/lib64:${LD_LIBRARY_PATH}

# install required packages and a specific gcc/g++
RUN apt-get update && apt-get install --no-install-recommends wget ffmpeg=7:* \
    libsm6=2:* libxext6=2:* git=1:* nano vim=2:* ninja-build gcc-10 g++-10 -y \
    && apt-get clean && apt-get autoremove && rm -rf /var/lib/apt/lists/*

ENV CC=gcc-10
ENV CXX=g++-10

# clone the Grounded-SAM-2 repo
WORKDIR /home/appuser
RUN git clone https://github.com/IDEA-Research/Grounded-SAM-2

# download the SAM2 checkpoints
WORKDIR /home/appuser/Grounded-SAM-2/checkpoints
RUN bash download_ckpts.sh

# download the Grounding DINO checkpoints
WORKDIR /home/appuser/Grounded-SAM-2/gdino_checkpoints
RUN bash download_ckpts.sh

WORKDIR /home/appuser/Grounded-SAM-2

# install essential Python packages
RUN python -m pip install --upgrade pip "setuptools>=62.3.0,<75.9" wheel numpy \
    opencv-python transformers supervision pycocotools addict yapf timm

# install the segment_anything package in editable mode
RUN python -m pip install -e .

# install Grounding DINO
RUN python -m pip install --no-build-isolation -e grounding_dino

# install the server dependencies
COPY requirements.txt requirements.txt
RUN python -m pip install -r requirements.txt

COPY app.py app.py
COPY imagesegmentation.py imagesegmentation.py

# RUN mkdir ../host

# start the server
ENTRYPOINT ["python", "app.py", "--log-level", "debug"]
43 Makefile Normal file
@@ -0,0 +1,43 @@
# Get the version of CUDA and enable it for compilation if CUDA > 11.0
# This solves https://github.com/IDEA-Research/Grounded-Segment-Anything/issues/53
# and https://github.com/IDEA-Research/Grounded-Segment-Anything/issues/84
# when running in Docker

# Check if nvcc is installed
NVCC := $(shell which nvcc)
ifeq ($(NVCC),)
# NVCC not found
USE_CUDA := 0
NVCC_VERSION := "not installed"
else
NVCC_VERSION := $(shell nvcc --version | grep -oP 'release \K[0-9.]+')
USE_CUDA := $(shell echo "$(NVCC_VERSION) > 11" | bc -l)
endif

# Add the list of supported ARCHs
ifeq ($(USE_CUDA), 1)
TORCH_CUDA_ARCH_LIST := "7.0;7.5;8.0;8.6+PTX"
BUILD_MESSAGE := "Trying to build the image with CUDA support"
else
TORCH_CUDA_ARCH_LIST :=
BUILD_MESSAGE := "CUDA $(NVCC_VERSION) is not supported"
endif

build:
	docker build --build-arg USE_CUDA=$(USE_CUDA) \
		--build-arg TORCH_ARCH=$(TORCH_CUDA_ARCH_LIST) \
		--progress=plain -t gsam2 .

run:
	docker run -d --gpus all \
		--restart unless-stopped \
		--name=gsam2 \
		--ipc=host -p 13337:13337 gsam2

run-bash:
	docker run -it --rm --gpus all \
		-v "${PWD}":/home/appuser/host \
		--entrypoint bash \
		--name=gsam2 \
		--network=host \
		--ipc=host gsam2
14 README.md Normal file
@@ -0,0 +1,14 @@
# GSAM Service

Simple server providing [Grounded SAM2](https://github.com/IDEA-Research/Grounded-SAM-2) through a REST API.

## Usage

Build and run the container:

```
make build
make run
```

You can then connect to the server on port 13337. Have a look at `example/main.go` for examples of the provided endpoints.
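For a quick smoke test without the Go example, a request along these lines should work; this is a minimal sketch using only the Python standard library, and the test image name `truck.jpg` is an assumption:

```
# Sketch: query the maskfromtext endpoint with a base64-encoded image.
# Assumes the container is running locally on port 13337.
import base64
import json
import urllib.request

with open("truck.jpg", "rb") as f:  # any test image
    image_b64 = base64.b64encode(f.read()).decode("utf-8")

payload = json.dumps({"image": image_b64, "text": "truck. tire."}).encode("utf-8")
req = urllib.request.Request(
    "http://localhost:13337/gsam2/image/maskfromtext",
    data=payload,
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    result = json.load(resp)

for m in result["masks"]:
    print(m["class_name"], m["score"], m["bbox"])
```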
303 app.py Normal file
@@ -0,0 +1,303 @@
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import List, Tuple
import base64
import io
from PIL import Image
import numpy as np
from imagesegmentation import ImageSegmentation

app = FastAPI(
    title="GSAM2 API",
    description="Grounded SAM 2 Image Segmentation API",
    version="1.0.0",
)

segmentation_model = ImageSegmentation()


# pydantic models for request validation
class Point(BaseModel):
    x: int
    y: int
    include: bool  # True for include, False for exclude


class BoundingBox(BaseModel):
    upper_left: Tuple[int, int]  # (x, y) coordinates
    lower_right: Tuple[int, int]  # (x, y) coordinates


class MaskFromTextRequest(BaseModel):
    image: str  # base64 encoded image
    text: str


class MaskFromBBoxRequest(BaseModel):
    image: str  # base64 encoded image
    bboxes: List[BoundingBox]


class MaskFromPointsRequest(BaseModel):
    image: str  # base64 encoded image
    points: List[Point]


class MaskResult(BaseModel):
    mask: str
    score: float
    bbox: BoundingBox  # bounding box generated from the mask

    # fields, only populated in responses for MaskFromTextRequests
    class_name: str = ""
    dino_bbox: BoundingBox = BoundingBox(upper_left=(0, 0), lower_right=(0, 0))
    center_of_mass: Tuple[float, float] = (0.0, 0.0)


class MaskResponse(BaseModel):
    masks: List[MaskResult]  # list of base64 encoded mask images and respective scores
    image: str  # base64 encoded result image


def decode_base64_image(base64_string: str) -> Image.Image:
    """Helper function to decode a base64 image string to a PIL Image"""
    try:
        # remove data URL prefix if present
        if base64_string.startswith("data:image"):
            base64_string = base64_string.split(",")[1]

        image_data = base64.b64decode(base64_string)
        image = Image.open(io.BytesIO(image_data))
        return image
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Invalid image data: {str(e)}")


def encode_mask_to_base64(mask: np.ndarray) -> str:
    """Helper function to encode a mask array to a base64 string"""
    try:
        # convert the mask to a PIL Image (assuming a binary mask)
        mask_image = Image.fromarray((mask * 255).astype(np.uint8), mode="L")

        # convert to base64
        buffer = io.BytesIO()
        mask_image.save(buffer, format="JPEG")
        mask_base64 = base64.b64encode(buffer.getvalue()).decode("utf-8")
        return mask_base64
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Failed to encode mask: {str(e)}")


def encode_image_to_base64(image: np.ndarray) -> str:
    """Helper function to encode an image array to a base64 string"""
    try:
        pil_image = Image.fromarray(image.astype(np.uint8))

        # convert to base64
        buffer = io.BytesIO()
        pil_image.save(buffer, format="JPEG")
        image_base64 = base64.b64encode(buffer.getvalue()).decode("utf-8")
        return image_base64
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Failed to encode image: {str(e)}")


@app.get("/")
async def root():
    return {"message": "GSAM2 API Server", "version": "1.0.0"}


@app.post("/gsam2/image/maskfromtext", response_model=MaskResponse)
async def mask_from_text(request: MaskFromTextRequest):
    """
    Generate segmentation masks from an image and a text description.

    Args:
        request: Contains base64 encoded image and text description

    Returns:
        MaskResponse with a list of base64 encoded masks, their scores and the result image
    """
    try:
        # decode the input image
        pil_image = decode_base64_image(request.image)
        text = request.text

        # segment the image
        masks, annotated_image = segmentation_model.segment_image_from_text(
            pil_image, text
        )

        # encode the results
        enc_masks = [
            MaskResult(
                mask=encode_mask_to_base64(mask),
                score=score,
                bbox=BoundingBox(
                    upper_left=(bbox[0], bbox[1]), lower_right=(bbox[2], bbox[3])
                ),
                class_name=class_name,
                dino_bbox=BoundingBox(
                    upper_left=(round(dino_bbox[0]), round(dino_bbox[1])),
                    lower_right=(round(dino_bbox[2]), round(dino_bbox[3])),
                ),
                center_of_mass=(com[0], com[1]),
            )
            for (mask, score, bbox, dino_bbox, class_name, com) in masks
        ]
        enc_annotated_image = encode_image_to_base64(annotated_image)

        return MaskResponse(
            masks=enc_masks,
            image=enc_annotated_image,
        )

    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Processing error: {str(e)}")


@app.post("/gsam2/image/maskfrombboxes", response_model=MaskResponse)
async def mask_from_bbox(request: MaskFromBBoxRequest):
    """
    Generate segmentation masks from an image and bounding boxes.

    Args:
        request: Contains base64 encoded image and bounding box coordinates

    Returns:
        MaskResponse with a list of base64 encoded masks, their scores and the result image
    """
    try:
        pil_image = decode_base64_image(request.image)

        # validate the bounding box coordinates and collect the boxes
        bboxes = None
        for bbox in request.bboxes:
            x1, y1 = bbox.upper_left
            x2, y2 = bbox.lower_right

            if x1 >= x2 or y1 >= y2:
                raise HTTPException(
                    status_code=400,
                    detail="Invalid bounding box: upper_left must be above and left of lower_right",
                )

            if x1 < 0 or y1 < 0 or x2 > pil_image.width or y2 > pil_image.height:
                raise HTTPException(
                    status_code=400,
                    detail="Bounding box coordinates out of image bounds",
                )

            # convert to the numpy array format expected by ImageSegmentation
            if bboxes is None:
                bboxes = np.array([[x1, y1, x2, y2]])
            else:
                bboxes = np.vstack((bboxes, [[x1, y1, x2, y2]]))

        if bboxes is None:
            raise HTTPException(
                status_code=400, detail="At least one bounding box is required"
            )

        # segment the image
        (masks, annotated_image) = segmentation_model.segment_image_from_bbox(
            pil_image, bboxes
        )

        # encode the results
        enc_masks = [
            MaskResult(
                mask=encode_mask_to_base64(mask),
                score=score,
                bbox=BoundingBox(
                    upper_left=(bbox[0], bbox[1]), lower_right=(bbox[2], bbox[3])
                ),
            )
            for (mask, score, bbox) in masks
        ]
        enc_annotated_image = encode_image_to_base64(annotated_image)

        return MaskResponse(
            masks=enc_masks,
            image=enc_annotated_image,
        )

    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Processing error: {str(e)}")


@app.post("/gsam2/image/maskfrompoints", response_model=MaskResponse)
async def mask_from_points(request: MaskFromPointsRequest):
    """
    Generate segmentation masks from an image and a list of points with include/exclude indicators.

    Args:
        request: Contains base64 encoded image and list of points with include/exclude flags

    Returns:
        MaskResponse with a list of base64 encoded masks, their scores and the result image
    """
    try:
        pil_image = decode_base64_image(request.image)

        # validate the point coordinates
        for i, point in enumerate(request.points):
            if (
                point.x < 0
                or point.x >= pil_image.width
                or point.y < 0
                or point.y >= pil_image.height
            ):
                raise HTTPException(
                    status_code=400, detail=f"Point {i} coordinates out of image bounds"
                )

        # convert the points to the numpy array format expected by ImageSegmentation
        points = None
        if request.points is not None and len(request.points) > 0:
            points = np.array(
                [[point.x, point.y, point.include] for point in request.points]
            )

        if points is None:
            raise HTTPException(
                status_code=400, detail="At least one point is required"
            )

        # segment the image
        (masks, annotated_image) = segmentation_model.segment_image_from_points(
            pil_image, points
        )

        # encode the results
        enc_masks = [
            MaskResult(
                mask=encode_mask_to_base64(mask),
                score=score,
                bbox=BoundingBox(
                    upper_left=(bbox[0], bbox[1]), lower_right=(bbox[2], bbox[3])
                ),
            )
            for (mask, score, bbox) in masks
        ]
        enc_annotated_image = encode_image_to_base64(annotated_image)

        return MaskResponse(
            masks=enc_masks,
            image=enc_annotated_image,
        )

    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Processing error: {str(e)}")


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=13337)
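Note that the masks in a `MaskResponse` come back as base64-encoded JPEGs, so clients need to decode them (and, since JPEG is lossy, re-threshold them) before further processing. A minimal sketch; the `result` dict is assumed to be a parsed JSON response from one of the endpoints above:

```
# Sketch: turn one base64 JPEG mask from the response back into a numpy array.
import base64
import io

import numpy as np
from PIL import Image

def decode_mask(mask_b64: str) -> np.ndarray:
    """Decode a base64 JPEG mask into a binary HxW uint8 array."""
    raw = base64.b64decode(mask_b64)
    gray = np.asarray(Image.open(io.BytesIO(raw)).convert("L"))
    # JPEG compression is lossy, so threshold to recover a clean 0/1 mask
    return (gray > 127).astype(np.uint8)

# usage, given a parsed MaskResponse dict called `result`:
# mask = decode_mask(result["masks"][0]["mask"])
```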
245 example/main.go Normal file
@@ -0,0 +1,245 @@
package main

import (
	"bytes"
	"encoding/base64"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"net/http/httputil"
	"os"
	"time"
)

var (
	url        = "http://localhost:13337"
	fromText   = "/gsam2/image/maskfromtext"
	fromBboxes = "/gsam2/image/maskfrombboxes"
	fromPoints = "/gsam2/image/maskfrompoints"

	image = "truck.jpg"
)

type (
	Point struct {
		X       int  `json:"x"`
		Y       int  `json:"y"`
		Include bool `json:"include"`
	}

	BoundingBox struct {
		UpperLeft  [2]int `json:"upper_left"`
		LowerRight [2]int `json:"lower_right"`
	}

	MaskFromTextRequest struct {
		Image string `json:"image"`
		Text  string `json:"text"`
	}

	MaskFromBBoxRequest struct {
		Image  string        `json:"image"`
		Bboxes []BoundingBox `json:"bboxes"`
	}

	MaskFromPointsRequest struct {
		Image  string  `json:"image"`
		Points []Point `json:"points"`
	}

	MaskResult struct {
		Mask  string      `json:"mask"`
		Score float64     `json:"score"`
		BBox  BoundingBox `json:"bbox"`
		// fields, only populated in responses for MaskFromTextRequests
		ClassName    string      `json:"class_name"`
		DinoBBox     BoundingBox `json:"dino_bbox"`
		CenterOfMass [2]float64  `json:"center_of_mass"`
	}

	MaskResponse struct {
		Masks []MaskResult `json:"masks"`
		Image string       `json:"image"`
	}
)

func main() {
	// ensure the out directory exists
	os.Mkdir("out", 0755)

	// load the sample image and base64 encode it
	dat, err := os.ReadFile(image)
	if err != nil {
		fmt.Println(err)
		return
	}

	// post to the different endpoints
	c := &http.Client{Timeout: time.Minute}
	encImage := base64.StdEncoding.EncodeToString(dat)

	// from text
	err = doFromText(c, "fromtext", encImage, "truck. tire.")
	if err != nil {
		fmt.Printf("error: %s\n", err)
		return
	}

	// from bboxes
	err = doFromBboxes(c, "frombboxes", encImage, []BoundingBox{
		{
			UpperLeft:  [2]int{75, 275},
			LowerRight: [2]int{1725, 850},
		},
		{
			UpperLeft:  [2]int{425, 600},
			LowerRight: [2]int{700, 875},
		},
		{
			UpperLeft:  [2]int{1375, 550},
			LowerRight: [2]int{1650, 800},
		},
		{
			UpperLeft:  [2]int{1240, 675},
			LowerRight: [2]int{1400, 750},
		},
	})
	if err != nil {
		fmt.Printf("error: %s\n", err)
		return
	}

	// from points
	err = doFromPoints(c, "frompoints", encImage, []Point{
		{X: 500, Y: 375, Include: true},
		{X: 1125, Y: 625, Include: true},
		{X: 575, Y: 750, Include: false},
	})
	if err != nil {
		fmt.Printf("error: %s\n", err)
		return
	}
}

func do(c *http.Client, req *http.Request, outname string) error {
	dump, err := httputil.DumpRequest(req, false)
	if err != nil {
		return err
	}
	fmt.Println("request: ", string(dump))

	resp, err := c.Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	// dump the response, including the body for error status codes
	dump, err = httputil.DumpResponse(resp, resp.StatusCode >= 300)
	if err != nil {
		return err
	}
	fmt.Println("response: ", string(dump))

	bodyBytes, err := io.ReadAll(resp.Body)
	if err != nil {
		return err
	}

	maskResp := MaskResponse{}
	err = json.Unmarshal(bodyBytes, &maskResp)
	if err != nil {
		return err
	}

	// write the masks to files
	for _, mask := range maskResp.Masks {
		dec, err := base64.StdEncoding.DecodeString(mask.Mask)
		if err != nil {
			return err
		}
		class := ""
		if mask.ClassName != "" {
			class = "-" + mask.ClassName
		}
		os.WriteFile(fmt.Sprintf("out/%s%s-%.4f.jpg", outname, class, mask.Score), dec, 0644)
	}

	dec, err := base64.StdEncoding.DecodeString(maskResp.Image)
	if err != nil {
		return err
	}
	os.WriteFile(fmt.Sprintf("out/%s.jpg", outname), dec, 0644)

	return nil
}

func doFromText(c *http.Client, outname string, encImage string, text string) error {
	body := MaskFromTextRequest{
		Image: encImage,
		Text:  text,
	}
	jsonBody, err := json.Marshal(body)
	if err != nil {
		return err
	}

	req, err := http.NewRequest("POST", url+fromText, bytes.NewReader(jsonBody))
	if err != nil {
		return err
	}
	req.Header.Add("Accept", `application/json`)
	req.Header.Add("Content-Type", `application/json`)

	return do(c, req, outname)
}

func doFromBboxes(c *http.Client, outname string, encImage string, bboxes []BoundingBox) error {
	body := MaskFromBBoxRequest{
		Image:  encImage,
		Bboxes: bboxes,
	}
	jsonBody, err := json.Marshal(body)
	if err != nil {
		return err
	}

	req, err := http.NewRequest("POST", url+fromBboxes, bytes.NewReader(jsonBody))
	if err != nil {
		return err
	}
	req.Header.Add("Accept", `application/json`)
	req.Header.Add("Content-Type", `application/json`)

	return do(c, req, outname)
}

func doFromPoints(c *http.Client, outname string, encImage string, points []Point) error {
	body := MaskFromPointsRequest{
		Image:  encImage,
		Points: points,
	}
	jsonBody, err := json.Marshal(body)
	if err != nil {
		return err
	}

	req, err := http.NewRequest("POST", url+fromPoints, bytes.NewReader(jsonBody))
	if err != nil {
		return err
	}
	req.Header.Add("Accept", `application/json`)
	req.Header.Add("Content-Type", `application/json`)

	return do(c, req, outname)
}
BIN example/truck.jpg Normal file
Binary file not shown. (265 KiB)
313 imagesegmentation.py Normal file
@@ -0,0 +1,313 @@
import numpy as np
import supervision as sv
import cv2
import PIL
from scipy import ndimage

import torch
from torchvision.ops import box_convert

from sam2.build_sam import build_sam2
from sam2.sam2_image_predictor import SAM2ImagePredictor

from grounding_dino.groundingdino.util.inference import load_model, predict
import grounding_dino.groundingdino.datasets.transforms as T


class ImageSegmentation:
    def __init__(self):
        # select the device for computation
        if torch.cuda.is_available():
            self.device = torch.device("cuda")
        else:
            self.device = torch.device("cpu")
        print(f"using device: {self.device}")

        if self.device.type == "cuda":
            # NOTE: somehow this didn't work locally inside a docker container
            # use bfloat16 for the entire run
            # original:
            # torch.autocast("cuda", dtype=torch.bfloat16).__enter__()
            # might work without, or with this:
            # torch.autocast("cuda", dtype=torch.float16).__enter__()
            # turn on tf32 for Ampere GPUs (https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices)
            if torch.cuda.get_device_properties(0).major >= 8:
                torch.backends.cuda.matmul.allow_tf32 = True
                torch.backends.cudnn.allow_tf32 = True

        sam2_checkpoint = "checkpoints/sam2.1_hiera_large.pt"
        model_cfg = "configs/sam2.1/sam2.1_hiera_l.yaml"

        self.sam2_model = build_sam2(model_cfg, sam2_checkpoint, device=self.device)
        self.sam2_predictor = SAM2ImagePredictor(self.sam2_model)

        grounding_dino_config = (
            "grounding_dino/groundingdino/config/GroundingDINO_SwinT_OGC.py"
        )
        grounding_dino_checkpoint = "gdino_checkpoints/groundingdino_swint_ogc.pth"
        self.box_threshold = 0.35
        self.text_threshold = 0.25

        self.grounding_model = load_model(
            model_config_path=grounding_dino_config,
            model_checkpoint_path=grounding_dino_checkpoint,
            device=self.device,
        )

    def segment_image_from_text(self, pil_image: PIL.Image.Image, text: str):
        """Generate segmentation masks from an image and a text description using Grounding DINO + SAM2.

        Args:
            pil_image: PIL image that should be segmented
            text: object description(s) to be segmented

        Returns:
            Iterable of tuples (mask, score, bbox, dino_bbox, class_name, center_of_mass)
            with mask (HxW) and float score
            Result image
        """

        # image preparation taken from load_image() in Grounded-SAM-2/grounding_dino/groundingdino/util/inference.py
        pil_image = pil_image.convert("RGB")
        transform = T.Compose(
            [
                T.RandomResize([800], max_size=1333),
                T.ToTensor(),
                T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
            ]
        )
        image = np.asarray(pil_image)
        image_transformed, _ = transform(pil_image, None)

        # set the image for sam2
        self.sam2_predictor.set_image(image)

        # predict the bounding boxes
        boxes, confidences, labels = predict(
            model=self.grounding_model,
            image=image_transformed,
            caption=text,
            box_threshold=self.box_threshold,
            text_threshold=self.text_threshold,
            device=self.device,
        )

        if boxes is None or len(boxes) < 1:
            return [], image

        # process the box prompt for SAM 2
        h, w, _ = image.shape
        boxes = boxes * torch.Tensor([w, h, w, h])
        input_boxes = box_convert(boxes=boxes, in_fmt="cxcywh", out_fmt="xyxy").numpy()

        # NOTE: somehow this didn't work locally inside a docker container
        # torch.autocast(device_type=self.device, dtype=torch.bfloat16).__enter__()
        # if torch.cuda.is_available() and torch.cuda.get_device_properties(0).major >= 8:
        #     # turn on tf32 for Ampere GPUs (https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices)
        #     torch.backends.cuda.matmul.allow_tf32 = True
        #     torch.backends.cudnn.allow_tf32 = True

        masks, scores, logits = self.sam2_predictor.predict(
            point_coords=None,
            point_labels=None,
            box=input_boxes,
            multimask_output=False,
        )

        # convert the shape to (n, H, W)
        if masks.ndim == 4:
            masks = masks.squeeze(1)

        confidences = confidences.numpy().tolist()
        class_names = labels

        class_ids = np.array(list(range(len(class_names))))

        labels = [
            f"{class_name} {confidence:.2f}"
            for class_name, confidence in zip(class_names, confidences)
        ]

        return zip(
            masks,
            scores,
            ImageSegmentation._bboxes_from_masks(masks),
            input_boxes,
            class_names,
            ImageSegmentation._centers_of_mass_from_masks(masks),
        ), ImageSegmentation._create_result_image(
            image, masks, bboxes=input_boxes, labels=labels, class_ids=class_ids
        )

    def segment_image_from_bbox(self, pil_image: PIL.Image.Image, bboxes: np.ndarray):
        """Generate segmentation masks from an image and bounding box coordinates using SAM2.

        Args:
            pil_image: PIL image that should be segmented
            bboxes: Nx4 array of bounding boxes of objects to be segmented (x1, y1, x2, y2)

        Returns:
            Iterable of tuples (mask, score, bbox) with mask (HxW) and float score
            Result image
        """
        image = np.asarray(pil_image.convert("RGB"))
        self.sam2_predictor.set_image(image)

        masks, scores, logits = self.sam2_predictor.predict(
            point_coords=None,
            point_labels=None,
            box=np.array(bboxes),
            multimask_output=False,
        )

        # convert the shape to (n, H, W)
        if masks.ndim == 4:
            masks = masks.squeeze(1)

        return zip(
            masks, scores, ImageSegmentation._bboxes_from_masks(masks)
        ), ImageSegmentation._create_result_image(image, masks, bboxes=bboxes)

    def segment_image_from_points(self, pil_image: PIL.Image.Image, points: np.ndarray):
        """Generate segmentation masks from an image and point coordinates with include/exclude labels using SAM2.

        Args:
            pil_image: PIL image that should be segmented
            points: Nx3 array of points with include/exclude flags of the objects to be segmented (x, y, include)

        Returns:
            Iterable of tuples (mask, score, bbox) with mask (HxW) and float score
            Result image
        """
        image = np.asarray(pil_image.convert("RGB"))
        self.sam2_predictor.set_image(image)

        # convert the points to coordinate and label arrays
        coords = np.array([[point[0], point[1]] for point in points])
        labels = np.array([1 if point[2] else 0 for point in points])

        masks, scores, logits = self.sam2_predictor.predict(
            point_coords=coords,
            point_labels=labels,
            multimask_output=False,
        )

        # convert the shape to (n, H, W)
        if masks.ndim == 4:
            masks = masks.squeeze(1)

        return zip(
            masks, scores, ImageSegmentation._bboxes_from_masks(masks)
        ), ImageSegmentation._create_result_image(image, masks, points=points)

    @staticmethod
    def _create_result_image(
        image: np.ndarray,
        masks: np.ndarray,
        bboxes: np.ndarray = [],
        labels: np.ndarray = [],
        class_ids: np.ndarray = None,
        points: np.ndarray = [],
    ):
        """Create an annotated result image with masks, bounding boxes, labels, and points overlaid.

        Args:
            image: RGB image (HxWx3 array) that was segmented
            masks: NxHxW array of object mask(s)
            bboxes: (optional) Nx4 array of object bounding box(es) (x1, y1, x2, y2)
            labels: (optional) Nx1 array of object label(s)
            class_ids: (optional) Nx1 array of object class id(s)
            points: (optional) Nx3 array of object point(s) with include/exclude flags (x, y, include)

        Returns:
            Annotated result image
        """
        img = np.array(image)

        # we have to define the bboxes in the detections even though we might not show them
        detection_bboxes = bboxes
        if bboxes is None or len(bboxes) == 0:
            detection_bboxes = ImageSegmentation._bboxes_from_masks(masks)

        detections = sv.Detections(
            xyxy=detection_bboxes,  # (n, 4)
            mask=masks.astype(bool),  # (n, h, w)
            class_id=class_ids,
        )
        annotated_frame = img.copy()

        # if there are no class ids (i.e. when running without Grounding DINO) we need to
        # derive the color lookup from the detection index
        colorlookup = sv.ColorLookup.INDEX
        if class_ids is not None and len(class_ids) > 0:
            colorlookup = sv.ColorLookup.CLASS

        # points
        if points is not None:
            for x, y, include in points:
                # green for include (True), red for exclude (False)
                color = (0, 255, 0) if include else (0, 0, 255)  # BGR format
                cv2.circle(annotated_frame, (int(x), int(y)), 8, (0, 0, 0), -1)  # outer ring
                cv2.circle(annotated_frame, (int(x), int(y)), 5, color, -1)  # filled circle

        # bboxes
        if len(bboxes) > 0:
            box_annotator = sv.BoxAnnotator(color_lookup=colorlookup)
            annotated_frame = box_annotator.annotate(
                scene=annotated_frame, detections=detections
            )

        # labels
        if labels is not None and len(labels) > 0:
            label_annotator = sv.LabelAnnotator(color_lookup=colorlookup)
            annotated_frame = label_annotator.annotate(
                scene=annotated_frame, detections=detections, labels=labels
            )

        # masks
        mask_annotator = sv.MaskAnnotator(color_lookup=colorlookup)
        annotated_frame = mask_annotator.annotate(
            scene=annotated_frame, detections=detections
        )

        return annotated_frame

    @staticmethod
    def _bboxes_from_masks(masks: np.ndarray):
        """Create bounding boxes for the provided masks

        Args:
            masks: NxHxW array of object mask(s)

        Returns:
            Nx4 array of mask bounding boxes (x1, y1, x2, y2)
        """

        bboxes = []
        for mask in masks:
            ys, xs = np.nonzero(mask)
            if ys.size > 0 and xs.size > 0:
                bboxes.append(
                    [
                        int(np.min(xs)),
                        int(np.min(ys)),
                        int(np.max(xs)),
                        int(np.max(ys)),
                    ]
                )
            else:
                # empty mask: fall back to a degenerate box
                bboxes.append([0, 0, 0, 0])

        return np.array(bboxes)

    @staticmethod
    def _centers_of_mass_from_masks(masks: np.ndarray):
        """Calculate the centers of mass for the provided masks

        Args:
            masks: NxHxW array of object mask(s)

        Returns:
            Nx2 array of mask centers of mass (x, y)
        """

        return np.array(
            [[x, y] for mask in masks for y, x in [ndimage.center_of_mass(mask)]]
        )
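For experiments outside the server, the class can also be driven directly. A rough sketch, assuming it runs from the Grounded-SAM-2 working directory (so the checkpoint paths in `__init__` resolve) and that a `truck.jpg` test image exists; the unpacking mirrors what `segment_image_from_bbox` returns:

```
# Sketch: use ImageSegmentation directly, without the FastAPI server.
import numpy as np
from PIL import Image

from imagesegmentation import ImageSegmentation

seg = ImageSegmentation()  # loads the SAM2 + Grounding DINO checkpoints

img = Image.open("truck.jpg")
bboxes = np.array([[75, 275, 1725, 850]])  # x1, y1, x2, y2

results, annotated = seg.segment_image_from_bbox(img, bboxes)
for mask, score, bbox in results:
    print(f"score={float(score):.3f} bbox={bbox.tolist()} mask shape={mask.shape}")

Image.fromarray(annotated).save("annotated.jpg")
```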
7 requirements.txt Normal file
@@ -0,0 +1,7 @@
fastapi==0.104.1
uvicorn[standard]==0.24.0
pydantic==2.5.0
pillow==10.1.0
numpy==1.24.3
python-multipart==0.0.6
opencv-python==4.8.1.78