fastsam3d部署

摘要：本文提出将原始SAM3D中的SAM模型替换为FastSAM模型，构建FastSAM3D系统，并通过ONNXRuntime或TensorRT框架部署以提升推理速度。代码实现了基于FastSAM的图像分割、点云处理和3D重建流程，包括ONNX模型推理、深度图与彩色图对齐、点云坐标转换及分组处理等功能。该系统通过优化后的模型架构和部署方案，显著提高了3D场景理解任务的执行效率。

给算法爸爸上香

362人浏览 · 2026-01-28 09:08:40

给算法爸爸上香 · 2026-01-28 09:08:40 发布

替换原始sam3d的sam为fastsam即为fastsam3d，通过onnxruntime或tensorrt框架部署后，推理速度能够大幅提升。

python

import os
import cv2
import numpy as np
import open3d as o3d
import torch
import json
import pointops
import argparse
import time
from os.path import join
from util import *

import onnxruntime
from fastsam import *
onnx_session = onnxruntime.InferenceSession("FastSAM-s.onnx", providers=['CUDAExecutionProvider'])


def get_seg(image):
    input = letterbox(image, input_shape)
    input = input[:, :, ::-1].transpose(2, 0, 1)[np.newaxis, :]  
    input = input / 255.0

    inputs = {}
    inputs['images'] = input.astype(np.float32)
    outputs = onnx_session.run(None, inputs)

    output = np.squeeze(outputs[0]).astype(dtype=np.float32)      
    mask = output[..., 4] > score_threshold 
    scores = output[mask, 4]
    output[..., :4] = xywh2xyxy(output[..., :4])
    box = output[mask, :4]
    boxes = np.concatenate([box, scores[..., np.newaxis]], axis=1)
    preds = output[mask, :]

    indices = nms(boxes, scores, score_threshold, nms_threshold)
    boxes = boxes[indices] 

    masks_in = np.array(preds)[indices][..., -32:]
    proto = np.squeeze(outputs[1]).astype(dtype=np.float32)
    c, mh, mw = proto.shape 
    masks = (masks_in @ proto.reshape(c, -1)).reshape(-1, mh, mw)    
    downsampled_bboxes = boxes.copy()
    downsampled_bboxes[:, 0] *= mw / input_shape[0]
    downsampled_bboxes[:, 2] *= mw / input_shape[0]
    downsampled_bboxes[:, 3] *= mh / input_shape[1]
    downsampled_bboxes[:, 1] *= mh / input_shape[1]
    masks = crop_mask(masks, downsampled_bboxes)
    resized_masks = []
    for mask in masks:
        mask = cv2.resize(mask, input_shape, cv2.INTER_LINEAR)
        mask = scale_mask(mask, input_shape, image.shape)
        resized_masks.append(mask)
    resized_masks = np.array(resized_masks)
    resized_masks = resized_masks > 0      

    masks = []
    for i in range(resized_masks.shape[0]):
        masks.append({"segmentation":resized_masks[i, :, :]})
    group_ids = np.full((image.shape[0], image.shape[1]), -1, dtype=int)
    group_counter = 0
    for i in range(len(masks)):
        group_ids[masks[i]["segmentation"]] = group_counter
        group_counter += 1
    return group_ids    #(480, 640)


def get_pcd(color_name, rgb_path, save_2dmask_path):
    intrinsic_path = join(rgb_path, 'intrinsics', 'intrinsic_depth.txt')
    depth_intrinsic = np.loadtxt(intrinsic_path)

    pose = join(rgb_path, 'pose', color_name[0:-4] + '.txt')
    depth = join(rgb_path, 'depth', color_name[0:-4] + '.png')
    color = join(rgb_path, 'color', color_name)

    depth_img = cv2.imread(depth, -1) # read 16bit grayscale image
    mask = (depth_img != 0)
    color_image = cv2.imread(color)
    color_image = cv2.resize(color_image, (640, 480))

    group_ids = get_seg(color_image)

    group_ids = group_ids[mask] #(480, 640)->276497

    color_image = np.reshape(color_image[mask], [-1,3])
    colors = np.zeros_like(color_image)
    colors[:,0] = color_image[:,2]
    colors[:,1] = color_image[:,1]
    colors[:,2] = color_image[:,0]

    pose = np.loadtxt(pose)
    x,y = np.meshgrid(np.linspace(0,depth_img.shape[1]-1,depth_img.shape[1]), np.linspace(0,depth_img.shape[0]-1,depth_img.shape[0]))
    uv_depth = np.zeros((depth_img.shape[0], depth_img.shape[1], 3))
    uv_depth[:,:,0] = x
    uv_depth[:,:,1] = y
    uv_depth[:,:,2] = depth_img / 1000.0
    uv_depth = np.reshape(uv_depth, [-1,3])
    uv_depth = uv_depth[np.where(uv_depth[:,2]!=0),:].squeeze()
    
    fx = depth_intrinsic[0,0]
    fy = depth_intrinsic[1,1]
    cx = depth_intrinsic[0,2]
    cy = depth_intrinsic[1,2]
    bx = depth_intrinsic[0,3]
    by = depth_intrinsic[1,3]
    n = uv_depth.shape[0]
    points = np.ones((n,4))
    points[:,0] = (uv_depth[:,0]-cx)*uv_depth[:,2]/fx + bx
    points[:,1] = (uv_depth[:,1]-cy)*uv_depth[:,2]/fy + by
    points[:,2] = uv_depth[:,2]
    points_world = np.dot(points, np.transpose(pose))
    group_ids = num_to_natural(group_ids)
    save_dict = dict(coord=points_world[:,:3], color=colors, group=group_ids)
    return save_dict


def make_open3d_point_cloud(input_dict, th):
    input_dict["group"] = remove_small_group(input_dict["group"], th)
    xyz = input_dict["coord"]
    if np.isnan(xyz).any():
        return None
    pcd = o3d.geometry.PointCloud()
    pcd.points = o3d.utility.Vector3dVector(xyz)
    return pcd


def cal_group(input_dict, new_input_dict, match_inds, ratio=0.5):
    group_0 = input_dict["group"]
    group_1 = new_input_dict["group"]
    group_1[group_1 != -1] += group_0.max() + 1
    
    unique_groups, group_0_counts = np.unique(group_0, return_counts=True)
    group_0_counts = dict(zip(unique_groups, group_0_counts))
    unique_groups, group_1_counts = np.unique(group_1, return_counts=True)
    group_1_counts = dict(zip(unique_groups, group_1_counts))

    # Calculate the group number correspondence of overlapping points
    group_overlap = {}
    for i, j in match_inds:
        group_i = group_1[i]
        group_j = group_0[j]
        if group_i == -1:
            group_1[i] = group_0[j]
            continue
        if group_j == -1:
            continue
        if group_i not in group_overlap:
            group_overlap[group_i] = {}
        if group_j not in group_overlap[group_i]:
            group_overlap[group_i][group_j] = 0
        group_overlap[group_i][group_j] += 1

    # Update group information for point cloud 1
    for group_i, overlap_count in group_overlap.items():
        max_index = np.argmax(np.array(list(overlap_count.values())))
        group_j = list(overlap_count.keys())[max_index]
        count = list(overlap_count.values())[max_index]
        total_count = min(group_0_counts[group_j], group_1_counts[group_i]).astype(np.float32)
        if count / total_count >= ratio:
            group_1[group_1 == group_i] = group_j
    return group_1


def cal_2_scenes(pcd_list, index, voxel_size, voxelize, th=50):
    if len(index) == 1:
        return(pcd_list[index[0]])
    input_dict_0 = pcd_list[index[0]]
    input_dict_1 = pcd_list[index[1]]
    pcd0 = make_open3d_point_cloud(input_dict_0, th)
    pcd1 = make_open3d_point_cloud(input_dict_1, th)

    # Cal Dul-overlap
    match_inds = get_matching_indices(pcd1, pcd0, 1.5 * voxel_size, 1)
    pcd1_new_group = cal_group(input_dict_0, input_dict_1, match_inds)

    match_inds = get_matching_indices(pcd0, pcd1, 1.5 * voxel_size, 1)
    input_dict_1["group"] = pcd1_new_group
    pcd0_new_group = cal_group(input_dict_1, input_dict_0, match_inds)

    pcd_new_group = np.concatenate((pcd0_new_group, pcd1_new_group), axis=0)
    pcd_new_group = num_to_natural(pcd_new_group)
    pcd_new_coord = np.concatenate((input_dict_0["coord"], input_dict_1["coord"]), axis=0)
    pcd_new_color = np.concatenate((input_dict_0["color"], input_dict_1["color"]), axis=0)
    pcd_dict = dict(coord=pcd_new_coord, color=pcd_new_color, group=pcd_new_group)

    pcd_dict = voxelize(pcd_dict)
    return pcd_dict


def seg_pcd(scene_name, rgb_path, data_path, voxel_size, voxelize, th, save_2dmask_path):
    pcd_list = []
    color_names = sorted(os.listdir(join(scene_name, 'color')), key=lambda a: int(os.path.basename(a).split('.')[0]))
    for color_name in color_names:
        start = time.time()
        pcd_dict = get_pcd(color_name, rgb_path, save_2dmask_path)
        if len(pcd_dict["coord"]) == 0:
            continue
        pcd_dict = voxelize(pcd_dict)
        pcd_list.append(pcd_dict)
        print(color_name, time.time()-start, flush=True)
    
    while len(pcd_list) !=1:
        print(len(pcd_list), flush=True)
        new_pcd_list = []
        for indice in pairwise_indices(len(pcd_list)):
            pcd_frame = cal_2_scenes(pcd_list, indice, voxel_size=voxel_size, voxelize=voxelize)
            if pcd_frame is not None:
                new_pcd_list.append(pcd_frame)
        pcd_list = new_pcd_list
    seg_dict = pcd_list[0]
    seg_dict["group"] = num_to_natural(remove_small_group(seg_dict["group"], th))

    data_dict = torch.load(data_path)
    # np.savetxt("coord.txt", data_dict["coord"], fmt="%.6f")
    scene_coord = torch.tensor(data_dict["coord"]).cuda().contiguous()
    new_offset = torch.tensor(scene_coord.shape[0]).cuda()
    gen_coord = torch.tensor(seg_dict["coord"]).cuda().contiguous().float()
    offset = torch.tensor(gen_coord.shape[0]).cuda()
    gen_group = seg_dict["group"]
    indices, dis = pointops.knn_query(1, gen_coord, offset, scene_coord, new_offset)
    indices = indices.cpu().numpy()
    group = gen_group[indices.reshape(-1)].astype(np.int16)
    mask_dis = dis.reshape(-1).cpu().numpy() > 0.6
    group[mask_dis] = -1
    np.savetxt(os.path.basename(scene_name) + ".txt", num_to_natural(group), fmt="%d")


def pcd_ensemble(org_path, new_path, data_path, vis_path):
    new_pcd = np.loadtxt(new_path).astype(np.int16)
    new_pcd = num_to_natural(remove_small_group(new_pcd, 20))
    with open(org_path) as f:
        segments = json.load(f)
        org_pcd = np.array(segments['segIndices'])
    match_inds = [(i, i) for i in range(len(new_pcd))]
    new_group = cal_group(dict(group=new_pcd), dict(group=org_pcd), match_inds)
    print(new_group.shape)
    data = torch.load(data_path)
    visualize_partition(data["coord"], new_group, vis_path)


def get_args():
    '''Command line arguments.'''
    parser = argparse.ArgumentParser(description='Segment Anything on ScanNet.')
    parser.add_argument('--rgb_path', type=str, default='/data1/tfy/scannet_images/scene0000_00', help='the path of rgb data')
    parser.add_argument('--data_path', type=str, default='/data1/tfy/scannet_processed/train/scene0000_00.pth', help='the path of pointcload data')
    parser.add_argument('--save_2dmask_path', type=str, default='./', help='Where to save 2D segmentation result from SAM')
    parser.add_argument('--sam_checkpoint_path', type=str, default='/data1/tfy/scannet/sam_vit_h_4b8939.pth', help='the path of checkpoint for SAM')
    parser.add_argument('--img_size', default=[640, 480])
    parser.add_argument('--voxel_size', default=0.05)
    parser.add_argument('--th', default=50, help='threshold of ignoring small groups to avoid noise pixel')
    args = parser.parse_args()
    return args


if __name__ == '__main__':
    args = get_args()
    print(args)
    voxelize = Voxelize(voxel_size=args.voxel_size, mode="train", keys=("coord", "color", "group"))
    seg_pcd(args.rgb_path, args.rgb_path, args.data_path, args.voxel_size, voxelize, args.th, args.save_2dmask_path)
     
    pcd_ensemble(org_path="/data1/tfy/scannet/scans/scene0000_00/scene0000_00_vh_clean_2.0.010000.segs.json", 
                new_path="build/scene0000_00.txt", 
                data_path="/data1/tfy/scannet_processed/train/scene0000_00.pth",
                vis_path="pred/cloud.ply")

cpp

#include "util.h"
#include "logger.h"
#include "fastsam.h"


int main(int argc, char** argv)
{
    TRTLogger logger;
	nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(logger);
    auto engine_data = load_file("FastSAM-s.engine");
	nvinfer1::ICudaEngine* engine = runtime->deserializeCudaEngine(engine_data.data(), engine_data.size());
	if (engine == nullptr)
	{
		printf("Deserialize cuda engine failed.\n");
		return -1;
	}

	nvinfer1::IExecutionContext* execution_context = engine->createExecutionContext();

    float* output0_h = nullptr;
    cudaMallocHost(&output0_h, sizeof(float) * OUTPUT0_SIZE);
    float* output1_h = nullptr;
    cudaMallocHost(&output1_h, sizeof(float) * OUTPUT1_SIZE);

    uint8_t* input_d = nullptr;
    cudaMalloc(&input_d, sizeof(uint8_t) * 3 * 640 * 480);
    float* input0_d = nullptr;
    cudaMalloc(&input0_d, sizeof(float) * INPUT_SIZE);
    float* output0_d = nullptr;
    cudaMalloc(&output0_d, sizeof(float) * OUTPUT0_SIZE);
    float* output1_d = nullptr;
    cudaMalloc(&output1_d, sizeof(float) * OUTPUT1_SIZE);

    cv::Mat depth_intrinsic = loadPoseFromTxt("/data1/tfy/scannet_images/scene0000_00/intrinsics/intrinsic_depth.txt");
    float fx = depth_intrinsic.at<float>(0, 0);
    float fy = depth_intrinsic.at<float>(1, 1);
    float cx = depth_intrinsic.at<float>(0, 2);
    float cy = depth_intrinsic.at<float>(1, 2);
    float bx = depth_intrinsic.at<float>(0, 3);
    float by = depth_intrinsic.at<float>(1, 3);

    std::vector<pcd_tuple> pcd_list(279);
    auto start = std::chrono::steady_clock::now();
    for(int k=0; k<pcd_list.size();k++)
    {
        //auto start = std::chrono::steady_clock::now();
        cv::Mat color_image = cv::imread("/data1/tfy/scannet_images/scene0000_00/color/" + std::to_string(k*20) + ".jpg");
        cv::Mat depth_image = cv::imread("/data1/tfy/scannet_images/scene0000_00/depth/" + std::to_string(k*20) + ".png", cv::IMREAD_UNCHANGED);
        cv::Mat pose = loadPoseFromTxt("/data1/tfy/scannet_images/scene0000_00/pose/" + std::to_string(k*20) + ".txt");

        cv::Mat mask;
        cv::compare(depth_image, 0, mask, cv::CMP_NE); 

        cv::Mat group_id(color_image.rows, color_image.cols, CV_32S, cv::Scalar(-1));
        cudaMemcpy(input_d, color_image.data, sizeof(uint8_t) * 3 * 640 * 480, cudaMemcpyHostToDevice);
        get_seg(color_image, group_id, execution_context, input_d, input0_d, output0_d, output1_d);

        std::vector<int> group_ids;
        int* group_ptr = (int*)group_id.data;
        uchar* ptr = mask.data;
        for (int y = 0; y < mask.rows; ++y)
         {
            for (int x = 0; x < mask.cols; ++x) 
            {
                int id = (mask.cols * y + x);
                if (int(ptr[id]) != 0)
                    group_ids.push_back(group_ptr[id]);
            }
        }
           
        std::vector<cv::Vec3b> colors;
        uchar* p = color_image.data;
        ptr = mask.data;
        for (int y = 0; y < mask.rows; ++y)
         {
            for (int x = 0; x < mask.cols; ++x) 
            {
                int id = (mask.cols * y + x);
                if (int(ptr[id]) != 0)
                { 
                    uchar b = p[id * 3];
                    uchar g = p[id * 3 + 1];
                    uchar r = p[id * 3 + 2];
                    colors.push_back(cv::Vec3b(r, g, b)); 
                }
            }
        }

        cv::Mat uv_depth(depth_image.rows, depth_image.cols, CV_32FC3);
        ushort *pu = (ushort*)depth_image.data;
        float *pf = (float*)uv_depth.data;
        for (int y = 0; y < depth_image.rows; ++y)
         {
            for (int x = 0; x < depth_image.cols; ++x) 
            {
                int id = depth_image.cols * y + x;
                pf[id * 3] = x;
                pf[id * 3 + 1] = y;
                pf[id * 3 + 2] = (float) pu[id] / 1000.0f;
            }
        }

        std::vector<cv::Vec3f> valid_uv_depth;
        pf = (float*)uv_depth.data;
        for (int y = 0; y < depth_image.rows; ++y)
         {
            for (int x = 0; x < depth_image.cols; ++x) 
            {
                int id = (depth_image.cols * y + x) * 3;
                float u = pf[id];
                float v = pf[id + 1];
                float d = pf[id + 2];
                if (d != 0) 
                    valid_uv_depth.push_back(cv::Vec3f(u, v, d));
            }
        }

        cv::Mat points = cv::Mat::ones(valid_uv_depth.size(), 4, CV_32F); 
        pf = (float*)points.data;
        for (int i = 0; i < valid_uv_depth.size(); ++i) 
        {
            float u = valid_uv_depth[i][0];
            float v = valid_uv_depth[i][1];
            float z = valid_uv_depth[i][2];
            
            pf[i * 4 + 0] = (u - cx) * z / fx + bx;
            pf[i * 4 + 1] = (v - cy) * z / fy + by;
            pf[i * 4 + 2] = z;
        }

        cv::Mat points_world = points * pose.t(); 
        cv::Mat coord = points_world.colRange(0, 3);
        
        std::vector<cv::Point3f> coords(coord.rows);
        for (int i = 0; i < coord.rows; ++i) 
        {
            const float* row = coord.ptr<float>(i);
            coords[i] = cv::Point3f(row[0], row[1], row[2]);
        }

        std::vector<int> group_ids_natural = num_to_natural(group_ids);

        pcd_tuple save_dict;
        save_dict.coord = coords;
        save_dict.color = colors;
        save_dict.group = group_ids_natural;     
        pcd_list[k] = voxelize(save_dict);

        // auto end = std::chrono::steady_clock::now();
        // std::chrono::duration<double> duration = std::chrono::duration_cast<std::chrono::duration<double>>(end - start);		
        // std::cout <<k <<" "<< duration.count() << "s" << std::endl;
    }
    auto end = std::chrono::steady_clock::now();
    std::chrono::duration<double> duration = std::chrono::duration_cast<std::chrono::duration<double>>(end - start);		
    std::cout <<" "<< duration.count() << "s" << std::endl;

    while(pcd_list.size() !=1)
    {
        std::cout << "pcd_list.size() = " << pcd_list.size() << std::endl;
        std::vector<pcd_tuple> new_pcd_list;
        for(auto indice : pairwise_indices(pcd_list.size()))
        {
            pcd_tuple pcd_frame = cal_2_scenes(pcd_list, indice, 0.05, 50);
            new_pcd_list.push_back(pcd_frame);
        }
        pcd_list = new_pcd_list;
    }

    pcd_tuple seg_dict = pcd_list[0];

    std::vector<cv::Point3f> pcd = seg_dict.coord;
    seg_dict.group = num_to_natural(remove_small_group(seg_dict.group, 50));

    auto scene_coord = readPointsFromFile("../coord.txt");;
    auto gen_coord = seg_dict.coord;
    auto gen_group = seg_dict.group;
    int *indices; float *dis;
    cudaMallocHost(&indices, scene_coord.size() * sizeof(int));
    cudaMallocHost(&dis, scene_coord.size() * sizeof(float));
    knn_query(scene_coord, gen_coord, indices, dis, 0.05f, 1);

    std::vector<int> group(scene_coord.size());
    for(int i=0; i<scene_coord.size(); i++) 
    {
         group[i] = gen_group[indices[i]];
        if(sqrt(dis[i]) > 0.6)
            group[i] = -1;
    }

    std::vector<int> group_natural = num_to_natural(group);
    std::ofstream file_out("scene0000_00.txt");
    for( int i = 0; i < group_natural.size(); i++)
        file_out << group_natural[i] << std::endl;

    cudaFree(input_d);
    cudaFree(input0_d);
    cudaFree(output0_d);
    cudaFree(output1_d);
    cudaFreeHost(indices);
    cudaFreeHost(dis);

	return 0;
}

分割效果图：
在这里插入图片描述

完整代码工程见：https://github.com/taifyang/fastsam3d

腾讯云开发者社区

腾讯云面向开发者汇聚海量精品云计算使用和开发经验，营造开放的云计算技术生态圈。

更多推荐

终极指南：Flink SQL连接器版本管理从混乱到有序的升级之路

Apache Flink作为流处理领域的佼佼者，其SQL连接器的版本管理一直是开发者面临的核心挑战。本文将系统讲解Flink SQL连接器版本管理的最佳实践，帮助你轻松应对版本兼容性问题，实现从混乱到有序的升级之旅。## 连接器版本管理的常见痛点 😫在Flink应用开发中，连接器版本管理常常让开发者头疼不已。不同版本的连接器可能导致各种兼容性问题，例如API变更、功能差异甚至运行时错误。

腾讯云开发者社区

Elasticsearch复杂数据类型终极指南：从入门到精通

Elasticsearch作为功能强大的搜索引擎，支持多种复杂数据类型，让开发者能够灵活处理各种结构化和非结构化数据。本文将带你全面了解Elasticsearch中的复杂数据类型，从基础概念到实际应用，助你轻松掌握数据建模的核心技巧。## 内部对象：构建层级化数据结构在Elasticsearch中，对象类型（Object）是最基础的复杂数据类型之一，用于表示具有嵌套关系的数据。例如，我们可

腾讯云开发者社区

如何快速搭建Neon无服务器PostgreSQL：面向初学者的完整指南

Neon是一款革命性的无服务器PostgreSQL解决方案，它通过分离存储和计算层，实现了自动扩缩容、类代码式数据库分支以及零级扩展能力。本指南将帮助你从零开始搭建Neon开发环境，体验这款创新数据库的强大功能。## 准备工作：环境要求与依赖项在开始搭建Neon环境前，请确保你的系统满足以下要求：- Linux操作系统（推荐Ubuntu 20.04+或Debian 11+）- Git