
测试 cuda gpu 一秒钟能launch 多少次 kernel
启动 100000 次带参数的空 kernel，总耗时约 124 ms。
编译:
nvcc vectorAdd.cu
源文件:
vectorAdd.cu
#include <stdio.h>

#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <iostream>
using namespace std;
// Launch-overhead probe kernel.
//
// Keeps a SAXPY-style signature (element count n, scalar a, input x,
// in/out y) so that each launch still passes arguments, but the whole
// body is commented out: no parameter is read and no memory is written.
// Timing repeated launches of this kernel therefore reflects launch
// cost rather than computation.
__global__
void saxpy(int n, float a, float *x, float *y)
{
/*
Disabled SAXPY body, kept for reference only (y = a*x + y per element):
int i = blockIdx.x*blockDim.x + threadIdx.x;
if (i < n)
y[i] = a*x[i] + y[i];
*/
}
// Evaluates a CUDA runtime call and terminates the process with the
// file, line and runtime error string if it did not return cudaSuccess.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/*
 * Benchmarks kernel-launch throughput.
 *
 * Launches the saxpy kernel 100000 times back to back on the default
 * stream, brackets the launches with two CUDA events, and prints the
 * elapsed time between them plus the average time per launch.
 *
 * Afterwards the device buffer y is copied back and compared against its
 * initial contents: the kernel body is disabled, so the data must come
 * back untouched.
 *
 * Returns 0 when the buffer is unchanged (within 1e-5), 1 otherwise or
 * when host allocation fails. Any failing CUDA call aborts the process.
 */
int main(void)
{
    const int N = 1 << 10;                  // 1024 elements per vector
    const unsigned kLaunches = 100000;      // number of timed launches
    const size_t bytes = N * sizeof(float);
    const float kInitialY = 2.0f;

    float *x = (float*)malloc(bytes);
    float *y = (float*)malloc(bytes);
    if (x == NULL || y == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        free(x);
        free(y);
        return 1;
    }

    float *d_x = NULL;
    float *d_y = NULL;
    CUDA_CHECK(cudaMalloc(&d_x, bytes));
    CUDA_CHECK(cudaMalloc(&d_y, bytes));

    for (int i = 0; i < N; i++) {
        x[i] = 1.0f;
        y[i] = kInitialY;
    }
    CUDA_CHECK(cudaMemcpy(d_x, x, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_y, y, bytes, cudaMemcpyHostToDevice));

    // One thread per element, rounded up to whole blocks of 256 threads.
    dim3 grid_;
    dim3 block_;
    block_.x = 256;
    grid_.x = (N + block_.x - 1) / block_.x;

    cudaEvent_t start, end;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&end));

    CUDA_CHECK(cudaEventRecord(start));
    // Queue the launches with no error query inside the loop, so the
    // timed region contains nothing but the launches themselves.
    for (unsigned i = 0; i < kLaunches; i++)
        saxpy<<<grid_, block_, 0, NULL>>>(N, 2.0f, d_x, d_y);
    CUDA_CHECK(cudaEventRecord(end));
    // Launches do not return a status; query it once, after the end event
    // has been recorded.
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaEventSynchronize(end));

    float time_ms = 0.0f;
    CUDA_CHECK(cudaEventElapsedTime(&time_ms, start, end));
    std::cout << "CUDA Kernel time: " << time_ms << " ms" << std::endl;
    printf("Average per launch: %f us\n", time_ms * 1000.0f / kLaunches);
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(end));

    CUDA_CHECK(cudaMemcpy(y, d_y, bytes, cudaMemcpyDeviceToHost));

    // The kernel is a no-op, so y must still hold its initial value.
    // If the kernel body is ever re-enabled this expectation must change,
    // because every launch would then add a*x[i] to y[i].
    const float expected = kInitialY;
    float maxError = 0.0f;
    for (int i = 0; i < N; i++)
        maxError = std::max(maxError, std::fabs(y[i] - expected));
    printf("Max error: %f\n", maxError);

    int result = 0;
    if (maxError < 1e-5f) {
        printf("SUCCESS\n");
    }
    else {
        result = 1;
        printf("FAILURE\n");
    }

    CUDA_CHECK(cudaFree(d_x));
    CUDA_CHECK(cudaFree(d_y));
    free(x);
    free(y);
    return result;
}
笔记本 RTX 3060 显卡上的结果:
124 ms / 100000 = 0.00124 ms = 1.24 µs / launch，即每秒约可 launch 80.6 万次 kernel。
更多推荐
所有评论(0)