CUDA中的cudaMemcpy2D和cudaMallocPitch使用详解
#include <stdio.h> #include <stdlib.h> #include <cuda_runtime.h> #define N 3 //类似数组的行 #define M 5 //类似数组的列 #define GridSize 16 #define BlockSize 16 #include <iostream> using namespace std; __global__ void kernel(float * d_matrix, size_t pitch) …
·
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#define N 3 //类似数组的行
#define M 5 //类似数组的列
#define GridSize 16
#define BlockSize 16
#include<iostream>
using namespace std;
/*
 * Fills an N x M pitched float matrix with per-thread running counters.
 *
 * d_matrix : base address of the matrix; consecutive rows are `pitch` bytes apart.
 * pitch    : row stride in BYTES (not elements), hence the char* arithmetic below.
 *
 * Rows are walked with a grid-stride loop over the y dimension and columns with
 * a grid-stride loop over the x dimension, so any launch shape covers the matrix.
 * Each thread owns a private counter that starts at 1 and advances after every
 * element that thread writes; the stored value is therefore "1 + number of
 * elements this thread has already written", not a global element index.
 */
__global__ void kernel(float * d_matrix, size_t pitch) {
    // Per-thread starting coordinates and strides (loop-invariant, so hoisted).
    const int firstRow = blockIdx.y * blockDim.y + threadIdx.y;
    const int firstCol = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int rowStep = blockDim.y * gridDim.y;
    const unsigned int colStep = blockDim.x * gridDim.x;

    int nextValue = 1;  // private to this thread

    for (int row = firstRow; row < N; row += rowStep) {
        // Step to the start of this row in bytes, then view it as floats.
        char* rowBytes = reinterpret_cast<char*>(d_matrix) + row * pitch;
        float* rowData = reinterpret_cast<float*>(rowBytes);

        int col = firstCol;
        while (col < M) {
            rowData[col] = nextValue++;
            col += colStep;
        }
    }
}
/* Prints a diagnostic and terminates the process if a CUDA runtime call failed. */
static void checkCuda(cudaError_t status, const char* what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Demonstrates cudaMallocPitch / cudaMemcpy2D on an N x M float matrix:
 *   1. allocate a pitched device matrix and a densely packed host matrix,
 *   2. fill the host matrix with 0..M*N-1, print it, and copy it to the device,
 *   3. run `kernel` to overwrite the device matrix,
 *   4. copy the result back and print it.
 *
 * Returns 0 on success; exits with EXIT_FAILURE if any CUDA call fails.
 */
int main()
{
    // Width of one logical row in bytes. The host buffer is tightly packed, so
    // this is also its pitch; the device pitch may be larger.
    const size_t rowBytes = M * sizeof(float);

    float *d_matrix = NULL;
    size_t pitch = 0;  // device row stride in bytes, set by cudaMallocPitch
    float *dc_matrix = new float[M*N];

    checkCuda(cudaMallocPitch(&d_matrix, &pitch, rowBytes, N), "cudaMallocPitch");

    for (int i = 0; i < M*N; i++)
        dc_matrix[i] = i;
    for (int i = 0; i < M*N; i++)
        printf("%.2f ", dc_matrix[i]);
    printf("\n");

    // Host -> device: source pitch is rowBytes (packed), destination pitch is `pitch`.
    checkCuda(cudaMemcpy2D(d_matrix, pitch, dc_matrix, rowBytes, rowBytes, N,
                           cudaMemcpyHostToDevice),
              "cudaMemcpy2D (host to device)");

    kernel<<<GridSize, BlockSize>>>(d_matrix, pitch);
    // A launch does not return a status itself; fetch launch-configuration errors here.
    checkCuda(cudaGetLastError(), "kernel launch");

    // Device -> host: errors raised while the kernel executed are reported by this call.
    checkCuda(cudaMemcpy2D(dc_matrix, rowBytes, d_matrix, pitch, rowBytes, N,
                           cudaMemcpyDeviceToHost),
              "cudaMemcpy2D (device to host)");

    for (int i = 0; i < M*N; i++)
        printf("%.2f ", dc_matrix[i]);

    checkCuda(cudaFree(d_matrix), "cudaFree");
    // The buffer came from new[], so it must be released with delete[]
    // (releasing it with free() is undefined behaviour).
    delete[] dc_matrix;
    return 0;
}
欢迎来到FlagOS开发社区,这里是一个汇聚了AI开发者、数据科学家、机器学习爱好者以及业界专家的活力平台。我们致力于成为业内领先的Triton技术交流与应用分享的殿堂,为推动人工智能技术的普及与深化应用贡献力量。
更多推荐
所有评论(0)