#include <cstdio>
// Хост-функция умножения матриц
// Edge length, in threads, of the square thread block used to launch Muld.
#define BLOCK_SIZE 16

// Forward declaration of the device kernel launched by Mul().
// Argument order as used at the launch site: (A, B, wA, wB, C), where A, B
// and C are device pointers and wA / wB are the widths of A and B.
__global__ void Muld(float *, float *, int, int, float *);
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool mulCudaOk(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "Mul: %s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Host-side matrix multiplication driver: computes C = A * B on the device.
//
//   A   host buffer of hA * wA floats (input)
//   B   host buffer of wA * wB floats (input)
//   hA  height of A (and of C)
//   wA  width of A (and height of B)
//   wB  width of B (and of C)
//   C   host buffer of hA * wB floats (output)
//
// Preconditions: hA, wA and wB must be positive multiples of BLOCK_SIZE,
// because the launch grid is computed by exact division and the kernel is
// launched with BLOCK_SIZE x BLOCK_SIZE thread blocks.
//
// On a precondition violation or any CUDA runtime failure, a diagnostic is
// written to stderr, all device buffers are released, and the function
// returns; in that case the contents of C are unspecified.
void Mul(const float *A, const float *B, int hA, int wA, int wB, float *C) {
    // Reject sizes the launch configuration cannot cover: integer division
    // below would otherwise silently drop the trailing rows/columns.
    if (hA <= 0 || wA <= 0 || wB <= 0 ||
        hA % BLOCK_SIZE != 0 || wA % BLOCK_SIZE != 0 || wB % BLOCK_SIZE != 0) {
        fprintf(stderr,
                "Mul: dimensions (hA=%d, wA=%d, wB=%d) must be positive "
                "multiples of %d\n",
                hA, wA, wB, BLOCK_SIZE);
        return;
    }

    // Byte counts are computed in size_t so large matrices do not overflow int.
    const size_t bytesA = (size_t)hA * (size_t)wA * sizeof(float);
    const size_t bytesB = (size_t)wA * (size_t)wB * sizeof(float);
    const size_t bytesC = (size_t)hA * (size_t)wB * sizeof(float);

    // Initialised to NULL so the unconditional cudaFree calls at the end are
    // safe no matter where a failure occurred.
    float *Ad = NULL;
    float *Bd = NULL;
    float *Cd = NULL;

    // Load A and B to the device, then allocate C on the device.
    // Short-circuit evaluation stops at the first failing call.
    bool ok =
        mulCudaOk(cudaMalloc((void **)&Ad, bytesA), "cudaMalloc(A)") &&
        mulCudaOk(cudaMemcpy(Ad, A, bytesA, cudaMemcpyHostToDevice),
                  "cudaMemcpy(A, host to device)") &&
        mulCudaOk(cudaMalloc((void **)&Bd, bytesB), "cudaMalloc(B)") &&
        mulCudaOk(cudaMemcpy(Bd, B, bytesB, cudaMemcpyHostToDevice),
                  "cudaMemcpy(B, host to device)") &&
        mulCudaOk(cudaMalloc((void **)&Cd, bytesC), "cudaMalloc(C)");

    if (ok) {
        // Execution configuration: one thread per element of C, grouped in
        // BLOCK_SIZE x BLOCK_SIZE blocks (exact thanks to the check above).
        dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
        dim3 dimGrid(wB / dimBlock.x, hA / dimBlock.y);

        // Launch the device computation.
        Muld<<<dimGrid, dimBlock>>>(Ad, Bd, wA, wB, Cd);

        // A kernel launch returns no status; configuration errors are only
        // visible through cudaGetLastError().
        ok = mulCudaOk(cudaGetLastError(), "Muld kernel launch");
    }

    if (ok) {
        // Read C from the device. This blocking copy also surfaces errors
        // raised while the kernel was executing.
        mulCudaOk(cudaMemcpy(C, Cd, bytesC, cudaMemcpyDeviceToHost),
                  "cudaMemcpy(C, device to host)");
    }

    // Free device memory (cudaFree on a null pointer performs no operation).
    cudaFree(Ad);
    cudaFree(Bd);
    cudaFree(Cd);
}