
GPU-ядро умножения матриц (продолжение)
As[ty][tx] = A[a + wA * ty + tx];
// Load the matrices from global memory to shared memory;
Bs[ty][tx] = B[b + wB * ty + tx];
// each thread loads one element of each matrix
__syncthreads();
// Synchronize to make sure the matrices are loaded
// Multiply the two matrices together;
// each thread computes one element
// of the block sub-matrix
//
for
(
int
k = 0; k < BLOCK_SIZE; ++k)
Csub += As[ty][k] * Bs[k][tx];
// Synchronize to make sure that the preceding
// computation is done before loading two new
// sub-matrices of A and B in the next iteration
__syncthreads();
}
}
// Write the block sub-matrix to global memory;
// each thread writes one element
int
c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
C[c + wB * ty + tx] = Csub;
C[c + wB * ty + tx] = Csub;
}