# Multiplying matrices in CUDA using shared memory tiled technique

In the below homework code I need to find out what more boundary conditions are required for the tiled matrix multiplication to work. Please help me out I have tried for a week to find out what the problem is ?

```#include    <wb.h>

#define TILE_WIDTH 16

__global__ void matrixMultiplyShared(float * A, float * B, float * C,

int numARows, int numAColumns,
int numBRows, int numBColumns,
int numCRows, int numCColumns) {

//@@ Insert code to implement matrix multiplication here
//@@ You have to use shared memory for this MP
__shared__ float s_A[TILE_WIDTH][TILE_WIDTH];
__shared__ float s_B[TILE_WIDTH][TILE_WIDTH];

int bx= blockIdx.x ; int by = blockIdx.y ;

int Row = by*TILE_WIDTH + ty;
int Col = bx*TILE_WIDTH + tx;

if((Row < numARows  ) && (Col < numBColumns )) {

float Pvalue =0.0;
for (int m = 0; m < (numAColumns-1)/TILE_WIDTH+1; ++m) {

if((Row < numARows) && ( (m*TILE_WIDTH+tx) < numAColumns)) {

s_A[ty][tx] = A[Row*numAColumns +( m*TILE_WIDTH+tx)];
}
else
{
s_A[ty][tx] = 0.0;
}
if(((m*TILE_WIDTH+ty) < numBRows) && (Col < numBColumns)) {

s_B[ty][tx] = B[(m*TILE_WIDTH+ty)*numBColumns+Col];
}
else
{
s_B[ty][tx] = 0.0;
}

if((Row < numARows  ) && (Col < numBColumns )) {
for (int k = 0; k < TILE_WIDTH; ++k)
{

Pvalue += s_A[ty][k] * s_B[k][tx];

}

}
}
if((Row < numARows  ) && (Col < numBColumns )) {
C[Row*numCColumns+Col] = Pvalue;
}
}
else
return;
}
```

The problem is in the condition for entering into the loop for loading the TILE, that is, if((Row < numARows ) && (Col < numBColumns )), and also next time again checking the same condition while doing the actual computation for the resulting element for each loaded TILE, only the last condition, that is while writing to the global is enough. you can find a detailed implementation for reference here