how to replace a row in matrix by another according to a condition in CUDA

I have two matrices A and B both of size (3*1024). i wrote a small program in CUDAC to replace a row in Matrix A by the corresponding row in Matix B according to another two vectors val1 and val2, as example:

val1: [10 20 30]
val2: [7 17 6]
Matrix A:{1000 …1000,1000 …1000,1000 …1000}
Matrix B:{500 …500,500 …500,500 …500}

and i want the result for matrix A to be:

Matrix A:{500 …500,1000 …1000,500 …500}

this is the code:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

__global__ void Replacement2(float * oldval, float * newval, const int D, float *  A, const float *  B) {

	int row = blockIdx.y * blockDim.y + threadIdx.y;
	int col = blockIdx.x * blockDim.x + threadIdx.x;
	int index = col + row * D;
	
	if (newval[row] < oldval[row]) {
		A[index] = B[index];
		oldval[row] = newval[row];
		__syncthreads();
	}
	
}
int main()
{
	
	unsigned int size_old = 3 * 1024;
	unsigned int mem_size_old = sizeof(float)* size_old;

	unsigned int size_new = 3 * 1024;
	unsigned int mem_size_new = sizeof(float)* size_new;

	unsigned int sobj = sizeof(float)* 3;

	unsigned int snobj = sizeof(float)* 3;
	
	const float h_old_data[] = {1000 .......1000,1000 .......1000,1000 .......1000};
        float* h_old = (float*)memcpy(new float[3*1024], h_old_data, sizeof(h_old_data));
        const float h_new_data[] = {500 .......500,500 .......500,500 .......500};
float* h_new = (float*)memcpy(new float[3*1024], h_new_data, sizeof(h_new_data));

	const float h_val[] = { 10, 20, 30};
	float* OLDVAL = (float*)memcpy(new float[3], h_val, sizeof(h_val));

	const float h_newval[] = { 7.0, 70.0, 7.0};
	float* NEWVAL = (float*)memcpy(new float[3], h_newval, sizeof(h_newval));

	float* d_oldval;
	cudaMalloc((void**)&d_oldval, sizeof(sobj));
	cudaMemcpy(d_oldval, h_val, sizeof(sobj), cudaMemcpyHostToDevice);
	
	float* d_newval;
	cudaMalloc((void**)&d_newval, sizeof(sobj));
	cudaMemcpy(d_newval, h_newval, sizeof(sobj), cudaMemcpyHostToDevice);

	float* d_oldvec;
	cudaMalloc((void**)&d_oldvec, mem_size_old);
	cudaMemcpy(d_oldvec, h_old, mem_size_old, cudaMemcpyHostToDevice);

	float* d_newvec;
	cudaMalloc((void**)&d_newvec, mem_size_new);
	cudaMemcpy(d_newvec, h_new, mem_size_new, cudaMemcpyHostToDevice);

	Replacement2 << <dim3(1,3), dim3(1024, 1) >> >(d_oldval, d_newval, 1024, d_oldvec, d_newvec);

	cudaMemcpy(h_old, d_oldvec, mem_size_old, cudaMemcpyDeviceToHost);
	
	for (int x1 = 0; x1 < 3; x1++){
		for (int y1 = 0; y1 < 1024; y1++)
			fprintf(fpout, "%f  ", h_old[x1 * 1024 + y1]);
		fprintf(fpout, "\n");
	}

    return 0;
}

but the replacement happens only in the first row. what is the problem??

I spotted one problem (though it’s likely not the root cause of why the program is not working)

Never call __syncthreads() when not all threads of a block would be taking part in executing the statement.

The __syncthreads() must be outside the if() condition as it depends on the variable row which itself is a function of threadIdx.y.

+1 :)