Cuda kernel is not working and tried to detect errors using gpuAsset() but, no error message
My code is working with no error messages but it's not functioning "it does not write the secret word on the output file. Can you tell me if there is any error in my code? [code] #include <stdio.h> #include <stdlib.h> #include <unistd.h> #include <string.h> #include <time.h> #include <sys/time.h> #include <cuda_runtime.h> #include <cuda.h> #include <device_launch_parameters.h> #define INPUT_FILE_NAME "encodedfile.txt" #define LINE_SIZE 100 #define LINES_COUNT 15360 #define SPECIAL_CHAR ',' #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) { if (code != cudaSuccess) { fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); if (abort) exit(code); } } // Function used to read the text file into a 1-d array and return a pointer to it char *read_file(FILE *file); // Serial program to find out the secret word void find_secret_word_serial(char *file_as_line_h, char **secret_word_h); void write_to_file(char *secret_word_h); __global__ void find_secret_word_parallel(char *file_as_line, char *secret_word, int n){ int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx <= n && idx>0){ // if current index does not exceed n: the total file length if (file_as_line[idx-1] == SPECIAL_CHAR){ secret_word[idx] = file_as_line[idx]; } } } int main(int argc, char **argv){ struct timeval stop, start; float elapsed; char *file_as_line_h = NULL; FILE *file = fopen(INPUT_FILE_NAME, "r"); char *secret_word_h; size_t n = LINE_SIZE * LINES_COUNT; secret_word_h = (char *) calloc(n, sizeof(char)); file_as_line_h= read_file(file); // Calculating the elapsing start time gettimeofday(&start, NULL); // Check if 's' passed to main function then run serial program else, run parallel program if (argc == 2 && !strcmp(argv[1], "s")) { find_secret_word_serial(file_as_line_h, &secret_word_h); }else if (argc == 2 && !strcmp(argv[1], "p")){ size_t size = n * sizeof(char); int num_blocks; int block_size; char *file_as_line_d, *secret_word_d; // Allocating memory space for variables on Device gpuErrchk(cudaMalloc((void **) &file_as_line_d, size)); gpuErrchk(cudaMalloc((void **) &secret_word_d, size)); printf("%s\n", file_as_line_d); // Copy our data from Host to Device gpuErrchk(cudaMemcpy(file_as_line_d, file_as_line_h, n, cudaMemcpyHostToDevice)); // Do calculation on Device side block_size = 1024; num_blocks = (int) (n / block_size); find_secret_word_parallel <<< num_blocks, block_size>>> (file_as_line_d, secret_word_d, n); gpuErrchk(cudaPeekAtLastError()); // Copying data back from Device to Host gpuErrchk(cudaMemcpy(secret_word_h, secret_word_d, n, cudaMemcpyDeviceToHost)); gpuErrchk(cudaDeviceSynchronize()); // Freeing the allocated memory space cudaFree(file_as_line_d); cudaFree(secret_word_d); }else{ printf("Error while receiving the arguments ro main function!\n"); exit(-1); } // Calculating the elapsing end time gettimeofday(&stop, NULL); elapsed = (stop.tv_sec - start.tv_sec) * 1000.0f + (stop.tv_usec-start.tv_usec) / 1000.0f; printf("%s\n", secret_word_h); printf("\nCode executed in %f milliseconds or %f seconds.\n", elapsed, elapsed/1000); write_to_file(secret_word_h); // Closing and freeing the memory fclose(file); free(secret_word_h); } // Function used to read the text file into a 1-d array and return a pointer to it char *read_file(FILE *file){ char *fileAsOneLine; char line[LINE_SIZE]; int i; if (!file){ printf("Could not open the text file! Check the name or existence of your file!\n"); exit(-1); } // Allocate memory for pointers to pointers to chars according to lines count fileAsOneLine = (char *)calloc((size_t)LINES_COUNT * LINE_SIZE, sizeof(char)); if (fileAsOneLine == NULL) { printf("Cannot allocate space for lines of the file!\n"); exit(-2); } for (i = 0; i < LINES_COUNT; i++) { fgets(line, LINE_SIZE, file); strcat(fileAsOneLine, line); } /* Return the array of chars */ return fileAsOneLine; } // Serial program to find out the secret word void find_secret_word_serial(char *file_as_line_h, char **secret_word_h){ int i; int secret_word_i = 0; char *temp = *secret_word_h; for (i = 0; i < strlen(file_as_line_h); i++) { if (file_as_line_h[i]== SPECIAL_CHAR){ temp[secret_word_i++] = file_as_line_h[++i]; } } } // Function to write out the secret word on "decoded.txt" file void write_to_file(char *secret_word_h){ FILE *output = fopen("decoded.txt", "w"); fprintf(output, "%s", secret_word_h); fclose(output); } [/code]
My code is working with no error messages but it's not functioning "it does not write the secret word on the output file. Can you tell me if there is any error in my code?
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <time.h>
#include <sys/time.h>
#include <cuda_runtime.h>
#include <cuda.h>
#include <device_launch_parameters.h>


#define INPUT_FILE_NAME "encodedfile.txt"
#define LINE_SIZE 100
#define LINES_COUNT 15360
#define SPECIAL_CHAR ','


#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}


// Function used to read the text file into a 1-d array and return a pointer to it
char *read_file(FILE *file);

// Serial program to find out the secret word
void find_secret_word_serial(char *file_as_line_h, char **secret_word_h);

void write_to_file(char *secret_word_h);

__global__ void find_secret_word_parallel(char *file_as_line, char *secret_word, int n){
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx <= n && idx>0){ // if current index does not exceed n: the total file length
if (file_as_line[idx-1] == SPECIAL_CHAR){
secret_word[idx] = file_as_line[idx];
}
}
}

int main(int argc, char **argv){
struct timeval stop, start;
float elapsed;
char *file_as_line_h = NULL;
FILE *file = fopen(INPUT_FILE_NAME, "r");
char *secret_word_h;
size_t n = LINE_SIZE * LINES_COUNT;

secret_word_h = (char *) calloc(n, sizeof(char));
file_as_line_h= read_file(file);

// Calculating the elapsing start time
gettimeofday(&start, NULL);

// Check if 's' passed to main function then run serial program else, run parallel program
if (argc == 2 && !strcmp(argv[1], "s")) {
find_secret_word_serial(file_as_line_h, &secret_word_h);
}else if (argc == 2 && !strcmp(argv[1], "p")){

size_t size = n * sizeof(char);

int num_blocks;
int block_size;

char *file_as_line_d, *secret_word_d;

// Allocating memory space for variables on Device
gpuErrchk(cudaMalloc((void **) &file_as_line_d, size));
gpuErrchk(cudaMalloc((void **) &secret_word_d, size));

printf("%s\n", file_as_line_d);

// Copy our data from Host to Device
gpuErrchk(cudaMemcpy(file_as_line_d, file_as_line_h, n, cudaMemcpyHostToDevice));

// Do calculation on Device side
block_size = 1024;
num_blocks = (int) (n / block_size);
find_secret_word_parallel <<< num_blocks, block_size>>> (file_as_line_d, secret_word_d, n);
gpuErrchk(cudaPeekAtLastError());


// Copying data back from Device to Host
gpuErrchk(cudaMemcpy(secret_word_h, secret_word_d, n, cudaMemcpyDeviceToHost));

gpuErrchk(cudaDeviceSynchronize());

// Freeing the allocated memory space
cudaFree(file_as_line_d);
cudaFree(secret_word_d);

}else{
printf("Error while receiving the arguments ro main function!\n");
exit(-1);
}

// Calculating the elapsing end time
gettimeofday(&stop, NULL);

elapsed = (stop.tv_sec - start.tv_sec) * 1000.0f
+ (stop.tv_usec-start.tv_usec) / 1000.0f;

printf("%s\n", secret_word_h);

printf("\nCode executed in %f milliseconds or %f seconds.\n", elapsed, elapsed/1000);

write_to_file(secret_word_h);

// Closing and freeing the memory
fclose(file);
free(secret_word_h);
}


// Function used to read the text file into a 1-d array and return a pointer to it
char *read_file(FILE *file){
char *fileAsOneLine;
char line[LINE_SIZE];
int i;

if (!file){
printf("Could not open the text file! Check the name or existence of your file!\n");
exit(-1);
}

// Allocate memory for pointers to pointers to chars according to lines count
fileAsOneLine = (char *)calloc((size_t)LINES_COUNT * LINE_SIZE, sizeof(char));
if (fileAsOneLine == NULL) {
printf("Cannot allocate space for lines of the file!\n");
exit(-2);
}

for (i = 0; i < LINES_COUNT; i++) {
fgets(line, LINE_SIZE, file);

strcat(fileAsOneLine, line);
}

/* Return the array of chars */
return fileAsOneLine;
}


// Serial program to find out the secret word
void find_secret_word_serial(char *file_as_line_h, char **secret_word_h){
int i;
int secret_word_i = 0;
char *temp = *secret_word_h;
for (i = 0; i < strlen(file_as_line_h); i++) {
if (file_as_line_h[i]== SPECIAL_CHAR){
temp[secret_word_i++] = file_as_line_h[++i];
}
}
}

// Function to write out the secret word on "decoded.txt" file
void write_to_file(char *secret_word_h){
FILE *output = fopen("decoded.txt", "w");

fprintf(output, "%s", secret_word_h);

fclose(output);
}
Attachments

encodedfile.txt

#1
Posted 12/30/2017 03:02 PM   
[code]if (file_as_line[idx-1] == SPECIAL_CHAR){[/code] what happens when idx is zero? Isn't idx-1 going to generate an illegal index? Try running your code with cuda-memcheck [code]if (file_as_line[idx-1] == SPECIAL_CHAR){ secret_word[idx] = file_as_line[idx]; }[/code] When the SPECIAL_CHAR is found, this will only copy one character from the input to the output. Other threads won't see the same point in the input. I recommend you run your code with cuda-memcheck before asking others for help. I also recommend providing a complete code (that someone else could compile and run) with a description of a sample input file, if you want help.
if (file_as_line[idx-1] == SPECIAL_CHAR){


what happens when idx is zero? Isn't idx-1 going to generate an illegal index? Try running your code with cuda-memcheck


if (file_as_line[idx-1] == SPECIAL_CHAR){
secret_word[idx] = file_as_line[idx];
}


When the SPECIAL_CHAR is found, this will only copy one character from the input to the output. Other threads won't see the same point in the input.

I recommend you run your code with cuda-memcheck before asking others for help.

I also recommend providing a complete code (that someone else could compile and run) with a description of a sample input file, if you want help.

#2
Posted 12/30/2017 03:33 PM   
Thanks, I am totally beginner in Cuda programming. I edited this to handle the array bounding but still not working. This program can be run using this command nvcc main.cu ./main p => for parallel function ./main s => for serial function I think I have a problem in memory copying from Host to Device. Note: I attached a test file and its name is hard coded in the file.
Thanks, I am totally beginner in Cuda programming. I edited this to handle the array bounding but still not working.
This program can be run using this command

nvcc main.cu

./main p => for parallel function
./main s => for serial function

I think I have a problem in memory copying from Host to Device.
Note: I attached a test file and its name is hard coded in the file.

#3
Posted 12/30/2017 03:48 PM   
OK so the goal of your program is to take the input file, and copy each character that is preceded by a comma (SPECIAL_CHAR) to an output string (and file). This is in a general category of problems called "stream compaction". Performing stream compaction is relatively easy in serial code but requires some special techniques in parallel. Your code has a variety of problems (e.g. a seg fault) but the most important one is conceptual/structural. Your parallel code will copy input to output like this: [code]input: 230olsdl,Msdif,0aldfjk,zalsdfjk... output: M 0 z ...[/code] when what you really want is: [code]input: 230olsdl,Msdif,0aldfjk,zalsdfjk... output: M0z...[/code] To get from what your program produces, to what you want, it's necessary to get rid of the "empty space". This is stream compaction. To do it in parallel, we need to: 1. figure out where each selected element should go 2. copy it there step 1 is mostly accomplished (usually) with a parallel scan (also called a parallel prefix sum). A scan operation does this: [code]input: 230olsdl,Msdif,0aldfjk,zalsdfjk... output: M 0 z ... scan: 0 1 2 ...[/code] Once we have the scan information, we can use it to copy the selected element to the correct location in the final result. You can write your own scan code: [url]https://developer.nvidia.com/gpugems/GPUGems3/gpugems3_ch39.html[/url] but I don't recommend it. Instead I would suggest using a library implementation of a scan. Of course, if you're going to use a library implementation of the scan, it's possible to just use a library implementation of a stream compaction operation, and take care of everything in one step. Thrust has both scan primitives: [url]https://thrust.github.io/doc/group__prefixsums.html[/url] and stream compaction algorithms: [url]https://thrust.github.io/doc/group__stream__compaction.html[/url] Back to your code, this will cause a seg fault: [code]gpuErrchk(cudaMalloc((void **) &file_as_line_d, size)); ... printf("%s\n", file_as_line_d);[/code] You are allocating file_as_line_d on the device, so it is a pointer to a location in device memory. You cannot use that in printf as if it were a pointer to a string in host memory. Trying to do so will seg fault. I'm not sure why you have it, I guess it is part of your debugging effort. This is also better but not quite right: [code]if (idx <= n && idx>0){ // if current index does not exceed n: the total file length if (file_as_line[idx-1] == SPECIAL_CHAR){ secret_word[idx] = file_as_line[idx];[/code] In C, when we are doing indexing into an array of length n, we must stop at position n-1, since C indexing starts at 0. Therefore the correct limit is: [code]if (idx < n && idx>0)[/code] Finally, you'll want to launch an extra block to pick up any remaining elements in your string: [code]num_blocks = (int) (n / block_size) +1;[/code] With those changes, your code runs without runtime error. However it is still producing the first output example I gave above, so your write-string-to-file routine sees a null character as the very first string element, and stops there. You need to do stream compaction at that point.
OK so the goal of your program is to take the input file, and copy each character that is preceded by a comma (SPECIAL_CHAR) to an output string (and file). This is in a general category of problems called "stream compaction". Performing stream compaction is relatively easy in serial code but requires some special techniques in parallel.

Your code has a variety of problems (e.g. a seg fault) but the most important one is conceptual/structural. Your parallel code will copy input to output like this:

input:   230olsdl,Msdif,0aldfjk,zalsdfjk...
output: M 0 z ...



when what you really want is:

input:   230olsdl,Msdif,0aldfjk,zalsdfjk...
output: M0z...



To get from what your program produces, to what you want, it's necessary to get rid of the "empty space". This is stream compaction. To do it in parallel, we need to:

1. figure out where each selected element should go
2. copy it there

step 1 is mostly accomplished (usually) with a parallel scan (also called a parallel prefix sum). A scan operation does this:

input:   230olsdl,Msdif,0aldfjk,zalsdfjk...
output: M 0 z ...
scan: 0 1 2 ...


Once we have the scan information, we can use it to copy the selected element to the correct location in the final result.

You can write your own scan code:

https://developer.nvidia.com/gpugems/GPUGems3/gpugems3_ch39.html

but I don't recommend it. Instead I would suggest using a library implementation of a scan. Of course, if you're going to use a library implementation of the scan, it's possible to just use a library implementation of a stream compaction operation, and take care of everything in one step. Thrust has both scan primitives:

https://thrust.github.io/doc/group__prefixsums.html

and stream compaction algorithms:

https://thrust.github.io/doc/group__stream__compaction.html

Back to your code, this will cause a seg fault:


gpuErrchk(cudaMalloc((void **) &file_as_line_d, size));
...
printf("%s\n", file_as_line_d);


You are allocating file_as_line_d on the device, so it is a pointer to a location in device memory. You cannot use that in printf as if it were a pointer to a string in host memory. Trying to do so will seg fault. I'm not sure why you have it, I guess it is part of your debugging effort.

This is also better but not quite right:


if (idx <= n && idx>0){              // if current index does not exceed n: the total file length
if (file_as_line[idx-1] == SPECIAL_CHAR){
secret_word[idx] = file_as_line[idx];


In C, when we are doing indexing into an array of length n, we must stop at position n-1, since C indexing starts at 0. Therefore the correct limit is:

if (idx < n && idx>0)


Finally, you'll want to launch an extra block to pick up any remaining elements in your string:

num_blocks = (int) (n / block_size) +1;



With those changes, your code runs without runtime error. However it is still producing the first output example I gave above, so your write-string-to-file routine sees a null character as the very first string element, and stops there. You need to do stream compaction at that point.

#4
Posted 12/30/2017 05:49 PM   
Thank you! I have two questions: 1) For file_as_line_h is the array of characters to be scanned and this needs to be copied to file_as_line_d as a whole. Does that cause any problem? 2) My big problem is having distributed and sparse so, we need to copy the only needed characters not the whole array, right?
Thank you! I have two questions:

1) For file_as_line_h is the array of characters to be scanned and this needs to be copied to file_as_line_d as a whole. Does that cause any problem?

2) My big problem is having distributed and sparse so, we need to copy the only needed characters not the whole array, right?

#5
Posted 12/30/2017 07:18 PM   
[quote=""]Thank you! I have two questions: 1) For file_as_line_h is the array of characters to be scanned and this needs to be copied to file_as_line_d as a whole. Does that cause any problem?[/quote] No, that is what you would want to do. [quote=""] 2) My big problem is having distributed and sparse so, we need to copy the only needed characters not the whole array, right? [/quote] Yes, your result is "sparse" so you need to copy just the needed characters, not the whole array. By the way, I don't think your use of fgets is giving you what you expect: [code]fgets(line, LINE_SIZE, file);[/code] you might want to read the man page for that carefully. As a result, your input file length is only about half the size of the file on disk.
said:Thank you! I have two questions:

1) For file_as_line_h is the array of characters to be scanned and this needs to be copied to file_as_line_d as a whole. Does that cause any problem?


No, that is what you would want to do.

said:
2) My big problem is having distributed and sparse so, we need to copy the only needed characters not the whole array, right?


Yes, your result is "sparse" so you need to copy just the needed characters, not the whole array.

By the way, I don't think your use of fgets is giving you what you expect:

fgets(line, LINE_SIZE, file);


you might want to read the man page for that carefully. As a result, your input file length is only about half the size of the file on disk.

#6
Posted 12/30/2017 07:24 PM   
I think you may have a bug in your serial computation routine. I believe this: [code]temp[secret_word_i++] = file_as_line_h[++i];[/code] should be this: [code]temp[secret_word_i++] = file_as_line_h[i+1];[/code] With that change, here is a sample code that uses thrust to do the whole operation: [code]$ cat t15.cu #include <stdio.h> #include <stdlib.h> #include <unistd.h> #include <string.h> #include <time.h> #include <sys/time.h> #include <cuda_runtime.h> #include <cuda.h> #include <device_launch_parameters.h> #include <thrust/copy.h> #include <thrust/execution_policy.h> using namespace thrust::placeholders; #define INPUT_FILE_NAME "encodedfile.txt" #define LINE_SIZE 102 #define LINES_COUNT 15360 #define SPECIAL_CHAR ',' #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) { if (code != cudaSuccess) { fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); if (abort) exit(code); } } // Function used to read the text file into a 1-d array and return a pointer to it char *read_file(FILE *file); // Serial program to find out the secret word void find_secret_word_serial(char *file_as_line_h, char **secret_word_h); void write_to_file(char *secret_word_h); __global__ void find_secret_word_parallel(char *file_as_line, char *secret_word, int n){ int idx = blockIdx.x * blockDim.x + blockIdx.x; if (idx < n && idx>0){ // if current index does not exceed n: the total file length if (file_as_line[idx-1] == SPECIAL_CHAR){ secret_word[idx] = file_as_line[idx]; } } } int main(int argc, char **argv){ struct timeval stop, start; float elapsed; char *file_as_line_h = NULL; FILE *file = fopen(INPUT_FILE_NAME, "r"); char *secret_word_h; size_t n = LINE_SIZE * LINES_COUNT; secret_word_h = (char *) calloc(n, sizeof(char)); file_as_line_h= read_file(file); // Calculating the elapsing start time gettimeofday(&start, NULL); // Check if 's' passed to main function then run serial program else, run parallel program if (argc == 2 && !strcmp(argv[1], "s")) { find_secret_word_serial(file_as_line_h, &secret_word_h); }else if (argc == 2 && !strcmp(argv[1], "p")){ size_t size = n * sizeof(char); char *file_as_line_d, *secret_word_d; // Allocating memory space for variables on Device gpuErrchk(cudaMalloc((void **) &file_as_line_d, size)); gpuErrchk(cudaMalloc((void **) &secret_word_d, size)); //printf("%s\n", file_as_line_d); // Copy our data from Host to Device gpuErrchk(cudaMemcpy(file_as_line_d, file_as_line_h, n, cudaMemcpyHostToDevice)); #ifdef ORIG int num_blocks; int block_size; // Do calculation on Device side block_size = 1024; num_blocks = (int) (n / block_size) + 1; find_secret_word_parallel <<< num_blocks, block_size>>> (file_as_line_d, secret_word_d, n); gpuErrchk(cudaPeekAtLastError()); #else gpuErrchk(cudaMemset(secret_word_d, 0, size)); thrust::copy_if(thrust::device, file_as_line_d+1, file_as_line_d+n, file_as_line_d, secret_word_d, _1==SPECIAL_CHAR); #endif // Copying data back from Device to Host gpuErrchk(cudaMemcpy(secret_word_h, secret_word_d, n, cudaMemcpyDeviceToHost)); gpuErrchk(cudaDeviceSynchronize()); // Freeing the allocated memory space cudaFree(file_as_line_d); cudaFree(secret_word_d); }else{ printf("Error while receiving the arguments to main function!\n"); exit(-1); } // Calculating the elapsing end time gettimeofday(&stop, NULL); elapsed = (stop.tv_sec - start.tv_sec) * 1000.0f + (stop.tv_usec-start.tv_usec) / 1000.0f; //printf("%s\n", secret_word_h); printf("\nCode executed in %f milliseconds or %f seconds.\n", elapsed, elapsed/1000); write_to_file(secret_word_h); // Closing and freeing the memory fclose(file); free(secret_word_h); } // Function used to read the text file into a 1-d array and return a pointer to it char *read_file(FILE *file){ char *fileAsOneLine; char line[LINE_SIZE]; int i; if (!file){ printf("Could not open the text file! Check the name or existence of your file!\n"); exit(-1); } // Allocate memory for pointers to pointers to chars according to lines count fileAsOneLine = (char *)calloc((size_t)LINES_COUNT * LINE_SIZE, sizeof(char)); if (fileAsOneLine == NULL) { printf("Cannot allocate space for lines of the file!\n"); exit(-2); } for (i = 0; i < LINES_COUNT; i++) { fgets(line, LINE_SIZE, file); strcat(fileAsOneLine, line); } /* Return the array of chars */ return fileAsOneLine; } // Serial program to find out the secret word void find_secret_word_serial(char *file_as_line_h, char **secret_word_h){ int i; int secret_word_i = 0; char *temp = *secret_word_h; for (i = 0; i < strlen(file_as_line_h); i++) { if (file_as_line_h[i]== SPECIAL_CHAR){ temp[secret_word_i++] = file_as_line_h[i+1]; } } } // Function to write out the secret word on "decoded.txt" file void write_to_file(char *secret_word_h){ FILE *output = fopen("decoded.txt", "w"); fprintf(output, "%s", secret_word_h); fclose(output); } $ nvcc -arch=sm_35 t15.cu -o t15 $ ./t15 s Code executed in 80508.789062 milliseconds or 80.508789 seconds. $ mv decoded.txt decoded_s.txt $ ./t15 p Code executed in 316.825012 milliseconds or 0.316825 seconds. $ diff decoded.txt decoded_s.txt $[/code]
I think you may have a bug in your serial computation routine. I believe this:

temp[secret_word_i++] = file_as_line_h[++i];



should be this:

temp[secret_word_i++] = file_as_line_h[i+1];



With that change, here is a sample code that uses thrust to do the whole operation:

$ cat t15.cu
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <time.h>
#include <sys/time.h>
#include <cuda_runtime.h>
#include <cuda.h>
#include <device_launch_parameters.h>
#include <thrust/copy.h>
#include <thrust/execution_policy.h>
using namespace thrust::placeholders;

#define INPUT_FILE_NAME "encodedfile.txt"
#define LINE_SIZE 102
#define LINES_COUNT 15360
#define SPECIAL_CHAR ','


#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}


// Function used to read the text file into a 1-d array and return a pointer to it
char *read_file(FILE *file);

// Serial program to find out the secret word
void find_secret_word_serial(char *file_as_line_h, char **secret_word_h);

void write_to_file(char *secret_word_h);

__global__ void find_secret_word_parallel(char *file_as_line, char *secret_word, int n){
int idx = blockIdx.x * blockDim.x + blockIdx.x;
if (idx < n && idx>0){ // if current index does not exceed n: the total file length
if (file_as_line[idx-1] == SPECIAL_CHAR){
secret_word[idx] = file_as_line[idx];
}
}
}

int main(int argc, char **argv){
struct timeval stop, start;
float elapsed;
char *file_as_line_h = NULL;
FILE *file = fopen(INPUT_FILE_NAME, "r");
char *secret_word_h;
size_t n = LINE_SIZE * LINES_COUNT;

secret_word_h = (char *) calloc(n, sizeof(char));
file_as_line_h= read_file(file);

// Calculating the elapsing start time
gettimeofday(&start, NULL);

// Check if 's' passed to main function then run serial program else, run parallel program
if (argc == 2 && !strcmp(argv[1], "s")) {
find_secret_word_serial(file_as_line_h, &secret_word_h);
}else if (argc == 2 && !strcmp(argv[1], "p")){

size_t size = n * sizeof(char);

char *file_as_line_d, *secret_word_d;

// Allocating memory space for variables on Device
gpuErrchk(cudaMalloc((void **) &file_as_line_d, size));
gpuErrchk(cudaMalloc((void **) &secret_word_d, size));

//printf("%s\n", file_as_line_d);

// Copy our data from Host to Device
gpuErrchk(cudaMemcpy(file_as_line_d, file_as_line_h, n, cudaMemcpyHostToDevice));

#ifdef ORIG
int num_blocks;
int block_size;
// Do calculation on Device side
block_size = 1024;
num_blocks = (int) (n / block_size) + 1;
find_secret_word_parallel <<< num_blocks, block_size>>> (file_as_line_d, secret_word_d, n);
gpuErrchk(cudaPeekAtLastError());
#else
gpuErrchk(cudaMemset(secret_word_d, 0, size));
thrust::copy_if(thrust::device, file_as_line_d+1, file_as_line_d+n, file_as_line_d, secret_word_d, _1==SPECIAL_CHAR);
#endif

// Copying data back from Device to Host
gpuErrchk(cudaMemcpy(secret_word_h, secret_word_d, n, cudaMemcpyDeviceToHost));

gpuErrchk(cudaDeviceSynchronize());

// Freeing the allocated memory space
cudaFree(file_as_line_d);
cudaFree(secret_word_d);

}else{
printf("Error while receiving the arguments to main function!\n");
exit(-1);
}

// Calculating the elapsing end time
gettimeofday(&stop, NULL);

elapsed = (stop.tv_sec - start.tv_sec) * 1000.0f
+ (stop.tv_usec-start.tv_usec) / 1000.0f;

//printf("%s\n", secret_word_h);

printf("\nCode executed in %f milliseconds or %f seconds.\n", elapsed, elapsed/1000);

write_to_file(secret_word_h);

// Closing and freeing the memory
fclose(file);
free(secret_word_h);
}


// Function used to read the text file into a 1-d array and return a pointer to it
char *read_file(FILE *file){
char *fileAsOneLine;
char line[LINE_SIZE];
int i;

if (!file){
printf("Could not open the text file! Check the name or existence of your file!\n");
exit(-1);
}

// Allocate memory for pointers to pointers to chars according to lines count
fileAsOneLine = (char *)calloc((size_t)LINES_COUNT * LINE_SIZE, sizeof(char));
if (fileAsOneLine == NULL) {
printf("Cannot allocate space for lines of the file!\n");
exit(-2);
}

for (i = 0; i < LINES_COUNT; i++) {
fgets(line, LINE_SIZE, file);
strcat(fileAsOneLine, line);
}

/* Return the array of chars */
return fileAsOneLine;
}


// Serial program to find out the secret word
void find_secret_word_serial(char *file_as_line_h, char **secret_word_h){
int i;
int secret_word_i = 0;
char *temp = *secret_word_h;
for (i = 0; i < strlen(file_as_line_h); i++) {
if (file_as_line_h[i]== SPECIAL_CHAR){
temp[secret_word_i++] = file_as_line_h[i+1];
}
}
}

// Function to write out the secret word on "decoded.txt" file
void write_to_file(char *secret_word_h){
FILE *output = fopen("decoded.txt", "w");

fprintf(output, "%s", secret_word_h);

fclose(output);
}
$ nvcc -arch=sm_35 t15.cu -o t15
$ ./t15 s

Code executed in 80508.789062 milliseconds or 80.508789 seconds.
$ mv decoded.txt decoded_s.txt
$ ./t15 p

Code executed in 316.825012 milliseconds or 0.316825 seconds.
$ diff decoded.txt decoded_s.txt
$

#7
Posted 12/30/2017 08:23 PM   
The compiler did not recognize this statement: [code]thrust::device[/code] Can you tell me why my sparse array causing problems? If the memory could read the whole input file, why it's disabled to store another one of the same size.
The compiler did not recognize this statement:
thrust::device

Can you tell me why my sparse array causing problems? If the memory could read the whole input file, why it's disabled to store another one of the same size.

#8
Posted 12/31/2017 09:35 AM   
Is it possible to solve this problem using global idx?
Is it possible to solve this problem using global idx?

#9
Posted 12/31/2017 09:36 AM   
[quote=""]The compiler did not recognize this statement: [code]thrust::device[/code] [/quote] What version of CUDA are you using? [quote=""] Can you tell me why my sparse array causing problems? If the memory could read the whole input file, why it's disabled to store another one of the same size. [/quote] The sparse array isn't causing problems. It's a natural first step in the solution process. You just have more work to do once you have the sparse array. Re-read this comment: [url]https://devtalk.nvidia.com/default/topic/1028131/cuda-programming-and-performance/cuda-kernel-is-not-working-and-tried-to-detect-errors-using-gpuasset-but-no-error-message/post/5229685/#5229685[/url]
said:The compiler did not recognize this statement:
thrust::device



What version of CUDA are you using?

said:
Can you tell me why my sparse array causing problems? If the memory could read the whole input file, why it's disabled to store another one of the same size.


The sparse array isn't causing problems. It's a natural first step in the solution process. You just have more work to do once you have the sparse array. Re-read this comment:

https://devtalk.nvidia.com/default/topic/1028131/cuda-programming-and-performance/cuda-kernel-is-not-working-and-tried-to-detect-errors-using-gpuasset-but-no-error-message/post/5229685/#5229685

#10
Posted 12/31/2017 03:16 PM   
CUDA 9.1
CUDA 9.1

#11
Posted 12/31/2017 04:58 PM   
So, even if the secret word is sparse I should not get an error. So, what is causing the kernel to be doing nothing?
So, even if the secret word is sparse I should not get an error. So, what is causing the kernel to be doing nothing?

#12
Posted 12/31/2017 05:03 PM   
The kernel is not doing nothing. It is copying a sparse pattern. The area of the output array not occupied by the sparse pattern is filled with zero. zero in a character string is the NULL character, and it is used to terminate a string. When you use a string-based approach to write the output array to a file: [code]fprintf(output, "%s", secret_word_h); ^ This expects a NULL-terminated string [/code] the writing of the output stops with the first character, because the first character in your output array is the NULL character, which terminates the "string". Regarding the compiler not recognizing thrust::device, add this line to the beginning of the code: [code]#include <thrust/execution_policy.h>[/code] I've already edited my previous code to include this at the proper place. Also, if you are on windows (appears that you are) you may run into the windows WDDM TDR timeout. If the kernel execution is longer than about 2 seconds, windows will terminate it. You can work around this if you want. Google "CUDA WDDM TDR" and you will find information about it.
The kernel is not doing nothing. It is copying a sparse pattern. The area of the output array not occupied by the sparse pattern is filled with zero. zero in a character string is the NULL character, and it is used to terminate a string. When you use a string-based approach to write the output array to a file:

fprintf(output, "%s", secret_word_h);
^
This expects a NULL-terminated string



the writing of the output stops with the first character, because the first character in your output array is the NULL character, which terminates the "string".

Regarding the compiler not recognizing thrust::device, add this line to the beginning of the code:

#include <thrust/execution_policy.h>


I've already edited my previous code to include this at the proper place.

Also, if you are on windows (appears that you are) you may run into the windows WDDM TDR timeout. If the kernel execution is longer than about 2 seconds, windows will terminate it. You can work around this if you want. Google "CUDA WDDM TDR" and you will find information about it.

#13
Posted 12/31/2017 08:43 PM   
Is there another way to copy my whole string to an output file without NUll problem?
Is there another way to copy my whole string to an output file without NUll problem?

#14
Posted 12/31/2017 09:52 PM   
Yes. If you just want to write everything, including the NULL characters to a file, it is possible. That has nothing to do with CUDA. You'll need to learn more about file I/O. In C it could easily, if tediously, be done with a loop using fputc: [url]http://www.cplusplus.com/reference/cstdio/fputc/[/url] If you wrote a loop with fputc, you could just skip over the NULL characters, which would effectively do the stream-compaction step for you. Of course, if you do that, you're very nearly back to the serial CPU code implementation. There are many possible ways to do it, of course.
Yes. If you just want to write everything, including the NULL characters to a file, it is possible. That has nothing to do with CUDA. You'll need to learn more about file I/O. In C it could easily, if tediously, be done with a loop using fputc:

http://www.cplusplus.com/reference/cstdio/fputc/

If you wrote a loop with fputc, you could just skip over the NULL characters, which would effectively do the stream-compaction step for you. Of course, if you do that, you're very nearly back to the serial CPU code implementation.

There are many possible ways to do it, of course.

#15
Posted 12/31/2017 09:59 PM   
Scroll To Top

Add Reply