I am using CUDA SDK 5.0 and Cygwin.
When I compile this source code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h> /* strtok() */
#include <sys/types.h> /* open() */
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h> /* getopt() */
int _debug;
#include "kmeans.h"
/*---< usage() >------------------------------------------------------------*/
/*
 * Print the command-line help text to stderr and terminate the process.
 *
 *   argv0     - program name substituted into the "Usage:" line
 *   threshold - value printed as the default of the -t switch
 *
 * Never returns: the process exits with status -1.
 */
static void usage(char *argv0, float threshold) {
    static const char usage_fmt[] =
        "Usage: %s [switches] -i filename -n num_clusters\n"
        " -i filename : file containing data to be clustered\n"
        " -b : input file is in binary format (default no)\n"
        " -n num_clusters: number of clusters (K must > 1)\n"
        " -t threshold : threshold value (default %.4f)\n"
        " -o : output timing results (default no)\n"
        " -d : enable debug mode\n";

    fprintf(stderr, usage_fmt, argv0, threshold);
    exit(-1);
}
/*---< main() >-------------------------------------------------------------*/
/*
 * Command-line driver for the CUDA k-means implementation.
 *
 * Parses the switches described by usage(), reads the data objects with
 * file_read(), clusters them with cuda_kmeans(), writes the result with
 * file_write() and, when -o is given, prints I/O and computation timings.
 *
 * Returns 0 on success; exits with a non-zero status on bad arguments,
 * unreadable input, or allocation failure.
 */
int main(int argc, char **argv) {
    int opt;
    extern char *optarg;
    extern int optind;
    int isBinaryFile, is_output_timing;
    int numClusters, numCoords, numObjs;
    int *membership;    /* [numObjs] */
    char *filename;
    float **objects;    /* [numObjs][numCoords] data objects */
    float **clusters;   /* [numClusters][numCoords] cluster center */
    float threshold;
    /* Only meaningful when is_output_timing is set; zero-initialised so they
       are never read uninitialised. */
    double timing = 0.0, io_timing = 0.0, clustering_timing = 0.0;
    int loop_iterations;

    /* some default values */
    _debug = 0;
    threshold = 0.001f;
    numClusters = 0;
    isBinaryFile = 0;
    is_output_timing = 0;
    filename = NULL;

    /* getopt() signals the end of the options with -1 (POSIX), not EOF. */
    while ((opt = getopt(argc, argv, "p:i:n:t:abdo")) != -1) {
        switch (opt) {
            case 'i': filename = optarg;
                      break;
            case 'b': isBinaryFile = 1;
                      break;
            case 't': threshold = atof(optarg);
                      break;
            case 'n': numClusters = atoi(optarg);
                      break;
            case 'o': is_output_timing = 1;
                      break;
            case 'd': _debug = 1;
                      break;
            case '?': usage(argv[0], threshold);
                      break;
            /* Also reached for -p and -a, which are accepted by the option
               string but have no handler here. */
            default:  usage(argv[0], threshold);
                      break;
        }
    }

    if (filename == 0 || numClusters <= 1) usage(argv[0], threshold);

    if (is_output_timing) io_timing = wtime();

    /* read data points from file ------------------------------------------*/
    objects = file_read(isBinaryFile, filename, &numObjs, &numCoords);
    if (objects == NULL) exit(1);

    /* The clustering routine seeds its centres from the first numClusters
       objects, so asking for more clusters than objects would read past the
       end of the data. */
    if (numObjs < numClusters) {
        fprintf(stderr,
                "Error: number of clusters (%d) must not exceed the number of objects (%d)\n",
                numClusters, numObjs);
        exit(1);
    }

    if (is_output_timing) {
        timing = wtime();
        io_timing = timing - io_timing;
        clustering_timing = timing;
    }

    /* start the timer for the core computation -----------------------------*/
    /* membership: the cluster id for each data object */
    membership = (int*) malloc((size_t)numObjs * sizeof(int));
    if (membership == NULL) {
        /* Explicit check: an assert() would vanish in NDEBUG builds. */
        fprintf(stderr, "Error: unable to allocate membership array for %d objects\n",
                numObjs);
        exit(1);
    }

    clusters = cuda_kmeans(objects, numCoords, numObjs, numClusters, threshold,
                           membership, &loop_iterations);

    free(objects[0]);
    free(objects);

    if (is_output_timing) {
        timing = wtime();
        clustering_timing = timing - clustering_timing;
    }

    /* output: the coordinates of the cluster centres ----------------------*/
    file_write(filename, numClusters, numObjs, numCoords, clusters,
               membership);

    free(membership);
    free(clusters[0]);
    free(clusters);

    /*---- output performance numbers ---------------------------------------*/
    if (is_output_timing) {
        io_timing += wtime() - timing;
        printf("\nPerforming **** Regular Kmeans (CUDA version) ****\n");
        printf("Input file: %s\n", filename);
        printf("numObjs = %d\n", numObjs);
        printf("numCoords = %d\n", numCoords);
        printf("numClusters = %d\n", numClusters);
        printf("threshold = %.4f\n", threshold);
        printf("Loop iterations = %d\n", loop_iterations);
        printf("I/O time = %10.4f sec\n", io_timing);
        printf("Computation timing = %10.4f sec\n", clustering_timing);
    }

    return(0);
}
I get this error message:
cuda_main.cu
cuda_main.cu(48) : fatal error C1083: Cannot open include file: ‘unistd.h’: No such file or directory
Makefile:118: recipe for target ‘cuda_main.o’ failed
make: *** [cuda_main.o] Error 2
This file doesn’t appear to have any cuda-specific syntax in it.
Try compiling it with gcc from cygwin instead of nvcc (rename it to cuda_main.cpp). This will require changes to the Makefile. If gcc from cygwin can’t find unistd.h then your problem has nothing to do with cuda.
The source code above is the file that calls “cuda_kmeans” (at line 90).
This is the cuda_kmeans code:
#include <stdio.h>
#include <stdlib.h>
#include "kmeans.h"
/*
 * Return the smallest power of two that is >= n.
 *
 * Inputs n <= 1 (including zero and negative values) yield 1, so the result
 * is always a usable, non-zero power of two. The bit smearing is done on an
 * unsigned copy to avoid right-shifting a negative signed value.
 *
 * NOTE: for n > 2^30 the mathematical answer (2^31) does not fit in an int;
 * callers must keep n within that range.
 */
static inline int nextPowerOfTwo(int n) {
    if (n <= 1) {
        return 1;
    }

    unsigned int v = (unsigned int)n - 1u;

    /* Propagate the highest set bit into every lower position. */
    v |= v >> 1;
    v |= v >> 2;
    v |= v >> 4;
    v |= v >> 8;
    v |= v >> 16;
    // v |= v >> 32;    //  For 64-bit ints

    return (int)(v + 1u);
}
/*----< euclid_dist_2() >----------------------------------------------------*/
/* square of Euclid distance between two multi-dimensional points */
/*
 * Squared Euclidean distance between object `objectId` and cluster centre
 * `clusterId`.
 *
 * Both arrays are stored coordinate-major: element (coord, id) of `objects`
 * lives at objects[numObjs * coord + id], and likewise for `clusters` with
 * numClusters as the row length. No square root is taken.
 */
__host__ __device__ inline static
float euclid_dist_2(int numCoords,
                    int numObjs,
                    int numClusters,
                    float *objects,     // [numCoords][numObjs]
                    float *clusters,    // [numCoords][numClusters]
                    int objectId,
                    int clusterId)
{
    float sum = 0.0;

    for (int dim = 0; dim < numCoords; dim++) {
        // Per-coordinate difference, squared and accumulated.
        const float diff = objects[numObjs * dim + objectId]
                         - clusters[numClusters * dim + clusterId];
        sum += diff * diff;
    }

    return sum;
}
/*----< find_nearest_cluster() >---------------------------------------------*/
/*
 * Kernel: assign each object to its nearest cluster centre and count, per
 * block, how many objects changed membership.
 *
 * Launch layout: 1-D grid, one thread per object
 * (objectId = blockDim.x * blockIdx.x + threadIdx.x). Threads whose objectId
 * is >= numObjs do no assignment but still take part in the block-wide
 * reduction, so every thread of the block reaches every __syncthreads().
 *
 * Preconditions:
 *   - blockDim.x must be a power of two (tree reduction below).
 *   - blockDim.x must not exceed 255: the per-thread change flags are summed
 *     in unsigned char storage.
 *   - Dynamic shared memory: blockDim.x bytes for the flags, plus
 *     numClusters * numCoords floats when BLOCK_SHARED_MEM_OPTIMIZATION is
 *     enabled (the float region starts at byte offset blockDim.x).
 *
 * Outputs:
 *   - membership[objectId] receives the index of the nearest cluster.
 *   - intermediates[blockIdx.x] receives the number of objects in this block
 *     whose membership changed.
 */
__global__ static
void find_nearest_cluster(int numCoords,
                          int numObjs,
                          int numClusters,
                          float *objects,           //  [numCoords][numObjs]
                          float *deviceClusters,    //  [numCoords][numClusters]
                          int *membership,          //  [numObjs]
                          int *intermediates)
{
    extern __shared__ char sharedMemory[];

    //  The type chosen for membershipChanged must be large enough to support
    //  reductions! There are blockDim.x elements, one for each thread in the
    //  block. See numThreadsPerClusterBlock in cuda_kmeans().
    unsigned char *membershipChanged = (unsigned char *)sharedMemory;
#if BLOCK_SHARED_MEM_OPTIMIZATION
    float *clusters = (float *)(sharedMemory + blockDim.x);
#else
    float *clusters = deviceClusters;
#endif

    membershipChanged[threadIdx.x] = 0;

#if BLOCK_SHARED_MEM_OPTIMIZATION
    //  BEWARE: We can overrun our shared memory here if there are too many
    //  clusters or too many coordinates! For reference, a Tesla C1060 has 16
    //  KiB of shared memory per block, and a GeForce GTX 480 has 48 KiB of
    //  shared memory per block.
    for (int i = threadIdx.x; i < numClusters; i += blockDim.x) {
        for (int j = 0; j < numCoords; j++) {
            clusters[numClusters * j + i] = deviceClusters[numClusters * j + i];
        }
    }
    __syncthreads();
#endif

    int objectId = blockDim.x * blockIdx.x + threadIdx.x;

    if (objectId < numObjs) {
        int   index, i;
        float dist, min_dist;

        /* find the cluster id that has min distance to object */
        index    = 0;
        min_dist = euclid_dist_2(numCoords, numObjs, numClusters,
                                 objects, clusters, objectId, 0);

        for (i=1; i<numClusters; i++) {
            dist = euclid_dist_2(numCoords, numObjs, numClusters,
                                 objects, clusters, objectId, i);
            /* no need square root */
            if (dist < min_dist) { /* find the min and its array index */
                min_dist = dist;
                index    = i;
            }
        }

        if (membership[objectId] != index) {
            membershipChanged[threadIdx.x] = 1;
        }

        /* assign the membership to object objectId */
        membership[objectId] = index;
    }

    //  The barrier and the reduction live OUTSIDE the bounds check: when
    //  numObjs is not a multiple of blockDim.x, the last block contains
    //  threads that fail the check, and a __syncthreads() reached by only
    //  part of a block is undefined behaviour (it can hang the kernel).
    //  Out-of-range threads contribute the 0 they stored above.
    __syncthreads();    //  For membershipChanged[]

    //  blockDim.x *must* be a power of two!
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (threadIdx.x < s) {
            membershipChanged[threadIdx.x] +=
                membershipChanged[threadIdx.x + s];
        }
        __syncthreads();
    }

    //  Thread 0 publishes the block total. The objectId test keeps the
    //  original behaviour of not writing for a block that lies entirely
    //  beyond numObjs.
    if (threadIdx.x == 0 && objectId < numObjs) {
        intermediates[blockIdx.x] = membershipChanged[0];
    }
}
/*
 * Kernel: sum the first numIntermediates entries of deviceIntermediates and
 * store the total in deviceIntermediates[0].
 *
 * The reduction runs in dynamic shared memory declared as unsigned int, one
 * element per thread. numIntermediates2 is the reduction width and *must* be
 * a power of two; per the original notes it equals the number of threads
 * launched. The code indexes only by threadIdx.x, i.e. it is written for a
 * single-block launch.
 */
__global__ static
void compute_delta(int *deviceIntermediates,
                   int numIntermediates,    //  The actual number of intermediates
                   int numIntermediates2)   //  The next power of two
{
    //  One slot per launched thread (numIntermediates2 slots in total).
    extern __shared__ unsigned int intermediates[];

    const unsigned int tid = threadIdx.x;

    //  Stage the global values in shared memory; slots past the real count
    //  are padded with zero so they do not affect the sum.
    if (tid < numIntermediates) {
        intermediates[tid] = deviceIntermediates[tid];
    } else {
        intermediates[tid] = 0;
    }
    __syncthreads();

    //  Tree reduction: halve the active width each pass.
    unsigned int half = numIntermediates2 / 2;
    while (half > 0) {
        if (tid < half) {
            intermediates[tid] += intermediates[tid + half];
        }
        __syncthreads();
        half >>= 1;
    }

    //  Thread 0 writes the grand total back over the first input element.
    if (tid == 0) {
        deviceIntermediates[0] = intermediates[0];
    }
}
/*----< cuda_kmeans() >-------------------------------------------------------*/
//
// ----------------------------------------
// DATA LAYOUT
//
// objects [numObjs][numCoords]
// clusters [numClusters][numCoords]
// dimObjects [numCoords][numObjs]
// dimClusters [numCoords][numClusters]
// newClusters [numCoords][numClusters]
// deviceObjects [numCoords][numObjs]
// deviceClusters [numCoords][numClusters]
// ----------------------------------------
//
/* return an array of cluster centers of size [numClusters][numCoords] */
/*
 * Run k-means clustering, using the GPU for the nearest-cluster assignment
 * and the host for recomputing the cluster centres.
 *
 * Parameters:
 *   objects         in:  [numObjs][numCoords] data objects
 *   numCoords       number of features per object
 *   numObjs         number of objects
 *   numClusters     number of clusters; must satisfy
 *                   0 < numClusters <= numObjs because the first numClusters
 *                   objects are used as the initial centres
 *   threshold       stop once the fraction of objects that changed cluster
 *                   in an iteration is <= threshold
 *   membership      out: [numObjs] cluster index assigned to each object
 *   loop_iterations out: number of iterations actually executed
 *
 * Returns a newly allocated [numClusters][numCoords] array of cluster
 * centres (allocated with malloc2D; the caller in this project releases it
 * with free(clusters[0]); free(clusters);).
 *
 * NOTE(review): compute_delta is launched as a single block of
 * nextPowerOfTwo(numClusterBlocks) threads, so inputs large enough to push
 * that above the device's maximum threads per block will fail at launch and
 * be reported by checkLastCudaError().
 */
float** cuda_kmeans(float **objects,      /* in: [numObjs][numCoords] */
                    int     numCoords,    /* no. features */
                    int     numObjs,      /* no. objects */
                    int     numClusters,  /* no. clusters */
                    float   threshold,    /* % objects change membership */
                    int    *membership,   /* out: [numObjs] */
                    int    *loop_iterations)
{
    int      i, j, index, loop=0;
    int     *newClusterSize; /* [numClusters]: no. objects assigned in each
                                new cluster */
    float    delta;          /* % of objects change their clusters */
    float  **dimObjects;
    float  **clusters;       /* out: [numClusters][numCoords] */
    float  **dimClusters;
    float  **newClusters;    /* [numCoords][numClusters] */

    float *deviceObjects;
    float *deviceClusters;
    int *deviceMembership;
    int *deviceIntermediates;

    //  The initial centres are copied from the first numClusters objects, so
    //  more clusters than objects would read past the end of dimObjects.
    assert(numClusters > 0 && numClusters <= numObjs);

    //  Byte counts are computed in size_t so that numObjs * numCoords cannot
    //  overflow int arithmetic before being widened.
    const size_t objectsBytes    = (size_t)numObjs * numCoords * sizeof(float);
    const size_t clustersBytes   = (size_t)numClusters * numCoords * sizeof(float);
    const size_t membershipBytes = (size_t)numObjs * sizeof(int);

    //  Copy objects given in [numObjs][numCoords] layout to new
    //  [numCoords][numObjs] layout
    malloc2D(dimObjects, numCoords, numObjs, float);
    for (i = 0; i < numCoords; i++) {
        for (j = 0; j < numObjs; j++) {
            dimObjects[i][j] = objects[j][i];
        }
    }

    /* pick first numClusters elements of objects[] as initial cluster centers*/
    malloc2D(dimClusters, numCoords, numClusters, float);
    for (i = 0; i < numCoords; i++) {
        for (j = 0; j < numClusters; j++) {
            dimClusters[i][j] = dimObjects[i][j];
        }
    }

    /* initialize membership[] */
    for (i=0; i<numObjs; i++) membership[i] = -1;

    /* need to initialize newClusterSize and newClusters[0] to all 0 */
    newClusterSize = (int*) calloc(numClusters, sizeof(int));
    assert(newClusterSize != NULL);

    malloc2D(newClusters, numCoords, numClusters, float);
    memset(newClusters[0], 0, clustersBytes);

    //  To support reduction, numThreadsPerClusterBlock *must* be a power of
    //  two, and it *must* be no larger than the largest value an unsigned
    //  char can hold (255), because the kernel sums one membership-change
    //  flag per thread in unsigned char storage.
    const unsigned int numThreadsPerClusterBlock = 128;
    const unsigned int numClusterBlocks =
        (numObjs + numThreadsPerClusterBlock - 1) / numThreadsPerClusterBlock;
#if BLOCK_SHARED_MEM_OPTIMIZATION
    const unsigned int clusterBlockSharedDataSize =
        numThreadsPerClusterBlock * sizeof(unsigned char) +
        numClusters * numCoords * sizeof(float);

    cudaDeviceProp deviceProp;
    int deviceNum;
    checkCuda(cudaGetDevice(&deviceNum));
    checkCuda(cudaGetDeviceProperties(&deviceProp, deviceNum));

    if (clusterBlockSharedDataSize > deviceProp.sharedMemPerBlock) {
        err("WARNING: Your CUDA hardware has insufficient block shared memory. "
            "You need to recompile with BLOCK_SHARED_MEM_OPTIMIZATION=0. "
            "See the README for details.\n");
    }
#else
    const unsigned int clusterBlockSharedDataSize =
        numThreadsPerClusterBlock * sizeof(unsigned char);
#endif

    const unsigned int numReductionThreads =
        nextPowerOfTwo(numClusterBlocks);
    const unsigned int reductionBlockSharedDataSize =
        numReductionThreads * sizeof(unsigned int);

    checkCuda(cudaMalloc(&deviceObjects, objectsBytes));
    checkCuda(cudaMalloc(&deviceClusters, clustersBytes));
    checkCuda(cudaMalloc(&deviceMembership, membershipBytes));
    checkCuda(cudaMalloc(&deviceIntermediates, numReductionThreads*sizeof(unsigned int)));

    checkCuda(cudaMemcpy(deviceObjects, dimObjects[0],
              objectsBytes, cudaMemcpyHostToDevice));
    checkCuda(cudaMemcpy(deviceMembership, membership,
              membershipBytes, cudaMemcpyHostToDevice));

    do {
        //  Upload the current centres, which the host recomputes below.
        checkCuda(cudaMemcpy(deviceClusters, dimClusters[0],
                  clustersBytes, cudaMemcpyHostToDevice));

        find_nearest_cluster
            <<< numClusterBlocks, numThreadsPerClusterBlock, clusterBlockSharedDataSize >>>
            (numCoords, numObjs, numClusters,
             deviceObjects, deviceClusters, deviceMembership, deviceIntermediates);

        cudaDeviceSynchronize(); checkLastCudaError();

        compute_delta <<< 1, numReductionThreads, reductionBlockSharedDataSize >>>
            (deviceIntermediates, numClusterBlocks, numReductionThreads);

        cudaDeviceSynchronize(); checkLastCudaError();

        //  deviceIntermediates[0] now holds the number of objects that
        //  changed cluster in this iteration.
        int d;
        checkCuda(cudaMemcpy(&d, deviceIntermediates,
                  sizeof(int), cudaMemcpyDeviceToHost));
        delta = (float)d;

        checkCuda(cudaMemcpy(membership, deviceMembership,
                  membershipBytes, cudaMemcpyDeviceToHost));

        for (i=0; i<numObjs; i++) {
            /* find the array index of nearest cluster center */
            index = membership[i];

            /* update new cluster centers : sum of objects located within */
            newClusterSize[index]++;
            for (j=0; j<numCoords; j++)
                newClusters[j][index] += objects[i][j];
        }

        //  TODO: Flip the nesting order
        //  TODO: Change layout of newClusters to [numClusters][numCoords]
        /* average the sum and replace old cluster centers with newClusters */
        for (i=0; i<numClusters; i++) {
            for (j=0; j<numCoords; j++) {
                /* an empty cluster keeps its previous centre */
                if (newClusterSize[i] > 0)
                    dimClusters[j][i] = newClusters[j][i] / newClusterSize[i];
                newClusters[j][i] = 0.0;   /* set back to 0 */
            }
            newClusterSize[i] = 0;   /* set back to 0 */
        }

        delta /= numObjs;

        //  Count the iteration unconditionally so that the reported total is
        //  exact whether the loop ends on convergence or on the cap. The cap
        //  allows at most 501 iterations, the same as the original
        //  `loop++ < 500` test.
        loop++;
    } while (delta > threshold && loop <= 500);

    *loop_iterations = loop;

    /* allocate a 2D space for returning variable clusters[] (coordinates
       of cluster centers) */
    malloc2D(clusters, numClusters, numCoords, float);
    for (i = 0; i < numClusters; i++) {
        for (j = 0; j < numCoords; j++) {
            clusters[i][j] = dimClusters[j][i];
        }
    }

    checkCuda(cudaFree(deviceObjects));
    checkCuda(cudaFree(deviceClusters));
    checkCuda(cudaFree(deviceMembership));
    checkCuda(cudaFree(deviceIntermediates));

    free(dimObjects[0]);
    free(dimObjects);
    free(dimClusters[0]);
    free(dimClusters);
    free(newClusters[0]);
    free(newClusters);
    free(newClusterSize);

    return clusters;
}
Yes, I expected that. It doesn’t change my advice.
The cuda_kmeans code doesn’t seem to include unistd.h, so presumably it will compile properly with nvcc.
The cuda_main.cpp code should be compilable with gcc (I think.)
Then you link the two compiled objects together.
Something like this should already be happening in your makefile. I’m suggesting that you change the compiler used for cuda_main.cpp to gcc (and rename cuda_main.cu to cuda_main.cpp). Leave the compilation process for cuda_kmeans.cu alone.
The whole program has 4 files: cuda_main.cu, cuda_io.cu, cuda_wtime.cu and cuda_kmeans.cu.
For specifics, you can get the code from here: [url]https://github.com/serban/kmeans[/url]
I have already renamed the file to .cpp, compiled it with gcc and changed the Makefile.
First there was an error with cuda_io.cu, so I changed it to cuda_io.cpp;
then an error with cuda_wtime.cu, so I changed it to cuda_wtime.cpp,
and it still cannot be compiled.
I have three versions of this kmeans program: the CUDA version, the MPI version and the sequential version.
Both the MPI and the sequential versions include unistd.h, and they compile successfully and run.
Any help? Should I change the compiler? To what?
that source code above is for calling “cuda_kmeans” lines 90
this is the cuda_kmeans code?
I’m suggesting you try something similar in your cygwin environment. If it does not work, then I would suggest switching to an ordinary linux environment. That is what the project was originally set up for.
Then I used cmd (as admin). All the commands work, but nvcc -c cuda_kmeans.cu produces an ‘obj’ file, not an ‘o’ file.
I get an error message when I run the last command:
g++ -o test cuda_io.o cuda_wtime.o cuda_main.o cuda_kmeans.o -L/usr/local/cuda/lib64 -lcudart
It says: cuda_kmeans.o: No such file or directory
When I change the command to
g++ -o test cuda_io.o cuda_wtime.o cuda_main.o cuda_kmeans.obj -L/usr/local/cuda/lib64 -lcudart
it reports another error, as shown in the picture: External Media
I indicated that what I had done was on linux. You will need to modify it for cygwin. You’ve already discovered the .o → .obj difference on linux vs. windows (for nvcc). Note that the nvcc you are using on windows is designed to be compatible with visual studio, not cygwin (which is why g++ in this case creates the .o file but nvcc creates the .obj file).
You also need to fix this path:
-L/usr/local/cuda/lib64
to match whatever is the path to your cudart.lib file on your machine.
If you’re not sure what that path is, use the windows file search utility to locate it.
Even if you make that change, it’s possible that it may still not work. As I mentioned, nvcc is designed to be compatible with visual studio on windows, not cygwin. There may be other linker discrepancies or other issues that prevent it from working properly.
The solution would be to switch to a supported linux environment.
So what base system are you using for this project? I mean, which Linux distribution did you use? Do you compile from the Linux terminal or from another environment? And which CUDA SDK did you use? Thank you.
I have installed Ubuntu 14.04 LTS and CUDA toolkit 4.0.
Everything works until the command
nvcc -c cuda_kmeans.cu
It says:
dimas@dimas-pc:~/Documents/kmeans-master$ nvcc -c cuda_kmeans.cu
In file included from /usr/local/cuda/bin/../include/cuda_runtime.h:59:0,
from <command-line>:0:
/usr/local/cuda/bin/../include/host_config.h:82:2: error: #error -- unsupported GNU version! gcc 4.5 and up are not supported!
#error -- unsupported GNU version! gcc 4.5 and up are not supported!
^
What should I do?
I have downloaded the CUDA toolkit .deb file (902 MB), ‘cuda-repo-ubuntu1404-7-0-local_7.0-28_amd64.deb’.
I followed the installation instructions PDF for Ubuntu.
First, cd to the folder where the file is, then:
-sudo dpkg -i cuda-repo-ubuntu1404-7-0-local_7.0-28_amd64.deb – works
-sudo apt-get update – it works, but the output reports some failures, such as:
Reading package lists… Done
Building dependency tree
Reading state information… Done
Some packages could not be installed. This may mean that you have
requested an impossible situation or if you are using the unstable
distribution that some required packages have not yet been created
or been moved out of Incoming.
The following information may help to resolve the situation:
The following packages have unmet dependencies:
unity-control-center : Depends: libcheese-gtk23 (>= 3.4.0) but it is not going to be installed
Depends: libcheese7 (>= 3.0.1) but it is not going to be installed
E: Error, pkgProblemResolver::Resolve generated breaks, this may be caused by held packages.
Finally I can compile the CUDA files without renaming them to .cpp.
I compile directly using ‘make cuda’.
Lastly, I want to change the program from k-means to fuzzy c-means. Any advice?
And thank you, txbob, for helping me.