A few points that have been stated already above or in various links:
-
The address of a device function, or a device symbol, cannot be taken in host code. Therefore the address of a device function must first be transferred to a device symbol variable, and then the contents of that device symbol variable can be transferred to a host copy/version of that variable. Note that this seems to also preclude the passing of a device symbol by-value as a function argument.
-
CUDA API routines such as cudaMemcpyFromSymbol cannot be safely called in constructors (or destructors) of objects at global scope. This is due to CUDA lazy initialization.
With the above provisos in mind, one possible approach to allow for your code to compile and run without error is as follows:
$ cat Functions.h
/*
#-------------------------------------------------------------------------------
# Copyright (c) 2012 Daniel <dgrat> Frenzel.
# All rights reserved. This program and the accompanying materials
# are made available under the terms of the GNU Lesser Public License v2.1
# which accompanies this distribution, and is available at
# http://www.gnu.org/licenses/old-licenses/gpl-2.0.html
#
# Contributors:
# Daniel <dgrat> Frenzel - initial API and implementation
#-------------------------------------------------------------------------------
*/
#ifndef TRANSFERFUNCTIONS_H_
#define TRANSFERFUNCTIONS_H_
#ifndef SWIG
#include <cmath>
#include <stdio.h>
#include <string.h>
#endif
#define PI 3.14159265358979323846f
typedef float (*pDistanceFu) (float, float);
typedef float (*pDecayFu) (float, float, float);
//////////////////////////////////////////////////////////////////////////////////////////////
/*
* Distance functions for self organizing maps
*/
//////////////////////////////////////////////////////////////////////////////////////////////
#ifdef __CUDACC__
__host__ __device__
#endif
inline static float
fcn_bubble_nhood (float dist, float sigmaT) {
if(dist < sigmaT)
return 1.f;
else return 0.f;
}
//////////////////////////////////////////////////////////////////////////////////////////////
#ifdef __CUDACC__
__host__ __device__
#endif
inline static float
fcn_gaussian_nhood (float dist, float sigmaT) {
return exp(-pow(dist, 2.f)/(2.f*pow(sigmaT, 2.f)));
}
#ifdef __CUDACC__
__device__
#endif
float (*d_fcn_gaussian_nhood)(float dist, float sigmaT) = fcn_gaussian_nhood;
//////////////////////////////////////////////////////////////////////////////////////////////
#ifdef __CUDACC__
__host__ __device__
#endif
inline static float
fcn_cutgaussian_nhood (float dist, float sigmaT) {
if(dist < sigmaT)
return exp(-pow(dist, 2.f)/(2.f*pow(sigmaT, 2.f)));
else return 0.f;
}
//////////////////////////////////////////////////////////////////////////////////////////////
#ifdef __CUDACC__
__host__ __device__
#endif
inline static float
fcn_mexican_nhood (float dist, float sigmaT) {
return 2.f/(sqrt(3.f * sigmaT) * pow(PI, 0.25f) ) *
(1.f-pow(dist, 2.f) / pow(sigmaT, 2.f) ) *
fcn_gaussian_nhood(dist, sigmaT);
}
//////////////////////////////////////////////////////////////////////////////////////////////
#ifdef __CUDACC__
__host__ __device__
#endif
inline static float
fcn_epanechicov_nhood (float dist, float sigmaT) {
float fVal = 1 - pow(dist/sigmaT, 2.f);
if(fVal > 0)
return fVal;
else return 0.f;
}
//////////////////////////////////////////////////////////////////////////////////////////////
#ifdef __CUDACC__
__host__ __device__
#endif
inline static float
fcn_rad_decay (float sigma0, float T, float lambda) {
return std::floor(sigma0*exp(-T/lambda) + 0.5f);
}
#ifdef __CUDACC__
__device__
#endif
float (*d_fcn_rad_decay)(float sigma0, float T, float lambda) = fcn_rad_decay;
//////////////////////////////////////////////////////////////////////////////////////////////
#ifdef __CUDACC__
__host__ __device__
#endif
inline static float
fcn_lrate_decay (float sigma0, float T, float lambda) {
return sigma0*exp(-T/lambda);
}
#ifdef __CUDACC__
__device__
#endif
float (*d_fcn_lrate_decay)(float sigma0, float T, float lambda) = fcn_lrate_decay;
/**
* @class DistFunction
* @brief Represents a neighborhood and decay function.
* Consists of a distance and a decay function.
* Normally just the neighborhood function is free to be changed.
*/
class DistFunction;
typedef float (*pDistanceFu) (float, float);
typedef float (*pDecayFu) (float, float, float);
typedef float (DistFunction::*pmDistanceFu) (float, float);
typedef float (DistFunction::*pmDecayFu) (float, float, float);
class DistFunction {
private:
pDistanceFu hDist;
pDecayFu hRadDecay;
pDecayFu hLRateDecay;
public:
DistFunction(char *, pDistanceFu, pDecayFu, pDecayFu);
void Assign();
char *name;
pDistanceFu distance;
pDecayFu rad_decay;
pDecayFu lrate_decay;
};
void test();
#endif /* TRANSFERFUNCTIONS_H_ */
$ cat Functions.cu
//#include <iostream>
#include "Functions.h"
#include <iostream>
#include <thrust/extrema.h>
#include <thrust/distance.h>
#include <thrust/device_vector.h>
DistFunction::DistFunction(char *cstr, pDistanceFu dist, pDecayFu rad, pDecayFu lrate) : name(cstr), distance(dist), rad_decay(rad), lrate_decay(lrate) {
}
void DistFunction::Assign() {
pDistanceFu hDist;
pDecayFu hRadDecay;
pDecayFu hLRateDecay;
cudaMemcpyFromSymbol(&hDist, distance, sizeof(pDistanceFu) );
cudaMemcpyFromSymbol(&hRadDecay, rad_decay, sizeof(pDecayFu) );
cudaMemcpyFromSymbol(&hLRateDecay, lrate_decay, sizeof(pDecayFu) );
distance = hDist;
rad_decay = hRadDecay;
lrate_decay = hLRateDecay;
}
struct sm20lrate_decay_functor {
float fCycle;
float fCycles;
DistFunction m_pfunc;
sm20lrate_decay_functor(const DistFunction &pfunc, float cycle, float cycles) : m_pfunc(pfunc), fCycle(cycle), fCycles(cycles) {}
__host__ __device__
float operator()(float lrate) {
return (m_pfunc.lrate_decay)(lrate, fCycle, fCycles);
}
};
void test() {
unsigned int iWidth = 4096;
thrust::device_vector<float> dvLearningRate(iWidth, 0.f);
thrust::device_vector<float> dvLRate(iWidth, 0.f);
float (*h_fcn_gaussian_nhood)(float dist, float sigmaT);
float (*h_fcn_rad_decay)(float sigma0, float T, float lambda);
float (*h_fcn_lrate_decay)(float sigma0, float T, float lambda);
cudaMemcpyFromSymbol(&h_fcn_gaussian_nhood, d_fcn_gaussian_nhood, sizeof(pDistanceFu) );
cudaMemcpyFromSymbol(&h_fcn_rad_decay, d_fcn_rad_decay, sizeof(pDecayFu) );
cudaMemcpyFromSymbol(&h_fcn_lrate_decay, d_fcn_lrate_decay, sizeof(pDecayFu) );
DistFunction fcn_gaussian = DistFunction(
(char*)"gaussian",
h_fcn_gaussian_nhood,
h_fcn_rad_decay,
h_fcn_lrate_decay
);
thrust::transform( dvLRate.begin(),
dvLRate.end(),
dvLearningRate.begin(),
sm20lrate_decay_functor(fcn_gaussian, 1, 100) );
}
$ nvcc -o f Functions.cu main.cpp -std=c++11
$ cuda-memcheck ./f
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$
This is roughly following the first approach outlined here:
http://stackoverflow.com/questions/34879789/thrust-transform-throws-error-bulk-kernel-by-value-an-illegal-memory-access-w
I believe it should be possible to use the second approach (templating) as well, although I haven’t worked through an application of that to your code.
I’m not sure I entirely understand your style/usability complaints, so I can’t comment on whether or not such modifications meet your sense of decorum. I suspect your objections revolve around passing a device symbol as a function argument.