I am having trouble using INetwork::addPlugin(). I want to add a Reshape plugin when building a CNN through the API rather than through the Caffe parser.
I wrote a small plugin demo, pasted below. It segfaults inside nvinfer1::cudnn::PluginLayer::serializeParams(). Could anyone help?
#include "NvInfer.h"
#include "cuda_runtime_api.h"
#include <cassert>
#include <cmath>
#include <ctime>
#include <cstring>
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <sys/stat.h>
#include <vector>
#include <algorithm>
#define CHECK(status) \
{ \
    if (status != 0) \
    { \
        std::cout << "Cuda failure: " << status; \
        abort(); \
    } \
}
// stuff we know about the network and the input/output blobs
static const int INPUT_H = 4;
static const int INPUT_W = 4;
static const int OUTPUT_SIZE = INPUT_H * INPUT_W;
const char* INPUT_BLOB_NAME = "input";
const char* OUTPUT_BLOB_NAME = "output";
using namespace nvinfer1;
using namespace std;
// Logger for GIE info/warning/errors
class Logger : public nvinfer1::ILogger
{
public:
    void log(nvinfer1::ILogger::Severity severity, const char* msg) override
    {
        // suppress info-level messages
        if (severity == Severity::kINFO) return;
        switch (severity)
        {
            case Severity::kINTERNAL_ERROR: std::cerr << "INTERNAL_ERROR: "; break;
            case Severity::kERROR: std::cerr << "ERROR: "; break;
            case Severity::kWARNING: std::cerr << "WARNING: "; break;
            case Severity::kINFO: std::cerr << "INFO: "; break;
            default: std::cerr << "UNKNOWN: "; break;
        }
        std::cerr << msg << std::endl;
    }
};
static Logger gLogger;
class Reshape : public IPlugin
{
public:
    Reshape() {}
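    // Deserialization constructor: restores mCopySize from a serialized engine blob.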
    Reshape(const void* buffer, size_t size)
    {
        assert(size == sizeof(mCopySize));
        mCopySize = *reinterpret_cast<const size_t*>(buffer);
    }
    int getNbOutputs() const override
    {
        cout << "getNbOutputs" << endl;
        return 1;
    }
    Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override
    {
        cout << "getOutputDimensions" << endl;
        assert(nbInputDims == 1);
        assert(index == 0);
        assert(inputs[index].nbDims == 3);
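        // Identity "reshape" for this demo: the output dims equal the input dims.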
        return DimsCHW(inputs[0].d[0], inputs[0].d[1], inputs[0].d[2]);
    }
    int initialize() override
    {
        cout << "initialize" << endl;
        return 0;
    }
    void terminate() override
    {
        cout << "terminate" << endl;
    }
    size_t getWorkspaceSize(int) const override
    {
        cout << "getWorkspaceSize" << endl;
        return 0;
    }
    // currently it is not possible for a plugin to execute "in place", so we memcpy the data from the input buffer to the output buffer
    int enqueue(int batchSize, const void* const* inputs, void** outputs, void*, cudaStream_t stream) override
    {
        cout << "enqueue" << endl;
        CHECK(cudaMemcpyAsync(outputs[0], inputs[0], mCopySize * batchSize, cudaMemcpyDeviceToDevice, stream));
        return 0;
    }
    size_t getSerializationSize() override
    {
        cout << "getSerializationSize" << endl;
        return sizeof(mCopySize);
    }
    void serialize(void* buffer) override
    {
        cout << "serialize" << endl;
        *reinterpret_cast<size_t*>(buffer) = mCopySize;
    }
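    // configure() runs once at build time, before any serialization; it is the
    // only place mCopySize gets computed when the plugin is built from the API.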
    void configure(const Dims* inputs, int nbInputs, const Dims* outputs, int nbOutputs, int) override
    {
        cout << "configure" << endl;
        mCopySize = inputs[0].d[0] * inputs[0].d[1] * inputs[0].d[2] * sizeof(float);
    }
protected:
    size_t mCopySize;
};
// Create the engine using only the API and not any parser.
ICudaEngine* createMNISTEngine(unsigned int maxBatchSize, IBuilder* builder, DataType dt)
{
    INetworkDefinition* network = builder->createNetwork();
    // Create input of shape { 1, INPUT_H, INPUT_W } with the name referenced by INPUT_BLOB_NAME
    auto data = network->addInput(INPUT_BLOB_NAME, dt, DimsCHW{1, INPUT_H, INPUT_W});
    assert(data != nullptr);
    // Create a scale layer with default power/shift and a specified scale parameter.
    float scale_param = 2.0f;
    Weights power{DataType::kFLOAT, nullptr, 0};
    Weights shift{DataType::kFLOAT, nullptr, 0};
    Weights scale{DataType::kFLOAT, &scale_param, 1};
    auto scale_1 = network->addScale(*data, ScaleMode::kUNIFORM, shift, scale, power);
    assert(scale_1 != nullptr);
    auto plugin_in = scale_1->getOutput(0);
    // Construct the plugin via its (const void* buffer, size_t size) constructor;
    // configure() recomputes mCopySize at build time anyway.
    size_t reshape_size = 0;
    Reshape reshape(&reshape_size, sizeof(size_t));
    auto plugin = network->addPlugin(&plugin_in, 1, reshape);
    assert(plugin != nullptr);
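    // NOTE: "reshape" is a stack local here; I am not sure whether the engine keeps
    // a pointer to it after this function returns (engine->serialize() only runs
    // later, in APIToModel); see my question at the bottom of this post.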
    auto res = plugin->getOutput(0);
    // Mark the plugin output as the network output.
    res->setName(OUTPUT_BLOB_NAME);
    network->markOutput(*res);
    // Build the engine
    builder->setMaxBatchSize(maxBatchSize);
    builder->setMaxWorkspaceSize(1 << 20);
    auto engine = builder->buildCudaEngine(*network);
    // engine->serialize();
    // we don't need the network any more
    network->destroy();
    return engine;
}
void APIToModel(unsigned int maxBatchSize, // batch size - NB must be at least as large as the batch we want to run with
                IHostMemory** modelStream)
{
    // create the builder
    IBuilder* builder = createInferBuilder(gLogger);
    // create the model to populate the network, then set the outputs and create an engine
    ICudaEngine* engine = createMNISTEngine(maxBatchSize, builder, DataType::kFLOAT);
    assert(engine != nullptr);
    // serialize the engine, then close everything down
    (*modelStream) = engine->serialize();
    engine->destroy();
    builder->destroy();
}
void doInference(IExecutionContext& context, float* input, float* output, int batchSize)
{
    const ICudaEngine& engine = context.getEngine();
    // input and output buffer pointers that we pass to the engine - the engine requires
    // exactly IEngine::getNbBindings() of these, but in this case we know that there is
    // exactly one input and one output.
    assert(engine.getNbBindings() == 2);
    void* buffers[2];
    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings().
    int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME),
        outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);
    // create GPU buffers and a stream
    CHECK(cudaMalloc(&buffers[inputIndex], batchSize * INPUT_H * INPUT_W * sizeof(float)));
    CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float)));
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));
    // DMA the input to the GPU, execute the batch asynchronously, and DMA the result back:
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));
    context.enqueue(batchSize, buffers, stream, nullptr);
    CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
    // release the stream and the buffers
    cudaStreamDestroy(stream);
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));
}
int main(int argc, char** argv)
{
    // create a model using the API directly and serialize it to a stream
    IHostMemory* modelStream{nullptr};
    APIToModel(1, &modelStream);
    float data[INPUT_H * INPUT_W];
    cout << "input: " << endl;
    for (int i = 0; i < INPUT_H * INPUT_W; i++) {
        data[i] = i - 10;
        cout << data[i] << " ";
    }
    std::cout << "\n\n";
    IRuntime* runtime = createInferRuntime(gLogger);
    ICudaEngine* engine = runtime->deserializeCudaEngine(modelStream->data(), modelStream->size(), nullptr);
    if (modelStream) modelStream->destroy();
    IExecutionContext* context = engine->createExecutionContext();
    // run inference
    float prob[OUTPUT_SIZE];
    doInference(*context, data, prob, 1);
    // destroy the engine
    context->destroy();
    engine->destroy();
    runtime->destroy();
    cout << "output: " << endl;
    for (unsigned int i = 0; i < OUTPUT_SIZE; i++)
    {
        cout << prob[i] << " ";
    }
    std::cout << std::endl;
    return 0;
}
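While writing this up I noticed something that may be relevant: reshape is a stack local in createMNISTEngine, but engine->serialize() only runs after that function has returned, back in APIToModel. If the engine keeps a pointer to the plugin object rather than a copy, serializeParams() would be reading a destroyed object, which would explain the segfault. Here is a minimal sketch of the change I have in mind (the extra Reshape& parameter is my own naming, not part of the sample); is the plugin required to outlive the engine like this?

// Sketch: hoist the plugin out of createMNISTEngine so it is still alive when
// engine->serialize() runs. createMNISTEngine gains a Reshape& parameter and
// calls network->addPlugin(&plugin_in, 1, reshape) on the caller's object.
void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream)
{
    IBuilder* builder = createInferBuilder(gLogger);
    Reshape reshape; // lives until the end of this function, past serialize()
    ICudaEngine* engine = createMNISTEngine(maxBatchSize, builder, DataType::kFLOAT, reshape);
    assert(engine != nullptr);
    (*modelStream) = engine->serialize(); // plugin object is still alive here
    engine->destroy();
    builder->destroy();
}

Separately, I realize main() passes nullptr as the plugin factory to deserializeCudaEngine() even though the engine contains a plugin; I assume I will also need an IPluginFactory there once serialization itself works.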