Segmentation fault when using addPlugin() in TensorRT 2.1

I am having trouble using INetworkDefinition::addPlugin(). I want to add a Reshape plugin when building a CNN through the API rather than the Caffe parser.

I wrote a plugin demo, shown below. It segfaults in nvinfer1::cudnn::PluginLayer::serializeParams(). Could anyone help?

#include "NvInfer.h"
#include "cuda_runtime_api.h"
#include <cassert>
#include <cmath>
#include <ctime>
#include <cstring>
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <sys/stat.h>
#include <vector>
#include <algorithm>

#define CHECK(status)                                    \
{                                                        \
    if (status != 0)                                    \
    {                                                    \
        std::cout << "Cuda failure: " << status << std::endl;    \
        abort();                                        \
    }                                                    \
}

// stuff we know about the network and the input/output blobs
static const int INPUT_H = 4;
static const int INPUT_W = 4;
static const int OUTPUT_SIZE = INPUT_H * INPUT_W;

const char* INPUT_BLOB_NAME = "input";
const char* OUTPUT_BLOB_NAME = "output";

using namespace nvinfer1;
using namespace std;

// Logger for GIE info/warning/errors
class Logger : public nvinfer1::ILogger            
{
    public:
    void log(nvinfer1::ILogger::Severity severity, const char* msg) override
    {
        // suppress info-level messages
        if (severity == Severity::kINFO) return;

        switch (severity)
        {
            case Severity::kINTERNAL_ERROR: std::cerr << "INTERNAL_ERROR: "; break;
            case Severity::kERROR: std::cerr << "ERROR: "; break;
            case Severity::kWARNING: std::cerr << "WARNING: "; break;
            case Severity::kINFO: std::cerr << "INFO: "; break;
            default: std::cerr << "UNKNOWN: "; break;
        }
        std::cerr << msg << std::endl;
    }
};

static Logger gLogger;

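// Identity "Reshape" plugin: the output has the same CHW dimensions as the
// input, and enqueue() simply copies the input buffer to the output buffer.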
class Reshape : public IPlugin
{
public:
    Reshape() {}
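    // Deserialization constructor: restores mCopySize from a serialized plugin buffer.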
    Reshape(const void* buffer, size_t size)
    {
        assert(size == sizeof(mCopySize));
        mCopySize = *reinterpret_cast<const size_t*>(buffer);
    }

    int getNbOutputs() const override
    {
        cout << "getNbOutputs" << endl;
        return 1;
    }
    Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override
    {
        cout << "getOutputDimensions" << endl;
        assert(nbInputDims == 1);
        assert(index == 0);
        assert(inputs[index].nbDims == 3);
        return DimsCHW(inputs[0].d[0], inputs[0].d[1], inputs[0].d[2]);
    }

    int initialize() override
    {
        cout << "initialize" << endl;
        return 0;
    }

    void terminate() override
    {
        cout << "terminate" << endl;
    }

    size_t getWorkspaceSize(int) const override
    {
        cout << "getWorkspaceSize" << endl;
        return 0;
    }

    // currently it is not possible for a plugin to execute "in place". Therefore we memcpy the data from the input to the output buffer
    int enqueue(int batchSize, const void*const *inputs, void** outputs, void*, cudaStream_t stream) override
    {
        cout << "enqueue" << endl;
        CHECK(cudaMemcpyAsync(outputs[0], inputs[0], mCopySize * batchSize, cudaMemcpyDeviceToDevice, stream));
        return 0;
    }

    size_t getSerializationSize() override
    {
        cout << "getSerializationSize" << endl;
        return sizeof(mCopySize);
    }

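    // Called when the engine is serialized; writes mCopySize into the buffer provided by TensorRT.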
    void serialize(void* buffer) override
    {
        cout << "serialize" << endl;
        *reinterpret_cast<size_t*>(buffer) = mCopySize;
    }

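    // Called once at build time with the resolved tensor dimensions; record the
    // number of bytes to copy per batch item.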
    void configure(const Dims* inputs, int nbInputs, const Dims* outputs, int nbOutputs, int) override
    {
        cout << "configure" << endl;
        mCopySize = inputs[0].d[0] * inputs[0].d[1] * inputs[0].d[2] * sizeof(float);
    }

protected:
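    // Bytes to copy per batch item; set in configure() or restored by the deserialization constructor.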
    size_t mCopySize;
};

// Create the engine using only the API and not any parser.
ICudaEngine *
createMNISTEngine(unsigned int maxBatchSize, IBuilder *builder, DataType dt)
{
    INetworkDefinition* network = builder->createNetwork();

    // Create input of shape { 1, INPUT_H, INPUT_W } with name referenced by INPUT_BLOB_NAME
    auto data = network->addInput(INPUT_BLOB_NAME, dt, DimsCHW{ 1, INPUT_H, INPUT_W});
    assert(data != nullptr);

    // Create a scale layer with default power/shift and specified scale parameter.
    float scale_param = 2.0;
    Weights power{DataType::kFLOAT, nullptr, 0};
    Weights shift{DataType::kFLOAT, nullptr, 0};
    Weights scale{DataType::kFLOAT, &scale_param, 1};
    auto scale_1 = network->addScale(*data,  ScaleMode::kUNIFORM, shift, scale, power);
    assert(scale_1 != nullptr);

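    // Construct the Reshape plugin through its (const void* buffer, size_t size)
    // constructor from a raw size_t buffer, then attach it to the network with
    // INetworkDefinition::addPlugin().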
    ITensor* plugin_in = scale_1->getOutput(0);
    size_t reshape_size = 0;
    Reshape reshape(&reshape_size, sizeof(reshape_size));
    auto plugin = network->addPlugin(&plugin_in, 1, reshape);

    auto res = plugin->getOutput(0);
    // Mark the plugin output as the network output.
    res->setName(OUTPUT_BLOB_NAME);
    network->markOutput(*res);

    // Build the engine
    builder->setMaxBatchSize(maxBatchSize);
    builder->setMaxWorkspaceSize(1 << 20);

    auto engine = builder->buildCudaEngine(*network);

    // engine->serialize();
    
    // we don't need the network any more
    network->destroy();

    return engine;
}

void APIToModel(unsigned int maxBatchSize, // batch size - NB: must be at least as large as the batch we want to run with
                IHostMemory** modelStream)
{
    // create the builder
    IBuilder* builder = createInferBuilder(gLogger);

    // create the model to populate the network, then set the outputs and create an engine
    ICudaEngine* engine = createMNISTEngine(maxBatchSize, builder, DataType::kFLOAT);

    assert(engine != nullptr);

    // serialize the engine, then close everything down
    (*modelStream) = engine->serialize();
    engine->destroy();
    builder->destroy();
}

void doInference(IExecutionContext& context, float* input, float* output, int batchSize)
{
    const ICudaEngine& engine = context.getEngine();
    // input and output buffer pointers that we pass to the engine - the engine requires exactly
    // IEngine::getNbBindings() of these, but in this case we know that there is exactly one input and one output.
    assert(engine.getNbBindings() == 2);
    void* buffers[2];

    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // note that indices are guaranteed to be less than IEngine::getNbBindings()
    int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME), 
        outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);

    // create GPU buffers and a stream
    CHECK(cudaMalloc(&buffers[inputIndex], batchSize * INPUT_H * INPUT_W * sizeof(float)));
    CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float)));

    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // DMA the input to the GPU,  execute the batch asynchronously, and DMA it back:
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));
    context.enqueue(batchSize, buffers, stream, nullptr);
    CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE*sizeof(float), cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);

    // release the stream and the buffers
    cudaStreamDestroy(stream);
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));
}

int main(int argc, char** argv)
{
    // create a model using the API directly and serialize it to a stream
    IHostMemory *modelStream{nullptr};

    APIToModel(1, &modelStream);

    float data[INPUT_H*INPUT_W];
    cout << "input: " << endl;
    for (int i = 0; i < INPUT_H*INPUT_W; i++) {
        data[i] = i - 10;
        cout << data[i] << " ";
    }
    std::cout << "\n\n";

    IRuntime* runtime = createInferRuntime(gLogger);
    ICudaEngine* engine = runtime->deserializeCudaEngine(modelStream->data(), modelStream->size(), nullptr);
    if (modelStream) modelStream->destroy();

    IExecutionContext *context = engine->createExecutionContext();

    // run inference
    float prob[OUTPUT_SIZE];
    doInference(*context, data, prob, 1);

    // destroy the engine
    context->destroy();
    engine->destroy();
    runtime->destroy();

    cout << "output: " << endl;
    for (unsigned int i = 0; i < OUTPUT_SIZE; i++)
    {
        cout << prob[i] << " ";
    }
    std::cout << std::endl;
    return 0;
}