Hey Aasta,
I've been using this GitHub repo: https://github.com/NVIDIA-AI-IOT/tf_to_trt_image_classification with the following edits made to the uff_to_plan.cpp file:
#include <iostream>
#include <string>
#include <sstream>
#include <fstream>
#include <NvInfer.h>
#include <NvUffParser.h>
using namespace std;
using namespace nvinfer1;
using namespace nvuffparser;
// Logger handed to the TensorRT factory functions: every message is written
// to stdout on its own line, whatever its severity.
class Logger : public ILogger
{
    void log(Severity /*severity*/, const char *msg) override
    {
        std::cout << msg << std::endl;
    }
};

Logger gLogger;
// Parses the leading integer of `value` using stream extraction.
// Leading whitespace is skipped and trailing characters are ignored; when no
// integer can be extracted the stream stores 0 (C++11 semantics), which is
// what gets returned.
int toInteger(string value)
{
    std::istringstream reader(value);
    int parsed = 0;
    reader >> parsed;
    return parsed;
}
// Maps the command-line spelling of a precision ("float" or "half") to the
// matching DataType enumerator.
// Throws runtime_error for any other spelling.
DataType toDataType(string value)
{
    if (value == "float")
    {
        return DataType::kFLOAT;
    }
    if (value == "half")
    {
        return DataType::kHALF;
    }
    throw std::runtime_error("Unsupported data type");
}
int main(int argc, char *argv[])
{
if (argc != 10)
{
cout << "Usage: <uff_filename> <plan_filename> <input_name> <input_height> <input_width>"
<< " <output_name> <max_batch_size> <max_workspace_size> <data_type>\n";
return 1;
}
/* parse command line arguments */
string uffFilename = argv[1];
string planFilename = argv[2];
string inputName = argv[3];
int inputHeight = toInteger(argv[4]);
int inputWidth = toInteger(argv[5]);
string outputName = argv[6];
int maxBatchSize = toInteger(argv[7]);
int maxWorkspaceSize = toInteger(argv[8]);
DataType dataType = toDataType(argv[9]);
/* parse uff */
IBuilder *builder = createInferBuilder(gLogger);
INetworkDefinition *network = builder->createNetwork();
IUffParser *parser = createUffParser();
parser->registerInput(inputName.c_str(), DimsCHW(3, inputHeight, inputWidth), UffInputOrder::kNCHW);
parser->registerOutput(outputName.c_str());
if (!parser->parse(uffFilename.c_str(), *network, dataType))
{
cout << "Failed to parse UFF\n";
builder->destroy();
parser->destroy();
network->destroy();
return 1;
}
/* build engine */
if (dataType == DataType::kHALF)
builder->setHalf2Mode(true);
builder->setMaxBatchSize(maxBatchSize);
builder->setMaxWorkspaceSize(maxWorkspaceSize);
builder->setDefaultDeviceType(DeviceType::kDLA);
builder->setDLACore(0);
builder->setFp16Mode(true);
builder->allowGPUFallback(true);
ICudaEngine *engine = builder->buildCudaEngine(*network);
/* serialize engine and write to file */
ofstream planFile;
planFile.open(planFilename);
IHostMemory *serializedEngine = engine->serialize();
planFile.write((char *)serializedEngine->data(), serializedEngine->size());
planFile.close();
/* break down */
builder->destroy();
parser->destroy();
network->destroy();
engine->destroy();
serializedEngine->destroy();
return 0;
}
I also made the following edits to the test_trt.cu file:
/**
* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
* Full license terms provided in LICENSE.md file.
*/
#include <cuda_runtime.h>

#include <algorithm>
#include <chrono>
#include <fstream>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>

#include <opencv2/opencv.hpp>
#include <NvInfer.h>
#define MS_PER_SEC 1000.0
using namespace std;
using namespace nvinfer1;
// Forward declaration: test() is declared below before TestConfig is defined.
class TestConfig;

// Pointer type shared by the preprocessing routines, which modify the float
// buffer they are given in place.
using preprocess_fn_t = void (*)(float *input, size_t channels, size_t height, size_t width, int batchsize);

// Preprocessing routines selectable through TestConfig::PreprocessFn().
void preprocessVgg(float *input, size_t channels, size_t height, size_t width, int batchsize);
void preprocessInception(float *input, size_t channels, size_t height, size_t width, int batchsize);

// Scans `batchsize` groups of `numel` scores for their largest entries.
size_t argmax(float *input, size_t numel, int batchsize);

// Runs the timed inference described by `testConfig`.
void test(const TestConfig &testConfig);
/**
 * Command-line configuration of the benchmark.
 *
 * Every option is kept as the raw string taken from argv; the accessor
 * methods convert the numeric / enumerated ones on demand.
 */
class TestConfig
{
public:
    string imagePath;
    string planPath;
    string inputNodeName;
    string outputNodeName;
    string preprocessFnName;
    string inputHeight;
    string inputWidth;
    string numOutputCategories;
    string dataType;
    string maxBatchSize;
    string workspaceSize;
    string numRuns;
    string useMappedMemory;
    string statsPath;

    /**
     * Captures the fourteen positional arguments argv[1]..argv[14].
     *
     * Throws runtime_error (carrying the usage text) when fewer arguments are
     * supplied, rather than reading past the end of argv. Extra arguments are
     * ignored.
     */
    TestConfig(int argc, char * argv[])
    {
        const int requiredArgc = 15; // program name + 14 options
        if (argc < requiredArgc)
            throw runtime_error("Expected 14 arguments:\n" + UsageString());

        imagePath = argv[1];
        planPath = argv[2];
        inputNodeName = argv[3];
        inputHeight = argv[4];
        inputWidth = argv[5];
        outputNodeName = argv[6];
        numOutputCategories = argv[7];
        preprocessFnName = argv[8];
        numRuns = argv[9];
        dataType = argv[10];
        maxBatchSize = argv[11];
        workspaceSize = argv[12];
        useMappedMemory = argv[13];
        statsPath = argv[14];
        //imagePath = "/home/nrwv46/ml_sandbox/data/newfie0.JPEG";
    }

    /** Lists the option names, one per line, in the order they are expected on the command line. */
    static string UsageString()
    {
        string s = "";
        s += "imagePath: \n";
        s += "planPath: \n";
        s += "inputNodeName: \n";
        s += "inputHeight: \n";
        s += "inputWidth: \n";
        s += "outputNodeName: \n";
        s += "numOutputCategories: \n";
        s += "preprocessFnName: \n";
        s += "numRuns: \n";
        s += "dataType: \n";
        s += "maxBatchSize: \n";
        s += "workspaceSize: \n";
        s += "useMappedMemory: \n";
        s += "statsPath: \n";
        return s;
    }

    /** Renders every option as a "name: value" line, in command-line order. */
    string ToString() const
    {
        string s = "";
        s += "imagePath: " + imagePath + "\n";
        s += "planPath: " + planPath + "\n";
        s += "inputNodeName: " + inputNodeName + "\n";
        s += "inputHeight: " + inputHeight + "\n";
        s += "inputWidth: " + inputWidth + "\n";
        s += "outputNodeName: " + outputNodeName + "\n";
        s += "numOutputCategories: " + numOutputCategories + "\n";
        s += "preprocessFnName: " + preprocessFnName + "\n";
        s += "numRuns: " + numRuns + "\n";
        s += "dataType: " + dataType + "\n";
        s += "maxBatchSize: " + maxBatchSize + "\n";
        s += "workspaceSize: " + workspaceSize + "\n";
        s += "useMappedMemory: " + useMappedMemory + "\n";
        s += "statsPath: " + statsPath + "\n";
        return s;
    }

    /**
     * Extracts the leading integer of `value` with a string stream.
     * Yields 0 when nothing can be extracted.
     */
    static int ToInteger(string value)
    {
        int valueInt = 0;
        stringstream ss;
        ss << value;
        ss >> valueInt;
        return valueInt;
    }

    /**
     * Resolves preprocessFnName ("preprocess_vgg" or "preprocess_inception")
     * to the matching function. Throws runtime_error for any other name.
     */
    preprocess_fn_t PreprocessFn() const {
        if (preprocessFnName == "preprocess_vgg")
            return preprocessVgg;
        else if (preprocessFnName == "preprocess_inception")
            return preprocessInception;
        else
            throw runtime_error("Invalid preprocessing function name.");
    }

    int InputWidth() const { return ToInteger(inputWidth); }
    int InputHeight() const { return ToInteger(inputHeight); }
    int NumOutputCategories() const { return ToInteger(numOutputCategories); }

    /**
     * Resolves dataType ("float" or "half") to the matching enumerator.
     * Throws runtime_error for any other spelling.
     */
    nvinfer1::DataType DataType() const {
        if (dataType == "float")
            return nvinfer1::DataType::kFLOAT;
        else if (dataType == "half")
            return nvinfer1::DataType::kHALF;
        else
            throw runtime_error("Invalid data type.");
    }

    int MaxBatchSize() const { return ToInteger(maxBatchSize); }
    int WorkspaceSize() const { return ToInteger(workspaceSize); }
    int NumRuns() const { return ToInteger(numRuns); }
    int UseMappedMemory() const { return ToInteger(useMappedMemory); }
};
// Logger passed to createInferRuntime(): prints each message on its own line
// to stdout and flushes, without filtering on severity.
class Logger : public ILogger
{
    void log(Severity /*severity*/, const char *msg) override
    {
        std::cout << msg << '\n';
        std::cout.flush();
    }
};

Logger gLogger;
int main(int argc, char * argv[])
{
//if (argc != 15)
//{
// cout << TestConfig::UsageString() << endl;
// return 0;
// }
TestConfig testConfig(argc, argv);
cout << "\ntestConfig: \n" << testConfig.ToString() << endl;
test(testConfig);
return 0;
}
/**
 * Subtracts a fixed per-channel mean from a batch of planar images, in place.
 *
 * The buffer is addressed as batchsize consecutive images, each made of
 * `channels` planes of height * width floats (channel k of image b starts at
 * (b * channels + k) * height * width).
 *
 * @param tensor     buffer holding batchsize * channels * height * width floats
 * @param channels   planes per image; at most 3, since only three means exist
 * @param height     rows per plane
 * @param width      columns per plane
 * @param batchsize  number of images; nothing is done when <= 0
 * @throws runtime_error when channels > 3
 */
void preprocessVgg(float * tensor, size_t channels, size_t height, size_t width, int batchsize)
{
    const float mean[3] = { 123.68f, 116.78f, 103.94f };
    if (channels > 3)
        throw std::runtime_error("preprocessVgg supports at most 3 channels.");

    const size_t planeSize = height * width;
    const size_t imageSize = channels * planeSize;

    for (int b = 0; b < batchsize; b++)
    {
        // The batch index contributes an additive image stride; multiplying
        // the in-image offset by it would skip and double-count elements.
        float *image = tensor + static_cast<size_t>(b) * imageSize;
        for (size_t k = 0; k < channels; k++)
        {
            float *plane = image + k * planeSize;
            for (size_t p = 0; p < planeSize; p++)
                plane[p] -= mean[k];
        }
    }
}
/**
 * Rescales every element of a batch of images in place with
 * v -> 2 * (v / 255 - 0.5), which maps 0 to -1 and 255 to 1.
 *
 * @param tensor     buffer holding batchsize * channels * height * width floats
 * @param channels   channels per image
 * @param height     rows per image
 * @param width      columns per image
 * @param batchsize  number of images; nothing is done when <= 0
 */
void preprocessInception(float * tensor, size_t channels, size_t height, size_t width, int batchsize)
{
    if (batchsize <= 0)
        return;

    // Cover the whole batch, not only the first image.
    const size_t numel = static_cast<size_t>(batchsize) * channels * height * width;
    for (size_t i = 0; i < numel; i++)
        tensor[i] = static_cast<float>(2.0 * (tensor[i] / 255.0 - 0.5));
}
/**
 * Reports the best-scoring entry of every image in a batch and returns the
 * best index of the first image.
 *
 * `tensor` is read as batchsize consecutive groups of `numel` scores. For
 * each group this prints every score with its position in the whole buffer,
 * then the index (within the group) and value of the largest score, then up
 * to five of the largest scores in descending order.
 *
 * @param tensor     buffer holding batchsize * numel floats
 * @param numel      scores per image
 * @param batchsize  number of images
 * @return index within the first image of its largest score (the first such
 *         index on ties); 0 when numel is 0 or batchsize <= 0
 */
size_t argmax(float * tensor, size_t numel, int batchsize)
{
    if (numel == 0 || batchsize <= 0)
        return 0;

    size_t firstImageBest = 0;
    for (int b = 0; b < batchsize; b++)
    {
        const size_t base = static_cast<size_t>(b) * numel;
        const float *scores = tensor + base;

        // The running maximum restarts for every image so that one image's
        // peak cannot hide another's.
        size_t best = 0;
        for (size_t i = 0; i < numel; i++)
        {
            if (scores[i] > scores[best])
                best = i;
            std::cout << "val: " << scores[i] << " : " << (base + i) << '\n';
        }

        // Only the leading entries are needed, and there may be fewer than
        // five scores in total.
        std::vector<float> ranked(scores, scores + numel);
        const size_t topCount = std::min<size_t>(5, numel);
        std::partial_sort(ranked.begin(), ranked.begin() + topCount, ranked.end(),
                          [](float lhs, float rhs) { return lhs > rhs; });

        std::cout << "\nClass value for Batch: " << (b + 1) << " is: " << best << " : " << scores[best] << std::endl;
        std::cout << " Top five: ";
        for (size_t r = 0; r < topCount; r++)
            std::cout << ranked[r] << " ";

        if (b == 0)
            firstImageBest = best;
    }
    return firstImageBest;
}
void test(const TestConfig &testConfig)
{
ifstream planFile(testConfig.planPath);
stringstream planBuffer;
planBuffer << planFile.rdbuf();
string plan = planBuffer.str();
IRuntime *runtime = createInferRuntime(gLogger);
ICudaEngine *engine = runtime->deserializeCudaEngine((void*)plan.data(),
plan.size(), nullptr);
IExecutionContext *context = engine->createExecutionContext();
int inputBindingIndex, outputBindingIndex;
inputBindingIndex = engine->getBindingIndex(testConfig.inputNodeName.c_str());
outputBindingIndex = engine->getBindingIndex(testConfig.outputNodeName.c_str());
int batchsize = stoi(testConfig.maxBatchSize, nullptr);
vector<cv::Mat> image_array;
// load and preprocess image
for(int i =0; i < batchsize; i++)
{
cv::Mat image = cv::imread(testConfig.imagePath, CV_LOAD_IMAGE_COLOR);
cv::cvtColor(image, image, cv::COLOR_BGR2RGB, 3);
cv::resize(image, image, cv::Size(testConfig.InputWidth(), testConfig.InputHeight()));
image_array.push_back(image);
}
const float mean[3] = { 123.68, 116.78, 103.94 };
const size_t height = image_array[0].rows;
const size_t width = image_array[0].cols;
const size_t channels = image_array[0].channels();
float input[height * width * channels * batchsize];
for (int i = 0, volImg = channels * height * width; i < batchsize; ++i)
{
for (int c = 0; c < channels; ++c)
{
// the color image to input should be in BGR order
for (unsigned j = 0, volChl = height * width; j < volChl; ++j){
input[i * volImg + c * volChl + j] = float(image_array[i].data[j * channels] - mean[c]);
}
}
}
// allocate memory on host / device for input / output
float *output;
float *inputDevice;
float *outputDevice;
size_t inputSize = batchsize * testConfig.InputHeight() * testConfig.InputWidth() * 3 * sizeof(float);
// need to multiply it by batch size below
cudaHostAlloc(&output, batchsize * testConfig.NumOutputCategories() * sizeof(float), cudaHostAllocMapped);
if (testConfig.UseMappedMemory())
{
cudaHostGetDevicePointer(&inputDevice, input, 0);
cudaHostGetDevicePointer(&outputDevice, output, 0);
}
else
{
cudaMalloc(&inputDevice, inputSize);
// need to multiply it by batch size below
cudaMalloc(&outputDevice, batchsize * testConfig.NumOutputCategories() * sizeof(float));
}
float *bindings[2];
bindings[inputBindingIndex] = inputDevice;
bindings[outputBindingIndex] = outputDevice;
// run and compute average time over numRuns iterations
double avgTime_in = 0;
double avgTime_exec = 0;
double avgTime_out = 0;
for (int i = 0; i < testConfig.NumRuns() + 1; i++)
{
chrono::duration<double> exec_diff;
chrono::duration<double> in_diff;
chrono::duration<double> out_diff;
if (testConfig.UseMappedMemory())
{
auto t0 = chrono::steady_clock::now();
context->execute(1, (void**)bindings);
auto t1 = chrono::steady_clock::now();
exec_diff = t1 - t0;
}
else
{
auto t0 = chrono::steady_clock::now();
cudaMemcpy(inputDevice, input, inputSize, cudaMemcpyHostToDevice); //seg fault here
auto t1 = chrono::steady_clock::now();
//change the 1 to batchsize
context->execute(batchsize, (void**)bindings);
auto t2 = chrono::steady_clock::now();
// need to multiply it by batch size below
cudaMemcpy(output, outputDevice, batchsize * testConfig.NumOutputCategories() * sizeof(float), cudaMemcpyDeviceToHost);
auto t3 = chrono::steady_clock::now();
in_diff = t1 - t0;
exec_diff = t2 - t1;
out_diff = t3 - t2;
}
if (i != 0)
avgTime_exec += MS_PER_SEC * exec_diff.count();
avgTime_in += MS_PER_SEC * in_diff.count();
avgTime_out += MS_PER_SEC * out_diff.count();
}
avgTime_in /= testConfig.NumRuns();
avgTime_exec /= (testConfig.NumRuns() * batchsize);
avgTime_out /= testConfig.NumRuns();
// save results to file
int maxCategoryIndex = argmax(output, testConfig.NumOutputCategories(), batchsize) + 1001 - testConfig.NumOutputCategories();
//cout << "\nMost likely category id is " << maxCategoryIndex << endl;
cout << "\nAverage Input Loading time in ms is " << avgTime_in << endl;
cout << "Average execution time/per image in ms is " << avgTime_exec << endl;
cout << "Average execution time total in ms is " << avgTime_exec * batchsize << endl;
cout << "Average Output Loading time in ms is " << avgTime_out << endl;
ofstream outfile;
outfile.open(testConfig.statsPath, ios_base::app);
outfile << "\n" << testConfig.planPath
<< " " << avgTime_exec;
// << " " << maxCategoryIndex
// << " " << testConfig.InputWidth()
// << " " << testConfig.InputHeight()
// << " " << testConfig.MaxBatchSize()
// << " " << testConfig.WorkspaceSize()
// << " " << testConfig.dataType
// << " " << testConfig.NumRuns()
// << " " << testConfig.UseMappedMemory();
outfile.close();
cudaFree(inputDevice);
cudaFree(outputDevice);
cudaFreeHost(input);
cudaFreeHost(output);
engine->destroy();
context->destroy();
runtime->destroy();
}
Other than that, I pretty much use the same method described in the GitHub repo's test_trt.py and convert_plan.py sections. The models are from the repo's download link as well.