nvidia-smi显示的CUDA版本和nvcc -V得到的CUDA版本可能不一致:nvidia-smi显示的是当前驱动支持的最高CUDA版本,nvcc -V显示的才是实际安装的CUDA Toolkit版本
1.安装CUDA,从官网下载,可下载run版本或deb版本,按照官网提示的命令安装,安装完成后位于/usr/local/cuda*路径下
2.安装cudnn,从官网下载,下载tar版本的,解压后有include和lib64文件夹,执行如下命令安装
sudo cp cuda/include/cudnn.h /usr/local/cuda-11.3/include/
sudo cp cuda/lib64/libcudnn* /usr/local/cuda-11.3/lib64/
sudo chmod a+r /usr/local/cuda-11.3/include/cudnn.h
sudo chmod a+r /usr/local/cuda-11.3/lib64/libcudnn*
3.安装TensorRT,从官网下载,下载tar版本,解压后有lib文件夹,把该路径加入环境变量(LD_LIBRARY_PATH),再进入python文件夹用pip安装与Python版本对应的whl
注意事项:
1.如果系统原有的CUDA版本不对,一定要换成匹配的版本,要不麻烦死;卸载nvidia相关的包时如果出错,可以去“软件和更新”的“其他软件”里把相关的对勾去掉,要不卸载不了,会显示包不对应
sudo apt-get remove --auto-remove nvidia-cuda-toolkit
2.查看CUDA版本
cat /usr/local/cuda-11.3/version.json
3.添加环境变量
vim ~/.bashrc
加入
export PATH="/usr/local/cuda-11.3/bin:$PATH"
export LD_LIBRARY_PATH="/usr/local/cuda-11.3/lib64:$LD_LIBRARY_PATH"
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/nercar/TensorRT-7.0.0.11/lib
export CUDA_INSTALL_DIR=/usr/local/cuda-11.3/lib64
export CUDNN_INSTALL_DIR=/usr/local/cuda-11.3/lib64
执行
source ~/.bashrc
nvcc -V
4.pycharm上出错没有so文件时,通过在pycharm配置环境变量解决,run里边配置环境
PYTHONUNBUFFERED=1;CUDNN_HOME=/usr/local/cuda-11.3/lib64;LD_LIBRARY_PATH=/usr/local/cuda-11.3/lib64\;/home/nercar/TensorRT-8.0.1.6/lib
5.TensorRT 8及以上版本中有些旧接口会报未声明,可以参照安装包自带的例程(samples)修改
实现bisenet的推理
pth转onnx
"""Export a BiSeNet checkpoint (.pth) to an ONNX file.

The script is meant to be launched from a directory where the project
packages ``lib`` and ``configs`` are importable from ``.``.
"""
import argparse
import sys

# The current directory must be on sys.path *before* the project imports
# below are executed; inserting it afterwards has no effect on them.
sys.path.insert(0, '.')

import torch

from lib.models import model_factory
from configs import set_cfg_from_file

# Export only needs a forward pass, so autograd is switched off globally.
torch.set_grad_enabled(False)

parse = argparse.ArgumentParser()
parse.add_argument('--config', dest='config', type=str,
                   default='../configs/bisenetv2_city.py',)
parse.add_argument('--weight-path', dest='weight_pth', type=str,
                   default='model/bisenetv2.pth')
parse.add_argument('--outpath', dest='out_pth', type=str,
                   default='model/bisenetv2.onnx')
parse.add_argument('--aux-mode', dest='aux_mode', type=str,
                   default='pred')
args = parse.parse_args()

cfg = set_cfg_from_file(args.config)
# The export runs in a single process, so synchronized BN is turned off.
if cfg.use_sync_bn:
    cfg.use_sync_bn = False

net = model_factory[cfg.model_type](cfg.n_cats, aux_mode=args.aux_mode)
# NOTE(review): strict=False does not enforce that checkpoint keys match the
# model, so a wrong checkpoint can load without an error -- confirm intended.
net.load_state_dict(torch.load(args.weight_pth, map_location='cpu'), strict=False)
net.eval()

# The dummy input fixes the traced input shape to 1 x 3 x 1024 x 2048
# (no dynamic axes are declared in the export call).
dummy_input = torch.randn(1, 3, 1024, 2048)
input_names = ['input_image']
output_names = ['preds', ]

torch.onnx.export(net, dummy_input, args.out_pth,
                  input_names=input_names, output_names=output_names,
                  verbose=False, opset_version=11)
onnx转trt
from __future__ import print_function import os import sys import cv2 import time import common import numpy as np import tensorrt as trt sys.path.insert(1, os.path.join(sys.path[0], "..")) TRT_LOGGER = trt.Logger() def get_engine(onnx_file_path, engine_file_path=""): def build_engine(): with trt.Builder(TRT_LOGGER) as builder, \ builder.create_network(common.EXPLICIT_BATCH) as network, \ builder.create_builder_config() as config, \ trt.OnnxParser(network, TRT_LOGGER) as parser, \ trt.Runtime(TRT_LOGGER) as runtime: config.max_workspace_size = 1 << 30 # 256MiB builder.max_batch_size = 1 if not os.path.exists(onnx_file_path): print('ONNX file {} not found.'.format(onnx_file_path)) exit(0) print('Loading ONNX file from path {}...'.format(onnx_file_path)) with open(onnx_file_path, 'rb') as model: print('Beginning ONNX file parsing') if not parser.parse(model.read()): print('ERROR: Failed to parse the ONNX file.') for error in range(parser.num_errors): print(parser.get_error(error)) return None network.get_input(0).shape = [1, 3, 512, 1024] print('Completed parsing of ONNX file') print('Building an engine from file {}; this may take a while...'.format(onnx_file_path)) plan = builder.build_serialized_network(network, config) engine = runtime.deserialize_cuda_engine(plan) print("Completed creating Engine") with open(engine_file_path, "wb") as f: f.write(plan) return engine if os.path.exists(engine_file_path): print("Reading engine from file {}".format(engine_file_path)) with open(engine_file_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime: return runtime.deserialize_cuda_engine(f.read()) else: return build_engine() mean = (0.3257, 0.3690, 0.3223) std = (0.2112, 0.2148, 0.2115) def main(): onnx_file_path = 'model/bisenetv2.onnx' engine_file_path = "model/bisenetv2.trt" input_image_path = 'data/1.png' image = cv2.imread(input_image_path, 1) image = cv2.resize(image, (1024, 512)) image = image/255. 
# image = ((image / 255.0) - mean) / std image = np.transpose(image, [2, 0, 1]) image = np.expand_dims(image, axis=0) image = np.array(image, dtype=np.float32, order='C') with get_engine(onnx_file_path, engine_file_path) as engine, engine.create_execution_context() as context: inputs, outputs, bindings, stream = common.allocate_buffers(engine) print('Running inference on image {}...'.format(input_image_path)) inputs[0].host = image start = time.time() trt_outputs = common.do_inference_v2(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream) end = time.time() print("use time:", end-start) # stream.synchronize() dst = trt_outputs[0].reshape((1024, 2048)) cv2.namedWindow("dst", 0) cv2.imshow("dst", np.array(dst, np.uint8) * 10) cv2.waitKey(0) if __name__ == '__main__': main()
common
"""Helper utilities for the TensorRT scripts: data lookup, buffer allocation, inference."""
import argparse
import os
import time

import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
import tensorrt as trt

# Network-creation flag that selects explicit-batch mode.
EXPLICIT_BATCH = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)


def GiB(val):
    """Return ``val`` gibibytes expressed in bytes.

    The expression evaluates as ``(val * 1) << 30``, so ``val`` must be an int.
    """
    return val * 1 << 30


def add_help(description):
    """Create a parser with ``description`` and parse the known arguments.

    The parsed result is discarded and nothing is returned.
    """
    parser = argparse.ArgumentParser(description=description,
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    args, _ = parser.parse_known_args()


def find_sample_data(description="Runs a TensorRT Python sample", subfolder="", find_files=[], err_msg=""):
    """Resolve data directories from ``-d/--datadir`` and locate files in them.

    Args:
        description: Text shown in the command-line help.
        subfolder: Sub-directory appended to each data directory when it exists.
        find_files: File names to look for (the default list is never mutated).
        err_msg: Extra text appended to the error raised for a missing file.

    Returns:
        A tuple ``(data_paths, found_files)``; ``found_files`` holds one
        absolute path per entry of ``find_files``.

    Raises:
        FileNotFoundError: If any requested file is found in no data path.
    """
    # Standard command-line arguments for all samples.
    kDEFAULT_DATA_ROOT = os.path.join(os.sep, "usr", "src", "tensorrt", "data")
    parser = argparse.ArgumentParser(description=description,
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("-d", "--datadir",
                        help="Location of the TensorRT sample data directory, and any additional data directories.",
                        action="append", default=[kDEFAULT_DATA_ROOT])
    args, _ = parser.parse_known_args()

    def get_data_path(data_dir):
        # If the subfolder exists, append it to the path, otherwise use the provided path as-is.
        data_path = os.path.join(data_dir, subfolder)
        if not os.path.exists(data_path):
            if data_dir != kDEFAULT_DATA_ROOT:
                print("WARNING: " + data_path + " does not exist. Trying " + data_dir + " instead.")
            data_path = data_dir
        # Make sure data directory exists.
        if not (os.path.exists(data_path)) and data_dir != kDEFAULT_DATA_ROOT:
            print("WARNING: {:} does not exist. Please provide the correct data path with the -d option.".format(
                data_path))
        return data_path

    data_paths = [get_data_path(data_dir) for data_dir in args.datadir]
    return data_paths, locate_files(data_paths, find_files, err_msg)


def locate_files(data_paths, filenames, err_msg=""):
    """Find each of ``filenames`` in the first data path that contains it.

    Args:
        data_paths: Directories to search, in priority order.
        filenames: File names to look for.
        err_msg: Extra text appended to the error for a missing file.

    Returns:
        A list of absolute paths, one per entry of ``filenames``.

    Raises:
        FileNotFoundError: If a file is found in none of the data paths.
    """
    found_files = [None] * len(filenames)
    for data_path in data_paths:
        # Find all requested files; entries that are already resolved are skipped.
        for index, (found, filename) in enumerate(zip(found_files, filenames)):
            if not found:
                file_path = os.path.abspath(os.path.join(data_path, filename))
                if os.path.exists(file_path):
                    found_files[index] = file_path

    # Check that all files were found
    for f, filename in zip(found_files, filenames):
        if not f or not os.path.exists(f):
            raise FileNotFoundError(
                "Could not find {:}. Searched in data paths: {:}\n{:}".format(filename, data_paths, err_msg))
    return found_files


# Simple helper data class that's a little nicer to use than a 2-tuple.
class HostDeviceMem(object):
    """Pair of a host buffer and the device allocation that belongs to it."""

    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()


# Allocates all buffers required for an engine, i.e. host/device inputs/outputs.
def allocate_buffers(engine):
    """Allocate host and device buffers for every binding of ``engine``.

    Returns:
        ``(inputs, outputs, bindings, stream)`` where ``inputs``/``outputs``
        are lists of ``HostDeviceMem``, ``bindings`` is the list of device
        addresses in binding order, and ``stream`` is a new CUDA stream.
    """
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        # Element count = volume of the binding shape times the max batch size.
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream


def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    """Run inference with an explicit ``batch_size`` and return the host outputs.

    Inputs are copied to the device, the context is executed and outputs are
    copied back, all on ``stream``; the stream is synchronized before returning.
    """
    # Transfer input data to the GPU.
    for inp in inputs:
        cuda.memcpy_htod_async(inp.device, inp.host, stream)
    # Run inference.
    context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    for out in outputs:
        cuda.memcpy_dtoh_async(out.host, out.device, stream)
    # Wait until everything queued on the stream has finished.
    stream.synchronize()
    return [out.host for out in outputs]


def do_inference_v2(context, bindings, inputs, outputs, stream):
    """Run inference via ``execute_async_v2`` and return the host outputs.

    Same flow as ``do_inference`` but without a ``batch_size`` argument.
    """
    # Transfer input data to the GPU.
    for inp in inputs:
        cuda.memcpy_htod_async(inp.device, inp.host, stream)
    # Run inference.
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    for out in outputs:
        cuda.memcpy_dtoh_async(out.host, out.device, stream)
    # The copies above are asynchronous; without this wait the returned host
    # buffers may be read before the results have arrived.
    stream.synchronize()
    return [out.host for out in outputs]